philschmid
/

gemma-7b-dolly-chatml

Generated from Trainer

Model card Files Files and versions Metrics Training metrics Community

gemma-7b-dolly-chatml / trl-lora.py

philschmid's picture

philschmid HF staff

Upload 2 files

48c7a89 verified 9 months ago

2.82 kB

	from datasets import load_dataset
	from transformers import TrainingArguments
	from trl import SFTTrainer
	import torch
	from transformers import AutoTokenizer, AutoModelForCausalLM
	from peft import LoraConfig

	# Identifiers of the Hugging Face resources used by this script
	dataset_id = "philschmid/dolly-15k-oai-style"  # training data (name suggests OpenAI-style chat messages)
	model_id = "google/gemma-7b"  # base model that gets fine-tuned
	tokenizer_id = "philschmid/gemma-tokenizer-chatml"  # separate tokenizer repo (name suggests a ChatML template)

	# Load the train split; the dataset is referenced by repository id rather than a local jsonl path
	dataset = load_dataset(dataset_id, split="train")

	# Options for loading the base model
	model_load_options = {
	    "device_map": "auto",  # leave device placement to the library
	    "attn_implementation": "flash_attention_2",  # request the flash-attention-2 kernels
	    "torch_dtype": torch.bfloat16,  # load weights in bfloat16 (matches bf16=True in the training args)
	}
	model = AutoModelForCausalLM.from_pretrained(model_id, **model_load_options)

	# The tokenizer comes from its own repository, not from `model_id`
	tokenizer = AutoTokenizer.from_pretrained(tokenizer_id)
	# Pad on the right; the original author set this to prevent warnings
	tokenizer.padding_side = "right"

	# LoRA adapter configuration.
	# The original author bases these values on the QLoRA paper and
	# Sebastian Raschka's experiments.
	peft_config = LoraConfig(
	    task_type="CAUSAL_LM",  # adapters are trained for causal language modelling
	    target_modules="all-linear",  # attach adapters to all linear layers
	    r=16,  # LoRA rank
	    lora_alpha=8,  # LoRA scaling factor
	    lora_dropout=0.05,  # dropout applied inside the adapters
	    bias="none",  # bias terms are not trained
	)

	# Training hyperparameters, grouped by concern.
	args = TrainingArguments(
	    # --- output, checkpointing and reporting ---
	    output_dir="gemma-7b-dolly-chatml",  # directory to save to; also used as the repository id
	    save_strategy="epoch",  # write a checkpoint at the end of every epoch
	    logging_steps=10,  # log every 10 steps
	    report_to="tensorboard",  # send metrics to tensorboard
	    push_to_hub=True,  # upload the model to the Hub
	    # --- run length and batch size ---
	    num_train_epochs=3,  # number of passes over the training data
	    per_device_train_batch_size=8,  # batch size on each device
	    # --- memory and numeric precision ---
	    gradient_checkpointing=True,  # trade compute for memory
	    bf16=True,  # train in bfloat16
	    tf32=True,  # enable tf32
	    # --- optimisation (peft specific; values follow the QLoRA paper) ---
	    optim="adamw_torch_fused",  # fused AdamW implementation
	    learning_rate=2e-4,
	    max_grad_norm=0.3,  # gradient clipping threshold
	    # NOTE(review): the plain "constant" scheduler does not appear to apply
	    # warmup, so warmup_ratio likely has no effect here. Use
	    # "constant_with_warmup" if warmup is intended; confirm against the
	    # installed transformers version before changing.
	    warmup_ratio=0.03,
	    lr_scheduler_type="constant",
	)

	# Maximum sequence length, used both for the model inputs and for packing
	max_seq_length = 1512

	# Options forwarded to the trainer's dataset preparation
	sft_dataset_kwargs = {
	    "add_special_tokens": True,  # make sure <bos> and <eos> tokens are added
	    "append_concat_token": False,  # do not add additional tokens when packing
	}

	# Supervised fine-tuning trainer: base model plus LoRA adapters, trained on
	# packed sequences of at most `max_seq_length` tokens.
	trainer = SFTTrainer(
	    model=model,
	    args=args,
	    train_dataset=dataset,
	    tokenizer=tokenizer,
	    ### peft specific arguments ###
	    peft_config=peft_config,
	    max_seq_length=max_seq_length,
	    packing=True,
	    dataset_kwargs=sft_dataset_kwargs,
	)

	# Run fine-tuning. Checkpoints go to the output directory once per epoch
	# (save_strategy="epoch"); push_to_hub=True additionally uploads them to the Hub.
	trainer.train()

	# Explicitly save the final model after training, called without arguments.
	# NOTE(review): presumably this writes to args.output_dir and, because a
	# peft_config is used, stores only the LoRA adapter weights; confirm.
	trainer.save_model()