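# Fine-tune Llama 2 7B on the databricks/databricks-dolly-15k dataset with QLoRA:
# load the base model in 4-bit NF4 precision (bitsandbytes), attach LoRA adapters
# (PEFT), train with the Hugging Face Trainer, then merge the adapters back into
# the base model and save the result.
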
import argparse
import os
from functools import partial

import bitsandbytes as bnb
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, AutoPeftModelForCausalLM
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
    set_seed,
)


def load_model(model_name, bnb_config):
    n_gpus = torch.cuda.device_count()
    max_memory = f'{40960}MB'

    # Load the base model in 4-bit precision and shard it across all available GPUs
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        max_memory={i: max_memory for i in range(n_gpus)},
    )
    # Loading the tokenizer requires access to the gated Llama 2 repo on the Hugging Face Hub
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)

    # Llama 2 has no pad token by default; reuse the EOS token for padding
    tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer


dataset = load_dataset("databricks/databricks-dolly-15k", split="train")

print(f'Number of prompts: {len(dataset)}')
print(f'Column names are: {dataset.column_names}')


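# Each Dolly sample exposes four columns -- 'instruction', 'context', 'response'
# and 'category' -- which are reformatted below into a single 'text' prompt.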
def create_prompt_formats(sample):
    """
    Format the fields of the sample ('instruction', 'context', 'response')
    and concatenate them using two newline characters.
    :param sample: Sample dictionary
    """
    INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    INSTRUCTION_KEY = "### Instruction:"
    INPUT_KEY = "Input:"
    RESPONSE_KEY = "### Response:"
    END_KEY = "### End"

    blurb = f"{INTRO_BLURB}"
    instruction = f"{INSTRUCTION_KEY}\n{sample['instruction']}"
    input_context = f"{INPUT_KEY}\n{sample['context']}" if sample["context"] else None
    response = f"{RESPONSE_KEY}\n{sample['response']}"
    end = f"{END_KEY}"

    # Drop the input block entirely when the sample has no context
    parts = [part for part in [blurb, instruction, input_context, response, end] if part]
    formatted_prompt = "\n\n".join(parts)

    sample["text"] = formatted_prompt
    return sample
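
# The resulting 'text' field looks roughly like this (illustrative layout only):
#
#   Below is an instruction that describes a task. Write a response that
#   appropriately completes the request.
#
#   ### Instruction:
#   <instruction>
#
#   Input:
#   <context, omitted when empty>
#
#   ### Response:
#   <response>
#
#   ### End

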
def get_max_length(model):
    conf = model.config
    max_length = None
    # Different architectures expose the context window under different config names
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(conf, length_setting, None)
        if max_length:
            print(f"Found max length: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenize a batch of formatted prompts
    """
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )


def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int, seed, dataset):
    """Format & tokenize the dataset so it is ready for training
    :param tokenizer (AutoTokenizer): Model tokenizer
    :param max_length (int): Maximum number of tokens to emit from the tokenizer
    """
    # Add the formatted prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats)

    # Tokenize each batch and drop the raw text columns
    _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        _preprocessing_function,
        batched=True,
        remove_columns=["instruction", "context", "response", "text", "category"],
    )

    # Filter out samples whose input_ids exceed max_length
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle the dataset
    dataset = dataset.shuffle(seed=seed)

    return dataset
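
# After preprocessing, each sample contains only the tokenizer output
# ('input_ids' and 'attention_mask'); the data collator used below pads the
# batches and builds the 'labels' for causal language modeling.

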
def create_bnb_config():
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
    )

    return bnb_config
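
# The config above stores weights as 4-bit NF4 values, applies double quantization
# to the quantization constants to save additional memory, and runs the actual
# matrix multiplications in bfloat16.

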
def create_peft_config(modules):
    """
    Create Parameter-Efficient Fine-Tuning config for your model
    :param modules: Names of the modules to apply LoRA to
    """
    config = LoraConfig(
        r=16,  # rank of the LoRA update matrices
        lora_alpha=64,  # scaling factor (effective scale = lora_alpha / r)
        target_modules=modules,
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM",
    )

    return config


def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    # Never apply LoRA to the output head
    if 'lm_head' in lora_module_names:
        lora_module_names.remove('lm_head')
    return list(lora_module_names)
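
# For Llama 2 this typically returns the attention and MLP projection layers,
# e.g. ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']
# (the exact names depend on the model implementation).

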
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()

        # With DeepSpeed ZeRO-3 sharding, ds_numel holds the real element count
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        trainable_params /= 2
    print(
        f"all params: {all_param:,d} || trainable params: {int(trainable_params):,d} || trainable%: {100 * trainable_params / all_param}"
    )


model_name = "meta-llama/Llama-2-7b-hf"

bnb_config = create_bnb_config()

model, tokenizer = load_model(model_name, bnb_config)

print(model)

# Llama 2 reports its context window through max_position_embeddings
max_length = get_max_length(model)
print(max_length)

seed = 98345

dataset = preprocess_dataset(tokenizer, max_length, seed, dataset)


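# Fine-tune with QLoRA: the run below is intentionally short (max_steps=20,
# batch size 1 with 4 gradient-accumulation steps) and is effectively a quick
# demonstration; increase max_steps or switch to epochs for a real fine-tune.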
def train(model, tokenizer, dataset, output_dir):
    # Enable gradient checkpointing to reduce memory use during training
    model.gradient_checkpointing_enable()

    # Prepare the quantized model for k-bit (QLoRA) training
    model = prepare_model_for_kbit_training(model)

    # Collect all 4-bit linear layer names to target with LoRA
    modules = find_all_linear_names(model)

    # Wrap the base model with LoRA adapters
    peft_config = create_peft_config(modules)
    model = get_peft_model(model, peft_config)

    print_trainable_parameters(model)

    trainer = Trainer(
        model=model,
        train_dataset=dataset,
        args=TrainingArguments(
            per_device_train_batch_size=1,
            gradient_accumulation_steps=4,
            warmup_steps=2,
            max_steps=20,
            learning_rate=2e-4,
            fp16=True,
            logging_steps=1,
            output_dir="outputs",
            optim="paged_adamw_8bit",
        ),
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    # The KV cache is not needed during training and conflicts with gradient checkpointing
    model.config.use_cache = False

    # Show how the parameters are distributed across dtypes after quantization
    dtypes = {}
    for _, p in model.named_parameters():
        dtype = p.dtype
        if dtype not in dtypes:
            dtypes[dtype] = 0
        dtypes[dtype] += p.numel()
    total = 0
    for k, v in dtypes.items():
        total += v
    for k, v in dtypes.items():
        print(k, v, v / total)

    do_train = True

    print("Training...")

    if do_train:
        train_result = trainer.train()
        metrics = train_result.metrics
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()
        print(metrics)

    # Save the trained LoRA adapter (not the full model)
    print("Saving last checkpoint of the model...")
    os.makedirs(output_dir, exist_ok=True)
    trainer.model.save_pretrained(output_dir)

    # Free GPU memory before reloading the model for merging
    del model
    del trainer
    torch.cuda.empty_cache()


output_dir = "results/llama2/final_checkpoint"

print("Run train ...")
train(model, tokenizer, dataset, output_dir)


# Reload the trained adapter on top of the base model and merge the LoRA
# weights into it so the result can be used like a regular checkpoint
model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.bfloat16)
model = model.merge_and_unload()

output_merged_dir = "results/llama2/final_merged_checkpoint"
os.makedirs(output_merged_dir, exist_ok=True)
model.save_pretrained(output_merged_dir, safe_serialization=True)

# Save the tokenizer alongside the merged model
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.save_pretrained(output_merged_dir)
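
# Optional sanity check (a minimal sketch, assuming the merged checkpoint above,
# enough GPU memory, and a purely illustrative prompt string):
#
#   model = AutoModelForCausalLM.from_pretrained(output_merged_dir, device_map="auto", torch_dtype=torch.bfloat16)
#   tokenizer = AutoTokenizer.from_pretrained(output_merged_dir)
#   inputs = tokenizer("### Instruction:\nName three fruits.\n\n### Response:\n", return_tensors="pt").to(model.device)
#   outputs = model.generate(**inputs, max_new_tokens=64)
#   print(tokenizer.decode(outputs[0], skip_special_tokens=True))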