PhiForCausalLM does not support gradient checkpointing.
I am fine-tuning Phi-2 and I am getting this error: "PhiForCausalLM does not support gradient checkpointing." Has anyone dealt with this error?
This is my fine-tuning code:
```python
import os
import re
import logging
import argparse
from datetime import datetime

import numpy as np
import pandas as pd
import torch
from datasets import load_dataset, Dataset, DatasetDict
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, PeftModel
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    Trainer,
    Pipeline,
)

# Set up logging
logger = logging.getLogger(__name__)
log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, f"logs_finetuning_{datetime.now().strftime('%Y%m%d%H%M%S')}.txt")
logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')


def main():
    """Main function to run the Alpaca LoRA training script."""
    parser = argparse.ArgumentParser(description="Run Alpaca LoRA training script")
    parser.add_argument("--sample_size", type=int, default=1000, help="Number of samples")
    parser.add_argument("--model_name", type=str, default="databricks/dolly-v2-3b", help="Pretrained model name")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
    parser.add_argument("--MICRO_BATCH_SIZE", type=int, default=4, help="MICRO_BATCH_SIZE")
    parser.add_argument("--output_dir", type=str, default="AlpacaWeights", help="Output Directory")
    args = parser.parse_args()

    # Log command-line arguments
    logging.info(f"Command-line arguments: {args}")

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, padding_side="left")
    model = AutoModelForCausalLM.from_pretrained(
        args.model_name, device_map="auto", torch_dtype=torch.bfloat16, load_in_8bit=True
    )

    # Load dataset (from the Hugging Face Hub)
    # data = load_dataset("tatsu-lab/alpaca")
    # def generate_prompt(data_point):
    #     return data_point["text"]
    # data = data.map(lambda data_point: {"prompt": tokenizer(generate_prompt(data_point))})
    # sampled_data = data["train"].shuffle(seed=42).select(range(args.sample_size))
    # sampled_dataset_dict = DatasetDict({"train": sampled_data})
    # data = sampled_dataset_dict
    # End of Hugging Face dataset code

    # Load dataset (from a local dataframe)
    df1 = pd.read_csv("mycsvfile.csv")
    df1 = df1.sample(frac=1.0, random_state=42).reset_index(drop=True)

    # Convert the pandas DataFrame to a Hugging Face Dataset
    hf_dataset = Dataset.from_pandas(df1)

    # Create a DatasetDict
    data = DatasetDict({"train": hf_dataset})

    def generate_prompt(data_point):
        return f"{data_point['consolidated_prompt_response']} "

    data = data.map(lambda data_point: {"prompt": tokenizer(generate_prompt(data_point))})

    # Settings for A100 - For 3090
    MICRO_BATCH_SIZE = args.MICRO_BATCH_SIZE
    BATCH_SIZE = args.batch_size
    GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
    EPOCHS = 1  # paper uses 3
    LEARNING_RATE = 2e-5
    CUTOFF_LEN = 200
    LORA_R = 4
    LORA_ALPHA = 16
    LORA_DROPOUT = 0.05

    model = prepare_model_for_int8_training(model, use_gradient_checkpointing=True)
    config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)

    tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token

    data = data.shuffle().map(
        lambda data_point: tokenizer(
            generate_prompt(data_point),
            truncation=True,
            max_length=CUTOFF_LEN,
            padding="max_length",
        )
    )

    trainer = Trainer(
        model=model,
        train_dataset=data["train"],
        args=TrainingArguments(
            per_device_train_batch_size=MICRO_BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            warmup_steps=100,
            num_train_epochs=EPOCHS,
            learning_rate=LEARNING_RATE,
            fp16=True,
            logging_steps=10,
            output_dir="my output directory",
            save_total_limit=3,
            gradient_checkpointing=False,  # Disable gradient checkpointing
        ),
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )

    model.config.use_cache = False
    trainer.train(resume_from_checkpoint=False)
    model.save_pretrained("my model path")


if __name__ == "__main__":
    main()
```
Phi-2 does not currently support gradient checkpointing. You need to set `model = prepare_model_for_int8_training(model, use_gradient_checkpointing=False)` and then reduce the max sequence length if you run into OOM errors.
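For concreteness, a minimal sketch of how those two changes fit into the script above. `model` is the 8-bit Phi-2 already loaded there, the batch-size literals mirror the script's defaults, and the reduced `CUTOFF_LEN` of 128 is only an illustrative value, not a recommendation:
```python
from peft import prepare_model_for_int8_training
from transformers import TrainingArguments

# `model` is the 8-bit model loaded in the script above.
CUTOFF_LEN = 128  # was 200; illustrative value, lower it further on OOM
model = prepare_model_for_int8_training(model, use_gradient_checkpointing=False)

training_args = TrainingArguments(
    output_dir="my output directory",
    per_device_train_batch_size=4,       # MICRO_BATCH_SIZE default from the script
    gradient_accumulation_steps=8,        # BATCH_SIZE // MICRO_BATCH_SIZE
    num_train_epochs=1,
    learning_rate=2e-5,
    fp16=True,
    gradient_checkpointing=False,         # PhiForCausalLM raises if this is True
)
```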
@Aditiyadav You can use the appropriate branch by specifying a `revision` to enable checkpointing:
```python
# base_model (the model id) and bnb_config (the quantization config) are
# assumed to be defined earlier.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    trust_remote_code=True,
    flash_attn=True,
    flash_rotary=True,
    fused_dense=True,
    low_cpu_mem_usage=True,
    device_map={"": 0},
    revision="refs/pr/23",
)
```
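If the model is loaded from that revision, gradient checkpointing can then be turned back on rather than disabled. A minimal sketch, assuming `model` is the object returned by the `from_pretrained` call above and that the loaded revision actually supports checkpointing:
```python
# Sketch: re-enable checkpointing once a revision that supports it is loaded.
model.config.use_cache = False          # use_cache conflicts with checkpointing
model.gradient_checkpointing_enable()   # raises the original error on unsupported revisions

# Equivalently, via the Trainer:
# TrainingArguments(..., gradient_checkpointing=True)
```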
I am able to run it now by using `use_gradient_checkpointing=False`.