PhiForCausalLM does not support gradient checkpointing.

#31 opened by Aditiyadav

I am fine-tuning Phi-2 and I am getting this error: `PhiForCausalLM does not support gradient checkpointing.` Has anyone dealt with this error?

This is my fine-tuning code:

import os
import pandas as pd
import logging
import re
import numpy as np
import torch
import argparse
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    Trainer,
    Pipeline,
)
from datetime import datetime
from datasets import load_dataset, Dataset, DatasetDict
from peft import prepare_model_for_int8_training, LoraConfig, get_peft_model, PeftModel

# Set up logging
logger = logging.getLogger(__name__)

log_dir = "logs"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, f"logs_finetuning_{datetime.now().strftime('%Y%m%d%H%M%S')}.txt")
logging.basicConfig(filename=log_file, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def main():
    """Main function to run ALPaCA LORA training script."""
    parser = argparse.ArgumentParser(description="Run ALPaCA LORA training script")
    parser.add_argument("--sample_size", type=int, default=1000, help="Number of samples")
    parser.add_argument("--model_name", type=str, default="databricks/dolly-v2-3b", help="Pretrained model name")
    parser.add_argument("--batch_size", type=int, default=32, help="Batch size")
    parser.add_argument("--MICRO_BATCH_SIZE", type=int, default=4, help="MICRO_BATCH_SIZE")
    parser.add_argument("--output_dir", type=str, default="AlpacaWeights", help="Output Directory")

    args = parser.parse_args()

    # Log command-line arguments
    logging.info(f"Command-line arguments: {args}")

    # Load tokenizer and model
    tokenizer = AutoTokenizer.from_pretrained(args.model_name, padding_side="left")
    model = AutoModelForCausalLM.from_pretrained(args.model_name, device_map="auto", torch_dtype=torch.bfloat16, load_in_8bit=True)

    # Load dataset (from Hugging Face)

    # data = load_dataset("tatsu-lab/alpaca")
    # def generate_prompt(data_point):
    #     return data_point["text"]

    # data = data.map(lambda data_point: {"prompt": tokenizer(generate_prompt(data_point))})
    # sampled_data = data['train'].shuffle(seed=42).select(range(args.sample_size))
    # sampled_dataset_dict = DatasetDict({"train": sampled_data})
    # data = sampled_dataset_dict

    # End of Hugging Face dataset code

    # Load dataset (from local DataFrame)
    df1 = pd.read_csv('mycsvfile.csv')
    df1 = df1.sample(frac=1.0, random_state=42).reset_index(drop=True)
    # Convert the Pandas DataFrame to a Hugging Face Dataset
    hf_dataset = Dataset.from_pandas(df1)
    # Create a DatasetDict
    dataset_dict = DatasetDict({
        "train": hf_dataset
    })
    data = dataset_dict

    def generate_prompt(data_point):
        return f"{data_point['consolidated_prompt_response']} "

    data = data.map(lambda data_point: {"prompt": tokenizer(generate_prompt(data_point))})

    # Settings for A100 - For 3090
    MICRO_BATCH_SIZE = args.MICRO_BATCH_SIZE
    BATCH_SIZE = args.batch_size
    GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
    EPOCHS = 1  # paper uses 3
    LEARNING_RATE = 2e-5
    CUTOFF_LEN = 200
    LORA_R = 4
    LORA_ALPHA = 16
    LORA_DROPOUT = 0.05

    model = prepare_model_for_int8_training(model, use_gradient_checkpointing=True)

    config = LoraConfig(
        r=LORA_R,
        lora_alpha=LORA_ALPHA,
        lora_dropout=LORA_DROPOUT,
        bias="none",
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, config)
    tokenizer.pad_token_id = 0  # unk. we want this to be different from the eos token

    data = data.shuffle().map(
        lambda data_point: tokenizer(
            generate_prompt(data_point),
            truncation=True,
            max_length=CUTOFF_LEN,
            padding="max_length",
        )
    )

    trainer = Trainer(
        model=model,
        train_dataset=data["train"],
        args=TrainingArguments(
            per_device_train_batch_size=MICRO_BATCH_SIZE,
            gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
            warmup_steps=100,
            num_train_epochs=EPOCHS,
            learning_rate=LEARNING_RATE,
            fp16=True,
            logging_steps=10,
            output_dir="my output directory",
            save_total_limit=3,
            gradient_checkpointing=False,  # Disable gradient checkpointing
        ),
        data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    )
    model.config.use_cache = False
    trainer.train(resume_from_checkpoint=False)

    model.save_pretrained("my model path")


if __name__ == "__main__":
    main()

Phi-2 does not currently support gradient checkpointing. You need to set `model = prepare_model_for_int8_training(model, use_gradient_checkpointing=False)` and then reduce the max sequence length if you run into OOM errors.
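For reference, here is a minimal sketch of that change applied to the script in the question; the reduced `CUTOFF_LEN` of 128 is only an illustrative value, not a recommendation from this thread.

```python
from peft import prepare_model_for_int8_training

# `model` is the 8-bit Phi-2 model loaded earlier in the script.
# Prepare it for int8 training *without* gradient checkpointing, since the
# stock PhiForCausalLM implementation rejects checkpointing.
model = prepare_model_for_int8_training(model, use_gradient_checkpointing=False)

# If turning checkpointing off leads to OOM, shrink the tokenized sequence length.
CUTOFF_LEN = 128  # illustrative value; the original script used 200
```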

@Aditiyadav You can use the appropriate branch by specifying a revision to enable checkpointing:

```python
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    trust_remote_code=True,
    flash_attn=True,
    flash_rotary=True,
    fused_dense=True,
    low_cpu_mem_usage=True,
    device_map={"": 0},
    revision="refs/pr/23",
)
```
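Assuming the `refs/pr/23` revision does support checkpointing, it can then be switched back on with the standard Transformers mechanisms, roughly like this (a sketch; the batch sizes and output path are placeholders, not values from this thread):

```python
from transformers import TrainingArguments

# With the model loaded from the patched revision above, enable gradient
# checkpointing the usual Transformers way before building the Trainer.
model.gradient_checkpointing_enable()
model.config.use_cache = False  # the KV cache conflicts with checkpointing

training_args = TrainingArguments(
    output_dir="phi2-lora",          # placeholder output path
    per_device_train_batch_size=4,   # placeholder micro-batch size
    gradient_accumulation_steps=8,   # placeholder accumulation
    gradient_checkpointing=True,     # allowed once the revision supports it
    fp16=True,
)
```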

I am able to run it now by setting `use_gradient_checkpointing=False`.

Aditiyadav changed discussion status to closed
