# autotrain-playground / train_llm.py
import os
from uuid import uuid4
import pandas as pd
from datasets import load_dataset
import subprocess
from transformers import AutoTokenizer
### Read environment variables
# from dotenv import load_dotenv,find_dotenv
# load_dotenv(find_dotenv(),override=True)
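# Uncommenting the two dotenv lines above would load variables from a local .env file;
# presumably that is where HUGGINGFACE_TOKEN (referenced in the autotrain command below) lives.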
### Functions
def max_token_len(dataset):
    """Return the length in tokens of the longest 'text' entry in a dataset split."""
    max_seq_length = 0
    for row in dataset:
        tokens = len(tokenizer(row['text'])['input_ids'])
        if tokens > max_seq_length:
            max_seq_length = tokens
    return max_seq_length
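# max_token_len is used below to size block_size and to check that the data
# fits within the model's context window before training.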
### Model details
# model_name='TinyLlama/TinyLlama-1.1B-Chat-v0.1'
model_name = 'mistralai/Mistral-7B-v0.1'
# model_name = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_max_length = tokenizer.model_max_length
print("Model Max Length:", model_max_length)
### Repo name, dataset initialization, and data directory
# Load dataset
dataset_name = 'ai-aerospace/ams_data_train_generic_v0.1_100'
dataset = load_dataset(dataset_name)
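# The dataset is expected to provide 'train' and 'validation' splits, each with a
# 'text' column, since those are the only fields written out and tokenized below.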
# Write dataset files into data directory
data_directory = './fine_tune_data/'
# Create the data directory if it doesn't exist
os.makedirs(data_directory, exist_ok=True)
# Write the train data to a CSV file
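# Only the 'text' column is written out; the autotrain SFT trainer reads a single text
# column (named 'text' by default).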
train_data = 'train_data'
train_filename = os.path.join(data_directory, train_data)
dataset['train'].to_pandas().to_csv(train_filename + '.csv', columns=['text'], index=False)
max_token_length_train = max_token_len(dataset['train'])
print('Max token length train: ' + str(max_token_length_train))
# Write the validation data to a CSV file
validation_data = 'validation_data'
validation_filename = os.path.join(data_directory, validation_data)
dataset['validation'].to_pandas().to_csv(validation_filename + '.csv', columns=['text'], index=False)
max_token_length_validation = max_token_len(dataset['validation'])
print('Max token length validation: ' + str(max_token_length_validation))
max_token_length = max(max_token_length_train, max_token_length_validation)
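# Guard against training examples that are longer than the model can handle.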
if max_token_length > model_max_length:
    raise ValueError("Maximum token length exceeds model limits.")
block_size = 2 * max_token_length
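# block_size is set to twice the longest example, presumably to leave headroom
# for packing/padding during tokenization.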
# Define project parameters
username = 'ai-aerospace'
project_name = './llms/' + 'ams_data_train-100_' + str(uuid4())
repo_name = 'ams-data-train-100-' + str(uuid4())
### Set training params
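# Most of these map one-to-one onto autotrain CLI flags below; mixed_precision and
# quantization are expressed as --fp16 and --use_int4, which together with the LoRA
# settings is effectively a QLoRA-style fine-tune. Note that target_modules is left
# empty here; the CLI may need a real value (e.g. "q_proj,v_proj") or the flag removed,
# since an empty argument can confuse the parser.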
model_params = {
    "project_name": project_name,
    "model_name": model_name,
    "repo_id": username + '/' + repo_name,
    "train_data": train_data,
    "validation_data": validation_data,
    "data_directory": data_directory,
    "block_size": block_size,
    "model_max_length": max_token_length,
    "logging_steps": -1,
    "evaluation_strategy": "epoch",
    "save_total_limit": 1,
    "save_strategy": "epoch",
    "mixed_precision": "fp16",
    "lr": 0.00003,
    "epochs": 3,
    "batch_size": 2,
    "warmup_ratio": 0.1,
    "gradient_accumulation": 1,
    "optimizer": "adamw_torch",
    "scheduler": "linear",
    "weight_decay": 0,
    "max_grad_norm": 1,
    "seed": 42,
    "quantization": "int4",
    "target_modules": "",
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05
}
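# Export every parameter as an environment variable. The command string below formats
# values straight from model_params, so this export appears to be informational rather
# than consumed by the autotrain CLI itself.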
for key, value in model_params.items():
    os.environ[key] = str(value)
### Feed into and run autotrain command
# Set .venv and execute the autotrain script
# To see all parameters: autotrain llm --help
# !autotrain llm --train --project_name my-llm --model TinyLlama/TinyLlama-1.1B-Chat-v0.1 --data_path . --use-peft --use_int4 --learning_rate 2e-4 --train_batch_size 6 --num_train_epochs 3 --trainer sft
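# HUGGINGFACE_TOKEN below is a placeholder for a Hugging Face Hub write token
# (e.g. loaded via the dotenv lines at the top of this script).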
command=f"""
autotrain llm --train \
--trainer sft \
--project_name {model_params['project_name']} \
--model {model_params['model_name']} \
--data_path {model_params['data_directory']} \
--train_split {model_params['train_data']} \
--valid_split {model_params['validation_data']} \
--repo_id {model_params['repo_id']} \
--push_to_hub \
--token HUGGINGFACE_TOKEN \
--block_size {model_params['block_size']} \
--model_max_length {model_params['model_max_length']} \
--logging_steps {model_params['logging_steps']} \
--evaluation_strategy {model_params['evaluation_strategy']} \
--save_total_limit {model_params['save_total_limit']} \
--save_strategy {model_params['save_strategy']} \
--fp16 \
--lr {model_params['lr']} \
--num_train_epochs {model_params['epochs']} \
--batch_size {model_params['batch_size']} \
--warmup_ratio {model_params['warmup_ratio']} \
--gradient_accumulation {model_params['gradient_accumulation']} \
--optimizer {model_params['optimizer']} \
--scheduler {model_params['scheduler']} \
--weight_decay {model_params['weight_decay']} \
--max_grad_norm {model_params['max_grad_norm']} \
--seed {model_params['seed']} \
--use_int4 \
--target_modules {model_params['target_modules']} \
--use-peft \
--lora_r {model_params['lora_r']} \
--lora_alpha {model_params['lora_alpha']} \
--lora_dropout {model_params['lora_dropout']}
"""
# Use subprocess.run() to execute the command
subprocess.run(command, shell=True, check=True)
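# check=True makes subprocess.run raise CalledProcessError if autotrain exits with a
# non-zero status, so a failed training run stops the script instead of failing silently.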