Hi! I'd like to ask: could you put together a tutorial?
#1 · opened by win10
How was the model trained?
Is there a tutorial?
There is no tutorial yet, but the model architecture mainly follows PaliGemma, and the training part simply uses the Hugging Face Trainer:
from datasets import load_dataset
from transformers import AutoProcessor, AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
import torch
from peft import get_peft_model, LoraConfig
import argparse
import wandb
import transformers
import datasets
torch.set_num_threads(1)
torch.multiprocessing.set_sharing_strategy('file_system')
torch.cuda.empty_cache()
from datasets import disable_caching
disable_caching()
parser = argparse.ArgumentParser()
parser.add_argument("--save", type=str, default="checkpoint", help="save path")
parser.add_argument("--export", type=str, default="TaiVision_base", help="export path")
parser.add_argument("--epoch", type=int, default=1, help="number of training epochs")
parser.add_argument("--batch_size", type=int, default=16, help="batch size for training")
parser.add_argument("--lr", type=float, default=5e-5, help="learning rate")
parser.add_argument("--test_size", type=float, default=0.01, help="test size")
args = parser.parse_args()
processor = AutoProcessor.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)
# collate_fn
def collate_fn(batch):
    # first conversation turn = user prompt, second turn = assistant answer
    texts = [example['conversations'][0]['content'] for example in batch]
    labels = [example['conversations'][1]['content'] for example in batch]
    images = [example["image"] for example in batch]
    tokens = processor(text=texts, images=images, suffix=labels,
                       return_tensors="pt", padding="longest",
                       tokenize_newline_separately=False)
    tokens = tokens.to(torch.float16)
    return tokens
if __name__ == "__main__":
    torch.multiprocessing.set_start_method('spawn')  # 'spawn' is needed so CUDA works with multiple dataloader workers
    device = "cuda" if torch.cuda.is_available() else "cpu"
    wandb.init(project="TaiVision", name="init")
    # load dataset
    ds = load_dataset('benchang1110/TaiVision-pretrain-1M', split="train")
    ds.cleanup_cache_files()
    dataset = ds.train_test_split(test_size=args.test_size)
    train_ds = dataset['train']
    test_ds = dataset['test']
    print(train_ds, test_ds)
    processor = AutoProcessor.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)
    # load model ==> apply lora at language model and vision projector
    model = AutoModelForCausalLM.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        use_rslora=True,
        target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    # full parameter training for vision projector
    for param in model.vision_projector.parameters():
        param.requires_grad = True
    model = model.to(device)
    model.print_trainable_parameters()
    data_collator = collate_fn
    training_args = TrainingArguments(
        output_dir=args.save,  # the output directory
        overwrite_output_dir=True,  # overwrite the content of the output directory
        num_train_epochs=args.epoch,  # number of training epochs
        per_device_train_batch_size=args.batch_size,  # batch size for training
        per_device_eval_batch_size=args.batch_size,  # batch size for evaluation
        learning_rate=args.lr,
        weight_decay=1e-4,
        warmup_ratio=0.1,
        max_grad_norm=1.0,  # gradient clipping
        fp16=True,
        gradient_accumulation_steps=1,
        remove_unused_columns=False,
        logging_strategy="steps",
        logging_first_step=True,
        logging_steps=10,
        eval_strategy="steps",
        load_best_model_at_end=True,
        save_steps=5000,
        save_total_limit=3,
        eval_accumulation_steps=1,
        dataloader_num_workers=4,
        dataloader_pin_memory=True,
        gradient_checkpointing=False,
        eval_steps=5000,
        auto_find_batch_size=True,
        report_to="wandb",
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        data_collator=data_collator,
    )
    trainer.train()
    merge_model = model.merge_and_unload()  # merge the LoRA weights back into the base model (modifies the state dict)
    merge_model.save_pretrained(args.export)
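For reference, with the argparse defaults above a run would look roughly like python train.py --batch_size 16 --lr 5e-5 --epoch 1 --save checkpoint --export TaiVision_base, where train.py is just a placeholder for wherever you saved this script; checkpoints are written to --save and the merged model is exported to --export.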
If anything is wrong, corrections are welcome. Thank you!
Can the underlying LLM be swapped out?
Yes, just change the language_model parameter in configuration_taivisionlm.py.
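As a rough illustration, a runtime override might look like the sketch below. This assumes the remote config exposes language_model as an attribute that selects the LLM backbone (in the same spirit as PaliGemma's text_config); the attribute layout and the replacement checkpoint are assumptions, so check configuration_taivisionlm.py for the real field names, and keep in mind the vision projector must still match the new LLM's hidden size.

# Hypothetical sketch -- not the actual TaiVisionLM API; verify against configuration_taivisionlm.py.
from transformers import AutoConfig, AutoModelForCausalLM

config = AutoConfig.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)
# Assumed attribute name (taken from the answer above); the value here is an illustrative backbone.
config.language_model = "Qwen/Qwen2-0.5B"
# Build the model from the modified config instead of editing the file directly.
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)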