Hi! I'd like to ask: could you put out a tutorial?


How was the model trained?
Is there a tutorial?

There is no tutorial yet, but the model architecture mainly follows PaliGemma, and the training part just uses the Hugging Face Trainer.
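
For reference, each record in the pre-training set looks roughly like the sketch below. The field names are inferred from the collate_fn in the script, so treat this as an assumption rather than the official schema:

from datasets import load_dataset

# stream a single record to inspect its structure (field names inferred from collate_fn below)
ds = load_dataset("benchang1110/TaiVision-pretrain-1M", split="train", streaming=True)
example = next(iter(ds))
print(example["image"])                        # a PIL image
print(example["conversations"][0]["content"])  # user prompt, fed to the processor as text
print(example["conversations"][1]["content"])  # assistant answer, fed as suffix (labels)

The full training script is below: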

from datasets import load_dataset
from transformers import AutoProcessor, AutoModelForCausalLM, TrainingArguments, Trainer
import torch
from peft import get_peft_model, LoraConfig
import argparse
import wandb

torch.set_num_threads(1)
torch.multiprocessing.set_sharing_strategy('file_system')
torch.cuda.empty_cache()
from datasets import disable_caching
disable_caching()

parser = argparse.ArgumentParser()
parser.add_argument("--save", type=str, default="checkpoint", help="save path")
parser.add_argument("--export", type=str, default="TaiVision_base", help="export path")
parser.add_argument("--epoch", type=int, default=1, help="number of training epochs")
parser.add_argument("--batch_size", type=int, default=16, help="batch size for training")
parser.add_argument("--lr", type=float, default=5e-5, help="learning rate")
parser.add_argument("--test_size", type=float, default=0.01, help="test size")
args = parser.parse_args()


# load the processor at module level so collate_fn can use it inside spawned dataloader workers
processor = AutoProcessor.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)

# collator: build PaliGemma-style (prompt, image, suffix) batches with the processor
def collate_fn(batch):
    # each record holds an image and a two-turn conversation: user prompt, assistant answer
    texts = [example['conversations'][0]['content'] for example in batch]
    labels = [example['conversations'][1]['content'] for example in batch]
    images = [example["image"] for example in batch]
    tokens = processor(text=texts, images=images, suffix=labels,
                       return_tensors="pt", padding="longest",
                       tokenize_newline_separately=False)
    # BatchFeature.to(dtype) casts only the floating-point tensors (pixel_values);
    # integer token ids and masks are left untouched
    tokens = tokens.to(torch.float16)
    return tokens

if __name__ == "__main__":
    torch.multiprocessing.set_start_method('spawn')  # 'spawn' keeps the CUDA context out of forked dataloader workers
    device = "cuda" if torch.cuda.is_available() else "cpu"
    wandb.init(project="TaiVision", name="init")
    # load dataset
    ds = load_dataset('benchang1110/TaiVision-pretrain-1M', split="train")
    ds.cleanup_cache_files()
    
    dataset = ds.train_test_split(test_size=args.test_size)
    train_ds = dataset['train']
    test_ds = dataset['test']
    print(train_ds, test_ds)
    
    # the processor loaded at module level is reused here (and inside spawned dataloader workers)
    # load the base model; LoRA is applied to the language model, and the vision projector is trained in full
    model = AutoModelForCausalLM.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        use_rslora=True,
        target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
        task_type="CAUSAL_LM",
    )
    model = get_peft_model(model, lora_config)
    # full parameter training for vision projector
    for param in model.vision_projector.parameters():
        param.requires_grad = True

    model = model.to(device)
    model.print_trainable_parameters()
    
    data_collator = collate_fn
    
    training_args = TrainingArguments(
        output_dir=args.save, #The output directory
        overwrite_output_dir=True, #overwrite the content of the output directory
        num_train_epochs=args.epoch, # number of training epochs
        per_device_train_batch_size=args.batch_size, # batch size for training
        per_device_eval_batch_size=args.batch_size,  # batch size for evaluation
        learning_rate=args.lr,
        weight_decay=1e-4,
        warmup_ratio=0.1,
        max_grad_norm=1.0,  # gradient clipping
        
        fp16=True,
        gradient_accumulation_steps=1,
        remove_unused_columns=False,
        logging_strategy="steps",
        logging_first_step=True,
        logging_steps=10,
        eval_strategy="steps",
        load_best_model_at_end=True,
        save_steps=5000,
        save_total_limit=3,
        
        eval_accumulation_steps=1,
        dataloader_num_workers=4,
        dataloader_pin_memory=True,
        gradient_checkpointing=False,
        eval_steps=5000,
        auto_find_batch_size=True,
        report_to="wandb",
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_ds,
        eval_dataset=test_ds,
        data_collator=data_collator,
    )
    
    trainer.train()
    
    merge_model = model.merge_and_unload()  # merge the LoRA adapters into the base weights (modifies the model in place)
    merge_model.save_pretrained(args.export)
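
The script can be run directly, e.g. python train.py --batch_size 16 --lr 5e-5 (use whatever filename you saved it under). To sanity-check the exported checkpoint, here is a minimal inference sketch: the processor call mirrors the training collator above, just without the suffix argument. Since TaiVision is a trust_remote_code model, the exact generate/decode interface is an assumption based on its PaliGemma-style design, so adjust it if the actual implementation differs.

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

# the processor still comes from the hub repo; only the merged model was exported above
processor = AutoProcessor.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)
# "TaiVision_base" is the default --export path of the training script; assumes a CUDA GPU
model = AutoModelForCausalLM.from_pretrained("TaiVision_base", trust_remote_code=True,
                                             torch_dtype=torch.float16).to("cuda")

image = Image.open("example.jpg")  # hypothetical test image
# no suffix at inference time; the model generates the answer itself
inputs = processor(text="描述這張圖片", images=image, return_tensors="pt").to("cuda", torch.float16)
with torch.no_grad():
    output = model.generate(**inputs, max_new_tokens=128)
print(processor.decode(output[0], skip_special_tokens=True))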

If you have any questions, feel free to point them out. Thank you!

Can the underlying LLM be swapped out?

Yes. Just change the language_model settings in configuration_taivisionlm.py.
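
For reference, a hypothetical sketch of what that change could look like from Python. The field name language_model is taken from the reply above; check configuration_taivisionlm.py for the exact schema (it may hold a nested sub-config rather than a single value), and note that building a model from an edited config starts from randomly initialized weights.

from transformers import AutoConfig, AutoModelForCausalLM

# load the custom TaiVisionLM configuration shipped with the repo
config = AutoConfig.from_pretrained("benchang1110/TaiVision-base", trust_remote_code=True)
# point the language-model entry at another checkpoint (hypothetical field value)
config.language_model = "another-org/another-causal-lm"
# rebuild the model from the edited config; weights are randomly initialized here
model = AutoModelForCausalLM.from_config(config, trust_remote_code=True)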
