# Example inspired from https://huggingface.co/microsoft/Phi-3-vision-128k-instruct

# Import necessary libraries
from PIL import Image
import requests
from transformers import AutoModelForCausalLM
from transformers import AutoProcessor
from transformers import BitsAndBytesConfig
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, default_data_collator
import torch
import pandas as pd
from torchmetrics.text import CharErrorRate
from peft import LoraConfig, get_peft_model
from data import AlphaPenPhi3Dataset
from sklearn.model_selection import train_test_split
from datetime import datetime
import os
import evaluate

# tqdm.pandas()
os.environ["WANDB_PROJECT"] = "Alphapen"

# Define model ID
model_id = "microsoft/Phi-3-vision-128k-instruct"

# Load data
df_path = "/mnt/data1/Datasets/AlphaPen/" + "training_data.csv"
df = pd.read_csv(df_path)
df.dropna(inplace=True)

train_df, test_df = train_test_split(df, test_size=0.15, random_state=0)
# Reset the indices to start from zero
train_df.reset_index(drop=True, inplace=True)
test_df.reset_index(drop=True, inplace=True)

root_dir = "/mnt/data1/Datasets/OCR/Alphapen/clean_data/final_cropped_rotated_"

processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
tokenizer = processor.tokenizer

train_dataset = AlphaPenPhi3Dataset(
    root_dir=root_dir,
    dataframe=train_df,
    tokenizer=tokenizer,
    max_length=128,
    image_size=128,
)
eval_dataset = AlphaPenPhi3Dataset(
    root_dir=root_dir,
    dataframe=test_df.iloc[:10],
    tokenizer=tokenizer,
    max_length=128,
    image_size=128,
)
print(train_dataset[0])

# 4-bit NF4 quantization config (QLoRA-style)
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Load model with 4-bit quantization and map to CUDA
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype="auto",
    quantization_config=nf4_config,
)

# Set special tokens used for creating the decoder_input_ids from the labels
# (carried over from an encoder-decoder OCR recipe; the Phi-3 tokenizer may not
# define cls/sep tokens, in which case these end up as None)
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
# make sure vocab size is set correctly
# model.config.vocab_size = model.config.decoder.vocab_size

# for peft
# model.vocab_size = model.config.decoder.vocab_size

# Set beam search parameters
model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_new_tokens = 128
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

# LoRA configuration: adapt the attention projection layers only
lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    # target_modules="all-linear",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        # "gate_proj",
        # "up_proj",
        # "down_proj",
    ],
)

# Helper used once to list candidate target_modules (kept for reference):
# print(model)
# import torch
# from transformers import Conv1D
# def get_specific_layer_names(model):
#     # Create a list to store the layer names
#     layer_names = []
#     # Recursively visit all modules and submodules
#     for name, module in model.named_modules():
#         # Check if the module is an instance of the specified layers
#         if isinstance(module, (torch.nn.Linear, torch.nn.Embedding, torch.nn.Conv2d, Conv1D)):
#             # model name parsing
#             layer_names.append('.'.join(name.split('.')[4:]).split('.')[0])
#     return layer_names
# print(list(set(get_specific_layer_names(model))))

# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model.to(device)

# Wrap the quantized base model with LoRA adapters
model = get_peft_model(model, lora_config)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
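# Optional sanity check (not in the original recipe, but a standard peft call):
# report how many parameters the LoRA adapters leave trainable versus the
# frozen 4-bit base weights.
model.print_trainable_parameters()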
# print(model.vocab_size)

# run_name=f"Mistral-7B-SQL-QLoRA-{datetime.now().strftime('%Y-%m-%d-%H-%M-%s')}"

# Define the training arguments
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    bf16=True,
    bf16_full_eval=True,
    output_dir="./",
    logging_steps=100,
    save_steps=1000,
    eval_steps=100,
    report_to="wandb",
    run_name=f"phi3-vision-LoRA-{datetime.now().strftime('%Y-%m-%d-%H-%M-%S')}",
    optim="adamw_torch_fused",
    lr_scheduler_type="cosine",
    gradient_accumulation_steps=2,
    learning_rate=1.0e-4,
    max_steps=10000,
    push_to_hub=True,
    hub_model_id="hadrakey/alphapen_phi3",
)


def compute_metrics(pred):
    """Compute the character error rate (CER) on generated predictions."""
    # accuracy_metric = evaluate.load("precision")
    cer_metric = evaluate.load("cer")

    labels_ids = pred.label_ids
    pred_ids = pred.predictions
    print(labels_ids.shape, pred_ids.shape)
    max_length = max(pred_ids.shape[1], labels_ids.shape[1])  # currently unused

    # Note: predictions are decoded with special tokens kept (useful for the
    # debugging prints below); switch to skip_special_tokens=True for a cleaner CER.
    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=False, clean_up_tokenization_spaces=False)
    print(pred_str)
    # pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)

    # Replace the -100 label padding with the pad token id before decoding
    labels_ids[labels_ids == -100] = tokenizer.pad_token_id
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)
    print(label_str)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)
    # accuracy = accuracy_metric.compute(predictions=pred_ids.tolist(), references=labels_ids.tolist())
    return {"cer": cer}


# Define the Trainer
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=default_data_collator,
)

trainer.train()
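# Optional follow-up (a sketch; the output path below is illustrative, not part
# of the original script): persist the LoRA adapter locally and run one final
# evaluation pass over eval_dataset.
trainer.save_model("./phi3-vision-alphapen-lora")
final_metrics = trainer.evaluate()
print(final_metrics)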