from peft import PeftModel, LoraConfig, get_peft_model
import pandas as pd
import shelve
import torch
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig,
)

#model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token  # GPT-Neo has no pad token; reuse EOS

# Load the raw mood sentences. Each shelve entry holds newline-separated examples at index 1.
moodb = shelve.open('mood.db')
happy, sad = moodb['happy'][1].split('\n'), moodb['sad'][1].split('\n')

# Build prompt/completion training strings. A newline separates the prompt
# text from the completion so the two don't run together.
for i, h in enumerate(happy):
    happy[i] = "Prompt: " + h + "\nCompletion: You're feeling happy"
for i, s in enumerate(sad):
    sad[i] = "Prompt: " + s + "\nCompletion: You're feeling sad"

# The second column is kept for inspection only; it is dropped before training.
happy = list(zip(happy, ["You're happy" for d in range(len(happy))]))
sad = list(zip(sad, ["You're sad" for d in range(len(sad))]))
data = sad + happy
#print(data)
df = pd.DataFrame(data, columns=['Prompt', 'Completion'])
#print(df)

def tokenize(sample):
    # Final padding is handled by the data collator; this mainly truncates long rows.
    tokenized_text = tokenizer(sample['Prompt'], padding=True, truncation=True, max_length=512)
    return tokenized_text

data = Dataset.from_pandas(df)
tokenized_data = data.map(tokenize, batched=True, desc="Tokenizing data", remove_columns=data.column_names)

# 4-bit NF4 quantization (QLoRA-style) so the 1.3B model fits in modest VRAM.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-neo-1.3B",
    device_map={"": 0},
    trust_remote_code=True,
    quantization_config=bnb_config
)

# Attach LoRA adapters to GPT-Neo's attention projection layers
# (Phi-style names such as "Wqkv" do not exist in this architecture).
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)

training_arguments = TrainingArguments(
    output_dir="Multi-lingual-finetuned-med-text",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=1000,
    max_steps=55550,        # note: max_steps overrides num_train_epochs when both are set
    num_train_epochs=1
)

trainer = Trainer(
    model=model,
    train_dataset=tokenized_data,
    args=training_arguments,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)  # causal LM, no masking
)

trainer.train()

#peft_model = PeftModel.from_pretrained(model, "/root/projects/Multi-lingual-finetuned-med-text/checkpoint-10/", from_transformers=True)
#model = peft_model.merge_and_unload()
# model
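
# A minimal post-training sanity check, written as a sketch: after
# trainer.train() the trained LoRA weights are already attached to `model`,
# so we can generate from it directly. The prompt string below is only an
# illustrative example, not a line taken from mood.db.
model.eval()
prompt = "Prompt: I just got a promotion at work!\nCompletion:"
inputs = tokenizer(prompt, return_tensors="pt").to("cuda:0")  # base model was loaded with device_map={"": 0}
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=16, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))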