import shelve

import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
#model = AutoModelForCausalLM.from_pretrained("EleutherAI/gpt-neo-1.3B", torch_dtype="auto", trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B", trust_remote_code=True)
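# GPT-Neo has no dedicated pad token, so reuse EOS for padding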
tokenizer.pad_token = tokenizer.eos_token
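# mood.db is expected to hold newline-separated example sentences under the 'happy' and 'sad' keys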
moodb = shelve.open('mood.db')
happy, sad = moodb['happy'][1].split('\n'), moodb['sad'][1].split('\n')
# Fold the completion label directly into each training string
for i, h in enumerate(happy):
    happy[i] = "Prompt:" + h + "Completion: You're feeling happy"
for i, s in enumerate(sad):
    sad[i] = "Prompt:" + s + "Completion: You're feeling sad"
happy = list(zip(happy, ["You're happy" for d in range(len(happy))]))
sad = list(zip(sad, ["You're sad" for d in range(len(sad))]))
data = sad+happy
#print(data)
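# Note: only the 'Prompt' column (which already embeds the completion text) is tokenized below;
# the separate 'Completion' column is not used during training.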
df = pd.DataFrame(data, columns=['Prompt', 'Completion'])
#print(df)
def tokenize(sample):
    tokenized_text = tokenizer(sample['Prompt'], padding=True, truncation=True, max_length=512)
    return tokenized_text
data = Dataset.from_pandas(df)
tokenized_data = data.map(tokenize, batched=True, desc="Tokenizing data", remove_columns=data.column_names)
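# 4-bit NF4 quantization (with double quantization) keeps the 1.3B model small enough for a single GPU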
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)
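# Load the quantized base model onto GPU 0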
model = AutoModelForCausalLM.from_pretrained(
"EleutherAI/gpt-neo-1.3B",
device_map={"":0},
trust_remote_code=True,
quantization_config=bnb_config
)
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    # GPT-Neo's attention projections are named q_proj/k_proj/v_proj/out_proj;
    # the original "Wqkv" target belongs to other architectures and would not match any module here
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
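# model.print_trainable_parameters()  # optional: report how many parameters LoRA actually trains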
training_arguments = TrainingArguments(
    output_dir="Multi-lingual-finetuned-med-text",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=1000,
    # Note: when max_steps is set it overrides num_train_epochs
    max_steps=55550,
    num_train_epochs=1
)
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data,
    args=training_arguments,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)
trainer.train()
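# trainer.save_model()  # optional: persist the LoRA adapter weights to output_dir after training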
#peft_model = PeftModel.from_pretrained(model, "/root/projects/Multi-lingual-finetuned-med-text/checkpoint-10/", from_transformers=True)
#model = peft_model.merge_and_unload()
# model
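# A minimal inference sketch (hypothetical prompt; uncomment after training):
# prompt = "Prompt: I just got some great news today Completion:"
# inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# output = model.generate(**inputs, max_new_tokens=20)
# print(tokenizer.decode(output[0], skip_special_tokens=True))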