|
|
|
import shelve

import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

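# Load the GPT-Neo 1.3B tokenizer; GPT-Neo ships without a pad token, so the EOS token is reused for padding.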
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

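# Read the raw mood sentences. mood.db is assumed to be a shelve file whose 'happy' and 'sad'
# entries hold newline-separated example texts at index 1.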
moodb = shelve.open('mood.db')
happy, sad = moodb['happy'][1].split('\n'), moodb['sad'][1].split('\n')
moodb.close()

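# Format each sentence as a prompt/completion pair so the causal LM sees both the input text
# and the target mood in a single training string.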
for i, h in enumerate(happy):
    happy[i] = "Prompt: " + h + "\nCompletion: You're feeling happy"

for i, s in enumerate(sad):
    sad[i] = "Prompt: " + s + "\nCompletion: You're feeling sad"

# Keep a short label alongside each training string.
happy = list(zip(happy, ["You're happy"] * len(happy)))
sad = list(zip(sad, ["You're sad"] * len(sad)))

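# Combine both classes into one table; the 'Completion' label column is only kept for
# inspection and is dropped again during tokenization.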
data = sad + happy
df = pd.DataFrame(data, columns=['Prompt', 'Completion'])

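# Tokenize the combined prompt/completion string; truncation keeps sequences at or below 512 tokens.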
def tokenize(sample):
    tokenized_text = tokenizer(sample['Prompt'], padding=True, truncation=True, max_length=512)
    return tokenized_text

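# Convert the DataFrame into a Hugging Face Dataset and tokenize it, dropping the raw text columns.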
data = Dataset.from_pandas(df)
tokenized_data = data.map(tokenize, batched=True, desc="Tokenizing data", remove_columns=data.column_names)

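# 4-bit NF4 quantization with double quantization and fp16 compute (a QLoRA-style setup).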
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

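# Load the quantized base model and place it entirely on GPU 0.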
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-neo-1.3B",
    device_map={"": 0},
    trust_remote_code=True,
    quantization_config=bnb_config,
)

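# LoRA adapter configuration (rank-16 adapters on the attention projections).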
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    # GPT-Neo's attention projections are named q_proj/k_proj/v_proj/out_proj;
    # "Wqkv" belongs to other architectures and is not present in this model.
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

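# Wrap the base model with LoRA adapters; only the adapter weights are updated during training.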
model = get_peft_model(model, lora_config)

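# Training hyperparameters; checkpoints are written once per epoch.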
training_arguments = TrainingArguments(
    output_dir="Multi-lingual-finetuned-med-text",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=1000,
    # Note: a positive max_steps overrides num_train_epochs.
    max_steps=55550,
    num_train_epochs=1,
)

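# mlm=False makes the collator build standard causal-LM labels from the input IDs.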
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data,
    args=training_arguments,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

trainer.train()