import shelve

import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, PeftModel, get_peft_model
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)



# GPT-Neo has no pad token by default, so reuse the EOS token for padding.
tokenizer = AutoTokenizer.from_pretrained("EleutherAI/gpt-neo-1.3B")
tokenizer.pad_token = tokenizer.eos_token



# Load raw example sentences from the shelve database; each mood entry is a
# newline-separated block of text.
moodb = shelve.open('mood.db')
happy = moodb['happy'][1].split('\n')
sad = moodb['sad'][1].split('\n')
moodb.close()

# Pack each sentence into a prompt/completion training string (with explicit
# separators between the two fields), keeping the bare label in a second
# column for reference.
happy = [(f"Prompt: {h}\nCompletion: You're feeling happy", "You're happy") for h in happy]
sad = [(f"Prompt: {s}\nCompletion: You're feeling sad", "You're sad") for s in sad]

data = sad + happy
df = pd.DataFrame(data, columns=['Prompt', 'Completion'])

def tokenize(sample):
    # Tokenize the packed Prompt text; the data collator below re-pads
    # dynamically at training time, so this padding is only per map batch.
    return tokenizer(sample['Prompt'], padding=True, truncation=True, max_length=512)


data = Dataset.from_pandas(df)

tokenized_data = data.map(tokenize, batched=True, desc="Tokenizing data", remove_columns=data.column_names)


# 4-bit NF4 quantization with double quantization; fp16 compute keeps the
# QLoRA-style memory footprint small.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the 1.3B base model in 4-bit, placed entirely on GPU 0.
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-neo-1.3B",
    device_map={"": 0},
    quantization_config=bnb_config,
)
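# Optional but commonly recommended before training a quantized model with
# adapters: peft's prepare_model_for_kbit_training enables input gradients
# and casts norm layers for stable k-bit training.
from peft import prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)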




# GPT-Neo's attention projections are named q_proj/k_proj/v_proj/out_proj;
# the original "Wqkv" target belongs to Phi-style models and would not match
# any module in this architecture.
lora_config = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "out_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)
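# Sanity check: report how many parameters LoRA actually trains versus the
# frozen base model.
model.print_trainable_parameters()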



# Note: max_steps would override num_train_epochs (the original value of
# 55550 would loop far past one pass over this small dataset), so train for
# exactly one epoch instead.
training_arguments = TrainingArguments(
    output_dir="Multi-lingual-finetuned-med-text",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    learning_rate=2e-4,
    lr_scheduler_type="cosine",
    save_strategy="epoch",
    logging_steps=1000,
    num_train_epochs=1,
)



# mlm=False makes the collator produce standard causal-LM (next-token) labels
# from the input ids.
trainer = Trainer(
    model=model,
    train_dataset=tokenized_data,
    args=training_arguments,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()
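# Persist the final LoRA adapter explicitly (epoch checkpoints are already
# written via save_strategy above; the subdirectory name here is arbitrary).
model.save_pretrained("Multi-lingual-finetuned-med-text/final-adapter")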
# To reload the adapter later and fold it into the base weights
# (PeftModel.from_pretrained takes no from_transformers argument):
# peft_model = PeftModel.from_pretrained(model, "/root/projects/Multi-lingual-finetuned-med-text/checkpoint-10/")
# model = peft_model.merge_and_unload()
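# A minimal inference sketch, assuming training finished and using the prompt
# format defined above (the example sentence is hypothetical):
# prompt = "Prompt: I just got a promotion at work!\nCompletion:"
# inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
# output = model.generate(**inputs, max_new_tokens=10)
# print(tokenizer.decode(output[0], skip_special_tokens=True))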