In [1]:
# !pip install transformers
# !pip install torch
# !pip install accelerate -U

#### Below is the funtion to find trainable parameters of the Model. 

In [5]:
sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())

737641472

In [1]:
import pandas as pd
import json
import torch


In [2]:

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,  AdamW, TrainingArguments, Trainer
from torch.utils.data import TensorDataset

tokenizer = AutoTokenizer.from_pretrained("microsoft/GODEL-v1_1-large-seq2seq", padding_side='right', truncation_side='left')


In [5]:
model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/GODEL-v1_1-large-seq2seq").to('cuda')

#### Here the data preprocessed, Note that the data loaded to this model is in the following format. It is in the form of mulit-turn conversation between two persons.
#### [[person1, person2, person1, person2, person1, person2],
#### [person1, person2, person1, person2, person1, person2],
#### [person1, person2, person1, person2, person1, person2],
#### [person1, person2, person1, person2, person1, person2],
#### [person1, person2, person1, person2, person1, person2]]

In [6]:
def read_data_from_txt(file_path):
    try:
        with open(file_path, 'rb') as file:
            content = file.readlines()
        content = [_.decode('utf-8').strip() for _ in content]
        content = '\n'.join(content)

        # Split the content based on the delimiter (triple single quotes)
        data_list = content.split("''','''")

        # Remove empty elements from the list
        data_list = [section.strip("'''") for section in data_list]
        data_list = [_.strip().split('\n') for _ in data_list]

        return data_list
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")
        return None
    except Exception as e:
        print(f"Error occurred while reading the file: {e}")
        return None


In [7]:

file_path = 'your_data.txt'
data_list = read_data_from_txt(file_path)


In [8]:
training_data = data_list


In [10]:

def create_input_output(data_list):
    input_data = []
    output_data = []
    instructions = "You are Woice AI. Answer the queires relevant to rev9 Solutions only. If not relevant, asnwer 'I applogize, I can't answer your question as I am just an AI chatbot.'"
    knowledge = ""
    for lines in data_list:
        for i in range(1, len(lines), 2):
            input_lines = lines[:i]
            input_text = ' EOS '.join(input_lines).strip()
            input_data.append(f'[INSTRUCTION] {instructions} [CONTEXT] ' + input_text )
            output_data.append(lines[i] + ' EOS')
    return input_data, output_data


In [11]:

train_input, train_output = create_input_output(training_data)

In [13]:
def generation_tokenized_dataset(input, output):
    
    input_tokens = tokenizer(input, padding="longest", truncation=True, return_tensors="pt", max_length=768)
    output_tokens = tokenizer(output, padding="longest", truncation=True, return_tensors="pt", max_length=768)
    dataset = TensorDataset(input_tokens.input_ids, input_tokens.attention_mask,
                            output_tokens.input_ids, output_tokens.attention_mask)

    return dataset


In [14]:
train_set = generation_tokenized_dataset(train_input, train_output)


In [15]:
class CustomDataCollator:
    def __call__(self, features):
        input_ids = torch.stack([f[0] for f in features])
        attention_mask = torch.stack([f[1] for f in features])
        labels = torch.stack([f[2] for f in features])

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': labels
        }


In [None]:
import torch
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
optimizer = AdamW(model.parameters(), lr=1e-5)

In [17]:
from transformers import EarlyStoppingCallback

In [18]:
from transformers import get_linear_schedule_with_warmup

In [17]:
callbacks = [EarlyStoppingCallback(early_stopping_patience=4)]

In [19]:
lr_scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                               num_warmup_steps=300,
                                               num_training_steps=1200)

In [20]:
training_args = TrainingArguments(
    output_dir='./godel/v0.0.5',
    num_train_epochs= 20,
    per_device_train_batch_size=2,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./godel/v0.0.5/logs',
    logging_steps=50,
    save_total_limit=1,
    gradient_accumulation_steps=8,
    learning_rate=0.001,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,
    save_strategy='epoch',
    evaluation_strategy='epoch'

)

training_args = training_args.set_lr_scheduler(name='linear',
                                              num_epochs=40,
                                              warmup_steps=100)


#### Here model is evaluated and trained on the same dataset as I was short on the dataset. If you have a large dataset, split them with the desired ratio (recommended=  15:85, respectively)

In [21]:


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=train_set,
    data_collator=CustomDataCollator(),
    callbacks=callbacks,

)

In [None]:
trainer.train()

In [23]:
trainer.evaluate(train_set)

{'eval_loss': 0.00055647426052019,
 'eval_runtime': 79.6939,
 'eval_samples_per_second': 16.036,
 'eval_steps_per_second': 2.008,
 'epoch': 39.56}

In [24]:
trainer.save_model()
trainer.save_state()
tokenizer.save_pretrained(trainer.args.output_dir)

('./godel/v0.0.5/tokenizer_config.json',
 './godel/v0.0.5/special_tokens_map.json',
 './godel/v0.0.5/tokenizer.json')

#### You can chat with your model here. Pass in instrucions or knowledge as you desire.

In [25]:
from time import time 

In [26]:
def generate(instruction, dialog, knowledge):
    if knowledge != '':
        knowledge = '[KNOWLEDGE] ' + knowledge
    dialog = ' EOS '.join(dialog)
    query = f"{instruction} [CONTEXT] {dialog} {knowledge}"
    t = time()
    
    input_ids = tokenizer(f"{query}", return_tensors="pt").to('cuda').input_ids
    outputs = model.generate(input_ids, max_length=32102, min_length=8, top_p=0.9, do_sample=True)
    output = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print('time:', time() - t)
    return output

In [None]:
dialog = list()
while True:
    query = input("Human: ")
    dialog.append(query)
    instruction = "You are Woice AI, you are here to answer queries emphatically. Don't be rude and say vulgar words. Any thing unrelated to your training, do not answer randomly. Be polite."
    knowledge = ''
    output = "AI: " + generate(instruction, dialog, knowledge)
    dialog.append(output)
    print(output)