{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "cUzq1tXyk5Ga"
   },
   "outputs": [],
   "source": [
    "# !pip install transformers\n",
    "# !pip install torch\n",
    "# !pip install accelerate -U"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Below is the function to find the trainable parameters of the model. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "737641472"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Count trainable parameters only; keying by data_ptr() counts tensors that share storage once.\n",
    "# `model` is created by the model-loading cell further down, so that cell has to run first.\n",
    "sum({p.data_ptr(): p.numel() for p in model.parameters() if p.requires_grad}.values())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:38:18.853671Z",
     "iopub.status.busy": "2023-09-12T05:38:18.853483Z",
     "iopub.status.idle": "2023-09-12T05:38:20.511295Z",
     "shell.execute_reply": "2023-09-12T05:38:20.510634Z",
     "shell.execute_reply.started": "2023-09-12T05:38:18.853650Z"
    },
    "id": "_GqhK_n0JWC4"
   },
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import json\n",
    "import torch\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:38:21.617293Z",
     "iopub.status.busy": "2023-09-12T05:38:21.616915Z",
     "iopub.status.idle": "2023-09-12T05:38:34.474328Z",
     "shell.execute_reply": "2023-09-12T05:38:34.473820Z",
     "shell.execute_reply.started": "2023-09-12T05:38:21.617267Z"
    },
    "id": "FVBPeMW99Z7G"
   },
   "outputs": [],
   "source": [
    "\n",
    "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,  AdamW, TrainingArguments, Trainer\n",
    "from torch.utils.data import TensorDataset\n",
    "\n",
    "# padding_side='right' pads after the text; truncation_side='left' makes over-long inputs lose\n",
    "# their leading tokens (instruction prefix and oldest turns first) rather than the latest turn.\n",
    "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/GODEL-v1_1-large-seq2seq\", padding_side='right', truncation_side='left')\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:38:37.343460Z",
     "iopub.status.busy": "2023-09-12T05:38:37.343116Z",
     "iopub.status.idle": "2023-09-12T05:38:43.015610Z",
     "shell.execute_reply": "2023-09-12T05:38:43.015175Z",
     "shell.execute_reply.started": "2023-09-12T05:38:37.343436Z"
    },
    "id": "Bee7KFF2MWQ_"
   },
   "outputs": [],
   "source": [
    "# Use the GPU when one is available and fall back to the CPU, so this cell also runs on CPU-only hosts.\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "model = AutoModelForSeq2SeqLM.from_pretrained(\"microsoft/GODEL-v1_1-large-seq2seq\").to(device)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Here the data is preprocessed. Note that the data loaded into this model is in the following format. It is in the form of multi-turn conversations between two persons.\n",
    "#### [[person1, person2, person1, person2, person1, person2],\n",
    "#### [person1, person2, person1, person2, person1, person2],\n",
    "#### [person1, person2, person1, person2, person1, person2],\n",
    "#### [person1, person2, person1, person2, person1, person2],\n",
    "#### [person1, person2, person1, person2, person1, person2]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:38:44.400644Z",
     "iopub.status.busy": "2023-09-12T05:38:44.400155Z",
     "iopub.status.idle": "2023-09-12T05:38:44.405992Z",
     "shell.execute_reply": "2023-09-12T05:38:44.405263Z",
     "shell.execute_reply.started": "2023-09-12T05:38:44.400620Z"
    },
    "id": "Mjd9Us2Sr6Hq"
   },
   "outputs": [],
   "source": [
    "def read_data_from_txt(file_path):\n",
    "    try:\n",
    "        with open(file_path, 'rb') as file:\n",
    "            content = file.readlines()\n",
    "        content = [_.decode('utf-8').strip() for _ in content]\n",
    "        content = '\\n'.join(content)\n",
    "\n",
    "        # Split the content based on the delimiter (triple single quotes)\n",
    "        data_list = content.split(\"''','''\")\n",
    "\n",
    "        # Remove empty elements from the list\n",
    "        data_list = [section.strip(\"'''\") for section in data_list]\n",
    "        data_list = [_.strip().split('\\n') for _ in data_list]\n",
    "\n",
    "        return data_list\n",
    "    except FileNotFoundError:\n",
    "        print(f\"File '{file_path}' not found.\")\n",
    "        return None\n",
    "    except Exception as e:\n",
    "        print(f\"Error occurred while reading the file: {e}\")\n",
    "        return None\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:38:45.632305Z",
     "iopub.status.busy": "2023-09-12T05:38:45.631923Z",
     "iopub.status.idle": "2023-09-12T05:38:45.637764Z",
     "shell.execute_reply": "2023-09-12T05:38:45.637089Z",
     "shell.execute_reply.started": "2023-09-12T05:38:45.632280Z"
    },
    "id": "N4WTX9MfKTBX"
   },
   "outputs": [],
   "source": [
    "\n",
    "file_path = 'your_data.txt'\n",
    "data_list = read_data_from_txt(file_path)\n",
    "# read_data_from_txt returns None on failure; stop here with a clear message instead of\n",
    "# letting the later preprocessing cells fail with an unrelated-looking error.\n",
    "if not data_list:\n",
    "    raise RuntimeError(f\"No training conversations could be loaded from '{file_path}'.\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:38:46.529136Z",
     "iopub.status.busy": "2023-09-12T05:38:46.528726Z",
     "iopub.status.idle": "2023-09-12T05:38:46.532045Z",
     "shell.execute_reply": "2023-09-12T05:38:46.531505Z",
     "shell.execute_reply.started": "2023-09-12T05:38:46.529112Z"
    }
   },
   "outputs": [],
   "source": [
    "# All parsed conversations are used for training; no hold-out split is made here.\n",
    "training_data = data_list\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:38:52.640741Z",
     "iopub.status.busy": "2023-09-12T05:38:52.639972Z",
     "iopub.status.idle": "2023-09-12T05:38:52.646245Z",
     "shell.execute_reply": "2023-09-12T05:38:52.645854Z",
     "shell.execute_reply.started": "2023-09-12T05:38:52.640704Z"
    },
    "id": "fxgyXq64Q1GP"
   },
   "outputs": [],
   "source": [
    "\n",
    "def create_input_output(data_list):\n",
    "    input_data = []\n",
    "    output_data = []\n",
    "    instructions = \"You are Woice AI. Answer the queires relevant to rev9 Solutions only. If not relevant, asnwer 'I applogize, I can't answer your question as I am just an AI chatbot.'\"\n",
    "    knowledge = \"\"\n",
    "    for lines in data_list:\n",
    "        for i in range(1, len(lines), 2):\n",
    "            input_lines = lines[:i]\n",
    "            input_text = ' EOS '.join(input_lines).strip()\n",
    "            input_data.append(f'[INSTRUCTION] {instructions} [CONTEXT] ' + input_text )\n",
    "            output_data.append(lines[i] + ' EOS')\n",
    "    return input_data, output_data\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:38:54.366890Z",
     "iopub.status.busy": "2023-09-12T05:38:54.366544Z",
     "iopub.status.idle": "2023-09-12T05:38:54.371721Z",
     "shell.execute_reply": "2023-09-12T05:38:54.371144Z",
     "shell.execute_reply.started": "2023-09-12T05:38:54.366866Z"
    }
   },
   "outputs": [],
   "source": [
    "\n",
    "# Build one (prompt, reply) pair for every reply turn in the training conversations.\n",
    "train_input, train_output = create_input_output(training_data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:39:10.350357Z",
     "iopub.status.busy": "2023-09-12T05:39:10.350006Z",
     "iopub.status.idle": "2023-09-12T05:39:10.354580Z",
     "shell.execute_reply": "2023-09-12T05:39:10.353920Z",
     "shell.execute_reply.started": "2023-09-12T05:39:10.350333Z"
    },
    "id": "VyrEDi_G9NfY"
   },
   "outputs": [],
   "source": [
    "def generation_tokenized_dataset(input, output):\n",
    "    \"\"\"Tokenize prompts and targets and bundle them into a TensorDataset.\n",
    "\n",
    "    Both lists go through the notebook-level `tokenizer` with identical settings\n",
    "    (pad to the longest item, truncate at 768 tokens, return PyTorch tensors).\n",
    "\n",
    "    Args:\n",
    "        input: Prompt strings.\n",
    "        output: Target strings, aligned with `input`.\n",
    "\n",
    "    Returns:\n",
    "        TensorDataset whose rows are (input ids, input attention mask,\n",
    "        target ids, target attention mask).\n",
    "    \"\"\"\n",
    "    tokenize_kwargs = dict(padding=\"longest\", truncation=True, return_tensors=\"pt\", max_length=768)\n",
    "    source_encoding = tokenizer(input, **tokenize_kwargs)\n",
    "    target_encoding = tokenizer(output, **tokenize_kwargs)\n",
    "    return TensorDataset(\n",
    "        source_encoding.input_ids,\n",
    "        source_encoding.attention_mask,\n",
    "        target_encoding.input_ids,\n",
    "        target_encoding.attention_mask,\n",
    "    )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:39:11.118317Z",
     "iopub.status.busy": "2023-09-12T05:39:11.117702Z",
     "iopub.status.idle": "2023-09-12T05:39:11.459556Z",
     "shell.execute_reply": "2023-09-12T05:39:11.459151Z",
     "shell.execute_reply.started": "2023-09-12T05:39:11.118292Z"
    },
    "id": "Q0IjwcBPfVEm"
   },
   "outputs": [],
   "source": [
    "# Tokenize all prompt/reply pairs up front into a single in-memory dataset.\n",
    "train_set = generation_tokenized_dataset(train_input, train_output)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:39:12.526146Z",
     "iopub.status.busy": "2023-09-12T05:39:12.525838Z",
     "iopub.status.idle": "2023-09-12T05:39:12.530858Z",
     "shell.execute_reply": "2023-09-12T05:39:12.530178Z",
     "shell.execute_reply.started": "2023-09-12T05:39:12.526123Z"
    },
    "id": "hhz3a3j2Sa0P"
   },
   "outputs": [],
   "source": [
    "class CustomDataCollator:\n",
    "    def __call__(self, features):\n",
    "        input_ids = torch.stack([f[0] for f in features])\n",
    "        attention_mask = torch.stack([f[1] for f in features])\n",
    "        labels = torch.stack([f[2] for f in features])\n",
    "\n",
    "        return {\n",
    "            'input_ids': input_ids,\n",
    "            'attention_mask': attention_mask,\n",
    "            'labels': labels\n",
    "        }\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:39:13.295224Z",
     "iopub.status.busy": "2023-09-12T05:39:13.294666Z",
     "iopub.status.idle": "2023-09-12T05:39:13.307836Z",
     "shell.execute_reply": "2023-09-12T05:39:13.307503Z",
     "shell.execute_reply.started": "2023-09-12T05:39:13.295200Z"
    },
    "id": "CN5JWUqmS8wM"
   },
   "outputs": [],
   "source": [
    "import torch\n",
    "# Select the GPU when one is available, otherwise fall back to the CPU.\n",
    "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n",
    "model.to(device)\n",
    "# NOTE(review): this optimizer is not handed to the Trainer built later in this notebook\n",
    "# (no `optimizers=` argument there), so lr=1e-5 does not appear to affect trainer.train() - confirm.\n",
    "optimizer = AdamW(model.parameters(), lr=1e-5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:39:14.655823Z",
     "iopub.status.busy": "2023-09-12T05:39:14.655033Z",
     "iopub.status.idle": "2023-09-12T05:39:14.659506Z",
     "shell.execute_reply": "2023-09-12T05:39:14.658681Z",
     "shell.execute_reply.started": "2023-09-12T05:39:14.655786Z"
    },
    "id": "zfsQaXAEWZLD"
   },
   "outputs": [],
   "source": [
    "from transformers import EarlyStoppingCallback"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:39:15.342624Z",
     "iopub.status.busy": "2023-09-12T05:39:15.342125Z",
     "iopub.status.idle": "2023-09-12T05:39:15.345769Z",
     "shell.execute_reply": "2023-09-12T05:39:15.345059Z",
     "shell.execute_reply.started": "2023-09-12T05:39:15.342600Z"
    },
    "id": "zd7CDp3xXVMp"
   },
   "outputs": [],
   "source": [
    "from transformers import get_linear_schedule_with_warmup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-11T11:42:31.617024Z",
     "iopub.status.busy": "2023-09-11T11:42:31.616702Z",
     "iopub.status.idle": "2023-09-11T11:42:31.620157Z",
     "shell.execute_reply": "2023-09-11T11:42:31.619476Z",
     "shell.execute_reply.started": "2023-09-11T11:42:31.617001Z"
    },
    "id": "rcMlWRgMWcOA"
   },
   "outputs": [],
   "source": [
    "# Stop training early once the monitored metric has not improved for 4 evaluations in a row.\n",
    "callbacks = [EarlyStoppingCallback(early_stopping_patience=4)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:39:17.359370Z",
     "iopub.status.busy": "2023-09-12T05:39:17.358967Z",
     "iopub.status.idle": "2023-09-12T05:39:17.362640Z",
     "shell.execute_reply": "2023-09-12T05:39:17.362096Z",
     "shell.execute_reply.started": "2023-09-12T05:39:17.359346Z"
    },
    "id": "WgGbwECpXXwd"
   },
   "outputs": [],
   "source": [
    "# Linear warmup over the first 300 optimizer steps, then linear decay until step 1200.\n",
    "# NOTE(review): this scheduler is not passed to the Trainer created later in this notebook - confirm it is needed.\n",
    "lr_scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,\n",
    "                                               num_warmup_steps=300,\n",
    "                                               num_training_steps=1200)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:39:26.782170Z",
     "iopub.status.busy": "2023-09-12T05:39:26.781759Z",
     "iopub.status.idle": "2023-09-12T05:39:26.788708Z",
     "shell.execute_reply": "2023-09-12T05:39:26.788007Z",
     "shell.execute_reply.started": "2023-09-12T05:39:26.782126Z"
    },
    "id": "UCpUorNtUTxJ"
   },
   "outputs": [],
   "source": [
    "training_args = TrainingArguments(\n",
    "    output_dir='./godel/v0.0.5',\n",
    "    # Schedule settings are declared only here: linear decay, 40 epochs, 100 warmup steps.\n",
    "    num_train_epochs=40,\n",
    "    lr_scheduler_type='linear',\n",
    "    warmup_steps=100,\n",
    "    per_device_train_batch_size=2,\n",
    "    gradient_accumulation_steps=8,  # 2 x 8 = 16 sequences per optimizer step and device\n",
    "    learning_rate=0.001,\n",
    "    weight_decay=0.01,\n",
    "    logging_dir='./godel/v0.0.5/logs',\n",
    "    logging_steps=50,\n",
    "    save_total_limit=1,\n",
    "    # Evaluate and checkpoint once per epoch so the checkpoint with the lowest eval loss can be restored.\n",
    "    evaluation_strategy='epoch',\n",
    "    save_strategy='epoch',\n",
    "    load_best_model_at_end=True,\n",
    "    metric_for_best_model='loss',\n",
    "    greater_is_better=False,\n",
    ")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Here the model is evaluated and trained on the same dataset because I was short on data. If you have a larger dataset, split it into evaluation and training sets with the desired ratio (recommended: 15:85, respectively)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:39:27.630008Z",
     "iopub.status.busy": "2023-09-12T05:39:27.629250Z",
     "iopub.status.idle": "2023-09-12T05:39:27.642183Z",
     "shell.execute_reply": "2023-09-12T05:39:27.641782Z",
     "shell.execute_reply.started": "2023-09-12T05:39:27.629973Z"
    },
    "id": "KxAyHTuJOBIQ"
   },
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "# NOTE: eval_dataset is the training set itself, so the evaluation loss used for early stopping\n",
    "# and best-model selection measures fit to the training data, not generalization.\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    args=training_args,\n",
    "    train_dataset=train_set,\n",
    "    eval_dataset=train_set,\n",
    "    data_collator=CustomDataCollator(),\n",
    "    callbacks=callbacks,\n",
    "\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T05:39:29.327544Z",
     "iopub.status.busy": "2023-09-12T05:39:29.327023Z",
     "iopub.status.idle": "2023-09-12T09:31:20.343378Z",
     "shell.execute_reply": "2023-09-12T09:31:20.343016Z",
     "shell.execute_reply.started": "2023-09-12T05:39:29.327521Z"
    },
    "id": "brO0zCjN9U_P"
   },
   "outputs": [],
   "source": [
    "# Long-running cell: the recorded run took close to four hours (see this cell's execution timestamps).\n",
    "trainer.train()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T09:31:20.344170Z",
     "iopub.status.busy": "2023-09-12T09:31:20.344000Z",
     "iopub.status.idle": "2023-09-12T09:32:40.040850Z",
     "shell.execute_reply": "2023-09-12T09:32:40.040458Z",
     "shell.execute_reply.started": "2023-09-12T09:31:20.344157Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='160' max='160' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [160/160 01:19]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "text/plain": [
       "{'eval_loss': 0.00055647426052019,\n",
       " 'eval_runtime': 79.6939,\n",
       " 'eval_samples_per_second': 16.036,\n",
       " 'eval_steps_per_second': 2.008,\n",
       " 'epoch': 39.56}"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Evaluated on the training data itself, so this loss reflects fit rather than generalization.\n",
    "trainer.evaluate(train_set)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T09:33:05.820118Z",
     "iopub.status.busy": "2023-09-12T09:33:05.819417Z",
     "iopub.status.idle": "2023-09-12T09:33:08.026572Z",
     "shell.execute_reply": "2023-09-12T09:33:08.026139Z",
     "shell.execute_reply.started": "2023-09-12T09:33:05.820082Z"
    }
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('./godel/v0.0.5/tokenizer_config.json',\n",
       " './godel/v0.0.5/special_tokens_map.json',\n",
       " './godel/v0.0.5/tokenizer.json')"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Save the model and the trainer state, then write the tokenizer files to the trainer's output directory.\n",
    "trainer.save_model()\n",
    "trainer.save_state()\n",
    "tokenizer.save_pretrained(trainer.args.output_dir)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### You can chat with your model here. Pass in instructions or knowledge as you desire."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T09:33:11.243375Z",
     "iopub.status.busy": "2023-09-12T09:33:11.242979Z",
     "iopub.status.idle": "2023-09-12T09:33:11.246636Z",
     "shell.execute_reply": "2023-09-12T09:33:11.246071Z",
     "shell.execute_reply.started": "2023-09-12T09:33:11.243351Z"
    }
   },
   "outputs": [],
   "source": [
    "from time import time "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T09:33:11.802465Z",
     "iopub.status.busy": "2023-09-12T09:33:11.802159Z",
     "iopub.status.idle": "2023-09-12T09:33:11.807265Z",
     "shell.execute_reply": "2023-09-12T09:33:11.806707Z",
     "shell.execute_reply.started": "2023-09-12T09:33:11.802443Z"
    }
   },
   "outputs": [],
   "source": [
    "def generate(instruction, dialog, knowledge):\n",
    "    \"\"\"Sample one model reply for a multi-turn dialog.\n",
    "\n",
    "    Args:\n",
    "        instruction: Instruction text placed at the start of the prompt.\n",
    "        dialog: List of turn strings, oldest first; joined with ' EOS '.\n",
    "        knowledge: Optional grounding text; pass '' to omit it.\n",
    "\n",
    "    Returns:\n",
    "        The decoded reply with special tokens removed. The elapsed time is printed.\n",
    "    \"\"\"\n",
    "    if knowledge != '':\n",
    "        knowledge = '[KNOWLEDGE] ' + knowledge\n",
    "    dialog = ' EOS '.join(dialog)\n",
    "    # NOTE(review): prompts built by create_input_output start with '[INSTRUCTION] ', this one\n",
    "    # does not - confirm the difference between training and inference prompts is intended.\n",
    "    query = f\"{instruction} [CONTEXT] {dialog} {knowledge}\"\n",
    "    t = time()\n",
    "\n",
    "    # Follow the model's device instead of hard-coding 'cuda', and forward the attention\n",
    "    # mask together with the input ids.\n",
    "    inputs = tokenizer(query, return_tensors=\"pt\").to(model.device)\n",
    "    # NOTE(review): max_length=32102 is far larger than any realistic reply - confirm the intended cap.\n",
    "    outputs = model.generate(**inputs, max_length=32102, min_length=8, top_p=0.9, do_sample=True)\n",
    "    output = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
    "    print('time:', time() - t)\n",
    "    return output"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "execution": {
     "iopub.execute_input": "2023-09-12T09:41:13.476490Z",
     "iopub.status.busy": "2023-09-12T09:41:13.476127Z"
    }
   },
   "outputs": [],
   "source": [
    "dialog = list()\n",
    "# These do not change between turns, so they are set once before the loop.\n",
    "instruction = \"You are Woice AI, you are here to answer queries emphatically. Don't be rude and say vulgar words. Any thing unrelated to your training, do not answer randomly. Be polite.\"\n",
    "knowledge = ''\n",
    "while True:\n",
    "    # Type 'exit' or 'quit' (or interrupt the kernel) to leave the chat cleanly.\n",
    "    try:\n",
    "        query = input(\"Human: \")\n",
    "    except (EOFError, KeyboardInterrupt):\n",
    "        break\n",
    "    if query.strip().lower() in ('exit', 'quit'):\n",
    "        break\n",
    "    if not query.strip():\n",
    "        # Skip empty input so it does not become an empty turn in the history.\n",
    "        continue\n",
    "    dialog.append(query)\n",
    "    reply = generate(instruction, dialog, knowledge)\n",
    "    # Store the raw reply: the \"AI: \" label is for display only and must not be fed back\n",
    "    # to the model as part of the dialog context.\n",
    "    dialog.append(reply)\n",
    "    output = \"AI: \" + reply\n",
    "    print(output)"
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}