{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "id": "cUzq1tXyk5Ga" }, "outputs": [], "source": [ "# !pip install transformers\n", "# !pip install torch\n", "# !pip install accelerate -U" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Below is the funtion to find trainable parameters of the Model. " ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "737641472" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "sum(dict((p.data_ptr(), p.numel()) for p in model.parameters()).values())" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:38:18.853671Z", "iopub.status.busy": "2023-09-12T05:38:18.853483Z", "iopub.status.idle": "2023-09-12T05:38:20.511295Z", "shell.execute_reply": "2023-09-12T05:38:20.510634Z", "shell.execute_reply.started": "2023-09-12T05:38:18.853650Z" }, "id": "_GqhK_n0JWC4" }, "outputs": [], "source": [ "import pandas as pd\n", "import json\n", "import torch\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:38:21.617293Z", "iopub.status.busy": "2023-09-12T05:38:21.616915Z", "iopub.status.idle": "2023-09-12T05:38:34.474328Z", "shell.execute_reply": "2023-09-12T05:38:34.473820Z", "shell.execute_reply.started": "2023-09-12T05:38:21.617267Z" }, "id": "FVBPeMW99Z7G" }, "outputs": [], "source": [ "\n", "from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AdamW, TrainingArguments, Trainer\n", "from torch.utils.data import TensorDataset\n", "\n", "tokenizer = AutoTokenizer.from_pretrained(\"microsoft/GODEL-v1_1-large-seq2seq\", padding_side='right', truncation_side='left')\n" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:38:37.343460Z", "iopub.status.busy": "2023-09-12T05:38:37.343116Z", 
"iopub.status.idle": "2023-09-12T05:38:43.015610Z", "shell.execute_reply": "2023-09-12T05:38:43.015175Z", "shell.execute_reply.started": "2023-09-12T05:38:37.343436Z" }, "id": "Bee7KFF2MWQ_" }, "outputs": [], "source": [ "model = AutoModelForSeq2SeqLM.from_pretrained(\"microsoft/GODEL-v1_1-large-seq2seq\").to('cuda')" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Here the data preprocessed, Note that the data loaded to this model is in the following format. It is in the form of mulit-turn conversation between two persons.\n", "#### [[person1, person2, person1, person2, person1, person2],\n", "#### [person1, person2, person1, person2, person1, person2],\n", "#### [person1, person2, person1, person2, person1, person2],\n", "#### [person1, person2, person1, person2, person1, person2],\n", "#### [person1, person2, person1, person2, person1, person2]]" ] }, { "cell_type": "code", "execution_count": 6, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:38:44.400644Z", "iopub.status.busy": "2023-09-12T05:38:44.400155Z", "iopub.status.idle": "2023-09-12T05:38:44.405992Z", "shell.execute_reply": "2023-09-12T05:38:44.405263Z", "shell.execute_reply.started": "2023-09-12T05:38:44.400620Z" }, "id": "Mjd9Us2Sr6Hq" }, "outputs": [], "source": [ "def read_data_from_txt(file_path):\n", " try:\n", " with open(file_path, 'rb') as file:\n", " content = file.readlines()\n", " content = [_.decode('utf-8').strip() for _ in content]\n", " content = '\\n'.join(content)\n", "\n", " # Split the content based on the delimiter (triple single quotes)\n", " data_list = content.split(\"''','''\")\n", "\n", " # Remove empty elements from the list\n", " data_list = [section.strip(\"'''\") for section in data_list]\n", " data_list = [_.strip().split('\\n') for _ in data_list]\n", "\n", " return data_list\n", " except FileNotFoundError:\n", " print(f\"File '{file_path}' not found.\")\n", " return None\n", " except Exception as e:\n", " print(f\"Error occurred while 
reading the file: {e}\")\n", " return None\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:38:45.632305Z", "iopub.status.busy": "2023-09-12T05:38:45.631923Z", "iopub.status.idle": "2023-09-12T05:38:45.637764Z", "shell.execute_reply": "2023-09-12T05:38:45.637089Z", "shell.execute_reply.started": "2023-09-12T05:38:45.632280Z" }, "id": "N4WTX9MfKTBX" }, "outputs": [], "source": [ "\n", "file_path = 'your_data.txt'\n", "data_list = read_data_from_txt(file_path)\n" ] }, { "cell_type": "code", "execution_count": 8, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:38:46.529136Z", "iopub.status.busy": "2023-09-12T05:38:46.528726Z", "iopub.status.idle": "2023-09-12T05:38:46.532045Z", "shell.execute_reply": "2023-09-12T05:38:46.531505Z", "shell.execute_reply.started": "2023-09-12T05:38:46.529112Z" } }, "outputs": [], "source": [ "training_data = data_list\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:38:52.640741Z", "iopub.status.busy": "2023-09-12T05:38:52.639972Z", "iopub.status.idle": "2023-09-12T05:38:52.646245Z", "shell.execute_reply": "2023-09-12T05:38:52.645854Z", "shell.execute_reply.started": "2023-09-12T05:38:52.640704Z" }, "id": "fxgyXq64Q1GP" }, "outputs": [], "source": [ "\n", "def create_input_output(data_list):\n", " input_data = []\n", " output_data = []\n", " instructions = \"You are Woice AI. Answer the queires relevant to rev9 Solutions only. 
If not relevant, asnwer 'I applogize, I can't answer your question as I am just an AI chatbot.'\"\n",
 " # Each odd-indexed turn is a reply to learn; every turn before it is its context.\n",
 " for lines in data_list:\n",
 "     for i in range(1, len(lines), 2):\n",
 "         input_lines = lines[:i]\n",
 "         input_text = ' EOS '.join(input_lines).strip()\n",
 "         input_data.append(f'[INSTRUCTION] {instructions} [CONTEXT] ' + input_text )\n",
 "         output_data.append(lines[i] + ' EOS')\n",
 " return input_data, output_data\n" ] },
{ "cell_type": "code", "execution_count": 11, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:38:54.366890Z", "iopub.status.busy": "2023-09-12T05:38:54.366544Z", "iopub.status.idle": "2023-09-12T05:38:54.371721Z", "shell.execute_reply": "2023-09-12T05:38:54.371144Z", "shell.execute_reply.started": "2023-09-12T05:38:54.366866Z" } }, "outputs": [], "source": [ "\n", "train_input, train_output = create_input_output(training_data)" ] },
{ "cell_type": "code", "execution_count": 13, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:39:10.350357Z", "iopub.status.busy": "2023-09-12T05:39:10.350006Z", "iopub.status.idle": "2023-09-12T05:39:10.354580Z", "shell.execute_reply": "2023-09-12T05:39:10.353920Z", "shell.execute_reply.started": "2023-09-12T05:39:10.350333Z" }, "id": "VyrEDi_G9NfY" }, "outputs": [], "source": [
 "def generation_tokenized_dataset(input, output):\n",
 "    \"\"\"Tokenize prompt/response pairs into a TensorDataset for seq2seq training.\n",
 "\n",
 "    Args:\n",
 "        input: List of prompt strings for the encoder.\n",
 "        output: List of target strings, aligned with `input`.\n",
 "\n",
 "    Returns:\n",
 "        TensorDataset whose rows are (input_ids, attention_mask, labels,\n",
 "        label_attention_mask); padded label positions hold -100.\n",
 "    \"\"\"\n",
 "    input_tokens = tokenizer(input, padding=\"longest\", truncation=True, return_tensors=\"pt\", max_length=768)\n",
 "    output_tokens = tokenizer(output, padding=\"longest\", truncation=True, return_tensors=\"pt\", max_length=768)\n",
 "    # The loss skips label id -100. Without this every padded target position counts as a\n",
 "    # token to predict, which dilutes the loss with trivially easy pad predictions.\n",
 "    labels = output_tokens.input_ids.masked_fill(output_tokens.attention_mask == 0, -100)\n",
 "    return TensorDataset(input_tokens.input_ids, input_tokens.attention_mask,\n",
 "                         labels, output_tokens.attention_mask)\n" ] },
{ "cell_type": "code", "execution_count": 14, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:39:11.118317Z", "iopub.status.busy": "2023-09-12T05:39:11.117702Z", "iopub.status.idle": 
"2023-09-12T05:39:11.459556Z", "shell.execute_reply": "2023-09-12T05:39:11.459151Z", "shell.execute_reply.started": "2023-09-12T05:39:11.118292Z" }, "id": "Q0IjwcBPfVEm" }, "outputs": [], "source": [ "train_set = generation_tokenized_dataset(train_input, train_output)\n" ] }, { "cell_type": "code", "execution_count": 15, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:39:12.526146Z", "iopub.status.busy": "2023-09-12T05:39:12.525838Z", "iopub.status.idle": "2023-09-12T05:39:12.530858Z", "shell.execute_reply": "2023-09-12T05:39:12.530178Z", "shell.execute_reply.started": "2023-09-12T05:39:12.526123Z" }, "id": "hhz3a3j2Sa0P" }, "outputs": [], "source": [ "class CustomDataCollator:\n", " def __call__(self, features):\n", " input_ids = torch.stack([f[0] for f in features])\n", " attention_mask = torch.stack([f[1] for f in features])\n", " labels = torch.stack([f[2] for f in features])\n", "\n", " return {\n", " 'input_ids': input_ids,\n", " 'attention_mask': attention_mask,\n", " 'labels': labels\n", " }\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:39:13.295224Z", "iopub.status.busy": "2023-09-12T05:39:13.294666Z", "iopub.status.idle": "2023-09-12T05:39:13.307836Z", "shell.execute_reply": "2023-09-12T05:39:13.307503Z", "shell.execute_reply.started": "2023-09-12T05:39:13.295200Z" }, "id": "CN5JWUqmS8wM" }, "outputs": [], "source": [ "import torch\n", "device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')\n", "model.to(device)\n", "optimizer = AdamW(model.parameters(), lr=1e-5)" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:39:14.655823Z", "iopub.status.busy": "2023-09-12T05:39:14.655033Z", "iopub.status.idle": "2023-09-12T05:39:14.659506Z", "shell.execute_reply": "2023-09-12T05:39:14.658681Z", "shell.execute_reply.started": "2023-09-12T05:39:14.655786Z" }, "id": "zfsQaXAEWZLD" }, 
"outputs": [], "source": [ "from transformers import EarlyStoppingCallback" ] }, { "cell_type": "code", "execution_count": 18, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:39:15.342624Z", "iopub.status.busy": "2023-09-12T05:39:15.342125Z", "iopub.status.idle": "2023-09-12T05:39:15.345769Z", "shell.execute_reply": "2023-09-12T05:39:15.345059Z", "shell.execute_reply.started": "2023-09-12T05:39:15.342600Z" }, "id": "zd7CDp3xXVMp" }, "outputs": [], "source": [ "from transformers import get_linear_schedule_with_warmup" ] }, { "cell_type": "code", "execution_count": 17, "metadata": { "execution": { "iopub.execute_input": "2023-09-11T11:42:31.617024Z", "iopub.status.busy": "2023-09-11T11:42:31.616702Z", "iopub.status.idle": "2023-09-11T11:42:31.620157Z", "shell.execute_reply": "2023-09-11T11:42:31.619476Z", "shell.execute_reply.started": "2023-09-11T11:42:31.617001Z" }, "id": "rcMlWRgMWcOA" }, "outputs": [], "source": [ "callbacks = [EarlyStoppingCallback(early_stopping_patience=4)]" ] }, { "cell_type": "code", "execution_count": 19, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:39:17.359370Z", "iopub.status.busy": "2023-09-12T05:39:17.358967Z", "iopub.status.idle": "2023-09-12T05:39:17.362640Z", "shell.execute_reply": "2023-09-12T05:39:17.362096Z", "shell.execute_reply.started": "2023-09-12T05:39:17.359346Z" }, "id": "WgGbwECpXXwd" }, "outputs": [], "source": [ "lr_scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,\n", " num_warmup_steps=300,\n", " num_training_steps=1200)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:39:26.782170Z", "iopub.status.busy": "2023-09-12T05:39:26.781759Z", "iopub.status.idle": "2023-09-12T05:39:26.788708Z", "shell.execute_reply": "2023-09-12T05:39:26.788007Z", "shell.execute_reply.started": "2023-09-12T05:39:26.782126Z" }, "id": "UCpUorNtUTxJ" }, "outputs": [], "source": [ "training_args = TrainingArguments(\n", " 
output_dir='./godel/v0.0.5',\n", " num_train_epochs= 20,\n", " per_device_train_batch_size=2,\n", " warmup_steps=100,\n", " weight_decay=0.01,\n", " logging_dir='./godel/v0.0.5/logs',\n", " logging_steps=50,\n", " save_total_limit=1,\n", " gradient_accumulation_steps=8,\n", " learning_rate=0.001,\n", " load_best_model_at_end=True,\n", " metric_for_best_model='loss',\n", " greater_is_better=False,\n", " save_strategy='epoch',\n", " evaluation_strategy='epoch'\n", "\n", ")\n", "\n", "training_args = training_args.set_lr_scheduler(name='linear',\n", " num_epochs=40,\n", " warmup_steps=100)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Here model is evaluated and trained on the same dataset as I was short on the dataset. If you have a large dataset, split them with the desired ratio (recommended= 15:85, respectively)" ] }, { "cell_type": "code", "execution_count": 21, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:39:27.630008Z", "iopub.status.busy": "2023-09-12T05:39:27.629250Z", "iopub.status.idle": "2023-09-12T05:39:27.642183Z", "shell.execute_reply": "2023-09-12T05:39:27.641782Z", "shell.execute_reply.started": "2023-09-12T05:39:27.629973Z" }, "id": "KxAyHTuJOBIQ" }, "outputs": [], "source": [ "\n", "\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=train_set,\n", " eval_dataset=train_set,\n", " data_collator=CustomDataCollator(),\n", " callbacks=callbacks,\n", "\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T05:39:29.327544Z", "iopub.status.busy": "2023-09-12T05:39:29.327023Z", "iopub.status.idle": "2023-09-12T09:31:20.343378Z", "shell.execute_reply": "2023-09-12T09:31:20.343016Z", "shell.execute_reply.started": "2023-09-12T05:39:29.327521Z" }, "id": "brO0zCjN9U_P" }, "outputs": [], "source": [ "trainer.train()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": { "execution": { 
"iopub.execute_input": "2023-09-12T09:31:20.344170Z", "iopub.status.busy": "2023-09-12T09:31:20.344000Z", "iopub.status.idle": "2023-09-12T09:32:40.040850Z", "shell.execute_reply": "2023-09-12T09:32:40.040458Z", "shell.execute_reply.started": "2023-09-12T09:31:20.344157Z" } }, "outputs": [ { "data": { "text/html": [ "\n", "
\n", " \n", " \n", " [160/160 01:19]\n", "
\n", " " ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "{'eval_loss': 0.00055647426052019,\n", " 'eval_runtime': 79.6939,\n", " 'eval_samples_per_second': 16.036,\n", " 'eval_steps_per_second': 2.008,\n", " 'epoch': 39.56}" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer.evaluate(train_set)" ] }, { "cell_type": "code", "execution_count": 24, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T09:33:05.820118Z", "iopub.status.busy": "2023-09-12T09:33:05.819417Z", "iopub.status.idle": "2023-09-12T09:33:08.026572Z", "shell.execute_reply": "2023-09-12T09:33:08.026139Z", "shell.execute_reply.started": "2023-09-12T09:33:05.820082Z" } }, "outputs": [ { "data": { "text/plain": [ "('./godel/v0.0.5/tokenizer_config.json',\n", " './godel/v0.0.5/special_tokens_map.json',\n", " './godel/v0.0.5/tokenizer.json')" ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "trainer.save_model()\n", "trainer.save_state()\n", "tokenizer.save_pretrained(trainer.args.output_dir)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### You can chat with your model here. Pass in instrucions or knowledge as you desire." 
] }, { "cell_type": "code", "execution_count": 25, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T09:33:11.243375Z", "iopub.status.busy": "2023-09-12T09:33:11.242979Z", "iopub.status.idle": "2023-09-12T09:33:11.246636Z", "shell.execute_reply": "2023-09-12T09:33:11.246071Z", "shell.execute_reply.started": "2023-09-12T09:33:11.243351Z" } }, "outputs": [], "source": [ "from time import time " ] },
{ "cell_type": "code", "execution_count": 26, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T09:33:11.802465Z", "iopub.status.busy": "2023-09-12T09:33:11.802159Z", "iopub.status.idle": "2023-09-12T09:33:11.807265Z", "shell.execute_reply": "2023-09-12T09:33:11.806707Z", "shell.execute_reply.started": "2023-09-12T09:33:11.802443Z" } }, "outputs": [], "source": [
 "def generate(instruction, dialog, knowledge, max_length=32102):\n",
 "    \"\"\"Sample one reply from the model for the given dialog history.\n",
 "\n",
 "    Args:\n",
 "        instruction: Instruction text placed in front of the dialog context.\n",
 "        dialog: List of turn strings, oldest first; joined with ' EOS '.\n",
 "        knowledge: Optional grounding text; '' leaves the knowledge section out.\n",
 "        max_length: Maximum length of the generated sequence, passed to model.generate.\n",
 "            The default keeps the notebook's original cap.\n",
 "\n",
 "    Returns:\n",
 "        The decoded reply with special tokens removed.\n",
 "    \"\"\"\n",
 "    if knowledge != '':\n",
 "        knowledge = '[KNOWLEDGE] ' + knowledge\n",
 "    context = ' EOS '.join(dialog)\n",
 "    # NOTE(review): create_input_output builds training prompts that start with the tag\n",
 "    # '[INSTRUCTION] ', which is not added here -- confirm the two formats should match.\n",
 "    query = f\"{instruction} [CONTEXT] {context} {knowledge}\"\n",
 "    t = time()\n",
 "\n",
 "    # Truncate like the training data (768 tokens; the tokenizer drops tokens from the left)\n",
 "    # so a long chat history cannot grow the prompt without bound.\n",
 "    inputs = tokenizer(query, return_tensors=\"pt\", truncation=True, max_length=768)\n",
 "    # Use the device that holds the model instead of assuming CUDA, so generation also\n",
 "    # works after the CPU fallback chosen earlier in the notebook.\n",
 "    inputs = inputs.to(model.device)\n",
 "    outputs = model.generate(**inputs, max_length=max_length, min_length=8, top_p=0.9, do_sample=True)\n",
 "    output = tokenizer.decode(outputs[0], skip_special_tokens=True)\n",
 "    print('time:', time() - t)\n",
 "    return output" ] },
{ "cell_type": "code", "execution_count": null, "metadata": { "execution": { "iopub.execute_input": "2023-09-12T09:41:13.476490Z", "iopub.status.busy": "2023-09-12T09:41:13.476127Z" } }, "outputs": [], "source": [
 "# The prompt pieces never change between turns, so they are defined once.\n",
 "instruction = \"You are Woice AI, you are here to answer queries emphatically. Don't be rude and say vulgar words. Any thing unrelated to your training, do not answer randomly. Be polite.\"\n",
 "knowledge = ''\n",
 "EXIT_COMMANDS = {'exit', 'quit'}  # typing one of these ends the chat\n",
 "\n",
 "dialog = list()\n",
 "while True:\n",
 "    try:\n",
 "        query = input(\"Human: \")\n",
 "    except (EOFError, KeyboardInterrupt):\n",
 "        # Closing stdin or interrupting the kernel ends the chat without a traceback.\n",
 "        break\n",
 "    if query.strip().lower() in EXIT_COMMANDS:\n",
 "        break\n",
 "    dialog.append(query)\n",
 "    reply = generate(instruction, dialog, knowledge)\n",
 "    # Human turns are stored without a \"Human: \" label, so replies are stored without\n",
 "    # \"AI: \" as well; the label is only added when printing.\n",
 "    dialog.append(reply)\n",
 "    print(\"AI: \" + reply)" ] } ], "metadata": { "accelerator": "GPU", "colab": { "gpuType": "T4", "provenance": [] }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 4 }