{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import datasets\n",
    "import transformers\n",
    "import torch\n",
    "\n",
    "from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL = \"EleutherAI/pythia-125m-deduped\"\n",
    "\n",
    "config = AutoConfig.from_pretrained(MODEL)\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)\n",
    "model = AutoModelForCausalLM.from_pretrained(MODEL)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Added 1 tokens!\n"
     ]
    }
   ],
   "source": [
    "# @title Extend model\n",
    "\n",
    "num_added_tokens = tokenizer.add_special_tokens({\"sep_token\": \"<|STK_SP|>\"})\n",
    "print(f\"Added {num_added_tokens} tokens!\")\n",
    "model.resize_token_embeddings(len(tokenizer))\n",
    "\n",
    "# TODO: ???\n",
    "tokenizer.pad_token = tokenizer.eos_token\n",
    "\n",
    "assert tokenizer.sep_token == \"<|STK_SP|>\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using custom data configuration default-b39c74bc29b6f917\n",
      "Found cached dataset json (C:/Users/lego-/.cache/huggingface/datasets/json/default-b39c74bc29b6f917/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a5ad5093bc064d4096b9646f195e4723",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "  0%|          | 0/2 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "# @title Load in the dataset\n",
    "\n",
    "from datasets import load_dataset\n",
    "\n",
    "data_files = {\n",
    "    \"train\": \"./dataset-r1/train.jsonl\",\n",
    "    \"validation\": \"./dataset-r1/valid.jsonl\",\n",
    "}\n",
    "\n",
    "raw_datasets = load_dataset(\n",
    "    \"json\",\n",
    "    data_files=data_files,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading cached processed dataset at C:\\Users\\lego-\\.cache\\huggingface\\datasets\\json\\default-b39c74bc29b6f917\\0.0.0\\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\\cache-d06df8923a2befa8.arrow\n",
      "Loading cached processed dataset at C:\\Users\\lego-\\.cache\\huggingface\\datasets\\json\\default-b39c74bc29b6f917\\0.0.0\\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\\cache-847113bf21349cf9.arrow\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total processed datasets sizes are  2755 150\n"
     ]
    }
   ],
   "source": [
    "# @title Tokenize the dataset\n",
    "tokenized_datasets = raw_datasets.map(\n",
    "    lambda e: tokenizer(e[\"input\"] + e[\"output\"] + tokenizer.eos_token),\n",
    "    #batched=True,\n",
    "    #num_proc=4,\n",
    "    remove_columns=[\"input\", \"output\", \"coder\", \"system\", \"god\", \"user\", \"ai\", \"topic\"]\n",
    ")\n",
    "\n",
    "for i in range(len(tokenized_datasets[\"train\"])):\n",
    "    if len(tokenized_datasets[\"train\"][i][\"input_ids\"]) > config.max_position_embeddings:\n",
    "        print(f\"Error in {i} of train\")\n",
    "for i in range(len(tokenized_datasets[\"validation\"])):\n",
    "    if len(tokenized_datasets[\"validation\"][i][\"input_ids\"]) > config.max_position_embeddings:\n",
    "        print(f\"Error in {i} of validation\")\n",
    "\n",
    "# [tokenized_datasets[\"train\"][1], tokenized_datasets[\"validation\"][1]]\n",
    "print(\"Total processed datasets sizes are \", len(tokenized_datasets[\"train\"]), len(tokenized_datasets[\"validation\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0cad348a2c094680ac2b0ab5e7dc2c8c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Grouping texts in chunks of 2048:   0%|          | 0/3 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "eef956243d5542fcbf41bfdaa04ad5ea",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Grouping texts in chunks of 2048:   0%|          | 0/1 [00:00<?, ?ba/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total LM datasets sizes are  628 31\n"
     ]
    }
   ],
   "source": [
    "# TODO: maybe group?\n",
    "\n",
    "from itertools import chain\n",
    "\n",
    "block_size = 2048\n",
    "def group_texts(examples):\n",
    "    # Concatenate all texts.\n",
    "    #print(list(chain(*examples['input_ids'])))\n",
    "    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}\n",
    "    total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
    "    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can\n",
    "    # customize this part to your needs.\n",
    "    if total_length >= block_size:\n",
    "        total_length = (total_length // block_size) * block_size\n",
    "    # Split by chunks of max_len.\n",
    "    result = {\n",
    "        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n",
    "        for k, t in concatenated_examples.items()\n",
    "    }\n",
    "    result[\"labels\"] = result[\"input_ids\"].copy()\n",
    "    return result\n",
    "\n",
    "lm_datasets = tokenized_datasets.map(\n",
    "    group_texts,\n",
    "    batched=True,\n",
    "    # num_proc=data_args.preprocessing_num_workers,\n",
    "    load_from_cache_file=False,\n",
    "    desc=f\"Grouping texts in chunks of {block_size}\",\n",
    ")\n",
    "\n",
    "print(\"Total LM datasets sizes are \", len(lm_datasets[\"train\"]), len(lm_datasets[\"validation\"]))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Using magick windows DLL!\n",
      "CUDA SETUP: Loading binary d:\\projects\\python\\distilchatgpt2\\venv\\lib\\site-packages\\bitsandbytes\\libbitsandbytes_cudaall.dll...\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using cuda_amp half precision backend\n"
     ]
    }
   ],
   "source": [
    "from transformers import Trainer, TrainingArguments, default_data_collator, DataCollatorWithPadding\n",
    "from transformers.trainer_pt_utils import get_parameter_names\n",
    "import evaluate\n",
    "\n",
    "import bitsandbytes as bnb\n",
    "from bitsandbytes.optim import GlobalOptimManager\n",
    "\n",
    "def preprocess_logits_for_metrics(logits, labels):\n",
    "    if isinstance(logits, tuple):\n",
    "        # Depending on the model and config, logits may contain extra tensors,\n",
    "        # like past_key_values, but logits always come first\n",
    "        logits = logits[0]\n",
    "    return logits.argmax(dim=-1)\n",
    "\n",
    "metric = evaluate.load(\"accuracy\")\n",
    "\n",
    "def compute_metrics(eval_preds):\n",
    "    preds, labels = eval_preds\n",
    "    # preds have the same shape as the labels, after the argmax(-1) has been calculated\n",
    "    # by preprocess_logits_for_metrics but we need to shift the labels\n",
    "    labels = labels[:, 1:].reshape(-1)\n",
    "    preds = preds[:, :-1].reshape(-1)\n",
    "    return metric.compute(predictions=preds, references=labels)\n",
    "\n",
    "model.config.use_cache = False\n",
    "\n",
    "#data_collator_pad = DataCollatorWithPadding(tokenizer)\n",
    "def data_collator(data_):\n",
    "    data = default_data_collator(data_)\n",
    "    #print(data)\n",
    "    return {'input_ids': torch.stack([i for i in data['input_ids']]),\n",
    "      'attention_mask': torch.stack([i for i in data['attention_mask']]),\n",
    "      'labels': torch.stack([i for i in data['input_ids']])}\n",
    "\n",
    "training_args = TrainingArguments(\n",
    "    \"./openchatgpt-neox-r1.1/\",\n",
    "    do_train=True, \n",
    "    do_eval=True,\n",
    "    \n",
    "    push_to_hub=False,\n",
    "\n",
    "    # Pulled from examples\n",
    "    evaluation_strategy=\"epoch\",\n",
    "    #learning_rate=2e-5,\n",
    "    #weight_decay=0.01,\n",
    "\n",
    "    save_steps=300,\n",
    "\n",
    "    per_device_train_batch_size=1,\n",
    "    per_device_eval_batch_size=1,\n",
    "\n",
    "    gradient_accumulation_steps=2,\n",
    "    gradient_checkpointing=True,\n",
    "\n",
    "    fp16=True,\n",
    ")\n",
    "\n",
    "optim = bnb.optim.Adam8bit\n",
    "def set_optim_to_run_embedding_in_fp32(model):\n",
    "    for module in model.modules():\n",
    "        if isinstance(module, torch.nn.Embedding):\n",
    "            GlobalOptimManager.get_instance().register_module_override(module, 'weight', {'optim_bits': 32})\n",
    "set_optim_to_run_embedding_in_fp32(model)\n",
    "# model.cuda()\n",
    "\n",
    "decay_parameters = get_parameter_names(model, [torch.nn.LayerNorm])\n",
    "decay_parameters = [name for name in decay_parameters if \"bias\" not in name]\n",
    "optimizer_grouped_parameters = [\n",
    "    {\n",
    "        \"params\": [p for n, p in model.named_parameters() if n in decay_parameters],\n",
    "        \"weight_decay\": training_args.weight_decay,\n",
    "    },\n",
    "    {\n",
    "        \"params\": [p for n, p in model.named_parameters() if n not in decay_parameters],\n",
    "        \"weight_decay\": 0.0,\n",
    "    },\n",
    "]\n",
    "\n",
    "adam_bnb_optim = optim(\n",
    "    optimizer_grouped_parameters,\n",
    "    betas=(training_args.adam_beta1, training_args.adam_beta2),\n",
    "    eps=training_args.adam_epsilon,\n",
    "    lr=training_args.learning_rate,\n",
    ")\n",
    "\n",
    "trainer = Trainer(\n",
    "    model=model,\n",
    "    #train_dataset=tokenized_datasets[\"train\"],\n",
    "    #eval_dataset=tokenized_datasets[\"validation\"],\n",
    "    train_dataset=lm_datasets[\"train\"],\n",
    "    eval_dataset=lm_datasets[\"validation\"],\n",
    "    tokenizer=tokenizer,\n",
    "\n",
    "    data_collator=data_collator,\n",
    "    compute_metrics=compute_metrics,\n",
    "    preprocess_logits_for_metrics=preprocess_logits_for_metrics,\n",
    "\n",
    "    # data_collator=lambda data: {'input_ids': torch.stack([torch.tensor(f['input_ids']) for f in data]),\n",
    "    #     'attention_mask': torch.stack([torch.tensor(f['attention_mask']) for f in data]),\n",
    "    #     'labels': torch.stack([torch.tensor(f['input_ids']) for f in data])},\n",
    "\n",
    "    args=training_args,\n",
    "\n",
    "    optimizers=(adam_bnb_optim, None),\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "No last checkpoint detected!\n"
     ]
    }
   ],
   "source": [
    "# @title Get last model checkpoint if any...\n",
    "\n",
    "from transformers.trainer_utils import get_last_checkpoint\n",
    "\n",
    "last_checkpoint = get_last_checkpoint(\"./openchatgpt-neox-r1.1/\")\n",
    "if last_checkpoint is None:\n",
    "    print(\"No last checkpoint detected!\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "***** Running training *****\n",
      "  Num examples = 628\n",
      "  Num Epochs = 3\n",
      "  Instantaneous batch size per device = 1\n",
      "  Total train batch size (w. parallel, distributed & accumulation) = 2\n",
      "  Gradient Accumulation steps = 2\n",
      "  Total optimization steps = 942\n",
      "  Number of trainable parameters = 162283008\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='942' max='942' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [942/942 1:31:15, Epoch 3/3]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Epoch</th>\n",
       "      <th>Training Loss</th>\n",
       "      <th>Validation Loss</th>\n",
       "      <th>Accuracy</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>No log</td>\n",
       "      <td>0.881487</td>\n",
       "      <td>0.787100</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>0.811800</td>\n",
       "      <td>0.871694</td>\n",
       "      <td>0.791922</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>0.811800</td>\n",
       "      <td>0.896573</td>\n",
       "      <td>0.792001</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Saving model checkpoint to ./openchatgpt-neox-r1.1/checkpoint-300\n",
      "Configuration saved in ./openchatgpt-neox-r1.1/checkpoint-300\\config.json\n",
      "Model weights saved in ./openchatgpt-neox-r1.1/checkpoint-300\\pytorch_model.bin\n",
      "tokenizer config file saved in ./openchatgpt-neox-r1.1/checkpoint-300\\tokenizer_config.json\n",
      "Special tokens file saved in ./openchatgpt-neox-r1.1/checkpoint-300\\special_tokens_map.json\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 31\n",
      "  Batch size = 1\n",
      "Saving model checkpoint to ./openchatgpt-neox-r1.1/checkpoint-600\n",
      "Configuration saved in ./openchatgpt-neox-r1.1/checkpoint-600\\config.json\n",
      "Model weights saved in ./openchatgpt-neox-r1.1/checkpoint-600\\pytorch_model.bin\n",
      "tokenizer config file saved in ./openchatgpt-neox-r1.1/checkpoint-600\\tokenizer_config.json\n",
      "Special tokens file saved in ./openchatgpt-neox-r1.1/checkpoint-600\\special_tokens_map.json\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 31\n",
      "  Batch size = 1\n",
      "Saving model checkpoint to ./openchatgpt-neox-r1.1/checkpoint-900\n",
      "Configuration saved in ./openchatgpt-neox-r1.1/checkpoint-900\\config.json\n",
      "Model weights saved in ./openchatgpt-neox-r1.1/checkpoint-900\\pytorch_model.bin\n",
      "tokenizer config file saved in ./openchatgpt-neox-r1.1/checkpoint-900\\tokenizer_config.json\n",
      "Special tokens file saved in ./openchatgpt-neox-r1.1/checkpoint-900\\special_tokens_map.json\n",
      "***** Running Evaluation *****\n",
      "  Num examples = 31\n",
      "  Batch size = 1\n",
      "\n",
      "\n",
      "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
      "\n",
      "\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "TrainOutput(global_step=942, training_loss=0.6499279856428726, metrics={'train_runtime': 5481.9853, 'train_samples_per_second': 0.344, 'train_steps_per_second': 0.172, 'total_flos': 2863022229946368.0, 'train_loss': 0.6499279856428726, 'epoch': 3.0})"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "trainer.train(resume_from_checkpoint=last_checkpoint)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "***** Running Evaluation *****\n",
      "  Num examples = 31\n",
      "  Batch size = 1\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='31' max='31' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [31/31 00:25]\n",
       "    </div>\n",
       "    "
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Perplexity: 2.45\n"
     ]
    }
   ],
   "source": [
    "import math\n",
    "eval_results = trainer.evaluate()\n",
    "print(f\"Perplexity: {math.exp(eval_results['eval_loss']):.2f}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Dropping the following result as it does not have all the necessary fields:\n",
      "{'task': {'name': 'Causal Language Modeling', 'type': 'text-generation'}, 'metrics': [{'name': 'Accuracy', 'type': 'accuracy', 'value': 0.7920008824873537}]}\n",
      "Saving model checkpoint to ./openchatgpt-neox-r1.1/\n",
      "Configuration saved in ./openchatgpt-neox-r1.1/config.json\n",
      "Model weights saved in ./openchatgpt-neox-r1.1/pytorch_model.bin\n",
      "tokenizer config file saved in ./openchatgpt-neox-r1.1/tokenizer_config.json\n",
      "Special tokens file saved in ./openchatgpt-neox-r1.1/special_tokens_map.json\n"
     ]
    }
   ],
   "source": [
    "trainer.save_state()\n",
    "trainer.create_model_card(tasks=\"text-generation\", finetuned_from=MODEL, dataset=\"openchatgpt safe-r1\")\n",
    "trainer.save_model()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.1"
  },
  "vscode": {
   "interpreter": {
    "hash": "545eac55c68d45fc1a0aaedcc380eacb641aa49675db0309d358f8f72d496c6d"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}