{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import transformers\n", "from transformers import (\n", " CONFIG_MAPPING,\n", " MODEL_FOR_CAUSAL_LM_MAPPING,\n", " AutoConfig,\n", " AutoModelForCausalLM,\n", " AutoTokenizer,\n", " HfArgumentParser,\n", " Trainer,\n", " TrainingArguments,\n", " default_data_collator,\n", " is_torch_tpu_available,\n", " set_seed,\n", ")\n", "\n", "from itertools import chain\n", "\n", "from transformers.testing_utils import CaptureLogger\n", "from transformers.trainer_utils import get_last_checkpoint\n", "# from transformers.utils import check_min_version, send_example_telemetry\n", "from transformers.utils.versions import require_version\n", "\n", "import datasets\n", "from datasets import load_dataset" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# check_min_version(\"4.23.0.dev0\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "require_version(\"datasets>=1.8.0\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "set_seed(37)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Get all of the huggingface objects that we need: tokenzier, gpt2 model, poetry dataset." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/homebrew/Caskroom/miniforge/base/envs/augmented_poetry/lib/python3.8/site-packages/huggingface_hub/utils/_deprecation.py:97: FutureWarning: Deprecated argument(s) used in 'dataset_info': token. 
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "tokenizer = AutoTokenizer.from_pretrained('gpt2')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "config = AutoConfig.from_pretrained('gpt2')" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Embedding(50257, 768)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = AutoModelForCausalLM.from_pretrained(\n", "    \"gpt2\",\n", "    config=config\n", ")\n", "# model.max_seq_length = 128  # no-op: GPT-2's context window is fixed by config.n_positions (1024);\n", "# sequence length is handled below via block_size instead.\n", "model.resize_token_embeddings(len(tokenizer))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['author', 'content', 'poem name', 'age', 'type'],\n", " num_rows: 573\n", "})" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets['train']" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Mythology & Folklore'" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets['train']['type'][0]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['author', 'content', 'poem name', 'age', 'type'],\n", " num_rows: 573\n", " })\n", "})" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "tok_logger = transformers.utils.logging.get_logger(\n", "    \"transformers.tokenization_utils_base\"\n", ")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "def tokenize_function(examples):\n", "    with CaptureLogger(tok_logger) as cl:\n", "        output = tokenizer(examples[text_column_name])\n", "    # clm input could be much longer than block_size\n", "    if \"Token indices sequence length is longer than the\" in cl.out:\n", "        tok_logger.warning(\n", "            \"^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits\"\n", "            \" before being passed to the model.\"\n", "        )\n", "    return output" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "column_names = raw_datasets[\"train\"].column_names\n", "# text_column_name = \"text\" if \"text\" in column_names else column_names[0]\n", "text_column_name = \"content\"" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading cached processed dataset at /Users/matth/.cache/huggingface/datasets/merve___csv/merve--poetry-ca9a13ef5858cc3a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-62fd9c772e30c8d3.arrow\n" ] } ], "source": [ "tokenized_datasets = raw_datasets.map(\n", "    tokenize_function,\n", "    batched=True,\n", "    # num_proc=data_args.preprocessing_num_workers,\n", "    remove_columns=column_names,\n", "    # load_from_cache_file=not data_args.overwrite_cache,\n", "    desc=\"Running tokenizer on dataset\",\n", ")" ] },
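{ "cell_type": "markdown", "metadata": {}, "source": [ "Optionally, verify the tokenization round-trips: decode the first few token ids and check they match the start of the first poem. The `input_ids` column is what the `tokenizer(...)` call in `tokenize_function` produces." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional check on the output of the map call above.\n", "example = tokenized_datasets[\"train\"][0]\n", "print(len(example[\"input_ids\"]), \"tokens in the first poem\")\n", "print(tokenizer.decode(example[\"input_ids\"][:40]))" ] },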
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "block_size = tokenizer.model_max_length  # 1024 for GPT-2" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# Main data processing function that concatenates all texts from our dataset and generates chunks of block_size.\n", "def group_texts(examples):\n", "    # Concatenate all texts.\n", "    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}\n", "    total_length = len(concatenated_examples[list(examples.keys())[0]])\n", "    # We drop the small remainder; we could pad instead if the model supported it.\n", "    # Customize this part to your needs.\n", "    if total_length >= block_size:\n", "        total_length = (total_length // block_size) * block_size\n", "    # Split into chunks of block_size.\n", "    result = {\n", "        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n", "        for k, t in concatenated_examples.items()\n", "    }\n", "    # For causal LM the labels equal the inputs; the model shifts them internally.\n", "    result[\"labels\"] = result[\"input_ids\"].copy()\n", "    return result" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading cached processed dataset at /Users/matth/.cache/huggingface/datasets/merve___csv/merve--poetry-ca9a13ef5858cc3a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-88d7c64be469684a.arrow\n" ] } ], "source": [ "lm_datasets = tokenized_datasets.map(\n", "    group_texts,\n", "    batched=True,\n", "    # num_proc=data_args.preprocessing_num_workers,\n", "    # load_from_cache_file=not data_args.overwrite_cache,\n", "    desc=f\"Grouping texts in chunks of {block_size}\",\n", ")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "train_dataset = lm_datasets[\"train\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Do the fine-tuning" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "training_args = TrainingArguments(\n", "    output_dir=\"gpt2-poetry-model\",\n", "    overwrite_output_dir=True,\n", "    # per_gpu_train_batch_size=256\n", "    per_device_train_batch_size=16,\n", "    push_to_hub=True,\n", "    # Never hard-code a real Hub token in a notebook; read it from the environment.\n", "    push_to_hub_token=os.environ.get(\"HF_TOKEN\")\n", ")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# Initialize our Trainer\n", "trainer = Trainer(\n", "    model=model,\n", "    args=training_args,\n", "    train_dataset=train_dataset,\n", "    # eval_dataset=eval_dataset,\n", "    tokenizer=tokenizer,\n", "    # Data collator will default to DataCollatorWithPadding, so we change it.\n", "    data_collator=default_data_collator,\n", "    # compute_metrics=compute_metrics\n", "    # if training_args.do_eval and not is_torch_tpu_available()\n", "    # else None,\n", "    # preprocess_logits_for_metrics=preprocess_logits_for_metrics\n", "    # if training_args.do_eval and not is_torch_tpu_available()\n", "    # else None,\n", ")" ] },
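{ "cell_type": "markdown", "metadata": {}, "source": [ "Before launching training, it can help to sample from the untuned model so there is a baseline to compare the fine-tuned poetry against. This is a minimal sketch; the prompt and generation settings below are illustrative, not tuned." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Baseline sample from the untuned GPT-2 (illustrative prompt and settings).\n", "import torch\n", "\n", "inputs = tokenizer(\"The moon\", return_tensors=\"pt\")\n", "with torch.no_grad():\n", "    out = model.generate(\n", "        **inputs,\n", "        max_new_tokens=30,\n", "        do_sample=True,\n", "        top_p=0.95,\n", "        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token\n", "    )\n", "print(tokenizer.decode(out[0], skip_special_tokens=True))" ] }, { "cell_type": "code",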
"execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "***** Running training *****\n", " Num examples = 171\n", " Num Epochs = 3\n", " Instantaneous batch size per device = 8\n", " Total train batch size (w. parallel, distributed & accumulation) = 8\n", " Gradient Accumulation steps = 1\n", " Total optimization steps = 66\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "59ebc6f251bd42e4bd3474b574614d1f", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/66 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\n", "Training completed. Do not forget to share your model on huggingface.co/models =)\n", "\n", "\n", "Saving model checkpoint to tmp_trainer\n", "Configuration saved in tmp_trainer/config.json\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'train_runtime': 2967.2818, 'train_samples_per_second': 0.173, 'train_steps_per_second': 0.022, 'train_loss': 4.249474265358665, 'epoch': 3.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Model weights saved in tmp_trainer/pytorch_model.bin\n", "tokenizer config file saved in tmp_trainer/tokenizer_config.json\n", "Special tokens file saved in tmp_trainer/special_tokens_map.json\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "***** train metrics *****\n", " epoch = 3.0\n", " train_loss = 4.2495\n", " train_runtime = 0:49:27.28\n", " train_samples = 171\n", " train_samples_per_second = 0.173\n", " train_steps_per_second = 0.022\n" ] } ], "source": [ "# Training\n", "# checkpoint = None\n", "# train_result = trainer.train(resume_from_checkpoint=checkpoint)\n", "# trainer.save_model() # Saves the tokenizer too for easy upload\n", "\n", "# metrics = train_result.metrics\n", "\n", "# max_train_samples = (len(train_dataset))\n", "# metrics[\"train_samples\"] = min(max_train_samples, len(train_dataset))\n", "\n", "# trainer.log_metrics(\"train\", metrics)\n", "# trainer.save_metrics(\"train\", metrics)\n", "# trainer.save_state()\n", "# # Upload the the hugging face hub for easy use in inference.\n", "# trainer.push_to_hub()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2cec8af2b332409bb857695a7b099653", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='