{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import transformers\n", "from transformers import (\n", " CONFIG_MAPPING,\n", " MODEL_FOR_CAUSAL_LM_MAPPING,\n", " AutoConfig,\n", " AutoModelForCausalLM,\n", " AutoTokenizer,\n", " HfArgumentParser,\n", " Trainer,\n", " TrainingArguments,\n", " default_data_collator,\n", " is_torch_tpu_available,\n", " set_seed,\n", ")\n", "\n", "from itertools import chain\n", "\n", "from transformers.testing_utils import CaptureLogger\n", "from transformers.trainer_utils import get_last_checkpoint\n", "# from transformers.utils import check_min_version, send_example_telemetry\n", "from transformers.utils.versions import require_version\n", "\n", "import datasets\n", "from datasets import load_dataset" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# check_min_version(\"4.23.0.dev0\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "require_version(\"datasets>=1.8.0\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "set_seed(37)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Get all of the huggingface objects that we need: tokenzier, gpt2 model, poetry dataset." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/homebrew/Caskroom/miniforge/base/envs/augmented_poetry/lib/python3.8/site-packages/huggingface_hub/utils/_deprecation.py:97: FutureWarning: Deprecated argument(s) used in 'dataset_info': token. 
{ "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "tokenizer = AutoTokenizer.from_pretrained('gpt2')" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "config = AutoConfig.from_pretrained('gpt2')" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Embedding(50257, 768)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model = AutoModelForCausalLM.from_pretrained(\n", "    \"gpt2\",\n", "    config=config\n", ")\n", "# model.max_seq_length = 128  # no-op: GPT-2's context window is fixed by config.n_positions (1024);\n", "# sequence length is handled below via block_size instead.\n", "model.resize_token_embeddings(len(tokenizer))" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Dataset({\n", " features: ['author', 'content', 'poem name', 'age', 'type'],\n", " num_rows: 573\n", "})" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets['train']" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'Mythology & Folklore'" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets['train']['type'][0]" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "DatasetDict({\n", " train: Dataset({\n", " features: ['author', 'content', 'poem name', 'age', 'type'],\n", " num_rows: 573\n", " })\n", "})" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "raw_datasets" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "tok_logger = transformers.utils.logging.get_logger(\n", "    \"transformers.tokenization_utils_base\"\n", ")" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "def tokenize_function(examples):\n", "    with CaptureLogger(tok_logger) as cl:\n", "        output = tokenizer(examples[text_column_name])\n", "    # clm input could be much longer than block_size\n", "    if \"Token indices sequence length is longer than the\" in cl.out:\n", "        tok_logger.warning(\n", "            \"^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits\"\n", "            \" before being passed to the model.\"\n", "        )\n", "    return output" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "column_names = raw_datasets[\"train\"].column_names\n", "# text_column_name = \"text\" if \"text\" in column_names else column_names[0]\n", "text_column_name = \"content\"" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading cached processed dataset at /Users/matth/.cache/huggingface/datasets/merve___csv/merve--poetry-ca9a13ef5858cc3a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-62fd9c772e30c8d3.arrow\n" ] } ], "source": [ "tokenized_datasets = raw_datasets.map(\n", "    tokenize_function,\n", "    batched=True,\n", "    # num_proc=data_args.preprocessing_num_workers,\n", "    remove_columns=column_names,\n", "    # load_from_cache_file=not data_args.overwrite_cache,\n", "    desc=\"Running tokenizer on dataset\",\n", ")" ] },
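{ "cell_type": "markdown", "metadata": {}, "source": [ "Optionally, verify the tokenization round-trips: decode the first few token ids and check they match the start of the first poem. The `input_ids` column is what the `tokenizer(...)` call in `tokenize_function` produces." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Optional check on the output of the map call above.\n", "example = tokenized_datasets[\"train\"][0]\n", "print(len(example[\"input_ids\"]), \"tokens in the first poem\")\n", "print(tokenizer.decode(example[\"input_ids\"][:40]))" ] },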
{ "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "block_size = tokenizer.model_max_length  # 1024 for GPT-2" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "# Main data processing function that concatenates all texts from our dataset and generates chunks of block_size.\n", "def group_texts(examples):\n", "    # Concatenate all texts.\n", "    concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}\n", "    total_length = len(concatenated_examples[list(examples.keys())[0]])\n", "    # We drop the small remainder; we could pad instead if the model supported it.\n", "    # Customize this part to your needs.\n", "    if total_length >= block_size:\n", "        total_length = (total_length // block_size) * block_size\n", "    # Split into chunks of block_size.\n", "    result = {\n", "        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n", "        for k, t in concatenated_examples.items()\n", "    }\n", "    # For causal LM the labels equal the inputs; the model shifts them internally.\n", "    result[\"labels\"] = result[\"input_ids\"].copy()\n", "    return result" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading cached processed dataset at /Users/matth/.cache/huggingface/datasets/merve___csv/merve--poetry-ca9a13ef5858cc3a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-88d7c64be469684a.arrow\n" ] } ], "source": [ "lm_datasets = tokenized_datasets.map(\n", "    group_texts,\n", "    batched=True,\n", "    # num_proc=data_args.preprocessing_num_workers,\n", "    # load_from_cache_file=not data_args.overwrite_cache,\n", "    desc=f\"Grouping texts in chunks of {block_size}\",\n", ")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "train_dataset = lm_datasets[\"train\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Do the fine-tuning" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "import os\n", "\n", "training_args = TrainingArguments(\n", "    output_dir=\"gpt2-poetry-model\",\n", "    overwrite_output_dir=True,\n", "    # per_gpu_train_batch_size=256\n", "    per_device_train_batch_size=16,\n", "    push_to_hub=True,\n", "    # Never hard-code a real Hub token in a notebook; read it from the environment.\n", "    push_to_hub_token=os.environ.get(\"HF_TOKEN\")\n", ")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# Initialize our Trainer\n", "trainer = Trainer(\n", "    model=model,\n", "    args=training_args,\n", "    train_dataset=train_dataset,\n", "    # eval_dataset=eval_dataset,\n", "    tokenizer=tokenizer,\n", "    # Data collator will default to DataCollatorWithPadding, so we change it.\n", "    data_collator=default_data_collator,\n", "    # compute_metrics=compute_metrics\n", "    # if training_args.do_eval and not is_torch_tpu_available()\n", "    # else None,\n", "    # preprocess_logits_for_metrics=preprocess_logits_for_metrics\n", "    # if training_args.do_eval and not is_torch_tpu_available()\n", "    # else None,\n", ")" ] },
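{ "cell_type": "markdown", "metadata": {}, "source": [ "Before launching training, it can help to sample from the untuned model so there is a baseline to compare the fine-tuned poetry against. This is a minimal sketch; the prompt and generation settings below are illustrative, not tuned." ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# Baseline sample from the untuned GPT-2 (illustrative prompt and settings).\n", "import torch\n", "\n", "inputs = tokenizer(\"The moon\", return_tensors=\"pt\")\n", "with torch.no_grad():\n", "    out = model.generate(\n", "        **inputs,\n", "        max_new_tokens=30,\n", "        do_sample=True,\n", "        top_p=0.95,\n", "        pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token\n", "    )\n", "print(tokenizer.decode(out[0], skip_special_tokens=True))" ] }, { "cell_type": "code",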
"execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "***** Running training *****\n", " Num examples = 171\n", " Num Epochs = 3\n", " Instantaneous batch size per device = 8\n", " Total train batch size (w. parallel, distributed & accumulation) = 8\n", " Gradient Accumulation steps = 1\n", " Total optimization steps = 66\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "59ebc6f251bd42e4bd3474b574614d1f", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/66 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stderr", "output_type": "stream", "text": [ "\n", "\n", "Training completed. Do not forget to share your model on huggingface.co/models =)\n", "\n", "\n", "Saving model checkpoint to tmp_trainer\n", "Configuration saved in tmp_trainer/config.json\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "{'train_runtime': 2967.2818, 'train_samples_per_second': 0.173, 'train_steps_per_second': 0.022, 'train_loss': 4.249474265358665, 'epoch': 3.0}\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Model weights saved in tmp_trainer/pytorch_model.bin\n", "tokenizer config file saved in tmp_trainer/tokenizer_config.json\n", "Special tokens file saved in tmp_trainer/special_tokens_map.json\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "***** train metrics *****\n", " epoch = 3.0\n", " train_loss = 4.2495\n", " train_runtime = 0:49:27.28\n", " train_samples = 171\n", " train_samples_per_second = 0.173\n", " train_steps_per_second = 0.022\n" ] } ], "source": [ "# Training\n", "# checkpoint = None\n", "# train_result = trainer.train(resume_from_checkpoint=checkpoint)\n", "# trainer.save_model() # Saves the tokenizer too for easy upload\n", "\n", "# metrics = train_result.metrics\n", "\n", "# max_train_samples = (len(train_dataset))\n", "# metrics[\"train_samples\"] = min(max_train_samples, len(train_dataset))\n", "\n", "# trainer.log_metrics(\"train\", metrics)\n", "# trainer.save_metrics(\"train\", metrics)\n", "# trainer.save_state()\n", "# # Upload the the hugging face hub for easy use in inference.\n", "# trainer.push_to_hub()" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2cec8af2b332409bb857695a7b099653", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='