{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import datasets\n", "import transformers\n", "import torch\n", "\n", "from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "MODEL = \"EleutherAI/pythia-125m-deduped\"\n", "\n", "config = AutoConfig.from_pretrained(MODEL)\n", "tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)\n", "model = AutoModelForCausalLM.from_pretrained(MODEL)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Added 1 tokens!\n" ] } ], "source": [ "# @title Extend model\n", "\n", "num_added_tokens = tokenizer.add_special_tokens({\"sep_token\": \"<|STK_SP|>\"})\n", "print(f\"Added {num_added_tokens} tokens!\")\n", "model.resize_token_embeddings(len(tokenizer))\n", "\n", "# TODO: ???\n", "tokenizer.pad_token = tokenizer.eos_token\n", "\n", "assert tokenizer.sep_token == \"<|STK_SP|>\"" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration default-b39c74bc29b6f917\n", "Found cached dataset json (C:/Users/lego-/.cache/huggingface/datasets/json/default-b39c74bc29b6f917/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "a5ad5093bc064d4096b9646f195e4723", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/2 [00:00, ?it/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# @title Load in the dataset\n", "\n", "from datasets import load_dataset\n", "\n", "data_files = {\n", " \"train\": \"./dataset-r1/train.jsonl\",\n", " \"validation\": \"./dataset-r1/valid.jsonl\",\n", "}\n", "\n", "raw_datasets = load_dataset(\n", " \"json\",\n", " data_files=data_files,\n", ")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading cached processed dataset at C:\\Users\\lego-\\.cache\\huggingface\\datasets\\json\\default-b39c74bc29b6f917\\0.0.0\\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\\cache-d06df8923a2befa8.arrow\n", "Loading cached processed dataset at C:\\Users\\lego-\\.cache\\huggingface\\datasets\\json\\default-b39c74bc29b6f917\\0.0.0\\0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51\\cache-847113bf21349cf9.arrow\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Total processed datasets sizes are 2755 150\n" ] } ], "source": [ "# @title Tokenize the dataset\n", "tokenized_datasets = raw_datasets.map(\n", " lambda e: tokenizer(e[\"input\"] + e[\"output\"] + tokenizer.eos_token),\n", " #batched=True,\n", " #num_proc=4,\n", " remove_columns=[\"input\", \"output\", \"coder\", \"system\", \"god\", \"user\", \"ai\", \"topic\"]\n", ")\n", "\n", "for i in range(len(tokenized_datasets[\"train\"])):\n", " if len(tokenized_datasets[\"train\"][i][\"input_ids\"]) > config.max_position_embeddings:\n", " print(f\"Error in {i} of train\")\n", "for i in range(len(tokenized_datasets[\"validation\"])):\n", " if len(tokenized_datasets[\"validation\"][i][\"input_ids\"]) > config.max_position_embeddings:\n", " print(f\"Error in {i} of validation\")\n", "\n", "# [tokenized_datasets[\"train\"][1], tokenized_datasets[\"validation\"][1]]\n", "print(\"Total processed datasets sizes are \", len(tokenized_datasets[\"train\"]), len(tokenized_datasets[\"validation\"]))" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0cad348a2c094680ac2b0ab5e7dc2c8c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Grouping texts in chunks of 2048: 0%| | 0/3 [00:00, ?ba/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "eef956243d5542fcbf41bfdaa04ad5ea", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Grouping texts in chunks of 2048: 0%| | 0/1 [00:00, ?ba/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "Total LM datasets sizes are 628 31\n" ] } ], "source": [ "# TODO: maybe group?\n", "\n", "from itertools import chain\n", "\n", "block_size = 2048\n", "def group_texts(examples):\n", " # Concatenate all texts.\n", " #print(list(chain(*examples['input_ids'])))\n", " concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}\n", " total_length = len(concatenated_examples[list(examples.keys())[0]])\n", " # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can\n", " # customize this part to your needs.\n", " if total_length >= block_size:\n", " total_length = (total_length // block_size) * block_size\n", " # Split by chunks of max_len.\n", " result = {\n", " k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n", " for k, t in concatenated_examples.items()\n", " }\n", " result[\"labels\"] = result[\"input_ids\"].copy()\n", " return result\n", "\n", "lm_datasets = tokenized_datasets.map(\n", " group_texts,\n", " batched=True,\n", " # num_proc=data_args.preprocessing_num_workers,\n", " load_from_cache_file=False,\n", " desc=f\"Grouping texts in chunks of {block_size}\",\n", ")\n", "\n", "print(\"Total LM datasets sizes are \", len(lm_datasets[\"train\"]), len(lm_datasets[\"validation\"]))" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using magick windows DLL!\n", "CUDA SETUP: Loading binary d:\\projects\\python\\distilchatgpt2\\venv\\lib\\site-packages\\bitsandbytes\\libbitsandbytes_cudaall.dll...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Using cuda_amp half precision backend\n" ] } ], "source": [ "from transformers import Trainer, TrainingArguments, default_data_collator, DataCollatorWithPadding\n", "from transformers.trainer_pt_utils import get_parameter_names\n", "import evaluate\n", "\n", "import bitsandbytes as bnb\n", "from bitsandbytes.optim import GlobalOptimManager\n", "\n", "def preprocess_logits_for_metrics(logits, labels):\n", " if isinstance(logits, tuple):\n", " # Depending on the model and config, logits may contain extra tensors,\n", " # like past_key_values, but logits always come first\n", " logits = logits[0]\n", " return logits.argmax(dim=-1)\n", "\n", "metric = evaluate.load(\"accuracy\")\n", "\n", "def compute_metrics(eval_preds):\n", " preds, labels = eval_preds\n", " # preds have the same shape as the labels, after the argmax(-1) has been calculated\n", " # by preprocess_logits_for_metrics but we need to shift the labels\n", " labels = labels[:, 1:].reshape(-1)\n", " preds = preds[:, :-1].reshape(-1)\n", " return metric.compute(predictions=preds, references=labels)\n", "\n", "model.config.use_cache = False\n", "\n", "#data_collator_pad = DataCollatorWithPadding(tokenizer)\n", "def data_collator(data_):\n", " data = default_data_collator(data_)\n", " #print(data)\n", " return {'input_ids': torch.stack([i for i in data['input_ids']]),\n", " 'attention_mask': torch.stack([i for i in data['attention_mask']]),\n", " 'labels': torch.stack([i for i in data['input_ids']])}\n", "\n", "training_args = TrainingArguments(\n", " \"./openchatgpt-neox-r1.1/\",\n", " do_train=True, \n", " do_eval=True,\n", " \n", " push_to_hub=False,\n", "\n", " # Pulled from examples\n", " evaluation_strategy=\"epoch\",\n", " #learning_rate=2e-5,\n", " #weight_decay=0.01,\n", "\n", " save_steps=300,\n", "\n", " per_device_train_batch_size=1,\n", " per_device_eval_batch_size=1,\n", "\n", " gradient_accumulation_steps=2,\n", " gradient_checkpointing=True,\n", "\n", " fp16=True,\n", ")\n", "\n", "optim = bnb.optim.Adam8bit\n", "def set_optim_to_run_embedding_in_fp32(model):\n", " for module in model.modules():\n", " if isinstance(module, torch.nn.Embedding):\n", " GlobalOptimManager.get_instance().register_module_override(module, 'weight', {'optim_bits': 32})\n", "set_optim_to_run_embedding_in_fp32(model)\n", "# model.cuda()\n", "\n", "decay_parameters = get_parameter_names(model, [torch.nn.LayerNorm])\n", "decay_parameters = [name for name in decay_parameters if \"bias\" not in name]\n", "optimizer_grouped_parameters = [\n", " {\n", " \"params\": [p for n, p in model.named_parameters() if n in decay_parameters],\n", " \"weight_decay\": training_args.weight_decay,\n", " },\n", " {\n", " \"params\": [p for n, p in model.named_parameters() if n not in decay_parameters],\n", " \"weight_decay\": 0.0,\n", " },\n", "]\n", "\n", "adam_bnb_optim = optim(\n", " optimizer_grouped_parameters,\n", " betas=(training_args.adam_beta1, training_args.adam_beta2),\n", " eps=training_args.adam_epsilon,\n", " lr=training_args.learning_rate,\n", ")\n", "\n", "trainer = Trainer(\n", " model=model,\n", " #train_dataset=tokenized_datasets[\"train\"],\n", " #eval_dataset=tokenized_datasets[\"validation\"],\n", " train_dataset=lm_datasets[\"train\"],\n", " eval_dataset=lm_datasets[\"validation\"],\n", " tokenizer=tokenizer,\n", "\n", " data_collator=data_collator,\n", " compute_metrics=compute_metrics,\n", " preprocess_logits_for_metrics=preprocess_logits_for_metrics,\n", "\n", " # data_collator=lambda data: {'input_ids': torch.stack([torch.tensor(f['input_ids']) for f in data]),\n", " # 'attention_mask': torch.stack([torch.tensor(f['attention_mask']) for f in data]),\n", " # 'labels': torch.stack([torch.tensor(f['input_ids']) for f in data])},\n", "\n", " args=training_args,\n", "\n", " optimizers=(adam_bnb_optim, None),\n", ")" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "No last checkpoint detected!\n" ] } ], "source": [ "# @title Get last model checkpoint if any...\n", "\n", "from transformers.trainer_utils import get_last_checkpoint\n", "\n", "last_checkpoint = get_last_checkpoint(\"./openchatgpt-neox-r1.1/\")\n", "if last_checkpoint is None:\n", " print(\"No last checkpoint detected!\")" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "***** Running training *****\n", " Num examples = 628\n", " Num Epochs = 3\n", " Instantaneous batch size per device = 1\n", " Total train batch size (w. parallel, distributed & accumulation) = 2\n", " Gradient Accumulation steps = 2\n", " Total optimization steps = 942\n", " Number of trainable parameters = 162283008\n" ] }, { "data": { "text/html": [ "\n", "
Epoch | \n", "Training Loss | \n", "Validation Loss | \n", "Accuracy | \n", "
---|---|---|---|
1 | \n", "No log | \n", "0.881487 | \n", "0.787100 | \n", "
2 | \n", "0.811800 | \n", "0.871694 | \n", "0.791922 | \n", "
3 | \n", "0.811800 | \n", "0.896573 | \n", "0.792001 | \n", "
"
],
"text/plain": [
"