{ "cells": [ { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import transformers\n", "from transformers import (\n", " CONFIG_MAPPING,\n", " MODEL_FOR_CAUSAL_LM_MAPPING,\n", " AutoConfig,\n", " AutoModelForCausalLM,\n", " AutoTokenizer,\n", " HfArgumentParser,\n", " Trainer,\n", " TrainingArguments,\n", " default_data_collator,\n", " is_torch_tpu_available,\n", " set_seed,\n", ")\n", "\n", "from itertools import chain\n", "\n", "from transformers.testing_utils import CaptureLogger\n", "from transformers.trainer_utils import get_last_checkpoint\n", "# from transformers.utils import check_min_version, send_example_telemetry\n", "from transformers.utils.versions import require_version\n", "\n", "import datasets\n", "from datasets import load_dataset" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "# check_min_version(\"4.23.0.dev0\")" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "require_version(\"datasets>=1.8.0\")" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "set_seed(37)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "##### Get all of the huggingface objects that we need: tokenzier, gpt2 model, poetry dataset." ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/opt/homebrew/Caskroom/miniforge/base/envs/augmented_poetry/lib/python3.8/site-packages/huggingface_hub/utils/_deprecation.py:97: FutureWarning: Deprecated argument(s) used in 'dataset_info': token. 
Will not be supported from version '0.12'.\n", " warnings.warn(message, FutureWarning)\n", "Using custom data configuration merve--poetry-ca9a13ef5858cc3a\n", "Found cached dataset csv (/Users/matth/.cache/huggingface/datasets/merve___csv/merve--poetry-ca9a13ef5858cc3a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a)\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "67606d054e4a4b2f9ddf99f07c02c328", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/1 [00:00= block_size:\n", " total_length = (total_length // block_size) * block_size\n", " # Split by chunks of max_len.\n", " result = {\n", " k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n", " for k, t in concatenated_examples.items()\n", " }\n", " result[\"labels\"] = result[\"input_ids\"].copy()\n", " return result" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Loading cached processed dataset at /Users/matth/.cache/huggingface/datasets/merve___csv/merve--poetry-ca9a13ef5858cc3a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a/cache-88d7c64be469684a.arrow\n" ] } ], "source": [ "lm_datasets = tokenized_datasets.map(\n", " group_texts,\n", " batched=True,\n", " # num_proc=data_args.preprocessing_num_workers,\n", " # load_from_cache_file=not data_args.overwrite_cache,\n", " desc=f\"Grouping texts in chunks of {block_size}\",\n", ")" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "train_dataset = lm_datasets[\"train\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "#### Do the fine-tuning" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "training_args = TrainingArguments(\n", " output_dir=\"gpt2-poetry-model\", \n", " overwrite_output_dir=True,\n", " # per_gpu_train_batch_size=256\n", " 
per_device_train_batch_size=16,\n", "    push_to_hub=True,\n", "    # SECURITY: a hard-coded push_to_hub_token was removed from this cell. Never commit\n", "    # secrets to notebooks -- authenticate via notebook_login() (see the last cell) or\n", "    # 'huggingface-cli login'; the Trainer then reads the token from the local HF\n", "    # credential cache. The previously committed token must be revoked on hf.co.\n", ")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "# Initialize our Trainer\n", "trainer = Trainer(\n", " model=model,\n", " args=training_args,\n", " train_dataset=train_dataset,\n", " # eval_dataset=eval_dataset,\n", " tokenizer=tokenizer,\n", " # Data collator will default to DataCollatorWithPadding, so we change it.\n", " data_collator=default_data_collator,\n", " # compute_metrics=compute_metrics\n", " # if training_args.do_eval and not is_torch_tpu_available()\n", " # else None,\n", " # preprocess_logits_for_metrics=preprocess_logits_for_metrics\n", " # if training_args.do_eval and not is_torch_tpu_available()\n", " # else None,\n", ")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "***** Running training *****\n", " Num examples = 171\n", " Num Epochs = 3\n", " Instantaneous batch size per device = 8\n", " Total train batch size (w. 
parallel, distributed & accumulation) = 8\n", " Gradient Accumulation steps = 1\n", " Total optimization steps = 66\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "59ebc6f251bd42e4bd3474b574614d1f", "version_major": 2, "version_minor": 0 }, "text/plain": [ " 0%| | 0/66 [00:00 3\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpush_to_hub\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", "File \u001b[0;32m/opt/homebrew/Caskroom/miniforge/base/envs/augmented_poetry/lib/python3.8/site-packages/transformers/trainer.py:2677\u001b[0m, in \u001b[0;36mTrainer.push_to_hub\u001b[0;34m(self, commit_message, blocking, **kwargs)\u001b[0m\n\u001b[1;32m 2674\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39mis_world_process_zero():\n\u001b[1;32m 2675\u001b[0m \u001b[39mreturn\u001b[39;00m\n\u001b[0;32m-> 2677\u001b[0m git_head_commit_url \u001b[39m=\u001b[39m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49mrepo\u001b[39m.\u001b[39mpush_to_hub(commit_message\u001b[39m=\u001b[39mcommit_message, blocking\u001b[39m=\u001b[39mblocking)\n\u001b[1;32m 2678\u001b[0m \u001b[39m# push separately the model card to be independant from the rest of the model\u001b[39;00m\n\u001b[1;32m 2679\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39margs\u001b[39m.\u001b[39mshould_save:\n", "\u001b[0;31mAttributeError\u001b[0m: 'Trainer' object has no attribute 'repo'" ] } ], "source": [ "from huggingface_hub import notebook_login\n", "notebook_login()\n", "trainer.push_to_hub()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.10.6 ('augmented_poetry')", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.13" }, 
"orig_nbformat": 4, "vscode": { "interpreter": { "hash": "00664817f4a09ab74dd392ee5a8d12e3606381c26df296db9ea5c334bb5d1b65" } } }, "nbformat": 4, "nbformat_minor": 2 }