Matthew Hollings committed on
Commit 9de53c6 · 1 Parent(s): 5497d17

Fine-tune a GPT model and load it into the interface.

.gitignore CHANGED
@@ -1,3 +1,5 @@
 __pycache__
 flagged/
-gutenberg-dammit-files-v002.zip
+gutenberg-dammit-files-v002.zip
+tmp_trainer
+*.gz
README.md CHANGED
@@ -10,23 +10,29 @@ pinned: false
 ---
 
 - 1. fine-tune a large language model (LLM) using the text corpus of a specific poet
-- 1.1 if possible the entire poem should be used for generating the next line not
-just the last line
-- 2. build a web interface for a user to prompt and then respond
-- 2.1 the poem should persist on machine reload
-- 2.2 it should be possible to remove the last line and rerun
-- 2.3 retry to get a new response from the model
+
+- select a certain rhyme from the gutenberg corpus and fine-tune on this
+- try fine-tuning on a few lines of a poem that Eva has started
 
 run in a docker container and transfer to another machine
 
+Is it better to have a sequence-to-sequence transformer trained on successive lines of the poetry corpus?
+
+merve/poetry only has 573 rows.
+
+TODO: upload the Gutenberg poetry corpus to Hugging Face; ask its author first.
+
 ## Research
 
 <https://github.com/aparrish/gutenberg-dammit/>
-TODO:
-automatically activate conda env on cd in directory
 implement language generation with a basic transformer
-get the website running to display responses in a user-friendly way
-Docker image?
 
 <https://github.com/aparrish/gutenberg-poetry-corpus>
 Gutenberg Poetry Autocomplete, a search engine-like interface for writing poems mined from Project Gutenberg. (A poem written using this interface was recently published in the Indianapolis Review!)
+
+https://ymeadows.com/en-articles/fine-tuning-transformer-based-language-models
+https://thegradient.pub/prompting/
+https://towardsdatascience.com/fine-tuning-for-domain-adaptation-in-nlp-c47def356fd6
+https://ruder.io/recent-advances-lm-fine-tuning/
+
+https://streamlit.io/
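A quick sketch related to the README notes above (not yet part of the repo): check how small merve/poetry really is, and build naive successive-line pairs from the Gutenberg poetry corpus to probe the sequence-to-sequence question. It assumes gutenberg-poetry-v001.ndjson.gz has already been downloaded as in fine-tune-llm.ipynb.

```python
# Sketch only: dataset size plus naive (previous line -> next line) pairs.
import gzip
import json

from datasets import load_dataset

poetry = load_dataset("merve/poetry")
print(poetry["train"].num_rows)  # 573 rows, probably too few on its own

# Successive lines from the Gutenberg poetry corpus; a real version should only
# pair lines that share the same 'gid' so pairs never cross book boundaries.
lines = [json.loads(l)["s"] for l in gzip.open("gutenberg-poetry-v001.ndjson.gz")]
pairs = list(zip(lines, lines[1:]))
print(pairs[0])
```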
app.py CHANGED
@@ -3,7 +3,7 @@ import gradio as gr
 from transformers import pipeline
 
 # Set up the generative model transformer pipeline
-generator = pipeline("text-generation", model="gpt2")
+generator = pipeline("text-generation", model="tmp_trainer")
 
 # A sequence of lines, both those typed in and those generated so far;
 # when save is clicked the txt file is downloaded
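Since app.py now points the pipeline at the locally saved tmp_trainer model, something like the sketch below could prompt with the whole poem so far (per point 1.1 in the README) rather than only the last line. The respond() helper and its generation settings are illustrative, not what app.py currently does.

```python
from transformers import pipeline

generator = pipeline("text-generation", model="tmp_trainer")

def respond(poem_so_far: str) -> str:
    # Prompt with the entire poem so far and keep only the newly generated text.
    out = generator(poem_so_far, max_new_tokens=30, do_sample=True, num_return_sequences=1)
    continuation = out[0]["generated_text"][len(poem_so_far):]
    # Use the first non-empty generated line as the next line of the poem.
    new_lines = [l for l in continuation.splitlines() if l.strip()]
    return new_lines[0].strip() if new_lines else continuation.strip()
```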
fine-tune-llm.ipynb CHANGED
@@ -379,30 +379,32 @@
379
  },
380
  {
381
  "cell_type": "code",
382
- "execution_count": 17,
383
  "metadata": {},
384
  "outputs": [
385
  {
386
  "data": {
387
  "text/plain": [
388
- "{'Author': ['Jules Verne'],\n",
389
- " 'Author Birth': [1828],\n",
390
- " 'Author Death': [1905],\n",
391
- " 'Author Given': ['Jules'],\n",
392
- " 'Author Surname': ['Verne'],\n",
393
  " 'Copyright Status': ['Not copyrighted in the United States.'],\n",
394
  " 'Language': ['English'],\n",
395
- " 'LoC Class': ['PQ: Language and Literatures: Romance literatures: French, Italian, Spanish, Portuguese'],\n",
396
- " 'Num': '103',\n",
397
- " 'Subject': ['Adventure stories', 'Voyages around the world -- Fiction'],\n",
398
- " 'Title': ['Around the World in 80 Days'],\n",
399
  " 'charset': 'us-ascii',\n",
400
- " 'gd-num-padded': '00103',\n",
401
- " 'gd-path': '001/00103.txt',\n",
402
- " 'href': '/1/0/103/103.zip'}"
403
  ]
404
  },
405
- "execution_count": 17,
406
  "metadata": {},
407
  "output_type": "execute_result"
408
  }
@@ -410,7 +412,7 @@
410
  "source": [
411
  "from gutenbergdammit.ziputils import loadmetadata\n",
412
  "metadata = loadmetadata(\"gutenberg-dammit-files-v002.zip\")\n",
413
- "metadata[100]\n",
414
  "# ['Essays in the Art of Writing']"
415
  ]
416
  },
@@ -557,6 +559,118 @@
557
  "tf.config.list_physical_devices('CPU')"
558
  ]
559
  },
560
  {
561
  "cell_type": "code",
562
  "execution_count": null,
 
379
  },
380
  {
381
  "cell_type": "code",
382
+ "execution_count": 23,
383
  "metadata": {},
384
  "outputs": [
385
  {
386
  "data": {
387
  "text/plain": [
388
+ "{'Author': ['Franklin Delano Roosevelt'],\n",
389
+ " 'Author Birth': [1882],\n",
390
+ " 'Author Death': [1945],\n",
391
+ " 'Author Given': ['Franklin Delano'],\n",
392
+ " 'Author Surname': ['Roosevelt'],\n",
393
  " 'Copyright Status': ['Not copyrighted in the United States.'],\n",
394
  " 'Language': ['English'],\n",
395
+ " 'LoC Class': ['E740: History: America: Twentieth century'],\n",
396
+ " 'Num': '104',\n",
397
+ " 'Subject': ['New Deal, 1933-1939',\n",
398
+ " 'Presidents -- United States -- Inaugural addresses',\n",
399
+ " 'United States -- Politics and government -- 1933-1945'],\n",
400
+ " 'Title': [\"Franklin Delano Roosevelt's First Inaugural Address\"],\n",
401
  " 'charset': 'us-ascii',\n",
402
+ " 'gd-num-padded': '00104',\n",
403
+ " 'gd-path': '001/00104.txt',\n",
404
+ " 'href': '/1/0/104/104.zip'}"
405
  ]
406
  },
407
+ "execution_count": 23,
408
  "metadata": {},
409
  "output_type": "execute_result"
410
  }
 
412
  "source": [
413
  "from gutenbergdammit.ziputils import loadmetadata\n",
414
  "metadata = loadmetadata(\"gutenberg-dammit-files-v002.zip\")\n",
415
+ "metadata[101]\n",
416
  "# ['Essays in the Art of Writing']"
417
  ]
418
  },
 
559
  "tf.config.list_physical_devices('CPU')"
560
  ]
561
  },
562
+ {
563
+ "cell_type": "markdown",
564
+ "metadata": {},
565
+ "source": [
566
+ "# Source data"
567
+ ]
568
+ },
569
+ {
570
+ "cell_type": "markdown",
571
+ "metadata": {},
572
+ "source": [
573
+ "curl -O http://static.decontextualize.com/gutenberg-poetry-v001.ndjson.gz"
574
+ ]
575
+ },
576
+ {
577
+ "cell_type": "code",
578
+ "execution_count": 25,
579
+ "metadata": {},
580
+ "outputs": [],
581
+ "source": [
582
+ "import gzip, json\n",
583
+ "all_lines = []\n",
584
+ "for line in gzip.open(\"gutenberg-poetry-v001.ndjson.gz\"):\n",
585
+ " all_lines.append(json.loads(line.strip()))"
586
+ ]
587
+ },
588
+ {
589
+ "cell_type": "code",
590
+ "execution_count": 34,
591
+ "metadata": {},
592
+ "outputs": [
593
+ {
594
+ "name": "stdout",
595
+ "output_type": "stream",
596
+ "text": [
597
+ "[{'s': 'The Song of Hiawatha is based on the legends and stories of', 'gid': '19'}, {'s': 'many North American Indian tribes, but especially those of the', 'gid': '19'}, {'s': 'Ojibway Indians of northern Michigan, Wisconsin, and Minnesota.', 'gid': '19'}, {'s': 'They were collected by Henry Rowe Schoolcraft, the reknowned', 'gid': '19'}, {'s': 'Schoolcraft married Jane, O-bah-bahm-wawa-ge-zhe-go-qua (The', 'gid': '19'}, {'s': 'fur trader, and O-shau-gus-coday-way-qua (The Woman of the Green', 'gid': '19'}, {'s': 'Prairie), who was a daughter of Waub-o-jeeg (The White Fisher),', 'gid': '19'}, {'s': 'who was Chief of the Ojibway tribe at La Pointe, Wisconsin.', 'gid': '19'}, {'s': 'Jane and her mother are credited with having researched,', 'gid': '19'}, {'s': 'authenticated, and compiled much of the material Schoolcraft', 'gid': '19'}]\n"
598
+ ]
599
+ }
600
+ ],
601
+ "source": [
602
+ "import random\n",
603
+ "random.sample(all_lines, 8)\n",
604
+ "\n",
605
+ "print(all_lines[0:10])\n",
606
+ "\n"
607
+ ]
608
+ },
609
+ {
610
+ "cell_type": "code",
611
+ "execution_count": 33,
612
+ "metadata": {},
613
+ "outputs": [
614
+ {
615
+ "data": {
616
+ "text/plain": [
617
+ "{'Author': ['Henry Rider Haggard'],\n",
618
+ " 'Author Birth': [1856],\n",
619
+ " 'Author Death': [1925],\n",
620
+ " 'Author Given': ['Henry Rider'],\n",
621
+ " 'Author Surname': ['Haggard'],\n",
622
+ " 'Copyright Status': ['Not copyrighted in the United States.'],\n",
623
+ " 'Language': ['English'],\n",
624
+ " 'LoC Class': ['PR: Language and Literatures: English literature'],\n",
625
+ " 'Num': '2721',\n",
626
+ " 'Subject': ['Iceland -- Fiction'],\n",
627
+ " 'Title': ['Eric Brighteyes'],\n",
628
+ " 'charset': 'iso-8859-1',\n",
629
+ " 'gd-num-padded': '02721',\n",
630
+ " 'gd-path': '027/02721.txt',\n",
631
+ " 'href': '/2/7/2/2721/2721_8.zip'}"
632
+ ]
633
+ },
634
+ "execution_count": 33,
635
+ "metadata": {},
636
+ "output_type": "execute_result"
637
+ }
638
+ ],
639
+ "source": [
640
+ "from gutenbergdammit.ziputils import loadmetadata\n",
641
+ "metadata = loadmetadata(\"gutenberg-dammit-files-v002.zip\")\n",
642
+ "metadata[2620]"
643
+ ]
644
+ },
645
+ {
646
+ "cell_type": "code",
647
+ "execution_count": 37,
648
+ "metadata": {},
649
+ "outputs": [
650
+ {
651
+ "data": {
652
+ "text/plain": [
653
+ "['The Song of Hiawatha is based on the legends and stories of',\n",
654
+ " 'many North American Indian tribes, but especially those of the',\n",
655
+ " 'Ojibway Indians of northern Michigan, Wisconsin, and Minnesota.',\n",
656
+ " 'They were collected by Henry Rowe Schoolcraft, the reknowned',\n",
657
+ " 'Schoolcraft married Jane, O-bah-bahm-wawa-ge-zhe-go-qua (The',\n",
658
+ " 'fur trader, and O-shau-gus-coday-way-qua (The Woman of the Green',\n",
659
+ " 'Prairie), who was a daughter of Waub-o-jeeg (The White Fisher),',\n",
660
+ " 'who was Chief of the Ojibway tribe at La Pointe, Wisconsin.',\n",
661
+ " 'Jane and her mother are credited with having researched,',\n",
662
+ " 'authenticated, and compiled much of the material Schoolcraft']"
663
+ ]
664
+ },
665
+ "execution_count": 37,
666
+ "metadata": {},
667
+ "output_type": "execute_result"
668
+ }
669
+ ],
670
+ "source": [
671
+ "[line['s'] for line in all_lines[0:10]]"
672
+ ]
673
+ },
674
  {
675
  "cell_type": "code",
676
  "execution_count": null,
fine-tuning-for-casual-language-model.ipynb ADDED
@@ -0,0 +1,603 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "# https://github.com/huggingface/transformers/tree/main/examples/pytorch/language-modeling"
10
+ ]
11
+ },
12
+ {
13
+ "cell_type": "code",
14
+ "execution_count": 43,
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import transformers\n",
19
+ "from transformers import (\n",
20
+ " CONFIG_MAPPING,\n",
21
+ " MODEL_FOR_CAUSAL_LM_MAPPING,\n",
22
+ " AutoConfig,\n",
23
+ " AutoModelForCausalLM,\n",
24
+ " AutoTokenizer,\n",
25
+ " HfArgumentParser,\n",
26
+ " Trainer,\n",
27
+ " TrainingArguments,\n",
28
+ " default_data_collator,\n",
29
+ " is_torch_tpu_available,\n",
30
+ " set_seed,\n",
31
+ ")\n",
32
+ "\n",
33
+ "from itertools import chain\n",
34
+ "\n",
35
+ "from transformers.testing_utils import CaptureLogger\n",
36
+ "from transformers.trainer_utils import get_last_checkpoint\n",
37
+ "# from transformers.utils import check_min_version, send_example_telemetry\n",
38
+ "from transformers.utils.versions import require_version\n",
39
+ "\n",
40
+ "import datasets\n",
41
+ "from datasets import load_dataset"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "code",
46
+ "execution_count": 4,
47
+ "metadata": {},
48
+ "outputs": [
49
+ {
50
+ "ename": "ImportError",
51
+ "evalue": "This example requires a source install from HuggingFace Transformers (see `https://huggingface.co/transformers/installation.html#installing-from-source`), but the version found is 4.11.3.\nCheck out https://huggingface.co/transformers/examples.html for the examples corresponding to other versions of HuggingFace Transformers.",
52
+ "output_type": "error",
53
+ "traceback": [
54
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
55
+ "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
56
+ "Cell \u001b[0;32mIn [4], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mcheck_min_version\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m4.23.0.dev0\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
57
+ "File \u001b[0;32m/opt/homebrew/Caskroom/miniforge/base/envs/augmented_poetry/lib/python3.8/site-packages/transformers/utils/__init__.py:32\u001b[0m, in \u001b[0;36mcheck_min_version\u001b[0;34m(min_version)\u001b[0m\n\u001b[1;32m 30\u001b[0m error_message \u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mThis example requires a minimum version of \u001b[39m\u001b[39m{\u001b[39;00mmin_version\u001b[39m}\u001b[39;00m\u001b[39m,\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 31\u001b[0m error_message \u001b[39m+\u001b[39m\u001b[39m=\u001b[39m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m but the version found is \u001b[39m\u001b[39m{\u001b[39;00m__version__\u001b[39m}\u001b[39;00m\u001b[39m.\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m---> 32\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mImportError\u001b[39;00m(\n\u001b[1;32m 33\u001b[0m error_message\n\u001b[1;32m 34\u001b[0m \u001b[39m+\u001b[39m (\n\u001b[1;32m 35\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mCheck out https://huggingface.co/transformers/examples.html for the examples corresponding to other \u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 36\u001b[0m \u001b[39m\"\u001b[39m\u001b[39mversions of HuggingFace Transformers.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 37\u001b[0m )\n\u001b[1;32m 38\u001b[0m )\n",
58
+ "\u001b[0;31mImportError\u001b[0m: This example requires a source install from HuggingFace Transformers (see `https://huggingface.co/transformers/installation.html#installing-from-source`), but the version found is 4.11.3.\nCheck out https://huggingface.co/transformers/examples.html for the examples corresponding to other versions of HuggingFace Transformers."
59
+ ]
60
+ }
61
+ ],
62
+ "source": [
63
+ "# check_min_version(\"4.23.0.dev0\")"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": 9,
69
+ "metadata": {},
70
+ "outputs": [],
71
+ "source": [
72
+ "require_version(\"datasets>=1.8.0\")"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": 5,
78
+ "metadata": {},
79
+ "outputs": [],
80
+ "source": [
81
+ "set_seed(37)"
82
+ ]
83
+ },
84
+ {
85
+ "cell_type": "markdown",
86
+ "metadata": {},
87
+ "source": [
88
+ "##### Get all of the huggingface objects that we need: tokenzier, gpt2 model, poetry dataset."
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 10,
94
+ "metadata": {},
95
+ "outputs": [
96
+ {
97
+ "name": "stderr",
98
+ "output_type": "stream",
99
+ "text": [
100
+ "Using custom data configuration merve--poetry-ca9a13ef5858cc3a\n"
101
+ ]
102
+ },
103
+ {
104
+ "name": "stdout",
105
+ "output_type": "stream",
106
+ "text": [
107
+ "Downloading and preparing dataset csv/merve--poetry to /Users/matth/.cache/huggingface/datasets/merve___csv/merve--poetry-ca9a13ef5858cc3a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...\n"
108
+ ]
109
+ },
110
+ {
111
+ "data": {
112
+ "application/vnd.jupyter.widget-view+json": {
113
+ "model_id": "ed56ee6b324647798b19ac7bf5accc40",
114
+ "version_major": 2,
115
+ "version_minor": 0
116
+ },
117
+ "text/plain": [
118
+ "Downloading data files: 0%| | 0/1 [00:00<?, ?it/s]"
119
+ ]
120
+ },
121
+ "metadata": {},
122
+ "output_type": "display_data"
123
+ },
124
+ {
125
+ "data": {
126
+ "application/vnd.jupyter.widget-view+json": {
127
+ "model_id": "32c10441ff20404cb153f6b27f16a829",
128
+ "version_major": 2,
129
+ "version_minor": 0
130
+ },
131
+ "text/plain": [
132
+ "Downloading data: 0%| | 0.00/606k [00:00<?, ?B/s]"
133
+ ]
134
+ },
135
+ "metadata": {},
136
+ "output_type": "display_data"
137
+ },
138
+ {
139
+ "data": {
140
+ "application/vnd.jupyter.widget-view+json": {
141
+ "model_id": "7ca47bc06937463e91d3948d7703ac64",
142
+ "version_major": 2,
143
+ "version_minor": 0
144
+ },
145
+ "text/plain": [
146
+ "Extracting data files: 0%| | 0/1 [00:00<?, ?it/s]"
147
+ ]
148
+ },
149
+ "metadata": {},
150
+ "output_type": "display_data"
151
+ },
152
+ {
153
+ "data": {
154
+ "application/vnd.jupyter.widget-view+json": {
155
+ "model_id": "1631dbdc53d04b14a8a7733883bbd1cc",
156
+ "version_major": 2,
157
+ "version_minor": 0
158
+ },
159
+ "text/plain": [
160
+ "0 tables [00:00, ? tables/s]"
161
+ ]
162
+ },
163
+ "metadata": {},
164
+ "output_type": "display_data"
165
+ },
166
+ {
167
+ "name": "stdout",
168
+ "output_type": "stream",
169
+ "text": [
170
+ "Dataset csv downloaded and prepared to /Users/matth/.cache/huggingface/datasets/merve___csv/merve--poetry-ca9a13ef5858cc3a/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.\n"
171
+ ]
172
+ },
173
+ {
174
+ "data": {
175
+ "application/vnd.jupyter.widget-view+json": {
176
+ "model_id": "3c93229d66ad46d9a88da5f6a9528f2e",
177
+ "version_major": 2,
178
+ "version_minor": 0
179
+ },
180
+ "text/plain": [
181
+ " 0%| | 0/1 [00:00<?, ?it/s]"
182
+ ]
183
+ },
184
+ "metadata": {},
185
+ "output_type": "display_data"
186
+ }
187
+ ],
188
+ "source": [
189
+ "raw_datasets = load_dataset(\"merve/poetry\")"
190
+ ]
191
+ },
192
+ {
193
+ "cell_type": "code",
194
+ "execution_count": 12,
195
+ "metadata": {},
196
+ "outputs": [],
197
+ "source": [
198
+ "tokenizer = AutoTokenizer.from_pretrained('gpt2')"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": 13,
204
+ "metadata": {},
205
+ "outputs": [],
206
+ "source": [
207
+ "config = AutoConfig.from_pretrained('gpt2')"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": 16,
213
+ "metadata": {},
214
+ "outputs": [
215
+ {
216
+ "data": {
217
+ "text/plain": [
218
+ "Embedding(50257, 768)"
219
+ ]
220
+ },
221
+ "execution_count": 16,
222
+ "metadata": {},
223
+ "output_type": "execute_result"
224
+ }
225
+ ],
226
+ "source": [
227
+ "model = AutoModelForCausalLM.from_pretrained(\n",
228
+ " \"gpt2\",\n",
229
+ " config=config\n",
230
+ ")\n",
231
+ "model.resize_token_embeddings(len(tokenizer))"
232
+ ]
233
+ },
234
+ {
235
+ "cell_type": "code",
236
+ "execution_count": 24,
237
+ "metadata": {},
238
+ "outputs": [
239
+ {
240
+ "data": {
241
+ "text/plain": [
242
+ "Dataset({\n",
243
+ " features: ['author', 'content', 'poem name', 'age', 'type'],\n",
244
+ " num_rows: 573\n",
245
+ "})"
246
+ ]
247
+ },
248
+ "execution_count": 24,
249
+ "metadata": {},
250
+ "output_type": "execute_result"
251
+ }
252
+ ],
253
+ "source": [
254
+ "raw_datasets['train']"
255
+ ]
256
+ },
257
+ {
258
+ "cell_type": "code",
259
+ "execution_count": 26,
260
+ "metadata": {},
261
+ "outputs": [
262
+ {
263
+ "data": {
264
+ "text/plain": [
265
+ "'Mythology & Folklore'"
266
+ ]
267
+ },
268
+ "execution_count": 26,
269
+ "metadata": {},
270
+ "output_type": "execute_result"
271
+ }
272
+ ],
273
+ "source": [
274
+ "raw_datasets['train']['type'][0]"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": 28,
280
+ "metadata": {},
281
+ "outputs": [
282
+ {
283
+ "data": {
284
+ "text/plain": [
285
+ "DatasetDict({\n",
286
+ " train: Dataset({\n",
287
+ " features: ['author', 'content', 'poem name', 'age', 'type'],\n",
288
+ " num_rows: 573\n",
289
+ " })\n",
290
+ "})"
291
+ ]
292
+ },
293
+ "execution_count": 28,
294
+ "metadata": {},
295
+ "output_type": "execute_result"
296
+ }
297
+ ],
298
+ "source": [
299
+ "raw_datasets"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": 29,
305
+ "metadata": {},
306
+ "outputs": [],
307
+ "source": [
308
+ "tok_logger = transformers.utils.logging.get_logger(\n",
309
+ " \"transformers.tokenization_utils_base\"\n",
310
+ ")"
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "code",
315
+ "execution_count": 30,
316
+ "metadata": {},
317
+ "outputs": [],
318
+ "source": [
319
+ "def tokenize_function(examples):\n",
320
+ " with CaptureLogger(tok_logger) as cl:\n",
321
+ " output = tokenizer(examples[text_column_name])\n",
322
+ " # clm input could be much much longer than block_size\n",
323
+ " if \"Token indices sequence length is longer than the\" in cl.out:\n",
324
+ " tok_logger.warning(\n",
325
+ " \"^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits\"\n",
326
+ " \" before being passed to the model.\"\n",
327
+ " )\n",
328
+ " return output"
329
+ ]
330
+ },
331
+ {
332
+ "cell_type": "code",
333
+ "execution_count": 33,
334
+ "metadata": {},
335
+ "outputs": [],
336
+ "source": [
337
+ "column_names = raw_datasets[\"train\"].column_names\n",
338
+ "# text_column_name = \"text\" if \"text\" in column_names else column_names[0]\n",
339
+ "text_column_name = \"content\""
340
+ ]
341
+ },
342
+ {
343
+ "cell_type": "code",
344
+ "execution_count": 34,
345
+ "metadata": {},
346
+ "outputs": [
347
+ {
348
+ "data": {
349
+ "application/vnd.jupyter.widget-view+json": {
350
+ "model_id": "82c09dbdfa1a47d79607a4c9729fb286",
351
+ "version_major": 2,
352
+ "version_minor": 0
353
+ },
354
+ "text/plain": [
355
+ "Running tokenizer on dataset: 0%| | 0/1 [00:00<?, ?ba/s]"
356
+ ]
357
+ },
358
+ "metadata": {},
359
+ "output_type": "display_data"
360
+ },
361
+ {
362
+ "name": "stderr",
363
+ "output_type": "stream",
364
+ "text": [
365
+ "Token indices sequence length is longer than the specified maximum sequence length for this model (7725 > 1024). Running this sequence through the model will result in indexing errors\n",
366
+ "^^^^^^^^^^^^^^^^ Please ignore the warning above - this long input will be chunked into smaller bits before being passed to the model.\n"
367
+ ]
368
+ }
369
+ ],
370
+ "source": [
371
+ "tokenized_datasets = raw_datasets.map(\n",
372
+ " tokenize_function,\n",
373
+ " batched=True,\n",
374
+ " # num_proc=data_args.preprocessing_num_workers,\n",
375
+ " remove_columns=column_names,\n",
376
+ " # load_from_cache_file=not data_args.overwrite_cache,\n",
377
+ " desc=\"Running tokenizer on dataset\",\n",
378
+ ")"
379
+ ]
380
+ },
381
+ {
382
+ "cell_type": "code",
383
+ "execution_count": 39,
384
+ "metadata": {},
385
+ "outputs": [],
386
+ "source": [
387
+ "block_size = tokenizer.model_max_length"
388
+ ]
389
+ },
390
+ {
391
+ "cell_type": "code",
392
+ "execution_count": 41,
393
+ "metadata": {},
394
+ "outputs": [],
395
+ "source": [
396
+ "# Main data processing function that will concatenate all texts from our dataset and generate chunks of block_size.\n",
397
+ "def group_texts(examples):\n",
398
+ " # Concatenate all texts.\n",
399
+ " concatenated_examples = {k: list(chain(*examples[k])) for k in examples.keys()}\n",
400
+ " total_length = len(concatenated_examples[list(examples.keys())[0]])\n",
401
+ " # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can\n",
402
+ " # customize this part to your needs.\n",
403
+ " if total_length >= block_size:\n",
404
+ " total_length = (total_length // block_size) * block_size\n",
405
+ " # Split by chunks of max_len.\n",
406
+ " result = {\n",
407
+ " k: [t[i : i + block_size] for i in range(0, total_length, block_size)]\n",
408
+ " for k, t in concatenated_examples.items()\n",
409
+ " }\n",
410
+ " result[\"labels\"] = result[\"input_ids\"].copy()\n",
411
+ " return result"
412
+ ]
413
+ },
414
+ {
415
+ "cell_type": "code",
416
+ "execution_count": 44,
417
+ "metadata": {},
418
+ "outputs": [
419
+ {
420
+ "data": {
421
+ "application/vnd.jupyter.widget-view+json": {
422
+ "model_id": "ca2f64461e304df6aecb16e8cfcd42ac",
423
+ "version_major": 2,
424
+ "version_minor": 0
425
+ },
426
+ "text/plain": [
427
+ "Grouping texts in chunks of 1024: 0%| | 0/1 [00:00<?, ?ba/s]"
428
+ ]
429
+ },
430
+ "metadata": {},
431
+ "output_type": "display_data"
432
+ }
433
+ ],
434
+ "source": [
435
+ "lm_datasets = tokenized_datasets.map(\n",
436
+ " group_texts,\n",
437
+ " batched=True,\n",
438
+ " # num_proc=data_args.preprocessing_num_workers,\n",
439
+ " # load_from_cache_file=not data_args.overwrite_cache,\n",
440
+ " desc=f\"Grouping texts in chunks of {block_size}\",\n",
441
+ ")"
442
+ ]
443
+ },
444
+ {
445
+ "cell_type": "code",
446
+ "execution_count": 46,
447
+ "metadata": {},
448
+ "outputs": [],
449
+ "source": [
450
+ "train_dataset = lm_datasets[\"train\"]"
451
+ ]
452
+ },
453
+ {
454
+ "cell_type": "markdown",
455
+ "metadata": {},
456
+ "source": [
457
+ "#### Do the fine-tuning"
458
+ ]
459
+ },
460
+ {
461
+ "cell_type": "code",
462
+ "execution_count": 47,
463
+ "metadata": {},
464
+ "outputs": [],
465
+ "source": [
466
+ "# Initialize our Trainer\n",
467
+ "trainer = Trainer(\n",
468
+ " model=model,\n",
469
+ " # args=training_args,\n",
470
+ " train_dataset=train_dataset,\n",
471
+ " # eval_dataset=eval_dataset,\n",
472
+ " tokenizer=tokenizer,\n",
473
+ " # Data collator will default to DataCollatorWithPadding, so we change it.\n",
474
+ " data_collator=default_data_collator,\n",
475
+ " # compute_metrics=compute_metrics\n",
476
+ " # if training_args.do_eval and not is_torch_tpu_available()\n",
477
+ " # else None,\n",
478
+ " # preprocess_logits_for_metrics=preprocess_logits_for_metrics\n",
479
+ " # if training_args.do_eval and not is_torch_tpu_available()\n",
480
+ " # else None,\n",
481
+ ")"
482
+ ]
483
+ },
484
+ {
485
+ "cell_type": "code",
486
+ "execution_count": 48,
487
+ "metadata": {},
488
+ "outputs": [
489
+ {
490
+ "name": "stderr",
491
+ "output_type": "stream",
492
+ "text": [
493
+ "***** Running training *****\n",
494
+ " Num examples = 171\n",
495
+ " Num Epochs = 3\n",
496
+ " Instantaneous batch size per device = 8\n",
497
+ " Total train batch size (w. parallel, distributed & accumulation) = 8\n",
498
+ " Gradient Accumulation steps = 1\n",
499
+ " Total optimization steps = 66\n"
500
+ ]
501
+ },
502
+ {
503
+ "data": {
504
+ "application/vnd.jupyter.widget-view+json": {
505
+ "model_id": "59ebc6f251bd42e4bd3474b574614d1f",
506
+ "version_major": 2,
507
+ "version_minor": 0
508
+ },
509
+ "text/plain": [
510
+ " 0%| | 0/66 [00:00<?, ?it/s]"
511
+ ]
512
+ },
513
+ "metadata": {},
514
+ "output_type": "display_data"
515
+ },
516
+ {
517
+ "name": "stderr",
518
+ "output_type": "stream",
519
+ "text": [
520
+ "\n",
521
+ "\n",
522
+ "Training completed. Do not forget to share your model on huggingface.co/models =)\n",
523
+ "\n",
524
+ "\n",
525
+ "Saving model checkpoint to tmp_trainer\n",
526
+ "Configuration saved in tmp_trainer/config.json\n"
527
+ ]
528
+ },
529
+ {
530
+ "name": "stdout",
531
+ "output_type": "stream",
532
+ "text": [
533
+ "{'train_runtime': 2967.2818, 'train_samples_per_second': 0.173, 'train_steps_per_second': 0.022, 'train_loss': 4.249474265358665, 'epoch': 3.0}\n"
534
+ ]
535
+ },
536
+ {
537
+ "name": "stderr",
538
+ "output_type": "stream",
539
+ "text": [
540
+ "Model weights saved in tmp_trainer/pytorch_model.bin\n",
541
+ "tokenizer config file saved in tmp_trainer/tokenizer_config.json\n",
542
+ "Special tokens file saved in tmp_trainer/special_tokens_map.json\n"
543
+ ]
544
+ },
545
+ {
546
+ "name": "stdout",
547
+ "output_type": "stream",
548
+ "text": [
549
+ "***** train metrics *****\n",
550
+ " epoch = 3.0\n",
551
+ " train_loss = 4.2495\n",
552
+ " train_runtime = 0:49:27.28\n",
553
+ " train_samples = 171\n",
554
+ " train_samples_per_second = 0.173\n",
555
+ " train_steps_per_second = 0.022\n"
556
+ ]
557
+ }
558
+ ],
559
+ "source": [
560
+ "# Training\n",
561
+ "checkpoint = None\n",
562
+ "train_result = trainer.train(resume_from_checkpoint=checkpoint)\n",
563
+ "trainer.save_model() # Saves the tokenizer too for easy upload\n",
564
+ "\n",
565
+ "metrics = train_result.metrics\n",
566
+ "\n",
567
+ "max_train_samples = (len(train_dataset))\n",
568
+ "metrics[\"train_samples\"] = min(max_train_samples, len(train_dataset))\n",
569
+ "\n",
570
+ "trainer.log_metrics(\"train\", metrics)\n",
571
+ "trainer.save_metrics(\"train\", metrics)\n",
572
+ "trainer.save_state()"
573
+ ]
574
+ }
575
+ ],
576
+ "metadata": {
577
+ "kernelspec": {
578
+ "display_name": "Python 3.10.6 ('augmented_poetry')",
579
+ "language": "python",
580
+ "name": "python3"
581
+ },
582
+ "language_info": {
583
+ "codemirror_mode": {
584
+ "name": "ipython",
585
+ "version": 3
586
+ },
587
+ "file_extension": ".py",
588
+ "mimetype": "text/x-python",
589
+ "name": "python",
590
+ "nbconvert_exporter": "python",
591
+ "pygments_lexer": "ipython3",
592
+ "version": "3.8.13"
593
+ },
594
+ "orig_nbformat": 4,
595
+ "vscode": {
596
+ "interpreter": {
597
+ "hash": "00664817f4a09ab74dd392ee5a8d12e3606381c26df296db9ea5c334bb5d1b65"
598
+ }
599
+ }
600
+ },
601
+ "nbformat": 4,
602
+ "nbformat_minor": 2
603
+ }
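The Trainer in this notebook is built without TrainingArguments, which is why everything lands in the default tmp_trainer/ directory (the path app.py loads and .gitignore now excludes). A sketch of making those defaults explicit; the epoch count, batch size, and save strategy below are illustrative values, not tuned ones.

```python
from transformers import Trainer, TrainingArguments, default_data_collator

training_args = TrainingArguments(
    output_dir="tmp_trainer",          # keep the path that app.py already loads from
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,                       # objects defined earlier in the notebook
    args=training_args,
    train_dataset=train_dataset,
    tokenizer=tokenizer,
    data_collator=default_data_collator,
)
trainer.train()
trainer.save_model()
```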