<a href="https://colab.research.google.com/github/MMBazel/LO_GenAI_Workshops/blob/main/%5BMini%5D_HelloTaylorSwift_FineTuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

A "Hello World" fine-tuning example based on the following tutorials and resources:

*   https://www.kaggle.com/code/tommyadams/fine-tuning-tinyllama
*   Model: https://huggingface.co/huggingartists/taylor-swift
*   Dataset: https://huggingface.co/datasets/huggingartists/taylor-swift
*   https://www.youtube.com/watch?v=OVqe6GTrDFM


*   https://dev.to/_ken0x/tinyllama-llm-a-step-by-step-guide-to-implementing-the-11b-model-on-google-colab-1pjh
*   https://www.youtube.com/watch?v=6XeTk8cZUsM
*   https://github.com/uygarkurt/SFT-TinyLlama/tree/main









# Get Set Up

### Install necessary libraries

In [22]:
# Install/upgrade (-U) the fine-tuning stack quietly (-qqq); peft is installed
# straight from its GitHub repository rather than from a PyPI release.
# NOTE(review): no versions are pinned, so a later run may pull in releases whose
# APIs differ from the ones this notebook was written against.
!pip install trl transformers accelerate git+https://github.com/huggingface/peft.git -Uqqq
# bitsandbytes is fetched explicitly from the main PyPI index.
!pip install -i https://pypi.org/simple/ bitsandbytes -qqq
!pip install einops wandb -Uqqq

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


In [23]:
# Record the package versions currently installed in this environment to requirements.txt
!pip freeze > requirements.txt

In [24]:
import torch
import re
from peft import get_peft_model, LoraConfig, prepare_model_for_kbit_training
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, TrainingArguments
from trl import SFTTrainer
from datasets import Dataset
import random

### Login With Credentials

In [25]:
# Authenticate against the Hugging Face Hub via the interactive notebook widget.
# Presumably required because training is configured with push_to_hub=True — confirm.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Load Data

### Load small data subset

In [26]:
from datasets import load_dataset

# Load the "mmbazel/Taylor-Swift-Example" dataset by its repository id
dataset = load_dataset("mmbazel/Taylor-Swift-Example")

In [27]:
# Extracting the lyrics from the dataset:
# take the "train" split, then read its "lyric" column
train_data = dataset["train"]
lyrics = train_data["lyric"]

In [28]:
# Lookup tables used to clean the lyrics.

# Regex patterns that are replaced by a single space:
# assorted Unicode space characters, a non-breaking space and the hyphen.
replace_with_space = ['\u2005', '\u200b', '\u205f', '\xa0', '-']

# Literal one-to-one character substitutions (applied with str.replace).
# Non-ASCII punctuation is written as escapes so the keys cannot be silently
# normalised into plain ASCII quotes by an editor or export step.
replace_letters = {
    'í': 'i',
    'é': 'e',
    'ï': 'i',
    'ó': 'o',
    ';': ',',
    '\u2018': '\'',  # left single curly quote
    '\u2019': '\'',  # right single curly quote
    ':': ',',
    '\u0435': 'e',   # Cyrillic small letter ie, visually identical to Latin "e"
}

# Regex patterns that are deleted outright. Raw strings keep the backslashes
# from being read as (invalid) string escape sequences.
remove_list = [
    r'\)',
    r'\(',
    '\u2013',    # en dash
    '\u201c',    # left double curly quote
    '\u201d',    # right double curly quote
    '"',
    r'\[.*\]',   # bracketed annotations
    r'.*\|.*',   # any whole line that contains a pipe character
    '\u2014',    # em dash
]

In [29]:
# Build the cleaned copy of every lyric in three passes:
#   1. literal character substitutions from replace_letters,
#   2. regex deletions from remove_list,
#   3. regex replacements by a single space from replace_with_space.
cleaned_lyrics = []
for raw_text in lyrics:
    text = raw_text
    for source_char, target_char in replace_letters.items():
        text = text.replace(source_char, target_char)
    # Deletions run before the space replacements, as in the pass order above.
    for patterns, filler in ((remove_list, ''), (replace_with_space, ' ')):
        for pattern in patterns:
            text = re.sub(pattern, filler, text)
    cleaned_lyrics.append(text)

### Split into train, validation, and test sets

In [30]:
# Splitting the cleaned_lyrics into training, validation, and test sets
# Fractions of the lyric list assigned to each split.
train_percentage = 0.9
validation_percentage = 0.05
# NOTE(review): the split-index code visible in this notebook only reads the train
# and validation fractions; the test split is whatever remains after those slices.
test_percentage = 0.05

In [31]:
# Turn the split fractions into list positions (truncated towards zero):
# train_index closes the training slice, validation_index closes the validation slice.
total_lyrics = len(cleaned_lyrics)
train_index = int(total_lyrics * train_percentage)
validation_index = int(total_lyrics * (train_percentage + validation_percentage))



In [32]:
# Cut cleaned_lyrics into three consecutive, non-overlapping slices
# (order is preserved; nothing is shuffled).
train_lyrics, validation_lyrics, test_lyrics = (
    cleaned_lyrics[:train_index],
    cleaned_lyrics[train_index:validation_index],
    cleaned_lyrics[validation_index:],
)

In [33]:
# Wrap each split in a Dataset with a single 'text' column holding the cleaned lyrics
# (the trainer below is pointed at this column through dataset_text_field='text')
train_lyrics_dataset = Dataset.from_dict({'text': train_lyrics})
validation_lyrics_dataset = Dataset.from_dict({'text': validation_lyrics})
test_lyrics_dataset = Dataset.from_dict({'text': test_lyrics})

## Load Models

In [34]:
# Loading the pre-trained model
# No quantization or dtype arguments are passed, so the library defaults apply.
model_name = "PY007/TinyLlama-1.1B-step-50K-105b"
model = AutoModelForCausalLM.from_pretrained(model_name)

In [35]:
# Creating tokenizer and defining the pad token
# The end-of-sequence token is reused as the padding token
# (presumably because the checkpoint's tokenizer defines no pad token — confirm).
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [36]:
# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)  # Move the model to the appropriate device

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 2048)
    (layers): ModuleList(
      (0-21): 22 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=256, bias=False)
          (v_proj): Linear(in_features=2048, out_features=256, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5632, bias=False)
          (down_proj): Linear(in_features=5632, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm()
        (post_attention_layernorm): LlamaRMSNorm()
      )
    )
    (norm): LlamaRMSNorm()
  )
  (lm_head): Line

In [93]:
# Sampling defaults: repetition_penalty is 1.5 (originally 1.3) and
# max_new_tokens is 200 (originally 250).

def generate_lyrics(query, model, tokenizer, max_new_tokens=200,
                    repetition_penalty=1.5, temperature=1.3):
    """Sample a continuation of `query` and print its non-blank lines.

    Args:
        query: Prompt text to continue.
        model: Object exposing `generate(input_ids=..., attention_mask=...,
            generation_config=...)`.
        tokenizer: Tokenizer used to encode the prompt and decode the result;
            its `eos_token_id` doubles as the padding id.
        max_new_tokens: Upper bound on the number of generated tokens.
        repetition_penalty: Penalty applied to already generated tokens.
        temperature: Sampling temperature (sampling is always enabled).

    Returns:
        None. The generated lines are written to stdout.
    """
    # `device` is the notebook-level target device chosen when the model was loaded.
    encoding = tokenizer(query, return_tensors="pt").to(device)
    generation_config = GenerationConfig(
        max_new_tokens=max_new_tokens,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        repetition_penalty=repetition_penalty,
        temperature=temperature,
        do_sample=True,
    )
    # Pass the attention mask explicitly: the pad and eos ids are identical here,
    # so the mask cannot be inferred reliably from the input ids alone.
    outputs = model.generate(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
        generation_config=generation_config,
    )

    # Drop the prompt by token count rather than by len(query) on the decoded
    # text: decoding is not guaranteed to reproduce the query character for
    # character, which would shift a character-based cut.
    prompt_length = encoding.input_ids.shape[1]
    continuation = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

    for line in continuation.split('\n'):
        if line.strip():
            print(line)

In [94]:
# LoRA hyper-parameters
lora_rank = 32
lora_alpha = 32
lora_dropout = 0.05

# Adapter configuration for causal language modelling
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_rank,
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    bias="none",  # 'none' => only weight parameters are trained, not biases
)

# Run PEFT's k-bit training preparation on the base model, then attach the
# LoRA adapters described by peft_config to the prepared model.
prepared_model = prepare_model_for_kbit_training(model)
peft_model = get_peft_model(prepared_model, peft_config)

In [95]:
# Setting training arguments
# These values are forwarded unchanged to TrainingArguments in the next cell.
output_dir = "mmbazel/tinyllama_tayswifty" # Model repo on your hugging face account where you want to save your model
per_device_train_batch_size = 3
gradient_accumulation_steps = 2  # with batch size 3 this gives 3 * 2 = 6 samples per optimizer step on one device
optim = "paged_adamw_32bit"
save_steps = 10     # Save a checkpoint every 10 steps
logging_steps = 10  # Log the training loss every 10 steps
# NOTE(review): 2e-3 looks high for adapter fine-tuning — confirm it is intentional.
learning_rate = 2e-3
max_grad_norm = 0.3 # Sets limit for gradient clipping
max_steps = 200     # Number of training steps
warmup_ratio = 0.03 # Portion of steps used for learning_rate to warmup from 0
lr_scheduler_type = "cosine" # I chose cosine to avoid learning plateaus

In [96]:
# Gather every trainer setting in one mapping, then build TrainingArguments from it.
training_settings = dict(
    output_dir=output_dir,
    max_steps=max_steps,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    max_grad_norm=max_grad_norm,
    logging_steps=logging_steps,
    save_steps=save_steps,
    push_to_hub=True,   # upload to the Hub repository named by output_dir
    report_to='none',   # no experiment-tracker integration
)
training_arguments = TrainingArguments(**training_settings)

In [97]:
# Creating the SFT trainer
# It trains on the 'text' column of train_lyrics_dataset with max_seq_length=500.
# NOTE(review): peft_model is already the result of get_peft_model, yet peft_config
# is passed again — confirm the installed trl version does not wrap it a second time.
# NOTE(review): no eval_dataset is given, so validation_lyrics_dataset is unused here.
trainer = SFTTrainer(
    model=peft_model,
    train_dataset=train_lyrics_dataset,
    peft_config=peft_config,
    max_seq_length=500,
    dataset_text_field='text',
    tokenizer=tokenizer,
    args=training_arguments
)
# Switch off the model's use_cache flag for training
peft_model.config.use_cache = False

Map:   0%|          | 0/7522 [00:00<?, ? examples/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [98]:
# Training the model
# Runs the trainer configured above (max_steps=200); the returned TrainOutput is
# displayed as the cell result.
trainer.train()

Step,Training Loss
10,4.7909
20,3.4515
30,3.4947
40,3.3161
50,3.2292
60,3.181
70,3.0821
80,3.2482
90,3.0432
100,3.2312


TrainOutput(global_step=200, training_loss=3.254313144683838, metrics={'train_runtime': 61.602, 'train_samples_per_second': 19.48, 'train_steps_per_second': 3.247, 'total_flos': 103816598925312.0, 'train_loss': 3.254313144683838, 'epoch': 0.16})

In [99]:
# Generate lyrics using random segments of the test lyrics
num_examples = 5  # Number of random examples to generate
# Number of consecutive test_lyrics entries joined into each prompt
# (a count of list items, not of characters or tokens)
max_segment_length = 200

In [100]:
# Print num_examples prompt/continuation pairs. Each prompt is a window of up to
# max_segment_length consecutive test lyrics joined by single spaces.

# Largest valid window start. Clamped at 0 so that a test split shorter than the
# window uses the whole split instead of making random.randint raise ValueError.
last_start_index = max(0, len(test_lyrics) - max_segment_length)

for i in range(num_examples):
    # Randomly select a starting index for the lyric segment
    start_index = random.randint(0, last_start_index)
    end_index = start_index + max_segment_length

    # Extract the lyric segment
    lyric_segment = ' '.join(test_lyrics[start_index:end_index])

    print(f"Example {i+1}:")
    print("INPUT:")
    print(lyric_segment)
    print("OUTPUT:")
    # NOTE(review): the base `model` object is passed rather than `peft_model` —
    # confirm it carries the trained adapter weights.
    generate_lyrics(lyric_segment, model, tokenizer)
    print("\n")

Example 1:
INPUT:
Never be so polite You forget your power Never wield such power You forget to be polite And if I didn't know better I'd think you were listening to me now If I didn't know better I'd think you were still around What died didn't stay dead What died didn't stay dead You're alive, you're alive in my head What died didn't stay dead What died didn't stay dead You're alive, so alive The autumn chill that wakes me up You loved the amber skies so much Long limbs and frozen swims You'd always go past where our feet could touch And I complained the whole way there The car ride back and up the stairs I should've asked you questions I should've asked you how to be Asked you to write it down for me Should've kept every grocery store receipt 'Cause every scrap of you would be taken from me Watched as you signed your name Marjorie All your closets of backlogged dreams And how you left them all to me What died didn't stay dead What died didn't stay dead You're alive, you're alive in 