# -*- coding: utf-8 -*-
"""FineTuning GPT2
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1PlLmPZ7NMPjZFz7xisoypLSHIjE2s7Qm
# This notebook by Zack DeSario is a remix / combination of many sources as all good code is.
The code is mainly from [@DigitalSreeni](https://youtu.be/DxygPxcfW_I). Their code cites the [huggingface official tutorial](https://huggingface.co/transformers/v2.2.0/pretrained_models.html).
"""
# !pip install transformers
# !pip install torch
# !pip install transformers[torch]
# !pip install PyPDF2 python-docx  # assumed here for the PDF / Word readers below
import os
import re
import torch
import docx
from PyPDF2 import PdfReader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from huggingface_hub import notebook_login
notebook_login()  # prompts for a Hugging Face token (needs write access to push models later)
"""Required functions to read text from various files located in a directory. Files can be a mix of pdf, docx, or txt."""
### THIS CODE IS 100% WRITTEN BY THE FIRST SOURCE. VERY HELPFUL FUNCTIONS, TY.
# Functions to read different file types
def read_txt(file_path):
    with open(file_path, "r") as file:
        text = file.read()
    return text

# read_pdf and read_word are referenced below but were not defined in this export;
# these minimal versions assume PyPDF2 and python-docx (installed above).
def read_pdf(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""
    return text

def read_word(file_path):
    doc = docx.Document(file_path)
    return "\n".join(paragraph.text for paragraph in doc.paragraphs)

def read_documents_from_directory(directory):
    combined_text = ""
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if filename.endswith(".pdf"):
            combined_text += read_pdf(file_path)
        elif filename.endswith(".docx"):
            combined_text += read_word(file_path)
        elif filename.endswith(".txt"):
            combined_text += read_txt(file_path)
    return combined_text
# ANOTHER HELPER FUNCTION
def generate_response(model, tokenizer, prompt, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    # Create the attention mask and pad token id
    attention_mask = torch.ones_like(input_ids)
    pad_token_id = tokenizer.eos_token_id
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        attention_mask=attention_mask,
        pad_token_id=pad_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)
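# The helper above uses greedy decoding, which is deterministic and can get
# repetitive. Below is a sketch of a sampling variant -- the do_sample / top_k /
# top_p / temperature values are illustrative assumptions, not tuned settings.
def generate_response_sampled(model, tokenizer, prompt, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    attention_mask = torch.ones_like(input_ids)
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,    # sample from the distribution instead of taking the argmax
        top_k=50,          # consider only the 50 most likely next tokens
        top_p=0.95,        # nucleus sampling: smallest token set with 95% probability mass
        temperature=0.8,   # < 1.0 sharpens the distribution slightly
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)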
"""## Now load the base model and test it to see if it already does what we need to do or not...."""
# Set up the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")  # also try gpt2, gpt2-large, or gpt2-xl
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")  # also try gpt2, gpt2-large, or gpt2-xl
prompt = 'Write a script for the TV show Futurama about Fry getting stuck in a hole.'
response = generate_response(model, tokenizer, prompt, max_length=200)
print(response)
prompt = 'Who is Fry TV show Futurama? Describe them in detail.'
response = generate_response(model, tokenizer, prompt, max_length=200)
print(response)
"""# Mmkay, it clearly does not know who Fry is or how to write a TV Script.
### Lets train it to learn how to write a TV script for Futurama.
## Adding your data
1. Open the side panel, click on the folder icon, create a new folder called `my_data`, and drag and drop your data into that side panel. I will demonstrate during class.
2. Also, create a new folder called `my_trained_model`. That is where we will temporarily store our trained model.
Load your data
* You can download the data I used here: UPLOAD LINK SOON.
"""
directory = "/content/my_data/" # Replace with the path to your directory containing the files
model_output_path = "/content/my_trained_models/"
train_fraction = 0.8  # use 80% of the text for training, the rest for validation
# Read documents from the directory
combined_text = read_documents_from_directory(directory)
combined_text = re.sub(r'\n+', '\n', combined_text).strip() # Remove excess newline characters
# Split the text into training and validation sets
split_index = int(train_fraction * len(combined_text))
train_text = combined_text[:split_index]
val_text = combined_text[split_index:]
# Save the training and validation data as text files
with open("train.txt", "w") as f:
f.write(train_text)
with open("val.txt", "w") as f:
f.write(val_text)
print(len(train_text))
print(train_text[:1000])
"""The train_chatbot function uses the combined text data to train a GPT-2 model using the provided training arguments. The resulting trained model and tokenizer are then saved to a specified output directory."""
# Prepare the dataset
train_dataset = TextDataset(tokenizer=tokenizer, file_path="train.txt", block_size=128)
val_dataset = TextDataset(tokenizer=tokenizer, file_path="val.txt", block_size=128)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
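# Optional sanity check: TextDataset chunks the tokenized text into fixed-size
# blocks (128 tokens here), and each block becomes one training example.
# Decoding the first block shows what the model will actually see.
print(f"Number of 128-token training blocks: {len(train_dataset)}")
print(tokenizer.decode(train_dataset[0]))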
# Set up the training arguments
training_args = TrainingArguments(
    output_dir=model_output_path,
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=33,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)
# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
## THIS TAKES 30 MINS FOR JUST 10 EPOCHS, SO I AM NOT GOING TO DO THAT DURING CLASS....
## AND ~2HRS FOR 33 EPOCHS.
trainer.train()
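# Optional: measure how well the fine-tuned model fits the held-out split.
# Trainer.evaluate() returns the average cross-entropy loss on val_dataset;
# exponentiating it gives perplexity (lower is better).
import math
eval_results = trainer.evaluate()
print(f"Validation loss: {eval_results['eval_loss']:.3f}, "
      f"perplexity: {math.exp(eval_results['eval_loss']):.1f}")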
# Save the model
trainer.save_model(model_output_path)
# Save the tokenizer
tokenizer.save_pretrained(model_output_path)
print("SAVED MODELS LOCALLY YO!!!!!!")
directory = "/content/my_data/" # Replace with the path to your directory containing the files
model_output_path = "/content/my_trained_models/"
model = GPT2LMHeadModel.from_pretrained(model_output_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_output_path)
# Test the chatbot
prompt = "Write a TV show script for the TV show Futurama about Fry getting stuck in a hole." # Replace with your desired prompt
# prompt = "What is bulk metallic glass?" # Replace with your desired prompt
response = generate_response(model, tokenizer, prompt, max_length=1000)
print("Generated response:", response)
## PUSH THE MODEL AND TOKENIZER TO YOUR HUGGING FACE ACCOUNT.
model.push_to_hub(repo_id='KingZack/future-futurama-maker')
tokenizer.push_to_hub('KingZack/future-futurama-maker')
"""### check out the model you made in the offical hub.
--> https://huggingface.co/KingZack/future-futurama-maker
## Now load it from the hub and test it out.
"""
# Use a pipeline as a high-level helper
# from transformers import pipeline
# pipe = pipeline("text-generation", model="KingZack/future-futurama-maker")
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("KingZack/future-futurama-maker")
model = AutoModelForCausalLM.from_pretrained("KingZack/future-futurama-maker")
# Test the chatbot
prompt = 'Write a script for the TV show Futurama about Fry getting stuck in a hole.'
response = generate_response(model, tokenizer, prompt, max_length=1000)
print("Generated response:", response)