# -*- coding: utf-8 -*-
"""FineTuning GPT2

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1PlLmPZ7NMPjZFz7xisoypLSHIjE2s7Qm

# This notebook by Zack DeSario is a remix / combination of many sources, as all good code is.

The code is mainly from [@DigitalSreeni](https://youtu.be/DxygPxcfW_I). Their code cites the [Hugging Face official tutorial](https://huggingface.co/transformers/v2.2.0/pretrained_models.html).
"""
# !pip install transformers
# !pip install torch
# !pip install transformers[torch]

import os
import re

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from huggingface_hub import notebook_login

notebook_login()
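# notebook_login() prompts for a Hugging Face access token; use a token with
# write access so the push_to_hub calls at the end of this notebook succeed.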
"""Required functions to read text from various files located in a directory. Files can be a mix of pdf, docx, or txt.""" | |
### THIS CODE IS 100% WRITTEN BY THE FIRST SOURCE. VERY HELPFUL FUNCTIONS, TY. | |
# Functions to read different file types | |
def read_txt(file_path): | |
with open(file_path, "r") as file: | |
text = file.read() | |
return text | |
def read_documents_from_directory(directory): | |
combined_text = "" | |
for filename in os.listdir(directory): | |
file_path = os.path.join(directory, filename) | |
if filename.endswith(".pdf"): | |
combined_text += read_pdf(file_path) | |
elif filename.endswith(".docx"): | |
combined_text += read_word(file_path) | |
elif filename.endswith(".txt"): | |
combined_text += read_txt(file_path) | |
return combined_text | |
# ANOTHER HELPER FUNCTION
def generate_response(model, tokenizer, prompt, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # Create the attention mask and pad token id
    attention_mask = torch.ones_like(input_ids)
    pad_token_id = tokenizer.eos_token_id

    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        attention_mask=attention_mask,
        pad_token_id=pad_token_id,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)
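# A note on decoding: with the arguments above, generate() runs greedy decoding,
# so the same prompt always yields the same output. For more varied text you
# could sample instead, e.g.:
# output = model.generate(input_ids, max_length=max_length, do_sample=True,
#                         top_k=50, top_p=0.95, pad_token_id=pad_token_id)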
"""## Now load the base model and test it to see if it already does what we need to do or not....""" | |
# Set up the tokenizer and model | |
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium") #also try gpt2, gpt2-large and gpt2-medium, also gpt2-xl | |
model = GPT2LMHeadModel.from_pretrained("gpt2-medium") #also try gpt2, gpt2-large and gpt2-medium, also gpt2-xl | |
prompt = 'Write a script for the TV show Futurama about Fry getting stuck in a hole.' | |
response = generate_response(model, tokenizer, prompt, max_length=200) | |
print(response) | |
prompt = 'Who is Fry TV show Futurama? Describe them in detail.' | |
response = generate_response(model, tokenizer, prompt, max_length=200) | |
print(response) | |
"""# Mmkay, it clearly does not know who Fry is or how to write a TV Script. | |
### Lets train it to learn how to write a TV script for Futurama. | |
## Adding your data | |
1. Open the side panel, click on the folder icon, create a new folder called `my_data`, and drag and drop your data into that side panel. I will demonstrate during class. | |
2. Also, create a new folder called `my_trained_model`. That is where we will temporarily store our trained model. | |
Load your data | |
* You can download the data I used here: UPLOAD LINK SOON. | |
""" | |
directory = "/content/my_data/"  # Replace with the path to your directory containing the files
model_output_path = "/content/my_trained_models/"
train_fraction = 0.8

# Read documents from the directory
combined_text = read_documents_from_directory(directory)
combined_text = re.sub(r'\n+', '\n', combined_text).strip()  # Remove excess newline characters

# Split the text into training and validation sets
split_index = int(train_fraction * len(combined_text))
train_text = combined_text[:split_index]
val_text = combined_text[split_index:]

# Save the training and validation data as text files
with open("train.txt", "w") as f:
    f.write(train_text)
with open("val.txt", "w") as f:
    f.write(val_text)

print(len(train_text))
print(train_text[:1000])
"""The train_chatbot function uses the combined text data to train a GPT-2 model using the provided training arguments. The resulting trained model and tokenizer are then saved to a specified output directory.""" | |
# Prepare the dataset.
# NOTE: TextDataset is deprecated in recent transformers releases (the datasets
# library is the recommended replacement), but it still works for this tutorial.
train_dataset = TextDataset(tokenizer=tokenizer, file_path="train.txt", block_size=128)
val_dataset = TextDataset(tokenizer=tokenizer, file_path="val.txt", block_size=128)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
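# mlm=False tells the collator to build labels for causal language modeling
# (predict the next token), which is GPT-2's objective, rather than BERT-style
# masked language modeling.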
# Set up the training arguments
training_args = TrainingArguments(
    output_dir=model_output_path,
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=33,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)
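# An eval_dataset is passed to the Trainer below, but nothing above schedules
# evaluation. If you want a validation loss during training, you could add an
# evaluation-strategy argument to TrainingArguments, e.g.
# evaluation_strategy="epoch" (the parameter is named evaluation_strategy or
# eval_strategy depending on your transformers version).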
# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

## THIS TAKES 30 MINS FOR JUST 10 EPOCHS, SO I AM NOT GOING TO DO THAT DURING CLASS....
## AND ~2HRS FOR 33 EPOCHS.
trainer.train()

# Save the model
trainer.save_model(model_output_path)
# Save the tokenizer
tokenizer.save_pretrained(model_output_path)
print("SAVED MODELS LOCALLY YO!!!!!!")
# Load the fine-tuned model and tokenizer back from disk.
model_output_path = "/content/my_trained_models/"
model = GPT2LMHeadModel.from_pretrained(model_output_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_output_path)
# Test the chatbot
prompt = "Write a TV show script for the TV show Futurama about Fry getting stuck in a hole."  # Replace with your desired prompt
# prompt = "What is bulk metallic glass?"  # Replace with your desired prompt
response = generate_response(model, tokenizer, prompt, max_length=1000)
print("Generated response:", response)

## PUSH THE MODELS TO YOUR HUGGING FACE HUB ACCOUNT.
model.push_to_hub(repo_id='KingZack/future-futurama-maker')
tokenizer.push_to_hub('KingZack/future-futurama-maker')
"""### check out the model you made in the offical hub. | |
--> https://huggingface.co/KingZack/future-futurama-maker | |
## Now load it from the hub and test it out. | |
""" | |
# Use a pipeline as a high-level helper
# from transformers import pipeline
# pipe = pipeline("text-generation", model="KingZack/future-futurama-maker")
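# If you go the pipeline route instead, usage would look like this (a sketch
# using the commented-out `pipe` above):
# print(pipe(prompt, max_length=200)[0]["generated_text"])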
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

tokenizer = AutoTokenizer.from_pretrained("KingZack/future-futurama-maker")
model = AutoModelForCausalLM.from_pretrained("KingZack/future-futurama-maker")

# Test the chatbot
prompt = 'Write a script for the TV show Futurama about Fry getting stuck in a hole.'
response = generate_response(model, tokenizer, prompt, max_length=1000)
print("Generated response:", response)