# -*- coding: utf-8 -*-
"""FineTuning GPT2
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1PlLmPZ7NMPjZFz7xisoypLSHIjE2s7Qm
# This notebook by Zack DeSario is a remix / combination of many sources as all good code is.
The code is mainly from [@DigitalSreeni](https://youtu.be/DxygPxcfW_I). Their code cites the [huggingface official tutorial](https://huggingface.co/transformers/v2.2.0/pretrained_models.html).
"""
# !pip install transformers
# !pip install torch
# !pip install transformers[torch]
import os
import re
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from huggingface_hub import notebook_login
notebook_login()
"""Required functions to read text from various files located in a directory. Files can be a mix of pdf, docx, or txt."""
### THIS CODE IS 100% WRITTEN BY THE FIRST SOURCE. VERY HELPFUL FUNCTIONS, TY.
# Functions to read different file types
def read_txt(file_path):
    with open(file_path, "r") as file:
        text = file.read()
    return text
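
# NOTE: read_pdf and read_word are called below but were not defined in this
# file. A minimal sketch of the two helpers, assuming the PyPDF2 and
# python-docx libraries are installed (e.g. !pip install PyPDF2 python-docx):
def read_pdf(file_path):
    from PyPDF2 import PdfReader
    reader = PdfReader(file_path)
    # Concatenate the extracted text of every page.
    return "".join(page.extract_text() or "" for page in reader.pages)

def read_word(file_path):
    from docx import Document
    doc = Document(file_path)
    # Join paragraph texts with newlines.
    return "\n".join(para.text for para in doc.paragraphs)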
def read_documents_from_directory(directory):
    combined_text = ""
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if filename.endswith(".pdf"):
            combined_text += read_pdf(file_path)
        elif filename.endswith(".docx"):
            combined_text += read_word(file_path)
        elif filename.endswith(".txt"):
            combined_text += read_txt(file_path)
    return combined_text
# ANOTHER HELPER FUNCTION
def generate_response(model, tokenizer, prompt, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    # Create the attention mask and pad token id
    attention_mask = torch.ones_like(input_ids)
    pad_token_id = tokenizer.eos_token_id
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        attention_mask=attention_mask,
        pad_token_id=pad_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)
"""## Now load the base model and test it to see if it already does what we need to do or not...."""
# Set up the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")  # also try gpt2, gpt2-large, or gpt2-xl
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")    # also try gpt2, gpt2-large, or gpt2-xl
prompt = 'Write a script for the TV show Futurama about Fry getting stuck in a hole.'
response = generate_response(model, tokenizer, prompt, max_length=200)
print(response)
prompt = 'Who is Fry from the TV show Futurama? Describe them in detail.'
response = generate_response(model, tokenizer, prompt, max_length=200)
print(response)
"""# Mmkay, it clearly does not know who Fry is or how to write a TV Script.
### Lets train it to learn how to write a TV script for Futurama.
## Adding your data
1. Open the side panel, click on the folder icon, create a new folder called `my_data`, and drag and drop your data into that side panel. I will demonstrate during class.
2. Also, create a new folder called `my_trained_model`. That is where we will temporarily store our trained model.
Load your data
* You can download the data I used here: UPLOAD LINK SOON.
"""
directory = "/content/my_data/" # Replace with the path to your directory containing the files
model_output_path = "/content/my_trained_models/"
train_fraction = 0.8
# Read documents from the directory
combined_text = read_documents_from_directory(directory)
combined_text = re.sub(r'\n+', '\n', combined_text).strip() # Remove excess newline characters
# Split the text into training and validation sets
split_index = int(train_fraction * len(combined_text))
train_text = combined_text[:split_index]
val_text = combined_text[split_index:]
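# NOTE: this is a simple character-level split, so the train/val boundary may
# fall mid-scene; good enough for a quick demo.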
# Save the training and validation data as text files
with open("train.txt", "w") as f:
f.write(train_text)
with open("val.txt", "w") as f:
f.write(val_text)
print(len(train_text))  # Sanity-check the size of the training split
print(train_text[:1000])
"""The train_chatbot function uses the combined text data to train a GPT-2 model using the provided training arguments. The resulting trained model and tokenizer are then saved to a specified output directory."""
# Prepare the dataset
train_dataset = TextDataset(tokenizer=tokenizer, file_path="train.txt", block_size=128)
val_dataset = TextDataset(tokenizer=tokenizer, file_path="val.txt", block_size=128)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
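# mlm=False makes this a causal-LM collator: it pads each batch and copies
# input_ids into labels (pad positions set to -100), which GPT-2 shifts
# internally to compute next-token loss.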
# Set up the training arguments
training_args = TrainingArguments(
    output_dir=model_output_path,
    overwrite_output_dir=True,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=33,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
)
# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
## THIS TAKES 30 MINS FOR JUST 10 EPOCHS, SO I AM NOT GOING TO DO THAT DURING CLASS....
## AND ~2HRS FOR 33 EPOCHS
trainer.train()
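## OPTIONAL (not in the original notebook): since a validation split was
## passed to the Trainer, you can report its loss after training with the
## standard Trainer API.
eval_metrics = trainer.evaluate()
print("Validation metrics:", eval_metrics)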
# Save the model
trainer.save_model(model_output_path)
# Save the tokenizer
tokenizer.save_pretrained(model_output_path)
print("SAVED MODELS LOCALLY YO!!!!!!")
model = GPT2LMHeadModel.from_pretrained(model_output_path)
tokenizer = GPT2Tokenizer.from_pretrained(model_output_path)
# Test the chatbot
prompt = "Write a TV show script for the TV show Futurama about Fry getting stuck in a hole." # Replace with your desired prompt
# prompt = "What is bulk metallic glass?" # Replace with your desired prompt
response = generate_response(model, tokenizer, prompt, max_length=1000)
print("Generated response:", response)
## PUSH THE MODEL AND TOKENIZER TO YOUR HUGGING FACE ACCOUNT.
model.push_to_hub(repo_id='KingZack/future-futurama-maker')
tokenizer.push_to_hub('KingZack/future-futurama-maker')
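## NOTE: push_to_hub requires the notebook_login() call at the top of this
## file (or a Hugging Face token with write access) to have succeeded.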
"""### check out the model you made in the offical hub.
--> https://huggingface.co/KingZack/future-futurama-maker
## Now load it from the hub and test it out.
"""
# Use a pipeline as a high-level helper
# from transformers import pipeline
# pipe = pipeline("text-generation", model="KingZack/future-futurama-maker")
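# Hypothetical usage (text-generation pipelines return a list of dicts with a
# "generated_text" key):
# print(pipe("Write a script for the TV show Futurama", max_length=200)[0]["generated_text"])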
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("KingZack/future-futurama-maker")
model = AutoModelForCausalLM.from_pretrained("KingZack/future-futurama-maker")
# Test the chatbot
prompt = 'Write a script for the TV show Futurama about Fry getting stuck in a hole.'
response = generate_response(model, tokenizer, prompt, max_length=1000)
print("Generated response:", response)