# -*- coding: utf-8 -*-
"""FineTuning GPT2

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/1PlLmPZ7NMPjZFz7xisoypLSHIjE2s7Qm

# This notebook by Zack DeSario is a remix / combination of many sources, as all good code is.

The code is mainly from [@DigitalSreeni](https://youtu.be/DxygPxcfW_I). Their code cites the [official Hugging Face documentation](https://huggingface.co/transformers/v2.2.0/pretrained_models.html).
"""

# !pip install transformers
# !pip install torch
# !pip install transformers[torch]

import os
import re

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments
from huggingface_hub import notebook_login

notebook_login()

"""Required functions to read text from various files located in a directory. Files can be a mix of pdf, docx, or txt."""

### THIS CODE IS 100% WRITTEN BY THE FIRST SOURCE. VERY HELPFUL FUNCTIONS, TY.

# Functions to read different file types.
# NOTE: read_pdf and read_word were referenced but missing here; the minimal versions
# below fill that gap and assume PyPDF2 and python-docx are installed:
# !pip install PyPDF2 python-docx
def read_pdf(file_path):
    from PyPDF2 import PdfReader
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() or ""
    return text

def read_word(file_path):
    import docx
    doc = docx.Document(file_path)
    return "\n".join(paragraph.text for paragraph in doc.paragraphs)

def read_txt(file_path):
    with open(file_path, "r") as file:
        text = file.read()
    return text

def read_documents_from_directory(directory):
    combined_text = ""
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        if filename.endswith(".pdf"):
            combined_text += read_pdf(file_path)
        elif filename.endswith(".docx"):
            combined_text += read_word(file_path)
        elif filename.endswith(".txt"):
            combined_text += read_txt(file_path)
    return combined_text

# ANOTHER HELPER FUNCTION
def generate_response(model, tokenizer, prompt, max_length=100):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")

    # Create the attention mask and pad token id
    attention_mask = torch.ones_like(input_ids)
    pad_token_id = tokenizer.eos_token_id

    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        attention_mask=attention_mask,
        pad_token_id=pad_token_id,
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)

"""## Now load the base model and test it to see if it already does what we need it to do or not...."""

# Set up the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")  # other sizes to try: gpt2, gpt2-large, gpt2-xl
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")    # other sizes to try: gpt2, gpt2-large, gpt2-xl

prompt = 'Write a script for the TV show Futurama about Fry getting stuck in a hole.'
response = generate_response(model, tokenizer, prompt, max_length=200)
print(response)

prompt = 'Who is Fry from the TV show Futurama? Describe them in detail.'
response = generate_response(model, tokenizer, prompt, max_length=200)
print(response)

"""# Mmkay, it clearly does not know who Fry is or how to write a TV script.

### Let's train it to learn how to write a TV script for Futurama.

## Adding your data

1. Open the side panel, click on the folder icon, create a new folder called `my_data`, and drag and drop your data into that side panel. I will demonstrate during class.
2. Also, create a new folder called `my_trained_models`. That is where we will temporarily store our trained model. (You can also create both folders from code, as shown in the next cell.)

### Load your data

* You can download the data I used here: UPLOAD LINK SOON.
"""
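"""If you'd rather set up the folders from code than through the side panel, the cell below is a minimal sketch (not in the original notebook) that creates the same `my_data` and `my_trained_models` directories used later on; it assumes the default `/content` working directory on Colab."""

import os

# Create the data and model-output folders used below, if they don't already exist.
# Upload your .pdf / .docx / .txt files into my_data before running the next cell.
os.makedirs("/content/my_data", exist_ok=True)
os.makedirs("/content/my_trained_models", exist_ok=True)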
""" directory = "/content/my_data/" # Replace with the path to your directory containing the files model_output_path = "/content/my_trained_models/" train_fraction=0.8 # Read documents from the directory combined_text = read_documents_from_directory(directory) combined_text = re.sub(r'\n+', '\n', combined_text).strip() # Remove excess newline characters # Split the text into training and validation sets split_index = int(train_fraction * len(combined_text)) train_text = combined_text[:split_index] val_text = combined_text[split_index:] # Save the training and validation data as text files with open("train.txt", "w") as f: f.write(train_text) with open("val.txt", "w") as f: f.write(val_text) len(train_text) print(train_text[:1000]) """The train_chatbot function uses the combined text data to train a GPT-2 model using the provided training arguments. The resulting trained model and tokenizer are then saved to a specified output directory.""" # Prepare the dataset train_dataset = TextDataset(tokenizer=tokenizer, file_path="train.txt", block_size=128) val_dataset = TextDataset(tokenizer=tokenizer, file_path="val.txt", block_size=128) data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False) # Set up the training arguments training_args = TrainingArguments( output_dir=model_output_path, overwrite_output_dir=True, per_device_train_batch_size=4, per_device_eval_batch_size=4, num_train_epochs=33, save_steps=10_000, save_total_limit=2, logging_dir='./logs', ) # Train the model trainer = Trainer( model=model, args=training_args, data_collator=data_collator, train_dataset=train_dataset, eval_dataset=val_dataset, ) ## THIS TAKES 30 MINS FOR JUST 10 EPOCHS SO I AM NOT GOING TO DUE THAT DURING CLASS.... ## AND ~2HRS FOR 33 EPOCHS trainer.train() # Save the model trainer.save_model(model_output_path) # Save the tokenizer tokenizer.save_pretrained(model_output_path) print("SAVED MODELS LOCALLY YO!!!!!!") directory = "/content/my_data/" # Replace with the path to your directory containing the files model_output_path = "/content/my_trained_models/" model = GPT2LMHeadModel.from_pretrained(model_output_path) tokenizer = GPT2Tokenizer.from_pretrained(model_output_path) # Test the chatbot prompt = "Write a TV show script for the TV show Futurama about Fry getting stuck in a hole." # Replace with your desired prompt # prompt = "What is bulk metallic glass?" # Replace with your desired prompt response = generate_response(model, tokenizer, prompt, max_length=1000) print("Generated response:", response) ## PUSH THE MODELS TO YOUR HUGGING-FACE. model.push_to_hub(repo_id='KingZack/future-futurama-maker') tokenizer.push_to_hub('KingZack/future-futurama-maker') """### check out the model you made in the offical hub. --> https://huggingface.co/KingZack/future-futurama-maker ## Now load it from the hub and test it out. """ # Use a pipeline as a high-level helper # from transformers import pipeline # pipe = pipeline("text-generation", model="KingZack/future-futurama-maker") # Load model directly from transformers import AutoTokenizer, AutoModelForCausalLM tokenizer = AutoTokenizer.from_pretrained("KingZack/future-futurama-maker") model = AutoModelForCausalLM.from_pretrained("KingZack/future-futurama-maker") # Test the chatbot prompt = 'Write a script for the TV show Futurama about Fry getting stuck in a hole.' response = generate_response(model, tokenizer, prompt, max_length=1000) print("Generated response:", response)