Model Card for t5-small-qg
Model Details
Model Description
This model was trained to generate questions from a given context.
- Developed by: philipp-zettl
- Model type: Transformer (T5)
- Language(s) (NLP): English
- License: MIT
- Finetuned from model: google/flan-t5-small
Model Sources
Fine-tune of the amazing google/flan-t5-small
Uses
The model is intended to generate questions from a given context. The context should not exceed the model's context length; the example below truncates inputs to 512 tokens.
Bias, Risks, and Limitations
No bias evaluation was performed on this model.
How to Get Started with the Model
Use the code below to get started with the model.
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# NOTE(review): Hub id assumed from this card's title and author -- confirm.
model_name = "philipp-zettl/t5-small-qg"

# Load the fine-tuned model once and put it in inference mode.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
model.eval()

context = "This is a long text based of multiple concatenated paragraphs."

# Inputs use the same "context: ..." prefix that the training pipeline
# further down this card applies; anything beyond 512 tokens is truncated.
model_inputs = tokenizer([f"context: {context}"], max_length=512, padding=True, truncation=True)
input_ids = torch.tensor(model_inputs['input_ids']).to(device)
attention_mask = torch.tensor(model_inputs['attention_mask']).to(device)

# Pass the attention mask along with the ids so padded positions (if any)
# are ignored during generation.
with torch.no_grad():
    sample_output = model.generate(
        input_ids[:1],
        attention_mask=attention_mask[:1],
        max_length=85,
    )

sample_output_text = tokenizer.decode(sample_output[0], skip_special_tokens=True)
input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
print(f"Sample Input:\n \"{input_text}\"\n\n")
print(f"Model Output: \"{sample_output_text}\"")
Training Details
Training Data
This model was trained on philipp-zettl/qg-tydiqa_squad2.
The training data was collected by combining philipp-zettl/tydiqa-task_2-english with nvidia/ChatQA-Training-Data#squad2.0.
From each base dataset we selected the `context` and `question` attributes of each sample, then joined them together into philipp-zettl/qg-tydiqa_squad2.
Training Procedure
Below you can find the full training pipeline used to achieve this fine-tune.
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
# Base checkpoint to fine-tune (see the FLAN-T5 release collection for alternatives):
# https://huggingface.co/collections/google/flan-t5-release-65005c39e3201fff885e22fb
model_name = 'google/flan-t5-small'

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Load the tokenizer and the seq2seq model, then place the model on the device.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
Load dataset
from datasets import load_dataset
# Fetch the combined question-generation dataset from the Hub.
squad_dataset = load_dataset('philipp-zettl/qg-tydiqa_squad2')

# Use the dataset's existing splits: 'train' for training, and its 'test'
# split as the validation set.
train_dataset, validation_dataset = squad_dataset['train'], squad_dataset['test']
Preprocessing: tokenize inputs and labels once up front, so that no tokenization is needed during training.
def preprocess_batch(batch, tokenizer, max_input_length=512, max_output_length=128):
    """Tokenize a batch of (context, question) pairs for seq2seq training.

    Args:
        batch: Mapping with a ``'context'`` and a ``'question'`` entry, each a
            list of strings of the same length.
        tokenizer: Callable tokenizer; it is invoked with a list of strings
            plus ``max_length`` and ``truncation`` and must return a mapping
            that contains ``'input_ids'``.
        max_input_length: Token limit applied to the prompts.
        max_output_length: Token limit applied to the target questions.

    Returns:
        The tokenizer output for the prompts, with an added ``'labels'`` entry
        holding the token ids of the target questions.
    """
    prompts = [f"context: {context}" for context in batch['context']]
    questions = batch['question']

    # Deliberately no padding here. Padding at this stage fills the labels
    # with the pad-token id, which the loss then treats as real targets.
    # DataCollatorForSeq2Seq pads each training batch instead and fills the
    # labels with -100, which the loss ignores.
    model_inputs = tokenizer(prompts, max_length=max_input_length, truncation=True)
    labels = tokenizer(questions, max_length=max_output_length, truncation=True)

    model_inputs['labels'] = labels['input_ids']
    return model_inputs
# Tokenize both splits once, in batches; the tokenizer is handed to
# preprocess_batch through fn_kwargs.
train_dataset = train_dataset.map(
    preprocess_batch, batched=True, fn_kwargs={'tokenizer': tokenizer}
)
validation_dataset = validation_dataset.map(
    preprocess_batch, batched=True, fn_kwargs={'tokenizer': tokenizer}
)

# Return only the model-facing columns, as PyTorch tensors.
tensor_columns = ['input_ids', 'attention_mask', 'labels']
for split in (train_dataset, validation_dataset):
    split.set_format(type='torch', columns=tensor_columns)
The train loop
from tqdm import tqdm
from transformers import AdamW, DataCollatorForSeq2Seq
from torch.utils.data import DataLoader
from torch.utils.tensorboard import SummaryWriter
torch.cuda.empty_cache()
model.to(device)

# Training parameters
epochs = 3
learning_rate = 5e-5
temperature = 2.0  # NOTE: not referenced anywhere in this training loop
batch_size = 8

# torch's own AdamW is used here; the AdamW name imported from transformers
# above is not used.
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Create a data collator for padding and batching
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

# Create DataLoaders with the data collator
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=data_collator)
# NOTE: the validation loader is built but not consumed by the loop below.
validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size, collate_fn=data_collator)

writer = SummaryWriter(comment='t5-small-qg')
steps_per_epoch = len(train_dataloader)

print("Starting training...")

# Training loop
for epoch in range(epochs):
    model.train()
    total_loss = 0
    print(f"Epoch {epoch+1}/{epochs}")
    progress_bar = tqdm(train_dataloader, desc="Training", leave=False)

    for step, batch in enumerate(progress_bar):
        # `step` restarts at 0 every epoch. Log against a counter that keeps
        # increasing so later epochs do not land on the same x-values as
        # earlier ones in TensorBoard.
        global_step = epoch * steps_per_epoch + step

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the cross-entropy loss.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss_value = loss.item()
        writer.add_scalar("Loss/train", loss_value, global_step)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss_value

        # Verbose logging
        if step % 100 == 1 or step == steps_per_epoch - 1:
            progress_bar.set_postfix({
                'step': step,
                'loss': loss_value,
            })

            # Generate a sample question from the first example of the
            # current batch. The attention mask is passed so padded positions
            # are ignored.
            model.eval()
            with torch.no_grad():
                sample_output = model.generate(
                    input_ids[:1],
                    attention_mask=attention_mask[:1],
                    max_length=50,
                )
            sample_output_text = tokenizer.decode(sample_output[0], skip_special_tokens=True)
            input_text = tokenizer.decode(input_ids[0], skip_special_tokens=True)
            writer.add_text("Sample Input", input_text, global_step)
            writer.add_text("Sample Output", sample_output_text, global_step)
            # Switch back to training mode before the next optimisation step.
            model.train()

    avg_loss = total_loss / steps_per_epoch
    print(f"Epoch {epoch+1} completed. Average Loss: {avg_loss:.4f}")
    writer.add_scalar("AVG Loss/train", avg_loss, epoch)

print("Training complete.")
writer.close()
- Downloads last month
- 16