import numpy as np
from models.GPT import build_GPT
# Utils to work with estimation functions
def normalize_list(numbers):
    """
    Normalizes a list of numbers to the range [0, 1].

    Args:
        numbers (list of numeric): List of numbers to be normalized.

    Returns:
        list of float: Normalized list of numbers.
    """
    min_val = min(numbers)
    max_val = max(numbers)
    if max_val == min_val:
        # All values are identical; avoid division by zero.
        return [0.0 for _ in numbers]
    normalized = [(x - min_val) / (max_val - min_val) for x in numbers]
    return normalized
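
# Example (illustrative): the smallest value maps to 0.0 and the largest to 1.0.
#     >>> normalize_list([2.0, 4.0, 6.0])
#     [0.0, 0.5, 1.0]
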
def estimate_optimal_ratios_from_models(model_configs,
                                         train_seq_len,
                                         x_train,
                                         y_train,
                                         max_epochs,
                                         batch_size):
    """
    Estimate the optimal ratios of model size and number of training tokens from FLOP counts.

    Args:
        model_configs (list): List of tuples representing model configurations.
            Each tuple contains the positional arguments for building the model.
        train_seq_len (list): List of integers representing different numbers of training sequences.
        x_train (numpy array): Input data for training.
        y_train (numpy array): Target data for training.
        max_epochs (int): Maximum number of epochs for training.
        batch_size (int): Batch size for training.

    Returns:
        flops (numpy array): Array of FLOP counts for each experiment.
        loss_history (numpy array): Array of loss histories for each experiment.
        model_params (numpy array): Array of total model parameters for each experiment.
    """
    total_models = len(model_configs)
    total_seq_len = len(train_seq_len)
    print(f'Total Number of Experiments: {total_models * total_seq_len}')

    experiment_number = 0
    _flops = []
    _loss_history = []
    _model_params = []

    for model_config in model_configs:
        for seq_len in train_seq_len:
            experiment_number += 1
            print(f'Train Number: {experiment_number}')

            # Build the model and calculate FLOPs
            GPT, flops = build_GPT(*model_config)

            # Train the model
            history = GPT.fit(x_train[:seq_len], y_train[:seq_len],
                              batch_size=batch_size, epochs=max_epochs)

            # Count model parameters
            model_params = GPT.count_params()

            # Extract loss history
            loss_history = history.history['loss']

            # Store results
            _flops.append(flops * seq_len * max_epochs)
            _loss_history.append(loss_history)
            _model_params.append(model_params)

    return (np.array(_flops), np.array(_loss_history), np.array(_model_params))
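
# Illustrative usage sketch (not executed on import). Each config tuple is unpacked
# into build_GPT, so it must follow the same argument order used in
# estimate_optimal_ratios_from_flops below: (input_len, vocab_size, embed_dim,
# num_decoders, dropout_rate, num_heads, head_dims, fc_dim_factor). The values
# here are placeholders, not recommended settings.
#
#     configs = [
#         (128, 5000, 64, 2, 0.1, 4, 16, 2),   # smaller model
#         (128, 5000, 128, 4, 0.1, 8, 16, 2),  # larger model
#     ]
#     flops, losses, params = estimate_optimal_ratios_from_models(
#         configs, train_seq_len=[1000, 5000],
#         x_train=x_train, y_train=y_train,
#         max_epochs=3, batch_size=32)
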
def estimate_optimal_ratios_from_flops(flop_list,
                                       input_len,
                                       num_heads,
                                       head_dims,
                                       num_decoders,
                                       fc_dim_factor,
                                       vocab_size,
                                       dropout_rate,
                                       x_train,
                                       y_train,
                                       trials_per_flop=2,
                                       batch_size=32):
    """
    Estimates optimal ratios of various model parameters based on FLOP count.

    Args:
        flop_list (list): List of FLOP budgets to estimate optimal ratios for.
        input_len (int): Length of the input sequence.
        num_heads (tuple): (min, max) range to sample the number of attention heads from.
        head_dims (tuple): (min, max) range to sample the dimensionality of attention heads from.
        num_decoders (int): Upper bound on the number of decoder layers sampled per trial.
        fc_dim_factor (int): Upper bound on the factor that determines the dimensionality
            of the fully connected layers.
        vocab_size (int): Size of the vocabulary.
        dropout_rate (float): Dropout rate.
        x_train (numpy.ndarray): Training input data.
        y_train (numpy.ndarray): Training target data.
        trials_per_flop (int, optional): Number of trials per FLOP budget. Defaults to 2.
        batch_size (int, optional): Batch size for training. Defaults to 32.

    Returns:
        tuple: Tuple containing the loss history, FLOP history, and number of parameters
            for each trial.
    """
    loss_history = []
    flop_history = []
    parameters = []

    for flop in flop_list:
        for _ in range(trials_per_flop):
            # Sample a random architecture within the provided ranges
            f_num_heads = np.random.randint(num_heads[0], num_heads[1])
            f_head_dims = np.random.randint(head_dims[0], head_dims[1])
            f_embed_dim = f_num_heads * f_head_dims
            f_num_decoders = np.random.randint(1, num_decoders)
            f_fc_dim_factor = np.random.randint(1, fc_dim_factor)

            args = (input_len,
                    vocab_size,
                    f_embed_dim,
                    f_num_decoders,
                    dropout_rate,
                    f_num_heads,
                    f_head_dims,
                    f_fc_dim_factor)

            GPT, flop_per_inference = build_GPT(*args)
            GPT.summary()

            # Spend the FLOP budget: more expensive models get fewer epochs
            epochs = flop // flop_per_inference
            if epochs <= 0:
                raise ValueError('The provided FLOP budget is too small: ' + str(flop))

            history = GPT.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)

            loss_history.append(history.history['loss'])
            flop_history.append(flop * batch_size * epochs)
            parameters.append(GPT.count_params())

    return loss_history, flop_history, parameters
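
# Illustrative usage sketch (not executed on import); the FLOP budgets and the
# sampling ranges below are placeholder values.
#
#     losses, flops, params = estimate_optimal_ratios_from_flops(
#         flop_list=[10**9, 10**10],
#         input_len=128,
#         num_heads=(2, 8),
#         head_dims=(8, 32),
#         num_decoders=4,
#         fc_dim_factor=4,
#         vocab_size=5000,
#         dropout_rate=0.1,
#         x_train=x_train,
#         y_train=y_train)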