import numpy as np
from models.GPT import build_GPT
# Utils to work with estimation functions
def normalize_list(numbers):
    """
    Normalizes a list of numbers to the range [0, 1].

    Args:
        numbers (list of numeric): List of numbers to be normalized.

    Returns:
        list of float: Normalized list of numbers.
    """
    min_val = min(numbers)
    max_val = max(numbers)
    if max_val == min_val:
        # Avoid division by zero when all values are identical.
        return [0.0 for _ in numbers]
    normalized = [(x - min_val) / (max_val - min_val) for x in numbers]
    return normalized
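

# Illustrative usage sketch (the numbers below are made up): normalize_list is
# useful for putting loss curves or FLOP counts from different runs on a common
# [0, 1] scale before comparing or plotting them.
def _demo_normalize_list():
    losses = [3.2, 2.1, 1.4, 1.1]
    return normalize_list(losses)  # -> [1.0, ~0.476, ~0.143, 0.0]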
def estimate_optimal_ratios_from_models(model_configs,
                                        train_seq_len,
                                        x_train,
                                        y_train,
                                        max_epochs,
                                        batch_size):
"""
Estimate the optimal ratios of model size and number of training tokens from FLOP counts.
Args:
- model_configs (list): List of tuples representing model configurations.
Each tuple contains parameters for building the model.
- train_seq_len (list): List of integers representing different numbers of training sequences.
- x_train (numpy array): Input data for training.
- y_train (numpy array): Target data for training.
- max_epochs (int): Maximum number of epochs for training.
- batch_size (int): Batch size for training.
Returns:
- flops (numpy array): Array of FLOP counts for each experiment.
- loss_history (numpy array): Array of loss histories for each experiment.
- model_params (numpy array): Array of total model parameters for each experiment.
"""
    total_models = len(model_configs)
    total_seq_len = len(train_seq_len)
    print('Total Number of Experiments: ' + str(total_models * total_seq_len))

    experiment_number = 0
    _flops = []
    _loss_history = []
    _model_params = []

    for model_config in model_configs:
        for seq_len in train_seq_len:
            experiment_number += 1
            print('Experiment Number: ' + str(experiment_number))

            # Build the model and get its FLOP count per forward pass
            GPT, flops = build_GPT(*model_config)

            # Train the model on the first `seq_len` sequences
            history = GPT.fit(x_train[:seq_len], y_train[:seq_len],
                              batch_size=batch_size, epochs=max_epochs)

            # Count model parameters
            model_params = GPT.count_params()

            # Extract loss history
            loss_history = history.history['loss']

            # Store results: total training FLOPs, loss curve, and parameter count
            _flops.append(flops * seq_len * max_epochs)
            _loss_history.append(loss_history)
            _model_params.append(model_params)

    return (np.array(_flops), np.array(_loss_history), np.array(_model_params))
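

# Illustrative sketch (not executed on import) of how the scan above might be
# driven. The model_config tuples assume the build_GPT argument order
# (input_len, vocab_size, embed_dim, num_decoders, dropout_rate, num_heads,
# head_dims, fc_dim_factor) used in estimate_optimal_ratios_from_flops below;
# the toy data shapes and values are placeholders, not the project's real setup.
def _demo_estimate_from_models():
    rng = np.random.default_rng(0)
    x_toy = rng.integers(0, 1000, size=(512, 64))  # (num_sequences, input_len)
    y_toy = rng.integers(0, 1000, size=(512, 64))  # next-token targets
    configs = [
        (64, 1000, 128, 2, 0.1, 4, 32, 2),  # small hypothetical model
        (64, 1000, 256, 4, 0.1, 8, 32, 2),  # larger hypothetical model
    ]
    return estimate_optimal_ratios_from_models(configs,
                                               train_seq_len=[128, 256, 512],
                                               x_train=x_toy,
                                               y_train=y_toy,
                                               max_epochs=3,
                                               batch_size=32)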
def estimate_optimal_ratios_from_flops(flop_list,
                                       input_len,
                                       num_heads,
                                       head_dims,
                                       num_decoders,
                                       fc_dim_factor,
                                       vocab_size,
                                       dropout_rate,
                                       x_train,
                                       y_train,
                                       trials_per_flop=2,
                                       batch_size=32):
"""
Estimates optimal ratios of various model parameters based on FLOP count.
Args:
flop_list (list): List of FLOP counts to estimate optimal ratios for.
input_len (int): Length of the input sequence.
num_heads (tuple): Tuple containing the minimum and maximum values for the number of attention heads.
head_dims (tuple): Tuple containing the minimum and maximum values for the dimensionality of attention heads.
num_decoders (int): Number of decoder layers.
fc_dim_factor (int): Factor to determine the dimensionality of fully connected layers.
vocab_size (int): Size of the vocabulary.
dropout_rate (float): Dropout rate.
x_train (numpy.ndarray): Training input data.
y_train (numpy.ndarray): Training target data.
trials_per_flop (int, optional): Number of trials per FLOP count. Defaults to 2.
batch_size (int, optional): Batch size for training. Defaults to 32.
Returns:
tuple: Tuple containing loss history, FLOP history, and number of parameters for each trial.
"""
    loss_history = []
    flop_history = []
    parameters = []

    for flop in flop_list:
        for _ in range(trials_per_flop):
            # Sample a random architecture within the given search ranges
            f_num_heads = np.random.randint(num_heads[0], num_heads[1])
            f_head_dims = np.random.randint(head_dims[0], head_dims[1])
            f_embed_dim = f_num_heads * f_head_dims
            f_num_decoders = np.random.randint(1, num_decoders)
            f_fc_dim_factor = np.random.randint(1, fc_dim_factor)

            args = (input_len,
                    vocab_size,
                    f_embed_dim,
                    f_num_decoders,
                    dropout_rate,
                    f_num_heads,
                    f_head_dims,
                    f_fc_dim_factor)

            # Build the model and get its FLOP count per forward pass
            GPT, flop_per_inference = build_GPT(*args)
            GPT.summary()

            # Spend the FLOP budget by choosing the number of training epochs
            epochs = int(flop // flop_per_inference)
            if epochs <= 0:
                raise ValueError('The provided FLOP budget is too small: ' + str(flop))

            history = GPT.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)

            loss_history.append(history.history['loss'])
            flop_history.append(flop * batch_size * epochs)
            parameters.append(GPT.count_params())

    return loss_history, flop_history, parameters
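

# Illustrative sketch (not executed on import) of sweeping a small grid of FLOP
# budgets with the function above. The budgets, search ranges, and toy data are
# placeholders; sampled architectures will differ between calls because the head
# count, head dimension, decoder depth, and FC factor are drawn at random.
def _demo_estimate_from_flops():
    rng = np.random.default_rng(1)
    x_toy = rng.integers(0, 1000, size=(256, 64))  # (num_sequences, input_len)
    y_toy = rng.integers(0, 1000, size=(256, 64))  # next-token targets
    return estimate_optimal_ratios_from_flops(flop_list=[1e12, 1e13],
                                              input_len=64,
                                              num_heads=(2, 8),
                                              head_dims=(16, 64),
                                              num_decoders=4,
                                              fc_dim_factor=4,
                                              vocab_size=1000,
                                              dropout_rate=0.1,
                                              x_train=x_toy,
                                              y_train=y_toy,
                                              trials_per_flop=2,
                                              batch_size=32)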