import numpy as np from models.GPT import build_GPT # Utils to work with estimation functions def normalize_list(numbers): """ Normalizes a list of numbers to the range [0, 1]. Args: numbers (list of numeric): List of numbers to be normalized. Returns: list of float: Normalized list of numbers. """ min_val = min(numbers) max_val = max(numbers) normalized = [(x - min_val) / (max_val - min_val) for x in numbers] return normalized def estimate_optimal_ratios_from_models(model_configs, train_seq_len, x_train, y_train, max_epochs, batch_size): """ Estimate the optimal ratios of model size and number of training tokens from FLOP counts. Args: - model_configs (list): List of tuples representing model configurations. Each tuple contains parameters for building the model. - train_seq_len (list): List of integers representing different numbers of training sequences. - x_train (numpy array): Input data for training. - y_train (numpy array): Target data for training. - max_epochs (int): Maximum number of epochs for training. - batch_size (int): Batch size for training. Returns: - flops (numpy array): Array of FLOP counts for each experiment. - loss_history (numpy array): Array of loss histories for each experiment. - model_params (numpy array): Array of total model parameters for each experiment. """ total_models = len(model_configs) total_seq_len = len(train_seq_len) print('Total Number of Experiments: ' + str(total_models * total_seq_len)) experiment_number = 0 _flops = [] _loss_history = [] _model_params = [] for model_config in model_configs: for seq_len in train_seq_len: experiment_number += 1 print('Train Number: ' + str(experiment_number)) # Build the model and calculate FLOPs GPT, flops = build_GPT(*model_config) # Train the model history = GPT.fit(x_train[:seq_len], y_train[:seq_len], batch_size=batch_size, epochs=max_epochs) # Count model parameters model_params = GPT.count_params() # Extract loss history loss_history = history.history['loss'] # Store results _flops.append(flops*seq_len*max_epochs) _loss_history.append(loss_history) _model_params.append(model_params) return (np.array(_flops), np.array(_loss_history), np.array(_model_params)) import numpy as np def estimate_optimal_ratios_from_flops(flop_list, input_len, num_heads, head_dims, num_decoders, fc_dim_factor, vocab_size, dropout_rate, x_train, y_train, trials_per_flop=2, batch_size=32): """ Estimates optimal ratios of various model parameters based on FLOP count. Args: flop_list (list): List of FLOP counts to estimate optimal ratios for. input_len (int): Length of the input sequence. num_heads (tuple): Tuple containing the minimum and maximum values for the number of attention heads. head_dims (tuple): Tuple containing the minimum and maximum values for the dimensionality of attention heads. num_decoders (int): Number of decoder layers. fc_dim_factor (int): Factor to determine the dimensionality of fully connected layers. vocab_size (int): Size of the vocabulary. dropout_rate (float): Dropout rate. x_train (numpy.ndarray): Training input data. y_train (numpy.ndarray): Training target data. trials_per_flop (int, optional): Number of trials per FLOP count. Defaults to 2. batch_size (int, optional): Batch size for training. Defaults to 32. Returns: tuple: Tuple containing loss history, FLOP history, and number of parameters for each trial. """ loss_history = [] flop_history = [] parameters = [] for flop in flop_list: for _ in range(trials_per_flop): f_num_heads = np.random.randint(num_heads[0], num_heads[1]) f_head_dims = np.random.randint(head_dims[0], head_dims[1]) f_embed_dim = f_num_heads * f_head_dims f_num_decoders = np.random.randint(1, num_decoders) f_fc_dim_factor = np.random.randint(1, fc_dim_factor) args = (input_len, vocab_size, f_embed_dim, f_num_decoders, dropout_rate, f_num_heads, f_head_dims, f_fc_dim_factor ) GPT, flop_per_inference = build_GPT(*args) # Assuming build_GPT is defined elsewhere print(GPT.summary()) epochs = flop // flop_per_inference if epochs <= 0: raise Exception('The provided FLOP count is too small: ' + str(flop) + ' is too small') history = GPT.fit(x_train, y_train, batch_size=batch_size, epochs=epochs) loss_history.append(history.history['loss']) flop_history.append(flop*batch_size*epochs) parameters.append(GPT.count_params()) return loss_history, flop_history, parameters