import numpy as np
from models.GPT import build_GPT

# Utility functions for estimating optimal ratios of model size and training tokens

def normalize_list(numbers):
    """
    Normalizes a list of numbers to the range [0, 1].
    
    Args:
        numbers (list of numeric): List of numbers to be normalized.
        
    Returns:
        list of float: Normalized list of numbers.
    """
    min_val = min(numbers)
    max_val = max(numbers)
    if max_val == min_val:
        # Guard against division by zero when all values are identical.
        return [0.0 for _ in numbers]
    normalized = [(x - min_val) / (max_val - min_val) for x in numbers]
    return normalized
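
# A minimal usage sketch for normalize_list (values are illustrative), e.g. to put
# loss curves from different runs onto a common [0, 1] scale:
# >>> normalize_list([2.0, 3.0, 4.0])
# [0.0, 0.5, 1.0]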


def estimate_optimal_ratios_from_models(model_configs, 
                                       train_seq_len,
                                       x_train,
                                       y_train,
                                       max_epochs,
                                       batch_size):
    """
    Estimate the optimal ratios of model size and number of training tokens from FLOP counts.

    Args:
    - model_configs (list): List of tuples representing model configurations.
                            Each tuple contains parameters for building the model.
    - train_seq_len (list): List of integers; each entry is the number of training sequences (a slice of x_train/y_train) to train on.
    - x_train (numpy array): Input data for training.
    - y_train (numpy array): Target data for training.
    - max_epochs (int): Maximum number of epochs for training.
    - batch_size (int): Batch size for training.

    Returns:
    - flops (numpy array): Array of FLOP counts for each experiment.
    - loss_history (numpy array): Array of loss histories for each experiment.
    - model_params (numpy array): Array of total model parameters for each experiment.
    """

    total_models = len(model_configs)
    total_seq_len = len(train_seq_len)

    print('Total Number of Experiments: ' + str(total_models * total_seq_len))

    experiment_number = 0
    _flops = []
    _loss_history = []
    _model_params = []
    for model_config in model_configs:
        for seq_len in train_seq_len:
            experiment_number += 1
            print('Experiment Number: ' + str(experiment_number))

            # Build the model and calculate FLOPs
            GPT, flops = build_GPT(*model_config)
            
            # Train the model
            history = GPT.fit(x_train[:seq_len], y_train[:seq_len], batch_size=batch_size, epochs=max_epochs)
            
            # Count model parameters
            model_params = GPT.count_params()

            # Extract loss history
            loss_history = history.history['loss']
            
            # Store results; total training FLOPs are approximated as
            # per-sequence FLOPs * number of sequences * epochs.
            _flops.append(flops * seq_len * max_epochs)
            _loss_history.append(loss_history)
            _model_params.append(model_params)
    
    return (np.array(_flops), np.array(_loss_history), np.array(_model_params))
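
# Illustrative usage sketch (hypothetical values). Each model_config tuple is assumed
# to follow build_GPT's positional signature as used further below: (input_len,
# vocab_size, embed_dim, num_decoders, dropout_rate, num_heads, head_dims, fc_dim_factor).
# >>> model_configs = [
# ...     (128, 5000, 128, 2, 0.1, 2, 64, 2),
# ...     (128, 5000, 256, 4, 0.1, 4, 64, 4),
# ... ]
# >>> flops, losses, params = estimate_optimal_ratios_from_models(
# ...     model_configs, train_seq_len=[1000, 10000],
# ...     x_train=x_train, y_train=y_train, max_epochs=5, batch_size=32)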


def estimate_optimal_ratios_from_flops(flop_list,
                                       input_len,
                                       num_heads,
                                       head_dims,
                                       num_decoders,
                                       fc_dim_factor,
                                       vocab_size,
                                       dropout_rate,
                                       x_train,
                                       y_train,
                                       trials_per_flop=2,
                                       batch_size=32):
    """
    Estimates optimal ratios of various model parameters based on FLOP count.

    Args:
        flop_list (list): List of FLOP counts to estimate optimal ratios for.
        input_len (int): Length of the input sequence.
        num_heads (tuple): (min, max) range from which the number of attention heads is sampled (max exclusive).
        head_dims (tuple): (min, max) range from which the attention head dimensionality is sampled (max exclusive).
        num_decoders (int): Exclusive upper bound on the randomly sampled number of decoder layers.
        fc_dim_factor (int): Exclusive upper bound on the randomly sampled factor that sets the dimensionality of the fully connected layers.
        vocab_size (int): Size of the vocabulary.
        dropout_rate (float): Dropout rate.
        x_train (numpy.ndarray): Training input data.
        y_train (numpy.ndarray): Training target data.
        trials_per_flop (int, optional): Number of trials per FLOP count. Defaults to 2.
        batch_size (int, optional): Batch size for training. Defaults to 32.

    Returns:
        tuple: Tuple containing loss history, FLOP history, and number of parameters for each trial.
    """

    loss_history = []
    flop_history = []
    parameters = []

    for flop in flop_list:
        for _ in range(trials_per_flop):
            # Randomly sample a model configuration within the supplied ranges.
            f_num_heads = np.random.randint(num_heads[0], num_heads[1])
            f_head_dims = np.random.randint(head_dims[0], head_dims[1])
            f_embed_dim = f_num_heads * f_head_dims
            f_num_decoders = np.random.randint(1, num_decoders)
            f_fc_dim_factor = np.random.randint(1, fc_dim_factor)

            args = (input_len,
                    vocab_size,
                    f_embed_dim,
                    f_num_decoders,
                    dropout_rate,
                    f_num_heads,
                    f_head_dims,
                    f_fc_dim_factor
                    )

            # build_GPT is imported from models.GPT at the top of this module.
            GPT, flop_per_inference = build_GPT(*args)
            GPT.summary()  # Keras' summary() prints the architecture itself and returns None.

            # Derive the epoch budget from the target FLOP count (cast to int for Keras).
            epochs = int(flop // flop_per_inference)
            if epochs <= 0:
                raise ValueError('The provided FLOP budget ' + str(flop) + ' is too small for this model configuration.')

            history = GPT.fit(x_train, y_train, batch_size=batch_size, epochs=epochs)

            loss_history.append(history.history['loss'])
            flop_history.append(flop*batch_size*epochs)
            parameters.append(GPT.count_params())

    return loss_history, flop_history, parameters
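
# Illustrative usage sketch (hypothetical FLOP budgets and search ranges):
# >>> losses, flops, params = estimate_optimal_ratios_from_flops(
# ...     flop_list=[1e12, 1e13],
# ...     input_len=128, num_heads=(2, 8), head_dims=(32, 128),
# ...     num_decoders=4, fc_dim_factor=4, vocab_size=5000,
# ...     dropout_rate=0.1, x_train=x_train, y_train=y_train,
# ...     trials_per_flop=2, batch_size=32)
# Each trial samples a random architecture, trains it for roughly the given FLOP
# budget, and records the loss curve, spent FLOPs, and parameter count so that
# compute-optimal model-size/data ratios can be estimated afterwards.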