File size: 4,542 Bytes
e20929a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
import math
import numpy as np
from scipy.optimize import fsolve


def Nnv_to_d(Nnv):
    """Map a non-vocabulary parameter count Nnv to the model hidden size d.

    Uses a fixed table of (inclusive upper bound on Nnv, hidden size)
    buckets; any count above the largest bound falls back to 24576.
    Returns d as a float.
    """
    # (inclusive upper bound on Nnv, hidden dimension d)
    buckets = (
        (50_000_000, 512),
        (200_000_000, 768),
        (500_000_000, 1024),
        (1_000_000_000, 1536),
        (2_000_000_000, 2048),
        (5_000_000_000, 3200),
        (10_000_000_000, 4096),
        (20_000_000_000, 5120),
        (50_000_000_000, 6048),
        (100_000_000_000, 8192),
        (200_000_000_000, 12288),
        (500_000_000_000, 16384),
        (1_000_000_000_000, 20480),
    )
    for upper, d in buckets:
        if Nnv <= upper:
            return float(d)
    # Beyond 1e12 non-vocabulary parameters: use the largest hidden size.
    return float(24576)


def Nnvopt_to_flops(Nnv):
    """Return the training-optimal FLOPs budget for the given
    non-vocabulary parameter count Nnv.

    Inverts the fitted power law Nnv = exp(-2.4846...) * FLOPs ** 0.5.
    """
    coeff = np.exp(-2.4846510161625193)  # fitted proportionality constant
    return (Nnv / coeff) ** 2.0


def flops_to_Nnvopt(FLOPs):
    """Return the training-optimal non-vocabulary parameter count Nnv
    for the given FLOPs budget.

    Applies the fitted power law Nnv = exp(-2.4846...) * FLOPs ** 0.5.
    """
    scale = np.exp(-2.4846510161625193)  # fitted proportionality constant
    root = FLOPs ** 0.5
    return scale * root


def approach1_isoflops(Nnv):
    """Predict the training-optimal vocabulary size by approach 1:
    build the relationship between the studied attributes and FLOPs.

    Args:
        Nnv: non-vocabulary parameter count.
    Returns:
        int: predicted optimal vocabulary size (vocabulary parameters Nv
        divided by the hidden size d).
    """
    d = Nnv_to_d(Nnv)
    # Reuse the shared fitted law instead of duplicating its constant here,
    # so the Nnv -> FLOPs relationship is defined in exactly one place.
    FLOPs = Nnvopt_to_flops(Nnv)
    # Fitted power law for the optimal vocabulary parameters Nv vs. FLOPs.
    Nv = np.exp(-1.589031299255507) * FLOPs ** 0.4163622634135234
    return int(Nv / d)

def approach2_derivative(Nnv):
    """Predict the training-optimal vocabulary size by approach 2:
    derivative-based fast estimation.

    Scales the best vocabulary parameters found at a 33M-parameter
    reference model by a fitted power of Nnv, then converts parameters
    to a vocabulary size via the hidden dimension d.
    """
    ref_nnv = 33_000_000            # reference non-vocabulary parameter count
    ref_vocab_para = 3145728        # best vocabulary parameters at the reference
    alpha = 0.8353974035228025      # fitted scaling exponent
    d = Nnv_to_d(Nnv)
    vocab_para = ref_vocab_para * (Nnv / ref_nnv) ** alpha
    return int(vocab_para / d)

def approach3_isoloss(Nnv, FLOPs=None):
    '''Predict the training-optimal vocabulary size by approach 3:
    parametric fit of the loss function.

    Unlike approaches 1 & 2, which assume the training data and the
    non-vocabulary parameters are EQUALLY scaled to ensure the optimal
    compute allocation, this approach is more flexible: it can also be
    used when the training data is NOT equally scaled with the
    non-vocabulary parameters (e.g. the data is insufficient or overly
    sufficient).  Assign an explicit FLOPs budget to adjust the amount
    of available training data.

    Args:
        Nnv: non-vocabulary parameter count.
        FLOPs: optional compute budget; defaults to the training-optimal
            budget implied by Nnv.
    Returns:
        int: predicted optimal vocabulary size.
    '''
    def dl_dv(V, Nnv, d, F):
        # Derivative of the fitted loss w.r.t. the vocabulary size V.
        term1 = 0  # derivative of -E (constant term)
        term2 = 0  # derivative of A1/[Nnv]^alpha1 (independent of V)
        term3 = -alpha2 * A2 * d / (V * d) ** (alpha2 + 1)
        u = F / (6 * (Nnv + V * d))  # training tokens per parameter
        du_dV = F * d / (6 * (Nnv + V * d) ** 2)
        term4 = beta * B * du_dV / (u ** (beta + 1))
        return term1 + term2 + term3 + term4

    # Fitted coefficients of the parametric loss function.
    A1, A2, B, E = 1.8313851559554126, 0.19584238398665638, 2.1241123120064955, 5.5327846803337435,
    alpha1, alpha2, beta = 0.44660634152009615, 0.6707374679896795, 0.44660634152009615

    d = Nnv_to_d(Nnv)
    if FLOPs is None:
        FLOPs = Nnvopt_to_flops(Nnv)
    # Normalization keeps fsolve well-conditioned: Nnv in millions,
    # d in thousands, FLOPs in units of 1e15.
    Nnv = Nnv / 1_000_000
    d = d / 1_000
    FLOPs = FLOPs / (1_000_000_000 * 1_000_000)
    V = fsolve(dl_dv, 1, args=(Nnv, d, FLOPs))[0]
    # The solved V is in thousands of entries (because d was scaled by
    # 1e-3); de-normalize to the absolute vocabulary size.  The original
    # also re-scaled the local Nnv/d/FLOPs here, but those values were
    # never read again — dead code removed.
    return int(V * 1000)


if __name__ == '__main__':
    '''
    By using the coefficient fitted in the proposed 3 approaches, this code
    provide an example about how to predict the optimal vocabulary 
    parameters (Nv) and vocabulary size, given the non-vocabulary parameters (Nnv).
    '''
    # Nnv = 7*10**9
    # Nvopt_app1 = approach1_isoflops(Nnv)
    # Nvopt_app2 = approach2_derivative(Nnv)
    # Nvopt_app3 = approach3_isoloss(Nnv)  
    # FLOPs = Nnvopt_to_flops(Nnv)
    # print(FLOPs)
    # d = Nnv_to_d(Nnv)
    # Vopt_app1, Vopt_app2, Vopt_app3 = int(Nvopt_app1/d), int(Nvopt_app2/d), int(Nvopt_app3/d)
    # print(f'Given Nnv={Nnv}: The predicted optimal vocabulary size is {Nvopt_app1}, {Nvopt_app2}, {Nvopt_app3} by the 3 proposed approaches.\
    # The predicted optimal vocabulary size is {Vopt_app1}, {Vopt_app2}, {Vopt_app3} by the 3 proposed approaches.')