Spaces:

sail
/

scaling-with-vocab-demo

Running

App Files Files Community

scaling-with-vocab-demo / utils.py

tcftrees

Add application file

e20929a 3 months ago

raw

history blame

4.54 kB

	import math
	import numpy as np
	from scipy.optimize import fsolve


	def Nnv_to_d(Nnv):
		'''Map a non-vocabulary parameter count to an embedding dimension.

		The table below pairs an inclusive upper bound on ``Nnv`` with a
		dimension. Bounds are checked in increasing order and the dimension of
		the first bound that ``Nnv`` does not exceed is used. Values above the
		last bound get 24576; no error is raised for out-of-range input.

		Returns the dimension as a ``float``.
		'''
		# (inclusive upper bound on Nnv, dimension), sorted by bound.
		bound_to_dim = (
			(50_000_000, 512),
			(200_000_000, 768),
			(500_000_000, 1024),
			(1_000_000_000, 1536),
			(2_000_000_000, 2048),
			(5_000_000_000, 3200),
			(10_000_000_000, 4096),
			(20_000_000_000, 5120),
			(50_000_000_000, 6048),
			(100_000_000_000, 8192),
			(200_000_000_000, 12288),
			(500_000_000_000, 16384),
			(1_000_000_000_000, 20480),
		)
		for upper_bound, dim in bound_to_dim:
			if Nnv <= upper_bound:
				return float(dim)
		# Larger than every tabulated bound: use the widest dimension.
		return float(24576)


	def Nnvopt_to_flops(Nnv):
		'''Return the training-optimal FLOPs budget for ``Nnv`` non-vocabulary
		parameters.

		Computes ``(Nnv / exp(k)) ** (1 / 0.5)`` with ``k = -2.4846510161625193``,
		i.e. the algebraic inverse of ``Nnv = exp(k) * FLOPs ** 0.5``.
		'''
		log_coefficient = -2.4846510161625193
		exponent = 0.5
		scaled_params = Nnv / np.exp(log_coefficient)
		return scaled_params ** (1 / exponent)


	def flops_to_Nnvopt(FLOPs):
		'''Return the training-optimal non-vocabulary parameter count for a
		given FLOPs budget.

		Evaluates the power law ``exp(k) * FLOPs ** 0.5`` with
		``k = -2.4846510161625193``.
		'''
		coefficient = np.exp(-2.4846510161625193)
		flops_power = FLOPs ** 0.5
		return coefficient * flops_power


	def approach1_isoflops(Nnv):
		'''Approach 1: relate the studied attributes to the FLOPs budget.

		Steps:
		  1. look up the embedding dimension ``d`` for ``Nnv``;
		  2. convert ``Nnv`` to a FLOPs budget via ``(Nnv / exp(k)) ** (1 / 0.5)``;
		  3. predict the vocabulary parameters ``Nv`` as a power law in FLOPs.

		Returns ``int(Nv / d)`` -- the predicted vocabulary parameters divided by
		the embedding dimension, truncated to an integer.
		'''
		embed_dim = Nnv_to_d(Nnv)
		flops_budget = (Nnv / np.exp(-2.4846510161625193)) ** (1 / 0.5)
		vocab_coefficient = np.exp(-1.589031299255507)
		vocab_params = vocab_coefficient * flops_budget ** 0.4163622634135234
		return int(vocab_params / embed_dim)

	def approach2_derivative(Nnv):
		'''Approach 2: derivative-based fast estimation.

		Scales a reference vocabulary-parameter count by a power of ``Nnv``:
		``Nv = 3145728 * (Nnv / 33_000_000) ** 0.8353974035228025``, so the
		prediction equals 3145728 exactly when ``Nnv`` is 33,000,000.

		Returns ``int(Nv / d)``, where ``d`` is the embedding dimension looked up
		for ``Nnv``.
		'''
		embed_dim = Nnv_to_d(Nnv)
		reference_vocab_params = 3145728
		reference_nnv = 33_000_000
		growth_exponent = 0.8353974035228025
		vocab_params = reference_vocab_params * (Nnv / reference_nnv) ** growth_exponent
		return int(vocab_params / embed_dim)

	def approach3_isoloss(Nnv, FLOPs=None):
		'''Approach 3: parametric fit of the loss function.

		Approaches 1 and 2 assume that the training data and the non-vocabulary
		parameters are scaled EQUALLY to ensure the optimal compute allocation.
		Approach 3 is more flexible: it can also be used when the training data
		is not scaled equally with the non-vocabulary parameters, for example
		when the data is insufficient or overly sufficient. Pass a FLOPs budget
		to adjust the amount of available training data.

		Args:
			Nnv: number of non-vocabulary parameters.
			FLOPs: compute budget. When ``None``, the budget returned by
				``Nnvopt_to_flops(Nnv)`` is used.

		Returns:
			``int(V * 1000)``, where ``V`` is the root of ``dl_dv`` found by
			``fsolve`` in normalized units, starting from ``V = 1``.
		'''
		# Coefficients of the loss terms that depend on V. The terms -E and
		# A1 / Nnv**alpha1 (A1 = 1.8313851559554126,
		# alpha1 = 0.44660634152009615, E = 5.5327846803337435) do not depend
		# on V, so their derivatives are zero and they are not needed here.
		A2, B = 0.19584238398665638, 2.1241123120064955
		alpha2, beta = 0.6707374679896795, 0.44660634152009615

		def dl_dv(V, nnv, dim, F):
			'''Derivative of the fitted loss with respect to V.'''
			# d/dV of A2 / (V * dim) ** alpha2
			vocab_term = -alpha2 * A2 * dim / (V * dim) ** (alpha2 + 1)
			# d/dV of B / u ** beta with u = F / (6 * (nnv + V * dim)).
			# du_dV holds the magnitude of du/dV (the true derivative is
			# negative), which makes this contribution positive.
			u = F / (6 * (nnv + V * dim))
			du_dV = F * dim / (6 * (nnv + V * dim) ** 2)
			data_term = beta * B * du_dV / (u ** (beta + 1))
			return vocab_term + data_term

		d = Nnv_to_d(Nnv)
		if FLOPs is None:
			FLOPs = Nnvopt_to_flops(Nnv)

		# Normalization: Nnv in units of 1e6, d in units of 1e3, FLOPs in units
		# of 1e15. With these units V * d is on the same scale as Nnv, and the
		# solved V is in units of 1e3 (hence the * 1000 on return).
		nnv_scaled = Nnv / 1_000_000
		d_scaled = d / 1_000
		flops_scaled = FLOPs / (1_000_000_000 * 1_000_000)

		V = fsolve(dl_dv, 1, args=(nnv_scaled, d_scaled, flops_scaled))[0]
		return int(V * 1000)


	if __name__ == '__main__':
	'''
	By using the coefficient fitted in the proposed 3 approaches, this code
	provide an example about how to predict the optimal vocabulary
	parameters (Nv) and vocabulary size, given the non-vocabulary parameters (Nnv).
	'''
	# NOTE(review): the example below is commented out, so running this file as a
	# script currently executes nothing besides the string expression above.
	# NOTE(review): `710*9` evaluates to 6390; possibly `7*10**9` was intended -- confirm
	# before re-enabling.
	# NOTE(review): approach1_isoflops and approach2_derivative already divide by d
	# before returning, and approach3_isoloss returns int(V*1000); the example divides
	# by d a second time, so its "Vopt_*" values need re-checking before re-enabling.
	# Nnv = 710*9
	# Nvopt_app1 = approach1_isoflops(Nnv)
	# Nvopt_app2 = approach2_derivative(Nnv)
	# Nvopt_app3 = approach3_isoloss(Nnv)
	# FLOPs = Nnvopt_to_flops(Nnv)
	# print(FLOPs)
	# d = Nnv_to_d(Nnv)
	# Vopt_app1, Vopt_app2, Vopt_app3 = int(Nvopt_app1/d), int(Nvopt_app2/d), int(Nvopt_app3/d)
	# print(f'Given Nnv={Nnv}: The predicted optimal vocabulary size is {Nvopt_app1}, {Nvopt_app2}, {Nvopt_app3} by the 3 proposed approaches.\
	# The predicted optimal vocabulary size is {Vopt_app1}, {Vopt_app2}, {Vopt_app3} by the 3 proposed approaches.')