"""Example python script to generate a YAML config file which can be used to run a training with nanotron. Refer to the "examples" section in `/README.md` for more information.

Usage:
```
python config_tiny_mistral.py
```
"""
import os
from dataclasses import dataclass
from typing import Optional

from nanotron.config import (
    CheckpointsArgs,
    Config,
    DataArgs,
    GeneralArgs,
    LoggingArgs,
    LRSchedulerArgs,
    ModelArgs,
    OptimizerArgs,
    ParallelismArgs,
    PretrainDatasetsArgs,
    RandomInit,
    TokenizerArgs,
    TokensArgs,
)
from nanotron.logging import human_format

@dataclass
class MiniCPMConfig:
    """Configuration for a MiniCPM model.

    Be careful to keep the typing coherent, as it is used to reconstruct the model from the YAML config.
    """

    attn_pdrop: float = 0.0
    bos_token_id: int = 1
    eos_token_id: int = 2
    pad_token_id: Optional[int] = None
    hidden_act: str = "silu"
    hidden_size: int = 2304
    initializer_range: float = 0.1
    intermediate_size: int = 5760
    max_position_embeddings: int = 2048
    num_attention_heads: int = 36
    num_hidden_layers: int = 40
    num_key_value_heads: int = 36
    pretraining_tp: int = 1
    rms_norm_eps: float = 1e-05
    rope_theta: float = 10000.0
    tie_word_embeddings: bool = True
    use_cache: bool = True
    vocab_size: int = 122753
    scale_emb: float = 12.0
    dim_model_base: int = 256
    scale_depth: float = 1.4

    def __post_init__(self):
        # Fall back to standard multi-head attention when no separate KV head count is given.
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads

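# Rough parameter-count helpers (weight matrices only; biases and norm parameters are
# ignored). Per layer they count a gated MLP (gate/up/down projections: 3 * hidden *
# intermediate), the query and output projections (2 * hidden * hidden), and the
# key/value projections reduced by the grouped-query ratio
# num_attention_heads / num_key_value_heads. get_num_params additionally counts the
# token embeddings and the LM head (vocab_size * hidden_size * 2).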
def get_num_params(model_config: MiniCPMConfig) -> int:
    num_params = model_config.vocab_size * model_config.hidden_size * 2 + \
        model_config.num_hidden_layers * (
            3 * model_config.hidden_size * model_config.intermediate_size
            + 2 * model_config.hidden_size * model_config.hidden_size
            + 2 * model_config.hidden_size * (model_config.hidden_size / (model_config.num_attention_heads / model_config.num_key_value_heads))
        )
    return int(num_params)


def get_num_params_no_embed(model_config: MiniCPMConfig) -> int:
    num_params = model_config.num_hidden_layers * (
        3 * model_config.hidden_size * model_config.intermediate_size
        + 2 * model_config.hidden_size * model_config.hidden_size
        + 2 * model_config.hidden_size * (model_config.hidden_size / (model_config.num_attention_heads / model_config.num_key_value_heads))
    )
    return int(num_params)

MODEL_CONFIG = MiniCPMConfig()

num_params = human_format(get_num_params(MODEL_CONFIG)).replace(".", "p")
num_params_no_embed = human_format(get_num_params_no_embed(MODEL_CONFIG)).replace(".", "p")

print(f"Model has {num_params} parameters or {num_params_no_embed} without embeddings")

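# Process layout: nanotron launches dp * pp * tp processes in total (1 x 1 x 1 here,
# i.e. a single GPU). Increase these values to shard training across data-, pipeline-
# and tensor-parallel ranks.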
PARALLELISM = ParallelismArgs(
    dp=1,
    pp=1,
    tp=1,
    pp_engine="1f1b",
    tp_mode="REDUCE_SCATTER",
    tp_linear_async_communication=True,
    recompute_granularity="selective",
)

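# Only the model-related sections are filled in below; checkpoints, optimizer, logging,
# tokens and data are left as None here and would need to be filled in before the
# generated YAML can drive an actual training run.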
CONFIG = Config(
    general=GeneralArgs(project="openbmb", run="MiniCPM-2B-dpo-bf16", seed=42, step=0),
    checkpoints=None,
    parallelism=PARALLELISM,
    model=ModelArgs(init_method=RandomInit(std=0.025), model_config=MODEL_CONFIG),
    tokenizer=TokenizerArgs("openbmb/MiniCPM-2B-dpo-bf16"),
    optimizer=None,
    logging=None,
    tokens=None,
    data=None,
    profiler=None,
    lighteval=None,
)

if __name__ == "__main__":
    # Save the config next to this script, reusing its name with a .yaml extension.
    file_path = os.path.abspath(__file__)
    file_path = file_path.replace(".py", ".yaml")
    CONFIG.save_as_yaml(file_path)
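    # The generated YAML can then be passed to nanotron's training entrypoint, e.g.
    # `run_train.py --config-file <this_file>.yaml`; see the nanotron README for the
    # exact launch command (torchrun arguments, number of processes, etc.).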