"""Example python script to generate a YAML config file which can be used to run a training with nanotron. Refer to the "examples" section in `/README.md` for more information.

Usage:
```
python config_tiny_mistral.py
```
"""
import os
from dataclasses import dataclass
from typing import Optional

from nanotron.config import (
    CheckpointsArgs,
    Config,
    DataArgs,
    GeneralArgs,
    LoggingArgs,
    LRSchedulerArgs,
    ModelArgs,
    OptimizerArgs,
    ParallelismArgs,
    PretrainDatasetsArgs,
    RandomInit,
    TokenizerArgs,
    TokensArgs,
)
from nanotron.logging import human_format

@dataclass
class MiniCPMConfig:
    """Configuration for a MiniCPM model.

    Be careful to keep the typing coherent, as it is used to reconstruct the model from the YAML config.
    """

    attn_pdrop: float = 0.0
    bos_token_id: int = 1
    eos_token_id: int = 2
    pad_token_id: Optional[int] = None
    hidden_act: str = "silu"
    hidden_size: int = 2304
    initializer_range: float = 0.1
    intermediate_size: int = 5760
    max_position_embeddings: int = 2048
    num_attention_heads: int = 36
    num_hidden_layers: int = 40
    num_key_value_heads: int = 36
    pretraining_tp: int = 1
    rms_norm_eps: float = 1e-05
    rope_theta: float = 10000.0
    tie_word_embeddings: bool = True
    use_cache: bool = True
    vocab_size: int = 122753
    scale_emb: float = 12.0
    dim_model_base: int = 256
    scale_depth: float = 1.4

    def __post_init__(self):
        # Fall back to standard multi-head attention when no separate KV head count is given.
        if self.num_key_value_heads is None:
            self.num_key_value_heads = self.num_attention_heads

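# Rough parameter-count helpers (weight matrices only; biases and norm parameters are
# ignored). Per layer they count a gated MLP (gate/up/down projections: 3 * hidden *
# intermediate), the query and output projections (2 * hidden * hidden), and the
# key/value projections reduced by the grouped-query ratio
# num_attention_heads / num_key_value_heads. get_num_params additionally counts the
# token embeddings and the LM head (vocab_size * hidden_size * 2).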
def get_num_params(model_config: MiniCPMConfig) -> int:
    num_params = model_config.vocab_size * model_config.hidden_size * 2 + \
        model_config.num_hidden_layers * (
            3 * model_config.hidden_size * model_config.intermediate_size
            + 2 * model_config.hidden_size * model_config.hidden_size
            + 2 * model_config.hidden_size * (model_config.hidden_size / (model_config.num_attention_heads / model_config.num_key_value_heads))
        )
    return int(num_params)


def get_num_params_no_embed(model_config: MiniCPMConfig) -> int:
    num_params = model_config.num_hidden_layers * (
        3 * model_config.hidden_size * model_config.intermediate_size
        + 2 * model_config.hidden_size * model_config.hidden_size
        + 2 * model_config.hidden_size * (model_config.hidden_size / (model_config.num_attention_heads / model_config.num_key_value_heads))
    )
    return int(num_params)

MODEL_CONFIG = MiniCPMConfig()

num_params = human_format(get_num_params(MODEL_CONFIG)).replace(".", "p")
num_params_no_embed = human_format(get_num_params_no_embed(MODEL_CONFIG)).replace(".", "p")

print(f"Model has {num_params} parameters or {num_params_no_embed} without embeddings")

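# Process layout: nanotron launches dp * pp * tp processes in total (1 x 1 x 1 here,
# i.e. a single GPU). Increase these values to shard training across data-, pipeline-
# and tensor-parallel ranks.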
PARALLELISM = ParallelismArgs(
    dp=1,
    pp=1,
    tp=1,
    pp_engine="1f1b",
    tp_mode="REDUCE_SCATTER",
    tp_linear_async_communication=True,
    recompute_granularity="selective",
)

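# Only the model-related sections are filled in below; checkpoints, optimizer, logging,
# tokens and data are left as None here and would need to be filled in before the
# generated YAML can drive an actual training run.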
CONFIG = Config(
    general=GeneralArgs(project="openbmb", run="MiniCPM-2B-dpo-bf16", seed=42, step=0),
    checkpoints=None,
    parallelism=PARALLELISM,
    model=ModelArgs(init_method=RandomInit(std=0.025), model_config=MODEL_CONFIG),
    tokenizer=TokenizerArgs("openbmb/MiniCPM-2B-dpo-bf16"),
    optimizer=None,
    logging=None,
    tokens=None,
    data=None,
    profiler=None,
    lighteval=None,
)

if __name__ == "__main__":
    # Save the config next to this script, reusing its name with a .yaml extension.
    file_path = os.path.abspath(__file__)
    file_path = file_path.replace(".py", ".yaml")
    CONFIG.save_as_yaml(file_path)
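    # The generated YAML can then be passed to nanotron's training entrypoint, e.g.
    # `run_train.py --config-file <this_file>.yaml`; see the nanotron README for the
    # exact launch command (torchrun arguments, number of processes, etc.).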