# minicpm-nanotron / config_minicpm.py
""" Example python script to generate a YAML config file which can be used to run a training with nanotron. Refer to "examples" section in the `/README.md` for more information.
Usage:
```
python config_minicpm.py
```
"""
import os
from dataclasses import dataclass
from typing import Optional
from nanotron.config import (
    CheckpointsArgs,
    Config,
    DataArgs,
    GeneralArgs,
    LoggingArgs,
    LRSchedulerArgs,
    ModelArgs,
    OptimizerArgs,
    ParallelismArgs,
    PretrainDatasetsArgs,
    RandomInit,
    TokenizerArgs,
    TokensArgs,
)
from nanotron.logging import human_format
@dataclass
class MiniCPMConfig:
"""Configuration for a MiniCPM model.
Be careful on having a coherent typing as we use it to reconstruct the model from yaml
"""
attn_pdrop: float = 0.0
bos_token_id: int =1
eos_token_id: int =2
pad_token_id: Optional[int] = None
hidden_act: str ="silu"
hidden_size: int =2304
initializer_range: float =0.1
intermediate_size: int =5760
max_position_embeddings: int =2048
num_attention_heads: int =36
num_hidden_layers: int =40
num_key_value_heads: int =36
pretraining_tp: int=1
rms_norm_eps: float=1e-05
rope_theta: float = 10000.0
tie_word_embeddings: bool =True
use_cache: bool =True
vocab_size: int = 122753
scale_emb: float = 12
dim_model_base: int= 256
scale_depth: float = 1.4
def __post_init__(self):
# for backward compatibility
if self.num_key_value_heads is None:
self.num_key_value_heads = self.num_attention_heads
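# The scale_emb, dim_model_base and scale_depth fields above are MiniCPM's extra scaling
# hyper-parameters. The helper below is only an illustrative sketch of how such factors are
# typically consumed by the modeling code (it is not used by nanotron, and the exact usage may
# differ in the actual implementation):
def _example_scaling_factors(model_config: MiniCPMConfig) -> dict:
    import math

    return {
        # Token embeddings are multiplied by scale_emb.
        "embedding_scale": model_config.scale_emb,
        # Each residual-branch output is scaled down with depth: scale_depth / sqrt(num_hidden_layers).
        "residual_scale": model_config.scale_depth / math.sqrt(model_config.num_hidden_layers),
        # Hidden states are divided by hidden_size / dim_model_base before the LM head.
        "logit_scale": 1.0 / (model_config.hidden_size / model_config.dim_model_base),
    }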
def get_num_params(model_config: MiniCPMConfig) -> int:
    """Total parameter count: input and output embeddings (counted separately here) plus the transformer layers."""
    head_dim = model_config.hidden_size // model_config.num_attention_heads
    num_params = model_config.vocab_size * model_config.hidden_size * 2 + model_config.num_hidden_layers * (
        # MLP: gate, up and down projections.
        3 * model_config.hidden_size * model_config.intermediate_size
        # Attention: query and output projections.
        + 2 * model_config.hidden_size * model_config.hidden_size
        # Attention: key and value projections (possibly grouped-query).
        + 2 * model_config.hidden_size * model_config.num_key_value_heads * head_dim
    )
    return num_params
def get_num_params_no_embed(model_config: MiniCPMConfig) -> int:
    """Parameter count of the transformer layers only, excluding the embedding matrices."""
    head_dim = model_config.hidden_size // model_config.num_attention_heads
    num_params = model_config.num_hidden_layers * (
        # MLP: gate, up and down projections.
        3 * model_config.hidden_size * model_config.intermediate_size
        # Attention: query and output projections.
        + 2 * model_config.hidden_size * model_config.hidden_size
        # Attention: key and value projections (possibly grouped-query).
        + 2 * model_config.hidden_size * model_config.num_key_value_heads * head_dim
    )
    return num_params
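# Sanity check of the formulas above with the default MiniCPMConfig
# (hidden_size=2304, intermediate_size=5760, 40 layers, 36 attention/KV heads, vocab_size=122753):
#   per layer:   3*2304*5760 + 2*2304*2304 + 2*2304*2304 = 61,046,784
#   40 layers:   2,441,871,360  (~2.44B, the count without embeddings)
#   embeddings:  2 * 122753 * 2304 = 565,645,824
#   total:       3,007,517,184  (~3.01B, as reported by the print below)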
MODEL_CONFIG = MiniCPMConfig()
num_params = human_format(get_num_params(MODEL_CONFIG)).replace(".", "p")
num_params_no_embed = human_format(get_num_params_no_embed(MODEL_CONFIG)).replace(".", "p")
print(f"Model has {num_params} parameters or {num_params_no_embed} without embeddings")
PARALLELISM = ParallelismArgs(
    dp=1,
    pp=1,
    tp=1,
    pp_engine="1f1b",
    tp_mode="REDUCE_SCATTER",
    tp_linear_async_communication=True,
    recompute_granularity="selective",
)
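# This script only needs a single process to dump the YAML, hence dp=pp=tp=1 above.
# For an actual multi-GPU run, the product dp * pp * tp has to match the number of processes
# launched with torchrun. A hypothetical 8-GPU layout (an illustration, not part of the original file):
PARALLELISM_8GPU_EXAMPLE = ParallelismArgs(
    dp=2,
    pp=2,
    tp=2,
    pp_engine="1f1b",
    tp_mode="REDUCE_SCATTER",
    tp_linear_async_communication=True,
    recompute_granularity="selective",
)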
CONFIG = Config(
    general=GeneralArgs(project="openbmb", run="MiniCPM-2B-dpo-bf16", seed=42, step=0),
    checkpoints=None,
    parallelism=PARALLELISM,
    model=ModelArgs(init_method=RandomInit(std=0.025), model_config=MODEL_CONFIG),
    tokenizer=TokenizerArgs("openbmb/MiniCPM-2B-dpo-bf16"),
    optimizer=None,
    logging=None,
    tokens=None,
    data=None,
    profiler=None,
    lighteval=None,
)
if __name__ == "__main__":
file_path = os.path.abspath(__file__)
file_path = file_path.replace(".py", ".yaml")
# Save config as YAML file
CONFIG.save_as_yaml(file_path)
# You can now train a model with this config using `/run_train.py`
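    # A typical launch sequence with nanotron would then look like the following sketch
    # (based on the pattern in the nanotron README; the GPU count is an assumption, and the
    # optimizer/tokens/data sections left as None above would need to be filled in first):
    #
    #   python config_minicpm.py
    #   CUDA_DEVICE_MAX_CONNECTIONS=1 torchrun --nproc_per_node=8 run_train.py --config-file config_minicpm.yaml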