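# litgpt pretraining config. A typical invocation (assuming the litgpt CLI is installed):
#   litgpt pretrain --config <path/to/this/file>.yaml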
# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
# ``model_config``. (type: Optional[str], default: null)
model_name: "Llama-3.2-1B"
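# Note: ``model_name`` and ``model_config`` are documented as mutually exclusive; since both are
# set here, presumably one of the two is meant to be commented out.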
# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
# ``model_name``. (type: Optional[Config], default: null)
model_config:
  padded_vocab_size: 38400
  vocab_size: 38400
  block_size: 8192
  n_layer: 5
  n_head: 32
  head_size: null
  n_embd: 1024
  n_query_groups: 8
  rotary_percentage: 1.0
  parallel_residual: false
  bias: false
  norm_class_name: "RMSNorm"
  norm_eps: 1e-05
  mlp_class_name: "LLaMAMLP"
  intermediate_size: 3584
  rope_base: 500000
  rope_adjustments:
    factor: 32.0
    low_freq_factor: 1.0
    high_freq_factor: 4.0
    original_max_seq_len: 8192
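# The block above describes a small Llama-style model: 5 transformer layers, 1024-dim embeddings,
# 32 attention heads sharing 8 KV groups (grouped-query attention), RMSNorm, a LLaMAMLP
# feed-forward of width 3584, and Llama-3-style RoPE scaling over an 8192-token context.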
# Directory in which to save checkpoints and logs. If running in a Lightning Studio Job, look for it in
# /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
out_dir: "../out/pretrain/"
# The precision to use for pretraining. Possible choices: "bf16-true", "bf16-mixed", "32-true". (type: Optional[str], default: null)
# precision: bf16-mixed
precision: bf16-true
# Optional path to a checkpoint directory to initialize the model from.
# Useful for continued pretraining. Mutually exclusive with ``resume``. (type: Optional[Path], default: null)
initial_checkpoint_dir:
# Path to a checkpoint directory to resume from in case training was interrupted, or ``True`` to resume
# from the latest checkpoint in ``out_dir``. An error will be raised if no checkpoint is found. Passing
# ``'auto'`` will resume from the latest checkpoint but not error if no checkpoint exists.
# (type: Union[bool, Literal["auto"], Path], default: False)
# resume: false
resume: "auto"
# Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
data:
  class_path: LitData
  init_args:
    data_path: "../pretrain-data/"
    num_workers: 16
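    # ``data_path`` is assumed to point at a directory of pre-tokenized chunks prepared with
    # litdata's optimize()/TokensLoader pipeline, which the LitData module streams during training.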
# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
train:
  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
  save_interval: 500
  # Number of iterations between logging calls (type: int, default: 1)
  log_interval: 1
  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
  global_batch_size: 512
  # Number of samples per data-parallel rank (type: int, default: 4)
  micro_batch_size: 16
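  # With these values, each rank accumulates global_batch_size / (micro_batch_size * devices * num_nodes)
  # micro-batches per optimizer step, e.g. 512 / (16 * 4 * 1) = 8 on a single 4-GPU node
  # (assuming litgpt's usual gradient-accumulation arithmetic).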
  # Number of iterations with learning rate warmup active (type: int, default: 2000)
  lr_warmup_steps: 2000
  # Number of epochs to train on (type: Optional[int], default: null)
  epochs:
  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
  # max_tokens: 3000000000000
  max_tokens: ??? # ? * 2049 * 5
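  # The ??? placeholder presumably expands to <number of training sequences> * 2049 tokens per
  # sequence (2048 tokens plus one shifted label token) * 5 passes over the data, per the note above.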
  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
  max_steps:
  # Limits the length of samples. Off by default (type: Optional[int], default: null)
  max_seq_length:
  # Whether to tie the embedding weights with the language modeling head weights. (type: Optional[bool], default: False)
  tie_embeddings:
  # Maximum gradient norm used for gradient clipping (type: Optional[float], default: 1.0)
  max_norm: 1.0
  # Minimum learning rate reached by the learning-rate decay schedule (type: float, default: 4e-05)
  min_lr: 1e-4
# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
eval:
  # Number of optimizer steps between evaluation calls (type: int, default: 1000)
  interval: 100
  # Number of tokens to generate (type: Optional[int], default: null)
  max_new_tokens:
  # Number of iterations (type: int, default: 100)
  max_iters: 100
  # Whether to evaluate on the validation set at the beginning of the training
  initial_validation: false
  # Whether to evaluate on the validation set at the end of the training
  final_validation: true
# Optimizer-related arguments
optimizer:
  # class_path: torch.optim.AdamW
  class_path: grokadamw.GrokAdamW
  # class_path: bitsandbytes.optim.AdamW8bit
  # class_path: bitsandbytes.optim.PagedAdamW8bit
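  # Note: GrokAdamW is a third-party optimizer (assumed to come from the ``grokadamw`` PyPI
  # package); any optimizer importable via ``class_path``, including the commented-out AdamW
  # variants above, should also work here.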
  init_args:
    # (type: float, default: 0.001)
    lr: 1e-3
    # (type: float, default: 0.01)
    weight_decay: 0.01
    # (type: tuple, default: (0.9,0.999))
    betas:
      - 0.9
      - 0.95
# How many devices/GPUs to use. Uses all GPUs by default. (type: Union[int, str], default: auto)
devices: auto
# How many nodes to use. (type: int, default: 1)
num_nodes: 1
# Optional path to the tokenizer dir that was used for preprocessing the dataset. Only some data
# modules require this. (type: Optional[Path], default: null)
tokenizer_dir: "../"
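# The tokenizer directory above is expected to contain the tokenizer files used during
# preprocessing (e.g. ``tokenizer.json``/``tokenizer_config.json`` or ``tokenizer.model``).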
# The name of the logger to send metrics to. (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
logger_name: "wandb"
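# Logging to wandb assumes the ``wandb`` package is installed and you are authenticated
# (``wandb login``).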
# The random seed to use for reproducibility. (type: int, default: 42)
seed: 42