File size: 4,817 Bytes
54b95ae d647b4a f520594 54b95ae f520594 6892511 54b95ae 590516a 54b95ae f520594 54b95ae 0be7100 7a40fb6 54b95ae 590516a 54b95ae a7aafec 54b95ae 590516a 54b95ae 590516a 54b95ae |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 |
# The name of the model to pretrain. Choose from names in ``litgpt.config``. Mutually exclusive with
# ``model_config``. (type: Optional[str], default: null)
# NOTE(review): ``model_config`` is also populated below although the two options are described as
# mutually exclusive — confirm the installed litgpt version accepts both and which one takes effect.
model_name: "tiny-llama-1.1b"

# A ``litgpt.Config`` object to define the model architecture. Mutually exclusive with
# ``model_name``. (type: Optional[Config], default: null)
model_config:
  padded_vocab_size: 32768
  vocab_size: 32768
  block_size: 32768
  n_layer: 20
  n_head: 32
  # Left unset; presumably derived by litgpt from n_embd / n_head (512 / 32 = 16) — TODO confirm.
  head_size: null
  n_embd: 512
  n_query_groups: 4
  rotary_percentage: 1.0
  parallel_residual: false
  bias: false
  norm_class_name: "RMSNorm"
  norm_eps: 1.0e-05
  mlp_class_name: "LLaMAMLP"
  intermediate_size: 2048
  rope_base: 500000
# Where checkpoints and logs are written. Inside a Lightning Studio Job the directory is found under
# /teamspace/jobs/<job-name>/share. (type: <class 'Path'>, default: out/pretrain)
out_dir: "../out/pretrain/"

# Pretraining precision; one of "bf16-true", "bf16-mixed", "32-true".
# (type: Optional[str], default: null)
# precision: bf16-mixed
precision: bf16-true

# Checkpoint directory used to initialize the model, e.g. for continued pretraining.
# Cannot be combined with ``resume``. (type: Optional[Path], default: null)
initial_checkpoint_dir: null

# Resume behaviour after an interrupted run:
#   - a path: resume from that checkpoint directory
#   - ``True``: resume from the latest checkpoint in ``out_dir``; errors if none is found
#   - ``'auto'``: resume from the latest checkpoint, but do not error if none exists
# (type: Union[bool, Literal["auto"], Path], default: False)
# resume: false
resume: "auto"
# Data-related arguments. If not provided, the default is ``litgpt.data.TinyLlama``.
data:
  # Data module class to instantiate; ``init_args`` below are passed to it.
  class_path: LitData

  init_args:
    data_path: "../data/"
    num_workers: 16
    # num_workers: 3
# Training-related arguments. See ``litgpt.args.TrainArgs`` for details
train:
  # Number of optimizer steps between saving checkpoints (type: Optional[int], default: 1000)
  save_interval: 1000

  # Number of iterations between logging calls (type: int, default: 1)
  log_interval: 1

  # Number of samples between optimizer steps across data-parallel ranks (type: int, default: 512)
  global_batch_size: 512

  # Number of samples per data-parallel rank (type: int, default: 4)
  # NOTE(review): 512 is not a multiple of 12, so the effective batch per optimizer step may differ
  # from ``global_batch_size`` — confirm how litgpt rounds gradient accumulation.
  # micro_batch_size: 16
  micro_batch_size: 12

  # Number of iterations with learning rate warmup active (type: int, default: 2000)
  lr_warmup_steps: 2000

  # Number of epochs to train on (type: Optional[int], default: null)
  epochs: null

  # Total number of tokens to train on (type: Optional[int], default: 3000000000000)
  # max_tokens: 3000000000000
  max_tokens: 9782206713  # 1591379 * 2049 * 3

  # Limits the number of optimizer steps to run. (type: Optional[int], default: null)
  max_steps: null

  # Limits the length of samples. Off by default (type: Optional[int], default: null)
  max_seq_length: 2048

  # Whether to tie the embedding weights with the language modeling head weights.
  # (type: Optional[bool], default: False)
  tie_embeddings: null

  # (type: Optional[float], default: 1.0)
  max_norm: 1.0

  # (type: float, default: 4e-05)
  # NOTE(review): this equals the optimizer ``lr`` (1.0e-3), so the learning rate presumably stays
  # flat after warmup instead of decaying — confirm this is intended.
  # min_lr: 4.0e-05
  min_lr: 1.0e-3
# Evaluation-related arguments. See ``litgpt.args.EvalArgs`` for details
eval:
  # Number of optimizer steps between evaluation calls (type: int, default: 1000)
  interval: 100

  # Number of tokens to generate (type: Optional[int], default: null)
  max_new_tokens: null

  # Number of iterations (type: int, default: 100)
  max_iters: 100

  # Whether to evaluate on the validation set at the beginning of the training
  initial_validation: false

  # Whether to evaluate on the validation set at the end of the training
  final_validation: false
# Optimizer-related arguments
optimizer:
  # Optimizer class to instantiate; the commented lines are alternatives that were tried or considered.
  # class_path: torch.optim.AdamW
  class_path: grokadamw.GrokAdamW
  # class_path: bitsandbytes.optim.AdamW8bit
  # class_path: bitsandbytes.optim.PagedAdamW8bit

  init_args:
    # (type: float, default: 0.001)
    # lr: 5e-5
    lr: 1.0e-3

    # (type: float, default: 0.01)
    # weight_decay: 0.1
    weight_decay: 0.01

    # (type: tuple, default: (0.9,0.999))
    betas:
      - 0.9
      - 0.95
# Number of devices/GPUs to train on; by default all GPUs are used.
# (type: Union[int, str], default: auto)
devices: auto

# Number of nodes to train on. (type: int, default: 1)
num_nodes: 1

# Optional path to the tokenizer directory that was used when preprocessing the dataset.
# Only some data modules require this. (type: Optional[Path], default: null)
tokenizer_dir: "../"

# Logger that receives the metrics.
# (type: Literal['wandb', 'tensorboard', 'csv'], default: tensorboard)
logger_name: "wandb"

# Random seed, for reproducibility. (type: int, default: 42)
seed: 42
|