{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.5070993914807302,
  "eval_steps": 500,
  "global_step": 500,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02535496957403651,
      "grad_norm": 8.872879028320312,
      "learning_rate": 4.166666666666667e-05,
      "loss": 2.5906,
      "step": 25
    },
    {
      "epoch": 0.05070993914807302,
      "grad_norm": 10.428619384765625,
      "learning_rate": 4.994602438146344e-05,
      "loss": 2.704,
      "step": 50
    },
    {
      "epoch": 0.07606490872210954,
      "grad_norm": 8.182428359985352,
      "learning_rate": 4.972714782003472e-05,
      "loss": 2.8506,
      "step": 75
    },
    {
      "epoch": 0.10141987829614604,
      "grad_norm": 6.291181564331055,
      "learning_rate": 4.934147215158731e-05,
      "loss": 3.0837,
      "step": 100
    },
    {
      "epoch": 0.12677484787018256,
      "grad_norm": 5.771434307098389,
      "learning_rate": 4.87915989845867e-05,
      "loss": 2.7152,
      "step": 125
    },
    {
      "epoch": 0.15212981744421908,
      "grad_norm": 5.961496353149414,
      "learning_rate": 4.8081237535878116e-05,
      "loss": 2.884,
      "step": 150
    },
    {
      "epoch": 0.17748478701825557,
      "grad_norm": 6.111754894256592,
      "learning_rate": 4.7215179609844665e-05,
      "loss": 2.8407,
      "step": 175
    },
    {
      "epoch": 0.2028397565922921,
      "grad_norm": 6.186964988708496,
      "learning_rate": 4.6199267274877736e-05,
      "loss": 2.7641,
      "step": 200
    },
    {
      "epoch": 0.2281947261663286,
      "grad_norm": 6.564132213592529,
      "learning_rate": 4.504035345520115e-05,
      "loss": 2.6484,
      "step": 225
    },
    {
      "epoch": 0.2535496957403651,
      "grad_norm": 2243.718994140625,
      "learning_rate": 4.374625570388008e-05,
      "loss": 2.7105,
      "step": 250
    },
    {
      "epoch": 0.2789046653144016,
      "grad_norm": 11.651649475097656,
      "learning_rate": 4.2325703468843025e-05,
      "loss": 2.5379,
      "step": 275
    },
    {
      "epoch": 0.30425963488843816,
      "grad_norm": 6.585482120513916,
      "learning_rate": 4.078827920763835e-05,
      "loss": 2.5446,
      "step": 300
    },
    {
      "epoch": 0.32961460446247465,
      "grad_norm": 5.859741687774658,
      "learning_rate": 3.914435374814092e-05,
      "loss": 2.5159,
      "step": 325
    },
    {
      "epoch": 0.35496957403651114,
      "grad_norm": 5.331320762634277,
      "learning_rate": 3.740501633123872e-05,
      "loss": 2.567,
      "step": 350
    },
    {
      "epoch": 0.3803245436105477,
      "grad_norm": 5.613489151000977,
      "learning_rate": 3.558199980740263e-05,
      "loss": 2.5492,
      "step": 375
    },
    {
      "epoch": 0.4056795131845842,
      "grad_norm": 5.667895317077637,
      "learning_rate": 3.368760149173219e-05,
      "loss": 2.3591,
      "step": 400
    },
    {
      "epoch": 0.43103448275862066,
      "grad_norm": 5.731890678405762,
      "learning_rate": 3.1734600211356654e-05,
      "loss": 2.4657,
      "step": 425
    },
    {
      "epoch": 0.4563894523326572,
      "grad_norm": 4.859386920928955,
      "learning_rate": 2.9736170104755075e-05,
      "loss": 2.4864,
      "step": 450
    },
    {
      "epoch": 0.4817444219066937,
      "grad_norm": 5.270420551300049,
      "learning_rate": 2.7705791754469607e-05,
      "loss": 2.2326,
      "step": 475
    },
    {
      "epoch": 0.5070993914807302,
      "grad_norm": 5.213580131530762,
      "learning_rate": 2.5657161252674044e-05,
      "loss": 2.0531,
      "step": 500
    }
  ],
  "logging_steps": 25,
  "max_steps": 986,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "total_flos": 4.6110257184768e+16,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}