|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9977827050997783, |
|
"eval_steps": 500, |
|
"global_step": 225, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004434589800443459, |
|
"grad_norm": 1.91265869140625, |
|
"learning_rate": 4.347826086956522e-05, |
|
"loss": 2.8127, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.022172949002217297, |
|
"grad_norm": 1.5314122438430786, |
|
"learning_rate": 0.0002173913043478261, |
|
"loss": 2.7241, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04434589800443459, |
|
"grad_norm": 0.6431057453155518, |
|
"learning_rate": 0.0004347826086956522, |
|
"loss": 2.2423, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.06651884700665188, |
|
"grad_norm": 0.5257381200790405, |
|
"learning_rate": 0.0006521739130434783, |
|
"loss": 1.9505, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.08869179600886919, |
|
"grad_norm": 0.37703999876976013, |
|
"learning_rate": 0.0008695652173913044, |
|
"loss": 1.7881, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.11086474501108648, |
|
"grad_norm": 0.30256885290145874, |
|
"learning_rate": 0.0009900990099009901, |
|
"loss": 1.7031, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.13303769401330376, |
|
"grad_norm": 0.3443244993686676, |
|
"learning_rate": 0.0009653465346534653, |
|
"loss": 1.6352, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.15521064301552107, |
|
"grad_norm": 0.369827538728714, |
|
"learning_rate": 0.0009405940594059406, |
|
"loss": 1.5746, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.17738359201773837, |
|
"grad_norm": 0.231527641415596, |
|
"learning_rate": 0.0009158415841584159, |
|
"loss": 1.5409, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.19955654101995565, |
|
"grad_norm": 0.22827404737472534, |
|
"learning_rate": 0.0008910891089108911, |
|
"loss": 1.5187, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.22172949002217296, |
|
"grad_norm": 0.2396710067987442, |
|
"learning_rate": 0.0008663366336633663, |
|
"loss": 1.5128, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.24390243902439024, |
|
"grad_norm": 0.20095600187778473, |
|
"learning_rate": 0.0008415841584158416, |
|
"loss": 1.4848, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.2660753880266075, |
|
"grad_norm": 0.28900983929634094, |
|
"learning_rate": 0.0008168316831683168, |
|
"loss": 1.4962, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.28824833702882485, |
|
"grad_norm": 0.25716254115104675, |
|
"learning_rate": 0.0007920792079207921, |
|
"loss": 1.4789, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.31042128603104213, |
|
"grad_norm": 0.252340167760849, |
|
"learning_rate": 0.0007673267326732674, |
|
"loss": 1.458, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.3325942350332594, |
|
"grad_norm": 0.20464155077934265, |
|
"learning_rate": 0.0007425742574257426, |
|
"loss": 1.4558, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.35476718403547675, |
|
"grad_norm": 0.23394732177257538, |
|
"learning_rate": 0.0007178217821782178, |
|
"loss": 1.4562, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.376940133037694, |
|
"grad_norm": 0.2164139449596405, |
|
"learning_rate": 0.000693069306930693, |
|
"loss": 1.4338, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.3991130820399113, |
|
"grad_norm": 0.215862438082695, |
|
"learning_rate": 0.0006683168316831684, |
|
"loss": 1.4287, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.4212860310421286, |
|
"grad_norm": 0.20270515978336334, |
|
"learning_rate": 0.0006435643564356436, |
|
"loss": 1.4226, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.4434589800443459, |
|
"grad_norm": 0.20255711674690247, |
|
"learning_rate": 0.0006188118811881188, |
|
"loss": 1.4314, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.4656319290465632, |
|
"grad_norm": 0.20747065544128418, |
|
"learning_rate": 0.000594059405940594, |
|
"loss": 1.4194, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.4878048780487805, |
|
"grad_norm": 0.2104884535074234, |
|
"learning_rate": 0.0005693069306930693, |
|
"loss": 1.4106, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.5099778270509978, |
|
"grad_norm": 0.21514882147312164, |
|
"learning_rate": 0.0005445544554455446, |
|
"loss": 1.42, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.532150776053215, |
|
"grad_norm": 0.20466424524784088, |
|
"learning_rate": 0.0005198019801980198, |
|
"loss": 1.3937, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.5543237250554324, |
|
"grad_norm": 0.2181282341480255, |
|
"learning_rate": 0.0004950495049504951, |
|
"loss": 1.3972, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.5764966740576497, |
|
"grad_norm": 0.22615699470043182, |
|
"learning_rate": 0.0004702970297029703, |
|
"loss": 1.3882, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.5986696230598669, |
|
"grad_norm": 0.1967965066432953, |
|
"learning_rate": 0.00044554455445544556, |
|
"loss": 1.388, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.6208425720620843, |
|
"grad_norm": 0.2030034065246582, |
|
"learning_rate": 0.0004207920792079208, |
|
"loss": 1.4048, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6430155210643016, |
|
"grad_norm": 0.2136310189962387, |
|
"learning_rate": 0.00039603960396039607, |
|
"loss": 1.3918, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.6651884700665188, |
|
"grad_norm": 0.22149060666561127, |
|
"learning_rate": 0.0003712871287128713, |
|
"loss": 1.4023, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.6873614190687362, |
|
"grad_norm": 0.2130667269229889, |
|
"learning_rate": 0.0003465346534653465, |
|
"loss": 1.3933, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.7095343680709535, |
|
"grad_norm": 0.19920696318149567, |
|
"learning_rate": 0.0003217821782178218, |
|
"loss": 1.3815, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.7317073170731707, |
|
"grad_norm": 0.20453611016273499, |
|
"learning_rate": 0.000297029702970297, |
|
"loss": 1.3648, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.753880266075388, |
|
"grad_norm": 0.21325863897800446, |
|
"learning_rate": 0.0002722772277227723, |
|
"loss": 1.3773, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.7760532150776053, |
|
"grad_norm": 0.2014823704957962, |
|
"learning_rate": 0.00024752475247524753, |
|
"loss": 1.3881, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.7982261640798226, |
|
"grad_norm": 0.20359407365322113, |
|
"learning_rate": 0.00022277227722772278, |
|
"loss": 1.3826, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.8203991130820399, |
|
"grad_norm": 0.21738748252391815, |
|
"learning_rate": 0.00019801980198019803, |
|
"loss": 1.3705, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.8425720620842572, |
|
"grad_norm": 0.1990172564983368, |
|
"learning_rate": 0.00017326732673267326, |
|
"loss": 1.3693, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.8647450110864745, |
|
"grad_norm": 0.2007543295621872, |
|
"learning_rate": 0.0001485148514851485, |
|
"loss": 1.3575, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.8869179600886918, |
|
"grad_norm": 0.5149243474006653, |
|
"learning_rate": 0.00012376237623762376, |
|
"loss": 1.374, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.9090909090909091, |
|
"grad_norm": 0.2131042778491974, |
|
"learning_rate": 9.900990099009902e-05, |
|
"loss": 1.3636, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.9312638580931264, |
|
"grad_norm": 0.19097404181957245, |
|
"learning_rate": 7.425742574257426e-05, |
|
"loss": 1.3489, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.9534368070953437, |
|
"grad_norm": 0.19905418157577515, |
|
"learning_rate": 4.950495049504951e-05, |
|
"loss": 1.3442, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.975609756097561, |
|
"grad_norm": 0.19617854058742523, |
|
"learning_rate": 2.4752475247524754e-05, |
|
"loss": 1.3721, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.9977827050997783, |
|
"grad_norm": 0.20064575970172882, |
|
"learning_rate": 0.0, |
|
"loss": 1.3767, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.9977827050997783, |
|
"eval_loss": 1.7732421159744263, |
|
"eval_runtime": 0.5415, |
|
"eval_samples_per_second": 1.847, |
|
"eval_steps_per_second": 1.847, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.9977827050997783, |
|
"step": 225, |
|
"total_flos": 3.3259687694984806e+17, |
|
"train_loss": 1.4963340536753336, |
|
"train_runtime": 725.2803, |
|
"train_samples_per_second": 9.934, |
|
"train_steps_per_second": 0.31 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 225, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.3259687694984806e+17, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|