|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.01301959014330229, |
|
"eval_steps": 500, |
|
"global_step": 500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0002603918028660458, |
|
"grad_norm": 0.4500846266746521, |
|
"learning_rate": 5.194805194805195e-06, |
|
"loss": 1.0381, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0005207836057320916, |
|
"grad_norm": 0.35188010334968567, |
|
"learning_rate": 1.038961038961039e-05, |
|
"loss": 1.0108, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0007811754085981374, |
|
"grad_norm": 0.2300374060869217, |
|
"learning_rate": 1.5584415584415583e-05, |
|
"loss": 0.9668, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0010415672114641832, |
|
"grad_norm": 0.16189467906951904, |
|
"learning_rate": 2.077922077922078e-05, |
|
"loss": 0.918, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.001301959014330229, |
|
"grad_norm": 0.18843211233615875, |
|
"learning_rate": 2.5974025974025972e-05, |
|
"loss": 0.9265, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0015623508171962747, |
|
"grad_norm": 0.20334510505199432, |
|
"learning_rate": 3.1168831168831166e-05, |
|
"loss": 0.9234, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0018227426200623205, |
|
"grad_norm": 0.1745327115058899, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 0.881, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0020831344229283663, |
|
"grad_norm": 0.18667331337928772, |
|
"learning_rate": 4.155844155844156e-05, |
|
"loss": 0.8592, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.002343526225794412, |
|
"grad_norm": 0.1848158985376358, |
|
"learning_rate": 4.675324675324675e-05, |
|
"loss": 0.8537, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.002603918028660458, |
|
"grad_norm": 0.17589879035949707, |
|
"learning_rate": 5.1948051948051944e-05, |
|
"loss": 0.8518, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0028643098315265037, |
|
"grad_norm": 0.2132624089717865, |
|
"learning_rate": 5.714285714285714e-05, |
|
"loss": 0.8511, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0031247016343925495, |
|
"grad_norm": 0.23070092499256134, |
|
"learning_rate": 6.233766233766233e-05, |
|
"loss": 0.7975, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0033850934372585953, |
|
"grad_norm": 0.25368157029151917, |
|
"learning_rate": 6.753246753246754e-05, |
|
"loss": 0.8134, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.003645485240124641, |
|
"grad_norm": 0.22897231578826904, |
|
"learning_rate": 7.272727272727273e-05, |
|
"loss": 0.8322, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.003905877042990687, |
|
"grad_norm": 0.19932536780834198, |
|
"learning_rate": 7.792207792207793e-05, |
|
"loss": 0.7959, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.004166268845856733, |
|
"grad_norm": 0.21011792123317719, |
|
"learning_rate": 8.311688311688312e-05, |
|
"loss": 0.8102, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.004426660648722778, |
|
"grad_norm": 0.20594824850559235, |
|
"learning_rate": 8.831168831168831e-05, |
|
"loss": 0.8128, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.004687052451588824, |
|
"grad_norm": 0.20465536415576935, |
|
"learning_rate": 9.35064935064935e-05, |
|
"loss": 0.7989, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.00494744425445487, |
|
"grad_norm": 0.4109392762184143, |
|
"learning_rate": 9.870129870129871e-05, |
|
"loss": 0.8108, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.005207836057320916, |
|
"grad_norm": 0.4293076694011688, |
|
"learning_rate": 0.00010389610389610389, |
|
"loss": 0.8101, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.005468227860186962, |
|
"grad_norm": 0.31628963351249695, |
|
"learning_rate": 0.00010909090909090909, |
|
"loss": 0.7989, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.005728619663053007, |
|
"grad_norm": 0.24642810225486755, |
|
"learning_rate": 0.00011428571428571428, |
|
"loss": 0.7751, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.005989011465919053, |
|
"grad_norm": 0.3599106967449188, |
|
"learning_rate": 0.00011948051948051949, |
|
"loss": 0.8063, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.006249403268785099, |
|
"grad_norm": 0.17053447663784027, |
|
"learning_rate": 0.00012467532467532467, |
|
"loss": 0.7751, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.006509795071651145, |
|
"grad_norm": 0.17303769290447235, |
|
"learning_rate": 0.00012987012987012987, |
|
"loss": 0.7883, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.0067701868745171905, |
|
"grad_norm": 0.1815861016511917, |
|
"learning_rate": 0.00013506493506493507, |
|
"loss": 0.788, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.007030578677383236, |
|
"grad_norm": 0.24125365912914276, |
|
"learning_rate": 0.00014025974025974028, |
|
"loss": 0.8018, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.007290970480249282, |
|
"grad_norm": 0.19443446397781372, |
|
"learning_rate": 0.00014545454545454546, |
|
"loss": 0.7908, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.007551362283115328, |
|
"grad_norm": 0.17829768359661102, |
|
"learning_rate": 0.00015064935064935066, |
|
"loss": 0.8033, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.007811754085981374, |
|
"grad_norm": 0.19535653293132782, |
|
"learning_rate": 0.00015584415584415587, |
|
"loss": 0.7997, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.008072145888847419, |
|
"grad_norm": 0.19930541515350342, |
|
"learning_rate": 0.00016103896103896104, |
|
"loss": 0.7945, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.008332537691713465, |
|
"grad_norm": 0.2156297266483307, |
|
"learning_rate": 0.00016623376623376625, |
|
"loss": 0.8018, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.00859292949457951, |
|
"grad_norm": 0.1924206018447876, |
|
"learning_rate": 0.00017142857142857143, |
|
"loss": 0.7746, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.008853321297445557, |
|
"grad_norm": 0.2294880747795105, |
|
"learning_rate": 0.00017662337662337663, |
|
"loss": 0.8152, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.009113713100311602, |
|
"grad_norm": 0.16817067563533783, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 0.7972, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.009374104903177648, |
|
"grad_norm": 0.18544812500476837, |
|
"learning_rate": 0.000187012987012987, |
|
"loss": 0.7801, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.009634496706043693, |
|
"grad_norm": 0.19597066938877106, |
|
"learning_rate": 0.00019220779220779222, |
|
"loss": 0.7706, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.00989488850890974, |
|
"grad_norm": 0.40291881561279297, |
|
"learning_rate": 0.00019740259740259742, |
|
"loss": 0.7911, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.010155280311775785, |
|
"grad_norm": 0.23841074109077454, |
|
"learning_rate": 0.00019999996515752773, |
|
"loss": 0.7861, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.010415672114641832, |
|
"grad_norm": 0.1675388514995575, |
|
"learning_rate": 0.00019999968641789507, |
|
"loss": 0.788, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.010676063917507876, |
|
"grad_norm": 1.8860758543014526, |
|
"learning_rate": 0.0001999991289394067, |
|
"loss": 0.7632, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.010936455720373923, |
|
"grad_norm": 0.17022117972373962, |
|
"learning_rate": 0.00019999829272361654, |
|
"loss": 0.784, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.011196847523239968, |
|
"grad_norm": 0.21460269391536713, |
|
"learning_rate": 0.00019999717777285545, |
|
"loss": 0.761, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.011457239326106015, |
|
"grad_norm": 0.19413785636425018, |
|
"learning_rate": 0.00019999578409023126, |
|
"loss": 0.7772, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.01171763112897206, |
|
"grad_norm": 0.20223405957221985, |
|
"learning_rate": 0.00019999411167962868, |
|
"loss": 0.7811, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.011978022931838106, |
|
"grad_norm": 0.15166303515434265, |
|
"learning_rate": 0.00019999216054570942, |
|
"loss": 0.7709, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.012238414734704151, |
|
"grad_norm": 0.16307081282138824, |
|
"learning_rate": 0.00019998993069391205, |
|
"loss": 0.7811, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.012498806537570198, |
|
"grad_norm": 0.15996049344539642, |
|
"learning_rate": 0.00019998742213045206, |
|
"loss": 0.7599, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.012759198340436243, |
|
"grad_norm": 0.17560279369354248, |
|
"learning_rate": 0.00019998463486232179, |
|
"loss": 0.7572, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.01301959014330229, |
|
"grad_norm": 0.17571642994880676, |
|
"learning_rate": 0.0001999815688972905, |
|
"loss": 0.7643, |
|
"step": 500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 19202, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.3458366881792e+17, |
|
"train_batch_size": 5, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|