|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 9.997590361445782, |
|
"eval_steps": 500, |
|
"global_step": 2070, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004819277108433735, |
|
"grad_norm": 7.09375, |
|
"learning_rate": 9.66183574879227e-07, |
|
"loss": 3.0067, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.024096385542168676, |
|
"grad_norm": 6.625, |
|
"learning_rate": 4.830917874396135e-06, |
|
"loss": 3.0384, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.04819277108433735, |
|
"grad_norm": 4.65625, |
|
"learning_rate": 9.66183574879227e-06, |
|
"loss": 2.9932, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07228915662650602, |
|
"grad_norm": 3.0625, |
|
"learning_rate": 1.4492753623188407e-05, |
|
"loss": 2.8805, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.0963855421686747, |
|
"grad_norm": 2.828125, |
|
"learning_rate": 1.932367149758454e-05, |
|
"loss": 2.7091, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.12048192771084337, |
|
"grad_norm": 2.265625, |
|
"learning_rate": 2.4154589371980676e-05, |
|
"loss": 2.5046, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.14457831325301204, |
|
"grad_norm": 1.359375, |
|
"learning_rate": 2.8985507246376814e-05, |
|
"loss": 2.3296, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.1686746987951807, |
|
"grad_norm": 1.53125, |
|
"learning_rate": 3.381642512077295e-05, |
|
"loss": 2.1499, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.1927710843373494, |
|
"grad_norm": 6.21875, |
|
"learning_rate": 3.864734299516908e-05, |
|
"loss": 1.9713, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.21686746987951808, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 4.347826086956522e-05, |
|
"loss": 1.8202, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.24096385542168675, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 4.830917874396135e-05, |
|
"loss": 1.6894, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.26506024096385544, |
|
"grad_norm": 0.58984375, |
|
"learning_rate": 5.3140096618357496e-05, |
|
"loss": 1.571, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.2891566265060241, |
|
"grad_norm": 0.52734375, |
|
"learning_rate": 5.797101449275363e-05, |
|
"loss": 1.4798, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3132530120481928, |
|
"grad_norm": 0.4375, |
|
"learning_rate": 6.280193236714976e-05, |
|
"loss": 1.4004, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.3373493975903614, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 6.76328502415459e-05, |
|
"loss": 1.3436, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.3614457831325301, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 7.246376811594203e-05, |
|
"loss": 1.2867, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.3855421686746988, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 7.729468599033817e-05, |
|
"loss": 1.2529, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.40963855421686746, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 8.21256038647343e-05, |
|
"loss": 1.215, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.43373493975903615, |
|
"grad_norm": 0.1796875, |
|
"learning_rate": 8.695652173913044e-05, |
|
"loss": 1.1938, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.4578313253012048, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 9.178743961352657e-05, |
|
"loss": 1.1765, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.4819277108433735, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 9.66183574879227e-05, |
|
"loss": 1.1637, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.5060240963855421, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.00010144927536231885, |
|
"loss": 1.1525, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.5301204819277109, |
|
"grad_norm": 0.765625, |
|
"learning_rate": 0.00010628019323671499, |
|
"loss": 1.1378, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.5542168674698795, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.00011111111111111112, |
|
"loss": 1.125, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.5783132530120482, |
|
"grad_norm": 0.375, |
|
"learning_rate": 0.00011594202898550725, |
|
"loss": 1.1077, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.6024096385542169, |
|
"grad_norm": 0.96484375, |
|
"learning_rate": 0.00012077294685990339, |
|
"loss": 1.1027, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.6265060240963856, |
|
"grad_norm": 0.359375, |
|
"learning_rate": 0.00012560386473429953, |
|
"loss": 1.1005, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.6506024096385542, |
|
"grad_norm": 0.515625, |
|
"learning_rate": 0.00013043478260869567, |
|
"loss": 1.0961, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.6746987951807228, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.0001352657004830918, |
|
"loss": 1.0835, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.6987951807228916, |
|
"grad_norm": 0.400390625, |
|
"learning_rate": 0.00014009661835748792, |
|
"loss": 1.0731, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.7228915662650602, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00014492753623188405, |
|
"loss": 1.0655, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.7469879518072289, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.0001497584541062802, |
|
"loss": 1.064, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.7710843373493976, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 0.00015458937198067633, |
|
"loss": 1.0539, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.7951807228915663, |
|
"grad_norm": 0.87109375, |
|
"learning_rate": 0.00015942028985507247, |
|
"loss": 1.057, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.8192771084337349, |
|
"grad_norm": 1.046875, |
|
"learning_rate": 0.0001642512077294686, |
|
"loss": 1.052, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.8433734939759037, |
|
"grad_norm": 1.203125, |
|
"learning_rate": 0.00016908212560386474, |
|
"loss": 1.0539, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.8674698795180723, |
|
"grad_norm": 0.40625, |
|
"learning_rate": 0.00017391304347826088, |
|
"loss": 1.0393, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.891566265060241, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.00017874396135265702, |
|
"loss": 1.0354, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.9156626506024096, |
|
"grad_norm": 0.3984375, |
|
"learning_rate": 0.00018357487922705313, |
|
"loss": 1.0443, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.9397590361445783, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00018840579710144927, |
|
"loss": 1.0303, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.963855421686747, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.0001932367149758454, |
|
"loss": 1.0281, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.9879518072289156, |
|
"grad_norm": 0.64453125, |
|
"learning_rate": 0.00019806763285024154, |
|
"loss": 1.0246, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.9975903614457832, |
|
"eval_loss": 2.4549527168273926, |
|
"eval_runtime": 0.4905, |
|
"eval_samples_per_second": 20.386, |
|
"eval_steps_per_second": 2.039, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.0120481927710843, |
|
"grad_norm": 0.57421875, |
|
"learning_rate": 0.00019999872036643513, |
|
"loss": 1.0207, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.036144578313253, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.00019999090050213636, |
|
"loss": 1.0131, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.0602409638554218, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00019997597223631895, |
|
"loss": 1.0098, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.0843373493975903, |
|
"grad_norm": 0.80078125, |
|
"learning_rate": 0.00019995393663024054, |
|
"loss": 1.0082, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.108433734939759, |
|
"grad_norm": 1.0625, |
|
"learning_rate": 0.00019992479525042303, |
|
"loss": 1.0136, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.1325301204819278, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.0001998885501685412, |
|
"loss": 1.004, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.1566265060240963, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 0.00019984520396127553, |
|
"loss": 1.0012, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.180722891566265, |
|
"grad_norm": 0.5234375, |
|
"learning_rate": 0.0001997947597101288, |
|
"loss": 0.9956, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.2048192771084336, |
|
"grad_norm": 0.3515625, |
|
"learning_rate": 0.0001997372210012073, |
|
"loss": 0.9968, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.2289156626506024, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 0.0001996725919249657, |
|
"loss": 0.997, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.2530120481927711, |
|
"grad_norm": 0.8125, |
|
"learning_rate": 0.00019960087707591626, |
|
"loss": 1.0023, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.2771084337349397, |
|
"grad_norm": 0.482421875, |
|
"learning_rate": 0.00019952208155230234, |
|
"loss": 0.9934, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.3012048192771084, |
|
"grad_norm": 0.53515625, |
|
"learning_rate": 0.00019943621095573586, |
|
"loss": 0.9971, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.3253012048192772, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00019934327139079915, |
|
"loss": 0.9945, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.3493975903614457, |
|
"grad_norm": 0.341796875, |
|
"learning_rate": 0.00019924326946461074, |
|
"loss": 0.9864, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.3734939759036144, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.0001991362122863561, |
|
"loss": 0.9804, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.3975903614457832, |
|
"grad_norm": 0.447265625, |
|
"learning_rate": 0.0001990221074667818, |
|
"loss": 0.9826, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.4216867469879517, |
|
"grad_norm": 0.326171875, |
|
"learning_rate": 0.00019890096311765465, |
|
"loss": 0.9858, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.4457831325301205, |
|
"grad_norm": 0.45703125, |
|
"learning_rate": 0.00019877278785118517, |
|
"loss": 0.9808, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.4698795180722892, |
|
"grad_norm": 0.54296875, |
|
"learning_rate": 0.00019863759077941504, |
|
"loss": 0.9733, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.4939759036144578, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00019849538151356955, |
|
"loss": 0.9788, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.5180722891566265, |
|
"grad_norm": 0.474609375, |
|
"learning_rate": 0.0001983461701633742, |
|
"loss": 0.981, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.5421686746987953, |
|
"grad_norm": 0.318359375, |
|
"learning_rate": 0.00019818996733633618, |
|
"loss": 0.9806, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.5662650602409638, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00019802678413699006, |
|
"loss": 0.9681, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.5903614457831325, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00019785663216610844, |
|
"loss": 0.9758, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.6144578313253013, |
|
"grad_norm": 0.369140625, |
|
"learning_rate": 0.0001976795235198773, |
|
"loss": 0.9711, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.6385542168674698, |
|
"grad_norm": 0.3671875, |
|
"learning_rate": 0.00019749547078903604, |
|
"loss": 0.9666, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.6626506024096386, |
|
"grad_norm": 0.41015625, |
|
"learning_rate": 0.00019730448705798239, |
|
"loss": 0.9673, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.6867469879518073, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00019710658590384227, |
|
"loss": 0.9631, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.7108433734939759, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00019690178139550443, |
|
"loss": 0.9646, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.7349397590361446, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00019669008809262062, |
|
"loss": 0.9699, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.7590361445783134, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00019647152104457013, |
|
"loss": 0.9639, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.783132530120482, |
|
"grad_norm": 0.337890625, |
|
"learning_rate": 0.00019624609578939027, |
|
"loss": 0.9597, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.8072289156626506, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.0001960138283526715, |
|
"loss": 0.962, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.8313253012048194, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.0001957747352464184, |
|
"loss": 0.9628, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.855421686746988, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.00019552883346787552, |
|
"loss": 0.9582, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.8795180722891565, |
|
"grad_norm": 0.4296875, |
|
"learning_rate": 0.0001952761404983194, |
|
"loss": 0.9566, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.9036144578313254, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 0.0001950166743018156, |
|
"loss": 0.9614, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.927710843373494, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00019475045332394153, |
|
"loss": 0.9607, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.9518072289156625, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00019447749649047542, |
|
"loss": 0.9511, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.9759036144578315, |
|
"grad_norm": 0.384765625, |
|
"learning_rate": 0.0001941978232060507, |
|
"loss": 0.9516, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00019391145335277655, |
|
"loss": 0.9556, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2.453037977218628, |
|
"eval_runtime": 0.4785, |
|
"eval_samples_per_second": 20.897, |
|
"eval_steps_per_second": 2.09, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.0240963855421685, |
|
"grad_norm": 0.373046875, |
|
"learning_rate": 0.00019361840728882447, |
|
"loss": 0.9377, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.0481927710843375, |
|
"grad_norm": 0.4609375, |
|
"learning_rate": 0.00019331870584698093, |
|
"loss": 0.9384, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.072289156626506, |
|
"grad_norm": 0.44921875, |
|
"learning_rate": 0.00019301237033316659, |
|
"loss": 0.9315, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.0963855421686746, |
|
"grad_norm": 0.26953125, |
|
"learning_rate": 0.00019269942252492133, |
|
"loss": 0.9328, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.1204819277108435, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0001923798846698564, |
|
"loss": 0.9374, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.144578313253012, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00019205377948407258, |
|
"loss": 0.9304, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.1686746987951806, |
|
"grad_norm": 0.4453125, |
|
"learning_rate": 0.00019172113015054532, |
|
"loss": 0.9319, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.1927710843373496, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00019138196031747681, |
|
"loss": 0.9386, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.216867469879518, |
|
"grad_norm": 0.46484375, |
|
"learning_rate": 0.0001910362940966147, |
|
"loss": 0.9383, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.2409638554216866, |
|
"grad_norm": 0.470703125, |
|
"learning_rate": 0.00019068415606153787, |
|
"loss": 0.9369, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 2.2650602409638556, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00019032557124590974, |
|
"loss": 0.9336, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 2.289156626506024, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00018996056514169844, |
|
"loss": 0.9292, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 2.3132530120481927, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.0001895891636973646, |
|
"loss": 0.9386, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 2.337349397590361, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00018921139331601667, |
|
"loss": 0.9404, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 2.36144578313253, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00018882728085353392, |
|
"loss": 0.9267, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 2.3855421686746987, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 0.00018843685361665723, |
|
"loss": 0.9285, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 2.4096385542168672, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00018804013936104792, |
|
"loss": 0.9396, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 2.433734939759036, |
|
"grad_norm": 0.30078125, |
|
"learning_rate": 0.00018763716628931437, |
|
"loss": 0.929, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 2.4578313253012047, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 0.0001872279630490074, |
|
"loss": 0.9277, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 2.4819277108433733, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 0.00018681255873058338, |
|
"loss": 0.9293, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 2.5060240963855422, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00018639098286533644, |
|
"loss": 0.9253, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 2.5301204819277108, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00018596326542329888, |
|
"loss": 0.9231, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 2.5542168674698793, |
|
"grad_norm": 0.5390625, |
|
"learning_rate": 0.00018552943681111067, |
|
"loss": 0.9241, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 2.5783132530120483, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.0001850895278698579, |
|
"loss": 0.9234, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 2.602409638554217, |
|
"grad_norm": 0.361328125, |
|
"learning_rate": 0.00018464356987288013, |
|
"loss": 0.9284, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 2.6265060240963853, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.0001841915945235472, |
|
"loss": 0.9247, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 2.6506024096385543, |
|
"grad_norm": 0.34765625, |
|
"learning_rate": 0.00018373363395300554, |
|
"loss": 0.9314, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 2.674698795180723, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.0001832697207178938, |
|
"loss": 0.9308, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 2.6987951807228914, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.00018279988779802833, |
|
"loss": 0.9275, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 2.7228915662650603, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.00018232416859405895, |
|
"loss": 0.9261, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 2.746987951807229, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.00018184259692509406, |
|
"loss": 0.925, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 2.7710843373493974, |
|
"grad_norm": 0.310546875, |
|
"learning_rate": 0.00018135520702629675, |
|
"loss": 0.9211, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 2.7951807228915664, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 0.00018086203354645089, |
|
"loss": 0.9219, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 2.819277108433735, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00018036311154549784, |
|
"loss": 0.9304, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 2.8433734939759034, |
|
"grad_norm": 0.390625, |
|
"learning_rate": 0.00017985847649204417, |
|
"loss": 0.926, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 2.8674698795180724, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 0.00017934816426084008, |
|
"loss": 0.9136, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 2.891566265060241, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00017883221113022916, |
|
"loss": 0.9189, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 2.9156626506024095, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.0001783106537795692, |
|
"loss": 0.9138, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 2.9397590361445785, |
|
"grad_norm": 0.44140625, |
|
"learning_rate": 0.00017778352928662474, |
|
"loss": 0.923, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 2.963855421686747, |
|
"grad_norm": 0.24609375, |
|
"learning_rate": 0.0001772508751249311, |
|
"loss": 0.9259, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 2.9879518072289155, |
|
"grad_norm": 0.36328125, |
|
"learning_rate": 0.00017671272916113052, |
|
"loss": 0.9114, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 2.9975903614457833, |
|
"eval_loss": 2.46407151222229, |
|
"eval_runtime": 0.6669, |
|
"eval_samples_per_second": 14.995, |
|
"eval_steps_per_second": 1.5, |
|
"step": 622 |
|
}, |
|
{ |
|
"epoch": 3.0120481927710845, |
|
"grad_norm": 0.259765625, |
|
"learning_rate": 0.00017616912965228001, |
|
"loss": 0.9017, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 3.036144578313253, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 0.00017562011524313185, |
|
"loss": 0.897, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 3.0602409638554215, |
|
"grad_norm": 0.28515625, |
|
"learning_rate": 0.0001750657249633861, |
|
"loss": 0.896, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 3.0843373493975905, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00017450599822491615, |
|
"loss": 0.9036, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 3.108433734939759, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00017394097481896676, |
|
"loss": 0.9018, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 3.1325301204819276, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00017337069491332537, |
|
"loss": 0.8983, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 3.1566265060240966, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.00017279519904946647, |
|
"loss": 0.896, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 3.180722891566265, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0001722145281396697, |
|
"loss": 0.907, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 3.2048192771084336, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00017162872346411102, |
|
"loss": 0.8986, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 3.2289156626506026, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 0.00017103782666792844, |
|
"loss": 0.9046, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 3.253012048192771, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.00017044187975826124, |
|
"loss": 0.9018, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 3.2771084337349397, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.00016984092510126367, |
|
"loss": 0.903, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 3.3012048192771086, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.0001692350054190932, |
|
"loss": 0.9077, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 3.325301204819277, |
|
"grad_norm": 0.380859375, |
|
"learning_rate": 0.0001686241637868734, |
|
"loss": 0.9033, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 3.3493975903614457, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00016800844362963147, |
|
"loss": 0.9051, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 3.3734939759036147, |
|
"grad_norm": 0.3203125, |
|
"learning_rate": 0.00016738788871921152, |
|
"loss": 0.8974, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 3.397590361445783, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 0.00016676254317116252, |
|
"loss": 0.9016, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 3.4216867469879517, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.0001661324514416022, |
|
"loss": 0.9006, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 3.4457831325301207, |
|
"grad_norm": 0.376953125, |
|
"learning_rate": 0.00016549765832405653, |
|
"loss": 0.9005, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 3.4698795180722892, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.0001648582089462756, |
|
"loss": 0.9024, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 3.4939759036144578, |
|
"grad_norm": 0.3046875, |
|
"learning_rate": 0.00016421414876702518, |
|
"loss": 0.8996, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 3.5180722891566267, |
|
"grad_norm": 0.328125, |
|
"learning_rate": 0.00016356552357285522, |
|
"loss": 0.8988, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 3.5421686746987953, |
|
"grad_norm": 0.392578125, |
|
"learning_rate": 0.0001629123794748447, |
|
"loss": 0.8975, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 3.566265060240964, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.00016225476290532374, |
|
"loss": 0.9, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 3.5903614457831328, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00016159272061457255, |
|
"loss": 0.8948, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 3.6144578313253013, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0001609262996674981, |
|
"loss": 0.8854, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 3.63855421686747, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 0.0001602555474402881, |
|
"loss": 0.8981, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 3.662650602409639, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00015958051161704307, |
|
"loss": 0.8997, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 3.6867469879518073, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 0.00015890124018638638, |
|
"loss": 0.8909, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 3.710843373493976, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00015821778143805296, |
|
"loss": 0.8904, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 3.734939759036145, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00015753018395945598, |
|
"loss": 0.896, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 3.7590361445783134, |
|
"grad_norm": 0.421875, |
|
"learning_rate": 0.00015683849663223308, |
|
"loss": 0.9004, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 3.783132530120482, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 0.00015614276862877113, |
|
"loss": 0.8979, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 3.807228915662651, |
|
"grad_norm": 0.353515625, |
|
"learning_rate": 0.0001554430494087107, |
|
"loss": 0.893, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 3.8313253012048194, |
|
"grad_norm": 0.40234375, |
|
"learning_rate": 0.00015473938871542986, |
|
"loss": 0.8977, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 3.855421686746988, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 0.00015403183657250788, |
|
"loss": 0.8969, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 3.8795180722891565, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 0.00015332044328016914, |
|
"loss": 0.8921, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 3.9036144578313254, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 0.00015260525941170712, |
|
"loss": 0.8913, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 3.927710843373494, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 0.00015188633580988926, |
|
"loss": 0.896, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 3.9518072289156625, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 0.00015116372358334233, |
|
"loss": 0.892, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 3.9759036144578315, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 0.00015043747410291945, |
|
"loss": 0.8948, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.00014970763899804763, |
|
"loss": 0.8927, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.4882054328918457, |
|
"eval_runtime": 0.482, |
|
"eval_samples_per_second": 20.748, |
|
"eval_steps_per_second": 2.075, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 4.024096385542169, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 0.0001489742701530578, |
|
"loss": 0.8742, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 4.048192771084337, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00014823741970349606, |
|
"loss": 0.8702, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 4.072289156626506, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 0.0001474971400324177, |
|
"loss": 0.8777, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 4.096385542168675, |
|
"grad_norm": 0.50390625, |
|
"learning_rate": 0.00014675348376666278, |
|
"loss": 0.8776, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 4.120481927710843, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00014600650377311522, |
|
"loss": 0.875, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 4.144578313253012, |
|
"grad_norm": 0.294921875, |
|
"learning_rate": 0.00014525625315494435, |
|
"loss": 0.8817, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 4.168674698795181, |
|
"grad_norm": 0.4140625, |
|
"learning_rate": 0.00014450278524782986, |
|
"loss": 0.8808, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 4.192771084337349, |
|
"grad_norm": 0.49609375, |
|
"learning_rate": 0.00014374615361616985, |
|
"loss": 0.8775, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 4.216867469879518, |
|
"grad_norm": 0.455078125, |
|
"learning_rate": 0.00014298641204927342, |
|
"loss": 0.8796, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 4.240963855421687, |
|
"grad_norm": 0.322265625, |
|
"learning_rate": 0.0001422236145575362, |
|
"loss": 0.8753, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 4.265060240963855, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00014145781536860122, |
|
"loss": 0.8735, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 4.289156626506024, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 0.00014068906892350343, |
|
"loss": 0.8753, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 4.313253012048193, |
|
"grad_norm": 0.416015625, |
|
"learning_rate": 0.0001399174298727998, |
|
"loss": 0.8752, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 4.337349397590361, |
|
"grad_norm": 0.314453125, |
|
"learning_rate": 0.00013914295307268396, |
|
"loss": 0.8734, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 4.36144578313253, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 0.00013836569358108647, |
|
"loss": 0.8815, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 4.385542168674699, |
|
"grad_norm": 0.32421875, |
|
"learning_rate": 0.00013758570665376086, |
|
"loss": 0.8834, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 4.409638554216867, |
|
"grad_norm": 0.349609375, |
|
"learning_rate": 0.00013680304774035538, |
|
"loss": 0.8787, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 4.433734939759036, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.00013601777248047105, |
|
"loss": 0.8785, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 4.457831325301205, |
|
"grad_norm": 0.255859375, |
|
"learning_rate": 0.0001352299366997062, |
|
"loss": 0.8729, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 4.481927710843373, |
|
"grad_norm": 0.412109375, |
|
"learning_rate": 0.0001344395964056878, |
|
"loss": 0.8778, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 4.506024096385542, |
|
"grad_norm": 0.404296875, |
|
"learning_rate": 0.00013364680778409, |
|
"loss": 0.8816, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 4.530120481927711, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 0.00013285162719463961, |
|
"loss": 0.8762, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 4.554216867469879, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00013205411116710972, |
|
"loss": 0.8776, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 4.578313253012048, |
|
"grad_norm": 0.33203125, |
|
"learning_rate": 0.0001312543163973007, |
|
"loss": 0.8674, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 4.602409638554217, |
|
"grad_norm": 0.33984375, |
|
"learning_rate": 0.00013045229974300993, |
|
"loss": 0.8759, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 4.626506024096385, |
|
"grad_norm": 0.47265625, |
|
"learning_rate": 0.0001296481182199896, |
|
"loss": 0.8727, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 4.650602409638554, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 0.00012884182899789343, |
|
"loss": 0.8739, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 4.674698795180722, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00012803348939621252, |
|
"loss": 0.874, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 4.698795180722891, |
|
"grad_norm": 0.236328125, |
|
"learning_rate": 0.00012722315688020047, |
|
"loss": 0.8745, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 4.72289156626506, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.00012641088905678802, |
|
"loss": 0.8738, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 4.746987951807229, |
|
"grad_norm": 0.23046875, |
|
"learning_rate": 0.000125596743670488, |
|
"loss": 0.8721, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 4.771084337349397, |
|
"grad_norm": 0.296875, |
|
"learning_rate": 0.00012478077859929, |
|
"loss": 0.8734, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 4.795180722891566, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 0.0001239630518505459, |
|
"loss": 0.876, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 4.8192771084337345, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 0.00012314362155684612, |
|
"loss": 0.874, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 4.843373493975903, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00012232254597188688, |
|
"loss": 0.8752, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 4.867469879518072, |
|
"grad_norm": 0.287109375, |
|
"learning_rate": 0.00012149988346632894, |
|
"loss": 0.8689, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 4.891566265060241, |
|
"grad_norm": 0.2373046875, |
|
"learning_rate": 0.00012067569252364809, |
|
"loss": 0.8704, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 4.9156626506024095, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 0.0001198500317359774, |
|
"loss": 0.8785, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 4.9397590361445785, |
|
"grad_norm": 0.25, |
|
"learning_rate": 0.00011902295979994192, |
|
"loss": 0.8732, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 4.9638554216867465, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 0.00011819453551248592, |
|
"loss": 0.8796, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 4.9879518072289155, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.8752, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 4.997590361445783, |
|
"eval_loss": 2.508066177368164, |
|
"eval_runtime": 0.5, |
|
"eval_samples_per_second": 20.0, |
|
"eval_steps_per_second": 2.0, |
|
"step": 1037 |
|
}, |
|
{ |
|
"epoch": 5.0120481927710845, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 0.00011653386554759946, |
|
"loss": 0.8612, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 5.036144578313253, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 0.00011570173792800066, |
|
"loss": 0.8559, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 5.0602409638554215, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 0.00011486849406425188, |
|
"loss": 0.8507, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 5.0843373493975905, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00011403419319206284, |
|
"loss": 0.8603, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 5.108433734939759, |
|
"grad_norm": 0.26171875, |
|
"learning_rate": 0.0001131988946222863, |
|
"loss": 0.8592, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 5.132530120481928, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 0.00011236265773670196, |
|
"loss": 0.855, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 5.156626506024097, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 0.00011152554198379484, |
|
"loss": 0.8566, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 5.180722891566265, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00011068760687452895, |
|
"loss": 0.8521, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 5.204819277108434, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 0.00010984891197811687, |
|
"loss": 0.8605, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 5.228915662650603, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 0.00010900951691778481, |
|
"loss": 0.8623, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 5.253012048192771, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 0.00010816948136653386, |
|
"loss": 0.8583, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 5.27710843373494, |
|
"grad_norm": 0.2353515625, |
|
"learning_rate": 0.000107328865042898, |
|
"loss": 0.8571, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 5.301204819277109, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 0.00010648772770669861, |
|
"loss": 0.862, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 5.325301204819277, |
|
"grad_norm": 0.244140625, |
|
"learning_rate": 0.00010564612915479612, |
|
"loss": 0.8531, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 5.349397590361446, |
|
"grad_norm": 0.28125, |
|
"learning_rate": 0.00010480412921683888, |
|
"loss": 0.8646, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 5.373493975903615, |
|
"grad_norm": 0.2734375, |
|
"learning_rate": 0.00010396178775101014, |
|
"loss": 0.858, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 5.397590361445783, |
|
"grad_norm": 0.29296875, |
|
"learning_rate": 0.00010311916463977242, |
|
"loss": 0.863, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 5.421686746987952, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 0.00010227631978561056, |
|
"loss": 0.8601, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 5.445783132530121, |
|
"grad_norm": 0.2421875, |
|
"learning_rate": 0.00010143331310677331, |
|
"loss": 0.8566, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 5.469879518072289, |
|
"grad_norm": 0.240234375, |
|
"learning_rate": 0.00010059020453301345, |
|
"loss": 0.8538, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 5.493975903614458, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 9.974705400132764e-05, |
|
"loss": 0.8591, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 5.518072289156627, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 9.890392145169531e-05, |
|
"loss": 0.8576, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 5.542168674698795, |
|
"grad_norm": 0.2392578125, |
|
"learning_rate": 9.806086682281758e-05, |
|
"loss": 0.8615, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 5.566265060240964, |
|
"grad_norm": 0.291015625, |
|
"learning_rate": 9.721795004785605e-05, |
|
"loss": 0.8572, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 5.590361445783133, |
|
"grad_norm": 0.283203125, |
|
"learning_rate": 9.637523105017229e-05, |
|
"loss": 0.8644, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 5.614457831325301, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 9.553276973906786e-05, |
|
"loss": 0.854, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 5.63855421686747, |
|
"grad_norm": 0.2578125, |
|
"learning_rate": 9.469062600552509e-05, |
|
"loss": 0.8526, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 5.662650602409639, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 9.384885971794961e-05, |
|
"loss": 0.8563, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 5.686746987951807, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 9.300753071791434e-05, |
|
"loss": 0.8614, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 5.710843373493976, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 9.216669881590515e-05, |
|
"loss": 0.8633, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 5.734939759036145, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 9.132642378706894e-05, |
|
"loss": 0.8548, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 5.759036144578313, |
|
"grad_norm": 0.2890625, |
|
"learning_rate": 9.048676536696425e-05, |
|
"loss": 0.8603, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 5.783132530120482, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 8.964778324731467e-05, |
|
"loss": 0.8643, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 5.807228915662651, |
|
"grad_norm": 0.330078125, |
|
"learning_rate": 8.880953707176514e-05, |
|
"loss": 0.8562, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 5.831325301204819, |
|
"grad_norm": 0.25, |
|
"learning_rate": 8.797208643164212e-05, |
|
"loss": 0.8565, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 5.855421686746988, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 8.713549086171691e-05, |
|
"loss": 0.8566, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 5.879518072289157, |
|
"grad_norm": 0.279296875, |
|
"learning_rate": 8.629980983597358e-05, |
|
"loss": 0.8596, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 5.903614457831325, |
|
"grad_norm": 0.34375, |
|
"learning_rate": 8.546510276338078e-05, |
|
"loss": 0.8522, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 5.927710843373494, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 8.463142898366834e-05, |
|
"loss": 0.8561, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 5.951807228915663, |
|
"grad_norm": 0.275390625, |
|
"learning_rate": 8.37988477631088e-05, |
|
"loss": 0.8611, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 5.975903614457831, |
|
"grad_norm": 0.267578125, |
|
"learning_rate": 8.296741829030418e-05, |
|
"loss": 0.8552, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"grad_norm": 0.248046875, |
|
"learning_rate": 8.213719967197817e-05, |
|
"loss": 0.8602, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 2.5277345180511475, |
|
"eval_runtime": 0.4824, |
|
"eval_samples_per_second": 20.732, |
|
"eval_steps_per_second": 2.073, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 6.024096385542169, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 8.130825092877418e-05, |
|
"loss": 0.8432, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 6.048192771084337, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 8.04806309910597e-05, |
|
"loss": 0.8387, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 6.072289156626506, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 7.965439869473664e-05, |
|
"loss": 0.8463, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 6.096385542168675, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 7.882961277705895e-05, |
|
"loss": 0.8402, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 6.120481927710843, |
|
"grad_norm": 0.2451171875, |
|
"learning_rate": 7.800633187245673e-05, |
|
"loss": 0.8472, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 6.144578313253012, |
|
"grad_norm": 0.271484375, |
|
"learning_rate": 7.718461450836804e-05, |
|
"loss": 0.8457, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 6.168674698795181, |
|
"grad_norm": 0.27734375, |
|
"learning_rate": 7.636451910107806e-05, |
|
"loss": 0.8381, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 6.192771084337349, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 7.554610395156624e-05, |
|
"loss": 0.8384, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 6.216867469879518, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 7.472942724136174e-05, |
|
"loss": 0.8468, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 6.240963855421687, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 7.391454702840722e-05, |
|
"loss": 0.8472, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 6.265060240963855, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 7.310152124293146e-05, |
|
"loss": 0.8441, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 6.289156626506024, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 7.229040768333115e-05, |
|
"loss": 0.8424, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 6.313253012048193, |
|
"grad_norm": 0.2216796875, |
|
"learning_rate": 7.14812640120618e-05, |
|
"loss": 0.8377, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 6.337349397590361, |
|
"grad_norm": 0.25390625, |
|
"learning_rate": 7.067414775153871e-05, |
|
"loss": 0.8428, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 6.36144578313253, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 6.986911628004753e-05, |
|
"loss": 0.8411, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 6.385542168674699, |
|
"grad_norm": 0.302734375, |
|
"learning_rate": 6.906622682766526e-05, |
|
"loss": 0.8451, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 6.409638554216867, |
|
"grad_norm": 0.263671875, |
|
"learning_rate": 6.826553647219175e-05, |
|
"loss": 0.8468, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 6.433734939759036, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 6.74671021350919e-05, |
|
"loss": 0.8499, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 6.457831325301205, |
|
"grad_norm": 0.234375, |
|
"learning_rate": 6.667098057744927e-05, |
|
"loss": 0.8447, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 6.481927710843373, |
|
"grad_norm": 0.31640625, |
|
"learning_rate": 6.587722839593073e-05, |
|
"loss": 0.8462, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 6.506024096385542, |
|
"grad_norm": 0.3125, |
|
"learning_rate": 6.508590201876317e-05, |
|
"loss": 0.8477, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 6.530120481927711, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 6.429705770172168e-05, |
|
"loss": 0.8473, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 6.554216867469879, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 6.351075152413068e-05, |
|
"loss": 0.8411, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 6.578313253012048, |
|
"grad_norm": 0.2470703125, |
|
"learning_rate": 6.272703938487694e-05, |
|
"loss": 0.8514, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 6.602409638554217, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 6.194597699843581e-05, |
|
"loss": 0.8415, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 6.626506024096385, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 6.116761989091042e-05, |
|
"loss": 0.8429, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 6.650602409638554, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 6.039202339608432e-05, |
|
"loss": 0.8435, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 6.674698795180722, |
|
"grad_norm": 0.220703125, |
|
"learning_rate": 5.961924265148777e-05, |
|
"loss": 0.8455, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 6.698795180722891, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 5.884933259447798e-05, |
|
"loss": 0.8457, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 6.72289156626506, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 5.8082347958333625e-05, |
|
"loss": 0.8481, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 6.746987951807229, |
|
"grad_norm": 0.298828125, |
|
"learning_rate": 5.731834326836366e-05, |
|
"loss": 0.8469, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 6.771084337349397, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 5.6557372838031384e-05, |
|
"loss": 0.8459, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 6.795180722891566, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 5.579949076509305e-05, |
|
"loss": 0.8455, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 6.8192771084337345, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 5.5044750927752106e-05, |
|
"loss": 0.8444, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 6.843373493975903, |
|
"grad_norm": 0.251953125, |
|
"learning_rate": 5.429320698082887e-05, |
|
"loss": 0.8427, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 6.867469879518072, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 5.354491235194635e-05, |
|
"loss": 0.8466, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 6.891566265060241, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 5.279992023773195e-05, |
|
"loss": 0.8419, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 6.9156626506024095, |
|
"grad_norm": 0.232421875, |
|
"learning_rate": 5.205828360003568e-05, |
|
"loss": 0.8468, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 6.9397590361445785, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 5.1320055162165115e-05, |
|
"loss": 0.8435, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 6.9638554216867465, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 5.0585287405137305e-05, |
|
"loss": 0.8444, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 6.9879518072289155, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 4.9854032563947714e-05, |
|
"loss": 0.8464, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 6.997590361445783, |
|
"eval_loss": 2.5512688159942627, |
|
"eval_runtime": 0.4955, |
|
"eval_samples_per_second": 20.183, |
|
"eval_steps_per_second": 2.018, |
|
"step": 1452 |
|
}, |
|
{ |
|
"epoch": 7.0120481927710845, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 4.912634262385695e-05, |
|
"loss": 0.8349, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 7.036144578313253, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 4.8402269316695134e-05, |
|
"loss": 0.8372, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 7.0602409638554215, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 4.768186411718417e-05, |
|
"loss": 0.8283, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 7.0843373493975905, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 4.696517823927842e-05, |
|
"loss": 0.842, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 7.108433734939759, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 4.625226263252386e-05, |
|
"loss": 0.8375, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 7.132530120481928, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 4.554316797843609e-05, |
|
"loss": 0.8257, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 7.156626506024097, |
|
"grad_norm": 0.2255859375, |
|
"learning_rate": 4.483794468689728e-05, |
|
"loss": 0.8347, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 7.180722891566265, |
|
"grad_norm": 0.2314453125, |
|
"learning_rate": 4.413664289257265e-05, |
|
"loss": 0.8354, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 7.204819277108434, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 4.343931245134616e-05, |
|
"loss": 0.8382, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 7.228915662650603, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 4.274600293677647e-05, |
|
"loss": 0.8359, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 7.253012048192771, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 4.2056763636572574e-05, |
|
"loss": 0.8351, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 7.27710843373494, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 4.137164354908999e-05, |
|
"loss": 0.8377, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 7.301204819277109, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 4.069069137984731e-05, |
|
"loss": 0.8313, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 7.325301204819277, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 4.001395553806391e-05, |
|
"loss": 0.8382, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 7.349397590361446, |
|
"grad_norm": 0.2431640625, |
|
"learning_rate": 3.9341484133218366e-05, |
|
"loss": 0.8368, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 7.373493975903615, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 3.8673324971628357e-05, |
|
"loss": 0.8419, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 7.397590361445783, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 3.800952555305216e-05, |
|
"loss": 0.8357, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 7.421686746987952, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 3.7350133067311686e-05, |
|
"loss": 0.8393, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 7.445783132530121, |
|
"grad_norm": 0.2412109375, |
|
"learning_rate": 3.669519439093801e-05, |
|
"loss": 0.8319, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 7.469879518072289, |
|
"grad_norm": 0.23828125, |
|
"learning_rate": 3.604475608383858e-05, |
|
"loss": 0.8358, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 7.493975903614458, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 3.539886438598756e-05, |
|
"loss": 0.8369, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 7.518072289156627, |
|
"grad_norm": 0.2158203125, |
|
"learning_rate": 3.475756521413839e-05, |
|
"loss": 0.8355, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 7.542168674698795, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 3.412090415855963e-05, |
|
"loss": 0.8374, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 7.566265060240964, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 3.348892647979389e-05, |
|
"loss": 0.8353, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 7.590361445783133, |
|
"grad_norm": 0.2333984375, |
|
"learning_rate": 3.2861677105440336e-05, |
|
"loss": 0.8368, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 7.614457831325301, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 3.223920062696052e-05, |
|
"loss": 0.8315, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 7.63855421686747, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 3.1621541296508695e-05, |
|
"loss": 0.8373, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 7.662650602409639, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 3.100874302378559e-05, |
|
"loss": 0.8315, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 7.686746987951807, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 3.0400849372917073e-05, |
|
"loss": 0.839, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 7.710843373493976, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 2.979790355935703e-05, |
|
"loss": 0.8368, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 7.734939759036145, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 2.919994844681524e-05, |
|
"loss": 0.8348, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 7.759036144578313, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 2.8607026544210114e-05, |
|
"loss": 0.8323, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 7.783132530120482, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 2.801918000264665e-05, |
|
"loss": 0.8344, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 7.807228915662651, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 2.7436450612420095e-05, |
|
"loss": 0.8343, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 7.831325301204819, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 2.6858879800044866e-05, |
|
"loss": 0.8329, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 7.855421686746988, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 2.6286508625309624e-05, |
|
"loss": 0.8354, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 7.879518072289157, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 2.57193777783582e-05, |
|
"loss": 0.8399, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 7.903614457831325, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 2.515752757679707e-05, |
|
"loss": 0.8325, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 7.927710843373494, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 2.4600997962828987e-05, |
|
"loss": 0.8338, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 7.951807228915663, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 2.404982850041363e-05, |
|
"loss": 0.8371, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 7.975903614457831, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 2.3504058372454884e-05, |
|
"loss": 0.8319, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"grad_norm": 0.2265625, |
|
"learning_rate": 2.2963726378015327e-05, |
|
"loss": 0.8353, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.5614659786224365, |
|
"eval_runtime": 0.4882, |
|
"eval_samples_per_second": 20.483, |
|
"eval_steps_per_second": 2.048, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 8.024096385542169, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 2.242887092955801e-05, |
|
"loss": 0.8313, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 8.048192771084338, |
|
"grad_norm": 0.2275390625, |
|
"learning_rate": 2.189953005021569e-05, |
|
"loss": 0.8278, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 8.072289156626505, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 2.1375741371087677e-05, |
|
"loss": 0.8321, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 8.096385542168674, |
|
"grad_norm": 0.2177734375, |
|
"learning_rate": 2.085754212856471e-05, |
|
"loss": 0.8368, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 8.120481927710843, |
|
"grad_norm": 0.21875, |
|
"learning_rate": 2.0344969161681792e-05, |
|
"loss": 0.8258, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 8.144578313253012, |
|
"grad_norm": 0.2236328125, |
|
"learning_rate": 1.983805890949927e-05, |
|
"loss": 0.8273, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 8.168674698795181, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 1.9336847408512328e-05, |
|
"loss": 0.8272, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 8.19277108433735, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 1.884137029008921e-05, |
|
"loss": 0.8334, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 8.216867469879517, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 1.8351662777938127e-05, |
|
"loss": 0.8337, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 8.240963855421686, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 1.7867759685603114e-05, |
|
"loss": 0.8306, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 8.265060240963855, |
|
"grad_norm": 0.2138671875, |
|
"learning_rate": 1.738969541398926e-05, |
|
"loss": 0.8306, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 8.289156626506024, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 1.691750394891707e-05, |
|
"loss": 0.8299, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 8.313253012048193, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 1.6451218858706374e-05, |
|
"loss": 0.829, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 8.337349397590362, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 1.59908732917899e-05, |
|
"loss": 0.831, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 8.36144578313253, |
|
"grad_norm": 0.2197265625, |
|
"learning_rate": 1.5536499974356866e-05, |
|
"loss": 0.8269, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 8.385542168674698, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 1.5088131208026367e-05, |
|
"loss": 0.834, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 8.409638554216867, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 1.4645798867551008e-05, |
|
"loss": 0.8324, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 8.433734939759036, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 1.4209534398551016e-05, |
|
"loss": 0.8325, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 8.457831325301205, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 1.3779368815278647e-05, |
|
"loss": 0.8392, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 8.481927710843374, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 1.335533269841347e-05, |
|
"loss": 0.8284, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 8.506024096385541, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 1.2937456192888309e-05, |
|
"loss": 0.8332, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 8.53012048192771, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 1.252576900574618e-05, |
|
"loss": 0.8269, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 8.55421686746988, |
|
"grad_norm": 0.224609375, |
|
"learning_rate": 1.2120300404028507e-05, |
|
"loss": 0.8346, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 8.578313253012048, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 1.1721079212694452e-05, |
|
"loss": 0.8331, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 8.602409638554217, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 1.1328133812571784e-05, |
|
"loss": 0.8324, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 8.626506024096386, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 1.0941492138339183e-05, |
|
"loss": 0.8285, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 8.650602409638553, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 1.0561181676540444e-05, |
|
"loss": 0.838, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 8.674698795180722, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 1.01872294636304e-05, |
|
"loss": 0.827, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 8.698795180722891, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 9.81966208405285e-06, |
|
"loss": 0.8331, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 8.72289156626506, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 9.458505668350759e-06, |
|
"loss": 0.8304, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 8.74698795180723, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 9.103785891308547e-06, |
|
"loss": 0.8333, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 8.771084337349398, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 8.755527970126853e-06, |
|
"loss": 0.8325, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 8.795180722891565, |
|
"grad_norm": 0.21484375, |
|
"learning_rate": 8.413756662629879e-06, |
|
"loss": 0.8326, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 8.819277108433734, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 8.07849626550531e-06, |
|
"loss": 0.8315, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 8.843373493975903, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 7.74977061257709e-06, |
|
"loss": 0.833, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 8.867469879518072, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 7.427603073110967e-06, |
|
"loss": 0.8299, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 8.891566265060241, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 7.1120165501533e-06, |
|
"loss": 0.8282, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 8.91566265060241, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 6.803033478902765e-06, |
|
"loss": 0.8267, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 8.939759036144578, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 6.500675825115454e-06, |
|
"loss": 0.83, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 8.963855421686747, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 6.204965083543368e-06, |
|
"loss": 0.8267, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 8.987951807228916, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 5.915922276406249e-06, |
|
"loss": 0.8267, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 8.997590361445782, |
|
"eval_loss": 2.5673654079437256, |
|
"eval_runtime": 0.7609, |
|
"eval_samples_per_second": 13.142, |
|
"eval_steps_per_second": 1.314, |
|
"step": 1867 |
|
}, |
|
{ |
|
"epoch": 9.012048192771084, |
|
"grad_norm": 0.197265625, |
|
"learning_rate": 5.633567951897145e-06, |
|
"loss": 0.8245, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 9.036144578313253, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 5.357922182721687e-06, |
|
"loss": 0.8271, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 9.060240963855422, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 5.08900456467103e-06, |
|
"loss": 0.8373, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 9.08433734939759, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 4.826834215228826e-06, |
|
"loss": 0.8242, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 9.108433734939759, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 4.5714297722121106e-06, |
|
"loss": 0.8283, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 9.132530120481928, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 4.322809392446392e-06, |
|
"loss": 0.8313, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 9.156626506024097, |
|
"grad_norm": 0.208984375, |
|
"learning_rate": 4.080990750474778e-06, |
|
"loss": 0.8287, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 9.180722891566266, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 3.84599103730161e-06, |
|
"loss": 0.8265, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 9.204819277108435, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 3.617826959170256e-06, |
|
"loss": 0.8307, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 9.228915662650602, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 3.3965147363754555e-06, |
|
"loss": 0.8343, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 9.25301204819277, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 3.182070102110257e-06, |
|
"loss": 0.8313, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 9.27710843373494, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 2.974508301347534e-06, |
|
"loss": 0.8319, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 9.301204819277109, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 2.7738440897561723e-06, |
|
"loss": 0.8307, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 9.325301204819278, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 2.580091732652101e-06, |
|
"loss": 0.8316, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 9.349397590361447, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 2.3932650039841687e-06, |
|
"loss": 0.8309, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 9.373493975903614, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 2.213377185354959e-06, |
|
"loss": 0.8335, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 9.397590361445783, |
|
"grad_norm": 0.2060546875, |
|
"learning_rate": 2.0404410650765817e-06, |
|
"loss": 0.8307, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 9.421686746987952, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 1.874468937261531e-06, |
|
"loss": 0.8301, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 9.44578313253012, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 1.71547260094872e-06, |
|
"loss": 0.8262, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 9.46987951807229, |
|
"grad_norm": 0.2294921875, |
|
"learning_rate": 1.5634633592646609e-06, |
|
"loss": 0.8284, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 9.493975903614459, |
|
"grad_norm": 0.205078125, |
|
"learning_rate": 1.4184520186199202e-06, |
|
"loss": 0.8309, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 9.518072289156626, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 1.2804488879408993e-06, |
|
"loss": 0.8288, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 9.542168674698795, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 1.1494637779369766e-06, |
|
"loss": 0.8309, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 9.566265060240964, |
|
"grad_norm": 0.2021484375, |
|
"learning_rate": 1.0255060004030093e-06, |
|
"loss": 0.8251, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 9.590361445783133, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 9.085843675574079e-07, |
|
"loss": 0.8351, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 9.614457831325302, |
|
"grad_norm": 0.201171875, |
|
"learning_rate": 7.987071914156596e-07, |
|
"loss": 0.8259, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 9.638554216867469, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 6.958822831994005e-07, |
|
"loss": 0.8283, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 9.684337349397591, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 6.001169527811268e-07, |
|
"loss": 0.8287, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 9.708433734939758, |
|
"grad_norm": 0.20703125, |
|
"learning_rate": 5.114180081645214e-07, |
|
"loss": 0.8321, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 9.732530120481927, |
|
"grad_norm": 0.19921875, |
|
"learning_rate": 4.2979175500050817e-07, |
|
"loss": 0.8297, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 9.756626506024096, |
|
"grad_norm": 0.228515625, |
|
"learning_rate": 3.552439961389431e-07, |
|
"loss": 0.8235, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 9.780722891566265, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 2.877800312160783e-07, |
|
"loss": 0.8302, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 9.804819277108434, |
|
"grad_norm": 0.2109375, |
|
"learning_rate": 2.274046562778409e-07, |
|
"loss": 0.8331, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 9.828915662650603, |
|
"grad_norm": 0.2119140625, |
|
"learning_rate": 1.7412216343885014e-07, |
|
"loss": 0.8281, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 9.85301204819277, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 1.2793634057732818e-07, |
|
"loss": 0.8321, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 9.87710843373494, |
|
"grad_norm": 0.212890625, |
|
"learning_rate": 8.885047106578227e-08, |
|
"loss": 0.832, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 9.901204819277108, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 5.6867333537580226e-08, |
|
"loss": 0.8308, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 9.925301204819277, |
|
"grad_norm": 0.2001953125, |
|
"learning_rate": 3.1989201689452967e-08, |
|
"loss": 0.8283, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 9.949397590361446, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 1.4217844119857048e-08, |
|
"loss": 0.8307, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 9.973493975903615, |
|
"grad_norm": 0.2080078125, |
|
"learning_rate": 3.554524203175369e-09, |
|
"loss": 0.8252, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 9.997590361445782, |
|
"grad_norm": 0.203125, |
|
"learning_rate": 0.0, |
|
"loss": 0.8289, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 9.997590361445782, |
|
"eval_loss": 2.5681488513946533, |
|
"eval_runtime": 0.4835, |
|
"eval_samples_per_second": 20.683, |
|
"eval_steps_per_second": 2.068, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 9.997590361445782, |
|
"step": 2070, |
|
"total_flos": 3.290190024938619e+18, |
|
"train_loss": 0.02805145143886695, |
|
"train_runtime": 534.5593, |
|
"train_samples_per_second": 495.96, |
|
"train_steps_per_second": 3.872 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2070, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.290190024938619e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|