|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 3957, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.08993126249565843, |
|
"learning_rate": 5.050505050505052e-07, |
|
"loss": 1.9018, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 0.07331289037109069, |
|
"learning_rate": 2.5252525252525253e-06, |
|
"loss": 1.764, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.07765988729484478, |
|
"learning_rate": 5.050505050505051e-06, |
|
"loss": 1.6755, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.09328963425207142, |
|
"learning_rate": 7.5757575757575764e-06, |
|
"loss": 1.7942, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.08439300726187475, |
|
"learning_rate": 1.0101010101010101e-05, |
|
"loss": 1.9255, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.09969799609843567, |
|
"learning_rate": 1.2626262626262628e-05, |
|
"loss": 1.6785, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.08101062126150285, |
|
"learning_rate": 1.5151515151515153e-05, |
|
"loss": 1.6021, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.09324937558599246, |
|
"learning_rate": 1.7676767676767676e-05, |
|
"loss": 1.8021, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.08809158633106223, |
|
"learning_rate": 2.0202020202020203e-05, |
|
"loss": 1.8128, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.12040981734567617, |
|
"learning_rate": 2.272727272727273e-05, |
|
"loss": 1.9518, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.10564260792758735, |
|
"learning_rate": 2.5252525252525256e-05, |
|
"loss": 1.844, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.11067902372614258, |
|
"learning_rate": 2.777777777777778e-05, |
|
"loss": 1.7309, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.17208150270588693, |
|
"learning_rate": 3.0303030303030306e-05, |
|
"loss": 1.83, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.17753444313487116, |
|
"learning_rate": 3.282828282828283e-05, |
|
"loss": 1.7143, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.11795959596262973, |
|
"learning_rate": 3.535353535353535e-05, |
|
"loss": 1.6863, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.1604849588266011, |
|
"learning_rate": 3.787878787878788e-05, |
|
"loss": 1.7895, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.14614002357667696, |
|
"learning_rate": 4.0404040404040405e-05, |
|
"loss": 1.7037, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.17612584653207902, |
|
"learning_rate": 4.292929292929293e-05, |
|
"loss": 1.7624, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.1725623148760258, |
|
"learning_rate": 4.545454545454546e-05, |
|
"loss": 1.7826, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.20023707797673052, |
|
"learning_rate": 4.797979797979798e-05, |
|
"loss": 1.7551, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.19340080776803978, |
|
"learning_rate": 5.050505050505051e-05, |
|
"loss": 1.8434, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.17540911026085398, |
|
"learning_rate": 5.303030303030303e-05, |
|
"loss": 1.8444, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.17607693067428137, |
|
"learning_rate": 5.555555555555556e-05, |
|
"loss": 1.7179, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.18623446289553894, |
|
"learning_rate": 5.808080808080808e-05, |
|
"loss": 1.8005, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.22653423501586226, |
|
"learning_rate": 6.060606060606061e-05, |
|
"loss": 1.8171, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.19917898944232487, |
|
"learning_rate": 6.313131313131313e-05, |
|
"loss": 1.7935, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.17977021019465064, |
|
"learning_rate": 6.565656565656566e-05, |
|
"loss": 1.847, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.21882213186080465, |
|
"learning_rate": 6.818181818181818e-05, |
|
"loss": 1.7249, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.19872378885041136, |
|
"learning_rate": 7.07070707070707e-05, |
|
"loss": 1.8317, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.18503126257452687, |
|
"learning_rate": 7.323232323232324e-05, |
|
"loss": 1.7036, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.19374257378242796, |
|
"learning_rate": 7.575757575757576e-05, |
|
"loss": 1.7408, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.20435751977610797, |
|
"learning_rate": 7.828282828282829e-05, |
|
"loss": 1.7453, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.18626365580812038, |
|
"learning_rate": 8.080808080808081e-05, |
|
"loss": 1.7538, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.22638414276196805, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 1.7755, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.19644895370384188, |
|
"learning_rate": 8.585858585858586e-05, |
|
"loss": 1.8446, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.19159413735540007, |
|
"learning_rate": 8.83838383838384e-05, |
|
"loss": 1.6323, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.17020103839194523, |
|
"learning_rate": 9.090909090909092e-05, |
|
"loss": 1.7957, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.19164694691999767, |
|
"learning_rate": 9.343434343434344e-05, |
|
"loss": 1.9204, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.19378174604020243, |
|
"learning_rate": 9.595959595959596e-05, |
|
"loss": 1.6792, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.24199163008867994, |
|
"learning_rate": 9.848484848484849e-05, |
|
"loss": 1.7818, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.1791702851062047, |
|
"learning_rate": 0.00010101010101010102, |
|
"loss": 1.6407, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.2127448005277486, |
|
"learning_rate": 0.00010353535353535353, |
|
"loss": 1.8173, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.18625979651987537, |
|
"learning_rate": 0.00010606060606060606, |
|
"loss": 1.7401, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.2602576963144457, |
|
"learning_rate": 0.0001085858585858586, |
|
"loss": 1.8104, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.19387518149584881, |
|
"learning_rate": 0.00011111111111111112, |
|
"loss": 1.8442, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.22413096563678928, |
|
"learning_rate": 0.00011363636363636365, |
|
"loss": 1.6172, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.21913536165908545, |
|
"learning_rate": 0.00011616161616161616, |
|
"loss": 1.6973, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.2083524734994055, |
|
"learning_rate": 0.00011868686868686869, |
|
"loss": 1.7593, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.22803196006710846, |
|
"learning_rate": 0.00012121212121212122, |
|
"loss": 1.731, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.16039461658504198, |
|
"learning_rate": 0.00012373737373737374, |
|
"loss": 1.5913, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.2185859764067758, |
|
"learning_rate": 0.00012626262626262626, |
|
"loss": 1.637, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.19449925000530618, |
|
"learning_rate": 0.00012878787878787878, |
|
"loss": 1.5634, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.18094727231062543, |
|
"learning_rate": 0.00013131313131313133, |
|
"loss": 1.6769, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.2362383395641708, |
|
"learning_rate": 0.00013383838383838385, |
|
"loss": 1.7723, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.1756303905738309, |
|
"learning_rate": 0.00013636363636363637, |
|
"loss": 1.7622, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.18784556886056825, |
|
"learning_rate": 0.0001388888888888889, |
|
"loss": 1.648, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.23195176017229427, |
|
"learning_rate": 0.0001414141414141414, |
|
"loss": 1.846, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.22241261139284105, |
|
"learning_rate": 0.00014393939393939396, |
|
"loss": 1.6282, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.1959378752266171, |
|
"learning_rate": 0.00014646464646464648, |
|
"loss": 1.7298, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.18110574265575713, |
|
"learning_rate": 0.000148989898989899, |
|
"loss": 1.6463, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.19727075597861782, |
|
"learning_rate": 0.00015151515151515152, |
|
"loss": 1.7925, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.20574697015902954, |
|
"learning_rate": 0.00015404040404040406, |
|
"loss": 1.6835, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.18178501884804188, |
|
"learning_rate": 0.00015656565656565658, |
|
"loss": 1.8534, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.20396286221213047, |
|
"learning_rate": 0.0001590909090909091, |
|
"loss": 1.9553, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.19731656072570272, |
|
"learning_rate": 0.00016161616161616162, |
|
"loss": 1.7907, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.15745281662564334, |
|
"learning_rate": 0.00016414141414141414, |
|
"loss": 1.7516, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.17389045576146547, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 1.6362, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.2055099842458337, |
|
"learning_rate": 0.00016919191919191918, |
|
"loss": 1.711, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.16967943859036833, |
|
"learning_rate": 0.00017171717171717173, |
|
"loss": 1.7327, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.20493364192749108, |
|
"learning_rate": 0.00017424242424242425, |
|
"loss": 1.7575, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.22713873700263487, |
|
"learning_rate": 0.0001767676767676768, |
|
"loss": 1.6266, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.22661135493794904, |
|
"learning_rate": 0.00017929292929292931, |
|
"loss": 1.5738, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.2181059846275241, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 1.6742, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.17088148508773793, |
|
"learning_rate": 0.00018434343434343435, |
|
"loss": 1.7483, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.2533679574468662, |
|
"learning_rate": 0.00018686868686868687, |
|
"loss": 1.8377, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.19728510430536142, |
|
"learning_rate": 0.00018939393939393942, |
|
"loss": 1.6412, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.1846496893995934, |
|
"learning_rate": 0.00019191919191919191, |
|
"loss": 1.6605, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.20677111282109845, |
|
"learning_rate": 0.00019444444444444446, |
|
"loss": 1.9151, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.1843158891748435, |
|
"learning_rate": 0.00019696969696969698, |
|
"loss": 1.6697, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.19889363775332344, |
|
"learning_rate": 0.0001994949494949495, |
|
"loss": 1.7293, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.18003384908917786, |
|
"learning_rate": 0.00019999937734807612, |
|
"loss": 1.8024, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.18260287569380637, |
|
"learning_rate": 0.00019999684783792443, |
|
"loss": 1.6779, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.21622290040357123, |
|
"learning_rate": 0.00019999237260298072, |
|
"loss": 1.6577, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.19728853094941184, |
|
"learning_rate": 0.00019998595173032347, |
|
"loss": 1.6211, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.16427481018358323, |
|
"learning_rate": 0.00019997758534488915, |
|
"loss": 1.6793, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.1464512551401983, |
|
"learning_rate": 0.00019996727360946972, |
|
"loss": 1.731, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.1895744669006413, |
|
"learning_rate": 0.00019995501672470951, |
|
"loss": 1.7024, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.13688692966034832, |
|
"learning_rate": 0.00019994081492910124, |
|
"loss": 1.8371, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.20101726127225358, |
|
"learning_rate": 0.0001999246684989815, |
|
"loss": 1.7473, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.20241522090213954, |
|
"learning_rate": 0.00019990657774852534, |
|
"loss": 1.7423, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.1767592377256186, |
|
"learning_rate": 0.00019988654302974, |
|
"loss": 1.8304, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.2373965969657545, |
|
"learning_rate": 0.00019986456473245826, |
|
"loss": 1.8509, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.21300866974991087, |
|
"learning_rate": 0.00019984064328433084, |
|
"loss": 1.7339, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.185425069119908, |
|
"learning_rate": 0.00019981477915081793, |
|
"loss": 1.7523, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.1955299660793198, |
|
"learning_rate": 0.00019978697283518023, |
|
"loss": 1.804, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.19829821726437152, |
|
"learning_rate": 0.00019975722487846918, |
|
"loss": 1.8378, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.13816451624075418, |
|
"learning_rate": 0.0001997255358595164, |
|
"loss": 1.791, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.17416550139224937, |
|
"learning_rate": 0.00019969190639492244, |
|
"loss": 1.6882, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.19361009276270708, |
|
"learning_rate": 0.00019965633713904472, |
|
"loss": 1.7448, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.20655351119978135, |
|
"learning_rate": 0.00019961882878398492, |
|
"loss": 1.7804, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.2104318907698028, |
|
"learning_rate": 0.0001995793820595754, |
|
"loss": 1.7399, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.1970506865196183, |
|
"learning_rate": 0.00019953799773336507, |
|
"loss": 1.662, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.12911497323739385, |
|
"learning_rate": 0.00019949467661060433, |
|
"loss": 1.6589, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.18727055645023982, |
|
"learning_rate": 0.00019944941953422968, |
|
"loss": 1.7437, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.21063285499774953, |
|
"learning_rate": 0.000199402227384847, |
|
"loss": 1.837, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.16895692207829008, |
|
"learning_rate": 0.00019935310108071453, |
|
"loss": 1.7406, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.2316031917603028, |
|
"learning_rate": 0.00019930204157772515, |
|
"loss": 1.8237, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.14077975973845075, |
|
"learning_rate": 0.00019924904986938754, |
|
"loss": 1.8804, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.25152961069767266, |
|
"learning_rate": 0.000199194126986807, |
|
"loss": 1.7984, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.18475665649785333, |
|
"learning_rate": 0.00019913727399866545, |
|
"loss": 1.7, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.15993162566307856, |
|
"learning_rate": 0.00019907849201120033, |
|
"loss": 1.8694, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.21887423989587396, |
|
"learning_rate": 0.00019901778216818345, |
|
"loss": 1.699, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.18385555657977046, |
|
"learning_rate": 0.00019895514565089855, |
|
"loss": 1.7936, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.15762946661816535, |
|
"learning_rate": 0.00019889058367811822, |
|
"loss": 1.6613, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.19204775302628793, |
|
"learning_rate": 0.0001988240975060804, |
|
"loss": 1.5856, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.1697199863146858, |
|
"learning_rate": 0.00019875568842846382, |
|
"loss": 1.672, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.1410887592852674, |
|
"learning_rate": 0.0001986853577763628, |
|
"loss": 1.6269, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.1783222763204088, |
|
"learning_rate": 0.00019861310691826143, |
|
"loss": 1.8029, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.20484278901882244, |
|
"learning_rate": 0.00019853893726000683, |
|
"loss": 1.6194, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.1808969694192384, |
|
"learning_rate": 0.00019846285024478202, |
|
"loss": 1.7084, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.1965951187170914, |
|
"learning_rate": 0.00019838484735307748, |
|
"loss": 1.706, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.1555012346720015, |
|
"learning_rate": 0.0001983049301026627, |
|
"loss": 1.464, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.2229027944987823, |
|
"learning_rate": 0.00019822310004855652, |
|
"loss": 1.673, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.169635050611861, |
|
"learning_rate": 0.00019813935878299662, |
|
"loss": 1.6593, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.16624303946845476, |
|
"learning_rate": 0.0001980537079354091, |
|
"loss": 1.7164, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.1544208624543807, |
|
"learning_rate": 0.00019796614917237616, |
|
"loss": 1.5616, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.1900272509930039, |
|
"learning_rate": 0.00019787668419760408, |
|
"loss": 1.6552, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.20362594606792483, |
|
"learning_rate": 0.00019778531475188996, |
|
"loss": 1.7175, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.15933464850430776, |
|
"learning_rate": 0.00019769204261308774, |
|
"loss": 1.6599, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.15846354449923994, |
|
"learning_rate": 0.00019759686959607383, |
|
"loss": 1.7152, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.1731064728813603, |
|
"learning_rate": 0.00019749979755271155, |
|
"loss": 1.8006, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.2036118054344575, |
|
"learning_rate": 0.00019740082837181526, |
|
"loss": 1.6992, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.20595935892977982, |
|
"learning_rate": 0.00019729996397911356, |
|
"loss": 1.7571, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.1816167430276872, |
|
"learning_rate": 0.00019719720633721178, |
|
"loss": 1.8058, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.2139611683255453, |
|
"learning_rate": 0.00019709255744555389, |
|
"loss": 1.8398, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.18514013236898805, |
|
"learning_rate": 0.0001969860193403835, |
|
"loss": 1.7307, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.17288244809213096, |
|
"learning_rate": 0.00019687759409470426, |
|
"loss": 1.7242, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.15953349037713735, |
|
"learning_rate": 0.00019676728381823956, |
|
"loss": 1.6435, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.20963390389942183, |
|
"learning_rate": 0.00019665509065739149, |
|
"loss": 1.6791, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.19075148330166494, |
|
"learning_rate": 0.000196541016795199, |
|
"loss": 1.505, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.22817672978454195, |
|
"learning_rate": 0.00019642506445129545, |
|
"loss": 1.8361, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.1925013343867196, |
|
"learning_rate": 0.00019630723588186545, |
|
"loss": 1.7126, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.16780528759294142, |
|
"learning_rate": 0.000196187533379601, |
|
"loss": 1.6649, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.17707927803137202, |
|
"learning_rate": 0.00019606595927365675, |
|
"loss": 1.6551, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.22525846033337887, |
|
"learning_rate": 0.00019594251592960479, |
|
"loss": 1.7401, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.1953310514707257, |
|
"learning_rate": 0.0001958172057493886, |
|
"loss": 1.6944, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.2085121645512001, |
|
"learning_rate": 0.0001956900311712763, |
|
"loss": 1.663, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.17093646250615369, |
|
"learning_rate": 0.0001955609946698131, |
|
"loss": 1.772, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.19564116222725914, |
|
"learning_rate": 0.00019543009875577346, |
|
"loss": 1.6328, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.215195812549034, |
|
"learning_rate": 0.0001952973459761118, |
|
"loss": 1.6438, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.19377558972597342, |
|
"learning_rate": 0.0001951627389139134, |
|
"loss": 1.7442, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.1792011980095539, |
|
"learning_rate": 0.00019502628018834372, |
|
"loss": 1.7518, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.18977603295326154, |
|
"learning_rate": 0.00019488797245459773, |
|
"loss": 1.688, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.19341540153355985, |
|
"learning_rate": 0.00019474781840384816, |
|
"loss": 1.7562, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.14738398424312027, |
|
"learning_rate": 0.00019460582076319302, |
|
"loss": 1.7244, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.1496446026997031, |
|
"learning_rate": 0.00019446198229560276, |
|
"loss": 1.7083, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.2151992641933425, |
|
"learning_rate": 0.00019431630579986632, |
|
"loss": 1.7078, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.1972075447483379, |
|
"learning_rate": 0.00019416879411053673, |
|
"loss": 1.7665, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.20871968848692934, |
|
"learning_rate": 0.00019401945009787594, |
|
"loss": 1.6636, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.2047491094137733, |
|
"learning_rate": 0.0001938682766677991, |
|
"loss": 1.8061, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.1622522396758859, |
|
"learning_rate": 0.00019371527676181777, |
|
"loss": 1.8645, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.1714969472958251, |
|
"learning_rate": 0.00019356045335698296, |
|
"loss": 1.8266, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.1971306915514917, |
|
"learning_rate": 0.00019340380946582695, |
|
"loss": 1.7205, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.17020111323913545, |
|
"learning_rate": 0.00019324534813630487, |
|
"loss": 1.7339, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.18250825908624654, |
|
"learning_rate": 0.00019308507245173527, |
|
"loss": 1.5188, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.18593221945740382, |
|
"learning_rate": 0.0001929229855307402, |
|
"loss": 1.654, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.22850385689556876, |
|
"learning_rate": 0.00019275909052718447, |
|
"loss": 1.7814, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.19759950903326942, |
|
"learning_rate": 0.00019259339063011432, |
|
"loss": 1.744, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.17215997030755548, |
|
"learning_rate": 0.00019242588906369536, |
|
"loss": 1.8283, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.19136317284315416, |
|
"learning_rate": 0.00019225658908714983, |
|
"loss": 1.6163, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.254426335434924, |
|
"learning_rate": 0.00019208549399469318, |
|
"loss": 1.7618, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.21881435842952657, |
|
"learning_rate": 0.00019191260711547001, |
|
"loss": 1.7315, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.20799528199612635, |
|
"learning_rate": 0.0001917379318134892, |
|
"loss": 1.7859, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.17796834357588534, |
|
"learning_rate": 0.00019156147148755855, |
|
"loss": 1.7345, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.1855849493493474, |
|
"learning_rate": 0.0001913832295712186, |
|
"loss": 1.6232, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.20017349406812152, |
|
"learning_rate": 0.00019120320953267586, |
|
"loss": 1.7546, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.2146332192452092, |
|
"learning_rate": 0.0001910214148747352, |
|
"loss": 1.6231, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.15896122862532144, |
|
"learning_rate": 0.0001908378491347319, |
|
"loss": 1.5535, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.17416174476856394, |
|
"learning_rate": 0.00019065251588446265, |
|
"loss": 1.6337, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.23962933659259386, |
|
"learning_rate": 0.0001904654187301161, |
|
"loss": 1.8581, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.17002626630746845, |
|
"learning_rate": 0.0001902765613122028, |
|
"loss": 1.6537, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.23553588791103638, |
|
"learning_rate": 0.0001900859473054841, |
|
"loss": 1.7497, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.17184692025828147, |
|
"learning_rate": 0.00018989358041890094, |
|
"loss": 1.6305, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.19276600455036005, |
|
"learning_rate": 0.00018969946439550148, |
|
"loss": 1.6965, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.2266174277702017, |
|
"learning_rate": 0.0001895036030123684, |
|
"loss": 1.7845, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.15948175213103422, |
|
"learning_rate": 0.0001893060000805453, |
|
"loss": 1.582, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.20415510379076665, |
|
"learning_rate": 0.00018910665944496264, |
|
"loss": 1.6576, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.20826411615417578, |
|
"learning_rate": 0.00018890558498436282, |
|
"loss": 1.7243, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.2535182644413305, |
|
"learning_rate": 0.00018870278061122484, |
|
"loss": 1.5795, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.17063517897800512, |
|
"learning_rate": 0.00018849825027168803, |
|
"loss": 1.6361, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.1898841623155248, |
|
"learning_rate": 0.00018829199794547535, |
|
"loss": 1.7526, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.19033639448531828, |
|
"learning_rate": 0.00018808402764581596, |
|
"loss": 1.5943, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.1647576077524525, |
|
"learning_rate": 0.0001878743434193671, |
|
"loss": 1.7575, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.2070226518164384, |
|
"learning_rate": 0.00018766294934613535, |
|
"loss": 1.741, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.21633633400820462, |
|
"learning_rate": 0.00018744984953939726, |
|
"loss": 1.6967, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.2039504411965307, |
|
"learning_rate": 0.0001872350481456193, |
|
"loss": 1.6825, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.19382809212719235, |
|
"learning_rate": 0.0001870185493443772, |
|
"loss": 1.7494, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.17836311560595738, |
|
"learning_rate": 0.0001868003573482746, |
|
"loss": 1.6326, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.18940985276826594, |
|
"learning_rate": 0.0001865804764028611, |
|
"loss": 1.6823, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.15827883706638377, |
|
"learning_rate": 0.0001863589107865496, |
|
"loss": 1.8507, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.2024112582787964, |
|
"learning_rate": 0.00018613566481053315, |
|
"loss": 1.6737, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.18631332115379975, |
|
"learning_rate": 0.00018591074281870099, |
|
"loss": 1.6391, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.2322177223268837, |
|
"learning_rate": 0.00018568414918755397, |
|
"loss": 1.7185, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.19585063603806546, |
|
"learning_rate": 0.00018545588832611956, |
|
"loss": 1.8829, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.19046268057109556, |
|
"learning_rate": 0.00018522596467586598, |
|
"loss": 1.6889, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.2319521660184869, |
|
"learning_rate": 0.00018499438271061568, |
|
"loss": 1.7148, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.18401426887501984, |
|
"learning_rate": 0.0001847611469364584, |
|
"loss": 1.6355, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.16467673844089234, |
|
"learning_rate": 0.00018452626189166345, |
|
"loss": 1.5748, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.21515271715044545, |
|
"learning_rate": 0.0001842897321465915, |
|
"loss": 1.7172, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.20010536585072475, |
|
"learning_rate": 0.0001840515623036055, |
|
"loss": 1.7331, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.15220183718369495, |
|
"learning_rate": 0.0001838117569969812, |
|
"loss": 1.7703, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.19249950248721495, |
|
"learning_rate": 0.00018357032089281702, |
|
"loss": 1.7356, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.15685889188495356, |
|
"learning_rate": 0.00018332725868894313, |
|
"loss": 1.5789, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.22123166856945198, |
|
"learning_rate": 0.00018308257511483018, |
|
"loss": 1.7449, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 0.81, |
|
"grad_norm": 0.21921162541787237, |
|
"learning_rate": 0.00018283627493149721, |
|
"loss": 1.592, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.15892072068340937, |
|
"learning_rate": 0.00018258836293141907, |
|
"loss": 1.6588, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.2129268440643301, |
|
"learning_rate": 0.000182338843938433, |
|
"loss": 1.6687, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.82, |
|
"grad_norm": 0.18558886049158316, |
|
"learning_rate": 0.000182087722807645, |
|
"loss": 1.6204, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.21759739469279235, |
|
"learning_rate": 0.00018183500442533514, |
|
"loss": 1.7012, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.16739812153050462, |
|
"learning_rate": 0.00018158069370886266, |
|
"loss": 1.7749, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 0.83, |
|
"grad_norm": 0.2120028175464506, |
|
"learning_rate": 0.0001813247956065702, |
|
"loss": 1.7076, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.21506301830058508, |
|
"learning_rate": 0.00018106731509768753, |
|
"loss": 1.6561, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 0.21374007008875692, |
|
"learning_rate": 0.00018080825719223468, |
|
"loss": 1.7721, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.21473556112453085, |
|
"learning_rate": 0.00018054762693092444, |
|
"loss": 1.5391, |
|
"step": 1115 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.1928094864305794, |
|
"learning_rate": 0.00018028542938506426, |
|
"loss": 1.7297, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.22195616352181186, |
|
"learning_rate": 0.0001800216696564576, |
|
"loss": 1.6239, |
|
"step": 1125 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.2493704349381919, |
|
"learning_rate": 0.00017975635287730473, |
|
"loss": 1.7736, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.1871166430212898, |
|
"learning_rate": 0.00017948948421010264, |
|
"loss": 1.67, |
|
"step": 1135 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.16460126336549072, |
|
"learning_rate": 0.00017922106884754488, |
|
"loss": 1.7331, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.18707990225784327, |
|
"learning_rate": 0.0001789511120124203, |
|
"loss": 1.5608, |
|
"step": 1145 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.21751770239029078, |
|
"learning_rate": 0.00017867961895751163, |
|
"loss": 1.721, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.1674742118307801, |
|
"learning_rate": 0.00017840659496549298, |
|
"loss": 1.7339, |
|
"step": 1155 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.19173527103482793, |
|
"learning_rate": 0.00017813204534882738, |
|
"loss": 1.7348, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.18468049289167895, |
|
"learning_rate": 0.0001778559754496631, |
|
"loss": 1.6823, |
|
"step": 1165 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.2069730744593729, |
|
"learning_rate": 0.00017757839063972997, |
|
"loss": 1.8253, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.2139312404074137, |
|
"learning_rate": 0.00017729929632023472, |
|
"loss": 1.7013, |
|
"step": 1175 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.1764736094502213, |
|
"learning_rate": 0.00017701869792175593, |
|
"loss": 1.8235, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.21944309103277923, |
|
"learning_rate": 0.00017673660090413823, |
|
"loss": 1.8237, |
|
"step": 1185 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.20268987883171422, |
|
"learning_rate": 0.00017645301075638634, |
|
"loss": 1.6992, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.19400968339090352, |
|
"learning_rate": 0.00017616793299655794, |
|
"loss": 1.8662, |
|
"step": 1195 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.18489832863809344, |
|
"learning_rate": 0.00017588137317165657, |
|
"loss": 1.6986, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.17738333103257395, |
|
"learning_rate": 0.0001755933368575235, |
|
"loss": 1.6783, |
|
"step": 1205 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.17926192606119037, |
|
"learning_rate": 0.0001753038296587294, |
|
"loss": 1.7627, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.20194075183870522, |
|
"learning_rate": 0.00017501285720846523, |
|
"loss": 1.7846, |
|
"step": 1215 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.19331786071311133, |
|
"learning_rate": 0.0001747204251684325, |
|
"loss": 1.7143, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.23530188097310437, |
|
"learning_rate": 0.00017442653922873327, |
|
"loss": 1.7296, |
|
"step": 1225 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.17594594764152405, |
|
"learning_rate": 0.0001741312051077594, |
|
"loss": 1.7335, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.20934249136020208, |
|
"learning_rate": 0.00017383442855208124, |
|
"loss": 1.6646, |
|
"step": 1235 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.2111005617028846, |
|
"learning_rate": 0.00017353621533633583, |
|
"loss": 1.5756, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.21413727626671644, |
|
"learning_rate": 0.00017323657126311454, |
|
"loss": 1.4917, |
|
"step": 1245 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.2391299536210697, |
|
"learning_rate": 0.0001729355021628502, |
|
"loss": 1.7283, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.19381232926045663, |
|
"learning_rate": 0.00017263301389370362, |
|
"loss": 1.7907, |
|
"step": 1255 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.21223075585900172, |
|
"learning_rate": 0.0001723291123414495, |
|
"loss": 1.7412, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.18560634331207926, |
|
"learning_rate": 0.00017202380341936212, |
|
"loss": 1.7287, |
|
"step": 1265 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.18941317978765862, |
|
"learning_rate": 0.00017171709306810012, |
|
"loss": 1.5956, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.17108900888850623, |
|
"learning_rate": 0.000171408987255591, |
|
"loss": 1.7789, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.19233373904164977, |
|
"learning_rate": 0.00017109949197691485, |
|
"loss": 1.7397, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.1697480170006848, |
|
"learning_rate": 0.00017078861325418797, |
|
"loss": 1.5765, |
|
"step": 1285 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.17575403691888572, |
|
"learning_rate": 0.00017047635713644528, |
|
"loss": 1.8137, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.19700603487956356, |
|
"learning_rate": 0.00017016272969952304, |
|
"loss": 1.8248, |
|
"step": 1295 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.25577968967800774, |
|
"learning_rate": 0.0001698477370459405, |
|
"loss": 1.5227, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.21182493743068362, |
|
"learning_rate": 0.00016953138530478092, |
|
"loss": 1.6463, |
|
"step": 1305 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.24187008234174068, |
|
"learning_rate": 0.0001692136806315726, |
|
"loss": 1.677, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.23079613874981772, |
|
"learning_rate": 0.00016889462920816902, |
|
"loss": 1.6987, |
|
"step": 1315 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.18959747421576906, |
|
"learning_rate": 0.00016857423724262849, |
|
"loss": 1.6143, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.19193767521915664, |
|
"learning_rate": 0.00016825251096909343, |
|
"loss": 1.6523, |
|
"step": 1325 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.1851789336505185, |
|
"learning_rate": 0.00016792945664766907, |
|
"loss": 1.5728, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 1.01, |
|
"grad_norm": 0.14492627661875204, |
|
"learning_rate": 0.00016760508056430152, |
|
"loss": 1.5701, |
|
"step": 1335 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.2700845196747031, |
|
"learning_rate": 0.0001672793890306556, |
|
"loss": 1.8245, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.1983440671335701, |
|
"learning_rate": 0.00016695238838399206, |
|
"loss": 1.7108, |
|
"step": 1345 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 0.17701113866794518, |
|
"learning_rate": 0.0001666240849870441, |
|
"loss": 1.5517, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.16944238367848802, |
|
"learning_rate": 0.0001662944852278936, |
|
"loss": 1.7263, |
|
"step": 1355 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.20201061964568917, |
|
"learning_rate": 0.00016596359551984704, |
|
"loss": 1.6212, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 1.03, |
|
"grad_norm": 0.16272112017898177, |
|
"learning_rate": 0.0001656314223013104, |
|
"loss": 1.6557, |
|
"step": 1365 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.2050184080142653, |
|
"learning_rate": 0.00016529797203566405, |
|
"loss": 1.6203, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 0.18868029622446703, |
|
"learning_rate": 0.00016496325121113706, |
|
"loss": 1.5994, |
|
"step": 1375 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.18530725289838731, |
|
"learning_rate": 0.00016462726634068075, |
|
"loss": 1.661, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.22254932266214475, |
|
"learning_rate": 0.00016429002396184215, |
|
"loss": 1.5779, |
|
"step": 1385 |
|
}, |
|
{ |
|
"epoch": 1.05, |
|
"grad_norm": 0.35454879952816054, |
|
"learning_rate": 0.00016395153063663667, |
|
"loss": 1.4926, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.2083539962991777, |
|
"learning_rate": 0.00016361179295142046, |
|
"loss": 1.668, |
|
"step": 1395 |
|
}, |
|
{ |
|
"epoch": 1.06, |
|
"grad_norm": 0.20105783428150303, |
|
"learning_rate": 0.00016327081751676227, |
|
"loss": 1.7475, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.19073307130103012, |
|
"learning_rate": 0.0001629286109673148, |
|
"loss": 1.6726, |
|
"step": 1405 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.21132776602726958, |
|
"learning_rate": 0.00016258517996168564, |
|
"loss": 1.745, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 1.07, |
|
"grad_norm": 0.23336177448110548, |
|
"learning_rate": 0.0001622405311823076, |
|
"loss": 1.7185, |
|
"step": 1415 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.19045792239686193, |
|
"learning_rate": 0.00016189467133530884, |
|
"loss": 1.6369, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.1470674402518224, |
|
"learning_rate": 0.0001615476071503823, |
|
"loss": 1.6593, |
|
"step": 1425 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 0.1895764202504411, |
|
"learning_rate": 0.0001611993453806547, |
|
"loss": 1.5879, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.21781610564885606, |
|
"learning_rate": 0.0001608498928025553, |
|
"loss": 1.6377, |
|
"step": 1435 |
|
}, |
|
{ |
|
"epoch": 1.09, |
|
"grad_norm": 0.21082770226036582, |
|
"learning_rate": 0.00016049925621568382, |
|
"loss": 1.5626, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.2288377931408156, |
|
"learning_rate": 0.00016014744244267833, |
|
"loss": 1.7531, |
|
"step": 1445 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.1822265551052057, |
|
"learning_rate": 0.00015979445832908242, |
|
"loss": 1.691, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.23100268355259115, |
|
"learning_rate": 0.00015944031074321204, |
|
"loss": 1.7622, |
|
"step": 1455 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.18624779842903288, |
|
"learning_rate": 0.00015908500657602174, |
|
"loss": 1.5919, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.20357926913176824, |
|
"learning_rate": 0.0001587285527409707, |
|
"loss": 1.6288, |
|
"step": 1465 |
|
}, |
|
{ |
|
"epoch": 1.11, |
|
"grad_norm": 0.20919686630022472, |
|
"learning_rate": 0.00015837095617388827, |
|
"loss": 1.6705, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.1993582841062667, |
|
"learning_rate": 0.0001580122238328387, |
|
"loss": 1.6516, |
|
"step": 1475 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.2547942602076731, |
|
"learning_rate": 0.00015765236269798627, |
|
"loss": 1.5036, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.1807424509361345, |
|
"learning_rate": 0.00015729137977145893, |
|
"loss": 1.6089, |
|
"step": 1485 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.18264437292208377, |
|
"learning_rate": 0.0001569292820772124, |
|
"loss": 1.7353, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 1.13, |
|
"grad_norm": 0.21311009253554458, |
|
"learning_rate": 0.00015656607666089334, |
|
"loss": 1.6574, |
|
"step": 1495 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.18453680363642788, |
|
"learning_rate": 0.0001562017705897024, |
|
"loss": 1.5736, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.23644760312940358, |
|
"learning_rate": 0.00015583637095225656, |
|
"loss": 1.7076, |
|
"step": 1505 |
|
}, |
|
{ |
|
"epoch": 1.14, |
|
"grad_norm": 0.19899933139163767, |
|
"learning_rate": 0.00015546988485845125, |
|
"loss": 1.665, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.23202505382527974, |
|
"learning_rate": 0.0001551023194393221, |
|
"loss": 1.7191, |
|
"step": 1515 |
|
}, |
|
{ |
|
"epoch": 1.15, |
|
"grad_norm": 0.21073033640879407, |
|
"learning_rate": 0.00015473368184690597, |
|
"loss": 1.6123, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.22019036120363472, |
|
"learning_rate": 0.00015436397925410201, |
|
"loss": 1.6909, |
|
"step": 1525 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.20817813902248655, |
|
"learning_rate": 0.00015399321885453202, |
|
"loss": 1.7648, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 0.21714232280510767, |
|
"learning_rate": 0.00015362140786240035, |
|
"loss": 1.6718, |
|
"step": 1535 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.20478633851375716, |
|
"learning_rate": 0.00015324855351235372, |
|
"loss": 1.7586, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 1.17, |
|
"grad_norm": 0.19046880552839732, |
|
"learning_rate": 0.00015287466305934037, |
|
"loss": 1.695, |
|
"step": 1545 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.23309832393442634, |
|
"learning_rate": 0.0001524997437784689, |
|
"loss": 1.584, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.23887396172176847, |
|
"learning_rate": 0.00015212380296486652, |
|
"loss": 1.5742, |
|
"step": 1555 |
|
}, |
|
{ |
|
"epoch": 1.18, |
|
"grad_norm": 0.18128179052277552, |
|
"learning_rate": 0.0001517468479335376, |
|
"loss": 1.6802, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.22086322507654135, |
|
"learning_rate": 0.00015136888601922072, |
|
"loss": 1.7222, |
|
"step": 1565 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.18870219517815454, |
|
"learning_rate": 0.0001509899245762464, |
|
"loss": 1.5664, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 1.19, |
|
"grad_norm": 0.2276718864826248, |
|
"learning_rate": 0.00015060997097839386, |
|
"loss": 1.7565, |
|
"step": 1575 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.20329327239158157, |
|
"learning_rate": 0.00015022903261874748, |
|
"loss": 1.6774, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.18898688137814482, |
|
"learning_rate": 0.00014984711690955297, |
|
"loss": 1.6518, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.22865474882055875, |
|
"learning_rate": 0.00014946423128207322, |
|
"loss": 1.7247, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.21027592834116465, |
|
"learning_rate": 0.00014908038318644373, |
|
"loss": 1.7849, |
|
"step": 1595 |
|
}, |
|
{ |
|
"epoch": 1.21, |
|
"grad_norm": 0.20948671991840284, |
|
"learning_rate": 0.0001486955800915274, |
|
"loss": 1.5386, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.21227729763658884, |
|
"learning_rate": 0.0001483098294847695, |
|
"loss": 1.602, |
|
"step": 1605 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.21630672435558576, |
|
"learning_rate": 0.00014792313887205182, |
|
"loss": 1.6772, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 1.22, |
|
"grad_norm": 0.21541507503873228, |
|
"learning_rate": 0.00014753551577754664, |
|
"loss": 1.6862, |
|
"step": 1615 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.2480903001762983, |
|
"learning_rate": 0.0001471469677435704, |
|
"loss": 1.5916, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 1.23, |
|
"grad_norm": 0.20716645798924263, |
|
"learning_rate": 0.00014675750233043679, |
|
"loss": 1.7072, |
|
"step": 1625 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.22397565488829696, |
|
"learning_rate": 0.00014636712711630978, |
|
"loss": 1.6036, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.19584834615434676, |
|
"learning_rate": 0.00014597584969705616, |
|
"loss": 1.6366, |
|
"step": 1635 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 0.22273274810197669, |
|
"learning_rate": 0.00014558367768609766, |
|
"loss": 1.6545, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.30141032612570196, |
|
"learning_rate": 0.00014519061871426286, |
|
"loss": 1.6668, |
|
"step": 1645 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.2508746414625482, |
|
"learning_rate": 0.0001447966804296387, |
|
"loss": 1.5583, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 1.25, |
|
"grad_norm": 0.2656543660091513, |
|
"learning_rate": 0.00014440187049742165, |
|
"loss": 1.6114, |
|
"step": 1655 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.22762072721537044, |
|
"learning_rate": 0.00014400619659976863, |
|
"loss": 1.5218, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 1.26, |
|
"grad_norm": 0.21625802298436558, |
|
"learning_rate": 0.00014360966643564747, |
|
"loss": 1.6282, |
|
"step": 1665 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.18758356388629857, |
|
"learning_rate": 0.00014321228772068702, |
|
"loss": 1.5724, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.22894089207752852, |
|
"learning_rate": 0.0001428140681870272, |
|
"loss": 1.5875, |
|
"step": 1675 |
|
}, |
|
{ |
|
"epoch": 1.27, |
|
"grad_norm": 0.25952806547918694, |
|
"learning_rate": 0.0001424150155831685, |
|
"loss": 1.6728, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.3304544222948505, |
|
"learning_rate": 0.00014201513767382108, |
|
"loss": 1.6944, |
|
"step": 1685 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.21745874371742022, |
|
"learning_rate": 0.00014161444223975383, |
|
"loss": 1.5649, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.18668861489627886, |
|
"learning_rate": 0.0001412129370776429, |
|
"loss": 1.6646, |
|
"step": 1695 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.2514658628873574, |
|
"learning_rate": 0.00014081062999992005, |
|
"loss": 1.6427, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 1.29, |
|
"grad_norm": 0.23075565689636676, |
|
"learning_rate": 0.0001404075288346206, |
|
"loss": 1.7089, |
|
"step": 1705 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.2005453142298327, |
|
"learning_rate": 0.00014000364142523103, |
|
"loss": 1.7236, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.21925735664261978, |
|
"learning_rate": 0.00013959897563053662, |
|
"loss": 1.7193, |
|
"step": 1715 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.22755950679993744, |
|
"learning_rate": 0.00013919353932446822, |
|
"loss": 1.6178, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.24575725474371382, |
|
"learning_rate": 0.0001387873403959492, |
|
"loss": 1.6914, |
|
"step": 1725 |
|
}, |
|
{ |
|
"epoch": 1.31, |
|
"grad_norm": 0.22868287217989744, |
|
"learning_rate": 0.00013838038674874193, |
|
"loss": 1.6021, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.21889496061933156, |
|
"learning_rate": 0.00013797268630129413, |
|
"loss": 1.8092, |
|
"step": 1735 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.19238702480865116, |
|
"learning_rate": 0.0001375642469865844, |
|
"loss": 1.54, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 0.24437133183257548, |
|
"learning_rate": 0.00013715507675196836, |
|
"loss": 1.5477, |
|
"step": 1745 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.21331661362588805, |
|
"learning_rate": 0.0001367451835590237, |
|
"loss": 1.6229, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.22934227073111574, |
|
"learning_rate": 0.00013633457538339514, |
|
"loss": 1.7056, |
|
"step": 1755 |
|
}, |
|
{ |
|
"epoch": 1.33, |
|
"grad_norm": 0.21991726124527775, |
|
"learning_rate": 0.00013592326021463977, |
|
"loss": 1.7322, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.2279246851535844, |
|
"learning_rate": 0.00013551124605607097, |
|
"loss": 1.5663, |
|
"step": 1765 |
|
}, |
|
{ |
|
"epoch": 1.34, |
|
"grad_norm": 0.21252716182463233, |
|
"learning_rate": 0.00013509854092460312, |
|
"loss": 1.6308, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.19276878334978295, |
|
"learning_rate": 0.0001346851528505954, |
|
"loss": 1.629, |
|
"step": 1775 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.20349606898831232, |
|
"learning_rate": 0.00013427108987769566, |
|
"loss": 1.6323, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 1.35, |
|
"grad_norm": 0.280403908850998, |
|
"learning_rate": 0.00013385636006268368, |
|
"loss": 1.5647, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.204649437629767, |
|
"learning_rate": 0.00013344097147531469, |
|
"loss": 1.6706, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.2355526525352747, |
|
"learning_rate": 0.00013302493219816223, |
|
"loss": 1.6661, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 1.36, |
|
"grad_norm": 0.23955342033240548, |
|
"learning_rate": 0.00013260825032646083, |
|
"loss": 1.7684, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.1782918443154143, |
|
"learning_rate": 0.00013219093396794852, |
|
"loss": 1.7357, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 1.37, |
|
"grad_norm": 0.20676511108669285, |
|
"learning_rate": 0.00013177299124270911, |
|
"loss": 1.7935, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.24468072304122832, |
|
"learning_rate": 0.0001313544302830142, |
|
"loss": 1.6357, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.3442798924803141, |
|
"learning_rate": 0.00013093525923316482, |
|
"loss": 1.7283, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 1.38, |
|
"grad_norm": 0.18543047699982895, |
|
"learning_rate": 0.00013051548624933314, |
|
"loss": 1.6756, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.18961104598393633, |
|
"learning_rate": 0.00013009511949940358, |
|
"loss": 1.6258, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.23772840081980506, |
|
"learning_rate": 0.00012967416716281414, |
|
"loss": 1.6197, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 1.39, |
|
"grad_norm": 0.20599306112898513, |
|
"learning_rate": 0.00012925263743039693, |
|
"loss": 1.6155, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.17872981947947883, |
|
"learning_rate": 0.00012883053850421897, |
|
"loss": 1.817, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.21082979842365093, |
|
"learning_rate": 0.00012840787859742266, |
|
"loss": 1.7045, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.21065592453908275, |
|
"learning_rate": 0.00012798466593406583, |
|
"loss": 1.5825, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.21798103821826761, |
|
"learning_rate": 0.00012756090874896172, |
|
"loss": 1.7622, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 1.41, |
|
"grad_norm": 0.22916268453103483, |
|
"learning_rate": 0.00012713661528751888, |
|
"loss": 1.5324, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.2668875410933402, |
|
"learning_rate": 0.00012671179380558062, |
|
"loss": 1.647, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 1.42, |
|
"grad_norm": 0.19627830855058848, |
|
"learning_rate": 0.00012628645256926438, |
|
"loss": 1.5994, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.21241423084048555, |
|
"learning_rate": 0.0001258605998548009, |
|
"loss": 1.622, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.2546778643093178, |
|
"learning_rate": 0.0001254342439483733, |
|
"loss": 1.6916, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 1.43, |
|
"grad_norm": 0.20610950008732792, |
|
"learning_rate": 0.00012500739314595563, |
|
"loss": 1.7455, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.2219569529434739, |
|
"learning_rate": 0.00012458005575315147, |
|
"loss": 1.6683, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.20787095642170883, |
|
"learning_rate": 0.0001241522400850327, |
|
"loss": 1.6202, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.2275845179745845, |
|
"learning_rate": 0.0001237239544659771, |
|
"loss": 1.8088, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.24655110446766015, |
|
"learning_rate": 0.0001232952072295069, |
|
"loss": 1.5618, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 1.45, |
|
"grad_norm": 0.23084716022254811, |
|
"learning_rate": 0.0001228660067181263, |
|
"loss": 1.7204, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.2420965499906573, |
|
"learning_rate": 0.00012243636128315939, |
|
"loss": 1.5581, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.25054116126933823, |
|
"learning_rate": 0.0001220062792845873, |
|
"loss": 1.5808, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 1.46, |
|
"grad_norm": 0.24876893838844386, |
|
"learning_rate": 0.00012157576909088599, |
|
"loss": 1.6291, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.22724411732153027, |
|
"learning_rate": 0.00012114483907886308, |
|
"loss": 1.7218, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.23781633823944948, |
|
"learning_rate": 0.00012071349763349484, |
|
"loss": 1.6696, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 1.47, |
|
"grad_norm": 0.2611267676195103, |
|
"learning_rate": 0.00012028175314776344, |
|
"loss": 1.7099, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.25342034309056527, |
|
"learning_rate": 0.00011984961402249311, |
|
"loss": 1.6931, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 0.20391686876564638, |
|
"learning_rate": 0.00011941708866618697, |
|
"loss": 1.7043, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.2005457898894919, |
|
"learning_rate": 0.0001189841854948634, |
|
"loss": 1.5758, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.19157508121631642, |
|
"learning_rate": 0.00011855091293189234, |
|
"loss": 1.5831, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 1.49, |
|
"grad_norm": 0.23409302527114853, |
|
"learning_rate": 0.00011811727940783108, |
|
"loss": 1.6668, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.19820344277697435, |
|
"learning_rate": 0.00011768329336026062, |
|
"loss": 1.6894, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.23641920754497897, |
|
"learning_rate": 0.0001172489632336213, |
|
"loss": 1.8362, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.20503090615743924, |
|
"learning_rate": 0.00011681429747904842, |
|
"loss": 1.6885, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.18474233550647523, |
|
"learning_rate": 0.00011637930455420798, |
|
"loss": 1.7196, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 1.51, |
|
"grad_norm": 0.2775657036754379, |
|
"learning_rate": 0.00011594399292313192, |
|
"loss": 1.7362, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.23760102898739513, |
|
"learning_rate": 0.00011550837105605354, |
|
"loss": 1.5986, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.18850041877265183, |
|
"learning_rate": 0.00011507244742924274, |
|
"loss": 1.7116, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.2164959021230041, |
|
"learning_rate": 0.000114636230524841, |
|
"loss": 1.578, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.269300085641628, |
|
"learning_rate": 0.00011419972883069623, |
|
"loss": 1.5605, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 1.53, |
|
"grad_norm": 0.24787445167484887, |
|
"learning_rate": 0.00011376295084019792, |
|
"loss": 1.6663, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.21140623194389616, |
|
"learning_rate": 0.00011332590505211159, |
|
"loss": 1.658, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.25921900302870593, |
|
"learning_rate": 0.00011288859997041353, |
|
"loss": 1.6459, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 1.54, |
|
"grad_norm": 0.2608666502284525, |
|
"learning_rate": 0.00011245104410412537, |
|
"loss": 1.6928, |
|
"step": 2035 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.22406449938146802, |
|
"learning_rate": 0.00011201324596714844, |
|
"loss": 1.4791, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.19647960391415928, |
|
"learning_rate": 0.00011157521407809815, |
|
"loss": 1.698, |
|
"step": 2045 |
|
}, |
|
{ |
|
"epoch": 1.55, |
|
"grad_norm": 0.1897962583849219, |
|
"learning_rate": 0.00011113695696013824, |
|
"loss": 1.8167, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.20712759197533817, |
|
"learning_rate": 0.0001106984831408149, |
|
"loss": 1.7501, |
|
"step": 2055 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 0.23079961827033185, |
|
"learning_rate": 0.00011025980115189086, |
|
"loss": 1.5934, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.22104873487185864, |
|
"learning_rate": 0.00010982091952917943, |
|
"loss": 1.6686, |
|
"step": 2065 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.20639504694734737, |
|
"learning_rate": 0.00010938184681237833, |
|
"loss": 1.7136, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 1.57, |
|
"grad_norm": 0.2417721960073701, |
|
"learning_rate": 0.00010894259154490354, |
|
"loss": 1.6702, |
|
"step": 2075 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.21810729625691397, |
|
"learning_rate": 0.00010850316227372312, |
|
"loss": 1.7477, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.23170201171415503, |
|
"learning_rate": 0.00010806356754919091, |
|
"loss": 1.6943, |
|
"step": 2085 |
|
}, |
|
{ |
|
"epoch": 1.58, |
|
"grad_norm": 0.22093119739393355, |
|
"learning_rate": 0.00010762381592488002, |
|
"loss": 1.623, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.21034721922753088, |
|
"learning_rate": 0.00010718391595741657, |
|
"loss": 1.6084, |
|
"step": 2095 |
|
}, |
|
{ |
|
"epoch": 1.59, |
|
"grad_norm": 0.22443726771939806, |
|
"learning_rate": 0.00010674387620631308, |
|
"loss": 1.5536, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.22568508558473213, |
|
"learning_rate": 0.00010630370523380202, |
|
"loss": 1.469, |
|
"step": 2105 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.3332888137498032, |
|
"learning_rate": 0.00010586341160466904, |
|
"loss": 1.6488, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.2129808005413702, |
|
"learning_rate": 0.00010542300388608652, |
|
"loss": 1.6101, |
|
"step": 2115 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.20553693555408575, |
|
"learning_rate": 0.00010498249064744679, |
|
"loss": 1.4872, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.2445112542992352, |
|
"learning_rate": 0.00010454188046019524, |
|
"loss": 1.7005, |
|
"step": 2125 |
|
}, |
|
{ |
|
"epoch": 1.61, |
|
"grad_norm": 0.20844778510756687, |
|
"learning_rate": 0.00010410118189766387, |
|
"loss": 1.5589, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.2223212290874802, |
|
"learning_rate": 0.0001036604035349041, |
|
"loss": 1.6621, |
|
"step": 2135 |
|
}, |
|
{ |
|
"epoch": 1.62, |
|
"grad_norm": 0.20479585313872112, |
|
"learning_rate": 0.00010321955394852018, |
|
"loss": 1.7061, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.17606184812861142, |
|
"learning_rate": 0.0001027786417165022, |
|
"loss": 1.5607, |
|
"step": 2145 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.2676349610853098, |
|
"learning_rate": 0.0001023376754180592, |
|
"loss": 1.6232, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 1.63, |
|
"grad_norm": 0.2068560787418325, |
|
"learning_rate": 0.00010189666363345223, |
|
"loss": 1.5724, |
|
"step": 2155 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.19641973239797275, |
|
"learning_rate": 0.00010145561494382742, |
|
"loss": 1.5305, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 1.64, |
|
"grad_norm": 0.2574797520893005, |
|
"learning_rate": 0.00010101453793104898, |
|
"loss": 1.6025, |
|
"step": 2165 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.2827194584842853, |
|
"learning_rate": 0.00010057344117753222, |
|
"loss": 1.5882, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.19936180521947827, |
|
"learning_rate": 0.00010013233326607661, |
|
"loss": 1.5706, |
|
"step": 2175 |
|
}, |
|
{ |
|
"epoch": 1.65, |
|
"grad_norm": 0.21819696462759022, |
|
"learning_rate": 9.969122277969865e-05, |
|
"loss": 1.6623, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.225417352707018, |
|
"learning_rate": 9.9250118301465e-05, |
|
"loss": 1.6255, |
|
"step": 2185 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.3143651738447285, |
|
"learning_rate": 9.880902841432544e-05, |
|
"loss": 1.4905, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 1.66, |
|
"grad_norm": 0.23749234423783855, |
|
"learning_rate": 9.836796170094571e-05, |
|
"loss": 1.6156, |
|
"step": 2195 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.23579593383210742, |
|
"learning_rate": 9.792692674354079e-05, |
|
"loss": 1.6963, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 1.67, |
|
"grad_norm": 0.2032329245708717, |
|
"learning_rate": 9.748593212370773e-05, |
|
"loss": 1.6733, |
|
"step": 2205 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.20661047812325195, |
|
"learning_rate": 9.704498642225856e-05, |
|
"loss": 1.622, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.18970352315906064, |
|
"learning_rate": 9.660409821905363e-05, |
|
"loss": 1.7834, |
|
"step": 2215 |
|
}, |
|
{ |
|
"epoch": 1.68, |
|
"grad_norm": 0.17832580771616308, |
|
"learning_rate": 9.616327609283445e-05, |
|
"loss": 1.6989, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.21859704299949706, |
|
"learning_rate": 9.572252862105673e-05, |
|
"loss": 1.7946, |
|
"step": 2225 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.24897942412148671, |
|
"learning_rate": 9.528186437972368e-05, |
|
"loss": 1.564, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 1.69, |
|
"grad_norm": 0.20109922805508615, |
|
"learning_rate": 9.484129194321896e-05, |
|
"loss": 1.6594, |
|
"step": 2235 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.19546463521855884, |
|
"learning_rate": 9.440081988413987e-05, |
|
"loss": 1.542, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.20254596218430737, |
|
"learning_rate": 9.396045677313067e-05, |
|
"loss": 1.8142, |
|
"step": 2245 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.1936135057396683, |
|
"learning_rate": 9.352021117871574e-05, |
|
"loss": 1.5564, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.2096445430714542, |
|
"learning_rate": 9.308009166713263e-05, |
|
"loss": 1.6735, |
|
"step": 2255 |
|
}, |
|
{ |
|
"epoch": 1.71, |
|
"grad_norm": 0.22672329152194862, |
|
"learning_rate": 9.264010680216583e-05, |
|
"loss": 1.6761, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.24482242735211057, |
|
"learning_rate": 9.220026514497983e-05, |
|
"loss": 1.5988, |
|
"step": 2265 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.24736418279884478, |
|
"learning_rate": 9.176057525395252e-05, |
|
"loss": 1.5844, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 0.1987944867199659, |
|
"learning_rate": 9.132104568450879e-05, |
|
"loss": 1.6997, |
|
"step": 2275 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.1850913674566201, |
|
"learning_rate": 9.088168498895408e-05, |
|
"loss": 1.5696, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 1.73, |
|
"grad_norm": 0.24393794217168674, |
|
"learning_rate": 9.044250171630778e-05, |
|
"loss": 1.7403, |
|
"step": 2285 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.19475525279873163, |
|
"learning_rate": 9.000350441213708e-05, |
|
"loss": 1.5984, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.2218761532729913, |
|
"learning_rate": 8.956470161839072e-05, |
|
"loss": 1.6681, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 1.74, |
|
"grad_norm": 0.24957778768532196, |
|
"learning_rate": 8.912610187323248e-05, |
|
"loss": 1.6169, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.2510725868859042, |
|
"learning_rate": 8.868771371087539e-05, |
|
"loss": 1.639, |
|
"step": 2305 |
|
}, |
|
{ |
|
"epoch": 1.75, |
|
"grad_norm": 0.19643293153400068, |
|
"learning_rate": 8.82495456614155e-05, |
|
"loss": 1.7237, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.26450396919742597, |
|
"learning_rate": 8.781160625066588e-05, |
|
"loss": 1.6528, |
|
"step": 2315 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.22179001551390587, |
|
"learning_rate": 8.737390399999086e-05, |
|
"loss": 1.5533, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.2346687653947156, |
|
"learning_rate": 8.693644742614017e-05, |
|
"loss": 1.6104, |
|
"step": 2325 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.25806483606045055, |
|
"learning_rate": 8.649924504108302e-05, |
|
"loss": 1.6052, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.1786075330646357, |
|
"learning_rate": 8.606230535184283e-05, |
|
"loss": 1.5603, |
|
"step": 2335 |
|
}, |
|
{ |
|
"epoch": 1.77, |
|
"grad_norm": 0.254068816191818, |
|
"learning_rate": 8.562563686033145e-05, |
|
"loss": 1.7643, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.21344041020108453, |
|
"learning_rate": 8.518924806318378e-05, |
|
"loss": 1.6584, |
|
"step": 2345 |
|
}, |
|
{ |
|
"epoch": 1.78, |
|
"grad_norm": 0.2082041629797306, |
|
"learning_rate": 8.47531474515925e-05, |
|
"loss": 1.7992, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.2645099180130053, |
|
"learning_rate": 8.431734351114284e-05, |
|
"loss": 1.6361, |
|
"step": 2355 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.22698336003173047, |
|
"learning_rate": 8.388184472164736e-05, |
|
"loss": 1.646, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 1.79, |
|
"grad_norm": 0.24003288864061173, |
|
"learning_rate": 8.34466595569811e-05, |
|
"loss": 1.6379, |
|
"step": 2365 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.19443085064409085, |
|
"learning_rate": 8.301179648491669e-05, |
|
"loss": 1.73, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.24311509067570025, |
|
"learning_rate": 8.257726396695933e-05, |
|
"loss": 1.6802, |
|
"step": 2375 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.24648929428851593, |
|
"learning_rate": 8.214307045818254e-05, |
|
"loss": 1.7708, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.1940516179768531, |
|
"learning_rate": 8.17092244070634e-05, |
|
"loss": 1.5857, |
|
"step": 2385 |
|
}, |
|
{ |
|
"epoch": 1.81, |
|
"grad_norm": 0.2361070277608161, |
|
"learning_rate": 8.127573425531814e-05, |
|
"loss": 1.6411, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.2835364928454071, |
|
"learning_rate": 8.084260843773799e-05, |
|
"loss": 1.7818, |
|
"step": 2395 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.18047213778922655, |
|
"learning_rate": 8.040985538202505e-05, |
|
"loss": 1.587, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 1.82, |
|
"grad_norm": 0.21940093931140764, |
|
"learning_rate": 7.997748350862822e-05, |
|
"loss": 1.6795, |
|
"step": 2405 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.20557324059132212, |
|
"learning_rate": 7.954550123057939e-05, |
|
"loss": 1.638, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.23522437885683956, |
|
"learning_rate": 7.911391695332988e-05, |
|
"loss": 1.6176, |
|
"step": 2415 |
|
}, |
|
{ |
|
"epoch": 1.83, |
|
"grad_norm": 0.20227659422834685, |
|
"learning_rate": 7.868273907458661e-05, |
|
"loss": 1.5562, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.17957107180807144, |
|
"learning_rate": 7.825197598414895e-05, |
|
"loss": 1.6577, |
|
"step": 2425 |
|
}, |
|
{ |
|
"epoch": 1.84, |
|
"grad_norm": 0.21134479099989728, |
|
"learning_rate": 7.782163606374536e-05, |
|
"loss": 1.5407, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.2190101821746382, |
|
"learning_rate": 7.739172768687028e-05, |
|
"loss": 1.6901, |
|
"step": 2435 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.22909832831883262, |
|
"learning_rate": 7.696225921862126e-05, |
|
"loss": 1.6517, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 1.85, |
|
"grad_norm": 0.1922087104118847, |
|
"learning_rate": 7.653323901553625e-05, |
|
"loss": 1.5558, |
|
"step": 2445 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.2535390902934386, |
|
"learning_rate": 7.610467542543073e-05, |
|
"loss": 1.7802, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 1.86, |
|
"grad_norm": 0.20264859592749507, |
|
"learning_rate": 7.567657678723565e-05, |
|
"loss": 1.6141, |
|
"step": 2455 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.2534081482654566, |
|
"learning_rate": 7.52489514308349e-05, |
|
"loss": 1.6593, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.24401202904206418, |
|
"learning_rate": 7.482180767690334e-05, |
|
"loss": 1.5982, |
|
"step": 2465 |
|
}, |
|
{ |
|
"epoch": 1.87, |
|
"grad_norm": 0.2805376490259695, |
|
"learning_rate": 7.439515383674485e-05, |
|
"loss": 1.7126, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.24585333566417664, |
|
"learning_rate": 7.396899821213072e-05, |
|
"loss": 1.5644, |
|
"step": 2475 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.22491029483008115, |
|
"learning_rate": 7.354334909513791e-05, |
|
"loss": 1.6765, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 0.23458997274256846, |
|
"learning_rate": 7.311821476798789e-05, |
|
"loss": 1.6122, |
|
"step": 2485 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.17595992796667512, |
|
"learning_rate": 7.269360350288547e-05, |
|
"loss": 1.8356, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 1.89, |
|
"grad_norm": 0.18759163970832302, |
|
"learning_rate": 7.226952356185765e-05, |
|
"loss": 1.4984, |
|
"step": 2495 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.236927434671597, |
|
"learning_rate": 7.184598319659317e-05, |
|
"loss": 1.6798, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.26802038257147875, |
|
"learning_rate": 7.142299064828169e-05, |
|
"loss": 1.5844, |
|
"step": 2505 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.1751974293734832, |
|
"learning_rate": 7.100055414745346e-05, |
|
"loss": 1.6365, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.23254005323825433, |
|
"learning_rate": 7.057868191381936e-05, |
|
"loss": 1.4657, |
|
"step": 2515 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.264348812986722, |
|
"learning_rate": 7.015738215611079e-05, |
|
"loss": 1.7816, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 1.91, |
|
"grad_norm": 0.27530320883320614, |
|
"learning_rate": 6.973666307191996e-05, |
|
"loss": 1.6751, |
|
"step": 2525 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.19339613251333393, |
|
"learning_rate": 6.931653284754042e-05, |
|
"loss": 1.7293, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 0.2151392309486146, |
|
"learning_rate": 6.889699965780787e-05, |
|
"loss": 1.7334, |
|
"step": 2535 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.22448766537331677, |
|
"learning_rate": 6.847807166594083e-05, |
|
"loss": 1.6827, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.2286115948636003, |
|
"learning_rate": 6.805975702338208e-05, |
|
"loss": 1.6562, |
|
"step": 2545 |
|
}, |
|
{ |
|
"epoch": 1.93, |
|
"grad_norm": 0.2118908130790939, |
|
"learning_rate": 6.764206386963991e-05, |
|
"loss": 1.6091, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.240925966059138, |
|
"learning_rate": 6.722500033212974e-05, |
|
"loss": 1.6314, |
|
"step": 2555 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.2271694074825516, |
|
"learning_rate": 6.680857452601598e-05, |
|
"loss": 1.7589, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 1.94, |
|
"grad_norm": 0.2168118018671656, |
|
"learning_rate": 6.639279455405432e-05, |
|
"loss": 1.6201, |
|
"step": 2565 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.21224810091098364, |
|
"learning_rate": 6.597766850643361e-05, |
|
"loss": 1.5842, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 1.95, |
|
"grad_norm": 0.19581859607212743, |
|
"learning_rate": 6.556320446061902e-05, |
|
"loss": 1.5586, |
|
"step": 2575 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.20327112477714954, |
|
"learning_rate": 6.514941048119435e-05, |
|
"loss": 1.6303, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.22810086515914976, |
|
"learning_rate": 6.47362946197055e-05, |
|
"loss": 1.7332, |
|
"step": 2585 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 0.22278333474431392, |
|
"learning_rate": 6.432386491450361e-05, |
|
"loss": 1.6293, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.23128655487134384, |
|
"learning_rate": 6.391212939058861e-05, |
|
"loss": 1.6937, |
|
"step": 2595 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.24641830926598107, |
|
"learning_rate": 6.350109605945323e-05, |
|
"loss": 1.4982, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 1.97, |
|
"grad_norm": 0.24123146757419323, |
|
"learning_rate": 6.309077291892702e-05, |
|
"loss": 1.5107, |
|
"step": 2605 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.24138969338364216, |
|
"learning_rate": 6.268116795302068e-05, |
|
"loss": 1.5448, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 1.98, |
|
"grad_norm": 0.2515434111696446, |
|
"learning_rate": 6.227228913177081e-05, |
|
"loss": 1.559, |
|
"step": 2615 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.2554427971564699, |
|
"learning_rate": 6.186414441108487e-05, |
|
"loss": 1.6211, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.20773791558688393, |
|
"learning_rate": 6.14567417325861e-05, |
|
"loss": 1.6058, |
|
"step": 2625 |
|
}, |
|
{ |
|
"epoch": 1.99, |
|
"grad_norm": 0.20109572317054908, |
|
"learning_rate": 6.105008902345935e-05, |
|
"loss": 1.5911, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.21186779196561445, |
|
"learning_rate": 6.064419419629662e-05, |
|
"loss": 1.6227, |
|
"step": 2635 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.2150487580932417, |
|
"learning_rate": 6.023906514894313e-05, |
|
"loss": 1.5839, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 0.24636199955981808, |
|
"learning_rate": 5.983470976434369e-05, |
|
"loss": 1.5764, |
|
"step": 2645 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 0.22093610448062864, |
|
"learning_rate": 5.943113591038928e-05, |
|
"loss": 1.7157, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 2.01, |
|
"grad_norm": 0.21359568862552614, |
|
"learning_rate": 5.902835143976393e-05, |
|
"loss": 1.6359, |
|
"step": 2655 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.2219633405623727, |
|
"learning_rate": 5.862636418979198e-05, |
|
"loss": 1.6484, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.24148935530595134, |
|
"learning_rate": 5.822518198228565e-05, |
|
"loss": 1.52, |
|
"step": 2665 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 0.22871052628894134, |
|
"learning_rate": 5.782481262339261e-05, |
|
"loss": 1.5583, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 0.18016152517949127, |
|
"learning_rate": 5.742526390344427e-05, |
|
"loss": 1.7094, |
|
"step": 2675 |
|
}, |
|
{ |
|
"epoch": 2.03, |
|
"grad_norm": 0.27927714573640977, |
|
"learning_rate": 5.702654359680428e-05, |
|
"loss": 1.7229, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.20272089890919007, |
|
"learning_rate": 5.662865946171696e-05, |
|
"loss": 1.7436, |
|
"step": 2685 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.25187946618078394, |
|
"learning_rate": 5.6231619240156694e-05, |
|
"loss": 1.5926, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 0.23619447456603418, |
|
"learning_rate": 5.5835430657676976e-05, |
|
"loss": 1.5177, |
|
"step": 2695 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.23076862233533377, |
|
"learning_rate": 5.544010142326026e-05, |
|
"loss": 1.6432, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.2509266079111979, |
|
"learning_rate": 5.504563922916799e-05, |
|
"loss": 1.6125, |
|
"step": 2705 |
|
}, |
|
{ |
|
"epoch": 2.05, |
|
"grad_norm": 0.26527998507107736, |
|
"learning_rate": 5.4652051750790825e-05, |
|
"loss": 1.5384, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 0.24254486560490685, |
|
"learning_rate": 5.425934664649921e-05, |
|
"loss": 1.6641, |
|
"step": 2715 |
|
}, |
|
{ |
|
"epoch": 2.06, |
|
"grad_norm": 0.22497341374372068, |
|
"learning_rate": 5.3867531557494674e-05, |
|
"loss": 1.4442, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.22811203680708553, |
|
"learning_rate": 5.347661410766087e-05, |
|
"loss": 1.6313, |
|
"step": 2725 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.2193211927138723, |
|
"learning_rate": 5.308660190341528e-05, |
|
"loss": 1.4835, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 2.07, |
|
"grad_norm": 0.23158894991713072, |
|
"learning_rate": 5.2697502533561226e-05, |
|
"loss": 1.5765, |
|
"step": 2735 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.2160152191509828, |
|
"learning_rate": 5.230932356914032e-05, |
|
"loss": 1.6395, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.23138300560468752, |
|
"learning_rate": 5.1922072563284986e-05, |
|
"loss": 1.6645, |
|
"step": 2745 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 0.27219186986752913, |
|
"learning_rate": 5.153575705107152e-05, |
|
"loss": 1.5842, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 0.24365055871265076, |
|
"learning_rate": 5.115038454937362e-05, |
|
"loss": 1.7234, |
|
"step": 2755 |
|
}, |
|
{ |
|
"epoch": 2.09, |
|
"grad_norm": 0.22921672259925305, |
|
"learning_rate": 5.076596255671592e-05, |
|
"loss": 1.5756, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.2538431765730713, |
|
"learning_rate": 5.0382498553128265e-05, |
|
"loss": 1.6491, |
|
"step": 2765 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.25913968900209966, |
|
"learning_rate": 5.000000000000002e-05, |
|
"loss": 1.5438, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 2.1, |
|
"grad_norm": 0.291257818004918, |
|
"learning_rate": 4.9618474339934916e-05, |
|
"loss": 1.5995, |
|
"step": 2775 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 0.24432948267207238, |
|
"learning_rate": 4.9237928996606384e-05, |
|
"loss": 1.5999, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 2.11, |
|
"grad_norm": 0.26418330324646966, |
|
"learning_rate": 4.88583713746129e-05, |
|
"loss": 1.7175, |
|
"step": 2785 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.2647804130194954, |
|
"learning_rate": 4.8479808859333964e-05, |
|
"loss": 1.5083, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.23990236642151055, |
|
"learning_rate": 4.810224881678652e-05, |
|
"loss": 1.5032, |
|
"step": 2795 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 0.22406476212806528, |
|
"learning_rate": 4.772569859348156e-05, |
|
"loss": 1.6183, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 0.17599248862626268, |
|
"learning_rate": 4.735016551628095e-05, |
|
"loss": 1.694, |
|
"step": 2805 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 0.27545889362059484, |
|
"learning_rate": 4.697565689225528e-05, |
|
"loss": 1.6074, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 2.13, |
|
"grad_norm": 0.27997532830437954, |
|
"learning_rate": 4.660218000854143e-05, |
|
"loss": 1.5062, |
|
"step": 2815 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 0.2803170335965896, |
|
"learning_rate": 4.6229742132200746e-05, |
|
"loss": 1.6516, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 2.14, |
|
"grad_norm": 0.22582531196940026, |
|
"learning_rate": 4.585835051007774e-05, |
|
"loss": 1.6168, |
|
"step": 2825 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.22856148303418752, |
|
"learning_rate": 4.548801236865912e-05, |
|
"loss": 1.5435, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.2764784030904549, |
|
"learning_rate": 4.511873491393304e-05, |
|
"loss": 1.6409, |
|
"step": 2835 |
|
}, |
|
{ |
|
"epoch": 2.15, |
|
"grad_norm": 0.21257264261069672, |
|
"learning_rate": 4.475052533124893e-05, |
|
"loss": 1.5581, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 0.21196439275175047, |
|
"learning_rate": 4.438339078517785e-05, |
|
"loss": 1.5538, |
|
"step": 2845 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 0.2832145647608719, |
|
"learning_rate": 4.401733841937279e-05, |
|
"loss": 1.724, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 0.27147849615384506, |
|
"learning_rate": 4.3652375356429974e-05, |
|
"loss": 1.5014, |
|
"step": 2855 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 0.2610576760484019, |
|
"learning_rate": 4.328850869775001e-05, |
|
"loss": 1.6749, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 2.17, |
|
"grad_norm": 0.23914287887699434, |
|
"learning_rate": 4.292574552339981e-05, |
|
"loss": 1.5328, |
|
"step": 2865 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 0.24065502762902322, |
|
"learning_rate": 4.256409289197495e-05, |
|
"loss": 1.5942, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 0.2083191016158885, |
|
"learning_rate": 4.2203557840462214e-05, |
|
"loss": 1.5539, |
|
"step": 2875 |
|
}, |
|
{ |
|
"epoch": 2.18, |
|
"grad_norm": 0.20639182389301813, |
|
"learning_rate": 4.184414738410248e-05, |
|
"loss": 1.5646, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 0.23727403239283584, |
|
"learning_rate": 4.148586851625461e-05, |
|
"loss": 1.5353, |
|
"step": 2885 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 0.24508287577637505, |
|
"learning_rate": 4.112872820825915e-05, |
|
"loss": 1.4418, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 2.19, |
|
"grad_norm": 0.2475936795575314, |
|
"learning_rate": 4.077273340930263e-05, |
|
"loss": 1.6643, |
|
"step": 2895 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.2505899184192717, |
|
"learning_rate": 4.041789104628241e-05, |
|
"loss": 1.5577, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.24093576954008833, |
|
"learning_rate": 4.006420802367205e-05, |
|
"loss": 1.6784, |
|
"step": 2905 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 0.2561236323684272, |
|
"learning_rate": 3.971169122338668e-05, |
|
"loss": 1.6165, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 0.24280603594696593, |
|
"learning_rate": 3.936034750464927e-05, |
|
"loss": 1.6695, |
|
"step": 2915 |
|
}, |
|
{ |
|
"epoch": 2.21, |
|
"grad_norm": 0.2602730047803284, |
|
"learning_rate": 3.901018370385724e-05, |
|
"loss": 1.5697, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 0.21146640994821633, |
|
"learning_rate": 3.866120663444914e-05, |
|
"loss": 1.5399, |
|
"step": 2925 |
|
}, |
|
{ |
|
"epoch": 2.22, |
|
"grad_norm": 0.24075711387924426, |
|
"learning_rate": 3.831342308677247e-05, |
|
"loss": 1.5597, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 0.24793331779495362, |
|
"learning_rate": 3.7966839827951196e-05, |
|
"loss": 1.6434, |
|
"step": 2935 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 0.19558506394109187, |
|
"learning_rate": 3.762146360175427e-05, |
|
"loss": 1.6499, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 2.23, |
|
"grad_norm": 0.35587028915030966, |
|
"learning_rate": 3.727730112846444e-05, |
|
"loss": 1.5089, |
|
"step": 2945 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.2570330063437446, |
|
"learning_rate": 3.693435910474732e-05, |
|
"loss": 1.6548, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.28077059284475103, |
|
"learning_rate": 3.659264420352122e-05, |
|
"loss": 1.6528, |
|
"step": 2955 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 0.23035257395244374, |
|
"learning_rate": 3.6252163073827294e-05, |
|
"loss": 1.4482, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.2051186918638722, |
|
"learning_rate": 3.5912922340700206e-05, |
|
"loss": 1.5015, |
|
"step": 2965 |
|
}, |
|
{ |
|
"epoch": 2.25, |
|
"grad_norm": 0.22455945185810877, |
|
"learning_rate": 3.557492860503893e-05, |
|
"loss": 1.5176, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.23453638209680727, |
|
"learning_rate": 3.5238188443478795e-05, |
|
"loss": 1.6343, |
|
"step": 2975 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.24470156257503126, |
|
"learning_rate": 3.4902708408263066e-05, |
|
"loss": 1.7663, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 2.26, |
|
"grad_norm": 0.23135832322132918, |
|
"learning_rate": 3.45684950271158e-05, |
|
"loss": 1.5837, |
|
"step": 2985 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.2608640064079802, |
|
"learning_rate": 3.423555480311457e-05, |
|
"loss": 1.6173, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.31078928098679404, |
|
"learning_rate": 3.3903894214564026e-05, |
|
"loss": 1.5177, |
|
"step": 2995 |
|
}, |
|
{ |
|
"epoch": 2.27, |
|
"grad_norm": 0.26258430453244713, |
|
"learning_rate": 3.3573519714869914e-05, |
|
"loss": 1.6865, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 0.2733284038434726, |
|
"learning_rate": 3.324443773241349e-05, |
|
"loss": 1.3619, |
|
"step": 3005 |
|
}, |
|
{ |
|
"epoch": 2.28, |
|
"grad_norm": 0.2369163548191094, |
|
"learning_rate": 3.291665467042618e-05, |
|
"loss": 1.6509, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 0.2664340527286697, |
|
"learning_rate": 3.25901769068654e-05, |
|
"loss": 1.6038, |
|
"step": 3015 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 0.23398120063750877, |
|
"learning_rate": 3.2265010794290195e-05, |
|
"loss": 1.663, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 2.29, |
|
"grad_norm": 0.2781275708933271, |
|
"learning_rate": 3.1941162659737647e-05, |
|
"loss": 1.6429, |
|
"step": 3025 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.2687866606825216, |
|
"learning_rate": 3.16186388045998e-05, |
|
"loss": 1.6853, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.23644485510225058, |
|
"learning_rate": 3.129744550450113e-05, |
|
"loss": 1.6027, |
|
"step": 3035 |
|
}, |
|
{ |
|
"epoch": 2.3, |
|
"grad_norm": 0.24644290933624716, |
|
"learning_rate": 3.09775890091763e-05, |
|
"loss": 1.6018, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.2259139537131363, |
|
"learning_rate": 3.065907554234858e-05, |
|
"loss": 1.6607, |
|
"step": 3045 |
|
}, |
|
{ |
|
"epoch": 2.31, |
|
"grad_norm": 0.24004959008038543, |
|
"learning_rate": 3.034191130160887e-05, |
|
"loss": 1.5377, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.2213008661812979, |
|
"learning_rate": 3.0026102458294924e-05, |
|
"loss": 1.5613, |
|
"step": 3055 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.2079094581228579, |
|
"learning_rate": 2.9711655157371443e-05, |
|
"loss": 1.5085, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 0.2527748569210639, |
|
"learning_rate": 2.9398575517310355e-05, |
|
"loss": 1.5855, |
|
"step": 3065 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.2141370968928817, |
|
"learning_rate": 2.9086869629971836e-05, |
|
"loss": 1.5732, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 2.33, |
|
"grad_norm": 0.24493685886391817, |
|
"learning_rate": 2.8776543560485857e-05, |
|
"loss": 1.6197, |
|
"step": 3075 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.2316788505534105, |
|
"learning_rate": 2.8467603347133997e-05, |
|
"loss": 1.648, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.25146411778731, |
|
"learning_rate": 2.816005500123203e-05, |
|
"loss": 1.5525, |
|
"step": 3085 |
|
}, |
|
{ |
|
"epoch": 2.34, |
|
"grad_norm": 0.22407696629199808, |
|
"learning_rate": 2.785390450701303e-05, |
|
"loss": 1.7218, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.27013300544460844, |
|
"learning_rate": 2.7549157821510885e-05, |
|
"loss": 1.5804, |
|
"step": 3095 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.25388595748221704, |
|
"learning_rate": 2.7245820874444272e-05, |
|
"loss": 1.7398, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 2.35, |
|
"grad_norm": 0.19843759758285218, |
|
"learning_rate": 2.6943899568101405e-05, |
|
"loss": 1.6999, |
|
"step": 3105 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 0.20783915655026464, |
|
"learning_rate": 2.6643399777225232e-05, |
|
"loss": 1.6114, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 0.2496800397125067, |
|
"learning_rate": 2.6344327348898958e-05, |
|
"loss": 1.5217, |
|
"step": 3115 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.22235249882770752, |
|
"learning_rate": 2.6046688102432382e-05, |
|
"loss": 1.6871, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.2462186333352102, |
|
"learning_rate": 2.5750487829248726e-05, |
|
"loss": 1.7788, |
|
"step": 3125 |
|
}, |
|
{ |
|
"epoch": 2.37, |
|
"grad_norm": 0.20018170839209692, |
|
"learning_rate": 2.545573229277175e-05, |
|
"loss": 1.6076, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 0.2704237119402894, |
|
"learning_rate": 2.5162427228313857e-05, |
|
"loss": 1.6456, |
|
"step": 3135 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 0.2735737465777087, |
|
"learning_rate": 2.4870578342964245e-05, |
|
"loss": 1.6402, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 2.38, |
|
"grad_norm": 0.2188413596766906, |
|
"learning_rate": 2.458019131547803e-05, |
|
"loss": 1.5193, |
|
"step": 3145 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.2821633184600081, |
|
"learning_rate": 2.429127179616575e-05, |
|
"loss": 1.6363, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 2.39, |
|
"grad_norm": 0.20714886526036308, |
|
"learning_rate": 2.4003825406783308e-05, |
|
"loss": 1.669, |
|
"step": 3155 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.2661408497359453, |
|
"learning_rate": 2.3717857740422644e-05, |
|
"loss": 1.5488, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.2535527034852724, |
|
"learning_rate": 2.343337436140295e-05, |
|
"loss": 1.5851, |
|
"step": 3165 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.2629746106882043, |
|
"learning_rate": 2.3150380805162418e-05, |
|
"loss": 1.5467, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 0.24098285831571226, |
|
"learning_rate": 2.2868882578150285e-05, |
|
"loss": 1.6417, |
|
"step": 3175 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 0.28638431202213366, |
|
"learning_rate": 2.258888515772005e-05, |
|
"loss": 1.6915, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 2.41, |
|
"grad_norm": 0.319171053435643, |
|
"learning_rate": 2.2310393992022704e-05, |
|
"loss": 1.6324, |
|
"step": 3185 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.2054749944090956, |
|
"learning_rate": 2.2033414499900685e-05, |
|
"loss": 1.5694, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 2.42, |
|
"grad_norm": 0.2515694982134836, |
|
"learning_rate": 2.1757952070782504e-05, |
|
"loss": 1.598, |
|
"step": 3195 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 0.23267628383812705, |
|
"learning_rate": 2.148401206457793e-05, |
|
"loss": 1.4513, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 0.25390773868938254, |
|
"learning_rate": 2.121159981157359e-05, |
|
"loss": 1.5906, |
|
"step": 3205 |
|
}, |
|
{ |
|
"epoch": 2.43, |
|
"grad_norm": 0.251154990702733, |
|
"learning_rate": 2.0940720612329258e-05, |
|
"loss": 1.4707, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.24909067323121328, |
|
"learning_rate": 2.067137973757489e-05, |
|
"loss": 1.6214, |
|
"step": 3215 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 0.23515254331621996, |
|
"learning_rate": 2.0403582428107792e-05, |
|
"loss": 1.3762, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.24320094875542947, |
|
"learning_rate": 2.0137333894690912e-05, |
|
"loss": 1.4732, |
|
"step": 3225 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.26976839590657536, |
|
"learning_rate": 1.987263931795126e-05, |
|
"loss": 1.5325, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 2.45, |
|
"grad_norm": 0.2480855244121356, |
|
"learning_rate": 1.9609503848279144e-05, |
|
"loss": 1.6336, |
|
"step": 3235 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.23767732175608752, |
|
"learning_rate": 1.9347932605728093e-05, |
|
"loss": 1.564, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.2727265524309786, |
|
"learning_rate": 1.9087930679915023e-05, |
|
"loss": 1.6079, |
|
"step": 3245 |
|
}, |
|
{ |
|
"epoch": 2.46, |
|
"grad_norm": 0.22286517973459688, |
|
"learning_rate": 1.882950312992131e-05, |
|
"loss": 1.4002, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.2456900771009275, |
|
"learning_rate": 1.8572654984194392e-05, |
|
"loss": 1.5994, |
|
"step": 3255 |
|
}, |
|
{ |
|
"epoch": 2.47, |
|
"grad_norm": 0.2771873738066393, |
|
"learning_rate": 1.8317391240449876e-05, |
|
"loss": 1.6214, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.2770007292533942, |
|
"learning_rate": 1.8063716865574266e-05, |
|
"loss": 1.4663, |
|
"step": 3265 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.24034756553369535, |
|
"learning_rate": 1.781163679552831e-05, |
|
"loss": 1.6507, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 0.2286386450562912, |
|
"learning_rate": 1.7561155935251094e-05, |
|
"loss": 1.5512, |
|
"step": 3275 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 0.2594167587325395, |
|
"learning_rate": 1.7312279158564415e-05, |
|
"loss": 1.6027, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 0.2127951461073897, |
|
"learning_rate": 1.706501130807806e-05, |
|
"loss": 1.6896, |
|
"step": 3285 |
|
}, |
|
{ |
|
"epoch": 2.49, |
|
"grad_norm": 0.2796245456905501, |
|
"learning_rate": 1.6819357195095597e-05, |
|
"loss": 1.6376, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.23110557133342613, |
|
"learning_rate": 1.657532159952062e-05, |
|
"loss": 1.5277, |
|
"step": 3295 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 0.24542029689976314, |
|
"learning_rate": 1.6332909269763953e-05, |
|
"loss": 1.7143, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 0.23539162074782163, |
|
"learning_rate": 1.609212492265103e-05, |
|
"loss": 1.7028, |
|
"step": 3305 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 0.2629785684260658, |
|
"learning_rate": 1.585297324333027e-05, |
|
"loss": 1.4392, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 2.51, |
|
"grad_norm": 0.2933973664128153, |
|
"learning_rate": 1.561545888518192e-05, |
|
"loss": 1.7234, |
|
"step": 3315 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.23954470728817145, |
|
"learning_rate": 1.537958646972737e-05, |
|
"loss": 1.4944, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.23980954615598538, |
|
"learning_rate": 1.5145360586539336e-05, |
|
"loss": 1.5851, |
|
"step": 3325 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 0.21087011175193957, |
|
"learning_rate": 1.4912785793152583e-05, |
|
"loss": 1.5208, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.23976449951280604, |
|
"learning_rate": 1.4681866614975227e-05, |
|
"loss": 1.5722, |
|
"step": 3335 |
|
}, |
|
{ |
|
"epoch": 2.53, |
|
"grad_norm": 0.22800754377440097, |
|
"learning_rate": 1.4452607545200492e-05, |
|
"loss": 1.6206, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.21262175469660566, |
|
"learning_rate": 1.4225013044719615e-05, |
|
"loss": 1.5784, |
|
"step": 3345 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.2436947408558131, |
|
"learning_rate": 1.3999087542034817e-05, |
|
"loss": 1.5594, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 2.54, |
|
"grad_norm": 0.23425763672194239, |
|
"learning_rate": 1.3774835433173172e-05, |
|
"loss": 1.6784, |
|
"step": 3355 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.22447628853598572, |
|
"learning_rate": 1.3552261081601091e-05, |
|
"loss": 1.6606, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.2410908531230671, |
|
"learning_rate": 1.3331368818139445e-05, |
|
"loss": 1.5011, |
|
"step": 3365 |
|
}, |
|
{ |
|
"epoch": 2.55, |
|
"grad_norm": 0.2107811996936363, |
|
"learning_rate": 1.3112162940879225e-05, |
|
"loss": 1.6211, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.23349707446690013, |
|
"learning_rate": 1.289464771509804e-05, |
|
"loss": 1.4912, |
|
"step": 3375 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 0.23161951144663487, |
|
"learning_rate": 1.2678827373176894e-05, |
|
"loss": 1.5809, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 0.23879959809777346, |
|
"learning_rate": 1.2464706114518088e-05, |
|
"loss": 1.6276, |
|
"step": 3385 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 0.2421829350347233, |
|
"learning_rate": 1.2252288105463405e-05, |
|
"loss": 1.6212, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 2.57, |
|
"grad_norm": 0.21883362123837063, |
|
"learning_rate": 1.2041577479212963e-05, |
|
"loss": 1.6288, |
|
"step": 3395 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.22802087223126732, |
|
"learning_rate": 1.1832578335744882e-05, |
|
"loss": 1.6313, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 2.58, |
|
"grad_norm": 0.26121795799269726, |
|
"learning_rate": 1.1625294741735526e-05, |
|
"loss": 1.656, |
|
"step": 3405 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.24409811650460989, |
|
"learning_rate": 1.1419730730480305e-05, |
|
"loss": 1.618, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.21817658760534914, |
|
"learning_rate": 1.1215890301815201e-05, |
|
"loss": 1.5273, |
|
"step": 3415 |
|
}, |
|
{ |
|
"epoch": 2.59, |
|
"grad_norm": 0.21593460359416924, |
|
"learning_rate": 1.101377742203903e-05, |
|
"loss": 1.5447, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.2425820551445123, |
|
"learning_rate": 1.0813396023836142e-05, |
|
"loss": 1.5712, |
|
"step": 3425 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.2625166861814797, |
|
"learning_rate": 1.0614750006200014e-05, |
|
"loss": 1.6605, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.2615766591081601, |
|
"learning_rate": 1.0417843234357282e-05, |
|
"loss": 1.5986, |
|
"step": 3435 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 0.2581808450444552, |
|
"learning_rate": 1.022267953969257e-05, |
|
"loss": 1.641, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 2.61, |
|
"grad_norm": 0.2746391682993569, |
|
"learning_rate": 1.0029262719674015e-05, |
|
"loss": 1.6293, |
|
"step": 3445 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.20322726411468045, |
|
"learning_rate": 9.837596537779237e-06, |
|
"loss": 1.5418, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.24234603845064367, |
|
"learning_rate": 9.647684723422213e-06, |
|
"loss": 1.6451, |
|
"step": 3455 |
|
}, |
|
{ |
|
"epoch": 2.62, |
|
"grad_norm": 0.24585798612987492, |
|
"learning_rate": 9.459530971880681e-06, |
|
"loss": 1.5217, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.24090792198358563, |
|
"learning_rate": 9.27313894422428e-06, |
|
"loss": 1.7077, |
|
"step": 3465 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.2794920103874086, |
|
"learning_rate": 9.088512267243143e-06, |
|
"loss": 1.7315, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 2.63, |
|
"grad_norm": 0.21561282445650495, |
|
"learning_rate": 8.905654533377583e-06, |
|
"loss": 1.6059, |
|
"step": 3475 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.2348946654088957, |
|
"learning_rate": 8.724569300648034e-06, |
|
"loss": 1.7123, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 0.29915508239234007, |
|
"learning_rate": 8.545260092585805e-06, |
|
"loss": 1.6167, |
|
"step": 3485 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.2609864197177778, |
|
"learning_rate": 8.367730398164574e-06, |
|
"loss": 1.6634, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.32383778611721475, |
|
"learning_rate": 8.19198367173255e-06, |
|
"loss": 1.631, |
|
"step": 3495 |
|
}, |
|
{ |
|
"epoch": 2.65, |
|
"grad_norm": 0.24225506413140852, |
|
"learning_rate": 8.018023332945112e-06, |
|
"loss": 1.5466, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 0.2686708766662986, |
|
"learning_rate": 7.845852766698426e-06, |
|
"loss": 1.5889, |
|
"step": 3505 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 0.24593185494412043, |
|
"learning_rate": 7.675475323063475e-06, |
|
"loss": 1.5796, |
|
"step": 3510 |
|
}, |
|
{ |
|
"epoch": 2.66, |
|
"grad_norm": 0.27604151432217455, |
|
"learning_rate": 7.5068943172209025e-06, |
|
"loss": 1.6281, |
|
"step": 3515 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.2211924967690316, |
|
"learning_rate": 7.340113029396567e-06, |
|
"loss": 1.5407, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 2.67, |
|
"grad_norm": 0.2549240432311639, |
|
"learning_rate": 7.175134704797592e-06, |
|
"loss": 1.6782, |
|
"step": 3525 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 0.22194169097137223, |
|
"learning_rate": 7.011962553549345e-06, |
|
"loss": 1.639, |
|
"step": 3530 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 0.24109057602354814, |
|
"learning_rate": 6.8505997506329024e-06, |
|
"loss": 1.6421, |
|
"step": 3535 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 0.298662548692409, |
|
"learning_rate": 6.691049435823327e-06, |
|
"loss": 1.5672, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.21821362720901652, |
|
"learning_rate": 6.533314713628458e-06, |
|
"loss": 1.5832, |
|
"step": 3545 |
|
}, |
|
{ |
|
"epoch": 2.69, |
|
"grad_norm": 0.28320776205122955, |
|
"learning_rate": 6.377398653228661e-06, |
|
"loss": 1.5686, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.2647885175395758, |
|
"learning_rate": 6.22330428841702e-06, |
|
"loss": 1.3694, |
|
"step": 3555 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.23693055785085496, |
|
"learning_rate": 6.071034617540294e-06, |
|
"loss": 1.4096, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 2.7, |
|
"grad_norm": 0.2811316512655128, |
|
"learning_rate": 5.9205926034406e-06, |
|
"loss": 1.7223, |
|
"step": 3565 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.25921259473226205, |
|
"learning_rate": 5.771981173397811e-06, |
|
"loss": 1.6491, |
|
"step": 3570 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.2384614576434685, |
|
"learning_rate": 5.625203219072495e-06, |
|
"loss": 1.5796, |
|
"step": 3575 |
|
}, |
|
{ |
|
"epoch": 2.71, |
|
"grad_norm": 0.24972301573225342, |
|
"learning_rate": 5.480261596449698e-06, |
|
"loss": 1.6484, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 0.2790877252142927, |
|
"learning_rate": 5.337159125783453e-06, |
|
"loss": 1.6747, |
|
"step": 3585 |
|
}, |
|
{ |
|
"epoch": 2.72, |
|
"grad_norm": 0.26526860024829096, |
|
"learning_rate": 5.195898591541748e-06, |
|
"loss": 1.631, |
|
"step": 3590 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.2359453480631305, |
|
"learning_rate": 5.056482742352486e-06, |
|
"loss": 1.5224, |
|
"step": 3595 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.2749211055400865, |
|
"learning_rate": 4.9189142909498945e-06, |
|
"loss": 1.5348, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 2.73, |
|
"grad_norm": 0.20381886685736014, |
|
"learning_rate": 4.783195914121818e-06, |
|
"loss": 1.6092, |
|
"step": 3605 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.27694673318367125, |
|
"learning_rate": 4.649330252657613e-06, |
|
"loss": 1.5524, |
|
"step": 3610 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.24104039690274334, |
|
"learning_rate": 4.517319911296747e-06, |
|
"loss": 1.6131, |
|
"step": 3615 |
|
}, |
|
{ |
|
"epoch": 2.74, |
|
"grad_norm": 0.21654379256196502, |
|
"learning_rate": 4.387167458678121e-06, |
|
"loss": 1.5537, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.22452456642122062, |
|
"learning_rate": 4.2588754272900985e-06, |
|
"loss": 1.5051, |
|
"step": 3625 |
|
}, |
|
{ |
|
"epoch": 2.75, |
|
"grad_norm": 0.2338963199910069, |
|
"learning_rate": 4.132446313421246e-06, |
|
"loss": 1.6882, |
|
"step": 3630 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.2584997684032959, |
|
"learning_rate": 4.00788257711171e-06, |
|
"loss": 1.5014, |
|
"step": 3635 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.2908213514842905, |
|
"learning_rate": 3.885186642105376e-06, |
|
"loss": 1.6277, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 0.19233168126321265, |
|
"learning_rate": 3.7643608958027543e-06, |
|
"loss": 1.7565, |
|
"step": 3645 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 0.270149572768491, |
|
"learning_rate": 3.6454076892144418e-06, |
|
"loss": 1.6004, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 0.2403966892086897, |
|
"learning_rate": 3.5283293369154036e-06, |
|
"loss": 1.5425, |
|
"step": 3655 |
|
}, |
|
{ |
|
"epoch": 2.77, |
|
"grad_norm": 0.1959498671099479, |
|
"learning_rate": 3.4131281170000083e-06, |
|
"loss": 1.6043, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.2291068939021477, |
|
"learning_rate": 3.2998062710375864e-06, |
|
"loss": 1.6167, |
|
"step": 3665 |
|
}, |
|
{ |
|
"epoch": 2.78, |
|
"grad_norm": 0.2669658026144572, |
|
"learning_rate": 3.188366004028931e-06, |
|
"loss": 1.6093, |
|
"step": 3670 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 0.21167755912643296, |
|
"learning_rate": 3.0788094843632655e-06, |
|
"loss": 1.6288, |
|
"step": 3675 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 0.27869409187190786, |
|
"learning_rate": 2.9711388437761445e-06, |
|
"loss": 1.5781, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 2.79, |
|
"grad_norm": 0.21975752059084885, |
|
"learning_rate": 2.8653561773079764e-06, |
|
"loss": 1.6193, |
|
"step": 3685 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.22758480435581485, |
|
"learning_rate": 2.7614635432632097e-06, |
|
"loss": 1.7111, |
|
"step": 3690 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.253380043181827, |
|
"learning_rate": 2.6594629631702783e-06, |
|
"loss": 1.6528, |
|
"step": 3695 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.22066629732671186, |
|
"learning_rate": 2.5593564217423314e-06, |
|
"loss": 1.5717, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.20545309594834268, |
|
"learning_rate": 2.461145866838599e-06, |
|
"loss": 1.5816, |
|
"step": 3705 |
|
}, |
|
{ |
|
"epoch": 2.81, |
|
"grad_norm": 0.2298245690861381, |
|
"learning_rate": 2.364833209426376e-06, |
|
"loss": 1.5273, |
|
"step": 3710 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.28452051640315046, |
|
"learning_rate": 2.270420323544009e-06, |
|
"loss": 1.5568, |
|
"step": 3715 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.20796391559506347, |
|
"learning_rate": 2.177909046264348e-06, |
|
"loss": 1.6991, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 2.82, |
|
"grad_norm": 0.26581893695586506, |
|
"learning_rate": 2.0873011776589957e-06, |
|
"loss": 1.517, |
|
"step": 3725 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.22796087968697157, |
|
"learning_rate": 1.998598480763247e-06, |
|
"loss": 1.7992, |
|
"step": 3730 |
|
}, |
|
{ |
|
"epoch": 2.83, |
|
"grad_norm": 0.2336977716987997, |
|
"learning_rate": 1.911802681541919e-06, |
|
"loss": 1.513, |
|
"step": 3735 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.2497871821535283, |
|
"learning_rate": 1.8269154688556056e-06, |
|
"loss": 1.5704, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.23620748564286875, |
|
"learning_rate": 1.7439384944279213e-06, |
|
"loss": 1.4392, |
|
"step": 3745 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 0.23378030531695476, |
|
"learning_rate": 1.6628733728133227e-06, |
|
"loss": 1.5813, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.2146995504847581, |
|
"learning_rate": 1.5837216813656908e-06, |
|
"loss": 1.5966, |
|
"step": 3755 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.2382092413535891, |
|
"learning_rate": 1.506484960207677e-06, |
|
"loss": 1.553, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 2.85, |
|
"grad_norm": 0.19096405564266747, |
|
"learning_rate": 1.4311647122006721e-06, |
|
"loss": 1.5538, |
|
"step": 3765 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.2877533644995907, |
|
"learning_rate": 1.3577624029155966e-06, |
|
"loss": 1.5703, |
|
"step": 3770 |
|
}, |
|
{ |
|
"epoch": 2.86, |
|
"grad_norm": 0.25619739351479454, |
|
"learning_rate": 1.2862794606044337e-06, |
|
"loss": 1.4537, |
|
"step": 3775 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 0.17985034388431237, |
|
"learning_rate": 1.216717276172341e-06, |
|
"loss": 1.7393, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 0.24865342548144834, |
|
"learning_rate": 1.1490772031506392e-06, |
|
"loss": 1.681, |
|
"step": 3785 |
|
}, |
|
{ |
|
"epoch": 2.87, |
|
"grad_norm": 0.2682691239787855, |
|
"learning_rate": 1.0833605576705096e-06, |
|
"loss": 1.7253, |
|
"step": 3790 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.2694273976328748, |
|
"learning_rate": 1.0195686184373166e-06, |
|
"loss": 1.5678, |
|
"step": 3795 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.21203591608829347, |
|
"learning_rate": 9.577026267057476e-07, |
|
"loss": 1.615, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 0.24345143310387468, |
|
"learning_rate": 8.97763786255712e-07, |
|
"loss": 1.5338, |
|
"step": 3805 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 0.2579598936883795, |
|
"learning_rate": 8.397532633688254e-07, |
|
"loss": 1.5515, |
|
"step": 3810 |
|
}, |
|
{ |
|
"epoch": 2.89, |
|
"grad_norm": 0.2434697228699266, |
|
"learning_rate": 7.836721868058061e-07, |
|
"loss": 1.7675, |
|
"step": 3815 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.2440247854883621, |
|
"learning_rate": 7.295216477844702e-07, |
|
"loss": 1.6179, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.25830362738889806, |
|
"learning_rate": 6.773026999584708e-07, |
|
"loss": 1.698, |
|
"step": 3825 |
|
}, |
|
{ |
|
"epoch": 2.9, |
|
"grad_norm": 0.2622736838540384, |
|
"learning_rate": 6.270163593968703e-07, |
|
"loss": 1.6485, |
|
"step": 3830 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.2065165799768009, |
|
"learning_rate": 5.786636045643112e-07, |
|
"loss": 1.6278, |
|
"step": 3835 |
|
}, |
|
{ |
|
"epoch": 2.91, |
|
"grad_norm": 0.22985561856815895, |
|
"learning_rate": 5.322453763019653e-07, |
|
"loss": 1.5524, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.2727208719350262, |
|
"learning_rate": 4.877625778092809e-07, |
|
"loss": 1.6646, |
|
"step": 3845 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.2664985065225481, |
|
"learning_rate": 4.4521607462640893e-07, |
|
"loss": 1.5143, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 0.24545453500067022, |
|
"learning_rate": 4.046066946172822e-07, |
|
"loss": 1.6567, |
|
"step": 3855 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 0.24678453590974866, |
|
"learning_rate": 3.659352279535733e-07, |
|
"loss": 1.6106, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 0.28024299756070853, |
|
"learning_rate": 3.292024270993399e-07, |
|
"loss": 1.4444, |
|
"step": 3865 |
|
}, |
|
{ |
|
"epoch": 2.93, |
|
"grad_norm": 0.21189400608157236, |
|
"learning_rate": 2.9440900679631457e-07, |
|
"loss": 1.5323, |
|
"step": 3870 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 0.24352825534009542, |
|
"learning_rate": 2.615556440500377e-07, |
|
"loss": 1.6129, |
|
"step": 3875 |
|
}, |
|
{ |
|
"epoch": 2.94, |
|
"grad_norm": 0.24710743453957604, |
|
"learning_rate": 2.306429781166908e-07, |
|
"loss": 1.6064, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.2939949777477219, |
|
"learning_rate": 2.016716104906391e-07, |
|
"loss": 1.5547, |
|
"step": 3885 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.24855903276634353, |
|
"learning_rate": 1.7464210489273047e-07, |
|
"loss": 1.4292, |
|
"step": 3890 |
|
}, |
|
{ |
|
"epoch": 2.95, |
|
"grad_norm": 0.29838466749808656, |
|
"learning_rate": 1.4955498725932604e-07, |
|
"loss": 1.6796, |
|
"step": 3895 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.24063497135231618, |
|
"learning_rate": 1.2641074573209733e-07, |
|
"loss": 1.6524, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.266868894727991, |
|
"learning_rate": 1.0520983064847833e-07, |
|
"loss": 1.6033, |
|
"step": 3905 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 0.2468474483223204, |
|
"learning_rate": 8.595265453292811e-08, |
|
"loss": 1.5643, |
|
"step": 3910 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.20433798961816185, |
|
"learning_rate": 6.86395920889149e-08, |
|
"loss": 1.5261, |
|
"step": 3915 |
|
}, |
|
{ |
|
"epoch": 2.97, |
|
"grad_norm": 0.24801643390934636, |
|
"learning_rate": 5.327098019159982e-08, |
|
"loss": 1.7088, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 0.25316391379721404, |
|
"learning_rate": 3.9847117881308685e-08, |
|
"loss": 1.6097, |
|
"step": 3925 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 0.2589299223311166, |
|
"learning_rate": 2.8368266357681194e-08, |
|
"loss": 1.7897, |
|
"step": 3930 |
|
}, |
|
{ |
|
"epoch": 2.98, |
|
"grad_norm": 0.2615270175678582, |
|
"learning_rate": 1.8834648974630497e-08, |
|
"loss": 1.7153, |
|
"step": 3935 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 0.23820419945511057, |
|
"learning_rate": 1.12464512359578e-08, |
|
"loss": 1.489, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 0.24057944682864915, |
|
"learning_rate": 5.603820791755254e-09, |
|
"loss": 1.4223, |
|
"step": 3945 |
|
}, |
|
{ |
|
"epoch": 2.99, |
|
"grad_norm": 0.28827145571809903, |
|
"learning_rate": 1.9068674355415815e-09, |
|
"loss": 1.6161, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.4006550408742116, |
|
"learning_rate": 1.5566310213044333e-10, |
|
"loss": 1.556, |
|
"step": 3955 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 3957, |
|
"total_flos": 1.244366244937728e+16, |
|
"train_loss": 1.656136889695097, |
|
"train_runtime": 24205.6115, |
|
"train_samples_per_second": 0.654, |
|
"train_steps_per_second": 0.163 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 3957, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 100, |
|
"total_flos": 1.244366244937728e+16, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|