|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.07811754085981373, |
|
"eval_steps": 500, |
|
"global_step": 3000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0002603918028660458, |
|
"grad_norm": 0.4500846266746521, |
|
"learning_rate": 5.194805194805195e-06, |
|
"loss": 1.0381, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.0005207836057320916, |
|
"grad_norm": 0.35188010334968567, |
|
"learning_rate": 1.038961038961039e-05, |
|
"loss": 1.0108, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0007811754085981374, |
|
"grad_norm": 0.2300374060869217, |
|
"learning_rate": 1.5584415584415583e-05, |
|
"loss": 0.9668, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.0010415672114641832, |
|
"grad_norm": 0.16189467906951904, |
|
"learning_rate": 2.077922077922078e-05, |
|
"loss": 0.918, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.001301959014330229, |
|
"grad_norm": 0.18843211233615875, |
|
"learning_rate": 2.5974025974025972e-05, |
|
"loss": 0.9265, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0015623508171962747, |
|
"grad_norm": 0.20334510505199432, |
|
"learning_rate": 3.1168831168831166e-05, |
|
"loss": 0.9234, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0018227426200623205, |
|
"grad_norm": 0.1745327115058899, |
|
"learning_rate": 3.6363636363636364e-05, |
|
"loss": 0.881, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.0020831344229283663, |
|
"grad_norm": 0.18667331337928772, |
|
"learning_rate": 4.155844155844156e-05, |
|
"loss": 0.8592, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.002343526225794412, |
|
"grad_norm": 0.1848158985376358, |
|
"learning_rate": 4.675324675324675e-05, |
|
"loss": 0.8537, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.002603918028660458, |
|
"grad_norm": 0.17589879035949707, |
|
"learning_rate": 5.1948051948051944e-05, |
|
"loss": 0.8518, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0028643098315265037, |
|
"grad_norm": 0.2132624089717865, |
|
"learning_rate": 5.714285714285714e-05, |
|
"loss": 0.8511, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.0031247016343925495, |
|
"grad_norm": 0.23070092499256134, |
|
"learning_rate": 6.233766233766233e-05, |
|
"loss": 0.7975, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0033850934372585953, |
|
"grad_norm": 0.25368157029151917, |
|
"learning_rate": 6.753246753246754e-05, |
|
"loss": 0.8134, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.003645485240124641, |
|
"grad_norm": 0.22897231578826904, |
|
"learning_rate": 7.272727272727273e-05, |
|
"loss": 0.8322, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.003905877042990687, |
|
"grad_norm": 0.19932536780834198, |
|
"learning_rate": 7.792207792207793e-05, |
|
"loss": 0.7959, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.004166268845856733, |
|
"grad_norm": 0.21011792123317719, |
|
"learning_rate": 8.311688311688312e-05, |
|
"loss": 0.8102, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.004426660648722778, |
|
"grad_norm": 0.20594824850559235, |
|
"learning_rate": 8.831168831168831e-05, |
|
"loss": 0.8128, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.004687052451588824, |
|
"grad_norm": 0.20465536415576935, |
|
"learning_rate": 9.35064935064935e-05, |
|
"loss": 0.7989, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.00494744425445487, |
|
"grad_norm": 0.4109392762184143, |
|
"learning_rate": 9.870129870129871e-05, |
|
"loss": 0.8108, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.005207836057320916, |
|
"grad_norm": 0.4293076694011688, |
|
"learning_rate": 0.00010389610389610389, |
|
"loss": 0.8101, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.005468227860186962, |
|
"grad_norm": 0.31628963351249695, |
|
"learning_rate": 0.00010909090909090909, |
|
"loss": 0.7989, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.005728619663053007, |
|
"grad_norm": 0.24642810225486755, |
|
"learning_rate": 0.00011428571428571428, |
|
"loss": 0.7751, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.005989011465919053, |
|
"grad_norm": 0.3599106967449188, |
|
"learning_rate": 0.00011948051948051949, |
|
"loss": 0.8063, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.006249403268785099, |
|
"grad_norm": 0.17053447663784027, |
|
"learning_rate": 0.00012467532467532467, |
|
"loss": 0.7751, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.006509795071651145, |
|
"grad_norm": 0.17303769290447235, |
|
"learning_rate": 0.00012987012987012987, |
|
"loss": 0.7883, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.0067701868745171905, |
|
"grad_norm": 0.1815861016511917, |
|
"learning_rate": 0.00013506493506493507, |
|
"loss": 0.788, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.007030578677383236, |
|
"grad_norm": 0.24125365912914276, |
|
"learning_rate": 0.00014025974025974028, |
|
"loss": 0.8018, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.007290970480249282, |
|
"grad_norm": 0.19443446397781372, |
|
"learning_rate": 0.00014545454545454546, |
|
"loss": 0.7908, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.007551362283115328, |
|
"grad_norm": 0.17829768359661102, |
|
"learning_rate": 0.00015064935064935066, |
|
"loss": 0.8033, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.007811754085981374, |
|
"grad_norm": 0.19535653293132782, |
|
"learning_rate": 0.00015584415584415587, |
|
"loss": 0.7997, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.008072145888847419, |
|
"grad_norm": 0.19930541515350342, |
|
"learning_rate": 0.00016103896103896104, |
|
"loss": 0.7945, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.008332537691713465, |
|
"grad_norm": 0.2156297266483307, |
|
"learning_rate": 0.00016623376623376625, |
|
"loss": 0.8018, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.00859292949457951, |
|
"grad_norm": 0.1924206018447876, |
|
"learning_rate": 0.00017142857142857143, |
|
"loss": 0.7746, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.008853321297445557, |
|
"grad_norm": 0.2294880747795105, |
|
"learning_rate": 0.00017662337662337663, |
|
"loss": 0.8152, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.009113713100311602, |
|
"grad_norm": 0.16817067563533783, |
|
"learning_rate": 0.00018181818181818183, |
|
"loss": 0.7972, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.009374104903177648, |
|
"grad_norm": 0.18544812500476837, |
|
"learning_rate": 0.000187012987012987, |
|
"loss": 0.7801, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.009634496706043693, |
|
"grad_norm": 0.19597066938877106, |
|
"learning_rate": 0.00019220779220779222, |
|
"loss": 0.7706, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.00989488850890974, |
|
"grad_norm": 0.40291881561279297, |
|
"learning_rate": 0.00019740259740259742, |
|
"loss": 0.7911, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.010155280311775785, |
|
"grad_norm": 0.23841074109077454, |
|
"learning_rate": 0.00019999996515752773, |
|
"loss": 0.7861, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.010415672114641832, |
|
"grad_norm": 0.1675388514995575, |
|
"learning_rate": 0.00019999968641789507, |
|
"loss": 0.788, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.010676063917507876, |
|
"grad_norm": 1.8860758543014526, |
|
"learning_rate": 0.0001999991289394067, |
|
"loss": 0.7632, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.010936455720373923, |
|
"grad_norm": 0.17022117972373962, |
|
"learning_rate": 0.00019999829272361654, |
|
"loss": 0.784, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.011196847523239968, |
|
"grad_norm": 0.21460269391536713, |
|
"learning_rate": 0.00019999717777285545, |
|
"loss": 0.761, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.011457239326106015, |
|
"grad_norm": 0.19413785636425018, |
|
"learning_rate": 0.00019999578409023126, |
|
"loss": 0.7772, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.01171763112897206, |
|
"grad_norm": 0.20223405957221985, |
|
"learning_rate": 0.00019999411167962868, |
|
"loss": 0.7811, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.011978022931838106, |
|
"grad_norm": 0.15166303515434265, |
|
"learning_rate": 0.00019999216054570942, |
|
"loss": 0.7709, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.012238414734704151, |
|
"grad_norm": 0.16307081282138824, |
|
"learning_rate": 0.00019998993069391205, |
|
"loss": 0.7811, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.012498806537570198, |
|
"grad_norm": 0.15996049344539642, |
|
"learning_rate": 0.00019998742213045206, |
|
"loss": 0.7599, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.012759198340436243, |
|
"grad_norm": 0.17560279369354248, |
|
"learning_rate": 0.00019998463486232179, |
|
"loss": 0.7572, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 0.01301959014330229, |
|
"grad_norm": 0.17571642994880676, |
|
"learning_rate": 0.0001999815688972905, |
|
"loss": 0.7643, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.013279981946168334, |
|
"grad_norm": 0.17719799280166626, |
|
"learning_rate": 0.00019997822424390422, |
|
"loss": 0.7923, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 0.013540373749034381, |
|
"grad_norm": 0.19846616685390472, |
|
"learning_rate": 0.00019997460091148586, |
|
"loss": 0.7674, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.013800765551900426, |
|
"grad_norm": 0.2715558111667633, |
|
"learning_rate": 0.00019997069891013503, |
|
"loss": 0.7421, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 0.014061157354766473, |
|
"grad_norm": 0.1725197583436966, |
|
"learning_rate": 0.00019996651825072826, |
|
"loss": 0.7663, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.014321549157632518, |
|
"grad_norm": 0.15060502290725708, |
|
"learning_rate": 0.00019996205894491856, |
|
"loss": 0.7794, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.014581940960498564, |
|
"grad_norm": 0.16645808517932892, |
|
"learning_rate": 0.00019995732100513592, |
|
"loss": 0.752, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.014842332763364609, |
|
"grad_norm": 0.1736789345741272, |
|
"learning_rate": 0.00019995230444458682, |
|
"loss": 0.7788, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 0.015102724566230656, |
|
"grad_norm": 0.15416319668293, |
|
"learning_rate": 0.0001999470092772544, |
|
"loss": 0.7656, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.0153631163690967, |
|
"grad_norm": 0.16610187292099, |
|
"learning_rate": 0.00019994143551789839, |
|
"loss": 0.7676, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 0.015623508171962747, |
|
"grad_norm": 0.15843011438846588, |
|
"learning_rate": 0.00019993558318205507, |
|
"loss": 0.7746, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.015883899974828794, |
|
"grad_norm": 0.26837801933288574, |
|
"learning_rate": 0.00019992945228603724, |
|
"loss": 0.7617, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 0.016144291777694837, |
|
"grad_norm": 0.15099173784255981, |
|
"learning_rate": 0.0001999230428469341, |
|
"loss": 0.7601, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.016404683580560884, |
|
"grad_norm": 0.15511856973171234, |
|
"learning_rate": 0.00019991635488261138, |
|
"loss": 0.7647, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 0.01666507538342693, |
|
"grad_norm": 0.14919579029083252, |
|
"learning_rate": 0.00019990938841171104, |
|
"loss": 0.7692, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.016925467186292977, |
|
"grad_norm": 0.15838642418384552, |
|
"learning_rate": 0.0001999021434536514, |
|
"loss": 0.7763, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.01718585898915902, |
|
"grad_norm": 0.15956635773181915, |
|
"learning_rate": 0.00019989462002862704, |
|
"loss": 0.7598, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.017446250792025067, |
|
"grad_norm": 0.1499069333076477, |
|
"learning_rate": 0.0001998868181576088, |
|
"loss": 0.7626, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 0.017706642594891114, |
|
"grad_norm": 0.2170073390007019, |
|
"learning_rate": 0.00019987873786234348, |
|
"loss": 0.7569, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.01796703439775716, |
|
"grad_norm": 0.17841948568820953, |
|
"learning_rate": 0.00019987037916535417, |
|
"loss": 0.7494, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 0.018227426200623204, |
|
"grad_norm": 0.2066909372806549, |
|
"learning_rate": 0.0001998617420899398, |
|
"loss": 0.7609, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.01848781800348925, |
|
"grad_norm": 0.17015361785888672, |
|
"learning_rate": 0.0001998528266601754, |
|
"loss": 0.7761, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 0.018748209806355297, |
|
"grad_norm": 0.22166290879249573, |
|
"learning_rate": 0.0001998436329009118, |
|
"loss": 0.7573, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.01900860160922134, |
|
"grad_norm": 0.15084640681743622, |
|
"learning_rate": 0.00019983416083777563, |
|
"loss": 0.7775, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 0.019268993412087387, |
|
"grad_norm": 0.17800921201705933, |
|
"learning_rate": 0.0001998244104971693, |
|
"loss": 0.7359, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.019529385214953433, |
|
"grad_norm": 0.17354707419872284, |
|
"learning_rate": 0.0001998143819062709, |
|
"loss": 0.7415, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.01978977701781948, |
|
"grad_norm": 0.16408118605613708, |
|
"learning_rate": 0.00019980407509303413, |
|
"loss": 0.7708, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.020050168820685523, |
|
"grad_norm": 0.16820089519023895, |
|
"learning_rate": 0.00019979349008618808, |
|
"loss": 0.791, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 0.02031056062355157, |
|
"grad_norm": 0.15958388149738312, |
|
"learning_rate": 0.00019978262691523743, |
|
"loss": 0.7412, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.020570952426417616, |
|
"grad_norm": 0.1646542251110077, |
|
"learning_rate": 0.00019977148561046217, |
|
"loss": 0.7529, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 0.020831344229283663, |
|
"grad_norm": 0.17032025754451752, |
|
"learning_rate": 0.0001997600662029175, |
|
"loss": 0.7656, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.021091736032149706, |
|
"grad_norm": 0.17189227044582367, |
|
"learning_rate": 0.00019974836872443388, |
|
"loss": 0.7433, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 0.021352127835015753, |
|
"grad_norm": 0.16334249079227448, |
|
"learning_rate": 0.0001997363932076168, |
|
"loss": 0.7703, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.0216125196378818, |
|
"grad_norm": 0.1676424890756607, |
|
"learning_rate": 0.00019972413968584682, |
|
"loss": 0.7603, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 0.021872911440747846, |
|
"grad_norm": 0.16826209425926208, |
|
"learning_rate": 0.0001997116081932793, |
|
"loss": 0.7569, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.02213330324361389, |
|
"grad_norm": 0.1876436173915863, |
|
"learning_rate": 0.0001996987987648446, |
|
"loss": 0.7553, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.022393695046479936, |
|
"grad_norm": 0.17252250015735626, |
|
"learning_rate": 0.0001996857114362476, |
|
"loss": 0.7644, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.022654086849345983, |
|
"grad_norm": 0.1632252335548401, |
|
"learning_rate": 0.00019967234624396793, |
|
"loss": 0.7568, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 0.02291447865221203, |
|
"grad_norm": 0.1818259060382843, |
|
"learning_rate": 0.00019965870322525965, |
|
"loss": 0.7672, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.023174870455078073, |
|
"grad_norm": 0.15418195724487305, |
|
"learning_rate": 0.0001996447824181513, |
|
"loss": 0.7642, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 0.02343526225794412, |
|
"grad_norm": 0.17383505403995514, |
|
"learning_rate": 0.0001996305838614457, |
|
"loss": 0.7607, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.023695654060810166, |
|
"grad_norm": 0.17794272303581238, |
|
"learning_rate": 0.00019961610759471984, |
|
"loss": 0.7588, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 0.023956045863676213, |
|
"grad_norm": 0.1909121572971344, |
|
"learning_rate": 0.00019960135365832486, |
|
"loss": 0.7438, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.024216437666542256, |
|
"grad_norm": 0.17758873105049133, |
|
"learning_rate": 0.00019958632209338587, |
|
"loss": 0.7323, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 0.024476829469408302, |
|
"grad_norm": 0.15553662180900574, |
|
"learning_rate": 0.00019957101294180174, |
|
"loss": 0.7508, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.02473722127227435, |
|
"grad_norm": 0.15310749411582947, |
|
"learning_rate": 0.00019955542624624522, |
|
"loss": 0.7563, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.024997613075140396, |
|
"grad_norm": 0.1628728210926056, |
|
"learning_rate": 0.00019953956205016256, |
|
"loss": 0.7524, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.02525800487800644, |
|
"grad_norm": 0.16211454570293427, |
|
"learning_rate": 0.00019952342039777362, |
|
"loss": 0.7564, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 0.025518396680872486, |
|
"grad_norm": 0.15663012862205505, |
|
"learning_rate": 0.00019950700133407163, |
|
"loss": 0.7395, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.025778788483738532, |
|
"grad_norm": 0.1684863567352295, |
|
"learning_rate": 0.00019949030490482296, |
|
"loss": 0.753, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 0.02603918028660458, |
|
"grad_norm": 0.1561436653137207, |
|
"learning_rate": 0.0001994733311565673, |
|
"loss": 0.7409, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.026299572089470622, |
|
"grad_norm": 0.1781485229730606, |
|
"learning_rate": 0.0001994560801366171, |
|
"loss": 0.762, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 0.02655996389233667, |
|
"grad_norm": 0.15422071516513824, |
|
"learning_rate": 0.00019943855189305792, |
|
"loss": 0.7291, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.026820355695202715, |
|
"grad_norm": 0.17980527877807617, |
|
"learning_rate": 0.00019942074647474786, |
|
"loss": 0.7732, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 0.027080747498068762, |
|
"grad_norm": 0.15810626745224, |
|
"learning_rate": 0.00019940266393131775, |
|
"loss": 0.7764, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.027341139300934805, |
|
"grad_norm": 0.16385480761528015, |
|
"learning_rate": 0.00019938430431317081, |
|
"loss": 0.7404, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.027601531103800852, |
|
"grad_norm": 0.15134255588054657, |
|
"learning_rate": 0.00019936566767148257, |
|
"loss": 0.7506, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.0278619229066669, |
|
"grad_norm": 0.1592187136411667, |
|
"learning_rate": 0.00019934675405820077, |
|
"loss": 0.73, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 0.028122314709532945, |
|
"grad_norm": 0.16852422058582306, |
|
"learning_rate": 0.00019932756352604515, |
|
"loss": 0.7443, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.02838270651239899, |
|
"grad_norm": 0.15741507709026337, |
|
"learning_rate": 0.00019930809612850735, |
|
"loss": 0.7377, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 0.028643098315265035, |
|
"grad_norm": 0.22424879670143127, |
|
"learning_rate": 0.00019928835191985076, |
|
"loss": 0.7544, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.028903490118131082, |
|
"grad_norm": 0.2047310769557953, |
|
"learning_rate": 0.0001992683309551103, |
|
"loss": 0.7441, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 0.02916388192099713, |
|
"grad_norm": 0.16392463445663452, |
|
"learning_rate": 0.00019924803329009243, |
|
"loss": 0.7606, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.02942427372386317, |
|
"grad_norm": 0.16227149963378906, |
|
"learning_rate": 0.00019922745898137473, |
|
"loss": 0.736, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 0.029684665526729218, |
|
"grad_norm": 0.15652808547019958, |
|
"learning_rate": 0.00019920660808630598, |
|
"loss": 0.7513, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.029945057329595265, |
|
"grad_norm": 0.15162768959999084, |
|
"learning_rate": 0.00019918548066300592, |
|
"loss": 0.7303, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.03020544913246131, |
|
"grad_norm": 0.17650415003299713, |
|
"learning_rate": 0.0001991640767703651, |
|
"loss": 0.7254, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.030465840935327355, |
|
"grad_norm": 0.1594468355178833, |
|
"learning_rate": 0.00019914239646804462, |
|
"loss": 0.741, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 0.0307262327381934, |
|
"grad_norm": 0.17928367853164673, |
|
"learning_rate": 0.00019912043981647616, |
|
"loss": 0.7515, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.030986624541059448, |
|
"grad_norm": 0.17009998857975006, |
|
"learning_rate": 0.00019909820687686157, |
|
"loss": 0.7539, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 0.031247016343925495, |
|
"grad_norm": 0.16556763648986816, |
|
"learning_rate": 0.0001990756977111729, |
|
"loss": 0.7418, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.03150740814679154, |
|
"grad_norm": 0.1561640352010727, |
|
"learning_rate": 0.0001990529123821522, |
|
"loss": 0.7465, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 0.03176779994965759, |
|
"grad_norm": 0.15182287991046906, |
|
"learning_rate": 0.00019902985095331113, |
|
"loss": 0.7694, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.03202819175252363, |
|
"grad_norm": 0.15173685550689697, |
|
"learning_rate": 0.00019900651348893114, |
|
"loss": 0.7519, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 0.032288583555389674, |
|
"grad_norm": 0.16535787284374237, |
|
"learning_rate": 0.00019898290005406296, |
|
"loss": 0.7646, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.032548975358255725, |
|
"grad_norm": 0.19272534549236298, |
|
"learning_rate": 0.00019895901071452667, |
|
"loss": 0.7655, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.03280936716112177, |
|
"grad_norm": 0.1672705113887787, |
|
"learning_rate": 0.0001989348455369113, |
|
"loss": 0.7486, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.03306975896398781, |
|
"grad_norm": 0.1525493860244751, |
|
"learning_rate": 0.0001989104045885748, |
|
"loss": 0.7546, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 0.03333015076685386, |
|
"grad_norm": 0.16333037614822388, |
|
"learning_rate": 0.00019888568793764385, |
|
"loss": 0.7299, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.033590542569719904, |
|
"grad_norm": 0.1590205729007721, |
|
"learning_rate": 0.00019886069565301355, |
|
"loss": 0.762, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 0.033850934372585954, |
|
"grad_norm": 0.15006420016288757, |
|
"learning_rate": 0.00019883542780434733, |
|
"loss": 0.7531, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.034111326175452, |
|
"grad_norm": 0.18390792608261108, |
|
"learning_rate": 0.0001988098844620767, |
|
"loss": 0.7621, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 0.03437171797831804, |
|
"grad_norm": 0.17046166956424713, |
|
"learning_rate": 0.0001987840656974011, |
|
"loss": 0.7422, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.03463210978118409, |
|
"grad_norm": 0.15121813118457794, |
|
"learning_rate": 0.00019875797158228775, |
|
"loss": 0.7555, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 0.034892501584050134, |
|
"grad_norm": 0.16219307482242584, |
|
"learning_rate": 0.00019873160218947125, |
|
"loss": 0.7301, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.03515289338691618, |
|
"grad_norm": 0.1779986321926117, |
|
"learning_rate": 0.00019870495759245362, |
|
"loss": 0.7356, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.03541328518978223, |
|
"grad_norm": 0.16951359808444977, |
|
"learning_rate": 0.0001986780378655039, |
|
"loss": 0.7645, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.03567367699264827, |
|
"grad_norm": 0.16620802879333496, |
|
"learning_rate": 0.0001986508430836581, |
|
"loss": 0.7331, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 0.03593406879551432, |
|
"grad_norm": 0.1577858328819275, |
|
"learning_rate": 0.0001986233733227188, |
|
"loss": 0.7667, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.036194460598380364, |
|
"grad_norm": 0.1637091338634491, |
|
"learning_rate": 0.00019859562865925525, |
|
"loss": 0.7521, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 0.03645485240124641, |
|
"grad_norm": 0.15061691403388977, |
|
"learning_rate": 0.00019856760917060277, |
|
"loss": 0.744, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.03671524420411246, |
|
"grad_norm": 0.15373477339744568, |
|
"learning_rate": 0.00019853931493486287, |
|
"loss": 0.7677, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 0.0369756360069785, |
|
"grad_norm": 0.16468606889247894, |
|
"learning_rate": 0.00019851074603090277, |
|
"loss": 0.7179, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.037236027809844544, |
|
"grad_norm": 0.16084876656532288, |
|
"learning_rate": 0.00019848190253835536, |
|
"loss": 0.749, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 0.037496419612710594, |
|
"grad_norm": 0.16743004322052002, |
|
"learning_rate": 0.00019845278453761896, |
|
"loss": 0.7483, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.03775681141557664, |
|
"grad_norm": 0.17335088551044464, |
|
"learning_rate": 0.00019842339210985696, |
|
"loss": 0.735, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.03801720321844268, |
|
"grad_norm": 0.1546197235584259, |
|
"learning_rate": 0.00019839372533699774, |
|
"loss": 0.7549, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.03827759502130873, |
|
"grad_norm": 0.16218656301498413, |
|
"learning_rate": 0.00019836378430173438, |
|
"loss": 0.7425, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 0.03853798682417477, |
|
"grad_norm": 0.1712743639945984, |
|
"learning_rate": 0.0001983335690875245, |
|
"loss": 0.733, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.03879837862704082, |
|
"grad_norm": 0.15490613877773285, |
|
"learning_rate": 0.00019830307977858984, |
|
"loss": 0.7265, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 0.03905877042990687, |
|
"grad_norm": 0.1646670252084732, |
|
"learning_rate": 0.00019827231645991623, |
|
"loss": 0.7315, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.03931916223277291, |
|
"grad_norm": 0.1599082201719284, |
|
"learning_rate": 0.00019824127921725326, |
|
"loss": 0.7293, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 0.03957955403563896, |
|
"grad_norm": 0.1565747708082199, |
|
"learning_rate": 0.00019820996813711407, |
|
"loss": 0.7396, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.039839945838505, |
|
"grad_norm": 0.154826357960701, |
|
"learning_rate": 0.0001981783833067751, |
|
"loss": 0.7217, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 0.040100337641371046, |
|
"grad_norm": 0.16705222427845, |
|
"learning_rate": 0.0001981465248142758, |
|
"loss": 0.761, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.040360729444237096, |
|
"grad_norm": 0.15651623904705048, |
|
"learning_rate": 0.00019811439274841842, |
|
"loss": 0.7565, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.04062112124710314, |
|
"grad_norm": 0.16211090981960297, |
|
"learning_rate": 0.00019808198719876782, |
|
"loss": 0.7555, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.04088151304996919, |
|
"grad_norm": 0.16856881976127625, |
|
"learning_rate": 0.00019804930825565112, |
|
"loss": 0.7567, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 0.04114190485283523, |
|
"grad_norm": 0.1588718593120575, |
|
"learning_rate": 0.00019801635601015752, |
|
"loss": 0.729, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.041402296655701276, |
|
"grad_norm": 0.17078711092472076, |
|
"learning_rate": 0.00019798313055413808, |
|
"loss": 0.7418, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 0.041662688458567326, |
|
"grad_norm": 0.16652734577655792, |
|
"learning_rate": 0.00019794963198020525, |
|
"loss": 0.7341, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.04192308026143337, |
|
"grad_norm": 0.15535488724708557, |
|
"learning_rate": 0.00019791586038173296, |
|
"loss": 0.7396, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 0.04218347206429941, |
|
"grad_norm": 0.3506317734718323, |
|
"learning_rate": 0.00019788181585285602, |
|
"loss": 0.7345, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.04244386386716546, |
|
"grad_norm": 0.16875872015953064, |
|
"learning_rate": 0.00019784749848847003, |
|
"loss": 0.7214, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 0.042704255670031506, |
|
"grad_norm": 0.17675861716270447, |
|
"learning_rate": 0.0001978129083842312, |
|
"loss": 0.7431, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.042964647472897556, |
|
"grad_norm": 0.15601837635040283, |
|
"learning_rate": 0.00019777804563655583, |
|
"loss": 0.7215, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.0432250392757636, |
|
"grad_norm": 0.1874823123216629, |
|
"learning_rate": 0.00019774291034262026, |
|
"loss": 0.727, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.04348543107862964, |
|
"grad_norm": 0.17005637288093567, |
|
"learning_rate": 0.00019770750260036054, |
|
"loss": 0.7446, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 0.04374582288149569, |
|
"grad_norm": 0.17069579660892487, |
|
"learning_rate": 0.00019767182250847207, |
|
"loss": 0.7266, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.044006214684361736, |
|
"grad_norm": 0.16133156418800354, |
|
"learning_rate": 0.00019763587016640948, |
|
"loss": 0.7568, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 0.04426660648722778, |
|
"grad_norm": 0.16229428350925446, |
|
"learning_rate": 0.00019759964567438623, |
|
"loss": 0.7402, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.04452699829009383, |
|
"grad_norm": 0.1622512936592102, |
|
"learning_rate": 0.00019756314913337432, |
|
"loss": 0.7536, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 0.04478739009295987, |
|
"grad_norm": 0.2161218672990799, |
|
"learning_rate": 0.00019752638064510415, |
|
"loss": 0.723, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.04504778189582592, |
|
"grad_norm": 0.154169961810112, |
|
"learning_rate": 0.00019748934031206414, |
|
"loss": 0.7441, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 0.045308173698691966, |
|
"grad_norm": 0.15468057990074158, |
|
"learning_rate": 0.00019745202823750034, |
|
"loss": 0.7349, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.04556856550155801, |
|
"grad_norm": 0.2015281468629837, |
|
"learning_rate": 0.0001974144445254164, |
|
"loss": 0.726, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.04582895730442406, |
|
"grad_norm": 0.1931644082069397, |
|
"learning_rate": 0.00019737658928057302, |
|
"loss": 0.7604, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.0460893491072901, |
|
"grad_norm": 0.1528482288122177, |
|
"learning_rate": 0.00019733846260848776, |
|
"loss": 0.7408, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 0.046349740910156145, |
|
"grad_norm": 0.16370061039924622, |
|
"learning_rate": 0.0001973000646154349, |
|
"loss": 0.7647, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.046610132713022195, |
|
"grad_norm": 0.16271348297595978, |
|
"learning_rate": 0.00019726139540844484, |
|
"loss": 0.7212, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.04687052451588824, |
|
"grad_norm": 0.16218173503875732, |
|
"learning_rate": 0.00019722245509530401, |
|
"loss": 0.735, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.04713091631875429, |
|
"grad_norm": 0.17063820362091064, |
|
"learning_rate": 0.00019718324378455458, |
|
"loss": 0.7311, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.04739130812162033, |
|
"grad_norm": 0.1678459346294403, |
|
"learning_rate": 0.00019714376158549404, |
|
"loss": 0.7486, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.047651699924486375, |
|
"grad_norm": 0.15926459431648254, |
|
"learning_rate": 0.00019710400860817494, |
|
"loss": 0.743, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.047912091727352425, |
|
"grad_norm": 0.1775251179933548, |
|
"learning_rate": 0.00019706398496340463, |
|
"loss": 0.7512, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.04817248353021847, |
|
"grad_norm": 0.1572408229112625, |
|
"learning_rate": 0.00019702369076274494, |
|
"loss": 0.733, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.04843287533308451, |
|
"grad_norm": 0.29658186435699463, |
|
"learning_rate": 0.0001969831261185118, |
|
"loss": 0.7297, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.04869326713595056, |
|
"grad_norm": 0.16520118713378906, |
|
"learning_rate": 0.00019694229114377494, |
|
"loss": 0.721, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.048953658938816605, |
|
"grad_norm": 0.17762574553489685, |
|
"learning_rate": 0.00019690118595235774, |
|
"loss": 0.7304, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.049214050741682655, |
|
"grad_norm": 0.16636615991592407, |
|
"learning_rate": 0.00019685981065883663, |
|
"loss": 0.7257, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.0494744425445487, |
|
"grad_norm": 0.1622323989868164, |
|
"learning_rate": 0.00019681816537854102, |
|
"loss": 0.7353, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.04973483434741474, |
|
"grad_norm": 0.17419832944869995, |
|
"learning_rate": 0.00019677625022755289, |
|
"loss": 0.7452, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.04999522615028079, |
|
"grad_norm": 0.17460434138774872, |
|
"learning_rate": 0.00019673406532270634, |
|
"loss": 0.7391, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.050255617953146835, |
|
"grad_norm": 0.15844550728797913, |
|
"learning_rate": 0.00019669161078158753, |
|
"loss": 0.7327, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.05051600975601288, |
|
"grad_norm": 0.1638839989900589, |
|
"learning_rate": 0.0001966488867225341, |
|
"loss": 0.745, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.05077640155887893, |
|
"grad_norm": 0.1587786227464676, |
|
"learning_rate": 0.00019660589326463498, |
|
"loss": 0.7476, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.05103679336174497, |
|
"grad_norm": 0.15708380937576294, |
|
"learning_rate": 0.00019656263052773002, |
|
"loss": 0.7208, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.05129718516461102, |
|
"grad_norm": 0.15816234052181244, |
|
"learning_rate": 0.00019651909863240965, |
|
"loss": 0.7262, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.051557576967477065, |
|
"grad_norm": 0.16749270260334015, |
|
"learning_rate": 0.00019647529770001456, |
|
"loss": 0.7284, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.05181796877034311, |
|
"grad_norm": 0.16943767666816711, |
|
"learning_rate": 0.00019643122785263536, |
|
"loss": 0.7225, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.05207836057320916, |
|
"grad_norm": 0.42929205298423767, |
|
"learning_rate": 0.00019638688921311224, |
|
"loss": 0.7305, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.0523387523760752, |
|
"grad_norm": 0.15851692855358124, |
|
"learning_rate": 0.00019634228190503454, |
|
"loss": 0.7344, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.052599144178941244, |
|
"grad_norm": 0.16053883731365204, |
|
"learning_rate": 0.00019629740605274062, |
|
"loss": 0.7468, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.052859535981807294, |
|
"grad_norm": 0.16504009068012238, |
|
"learning_rate": 0.00019625226178131728, |
|
"loss": 0.7375, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 0.05311992778467334, |
|
"grad_norm": 0.1618044674396515, |
|
"learning_rate": 0.00019620684921659953, |
|
"loss": 0.7201, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.05338031958753939, |
|
"grad_norm": 0.15512776374816895, |
|
"learning_rate": 0.00019616116848517027, |
|
"loss": 0.7355, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.05364071139040543, |
|
"grad_norm": 0.17377036809921265, |
|
"learning_rate": 0.00019611521971435979, |
|
"loss": 0.7226, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.053901103193271474, |
|
"grad_norm": 0.1685250997543335, |
|
"learning_rate": 0.0001960690030322456, |
|
"loss": 0.7483, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 0.054161494996137524, |
|
"grad_norm": 0.18394522368907928, |
|
"learning_rate": 0.00019602251856765194, |
|
"loss": 0.7385, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.05442188679900357, |
|
"grad_norm": 0.1753673106431961, |
|
"learning_rate": 0.0001959757664501495, |
|
"loss": 0.7378, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 0.05468227860186961, |
|
"grad_norm": 0.1795465052127838, |
|
"learning_rate": 0.000195928746810055, |
|
"loss": 0.748, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.05494267040473566, |
|
"grad_norm": 0.16327305138111115, |
|
"learning_rate": 0.0001958814597784309, |
|
"loss": 0.7306, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 0.055203062207601704, |
|
"grad_norm": 0.15880291163921356, |
|
"learning_rate": 0.00019583390548708486, |
|
"loss": 0.7281, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.05546345401046775, |
|
"grad_norm": 0.1702323853969574, |
|
"learning_rate": 0.0001957860840685696, |
|
"loss": 0.7407, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 0.0557238458133338, |
|
"grad_norm": 0.16931670904159546, |
|
"learning_rate": 0.0001957379956561825, |
|
"loss": 0.7272, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.05598423761619984, |
|
"grad_norm": 0.15455976128578186, |
|
"learning_rate": 0.000195689640383965, |
|
"loss": 0.7398, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.05624462941906589, |
|
"grad_norm": 0.16061417758464813, |
|
"learning_rate": 0.0001956410183867024, |
|
"loss": 0.749, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.056505021221931934, |
|
"grad_norm": 0.14933143556118011, |
|
"learning_rate": 0.00019559212979992365, |
|
"loss": 0.7418, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 0.05676541302479798, |
|
"grad_norm": 0.1592816412448883, |
|
"learning_rate": 0.00019554297475990058, |
|
"loss": 0.7423, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.05702580482766403, |
|
"grad_norm": 0.1677238792181015, |
|
"learning_rate": 0.00019549355340364787, |
|
"loss": 0.7101, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 0.05728619663053007, |
|
"grad_norm": 0.3558599054813385, |
|
"learning_rate": 0.00019544386586892238, |
|
"loss": 0.725, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.05754658843339611, |
|
"grad_norm": 0.1746376007795334, |
|
"learning_rate": 0.00019539391229422313, |
|
"loss": 0.7479, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 0.057806980236262163, |
|
"grad_norm": 0.15979182720184326, |
|
"learning_rate": 0.00019534369281879049, |
|
"loss": 0.7352, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.05806737203912821, |
|
"grad_norm": 0.16173166036605835, |
|
"learning_rate": 0.0001952932075826061, |
|
"loss": 0.7364, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 0.05832776384199426, |
|
"grad_norm": 0.1514744907617569, |
|
"learning_rate": 0.00019524245672639245, |
|
"loss": 0.734, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.0585881556448603, |
|
"grad_norm": 0.16860373318195343, |
|
"learning_rate": 0.00019519144039161222, |
|
"loss": 0.7098, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.05884854744772634, |
|
"grad_norm": 0.16847743093967438, |
|
"learning_rate": 0.00019514015872046833, |
|
"loss": 0.7103, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.05910893925059239, |
|
"grad_norm": 0.16181516647338867, |
|
"learning_rate": 0.00019508861185590307, |
|
"loss": 0.7561, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 0.059369331053458436, |
|
"grad_norm": 0.16594484448432922, |
|
"learning_rate": 0.0001950367999415981, |
|
"loss": 0.7308, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.05962972285632448, |
|
"grad_norm": 0.166441410779953, |
|
"learning_rate": 0.00019498472312197375, |
|
"loss": 0.735, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 0.05989011465919053, |
|
"grad_norm": 0.16273920238018036, |
|
"learning_rate": 0.00019493238154218886, |
|
"loss": 0.7458, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.06015050646205657, |
|
"grad_norm": 0.16227276623249054, |
|
"learning_rate": 0.00019487977534814012, |
|
"loss": 0.7143, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 0.06041089826492262, |
|
"grad_norm": 0.1619606912136078, |
|
"learning_rate": 0.000194826904686462, |
|
"loss": 0.7285, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.060671290067788666, |
|
"grad_norm": 0.1596045345067978, |
|
"learning_rate": 0.00019477376970452603, |
|
"loss": 0.7513, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 0.06093168187065471, |
|
"grad_norm": 0.17504757642745972, |
|
"learning_rate": 0.00019472037055044044, |
|
"loss": 0.7376, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.06119207367352076, |
|
"grad_norm": 0.1559167355298996, |
|
"learning_rate": 0.00019466670737304992, |
|
"loss": 0.7339, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.0614524654763868, |
|
"grad_norm": 0.1624836027622223, |
|
"learning_rate": 0.0001946127803219351, |
|
"loss": 0.7258, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.061712857279252846, |
|
"grad_norm": 0.17907138168811798, |
|
"learning_rate": 0.00019455858954741206, |
|
"loss": 0.72, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 0.061973249082118896, |
|
"grad_norm": 0.15922705829143524, |
|
"learning_rate": 0.00019450413520053202, |
|
"loss": 0.7187, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.06223364088498494, |
|
"grad_norm": 0.1552513986825943, |
|
"learning_rate": 0.0001944494174330809, |
|
"loss": 0.7183, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 0.06249403268785099, |
|
"grad_norm": 0.16838514804840088, |
|
"learning_rate": 0.00019439443639757885, |
|
"loss": 0.7286, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.06275442449071703, |
|
"grad_norm": 0.17352423071861267, |
|
"learning_rate": 0.00019433919224727986, |
|
"loss": 0.7436, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 0.06301481629358308, |
|
"grad_norm": 0.17366603016853333, |
|
"learning_rate": 0.0001942836851361713, |
|
"loss": 0.7265, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.06327520809644913, |
|
"grad_norm": 0.14833413064479828, |
|
"learning_rate": 0.00019422791521897357, |
|
"loss": 0.7234, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 0.06353559989931518, |
|
"grad_norm": 0.16602723300457, |
|
"learning_rate": 0.00019417188265113958, |
|
"loss": 0.725, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.06379599170218121, |
|
"grad_norm": 0.17290353775024414, |
|
"learning_rate": 0.00019411558758885438, |
|
"loss": 0.7174, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.06405638350504726, |
|
"grad_norm": 0.16486665606498718, |
|
"learning_rate": 0.0001940590301890346, |
|
"loss": 0.7301, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.06431677530791331, |
|
"grad_norm": 0.16255232691764832, |
|
"learning_rate": 0.00019400221060932827, |
|
"loss": 0.7462, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 0.06457716711077935, |
|
"grad_norm": 0.16139757633209229, |
|
"learning_rate": 0.0001939451290081141, |
|
"loss": 0.7424, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.0648375589136454, |
|
"grad_norm": 0.165597602725029, |
|
"learning_rate": 0.00019388778554450117, |
|
"loss": 0.7426, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 0.06509795071651145, |
|
"grad_norm": 0.19819000363349915, |
|
"learning_rate": 0.00019383018037832854, |
|
"loss": 0.7356, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.06535834251937749, |
|
"grad_norm": 0.16469696164131165, |
|
"learning_rate": 0.00019377231367016467, |
|
"loss": 0.718, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 0.06561873432224354, |
|
"grad_norm": 0.1644965261220932, |
|
"learning_rate": 0.00019371418558130702, |
|
"loss": 0.7253, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.06587912612510959, |
|
"grad_norm": 0.15347526967525482, |
|
"learning_rate": 0.00019365579627378174, |
|
"loss": 0.7214, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 0.06613951792797562, |
|
"grad_norm": 0.1618672013282776, |
|
"learning_rate": 0.00019359714591034302, |
|
"loss": 0.7204, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.06639990973084167, |
|
"grad_norm": 0.17043665051460266, |
|
"learning_rate": 0.00019353823465447268, |
|
"loss": 0.7278, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.06666030153370772, |
|
"grad_norm": 0.15762579441070557, |
|
"learning_rate": 0.00019347906267037983, |
|
"loss": 0.7283, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.06692069333657376, |
|
"grad_norm": 0.1622801572084427, |
|
"learning_rate": 0.00019341963012300029, |
|
"loss": 0.7193, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 0.06718108513943981, |
|
"grad_norm": 0.16705769300460815, |
|
"learning_rate": 0.00019335993717799617, |
|
"loss": 0.7414, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.06744147694230586, |
|
"grad_norm": 0.15886452794075012, |
|
"learning_rate": 0.00019329998400175545, |
|
"loss": 0.7242, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 0.06770186874517191, |
|
"grad_norm": 0.17994090914726257, |
|
"learning_rate": 0.00019323977076139142, |
|
"loss": 0.7017, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.06796226054803794, |
|
"grad_norm": 0.1609068214893341, |
|
"learning_rate": 0.00019317929762474232, |
|
"loss": 0.7352, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 0.068222652350904, |
|
"grad_norm": 0.15605950355529785, |
|
"learning_rate": 0.0001931185647603708, |
|
"loss": 0.7249, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.06848304415377005, |
|
"grad_norm": 0.16057750582695007, |
|
"learning_rate": 0.00019305757233756352, |
|
"loss": 0.7521, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 0.06874343595663608, |
|
"grad_norm": 0.1703862100839615, |
|
"learning_rate": 0.00019299632052633054, |
|
"loss": 0.7245, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.06900382775950213, |
|
"grad_norm": 0.16324444115161896, |
|
"learning_rate": 0.00019293480949740505, |
|
"loss": 0.7395, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.06926421956236818, |
|
"grad_norm": 0.15283791720867157, |
|
"learning_rate": 0.00019287303942224266, |
|
"loss": 0.7158, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.06952461136523422, |
|
"grad_norm": 0.1882282942533493, |
|
"learning_rate": 0.00019281101047302114, |
|
"loss": 0.724, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 0.06978500316810027, |
|
"grad_norm": 0.16147953271865845, |
|
"learning_rate": 0.00019274872282263984, |
|
"loss": 0.7365, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.07004539497096632, |
|
"grad_norm": 0.1614103466272354, |
|
"learning_rate": 0.00019268617664471916, |
|
"loss": 0.7206, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 0.07030578677383235, |
|
"grad_norm": 0.16784432530403137, |
|
"learning_rate": 0.00019262337211360016, |
|
"loss": 0.7279, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.0705661785766984, |
|
"grad_norm": 0.15966112911701202, |
|
"learning_rate": 0.000192560309404344, |
|
"loss": 0.7274, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 0.07082657037956445, |
|
"grad_norm": 0.16970521211624146, |
|
"learning_rate": 0.0001924969886927315, |
|
"loss": 0.7038, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.07108696218243049, |
|
"grad_norm": 0.16143856942653656, |
|
"learning_rate": 0.00019243341015526272, |
|
"loss": 0.7097, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 0.07134735398529654, |
|
"grad_norm": 0.16041269898414612, |
|
"learning_rate": 0.00019236957396915623, |
|
"loss": 0.722, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.07160774578816259, |
|
"grad_norm": 0.15845969319343567, |
|
"learning_rate": 0.00019230548031234882, |
|
"loss": 0.7238, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.07186813759102864, |
|
"grad_norm": 0.14966030418872833, |
|
"learning_rate": 0.00019224112936349502, |
|
"loss": 0.7182, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.07212852939389468, |
|
"grad_norm": 0.16525116562843323, |
|
"learning_rate": 0.00019217652130196653, |
|
"loss": 0.7397, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 0.07238892119676073, |
|
"grad_norm": 0.18119119107723236, |
|
"learning_rate": 0.0001921116563078516, |
|
"loss": 0.7222, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.07264931299962678, |
|
"grad_norm": 0.1709197610616684, |
|
"learning_rate": 0.00019204653456195478, |
|
"loss": 0.7068, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 0.07290970480249281, |
|
"grad_norm": 0.16309161484241486, |
|
"learning_rate": 0.00019198115624579625, |
|
"loss": 0.7349, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.07317009660535886, |
|
"grad_norm": 0.1736750453710556, |
|
"learning_rate": 0.00019191552154161135, |
|
"loss": 0.7445, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 0.07343048840822491, |
|
"grad_norm": 0.15009112656116486, |
|
"learning_rate": 0.00019184963063235006, |
|
"loss": 0.7034, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.07369088021109095, |
|
"grad_norm": 0.17244628071784973, |
|
"learning_rate": 0.0001917834837016766, |
|
"loss": 0.7285, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 0.073951272013957, |
|
"grad_norm": 0.15991820394992828, |
|
"learning_rate": 0.00019171708093396861, |
|
"loss": 0.7096, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.07421166381682305, |
|
"grad_norm": 0.17037667334079742, |
|
"learning_rate": 0.0001916504225143171, |
|
"loss": 0.7177, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.07447205561968909, |
|
"grad_norm": 0.16700348258018494, |
|
"learning_rate": 0.00019158350862852553, |
|
"loss": 0.7453, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.07473244742255514, |
|
"grad_norm": 0.17683659493923187, |
|
"learning_rate": 0.00019151633946310948, |
|
"loss": 0.7331, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 0.07499283922542119, |
|
"grad_norm": 0.16364306211471558, |
|
"learning_rate": 0.00019144891520529608, |
|
"loss": 0.7347, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.07525323102828722, |
|
"grad_norm": 0.1781424731016159, |
|
"learning_rate": 0.00019138123604302355, |
|
"loss": 0.7169, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 0.07551362283115327, |
|
"grad_norm": 0.16007259488105774, |
|
"learning_rate": 0.00019131330216494064, |
|
"loss": 0.7269, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.07577401463401932, |
|
"grad_norm": 0.1604921519756317, |
|
"learning_rate": 0.00019124511376040598, |
|
"loss": 0.7094, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 0.07603440643688536, |
|
"grad_norm": 0.16649965941905975, |
|
"learning_rate": 0.00019117667101948782, |
|
"loss": 0.7271, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.07629479823975141, |
|
"grad_norm": 0.16084066033363342, |
|
"learning_rate": 0.0001911079741329632, |
|
"loss": 0.7239, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 0.07655519004261746, |
|
"grad_norm": 0.1651066243648529, |
|
"learning_rate": 0.0001910390232923177, |
|
"loss": 0.7304, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.07681558184548351, |
|
"grad_norm": 0.1528957635164261, |
|
"learning_rate": 0.00019096981868974467, |
|
"loss": 0.7068, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.07707597364834955, |
|
"grad_norm": 0.172830730676651, |
|
"learning_rate": 0.00019090036051814483, |
|
"loss": 0.7277, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.0773363654512156, |
|
"grad_norm": 0.15909147262573242, |
|
"learning_rate": 0.00019083064897112571, |
|
"loss": 0.7135, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 0.07759675725408165, |
|
"grad_norm": 0.16273066401481628, |
|
"learning_rate": 0.0001907606842430011, |
|
"loss": 0.7346, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.07785714905694768, |
|
"grad_norm": 0.1595291793346405, |
|
"learning_rate": 0.00019069046652879049, |
|
"loss": 0.7377, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 0.07811754085981373, |
|
"grad_norm": 0.15573470294475555, |
|
"learning_rate": 0.0001906199960242185, |
|
"loss": 0.7026, |
|
"step": 3000 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 19202, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.40750201290752e+18, |
|
"train_batch_size": 5, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|