|
{
  "best_metric": 1.208633542060852,
  "best_model_checkpoint": "/mnt/users/n3thakur/vectara/huggingface-dpo/trained_models/v3/Mistral-7B-Instruct-v0.2-miracl-raft-sft-v2.0/checkpoint-2000",
  "epoch": 0.9997531473710195,
  "eval_steps": 200,
  "global_step": 2025,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0004937052579609973,
      "grad_norm": 3.3225639243513934,
      "learning_rate": 4.926108374384237e-08,
      "loss": 1.6639,
      "step": 1
    },
    {
      "epoch": 0.0024685262898049864,
      "grad_norm": 2.953462068009019,
      "learning_rate": 2.4630541871921185e-07,
      "loss": 1.7388,
      "step": 5
    },
    {
      "epoch": 0.004937052579609973,
      "grad_norm": 2.4057426602070646,
      "learning_rate": 4.926108374384237e-07,
      "loss": 1.5798,
      "step": 10
    },
    {
      "epoch": 0.00740557886941496,
      "grad_norm": 3.186500367639329,
      "learning_rate": 7.389162561576356e-07,
      "loss": 1.659,
      "step": 15
    },
    {
      "epoch": 0.009874105159219946,
      "grad_norm": 2.396847476828635,
      "learning_rate": 9.852216748768474e-07,
      "loss": 1.6374,
      "step": 20
    },
    {
      "epoch": 0.012342631449024932,
      "grad_norm": 2.5391731431636106,
      "learning_rate": 1.2315270935960593e-06,
      "loss": 1.6314,
      "step": 25
    },
    {
      "epoch": 0.01481115773882992,
      "grad_norm": 2.2574409610476462,
      "learning_rate": 1.4778325123152712e-06,
      "loss": 1.5888,
      "step": 30
    },
    {
      "epoch": 0.017279684028634903,
      "grad_norm": 1.8342813542038656,
      "learning_rate": 1.724137931034483e-06,
      "loss": 1.5412,
      "step": 35
    },
    {
      "epoch": 0.01974821031843989,
      "grad_norm": 1.8380484598711315,
      "learning_rate": 1.970443349753695e-06,
      "loss": 1.4889,
      "step": 40
    },
    {
      "epoch": 0.02221673660824488,
      "grad_norm": 1.613058249947001,
      "learning_rate": 2.2167487684729067e-06,
      "loss": 1.5403,
      "step": 45
    },
    {
      "epoch": 0.024685262898049863,
      "grad_norm": 1.8920016704605567,
      "learning_rate": 2.4630541871921186e-06,
      "loss": 1.4831,
      "step": 50
    },
    {
      "epoch": 0.02715378918785485,
      "grad_norm": 1.2203038504329438,
      "learning_rate": 2.70935960591133e-06,
      "loss": 1.4233,
      "step": 55
    },
    {
      "epoch": 0.02962231547765984,
      "grad_norm": 1.095184752565883,
      "learning_rate": 2.9556650246305424e-06,
      "loss": 1.4255,
      "step": 60
    },
    {
      "epoch": 0.03209084176746482,
      "grad_norm": 0.8448900941377993,
      "learning_rate": 3.201970443349754e-06,
      "loss": 1.4892,
      "step": 65
    },
    {
      "epoch": 0.03455936805726981,
      "grad_norm": 0.9405655862570454,
      "learning_rate": 3.448275862068966e-06,
      "loss": 1.4673,
      "step": 70
    },
    {
      "epoch": 0.0370278943470748,
      "grad_norm": 0.6713923929675227,
      "learning_rate": 3.6945812807881777e-06,
      "loss": 1.4148,
      "step": 75
    },
    {
      "epoch": 0.03949642063687978,
      "grad_norm": 0.7755902373813679,
      "learning_rate": 3.94088669950739e-06,
      "loss": 1.4867,
      "step": 80
    },
    {
      "epoch": 0.04196494692668477,
      "grad_norm": 0.7400218273582495,
      "learning_rate": 4.1871921182266015e-06,
      "loss": 1.3834,
      "step": 85
    },
    {
      "epoch": 0.04443347321648976,
      "grad_norm": 0.7245551973919236,
      "learning_rate": 4.4334975369458135e-06,
      "loss": 1.44,
      "step": 90
    },
    {
      "epoch": 0.04690199950629474,
      "grad_norm": 0.6731482962094358,
      "learning_rate": 4.6798029556650245e-06,
      "loss": 1.362,
      "step": 95
    },
    {
      "epoch": 0.049370525796099726,
      "grad_norm": 0.7105341248736622,
      "learning_rate": 4.926108374384237e-06,
      "loss": 1.3716,
      "step": 100
    },
    {
      "epoch": 0.05183905208590472,
      "grad_norm": 0.6774223469533757,
      "learning_rate": 5.172413793103449e-06,
      "loss": 1.4056,
      "step": 105
    },
    {
      "epoch": 0.0543075783757097,
      "grad_norm": 0.6745745206164803,
      "learning_rate": 5.41871921182266e-06,
      "loss": 1.337,
      "step": 110
    },
    {
      "epoch": 0.056776104665514686,
      "grad_norm": 0.5935854583319804,
      "learning_rate": 5.665024630541872e-06,
      "loss": 1.3615,
      "step": 115
    },
    {
      "epoch": 0.05924463095531968,
      "grad_norm": 0.556924082351685,
      "learning_rate": 5.911330049261085e-06,
      "loss": 1.4248,
      "step": 120
    },
    {
      "epoch": 0.06171315724512466,
      "grad_norm": 0.5427807889259738,
      "learning_rate": 6.157635467980296e-06,
      "loss": 1.3286,
      "step": 125
    },
    {
      "epoch": 0.06418168353492965,
      "grad_norm": 0.6063519166723843,
      "learning_rate": 6.403940886699508e-06,
      "loss": 1.3176,
      "step": 130
    },
    {
      "epoch": 0.06665020982473463,
      "grad_norm": 0.5670363529677274,
      "learning_rate": 6.65024630541872e-06,
      "loss": 1.3273,
      "step": 135
    },
    {
      "epoch": 0.06911873611453961,
      "grad_norm": 0.5846835420330245,
      "learning_rate": 6.896551724137932e-06,
      "loss": 1.326,
      "step": 140
    },
    {
      "epoch": 0.07158726240434461,
      "grad_norm": 0.5686293276495719,
      "learning_rate": 7.1428571428571436e-06,
      "loss": 1.376,
      "step": 145
    },
    {
      "epoch": 0.0740557886941496,
      "grad_norm": 0.5275299029056365,
      "learning_rate": 7.3891625615763555e-06,
      "loss": 1.3364,
      "step": 150
    },
    {
      "epoch": 0.07652431498395458,
      "grad_norm": 0.5464387846115857,
      "learning_rate": 7.635467980295567e-06,
      "loss": 1.3654,
      "step": 155
    },
    {
      "epoch": 0.07899284127375956,
      "grad_norm": 0.5229203264129956,
      "learning_rate": 7.88177339901478e-06,
      "loss": 1.3009,
      "step": 160
    },
    {
      "epoch": 0.08146136756356455,
      "grad_norm": 0.5746356087172889,
      "learning_rate": 8.12807881773399e-06,
      "loss": 1.2611,
      "step": 165
    },
    {
      "epoch": 0.08392989385336953,
      "grad_norm": 0.5922232695792946,
      "learning_rate": 8.374384236453203e-06,
      "loss": 1.3643,
      "step": 170
    },
    {
      "epoch": 0.08639842014317453,
      "grad_norm": 0.5295655281983137,
      "learning_rate": 8.620689655172414e-06,
      "loss": 1.3165,
      "step": 175
    },
    {
      "epoch": 0.08886694643297952,
      "grad_norm": 0.5850731545805168,
      "learning_rate": 8.866995073891627e-06,
      "loss": 1.3105,
      "step": 180
    },
    {
      "epoch": 0.0913354727227845,
      "grad_norm": 0.5551320012809824,
      "learning_rate": 9.113300492610838e-06,
      "loss": 1.278,
      "step": 185
    },
    {
      "epoch": 0.09380399901258948,
      "grad_norm": 0.5711739398485313,
      "learning_rate": 9.359605911330049e-06,
      "loss": 1.3197,
      "step": 190
    },
    {
      "epoch": 0.09627252530239447,
      "grad_norm": 0.5559894427552352,
      "learning_rate": 9.605911330049262e-06,
      "loss": 1.3409,
      "step": 195
    },
    {
      "epoch": 0.09874105159219945,
      "grad_norm": 0.5671580477100892,
      "learning_rate": 9.852216748768475e-06,
      "loss": 1.3095,
      "step": 200
    },
    {
      "epoch": 0.09874105159219945,
      "eval_loss": 1.2800045013427734,
      "eval_runtime": 2727.7043,
      "eval_samples_per_second": 1.466,
      "eval_steps_per_second": 0.122,
      "step": 200
    },
    {
      "epoch": 0.10120957788200444,
      "grad_norm": 0.5279833943359168,
      "learning_rate": 9.999970269475589e-06,
      "loss": 1.2966,
      "step": 205
    },
    {
      "epoch": 0.10367810417180943,
      "grad_norm": 0.5644566149733632,
      "learning_rate": 9.99963580513638e-06,
      "loss": 1.2874,
      "step": 210
    },
    {
      "epoch": 0.10614663046161442,
      "grad_norm": 0.6258961143244912,
      "learning_rate": 9.998929738244678e-06,
      "loss": 1.3209,
      "step": 215
    },
    {
      "epoch": 0.1086151567514194,
      "grad_norm": 0.4834621448531187,
      "learning_rate": 9.997852121279563e-06,
      "loss": 1.3313,
      "step": 220
    },
    {
      "epoch": 0.11108368304122439,
      "grad_norm": 0.5481752837030147,
      "learning_rate": 9.996403034335912e-06,
      "loss": 1.2738,
      "step": 225
    },
    {
      "epoch": 0.11355220933102937,
      "grad_norm": 0.5886589355414898,
      "learning_rate": 9.994582585118449e-06,
      "loss": 1.2758,
      "step": 230
    },
    {
      "epoch": 0.11602073562083436,
      "grad_norm": 0.5757139355018718,
      "learning_rate": 9.992390908933746e-06,
      "loss": 1.3187,
      "step": 235
    },
    {
      "epoch": 0.11848926191063935,
      "grad_norm": 0.5464825333851621,
      "learning_rate": 9.989828168680164e-06,
      "loss": 1.3677,
      "step": 240
    },
    {
      "epoch": 0.12095778820044434,
      "grad_norm": 0.6372982363888493,
      "learning_rate": 9.986894554835735e-06,
      "loss": 1.2668,
      "step": 245
    },
    {
      "epoch": 0.12342631449024932,
      "grad_norm": 0.5445141174147589,
      "learning_rate": 9.983590285444025e-06,
      "loss": 1.2917,
      "step": 250
    },
    {
      "epoch": 0.1258948407800543,
      "grad_norm": 0.6832031232821291,
      "learning_rate": 9.979915606097907e-06,
      "loss": 1.2675,
      "step": 255
    },
    {
      "epoch": 0.1283633670698593,
      "grad_norm": 0.62128138673847,
      "learning_rate": 9.975870789921322e-06,
      "loss": 1.3187,
      "step": 260
    },
    {
      "epoch": 0.13083189335966428,
      "grad_norm": 0.5161196413352727,
      "learning_rate": 9.971456137548971e-06,
      "loss": 1.3031,
      "step": 265
    },
    {
      "epoch": 0.13330041964946926,
      "grad_norm": 0.5524745641605668,
      "learning_rate": 9.966671977103972e-06,
      "loss": 1.2749,
      "step": 270
    },
    {
      "epoch": 0.13576894593927424,
      "grad_norm": 0.6669242272051678,
      "learning_rate": 9.961518664173473e-06,
      "loss": 1.3409,
      "step": 275
    },
    {
      "epoch": 0.13823747222907923,
      "grad_norm": 0.5555562003933405,
      "learning_rate": 9.955996581782218e-06,
      "loss": 1.2468,
      "step": 280
    },
    {
      "epoch": 0.14070599851888424,
      "grad_norm": 0.6244202172570701,
      "learning_rate": 9.950106140364089e-06,
      "loss": 1.3318,
      "step": 285
    },
    {
      "epoch": 0.14317452480868922,
      "grad_norm": 0.5100271270558925,
      "learning_rate": 9.943847777731584e-06,
      "loss": 1.2522,
      "step": 290
    },
    {
      "epoch": 0.1456430510984942,
      "grad_norm": 0.5482368116306139,
      "learning_rate": 9.937221959043294e-06,
      "loss": 1.3044,
      "step": 295
    },
    {
      "epoch": 0.1481115773882992,
      "grad_norm": 0.5919271032213149,
      "learning_rate": 9.93022917676932e-06,
      "loss": 1.3131,
      "step": 300
    },
    {
      "epoch": 0.15058010367810418,
      "grad_norm": 0.5428829828459178,
      "learning_rate": 9.922869950654662e-06,
      "loss": 1.2306,
      "step": 305
    },
    {
      "epoch": 0.15304862996790916,
      "grad_norm": 0.5461192699131175,
      "learning_rate": 9.915144827680606e-06,
      "loss": 1.3151,
      "step": 310
    },
    {
      "epoch": 0.15551715625771415,
      "grad_norm": 0.5113904915941117,
      "learning_rate": 9.907054382024058e-06,
      "loss": 1.2813,
      "step": 315
    },
    {
      "epoch": 0.15798568254751913,
      "grad_norm": 0.6272053783824121,
      "learning_rate": 9.898599215014868e-06,
      "loss": 1.3064,
      "step": 320
    },
    {
      "epoch": 0.1604542088373241,
      "grad_norm": 0.5671094073178861,
      "learning_rate": 9.889779955091142e-06,
      "loss": 1.2734,
      "step": 325
    },
    {
      "epoch": 0.1629227351271291,
      "grad_norm": 0.582371136771928,
      "learning_rate": 9.880597257752522e-06,
      "loss": 1.3075,
      "step": 330
    },
    {
      "epoch": 0.16539126141693408,
      "grad_norm": 0.5520015589132342,
      "learning_rate": 9.87105180551148e-06,
      "loss": 1.2802,
      "step": 335
    },
    {
      "epoch": 0.16785978770673907,
      "grad_norm": 0.5937587353133906,
      "learning_rate": 9.861144307842574e-06,
      "loss": 1.2893,
      "step": 340
    },
    {
      "epoch": 0.17032831399654405,
      "grad_norm": 0.5371728696508287,
      "learning_rate": 9.850875501129726e-06,
      "loss": 1.219,
      "step": 345
    },
    {
      "epoch": 0.17279684028634906,
      "grad_norm": 0.5892603164875664,
      "learning_rate": 9.840246148611485e-06,
      "loss": 1.3094,
      "step": 350
    },
    {
      "epoch": 0.17526536657615405,
      "grad_norm": 0.5502008403202052,
      "learning_rate": 9.829257040324308e-06,
      "loss": 1.2543,
      "step": 355
    },
    {
      "epoch": 0.17773389286595903,
      "grad_norm": 0.6273336128612022,
      "learning_rate": 9.817908993043819e-06,
      "loss": 1.3107,
      "step": 360
    },
    {
      "epoch": 0.18020241915576402,
      "grad_norm": 0.5761032807193177,
      "learning_rate": 9.806202850224123e-06,
      "loss": 1.2657,
      "step": 365
    },
    {
      "epoch": 0.182670945445569,
      "grad_norm": 0.5628854954179761,
      "learning_rate": 9.794139481935108e-06,
      "loss": 1.258,
      "step": 370
    },
    {
      "epoch": 0.18513947173537398,
      "grad_norm": 0.5637909618250402,
      "learning_rate": 9.781719784797773e-06,
      "loss": 1.2406,
      "step": 375
    },
    {
      "epoch": 0.18760799802517897,
      "grad_norm": 0.5212794091813217,
      "learning_rate": 9.768944681917582e-06,
      "loss": 1.2391,
      "step": 380
    },
    {
      "epoch": 0.19007652431498395,
      "grad_norm": 0.6416799620777229,
      "learning_rate": 9.755815122815871e-06,
      "loss": 1.3188,
      "step": 385
    },
    {
      "epoch": 0.19254505060478894,
      "grad_norm": 0.5487444911675088,
      "learning_rate": 9.742332083359252e-06,
      "loss": 1.2884,
      "step": 390
    },
    {
      "epoch": 0.19501357689459392,
      "grad_norm": 0.5697317991057302,
      "learning_rate": 9.728496565687096e-06,
      "loss": 1.2798,
      "step": 395
    },
    {
      "epoch": 0.1974821031843989,
      "grad_norm": 0.6703007559314436,
      "learning_rate": 9.714309598137045e-06,
      "loss": 1.249,
      "step": 400
    },
    {
      "epoch": 0.1974821031843989,
      "eval_loss": 1.2516121864318848,
      "eval_runtime": 2575.7168,
      "eval_samples_per_second": 1.553,
      "eval_steps_per_second": 0.13,
      "step": 400
    },
    {
      "epoch": 0.1999506294742039,
      "grad_norm": 0.526231295870319,
      "learning_rate": 9.699772235168572e-06,
      "loss": 1.2554,
      "step": 405
    },
    {
      "epoch": 0.20241915576400887,
      "grad_norm": 0.5513334850915074,
      "learning_rate": 9.68488555728462e-06,
      "loss": 1.2753,
      "step": 410
    },
    {
      "epoch": 0.20488768205381389,
      "grad_norm": 0.5979774809603526,
      "learning_rate": 9.669650670951282e-06,
      "loss": 1.2562,
      "step": 415
    },
    {
      "epoch": 0.20735620834361887,
      "grad_norm": 0.5596269907913185,
      "learning_rate": 9.654068708515564e-06,
      "loss": 1.2829,
      "step": 420
    },
    {
      "epoch": 0.20982473463342385,
      "grad_norm": 0.5593282633769885,
      "learning_rate": 9.638140828121232e-06,
      "loss": 1.2843,
      "step": 425
    },
    {
      "epoch": 0.21229326092322884,
      "grad_norm": 0.5775937654131708,
      "learning_rate": 9.621868213622713e-06,
      "loss": 1.3001,
      "step": 430
    },
    {
      "epoch": 0.21476178721303382,
      "grad_norm": 0.5661901033745343,
      "learning_rate": 9.605252074497125e-06,
      "loss": 1.3038,
      "step": 435
    },
    {
      "epoch": 0.2172303135028388,
      "grad_norm": 0.6132749209816828,
      "learning_rate": 9.588293645754363e-06,
      "loss": 1.2843,
      "step": 440
    },
    {
      "epoch": 0.2196988397926438,
      "grad_norm": 0.5624360623535388,
      "learning_rate": 9.570994187845323e-06,
      "loss": 1.2342,
      "step": 445
    },
    {
      "epoch": 0.22216736608244878,
      "grad_norm": 0.5567610470805882,
      "learning_rate": 9.553354986568201e-06,
      "loss": 1.2955,
      "step": 450
    },
    {
      "epoch": 0.22463589237225376,
      "grad_norm": 0.6255724221196046,
      "learning_rate": 9.53537735297294e-06,
      "loss": 1.2921,
      "step": 455
    },
    {
      "epoch": 0.22710441866205874,
      "grad_norm": 0.5322242379012073,
      "learning_rate": 9.517062623263768e-06,
      "loss": 1.3011,
      "step": 460
    },
    {
      "epoch": 0.22957294495186373,
      "grad_norm": 0.5444205798338807,
      "learning_rate": 9.498412158699905e-06,
      "loss": 1.2733,
      "step": 465
    },
    {
      "epoch": 0.2320414712416687,
      "grad_norm": 0.5426713243893322,
      "learning_rate": 9.479427345494366e-06,
      "loss": 1.2312,
      "step": 470
    },
    {
      "epoch": 0.23450999753147372,
      "grad_norm": 0.5871783813919782,
      "learning_rate": 9.460109594710942e-06,
      "loss": 1.3655,
      "step": 475
    },
    {
      "epoch": 0.2369785238212787,
      "grad_norm": 0.574852380091512,
      "learning_rate": 9.440460342159314e-06,
      "loss": 1.2915,
      "step": 480
    },
    {
      "epoch": 0.2394470501110837,
      "grad_norm": 0.5336092545421678,
      "learning_rate": 9.42048104828834e-06,
      "loss": 1.2963,
      "step": 485
    },
    {
      "epoch": 0.24191557640088868,
      "grad_norm": 0.5998428802300876,
      "learning_rate": 9.40017319807751e-06,
      "loss": 1.3058,
      "step": 490
    },
    {
      "epoch": 0.24438410269069366,
      "grad_norm": 0.5421507806800733,
      "learning_rate": 9.379538300926553e-06,
      "loss": 1.2881,
      "step": 495
    },
    {
      "epoch": 0.24685262898049865,
      "grad_norm": 0.5358621498972941,
      "learning_rate": 9.358577890543277e-06,
      "loss": 1.2602,
      "step": 500
    },
    {
      "epoch": 0.24932115527030363,
      "grad_norm": 0.564112204428148,
      "learning_rate": 9.33729352482956e-06,
      "loss": 1.279,
      "step": 505
    },
    {
      "epoch": 0.2517896815601086,
      "grad_norm": 0.6382679375882034,
      "learning_rate": 9.315686785765556e-06,
      "loss": 1.2534,
      "step": 510
    },
    {
      "epoch": 0.2542582078499136,
      "grad_norm": 0.5744585475791394,
      "learning_rate": 9.293759279292116e-06,
      "loss": 1.2744,
      "step": 515
    },
    {
      "epoch": 0.2567267341397186,
      "grad_norm": 0.615942623926986,
      "learning_rate": 9.271512635191427e-06,
      "loss": 1.3055,
      "step": 520
    },
    {
      "epoch": 0.25919526042952357,
      "grad_norm": 0.5780670121734512,
      "learning_rate": 9.248948506965877e-06,
      "loss": 1.3175,
      "step": 525
    },
    {
      "epoch": 0.26166378671932855,
      "grad_norm": 0.5777138377025286,
      "learning_rate": 9.22606857171515e-06,
      "loss": 1.2869,
      "step": 530
    },
    {
      "epoch": 0.26413231300913353,
      "grad_norm": 0.5611724611846367,
      "learning_rate": 9.202874530011583e-06,
      "loss": 1.3199,
      "step": 535
    },
    {
      "epoch": 0.2666008392989385,
      "grad_norm": 0.540794710590132,
      "learning_rate": 9.179368105773768e-06,
      "loss": 1.208,
      "step": 540
    },
    {
      "epoch": 0.2690693655887435,
      "grad_norm": 0.5581497544995145,
      "learning_rate": 9.155551046138408e-06,
      "loss": 1.2638,
      "step": 545
    },
    {
      "epoch": 0.2715378918785485,
      "grad_norm": 0.560865648598851,
      "learning_rate": 9.131425121330477e-06,
      "loss": 1.2629,
      "step": 550
    },
    {
      "epoch": 0.27400641816835347,
      "grad_norm": 0.5458754463390333,
      "learning_rate": 9.10699212453164e-06,
      "loss": 1.2578,
      "step": 555
    },
    {
      "epoch": 0.27647494445815846,
      "grad_norm": 0.5468153448281193,
      "learning_rate": 9.082253871746962e-06,
      "loss": 1.2488,
      "step": 560
    },
    {
      "epoch": 0.27894347074796344,
      "grad_norm": 0.6168084406611584,
      "learning_rate": 9.057212201669952e-06,
      "loss": 1.2931,
      "step": 565
    },
    {
      "epoch": 0.2814119970377685,
      "grad_norm": 0.5767023372783159,
      "learning_rate": 9.031868975545884e-06,
      "loss": 1.2267,
      "step": 570
    },
    {
      "epoch": 0.28388052332757346,
      "grad_norm": 0.5315895904457054,
      "learning_rate": 9.006226077033464e-06,
      "loss": 1.2463,
      "step": 575
    },
    {
      "epoch": 0.28634904961737845,
      "grad_norm": 0.5616058952533509,
      "learning_rate": 8.980285412064827e-06,
      "loss": 1.287,
      "step": 580
    },
    {
      "epoch": 0.28881757590718343,
      "grad_norm": 0.5746998443271042,
      "learning_rate": 8.954048908703873e-06,
      "loss": 1.2929,
      "step": 585
    },
    {
      "epoch": 0.2912861021969884,
      "grad_norm": 0.5551746835964705,
      "learning_rate": 8.92751851700297e-06,
      "loss": 1.298,
      "step": 590
    },
    {
      "epoch": 0.2937546284867934,
      "grad_norm": 0.578564867995815,
      "learning_rate": 8.900696208857996e-06,
      "loss": 1.2973,
      "step": 595
    },
    {
      "epoch": 0.2962231547765984,
      "grad_norm": 0.5925663520696334,
      "learning_rate": 8.873583977861802e-06,
      "loss": 1.2514,
      "step": 600
    },
    {
      "epoch": 0.2962231547765984,
      "eval_loss": 1.2368682622909546,
      "eval_runtime": 2566.7596,
      "eval_samples_per_second": 1.558,
      "eval_steps_per_second": 0.13,
      "step": 600
    },
    {
      "epoch": 0.29869168106640337,
      "grad_norm": 0.5605310856508363,
      "learning_rate": 8.846183839156015e-06,
      "loss": 1.286,
      "step": 605
    },
    {
      "epoch": 0.30116020735620835,
      "grad_norm": 0.6632798685747615,
      "learning_rate": 8.818497829281272e-06,
      "loss": 1.2916,
      "step": 610
    },
    {
      "epoch": 0.30362873364601334,
      "grad_norm": 0.6145012170463651,
      "learning_rate": 8.790528006025848e-06,
      "loss": 1.2788,
      "step": 615
    },
    {
      "epoch": 0.3060972599358183,
      "grad_norm": 0.6017170291600934,
      "learning_rate": 8.762276448272709e-06,
      "loss": 1.3156,
      "step": 620
    },
    {
      "epoch": 0.3085657862256233,
      "grad_norm": 0.5728547538871892,
      "learning_rate": 8.733745255844996e-06,
      "loss": 1.2592,
      "step": 625
    },
    {
      "epoch": 0.3110343125154283,
      "grad_norm": 0.558142508046803,
      "learning_rate": 8.70493654934996e-06,
      "loss": 1.309,
      "step": 630
    },
    {
      "epoch": 0.3135028388052333,
      "grad_norm": 0.5596812007471911,
      "learning_rate": 8.675852470021344e-06,
      "loss": 1.2746,
      "step": 635
    },
    {
      "epoch": 0.31597136509503826,
      "grad_norm": 0.5909265132847957,
      "learning_rate": 8.646495179560221e-06,
      "loss": 1.2686,
      "step": 640
    },
    {
      "epoch": 0.31843989138484324,
      "grad_norm": 0.6185942591784858,
      "learning_rate": 8.616866859974344e-06,
      "loss": 1.2759,
      "step": 645
    },
    {
      "epoch": 0.3209084176746482,
      "grad_norm": 0.6157204431679958,
      "learning_rate": 8.586969713415949e-06,
      "loss": 1.2957,
      "step": 650
    },
    {
      "epoch": 0.3233769439644532,
      "grad_norm": 0.5974197754755597,
      "learning_rate": 8.556805962018091e-06,
      "loss": 1.27,
      "step": 655
    },
    {
      "epoch": 0.3258454702542582,
      "grad_norm": 0.5389440161380957,
      "learning_rate": 8.526377847729475e-06,
      "loss": 1.2925,
      "step": 660
    },
    {
      "epoch": 0.3283139965440632,
      "grad_norm": 0.5370983741740369,
      "learning_rate": 8.495687632147817e-06,
      "loss": 1.2522,
      "step": 665
    },
    {
      "epoch": 0.33078252283386816,
      "grad_norm": 0.5639132359450145,
      "learning_rate": 8.46473759635176e-06,
      "loss": 1.2595,
      "step": 670
    },
    {
      "epoch": 0.33325104912367315,
      "grad_norm": 0.5598705018251675,
      "learning_rate": 8.433530040731321e-06,
      "loss": 1.2746,
      "step": 675
    },
    {
      "epoch": 0.33571957541347813,
      "grad_norm": 0.6303186487688077,
      "learning_rate": 8.402067284816919e-06,
      "loss": 1.2701,
      "step": 680
    },
    {
      "epoch": 0.3381881017032831,
      "grad_norm": 0.562747309348665,
      "learning_rate": 8.370351667106969e-06,
      "loss": 1.2305,
      "step": 685
    },
    {
      "epoch": 0.3406566279930881,
      "grad_norm": 0.5720387765798051,
      "learning_rate": 8.338385544894073e-06,
      "loss": 1.2047,
      "step": 690
    },
    {
      "epoch": 0.3431251542828931,
      "grad_norm": 0.5465830505695308,
      "learning_rate": 8.306171294089808e-06,
      "loss": 1.2507,
      "step": 695
    },
    {
      "epoch": 0.3455936805726981,
      "grad_norm": 0.5572297207326813,
      "learning_rate": 8.273711309048145e-06,
      "loss": 1.2599,
      "step": 700
    },
    {
      "epoch": 0.3480622068625031,
      "grad_norm": 0.5916945311296786,
      "learning_rate": 8.241008002387474e-06,
      "loss": 1.2615,
      "step": 705
    },
    {
      "epoch": 0.3505307331523081,
      "grad_norm": 0.6326075200444886,
      "learning_rate": 8.208063804811293e-06,
      "loss": 1.2559,
      "step": 710
    },
    {
      "epoch": 0.3529992594421131,
      "grad_norm": 0.6229843020575793,
      "learning_rate": 8.174881164927535e-06,
      "loss": 1.2652,
      "step": 715
    },
    {
      "epoch": 0.35546778573191806,
      "grad_norm": 0.5926153932237264,
      "learning_rate": 8.141462549066581e-06,
      "loss": 1.2423,
      "step": 720
    },
    {
      "epoch": 0.35793631202172305,
      "grad_norm": 0.5293071287095781,
      "learning_rate": 8.107810441097948e-06,
      "loss": 1.2185,
      "step": 725
    },
    {
      "epoch": 0.36040483831152803,
      "grad_norm": 0.5950082298726722,
      "learning_rate": 8.073927342245663e-06,
      "loss": 1.2458,
      "step": 730
    },
    {
      "epoch": 0.362873364601333,
      "grad_norm": 0.5437872955630408,
      "learning_rate": 8.039815770902368e-06,
      "loss": 1.2699,
      "step": 735
    },
    {
      "epoch": 0.365341890891138,
      "grad_norm": 0.5842632003875607,
      "learning_rate": 8.005478262442132e-06,
      "loss": 1.2489,
      "step": 740
    },
    {
      "epoch": 0.367810417180943,
      "grad_norm": 0.5957543279120926,
      "learning_rate": 7.970917369032011e-06,
      "loss": 1.2808,
      "step": 745
    },
    {
      "epoch": 0.37027894347074797,
      "grad_norm": 0.5573632520708609,
      "learning_rate": 7.936135659442355e-06,
      "loss": 1.2394,
      "step": 750
    },
    {
      "epoch": 0.37274746976055295,
      "grad_norm": 0.5383442104756702,
      "learning_rate": 7.901135718855877e-06,
      "loss": 1.2584,
      "step": 755
    },
    {
      "epoch": 0.37521599605035794,
      "grad_norm": 0.5269547291918393,
      "learning_rate": 7.86592014867551e-06,
      "loss": 1.32,
      "step": 760
    },
    {
      "epoch": 0.3776845223401629,
      "grad_norm": 0.6059173481615415,
      "learning_rate": 7.830491566331063e-06,
      "loss": 1.2705,
      "step": 765
    },
    {
      "epoch": 0.3801530486299679,
      "grad_norm": 0.5905241537228486,
      "learning_rate": 7.794852605084661e-06,
      "loss": 1.2661,
      "step": 770
    },
    {
      "epoch": 0.3826215749197729,
      "grad_norm": 0.6119492506708828,
      "learning_rate": 7.759005913835048e-06,
      "loss": 1.2573,
      "step": 775
    },
    {
      "epoch": 0.3850901012095779,
      "grad_norm": 0.6449864393640712,
      "learning_rate": 7.722954156920675e-06,
      "loss": 1.2681,
      "step": 780
    },
    {
      "epoch": 0.38755862749938286,
      "grad_norm": 0.5777516112864801,
      "learning_rate": 7.686700013921704e-06,
      "loss": 1.2999,
      "step": 785
    },
    {
      "epoch": 0.39002715378918784,
      "grad_norm": 0.5818063096150684,
      "learning_rate": 7.650246179460826e-06,
      "loss": 1.2842,
      "step": 790
    },
    {
      "epoch": 0.3924956800789928,
      "grad_norm": 0.5844315528318011,
      "learning_rate": 7.613595363002977e-06,
      "loss": 1.2995,
      "step": 795
    },
    {
      "epoch": 0.3949642063687978,
      "grad_norm": 0.5560255613889942,
      "learning_rate": 7.57675028865397e-06,
      "loss": 1.275,
      "step": 800
    },
    {
      "epoch": 0.3949642063687978,
      "eval_loss": 1.2263342142105103,
      "eval_runtime": 2463.6634,
      "eval_samples_per_second": 1.624,
      "eval_steps_per_second": 0.136,
      "step": 800
    },
    {
      "epoch": 0.3974327326586028,
      "grad_norm": 0.5523940138743026,
      "learning_rate": 7.539713694958013e-06,
      "loss": 1.2202,
      "step": 805
    },
    {
      "epoch": 0.3999012589484078,
      "grad_norm": 0.5936001183365429,
      "learning_rate": 7.502488334694167e-06,
      "loss": 1.2444,
      "step": 810
    },
    {
      "epoch": 0.40236978523821276,
      "grad_norm": 0.6143038376732798,
      "learning_rate": 7.465076974671739e-06,
      "loss": 1.2032,
      "step": 815
    },
    {
      "epoch": 0.40483831152801775,
      "grad_norm": 0.5865451493919344,
      "learning_rate": 7.427482395524646e-06,
      "loss": 1.2733,
      "step": 820
    },
    {
      "epoch": 0.4073068378178228,
      "grad_norm": 0.5980943581114722,
      "learning_rate": 7.389707391504728e-06,
      "loss": 1.2732,
      "step": 825
    },
    {
      "epoch": 0.40977536410762777,
      "grad_norm": 0.6323487686008166,
      "learning_rate": 7.35175477027408e-06,
      "loss": 1.244,
      "step": 830
    },
    {
      "epoch": 0.41224389039743276,
      "grad_norm": 0.6562081554973773,
      "learning_rate": 7.313627352696353e-06,
      "loss": 1.2642,
      "step": 835
    },
    {
      "epoch": 0.41471241668723774,
      "grad_norm": 0.5554470118072983,
      "learning_rate": 7.2753279726271e-06,
      "loss": 1.2556,
      "step": 840
    },
    {
      "epoch": 0.4171809429770427,
      "grad_norm": 0.5740654163988275,
      "learning_rate": 7.236859476703148e-06,
      "loss": 1.2292,
      "step": 845
    },
    {
      "epoch": 0.4196494692668477,
      "grad_norm": 0.6062582969566837,
      "learning_rate": 7.198224724131012e-06,
      "loss": 1.235,
      "step": 850
    },
    {
      "epoch": 0.4221179955566527,
      "grad_norm": 0.5434614048201878,
      "learning_rate": 7.159426586474388e-06,
      "loss": 1.2224,
      "step": 855
    },
    {
      "epoch": 0.4245865218464577,
      "grad_norm": 0.5254561702235886,
      "learning_rate": 7.120467947440719e-06,
      "loss": 1.2557,
      "step": 860
    },
    {
      "epoch": 0.42705504813626266,
      "grad_norm": 0.5713031391494172,
      "learning_rate": 7.081351702666863e-06,
      "loss": 1.2063,
      "step": 865
    },
    {
      "epoch": 0.42952357442606764,
      "grad_norm": 0.5969980245366532,
      "learning_rate": 7.042080759503866e-06,
      "loss": 1.2418,
      "step": 870
    },
    {
      "epoch": 0.43199210071587263,
      "grad_norm": 0.5718940130718101,
      "learning_rate": 7.00265803680088e-06,
      "loss": 1.2108,
      "step": 875
    },
    {
      "epoch": 0.4344606270056776,
      "grad_norm": 0.6045555591926912,
      "learning_rate": 6.963086464688209e-06,
      "loss": 1.2597,
      "step": 880
    },
    {
      "epoch": 0.4369291532954826,
      "grad_norm": 0.5566709780037437,
      "learning_rate": 6.923368984359526e-06,
      "loss": 1.2174,
      "step": 885
    },
    {
      "epoch": 0.4393976795852876,
      "grad_norm": 0.5630200258106689,
      "learning_rate": 6.883508547853268e-06,
      "loss": 1.2244,
      "step": 890
    },
    {
      "epoch": 0.44186620587509257,
      "grad_norm": 0.5348314552481888,
      "learning_rate": 6.843508117833224e-06,
      "loss": 1.2687,
      "step": 895
    },
    {
      "epoch": 0.44433473216489755,
      "grad_norm": 0.49625311943608336,
      "learning_rate": 6.8033706673683276e-06,
      "loss": 1.1986,
      "step": 900
    },
    {
      "epoch": 0.44680325845470253,
      "grad_norm": 0.5542218838145379,
      "learning_rate": 6.763099179711685e-06,
      "loss": 1.2286,
      "step": 905
    },
    {
      "epoch": 0.4492717847445075,
      "grad_norm": 0.594098893943127,
      "learning_rate": 6.722696648078838e-06,
      "loss": 1.2335,
      "step": 910
    },
    {
      "epoch": 0.4517403110343125,
      "grad_norm": 0.5478077068384012,
      "learning_rate": 6.682166075425298e-06,
      "loss": 1.264,
      "step": 915
    },
    {
      "epoch": 0.4542088373241175,
      "grad_norm": 0.5727528301850252,
      "learning_rate": 6.641510474223338e-06,
      "loss": 1.226,
      "step": 920
    },
    {
      "epoch": 0.45667736361392247,
      "grad_norm": 0.5888269073825134,
      "learning_rate": 6.600732866238097e-06,
      "loss": 1.212,
      "step": 925
    },
    {
      "epoch": 0.45914588990372746,
      "grad_norm": 0.5736288265128395,
      "learning_rate": 6.559836282302984e-06,
      "loss": 1.25,
      "step": 930
    },
    {
      "epoch": 0.46161441619353244,
      "grad_norm": 0.6651036803926929,
      "learning_rate": 6.5188237620943965e-06,
      "loss": 1.2672,
      "step": 935
    },
    {
      "epoch": 0.4640829424833374,
      "grad_norm": 0.5547382454730273,
      "learning_rate": 6.477698353905808e-06,
      "loss": 1.2887,
      "step": 940
    },
    {
      "epoch": 0.4665514687731424,
      "grad_norm": 0.5627833712727636,
      "learning_rate": 6.436463114421199e-06,
      "loss": 1.2674,
      "step": 945
    },
    {
      "epoch": 0.46901999506294745,
      "grad_norm": 0.5562108977867529,
      "learning_rate": 6.395121108487855e-06,
      "loss": 1.2973,
      "step": 950
    },
    {
      "epoch": 0.47148852135275243,
      "grad_norm": 0.5940300188918287,
      "learning_rate": 6.353675408888582e-06,
      "loss": 1.278,
      "step": 955
    },
    {
      "epoch": 0.4739570476425574,
      "grad_norm": 0.6499724681591359,
      "learning_rate": 6.312129096113313e-06,
      "loss": 1.242,
      "step": 960
    },
    {
      "epoch": 0.4764255739323624,
      "grad_norm": 0.5794092582819724,
      "learning_rate": 6.270485258130146e-06,
      "loss": 1.2263,
      "step": 965
    },
    {
      "epoch": 0.4788941002221674,
      "grad_norm": 0.5810005883829364,
      "learning_rate": 6.228746990155831e-06,
      "loss": 1.2166,
      "step": 970
    },
    {
      "epoch": 0.48136262651197237,
      "grad_norm": 0.5523321758038612,
      "learning_rate": 6.186917394425715e-06,
      "loss": 1.2666,
      "step": 975
    },
    {
      "epoch": 0.48383115280177735,
      "grad_norm": 0.5353766340095819,
      "learning_rate": 6.144999579963164e-06,
      "loss": 1.2332,
      "step": 980
    },
    {
      "epoch": 0.48629967909158234,
      "grad_norm": 0.5962559333577797,
      "learning_rate": 6.102996662348485e-06,
      "loss": 1.2985,
      "step": 985
    },
    {
      "epoch": 0.4887682053813873,
      "grad_norm": 0.573508927377536,
      "learning_rate": 6.060911763487353e-06,
      "loss": 1.2353,
      "step": 990
    },
    {
      "epoch": 0.4912367316711923,
      "grad_norm": 0.6190411186907346,
      "learning_rate": 6.0187480113787765e-06,
      "loss": 1.2668,
      "step": 995
    },
    {
      "epoch": 0.4937052579609973,
      "grad_norm": 0.537107101144104,
      "learning_rate": 5.976508539882604e-06,
      "loss": 1.1984,
      "step": 1000
    },
    {
      "epoch": 0.4937052579609973,
      "eval_loss": 1.2196881771087646,
      "eval_runtime": 2373.8686,
      "eval_samples_per_second": 1.685,
      "eval_steps_per_second": 0.141,
      "step": 1000
    },
    {
      "epoch": 0.4961737842508023,
      "grad_norm": 0.5673334311067016,
      "learning_rate": 5.934196488486594e-06,
      "loss": 1.2573,
      "step": 1005
    },
    {
      "epoch": 0.49864231054060726,
      "grad_norm": 0.6141102747872601,
      "learning_rate": 5.891815002073081e-06,
      "loss": 1.2776,
      "step": 1010
    },
    {
      "epoch": 0.5011108368304122,
      "grad_norm": 0.5866475421501153,
      "learning_rate": 5.849367230685214e-06,
      "loss": 1.2139,
      "step": 1015
    },
    {
      "epoch": 0.5035793631202172,
      "grad_norm": 0.5973223110810923,
      "learning_rate": 5.806856329292839e-06,
      "loss": 1.2809,
      "step": 1020
    },
    {
      "epoch": 0.5060478894100222,
      "grad_norm": 0.6385978269750231,
      "learning_rate": 5.764285457557994e-06,
      "loss": 1.2511,
      "step": 1025
    },
    {
      "epoch": 0.5085164156998272,
      "grad_norm": 0.5607340345191899,
      "learning_rate": 5.721657779600071e-06,
      "loss": 1.2421,
      "step": 1030
    },
    {
      "epoch": 0.5109849419896322,
      "grad_norm": 0.5444555426859482,
      "learning_rate": 5.678976463760635e-06,
      "loss": 1.2561,
      "step": 1035
    },
    {
      "epoch": 0.5134534682794372,
      "grad_norm": 0.5663913305474535,
      "learning_rate": 5.636244682367937e-06,
      "loss": 1.2324,
      "step": 1040
    },
    {
      "epoch": 0.5159219945692421,
      "grad_norm": 0.6001697304401695,
      "learning_rate": 5.593465611501127e-06,
      "loss": 1.2206,
      "step": 1045
    },
    {
      "epoch": 0.5183905208590471,
      "grad_norm": 0.5922209574486257,
      "learning_rate": 5.5506424307541895e-06,
      "loss": 1.2777,
      "step": 1050
    },
    {
      "epoch": 0.5208590471488521,
      "grad_norm": 0.5810845811643376,
      "learning_rate": 5.507778322999615e-06,
      "loss": 1.2186,
      "step": 1055
    },
    {
      "epoch": 0.5233275734386571,
      "grad_norm": 0.5661815755139697,
      "learning_rate": 5.464876474151835e-06,
      "loss": 1.2465,
      "step": 1060
    },
    {
      "epoch": 0.5257960997284621,
      "grad_norm": 0.6016645517449551,
      "learning_rate": 5.421940072930415e-06,
      "loss": 1.2269,
      "step": 1065
    },
    {
      "epoch": 0.5282646260182671,
      "grad_norm": 0.6268744087157316,
      "learning_rate": 5.3789723106230675e-06,
      "loss": 1.2089,
      "step": 1070
    },
    {
      "epoch": 0.530733152308072,
      "grad_norm": 0.5374231313658383,
      "learning_rate": 5.3359763808484396e-06,
      "loss": 1.2371,
      "step": 1075
    },
    {
      "epoch": 0.533201678597877,
      "grad_norm": 0.5696825743006079,
      "learning_rate": 5.292955479318756e-06,
      "loss": 1.2288,
      "step": 1080
    },
    {
      "epoch": 0.535670204887682,
      "grad_norm": 0.5474403893705062,
      "learning_rate": 5.249912803602287e-06,
      "loss": 1.2631,
      "step": 1085
    },
    {
      "epoch": 0.538138731177487,
      "grad_norm": 0.611438366860115,
      "learning_rate": 5.206851552885691e-06,
      "loss": 1.2395,
      "step": 1090
    },
    {
      "epoch": 0.540607257467292,
      "grad_norm": 0.6437738368971478,
      "learning_rate": 5.163774927736228e-06,
      "loss": 1.3132,
      "step": 1095
    },
    {
      "epoch": 0.543075783757097,
      "grad_norm": 0.5438676695949717,
      "learning_rate": 5.120686129863882e-06,
      "loss": 1.2807,
      "step": 1100
    },
    {
      "epoch": 0.545544310046902,
      "grad_norm": 0.6135072081701597,
      "learning_rate": 5.077588361883379e-06,
      "loss": 1.2239,
      "step": 1105
    },
    {
      "epoch": 0.5480128363367069,
      "grad_norm": 0.546701645842348,
      "learning_rate": 5.0344848270761635e-06,
      "loss": 1.2121,
      "step": 1110
    },
    {
      "epoch": 0.5504813626265119,
      "grad_norm": 0.6153049309551597,
      "learning_rate": 4.9913787291523e-06,
      "loss": 1.2832,
      "step": 1115
    },
    {
      "epoch": 0.5529498889163169,
      "grad_norm": 0.6148368644966669,
      "learning_rate": 4.948273272012363e-06,
      "loss": 1.2536,
      "step": 1120
    },
    {
      "epoch": 0.5554184152061219,
      "grad_norm": 0.5911800001869699,
      "learning_rate": 4.905171659509294e-06,
      "loss": 1.2789,
      "step": 1125
    },
    {
      "epoch": 0.5578869414959269,
      "grad_norm": 0.5450128065258734,
      "learning_rate": 4.862077095210284e-06,
      "loss": 1.1595,
      "step": 1130
    },
    {
      "epoch": 0.5603554677857319,
      "grad_norm": 0.5629093671549396,
      "learning_rate": 4.818992782158658e-06,
      "loss": 1.2854,
      "step": 1135
    },
    {
      "epoch": 0.562823994075537,
      "grad_norm": 0.6634778146032412,
      "learning_rate": 4.775921922635806e-06,
      "loss": 1.2405,
      "step": 1140
    },
    {
      "epoch": 0.5652925203653419,
      "grad_norm": 0.5439361692157106,
      "learning_rate": 4.732867717923174e-06,
      "loss": 1.265,
      "step": 1145
    },
    {
      "epoch": 0.5677610466551469,
      "grad_norm": 0.5860651769650387,
      "learning_rate": 4.689833368064326e-06,
      "loss": 1.2511,
      "step": 1150
    },
    {
      "epoch": 0.5702295729449519,
      "grad_norm": 0.627265270599233,
      "learning_rate": 4.646822071627089e-06,
      "loss": 1.2813,
      "step": 1155
    },
    {
      "epoch": 0.5726980992347569,
      "grad_norm": 0.5634927900565491,
      "learning_rate": 4.603837025465829e-06,
      "loss": 1.22,
      "step": 1160
    },
    {
      "epoch": 0.5751666255245619,
      "grad_norm": 0.6482363315867818,
      "learning_rate": 4.560881424483833e-06,
      "loss": 1.3095,
      "step": 1165
    },
    {
      "epoch": 0.5776351518143669,
      "grad_norm": 0.4805380958857345,
      "learning_rate": 4.517958461395846e-06,
      "loss": 1.2737,
      "step": 1170
    },
    {
      "epoch": 0.5801036781041718,
      "grad_norm": 0.5854150858325277,
      "learning_rate": 4.475071326490781e-06,
      "loss": 1.2282,
      "step": 1175
    },
    {
      "epoch": 0.5825722043939768,
      "grad_norm": 0.554230131541799,
      "learning_rate": 4.432223207394577e-06,
      "loss": 1.178,
      "step": 1180
    },
    {
      "epoch": 0.5850407306837818,
      "grad_norm": 0.6930360615517788,
      "learning_rate": 4.389417288833292e-06,
      "loss": 1.2781,
      "step": 1185
    },
    {
      "epoch": 0.5875092569735868,
      "grad_norm": 0.6042088339838697,
      "learning_rate": 4.346656752396388e-06,
      "loss": 1.2813,
      "step": 1190
    },
    {
      "epoch": 0.5899777832633918,
      "grad_norm": 0.6280387565672664,
      "learning_rate": 4.303944776300262e-06,
      "loss": 1.2433,
      "step": 1195
    },
    {
      "epoch": 0.5924463095531968,
      "grad_norm": 0.5502891803034431,
      "learning_rate": 4.261284535152016e-06,
      "loss": 1.1556,
      "step": 1200
    },
    {
      "epoch": 0.5924463095531968,
      "eval_loss": 1.2148913145065308,
      "eval_runtime": 2558.7024,
      "eval_samples_per_second": 1.563,
      "eval_steps_per_second": 0.131,
      "step": 1200
    },
    {
      "epoch": 0.5949148358430018,
      "grad_norm": 0.5429417971755677,
      "learning_rate": 4.218679199713505e-06,
      "loss": 1.2398,
      "step": 1205
    },
    {
      "epoch": 0.5973833621328067,
      "grad_norm": 0.5573592415141271,
      "learning_rate": 4.176131936665669e-06,
      "loss": 1.2348,
      "step": 1210
    },
    {
      "epoch": 0.5998518884226117,
      "grad_norm": 0.5662130620287456,
      "learning_rate": 4.133645908373159e-06,
      "loss": 1.1894,
      "step": 1215
    },
    {
      "epoch": 0.6023204147124167,
      "grad_norm": 0.5330337777111593,
      "learning_rate": 4.0912242726493e-06,
      "loss": 1.267,
      "step": 1220
    },
    {
      "epoch": 0.6047889410022217,
      "grad_norm": 0.589763462299109,
      "learning_rate": 4.048870182521374e-06,
      "loss": 1.2461,
      "step": 1225
    },
    {
      "epoch": 0.6072574672920267,
      "grad_norm": 0.5798241574940401,
      "learning_rate": 4.006586785996285e-06,
      "loss": 1.2503,
      "step": 1230
    },
    {
      "epoch": 0.6097259935818317,
      "grad_norm": 0.5714021679563045,
      "learning_rate": 3.96437722582656e-06,
      "loss": 1.2322,
      "step": 1235
    },
    {
      "epoch": 0.6121945198716366,
      "grad_norm": 0.5926307509257247,
      "learning_rate": 3.922244639276773e-06,
      "loss": 1.2692,
      "step": 1240
    },
    {
      "epoch": 0.6146630461614416,
      "grad_norm": 0.6016557090563102,
      "learning_rate": 3.880192157890365e-06,
      "loss": 1.2642,
      "step": 1245
    },
    {
      "epoch": 0.6171315724512466,
      "grad_norm": 0.5454381088492659,
      "learning_rate": 3.838222907256884e-06,
      "loss": 1.239,
      "step": 1250
    },
    {
      "epoch": 0.6196000987410516,
      "grad_norm": 0.5582749852816064,
      "learning_rate": 3.7963400067796774e-06,
      "loss": 1.2851,
      "step": 1255
    },
    {
      "epoch": 0.6220686250308566,
      "grad_norm": 0.5562967849735465,
      "learning_rate": 3.7545465694440363e-06,
      "loss": 1.2432,
      "step": 1260
    },
    {
      "epoch": 0.6245371513206616,
      "grad_norm": 0.5419669962437569,
      "learning_rate": 3.7128457015858198e-06,
      "loss": 1.2103,
      "step": 1265
    },
    {
      "epoch": 0.6270056776104665,
      "grad_norm": 0.558873424565738,
      "learning_rate": 3.6712405026605792e-06,
      "loss": 1.2388,
      "step": 1270
    },
    {
      "epoch": 0.6294742039002715,
      "grad_norm": 0.5712282397945332,
      "learning_rate": 3.6297340650131785e-06,
      "loss": 1.2819,
      "step": 1275
    },
    {
      "epoch": 0.6319427301900765,
      "grad_norm": 0.5643697726223241,
      "learning_rate": 3.5883294736479612e-06,
      "loss": 1.2386,
      "step": 1280
    },
    {
      "epoch": 0.6344112564798815,
      "grad_norm": 0.6332020317807455,
      "learning_rate": 3.5470298059994545e-06,
      "loss": 1.2677,
      "step": 1285
    },
    {
      "epoch": 0.6368797827696865,
      "grad_norm": 0.6276157822500693,
      "learning_rate": 3.5058381317036285e-06,
      "loss": 1.2137,
      "step": 1290
    },
    {
      "epoch": 0.6393483090594915,
      "grad_norm": 0.5139753708360036,
      "learning_rate": 3.46475751236975e-06,
      "loss": 1.2436,
      "step": 1295
    },
    {
      "epoch": 0.6418168353492965,
      "grad_norm": 0.5868933304811402,
      "learning_rate": 3.423791001352823e-06,
      "loss": 1.1681,
      "step": 1300
    },
    {
      "epoch": 0.6442853616391014,
      "grad_norm": 0.5592137564928078,
      "learning_rate": 3.382941643526644e-06,
      "loss": 1.2443,
      "step": 1305
    },
    {
      "epoch": 0.6467538879289064,
      "grad_norm": 0.567548616583169,
      "learning_rate": 3.3422124750574902e-06,
      "loss": 1.2604,
      "step": 1310
    },
    {
      "epoch": 0.6492224142187114,
      "grad_norm": 0.568882999500645,
      "learning_rate": 3.3016065231784587e-06,
      "loss": 1.1595,
      "step": 1315
    },
    {
      "epoch": 0.6516909405085164,
      "grad_norm": 0.628304707671549,
      "learning_rate": 3.2611268059644535e-06,
      "loss": 1.2841,
      "step": 1320
    },
    {
      "epoch": 0.6541594667983214,
      "grad_norm": 0.5686219665932154,
      "learning_rate": 3.2207763321078737e-06,
      "loss": 1.2347,
      "step": 1325
    },
    {
      "epoch": 0.6566279930881264,
      "grad_norm": 0.6424587872522304,
      "learning_rate": 3.1805581006949856e-06,
      "loss": 1.2329,
      "step": 1330
    },
    {
      "epoch": 0.6590965193779313,
      "grad_norm": 0.6654374856920555,
      "learning_rate": 3.1404751009830124e-06,
      "loss": 1.2423,
      "step": 1335
    },
    {
      "epoch": 0.6615650456677363,
      "grad_norm": 0.5206675422652753,
      "learning_rate": 3.100530312177956e-06,
      "loss": 1.2329,
      "step": 1340
    },
    {
      "epoch": 0.6640335719575413,
      "grad_norm": 0.6656795155578475,
      "learning_rate": 3.0607267032131704e-06,
      "loss": 1.3062,
      "step": 1345
    },
    {
      "epoch": 0.6665020982473463,
      "grad_norm": 0.6071844948708964,
      "learning_rate": 3.0210672325286806e-06,
      "loss": 1.2656,
      "step": 1350
    },
    {
      "epoch": 0.6689706245371513,
      "grad_norm": 0.6211025479318184,
      "learning_rate": 2.9815548478513034e-06,
      "loss": 1.2167,
      "step": 1355
    },
    {
      "epoch": 0.6714391508269563,
      "grad_norm": 0.5801456765244887,
      "learning_rate": 2.9421924859755525e-06,
      "loss": 1.2249,
      "step": 1360
    },
    {
      "epoch": 0.6739076771167613,
      "grad_norm": 0.564862030285346,
      "learning_rate": 2.9029830725453545e-06,
      "loss": 1.2414,
      "step": 1365
    },
    {
      "epoch": 0.6763762034065662,
      "grad_norm": 0.5538133203567932,
      "learning_rate": 2.8639295218366115e-06,
      "loss": 1.2191,
      "step": 1370
    },
    {
      "epoch": 0.6788447296963712,
      "grad_norm": 0.5925104037633543,
      "learning_rate": 2.8250347365405737e-06,
      "loss": 1.2318,
      "step": 1375
    },
    {
      "epoch": 0.6813132559861762,
      "grad_norm": 0.6173909875052214,
      "learning_rate": 2.78630160754811e-06,
      "loss": 1.2555,
      "step": 1380
    },
    {
      "epoch": 0.6837817822759812,
      "grad_norm": 0.6579800769123958,
      "learning_rate": 2.747733013734835e-06,
      "loss": 1.2553,
      "step": 1385
    },
    {
      "epoch": 0.6862503085657862,
      "grad_norm": 0.6097488788659552,
      "learning_rate": 2.709331821747133e-06,
      "loss": 1.2482,
      "step": 1390
    },
    {
      "epoch": 0.6887188348555913,
      "grad_norm": 0.5717544066297715,
      "learning_rate": 2.6711008857890928e-06,
      "loss": 1.2477,
      "step": 1395
    },
    {
      "epoch": 0.6911873611453963,
      "grad_norm": 0.5675063300875494,
      "learning_rate": 2.63304304741037e-06,
      "loss": 1.2386,
      "step": 1400
    },
    {
      "epoch": 0.6911873611453963,
      "eval_loss": 1.211606740951538,
      "eval_runtime": 2914.6181,
      "eval_samples_per_second": 1.372,
      "eval_steps_per_second": 0.115,
      "step": 1400
    },
    {
      "epoch": 0.6936558874352012,
      "grad_norm": 0.623871781326139,
      "learning_rate": 2.595161135294978e-06,
      "loss": 1.2484,
      "step": 1405
    },
    {
      "epoch": 0.6961244137250062,
      "grad_norm": 0.5967791678571923,
      "learning_rate": 2.55745796505105e-06,
      "loss": 1.2816,
      "step": 1410
    },
    {
      "epoch": 0.6985929400148112,
      "grad_norm": 0.5958918786737188,
      "learning_rate": 2.5199363390015645e-06,
      "loss": 1.2518,
      "step": 1415
    },
    {
      "epoch": 0.7010614663046162,
      "grad_norm": 0.5716469845277612,
      "learning_rate": 2.482599045976059e-06,
      "loss": 1.2518,
      "step": 1420
    },
    {
      "epoch": 0.7035299925944212,
      "grad_norm": 0.5601354887821722,
      "learning_rate": 2.445448861103348e-06,
      "loss": 1.2114,
      "step": 1425
    },
    {
      "epoch": 0.7059985188842262,
      "grad_norm": 0.5783618487395104,
      "learning_rate": 2.408488545605265e-06,
      "loss": 1.2801,
      "step": 1430
    },
    {
      "epoch": 0.7084670451740311,
      "grad_norm": 0.600120666255256,
      "learning_rate": 2.3717208465914193e-06,
      "loss": 1.2928,
      "step": 1435
    },
    {
      "epoch": 0.7109355714638361,
      "grad_norm": 0.6823362059514299,
      "learning_rate": 2.3351484968550264e-06,
      "loss": 1.2306,
      "step": 1440
    },
    {
      "epoch": 0.7134040977536411,
      "grad_norm": 0.5869728269343567,
      "learning_rate": 2.298774214669785e-06,
      "loss": 1.2417,
      "step": 1445
    },
    {
      "epoch": 0.7158726240434461,
      "grad_norm": 0.597629982893601,
      "learning_rate": 2.2626007035878377e-06,
      "loss": 1.1912,
      "step": 1450
    },
    {
      "epoch": 0.7183411503332511,
      "grad_norm": 0.6222473980576229,
      "learning_rate": 2.226630652238836e-06,
      "loss": 1.2083,
      "step": 1455
    },
    {
      "epoch": 0.7208096766230561,
      "grad_norm": 0.5978767327421509,
      "learning_rate": 2.1908667341300923e-06,
      "loss": 1.2577,
      "step": 1460
    },
    {
      "epoch": 0.723278202912861,
      "grad_norm": 0.6156905912164004,
      "learning_rate": 2.155311607447877e-06,
      "loss": 1.2922,
      "step": 1465
    },
    {
      "epoch": 0.725746729202666,
      "grad_norm": 0.6341472520929511,
      "learning_rate": 2.1199679148598434e-06,
      "loss": 1.2667,
      "step": 1470
    },
    {
      "epoch": 0.728215255492471,
      "grad_norm": 0.5655996654676207,
      "learning_rate": 2.084838283318616e-06,
      "loss": 1.1939,
      "step": 1475
    },
    {
      "epoch": 0.730683781782276,
      "grad_norm": 0.5824088027115487,
      "learning_rate": 2.0499253238665284e-06,
      "loss": 1.242,
      "step": 1480
    },
    {
      "epoch": 0.733152308072081,
      "grad_norm": 0.6063388402546945,
      "learning_rate": 2.0152316314415602e-06,
      "loss": 1.2482,
      "step": 1485
    },
    {
      "epoch": 0.735620834361886,
      "grad_norm": 0.6226805122487513,
      "learning_rate": 1.9807597846844737e-06,
      "loss": 1.255,
      "step": 1490
    },
    {
      "epoch": 0.738089360651691,
      "grad_norm": 0.5854379294811827,
      "learning_rate": 1.9465123457471395e-06,
      "loss": 1.1786,
      "step": 1495
    },
    {
      "epoch": 0.7405578869414959,
      "grad_norm": 0.5577052246580572,
      "learning_rate": 1.9124918601021124e-06,
      "loss": 1.2358,
      "step": 1500
    },
    {
      "epoch": 0.7430264132313009,
      "grad_norm": 0.5754079743445688,
      "learning_rate": 1.8787008563534326e-06,
      "loss": 1.1945,
      "step": 1505
    },
    {
      "epoch": 0.7454949395211059,
      "grad_norm": 0.6099556355269008,
      "learning_rate": 1.845141846048691e-06,
      "loss": 1.2379,
      "step": 1510
    },
    {
      "epoch": 0.7479634658109109,
      "grad_norm": 0.5782704010521243,
      "learning_rate": 1.8118173234923447e-06,
      "loss": 1.2542,
      "step": 1515
    },
    {
      "epoch": 0.7504319921007159,
      "grad_norm": 0.5382858254483444,
      "learning_rate": 1.778729765560337e-06,
      "loss": 1.2327,
      "step": 1520
    },
    {
      "epoch": 0.7529005183905209,
      "grad_norm": 0.6082642317550977,
      "learning_rate": 1.7458816315159937e-06,
      "loss": 1.2631,
      "step": 1525
    },
    {
      "epoch": 0.7553690446803258,
      "grad_norm": 0.6120502232540203,
      "learning_rate": 1.7132753628272403e-06,
      "loss": 1.2687,
      "step": 1530
    },
    {
      "epoch": 0.7578375709701308,
      "grad_norm": 0.5800190917782422,
      "learning_rate": 1.6809133829851344e-06,
      "loss": 1.1809,
      "step": 1535
    },
    {
      "epoch": 0.7603060972599358,
      "grad_norm": 0.6248767795672576,
      "learning_rate": 1.6487980973237434e-06,
      "loss": 1.2102,
      "step": 1540
    },
    {
      "epoch": 0.7627746235497408,
      "grad_norm": 0.6214869106372124,
      "learning_rate": 1.6169318928413574e-06,
      "loss": 1.3183,
      "step": 1545
    },
    {
      "epoch": 0.7652431498395458,
      "grad_norm": 0.6509287986960063,
      "learning_rate": 1.5853171380230791e-06,
      "loss": 1.2394,
      "step": 1550
    },
    {
      "epoch": 0.7677116761293508,
      "grad_norm": 0.5548564286839581,
      "learning_rate": 1.5539561826647832e-06,
      "loss": 1.2278,
      "step": 1555
    },
    {
      "epoch": 0.7701802024191557,
      "grad_norm": 0.5873399173100068,
      "learning_rate": 1.5228513576984633e-06,
      "loss": 1.2419,
      "step": 1560
    },
    {
      "epoch": 0.7726487287089607,
      "grad_norm": 0.5698526241039991,
      "learning_rate": 1.4920049750189852e-06,
      "loss": 1.2134,
      "step": 1565
    },
    {
      "epoch": 0.7751172549987657,
      "grad_norm": 0.5462525752885333,
      "learning_rate": 1.4614193273122562e-06,
      "loss": 1.2013,
      "step": 1570
    },
    {
      "epoch": 0.7775857812885707,
      "grad_norm": 0.5604406125512932,
      "learning_rate": 1.4310966878848116e-06,
      "loss": 1.2319,
      "step": 1575
    },
    {
      "epoch": 0.7800543075783757,
      "grad_norm": 0.5512496837811336,
      "learning_rate": 1.401039310494855e-06,
      "loss": 1.2436,
      "step": 1580
    },
    {
      "epoch": 0.7825228338681807,
      "grad_norm": 0.6804998312407946,
      "learning_rate": 1.3712494291847416e-06,
      "loss": 1.2567,
      "step": 1585
    },
    {
      "epoch": 0.7849913601579857,
      "grad_norm": 0.6655723000722049,
      "learning_rate": 1.3417292581149388e-06,
      "loss": 1.2682,
      "step": 1590
    },
    {
      "epoch": 0.7874598864477906,
      "grad_norm": 0.539222744257867,
      "learning_rate": 1.3124809913994458e-06,
      "loss": 1.2009,
      "step": 1595
    },
    {
      "epoch": 0.7899284127375956,
      "grad_norm": 0.622721298212167,
      "learning_rate": 1.2835068029427188e-06,
      "loss": 1.2661,
      "step": 1600
    },
    {
      "epoch": 0.7899284127375956,
      "eval_loss": 1.2096235752105713,
      "eval_runtime": 2576.8943,
      "eval_samples_per_second": 1.552,
      "eval_steps_per_second": 0.13,
      "step": 1600
    },
    {
      "epoch": 0.7923969390274006,
      "grad_norm": 0.5470842930259888,
      "learning_rate": 1.2548088462781006e-06,
      "loss": 1.2244,
      "step": 1605
    },
    {
      "epoch": 0.7948654653172056,
      "grad_norm": 0.5718801309412294,
      "learning_rate": 1.2263892544077439e-06,
      "loss": 1.2498,
      "step": 1610
    },
    {
      "epoch": 0.7973339916070106,
      "grad_norm": 0.5818869817428877,
      "learning_rate": 1.1982501396440831e-06,
      "loss": 1.2044,
      "step": 1615
    },
    {
      "epoch": 0.7998025178968156,
      "grad_norm": 0.5534354350847027,
      "learning_rate": 1.1703935934528327e-06,
      "loss": 1.2328,
      "step": 1620
    },
    {
      "epoch": 0.8022710441866205,
      "grad_norm": 0.5862274808604895,
      "learning_rate": 1.1428216862975383e-06,
      "loss": 1.2741,
      "step": 1625
    },
    {
      "epoch": 0.8047395704764255,
      "grad_norm": 0.5781950796979888,
      "learning_rate": 1.1155364674856834e-06,
      "loss": 1.2679,
      "step": 1630
    },
    {
      "epoch": 0.8072080967662305,
      "grad_norm": 0.5751302301159884,
      "learning_rate": 1.088539965016377e-06,
      "loss": 1.2153,
      "step": 1635
    },
    {
      "epoch": 0.8096766230560355,
      "grad_norm": 0.6150065644184977,
      "learning_rate": 1.0618341854296176e-06,
      "loss": 1.2245,
      "step": 1640
    },
    {
      "epoch": 0.8121451493458405,
      "grad_norm": 0.5893743060234344,
      "learning_rate": 1.0354211136571586e-06,
      "loss": 1.2091,
      "step": 1645
    },
    {
      "epoch": 0.8146136756356456,
      "grad_norm": 0.554001627193442,
      "learning_rate": 1.0093027128749722e-06,
      "loss": 1.22,
      "step": 1650
    },
    {
      "epoch": 0.8170822019254506,
      "grad_norm": 0.5554016650617593,
      "learning_rate": 9.834809243573406e-07,
      "loss": 1.2736,
      "step": 1655
    },
    {
      "epoch": 0.8195507282152555,
      "grad_norm": 0.6467820952863279,
      "learning_rate": 9.57957667332562e-07,
      "loss": 1.2504,
      "step": 1660
    },
    {
      "epoch": 0.8220192545050605,
      "grad_norm": 0.5388841867240308,
      "learning_rate": 9.327348388403063e-07,
      "loss": 1.2134,
      "step": 1665
    },
    {
      "epoch": 0.8244877807948655,
      "grad_norm": 0.5511949198965124,
      "learning_rate": 9.078143135906154e-07,
      "loss": 1.2373,
      "step": 1670
    },
    {
      "epoch": 0.8269563070846705,
      "grad_norm": 0.5662492648467455,
      "learning_rate": 8.831979438245619e-07,
      "loss": 1.2379,
      "step": 1675
    },
    {
      "epoch": 0.8294248333744755,
      "grad_norm": 0.6308948625824087,
      "learning_rate": 8.588875591765838e-07,
      "loss": 1.1868,
      "step": 1680
    },
    {
      "epoch": 0.8318933596642805,
      "grad_norm": 0.576660126030343,
      "learning_rate": 8.348849665384906e-07,
      "loss": 1.2891,
      "step": 1685
    },
    {
      "epoch": 0.8343618859540854,
      "grad_norm": 0.556606789107177,
      "learning_rate": 8.111919499251653e-07,
      "loss": 1.2021,
      "step": 1690
    },
    {
      "epoch": 0.8368304122438904,
      "grad_norm": 0.5661740275037651,
      "learning_rate": 7.878102703419683e-07,
      "loss": 1.2536,
      "step": 1695
    },
    {
      "epoch": 0.8392989385336954,
      "grad_norm": 0.5967205392911274,
      "learning_rate": 7.647416656538464e-07,
      "loss": 1.2373,
      "step": 1700
    },
    {
      "epoch": 0.8417674648235004,
      "grad_norm": 0.5528061162446166,
      "learning_rate": 7.419878504561651e-07,
      "loss": 1.2199,
      "step": 1705
    },
    {
      "epoch": 0.8442359911133054,
      "grad_norm": 0.6479872928308008,
      "learning_rate": 7.195505159472726e-07,
      "loss": 1.2368,
      "step": 1710
    },
    {
      "epoch": 0.8467045174031104,
      "grad_norm": 0.594834011459554,
      "learning_rate": 6.974313298027946e-07,
      "loss": 1.1997,
      "step": 1715
    },
    {
      "epoch": 0.8491730436929154,
      "grad_norm": 0.5442970599231537,
      "learning_rate": 6.756319360516856e-07,
      "loss": 1.2037,
      "step": 1720
    },
    {
      "epoch": 0.8516415699827203,
      "grad_norm": 0.6655980946948994,
      "learning_rate": 6.541539549540383e-07,
      "loss": 1.3013,
      "step": 1725
    },
    {
      "epoch": 0.8541100962725253,
      "grad_norm": 0.599651741019629,
      "learning_rate": 6.329989828806482e-07,
      "loss": 1.2454,
      "step": 1730
    },
    {
      "epoch": 0.8565786225623303,
      "grad_norm": 0.7507415296204425,
      "learning_rate": 6.121685921943688e-07,
      "loss": 1.2347,
      "step": 1735
    },
    {
      "epoch": 0.8590471488521353,
      "grad_norm": 0.5883088948787556,
      "learning_rate": 5.916643311332438e-07,
      "loss": 1.2566,
      "step": 1740
    },
    {
      "epoch": 0.8615156751419403,
      "grad_norm": 0.5844649067792757,
      "learning_rate": 5.71487723695427e-07,
      "loss": 1.2176,
      "step": 1745
    },
    {
      "epoch": 0.8639842014317453,
      "grad_norm": 0.570757598339604,
      "learning_rate": 5.516402695259165e-07,
      "loss": 1.2111,
      "step": 1750
    },
    {
      "epoch": 0.8664527277215502,
      "grad_norm": 0.6101964731318252,
      "learning_rate": 5.321234438050893e-07,
      "loss": 1.2552,
      "step": 1755
    },
    {
      "epoch": 0.8689212540113552,
      "grad_norm": 0.6114031483570134,
      "learning_rate": 5.12938697139056e-07,
      "loss": 1.2339,
      "step": 1760
    },
    {
      "epoch": 0.8713897803011602,
      "grad_norm": 0.5640524033820485,
      "learning_rate": 4.940874554518465e-07,
      "loss": 1.2594,
      "step": 1765
    },
    {
      "epoch": 0.8738583065909652,
      "grad_norm": 0.6433079417694005,
      "learning_rate": 4.755711198794233e-07,
      "loss": 1.2854,
      "step": 1770
    },
    {
      "epoch": 0.8763268328807702,
      "grad_norm": 0.604973387553276,
      "learning_rate": 4.573910666655429e-07,
      "loss": 1.3237,
      "step": 1775
    },
    {
      "epoch": 0.8787953591705752,
      "grad_norm": 0.5628418770325067,
      "learning_rate": 4.395486470594645e-07,
      "loss": 1.1982,
      "step": 1780
    },
    {
      "epoch": 0.8812638854603801,
|
"grad_norm": 0.6659219563445046, |
|
"learning_rate": 4.220451872155179e-07, |
|
"loss": 1.2309, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 0.8837324117501851, |
|
"grad_norm": 0.5361789546629312, |
|
"learning_rate": 4.048819880945337e-07, |
|
"loss": 1.199, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 0.8862009380399901, |
|
"grad_norm": 0.5558192723511216, |
|
"learning_rate": 3.880603253671522e-07, |
|
"loss": 1.2263, |
|
"step": 1795 |
|
}, |
|
{ |
|
"epoch": 0.8886694643297951, |
|
"grad_norm": 0.5634804859248715, |
|
"learning_rate": 3.7158144931900395e-07, |
|
"loss": 1.2752, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8886694643297951, |
|
"eval_loss": 1.2087970972061157, |
|
"eval_runtime": 2557.7862, |
|
"eval_samples_per_second": 1.564, |
|
"eval_steps_per_second": 0.131, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.8911379906196001, |
|
"grad_norm": 0.6032610406878897, |
|
"learning_rate": 3.5544658475778317e-07, |
|
"loss": 1.1999, |
|
"step": 1805 |
|
}, |
|
{ |
|
"epoch": 0.8936065169094051, |
|
"grad_norm": 0.6216254522630721, |
|
"learning_rate": 3.396569309222114e-07, |
|
"loss": 1.2339, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 0.89607504319921, |
|
"grad_norm": 0.5807256981071689, |
|
"learning_rate": 3.2421366139290423e-07, |
|
"loss": 1.3057, |
|
"step": 1815 |
|
}, |
|
{ |
|
"epoch": 0.898543569489015, |
|
"grad_norm": 0.5211008570948544, |
|
"learning_rate": 3.091179240051462e-07, |
|
"loss": 1.2022, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.90101209577882, |
|
"grad_norm": 0.5525058863296126, |
|
"learning_rate": 2.943708407635704e-07, |
|
"loss": 1.2048, |
|
"step": 1825 |
|
}, |
|
{ |
|
"epoch": 0.903480622068625, |
|
"grad_norm": 0.6377145176064325, |
|
"learning_rate": 2.799735077587695e-07, |
|
"loss": 1.213, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 0.90594914835843, |
|
"grad_norm": 0.5813161900855606, |
|
"learning_rate": 2.659269950858273e-07, |
|
"loss": 1.33, |
|
"step": 1835 |
|
}, |
|
{ |
|
"epoch": 0.908417674648235, |
|
"grad_norm": 0.6256712692686102, |
|
"learning_rate": 2.5223234676478193e-07, |
|
"loss": 1.2418, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.91088620093804, |
|
"grad_norm": 0.598042344925788, |
|
"learning_rate": 2.3889058066302873e-07, |
|
"loss": 1.2928, |
|
"step": 1845 |
|
}, |
|
{ |
|
"epoch": 0.9133547272278449, |
|
"grad_norm": 0.6144058961581507, |
|
"learning_rate": 2.2590268841966357e-07, |
|
"loss": 1.2522, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.9158232535176499, |
|
"grad_norm": 0.6086868817654493, |
|
"learning_rate": 2.132696353717839e-07, |
|
"loss": 1.2275, |
|
"step": 1855 |
|
}, |
|
{ |
|
"epoch": 0.9182917798074549, |
|
"grad_norm": 0.6193803813904503, |
|
"learning_rate": 2.0099236048273407e-07, |
|
"loss": 1.2102, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.9207603060972599, |
|
"grad_norm": 0.6206660621687174, |
|
"learning_rate": 1.890717762723182e-07, |
|
"loss": 1.2413, |
|
"step": 1865 |
|
}, |
|
{ |
|
"epoch": 0.9232288323870649, |
|
"grad_norm": 0.5195254310690817, |
|
"learning_rate": 1.7750876874897627e-07, |
|
"loss": 1.2536, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 0.9256973586768699, |
|
"grad_norm": 0.6172193600635592, |
|
"learning_rate": 1.6630419734393e-07, |
|
"loss": 1.1877, |
|
"step": 1875 |
|
}, |
|
{ |
|
"epoch": 0.9281658849666748, |
|
"grad_norm": 0.5854056073690375, |
|
"learning_rate": 1.554588948473068e-07, |
|
"loss": 1.2694, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.9306344112564798, |
|
"grad_norm": 0.5939692455470944, |
|
"learning_rate": 1.4497366734623874e-07, |
|
"loss": 1.2223, |
|
"step": 1885 |
|
}, |
|
{ |
|
"epoch": 0.9331029375462848, |
|
"grad_norm": 0.558716522853661, |
|
"learning_rate": 1.3484929416495096e-07, |
|
"loss": 1.1465, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 0.9355714638360898, |
|
"grad_norm": 0.601778856283905, |
|
"learning_rate": 1.2508652780683916e-07, |
|
"loss": 1.2618, |
|
"step": 1895 |
|
}, |
|
{ |
|
"epoch": 0.9380399901258949, |
|
"grad_norm": 0.5724230357863298, |
|
"learning_rate": 1.1568609389853546e-07, |
|
"loss": 1.199, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.9405085164156999, |
|
"grad_norm": 0.5858685464797397, |
|
"learning_rate": 1.0664869113598097e-07, |
|
"loss": 1.2416, |
|
"step": 1905 |
|
}, |
|
{ |
|
"epoch": 0.9429770427055049, |
|
"grad_norm": 0.5955002776535666, |
|
"learning_rate": 9.7974991232489e-08, |
|
"loss": 1.2621, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 0.9454455689953098, |
|
"grad_norm": 0.6031053768787782, |
|
"learning_rate": 8.966563886882107e-08, |
|
"loss": 1.2966, |
|
"step": 1915 |
|
}, |
|
{ |
|
"epoch": 0.9479140952851148, |
|
"grad_norm": 0.5626513433181811, |
|
"learning_rate": 8.172125164527312e-08, |
|
"loss": 1.197, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.9503826215749198, |
|
"grad_norm": 0.6147790631492948, |
|
"learning_rate": 7.414242003576876e-08, |
|
"loss": 1.2476, |
|
"step": 1925 |
|
}, |
|
{ |
|
"epoch": 0.9528511478647248, |
|
"grad_norm": 0.6387128598756113, |
|
"learning_rate": 6.692970734397176e-08, |
|
"loss": 1.2717, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 0.9553196741545298, |
|
"grad_norm": 0.58519229057596, |
|
"learning_rate": 6.0083649661421e-08, |
|
"loss": 1.2427, |
|
"step": 1935 |
|
}, |
|
{ |
|
"epoch": 0.9577882004443348, |
|
"grad_norm": 0.5732049204953203, |
|
"learning_rate": 5.360475582768088e-08, |
|
"loss": 1.2499, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.9602567267341398, |
|
"grad_norm": 0.5510115335869762, |
|
"learning_rate": 4.7493507392524226e-08, |
|
"loss": 1.1837, |
|
"step": 1945 |
|
}, |
|
{ |
|
"epoch": 0.9627252530239447, |
|
"grad_norm": 0.5959129330379044, |
|
"learning_rate": 4.175035858013987e-08, |
|
"loss": 1.2595, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.9651937793137497, |
|
"grad_norm": 0.6525575790551825, |
|
"learning_rate": 3.637573625537183e-08, |
|
"loss": 1.3283, |
|
"step": 1955 |
|
}, |
|
{ |
|
"epoch": 0.9676623056035547, |
|
"grad_norm": 0.6761446719619785, |
|
"learning_rate": 3.13700398919925e-08, |
|
"loss": 1.2633, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.9701308318933597, |
|
"grad_norm": 0.5705669812908541, |
|
"learning_rate": 2.673364154301028e-08, |
|
"loss": 1.2446, |
|
"step": 1965 |
|
}, |
|
{ |
|
"epoch": 0.9725993581831647, |
|
"grad_norm": 0.6197155608101478, |
|
"learning_rate": 2.2466885813018925e-08, |
|
"loss": 1.2492, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 0.9750678844729697, |
|
"grad_norm": 0.5667304098455904, |
|
"learning_rate": 1.857008983258135e-08, |
|
"loss": 1.2485, |
|
"step": 1975 |
|
}, |
|
{ |
|
"epoch": 0.9775364107627746, |
|
"grad_norm": 0.6113665999543747, |
|
"learning_rate": 1.504354323466073e-08, |
|
"loss": 1.2573, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.9800049370525796, |
|
"grad_norm": 0.5726714283406965, |
|
"learning_rate": 1.188750813309214e-08, |
|
"loss": 1.2264, |
|
"step": 1985 |
|
}, |
|
{ |
|
"epoch": 0.9824734633423846, |
|
"grad_norm": 0.5521047354644366, |
|
"learning_rate": 9.102219103103161e-09, |
|
"loss": 1.2194, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 0.9849419896321896, |
|
"grad_norm": 0.6819693929722572, |
|
"learning_rate": 6.687883163873921e-09, |
|
"loss": 1.244, |
|
"step": 1995 |
|
}, |
|
{ |
|
"epoch": 0.9874105159219946, |
|
"grad_norm": 0.6016814387388122, |
|
"learning_rate": 4.644679763155524e-09, |
|
"loss": 1.2701, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9874105159219946, |
|
"eval_loss": 1.208633542060852, |
|
"eval_runtime": 2553.7159, |
|
"eval_samples_per_second": 1.566, |
|
"eval_steps_per_second": 0.131, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.9898790422117996, |
|
"grad_norm": 0.5854483828292536, |
|
"learning_rate": 2.97276076392905e-09, |
|
"loss": 1.2735, |
|
"step": 2005 |
|
}, |
|
{ |
|
"epoch": 0.9923475685016045, |
|
"grad_norm": 0.6149856349841143, |
|
"learning_rate": 1.6722504331195822e-09, |
|
"loss": 1.1829, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 0.9948160947914095, |
|
"grad_norm": 0.5776580228856067, |
|
"learning_rate": 7.432454323597071e-10, |
|
"loss": 1.2584, |
|
"step": 2015 |
|
}, |
|
{ |
|
"epoch": 0.9972846210812145, |
|
"grad_norm": 0.5955477076581019, |
|
"learning_rate": 1.8581481080415242e-10, |
|
"loss": 1.1737, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.9997531473710195, |
|
"grad_norm": 0.6070167910291095, |
|
"learning_rate": 0.0, |
|
"loss": 1.1858, |
|
"step": 2025 |
|
}, |
|
{ |
|
"epoch": 0.9997531473710195, |
|
"step": 2025, |
|
"total_flos": 4526278881050624.0, |
|
"train_loss": 1.270192005722611, |
|
"train_runtime": 113933.3906, |
|
"train_samples_per_second": 0.427, |
|
"train_steps_per_second": 0.018 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 2025, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 200, |
|
"total_flos": 4526278881050624.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
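
A minimal Python sketch, for reference only, of how one might load this trainer state and recover the logged loss curves. It assumes the file is saved as `trainer_state.json` in the working directory (the path is an assumption; the Hugging Face Trainer writes this file alongside each checkpoint), and it relies only on the keys visible in the JSON above.

```python
import json

# Load the trainer state written by the Hugging Face Trainer.
# The "trainer_state.json" path is an assumption; adjust as needed.
with open("trainer_state.json") as f:
    state = json.load(f)

history = state["log_history"]

# Training entries carry a "loss" key; evaluation entries carry "eval_loss";
# the final summary entry carries aggregate "train_*" statistics instead.
train_points = [(e["step"], e["loss"]) for e in history if "loss" in e]
eval_points = [(e["step"], e["eval_loss"]) for e in history if "eval_loss" in e]

print(f"best eval_loss: {state['best_metric']}")
print(f"best checkpoint: {state['best_model_checkpoint']}")
for step, loss in eval_points:
    print(f"eval_loss at step {step}: {loss}")
print(f"last logged train loss (step {train_points[-1][0]}): {train_points[-1][1]}")
```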