{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 2000,
"global_step": 16425,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0091324200913242,
"grad_norm": 221.6422576904297,
"learning_rate": 9.97564687975647e-06,
"loss": 6.0077,
"step": 50
},
{
"epoch": 0.0182648401826484,
"grad_norm": 53.356204986572266,
"learning_rate": 9.945205479452056e-06,
"loss": 4.1135,
"step": 100
},
{
"epoch": 0.0273972602739726,
"grad_norm": 82.45592498779297,
"learning_rate": 9.914764079147641e-06,
"loss": 2.8067,
"step": 150
},
{
"epoch": 0.0365296803652968,
"grad_norm": 55.64297866821289,
"learning_rate": 9.884322678843227e-06,
"loss": 2.3073,
"step": 200
},
{
"epoch": 0.045662100456621,
"grad_norm": 41.508663177490234,
"learning_rate": 9.853881278538814e-06,
"loss": 1.924,
"step": 250
},
{
"epoch": 0.0547945205479452,
"grad_norm": 202.38238525390625,
"learning_rate": 9.824048706240488e-06,
"loss": 1.6561,
"step": 300
},
{
"epoch": 0.0639269406392694,
"grad_norm": 40.34769821166992,
"learning_rate": 9.793607305936074e-06,
"loss": 1.7111,
"step": 350
},
{
"epoch": 0.0730593607305936,
"grad_norm": 44.13899612426758,
"learning_rate": 9.76316590563166e-06,
"loss": 1.7346,
"step": 400
},
{
"epoch": 0.0821917808219178,
"grad_norm": 33.424198150634766,
"learning_rate": 9.732724505327246e-06,
"loss": 1.5305,
"step": 450
},
{
"epoch": 0.091324200913242,
"grad_norm": 41.58274459838867,
"learning_rate": 9.702283105022831e-06,
"loss": 1.4748,
"step": 500
},
{
"epoch": 0.1004566210045662,
"grad_norm": 24.718538284301758,
"learning_rate": 9.671841704718417e-06,
"loss": 1.3904,
"step": 550
},
{
"epoch": 0.1095890410958904,
"grad_norm": 106.5848388671875,
"learning_rate": 9.641400304414004e-06,
"loss": 1.5063,
"step": 600
},
{
"epoch": 0.1187214611872146,
"grad_norm": 30.068777084350586,
"learning_rate": 9.61095890410959e-06,
"loss": 1.4255,
"step": 650
},
{
"epoch": 0.1278538812785388,
"grad_norm": 40.35118103027344,
"learning_rate": 9.580517503805176e-06,
"loss": 1.3404,
"step": 700
},
{
"epoch": 0.136986301369863,
"grad_norm": 25.713661193847656,
"learning_rate": 9.550076103500761e-06,
"loss": 1.2721,
"step": 750
},
{
"epoch": 0.1461187214611872,
"grad_norm": 37.896827697753906,
"learning_rate": 9.519634703196347e-06,
"loss": 1.2902,
"step": 800
},
{
"epoch": 0.1552511415525114,
"grad_norm": 28.79201316833496,
"learning_rate": 9.489193302891934e-06,
"loss": 1.3189,
"step": 850
},
{
"epoch": 0.1643835616438356,
"grad_norm": 40.73591232299805,
"learning_rate": 9.45875190258752e-06,
"loss": 1.2969,
"step": 900
},
{
"epoch": 0.1735159817351598,
"grad_norm": 37.75580596923828,
"learning_rate": 9.428310502283106e-06,
"loss": 1.1992,
"step": 950
},
{
"epoch": 0.182648401826484,
"grad_norm": 23.1336727142334,
"learning_rate": 9.397869101978691e-06,
"loss": 1.2553,
"step": 1000
},
{
"epoch": 0.1917808219178082,
"grad_norm": 24.494401931762695,
"learning_rate": 9.367427701674277e-06,
"loss": 1.22,
"step": 1050
},
{
"epoch": 0.2009132420091324,
"grad_norm": 25.258699417114258,
"learning_rate": 9.336986301369864e-06,
"loss": 1.2455,
"step": 1100
},
{
"epoch": 0.2100456621004566,
"grad_norm": 24.47820281982422,
"learning_rate": 9.30654490106545e-06,
"loss": 1.2092,
"step": 1150
},
{
"epoch": 0.2191780821917808,
"grad_norm": 21.027442932128906,
"learning_rate": 9.276103500761036e-06,
"loss": 1.1863,
"step": 1200
},
{
"epoch": 0.228310502283105,
"grad_norm": 28.407264709472656,
"learning_rate": 9.245662100456621e-06,
"loss": 1.1964,
"step": 1250
},
{
"epoch": 0.2374429223744292,
"grad_norm": 37.75485610961914,
"learning_rate": 9.215220700152207e-06,
"loss": 1.1217,
"step": 1300
},
{
"epoch": 0.2465753424657534,
"grad_norm": 36.54895782470703,
"learning_rate": 9.184779299847793e-06,
"loss": 1.1317,
"step": 1350
},
{
"epoch": 0.2557077625570776,
"grad_norm": 20.005041122436523,
"learning_rate": 9.15433789954338e-06,
"loss": 1.0816,
"step": 1400
},
{
"epoch": 0.2648401826484018,
"grad_norm": 30.079883575439453,
"learning_rate": 9.123896499238966e-06,
"loss": 1.1259,
"step": 1450
},
{
"epoch": 0.273972602739726,
"grad_norm": 21.35468864440918,
"learning_rate": 9.093455098934552e-06,
"loss": 1.0557,
"step": 1500
},
{
"epoch": 0.2831050228310502,
"grad_norm": 35.95537185668945,
"learning_rate": 9.063013698630137e-06,
"loss": 1.2042,
"step": 1550
},
{
"epoch": 0.2922374429223744,
"grad_norm": 24.126571655273438,
"learning_rate": 9.032572298325723e-06,
"loss": 1.0724,
"step": 1600
},
{
"epoch": 0.3013698630136986,
"grad_norm": 37.01513671875,
"learning_rate": 9.00213089802131e-06,
"loss": 1.0762,
"step": 1650
},
{
"epoch": 0.3105022831050228,
"grad_norm": 29.829130172729492,
"learning_rate": 8.971689497716896e-06,
"loss": 1.075,
"step": 1700
},
{
"epoch": 0.319634703196347,
"grad_norm": 27.351768493652344,
"learning_rate": 8.941248097412482e-06,
"loss": 1.099,
"step": 1750
},
{
"epoch": 0.3287671232876712,
"grad_norm": 151.5540771484375,
"learning_rate": 8.910806697108067e-06,
"loss": 1.0758,
"step": 1800
},
{
"epoch": 0.3378995433789954,
"grad_norm": 22.22905158996582,
"learning_rate": 8.880365296803653e-06,
"loss": 1.2048,
"step": 1850
},
{
"epoch": 0.3470319634703196,
"grad_norm": 22.852474212646484,
"learning_rate": 8.849923896499239e-06,
"loss": 1.1506,
"step": 1900
},
{
"epoch": 0.3561643835616438,
"grad_norm": 25.591411590576172,
"learning_rate": 8.819482496194826e-06,
"loss": 1.0681,
"step": 1950
},
{
"epoch": 0.365296803652968,
"grad_norm": 57.05115509033203,
"learning_rate": 8.789041095890412e-06,
"loss": 1.0501,
"step": 2000
},
{
"epoch": 0.365296803652968,
"eval_exact_match": 78.10785241248817,
"eval_f1": 86.60142116809823,
"eval_runtime": 408.6372,
"eval_samples_per_second": 25.866,
"eval_steps_per_second": 1.618,
"step": 2000
},
{
"epoch": 0.3744292237442922,
"grad_norm": 18.742263793945312,
"learning_rate": 8.758599695585997e-06,
"loss": 1.0993,
"step": 2050
},
{
"epoch": 0.3835616438356164,
"grad_norm": 44.574337005615234,
"learning_rate": 8.728158295281583e-06,
"loss": 1.101,
"step": 2100
},
{
"epoch": 0.3926940639269406,
"grad_norm": 13.44596004486084,
"learning_rate": 8.697716894977169e-06,
"loss": 1.0706,
"step": 2150
},
{
"epoch": 0.4018264840182648,
"grad_norm": 35.56928253173828,
"learning_rate": 8.667275494672756e-06,
"loss": 1.0132,
"step": 2200
},
{
"epoch": 0.410958904109589,
"grad_norm": 23.093910217285156,
"learning_rate": 8.636834094368342e-06,
"loss": 1.075,
"step": 2250
},
{
"epoch": 0.4200913242009132,
"grad_norm": 53.623291015625,
"learning_rate": 8.606392694063928e-06,
"loss": 1.0114,
"step": 2300
},
{
"epoch": 0.4292237442922374,
"grad_norm": 19.787992477416992,
"learning_rate": 8.575951293759513e-06,
"loss": 0.9579,
"step": 2350
},
{
"epoch": 0.4383561643835616,
"grad_norm": 85.31494903564453,
"learning_rate": 8.545509893455099e-06,
"loss": 1.0426,
"step": 2400
},
{
"epoch": 0.4474885844748858,
"grad_norm": 14.577733993530273,
"learning_rate": 8.515068493150686e-06,
"loss": 1.0229,
"step": 2450
},
{
"epoch": 0.45662100456621,
"grad_norm": 40.19715881347656,
"learning_rate": 8.484627092846272e-06,
"loss": 1.05,
"step": 2500
},
{
"epoch": 0.4657534246575342,
"grad_norm": 30.88080596923828,
"learning_rate": 8.454185692541858e-06,
"loss": 0.9825,
"step": 2550
},
{
"epoch": 0.4748858447488584,
"grad_norm": 23.49712562561035,
"learning_rate": 8.423744292237443e-06,
"loss": 1.0332,
"step": 2600
},
{
"epoch": 0.4840182648401826,
"grad_norm": 28.84528160095215,
"learning_rate": 8.393302891933029e-06,
"loss": 0.9683,
"step": 2650
},
{
"epoch": 0.4931506849315068,
"grad_norm": 93.11555480957031,
"learning_rate": 8.362861491628615e-06,
"loss": 0.9531,
"step": 2700
},
{
"epoch": 0.502283105022831,
"grad_norm": 30.038545608520508,
"learning_rate": 8.332420091324202e-06,
"loss": 0.9505,
"step": 2750
},
{
"epoch": 0.5114155251141552,
"grad_norm": 26.936176300048828,
"learning_rate": 8.301978691019788e-06,
"loss": 1.0348,
"step": 2800
},
{
"epoch": 0.5205479452054794,
"grad_norm": 47.295257568359375,
"learning_rate": 8.271537290715373e-06,
"loss": 0.9554,
"step": 2850
},
{
"epoch": 0.5296803652968036,
"grad_norm": 18.67024803161621,
"learning_rate": 8.241095890410959e-06,
"loss": 0.981,
"step": 2900
},
{
"epoch": 0.5388127853881278,
"grad_norm": 35.41249084472656,
"learning_rate": 8.210654490106545e-06,
"loss": 0.9714,
"step": 2950
},
{
"epoch": 0.547945205479452,
"grad_norm": 89.86371612548828,
"learning_rate": 8.180213089802132e-06,
"loss": 0.969,
"step": 3000
},
{
"epoch": 0.5570776255707762,
"grad_norm": 17.144739151000977,
"learning_rate": 8.149771689497718e-06,
"loss": 0.9632,
"step": 3050
},
{
"epoch": 0.5662100456621004,
"grad_norm": 34.07986068725586,
"learning_rate": 8.119330289193304e-06,
"loss": 0.9724,
"step": 3100
},
{
"epoch": 0.5753424657534246,
"grad_norm": 33.85867691040039,
"learning_rate": 8.08888888888889e-06,
"loss": 0.9273,
"step": 3150
},
{
"epoch": 0.5844748858447488,
"grad_norm": 28.088581085205078,
"learning_rate": 8.058447488584475e-06,
"loss": 0.9382,
"step": 3200
},
{
"epoch": 0.593607305936073,
"grad_norm": 21.896081924438477,
"learning_rate": 8.028006088280062e-06,
"loss": 0.8934,
"step": 3250
},
{
"epoch": 0.6027397260273972,
"grad_norm": 24.811033248901367,
"learning_rate": 7.997564687975648e-06,
"loss": 0.8633,
"step": 3300
},
{
"epoch": 0.6118721461187214,
"grad_norm": 17.348302841186523,
"learning_rate": 7.967123287671234e-06,
"loss": 0.9522,
"step": 3350
},
{
"epoch": 0.6210045662100456,
"grad_norm": 16.166751861572266,
"learning_rate": 7.93668188736682e-06,
"loss": 0.8775,
"step": 3400
},
{
"epoch": 0.6301369863013698,
"grad_norm": 17.120800018310547,
"learning_rate": 7.906240487062405e-06,
"loss": 0.915,
"step": 3450
},
{
"epoch": 0.639269406392694,
"grad_norm": 24.14845848083496,
"learning_rate": 7.87579908675799e-06,
"loss": 0.8992,
"step": 3500
},
{
"epoch": 0.6484018264840182,
"grad_norm": 21.47500228881836,
"learning_rate": 7.845357686453578e-06,
"loss": 0.8837,
"step": 3550
},
{
"epoch": 0.6575342465753424,
"grad_norm": 893.0142822265625,
"learning_rate": 7.814916286149164e-06,
"loss": 0.7566,
"step": 3600
},
{
"epoch": 0.6666666666666666,
"grad_norm": 19.631641387939453,
"learning_rate": 7.78447488584475e-06,
"loss": 0.9036,
"step": 3650
},
{
"epoch": 0.6757990867579908,
"grad_norm": 21.34396743774414,
"learning_rate": 7.754033485540335e-06,
"loss": 0.8583,
"step": 3700
},
{
"epoch": 0.684931506849315,
"grad_norm": 41.54507064819336,
"learning_rate": 7.72359208523592e-06,
"loss": 0.8158,
"step": 3750
},
{
"epoch": 0.6940639269406392,
"grad_norm": 18.237974166870117,
"learning_rate": 7.693150684931508e-06,
"loss": 0.8654,
"step": 3800
},
{
"epoch": 0.7031963470319634,
"grad_norm": 8.820842742919922,
"learning_rate": 7.662709284627094e-06,
"loss": 0.8388,
"step": 3850
},
{
"epoch": 0.7123287671232876,
"grad_norm": 12.438421249389648,
"learning_rate": 7.63226788432268e-06,
"loss": 0.9076,
"step": 3900
},
{
"epoch": 0.7214611872146118,
"grad_norm": 31.41686248779297,
"learning_rate": 7.601826484018265e-06,
"loss": 0.8863,
"step": 3950
},
{
"epoch": 0.730593607305936,
"grad_norm": 54.51583480834961,
"learning_rate": 7.571385083713852e-06,
"loss": 0.865,
"step": 4000
},
{
"epoch": 0.730593607305936,
"eval_exact_match": 83.91674550614948,
"eval_f1": 91.06161273436486,
"eval_runtime": 406.2293,
"eval_samples_per_second": 26.02,
"eval_steps_per_second": 1.627,
"step": 4000
},
{
"epoch": 0.7397260273972602,
"grad_norm": 23.26498031616211,
"learning_rate": 7.540943683409437e-06,
"loss": 0.875,
"step": 4050
},
{
"epoch": 0.7488584474885844,
"grad_norm": 22.035991668701172,
"learning_rate": 7.510502283105023e-06,
"loss": 0.8573,
"step": 4100
},
{
"epoch": 0.7579908675799086,
"grad_norm": 38.90880584716797,
"learning_rate": 7.4800608828006096e-06,
"loss": 0.9069,
"step": 4150
},
{
"epoch": 0.7671232876712328,
"grad_norm": 21.68418312072754,
"learning_rate": 7.449619482496195e-06,
"loss": 0.8559,
"step": 4200
},
{
"epoch": 0.776255707762557,
"grad_norm": 94.90328979492188,
"learning_rate": 7.419178082191782e-06,
"loss": 0.876,
"step": 4250
},
{
"epoch": 0.7853881278538812,
"grad_norm": 40.229251861572266,
"learning_rate": 7.3887366818873674e-06,
"loss": 0.8589,
"step": 4300
},
{
"epoch": 0.7945205479452054,
"grad_norm": 23.035062789916992,
"learning_rate": 7.358295281582953e-06,
"loss": 0.8919,
"step": 4350
},
{
"epoch": 0.8036529680365296,
"grad_norm": 21.854232788085938,
"learning_rate": 7.32785388127854e-06,
"loss": 0.9156,
"step": 4400
},
{
"epoch": 0.8127853881278538,
"grad_norm": 14.7982759475708,
"learning_rate": 7.297412480974125e-06,
"loss": 0.7823,
"step": 4450
},
{
"epoch": 0.821917808219178,
"grad_norm": 11.709835052490234,
"learning_rate": 7.266971080669711e-06,
"loss": 0.8257,
"step": 4500
},
{
"epoch": 0.8310502283105022,
"grad_norm": 34.76528549194336,
"learning_rate": 7.2365296803652975e-06,
"loss": 0.794,
"step": 4550
},
{
"epoch": 0.8401826484018264,
"grad_norm": 29.65485954284668,
"learning_rate": 7.206088280060883e-06,
"loss": 0.8861,
"step": 4600
},
{
"epoch": 0.8493150684931506,
"grad_norm": 17.04764747619629,
"learning_rate": 7.17564687975647e-06,
"loss": 0.8877,
"step": 4650
},
{
"epoch": 0.8584474885844748,
"grad_norm": 16.863462448120117,
"learning_rate": 7.145205479452055e-06,
"loss": 0.8717,
"step": 4700
},
{
"epoch": 0.867579908675799,
"grad_norm": 22.229736328125,
"learning_rate": 7.114764079147641e-06,
"loss": 0.8354,
"step": 4750
},
{
"epoch": 0.8767123287671232,
"grad_norm": 21.367616653442383,
"learning_rate": 7.084322678843228e-06,
"loss": 0.8083,
"step": 4800
},
{
"epoch": 0.8858447488584474,
"grad_norm": 23.51657485961914,
"learning_rate": 7.053881278538813e-06,
"loss": 0.8566,
"step": 4850
},
{
"epoch": 0.8949771689497716,
"grad_norm": 25.13926124572754,
"learning_rate": 7.023439878234399e-06,
"loss": 0.7737,
"step": 4900
},
{
"epoch": 0.9041095890410958,
"grad_norm": 8.506234169006348,
"learning_rate": 6.9929984779299855e-06,
"loss": 0.7892,
"step": 4950
},
{
"epoch": 0.91324200913242,
"grad_norm": 18.09160041809082,
"learning_rate": 6.962557077625571e-06,
"loss": 0.7986,
"step": 5000
},
{
"epoch": 0.9223744292237442,
"grad_norm": 16.591453552246094,
"learning_rate": 6.932115677321158e-06,
"loss": 0.8058,
"step": 5050
},
{
"epoch": 0.9315068493150684,
"grad_norm": 12.483757972717285,
"learning_rate": 6.901674277016743e-06,
"loss": 0.7827,
"step": 5100
},
{
"epoch": 0.9406392694063926,
"grad_norm": 16.847013473510742,
"learning_rate": 6.871232876712329e-06,
"loss": 0.8306,
"step": 5150
},
{
"epoch": 0.9497716894977168,
"grad_norm": 27.550743103027344,
"learning_rate": 6.840791476407916e-06,
"loss": 0.7677,
"step": 5200
},
{
"epoch": 0.958904109589041,
"grad_norm": 118.00872802734375,
"learning_rate": 6.810350076103501e-06,
"loss": 0.8312,
"step": 5250
},
{
"epoch": 0.9680365296803652,
"grad_norm": 28.487119674682617,
"learning_rate": 6.779908675799087e-06,
"loss": 0.8193,
"step": 5300
},
{
"epoch": 0.9771689497716894,
"grad_norm": 25.242734909057617,
"learning_rate": 6.7494672754946735e-06,
"loss": 0.8542,
"step": 5350
},
{
"epoch": 0.9863013698630136,
"grad_norm": 29.314556121826172,
"learning_rate": 6.719025875190259e-06,
"loss": 0.8617,
"step": 5400
},
{
"epoch": 0.9954337899543378,
"grad_norm": 7.821478366851807,
"learning_rate": 6.688584474885846e-06,
"loss": 0.7687,
"step": 5450
},
{
"epoch": 1.004566210045662,
"grad_norm": 8.230454444885254,
"learning_rate": 6.658143074581431e-06,
"loss": 0.7007,
"step": 5500
},
{
"epoch": 1.0136986301369864,
"grad_norm": 17.57550048828125,
"learning_rate": 6.627701674277017e-06,
"loss": 0.6705,
"step": 5550
},
{
"epoch": 1.0228310502283104,
"grad_norm": 31.853958129882812,
"learning_rate": 6.597260273972604e-06,
"loss": 0.6468,
"step": 5600
},
{
"epoch": 1.0319634703196348,
"grad_norm": 25.719881057739258,
"learning_rate": 6.566818873668189e-06,
"loss": 0.7003,
"step": 5650
},
{
"epoch": 1.0410958904109588,
"grad_norm": 7.838934421539307,
"learning_rate": 6.536377473363775e-06,
"loss": 0.6471,
"step": 5700
},
{
"epoch": 1.0502283105022832,
"grad_norm": 39.390769958496094,
"learning_rate": 6.5059360730593615e-06,
"loss": 0.6483,
"step": 5750
},
{
"epoch": 1.0593607305936072,
"grad_norm": 45.38280487060547,
"learning_rate": 6.475494672754947e-06,
"loss": 0.5945,
"step": 5800
},
{
"epoch": 1.0684931506849316,
"grad_norm": 9.456714630126953,
"learning_rate": 6.445053272450533e-06,
"loss": 0.6261,
"step": 5850
},
{
"epoch": 1.0776255707762556,
"grad_norm": 16.920135498046875,
"learning_rate": 6.414611872146119e-06,
"loss": 0.6281,
"step": 5900
},
{
"epoch": 1.08675799086758,
"grad_norm": 33.61515426635742,
"learning_rate": 6.384170471841705e-06,
"loss": 0.7094,
"step": 5950
},
{
"epoch": 1.095890410958904,
"grad_norm": 14.23517894744873,
"learning_rate": 6.3537290715372916e-06,
"loss": 0.6322,
"step": 6000
},
{
"epoch": 1.095890410958904,
"eval_exact_match": 85.34531693472091,
"eval_f1": 92.10865701973381,
"eval_runtime": 406.1815,
"eval_samples_per_second": 26.023,
"eval_steps_per_second": 1.627,
"step": 6000
},
{
"epoch": 1.1050228310502284,
"grad_norm": 12.242176055908203,
"learning_rate": 6.323287671232877e-06,
"loss": 0.6571,
"step": 6050
},
{
"epoch": 1.1141552511415524,
"grad_norm": 22.35227394104004,
"learning_rate": 6.292846270928463e-06,
"loss": 0.7238,
"step": 6100
},
{
"epoch": 1.1232876712328768,
"grad_norm": 19.85144805908203,
"learning_rate": 6.2624048706240495e-06,
"loss": 0.6978,
"step": 6150
},
{
"epoch": 1.1324200913242009,
"grad_norm": 37.92237854003906,
"learning_rate": 6.231963470319635e-06,
"loss": 0.6241,
"step": 6200
},
{
"epoch": 1.1415525114155252,
"grad_norm": 21.54163932800293,
"learning_rate": 6.201522070015221e-06,
"loss": 0.6512,
"step": 6250
},
{
"epoch": 1.1506849315068493,
"grad_norm": 19.37819480895996,
"learning_rate": 6.171080669710807e-06,
"loss": 0.5888,
"step": 6300
},
{
"epoch": 1.1598173515981736,
"grad_norm": 59.02834701538086,
"learning_rate": 6.140639269406393e-06,
"loss": 0.6524,
"step": 6350
},
{
"epoch": 1.1689497716894977,
"grad_norm": 19.709335327148438,
"learning_rate": 6.1101978691019796e-06,
"loss": 0.6272,
"step": 6400
},
{
"epoch": 1.178082191780822,
"grad_norm": 94.84259033203125,
"learning_rate": 6.079756468797565e-06,
"loss": 0.6506,
"step": 6450
},
{
"epoch": 1.187214611872146,
"grad_norm": 27.324445724487305,
"learning_rate": 6.049315068493151e-06,
"loss": 0.7212,
"step": 6500
},
{
"epoch": 1.1963470319634704,
"grad_norm": 27.72054100036621,
"learning_rate": 6.0188736681887374e-06,
"loss": 0.6363,
"step": 6550
},
{
"epoch": 1.2054794520547945,
"grad_norm": 19.417390823364258,
"learning_rate": 5.988432267884323e-06,
"loss": 0.7211,
"step": 6600
},
{
"epoch": 1.2146118721461188,
"grad_norm": 17.682470321655273,
"learning_rate": 5.957990867579909e-06,
"loss": 0.7258,
"step": 6650
},
{
"epoch": 1.2237442922374429,
"grad_norm": 13.04556655883789,
"learning_rate": 5.927549467275495e-06,
"loss": 0.728,
"step": 6700
},
{
"epoch": 1.2328767123287672,
"grad_norm": 19.57162094116211,
"learning_rate": 5.897108066971081e-06,
"loss": 0.6404,
"step": 6750
},
{
"epoch": 1.2420091324200913,
"grad_norm": 14.33209228515625,
"learning_rate": 5.8666666666666675e-06,
"loss": 0.6511,
"step": 6800
},
{
"epoch": 1.2511415525114156,
"grad_norm": 11.948081970214844,
"learning_rate": 5.836225266362253e-06,
"loss": 0.6386,
"step": 6850
},
{
"epoch": 1.2602739726027397,
"grad_norm": 11.66781997680664,
"learning_rate": 5.805783866057839e-06,
"loss": 0.6319,
"step": 6900
},
{
"epoch": 1.269406392694064,
"grad_norm": 40.73119354248047,
"learning_rate": 5.775342465753425e-06,
"loss": 0.6657,
"step": 6950
},
{
"epoch": 1.278538812785388,
"grad_norm": 9.036286354064941,
"learning_rate": 5.744901065449011e-06,
"loss": 0.5752,
"step": 7000
},
{
"epoch": 1.2876712328767124,
"grad_norm": 23.672693252563477,
"learning_rate": 5.715068493150685e-06,
"loss": 0.7783,
"step": 7050
},
{
"epoch": 1.2968036529680365,
"grad_norm": 19.77155303955078,
"learning_rate": 5.684627092846271e-06,
"loss": 0.6248,
"step": 7100
},
{
"epoch": 1.3059360730593608,
"grad_norm": 11.635490417480469,
"learning_rate": 5.654185692541857e-06,
"loss": 0.6601,
"step": 7150
},
{
"epoch": 1.3150684931506849,
"grad_norm": 9.181543350219727,
"learning_rate": 5.623744292237443e-06,
"loss": 0.6029,
"step": 7200
},
{
"epoch": 1.3242009132420092,
"grad_norm": 9.500978469848633,
"learning_rate": 5.593302891933029e-06,
"loss": 0.6676,
"step": 7250
},
{
"epoch": 1.3333333333333333,
"grad_norm": 16.34039878845215,
"learning_rate": 5.562861491628615e-06,
"loss": 0.6372,
"step": 7300
},
{
"epoch": 1.3424657534246576,
"grad_norm": 31.06061553955078,
"learning_rate": 5.532420091324201e-06,
"loss": 0.7154,
"step": 7350
},
{
"epoch": 1.3515981735159817,
"grad_norm": 27.624256134033203,
"learning_rate": 5.501978691019787e-06,
"loss": 0.6578,
"step": 7400
},
{
"epoch": 1.360730593607306,
"grad_norm": 15.686092376708984,
"learning_rate": 5.471537290715373e-06,
"loss": 0.6471,
"step": 7450
},
{
"epoch": 1.36986301369863,
"grad_norm": 13.39659595489502,
"learning_rate": 5.441095890410959e-06,
"loss": 0.6481,
"step": 7500
},
{
"epoch": 1.3789954337899544,
"grad_norm": 8.473033905029297,
"learning_rate": 5.410654490106545e-06,
"loss": 0.6719,
"step": 7550
},
{
"epoch": 1.3881278538812785,
"grad_norm": 20.205528259277344,
"learning_rate": 5.380213089802131e-06,
"loss": 0.5757,
"step": 7600
},
{
"epoch": 1.3972602739726028,
"grad_norm": 19.173370361328125,
"learning_rate": 5.349771689497717e-06,
"loss": 0.6266,
"step": 7650
},
{
"epoch": 1.4063926940639269,
"grad_norm": 18.172975540161133,
"learning_rate": 5.319330289193303e-06,
"loss": 0.5782,
"step": 7700
},
{
"epoch": 1.4155251141552512,
"grad_norm": 9.981927871704102,
"learning_rate": 5.288888888888889e-06,
"loss": 0.6568,
"step": 7750
},
{
"epoch": 1.4246575342465753,
"grad_norm": 10.33353328704834,
"learning_rate": 5.2584474885844746e-06,
"loss": 0.6683,
"step": 7800
},
{
"epoch": 1.4337899543378996,
"grad_norm": 29.337627410888672,
"learning_rate": 5.228006088280061e-06,
"loss": 0.6609,
"step": 7850
},
{
"epoch": 1.4429223744292237,
"grad_norm": 12.93662166595459,
"learning_rate": 5.197564687975647e-06,
"loss": 0.6648,
"step": 7900
},
{
"epoch": 1.452054794520548,
"grad_norm": 29.355287551879883,
"learning_rate": 5.167123287671233e-06,
"loss": 0.6149,
"step": 7950
},
{
"epoch": 1.461187214611872,
"grad_norm": 16.967021942138672,
"learning_rate": 5.136681887366819e-06,
"loss": 0.7242,
"step": 8000
},
{
"epoch": 1.461187214611872,
"eval_exact_match": 86.1116367076632,
"eval_f1": 92.4648953066692,
"eval_runtime": 406.5019,
"eval_samples_per_second": 26.002,
"eval_steps_per_second": 1.626,
"step": 8000
},
{
"epoch": 1.4703196347031964,
"grad_norm": 11.783562660217285,
"learning_rate": 5.106240487062405e-06,
"loss": 0.6163,
"step": 8050
},
{
"epoch": 1.4794520547945205,
"grad_norm": 13.724154472351074,
"learning_rate": 5.075799086757991e-06,
"loss": 0.654,
"step": 8100
},
{
"epoch": 1.4885844748858448,
"grad_norm": 10.402881622314453,
"learning_rate": 5.045357686453577e-06,
"loss": 0.5943,
"step": 8150
},
{
"epoch": 1.4977168949771689,
"grad_norm": 20.88226318359375,
"learning_rate": 5.0149162861491625e-06,
"loss": 0.6884,
"step": 8200
},
{
"epoch": 1.5068493150684932,
"grad_norm": 10.194794654846191,
"learning_rate": 4.984474885844749e-06,
"loss": 0.6385,
"step": 8250
},
{
"epoch": 1.5159817351598175,
"grad_norm": 5.215353488922119,
"learning_rate": 4.954033485540336e-06,
"loss": 0.6616,
"step": 8300
},
{
"epoch": 1.5251141552511416,
"grad_norm": 12.108441352844238,
"learning_rate": 4.923592085235921e-06,
"loss": 0.6275,
"step": 8350
},
{
"epoch": 1.5342465753424657,
"grad_norm": 15.30664348602295,
"learning_rate": 4.893150684931508e-06,
"loss": 0.6444,
"step": 8400
},
{
"epoch": 1.54337899543379,
"grad_norm": 18.936824798583984,
"learning_rate": 4.8627092846270935e-06,
"loss": 0.6434,
"step": 8450
},
{
"epoch": 1.5525114155251143,
"grad_norm": 7.780643939971924,
"learning_rate": 4.832267884322679e-06,
"loss": 0.6749,
"step": 8500
},
{
"epoch": 1.5616438356164384,
"grad_norm": 19.338315963745117,
"learning_rate": 4.801826484018266e-06,
"loss": 0.7254,
"step": 8550
},
{
"epoch": 1.5707762557077625,
"grad_norm": 15.236742973327637,
"learning_rate": 4.771385083713851e-06,
"loss": 0.6367,
"step": 8600
},
{
"epoch": 1.5799086757990868,
"grad_norm": 14.18782901763916,
"learning_rate": 4.740943683409437e-06,
"loss": 0.6125,
"step": 8650
},
{
"epoch": 1.589041095890411,
"grad_norm": 24.446998596191406,
"learning_rate": 4.710502283105024e-06,
"loss": 0.7166,
"step": 8700
},
{
"epoch": 1.5981735159817352,
"grad_norm": 10.571455955505371,
"learning_rate": 4.680060882800609e-06,
"loss": 0.6405,
"step": 8750
},
{
"epoch": 1.6073059360730593,
"grad_norm": 12.6207857131958,
"learning_rate": 4.649619482496196e-06,
"loss": 0.699,
"step": 8800
},
{
"epoch": 1.6164383561643836,
"grad_norm": 16.94082260131836,
"learning_rate": 4.6191780821917815e-06,
"loss": 0.6562,
"step": 8850
},
{
"epoch": 1.625570776255708,
"grad_norm": 12.49853229522705,
"learning_rate": 4.588736681887367e-06,
"loss": 0.6067,
"step": 8900
},
{
"epoch": 1.634703196347032,
"grad_norm": 31.45545196533203,
"learning_rate": 4.558295281582954e-06,
"loss": 0.5983,
"step": 8950
},
{
"epoch": 1.643835616438356,
"grad_norm": 11.867836952209473,
"learning_rate": 4.527853881278539e-06,
"loss": 0.6285,
"step": 9000
},
{
"epoch": 1.6529680365296804,
"grad_norm": 14.369145393371582,
"learning_rate": 4.497412480974125e-06,
"loss": 0.6224,
"step": 9050
},
{
"epoch": 1.6621004566210047,
"grad_norm": 10.372947692871094,
"learning_rate": 4.4669710806697116e-06,
"loss": 0.6846,
"step": 9100
},
{
"epoch": 1.6712328767123288,
"grad_norm": 31.318424224853516,
"learning_rate": 4.436529680365297e-06,
"loss": 0.6266,
"step": 9150
},
{
"epoch": 1.6803652968036529,
"grad_norm": 9.971720695495605,
"learning_rate": 4.406088280060884e-06,
"loss": 0.6116,
"step": 9200
},
{
"epoch": 1.6894977168949772,
"grad_norm": 13.311767578125,
"learning_rate": 4.3756468797564694e-06,
"loss": 0.6772,
"step": 9250
},
{
"epoch": 1.6986301369863015,
"grad_norm": 10.240290641784668,
"learning_rate": 4.345205479452055e-06,
"loss": 0.6617,
"step": 9300
},
{
"epoch": 1.7077625570776256,
"grad_norm": 13.63064956665039,
"learning_rate": 4.314764079147642e-06,
"loss": 0.6619,
"step": 9350
},
{
"epoch": 1.7168949771689497,
"grad_norm": 10.325277328491211,
"learning_rate": 4.284322678843227e-06,
"loss": 0.593,
"step": 9400
},
{
"epoch": 1.726027397260274,
"grad_norm": 11.242974281311035,
"learning_rate": 4.253881278538813e-06,
"loss": 0.5745,
"step": 9450
},
{
"epoch": 1.7351598173515983,
"grad_norm": 9.956265449523926,
"learning_rate": 4.2234398782343995e-06,
"loss": 0.6093,
"step": 9500
},
{
"epoch": 1.7442922374429224,
"grad_norm": 7.890584945678711,
"learning_rate": 4.192998477929985e-06,
"loss": 0.5413,
"step": 9550
},
{
"epoch": 1.7534246575342465,
"grad_norm": 27.36825942993164,
"learning_rate": 4.162557077625572e-06,
"loss": 0.6878,
"step": 9600
},
{
"epoch": 1.7625570776255708,
"grad_norm": 27.449216842651367,
"learning_rate": 4.1321156773211574e-06,
"loss": 0.5506,
"step": 9650
},
{
"epoch": 1.771689497716895,
"grad_norm": 13.601576805114746,
"learning_rate": 4.101674277016743e-06,
"loss": 0.6537,
"step": 9700
},
{
"epoch": 1.7808219178082192,
"grad_norm": 20.260234832763672,
"learning_rate": 4.07123287671233e-06,
"loss": 0.6504,
"step": 9750
},
{
"epoch": 1.7899543378995433,
"grad_norm": 8.724568367004395,
"learning_rate": 4.040791476407915e-06,
"loss": 0.646,
"step": 9800
},
{
"epoch": 1.7990867579908676,
"grad_norm": 12.175917625427246,
"learning_rate": 4.010350076103501e-06,
"loss": 0.6627,
"step": 9850
},
{
"epoch": 1.808219178082192,
"grad_norm": 9.895513534545898,
"learning_rate": 3.9799086757990875e-06,
"loss": 0.665,
"step": 9900
},
{
"epoch": 1.817351598173516,
"grad_norm": 9.971092224121094,
"learning_rate": 3.949467275494673e-06,
"loss": 0.6,
"step": 9950
},
{
"epoch": 1.82648401826484,
"grad_norm": 8.606634140014648,
"learning_rate": 3.91902587519026e-06,
"loss": 0.6091,
"step": 10000
},
{
"epoch": 1.82648401826484,
"eval_exact_match": 85.96026490066225,
"eval_f1": 92.43954998062115,
"eval_runtime": 406.4548,
"eval_samples_per_second": 26.005,
"eval_steps_per_second": 1.626,
"step": 10000
},
{
"epoch": 1.8356164383561644,
"grad_norm": 22.578126907348633,
"learning_rate": 3.888584474885845e-06,
"loss": 0.6935,
"step": 10050
},
{
"epoch": 1.8447488584474887,
"grad_norm": 29.010652542114258,
"learning_rate": 3.858143074581431e-06,
"loss": 0.6552,
"step": 10100
},
{
"epoch": 1.8538812785388128,
"grad_norm": 14.48583984375,
"learning_rate": 3.827701674277018e-06,
"loss": 0.6621,
"step": 10150
},
{
"epoch": 1.8630136986301369,
"grad_norm": 20.008892059326172,
"learning_rate": 3.797260273972603e-06,
"loss": 0.6703,
"step": 10200
},
{
"epoch": 1.8721461187214612,
"grad_norm": 32.114173889160156,
"learning_rate": 3.766818873668189e-06,
"loss": 0.6611,
"step": 10250
},
{
"epoch": 1.8812785388127855,
"grad_norm": 11.282811164855957,
"learning_rate": 3.7363774733637747e-06,
"loss": 0.6402,
"step": 10300
},
{
"epoch": 1.8904109589041096,
"grad_norm": 35.3563232421875,
"learning_rate": 3.7059360730593608e-06,
"loss": 0.5949,
"step": 10350
},
{
"epoch": 1.8995433789954337,
"grad_norm": 13.916271209716797,
"learning_rate": 3.675494672754947e-06,
"loss": 0.6059,
"step": 10400
},
{
"epoch": 1.908675799086758,
"grad_norm": 27.980178833007812,
"learning_rate": 3.645053272450533e-06,
"loss": 0.6022,
"step": 10450
},
{
"epoch": 1.9178082191780823,
"grad_norm": 10.44404411315918,
"learning_rate": 3.6146118721461186e-06,
"loss": 0.6091,
"step": 10500
},
{
"epoch": 1.9269406392694064,
"grad_norm": 9.538860321044922,
"learning_rate": 3.5841704718417047e-06,
"loss": 0.6522,
"step": 10550
},
{
"epoch": 1.9360730593607305,
"grad_norm": 10.887898445129395,
"learning_rate": 3.553729071537291e-06,
"loss": 0.6392,
"step": 10600
},
{
"epoch": 1.9452054794520548,
"grad_norm": 40.29354476928711,
"learning_rate": 3.5232876712328765e-06,
"loss": 0.6288,
"step": 10650
},
{
"epoch": 1.954337899543379,
"grad_norm": 29.277299880981445,
"learning_rate": 3.4928462709284626e-06,
"loss": 0.7117,
"step": 10700
},
{
"epoch": 1.9634703196347032,
"grad_norm": 30.566862106323242,
"learning_rate": 3.4624048706240487e-06,
"loss": 0.6687,
"step": 10750
},
{
"epoch": 1.9726027397260273,
"grad_norm": 40.0589485168457,
"learning_rate": 3.431963470319635e-06,
"loss": 0.6745,
"step": 10800
},
{
"epoch": 1.9817351598173516,
"grad_norm": 17.117198944091797,
"learning_rate": 3.4015220700152205e-06,
"loss": 0.6129,
"step": 10850
},
{
"epoch": 1.990867579908676,
"grad_norm": 22.412439346313477,
"learning_rate": 3.3710806697108066e-06,
"loss": 0.6759,
"step": 10900
},
{
"epoch": 2.0,
"grad_norm": 11.01193904876709,
"learning_rate": 3.3406392694063927e-06,
"loss": 0.6857,
"step": 10950
},
{
"epoch": 2.009132420091324,
"grad_norm": 8.564952850341797,
"learning_rate": 3.310197869101979e-06,
"loss": 0.4644,
"step": 11000
},
{
"epoch": 2.018264840182648,
"grad_norm": 13.085915565490723,
"learning_rate": 3.2797564687975645e-06,
"loss": 0.5165,
"step": 11050
},
{
"epoch": 2.0273972602739727,
"grad_norm": 14.992934226989746,
"learning_rate": 3.2493150684931506e-06,
"loss": 0.4602,
"step": 11100
},
{
"epoch": 2.036529680365297,
"grad_norm": 15.46022891998291,
"learning_rate": 3.2188736681887367e-06,
"loss": 0.4808,
"step": 11150
},
{
"epoch": 2.045662100456621,
"grad_norm": 18.564807891845703,
"learning_rate": 3.188432267884323e-06,
"loss": 0.5057,
"step": 11200
},
{
"epoch": 2.0547945205479454,
"grad_norm": 9.455687522888184,
"learning_rate": 3.1579908675799085e-06,
"loss": 0.5159,
"step": 11250
},
{
"epoch": 2.0639269406392695,
"grad_norm": 24.577774047851562,
"learning_rate": 3.1275494672754946e-06,
"loss": 0.5139,
"step": 11300
},
{
"epoch": 2.0730593607305936,
"grad_norm": 13.79776668548584,
"learning_rate": 3.0971080669710807e-06,
"loss": 0.4821,
"step": 11350
},
{
"epoch": 2.0821917808219177,
"grad_norm": 8.983718872070312,
"learning_rate": 3.066666666666667e-06,
"loss": 0.5006,
"step": 11400
},
{
"epoch": 2.091324200913242,
"grad_norm": 5.1394171714782715,
"learning_rate": 3.0362252663622525e-06,
"loss": 0.4897,
"step": 11450
},
{
"epoch": 2.1004566210045663,
"grad_norm": 20.14058494567871,
"learning_rate": 3.0057838660578386e-06,
"loss": 0.4637,
"step": 11500
},
{
"epoch": 2.1095890410958904,
"grad_norm": 12.311975479125977,
"learning_rate": 2.9753424657534247e-06,
"loss": 0.4711,
"step": 11550
},
{
"epoch": 2.1187214611872145,
"grad_norm": 14.106832504272461,
"learning_rate": 2.944901065449011e-06,
"loss": 0.4794,
"step": 11600
},
{
"epoch": 2.127853881278539,
"grad_norm": 13.880457878112793,
"learning_rate": 2.9144596651445965e-06,
"loss": 0.4539,
"step": 11650
},
{
"epoch": 2.136986301369863,
"grad_norm": 13.430336952209473,
"learning_rate": 2.884627092846271e-06,
"loss": 0.5021,
"step": 11700
},
{
"epoch": 2.146118721461187,
"grad_norm": 9.724105834960938,
"learning_rate": 2.854185692541857e-06,
"loss": 0.4958,
"step": 11750
},
{
"epoch": 2.1552511415525113,
"grad_norm": 9.325925827026367,
"learning_rate": 2.823744292237443e-06,
"loss": 0.5021,
"step": 11800
},
{
"epoch": 2.1643835616438354,
"grad_norm": 8.804998397827148,
"learning_rate": 2.7933028919330292e-06,
"loss": 0.491,
"step": 11850
},
{
"epoch": 2.17351598173516,
"grad_norm": 29.120508193969727,
"learning_rate": 2.762861491628615e-06,
"loss": 0.4636,
"step": 11900
},
{
"epoch": 2.182648401826484,
"grad_norm": 7.916499137878418,
"learning_rate": 2.732420091324201e-06,
"loss": 0.417,
"step": 11950
},
{
"epoch": 2.191780821917808,
"grad_norm": 4.8723297119140625,
"learning_rate": 2.701978691019787e-06,
"loss": 0.5017,
"step": 12000
},
{
"epoch": 2.191780821917808,
"eval_exact_match": 85.71428571428571,
"eval_f1": 92.4631336628407,
"eval_runtime": 406.3845,
"eval_samples_per_second": 26.01,
"eval_steps_per_second": 1.627,
"step": 12000
},
{
"epoch": 2.2009132420091326,
"grad_norm": 10.724153518676758,
"learning_rate": 2.6715372907153732e-06,
"loss": 0.4026,
"step": 12050
},
{
"epoch": 2.2100456621004567,
"grad_norm": 24.394311904907227,
"learning_rate": 2.641095890410959e-06,
"loss": 0.4777,
"step": 12100
},
{
"epoch": 2.219178082191781,
"grad_norm": 24.320796966552734,
"learning_rate": 2.610654490106545e-06,
"loss": 0.5017,
"step": 12150
},
{
"epoch": 2.228310502283105,
"grad_norm": 16.416059494018555,
"learning_rate": 2.580213089802131e-06,
"loss": 0.457,
"step": 12200
},
{
"epoch": 2.237442922374429,
"grad_norm": 12.136763572692871,
"learning_rate": 2.5497716894977172e-06,
"loss": 0.5128,
"step": 12250
},
{
"epoch": 2.2465753424657535,
"grad_norm": 11.742807388305664,
"learning_rate": 2.519330289193303e-06,
"loss": 0.4782,
"step": 12300
},
{
"epoch": 2.2557077625570776,
"grad_norm": 18.11116600036621,
"learning_rate": 2.488888888888889e-06,
"loss": 0.5777,
"step": 12350
},
{
"epoch": 2.2648401826484017,
"grad_norm": 19.796613693237305,
"learning_rate": 2.458447488584475e-06,
"loss": 0.4448,
"step": 12400
},
{
"epoch": 2.2739726027397262,
"grad_norm": 21.082096099853516,
"learning_rate": 2.4280060882800612e-06,
"loss": 0.5114,
"step": 12450
},
{
"epoch": 2.2831050228310503,
"grad_norm": 45.51653289794922,
"learning_rate": 2.397564687975647e-06,
"loss": 0.483,
"step": 12500
},
{
"epoch": 2.2922374429223744,
"grad_norm": 19.373531341552734,
"learning_rate": 2.367123287671233e-06,
"loss": 0.5448,
"step": 12550
},
{
"epoch": 2.3013698630136985,
"grad_norm": 13.89834976196289,
"learning_rate": 2.336681887366819e-06,
"loss": 0.4464,
"step": 12600
},
{
"epoch": 2.3105022831050226,
"grad_norm": 7.9293317794799805,
"learning_rate": 2.306240487062405e-06,
"loss": 0.5133,
"step": 12650
},
{
"epoch": 2.319634703196347,
"grad_norm": 12.511297225952148,
"learning_rate": 2.275799086757991e-06,
"loss": 0.5569,
"step": 12700
},
{
"epoch": 2.328767123287671,
"grad_norm": 11.701761245727539,
"learning_rate": 2.245357686453577e-06,
"loss": 0.449,
"step": 12750
},
{
"epoch": 2.3378995433789953,
"grad_norm": 9.876680374145508,
"learning_rate": 2.214916286149163e-06,
"loss": 0.4601,
"step": 12800
},
{
"epoch": 2.34703196347032,
"grad_norm": 16.884044647216797,
"learning_rate": 2.1844748858447488e-06,
"loss": 0.4786,
"step": 12850
},
{
"epoch": 2.356164383561644,
"grad_norm": 7.96138858795166,
"learning_rate": 2.154033485540335e-06,
"loss": 0.4868,
"step": 12900
},
{
"epoch": 2.365296803652968,
"grad_norm": 20.283720016479492,
"learning_rate": 2.123592085235921e-06,
"loss": 0.5288,
"step": 12950
},
{
"epoch": 2.374429223744292,
"grad_norm": 14.561513900756836,
"learning_rate": 2.093150684931507e-06,
"loss": 0.4397,
"step": 13000
},
{
"epoch": 2.383561643835616,
"grad_norm": 31.751888275146484,
"learning_rate": 2.0627092846270928e-06,
"loss": 0.451,
"step": 13050
},
{
"epoch": 2.3926940639269407,
"grad_norm": 20.009138107299805,
"learning_rate": 2.032267884322679e-06,
"loss": 0.4765,
"step": 13100
},
{
"epoch": 2.401826484018265,
"grad_norm": 15.038084030151367,
"learning_rate": 2.001826484018265e-06,
"loss": 0.4864,
"step": 13150
},
{
"epoch": 2.410958904109589,
"grad_norm": 7.947664737701416,
"learning_rate": 1.971385083713851e-06,
"loss": 0.4893,
"step": 13200
},
{
"epoch": 2.4200913242009134,
"grad_norm": 32.7381591796875,
"learning_rate": 1.9409436834094368e-06,
"loss": 0.5467,
"step": 13250
},
{
"epoch": 2.4292237442922375,
"grad_norm": 13.605596542358398,
"learning_rate": 1.910502283105023e-06,
"loss": 0.5008,
"step": 13300
},
{
"epoch": 2.4383561643835616,
"grad_norm": 15.449544906616211,
"learning_rate": 1.880060882800609e-06,
"loss": 0.4838,
"step": 13350
},
{
"epoch": 2.4474885844748857,
"grad_norm": 19.095609664916992,
"learning_rate": 1.8496194824961949e-06,
"loss": 0.4473,
"step": 13400
},
{
"epoch": 2.45662100456621,
"grad_norm": 28.19283676147461,
"learning_rate": 1.819178082191781e-06,
"loss": 0.4662,
"step": 13450
},
{
"epoch": 2.4657534246575343,
"grad_norm": 9.908361434936523,
"learning_rate": 1.7887366818873668e-06,
"loss": 0.5119,
"step": 13500
},
{
"epoch": 2.4748858447488584,
"grad_norm": 11.031998634338379,
"learning_rate": 1.758295281582953e-06,
"loss": 0.5061,
"step": 13550
},
{
"epoch": 2.4840182648401825,
"grad_norm": 16.79950714111328,
"learning_rate": 1.7278538812785388e-06,
"loss": 0.4602,
"step": 13600
},
{
"epoch": 2.493150684931507,
"grad_norm": 73.04358673095703,
"learning_rate": 1.697412480974125e-06,
"loss": 0.4397,
"step": 13650
},
{
"epoch": 2.502283105022831,
"grad_norm": 9.9924955368042,
"learning_rate": 1.6669710806697108e-06,
"loss": 0.5315,
"step": 13700
},
{
"epoch": 2.5114155251141552,
"grad_norm": 10.066008567810059,
"learning_rate": 1.636529680365297e-06,
"loss": 0.4622,
"step": 13750
},
{
"epoch": 2.5205479452054793,
"grad_norm": 14.022153854370117,
"learning_rate": 1.6060882800608828e-06,
"loss": 0.545,
"step": 13800
},
{
"epoch": 2.5296803652968034,
"grad_norm": 9.877713203430176,
"learning_rate": 1.575646879756469e-06,
"loss": 0.4607,
"step": 13850
},
{
"epoch": 2.538812785388128,
"grad_norm": 9.370101928710938,
"learning_rate": 1.5452054794520548e-06,
"loss": 0.4565,
"step": 13900
},
{
"epoch": 2.547945205479452,
"grad_norm": 18.39552879333496,
"learning_rate": 1.514764079147641e-06,
"loss": 0.5645,
"step": 13950
},
{
"epoch": 2.557077625570776,
"grad_norm": 17.700393676757812,
"learning_rate": 1.4843226788432268e-06,
"loss": 0.4371,
"step": 14000
},
{
"epoch": 2.557077625570776,
"eval_exact_match": 86.20624408703878,
"eval_f1": 92.64669797483194,
"eval_runtime": 406.4221,
"eval_samples_per_second": 26.007,
"eval_steps_per_second": 1.626,
"step": 14000
},
{
"epoch": 2.5662100456621006,
"grad_norm": 12.384415626525879,
"learning_rate": 1.453881278538813e-06,
"loss": 0.462,
"step": 14050
},
{
"epoch": 2.5753424657534247,
"grad_norm": 12.213122367858887,
"learning_rate": 1.4240487062404874e-06,
"loss": 0.4749,
"step": 14100
},
{
"epoch": 2.584474885844749,
"grad_norm": 13.337528228759766,
"learning_rate": 1.3936073059360733e-06,
"loss": 0.4622,
"step": 14150
},
{
"epoch": 2.593607305936073,
"grad_norm": 247.74070739746094,
"learning_rate": 1.3631659056316594e-06,
"loss": 0.4973,
"step": 14200
},
{
"epoch": 2.602739726027397,
"grad_norm": 19.168542861938477,
"learning_rate": 1.3327245053272453e-06,
"loss": 0.4236,
"step": 14250
},
{
"epoch": 2.6118721461187215,
"grad_norm": 38.19758224487305,
"learning_rate": 1.3022831050228314e-06,
"loss": 0.4858,
"step": 14300
},
{
"epoch": 2.6210045662100456,
"grad_norm": 19.262054443359375,
"learning_rate": 1.2718417047184173e-06,
"loss": 0.4157,
"step": 14350
},
{
"epoch": 2.6301369863013697,
"grad_norm": 8.72314739227295,
"learning_rate": 1.2414003044140032e-06,
"loss": 0.4958,
"step": 14400
},
{
"epoch": 2.6392694063926943,
"grad_norm": 14.77987289428711,
"learning_rate": 1.210958904109589e-06,
"loss": 0.4949,
"step": 14450
},
{
"epoch": 2.6484018264840183,
"grad_norm": 26.928768157958984,
"learning_rate": 1.1805175038051752e-06,
"loss": 0.4543,
"step": 14500
},
{
"epoch": 2.6575342465753424,
"grad_norm": 26.89804458618164,
"learning_rate": 1.150076103500761e-06,
"loss": 0.4866,
"step": 14550
},
{
"epoch": 2.6666666666666665,
"grad_norm": 24.923315048217773,
"learning_rate": 1.1196347031963471e-06,
"loss": 0.5296,
"step": 14600
},
{
"epoch": 2.6757990867579906,
"grad_norm": 22.595211029052734,
"learning_rate": 1.089193302891933e-06,
"loss": 0.5194,
"step": 14650
},
{
"epoch": 2.684931506849315,
"grad_norm": 6.4614949226379395,
"learning_rate": 1.0587519025875191e-06,
"loss": 0.5242,
"step": 14700
},
{
"epoch": 2.6940639269406392,
"grad_norm": 12.488426208496094,
"learning_rate": 1.028310502283105e-06,
"loss": 0.4829,
"step": 14750
},
{
"epoch": 2.7031963470319633,
"grad_norm": 11.035359382629395,
"learning_rate": 9.978691019786911e-07,
"loss": 0.4688,
"step": 14800
},
{
"epoch": 2.712328767123288,
"grad_norm": 15.173184394836426,
"learning_rate": 9.67427701674277e-07,
"loss": 0.4491,
"step": 14850
},
{
"epoch": 2.721461187214612,
"grad_norm": 10.253326416015625,
"learning_rate": 9.369863013698631e-07,
"loss": 0.4675,
"step": 14900
},
{
"epoch": 2.730593607305936,
"grad_norm": 21.999897003173828,
"learning_rate": 9.065449010654491e-07,
"loss": 0.4482,
"step": 14950
},
{
"epoch": 2.73972602739726,
"grad_norm": 20.603681564331055,
"learning_rate": 8.761035007610351e-07,
"loss": 0.4908,
"step": 15000
},
{
"epoch": 2.748858447488584,
"grad_norm": 28.629119873046875,
"learning_rate": 8.456621004566211e-07,
"loss": 0.51,
"step": 15050
},
{
"epoch": 2.7579908675799087,
"grad_norm": 10.979697227478027,
"learning_rate": 8.152207001522071e-07,
"loss": 0.4738,
"step": 15100
},
{
"epoch": 2.767123287671233,
"grad_norm": 23.009090423583984,
"learning_rate": 7.847792998477931e-07,
"loss": 0.4778,
"step": 15150
},
{
"epoch": 2.776255707762557,
"grad_norm": 13.957650184631348,
"learning_rate": 7.543378995433791e-07,
"loss": 0.5011,
"step": 15200
},
{
"epoch": 2.7853881278538815,
"grad_norm": 51.35847091674805,
"learning_rate": 7.238964992389651e-07,
"loss": 0.4852,
"step": 15250
},
{
"epoch": 2.7945205479452055,
"grad_norm": 13.39809799194336,
"learning_rate": 6.934550989345511e-07,
"loss": 0.4967,
"step": 15300
},
{
"epoch": 2.8036529680365296,
"grad_norm": 16.116273880004883,
"learning_rate": 6.630136986301371e-07,
"loss": 0.4922,
"step": 15350
},
{
"epoch": 2.8127853881278537,
"grad_norm": 11.181424140930176,
"learning_rate": 6.325722983257231e-07,
"loss": 0.4525,
"step": 15400
},
{
"epoch": 2.821917808219178,
"grad_norm": 11.645832061767578,
"learning_rate": 6.021308980213091e-07,
"loss": 0.4547,
"step": 15450
},
{
"epoch": 2.8310502283105023,
"grad_norm": 37.076683044433594,
"learning_rate": 5.716894977168951e-07,
"loss": 0.4633,
"step": 15500
},
{
"epoch": 2.8401826484018264,
"grad_norm": 13.449100494384766,
"learning_rate": 5.412480974124811e-07,
"loss": 0.4936,
"step": 15550
},
{
"epoch": 2.8493150684931505,
"grad_norm": 20.85655975341797,
"learning_rate": 5.10806697108067e-07,
"loss": 0.4931,
"step": 15600
},
{
"epoch": 2.858447488584475,
"grad_norm": 19.300870895385742,
"learning_rate": 4.80365296803653e-07,
"loss": 0.4486,
"step": 15650
},
{
"epoch": 2.867579908675799,
"grad_norm": 30.8187255859375,
"learning_rate": 4.49923896499239e-07,
"loss": 0.4809,
"step": 15700
},
{
"epoch": 2.8767123287671232,
"grad_norm": 15.742734909057617,
"learning_rate": 4.19482496194825e-07,
"loss": 0.4727,
"step": 15750
},
{
"epoch": 2.8858447488584473,
"grad_norm": 11.21142578125,
"learning_rate": 3.89041095890411e-07,
"loss": 0.4315,
"step": 15800
},
{
"epoch": 2.8949771689497714,
"grad_norm": 19.95684242248535,
"learning_rate": 3.58599695585997e-07,
"loss": 0.4594,
"step": 15850
},
{
"epoch": 2.904109589041096,
"grad_norm": 10.31857681274414,
"learning_rate": 3.28158295281583e-07,
"loss": 0.4698,
"step": 15900
},
{
"epoch": 2.91324200913242,
"grad_norm": 11.127820014953613,
"learning_rate": 2.97716894977169e-07,
"loss": 0.4376,
"step": 15950
},
{
"epoch": 2.922374429223744,
"grad_norm": 11.543863296508789,
"learning_rate": 2.6727549467275497e-07,
"loss": 0.4423,
"step": 16000
},
{
"epoch": 2.922374429223744,
"eval_exact_match": 86.14001892147587,
"eval_f1": 92.68818723743551,
"eval_runtime": 406.3458,
"eval_samples_per_second": 26.012,
"eval_steps_per_second": 1.627,
"step": 16000
},
{
"epoch": 2.9315068493150687,
"grad_norm": 6.806540012359619,
"learning_rate": 2.3683409436834097e-07,
"loss": 0.4837,
"step": 16050
},
{
"epoch": 2.9406392694063928,
"grad_norm": 11.885407447814941,
"learning_rate": 2.0639269406392697e-07,
"loss": 0.4772,
"step": 16100
},
{
"epoch": 2.949771689497717,
"grad_norm": 14.364296913146973,
"learning_rate": 1.7595129375951297e-07,
"loss": 0.4339,
"step": 16150
},
{
"epoch": 2.958904109589041,
"grad_norm": 30.527639389038086,
"learning_rate": 1.461187214611872e-07,
"loss": 0.5007,
"step": 16200
},
{
"epoch": 2.968036529680365,
"grad_norm": 31.71573829650879,
"learning_rate": 1.1567732115677322e-07,
"loss": 0.4726,
"step": 16250
},
{
"epoch": 2.9771689497716896,
"grad_norm": 12.93132209777832,
"learning_rate": 8.523592085235922e-08,
"loss": 0.4475,
"step": 16300
},
{
"epoch": 2.9863013698630136,
"grad_norm": 5.9728569984436035,
"learning_rate": 5.479452054794521e-08,
"loss": 0.4457,
"step": 16350
},
{
"epoch": 2.9954337899543377,
"grad_norm": 9.14331340789795,
"learning_rate": 2.4353120243531205e-08,
"loss": 0.4563,
"step": 16400
},
{
"epoch": 3.0,
"step": 16425,
"total_flos": 2.0062622083669033e+18,
"train_loss": 0.7534013637850455,
"train_runtime": 50383.896,
"train_samples_per_second": 5.216,
"train_steps_per_second": 0.326
}
],
"logging_steps": 50,
"max_steps": 16425,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.0062622083669033e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}