{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 91,
"global_step": 362,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0027624309392265192,
"grad_norm": 0.42503952980041504,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.7314,
"step": 1
},
{
"epoch": 0.0027624309392265192,
"eval_loss": 1.3109512329101562,
"eval_runtime": 138.9562,
"eval_samples_per_second": 10.241,
"eval_steps_per_second": 0.324,
"step": 1
},
{
"epoch": 0.0055248618784530384,
"grad_norm": 0.4957458972930908,
"learning_rate": 4.000000000000001e-06,
"loss": 1.8536,
"step": 2
},
{
"epoch": 0.008287292817679558,
"grad_norm": 0.4254479706287384,
"learning_rate": 6e-06,
"loss": 1.7545,
"step": 3
},
{
"epoch": 0.011049723756906077,
"grad_norm": 0.4761411249637604,
"learning_rate": 8.000000000000001e-06,
"loss": 1.7814,
"step": 4
},
{
"epoch": 0.013812154696132596,
"grad_norm": 0.47434934973716736,
"learning_rate": 1e-05,
"loss": 1.7703,
"step": 5
},
{
"epoch": 0.016574585635359115,
"grad_norm": 0.4699034094810486,
"learning_rate": 1.2e-05,
"loss": 1.83,
"step": 6
},
{
"epoch": 0.019337016574585635,
"grad_norm": 0.4335905909538269,
"learning_rate": 1.4e-05,
"loss": 1.6648,
"step": 7
},
{
"epoch": 0.022099447513812154,
"grad_norm": 0.4264945685863495,
"learning_rate": 1.6000000000000003e-05,
"loss": 1.7164,
"step": 8
},
{
"epoch": 0.024861878453038673,
"grad_norm": 0.417251318693161,
"learning_rate": 1.8e-05,
"loss": 1.7058,
"step": 9
},
{
"epoch": 0.027624309392265192,
"grad_norm": 0.42205414175987244,
"learning_rate": 2e-05,
"loss": 1.71,
"step": 10
},
{
"epoch": 0.03038674033149171,
"grad_norm": 0.414096862077713,
"learning_rate": 1.9999601726381415e-05,
"loss": 1.6954,
"step": 11
},
{
"epoch": 0.03314917127071823,
"grad_norm": 0.412890762090683,
"learning_rate": 1.9998406937250035e-05,
"loss": 1.6941,
"step": 12
},
{
"epoch": 0.03591160220994475,
"grad_norm": 0.4428125321865082,
"learning_rate": 1.9996415727776456e-05,
"loss": 1.6439,
"step": 13
},
{
"epoch": 0.03867403314917127,
"grad_norm": 0.4378657341003418,
"learning_rate": 1.999362825656992e-05,
"loss": 1.7518,
"step": 14
},
{
"epoch": 0.04143646408839779,
"grad_norm": 0.4670448303222656,
"learning_rate": 1.9990044745665672e-05,
"loss": 1.6461,
"step": 15
},
{
"epoch": 0.04419889502762431,
"grad_norm": 0.44707468152046204,
"learning_rate": 1.998566548050729e-05,
"loss": 1.7201,
"step": 16
},
{
"epoch": 0.04696132596685083,
"grad_norm": 0.4317484200000763,
"learning_rate": 1.9980490809923928e-05,
"loss": 1.7697,
"step": 17
},
{
"epoch": 0.049723756906077346,
"grad_norm": 0.390918105840683,
"learning_rate": 1.9974521146102535e-05,
"loss": 1.6336,
"step": 18
},
{
"epoch": 0.052486187845303865,
"grad_norm": 0.44283199310302734,
"learning_rate": 1.9967756964555044e-05,
"loss": 1.7058,
"step": 19
},
{
"epoch": 0.055248618784530384,
"grad_norm": 0.40721943974494934,
"learning_rate": 1.9960198804080462e-05,
"loss": 1.7619,
"step": 20
},
{
"epoch": 0.058011049723756904,
"grad_norm": 0.3747328817844391,
"learning_rate": 1.995184726672197e-05,
"loss": 1.6998,
"step": 21
},
{
"epoch": 0.06077348066298342,
"grad_norm": 0.3749975860118866,
"learning_rate": 1.9942703017718977e-05,
"loss": 1.7488,
"step": 22
},
{
"epoch": 0.06353591160220995,
"grad_norm": 0.36129963397979736,
"learning_rate": 1.99327667854541e-05,
"loss": 1.6649,
"step": 23
},
{
"epoch": 0.06629834254143646,
"grad_norm": 0.3862815797328949,
"learning_rate": 1.9922039361395186e-05,
"loss": 1.626,
"step": 24
},
{
"epoch": 0.06906077348066299,
"grad_norm": 0.4051111042499542,
"learning_rate": 1.991052160003223e-05,
"loss": 1.6523,
"step": 25
},
{
"epoch": 0.0718232044198895,
"grad_norm": 0.42131727933883667,
"learning_rate": 1.989821441880933e-05,
"loss": 1.8082,
"step": 26
},
{
"epoch": 0.07458563535911603,
"grad_norm": 0.39152222871780396,
"learning_rate": 1.9885118798051607e-05,
"loss": 1.6831,
"step": 27
},
{
"epoch": 0.07734806629834254,
"grad_norm": 0.40526384115219116,
"learning_rate": 1.9871235780887114e-05,
"loss": 1.6613,
"step": 28
},
{
"epoch": 0.08011049723756906,
"grad_norm": 0.3861314654350281,
"learning_rate": 1.9856566473163747e-05,
"loss": 1.6981,
"step": 29
},
{
"epoch": 0.08287292817679558,
"grad_norm": 0.38509440422058105,
"learning_rate": 1.984111204336116e-05,
"loss": 1.6309,
"step": 30
},
{
"epoch": 0.0856353591160221,
"grad_norm": 0.3785051107406616,
"learning_rate": 1.9824873722497694e-05,
"loss": 1.675,
"step": 31
},
{
"epoch": 0.08839779005524862,
"grad_norm": 0.4013555645942688,
"learning_rate": 1.9807852804032306e-05,
"loss": 1.6033,
"step": 32
},
{
"epoch": 0.09116022099447514,
"grad_norm": 0.36987578868865967,
"learning_rate": 1.9790050643761552e-05,
"loss": 1.643,
"step": 33
},
{
"epoch": 0.09392265193370165,
"grad_norm": 0.34048470854759216,
"learning_rate": 1.9771468659711595e-05,
"loss": 1.7037,
"step": 34
},
{
"epoch": 0.09668508287292818,
"grad_norm": 0.370815247297287,
"learning_rate": 1.975210833202524e-05,
"loss": 1.7189,
"step": 35
},
{
"epoch": 0.09944751381215469,
"grad_norm": 0.3450065553188324,
"learning_rate": 1.9731971202844036e-05,
"loss": 1.6371,
"step": 36
},
{
"epoch": 0.10220994475138122,
"grad_norm": 0.3642140328884125,
"learning_rate": 1.9711058876185446e-05,
"loss": 1.6962,
"step": 37
},
{
"epoch": 0.10497237569060773,
"grad_norm": 0.3298833668231964,
"learning_rate": 1.9689373017815076e-05,
"loss": 1.5134,
"step": 38
},
{
"epoch": 0.10773480662983426,
"grad_norm": 0.35403338074684143,
"learning_rate": 1.9666915355113976e-05,
"loss": 1.7167,
"step": 39
},
{
"epoch": 0.11049723756906077,
"grad_norm": 0.36911338567733765,
"learning_rate": 1.964368767694107e-05,
"loss": 1.6674,
"step": 40
},
{
"epoch": 0.1132596685082873,
"grad_norm": 0.38089630007743835,
"learning_rate": 1.9619691833490645e-05,
"loss": 1.5777,
"step": 41
},
{
"epoch": 0.11602209944751381,
"grad_norm": 0.3290576934814453,
"learning_rate": 1.9594929736144978e-05,
"loss": 1.6851,
"step": 42
},
{
"epoch": 0.11878453038674033,
"grad_norm": 0.3029409348964691,
"learning_rate": 1.956940335732209e-05,
"loss": 1.2519,
"step": 43
},
{
"epoch": 0.12154696132596685,
"grad_norm": 0.3706592321395874,
"learning_rate": 1.954311473031864e-05,
"loss": 1.6061,
"step": 44
},
{
"epoch": 0.12430939226519337,
"grad_norm": 0.36404719948768616,
"learning_rate": 1.9516065949147945e-05,
"loss": 1.5515,
"step": 45
},
{
"epoch": 0.1270718232044199,
"grad_norm": 0.33177605271339417,
"learning_rate": 1.9488259168373198e-05,
"loss": 1.5692,
"step": 46
},
{
"epoch": 0.1298342541436464,
"grad_norm": 0.3582829535007477,
"learning_rate": 1.9459696602935838e-05,
"loss": 1.5336,
"step": 47
},
{
"epoch": 0.13259668508287292,
"grad_norm": 0.3565369248390198,
"learning_rate": 1.9430380527979124e-05,
"loss": 1.6251,
"step": 48
},
{
"epoch": 0.13535911602209943,
"grad_norm": 0.3471137285232544,
"learning_rate": 1.94003132786669e-05,
"loss": 1.6245,
"step": 49
},
{
"epoch": 0.13812154696132597,
"grad_norm": 0.4011210501194,
"learning_rate": 1.936949724999762e-05,
"loss": 1.3687,
"step": 50
},
{
"epoch": 0.1408839779005525,
"grad_norm": 0.36058053374290466,
"learning_rate": 1.9337934896613516e-05,
"loss": 1.5283,
"step": 51
},
{
"epoch": 0.143646408839779,
"grad_norm": 0.39054054021835327,
"learning_rate": 1.930562873260514e-05,
"loss": 1.5784,
"step": 52
},
{
"epoch": 0.1464088397790055,
"grad_norm": 0.39804011583328247,
"learning_rate": 1.927258133131105e-05,
"loss": 1.605,
"step": 53
},
{
"epoch": 0.14917127071823205,
"grad_norm": 0.42572247982025146,
"learning_rate": 1.9238795325112867e-05,
"loss": 1.4554,
"step": 54
},
{
"epoch": 0.15193370165745856,
"grad_norm": 0.4022100269794464,
"learning_rate": 1.9204273405225588e-05,
"loss": 1.5905,
"step": 55
},
{
"epoch": 0.15469613259668508,
"grad_norm": 0.43215224146842957,
"learning_rate": 1.9169018321483198e-05,
"loss": 1.6233,
"step": 56
},
{
"epoch": 0.1574585635359116,
"grad_norm": 0.38549479842185974,
"learning_rate": 1.9133032882119656e-05,
"loss": 1.5868,
"step": 57
},
{
"epoch": 0.16022099447513813,
"grad_norm": 0.42155200242996216,
"learning_rate": 1.9096319953545186e-05,
"loss": 1.4519,
"step": 58
},
{
"epoch": 0.16298342541436464,
"grad_norm": 0.4292687177658081,
"learning_rate": 1.9058882460117972e-05,
"loss": 1.6053,
"step": 59
},
{
"epoch": 0.16574585635359115,
"grad_norm": 0.44173118472099304,
"learning_rate": 1.9020723383911214e-05,
"loss": 1.6328,
"step": 60
},
{
"epoch": 0.1685082872928177,
"grad_norm": 0.4508700966835022,
"learning_rate": 1.8981845764475585e-05,
"loss": 1.6,
"step": 61
},
{
"epoch": 0.1712707182320442,
"grad_norm": 0.42624732851982117,
"learning_rate": 1.8942252698597113e-05,
"loss": 1.6086,
"step": 62
},
{
"epoch": 0.17403314917127072,
"grad_norm": 0.4029814600944519,
"learning_rate": 1.890194734005053e-05,
"loss": 1.5955,
"step": 63
},
{
"epoch": 0.17679558011049723,
"grad_norm": 0.4393555223941803,
"learning_rate": 1.8860932899348028e-05,
"loss": 1.5735,
"step": 64
},
{
"epoch": 0.17955801104972377,
"grad_norm": 0.40672072768211365,
"learning_rate": 1.881921264348355e-05,
"loss": 1.5975,
"step": 65
},
{
"epoch": 0.18232044198895028,
"grad_norm": 0.4309908151626587,
"learning_rate": 1.8776789895672557e-05,
"loss": 1.5539,
"step": 66
},
{
"epoch": 0.1850828729281768,
"grad_norm": 0.41728851199150085,
"learning_rate": 1.8733668035087302e-05,
"loss": 1.5678,
"step": 67
},
{
"epoch": 0.1878453038674033,
"grad_norm": 0.46532243490219116,
"learning_rate": 1.8689850496587674e-05,
"loss": 1.5277,
"step": 68
},
{
"epoch": 0.19060773480662985,
"grad_norm": 0.43453872203826904,
"learning_rate": 1.8645340770447595e-05,
"loss": 1.5192,
"step": 69
},
{
"epoch": 0.19337016574585636,
"grad_norm": 0.46961575746536255,
"learning_rate": 1.8600142402077006e-05,
"loss": 1.4962,
"step": 70
},
{
"epoch": 0.19613259668508287,
"grad_norm": 0.47958558797836304,
"learning_rate": 1.8554258991739454e-05,
"loss": 1.502,
"step": 71
},
{
"epoch": 0.19889502762430938,
"grad_norm": 0.4123697280883789,
"learning_rate": 1.850769419426531e-05,
"loss": 1.4997,
"step": 72
},
{
"epoch": 0.20165745856353592,
"grad_norm": 0.5050204396247864,
"learning_rate": 1.8460451718760653e-05,
"loss": 1.5283,
"step": 73
},
{
"epoch": 0.20441988950276244,
"grad_norm": 0.44690632820129395,
"learning_rate": 1.8412535328311813e-05,
"loss": 1.5429,
"step": 74
},
{
"epoch": 0.20718232044198895,
"grad_norm": 0.45188507437705994,
"learning_rate": 1.8363948839685638e-05,
"loss": 1.532,
"step": 75
},
{
"epoch": 0.20994475138121546,
"grad_norm": 0.4514811038970947,
"learning_rate": 1.8314696123025456e-05,
"loss": 1.4147,
"step": 76
},
{
"epoch": 0.212707182320442,
"grad_norm": 0.5114113688468933,
"learning_rate": 1.8264781101542797e-05,
"loss": 1.5552,
"step": 77
},
{
"epoch": 0.2154696132596685,
"grad_norm": 0.51650071144104,
"learning_rate": 1.8214207751204917e-05,
"loss": 1.5554,
"step": 78
},
{
"epoch": 0.21823204419889503,
"grad_norm": 0.48588883876800537,
"learning_rate": 1.816298010041806e-05,
"loss": 1.5688,
"step": 79
},
{
"epoch": 0.22099447513812154,
"grad_norm": 0.4869782328605652,
"learning_rate": 1.8111102229706593e-05,
"loss": 1.6062,
"step": 80
},
{
"epoch": 0.22375690607734808,
"grad_norm": 0.4736042320728302,
"learning_rate": 1.805857827138798e-05,
"loss": 1.4834,
"step": 81
},
{
"epoch": 0.2265193370165746,
"grad_norm": 0.5085079073905945,
"learning_rate": 1.8005412409243604e-05,
"loss": 1.4549,
"step": 82
},
{
"epoch": 0.2292817679558011,
"grad_norm": 0.5268322229385376,
"learning_rate": 1.7951608878185533e-05,
"loss": 1.4762,
"step": 83
},
{
"epoch": 0.23204419889502761,
"grad_norm": 0.5247999429702759,
"learning_rate": 1.789717196391916e-05,
"loss": 1.4625,
"step": 84
},
{
"epoch": 0.23480662983425415,
"grad_norm": 0.5090611577033997,
"learning_rate": 1.7842106002601854e-05,
"loss": 1.4634,
"step": 85
},
{
"epoch": 0.23756906077348067,
"grad_norm": 0.500877857208252,
"learning_rate": 1.778641538049755e-05,
"loss": 1.5259,
"step": 86
},
{
"epoch": 0.24033149171270718,
"grad_norm": 0.5528935790061951,
"learning_rate": 1.773010453362737e-05,
"loss": 1.4783,
"step": 87
},
{
"epoch": 0.2430939226519337,
"grad_norm": 0.4904051125049591,
"learning_rate": 1.7673177947416258e-05,
"loss": 1.5713,
"step": 88
},
{
"epoch": 0.24585635359116023,
"grad_norm": 0.4856030344963074,
"learning_rate": 1.7615640156335713e-05,
"loss": 1.5448,
"step": 89
},
{
"epoch": 0.24861878453038674,
"grad_norm": 0.4999338388442993,
"learning_rate": 1.7557495743542586e-05,
"loss": 1.5286,
"step": 90
},
{
"epoch": 0.2513812154696133,
"grad_norm": 0.6124939322471619,
"learning_rate": 1.749874934051401e-05,
"loss": 1.5815,
"step": 91
},
{
"epoch": 0.2513812154696133,
"eval_loss": 1.121329426765442,
"eval_runtime": 153.7165,
"eval_samples_per_second": 9.257,
"eval_steps_per_second": 0.293,
"step": 91
},
{
"epoch": 0.2541436464088398,
"grad_norm": 0.5473253726959229,
"learning_rate": 1.7439405626678496e-05,
"loss": 1.5358,
"step": 92
},
{
"epoch": 0.2569060773480663,
"grad_norm": 0.515661895275116,
"learning_rate": 1.7379469329043166e-05,
"loss": 1.5705,
"step": 93
},
{
"epoch": 0.2596685082872928,
"grad_norm": 0.5037718415260315,
"learning_rate": 1.7318945221817255e-05,
"loss": 1.5362,
"step": 94
},
{
"epoch": 0.26243093922651933,
"grad_norm": 0.5043213367462158,
"learning_rate": 1.7257838126031797e-05,
"loss": 1.5082,
"step": 95
},
{
"epoch": 0.26519337016574585,
"grad_norm": 0.5211421847343445,
"learning_rate": 1.719615290915563e-05,
"loss": 1.5486,
"step": 96
},
{
"epoch": 0.26795580110497236,
"grad_norm": 0.5876896381378174,
"learning_rate": 1.7133894484707657e-05,
"loss": 1.4926,
"step": 97
},
{
"epoch": 0.27071823204419887,
"grad_norm": 0.5891074538230896,
"learning_rate": 1.7071067811865477e-05,
"loss": 1.5895,
"step": 98
},
{
"epoch": 0.27348066298342544,
"grad_norm": 0.539527952671051,
"learning_rate": 1.7007677895070358e-05,
"loss": 1.4588,
"step": 99
},
{
"epoch": 0.27624309392265195,
"grad_norm": 0.4934506416320801,
"learning_rate": 1.694372978362861e-05,
"loss": 1.5116,
"step": 100
},
{
"epoch": 0.27900552486187846,
"grad_norm": 0.5579091906547546,
"learning_rate": 1.6879228571309377e-05,
"loss": 1.4786,
"step": 101
},
{
"epoch": 0.281767955801105,
"grad_norm": 0.5706738233566284,
"learning_rate": 1.6814179395938915e-05,
"loss": 1.549,
"step": 102
},
{
"epoch": 0.2845303867403315,
"grad_norm": 0.5942355394363403,
"learning_rate": 1.6748587438991303e-05,
"loss": 1.4979,
"step": 103
},
{
"epoch": 0.287292817679558,
"grad_norm": 0.5835041403770447,
"learning_rate": 1.6682457925175762e-05,
"loss": 1.4915,
"step": 104
},
{
"epoch": 0.2900552486187845,
"grad_norm": 0.6035829782485962,
"learning_rate": 1.6615796122020443e-05,
"loss": 1.5754,
"step": 105
},
{
"epoch": 0.292817679558011,
"grad_norm": 0.5598711967468262,
"learning_rate": 1.6548607339452853e-05,
"loss": 1.5017,
"step": 106
},
{
"epoch": 0.2955801104972376,
"grad_norm": 0.5402417778968811,
"learning_rate": 1.6480896929376905e-05,
"loss": 1.5558,
"step": 107
},
{
"epoch": 0.2983425414364641,
"grad_norm": 0.6131693720817566,
"learning_rate": 1.641267028524661e-05,
"loss": 1.532,
"step": 108
},
{
"epoch": 0.3011049723756906,
"grad_norm": 0.5868387222290039,
"learning_rate": 1.6343932841636455e-05,
"loss": 1.4656,
"step": 109
},
{
"epoch": 0.30386740331491713,
"grad_norm": 0.5688586831092834,
"learning_rate": 1.627469007380852e-05,
"loss": 1.5049,
"step": 110
},
{
"epoch": 0.30662983425414364,
"grad_norm": 0.5797792673110962,
"learning_rate": 1.6204947497276346e-05,
"loss": 1.5484,
"step": 111
},
{
"epoch": 0.30939226519337015,
"grad_norm": 0.5769445300102234,
"learning_rate": 1.6134710667365598e-05,
"loss": 1.4402,
"step": 112
},
{
"epoch": 0.31215469613259667,
"grad_norm": 0.6333233118057251,
"learning_rate": 1.6063985178771555e-05,
"loss": 1.4678,
"step": 113
},
{
"epoch": 0.3149171270718232,
"grad_norm": 0.558728814125061,
"learning_rate": 1.599277666511347e-05,
"loss": 1.4787,
"step": 114
},
{
"epoch": 0.31767955801104975,
"grad_norm": 0.5599526166915894,
"learning_rate": 1.592109079848583e-05,
"loss": 1.5417,
"step": 115
},
{
"epoch": 0.32044198895027626,
"grad_norm": 0.5926061272621155,
"learning_rate": 1.584893328900653e-05,
"loss": 1.3919,
"step": 116
},
{
"epoch": 0.32320441988950277,
"grad_norm": 0.589717447757721,
"learning_rate": 1.577630988436206e-05,
"loss": 1.5362,
"step": 117
},
{
"epoch": 0.3259668508287293,
"grad_norm": 0.6871376037597656,
"learning_rate": 1.5703226369349642e-05,
"loss": 1.4358,
"step": 118
},
{
"epoch": 0.3287292817679558,
"grad_norm": 0.607738733291626,
"learning_rate": 1.562968856541648e-05,
"loss": 1.5095,
"step": 119
},
{
"epoch": 0.3314917127071823,
"grad_norm": 0.635498583316803,
"learning_rate": 1.5555702330196024e-05,
"loss": 1.4874,
"step": 120
},
{
"epoch": 0.3342541436464088,
"grad_norm": 0.6137527227401733,
"learning_rate": 1.5481273557041402e-05,
"loss": 1.4166,
"step": 121
},
{
"epoch": 0.3370165745856354,
"grad_norm": 0.6075506210327148,
"learning_rate": 1.5406408174555978e-05,
"loss": 1.5638,
"step": 122
},
{
"epoch": 0.3397790055248619,
"grad_norm": 0.6399998068809509,
"learning_rate": 1.5331112146121104e-05,
"loss": 1.503,
"step": 123
},
{
"epoch": 0.3425414364640884,
"grad_norm": 0.5782871246337891,
"learning_rate": 1.525539146942113e-05,
"loss": 1.481,
"step": 124
},
{
"epoch": 0.3453038674033149,
"grad_norm": 0.6390048265457153,
"learning_rate": 1.5179252175965632e-05,
"loss": 1.4298,
"step": 125
},
{
"epoch": 0.34806629834254144,
"grad_norm": 0.5457322001457214,
"learning_rate": 1.5102700330609e-05,
"loss": 1.4868,
"step": 126
},
{
"epoch": 0.35082872928176795,
"grad_norm": 0.5977615118026733,
"learning_rate": 1.5025742031067316e-05,
"loss": 1.4753,
"step": 127
},
{
"epoch": 0.35359116022099446,
"grad_norm": 0.6722098588943481,
"learning_rate": 1.4948383407432678e-05,
"loss": 1.5022,
"step": 128
},
{
"epoch": 0.356353591160221,
"grad_norm": 0.6676556468009949,
"learning_rate": 1.4870630621684873e-05,
"loss": 1.4862,
"step": 129
},
{
"epoch": 0.35911602209944754,
"grad_norm": 0.6139523386955261,
"learning_rate": 1.479248986720057e-05,
"loss": 1.4543,
"step": 130
},
{
"epoch": 0.36187845303867405,
"grad_norm": 0.621616542339325,
"learning_rate": 1.4713967368259981e-05,
"loss": 1.4795,
"step": 131
},
{
"epoch": 0.36464088397790057,
"grad_norm": 0.6133718490600586,
"learning_rate": 1.4635069379551054e-05,
"loss": 1.4821,
"step": 132
},
{
"epoch": 0.3674033149171271,
"grad_norm": 0.7033741474151611,
"learning_rate": 1.4555802185671297e-05,
"loss": 1.5079,
"step": 133
},
{
"epoch": 0.3701657458563536,
"grad_norm": 0.6731663942337036,
"learning_rate": 1.4476172100627127e-05,
"loss": 1.4438,
"step": 134
},
{
"epoch": 0.3729281767955801,
"grad_norm": 0.6156182885169983,
"learning_rate": 1.4396185467330974e-05,
"loss": 1.5067,
"step": 135
},
{
"epoch": 0.3756906077348066,
"grad_norm": 0.6311376690864563,
"learning_rate": 1.4315848657096006e-05,
"loss": 1.4958,
"step": 136
},
{
"epoch": 0.3784530386740331,
"grad_norm": 0.6299065947532654,
"learning_rate": 1.4235168069128657e-05,
"loss": 1.4514,
"step": 137
},
{
"epoch": 0.3812154696132597,
"grad_norm": 0.7021288275718689,
"learning_rate": 1.4154150130018867e-05,
"loss": 1.4633,
"step": 138
},
{
"epoch": 0.3839779005524862,
"grad_norm": 0.6808854937553406,
"learning_rate": 1.407280129322819e-05,
"loss": 1.5116,
"step": 139
},
{
"epoch": 0.3867403314917127,
"grad_norm": 0.6397327780723572,
"learning_rate": 1.3991128038575741e-05,
"loss": 1.4773,
"step": 140
},
{
"epoch": 0.38950276243093923,
"grad_norm": 0.6835840344429016,
"learning_rate": 1.3909136871722066e-05,
"loss": 1.4515,
"step": 141
},
{
"epoch": 0.39226519337016574,
"grad_norm": 0.6662083268165588,
"learning_rate": 1.3826834323650899e-05,
"loss": 1.4796,
"step": 142
},
{
"epoch": 0.39502762430939226,
"grad_norm": 0.6650438904762268,
"learning_rate": 1.374422695014897e-05,
"loss": 1.5343,
"step": 143
},
{
"epoch": 0.39779005524861877,
"grad_norm": 0.6556645035743713,
"learning_rate": 1.3661321331283796e-05,
"loss": 1.5149,
"step": 144
},
{
"epoch": 0.4005524861878453,
"grad_norm": 0.7009831666946411,
"learning_rate": 1.3578124070879534e-05,
"loss": 1.3801,
"step": 145
},
{
"epoch": 0.40331491712707185,
"grad_norm": 0.6953743696212769,
"learning_rate": 1.3494641795990986e-05,
"loss": 1.3673,
"step": 146
},
{
"epoch": 0.40607734806629836,
"grad_norm": 0.7101163268089294,
"learning_rate": 1.3410881156375684e-05,
"loss": 1.4491,
"step": 147
},
{
"epoch": 0.4088397790055249,
"grad_norm": 0.6939963102340698,
"learning_rate": 1.3326848823964243e-05,
"loss": 1.4347,
"step": 148
},
{
"epoch": 0.4116022099447514,
"grad_norm": 0.7195811867713928,
"learning_rate": 1.3242551492328875e-05,
"loss": 1.4454,
"step": 149
},
{
"epoch": 0.4143646408839779,
"grad_norm": 0.7362983226776123,
"learning_rate": 1.3157995876150252e-05,
"loss": 1.4402,
"step": 150
},
{
"epoch": 0.4171270718232044,
"grad_norm": 0.706754744052887,
"learning_rate": 1.3073188710682612e-05,
"loss": 1.2887,
"step": 151
},
{
"epoch": 0.4198895027624309,
"grad_norm": 0.7431464195251465,
"learning_rate": 1.2988136751217292e-05,
"loss": 1.4228,
"step": 152
},
{
"epoch": 0.42265193370165743,
"grad_norm": 0.7429276704788208,
"learning_rate": 1.2902846772544625e-05,
"loss": 1.5767,
"step": 153
},
{
"epoch": 0.425414364640884,
"grad_norm": 0.672516405582428,
"learning_rate": 1.2817325568414299e-05,
"loss": 1.4013,
"step": 154
},
{
"epoch": 0.4281767955801105,
"grad_norm": 0.7116546630859375,
"learning_rate": 1.27315799509942e-05,
"loss": 1.4784,
"step": 155
},
{
"epoch": 0.430939226519337,
"grad_norm": 0.7850514054298401,
"learning_rate": 1.2645616750327792e-05,
"loss": 1.4335,
"step": 156
},
{
"epoch": 0.43370165745856354,
"grad_norm": 0.7218865752220154,
"learning_rate": 1.2559442813790077e-05,
"loss": 1.4628,
"step": 157
},
{
"epoch": 0.43646408839779005,
"grad_norm": 0.746462881565094,
"learning_rate": 1.2473065005542155e-05,
"loss": 1.4531,
"step": 158
},
{
"epoch": 0.43922651933701656,
"grad_norm": 0.7100480794906616,
"learning_rate": 1.2386490205984488e-05,
"loss": 1.4729,
"step": 159
},
{
"epoch": 0.4419889502762431,
"grad_norm": 0.7361482977867126,
"learning_rate": 1.2299725311208807e-05,
"loss": 1.5175,
"step": 160
},
{
"epoch": 0.4447513812154696,
"grad_norm": 0.7085789442062378,
"learning_rate": 1.2212777232448837e-05,
"loss": 1.4351,
"step": 161
},
{
"epoch": 0.44751381215469616,
"grad_norm": 0.6986429691314697,
"learning_rate": 1.2125652895529766e-05,
"loss": 1.484,
"step": 162
},
{
"epoch": 0.45027624309392267,
"grad_norm": 0.656014084815979,
"learning_rate": 1.2038359240316589e-05,
"loss": 1.4858,
"step": 163
},
{
"epoch": 0.4530386740331492,
"grad_norm": 0.7168847322463989,
"learning_rate": 1.1950903220161286e-05,
"loss": 1.4222,
"step": 164
},
{
"epoch": 0.4558011049723757,
"grad_norm": 0.6796424388885498,
"learning_rate": 1.186329180134898e-05,
"loss": 1.5088,
"step": 165
},
{
"epoch": 0.4585635359116022,
"grad_norm": 0.6602550148963928,
"learning_rate": 1.1775531962543036e-05,
"loss": 1.2236,
"step": 166
},
{
"epoch": 0.4613259668508287,
"grad_norm": 0.7656331062316895,
"learning_rate": 1.1687630694229159e-05,
"loss": 1.4906,
"step": 167
},
{
"epoch": 0.46408839779005523,
"grad_norm": 0.7794011235237122,
"learning_rate": 1.1599594998158602e-05,
"loss": 1.4335,
"step": 168
},
{
"epoch": 0.46685082872928174,
"grad_norm": 0.7844555377960205,
"learning_rate": 1.1511431886790407e-05,
"loss": 1.3969,
"step": 169
},
{
"epoch": 0.4696132596685083,
"grad_norm": 0.6949150562286377,
"learning_rate": 1.1423148382732854e-05,
"loss": 1.4488,
"step": 170
},
{
"epoch": 0.4723756906077348,
"grad_norm": 0.866104781627655,
"learning_rate": 1.1334751518184062e-05,
"loss": 1.3399,
"step": 171
},
{
"epoch": 0.47513812154696133,
"grad_norm": 0.763599157333374,
"learning_rate": 1.124624833437186e-05,
"loss": 1.5064,
"step": 172
},
{
"epoch": 0.47790055248618785,
"grad_norm": 0.7990726232528687,
"learning_rate": 1.1157645880992901e-05,
"loss": 1.4328,
"step": 173
},
{
"epoch": 0.48066298342541436,
"grad_norm": 0.782218873500824,
"learning_rate": 1.1068951215651132e-05,
"loss": 1.3896,
"step": 174
},
{
"epoch": 0.48342541436464087,
"grad_norm": 0.741022527217865,
"learning_rate": 1.098017140329561e-05,
"loss": 1.3803,
"step": 175
},
{
"epoch": 0.4861878453038674,
"grad_norm": 0.8042486310005188,
"learning_rate": 1.089131351565776e-05,
"loss": 1.4333,
"step": 176
},
{
"epoch": 0.4889502762430939,
"grad_norm": 0.7474654316902161,
"learning_rate": 1.080238463068808e-05,
"loss": 1.4138,
"step": 177
},
{
"epoch": 0.49171270718232046,
"grad_norm": 0.7193496227264404,
"learning_rate": 1.0713391831992324e-05,
"loss": 1.4721,
"step": 178
},
{
"epoch": 0.494475138121547,
"grad_norm": 0.7830453515052795,
"learning_rate": 1.0624342208267293e-05,
"loss": 1.475,
"step": 179
},
{
"epoch": 0.4972375690607735,
"grad_norm": 0.6899390816688538,
"learning_rate": 1.0535242852736152e-05,
"loss": 1.3806,
"step": 180
},
{
"epoch": 0.5,
"grad_norm": 0.779139518737793,
"learning_rate": 1.0446100862583459e-05,
"loss": 1.4362,
"step": 181
},
{
"epoch": 0.5027624309392266,
"grad_norm": 0.7901574969291687,
"learning_rate": 1.0356923338389807e-05,
"loss": 1.5008,
"step": 182
},
{
"epoch": 0.5027624309392266,
"eval_loss": 1.09184730052948,
"eval_runtime": 154.1688,
"eval_samples_per_second": 9.23,
"eval_steps_per_second": 0.292,
"step": 182
},
{
"epoch": 0.505524861878453,
"grad_norm": 0.7908266186714172,
"learning_rate": 1.0267717383566247e-05,
"loss": 1.4892,
"step": 183
},
{
"epoch": 0.5082872928176796,
"grad_norm": 0.7942709922790527,
"learning_rate": 1.0178490103788462e-05,
"loss": 1.391,
"step": 184
},
{
"epoch": 0.511049723756906,
"grad_norm": 0.76580411195755,
"learning_rate": 1.0089248606430775e-05,
"loss": 1.382,
"step": 185
},
{
"epoch": 0.5138121546961326,
"grad_norm": 0.7731238603591919,
"learning_rate": 1e-05,
"loss": 1.4122,
"step": 186
},
{
"epoch": 0.5165745856353591,
"grad_norm": 0.8293752074241638,
"learning_rate": 9.910751393569228e-06,
"loss": 1.5193,
"step": 187
},
{
"epoch": 0.5193370165745856,
"grad_norm": 0.9073200225830078,
"learning_rate": 9.82150989621154e-06,
"loss": 1.3044,
"step": 188
},
{
"epoch": 0.5220994475138122,
"grad_norm": 0.7942474484443665,
"learning_rate": 9.732282616433756e-06,
"loss": 1.4997,
"step": 189
},
{
"epoch": 0.5248618784530387,
"grad_norm": 0.7780297994613647,
"learning_rate": 9.643076661610197e-06,
"loss": 1.4907,
"step": 190
},
{
"epoch": 0.5276243093922652,
"grad_norm": 0.8634337782859802,
"learning_rate": 9.553899137416546e-06,
"loss": 1.3418,
"step": 191
},
{
"epoch": 0.5303867403314917,
"grad_norm": 0.7954802513122559,
"learning_rate": 9.464757147263849e-06,
"loss": 1.4508,
"step": 192
},
{
"epoch": 0.5331491712707183,
"grad_norm": 0.8032029271125793,
"learning_rate": 9.37565779173271e-06,
"loss": 1.4302,
"step": 193
},
{
"epoch": 0.5359116022099447,
"grad_norm": 0.7397053837776184,
"learning_rate": 9.286608168007678e-06,
"loss": 1.5004,
"step": 194
},
{
"epoch": 0.5386740331491713,
"grad_norm": 0.9212160706520081,
"learning_rate": 9.197615369311926e-06,
"loss": 1.4778,
"step": 195
},
{
"epoch": 0.5414364640883977,
"grad_norm": 0.779822587966919,
"learning_rate": 9.108686484342241e-06,
"loss": 1.4482,
"step": 196
},
{
"epoch": 0.5441988950276243,
"grad_norm": 0.7739251852035522,
"learning_rate": 9.019828596704394e-06,
"loss": 1.4599,
"step": 197
},
{
"epoch": 0.5469613259668509,
"grad_norm": 0.7287247776985168,
"learning_rate": 8.931048784348875e-06,
"loss": 1.4667,
"step": 198
},
{
"epoch": 0.5497237569060773,
"grad_norm": 0.758631706237793,
"learning_rate": 8.8423541190071e-06,
"loss": 1.3432,
"step": 199
},
{
"epoch": 0.5524861878453039,
"grad_norm": 0.7681453824043274,
"learning_rate": 8.753751665628141e-06,
"loss": 1.5143,
"step": 200
},
{
"epoch": 0.5552486187845304,
"grad_norm": 0.8416883945465088,
"learning_rate": 8.665248481815941e-06,
"loss": 1.362,
"step": 201
},
{
"epoch": 0.5580110497237569,
"grad_norm": 0.7365747094154358,
"learning_rate": 8.576851617267151e-06,
"loss": 1.4192,
"step": 202
},
{
"epoch": 0.5607734806629834,
"grad_norm": 0.8545944690704346,
"learning_rate": 8.488568113209593e-06,
"loss": 1.4204,
"step": 203
},
{
"epoch": 0.56353591160221,
"grad_norm": 0.8342128396034241,
"learning_rate": 8.4004050018414e-06,
"loss": 1.3981,
"step": 204
},
{
"epoch": 0.5662983425414365,
"grad_norm": 0.7602656483650208,
"learning_rate": 8.312369305770843e-06,
"loss": 1.437,
"step": 205
},
{
"epoch": 0.569060773480663,
"grad_norm": 0.8728726506233215,
"learning_rate": 8.224468037456969e-06,
"loss": 1.332,
"step": 206
},
{
"epoch": 0.5718232044198895,
"grad_norm": 0.845259428024292,
"learning_rate": 8.136708198651022e-06,
"loss": 1.4522,
"step": 207
},
{
"epoch": 0.574585635359116,
"grad_norm": 0.8054398894309998,
"learning_rate": 8.04909677983872e-06,
"loss": 1.3441,
"step": 208
},
{
"epoch": 0.5773480662983426,
"grad_norm": 0.7871323227882385,
"learning_rate": 7.961640759683416e-06,
"loss": 1.4925,
"step": 209
},
{
"epoch": 0.580110497237569,
"grad_norm": 0.8431714177131653,
"learning_rate": 7.874347104470234e-06,
"loss": 1.3429,
"step": 210
},
{
"epoch": 0.5828729281767956,
"grad_norm": 0.8485934138298035,
"learning_rate": 7.787222767551164e-06,
"loss": 1.4277,
"step": 211
},
{
"epoch": 0.585635359116022,
"grad_norm": 0.8744496703147888,
"learning_rate": 7.700274688791196e-06,
"loss": 1.436,
"step": 212
},
{
"epoch": 0.5883977900552486,
"grad_norm": 0.8178616762161255,
"learning_rate": 7.613509794015517e-06,
"loss": 1.4106,
"step": 213
},
{
"epoch": 0.5911602209944752,
"grad_norm": 0.8785072565078735,
"learning_rate": 7.5269349944578454e-06,
"loss": 1.461,
"step": 214
},
{
"epoch": 0.5939226519337016,
"grad_norm": 0.7647153735160828,
"learning_rate": 7.440557186209927e-06,
"loss": 1.3674,
"step": 215
},
{
"epoch": 0.5966850828729282,
"grad_norm": 0.7870045900344849,
"learning_rate": 7.354383249672212e-06,
"loss": 1.4552,
"step": 216
},
{
"epoch": 0.5994475138121547,
"grad_norm": 0.791970431804657,
"learning_rate": 7.268420049005806e-06,
"loss": 1.4316,
"step": 217
},
{
"epoch": 0.6022099447513812,
"grad_norm": 0.8452421426773071,
"learning_rate": 7.182674431585703e-06,
"loss": 1.4539,
"step": 218
},
{
"epoch": 0.6049723756906077,
"grad_norm": 0.9066639542579651,
"learning_rate": 7.097153227455379e-06,
"loss": 1.4379,
"step": 219
},
{
"epoch": 0.6077348066298343,
"grad_norm": 0.8649947643280029,
"learning_rate": 7.011863248782711e-06,
"loss": 1.3483,
"step": 220
},
{
"epoch": 0.6104972375690608,
"grad_norm": 0.8475552201271057,
"learning_rate": 6.92681128931739e-06,
"loss": 1.4798,
"step": 221
},
{
"epoch": 0.6132596685082873,
"grad_norm": 0.8348631262779236,
"learning_rate": 6.8420041238497525e-06,
"loss": 1.4084,
"step": 222
},
{
"epoch": 0.6160220994475138,
"grad_norm": 0.9010429382324219,
"learning_rate": 6.7574485076711285e-06,
"loss": 1.4011,
"step": 223
},
{
"epoch": 0.6187845303867403,
"grad_norm": 0.8535116910934448,
"learning_rate": 6.673151176035762e-06,
"loss": 1.3661,
"step": 224
},
{
"epoch": 0.6215469613259669,
"grad_norm": 0.8269426822662354,
"learning_rate": 6.589118843624316e-06,
"loss": 1.3894,
"step": 225
},
{
"epoch": 0.6243093922651933,
"grad_norm": 0.76876300573349,
"learning_rate": 6.505358204009018e-06,
"loss": 1.4496,
"step": 226
},
{
"epoch": 0.6270718232044199,
"grad_norm": 0.7975645065307617,
"learning_rate": 6.421875929120469e-06,
"loss": 1.3987,
"step": 227
},
{
"epoch": 0.6298342541436464,
"grad_norm": 0.8084754347801208,
"learning_rate": 6.33867866871621e-06,
"loss": 1.4141,
"step": 228
},
{
"epoch": 0.6325966850828729,
"grad_norm": 0.7989472150802612,
"learning_rate": 6.25577304985103e-06,
"loss": 1.2641,
"step": 229
},
{
"epoch": 0.6353591160220995,
"grad_norm": 0.8317887783050537,
"learning_rate": 6.173165676349103e-06,
"loss": 1.3783,
"step": 230
},
{
"epoch": 0.638121546961326,
"grad_norm": 0.883558452129364,
"learning_rate": 6.090863128277938e-06,
"loss": 1.3727,
"step": 231
},
{
"epoch": 0.6408839779005525,
"grad_norm": 0.7857452630996704,
"learning_rate": 6.008871961424259e-06,
"loss": 1.4159,
"step": 232
},
{
"epoch": 0.643646408839779,
"grad_norm": 0.7758345007896423,
"learning_rate": 5.927198706771813e-06,
"loss": 1.4271,
"step": 233
},
{
"epoch": 0.6464088397790055,
"grad_norm": 0.8178966045379639,
"learning_rate": 5.845849869981137e-06,
"loss": 1.4526,
"step": 234
},
{
"epoch": 0.649171270718232,
"grad_norm": 0.8668413758277893,
"learning_rate": 5.764831930871346e-06,
"loss": 1.4209,
"step": 235
},
{
"epoch": 0.6519337016574586,
"grad_norm": 0.866632342338562,
"learning_rate": 5.684151342903992e-06,
"loss": 1.4785,
"step": 236
},
{
"epoch": 0.6546961325966851,
"grad_norm": 0.8386754393577576,
"learning_rate": 5.603814532669032e-06,
"loss": 1.4133,
"step": 237
},
{
"epoch": 0.6574585635359116,
"grad_norm": 0.8321985006332397,
"learning_rate": 5.523827899372876e-06,
"loss": 1.4049,
"step": 238
},
{
"epoch": 0.6602209944751382,
"grad_norm": 0.8785558342933655,
"learning_rate": 5.444197814328707e-06,
"loss": 1.3697,
"step": 239
},
{
"epoch": 0.6629834254143646,
"grad_norm": 0.7912325263023376,
"learning_rate": 5.364930620448946e-06,
"loss": 1.4928,
"step": 240
},
{
"epoch": 0.6657458563535912,
"grad_norm": 0.8722138404846191,
"learning_rate": 5.286032631740023e-06,
"loss": 1.4547,
"step": 241
},
{
"epoch": 0.6685082872928176,
"grad_norm": 0.8593927621841431,
"learning_rate": 5.207510132799436e-06,
"loss": 1.4188,
"step": 242
},
{
"epoch": 0.6712707182320442,
"grad_norm": 0.7740910649299622,
"learning_rate": 5.129369378315128e-06,
"loss": 1.4642,
"step": 243
},
{
"epoch": 0.6740331491712708,
"grad_norm": 0.8590474724769592,
"learning_rate": 5.051616592567323e-06,
"loss": 1.2543,
"step": 244
},
{
"epoch": 0.6767955801104972,
"grad_norm": 0.862617015838623,
"learning_rate": 4.974257968932687e-06,
"loss": 1.4393,
"step": 245
},
{
"epoch": 0.6795580110497238,
"grad_norm": 0.8476159572601318,
"learning_rate": 4.897299669391006e-06,
"loss": 1.4986,
"step": 246
},
{
"epoch": 0.6823204419889503,
"grad_norm": 0.8105822205543518,
"learning_rate": 4.820747824034369e-06,
"loss": 1.5027,
"step": 247
},
{
"epoch": 0.6850828729281768,
"grad_norm": 0.9459635615348816,
"learning_rate": 4.744608530578872e-06,
"loss": 1.3336,
"step": 248
},
{
"epoch": 0.6878453038674033,
"grad_norm": 0.8690259456634521,
"learning_rate": 4.668887853878896e-06,
"loss": 1.3479,
"step": 249
},
{
"epoch": 0.6906077348066298,
"grad_norm": 0.8513994812965393,
"learning_rate": 4.593591825444028e-06,
"loss": 1.5261,
"step": 250
},
{
"epoch": 0.6933701657458563,
"grad_norm": 0.8456645011901855,
"learning_rate": 4.518726442958599e-06,
"loss": 1.4669,
"step": 251
},
{
"epoch": 0.6961325966850829,
"grad_norm": 0.8776260018348694,
"learning_rate": 4.444297669803981e-06,
"loss": 1.4177,
"step": 252
},
{
"epoch": 0.6988950276243094,
"grad_norm": 0.860339343547821,
"learning_rate": 4.370311434583525e-06,
"loss": 1.454,
"step": 253
},
{
"epoch": 0.7016574585635359,
"grad_norm": 0.902574360370636,
"learning_rate": 4.296773630650358e-06,
"loss": 1.3987,
"step": 254
},
{
"epoch": 0.7044198895027625,
"grad_norm": 0.8529084920883179,
"learning_rate": 4.223690115637944e-06,
"loss": 1.4428,
"step": 255
},
{
"epoch": 0.7071823204419889,
"grad_norm": 0.8335188627243042,
"learning_rate": 4.15106671099347e-06,
"loss": 1.4812,
"step": 256
},
{
"epoch": 0.7099447513812155,
"grad_norm": 0.8945775628089905,
"learning_rate": 4.078909201514172e-06,
"loss": 1.3559,
"step": 257
},
{
"epoch": 0.712707182320442,
"grad_norm": 0.9545679688453674,
"learning_rate": 4.007223334886531e-06,
"loss": 1.3806,
"step": 258
},
{
"epoch": 0.7154696132596685,
"grad_norm": 0.7985166311264038,
"learning_rate": 3.936014821228448e-06,
"loss": 1.3433,
"step": 259
},
{
"epoch": 0.7182320441988951,
"grad_norm": 0.8410876393318176,
"learning_rate": 3.865289332634407e-06,
"loss": 1.4393,
"step": 260
},
{
"epoch": 0.7209944751381215,
"grad_norm": 0.8620865941047668,
"learning_rate": 3.7950525027236585e-06,
"loss": 1.2242,
"step": 261
},
{
"epoch": 0.7237569060773481,
"grad_norm": 0.8445248603820801,
"learning_rate": 3.7253099261914794e-06,
"loss": 1.403,
"step": 262
},
{
"epoch": 0.7265193370165746,
"grad_norm": 0.8497399687767029,
"learning_rate": 3.6560671583635467e-06,
"loss": 1.4453,
"step": 263
},
{
"epoch": 0.7292817679558011,
"grad_norm": 0.83712238073349,
"learning_rate": 3.5873297147533913e-06,
"loss": 1.503,
"step": 264
},
{
"epoch": 0.7320441988950276,
"grad_norm": 0.8133417963981628,
"learning_rate": 3.5191030706230967e-06,
"loss": 1.3821,
"step": 265
},
{
"epoch": 0.7348066298342542,
"grad_norm": 0.8870840668678284,
"learning_rate": 3.4513926605471504e-06,
"loss": 1.4533,
"step": 266
},
{
"epoch": 0.7375690607734806,
"grad_norm": 0.9070349335670471,
"learning_rate": 3.3842038779795594e-06,
"loss": 1.4055,
"step": 267
},
{
"epoch": 0.7403314917127072,
"grad_norm": 0.8012280464172363,
"learning_rate": 3.3175420748242405e-06,
"loss": 1.2719,
"step": 268
},
{
"epoch": 0.7430939226519337,
"grad_norm": 0.8506048321723938,
"learning_rate": 3.2514125610086957e-06,
"loss": 1.3588,
"step": 269
},
{
"epoch": 0.7458563535911602,
"grad_norm": 0.7189351916313171,
"learning_rate": 3.1858206040610883e-06,
"loss": 1.2039,
"step": 270
},
{
"epoch": 0.7486187845303868,
"grad_norm": 0.8134811520576477,
"learning_rate": 3.1207714286906253e-06,
"loss": 1.2114,
"step": 271
},
{
"epoch": 0.7513812154696132,
"grad_norm": 0.8526425957679749,
"learning_rate": 3.0562702163713954e-06,
"loss": 1.5044,
"step": 272
},
{
"epoch": 0.7541436464088398,
"grad_norm": 0.9256671071052551,
"learning_rate": 2.9923221049296448e-06,
"loss": 1.3762,
"step": 273
},
{
"epoch": 0.7541436464088398,
"eval_loss": 1.0853379964828491,
"eval_runtime": 155.2763,
"eval_samples_per_second": 9.164,
"eval_steps_per_second": 0.29,
"step": 273
},
{
"epoch": 0.7569060773480663,
"grad_norm": 1.0012723207473755,
"learning_rate": 2.9289321881345257e-06,
"loss": 1.4391,
"step": 274
},
{
"epoch": 0.7596685082872928,
"grad_norm": 0.9301908016204834,
"learning_rate": 2.8661055152923456e-06,
"loss": 1.3913,
"step": 275
},
{
"epoch": 0.7624309392265194,
"grad_norm": 0.9066746830940247,
"learning_rate": 2.8038470908443717e-06,
"loss": 1.3527,
"step": 276
},
{
"epoch": 0.7651933701657458,
"grad_norm": 0.8317292332649231,
"learning_rate": 2.742161873968202e-06,
"loss": 1.4783,
"step": 277
},
{
"epoch": 0.7679558011049724,
"grad_norm": 0.892299473285675,
"learning_rate": 2.681054778182748e-06,
"loss": 1.3643,
"step": 278
},
{
"epoch": 0.7707182320441989,
"grad_norm": 0.8800353407859802,
"learning_rate": 2.6205306709568358e-06,
"loss": 1.4015,
"step": 279
},
{
"epoch": 0.7734806629834254,
"grad_norm": 0.8604403138160706,
"learning_rate": 2.5605943733215044e-06,
"loss": 1.4247,
"step": 280
},
{
"epoch": 0.7762430939226519,
"grad_norm": 0.8747833967208862,
"learning_rate": 2.501250659485992e-06,
"loss": 1.3748,
"step": 281
},
{
"epoch": 0.7790055248618785,
"grad_norm": 0.8854520320892334,
"learning_rate": 2.4425042564574186e-06,
"loss": 1.4518,
"step": 282
},
{
"epoch": 0.7817679558011049,
"grad_norm": 0.8976154327392578,
"learning_rate": 2.38435984366429e-06,
"loss": 1.3765,
"step": 283
},
{
"epoch": 0.7845303867403315,
"grad_norm": 0.8289183378219604,
"learning_rate": 2.3268220525837436e-06,
"loss": 1.4559,
"step": 284
},
{
"epoch": 0.787292817679558,
"grad_norm": 0.8268212080001831,
"learning_rate": 2.26989546637263e-06,
"loss": 1.4105,
"step": 285
},
{
"epoch": 0.7900552486187845,
"grad_norm": 0.7702677249908447,
"learning_rate": 2.213584619502451e-06,
"loss": 1.4579,
"step": 286
},
{
"epoch": 0.7928176795580111,
"grad_norm": 0.9030853509902954,
"learning_rate": 2.157893997398146e-06,
"loss": 1.3844,
"step": 287
},
{
"epoch": 0.7955801104972375,
"grad_norm": 0.8461067080497742,
"learning_rate": 2.1028280360808405e-06,
"loss": 1.4283,
"step": 288
},
{
"epoch": 0.7983425414364641,
"grad_norm": 0.860618531703949,
"learning_rate": 2.0483911218144713e-06,
"loss": 1.4576,
"step": 289
},
{
"epoch": 0.8011049723756906,
"grad_norm": 0.9135028123855591,
"learning_rate": 1.994587590756397e-06,
"loss": 1.3549,
"step": 290
},
{
"epoch": 0.8038674033149171,
"grad_norm": 0.936713457107544,
"learning_rate": 1.941421728612023e-06,
"loss": 1.4119,
"step": 291
},
{
"epoch": 0.8066298342541437,
"grad_norm": 0.9670917987823486,
"learning_rate": 1.8888977702934086e-06,
"loss": 1.4523,
"step": 292
},
{
"epoch": 0.8093922651933702,
"grad_norm": 0.927078902721405,
"learning_rate": 1.8370198995819432e-06,
"loss": 1.405,
"step": 293
},
{
"epoch": 0.8121546961325967,
"grad_norm": 0.8833804130554199,
"learning_rate": 1.7857922487950873e-06,
"loss": 1.4527,
"step": 294
},
{
"epoch": 0.8149171270718232,
"grad_norm": 0.821182131767273,
"learning_rate": 1.7352188984572026e-06,
"loss": 1.3541,
"step": 295
},
{
"epoch": 0.8176795580110497,
"grad_norm": 0.8782601952552795,
"learning_rate": 1.6853038769745466e-06,
"loss": 1.45,
"step": 296
},
{
"epoch": 0.8204419889502762,
"grad_norm": 0.8677568435668945,
"learning_rate": 1.6360511603143648e-06,
"loss": 1.4061,
"step": 297
},
{
"epoch": 0.8232044198895028,
"grad_norm": 0.9125531911849976,
"learning_rate": 1.587464671688187e-06,
"loss": 1.4138,
"step": 298
},
{
"epoch": 0.8259668508287292,
"grad_norm": 0.8667160272598267,
"learning_rate": 1.5395482812393513e-06,
"loss": 1.4394,
"step": 299
},
{
"epoch": 0.8287292817679558,
"grad_norm": 0.9092755913734436,
"learning_rate": 1.492305805734693e-06,
"loss": 1.4223,
"step": 300
},
{
"epoch": 0.8314917127071824,
"grad_norm": 0.813664436340332,
"learning_rate": 1.4457410082605483e-06,
"loss": 1.4421,
"step": 301
},
{
"epoch": 0.8342541436464088,
"grad_norm": 0.9390541911125183,
"learning_rate": 1.3998575979229944e-06,
"loss": 1.3292,
"step": 302
},
{
"epoch": 0.8370165745856354,
"grad_norm": 0.9930654168128967,
"learning_rate": 1.3546592295524075e-06,
"loss": 1.4802,
"step": 303
},
{
"epoch": 0.8397790055248618,
"grad_norm": 0.8965404629707336,
"learning_rate": 1.3101495034123313e-06,
"loss": 1.4596,
"step": 304
},
{
"epoch": 0.8425414364640884,
"grad_norm": 0.9070413112640381,
"learning_rate": 1.2663319649127025e-06,
"loss": 1.4076,
"step": 305
},
{
"epoch": 0.8453038674033149,
"grad_norm": 0.7749320268630981,
"learning_rate": 1.2232101043274437e-06,
"loss": 1.2553,
"step": 306
},
{
"epoch": 0.8480662983425414,
"grad_norm": 0.949621856212616,
"learning_rate": 1.1807873565164507e-06,
"loss": 1.3909,
"step": 307
},
{
"epoch": 0.850828729281768,
"grad_norm": 0.8850001096725464,
"learning_rate": 1.139067100651976e-06,
"loss": 1.3992,
"step": 308
},
{
"epoch": 0.8535911602209945,
"grad_norm": 0.8339431285858154,
"learning_rate": 1.0980526599494733e-06,
"loss": 1.4141,
"step": 309
},
{
"epoch": 0.856353591160221,
"grad_norm": 0.8964627385139465,
"learning_rate": 1.0577473014028872e-06,
"loss": 1.3828,
"step": 310
},
{
"epoch": 0.8591160220994475,
"grad_norm": 0.8783918023109436,
"learning_rate": 1.0181542355244167e-06,
"loss": 1.3791,
"step": 311
},
{
"epoch": 0.861878453038674,
"grad_norm": 0.8714961409568787,
"learning_rate": 9.792766160887868e-07,
"loss": 1.4513,
"step": 312
},
{
"epoch": 0.8646408839779005,
"grad_norm": 0.8861087560653687,
"learning_rate": 9.411175398820271e-07,
"loss": 1.3653,
"step": 313
},
{
"epoch": 0.8674033149171271,
"grad_norm": 0.8015440702438354,
"learning_rate": 9.036800464548157e-07,
"loss": 1.4818,
"step": 314
},
{
"epoch": 0.8701657458563536,
"grad_norm": 0.8085585832595825,
"learning_rate": 8.669671178803485e-07,
"loss": 1.4441,
"step": 315
},
{
"epoch": 0.8729281767955801,
"grad_norm": 0.8612155318260193,
"learning_rate": 8.309816785168035e-07,
"loss": 1.3938,
"step": 316
},
{
"epoch": 0.8756906077348067,
"grad_norm": 0.866340160369873,
"learning_rate": 7.957265947744131e-07,
"loss": 1.3557,
"step": 317
},
{
"epoch": 0.8784530386740331,
"grad_norm": 0.8810187578201294,
"learning_rate": 7.612046748871327e-07,
"loss": 1.3672,
"step": 318
},
{
"epoch": 0.8812154696132597,
"grad_norm": 0.9028764367103577,
"learning_rate": 7.274186686889539e-07,
"loss": 1.411,
"step": 319
},
{
"epoch": 0.8839779005524862,
"grad_norm": 0.9450622797012329,
"learning_rate": 6.943712673948643e-07,
"loss": 1.3494,
"step": 320
},
{
"epoch": 0.8867403314917127,
"grad_norm": 0.7640666365623474,
"learning_rate": 6.620651033864844e-07,
"loss": 1.4108,
"step": 321
},
{
"epoch": 0.8895027624309392,
"grad_norm": 0.8005813360214233,
"learning_rate": 6.305027500023841e-07,
"loss": 1.4023,
"step": 322
},
{
"epoch": 0.8922651933701657,
"grad_norm": 0.8076258301734924,
"learning_rate": 5.996867213330993e-07,
"loss": 1.4143,
"step": 323
},
{
"epoch": 0.8950276243093923,
"grad_norm": 0.9263111352920532,
"learning_rate": 5.696194720208792e-07,
"loss": 1.4247,
"step": 324
},
{
"epoch": 0.8977900552486188,
"grad_norm": 0.7982999682426453,
"learning_rate": 5.403033970641647e-07,
"loss": 1.4022,
"step": 325
},
{
"epoch": 0.9005524861878453,
"grad_norm": 0.8366706967353821,
"learning_rate": 5.117408316268047e-07,
"loss": 1.5076,
"step": 326
},
{
"epoch": 0.9033149171270718,
"grad_norm": 0.8495275974273682,
"learning_rate": 4.839340508520563e-07,
"loss": 1.4251,
"step": 327
},
{
"epoch": 0.9060773480662984,
"grad_norm": 0.8747670650482178,
"learning_rate": 4.5688526968136193e-07,
"loss": 1.3898,
"step": 328
},
{
"epoch": 0.9088397790055248,
"grad_norm": 0.8361295461654663,
"learning_rate": 4.305966426779118e-07,
"loss": 1.4397,
"step": 329
},
{
"epoch": 0.9116022099447514,
"grad_norm": 0.8162137866020203,
"learning_rate": 4.0507026385502747e-07,
"loss": 1.4023,
"step": 330
},
{
"epoch": 0.914364640883978,
"grad_norm": 0.8866202235221863,
"learning_rate": 3.8030816650935777e-07,
"loss": 1.3697,
"step": 331
},
{
"epoch": 0.9171270718232044,
"grad_norm": 0.865460216999054,
"learning_rate": 3.5631232305893047e-07,
"loss": 1.4998,
"step": 332
},
{
"epoch": 0.919889502762431,
"grad_norm": 0.9784380197525024,
"learning_rate": 3.3308464488602587e-07,
"loss": 1.4262,
"step": 333
},
{
"epoch": 0.9226519337016574,
"grad_norm": 0.8989951610565186,
"learning_rate": 3.106269821849273e-07,
"loss": 1.3742,
"step": 334
},
{
"epoch": 0.925414364640884,
"grad_norm": 0.8595617413520813,
"learning_rate": 2.889411238145545e-07,
"loss": 1.4038,
"step": 335
},
{
"epoch": 0.9281767955801105,
"grad_norm": 0.8496826887130737,
"learning_rate": 2.6802879715596585e-07,
"loss": 1.3938,
"step": 336
},
{
"epoch": 0.930939226519337,
"grad_norm": 0.8510773777961731,
"learning_rate": 2.478916679747623e-07,
"loss": 1.4617,
"step": 337
},
{
"epoch": 0.9337016574585635,
"grad_norm": 0.8786448240280151,
"learning_rate": 2.2853134028840594e-07,
"loss": 1.3615,
"step": 338
},
{
"epoch": 0.93646408839779,
"grad_norm": 0.9002432823181152,
"learning_rate": 2.099493562384469e-07,
"loss": 1.4047,
"step": 339
},
{
"epoch": 0.9392265193370166,
"grad_norm": 0.8121012449264526,
"learning_rate": 1.921471959676957e-07,
"loss": 1.4307,
"step": 340
},
{
"epoch": 0.9419889502762431,
"grad_norm": 0.8053271770477295,
"learning_rate": 1.7512627750230772e-07,
"loss": 1.4443,
"step": 341
},
{
"epoch": 0.9447513812154696,
"grad_norm": 0.9050955176353455,
"learning_rate": 1.5888795663883904e-07,
"loss": 1.4102,
"step": 342
},
{
"epoch": 0.9475138121546961,
"grad_norm": 0.9110968112945557,
"learning_rate": 1.4343352683625412e-07,
"loss": 1.4198,
"step": 343
},
{
"epoch": 0.9502762430939227,
"grad_norm": 0.929118812084198,
"learning_rate": 1.2876421911288906e-07,
"loss": 1.4203,
"step": 344
},
{
"epoch": 0.9530386740331491,
"grad_norm": 0.9060593843460083,
"learning_rate": 1.148812019483958e-07,
"loss": 1.3268,
"step": 345
},
{
"epoch": 0.9558011049723757,
"grad_norm": 0.8292207717895508,
"learning_rate": 1.0178558119067316e-07,
"loss": 1.3877,
"step": 346
},
{
"epoch": 0.9585635359116023,
"grad_norm": 0.8450669646263123,
"learning_rate": 8.947839996777286e-08,
"loss": 1.3981,
"step": 347
},
{
"epoch": 0.9613259668508287,
"grad_norm": 0.9085505604743958,
"learning_rate": 7.796063860481595e-08,
"loss": 1.353,
"step": 348
},
{
"epoch": 0.9640883977900553,
"grad_norm": 0.8617837429046631,
"learning_rate": 6.723321454590093e-08,
"loss": 1.4215,
"step": 349
},
{
"epoch": 0.9668508287292817,
"grad_norm": 0.8624401092529297,
"learning_rate": 5.7296982281026534e-08,
"loss": 1.3668,
"step": 350
},
{
"epoch": 0.9696132596685083,
"grad_norm": 0.8622123003005981,
"learning_rate": 4.815273327803183e-08,
"loss": 1.3723,
"step": 351
},
{
"epoch": 0.9723756906077348,
"grad_norm": 0.7842567563056946,
"learning_rate": 3.980119591954101e-08,
"loss": 1.3255,
"step": 352
},
{
"epoch": 0.9751381215469613,
"grad_norm": 0.8146648406982422,
"learning_rate": 3.224303544495766e-08,
"loss": 1.3913,
"step": 353
},
{
"epoch": 0.9779005524861878,
"grad_norm": 0.8488260507583618,
"learning_rate": 2.547885389746485e-08,
"loss": 1.385,
"step": 354
},
{
"epoch": 0.9806629834254144,
"grad_norm": 0.8932926058769226,
"learning_rate": 1.9509190076074657e-08,
"loss": 1.3992,
"step": 355
},
{
"epoch": 0.9834254143646409,
"grad_norm": 0.9198254346847534,
"learning_rate": 1.4334519492711362e-08,
"loss": 1.3375,
"step": 356
},
{
"epoch": 0.9861878453038674,
"grad_norm": 0.9434136748313904,
"learning_rate": 9.955254334328424e-09,
"loss": 1.3329,
"step": 357
},
{
"epoch": 0.988950276243094,
"grad_norm": 0.8083603978157043,
"learning_rate": 6.371743430082511e-09,
"loss": 1.4152,
"step": 358
},
{
"epoch": 0.9917127071823204,
"grad_norm": 0.8143340349197388,
"learning_rate": 3.5842722235468475e-09,
"loss": 1.3503,
"step": 359
},
{
"epoch": 0.994475138121547,
"grad_norm": 0.8487191200256348,
"learning_rate": 1.593062749967178e-09,
"loss": 1.4374,
"step": 360
},
{
"epoch": 0.9972375690607734,
"grad_norm": 0.8385122418403625,
"learning_rate": 3.982736185859093e-10,
"loss": 1.384,
"step": 361
},
{
"epoch": 1.0,
"grad_norm": 0.8801374435424805,
"learning_rate": 0.0,
"loss": 1.368,
"step": 362
}
],
"logging_steps": 1,
"max_steps": 362,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.6738557053055795e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}