SimpleLlamaSentences / trainer_state.json
SummerSigh's picture
Upload 8 files
bd8a4e1 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.2783143881998358,
"eval_steps": 500,
"global_step": 116000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016529927433618567,
"grad_norm": 2.137389659881592,
"learning_rate": 0.0001599141895763845,
"loss": 6.3906,
"num_input_tokens_seen": 872064,
"step": 150
},
{
"epoch": 0.0033059854867237134,
"grad_norm": 1.725138783454895,
"learning_rate": 0.000159826028182259,
"loss": 5.6466,
"num_input_tokens_seen": 1744032,
"step": 300
},
{
"epoch": 0.00495897823008557,
"grad_norm": 1.9830477237701416,
"learning_rate": 0.00015973786678813348,
"loss": 5.4449,
"num_input_tokens_seen": 2615552,
"step": 450
},
{
"epoch": 0.006611970973447427,
"grad_norm": 1.8168050050735474,
"learning_rate": 0.00015964970539400798,
"loss": 5.3521,
"num_input_tokens_seen": 3469792,
"step": 600
},
{
"epoch": 0.008264963716809284,
"grad_norm": 1.9247260093688965,
"learning_rate": 0.00015956154399988245,
"loss": 5.2863,
"num_input_tokens_seen": 4342304,
"step": 750
},
{
"epoch": 0.00991795646017114,
"grad_norm": 1.9191653728485107,
"learning_rate": 0.00015947338260575695,
"loss": 5.2405,
"num_input_tokens_seen": 5211616,
"step": 900
},
{
"epoch": 0.011570949203532997,
"grad_norm": 1.748758316040039,
"learning_rate": 0.00015938522121163143,
"loss": 5.1998,
"num_input_tokens_seen": 6098432,
"step": 1050
},
{
"epoch": 0.013223941946894854,
"grad_norm": 2.057206869125366,
"learning_rate": 0.00015929705981750593,
"loss": 5.1706,
"num_input_tokens_seen": 6967904,
"step": 1200
},
{
"epoch": 0.014876934690256709,
"grad_norm": 2.0983762741088867,
"learning_rate": 0.0001592088984233804,
"loss": 5.1521,
"num_input_tokens_seen": 7851072,
"step": 1350
},
{
"epoch": 0.016529927433618568,
"grad_norm": 1.9689010381698608,
"learning_rate": 0.0001591207370292549,
"loss": 5.1314,
"num_input_tokens_seen": 8719744,
"step": 1500
},
{
"epoch": 0.01818292017698042,
"grad_norm": 1.7962827682495117,
"learning_rate": 0.00015903257563512938,
"loss": 5.0878,
"num_input_tokens_seen": 9566912,
"step": 1650
},
{
"epoch": 0.01983591292034228,
"grad_norm": 1.8615788221359253,
"learning_rate": 0.00015894441424100389,
"loss": 5.0739,
"num_input_tokens_seen": 10450048,
"step": 1800
},
{
"epoch": 0.021488905663704136,
"grad_norm": 1.8449132442474365,
"learning_rate": 0.00015885625284687836,
"loss": 5.0658,
"num_input_tokens_seen": 11348640,
"step": 1950
},
{
"epoch": 0.023141898407065993,
"grad_norm": 1.8516219854354858,
"learning_rate": 0.00015876809145275284,
"loss": 5.0462,
"num_input_tokens_seen": 12226144,
"step": 2100
},
{
"epoch": 0.02479489115042785,
"grad_norm": 1.9487632513046265,
"learning_rate": 0.00015867993005862734,
"loss": 5.0265,
"num_input_tokens_seen": 13119040,
"step": 2250
},
{
"epoch": 0.026447883893789707,
"grad_norm": 1.8343034982681274,
"learning_rate": 0.0001585917686645018,
"loss": 5.0056,
"num_input_tokens_seen": 13993280,
"step": 2400
},
{
"epoch": 0.028100876637151564,
"grad_norm": 2.002856731414795,
"learning_rate": 0.00015850360727037632,
"loss": 4.9993,
"num_input_tokens_seen": 14876768,
"step": 2550
},
{
"epoch": 0.029753869380513418,
"grad_norm": 1.981166124343872,
"learning_rate": 0.0001584154458762508,
"loss": 4.9943,
"num_input_tokens_seen": 15749088,
"step": 2700
},
{
"epoch": 0.03140686212387528,
"grad_norm": 1.9732400178909302,
"learning_rate": 0.0001583272844821253,
"loss": 4.9762,
"num_input_tokens_seen": 16634176,
"step": 2850
},
{
"epoch": 0.033059854867237136,
"grad_norm": 1.8369622230529785,
"learning_rate": 0.00015823912308799977,
"loss": 4.9525,
"num_input_tokens_seen": 17509824,
"step": 3000
},
{
"epoch": 0.034712847610598986,
"grad_norm": 1.9628883600234985,
"learning_rate": 0.00015815096169387427,
"loss": 4.9481,
"num_input_tokens_seen": 18398112,
"step": 3150
},
{
"epoch": 0.03636584035396084,
"grad_norm": 1.8523181676864624,
"learning_rate": 0.00015806280029974874,
"loss": 4.9306,
"num_input_tokens_seen": 19272928,
"step": 3300
},
{
"epoch": 0.0380188330973227,
"grad_norm": 1.9627933502197266,
"learning_rate": 0.00015797463890562325,
"loss": 4.9217,
"num_input_tokens_seen": 20162880,
"step": 3450
},
{
"epoch": 0.03967182584068456,
"grad_norm": 1.8966543674468994,
"learning_rate": 0.00015788647751149772,
"loss": 4.919,
"num_input_tokens_seen": 21041888,
"step": 3600
},
{
"epoch": 0.041324818584046415,
"grad_norm": 1.9131779670715332,
"learning_rate": 0.00015779831611737222,
"loss": 4.9118,
"num_input_tokens_seen": 21914272,
"step": 3750
},
{
"epoch": 0.04297781132740827,
"grad_norm": 1.8262194395065308,
"learning_rate": 0.0001577101547232467,
"loss": 4.9139,
"num_input_tokens_seen": 22802432,
"step": 3900
},
{
"epoch": 0.04463080407077013,
"grad_norm": 1.9549835920333862,
"learning_rate": 0.0001576219933291212,
"loss": 4.8919,
"num_input_tokens_seen": 23680544,
"step": 4050
},
{
"epoch": 0.046283796814131986,
"grad_norm": 1.9537177085876465,
"learning_rate": 0.00015753383193499568,
"loss": 4.8895,
"num_input_tokens_seen": 24572928,
"step": 4200
},
{
"epoch": 0.04793678955749384,
"grad_norm": 1.9916348457336426,
"learning_rate": 0.00015744567054087018,
"loss": 4.8798,
"num_input_tokens_seen": 25458752,
"step": 4350
},
{
"epoch": 0.0495897823008557,
"grad_norm": 1.9964395761489868,
"learning_rate": 0.00015735750914674465,
"loss": 4.8734,
"num_input_tokens_seen": 26339424,
"step": 4500
},
{
"epoch": 0.05124277504421756,
"grad_norm": 1.9595707654953003,
"learning_rate": 0.00015726934775261916,
"loss": 4.8654,
"num_input_tokens_seen": 27208928,
"step": 4650
},
{
"epoch": 0.052895767787579415,
"grad_norm": 2.002746820449829,
"learning_rate": 0.00015718118635849363,
"loss": 4.8536,
"num_input_tokens_seen": 28083488,
"step": 4800
},
{
"epoch": 0.05454876053094127,
"grad_norm": 2.014301300048828,
"learning_rate": 0.00015709302496436813,
"loss": 4.8513,
"num_input_tokens_seen": 28948832,
"step": 4950
},
{
"epoch": 0.05620175327430313,
"grad_norm": 1.82748544216156,
"learning_rate": 0.0001570048635702426,
"loss": 4.8477,
"num_input_tokens_seen": 29830752,
"step": 5100
},
{
"epoch": 0.05785474601766498,
"grad_norm": 1.907245397567749,
"learning_rate": 0.0001569167021761171,
"loss": 4.8445,
"num_input_tokens_seen": 30709248,
"step": 5250
},
{
"epoch": 0.059507738761026836,
"grad_norm": 1.9649808406829834,
"learning_rate": 0.00015682854078199158,
"loss": 4.8313,
"num_input_tokens_seen": 31597856,
"step": 5400
},
{
"epoch": 0.06116073150438869,
"grad_norm": 1.9375178813934326,
"learning_rate": 0.00015674037938786606,
"loss": 4.8206,
"num_input_tokens_seen": 32485120,
"step": 5550
},
{
"epoch": 0.06281372424775056,
"grad_norm": 1.8886380195617676,
"learning_rate": 0.00015665221799374056,
"loss": 4.8152,
"num_input_tokens_seen": 33354688,
"step": 5700
},
{
"epoch": 0.06446671699111241,
"grad_norm": 1.8993780612945557,
"learning_rate": 0.00015656405659961504,
"loss": 4.8155,
"num_input_tokens_seen": 34230592,
"step": 5850
},
{
"epoch": 0.06611970973447427,
"grad_norm": 1.8930308818817139,
"learning_rate": 0.00015647589520548954,
"loss": 4.8173,
"num_input_tokens_seen": 35090336,
"step": 6000
},
{
"epoch": 0.06777270247783612,
"grad_norm": 1.951819658279419,
"learning_rate": 0.00015638773381136401,
"loss": 4.8118,
"num_input_tokens_seen": 35973024,
"step": 6150
},
{
"epoch": 0.06942569522119797,
"grad_norm": 1.9142402410507202,
"learning_rate": 0.00015629957241723852,
"loss": 4.8079,
"num_input_tokens_seen": 36855936,
"step": 6300
},
{
"epoch": 0.07107868796455984,
"grad_norm": 1.9393310546875,
"learning_rate": 0.000156211411023113,
"loss": 4.7883,
"num_input_tokens_seen": 37722848,
"step": 6450
},
{
"epoch": 0.07273168070792169,
"grad_norm": 1.8511933088302612,
"learning_rate": 0.00015612383737161498,
"loss": 4.8043,
"num_input_tokens_seen": 38597600,
"step": 6600
},
{
"epoch": 0.07438467345128355,
"grad_norm": 1.8763892650604248,
"learning_rate": 0.00015603567597748946,
"loss": 4.7932,
"num_input_tokens_seen": 39493152,
"step": 6750
},
{
"epoch": 0.0760376661946454,
"grad_norm": 1.9806557893753052,
"learning_rate": 0.00015594751458336396,
"loss": 4.7813,
"num_input_tokens_seen": 40352640,
"step": 6900
},
{
"epoch": 0.07769065893800726,
"grad_norm": 2.001722574234009,
"learning_rate": 0.00015585935318923843,
"loss": 4.7941,
"num_input_tokens_seen": 41236160,
"step": 7050
},
{
"epoch": 0.07934365168136912,
"grad_norm": 2.1065292358398438,
"learning_rate": 0.00015577119179511294,
"loss": 4.7819,
"num_input_tokens_seen": 42111296,
"step": 7200
},
{
"epoch": 0.08099664442473098,
"grad_norm": 1.8941328525543213,
"learning_rate": 0.0001556830304009874,
"loss": 4.7737,
"num_input_tokens_seen": 42992864,
"step": 7350
},
{
"epoch": 0.08264963716809283,
"grad_norm": 1.8765467405319214,
"learning_rate": 0.00015559486900686191,
"loss": 4.764,
"num_input_tokens_seen": 43871808,
"step": 7500
},
{
"epoch": 0.0843026299114547,
"grad_norm": 1.9826706647872925,
"learning_rate": 0.0001555067076127364,
"loss": 4.7805,
"num_input_tokens_seen": 44742496,
"step": 7650
},
{
"epoch": 0.08595562265481654,
"grad_norm": 1.9296499490737915,
"learning_rate": 0.0001554185462186109,
"loss": 4.7585,
"num_input_tokens_seen": 45594112,
"step": 7800
},
{
"epoch": 0.08760861539817841,
"grad_norm": 1.9379116296768188,
"learning_rate": 0.00015533038482448537,
"loss": 4.7682,
"num_input_tokens_seen": 46465952,
"step": 7950
},
{
"epoch": 0.08926160814154026,
"grad_norm": 1.8769218921661377,
"learning_rate": 0.00015524222343035987,
"loss": 4.7574,
"num_input_tokens_seen": 47325664,
"step": 8100
},
{
"epoch": 0.09091460088490212,
"grad_norm": 1.8942108154296875,
"learning_rate": 0.00015515406203623434,
"loss": 4.7493,
"num_input_tokens_seen": 48202944,
"step": 8250
},
{
"epoch": 0.09256759362826397,
"grad_norm": 1.84010648727417,
"learning_rate": 0.00015506590064210885,
"loss": 4.7515,
"num_input_tokens_seen": 49074048,
"step": 8400
},
{
"epoch": 0.09422058637162582,
"grad_norm": 1.8978796005249023,
"learning_rate": 0.00015497773924798332,
"loss": 4.7512,
"num_input_tokens_seen": 49959008,
"step": 8550
},
{
"epoch": 0.09587357911498769,
"grad_norm": 1.9536223411560059,
"learning_rate": 0.0001548895778538578,
"loss": 4.758,
"num_input_tokens_seen": 50863648,
"step": 8700
},
{
"epoch": 0.09752657185834954,
"grad_norm": 2.0626060962677,
"learning_rate": 0.0001548014164597323,
"loss": 4.7434,
"num_input_tokens_seen": 51730944,
"step": 8850
},
{
"epoch": 0.0991795646017114,
"grad_norm": 1.9423109292984009,
"learning_rate": 0.00015471325506560677,
"loss": 4.736,
"num_input_tokens_seen": 52593024,
"step": 9000
},
{
"epoch": 0.10083255734507325,
"grad_norm": 1.9180619716644287,
"learning_rate": 0.00015462509367148127,
"loss": 4.7187,
"num_input_tokens_seen": 53470528,
"step": 9150
},
{
"epoch": 0.10248555008843512,
"grad_norm": 1.8776642084121704,
"learning_rate": 0.00015453693227735575,
"loss": 4.7382,
"num_input_tokens_seen": 54362720,
"step": 9300
},
{
"epoch": 0.10413854283179697,
"grad_norm": 1.9289714097976685,
"learning_rate": 0.00015444877088323025,
"loss": 4.7181,
"num_input_tokens_seen": 55238304,
"step": 9450
},
{
"epoch": 0.10579153557515883,
"grad_norm": 1.9489550590515137,
"learning_rate": 0.00015436060948910473,
"loss": 4.7287,
"num_input_tokens_seen": 56107040,
"step": 9600
},
{
"epoch": 0.10744452831852068,
"grad_norm": 2.01839280128479,
"learning_rate": 0.00015427244809497923,
"loss": 4.7097,
"num_input_tokens_seen": 56995456,
"step": 9750
},
{
"epoch": 0.10909752106188254,
"grad_norm": 1.9155646562576294,
"learning_rate": 0.0001541842867008537,
"loss": 4.7153,
"num_input_tokens_seen": 57852640,
"step": 9900
},
{
"epoch": 0.1107505138052444,
"grad_norm": 2.008150100708008,
"learning_rate": 0.0001540961253067282,
"loss": 4.7139,
"num_input_tokens_seen": 58719840,
"step": 10050
},
{
"epoch": 0.11240350654860626,
"grad_norm": 1.9440505504608154,
"learning_rate": 0.00015400796391260268,
"loss": 4.7184,
"num_input_tokens_seen": 59594784,
"step": 10200
},
{
"epoch": 0.11405649929196811,
"grad_norm": 1.9298348426818848,
"learning_rate": 0.00015391980251847718,
"loss": 4.708,
"num_input_tokens_seen": 60451712,
"step": 10350
},
{
"epoch": 0.11570949203532996,
"grad_norm": 1.9444379806518555,
"learning_rate": 0.00015383164112435166,
"loss": 4.6979,
"num_input_tokens_seen": 61335008,
"step": 10500
},
{
"epoch": 0.11736248477869182,
"grad_norm": 2.0216357707977295,
"learning_rate": 0.00015374406747285365,
"loss": 4.7055,
"num_input_tokens_seen": 62197344,
"step": 10650
},
{
"epoch": 0.11901547752205367,
"grad_norm": 1.9788328409194946,
"learning_rate": 0.00015365590607872815,
"loss": 4.6948,
"num_input_tokens_seen": 63058464,
"step": 10800
},
{
"epoch": 0.12066847026541554,
"grad_norm": 2.0648193359375,
"learning_rate": 0.00015356774468460263,
"loss": 4.7058,
"num_input_tokens_seen": 63942112,
"step": 10950
},
{
"epoch": 0.12232146300877739,
"grad_norm": 1.9497121572494507,
"learning_rate": 0.00015348017103310462,
"loss": 4.6924,
"num_input_tokens_seen": 64815232,
"step": 11100
},
{
"epoch": 0.12397445575213925,
"grad_norm": 1.9825148582458496,
"learning_rate": 0.0001533920096389791,
"loss": 4.7021,
"num_input_tokens_seen": 65689856,
"step": 11250
},
{
"epoch": 0.12562744849550112,
"grad_norm": 1.9766299724578857,
"learning_rate": 0.0001533038482448536,
"loss": 4.6921,
"num_input_tokens_seen": 66570272,
"step": 11400
},
{
"epoch": 0.12728044123886295,
"grad_norm": 1.9706653356552124,
"learning_rate": 0.00015321568685072807,
"loss": 4.6847,
"num_input_tokens_seen": 67436096,
"step": 11550
},
{
"epoch": 0.12893343398222482,
"grad_norm": 1.9741766452789307,
"learning_rate": 0.00015312811319923006,
"loss": 4.6835,
"num_input_tokens_seen": 68326816,
"step": 11700
},
{
"epoch": 0.13058642672558668,
"grad_norm": 1.850825548171997,
"learning_rate": 0.00015303995180510456,
"loss": 4.6874,
"num_input_tokens_seen": 69203328,
"step": 11850
},
{
"epoch": 0.13223941946894854,
"grad_norm": 2.0040206909179688,
"learning_rate": 0.00015295179041097904,
"loss": 4.6805,
"num_input_tokens_seen": 70049696,
"step": 12000
},
{
"epoch": 0.13389241221231038,
"grad_norm": 1.9326891899108887,
"learning_rate": 0.00015286362901685354,
"loss": 4.6781,
"num_input_tokens_seen": 70929856,
"step": 12150
},
{
"epoch": 0.13554540495567224,
"grad_norm": 1.8233270645141602,
"learning_rate": 0.00015277546762272802,
"loss": 4.6732,
"num_input_tokens_seen": 71797184,
"step": 12300
},
{
"epoch": 0.1371983976990341,
"grad_norm": 2.026263475418091,
"learning_rate": 0.00015268730622860252,
"loss": 4.672,
"num_input_tokens_seen": 72662528,
"step": 12450
},
{
"epoch": 0.13885139044239594,
"grad_norm": 1.8338570594787598,
"learning_rate": 0.000152599144834477,
"loss": 4.6718,
"num_input_tokens_seen": 73543840,
"step": 12600
},
{
"epoch": 0.1405043831857578,
"grad_norm": 1.934313416481018,
"learning_rate": 0.0001525109834403515,
"loss": 4.6595,
"num_input_tokens_seen": 74426752,
"step": 12750
},
{
"epoch": 0.14215737592911967,
"grad_norm": 1.861647367477417,
"learning_rate": 0.00015242282204622597,
"loss": 4.6692,
"num_input_tokens_seen": 75309888,
"step": 12900
},
{
"epoch": 0.14381036867248154,
"grad_norm": 1.9282541275024414,
"learning_rate": 0.00015233466065210047,
"loss": 4.6604,
"num_input_tokens_seen": 76194976,
"step": 13050
},
{
"epoch": 0.14546336141584337,
"grad_norm": 1.975542664527893,
"learning_rate": 0.00015224649925797495,
"loss": 4.662,
"num_input_tokens_seen": 77078048,
"step": 13200
},
{
"epoch": 0.14711635415920524,
"grad_norm": 1.8979029655456543,
"learning_rate": 0.00015215833786384945,
"loss": 4.6554,
"num_input_tokens_seen": 77940000,
"step": 13350
},
{
"epoch": 0.1487693469025671,
"grad_norm": 1.875108242034912,
"learning_rate": 0.00015207017646972392,
"loss": 4.6599,
"num_input_tokens_seen": 78822336,
"step": 13500
},
{
"epoch": 0.15042233964592897,
"grad_norm": 1.9476161003112793,
"learning_rate": 0.0001519820150755984,
"loss": 4.6688,
"num_input_tokens_seen": 79708256,
"step": 13650
},
{
"epoch": 0.1520753323892908,
"grad_norm": 1.9902242422103882,
"learning_rate": 0.0001518938536814729,
"loss": 4.6626,
"num_input_tokens_seen": 80595168,
"step": 13800
},
{
"epoch": 0.15372832513265267,
"grad_norm": 1.7958108186721802,
"learning_rate": 0.00015180569228734738,
"loss": 4.662,
"num_input_tokens_seen": 81484416,
"step": 13950
},
{
"epoch": 0.15538131787601453,
"grad_norm": 1.8727210760116577,
"learning_rate": 0.00015171753089322188,
"loss": 4.6579,
"num_input_tokens_seen": 82355712,
"step": 14100
},
{
"epoch": 0.1570343106193764,
"grad_norm": 2.0186071395874023,
"learning_rate": 0.00015162936949909635,
"loss": 4.6494,
"num_input_tokens_seen": 83223136,
"step": 14250
},
{
"epoch": 0.15868730336273823,
"grad_norm": 1.8837051391601562,
"learning_rate": 0.00015154120810497086,
"loss": 4.6463,
"num_input_tokens_seen": 84097152,
"step": 14400
},
{
"epoch": 0.1603402961061001,
"grad_norm": 1.8699517250061035,
"learning_rate": 0.00015145304671084533,
"loss": 4.6489,
"num_input_tokens_seen": 84976576,
"step": 14550
},
{
"epoch": 0.16199328884946196,
"grad_norm": 1.956932783126831,
"learning_rate": 0.00015136488531671983,
"loss": 4.649,
"num_input_tokens_seen": 85868224,
"step": 14700
},
{
"epoch": 0.1636462815928238,
"grad_norm": 2.020624876022339,
"learning_rate": 0.0001512767239225943,
"loss": 4.6458,
"num_input_tokens_seen": 86746944,
"step": 14850
},
{
"epoch": 0.16529927433618566,
"grad_norm": 1.9445135593414307,
"learning_rate": 0.0001511885625284688,
"loss": 4.6387,
"num_input_tokens_seen": 87626976,
"step": 15000
},
{
"epoch": 0.16695226707954752,
"grad_norm": 1.9843000173568726,
"learning_rate": 0.00015110040113434328,
"loss": 4.6481,
"num_input_tokens_seen": 88483616,
"step": 15150
},
{
"epoch": 0.1686052598229094,
"grad_norm": 2.0259897708892822,
"learning_rate": 0.00015101282748284528,
"loss": 4.6317,
"num_input_tokens_seen": 89374048,
"step": 15300
},
{
"epoch": 0.17025825256627122,
"grad_norm": 1.8472915887832642,
"learning_rate": 0.00015092466608871975,
"loss": 4.6468,
"num_input_tokens_seen": 90265376,
"step": 15450
},
{
"epoch": 0.1719112453096331,
"grad_norm": 1.9485039710998535,
"learning_rate": 0.00015083650469459425,
"loss": 4.6272,
"num_input_tokens_seen": 91161504,
"step": 15600
},
{
"epoch": 0.17356423805299495,
"grad_norm": 2.0340664386749268,
"learning_rate": 0.00015074834330046873,
"loss": 4.6252,
"num_input_tokens_seen": 92033536,
"step": 15750
},
{
"epoch": 0.17521723079635682,
"grad_norm": 1.8034217357635498,
"learning_rate": 0.00015066018190634323,
"loss": 4.6347,
"num_input_tokens_seen": 92906464,
"step": 15900
},
{
"epoch": 0.17687022353971865,
"grad_norm": 1.9323750734329224,
"learning_rate": 0.0001505720205122177,
"loss": 4.624,
"num_input_tokens_seen": 93773664,
"step": 16050
},
{
"epoch": 0.17852321628308052,
"grad_norm": 1.9791151285171509,
"learning_rate": 0.00015048385911809218,
"loss": 4.6184,
"num_input_tokens_seen": 94646528,
"step": 16200
},
{
"epoch": 0.18017620902644238,
"grad_norm": 2.0325284004211426,
"learning_rate": 0.00015039569772396668,
"loss": 4.6129,
"num_input_tokens_seen": 95522304,
"step": 16350
},
{
"epoch": 0.18182920176980424,
"grad_norm": 1.773972511291504,
"learning_rate": 0.00015030753632984116,
"loss": 4.6283,
"num_input_tokens_seen": 96397632,
"step": 16500
},
{
"epoch": 0.18348219451316608,
"grad_norm": 1.792601466178894,
"learning_rate": 0.00015021937493571566,
"loss": 4.6201,
"num_input_tokens_seen": 97283072,
"step": 16650
},
{
"epoch": 0.18513518725652794,
"grad_norm": 1.9488441944122314,
"learning_rate": 0.00015013121354159013,
"loss": 4.6174,
"num_input_tokens_seen": 98164960,
"step": 16800
},
{
"epoch": 0.1867881799998898,
"grad_norm": 1.8708151578903198,
"learning_rate": 0.00015004305214746464,
"loss": 4.6156,
"num_input_tokens_seen": 99030464,
"step": 16950
},
{
"epoch": 0.18844117274325164,
"grad_norm": 1.9848783016204834,
"learning_rate": 0.0001499548907533391,
"loss": 4.6069,
"num_input_tokens_seen": 99913184,
"step": 17100
},
{
"epoch": 0.1900941654866135,
"grad_norm": 1.9591269493103027,
"learning_rate": 0.00014986672935921361,
"loss": 4.6195,
"num_input_tokens_seen": 100797056,
"step": 17250
},
{
"epoch": 0.19174715822997537,
"grad_norm": 1.9400300979614258,
"learning_rate": 0.0001497785679650881,
"loss": 4.6167,
"num_input_tokens_seen": 101678336,
"step": 17400
},
{
"epoch": 0.19340015097333724,
"grad_norm": 1.9163286685943604,
"learning_rate": 0.0001496904065709626,
"loss": 4.6135,
"num_input_tokens_seen": 102532640,
"step": 17550
},
{
"epoch": 0.19505314371669907,
"grad_norm": 1.86648690700531,
"learning_rate": 0.00014960224517683707,
"loss": 4.6063,
"num_input_tokens_seen": 103403264,
"step": 17700
},
{
"epoch": 0.19670613646006094,
"grad_norm": 1.9310001134872437,
"learning_rate": 0.00014951408378271157,
"loss": 4.6143,
"num_input_tokens_seen": 104304224,
"step": 17850
},
{
"epoch": 0.1983591292034228,
"grad_norm": 1.9832515716552734,
"learning_rate": 0.00014942592238858604,
"loss": 4.6138,
"num_input_tokens_seen": 105184128,
"step": 18000
},
{
"epoch": 0.20001212194678467,
"grad_norm": 1.9453548192977905,
"learning_rate": 0.00014933776099446055,
"loss": 4.6143,
"num_input_tokens_seen": 106070880,
"step": 18150
},
{
"epoch": 0.2016651146901465,
"grad_norm": 1.8135297298431396,
"learning_rate": 0.00014924959960033502,
"loss": 4.6197,
"num_input_tokens_seen": 106944960,
"step": 18300
},
{
"epoch": 0.20331810743350837,
"grad_norm": 1.892717719078064,
"learning_rate": 0.00014916143820620952,
"loss": 4.6039,
"num_input_tokens_seen": 107808512,
"step": 18450
},
{
"epoch": 0.20497110017687023,
"grad_norm": 1.9304077625274658,
"learning_rate": 0.000149073276812084,
"loss": 4.6103,
"num_input_tokens_seen": 108679584,
"step": 18600
},
{
"epoch": 0.20662409292023207,
"grad_norm": 1.874104380607605,
"learning_rate": 0.0001489851154179585,
"loss": 4.6092,
"num_input_tokens_seen": 109561440,
"step": 18750
},
{
"epoch": 0.20827708566359393,
"grad_norm": 1.9672309160232544,
"learning_rate": 0.00014889695402383297,
"loss": 4.6027,
"num_input_tokens_seen": 110445760,
"step": 18900
},
{
"epoch": 0.2099300784069558,
"grad_norm": 1.9013960361480713,
"learning_rate": 0.00014880879262970748,
"loss": 4.6161,
"num_input_tokens_seen": 111310304,
"step": 19050
},
{
"epoch": 0.21158307115031766,
"grad_norm": 1.902948021888733,
"learning_rate": 0.00014872063123558195,
"loss": 4.6118,
"num_input_tokens_seen": 112181440,
"step": 19200
},
{
"epoch": 0.2132360638936795,
"grad_norm": 1.9160059690475464,
"learning_rate": 0.00014863246984145645,
"loss": 4.5929,
"num_input_tokens_seen": 113074496,
"step": 19350
},
{
"epoch": 0.21488905663704136,
"grad_norm": 1.843983769416809,
"learning_rate": 0.00014854489618995845,
"loss": 4.5982,
"num_input_tokens_seen": 113972512,
"step": 19500
},
{
"epoch": 0.21654204938040322,
"grad_norm": 1.83791184425354,
"learning_rate": 0.00014845673479583292,
"loss": 4.6046,
"num_input_tokens_seen": 114839680,
"step": 19650
},
{
"epoch": 0.2181950421237651,
"grad_norm": 1.8458436727523804,
"learning_rate": 0.00014836857340170742,
"loss": 4.5987,
"num_input_tokens_seen": 115716000,
"step": 19800
},
{
"epoch": 0.21984803486712692,
"grad_norm": 2.0030035972595215,
"learning_rate": 0.0001482804120075819,
"loss": 4.5873,
"num_input_tokens_seen": 116581408,
"step": 19950
},
{
"epoch": 0.2215010276104888,
"grad_norm": 1.8120313882827759,
"learning_rate": 0.00014819225061345637,
"loss": 4.5893,
"num_input_tokens_seen": 117446368,
"step": 20100
},
{
"epoch": 0.22315402035385065,
"grad_norm": 1.8799773454666138,
"learning_rate": 0.00014810408921933087,
"loss": 4.5746,
"num_input_tokens_seen": 118320096,
"step": 20250
},
{
"epoch": 0.22480701309721252,
"grad_norm": 1.9042309522628784,
"learning_rate": 0.00014801592782520535,
"loss": 4.5851,
"num_input_tokens_seen": 119192128,
"step": 20400
},
{
"epoch": 0.22646000584057435,
"grad_norm": 1.8850473165512085,
"learning_rate": 0.00014792776643107985,
"loss": 4.5883,
"num_input_tokens_seen": 120065888,
"step": 20550
},
{
"epoch": 0.22811299858393622,
"grad_norm": 1.8963854312896729,
"learning_rate": 0.00014783960503695433,
"loss": 4.5869,
"num_input_tokens_seen": 120921504,
"step": 20700
},
{
"epoch": 0.22976599132729808,
"grad_norm": 1.8145036697387695,
"learning_rate": 0.00014775144364282883,
"loss": 4.5857,
"num_input_tokens_seen": 121819936,
"step": 20850
},
{
"epoch": 0.23141898407065992,
"grad_norm": 1.8780988454818726,
"learning_rate": 0.0001476632822487033,
"loss": 4.5793,
"num_input_tokens_seen": 122700576,
"step": 21000
},
{
"epoch": 0.23307197681402178,
"grad_norm": 1.8859424591064453,
"learning_rate": 0.00014757512085457778,
"loss": 4.5847,
"num_input_tokens_seen": 123578848,
"step": 21150
},
{
"epoch": 0.23472496955738364,
"grad_norm": 1.8556190729141235,
"learning_rate": 0.00014748695946045228,
"loss": 4.5915,
"num_input_tokens_seen": 124451552,
"step": 21300
},
{
"epoch": 0.2363779623007455,
"grad_norm": 1.8445396423339844,
"learning_rate": 0.00014739879806632676,
"loss": 4.5857,
"num_input_tokens_seen": 125326208,
"step": 21450
},
{
"epoch": 0.23803095504410735,
"grad_norm": 1.903262972831726,
"learning_rate": 0.00014731063667220126,
"loss": 4.5811,
"num_input_tokens_seen": 126206560,
"step": 21600
},
{
"epoch": 0.2396839477874692,
"grad_norm": 1.7595880031585693,
"learning_rate": 0.00014722247527807573,
"loss": 4.567,
"num_input_tokens_seen": 127077440,
"step": 21750
},
{
"epoch": 0.24133694053083107,
"grad_norm": 1.8828771114349365,
"learning_rate": 0.00014713431388395024,
"loss": 4.5792,
"num_input_tokens_seen": 127955584,
"step": 21900
},
{
"epoch": 0.24298993327419294,
"grad_norm": 1.8850219249725342,
"learning_rate": 0.0001470461524898247,
"loss": 4.5749,
"num_input_tokens_seen": 128807200,
"step": 22050
},
{
"epoch": 0.24464292601755477,
"grad_norm": 1.9162580966949463,
"learning_rate": 0.00014695799109569919,
"loss": 4.5725,
"num_input_tokens_seen": 129672192,
"step": 22200
},
{
"epoch": 0.24629591876091664,
"grad_norm": 1.866351842880249,
"learning_rate": 0.0001468698297015737,
"loss": 4.569,
"num_input_tokens_seen": 130541696,
"step": 22350
},
{
"epoch": 0.2479489115042785,
"grad_norm": 1.849186658859253,
"learning_rate": 0.00014678166830744816,
"loss": 4.5832,
"num_input_tokens_seen": 131420992,
"step": 22500
},
{
"epoch": 0.24960190424764037,
"grad_norm": 1.8402087688446045,
"learning_rate": 0.00014669350691332266,
"loss": 4.5664,
"num_input_tokens_seen": 132312352,
"step": 22650
},
{
"epoch": 0.25125489699100223,
"grad_norm": 1.8887277841567993,
"learning_rate": 0.00014660534551919714,
"loss": 4.5771,
"num_input_tokens_seen": 133195968,
"step": 22800
},
{
"epoch": 0.25290788973436407,
"grad_norm": 2.029491424560547,
"learning_rate": 0.00014651718412507164,
"loss": 4.5718,
"num_input_tokens_seen": 134079040,
"step": 22950
},
{
"epoch": 0.2545608824777259,
"grad_norm": 1.9162187576293945,
"learning_rate": 0.00014642902273094612,
"loss": 4.5746,
"num_input_tokens_seen": 134957440,
"step": 23100
},
{
"epoch": 0.2562138752210878,
"grad_norm": 1.8331772089004517,
"learning_rate": 0.00014634086133682062,
"loss": 4.5736,
"num_input_tokens_seen": 135835328,
"step": 23250
},
{
"epoch": 0.25786686796444963,
"grad_norm": 1.9076178073883057,
"learning_rate": 0.0001462526999426951,
"loss": 4.5647,
"num_input_tokens_seen": 136701184,
"step": 23400
},
{
"epoch": 0.25951986070781147,
"grad_norm": 1.9407544136047363,
"learning_rate": 0.0001461645385485696,
"loss": 4.5607,
"num_input_tokens_seen": 137547264,
"step": 23550
},
{
"epoch": 0.26117285345117336,
"grad_norm": 1.9287118911743164,
"learning_rate": 0.0001460769648970716,
"loss": 4.5642,
"num_input_tokens_seen": 138417888,
"step": 23700
},
{
"epoch": 0.2628258461945352,
"grad_norm": 2.0106074810028076,
"learning_rate": 0.00014598939124557358,
"loss": 4.5689,
"num_input_tokens_seen": 139297088,
"step": 23850
},
{
"epoch": 0.2644788389378971,
"grad_norm": 1.8796470165252686,
"learning_rate": 0.00014590122985144808,
"loss": 4.5736,
"num_input_tokens_seen": 140165984,
"step": 24000
},
{
"epoch": 0.2661318316812589,
"grad_norm": 1.8495882749557495,
"learning_rate": 0.00014581306845732256,
"loss": 4.5613,
"num_input_tokens_seen": 141043680,
"step": 24150
},
{
"epoch": 0.26778482442462076,
"grad_norm": 1.8603812456130981,
"learning_rate": 0.00014572490706319706,
"loss": 4.5707,
"num_input_tokens_seen": 141913088,
"step": 24300
},
{
"epoch": 0.26943781716798265,
"grad_norm": 1.8450992107391357,
"learning_rate": 0.00014563674566907153,
"loss": 4.5685,
"num_input_tokens_seen": 142780992,
"step": 24450
},
{
"epoch": 0.2710908099113445,
"grad_norm": 1.9459301233291626,
"learning_rate": 0.000145548584274946,
"loss": 4.5624,
"num_input_tokens_seen": 143655008,
"step": 24600
},
{
"epoch": 0.2727438026547063,
"grad_norm": 1.87797212600708,
"learning_rate": 0.0001454604228808205,
"loss": 4.5753,
"num_input_tokens_seen": 144533760,
"step": 24750
},
{
"epoch": 0.2743967953980682,
"grad_norm": 1.9368420839309692,
"learning_rate": 0.00014537226148669498,
"loss": 4.5473,
"num_input_tokens_seen": 145405376,
"step": 24900
},
{
"epoch": 0.27604978814143005,
"grad_norm": 1.9425833225250244,
"learning_rate": 0.0001452841000925695,
"loss": 4.5621,
"num_input_tokens_seen": 146278176,
"step": 25050
},
{
"epoch": 0.2777027808847919,
"grad_norm": 1.8315942287445068,
"learning_rate": 0.00014519593869844396,
"loss": 4.5441,
"num_input_tokens_seen": 147150880,
"step": 25200
},
{
"epoch": 0.2793557736281538,
"grad_norm": 1.865020990371704,
"learning_rate": 0.00014510777730431846,
"loss": 4.5681,
"num_input_tokens_seen": 148039488,
"step": 25350
},
{
"epoch": 0.2810087663715156,
"grad_norm": 1.9058725833892822,
"learning_rate": 0.00014501961591019294,
"loss": 4.55,
"num_input_tokens_seen": 148898720,
"step": 25500
},
{
"epoch": 0.2826617591148775,
"grad_norm": 1.9358283281326294,
"learning_rate": 0.00014493145451606741,
"loss": 4.5608,
"num_input_tokens_seen": 149776576,
"step": 25650
},
{
"epoch": 0.28431475185823935,
"grad_norm": 1.871090292930603,
"learning_rate": 0.00014484329312194192,
"loss": 4.539,
"num_input_tokens_seen": 150667296,
"step": 25800
},
{
"epoch": 0.2859677446016012,
"grad_norm": 1.885185956954956,
"learning_rate": 0.0001447551317278164,
"loss": 4.5421,
"num_input_tokens_seen": 151536512,
"step": 25950
},
{
"epoch": 0.2876207373449631,
"grad_norm": 1.851283073425293,
"learning_rate": 0.0001446669703336909,
"loss": 4.5469,
"num_input_tokens_seen": 152395904,
"step": 26100
},
{
"epoch": 0.2892737300883249,
"grad_norm": 1.8593121767044067,
"learning_rate": 0.00014457880893956537,
"loss": 4.5536,
"num_input_tokens_seen": 153263904,
"step": 26250
},
{
"epoch": 0.29092672283168675,
"grad_norm": 1.8973345756530762,
"learning_rate": 0.00014449064754543987,
"loss": 4.5613,
"num_input_tokens_seen": 154143616,
"step": 26400
},
{
"epoch": 0.29257971557504864,
"grad_norm": 1.9224932193756104,
"learning_rate": 0.00014440307389394186,
"loss": 4.5513,
"num_input_tokens_seen": 155013568,
"step": 26550
},
{
"epoch": 0.2942327083184105,
"grad_norm": 1.8901547193527222,
"learning_rate": 0.00014431491249981634,
"loss": 4.5552,
"num_input_tokens_seen": 155889408,
"step": 26700
},
{
"epoch": 0.29588570106177237,
"grad_norm": 1.8522729873657227,
"learning_rate": 0.0001442267511056908,
"loss": 4.546,
"num_input_tokens_seen": 156769184,
"step": 26850
},
{
"epoch": 0.2975386938051342,
"grad_norm": 1.8729475736618042,
"learning_rate": 0.00014413858971156531,
"loss": 4.553,
"num_input_tokens_seen": 157648416,
"step": 27000
},
{
"epoch": 0.29919168654849604,
"grad_norm": 1.8859171867370605,
"learning_rate": 0.0001440504283174398,
"loss": 4.5468,
"num_input_tokens_seen": 158528480,
"step": 27150
},
{
"epoch": 0.30084467929185793,
"grad_norm": 1.8305902481079102,
"learning_rate": 0.0001439628546659418,
"loss": 4.554,
"num_input_tokens_seen": 159401376,
"step": 27300
},
{
"epoch": 0.30249767203521977,
"grad_norm": 1.7858612537384033,
"learning_rate": 0.00014387469327181628,
"loss": 4.5459,
"num_input_tokens_seen": 160297888,
"step": 27450
},
{
"epoch": 0.3041506647785816,
"grad_norm": 1.9333041906356812,
"learning_rate": 0.00014378653187769078,
"loss": 4.5488,
"num_input_tokens_seen": 161189632,
"step": 27600
},
{
"epoch": 0.3058036575219435,
"grad_norm": 1.904167652130127,
"learning_rate": 0.00014369837048356526,
"loss": 4.5457,
"num_input_tokens_seen": 162069728,
"step": 27750
},
{
"epoch": 0.30745665026530533,
"grad_norm": 1.8620493412017822,
"learning_rate": 0.00014361020908943976,
"loss": 4.543,
"num_input_tokens_seen": 162956000,
"step": 27900
},
{
"epoch": 0.30910964300866717,
"grad_norm": 1.9523295164108276,
"learning_rate": 0.00014352204769531424,
"loss": 4.5368,
"num_input_tokens_seen": 163850880,
"step": 28050
},
{
"epoch": 0.31076263575202906,
"grad_norm": 1.842017412185669,
"learning_rate": 0.00014343388630118874,
"loss": 4.5423,
"num_input_tokens_seen": 164723840,
"step": 28200
},
{
"epoch": 0.3124156284953909,
"grad_norm": 2.015977621078491,
"learning_rate": 0.0001433457249070632,
"loss": 4.5306,
"num_input_tokens_seen": 165582464,
"step": 28350
},
{
"epoch": 0.3140686212387528,
"grad_norm": 1.7622820138931274,
"learning_rate": 0.00014325756351293772,
"loss": 4.5336,
"num_input_tokens_seen": 166442496,
"step": 28500
},
{
"epoch": 0.3157216139821146,
"grad_norm": 1.8762463331222534,
"learning_rate": 0.0001431694021188122,
"loss": 4.5363,
"num_input_tokens_seen": 167318048,
"step": 28650
},
{
"epoch": 0.31737460672547646,
"grad_norm": 1.9524190425872803,
"learning_rate": 0.00014308124072468667,
"loss": 4.5498,
"num_input_tokens_seen": 168209312,
"step": 28800
},
{
"epoch": 0.31902759946883835,
"grad_norm": 1.8535164594650269,
"learning_rate": 0.00014299307933056117,
"loss": 4.5481,
"num_input_tokens_seen": 169096896,
"step": 28950
},
{
"epoch": 0.3206805922122002,
"grad_norm": 1.9309056997299194,
"learning_rate": 0.00014290491793643564,
"loss": 4.5416,
"num_input_tokens_seen": 169963008,
"step": 29100
},
{
"epoch": 0.322333584955562,
"grad_norm": 1.8341314792633057,
"learning_rate": 0.00014281675654231014,
"loss": 4.5439,
"num_input_tokens_seen": 170836736,
"step": 29250
},
{
"epoch": 0.3239865776989239,
"grad_norm": 1.8341432809829712,
"learning_rate": 0.00014272859514818462,
"loss": 4.5365,
"num_input_tokens_seen": 171728896,
"step": 29400
},
{
"epoch": 0.32563957044228575,
"grad_norm": 1.9161962270736694,
"learning_rate": 0.00014264043375405912,
"loss": 4.5428,
"num_input_tokens_seen": 172603008,
"step": 29550
},
{
"epoch": 0.3272925631856476,
"grad_norm": 1.8521162271499634,
"learning_rate": 0.0001425522723599336,
"loss": 4.5283,
"num_input_tokens_seen": 173475392,
"step": 29700
},
{
"epoch": 0.3289455559290095,
"grad_norm": 1.9026546478271484,
"learning_rate": 0.0001424641109658081,
"loss": 4.5306,
"num_input_tokens_seen": 174360832,
"step": 29850
},
{
"epoch": 0.3305985486723713,
"grad_norm": 1.9297667741775513,
"learning_rate": 0.00014237594957168257,
"loss": 4.5301,
"num_input_tokens_seen": 175244576,
"step": 30000
},
{
"epoch": 0.3322515414157332,
"grad_norm": 1.9747087955474854,
"learning_rate": 0.00014228778817755705,
"loss": 4.54,
"num_input_tokens_seen": 176108352,
"step": 30150
},
{
"epoch": 0.33390453415909505,
"grad_norm": 1.9235451221466064,
"learning_rate": 0.00014219962678343155,
"loss": 4.5438,
"num_input_tokens_seen": 176987808,
"step": 30300
},
{
"epoch": 0.3355575269024569,
"grad_norm": 1.8416277170181274,
"learning_rate": 0.00014211146538930603,
"loss": 4.5476,
"num_input_tokens_seen": 177875584,
"step": 30450
},
{
"epoch": 0.3372105196458188,
"grad_norm": 1.899798035621643,
"learning_rate": 0.00014202330399518053,
"loss": 4.5413,
"num_input_tokens_seen": 178752480,
"step": 30600
},
{
"epoch": 0.3388635123891806,
"grad_norm": 1.8849375247955322,
"learning_rate": 0.000141935142601055,
"loss": 4.5199,
"num_input_tokens_seen": 179634080,
"step": 30750
},
{
"epoch": 0.34051650513254245,
"grad_norm": 1.7944519519805908,
"learning_rate": 0.0001418469812069295,
"loss": 4.5268,
"num_input_tokens_seen": 180501472,
"step": 30900
},
{
"epoch": 0.34216949787590434,
"grad_norm": 1.8572932481765747,
"learning_rate": 0.00014175881981280398,
"loss": 4.5268,
"num_input_tokens_seen": 181388384,
"step": 31050
},
{
"epoch": 0.3438224906192662,
"grad_norm": 1.8637559413909912,
"learning_rate": 0.00014167065841867846,
"loss": 4.5245,
"num_input_tokens_seen": 182271040,
"step": 31200
},
{
"epoch": 0.345475483362628,
"grad_norm": 1.902794361114502,
"learning_rate": 0.00014158249702455296,
"loss": 4.5138,
"num_input_tokens_seen": 183160032,
"step": 31350
},
{
"epoch": 0.3471284761059899,
"grad_norm": 1.8915212154388428,
"learning_rate": 0.00014149433563042743,
"loss": 4.5296,
"num_input_tokens_seen": 184045856,
"step": 31500
},
{
"epoch": 0.34878146884935174,
"grad_norm": 1.9054772853851318,
"learning_rate": 0.00014140617423630193,
"loss": 4.5343,
"num_input_tokens_seen": 184935872,
"step": 31650
},
{
"epoch": 0.35043446159271363,
"grad_norm": 1.8381603956222534,
"learning_rate": 0.0001413180128421764,
"loss": 4.5241,
"num_input_tokens_seen": 185812160,
"step": 31800
},
{
"epoch": 0.35208745433607547,
"grad_norm": 1.8929849863052368,
"learning_rate": 0.0001412298514480509,
"loss": 4.5311,
"num_input_tokens_seen": 186698304,
"step": 31950
},
{
"epoch": 0.3537404470794373,
"grad_norm": 1.8554471731185913,
"learning_rate": 0.0001411422777965529,
"loss": 4.52,
"num_input_tokens_seen": 187570560,
"step": 32100
},
{
"epoch": 0.3553934398227992,
"grad_norm": 1.8524342775344849,
"learning_rate": 0.00014105411640242738,
"loss": 4.5231,
"num_input_tokens_seen": 188446880,
"step": 32250
},
{
"epoch": 0.35704643256616103,
"grad_norm": 1.8730753660202026,
"learning_rate": 0.00014096595500830188,
"loss": 4.5315,
"num_input_tokens_seen": 189329856,
"step": 32400
},
{
"epoch": 0.35869942530952287,
"grad_norm": 1.8252793550491333,
"learning_rate": 0.00014087779361417636,
"loss": 4.5165,
"num_input_tokens_seen": 190203872,
"step": 32550
},
{
"epoch": 0.36035241805288476,
"grad_norm": 1.8541933298110962,
"learning_rate": 0.00014078963222005086,
"loss": 4.5318,
"num_input_tokens_seen": 191086784,
"step": 32700
},
{
"epoch": 0.3620054107962466,
"grad_norm": 1.9790152311325073,
"learning_rate": 0.00014070147082592533,
"loss": 4.5202,
"num_input_tokens_seen": 191964288,
"step": 32850
},
{
"epoch": 0.3636584035396085,
"grad_norm": 1.980690836906433,
"learning_rate": 0.00014061330943179983,
"loss": 4.5282,
"num_input_tokens_seen": 192848672,
"step": 33000
},
{
"epoch": 0.3653113962829703,
"grad_norm": 1.8499431610107422,
"learning_rate": 0.0001405251480376743,
"loss": 4.5094,
"num_input_tokens_seen": 193723936,
"step": 33150
},
{
"epoch": 0.36696438902633216,
"grad_norm": 1.7975043058395386,
"learning_rate": 0.0001404369866435488,
"loss": 4.5296,
"num_input_tokens_seen": 194603072,
"step": 33300
},
{
"epoch": 0.36861738176969405,
"grad_norm": 1.8439886569976807,
"learning_rate": 0.0001403488252494233,
"loss": 4.5274,
"num_input_tokens_seen": 195468512,
"step": 33450
},
{
"epoch": 0.3702703745130559,
"grad_norm": 1.8969649076461792,
"learning_rate": 0.0001402606638552978,
"loss": 4.5195,
"num_input_tokens_seen": 196345888,
"step": 33600
},
{
"epoch": 0.3719233672564177,
"grad_norm": 1.8763043880462646,
"learning_rate": 0.00014017250246117226,
"loss": 4.5071,
"num_input_tokens_seen": 197239776,
"step": 33750
},
{
"epoch": 0.3735763599997796,
"grad_norm": 1.8754463195800781,
"learning_rate": 0.00014008434106704677,
"loss": 4.513,
"num_input_tokens_seen": 198122528,
"step": 33900
},
{
"epoch": 0.37522935274314145,
"grad_norm": 2.011179208755493,
"learning_rate": 0.00013999617967292124,
"loss": 4.5146,
"num_input_tokens_seen": 198998208,
"step": 34050
},
{
"epoch": 0.3768823454865033,
"grad_norm": 1.8127686977386475,
"learning_rate": 0.00013990801827879574,
"loss": 4.5103,
"num_input_tokens_seen": 199877440,
"step": 34200
},
{
"epoch": 0.3785353382298652,
"grad_norm": 1.792159080505371,
"learning_rate": 0.00013981985688467022,
"loss": 4.5139,
"num_input_tokens_seen": 200738880,
"step": 34350
},
{
"epoch": 0.380188330973227,
"grad_norm": 1.7812391519546509,
"learning_rate": 0.0001397322832331722,
"loss": 4.5151,
"num_input_tokens_seen": 201618880,
"step": 34500
},
{
"epoch": 0.3818413237165889,
"grad_norm": 1.8573509454727173,
"learning_rate": 0.00013964412183904668,
"loss": 4.5113,
"num_input_tokens_seen": 202481888,
"step": 34650
},
{
"epoch": 0.38349431645995075,
"grad_norm": 1.9190624952316284,
"learning_rate": 0.0001395559604449212,
"loss": 4.5118,
"num_input_tokens_seen": 203344320,
"step": 34800
},
{
"epoch": 0.3851473092033126,
"grad_norm": 1.8508821725845337,
"learning_rate": 0.00013946779905079566,
"loss": 4.5152,
"num_input_tokens_seen": 204227936,
"step": 34950
},
{
"epoch": 0.3868003019466745,
"grad_norm": 1.8276251554489136,
"learning_rate": 0.00013937963765667016,
"loss": 4.5182,
"num_input_tokens_seen": 205119008,
"step": 35100
},
{
"epoch": 0.3884532946900363,
"grad_norm": 1.773224949836731,
"learning_rate": 0.00013929147626254464,
"loss": 4.5091,
"num_input_tokens_seen": 205989696,
"step": 35250
},
{
"epoch": 0.39010628743339815,
"grad_norm": 1.889553189277649,
"learning_rate": 0.00013920331486841914,
"loss": 4.5111,
"num_input_tokens_seen": 206865088,
"step": 35400
},
{
"epoch": 0.39175928017676004,
"grad_norm": 1.844975233078003,
"learning_rate": 0.00013911515347429362,
"loss": 4.4982,
"num_input_tokens_seen": 207738880,
"step": 35550
},
{
"epoch": 0.3934122729201219,
"grad_norm": 1.8811978101730347,
"learning_rate": 0.0001390269920801681,
"loss": 4.5168,
"num_input_tokens_seen": 208626688,
"step": 35700
},
{
"epoch": 0.3950652656634837,
"grad_norm": 1.8564339876174927,
"learning_rate": 0.0001389388306860426,
"loss": 4.5122,
"num_input_tokens_seen": 209484800,
"step": 35850
},
{
"epoch": 0.3967182584068456,
"grad_norm": 1.8316396474838257,
"learning_rate": 0.00013885066929191707,
"loss": 4.5046,
"num_input_tokens_seen": 210369792,
"step": 36000
},
{
"epoch": 0.39837125115020744,
"grad_norm": 1.925075650215149,
"learning_rate": 0.00013876250789779157,
"loss": 4.5116,
"num_input_tokens_seen": 211250176,
"step": 36150
},
{
"epoch": 0.40002424389356933,
"grad_norm": 1.860660195350647,
"learning_rate": 0.00013867434650366605,
"loss": 4.4998,
"num_input_tokens_seen": 212136000,
"step": 36300
},
{
"epoch": 0.40167723663693117,
"grad_norm": 1.8505064249038696,
"learning_rate": 0.00013858618510954055,
"loss": 4.5124,
"num_input_tokens_seen": 213009056,
"step": 36450
},
{
"epoch": 0.403330229380293,
"grad_norm": 1.8910654783248901,
"learning_rate": 0.00013849802371541502,
"loss": 4.4966,
"num_input_tokens_seen": 213871744,
"step": 36600
},
{
"epoch": 0.4049832221236549,
"grad_norm": 1.883748173713684,
"learning_rate": 0.0001384098623212895,
"loss": 4.5074,
"num_input_tokens_seen": 214739520,
"step": 36750
},
{
"epoch": 0.40663621486701673,
"grad_norm": 1.8115665912628174,
"learning_rate": 0.000138321700927164,
"loss": 4.5187,
"num_input_tokens_seen": 215629472,
"step": 36900
},
{
"epoch": 0.40828920761037857,
"grad_norm": 1.9036102294921875,
"learning_rate": 0.000138234127275666,
"loss": 4.4892,
"num_input_tokens_seen": 216497184,
"step": 37050
},
{
"epoch": 0.40994220035374046,
"grad_norm": 1.8916597366333008,
"learning_rate": 0.0001381459658815405,
"loss": 4.5019,
"num_input_tokens_seen": 217381152,
"step": 37200
},
{
"epoch": 0.4115951930971023,
"grad_norm": 1.8847101926803589,
"learning_rate": 0.00013805780448741497,
"loss": 4.5064,
"num_input_tokens_seen": 218251456,
"step": 37350
},
{
"epoch": 0.41324818584046413,
"grad_norm": 1.730322241783142,
"learning_rate": 0.00013796964309328947,
"loss": 4.4988,
"num_input_tokens_seen": 219141056,
"step": 37500
},
{
"epoch": 0.414901178583826,
"grad_norm": 1.833764672279358,
"learning_rate": 0.00013788148169916395,
"loss": 4.5155,
"num_input_tokens_seen": 220036992,
"step": 37650
},
{
"epoch": 0.41655417132718786,
"grad_norm": 1.8188276290893555,
"learning_rate": 0.00013779332030503845,
"loss": 4.4984,
"num_input_tokens_seen": 220910208,
"step": 37800
},
{
"epoch": 0.41820716407054975,
"grad_norm": 1.7197022438049316,
"learning_rate": 0.00013770515891091292,
"loss": 4.5018,
"num_input_tokens_seen": 221803520,
"step": 37950
},
{
"epoch": 0.4198601568139116,
"grad_norm": 1.879470944404602,
"learning_rate": 0.00013761699751678742,
"loss": 4.497,
"num_input_tokens_seen": 222679392,
"step": 38100
},
{
"epoch": 0.4215131495572734,
"grad_norm": 1.716430902481079,
"learning_rate": 0.0001375288361226619,
"loss": 4.4979,
"num_input_tokens_seen": 223568224,
"step": 38250
},
{
"epoch": 0.4231661423006353,
"grad_norm": 1.8879536390304565,
"learning_rate": 0.0001374406747285364,
"loss": 4.4972,
"num_input_tokens_seen": 224460192,
"step": 38400
},
{
"epoch": 0.42481913504399715,
"grad_norm": 1.9361037015914917,
"learning_rate": 0.00013735251333441088,
"loss": 4.4968,
"num_input_tokens_seen": 225318816,
"step": 38550
},
{
"epoch": 0.426472127787359,
"grad_norm": 1.8587005138397217,
"learning_rate": 0.00013726435194028535,
"loss": 4.4982,
"num_input_tokens_seen": 226176704,
"step": 38700
},
{
"epoch": 0.4281251205307209,
"grad_norm": 1.8819633722305298,
"learning_rate": 0.00013717619054615985,
"loss": 4.4929,
"num_input_tokens_seen": 227049984,
"step": 38850
},
{
"epoch": 0.4297781132740827,
"grad_norm": 1.8268516063690186,
"learning_rate": 0.00013708802915203433,
"loss": 4.5063,
"num_input_tokens_seen": 227926784,
"step": 39000
},
{
"epoch": 0.4314311060174446,
"grad_norm": 1.8819466829299927,
"learning_rate": 0.00013699986775790883,
"loss": 4.5024,
"num_input_tokens_seen": 228823584,
"step": 39150
},
{
"epoch": 0.43308409876080645,
"grad_norm": 1.8801319599151611,
"learning_rate": 0.0001369117063637833,
"loss": 4.5043,
"num_input_tokens_seen": 229691904,
"step": 39300
},
{
"epoch": 0.4347370915041683,
"grad_norm": 1.8677760362625122,
"learning_rate": 0.0001368241327122853,
"loss": 4.5102,
"num_input_tokens_seen": 230569056,
"step": 39450
},
{
"epoch": 0.4363900842475302,
"grad_norm": 1.8280800580978394,
"learning_rate": 0.0001367359713181598,
"loss": 4.5014,
"num_input_tokens_seen": 231437792,
"step": 39600
},
{
"epoch": 0.438043076990892,
"grad_norm": 1.7932913303375244,
"learning_rate": 0.00013664780992403427,
"loss": 4.5082,
"num_input_tokens_seen": 232310848,
"step": 39750
},
{
"epoch": 0.43969606973425385,
"grad_norm": 1.7872642278671265,
"learning_rate": 0.00013655964852990878,
"loss": 4.5091,
"num_input_tokens_seen": 233190464,
"step": 39900
},
{
"epoch": 0.44134906247761574,
"grad_norm": 1.9207595586776733,
"learning_rate": 0.00013647148713578325,
"loss": 4.4932,
"num_input_tokens_seen": 234064256,
"step": 40050
},
{
"epoch": 0.4430020552209776,
"grad_norm": 1.9176338911056519,
"learning_rate": 0.00013638332574165773,
"loss": 4.4982,
"num_input_tokens_seen": 234931104,
"step": 40200
},
{
"epoch": 0.4446550479643394,
"grad_norm": 1.839328646659851,
"learning_rate": 0.00013629516434753223,
"loss": 4.5,
"num_input_tokens_seen": 235790176,
"step": 40350
},
{
"epoch": 0.4463080407077013,
"grad_norm": 1.8120410442352295,
"learning_rate": 0.0001362070029534067,
"loss": 4.4866,
"num_input_tokens_seen": 236663488,
"step": 40500
},
{
"epoch": 0.44796103345106314,
"grad_norm": 1.7365509271621704,
"learning_rate": 0.0001361188415592812,
"loss": 4.484,
"num_input_tokens_seen": 237550016,
"step": 40650
},
{
"epoch": 0.44961402619442503,
"grad_norm": 1.8573724031448364,
"learning_rate": 0.00013603068016515568,
"loss": 4.4949,
"num_input_tokens_seen": 238428800,
"step": 40800
},
{
"epoch": 0.45126701893778687,
"grad_norm": 1.8714196681976318,
"learning_rate": 0.00013594251877103018,
"loss": 4.4861,
"num_input_tokens_seen": 239301216,
"step": 40950
},
{
"epoch": 0.4529200116811487,
"grad_norm": 1.813636064529419,
"learning_rate": 0.00013585435737690466,
"loss": 4.4939,
"num_input_tokens_seen": 240172352,
"step": 41100
},
{
"epoch": 0.4545730044245106,
"grad_norm": 1.7828494310379028,
"learning_rate": 0.00013576619598277913,
"loss": 4.5073,
"num_input_tokens_seen": 241058368,
"step": 41250
},
{
"epoch": 0.45622599716787243,
"grad_norm": 1.9278478622436523,
"learning_rate": 0.00013567803458865363,
"loss": 4.4831,
"num_input_tokens_seen": 241944320,
"step": 41400
},
{
"epoch": 0.45787898991123427,
"grad_norm": 1.8244856595993042,
"learning_rate": 0.0001355898731945281,
"loss": 4.4889,
"num_input_tokens_seen": 242827808,
"step": 41550
},
{
"epoch": 0.45953198265459616,
"grad_norm": 1.9578852653503418,
"learning_rate": 0.0001355017118004026,
"loss": 4.4938,
"num_input_tokens_seen": 243697248,
"step": 41700
},
{
"epoch": 0.461184975397958,
"grad_norm": 1.8681639432907104,
"learning_rate": 0.0001354135504062771,
"loss": 4.493,
"num_input_tokens_seen": 244549632,
"step": 41850
},
{
"epoch": 0.46283796814131983,
"grad_norm": 1.9330469369888306,
"learning_rate": 0.0001353253890121516,
"loss": 4.4823,
"num_input_tokens_seen": 245429600,
"step": 42000
},
{
"epoch": 0.4644909608846817,
"grad_norm": 1.775819182395935,
"learning_rate": 0.00013523722761802606,
"loss": 4.4887,
"num_input_tokens_seen": 246307872,
"step": 42150
},
{
"epoch": 0.46614395362804356,
"grad_norm": 1.9359744787216187,
"learning_rate": 0.00013514906622390057,
"loss": 4.4813,
"num_input_tokens_seen": 247194432,
"step": 42300
},
{
"epoch": 0.46779694637140545,
"grad_norm": 1.8034732341766357,
"learning_rate": 0.00013506090482977504,
"loss": 4.4957,
"num_input_tokens_seen": 248049248,
"step": 42450
},
{
"epoch": 0.4694499391147673,
"grad_norm": 1.8069413900375366,
"learning_rate": 0.00013497274343564954,
"loss": 4.484,
"num_input_tokens_seen": 248930624,
"step": 42600
},
{
"epoch": 0.4711029318581291,
"grad_norm": 1.8796617984771729,
"learning_rate": 0.00013488458204152402,
"loss": 4.4887,
"num_input_tokens_seen": 249808512,
"step": 42750
},
{
"epoch": 0.472755924601491,
"grad_norm": 1.8367630243301392,
"learning_rate": 0.00013479642064739852,
"loss": 4.4779,
"num_input_tokens_seen": 250669088,
"step": 42900
},
{
"epoch": 0.47440891734485285,
"grad_norm": 1.9085215330123901,
"learning_rate": 0.0001347088469959005,
"loss": 4.489,
"num_input_tokens_seen": 251553792,
"step": 43050
},
{
"epoch": 0.4760619100882147,
"grad_norm": 1.8323450088500977,
"learning_rate": 0.000134620685601775,
"loss": 4.482,
"num_input_tokens_seen": 252440032,
"step": 43200
},
{
"epoch": 0.4777149028315766,
"grad_norm": 2.1364452838897705,
"learning_rate": 0.0001345325242076495,
"loss": 4.4873,
"num_input_tokens_seen": 253320576,
"step": 43350
},
{
"epoch": 0.4793678955749384,
"grad_norm": 1.8431365489959717,
"learning_rate": 0.00013444436281352396,
"loss": 4.4844,
"num_input_tokens_seen": 254203552,
"step": 43500
},
{
"epoch": 0.4810208883183003,
"grad_norm": 1.853715419769287,
"learning_rate": 0.00013435620141939847,
"loss": 4.4905,
"num_input_tokens_seen": 255078656,
"step": 43650
},
{
"epoch": 0.48267388106166215,
"grad_norm": 2.0351803302764893,
"learning_rate": 0.00013426804002527294,
"loss": 4.4854,
"num_input_tokens_seen": 255961792,
"step": 43800
},
{
"epoch": 0.484326873805024,
"grad_norm": 1.867595911026001,
"learning_rate": 0.00013417987863114744,
"loss": 4.4734,
"num_input_tokens_seen": 256823744,
"step": 43950
},
{
"epoch": 0.4859798665483859,
"grad_norm": 1.9231115579605103,
"learning_rate": 0.00013409171723702192,
"loss": 4.4982,
"num_input_tokens_seen": 257687488,
"step": 44100
},
{
"epoch": 0.4876328592917477,
"grad_norm": 1.9620647430419922,
"learning_rate": 0.00013400355584289642,
"loss": 4.4815,
"num_input_tokens_seen": 258561568,
"step": 44250
},
{
"epoch": 0.48928585203510955,
"grad_norm": 1.741739273071289,
"learning_rate": 0.0001339153944487709,
"loss": 4.4863,
"num_input_tokens_seen": 259449344,
"step": 44400
},
{
"epoch": 0.49093884477847144,
"grad_norm": 1.9869145154953003,
"learning_rate": 0.0001338272330546454,
"loss": 4.48,
"num_input_tokens_seen": 260317888,
"step": 44550
},
{
"epoch": 0.4925918375218333,
"grad_norm": 1.9364657402038574,
"learning_rate": 0.00013373907166051987,
"loss": 4.477,
"num_input_tokens_seen": 261198656,
"step": 44700
},
{
"epoch": 0.4942448302651951,
"grad_norm": 1.946509599685669,
"learning_rate": 0.00013365091026639437,
"loss": 4.4762,
"num_input_tokens_seen": 262082432,
"step": 44850
},
{
"epoch": 0.495897823008557,
"grad_norm": 1.7882816791534424,
"learning_rate": 0.00013356274887226885,
"loss": 4.478,
"num_input_tokens_seen": 262961504,
"step": 45000
},
{
"epoch": 0.49755081575191884,
"grad_norm": 1.8857409954071045,
"learning_rate": 0.00013347458747814332,
"loss": 4.48,
"num_input_tokens_seen": 263810336,
"step": 45150
},
{
"epoch": 0.49920380849528073,
"grad_norm": 1.8983428478240967,
"learning_rate": 0.00013338642608401783,
"loss": 4.4792,
"num_input_tokens_seen": 264684512,
"step": 45300
},
{
"epoch": 0.5008568012386425,
"grad_norm": 1.8649096488952637,
"learning_rate": 0.0001332982646898923,
"loss": 4.473,
"num_input_tokens_seen": 265553056,
"step": 45450
},
{
"epoch": 0.5025097939820045,
"grad_norm": 1.8233070373535156,
"learning_rate": 0.0001332101032957668,
"loss": 4.4854,
"num_input_tokens_seen": 266438976,
"step": 45600
},
{
"epoch": 0.5041627867253663,
"grad_norm": 1.8780871629714966,
"learning_rate": 0.00013312194190164128,
"loss": 4.4806,
"num_input_tokens_seen": 267305760,
"step": 45750
},
{
"epoch": 0.5058157794687281,
"grad_norm": 1.9368178844451904,
"learning_rate": 0.00013303378050751578,
"loss": 4.4805,
"num_input_tokens_seen": 268185792,
"step": 45900
},
{
"epoch": 0.50746877221209,
"grad_norm": 1.8207215070724487,
"learning_rate": 0.00013294561911339026,
"loss": 4.4839,
"num_input_tokens_seen": 269074432,
"step": 46050
},
{
"epoch": 0.5091217649554518,
"grad_norm": 1.9221956729888916,
"learning_rate": 0.00013285745771926473,
"loss": 4.4724,
"num_input_tokens_seen": 269945728,
"step": 46200
},
{
"epoch": 0.5107747576988138,
"grad_norm": 1.8592416048049927,
"learning_rate": 0.00013276929632513923,
"loss": 4.4801,
"num_input_tokens_seen": 270826912,
"step": 46350
},
{
"epoch": 0.5124277504421756,
"grad_norm": 1.8793606758117676,
"learning_rate": 0.0001326811349310137,
"loss": 4.4832,
"num_input_tokens_seen": 271696640,
"step": 46500
},
{
"epoch": 0.5140807431855374,
"grad_norm": 1.9293113946914673,
"learning_rate": 0.0001325929735368882,
"loss": 4.4703,
"num_input_tokens_seen": 272577280,
"step": 46650
},
{
"epoch": 0.5157337359288993,
"grad_norm": 1.8582684993743896,
"learning_rate": 0.00013250481214276269,
"loss": 4.4785,
"num_input_tokens_seen": 273448128,
"step": 46800
},
{
"epoch": 0.5173867286722611,
"grad_norm": 1.8803818225860596,
"learning_rate": 0.0001324166507486372,
"loss": 4.4851,
"num_input_tokens_seen": 274313472,
"step": 46950
},
{
"epoch": 0.5190397214156229,
"grad_norm": 1.989717721939087,
"learning_rate": 0.00013232907709713918,
"loss": 4.4814,
"num_input_tokens_seen": 275168416,
"step": 47100
},
{
"epoch": 0.5206927141589849,
"grad_norm": 1.9770921468734741,
"learning_rate": 0.00013224091570301365,
"loss": 4.4825,
"num_input_tokens_seen": 276035776,
"step": 47250
},
{
"epoch": 0.5223457069023467,
"grad_norm": 1.8764408826828003,
"learning_rate": 0.00013215275430888816,
"loss": 4.4788,
"num_input_tokens_seen": 276909952,
"step": 47400
},
{
"epoch": 0.5239986996457086,
"grad_norm": 1.860009789466858,
"learning_rate": 0.00013206459291476263,
"loss": 4.4834,
"num_input_tokens_seen": 277795232,
"step": 47550
},
{
"epoch": 0.5256516923890704,
"grad_norm": 1.949278473854065,
"learning_rate": 0.00013197643152063713,
"loss": 4.4752,
"num_input_tokens_seen": 278690720,
"step": 47700
},
{
"epoch": 0.5273046851324322,
"grad_norm": 1.868780255317688,
"learning_rate": 0.0001318882701265116,
"loss": 4.4797,
"num_input_tokens_seen": 279567232,
"step": 47850
},
{
"epoch": 0.5289576778757942,
"grad_norm": 1.7434320449829102,
"learning_rate": 0.0001318001087323861,
"loss": 4.4726,
"num_input_tokens_seen": 280424832,
"step": 48000
},
{
"epoch": 0.530610670619156,
"grad_norm": 1.8644661903381348,
"learning_rate": 0.00013171194733826059,
"loss": 4.4648,
"num_input_tokens_seen": 281294752,
"step": 48150
},
{
"epoch": 0.5322636633625178,
"grad_norm": 1.9029775857925415,
"learning_rate": 0.00013162378594413506,
"loss": 4.4791,
"num_input_tokens_seen": 282156544,
"step": 48300
},
{
"epoch": 0.5339166561058797,
"grad_norm": 1.7862669229507446,
"learning_rate": 0.00013153562455000956,
"loss": 4.4688,
"num_input_tokens_seen": 283033248,
"step": 48450
},
{
"epoch": 0.5355696488492415,
"grad_norm": 1.7603411674499512,
"learning_rate": 0.00013144746315588404,
"loss": 4.4807,
"num_input_tokens_seen": 283922048,
"step": 48600
},
{
"epoch": 0.5372226415926034,
"grad_norm": 1.868235468864441,
"learning_rate": 0.00013135930176175854,
"loss": 4.4702,
"num_input_tokens_seen": 284800416,
"step": 48750
},
{
"epoch": 0.5388756343359653,
"grad_norm": 1.864640474319458,
"learning_rate": 0.00013127114036763301,
"loss": 4.479,
"num_input_tokens_seen": 285673888,
"step": 48900
},
{
"epoch": 0.5405286270793271,
"grad_norm": 1.7705098390579224,
"learning_rate": 0.00013118297897350752,
"loss": 4.4782,
"num_input_tokens_seen": 286553856,
"step": 49050
},
{
"epoch": 0.542181619822689,
"grad_norm": 1.9763901233673096,
"learning_rate": 0.000131094817579382,
"loss": 4.4654,
"num_input_tokens_seen": 287436736,
"step": 49200
},
{
"epoch": 0.5438346125660508,
"grad_norm": 1.905661702156067,
"learning_rate": 0.0001310066561852565,
"loss": 4.4718,
"num_input_tokens_seen": 288306976,
"step": 49350
},
{
"epoch": 0.5454876053094126,
"grad_norm": 1.8861178159713745,
"learning_rate": 0.00013091849479113097,
"loss": 4.4618,
"num_input_tokens_seen": 289176352,
"step": 49500
},
{
"epoch": 0.5471405980527746,
"grad_norm": 1.8697651624679565,
"learning_rate": 0.00013083033339700547,
"loss": 4.4725,
"num_input_tokens_seen": 290061760,
"step": 49650
},
{
"epoch": 0.5487935907961364,
"grad_norm": 1.8051494359970093,
"learning_rate": 0.00013074217200287995,
"loss": 4.459,
"num_input_tokens_seen": 290924640,
"step": 49800
},
{
"epoch": 0.5504465835394983,
"grad_norm": 1.7766984701156616,
"learning_rate": 0.00013065401060875445,
"loss": 4.4729,
"num_input_tokens_seen": 291801280,
"step": 49950
},
{
"epoch": 0.5520995762828601,
"grad_norm": 1.7969352006912231,
"learning_rate": 0.00013056584921462892,
"loss": 4.4789,
"num_input_tokens_seen": 292673632,
"step": 50100
},
{
"epoch": 0.5537525690262219,
"grad_norm": 1.8939694166183472,
"learning_rate": 0.00013047768782050343,
"loss": 4.4674,
"num_input_tokens_seen": 293542176,
"step": 50250
},
{
"epoch": 0.5554055617695838,
"grad_norm": 1.8721749782562256,
"learning_rate": 0.0001303895264263779,
"loss": 4.4591,
"num_input_tokens_seen": 294415424,
"step": 50400
},
{
"epoch": 0.5570585545129457,
"grad_norm": 1.7810068130493164,
"learning_rate": 0.0001303013650322524,
"loss": 4.475,
"num_input_tokens_seen": 295292384,
"step": 50550
},
{
"epoch": 0.5587115472563076,
"grad_norm": 1.8184670209884644,
"learning_rate": 0.00013021320363812688,
"loss": 4.473,
"num_input_tokens_seen": 296168640,
"step": 50700
},
{
"epoch": 0.5603645399996694,
"grad_norm": 1.8167736530303955,
"learning_rate": 0.00013012504224400138,
"loss": 4.4604,
"num_input_tokens_seen": 297035904,
"step": 50850
},
{
"epoch": 0.5620175327430312,
"grad_norm": 1.902001142501831,
"learning_rate": 0.00013003688084987585,
"loss": 4.4653,
"num_input_tokens_seen": 297893248,
"step": 51000
},
{
"epoch": 0.5636705254863931,
"grad_norm": 1.7989624738693237,
"learning_rate": 0.00012994930719837785,
"loss": 4.4581,
"num_input_tokens_seen": 298773088,
"step": 51150
},
{
"epoch": 0.565323518229755,
"grad_norm": 1.9608672857284546,
"learning_rate": 0.00012986114580425232,
"loss": 4.4553,
"num_input_tokens_seen": 299639712,
"step": 51300
},
{
"epoch": 0.5669765109731169,
"grad_norm": 1.8063366413116455,
"learning_rate": 0.00012977298441012682,
"loss": 4.4748,
"num_input_tokens_seen": 300504576,
"step": 51450
},
{
"epoch": 0.5686295037164787,
"grad_norm": 1.7892522811889648,
"learning_rate": 0.0001296848230160013,
"loss": 4.4759,
"num_input_tokens_seen": 301381600,
"step": 51600
},
{
"epoch": 0.5702824964598405,
"grad_norm": 1.9009445905685425,
"learning_rate": 0.00012959666162187577,
"loss": 4.4643,
"num_input_tokens_seen": 302264064,
"step": 51750
},
{
"epoch": 0.5719354892032024,
"grad_norm": 1.8855173587799072,
"learning_rate": 0.00012950850022775027,
"loss": 4.4569,
"num_input_tokens_seen": 303131648,
"step": 51900
},
{
"epoch": 0.5735884819465642,
"grad_norm": 1.8198268413543701,
"learning_rate": 0.00012942033883362475,
"loss": 4.465,
"num_input_tokens_seen": 304010848,
"step": 52050
},
{
"epoch": 0.5752414746899261,
"grad_norm": 1.8278980255126953,
"learning_rate": 0.00012933217743949925,
"loss": 4.4674,
"num_input_tokens_seen": 304901952,
"step": 52200
},
{
"epoch": 0.576894467433288,
"grad_norm": 1.8281151056289673,
"learning_rate": 0.00012924401604537373,
"loss": 4.4718,
"num_input_tokens_seen": 305763424,
"step": 52350
},
{
"epoch": 0.5785474601766498,
"grad_norm": 1.8816980123519897,
"learning_rate": 0.00012915585465124823,
"loss": 4.4726,
"num_input_tokens_seen": 306641856,
"step": 52500
},
{
"epoch": 0.5802004529200117,
"grad_norm": 1.8755218982696533,
"learning_rate": 0.0001290676932571227,
"loss": 4.4744,
"num_input_tokens_seen": 307520288,
"step": 52650
},
{
"epoch": 0.5818534456633735,
"grad_norm": 1.8329659700393677,
"learning_rate": 0.0001289795318629972,
"loss": 4.4572,
"num_input_tokens_seen": 308410624,
"step": 52800
},
{
"epoch": 0.5835064384067354,
"grad_norm": 1.9654128551483154,
"learning_rate": 0.00012889137046887168,
"loss": 4.4671,
"num_input_tokens_seen": 309288736,
"step": 52950
},
{
"epoch": 0.5851594311500973,
"grad_norm": 1.8310860395431519,
"learning_rate": 0.00012880320907474618,
"loss": 4.4684,
"num_input_tokens_seen": 310165440,
"step": 53100
},
{
"epoch": 0.5868124238934591,
"grad_norm": 1.8104560375213623,
"learning_rate": 0.00012871563542324817,
"loss": 4.4546,
"num_input_tokens_seen": 311044352,
"step": 53250
},
{
"epoch": 0.588465416636821,
"grad_norm": 1.8414585590362549,
"learning_rate": 0.00012862747402912265,
"loss": 4.4625,
"num_input_tokens_seen": 311919424,
"step": 53400
},
{
"epoch": 0.5901184093801828,
"grad_norm": 1.724381685256958,
"learning_rate": 0.00012853931263499715,
"loss": 4.4632,
"num_input_tokens_seen": 312803712,
"step": 53550
},
{
"epoch": 0.5917714021235447,
"grad_norm": 1.7701301574707031,
"learning_rate": 0.00012845115124087163,
"loss": 4.4609,
"num_input_tokens_seen": 313672800,
"step": 53700
},
{
"epoch": 0.5934243948669066,
"grad_norm": 1.8755768537521362,
"learning_rate": 0.00012836298984674613,
"loss": 4.461,
"num_input_tokens_seen": 314543776,
"step": 53850
},
{
"epoch": 0.5950773876102684,
"grad_norm": 1.8842816352844238,
"learning_rate": 0.0001282748284526206,
"loss": 4.462,
"num_input_tokens_seen": 315413216,
"step": 54000
},
{
"epoch": 0.5967303803536302,
"grad_norm": 1.8173580169677734,
"learning_rate": 0.0001281866670584951,
"loss": 4.4595,
"num_input_tokens_seen": 316286592,
"step": 54150
},
{
"epoch": 0.5983833730969921,
"grad_norm": 1.8613582849502563,
"learning_rate": 0.00012809850566436958,
"loss": 4.4729,
"num_input_tokens_seen": 317171968,
"step": 54300
},
{
"epoch": 0.6000363658403539,
"grad_norm": 1.8345390558242798,
"learning_rate": 0.00012801034427024408,
"loss": 4.4558,
"num_input_tokens_seen": 318069504,
"step": 54450
},
{
"epoch": 0.6016893585837159,
"grad_norm": 1.9001188278198242,
"learning_rate": 0.00012792218287611856,
"loss": 4.4493,
"num_input_tokens_seen": 318929088,
"step": 54600
},
{
"epoch": 0.6033423513270777,
"grad_norm": 1.7820019721984863,
"learning_rate": 0.00012783402148199306,
"loss": 4.4529,
"num_input_tokens_seen": 319802400,
"step": 54750
},
{
"epoch": 0.6049953440704395,
"grad_norm": 1.8836514949798584,
"learning_rate": 0.00012774586008786754,
"loss": 4.4601,
"num_input_tokens_seen": 320667168,
"step": 54900
},
{
"epoch": 0.6066483368138014,
"grad_norm": 1.820078730583191,
"learning_rate": 0.00012765828643636953,
"loss": 4.4583,
"num_input_tokens_seen": 321544160,
"step": 55050
},
{
"epoch": 0.6083013295571632,
"grad_norm": 1.7549668550491333,
"learning_rate": 0.000127570125042244,
"loss": 4.4507,
"num_input_tokens_seen": 322413504,
"step": 55200
},
{
"epoch": 0.6099543223005252,
"grad_norm": 1.819643497467041,
"learning_rate": 0.0001274819636481185,
"loss": 4.4572,
"num_input_tokens_seen": 323301120,
"step": 55350
},
{
"epoch": 0.611607315043887,
"grad_norm": 1.8832948207855225,
"learning_rate": 0.00012739380225399298,
"loss": 4.4499,
"num_input_tokens_seen": 324179040,
"step": 55500
},
{
"epoch": 0.6132603077872488,
"grad_norm": 1.9329428672790527,
"learning_rate": 0.00012730564085986748,
"loss": 4.4598,
"num_input_tokens_seen": 325044704,
"step": 55650
},
{
"epoch": 0.6149133005306107,
"grad_norm": 1.8368948698043823,
"learning_rate": 0.00012721747946574196,
"loss": 4.4636,
"num_input_tokens_seen": 325903360,
"step": 55800
},
{
"epoch": 0.6165662932739725,
"grad_norm": 1.8610767126083374,
"learning_rate": 0.00012712931807161646,
"loss": 4.4602,
"num_input_tokens_seen": 326791648,
"step": 55950
},
{
"epoch": 0.6182192860173343,
"grad_norm": 1.853129506111145,
"learning_rate": 0.00012704115667749093,
"loss": 4.4581,
"num_input_tokens_seen": 327669216,
"step": 56100
},
{
"epoch": 0.6198722787606963,
"grad_norm": 1.8630894422531128,
"learning_rate": 0.0001269529952833654,
"loss": 4.4584,
"num_input_tokens_seen": 328547424,
"step": 56250
},
{
"epoch": 0.6215252715040581,
"grad_norm": 1.8581258058547974,
"learning_rate": 0.0001268648338892399,
"loss": 4.4632,
"num_input_tokens_seen": 329414400,
"step": 56400
},
{
"epoch": 0.62317826424742,
"grad_norm": 1.8294817209243774,
"learning_rate": 0.00012677667249511439,
"loss": 4.4634,
"num_input_tokens_seen": 330297376,
"step": 56550
},
{
"epoch": 0.6248312569907818,
"grad_norm": 1.9625203609466553,
"learning_rate": 0.0001266885111009889,
"loss": 4.4598,
"num_input_tokens_seen": 331181792,
"step": 56700
},
{
"epoch": 0.6264842497341436,
"grad_norm": 1.821718454360962,
"learning_rate": 0.00012660034970686336,
"loss": 4.4549,
"num_input_tokens_seen": 332056416,
"step": 56850
},
{
"epoch": 0.6281372424775056,
"grad_norm": 1.8366010189056396,
"learning_rate": 0.00012651218831273786,
"loss": 4.4418,
"num_input_tokens_seen": 332908864,
"step": 57000
},
{
"epoch": 0.6297902352208674,
"grad_norm": 1.858789086341858,
"learning_rate": 0.00012642402691861234,
"loss": 4.4592,
"num_input_tokens_seen": 333791520,
"step": 57150
},
{
"epoch": 0.6314432279642292,
"grad_norm": 1.9188382625579834,
"learning_rate": 0.00012633586552448684,
"loss": 4.4531,
"num_input_tokens_seen": 334675488,
"step": 57300
},
{
"epoch": 0.6330962207075911,
"grad_norm": 1.8480638265609741,
"learning_rate": 0.00012624770413036132,
"loss": 4.4557,
"num_input_tokens_seen": 335569664,
"step": 57450
},
{
"epoch": 0.6347492134509529,
"grad_norm": 1.8409630060195923,
"learning_rate": 0.00012615954273623582,
"loss": 4.454,
"num_input_tokens_seen": 336438752,
"step": 57600
},
{
"epoch": 0.6364022061943148,
"grad_norm": 1.7734564542770386,
"learning_rate": 0.0001260713813421103,
"loss": 4.4587,
"num_input_tokens_seen": 337318464,
"step": 57750
},
{
"epoch": 0.6380551989376767,
"grad_norm": 1.8258734941482544,
"learning_rate": 0.0001259832199479848,
"loss": 4.4501,
"num_input_tokens_seen": 338196416,
"step": 57900
},
{
"epoch": 0.6397081916810385,
"grad_norm": 1.8730100393295288,
"learning_rate": 0.00012589505855385927,
"loss": 4.4508,
"num_input_tokens_seen": 339066272,
"step": 58050
},
{
"epoch": 0.6413611844244004,
"grad_norm": 1.7968626022338867,
"learning_rate": 0.00012580689715973375,
"loss": 4.4465,
"num_input_tokens_seen": 339940576,
"step": 58200
},
{
"epoch": 0.6430141771677622,
"grad_norm": 1.8305721282958984,
"learning_rate": 0.00012571873576560825,
"loss": 4.4452,
"num_input_tokens_seen": 340827872,
"step": 58350
},
{
"epoch": 0.644667169911124,
"grad_norm": 1.8106398582458496,
"learning_rate": 0.00012563057437148272,
"loss": 4.4436,
"num_input_tokens_seen": 341710720,
"step": 58500
},
{
"epoch": 0.646320162654486,
"grad_norm": 1.8428856134414673,
"learning_rate": 0.00012554300071998474,
"loss": 4.4607,
"num_input_tokens_seen": 342592992,
"step": 58650
},
{
"epoch": 0.6479731553978478,
"grad_norm": 1.89970064163208,
"learning_rate": 0.00012545483932585922,
"loss": 4.446,
"num_input_tokens_seen": 343458496,
"step": 58800
},
{
"epoch": 0.6496261481412097,
"grad_norm": 1.887024998664856,
"learning_rate": 0.00012536667793173372,
"loss": 4.4438,
"num_input_tokens_seen": 344324192,
"step": 58950
},
{
"epoch": 0.6512791408845715,
"grad_norm": 1.751080870628357,
"learning_rate": 0.0001252785165376082,
"loss": 4.4439,
"num_input_tokens_seen": 345204320,
"step": 59100
},
{
"epoch": 0.6529321336279333,
"grad_norm": 1.8455328941345215,
"learning_rate": 0.0001251903551434827,
"loss": 4.4446,
"num_input_tokens_seen": 346100960,
"step": 59250
},
{
"epoch": 0.6545851263712952,
"grad_norm": 1.9079509973526,
"learning_rate": 0.00012510219374935717,
"loss": 4.4441,
"num_input_tokens_seen": 346981376,
"step": 59400
},
{
"epoch": 0.6562381191146571,
"grad_norm": 1.8034120798110962,
"learning_rate": 0.00012501403235523167,
"loss": 4.4555,
"num_input_tokens_seen": 347855584,
"step": 59550
},
{
"epoch": 0.657891111858019,
"grad_norm": 1.7936707735061646,
"learning_rate": 0.00012492587096110615,
"loss": 4.4493,
"num_input_tokens_seen": 348733664,
"step": 59700
},
{
"epoch": 0.6595441046013808,
"grad_norm": 1.80596923828125,
"learning_rate": 0.00012483770956698065,
"loss": 4.4445,
"num_input_tokens_seen": 349592128,
"step": 59850
},
{
"epoch": 0.6611970973447426,
"grad_norm": 1.7837984561920166,
"learning_rate": 0.00012474954817285512,
"loss": 4.4472,
"num_input_tokens_seen": 350464672,
"step": 60000
},
{
"epoch": 0.6628500900881045,
"grad_norm": 1.8550629615783691,
"learning_rate": 0.0001246613867787296,
"loss": 4.4436,
"num_input_tokens_seen": 351360576,
"step": 60150
},
{
"epoch": 0.6645030828314664,
"grad_norm": 1.8099464178085327,
"learning_rate": 0.0001245738131272316,
"loss": 4.4439,
"num_input_tokens_seen": 352250944,
"step": 60300
},
{
"epoch": 0.6661560755748283,
"grad_norm": 1.869233250617981,
"learning_rate": 0.0001244856517331061,
"loss": 4.45,
"num_input_tokens_seen": 353120608,
"step": 60450
},
{
"epoch": 0.6678090683181901,
"grad_norm": 1.8628960847854614,
"learning_rate": 0.00012439749033898057,
"loss": 4.4446,
"num_input_tokens_seen": 353986944,
"step": 60600
},
{
"epoch": 0.6694620610615519,
"grad_norm": 1.7935791015625,
"learning_rate": 0.00012430932894485504,
"loss": 4.4481,
"num_input_tokens_seen": 354851456,
"step": 60750
},
{
"epoch": 0.6711150538049138,
"grad_norm": 1.919735074043274,
"learning_rate": 0.00012422116755072955,
"loss": 4.4491,
"num_input_tokens_seen": 355735616,
"step": 60900
},
{
"epoch": 0.6727680465482756,
"grad_norm": 1.9296785593032837,
"learning_rate": 0.00012413300615660402,
"loss": 4.4384,
"num_input_tokens_seen": 356613408,
"step": 61050
},
{
"epoch": 0.6744210392916375,
"grad_norm": 1.8167061805725098,
"learning_rate": 0.00012404484476247852,
"loss": 4.4326,
"num_input_tokens_seen": 357463840,
"step": 61200
},
{
"epoch": 0.6760740320349994,
"grad_norm": 1.86695396900177,
"learning_rate": 0.000123956683368353,
"loss": 4.4501,
"num_input_tokens_seen": 358354464,
"step": 61350
},
{
"epoch": 0.6777270247783612,
"grad_norm": 1.8627629280090332,
"learning_rate": 0.0001238685219742275,
"loss": 4.4496,
"num_input_tokens_seen": 359222016,
"step": 61500
},
{
"epoch": 0.6793800175217231,
"grad_norm": 1.8496758937835693,
"learning_rate": 0.00012378036058010197,
"loss": 4.4505,
"num_input_tokens_seen": 360112096,
"step": 61650
},
{
"epoch": 0.6810330102650849,
"grad_norm": 1.8193156719207764,
"learning_rate": 0.00012369219918597645,
"loss": 4.452,
"num_input_tokens_seen": 360995520,
"step": 61800
},
{
"epoch": 0.6826860030084468,
"grad_norm": 1.7519707679748535,
"learning_rate": 0.00012360403779185095,
"loss": 4.4439,
"num_input_tokens_seen": 361873184,
"step": 61950
},
{
"epoch": 0.6843389957518087,
"grad_norm": 1.9227124452590942,
"learning_rate": 0.00012351587639772543,
"loss": 4.4416,
"num_input_tokens_seen": 362749312,
"step": 62100
},
{
"epoch": 0.6859919884951705,
"grad_norm": 1.8492848873138428,
"learning_rate": 0.00012342771500359993,
"loss": 4.4541,
"num_input_tokens_seen": 363635936,
"step": 62250
},
{
"epoch": 0.6876449812385323,
"grad_norm": 1.946057677268982,
"learning_rate": 0.0001233395536094744,
"loss": 4.435,
"num_input_tokens_seen": 364500576,
"step": 62400
},
{
"epoch": 0.6892979739818942,
"grad_norm": 1.8880736827850342,
"learning_rate": 0.0001232513922153489,
"loss": 4.4442,
"num_input_tokens_seen": 365363744,
"step": 62550
},
{
"epoch": 0.690950966725256,
"grad_norm": 1.864534854888916,
"learning_rate": 0.00012316323082122338,
"loss": 4.4398,
"num_input_tokens_seen": 366253600,
"step": 62700
},
{
"epoch": 0.692603959468618,
"grad_norm": 1.8077435493469238,
"learning_rate": 0.00012307506942709788,
"loss": 4.4462,
"num_input_tokens_seen": 367119136,
"step": 62850
},
{
"epoch": 0.6942569522119798,
"grad_norm": 1.8797168731689453,
"learning_rate": 0.00012298690803297236,
"loss": 4.4535,
"num_input_tokens_seen": 367998656,
"step": 63000
},
{
"epoch": 0.6959099449553416,
"grad_norm": 1.9124201536178589,
"learning_rate": 0.00012289874663884686,
"loss": 4.4314,
"num_input_tokens_seen": 368873888,
"step": 63150
},
{
"epoch": 0.6975629376987035,
"grad_norm": 1.919708013534546,
"learning_rate": 0.00012281058524472134,
"loss": 4.4524,
"num_input_tokens_seen": 369761216,
"step": 63300
},
{
"epoch": 0.6992159304420653,
"grad_norm": 1.8248168230056763,
"learning_rate": 0.00012272242385059584,
"loss": 4.4422,
"num_input_tokens_seen": 370638688,
"step": 63450
},
{
"epoch": 0.7008689231854273,
"grad_norm": 1.810051441192627,
"learning_rate": 0.0001226342624564703,
"loss": 4.4313,
"num_input_tokens_seen": 371541344,
"step": 63600
},
{
"epoch": 0.7025219159287891,
"grad_norm": 1.8361635208129883,
"learning_rate": 0.00012254610106234481,
"loss": 4.436,
"num_input_tokens_seen": 372415168,
"step": 63750
},
{
"epoch": 0.7041749086721509,
"grad_norm": 1.8005433082580566,
"learning_rate": 0.0001224579396682193,
"loss": 4.4353,
"num_input_tokens_seen": 373283872,
"step": 63900
},
{
"epoch": 0.7058279014155128,
"grad_norm": 1.8291569948196411,
"learning_rate": 0.0001223697782740938,
"loss": 4.4486,
"num_input_tokens_seen": 374156960,
"step": 64050
},
{
"epoch": 0.7074808941588746,
"grad_norm": 1.6987590789794922,
"learning_rate": 0.00012228161687996827,
"loss": 4.4402,
"num_input_tokens_seen": 375045632,
"step": 64200
},
{
"epoch": 0.7091338869022366,
"grad_norm": 1.8456915616989136,
"learning_rate": 0.00012219345548584277,
"loss": 4.4502,
"num_input_tokens_seen": 375936576,
"step": 64350
},
{
"epoch": 0.7107868796455984,
"grad_norm": 1.9141839742660522,
"learning_rate": 0.00012210529409171724,
"loss": 4.4521,
"num_input_tokens_seen": 376816512,
"step": 64500
},
{
"epoch": 0.7124398723889602,
"grad_norm": 1.8822457790374756,
"learning_rate": 0.00012201713269759175,
"loss": 4.4347,
"num_input_tokens_seen": 377684448,
"step": 64650
},
{
"epoch": 0.7140928651323221,
"grad_norm": 1.8143234252929688,
"learning_rate": 0.00012192897130346622,
"loss": 4.4336,
"num_input_tokens_seen": 378553120,
"step": 64800
},
{
"epoch": 0.7157458578756839,
"grad_norm": 1.8877683877944946,
"learning_rate": 0.00012184080990934071,
"loss": 4.4425,
"num_input_tokens_seen": 379413888,
"step": 64950
},
{
"epoch": 0.7173988506190457,
"grad_norm": 1.8746610879898071,
"learning_rate": 0.0001217526485152152,
"loss": 4.4417,
"num_input_tokens_seen": 380290304,
"step": 65100
},
{
"epoch": 0.7190518433624077,
"grad_norm": 2.0395500659942627,
"learning_rate": 0.00012166448712108969,
"loss": 4.4423,
"num_input_tokens_seen": 381185920,
"step": 65250
},
{
"epoch": 0.7207048361057695,
"grad_norm": 1.992492914199829,
"learning_rate": 0.00012157632572696418,
"loss": 4.4484,
"num_input_tokens_seen": 382075808,
"step": 65400
},
{
"epoch": 0.7223578288491314,
"grad_norm": 1.8621459007263184,
"learning_rate": 0.00012148816433283866,
"loss": 4.4274,
"num_input_tokens_seen": 382955232,
"step": 65550
},
{
"epoch": 0.7240108215924932,
"grad_norm": 1.8787345886230469,
"learning_rate": 0.00012140000293871315,
"loss": 4.4378,
"num_input_tokens_seen": 383834592,
"step": 65700
},
{
"epoch": 0.725663814335855,
"grad_norm": 1.8640894889831543,
"learning_rate": 0.00012131184154458764,
"loss": 4.4557,
"num_input_tokens_seen": 384710016,
"step": 65850
},
{
"epoch": 0.727316807079217,
"grad_norm": 1.918143630027771,
"learning_rate": 0.00012122368015046212,
"loss": 4.4467,
"num_input_tokens_seen": 385593120,
"step": 66000
},
{
"epoch": 0.7289697998225788,
"grad_norm": 1.8295505046844482,
"learning_rate": 0.00012113551875633662,
"loss": 4.4257,
"num_input_tokens_seen": 386460160,
"step": 66150
},
{
"epoch": 0.7306227925659406,
"grad_norm": 1.880216360092163,
"learning_rate": 0.00012104794510483861,
"loss": 4.4312,
"num_input_tokens_seen": 387328000,
"step": 66300
},
{
"epoch": 0.7322757853093025,
"grad_norm": 1.818788766860962,
"learning_rate": 0.00012095978371071308,
"loss": 4.4402,
"num_input_tokens_seen": 388200192,
"step": 66450
},
{
"epoch": 0.7339287780526643,
"grad_norm": 1.82147216796875,
"learning_rate": 0.00012087162231658759,
"loss": 4.4468,
"num_input_tokens_seen": 389079264,
"step": 66600
},
{
"epoch": 0.7355817707960262,
"grad_norm": 1.8930702209472656,
"learning_rate": 0.00012078346092246206,
"loss": 4.4362,
"num_input_tokens_seen": 389974112,
"step": 66750
},
{
"epoch": 0.7372347635393881,
"grad_norm": 1.8484946489334106,
"learning_rate": 0.00012069529952833656,
"loss": 4.4409,
"num_input_tokens_seen": 390875168,
"step": 66900
},
{
"epoch": 0.7388877562827499,
"grad_norm": 1.894093632698059,
"learning_rate": 0.00012060713813421104,
"loss": 4.4197,
"num_input_tokens_seen": 391736384,
"step": 67050
},
{
"epoch": 0.7405407490261118,
"grad_norm": 1.918149471282959,
"learning_rate": 0.00012051897674008553,
"loss": 4.4487,
"num_input_tokens_seen": 392626688,
"step": 67200
},
{
"epoch": 0.7421937417694736,
"grad_norm": 1.8563427925109863,
"learning_rate": 0.00012043081534596002,
"loss": 4.4375,
"num_input_tokens_seen": 393508640,
"step": 67350
},
{
"epoch": 0.7438467345128355,
"grad_norm": 1.8275529146194458,
"learning_rate": 0.0001203426539518345,
"loss": 4.4333,
"num_input_tokens_seen": 394397120,
"step": 67500
},
{
"epoch": 0.7454997272561974,
"grad_norm": 1.824823260307312,
"learning_rate": 0.00012025449255770899,
"loss": 4.4213,
"num_input_tokens_seen": 395273408,
"step": 67650
},
{
"epoch": 0.7471527199995592,
"grad_norm": 1.7815489768981934,
"learning_rate": 0.00012016633116358348,
"loss": 4.4295,
"num_input_tokens_seen": 396151584,
"step": 67800
},
{
"epoch": 0.7488057127429211,
"grad_norm": 1.9288073778152466,
"learning_rate": 0.00012007816976945797,
"loss": 4.4348,
"num_input_tokens_seen": 397006880,
"step": 67950
},
{
"epoch": 0.7504587054862829,
"grad_norm": 1.866746425628662,
"learning_rate": 0.00011999000837533245,
"loss": 4.4306,
"num_input_tokens_seen": 397879072,
"step": 68100
},
{
"epoch": 0.7521116982296447,
"grad_norm": 1.8168858289718628,
"learning_rate": 0.00011990184698120693,
"loss": 4.4321,
"num_input_tokens_seen": 398772672,
"step": 68250
},
{
"epoch": 0.7537646909730066,
"grad_norm": 1.7801350355148315,
"learning_rate": 0.00011981368558708142,
"loss": 4.4358,
"num_input_tokens_seen": 399663136,
"step": 68400
},
{
"epoch": 0.7554176837163685,
"grad_norm": 1.9442716836929321,
"learning_rate": 0.00011972611193558343,
"loss": 4.4357,
"num_input_tokens_seen": 400543936,
"step": 68550
},
{
"epoch": 0.7570706764597304,
"grad_norm": 1.8754234313964844,
"learning_rate": 0.0001196379505414579,
"loss": 4.4279,
"num_input_tokens_seen": 401411136,
"step": 68700
},
{
"epoch": 0.7587236692030922,
"grad_norm": 1.8986996412277222,
"learning_rate": 0.0001195497891473324,
"loss": 4.4345,
"num_input_tokens_seen": 402290464,
"step": 68850
},
{
"epoch": 0.760376661946454,
"grad_norm": 1.8807158470153809,
"learning_rate": 0.00011946162775320688,
"loss": 4.4329,
"num_input_tokens_seen": 403176768,
"step": 69000
},
{
"epoch": 0.7620296546898159,
"grad_norm": 1.8661843538284302,
"learning_rate": 0.00011937346635908138,
"loss": 4.4327,
"num_input_tokens_seen": 404053888,
"step": 69150
},
{
"epoch": 0.7636826474331778,
"grad_norm": 1.9022386074066162,
"learning_rate": 0.00011928530496495586,
"loss": 4.4304,
"num_input_tokens_seen": 404951328,
"step": 69300
},
{
"epoch": 0.7653356401765397,
"grad_norm": 1.9497708082199097,
"learning_rate": 0.00011919714357083035,
"loss": 4.4319,
"num_input_tokens_seen": 405824128,
"step": 69450
},
{
"epoch": 0.7669886329199015,
"grad_norm": 1.7283419370651245,
"learning_rate": 0.00011910898217670483,
"loss": 4.4222,
"num_input_tokens_seen": 406694592,
"step": 69600
},
{
"epoch": 0.7686416256632633,
"grad_norm": 1.8692352771759033,
"learning_rate": 0.00011902082078257931,
"loss": 4.4257,
"num_input_tokens_seen": 407577856,
"step": 69750
},
{
"epoch": 0.7702946184066252,
"grad_norm": 1.918215036392212,
"learning_rate": 0.00011893265938845381,
"loss": 4.4275,
"num_input_tokens_seen": 408455424,
"step": 69900
},
{
"epoch": 0.771947611149987,
"grad_norm": 1.8184279203414917,
"learning_rate": 0.00011884449799432829,
"loss": 4.4287,
"num_input_tokens_seen": 409330752,
"step": 70050
},
{
"epoch": 0.773600603893349,
"grad_norm": 1.846740961074829,
"learning_rate": 0.00011875633660020279,
"loss": 4.4244,
"num_input_tokens_seen": 410215104,
"step": 70200
},
{
"epoch": 0.7752535966367108,
"grad_norm": 1.9468152523040771,
"learning_rate": 0.00011866817520607726,
"loss": 4.4223,
"num_input_tokens_seen": 411089696,
"step": 70350
},
{
"epoch": 0.7769065893800726,
"grad_norm": 1.87180495262146,
"learning_rate": 0.00011858001381195175,
"loss": 4.4511,
"num_input_tokens_seen": 411988000,
"step": 70500
},
{
"epoch": 0.7785595821234345,
"grad_norm": 1.8375773429870605,
"learning_rate": 0.00011849185241782624,
"loss": 4.4361,
"num_input_tokens_seen": 412876544,
"step": 70650
},
{
"epoch": 0.7802125748667963,
"grad_norm": 1.7592004537582397,
"learning_rate": 0.00011840369102370073,
"loss": 4.4272,
"num_input_tokens_seen": 413737824,
"step": 70800
},
{
"epoch": 0.7818655676101582,
"grad_norm": 1.9243676662445068,
"learning_rate": 0.00011831552962957522,
"loss": 4.4343,
"num_input_tokens_seen": 414613184,
"step": 70950
},
{
"epoch": 0.7835185603535201,
"grad_norm": 1.9014674425125122,
"learning_rate": 0.0001182273682354497,
"loss": 4.4217,
"num_input_tokens_seen": 415494144,
"step": 71100
},
{
"epoch": 0.7851715530968819,
"grad_norm": 1.8528156280517578,
"learning_rate": 0.0001181397945839517,
"loss": 4.4323,
"num_input_tokens_seen": 416354240,
"step": 71250
},
{
"epoch": 0.7868245458402437,
"grad_norm": 1.7702356576919556,
"learning_rate": 0.0001180516331898262,
"loss": 4.4335,
"num_input_tokens_seen": 417214944,
"step": 71400
},
{
"epoch": 0.7884775385836056,
"grad_norm": 1.893778920173645,
"learning_rate": 0.00011796347179570067,
"loss": 4.4288,
"num_input_tokens_seen": 418073984,
"step": 71550
},
{
"epoch": 0.7901305313269674,
"grad_norm": 1.8179432153701782,
"learning_rate": 0.00011787531040157515,
"loss": 4.4161,
"num_input_tokens_seen": 418956256,
"step": 71700
},
{
"epoch": 0.7917835240703294,
"grad_norm": 1.8786159753799438,
"learning_rate": 0.00011778714900744965,
"loss": 4.4224,
"num_input_tokens_seen": 419832544,
"step": 71850
},
{
"epoch": 0.7934365168136912,
"grad_norm": 1.864493727684021,
"learning_rate": 0.00011769898761332413,
"loss": 4.4294,
"num_input_tokens_seen": 420706176,
"step": 72000
},
{
"epoch": 0.795089509557053,
"grad_norm": 1.7827798128128052,
"learning_rate": 0.00011761082621919863,
"loss": 4.4265,
"num_input_tokens_seen": 421594880,
"step": 72150
},
{
"epoch": 0.7967425023004149,
"grad_norm": 1.8714325428009033,
"learning_rate": 0.0001175226648250731,
"loss": 4.4428,
"num_input_tokens_seen": 422456256,
"step": 72300
},
{
"epoch": 0.7983954950437767,
"grad_norm": 1.8954764604568481,
"learning_rate": 0.0001174345034309476,
"loss": 4.4198,
"num_input_tokens_seen": 423334208,
"step": 72450
},
{
"epoch": 0.8000484877871387,
"grad_norm": 1.9334732294082642,
"learning_rate": 0.00011734634203682208,
"loss": 4.4285,
"num_input_tokens_seen": 424227104,
"step": 72600
},
{
"epoch": 0.8017014805305005,
"grad_norm": 1.8234983682632446,
"learning_rate": 0.00011725818064269657,
"loss": 4.438,
"num_input_tokens_seen": 425093536,
"step": 72750
},
{
"epoch": 0.8033544732738623,
"grad_norm": 1.8719639778137207,
"learning_rate": 0.00011717001924857106,
"loss": 4.432,
"num_input_tokens_seen": 425967904,
"step": 72900
},
{
"epoch": 0.8050074660172242,
"grad_norm": 1.879062533378601,
"learning_rate": 0.00011708185785444555,
"loss": 4.4157,
"num_input_tokens_seen": 426848192,
"step": 73050
},
{
"epoch": 0.806660458760586,
"grad_norm": 1.8409887552261353,
"learning_rate": 0.00011699369646032003,
"loss": 4.4173,
"num_input_tokens_seen": 427730528,
"step": 73200
},
{
"epoch": 0.8083134515039478,
"grad_norm": 1.9242078065872192,
"learning_rate": 0.00011690553506619452,
"loss": 4.4296,
"num_input_tokens_seen": 428621792,
"step": 73350
},
{
"epoch": 0.8099664442473098,
"grad_norm": 1.8767496347427368,
"learning_rate": 0.00011681737367206901,
"loss": 4.4245,
"num_input_tokens_seen": 429494624,
"step": 73500
},
{
"epoch": 0.8116194369906716,
"grad_norm": 1.8519647121429443,
"learning_rate": 0.0001167292122779435,
"loss": 4.4254,
"num_input_tokens_seen": 430360704,
"step": 73650
},
{
"epoch": 0.8132724297340335,
"grad_norm": 1.9251487255096436,
"learning_rate": 0.00011664105088381798,
"loss": 4.4362,
"num_input_tokens_seen": 431239616,
"step": 73800
},
{
"epoch": 0.8149254224773953,
"grad_norm": 1.8970694541931152,
"learning_rate": 0.00011655288948969248,
"loss": 4.4261,
"num_input_tokens_seen": 432109120,
"step": 73950
},
{
"epoch": 0.8165784152207571,
"grad_norm": 1.8284028768539429,
"learning_rate": 0.00011646472809556695,
"loss": 4.4406,
"num_input_tokens_seen": 433005440,
"step": 74100
},
{
"epoch": 0.8182314079641191,
"grad_norm": 1.7933986186981201,
"learning_rate": 0.00011637656670144145,
"loss": 4.4333,
"num_input_tokens_seen": 433891456,
"step": 74250
},
{
"epoch": 0.8198844007074809,
"grad_norm": 1.802509069442749,
"learning_rate": 0.00011628840530731593,
"loss": 4.4201,
"num_input_tokens_seen": 434769856,
"step": 74400
},
{
"epoch": 0.8215373934508428,
"grad_norm": 1.7515144348144531,
"learning_rate": 0.00011620024391319043,
"loss": 4.4225,
"num_input_tokens_seen": 435665920,
"step": 74550
},
{
"epoch": 0.8231903861942046,
"grad_norm": 1.8373006582260132,
"learning_rate": 0.00011611208251906491,
"loss": 4.4265,
"num_input_tokens_seen": 436549984,
"step": 74700
},
{
"epoch": 0.8248433789375664,
"grad_norm": 1.8570173978805542,
"learning_rate": 0.00011602392112493941,
"loss": 4.4196,
"num_input_tokens_seen": 437427456,
"step": 74850
},
{
"epoch": 0.8264963716809283,
"grad_norm": 1.9485052824020386,
"learning_rate": 0.00011593575973081388,
"loss": 4.4235,
"num_input_tokens_seen": 438316576,
"step": 75000
},
{
"epoch": 0.8281493644242902,
"grad_norm": 1.8972394466400146,
"learning_rate": 0.00011584818607931588,
"loss": 4.4231,
"num_input_tokens_seen": 439211648,
"step": 75150
},
{
"epoch": 0.829802357167652,
"grad_norm": 1.778745412826538,
"learning_rate": 0.00011576002468519036,
"loss": 4.423,
"num_input_tokens_seen": 440086912,
"step": 75300
},
{
"epoch": 0.8314553499110139,
"grad_norm": 1.923743486404419,
"learning_rate": 0.00011567245103369236,
"loss": 4.4112,
"num_input_tokens_seen": 440976736,
"step": 75450
},
{
"epoch": 0.8331083426543757,
"grad_norm": 1.8902959823608398,
"learning_rate": 0.00011558428963956683,
"loss": 4.4171,
"num_input_tokens_seen": 441827264,
"step": 75600
},
{
"epoch": 0.8347613353977376,
"grad_norm": 1.882279396057129,
"learning_rate": 0.00011549612824544133,
"loss": 4.4386,
"num_input_tokens_seen": 442687328,
"step": 75750
},
{
"epoch": 0.8364143281410995,
"grad_norm": 1.8508954048156738,
"learning_rate": 0.00011540796685131581,
"loss": 4.4261,
"num_input_tokens_seen": 443558176,
"step": 75900
},
{
"epoch": 0.8380673208844613,
"grad_norm": 1.8582794666290283,
"learning_rate": 0.00011531980545719031,
"loss": 4.4214,
"num_input_tokens_seen": 444448512,
"step": 76050
},
{
"epoch": 0.8397203136278232,
"grad_norm": 1.8337671756744385,
"learning_rate": 0.00011523164406306478,
"loss": 4.4257,
"num_input_tokens_seen": 445311808,
"step": 76200
},
{
"epoch": 0.841373306371185,
"grad_norm": 1.8980998992919922,
"learning_rate": 0.00011514348266893929,
"loss": 4.4294,
"num_input_tokens_seen": 446200832,
"step": 76350
},
{
"epoch": 0.8430262991145469,
"grad_norm": 1.8506239652633667,
"learning_rate": 0.00011505532127481376,
"loss": 4.4205,
"num_input_tokens_seen": 447081376,
"step": 76500
},
{
"epoch": 0.8446792918579088,
"grad_norm": 1.8824795484542847,
"learning_rate": 0.00011496715988068826,
"loss": 4.4213,
"num_input_tokens_seen": 447960352,
"step": 76650
},
{
"epoch": 0.8463322846012706,
"grad_norm": 1.8223339319229126,
"learning_rate": 0.00011487899848656274,
"loss": 4.4267,
"num_input_tokens_seen": 448832096,
"step": 76800
},
{
"epoch": 0.8479852773446325,
"grad_norm": 1.8224749565124512,
"learning_rate": 0.00011479083709243724,
"loss": 4.4181,
"num_input_tokens_seen": 449706720,
"step": 76950
},
{
"epoch": 0.8496382700879943,
"grad_norm": 1.903432011604309,
"learning_rate": 0.00011470267569831172,
"loss": 4.4309,
"num_input_tokens_seen": 450578752,
"step": 77100
},
{
"epoch": 0.8512912628313561,
"grad_norm": 1.8261497020721436,
"learning_rate": 0.0001146145143041862,
"loss": 4.4195,
"num_input_tokens_seen": 451441120,
"step": 77250
},
{
"epoch": 0.852944255574718,
"grad_norm": 1.8583135604858398,
"learning_rate": 0.00011452635291006069,
"loss": 4.4198,
"num_input_tokens_seen": 452318560,
"step": 77400
},
{
"epoch": 0.8545972483180799,
"grad_norm": 1.8936694860458374,
"learning_rate": 0.00011443819151593518,
"loss": 4.4287,
"num_input_tokens_seen": 453195136,
"step": 77550
},
{
"epoch": 0.8562502410614418,
"grad_norm": 1.9256082773208618,
"learning_rate": 0.00011435003012180967,
"loss": 4.4156,
"num_input_tokens_seen": 454055456,
"step": 77700
},
{
"epoch": 0.8579032338048036,
"grad_norm": 1.8237937688827515,
"learning_rate": 0.00011426186872768416,
"loss": 4.4348,
"num_input_tokens_seen": 454932800,
"step": 77850
},
{
"epoch": 0.8595562265481654,
"grad_norm": 1.8298827409744263,
"learning_rate": 0.00011417370733355865,
"loss": 4.4178,
"num_input_tokens_seen": 455826208,
"step": 78000
},
{
"epoch": 0.8612092192915273,
"grad_norm": 1.895670771598816,
"learning_rate": 0.00011408554593943314,
"loss": 4.4164,
"num_input_tokens_seen": 456726848,
"step": 78150
},
{
"epoch": 0.8628622120348892,
"grad_norm": 1.750807523727417,
"learning_rate": 0.00011399738454530761,
"loss": 4.4051,
"num_input_tokens_seen": 457606464,
"step": 78300
},
{
"epoch": 0.8645152047782511,
"grad_norm": 1.8419345617294312,
"learning_rate": 0.00011390922315118211,
"loss": 4.4249,
"num_input_tokens_seen": 458484896,
"step": 78450
},
{
"epoch": 0.8661681975216129,
"grad_norm": 2.033911943435669,
"learning_rate": 0.00011382106175705659,
"loss": 4.421,
"num_input_tokens_seen": 459353824,
"step": 78600
},
{
"epoch": 0.8678211902649747,
"grad_norm": 1.9020805358886719,
"learning_rate": 0.00011373290036293109,
"loss": 4.4016,
"num_input_tokens_seen": 460221184,
"step": 78750
},
{
"epoch": 0.8694741830083366,
"grad_norm": 1.91862952709198,
"learning_rate": 0.00011364473896880557,
"loss": 4.4093,
"num_input_tokens_seen": 461087744,
"step": 78900
},
{
"epoch": 0.8711271757516984,
"grad_norm": 1.7994396686553955,
"learning_rate": 0.00011355657757468007,
"loss": 4.4281,
"num_input_tokens_seen": 461960160,
"step": 79050
},
{
"epoch": 0.8727801684950603,
"grad_norm": 1.7911181449890137,
"learning_rate": 0.00011346841618055454,
"loss": 4.4229,
"num_input_tokens_seen": 462838144,
"step": 79200
},
{
"epoch": 0.8744331612384222,
"grad_norm": 1.923474907875061,
"learning_rate": 0.00011338025478642904,
"loss": 4.4103,
"num_input_tokens_seen": 463703328,
"step": 79350
},
{
"epoch": 0.876086153981784,
"grad_norm": 1.994814157485962,
"learning_rate": 0.00011329268113493102,
"loss": 4.4128,
"num_input_tokens_seen": 464568896,
"step": 79500
},
{
"epoch": 0.8777391467251459,
"grad_norm": 1.875200867652893,
"learning_rate": 0.00011320451974080551,
"loss": 4.4224,
"num_input_tokens_seen": 465434144,
"step": 79650
},
{
"epoch": 0.8793921394685077,
"grad_norm": 1.8729829788208008,
"learning_rate": 0.00011311635834668,
"loss": 4.4274,
"num_input_tokens_seen": 466293984,
"step": 79800
},
{
"epoch": 0.8810451322118696,
"grad_norm": 1.772687315940857,
"learning_rate": 0.00011302819695255449,
"loss": 4.4178,
"num_input_tokens_seen": 467169280,
"step": 79950
},
{
"epoch": 0.8826981249552315,
"grad_norm": 1.8293451070785522,
"learning_rate": 0.00011294003555842898,
"loss": 4.412,
"num_input_tokens_seen": 468023552,
"step": 80100
},
{
"epoch": 0.8843511176985933,
"grad_norm": 1.9000316858291626,
"learning_rate": 0.00011285187416430346,
"loss": 4.4114,
"num_input_tokens_seen": 468883168,
"step": 80250
},
{
"epoch": 0.8860041104419552,
"grad_norm": 1.8056668043136597,
"learning_rate": 0.00011276371277017795,
"loss": 4.4225,
"num_input_tokens_seen": 469761120,
"step": 80400
},
{
"epoch": 0.887657103185317,
"grad_norm": 1.7813293933868408,
"learning_rate": 0.00011267555137605243,
"loss": 4.4176,
"num_input_tokens_seen": 470629344,
"step": 80550
},
{
"epoch": 0.8893100959286788,
"grad_norm": 1.8244847059249878,
"learning_rate": 0.00011258738998192693,
"loss": 4.4082,
"num_input_tokens_seen": 471489280,
"step": 80700
},
{
"epoch": 0.8909630886720408,
"grad_norm": 1.8946529626846313,
"learning_rate": 0.00011249981633042892,
"loss": 4.4162,
"num_input_tokens_seen": 472365504,
"step": 80850
},
{
"epoch": 0.8926160814154026,
"grad_norm": 1.870685338973999,
"learning_rate": 0.0001124116549363034,
"loss": 4.4009,
"num_input_tokens_seen": 473238752,
"step": 81000
},
{
"epoch": 0.8942690741587644,
"grad_norm": 1.9169375896453857,
"learning_rate": 0.0001123234935421779,
"loss": 4.4176,
"num_input_tokens_seen": 474122880,
"step": 81150
},
{
"epoch": 0.8959220669021263,
"grad_norm": 1.9780856370925903,
"learning_rate": 0.00011223533214805237,
"loss": 4.401,
"num_input_tokens_seen": 475002240,
"step": 81300
},
{
"epoch": 0.8975750596454881,
"grad_norm": 1.8493030071258545,
"learning_rate": 0.00011214717075392688,
"loss": 4.4091,
"num_input_tokens_seen": 475882624,
"step": 81450
},
{
"epoch": 0.8992280523888501,
"grad_norm": 1.8798983097076416,
"learning_rate": 0.00011205900935980135,
"loss": 4.4113,
"num_input_tokens_seen": 476749664,
"step": 81600
},
{
"epoch": 0.9008810451322119,
"grad_norm": 1.8249197006225586,
"learning_rate": 0.00011197084796567584,
"loss": 4.4059,
"num_input_tokens_seen": 477650784,
"step": 81750
},
{
"epoch": 0.9025340378755737,
"grad_norm": 1.9157739877700806,
"learning_rate": 0.00011188268657155033,
"loss": 4.4109,
"num_input_tokens_seen": 478523840,
"step": 81900
},
{
"epoch": 0.9041870306189356,
"grad_norm": 1.9503858089447021,
"learning_rate": 0.00011179452517742482,
"loss": 4.4139,
"num_input_tokens_seen": 479399296,
"step": 82050
},
{
"epoch": 0.9058400233622974,
"grad_norm": 1.8298823833465576,
"learning_rate": 0.0001117063637832993,
"loss": 4.4123,
"num_input_tokens_seen": 480262240,
"step": 82200
},
{
"epoch": 0.9074930161056592,
"grad_norm": 1.9161386489868164,
"learning_rate": 0.0001116182023891738,
"loss": 4.4058,
"num_input_tokens_seen": 481141056,
"step": 82350
},
{
"epoch": 0.9091460088490212,
"grad_norm": 1.872722864151001,
"learning_rate": 0.00011153004099504828,
"loss": 4.4279,
"num_input_tokens_seen": 482014112,
"step": 82500
},
{
"epoch": 0.910799001592383,
"grad_norm": 1.8831090927124023,
"learning_rate": 0.00011144187960092277,
"loss": 4.4121,
"num_input_tokens_seen": 482898336,
"step": 82650
},
{
"epoch": 0.9124519943357449,
"grad_norm": 1.8128923177719116,
"learning_rate": 0.00011135371820679725,
"loss": 4.4105,
"num_input_tokens_seen": 483773760,
"step": 82800
},
{
"epoch": 0.9141049870791067,
"grad_norm": 1.8982397317886353,
"learning_rate": 0.00011126555681267175,
"loss": 4.4234,
"num_input_tokens_seen": 484671008,
"step": 82950
},
{
"epoch": 0.9157579798224685,
"grad_norm": 1.8295831680297852,
"learning_rate": 0.00011117739541854622,
"loss": 4.4227,
"num_input_tokens_seen": 485553984,
"step": 83100
},
{
"epoch": 0.9174109725658305,
"grad_norm": 1.8975720405578613,
"learning_rate": 0.00011108923402442071,
"loss": 4.4176,
"num_input_tokens_seen": 486416672,
"step": 83250
},
{
"epoch": 0.9190639653091923,
"grad_norm": 1.8207321166992188,
"learning_rate": 0.0001110010726302952,
"loss": 4.424,
"num_input_tokens_seen": 487286816,
"step": 83400
},
{
"epoch": 0.9207169580525542,
"grad_norm": 1.9241523742675781,
"learning_rate": 0.00011091291123616969,
"loss": 4.4129,
"num_input_tokens_seen": 488157088,
"step": 83550
},
{
"epoch": 0.922369950795916,
"grad_norm": 1.8391443490982056,
"learning_rate": 0.00011082474984204418,
"loss": 4.4036,
"num_input_tokens_seen": 489008320,
"step": 83700
},
{
"epoch": 0.9240229435392778,
"grad_norm": 1.9244701862335205,
"learning_rate": 0.00011073658844791865,
"loss": 4.4215,
"num_input_tokens_seen": 489887328,
"step": 83850
},
{
"epoch": 0.9256759362826397,
"grad_norm": 1.8949611186981201,
"learning_rate": 0.00011064842705379315,
"loss": 4.4205,
"num_input_tokens_seen": 490765504,
"step": 84000
},
{
"epoch": 0.9273289290260016,
"grad_norm": 1.810594081878662,
"learning_rate": 0.00011056026565966763,
"loss": 4.4149,
"num_input_tokens_seen": 491650144,
"step": 84150
},
{
"epoch": 0.9289819217693635,
"grad_norm": 1.8556066751480103,
"learning_rate": 0.00011047210426554213,
"loss": 4.4102,
"num_input_tokens_seen": 492539968,
"step": 84300
},
{
"epoch": 0.9306349145127253,
"grad_norm": 1.8486409187316895,
"learning_rate": 0.00011038394287141661,
"loss": 4.4231,
"num_input_tokens_seen": 493424352,
"step": 84450
},
{
"epoch": 0.9322879072560871,
"grad_norm": 1.8193395137786865,
"learning_rate": 0.00011029578147729111,
"loss": 4.4195,
"num_input_tokens_seen": 494301152,
"step": 84600
},
{
"epoch": 0.933940899999449,
"grad_norm": 1.8344619274139404,
"learning_rate": 0.00011020762008316558,
"loss": 4.4075,
"num_input_tokens_seen": 495177600,
"step": 84750
},
{
"epoch": 0.9355938927428109,
"grad_norm": 1.781654953956604,
"learning_rate": 0.00011012004643166758,
"loss": 4.4075,
"num_input_tokens_seen": 496047680,
"step": 84900
},
{
"epoch": 0.9372468854861727,
"grad_norm": 1.935810923576355,
"learning_rate": 0.00011003188503754206,
"loss": 4.408,
"num_input_tokens_seen": 496919488,
"step": 85050
},
{
"epoch": 0.9388998782295346,
"grad_norm": 1.8130574226379395,
"learning_rate": 0.00010994372364341655,
"loss": 4.4152,
"num_input_tokens_seen": 497821280,
"step": 85200
},
{
"epoch": 0.9405528709728964,
"grad_norm": 1.9481176137924194,
"learning_rate": 0.00010985556224929104,
"loss": 4.4115,
"num_input_tokens_seen": 498694560,
"step": 85350
},
{
"epoch": 0.9422058637162583,
"grad_norm": 1.8938475847244263,
"learning_rate": 0.00010976740085516553,
"loss": 4.4077,
"num_input_tokens_seen": 499570016,
"step": 85500
},
{
"epoch": 0.9438588564596201,
"grad_norm": 1.8449296951293945,
"learning_rate": 0.00010967923946104002,
"loss": 4.4043,
"num_input_tokens_seen": 500436288,
"step": 85650
},
{
"epoch": 0.945511849202982,
"grad_norm": 1.775891661643982,
"learning_rate": 0.00010959166580954201,
"loss": 4.4112,
"num_input_tokens_seen": 501298944,
"step": 85800
},
{
"epoch": 0.9471648419463439,
"grad_norm": 1.827708125114441,
"learning_rate": 0.00010950350441541651,
"loss": 4.4088,
"num_input_tokens_seen": 502206976,
"step": 85950
},
{
"epoch": 0.9488178346897057,
"grad_norm": 1.8833259344100952,
"learning_rate": 0.00010941534302129099,
"loss": 4.4107,
"num_input_tokens_seen": 503083488,
"step": 86100
},
{
"epoch": 0.9504708274330675,
"grad_norm": 1.8116602897644043,
"learning_rate": 0.00010932718162716549,
"loss": 4.4109,
"num_input_tokens_seen": 503978240,
"step": 86250
},
{
"epoch": 0.9521238201764294,
"grad_norm": 1.8248368501663208,
"learning_rate": 0.00010923902023303996,
"loss": 4.4041,
"num_input_tokens_seen": 504859744,
"step": 86400
},
{
"epoch": 0.9537768129197913,
"grad_norm": 1.862371802330017,
"learning_rate": 0.00010915085883891444,
"loss": 4.4221,
"num_input_tokens_seen": 505740576,
"step": 86550
},
{
"epoch": 0.9554298056631532,
"grad_norm": 1.8358848094940186,
"learning_rate": 0.00010906269744478894,
"loss": 4.4051,
"num_input_tokens_seen": 506615680,
"step": 86700
},
{
"epoch": 0.957082798406515,
"grad_norm": 1.8686786890029907,
"learning_rate": 0.00010897453605066342,
"loss": 4.41,
"num_input_tokens_seen": 507477824,
"step": 86850
},
{
"epoch": 0.9587357911498768,
"grad_norm": 1.833525538444519,
"learning_rate": 0.00010888637465653792,
"loss": 4.4188,
"num_input_tokens_seen": 508371904,
"step": 87000
},
{
"epoch": 0.9603887838932387,
"grad_norm": 1.9611468315124512,
"learning_rate": 0.00010879821326241239,
"loss": 4.4034,
"num_input_tokens_seen": 509250272,
"step": 87150
},
{
"epoch": 0.9620417766366006,
"grad_norm": 1.6934946775436401,
"learning_rate": 0.00010871005186828688,
"loss": 4.4067,
"num_input_tokens_seen": 510129568,
"step": 87300
},
{
"epoch": 0.9636947693799625,
"grad_norm": 1.8400757312774658,
"learning_rate": 0.00010862189047416137,
"loss": 4.4,
"num_input_tokens_seen": 510995328,
"step": 87450
},
{
"epoch": 0.9653477621233243,
"grad_norm": 1.8682547807693481,
"learning_rate": 0.00010853372908003586,
"loss": 4.4026,
"num_input_tokens_seen": 511880416,
"step": 87600
},
{
"epoch": 0.9670007548666861,
"grad_norm": 1.8408825397491455,
"learning_rate": 0.00010844556768591035,
"loss": 4.4019,
"num_input_tokens_seen": 512759072,
"step": 87750
},
{
"epoch": 0.968653747610048,
"grad_norm": 1.9082870483398438,
"learning_rate": 0.00010835740629178484,
"loss": 4.4076,
"num_input_tokens_seen": 513640896,
"step": 87900
},
{
"epoch": 0.9703067403534098,
"grad_norm": 1.9512287378311157,
"learning_rate": 0.00010826983264028683,
"loss": 4.4217,
"num_input_tokens_seen": 514532256,
"step": 88050
},
{
"epoch": 0.9719597330967717,
"grad_norm": 1.9278032779693604,
"learning_rate": 0.0001081816712461613,
"loss": 4.4217,
"num_input_tokens_seen": 515412864,
"step": 88200
},
{
"epoch": 0.9736127258401336,
"grad_norm": 1.814817190170288,
"learning_rate": 0.0001080935098520358,
"loss": 4.4178,
"num_input_tokens_seen": 516301408,
"step": 88350
},
{
"epoch": 0.9752657185834954,
"grad_norm": 2.0495548248291016,
"learning_rate": 0.00010800534845791028,
"loss": 4.4101,
"num_input_tokens_seen": 517179648,
"step": 88500
},
{
"epoch": 0.9769187113268573,
"grad_norm": 1.8378854990005493,
"learning_rate": 0.00010791718706378478,
"loss": 4.4031,
"num_input_tokens_seen": 518064288,
"step": 88650
},
{
"epoch": 0.9785717040702191,
"grad_norm": 1.8407827615737915,
"learning_rate": 0.00010782902566965926,
"loss": 4.4135,
"num_input_tokens_seen": 518947776,
"step": 88800
},
{
"epoch": 0.980224696813581,
"grad_norm": 1.845199704170227,
"learning_rate": 0.00010774086427553376,
"loss": 4.4,
"num_input_tokens_seen": 519822560,
"step": 88950
},
{
"epoch": 0.9818776895569429,
"grad_norm": 1.8627071380615234,
"learning_rate": 0.00010765270288140823,
"loss": 4.4125,
"num_input_tokens_seen": 520709888,
"step": 89100
},
{
"epoch": 0.9835306823003047,
"grad_norm": 1.826648235321045,
"learning_rate": 0.00010756454148728274,
"loss": 4.4006,
"num_input_tokens_seen": 521586080,
"step": 89250
},
{
"epoch": 0.9851836750436666,
"grad_norm": 1.9315438270568848,
"learning_rate": 0.00010747638009315721,
"loss": 4.4104,
"num_input_tokens_seen": 522450944,
"step": 89400
},
{
"epoch": 0.9868366677870284,
"grad_norm": 1.8507201671600342,
"learning_rate": 0.0001073882186990317,
"loss": 4.412,
"num_input_tokens_seen": 523335744,
"step": 89550
},
{
"epoch": 0.9884896605303902,
"grad_norm": 1.8950568437576294,
"learning_rate": 0.00010730005730490619,
"loss": 4.4106,
"num_input_tokens_seen": 524216960,
"step": 89700
},
{
"epoch": 0.9901426532737522,
"grad_norm": 1.92140531539917,
"learning_rate": 0.00010721189591078068,
"loss": 4.4001,
"num_input_tokens_seen": 525081088,
"step": 89850
},
{
"epoch": 0.991795646017114,
"grad_norm": 1.9179210662841797,
"learning_rate": 0.00010712373451665516,
"loss": 4.4044,
"num_input_tokens_seen": 525964384,
"step": 90000
},
{
"epoch": 0.9934486387604758,
"grad_norm": 1.8168158531188965,
"learning_rate": 0.00010703557312252965,
"loss": 4.4082,
"num_input_tokens_seen": 526852256,
"step": 90150
},
{
"epoch": 0.9951016315038377,
"grad_norm": 2.0058753490448,
"learning_rate": 0.00010694741172840414,
"loss": 4.4061,
"num_input_tokens_seen": 527721152,
"step": 90300
},
{
"epoch": 0.9967546242471995,
"grad_norm": 2.0036473274230957,
"learning_rate": 0.00010685925033427863,
"loss": 4.4013,
"num_input_tokens_seen": 528601472,
"step": 90450
},
{
"epoch": 0.9984076169905615,
"grad_norm": 1.8912723064422607,
"learning_rate": 0.0001067710889401531,
"loss": 4.408,
"num_input_tokens_seen": 529485344,
"step": 90600
},
{
"epoch": 1.0000606097339233,
"grad_norm": 1.8539482355117798,
"learning_rate": 0.00010668292754602761,
"loss": 4.4072,
"num_input_tokens_seen": 530388834,
"step": 90750
},
{
"epoch": 1.001713602477285,
"grad_norm": 1.9648711681365967,
"learning_rate": 0.00010659476615190208,
"loss": 4.4023,
"num_input_tokens_seen": 531261986,
"step": 90900
},
{
"epoch": 1.003366595220647,
"grad_norm": 1.8683501482009888,
"learning_rate": 0.00010650660475777658,
"loss": 4.3953,
"num_input_tokens_seen": 532138626,
"step": 91050
},
{
"epoch": 1.005019587964009,
"grad_norm": 1.9149645566940308,
"learning_rate": 0.00010641844336365106,
"loss": 4.3907,
"num_input_tokens_seen": 533008418,
"step": 91200
},
{
"epoch": 1.0066725807073706,
"grad_norm": 1.804408073425293,
"learning_rate": 0.00010633028196952556,
"loss": 4.4053,
"num_input_tokens_seen": 533881122,
"step": 91350
},
{
"epoch": 1.0083255734507326,
"grad_norm": 1.8145511150360107,
"learning_rate": 0.00010624212057540004,
"loss": 4.3993,
"num_input_tokens_seen": 534752738,
"step": 91500
},
{
"epoch": 1.0099785661940943,
"grad_norm": 1.8206557035446167,
"learning_rate": 0.00010615395918127454,
"loss": 4.3954,
"num_input_tokens_seen": 535622338,
"step": 91650
},
{
"epoch": 1.0116315589374563,
"grad_norm": 1.880231261253357,
"learning_rate": 0.00010606579778714901,
"loss": 4.4002,
"num_input_tokens_seen": 536488386,
"step": 91800
},
{
"epoch": 1.0132845516808182,
"grad_norm": 1.8914505243301392,
"learning_rate": 0.00010597763639302352,
"loss": 4.3961,
"num_input_tokens_seen": 537374658,
"step": 91950
},
{
"epoch": 1.01493754442418,
"grad_norm": 1.9163919687271118,
"learning_rate": 0.0001058900627415255,
"loss": 4.3973,
"num_input_tokens_seen": 538244194,
"step": 92100
},
{
"epoch": 1.0165905371675419,
"grad_norm": 1.9003725051879883,
"learning_rate": 0.00010580190134739998,
"loss": 4.3957,
"num_input_tokens_seen": 539130914,
"step": 92250
},
{
"epoch": 1.0182435299109036,
"grad_norm": 1.838493824005127,
"learning_rate": 0.00010571373995327447,
"loss": 4.3878,
"num_input_tokens_seen": 540022850,
"step": 92400
},
{
"epoch": 1.0198965226542656,
"grad_norm": 1.9080275297164917,
"learning_rate": 0.00010562557855914896,
"loss": 4.3917,
"num_input_tokens_seen": 540895874,
"step": 92550
},
{
"epoch": 1.0215495153976275,
"grad_norm": 1.8060060739517212,
"learning_rate": 0.00010553741716502345,
"loss": 4.3953,
"num_input_tokens_seen": 541762658,
"step": 92700
},
{
"epoch": 1.0232025081409892,
"grad_norm": 1.903151273727417,
"learning_rate": 0.00010544925577089792,
"loss": 4.3952,
"num_input_tokens_seen": 542643138,
"step": 92850
},
{
"epoch": 1.0248555008843512,
"grad_norm": 1.9957008361816406,
"learning_rate": 0.00010536109437677243,
"loss": 4.3952,
"num_input_tokens_seen": 543505570,
"step": 93000
},
{
"epoch": 1.026508493627713,
"grad_norm": 1.8897976875305176,
"learning_rate": 0.0001052729329826469,
"loss": 4.395,
"num_input_tokens_seen": 544378466,
"step": 93150
},
{
"epoch": 1.0281614863710749,
"grad_norm": 1.895654320716858,
"learning_rate": 0.0001051847715885214,
"loss": 4.4016,
"num_input_tokens_seen": 545256738,
"step": 93300
},
{
"epoch": 1.0298144791144366,
"grad_norm": 1.9977262020111084,
"learning_rate": 0.00010509661019439588,
"loss": 4.3994,
"num_input_tokens_seen": 546150498,
"step": 93450
},
{
"epoch": 1.0314674718577985,
"grad_norm": 1.82341468334198,
"learning_rate": 0.00010500844880027038,
"loss": 4.4003,
"num_input_tokens_seen": 547021922,
"step": 93600
},
{
"epoch": 1.0331204646011605,
"grad_norm": 1.7573907375335693,
"learning_rate": 0.00010492028740614485,
"loss": 4.3936,
"num_input_tokens_seen": 547888450,
"step": 93750
},
{
"epoch": 1.0347734573445222,
"grad_norm": 2.011516571044922,
"learning_rate": 0.00010483212601201936,
"loss": 4.3861,
"num_input_tokens_seen": 548752514,
"step": 93900
},
{
"epoch": 1.0364264500878841,
"grad_norm": 1.8368171453475952,
"learning_rate": 0.00010474396461789383,
"loss": 4.3975,
"num_input_tokens_seen": 549641218,
"step": 94050
},
{
"epoch": 1.0380794428312459,
"grad_norm": 2.0658929347991943,
"learning_rate": 0.00010465639096639582,
"loss": 4.4042,
"num_input_tokens_seen": 550521378,
"step": 94200
},
{
"epoch": 1.0397324355746078,
"grad_norm": 1.8516744375228882,
"learning_rate": 0.00010456822957227031,
"loss": 4.3937,
"num_input_tokens_seen": 551403138,
"step": 94350
},
{
"epoch": 1.0413854283179698,
"grad_norm": 1.9704523086547852,
"learning_rate": 0.0001044800681781448,
"loss": 4.3892,
"num_input_tokens_seen": 552268866,
"step": 94500
},
{
"epoch": 1.0430384210613315,
"grad_norm": 1.8856583833694458,
"learning_rate": 0.00010439190678401929,
"loss": 4.3969,
"num_input_tokens_seen": 553139522,
"step": 94650
},
{
"epoch": 1.0446914138046934,
"grad_norm": 1.9823240041732788,
"learning_rate": 0.00010430374538989378,
"loss": 4.3937,
"num_input_tokens_seen": 554009858,
"step": 94800
},
{
"epoch": 1.0463444065480552,
"grad_norm": 1.8391404151916504,
"learning_rate": 0.00010421558399576827,
"loss": 4.3891,
"num_input_tokens_seen": 554896962,
"step": 94950
},
{
"epoch": 1.047997399291417,
"grad_norm": 1.829777717590332,
"learning_rate": 0.00010412742260164274,
"loss": 4.3996,
"num_input_tokens_seen": 555778274,
"step": 95100
},
{
"epoch": 1.049650392034779,
"grad_norm": 1.884190320968628,
"learning_rate": 0.00010403926120751724,
"loss": 4.3899,
"num_input_tokens_seen": 556658210,
"step": 95250
},
{
"epoch": 1.0513033847781408,
"grad_norm": 1.8368123769760132,
"learning_rate": 0.00010395109981339172,
"loss": 4.3989,
"num_input_tokens_seen": 557549442,
"step": 95400
},
{
"epoch": 1.0529563775215027,
"grad_norm": 1.7985849380493164,
"learning_rate": 0.00010386293841926622,
"loss": 4.3868,
"num_input_tokens_seen": 558417634,
"step": 95550
},
{
"epoch": 1.0546093702648645,
"grad_norm": 1.8913172483444214,
"learning_rate": 0.0001037747770251407,
"loss": 4.4031,
"num_input_tokens_seen": 559314882,
"step": 95700
},
{
"epoch": 1.0562623630082264,
"grad_norm": 1.9179192781448364,
"learning_rate": 0.0001036866156310152,
"loss": 4.3812,
"num_input_tokens_seen": 560207298,
"step": 95850
},
{
"epoch": 1.0579153557515883,
"grad_norm": 1.8890949487686157,
"learning_rate": 0.00010359845423688967,
"loss": 4.3916,
"num_input_tokens_seen": 561097570,
"step": 96000
},
{
"epoch": 1.05956834849495,
"grad_norm": 1.7995752096176147,
"learning_rate": 0.00010351088058539166,
"loss": 4.3933,
"num_input_tokens_seen": 561973218,
"step": 96150
},
{
"epoch": 1.061221341238312,
"grad_norm": 1.928031086921692,
"learning_rate": 0.00010342330693389365,
"loss": 4.3914,
"num_input_tokens_seen": 562851074,
"step": 96300
},
{
"epoch": 1.0628743339816737,
"grad_norm": 1.94650137424469,
"learning_rate": 0.00010333514553976814,
"loss": 4.3766,
"num_input_tokens_seen": 563725506,
"step": 96450
},
{
"epoch": 1.0645273267250357,
"grad_norm": 1.8535209894180298,
"learning_rate": 0.00010324698414564263,
"loss": 4.3916,
"num_input_tokens_seen": 564576034,
"step": 96600
},
{
"epoch": 1.0661803194683976,
"grad_norm": 1.9456048011779785,
"learning_rate": 0.00010315882275151712,
"loss": 4.3975,
"num_input_tokens_seen": 565446626,
"step": 96750
},
{
"epoch": 1.0678333122117594,
"grad_norm": 1.8319114446640015,
"learning_rate": 0.0001030706613573916,
"loss": 4.3977,
"num_input_tokens_seen": 566321378,
"step": 96900
},
{
"epoch": 1.0694863049551213,
"grad_norm": 2.1267592906951904,
"learning_rate": 0.0001029824999632661,
"loss": 4.3896,
"num_input_tokens_seen": 567203778,
"step": 97050
},
{
"epoch": 1.071139297698483,
"grad_norm": 1.8523855209350586,
"learning_rate": 0.00010289433856914057,
"loss": 4.3868,
"num_input_tokens_seen": 568083906,
"step": 97200
},
{
"epoch": 1.072792290441845,
"grad_norm": 1.882645606994629,
"learning_rate": 0.00010280617717501507,
"loss": 4.3976,
"num_input_tokens_seen": 568970690,
"step": 97350
},
{
"epoch": 1.0744452831852067,
"grad_norm": 1.9347394704818726,
"learning_rate": 0.00010271801578088955,
"loss": 4.3905,
"num_input_tokens_seen": 569844482,
"step": 97500
},
{
"epoch": 1.0760982759285687,
"grad_norm": 1.855491280555725,
"learning_rate": 0.00010262985438676405,
"loss": 4.3886,
"num_input_tokens_seen": 570717602,
"step": 97650
},
{
"epoch": 1.0777512686719306,
"grad_norm": 1.8031153678894043,
"learning_rate": 0.00010254169299263853,
"loss": 4.3807,
"num_input_tokens_seen": 571591586,
"step": 97800
},
{
"epoch": 1.0794042614152923,
"grad_norm": 1.9792248010635376,
"learning_rate": 0.00010245353159851303,
"loss": 4.389,
"num_input_tokens_seen": 572476162,
"step": 97950
},
{
"epoch": 1.0810572541586543,
"grad_norm": 1.9110472202301025,
"learning_rate": 0.0001023653702043875,
"loss": 4.3889,
"num_input_tokens_seen": 573353346,
"step": 98100
},
{
"epoch": 1.082710246902016,
"grad_norm": 1.9655945301055908,
"learning_rate": 0.000102277208810262,
"loss": 4.3808,
"num_input_tokens_seen": 574237986,
"step": 98250
},
{
"epoch": 1.084363239645378,
"grad_norm": 1.806372880935669,
"learning_rate": 0.00010218904741613648,
"loss": 4.389,
"num_input_tokens_seen": 575113346,
"step": 98400
},
{
"epoch": 1.08601623238874,
"grad_norm": 1.9266657829284668,
"learning_rate": 0.00010210147376463847,
"loss": 4.3957,
"num_input_tokens_seen": 576010818,
"step": 98550
},
{
"epoch": 1.0876692251321016,
"grad_norm": 1.8409209251403809,
"learning_rate": 0.00010201331237051296,
"loss": 4.3949,
"num_input_tokens_seen": 576872610,
"step": 98700
},
{
"epoch": 1.0893222178754636,
"grad_norm": 1.7804383039474487,
"learning_rate": 0.00010192515097638745,
"loss": 4.3835,
"num_input_tokens_seen": 577737090,
"step": 98850
},
{
"epoch": 1.0909752106188253,
"grad_norm": 1.8269861936569214,
"learning_rate": 0.00010183698958226194,
"loss": 4.3967,
"num_input_tokens_seen": 578610178,
"step": 99000
},
{
"epoch": 1.0926282033621872,
"grad_norm": 1.9065062999725342,
"learning_rate": 0.00010174882818813641,
"loss": 4.395,
"num_input_tokens_seen": 579483074,
"step": 99150
},
{
"epoch": 1.0942811961055492,
"grad_norm": 1.8511546850204468,
"learning_rate": 0.00010166066679401092,
"loss": 4.3847,
"num_input_tokens_seen": 580339138,
"step": 99300
},
{
"epoch": 1.095934188848911,
"grad_norm": 1.9003854990005493,
"learning_rate": 0.00010157250539988539,
"loss": 4.3878,
"num_input_tokens_seen": 581214146,
"step": 99450
},
{
"epoch": 1.0975871815922729,
"grad_norm": 1.9078856706619263,
"learning_rate": 0.00010148434400575989,
"loss": 4.3976,
"num_input_tokens_seen": 582094658,
"step": 99600
},
{
"epoch": 1.0992401743356346,
"grad_norm": 1.8750337362289429,
"learning_rate": 0.00010139618261163437,
"loss": 4.3999,
"num_input_tokens_seen": 582981922,
"step": 99750
},
{
"epoch": 1.1008931670789965,
"grad_norm": 1.9243488311767578,
"learning_rate": 0.00010130802121750887,
"loss": 4.3879,
"num_input_tokens_seen": 583869026,
"step": 99900
},
{
"epoch": 1.1025461598223583,
"grad_norm": 1.8446391820907593,
"learning_rate": 0.00010121985982338334,
"loss": 4.3894,
"num_input_tokens_seen": 584749826,
"step": 100050
},
{
"epoch": 1.1041991525657202,
"grad_norm": 1.726158857345581,
"learning_rate": 0.00010113169842925785,
"loss": 4.3985,
"num_input_tokens_seen": 585630274,
"step": 100200
},
{
"epoch": 1.1058521453090822,
"grad_norm": 1.8227604627609253,
"learning_rate": 0.00010104353703513232,
"loss": 4.3906,
"num_input_tokens_seen": 586484930,
"step": 100350
},
{
"epoch": 1.1075051380524439,
"grad_norm": 1.9156420230865479,
"learning_rate": 0.00010095537564100682,
"loss": 4.3893,
"num_input_tokens_seen": 587352738,
"step": 100500
},
{
"epoch": 1.1091581307958058,
"grad_norm": 1.8385225534439087,
"learning_rate": 0.0001008678019895088,
"loss": 4.3994,
"num_input_tokens_seen": 588239810,
"step": 100650
},
{
"epoch": 1.1108111235391678,
"grad_norm": 1.9076261520385742,
"learning_rate": 0.00010077964059538329,
"loss": 4.3922,
"num_input_tokens_seen": 589116514,
"step": 100800
},
{
"epoch": 1.1124641162825295,
"grad_norm": 1.8701651096343994,
"learning_rate": 0.00010069147920125778,
"loss": 4.4015,
"num_input_tokens_seen": 589983426,
"step": 100950
},
{
"epoch": 1.1141171090258915,
"grad_norm": 1.9545180797576904,
"learning_rate": 0.00010060331780713227,
"loss": 4.3978,
"num_input_tokens_seen": 590856994,
"step": 101100
},
{
"epoch": 1.1157701017692532,
"grad_norm": 1.9418137073516846,
"learning_rate": 0.00010051515641300676,
"loss": 4.3893,
"num_input_tokens_seen": 591735490,
"step": 101250
},
{
"epoch": 1.1174230945126151,
"grad_norm": 1.892683982849121,
"learning_rate": 0.00010042699501888123,
"loss": 4.3833,
"num_input_tokens_seen": 592622626,
"step": 101400
},
{
"epoch": 1.1190760872559768,
"grad_norm": 1.830404281616211,
"learning_rate": 0.00010033883362475573,
"loss": 4.3939,
"num_input_tokens_seen": 593500354,
"step": 101550
},
{
"epoch": 1.1207290799993388,
"grad_norm": 1.8536481857299805,
"learning_rate": 0.00010025067223063021,
"loss": 4.3826,
"num_input_tokens_seen": 594383234,
"step": 101700
},
{
"epoch": 1.1223820727427007,
"grad_norm": 1.84872567653656,
"learning_rate": 0.00010016251083650471,
"loss": 4.3847,
"num_input_tokens_seen": 595255266,
"step": 101850
},
{
"epoch": 1.1240350654860625,
"grad_norm": 1.8653180599212646,
"learning_rate": 0.00010007434944237918,
"loss": 4.392,
"num_input_tokens_seen": 596135586,
"step": 102000
},
{
"epoch": 1.1256880582294244,
"grad_norm": 1.8534561395645142,
"learning_rate": 9.998618804825369e-05,
"loss": 4.3862,
"num_input_tokens_seen": 597009218,
"step": 102150
},
{
"epoch": 1.1273410509727861,
"grad_norm": 1.8982864618301392,
"learning_rate": 9.989802665412816e-05,
"loss": 4.3969,
"num_input_tokens_seen": 597873026,
"step": 102300
},
{
"epoch": 1.128994043716148,
"grad_norm": 1.9212620258331299,
"learning_rate": 9.980986526000266e-05,
"loss": 4.3872,
"num_input_tokens_seen": 598748322,
"step": 102450
},
{
"epoch": 1.13064703645951,
"grad_norm": 1.8133482933044434,
"learning_rate": 9.972170386587714e-05,
"loss": 4.3801,
"num_input_tokens_seen": 599625410,
"step": 102600
},
{
"epoch": 1.1323000292028718,
"grad_norm": 1.8521312475204468,
"learning_rate": 9.963354247175164e-05,
"loss": 4.3867,
"num_input_tokens_seen": 600489762,
"step": 102750
},
{
"epoch": 1.1339530219462337,
"grad_norm": 2.050074577331543,
"learning_rate": 9.954538107762612e-05,
"loss": 4.3813,
"num_input_tokens_seen": 601357666,
"step": 102900
},
{
"epoch": 1.1356060146895954,
"grad_norm": 1.8785549402236938,
"learning_rate": 9.945721968350062e-05,
"loss": 4.3799,
"num_input_tokens_seen": 602239362,
"step": 103050
},
{
"epoch": 1.1372590074329574,
"grad_norm": 1.9237360954284668,
"learning_rate": 9.93690582893751e-05,
"loss": 4.3902,
"num_input_tokens_seen": 603119650,
"step": 103200
},
{
"epoch": 1.1389120001763193,
"grad_norm": 1.8664278984069824,
"learning_rate": 9.928089689524957e-05,
"loss": 4.3905,
"num_input_tokens_seen": 603985666,
"step": 103350
},
{
"epoch": 1.140564992919681,
"grad_norm": 1.812515139579773,
"learning_rate": 9.919273550112407e-05,
"loss": 4.3757,
"num_input_tokens_seen": 604874530,
"step": 103500
},
{
"epoch": 1.142217985663043,
"grad_norm": 1.9093918800354004,
"learning_rate": 9.910457410699855e-05,
"loss": 4.4058,
"num_input_tokens_seen": 605755394,
"step": 103650
},
{
"epoch": 1.1438709784064047,
"grad_norm": 1.9712496995925903,
"learning_rate": 9.901641271287305e-05,
"loss": 4.3848,
"num_input_tokens_seen": 606649794,
"step": 103800
},
{
"epoch": 1.1455239711497667,
"grad_norm": 1.9102181196212769,
"learning_rate": 9.892825131874752e-05,
"loss": 4.3926,
"num_input_tokens_seen": 607513858,
"step": 103950
},
{
"epoch": 1.1471769638931284,
"grad_norm": 1.7749512195587158,
"learning_rate": 9.884008992462201e-05,
"loss": 4.3906,
"num_input_tokens_seen": 608391202,
"step": 104100
},
{
"epoch": 1.1488299566364903,
"grad_norm": 1.8394023180007935,
"learning_rate": 9.87519285304965e-05,
"loss": 4.3814,
"num_input_tokens_seen": 609282018,
"step": 104250
},
{
"epoch": 1.1504829493798523,
"grad_norm": 1.9161593914031982,
"learning_rate": 9.866376713637099e-05,
"loss": 4.3947,
"num_input_tokens_seen": 610168514,
"step": 104400
},
{
"epoch": 1.152135942123214,
"grad_norm": 1.930790901184082,
"learning_rate": 9.857560574224548e-05,
"loss": 4.3928,
"num_input_tokens_seen": 611052354,
"step": 104550
},
{
"epoch": 1.153788934866576,
"grad_norm": 1.836146354675293,
"learning_rate": 9.848803209074748e-05,
"loss": 4.3977,
"num_input_tokens_seen": 611926498,
"step": 104700
},
{
"epoch": 1.155441927609938,
"grad_norm": 1.7802364826202393,
"learning_rate": 9.839987069662196e-05,
"loss": 4.3921,
"num_input_tokens_seen": 612818210,
"step": 104850
},
{
"epoch": 1.1570949203532996,
"grad_norm": 1.9587794542312622,
"learning_rate": 9.831170930249643e-05,
"loss": 4.3925,
"num_input_tokens_seen": 613694850,
"step": 105000
},
{
"epoch": 1.1587479130966616,
"grad_norm": 1.9676165580749512,
"learning_rate": 9.822354790837093e-05,
"loss": 4.3782,
"num_input_tokens_seen": 614583618,
"step": 105150
},
{
"epoch": 1.1604009058400233,
"grad_norm": 1.8942914009094238,
"learning_rate": 9.813538651424541e-05,
"loss": 4.3792,
"num_input_tokens_seen": 615478530,
"step": 105300
},
{
"epoch": 1.1620538985833853,
"grad_norm": 1.8436447381973267,
"learning_rate": 9.804722512011991e-05,
"loss": 4.3848,
"num_input_tokens_seen": 616374914,
"step": 105450
},
{
"epoch": 1.163706891326747,
"grad_norm": 1.9150909185409546,
"learning_rate": 9.795906372599439e-05,
"loss": 4.381,
"num_input_tokens_seen": 617260162,
"step": 105600
},
{
"epoch": 1.165359884070109,
"grad_norm": 2.0403525829315186,
"learning_rate": 9.787090233186889e-05,
"loss": 4.3835,
"num_input_tokens_seen": 618136386,
"step": 105750
},
{
"epoch": 1.1670128768134709,
"grad_norm": 1.8062185049057007,
"learning_rate": 9.778274093774336e-05,
"loss": 4.3821,
"num_input_tokens_seen": 619009282,
"step": 105900
},
{
"epoch": 1.1686658695568326,
"grad_norm": 1.9948753118515015,
"learning_rate": 9.769457954361787e-05,
"loss": 4.3911,
"num_input_tokens_seen": 619886722,
"step": 106050
},
{
"epoch": 1.1703188623001946,
"grad_norm": 1.8109992742538452,
"learning_rate": 9.760641814949234e-05,
"loss": 4.3791,
"num_input_tokens_seen": 620758178,
"step": 106200
},
{
"epoch": 1.1719718550435563,
"grad_norm": 1.9707014560699463,
"learning_rate": 9.751825675536684e-05,
"loss": 4.3809,
"num_input_tokens_seen": 621629506,
"step": 106350
},
{
"epoch": 1.1736248477869182,
"grad_norm": 1.9458143711090088,
"learning_rate": 9.743009536124132e-05,
"loss": 4.3952,
"num_input_tokens_seen": 622496418,
"step": 106500
},
{
"epoch": 1.17527784053028,
"grad_norm": 1.9349957704544067,
"learning_rate": 9.734310945237081e-05,
"loss": 4.379,
"num_input_tokens_seen": 623395010,
"step": 106650
},
{
"epoch": 1.176930833273642,
"grad_norm": 1.9133590459823608,
"learning_rate": 9.725494805824531e-05,
"loss": 4.3689,
"num_input_tokens_seen": 624262434,
"step": 106800
},
{
"epoch": 1.1785838260170038,
"grad_norm": 1.9451539516448975,
"learning_rate": 9.716678666411979e-05,
"loss": 4.3863,
"num_input_tokens_seen": 625153506,
"step": 106950
},
{
"epoch": 1.1802368187603656,
"grad_norm": 2.0072357654571533,
"learning_rate": 9.707862526999429e-05,
"loss": 4.378,
"num_input_tokens_seen": 626026690,
"step": 107100
},
{
"epoch": 1.1818898115037275,
"grad_norm": 1.7655397653579712,
"learning_rate": 9.699046387586877e-05,
"loss": 4.3801,
"num_input_tokens_seen": 626902594,
"step": 107250
},
{
"epoch": 1.1835428042470895,
"grad_norm": 1.9583156108856201,
"learning_rate": 9.690230248174325e-05,
"loss": 4.3902,
"num_input_tokens_seen": 627796194,
"step": 107400
},
{
"epoch": 1.1851957969904512,
"grad_norm": 1.7717612981796265,
"learning_rate": 9.681414108761774e-05,
"loss": 4.3812,
"num_input_tokens_seen": 628675970,
"step": 107550
},
{
"epoch": 1.1868487897338131,
"grad_norm": 1.9090009927749634,
"learning_rate": 9.672597969349223e-05,
"loss": 4.3889,
"num_input_tokens_seen": 629549794,
"step": 107700
},
{
"epoch": 1.1885017824771749,
"grad_norm": 1.8910843133926392,
"learning_rate": 9.663781829936672e-05,
"loss": 4.3913,
"num_input_tokens_seen": 630437378,
"step": 107850
},
{
"epoch": 1.1901547752205368,
"grad_norm": 1.840728521347046,
"learning_rate": 9.654965690524121e-05,
"loss": 4.3792,
"num_input_tokens_seen": 631313666,
"step": 108000
},
{
"epoch": 1.1918077679638985,
"grad_norm": 1.8772791624069214,
"learning_rate": 9.64614955111157e-05,
"loss": 4.3813,
"num_input_tokens_seen": 632194466,
"step": 108150
},
{
"epoch": 1.1934607607072605,
"grad_norm": 1.9666273593902588,
"learning_rate": 9.637333411699017e-05,
"loss": 4.3716,
"num_input_tokens_seen": 633058978,
"step": 108300
},
{
"epoch": 1.1951137534506224,
"grad_norm": 1.930409550666809,
"learning_rate": 9.628517272286466e-05,
"loss": 4.3934,
"num_input_tokens_seen": 633935458,
"step": 108450
},
{
"epoch": 1.1967667461939842,
"grad_norm": 1.8000093698501587,
"learning_rate": 9.619701132873915e-05,
"loss": 4.3794,
"num_input_tokens_seen": 634825634,
"step": 108600
},
{
"epoch": 1.198419738937346,
"grad_norm": 1.8369793891906738,
"learning_rate": 9.610884993461364e-05,
"loss": 4.386,
"num_input_tokens_seen": 635701666,
"step": 108750
},
{
"epoch": 1.2000727316807078,
"grad_norm": 1.9381849765777588,
"learning_rate": 9.602068854048813e-05,
"loss": 4.3824,
"num_input_tokens_seen": 636568994,
"step": 108900
},
{
"epoch": 1.2017257244240698,
"grad_norm": 1.8089631795883179,
"learning_rate": 9.593252714636261e-05,
"loss": 4.3733,
"num_input_tokens_seen": 637444034,
"step": 109050
},
{
"epoch": 1.2033787171674317,
"grad_norm": 1.7429847717285156,
"learning_rate": 9.584436575223709e-05,
"loss": 4.3766,
"num_input_tokens_seen": 638321634,
"step": 109200
},
{
"epoch": 1.2050317099107934,
"grad_norm": 1.9182720184326172,
"learning_rate": 9.575620435811159e-05,
"loss": 4.3724,
"num_input_tokens_seen": 639189538,
"step": 109350
},
{
"epoch": 1.2066847026541554,
"grad_norm": 1.9700244665145874,
"learning_rate": 9.566804296398607e-05,
"loss": 4.3859,
"num_input_tokens_seen": 640080354,
"step": 109500
},
{
"epoch": 1.2083376953975171,
"grad_norm": 1.86391019821167,
"learning_rate": 9.557988156986057e-05,
"loss": 4.3875,
"num_input_tokens_seen": 640977634,
"step": 109650
},
{
"epoch": 1.209990688140879,
"grad_norm": 1.9451704025268555,
"learning_rate": 9.549230791836256e-05,
"loss": 4.3928,
"num_input_tokens_seen": 641871874,
"step": 109800
},
{
"epoch": 1.211643680884241,
"grad_norm": 2.063884735107422,
"learning_rate": 9.540414652423704e-05,
"loss": 4.3704,
"num_input_tokens_seen": 642751170,
"step": 109950
},
{
"epoch": 1.2132966736276027,
"grad_norm": 1.8499351739883423,
"learning_rate": 9.531598513011154e-05,
"loss": 4.3886,
"num_input_tokens_seen": 643629698,
"step": 110100
},
{
"epoch": 1.2149496663709647,
"grad_norm": 1.9735474586486816,
"learning_rate": 9.522782373598601e-05,
"loss": 4.3854,
"num_input_tokens_seen": 644509698,
"step": 110250
},
{
"epoch": 1.2166026591143264,
"grad_norm": 1.9430962800979614,
"learning_rate": 9.513966234186051e-05,
"loss": 4.3905,
"num_input_tokens_seen": 645395394,
"step": 110400
},
{
"epoch": 1.2182556518576884,
"grad_norm": 1.9608047008514404,
"learning_rate": 9.505150094773499e-05,
"loss": 4.383,
"num_input_tokens_seen": 646254626,
"step": 110550
},
{
"epoch": 1.21990864460105,
"grad_norm": 1.9237737655639648,
"learning_rate": 9.4963927296237e-05,
"loss": 4.3886,
"num_input_tokens_seen": 647146658,
"step": 110700
},
{
"epoch": 1.221561637344412,
"grad_norm": 1.9678759574890137,
"learning_rate": 9.487576590211147e-05,
"loss": 4.3858,
"num_input_tokens_seen": 648004962,
"step": 110850
},
{
"epoch": 1.223214630087774,
"grad_norm": 1.8643629550933838,
"learning_rate": 9.478760450798597e-05,
"loss": 4.3718,
"num_input_tokens_seen": 648877602,
"step": 111000
},
{
"epoch": 1.2248676228311357,
"grad_norm": 1.8100017309188843,
"learning_rate": 9.469944311386045e-05,
"loss": 4.38,
"num_input_tokens_seen": 649743970,
"step": 111150
},
{
"epoch": 1.2265206155744977,
"grad_norm": 1.8271883726119995,
"learning_rate": 9.461128171973495e-05,
"loss": 4.3911,
"num_input_tokens_seen": 650620130,
"step": 111300
},
{
"epoch": 1.2281736083178596,
"grad_norm": 1.9749687910079956,
"learning_rate": 9.452312032560942e-05,
"loss": 4.3715,
"num_input_tokens_seen": 651492738,
"step": 111450
},
{
"epoch": 1.2298266010612213,
"grad_norm": 1.9666537046432495,
"learning_rate": 9.44349589314839e-05,
"loss": 4.3823,
"num_input_tokens_seen": 652359170,
"step": 111600
},
{
"epoch": 1.2314795938045833,
"grad_norm": 1.9260027408599854,
"learning_rate": 9.43467975373584e-05,
"loss": 4.3862,
"num_input_tokens_seen": 653229570,
"step": 111750
},
{
"epoch": 1.233132586547945,
"grad_norm": 1.8240337371826172,
"learning_rate": 9.425863614323288e-05,
"loss": 4.3771,
"num_input_tokens_seen": 654109090,
"step": 111900
},
{
"epoch": 1.234785579291307,
"grad_norm": 1.957507848739624,
"learning_rate": 9.417047474910738e-05,
"loss": 4.3817,
"num_input_tokens_seen": 654980482,
"step": 112050
},
{
"epoch": 1.2364385720346687,
"grad_norm": 1.8944330215454102,
"learning_rate": 9.408231335498185e-05,
"loss": 4.3812,
"num_input_tokens_seen": 655849634,
"step": 112200
},
{
"epoch": 1.2380915647780306,
"grad_norm": 1.8677889108657837,
"learning_rate": 9.399415196085636e-05,
"loss": 4.3803,
"num_input_tokens_seen": 656736738,
"step": 112350
},
{
"epoch": 1.2397445575213926,
"grad_norm": 1.8283082246780396,
"learning_rate": 9.390599056673083e-05,
"loss": 4.3933,
"num_input_tokens_seen": 657615938,
"step": 112500
},
{
"epoch": 1.2413975502647543,
"grad_norm": 1.9106853008270264,
"learning_rate": 9.381782917260533e-05,
"loss": 4.3847,
"num_input_tokens_seen": 658494850,
"step": 112650
},
{
"epoch": 1.2430505430081162,
"grad_norm": 1.8882030248641968,
"learning_rate": 9.372966777847981e-05,
"loss": 4.3862,
"num_input_tokens_seen": 659363618,
"step": 112800
},
{
"epoch": 1.244703535751478,
"grad_norm": 1.964934229850769,
"learning_rate": 9.36415063843543e-05,
"loss": 4.3805,
"num_input_tokens_seen": 660234946,
"step": 112950
},
{
"epoch": 1.24635652849484,
"grad_norm": 1.8856420516967773,
"learning_rate": 9.355334499022878e-05,
"loss": 4.3794,
"num_input_tokens_seen": 661115810,
"step": 113100
},
{
"epoch": 1.2480095212382019,
"grad_norm": 1.8618583679199219,
"learning_rate": 9.346518359610327e-05,
"loss": 4.3883,
"num_input_tokens_seen": 661994434,
"step": 113250
},
{
"epoch": 1.2496625139815636,
"grad_norm": 1.9158508777618408,
"learning_rate": 9.337702220197776e-05,
"loss": 4.3739,
"num_input_tokens_seen": 662868834,
"step": 113400
},
{
"epoch": 1.2513155067249255,
"grad_norm": 1.8499860763549805,
"learning_rate": 9.328886080785225e-05,
"loss": 4.379,
"num_input_tokens_seen": 663752002,
"step": 113550
},
{
"epoch": 1.2529684994682873,
"grad_norm": 1.8565645217895508,
"learning_rate": 9.320069941372673e-05,
"loss": 4.3854,
"num_input_tokens_seen": 664622402,
"step": 113700
},
{
"epoch": 1.2546214922116492,
"grad_norm": 2.060188055038452,
"learning_rate": 9.311253801960123e-05,
"loss": 4.3758,
"num_input_tokens_seen": 665495618,
"step": 113850
},
{
"epoch": 1.2562744849550112,
"grad_norm": 1.892635464668274,
"learning_rate": 9.30243766254757e-05,
"loss": 4.3884,
"num_input_tokens_seen": 666361922,
"step": 114000
},
{
"epoch": 1.2579274776983729,
"grad_norm": 1.9154144525527954,
"learning_rate": 9.29362152313502e-05,
"loss": 4.3752,
"num_input_tokens_seen": 667241410,
"step": 114150
},
{
"epoch": 1.2595804704417348,
"grad_norm": 1.9253753423690796,
"learning_rate": 9.284805383722468e-05,
"loss": 4.3875,
"num_input_tokens_seen": 668132226,
"step": 114300
},
{
"epoch": 1.2612334631850965,
"grad_norm": 1.9465709924697876,
"learning_rate": 9.275989244309918e-05,
"loss": 4.3742,
"num_input_tokens_seen": 669015202,
"step": 114450
},
{
"epoch": 1.2628864559284585,
"grad_norm": 1.9070016145706177,
"learning_rate": 9.267173104897366e-05,
"loss": 4.3737,
"num_input_tokens_seen": 669892578,
"step": 114600
},
{
"epoch": 1.2645394486718202,
"grad_norm": 1.9075013399124146,
"learning_rate": 9.258356965484816e-05,
"loss": 4.3789,
"num_input_tokens_seen": 670773314,
"step": 114750
},
{
"epoch": 1.2661924414151822,
"grad_norm": 1.8648816347122192,
"learning_rate": 9.249540826072263e-05,
"loss": 4.3583,
"num_input_tokens_seen": 671644514,
"step": 114900
},
{
"epoch": 1.2678454341585441,
"grad_norm": 1.9572055339813232,
"learning_rate": 9.240724686659714e-05,
"loss": 4.3871,
"num_input_tokens_seen": 672523202,
"step": 115050
},
{
"epoch": 1.2694984269019058,
"grad_norm": 1.9419187307357788,
"learning_rate": 9.231908547247161e-05,
"loss": 4.3802,
"num_input_tokens_seen": 673387298,
"step": 115200
},
{
"epoch": 1.2711514196452678,
"grad_norm": 1.9556363821029663,
"learning_rate": 9.223092407834611e-05,
"loss": 4.3922,
"num_input_tokens_seen": 674262786,
"step": 115350
},
{
"epoch": 1.2728044123886297,
"grad_norm": 1.8693435192108154,
"learning_rate": 9.214276268422059e-05,
"loss": 4.3719,
"num_input_tokens_seen": 675145058,
"step": 115500
},
{
"epoch": 1.2744574051319915,
"grad_norm": 1.9475206136703491,
"learning_rate": 9.205460129009508e-05,
"loss": 4.38,
"num_input_tokens_seen": 676008962,
"step": 115650
},
{
"epoch": 1.2761103978753534,
"grad_norm": 1.8718332052230835,
"learning_rate": 9.196643989596957e-05,
"loss": 4.3734,
"num_input_tokens_seen": 676887042,
"step": 115800
},
{
"epoch": 1.2777633906187151,
"grad_norm": 1.8318613767623901,
"learning_rate": 9.187827850184405e-05,
"loss": 4.3857,
"num_input_tokens_seen": 677766690,
"step": 115950
}
],
"logging_steps": 150,
"max_steps": 272232,
"num_input_tokens_seen": 678060130,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.04696823656832e+16,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}