t5_law / trainer_state.json
hghaan's picture
update file
f9e562e verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.0,
"eval_steps": 500,
"global_step": 17216,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.023234200743494422,
"grad_norm": 0.33875614404678345,
"learning_rate": 1.1149825783972125e-06,
"loss": 3.2044,
"step": 100
},
{
"epoch": 0.046468401486988845,
"grad_norm": 0.09059225022792816,
"learning_rate": 2.2764227642276426e-06,
"loss": 3.1582,
"step": 200
},
{
"epoch": 0.06970260223048327,
"grad_norm": 0.24917162954807281,
"learning_rate": 3.4378629500580724e-06,
"loss": 3.1608,
"step": 300
},
{
"epoch": 0.09293680297397769,
"grad_norm": 0.465605229139328,
"learning_rate": 4.599303135888502e-06,
"loss": 3.1981,
"step": 400
},
{
"epoch": 0.11617100371747212,
"grad_norm": 0.27495619654655457,
"learning_rate": 5.7607433217189324e-06,
"loss": 3.1815,
"step": 500
},
{
"epoch": 0.13940520446096655,
"grad_norm": 0.19188807904720306,
"learning_rate": 6.922183507549362e-06,
"loss": 3.1294,
"step": 600
},
{
"epoch": 0.16263940520446096,
"grad_norm": 0.5246957540512085,
"learning_rate": 8.083623693379791e-06,
"loss": 3.0677,
"step": 700
},
{
"epoch": 0.18587360594795538,
"grad_norm": 0.258408784866333,
"learning_rate": 9.24506387921022e-06,
"loss": 2.9832,
"step": 800
},
{
"epoch": 0.20910780669144982,
"grad_norm": 0.31014084815979004,
"learning_rate": 1.0406504065040652e-05,
"loss": 2.9743,
"step": 900
},
{
"epoch": 0.23234200743494424,
"grad_norm": 0.4873325824737549,
"learning_rate": 1.1567944250871081e-05,
"loss": 2.8721,
"step": 1000
},
{
"epoch": 0.2555762081784387,
"grad_norm": 0.7442412972450256,
"learning_rate": 1.272938443670151e-05,
"loss": 2.7949,
"step": 1100
},
{
"epoch": 0.2788104089219331,
"grad_norm": 0.6129536628723145,
"learning_rate": 1.389082462253194e-05,
"loss": 2.6381,
"step": 1200
},
{
"epoch": 0.3020446096654275,
"grad_norm": 0.5687291026115417,
"learning_rate": 1.5052264808362371e-05,
"loss": 2.4031,
"step": 1300
},
{
"epoch": 0.3252788104089219,
"grad_norm": 0.6154528856277466,
"learning_rate": 1.62137049941928e-05,
"loss": 2.107,
"step": 1400
},
{
"epoch": 0.34851301115241634,
"grad_norm": 0.8730382323265076,
"learning_rate": 1.7375145180023228e-05,
"loss": 1.981,
"step": 1500
},
{
"epoch": 0.37174721189591076,
"grad_norm": 0.6668545603752136,
"learning_rate": 1.8536585365853663e-05,
"loss": 1.9311,
"step": 1600
},
{
"epoch": 0.3949814126394052,
"grad_norm": 0.6021186709403992,
"learning_rate": 1.969802555168409e-05,
"loss": 1.8733,
"step": 1700
},
{
"epoch": 0.41821561338289964,
"grad_norm": 0.8815124034881592,
"learning_rate": 1.9904479153220604e-05,
"loss": 1.8434,
"step": 1800
},
{
"epoch": 0.44144981412639406,
"grad_norm": 1.1727079153060913,
"learning_rate": 1.9775396927843037e-05,
"loss": 1.8051,
"step": 1900
},
{
"epoch": 0.4646840148698885,
"grad_norm": 1.1215996742248535,
"learning_rate": 1.964631470246547e-05,
"loss": 1.733,
"step": 2000
},
{
"epoch": 0.4879182156133829,
"grad_norm": 1.1965365409851074,
"learning_rate": 1.9517232477087907e-05,
"loss": 1.6994,
"step": 2100
},
{
"epoch": 0.5111524163568774,
"grad_norm": 1.2489936351776123,
"learning_rate": 1.938815025171034e-05,
"loss": 1.6529,
"step": 2200
},
{
"epoch": 0.5343866171003717,
"grad_norm": 1.5988222360610962,
"learning_rate": 1.9259068026332776e-05,
"loss": 1.5897,
"step": 2300
},
{
"epoch": 0.5576208178438662,
"grad_norm": 0.6558517217636108,
"learning_rate": 1.912998580095521e-05,
"loss": 1.5099,
"step": 2400
},
{
"epoch": 0.5808550185873605,
"grad_norm": 0.7629631757736206,
"learning_rate": 1.900219439783142e-05,
"loss": 1.4466,
"step": 2500
},
{
"epoch": 0.604089219330855,
"grad_norm": 0.9707331657409668,
"learning_rate": 1.8873112172453855e-05,
"loss": 1.3949,
"step": 2600
},
{
"epoch": 0.6273234200743495,
"grad_norm": 0.849176287651062,
"learning_rate": 1.874402994707629e-05,
"loss": 1.3449,
"step": 2700
},
{
"epoch": 0.6505576208178439,
"grad_norm": 0.460151731967926,
"learning_rate": 1.8614947721698724e-05,
"loss": 1.3182,
"step": 2800
},
{
"epoch": 0.6737918215613383,
"grad_norm": 0.652923047542572,
"learning_rate": 1.8485865496321157e-05,
"loss": 1.2623,
"step": 2900
},
{
"epoch": 0.6970260223048327,
"grad_norm": 0.5269683599472046,
"learning_rate": 1.8356783270943594e-05,
"loss": 1.2059,
"step": 3000
},
{
"epoch": 0.7202602230483272,
"grad_norm": 0.6761623024940491,
"learning_rate": 1.8227701045566027e-05,
"loss": 1.1477,
"step": 3100
},
{
"epoch": 0.7434944237918215,
"grad_norm": 0.4611155390739441,
"learning_rate": 1.809861882018846e-05,
"loss": 1.1063,
"step": 3200
},
{
"epoch": 0.766728624535316,
"grad_norm": 1.20090913772583,
"learning_rate": 1.7969536594810896e-05,
"loss": 1.0812,
"step": 3300
},
{
"epoch": 0.7899628252788105,
"grad_norm": 0.5198754072189331,
"learning_rate": 1.7840454369433332e-05,
"loss": 1.0637,
"step": 3400
},
{
"epoch": 0.8131970260223048,
"grad_norm": 0.7287588119506836,
"learning_rate": 1.7711372144055765e-05,
"loss": 1.0311,
"step": 3500
},
{
"epoch": 0.8364312267657993,
"grad_norm": 0.850121021270752,
"learning_rate": 1.75822899186782e-05,
"loss": 0.9687,
"step": 3600
},
{
"epoch": 0.8596654275092936,
"grad_norm": 0.5256717801094055,
"learning_rate": 1.7453207693300635e-05,
"loss": 0.8706,
"step": 3700
},
{
"epoch": 0.8828996282527881,
"grad_norm": 0.6515185236930847,
"learning_rate": 1.7324125467923068e-05,
"loss": 0.8474,
"step": 3800
},
{
"epoch": 0.9061338289962825,
"grad_norm": 0.8604176640510559,
"learning_rate": 1.7195043242545504e-05,
"loss": 0.8302,
"step": 3900
},
{
"epoch": 0.929368029739777,
"grad_norm": 0.3369189202785492,
"learning_rate": 1.7065961017167937e-05,
"loss": 0.7959,
"step": 4000
},
{
"epoch": 0.9526022304832714,
"grad_norm": 0.4804532527923584,
"learning_rate": 1.6936878791790373e-05,
"loss": 0.7945,
"step": 4100
},
{
"epoch": 0.9758364312267658,
"grad_norm": 0.3839660882949829,
"learning_rate": 1.6807796566412806e-05,
"loss": 0.7975,
"step": 4200
},
{
"epoch": 0.9990706319702602,
"grad_norm": 0.31136325001716614,
"learning_rate": 1.667871434103524e-05,
"loss": 0.7804,
"step": 4300
},
{
"epoch": 1.0223048327137547,
"grad_norm": 0.2822754681110382,
"learning_rate": 1.6549632115657676e-05,
"loss": 0.7502,
"step": 4400
},
{
"epoch": 1.045539033457249,
"grad_norm": 0.3364527225494385,
"learning_rate": 1.6420549890280112e-05,
"loss": 0.747,
"step": 4500
},
{
"epoch": 1.0687732342007434,
"grad_norm": 0.45242545008659363,
"learning_rate": 1.6291467664902545e-05,
"loss": 0.7263,
"step": 4600
},
{
"epoch": 1.092007434944238,
"grad_norm": 0.2541595995426178,
"learning_rate": 1.6162385439524978e-05,
"loss": 0.7311,
"step": 4700
},
{
"epoch": 1.1152416356877324,
"grad_norm": 0.32410866022109985,
"learning_rate": 1.6033303214147415e-05,
"loss": 0.7213,
"step": 4800
},
{
"epoch": 1.1384758364312269,
"grad_norm": 0.28702208399772644,
"learning_rate": 1.5904220988769848e-05,
"loss": 0.7103,
"step": 4900
},
{
"epoch": 1.161710037174721,
"grad_norm": 0.2637524902820587,
"learning_rate": 1.577513876339228e-05,
"loss": 0.7033,
"step": 5000
},
{
"epoch": 1.1849442379182156,
"grad_norm": 0.38048645853996277,
"learning_rate": 1.5646056538014717e-05,
"loss": 0.7111,
"step": 5100
},
{
"epoch": 1.20817843866171,
"grad_norm": 0.22926197946071625,
"learning_rate": 1.5516974312637153e-05,
"loss": 0.7053,
"step": 5200
},
{
"epoch": 1.2314126394052045,
"grad_norm": 0.2666023373603821,
"learning_rate": 1.5387892087259586e-05,
"loss": 0.6915,
"step": 5300
},
{
"epoch": 1.2546468401486988,
"grad_norm": 0.2618410587310791,
"learning_rate": 1.525880986188202e-05,
"loss": 0.6843,
"step": 5400
},
{
"epoch": 1.2778810408921932,
"grad_norm": 0.24479706585407257,
"learning_rate": 1.5129727636504454e-05,
"loss": 0.6775,
"step": 5500
},
{
"epoch": 1.3011152416356877,
"grad_norm": 0.19555561244487762,
"learning_rate": 1.5000645411126889e-05,
"loss": 0.6601,
"step": 5600
},
{
"epoch": 1.3243494423791822,
"grad_norm": 0.2121550738811493,
"learning_rate": 1.4871563185749323e-05,
"loss": 0.6625,
"step": 5700
},
{
"epoch": 1.3475836431226766,
"grad_norm": 0.36492133140563965,
"learning_rate": 1.474248096037176e-05,
"loss": 0.6567,
"step": 5800
},
{
"epoch": 1.370817843866171,
"grad_norm": 0.28411343693733215,
"learning_rate": 1.4613398734994193e-05,
"loss": 0.6424,
"step": 5900
},
{
"epoch": 1.3940520446096654,
"grad_norm": 0.3487832248210907,
"learning_rate": 1.4484316509616627e-05,
"loss": 0.6508,
"step": 6000
},
{
"epoch": 1.4172862453531598,
"grad_norm": 0.4025629758834839,
"learning_rate": 1.4355234284239062e-05,
"loss": 0.6374,
"step": 6100
},
{
"epoch": 1.4405204460966543,
"grad_norm": 0.31936919689178467,
"learning_rate": 1.4226152058861495e-05,
"loss": 0.6462,
"step": 6200
},
{
"epoch": 1.4637546468401488,
"grad_norm": 0.27360206842422485,
"learning_rate": 1.409706983348393e-05,
"loss": 0.6382,
"step": 6300
},
{
"epoch": 1.486988847583643,
"grad_norm": 0.35483697056770325,
"learning_rate": 1.3967987608106366e-05,
"loss": 0.6274,
"step": 6400
},
{
"epoch": 1.5102230483271375,
"grad_norm": 0.30311813950538635,
"learning_rate": 1.38389053827288e-05,
"loss": 0.6258,
"step": 6500
},
{
"epoch": 1.533457249070632,
"grad_norm": 0.3184954524040222,
"learning_rate": 1.3709823157351234e-05,
"loss": 0.6313,
"step": 6600
},
{
"epoch": 1.5566914498141264,
"grad_norm": 0.2632908821105957,
"learning_rate": 1.3580740931973668e-05,
"loss": 0.6217,
"step": 6700
},
{
"epoch": 1.579925650557621,
"grad_norm": 0.22145096957683563,
"learning_rate": 1.3451658706596103e-05,
"loss": 0.6245,
"step": 6800
},
{
"epoch": 1.6031598513011152,
"grad_norm": 0.5008528828620911,
"learning_rate": 1.3322576481218536e-05,
"loss": 0.6187,
"step": 6900
},
{
"epoch": 1.6263940520446096,
"grad_norm": 0.25452372431755066,
"learning_rate": 1.3193494255840972e-05,
"loss": 0.6084,
"step": 7000
},
{
"epoch": 1.649628252788104,
"grad_norm": 0.3917735815048218,
"learning_rate": 1.3064412030463407e-05,
"loss": 0.6088,
"step": 7100
},
{
"epoch": 1.6728624535315983,
"grad_norm": 0.28736940026283264,
"learning_rate": 1.2935329805085842e-05,
"loss": 0.6084,
"step": 7200
},
{
"epoch": 1.696096654275093,
"grad_norm": 0.3900860548019409,
"learning_rate": 1.2807538401962051e-05,
"loss": 0.6017,
"step": 7300
},
{
"epoch": 1.7193308550185873,
"grad_norm": 0.2482582926750183,
"learning_rate": 1.2678456176584486e-05,
"loss": 0.5964,
"step": 7400
},
{
"epoch": 1.7425650557620818,
"grad_norm": 0.2464774250984192,
"learning_rate": 1.254937395120692e-05,
"loss": 0.5929,
"step": 7500
},
{
"epoch": 1.7657992565055762,
"grad_norm": 0.36112162470817566,
"learning_rate": 1.2420291725829354e-05,
"loss": 0.5913,
"step": 7600
},
{
"epoch": 1.7890334572490705,
"grad_norm": 0.30204829573631287,
"learning_rate": 1.2291209500451788e-05,
"loss": 0.5804,
"step": 7700
},
{
"epoch": 1.8122676579925652,
"grad_norm": 0.2731075584888458,
"learning_rate": 1.2162127275074223e-05,
"loss": 0.5881,
"step": 7800
},
{
"epoch": 1.8355018587360594,
"grad_norm": 0.24604862928390503,
"learning_rate": 1.2033045049696656e-05,
"loss": 0.5679,
"step": 7900
},
{
"epoch": 1.858736059479554,
"grad_norm": 0.3449194133281708,
"learning_rate": 1.1903962824319092e-05,
"loss": 0.582,
"step": 8000
},
{
"epoch": 1.8819702602230484,
"grad_norm": 0.310375452041626,
"learning_rate": 1.1774880598941527e-05,
"loss": 0.575,
"step": 8100
},
{
"epoch": 1.9052044609665426,
"grad_norm": 0.28315114974975586,
"learning_rate": 1.1645798373563962e-05,
"loss": 0.5722,
"step": 8200
},
{
"epoch": 1.9284386617100373,
"grad_norm": 0.3091906011104584,
"learning_rate": 1.1516716148186395e-05,
"loss": 0.5533,
"step": 8300
},
{
"epoch": 1.9516728624535316,
"grad_norm": 0.28990840911865234,
"learning_rate": 1.138763392280883e-05,
"loss": 0.5724,
"step": 8400
},
{
"epoch": 1.974907063197026,
"grad_norm": 0.44591304659843445,
"learning_rate": 1.1258551697431264e-05,
"loss": 0.5701,
"step": 8500
},
{
"epoch": 1.9981412639405205,
"grad_norm": 0.26404786109924316,
"learning_rate": 1.11294694720537e-05,
"loss": 0.553,
"step": 8600
},
{
"epoch": 2.0213754646840147,
"grad_norm": 0.2843058705329895,
"learning_rate": 1.1000387246676133e-05,
"loss": 0.5631,
"step": 8700
},
{
"epoch": 2.0446096654275094,
"grad_norm": 0.20029422640800476,
"learning_rate": 1.0871305021298568e-05,
"loss": 0.5495,
"step": 8800
},
{
"epoch": 2.0678438661710037,
"grad_norm": 0.26215997338294983,
"learning_rate": 1.0742222795921003e-05,
"loss": 0.5562,
"step": 8900
},
{
"epoch": 2.091078066914498,
"grad_norm": 0.29611942172050476,
"learning_rate": 1.0613140570543436e-05,
"loss": 0.5541,
"step": 9000
},
{
"epoch": 2.1143122676579926,
"grad_norm": 0.2809213697910309,
"learning_rate": 1.048405834516587e-05,
"loss": 0.5429,
"step": 9100
},
{
"epoch": 2.137546468401487,
"grad_norm": 0.4684973657131195,
"learning_rate": 1.0354976119788307e-05,
"loss": 0.5518,
"step": 9200
},
{
"epoch": 2.1607806691449816,
"grad_norm": 0.2790776193141937,
"learning_rate": 1.0225893894410741e-05,
"loss": 0.5485,
"step": 9300
},
{
"epoch": 2.184014869888476,
"grad_norm": 0.24624982476234436,
"learning_rate": 1.0096811669033174e-05,
"loss": 0.5434,
"step": 9400
},
{
"epoch": 2.20724907063197,
"grad_norm": 0.27161070704460144,
"learning_rate": 9.967729443655609e-06,
"loss": 0.5503,
"step": 9500
},
{
"epoch": 2.2304832713754648,
"grad_norm": 0.2635902166366577,
"learning_rate": 9.838647218278044e-06,
"loss": 0.538,
"step": 9600
},
{
"epoch": 2.253717472118959,
"grad_norm": 0.35729700326919556,
"learning_rate": 9.709564992900478e-06,
"loss": 0.5376,
"step": 9700
},
{
"epoch": 2.2769516728624537,
"grad_norm": 0.224281907081604,
"learning_rate": 9.580482767522913e-06,
"loss": 0.5423,
"step": 9800
},
{
"epoch": 2.300185873605948,
"grad_norm": 0.2016523778438568,
"learning_rate": 9.451400542145348e-06,
"loss": 0.54,
"step": 9900
},
{
"epoch": 2.323420074349442,
"grad_norm": 0.3719424605369568,
"learning_rate": 9.322318316767782e-06,
"loss": 0.5326,
"step": 10000
},
{
"epoch": 2.346654275092937,
"grad_norm": 0.22268572449684143,
"learning_rate": 9.193236091390217e-06,
"loss": 0.5379,
"step": 10100
},
{
"epoch": 2.369888475836431,
"grad_norm": 0.3181590735912323,
"learning_rate": 9.06415386601265e-06,
"loss": 0.5328,
"step": 10200
},
{
"epoch": 2.393122676579926,
"grad_norm": 0.2703763246536255,
"learning_rate": 8.935071640635087e-06,
"loss": 0.5276,
"step": 10300
},
{
"epoch": 2.41635687732342,
"grad_norm": 0.2698732912540436,
"learning_rate": 8.80598941525752e-06,
"loss": 0.5338,
"step": 10400
},
{
"epoch": 2.4395910780669143,
"grad_norm": 0.2765790820121765,
"learning_rate": 8.676907189879954e-06,
"loss": 0.5418,
"step": 10500
},
{
"epoch": 2.462825278810409,
"grad_norm": 0.36516493558883667,
"learning_rate": 8.547824964502389e-06,
"loss": 0.5249,
"step": 10600
},
{
"epoch": 2.4860594795539033,
"grad_norm": 0.23371903598308563,
"learning_rate": 8.418742739124824e-06,
"loss": 0.5318,
"step": 10700
},
{
"epoch": 2.5092936802973975,
"grad_norm": 0.23883387446403503,
"learning_rate": 8.289660513747258e-06,
"loss": 0.5336,
"step": 10800
},
{
"epoch": 2.532527881040892,
"grad_norm": 0.23600026965141296,
"learning_rate": 8.160578288369693e-06,
"loss": 0.5207,
"step": 10900
},
{
"epoch": 2.5557620817843865,
"grad_norm": 0.22283987700939178,
"learning_rate": 8.031496062992128e-06,
"loss": 0.5261,
"step": 11000
},
{
"epoch": 2.578996282527881,
"grad_norm": 0.3077383041381836,
"learning_rate": 7.90241383761456e-06,
"loss": 0.5117,
"step": 11100
},
{
"epoch": 2.6022304832713754,
"grad_norm": 0.24372899532318115,
"learning_rate": 7.773331612236995e-06,
"loss": 0.5251,
"step": 11200
},
{
"epoch": 2.6254646840148697,
"grad_norm": 0.3168962001800537,
"learning_rate": 7.64424938685943e-06,
"loss": 0.5238,
"step": 11300
},
{
"epoch": 2.6486988847583643,
"grad_norm": 0.2522094249725342,
"learning_rate": 7.515167161481865e-06,
"loss": 0.5141,
"step": 11400
},
{
"epoch": 2.6719330855018586,
"grad_norm": 0.4139024317264557,
"learning_rate": 7.3860849361042984e-06,
"loss": 0.5185,
"step": 11500
},
{
"epoch": 2.6951672862453533,
"grad_norm": 0.2781153619289398,
"learning_rate": 7.257002710726734e-06,
"loss": 0.5121,
"step": 11600
},
{
"epoch": 2.7184014869888475,
"grad_norm": 0.38515913486480713,
"learning_rate": 7.127920485349168e-06,
"loss": 0.5178,
"step": 11700
},
{
"epoch": 2.741635687732342,
"grad_norm": 0.33289971947669983,
"learning_rate": 6.998838259971602e-06,
"loss": 0.5124,
"step": 11800
},
{
"epoch": 2.7648698884758365,
"grad_norm": 0.36876046657562256,
"learning_rate": 6.871046856847813e-06,
"loss": 0.5137,
"step": 11900
},
{
"epoch": 2.7881040892193307,
"grad_norm": 0.28098130226135254,
"learning_rate": 6.7419646314702466e-06,
"loss": 0.509,
"step": 12000
},
{
"epoch": 2.8113382899628254,
"grad_norm": 0.32521939277648926,
"learning_rate": 6.612882406092681e-06,
"loss": 0.512,
"step": 12100
},
{
"epoch": 2.8345724907063197,
"grad_norm": 0.23627902567386627,
"learning_rate": 6.483800180715116e-06,
"loss": 0.5084,
"step": 12200
},
{
"epoch": 2.857806691449814,
"grad_norm": 0.23111554980278015,
"learning_rate": 6.354717955337551e-06,
"loss": 0.517,
"step": 12300
},
{
"epoch": 2.8810408921933086,
"grad_norm": 0.3062553107738495,
"learning_rate": 6.2256357299599844e-06,
"loss": 0.5063,
"step": 12400
},
{
"epoch": 2.904275092936803,
"grad_norm": 0.3274383842945099,
"learning_rate": 6.09655350458242e-06,
"loss": 0.5066,
"step": 12500
},
{
"epoch": 2.9275092936802976,
"grad_norm": 0.25803956389427185,
"learning_rate": 5.967471279204854e-06,
"loss": 0.5064,
"step": 12600
},
{
"epoch": 2.950743494423792,
"grad_norm": 0.29026666283607483,
"learning_rate": 5.838389053827288e-06,
"loss": 0.5088,
"step": 12700
},
{
"epoch": 2.973977695167286,
"grad_norm": 0.36228805780410767,
"learning_rate": 5.709306828449723e-06,
"loss": 0.507,
"step": 12800
},
{
"epoch": 2.9972118959107807,
"grad_norm": 0.2669726014137268,
"learning_rate": 5.580224603072157e-06,
"loss": 0.4934,
"step": 12900
},
{
"epoch": 3.020446096654275,
"grad_norm": 0.24396216869354248,
"learning_rate": 5.451142377694592e-06,
"loss": 0.5099,
"step": 13000
},
{
"epoch": 3.0436802973977697,
"grad_norm": 0.25540581345558167,
"learning_rate": 5.322060152317027e-06,
"loss": 0.5037,
"step": 13100
},
{
"epoch": 3.066914498141264,
"grad_norm": 0.1964583396911621,
"learning_rate": 5.192977926939461e-06,
"loss": 0.5055,
"step": 13200
},
{
"epoch": 3.090148698884758,
"grad_norm": 0.2318154275417328,
"learning_rate": 5.063895701561895e-06,
"loss": 0.5041,
"step": 13300
},
{
"epoch": 3.113382899628253,
"grad_norm": 0.28110265731811523,
"learning_rate": 4.9348134761843295e-06,
"loss": 0.5043,
"step": 13400
},
{
"epoch": 3.136617100371747,
"grad_norm": 0.3360753357410431,
"learning_rate": 4.805731250806764e-06,
"loss": 0.4915,
"step": 13500
},
{
"epoch": 3.159851301115242,
"grad_norm": 0.3044135868549347,
"learning_rate": 4.676649025429199e-06,
"loss": 0.499,
"step": 13600
},
{
"epoch": 3.183085501858736,
"grad_norm": 0.28163620829582214,
"learning_rate": 4.547566800051634e-06,
"loss": 0.4996,
"step": 13700
},
{
"epoch": 3.2063197026022303,
"grad_norm": 0.23853909969329834,
"learning_rate": 4.418484574674068e-06,
"loss": 0.5073,
"step": 13800
},
{
"epoch": 3.229553903345725,
"grad_norm": 0.25510174036026,
"learning_rate": 4.289402349296502e-06,
"loss": 0.4988,
"step": 13900
},
{
"epoch": 3.2527881040892193,
"grad_norm": 0.650174081325531,
"learning_rate": 4.160320123918937e-06,
"loss": 0.5024,
"step": 14000
},
{
"epoch": 3.276022304832714,
"grad_norm": 0.36293137073516846,
"learning_rate": 4.0312378985413715e-06,
"loss": 0.4913,
"step": 14100
},
{
"epoch": 3.299256505576208,
"grad_norm": 0.35399818420410156,
"learning_rate": 3.902155673163805e-06,
"loss": 0.4993,
"step": 14200
},
{
"epoch": 3.3224907063197024,
"grad_norm": 0.2553289830684662,
"learning_rate": 3.7730734477862404e-06,
"loss": 0.5017,
"step": 14300
},
{
"epoch": 3.345724907063197,
"grad_norm": 0.25535061955451965,
"learning_rate": 3.643991222408675e-06,
"loss": 0.4895,
"step": 14400
},
{
"epoch": 3.3689591078066914,
"grad_norm": 0.2772742509841919,
"learning_rate": 3.514908997031109e-06,
"loss": 0.4954,
"step": 14500
},
{
"epoch": 3.392193308550186,
"grad_norm": 0.26105812191963196,
"learning_rate": 3.387117593907319e-06,
"loss": 0.4964,
"step": 14600
},
{
"epoch": 3.4154275092936803,
"grad_norm": 0.2538992166519165,
"learning_rate": 3.258035368529754e-06,
"loss": 0.4985,
"step": 14700
},
{
"epoch": 3.4386617100371746,
"grad_norm": 0.2889178693294525,
"learning_rate": 3.128953143152188e-06,
"loss": 0.4969,
"step": 14800
},
{
"epoch": 3.4618959107806693,
"grad_norm": 0.28792130947113037,
"learning_rate": 2.9998709177746228e-06,
"loss": 0.4985,
"step": 14900
},
{
"epoch": 3.4851301115241635,
"grad_norm": 0.36826494336128235,
"learning_rate": 2.8707886923970575e-06,
"loss": 0.4937,
"step": 15000
},
{
"epoch": 3.508364312267658,
"grad_norm": 0.24432937800884247,
"learning_rate": 2.7417064670194917e-06,
"loss": 0.4892,
"step": 15100
},
{
"epoch": 3.5315985130111525,
"grad_norm": 0.36436623334884644,
"learning_rate": 2.6126242416419264e-06,
"loss": 0.5029,
"step": 15200
},
{
"epoch": 3.5548327137546467,
"grad_norm": 0.3257830739021301,
"learning_rate": 2.4835420162643606e-06,
"loss": 0.484,
"step": 15300
},
{
"epoch": 3.5780669144981414,
"grad_norm": 0.20910651981830597,
"learning_rate": 2.354459790886795e-06,
"loss": 0.4934,
"step": 15400
},
{
"epoch": 3.6013011152416357,
"grad_norm": 0.27706313133239746,
"learning_rate": 2.2253775655092296e-06,
"loss": 0.4972,
"step": 15500
},
{
"epoch": 3.6245353159851303,
"grad_norm": 0.28043028712272644,
"learning_rate": 2.0962953401316643e-06,
"loss": 0.4878,
"step": 15600
},
{
"epoch": 3.6477695167286246,
"grad_norm": 0.34835153818130493,
"learning_rate": 1.9672131147540985e-06,
"loss": 0.4954,
"step": 15700
},
{
"epoch": 3.671003717472119,
"grad_norm": 0.3561202585697174,
"learning_rate": 1.838130889376533e-06,
"loss": 0.4992,
"step": 15800
},
{
"epoch": 3.6942379182156135,
"grad_norm": 0.2767621576786041,
"learning_rate": 1.7090486639989677e-06,
"loss": 0.4982,
"step": 15900
},
{
"epoch": 3.717472118959108,
"grad_norm": 0.22851090133190155,
"learning_rate": 1.579966438621402e-06,
"loss": 0.498,
"step": 16000
},
{
"epoch": 3.7407063197026025,
"grad_norm": 0.28282201290130615,
"learning_rate": 1.4508842132438364e-06,
"loss": 0.4898,
"step": 16100
},
{
"epoch": 3.7639405204460967,
"grad_norm": 0.24474182724952698,
"learning_rate": 1.3218019878662709e-06,
"loss": 0.501,
"step": 16200
},
{
"epoch": 3.787174721189591,
"grad_norm": 0.27427938580513,
"learning_rate": 1.1927197624887055e-06,
"loss": 0.4966,
"step": 16300
},
{
"epoch": 3.8104089219330852,
"grad_norm": 0.38391393423080444,
"learning_rate": 1.0636375371111398e-06,
"loss": 0.4941,
"step": 16400
},
{
"epoch": 3.83364312267658,
"grad_norm": 0.3098974823951721,
"learning_rate": 9.345553117335744e-07,
"loss": 0.4879,
"step": 16500
},
{
"epoch": 3.8568773234200746,
"grad_norm": 0.2817577123641968,
"learning_rate": 8.054730863560088e-07,
"loss": 0.4925,
"step": 16600
},
{
"epoch": 3.880111524163569,
"grad_norm": 0.3037372827529907,
"learning_rate": 6.763908609784433e-07,
"loss": 0.4927,
"step": 16700
},
{
"epoch": 3.903345724907063,
"grad_norm": 0.2850995659828186,
"learning_rate": 5.473086356008779e-07,
"loss": 0.4909,
"step": 16800
},
{
"epoch": 3.9265799256505574,
"grad_norm": 0.25115731358528137,
"learning_rate": 4.182264102233123e-07,
"loss": 0.5,
"step": 16900
},
{
"epoch": 3.949814126394052,
"grad_norm": 0.4323899745941162,
"learning_rate": 2.8914418484574677e-07,
"loss": 0.4861,
"step": 17000
},
{
"epoch": 3.9730483271375467,
"grad_norm": 0.30076873302459717,
"learning_rate": 1.6006195946818127e-07,
"loss": 0.4855,
"step": 17100
},
{
"epoch": 3.996282527881041,
"grad_norm": 0.2874129116535187,
"learning_rate": 3.097973409061573e-08,
"loss": 0.4957,
"step": 17200
}
],
"logging_steps": 100,
"max_steps": 17216,
"num_input_tokens_seen": 0,
"num_train_epochs": 4,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.805111076121907e+16,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}