{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 100,
"global_step": 4656,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006443298969072165,
"grad_norm": 37.304439544677734,
"learning_rate": 1.0000000000000002e-06,
"loss": 3.2892,
"step": 1
},
{
"epoch": 0.01610824742268041,
"grad_norm": 21.749683380126953,
"learning_rate": 2.5e-05,
"loss": 3.1951,
"step": 25
},
{
"epoch": 0.03221649484536082,
"grad_norm": 17.803585052490234,
"learning_rate": 5e-05,
"loss": 3.3824,
"step": 50
},
{
"epoch": 0.04832474226804124,
"grad_norm": 13.760115623474121,
"learning_rate": 4.97286148501954e-05,
"loss": 3.4591,
"step": 75
},
{
"epoch": 0.06443298969072164,
"grad_norm": 17.308778762817383,
"learning_rate": 4.945722970039079e-05,
"loss": 3.5677,
"step": 100
},
{
"epoch": 0.08054123711340207,
"grad_norm": 11.235258102416992,
"learning_rate": 4.9185844550586194e-05,
"loss": 3.5688,
"step": 125
},
{
"epoch": 0.09664948453608248,
"grad_norm": 11.886427879333496,
"learning_rate": 4.891445940078159e-05,
"loss": 3.5572,
"step": 150
},
{
"epoch": 0.11275773195876289,
"grad_norm": 10.693597793579102,
"learning_rate": 4.864307425097699e-05,
"loss": 3.5816,
"step": 175
},
{
"epoch": 0.12886597938144329,
"grad_norm": 10.654956817626953,
"learning_rate": 4.8371689101172386e-05,
"loss": 3.4257,
"step": 200
},
{
"epoch": 0.14497422680412372,
"grad_norm": 78.01908111572266,
"learning_rate": 4.810030395136778e-05,
"loss": 3.4916,
"step": 225
},
{
"epoch": 0.16108247422680413,
"grad_norm": 14.090380668640137,
"learning_rate": 4.782891880156318e-05,
"loss": 3.4205,
"step": 250
},
{
"epoch": 0.17719072164948454,
"grad_norm": 9.427169799804688,
"learning_rate": 4.755753365175858e-05,
"loss": 3.4519,
"step": 275
},
{
"epoch": 0.19329896907216496,
"grad_norm": 57.52346420288086,
"learning_rate": 4.728614850195397e-05,
"loss": 3.5561,
"step": 300
},
{
"epoch": 0.20940721649484537,
"grad_norm": 16.087291717529297,
"learning_rate": 4.701476335214937e-05,
"loss": 3.4669,
"step": 325
},
{
"epoch": 0.22551546391752578,
"grad_norm": 11.201750755310059,
"learning_rate": 4.674337820234477e-05,
"loss": 3.4119,
"step": 350
},
{
"epoch": 0.2416237113402062,
"grad_norm": 12.096583366394043,
"learning_rate": 4.647199305254017e-05,
"loss": 3.3902,
"step": 375
},
{
"epoch": 0.25773195876288657,
"grad_norm": 12.698060989379883,
"learning_rate": 4.620060790273557e-05,
"loss": 3.4245,
"step": 400
},
{
"epoch": 0.27384020618556704,
"grad_norm": 6.767716884613037,
"learning_rate": 4.592922275293096e-05,
"loss": 3.3878,
"step": 425
},
{
"epoch": 0.28994845360824745,
"grad_norm": 8.80782413482666,
"learning_rate": 4.565783760312636e-05,
"loss": 3.36,
"step": 450
},
{
"epoch": 0.30605670103092786,
"grad_norm": 8.567368507385254,
"learning_rate": 4.538645245332175e-05,
"loss": 3.46,
"step": 475
},
{
"epoch": 0.32216494845360827,
"grad_norm": 7.334051132202148,
"learning_rate": 4.5115067303517154e-05,
"loss": 3.4295,
"step": 500
},
{
"epoch": 0.3382731958762887,
"grad_norm": 35.50625991821289,
"learning_rate": 4.484368215371255e-05,
"loss": 3.458,
"step": 525
},
{
"epoch": 0.3543814432989691,
"grad_norm": 8.980048179626465,
"learning_rate": 4.457229700390795e-05,
"loss": 3.3348,
"step": 550
},
{
"epoch": 0.3704896907216495,
"grad_norm": 11.022858619689941,
"learning_rate": 4.4300911854103346e-05,
"loss": 3.4155,
"step": 575
},
{
"epoch": 0.3865979381443299,
"grad_norm": 7.455577373504639,
"learning_rate": 4.402952670429874e-05,
"loss": 3.3107,
"step": 600
},
{
"epoch": 0.4027061855670103,
"grad_norm": 6.974651336669922,
"learning_rate": 4.375814155449414e-05,
"loss": 3.371,
"step": 625
},
{
"epoch": 0.41881443298969073,
"grad_norm": 6.6951680183410645,
"learning_rate": 4.348675640468954e-05,
"loss": 3.3197,
"step": 650
},
{
"epoch": 0.43492268041237114,
"grad_norm": 7.696976661682129,
"learning_rate": 4.321537125488493e-05,
"loss": 3.3954,
"step": 675
},
{
"epoch": 0.45103092783505155,
"grad_norm": 7.18176794052124,
"learning_rate": 4.294398610508033e-05,
"loss": 3.3443,
"step": 700
},
{
"epoch": 0.46713917525773196,
"grad_norm": 6.54254150390625,
"learning_rate": 4.267260095527572e-05,
"loss": 3.235,
"step": 725
},
{
"epoch": 0.4832474226804124,
"grad_norm": 6.953215599060059,
"learning_rate": 4.2401215805471125e-05,
"loss": 3.3103,
"step": 750
},
{
"epoch": 0.4993556701030928,
"grad_norm": 5.925103187561035,
"learning_rate": 4.212983065566653e-05,
"loss": 3.3243,
"step": 775
},
{
"epoch": 0.5154639175257731,
"grad_norm": 7.676642417907715,
"learning_rate": 4.185844550586192e-05,
"loss": 3.2962,
"step": 800
},
{
"epoch": 0.5315721649484536,
"grad_norm": 5.8870038986206055,
"learning_rate": 4.158706035605732e-05,
"loss": 3.2808,
"step": 825
},
{
"epoch": 0.5476804123711341,
"grad_norm": 6.311049938201904,
"learning_rate": 4.131567520625272e-05,
"loss": 3.2797,
"step": 850
},
{
"epoch": 0.5637886597938144,
"grad_norm": 12.73991584777832,
"learning_rate": 4.1044290056448114e-05,
"loss": 3.2348,
"step": 875
},
{
"epoch": 0.5798969072164949,
"grad_norm": 8.892550468444824,
"learning_rate": 4.077290490664351e-05,
"loss": 3.2308,
"step": 900
},
{
"epoch": 0.5960051546391752,
"grad_norm": 5.8089704513549805,
"learning_rate": 4.0501519756838904e-05,
"loss": 3.2951,
"step": 925
},
{
"epoch": 0.6121134020618557,
"grad_norm": 6.098554611206055,
"learning_rate": 4.0230134607034306e-05,
"loss": 3.1594,
"step": 950
},
{
"epoch": 0.6282216494845361,
"grad_norm": 26.497699737548828,
"learning_rate": 3.995874945722971e-05,
"loss": 3.2403,
"step": 975
},
{
"epoch": 0.6443298969072165,
"grad_norm": 7.092996597290039,
"learning_rate": 3.96873643074251e-05,
"loss": 3.1879,
"step": 1000
},
{
"epoch": 0.6604381443298969,
"grad_norm": 5.452559947967529,
"learning_rate": 3.94159791576205e-05,
"loss": 3.2312,
"step": 1025
},
{
"epoch": 0.6765463917525774,
"grad_norm": 11.570932388305664,
"learning_rate": 3.914459400781589e-05,
"loss": 3.1979,
"step": 1050
},
{
"epoch": 0.6926546391752577,
"grad_norm": 10.654516220092773,
"learning_rate": 3.887320885801129e-05,
"loss": 3.1124,
"step": 1075
},
{
"epoch": 0.7087628865979382,
"grad_norm": 5.750201225280762,
"learning_rate": 3.860182370820669e-05,
"loss": 3.1902,
"step": 1100
},
{
"epoch": 0.7248711340206185,
"grad_norm": 6.332087993621826,
"learning_rate": 3.8330438558402085e-05,
"loss": 3.1835,
"step": 1125
},
{
"epoch": 0.740979381443299,
"grad_norm": 6.187074661254883,
"learning_rate": 3.805905340859749e-05,
"loss": 3.2402,
"step": 1150
},
{
"epoch": 0.7570876288659794,
"grad_norm": 5.326789379119873,
"learning_rate": 3.778766825879288e-05,
"loss": 3.1311,
"step": 1175
},
{
"epoch": 0.7731958762886598,
"grad_norm": 5.872878551483154,
"learning_rate": 3.751628310898828e-05,
"loss": 3.168,
"step": 1200
},
{
"epoch": 0.7893041237113402,
"grad_norm": 5.528806209564209,
"learning_rate": 3.724489795918368e-05,
"loss": 3.2025,
"step": 1225
},
{
"epoch": 0.8054123711340206,
"grad_norm": 4.855608940124512,
"learning_rate": 3.6973512809379074e-05,
"loss": 3.1711,
"step": 1250
},
{
"epoch": 0.821520618556701,
"grad_norm": 5.053402423858643,
"learning_rate": 3.670212765957447e-05,
"loss": 3.0436,
"step": 1275
},
{
"epoch": 0.8376288659793815,
"grad_norm": 6.834145545959473,
"learning_rate": 3.6430742509769864e-05,
"loss": 3.0668,
"step": 1300
},
{
"epoch": 0.8537371134020618,
"grad_norm": 5.844705104827881,
"learning_rate": 3.615935735996526e-05,
"loss": 3.129,
"step": 1325
},
{
"epoch": 0.8698453608247423,
"grad_norm": 5.622738361358643,
"learning_rate": 3.588797221016066e-05,
"loss": 3.123,
"step": 1350
},
{
"epoch": 0.8859536082474226,
"grad_norm": 5.435595512390137,
"learning_rate": 3.561658706035606e-05,
"loss": 3.1695,
"step": 1375
},
{
"epoch": 0.9020618556701031,
"grad_norm": 5.923786640167236,
"learning_rate": 3.534520191055146e-05,
"loss": 3.1486,
"step": 1400
},
{
"epoch": 0.9181701030927835,
"grad_norm": 5.717883586883545,
"learning_rate": 3.507381676074685e-05,
"loss": 3.119,
"step": 1425
},
{
"epoch": 0.9342783505154639,
"grad_norm": 5.194445610046387,
"learning_rate": 3.480243161094225e-05,
"loss": 3.0391,
"step": 1450
},
{
"epoch": 0.9503865979381443,
"grad_norm": 4.672726154327393,
"learning_rate": 3.453104646113765e-05,
"loss": 3.0887,
"step": 1475
},
{
"epoch": 0.9664948453608248,
"grad_norm": 5.593866348266602,
"learning_rate": 3.4259661311333045e-05,
"loss": 3.1433,
"step": 1500
},
{
"epoch": 0.9826030927835051,
"grad_norm": 6.05122709274292,
"learning_rate": 3.398827616152844e-05,
"loss": 3.094,
"step": 1525
},
{
"epoch": 0.9987113402061856,
"grad_norm": 5.456536769866943,
"learning_rate": 3.371689101172384e-05,
"loss": 3.0894,
"step": 1550
},
{
"epoch": 1.014819587628866,
"grad_norm": 6.8430867195129395,
"learning_rate": 3.344550586191924e-05,
"loss": 2.5031,
"step": 1575
},
{
"epoch": 1.0309278350515463,
"grad_norm": 6.864569664001465,
"learning_rate": 3.317412071211464e-05,
"loss": 2.3879,
"step": 1600
},
{
"epoch": 1.0470360824742269,
"grad_norm": 7.638180732727051,
"learning_rate": 3.2902735562310034e-05,
"loss": 2.3948,
"step": 1625
},
{
"epoch": 1.0631443298969072,
"grad_norm": 5.917698860168457,
"learning_rate": 3.263135041250543e-05,
"loss": 2.3638,
"step": 1650
},
{
"epoch": 1.0792525773195876,
"grad_norm": 6.708238124847412,
"learning_rate": 3.2359965262700824e-05,
"loss": 2.3536,
"step": 1675
},
{
"epoch": 1.0953608247422681,
"grad_norm": 9.36337947845459,
"learning_rate": 3.208858011289622e-05,
"loss": 2.4048,
"step": 1700
},
{
"epoch": 1.1114690721649485,
"grad_norm": 7.072855472564697,
"learning_rate": 3.181719496309162e-05,
"loss": 2.3709,
"step": 1725
},
{
"epoch": 1.1275773195876289,
"grad_norm": 6.986050128936768,
"learning_rate": 3.154580981328702e-05,
"loss": 2.46,
"step": 1750
},
{
"epoch": 1.1436855670103092,
"grad_norm": 6.583354949951172,
"learning_rate": 3.127442466348242e-05,
"loss": 2.3884,
"step": 1775
},
{
"epoch": 1.1597938144329896,
"grad_norm": 6.607515811920166,
"learning_rate": 3.100303951367781e-05,
"loss": 2.3733,
"step": 1800
},
{
"epoch": 1.1759020618556701,
"grad_norm": 7.239434719085693,
"learning_rate": 3.073165436387321e-05,
"loss": 2.4139,
"step": 1825
},
{
"epoch": 1.1920103092783505,
"grad_norm": 7.7802042961120605,
"learning_rate": 3.046026921406861e-05,
"loss": 2.3074,
"step": 1850
},
{
"epoch": 1.2081185567010309,
"grad_norm": 5.834593772888184,
"learning_rate": 3.0188884064264005e-05,
"loss": 2.3383,
"step": 1875
},
{
"epoch": 1.2242268041237114,
"grad_norm": 6.189608097076416,
"learning_rate": 2.9917498914459403e-05,
"loss": 2.2833,
"step": 1900
},
{
"epoch": 1.2403350515463918,
"grad_norm": 6.848288536071777,
"learning_rate": 2.9646113764654798e-05,
"loss": 2.3957,
"step": 1925
},
{
"epoch": 1.2564432989690721,
"grad_norm": 6.78605842590332,
"learning_rate": 2.9374728614850193e-05,
"loss": 2.4055,
"step": 1950
},
{
"epoch": 1.2725515463917525,
"grad_norm": 7.676894664764404,
"learning_rate": 2.9103343465045595e-05,
"loss": 2.3365,
"step": 1975
},
{
"epoch": 1.2886597938144329,
"grad_norm": 6.011926651000977,
"learning_rate": 2.8831958315240993e-05,
"loss": 2.3317,
"step": 2000
},
{
"epoch": 1.3047680412371134,
"grad_norm": 6.217193126678467,
"learning_rate": 2.856057316543639e-05,
"loss": 2.3736,
"step": 2025
},
{
"epoch": 1.3208762886597938,
"grad_norm": 7.027468681335449,
"learning_rate": 2.8289188015631784e-05,
"loss": 2.3509,
"step": 2050
},
{
"epoch": 1.3369845360824741,
"grad_norm": 7.210168838500977,
"learning_rate": 2.8017802865827182e-05,
"loss": 2.449,
"step": 2075
},
{
"epoch": 1.3530927835051547,
"grad_norm": 7.149182319641113,
"learning_rate": 2.7746417716022584e-05,
"loss": 2.3631,
"step": 2100
},
{
"epoch": 1.369201030927835,
"grad_norm": 6.41991662979126,
"learning_rate": 2.747503256621798e-05,
"loss": 2.4368,
"step": 2125
},
{
"epoch": 1.3853092783505154,
"grad_norm": 6.897440433502197,
"learning_rate": 2.7203647416413374e-05,
"loss": 2.3772,
"step": 2150
},
{
"epoch": 1.401417525773196,
"grad_norm": 6.562511444091797,
"learning_rate": 2.6932262266608772e-05,
"loss": 2.346,
"step": 2175
},
{
"epoch": 1.4175257731958764,
"grad_norm": 6.86238431930542,
"learning_rate": 2.6660877116804168e-05,
"loss": 2.3861,
"step": 2200
},
{
"epoch": 1.4336340206185567,
"grad_norm": 7.627070426940918,
"learning_rate": 2.638949196699957e-05,
"loss": 2.3415,
"step": 2225
},
{
"epoch": 1.449742268041237,
"grad_norm": 6.463057518005371,
"learning_rate": 2.6118106817194964e-05,
"loss": 2.3558,
"step": 2250
},
{
"epoch": 1.4658505154639174,
"grad_norm": 6.722979545593262,
"learning_rate": 2.584672166739036e-05,
"loss": 2.3812,
"step": 2275
},
{
"epoch": 1.481958762886598,
"grad_norm": 7.5143585205078125,
"learning_rate": 2.5575336517585758e-05,
"loss": 2.3577,
"step": 2300
},
{
"epoch": 1.4980670103092784,
"grad_norm": 6.2719197273254395,
"learning_rate": 2.5303951367781153e-05,
"loss": 2.3249,
"step": 2325
},
{
"epoch": 1.5141752577319587,
"grad_norm": 6.567588806152344,
"learning_rate": 2.5032566217976555e-05,
"loss": 2.3663,
"step": 2350
},
{
"epoch": 1.5302835051546393,
"grad_norm": 6.04072380065918,
"learning_rate": 2.476118106817195e-05,
"loss": 2.3098,
"step": 2375
},
{
"epoch": 1.5463917525773194,
"grad_norm": 6.608715057373047,
"learning_rate": 2.448979591836735e-05,
"loss": 2.3335,
"step": 2400
},
{
"epoch": 1.5625,
"grad_norm": 6.724149227142334,
"learning_rate": 2.4218410768562747e-05,
"loss": 2.3538,
"step": 2425
},
{
"epoch": 1.5786082474226806,
"grad_norm": 7.360804080963135,
"learning_rate": 2.3947025618758142e-05,
"loss": 2.3797,
"step": 2450
},
{
"epoch": 1.5947164948453607,
"grad_norm": 7.265044689178467,
"learning_rate": 2.367564046895354e-05,
"loss": 2.3377,
"step": 2475
},
{
"epoch": 1.6108247422680413,
"grad_norm": 7.212481498718262,
"learning_rate": 2.340425531914894e-05,
"loss": 2.3111,
"step": 2500
},
{
"epoch": 1.6269329896907216,
"grad_norm": 6.6800456047058105,
"learning_rate": 2.3132870169344334e-05,
"loss": 2.3555,
"step": 2525
},
{
"epoch": 1.643041237113402,
"grad_norm": 6.473804950714111,
"learning_rate": 2.2861485019539732e-05,
"loss": 2.2877,
"step": 2550
},
{
"epoch": 1.6591494845360826,
"grad_norm": 13.455022811889648,
"learning_rate": 2.2590099869735127e-05,
"loss": 2.2963,
"step": 2575
},
{
"epoch": 1.675257731958763,
"grad_norm": 6.606278419494629,
"learning_rate": 2.2318714719930526e-05,
"loss": 2.3671,
"step": 2600
},
{
"epoch": 1.6913659793814433,
"grad_norm": 6.745218276977539,
"learning_rate": 2.2047329570125924e-05,
"loss": 2.3202,
"step": 2625
},
{
"epoch": 1.7074742268041239,
"grad_norm": 7.282406330108643,
"learning_rate": 2.177594442032132e-05,
"loss": 2.3242,
"step": 2650
},
{
"epoch": 1.723582474226804,
"grad_norm": 7.313311576843262,
"learning_rate": 2.1504559270516718e-05,
"loss": 2.3228,
"step": 2675
},
{
"epoch": 1.7396907216494846,
"grad_norm": 7.339620590209961,
"learning_rate": 2.1233174120712116e-05,
"loss": 2.3336,
"step": 2700
},
{
"epoch": 1.755798969072165,
"grad_norm": 6.999018669128418,
"learning_rate": 2.096178897090751e-05,
"loss": 2.2578,
"step": 2725
},
{
"epoch": 1.7719072164948453,
"grad_norm": 6.459262371063232,
"learning_rate": 2.069040382110291e-05,
"loss": 2.2741,
"step": 2750
},
{
"epoch": 1.7880154639175259,
"grad_norm": 7.308042049407959,
"learning_rate": 2.0419018671298308e-05,
"loss": 2.2925,
"step": 2775
},
{
"epoch": 1.8041237113402062,
"grad_norm": 6.555530071258545,
"learning_rate": 2.0147633521493707e-05,
"loss": 2.3082,
"step": 2800
},
{
"epoch": 1.8202319587628866,
"grad_norm": 6.764036655426025,
"learning_rate": 1.9876248371689102e-05,
"loss": 2.2095,
"step": 2825
},
{
"epoch": 1.8363402061855671,
"grad_norm": 7.759133815765381,
"learning_rate": 1.96048632218845e-05,
"loss": 2.3166,
"step": 2850
},
{
"epoch": 1.8524484536082473,
"grad_norm": 6.442126274108887,
"learning_rate": 1.9333478072079895e-05,
"loss": 2.3075,
"step": 2875
},
{
"epoch": 1.8685567010309279,
"grad_norm": 6.804947376251221,
"learning_rate": 1.9062092922275294e-05,
"loss": 2.2889,
"step": 2900
},
{
"epoch": 1.8846649484536082,
"grad_norm": 6.473705291748047,
"learning_rate": 1.8790707772470692e-05,
"loss": 2.2423,
"step": 2925
},
{
"epoch": 1.9007731958762886,
"grad_norm": 6.420748710632324,
"learning_rate": 1.8519322622666087e-05,
"loss": 2.2308,
"step": 2950
},
{
"epoch": 1.9168814432989691,
"grad_norm": 7.469099044799805,
"learning_rate": 1.8247937472861486e-05,
"loss": 2.3467,
"step": 2975
},
{
"epoch": 1.9329896907216495,
"grad_norm": 7.019501686096191,
"learning_rate": 1.7976552323056884e-05,
"loss": 2.3117,
"step": 3000
},
{
"epoch": 1.9490979381443299,
"grad_norm": 7.53558874130249,
"learning_rate": 1.770516717325228e-05,
"loss": 2.2763,
"step": 3025
},
{
"epoch": 1.9652061855670104,
"grad_norm": 6.622589588165283,
"learning_rate": 1.7433782023447678e-05,
"loss": 2.2725,
"step": 3050
},
{
"epoch": 1.9813144329896906,
"grad_norm": 6.681495189666748,
"learning_rate": 1.7162396873643076e-05,
"loss": 2.1871,
"step": 3075
},
{
"epoch": 1.9974226804123711,
"grad_norm": 5.900623321533203,
"learning_rate": 1.6891011723838475e-05,
"loss": 2.2892,
"step": 3100
},
{
"epoch": 2.0135309278350517,
"grad_norm": 9.32268238067627,
"learning_rate": 1.661962657403387e-05,
"loss": 1.3008,
"step": 3125
},
{
"epoch": 2.029639175257732,
"grad_norm": 7.467723369598389,
"learning_rate": 1.6348241424229265e-05,
"loss": 1.0731,
"step": 3150
},
{
"epoch": 2.0457474226804124,
"grad_norm": 8.434012413024902,
"learning_rate": 1.6076856274424663e-05,
"loss": 1.0061,
"step": 3175
},
{
"epoch": 2.0618556701030926,
"grad_norm": 9.433366775512695,
"learning_rate": 1.580547112462006e-05,
"loss": 1.0132,
"step": 3200
},
{
"epoch": 2.077963917525773,
"grad_norm": 7.6198039054870605,
"learning_rate": 1.553408597481546e-05,
"loss": 0.9944,
"step": 3225
},
{
"epoch": 2.0940721649484537,
"grad_norm": 8.139434814453125,
"learning_rate": 1.5262700825010855e-05,
"loss": 0.9757,
"step": 3250
},
{
"epoch": 2.110180412371134,
"grad_norm": 8.175223350524902,
"learning_rate": 1.4991315675206252e-05,
"loss": 0.978,
"step": 3275
},
{
"epoch": 2.1262886597938144,
"grad_norm": 8.026739120483398,
"learning_rate": 1.4719930525401652e-05,
"loss": 0.9758,
"step": 3300
},
{
"epoch": 2.142396907216495,
"grad_norm": 8.502424240112305,
"learning_rate": 1.4448545375597047e-05,
"loss": 0.9047,
"step": 3325
},
{
"epoch": 2.158505154639175,
"grad_norm": 9.062753677368164,
"learning_rate": 1.4177160225792445e-05,
"loss": 0.9462,
"step": 3350
},
{
"epoch": 2.1746134020618557,
"grad_norm": 9.223316192626953,
"learning_rate": 1.3905775075987842e-05,
"loss": 0.9564,
"step": 3375
},
{
"epoch": 2.1907216494845363,
"grad_norm": 8.59533977508545,
"learning_rate": 1.3634389926183239e-05,
"loss": 0.9476,
"step": 3400
},
{
"epoch": 2.2068298969072164,
"grad_norm": 8.367724418640137,
"learning_rate": 1.3363004776378637e-05,
"loss": 0.9684,
"step": 3425
},
{
"epoch": 2.222938144329897,
"grad_norm": 9.15878963470459,
"learning_rate": 1.3091619626574034e-05,
"loss": 0.9847,
"step": 3450
},
{
"epoch": 2.239046391752577,
"grad_norm": 10.106039047241211,
"learning_rate": 1.2820234476769433e-05,
"loss": 0.9148,
"step": 3475
},
{
"epoch": 2.2551546391752577,
"grad_norm": 8.91595458984375,
"learning_rate": 1.254884932696483e-05,
"loss": 0.9293,
"step": 3500
},
{
"epoch": 2.2712628865979383,
"grad_norm": 9.854774475097656,
"learning_rate": 1.2277464177160226e-05,
"loss": 0.9469,
"step": 3525
},
{
"epoch": 2.2873711340206184,
"grad_norm": 8.479780197143555,
"learning_rate": 1.2006079027355625e-05,
"loss": 0.925,
"step": 3550
},
{
"epoch": 2.303479381443299,
"grad_norm": 8.944768905639648,
"learning_rate": 1.1734693877551021e-05,
"loss": 0.9593,
"step": 3575
},
{
"epoch": 2.319587628865979,
"grad_norm": 8.820865631103516,
"learning_rate": 1.1463308727746418e-05,
"loss": 0.9583,
"step": 3600
},
{
"epoch": 2.3356958762886597,
"grad_norm": 9.563779830932617,
"learning_rate": 1.1191923577941815e-05,
"loss": 0.9298,
"step": 3625
},
{
"epoch": 2.3518041237113403,
"grad_norm": 8.982272148132324,
"learning_rate": 1.0920538428137213e-05,
"loss": 0.9376,
"step": 3650
},
{
"epoch": 2.367912371134021,
"grad_norm": 9.715324401855469,
"learning_rate": 1.064915327833261e-05,
"loss": 0.9428,
"step": 3675
},
{
"epoch": 2.384020618556701,
"grad_norm": 8.481626510620117,
"learning_rate": 1.0377768128528009e-05,
"loss": 0.9254,
"step": 3700
},
{
"epoch": 2.4001288659793816,
"grad_norm": 9.785958290100098,
"learning_rate": 1.0106382978723404e-05,
"loss": 0.9266,
"step": 3725
},
{
"epoch": 2.4162371134020617,
"grad_norm": 10.050392150878906,
"learning_rate": 9.834997828918802e-06,
"loss": 0.9335,
"step": 3750
},
{
"epoch": 2.4323453608247423,
"grad_norm": 8.707124710083008,
"learning_rate": 9.563612679114199e-06,
"loss": 0.872,
"step": 3775
},
{
"epoch": 2.448453608247423,
"grad_norm": 9.095705032348633,
"learning_rate": 9.292227529309597e-06,
"loss": 0.8928,
"step": 3800
},
{
"epoch": 2.464561855670103,
"grad_norm": 9.691436767578125,
"learning_rate": 9.020842379504994e-06,
"loss": 0.8993,
"step": 3825
},
{
"epoch": 2.4806701030927836,
"grad_norm": 17.811647415161133,
"learning_rate": 8.749457229700392e-06,
"loss": 0.8943,
"step": 3850
},
{
"epoch": 2.4967783505154637,
"grad_norm": 9.972207069396973,
"learning_rate": 8.478072079895788e-06,
"loss": 0.9248,
"step": 3875
},
{
"epoch": 2.5128865979381443,
"grad_norm": 9.202563285827637,
"learning_rate": 8.206686930091186e-06,
"loss": 0.9077,
"step": 3900
},
{
"epoch": 2.528994845360825,
"grad_norm": 9.509817123413086,
"learning_rate": 7.935301780286583e-06,
"loss": 0.9037,
"step": 3925
},
{
"epoch": 2.545103092783505,
"grad_norm": 8.833476066589355,
"learning_rate": 7.663916630481981e-06,
"loss": 0.8766,
"step": 3950
},
{
"epoch": 2.5612113402061856,
"grad_norm": 10.363802909851074,
"learning_rate": 7.392531480677378e-06,
"loss": 0.895,
"step": 3975
},
{
"epoch": 2.5773195876288657,
"grad_norm": 9.111068725585938,
"learning_rate": 7.121146330872775e-06,
"loss": 0.9224,
"step": 4000
},
{
"epoch": 2.5934278350515463,
"grad_norm": 10.667325019836426,
"learning_rate": 6.849761181068172e-06,
"loss": 0.8776,
"step": 4025
},
{
"epoch": 2.609536082474227,
"grad_norm": 11.279472351074219,
"learning_rate": 6.578376031263569e-06,
"loss": 0.8723,
"step": 4050
},
{
"epoch": 2.6256443298969074,
"grad_norm": 15.722869873046875,
"learning_rate": 6.306990881458967e-06,
"loss": 0.8858,
"step": 4075
},
{
"epoch": 2.6417525773195876,
"grad_norm": 10.252237319946289,
"learning_rate": 6.035605731654364e-06,
"loss": 0.8639,
"step": 4100
},
{
"epoch": 2.657860824742268,
"grad_norm": 9.055089950561523,
"learning_rate": 5.764220581849761e-06,
"loss": 0.8794,
"step": 4125
},
{
"epoch": 2.6739690721649483,
"grad_norm": 9.109421730041504,
"learning_rate": 5.492835432045159e-06,
"loss": 0.8667,
"step": 4150
},
{
"epoch": 2.690077319587629,
"grad_norm": 9.12623119354248,
"learning_rate": 5.221450282240556e-06,
"loss": 0.8626,
"step": 4175
},
{
"epoch": 2.7061855670103094,
"grad_norm": 9.60417366027832,
"learning_rate": 4.950065132435953e-06,
"loss": 0.9106,
"step": 4200
},
{
"epoch": 2.7222938144329896,
"grad_norm": 9.32435417175293,
"learning_rate": 4.678679982631351e-06,
"loss": 0.8714,
"step": 4225
},
{
"epoch": 2.73840206185567,
"grad_norm": 9.819196701049805,
"learning_rate": 4.407294832826748e-06,
"loss": 0.8621,
"step": 4250
},
{
"epoch": 2.7545103092783503,
"grad_norm": 8.934945106506348,
"learning_rate": 4.135909683022145e-06,
"loss": 0.8644,
"step": 4275
},
{
"epoch": 2.770618556701031,
"grad_norm": 10.425902366638184,
"learning_rate": 3.864524533217543e-06,
"loss": 0.8937,
"step": 4300
},
{
"epoch": 2.7867268041237114,
"grad_norm": 9.629773139953613,
"learning_rate": 3.5931393834129398e-06,
"loss": 0.8774,
"step": 4325
},
{
"epoch": 2.802835051546392,
"grad_norm": 9.796236038208008,
"learning_rate": 3.3217542336083374e-06,
"loss": 0.8589,
"step": 4350
},
{
"epoch": 2.818943298969072,
"grad_norm": 9.853483200073242,
"learning_rate": 3.050369083803734e-06,
"loss": 0.8394,
"step": 4375
},
{
"epoch": 2.8350515463917527,
"grad_norm": 9.696287155151367,
"learning_rate": 2.7789839339991317e-06,
"loss": 0.8523,
"step": 4400
},
{
"epoch": 2.851159793814433,
"grad_norm": 9.468950271606445,
"learning_rate": 2.507598784194529e-06,
"loss": 0.8444,
"step": 4425
},
{
"epoch": 2.8672680412371134,
"grad_norm": 9.996761322021484,
"learning_rate": 2.236213634389926e-06,
"loss": 0.8517,
"step": 4450
},
{
"epoch": 2.883376288659794,
"grad_norm": 12.354564666748047,
"learning_rate": 1.9648284845853233e-06,
"loss": 0.8523,
"step": 4475
},
{
"epoch": 2.899484536082474,
"grad_norm": 11.12836742401123,
"learning_rate": 1.6934433347807209e-06,
"loss": 0.8458,
"step": 4500
},
{
"epoch": 2.9155927835051547,
"grad_norm": 9.318047523498535,
"learning_rate": 1.4220581849761183e-06,
"loss": 0.8548,
"step": 4525
},
{
"epoch": 2.931701030927835,
"grad_norm": 9.987869262695312,
"learning_rate": 1.1506730351715155e-06,
"loss": 0.8567,
"step": 4550
},
{
"epoch": 2.9478092783505154,
"grad_norm": 10.364538192749023,
"learning_rate": 8.792878853669127e-07,
"loss": 0.8547,
"step": 4575
},
{
"epoch": 2.963917525773196,
"grad_norm": 10.010146141052246,
"learning_rate": 6.0790273556231e-07,
"loss": 0.8464,
"step": 4600
},
{
"epoch": 2.980025773195876,
"grad_norm": 8.987505912780762,
"learning_rate": 3.3651758575770737e-07,
"loss": 0.8361,
"step": 4625
},
{
"epoch": 2.9961340206185567,
"grad_norm": 9.563461303710938,
"learning_rate": 6.513243595310464e-08,
"loss": 0.8486,
"step": 4650
}
],
"logging_steps": 25,
"max_steps": 4656,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 1000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.3657646372356096e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}