{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9992688276870583,
"eval_steps": 500,
"global_step": 1025,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0009748964172556666,
"grad_norm": 22.86059565076889,
"learning_rate": 9.70873786407767e-08,
"loss": 1.3065,
"step": 1
},
{
"epoch": 0.004874482086278333,
"grad_norm": 22.08339033526192,
"learning_rate": 4.854368932038835e-07,
"loss": 1.316,
"step": 5
},
{
"epoch": 0.009748964172556666,
"grad_norm": 8.710899949749317,
"learning_rate": 9.70873786407767e-07,
"loss": 1.2194,
"step": 10
},
{
"epoch": 0.014623446258834999,
"grad_norm": 8.431065000460007,
"learning_rate": 1.4563106796116506e-06,
"loss": 1.0655,
"step": 15
},
{
"epoch": 0.01949792834511333,
"grad_norm": 3.0129778026962346,
"learning_rate": 1.941747572815534e-06,
"loss": 0.9273,
"step": 20
},
{
"epoch": 0.024372410431391666,
"grad_norm": 2.4586920682855142,
"learning_rate": 2.427184466019418e-06,
"loss": 0.8841,
"step": 25
},
{
"epoch": 0.029246892517669997,
"grad_norm": 2.1763561124357684,
"learning_rate": 2.912621359223301e-06,
"loss": 0.853,
"step": 30
},
{
"epoch": 0.03412137460394833,
"grad_norm": 2.2029840594866466,
"learning_rate": 3.398058252427185e-06,
"loss": 0.8341,
"step": 35
},
{
"epoch": 0.03899585669022666,
"grad_norm": 2.446000778730272,
"learning_rate": 3.883495145631068e-06,
"loss": 0.8118,
"step": 40
},
{
"epoch": 0.043870338776504994,
"grad_norm": 2.451352004117067,
"learning_rate": 4.368932038834952e-06,
"loss": 0.7943,
"step": 45
},
{
"epoch": 0.04874482086278333,
"grad_norm": 2.4129113465126957,
"learning_rate": 4.854368932038836e-06,
"loss": 0.7768,
"step": 50
},
{
"epoch": 0.05361930294906166,
"grad_norm": 2.3127065430589298,
"learning_rate": 5.3398058252427185e-06,
"loss": 0.7778,
"step": 55
},
{
"epoch": 0.058493785035339994,
"grad_norm": 2.405876637403882,
"learning_rate": 5.825242718446602e-06,
"loss": 0.7534,
"step": 60
},
{
"epoch": 0.06336826712161833,
"grad_norm": 2.2639828354846387,
"learning_rate": 6.310679611650487e-06,
"loss": 0.7398,
"step": 65
},
{
"epoch": 0.06824274920789666,
"grad_norm": 2.3243442460154657,
"learning_rate": 6.79611650485437e-06,
"loss": 0.7261,
"step": 70
},
{
"epoch": 0.073117231294175,
"grad_norm": 2.4227745775819836,
"learning_rate": 7.2815533980582534e-06,
"loss": 0.7168,
"step": 75
},
{
"epoch": 0.07799171338045333,
"grad_norm": 4.25071801960126,
"learning_rate": 7.766990291262136e-06,
"loss": 0.71,
"step": 80
},
{
"epoch": 0.08286619546673166,
"grad_norm": 2.148825747496477,
"learning_rate": 8.25242718446602e-06,
"loss": 0.7074,
"step": 85
},
{
"epoch": 0.08774067755300999,
"grad_norm": 2.380339632272055,
"learning_rate": 8.737864077669904e-06,
"loss": 0.7032,
"step": 90
},
{
"epoch": 0.09261515963928832,
"grad_norm": 2.3544801176511916,
"learning_rate": 9.223300970873788e-06,
"loss": 0.7,
"step": 95
},
{
"epoch": 0.09748964172556666,
"grad_norm": 2.1648774285785013,
"learning_rate": 9.708737864077671e-06,
"loss": 0.6812,
"step": 100
},
{
"epoch": 0.102364123811845,
"grad_norm": 2.103759384635341,
"learning_rate": 9.999883898929927e-06,
"loss": 0.6822,
"step": 105
},
{
"epoch": 0.10723860589812333,
"grad_norm": 2.1317773929603576,
"learning_rate": 9.998577823812066e-06,
"loss": 0.6845,
"step": 110
},
{
"epoch": 0.11211308798440166,
"grad_norm": 2.5038388167151924,
"learning_rate": 9.995820927586548e-06,
"loss": 0.6799,
"step": 115
},
{
"epoch": 0.11698757007067999,
"grad_norm": 2.2890913021002772,
"learning_rate": 9.99161401043362e-06,
"loss": 0.6893,
"step": 120
},
{
"epoch": 0.12186205215695832,
"grad_norm": 2.287115199507479,
"learning_rate": 9.985958293397433e-06,
"loss": 0.6801,
"step": 125
},
{
"epoch": 0.12673653424323666,
"grad_norm": 2.111605184938528,
"learning_rate": 9.978855418031633e-06,
"loss": 0.6761,
"step": 130
},
{
"epoch": 0.131611016329515,
"grad_norm": 2.3968266021989573,
"learning_rate": 9.970307445922905e-06,
"loss": 0.6659,
"step": 135
},
{
"epoch": 0.13648549841579333,
"grad_norm": 2.0562856783183383,
"learning_rate": 9.960316858092613e-06,
"loss": 0.6735,
"step": 140
},
{
"epoch": 0.14135998050207166,
"grad_norm": 2.2596310641814634,
"learning_rate": 9.948886554276689e-06,
"loss": 0.6658,
"step": 145
},
{
"epoch": 0.14623446258835,
"grad_norm": 2.3851159107252022,
"learning_rate": 9.936019852083982e-06,
"loss": 0.6592,
"step": 150
},
{
"epoch": 0.15110894467462832,
"grad_norm": 2.0161901435788083,
"learning_rate": 9.921720486033348e-06,
"loss": 0.657,
"step": 155
},
{
"epoch": 0.15598342676090665,
"grad_norm": 2.127638747482369,
"learning_rate": 9.905992606469708e-06,
"loss": 0.6595,
"step": 160
},
{
"epoch": 0.16085790884718498,
"grad_norm": 2.0531639641862447,
"learning_rate": 9.888840778359431e-06,
"loss": 0.6515,
"step": 165
},
{
"epoch": 0.1657323909334633,
"grad_norm": 1.8950315321987827,
"learning_rate": 9.870269979965364e-06,
"loss": 0.6492,
"step": 170
},
{
"epoch": 0.17060687301974164,
"grad_norm": 1.9060347213321087,
"learning_rate": 9.850285601401899e-06,
"loss": 0.6458,
"step": 175
},
{
"epoch": 0.17548135510601998,
"grad_norm": 2.074587641369688,
"learning_rate": 9.828893443070527e-06,
"loss": 0.6515,
"step": 180
},
{
"epoch": 0.1803558371922983,
"grad_norm": 2.021646896648869,
"learning_rate": 9.806099713976277e-06,
"loss": 0.6306,
"step": 185
},
{
"epoch": 0.18523031927857664,
"grad_norm": 2.183178620435626,
"learning_rate": 9.781911029925573e-06,
"loss": 0.6317,
"step": 190
},
{
"epoch": 0.190104801364855,
"grad_norm": 1.9176655344163256,
"learning_rate": 9.756334411606028e-06,
"loss": 0.6304,
"step": 195
},
{
"epoch": 0.19497928345113333,
"grad_norm": 2.0283053056639226,
"learning_rate": 9.729377282548696e-06,
"loss": 0.6273,
"step": 200
},
{
"epoch": 0.19985376553741166,
"grad_norm": 2.2614639215711794,
"learning_rate": 9.701047466973429e-06,
"loss": 0.6227,
"step": 205
},
{
"epoch": 0.20472824762369,
"grad_norm": 2.048850431237601,
"learning_rate": 9.67135318751792e-06,
"loss": 0.6289,
"step": 210
},
{
"epoch": 0.20960272970996832,
"grad_norm": 2.0716723168430224,
"learning_rate": 9.640303062851101e-06,
"loss": 0.6235,
"step": 215
},
{
"epoch": 0.21447721179624665,
"grad_norm": 2.165745603614713,
"learning_rate": 9.607906105171613e-06,
"loss": 0.6254,
"step": 220
},
{
"epoch": 0.21935169388252498,
"grad_norm": 2.121524753608251,
"learning_rate": 9.574171717592038e-06,
"loss": 0.6215,
"step": 225
},
{
"epoch": 0.22422617596880332,
"grad_norm": 2.0155505697588207,
"learning_rate": 9.539109691409677e-06,
"loss": 0.6125,
"step": 230
},
{
"epoch": 0.22910065805508165,
"grad_norm": 2.5497104730670457,
"learning_rate": 9.502730203264656e-06,
"loss": 0.6116,
"step": 235
},
{
"epoch": 0.23397514014135998,
"grad_norm": 2.0859151782380163,
"learning_rate": 9.465043812186194e-06,
"loss": 0.6026,
"step": 240
},
{
"epoch": 0.2388496222276383,
"grad_norm": 2.0192086858289673,
"learning_rate": 9.426061456527871e-06,
"loss": 0.601,
"step": 245
},
{
"epoch": 0.24372410431391664,
"grad_norm": 2.08123508728777,
"learning_rate": 9.385794450792818e-06,
"loss": 0.593,
"step": 250
},
{
"epoch": 0.24859858640019497,
"grad_norm": 1.970692344326779,
"learning_rate": 9.344254482349702e-06,
"loss": 0.5879,
"step": 255
},
{
"epoch": 0.25347306848647333,
"grad_norm": 2.0360574888538037,
"learning_rate": 9.301453608040523e-06,
"loss": 0.5884,
"step": 260
},
{
"epoch": 0.25834755057275166,
"grad_norm": 1.9510711367676767,
"learning_rate": 9.25740425068114e-06,
"loss": 0.5937,
"step": 265
},
{
"epoch": 0.26322203265903,
"grad_norm": 2.0679042358578057,
"learning_rate": 9.2121191954556e-06,
"loss": 0.5838,
"step": 270
},
{
"epoch": 0.2680965147453083,
"grad_norm": 2.0443212477081008,
"learning_rate": 9.165611586205268e-06,
"loss": 0.5719,
"step": 275
},
{
"epoch": 0.27297099683158665,
"grad_norm": 2.0408583983165234,
"learning_rate": 9.11789492161388e-06,
"loss": 0.5839,
"step": 280
},
{
"epoch": 0.277845478917865,
"grad_norm": 1.9767895517261123,
"learning_rate": 9.068983051289589e-06,
"loss": 0.5865,
"step": 285
},
{
"epoch": 0.2827199610041433,
"grad_norm": 1.881981157978116,
"learning_rate": 9.018890171745156e-06,
"loss": 0.5793,
"step": 290
},
{
"epoch": 0.28759444309042165,
"grad_norm": 1.9932110390018152,
"learning_rate": 8.967630822277472e-06,
"loss": 0.5808,
"step": 295
},
{
"epoch": 0.2924689251767,
"grad_norm": 1.9915669044094475,
"learning_rate": 8.915219880747555e-06,
"loss": 0.569,
"step": 300
},
{
"epoch": 0.2973434072629783,
"grad_norm": 2.046268209029942,
"learning_rate": 8.861672559262316e-06,
"loss": 0.5759,
"step": 305
},
{
"epoch": 0.30221788934925664,
"grad_norm": 1.9535790327085676,
"learning_rate": 8.80700439975928e-06,
"loss": 0.5717,
"step": 310
},
{
"epoch": 0.30709237143553497,
"grad_norm": 2.280776169449102,
"learning_rate": 8.751231269495604e-06,
"loss": 0.5703,
"step": 315
},
{
"epoch": 0.3119668535218133,
"grad_norm": 2.0042263363709996,
"learning_rate": 8.694369356442638e-06,
"loss": 0.5668,
"step": 320
},
{
"epoch": 0.31684133560809163,
"grad_norm": 2.098934601526615,
"learning_rate": 8.636435164587436e-06,
"loss": 0.5532,
"step": 325
},
{
"epoch": 0.32171581769436997,
"grad_norm": 2.0675738967767496,
"learning_rate": 8.577445509142514e-06,
"loss": 0.5585,
"step": 330
},
{
"epoch": 0.3265902997806483,
"grad_norm": 2.3766255529170692,
"learning_rate": 8.517417511665299e-06,
"loss": 0.5658,
"step": 335
},
{
"epoch": 0.3314647818669266,
"grad_norm": 2.1910659775597248,
"learning_rate": 8.456368595088647e-06,
"loss": 0.5507,
"step": 340
},
{
"epoch": 0.33633926395320496,
"grad_norm": 1.8703655986029497,
"learning_rate": 8.394316478663886e-06,
"loss": 0.5406,
"step": 345
},
{
"epoch": 0.3412137460394833,
"grad_norm": 1.9244217565682906,
"learning_rate": 8.331279172817876e-06,
"loss": 0.542,
"step": 350
},
{
"epoch": 0.3460882281257616,
"grad_norm": 2.0014619556553335,
"learning_rate": 8.26727497392553e-06,
"loss": 0.5392,
"step": 355
},
{
"epoch": 0.35096271021203995,
"grad_norm": 1.8442031091685414,
"learning_rate": 8.20232245899935e-06,
"loss": 0.5318,
"step": 360
},
{
"epoch": 0.3558371922983183,
"grad_norm": 1.9799423972998886,
"learning_rate": 8.136440480297514e-06,
"loss": 0.5414,
"step": 365
},
{
"epoch": 0.3607116743845966,
"grad_norm": 2.0577170932676596,
"learning_rate": 8.069648159852059e-06,
"loss": 0.5296,
"step": 370
},
{
"epoch": 0.36558615647087495,
"grad_norm": 2.037446340676939,
"learning_rate": 8.001964883918793e-06,
"loss": 0.5348,
"step": 375
},
{
"epoch": 0.3704606385571533,
"grad_norm": 2.0638439360397665,
"learning_rate": 7.933410297350472e-06,
"loss": 0.5229,
"step": 380
},
{
"epoch": 0.3753351206434316,
"grad_norm": 1.9569671735850687,
"learning_rate": 7.864004297894963e-06,
"loss": 0.5275,
"step": 385
},
{
"epoch": 0.38020960272971,
"grad_norm": 1.9870139157026745,
"learning_rate": 7.793767030419975e-06,
"loss": 0.533,
"step": 390
},
{
"epoch": 0.3850840848159883,
"grad_norm": 1.9631002389899173,
"learning_rate": 7.722718881066086e-06,
"loss": 0.5245,
"step": 395
},
{
"epoch": 0.38995856690226666,
"grad_norm": 1.9193362092353006,
"learning_rate": 7.650880471329725e-06,
"loss": 0.5203,
"step": 400
},
{
"epoch": 0.394833048988545,
"grad_norm": 1.9397872066961752,
"learning_rate": 7.578272652077849e-06,
"loss": 0.5144,
"step": 405
},
{
"epoch": 0.3997075310748233,
"grad_norm": 1.952394556800401,
"learning_rate": 7.504916497496051e-06,
"loss": 0.5168,
"step": 410
},
{
"epoch": 0.40458201316110165,
"grad_norm": 1.9410715991990146,
"learning_rate": 7.43083329897184e-06,
"loss": 0.4964,
"step": 415
},
{
"epoch": 0.40945649524738,
"grad_norm": 1.9371602423339187,
"learning_rate": 7.3560445589148875e-06,
"loss": 0.5136,
"step": 420
},
{
"epoch": 0.4143309773336583,
"grad_norm": 1.9398436533180357,
"learning_rate": 7.2805719845160195e-06,
"loss": 0.5012,
"step": 425
},
{
"epoch": 0.41920545941993664,
"grad_norm": 1.8404891949351312,
"learning_rate": 7.20443748144678e-06,
"loss": 0.4985,
"step": 430
},
{
"epoch": 0.424079941506215,
"grad_norm": 1.9219013215637166,
"learning_rate": 7.127663147501377e-06,
"loss": 0.497,
"step": 435
},
{
"epoch": 0.4289544235924933,
"grad_norm": 2.087372537148497,
"learning_rate": 7.050271266182862e-06,
"loss": 0.4954,
"step": 440
},
{
"epoch": 0.43382890567877164,
"grad_norm": 1.9732921364375713,
"learning_rate": 6.97228430023543e-06,
"loss": 0.4914,
"step": 445
},
{
"epoch": 0.43870338776504997,
"grad_norm": 1.8509876619225545,
"learning_rate": 6.893724885124668e-06,
"loss": 0.4816,
"step": 450
},
{
"epoch": 0.4435778698513283,
"grad_norm": 1.876089892137146,
"learning_rate": 6.814615822467691e-06,
"loss": 0.4863,
"step": 455
},
{
"epoch": 0.44845235193760663,
"grad_norm": 1.956376416630627,
"learning_rate": 6.734980073415038e-06,
"loss": 0.4914,
"step": 460
},
{
"epoch": 0.45332683402388496,
"grad_norm": 1.831312024979652,
"learning_rate": 6.654840751986282e-06,
"loss": 0.4773,
"step": 465
},
{
"epoch": 0.4582013161101633,
"grad_norm": 1.9745029935310336,
"learning_rate": 6.574221118361254e-06,
"loss": 0.4843,
"step": 470
},
{
"epoch": 0.4630757981964416,
"grad_norm": 2.090848022257445,
"learning_rate": 6.493144572128852e-06,
"loss": 0.4891,
"step": 475
},
{
"epoch": 0.46795028028271995,
"grad_norm": 2.0539193178381345,
"learning_rate": 6.411634645495388e-06,
"loss": 0.465,
"step": 480
},
{
"epoch": 0.4728247623689983,
"grad_norm": 1.8380774504527437,
"learning_rate": 6.329714996454436e-06,
"loss": 0.4717,
"step": 485
},
{
"epoch": 0.4776992444552766,
"grad_norm": 1.9501438562254991,
"learning_rate": 6.247409401920184e-06,
"loss": 0.47,
"step": 490
},
{
"epoch": 0.48257372654155495,
"grad_norm": 1.9399214168055752,
"learning_rate": 6.164741750826246e-06,
"loss": 0.4696,
"step": 495
},
{
"epoch": 0.4874482086278333,
"grad_norm": 1.9438999474698049,
"learning_rate": 6.081736037191998e-06,
"loss": 0.4761,
"step": 500
},
{
"epoch": 0.4923226907141116,
"grad_norm": 2.109991763270123,
"learning_rate": 5.998416353158369e-06,
"loss": 0.467,
"step": 505
},
{
"epoch": 0.49719717280038994,
"grad_norm": 1.9305942803443825,
"learning_rate": 5.914806881995192e-06,
"loss": 0.4519,
"step": 510
},
{
"epoch": 0.5020716548866683,
"grad_norm": 1.8894659709352353,
"learning_rate": 5.830931891082077e-06,
"loss": 0.4625,
"step": 515
},
{
"epoch": 0.5069461369729467,
"grad_norm": 1.943600690573164,
"learning_rate": 5.746815724864884e-06,
"loss": 0.4486,
"step": 520
},
{
"epoch": 0.5118206190592249,
"grad_norm": 1.8604126506438579,
"learning_rate": 5.662482797789833e-06,
"loss": 0.4501,
"step": 525
},
{
"epoch": 0.5166951011455033,
"grad_norm": 1.8945174085661423,
"learning_rate": 5.577957587217281e-06,
"loss": 0.4576,
"step": 530
},
{
"epoch": 0.5215695832317816,
"grad_norm": 1.9454371455405457,
"learning_rate": 5.493264626317252e-06,
"loss": 0.4546,
"step": 535
},
{
"epoch": 0.52644406531806,
"grad_norm": 1.9257763152448473,
"learning_rate": 5.408428496948761e-06,
"loss": 0.4476,
"step": 540
},
{
"epoch": 0.5313185474043383,
"grad_norm": 1.942601355689532,
"learning_rate": 5.323473822525011e-06,
"loss": 0.4419,
"step": 545
},
{
"epoch": 0.5361930294906166,
"grad_norm": 1.8380797740924733,
"learning_rate": 5.238425260866524e-06,
"loss": 0.4339,
"step": 550
},
{
"epoch": 0.5410675115768949,
"grad_norm": 1.9856061522664756,
"learning_rate": 5.153307497044291e-06,
"loss": 0.4486,
"step": 555
},
{
"epoch": 0.5459419936631733,
"grad_norm": 2.0152615373656446,
"learning_rate": 5.068145236215007e-06,
"loss": 0.4361,
"step": 560
},
{
"epoch": 0.5508164757494516,
"grad_norm": 1.8713611793445957,
"learning_rate": 4.982963196450478e-06,
"loss": 0.4388,
"step": 565
},
{
"epoch": 0.55569095783573,
"grad_norm": 1.9591094110255243,
"learning_rate": 4.8977861015632865e-06,
"loss": 0.4382,
"step": 570
},
{
"epoch": 0.5605654399220082,
"grad_norm": 1.8237556125663812,
"learning_rate": 4.812638673930777e-06,
"loss": 0.4289,
"step": 575
},
{
"epoch": 0.5654399220082866,
"grad_norm": 1.8090312867754712,
"learning_rate": 4.72754562731947e-06,
"loss": 0.4258,
"step": 580
},
{
"epoch": 0.5703144040945649,
"grad_norm": 1.8257902996618338,
"learning_rate": 4.64253165971197e-06,
"loss": 0.4303,
"step": 585
},
{
"epoch": 0.5751888861808433,
"grad_norm": 2.0254032476042223,
"learning_rate": 4.557621446138455e-06,
"loss": 0.4202,
"step": 590
},
{
"epoch": 0.5800633682671216,
"grad_norm": 1.9947879579802112,
"learning_rate": 4.47283963151483e-06,
"loss": 0.424,
"step": 595
},
{
"epoch": 0.5849378503534,
"grad_norm": 1.8982751664240438,
"learning_rate": 4.388210823489616e-06,
"loss": 0.4221,
"step": 600
},
{
"epoch": 0.5898123324396782,
"grad_norm": 1.8999729869459319,
"learning_rate": 4.3037595853016645e-06,
"loss": 0.4162,
"step": 605
},
{
"epoch": 0.5946868145259566,
"grad_norm": 1.9051454199437885,
"learning_rate": 4.219510428650752e-06,
"loss": 0.4154,
"step": 610
},
{
"epoch": 0.5995612966122349,
"grad_norm": 1.8892938847030196,
"learning_rate": 4.135487806583141e-06,
"loss": 0.4183,
"step": 615
},
{
"epoch": 0.6044357786985133,
"grad_norm": 1.8391212463048745,
"learning_rate": 4.051716106394162e-06,
"loss": 0.4169,
"step": 620
},
{
"epoch": 0.6093102607847917,
"grad_norm": 1.9164931200596216,
"learning_rate": 3.968219642549876e-06,
"loss": 0.4096,
"step": 625
},
{
"epoch": 0.6141847428710699,
"grad_norm": 1.8882587070789514,
"learning_rate": 3.885022649629887e-06,
"loss": 0.4089,
"step": 630
},
{
"epoch": 0.6190592249573483,
"grad_norm": 2.0414351606801677,
"learning_rate": 3.8021492752933196e-06,
"loss": 0.4118,
"step": 635
},
{
"epoch": 0.6239337070436266,
"grad_norm": 1.88948983848305,
"learning_rate": 3.7196235732700546e-06,
"loss": 0.4155,
"step": 640
},
{
"epoch": 0.628808189129905,
"grad_norm": 1.9185526120999834,
"learning_rate": 3.637469496379201e-06,
"loss": 0.3988,
"step": 645
},
{
"epoch": 0.6336826712161833,
"grad_norm": 1.8441572398843216,
"learning_rate": 3.5557108895768723e-06,
"loss": 0.4099,
"step": 650
},
{
"epoch": 0.6385571533024617,
"grad_norm": 2.0485958888309796,
"learning_rate": 3.4743714830352604e-06,
"loss": 0.405,
"step": 655
},
{
"epoch": 0.6434316353887399,
"grad_norm": 1.806992701143073,
"learning_rate": 3.3934748852550353e-06,
"loss": 0.4037,
"step": 660
},
{
"epoch": 0.6483061174750183,
"grad_norm": 1.943526238424205,
"learning_rate": 3.3130445762130485e-06,
"loss": 0.3967,
"step": 665
},
{
"epoch": 0.6531805995612966,
"grad_norm": 1.8848557764463123,
"learning_rate": 3.2331039005473495e-06,
"loss": 0.3924,
"step": 670
},
{
"epoch": 0.658055081647575,
"grad_norm": 1.892841917353598,
"learning_rate": 3.1536760607814747e-06,
"loss": 0.3961,
"step": 675
},
{
"epoch": 0.6629295637338533,
"grad_norm": 1.7833910365050067,
"learning_rate": 3.0747841105899965e-06,
"loss": 0.3973,
"step": 680
},
{
"epoch": 0.6678040458201316,
"grad_norm": 1.8266463006432494,
"learning_rate": 2.9964509481072627e-06,
"loss": 0.3829,
"step": 685
},
{
"epoch": 0.6726785279064099,
"grad_norm": 1.9794830429310166,
"learning_rate": 2.918699309281292e-06,
"loss": 0.3886,
"step": 690
},
{
"epoch": 0.6775530099926883,
"grad_norm": 1.8605018457416864,
"learning_rate": 2.84155176127473e-06,
"loss": 0.3889,
"step": 695
},
{
"epoch": 0.6824274920789666,
"grad_norm": 1.8677944161418016,
"learning_rate": 2.765030695914815e-06,
"loss": 0.3878,
"step": 700
},
{
"epoch": 0.687301974165245,
"grad_norm": 1.7809933219767806,
"learning_rate": 2.689158323194212e-06,
"loss": 0.389,
"step": 705
},
{
"epoch": 0.6921764562515232,
"grad_norm": 1.9534924091406372,
"learning_rate": 2.6139566648246355e-06,
"loss": 0.38,
"step": 710
},
{
"epoch": 0.6970509383378016,
"grad_norm": 1.8154536663255623,
"learning_rate": 2.5394475478451246e-06,
"loss": 0.3819,
"step": 715
},
{
"epoch": 0.7019254204240799,
"grad_norm": 1.910488831165307,
"learning_rate": 2.4656525982868106e-06,
"loss": 0.3805,
"step": 720
},
{
"epoch": 0.7067999025103583,
"grad_norm": 1.8313054725277078,
"learning_rate": 2.39259323489603e-06,
"loss": 0.3742,
"step": 725
},
{
"epoch": 0.7116743845966366,
"grad_norm": 1.8851239942706999,
"learning_rate": 2.320290662917607e-06,
"loss": 0.3726,
"step": 730
},
{
"epoch": 0.716548866682915,
"grad_norm": 1.8525938618294533,
"learning_rate": 2.2487658679400943e-06,
"loss": 0.3812,
"step": 735
},
{
"epoch": 0.7214233487691932,
"grad_norm": 1.952969026800498,
"learning_rate": 2.178039609804777e-06,
"loss": 0.3757,
"step": 740
},
{
"epoch": 0.7262978308554716,
"grad_norm": 1.8641758450652162,
"learning_rate": 2.108132416580198e-06,
"loss": 0.3794,
"step": 745
},
{
"epoch": 0.7311723129417499,
"grad_norm": 1.8518338990761232,
"learning_rate": 2.0390645786039406e-06,
"loss": 0.3713,
"step": 750
},
{
"epoch": 0.7360467950280283,
"grad_norm": 1.8015066231257286,
"learning_rate": 1.9708561425934393e-06,
"loss": 0.3784,
"step": 755
},
{
"epoch": 0.7409212771143066,
"grad_norm": 1.8409794783180748,
"learning_rate": 1.903526905827474e-06,
"loss": 0.3751,
"step": 760
},
{
"epoch": 0.7457957592005849,
"grad_norm": 1.8641076874515972,
"learning_rate": 1.8370964104000783e-06,
"loss": 0.3746,
"step": 765
},
{
"epoch": 0.7506702412868632,
"grad_norm": 1.786737876865444,
"learning_rate": 1.7715839375485067e-06,
"loss": 0.3628,
"step": 770
},
{
"epoch": 0.7555447233731416,
"grad_norm": 1.8446799495571504,
"learning_rate": 1.7070085020569194e-06,
"loss": 0.3644,
"step": 775
},
{
"epoch": 0.76041920545942,
"grad_norm": 1.751658260263151,
"learning_rate": 1.6433888467374015e-06,
"loss": 0.37,
"step": 780
},
{
"epoch": 0.7652936875456983,
"grad_norm": 1.8477843728803192,
"learning_rate": 1.5807434369899248e-06,
"loss": 0.3628,
"step": 785
},
{
"epoch": 0.7701681696319767,
"grad_norm": 1.8593690412561963,
"learning_rate": 1.51909045544282e-06,
"loss": 0.3708,
"step": 790
},
{
"epoch": 0.7750426517182549,
"grad_norm": 1.747782138054366,
"learning_rate": 1.4584477966753324e-06,
"loss": 0.3652,
"step": 795
},
{
"epoch": 0.7799171338045333,
"grad_norm": 1.755667670884649,
"learning_rate": 1.398833062023775e-06,
"loss": 0.3691,
"step": 800
},
{
"epoch": 0.7847916158908116,
"grad_norm": 1.8425636184668681,
"learning_rate": 1.3402635544727992e-06,
"loss": 0.366,
"step": 805
},
{
"epoch": 0.78966609797709,
"grad_norm": 1.9026395556203364,
"learning_rate": 1.2827562736332555e-06,
"loss": 0.3589,
"step": 810
},
{
"epoch": 0.7945405800633683,
"grad_norm": 1.7133796113318556,
"learning_rate": 1.226327910808116e-06,
"loss": 0.3597,
"step": 815
},
{
"epoch": 0.7994150621496466,
"grad_norm": 1.8616850952101744,
"learning_rate": 1.1709948441478763e-06,
"loss": 0.3583,
"step": 820
},
{
"epoch": 0.8042895442359249,
"grad_norm": 1.7675149524570417,
"learning_rate": 1.116773133896848e-06,
"loss": 0.3605,
"step": 825
},
{
"epoch": 0.8091640263222033,
"grad_norm": 1.900735872689697,
"learning_rate": 1.0636785177317255e-06,
"loss": 0.3547,
"step": 830
},
{
"epoch": 0.8140385084084816,
"grad_norm": 1.7353000724393655,
"learning_rate": 1.0117264061937777e-06,
"loss": 0.3543,
"step": 835
},
{
"epoch": 0.81891299049476,
"grad_norm": 1.8226701532638399,
"learning_rate": 9.60931878215985e-07,
"loss": 0.3523,
"step": 840
},
{
"epoch": 0.8237874725810382,
"grad_norm": 1.7199982449094653,
"learning_rate": 9.113096767464302e-07,
"loss": 0.3572,
"step": 845
},
{
"epoch": 0.8286619546673166,
"grad_norm": 1.7416045554651556,
"learning_rate": 8.62874204469204e-07,
"loss": 0.3546,
"step": 850
},
{
"epoch": 0.8335364367535949,
"grad_norm": 1.7748467098472374,
"learning_rate": 8.156395196240752e-07,
"loss": 0.3488,
"step": 855
},
{
"epoch": 0.8384109188398733,
"grad_norm": 1.8256676668006175,
"learning_rate": 7.696193319261242e-07,
"loss": 0.3467,
"step": 860
},
{
"epoch": 0.8432854009261516,
"grad_norm": 1.8827284517179357,
"learning_rate": 7.248269985865514e-07,
"loss": 0.3521,
"step": 865
},
{
"epoch": 0.84815988301243,
"grad_norm": 1.6781224427131647,
"learning_rate": 6.812755204357857e-07,
"loss": 0.3535,
"step": 870
},
{
"epoch": 0.8530343650987082,
"grad_norm": 1.6995357059191782,
"learning_rate": 6.389775381500351e-07,
"loss": 0.3435,
"step": 875
},
{
"epoch": 0.8579088471849866,
"grad_norm": 1.7402333449103136,
"learning_rate": 5.979453285823711e-07,
"loss": 0.3443,
"step": 880
},
{
"epoch": 0.8627833292712649,
"grad_norm": 1.7615298544617708,
"learning_rate": 5.58190801199413e-07,
"loss": 0.3528,
"step": 885
},
{
"epoch": 0.8676578113575433,
"grad_norm": 1.7343491942185678,
"learning_rate": 5.197254946246416e-07,
"loss": 0.3495,
"step": 890
},
{
"epoch": 0.8725322934438215,
"grad_norm": 1.657555480865734,
"learning_rate": 4.825605732893546e-07,
"loss": 0.3468,
"step": 895
},
{
"epoch": 0.8774067755300999,
"grad_norm": 1.6499922538828395,
"learning_rate": 4.4670682419221954e-07,
"loss": 0.3396,
"step": 900
},
{
"epoch": 0.8822812576163782,
"grad_norm": 1.7518714860746227,
"learning_rate": 4.121746537683907e-07,
"loss": 0.3504,
"step": 905
},
{
"epoch": 0.8871557397026566,
"grad_norm": 1.770832100980816,
"learning_rate": 3.789740848690682e-07,
"loss": 0.3518,
"step": 910
},
{
"epoch": 0.8920302217889349,
"grad_norm": 1.7567597669594073,
"learning_rate": 3.4711475385240057e-07,
"loss": 0.3492,
"step": 915
},
{
"epoch": 0.8969047038752133,
"grad_norm": 1.7161657861744943,
"learning_rate": 3.1660590778656406e-07,
"loss": 0.3428,
"step": 920
},
{
"epoch": 0.9017791859614915,
"grad_norm": 1.728233513619595,
"learning_rate": 2.8745640176582766e-07,
"loss": 0.3396,
"step": 925
},
{
"epoch": 0.9066536680477699,
"grad_norm": 1.8381612957565303,
"learning_rate": 2.5967469634039177e-07,
"loss": 0.345,
"step": 930
},
{
"epoch": 0.9115281501340483,
"grad_norm": 1.7243006426458327,
"learning_rate": 2.3326885506074314e-07,
"loss": 0.3465,
"step": 935
},
{
"epoch": 0.9164026322203266,
"grad_norm": 1.5970738101828363,
"learning_rate": 2.0824654213723038e-07,
"loss": 0.3429,
"step": 940
},
{
"epoch": 0.921277114306605,
"grad_norm": 1.6533959425871336,
"learning_rate": 1.8461502021555721e-07,
"loss": 0.3389,
"step": 945
},
{
"epoch": 0.9261515963928832,
"grad_norm": 1.7738912948147607,
"learning_rate": 1.6238114826881868e-07,
"loss": 0.3439,
"step": 950
},
{
"epoch": 0.9310260784791616,
"grad_norm": 1.7429674496591296,
"learning_rate": 1.4155137960670974e-07,
"loss": 0.3389,
"step": 955
},
{
"epoch": 0.9359005605654399,
"grad_norm": 1.7343774559673155,
"learning_rate": 1.2213176000246852e-07,
"loss": 0.345,
"step": 960
},
{
"epoch": 0.9407750426517183,
"grad_norm": 1.7876244818652967,
"learning_rate": 1.0412792593811505e-07,
"loss": 0.3426,
"step": 965
},
{
"epoch": 0.9456495247379966,
"grad_norm": 1.6810981829120015,
"learning_rate": 8.754510296847651e-08,
"loss": 0.3444,
"step": 970
},
{
"epoch": 0.950524006824275,
"grad_norm": 1.7314637174387817,
"learning_rate": 7.238810420448883e-08,
"loss": 0.3339,
"step": 975
},
{
"epoch": 0.9553984889105532,
"grad_norm": 1.6057497995450185,
"learning_rate": 5.866132891620746e-08,
"loss": 0.346,
"step": 980
},
{
"epoch": 0.9602729709968316,
"grad_norm": 1.7119707817106466,
"learning_rate": 4.6368761255930485e-08,
"loss": 0.3379,
"step": 985
},
{
"epoch": 0.9651474530831099,
"grad_norm": 1.7236174016491694,
"learning_rate": 3.551396910181415e-08,
"loss": 0.3412,
"step": 990
},
{
"epoch": 0.9700219351693883,
"grad_norm": 1.6591399917697027,
"learning_rate": 2.6100103022306257e-08,
"loss": 0.3433,
"step": 995
},
{
"epoch": 0.9748964172556666,
"grad_norm": 1.7276142814853213,
"learning_rate": 1.812989536170484e-08,
"loss": 0.3414,
"step": 1000
},
{
"epoch": 0.979770899341945,
"grad_norm": 1.6558936839338774,
"learning_rate": 1.1605659447102568e-08,
"loss": 0.3366,
"step": 1005
},
{
"epoch": 0.9846453814282232,
"grad_norm": 1.6766541339225858,
"learning_rate": 6.529288916952703e-09,
"loss": 0.3331,
"step": 1010
},
{
"epoch": 0.9895198635145016,
"grad_norm": 1.7452767005894518,
"learning_rate": 2.9022571714448776e-09,
"loss": 0.3429,
"step": 1015
},
{
"epoch": 0.9943943456007799,
"grad_norm": 1.8097610241635727,
"learning_rate": 7.256169448560668e-10,
"loss": 0.3415,
"step": 1020
},
{
"epoch": 0.9992688276870583,
"grad_norm": 1.758337391190151,
"learning_rate": 0.0,
"loss": 0.3467,
"step": 1025
},
{
"epoch": 0.9992688276870583,
"eval_loss": 0.3359443247318268,
"eval_runtime": 96.9406,
"eval_samples_per_second": 3.115,
"eval_steps_per_second": 0.784,
"step": 1025
},
{
"epoch": 0.9992688276870583,
"step": 1025,
"total_flos": 214561802158080.0,
"train_loss": 0.5001517156275307,
"train_runtime": 26987.6611,
"train_samples_per_second": 1.216,
"train_steps_per_second": 0.038
}
],
"logging_steps": 5,
"max_steps": 1025,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 214561802158080.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}