eightwords-241113-mt2 / trainer_state.json
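This is the Hugging Face Trainer state saved for the run. Besides the top-level fields (current epoch, global_step, eval_steps, and so on), the bulk of the file is "log_history": one record per logging step carrying the epoch, grad_norm, learning_rate, loss, and step, plus an extra record with eval_loss and eval_runtime each time evaluation runs. A minimal sketch for inspecting the log, assuming a local copy of the file at trainer_state.json, could look like this:

# Minimal sketch: load trainer_state.json and split "log_history" into
# training-loss records and the periodic evaluation records.
# The file path is an assumption; point it at your local copy.
import json

with open("trainer_state.json", "r", encoding="utf-8") as f:
    state = json.load(f)

train_points = []  # (step, loss) from the regular logging records
eval_points = []   # (step, eval_loss) from the eval records

for entry in state["log_history"]:
    if "loss" in entry:
        train_points.append((entry["step"], entry["loss"]))
    if "eval_loss" in entry:
        eval_points.append((entry["step"], entry["eval_loss"]))

print(f"logged up to epoch {state['epoch']:.2f}, step {state['global_step']}")
print(f"{len(train_points)} training records, {len(eval_points)} eval records")
print("eval losses by step:", eval_points)

Logging records appear every 10 optimizer steps, and the eval records come once every eval_steps = 1000 steps. The raw contents of the file follow.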
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 63.298904538341155,
"eval_steps": 1000,
"global_step": 5056,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.12519561815336464,
"grad_norm": 27.375,
"learning_rate": 1.9762845849802374e-07,
"loss": 2.599,
"step": 10
},
{
"epoch": 0.25039123630672927,
"grad_norm": 24.625,
"learning_rate": 3.9525691699604747e-07,
"loss": 2.5019,
"step": 20
},
{
"epoch": 0.3755868544600939,
"grad_norm": 23.875,
"learning_rate": 5.928853754940712e-07,
"loss": 2.4051,
"step": 30
},
{
"epoch": 0.5007824726134585,
"grad_norm": 15.9375,
"learning_rate": 7.905138339920949e-07,
"loss": 2.4794,
"step": 40
},
{
"epoch": 0.6259780907668232,
"grad_norm": 37.25,
"learning_rate": 9.881422924901187e-07,
"loss": 2.3233,
"step": 50
},
{
"epoch": 0.7511737089201878,
"grad_norm": 26.375,
"learning_rate": 1.1857707509881424e-06,
"loss": 2.2146,
"step": 60
},
{
"epoch": 0.8763693270735524,
"grad_norm": 22.875,
"learning_rate": 1.3833992094861662e-06,
"loss": 2.259,
"step": 70
},
{
"epoch": 1.001564945226917,
"grad_norm": 20.625,
"learning_rate": 1.5810276679841899e-06,
"loss": 2.1342,
"step": 80
},
{
"epoch": 1.1267605633802817,
"grad_norm": 17.375,
"learning_rate": 1.7786561264822136e-06,
"loss": 2.1221,
"step": 90
},
{
"epoch": 1.2519561815336462,
"grad_norm": 4.8125,
"learning_rate": 1.9762845849802374e-06,
"loss": 2.0237,
"step": 100
},
{
"epoch": 1.3771517996870108,
"grad_norm": 4.5625,
"learning_rate": 2.173913043478261e-06,
"loss": 1.9913,
"step": 110
},
{
"epoch": 1.5023474178403755,
"grad_norm": 4.4375,
"learning_rate": 2.371541501976285e-06,
"loss": 2.0106,
"step": 120
},
{
"epoch": 1.6275430359937402,
"grad_norm": 4.09375,
"learning_rate": 2.5691699604743086e-06,
"loss": 1.9704,
"step": 130
},
{
"epoch": 1.7527386541471048,
"grad_norm": 5.0,
"learning_rate": 2.7667984189723323e-06,
"loss": 1.9563,
"step": 140
},
{
"epoch": 1.8779342723004695,
"grad_norm": 3.84375,
"learning_rate": 2.964426877470356e-06,
"loss": 1.939,
"step": 150
},
{
"epoch": 2.003129890453834,
"grad_norm": 4.40625,
"learning_rate": 3.1620553359683798e-06,
"loss": 1.9485,
"step": 160
},
{
"epoch": 2.128325508607199,
"grad_norm": 4.625,
"learning_rate": 3.3596837944664035e-06,
"loss": 1.8826,
"step": 170
},
{
"epoch": 2.2535211267605635,
"grad_norm": 4.53125,
"learning_rate": 3.5573122529644273e-06,
"loss": 1.7508,
"step": 180
},
{
"epoch": 2.378716744913928,
"grad_norm": 5.21875,
"learning_rate": 3.754940711462451e-06,
"loss": 1.7644,
"step": 190
},
{
"epoch": 2.5039123630672924,
"grad_norm": 4.34375,
"learning_rate": 3.952569169960475e-06,
"loss": 1.9122,
"step": 200
},
{
"epoch": 2.629107981220657,
"grad_norm": 4.8125,
"learning_rate": 4.150197628458498e-06,
"loss": 1.7887,
"step": 210
},
{
"epoch": 2.7543035993740217,
"grad_norm": 4.21875,
"learning_rate": 4.347826086956522e-06,
"loss": 1.8224,
"step": 220
},
{
"epoch": 2.8794992175273864,
"grad_norm": 3.8125,
"learning_rate": 4.5454545454545455e-06,
"loss": 1.8029,
"step": 230
},
{
"epoch": 3.004694835680751,
"grad_norm": 5.25,
"learning_rate": 4.74308300395257e-06,
"loss": 1.8384,
"step": 240
},
{
"epoch": 3.1298904538341157,
"grad_norm": 4.15625,
"learning_rate": 4.940711462450593e-06,
"loss": 1.6869,
"step": 250
},
{
"epoch": 3.2550860719874803,
"grad_norm": 3.71875,
"learning_rate": 5.138339920948617e-06,
"loss": 1.6394,
"step": 260
},
{
"epoch": 3.380281690140845,
"grad_norm": 4.0625,
"learning_rate": 5.335968379446641e-06,
"loss": 1.6657,
"step": 270
},
{
"epoch": 3.5054773082942097,
"grad_norm": 3.953125,
"learning_rate": 5.533596837944665e-06,
"loss": 1.6843,
"step": 280
},
{
"epoch": 3.6306729264475743,
"grad_norm": 4.125,
"learning_rate": 5.731225296442689e-06,
"loss": 1.6496,
"step": 290
},
{
"epoch": 3.755868544600939,
"grad_norm": 3.875,
"learning_rate": 5.928853754940712e-06,
"loss": 1.7035,
"step": 300
},
{
"epoch": 3.8810641627543037,
"grad_norm": 3.453125,
"learning_rate": 6.126482213438736e-06,
"loss": 1.6954,
"step": 310
},
{
"epoch": 4.006259780907668,
"grad_norm": 3.390625,
"learning_rate": 6.3241106719367596e-06,
"loss": 1.7039,
"step": 320
},
{
"epoch": 4.131455399061033,
"grad_norm": 4.125,
"learning_rate": 6.521739130434783e-06,
"loss": 1.5767,
"step": 330
},
{
"epoch": 4.256651017214398,
"grad_norm": 5.0,
"learning_rate": 6.719367588932807e-06,
"loss": 1.5438,
"step": 340
},
{
"epoch": 4.381846635367762,
"grad_norm": 3.125,
"learning_rate": 6.91699604743083e-06,
"loss": 1.4615,
"step": 350
},
{
"epoch": 4.507042253521127,
"grad_norm": 4.28125,
"learning_rate": 7.1146245059288545e-06,
"loss": 1.5243,
"step": 360
},
{
"epoch": 4.632237871674492,
"grad_norm": 3.46875,
"learning_rate": 7.312252964426878e-06,
"loss": 1.5667,
"step": 370
},
{
"epoch": 4.757433489827856,
"grad_norm": 3.5625,
"learning_rate": 7.509881422924902e-06,
"loss": 1.548,
"step": 380
},
{
"epoch": 4.882629107981221,
"grad_norm": 3.421875,
"learning_rate": 7.707509881422925e-06,
"loss": 1.5454,
"step": 390
},
{
"epoch": 5.007824726134586,
"grad_norm": 3.03125,
"learning_rate": 7.90513833992095e-06,
"loss": 1.526,
"step": 400
},
{
"epoch": 5.13302034428795,
"grad_norm": 3.234375,
"learning_rate": 8.102766798418974e-06,
"loss": 1.4051,
"step": 410
},
{
"epoch": 5.258215962441315,
"grad_norm": 2.953125,
"learning_rate": 8.300395256916996e-06,
"loss": 1.4057,
"step": 420
},
{
"epoch": 5.383411580594679,
"grad_norm": 2.90625,
"learning_rate": 8.49802371541502e-06,
"loss": 1.3852,
"step": 430
},
{
"epoch": 5.508607198748043,
"grad_norm": 3.78125,
"learning_rate": 8.695652173913044e-06,
"loss": 1.3708,
"step": 440
},
{
"epoch": 5.633802816901408,
"grad_norm": 3.125,
"learning_rate": 8.893280632411067e-06,
"loss": 1.3361,
"step": 450
},
{
"epoch": 5.758998435054773,
"grad_norm": 3.3125,
"learning_rate": 9.090909090909091e-06,
"loss": 1.4118,
"step": 460
},
{
"epoch": 5.884194053208137,
"grad_norm": 3.5,
"learning_rate": 9.288537549407115e-06,
"loss": 1.3362,
"step": 470
},
{
"epoch": 6.009389671361502,
"grad_norm": 3.453125,
"learning_rate": 9.48616600790514e-06,
"loss": 1.477,
"step": 480
},
{
"epoch": 6.134585289514867,
"grad_norm": 4.46875,
"learning_rate": 9.683794466403162e-06,
"loss": 1.2172,
"step": 490
},
{
"epoch": 6.259780907668231,
"grad_norm": 3.5625,
"learning_rate": 9.881422924901186e-06,
"loss": 1.1359,
"step": 500
},
{
"epoch": 6.384976525821596,
"grad_norm": 3.359375,
"learning_rate": 9.999980930615864e-06,
"loss": 1.2047,
"step": 510
},
{
"epoch": 6.510172143974961,
"grad_norm": 3.515625,
"learning_rate": 9.999766401714795e-06,
"loss": 1.1727,
"step": 520
},
{
"epoch": 6.635367762128325,
"grad_norm": 3.4375,
"learning_rate": 9.999313517443876e-06,
"loss": 1.3046,
"step": 530
},
{
"epoch": 6.76056338028169,
"grad_norm": 3.640625,
"learning_rate": 9.998622299393598e-06,
"loss": 1.2101,
"step": 540
},
{
"epoch": 6.885758998435055,
"grad_norm": 3.25,
"learning_rate": 9.997692780516608e-06,
"loss": 1.1944,
"step": 550
},
{
"epoch": 7.010954616588419,
"grad_norm": 3.53125,
"learning_rate": 9.996525005126135e-06,
"loss": 1.2125,
"step": 560
},
{
"epoch": 7.136150234741784,
"grad_norm": 2.96875,
"learning_rate": 9.995119028893888e-06,
"loss": 1.0638,
"step": 570
},
{
"epoch": 7.261345852895149,
"grad_norm": 4.03125,
"learning_rate": 9.993474918847401e-06,
"loss": 1.0207,
"step": 580
},
{
"epoch": 7.386541471048513,
"grad_norm": 3.484375,
"learning_rate": 9.991592753366822e-06,
"loss": 1.0174,
"step": 590
},
{
"epoch": 7.511737089201878,
"grad_norm": 3.484375,
"learning_rate": 9.989472622181194e-06,
"loss": 1.0496,
"step": 600
},
{
"epoch": 7.636932707355243,
"grad_norm": 3.359375,
"learning_rate": 9.987114626364172e-06,
"loss": 0.9966,
"step": 610
},
{
"epoch": 7.762128325508607,
"grad_norm": 3.484375,
"learning_rate": 9.984518878329197e-06,
"loss": 0.9866,
"step": 620
},
{
"epoch": 7.887323943661972,
"grad_norm": 3.25,
"learning_rate": 9.98168550182415e-06,
"loss": 1.0372,
"step": 630
},
{
"epoch": 8.012519561815337,
"grad_norm": 3.125,
"learning_rate": 9.978614631925442e-06,
"loss": 0.9753,
"step": 640
},
{
"epoch": 8.137715179968701,
"grad_norm": 3.234375,
"learning_rate": 9.975306415031577e-06,
"loss": 0.8432,
"step": 650
},
{
"epoch": 8.262910798122066,
"grad_norm": 3.390625,
"learning_rate": 9.97176100885618e-06,
"loss": 0.8252,
"step": 660
},
{
"epoch": 8.38810641627543,
"grad_norm": 3.484375,
"learning_rate": 9.967978582420463e-06,
"loss": 0.8158,
"step": 670
},
{
"epoch": 8.513302034428795,
"grad_norm": 3.34375,
"learning_rate": 9.963959316045185e-06,
"loss": 0.833,
"step": 680
},
{
"epoch": 8.63849765258216,
"grad_norm": 3.078125,
"learning_rate": 9.959703401342037e-06,
"loss": 0.8328,
"step": 690
},
{
"epoch": 8.763693270735525,
"grad_norm": 2.703125,
"learning_rate": 9.955211041204529e-06,
"loss": 0.7759,
"step": 700
},
{
"epoch": 8.88888888888889,
"grad_norm": 3.015625,
"learning_rate": 9.950482449798295e-06,
"loss": 0.8572,
"step": 710
},
{
"epoch": 9.014084507042254,
"grad_norm": 2.296875,
"learning_rate": 9.9455178525509e-06,
"loss": 0.7794,
"step": 720
},
{
"epoch": 9.139280125195619,
"grad_norm": 3.84375,
"learning_rate": 9.940317486141084e-06,
"loss": 0.6747,
"step": 730
},
{
"epoch": 9.264475743348983,
"grad_norm": 3.03125,
"learning_rate": 9.934881598487478e-06,
"loss": 0.6434,
"step": 740
},
{
"epoch": 9.389671361502348,
"grad_norm": 2.296875,
"learning_rate": 9.929210448736797e-06,
"loss": 0.6149,
"step": 750
},
{
"epoch": 9.514866979655713,
"grad_norm": 2.8125,
"learning_rate": 9.923304307251467e-06,
"loss": 0.6818,
"step": 760
},
{
"epoch": 9.640062597809077,
"grad_norm": 2.640625,
"learning_rate": 9.917163455596753e-06,
"loss": 0.6376,
"step": 770
},
{
"epoch": 9.765258215962442,
"grad_norm": 2.953125,
"learning_rate": 9.910788186527325e-06,
"loss": 0.6487,
"step": 780
},
{
"epoch": 9.890453834115807,
"grad_norm": 2.640625,
"learning_rate": 9.904178803973306e-06,
"loss": 0.6511,
"step": 790
},
{
"epoch": 10.015649452269171,
"grad_norm": 1.8046875,
"learning_rate": 9.89733562302578e-06,
"loss": 0.6073,
"step": 800
},
{
"epoch": 10.140845070422536,
"grad_norm": 2.1875,
"learning_rate": 9.890258969921777e-06,
"loss": 0.4669,
"step": 810
},
{
"epoch": 10.2660406885759,
"grad_norm": 2.359375,
"learning_rate": 9.882949182028709e-06,
"loss": 0.5051,
"step": 820
},
{
"epoch": 10.391236306729265,
"grad_norm": 2.140625,
"learning_rate": 9.8754066078283e-06,
"loss": 0.4987,
"step": 830
},
{
"epoch": 10.51643192488263,
"grad_norm": 2.046875,
"learning_rate": 9.867631606899957e-06,
"loss": 0.5019,
"step": 840
},
{
"epoch": 10.641627543035995,
"grad_norm": 1.90625,
"learning_rate": 9.859624549903646e-06,
"loss": 0.5047,
"step": 850
},
{
"epoch": 10.766823161189357,
"grad_norm": 1.7109375,
"learning_rate": 9.851385818562204e-06,
"loss": 0.5325,
"step": 860
},
{
"epoch": 10.892018779342724,
"grad_norm": 1.890625,
"learning_rate": 9.842915805643156e-06,
"loss": 0.5332,
"step": 870
},
{
"epoch": 11.017214397496087,
"grad_norm": 1.484375,
"learning_rate": 9.834214914939977e-06,
"loss": 0.4857,
"step": 880
},
{
"epoch": 11.142410015649451,
"grad_norm": 1.84375,
"learning_rate": 9.82528356125285e-06,
"loss": 0.3812,
"step": 890
},
{
"epoch": 11.267605633802816,
"grad_norm": 1.234375,
"learning_rate": 9.816122170368891e-06,
"loss": 0.3738,
"step": 900
},
{
"epoch": 11.39280125195618,
"grad_norm": 1.5234375,
"learning_rate": 9.806731179041849e-06,
"loss": 0.4202,
"step": 910
},
{
"epoch": 11.517996870109545,
"grad_norm": 1.3984375,
"learning_rate": 9.797111034971278e-06,
"loss": 0.3592,
"step": 920
},
{
"epoch": 11.64319248826291,
"grad_norm": 1.359375,
"learning_rate": 9.787262196781208e-06,
"loss": 0.3406,
"step": 930
},
{
"epoch": 11.768388106416275,
"grad_norm": 1.3828125,
"learning_rate": 9.777185133998268e-06,
"loss": 0.3592,
"step": 940
},
{
"epoch": 11.89358372456964,
"grad_norm": 1.3046875,
"learning_rate": 9.76688032702931e-06,
"loss": 0.3725,
"step": 950
},
{
"epoch": 12.018779342723004,
"grad_norm": 1.296875,
"learning_rate": 9.756348267138497e-06,
"loss": 0.4022,
"step": 960
},
{
"epoch": 12.143974960876369,
"grad_norm": 1.265625,
"learning_rate": 9.745589456423897e-06,
"loss": 0.2901,
"step": 970
},
{
"epoch": 12.269170579029733,
"grad_norm": 1.5625,
"learning_rate": 9.734604407793529e-06,
"loss": 0.3043,
"step": 980
},
{
"epoch": 12.394366197183098,
"grad_norm": 1.4453125,
"learning_rate": 9.72339364494093e-06,
"loss": 0.2778,
"step": 990
},
{
"epoch": 12.519561815336463,
"grad_norm": 1.234375,
"learning_rate": 9.711957702320176e-06,
"loss": 0.2795,
"step": 1000
},
{
"epoch": 12.519561815336463,
"eval_loss": 2.0127437114715576,
"eval_runtime": 3.1921,
"eval_samples_per_second": 22.556,
"eval_steps_per_second": 22.556,
"step": 1000
},
{
"epoch": 12.644757433489827,
"grad_norm": 1.203125,
"learning_rate": 9.7002971251204e-06,
"loss": 0.2932,
"step": 1010
},
{
"epoch": 12.769953051643192,
"grad_norm": 1.1484375,
"learning_rate": 9.688412469239812e-06,
"loss": 0.3021,
"step": 1020
},
{
"epoch": 12.895148669796557,
"grad_norm": 1.0390625,
"learning_rate": 9.676304301259196e-06,
"loss": 0.2861,
"step": 1030
},
{
"epoch": 13.020344287949921,
"grad_norm": 0.921875,
"learning_rate": 9.663973198414888e-06,
"loss": 0.2959,
"step": 1040
},
{
"epoch": 13.145539906103286,
"grad_norm": 1.1953125,
"learning_rate": 9.651419748571272e-06,
"loss": 0.2115,
"step": 1050
},
{
"epoch": 13.27073552425665,
"grad_norm": 1.1640625,
"learning_rate": 9.638644550192741e-06,
"loss": 0.2322,
"step": 1060
},
{
"epoch": 13.395931142410015,
"grad_norm": 1.3359375,
"learning_rate": 9.625648212315177e-06,
"loss": 0.2443,
"step": 1070
},
{
"epoch": 13.52112676056338,
"grad_norm": 1.3125,
"learning_rate": 9.612431354516912e-06,
"loss": 0.2237,
"step": 1080
},
{
"epoch": 13.646322378716745,
"grad_norm": 1.7578125,
"learning_rate": 9.598994606889187e-06,
"loss": 0.2261,
"step": 1090
},
{
"epoch": 13.77151799687011,
"grad_norm": 1.0625,
"learning_rate": 9.585338610006122e-06,
"loss": 0.2163,
"step": 1100
},
{
"epoch": 13.896713615023474,
"grad_norm": 1.1640625,
"learning_rate": 9.571464014894168e-06,
"loss": 0.2223,
"step": 1110
},
{
"epoch": 14.021909233176839,
"grad_norm": 0.8828125,
"learning_rate": 9.557371483001078e-06,
"loss": 0.2216,
"step": 1120
},
{
"epoch": 14.147104851330203,
"grad_norm": 0.921875,
"learning_rate": 9.543061686164374e-06,
"loss": 0.1752,
"step": 1130
},
{
"epoch": 14.272300469483568,
"grad_norm": 0.953125,
"learning_rate": 9.528535306579306e-06,
"loss": 0.1694,
"step": 1140
},
{
"epoch": 14.397496087636933,
"grad_norm": 0.94921875,
"learning_rate": 9.513793036766345e-06,
"loss": 0.1597,
"step": 1150
},
{
"epoch": 14.522691705790297,
"grad_norm": 1.1328125,
"learning_rate": 9.498835579538164e-06,
"loss": 0.1627,
"step": 1160
},
{
"epoch": 14.647887323943662,
"grad_norm": 0.8359375,
"learning_rate": 9.483663647966124e-06,
"loss": 0.187,
"step": 1170
},
{
"epoch": 14.773082942097027,
"grad_norm": 1.015625,
"learning_rate": 9.468277965346292e-06,
"loss": 0.168,
"step": 1180
},
{
"epoch": 14.898278560250391,
"grad_norm": 1.140625,
"learning_rate": 9.452679265164951e-06,
"loss": 0.1605,
"step": 1190
},
{
"epoch": 15.023474178403756,
"grad_norm": 0.74609375,
"learning_rate": 9.43686829106363e-06,
"loss": 0.1667,
"step": 1200
},
{
"epoch": 15.14866979655712,
"grad_norm": 0.8046875,
"learning_rate": 9.42084579680366e-06,
"loss": 0.1442,
"step": 1210
},
{
"epoch": 15.273865414710485,
"grad_norm": 1.03125,
"learning_rate": 9.404612546230244e-06,
"loss": 0.1222,
"step": 1220
},
{
"epoch": 15.39906103286385,
"grad_norm": 0.640625,
"learning_rate": 9.38816931323602e-06,
"loss": 0.1281,
"step": 1230
},
{
"epoch": 15.524256651017215,
"grad_norm": 0.765625,
"learning_rate": 9.371516881724192e-06,
"loss": 0.1207,
"step": 1240
},
{
"epoch": 15.64945226917058,
"grad_norm": 0.68359375,
"learning_rate": 9.35465604557114e-06,
"loss": 0.1191,
"step": 1250
},
{
"epoch": 15.774647887323944,
"grad_norm": 0.75390625,
"learning_rate": 9.337587608588588e-06,
"loss": 0.1207,
"step": 1260
},
{
"epoch": 15.899843505477309,
"grad_norm": 0.59765625,
"learning_rate": 9.320312384485274e-06,
"loss": 0.1312,
"step": 1270
},
{
"epoch": 16.025039123630673,
"grad_norm": 0.447265625,
"learning_rate": 9.30283119682816e-06,
"loss": 0.1173,
"step": 1280
},
{
"epoch": 16.150234741784036,
"grad_norm": 0.7734375,
"learning_rate": 9.285144879003173e-06,
"loss": 0.0862,
"step": 1290
},
{
"epoch": 16.275430359937403,
"grad_norm": 0.53125,
"learning_rate": 9.267254274175467e-06,
"loss": 0.0801,
"step": 1300
},
{
"epoch": 16.400625978090765,
"grad_norm": 0.609375,
"learning_rate": 9.24916023524924e-06,
"loss": 0.0852,
"step": 1310
},
{
"epoch": 16.525821596244132,
"grad_norm": 0.796875,
"learning_rate": 9.23086362482706e-06,
"loss": 0.113,
"step": 1320
},
{
"epoch": 16.651017214397495,
"grad_norm": 0.66015625,
"learning_rate": 9.212365315168743e-06,
"loss": 0.0951,
"step": 1330
},
{
"epoch": 16.77621283255086,
"grad_norm": 0.625,
"learning_rate": 9.193666188149782e-06,
"loss": 0.0917,
"step": 1340
},
{
"epoch": 16.901408450704224,
"grad_norm": 0.640625,
"learning_rate": 9.174767135219291e-06,
"loss": 0.0849,
"step": 1350
},
{
"epoch": 17.02660406885759,
"grad_norm": 0.61328125,
"learning_rate": 9.155669057357515e-06,
"loss": 0.0907,
"step": 1360
},
{
"epoch": 17.151799687010953,
"grad_norm": 0.828125,
"learning_rate": 9.136372865032871e-06,
"loss": 0.0577,
"step": 1370
},
{
"epoch": 17.27699530516432,
"grad_norm": 0.4921875,
"learning_rate": 9.116879478158552e-06,
"loss": 0.0689,
"step": 1380
},
{
"epoch": 17.402190923317683,
"grad_norm": 0.51953125,
"learning_rate": 9.09718982604866e-06,
"loss": 0.0723,
"step": 1390
},
{
"epoch": 17.52738654147105,
"grad_norm": 0.578125,
"learning_rate": 9.077304847373913e-06,
"loss": 0.0639,
"step": 1400
},
{
"epoch": 17.652582159624412,
"grad_norm": 0.68359375,
"learning_rate": 9.057225490116887e-06,
"loss": 0.0594,
"step": 1410
},
{
"epoch": 17.77777777777778,
"grad_norm": 0.640625,
"learning_rate": 9.036952711526834e-06,
"loss": 0.0752,
"step": 1420
},
{
"epoch": 17.90297339593114,
"grad_norm": 0.59375,
"learning_rate": 9.016487478074032e-06,
"loss": 0.0627,
"step": 1430
},
{
"epoch": 18.028169014084508,
"grad_norm": 0.462890625,
"learning_rate": 8.995830765403721e-06,
"loss": 0.0656,
"step": 1440
},
{
"epoch": 18.15336463223787,
"grad_norm": 0.5078125,
"learning_rate": 8.974983558289586e-06,
"loss": 0.0413,
"step": 1450
},
{
"epoch": 18.278560250391237,
"grad_norm": 0.53125,
"learning_rate": 8.953946850586813e-06,
"loss": 0.0448,
"step": 1460
},
{
"epoch": 18.4037558685446,
"grad_norm": 0.451171875,
"learning_rate": 8.932721645184707e-06,
"loss": 0.0438,
"step": 1470
},
{
"epoch": 18.528951486697967,
"grad_norm": 0.578125,
"learning_rate": 8.911308953958875e-06,
"loss": 0.0477,
"step": 1480
},
{
"epoch": 18.65414710485133,
"grad_norm": 0.6875,
"learning_rate": 8.889709797723002e-06,
"loss": 0.0478,
"step": 1490
},
{
"epoch": 18.779342723004696,
"grad_norm": 0.40234375,
"learning_rate": 8.867925206180166e-06,
"loss": 0.0505,
"step": 1500
},
{
"epoch": 18.90453834115806,
"grad_norm": 0.43359375,
"learning_rate": 8.845956217873763e-06,
"loss": 0.0463,
"step": 1510
},
{
"epoch": 19.029733959311425,
"grad_norm": 0.37109375,
"learning_rate": 8.823803880137993e-06,
"loss": 0.0517,
"step": 1520
},
{
"epoch": 19.154929577464788,
"grad_norm": 0.455078125,
"learning_rate": 8.801469249047923e-06,
"loss": 0.0342,
"step": 1530
},
{
"epoch": 19.280125195618155,
"grad_norm": 0.302734375,
"learning_rate": 8.77895338936915e-06,
"loss": 0.0287,
"step": 1540
},
{
"epoch": 19.405320813771517,
"grad_norm": 0.62890625,
"learning_rate": 8.756257374507036e-06,
"loss": 0.0333,
"step": 1550
},
{
"epoch": 19.530516431924884,
"grad_norm": 0.41796875,
"learning_rate": 8.733382286455536e-06,
"loss": 0.0313,
"step": 1560
},
{
"epoch": 19.655712050078247,
"grad_norm": 0.515625,
"learning_rate": 8.710329215745612e-06,
"loss": 0.0274,
"step": 1570
},
{
"epoch": 19.780907668231613,
"grad_norm": 0.4296875,
"learning_rate": 8.687099261393249e-06,
"loss": 0.0351,
"step": 1580
},
{
"epoch": 19.906103286384976,
"grad_norm": 0.380859375,
"learning_rate": 8.663693530847056e-06,
"loss": 0.0331,
"step": 1590
},
{
"epoch": 20.031298904538342,
"grad_norm": 1.96875,
"learning_rate": 8.640113139935484e-06,
"loss": 0.0275,
"step": 1600
},
{
"epoch": 20.156494522691705,
"grad_norm": 13.75,
"learning_rate": 8.616359212813607e-06,
"loss": 0.0466,
"step": 1610
},
{
"epoch": 20.281690140845072,
"grad_norm": 16.375,
"learning_rate": 8.592432881909548e-06,
"loss": 0.0779,
"step": 1620
},
{
"epoch": 20.406885758998435,
"grad_norm": 16.75,
"learning_rate": 8.568335287870488e-06,
"loss": 0.116,
"step": 1630
},
{
"epoch": 20.5320813771518,
"grad_norm": 11.25,
"learning_rate": 8.544067579508292e-06,
"loss": 0.1198,
"step": 1640
},
{
"epoch": 20.657276995305164,
"grad_norm": 38.5,
"learning_rate": 8.519630913744726e-06,
"loss": 0.1259,
"step": 1650
},
{
"epoch": 20.78247261345853,
"grad_norm": 26.875,
"learning_rate": 8.495026455556318e-06,
"loss": 0.1304,
"step": 1660
},
{
"epoch": 20.907668231611893,
"grad_norm": 30.375,
"learning_rate": 8.470255377918821e-06,
"loss": 0.1338,
"step": 1670
},
{
"epoch": 21.03286384976526,
"grad_norm": 31.75,
"learning_rate": 8.445318861751278e-06,
"loss": 0.1232,
"step": 1680
},
{
"epoch": 21.158059467918623,
"grad_norm": 28.25,
"learning_rate": 8.420218095859735e-06,
"loss": 0.154,
"step": 1690
},
{
"epoch": 21.28325508607199,
"grad_norm": 9.3125,
"learning_rate": 8.394954276880568e-06,
"loss": 0.1363,
"step": 1700
},
{
"epoch": 21.408450704225352,
"grad_norm": 8.25,
"learning_rate": 8.36952860922343e-06,
"loss": 0.13,
"step": 1710
},
{
"epoch": 21.53364632237872,
"grad_norm": 11.9375,
"learning_rate": 8.343942305013833e-06,
"loss": 0.1398,
"step": 1720
},
{
"epoch": 21.65884194053208,
"grad_norm": 8.6875,
"learning_rate": 8.318196584035367e-06,
"loss": 0.1428,
"step": 1730
},
{
"epoch": 21.784037558685448,
"grad_norm": 8.5,
"learning_rate": 8.292292673671542e-06,
"loss": 0.1451,
"step": 1740
},
{
"epoch": 21.90923317683881,
"grad_norm": 9.3125,
"learning_rate": 8.266231808847284e-06,
"loss": 0.1157,
"step": 1750
},
{
"epoch": 22.034428794992174,
"grad_norm": 5.78125,
"learning_rate": 8.24001523197005e-06,
"loss": 0.1287,
"step": 1760
},
{
"epoch": 22.15962441314554,
"grad_norm": 5.84375,
"learning_rate": 8.213644192870609e-06,
"loss": 0.1034,
"step": 1770
},
{
"epoch": 22.284820031298903,
"grad_norm": 7.6875,
"learning_rate": 8.18711994874345e-06,
"loss": 0.1126,
"step": 1780
},
{
"epoch": 22.41001564945227,
"grad_norm": 9.375,
"learning_rate": 8.160443764086855e-06,
"loss": 0.1295,
"step": 1790
},
{
"epoch": 22.535211267605632,
"grad_norm": 10.0625,
"learning_rate": 8.13361691064261e-06,
"loss": 0.1149,
"step": 1800
},
{
"epoch": 22.660406885759,
"grad_norm": 7.875,
"learning_rate": 8.10664066733538e-06,
"loss": 0.1125,
"step": 1810
},
{
"epoch": 22.78560250391236,
"grad_norm": 8.3125,
"learning_rate": 8.079516320211746e-06,
"loss": 0.1538,
"step": 1820
},
{
"epoch": 22.910798122065728,
"grad_norm": 6.6875,
"learning_rate": 8.052245162378871e-06,
"loss": 0.1213,
"step": 1830
},
{
"epoch": 23.03599374021909,
"grad_norm": 5.125,
"learning_rate": 8.024828493942882e-06,
"loss": 0.1065,
"step": 1840
},
{
"epoch": 23.161189358372457,
"grad_norm": 6.65625,
"learning_rate": 7.997267621946871e-06,
"loss": 0.0972,
"step": 1850
},
{
"epoch": 23.28638497652582,
"grad_norm": 7.71875,
"learning_rate": 7.96956386030859e-06,
"loss": 0.0994,
"step": 1860
},
{
"epoch": 23.411580594679187,
"grad_norm": 7.21875,
"learning_rate": 7.94171852975782e-06,
"loss": 0.1061,
"step": 1870
},
{
"epoch": 23.53677621283255,
"grad_norm": 5.9375,
"learning_rate": 7.913732957773385e-06,
"loss": 0.1005,
"step": 1880
},
{
"epoch": 23.661971830985916,
"grad_norm": 6.3125,
"learning_rate": 7.885608478519894e-06,
"loss": 0.0963,
"step": 1890
},
{
"epoch": 23.78716744913928,
"grad_norm": 6.4375,
"learning_rate": 7.857346432784116e-06,
"loss": 0.1074,
"step": 1900
},
{
"epoch": 23.912363067292645,
"grad_norm": 5.5625,
"learning_rate": 7.828948167911073e-06,
"loss": 0.0948,
"step": 1910
},
{
"epoch": 24.037558685446008,
"grad_norm": 5.71875,
"learning_rate": 7.800415037739802e-06,
"loss": 0.1132,
"step": 1920
},
{
"epoch": 24.162754303599375,
"grad_norm": 6.5,
"learning_rate": 7.771748402538808e-06,
"loss": 0.0783,
"step": 1930
},
{
"epoch": 24.287949921752737,
"grad_norm": 5.46875,
"learning_rate": 7.742949628941232e-06,
"loss": 0.0743,
"step": 1940
},
{
"epoch": 24.413145539906104,
"grad_norm": 5.34375,
"learning_rate": 7.714020089879683e-06,
"loss": 0.092,
"step": 1950
},
{
"epoch": 24.538341158059467,
"grad_norm": 5.0,
"learning_rate": 7.684961164520792e-06,
"loss": 0.0822,
"step": 1960
},
{
"epoch": 24.663536776212833,
"grad_norm": 5.3125,
"learning_rate": 7.655774238199459e-06,
"loss": 0.0989,
"step": 1970
},
{
"epoch": 24.788732394366196,
"grad_norm": 7.59375,
"learning_rate": 7.6264607023528135e-06,
"loss": 0.1003,
"step": 1980
},
{
"epoch": 24.913928012519563,
"grad_norm": 5.25,
"learning_rate": 7.597021954453887e-06,
"loss": 0.089,
"step": 1990
},
{
"epoch": 25.039123630672925,
"grad_norm": 4.375,
"learning_rate": 7.567459397944972e-06,
"loss": 0.0784,
"step": 2000
},
{
"epoch": 25.039123630672925,
"eval_loss": 2.359957456588745,
"eval_runtime": 3.1779,
"eval_samples_per_second": 22.657,
"eval_steps_per_second": 22.657,
"step": 2000
},
{
"epoch": 25.164319248826292,
"grad_norm": 5.46875,
"learning_rate": 7.537774442170731e-06,
"loss": 0.0569,
"step": 2010
},
{
"epoch": 25.289514866979655,
"grad_norm": 5.4375,
"learning_rate": 7.507968502311005e-06,
"loss": 0.0682,
"step": 2020
},
{
"epoch": 25.41471048513302,
"grad_norm": 5.28125,
"learning_rate": 7.478042999313342e-06,
"loss": 0.0679,
"step": 2030
},
{
"epoch": 25.539906103286384,
"grad_norm": 3.234375,
"learning_rate": 7.447999359825263e-06,
"loss": 0.0564,
"step": 2040
},
{
"epoch": 25.66510172143975,
"grad_norm": 4.6875,
"learning_rate": 7.417839016126242e-06,
"loss": 0.0661,
"step": 2050
},
{
"epoch": 25.790297339593113,
"grad_norm": 5.78125,
"learning_rate": 7.387563406059433e-06,
"loss": 0.0728,
"step": 2060
},
{
"epoch": 25.91549295774648,
"grad_norm": 4.4375,
"learning_rate": 7.357173972963112e-06,
"loss": 0.0758,
"step": 2070
},
{
"epoch": 26.040688575899843,
"grad_norm": 2.90625,
"learning_rate": 7.32667216560188e-06,
"loss": 0.0679,
"step": 2080
},
{
"epoch": 26.16588419405321,
"grad_norm": 3.546875,
"learning_rate": 7.296059438097589e-06,
"loss": 0.0469,
"step": 2090
},
{
"epoch": 26.291079812206572,
"grad_norm": 2.984375,
"learning_rate": 7.265337249860015e-06,
"loss": 0.0479,
"step": 2100
},
{
"epoch": 26.41627543035994,
"grad_norm": 2.859375,
"learning_rate": 7.234507065517297e-06,
"loss": 0.0499,
"step": 2110
},
{
"epoch": 26.5414710485133,
"grad_norm": 5.3125,
"learning_rate": 7.2035703548461e-06,
"loss": 0.0529,
"step": 2120
},
{
"epoch": 26.666666666666668,
"grad_norm": 3.078125,
"learning_rate": 7.17252859270155e-06,
"loss": 0.0557,
"step": 2130
},
{
"epoch": 26.79186228482003,
"grad_norm": 2.625,
"learning_rate": 7.141383258946926e-06,
"loss": 0.0492,
"step": 2140
},
{
"epoch": 26.917057902973397,
"grad_norm": 3.921875,
"learning_rate": 7.110135838383105e-06,
"loss": 0.0541,
"step": 2150
},
{
"epoch": 27.04225352112676,
"grad_norm": 2.640625,
"learning_rate": 7.078787820677784e-06,
"loss": 0.0528,
"step": 2160
},
{
"epoch": 27.167449139280127,
"grad_norm": 2.765625,
"learning_rate": 7.047340700294454e-06,
"loss": 0.0453,
"step": 2170
},
{
"epoch": 27.29264475743349,
"grad_norm": 3.5,
"learning_rate": 7.015795976421156e-06,
"loss": 0.036,
"step": 2180
},
{
"epoch": 27.417840375586856,
"grad_norm": 4.96875,
"learning_rate": 6.984155152899016e-06,
"loss": 0.0427,
"step": 2190
},
{
"epoch": 27.54303599374022,
"grad_norm": 4.21875,
"learning_rate": 6.952419738150546e-06,
"loss": 0.0424,
"step": 2200
},
{
"epoch": 27.668231611893585,
"grad_norm": 3.6875,
"learning_rate": 6.9205912451077305e-06,
"loss": 0.0336,
"step": 2210
},
{
"epoch": 27.793427230046948,
"grad_norm": 2.609375,
"learning_rate": 6.88867119113991e-06,
"loss": 0.0462,
"step": 2220
},
{
"epoch": 27.918622848200314,
"grad_norm": 2.296875,
"learning_rate": 6.856661097981433e-06,
"loss": 0.0492,
"step": 2230
},
{
"epoch": 28.043818466353677,
"grad_norm": 1.3359375,
"learning_rate": 6.824562491659112e-06,
"loss": 0.0363,
"step": 2240
},
{
"epoch": 28.169014084507044,
"grad_norm": 1.9609375,
"learning_rate": 6.792376902419478e-06,
"loss": 0.03,
"step": 2250
},
{
"epoch": 28.294209702660407,
"grad_norm": 2.15625,
"learning_rate": 6.7601058646558195e-06,
"loss": 0.0331,
"step": 2260
},
{
"epoch": 28.419405320813773,
"grad_norm": 2.046875,
"learning_rate": 6.7277509168350445e-06,
"loss": 0.0364,
"step": 2270
},
{
"epoch": 28.544600938967136,
"grad_norm": 2.890625,
"learning_rate": 6.695313601424326e-06,
"loss": 0.0296,
"step": 2280
},
{
"epoch": 28.669796557120502,
"grad_norm": 2.0625,
"learning_rate": 6.662795464817573e-06,
"loss": 0.0323,
"step": 2290
},
{
"epoch": 28.794992175273865,
"grad_norm": 2.578125,
"learning_rate": 6.63019805726171e-06,
"loss": 0.0312,
"step": 2300
},
{
"epoch": 28.920187793427232,
"grad_norm": 1.109375,
"learning_rate": 6.597522932782765e-06,
"loss": 0.0318,
"step": 2310
},
{
"epoch": 29.045383411580595,
"grad_norm": 0.859375,
"learning_rate": 6.564771649111792e-06,
"loss": 0.0269,
"step": 2320
},
{
"epoch": 29.170579029733958,
"grad_norm": 1.578125,
"learning_rate": 6.531945767610604e-06,
"loss": 0.0205,
"step": 2330
},
{
"epoch": 29.295774647887324,
"grad_norm": 1.640625,
"learning_rate": 6.499046853197338e-06,
"loss": 0.024,
"step": 2340
},
{
"epoch": 29.420970266040687,
"grad_norm": 1.4765625,
"learning_rate": 6.46607647427185e-06,
"loss": 0.024,
"step": 2350
},
{
"epoch": 29.546165884194053,
"grad_norm": 0.65234375,
"learning_rate": 6.4330362026409506e-06,
"loss": 0.0262,
"step": 2360
},
{
"epoch": 29.671361502347416,
"grad_norm": 1.4140625,
"learning_rate": 6.3999276134434595e-06,
"loss": 0.0252,
"step": 2370
},
{
"epoch": 29.796557120500783,
"grad_norm": 1.8203125,
"learning_rate": 6.366752285075125e-06,
"loss": 0.0224,
"step": 2380
},
{
"epoch": 29.921752738654146,
"grad_norm": 0.515625,
"learning_rate": 6.33351179911337e-06,
"loss": 0.0204,
"step": 2390
},
{
"epoch": 30.046948356807512,
"grad_norm": 0.70703125,
"learning_rate": 6.300207740241895e-06,
"loss": 0.0195,
"step": 2400
},
{
"epoch": 30.172143974960875,
"grad_norm": 0.796875,
"learning_rate": 6.266841696175132e-06,
"loss": 0.0164,
"step": 2410
},
{
"epoch": 30.29733959311424,
"grad_norm": 1.2109375,
"learning_rate": 6.233415257582551e-06,
"loss": 0.0167,
"step": 2420
},
{
"epoch": 30.422535211267604,
"grad_norm": 0.87890625,
"learning_rate": 6.19993001801283e-06,
"loss": 0.0149,
"step": 2430
},
{
"epoch": 30.54773082942097,
"grad_norm": 1.1015625,
"learning_rate": 6.166387573817881e-06,
"loss": 0.0178,
"step": 2440
},
{
"epoch": 30.672926447574334,
"grad_norm": 0.66015625,
"learning_rate": 6.132789524076751e-06,
"loss": 0.0181,
"step": 2450
},
{
"epoch": 30.7981220657277,
"grad_norm": 0.6484375,
"learning_rate": 6.0991374705193866e-06,
"loss": 0.0147,
"step": 2460
},
{
"epoch": 30.923317683881063,
"grad_norm": 0.96875,
"learning_rate": 6.065433017450276e-06,
"loss": 0.0182,
"step": 2470
},
{
"epoch": 31.04851330203443,
"grad_norm": 0.68359375,
"learning_rate": 6.031677771671962e-06,
"loss": 0.0149,
"step": 2480
},
{
"epoch": 31.173708920187792,
"grad_norm": 0.23046875,
"learning_rate": 5.997873342408446e-06,
"loss": 0.0135,
"step": 2490
},
{
"epoch": 31.29890453834116,
"grad_norm": 0.25390625,
"learning_rate": 5.964021341228468e-06,
"loss": 0.0138,
"step": 2500
},
{
"epoch": 31.42410015649452,
"grad_norm": 1.8203125,
"learning_rate": 5.930123381968677e-06,
"loss": 0.0138,
"step": 2510
},
{
"epoch": 31.549295774647888,
"grad_norm": 0.453125,
"learning_rate": 5.8961810806567e-06,
"loss": 0.0158,
"step": 2520
},
{
"epoch": 31.67449139280125,
"grad_norm": 0.58984375,
"learning_rate": 5.862196055434089e-06,
"loss": 0.0116,
"step": 2530
},
{
"epoch": 31.799687010954617,
"grad_norm": 0.7265625,
"learning_rate": 5.828169926479191e-06,
"loss": 0.0124,
"step": 2540
},
{
"epoch": 31.92488262910798,
"grad_norm": 0.498046875,
"learning_rate": 5.794104315929904e-06,
"loss": 0.0144,
"step": 2550
},
{
"epoch": 32.05007824726135,
"grad_norm": 0.25,
"learning_rate": 5.760000847806337e-06,
"loss": 0.0127,
"step": 2560
},
{
"epoch": 32.17527386541471,
"grad_norm": 0.12060546875,
"learning_rate": 5.725861147933403e-06,
"loss": 0.0116,
"step": 2570
},
{
"epoch": 32.30046948356807,
"grad_norm": 0.279296875,
"learning_rate": 5.6916868438632976e-06,
"loss": 0.0133,
"step": 2580
},
{
"epoch": 32.42566510172144,
"grad_norm": 0.435546875,
"learning_rate": 5.657479564797914e-06,
"loss": 0.0091,
"step": 2590
},
{
"epoch": 32.550860719874805,
"grad_norm": 0.1123046875,
"learning_rate": 5.623240941511173e-06,
"loss": 0.01,
"step": 2600
},
{
"epoch": 32.67605633802817,
"grad_norm": 0.1806640625,
"learning_rate": 5.588972606271276e-06,
"loss": 0.0091,
"step": 2610
},
{
"epoch": 32.80125195618153,
"grad_norm": 0.400390625,
"learning_rate": 5.554676192762891e-06,
"loss": 0.0111,
"step": 2620
},
{
"epoch": 32.9264475743349,
"grad_norm": 0.5859375,
"learning_rate": 5.520353336009274e-06,
"loss": 0.0102,
"step": 2630
},
{
"epoch": 33.051643192488264,
"grad_norm": 0.0849609375,
"learning_rate": 5.48600567229431e-06,
"loss": 0.0084,
"step": 2640
},
{
"epoch": 33.17683881064163,
"grad_norm": 0.18359375,
"learning_rate": 5.451634839084523e-06,
"loss": 0.009,
"step": 2650
},
{
"epoch": 33.30203442879499,
"grad_norm": 0.349609375,
"learning_rate": 5.417242474950999e-06,
"loss": 0.0083,
"step": 2660
},
{
"epoch": 33.42723004694836,
"grad_norm": 0.34765625,
"learning_rate": 5.382830219491271e-06,
"loss": 0.0091,
"step": 2670
},
{
"epoch": 33.55242566510172,
"grad_norm": 0.447265625,
"learning_rate": 5.348399713251163e-06,
"loss": 0.0115,
"step": 2680
},
{
"epoch": 33.677621283255085,
"grad_norm": 0.275390625,
"learning_rate": 5.3139525976465675e-06,
"loss": 0.0089,
"step": 2690
},
{
"epoch": 33.80281690140845,
"grad_norm": 0.263671875,
"learning_rate": 5.279490514885207e-06,
"loss": 0.0075,
"step": 2700
},
{
"epoch": 33.92801251956182,
"grad_norm": 0.236328125,
"learning_rate": 5.245015107888335e-06,
"loss": 0.0095,
"step": 2710
},
{
"epoch": 34.05320813771518,
"grad_norm": 0.23828125,
"learning_rate": 5.210528020212412e-06,
"loss": 0.0081,
"step": 2720
},
{
"epoch": 34.178403755868544,
"grad_norm": 0.310546875,
"learning_rate": 5.176030895970761e-06,
"loss": 0.007,
"step": 2730
},
{
"epoch": 34.30359937402191,
"grad_norm": 0.087890625,
"learning_rate": 5.141525379755178e-06,
"loss": 0.0053,
"step": 2740
},
{
"epoch": 34.42879499217528,
"grad_norm": 0.09326171875,
"learning_rate": 5.10701311655753e-06,
"loss": 0.0065,
"step": 2750
},
{
"epoch": 34.55399061032864,
"grad_norm": 0.06689453125,
"learning_rate": 5.072495751691338e-06,
"loss": 0.0072,
"step": 2760
},
{
"epoch": 34.679186228482,
"grad_norm": 0.10546875,
"learning_rate": 5.037974930713338e-06,
"loss": 0.0058,
"step": 2770
},
{
"epoch": 34.804381846635366,
"grad_norm": 0.0859375,
"learning_rate": 5.003452299345024e-06,
"loss": 0.0055,
"step": 2780
},
{
"epoch": 34.929577464788736,
"grad_norm": 0.24609375,
"learning_rate": 4.968929503394206e-06,
"loss": 0.0061,
"step": 2790
},
{
"epoch": 35.0547730829421,
"grad_norm": 0.056396484375,
"learning_rate": 4.934408188676531e-06,
"loss": 0.0062,
"step": 2800
},
{
"epoch": 35.17996870109546,
"grad_norm": 0.1435546875,
"learning_rate": 4.8998900009370366e-06,
"loss": 0.0049,
"step": 2810
},
{
"epoch": 35.305164319248824,
"grad_norm": 0.06298828125,
"learning_rate": 4.865376585771687e-06,
"loss": 0.0043,
"step": 2820
},
{
"epoch": 35.430359937402194,
"grad_norm": 0.056640625,
"learning_rate": 4.830869588548918e-06,
"loss": 0.006,
"step": 2830
},
{
"epoch": 35.55555555555556,
"grad_norm": 0.048828125,
"learning_rate": 4.796370654331205e-06,
"loss": 0.0055,
"step": 2840
},
{
"epoch": 35.68075117370892,
"grad_norm": 0.171875,
"learning_rate": 4.7618814277966325e-06,
"loss": 0.0052,
"step": 2850
},
{
"epoch": 35.80594679186228,
"grad_norm": 0.045166015625,
"learning_rate": 4.727403553160484e-06,
"loss": 0.0057,
"step": 2860
},
{
"epoch": 35.93114241001565,
"grad_norm": 0.048583984375,
"learning_rate": 4.692938674096867e-06,
"loss": 0.005,
"step": 2870
},
{
"epoch": 36.056338028169016,
"grad_norm": 0.03857421875,
"learning_rate": 4.658488433660341e-06,
"loss": 0.0047,
"step": 2880
},
{
"epoch": 36.18153364632238,
"grad_norm": 0.357421875,
"learning_rate": 4.624054474207597e-06,
"loss": 0.0048,
"step": 2890
},
{
"epoch": 36.30672926447574,
"grad_norm": 0.185546875,
"learning_rate": 4.589638437319157e-06,
"loss": 0.0054,
"step": 2900
},
{
"epoch": 36.431924882629104,
"grad_norm": 0.11962890625,
"learning_rate": 4.555241963721118e-06,
"loss": 0.0041,
"step": 2910
},
{
"epoch": 36.557120500782474,
"grad_norm": 0.2158203125,
"learning_rate": 4.5208666932069255e-06,
"loss": 0.0043,
"step": 2920
},
{
"epoch": 36.68231611893584,
"grad_norm": 0.154296875,
"learning_rate": 4.486514264559206e-06,
"loss": 0.0045,
"step": 2930
},
{
"epoch": 36.8075117370892,
"grad_norm": 0.0400390625,
"learning_rate": 4.452186315471641e-06,
"loss": 0.0039,
"step": 2940
},
{
"epoch": 36.93270735524256,
"grad_norm": 0.396484375,
"learning_rate": 4.417884482470887e-06,
"loss": 0.0042,
"step": 2950
},
{
"epoch": 37.05790297339593,
"grad_norm": 0.189453125,
"learning_rate": 4.383610400838561e-06,
"loss": 0.0039,
"step": 2960
},
{
"epoch": 37.183098591549296,
"grad_norm": 0.034423828125,
"learning_rate": 4.349365704533285e-06,
"loss": 0.0038,
"step": 2970
},
{
"epoch": 37.30829420970266,
"grad_norm": 0.03173828125,
"learning_rate": 4.31515202611278e-06,
"loss": 0.0037,
"step": 2980
},
{
"epoch": 37.43348982785602,
"grad_norm": 0.033447265625,
"learning_rate": 4.2809709966560435e-06,
"loss": 0.0031,
"step": 2990
},
{
"epoch": 37.55868544600939,
"grad_norm": 0.2275390625,
"learning_rate": 4.246824245685591e-06,
"loss": 0.0037,
"step": 3000
},
{
"epoch": 37.55868544600939,
"eval_loss": 2.755915641784668,
"eval_runtime": 3.2484,
"eval_samples_per_second": 22.165,
"eval_steps_per_second": 22.165,
"step": 3000
},
{
"epoch": 37.683881064162755,
"grad_norm": 0.031005859375,
"learning_rate": 4.2127134010897695e-06,
"loss": 0.003,
"step": 3010
},
{
"epoch": 37.80907668231612,
"grad_norm": 0.043701171875,
"learning_rate": 4.178640089045147e-06,
"loss": 0.0031,
"step": 3020
},
{
"epoch": 37.93427230046948,
"grad_norm": 0.1708984375,
"learning_rate": 4.144605933938993e-06,
"loss": 0.0032,
"step": 3030
},
{
"epoch": 38.05946791862285,
"grad_norm": 0.02783203125,
"learning_rate": 4.1106125582918385e-06,
"loss": 0.0032,
"step": 3040
},
{
"epoch": 38.18466353677621,
"grad_norm": 0.03125,
"learning_rate": 4.07666158268012e-06,
"loss": 0.003,
"step": 3050
},
{
"epoch": 38.309859154929576,
"grad_norm": 0.037109375,
"learning_rate": 4.042754625658929e-06,
"loss": 0.0028,
"step": 3060
},
{
"epoch": 38.43505477308294,
"grad_norm": 0.0303955078125,
"learning_rate": 4.008893303684837e-06,
"loss": 0.0035,
"step": 3070
},
{
"epoch": 38.56025039123631,
"grad_norm": 0.05908203125,
"learning_rate": 3.975079231038848e-06,
"loss": 0.0025,
"step": 3080
},
{
"epoch": 38.68544600938967,
"grad_norm": 0.050048828125,
"learning_rate": 3.941314019749438e-06,
"loss": 0.0036,
"step": 3090
},
{
"epoch": 38.810641627543035,
"grad_norm": 0.033203125,
"learning_rate": 3.9075992795156916e-06,
"loss": 0.003,
"step": 3100
},
{
"epoch": 38.9358372456964,
"grad_norm": 0.04931640625,
"learning_rate": 3.873936617630578e-06,
"loss": 0.0028,
"step": 3110
},
{
"epoch": 39.06103286384977,
"grad_norm": 0.03369140625,
"learning_rate": 3.840327638904321e-06,
"loss": 0.0026,
"step": 3120
},
{
"epoch": 39.18622848200313,
"grad_norm": 0.328125,
"learning_rate": 3.8067739455878844e-06,
"loss": 0.0026,
"step": 3130
},
{
"epoch": 39.31142410015649,
"grad_norm": 0.0341796875,
"learning_rate": 3.7732771372965987e-06,
"loss": 0.0024,
"step": 3140
},
{
"epoch": 39.436619718309856,
"grad_norm": 0.0458984375,
"learning_rate": 3.7398388109338984e-06,
"loss": 0.0026,
"step": 3150
},
{
"epoch": 39.561815336463226,
"grad_norm": 0.02783203125,
"learning_rate": 3.7064605606151866e-06,
"loss": 0.0026,
"step": 3160
},
{
"epoch": 39.68701095461659,
"grad_norm": 0.0703125,
"learning_rate": 3.6731439775918467e-06,
"loss": 0.0024,
"step": 3170
},
{
"epoch": 39.81220657276995,
"grad_norm": 0.052978515625,
"learning_rate": 3.639890650175379e-06,
"loss": 0.0027,
"step": 3180
},
{
"epoch": 39.937402190923315,
"grad_norm": 0.04296875,
"learning_rate": 3.6067021636616793e-06,
"loss": 0.002,
"step": 3190
},
{
"epoch": 40.062597809076685,
"grad_norm": 0.1357421875,
"learning_rate": 3.5735801002554615e-06,
"loss": 0.002,
"step": 3200
},
{
"epoch": 40.18779342723005,
"grad_norm": 0.2412109375,
"learning_rate": 3.540526038994834e-06,
"loss": 0.0018,
"step": 3210
},
{
"epoch": 40.31298904538341,
"grad_norm": 1.7890625,
"learning_rate": 3.5075415556760157e-06,
"loss": 0.0025,
"step": 3220
},
{
"epoch": 40.438184663536774,
"grad_norm": 0.69921875,
"learning_rate": 3.4746282227782164e-06,
"loss": 0.0037,
"step": 3230
},
{
"epoch": 40.563380281690144,
"grad_norm": 3.21875,
"learning_rate": 3.4417876093886705e-06,
"loss": 0.0042,
"step": 3240
},
{
"epoch": 40.68857589984351,
"grad_norm": 1.9375,
"learning_rate": 3.409021281127835e-06,
"loss": 0.0085,
"step": 3250
},
{
"epoch": 40.81377151799687,
"grad_norm": 5.21875,
"learning_rate": 3.3763308000747453e-06,
"loss": 0.0072,
"step": 3260
},
{
"epoch": 40.93896713615023,
"grad_norm": 9.6875,
"learning_rate": 3.3437177246925547e-06,
"loss": 0.0108,
"step": 3270
},
{
"epoch": 41.0641627543036,
"grad_norm": 11.125,
"learning_rate": 3.31118360975423e-06,
"loss": 0.0087,
"step": 3280
},
{
"epoch": 41.189358372456965,
"grad_norm": 15.875,
"learning_rate": 3.278730006268432e-06,
"loss": 0.0112,
"step": 3290
},
{
"epoch": 41.31455399061033,
"grad_norm": 0.46484375,
"learning_rate": 3.246358461405579e-06,
"loss": 0.0105,
"step": 3300
},
{
"epoch": 41.43974960876369,
"grad_norm": 1.9609375,
"learning_rate": 3.2140705184240783e-06,
"loss": 0.0098,
"step": 3310
},
{
"epoch": 41.56494522691706,
"grad_norm": 3.125,
"learning_rate": 3.181867716596765e-06,
"loss": 0.0071,
"step": 3320
},
{
"epoch": 41.690140845070424,
"grad_norm": 3.71875,
"learning_rate": 3.1497515911375113e-06,
"loss": 0.0124,
"step": 3330
},
{
"epoch": 41.81533646322379,
"grad_norm": 0.84375,
"learning_rate": 3.11772367312804e-06,
"loss": 0.0075,
"step": 3340
},
{
"epoch": 41.94053208137715,
"grad_norm": 10.6875,
"learning_rate": 3.085785489444936e-06,
"loss": 0.0093,
"step": 3350
},
{
"epoch": 42.06572769953052,
"grad_norm": 0.90625,
"learning_rate": 3.05393856268685e-06,
"loss": 0.0073,
"step": 3360
},
{
"epoch": 42.19092331768388,
"grad_norm": 2.8125,
"learning_rate": 3.0221844111019166e-06,
"loss": 0.0039,
"step": 3370
},
{
"epoch": 42.316118935837245,
"grad_norm": 1.1015625,
"learning_rate": 2.99052454851537e-06,
"loss": 0.0065,
"step": 3380
},
{
"epoch": 42.44131455399061,
"grad_norm": 1.140625,
"learning_rate": 2.9589604842573762e-06,
"loss": 0.0064,
"step": 3390
},
{
"epoch": 42.56651017214398,
"grad_norm": 2.140625,
"learning_rate": 2.927493723091078e-06,
"loss": 0.0058,
"step": 3400
},
{
"epoch": 42.69170579029734,
"grad_norm": 0.9140625,
"learning_rate": 2.8961257651408627e-06,
"loss": 0.0094,
"step": 3410
},
{
"epoch": 42.816901408450704,
"grad_norm": 5.5,
"learning_rate": 2.8648581058208387e-06,
"loss": 0.0053,
"step": 3420
},
{
"epoch": 42.94209702660407,
"grad_norm": 2.84375,
"learning_rate": 2.8336922357635464e-06,
"loss": 0.0084,
"step": 3430
},
{
"epoch": 43.06729264475744,
"grad_norm": 1.0625,
"learning_rate": 2.802629640748898e-06,
"loss": 0.0044,
"step": 3440
},
{
"epoch": 43.1924882629108,
"grad_norm": 2.078125,
"learning_rate": 2.7716718016333432e-06,
"loss": 0.0069,
"step": 3450
},
{
"epoch": 43.31768388106416,
"grad_norm": 3.09375,
"learning_rate": 2.7408201942792755e-06,
"loss": 0.0061,
"step": 3460
},
{
"epoch": 43.442879499217526,
"grad_norm": 2.109375,
"learning_rate": 2.7100762894846633e-06,
"loss": 0.0065,
"step": 3470
},
{
"epoch": 43.568075117370896,
"grad_norm": 0.80859375,
"learning_rate": 2.6794415529129402e-06,
"loss": 0.0052,
"step": 3480
},
{
"epoch": 43.69327073552426,
"grad_norm": 0.34375,
"learning_rate": 2.6489174450231353e-06,
"loss": 0.0056,
"step": 3490
},
{
"epoch": 43.81846635367762,
"grad_norm": 0.6328125,
"learning_rate": 2.618505421000237e-06,
"loss": 0.005,
"step": 3500
},
{
"epoch": 43.943661971830984,
"grad_norm": 0.62109375,
"learning_rate": 2.588206930685827e-06,
"loss": 0.0069,
"step": 3510
},
{
"epoch": 44.06885758998435,
"grad_norm": 1.5234375,
"learning_rate": 2.5580234185089647e-06,
"loss": 0.0043,
"step": 3520
},
{
"epoch": 44.19405320813772,
"grad_norm": 1.3125,
"learning_rate": 2.5279563234173177e-06,
"loss": 0.0054,
"step": 3530
},
{
"epoch": 44.31924882629108,
"grad_norm": 0.7265625,
"learning_rate": 2.4980070788085655e-06,
"loss": 0.0043,
"step": 3540
},
{
"epoch": 44.44444444444444,
"grad_norm": 1.25,
"learning_rate": 2.4681771124620716e-06,
"loss": 0.005,
"step": 3550
},
{
"epoch": 44.569640062597806,
"grad_norm": 2.84375,
"learning_rate": 2.4384678464708077e-06,
"loss": 0.0042,
"step": 3560
},
{
"epoch": 44.694835680751176,
"grad_norm": 2.5,
"learning_rate": 2.4088806971735584e-06,
"loss": 0.0056,
"step": 3570
},
{
"epoch": 44.82003129890454,
"grad_norm": 1.6875,
"learning_rate": 2.3794170750874094e-06,
"loss": 0.0052,
"step": 3580
},
{
"epoch": 44.9452269170579,
"grad_norm": 2.171875,
"learning_rate": 2.3500783848404906e-06,
"loss": 0.0055,
"step": 3590
},
{
"epoch": 45.070422535211264,
"grad_norm": 0.412109375,
"learning_rate": 2.320866025105016e-06,
"loss": 0.0041,
"step": 3600
},
{
"epoch": 45.195618153364634,
"grad_norm": 0.4375,
"learning_rate": 2.2917813885306196e-06,
"loss": 0.0035,
"step": 3610
},
{
"epoch": 45.320813771518,
"grad_norm": 0.1181640625,
"learning_rate": 2.262825861677938e-06,
"loss": 0.0026,
"step": 3620
},
{
"epoch": 45.44600938967136,
"grad_norm": 0.84765625,
"learning_rate": 2.234000824952525e-06,
"loss": 0.0046,
"step": 3630
},
{
"epoch": 45.57120500782472,
"grad_norm": 1.40625,
"learning_rate": 2.2053076525390434e-06,
"loss": 0.004,
"step": 3640
},
{
"epoch": 45.69640062597809,
"grad_norm": 0.1923828125,
"learning_rate": 2.1767477123357424e-06,
"loss": 0.0041,
"step": 3650
},
{
"epoch": 45.821596244131456,
"grad_norm": 2.875,
"learning_rate": 2.1483223658892545e-06,
"loss": 0.0041,
"step": 3660
},
{
"epoch": 45.94679186228482,
"grad_norm": 0.55078125,
"learning_rate": 2.120032968329687e-06,
"loss": 0.004,
"step": 3670
},
{
"epoch": 46.07198748043818,
"grad_norm": 1.1484375,
"learning_rate": 2.091880868306011e-06,
"loss": 0.003,
"step": 3680
},
{
"epoch": 46.19718309859155,
"grad_norm": 0.0869140625,
"learning_rate": 2.0638674079217687e-06,
"loss": 0.0023,
"step": 3690
},
{
"epoch": 46.322378716744915,
"grad_norm": 0.9921875,
"learning_rate": 2.0359939226711002e-06,
"loss": 0.0025,
"step": 3700
},
{
"epoch": 46.44757433489828,
"grad_norm": 0.08251953125,
"learning_rate": 2.008261741375063e-06,
"loss": 0.0026,
"step": 3710
},
{
"epoch": 46.57276995305164,
"grad_norm": 0.0810546875,
"learning_rate": 1.9806721861182907e-06,
"loss": 0.0026,
"step": 3720
},
{
"epoch": 46.69796557120501,
"grad_norm": 0.095703125,
"learning_rate": 1.95322657218596e-06,
"loss": 0.0037,
"step": 3730
},
{
"epoch": 46.82316118935837,
"grad_norm": 1.1328125,
"learning_rate": 1.9259262080010938e-06,
"loss": 0.0028,
"step": 3740
},
{
"epoch": 46.948356807511736,
"grad_norm": 0.08544921875,
"learning_rate": 1.8987723950621805e-06,
"loss": 0.0024,
"step": 3750
},
{
"epoch": 47.0735524256651,
"grad_norm": 0.06640625,
"learning_rate": 1.8717664278811198e-06,
"loss": 0.0023,
"step": 3760
},
{
"epoch": 47.19874804381847,
"grad_norm": 0.5703125,
"learning_rate": 1.844909593921525e-06,
"loss": 0.0021,
"step": 3770
},
{
"epoch": 47.32394366197183,
"grad_norm": 0.271484375,
"learning_rate": 1.8182031735373302e-06,
"loss": 0.002,
"step": 3780
},
{
"epoch": 47.449139280125195,
"grad_norm": 0.07373046875,
"learning_rate": 1.7916484399117579e-06,
"loss": 0.0038,
"step": 3790
},
{
"epoch": 47.57433489827856,
"grad_norm": 0.08056640625,
"learning_rate": 1.7652466589966271e-06,
"loss": 0.004,
"step": 3800
},
{
"epoch": 47.69953051643193,
"grad_norm": 0.439453125,
"learning_rate": 1.738999089451991e-06,
"loss": 0.0032,
"step": 3810
},
{
"epoch": 47.82472613458529,
"grad_norm": 0.0693359375,
"learning_rate": 1.7129069825861388e-06,
"loss": 0.0023,
"step": 3820
},
{
"epoch": 47.94992175273865,
"grad_norm": 0.1552734375,
"learning_rate": 1.6869715822959437e-06,
"loss": 0.0021,
"step": 3830
},
{
"epoch": 48.075117370892016,
"grad_norm": 0.07568359375,
"learning_rate": 1.6611941250075558e-06,
"loss": 0.002,
"step": 3840
},
{
"epoch": 48.200312989045386,
"grad_norm": 0.0751953125,
"learning_rate": 1.6355758396174603e-06,
"loss": 0.0023,
"step": 3850
},
{
"epoch": 48.32550860719875,
"grad_norm": 0.83203125,
"learning_rate": 1.610117947433897e-06,
"loss": 0.002,
"step": 3860
},
{
"epoch": 48.45070422535211,
"grad_norm": 0.044189453125,
"learning_rate": 1.5848216621186268e-06,
"loss": 0.0026,
"step": 3870
},
{
"epoch": 48.575899843505475,
"grad_norm": 0.04638671875,
"learning_rate": 1.55968818962908e-06,
"loss": 0.0018,
"step": 3880
},
{
"epoch": 48.701095461658845,
"grad_norm": 0.06640625,
"learning_rate": 1.5347187281608622e-06,
"loss": 0.0019,
"step": 3890
},
{
"epoch": 48.82629107981221,
"grad_norm": 0.04833984375,
"learning_rate": 1.5099144680906348e-06,
"loss": 0.0022,
"step": 3900
},
{
"epoch": 48.95148669796557,
"grad_norm": 0.045166015625,
"learning_rate": 1.4852765919193584e-06,
"loss": 0.0019,
"step": 3910
},
{
"epoch": 49.076682316118934,
"grad_norm": 0.043701171875,
"learning_rate": 1.460806274215924e-06,
"loss": 0.0019,
"step": 3920
},
{
"epoch": 49.201877934272304,
"grad_norm": 0.041748046875,
"learning_rate": 1.4365046815611622e-06,
"loss": 0.0017,
"step": 3930
},
{
"epoch": 49.32707355242567,
"grad_norm": 0.052734375,
"learning_rate": 1.4123729724922198e-06,
"loss": 0.0017,
"step": 3940
},
{
"epoch": 49.45226917057903,
"grad_norm": 0.03955078125,
"learning_rate": 1.3884122974473307e-06,
"loss": 0.0019,
"step": 3950
},
{
"epoch": 49.57746478873239,
"grad_norm": 0.048828125,
"learning_rate": 1.3646237987109772e-06,
"loss": 0.0018,
"step": 3960
},
{
"epoch": 49.70266040688576,
"grad_norm": 0.048828125,
"learning_rate": 1.3410086103594256e-06,
"loss": 0.0018,
"step": 3970
},
{
"epoch": 49.827856025039125,
"grad_norm": 0.03466796875,
"learning_rate": 1.317567858206661e-06,
"loss": 0.002,
"step": 3980
},
{
"epoch": 49.95305164319249,
"grad_norm": 0.044189453125,
"learning_rate": 1.2943026597507268e-06,
"loss": 0.0017,
"step": 3990
},
{
"epoch": 50.07824726134585,
"grad_norm": 0.0380859375,
"learning_rate": 1.2712141241204352e-06,
"loss": 0.0016,
"step": 4000
},
{
"epoch": 50.07824726134585,
"eval_loss": 2.970012903213501,
"eval_runtime": 3.2782,
"eval_samples_per_second": 21.963,
"eval_steps_per_second": 21.963,
"step": 4000
},
{
"epoch": 50.20344287949922,
"grad_norm": 0.0296630859375,
"learning_rate": 1.2483033520224996e-06,
"loss": 0.0016,
"step": 4010
},
{
"epoch": 50.328638497652584,
"grad_norm": 0.0302734375,
"learning_rate": 1.225571435689062e-06,
"loss": 0.0016,
"step": 4020
},
{
"epoch": 50.45383411580595,
"grad_norm": 0.02880859375,
"learning_rate": 1.2030194588256183e-06,
"loss": 0.0016,
"step": 4030
},
{
"epoch": 50.57902973395931,
"grad_norm": 0.0322265625,
"learning_rate": 1.1806484965593546e-06,
"loss": 0.0016,
"step": 4040
},
{
"epoch": 50.70422535211267,
"grad_norm": 0.0284423828125,
"learning_rate": 1.1584596153878923e-06,
"loss": 0.0016,
"step": 4050
},
{
"epoch": 50.82942097026604,
"grad_norm": 0.030029296875,
"learning_rate": 1.1364538731284514e-06,
"loss": 0.0021,
"step": 4060
},
{
"epoch": 50.954616588419405,
"grad_norm": 0.0262451171875,
"learning_rate": 1.1146323188674102e-06,
"loss": 0.0015,
"step": 4070
},
{
"epoch": 51.07981220657277,
"grad_norm": 0.024658203125,
"learning_rate": 1.0929959929102968e-06,
"loss": 0.0016,
"step": 4080
},
{
"epoch": 51.20500782472613,
"grad_norm": 0.0286865234375,
"learning_rate": 1.0715459267321998e-06,
"loss": 0.0014,
"step": 4090
},
{
"epoch": 51.3302034428795,
"grad_norm": 0.0255126953125,
"learning_rate": 1.0502831429285842e-06,
"loss": 0.0014,
"step": 4100
},
{
"epoch": 51.455399061032864,
"grad_norm": 0.024658203125,
"learning_rate": 1.0292086551665464e-06,
"loss": 0.0015,
"step": 4110
},
{
"epoch": 51.58059467918623,
"grad_norm": 0.0238037109375,
"learning_rate": 1.0083234681364934e-06,
"loss": 0.0015,
"step": 4120
},
{
"epoch": 51.70579029733959,
"grad_norm": 0.0233154296875,
"learning_rate": 9.87628577504236e-07,
"loss": 0.0015,
"step": 4130
},
{
"epoch": 51.83098591549296,
"grad_norm": 0.0250244140625,
"learning_rate": 9.671249698635294e-07,
"loss": 0.0014,
"step": 4140
},
{
"epoch": 51.95618153364632,
"grad_norm": 0.0260009765625,
"learning_rate": 9.468136226890384e-07,
"loss": 0.0014,
"step": 4150
},
{
"epoch": 52.081377151799686,
"grad_norm": 0.0172119140625,
"learning_rate": 9.266955042897357e-07,
"loss": 0.0014,
"step": 4160
},
{
"epoch": 52.20657276995305,
"grad_norm": 0.017578125,
"learning_rate": 9.067715737627391e-07,
"loss": 0.0014,
"step": 4170
},
{
"epoch": 52.33176838810642,
"grad_norm": 0.0181884765625,
"learning_rate": 8.870427809475907e-07,
"loss": 0.0014,
"step": 4180
},
{
"epoch": 52.45696400625978,
"grad_norm": 0.0167236328125,
"learning_rate": 8.675100663809766e-07,
"loss": 0.0013,
"step": 4190
},
{
"epoch": 52.582159624413144,
"grad_norm": 0.019287109375,
"learning_rate": 8.481743612518795e-07,
"loss": 0.0014,
"step": 4200
},
{
"epoch": 52.70735524256651,
"grad_norm": 0.02001953125,
"learning_rate": 8.290365873571954e-07,
"loss": 0.0014,
"step": 4210
},
{
"epoch": 52.83255086071988,
"grad_norm": 0.022705078125,
"learning_rate": 8.100976570577856e-07,
"loss": 0.0013,
"step": 4220
},
{
"epoch": 52.95774647887324,
"grad_norm": 0.0206298828125,
"learning_rate": 7.913584732349788e-07,
"loss": 0.0013,
"step": 4230
},
{
"epoch": 53.0829420970266,
"grad_norm": 0.0181884765625,
"learning_rate": 7.728199292475297e-07,
"loss": 0.0014,
"step": 4240
},
{
"epoch": 53.208137715179966,
"grad_norm": 0.017333984375,
"learning_rate": 7.544829088890326e-07,
"loss": 0.0013,
"step": 4250
},
{
"epoch": 53.333333333333336,
"grad_norm": 0.09130859375,
"learning_rate": 7.363482863457821e-07,
"loss": 0.0013,
"step": 4260
},
{
"epoch": 53.4585289514867,
"grad_norm": 0.017333984375,
"learning_rate": 7.184169261551005e-07,
"loss": 0.0013,
"step": 4270
},
{
"epoch": 53.58372456964006,
"grad_norm": 0.02978515625,
"learning_rate": 7.006896831641257e-07,
"loss": 0.0013,
"step": 4280
},
{
"epoch": 53.708920187793424,
"grad_norm": 0.01611328125,
"learning_rate": 6.831674024890533e-07,
"loss": 0.0012,
"step": 4290
},
{
"epoch": 53.834115805946794,
"grad_norm": 0.015869140625,
"learning_rate": 6.658509194748463e-07,
"loss": 0.0013,
"step": 4300
},
{
"epoch": 53.95931142410016,
"grad_norm": 0.018310546875,
"learning_rate": 6.487410596554178e-07,
"loss": 0.0013,
"step": 4310
},
{
"epoch": 54.08450704225352,
"grad_norm": 0.0184326171875,
"learning_rate": 6.3183863871427e-07,
"loss": 0.0013,
"step": 4320
},
{
"epoch": 54.20970266040688,
"grad_norm": 0.01470947265625,
"learning_rate": 6.15144462445606e-07,
"loss": 0.0013,
"step": 4330
},
{
"epoch": 54.33489827856025,
"grad_norm": 0.0147705078125,
"learning_rate": 5.986593267159224e-07,
"loss": 0.0013,
"step": 4340
},
{
"epoch": 54.460093896713616,
"grad_norm": 0.01348876953125,
"learning_rate": 5.823840174260603e-07,
"loss": 0.0012,
"step": 4350
},
{
"epoch": 54.58528951486698,
"grad_norm": 0.01611328125,
"learning_rate": 5.663193104737413e-07,
"loss": 0.0014,
"step": 4360
},
{
"epoch": 54.71048513302034,
"grad_norm": 0.01409912109375,
"learning_rate": 5.504659717165812e-07,
"loss": 0.0012,
"step": 4370
},
{
"epoch": 54.83568075117371,
"grad_norm": 0.01953125,
"learning_rate": 5.348247569355736e-07,
"loss": 0.0013,
"step": 4380
},
{
"epoch": 54.960876369327075,
"grad_norm": 0.01806640625,
"learning_rate": 5.193964117990625e-07,
"loss": 0.0013,
"step": 4390
},
{
"epoch": 55.08607198748044,
"grad_norm": 0.0400390625,
"learning_rate": 5.041816718271925e-07,
"loss": 0.0012,
"step": 4400
},
{
"epoch": 55.2112676056338,
"grad_norm": 0.01409912109375,
"learning_rate": 4.891812623568476e-07,
"loss": 0.0012,
"step": 4410
},
{
"epoch": 55.33646322378717,
"grad_norm": 0.0135498046875,
"learning_rate": 4.743958985070662e-07,
"loss": 0.0013,
"step": 4420
},
{
"epoch": 55.46165884194053,
"grad_norm": 0.0142822265625,
"learning_rate": 4.598262851449525e-07,
"loss": 0.0013,
"step": 4430
},
{
"epoch": 55.586854460093896,
"grad_norm": 0.01171875,
"learning_rate": 4.454731168520754e-07,
"loss": 0.0013,
"step": 4440
},
{
"epoch": 55.71205007824726,
"grad_norm": 0.01202392578125,
"learning_rate": 4.3133707789134895e-07,
"loss": 0.0011,
"step": 4450
},
{
"epoch": 55.83724569640063,
"grad_norm": 0.01226806640625,
"learning_rate": 4.174188421744174e-07,
"loss": 0.0013,
"step": 4460
},
{
"epoch": 55.96244131455399,
"grad_norm": 0.01300048828125,
"learning_rate": 4.0371907322952654e-07,
"loss": 0.0012,
"step": 4470
},
{
"epoch": 56.087636932707355,
"grad_norm": 0.01226806640625,
"learning_rate": 3.902384241698876e-07,
"loss": 0.0012,
"step": 4480
},
{
"epoch": 56.21283255086072,
"grad_norm": 0.01202392578125,
"learning_rate": 3.769775376625423e-07,
"loss": 0.0012,
"step": 4490
},
{
"epoch": 56.33802816901409,
"grad_norm": 0.01318359375,
"learning_rate": 3.639370458977304e-07,
"loss": 0.0013,
"step": 4500
},
{
"epoch": 56.46322378716745,
"grad_norm": 0.01171875,
"learning_rate": 3.511175705587433e-07,
"loss": 0.0012,
"step": 4510
},
{
"epoch": 56.58841940532081,
"grad_norm": 0.01385498046875,
"learning_rate": 3.3851972279228983e-07,
"loss": 0.0012,
"step": 4520
},
{
"epoch": 56.713615023474176,
"grad_norm": 0.0177001953125,
"learning_rate": 3.261441031793638e-07,
"loss": 0.0013,
"step": 4530
},
{
"epoch": 56.838810641627546,
"grad_norm": 0.01214599609375,
"learning_rate": 3.139913017066054e-07,
"loss": 0.0012,
"step": 4540
},
{
"epoch": 56.96400625978091,
"grad_norm": 0.012939453125,
"learning_rate": 3.0206189773818005e-07,
"loss": 0.0013,
"step": 4550
},
{
"epoch": 57.08920187793427,
"grad_norm": 0.01507568359375,
"learning_rate": 2.903564599881586e-07,
"loss": 0.0013,
"step": 4560
},
{
"epoch": 57.214397496087635,
"grad_norm": 0.0189208984375,
"learning_rate": 2.788755464934001e-07,
"loss": 0.0012,
"step": 4570
},
{
"epoch": 57.339593114241005,
"grad_norm": 0.01275634765625,
"learning_rate": 2.676197045869511e-07,
"loss": 0.0012,
"step": 4580
},
{
"epoch": 57.46478873239437,
"grad_norm": 0.01129150390625,
"learning_rate": 2.565894708719552e-07,
"loss": 0.0013,
"step": 4590
},
{
"epoch": 57.58998435054773,
"grad_norm": 0.01361083984375,
"learning_rate": 2.457853711960673e-07,
"loss": 0.0012,
"step": 4600
},
{
"epoch": 57.715179968701094,
"grad_norm": 0.0135498046875,
"learning_rate": 2.3520792062638576e-07,
"loss": 0.0012,
"step": 4610
},
{
"epoch": 57.840375586854464,
"grad_norm": 0.01385498046875,
"learning_rate": 2.248576234248967e-07,
"loss": 0.0012,
"step": 4620
},
{
"epoch": 57.96557120500783,
"grad_norm": 0.0155029296875,
"learning_rate": 2.1473497302443857e-07,
"loss": 0.0012,
"step": 4630
},
{
"epoch": 58.09076682316119,
"grad_norm": 0.01300048828125,
"learning_rate": 2.0484045200517222e-07,
"loss": 0.0013,
"step": 4640
},
{
"epoch": 58.21596244131455,
"grad_norm": 0.0137939453125,
"learning_rate": 1.9517453207157865e-07,
"loss": 0.0012,
"step": 4650
},
{
"epoch": 58.341158059467915,
"grad_norm": 0.0125732421875,
"learning_rate": 1.8573767402997155e-07,
"loss": 0.0012,
"step": 4660
},
{
"epoch": 58.466353677621285,
"grad_norm": 0.01531982421875,
"learning_rate": 1.7653032776652702e-07,
"loss": 0.0012,
"step": 4670
},
{
"epoch": 58.59154929577465,
"grad_norm": 0.014404296875,
"learning_rate": 1.675529322258368e-07,
"loss": 0.0013,
"step": 4680
},
{
"epoch": 58.71674491392801,
"grad_norm": 0.0142822265625,
"learning_rate": 1.5880591538998292e-07,
"loss": 0.0012,
"step": 4690
},
{
"epoch": 58.841940532081374,
"grad_norm": 0.0128173828125,
"learning_rate": 1.50289694258135e-07,
"loss": 0.0011,
"step": 4700
},
{
"epoch": 58.967136150234744,
"grad_norm": 0.01123046875,
"learning_rate": 1.420046748266668e-07,
"loss": 0.0012,
"step": 4710
},
{
"epoch": 59.09233176838811,
"grad_norm": 0.0145263671875,
"learning_rate": 1.3395125206980774e-07,
"loss": 0.0012,
"step": 4720
},
{
"epoch": 59.21752738654147,
"grad_norm": 0.0126953125,
"learning_rate": 1.261298099208047e-07,
"loss": 0.0012,
"step": 4730
},
{
"epoch": 59.34272300469483,
"grad_norm": 0.048828125,
"learning_rate": 1.185407212536277e-07,
"loss": 0.0013,
"step": 4740
},
{
"epoch": 59.4679186228482,
"grad_norm": 0.0198974609375,
"learning_rate": 1.1118434786518473e-07,
"loss": 0.0013,
"step": 4750
},
{
"epoch": 59.593114241001565,
"grad_norm": 0.01470947265625,
"learning_rate": 1.0406104045808274e-07,
"loss": 0.0012,
"step": 4760
},
{
"epoch": 59.71830985915493,
"grad_norm": 0.015869140625,
"learning_rate": 9.717113862389993e-08,
"loss": 0.0012,
"step": 4770
},
{
"epoch": 59.84350547730829,
"grad_norm": 0.024169921875,
"learning_rate": 9.051497082700256e-08,
"loss": 0.0012,
"step": 4780
},
{
"epoch": 59.96870109546166,
"grad_norm": 0.0205078125,
"learning_rate": 8.40928543888836e-08,
"loss": 0.0012,
"step": 4790
},
{
"epoch": 60.093896713615024,
"grad_norm": 0.050048828125,
"learning_rate": 7.790509547303427e-08,
"loss": 0.0011,
"step": 4800
},
{
"epoch": 60.21909233176839,
"grad_norm": 0.06787109375,
"learning_rate": 7.195198907034906e-08,
"loss": 0.0013,
"step": 4810
},
{
"epoch": 60.34428794992175,
"grad_norm": 0.08154296875,
"learning_rate": 6.623381898506365e-08,
"loss": 0.0012,
"step": 4820
},
{
"epoch": 60.46948356807512,
"grad_norm": 0.1455078125,
"learning_rate": 6.075085782122237e-08,
"loss": 0.0012,
"step": 4830
},
{
"epoch": 60.59467918622848,
"grad_norm": 0.053466796875,
"learning_rate": 5.550336696968472e-08,
"loss": 0.0012,
"step": 4840
},
{
"epoch": 60.719874804381845,
"grad_norm": 0.1455078125,
"learning_rate": 5.0491596595663714e-08,
"loss": 0.0014,
"step": 4850
},
{
"epoch": 60.84507042253521,
"grad_norm": 0.51171875,
"learning_rate": 4.571578562679757e-08,
"loss": 0.0012,
"step": 4860
},
{
"epoch": 60.97026604068858,
"grad_norm": 0.138671875,
"learning_rate": 4.1176161741760535e-08,
"loss": 0.0012,
"step": 4870
},
{
"epoch": 61.09546165884194,
"grad_norm": 0.142578125,
"learning_rate": 3.687294135941044e-08,
"loss": 0.0013,
"step": 4880
},
{
"epoch": 61.220657276995304,
"grad_norm": 0.1298828125,
"learning_rate": 3.280632962846919e-08,
"loss": 0.0013,
"step": 4890
},
{
"epoch": 61.34585289514867,
"grad_norm": 0.0322265625,
"learning_rate": 2.8976520417742794e-08,
"loss": 0.0012,
"step": 4900
},
{
"epoch": 61.47104851330204,
"grad_norm": 0.043212890625,
"learning_rate": 2.5383696306878756e-08,
"loss": 0.0012,
"step": 4910
},
{
"epoch": 61.5962441314554,
"grad_norm": 0.042724609375,
"learning_rate": 2.202802857766362e-08,
"loss": 0.0012,
"step": 4920
},
{
"epoch": 61.72143974960876,
"grad_norm": 0.04052734375,
"learning_rate": 1.8909677205856682e-08,
"loss": 0.0012,
"step": 4930
},
{
"epoch": 61.846635367762126,
"grad_norm": 0.039794921875,
"learning_rate": 1.6028790853561126e-08,
"loss": 0.0013,
"step": 4940
},
{
"epoch": 61.971830985915496,
"grad_norm": 0.03564453125,
"learning_rate": 1.3385506862140795e-08,
"loss": 0.0013,
"step": 4950
},
{
"epoch": 62.09702660406886,
"grad_norm": 0.040771484375,
"learning_rate": 1.0979951245669307e-08,
"loss": 0.0013,
"step": 4960
},
{
"epoch": 62.22222222222222,
"grad_norm": 0.05419921875,
"learning_rate": 8.812238684923758e-09,
"loss": 0.0012,
"step": 4970
},
{
"epoch": 62.347417840375584,
"grad_norm": 0.0400390625,
"learning_rate": 6.882472521919093e-09,
"loss": 0.0012,
"step": 4980
},
{
"epoch": 62.472613458528954,
"grad_norm": 0.06982421875,
"learning_rate": 5.190744754978716e-09,
"loss": 0.0012,
"step": 4990
},
{
"epoch": 62.59780907668232,
"grad_norm": 0.033447265625,
"learning_rate": 3.737136034349109e-09,
"loss": 0.0013,
"step": 5000
},
{
"epoch": 62.59780907668232,
"eval_loss": 3.0162200927734375,
"eval_runtime": 3.1935,
"eval_samples_per_second": 22.546,
"eval_steps_per_second": 22.546,
"step": 5000
},
{
"epoch": 62.72300469483568,
"grad_norm": 0.03369140625,
"learning_rate": 2.5217156583579037e-09,
"loss": 0.0012,
"step": 5010
},
{
"epoch": 62.84820031298904,
"grad_norm": 0.041259765625,
"learning_rate": 1.5445415701065281e-09,
"loss": 0.0013,
"step": 5020
},
{
"epoch": 62.97339593114241,
"grad_norm": 0.03564453125,
"learning_rate": 8.056603547090813e-10,
"loss": 0.0012,
"step": 5030
},
{
"epoch": 63.098591549295776,
"grad_norm": 0.02685546875,
"learning_rate": 3.0510723707299907e-10,
"loss": 0.0013,
"step": 5040
},
{
"epoch": 63.22378716744914,
"grad_norm": 0.060546875,
"learning_rate": 4.290608021706444e-11,
"loss": 0.0012,
"step": 5050
},
{
"epoch": 63.298904538341155,
"step": 5056,
"total_flos": 5.606358848176128e+17,
"train_loss": 0.27379138119090674,
"train_runtime": 5351.3398,
"train_samples_per_second": 7.642,
"train_steps_per_second": 0.945
}
],
"logging_steps": 10,
"max_steps": 5056,
"num_input_tokens_seen": 0,
"num_train_epochs": 64,
"save_steps": 0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 5.606358848176128e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}