gpt_train_12_384 / trainer_state.json
gokulsrinivasagan's picture
End of training
674480c verified
raw
history blame
146 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.016516516516516516,
"eval_steps": 1,
"global_step": 341,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 4.8435532306500046e-05,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8984,
"step": 1
},
{
"epoch": 4.8435532306500046e-05,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 276.4656,
"eval_samples_per_second": 122.138,
"eval_steps_per_second": 3.82,
"step": 1
},
{
"epoch": 9.687106461300009e-05,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8984,
"step": 2
},
{
"epoch": 9.687106461300009e-05,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 275.1935,
"eval_samples_per_second": 122.703,
"eval_steps_per_second": 3.837,
"step": 2
},
{
"epoch": 0.00014530659691950015,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8984,
"step": 3
},
{
"epoch": 0.00014530659691950015,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 275.13,
"eval_samples_per_second": 122.731,
"eval_steps_per_second": 3.838,
"step": 3
},
{
"epoch": 0.00019374212922600018,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8984,
"step": 4
},
{
"epoch": 0.00019374212922600018,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 276.2699,
"eval_samples_per_second": 122.225,
"eval_steps_per_second": 3.822,
"step": 4
},
{
"epoch": 0.00024217766153250024,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.9062,
"step": 5
},
{
"epoch": 0.00024217766153250024,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 274.7331,
"eval_samples_per_second": 122.908,
"eval_steps_per_second": 3.844,
"step": 5
},
{
"epoch": 0.0002906131938390003,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8984,
"step": 6
},
{
"epoch": 0.0002906131938390003,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 275.5447,
"eval_samples_per_second": 122.546,
"eval_steps_per_second": 3.832,
"step": 6
},
{
"epoch": 0.00033904872614550033,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.9062,
"step": 7
},
{
"epoch": 0.00033904872614550033,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 275.1675,
"eval_samples_per_second": 122.714,
"eval_steps_per_second": 3.838,
"step": 7
},
{
"epoch": 0.00038748425845200037,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.9062,
"step": 8
},
{
"epoch": 0.00038748425845200037,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 275.5655,
"eval_samples_per_second": 122.537,
"eval_steps_per_second": 3.832,
"step": 8
},
{
"epoch": 0.00043591979075850045,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.9062,
"step": 9
},
{
"epoch": 0.00043591979075850045,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 273.3419,
"eval_samples_per_second": 123.534,
"eval_steps_per_second": 3.863,
"step": 9
},
{
"epoch": 0.0004843553230650005,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8984,
"step": 10
},
{
"epoch": 0.0004843553230650005,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 275.1369,
"eval_samples_per_second": 122.728,
"eval_steps_per_second": 3.838,
"step": 10
},
{
"epoch": 0.0005327908553715005,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8984,
"step": 11
},
{
"epoch": 0.0005327908553715005,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 274.4663,
"eval_samples_per_second": 123.028,
"eval_steps_per_second": 3.847,
"step": 11
},
{
"epoch": 0.0005812263876780006,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8984,
"step": 12
},
{
"epoch": 0.0005812263876780006,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 273.8795,
"eval_samples_per_second": 123.291,
"eval_steps_per_second": 3.856,
"step": 12
},
{
"epoch": 0.0006296619199845006,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8984,
"step": 13
},
{
"epoch": 0.0006296619199845006,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 273.17,
"eval_samples_per_second": 123.612,
"eval_steps_per_second": 3.866,
"step": 13
},
{
"epoch": 0.0006780974522910007,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.9062,
"step": 14
},
{
"epoch": 0.0006780974522910007,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 274.4591,
"eval_samples_per_second": 123.031,
"eval_steps_per_second": 3.848,
"step": 14
},
{
"epoch": 0.0007265329845975008,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8984,
"step": 15
},
{
"epoch": 0.0007265329845975008,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 275.1413,
"eval_samples_per_second": 122.726,
"eval_steps_per_second": 3.838,
"step": 15
},
{
"epoch": 0.0007749685169040007,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.8984,
"step": 16
},
{
"epoch": 0.0007749685169040007,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 276.8384,
"eval_samples_per_second": 121.974,
"eval_steps_per_second": 3.814,
"step": 16
},
{
"epoch": 0.0008234040492105008,
"grad_norm": 0.0,
"learning_rate": 1e-05,
"loss": 10.9062,
"step": 17
},
{
"epoch": 0.0008234040492105008,
"eval_accuracy": 0.0001320069300164392,
"eval_loss": 10.90625,
"eval_runtime": 273.566,
"eval_samples_per_second": 123.433,
"eval_steps_per_second": 3.86,
"step": 17
},
{
"epoch": 0.0008718395815170009,
"grad_norm": 6.863816738128662,
"learning_rate": 9.99999515644677e-06,
"loss": 10.9062,
"step": 18
},
{
"epoch": 0.0008718395815170009,
"eval_accuracy": 0.010980023790775268,
"eval_loss": 10.7578125,
"eval_runtime": 273.6742,
"eval_samples_per_second": 123.384,
"eval_steps_per_second": 3.859,
"step": 18
},
{
"epoch": 0.0009202751138235009,
"grad_norm": 6.298513889312744,
"learning_rate": 9.999990312893539e-06,
"loss": 10.7734,
"step": 19
},
{
"epoch": 0.0009202751138235009,
"eval_accuracy": 0.028515349612393204,
"eval_loss": 10.65625,
"eval_runtime": 273.1376,
"eval_samples_per_second": 123.626,
"eval_steps_per_second": 3.866,
"step": 19
},
{
"epoch": 0.000968710646130001,
"grad_norm": 5.340964317321777,
"learning_rate": 9.999985469340309e-06,
"loss": 10.6797,
"step": 20
},
{
"epoch": 0.000968710646130001,
"eval_accuracy": 0.04691063110573666,
"eval_loss": 10.578125,
"eval_runtime": 273.0387,
"eval_samples_per_second": 123.671,
"eval_steps_per_second": 3.868,
"step": 20
},
{
"epoch": 0.001017146178436501,
"grad_norm": 4.327230930328369,
"learning_rate": 9.999980625787079e-06,
"loss": 10.6016,
"step": 21
},
{
"epoch": 0.001017146178436501,
"eval_accuracy": 0.04854858814680248,
"eval_loss": 10.5234375,
"eval_runtime": 272.9282,
"eval_samples_per_second": 123.721,
"eval_steps_per_second": 3.869,
"step": 21
},
{
"epoch": 0.001065581710743001,
"grad_norm": 3.803434133529663,
"learning_rate": 9.999975782233847e-06,
"loss": 10.5234,
"step": 22
},
{
"epoch": 0.001065581710743001,
"eval_accuracy": 0.04776647603534324,
"eval_loss": 10.4765625,
"eval_runtime": 272.6159,
"eval_samples_per_second": 123.863,
"eval_steps_per_second": 3.874,
"step": 22
},
{
"epoch": 0.001114017243049501,
"grad_norm": 3.2490711212158203,
"learning_rate": 9.999970938680617e-06,
"loss": 10.5,
"step": 23
},
{
"epoch": 0.001114017243049501,
"eval_accuracy": 0.04827195257153118,
"eval_loss": 10.4375,
"eval_runtime": 272.3658,
"eval_samples_per_second": 123.977,
"eval_steps_per_second": 3.877,
"step": 23
},
{
"epoch": 0.0011624527753560012,
"grad_norm": 2.9085004329681396,
"learning_rate": 9.999966095127386e-06,
"loss": 10.4531,
"step": 24
},
{
"epoch": 0.0011624527753560012,
"eval_accuracy": 0.05073292650302844,
"eval_loss": 10.40625,
"eval_runtime": 272.851,
"eval_samples_per_second": 123.756,
"eval_steps_per_second": 3.87,
"step": 24
},
{
"epoch": 0.0012108883076625012,
"grad_norm": 2.668471574783325,
"learning_rate": 9.999961251574155e-06,
"loss": 10.4141,
"step": 25
},
{
"epoch": 0.0012108883076625012,
"eval_accuracy": 0.05310853016333744,
"eval_loss": 10.3828125,
"eval_runtime": 272.4493,
"eval_samples_per_second": 123.939,
"eval_steps_per_second": 3.876,
"step": 25
},
{
"epoch": 0.0012593238399690012,
"grad_norm": 2.864935874938965,
"learning_rate": 9.999956408020926e-06,
"loss": 10.3672,
"step": 26
},
{
"epoch": 0.0012593238399690012,
"eval_accuracy": 0.0555671013371173,
"eval_loss": 10.359375,
"eval_runtime": 273.3963,
"eval_samples_per_second": 123.509,
"eval_steps_per_second": 3.863,
"step": 26
},
{
"epoch": 0.0013077593722755014,
"grad_norm": 2.2354369163513184,
"learning_rate": 9.999951564467694e-06,
"loss": 10.3828,
"step": 27
},
{
"epoch": 0.0013077593722755014,
"eval_accuracy": 0.05616663281094196,
"eval_loss": 10.3359375,
"eval_runtime": 272.6273,
"eval_samples_per_second": 123.858,
"eval_steps_per_second": 3.873,
"step": 27
},
{
"epoch": 0.0013561949045820013,
"grad_norm": 2.1477534770965576,
"learning_rate": 9.999946720914464e-06,
"loss": 10.3594,
"step": 28
},
{
"epoch": 0.0013561949045820013,
"eval_accuracy": 0.05624939768219569,
"eval_loss": 10.3203125,
"eval_runtime": 273.0918,
"eval_samples_per_second": 123.647,
"eval_steps_per_second": 3.867,
"step": 28
},
{
"epoch": 0.0014046304368885013,
"grad_norm": 2.097315549850464,
"learning_rate": 9.999941877361234e-06,
"loss": 10.3281,
"step": 29
},
{
"epoch": 0.0014046304368885013,
"eval_accuracy": 0.055895844911079295,
"eval_loss": 10.3046875,
"eval_runtime": 273.0626,
"eval_samples_per_second": 123.66,
"eval_steps_per_second": 3.867,
"step": 29
},
{
"epoch": 0.0014530659691950015,
"grad_norm": 1.8777693510055542,
"learning_rate": 9.999937033808002e-06,
"loss": 10.3203,
"step": 30
},
{
"epoch": 0.0014530659691950015,
"eval_accuracy": 0.0563090902895847,
"eval_loss": 10.296875,
"eval_runtime": 272.1369,
"eval_samples_per_second": 124.081,
"eval_steps_per_second": 3.88,
"step": 30
},
{
"epoch": 0.0015015015015015015,
"grad_norm": 1.8313064575195312,
"learning_rate": 9.999932190254772e-06,
"loss": 10.3281,
"step": 31
},
{
"epoch": 0.0015015015015015015,
"eval_accuracy": 0.05664084454791549,
"eval_loss": 10.28125,
"eval_runtime": 272.0928,
"eval_samples_per_second": 124.101,
"eval_steps_per_second": 3.881,
"step": 31
},
{
"epoch": 0.0015499370338080015,
"grad_norm": 1.7771973609924316,
"learning_rate": 9.999927346701542e-06,
"loss": 10.3359,
"step": 32
},
{
"epoch": 0.0015499370338080015,
"eval_accuracy": 0.05662431473277527,
"eval_loss": 10.2734375,
"eval_runtime": 272.172,
"eval_samples_per_second": 124.065,
"eval_steps_per_second": 3.88,
"step": 32
},
{
"epoch": 0.0015983725661145017,
"grad_norm": 1.8934669494628906,
"learning_rate": 9.99992250314831e-06,
"loss": 10.2656,
"step": 33
},
{
"epoch": 0.0015983725661145017,
"eval_accuracy": 0.057003458321026435,
"eval_loss": 10.265625,
"eval_runtime": 271.9989,
"eval_samples_per_second": 124.144,
"eval_steps_per_second": 3.882,
"step": 33
},
{
"epoch": 0.0016468080984210016,
"grad_norm": 1.789952039718628,
"learning_rate": 9.99991765959508e-06,
"loss": 10.2656,
"step": 34
},
{
"epoch": 0.0016468080984210016,
"eval_accuracy": 0.056085286435208145,
"eval_loss": 10.2578125,
"eval_runtime": 272.24,
"eval_samples_per_second": 124.034,
"eval_steps_per_second": 3.879,
"step": 34
},
{
"epoch": 0.0016952436307275016,
"grad_norm": 1.7584354877471924,
"learning_rate": 9.99991281604185e-06,
"loss": 10.2656,
"step": 35
},
{
"epoch": 0.0016952436307275016,
"eval_accuracy": 0.056159366640013426,
"eval_loss": 10.2421875,
"eval_runtime": 272.3595,
"eval_samples_per_second": 123.98,
"eval_steps_per_second": 3.877,
"step": 35
},
{
"epoch": 0.0017436791630340018,
"grad_norm": 1.7618820667266846,
"learning_rate": 9.999907972488618e-06,
"loss": 10.2656,
"step": 36
},
{
"epoch": 0.0017436791630340018,
"eval_accuracy": 0.057521267083571186,
"eval_loss": 10.234375,
"eval_runtime": 272.7034,
"eval_samples_per_second": 123.823,
"eval_steps_per_second": 3.872,
"step": 36
},
{
"epoch": 0.0017921146953405018,
"grad_norm": 1.6511751413345337,
"learning_rate": 9.999903128935388e-06,
"loss": 10.2656,
"step": 37
},
{
"epoch": 0.0017921146953405018,
"eval_accuracy": 0.05863849152438795,
"eval_loss": 10.2265625,
"eval_runtime": 273.7952,
"eval_samples_per_second": 123.329,
"eval_steps_per_second": 3.857,
"step": 37
},
{
"epoch": 0.0018405502276470018,
"grad_norm": 1.8650130033493042,
"learning_rate": 9.999898285382156e-06,
"loss": 10.2109,
"step": 38
},
{
"epoch": 0.0018405502276470018,
"eval_accuracy": 0.059270706292946944,
"eval_loss": 10.21875,
"eval_runtime": 274.2569,
"eval_samples_per_second": 123.122,
"eval_steps_per_second": 3.85,
"step": 38
},
{
"epoch": 0.001888985759953502,
"grad_norm": 1.7996951341629028,
"learning_rate": 9.999893441828926e-06,
"loss": 10.2656,
"step": 39
},
{
"epoch": 0.001888985759953502,
"eval_accuracy": 0.05958891247161815,
"eval_loss": 10.2109375,
"eval_runtime": 274.6294,
"eval_samples_per_second": 122.955,
"eval_steps_per_second": 3.845,
"step": 39
},
{
"epoch": 0.001937421292260002,
"grad_norm": 1.7773430347442627,
"learning_rate": 9.999888598275696e-06,
"loss": 10.2266,
"step": 40
},
{
"epoch": 0.001937421292260002,
"eval_accuracy": 0.05994316001605042,
"eval_loss": 10.203125,
"eval_runtime": 273.9324,
"eval_samples_per_second": 123.268,
"eval_steps_per_second": 3.855,
"step": 40
},
{
"epoch": 0.001985856824566502,
"grad_norm": 1.7419933080673218,
"learning_rate": 9.999883754722464e-06,
"loss": 10.2109,
"step": 41
},
{
"epoch": 0.001985856824566502,
"eval_accuracy": 0.06009832605659606,
"eval_loss": 10.1953125,
"eval_runtime": 274.4948,
"eval_samples_per_second": 123.015,
"eval_steps_per_second": 3.847,
"step": 41
},
{
"epoch": 0.002034292356873002,
"grad_norm": 1.7278474569320679,
"learning_rate": 9.999878911169234e-06,
"loss": 10.2109,
"step": 42
},
{
"epoch": 0.002034292356873002,
"eval_accuracy": 0.060353828943509456,
"eval_loss": 10.1796875,
"eval_runtime": 275.0795,
"eval_samples_per_second": 122.754,
"eval_steps_per_second": 3.839,
"step": 42
},
{
"epoch": 0.002082727889179502,
"grad_norm": 1.8463383913040161,
"learning_rate": 9.999874067616004e-06,
"loss": 10.2109,
"step": 43
},
{
"epoch": 0.002082727889179502,
"eval_accuracy": 0.060806444809914505,
"eval_loss": 10.171875,
"eval_runtime": 275.6513,
"eval_samples_per_second": 122.499,
"eval_steps_per_second": 3.831,
"step": 43
},
{
"epoch": 0.002131163421486002,
"grad_norm": 1.8434734344482422,
"learning_rate": 9.999869224062774e-06,
"loss": 10.1484,
"step": 44
},
{
"epoch": 0.002131163421486002,
"eval_accuracy": 0.061002660373873155,
"eval_loss": 10.1640625,
"eval_runtime": 275.2495,
"eval_samples_per_second": 122.678,
"eval_steps_per_second": 3.837,
"step": 44
},
{
"epoch": 0.002179598953792502,
"grad_norm": 1.8196474313735962,
"learning_rate": 9.999864380509543e-06,
"loss": 10.1875,
"step": 45
},
{
"epoch": 0.002179598953792502,
"eval_accuracy": 0.061108642253432405,
"eval_loss": 10.1484375,
"eval_runtime": 275.2499,
"eval_samples_per_second": 122.678,
"eval_steps_per_second": 3.837,
"step": 45
},
{
"epoch": 0.002228034486099002,
"grad_norm": 1.7358877658843994,
"learning_rate": 9.999859536956312e-06,
"loss": 10.1719,
"step": 46
},
{
"epoch": 0.002228034486099002,
"eval_accuracy": 0.061226840563795806,
"eval_loss": 10.140625,
"eval_runtime": 273.2604,
"eval_samples_per_second": 123.571,
"eval_steps_per_second": 3.864,
"step": 46
},
{
"epoch": 0.0022764700184055024,
"grad_norm": 1.7613184452056885,
"learning_rate": 9.999854693403081e-06,
"loss": 10.1484,
"step": 47
},
{
"epoch": 0.0022764700184055024,
"eval_accuracy": 0.06154533623134863,
"eval_loss": 10.1328125,
"eval_runtime": 274.8774,
"eval_samples_per_second": 122.844,
"eval_steps_per_second": 3.842,
"step": 47
},
{
"epoch": 0.0023249055507120024,
"grad_norm": 1.926283597946167,
"learning_rate": 9.999849849849851e-06,
"loss": 10.1172,
"step": 48
},
{
"epoch": 0.0023249055507120024,
"eval_accuracy": 0.062220076916616865,
"eval_loss": 10.1171875,
"eval_runtime": 275.5341,
"eval_samples_per_second": 122.551,
"eval_steps_per_second": 3.833,
"step": 48
},
{
"epoch": 0.0023733410830185024,
"grad_norm": 1.7182645797729492,
"learning_rate": 9.99984500629662e-06,
"loss": 10.1797,
"step": 49
},
{
"epoch": 0.0023733410830185024,
"eval_accuracy": 0.06321505020272762,
"eval_loss": 10.109375,
"eval_runtime": 275.3575,
"eval_samples_per_second": 122.63,
"eval_steps_per_second": 3.835,
"step": 49
},
{
"epoch": 0.0024217766153250024,
"grad_norm": 1.756512999534607,
"learning_rate": 9.99984016274339e-06,
"loss": 10.1016,
"step": 50
},
{
"epoch": 0.0024217766153250024,
"eval_accuracy": 0.06421378684429936,
"eval_loss": 10.1015625,
"eval_runtime": 274.9451,
"eval_samples_per_second": 122.814,
"eval_steps_per_second": 3.841,
"step": 50
},
{
"epoch": 0.0024702121476315024,
"grad_norm": 1.8228658437728882,
"learning_rate": 9.99983531919016e-06,
"loss": 10.1406,
"step": 51
},
{
"epoch": 0.0024702121476315024,
"eval_accuracy": 0.06511247612838496,
"eval_loss": 10.09375,
"eval_runtime": 274.7644,
"eval_samples_per_second": 122.894,
"eval_steps_per_second": 3.843,
"step": 51
},
{
"epoch": 0.0025186476799380023,
"grad_norm": 1.6864567995071411,
"learning_rate": 9.999830475636927e-06,
"loss": 10.1406,
"step": 52
},
{
"epoch": 0.0025186476799380023,
"eval_accuracy": 0.0658020965421682,
"eval_loss": 10.0859375,
"eval_runtime": 275.2863,
"eval_samples_per_second": 122.661,
"eval_steps_per_second": 3.836,
"step": 52
},
{
"epoch": 0.0025670832122445027,
"grad_norm": 1.7754981517791748,
"learning_rate": 9.999825632083697e-06,
"loss": 10.1094,
"step": 53
},
{
"epoch": 0.0025670832122445027,
"eval_accuracy": 0.06627448449918756,
"eval_loss": 10.078125,
"eval_runtime": 274.8943,
"eval_samples_per_second": 122.836,
"eval_steps_per_second": 3.841,
"step": 53
},
{
"epoch": 0.0026155187445510027,
"grad_norm": 1.7636278867721558,
"learning_rate": 9.999820788530467e-06,
"loss": 10.1016,
"step": 54
},
{
"epoch": 0.0026155187445510027,
"eval_accuracy": 0.06685522814459541,
"eval_loss": 10.0703125,
"eval_runtime": 275.7861,
"eval_samples_per_second": 122.439,
"eval_steps_per_second": 3.829,
"step": 54
},
{
"epoch": 0.0026639542768575027,
"grad_norm": 1.7524579763412476,
"learning_rate": 9.999815944977235e-06,
"loss": 10.0781,
"step": 55
},
{
"epoch": 0.0026639542768575027,
"eval_accuracy": 0.06716721031231189,
"eval_loss": 10.0625,
"eval_runtime": 274.8867,
"eval_samples_per_second": 122.84,
"eval_steps_per_second": 3.842,
"step": 55
},
{
"epoch": 0.0027123898091640027,
"grad_norm": 1.8897311687469482,
"learning_rate": 9.999811101424005e-06,
"loss": 10.0703,
"step": 56
},
{
"epoch": 0.0027123898091640027,
"eval_accuracy": 0.06777936350137496,
"eval_loss": 10.0546875,
"eval_runtime": 276.2638,
"eval_samples_per_second": 122.227,
"eval_steps_per_second": 3.822,
"step": 56
},
{
"epoch": 0.0027608253414705027,
"grad_norm": 1.7320737838745117,
"learning_rate": 9.999806257870775e-06,
"loss": 10.0703,
"step": 57
},
{
"epoch": 0.0027608253414705027,
"eval_accuracy": 0.06813957451676851,
"eval_loss": 10.046875,
"eval_runtime": 275.4873,
"eval_samples_per_second": 122.572,
"eval_steps_per_second": 3.833,
"step": 57
},
{
"epoch": 0.0028092608737770026,
"grad_norm": 1.685152530670166,
"learning_rate": 9.999801414317543e-06,
"loss": 10.0469,
"step": 58
},
{
"epoch": 0.0028092608737770026,
"eval_accuracy": 0.06857357624808572,
"eval_loss": 10.0390625,
"eval_runtime": 275.0861,
"eval_samples_per_second": 122.751,
"eval_steps_per_second": 3.839,
"step": 58
},
{
"epoch": 0.002857696406083503,
"grad_norm": 1.6026166677474976,
"learning_rate": 9.999796570764313e-06,
"loss": 10.1016,
"step": 59
},
{
"epoch": 0.002857696406083503,
"eval_accuracy": 0.06889201401786221,
"eval_loss": 10.03125,
"eval_runtime": 273.8074,
"eval_samples_per_second": 123.324,
"eval_steps_per_second": 3.857,
"step": 59
},
{
"epoch": 0.002906131938390003,
"grad_norm": 1.7406948804855347,
"learning_rate": 9.999791727211083e-06,
"loss": 10.0547,
"step": 60
},
{
"epoch": 0.002906131938390003,
"eval_accuracy": 0.06942965276879759,
"eval_loss": 10.03125,
"eval_runtime": 274.1162,
"eval_samples_per_second": 123.185,
"eval_steps_per_second": 3.852,
"step": 60
},
{
"epoch": 0.002954567470696503,
"grad_norm": 2.25240421295166,
"learning_rate": 9.999786883657853e-06,
"loss": 10.0391,
"step": 61
},
{
"epoch": 0.002954567470696503,
"eval_accuracy": 0.06947052859888163,
"eval_loss": 10.0234375,
"eval_runtime": 274.759,
"eval_samples_per_second": 122.897,
"eval_steps_per_second": 3.843,
"step": 61
},
{
"epoch": 0.003003003003003003,
"grad_norm": 1.6132714748382568,
"learning_rate": 9.999782040104623e-06,
"loss": 10.0547,
"step": 62
},
{
"epoch": 0.003003003003003003,
"eval_accuracy": 0.06921656000304079,
"eval_loss": 10.015625,
"eval_runtime": 275.2004,
"eval_samples_per_second": 122.7,
"eval_steps_per_second": 3.837,
"step": 62
},
{
"epoch": 0.003051438535309503,
"grad_norm": 1.6277832984924316,
"learning_rate": 9.99977719655139e-06,
"loss": 10.0312,
"step": 63
},
{
"epoch": 0.003051438535309503,
"eval_accuracy": 0.06882297091959703,
"eval_loss": 10.0078125,
"eval_runtime": 274.9078,
"eval_samples_per_second": 122.83,
"eval_steps_per_second": 3.841,
"step": 63
},
{
"epoch": 0.003099874067616003,
"grad_norm": 1.6769694089889526,
"learning_rate": 9.99977235299816e-06,
"loss": 10.0547,
"step": 64
},
{
"epoch": 0.003099874067616003,
"eval_accuracy": 0.06873357675295433,
"eval_loss": 10.0,
"eval_runtime": 273.9236,
"eval_samples_per_second": 123.272,
"eval_steps_per_second": 3.855,
"step": 64
},
{
"epoch": 0.0031483095999225033,
"grad_norm": 1.6080327033996582,
"learning_rate": 9.99976750944493e-06,
"loss": 10.0547,
"step": 65
},
{
"epoch": 0.0031483095999225033,
"eval_accuracy": 0.06925561205317066,
"eval_loss": 9.9921875,
"eval_runtime": 275.0761,
"eval_samples_per_second": 122.755,
"eval_steps_per_second": 3.839,
"step": 65
},
{
"epoch": 0.0031967451322290033,
"grad_norm": 1.6163508892059326,
"learning_rate": 9.999762665891699e-06,
"loss": 9.9922,
"step": 66
},
{
"epoch": 0.0031967451322290033,
"eval_accuracy": 0.0697469615319358,
"eval_loss": 9.984375,
"eval_runtime": 273.8628,
"eval_samples_per_second": 123.299,
"eval_steps_per_second": 3.856,
"step": 66
},
{
"epoch": 0.0032451806645355033,
"grad_norm": 1.625279426574707,
"learning_rate": 9.999757822338468e-06,
"loss": 10.0234,
"step": 67
},
{
"epoch": 0.0032451806645355033,
"eval_accuracy": 0.0704554855696885,
"eval_loss": 9.9765625,
"eval_runtime": 274.3158,
"eval_samples_per_second": 123.095,
"eval_steps_per_second": 3.85,
"step": 67
},
{
"epoch": 0.0032936161968420033,
"grad_norm": 1.6738680601119995,
"learning_rate": 9.999752978785238e-06,
"loss": 10.0,
"step": 68
},
{
"epoch": 0.0032936161968420033,
"eval_accuracy": 0.07112654974616023,
"eval_loss": 9.96875,
"eval_runtime": 275.9505,
"eval_samples_per_second": 122.366,
"eval_steps_per_second": 3.827,
"step": 68
},
{
"epoch": 0.0033420517291485033,
"grad_norm": 1.5247821807861328,
"learning_rate": 9.999748135232007e-06,
"loss": 10.0,
"step": 69
},
{
"epoch": 0.0033420517291485033,
"eval_accuracy": 0.0715033484744703,
"eval_loss": 9.9609375,
"eval_runtime": 274.4182,
"eval_samples_per_second": 123.049,
"eval_steps_per_second": 3.848,
"step": 69
},
{
"epoch": 0.0033904872614550032,
"grad_norm": 1.8255083560943604,
"learning_rate": 9.999743291678776e-06,
"loss": 9.9688,
"step": 70
},
{
"epoch": 0.0033904872614550032,
"eval_accuracy": 0.07161468589833944,
"eval_loss": 9.9609375,
"eval_runtime": 274.1089,
"eval_samples_per_second": 123.188,
"eval_steps_per_second": 3.852,
"step": 70
},
{
"epoch": 0.0034389227937615036,
"grad_norm": 1.845422387123108,
"learning_rate": 9.999738448125546e-06,
"loss": 9.9922,
"step": 71
},
{
"epoch": 0.0034389227937615036,
"eval_accuracy": 0.07169394795412562,
"eval_loss": 9.953125,
"eval_runtime": 275.2085,
"eval_samples_per_second": 122.696,
"eval_steps_per_second": 3.837,
"step": 71
},
{
"epoch": 0.0034873583260680036,
"grad_norm": 1.663128137588501,
"learning_rate": 9.999733604572314e-06,
"loss": 9.9844,
"step": 72
},
{
"epoch": 0.0034873583260680036,
"eval_accuracy": 0.07159517434771859,
"eval_loss": 9.9453125,
"eval_runtime": 274.6498,
"eval_samples_per_second": 122.946,
"eval_steps_per_second": 3.845,
"step": 72
},
{
"epoch": 0.0035357938583745036,
"grad_norm": 1.6756772994995117,
"learning_rate": 9.999728761019084e-06,
"loss": 9.9688,
"step": 73
},
{
"epoch": 0.0035357938583745036,
"eval_accuracy": 0.07181347791334446,
"eval_loss": 9.9375,
"eval_runtime": 274.5733,
"eval_samples_per_second": 122.98,
"eval_steps_per_second": 3.846,
"step": 73
},
{
"epoch": 0.0035842293906810036,
"grad_norm": 1.746936559677124,
"learning_rate": 9.999723917465854e-06,
"loss": 9.9453,
"step": 74
},
{
"epoch": 0.0035842293906810036,
"eval_accuracy": 0.07256360150338524,
"eval_loss": 9.9296875,
"eval_runtime": 275.0915,
"eval_samples_per_second": 122.748,
"eval_steps_per_second": 3.839,
"step": 74
},
{
"epoch": 0.0036326649229875036,
"grad_norm": 1.6043540239334106,
"learning_rate": 9.999719073912622e-06,
"loss": 9.9375,
"step": 75
},
{
"epoch": 0.0036326649229875036,
"eval_accuracy": 0.07335680103901034,
"eval_loss": 9.921875,
"eval_runtime": 274.3328,
"eval_samples_per_second": 123.088,
"eval_steps_per_second": 3.849,
"step": 75
},
{
"epoch": 0.0036811004552940035,
"grad_norm": 1.6499953269958496,
"learning_rate": 9.999714230359392e-06,
"loss": 9.9141,
"step": 76
},
{
"epoch": 0.0036811004552940035,
"eval_accuracy": 0.0744010163838838,
"eval_loss": 9.9140625,
"eval_runtime": 274.4219,
"eval_samples_per_second": 123.048,
"eval_steps_per_second": 3.848,
"step": 76
},
{
"epoch": 0.0037295359876005035,
"grad_norm": 1.6161168813705444,
"learning_rate": 9.999709386806162e-06,
"loss": 9.9062,
"step": 77
},
{
"epoch": 0.0037295359876005035,
"eval_accuracy": 0.07513791033203478,
"eval_loss": 9.90625,
"eval_runtime": 274.8621,
"eval_samples_per_second": 122.851,
"eval_steps_per_second": 3.842,
"step": 77
},
{
"epoch": 0.003777971519907004,
"grad_norm": 1.760338544845581,
"learning_rate": 9.999704543252932e-06,
"loss": 9.9219,
"step": 78
},
{
"epoch": 0.003777971519907004,
"eval_accuracy": 0.07549183943869726,
"eval_loss": 9.90625,
"eval_runtime": 273.942,
"eval_samples_per_second": 123.263,
"eval_steps_per_second": 3.855,
"step": 78
},
{
"epoch": 0.003826407052213504,
"grad_norm": 2.1402640342712402,
"learning_rate": 9.9996996996997e-06,
"loss": 9.9219,
"step": 79
},
{
"epoch": 0.003826407052213504,
"eval_accuracy": 0.07561446692894938,
"eval_loss": 9.8984375,
"eval_runtime": 273.1637,
"eval_samples_per_second": 123.615,
"eval_steps_per_second": 3.866,
"step": 79
},
{
"epoch": 0.003874842584520004,
"grad_norm": 1.5549274682998657,
"learning_rate": 9.99969485614647e-06,
"loss": 9.9219,
"step": 80
},
{
"epoch": 0.003874842584520004,
"eval_accuracy": 0.07565650071455989,
"eval_loss": 9.890625,
"eval_runtime": 274.1597,
"eval_samples_per_second": 123.165,
"eval_steps_per_second": 3.852,
"step": 80
},
{
"epoch": 0.003923278116826504,
"grad_norm": 1.619598388671875,
"learning_rate": 9.99969001259324e-06,
"loss": 9.875,
"step": 81
},
{
"epoch": 0.003923278116826504,
"eval_accuracy": 0.07585523483178858,
"eval_loss": 9.8828125,
"eval_runtime": 273.6756,
"eval_samples_per_second": 123.383,
"eval_steps_per_second": 3.859,
"step": 81
},
{
"epoch": 0.003971713649133004,
"grad_norm": 1.4982187747955322,
"learning_rate": 9.999685169040008e-06,
"loss": 9.9219,
"step": 82
},
{
"epoch": 0.003971713649133004,
"eval_accuracy": 0.07604369209371994,
"eval_loss": 9.875,
"eval_runtime": 274.2745,
"eval_samples_per_second": 123.114,
"eval_steps_per_second": 3.85,
"step": 82
},
{
"epoch": 0.004020149181439504,
"grad_norm": 1.8369065523147583,
"learning_rate": 9.999680325486778e-06,
"loss": 9.875,
"step": 83
},
{
"epoch": 0.004020149181439504,
"eval_accuracy": 0.07629766068956077,
"eval_loss": 9.875,
"eval_runtime": 273.8951,
"eval_samples_per_second": 123.284,
"eval_steps_per_second": 3.855,
"step": 83
},
{
"epoch": 0.004068584713746004,
"grad_norm": 1.5859246253967285,
"learning_rate": 9.999675481933548e-06,
"loss": 9.8672,
"step": 84
},
{
"epoch": 0.004068584713746004,
"eval_accuracy": 0.07654106294122268,
"eval_loss": 9.8671875,
"eval_runtime": 273.7145,
"eval_samples_per_second": 123.366,
"eval_steps_per_second": 3.858,
"step": 84
},
{
"epoch": 0.004117020246052504,
"grad_norm": 1.527214765548706,
"learning_rate": 9.999670638380316e-06,
"loss": 9.9062,
"step": 85
},
{
"epoch": 0.004117020246052504,
"eval_accuracy": 0.07687032759517157,
"eval_loss": 9.859375,
"eval_runtime": 273.3281,
"eval_samples_per_second": 123.54,
"eval_steps_per_second": 3.863,
"step": 85
},
{
"epoch": 0.004165455778359004,
"grad_norm": 1.5885719060897827,
"learning_rate": 9.999665794827086e-06,
"loss": 9.8828,
"step": 86
},
{
"epoch": 0.004165455778359004,
"eval_accuracy": 0.07730965592191048,
"eval_loss": 9.8515625,
"eval_runtime": 273.6316,
"eval_samples_per_second": 123.403,
"eval_steps_per_second": 3.859,
"step": 86
},
{
"epoch": 0.004213891310665505,
"grad_norm": 1.7169041633605957,
"learning_rate": 9.999660951273856e-06,
"loss": 9.8594,
"step": 87
},
{
"epoch": 0.004213891310665505,
"eval_accuracy": 0.07752955167638524,
"eval_loss": 9.8515625,
"eval_runtime": 273.4517,
"eval_samples_per_second": 123.484,
"eval_steps_per_second": 3.862,
"step": 87
},
{
"epoch": 0.004262326842972004,
"grad_norm": 1.5023819208145142,
"learning_rate": 9.999656107720624e-06,
"loss": 9.8906,
"step": 88
},
{
"epoch": 0.004262326842972004,
"eval_accuracy": 0.07768043328148298,
"eval_loss": 9.84375,
"eval_runtime": 275.7256,
"eval_samples_per_second": 122.466,
"eval_steps_per_second": 3.83,
"step": 88
},
{
"epoch": 0.0043107623752785046,
"grad_norm": 1.6757872104644775,
"learning_rate": 9.999651264167394e-06,
"loss": 9.8047,
"step": 89
},
{
"epoch": 0.0043107623752785046,
"eval_accuracy": 0.07773103593798929,
"eval_loss": 9.8359375,
"eval_runtime": 275.8678,
"eval_samples_per_second": 122.403,
"eval_steps_per_second": 3.828,
"step": 89
},
{
"epoch": 0.004359197907585004,
"grad_norm": 2.2149763107299805,
"learning_rate": 9.999646420614163e-06,
"loss": 9.8203,
"step": 90
},
{
"epoch": 0.004359197907585004,
"eval_accuracy": 0.07783635199312082,
"eval_loss": 9.8359375,
"eval_runtime": 275.5895,
"eval_samples_per_second": 122.526,
"eval_steps_per_second": 3.832,
"step": 90
},
{
"epoch": 0.0044076334398915045,
"grad_norm": 1.6437429189682007,
"learning_rate": 9.999641577060932e-06,
"loss": 9.8594,
"step": 91
},
{
"epoch": 0.0044076334398915045,
"eval_accuracy": 0.07813093587905225,
"eval_loss": 9.828125,
"eval_runtime": 274.7605,
"eval_samples_per_second": 122.896,
"eval_steps_per_second": 3.843,
"step": 91
},
{
"epoch": 0.004456068972198004,
"grad_norm": 1.6756585836410522,
"learning_rate": 9.999636733507701e-06,
"loss": 9.8438,
"step": 92
},
{
"epoch": 0.004456068972198004,
"eval_accuracy": 0.07858427546766132,
"eval_loss": 9.8203125,
"eval_runtime": 275.56,
"eval_samples_per_second": 122.54,
"eval_steps_per_second": 3.832,
"step": 92
},
{
"epoch": 0.0045045045045045045,
"grad_norm": 1.6290555000305176,
"learning_rate": 9.999631889954471e-06,
"loss": 9.8438,
"step": 93
},
{
"epoch": 0.0045045045045045045,
"eval_accuracy": 0.07898075943992122,
"eval_loss": 9.8203125,
"eval_runtime": 275.4203,
"eval_samples_per_second": 122.602,
"eval_steps_per_second": 3.834,
"step": 93
},
{
"epoch": 0.004552940036811005,
"grad_norm": 1.552886724472046,
"learning_rate": 9.999627046401241e-06,
"loss": 9.8438,
"step": 94
},
{
"epoch": 0.004552940036811005,
"eval_accuracy": 0.0792507946686917,
"eval_loss": 9.8125,
"eval_runtime": 276.2603,
"eval_samples_per_second": 122.229,
"eval_steps_per_second": 3.822,
"step": 94
},
{
"epoch": 0.004601375569117504,
"grad_norm": 1.6093745231628418,
"learning_rate": 9.999622202848011e-06,
"loss": 9.8359,
"step": 95
},
{
"epoch": 0.004601375569117504,
"eval_accuracy": 0.07942483538431863,
"eval_loss": 9.8046875,
"eval_runtime": 277.5976,
"eval_samples_per_second": 121.64,
"eval_steps_per_second": 3.804,
"step": 95
},
{
"epoch": 0.004649811101424005,
"grad_norm": 1.6716474294662476,
"learning_rate": 9.99961735929478e-06,
"loss": 9.8281,
"step": 96
},
{
"epoch": 0.004649811101424005,
"eval_accuracy": 0.07951587963758655,
"eval_loss": 9.8046875,
"eval_runtime": 277.2247,
"eval_samples_per_second": 121.804,
"eval_steps_per_second": 3.809,
"step": 96
},
{
"epoch": 0.004698246633730504,
"grad_norm": 1.5188281536102295,
"learning_rate": 9.999612515741549e-06,
"loss": 9.8516,
"step": 97
},
{
"epoch": 0.004698246633730504,
"eval_accuracy": 0.07964241522774047,
"eval_loss": 9.796875,
"eval_runtime": 277.219,
"eval_samples_per_second": 121.806,
"eval_steps_per_second": 3.809,
"step": 97
},
{
"epoch": 0.004746682166037005,
"grad_norm": 1.5686155557632446,
"learning_rate": 9.999607672188319e-06,
"loss": 9.8281,
"step": 98
},
{
"epoch": 0.004746682166037005,
"eval_accuracy": 0.07971328210595982,
"eval_loss": 9.7890625,
"eval_runtime": 276.6601,
"eval_samples_per_second": 122.052,
"eval_steps_per_second": 3.817,
"step": 98
},
{
"epoch": 0.004795117698343505,
"grad_norm": 1.6188207864761353,
"learning_rate": 9.999602828635087e-06,
"loss": 9.7734,
"step": 99
},
{
"epoch": 0.004795117698343505,
"eval_accuracy": 0.0798379649672714,
"eval_loss": 9.7890625,
"eval_runtime": 276.9636,
"eval_samples_per_second": 121.919,
"eval_steps_per_second": 3.813,
"step": 99
},
{
"epoch": 0.004843553230650005,
"grad_norm": 1.6795498132705688,
"learning_rate": 9.999597985081857e-06,
"loss": 9.8125,
"step": 100
},
{
"epoch": 0.004843553230650005,
"eval_accuracy": 0.08018245673639325,
"eval_loss": 9.78125,
"eval_runtime": 277.3903,
"eval_samples_per_second": 121.731,
"eval_steps_per_second": 3.807,
"step": 100
},
{
"epoch": 0.004891988762956505,
"grad_norm": 1.516228199005127,
"learning_rate": 9.999593141528627e-06,
"loss": 9.8203,
"step": 101
},
{
"epoch": 0.004891988762956505,
"eval_accuracy": 0.08056970601332963,
"eval_loss": 9.7734375,
"eval_runtime": 277.0892,
"eval_samples_per_second": 121.863,
"eval_steps_per_second": 3.811,
"step": 101
},
{
"epoch": 0.004940424295263005,
"grad_norm": 1.485206961631775,
"learning_rate": 9.999588297975395e-06,
"loss": 9.8281,
"step": 102
},
{
"epoch": 0.004940424295263005,
"eval_accuracy": 0.0809254588999463,
"eval_loss": 9.7734375,
"eval_runtime": 277.0597,
"eval_samples_per_second": 121.876,
"eval_steps_per_second": 3.811,
"step": 102
},
{
"epoch": 0.004988859827569505,
"grad_norm": 1.6925771236419678,
"learning_rate": 9.999583454422165e-06,
"loss": 9.7734,
"step": 103
},
{
"epoch": 0.004988859827569505,
"eval_accuracy": 0.081113974059654,
"eval_loss": 9.765625,
"eval_runtime": 276.5455,
"eval_samples_per_second": 122.103,
"eval_steps_per_second": 3.819,
"step": 103
},
{
"epoch": 0.005037295359876005,
"grad_norm": 1.6215219497680664,
"learning_rate": 9.999578610868935e-06,
"loss": 9.7891,
"step": 104
},
{
"epoch": 0.005037295359876005,
"eval_accuracy": 0.08127527726448987,
"eval_loss": 9.7578125,
"eval_runtime": 275.7199,
"eval_samples_per_second": 122.469,
"eval_steps_per_second": 3.83,
"step": 104
},
{
"epoch": 0.005085730892182505,
"grad_norm": 1.5104496479034424,
"learning_rate": 9.999573767315703e-06,
"loss": 9.8047,
"step": 105
},
{
"epoch": 0.005085730892182505,
"eval_accuracy": 0.08140079964355813,
"eval_loss": 9.7578125,
"eval_runtime": 275.8702,
"eval_samples_per_second": 122.402,
"eval_steps_per_second": 3.828,
"step": 105
},
{
"epoch": 0.0051341664244890055,
"grad_norm": 1.5603739023208618,
"learning_rate": 9.999568923762473e-06,
"loss": 9.7578,
"step": 106
},
{
"epoch": 0.0051341664244890055,
"eval_accuracy": 0.0814951151211883,
"eval_loss": 9.75,
"eval_runtime": 276.6948,
"eval_samples_per_second": 122.037,
"eval_steps_per_second": 3.816,
"step": 106
},
{
"epoch": 0.005182601956795505,
"grad_norm": 1.6554555892944336,
"learning_rate": 9.999564080209243e-06,
"loss": 9.7734,
"step": 107
},
{
"epoch": 0.005182601956795505,
"eval_accuracy": 0.08162946691114582,
"eval_loss": 9.75,
"eval_runtime": 277.5011,
"eval_samples_per_second": 121.682,
"eval_steps_per_second": 3.805,
"step": 107
},
{
"epoch": 0.0052310374891020054,
"grad_norm": 1.4874709844589233,
"learning_rate": 9.99955923665601e-06,
"loss": 9.7891,
"step": 108
},
{
"epoch": 0.0052310374891020054,
"eval_accuracy": 0.08175817366791184,
"eval_loss": 9.7421875,
"eval_runtime": 277.2974,
"eval_samples_per_second": 121.772,
"eval_steps_per_second": 3.808,
"step": 108
},
{
"epoch": 0.005279473021408505,
"grad_norm": 1.5930671691894531,
"learning_rate": 9.99955439310278e-06,
"loss": 9.75,
"step": 109
},
{
"epoch": 0.005279473021408505,
"eval_accuracy": 0.081921676988248,
"eval_loss": 9.734375,
"eval_runtime": 278.2891,
"eval_samples_per_second": 121.338,
"eval_steps_per_second": 3.795,
"step": 109
},
{
"epoch": 0.005327908553715005,
"grad_norm": 1.7005099058151245,
"learning_rate": 9.99954954954955e-06,
"loss": 9.75,
"step": 110
},
{
"epoch": 0.005327908553715005,
"eval_accuracy": 0.08213621719841287,
"eval_loss": 9.734375,
"eval_runtime": 276.9822,
"eval_samples_per_second": 121.91,
"eval_steps_per_second": 3.813,
"step": 110
},
{
"epoch": 0.005376344086021506,
"grad_norm": 1.5735907554626465,
"learning_rate": 9.99954470599632e-06,
"loss": 9.7266,
"step": 111
},
{
"epoch": 0.005376344086021506,
"eval_accuracy": 0.08227051109059406,
"eval_loss": 9.7265625,
"eval_runtime": 277.2678,
"eval_samples_per_second": 121.785,
"eval_steps_per_second": 3.809,
"step": 111
},
{
"epoch": 0.005424779618328005,
"grad_norm": 1.473027229309082,
"learning_rate": 9.99953986244309e-06,
"loss": 9.7656,
"step": 112
},
{
"epoch": 0.005424779618328005,
"eval_accuracy": 0.08236607137041518,
"eval_loss": 9.71875,
"eval_runtime": 276.699,
"eval_samples_per_second": 122.035,
"eval_steps_per_second": 3.816,
"step": 112
},
{
"epoch": 0.005473215150634506,
"grad_norm": 1.4636644124984741,
"learning_rate": 9.999535018889858e-06,
"loss": 9.7812,
"step": 113
},
{
"epoch": 0.005473215150634506,
"eval_accuracy": 0.0823751613212979,
"eval_loss": 9.71875,
"eval_runtime": 276.4525,
"eval_samples_per_second": 122.144,
"eval_steps_per_second": 3.82,
"step": 113
},
{
"epoch": 0.005521650682941005,
"grad_norm": 1.4979418516159058,
"learning_rate": 9.999530175336628e-06,
"loss": 9.7734,
"step": 114
},
{
"epoch": 0.005521650682941005,
"eval_accuracy": 0.08236062897944081,
"eval_loss": 9.7109375,
"eval_runtime": 277.8616,
"eval_samples_per_second": 121.525,
"eval_steps_per_second": 3.8,
"step": 114
},
{
"epoch": 0.005570086215247506,
"grad_norm": 1.8021794557571411,
"learning_rate": 9.999525331783398e-06,
"loss": 9.7266,
"step": 115
},
{
"epoch": 0.005570086215247506,
"eval_accuracy": 0.08244796777502407,
"eval_loss": 9.7109375,
"eval_runtime": 277.0309,
"eval_samples_per_second": 121.889,
"eval_steps_per_second": 3.812,
"step": 115
},
{
"epoch": 0.005618521747554005,
"grad_norm": 1.8129605054855347,
"learning_rate": 9.999520488230166e-06,
"loss": 9.7266,
"step": 116
},
{
"epoch": 0.005618521747554005,
"eval_accuracy": 0.08262273221285504,
"eval_loss": 9.703125,
"eval_runtime": 278.4245,
"eval_samples_per_second": 121.279,
"eval_steps_per_second": 3.793,
"step": 116
},
{
"epoch": 0.005666957279860506,
"grad_norm": 1.5428948402404785,
"learning_rate": 9.999515644676936e-06,
"loss": 9.7109,
"step": 117
},
{
"epoch": 0.005666957279860506,
"eval_accuracy": 0.08277873777115737,
"eval_loss": 9.6953125,
"eval_runtime": 276.3715,
"eval_samples_per_second": 122.18,
"eval_steps_per_second": 3.821,
"step": 117
},
{
"epoch": 0.005715392812167006,
"grad_norm": 1.7619973421096802,
"learning_rate": 9.999510801123706e-06,
"loss": 9.6719,
"step": 118
},
{
"epoch": 0.005715392812167006,
"eval_accuracy": 0.08289928094146184,
"eval_loss": 9.6953125,
"eval_runtime": 277.665,
"eval_samples_per_second": 121.611,
"eval_steps_per_second": 3.803,
"step": 118
},
{
"epoch": 0.005763828344473506,
"grad_norm": 1.5316611528396606,
"learning_rate": 9.999505957570474e-06,
"loss": 9.6953,
"step": 119
},
{
"epoch": 0.005763828344473506,
"eval_accuracy": 0.08300393117216567,
"eval_loss": 9.6875,
"eval_runtime": 276.1306,
"eval_samples_per_second": 122.286,
"eval_steps_per_second": 3.824,
"step": 119
},
{
"epoch": 0.005812263876780006,
"grad_norm": 1.7051466703414917,
"learning_rate": 9.999501114017244e-06,
"loss": 9.6719,
"step": 120
},
{
"epoch": 0.005812263876780006,
"eval_accuracy": 0.08307213475267416,
"eval_loss": 9.6875,
"eval_runtime": 276.7696,
"eval_samples_per_second": 122.004,
"eval_steps_per_second": 3.815,
"step": 120
},
{
"epoch": 0.005860699409086506,
"grad_norm": 1.6584818363189697,
"learning_rate": 9.999496270464012e-06,
"loss": 9.6953,
"step": 121
},
{
"epoch": 0.005860699409086506,
"eval_accuracy": 0.08307086100159505,
"eval_loss": 9.6796875,
"eval_runtime": 277.4403,
"eval_samples_per_second": 121.709,
"eval_steps_per_second": 3.806,
"step": 121
},
{
"epoch": 0.005909134941393006,
"grad_norm": 1.7079665660858154,
"learning_rate": 9.999491426910782e-06,
"loss": 9.6875,
"step": 122
},
{
"epoch": 0.005909134941393006,
"eval_accuracy": 0.08310041781640795,
"eval_loss": 9.6796875,
"eval_runtime": 275.7807,
"eval_samples_per_second": 122.442,
"eval_steps_per_second": 3.829,
"step": 122
},
{
"epoch": 0.0059575704736995055,
"grad_norm": 1.6613987684249878,
"learning_rate": 9.999486583357552e-06,
"loss": 9.6719,
"step": 123
},
{
"epoch": 0.0059575704736995055,
"eval_accuracy": 0.08318772766310303,
"eval_loss": 9.671875,
"eval_runtime": 277.1478,
"eval_samples_per_second": 121.838,
"eval_steps_per_second": 3.81,
"step": 123
},
{
"epoch": 0.006006006006006006,
"grad_norm": 1.5512877702713013,
"learning_rate": 9.99948173980432e-06,
"loss": 9.6719,
"step": 124
},
{
"epoch": 0.006006006006006006,
"eval_accuracy": 0.08327202682542932,
"eval_loss": 9.6640625,
"eval_runtime": 276.8801,
"eval_samples_per_second": 121.955,
"eval_steps_per_second": 3.814,
"step": 124
},
{
"epoch": 0.006054441538312506,
"grad_norm": 1.6818300485610962,
"learning_rate": 9.99947689625109e-06,
"loss": 9.625,
"step": 125
},
{
"epoch": 0.006054441538312506,
"eval_accuracy": 0.08333837767709547,
"eval_loss": 9.6640625,
"eval_runtime": 275.4944,
"eval_samples_per_second": 122.569,
"eval_steps_per_second": 3.833,
"step": 125
},
{
"epoch": 0.006102877070619006,
"grad_norm": 1.497159719467163,
"learning_rate": 9.99947205269786e-06,
"loss": 9.6719,
"step": 126
},
{
"epoch": 0.006102877070619006,
"eval_accuracy": 0.08344485168775347,
"eval_loss": 9.65625,
"eval_runtime": 276.6512,
"eval_samples_per_second": 122.056,
"eval_steps_per_second": 3.817,
"step": 126
},
{
"epoch": 0.006151312602925506,
"grad_norm": 1.4452403783798218,
"learning_rate": 9.99946720914463e-06,
"loss": 9.6953,
"step": 127
},
{
"epoch": 0.006151312602925506,
"eval_accuracy": 0.08355928664265588,
"eval_loss": 9.65625,
"eval_runtime": 277.0928,
"eval_samples_per_second": 121.862,
"eval_steps_per_second": 3.811,
"step": 127
},
{
"epoch": 0.006199748135232006,
"grad_norm": 1.4734400510787964,
"learning_rate": 9.9994623655914e-06,
"loss": 9.6719,
"step": 128
},
{
"epoch": 0.006199748135232006,
"eval_accuracy": 0.08367464796197946,
"eval_loss": 9.6484375,
"eval_runtime": 278.2759,
"eval_samples_per_second": 121.344,
"eval_steps_per_second": 3.795,
"step": 128
},
{
"epoch": 0.006248183667538506,
"grad_norm": 1.4783730506896973,
"learning_rate": 9.999457522038168e-06,
"loss": 9.6797,
"step": 129
},
{
"epoch": 0.006248183667538506,
"eval_accuracy": 0.08380396264539687,
"eval_loss": 9.640625,
"eval_runtime": 276.6233,
"eval_samples_per_second": 122.069,
"eval_steps_per_second": 3.817,
"step": 129
},
{
"epoch": 0.006296619199845007,
"grad_norm": 1.7012325525283813,
"learning_rate": 9.999452678484938e-06,
"loss": 9.6484,
"step": 130
},
{
"epoch": 0.006296619199845007,
"eval_accuracy": 0.08385152566864622,
"eval_loss": 9.640625,
"eval_runtime": 276.6061,
"eval_samples_per_second": 122.076,
"eval_steps_per_second": 3.818,
"step": 130
},
{
"epoch": 0.006345054732151506,
"grad_norm": 1.5358777046203613,
"learning_rate": 9.999447834931707e-06,
"loss": 9.6719,
"step": 131
},
{
"epoch": 0.006345054732151506,
"eval_accuracy": 0.0839065864539294,
"eval_loss": 9.6328125,
"eval_runtime": 276.1031,
"eval_samples_per_second": 122.299,
"eval_steps_per_second": 3.825,
"step": 131
},
{
"epoch": 0.006393490264458007,
"grad_norm": 1.5622602701187134,
"learning_rate": 9.999442991378476e-06,
"loss": 9.6328,
"step": 132
},
{
"epoch": 0.006393490264458007,
"eval_accuracy": 0.08391194199823927,
"eval_loss": 9.6328125,
"eval_runtime": 276.4355,
"eval_samples_per_second": 122.151,
"eval_steps_per_second": 3.82,
"step": 132
},
{
"epoch": 0.006441925796764506,
"grad_norm": 1.5135513544082642,
"learning_rate": 9.999438147825245e-06,
"loss": 9.6719,
"step": 133
},
{
"epoch": 0.006441925796764506,
"eval_accuracy": 0.08392389788904997,
"eval_loss": 9.625,
"eval_runtime": 277.1805,
"eval_samples_per_second": 121.823,
"eval_steps_per_second": 3.81,
"step": 133
},
{
"epoch": 0.006490361329071007,
"grad_norm": 1.4829246997833252,
"learning_rate": 9.999433304272015e-06,
"loss": 9.6484,
"step": 134
},
{
"epoch": 0.006490361329071007,
"eval_accuracy": 0.08400426000258629,
"eval_loss": 9.6171875,
"eval_runtime": 276.9095,
"eval_samples_per_second": 121.942,
"eval_steps_per_second": 3.814,
"step": 134
},
{
"epoch": 0.006538796861377506,
"grad_norm": 1.506585955619812,
"learning_rate": 9.999428460718784e-06,
"loss": 9.6406,
"step": 135
},
{
"epoch": 0.006538796861377506,
"eval_accuracy": 0.084119823964127,
"eval_loss": 9.6171875,
"eval_runtime": 275.7802,
"eval_samples_per_second": 122.442,
"eval_steps_per_second": 3.829,
"step": 135
},
{
"epoch": 0.0065872323936840066,
"grad_norm": 1.597743272781372,
"learning_rate": 9.999423617165553e-06,
"loss": 9.6094,
"step": 136
},
{
"epoch": 0.0065872323936840066,
"eval_accuracy": 0.08430503895058428,
"eval_loss": 9.609375,
"eval_runtime": 278.6385,
"eval_samples_per_second": 121.186,
"eval_steps_per_second": 3.79,
"step": 136
},
{
"epoch": 0.006635667925990507,
"grad_norm": 1.5326935052871704,
"learning_rate": 9.999418773612323e-06,
"loss": 9.625,
"step": 137
},
{
"epoch": 0.006635667925990507,
"eval_accuracy": 0.08447033710198644,
"eval_loss": 9.609375,
"eval_runtime": 276.8784,
"eval_samples_per_second": 121.956,
"eval_steps_per_second": 3.814,
"step": 137
},
{
"epoch": 0.0066841034582970065,
"grad_norm": 1.5170117616653442,
"learning_rate": 9.999413930059091e-06,
"loss": 9.6562,
"step": 138
},
{
"epoch": 0.0066841034582970065,
"eval_accuracy": 0.0845831509191518,
"eval_loss": 9.6015625,
"eval_runtime": 276.8242,
"eval_samples_per_second": 121.98,
"eval_steps_per_second": 3.815,
"step": 138
},
{
"epoch": 0.006732538990603507,
"grad_norm": 1.5148200988769531,
"learning_rate": 9.999409086505861e-06,
"loss": 9.6172,
"step": 139
},
{
"epoch": 0.006732538990603507,
"eval_accuracy": 0.0846733846035512,
"eval_loss": 9.6015625,
"eval_runtime": 276.0924,
"eval_samples_per_second": 122.303,
"eval_steps_per_second": 3.825,
"step": 139
},
{
"epoch": 0.0067809745229100065,
"grad_norm": 1.584030032157898,
"learning_rate": 9.999404242952631e-06,
"loss": 9.6094,
"step": 140
},
{
"epoch": 0.0067809745229100065,
"eval_accuracy": 0.08471049707817424,
"eval_loss": 9.59375,
"eval_runtime": 276.8263,
"eval_samples_per_second": 121.979,
"eval_steps_per_second": 3.815,
"step": 140
},
{
"epoch": 0.006829410055216507,
"grad_norm": 1.5023019313812256,
"learning_rate": 9.9993993993994e-06,
"loss": 9.6562,
"step": 141
},
{
"epoch": 0.006829410055216507,
"eval_accuracy": 0.08469269351195492,
"eval_loss": 9.5859375,
"eval_runtime": 276.3818,
"eval_samples_per_second": 122.175,
"eval_steps_per_second": 3.821,
"step": 141
},
{
"epoch": 0.006877845587523007,
"grad_norm": 1.5090259313583374,
"learning_rate": 9.99939455584617e-06,
"loss": 9.6562,
"step": 142
},
{
"epoch": 0.006877845587523007,
"eval_accuracy": 0.08472097657568871,
"eval_loss": 9.5859375,
"eval_runtime": 277.9886,
"eval_samples_per_second": 121.469,
"eval_steps_per_second": 3.799,
"step": 142
},
{
"epoch": 0.006926281119829507,
"grad_norm": 1.4967498779296875,
"learning_rate": 9.999389712292939e-06,
"loss": 9.6562,
"step": 143
},
{
"epoch": 0.006926281119829507,
"eval_accuracy": 0.0847566126570155,
"eval_loss": 9.578125,
"eval_runtime": 276.5132,
"eval_samples_per_second": 122.117,
"eval_steps_per_second": 3.819,
"step": 143
},
{
"epoch": 0.006974716652136007,
"grad_norm": 1.8095794916152954,
"learning_rate": 9.999384868739709e-06,
"loss": 9.6016,
"step": 144
},
{
"epoch": 0.006974716652136007,
"eval_accuracy": 0.08490364405998777,
"eval_loss": 9.578125,
"eval_runtime": 276.1595,
"eval_samples_per_second": 122.274,
"eval_steps_per_second": 3.824,
"step": 144
},
{
"epoch": 0.007023152184442507,
"grad_norm": 1.7810986042022705,
"learning_rate": 9.999380025186479e-06,
"loss": 9.6094,
"step": 145
},
{
"epoch": 0.007023152184442507,
"eval_accuracy": 0.08503437723892511,
"eval_loss": 9.5703125,
"eval_runtime": 276.2917,
"eval_samples_per_second": 122.215,
"eval_steps_per_second": 3.822,
"step": 145
},
{
"epoch": 0.007071587716749007,
"grad_norm": 1.5788795948028564,
"learning_rate": 9.999375181633247e-06,
"loss": 9.5938,
"step": 146
},
{
"epoch": 0.007071587716749007,
"eval_accuracy": 0.08506937644471235,
"eval_loss": 9.5703125,
"eval_runtime": 276.3857,
"eval_samples_per_second": 122.173,
"eval_steps_per_second": 3.821,
"step": 146
},
{
"epoch": 0.007120023249055507,
"grad_norm": 1.8074451684951782,
"learning_rate": 9.999370338080017e-06,
"loss": 9.5703,
"step": 147
},
{
"epoch": 0.007120023249055507,
"eval_accuracy": 0.08507481883568672,
"eval_loss": 9.5625,
"eval_runtime": 274.2982,
"eval_samples_per_second": 123.103,
"eval_steps_per_second": 3.85,
"step": 147
},
{
"epoch": 0.007168458781362007,
"grad_norm": 1.7187494039535522,
"learning_rate": 9.999365494526787e-06,
"loss": 9.5859,
"step": 148
},
{
"epoch": 0.007168458781362007,
"eval_accuracy": 0.08513216658313465,
"eval_loss": 9.5625,
"eval_runtime": 275.5423,
"eval_samples_per_second": 122.547,
"eval_steps_per_second": 3.832,
"step": 148
},
{
"epoch": 0.007216894313668508,
"grad_norm": 1.6044690608978271,
"learning_rate": 9.999360650973555e-06,
"loss": 9.625,
"step": 149
},
{
"epoch": 0.007216894313668508,
"eval_accuracy": 0.08522213972754059,
"eval_loss": 9.5546875,
"eval_runtime": 275.5733,
"eval_samples_per_second": 122.534,
"eval_steps_per_second": 3.832,
"step": 149
},
{
"epoch": 0.007265329845975007,
"grad_norm": 1.7572296857833862,
"learning_rate": 9.999355807420325e-06,
"loss": 9.5859,
"step": 150
},
{
"epoch": 0.007265329845975007,
"eval_accuracy": 0.0853520623376094,
"eval_loss": 9.546875,
"eval_runtime": 275.4002,
"eval_samples_per_second": 122.611,
"eval_steps_per_second": 3.834,
"step": 150
},
{
"epoch": 0.0073137653782815075,
"grad_norm": 1.5954887866973877,
"learning_rate": 9.999350963867095e-06,
"loss": 9.5625,
"step": 151
},
{
"epoch": 0.0073137653782815075,
"eval_accuracy": 0.0855014675494109,
"eval_loss": 9.546875,
"eval_runtime": 275.8854,
"eval_samples_per_second": 122.395,
"eval_steps_per_second": 3.828,
"step": 151
},
{
"epoch": 0.007362200910588007,
"grad_norm": 1.6131614446640015,
"learning_rate": 9.999346120313863e-06,
"loss": 9.5547,
"step": 152
},
{
"epoch": 0.007362200910588007,
"eval_accuracy": 0.0856389168704017,
"eval_loss": 9.5390625,
"eval_runtime": 276.1363,
"eval_samples_per_second": 122.284,
"eval_steps_per_second": 3.824,
"step": 152
},
{
"epoch": 0.0074106364428945075,
"grad_norm": 1.4832433462142944,
"learning_rate": 9.999341276760633e-06,
"loss": 9.5703,
"step": 153
},
{
"epoch": 0.0074106364428945075,
"eval_accuracy": 0.08576559720499642,
"eval_loss": 9.5390625,
"eval_runtime": 274.8867,
"eval_samples_per_second": 122.84,
"eval_steps_per_second": 3.842,
"step": 153
},
{
"epoch": 0.007459071975201007,
"grad_norm": 1.7311336994171143,
"learning_rate": 9.999336433207402e-06,
"loss": 9.5391,
"step": 154
},
{
"epoch": 0.007459071975201007,
"eval_accuracy": 0.08582592668792499,
"eval_loss": 9.53125,
"eval_runtime": 275.7658,
"eval_samples_per_second": 122.448,
"eval_steps_per_second": 3.829,
"step": 154
},
{
"epoch": 0.0075075075075075074,
"grad_norm": 1.9239146709442139,
"learning_rate": 9.99933158965417e-06,
"loss": 9.5391,
"step": 155
},
{
"epoch": 0.0075075075075075074,
"eval_accuracy": 0.08592603194318746,
"eval_loss": 9.53125,
"eval_runtime": 274.2731,
"eval_samples_per_second": 123.115,
"eval_steps_per_second": 3.85,
"step": 155
},
{
"epoch": 0.007555943039814008,
"grad_norm": 1.8369977474212646,
"learning_rate": 9.99932674610094e-06,
"loss": 9.5,
"step": 156
},
{
"epoch": 0.007555943039814008,
"eval_accuracy": 0.08610542820312428,
"eval_loss": 9.5234375,
"eval_runtime": 274.5904,
"eval_samples_per_second": 122.972,
"eval_steps_per_second": 3.846,
"step": 156
},
{
"epoch": 0.007604378572120507,
"grad_norm": 1.5703845024108887,
"learning_rate": 9.99932190254771e-06,
"loss": 9.5547,
"step": 157
},
{
"epoch": 0.007604378572120507,
"eval_accuracy": 0.08628861676741025,
"eval_loss": 9.515625,
"eval_runtime": 275.108,
"eval_samples_per_second": 122.741,
"eval_steps_per_second": 3.838,
"step": 157
},
{
"epoch": 0.007652814104427008,
"grad_norm": 1.5686722993850708,
"learning_rate": 9.999317058994478e-06,
"loss": 9.5391,
"step": 158
},
{
"epoch": 0.007652814104427008,
"eval_accuracy": 0.08633840885504802,
"eval_loss": 9.515625,
"eval_runtime": 274.8868,
"eval_samples_per_second": 122.84,
"eval_steps_per_second": 3.842,
"step": 158
},
{
"epoch": 0.007701249636733507,
"grad_norm": 1.6259181499481201,
"learning_rate": 9.999312215441248e-06,
"loss": 9.5312,
"step": 159
},
{
"epoch": 0.007701249636733507,
"eval_accuracy": 0.08635719668346484,
"eval_loss": 9.515625,
"eval_runtime": 274.9284,
"eval_samples_per_second": 122.821,
"eval_steps_per_second": 3.841,
"step": 159
},
{
"epoch": 0.007749685169040008,
"grad_norm": 1.6887496709823608,
"learning_rate": 9.999307371888018e-06,
"loss": 9.5391,
"step": 160
},
{
"epoch": 0.007749685169040008,
"eval_accuracy": 0.08644670664566019,
"eval_loss": 9.5078125,
"eval_runtime": 274.7492,
"eval_samples_per_second": 122.901,
"eval_steps_per_second": 3.844,
"step": 160
},
{
"epoch": 0.007798120701346508,
"grad_norm": 1.6951507329940796,
"learning_rate": 9.999302528334788e-06,
"loss": 9.4688,
"step": 161
},
{
"epoch": 0.007798120701346508,
"eval_accuracy": 0.08658233218669682,
"eval_loss": 9.5,
"eval_runtime": 275.2855,
"eval_samples_per_second": 122.662,
"eval_steps_per_second": 3.836,
"step": 161
},
{
"epoch": 0.007846556233653008,
"grad_norm": 1.4970242977142334,
"learning_rate": 9.999297684781556e-06,
"loss": 9.5547,
"step": 162
},
{
"epoch": 0.007846556233653008,
"eval_accuracy": 0.0867301741585376,
"eval_loss": 9.5,
"eval_runtime": 272.7321,
"eval_samples_per_second": 123.81,
"eval_steps_per_second": 3.872,
"step": 162
},
{
"epoch": 0.007894991765959508,
"grad_norm": 1.5665501356124878,
"learning_rate": 9.999292841228326e-06,
"loss": 9.5078,
"step": 163
},
{
"epoch": 0.007894991765959508,
"eval_accuracy": 0.08686166000856714,
"eval_loss": 9.4921875,
"eval_runtime": 274.5735,
"eval_samples_per_second": 122.98,
"eval_steps_per_second": 3.846,
"step": 163
},
{
"epoch": 0.007943427298266009,
"grad_norm": 1.5631929636001587,
"learning_rate": 9.999287997675096e-06,
"loss": 9.5078,
"step": 164
},
{
"epoch": 0.007943427298266009,
"eval_accuracy": 0.08703300847759506,
"eval_loss": 9.4921875,
"eval_runtime": 275.4664,
"eval_samples_per_second": 122.581,
"eval_steps_per_second": 3.833,
"step": 164
},
{
"epoch": 0.007991862830572507,
"grad_norm": 1.5439754724502563,
"learning_rate": 9.999283154121864e-06,
"loss": 9.5,
"step": 165
},
{
"epoch": 0.007991862830572507,
"eval_accuracy": 0.08722841347268517,
"eval_loss": 9.484375,
"eval_runtime": 275.1068,
"eval_samples_per_second": 122.741,
"eval_steps_per_second": 3.839,
"step": 165
},
{
"epoch": 0.008040298362879008,
"grad_norm": 1.5011335611343384,
"learning_rate": 9.999278310568634e-06,
"loss": 9.5312,
"step": 166
},
{
"epoch": 0.008040298362879008,
"eval_accuracy": 0.08746153886904974,
"eval_loss": 9.484375,
"eval_runtime": 274.8226,
"eval_samples_per_second": 122.868,
"eval_steps_per_second": 3.842,
"step": 166
},
{
"epoch": 0.008088733895185508,
"grad_norm": 1.5114489793777466,
"learning_rate": 9.999273467015404e-06,
"loss": 9.5156,
"step": 167
},
{
"epoch": 0.008088733895185508,
"eval_accuracy": 0.08767709229030025,
"eval_loss": 9.4765625,
"eval_runtime": 275.1958,
"eval_samples_per_second": 122.702,
"eval_steps_per_second": 3.837,
"step": 167
},
{
"epoch": 0.008137169427492008,
"grad_norm": 1.6843675374984741,
"learning_rate": 9.999268623462172e-06,
"loss": 9.4844,
"step": 168
},
{
"epoch": 0.008137169427492008,
"eval_accuracy": 0.08782279204441709,
"eval_loss": 9.4765625,
"eval_runtime": 275.685,
"eval_samples_per_second": 122.484,
"eval_steps_per_second": 3.83,
"step": 168
},
{
"epoch": 0.008185604959798509,
"grad_norm": 1.6421033143997192,
"learning_rate": 9.999263779908942e-06,
"loss": 9.4688,
"step": 169
},
{
"epoch": 0.008185604959798509,
"eval_accuracy": 0.08784629854160422,
"eval_loss": 9.46875,
"eval_runtime": 276.249,
"eval_samples_per_second": 122.234,
"eval_steps_per_second": 3.823,
"step": 169
},
{
"epoch": 0.008234040492105008,
"grad_norm": 1.6387994289398193,
"learning_rate": 9.999258936355712e-06,
"loss": 9.5156,
"step": 170
},
{
"epoch": 0.008234040492105008,
"eval_accuracy": 0.08786224937898121,
"eval_loss": 9.4609375,
"eval_runtime": 276.6173,
"eval_samples_per_second": 122.071,
"eval_steps_per_second": 3.818,
"step": 170
},
{
"epoch": 0.008282476024411508,
"grad_norm": 1.5107547044754028,
"learning_rate": 9.99925409280248e-06,
"loss": 9.4922,
"step": 171
},
{
"epoch": 0.008282476024411508,
"eval_accuracy": 0.08785258045033527,
"eval_loss": 9.4609375,
"eval_runtime": 276.9505,
"eval_samples_per_second": 121.924,
"eval_steps_per_second": 3.813,
"step": 171
},
{
"epoch": 0.008330911556718008,
"grad_norm": 1.5190666913986206,
"learning_rate": 9.99924924924925e-06,
"loss": 9.4844,
"step": 172
},
{
"epoch": 0.008330911556718008,
"eval_accuracy": 0.08782476056881207,
"eval_loss": 9.453125,
"eval_runtime": 275.3735,
"eval_samples_per_second": 122.623,
"eval_steps_per_second": 3.835,
"step": 172
},
{
"epoch": 0.008379347089024509,
"grad_norm": 1.560573935508728,
"learning_rate": 9.99924440569602e-06,
"loss": 9.5234,
"step": 173
},
{
"epoch": 0.008379347089024509,
"eval_accuracy": 0.08786801020772535,
"eval_loss": 9.453125,
"eval_runtime": 276.2814,
"eval_samples_per_second": 122.22,
"eval_steps_per_second": 3.822,
"step": 173
},
{
"epoch": 0.00842778262133101,
"grad_norm": 1.7032357454299927,
"learning_rate": 9.999239562142788e-06,
"loss": 9.4844,
"step": 174
},
{
"epoch": 0.00842778262133101,
"eval_accuracy": 0.08790746754228948,
"eval_loss": 9.4453125,
"eval_runtime": 275.8479,
"eval_samples_per_second": 122.412,
"eval_steps_per_second": 3.828,
"step": 174
},
{
"epoch": 0.008476218153637508,
"grad_norm": 1.550713300704956,
"learning_rate": 9.999234718589558e-06,
"loss": 9.4219,
"step": 175
},
{
"epoch": 0.008476218153637508,
"eval_accuracy": 0.08798829283803639,
"eval_loss": 9.4453125,
"eval_runtime": 275.7584,
"eval_samples_per_second": 122.451,
"eval_steps_per_second": 3.829,
"step": 175
},
{
"epoch": 0.008524653685944008,
"grad_norm": 1.6866670846939087,
"learning_rate": 9.999229875036328e-06,
"loss": 9.4062,
"step": 176
},
{
"epoch": 0.008524653685944008,
"eval_accuracy": 0.08809621429310245,
"eval_loss": 9.4375,
"eval_runtime": 276.5351,
"eval_samples_per_second": 122.107,
"eval_steps_per_second": 3.819,
"step": 176
},
{
"epoch": 0.008573089218250509,
"grad_norm": 1.622749924659729,
"learning_rate": 9.999225031483096e-06,
"loss": 9.4375,
"step": 177
},
{
"epoch": 0.008573089218250509,
"eval_accuracy": 0.08827711589522366,
"eval_loss": 9.4375,
"eval_runtime": 278.1964,
"eval_samples_per_second": 121.378,
"eval_steps_per_second": 3.796,
"step": 177
},
{
"epoch": 0.008621524750557009,
"grad_norm": 1.5966665744781494,
"learning_rate": 9.999220187929867e-06,
"loss": 9.4375,
"step": 178
},
{
"epoch": 0.008621524750557009,
"eval_accuracy": 0.08849081658763186,
"eval_loss": 9.4296875,
"eval_runtime": 277.208,
"eval_samples_per_second": 121.811,
"eval_steps_per_second": 3.809,
"step": 178
},
{
"epoch": 0.00866996028286351,
"grad_norm": 1.499353289604187,
"learning_rate": 9.999215344376635e-06,
"loss": 9.4688,
"step": 179
},
{
"epoch": 0.00866996028286351,
"eval_accuracy": 0.08868196609616225,
"eval_loss": 9.4296875,
"eval_runtime": 276.9344,
"eval_samples_per_second": 121.931,
"eval_steps_per_second": 3.813,
"step": 179
},
{
"epoch": 0.008718395815170008,
"grad_norm": 1.5957542657852173,
"learning_rate": 9.999210500823405e-06,
"loss": 9.4453,
"step": 180
},
{
"epoch": 0.008718395815170008,
"eval_accuracy": 0.08884607734314978,
"eval_loss": 9.421875,
"eval_runtime": 277.5515,
"eval_samples_per_second": 121.66,
"eval_steps_per_second": 3.805,
"step": 180
},
{
"epoch": 0.008766831347476509,
"grad_norm": 1.519926905632019,
"learning_rate": 9.999205657270175e-06,
"loss": 9.4219,
"step": 181
},
{
"epoch": 0.008766831347476509,
"eval_accuracy": 0.0889903586017467,
"eval_loss": 9.421875,
"eval_runtime": 277.0317,
"eval_samples_per_second": 121.889,
"eval_steps_per_second": 3.812,
"step": 181
},
{
"epoch": 0.008815266879783009,
"grad_norm": 1.5913316011428833,
"learning_rate": 9.999200813716943e-06,
"loss": 9.4141,
"step": 182
},
{
"epoch": 0.008815266879783009,
"eval_accuracy": 0.08903491094062725,
"eval_loss": 9.4140625,
"eval_runtime": 277.095,
"eval_samples_per_second": 121.861,
"eval_steps_per_second": 3.811,
"step": 182
},
{
"epoch": 0.00886370241208951,
"grad_norm": 1.5328583717346191,
"learning_rate": 9.999195970163713e-06,
"loss": 9.4375,
"step": 183
},
{
"epoch": 0.00886370241208951,
"eval_accuracy": 0.08903395562731792,
"eval_loss": 9.40625,
"eval_runtime": 276.2305,
"eval_samples_per_second": 122.242,
"eval_steps_per_second": 3.823,
"step": 183
},
{
"epoch": 0.008912137944396008,
"grad_norm": 1.5967031717300415,
"learning_rate": 9.999191126610483e-06,
"loss": 9.3984,
"step": 184
},
{
"epoch": 0.008912137944396008,
"eval_accuracy": 0.08904220606044394,
"eval_loss": 9.40625,
"eval_runtime": 278.9395,
"eval_samples_per_second": 121.055,
"eval_steps_per_second": 3.786,
"step": 184
},
{
"epoch": 0.008960573476702509,
"grad_norm": 1.596799612045288,
"learning_rate": 9.999186283057251e-06,
"loss": 9.4297,
"step": 185
},
{
"epoch": 0.008960573476702509,
"eval_accuracy": 0.08908192393500153,
"eval_loss": 9.3984375,
"eval_runtime": 277.1703,
"eval_samples_per_second": 121.828,
"eval_steps_per_second": 3.81,
"step": 185
},
{
"epoch": 0.009009009009009009,
"grad_norm": 1.5406758785247803,
"learning_rate": 9.999181439504021e-06,
"loss": 9.3984,
"step": 186
},
{
"epoch": 0.009009009009009009,
"eval_accuracy": 0.089139011142456,
"eval_loss": 9.3984375,
"eval_runtime": 276.7972,
"eval_samples_per_second": 121.992,
"eval_steps_per_second": 3.815,
"step": 186
},
{
"epoch": 0.00905744454131551,
"grad_norm": 1.6137006282806396,
"learning_rate": 9.999176595950791e-06,
"loss": 9.3906,
"step": 187
},
{
"epoch": 0.00905744454131551,
"eval_accuracy": 0.08919705366321981,
"eval_loss": 9.390625,
"eval_runtime": 277.048,
"eval_samples_per_second": 121.881,
"eval_steps_per_second": 3.812,
"step": 187
},
{
"epoch": 0.00910588007362201,
"grad_norm": 1.5155887603759766,
"learning_rate": 9.999171752397559e-06,
"loss": 9.4219,
"step": 188
},
{
"epoch": 0.00910588007362201,
"eval_accuracy": 0.08929362715412657,
"eval_loss": 9.390625,
"eval_runtime": 277.0594,
"eval_samples_per_second": 121.876,
"eval_steps_per_second": 3.811,
"step": 188
},
{
"epoch": 0.009154315605928508,
"grad_norm": 1.7281869649887085,
"learning_rate": 9.999166908844329e-06,
"loss": 9.4062,
"step": 189
},
{
"epoch": 0.009154315605928508,
"eval_accuracy": 0.08947652622953092,
"eval_loss": 9.3828125,
"eval_runtime": 277.1329,
"eval_samples_per_second": 121.844,
"eval_steps_per_second": 3.81,
"step": 189
},
{
"epoch": 0.009202751138235009,
"grad_norm": 1.5536915063858032,
"learning_rate": 9.999162065291099e-06,
"loss": 9.375,
"step": 190
},
{
"epoch": 0.009202751138235009,
"eval_accuracy": 0.08965256441844101,
"eval_loss": 9.3828125,
"eval_runtime": 277.1552,
"eval_samples_per_second": 121.834,
"eval_steps_per_second": 3.81,
"step": 190
},
{
"epoch": 0.00925118667054151,
"grad_norm": 1.6295173168182373,
"learning_rate": 9.999157221737867e-06,
"loss": 9.3828,
"step": 191
},
{
"epoch": 0.00925118667054151,
"eval_accuracy": 0.08979913263920268,
"eval_loss": 9.375,
"eval_runtime": 276.596,
"eval_samples_per_second": 122.081,
"eval_steps_per_second": 3.818,
"step": 191
},
{
"epoch": 0.00929962220284801,
"grad_norm": 1.5873547792434692,
"learning_rate": 9.999152378184637e-06,
"loss": 9.3906,
"step": 192
},
{
"epoch": 0.00929962220284801,
"eval_accuracy": 0.08980952529005266,
"eval_loss": 9.375,
"eval_runtime": 275.7787,
"eval_samples_per_second": 122.442,
"eval_steps_per_second": 3.829,
"step": 192
},
{
"epoch": 0.00934805773515451,
"grad_norm": 1.4720993041992188,
"learning_rate": 9.999147534631407e-06,
"loss": 9.3906,
"step": 193
},
{
"epoch": 0.00934805773515451,
"eval_accuracy": 0.08985268808230146,
"eval_loss": 9.3671875,
"eval_runtime": 276.9238,
"eval_samples_per_second": 121.936,
"eval_steps_per_second": 3.813,
"step": 193
},
{
"epoch": 0.009396493267461009,
"grad_norm": 1.603896975517273,
"learning_rate": 9.999142691078175e-06,
"loss": 9.4141,
"step": 194
},
{
"epoch": 0.009396493267461009,
"eval_accuracy": 0.08978121327743072,
"eval_loss": 9.3671875,
"eval_runtime": 275.8739,
"eval_samples_per_second": 122.4,
"eval_steps_per_second": 3.828,
"step": 194
},
{
"epoch": 0.00944492879976751,
"grad_norm": 1.6265010833740234,
"learning_rate": 9.999137847524946e-06,
"loss": 9.3203,
"step": 195
},
{
"epoch": 0.00944492879976751,
"eval_accuracy": 0.08979632459705102,
"eval_loss": 9.359375,
"eval_runtime": 276.051,
"eval_samples_per_second": 122.322,
"eval_steps_per_second": 3.825,
"step": 195
},
{
"epoch": 0.00949336433207401,
"grad_norm": 1.609118103981018,
"learning_rate": 9.999133003971715e-06,
"loss": 9.3906,
"step": 196
},
{
"epoch": 0.00949336433207401,
"eval_accuracy": 0.08980888841451311,
"eval_loss": 9.359375,
"eval_runtime": 274.9932,
"eval_samples_per_second": 122.792,
"eval_steps_per_second": 3.84,
"step": 196
},
{
"epoch": 0.00954179986438051,
"grad_norm": 1.6511759757995605,
"learning_rate": 9.999128160418484e-06,
"loss": 9.3594,
"step": 197
},
{
"epoch": 0.00954179986438051,
"eval_accuracy": 0.08997595244809313,
"eval_loss": 9.3515625,
"eval_runtime": 275.7963,
"eval_samples_per_second": 122.435,
"eval_steps_per_second": 3.829,
"step": 197
},
{
"epoch": 0.00959023539668701,
"grad_norm": 1.5398412942886353,
"learning_rate": 9.999123316865254e-06,
"loss": 9.3516,
"step": 198
},
{
"epoch": 0.00959023539668701,
"eval_accuracy": 0.09011600716901846,
"eval_loss": 9.3515625,
"eval_runtime": 275.8953,
"eval_samples_per_second": 122.391,
"eval_steps_per_second": 3.828,
"step": 198
},
{
"epoch": 0.009638670928993509,
"grad_norm": 1.5655171871185303,
"learning_rate": 9.999118473312022e-06,
"loss": 9.3438,
"step": 199
},
{
"epoch": 0.009638670928993509,
"eval_accuracy": 0.0902209468886039,
"eval_loss": 9.34375,
"eval_runtime": 276.4152,
"eval_samples_per_second": 122.16,
"eval_steps_per_second": 3.82,
"step": 199
},
{
"epoch": 0.00968710646130001,
"grad_norm": 1.5900487899780273,
"learning_rate": 9.999113629758792e-06,
"loss": 9.3516,
"step": 200
},
{
"epoch": 0.00968710646130001,
"eval_accuracy": 0.09037614187803769,
"eval_loss": 9.34375,
"eval_runtime": 274.9294,
"eval_samples_per_second": 122.821,
"eval_steps_per_second": 3.841,
"step": 200
},
{
"epoch": 0.00973554199360651,
"grad_norm": 1.549442172050476,
"learning_rate": 9.999108786205562e-06,
"loss": 9.3125,
"step": 201
},
{
"epoch": 0.00973554199360651,
"eval_accuracy": 0.09055328012469792,
"eval_loss": 9.3359375,
"eval_runtime": 275.1689,
"eval_samples_per_second": 122.714,
"eval_steps_per_second": 3.838,
"step": 201
},
{
"epoch": 0.00978397752591301,
"grad_norm": 1.5649633407592773,
"learning_rate": 9.99910394265233e-06,
"loss": 9.3516,
"step": 202
},
{
"epoch": 0.00978397752591301,
"eval_accuracy": 0.0907463113109588,
"eval_loss": 9.3359375,
"eval_runtime": 276.2363,
"eval_samples_per_second": 122.24,
"eval_steps_per_second": 3.823,
"step": 202
},
{
"epoch": 0.009832413058219509,
"grad_norm": 1.6223474740982056,
"learning_rate": 9.9990990990991e-06,
"loss": 9.3359,
"step": 203
},
{
"epoch": 0.009832413058219509,
"eval_accuracy": 0.09079908513407721,
"eval_loss": 9.328125,
"eval_runtime": 276.2485,
"eval_samples_per_second": 122.234,
"eval_steps_per_second": 3.823,
"step": 203
},
{
"epoch": 0.00988084859052601,
"grad_norm": 1.5935430526733398,
"learning_rate": 9.999094255545868e-06,
"loss": 9.3516,
"step": 204
},
{
"epoch": 0.00988084859052601,
"eval_accuracy": 0.09073201056020702,
"eval_loss": 9.328125,
"eval_runtime": 277.2721,
"eval_samples_per_second": 121.783,
"eval_steps_per_second": 3.809,
"step": 204
},
{
"epoch": 0.00992928412283251,
"grad_norm": 1.6288846731185913,
"learning_rate": 9.999089411992638e-06,
"loss": 9.3281,
"step": 205
},
{
"epoch": 0.00992928412283251,
"eval_accuracy": 0.09064690082901221,
"eval_loss": 9.3203125,
"eval_runtime": 276.8611,
"eval_samples_per_second": 121.964,
"eval_steps_per_second": 3.814,
"step": 205
},
{
"epoch": 0.00997771965513901,
"grad_norm": 1.4847911596298218,
"learning_rate": 9.999084568439408e-06,
"loss": 9.375,
"step": 206
},
{
"epoch": 0.00997771965513901,
"eval_accuracy": 0.0904633648780683,
"eval_loss": 9.3125,
"eval_runtime": 275.7135,
"eval_samples_per_second": 122.471,
"eval_steps_per_second": 3.83,
"step": 206
},
{
"epoch": 0.01002615518744551,
"grad_norm": 1.6263271570205688,
"learning_rate": 9.999079724886176e-06,
"loss": 9.2812,
"step": 207
},
{
"epoch": 0.01002615518744551,
"eval_accuracy": 0.09040885412166019,
"eval_loss": 9.3125,
"eval_runtime": 275.4549,
"eval_samples_per_second": 122.586,
"eval_steps_per_second": 3.834,
"step": 207
},
{
"epoch": 0.01007459071975201,
"grad_norm": 1.5669511556625366,
"learning_rate": 9.999074881332946e-06,
"loss": 9.3281,
"step": 208
},
{
"epoch": 0.01007459071975201,
"eval_accuracy": 0.09057241533977267,
"eval_loss": 9.3046875,
"eval_runtime": 276.0055,
"eval_samples_per_second": 122.342,
"eval_steps_per_second": 3.826,
"step": 208
},
{
"epoch": 0.01012302625205851,
"grad_norm": 1.5233213901519775,
"learning_rate": 9.999070037779716e-06,
"loss": 9.3281,
"step": 209
},
{
"epoch": 0.01012302625205851,
"eval_accuracy": 0.09081425435147383,
"eval_loss": 9.3046875,
"eval_runtime": 275.4632,
"eval_samples_per_second": 122.583,
"eval_steps_per_second": 3.834,
"step": 209
},
{
"epoch": 0.01017146178436501,
"grad_norm": 1.6155483722686768,
"learning_rate": 9.999065194226484e-06,
"loss": 9.3594,
"step": 210
},
{
"epoch": 0.01017146178436501,
"eval_accuracy": 0.0911761733512689,
"eval_loss": 9.296875,
"eval_runtime": 276.0337,
"eval_samples_per_second": 122.329,
"eval_steps_per_second": 3.826,
"step": 210
},
{
"epoch": 0.01021989731667151,
"grad_norm": 1.5271143913269043,
"learning_rate": 9.999060350673254e-06,
"loss": 9.3438,
"step": 211
},
{
"epoch": 0.01021989731667151,
"eval_accuracy": 0.09151863869821945,
"eval_loss": 9.296875,
"eval_runtime": 274.7088,
"eval_samples_per_second": 122.919,
"eval_steps_per_second": 3.844,
"step": 211
},
{
"epoch": 0.010268332848978011,
"grad_norm": 1.6638132333755493,
"learning_rate": 9.999055507120024e-06,
"loss": 9.2891,
"step": 212
},
{
"epoch": 0.010268332848978011,
"eval_accuracy": 0.09163796601522115,
"eval_loss": 9.2890625,
"eval_runtime": 275.693,
"eval_samples_per_second": 122.48,
"eval_steps_per_second": 3.83,
"step": 212
},
{
"epoch": 0.01031676838128451,
"grad_norm": 1.5015349388122559,
"learning_rate": 9.999050663566794e-06,
"loss": 9.3438,
"step": 213
},
{
"epoch": 0.01031676838128451,
"eval_accuracy": 0.09161217255586926,
"eval_loss": 9.2890625,
"eval_runtime": 274.9875,
"eval_samples_per_second": 122.795,
"eval_steps_per_second": 3.84,
"step": 213
},
{
"epoch": 0.01036520391359101,
"grad_norm": 1.5039061307907104,
"learning_rate": 9.999045820013564e-06,
"loss": 9.3047,
"step": 214
},
{
"epoch": 0.01036520391359101,
"eval_accuracy": 0.09152989981571427,
"eval_loss": 9.28125,
"eval_runtime": 274.8846,
"eval_samples_per_second": 122.841,
"eval_steps_per_second": 3.842,
"step": 214
},
{
"epoch": 0.01041363944589751,
"grad_norm": 1.6265090703964233,
"learning_rate": 9.999040976460332e-06,
"loss": 9.2656,
"step": 215
},
{
"epoch": 0.01041363944589751,
"eval_accuracy": 0.09139931138121775,
"eval_loss": 9.28125,
"eval_runtime": 274.6634,
"eval_samples_per_second": 122.94,
"eval_steps_per_second": 3.845,
"step": 215
},
{
"epoch": 0.010462074978204011,
"grad_norm": 1.5140306949615479,
"learning_rate": 9.999036132907102e-06,
"loss": 9.2734,
"step": 216
},
{
"epoch": 0.010462074978204011,
"eval_accuracy": 0.09134954824246813,
"eval_loss": 9.2734375,
"eval_runtime": 274.7537,
"eval_samples_per_second": 122.899,
"eval_steps_per_second": 3.843,
"step": 216
},
{
"epoch": 0.010510510510510511,
"grad_norm": 1.5547981262207031,
"learning_rate": 9.999031289353872e-06,
"loss": 9.2891,
"step": 217
},
{
"epoch": 0.010510510510510511,
"eval_accuracy": 0.09132992089629463,
"eval_loss": 9.2734375,
"eval_runtime": 275.3328,
"eval_samples_per_second": 122.641,
"eval_steps_per_second": 3.835,
"step": 217
},
{
"epoch": 0.01055894604281701,
"grad_norm": 1.5140680074691772,
"learning_rate": 9.99902644580064e-06,
"loss": 9.2969,
"step": 218
},
{
"epoch": 0.01055894604281701,
"eval_accuracy": 0.091310090907904,
"eval_loss": 9.265625,
"eval_runtime": 275.0207,
"eval_samples_per_second": 122.78,
"eval_steps_per_second": 3.84,
"step": 218
},
{
"epoch": 0.01060738157512351,
"grad_norm": 1.5878396034240723,
"learning_rate": 9.99902160224741e-06,
"loss": 9.25,
"step": 219
},
{
"epoch": 0.01060738157512351,
"eval_accuracy": 0.09137461797961599,
"eval_loss": 9.265625,
"eval_runtime": 275.3479,
"eval_samples_per_second": 122.634,
"eval_steps_per_second": 3.835,
"step": 219
},
{
"epoch": 0.01065581710743001,
"grad_norm": 1.5309175252914429,
"learning_rate": 9.99901675869418e-06,
"loss": 9.2578,
"step": 220
},
{
"epoch": 0.01065581710743001,
"eval_accuracy": 0.09149657964544039,
"eval_loss": 9.2578125,
"eval_runtime": 274.9496,
"eval_samples_per_second": 122.812,
"eval_steps_per_second": 3.841,
"step": 220
},
{
"epoch": 0.010704252639736511,
"grad_norm": 1.5207297801971436,
"learning_rate": 9.999011915140948e-06,
"loss": 9.25,
"step": 221
},
{
"epoch": 0.010704252639736511,
"eval_accuracy": 0.09163194464648355,
"eval_loss": 9.2578125,
"eval_runtime": 274.7916,
"eval_samples_per_second": 122.882,
"eval_steps_per_second": 3.843,
"step": 221
},
{
"epoch": 0.010752688172043012,
"grad_norm": 1.5458952188491821,
"learning_rate": 9.999007071587717e-06,
"loss": 9.2656,
"step": 222
},
{
"epoch": 0.010752688172043012,
"eval_accuracy": 0.09197788386001349,
"eval_loss": 9.25,
"eval_runtime": 274.1902,
"eval_samples_per_second": 123.152,
"eval_steps_per_second": 3.851,
"step": 222
},
{
"epoch": 0.01080112370434951,
"grad_norm": 1.468177080154419,
"learning_rate": 9.999002228034487e-06,
"loss": 9.2578,
"step": 223
},
{
"epoch": 0.01080112370434951,
"eval_accuracy": 0.09228760801445338,
"eval_loss": 9.25,
"eval_runtime": 274.801,
"eval_samples_per_second": 122.878,
"eval_steps_per_second": 3.843,
"step": 223
},
{
"epoch": 0.01084955923665601,
"grad_norm": 1.466130018234253,
"learning_rate": 9.998997384481255e-06,
"loss": 9.2734,
"step": 224
},
{
"epoch": 0.01084955923665601,
"eval_accuracy": 0.09260112447324241,
"eval_loss": 9.2421875,
"eval_runtime": 274.0532,
"eval_samples_per_second": 123.213,
"eval_steps_per_second": 3.853,
"step": 224
},
{
"epoch": 0.010897994768962511,
"grad_norm": 1.4513353109359741,
"learning_rate": 9.998992540928025e-06,
"loss": 9.2891,
"step": 225
},
{
"epoch": 0.010897994768962511,
"eval_accuracy": 0.09285208238471446,
"eval_loss": 9.2421875,
"eval_runtime": 274.2911,
"eval_samples_per_second": 123.106,
"eval_steps_per_second": 3.85,
"step": 225
},
{
"epoch": 0.010946430301269012,
"grad_norm": 1.6049507856369019,
"learning_rate": 9.998987697374795e-06,
"loss": 9.25,
"step": 226
},
{
"epoch": 0.010946430301269012,
"eval_accuracy": 0.09283341035185029,
"eval_loss": 9.234375,
"eval_runtime": 275.8109,
"eval_samples_per_second": 122.428,
"eval_steps_per_second": 3.829,
"step": 226
},
{
"epoch": 0.01099486583357551,
"grad_norm": 1.6145049333572388,
"learning_rate": 9.998982853821563e-06,
"loss": 9.2344,
"step": 227
},
{
"epoch": 0.01099486583357551,
"eval_accuracy": 0.09279450304616123,
"eval_loss": 9.234375,
"eval_runtime": 275.8634,
"eval_samples_per_second": 122.405,
"eval_steps_per_second": 3.828,
"step": 227
},
{
"epoch": 0.01104330136588201,
"grad_norm": 1.5092509984970093,
"learning_rate": 9.998978010268333e-06,
"loss": 9.2656,
"step": 228
},
{
"epoch": 0.01104330136588201,
"eval_accuracy": 0.09270768533056489,
"eval_loss": 9.2265625,
"eval_runtime": 275.1066,
"eval_samples_per_second": 122.742,
"eval_steps_per_second": 3.839,
"step": 228
},
{
"epoch": 0.011091736898188511,
"grad_norm": 1.6245758533477783,
"learning_rate": 9.998973166715103e-06,
"loss": 9.2656,
"step": 229
},
{
"epoch": 0.011091736898188511,
"eval_accuracy": 0.09277232819782952,
"eval_loss": 9.2265625,
"eval_runtime": 274.1156,
"eval_samples_per_second": 123.185,
"eval_steps_per_second": 3.852,
"step": 229
},
{
"epoch": 0.011140172430495011,
"grad_norm": 1.5349066257476807,
"learning_rate": 9.998968323161873e-06,
"loss": 9.2656,
"step": 230
},
{
"epoch": 0.011140172430495011,
"eval_accuracy": 0.09297572308605222,
"eval_loss": 9.21875,
"eval_runtime": 275.053,
"eval_samples_per_second": 122.765,
"eval_steps_per_second": 3.839,
"step": 230
},
{
"epoch": 0.011188607962801512,
"grad_norm": 1.5491435527801514,
"learning_rate": 9.998963479608643e-06,
"loss": 9.25,
"step": 231
},
{
"epoch": 0.011188607962801512,
"eval_accuracy": 0.09332889952162252,
"eval_loss": 9.21875,
"eval_runtime": 275.8534,
"eval_samples_per_second": 122.409,
"eval_steps_per_second": 3.828,
"step": 231
},
{
"epoch": 0.01123704349510801,
"grad_norm": 1.5584843158721924,
"learning_rate": 9.998958636055411e-06,
"loss": 9.2891,
"step": 232
},
{
"epoch": 0.01123704349510801,
"eval_accuracy": 0.0936537349956827,
"eval_loss": 9.2109375,
"eval_runtime": 275.2078,
"eval_samples_per_second": 122.696,
"eval_steps_per_second": 3.837,
"step": 232
},
{
"epoch": 0.011285479027414511,
"grad_norm": 1.6923131942749023,
"learning_rate": 9.99895379250218e-06,
"loss": 9.2188,
"step": 233
},
{
"epoch": 0.011285479027414511,
"eval_accuracy": 0.09384227910427856,
"eval_loss": 9.203125,
"eval_runtime": 275.578,
"eval_samples_per_second": 122.532,
"eval_steps_per_second": 3.832,
"step": 233
},
{
"epoch": 0.011333914559721011,
"grad_norm": 1.636615514755249,
"learning_rate": 9.99894894894895e-06,
"loss": 9.2578,
"step": 234
},
{
"epoch": 0.011333914559721011,
"eval_accuracy": 0.09388075217664518,
"eval_loss": 9.203125,
"eval_runtime": 275.1327,
"eval_samples_per_second": 122.73,
"eval_steps_per_second": 3.838,
"step": 234
},
{
"epoch": 0.011382350092027512,
"grad_norm": 1.5573487281799316,
"learning_rate": 9.998944105395719e-06,
"loss": 9.2422,
"step": 235
},
{
"epoch": 0.011382350092027512,
"eval_accuracy": 0.09375699567975478,
"eval_loss": 9.1953125,
"eval_runtime": 274.1524,
"eval_samples_per_second": 123.169,
"eval_steps_per_second": 3.852,
"step": 235
},
{
"epoch": 0.011430785624334012,
"grad_norm": 1.758978009223938,
"learning_rate": 9.998939261842489e-06,
"loss": 9.2109,
"step": 236
},
{
"epoch": 0.011430785624334012,
"eval_accuracy": 0.09350146384395322,
"eval_loss": 9.1953125,
"eval_runtime": 276.5184,
"eval_samples_per_second": 122.115,
"eval_steps_per_second": 3.819,
"step": 236
},
{
"epoch": 0.01147922115664051,
"grad_norm": 1.6766207218170166,
"learning_rate": 9.998934418289259e-06,
"loss": 9.1797,
"step": 237
},
{
"epoch": 0.01147922115664051,
"eval_accuracy": 0.09353747626082612,
"eval_loss": 9.1953125,
"eval_runtime": 275.7856,
"eval_samples_per_second": 122.439,
"eval_steps_per_second": 3.829,
"step": 237
},
{
"epoch": 0.011527656688947011,
"grad_norm": 1.7581781148910522,
"learning_rate": 9.998929574736027e-06,
"loss": 9.1953,
"step": 238
},
{
"epoch": 0.011527656688947011,
"eval_accuracy": 0.09377196225493427,
"eval_loss": 9.1875,
"eval_runtime": 276.3511,
"eval_samples_per_second": 122.189,
"eval_steps_per_second": 3.821,
"step": 238
},
{
"epoch": 0.011576092221253512,
"grad_norm": 2.0294253826141357,
"learning_rate": 9.998924731182797e-06,
"loss": 9.1797,
"step": 239
},
{
"epoch": 0.011576092221253512,
"eval_accuracy": 0.09428256274432681,
"eval_loss": 9.1875,
"eval_runtime": 275.8013,
"eval_samples_per_second": 122.432,
"eval_steps_per_second": 3.829,
"step": 239
},
{
"epoch": 0.011624527753560012,
"grad_norm": 1.4771103858947754,
"learning_rate": 9.998919887629566e-06,
"loss": 9.2266,
"step": 240
},
{
"epoch": 0.011624527753560012,
"eval_accuracy": 0.09478583916501448,
"eval_loss": 9.1796875,
"eval_runtime": 273.9293,
"eval_samples_per_second": 123.269,
"eval_steps_per_second": 3.855,
"step": 240
},
{
"epoch": 0.011672963285866512,
"grad_norm": 1.494795322418213,
"learning_rate": 9.998915044076335e-06,
"loss": 9.2109,
"step": 241
},
{
"epoch": 0.011672963285866512,
"eval_accuracy": 0.09512494644093829,
"eval_loss": 9.171875,
"eval_runtime": 274.6237,
"eval_samples_per_second": 122.957,
"eval_steps_per_second": 3.845,
"step": 241
},
{
"epoch": 0.011721398818173011,
"grad_norm": 1.4708678722381592,
"learning_rate": 9.998910200523105e-06,
"loss": 9.1719,
"step": 242
},
{
"epoch": 0.011721398818173011,
"eval_accuracy": 0.0953564506995658,
"eval_loss": 9.171875,
"eval_runtime": 274.319,
"eval_samples_per_second": 123.094,
"eval_steps_per_second": 3.85,
"step": 242
},
{
"epoch": 0.011769834350479512,
"grad_norm": 1.5596672296524048,
"learning_rate": 9.998905356969873e-06,
"loss": 9.2031,
"step": 243
},
{
"epoch": 0.011769834350479512,
"eval_accuracy": 0.09549230783170773,
"eval_loss": 9.171875,
"eval_runtime": 276.3644,
"eval_samples_per_second": 122.183,
"eval_steps_per_second": 3.821,
"step": 243
},
{
"epoch": 0.011818269882786012,
"grad_norm": 1.6623671054840088,
"learning_rate": 9.998900513416643e-06,
"loss": 9.1953,
"step": 244
},
{
"epoch": 0.011818269882786012,
"eval_accuracy": 0.09535106620636777,
"eval_loss": 9.1640625,
"eval_runtime": 276.0732,
"eval_samples_per_second": 122.312,
"eval_steps_per_second": 3.825,
"step": 244
},
{
"epoch": 0.011866705415092512,
"grad_norm": 1.5597991943359375,
"learning_rate": 9.998895669863412e-06,
"loss": 9.1875,
"step": 245
},
{
"epoch": 0.011866705415092512,
"eval_accuracy": 0.09501265370375983,
"eval_loss": 9.1640625,
"eval_runtime": 276.6977,
"eval_samples_per_second": 122.036,
"eval_steps_per_second": 3.816,
"step": 245
},
{
"epoch": 0.011915140947399011,
"grad_norm": 1.540256381034851,
"learning_rate": 9.998890826310182e-06,
"loss": 9.2031,
"step": 246
},
{
"epoch": 0.011915140947399011,
"eval_accuracy": 0.09489992673325895,
"eval_loss": 9.15625,
"eval_runtime": 275.9711,
"eval_samples_per_second": 122.357,
"eval_steps_per_second": 3.826,
"step": 246
},
{
"epoch": 0.011963576479705511,
"grad_norm": 1.7622281312942505,
"learning_rate": 9.998885982756952e-06,
"loss": 9.1797,
"step": 247
},
{
"epoch": 0.011963576479705511,
"eval_accuracy": 0.09502625968119574,
"eval_loss": 9.1484375,
"eval_runtime": 276.4265,
"eval_samples_per_second": 122.155,
"eval_steps_per_second": 3.82,
"step": 247
},
{
"epoch": 0.012012012012012012,
"grad_norm": 1.5139068365097046,
"learning_rate": 9.99888113920372e-06,
"loss": 9.1484,
"step": 248
},
{
"epoch": 0.012012012012012012,
"eval_accuracy": 0.09517540435300378,
"eval_loss": 9.1484375,
"eval_runtime": 276.0853,
"eval_samples_per_second": 122.306,
"eval_steps_per_second": 3.825,
"step": 248
},
{
"epoch": 0.012060447544318512,
"grad_norm": 1.8858153820037842,
"learning_rate": 9.99887629565049e-06,
"loss": 9.1406,
"step": 249
},
{
"epoch": 0.012060447544318512,
"eval_accuracy": 0.0953735305435811,
"eval_loss": 9.1484375,
"eval_runtime": 277.0667,
"eval_samples_per_second": 121.873,
"eval_steps_per_second": 3.811,
"step": 249
},
{
"epoch": 0.012108883076625013,
"grad_norm": 1.5456604957580566,
"learning_rate": 9.99887145209726e-06,
"loss": 9.1641,
"step": 250
},
{
"epoch": 0.012108883076625013,
"eval_accuracy": 0.09559869499570124,
"eval_loss": 9.140625,
"eval_runtime": 277.6666,
"eval_samples_per_second": 121.61,
"eval_steps_per_second": 3.803,
"step": 250
},
{
"epoch": 0.012157318608931511,
"grad_norm": 1.594663143157959,
"learning_rate": 9.998866608544028e-06,
"loss": 9.1406,
"step": 251
},
{
"epoch": 0.012157318608931511,
"eval_accuracy": 0.09564637381450322,
"eval_loss": 9.140625,
"eval_runtime": 276.8219,
"eval_samples_per_second": 121.981,
"eval_steps_per_second": 3.815,
"step": 251
},
{
"epoch": 0.012205754141238012,
"grad_norm": 1.6868451833724976,
"learning_rate": 9.998861764990798e-06,
"loss": 9.1719,
"step": 252
},
{
"epoch": 0.012205754141238012,
"eval_accuracy": 0.09536970929034377,
"eval_loss": 9.1328125,
"eval_runtime": 274.9951,
"eval_samples_per_second": 122.791,
"eval_steps_per_second": 3.84,
"step": 252
},
{
"epoch": 0.012254189673544512,
"grad_norm": 1.5256409645080566,
"learning_rate": 9.998856921437568e-06,
"loss": 9.125,
"step": 253
},
{
"epoch": 0.012254189673544512,
"eval_accuracy": 0.09525680862651392,
"eval_loss": 9.1328125,
"eval_runtime": 275.3586,
"eval_samples_per_second": 122.629,
"eval_steps_per_second": 3.835,
"step": 253
},
{
"epoch": 0.012302625205851013,
"grad_norm": 1.565302848815918,
"learning_rate": 9.998852077884336e-06,
"loss": 9.1719,
"step": 254
},
{
"epoch": 0.012302625205851013,
"eval_accuracy": 0.09499291056203368,
"eval_loss": 9.125,
"eval_runtime": 275.0624,
"eval_samples_per_second": 122.761,
"eval_steps_per_second": 3.839,
"step": 254
},
{
"epoch": 0.012351060738157513,
"grad_norm": 1.4815526008605957,
"learning_rate": 9.998847234331106e-06,
"loss": 9.1797,
"step": 255
},
{
"epoch": 0.012351060738157513,
"eval_accuracy": 0.09496150101837846,
"eval_loss": 9.125,
"eval_runtime": 275.5286,
"eval_samples_per_second": 122.554,
"eval_steps_per_second": 3.833,
"step": 255
},
{
"epoch": 0.012399496270464012,
"grad_norm": 1.6366430521011353,
"learning_rate": 9.998842390777876e-06,
"loss": 9.0859,
"step": 256
},
{
"epoch": 0.012399496270464012,
"eval_accuracy": 0.09506522488466111,
"eval_loss": 9.1171875,
"eval_runtime": 276.1079,
"eval_samples_per_second": 122.296,
"eval_steps_per_second": 3.825,
"step": 256
},
{
"epoch": 0.012447931802770512,
"grad_norm": 1.6034120321273804,
"learning_rate": 9.998837547224644e-06,
"loss": 9.1875,
"step": 257
},
{
"epoch": 0.012447931802770512,
"eval_accuracy": 0.09566794073618354,
"eval_loss": 9.1171875,
"eval_runtime": 275.8176,
"eval_samples_per_second": 122.425,
"eval_steps_per_second": 3.829,
"step": 257
},
{
"epoch": 0.012496367335077013,
"grad_norm": 1.6382652521133423,
"learning_rate": 9.998832703671414e-06,
"loss": 9.1094,
"step": 258
},
{
"epoch": 0.012496367335077013,
"eval_accuracy": 0.09628628898731317,
"eval_loss": 9.109375,
"eval_runtime": 275.7044,
"eval_samples_per_second": 122.475,
"eval_steps_per_second": 3.83,
"step": 258
},
{
"epoch": 0.012544802867383513,
"grad_norm": 1.4967926740646362,
"learning_rate": 9.998827860118184e-06,
"loss": 9.0938,
"step": 259
},
{
"epoch": 0.012544802867383513,
"eval_accuracy": 0.09678612049030963,
"eval_loss": 9.109375,
"eval_runtime": 276.0511,
"eval_samples_per_second": 122.322,
"eval_steps_per_second": 3.825,
"step": 259
},
{
"epoch": 0.012593238399690013,
"grad_norm": 2.137125015258789,
"learning_rate": 9.998823016564952e-06,
"loss": 9.1016,
"step": 260
},
{
"epoch": 0.012593238399690013,
"eval_accuracy": 0.09689392614982306,
"eval_loss": 9.1015625,
"eval_runtime": 276.485,
"eval_samples_per_second": 122.13,
"eval_steps_per_second": 3.819,
"step": 260
},
{
"epoch": 0.012641673931996512,
"grad_norm": 1.655360460281372,
"learning_rate": 9.998818173011722e-06,
"loss": 9.1406,
"step": 261
},
{
"epoch": 0.012641673931996512,
"eval_accuracy": 0.09685108179534405,
"eval_loss": 9.1015625,
"eval_runtime": 276.2683,
"eval_samples_per_second": 122.225,
"eval_steps_per_second": 3.822,
"step": 261
},
{
"epoch": 0.012690109464303012,
"grad_norm": 1.615159273147583,
"learning_rate": 9.998813329458492e-06,
"loss": 9.0781,
"step": 262
},
{
"epoch": 0.012690109464303012,
"eval_accuracy": 0.09661187713246557,
"eval_loss": 9.09375,
"eval_runtime": 277.0541,
"eval_samples_per_second": 121.879,
"eval_steps_per_second": 3.812,
"step": 262
},
{
"epoch": 0.012738544996609513,
"grad_norm": 1.56972074508667,
"learning_rate": 9.998808485905261e-06,
"loss": 9.1094,
"step": 263
},
{
"epoch": 0.012738544996609513,
"eval_accuracy": 0.09626272459234972,
"eval_loss": 9.09375,
"eval_runtime": 276.2994,
"eval_samples_per_second": 122.212,
"eval_steps_per_second": 3.822,
"step": 263
},
{
"epoch": 0.012786980528916013,
"grad_norm": 1.5011804103851318,
"learning_rate": 9.998803642352031e-06,
"loss": 9.1172,
"step": 264
},
{
"epoch": 0.012786980528916013,
"eval_accuracy": 0.09589736067486343,
"eval_loss": 9.0859375,
"eval_runtime": 276.7096,
"eval_samples_per_second": 122.03,
"eval_steps_per_second": 3.816,
"step": 264
},
{
"epoch": 0.012835416061222512,
"grad_norm": 1.6870362758636475,
"learning_rate": 9.9987987987988e-06,
"loss": 9.1172,
"step": 265
},
{
"epoch": 0.012835416061222512,
"eval_accuracy": 0.09563450477035701,
"eval_loss": 9.0859375,
"eval_runtime": 275.3219,
"eval_samples_per_second": 122.646,
"eval_steps_per_second": 3.836,
"step": 265
},
{
"epoch": 0.012883851593529012,
"grad_norm": 1.5479800701141357,
"learning_rate": 9.99879395524557e-06,
"loss": 9.125,
"step": 266
},
{
"epoch": 0.012883851593529012,
"eval_accuracy": 0.09551361421339459,
"eval_loss": 9.0859375,
"eval_runtime": 275.3988,
"eval_samples_per_second": 122.611,
"eval_steps_per_second": 3.834,
"step": 266
},
{
"epoch": 0.012932287125835513,
"grad_norm": 1.5906175374984741,
"learning_rate": 9.99878911169234e-06,
"loss": 9.1094,
"step": 267
},
{
"epoch": 0.012932287125835513,
"eval_accuracy": 0.09573892340995555,
"eval_loss": 9.078125,
"eval_runtime": 273.7737,
"eval_samples_per_second": 123.339,
"eval_steps_per_second": 3.857,
"step": 267
},
{
"epoch": 0.012980722658142013,
"grad_norm": 1.5682505369186401,
"learning_rate": 9.998784268139107e-06,
"loss": 9.0781,
"step": 268
},
{
"epoch": 0.012980722658142013,
"eval_accuracy": 0.09638306512043707,
"eval_loss": 9.078125,
"eval_runtime": 272.9963,
"eval_samples_per_second": 123.69,
"eval_steps_per_second": 3.868,
"step": 268
},
{
"epoch": 0.013029158190448514,
"grad_norm": 1.5259824991226196,
"learning_rate": 9.998779424585877e-06,
"loss": 9.125,
"step": 269
},
{
"epoch": 0.013029158190448514,
"eval_accuracy": 0.09727434348915333,
"eval_loss": 9.0703125,
"eval_runtime": 274.1813,
"eval_samples_per_second": 123.156,
"eval_steps_per_second": 3.851,
"step": 269
},
{
"epoch": 0.013077593722755012,
"grad_norm": 1.5006844997406006,
"learning_rate": 9.998774581032647e-06,
"loss": 9.0547,
"step": 270
},
{
"epoch": 0.013077593722755012,
"eval_accuracy": 0.09799540239547996,
"eval_loss": 9.0703125,
"eval_runtime": 272.3774,
"eval_samples_per_second": 123.971,
"eval_steps_per_second": 3.877,
"step": 270
},
{
"epoch": 0.013126029255061513,
"grad_norm": 1.4817960262298584,
"learning_rate": 9.998769737479415e-06,
"loss": 9.0781,
"step": 271
},
{
"epoch": 0.013126029255061513,
"eval_accuracy": 0.09830503970325537,
"eval_loss": 9.0625,
"eval_runtime": 271.8893,
"eval_samples_per_second": 124.194,
"eval_steps_per_second": 3.884,
"step": 271
},
{
"epoch": 0.013174464787368013,
"grad_norm": 1.6597894430160522,
"learning_rate": 9.998764893926185e-06,
"loss": 9.1016,
"step": 272
},
{
"epoch": 0.013174464787368013,
"eval_accuracy": 0.09809840253955858,
"eval_loss": 9.0625,
"eval_runtime": 273.6177,
"eval_samples_per_second": 123.409,
"eval_steps_per_second": 3.859,
"step": 272
},
{
"epoch": 0.013222900319674514,
"grad_norm": 1.5939491987228394,
"learning_rate": 9.998760050372955e-06,
"loss": 9.0703,
"step": 273
},
{
"epoch": 0.013222900319674514,
"eval_accuracy": 0.09753586774480431,
"eval_loss": 9.0546875,
"eval_runtime": 273.2408,
"eval_samples_per_second": 123.58,
"eval_steps_per_second": 3.865,
"step": 273
},
{
"epoch": 0.013271335851981014,
"grad_norm": 1.5878655910491943,
"learning_rate": 9.998755206819723e-06,
"loss": 9.0547,
"step": 274
},
{
"epoch": 0.013271335851981014,
"eval_accuracy": 0.09690096072964631,
"eval_loss": 9.0546875,
"eval_runtime": 272.9584,
"eval_samples_per_second": 123.707,
"eval_steps_per_second": 3.869,
"step": 274
},
{
"epoch": 0.013319771384287513,
"grad_norm": 1.6010398864746094,
"learning_rate": 9.998750363266493e-06,
"loss": 9.0312,
"step": 275
},
{
"epoch": 0.013319771384287513,
"eval_accuracy": 0.09638720481144417,
"eval_loss": 9.046875,
"eval_runtime": 272.7096,
"eval_samples_per_second": 123.82,
"eval_steps_per_second": 3.872,
"step": 275
},
{
"epoch": 0.013368206916594013,
"grad_norm": 1.7441232204437256,
"learning_rate": 9.998745519713263e-06,
"loss": 9.0938,
"step": 276
},
{
"epoch": 0.013368206916594013,
"eval_accuracy": 0.09639140240022759,
"eval_loss": 9.046875,
"eval_runtime": 273.1864,
"eval_samples_per_second": 123.604,
"eval_steps_per_second": 3.865,
"step": 276
},
{
"epoch": 0.013416642448900513,
"grad_norm": 1.586517095565796,
"learning_rate": 9.998740676160031e-06,
"loss": 9.0156,
"step": 277
},
{
"epoch": 0.013416642448900513,
"eval_accuracy": 0.09671230082549781,
"eval_loss": 9.0390625,
"eval_runtime": 272.732,
"eval_samples_per_second": 123.81,
"eval_steps_per_second": 3.872,
"step": 277
},
{
"epoch": 0.013465077981207014,
"grad_norm": 1.5039782524108887,
"learning_rate": 9.998735832606801e-06,
"loss": 9.1094,
"step": 278
},
{
"epoch": 0.013465077981207014,
"eval_accuracy": 0.0972694800759422,
"eval_loss": 9.0390625,
"eval_runtime": 272.288,
"eval_samples_per_second": 124.012,
"eval_steps_per_second": 3.878,
"step": 278
},
{
"epoch": 0.013513513513513514,
"grad_norm": 1.534090518951416,
"learning_rate": 9.99873098905357e-06,
"loss": 9.0859,
"step": 279
},
{
"epoch": 0.013513513513513514,
"eval_accuracy": 0.09795999790525846,
"eval_loss": 9.03125,
"eval_runtime": 273.6685,
"eval_samples_per_second": 123.387,
"eval_steps_per_second": 3.859,
"step": 279
},
{
"epoch": 0.013561949045820013,
"grad_norm": 1.5849289894104004,
"learning_rate": 9.99872614550034e-06,
"loss": 9.0234,
"step": 280
},
{
"epoch": 0.013561949045820013,
"eval_accuracy": 0.09837451703484297,
"eval_loss": 9.03125,
"eval_runtime": 273.5891,
"eval_samples_per_second": 123.422,
"eval_steps_per_second": 3.86,
"step": 280
},
{
"epoch": 0.013610384578126513,
"grad_norm": 1.523674488067627,
"learning_rate": 9.99872130194711e-06,
"loss": 9.0781,
"step": 281
},
{
"epoch": 0.013610384578126513,
"eval_accuracy": 0.09843887041322598,
"eval_loss": 9.0234375,
"eval_runtime": 271.771,
"eval_samples_per_second": 124.248,
"eval_steps_per_second": 3.886,
"step": 281
},
{
"epoch": 0.013658820110433014,
"grad_norm": 1.646908164024353,
"learning_rate": 9.998716458393879e-06,
"loss": 9.0547,
"step": 282
},
{
"epoch": 0.013658820110433014,
"eval_accuracy": 0.09825828724887455,
"eval_loss": 9.0234375,
"eval_runtime": 272.8785,
"eval_samples_per_second": 123.744,
"eval_steps_per_second": 3.87,
"step": 282
},
{
"epoch": 0.013707255642739514,
"grad_norm": 1.6313369274139404,
"learning_rate": 9.998711614840649e-06,
"loss": 9.0234,
"step": 283
},
{
"epoch": 0.013707255642739514,
"eval_accuracy": 0.09794141271905878,
"eval_loss": 9.015625,
"eval_runtime": 273.6116,
"eval_samples_per_second": 123.412,
"eval_steps_per_second": 3.859,
"step": 283
},
{
"epoch": 0.013755691175046015,
"grad_norm": 1.6014082431793213,
"learning_rate": 9.998706771287417e-06,
"loss": 9.0312,
"step": 284
},
{
"epoch": 0.013755691175046015,
"eval_accuracy": 0.09782020372432657,
"eval_loss": 9.015625,
"eval_runtime": 273.8768,
"eval_samples_per_second": 123.293,
"eval_steps_per_second": 3.856,
"step": 284
},
{
"epoch": 0.013804126707352513,
"grad_norm": 1.5171185731887817,
"learning_rate": 9.998701927734187e-06,
"loss": 9.0391,
"step": 285
},
{
"epoch": 0.013804126707352513,
"eval_accuracy": 0.09783482291284813,
"eval_loss": 9.0078125,
"eval_runtime": 273.0741,
"eval_samples_per_second": 123.655,
"eval_steps_per_second": 3.867,
"step": 285
},
{
"epoch": 0.013852562239659014,
"grad_norm": 1.5492215156555176,
"learning_rate": 9.998697084180956e-06,
"loss": 9.0312,
"step": 286
},
{
"epoch": 0.013852562239659014,
"eval_accuracy": 0.09799062582893332,
"eval_loss": 9.0078125,
"eval_runtime": 273.3737,
"eval_samples_per_second": 123.52,
"eval_steps_per_second": 3.863,
"step": 286
},
{
"epoch": 0.013900997771965514,
"grad_norm": 1.6462546586990356,
"learning_rate": 9.998692240627725e-06,
"loss": 9.0625,
"step": 287
},
{
"epoch": 0.013900997771965514,
"eval_accuracy": 0.09823570711610857,
"eval_loss": 9.0078125,
"eval_runtime": 272.8604,
"eval_samples_per_second": 123.752,
"eval_steps_per_second": 3.87,
"step": 287
},
{
"epoch": 0.013949433304272014,
"grad_norm": 1.6392829418182373,
"learning_rate": 9.998687397074494e-06,
"loss": 9.0234,
"step": 288
},
{
"epoch": 0.013949433304272014,
"eval_accuracy": 0.09855127894595708,
"eval_loss": 9.0,
"eval_runtime": 272.8831,
"eval_samples_per_second": 123.742,
"eval_steps_per_second": 3.87,
"step": 288
},
{
"epoch": 0.013997868836578513,
"grad_norm": 1.5253773927688599,
"learning_rate": 9.998682553521264e-06,
"loss": 9.0078,
"step": 289
},
{
"epoch": 0.013997868836578513,
"eval_accuracy": 0.09903811239816902,
"eval_loss": 9.0,
"eval_runtime": 272.9158,
"eval_samples_per_second": 123.727,
"eval_steps_per_second": 3.869,
"step": 289
},
{
"epoch": 0.014046304368885014,
"grad_norm": 1.496385931968689,
"learning_rate": 9.998677709968032e-06,
"loss": 9.0,
"step": 290
},
{
"epoch": 0.014046304368885014,
"eval_accuracy": 0.09958177251784199,
"eval_loss": 8.9921875,
"eval_runtime": 273.8166,
"eval_samples_per_second": 123.32,
"eval_steps_per_second": 3.857,
"step": 290
},
{
"epoch": 0.014094739901191514,
"grad_norm": 1.5430630445480347,
"learning_rate": 9.998672866414802e-06,
"loss": 9.0078,
"step": 291
},
{
"epoch": 0.014094739901191514,
"eval_accuracy": 0.09971363470341763,
"eval_loss": 8.9921875,
"eval_runtime": 272.085,
"eval_samples_per_second": 124.105,
"eval_steps_per_second": 3.881,
"step": 291
},
{
"epoch": 0.014143175433498014,
"grad_norm": 1.7000993490219116,
"learning_rate": 9.998668022861572e-06,
"loss": 9.0,
"step": 292
},
{
"epoch": 0.014143175433498014,
"eval_accuracy": 0.09992762488470744,
"eval_loss": 8.984375,
"eval_runtime": 273.2642,
"eval_samples_per_second": 123.569,
"eval_steps_per_second": 3.864,
"step": 292
},
{
"epoch": 0.014191610965804515,
"grad_norm": 1.5401760339736938,
"learning_rate": 9.99866317930834e-06,
"loss": 9.0078,
"step": 293
},
{
"epoch": 0.014191610965804515,
"eval_accuracy": 0.09988570689464958,
"eval_loss": 8.984375,
"eval_runtime": 271.53,
"eval_samples_per_second": 124.358,
"eval_steps_per_second": 3.889,
"step": 293
},
{
"epoch": 0.014240046498111014,
"grad_norm": 1.5899308919906616,
"learning_rate": 9.99865833575511e-06,
"loss": 8.9922,
"step": 294
},
{
"epoch": 0.014240046498111014,
"eval_accuracy": 0.09954338629213985,
"eval_loss": 8.9765625,
"eval_runtime": 271.4912,
"eval_samples_per_second": 124.376,
"eval_steps_per_second": 3.89,
"step": 294
},
{
"epoch": 0.014288482030417514,
"grad_norm": 1.5780622959136963,
"learning_rate": 9.99865349220188e-06,
"loss": 9.0078,
"step": 295
},
{
"epoch": 0.014288482030417514,
"eval_accuracy": 0.09903443588937252,
"eval_loss": 8.9765625,
"eval_runtime": 272.7386,
"eval_samples_per_second": 123.807,
"eval_steps_per_second": 3.872,
"step": 295
},
{
"epoch": 0.014336917562724014,
"grad_norm": 1.6593127250671387,
"learning_rate": 9.998648648648648e-06,
"loss": 8.9844,
"step": 296
},
{
"epoch": 0.014336917562724014,
"eval_accuracy": 0.09852673028879613,
"eval_loss": 8.96875,
"eval_runtime": 273.5255,
"eval_samples_per_second": 123.451,
"eval_steps_per_second": 3.861,
"step": 296
},
{
"epoch": 0.014385353095030515,
"grad_norm": 1.5654476881027222,
"learning_rate": 9.99864380509542e-06,
"loss": 8.9766,
"step": 297
},
{
"epoch": 0.014385353095030515,
"eval_accuracy": 0.09832848830266619,
"eval_loss": 8.96875,
"eval_runtime": 272.9229,
"eval_samples_per_second": 123.724,
"eval_steps_per_second": 3.869,
"step": 297
},
{
"epoch": 0.014433788627337015,
"grad_norm": 1.604347586631775,
"learning_rate": 9.998638961542188e-06,
"loss": 8.9531,
"step": 298
},
{
"epoch": 0.014433788627337015,
"eval_accuracy": 0.09845606605279392,
"eval_loss": 8.9609375,
"eval_runtime": 274.3511,
"eval_samples_per_second": 123.08,
"eval_steps_per_second": 3.849,
"step": 298
},
{
"epoch": 0.014482224159643514,
"grad_norm": 1.6617177724838257,
"learning_rate": 9.998634117988958e-06,
"loss": 8.9688,
"step": 299
},
{
"epoch": 0.014482224159643514,
"eval_accuracy": 0.09881535070376629,
"eval_loss": 8.9609375,
"eval_runtime": 275.2558,
"eval_samples_per_second": 122.675,
"eval_steps_per_second": 3.836,
"step": 299
},
{
"epoch": 0.014530659691950014,
"grad_norm": 1.5470112562179565,
"learning_rate": 9.998629274435728e-06,
"loss": 9.0312,
"step": 300
},
{
"epoch": 0.014530659691950014,
"eval_accuracy": 0.09944139935914688,
"eval_loss": 8.953125,
"eval_runtime": 276.2061,
"eval_samples_per_second": 122.253,
"eval_steps_per_second": 3.823,
"step": 300
},
{
"epoch": 0.014579095224256515,
"grad_norm": 1.5366243124008179,
"learning_rate": 9.998624430882496e-06,
"loss": 9.0156,
"step": 301
},
{
"epoch": 0.014579095224256515,
"eval_accuracy": 0.0997650189799043,
"eval_loss": 8.953125,
"eval_runtime": 273.6153,
"eval_samples_per_second": 123.41,
"eval_steps_per_second": 3.859,
"step": 301
},
{
"epoch": 0.014627530756563015,
"grad_norm": 1.8393828868865967,
"learning_rate": 9.998619587329266e-06,
"loss": 8.9688,
"step": 302
},
{
"epoch": 0.014627530756563015,
"eval_accuracy": 0.09985768437090925,
"eval_loss": 8.9453125,
"eval_runtime": 272.5368,
"eval_samples_per_second": 123.899,
"eval_steps_per_second": 3.875,
"step": 302
},
{
"epoch": 0.014675966288869515,
"grad_norm": 1.5231480598449707,
"learning_rate": 9.998614743776036e-06,
"loss": 9.0,
"step": 303
},
{
"epoch": 0.014675966288869515,
"eval_accuracy": 0.09973019346744601,
"eval_loss": 8.9453125,
"eval_runtime": 272.9977,
"eval_samples_per_second": 123.69,
"eval_steps_per_second": 3.868,
"step": 303
},
{
"epoch": 0.014724401821176014,
"grad_norm": 1.4661198854446411,
"learning_rate": 9.998609900222804e-06,
"loss": 8.9375,
"step": 304
},
{
"epoch": 0.014724401821176014,
"eval_accuracy": 0.09956171093834608,
"eval_loss": 8.9375,
"eval_runtime": 273.4012,
"eval_samples_per_second": 123.507,
"eval_steps_per_second": 3.862,
"step": 304
},
{
"epoch": 0.014772837353482515,
"grad_norm": 1.561277985572815,
"learning_rate": 9.998605056669574e-06,
"loss": 8.9766,
"step": 305
},
{
"epoch": 0.014772837353482515,
"eval_accuracy": 0.09941792181084791,
"eval_loss": 8.9375,
"eval_runtime": 273.3274,
"eval_samples_per_second": 123.54,
"eval_steps_per_second": 3.863,
"step": 305
},
{
"epoch": 0.014821272885789015,
"grad_norm": 1.5084242820739746,
"learning_rate": 9.998600213116343e-06,
"loss": 8.9375,
"step": 306
},
{
"epoch": 0.014821272885789015,
"eval_accuracy": 0.09944964979227292,
"eval_loss": 8.9375,
"eval_runtime": 272.8323,
"eval_samples_per_second": 123.765,
"eval_steps_per_second": 3.871,
"step": 306
},
{
"epoch": 0.014869708418095515,
"grad_norm": 1.533602237701416,
"learning_rate": 9.998595369563112e-06,
"loss": 8.9688,
"step": 307
},
{
"epoch": 0.014869708418095515,
"eval_accuracy": 0.09968031453314374,
"eval_loss": 8.9296875,
"eval_runtime": 273.2909,
"eval_samples_per_second": 123.557,
"eval_steps_per_second": 3.864,
"step": 307
},
{
"epoch": 0.014918143950402014,
"grad_norm": 1.527116060256958,
"learning_rate": 9.998590526009882e-06,
"loss": 8.9531,
"step": 308
},
{
"epoch": 0.014918143950402014,
"eval_accuracy": 0.099938480717768,
"eval_loss": 8.9296875,
"eval_runtime": 273.5392,
"eval_samples_per_second": 123.445,
"eval_steps_per_second": 3.861,
"step": 308
},
{
"epoch": 0.014966579482708514,
"grad_norm": 1.5343533754348755,
"learning_rate": 9.998585682456651e-06,
"loss": 8.9531,
"step": 309
},
{
"epoch": 0.014966579482708514,
"eval_accuracy": 0.10016824804310583,
"eval_loss": 8.921875,
"eval_runtime": 273.4771,
"eval_samples_per_second": 123.473,
"eval_steps_per_second": 3.861,
"step": 309
},
{
"epoch": 0.015015015015015015,
"grad_norm": 2.02919602394104,
"learning_rate": 9.99858083890342e-06,
"loss": 8.9062,
"step": 310
},
{
"epoch": 0.015015015015015015,
"eval_accuracy": 0.10033235929009336,
"eval_loss": 8.921875,
"eval_runtime": 272.9511,
"eval_samples_per_second": 123.711,
"eval_steps_per_second": 3.869,
"step": 310
},
{
"epoch": 0.015063450547321515,
"grad_norm": 1.50547456741333,
"learning_rate": 9.99857599535019e-06,
"loss": 8.9375,
"step": 311
},
{
"epoch": 0.015063450547321515,
"eval_accuracy": 0.10040134449058222,
"eval_loss": 8.9140625,
"eval_runtime": 274.4167,
"eval_samples_per_second": 123.05,
"eval_steps_per_second": 3.848,
"step": 311
},
{
"epoch": 0.015111886079628016,
"grad_norm": 1.5935693979263306,
"learning_rate": 9.99857115179696e-06,
"loss": 8.8828,
"step": 312
},
{
"epoch": 0.015111886079628016,
"eval_accuracy": 0.1003402333876733,
"eval_loss": 8.9140625,
"eval_runtime": 272.7238,
"eval_samples_per_second": 123.814,
"eval_steps_per_second": 3.872,
"step": 312
},
{
"epoch": 0.015160321611934514,
"grad_norm": 1.4832584857940674,
"learning_rate": 9.998566308243727e-06,
"loss": 8.9219,
"step": 313
},
{
"epoch": 0.015160321611934514,
"eval_accuracy": 0.10030963441288658,
"eval_loss": 8.90625,
"eval_runtime": 272.1097,
"eval_samples_per_second": 124.093,
"eval_steps_per_second": 3.881,
"step": 313
},
{
"epoch": 0.015208757144241015,
"grad_norm": 1.4832618236541748,
"learning_rate": 9.998561464690499e-06,
"loss": 8.9219,
"step": 314
},
{
"epoch": 0.015208757144241015,
"eval_accuracy": 0.10044369671396249,
"eval_loss": 8.90625,
"eval_runtime": 273.1456,
"eval_samples_per_second": 123.623,
"eval_steps_per_second": 3.866,
"step": 314
},
{
"epoch": 0.015257192676547515,
"grad_norm": 1.5148617029190063,
"learning_rate": 9.998556621137267e-06,
"loss": 8.9297,
"step": 315
},
{
"epoch": 0.015257192676547515,
"eval_accuracy": 0.10085141285482906,
"eval_loss": 8.90625,
"eval_runtime": 273.2988,
"eval_samples_per_second": 123.553,
"eval_steps_per_second": 3.864,
"step": 315
},
{
"epoch": 0.015305628208854016,
"grad_norm": 1.433423638343811,
"learning_rate": 9.998551777584037e-06,
"loss": 8.9922,
"step": 316
},
{
"epoch": 0.015305628208854016,
"eval_accuracy": 0.10107848793356786,
"eval_loss": 8.8984375,
"eval_runtime": 273.7559,
"eval_samples_per_second": 123.347,
"eval_steps_per_second": 3.857,
"step": 316
},
{
"epoch": 0.015354063741160516,
"grad_norm": 1.5502877235412598,
"learning_rate": 9.998546934030807e-06,
"loss": 8.9062,
"step": 317
},
{
"epoch": 0.015354063741160516,
"eval_accuracy": 0.10110254445963007,
"eval_loss": 8.8984375,
"eval_runtime": 273.1804,
"eval_samples_per_second": 123.607,
"eval_steps_per_second": 3.866,
"step": 317
},
{
"epoch": 0.015402499273467015,
"grad_norm": 1.4339704513549805,
"learning_rate": 9.998542090477575e-06,
"loss": 8.9297,
"step": 318
},
{
"epoch": 0.015402499273467015,
"eval_accuracy": 0.10105709470521651,
"eval_loss": 8.890625,
"eval_runtime": 272.4751,
"eval_samples_per_second": 123.927,
"eval_steps_per_second": 3.876,
"step": 318
},
{
"epoch": 0.015450934805773515,
"grad_norm": 1.5828499794006348,
"learning_rate": 9.998537246924345e-06,
"loss": 8.9531,
"step": 319
},
{
"epoch": 0.015450934805773515,
"eval_accuracy": 0.1008436256039136,
"eval_loss": 8.890625,
"eval_runtime": 273.6696,
"eval_samples_per_second": 123.386,
"eval_steps_per_second": 3.859,
"step": 319
},
{
"epoch": 0.015499370338080016,
"grad_norm": 1.4665073156356812,
"learning_rate": 9.998532403371115e-06,
"loss": 8.9531,
"step": 320
},
{
"epoch": 0.015499370338080016,
"eval_accuracy": 0.10056403724204985,
"eval_loss": 8.8828125,
"eval_runtime": 273.7537,
"eval_samples_per_second": 123.348,
"eval_steps_per_second": 3.857,
"step": 320
},
{
"epoch": 0.015547805870386516,
"grad_norm": 1.5187170505523682,
"learning_rate": 9.998527559817883e-06,
"loss": 8.9375,
"step": 321
},
{
"epoch": 0.015547805870386516,
"eval_accuracy": 0.10036628738701864,
"eval_loss": 8.8828125,
"eval_runtime": 274.5883,
"eval_samples_per_second": 122.973,
"eval_steps_per_second": 3.846,
"step": 321
},
{
"epoch": 0.015596241402693016,
"grad_norm": 1.6168104410171509,
"learning_rate": 9.998522716264653e-06,
"loss": 8.9219,
"step": 322
},
{
"epoch": 0.015596241402693016,
"eval_accuracy": 0.10023019866377143,
"eval_loss": 8.875,
"eval_runtime": 274.4874,
"eval_samples_per_second": 123.018,
"eval_steps_per_second": 3.847,
"step": 322
},
{
"epoch": 0.015644676934999515,
"grad_norm": 1.5896227359771729,
"learning_rate": 9.998517872711423e-06,
"loss": 8.9062,
"step": 323
},
{
"epoch": 0.015644676934999515,
"eval_accuracy": 0.10039622053737764,
"eval_loss": 8.875,
"eval_runtime": 273.3822,
"eval_samples_per_second": 123.516,
"eval_steps_per_second": 3.863,
"step": 323
},
{
"epoch": 0.015693112467306015,
"grad_norm": 1.476304531097412,
"learning_rate": 9.99851302915819e-06,
"loss": 8.8906,
"step": 324
},
{
"epoch": 0.015693112467306015,
"eval_accuracy": 0.10063834903796041,
"eval_loss": 8.875,
"eval_runtime": 273.0848,
"eval_samples_per_second": 123.65,
"eval_steps_per_second": 3.867,
"step": 324
},
{
"epoch": 0.015741547999612516,
"grad_norm": 1.493653655052185,
"learning_rate": 9.99850818560496e-06,
"loss": 8.8906,
"step": 325
},
{
"epoch": 0.015741547999612516,
"eval_accuracy": 0.10110543934844622,
"eval_loss": 8.8671875,
"eval_runtime": 273.5247,
"eval_samples_per_second": 123.451,
"eval_steps_per_second": 3.861,
"step": 325
},
{
"epoch": 0.015789983531919016,
"grad_norm": 1.5021024942398071,
"learning_rate": 9.998503342051729e-06,
"loss": 8.8672,
"step": 326
},
{
"epoch": 0.015789983531919016,
"eval_accuracy": 0.10160095746710661,
"eval_loss": 8.8671875,
"eval_runtime": 273.3554,
"eval_samples_per_second": 123.528,
"eval_steps_per_second": 3.863,
"step": 326
},
{
"epoch": 0.015838419064225517,
"grad_norm": 1.4941586256027222,
"learning_rate": 9.998498498498499e-06,
"loss": 8.875,
"step": 327
},
{
"epoch": 0.015838419064225517,
"eval_accuracy": 0.1018975967240975,
"eval_loss": 8.859375,
"eval_runtime": 272.1708,
"eval_samples_per_second": 124.065,
"eval_steps_per_second": 3.88,
"step": 327
},
{
"epoch": 0.015886854596532017,
"grad_norm": 1.484066128730774,
"learning_rate": 9.998493654945269e-06,
"loss": 8.8516,
"step": 328
},
{
"epoch": 0.015886854596532017,
"eval_accuracy": 0.10215561816428094,
"eval_loss": 8.859375,
"eval_runtime": 272.903,
"eval_samples_per_second": 123.733,
"eval_steps_per_second": 3.87,
"step": 328
},
{
"epoch": 0.015935290128838518,
"grad_norm": 1.6349196434020996,
"learning_rate": 9.998488811392037e-06,
"loss": 8.8672,
"step": 329
},
{
"epoch": 0.015935290128838518,
"eval_accuracy": 0.10202540606533052,
"eval_loss": 8.8515625,
"eval_runtime": 273.5376,
"eval_samples_per_second": 123.446,
"eval_steps_per_second": 3.861,
"step": 329
},
{
"epoch": 0.015983725661145014,
"grad_norm": 1.4366816282272339,
"learning_rate": 9.998483967838807e-06,
"loss": 8.8984,
"step": 330
},
{
"epoch": 0.015983725661145014,
"eval_accuracy": 0.10181083690627749,
"eval_loss": 8.8515625,
"eval_runtime": 273.8934,
"eval_samples_per_second": 123.285,
"eval_steps_per_second": 3.856,
"step": 330
},
{
"epoch": 0.016032161193451515,
"grad_norm": 1.4731358289718628,
"learning_rate": 9.998479124285576e-06,
"loss": 8.875,
"step": 331
},
{
"epoch": 0.016032161193451515,
"eval_accuracy": 0.10155426291050211,
"eval_loss": 8.84375,
"eval_runtime": 273.3318,
"eval_samples_per_second": 123.538,
"eval_steps_per_second": 3.863,
"step": 331
},
{
"epoch": 0.016080596725758015,
"grad_norm": 1.4572798013687134,
"learning_rate": 9.998474280732346e-06,
"loss": 8.8828,
"step": 332
},
{
"epoch": 0.016080596725758015,
"eval_accuracy": 0.10136490823303775,
"eval_loss": 8.84375,
"eval_runtime": 273.42,
"eval_samples_per_second": 123.499,
"eval_steps_per_second": 3.862,
"step": 332
},
{
"epoch": 0.016129032258064516,
"grad_norm": 1.5042625665664673,
"learning_rate": 9.998469437179116e-06,
"loss": 8.8438,
"step": 333
},
{
"epoch": 0.016129032258064516,
"eval_accuracy": 0.10138867527021833,
"eval_loss": 8.8359375,
"eval_runtime": 273.8134,
"eval_samples_per_second": 123.321,
"eval_steps_per_second": 3.857,
"step": 333
},
{
"epoch": 0.016177467790371016,
"grad_norm": 1.9862890243530273,
"learning_rate": 9.998464593625884e-06,
"loss": 8.7969,
"step": 334
},
{
"epoch": 0.016177467790371016,
"eval_accuracy": 0.10165384708577767,
"eval_loss": 8.8359375,
"eval_runtime": 273.5642,
"eval_samples_per_second": 123.434,
"eval_steps_per_second": 3.86,
"step": 334
},
{
"epoch": 0.016225903322677517,
"grad_norm": 1.5006351470947266,
"learning_rate": 9.998459750072654e-06,
"loss": 8.8828,
"step": 335
},
{
"epoch": 0.016225903322677517,
"eval_accuracy": 0.10197017158671838,
"eval_loss": 8.828125,
"eval_runtime": 273.8604,
"eval_samples_per_second": 123.3,
"eval_steps_per_second": 3.856,
"step": 335
},
{
"epoch": 0.016274338854984017,
"grad_norm": 1.5454577207565308,
"learning_rate": 9.998454906519424e-06,
"loss": 8.8281,
"step": 336
},
{
"epoch": 0.016274338854984017,
"eval_accuracy": 0.10246210004324674,
"eval_loss": 8.828125,
"eval_runtime": 273.8457,
"eval_samples_per_second": 123.307,
"eval_steps_per_second": 3.856,
"step": 336
},
{
"epoch": 0.016322774387290517,
"grad_norm": 1.6074914932250977,
"learning_rate": 9.998450062966192e-06,
"loss": 8.8203,
"step": 337
},
{
"epoch": 0.016322774387290517,
"eval_accuracy": 0.10273896720962333,
"eval_loss": 8.828125,
"eval_runtime": 273.8399,
"eval_samples_per_second": 123.309,
"eval_steps_per_second": 3.856,
"step": 337
},
{
"epoch": 0.016371209919597018,
"grad_norm": 1.5352425575256348,
"learning_rate": 9.998445219412962e-06,
"loss": 8.8594,
"step": 338
},
{
"epoch": 0.016371209919597018,
"eval_accuracy": 0.10277469013761462,
"eval_loss": 8.8203125,
"eval_runtime": 273.8619,
"eval_samples_per_second": 123.299,
"eval_steps_per_second": 3.856,
"step": 338
},
{
"epoch": 0.016419645451903518,
"grad_norm": 1.5608229637145996,
"learning_rate": 9.998440375859732e-06,
"loss": 8.8594,
"step": 339
},
{
"epoch": 0.016419645451903518,
"eval_accuracy": 0.10268248792882025,
"eval_loss": 8.8203125,
"eval_runtime": 273.0524,
"eval_samples_per_second": 123.665,
"eval_steps_per_second": 3.867,
"step": 339
},
{
"epoch": 0.016468080984210015,
"grad_norm": 1.4532408714294434,
"learning_rate": 9.9984355323065e-06,
"loss": 8.8203,
"step": 340
},
{
"epoch": 0.016468080984210015,
"eval_accuracy": 0.10246901882751734,
"eval_loss": 8.8125,
"eval_runtime": 273.0044,
"eval_samples_per_second": 123.687,
"eval_steps_per_second": 3.868,
"step": 340
},
{
"epoch": 0.016516516516516516,
"grad_norm": 1.5474005937576294,
"learning_rate": 9.99843068875327e-06,
"loss": 8.8359,
"step": 341
},
{
"epoch": 0.016516516516516516,
"eval_accuracy": 0.10244681503029747,
"eval_loss": 8.8125,
"eval_runtime": 272.933,
"eval_samples_per_second": 123.719,
"eval_steps_per_second": 3.869,
"step": 341
},
{
"epoch": 0.016516516516516516,
"step": 341,
"total_flos": 1427643152990208.0,
"train_loss": 9.567586143695015,
"train_runtime": 93928.2806,
"train_samples_per_second": 703.348,
"train_steps_per_second": 21.981
}
],
"logging_steps": 1,
"max_steps": 2064600,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1427643152990208.0,
"train_batch_size": 32,
"trial_name": null,
"trial_params": null
}