{
"best_metric": 1.3305245637893677,
"best_model_checkpoint": "./results/models/checkpoint-182628",
"epoch": 19.0,
"eval_steps": 500,
"global_step": 182628,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.05201831044527674,
"grad_norm": 1.3984375,
"learning_rate": 0.001997919267582189,
"loss": 2.3433,
"step": 500
},
{
"epoch": 0.10403662089055347,
"grad_norm": 0.92578125,
"learning_rate": 0.001995838535164378,
"loss": 1.9524,
"step": 1000
},
{
"epoch": 0.1560549313358302,
"grad_norm": 0.6171875,
"learning_rate": 0.001993757802746567,
"loss": 1.8833,
"step": 1500
},
{
"epoch": 0.20807324178110695,
"grad_norm": 0.80859375,
"learning_rate": 0.0019916770703287557,
"loss": 1.8518,
"step": 2000
},
{
"epoch": 0.2600915522263837,
"grad_norm": 0.494140625,
"learning_rate": 0.0019895963379109446,
"loss": 1.8064,
"step": 2500
},
{
"epoch": 0.3121098626716604,
"grad_norm": 0.61328125,
"learning_rate": 0.0019875156054931335,
"loss": 1.7881,
"step": 3000
},
{
"epoch": 0.3641281731169372,
"grad_norm": 0.71484375,
"learning_rate": 0.0019854348730753224,
"loss": 1.7552,
"step": 3500
},
{
"epoch": 0.4161464835622139,
"grad_norm": 0.34375,
"learning_rate": 0.0019833541406575114,
"loss": 1.7362,
"step": 4000
},
{
"epoch": 0.4681647940074906,
"grad_norm": 0.310546875,
"learning_rate": 0.0019812734082397003,
"loss": 1.7302,
"step": 4500
},
{
"epoch": 0.5201831044527674,
"grad_norm": 0.33203125,
"learning_rate": 0.0019791926758218896,
"loss": 1.6997,
"step": 5000
},
{
"epoch": 0.5722014148980441,
"grad_norm": 0.41796875,
"learning_rate": 0.001977111943404078,
"loss": 1.6694,
"step": 5500
},
{
"epoch": 0.6242197253433208,
"grad_norm": 0.71484375,
"learning_rate": 0.001975031210986267,
"loss": 1.6882,
"step": 6000
},
{
"epoch": 0.6762380357885975,
"grad_norm": 0.40625,
"learning_rate": 0.0019729504785684564,
"loss": 1.6712,
"step": 6500
},
{
"epoch": 0.7282563462338744,
"grad_norm": 1.21875,
"learning_rate": 0.0019708697461506453,
"loss": 1.6519,
"step": 7000
},
{
"epoch": 0.7802746566791511,
"grad_norm": 0.27734375,
"learning_rate": 0.0019687890137328337,
"loss": 1.6525,
"step": 7500
},
{
"epoch": 0.8322929671244278,
"grad_norm": 0.408203125,
"learning_rate": 0.001966708281315023,
"loss": 1.6239,
"step": 8000
},
{
"epoch": 0.8843112775697045,
"grad_norm": 0.546875,
"learning_rate": 0.001964627548897212,
"loss": 1.6152,
"step": 8500
},
{
"epoch": 0.9363295880149812,
"grad_norm": 0.609375,
"learning_rate": 0.0019625468164794005,
"loss": 1.6053,
"step": 9000
},
{
"epoch": 0.9883478984602581,
"grad_norm": 0.86328125,
"learning_rate": 0.00196046608406159,
"loss": 1.6078,
"step": 9500
},
{
"epoch": 1.0,
"eval_loss": 1.5721051692962646,
"eval_runtime": 1.4853,
"eval_samples_per_second": 673.283,
"eval_steps_per_second": 0.673,
"step": 9612
},
{
"epoch": 1.0403662089055348,
"grad_norm": 0.31640625,
"learning_rate": 0.0019583853516437788,
"loss": 1.5953,
"step": 10000
},
{
"epoch": 1.0923845193508115,
"grad_norm": 0.484375,
"learning_rate": 0.0019563046192259677,
"loss": 1.587,
"step": 10500
},
{
"epoch": 1.1444028297960882,
"grad_norm": 0.5625,
"learning_rate": 0.0019542238868081566,
"loss": 1.5759,
"step": 11000
},
{
"epoch": 1.196421140241365,
"grad_norm": 0.6484375,
"learning_rate": 0.0019521431543903455,
"loss": 1.5785,
"step": 11500
},
{
"epoch": 1.2484394506866416,
"grad_norm": 0.3984375,
"learning_rate": 0.0019500624219725344,
"loss": 1.5734,
"step": 12000
},
{
"epoch": 1.3004577611319184,
"grad_norm": 0.3359375,
"learning_rate": 0.0019479816895547233,
"loss": 1.5593,
"step": 12500
},
{
"epoch": 1.352476071577195,
"grad_norm": 0.408203125,
"learning_rate": 0.0019459009571369122,
"loss": 1.5589,
"step": 13000
},
{
"epoch": 1.404494382022472,
"grad_norm": 0.4296875,
"learning_rate": 0.0019438202247191011,
"loss": 1.555,
"step": 13500
},
{
"epoch": 1.4565126924677487,
"grad_norm": 0.310546875,
"learning_rate": 0.00194173949230129,
"loss": 1.548,
"step": 14000
},
{
"epoch": 1.5085310029130254,
"grad_norm": 0.5234375,
"learning_rate": 0.001939658759883479,
"loss": 1.5519,
"step": 14500
},
{
"epoch": 1.5605493133583022,
"grad_norm": 0.81640625,
"learning_rate": 0.001937578027465668,
"loss": 1.5534,
"step": 15000
},
{
"epoch": 1.6125676238035789,
"grad_norm": 0.275390625,
"learning_rate": 0.0019354972950478568,
"loss": 1.5408,
"step": 15500
},
{
"epoch": 1.6645859342488556,
"grad_norm": 0.421875,
"learning_rate": 0.0019334165626300457,
"loss": 1.5339,
"step": 16000
},
{
"epoch": 1.7166042446941323,
"grad_norm": 0.337890625,
"learning_rate": 0.0019313358302122348,
"loss": 1.5238,
"step": 16500
},
{
"epoch": 1.768622555139409,
"grad_norm": 0.474609375,
"learning_rate": 0.0019292550977944235,
"loss": 1.527,
"step": 17000
},
{
"epoch": 1.8206408655846857,
"grad_norm": 0.345703125,
"learning_rate": 0.0019271743653766125,
"loss": 1.5253,
"step": 17500
},
{
"epoch": 1.8726591760299627,
"grad_norm": 0.53125,
"learning_rate": 0.0019250936329588016,
"loss": 1.5217,
"step": 18000
},
{
"epoch": 1.9246774864752392,
"grad_norm": 0.416015625,
"learning_rate": 0.0019230129005409905,
"loss": 1.5201,
"step": 18500
},
{
"epoch": 1.9766957969205161,
"grad_norm": 0.212890625,
"learning_rate": 0.0019209321681231794,
"loss": 1.5157,
"step": 19000
},
{
"epoch": 2.0,
"eval_loss": 1.4967154264450073,
"eval_runtime": 1.4155,
"eval_samples_per_second": 706.48,
"eval_steps_per_second": 0.706,
"step": 19224
},
{
"epoch": 2.0287141073657926,
"grad_norm": 0.640625,
"learning_rate": 0.0019188514357053683,
"loss": 1.5093,
"step": 19500
},
{
"epoch": 2.0807324178110695,
"grad_norm": 0.349609375,
"learning_rate": 0.0019167707032875572,
"loss": 1.5081,
"step": 20000
},
{
"epoch": 2.132750728256346,
"grad_norm": 0.447265625,
"learning_rate": 0.0019146899708697464,
"loss": 1.5137,
"step": 20500
},
{
"epoch": 2.184769038701623,
"grad_norm": 0.30078125,
"learning_rate": 0.001912609238451935,
"loss": 1.5052,
"step": 21000
},
{
"epoch": 2.2367873491468995,
"grad_norm": 0.423828125,
"learning_rate": 0.001910528506034124,
"loss": 1.4989,
"step": 21500
},
{
"epoch": 2.2888056595921764,
"grad_norm": 0.322265625,
"learning_rate": 0.0019084477736163131,
"loss": 1.4933,
"step": 22000
},
{
"epoch": 2.3408239700374533,
"grad_norm": 0.287109375,
"learning_rate": 0.0019063670411985018,
"loss": 1.4908,
"step": 22500
},
{
"epoch": 2.39284228048273,
"grad_norm": 0.60546875,
"learning_rate": 0.0019042863087806907,
"loss": 1.483,
"step": 23000
},
{
"epoch": 2.444860590928007,
"grad_norm": 0.37890625,
"learning_rate": 0.0019022055763628799,
"loss": 1.4808,
"step": 23500
},
{
"epoch": 2.4968789013732833,
"grad_norm": 0.28125,
"learning_rate": 0.0019001248439450688,
"loss": 1.4751,
"step": 24000
},
{
"epoch": 2.54889721181856,
"grad_norm": 0.26953125,
"learning_rate": 0.0018980441115272575,
"loss": 1.4713,
"step": 24500
},
{
"epoch": 2.6009155222638367,
"grad_norm": 0.2392578125,
"learning_rate": 0.0018959633791094466,
"loss": 1.4743,
"step": 25000
},
{
"epoch": 2.6529338327091136,
"grad_norm": 0.255859375,
"learning_rate": 0.0018938826466916355,
"loss": 1.4703,
"step": 25500
},
{
"epoch": 2.70495214315439,
"grad_norm": 0.333984375,
"learning_rate": 0.0018918019142738244,
"loss": 1.4722,
"step": 26000
},
{
"epoch": 2.756970453599667,
"grad_norm": 0.87890625,
"learning_rate": 0.0018897211818560133,
"loss": 1.4728,
"step": 26500
},
{
"epoch": 2.808988764044944,
"grad_norm": 0.435546875,
"learning_rate": 0.0018876404494382023,
"loss": 1.4738,
"step": 27000
},
{
"epoch": 2.8610070744902205,
"grad_norm": 1.7265625,
"learning_rate": 0.0018855597170203914,
"loss": 1.4717,
"step": 27500
},
{
"epoch": 2.9130253849354975,
"grad_norm": 0.361328125,
"learning_rate": 0.00188347898460258,
"loss": 1.473,
"step": 28000
},
{
"epoch": 2.965043695380774,
"grad_norm": 0.3984375,
"learning_rate": 0.001881398252184769,
"loss": 1.472,
"step": 28500
},
{
"epoch": 3.0,
"eval_loss": 1.4684182405471802,
"eval_runtime": 1.4391,
"eval_samples_per_second": 694.9,
"eval_steps_per_second": 0.695,
"step": 28836
},
{
"epoch": 3.017062005826051,
"grad_norm": 0.23046875,
"learning_rate": 0.0018793175197669581,
"loss": 1.4727,
"step": 29000
},
{
"epoch": 3.0690803162713274,
"grad_norm": 0.73046875,
"learning_rate": 0.001877236787349147,
"loss": 1.4677,
"step": 29500
},
{
"epoch": 3.1210986267166043,
"grad_norm": 0.470703125,
"learning_rate": 0.0018751560549313357,
"loss": 1.4668,
"step": 30000
},
{
"epoch": 3.173116937161881,
"grad_norm": 1.2578125,
"learning_rate": 0.0018730753225135249,
"loss": 1.4653,
"step": 30500
},
{
"epoch": 3.2251352476071578,
"grad_norm": 0.2890625,
"learning_rate": 0.0018709945900957138,
"loss": 1.4667,
"step": 31000
},
{
"epoch": 3.2771535580524347,
"grad_norm": 0.361328125,
"learning_rate": 0.0018689138576779025,
"loss": 1.4613,
"step": 31500
},
{
"epoch": 3.329171868497711,
"grad_norm": 0.462890625,
"learning_rate": 0.0018668331252600916,
"loss": 1.4604,
"step": 32000
},
{
"epoch": 3.381190178942988,
"grad_norm": 0.5546875,
"learning_rate": 0.0018647523928422805,
"loss": 1.4672,
"step": 32500
},
{
"epoch": 3.4332084893882646,
"grad_norm": 2.0625,
"learning_rate": 0.0018626716604244697,
"loss": 1.465,
"step": 33000
},
{
"epoch": 3.4852267998335416,
"grad_norm": 0.39453125,
"learning_rate": 0.0018605909280066584,
"loss": 1.4604,
"step": 33500
},
{
"epoch": 3.537245110278818,
"grad_norm": 0.255859375,
"learning_rate": 0.0018585101955888473,
"loss": 1.4539,
"step": 34000
},
{
"epoch": 3.589263420724095,
"grad_norm": 0.59375,
"learning_rate": 0.0018564294631710364,
"loss": 1.4536,
"step": 34500
},
{
"epoch": 3.6412817311693715,
"grad_norm": 0.30078125,
"learning_rate": 0.001854348730753225,
"loss": 1.4549,
"step": 35000
},
{
"epoch": 3.6933000416146484,
"grad_norm": 0.345703125,
"learning_rate": 0.001852267998335414,
"loss": 1.4571,
"step": 35500
},
{
"epoch": 3.7453183520599254,
"grad_norm": 0.376953125,
"learning_rate": 0.0018501872659176031,
"loss": 1.4584,
"step": 36000
},
{
"epoch": 3.797336662505202,
"grad_norm": 0.53515625,
"learning_rate": 0.001848106533499792,
"loss": 1.4542,
"step": 36500
},
{
"epoch": 3.8493549729504783,
"grad_norm": 0.3515625,
"learning_rate": 0.0018460258010819808,
"loss": 1.4588,
"step": 37000
},
{
"epoch": 3.9013732833957553,
"grad_norm": 0.279296875,
"learning_rate": 0.0018439450686641699,
"loss": 1.456,
"step": 37500
},
{
"epoch": 3.9533915938410322,
"grad_norm": 0.31640625,
"learning_rate": 0.0018418643362463588,
"loss": 1.4596,
"step": 38000
},
{
"epoch": 4.0,
"eval_loss": 1.4403541088104248,
"eval_runtime": 1.4124,
"eval_samples_per_second": 708.019,
"eval_steps_per_second": 0.708,
"step": 38448
},
{
"epoch": 4.005409904286309,
"grad_norm": 0.259765625,
"learning_rate": 0.0018397836038285475,
"loss": 1.449,
"step": 38500
},
{
"epoch": 4.057428214731585,
"grad_norm": 0.349609375,
"learning_rate": 0.0018377028714107366,
"loss": 1.445,
"step": 39000
},
{
"epoch": 4.109446525176862,
"grad_norm": 0.330078125,
"learning_rate": 0.0018356221389929255,
"loss": 1.4441,
"step": 39500
},
{
"epoch": 4.161464835622139,
"grad_norm": 0.283203125,
"learning_rate": 0.0018335414065751145,
"loss": 1.4423,
"step": 40000
},
{
"epoch": 4.213483146067416,
"grad_norm": 0.54296875,
"learning_rate": 0.0018314606741573034,
"loss": 1.442,
"step": 40500
},
{
"epoch": 4.265501456512692,
"grad_norm": 0.23046875,
"learning_rate": 0.0018293799417394923,
"loss": 1.4348,
"step": 41000
},
{
"epoch": 4.317519766957969,
"grad_norm": 0.50390625,
"learning_rate": 0.0018272992093216814,
"loss": 1.4352,
"step": 41500
},
{
"epoch": 4.369538077403246,
"grad_norm": 0.4765625,
"learning_rate": 0.0018252184769038703,
"loss": 1.4344,
"step": 42000
},
{
"epoch": 4.421556387848523,
"grad_norm": 0.400390625,
"learning_rate": 0.001823137744486059,
"loss": 1.438,
"step": 42500
},
{
"epoch": 4.473574698293799,
"grad_norm": 0.42578125,
"learning_rate": 0.0018210570120682482,
"loss": 1.4417,
"step": 43000
},
{
"epoch": 4.525593008739076,
"grad_norm": 1.2265625,
"learning_rate": 0.001818976279650437,
"loss": 1.436,
"step": 43500
},
{
"epoch": 4.577611319184353,
"grad_norm": 0.48828125,
"learning_rate": 0.0018168955472326258,
"loss": 1.4367,
"step": 44000
},
{
"epoch": 4.62962962962963,
"grad_norm": 0.37109375,
"learning_rate": 0.001814814814814815,
"loss": 1.4299,
"step": 44500
},
{
"epoch": 4.681647940074907,
"grad_norm": 0.31640625,
"learning_rate": 0.0018127340823970038,
"loss": 1.4332,
"step": 45000
},
{
"epoch": 4.733666250520183,
"grad_norm": 0.455078125,
"learning_rate": 0.0018106533499791927,
"loss": 1.4269,
"step": 45500
},
{
"epoch": 4.78568456096546,
"grad_norm": 0.671875,
"learning_rate": 0.0018085726175613816,
"loss": 1.4276,
"step": 46000
},
{
"epoch": 4.837702871410737,
"grad_norm": 1.0078125,
"learning_rate": 0.0018064918851435705,
"loss": 1.4288,
"step": 46500
},
{
"epoch": 4.889721181856014,
"grad_norm": 1.34375,
"learning_rate": 0.0018044111527257595,
"loss": 1.4351,
"step": 47000
},
{
"epoch": 4.94173949230129,
"grad_norm": 0.470703125,
"learning_rate": 0.0018023304203079484,
"loss": 1.4307,
"step": 47500
},
{
"epoch": 4.9937578027465666,
"grad_norm": 0.2470703125,
"learning_rate": 0.0018002496878901373,
"loss": 1.4257,
"step": 48000
},
{
"epoch": 5.0,
"eval_loss": 1.4106667041778564,
"eval_runtime": 1.4218,
"eval_samples_per_second": 703.325,
"eval_steps_per_second": 0.703,
"step": 48060
},
{
"epoch": 5.0457761131918435,
"grad_norm": 0.345703125,
"learning_rate": 0.0017981689554723264,
"loss": 1.4193,
"step": 48500
},
{
"epoch": 5.09779442363712,
"grad_norm": 0.322265625,
"learning_rate": 0.0017960882230545153,
"loss": 1.4176,
"step": 49000
},
{
"epoch": 5.149812734082397,
"grad_norm": 0.27734375,
"learning_rate": 0.001794007490636704,
"loss": 1.4157,
"step": 49500
},
{
"epoch": 5.201831044527673,
"grad_norm": 0.333984375,
"learning_rate": 0.0017919267582188932,
"loss": 1.4186,
"step": 50000
},
{
"epoch": 5.25384935497295,
"grad_norm": 0.2734375,
"learning_rate": 0.001789846025801082,
"loss": 1.4155,
"step": 50500
},
{
"epoch": 5.305867665418227,
"grad_norm": 2.640625,
"learning_rate": 0.0017877652933832708,
"loss": 1.4142,
"step": 51000
},
{
"epoch": 5.357885975863504,
"grad_norm": 0.2431640625,
"learning_rate": 0.00178568456096546,
"loss": 1.4155,
"step": 51500
},
{
"epoch": 5.40990428630878,
"grad_norm": 0.244140625,
"learning_rate": 0.0017836038285476488,
"loss": 1.4113,
"step": 52000
},
{
"epoch": 5.461922596754057,
"grad_norm": 0.21484375,
"learning_rate": 0.0017815230961298377,
"loss": 1.4145,
"step": 52500
},
{
"epoch": 5.513940907199334,
"grad_norm": 0.42578125,
"learning_rate": 0.0017794423637120266,
"loss": 1.4132,
"step": 53000
},
{
"epoch": 5.565959217644611,
"grad_norm": 0.2060546875,
"learning_rate": 0.0017773616312942156,
"loss": 1.4141,
"step": 53500
},
{
"epoch": 5.617977528089888,
"grad_norm": 0.51171875,
"learning_rate": 0.0017752808988764045,
"loss": 1.4118,
"step": 54000
},
{
"epoch": 5.669995838535164,
"grad_norm": 0.73828125,
"learning_rate": 0.0017732001664585936,
"loss": 1.4094,
"step": 54500
},
{
"epoch": 5.722014148980441,
"grad_norm": 0.390625,
"learning_rate": 0.0017711194340407823,
"loss": 1.4088,
"step": 55000
},
{
"epoch": 5.774032459425718,
"grad_norm": 2.34375,
"learning_rate": 0.0017690387016229714,
"loss": 1.4068,
"step": 55500
},
{
"epoch": 5.826050769870995,
"grad_norm": 0.265625,
"learning_rate": 0.0017669579692051603,
"loss": 1.4059,
"step": 56000
},
{
"epoch": 5.878069080316271,
"grad_norm": 0.283203125,
"learning_rate": 0.001764877236787349,
"loss": 1.4041,
"step": 56500
},
{
"epoch": 5.930087390761548,
"grad_norm": 0.77734375,
"learning_rate": 0.0017627965043695382,
"loss": 1.4024,
"step": 57000
},
{
"epoch": 5.982105701206825,
"grad_norm": 0.30859375,
"learning_rate": 0.001760715771951727,
"loss": 1.4006,
"step": 57500
},
{
"epoch": 6.0,
"eval_loss": 1.3921489715576172,
"eval_runtime": 1.4219,
"eval_samples_per_second": 703.273,
"eval_steps_per_second": 0.703,
"step": 57672
},
{
"epoch": 6.034124011652102,
"grad_norm": 0.255859375,
"learning_rate": 0.001758635039533916,
"loss": 1.4001,
"step": 58000
},
{
"epoch": 6.086142322097379,
"grad_norm": 0.2431640625,
"learning_rate": 0.001756554307116105,
"loss": 1.3981,
"step": 58500
},
{
"epoch": 6.138160632542655,
"grad_norm": 0.4296875,
"learning_rate": 0.0017544735746982938,
"loss": 1.4018,
"step": 59000
},
{
"epoch": 6.190178942987932,
"grad_norm": 0.85546875,
"learning_rate": 0.0017523928422804827,
"loss": 1.397,
"step": 59500
},
{
"epoch": 6.242197253433209,
"grad_norm": 0.361328125,
"learning_rate": 0.0017503121098626717,
"loss": 1.3974,
"step": 60000
},
{
"epoch": 6.294215563878486,
"grad_norm": 3.578125,
"learning_rate": 0.0017482313774448606,
"loss": 1.397,
"step": 60500
},
{
"epoch": 6.346233874323762,
"grad_norm": 0.2451171875,
"learning_rate": 0.0017461506450270495,
"loss": 1.3971,
"step": 61000
},
{
"epoch": 6.398252184769039,
"grad_norm": 0.53125,
"learning_rate": 0.0017440699126092386,
"loss": 1.3955,
"step": 61500
},
{
"epoch": 6.4502704952143155,
"grad_norm": 0.24609375,
"learning_rate": 0.0017419891801914273,
"loss": 1.3959,
"step": 62000
},
{
"epoch": 6.502288805659592,
"grad_norm": 0.390625,
"learning_rate": 0.0017399084477736164,
"loss": 1.3951,
"step": 62500
},
{
"epoch": 6.554307116104869,
"grad_norm": 0.3125,
"learning_rate": 0.0017378277153558054,
"loss": 1.3977,
"step": 63000
},
{
"epoch": 6.606325426550145,
"grad_norm": 0.279296875,
"learning_rate": 0.001735746982937994,
"loss": 1.3976,
"step": 63500
},
{
"epoch": 6.658343736995422,
"grad_norm": 0.83984375,
"learning_rate": 0.0017336662505201832,
"loss": 1.3964,
"step": 64000
},
{
"epoch": 6.710362047440699,
"grad_norm": 0.357421875,
"learning_rate": 0.001731585518102372,
"loss": 1.396,
"step": 64500
},
{
"epoch": 6.762380357885976,
"grad_norm": 0.27734375,
"learning_rate": 0.001729504785684561,
"loss": 1.3931,
"step": 65000
},
{
"epoch": 6.814398668331252,
"grad_norm": 0.330078125,
"learning_rate": 0.00172742405326675,
"loss": 1.3931,
"step": 65500
},
{
"epoch": 6.866416978776529,
"grad_norm": 0.40625,
"learning_rate": 0.0017253433208489388,
"loss": 1.388,
"step": 66000
},
{
"epoch": 6.918435289221806,
"grad_norm": 0.2578125,
"learning_rate": 0.0017232625884311278,
"loss": 1.3862,
"step": 66500
},
{
"epoch": 6.970453599667083,
"grad_norm": 0.3984375,
"learning_rate": 0.0017211818560133169,
"loss": 1.3865,
"step": 67000
},
{
"epoch": 7.0,
"eval_loss": 1.3738893270492554,
"eval_runtime": 1.4206,
"eval_samples_per_second": 703.936,
"eval_steps_per_second": 0.704,
"step": 67284
},
{
"epoch": 7.022471910112359,
"grad_norm": 0.298828125,
"learning_rate": 0.0017191011235955056,
"loss": 1.385,
"step": 67500
},
{
"epoch": 7.074490220557636,
"grad_norm": 0.39453125,
"learning_rate": 0.0017170203911776945,
"loss": 1.3827,
"step": 68000
},
{
"epoch": 7.126508531002913,
"grad_norm": 0.609375,
"learning_rate": 0.0017149396587598836,
"loss": 1.3864,
"step": 68500
},
{
"epoch": 7.17852684144819,
"grad_norm": 0.51953125,
"learning_rate": 0.0017128589263420723,
"loss": 1.3874,
"step": 69000
},
{
"epoch": 7.230545151893467,
"grad_norm": 0.326171875,
"learning_rate": 0.0017107781939242615,
"loss": 1.3913,
"step": 69500
},
{
"epoch": 7.282563462338743,
"grad_norm": 0.69921875,
"learning_rate": 0.0017086974615064504,
"loss": 1.3892,
"step": 70000
},
{
"epoch": 7.33458177278402,
"grad_norm": 0.38671875,
"learning_rate": 0.0017066167290886393,
"loss": 1.3888,
"step": 70500
},
{
"epoch": 7.386600083229297,
"grad_norm": 0.259765625,
"learning_rate": 0.0017045359966708282,
"loss": 1.3908,
"step": 71000
},
{
"epoch": 7.438618393674574,
"grad_norm": 0.419921875,
"learning_rate": 0.001702455264253017,
"loss": 1.3879,
"step": 71500
},
{
"epoch": 7.49063670411985,
"grad_norm": 0.2197265625,
"learning_rate": 0.001700374531835206,
"loss": 1.3873,
"step": 72000
},
{
"epoch": 7.542655014565127,
"grad_norm": 0.2119140625,
"learning_rate": 0.001698293799417395,
"loss": 1.3845,
"step": 72500
},
{
"epoch": 7.594673325010404,
"grad_norm": 0.34375,
"learning_rate": 0.0016962130669995838,
"loss": 1.3832,
"step": 73000
},
{
"epoch": 7.646691635455681,
"grad_norm": 0.55078125,
"learning_rate": 0.0016941323345817728,
"loss": 1.38,
"step": 73500
},
{
"epoch": 7.698709945900957,
"grad_norm": 0.240234375,
"learning_rate": 0.001692051602163962,
"loss": 1.3798,
"step": 74000
},
{
"epoch": 7.750728256346234,
"grad_norm": 0.2490234375,
"learning_rate": 0.0016899708697461506,
"loss": 1.3775,
"step": 74500
},
{
"epoch": 7.802746566791511,
"grad_norm": 1.015625,
"learning_rate": 0.0016878901373283395,
"loss": 1.3797,
"step": 75000
},
{
"epoch": 7.8547648772367875,
"grad_norm": 0.263671875,
"learning_rate": 0.0016858094049105286,
"loss": 1.3813,
"step": 75500
},
{
"epoch": 7.9067831876820645,
"grad_norm": 0.271484375,
"learning_rate": 0.0016837286724927173,
"loss": 1.3779,
"step": 76000
},
{
"epoch": 7.9588014981273405,
"grad_norm": 0.6875,
"learning_rate": 0.0016816479400749065,
"loss": 1.3771,
"step": 76500
},
{
"epoch": 8.0,
"eval_loss": 1.3700777292251587,
"eval_runtime": 1.4197,
"eval_samples_per_second": 704.38,
"eval_steps_per_second": 0.704,
"step": 76896
},
{
"epoch": 8.010819808572618,
"grad_norm": 0.384765625,
"learning_rate": 0.0016795672076570954,
"loss": 1.3753,
"step": 77000
},
{
"epoch": 8.062838119017893,
"grad_norm": 0.376953125,
"learning_rate": 0.0016774864752392843,
"loss": 1.3763,
"step": 77500
},
{
"epoch": 8.11485642946317,
"grad_norm": 0.263671875,
"learning_rate": 0.0016754057428214732,
"loss": 1.3761,
"step": 78000
},
{
"epoch": 8.166874739908447,
"grad_norm": 0.392578125,
"learning_rate": 0.0016733250104036621,
"loss": 1.3786,
"step": 78500
},
{
"epoch": 8.218893050353724,
"grad_norm": 0.287109375,
"learning_rate": 0.001671244277985851,
"loss": 1.3786,
"step": 79000
},
{
"epoch": 8.270911360799001,
"grad_norm": 0.298828125,
"learning_rate": 0.0016691635455680402,
"loss": 1.3797,
"step": 79500
},
{
"epoch": 8.322929671244278,
"grad_norm": 0.341796875,
"learning_rate": 0.0016670828131502289,
"loss": 1.3791,
"step": 80000
},
{
"epoch": 8.374947981689555,
"grad_norm": 0.302734375,
"learning_rate": 0.0016650020807324178,
"loss": 1.3783,
"step": 80500
},
{
"epoch": 8.426966292134832,
"grad_norm": 0.318359375,
"learning_rate": 0.001662921348314607,
"loss": 1.376,
"step": 81000
},
{
"epoch": 8.478984602580109,
"grad_norm": 1.1796875,
"learning_rate": 0.0016608406158967956,
"loss": 1.3763,
"step": 81500
},
{
"epoch": 8.531002913025384,
"grad_norm": 0.408203125,
"learning_rate": 0.0016587598834789845,
"loss": 1.3757,
"step": 82000
},
{
"epoch": 8.583021223470661,
"grad_norm": 0.224609375,
"learning_rate": 0.0016566791510611736,
"loss": 1.377,
"step": 82500
},
{
"epoch": 8.635039533915938,
"grad_norm": 0.55859375,
"learning_rate": 0.0016545984186433626,
"loss": 1.3755,
"step": 83000
},
{
"epoch": 8.687057844361215,
"grad_norm": 0.23828125,
"learning_rate": 0.0016525176862255513,
"loss": 1.3739,
"step": 83500
},
{
"epoch": 8.739076154806492,
"grad_norm": 0.33984375,
"learning_rate": 0.0016504369538077404,
"loss": 1.3744,
"step": 84000
},
{
"epoch": 8.791094465251769,
"grad_norm": 0.36328125,
"learning_rate": 0.0016483562213899293,
"loss": 1.3733,
"step": 84500
},
{
"epoch": 8.843112775697046,
"grad_norm": 0.263671875,
"learning_rate": 0.0016462754889721182,
"loss": 1.3735,
"step": 85000
},
{
"epoch": 8.895131086142323,
"grad_norm": 0.478515625,
"learning_rate": 0.0016441947565543071,
"loss": 1.3754,
"step": 85500
},
{
"epoch": 8.947149396587598,
"grad_norm": 0.515625,
"learning_rate": 0.001642114024136496,
"loss": 1.3749,
"step": 86000
},
{
"epoch": 8.999167707032875,
"grad_norm": 0.5390625,
"learning_rate": 0.0016400332917186852,
"loss": 1.3739,
"step": 86500
},
{
"epoch": 9.0,
"eval_loss": 1.3678644895553589,
"eval_runtime": 1.4359,
"eval_samples_per_second": 696.406,
"eval_steps_per_second": 0.696,
"step": 86508
},
{
"epoch": 9.051186017478152,
"grad_norm": 0.26171875,
"learning_rate": 0.0016379525593008739,
"loss": 1.3717,
"step": 87000
},
{
"epoch": 9.103204327923429,
"grad_norm": 0.2373046875,
"learning_rate": 0.0016358718268830628,
"loss": 1.3705,
"step": 87500
},
{
"epoch": 9.155222638368706,
"grad_norm": 0.2578125,
"learning_rate": 0.001633791094465252,
"loss": 1.3719,
"step": 88000
},
{
"epoch": 9.207240948813983,
"grad_norm": 0.32421875,
"learning_rate": 0.0016317103620474408,
"loss": 1.3727,
"step": 88500
},
{
"epoch": 9.25925925925926,
"grad_norm": 5.125,
"learning_rate": 0.0016296296296296295,
"loss": 1.3737,
"step": 89000
},
{
"epoch": 9.311277569704536,
"grad_norm": 0.263671875,
"learning_rate": 0.0016275488972118187,
"loss": 1.3707,
"step": 89500
},
{
"epoch": 9.363295880149813,
"grad_norm": 0.2021484375,
"learning_rate": 0.0016254681647940076,
"loss": 1.3694,
"step": 90000
},
{
"epoch": 9.41531419059509,
"grad_norm": 0.353515625,
"learning_rate": 0.0016233874323761963,
"loss": 1.3687,
"step": 90500
},
{
"epoch": 9.467332501040365,
"grad_norm": 0.7578125,
"learning_rate": 0.0016213066999583854,
"loss": 1.368,
"step": 91000
},
{
"epoch": 9.519350811485642,
"grad_norm": 0.30078125,
"learning_rate": 0.0016192259675405743,
"loss": 1.3716,
"step": 91500
},
{
"epoch": 9.57136912193092,
"grad_norm": 0.23828125,
"learning_rate": 0.0016171452351227634,
"loss": 1.3697,
"step": 92000
},
{
"epoch": 9.623387432376196,
"grad_norm": 0.271484375,
"learning_rate": 0.0016150645027049521,
"loss": 1.3692,
"step": 92500
},
{
"epoch": 9.675405742821473,
"grad_norm": 0.470703125,
"learning_rate": 0.001612983770287141,
"loss": 1.3682,
"step": 93000
},
{
"epoch": 9.72742405326675,
"grad_norm": 0.41015625,
"learning_rate": 0.0016109030378693302,
"loss": 1.3673,
"step": 93500
},
{
"epoch": 9.779442363712027,
"grad_norm": 0.25,
"learning_rate": 0.0016088223054515189,
"loss": 1.3663,
"step": 94000
},
{
"epoch": 9.831460674157304,
"grad_norm": 0.2578125,
"learning_rate": 0.0016067415730337078,
"loss": 1.3662,
"step": 94500
},
{
"epoch": 9.88347898460258,
"grad_norm": 0.361328125,
"learning_rate": 0.001604660840615897,
"loss": 1.37,
"step": 95000
},
{
"epoch": 9.935497295047856,
"grad_norm": 0.2578125,
"learning_rate": 0.0016025801081980858,
"loss": 1.372,
"step": 95500
},
{
"epoch": 9.987515605493133,
"grad_norm": 0.30859375,
"learning_rate": 0.0016004993757802745,
"loss": 1.3699,
"step": 96000
},
{
"epoch": 10.0,
"eval_loss": 1.3671537637710571,
"eval_runtime": 2.0428,
"eval_samples_per_second": 489.53,
"eval_steps_per_second": 0.49,
"step": 96120
},
{
"epoch": 10.03953391593841,
"grad_norm": 0.2578125,
"learning_rate": 0.0015984186433624637,
"loss": 1.3686,
"step": 96500
},
{
"epoch": 10.091552226383687,
"grad_norm": 1.484375,
"learning_rate": 0.0015963379109446526,
"loss": 1.3682,
"step": 97000
},
{
"epoch": 10.143570536828964,
"grad_norm": 0.369140625,
"learning_rate": 0.0015942571785268413,
"loss": 1.3659,
"step": 97500
},
{
"epoch": 10.19558884727424,
"grad_norm": 0.263671875,
"learning_rate": 0.0015921764461090304,
"loss": 1.3653,
"step": 98000
},
{
"epoch": 10.247607157719518,
"grad_norm": 0.26171875,
"learning_rate": 0.0015900957136912193,
"loss": 1.3668,
"step": 98500
},
{
"epoch": 10.299625468164795,
"grad_norm": 0.2373046875,
"learning_rate": 0.0015880149812734085,
"loss": 1.3695,
"step": 99000
},
{
"epoch": 10.35164377861007,
"grad_norm": 0.2392578125,
"learning_rate": 0.0015859342488555972,
"loss": 1.3673,
"step": 99500
},
{
"epoch": 10.403662089055347,
"grad_norm": 0.248046875,
"learning_rate": 0.001583853516437786,
"loss": 1.3669,
"step": 100000
},
{
"epoch": 10.455680399500624,
"grad_norm": 0.3359375,
"learning_rate": 0.0015817727840199752,
"loss": 1.367,
"step": 100500
},
{
"epoch": 10.5076987099459,
"grad_norm": 0.330078125,
"learning_rate": 0.0015796920516021641,
"loss": 1.3668,
"step": 101000
},
{
"epoch": 10.559717020391178,
"grad_norm": 0.3125,
"learning_rate": 0.0015776113191843528,
"loss": 1.3654,
"step": 101500
},
{
"epoch": 10.611735330836455,
"grad_norm": 0.45703125,
"learning_rate": 0.001575530586766542,
"loss": 1.3655,
"step": 102000
},
{
"epoch": 10.663753641281732,
"grad_norm": 0.361328125,
"learning_rate": 0.0015734498543487309,
"loss": 1.3673,
"step": 102500
},
{
"epoch": 10.715771951727008,
"grad_norm": 0.2734375,
"learning_rate": 0.0015713691219309195,
"loss": 1.3651,
"step": 103000
},
{
"epoch": 10.767790262172285,
"grad_norm": 0.28125,
"learning_rate": 0.0015692883895131087,
"loss": 1.3652,
"step": 103500
},
{
"epoch": 10.81980857261756,
"grad_norm": 0.1982421875,
"learning_rate": 0.0015672076570952976,
"loss": 1.366,
"step": 104000
},
{
"epoch": 10.871826883062838,
"grad_norm": 0.416015625,
"learning_rate": 0.0015651269246774865,
"loss": 1.3624,
"step": 104500
},
{
"epoch": 10.923845193508114,
"grad_norm": 0.94921875,
"learning_rate": 0.0015630461922596754,
"loss": 1.3605,
"step": 105000
},
{
"epoch": 10.975863503953391,
"grad_norm": 0.74609375,
"learning_rate": 0.0015609654598418643,
"loss": 1.36,
"step": 105500
},
{
"epoch": 11.0,
"eval_loss": 1.3506468534469604,
"eval_runtime": 1.4359,
"eval_samples_per_second": 696.43,
"eval_steps_per_second": 0.696,
"step": 105732
},
{
"epoch": 11.027881814398668,
"grad_norm": 0.30078125,
"learning_rate": 0.0015588847274240535,
"loss": 1.3598,
"step": 106000
},
{
"epoch": 11.079900124843945,
"grad_norm": 0.2177734375,
"learning_rate": 0.0015568039950062422,
"loss": 1.3614,
"step": 106500
},
{
"epoch": 11.131918435289222,
"grad_norm": 0.35546875,
"learning_rate": 0.001554723262588431,
"loss": 1.3587,
"step": 107000
},
{
"epoch": 11.1839367457345,
"grad_norm": 0.2109375,
"learning_rate": 0.0015526425301706202,
"loss": 1.3585,
"step": 107500
},
{
"epoch": 11.235955056179776,
"grad_norm": 1.2890625,
"learning_rate": 0.0015505617977528091,
"loss": 1.3577,
"step": 108000
},
{
"epoch": 11.287973366625051,
"grad_norm": 0.2109375,
"learning_rate": 0.0015484810653349978,
"loss": 1.3581,
"step": 108500
},
{
"epoch": 11.339991677070328,
"grad_norm": 0.298828125,
"learning_rate": 0.001546400332917187,
"loss": 1.3574,
"step": 109000
},
{
"epoch": 11.392009987515605,
"grad_norm": 0.330078125,
"learning_rate": 0.0015443196004993759,
"loss": 1.3586,
"step": 109500
},
{
"epoch": 11.444028297960882,
"grad_norm": 0.828125,
"learning_rate": 0.0015422388680815646,
"loss": 1.3582,
"step": 110000
},
{
"epoch": 11.496046608406159,
"grad_norm": 0.255859375,
"learning_rate": 0.0015401581356637537,
"loss": 1.3573,
"step": 110500
},
{
"epoch": 11.548064918851436,
"grad_norm": 0.23828125,
"learning_rate": 0.0015380774032459426,
"loss": 1.3592,
"step": 111000
},
{
"epoch": 11.600083229296713,
"grad_norm": 0.244140625,
"learning_rate": 0.0015359966708281315,
"loss": 1.3558,
"step": 111500
},
{
"epoch": 11.65210153974199,
"grad_norm": 0.2314453125,
"learning_rate": 0.0015339159384103204,
"loss": 1.3578,
"step": 112000
},
{
"epoch": 11.704119850187267,
"grad_norm": 0.220703125,
"learning_rate": 0.0015318352059925093,
"loss": 1.3562,
"step": 112500
},
{
"epoch": 11.756138160632542,
"grad_norm": 0.3828125,
"learning_rate": 0.0015297544735746985,
"loss": 1.3575,
"step": 113000
},
{
"epoch": 11.808156471077819,
"grad_norm": 0.30078125,
"learning_rate": 0.0015276737411568874,
"loss": 1.3568,
"step": 113500
},
{
"epoch": 11.860174781523096,
"grad_norm": 0.310546875,
"learning_rate": 0.001525593008739076,
"loss": 1.3611,
"step": 114000
},
{
"epoch": 11.912193091968373,
"grad_norm": 0.2236328125,
"learning_rate": 0.0015235122763212652,
"loss": 1.3598,
"step": 114500
},
{
"epoch": 11.96421140241365,
"grad_norm": 4.28125,
"learning_rate": 0.0015214315439034541,
"loss": 1.3598,
"step": 115000
},
{
"epoch": 12.0,
"eval_loss": 1.3485276699066162,
"eval_runtime": 1.4101,
"eval_samples_per_second": 709.157,
"eval_steps_per_second": 0.709,
"step": 115344
},
{
"epoch": 12.016229712858927,
"grad_norm": 0.26171875,
"learning_rate": 0.0015193508114856428,
"loss": 1.3563,
"step": 115500
},
{
"epoch": 12.068248023304204,
"grad_norm": 0.255859375,
"learning_rate": 0.001517270079067832,
"loss": 1.3552,
"step": 116000
},
{
"epoch": 12.12026633374948,
"grad_norm": 0.32421875,
"learning_rate": 0.0015151893466500209,
"loss": 1.3552,
"step": 116500
},
{
"epoch": 12.172284644194757,
"grad_norm": 0.353515625,
"learning_rate": 0.0015131086142322098,
"loss": 1.3543,
"step": 117000
},
{
"epoch": 12.224302954640033,
"grad_norm": 0.248046875,
"learning_rate": 0.0015110278818143987,
"loss": 1.3534,
"step": 117500
},
{
"epoch": 12.27632126508531,
"grad_norm": 0.337890625,
"learning_rate": 0.0015089471493965876,
"loss": 1.3541,
"step": 118000
},
{
"epoch": 12.328339575530586,
"grad_norm": 0.228515625,
"learning_rate": 0.0015068664169787765,
"loss": 1.3542,
"step": 118500
},
{
"epoch": 12.380357885975863,
"grad_norm": 0.208984375,
"learning_rate": 0.0015047856845609654,
"loss": 1.3561,
"step": 119000
},
{
"epoch": 12.43237619642114,
"grad_norm": 0.251953125,
"learning_rate": 0.0015027049521431544,
"loss": 1.3544,
"step": 119500
},
{
"epoch": 12.484394506866417,
"grad_norm": 0.2353515625,
"learning_rate": 0.0015006242197253433,
"loss": 1.3561,
"step": 120000
},
{
"epoch": 12.536412817311694,
"grad_norm": 0.25390625,
"learning_rate": 0.0014985434873075324,
"loss": 1.3533,
"step": 120500
},
{
"epoch": 12.588431127756971,
"grad_norm": 0.34765625,
"learning_rate": 0.001496462754889721,
"loss": 1.3546,
"step": 121000
},
{
"epoch": 12.640449438202246,
"grad_norm": 0.341796875,
"learning_rate": 0.0014943820224719102,
"loss": 1.3548,
"step": 121500
},
{
"epoch": 12.692467748647523,
"grad_norm": 0.234375,
"learning_rate": 0.0014923012900540991,
"loss": 1.3531,
"step": 122000
},
{
"epoch": 12.7444860590928,
"grad_norm": 0.298828125,
"learning_rate": 0.0014902205576362878,
"loss": 1.3518,
"step": 122500
},
{
"epoch": 12.796504369538077,
"grad_norm": 0.326171875,
"learning_rate": 0.001488139825218477,
"loss": 1.3512,
"step": 123000
},
{
"epoch": 12.848522679983354,
"grad_norm": 0.546875,
"learning_rate": 0.0014860590928006659,
"loss": 1.3506,
"step": 123500
},
{
"epoch": 12.900540990428631,
"grad_norm": 0.28515625,
"learning_rate": 0.0014839783603828548,
"loss": 1.3539,
"step": 124000
},
{
"epoch": 12.952559300873908,
"grad_norm": 0.240234375,
"learning_rate": 0.0014818976279650437,
"loss": 1.3581,
"step": 124500
},
{
"epoch": 13.0,
"eval_loss": 1.3496302366256714,
"eval_runtime": 1.4282,
"eval_samples_per_second": 700.187,
"eval_steps_per_second": 0.7,
"step": 124956
},
{
"epoch": 13.004577611319185,
"grad_norm": 0.2578125,
"learning_rate": 0.0014798168955472326,
"loss": 1.3558,
"step": 125000
},
{
"epoch": 13.056595921764462,
"grad_norm": 0.5390625,
"learning_rate": 0.0014777361631294215,
"loss": 1.3526,
"step": 125500
},
{
"epoch": 13.108614232209737,
"grad_norm": 0.2255859375,
"learning_rate": 0.0014756554307116107,
"loss": 1.3508,
"step": 126000
},
{
"epoch": 13.160632542655014,
"grad_norm": 0.361328125,
"learning_rate": 0.0014735746982937994,
"loss": 1.3507,
"step": 126500
},
{
"epoch": 13.21265085310029,
"grad_norm": 0.478515625,
"learning_rate": 0.0014714939658759883,
"loss": 1.3525,
"step": 127000
},
{
"epoch": 13.264669163545568,
"grad_norm": 0.2490234375,
"learning_rate": 0.0014694132334581774,
"loss": 1.3525,
"step": 127500
},
{
"epoch": 13.316687473990845,
"grad_norm": 0.25,
"learning_rate": 0.0014673325010403661,
"loss": 1.3529,
"step": 128000
},
{
"epoch": 13.368705784436122,
"grad_norm": 0.318359375,
"learning_rate": 0.0014652517686225552,
"loss": 1.3518,
"step": 128500
},
{
"epoch": 13.420724094881399,
"grad_norm": 0.228515625,
"learning_rate": 0.0014631710362047442,
"loss": 1.3534,
"step": 129000
},
{
"epoch": 13.472742405326676,
"grad_norm": 0.23828125,
"learning_rate": 0.001461090303786933,
"loss": 1.3513,
"step": 129500
},
{
"epoch": 13.524760715771952,
"grad_norm": 0.2421875,
"learning_rate": 0.001459009571369122,
"loss": 1.3516,
"step": 130000
},
{
"epoch": 13.576779026217228,
"grad_norm": 0.21484375,
"learning_rate": 0.001456928838951311,
"loss": 1.3521,
"step": 130500
},
{
"epoch": 13.628797336662505,
"grad_norm": 0.255859375,
"learning_rate": 0.0014548481065334998,
"loss": 1.353,
"step": 131000
},
{
"epoch": 13.680815647107782,
"grad_norm": 0.341796875,
"learning_rate": 0.0014527673741156887,
"loss": 1.3525,
"step": 131500
},
{
"epoch": 13.732833957553058,
"grad_norm": 0.287109375,
"learning_rate": 0.0014506866416978776,
"loss": 1.3501,
"step": 132000
},
{
"epoch": 13.784852267998335,
"grad_norm": 0.30859375,
"learning_rate": 0.0014486059092800666,
"loss": 1.3495,
"step": 132500
},
{
"epoch": 13.836870578443612,
"grad_norm": 0.28125,
"learning_rate": 0.0014465251768622557,
"loss": 1.3513,
"step": 133000
},
{
"epoch": 13.88888888888889,
"grad_norm": 0.353515625,
"learning_rate": 0.0014444444444444444,
"loss": 1.3516,
"step": 133500
},
{
"epoch": 13.940907199334166,
"grad_norm": 0.4921875,
"learning_rate": 0.0014423637120266333,
"loss": 1.3512,
"step": 134000
},
{
"epoch": 13.992925509779443,
"grad_norm": 0.287109375,
"learning_rate": 0.0014402829796088224,
"loss": 1.3493,
"step": 134500
},
{
"epoch": 14.0,
"eval_loss": 1.3465324640274048,
"eval_runtime": 1.419,
"eval_samples_per_second": 704.714,
"eval_steps_per_second": 0.705,
"step": 134568
},
{
"epoch": 14.044943820224718,
"grad_norm": 0.400390625,
"learning_rate": 0.0014382022471910111,
"loss": 1.3478,
"step": 135000
},
{
"epoch": 14.096962130669995,
"grad_norm": 0.203125,
"learning_rate": 0.0014361215147732003,
"loss": 1.3485,
"step": 135500
},
{
"epoch": 14.148980441115272,
"grad_norm": 0.322265625,
"learning_rate": 0.0014340407823553892,
"loss": 1.3473,
"step": 136000
},
{
"epoch": 14.20099875156055,
"grad_norm": 0.578125,
"learning_rate": 0.001431960049937578,
"loss": 1.3469,
"step": 136500
},
{
"epoch": 14.253017062005826,
"grad_norm": 0.205078125,
"learning_rate": 0.001429879317519767,
"loss": 1.3488,
"step": 137000
},
{
"epoch": 14.305035372451103,
"grad_norm": 0.28125,
"learning_rate": 0.001427798585101956,
"loss": 1.3482,
"step": 137500
},
{
"epoch": 14.35705368289638,
"grad_norm": 0.85546875,
"learning_rate": 0.0014257178526841448,
"loss": 1.3485,
"step": 138000
},
{
"epoch": 14.409071993341657,
"grad_norm": 0.48828125,
"learning_rate": 0.001423637120266334,
"loss": 1.3479,
"step": 138500
},
{
"epoch": 14.461090303786934,
"grad_norm": 0.208984375,
"learning_rate": 0.0014215563878485226,
"loss": 1.3483,
"step": 139000
},
{
"epoch": 14.513108614232209,
"grad_norm": 0.4140625,
"learning_rate": 0.0014194756554307116,
"loss": 1.3471,
"step": 139500
},
{
"epoch": 14.565126924677486,
"grad_norm": 0.30078125,
"learning_rate": 0.0014173949230129007,
"loss": 1.3475,
"step": 140000
},
{
"epoch": 14.617145235122763,
"grad_norm": 0.4453125,
"learning_rate": 0.0014153141905950894,
"loss": 1.3467,
"step": 140500
},
{
"epoch": 14.66916354556804,
"grad_norm": 0.2255859375,
"learning_rate": 0.0014132334581772783,
"loss": 1.3462,
"step": 141000
},
{
"epoch": 14.721181856013317,
"grad_norm": 0.2451171875,
"learning_rate": 0.0014111527257594674,
"loss": 1.3456,
"step": 141500
},
{
"epoch": 14.773200166458594,
"grad_norm": 0.259765625,
"learning_rate": 0.0014090719933416563,
"loss": 1.3455,
"step": 142000
},
{
"epoch": 14.82521847690387,
"grad_norm": 0.2578125,
"learning_rate": 0.0014069912609238453,
"loss": 1.3447,
"step": 142500
},
{
"epoch": 14.877236787349148,
"grad_norm": 2.359375,
"learning_rate": 0.0014049105285060342,
"loss": 1.3454,
"step": 143000
},
{
"epoch": 14.929255097794425,
"grad_norm": 0.4296875,
"learning_rate": 0.001402829796088223,
"loss": 1.3461,
"step": 143500
},
{
"epoch": 14.9812734082397,
"grad_norm": 0.33203125,
"learning_rate": 0.001400749063670412,
"loss": 1.3454,
"step": 144000
},
{
"epoch": 15.0,
"eval_loss": 1.3407135009765625,
"eval_runtime": 1.4239,
"eval_samples_per_second": 702.319,
"eval_steps_per_second": 0.702,
"step": 144180
},
{
"epoch": 15.033291718684977,
"grad_norm": 0.63671875,
"learning_rate": 0.001398668331252601,
"loss": 1.3444,
"step": 144500
},
{
"epoch": 15.085310029130254,
"grad_norm": 0.275390625,
"learning_rate": 0.0013965875988347898,
"loss": 1.3444,
"step": 145000
},
{
"epoch": 15.13732833957553,
"grad_norm": 0.56640625,
"learning_rate": 0.001394506866416979,
"loss": 1.3437,
"step": 145500
},
{
"epoch": 15.189346650020807,
"grad_norm": 0.56640625,
"learning_rate": 0.0013924261339991677,
"loss": 1.3473,
"step": 146000
},
{
"epoch": 15.241364960466084,
"grad_norm": 0.201171875,
"learning_rate": 0.0013903454015813566,
"loss": 1.3474,
"step": 146500
},
{
"epoch": 15.293383270911361,
"grad_norm": 0.369140625,
"learning_rate": 0.0013882646691635457,
"loss": 1.3472,
"step": 147000
},
{
"epoch": 15.345401581356638,
"grad_norm": 0.279296875,
"learning_rate": 0.0013861839367457346,
"loss": 1.3457,
"step": 147500
},
{
"epoch": 15.397419891801913,
"grad_norm": 0.314453125,
"learning_rate": 0.0013841032043279233,
"loss": 1.3438,
"step": 148000
},
{
"epoch": 15.44943820224719,
"grad_norm": 0.283203125,
"learning_rate": 0.0013820224719101124,
"loss": 1.3452,
"step": 148500
},
{
"epoch": 15.501456512692467,
"grad_norm": 0.236328125,
"learning_rate": 0.0013799417394923014,
"loss": 1.3461,
"step": 149000
},
{
"epoch": 15.553474823137744,
"grad_norm": 0.6640625,
"learning_rate": 0.0013778610070744903,
"loss": 1.344,
"step": 149500
},
{
"epoch": 15.605493133583021,
"grad_norm": 0.287109375,
"learning_rate": 0.0013757802746566792,
"loss": 1.3437,
"step": 150000
},
{
"epoch": 15.657511444028298,
"grad_norm": 0.255859375,
"learning_rate": 0.001373699542238868,
"loss": 1.3468,
"step": 150500
},
{
"epoch": 15.709529754473575,
"grad_norm": 0.337890625,
"learning_rate": 0.0013716188098210572,
"loss": 1.3439,
"step": 151000
},
{
"epoch": 15.761548064918852,
"grad_norm": 0.81640625,
"learning_rate": 0.001369538077403246,
"loss": 1.3435,
"step": 151500
},
{
"epoch": 15.813566375364129,
"grad_norm": 1.5078125,
"learning_rate": 0.0013674573449854348,
"loss": 1.3463,
"step": 152000
},
{
"epoch": 15.865584685809406,
"grad_norm": 0.2392578125,
"learning_rate": 0.001365376612567624,
"loss": 1.3467,
"step": 152500
},
{
"epoch": 15.917602996254681,
"grad_norm": 1.1015625,
"learning_rate": 0.0013632958801498127,
"loss": 1.3462,
"step": 153000
},
{
"epoch": 15.969621306699958,
"grad_norm": 0.2158203125,
"learning_rate": 0.0013612151477320016,
"loss": 1.3461,
"step": 153500
},
{
"epoch": 16.0,
"eval_loss": 1.3369859457015991,
"eval_runtime": 1.4272,
"eval_samples_per_second": 700.674,
"eval_steps_per_second": 0.701,
"step": 153792
},
{
"epoch": 16.021639617145237,
"grad_norm": 0.2294921875,
"learning_rate": 0.0013591344153141907,
"loss": 1.3446,
"step": 154000
},
{
"epoch": 16.073657927590514,
"grad_norm": 0.2890625,
"learning_rate": 0.0013570536828963796,
"loss": 1.3422,
"step": 154500
},
{
"epoch": 16.125676238035787,
"grad_norm": 0.27734375,
"learning_rate": 0.0013549729504785683,
"loss": 1.3409,
"step": 155000
},
{
"epoch": 16.177694548481064,
"grad_norm": 0.6796875,
"learning_rate": 0.0013528922180607575,
"loss": 1.3428,
"step": 155500
},
{
"epoch": 16.22971285892634,
"grad_norm": 0.66796875,
"learning_rate": 0.0013508114856429464,
"loss": 1.3443,
"step": 156000
},
{
"epoch": 16.281731169371618,
"grad_norm": 0.376953125,
"learning_rate": 0.001348730753225135,
"loss": 1.3423,
"step": 156500
},
{
"epoch": 16.333749479816895,
"grad_norm": 0.486328125,
"learning_rate": 0.0013466500208073242,
"loss": 1.3408,
"step": 157000
},
{
"epoch": 16.38576779026217,
"grad_norm": 0.55859375,
"learning_rate": 0.0013445692883895131,
"loss": 1.3421,
"step": 157500
},
{
"epoch": 16.43778610070745,
"grad_norm": 0.443359375,
"learning_rate": 0.0013424885559717022,
"loss": 1.3424,
"step": 158000
},
{
"epoch": 16.489804411152726,
"grad_norm": 0.2734375,
"learning_rate": 0.001340407823553891,
"loss": 1.3412,
"step": 158500
},
{
"epoch": 16.541822721598002,
"grad_norm": 0.46875,
"learning_rate": 0.0013383270911360799,
"loss": 1.3419,
"step": 159000
},
{
"epoch": 16.59384103204328,
"grad_norm": 0.267578125,
"learning_rate": 0.001336246358718269,
"loss": 1.3415,
"step": 159500
},
{
"epoch": 16.645859342488556,
"grad_norm": 5.65625,
"learning_rate": 0.001334165626300458,
"loss": 1.3417,
"step": 160000
},
{
"epoch": 16.697877652933833,
"grad_norm": 0.181640625,
"learning_rate": 0.0013320848938826466,
"loss": 1.3405,
"step": 160500
},
{
"epoch": 16.74989596337911,
"grad_norm": 0.298828125,
"learning_rate": 0.0013300041614648357,
"loss": 1.339,
"step": 161000
},
{
"epoch": 16.801914273824387,
"grad_norm": 0.609375,
"learning_rate": 0.0013279234290470246,
"loss": 1.3415,
"step": 161500
},
{
"epoch": 16.853932584269664,
"grad_norm": 0.2392578125,
"learning_rate": 0.0013258426966292133,
"loss": 1.341,
"step": 162000
},
{
"epoch": 16.90595089471494,
"grad_norm": 0.330078125,
"learning_rate": 0.0013237619642114025,
"loss": 1.3424,
"step": 162500
},
{
"epoch": 16.957969205160218,
"grad_norm": 0.25390625,
"learning_rate": 0.0013216812317935914,
"loss": 1.3479,
"step": 163000
},
{
"epoch": 17.0,
"eval_loss": 1.339566707611084,
"eval_runtime": 1.4145,
"eval_samples_per_second": 706.982,
"eval_steps_per_second": 0.707,
"step": 163404
},
{
"epoch": 17.00998751560549,
"grad_norm": 0.43359375,
"learning_rate": 0.0013196004993757803,
"loss": 1.3445,
"step": 163500
},
{
"epoch": 17.06200582605077,
"grad_norm": 0.314453125,
"learning_rate": 0.0013175197669579692,
"loss": 1.3435,
"step": 164000
},
{
"epoch": 17.114024136496045,
"grad_norm": 0.2314453125,
"learning_rate": 0.0013154390345401581,
"loss": 1.3448,
"step": 164500
},
{
"epoch": 17.166042446941322,
"grad_norm": 0.294921875,
"learning_rate": 0.0013133583021223473,
"loss": 1.3436,
"step": 165000
},
{
"epoch": 17.2180607573866,
"grad_norm": 0.275390625,
"learning_rate": 0.001311277569704536,
"loss": 1.3445,
"step": 165500
},
{
"epoch": 17.270079067831876,
"grad_norm": 1.7890625,
"learning_rate": 0.0013091968372867249,
"loss": 1.343,
"step": 166000
},
{
"epoch": 17.322097378277153,
"grad_norm": 0.37109375,
"learning_rate": 0.001307116104868914,
"loss": 1.3429,
"step": 166500
},
{
"epoch": 17.37411568872243,
"grad_norm": 0.240234375,
"learning_rate": 0.001305035372451103,
"loss": 1.342,
"step": 167000
},
{
"epoch": 17.426133999167707,
"grad_norm": 2.875,
"learning_rate": 0.0013029546400332916,
"loss": 1.3429,
"step": 167500
},
{
"epoch": 17.478152309612984,
"grad_norm": 0.310546875,
"learning_rate": 0.0013008739076154807,
"loss": 1.3424,
"step": 168000
},
{
"epoch": 17.53017062005826,
"grad_norm": 0.4453125,
"learning_rate": 0.0012987931751976696,
"loss": 1.3422,
"step": 168500
},
{
"epoch": 17.582188930503538,
"grad_norm": 0.33203125,
"learning_rate": 0.0012967124427798583,
"loss": 1.3419,
"step": 169000
},
{
"epoch": 17.634207240948815,
"grad_norm": 0.3515625,
"learning_rate": 0.0012946317103620475,
"loss": 1.3419,
"step": 169500
},
{
"epoch": 17.68622555139409,
"grad_norm": 0.21875,
"learning_rate": 0.0012925509779442364,
"loss": 1.3411,
"step": 170000
},
{
"epoch": 17.73824386183937,
"grad_norm": 0.240234375,
"learning_rate": 0.0012904702455264253,
"loss": 1.3409,
"step": 170500
},
{
"epoch": 17.790262172284645,
"grad_norm": 0.26953125,
"learning_rate": 0.0012883895131086142,
"loss": 1.3429,
"step": 171000
},
{
"epoch": 17.842280482729922,
"grad_norm": 0.28515625,
"learning_rate": 0.0012863087806908031,
"loss": 1.3426,
"step": 171500
},
{
"epoch": 17.8942987931752,
"grad_norm": 0.248046875,
"learning_rate": 0.0012842280482729923,
"loss": 1.342,
"step": 172000
},
{
"epoch": 17.946317103620473,
"grad_norm": 0.392578125,
"learning_rate": 0.0012821473158551812,
"loss": 1.3418,
"step": 172500
},
{
"epoch": 17.99833541406575,
"grad_norm": 0.30078125,
"learning_rate": 0.0012800665834373699,
"loss": 1.3429,
"step": 173000
},
{
"epoch": 18.0,
"eval_loss": 1.3371888399124146,
"eval_runtime": 1.4206,
"eval_samples_per_second": 703.942,
"eval_steps_per_second": 0.704,
"step": 173016
},
{
"epoch": 18.050353724511027,
"grad_norm": 0.51953125,
"learning_rate": 0.001277985851019559,
"loss": 1.3399,
"step": 173500
},
{
"epoch": 18.102372034956304,
"grad_norm": 0.26953125,
"learning_rate": 0.001275905118601748,
"loss": 1.3414,
"step": 174000
},
{
"epoch": 18.15439034540158,
"grad_norm": 0.2431640625,
"learning_rate": 0.0012738243861839366,
"loss": 1.3425,
"step": 174500
},
{
"epoch": 18.206408655846857,
"grad_norm": 0.255859375,
"learning_rate": 0.0012717436537661257,
"loss": 1.339,
"step": 175000
},
{
"epoch": 18.258426966292134,
"grad_norm": 0.19921875,
"learning_rate": 0.0012696629213483147,
"loss": 1.3403,
"step": 175500
},
{
"epoch": 18.31044527673741,
"grad_norm": 0.328125,
"learning_rate": 0.0012675821889305036,
"loss": 1.3388,
"step": 176000
},
{
"epoch": 18.36246358718269,
"grad_norm": 0.220703125,
"learning_rate": 0.0012655014565126925,
"loss": 1.3385,
"step": 176500
},
{
"epoch": 18.414481897627965,
"grad_norm": 0.251953125,
"learning_rate": 0.0012634207240948814,
"loss": 1.3372,
"step": 177000
},
{
"epoch": 18.466500208073242,
"grad_norm": 0.1962890625,
"learning_rate": 0.0012613399916770703,
"loss": 1.3384,
"step": 177500
},
{
"epoch": 18.51851851851852,
"grad_norm": 0.2490234375,
"learning_rate": 0.0012592592592592592,
"loss": 1.3377,
"step": 178000
},
{
"epoch": 18.570536828963796,
"grad_norm": 0.2109375,
"learning_rate": 0.0012571785268414481,
"loss": 1.3372,
"step": 178500
},
{
"epoch": 18.622555139409073,
"grad_norm": 0.2490234375,
"learning_rate": 0.0012550977944236373,
"loss": 1.3368,
"step": 179000
},
{
"epoch": 18.67457344985435,
"grad_norm": 0.201171875,
"learning_rate": 0.0012530170620058262,
"loss": 1.3384,
"step": 179500
},
{
"epoch": 18.726591760299627,
"grad_norm": 0.2451171875,
"learning_rate": 0.0012509363295880149,
"loss": 1.3374,
"step": 180000
},
{
"epoch": 18.778610070744904,
"grad_norm": 0.431640625,
"learning_rate": 0.001248855597170204,
"loss": 1.3378,
"step": 180500
},
{
"epoch": 18.83062838119018,
"grad_norm": 0.4765625,
"learning_rate": 0.001246774864752393,
"loss": 1.3397,
"step": 181000
},
{
"epoch": 18.882646691635454,
"grad_norm": 0.2216796875,
"learning_rate": 0.0012446941323345816,
"loss": 1.3379,
"step": 181500
},
{
"epoch": 18.93466500208073,
"grad_norm": 0.55078125,
"learning_rate": 0.0012426133999167708,
"loss": 1.3377,
"step": 182000
},
{
"epoch": 18.986683312526008,
"grad_norm": 0.337890625,
"learning_rate": 0.0012405326674989597,
"loss": 1.3379,
"step": 182500
},
{
"epoch": 19.0,
"eval_loss": 1.3305245637893677,
"eval_runtime": 1.4316,
"eval_samples_per_second": 698.511,
"eval_steps_per_second": 0.699,
"step": 182628
}
],
"logging_steps": 500,
"max_steps": 480600,
"num_input_tokens_seen": 0,
"num_train_epochs": 50,
"save_steps": 500,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 3,
"early_stopping_threshold": 0.0
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.3420668838410615e+19,
"train_batch_size": 1024,
"trial_name": null,
"trial_params": null
}