tebak-gambar-mobilevit / trainer_state.json
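Note: this file is the Hugging Face Trainer state for the run. "log_history" holds one entry per logging step (epoch, grad_norm, learning_rate, loss, step) plus an evaluation entry every eval_steps (5000) steps with eval_accuracy, eval_loss, and throughput figures. A minimal sketch for inspecting the run, assuming a local copy of this file and matplotlib installed (not part of the original repo):

    # Sketch: plot training loss and eval loss from log_history.
    import json
    import matplotlib.pyplot as plt

    with open("trainer_state.json") as f:
        state = json.load(f)

    # Training entries carry "loss"; evaluation entries carry "eval_loss".
    train_logs = [e for e in state["log_history"] if "loss" in e]
    eval_logs = [e for e in state["log_history"] if "eval_loss" in e]

    plt.plot([e["step"] for e in train_logs],
             [e["loss"] for e in train_logs], label="train loss")
    plt.plot([e["step"] for e in eval_logs],
             [e["eval_loss"] for e in eval_logs], label="eval loss")
    plt.xlabel("step")
    plt.legend()
    plt.show()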
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 5000,
"global_step": 52737,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.005688605722737357,
"grad_norm": 2.727705717086792,
"learning_rate": 0.0007984982080891974,
"loss": 2.3682,
"step": 100
},
{
"epoch": 0.011377211445474714,
"grad_norm": 2.5978145599365234,
"learning_rate": 0.0007969812465631342,
"loss": 2.2098,
"step": 200
},
{
"epoch": 0.01706581716821207,
"grad_norm": 2.488832473754883,
"learning_rate": 0.0007954642850370708,
"loss": 2.1041,
"step": 300
},
{
"epoch": 0.02275442289094943,
"grad_norm": 2.1368465423583984,
"learning_rate": 0.0007939473235110074,
"loss": 2.0233,
"step": 400
},
{
"epoch": 0.028443028613686784,
"grad_norm": 2.7280213832855225,
"learning_rate": 0.0007924303619849442,
"loss": 1.9639,
"step": 500
},
{
"epoch": 0.03413163433642414,
"grad_norm": 2.5257463455200195,
"learning_rate": 0.0007909134004588809,
"loss": 2.0229,
"step": 600
},
{
"epoch": 0.0398202400591615,
"grad_norm": 2.748051404953003,
"learning_rate": 0.0007893964389328175,
"loss": 1.9867,
"step": 700
},
{
"epoch": 0.04550884578189886,
"grad_norm": 2.212047815322876,
"learning_rate": 0.0007878794774067543,
"loss": 1.9354,
"step": 800
},
{
"epoch": 0.05119745150463621,
"grad_norm": 2.423400640487671,
"learning_rate": 0.000786362515880691,
"loss": 1.9127,
"step": 900
},
{
"epoch": 0.05688605722737357,
"grad_norm": 2.379678726196289,
"learning_rate": 0.0007848455543546277,
"loss": 1.8927,
"step": 1000
},
{
"epoch": 0.06257466295011092,
"grad_norm": 2.5806541442871094,
"learning_rate": 0.0007833285928285645,
"loss": 1.8471,
"step": 1100
},
{
"epoch": 0.06826326867284828,
"grad_norm": 2.4539499282836914,
"learning_rate": 0.0007818116313025011,
"loss": 1.8296,
"step": 1200
},
{
"epoch": 0.07395187439558565,
"grad_norm": 2.7818546295166016,
"learning_rate": 0.0007802946697764378,
"loss": 1.8136,
"step": 1300
},
{
"epoch": 0.079640480118323,
"grad_norm": 2.979959487915039,
"learning_rate": 0.0007787777082503746,
"loss": 1.7844,
"step": 1400
},
{
"epoch": 0.08532908584106036,
"grad_norm": 2.3885879516601562,
"learning_rate": 0.0007772607467243113,
"loss": 1.7627,
"step": 1500
},
{
"epoch": 0.09101769156379771,
"grad_norm": 2.2447025775909424,
"learning_rate": 0.0007757437851982479,
"loss": 1.7375,
"step": 1600
},
{
"epoch": 0.09670629728653507,
"grad_norm": 2.184580087661743,
"learning_rate": 0.0007742268236721846,
"loss": 1.7212,
"step": 1700
},
{
"epoch": 0.10239490300927243,
"grad_norm": 2.2506866455078125,
"learning_rate": 0.0007727098621461213,
"loss": 1.7264,
"step": 1800
},
{
"epoch": 0.10808350873200978,
"grad_norm": 2.059812068939209,
"learning_rate": 0.000771192900620058,
"loss": 1.7123,
"step": 1900
},
{
"epoch": 0.11377211445474714,
"grad_norm": 2.3013007640838623,
"learning_rate": 0.0007696759390939948,
"loss": 1.7122,
"step": 2000
},
{
"epoch": 0.11946072017748449,
"grad_norm": 2.7073047161102295,
"learning_rate": 0.0007681741471831921,
"loss": 1.6847,
"step": 2100
},
{
"epoch": 0.12514932590022185,
"grad_norm": 2.023949384689331,
"learning_rate": 0.0007666571856571288,
"loss": 1.6941,
"step": 2200
},
{
"epoch": 0.1308379316229592,
"grad_norm": 1.9444501399993896,
"learning_rate": 0.0007651402241310656,
"loss": 1.6682,
"step": 2300
},
{
"epoch": 0.13652653734569656,
"grad_norm": 2.691826105117798,
"learning_rate": 0.0007636232626050023,
"loss": 1.6409,
"step": 2400
},
{
"epoch": 0.1422151430684339,
"grad_norm": 2.483386993408203,
"learning_rate": 0.0007621063010789389,
"loss": 1.662,
"step": 2500
},
{
"epoch": 0.1479037487911713,
"grad_norm": 2.1850545406341553,
"learning_rate": 0.0007605893395528756,
"loss": 1.6532,
"step": 2600
},
{
"epoch": 0.15359235451390865,
"grad_norm": 1.989560842514038,
"learning_rate": 0.0007590723780268123,
"loss": 1.6231,
"step": 2700
},
{
"epoch": 0.159280960236646,
"grad_norm": 2.1362531185150146,
"learning_rate": 0.000757555416500749,
"loss": 1.6019,
"step": 2800
},
{
"epoch": 0.16496956595938336,
"grad_norm": 2.3262641429901123,
"learning_rate": 0.0007560384549746857,
"loss": 1.6103,
"step": 2900
},
{
"epoch": 0.17065817168212072,
"grad_norm": 2.297419309616089,
"learning_rate": 0.0007545214934486224,
"loss": 1.6314,
"step": 3000
},
{
"epoch": 0.17634677740485807,
"grad_norm": 2.1368629932403564,
"learning_rate": 0.0007530045319225591,
"loss": 1.5838,
"step": 3100
},
{
"epoch": 0.18203538312759543,
"grad_norm": 2.3383195400238037,
"learning_rate": 0.0007514875703964959,
"loss": 1.5857,
"step": 3200
},
{
"epoch": 0.18772398885033278,
"grad_norm": 2.149740219116211,
"learning_rate": 0.0007499706088704326,
"loss": 1.6016,
"step": 3300
},
{
"epoch": 0.19341259457307014,
"grad_norm": 2.096703290939331,
"learning_rate": 0.0007484536473443692,
"loss": 1.5904,
"step": 3400
},
{
"epoch": 0.1991012002958075,
"grad_norm": 2.2043957710266113,
"learning_rate": 0.000746936685818306,
"loss": 1.5787,
"step": 3500
},
{
"epoch": 0.20478980601854485,
"grad_norm": 2.6369898319244385,
"learning_rate": 0.0007454197242922427,
"loss": 1.5539,
"step": 3600
},
{
"epoch": 0.2104784117412822,
"grad_norm": 1.9776628017425537,
"learning_rate": 0.0007439027627661794,
"loss": 1.5815,
"step": 3700
},
{
"epoch": 0.21616701746401956,
"grad_norm": 2.2001795768737793,
"learning_rate": 0.0007423858012401161,
"loss": 1.5583,
"step": 3800
},
{
"epoch": 0.22185562318675692,
"grad_norm": 2.2252562046051025,
"learning_rate": 0.0007408688397140527,
"loss": 1.5539,
"step": 3900
},
{
"epoch": 0.22754422890949427,
"grad_norm": 2.15871262550354,
"learning_rate": 0.0007393518781879895,
"loss": 1.5786,
"step": 4000
},
{
"epoch": 0.23323283463223163,
"grad_norm": 2.026066303253174,
"learning_rate": 0.0007378500862771869,
"loss": 1.5452,
"step": 4100
},
{
"epoch": 0.23892144035496898,
"grad_norm": 2.116511583328247,
"learning_rate": 0.0007363331247511235,
"loss": 1.5381,
"step": 4200
},
{
"epoch": 0.24461004607770637,
"grad_norm": 1.9454152584075928,
"learning_rate": 0.0007348161632250602,
"loss": 1.557,
"step": 4300
},
{
"epoch": 0.2502986518004437,
"grad_norm": 1.8668495416641235,
"learning_rate": 0.000733299201698997,
"loss": 1.5406,
"step": 4400
},
{
"epoch": 0.2559872575231811,
"grad_norm": 2.0886125564575195,
"learning_rate": 0.0007317822401729337,
"loss": 1.519,
"step": 4500
},
{
"epoch": 0.2616758632459184,
"grad_norm": 2.5768446922302246,
"learning_rate": 0.0007302652786468704,
"loss": 1.5178,
"step": 4600
},
{
"epoch": 0.2673644689686558,
"grad_norm": 2.4169631004333496,
"learning_rate": 0.000728748317120807,
"loss": 1.5283,
"step": 4700
},
{
"epoch": 0.2730530746913931,
"grad_norm": 2.7676291465759277,
"learning_rate": 0.0007272313555947437,
"loss": 1.5265,
"step": 4800
},
{
"epoch": 0.2787416804141305,
"grad_norm": 1.9152452945709229,
"learning_rate": 0.0007257143940686805,
"loss": 1.5152,
"step": 4900
},
{
"epoch": 0.2844302861368678,
"grad_norm": 2.56608510017395,
"learning_rate": 0.0007241974325426172,
"loss": 1.5084,
"step": 5000
},
{
"epoch": 0.2844302861368678,
"eval_accuracy": 0.636444,
"eval_loss": 1.469992995262146,
"eval_runtime": 85.5907,
"eval_samples_per_second": 2920.879,
"eval_steps_per_second": 11.415,
"step": 5000
},
{
"epoch": 0.2901188918596052,
"grad_norm": 2.2403135299682617,
"learning_rate": 0.0007226804710165538,
"loss": 1.5076,
"step": 5100
},
{
"epoch": 0.2958074975823426,
"grad_norm": 2.058535099029541,
"learning_rate": 0.0007211635094904906,
"loss": 1.4973,
"step": 5200
},
{
"epoch": 0.3014961033050799,
"grad_norm": 1.9374159574508667,
"learning_rate": 0.0007196465479644273,
"loss": 1.5055,
"step": 5300
},
{
"epoch": 0.3071847090278173,
"grad_norm": 1.8894695043563843,
"learning_rate": 0.000718129586438364,
"loss": 1.4996,
"step": 5400
},
{
"epoch": 0.31287331475055463,
"grad_norm": 2.5466501712799072,
"learning_rate": 0.0007166126249123008,
"loss": 1.5006,
"step": 5500
},
{
"epoch": 0.318561920473292,
"grad_norm": 1.9721605777740479,
"learning_rate": 0.0007150956633862374,
"loss": 1.4932,
"step": 5600
},
{
"epoch": 0.32425052619602934,
"grad_norm": 1.8921763896942139,
"learning_rate": 0.0007135787018601741,
"loss": 1.4762,
"step": 5700
},
{
"epoch": 0.3299391319187667,
"grad_norm": 2.49052357673645,
"learning_rate": 0.0007120617403341108,
"loss": 1.4636,
"step": 5800
},
{
"epoch": 0.33562773764150405,
"grad_norm": 1.8825891017913818,
"learning_rate": 0.0007105447788080475,
"loss": 1.4903,
"step": 5900
},
{
"epoch": 0.34131634336424144,
"grad_norm": 1.9227776527404785,
"learning_rate": 0.0007090278172819842,
"loss": 1.4602,
"step": 6000
},
{
"epoch": 0.34700494908697876,
"grad_norm": 2.173774242401123,
"learning_rate": 0.0007075108557559209,
"loss": 1.4581,
"step": 6100
},
{
"epoch": 0.35269355480971615,
"grad_norm": 1.9840656518936157,
"learning_rate": 0.0007059938942298576,
"loss": 1.4504,
"step": 6200
},
{
"epoch": 0.3583821605324535,
"grad_norm": 2.368171453475952,
"learning_rate": 0.000704492102319055,
"loss": 1.4659,
"step": 6300
},
{
"epoch": 0.36407076625519086,
"grad_norm": 2.005125045776367,
"learning_rate": 0.0007029751407929917,
"loss": 1.4698,
"step": 6400
},
{
"epoch": 0.3697593719779282,
"grad_norm": 1.8724095821380615,
"learning_rate": 0.0007014581792669284,
"loss": 1.4429,
"step": 6500
},
{
"epoch": 0.37544797770066557,
"grad_norm": 1.8412431478500366,
"learning_rate": 0.0006999412177408651,
"loss": 1.4368,
"step": 6600
},
{
"epoch": 0.3811365834234029,
"grad_norm": 1.9016755819320679,
"learning_rate": 0.0006984242562148018,
"loss": 1.4351,
"step": 6700
},
{
"epoch": 0.3868251891461403,
"grad_norm": 1.9896953105926514,
"learning_rate": 0.0006969072946887384,
"loss": 1.4563,
"step": 6800
},
{
"epoch": 0.39251379486887766,
"grad_norm": 2.3341548442840576,
"learning_rate": 0.0006953903331626751,
"loss": 1.4457,
"step": 6900
},
{
"epoch": 0.398202400591615,
"grad_norm": 1.95259690284729,
"learning_rate": 0.0006938733716366119,
"loss": 1.4636,
"step": 7000
},
{
"epoch": 0.4038910063143524,
"grad_norm": 1.8444461822509766,
"learning_rate": 0.0006923564101105486,
"loss": 1.4418,
"step": 7100
},
{
"epoch": 0.4095796120370897,
"grad_norm": 1.9170624017715454,
"learning_rate": 0.0006908394485844853,
"loss": 1.4267,
"step": 7200
},
{
"epoch": 0.4152682177598271,
"grad_norm": 1.6293827295303345,
"learning_rate": 0.000689322487058422,
"loss": 1.4474,
"step": 7300
},
{
"epoch": 0.4209568234825644,
"grad_norm": 2.2202467918395996,
"learning_rate": 0.0006878055255323587,
"loss": 1.4166,
"step": 7400
},
{
"epoch": 0.4266454292053018,
"grad_norm": 1.9069397449493408,
"learning_rate": 0.0006862885640062954,
"loss": 1.4194,
"step": 7500
},
{
"epoch": 0.4323340349280391,
"grad_norm": 2.0205297470092773,
"learning_rate": 0.0006847716024802322,
"loss": 1.4328,
"step": 7600
},
{
"epoch": 0.4380226406507765,
"grad_norm": 1.6736252307891846,
"learning_rate": 0.0006832546409541689,
"loss": 1.4309,
"step": 7700
},
{
"epoch": 0.44371124637351383,
"grad_norm": 1.7010937929153442,
"learning_rate": 0.0006817376794281055,
"loss": 1.412,
"step": 7800
},
{
"epoch": 0.4493998520962512,
"grad_norm": 2.748424768447876,
"learning_rate": 0.0006802207179020422,
"loss": 1.421,
"step": 7900
},
{
"epoch": 0.45508845781898855,
"grad_norm": 1.908728837966919,
"learning_rate": 0.0006787037563759789,
"loss": 1.4172,
"step": 8000
},
{
"epoch": 0.46077706354172593,
"grad_norm": 1.8672014474868774,
"learning_rate": 0.0006771867948499157,
"loss": 1.4448,
"step": 8100
},
{
"epoch": 0.46646566926446326,
"grad_norm": 2.128519058227539,
"learning_rate": 0.0006756698333238524,
"loss": 1.4158,
"step": 8200
},
{
"epoch": 0.47215427498720064,
"grad_norm": 1.7498713731765747,
"learning_rate": 0.000674152871797789,
"loss": 1.412,
"step": 8300
},
{
"epoch": 0.47784288070993797,
"grad_norm": 1.7801289558410645,
"learning_rate": 0.0006726510798869864,
"loss": 1.4146,
"step": 8400
},
{
"epoch": 0.48353148643267535,
"grad_norm": 1.9360538721084595,
"learning_rate": 0.0006711341183609232,
"loss": 1.4252,
"step": 8500
},
{
"epoch": 0.48922009215541273,
"grad_norm": 2.3669304847717285,
"learning_rate": 0.0006696171568348598,
"loss": 1.4057,
"step": 8600
},
{
"epoch": 0.49490869787815006,
"grad_norm": 1.7751379013061523,
"learning_rate": 0.0006681001953087965,
"loss": 1.4049,
"step": 8700
},
{
"epoch": 0.5005973036008874,
"grad_norm": 2.5389885902404785,
"learning_rate": 0.0006665832337827332,
"loss": 1.3837,
"step": 8800
},
{
"epoch": 0.5062859093236248,
"grad_norm": 2.5082690715789795,
"learning_rate": 0.0006650662722566699,
"loss": 1.3924,
"step": 8900
},
{
"epoch": 0.5119745150463622,
"grad_norm": 2.011589527130127,
"learning_rate": 0.0006635493107306066,
"loss": 1.3956,
"step": 9000
},
{
"epoch": 0.5176631207690995,
"grad_norm": 1.819793939590454,
"learning_rate": 0.0006620323492045433,
"loss": 1.4063,
"step": 9100
},
{
"epoch": 0.5233517264918368,
"grad_norm": 2.081247568130493,
"learning_rate": 0.00066051538767848,
"loss": 1.4145,
"step": 9200
},
{
"epoch": 0.5290403322145742,
"grad_norm": 2.151563882827759,
"learning_rate": 0.0006589984261524168,
"loss": 1.4002,
"step": 9300
},
{
"epoch": 0.5347289379373116,
"grad_norm": 1.9170759916305542,
"learning_rate": 0.0006574814646263535,
"loss": 1.3863,
"step": 9400
},
{
"epoch": 0.5404175436600489,
"grad_norm": 1.6435186862945557,
"learning_rate": 0.0006559645031002901,
"loss": 1.3888,
"step": 9500
},
{
"epoch": 0.5461061493827862,
"grad_norm": 1.8130972385406494,
"learning_rate": 0.0006544475415742268,
"loss": 1.3904,
"step": 9600
},
{
"epoch": 0.5517947551055237,
"grad_norm": 1.8200345039367676,
"learning_rate": 0.0006529305800481636,
"loss": 1.3647,
"step": 9700
},
{
"epoch": 0.557483360828261,
"grad_norm": 1.7286423444747925,
"learning_rate": 0.0006514136185221003,
"loss": 1.3815,
"step": 9800
},
{
"epoch": 0.5631719665509983,
"grad_norm": 2.345879554748535,
"learning_rate": 0.000649896656996037,
"loss": 1.3919,
"step": 9900
},
{
"epoch": 0.5688605722737357,
"grad_norm": 1.8189209699630737,
"learning_rate": 0.0006483796954699736,
"loss": 1.3684,
"step": 10000
},
{
"epoch": 0.5688605722737357,
"eval_accuracy": 0.667392,
"eval_loss": 1.3353288173675537,
"eval_runtime": 85.0475,
"eval_samples_per_second": 2939.533,
"eval_steps_per_second": 11.488,
"step": 10000
},
{
"epoch": 0.5745491779964731,
"grad_norm": 1.7264429330825806,
"learning_rate": 0.0006468627339439103,
"loss": 1.4044,
"step": 10100
},
{
"epoch": 0.5802377837192104,
"grad_norm": 1.8806540966033936,
"learning_rate": 0.0006453457724178471,
"loss": 1.3746,
"step": 10200
},
{
"epoch": 0.5859263894419477,
"grad_norm": 1.7714815139770508,
"learning_rate": 0.0006438288108917838,
"loss": 1.3837,
"step": 10300
},
{
"epoch": 0.5916149951646852,
"grad_norm": 1.713157057762146,
"learning_rate": 0.0006423118493657205,
"loss": 1.3939,
"step": 10400
},
{
"epoch": 0.5973036008874225,
"grad_norm": 2.169168472290039,
"learning_rate": 0.0006408100574549179,
"loss": 1.3658,
"step": 10500
},
{
"epoch": 0.6029922066101598,
"grad_norm": 1.727501630783081,
"learning_rate": 0.0006392930959288546,
"loss": 1.3907,
"step": 10600
},
{
"epoch": 0.6086808123328972,
"grad_norm": 2.0120322704315186,
"learning_rate": 0.0006377761344027913,
"loss": 1.3757,
"step": 10700
},
{
"epoch": 0.6143694180556346,
"grad_norm": 1.799139142036438,
"learning_rate": 0.0006362591728767279,
"loss": 1.3803,
"step": 10800
},
{
"epoch": 0.6200580237783719,
"grad_norm": 1.8817808628082275,
"learning_rate": 0.0006347422113506646,
"loss": 1.3702,
"step": 10900
},
{
"epoch": 0.6257466295011093,
"grad_norm": 2.1144518852233887,
"learning_rate": 0.0006332252498246013,
"loss": 1.3832,
"step": 11000
},
{
"epoch": 0.6314352352238466,
"grad_norm": 2.1396071910858154,
"learning_rate": 0.0006317082882985381,
"loss": 1.3611,
"step": 11100
},
{
"epoch": 0.637123840946584,
"grad_norm": 1.6794757843017578,
"learning_rate": 0.0006301913267724747,
"loss": 1.368,
"step": 11200
},
{
"epoch": 0.6428124466693214,
"grad_norm": 2.268433094024658,
"learning_rate": 0.0006286743652464114,
"loss": 1.3498,
"step": 11300
},
{
"epoch": 0.6485010523920587,
"grad_norm": 1.8515706062316895,
"learning_rate": 0.0006271574037203482,
"loss": 1.3489,
"step": 11400
},
{
"epoch": 0.654189658114796,
"grad_norm": 2.482171058654785,
"learning_rate": 0.0006256404421942849,
"loss": 1.3501,
"step": 11500
},
{
"epoch": 0.6598782638375335,
"grad_norm": 1.9485667943954468,
"learning_rate": 0.0006241234806682216,
"loss": 1.3483,
"step": 11600
},
{
"epoch": 0.6655668695602708,
"grad_norm": 1.8601367473602295,
"learning_rate": 0.0006226065191421583,
"loss": 1.3392,
"step": 11700
},
{
"epoch": 0.6712554752830081,
"grad_norm": 1.870851993560791,
"learning_rate": 0.000621089557616095,
"loss": 1.352,
"step": 11800
},
{
"epoch": 0.6769440810057455,
"grad_norm": 1.9454014301300049,
"learning_rate": 0.0006195725960900317,
"loss": 1.3537,
"step": 11900
},
{
"epoch": 0.6826326867284829,
"grad_norm": 1.9180669784545898,
"learning_rate": 0.0006180556345639685,
"loss": 1.3541,
"step": 12000
},
{
"epoch": 0.6883212924512202,
"grad_norm": 1.7796809673309326,
"learning_rate": 0.0006165386730379051,
"loss": 1.331,
"step": 12100
},
{
"epoch": 0.6940098981739575,
"grad_norm": 2.040998935699463,
"learning_rate": 0.0006150217115118417,
"loss": 1.3214,
"step": 12200
},
{
"epoch": 0.699698503896695,
"grad_norm": 1.7188791036605835,
"learning_rate": 0.0006135047499857785,
"loss": 1.3577,
"step": 12300
},
{
"epoch": 0.7053871096194323,
"grad_norm": 1.9152625799179077,
"learning_rate": 0.0006119877884597152,
"loss": 1.3682,
"step": 12400
},
{
"epoch": 0.7110757153421696,
"grad_norm": 2.150810718536377,
"learning_rate": 0.0006104708269336519,
"loss": 1.3388,
"step": 12500
},
{
"epoch": 0.716764321064907,
"grad_norm": 1.97470223903656,
"learning_rate": 0.0006089538654075887,
"loss": 1.3319,
"step": 12600
},
{
"epoch": 0.7224529267876444,
"grad_norm": 1.663122296333313,
"learning_rate": 0.0006074369038815253,
"loss": 1.3593,
"step": 12700
},
{
"epoch": 0.7281415325103817,
"grad_norm": 1.6453677415847778,
"learning_rate": 0.0006059351119707227,
"loss": 1.3592,
"step": 12800
},
{
"epoch": 0.733830138233119,
"grad_norm": 1.6896419525146484,
"learning_rate": 0.0006044181504446595,
"loss": 1.3183,
"step": 12900
},
{
"epoch": 0.7395187439558564,
"grad_norm": 1.7903008460998535,
"learning_rate": 0.000602901188918596,
"loss": 1.3373,
"step": 13000
},
{
"epoch": 0.7452073496785938,
"grad_norm": 2.2026655673980713,
"learning_rate": 0.0006013842273925327,
"loss": 1.3403,
"step": 13100
},
{
"epoch": 0.7508959554013311,
"grad_norm": 1.9204201698303223,
"learning_rate": 0.0005998672658664695,
"loss": 1.3199,
"step": 13200
},
{
"epoch": 0.7565845611240685,
"grad_norm": 1.946899652481079,
"learning_rate": 0.0005983503043404062,
"loss": 1.3298,
"step": 13300
},
{
"epoch": 0.7622731668468058,
"grad_norm": 2.019131898880005,
"learning_rate": 0.0005968333428143428,
"loss": 1.3449,
"step": 13400
},
{
"epoch": 0.7679617725695432,
"grad_norm": 1.848008155822754,
"learning_rate": 0.0005953163812882796,
"loss": 1.3206,
"step": 13500
},
{
"epoch": 0.7736503782922806,
"grad_norm": 2.373288631439209,
"learning_rate": 0.0005937994197622163,
"loss": 1.3564,
"step": 13600
},
{
"epoch": 0.7793389840150179,
"grad_norm": 2.556985855102539,
"learning_rate": 0.000592282458236153,
"loss": 1.3254,
"step": 13700
},
{
"epoch": 0.7850275897377553,
"grad_norm": 1.8957433700561523,
"learning_rate": 0.0005907654967100898,
"loss": 1.3498,
"step": 13800
},
{
"epoch": 0.7907161954604927,
"grad_norm": 1.7315127849578857,
"learning_rate": 0.0005892485351840264,
"loss": 1.3249,
"step": 13900
},
{
"epoch": 0.79640480118323,
"grad_norm": 1.973764419555664,
"learning_rate": 0.0005877315736579631,
"loss": 1.3305,
"step": 14000
},
{
"epoch": 0.8020934069059673,
"grad_norm": 1.711145281791687,
"learning_rate": 0.0005862146121318999,
"loss": 1.3011,
"step": 14100
},
{
"epoch": 0.8077820126287047,
"grad_norm": 1.8515042066574097,
"learning_rate": 0.0005846976506058365,
"loss": 1.3195,
"step": 14200
},
{
"epoch": 0.8134706183514421,
"grad_norm": 1.6278700828552246,
"learning_rate": 0.0005831806890797733,
"loss": 1.3308,
"step": 14300
},
{
"epoch": 0.8191592240741794,
"grad_norm": 1.444455623626709,
"learning_rate": 0.0005816637275537099,
"loss": 1.3119,
"step": 14400
},
{
"epoch": 0.8248478297969167,
"grad_norm": 1.6277796030044556,
"learning_rate": 0.0005801467660276466,
"loss": 1.3227,
"step": 14500
},
{
"epoch": 0.8305364355196542,
"grad_norm": 1.8428665399551392,
"learning_rate": 0.0005786298045015834,
"loss": 1.3336,
"step": 14600
},
{
"epoch": 0.8362250412423915,
"grad_norm": 1.6377763748168945,
"learning_rate": 0.0005771128429755201,
"loss": 1.3141,
"step": 14700
},
{
"epoch": 0.8419136469651288,
"grad_norm": 1.7305645942687988,
"learning_rate": 0.0005755958814494568,
"loss": 1.3062,
"step": 14800
},
{
"epoch": 0.8476022526878662,
"grad_norm": 2.469701051712036,
"learning_rate": 0.0005740940895386541,
"loss": 1.3074,
"step": 14900
},
{
"epoch": 0.8532908584106036,
"grad_norm": 1.952755331993103,
"learning_rate": 0.0005725771280125909,
"loss": 1.3568,
"step": 15000
},
{
"epoch": 0.8532908584106036,
"eval_accuracy": 0.68038,
"eval_loss": 1.2764052152633667,
"eval_runtime": 82.5452,
"eval_samples_per_second": 3028.643,
"eval_steps_per_second": 11.836,
"step": 15000
},
{
"epoch": 0.8589794641333409,
"grad_norm": 3.221471071243286,
"learning_rate": 0.0005710601664865274,
"loss": 1.3341,
"step": 15100
},
{
"epoch": 0.8646680698560782,
"grad_norm": 2.2455317974090576,
"learning_rate": 0.0005695432049604642,
"loss": 1.3276,
"step": 15200
},
{
"epoch": 0.8703566755788157,
"grad_norm": 1.8076684474945068,
"learning_rate": 0.0005680262434344009,
"loss": 1.2922,
"step": 15300
},
{
"epoch": 0.876045281301553,
"grad_norm": 1.701774001121521,
"learning_rate": 0.0005665092819083376,
"loss": 1.3003,
"step": 15400
},
{
"epoch": 0.8817338870242903,
"grad_norm": 1.5403673648834229,
"learning_rate": 0.0005649923203822744,
"loss": 1.3207,
"step": 15500
},
{
"epoch": 0.8874224927470277,
"grad_norm": 1.9462639093399048,
"learning_rate": 0.000563475358856211,
"loss": 1.3098,
"step": 15600
},
{
"epoch": 0.8931110984697651,
"grad_norm": 1.6688456535339355,
"learning_rate": 0.0005619583973301477,
"loss": 1.2993,
"step": 15700
},
{
"epoch": 0.8987997041925024,
"grad_norm": 1.6060837507247925,
"learning_rate": 0.0005604414358040845,
"loss": 1.3145,
"step": 15800
},
{
"epoch": 0.9044883099152398,
"grad_norm": 1.8593111038208008,
"learning_rate": 0.0005589244742780212,
"loss": 1.2836,
"step": 15900
},
{
"epoch": 0.9101769156379771,
"grad_norm": 2.035261869430542,
"learning_rate": 0.0005574075127519579,
"loss": 1.3125,
"step": 16000
},
{
"epoch": 0.9158655213607145,
"grad_norm": 1.6091046333312988,
"learning_rate": 0.0005558905512258946,
"loss": 1.2868,
"step": 16100
},
{
"epoch": 0.9215541270834519,
"grad_norm": 1.656204104423523,
"learning_rate": 0.0005543735896998313,
"loss": 1.3075,
"step": 16200
},
{
"epoch": 0.9272427328061892,
"grad_norm": 1.5555946826934814,
"learning_rate": 0.0005528566281737679,
"loss": 1.2963,
"step": 16300
},
{
"epoch": 0.9329313385289265,
"grad_norm": 1.7379626035690308,
"learning_rate": 0.0005513396666477047,
"loss": 1.2905,
"step": 16400
},
{
"epoch": 0.938619944251664,
"grad_norm": 1.5103166103363037,
"learning_rate": 0.0005498227051216414,
"loss": 1.2848,
"step": 16500
},
{
"epoch": 0.9443085499744013,
"grad_norm": 1.5895978212356567,
"learning_rate": 0.000548305743595578,
"loss": 1.293,
"step": 16600
},
{
"epoch": 0.9499971556971386,
"grad_norm": 1.6526978015899658,
"learning_rate": 0.0005467887820695148,
"loss": 1.288,
"step": 16700
},
{
"epoch": 0.9556857614198759,
"grad_norm": 1.7471717596054077,
"learning_rate": 0.0005452718205434515,
"loss": 1.3099,
"step": 16800
},
{
"epoch": 0.9613743671426134,
"grad_norm": 1.5995450019836426,
"learning_rate": 0.0005437700286326488,
"loss": 1.2886,
"step": 16900
},
{
"epoch": 0.9670629728653507,
"grad_norm": 1.7462047338485718,
"learning_rate": 0.0005422530671065856,
"loss": 1.317,
"step": 17000
},
{
"epoch": 0.972751578588088,
"grad_norm": 1.5739308595657349,
"learning_rate": 0.0005407361055805223,
"loss": 1.2997,
"step": 17100
},
{
"epoch": 0.9784401843108255,
"grad_norm": 1.6608139276504517,
"learning_rate": 0.0005392191440544589,
"loss": 1.3037,
"step": 17200
},
{
"epoch": 0.9841287900335628,
"grad_norm": 1.7515637874603271,
"learning_rate": 0.0005377021825283956,
"loss": 1.302,
"step": 17300
},
{
"epoch": 0.9898173957563001,
"grad_norm": 1.572986364364624,
"learning_rate": 0.0005361852210023323,
"loss": 1.2945,
"step": 17400
},
{
"epoch": 0.9955060014790375,
"grad_norm": 1.9207016229629517,
"learning_rate": 0.000534668259476269,
"loss": 1.2747,
"step": 17500
},
{
"epoch": 1.0011946072017748,
"grad_norm": 1.9010945558547974,
"learning_rate": 0.0005331512979502058,
"loss": 1.263,
"step": 17600
},
{
"epoch": 1.0068832129245122,
"grad_norm": 2.4259393215179443,
"learning_rate": 0.0005316343364241425,
"loss": 1.2741,
"step": 17700
},
{
"epoch": 1.0125718186472497,
"grad_norm": 2.5002028942108154,
"learning_rate": 0.0005301173748980791,
"loss": 1.2686,
"step": 17800
},
{
"epoch": 1.0182604243699869,
"grad_norm": 1.7075704336166382,
"learning_rate": 0.0005286004133720159,
"loss": 1.2661,
"step": 17900
},
{
"epoch": 1.0239490300927243,
"grad_norm": 1.7390458583831787,
"learning_rate": 0.0005270834518459526,
"loss": 1.2698,
"step": 18000
},
{
"epoch": 1.0296376358154615,
"grad_norm": 1.980185627937317,
"learning_rate": 0.0005255664903198893,
"loss": 1.2569,
"step": 18100
},
{
"epoch": 1.035326241538199,
"grad_norm": 1.79970383644104,
"learning_rate": 0.0005240495287938261,
"loss": 1.2738,
"step": 18200
},
{
"epoch": 1.0410148472609364,
"grad_norm": 1.6184749603271484,
"learning_rate": 0.0005225325672677627,
"loss": 1.2637,
"step": 18300
},
{
"epoch": 1.0467034529836736,
"grad_norm": 2.3463358879089355,
"learning_rate": 0.0005210156057416993,
"loss": 1.2665,
"step": 18400
},
{
"epoch": 1.052392058706411,
"grad_norm": 1.8550745248794556,
"learning_rate": 0.0005194986442156361,
"loss": 1.2664,
"step": 18500
},
{
"epoch": 1.0580806644291485,
"grad_norm": 1.8582580089569092,
"learning_rate": 0.0005179816826895728,
"loss": 1.2442,
"step": 18600
},
{
"epoch": 1.0637692701518857,
"grad_norm": 1.88007390499115,
"learning_rate": 0.0005164647211635095,
"loss": 1.2536,
"step": 18700
},
{
"epoch": 1.0694578758746232,
"grad_norm": 1.804671287536621,
"learning_rate": 0.0005149477596374462,
"loss": 1.2459,
"step": 18800
},
{
"epoch": 1.0751464815973604,
"grad_norm": 1.7329107522964478,
"learning_rate": 0.0005134307981113829,
"loss": 1.2499,
"step": 18900
},
{
"epoch": 1.0808350873200978,
"grad_norm": 1.693323016166687,
"learning_rate": 0.0005119290062005802,
"loss": 1.25,
"step": 19000
},
{
"epoch": 1.0865236930428352,
"grad_norm": 1.600060224533081,
"learning_rate": 0.000510412044674517,
"loss": 1.2515,
"step": 19100
},
{
"epoch": 1.0922122987655725,
"grad_norm": 1.8084614276885986,
"learning_rate": 0.0005088950831484537,
"loss": 1.246,
"step": 19200
},
{
"epoch": 1.09790090448831,
"grad_norm": 1.8022205829620361,
"learning_rate": 0.0005073781216223904,
"loss": 1.2597,
"step": 19300
},
{
"epoch": 1.1035895102110473,
"grad_norm": 1.6137562990188599,
"learning_rate": 0.0005058611600963271,
"loss": 1.2685,
"step": 19400
},
{
"epoch": 1.1092781159337846,
"grad_norm": 1.7756201028823853,
"learning_rate": 0.0005043441985702637,
"loss": 1.2606,
"step": 19500
},
{
"epoch": 1.114966721656522,
"grad_norm": 1.8828805685043335,
"learning_rate": 0.0005028272370442004,
"loss": 1.2582,
"step": 19600
},
{
"epoch": 1.1206553273792594,
"grad_norm": 1.6829185485839844,
"learning_rate": 0.0005013102755181372,
"loss": 1.2563,
"step": 19700
},
{
"epoch": 1.1263439331019967,
"grad_norm": 1.6716195344924927,
"learning_rate": 0.0004997933139920739,
"loss": 1.2405,
"step": 19800
},
{
"epoch": 1.132032538824734,
"grad_norm": 1.7629872560501099,
"learning_rate": 0.0004982763524660106,
"loss": 1.2649,
"step": 19900
},
{
"epoch": 1.1377211445474713,
"grad_norm": 1.704967737197876,
"learning_rate": 0.0004967593909399473,
"loss": 1.226,
"step": 20000
},
{
"epoch": 1.1377211445474713,
"eval_accuracy": 0.692396,
"eval_loss": 1.2322564125061035,
"eval_runtime": 82.0826,
"eval_samples_per_second": 3045.712,
"eval_steps_per_second": 11.903,
"step": 20000
},
{
"epoch": 1.1434097502702087,
"grad_norm": 1.5182781219482422,
"learning_rate": 0.000495242429413884,
"loss": 1.2343,
"step": 20100
},
{
"epoch": 1.1490983559929462,
"grad_norm": 2.637796640396118,
"learning_rate": 0.0004937254678878207,
"loss": 1.2506,
"step": 20200
},
{
"epoch": 1.1547869617156834,
"grad_norm": 1.8955748081207275,
"learning_rate": 0.0004922085063617575,
"loss": 1.2625,
"step": 20300
},
{
"epoch": 1.1604755674384208,
"grad_norm": 2.0370640754699707,
"learning_rate": 0.0004906915448356942,
"loss": 1.2551,
"step": 20400
},
{
"epoch": 1.1661641731611583,
"grad_norm": 1.8047020435333252,
"learning_rate": 0.0004891745833096308,
"loss": 1.2489,
"step": 20500
},
{
"epoch": 1.1718527788838955,
"grad_norm": 1.5440089702606201,
"learning_rate": 0.0004876576217835675,
"loss": 1.2646,
"step": 20600
},
{
"epoch": 1.177541384606633,
"grad_norm": 1.5029830932617188,
"learning_rate": 0.00048614066025750425,
"loss": 1.2536,
"step": 20700
},
{
"epoch": 1.1832299903293704,
"grad_norm": 1.4674205780029297,
"learning_rate": 0.0004846236987314409,
"loss": 1.2457,
"step": 20800
},
{
"epoch": 1.1889185960521076,
"grad_norm": 1.5259037017822266,
"learning_rate": 0.00048310673720537765,
"loss": 1.2449,
"step": 20900
},
{
"epoch": 1.194607201774845,
"grad_norm": 1.6339012384414673,
"learning_rate": 0.0004815897756793144,
"loss": 1.2163,
"step": 21000
},
{
"epoch": 1.2002958074975822,
"grad_norm": 1.5565885305404663,
"learning_rate": 0.00048007281415325106,
"loss": 1.2461,
"step": 21100
},
{
"epoch": 1.2059844132203197,
"grad_norm": 1.676540493965149,
"learning_rate": 0.0004785558526271878,
"loss": 1.2555,
"step": 21200
},
{
"epoch": 1.2116730189430571,
"grad_norm": 1.6003342866897583,
"learning_rate": 0.00047705406071638513,
"loss": 1.2507,
"step": 21300
},
{
"epoch": 1.2173616246657943,
"grad_norm": 2.2655630111694336,
"learning_rate": 0.00047553709919032175,
"loss": 1.2144,
"step": 21400
},
{
"epoch": 1.2230502303885318,
"grad_norm": 1.695094108581543,
"learning_rate": 0.0004740201376642585,
"loss": 1.2415,
"step": 21500
},
{
"epoch": 1.2287388361112692,
"grad_norm": 1.8387731313705444,
"learning_rate": 0.0004725031761381952,
"loss": 1.2406,
"step": 21600
},
{
"epoch": 1.2344274418340064,
"grad_norm": 1.6776598691940308,
"learning_rate": 0.0004709862146121319,
"loss": 1.2673,
"step": 21700
},
{
"epoch": 1.2401160475567439,
"grad_norm": 1.6573506593704224,
"learning_rate": 0.0004694692530860686,
"loss": 1.2587,
"step": 21800
},
{
"epoch": 1.2458046532794813,
"grad_norm": 1.6786317825317383,
"learning_rate": 0.00046795229156000535,
"loss": 1.2464,
"step": 21900
},
{
"epoch": 1.2514932590022185,
"grad_norm": 1.887971043586731,
"learning_rate": 0.00046643533003394203,
"loss": 1.2501,
"step": 22000
},
{
"epoch": 1.257181864724956,
"grad_norm": 1.7499343156814575,
"learning_rate": 0.00046491836850787876,
"loss": 1.2296,
"step": 22100
},
{
"epoch": 1.2628704704476932,
"grad_norm": 2.057670831680298,
"learning_rate": 0.00046340140698181544,
"loss": 1.2346,
"step": 22200
},
{
"epoch": 1.2685590761704306,
"grad_norm": 1.7353135347366333,
"learning_rate": 0.00046188444545575217,
"loss": 1.2512,
"step": 22300
},
{
"epoch": 1.274247681893168,
"grad_norm": 2.0662734508514404,
"learning_rate": 0.0004603674839296889,
"loss": 1.2493,
"step": 22400
},
{
"epoch": 1.2799362876159053,
"grad_norm": 1.5519914627075195,
"learning_rate": 0.0004588505224036255,
"loss": 1.2404,
"step": 22500
},
{
"epoch": 1.2856248933386427,
"grad_norm": 1.8667906522750854,
"learning_rate": 0.0004573335608775622,
"loss": 1.247,
"step": 22600
},
{
"epoch": 1.29131349906138,
"grad_norm": 1.8621453046798706,
"learning_rate": 0.00045581659935149893,
"loss": 1.2428,
"step": 22700
},
{
"epoch": 1.2970021047841174,
"grad_norm": 1.7203937768936157,
"learning_rate": 0.00045429963782543566,
"loss": 1.2273,
"step": 22800
},
{
"epoch": 1.3026907105068548,
"grad_norm": 1.7497667074203491,
"learning_rate": 0.00045278267629937234,
"loss": 1.2458,
"step": 22900
},
{
"epoch": 1.3083793162295922,
"grad_norm": 2.057507276535034,
"learning_rate": 0.0004512657147733091,
"loss": 1.2325,
"step": 23000
},
{
"epoch": 1.3140679219523295,
"grad_norm": 1.4594337940216064,
"learning_rate": 0.0004497487532472458,
"loss": 1.2319,
"step": 23100
},
{
"epoch": 1.319756527675067,
"grad_norm": 2.1696736812591553,
"learning_rate": 0.0004482317917211825,
"loss": 1.234,
"step": 23200
},
{
"epoch": 1.3254451333978041,
"grad_norm": 1.8165019750595093,
"learning_rate": 0.0004467148301951192,
"loss": 1.2256,
"step": 23300
},
{
"epoch": 1.3311337391205416,
"grad_norm": 1.5531728267669678,
"learning_rate": 0.00044519786866905594,
"loss": 1.2518,
"step": 23400
},
{
"epoch": 1.336822344843279,
"grad_norm": 1.4592831134796143,
"learning_rate": 0.0004436809071429926,
"loss": 1.2192,
"step": 23500
},
{
"epoch": 1.3425109505660162,
"grad_norm": 1.74478280544281,
"learning_rate": 0.00044216394561692935,
"loss": 1.2427,
"step": 23600
},
{
"epoch": 1.3481995562887537,
"grad_norm": 1.8685113191604614,
"learning_rate": 0.000440646984090866,
"loss": 1.2581,
"step": 23700
},
{
"epoch": 1.3538881620114909,
"grad_norm": 1.7366535663604736,
"learning_rate": 0.0004391300225648027,
"loss": 1.2471,
"step": 23800
},
{
"epoch": 1.3595767677342283,
"grad_norm": 1.6585444211959839,
"learning_rate": 0.0004376130610387394,
"loss": 1.2198,
"step": 23900
},
{
"epoch": 1.3652653734569657,
"grad_norm": 1.9299806356430054,
"learning_rate": 0.0004360960995126761,
"loss": 1.2314,
"step": 24000
},
{
"epoch": 1.3709539791797032,
"grad_norm": 1.8172481060028076,
"learning_rate": 0.00043457913798661285,
"loss": 1.2098,
"step": 24100
},
{
"epoch": 1.3766425849024404,
"grad_norm": 1.5579493045806885,
"learning_rate": 0.0004330621764605495,
"loss": 1.2043,
"step": 24200
},
{
"epoch": 1.3823311906251778,
"grad_norm": 1.8178203105926514,
"learning_rate": 0.00043154521493448625,
"loss": 1.2235,
"step": 24300
},
{
"epoch": 1.388019796347915,
"grad_norm": 1.676126480102539,
"learning_rate": 0.000430028253408423,
"loss": 1.2388,
"step": 24400
},
{
"epoch": 1.3937084020706525,
"grad_norm": 1.863893985748291,
"learning_rate": 0.00042851129188235966,
"loss": 1.2206,
"step": 24500
},
{
"epoch": 1.39939700779339,
"grad_norm": 1.5618318319320679,
"learning_rate": 0.0004269943303562964,
"loss": 1.2359,
"step": 24600
},
{
"epoch": 1.4050856135161272,
"grad_norm": 1.3972681760787964,
"learning_rate": 0.0004254773688302331,
"loss": 1.2152,
"step": 24700
},
{
"epoch": 1.4107742192388646,
"grad_norm": 1.584274411201477,
"learning_rate": 0.00042396040730416975,
"loss": 1.211,
"step": 24800
},
{
"epoch": 1.4164628249616018,
"grad_norm": 1.7282276153564453,
"learning_rate": 0.0004224434457781064,
"loss": 1.2034,
"step": 24900
},
{
"epoch": 1.4221514306843392,
"grad_norm": 2.2420654296875,
"learning_rate": 0.00042092648425204316,
"loss": 1.2125,
"step": 25000
},
{
"epoch": 1.4221514306843392,
"eval_accuracy": 0.703072,
"eval_loss": 1.185011863708496,
"eval_runtime": 81.7158,
"eval_samples_per_second": 3059.385,
"eval_steps_per_second": 11.956,
"step": 25000
},
{
"epoch": 1.4278400364070767,
"grad_norm": 1.5998648405075073,
"learning_rate": 0.0004194095227259799,
"loss": 1.2145,
"step": 25100
},
{
"epoch": 1.433528642129814,
"grad_norm": 1.8546173572540283,
"learning_rate": 0.00041789256119991656,
"loss": 1.2293,
"step": 25200
},
{
"epoch": 1.4392172478525513,
"grad_norm": 1.6815022230148315,
"learning_rate": 0.00041639076928911395,
"loss": 1.2165,
"step": 25300
},
{
"epoch": 1.4449058535752888,
"grad_norm": 1.5567988157272339,
"learning_rate": 0.0004148889773783113,
"loss": 1.2231,
"step": 25400
},
{
"epoch": 1.450594459298026,
"grad_norm": 1.9424728155136108,
"learning_rate": 0.000413372015852248,
"loss": 1.2302,
"step": 25500
},
{
"epoch": 1.4562830650207634,
"grad_norm": 1.9052510261535645,
"learning_rate": 0.00041185505432618464,
"loss": 1.2174,
"step": 25600
},
{
"epoch": 1.4619716707435009,
"grad_norm": 1.470513939857483,
"learning_rate": 0.0004103380928001213,
"loss": 1.2167,
"step": 25700
},
{
"epoch": 1.467660276466238,
"grad_norm": 1.5082899332046509,
"learning_rate": 0.00040882113127405805,
"loss": 1.2098,
"step": 25800
},
{
"epoch": 1.4733488821889755,
"grad_norm": 1.7309447526931763,
"learning_rate": 0.0004073041697479948,
"loss": 1.2111,
"step": 25900
},
{
"epoch": 1.4790374879117127,
"grad_norm": 1.7894470691680908,
"learning_rate": 0.00040578720822193146,
"loss": 1.2237,
"step": 26000
},
{
"epoch": 1.4847260936344502,
"grad_norm": 1.712557077407837,
"learning_rate": 0.0004042702466958682,
"loss": 1.2314,
"step": 26100
},
{
"epoch": 1.4904146993571876,
"grad_norm": 1.794565200805664,
"learning_rate": 0.0004027532851698049,
"loss": 1.2072,
"step": 26200
},
{
"epoch": 1.4961033050799248,
"grad_norm": 1.655179500579834,
"learning_rate": 0.0004012363236437416,
"loss": 1.218,
"step": 26300
},
{
"epoch": 1.5017919108026623,
"grad_norm": 1.8749343156814575,
"learning_rate": 0.00039971936211767833,
"loss": 1.2037,
"step": 26400
},
{
"epoch": 1.5074805165253995,
"grad_norm": 1.5337320566177368,
"learning_rate": 0.000398202400591615,
"loss": 1.2179,
"step": 26500
},
{
"epoch": 1.513169122248137,
"grad_norm": 1.5731686353683472,
"learning_rate": 0.0003966854390655517,
"loss": 1.2037,
"step": 26600
},
{
"epoch": 1.5188577279708744,
"grad_norm": 1.5700329542160034,
"learning_rate": 0.0003951684775394884,
"loss": 1.2189,
"step": 26700
},
{
"epoch": 1.5245463336936118,
"grad_norm": 1.9315118789672852,
"learning_rate": 0.00039365151601342515,
"loss": 1.2314,
"step": 26800
},
{
"epoch": 1.530234939416349,
"grad_norm": 1.6017844676971436,
"learning_rate": 0.0003921345544873618,
"loss": 1.211,
"step": 26900
},
{
"epoch": 1.5359235451390862,
"grad_norm": 1.586595058441162,
"learning_rate": 0.0003906175929612985,
"loss": 1.2079,
"step": 27000
},
{
"epoch": 1.5416121508618237,
"grad_norm": 1.8215593099594116,
"learning_rate": 0.00038910063143523523,
"loss": 1.2022,
"step": 27100
},
{
"epoch": 1.5473007565845611,
"grad_norm": 1.7390124797821045,
"learning_rate": 0.00038758366990917196,
"loss": 1.2143,
"step": 27200
},
{
"epoch": 1.5529893623072986,
"grad_norm": 1.792608618736267,
"learning_rate": 0.00038606670838310864,
"loss": 1.2104,
"step": 27300
},
{
"epoch": 1.558677968030036,
"grad_norm": 1.802167296409607,
"learning_rate": 0.00038454974685704537,
"loss": 1.1924,
"step": 27400
},
{
"epoch": 1.5643665737527732,
"grad_norm": 1.7943332195281982,
"learning_rate": 0.0003830327853309821,
"loss": 1.2096,
"step": 27500
},
{
"epoch": 1.5700551794755104,
"grad_norm": 1.745893120765686,
"learning_rate": 0.0003815158238049187,
"loss": 1.1941,
"step": 27600
},
{
"epoch": 1.5757437851982479,
"grad_norm": 1.6740118265151978,
"learning_rate": 0.00037999886227885546,
"loss": 1.2355,
"step": 27700
},
{
"epoch": 1.5814323909209853,
"grad_norm": 1.681840419769287,
"learning_rate": 0.0003784970703680528,
"loss": 1.2034,
"step": 27800
},
{
"epoch": 1.5871209966437227,
"grad_norm": 1.6897751092910767,
"learning_rate": 0.0003769801088419895,
"loss": 1.2169,
"step": 27900
},
{
"epoch": 1.59280960236646,
"grad_norm": 1.686784267425537,
"learning_rate": 0.0003754631473159262,
"loss": 1.2093,
"step": 28000
},
{
"epoch": 1.5984982080891972,
"grad_norm": 1.6020421981811523,
"learning_rate": 0.00037394618578986293,
"loss": 1.2131,
"step": 28100
},
{
"epoch": 1.6041868138119346,
"grad_norm": 1.478246808052063,
"learning_rate": 0.0003724292242637996,
"loss": 1.2048,
"step": 28200
},
{
"epoch": 1.609875419534672,
"grad_norm": 1.4912410974502563,
"learning_rate": 0.00037091226273773634,
"loss": 1.18,
"step": 28300
},
{
"epoch": 1.6155640252574095,
"grad_norm": 1.6362539529800415,
"learning_rate": 0.00036939530121167307,
"loss": 1.2022,
"step": 28400
},
{
"epoch": 1.6212526309801467,
"grad_norm": 1.5238479375839233,
"learning_rate": 0.00036787833968560975,
"loss": 1.2105,
"step": 28500
},
{
"epoch": 1.6269412367028842,
"grad_norm": 1.6359635591506958,
"learning_rate": 0.0003663613781595464,
"loss": 1.1833,
"step": 28600
},
{
"epoch": 1.6326298424256214,
"grad_norm": 1.6206257343292236,
"learning_rate": 0.00036484441663348316,
"loss": 1.1945,
"step": 28700
},
{
"epoch": 1.6383184481483588,
"grad_norm": 1.7032015323638916,
"learning_rate": 0.00036332745510741983,
"loss": 1.2065,
"step": 28800
},
{
"epoch": 1.6440070538710962,
"grad_norm": 1.7177228927612305,
"learning_rate": 0.00036181049358135657,
"loss": 1.2035,
"step": 28900
},
{
"epoch": 1.6496956595938337,
"grad_norm": 1.5967752933502197,
"learning_rate": 0.0003602935320552933,
"loss": 1.2036,
"step": 29000
},
{
"epoch": 1.655384265316571,
"grad_norm": 1.6632803678512573,
"learning_rate": 0.00035877657052923,
"loss": 1.226,
"step": 29100
},
{
"epoch": 1.6610728710393081,
"grad_norm": 1.5134357213974,
"learning_rate": 0.00035725960900316665,
"loss": 1.1947,
"step": 29200
},
{
"epoch": 1.6667614767620456,
"grad_norm": 1.5506322383880615,
"learning_rate": 0.0003557426474771034,
"loss": 1.1963,
"step": 29300
},
{
"epoch": 1.672450082484783,
"grad_norm": 1.4821183681488037,
"learning_rate": 0.0003542256859510401,
"loss": 1.2039,
"step": 29400
},
{
"epoch": 1.6781386882075204,
"grad_norm": 2.278379440307617,
"learning_rate": 0.0003527087244249768,
"loss": 1.205,
"step": 29500
},
{
"epoch": 1.6838272939302577,
"grad_norm": 1.5077921152114868,
"learning_rate": 0.0003511917628989135,
"loss": 1.1984,
"step": 29600
},
{
"epoch": 1.689515899652995,
"grad_norm": 1.629607915878296,
"learning_rate": 0.0003496748013728502,
"loss": 1.2023,
"step": 29700
},
{
"epoch": 1.6952045053757323,
"grad_norm": 1.5007668733596802,
"learning_rate": 0.0003481578398467869,
"loss": 1.1907,
"step": 29800
},
{
"epoch": 1.7008931110984697,
"grad_norm": 1.7543882131576538,
"learning_rate": 0.0003466408783207236,
"loss": 1.1949,
"step": 29900
},
{
"epoch": 1.7065817168212072,
"grad_norm": 1.6254594326019287,
"learning_rate": 0.00034512391679466034,
"loss": 1.1912,
"step": 30000
},
{
"epoch": 1.7065817168212072,
"eval_accuracy": 0.709248,
"eval_loss": 1.1566522121429443,
"eval_runtime": 79.5399,
"eval_samples_per_second": 3143.077,
"eval_steps_per_second": 12.283,
"step": 30000
},
{
"epoch": 1.7122703225439446,
"grad_norm": 1.873049020767212,
"learning_rate": 0.000343606955268597,
"loss": 1.2063,
"step": 30100
},
{
"epoch": 1.7179589282666818,
"grad_norm": 1.5862141847610474,
"learning_rate": 0.00034208999374253375,
"loss": 1.1926,
"step": 30200
},
{
"epoch": 1.723647533989419,
"grad_norm": 1.9915696382522583,
"learning_rate": 0.0003405730322164704,
"loss": 1.1952,
"step": 30300
},
{
"epoch": 1.7293361397121565,
"grad_norm": 1.856048822402954,
"learning_rate": 0.0003390560706904071,
"loss": 1.1953,
"step": 30400
},
{
"epoch": 1.735024745434894,
"grad_norm": 1.6758267879486084,
"learning_rate": 0.00033753910916434383,
"loss": 1.1906,
"step": 30500
},
{
"epoch": 1.7407133511576314,
"grad_norm": 1.8683140277862549,
"learning_rate": 0.00033602214763828056,
"loss": 1.2025,
"step": 30600
},
{
"epoch": 1.7464019568803686,
"grad_norm": 1.452721118927002,
"learning_rate": 0.00033450518611221724,
"loss": 1.1866,
"step": 30700
},
{
"epoch": 1.752090562603106,
"grad_norm": 1.5711089372634888,
"learning_rate": 0.0003329882245861539,
"loss": 1.1856,
"step": 30800
},
{
"epoch": 1.7577791683258432,
"grad_norm": 2.0584185123443604,
"learning_rate": 0.00033147126306009065,
"loss": 1.1873,
"step": 30900
},
{
"epoch": 1.7634677740485807,
"grad_norm": 1.5743275880813599,
"learning_rate": 0.0003299543015340274,
"loss": 1.1988,
"step": 31000
},
{
"epoch": 1.7691563797713181,
"grad_norm": 1.5788936614990234,
"learning_rate": 0.00032843734000796406,
"loss": 1.1932,
"step": 31100
},
{
"epoch": 1.7748449854940556,
"grad_norm": 1.6406651735305786,
"learning_rate": 0.0003269203784819008,
"loss": 1.1876,
"step": 31200
},
{
"epoch": 1.7805335912167928,
"grad_norm": 1.6410019397735596,
"learning_rate": 0.00032540341695583747,
"loss": 1.1859,
"step": 31300
},
{
"epoch": 1.78622219693953,
"grad_norm": 1.548140287399292,
"learning_rate": 0.00032388645542977414,
"loss": 1.2032,
"step": 31400
},
{
"epoch": 1.7919108026622674,
"grad_norm": 1.9242947101593018,
"learning_rate": 0.0003223694939037109,
"loss": 1.199,
"step": 31500
},
{
"epoch": 1.7975994083850049,
"grad_norm": 2.0189428329467773,
"learning_rate": 0.0003208525323776476,
"loss": 1.1832,
"step": 31600
},
{
"epoch": 1.8032880141077423,
"grad_norm": 1.740432620048523,
"learning_rate": 0.0003193355708515843,
"loss": 1.183,
"step": 31700
},
{
"epoch": 1.8089766198304795,
"grad_norm": 1.743503451347351,
"learning_rate": 0.0003178337789407816,
"loss": 1.1991,
"step": 31800
},
{
"epoch": 1.8146652255532167,
"grad_norm": 1.7166736125946045,
"learning_rate": 0.00031631681741471835,
"loss": 1.2031,
"step": 31900
},
{
"epoch": 1.8203538312759542,
"grad_norm": 1.626386046409607,
"learning_rate": 0.000314799855888655,
"loss": 1.1987,
"step": 32000
},
{
"epoch": 1.8260424369986916,
"grad_norm": 1.5402131080627441,
"learning_rate": 0.00031328289436259176,
"loss": 1.172,
"step": 32100
},
{
"epoch": 1.831731042721429,
"grad_norm": 1.6522256135940552,
"learning_rate": 0.0003117659328365285,
"loss": 1.179,
"step": 32200
},
{
"epoch": 1.8374196484441665,
"grad_norm": 1.482009768486023,
"learning_rate": 0.00031024897131046517,
"loss": 1.1903,
"step": 32300
},
{
"epoch": 1.8431082541669037,
"grad_norm": 1.6417380571365356,
"learning_rate": 0.00030873200978440184,
"loss": 1.2024,
"step": 32400
},
{
"epoch": 1.848796859889641,
"grad_norm": 1.532333493232727,
"learning_rate": 0.0003072150482583386,
"loss": 1.1843,
"step": 32500
},
{
"epoch": 1.8544854656123784,
"grad_norm": 2.004293441772461,
"learning_rate": 0.00030569808673227525,
"loss": 1.192,
"step": 32600
},
{
"epoch": 1.8601740713351158,
"grad_norm": 1.7226125001907349,
"learning_rate": 0.000304181125206212,
"loss": 1.1902,
"step": 32700
},
{
"epoch": 1.8658626770578532,
"grad_norm": 1.7714165449142456,
"learning_rate": 0.0003026641636801487,
"loss": 1.1908,
"step": 32800
},
{
"epoch": 1.8715512827805905,
"grad_norm": 1.5100337266921997,
"learning_rate": 0.00030114720215408534,
"loss": 1.1735,
"step": 32900
},
{
"epoch": 1.8772398885033277,
"grad_norm": 1.6792744398117065,
"learning_rate": 0.00029963024062802207,
"loss": 1.191,
"step": 33000
},
{
"epoch": 1.8829284942260651,
"grad_norm": 1.705554723739624,
"learning_rate": 0.0002981132791019588,
"loss": 1.1878,
"step": 33100
},
{
"epoch": 1.8886170999488026,
"grad_norm": 1.4528917074203491,
"learning_rate": 0.0002965963175758955,
"loss": 1.1685,
"step": 33200
},
{
"epoch": 1.89430570567154,
"grad_norm": 1.7752711772918701,
"learning_rate": 0.0002950793560498322,
"loss": 1.1743,
"step": 33300
},
{
"epoch": 1.8999943113942772,
"grad_norm": 1.762074589729309,
"learning_rate": 0.00029356239452376894,
"loss": 1.1775,
"step": 33400
},
{
"epoch": 1.9056829171170147,
"grad_norm": 1.6388828754425049,
"learning_rate": 0.0002920454329977056,
"loss": 1.1762,
"step": 33500
},
{
"epoch": 1.9113715228397519,
"grad_norm": 1.5171791315078735,
"learning_rate": 0.0002905284714716423,
"loss": 1.1649,
"step": 33600
},
{
"epoch": 1.9170601285624893,
"grad_norm": 1.6547460556030273,
"learning_rate": 0.000289011509945579,
"loss": 1.1904,
"step": 33700
},
{
"epoch": 1.9227487342852267,
"grad_norm": 1.705083966255188,
"learning_rate": 0.00028750971803477636,
"loss": 1.1667,
"step": 33800
},
{
"epoch": 1.9284373400079642,
"grad_norm": 1.731803059577942,
"learning_rate": 0.00028599275650871304,
"loss": 1.1788,
"step": 33900
},
{
"epoch": 1.9341259457307014,
"grad_norm": 2.056766986846924,
"learning_rate": 0.00028447579498264977,
"loss": 1.1878,
"step": 34000
},
{
"epoch": 1.9398145514534386,
"grad_norm": 1.8016914129257202,
"learning_rate": 0.00028295883345658644,
"loss": 1.1632,
"step": 34100
},
{
"epoch": 1.945503157176176,
"grad_norm": 1.7706475257873535,
"learning_rate": 0.0002814418719305232,
"loss": 1.1658,
"step": 34200
},
{
"epoch": 1.9511917628989135,
"grad_norm": 1.8184970617294312,
"learning_rate": 0.0002799249104044599,
"loss": 1.1666,
"step": 34300
},
{
"epoch": 1.956880368621651,
"grad_norm": 1.6529743671417236,
"learning_rate": 0.0002784079488783966,
"loss": 1.1846,
"step": 34400
},
{
"epoch": 1.9625689743443882,
"grad_norm": 1.5860931873321533,
"learning_rate": 0.00027689098735233326,
"loss": 1.1917,
"step": 34500
},
{
"epoch": 1.9682575800671256,
"grad_norm": 1.672756552696228,
"learning_rate": 0.00027537402582627,
"loss": 1.1654,
"step": 34600
},
{
"epoch": 1.9739461857898628,
"grad_norm": 1.7606583833694458,
"learning_rate": 0.0002738570643002067,
"loss": 1.176,
"step": 34700
},
{
"epoch": 1.9796347915126002,
"grad_norm": 1.912277340888977,
"learning_rate": 0.0002723401027741434,
"loss": 1.1695,
"step": 34800
},
{
"epoch": 1.9853233972353377,
"grad_norm": 1.7096484899520874,
"learning_rate": 0.00027082314124808013,
"loss": 1.1669,
"step": 34900
},
{
"epoch": 1.9910120029580751,
"grad_norm": 1.7793241739273071,
"learning_rate": 0.0002693061797220168,
"loss": 1.1902,
"step": 35000
},
{
"epoch": 1.9910120029580751,
"eval_accuracy": 0.71648,
"eval_loss": 1.1297262907028198,
"eval_runtime": 79.5966,
"eval_samples_per_second": 3140.839,
"eval_steps_per_second": 12.274,
"step": 35000
},
{
"epoch": 1.9967006086808123,
"grad_norm": 1.4913907051086426,
"learning_rate": 0.0002677892181959535,
"loss": 1.1607,
"step": 35100
},
{
"epoch": 2.0023892144035496,
"grad_norm": 1.639985203742981,
"learning_rate": 0.0002662874262851509,
"loss": 1.1727,
"step": 35200
},
{
"epoch": 2.008077820126287,
"grad_norm": 1.6419970989227295,
"learning_rate": 0.00026477046475908755,
"loss": 1.1392,
"step": 35300
},
{
"epoch": 2.0137664258490244,
"grad_norm": 1.8132672309875488,
"learning_rate": 0.00026325350323302423,
"loss": 1.1503,
"step": 35400
},
{
"epoch": 2.019455031571762,
"grad_norm": 1.4656819105148315,
"learning_rate": 0.00026173654170696096,
"loss": 1.1565,
"step": 35500
},
{
"epoch": 2.0251436372944993,
"grad_norm": 1.3595716953277588,
"learning_rate": 0.0002602195801808977,
"loss": 1.1526,
"step": 35600
},
{
"epoch": 2.0308322430172363,
"grad_norm": 1.6904360055923462,
"learning_rate": 0.00025870261865483437,
"loss": 1.1448,
"step": 35700
},
{
"epoch": 2.0365208487399737,
"grad_norm": 1.7240209579467773,
"learning_rate": 0.0002571856571287711,
"loss": 1.1424,
"step": 35800
},
{
"epoch": 2.042209454462711,
"grad_norm": 1.5376731157302856,
"learning_rate": 0.00025566869560270783,
"loss": 1.143,
"step": 35900
},
{
"epoch": 2.0478980601854486,
"grad_norm": 1.893202781677246,
"learning_rate": 0.00025415173407664445,
"loss": 1.1519,
"step": 36000
},
{
"epoch": 2.053586665908186,
"grad_norm": 1.9057211875915527,
"learning_rate": 0.0002526347725505812,
"loss": 1.1375,
"step": 36100
},
{
"epoch": 2.059275271630923,
"grad_norm": 1.7818187475204468,
"learning_rate": 0.0002511178110245179,
"loss": 1.1424,
"step": 36200
},
{
"epoch": 2.0649638773536605,
"grad_norm": 1.825323462486267,
"learning_rate": 0.0002496008494984546,
"loss": 1.1196,
"step": 36300
},
{
"epoch": 2.070652483076398,
"grad_norm": 2.0049736499786377,
"learning_rate": 0.0002480838879723913,
"loss": 1.1317,
"step": 36400
},
{
"epoch": 2.0763410887991354,
"grad_norm": 1.599846363067627,
"learning_rate": 0.000246566926446328,
"loss": 1.1573,
"step": 36500
},
{
"epoch": 2.082029694521873,
"grad_norm": 1.5434855222702026,
"learning_rate": 0.00024504996492026473,
"loss": 1.1493,
"step": 36600
},
{
"epoch": 2.0877183002446102,
"grad_norm": 1.6306787729263306,
"learning_rate": 0.0002435330033942014,
"loss": 1.1564,
"step": 36700
},
{
"epoch": 2.0934069059673472,
"grad_norm": 1.6914353370666504,
"learning_rate": 0.00024201604186813814,
"loss": 1.1395,
"step": 36800
},
{
"epoch": 2.0990955116900847,
"grad_norm": 1.6444432735443115,
"learning_rate": 0.00024049908034207485,
"loss": 1.1615,
"step": 36900
},
{
"epoch": 2.104784117412822,
"grad_norm": 1.821244239807129,
"learning_rate": 0.00023898211881601155,
"loss": 1.1429,
"step": 37000
},
{
"epoch": 2.1104727231355596,
"grad_norm": 1.6050491333007812,
"learning_rate": 0.00023746515728994823,
"loss": 1.1376,
"step": 37100
},
{
"epoch": 2.116161328858297,
"grad_norm": 1.7375249862670898,
"learning_rate": 0.00023594819576388493,
"loss": 1.1253,
"step": 37200
},
{
"epoch": 2.121849934581034,
"grad_norm": 2.0717177391052246,
"learning_rate": 0.00023443123423782166,
"loss": 1.1527,
"step": 37300
},
{
"epoch": 2.1275385403037714,
"grad_norm": 1.43324875831604,
"learning_rate": 0.00023291427271175837,
"loss": 1.1475,
"step": 37400
},
{
"epoch": 2.133227146026509,
"grad_norm": 1.448669195175171,
"learning_rate": 0.00023139731118569507,
"loss": 1.1133,
"step": 37500
},
{
"epoch": 2.1389157517492463,
"grad_norm": 1.521912932395935,
"learning_rate": 0.00022988034965963178,
"loss": 1.1292,
"step": 37600
},
{
"epoch": 2.1446043574719837,
"grad_norm": 1.6070728302001953,
"learning_rate": 0.00022836338813356845,
"loss": 1.1384,
"step": 37700
},
{
"epoch": 2.1502929631947207,
"grad_norm": 1.3853884935379028,
"learning_rate": 0.00022684642660750516,
"loss": 1.1344,
"step": 37800
},
{
"epoch": 2.155981568917458,
"grad_norm": 1.569415807723999,
"learning_rate": 0.0002253294650814419,
"loss": 1.1572,
"step": 37900
},
{
"epoch": 2.1616701746401956,
"grad_norm": 1.544966220855713,
"learning_rate": 0.0002238125035553786,
"loss": 1.1378,
"step": 38000
},
{
"epoch": 2.167358780362933,
"grad_norm": 1.6090420484542847,
"learning_rate": 0.0002222955420293153,
"loss": 1.1331,
"step": 38100
},
{
"epoch": 2.1730473860856705,
"grad_norm": 1.542605996131897,
"learning_rate": 0.00022077858050325197,
"loss": 1.1302,
"step": 38200
},
{
"epoch": 2.178735991808408,
"grad_norm": 1.744084119796753,
"learning_rate": 0.00021926161897718868,
"loss": 1.1305,
"step": 38300
},
{
"epoch": 2.184424597531145,
"grad_norm": 1.630118489265442,
"learning_rate": 0.0002177446574511254,
"loss": 1.1294,
"step": 38400
},
{
"epoch": 2.1901132032538824,
"grad_norm": 1.6920104026794434,
"learning_rate": 0.0002162276959250621,
"loss": 1.1337,
"step": 38500
},
{
"epoch": 2.19580180897662,
"grad_norm": 1.654189944267273,
"learning_rate": 0.00021471073439899882,
"loss": 1.1182,
"step": 38600
},
{
"epoch": 2.2014904146993572,
"grad_norm": 1.8575996160507202,
"learning_rate": 0.00021319377287293555,
"loss": 1.1261,
"step": 38700
},
{
"epoch": 2.2071790204220947,
"grad_norm": 1.5796535015106201,
"learning_rate": 0.0002116768113468722,
"loss": 1.1389,
"step": 38800
},
{
"epoch": 2.212867626144832,
"grad_norm": 1.6893657445907593,
"learning_rate": 0.00021015984982080893,
"loss": 1.122,
"step": 38900
},
{
"epoch": 2.218556231867569,
"grad_norm": 1.5983092784881592,
"learning_rate": 0.00020864288829474563,
"loss": 1.1487,
"step": 39000
},
{
"epoch": 2.2242448375903066,
"grad_norm": 1.632049798965454,
"learning_rate": 0.00020712592676868234,
"loss": 1.1476,
"step": 39100
},
{
"epoch": 2.229933443313044,
"grad_norm": 2.039854049682617,
"learning_rate": 0.00020562413485787965,
"loss": 1.1443,
"step": 39200
},
{
"epoch": 2.2356220490357814,
"grad_norm": 1.5673627853393555,
"learning_rate": 0.00020410717333181638,
"loss": 1.1259,
"step": 39300
},
{
"epoch": 2.241310654758519,
"grad_norm": 1.6900497674942017,
"learning_rate": 0.00020259021180575308,
"loss": 1.1356,
"step": 39400
},
{
"epoch": 2.246999260481256,
"grad_norm": 1.8306878805160522,
"learning_rate": 0.00020107325027968979,
"loss": 1.1349,
"step": 39500
},
{
"epoch": 2.2526878662039933,
"grad_norm": 1.620490550994873,
"learning_rate": 0.0001995562887536265,
"loss": 1.1417,
"step": 39600
},
{
"epoch": 2.2583764719267307,
"grad_norm": 1.828751802444458,
"learning_rate": 0.0001980393272275632,
"loss": 1.1302,
"step": 39700
},
{
"epoch": 2.264065077649468,
"grad_norm": 1.4963942766189575,
"learning_rate": 0.0001965223657014999,
"loss": 1.152,
"step": 39800
},
{
"epoch": 2.2697536833722056,
"grad_norm": 2.081669807434082,
"learning_rate": 0.0001950054041754366,
"loss": 1.1385,
"step": 39900
},
{
"epoch": 2.2754422890949426,
"grad_norm": 1.6873656511306763,
"learning_rate": 0.0001934884426493733,
"loss": 1.131,
"step": 40000
},
{
"epoch": 2.2754422890949426,
"eval_accuracy": 0.721316,
"eval_loss": 1.1105972528457642,
"eval_runtime": 80.6985,
"eval_samples_per_second": 3097.949,
"eval_steps_per_second": 12.107,
"step": 40000
},
{
"epoch": 2.28113089481768,
"grad_norm": 1.5599457025527954,
"learning_rate": 0.00019197148112331004,
"loss": 1.1525,
"step": 40100
},
{
"epoch": 2.2868195005404175,
"grad_norm": 1.816628098487854,
"learning_rate": 0.00019045451959724672,
"loss": 1.1295,
"step": 40200
},
{
"epoch": 2.292508106263155,
"grad_norm": 1.5481749773025513,
"learning_rate": 0.00018893755807118342,
"loss": 1.124,
"step": 40300
},
{
"epoch": 2.2981967119858924,
"grad_norm": 1.632873296737671,
"learning_rate": 0.00018742059654512015,
"loss": 1.1217,
"step": 40400
},
{
"epoch": 2.3038853177086294,
"grad_norm": 1.4403363466262817,
"learning_rate": 0.00018590363501905683,
"loss": 1.1315,
"step": 40500
},
{
"epoch": 2.309573923431367,
"grad_norm": 1.6744205951690674,
"learning_rate": 0.00018438667349299353,
"loss": 1.1473,
"step": 40600
},
{
"epoch": 2.3152625291541042,
"grad_norm": 1.5021002292633057,
"learning_rate": 0.00018286971196693026,
"loss": 1.1127,
"step": 40700
},
{
"epoch": 2.3209511348768417,
"grad_norm": 1.689931869506836,
"learning_rate": 0.00018135275044086694,
"loss": 1.1394,
"step": 40800
},
{
"epoch": 2.326639740599579,
"grad_norm": 2.1370577812194824,
"learning_rate": 0.00017983578891480367,
"loss": 1.148,
"step": 40900
},
{
"epoch": 2.3323283463223166,
"grad_norm": 1.9048566818237305,
"learning_rate": 0.00017831882738874038,
"loss": 1.1181,
"step": 41000
},
{
"epoch": 2.338016952045054,
"grad_norm": 1.8328748941421509,
"learning_rate": 0.00017680186586267705,
"loss": 1.1302,
"step": 41100
},
{
"epoch": 2.343705557767791,
"grad_norm": 1.7709869146347046,
"learning_rate": 0.0001753000739518744,
"loss": 1.1369,
"step": 41200
},
{
"epoch": 2.3493941634905284,
"grad_norm": 1.6296570301055908,
"learning_rate": 0.00017378311242581112,
"loss": 1.1302,
"step": 41300
},
{
"epoch": 2.355082769213266,
"grad_norm": 1.6044236421585083,
"learning_rate": 0.0001722661508997478,
"loss": 1.1313,
"step": 41400
},
{
"epoch": 2.3607713749360033,
"grad_norm": 1.4571659564971924,
"learning_rate": 0.00017074918937368453,
"loss": 1.1249,
"step": 41500
},
{
"epoch": 2.3664599806587407,
"grad_norm": 1.7237457036972046,
"learning_rate": 0.00016923222784762123,
"loss": 1.1312,
"step": 41600
},
{
"epoch": 2.3721485863814777,
"grad_norm": 1.552881121635437,
"learning_rate": 0.0001677152663215579,
"loss": 1.1282,
"step": 41700
},
{
"epoch": 2.377837192104215,
"grad_norm": 1.6091784238815308,
"learning_rate": 0.00016619830479549464,
"loss": 1.1236,
"step": 41800
},
{
"epoch": 2.3835257978269526,
"grad_norm": 1.8620885610580444,
"learning_rate": 0.00016468134326943134,
"loss": 1.1469,
"step": 41900
},
{
"epoch": 2.38921440354969,
"grad_norm": 1.717551827430725,
"learning_rate": 0.00016316438174336802,
"loss": 1.121,
"step": 42000
},
{
"epoch": 2.3949030092724275,
"grad_norm": 1.6212184429168701,
"learning_rate": 0.00016164742021730475,
"loss": 1.0997,
"step": 42100
},
{
"epoch": 2.4005916149951645,
"grad_norm": 1.3878498077392578,
"learning_rate": 0.00016013045869124146,
"loss": 1.1362,
"step": 42200
},
{
"epoch": 2.406280220717902,
"grad_norm": 1.6336196660995483,
"learning_rate": 0.00015861349716517816,
"loss": 1.1256,
"step": 42300
},
{
"epoch": 2.4119688264406394,
"grad_norm": 1.7155201435089111,
"learning_rate": 0.00015709653563911486,
"loss": 1.1133,
"step": 42400
},
{
"epoch": 2.417657432163377,
"grad_norm": 1.7675564289093018,
"learning_rate": 0.00015557957411305157,
"loss": 1.1416,
"step": 42500
},
{
"epoch": 2.4233460378861142,
"grad_norm": 1.676527976989746,
"learning_rate": 0.00015406261258698827,
"loss": 1.1378,
"step": 42600
},
{
"epoch": 2.4290346436088512,
"grad_norm": 1.6293052434921265,
"learning_rate": 0.00015254565106092498,
"loss": 1.1177,
"step": 42700
},
{
"epoch": 2.4347232493315887,
"grad_norm": 1.5264780521392822,
"learning_rate": 0.00015102868953486168,
"loss": 1.1063,
"step": 42800
},
{
"epoch": 2.440411855054326,
"grad_norm": 1.6453486680984497,
"learning_rate": 0.00014951172800879839,
"loss": 1.1375,
"step": 42900
},
{
"epoch": 2.4461004607770636,
"grad_norm": 1.692336082458496,
"learning_rate": 0.0001479947664827351,
"loss": 1.1004,
"step": 43000
},
{
"epoch": 2.451789066499801,
"grad_norm": 1.868812084197998,
"learning_rate": 0.0001464778049566718,
"loss": 1.1288,
"step": 43100
},
{
"epoch": 2.4574776722225384,
"grad_norm": 1.7713991403579712,
"learning_rate": 0.0001449608434306085,
"loss": 1.1229,
"step": 43200
},
{
"epoch": 2.4631662779452754,
"grad_norm": 1.6394290924072266,
"learning_rate": 0.00014345905151980583,
"loss": 1.0968,
"step": 43300
},
{
"epoch": 2.468854883668013,
"grad_norm": 1.7240723371505737,
"learning_rate": 0.00014194208999374254,
"loss": 1.1151,
"step": 43400
},
{
"epoch": 2.4745434893907503,
"grad_norm": 1.9284464120864868,
"learning_rate": 0.00014042512846767924,
"loss": 1.1302,
"step": 43500
},
{
"epoch": 2.4802320951134877,
"grad_norm": 1.6855792999267578,
"learning_rate": 0.00013890816694161595,
"loss": 1.1163,
"step": 43600
},
{
"epoch": 2.485920700836225,
"grad_norm": 1.8182587623596191,
"learning_rate": 0.00013739120541555265,
"loss": 1.1172,
"step": 43700
},
{
"epoch": 2.4916093065589626,
"grad_norm": 1.5971157550811768,
"learning_rate": 0.00013587424388948935,
"loss": 1.1071,
"step": 43800
},
{
"epoch": 2.4972979122816996,
"grad_norm": 1.7139756679534912,
"learning_rate": 0.00013435728236342606,
"loss": 1.1239,
"step": 43900
},
{
"epoch": 2.502986518004437,
"grad_norm": 1.7199363708496094,
"learning_rate": 0.00013284032083736276,
"loss": 1.1444,
"step": 44000
},
{
"epoch": 2.5086751237271745,
"grad_norm": 1.7295994758605957,
"learning_rate": 0.00013132335931129947,
"loss": 1.122,
"step": 44100
},
{
"epoch": 2.514363729449912,
"grad_norm": 1.9433492422103882,
"learning_rate": 0.00012980639778523617,
"loss": 1.1209,
"step": 44200
},
{
"epoch": 2.5200523351726494,
"grad_norm": 1.5811411142349243,
"learning_rate": 0.0001282894362591729,
"loss": 1.1084,
"step": 44300
},
{
"epoch": 2.5257409408953864,
"grad_norm": 1.5232020616531372,
"learning_rate": 0.00012677247473310958,
"loss": 1.1372,
"step": 44400
},
{
"epoch": 2.531429546618124,
"grad_norm": 2.6212551593780518,
"learning_rate": 0.00012525551320704628,
"loss": 1.1246,
"step": 44500
},
{
"epoch": 2.5371181523408612,
"grad_norm": 1.4962718486785889,
"learning_rate": 0.00012373855168098301,
"loss": 1.1386,
"step": 44600
},
{
"epoch": 2.5428067580635987,
"grad_norm": 1.7713087797164917,
"learning_rate": 0.0001222215901549197,
"loss": 1.1314,
"step": 44700
},
{
"epoch": 2.548495363786336,
"grad_norm": 1.5493218898773193,
"learning_rate": 0.00012070462862885641,
"loss": 1.1204,
"step": 44800
},
{
"epoch": 2.554183969509073,
"grad_norm": 1.6126313209533691,
"learning_rate": 0.00011918766710279313,
"loss": 1.1283,
"step": 44900
},
{
"epoch": 2.5598725752318106,
"grad_norm": 1.5327433347702026,
"learning_rate": 0.00011767070557672982,
"loss": 1.124,
"step": 45000
},
{
"epoch": 2.5598725752318106,
"eval_accuracy": 0.725824,
"eval_loss": 1.0916061401367188,
"eval_runtime": 80.2149,
"eval_samples_per_second": 3116.626,
"eval_steps_per_second": 12.18,
"step": 45000
},
{
"epoch": 2.565561180954548,
"grad_norm": 1.5026576519012451,
"learning_rate": 0.00011615374405066652,
"loss": 1.1197,
"step": 45100
},
{
"epoch": 2.5712497866772854,
"grad_norm": 1.6989002227783203,
"learning_rate": 0.00011463678252460321,
"loss": 1.1247,
"step": 45200
},
{
"epoch": 2.576938392400023,
"grad_norm": 1.5901920795440674,
"learning_rate": 0.00011313499061380057,
"loss": 1.1352,
"step": 45300
},
{
"epoch": 2.58262699812276,
"grad_norm": 1.4382330179214478,
"learning_rate": 0.00011161802908773727,
"loss": 1.1093,
"step": 45400
},
{
"epoch": 2.5883156038454973,
"grad_norm": 1.8520530462265015,
"learning_rate": 0.00011010106756167397,
"loss": 1.1081,
"step": 45500
},
{
"epoch": 2.5940042095682347,
"grad_norm": 1.8772435188293457,
"learning_rate": 0.00010858410603561066,
"loss": 1.1157,
"step": 45600
},
{
"epoch": 2.599692815290972,
"grad_norm": 1.6013365983963013,
"learning_rate": 0.00010706714450954738,
"loss": 1.1463,
"step": 45700
},
{
"epoch": 2.6053814210137096,
"grad_norm": 1.582515835762024,
"learning_rate": 0.0001055501829834841,
"loss": 1.1135,
"step": 45800
},
{
"epoch": 2.611070026736447,
"grad_norm": 1.3782535791397095,
"learning_rate": 0.00010403322145742079,
"loss": 1.1101,
"step": 45900
},
{
"epoch": 2.6167586324591845,
"grad_norm": 1.465584397315979,
"learning_rate": 0.00010251625993135749,
"loss": 1.1354,
"step": 46000
},
{
"epoch": 2.6224472381819215,
"grad_norm": 1.4038536548614502,
"learning_rate": 0.00010099929840529421,
"loss": 1.1078,
"step": 46100
},
{
"epoch": 2.628135843904659,
"grad_norm": 1.9926286935806274,
"learning_rate": 9.948233687923091e-05,
"loss": 1.1044,
"step": 46200
},
{
"epoch": 2.6338244496273964,
"grad_norm": 1.6215740442276,
"learning_rate": 9.796537535316762e-05,
"loss": 1.1188,
"step": 46300
},
{
"epoch": 2.639513055350134,
"grad_norm": 1.5623165369033813,
"learning_rate": 9.644841382710431e-05,
"loss": 1.1155,
"step": 46400
},
{
"epoch": 2.6452016610728712,
"grad_norm": 1.491926670074463,
"learning_rate": 9.493145230104102e-05,
"loss": 1.1211,
"step": 46500
},
{
"epoch": 2.6508902667956082,
"grad_norm": 1.7084381580352783,
"learning_rate": 9.341449077497773e-05,
"loss": 1.1091,
"step": 46600
},
{
"epoch": 2.6565788725183457,
"grad_norm": 1.5060371160507202,
"learning_rate": 9.189752924891443e-05,
"loss": 1.1198,
"step": 46700
},
{
"epoch": 2.662267478241083,
"grad_norm": 1.7321504354476929,
"learning_rate": 9.038056772285112e-05,
"loss": 1.1157,
"step": 46800
},
{
"epoch": 2.6679560839638206,
"grad_norm": 1.559877634048462,
"learning_rate": 8.886360619678784e-05,
"loss": 1.1035,
"step": 46900
},
{
"epoch": 2.673644689686558,
"grad_norm": 1.8588401079177856,
"learning_rate": 8.734664467072455e-05,
"loss": 1.1288,
"step": 47000
},
{
"epoch": 2.679333295409295,
"grad_norm": 1.751246452331543,
"learning_rate": 8.582968314466125e-05,
"loss": 1.1206,
"step": 47100
},
{
"epoch": 2.6850219011320324,
"grad_norm": 1.7309458255767822,
"learning_rate": 8.431272161859795e-05,
"loss": 1.1089,
"step": 47200
},
{
"epoch": 2.69071050685477,
"grad_norm": 1.8057925701141357,
"learning_rate": 8.281092970779529e-05,
"loss": 1.1244,
"step": 47300
},
{
"epoch": 2.6963991125775073,
"grad_norm": 1.7594059705734253,
"learning_rate": 8.1293968181732e-05,
"loss": 1.1188,
"step": 47400
},
{
"epoch": 2.7020877183002447,
"grad_norm": 1.686438798904419,
"learning_rate": 7.97770066556687e-05,
"loss": 1.1068,
"step": 47500
},
{
"epoch": 2.7077763240229817,
"grad_norm": 1.6962246894836426,
"learning_rate": 7.82600451296054e-05,
"loss": 1.1043,
"step": 47600
},
{
"epoch": 2.713464929745719,
"grad_norm": 1.5946807861328125,
"learning_rate": 7.67430836035421e-05,
"loss": 1.1109,
"step": 47700
},
{
"epoch": 2.7191535354684566,
"grad_norm": 1.4834094047546387,
"learning_rate": 7.522612207747881e-05,
"loss": 1.114,
"step": 47800
},
{
"epoch": 2.724842141191194,
"grad_norm": 1.763058066368103,
"learning_rate": 7.370916055141553e-05,
"loss": 1.1091,
"step": 47900
},
{
"epoch": 2.7305307469139315,
"grad_norm": 1.9240601062774658,
"learning_rate": 7.219219902535223e-05,
"loss": 1.0936,
"step": 48000
},
{
"epoch": 2.7362193526366685,
"grad_norm": 1.4768198728561401,
"learning_rate": 7.067523749928892e-05,
"loss": 1.1158,
"step": 48100
},
{
"epoch": 2.7419079583594064,
"grad_norm": 1.9692409038543701,
"learning_rate": 6.915827597322563e-05,
"loss": 1.1201,
"step": 48200
},
{
"epoch": 2.7475965640821434,
"grad_norm": 1.636785864830017,
"learning_rate": 6.764131444716234e-05,
"loss": 1.1092,
"step": 48300
},
{
"epoch": 2.753285169804881,
"grad_norm": 1.5599926710128784,
"learning_rate": 6.612435292109905e-05,
"loss": 1.0932,
"step": 48400
},
{
"epoch": 2.7589737755276182,
"grad_norm": 1.695862054824829,
"learning_rate": 6.460739139503574e-05,
"loss": 1.1227,
"step": 48500
},
{
"epoch": 2.7646623812503557,
"grad_norm": 1.8806819915771484,
"learning_rate": 6.309042986897246e-05,
"loss": 1.1049,
"step": 48600
},
{
"epoch": 2.770350986973093,
"grad_norm": 1.814792513847351,
"learning_rate": 6.157346834290916e-05,
"loss": 1.1149,
"step": 48700
},
{
"epoch": 2.77603959269583,
"grad_norm": 2.068614959716797,
"learning_rate": 6.005650681684586e-05,
"loss": 1.1181,
"step": 48800
},
{
"epoch": 2.7817281984185676,
"grad_norm": 1.5576444864273071,
"learning_rate": 5.853954529078256e-05,
"loss": 1.1223,
"step": 48900
},
{
"epoch": 2.787416804141305,
"grad_norm": 1.8175384998321533,
"learning_rate": 5.7022583764719273e-05,
"loss": 1.1113,
"step": 49000
},
{
"epoch": 2.7931054098640424,
"grad_norm": 1.570915937423706,
"learning_rate": 5.550562223865598e-05,
"loss": 1.1123,
"step": 49100
},
{
"epoch": 2.79879401558678,
"grad_norm": 1.9663364887237549,
"learning_rate": 5.3988660712592675e-05,
"loss": 1.1065,
"step": 49200
},
{
"epoch": 2.804482621309517,
"grad_norm": 2.2906079292297363,
"learning_rate": 5.248686880179001e-05,
"loss": 1.0993,
"step": 49300
},
{
"epoch": 2.8101712270322543,
"grad_norm": 1.566801905632019,
"learning_rate": 5.096990727572673e-05,
"loss": 1.0964,
"step": 49400
},
{
"epoch": 2.8158598327549917,
"grad_norm": 1.7769867181777954,
"learning_rate": 4.9452945749663425e-05,
"loss": 1.0978,
"step": 49500
},
{
"epoch": 2.821548438477729,
"grad_norm": 1.9856287240982056,
"learning_rate": 4.7935984223600136e-05,
"loss": 1.0875,
"step": 49600
},
{
"epoch": 2.8272370442004666,
"grad_norm": 1.7836079597473145,
"learning_rate": 4.6419022697536834e-05,
"loss": 1.1056,
"step": 49700
},
{
"epoch": 2.8329256499232036,
"grad_norm": 1.9246402978897095,
"learning_rate": 4.4902061171473545e-05,
"loss": 1.1074,
"step": 49800
},
{
"epoch": 2.838614255645941,
"grad_norm": 1.3988184928894043,
"learning_rate": 4.338509964541024e-05,
"loss": 1.1206,
"step": 49900
},
{
"epoch": 2.8443028613686785,
"grad_norm": 1.7193849086761475,
"learning_rate": 4.186813811934695e-05,
"loss": 1.1245,
"step": 50000
},
{
"epoch": 2.8443028613686785,
"eval_accuracy": 0.729988,
"eval_loss": 1.0782374143600464,
"eval_runtime": 82.4205,
"eval_samples_per_second": 3033.226,
"eval_steps_per_second": 11.854,
"step": 50000
},
{
"epoch": 2.849991467091416,
"grad_norm": 1.7059062719345093,
"learning_rate": 4.035117659328366e-05,
"loss": 1.1009,
"step": 50100
},
{
"epoch": 2.8556800728141534,
"grad_norm": 1.4554681777954102,
"learning_rate": 3.883421506722036e-05,
"loss": 1.1108,
"step": 50200
},
{
"epoch": 2.8613686785368904,
"grad_norm": 1.7067590951919556,
"learning_rate": 3.7317253541157065e-05,
"loss": 1.0956,
"step": 50300
},
{
"epoch": 2.867057284259628,
"grad_norm": 1.7176940441131592,
"learning_rate": 3.580029201509377e-05,
"loss": 1.0841,
"step": 50400
},
{
"epoch": 2.8727458899823652,
"grad_norm": 1.7251313924789429,
"learning_rate": 3.4283330489030474e-05,
"loss": 1.0908,
"step": 50500
},
{
"epoch": 2.8784344957051027,
"grad_norm": 1.668372631072998,
"learning_rate": 3.276636896296718e-05,
"loss": 1.0954,
"step": 50600
},
{
"epoch": 2.88412310142784,
"grad_norm": 1.889109492301941,
"learning_rate": 3.124940743690388e-05,
"loss": 1.1096,
"step": 50700
},
{
"epoch": 2.8898117071505776,
"grad_norm": 1.509391188621521,
"learning_rate": 2.973244591084059e-05,
"loss": 1.09,
"step": 50800
},
{
"epoch": 2.895500312873315,
"grad_norm": 2.024489402770996,
"learning_rate": 2.821548438477729e-05,
"loss": 1.0993,
"step": 50900
},
{
"epoch": 2.901188918596052,
"grad_norm": 2.007756471633911,
"learning_rate": 2.6698522858713998e-05,
"loss": 1.1029,
"step": 51000
},
{
"epoch": 2.9068775243187894,
"grad_norm": 1.5296841859817505,
"learning_rate": 2.51815613326507e-05,
"loss": 1.095,
"step": 51100
},
{
"epoch": 2.912566130041527,
"grad_norm": 1.6109613180160522,
"learning_rate": 2.3664599806587406e-05,
"loss": 1.1044,
"step": 51200
},
{
"epoch": 2.9182547357642643,
"grad_norm": 1.8067957162857056,
"learning_rate": 2.216280789578474e-05,
"loss": 1.1074,
"step": 51300
},
{
"epoch": 2.9239433414870017,
"grad_norm": 1.6997472047805786,
"learning_rate": 2.064584636972145e-05,
"loss": 1.1036,
"step": 51400
},
{
"epoch": 2.9296319472097387,
"grad_norm": 1.6443182229995728,
"learning_rate": 1.9128884843658153e-05,
"loss": 1.1258,
"step": 51500
},
{
"epoch": 2.935320552932476,
"grad_norm": 1.868161916732788,
"learning_rate": 1.7611923317594857e-05,
"loss": 1.115,
"step": 51600
},
{
"epoch": 2.9410091586552136,
"grad_norm": 1.5620206594467163,
"learning_rate": 1.609496179153156e-05,
"loss": 1.1075,
"step": 51700
},
{
"epoch": 2.946697764377951,
"grad_norm": 1.6326332092285156,
"learning_rate": 1.4578000265468267e-05,
"loss": 1.1113,
"step": 51800
},
{
"epoch": 2.9523863701006885,
"grad_norm": 1.805126428604126,
"learning_rate": 1.3061038739404973e-05,
"loss": 1.0937,
"step": 51900
},
{
"epoch": 2.9580749758234255,
"grad_norm": 1.5693707466125488,
"learning_rate": 1.1544077213341677e-05,
"loss": 1.1053,
"step": 52000
},
{
"epoch": 2.963763581546163,
"grad_norm": 1.4851309061050415,
"learning_rate": 1.0027115687278382e-05,
"loss": 1.1101,
"step": 52100
},
{
"epoch": 2.9694521872689004,
"grad_norm": 1.8946778774261475,
"learning_rate": 8.510154161215086e-06,
"loss": 1.0934,
"step": 52200
},
{
"epoch": 2.975140792991638,
"grad_norm": 1.5624499320983887,
"learning_rate": 6.993192635151791e-06,
"loss": 1.0994,
"step": 52300
},
{
"epoch": 2.9808293987143752,
"grad_norm": 1.5662641525268555,
"learning_rate": 5.476231109088497e-06,
"loss": 1.1058,
"step": 52400
},
{
"epoch": 2.9865180044371122,
"grad_norm": 1.514809250831604,
"learning_rate": 3.959269583025201e-06,
"loss": 1.0967,
"step": 52500
},
{
"epoch": 2.9922066101598497,
"grad_norm": 1.8442556858062744,
"learning_rate": 2.4423080569619053e-06,
"loss": 1.1041,
"step": 52600
},
{
"epoch": 2.997895215882587,
"grad_norm": 1.7445664405822754,
"learning_rate": 9.253465308986101e-07,
"loss": 1.0786,
"step": 52700
},
{
"epoch": 3.0,
"step": 52737,
"total_flos": 1.3116020904e+17,
"train_loss": 1.269093582550002,
"train_runtime": 7252.1306,
"train_samples_per_second": 1861.522,
"train_steps_per_second": 7.272
}
],
"logging_steps": 100,
"max_steps": 52737,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 5000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.3116020904e+17,
"train_batch_size": 256,
"trial_name": null,
"trial_params": null
}