{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999365683476055,
"eval_steps": 500,
"global_step": 3941,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002537266095781795,
"grad_norm": 112.85723727111593,
"learning_rate": 1.2658227848101266e-07,
"loss": 1.8967,
"step": 10
},
{
"epoch": 0.00507453219156359,
"grad_norm": 111.7277255600065,
"learning_rate": 2.5316455696202533e-07,
"loss": 1.8962,
"step": 20
},
{
"epoch": 0.007611798287345386,
"grad_norm": 100.54525732943956,
"learning_rate": 3.79746835443038e-07,
"loss": 1.8773,
"step": 30
},
{
"epoch": 0.01014906438312718,
"grad_norm": 60.43888528431074,
"learning_rate": 5.063291139240507e-07,
"loss": 1.7742,
"step": 40
},
{
"epoch": 0.012686330478908976,
"grad_norm": 28.26912001667364,
"learning_rate": 6.329113924050634e-07,
"loss": 1.7141,
"step": 50
},
{
"epoch": 0.015223596574690771,
"grad_norm": 27.480791814917016,
"learning_rate": 7.59493670886076e-07,
"loss": 1.6293,
"step": 60
},
{
"epoch": 0.017760862670472565,
"grad_norm": 20.239901551582367,
"learning_rate": 8.860759493670887e-07,
"loss": 1.5743,
"step": 70
},
{
"epoch": 0.02029812876625436,
"grad_norm": 11.923875375009446,
"learning_rate": 1.0126582278481013e-06,
"loss": 1.5202,
"step": 80
},
{
"epoch": 0.022835394862036156,
"grad_norm": 6.627037994110648,
"learning_rate": 1.139240506329114e-06,
"loss": 1.4872,
"step": 90
},
{
"epoch": 0.02537266095781795,
"grad_norm": 8.16791147643463,
"learning_rate": 1.2658227848101267e-06,
"loss": 1.4824,
"step": 100
},
{
"epoch": 0.027909927053599747,
"grad_norm": 9.357053020719055,
"learning_rate": 1.3924050632911392e-06,
"loss": 1.4456,
"step": 110
},
{
"epoch": 0.030447193149381543,
"grad_norm": 8.946206463663723,
"learning_rate": 1.518987341772152e-06,
"loss": 1.4253,
"step": 120
},
{
"epoch": 0.032984459245163335,
"grad_norm": 6.684895053435472,
"learning_rate": 1.6455696202531647e-06,
"loss": 1.4119,
"step": 130
},
{
"epoch": 0.03552172534094513,
"grad_norm": 5.858068093439728,
"learning_rate": 1.7721518987341774e-06,
"loss": 1.3944,
"step": 140
},
{
"epoch": 0.038058991436726926,
"grad_norm": 10.736099643334308,
"learning_rate": 1.8987341772151901e-06,
"loss": 1.3937,
"step": 150
},
{
"epoch": 0.04059625753250872,
"grad_norm": 5.52407932247684,
"learning_rate": 2.0253164556962026e-06,
"loss": 1.3624,
"step": 160
},
{
"epoch": 0.04313352362829052,
"grad_norm": 8.107714144715326,
"learning_rate": 2.1518987341772153e-06,
"loss": 1.3854,
"step": 170
},
{
"epoch": 0.04567078972407231,
"grad_norm": 5.726773653383632,
"learning_rate": 2.278481012658228e-06,
"loss": 1.3745,
"step": 180
},
{
"epoch": 0.04820805581985411,
"grad_norm": 5.421447552101552,
"learning_rate": 2.4050632911392408e-06,
"loss": 1.3616,
"step": 190
},
{
"epoch": 0.0507453219156359,
"grad_norm": 5.519589755451587,
"learning_rate": 2.5316455696202535e-06,
"loss": 1.3802,
"step": 200
},
{
"epoch": 0.0532825880114177,
"grad_norm": 4.997598050858228,
"learning_rate": 2.6582278481012658e-06,
"loss": 1.3501,
"step": 210
},
{
"epoch": 0.055819854107199494,
"grad_norm": 7.739443027895296,
"learning_rate": 2.7848101265822785e-06,
"loss": 1.3529,
"step": 220
},
{
"epoch": 0.05835712020298129,
"grad_norm": 14.153932111981923,
"learning_rate": 2.9113924050632912e-06,
"loss": 1.3425,
"step": 230
},
{
"epoch": 0.060894386298763085,
"grad_norm": 6.821474472121072,
"learning_rate": 3.037974683544304e-06,
"loss": 1.3268,
"step": 240
},
{
"epoch": 0.06343165239454487,
"grad_norm": 7.105335682421715,
"learning_rate": 3.164556962025317e-06,
"loss": 1.3391,
"step": 250
},
{
"epoch": 0.06596891849032667,
"grad_norm": 9.157922661641283,
"learning_rate": 3.2911392405063294e-06,
"loss": 1.331,
"step": 260
},
{
"epoch": 0.06850618458610847,
"grad_norm": 8.558636103035896,
"learning_rate": 3.417721518987342e-06,
"loss": 1.3318,
"step": 270
},
{
"epoch": 0.07104345068189026,
"grad_norm": 6.250103925808961,
"learning_rate": 3.544303797468355e-06,
"loss": 1.3356,
"step": 280
},
{
"epoch": 0.07358071677767206,
"grad_norm": 6.294667609868247,
"learning_rate": 3.6708860759493675e-06,
"loss": 1.32,
"step": 290
},
{
"epoch": 0.07611798287345385,
"grad_norm": 6.6361456210836725,
"learning_rate": 3.7974683544303802e-06,
"loss": 1.3224,
"step": 300
},
{
"epoch": 0.07865524896923565,
"grad_norm": 8.289069596597512,
"learning_rate": 3.924050632911393e-06,
"loss": 1.3176,
"step": 310
},
{
"epoch": 0.08119251506501744,
"grad_norm": 5.542830103144929,
"learning_rate": 4.050632911392405e-06,
"loss": 1.2962,
"step": 320
},
{
"epoch": 0.08372978116079924,
"grad_norm": 10.531069717984824,
"learning_rate": 4.177215189873418e-06,
"loss": 1.3005,
"step": 330
},
{
"epoch": 0.08626704725658103,
"grad_norm": 17.37898460345658,
"learning_rate": 4.303797468354431e-06,
"loss": 1.3031,
"step": 340
},
{
"epoch": 0.08880431335236283,
"grad_norm": 9.252485190691019,
"learning_rate": 4.430379746835443e-06,
"loss": 1.3107,
"step": 350
},
{
"epoch": 0.09134157944814462,
"grad_norm": 7.525737017146553,
"learning_rate": 4.556962025316456e-06,
"loss": 1.2886,
"step": 360
},
{
"epoch": 0.09387884554392642,
"grad_norm": 10.083996939013279,
"learning_rate": 4.683544303797468e-06,
"loss": 1.2954,
"step": 370
},
{
"epoch": 0.09641611163970822,
"grad_norm": 18.864544413704667,
"learning_rate": 4.8101265822784815e-06,
"loss": 1.2959,
"step": 380
},
{
"epoch": 0.09895337773549001,
"grad_norm": 15.18986427449958,
"learning_rate": 4.936708860759495e-06,
"loss": 1.2918,
"step": 390
},
{
"epoch": 0.1014906438312718,
"grad_norm": 8.154706578846366,
"learning_rate": 4.999975471465892e-06,
"loss": 1.2952,
"step": 400
},
{
"epoch": 0.1040279099270536,
"grad_norm": 8.626125448906336,
"learning_rate": 4.999779246080933e-06,
"loss": 1.2812,
"step": 410
},
{
"epoch": 0.1065651760228354,
"grad_norm": 14.880568433586623,
"learning_rate": 4.999386810712926e-06,
"loss": 1.2532,
"step": 420
},
{
"epoch": 0.1091024421186172,
"grad_norm": 10.892903720339616,
"learning_rate": 4.9987981961644855e-06,
"loss": 1.2982,
"step": 430
},
{
"epoch": 0.11163970821439899,
"grad_norm": 4.797131746271338,
"learning_rate": 4.998013448636512e-06,
"loss": 1.271,
"step": 440
},
{
"epoch": 0.11417697431018078,
"grad_norm": 18.557197921826557,
"learning_rate": 4.997032629724564e-06,
"loss": 1.2825,
"step": 450
},
{
"epoch": 0.11671424040596258,
"grad_norm": 15.001794851401208,
"learning_rate": 4.995855816414024e-06,
"loss": 1.2765,
"step": 460
},
{
"epoch": 0.11925150650174438,
"grad_norm": 10.965425508561747,
"learning_rate": 4.9944831010740576e-06,
"loss": 1.2585,
"step": 470
},
{
"epoch": 0.12178877259752617,
"grad_norm": 6.63862480684244,
"learning_rate": 4.992914591450358e-06,
"loss": 1.2929,
"step": 480
},
{
"epoch": 0.12432603869330797,
"grad_norm": 6.984972641361711,
"learning_rate": 4.991150410656697e-06,
"loss": 1.2733,
"step": 490
},
{
"epoch": 0.12686330478908975,
"grad_norm": 18.903688844829993,
"learning_rate": 4.9891906971652545e-06,
"loss": 1.2558,
"step": 500
},
{
"epoch": 0.12940057088487156,
"grad_norm": 7.177194302737898,
"learning_rate": 4.987035604795753e-06,
"loss": 1.2659,
"step": 510
},
{
"epoch": 0.13193783698065334,
"grad_norm": 10.981192386536817,
"learning_rate": 4.984685302703385e-06,
"loss": 1.2606,
"step": 520
},
{
"epoch": 0.13447510307643515,
"grad_norm": 9.47429133139194,
"learning_rate": 4.982139975365533e-06,
"loss": 1.2785,
"step": 530
},
{
"epoch": 0.13701236917221693,
"grad_norm": 17.731391325371494,
"learning_rate": 4.979399822567292e-06,
"loss": 1.2709,
"step": 540
},
{
"epoch": 0.13954963526799874,
"grad_norm": 13.256927828543148,
"learning_rate": 4.976465059385788e-06,
"loss": 1.248,
"step": 550
},
{
"epoch": 0.14208690136378052,
"grad_norm": 9.285697826647898,
"learning_rate": 4.973335916173294e-06,
"loss": 1.2462,
"step": 560
},
{
"epoch": 0.14462416745956233,
"grad_norm": 10.853767808913329,
"learning_rate": 4.970012638539152e-06,
"loss": 1.2533,
"step": 570
},
{
"epoch": 0.1471614335553441,
"grad_norm": 16.143378806845213,
"learning_rate": 4.966495487330496e-06,
"loss": 1.2526,
"step": 580
},
{
"epoch": 0.14969869965112592,
"grad_norm": 46.2688542748775,
"learning_rate": 4.962784738611774e-06,
"loss": 1.265,
"step": 590
},
{
"epoch": 0.1522359657469077,
"grad_norm": 6.132364126798611,
"learning_rate": 4.958880683643082e-06,
"loss": 1.2733,
"step": 600
},
{
"epoch": 0.1547732318426895,
"grad_norm": 7.964503594558826,
"learning_rate": 4.954783628857302e-06,
"loss": 1.2626,
"step": 610
},
{
"epoch": 0.1573104979384713,
"grad_norm": 13.68977454870837,
"learning_rate": 4.95049389583605e-06,
"loss": 1.2657,
"step": 620
},
{
"epoch": 0.1598477640342531,
"grad_norm": 14.540626560719817,
"learning_rate": 4.9460118212844355e-06,
"loss": 1.2372,
"step": 630
},
{
"epoch": 0.16238503013003489,
"grad_norm": 17.246204651829867,
"learning_rate": 4.941337757004631e-06,
"loss": 1.2355,
"step": 640
},
{
"epoch": 0.1649222962258167,
"grad_norm": 5.660216888492281,
"learning_rate": 4.936472069868262e-06,
"loss": 1.2439,
"step": 650
},
{
"epoch": 0.16745956232159848,
"grad_norm": 32.30203917124339,
"learning_rate": 4.931415141787607e-06,
"loss": 1.2384,
"step": 660
},
{
"epoch": 0.16999682841738029,
"grad_norm": 9.519199679528043,
"learning_rate": 4.926167369685626e-06,
"loss": 1.2452,
"step": 670
},
{
"epoch": 0.17253409451316207,
"grad_norm": 9.559300124244501,
"learning_rate": 4.920729165464799e-06,
"loss": 1.2564,
"step": 680
},
{
"epoch": 0.17507136060894388,
"grad_norm": 7.099728750507589,
"learning_rate": 4.915100955974802e-06,
"loss": 1.2695,
"step": 690
},
{
"epoch": 0.17760862670472566,
"grad_norm": 8.197172932423973,
"learning_rate": 4.909283182978998e-06,
"loss": 1.2379,
"step": 700
},
{
"epoch": 0.18014589280050744,
"grad_norm": 9.726988142249633,
"learning_rate": 4.903276303119765e-06,
"loss": 1.2483,
"step": 710
},
{
"epoch": 0.18268315889628925,
"grad_norm": 5.550536710160647,
"learning_rate": 4.897080787882656e-06,
"loss": 1.2493,
"step": 720
},
{
"epoch": 0.18522042499207103,
"grad_norm": 15.553779441262579,
"learning_rate": 4.890697123559385e-06,
"loss": 1.2635,
"step": 730
},
{
"epoch": 0.18775769108785284,
"grad_norm": 7.070319331844888,
"learning_rate": 4.884125811209665e-06,
"loss": 1.2439,
"step": 740
},
{
"epoch": 0.19029495718363462,
"grad_norm": 9.252450309844942,
"learning_rate": 4.877367366621874e-06,
"loss": 1.2423,
"step": 750
},
{
"epoch": 0.19283222327941643,
"grad_norm": 8.297702205658675,
"learning_rate": 4.870422320272576e-06,
"loss": 1.2322,
"step": 760
},
{
"epoch": 0.1953694893751982,
"grad_norm": 14.013643448769688,
"learning_rate": 4.863291217284872e-06,
"loss": 1.2354,
"step": 770
},
{
"epoch": 0.19790675547098002,
"grad_norm": 8.039135652565351,
"learning_rate": 4.855974617385629e-06,
"loss": 1.2257,
"step": 780
},
{
"epoch": 0.2004440215667618,
"grad_norm": 6.465541475557579,
"learning_rate": 4.8484730948615336e-06,
"loss": 1.2477,
"step": 790
},
{
"epoch": 0.2029812876625436,
"grad_norm": 38.736482675881,
"learning_rate": 4.840787238514019e-06,
"loss": 1.242,
"step": 800
},
{
"epoch": 0.2055185537583254,
"grad_norm": 6.2027098825049745,
"learning_rate": 4.832917651613055e-06,
"loss": 1.2481,
"step": 810
},
{
"epoch": 0.2080558198541072,
"grad_norm": 7.699764524449651,
"learning_rate": 4.824864951849787e-06,
"loss": 1.2422,
"step": 820
},
{
"epoch": 0.210593085949889,
"grad_norm": 7.045914854236814,
"learning_rate": 4.8166297712880635e-06,
"loss": 1.2296,
"step": 830
},
{
"epoch": 0.2131303520456708,
"grad_norm": 6.848231755692189,
"learning_rate": 4.808212756314815e-06,
"loss": 1.2185,
"step": 840
},
{
"epoch": 0.21566761814145258,
"grad_norm": 7.591181911551738,
"learning_rate": 4.7996145675893255e-06,
"loss": 1.2348,
"step": 850
},
{
"epoch": 0.2182048842372344,
"grad_norm": 13.992556734602806,
"learning_rate": 4.7908358799913735e-06,
"loss": 1.259,
"step": 860
},
{
"epoch": 0.22074215033301617,
"grad_norm": 13.882288651265467,
"learning_rate": 4.781877382568261e-06,
"loss": 1.2305,
"step": 870
},
{
"epoch": 0.22327941642879798,
"grad_norm": 8.53486541599463,
"learning_rate": 4.772739778480729e-06,
"loss": 1.2343,
"step": 880
},
{
"epoch": 0.22581668252457976,
"grad_norm": 9.455012927215373,
"learning_rate": 4.7634237849477645e-06,
"loss": 1.2194,
"step": 890
},
{
"epoch": 0.22835394862036157,
"grad_norm": 13.790102358483903,
"learning_rate": 4.7539301331903125e-06,
"loss": 1.2267,
"step": 900
},
{
"epoch": 0.23089121471614335,
"grad_norm": 14.2481962464266,
"learning_rate": 4.7442595683738705e-06,
"loss": 1.2132,
"step": 910
},
{
"epoch": 0.23342848081192516,
"grad_norm": 14.416373746309988,
"learning_rate": 4.734412849550007e-06,
"loss": 1.2094,
"step": 920
},
{
"epoch": 0.23596574690770694,
"grad_norm": 9.539618032350603,
"learning_rate": 4.7243907495967815e-06,
"loss": 1.2294,
"step": 930
},
{
"epoch": 0.23850301300348875,
"grad_norm": 5.29440353230921,
"learning_rate": 4.7141940551580824e-06,
"loss": 1.2208,
"step": 940
},
{
"epoch": 0.24104027909927053,
"grad_norm": 12.765581152374985,
"learning_rate": 4.703823566581877e-06,
"loss": 1.2324,
"step": 950
},
{
"epoch": 0.24357754519505234,
"grad_norm": 8.58495407107471,
"learning_rate": 4.693280097857398e-06,
"loss": 1.2222,
"step": 960
},
{
"epoch": 0.24611481129083412,
"grad_norm": 9.562572836922989,
"learning_rate": 4.6825644765512475e-06,
"loss": 1.2185,
"step": 970
},
{
"epoch": 0.24865207738661593,
"grad_norm": 9.23186285176845,
"learning_rate": 4.6716775437424465e-06,
"loss": 1.2192,
"step": 980
},
{
"epoch": 0.25118934348239774,
"grad_norm": 5.073620743419921,
"learning_rate": 4.660620153956409e-06,
"loss": 1.2241,
"step": 990
},
{
"epoch": 0.2537266095781795,
"grad_norm": 8.006469237916,
"learning_rate": 4.649393175097879e-06,
"loss": 1.2281,
"step": 1000
},
{
"epoch": 0.2562638756739613,
"grad_norm": 8.265727241877757,
"learning_rate": 4.637997488382801e-06,
"loss": 1.2286,
"step": 1010
},
{
"epoch": 0.2588011417697431,
"grad_norm": 6.416174543416175,
"learning_rate": 4.626433988269156e-06,
"loss": 1.2217,
"step": 1020
},
{
"epoch": 0.2613384078655249,
"grad_norm": 7.028656405225393,
"learning_rate": 4.614703582386755e-06,
"loss": 1.2181,
"step": 1030
},
{
"epoch": 0.2638756739613067,
"grad_norm": 5.865759830273811,
"learning_rate": 4.602807191465993e-06,
"loss": 1.2382,
"step": 1040
},
{
"epoch": 0.2664129400570885,
"grad_norm": 13.872254227369961,
"learning_rate": 4.5907457492655895e-06,
"loss": 1.2359,
"step": 1050
},
{
"epoch": 0.2689502061528703,
"grad_norm": 13.664329240119352,
"learning_rate": 4.578520202499286e-06,
"loss": 1.2154,
"step": 1060
},
{
"epoch": 0.27148747224865205,
"grad_norm": 14.099557581365481,
"learning_rate": 4.566131510761548e-06,
"loss": 1.2279,
"step": 1070
},
{
"epoch": 0.27402473834443386,
"grad_norm": 9.810709350520776,
"learning_rate": 4.553580646452238e-06,
"loss": 1.2276,
"step": 1080
},
{
"epoch": 0.27656200444021567,
"grad_norm": 6.042192876271809,
"learning_rate": 4.5408685947002915e-06,
"loss": 1.2159,
"step": 1090
},
{
"epoch": 0.2790992705359975,
"grad_norm": 7.672603258347116,
"learning_rate": 4.5279963532864e-06,
"loss": 1.209,
"step": 1100
},
{
"epoch": 0.28163653663177923,
"grad_norm": 7.981333790512745,
"learning_rate": 4.5149649325646875e-06,
"loss": 1.2227,
"step": 1110
},
{
"epoch": 0.28417380272756104,
"grad_norm": 7.786264862698563,
"learning_rate": 4.501775355383406e-06,
"loss": 1.2299,
"step": 1120
},
{
"epoch": 0.28671106882334285,
"grad_norm": 23.110853155380898,
"learning_rate": 4.48842865700466e-06,
"loss": 1.1972,
"step": 1130
},
{
"epoch": 0.28924833491912466,
"grad_norm": 13.48010845409501,
"learning_rate": 4.474925885023136e-06,
"loss": 1.2043,
"step": 1140
},
{
"epoch": 0.2917856010149064,
"grad_norm": 9.52808074720464,
"learning_rate": 4.461268099283886e-06,
"loss": 1.2107,
"step": 1150
},
{
"epoch": 0.2943228671106882,
"grad_norm": 7.355296672812608,
"learning_rate": 4.4474563717991345e-06,
"loss": 1.2014,
"step": 1160
},
{
"epoch": 0.29686013320647003,
"grad_norm": 6.320569333174802,
"learning_rate": 4.433491786664134e-06,
"loss": 1.2068,
"step": 1170
},
{
"epoch": 0.29939739930225184,
"grad_norm": 10.030481328592858,
"learning_rate": 4.419375439972075e-06,
"loss": 1.2276,
"step": 1180
},
{
"epoch": 0.3019346653980336,
"grad_norm": 6.409643921761458,
"learning_rate": 4.405108439728057e-06,
"loss": 1.217,
"step": 1190
},
{
"epoch": 0.3044719314938154,
"grad_norm": 8.589272885921144,
"learning_rate": 4.390691905762111e-06,
"loss": 1.2141,
"step": 1200
},
{
"epoch": 0.3070091975895972,
"grad_norm": 8.928374282259929,
"learning_rate": 4.376126969641311e-06,
"loss": 1.2067,
"step": 1210
},
{
"epoch": 0.309546463685379,
"grad_norm": 6.743646650661921,
"learning_rate": 4.361414774580952e-06,
"loss": 1.2126,
"step": 1220
},
{
"epoch": 0.3120837297811608,
"grad_norm": 11.981450710885838,
"learning_rate": 4.34655647535482e-06,
"loss": 1.1928,
"step": 1230
},
{
"epoch": 0.3146209958769426,
"grad_norm": 9.30568404144152,
"learning_rate": 4.3315532382045535e-06,
"loss": 1.2233,
"step": 1240
},
{
"epoch": 0.3171582619727244,
"grad_norm": 9.124358676503277,
"learning_rate": 4.3164062407480974e-06,
"loss": 1.2208,
"step": 1250
},
{
"epoch": 0.3196955280685062,
"grad_norm": 9.471883506779196,
"learning_rate": 4.301116671887281e-06,
"loss": 1.2009,
"step": 1260
},
{
"epoch": 0.32223279416428796,
"grad_norm": 6.941617674454578,
"learning_rate": 4.285685731714493e-06,
"loss": 1.2188,
"step": 1270
},
{
"epoch": 0.32477006026006977,
"grad_norm": 12.5252338418682,
"learning_rate": 4.270114631418487e-06,
"loss": 1.1947,
"step": 1280
},
{
"epoch": 0.3273073263558516,
"grad_norm": 11.108780709122486,
"learning_rate": 4.254404593189316e-06,
"loss": 1.2063,
"step": 1290
},
{
"epoch": 0.3298445924516334,
"grad_norm": 8.555766487500795,
"learning_rate": 4.238556850122394e-06,
"loss": 1.1988,
"step": 1300
},
{
"epoch": 0.33238185854741514,
"grad_norm": 5.929688704429909,
"learning_rate": 4.222572646121723e-06,
"loss": 1.2084,
"step": 1310
},
{
"epoch": 0.33491912464319695,
"grad_norm": 7.403468884662153,
"learning_rate": 4.2064532358022446e-06,
"loss": 1.209,
"step": 1320
},
{
"epoch": 0.33745639073897876,
"grad_norm": 11.518020383952775,
"learning_rate": 4.190199884391371e-06,
"loss": 1.2069,
"step": 1330
},
{
"epoch": 0.33999365683476057,
"grad_norm": 8.809667396360213,
"learning_rate": 4.173813867629672e-06,
"loss": 1.2291,
"step": 1340
},
{
"epoch": 0.3425309229305423,
"grad_norm": 10.783849201384296,
"learning_rate": 4.157296471670747e-06,
"loss": 1.1787,
"step": 1350
},
{
"epoch": 0.34506818902632413,
"grad_norm": 5.923243263259089,
"learning_rate": 4.140648992980269e-06,
"loss": 1.1972,
"step": 1360
},
{
"epoch": 0.34760545512210594,
"grad_norm": 7.577491053933149,
"learning_rate": 4.1238727382342245e-06,
"loss": 1.2193,
"step": 1370
},
{
"epoch": 0.35014272121788775,
"grad_norm": 15.766504679065461,
"learning_rate": 4.106969024216348e-06,
"loss": 1.1896,
"step": 1380
},
{
"epoch": 0.3526799873136695,
"grad_norm": 5.128475553118625,
"learning_rate": 4.089939177714778e-06,
"loss": 1.1916,
"step": 1390
},
{
"epoch": 0.3552172534094513,
"grad_norm": 31.663873389243054,
"learning_rate": 4.0727845354178995e-06,
"loss": 1.2037,
"step": 1400
},
{
"epoch": 0.3577545195052331,
"grad_norm": 5.0548828855937975,
"learning_rate": 4.055506443809441e-06,
"loss": 1.2243,
"step": 1410
},
{
"epoch": 0.3602917856010149,
"grad_norm": 6.115218417588514,
"learning_rate": 4.038106259062778e-06,
"loss": 1.1927,
"step": 1420
},
{
"epoch": 0.3628290516967967,
"grad_norm": 12.060043038462219,
"learning_rate": 4.020585346934493e-06,
"loss": 1.2077,
"step": 1430
},
{
"epoch": 0.3653663177925785,
"grad_norm": 5.116355293016737,
"learning_rate": 4.002945082657167e-06,
"loss": 1.2,
"step": 1440
},
{
"epoch": 0.3679035838883603,
"grad_norm": 7.955675984619225,
"learning_rate": 3.985186850831446e-06,
"loss": 1.1917,
"step": 1450
},
{
"epoch": 0.37044084998414206,
"grad_norm": 7.18463544096605,
"learning_rate": 3.967312045317357e-06,
"loss": 1.1917,
"step": 1460
},
{
"epoch": 0.37297811607992387,
"grad_norm": 13.238317735595116,
"learning_rate": 3.9493220691249e-06,
"loss": 1.207,
"step": 1470
},
{
"epoch": 0.3755153821757057,
"grad_norm": 17.592657693606654,
"learning_rate": 3.931218334303933e-06,
"loss": 1.1963,
"step": 1480
},
{
"epoch": 0.3780526482714875,
"grad_norm": 13.71018630119955,
"learning_rate": 3.913002261833331e-06,
"loss": 1.201,
"step": 1490
},
{
"epoch": 0.38058991436726924,
"grad_norm": 4.264076010568766,
"learning_rate": 3.894675281509455e-06,
"loss": 1.2088,
"step": 1500
},
{
"epoch": 0.38312718046305105,
"grad_norm": 7.007776785175352,
"learning_rate": 3.876238831833927e-06,
"loss": 1.216,
"step": 1510
},
{
"epoch": 0.38566444655883286,
"grad_norm": 6.561717211592676,
"learning_rate": 3.857694359900719e-06,
"loss": 1.211,
"step": 1520
},
{
"epoch": 0.3882017126546147,
"grad_norm": 9.351379080752213,
"learning_rate": 3.83904332128257e-06,
"loss": 1.173,
"step": 1530
},
{
"epoch": 0.3907389787503964,
"grad_norm": 5.389365116394579,
"learning_rate": 3.820287179916736e-06,
"loss": 1.1883,
"step": 1540
},
{
"epoch": 0.39327624484617824,
"grad_norm": 4.843914279235025,
"learning_rate": 3.8014274079900842e-06,
"loss": 1.2002,
"step": 1550
},
{
"epoch": 0.39581351094196005,
"grad_norm": 8.271741525384778,
"learning_rate": 3.7824654858235433e-06,
"loss": 1.2019,
"step": 1560
},
{
"epoch": 0.39835077703774185,
"grad_norm": 16.214867643854607,
"learning_rate": 3.763402901755905e-06,
"loss": 1.1936,
"step": 1570
},
{
"epoch": 0.4008880431335236,
"grad_norm": 6.671379474352313,
"learning_rate": 3.7442411520270096e-06,
"loss": 1.1872,
"step": 1580
},
{
"epoch": 0.4034253092293054,
"grad_norm": 8.707941769488675,
"learning_rate": 3.7249817406602996e-06,
"loss": 1.197,
"step": 1590
},
{
"epoch": 0.4059625753250872,
"grad_norm": 6.5621327717722995,
"learning_rate": 3.7056261793447707e-06,
"loss": 1.1983,
"step": 1600
},
{
"epoch": 0.40849984142086904,
"grad_norm": 7.068156090864818,
"learning_rate": 3.686175987316317e-06,
"loss": 1.1853,
"step": 1610
},
{
"epoch": 0.4110371075166508,
"grad_norm": 7.890900003065767,
"learning_rate": 3.6666326912384854e-06,
"loss": 1.1925,
"step": 1620
},
{
"epoch": 0.4135743736124326,
"grad_norm": 6.7232162942204665,
"learning_rate": 3.6469978250826433e-06,
"loss": 1.1954,
"step": 1630
},
{
"epoch": 0.4161116397082144,
"grad_norm": 10.010345842937971,
"learning_rate": 3.6272729300075808e-06,
"loss": 1.209,
"step": 1640
},
{
"epoch": 0.4186489058039962,
"grad_norm": 5.740167290944564,
"learning_rate": 3.6074595542385387e-06,
"loss": 1.185,
"step": 1650
},
{
"epoch": 0.421186171899778,
"grad_norm": 7.619213938711831,
"learning_rate": 3.5875592529456926e-06,
"loss": 1.2068,
"step": 1660
},
{
"epoch": 0.4237234379955598,
"grad_norm": 11.256221468519641,
"learning_rate": 3.567573588122079e-06,
"loss": 1.1777,
"step": 1670
},
{
"epoch": 0.4262607040913416,
"grad_norm": 5.7316159588312106,
"learning_rate": 3.5475041284609977e-06,
"loss": 1.1843,
"step": 1680
},
{
"epoch": 0.4287979701871234,
"grad_norm": 5.682433911486404,
"learning_rate": 3.527352449232886e-06,
"loss": 1.1777,
"step": 1690
},
{
"epoch": 0.43133523628290515,
"grad_norm": 6.786252039469139,
"learning_rate": 3.5071201321616673e-06,
"loss": 1.1888,
"step": 1700
},
{
"epoch": 0.43387250237868696,
"grad_norm": 5.407930902552358,
"learning_rate": 3.4868087653006045e-06,
"loss": 1.1901,
"step": 1710
},
{
"epoch": 0.4364097684744688,
"grad_norm": 6.399699747010517,
"learning_rate": 3.466419942907652e-06,
"loss": 1.2107,
"step": 1720
},
{
"epoch": 0.4389470345702506,
"grad_norm": 7.825821295728962,
"learning_rate": 3.445955265320321e-06,
"loss": 1.1918,
"step": 1730
},
{
"epoch": 0.44148430066603234,
"grad_norm": 9.488411335781475,
"learning_rate": 3.425416338830067e-06,
"loss": 1.189,
"step": 1740
},
{
"epoch": 0.44402156676181415,
"grad_norm": 5.88274413352344,
"learning_rate": 3.4048047755562093e-06,
"loss": 1.183,
"step": 1750
},
{
"epoch": 0.44655883285759596,
"grad_norm": 5.6831092141225055,
"learning_rate": 3.3841221933193965e-06,
"loss": 1.1882,
"step": 1760
},
{
"epoch": 0.4490960989533777,
"grad_norm": 19.983254773097197,
"learning_rate": 3.3633702155146216e-06,
"loss": 1.1911,
"step": 1770
},
{
"epoch": 0.4516333650491595,
"grad_norm": 12.549540089577652,
"learning_rate": 3.342550470983798e-06,
"loss": 1.1811,
"step": 1780
},
{
"epoch": 0.45417063114494133,
"grad_norm": 5.976267812124573,
"learning_rate": 3.3216645938879134e-06,
"loss": 1.1865,
"step": 1790
},
{
"epoch": 0.45670789724072314,
"grad_norm": 9.218000333375254,
"learning_rate": 3.3007142235787624e-06,
"loss": 1.1842,
"step": 1800
},
{
"epoch": 0.4592451633365049,
"grad_norm": 6.320981630879619,
"learning_rate": 3.2797010044702697e-06,
"loss": 1.1776,
"step": 1810
},
{
"epoch": 0.4617824294322867,
"grad_norm": 6.190899234462718,
"learning_rate": 3.258626585909422e-06,
"loss": 1.1917,
"step": 1820
},
{
"epoch": 0.4643196955280685,
"grad_norm": 7.909431608694366,
"learning_rate": 3.2374926220468067e-06,
"loss": 1.1789,
"step": 1830
},
{
"epoch": 0.4668569616238503,
"grad_norm": 5.69897203883322,
"learning_rate": 3.216300771706776e-06,
"loss": 1.1733,
"step": 1840
},
{
"epoch": 0.4693942277196321,
"grad_norm": 20.816445025574357,
"learning_rate": 3.1950526982572484e-06,
"loss": 1.1712,
"step": 1850
},
{
"epoch": 0.4719314938154139,
"grad_norm": 11.982164138237327,
"learning_rate": 3.1737500694791424e-06,
"loss": 1.1815,
"step": 1860
},
{
"epoch": 0.4744687599111957,
"grad_norm": 24.74208511970109,
"learning_rate": 3.1523945574354763e-06,
"loss": 1.1863,
"step": 1870
},
{
"epoch": 0.4770060260069775,
"grad_norm": 7.842936948875007,
"learning_rate": 3.130987838340126e-06,
"loss": 1.1834,
"step": 1880
},
{
"epoch": 0.47954329210275926,
"grad_norm": 6.4021446109798505,
"learning_rate": 3.1095315924262544e-06,
"loss": 1.184,
"step": 1890
},
{
"epoch": 0.48208055819854106,
"grad_norm": 11.634701038383552,
"learning_rate": 3.0880275038144296e-06,
"loss": 1.1748,
"step": 1900
},
{
"epoch": 0.4846178242943229,
"grad_norm": 13.567790113056438,
"learning_rate": 3.066477260380441e-06,
"loss": 1.1647,
"step": 1910
},
{
"epoch": 0.4871550903901047,
"grad_norm": 8.85124389947068,
"learning_rate": 3.044882553622808e-06,
"loss": 1.1906,
"step": 1920
},
{
"epoch": 0.48969235648588644,
"grad_norm": 4.941061338275534,
"learning_rate": 3.0232450785300207e-06,
"loss": 1.1653,
"step": 1930
},
{
"epoch": 0.49222962258166825,
"grad_norm": 6.532283199565601,
"learning_rate": 3.0015665334474937e-06,
"loss": 1.1869,
"step": 1940
},
{
"epoch": 0.49476688867745006,
"grad_norm": 9.662609662864591,
"learning_rate": 2.9798486199442627e-06,
"loss": 1.1707,
"step": 1950
},
{
"epoch": 0.49730415477323187,
"grad_norm": 7.538232506265863,
"learning_rate": 2.958093042679429e-06,
"loss": 1.192,
"step": 1960
},
{
"epoch": 0.4998414208690136,
"grad_norm": 72.64241187548325,
"learning_rate": 2.9363015092683566e-06,
"loss": 1.1762,
"step": 1970
},
{
"epoch": 0.5023786869647955,
"grad_norm": 9.900383081254226,
"learning_rate": 2.9144757301486387e-06,
"loss": 1.169,
"step": 1980
},
{
"epoch": 0.5049159530605772,
"grad_norm": 6.366667290767634,
"learning_rate": 2.8926174184458484e-06,
"loss": 1.174,
"step": 1990
},
{
"epoch": 0.507453219156359,
"grad_norm": 6.005401235845572,
"learning_rate": 2.8707282898390703e-06,
"loss": 1.1596,
"step": 2000
},
{
"epoch": 0.5099904852521409,
"grad_norm": 8.195058772390636,
"learning_rate": 2.848810062426236e-06,
"loss": 1.1636,
"step": 2010
},
{
"epoch": 0.5125277513479226,
"grad_norm": 6.0595964456253055,
"learning_rate": 2.82686445658927e-06,
"loss": 1.1622,
"step": 2020
},
{
"epoch": 0.5150650174437044,
"grad_norm": 23.70711855670117,
"learning_rate": 2.8048931948590537e-06,
"loss": 1.1679,
"step": 2030
},
{
"epoch": 0.5176022835394862,
"grad_norm": 5.907186404005163,
"learning_rate": 2.7828980017802236e-06,
"loss": 1.1883,
"step": 2040
},
{
"epoch": 0.520139549635268,
"grad_norm": 7.344357732813401,
"learning_rate": 2.760880603775811e-06,
"loss": 1.173,
"step": 2050
},
{
"epoch": 0.5226768157310498,
"grad_norm": 7.426398590607293,
"learning_rate": 2.73884272901173e-06,
"loss": 1.191,
"step": 2060
},
{
"epoch": 0.5252140818268316,
"grad_norm": 7.057487598088109,
"learning_rate": 2.7167861072611374e-06,
"loss": 1.1717,
"step": 2070
},
{
"epoch": 0.5277513479226134,
"grad_norm": 7.223259822911996,
"learning_rate": 2.6947124697686553e-06,
"loss": 1.1638,
"step": 2080
},
{
"epoch": 0.5302886140183952,
"grad_norm": 10.407626808887795,
"learning_rate": 2.6726235491144886e-06,
"loss": 1.1682,
"step": 2090
},
{
"epoch": 0.532825880114177,
"grad_norm": 6.589791365210622,
"learning_rate": 2.650521079078433e-06,
"loss": 1.1645,
"step": 2100
},
{
"epoch": 0.5353631462099587,
"grad_norm": 11.999381746057887,
"learning_rate": 2.6284067945037855e-06,
"loss": 1.1646,
"step": 2110
},
{
"epoch": 0.5379004123057406,
"grad_norm": 7.613081340354655,
"learning_rate": 2.6062824311611775e-06,
"loss": 1.1705,
"step": 2120
},
{
"epoch": 0.5404376784015223,
"grad_norm": 6.366501800737734,
"learning_rate": 2.5841497256123326e-06,
"loss": 1.1573,
"step": 2130
},
{
"epoch": 0.5429749444973041,
"grad_norm": 18.74317163106254,
"learning_rate": 2.5620104150737626e-06,
"loss": 1.165,
"step": 2140
},
{
"epoch": 0.545512210593086,
"grad_norm": 11.370554877805095,
"learning_rate": 2.5398662372804105e-06,
"loss": 1.1701,
"step": 2150
},
{
"epoch": 0.5480494766888677,
"grad_norm": 6.5553872071155554,
"learning_rate": 2.517718930349254e-06,
"loss": 1.1475,
"step": 2160
},
{
"epoch": 0.5505867427846496,
"grad_norm": 5.741036264156473,
"learning_rate": 2.495570232642881e-06,
"loss": 1.1483,
"step": 2170
},
{
"epoch": 0.5531240088804313,
"grad_norm": 6.767884733983484,
"learning_rate": 2.47342188263304e-06,
"loss": 1.1689,
"step": 2180
},
{
"epoch": 0.5556612749762131,
"grad_norm": 13.529490798235566,
"learning_rate": 2.4512756187641936e-06,
"loss": 1.1614,
"step": 2190
},
{
"epoch": 0.558198541071995,
"grad_norm": 9.536331865874597,
"learning_rate": 2.4291331793170545e-06,
"loss": 1.1606,
"step": 2200
},
{
"epoch": 0.5607358071677767,
"grad_norm": 4.782009512084195,
"learning_rate": 2.4069963022721597e-06,
"loss": 1.1817,
"step": 2210
},
{
"epoch": 0.5632730732635585,
"grad_norm": 12.360173413460117,
"learning_rate": 2.3848667251734424e-06,
"loss": 1.1682,
"step": 2220
},
{
"epoch": 0.5658103393593403,
"grad_norm": 6.111517885569779,
"learning_rate": 2.3627461849918604e-06,
"loss": 1.1526,
"step": 2230
},
{
"epoch": 0.5683476054551221,
"grad_norm": 5.607775951834258,
"learning_rate": 2.3406364179890532e-06,
"loss": 1.1562,
"step": 2240
},
{
"epoch": 0.570884871550904,
"grad_norm": 14.959200098647552,
"learning_rate": 2.3185391595810635e-06,
"loss": 1.1592,
"step": 2250
},
{
"epoch": 0.5734221376466857,
"grad_norm": 5.308853581413382,
"learning_rate": 2.2964561442021255e-06,
"loss": 1.1432,
"step": 2260
},
{
"epoch": 0.5759594037424675,
"grad_norm": 7.548080678466621,
"learning_rate": 2.2743891051685222e-06,
"loss": 1.1594,
"step": 2270
},
{
"epoch": 0.5784966698382493,
"grad_norm": 12.307106947771112,
"learning_rate": 2.2523397745425387e-06,
"loss": 1.1627,
"step": 2280
},
{
"epoch": 0.5810339359340311,
"grad_norm": 6.981005935033202,
"learning_rate": 2.2303098829965124e-06,
"loss": 1.1669,
"step": 2290
},
{
"epoch": 0.5835712020298128,
"grad_norm": 4.0554069435676805,
"learning_rate": 2.208301159676987e-06,
"loss": 1.1631,
"step": 2300
},
{
"epoch": 0.5861084681255947,
"grad_norm": 5.723818233434463,
"learning_rate": 2.1863153320689958e-06,
"loss": 1.1435,
"step": 2310
},
{
"epoch": 0.5886457342213764,
"grad_norm": 14.029515219171557,
"learning_rate": 2.164354125860462e-06,
"loss": 1.1499,
"step": 2320
},
{
"epoch": 0.5911830003171583,
"grad_norm": 7.141317865826808,
"learning_rate": 2.1424192648067582e-06,
"loss": 1.1832,
"step": 2330
},
{
"epoch": 0.5937202664129401,
"grad_norm": 6.123987923859252,
"learning_rate": 2.120512470595396e-06,
"loss": 1.1503,
"step": 2340
},
{
"epoch": 0.5962575325087218,
"grad_norm": 6.013035072240124,
"learning_rate": 2.098635462710898e-06,
"loss": 1.1601,
"step": 2350
},
{
"epoch": 0.5987947986045037,
"grad_norm": 4.754989174196253,
"learning_rate": 2.0767899582998293e-06,
"loss": 1.1641,
"step": 2360
},
{
"epoch": 0.6013320647002854,
"grad_norm": 14.321979761759893,
"learning_rate": 2.054977672036018e-06,
"loss": 1.1417,
"step": 2370
},
{
"epoch": 0.6038693307960672,
"grad_norm": 10.760086157167773,
"learning_rate": 2.033200315985969e-06,
"loss": 1.1482,
"step": 2380
},
{
"epoch": 0.6064065968918491,
"grad_norm": 12.684127714673926,
"learning_rate": 2.011459599474483e-06,
"loss": 1.1497,
"step": 2390
},
{
"epoch": 0.6089438629876308,
"grad_norm": 6.757204304399067,
"learning_rate": 1.989757228950491e-06,
"loss": 1.1583,
"step": 2400
},
{
"epoch": 0.6114811290834127,
"grad_norm": 14.606095290784646,
"learning_rate": 1.9680949078531097e-06,
"loss": 1.1528,
"step": 2410
},
{
"epoch": 0.6140183951791944,
"grad_norm": 7.4618089543548995,
"learning_rate": 1.9464743364779388e-06,
"loss": 1.1594,
"step": 2420
},
{
"epoch": 0.6165556612749762,
"grad_norm": 5.801205347666092,
"learning_rate": 1.924897211843606e-06,
"loss": 1.1428,
"step": 2430
},
{
"epoch": 0.619092927370758,
"grad_norm": 9.568596376954924,
"learning_rate": 1.9033652275585624e-06,
"loss": 1.1321,
"step": 2440
},
{
"epoch": 0.6216301934665398,
"grad_norm": 6.223606964643186,
"learning_rate": 1.8818800736881518e-06,
"loss": 1.1568,
"step": 2450
},
{
"epoch": 0.6241674595623216,
"grad_norm": 55.44156669240569,
"learning_rate": 1.8604434366219573e-06,
"loss": 1.1594,
"step": 2460
},
{
"epoch": 0.6267047256581034,
"grad_norm": 10.778268976853173,
"learning_rate": 1.8390569989414303e-06,
"loss": 1.1588,
"step": 2470
},
{
"epoch": 0.6292419917538852,
"grad_norm": 5.7854893075832745,
"learning_rate": 1.8177224392878279e-06,
"loss": 1.1535,
"step": 2480
},
{
"epoch": 0.6317792578496669,
"grad_norm": 13.300897200946059,
"learning_rate": 1.7964414322304525e-06,
"loss": 1.1339,
"step": 2490
},
{
"epoch": 0.6343165239454488,
"grad_norm": 7.933290169459755,
"learning_rate": 1.7752156481352123e-06,
"loss": 1.1388,
"step": 2500
},
{
"epoch": 0.6368537900412305,
"grad_norm": 5.231248714361998,
"learning_rate": 1.7540467530335172e-06,
"loss": 1.1374,
"step": 2510
},
{
"epoch": 0.6393910561370124,
"grad_norm": 8.049075294237095,
"learning_rate": 1.732936408491504e-06,
"loss": 1.1587,
"step": 2520
},
{
"epoch": 0.6419283222327942,
"grad_norm": 10.356860200627077,
"learning_rate": 1.7118862714796253e-06,
"loss": 1.1572,
"step": 2530
},
{
"epoch": 0.6444655883285759,
"grad_norm": 4.057173286082011,
"learning_rate": 1.6908979942425868e-06,
"loss": 1.141,
"step": 2540
},
{
"epoch": 0.6470028544243578,
"grad_norm": 7.051670677225095,
"learning_rate": 1.6699732241696636e-06,
"loss": 1.1581,
"step": 2550
},
{
"epoch": 0.6495401205201395,
"grad_norm": 7.595067587076725,
"learning_rate": 1.649113603665396e-06,
"loss": 1.1466,
"step": 2560
},
{
"epoch": 0.6520773866159213,
"grad_norm": 6.1918160975787,
"learning_rate": 1.628320770020673e-06,
"loss": 1.1615,
"step": 2570
},
{
"epoch": 0.6546146527117032,
"grad_norm": 14.301509764417062,
"learning_rate": 1.6075963552842211e-06,
"loss": 1.1641,
"step": 2580
},
{
"epoch": 0.6571519188074849,
"grad_norm": 6.875665068775734,
"learning_rate": 1.5869419861345042e-06,
"loss": 1.142,
"step": 2590
},
{
"epoch": 0.6596891849032668,
"grad_norm": 7.541794891346959,
"learning_rate": 1.5663592837520453e-06,
"loss": 1.1469,
"step": 2600
},
{
"epoch": 0.6622264509990485,
"grad_norm": 12.46212690117266,
"learning_rate": 1.5458498636921727e-06,
"loss": 1.1598,
"step": 2610
},
{
"epoch": 0.6647637170948303,
"grad_norm": 7.379528099438572,
"learning_rate": 1.5254153357582208e-06,
"loss": 1.1582,
"step": 2620
},
{
"epoch": 0.6673009831906122,
"grad_norm": 4.907692232962446,
"learning_rate": 1.5050573038751693e-06,
"loss": 1.1452,
"step": 2630
},
{
"epoch": 0.6698382492863939,
"grad_norm": 6.079547602614854,
"learning_rate": 1.4847773659637546e-06,
"loss": 1.1297,
"step": 2640
},
{
"epoch": 0.6723755153821757,
"grad_norm": 5.450549895798723,
"learning_rate": 1.4645771138150433e-06,
"loss": 1.1603,
"step": 2650
},
{
"epoch": 0.6749127814779575,
"grad_norm": 4.368959175756317,
"learning_rate": 1.4444581329654916e-06,
"loss": 1.1439,
"step": 2660
},
{
"epoch": 0.6774500475737393,
"grad_norm": 9.44947484477195,
"learning_rate": 1.424422002572502e-06,
"loss": 1.1749,
"step": 2670
},
{
"epoch": 0.6799873136695211,
"grad_norm": 10.050734306484166,
"learning_rate": 1.404470295290461e-06,
"loss": 1.1545,
"step": 2680
},
{
"epoch": 0.6825245797653029,
"grad_norm": 12.005497232488006,
"learning_rate": 1.3846045771473116e-06,
"loss": 1.149,
"step": 2690
},
{
"epoch": 0.6850618458610847,
"grad_norm": 5.9236681532953925,
"learning_rate": 1.3648264074216282e-06,
"loss": 1.1394,
"step": 2700
},
{
"epoch": 0.6875991119568665,
"grad_norm": 9.66814179972378,
"learning_rate": 1.345137338520231e-06,
"loss": 1.1463,
"step": 2710
},
{
"epoch": 0.6901363780526483,
"grad_norm": 37.98586030031826,
"learning_rate": 1.3255389158563299e-06,
"loss": 1.1521,
"step": 2720
},
{
"epoch": 0.69267364414843,
"grad_norm": 4.6889018268529705,
"learning_rate": 1.3060326777282312e-06,
"loss": 1.1437,
"step": 2730
},
{
"epoch": 0.6952109102442119,
"grad_norm": 14.32310959027276,
"learning_rate": 1.2866201551985935e-06,
"loss": 1.1394,
"step": 2740
},
{
"epoch": 0.6977481763399936,
"grad_norm": 6.337986082170704,
"learning_rate": 1.2673028719742461e-06,
"loss": 1.1455,
"step": 2750
},
{
"epoch": 0.7002854424357755,
"grad_norm": 15.376375293741257,
"learning_rate": 1.2480823442866017e-06,
"loss": 1.1401,
"step": 2760
},
{
"epoch": 0.7028227085315573,
"grad_norm": 5.041231343225159,
"learning_rate": 1.2289600807726406e-06,
"loss": 1.156,
"step": 2770
},
{
"epoch": 0.705359974627339,
"grad_norm": 6.869042630836821,
"learning_rate": 1.2099375823564948e-06,
"loss": 1.1615,
"step": 2780
},
{
"epoch": 0.7078972407231209,
"grad_norm": 12.482011259323276,
"learning_rate": 1.1910163421316447e-06,
"loss": 1.1321,
"step": 2790
},
{
"epoch": 0.7104345068189026,
"grad_norm": 6.399176247127537,
"learning_rate": 1.1721978452437205e-06,
"loss": 1.153,
"step": 2800
},
{
"epoch": 0.7129717729146844,
"grad_norm": 6.270550675278331,
"learning_rate": 1.1534835687739323e-06,
"loss": 1.1551,
"step": 2810
},
{
"epoch": 0.7155090390104663,
"grad_norm": 4.695573760402524,
"learning_rate": 1.1348749816231347e-06,
"loss": 1.12,
"step": 2820
},
{
"epoch": 0.718046305106248,
"grad_norm": 6.7962150090749995,
"learning_rate": 1.1163735443965298e-06,
"loss": 1.1684,
"step": 2830
},
{
"epoch": 0.7205835712020298,
"grad_norm": 8.834255061556856,
"learning_rate": 1.0979807092890205e-06,
"loss": 1.1414,
"step": 2840
},
{
"epoch": 0.7231208372978116,
"grad_norm": 6.053865837653651,
"learning_rate": 1.079697919971232e-06,
"loss": 1.1355,
"step": 2850
},
{
"epoch": 0.7256581033935934,
"grad_norm": 19.55990021937961,
"learning_rate": 1.0615266114761932e-06,
"loss": 1.122,
"step": 2860
},
{
"epoch": 0.7281953694893752,
"grad_norm": 8.424559736567739,
"learning_rate": 1.0434682100866995e-06,
"loss": 1.1422,
"step": 2870
},
{
"epoch": 0.730732635585157,
"grad_norm": 5.986352186146355,
"learning_rate": 1.0255241332233636e-06,
"loss": 1.1312,
"step": 2880
},
{
"epoch": 0.7332699016809388,
"grad_norm": 20.203613019858313,
"learning_rate": 1.0076957893333602e-06,
"loss": 1.1271,
"step": 2890
},
{
"epoch": 0.7358071677767206,
"grad_norm": 19.714820742115197,
"learning_rate": 9.899845777798777e-07,
"loss": 1.154,
"step": 2900
},
{
"epoch": 0.7383444338725024,
"grad_norm": 13.735614202784944,
"learning_rate": 9.723918887322757e-07,
"loss": 1.1359,
"step": 2910
},
{
"epoch": 0.7408816999682841,
"grad_norm": 6.129734292866207,
"learning_rate": 9.549191030569751e-07,
"loss": 1.1313,
"step": 2920
},
{
"epoch": 0.743418966064066,
"grad_norm": 7.37718825001788,
"learning_rate": 9.375675922090707e-07,
"loss": 1.1319,
"step": 2930
},
{
"epoch": 0.7459562321598477,
"grad_norm": 4.427499085813396,
"learning_rate": 9.203387181246831e-07,
"loss": 1.1399,
"step": 2940
},
{
"epoch": 0.7484934982556296,
"grad_norm": 29.506989634102897,
"learning_rate": 9.032338331140603e-07,
"loss": 1.158,
"step": 2950
},
{
"epoch": 0.7510307643514114,
"grad_norm": 5.7586848271722495,
"learning_rate": 8.862542797554341e-07,
"loss": 1.1486,
"step": 2960
},
{
"epoch": 0.7535680304471931,
"grad_norm": 8.650793498941589,
"learning_rate": 8.694013907896363e-07,
"loss": 1.1461,
"step": 2970
},
{
"epoch": 0.756105296542975,
"grad_norm": 25.74372735382993,
"learning_rate": 8.526764890154965e-07,
"loss": 1.1353,
"step": 2980
},
{
"epoch": 0.7586425626387567,
"grad_norm": 5.0755009425487625,
"learning_rate": 8.36080887186011e-07,
"loss": 1.1553,
"step": 2990
},
{
"epoch": 0.7611798287345385,
"grad_norm": 5.4429094875047115,
"learning_rate": 8.19615887905301e-07,
"loss": 1.1598,
"step": 3000
},
{
"epoch": 0.7637170948303204,
"grad_norm": 6.821786199139472,
"learning_rate": 8.032827835263773e-07,
"loss": 1.1318,
"step": 3010
},
{
"epoch": 0.7662543609261021,
"grad_norm": 4.919272329647964,
"learning_rate": 7.87082856049696e-07,
"loss": 1.1188,
"step": 3020
},
{
"epoch": 0.768791627021884,
"grad_norm": 6.777841153958167,
"learning_rate": 7.710173770225335e-07,
"loss": 1.1351,
"step": 3030
},
{
"epoch": 0.7713288931176657,
"grad_norm": 9.668494091003875,
"learning_rate": 7.550876074391852e-07,
"loss": 1.1388,
"step": 3040
},
{
"epoch": 0.7738661592134475,
"grad_norm": 5.891510434517859,
"learning_rate": 7.392947976419867e-07,
"loss": 1.1307,
"step": 3050
},
{
"epoch": 0.7764034253092293,
"grad_norm": 4.629328829939173,
"learning_rate": 7.23640187223173e-07,
"loss": 1.1442,
"step": 3060
},
{
"epoch": 0.7789406914050111,
"grad_norm": 12.582358698587784,
"learning_rate": 7.081250049275804e-07,
"loss": 1.147,
"step": 3070
},
{
"epoch": 0.7814779575007929,
"grad_norm": 12.6779267097399,
"learning_rate": 6.927504685562075e-07,
"loss": 1.1461,
"step": 3080
},
{
"epoch": 0.7840152235965747,
"grad_norm": 4.951520708126675,
"learning_rate": 6.775177848706193e-07,
"loss": 1.1272,
"step": 3090
},
{
"epoch": 0.7865524896923565,
"grad_norm": 8.236149543257104,
"learning_rate": 6.624281494982359e-07,
"loss": 1.1478,
"step": 3100
},
{
"epoch": 0.7890897557881383,
"grad_norm": 15.16637456873453,
"learning_rate": 6.474827468384811e-07,
"loss": 1.1411,
"step": 3110
},
{
"epoch": 0.7916270218839201,
"grad_norm": 11.435649281181314,
"learning_rate": 6.326827499698218e-07,
"loss": 1.128,
"step": 3120
},
{
"epoch": 0.7941642879797018,
"grad_norm": 7.369755927934268,
"learning_rate": 6.180293205576873e-07,
"loss": 1.1503,
"step": 3130
},
{
"epoch": 0.7967015540754837,
"grad_norm": 7.910994986606561,
"learning_rate": 6.035236087632928e-07,
"loss": 1.1305,
"step": 3140
},
{
"epoch": 0.7992388201712655,
"grad_norm": 4.538240411984233,
"learning_rate": 5.891667531533643e-07,
"loss": 1.1624,
"step": 3150
},
{
"epoch": 0.8017760862670472,
"grad_norm": 7.092761380330254,
"learning_rate": 5.749598806107634e-07,
"loss": 1.1571,
"step": 3160
},
{
"epoch": 0.8043133523628291,
"grad_norm": 6.547620136814868,
"learning_rate": 5.609041062460451e-07,
"loss": 1.1253,
"step": 3170
},
{
"epoch": 0.8068506184586108,
"grad_norm": 5.7902809800252175,
"learning_rate": 5.470005333099288e-07,
"loss": 1.1229,
"step": 3180
},
{
"epoch": 0.8093878845543926,
"grad_norm": 7.71191733136425,
"learning_rate": 5.332502531067007e-07,
"loss": 1.1279,
"step": 3190
},
{
"epoch": 0.8119251506501745,
"grad_norm": 4.0181700071905295,
"learning_rate": 5.196543449085617e-07,
"loss": 1.1268,
"step": 3200
},
{
"epoch": 0.8144624167459562,
"grad_norm": 8.13336313004874,
"learning_rate": 5.062138758709098e-07,
"loss": 1.1409,
"step": 3210
},
{
"epoch": 0.8169996828417381,
"grad_norm": 9.802266531368725,
"learning_rate": 4.929299009485799e-07,
"loss": 1.1385,
"step": 3220
},
{
"epoch": 0.8195369489375198,
"grad_norm": 6.636513914507316,
"learning_rate": 4.798034628130396e-07,
"loss": 1.1454,
"step": 3230
},
{
"epoch": 0.8220742150333016,
"grad_norm": 4.227565986903538,
"learning_rate": 4.668355917705486e-07,
"loss": 1.1257,
"step": 3240
},
{
"epoch": 0.8246114811290834,
"grad_norm": 8.17867439720174,
"learning_rate": 4.540273056812869e-07,
"loss": 1.1337,
"step": 3250
},
{
"epoch": 0.8271487472248652,
"grad_norm": 5.424511876014973,
"learning_rate": 4.4137960987946707e-07,
"loss": 1.1265,
"step": 3260
},
{
"epoch": 0.829686013320647,
"grad_norm": 7.92386965501813,
"learning_rate": 4.2889349709441945e-07,
"loss": 1.1303,
"step": 3270
},
{
"epoch": 0.8322232794164288,
"grad_norm": 10.370529382382568,
"learning_rate": 4.165699473726756e-07,
"loss": 1.1401,
"step": 3280
},
{
"epoch": 0.8347605455122106,
"grad_norm": 7.578569844581463,
"learning_rate": 4.044099280010405e-07,
"loss": 1.1413,
"step": 3290
},
{
"epoch": 0.8372978116079924,
"grad_norm": 4.726374382802555,
"learning_rate": 3.9241439343067205e-07,
"loss": 1.1189,
"step": 3300
},
{
"epoch": 0.8398350777037742,
"grad_norm": 7.984970758900439,
"learning_rate": 3.8058428520216407e-07,
"loss": 1.1397,
"step": 3310
},
{
"epoch": 0.842372343799556,
"grad_norm": 9.346707987442102,
"learning_rate": 3.689205318716424e-07,
"loss": 1.1348,
"step": 3320
},
{
"epoch": 0.8449096098953378,
"grad_norm": 5.016282437243575,
"learning_rate": 3.574240489378847e-07,
"loss": 1.1473,
"step": 3330
},
{
"epoch": 0.8474468759911196,
"grad_norm": 6.2078193804951365,
"learning_rate": 3.4609573877046054e-07,
"loss": 1.1445,
"step": 3340
},
{
"epoch": 0.8499841420869013,
"grad_norm": 5.5201202597039085,
"learning_rate": 3.3493649053890325e-07,
"loss": 1.1434,
"step": 3350
},
{
"epoch": 0.8525214081826832,
"grad_norm": 17.229253814968214,
"learning_rate": 3.239471801429186e-07,
"loss": 1.1305,
"step": 3360
},
{
"epoch": 0.8550586742784649,
"grad_norm": 8.70530756713737,
"learning_rate": 3.1312867014363534e-07,
"loss": 1.1322,
"step": 3370
},
{
"epoch": 0.8575959403742468,
"grad_norm": 7.012844741047474,
"learning_rate": 3.024818096958995e-07,
"loss": 1.1303,
"step": 3380
},
{
"epoch": 0.8601332064700286,
"grad_norm": 4.756022687074469,
"learning_rate": 2.920074344816268e-07,
"loss": 1.1259,
"step": 3390
},
{
"epoch": 0.8626704725658103,
"grad_norm": 4.7358895032579555,
"learning_rate": 2.8170636664420715e-07,
"loss": 1.1299,
"step": 3400
},
{
"epoch": 0.8652077386615922,
"grad_norm": 8.683945332579281,
"learning_rate": 2.7157941472397393e-07,
"loss": 1.1496,
"step": 3410
},
{
"epoch": 0.8677450047573739,
"grad_norm": 5.37690958165037,
"learning_rate": 2.6162737359474195e-07,
"loss": 1.1242,
"step": 3420
},
{
"epoch": 0.8702822708531557,
"grad_norm": 7.164420959217891,
"learning_rate": 2.518510244014161e-07,
"loss": 1.1284,
"step": 3430
},
{
"epoch": 0.8728195369489375,
"grad_norm": 7.002358607609309,
"learning_rate": 2.4225113449867834e-07,
"loss": 1.1304,
"step": 3440
},
{
"epoch": 0.8753568030447193,
"grad_norm": 4.573582959362798,
"learning_rate": 2.3282845739075855e-07,
"loss": 1.1282,
"step": 3450
},
{
"epoch": 0.8778940691405012,
"grad_norm": 4.899953600348181,
"learning_rate": 2.2358373267229006e-07,
"loss": 1.1298,
"step": 3460
},
{
"epoch": 0.8804313352362829,
"grad_norm": 5.271497221424272,
"learning_rate": 2.1451768597025995e-07,
"loss": 1.1256,
"step": 3470
},
{
"epoch": 0.8829686013320647,
"grad_norm": 6.165041784465411,
"learning_rate": 2.0563102888705027e-07,
"loss": 1.1286,
"step": 3480
},
{
"epoch": 0.8855058674278465,
"grad_norm": 11.088879537008697,
"learning_rate": 1.9692445894458845e-07,
"loss": 1.1252,
"step": 3490
},
{
"epoch": 0.8880431335236283,
"grad_norm": 9.424576280240377,
"learning_rate": 1.883986595295953e-07,
"loss": 1.1151,
"step": 3500
},
{
"epoch": 0.89058039961941,
"grad_norm": 5.754831282009581,
"learning_rate": 1.8005429983994487e-07,
"loss": 1.1334,
"step": 3510
},
{
"epoch": 0.8931176657151919,
"grad_norm": 10.215467753583647,
"learning_rate": 1.7189203483213984e-07,
"loss": 1.1411,
"step": 3520
},
{
"epoch": 0.8956549318109737,
"grad_norm": 12.91382494925556,
"learning_rate": 1.6391250516990448e-07,
"loss": 1.1236,
"step": 3530
},
{
"epoch": 0.8981921979067554,
"grad_norm": 10.128521645418763,
"learning_rate": 1.5611633717389467e-07,
"loss": 1.1159,
"step": 3540
},
{
"epoch": 0.9007294640025373,
"grad_norm": 9.199799321868463,
"learning_rate": 1.4850414277254088e-07,
"loss": 1.1368,
"step": 3550
},
{
"epoch": 0.903266730098319,
"grad_norm": 4.892191302349406,
"learning_rate": 1.41076519454017e-07,
"loss": 1.1305,
"step": 3560
},
{
"epoch": 0.9058039961941009,
"grad_norm": 5.956493220397556,
"learning_rate": 1.3383405021933998e-07,
"loss": 1.1264,
"step": 3570
},
{
"epoch": 0.9083412622898827,
"grad_norm": 5.275784999413581,
"learning_rate": 1.267773035366135e-07,
"loss": 1.1195,
"step": 3580
},
{
"epoch": 0.9108785283856644,
"grad_norm": 9.540366769966049,
"learning_rate": 1.1990683329640567e-07,
"loss": 1.1381,
"step": 3590
},
{
"epoch": 0.9134157944814463,
"grad_norm": 4.801855741665693,
"learning_rate": 1.1322317876827416e-07,
"loss": 1.1315,
"step": 3600
},
{
"epoch": 0.915953060577228,
"grad_norm": 5.056163865646408,
"learning_rate": 1.0672686455843934e-07,
"loss": 1.1382,
"step": 3610
},
{
"epoch": 0.9184903266730098,
"grad_norm": 8.706560880258346,
"learning_rate": 1.004184005686068e-07,
"loss": 1.119,
"step": 3620
},
{
"epoch": 0.9210275927687916,
"grad_norm": 6.266874220907701,
"learning_rate": 9.429828195594459e-08,
"loss": 1.1316,
"step": 3630
},
{
"epoch": 0.9235648588645734,
"grad_norm": 4.598497385552673,
"learning_rate": 8.836698909421848e-08,
"loss": 1.1316,
"step": 3640
},
{
"epoch": 0.9261021249603553,
"grad_norm": 5.423769767831511,
"learning_rate": 8.2624987536086e-08,
"loss": 1.1305,
"step": 3650
},
{
"epoch": 0.928639391056137,
"grad_norm": 7.1199155337882045,
"learning_rate": 7.707272797655597e-08,
"loss": 1.104,
"step": 3660
},
{
"epoch": 0.9311766571519188,
"grad_norm": 5.964210281087181,
"learning_rate": 7.171064621761121e-08,
"loss": 1.1392,
"step": 3670
},
{
"epoch": 0.9337139232477006,
"grad_norm": 7.0222277210918485,
"learning_rate": 6.653916313400483e-08,
"loss": 1.1307,
"step": 3680
},
{
"epoch": 0.9362511893434824,
"grad_norm": 6.820545499544659,
"learning_rate": 6.155868464022218e-08,
"loss": 1.1311,
"step": 3690
},
{
"epoch": 0.9387884554392641,
"grad_norm": 8.543306468091403,
"learning_rate": 5.676960165862333e-08,
"loss": 1.1396,
"step": 3700
},
{
"epoch": 0.941325721535046,
"grad_norm": 5.998847968884944,
"learning_rate": 5.217229008875696e-08,
"loss": 1.1142,
"step": 3710
},
{
"epoch": 0.9438629876308278,
"grad_norm": 7.233927115402458,
"learning_rate": 4.7767110777856285e-08,
"loss": 1.1316,
"step": 3720
},
{
"epoch": 0.9464002537266096,
"grad_norm": 14.818426116599344,
"learning_rate": 4.355440949251638e-08,
"loss": 1.1291,
"step": 3730
},
{
"epoch": 0.9489375198223914,
"grad_norm": 11.275146260000886,
"learning_rate": 3.953451689155369e-08,
"loss": 1.1377,
"step": 3740
},
{
"epoch": 0.9514747859181731,
"grad_norm": 4.976497150845647,
"learning_rate": 3.5707748500053706e-08,
"loss": 1.1341,
"step": 3750
},
{
"epoch": 0.954012052013955,
"grad_norm": 8.165947802169324,
"learning_rate": 3.2074404684603325e-08,
"loss": 1.1131,
"step": 3760
},
{
"epoch": 0.9565493181097368,
"grad_norm": 7.141736239500508,
"learning_rate": 2.863477062971659e-08,
"loss": 1.1354,
"step": 3770
},
{
"epoch": 0.9590865842055185,
"grad_norm": 6.6723385448588814,
"learning_rate": 2.5389116315448768e-08,
"loss": 1.1349,
"step": 3780
},
{
"epoch": 0.9616238503013004,
"grad_norm": 9.205939869377257,
"learning_rate": 2.2337696496206317e-08,
"loss": 1.1358,
"step": 3790
},
{
"epoch": 0.9641611163970821,
"grad_norm": 6.430232530777119,
"learning_rate": 1.948075068075067e-08,
"loss": 1.1454,
"step": 3800
},
{
"epoch": 0.966698382492864,
"grad_norm": 4.597818211923664,
"learning_rate": 1.6818503113398832e-08,
"loss": 1.121,
"step": 3810
},
{
"epoch": 0.9692356485886457,
"grad_norm": 11.864724480832432,
"learning_rate": 1.4351162756422454e-08,
"loss": 1.1279,
"step": 3820
},
{
"epoch": 0.9717729146844275,
"grad_norm": 10.20794178542716,
"learning_rate": 1.2078923273646236e-08,
"loss": 1.1423,
"step": 3830
},
{
"epoch": 0.9743101807802094,
"grad_norm": 6.419864028983356,
"learning_rate": 1.0001963015247585e-08,
"loss": 1.1206,
"step": 3840
},
{
"epoch": 0.9768474468759911,
"grad_norm": 6.587741451008303,
"learning_rate": 8.120445003755306e-09,
"loss": 1.1429,
"step": 3850
},
{
"epoch": 0.9793847129717729,
"grad_norm": 4.807651227856488,
"learning_rate": 6.434516921257905e-09,
"loss": 1.1184,
"step": 3860
},
{
"epoch": 0.9819219790675547,
"grad_norm": 17.54136739327046,
"learning_rate": 4.9443110978078525e-09,
"loss": 1.143,
"step": 3870
},
{
"epoch": 0.9844592451633365,
"grad_norm": 84.07353671535705,
"learning_rate": 3.649944501037672e-09,
"loss": 1.1314,
"step": 3880
},
{
"epoch": 0.9869965112591182,
"grad_norm": 5.176156752927596,
"learning_rate": 2.5515187269772866e-09,
"loss": 1.1163,
"step": 3890
},
{
"epoch": 0.9895337773549001,
"grad_norm": 4.911729600848666,
"learning_rate": 1.6491199920809498e-09,
"loss": 1.1519,
"step": 3900
},
{
"epoch": 0.9920710434506819,
"grad_norm": 5.8502366235556345,
"learning_rate": 9.428191264596042e-10,
"loss": 1.1472,
"step": 3910
},
{
"epoch": 0.9946083095464637,
"grad_norm": 17.952194768106622,
"learning_rate": 4.3267156832033085e-10,
"loss": 1.1114,
"step": 3920
},
{
"epoch": 0.9971455756422455,
"grad_norm": 13.67739524605559,
"learning_rate": 1.187173596167712e-10,
"loss": 1.1338,
"step": 3930
},
{
"epoch": 0.9996828417380272,
"grad_norm": 10.095200847738084,
"learning_rate": 9.811429044215282e-13,
"loss": 1.1399,
"step": 3940
},
{
"epoch": 0.9999365683476055,
"step": 3941,
"total_flos": 2.4957269414919537e+18,
"train_loss": 1.201914404482746,
"train_runtime": 115316.4416,
"train_samples_per_second": 4.375,
"train_steps_per_second": 0.034
}
],
"logging_steps": 10,
"max_steps": 3941,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.4957269414919537e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}