{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999365683476055, "eval_steps": 500, "global_step": 3941, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002537266095781795, "grad_norm": 112.85723727111593, "learning_rate": 1.2658227848101266e-07, "loss": 1.8967, "step": 10 }, { "epoch": 0.00507453219156359, "grad_norm": 111.7277255600065, "learning_rate": 2.5316455696202533e-07, "loss": 1.8962, "step": 20 }, { "epoch": 0.007611798287345386, "grad_norm": 100.54525732943956, "learning_rate": 3.79746835443038e-07, "loss": 1.8773, "step": 30 }, { "epoch": 0.01014906438312718, "grad_norm": 60.43888528431074, "learning_rate": 5.063291139240507e-07, "loss": 1.7742, "step": 40 }, { "epoch": 0.012686330478908976, "grad_norm": 28.26912001667364, "learning_rate": 6.329113924050634e-07, "loss": 1.7141, "step": 50 }, { "epoch": 0.015223596574690771, "grad_norm": 27.480791814917016, "learning_rate": 7.59493670886076e-07, "loss": 1.6293, "step": 60 }, { "epoch": 0.017760862670472565, "grad_norm": 20.239901551582367, "learning_rate": 8.860759493670887e-07, "loss": 1.5743, "step": 70 }, { "epoch": 0.02029812876625436, "grad_norm": 11.923875375009446, "learning_rate": 1.0126582278481013e-06, "loss": 1.5202, "step": 80 }, { "epoch": 0.022835394862036156, "grad_norm": 6.627037994110648, "learning_rate": 1.139240506329114e-06, "loss": 1.4872, "step": 90 }, { "epoch": 0.02537266095781795, "grad_norm": 8.16791147643463, "learning_rate": 1.2658227848101267e-06, "loss": 1.4824, "step": 100 }, { "epoch": 0.027909927053599747, "grad_norm": 9.357053020719055, "learning_rate": 1.3924050632911392e-06, "loss": 1.4456, "step": 110 }, { "epoch": 0.030447193149381543, "grad_norm": 8.946206463663723, "learning_rate": 1.518987341772152e-06, "loss": 1.4253, "step": 120 }, { "epoch": 0.032984459245163335, "grad_norm": 6.684895053435472, "learning_rate": 1.6455696202531647e-06, "loss": 1.4119, "step": 130 }, { "epoch": 0.03552172534094513, "grad_norm": 5.858068093439728, "learning_rate": 1.7721518987341774e-06, "loss": 1.3944, "step": 140 }, { "epoch": 0.038058991436726926, "grad_norm": 10.736099643334308, "learning_rate": 1.8987341772151901e-06, "loss": 1.3937, "step": 150 }, { "epoch": 0.04059625753250872, "grad_norm": 5.52407932247684, "learning_rate": 2.0253164556962026e-06, "loss": 1.3624, "step": 160 }, { "epoch": 0.04313352362829052, "grad_norm": 8.107714144715326, "learning_rate": 2.1518987341772153e-06, "loss": 1.3854, "step": 170 }, { "epoch": 0.04567078972407231, "grad_norm": 5.726773653383632, "learning_rate": 2.278481012658228e-06, "loss": 1.3745, "step": 180 }, { "epoch": 0.04820805581985411, "grad_norm": 5.421447552101552, "learning_rate": 2.4050632911392408e-06, "loss": 1.3616, "step": 190 }, { "epoch": 0.0507453219156359, "grad_norm": 5.519589755451587, "learning_rate": 2.5316455696202535e-06, "loss": 1.3802, "step": 200 }, { "epoch": 0.0532825880114177, "grad_norm": 4.997598050858228, "learning_rate": 2.6582278481012658e-06, "loss": 1.3501, "step": 210 }, { "epoch": 0.055819854107199494, "grad_norm": 7.739443027895296, "learning_rate": 2.7848101265822785e-06, "loss": 1.3529, "step": 220 }, { "epoch": 0.05835712020298129, "grad_norm": 14.153932111981923, "learning_rate": 2.9113924050632912e-06, "loss": 1.3425, "step": 230 }, { "epoch": 0.060894386298763085, "grad_norm": 6.821474472121072, "learning_rate": 3.037974683544304e-06, "loss": 1.3268, "step": 240 }, { "epoch": 0.06343165239454487, "grad_norm": 7.105335682421715, "learning_rate": 3.164556962025317e-06, "loss": 1.3391, "step": 250 }, { "epoch": 0.06596891849032667, "grad_norm": 9.157922661641283, "learning_rate": 3.2911392405063294e-06, "loss": 1.331, "step": 260 }, { "epoch": 0.06850618458610847, "grad_norm": 8.558636103035896, "learning_rate": 3.417721518987342e-06, "loss": 1.3318, "step": 270 }, { "epoch": 0.07104345068189026, "grad_norm": 6.250103925808961, "learning_rate": 3.544303797468355e-06, "loss": 1.3356, "step": 280 }, { "epoch": 0.07358071677767206, "grad_norm": 6.294667609868247, "learning_rate": 3.6708860759493675e-06, "loss": 1.32, "step": 290 }, { "epoch": 0.07611798287345385, "grad_norm": 6.6361456210836725, "learning_rate": 3.7974683544303802e-06, "loss": 1.3224, "step": 300 }, { "epoch": 0.07865524896923565, "grad_norm": 8.289069596597512, "learning_rate": 3.924050632911393e-06, "loss": 1.3176, "step": 310 }, { "epoch": 0.08119251506501744, "grad_norm": 5.542830103144929, "learning_rate": 4.050632911392405e-06, "loss": 1.2962, "step": 320 }, { "epoch": 0.08372978116079924, "grad_norm": 10.531069717984824, "learning_rate": 4.177215189873418e-06, "loss": 1.3005, "step": 330 }, { "epoch": 0.08626704725658103, "grad_norm": 17.37898460345658, "learning_rate": 4.303797468354431e-06, "loss": 1.3031, "step": 340 }, { "epoch": 0.08880431335236283, "grad_norm": 9.252485190691019, "learning_rate": 4.430379746835443e-06, "loss": 1.3107, "step": 350 }, { "epoch": 0.09134157944814462, "grad_norm": 7.525737017146553, "learning_rate": 4.556962025316456e-06, "loss": 1.2886, "step": 360 }, { "epoch": 0.09387884554392642, "grad_norm": 10.083996939013279, "learning_rate": 4.683544303797468e-06, "loss": 1.2954, "step": 370 }, { "epoch": 0.09641611163970822, "grad_norm": 18.864544413704667, "learning_rate": 4.8101265822784815e-06, "loss": 1.2959, "step": 380 }, { "epoch": 0.09895337773549001, "grad_norm": 15.18986427449958, "learning_rate": 4.936708860759495e-06, "loss": 1.2918, "step": 390 }, { "epoch": 0.1014906438312718, "grad_norm": 8.154706578846366, "learning_rate": 4.999975471465892e-06, "loss": 1.2952, "step": 400 }, { "epoch": 0.1040279099270536, "grad_norm": 8.626125448906336, "learning_rate": 4.999779246080933e-06, "loss": 1.2812, "step": 410 }, { "epoch": 0.1065651760228354, "grad_norm": 14.880568433586623, "learning_rate": 4.999386810712926e-06, "loss": 1.2532, "step": 420 }, { "epoch": 0.1091024421186172, "grad_norm": 10.892903720339616, "learning_rate": 4.9987981961644855e-06, "loss": 1.2982, "step": 430 }, { "epoch": 0.11163970821439899, "grad_norm": 4.797131746271338, "learning_rate": 4.998013448636512e-06, "loss": 1.271, "step": 440 }, { "epoch": 0.11417697431018078, "grad_norm": 18.557197921826557, "learning_rate": 4.997032629724564e-06, "loss": 1.2825, "step": 450 }, { "epoch": 0.11671424040596258, "grad_norm": 15.001794851401208, "learning_rate": 4.995855816414024e-06, "loss": 1.2765, "step": 460 }, { "epoch": 0.11925150650174438, "grad_norm": 10.965425508561747, "learning_rate": 4.9944831010740576e-06, "loss": 1.2585, "step": 470 }, { "epoch": 0.12178877259752617, "grad_norm": 6.63862480684244, "learning_rate": 4.992914591450358e-06, "loss": 1.2929, "step": 480 }, { "epoch": 0.12432603869330797, "grad_norm": 6.984972641361711, "learning_rate": 4.991150410656697e-06, "loss": 1.2733, "step": 490 }, { "epoch": 0.12686330478908975, "grad_norm": 18.903688844829993, "learning_rate": 4.9891906971652545e-06, "loss": 1.2558, "step": 500 }, { "epoch": 0.12940057088487156, "grad_norm": 7.177194302737898, "learning_rate": 4.987035604795753e-06, "loss": 1.2659, "step": 510 }, { "epoch": 0.13193783698065334, "grad_norm": 10.981192386536817, "learning_rate": 4.984685302703385e-06, "loss": 1.2606, "step": 520 }, { "epoch": 0.13447510307643515, "grad_norm": 9.47429133139194, "learning_rate": 4.982139975365533e-06, "loss": 1.2785, "step": 530 }, { "epoch": 0.13701236917221693, "grad_norm": 17.731391325371494, "learning_rate": 4.979399822567292e-06, "loss": 1.2709, "step": 540 }, { "epoch": 0.13954963526799874, "grad_norm": 13.256927828543148, "learning_rate": 4.976465059385788e-06, "loss": 1.248, "step": 550 }, { "epoch": 0.14208690136378052, "grad_norm": 9.285697826647898, "learning_rate": 4.973335916173294e-06, "loss": 1.2462, "step": 560 }, { "epoch": 0.14462416745956233, "grad_norm": 10.853767808913329, "learning_rate": 4.970012638539152e-06, "loss": 1.2533, "step": 570 }, { "epoch": 0.1471614335553441, "grad_norm": 16.143378806845213, "learning_rate": 4.966495487330496e-06, "loss": 1.2526, "step": 580 }, { "epoch": 0.14969869965112592, "grad_norm": 46.2688542748775, "learning_rate": 4.962784738611774e-06, "loss": 1.265, "step": 590 }, { "epoch": 0.1522359657469077, "grad_norm": 6.132364126798611, "learning_rate": 4.958880683643082e-06, "loss": 1.2733, "step": 600 }, { "epoch": 0.1547732318426895, "grad_norm": 7.964503594558826, "learning_rate": 4.954783628857302e-06, "loss": 1.2626, "step": 610 }, { "epoch": 0.1573104979384713, "grad_norm": 13.68977454870837, "learning_rate": 4.95049389583605e-06, "loss": 1.2657, "step": 620 }, { "epoch": 0.1598477640342531, "grad_norm": 14.540626560719817, "learning_rate": 4.9460118212844355e-06, "loss": 1.2372, "step": 630 }, { "epoch": 0.16238503013003489, "grad_norm": 17.246204651829867, "learning_rate": 4.941337757004631e-06, "loss": 1.2355, "step": 640 }, { "epoch": 0.1649222962258167, "grad_norm": 5.660216888492281, "learning_rate": 4.936472069868262e-06, "loss": 1.2439, "step": 650 }, { "epoch": 0.16745956232159848, "grad_norm": 32.30203917124339, "learning_rate": 4.931415141787607e-06, "loss": 1.2384, "step": 660 }, { "epoch": 0.16999682841738029, "grad_norm": 9.519199679528043, "learning_rate": 4.926167369685626e-06, "loss": 1.2452, "step": 670 }, { "epoch": 0.17253409451316207, "grad_norm": 9.559300124244501, "learning_rate": 4.920729165464799e-06, "loss": 1.2564, "step": 680 }, { "epoch": 0.17507136060894388, "grad_norm": 7.099728750507589, "learning_rate": 4.915100955974802e-06, "loss": 1.2695, "step": 690 }, { "epoch": 0.17760862670472566, "grad_norm": 8.197172932423973, "learning_rate": 4.909283182978998e-06, "loss": 1.2379, "step": 700 }, { "epoch": 0.18014589280050744, "grad_norm": 9.726988142249633, "learning_rate": 4.903276303119765e-06, "loss": 1.2483, "step": 710 }, { "epoch": 0.18268315889628925, "grad_norm": 5.550536710160647, "learning_rate": 4.897080787882656e-06, "loss": 1.2493, "step": 720 }, { "epoch": 0.18522042499207103, "grad_norm": 15.553779441262579, "learning_rate": 4.890697123559385e-06, "loss": 1.2635, "step": 730 }, { "epoch": 0.18775769108785284, "grad_norm": 7.070319331844888, "learning_rate": 4.884125811209665e-06, "loss": 1.2439, "step": 740 }, { "epoch": 0.19029495718363462, "grad_norm": 9.252450309844942, "learning_rate": 4.877367366621874e-06, "loss": 1.2423, "step": 750 }, { "epoch": 0.19283222327941643, "grad_norm": 8.297702205658675, "learning_rate": 4.870422320272576e-06, "loss": 1.2322, "step": 760 }, { "epoch": 0.1953694893751982, "grad_norm": 14.013643448769688, "learning_rate": 4.863291217284872e-06, "loss": 1.2354, "step": 770 }, { "epoch": 0.19790675547098002, "grad_norm": 8.039135652565351, "learning_rate": 4.855974617385629e-06, "loss": 1.2257, "step": 780 }, { "epoch": 0.2004440215667618, "grad_norm": 6.465541475557579, "learning_rate": 4.8484730948615336e-06, "loss": 1.2477, "step": 790 }, { "epoch": 0.2029812876625436, "grad_norm": 38.736482675881, "learning_rate": 4.840787238514019e-06, "loss": 1.242, "step": 800 }, { "epoch": 0.2055185537583254, "grad_norm": 6.2027098825049745, "learning_rate": 4.832917651613055e-06, "loss": 1.2481, "step": 810 }, { "epoch": 0.2080558198541072, "grad_norm": 7.699764524449651, "learning_rate": 4.824864951849787e-06, "loss": 1.2422, "step": 820 }, { "epoch": 0.210593085949889, "grad_norm": 7.045914854236814, "learning_rate": 4.8166297712880635e-06, "loss": 1.2296, "step": 830 }, { "epoch": 0.2131303520456708, "grad_norm": 6.848231755692189, "learning_rate": 4.808212756314815e-06, "loss": 1.2185, "step": 840 }, { "epoch": 0.21566761814145258, "grad_norm": 7.591181911551738, "learning_rate": 4.7996145675893255e-06, "loss": 1.2348, "step": 850 }, { "epoch": 0.2182048842372344, "grad_norm": 13.992556734602806, "learning_rate": 4.7908358799913735e-06, "loss": 1.259, "step": 860 }, { "epoch": 0.22074215033301617, "grad_norm": 13.882288651265467, "learning_rate": 4.781877382568261e-06, "loss": 1.2305, "step": 870 }, { "epoch": 0.22327941642879798, "grad_norm": 8.53486541599463, "learning_rate": 4.772739778480729e-06, "loss": 1.2343, "step": 880 }, { "epoch": 0.22581668252457976, "grad_norm": 9.455012927215373, "learning_rate": 4.7634237849477645e-06, "loss": 1.2194, "step": 890 }, { "epoch": 0.22835394862036157, "grad_norm": 13.790102358483903, "learning_rate": 4.7539301331903125e-06, "loss": 1.2267, "step": 900 }, { "epoch": 0.23089121471614335, "grad_norm": 14.2481962464266, "learning_rate": 4.7442595683738705e-06, "loss": 1.2132, "step": 910 }, { "epoch": 0.23342848081192516, "grad_norm": 14.416373746309988, "learning_rate": 4.734412849550007e-06, "loss": 1.2094, "step": 920 }, { "epoch": 0.23596574690770694, "grad_norm": 9.539618032350603, "learning_rate": 4.7243907495967815e-06, "loss": 1.2294, "step": 930 }, { "epoch": 0.23850301300348875, "grad_norm": 5.29440353230921, "learning_rate": 4.7141940551580824e-06, "loss": 1.2208, "step": 940 }, { "epoch": 0.24104027909927053, "grad_norm": 12.765581152374985, "learning_rate": 4.703823566581877e-06, "loss": 1.2324, "step": 950 }, { "epoch": 0.24357754519505234, "grad_norm": 8.58495407107471, "learning_rate": 4.693280097857398e-06, "loss": 1.2222, "step": 960 }, { "epoch": 0.24611481129083412, "grad_norm": 9.562572836922989, "learning_rate": 4.6825644765512475e-06, "loss": 1.2185, "step": 970 }, { "epoch": 0.24865207738661593, "grad_norm": 9.23186285176845, "learning_rate": 4.6716775437424465e-06, "loss": 1.2192, "step": 980 }, { "epoch": 0.25118934348239774, "grad_norm": 5.073620743419921, "learning_rate": 4.660620153956409e-06, "loss": 1.2241, "step": 990 }, { "epoch": 0.2537266095781795, "grad_norm": 8.006469237916, "learning_rate": 4.649393175097879e-06, "loss": 1.2281, "step": 1000 }, { "epoch": 0.2562638756739613, "grad_norm": 8.265727241877757, "learning_rate": 4.637997488382801e-06, "loss": 1.2286, "step": 1010 }, { "epoch": 0.2588011417697431, "grad_norm": 6.416174543416175, "learning_rate": 4.626433988269156e-06, "loss": 1.2217, "step": 1020 }, { "epoch": 0.2613384078655249, "grad_norm": 7.028656405225393, "learning_rate": 4.614703582386755e-06, "loss": 1.2181, "step": 1030 }, { "epoch": 0.2638756739613067, "grad_norm": 5.865759830273811, "learning_rate": 4.602807191465993e-06, "loss": 1.2382, "step": 1040 }, { "epoch": 0.2664129400570885, "grad_norm": 13.872254227369961, "learning_rate": 4.5907457492655895e-06, "loss": 1.2359, "step": 1050 }, { "epoch": 0.2689502061528703, "grad_norm": 13.664329240119352, "learning_rate": 4.578520202499286e-06, "loss": 1.2154, "step": 1060 }, { "epoch": 0.27148747224865205, "grad_norm": 14.099557581365481, "learning_rate": 4.566131510761548e-06, "loss": 1.2279, "step": 1070 }, { "epoch": 0.27402473834443386, "grad_norm": 9.810709350520776, "learning_rate": 4.553580646452238e-06, "loss": 1.2276, "step": 1080 }, { "epoch": 0.27656200444021567, "grad_norm": 6.042192876271809, "learning_rate": 4.5408685947002915e-06, "loss": 1.2159, "step": 1090 }, { "epoch": 0.2790992705359975, "grad_norm": 7.672603258347116, "learning_rate": 4.5279963532864e-06, "loss": 1.209, "step": 1100 }, { "epoch": 0.28163653663177923, "grad_norm": 7.981333790512745, "learning_rate": 4.5149649325646875e-06, "loss": 1.2227, "step": 1110 }, { "epoch": 0.28417380272756104, "grad_norm": 7.786264862698563, "learning_rate": 4.501775355383406e-06, "loss": 1.2299, "step": 1120 }, { "epoch": 0.28671106882334285, "grad_norm": 23.110853155380898, "learning_rate": 4.48842865700466e-06, "loss": 1.1972, "step": 1130 }, { "epoch": 0.28924833491912466, "grad_norm": 13.48010845409501, "learning_rate": 4.474925885023136e-06, "loss": 1.2043, "step": 1140 }, { "epoch": 0.2917856010149064, "grad_norm": 9.52808074720464, "learning_rate": 4.461268099283886e-06, "loss": 1.2107, "step": 1150 }, { "epoch": 0.2943228671106882, "grad_norm": 7.355296672812608, "learning_rate": 4.4474563717991345e-06, "loss": 1.2014, "step": 1160 }, { "epoch": 0.29686013320647003, "grad_norm": 6.320569333174802, "learning_rate": 4.433491786664134e-06, "loss": 1.2068, "step": 1170 }, { "epoch": 0.29939739930225184, "grad_norm": 10.030481328592858, "learning_rate": 4.419375439972075e-06, "loss": 1.2276, "step": 1180 }, { "epoch": 0.3019346653980336, "grad_norm": 6.409643921761458, "learning_rate": 4.405108439728057e-06, "loss": 1.217, "step": 1190 }, { "epoch": 0.3044719314938154, "grad_norm": 8.589272885921144, "learning_rate": 4.390691905762111e-06, "loss": 1.2141, "step": 1200 }, { "epoch": 0.3070091975895972, "grad_norm": 8.928374282259929, "learning_rate": 4.376126969641311e-06, "loss": 1.2067, "step": 1210 }, { "epoch": 0.309546463685379, "grad_norm": 6.743646650661921, "learning_rate": 4.361414774580952e-06, "loss": 1.2126, "step": 1220 }, { "epoch": 0.3120837297811608, "grad_norm": 11.981450710885838, "learning_rate": 4.34655647535482e-06, "loss": 1.1928, "step": 1230 }, { "epoch": 0.3146209958769426, "grad_norm": 9.30568404144152, "learning_rate": 4.3315532382045535e-06, "loss": 1.2233, "step": 1240 }, { "epoch": 0.3171582619727244, "grad_norm": 9.124358676503277, "learning_rate": 4.3164062407480974e-06, "loss": 1.2208, "step": 1250 }, { "epoch": 0.3196955280685062, "grad_norm": 9.471883506779196, "learning_rate": 4.301116671887281e-06, "loss": 1.2009, "step": 1260 }, { "epoch": 0.32223279416428796, "grad_norm": 6.941617674454578, "learning_rate": 4.285685731714493e-06, "loss": 1.2188, "step": 1270 }, { "epoch": 0.32477006026006977, "grad_norm": 12.5252338418682, "learning_rate": 4.270114631418487e-06, "loss": 1.1947, "step": 1280 }, { "epoch": 0.3273073263558516, "grad_norm": 11.108780709122486, "learning_rate": 4.254404593189316e-06, "loss": 1.2063, "step": 1290 }, { "epoch": 0.3298445924516334, "grad_norm": 8.555766487500795, "learning_rate": 4.238556850122394e-06, "loss": 1.1988, "step": 1300 }, { "epoch": 0.33238185854741514, "grad_norm": 5.929688704429909, "learning_rate": 4.222572646121723e-06, "loss": 1.2084, "step": 1310 }, { "epoch": 0.33491912464319695, "grad_norm": 7.403468884662153, "learning_rate": 4.2064532358022446e-06, "loss": 1.209, "step": 1320 }, { "epoch": 0.33745639073897876, "grad_norm": 11.518020383952775, "learning_rate": 4.190199884391371e-06, "loss": 1.2069, "step": 1330 }, { "epoch": 0.33999365683476057, "grad_norm": 8.809667396360213, "learning_rate": 4.173813867629672e-06, "loss": 1.2291, "step": 1340 }, { "epoch": 0.3425309229305423, "grad_norm": 10.783849201384296, "learning_rate": 4.157296471670747e-06, "loss": 1.1787, "step": 1350 }, { "epoch": 0.34506818902632413, "grad_norm": 5.923243263259089, "learning_rate": 4.140648992980269e-06, "loss": 1.1972, "step": 1360 }, { "epoch": 0.34760545512210594, "grad_norm": 7.577491053933149, "learning_rate": 4.1238727382342245e-06, "loss": 1.2193, "step": 1370 }, { "epoch": 0.35014272121788775, "grad_norm": 15.766504679065461, "learning_rate": 4.106969024216348e-06, "loss": 1.1896, "step": 1380 }, { "epoch": 0.3526799873136695, "grad_norm": 5.128475553118625, "learning_rate": 4.089939177714778e-06, "loss": 1.1916, "step": 1390 }, { "epoch": 0.3552172534094513, "grad_norm": 31.663873389243054, "learning_rate": 4.0727845354178995e-06, "loss": 1.2037, "step": 1400 }, { "epoch": 0.3577545195052331, "grad_norm": 5.0548828855937975, "learning_rate": 4.055506443809441e-06, "loss": 1.2243, "step": 1410 }, { "epoch": 0.3602917856010149, "grad_norm": 6.115218417588514, "learning_rate": 4.038106259062778e-06, "loss": 1.1927, "step": 1420 }, { "epoch": 0.3628290516967967, "grad_norm": 12.060043038462219, "learning_rate": 4.020585346934493e-06, "loss": 1.2077, "step": 1430 }, { "epoch": 0.3653663177925785, "grad_norm": 5.116355293016737, "learning_rate": 4.002945082657167e-06, "loss": 1.2, "step": 1440 }, { "epoch": 0.3679035838883603, "grad_norm": 7.955675984619225, "learning_rate": 3.985186850831446e-06, "loss": 1.1917, "step": 1450 }, { "epoch": 0.37044084998414206, "grad_norm": 7.18463544096605, "learning_rate": 3.967312045317357e-06, "loss": 1.1917, "step": 1460 }, { "epoch": 0.37297811607992387, "grad_norm": 13.238317735595116, "learning_rate": 3.9493220691249e-06, "loss": 1.207, "step": 1470 }, { "epoch": 0.3755153821757057, "grad_norm": 17.592657693606654, "learning_rate": 3.931218334303933e-06, "loss": 1.1963, "step": 1480 }, { "epoch": 0.3780526482714875, "grad_norm": 13.71018630119955, "learning_rate": 3.913002261833331e-06, "loss": 1.201, "step": 1490 }, { "epoch": 0.38058991436726924, "grad_norm": 4.264076010568766, "learning_rate": 3.894675281509455e-06, "loss": 1.2088, "step": 1500 }, { "epoch": 0.38312718046305105, "grad_norm": 7.007776785175352, "learning_rate": 3.876238831833927e-06, "loss": 1.216, "step": 1510 }, { "epoch": 0.38566444655883286, "grad_norm": 6.561717211592676, "learning_rate": 3.857694359900719e-06, "loss": 1.211, "step": 1520 }, { "epoch": 0.3882017126546147, "grad_norm": 9.351379080752213, "learning_rate": 3.83904332128257e-06, "loss": 1.173, "step": 1530 }, { "epoch": 0.3907389787503964, "grad_norm": 5.389365116394579, "learning_rate": 3.820287179916736e-06, "loss": 1.1883, "step": 1540 }, { "epoch": 0.39327624484617824, "grad_norm": 4.843914279235025, "learning_rate": 3.8014274079900842e-06, "loss": 1.2002, "step": 1550 }, { "epoch": 0.39581351094196005, "grad_norm": 8.271741525384778, "learning_rate": 3.7824654858235433e-06, "loss": 1.2019, "step": 1560 }, { "epoch": 0.39835077703774185, "grad_norm": 16.214867643854607, "learning_rate": 3.763402901755905e-06, "loss": 1.1936, "step": 1570 }, { "epoch": 0.4008880431335236, "grad_norm": 6.671379474352313, "learning_rate": 3.7442411520270096e-06, "loss": 1.1872, "step": 1580 }, { "epoch": 0.4034253092293054, "grad_norm": 8.707941769488675, "learning_rate": 3.7249817406602996e-06, "loss": 1.197, "step": 1590 }, { "epoch": 0.4059625753250872, "grad_norm": 6.5621327717722995, "learning_rate": 3.7056261793447707e-06, "loss": 1.1983, "step": 1600 }, { "epoch": 0.40849984142086904, "grad_norm": 7.068156090864818, "learning_rate": 3.686175987316317e-06, "loss": 1.1853, "step": 1610 }, { "epoch": 0.4110371075166508, "grad_norm": 7.890900003065767, "learning_rate": 3.6666326912384854e-06, "loss": 1.1925, "step": 1620 }, { "epoch": 0.4135743736124326, "grad_norm": 6.7232162942204665, "learning_rate": 3.6469978250826433e-06, "loss": 1.1954, "step": 1630 }, { "epoch": 0.4161116397082144, "grad_norm": 10.010345842937971, "learning_rate": 3.6272729300075808e-06, "loss": 1.209, "step": 1640 }, { "epoch": 0.4186489058039962, "grad_norm": 5.740167290944564, "learning_rate": 3.6074595542385387e-06, "loss": 1.185, "step": 1650 }, { "epoch": 0.421186171899778, "grad_norm": 7.619213938711831, "learning_rate": 3.5875592529456926e-06, "loss": 1.2068, "step": 1660 }, { "epoch": 0.4237234379955598, "grad_norm": 11.256221468519641, "learning_rate": 3.567573588122079e-06, "loss": 1.1777, "step": 1670 }, { "epoch": 0.4262607040913416, "grad_norm": 5.7316159588312106, "learning_rate": 3.5475041284609977e-06, "loss": 1.1843, "step": 1680 }, { "epoch": 0.4287979701871234, "grad_norm": 5.682433911486404, "learning_rate": 3.527352449232886e-06, "loss": 1.1777, "step": 1690 }, { "epoch": 0.43133523628290515, "grad_norm": 6.786252039469139, "learning_rate": 3.5071201321616673e-06, "loss": 1.1888, "step": 1700 }, { "epoch": 0.43387250237868696, "grad_norm": 5.407930902552358, "learning_rate": 3.4868087653006045e-06, "loss": 1.1901, "step": 1710 }, { "epoch": 0.4364097684744688, "grad_norm": 6.399699747010517, "learning_rate": 3.466419942907652e-06, "loss": 1.2107, "step": 1720 }, { "epoch": 0.4389470345702506, "grad_norm": 7.825821295728962, "learning_rate": 3.445955265320321e-06, "loss": 1.1918, "step": 1730 }, { "epoch": 0.44148430066603234, "grad_norm": 9.488411335781475, "learning_rate": 3.425416338830067e-06, "loss": 1.189, "step": 1740 }, { "epoch": 0.44402156676181415, "grad_norm": 5.88274413352344, "learning_rate": 3.4048047755562093e-06, "loss": 1.183, "step": 1750 }, { "epoch": 0.44655883285759596, "grad_norm": 5.6831092141225055, "learning_rate": 3.3841221933193965e-06, "loss": 1.1882, "step": 1760 }, { "epoch": 0.4490960989533777, "grad_norm": 19.983254773097197, "learning_rate": 3.3633702155146216e-06, "loss": 1.1911, "step": 1770 }, { "epoch": 0.4516333650491595, "grad_norm": 12.549540089577652, "learning_rate": 3.342550470983798e-06, "loss": 1.1811, "step": 1780 }, { "epoch": 0.45417063114494133, "grad_norm": 5.976267812124573, "learning_rate": 3.3216645938879134e-06, "loss": 1.1865, "step": 1790 }, { "epoch": 0.45670789724072314, "grad_norm": 9.218000333375254, "learning_rate": 3.3007142235787624e-06, "loss": 1.1842, "step": 1800 }, { "epoch": 0.4592451633365049, "grad_norm": 6.320981630879619, "learning_rate": 3.2797010044702697e-06, "loss": 1.1776, "step": 1810 }, { "epoch": 0.4617824294322867, "grad_norm": 6.190899234462718, "learning_rate": 3.258626585909422e-06, "loss": 1.1917, "step": 1820 }, { "epoch": 0.4643196955280685, "grad_norm": 7.909431608694366, "learning_rate": 3.2374926220468067e-06, "loss": 1.1789, "step": 1830 }, { "epoch": 0.4668569616238503, "grad_norm": 5.69897203883322, "learning_rate": 3.216300771706776e-06, "loss": 1.1733, "step": 1840 }, { "epoch": 0.4693942277196321, "grad_norm": 20.816445025574357, "learning_rate": 3.1950526982572484e-06, "loss": 1.1712, "step": 1850 }, { "epoch": 0.4719314938154139, "grad_norm": 11.982164138237327, "learning_rate": 3.1737500694791424e-06, "loss": 1.1815, "step": 1860 }, { "epoch": 0.4744687599111957, "grad_norm": 24.74208511970109, "learning_rate": 3.1523945574354763e-06, "loss": 1.1863, "step": 1870 }, { "epoch": 0.4770060260069775, "grad_norm": 7.842936948875007, "learning_rate": 3.130987838340126e-06, "loss": 1.1834, "step": 1880 }, { "epoch": 0.47954329210275926, "grad_norm": 6.4021446109798505, "learning_rate": 3.1095315924262544e-06, "loss": 1.184, "step": 1890 }, { "epoch": 0.48208055819854106, "grad_norm": 11.634701038383552, "learning_rate": 3.0880275038144296e-06, "loss": 1.1748, "step": 1900 }, { "epoch": 0.4846178242943229, "grad_norm": 13.567790113056438, "learning_rate": 3.066477260380441e-06, "loss": 1.1647, "step": 1910 }, { "epoch": 0.4871550903901047, "grad_norm": 8.85124389947068, "learning_rate": 3.044882553622808e-06, "loss": 1.1906, "step": 1920 }, { "epoch": 0.48969235648588644, "grad_norm": 4.941061338275534, "learning_rate": 3.0232450785300207e-06, "loss": 1.1653, "step": 1930 }, { "epoch": 0.49222962258166825, "grad_norm": 6.532283199565601, "learning_rate": 3.0015665334474937e-06, "loss": 1.1869, "step": 1940 }, { "epoch": 0.49476688867745006, "grad_norm": 9.662609662864591, "learning_rate": 2.9798486199442627e-06, "loss": 1.1707, "step": 1950 }, { "epoch": 0.49730415477323187, "grad_norm": 7.538232506265863, "learning_rate": 2.958093042679429e-06, "loss": 1.192, "step": 1960 }, { "epoch": 0.4998414208690136, "grad_norm": 72.64241187548325, "learning_rate": 2.9363015092683566e-06, "loss": 1.1762, "step": 1970 }, { "epoch": 0.5023786869647955, "grad_norm": 9.900383081254226, "learning_rate": 2.9144757301486387e-06, "loss": 1.169, "step": 1980 }, { "epoch": 0.5049159530605772, "grad_norm": 6.366667290767634, "learning_rate": 2.8926174184458484e-06, "loss": 1.174, "step": 1990 }, { "epoch": 0.507453219156359, "grad_norm": 6.005401235845572, "learning_rate": 2.8707282898390703e-06, "loss": 1.1596, "step": 2000 }, { "epoch": 0.5099904852521409, "grad_norm": 8.195058772390636, "learning_rate": 2.848810062426236e-06, "loss": 1.1636, "step": 2010 }, { "epoch": 0.5125277513479226, "grad_norm": 6.0595964456253055, "learning_rate": 2.82686445658927e-06, "loss": 1.1622, "step": 2020 }, { "epoch": 0.5150650174437044, "grad_norm": 23.70711855670117, "learning_rate": 2.8048931948590537e-06, "loss": 1.1679, "step": 2030 }, { "epoch": 0.5176022835394862, "grad_norm": 5.907186404005163, "learning_rate": 2.7828980017802236e-06, "loss": 1.1883, "step": 2040 }, { "epoch": 0.520139549635268, "grad_norm": 7.344357732813401, "learning_rate": 2.760880603775811e-06, "loss": 1.173, "step": 2050 }, { "epoch": 0.5226768157310498, "grad_norm": 7.426398590607293, "learning_rate": 2.73884272901173e-06, "loss": 1.191, "step": 2060 }, { "epoch": 0.5252140818268316, "grad_norm": 7.057487598088109, "learning_rate": 2.7167861072611374e-06, "loss": 1.1717, "step": 2070 }, { "epoch": 0.5277513479226134, "grad_norm": 7.223259822911996, "learning_rate": 2.6947124697686553e-06, "loss": 1.1638, "step": 2080 }, { "epoch": 0.5302886140183952, "grad_norm": 10.407626808887795, "learning_rate": 2.6726235491144886e-06, "loss": 1.1682, "step": 2090 }, { "epoch": 0.532825880114177, "grad_norm": 6.589791365210622, "learning_rate": 2.650521079078433e-06, "loss": 1.1645, "step": 2100 }, { "epoch": 0.5353631462099587, "grad_norm": 11.999381746057887, "learning_rate": 2.6284067945037855e-06, "loss": 1.1646, "step": 2110 }, { "epoch": 0.5379004123057406, "grad_norm": 7.613081340354655, "learning_rate": 2.6062824311611775e-06, "loss": 1.1705, "step": 2120 }, { "epoch": 0.5404376784015223, "grad_norm": 6.366501800737734, "learning_rate": 2.5841497256123326e-06, "loss": 1.1573, "step": 2130 }, { "epoch": 0.5429749444973041, "grad_norm": 18.74317163106254, "learning_rate": 2.5620104150737626e-06, "loss": 1.165, "step": 2140 }, { "epoch": 0.545512210593086, "grad_norm": 11.370554877805095, "learning_rate": 2.5398662372804105e-06, "loss": 1.1701, "step": 2150 }, { "epoch": 0.5480494766888677, "grad_norm": 6.5553872071155554, "learning_rate": 2.517718930349254e-06, "loss": 1.1475, "step": 2160 }, { "epoch": 0.5505867427846496, "grad_norm": 5.741036264156473, "learning_rate": 2.495570232642881e-06, "loss": 1.1483, "step": 2170 }, { "epoch": 0.5531240088804313, "grad_norm": 6.767884733983484, "learning_rate": 2.47342188263304e-06, "loss": 1.1689, "step": 2180 }, { "epoch": 0.5556612749762131, "grad_norm": 13.529490798235566, "learning_rate": 2.4512756187641936e-06, "loss": 1.1614, "step": 2190 }, { "epoch": 0.558198541071995, "grad_norm": 9.536331865874597, "learning_rate": 2.4291331793170545e-06, "loss": 1.1606, "step": 2200 }, { "epoch": 0.5607358071677767, "grad_norm": 4.782009512084195, "learning_rate": 2.4069963022721597e-06, "loss": 1.1817, "step": 2210 }, { "epoch": 0.5632730732635585, "grad_norm": 12.360173413460117, "learning_rate": 2.3848667251734424e-06, "loss": 1.1682, "step": 2220 }, { "epoch": 0.5658103393593403, "grad_norm": 6.111517885569779, "learning_rate": 2.3627461849918604e-06, "loss": 1.1526, "step": 2230 }, { "epoch": 0.5683476054551221, "grad_norm": 5.607775951834258, "learning_rate": 2.3406364179890532e-06, "loss": 1.1562, "step": 2240 }, { "epoch": 0.570884871550904, "grad_norm": 14.959200098647552, "learning_rate": 2.3185391595810635e-06, "loss": 1.1592, "step": 2250 }, { "epoch": 0.5734221376466857, "grad_norm": 5.308853581413382, "learning_rate": 2.2964561442021255e-06, "loss": 1.1432, "step": 2260 }, { "epoch": 0.5759594037424675, "grad_norm": 7.548080678466621, "learning_rate": 2.2743891051685222e-06, "loss": 1.1594, "step": 2270 }, { "epoch": 0.5784966698382493, "grad_norm": 12.307106947771112, "learning_rate": 2.2523397745425387e-06, "loss": 1.1627, "step": 2280 }, { "epoch": 0.5810339359340311, "grad_norm": 6.981005935033202, "learning_rate": 2.2303098829965124e-06, "loss": 1.1669, "step": 2290 }, { "epoch": 0.5835712020298128, "grad_norm": 4.0554069435676805, "learning_rate": 2.208301159676987e-06, "loss": 1.1631, "step": 2300 }, { "epoch": 0.5861084681255947, "grad_norm": 5.723818233434463, "learning_rate": 2.1863153320689958e-06, "loss": 1.1435, "step": 2310 }, { "epoch": 0.5886457342213764, "grad_norm": 14.029515219171557, "learning_rate": 2.164354125860462e-06, "loss": 1.1499, "step": 2320 }, { "epoch": 0.5911830003171583, "grad_norm": 7.141317865826808, "learning_rate": 2.1424192648067582e-06, "loss": 1.1832, "step": 2330 }, { "epoch": 0.5937202664129401, "grad_norm": 6.123987923859252, "learning_rate": 2.120512470595396e-06, "loss": 1.1503, "step": 2340 }, { "epoch": 0.5962575325087218, "grad_norm": 6.013035072240124, "learning_rate": 2.098635462710898e-06, "loss": 1.1601, "step": 2350 }, { "epoch": 0.5987947986045037, "grad_norm": 4.754989174196253, "learning_rate": 2.0767899582998293e-06, "loss": 1.1641, "step": 2360 }, { "epoch": 0.6013320647002854, "grad_norm": 14.321979761759893, "learning_rate": 2.054977672036018e-06, "loss": 1.1417, "step": 2370 }, { "epoch": 0.6038693307960672, "grad_norm": 10.760086157167773, "learning_rate": 2.033200315985969e-06, "loss": 1.1482, "step": 2380 }, { "epoch": 0.6064065968918491, "grad_norm": 12.684127714673926, "learning_rate": 2.011459599474483e-06, "loss": 1.1497, "step": 2390 }, { "epoch": 0.6089438629876308, "grad_norm": 6.757204304399067, "learning_rate": 1.989757228950491e-06, "loss": 1.1583, "step": 2400 }, { "epoch": 0.6114811290834127, "grad_norm": 14.606095290784646, "learning_rate": 1.9680949078531097e-06, "loss": 1.1528, "step": 2410 }, { "epoch": 0.6140183951791944, "grad_norm": 7.4618089543548995, "learning_rate": 1.9464743364779388e-06, "loss": 1.1594, "step": 2420 }, { "epoch": 0.6165556612749762, "grad_norm": 5.801205347666092, "learning_rate": 1.924897211843606e-06, "loss": 1.1428, "step": 2430 }, { "epoch": 0.619092927370758, "grad_norm": 9.568596376954924, "learning_rate": 1.9033652275585624e-06, "loss": 1.1321, "step": 2440 }, { "epoch": 0.6216301934665398, "grad_norm": 6.223606964643186, "learning_rate": 1.8818800736881518e-06, "loss": 1.1568, "step": 2450 }, { "epoch": 0.6241674595623216, "grad_norm": 55.44156669240569, "learning_rate": 1.8604434366219573e-06, "loss": 1.1594, "step": 2460 }, { "epoch": 0.6267047256581034, "grad_norm": 10.778268976853173, "learning_rate": 1.8390569989414303e-06, "loss": 1.1588, "step": 2470 }, { "epoch": 0.6292419917538852, "grad_norm": 5.7854893075832745, "learning_rate": 1.8177224392878279e-06, "loss": 1.1535, "step": 2480 }, { "epoch": 0.6317792578496669, "grad_norm": 13.300897200946059, "learning_rate": 1.7964414322304525e-06, "loss": 1.1339, "step": 2490 }, { "epoch": 0.6343165239454488, "grad_norm": 7.933290169459755, "learning_rate": 1.7752156481352123e-06, "loss": 1.1388, "step": 2500 }, { "epoch": 0.6368537900412305, "grad_norm": 5.231248714361998, "learning_rate": 1.7540467530335172e-06, "loss": 1.1374, "step": 2510 }, { "epoch": 0.6393910561370124, "grad_norm": 8.049075294237095, "learning_rate": 1.732936408491504e-06, "loss": 1.1587, "step": 2520 }, { "epoch": 0.6419283222327942, "grad_norm": 10.356860200627077, "learning_rate": 1.7118862714796253e-06, "loss": 1.1572, "step": 2530 }, { "epoch": 0.6444655883285759, "grad_norm": 4.057173286082011, "learning_rate": 1.6908979942425868e-06, "loss": 1.141, "step": 2540 }, { "epoch": 0.6470028544243578, "grad_norm": 7.051670677225095, "learning_rate": 1.6699732241696636e-06, "loss": 1.1581, "step": 2550 }, { "epoch": 0.6495401205201395, "grad_norm": 7.595067587076725, "learning_rate": 1.649113603665396e-06, "loss": 1.1466, "step": 2560 }, { "epoch": 0.6520773866159213, "grad_norm": 6.1918160975787, "learning_rate": 1.628320770020673e-06, "loss": 1.1615, "step": 2570 }, { "epoch": 0.6546146527117032, "grad_norm": 14.301509764417062, "learning_rate": 1.6075963552842211e-06, "loss": 1.1641, "step": 2580 }, { "epoch": 0.6571519188074849, "grad_norm": 6.875665068775734, "learning_rate": 1.5869419861345042e-06, "loss": 1.142, "step": 2590 }, { "epoch": 0.6596891849032668, "grad_norm": 7.541794891346959, "learning_rate": 1.5663592837520453e-06, "loss": 1.1469, "step": 2600 }, { "epoch": 0.6622264509990485, "grad_norm": 12.46212690117266, "learning_rate": 1.5458498636921727e-06, "loss": 1.1598, "step": 2610 }, { "epoch": 0.6647637170948303, "grad_norm": 7.379528099438572, "learning_rate": 1.5254153357582208e-06, "loss": 1.1582, "step": 2620 }, { "epoch": 0.6673009831906122, "grad_norm": 4.907692232962446, "learning_rate": 1.5050573038751693e-06, "loss": 1.1452, "step": 2630 }, { "epoch": 0.6698382492863939, "grad_norm": 6.079547602614854, "learning_rate": 1.4847773659637546e-06, "loss": 1.1297, "step": 2640 }, { "epoch": 0.6723755153821757, "grad_norm": 5.450549895798723, "learning_rate": 1.4645771138150433e-06, "loss": 1.1603, "step": 2650 }, { "epoch": 0.6749127814779575, "grad_norm": 4.368959175756317, "learning_rate": 1.4444581329654916e-06, "loss": 1.1439, "step": 2660 }, { "epoch": 0.6774500475737393, "grad_norm": 9.44947484477195, "learning_rate": 1.424422002572502e-06, "loss": 1.1749, "step": 2670 }, { "epoch": 0.6799873136695211, "grad_norm": 10.050734306484166, "learning_rate": 1.404470295290461e-06, "loss": 1.1545, "step": 2680 }, { "epoch": 0.6825245797653029, "grad_norm": 12.005497232488006, "learning_rate": 1.3846045771473116e-06, "loss": 1.149, "step": 2690 }, { "epoch": 0.6850618458610847, "grad_norm": 5.9236681532953925, "learning_rate": 1.3648264074216282e-06, "loss": 1.1394, "step": 2700 }, { "epoch": 0.6875991119568665, "grad_norm": 9.66814179972378, "learning_rate": 1.345137338520231e-06, "loss": 1.1463, "step": 2710 }, { "epoch": 0.6901363780526483, "grad_norm": 37.98586030031826, "learning_rate": 1.3255389158563299e-06, "loss": 1.1521, "step": 2720 }, { "epoch": 0.69267364414843, "grad_norm": 4.6889018268529705, "learning_rate": 1.3060326777282312e-06, "loss": 1.1437, "step": 2730 }, { "epoch": 0.6952109102442119, "grad_norm": 14.32310959027276, "learning_rate": 1.2866201551985935e-06, "loss": 1.1394, "step": 2740 }, { "epoch": 0.6977481763399936, "grad_norm": 6.337986082170704, "learning_rate": 1.2673028719742461e-06, "loss": 1.1455, "step": 2750 }, { "epoch": 0.7002854424357755, "grad_norm": 15.376375293741257, "learning_rate": 1.2480823442866017e-06, "loss": 1.1401, "step": 2760 }, { "epoch": 0.7028227085315573, "grad_norm": 5.041231343225159, "learning_rate": 1.2289600807726406e-06, "loss": 1.156, "step": 2770 }, { "epoch": 0.705359974627339, "grad_norm": 6.869042630836821, "learning_rate": 1.2099375823564948e-06, "loss": 1.1615, "step": 2780 }, { "epoch": 0.7078972407231209, "grad_norm": 12.482011259323276, "learning_rate": 1.1910163421316447e-06, "loss": 1.1321, "step": 2790 }, { "epoch": 0.7104345068189026, "grad_norm": 6.399176247127537, "learning_rate": 1.1721978452437205e-06, "loss": 1.153, "step": 2800 }, { "epoch": 0.7129717729146844, "grad_norm": 6.270550675278331, "learning_rate": 1.1534835687739323e-06, "loss": 1.1551, "step": 2810 }, { "epoch": 0.7155090390104663, "grad_norm": 4.695573760402524, "learning_rate": 1.1348749816231347e-06, "loss": 1.12, "step": 2820 }, { "epoch": 0.718046305106248, "grad_norm": 6.7962150090749995, "learning_rate": 1.1163735443965298e-06, "loss": 1.1684, "step": 2830 }, { "epoch": 0.7205835712020298, "grad_norm": 8.834255061556856, "learning_rate": 1.0979807092890205e-06, "loss": 1.1414, "step": 2840 }, { "epoch": 0.7231208372978116, "grad_norm": 6.053865837653651, "learning_rate": 1.079697919971232e-06, "loss": 1.1355, "step": 2850 }, { "epoch": 0.7256581033935934, "grad_norm": 19.55990021937961, "learning_rate": 1.0615266114761932e-06, "loss": 1.122, "step": 2860 }, { "epoch": 0.7281953694893752, "grad_norm": 8.424559736567739, "learning_rate": 1.0434682100866995e-06, "loss": 1.1422, "step": 2870 }, { "epoch": 0.730732635585157, "grad_norm": 5.986352186146355, "learning_rate": 1.0255241332233636e-06, "loss": 1.1312, "step": 2880 }, { "epoch": 0.7332699016809388, "grad_norm": 20.203613019858313, "learning_rate": 1.0076957893333602e-06, "loss": 1.1271, "step": 2890 }, { "epoch": 0.7358071677767206, "grad_norm": 19.714820742115197, "learning_rate": 9.899845777798777e-07, "loss": 1.154, "step": 2900 }, { "epoch": 0.7383444338725024, "grad_norm": 13.735614202784944, "learning_rate": 9.723918887322757e-07, "loss": 1.1359, "step": 2910 }, { "epoch": 0.7408816999682841, "grad_norm": 6.129734292866207, "learning_rate": 9.549191030569751e-07, "loss": 1.1313, "step": 2920 }, { "epoch": 0.743418966064066, "grad_norm": 7.37718825001788, "learning_rate": 9.375675922090707e-07, "loss": 1.1319, "step": 2930 }, { "epoch": 0.7459562321598477, "grad_norm": 4.427499085813396, "learning_rate": 9.203387181246831e-07, "loss": 1.1399, "step": 2940 }, { "epoch": 0.7484934982556296, "grad_norm": 29.506989634102897, "learning_rate": 9.032338331140603e-07, "loss": 1.158, "step": 2950 }, { "epoch": 0.7510307643514114, "grad_norm": 5.7586848271722495, "learning_rate": 8.862542797554341e-07, "loss": 1.1486, "step": 2960 }, { "epoch": 0.7535680304471931, "grad_norm": 8.650793498941589, "learning_rate": 8.694013907896363e-07, "loss": 1.1461, "step": 2970 }, { "epoch": 0.756105296542975, "grad_norm": 25.74372735382993, "learning_rate": 8.526764890154965e-07, "loss": 1.1353, "step": 2980 }, { "epoch": 0.7586425626387567, "grad_norm": 5.0755009425487625, "learning_rate": 8.36080887186011e-07, "loss": 1.1553, "step": 2990 }, { "epoch": 0.7611798287345385, "grad_norm": 5.4429094875047115, "learning_rate": 8.19615887905301e-07, "loss": 1.1598, "step": 3000 }, { "epoch": 0.7637170948303204, "grad_norm": 6.821786199139472, "learning_rate": 8.032827835263773e-07, "loss": 1.1318, "step": 3010 }, { "epoch": 0.7662543609261021, "grad_norm": 4.919272329647964, "learning_rate": 7.87082856049696e-07, "loss": 1.1188, "step": 3020 }, { "epoch": 0.768791627021884, "grad_norm": 6.777841153958167, "learning_rate": 7.710173770225335e-07, "loss": 1.1351, "step": 3030 }, { "epoch": 0.7713288931176657, "grad_norm": 9.668494091003875, "learning_rate": 7.550876074391852e-07, "loss": 1.1388, "step": 3040 }, { "epoch": 0.7738661592134475, "grad_norm": 5.891510434517859, "learning_rate": 7.392947976419867e-07, "loss": 1.1307, "step": 3050 }, { "epoch": 0.7764034253092293, "grad_norm": 4.629328829939173, "learning_rate": 7.23640187223173e-07, "loss": 1.1442, "step": 3060 }, { "epoch": 0.7789406914050111, "grad_norm": 12.582358698587784, "learning_rate": 7.081250049275804e-07, "loss": 1.147, "step": 3070 }, { "epoch": 0.7814779575007929, "grad_norm": 12.6779267097399, "learning_rate": 6.927504685562075e-07, "loss": 1.1461, "step": 3080 }, { "epoch": 0.7840152235965747, "grad_norm": 4.951520708126675, "learning_rate": 6.775177848706193e-07, "loss": 1.1272, "step": 3090 }, { "epoch": 0.7865524896923565, "grad_norm": 8.236149543257104, "learning_rate": 6.624281494982359e-07, "loss": 1.1478, "step": 3100 }, { "epoch": 0.7890897557881383, "grad_norm": 15.16637456873453, "learning_rate": 6.474827468384811e-07, "loss": 1.1411, "step": 3110 }, { "epoch": 0.7916270218839201, "grad_norm": 11.435649281181314, "learning_rate": 6.326827499698218e-07, "loss": 1.128, "step": 3120 }, { "epoch": 0.7941642879797018, "grad_norm": 7.369755927934268, "learning_rate": 6.180293205576873e-07, "loss": 1.1503, "step": 3130 }, { "epoch": 0.7967015540754837, "grad_norm": 7.910994986606561, "learning_rate": 6.035236087632928e-07, "loss": 1.1305, "step": 3140 }, { "epoch": 0.7992388201712655, "grad_norm": 4.538240411984233, "learning_rate": 5.891667531533643e-07, "loss": 1.1624, "step": 3150 }, { "epoch": 0.8017760862670472, "grad_norm": 7.092761380330254, "learning_rate": 5.749598806107634e-07, "loss": 1.1571, "step": 3160 }, { "epoch": 0.8043133523628291, "grad_norm": 6.547620136814868, "learning_rate": 5.609041062460451e-07, "loss": 1.1253, "step": 3170 }, { "epoch": 0.8068506184586108, "grad_norm": 5.7902809800252175, "learning_rate": 5.470005333099288e-07, "loss": 1.1229, "step": 3180 }, { "epoch": 0.8093878845543926, "grad_norm": 7.71191733136425, "learning_rate": 5.332502531067007e-07, "loss": 1.1279, "step": 3190 }, { "epoch": 0.8119251506501745, "grad_norm": 4.0181700071905295, "learning_rate": 5.196543449085617e-07, "loss": 1.1268, "step": 3200 }, { "epoch": 0.8144624167459562, "grad_norm": 8.13336313004874, "learning_rate": 5.062138758709098e-07, "loss": 1.1409, "step": 3210 }, { "epoch": 0.8169996828417381, "grad_norm": 9.802266531368725, "learning_rate": 4.929299009485799e-07, "loss": 1.1385, "step": 3220 }, { "epoch": 0.8195369489375198, "grad_norm": 6.636513914507316, "learning_rate": 4.798034628130396e-07, "loss": 1.1454, "step": 3230 }, { "epoch": 0.8220742150333016, "grad_norm": 4.227565986903538, "learning_rate": 4.668355917705486e-07, "loss": 1.1257, "step": 3240 }, { "epoch": 0.8246114811290834, "grad_norm": 8.17867439720174, "learning_rate": 4.540273056812869e-07, "loss": 1.1337, "step": 3250 }, { "epoch": 0.8271487472248652, "grad_norm": 5.424511876014973, "learning_rate": 4.4137960987946707e-07, "loss": 1.1265, "step": 3260 }, { "epoch": 0.829686013320647, "grad_norm": 7.92386965501813, "learning_rate": 4.2889349709441945e-07, "loss": 1.1303, "step": 3270 }, { "epoch": 0.8322232794164288, "grad_norm": 10.370529382382568, "learning_rate": 4.165699473726756e-07, "loss": 1.1401, "step": 3280 }, { "epoch": 0.8347605455122106, "grad_norm": 7.578569844581463, "learning_rate": 4.044099280010405e-07, "loss": 1.1413, "step": 3290 }, { "epoch": 0.8372978116079924, "grad_norm": 4.726374382802555, "learning_rate": 3.9241439343067205e-07, "loss": 1.1189, "step": 3300 }, { "epoch": 0.8398350777037742, "grad_norm": 7.984970758900439, "learning_rate": 3.8058428520216407e-07, "loss": 1.1397, "step": 3310 }, { "epoch": 0.842372343799556, "grad_norm": 9.346707987442102, "learning_rate": 3.689205318716424e-07, "loss": 1.1348, "step": 3320 }, { "epoch": 0.8449096098953378, "grad_norm": 5.016282437243575, "learning_rate": 3.574240489378847e-07, "loss": 1.1473, "step": 3330 }, { "epoch": 0.8474468759911196, "grad_norm": 6.2078193804951365, "learning_rate": 3.4609573877046054e-07, "loss": 1.1445, "step": 3340 }, { "epoch": 0.8499841420869013, "grad_norm": 5.5201202597039085, "learning_rate": 3.3493649053890325e-07, "loss": 1.1434, "step": 3350 }, { "epoch": 0.8525214081826832, "grad_norm": 17.229253814968214, "learning_rate": 3.239471801429186e-07, "loss": 1.1305, "step": 3360 }, { "epoch": 0.8550586742784649, "grad_norm": 8.70530756713737, "learning_rate": 3.1312867014363534e-07, "loss": 1.1322, "step": 3370 }, { "epoch": 0.8575959403742468, "grad_norm": 7.012844741047474, "learning_rate": 3.024818096958995e-07, "loss": 1.1303, "step": 3380 }, { "epoch": 0.8601332064700286, "grad_norm": 4.756022687074469, "learning_rate": 2.920074344816268e-07, "loss": 1.1259, "step": 3390 }, { "epoch": 0.8626704725658103, "grad_norm": 4.7358895032579555, "learning_rate": 2.8170636664420715e-07, "loss": 1.1299, "step": 3400 }, { "epoch": 0.8652077386615922, "grad_norm": 8.683945332579281, "learning_rate": 2.7157941472397393e-07, "loss": 1.1496, "step": 3410 }, { "epoch": 0.8677450047573739, "grad_norm": 5.37690958165037, "learning_rate": 2.6162737359474195e-07, "loss": 1.1242, "step": 3420 }, { "epoch": 0.8702822708531557, "grad_norm": 7.164420959217891, "learning_rate": 2.518510244014161e-07, "loss": 1.1284, "step": 3430 }, { "epoch": 0.8728195369489375, "grad_norm": 7.002358607609309, "learning_rate": 2.4225113449867834e-07, "loss": 1.1304, "step": 3440 }, { "epoch": 0.8753568030447193, "grad_norm": 4.573582959362798, "learning_rate": 2.3282845739075855e-07, "loss": 1.1282, "step": 3450 }, { "epoch": 0.8778940691405012, "grad_norm": 4.899953600348181, "learning_rate": 2.2358373267229006e-07, "loss": 1.1298, "step": 3460 }, { "epoch": 0.8804313352362829, "grad_norm": 5.271497221424272, "learning_rate": 2.1451768597025995e-07, "loss": 1.1256, "step": 3470 }, { "epoch": 0.8829686013320647, "grad_norm": 6.165041784465411, "learning_rate": 2.0563102888705027e-07, "loss": 1.1286, "step": 3480 }, { "epoch": 0.8855058674278465, "grad_norm": 11.088879537008697, "learning_rate": 1.9692445894458845e-07, "loss": 1.1252, "step": 3490 }, { "epoch": 0.8880431335236283, "grad_norm": 9.424576280240377, "learning_rate": 1.883986595295953e-07, "loss": 1.1151, "step": 3500 }, { "epoch": 0.89058039961941, "grad_norm": 5.754831282009581, "learning_rate": 1.8005429983994487e-07, "loss": 1.1334, "step": 3510 }, { "epoch": 0.8931176657151919, "grad_norm": 10.215467753583647, "learning_rate": 1.7189203483213984e-07, "loss": 1.1411, "step": 3520 }, { "epoch": 0.8956549318109737, "grad_norm": 12.91382494925556, "learning_rate": 1.6391250516990448e-07, "loss": 1.1236, "step": 3530 }, { "epoch": 0.8981921979067554, "grad_norm": 10.128521645418763, "learning_rate": 1.5611633717389467e-07, "loss": 1.1159, "step": 3540 }, { "epoch": 0.9007294640025373, "grad_norm": 9.199799321868463, "learning_rate": 1.4850414277254088e-07, "loss": 1.1368, "step": 3550 }, { "epoch": 0.903266730098319, "grad_norm": 4.892191302349406, "learning_rate": 1.41076519454017e-07, "loss": 1.1305, "step": 3560 }, { "epoch": 0.9058039961941009, "grad_norm": 5.956493220397556, "learning_rate": 1.3383405021933998e-07, "loss": 1.1264, "step": 3570 }, { "epoch": 0.9083412622898827, "grad_norm": 5.275784999413581, "learning_rate": 1.267773035366135e-07, "loss": 1.1195, "step": 3580 }, { "epoch": 0.9108785283856644, "grad_norm": 9.540366769966049, "learning_rate": 1.1990683329640567e-07, "loss": 1.1381, "step": 3590 }, { "epoch": 0.9134157944814463, "grad_norm": 4.801855741665693, "learning_rate": 1.1322317876827416e-07, "loss": 1.1315, "step": 3600 }, { "epoch": 0.915953060577228, "grad_norm": 5.056163865646408, "learning_rate": 1.0672686455843934e-07, "loss": 1.1382, "step": 3610 }, { "epoch": 0.9184903266730098, "grad_norm": 8.706560880258346, "learning_rate": 1.004184005686068e-07, "loss": 1.119, "step": 3620 }, { "epoch": 0.9210275927687916, "grad_norm": 6.266874220907701, "learning_rate": 9.429828195594459e-08, "loss": 1.1316, "step": 3630 }, { "epoch": 0.9235648588645734, "grad_norm": 4.598497385552673, "learning_rate": 8.836698909421848e-08, "loss": 1.1316, "step": 3640 }, { "epoch": 0.9261021249603553, "grad_norm": 5.423769767831511, "learning_rate": 8.2624987536086e-08, "loss": 1.1305, "step": 3650 }, { "epoch": 0.928639391056137, "grad_norm": 7.1199155337882045, "learning_rate": 7.707272797655597e-08, "loss": 1.104, "step": 3660 }, { "epoch": 0.9311766571519188, "grad_norm": 5.964210281087181, "learning_rate": 7.171064621761121e-08, "loss": 1.1392, "step": 3670 }, { "epoch": 0.9337139232477006, "grad_norm": 7.0222277210918485, "learning_rate": 6.653916313400483e-08, "loss": 1.1307, "step": 3680 }, { "epoch": 0.9362511893434824, "grad_norm": 6.820545499544659, "learning_rate": 6.155868464022218e-08, "loss": 1.1311, "step": 3690 }, { "epoch": 0.9387884554392641, "grad_norm": 8.543306468091403, "learning_rate": 5.676960165862333e-08, "loss": 1.1396, "step": 3700 }, { "epoch": 0.941325721535046, "grad_norm": 5.998847968884944, "learning_rate": 5.217229008875696e-08, "loss": 1.1142, "step": 3710 }, { "epoch": 0.9438629876308278, "grad_norm": 7.233927115402458, "learning_rate": 4.7767110777856285e-08, "loss": 1.1316, "step": 3720 }, { "epoch": 0.9464002537266096, "grad_norm": 14.818426116599344, "learning_rate": 4.355440949251638e-08, "loss": 1.1291, "step": 3730 }, { "epoch": 0.9489375198223914, "grad_norm": 11.275146260000886, "learning_rate": 3.953451689155369e-08, "loss": 1.1377, "step": 3740 }, { "epoch": 0.9514747859181731, "grad_norm": 4.976497150845647, "learning_rate": 3.5707748500053706e-08, "loss": 1.1341, "step": 3750 }, { "epoch": 0.954012052013955, "grad_norm": 8.165947802169324, "learning_rate": 3.2074404684603325e-08, "loss": 1.1131, "step": 3760 }, { "epoch": 0.9565493181097368, "grad_norm": 7.141736239500508, "learning_rate": 2.863477062971659e-08, "loss": 1.1354, "step": 3770 }, { "epoch": 0.9590865842055185, "grad_norm": 6.6723385448588814, "learning_rate": 2.5389116315448768e-08, "loss": 1.1349, "step": 3780 }, { "epoch": 0.9616238503013004, "grad_norm": 9.205939869377257, "learning_rate": 2.2337696496206317e-08, "loss": 1.1358, "step": 3790 }, { "epoch": 0.9641611163970821, "grad_norm": 6.430232530777119, "learning_rate": 1.948075068075067e-08, "loss": 1.1454, "step": 3800 }, { "epoch": 0.966698382492864, "grad_norm": 4.597818211923664, "learning_rate": 1.6818503113398832e-08, "loss": 1.121, "step": 3810 }, { "epoch": 0.9692356485886457, "grad_norm": 11.864724480832432, "learning_rate": 1.4351162756422454e-08, "loss": 1.1279, "step": 3820 }, { "epoch": 0.9717729146844275, "grad_norm": 10.20794178542716, "learning_rate": 1.2078923273646236e-08, "loss": 1.1423, "step": 3830 }, { "epoch": 0.9743101807802094, "grad_norm": 6.419864028983356, "learning_rate": 1.0001963015247585e-08, "loss": 1.1206, "step": 3840 }, { "epoch": 0.9768474468759911, "grad_norm": 6.587741451008303, "learning_rate": 8.120445003755306e-09, "loss": 1.1429, "step": 3850 }, { "epoch": 0.9793847129717729, "grad_norm": 4.807651227856488, "learning_rate": 6.434516921257905e-09, "loss": 1.1184, "step": 3860 }, { "epoch": 0.9819219790675547, "grad_norm": 17.54136739327046, "learning_rate": 4.9443110978078525e-09, "loss": 1.143, "step": 3870 }, { "epoch": 0.9844592451633365, "grad_norm": 84.07353671535705, "learning_rate": 3.649944501037672e-09, "loss": 1.1314, "step": 3880 }, { "epoch": 0.9869965112591182, "grad_norm": 5.176156752927596, "learning_rate": 2.5515187269772866e-09, "loss": 1.1163, "step": 3890 }, { "epoch": 0.9895337773549001, "grad_norm": 4.911729600848666, "learning_rate": 1.6491199920809498e-09, "loss": 1.1519, "step": 3900 }, { "epoch": 0.9920710434506819, "grad_norm": 5.8502366235556345, "learning_rate": 9.428191264596042e-10, "loss": 1.1472, "step": 3910 }, { "epoch": 0.9946083095464637, "grad_norm": 17.952194768106622, "learning_rate": 4.3267156832033085e-10, "loss": 1.1114, "step": 3920 }, { "epoch": 0.9971455756422455, "grad_norm": 13.67739524605559, "learning_rate": 1.187173596167712e-10, "loss": 1.1338, "step": 3930 }, { "epoch": 0.9996828417380272, "grad_norm": 10.095200847738084, "learning_rate": 9.811429044215282e-13, "loss": 1.1399, "step": 3940 }, { "epoch": 0.9999365683476055, "step": 3941, "total_flos": 2.4957269414919537e+18, "train_loss": 1.201914404482746, "train_runtime": 115316.4416, "train_samples_per_second": 4.375, "train_steps_per_second": 0.034 } ], "logging_steps": 10, "max_steps": 3941, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.4957269414919537e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }