diff --git "a/trainer_state.json" "b/trainer_state.json"
new file mode 100644
--- /dev/null
+++ "b/trainer_state.json"
@@ -0,0 +1,3951 @@
+{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 183, + "global_step": 549, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0018214936247723133, + "grad_norm": 8.483417195876436, + "learning_rate": 4.705882352941176e-07, + "loss": 0.792, + "step": 1 + }, + { + "epoch": 0.0036429872495446266, + "grad_norm": 7.370293355302168, + "learning_rate": 9.411764705882352e-07, + "loss": 0.7618, + "step": 2 + }, + { + "epoch": 0.00546448087431694, + "grad_norm": 8.828281470296401, + "learning_rate": 1.411764705882353e-06, + "loss": 0.798, + "step": 3 + }, + { + "epoch": 0.007285974499089253, + "grad_norm": 7.996641905843589, + "learning_rate": 1.8823529411764705e-06, + "loss": 0.799, + "step": 4 + }, + { + "epoch": 0.009107468123861567, + "grad_norm": 5.679308683510637, + "learning_rate": 2.352941176470588e-06, + "loss": 0.703, + "step": 5 + }, + { + "epoch": 0.01092896174863388, + "grad_norm": 4.017882280804956, + "learning_rate": 2.823529411764706e-06, + "loss": 0.6024, + "step": 6 + }, + { + "epoch": 0.012750455373406194, + "grad_norm": 2.6645091422211795, + "learning_rate": 3.294117647058823e-06, + "loss": 0.716, + "step": 7 + }, + { + "epoch": 0.014571948998178506, + "grad_norm": 3.0108132390816054, + "learning_rate": 3.764705882352941e-06, + "loss": 0.6157, + "step": 8 + }, + { + "epoch": 0.01639344262295082, + "grad_norm": 2.748696274813349, + "learning_rate": 4.235294117647058e-06, + "loss": 0.674, + "step": 9 + }, + { + "epoch": 0.018214936247723135, + "grad_norm": 2.765963400363717, + "learning_rate": 4.705882352941176e-06, + "loss": 0.662, + "step": 10 + }, + { + "epoch": 0.020036429872495445, + "grad_norm": 2.0283815116624813, + "learning_rate": 5.176470588235294e-06, + "loss": 0.5478, + "step": 11 + }, + { + "epoch": 0.02185792349726776, + "grad_norm": 1.6884205496189686, + "learning_rate": 5.647058823529412e-06, + "loss": 0.6197, + "step": 12 + }, + { + "epoch": 0.023679417122040074, + "grad_norm": 1.9209865174719332, + "learning_rate": 6.1176470588235285e-06, + "loss": 0.6172, + "step": 13 + }, + { + "epoch": 0.025500910746812388, + "grad_norm": 1.6671170743222523, + "learning_rate": 6.588235294117646e-06, + "loss": 0.7003, + "step": 14 + }, + { + "epoch": 0.0273224043715847, + "grad_norm": 1.7653937318251773, + "learning_rate": 7.058823529411764e-06, + "loss": 0.6144, + "step": 15 + }, + { + "epoch": 0.029143897996357013, + "grad_norm": 1.6694163536385278, + "learning_rate": 7.529411764705882e-06, + "loss": 0.6644, + "step": 16 + }, + { + "epoch": 0.030965391621129327, + "grad_norm": 1.24459662447303, + "learning_rate": 8e-06, + "loss": 0.6239, + "step": 17 + }, + { + "epoch": 0.03278688524590164, + "grad_norm": 1.4386147481898333, + "learning_rate": 7.999930256262932e-06, + "loss": 0.598, + "step": 18 + }, + { + "epoch": 0.03460837887067395, + "grad_norm": 1.6183710176455128, + "learning_rate": 7.999721027483818e-06, + "loss": 0.6282, + "step": 19 + }, + { + "epoch": 0.03642987249544627, + "grad_norm": 2.2610452246136794, + "learning_rate": 7.999372320958861e-06, + "loss": 0.5843, + "step": 20 + }, + { + "epoch": 0.03825136612021858, + "grad_norm": 1.7393146801183827, + "learning_rate": 7.998884148848109e-06, + "loss": 0.6479, + "step": 21 + }, + { + "epoch": 0.04007285974499089, + "grad_norm": 1.3703584066908119, +
"learning_rate": 7.998256528175033e-06, + "loss": 0.5559, + "step": 22 + }, + { + "epoch": 0.04189435336976321, + "grad_norm": 1.3670186569963991, + "learning_rate": 7.997489480825941e-06, + "loss": 0.5666, + "step": 23 + }, + { + "epoch": 0.04371584699453552, + "grad_norm": 1.383027766675318, + "learning_rate": 7.996583033549204e-06, + "loss": 0.6172, + "step": 24 + }, + { + "epoch": 0.04553734061930783, + "grad_norm": 1.4278949896780369, + "learning_rate": 7.995537217954335e-06, + "loss": 0.6508, + "step": 25 + }, + { + "epoch": 0.04735883424408015, + "grad_norm": 1.3763761366268141, + "learning_rate": 7.994352070510876e-06, + "loss": 0.5794, + "step": 26 + }, + { + "epoch": 0.04918032786885246, + "grad_norm": 1.269210429185682, + "learning_rate": 7.993027632547137e-06, + "loss": 0.6494, + "step": 27 + }, + { + "epoch": 0.051001821493624776, + "grad_norm": 1.293399215498077, + "learning_rate": 7.991563950248739e-06, + "loss": 0.6337, + "step": 28 + }, + { + "epoch": 0.052823315118397086, + "grad_norm": 1.3443794138088765, + "learning_rate": 7.989961074657023e-06, + "loss": 0.6124, + "step": 29 + }, + { + "epoch": 0.0546448087431694, + "grad_norm": 1.4797305271067247, + "learning_rate": 7.988219061667252e-06, + "loss": 0.64, + "step": 30 + }, + { + "epoch": 0.056466302367941715, + "grad_norm": 2.20965791616683, + "learning_rate": 7.986337972026678e-06, + "loss": 0.5096, + "step": 31 + }, + { + "epoch": 0.058287795992714025, + "grad_norm": 1.3472988619381234, + "learning_rate": 7.98431787133241e-06, + "loss": 0.5845, + "step": 32 + }, + { + "epoch": 0.060109289617486336, + "grad_norm": 1.922191039308769, + "learning_rate": 7.982158830029133e-06, + "loss": 0.5812, + "step": 33 + }, + { + "epoch": 0.061930783242258654, + "grad_norm": 1.5178941257266974, + "learning_rate": 7.979860923406654e-06, + "loss": 0.6065, + "step": 34 + }, + { + "epoch": 0.06375227686703097, + "grad_norm": 1.2582259195864762, + "learning_rate": 7.977424231597266e-06, + "loss": 0.6219, + "step": 35 + }, + { + "epoch": 0.06557377049180328, + "grad_norm": 1.440820716830347, + "learning_rate": 7.97484883957297e-06, + "loss": 0.6243, + "step": 36 + }, + { + "epoch": 0.06739526411657559, + "grad_norm": 1.2398972840100644, + "learning_rate": 7.972134837142497e-06, + "loss": 0.5014, + "step": 37 + }, + { + "epoch": 0.0692167577413479, + "grad_norm": 1.2685725818359594, + "learning_rate": 7.969282318948179e-06, + "loss": 0.6028, + "step": 38 + }, + { + "epoch": 0.07103825136612021, + "grad_norm": 1.3336293286163448, + "learning_rate": 7.966291384462662e-06, + "loss": 0.5084, + "step": 39 + }, + { + "epoch": 0.07285974499089254, + "grad_norm": 1.3736224319100918, + "learning_rate": 7.963162137985416e-06, + "loss": 0.6208, + "step": 40 + }, + { + "epoch": 0.07468123861566485, + "grad_norm": 1.1372782370001664, + "learning_rate": 7.959894688639114e-06, + "loss": 0.5566, + "step": 41 + }, + { + "epoch": 0.07650273224043716, + "grad_norm": 1.1370294392240514, + "learning_rate": 7.956489150365818e-06, + "loss": 0.5316, + "step": 42 + }, + { + "epoch": 0.07832422586520947, + "grad_norm": 1.3638397949259238, + "learning_rate": 7.952945641923014e-06, + "loss": 0.5665, + "step": 43 + }, + { + "epoch": 0.08014571948998178, + "grad_norm": 1.3721010139488579, + "learning_rate": 7.949264286879461e-06, + "loss": 0.6686, + "step": 44 + }, + { + "epoch": 0.08196721311475409, + "grad_norm": 1.3079030861579901, + "learning_rate": 7.94544521361089e-06, + "loss": 0.5678, + "step": 45 + }, + { + "epoch": 0.08378870673952642, + "grad_norm": 
1.3169311772009933, + "learning_rate": 7.941488555295519e-06, + "loss": 0.5883, + "step": 46 + }, + { + "epoch": 0.08561020036429873, + "grad_norm": 1.187489505664262, + "learning_rate": 7.937394449909417e-06, + "loss": 0.6398, + "step": 47 + }, + { + "epoch": 0.08743169398907104, + "grad_norm": 1.3538084566677648, + "learning_rate": 7.933163040221691e-06, + "loss": 0.6001, + "step": 48 + }, + { + "epoch": 0.08925318761384335, + "grad_norm": 1.282030778796261, + "learning_rate": 7.928794473789502e-06, + "loss": 0.6299, + "step": 49 + }, + { + "epoch": 0.09107468123861566, + "grad_norm": 1.291046733294755, + "learning_rate": 7.924288902952924e-06, + "loss": 0.6643, + "step": 50 + }, + { + "epoch": 0.09289617486338798, + "grad_norm": 1.2600029350057804, + "learning_rate": 7.91964648482963e-06, + "loss": 0.6266, + "step": 51 + }, + { + "epoch": 0.0947176684881603, + "grad_norm": 1.383888278177588, + "learning_rate": 7.914867381309417e-06, + "loss": 0.5565, + "step": 52 + }, + { + "epoch": 0.0965391621129326, + "grad_norm": 1.2920207521690645, + "learning_rate": 7.909951759048553e-06, + "loss": 0.5539, + "step": 53 + }, + { + "epoch": 0.09836065573770492, + "grad_norm": 1.4806098875225748, + "learning_rate": 7.904899789463974e-06, + "loss": 0.4836, + "step": 54 + }, + { + "epoch": 0.10018214936247723, + "grad_norm": 1.3236781330536374, + "learning_rate": 7.899711648727295e-06, + "loss": 0.6716, + "step": 55 + }, + { + "epoch": 0.10200364298724955, + "grad_norm": 1.307628398165257, + "learning_rate": 7.894387517758679e-06, + "loss": 0.5806, + "step": 56 + }, + { + "epoch": 0.10382513661202186, + "grad_norm": 1.2494530103588137, + "learning_rate": 7.888927582220521e-06, + "loss": 0.5255, + "step": 57 + }, + { + "epoch": 0.10564663023679417, + "grad_norm": 1.1220353680400945, + "learning_rate": 7.883332032510978e-06, + "loss": 0.5951, + "step": 58 + }, + { + "epoch": 0.10746812386156648, + "grad_norm": 1.2679713293816075, + "learning_rate": 7.877601063757322e-06, + "loss": 0.6249, + "step": 59 + }, + { + "epoch": 0.1092896174863388, + "grad_norm": 1.367413675973217, + "learning_rate": 7.871734875809141e-06, + "loss": 0.5839, + "step": 60 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 1.3310878119591731, + "learning_rate": 7.86573367323137e-06, + "loss": 0.5441, + "step": 61 + }, + { + "epoch": 0.11293260473588343, + "grad_norm": 1.2672834778790825, + "learning_rate": 7.859597665297158e-06, + "loss": 0.5799, + "step": 62 + }, + { + "epoch": 0.11475409836065574, + "grad_norm": 1.4949905078759134, + "learning_rate": 7.853327065980567e-06, + "loss": 0.5811, + "step": 63 + }, + { + "epoch": 0.11657559198542805, + "grad_norm": 1.2772501302963368, + "learning_rate": 7.84692209394911e-06, + "loss": 0.5454, + "step": 64 + }, + { + "epoch": 0.11839708561020036, + "grad_norm": 1.4486798634496518, + "learning_rate": 7.84038297255613e-06, + "loss": 0.6189, + "step": 65 + }, + { + "epoch": 0.12021857923497267, + "grad_norm": 1.247869533369392, + "learning_rate": 7.83370992983301e-06, + "loss": 0.6109, + "step": 66 + }, + { + "epoch": 0.122040072859745, + "grad_norm": 1.4278988044592194, + "learning_rate": 7.826903198481218e-06, + "loss": 0.6362, + "step": 67 + }, + { + "epoch": 0.12386156648451731, + "grad_norm": 1.4421005783850482, + "learning_rate": 7.819963015864195e-06, + "loss": 0.5953, + "step": 68 + }, + { + "epoch": 0.12568306010928962, + "grad_norm": 1.2853062012314034, + "learning_rate": 7.812889623999077e-06, + "loss": 0.6449, + "step": 69 + }, + { + "epoch": 0.12750455373406194, + 
"grad_norm": 1.1806756552505506, + "learning_rate": 7.805683269548253e-06, + "loss": 0.4961, + "step": 70 + }, + { + "epoch": 0.12932604735883424, + "grad_norm": 1.1874724065893263, + "learning_rate": 7.798344203810772e-06, + "loss": 0.5731, + "step": 71 + }, + { + "epoch": 0.13114754098360656, + "grad_norm": 1.4302204052792575, + "learning_rate": 7.790872682713567e-06, + "loss": 0.5694, + "step": 72 + }, + { + "epoch": 0.13296903460837886, + "grad_norm": 1.621022341496324, + "learning_rate": 7.783268966802538e-06, + "loss": 0.5683, + "step": 73 + }, + { + "epoch": 0.13479052823315119, + "grad_norm": 1.266495179779122, + "learning_rate": 7.77553332123347e-06, + "loss": 0.6351, + "step": 74 + }, + { + "epoch": 0.1366120218579235, + "grad_norm": 1.4124120835397862, + "learning_rate": 7.767666015762775e-06, + "loss": 0.5941, + "step": 75 + }, + { + "epoch": 0.1384335154826958, + "grad_norm": 1.3413868354070095, + "learning_rate": 7.7596673247381e-06, + "loss": 0.5407, + "step": 76 + }, + { + "epoch": 0.14025500910746813, + "grad_norm": 1.305749508434761, + "learning_rate": 7.751537527088742e-06, + "loss": 0.5331, + "step": 77 + }, + { + "epoch": 0.14207650273224043, + "grad_norm": 1.2388240813097555, + "learning_rate": 7.743276906315936e-06, + "loss": 0.6214, + "step": 78 + }, + { + "epoch": 0.14389799635701275, + "grad_norm": 1.4526353435545132, + "learning_rate": 7.734885750482967e-06, + "loss": 0.5391, + "step": 79 + }, + { + "epoch": 0.14571948998178508, + "grad_norm": 1.3313440090499233, + "learning_rate": 7.726364352205116e-06, + "loss": 0.6254, + "step": 80 + }, + { + "epoch": 0.14754098360655737, + "grad_norm": 1.336317757950119, + "learning_rate": 7.717713008639463e-06, + "loss": 0.4752, + "step": 81 + }, + { + "epoch": 0.1493624772313297, + "grad_norm": 1.2089723824473042, + "learning_rate": 7.708932021474524e-06, + "loss": 0.7128, + "step": 82 + }, + { + "epoch": 0.151183970856102, + "grad_norm": 1.4923241883131872, + "learning_rate": 7.70002169691973e-06, + "loss": 0.6695, + "step": 83 + }, + { + "epoch": 0.15300546448087432, + "grad_norm": 1.1956673865353409, + "learning_rate": 7.690982345694746e-06, + "loss": 0.5046, + "step": 84 + }, + { + "epoch": 0.15482695810564662, + "grad_norm": 1.3110716423599806, + "learning_rate": 7.68181428301864e-06, + "loss": 0.5885, + "step": 85 + }, + { + "epoch": 0.15664845173041894, + "grad_norm": 1.340432170090615, + "learning_rate": 7.67251782859889e-06, + "loss": 0.5455, + "step": 86 + }, + { + "epoch": 0.15846994535519127, + "grad_norm": 1.2506027715723123, + "learning_rate": 7.663093306620228e-06, + "loss": 0.5864, + "step": 87 + }, + { + "epoch": 0.16029143897996356, + "grad_norm": 1.3974083197789515, + "learning_rate": 7.653541045733351e-06, + "loss": 0.5428, + "step": 88 + }, + { + "epoch": 0.1621129326047359, + "grad_norm": 1.5848220165792029, + "learning_rate": 7.643861379043442e-06, + "loss": 0.577, + "step": 89 + }, + { + "epoch": 0.16393442622950818, + "grad_norm": 1.319727068673188, + "learning_rate": 7.634054644098566e-06, + "loss": 0.5787, + "step": 90 + }, + { + "epoch": 0.1657559198542805, + "grad_norm": 1.3075529460521724, + "learning_rate": 7.624121182877892e-06, + "loss": 0.6063, + "step": 91 + }, + { + "epoch": 0.16757741347905283, + "grad_norm": 1.3869090444503254, + "learning_rate": 7.614061341779777e-06, + "loss": 0.5533, + "step": 92 + }, + { + "epoch": 0.16939890710382513, + "grad_norm": 1.3038369655118094, + "learning_rate": 7.6038754716096755e-06, + "loss": 0.6403, + "step": 93 + }, + { + "epoch": 0.17122040072859745, 
+ "grad_norm": 1.201815586378659, + "learning_rate": 7.593563927567915e-06, + "loss": 0.5341, + "step": 94 + }, + { + "epoch": 0.17304189435336975, + "grad_norm": 1.2948505897209415, + "learning_rate": 7.583127069237302e-06, + "loss": 0.5988, + "step": 95 + }, + { + "epoch": 0.17486338797814208, + "grad_norm": 1.2777544112935704, + "learning_rate": 7.5725652605705876e-06, + "loss": 0.5467, + "step": 96 + }, + { + "epoch": 0.1766848816029144, + "grad_norm": 1.347734488800424, + "learning_rate": 7.561878869877778e-06, + "loss": 0.6734, + "step": 97 + }, + { + "epoch": 0.1785063752276867, + "grad_norm": 1.3934793951098927, + "learning_rate": 7.551068269813282e-06, + "loss": 0.5408, + "step": 98 + }, + { + "epoch": 0.18032786885245902, + "grad_norm": 1.2265308094279608, + "learning_rate": 7.540133837362924e-06, + "loss": 0.5798, + "step": 99 + }, + { + "epoch": 0.18214936247723132, + "grad_norm": 1.2042353648647974, + "learning_rate": 7.5290759538307944e-06, + "loss": 0.5349, + "step": 100 + }, + { + "epoch": 0.18397085610200364, + "grad_norm": 1.198872560339186, + "learning_rate": 7.517895004825955e-06, + "loss": 0.6078, + "step": 101 + }, + { + "epoch": 0.18579234972677597, + "grad_norm": 1.5341596102925161, + "learning_rate": 7.506591380248991e-06, + "loss": 0.669, + "step": 102 + }, + { + "epoch": 0.18761384335154827, + "grad_norm": 1.2181363687268225, + "learning_rate": 7.495165474278411e-06, + "loss": 0.5508, + "step": 103 + }, + { + "epoch": 0.1894353369763206, + "grad_norm": 1.2061034316572241, + "learning_rate": 7.483617685356906e-06, + "loss": 0.6374, + "step": 104 + }, + { + "epoch": 0.1912568306010929, + "grad_norm": 1.270570136570114, + "learning_rate": 7.471948416177452e-06, + "loss": 0.6582, + "step": 105 + }, + { + "epoch": 0.1930783242258652, + "grad_norm": 1.2220914581087463, + "learning_rate": 7.460158073669271e-06, + "loss": 0.5195, + "step": 106 + }, + { + "epoch": 0.19489981785063754, + "grad_norm": 1.1327340940905815, + "learning_rate": 7.448247068983638e-06, + "loss": 0.6108, + "step": 107 + }, + { + "epoch": 0.19672131147540983, + "grad_norm": 1.2389921551395293, + "learning_rate": 7.43621581747954e-06, + "loss": 0.5197, + "step": 108 + }, + { + "epoch": 0.19854280510018216, + "grad_norm": 1.3119245038499585, + "learning_rate": 7.4240647387092e-06, + "loss": 0.5787, + "step": 109 + }, + { + "epoch": 0.20036429872495445, + "grad_norm": 1.2985603971431927, + "learning_rate": 7.411794256403439e-06, + "loss": 0.5408, + "step": 110 + }, + { + "epoch": 0.20218579234972678, + "grad_norm": 1.3547972287329184, + "learning_rate": 7.399404798456901e-06, + "loss": 0.5645, + "step": 111 + }, + { + "epoch": 0.2040072859744991, + "grad_norm": 1.247671888832723, + "learning_rate": 7.3868967969131364e-06, + "loss": 0.594, + "step": 112 + }, + { + "epoch": 0.2058287795992714, + "grad_norm": 1.1653540957694166, + "learning_rate": 7.374270687949531e-06, + "loss": 0.6022, + "step": 113 + }, + { + "epoch": 0.20765027322404372, + "grad_norm": 1.110075329220651, + "learning_rate": 7.3615269118620945e-06, + "loss": 0.4952, + "step": 114 + }, + { + "epoch": 0.20947176684881602, + "grad_norm": 1.1773358008446368, + "learning_rate": 7.348665913050114e-06, + "loss": 0.5796, + "step": 115 + }, + { + "epoch": 0.21129326047358835, + "grad_norm": 1.1762014415226016, + "learning_rate": 7.3356881400006485e-06, + "loss": 0.5019, + "step": 116 + }, + { + "epoch": 0.21311475409836064, + "grad_norm": 1.2857996230820723, + "learning_rate": 7.3225940452728915e-06, + "loss": 0.6128, + "step": 117 + }, + { + 
"epoch": 0.21493624772313297, + "grad_norm": 1.278319933723205, + "learning_rate": 7.309384085482396e-06, + "loss": 0.6233, + "step": 118 + }, + { + "epoch": 0.2167577413479053, + "grad_norm": 1.3246399685797854, + "learning_rate": 7.29605872128514e-06, + "loss": 0.5597, + "step": 119 + }, + { + "epoch": 0.2185792349726776, + "grad_norm": 1.3738785761031618, + "learning_rate": 7.282618417361476e-06, + "loss": 0.6105, + "step": 120 + }, + { + "epoch": 0.2204007285974499, + "grad_norm": 1.510255253584312, + "learning_rate": 7.269063642399912e-06, + "loss": 0.5677, + "step": 121 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 1.3705154463280955, + "learning_rate": 7.25539486908078e-06, + "loss": 0.5172, + "step": 122 + }, + { + "epoch": 0.22404371584699453, + "grad_norm": 1.4229862954590966, + "learning_rate": 7.241612574059745e-06, + "loss": 0.5987, + "step": 123 + }, + { + "epoch": 0.22586520947176686, + "grad_norm": 1.3813913047228632, + "learning_rate": 7.227717237951189e-06, + "loss": 0.6519, + "step": 124 + }, + { + "epoch": 0.22768670309653916, + "grad_norm": 1.4086532700921839, + "learning_rate": 7.213709345311444e-06, + "loss": 0.5507, + "step": 125 + }, + { + "epoch": 0.22950819672131148, + "grad_norm": 1.3222980277500922, + "learning_rate": 7.1995893846219035e-06, + "loss": 0.5821, + "step": 126 + }, + { + "epoch": 0.23132969034608378, + "grad_norm": 1.2772860113277071, + "learning_rate": 7.185357848271977e-06, + "loss": 0.5552, + "step": 127 + }, + { + "epoch": 0.2331511839708561, + "grad_norm": 1.3184892568778903, + "learning_rate": 7.17101523254193e-06, + "loss": 0.4548, + "step": 128 + }, + { + "epoch": 0.23497267759562843, + "grad_norm": 1.417969805143716, + "learning_rate": 7.156562037585575e-06, + "loss": 0.5907, + "step": 129 + }, + { + "epoch": 0.23679417122040072, + "grad_norm": 1.3194920301350814, + "learning_rate": 7.1419987674128225e-06, + "loss": 0.5377, + "step": 130 + }, + { + "epoch": 0.23861566484517305, + "grad_norm": 1.1915857881381822, + "learning_rate": 7.127325929872119e-06, + "loss": 0.5803, + "step": 131 + }, + { + "epoch": 0.24043715846994534, + "grad_norm": 1.294311599245113, + "learning_rate": 7.1125440366327245e-06, + "loss": 0.533, + "step": 132 + }, + { + "epoch": 0.24225865209471767, + "grad_norm": 1.2869026066621274, + "learning_rate": 7.0976536031668775e-06, + "loss": 0.5978, + "step": 133 + }, + { + "epoch": 0.24408014571949, + "grad_norm": 1.2882193076261699, + "learning_rate": 7.082655148731815e-06, + "loss": 0.5998, + "step": 134 + }, + { + "epoch": 0.2459016393442623, + "grad_norm": 1.2783117350056112, + "learning_rate": 7.067549196351669e-06, + "loss": 0.5792, + "step": 135 + }, + { + "epoch": 0.24772313296903462, + "grad_norm": 1.2465596890374582, + "learning_rate": 7.052336272799226e-06, + "loss": 0.6114, + "step": 136 + }, + { + "epoch": 0.2495446265938069, + "grad_norm": 1.3115289926959033, + "learning_rate": 7.037016908577555e-06, + "loss": 0.5711, + "step": 137 + }, + { + "epoch": 0.25136612021857924, + "grad_norm": 1.2945372654161171, + "learning_rate": 7.02159163790151e-06, + "loss": 0.6108, + "step": 138 + }, + { + "epoch": 0.25318761384335153, + "grad_norm": 1.179854303445542, + "learning_rate": 7.006060998679105e-06, + "loss": 0.5712, + "step": 139 + }, + { + "epoch": 0.2550091074681239, + "grad_norm": 1.3505552689452494, + "learning_rate": 6.990425532492747e-06, + "loss": 0.607, + "step": 140 + }, + { + "epoch": 0.2568306010928962, + "grad_norm": 1.293079490327599, + "learning_rate": 6.974685784580359e-06, + "loss": 0.5907, 
+ "step": 141 + }, + { + "epoch": 0.2586520947176685, + "grad_norm": 1.2632127589893052, + "learning_rate": 6.958842303816359e-06, + "loss": 0.6264, + "step": 142 + }, + { + "epoch": 0.2604735883424408, + "grad_norm": 1.3425160590690703, + "learning_rate": 6.942895642692527e-06, + "loss": 0.4246, + "step": 143 + }, + { + "epoch": 0.26229508196721313, + "grad_norm": 1.5771791991344761, + "learning_rate": 6.926846357298732e-06, + "loss": 0.5806, + "step": 144 + }, + { + "epoch": 0.2641165755919854, + "grad_norm": 1.4030090930033172, + "learning_rate": 6.910695007303544e-06, + "loss": 0.5502, + "step": 145 + }, + { + "epoch": 0.2659380692167577, + "grad_norm": 1.3313664381580208, + "learning_rate": 6.894442155934719e-06, + "loss": 0.5853, + "step": 146 + }, + { + "epoch": 0.2677595628415301, + "grad_norm": 1.2706667077309404, + "learning_rate": 6.878088369959553e-06, + "loss": 0.5632, + "step": 147 + }, + { + "epoch": 0.26958105646630237, + "grad_norm": 1.2404396727083782, + "learning_rate": 6.861634219665117e-06, + "loss": 0.6277, + "step": 148 + }, + { + "epoch": 0.27140255009107467, + "grad_norm": 1.266069671654807, + "learning_rate": 6.845080278838381e-06, + "loss": 0.5883, + "step": 149 + }, + { + "epoch": 0.273224043715847, + "grad_norm": 1.4551654748561211, + "learning_rate": 6.82842712474619e-06, + "loss": 0.5585, + "step": 150 + }, + { + "epoch": 0.2750455373406193, + "grad_norm": 1.4120447459169916, + "learning_rate": 6.811675338115146e-06, + "loss": 0.5859, + "step": 151 + }, + { + "epoch": 0.2768670309653916, + "grad_norm": 1.4794301221707373, + "learning_rate": 6.7948255031113505e-06, + "loss": 0.5735, + "step": 152 + }, + { + "epoch": 0.2786885245901639, + "grad_norm": 1.4445882004520383, + "learning_rate": 6.777878207320034e-06, + "loss": 0.5308, + "step": 153 + }, + { + "epoch": 0.28051001821493626, + "grad_norm": 1.259161824393288, + "learning_rate": 6.760834041725068e-06, + "loss": 0.5683, + "step": 154 + }, + { + "epoch": 0.28233151183970856, + "grad_norm": 1.5252085328619271, + "learning_rate": 6.743693600688352e-06, + "loss": 0.625, + "step": 155 + }, + { + "epoch": 0.28415300546448086, + "grad_norm": 1.4215413394995613, + "learning_rate": 6.726457481929095e-06, + "loss": 0.5376, + "step": 156 + }, + { + "epoch": 0.2859744990892532, + "grad_norm": 1.2834803437487838, + "learning_rate": 6.7091262865029645e-06, + "loss": 0.5586, + "step": 157 + }, + { + "epoch": 0.2877959927140255, + "grad_norm": 1.320167275314629, + "learning_rate": 6.691700618781126e-06, + "loss": 0.5789, + "step": 158 + }, + { + "epoch": 0.2896174863387978, + "grad_norm": 1.8005652724042749, + "learning_rate": 6.674181086429177e-06, + "loss": 0.5188, + "step": 159 + }, + { + "epoch": 0.29143897996357016, + "grad_norm": 1.2690750482423212, + "learning_rate": 6.656568300385944e-06, + "loss": 0.5046, + "step": 160 + }, + { + "epoch": 0.29326047358834245, + "grad_norm": 1.31082889556021, + "learning_rate": 6.6388628748421895e-06, + "loss": 0.578, + "step": 161 + }, + { + "epoch": 0.29508196721311475, + "grad_norm": 1.3247928047009208, + "learning_rate": 6.62106542721918e-06, + "loss": 0.6034, + "step": 162 + }, + { + "epoch": 0.29690346083788705, + "grad_norm": 1.2229208641682885, + "learning_rate": 6.603176578147174e-06, + "loss": 0.4996, + "step": 163 + }, + { + "epoch": 0.2987249544626594, + "grad_norm": 1.3325184853391303, + "learning_rate": 6.585196951443763e-06, + "loss": 0.4985, + "step": 164 + }, + { + "epoch": 0.3005464480874317, + "grad_norm": 1.326999208770471, + "learning_rate": 
6.5671271740921266e-06, + "loss": 0.5379, + "step": 165 + }, + { + "epoch": 0.302367941712204, + "grad_norm": 1.2382829663292223, + "learning_rate": 6.548967876219163e-06, + "loss": 0.5481, + "step": 166 + }, + { + "epoch": 0.30418943533697634, + "grad_norm": 1.3992391592431397, + "learning_rate": 6.530719691073521e-06, + "loss": 0.5941, + "step": 167 + }, + { + "epoch": 0.30601092896174864, + "grad_norm": 1.4237814669133988, + "learning_rate": 6.5123832550035165e-06, + "loss": 0.5879, + "step": 168 + }, + { + "epoch": 0.30783242258652094, + "grad_norm": 1.2434267947044435, + "learning_rate": 6.493959207434934e-06, + "loss": 0.5608, + "step": 169 + }, + { + "epoch": 0.30965391621129323, + "grad_norm": 1.2949876195852905, + "learning_rate": 6.47544819084874e-06, + "loss": 0.5477, + "step": 170 + }, + { + "epoch": 0.3114754098360656, + "grad_norm": 1.2709700316435848, + "learning_rate": 6.4568508507586715e-06, + "loss": 0.5913, + "step": 171 + }, + { + "epoch": 0.3132969034608379, + "grad_norm": 1.1917528631285046, + "learning_rate": 6.438167835688725e-06, + "loss": 0.5811, + "step": 172 + }, + { + "epoch": 0.3151183970856102, + "grad_norm": 1.4420942251279008, + "learning_rate": 6.41939979715055e-06, + "loss": 0.5258, + "step": 173 + }, + { + "epoch": 0.31693989071038253, + "grad_norm": 1.3898239712198435, + "learning_rate": 6.400547389620716e-06, + "loss": 0.4382, + "step": 174 + }, + { + "epoch": 0.31876138433515483, + "grad_norm": 1.1944425406473265, + "learning_rate": 6.3816112705178984e-06, + "loss": 0.4981, + "step": 175 + }, + { + "epoch": 0.3205828779599271, + "grad_norm": 1.2940463807027722, + "learning_rate": 6.362592100179958e-06, + "loss": 0.5679, + "step": 176 + }, + { + "epoch": 0.3224043715846995, + "grad_norm": 1.1827687350225293, + "learning_rate": 6.343490541840899e-06, + "loss": 0.5778, + "step": 177 + }, + { + "epoch": 0.3242258652094718, + "grad_norm": 1.2684254658308312, + "learning_rate": 6.3243072616077535e-06, + "loss": 0.5077, + "step": 178 + }, + { + "epoch": 0.32604735883424407, + "grad_norm": 1.4375046609341398, + "learning_rate": 6.3050429284373465e-06, + "loss": 0.5428, + "step": 179 + }, + { + "epoch": 0.32786885245901637, + "grad_norm": 1.2298035631627269, + "learning_rate": 6.285698214112974e-06, + "loss": 0.5333, + "step": 180 + }, + { + "epoch": 0.3296903460837887, + "grad_norm": 1.3091190041481056, + "learning_rate": 6.2662737932209695e-06, + "loss": 0.4829, + "step": 181 + }, + { + "epoch": 0.331511839708561, + "grad_norm": 1.399739792081344, + "learning_rate": 6.246770343127185e-06, + "loss": 0.5039, + "step": 182 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 1.4099523447236924, + "learning_rate": 6.227188543953368e-06, + "loss": 0.4718, + "step": 183 + }, + { + "epoch": 0.3333333333333333, + "eval_accuracy": 0.8100095999478772, + "eval_accuracy_first_token": 0.9386401326699834, + "eval_accuracy_first_token_all": 0.9740702326270558, + "eval_accuracy_first_token_all_total": 6749, + "eval_accuracy_first_token_calculate": 0.8181818181818182, + "eval_accuracy_first_token_calculate_total": 44, + "eval_accuracy_first_token_execute": 0.995049504950495, + "eval_accuracy_first_token_execute_total": 202, + "eval_accuracy_first_token_get": 0.9409190371991247, + "eval_accuracy_first_token_get_total": 457, + "eval_accuracy_first_token_python": 0.7979899497487437, + "eval_accuracy_first_token_python_total": 995, + "eval_loss": 0.5817924737930298, + "eval_perplexity": 1.1146453885480792, + "eval_runtime": 276.9015, + "eval_samples_per_second": 2.506, + 
"eval_steps_per_second": 0.628, + "eval_total_number_first_token": 9648, + "step": 183 + }, + { + "epoch": 0.33515482695810567, + "grad_norm": 1.3753718658598428, + "learning_rate": 6.207529078553444e-06, + "loss": 0.4554, + "step": 184 + }, + { + "epoch": 0.33697632058287796, + "grad_norm": 1.23834147665285, + "learning_rate": 6.1877926324897085e-06, + "loss": 0.5276, + "step": 185 + }, + { + "epoch": 0.33879781420765026, + "grad_norm": 1.4417460479810351, + "learning_rate": 6.16797989400891e-06, + "loss": 0.5891, + "step": 186 + }, + { + "epoch": 0.3406193078324226, + "grad_norm": 1.212881090816042, + "learning_rate": 6.148091554018264e-06, + "loss": 0.51, + "step": 187 + }, + { + "epoch": 0.3424408014571949, + "grad_norm": 1.3090197042208114, + "learning_rate": 6.128128306061346e-06, + "loss": 0.5551, + "step": 188 + }, + { + "epoch": 0.3442622950819672, + "grad_norm": 1.4028504868912617, + "learning_rate": 6.108090846293915e-06, + "loss": 0.571, + "step": 189 + }, + { + "epoch": 0.3460837887067395, + "grad_norm": 1.2383621150830924, + "learning_rate": 6.087979873459634e-06, + "loss": 0.4812, + "step": 190 + }, + { + "epoch": 0.34790528233151186, + "grad_norm": 1.1655822819675992, + "learning_rate": 6.0677960888657015e-06, + "loss": 0.5027, + "step": 191 + }, + { + "epoch": 0.34972677595628415, + "grad_norm": 1.2330586732923372, + "learning_rate": 6.047540196358404e-06, + "loss": 0.5472, + "step": 192 + }, + { + "epoch": 0.35154826958105645, + "grad_norm": 1.2120668135216859, + "learning_rate": 6.02721290229856e-06, + "loss": 0.4723, + "step": 193 + }, + { + "epoch": 0.3533697632058288, + "grad_norm": 1.2127776371529766, + "learning_rate": 6.006814915536894e-06, + "loss": 0.5994, + "step": 194 + }, + { + "epoch": 0.3551912568306011, + "grad_norm": 1.3781231876403488, + "learning_rate": 5.9863469473893225e-06, + "loss": 0.6055, + "step": 195 + }, + { + "epoch": 0.3570127504553734, + "grad_norm": 1.2377625062579942, + "learning_rate": 5.965809711612137e-06, + "loss": 0.6285, + "step": 196 + }, + { + "epoch": 0.3588342440801457, + "grad_norm": 1.2436458612174006, + "learning_rate": 5.945203924377125e-06, + "loss": 0.5051, + "step": 197 + }, + { + "epoch": 0.36065573770491804, + "grad_norm": 1.067560558211732, + "learning_rate": 5.92453030424659e-06, + "loss": 0.557, + "step": 198 + }, + { + "epoch": 0.36247723132969034, + "grad_norm": 1.339153659611944, + "learning_rate": 5.903789572148294e-06, + "loss": 0.5624, + "step": 199 + }, + { + "epoch": 0.36429872495446264, + "grad_norm": 1.3921715569996027, + "learning_rate": 5.88298245135032e-06, + "loss": 0.6122, + "step": 200 + }, + { + "epoch": 0.366120218579235, + "grad_norm": 1.0724617492860091, + "learning_rate": 5.862109667435853e-06, + "loss": 0.406, + "step": 201 + }, + { + "epoch": 0.3679417122040073, + "grad_norm": 1.3579568036237248, + "learning_rate": 5.8411719482778645e-06, + "loss": 0.5615, + "step": 202 + }, + { + "epoch": 0.3697632058287796, + "grad_norm": 1.392362486259979, + "learning_rate": 5.820170024013746e-06, + "loss": 0.4783, + "step": 203 + }, + { + "epoch": 0.37158469945355194, + "grad_norm": 1.4663688083889719, + "learning_rate": 5.79910462701984e-06, + "loss": 0.6462, + "step": 204 + }, + { + "epoch": 0.37340619307832423, + "grad_norm": 1.5906378406347372, + "learning_rate": 5.777976491885903e-06, + "loss": 0.5989, + "step": 205 + }, + { + "epoch": 0.37522768670309653, + "grad_norm": 1.2690398154345799, + "learning_rate": 5.756786355389481e-06, + "loss": 0.4675, + "step": 206 + }, + { + "epoch": 0.3770491803278688, + 
"grad_norm": 1.340814834772367, + "learning_rate": 5.735534956470232e-06, + "loss": 0.4978, + "step": 207 + }, + { + "epoch": 0.3788706739526412, + "grad_norm": 1.2798668544381595, + "learning_rate": 5.714223036204144e-06, + "loss": 0.5032, + "step": 208 + }, + { + "epoch": 0.3806921675774135, + "grad_norm": 1.2535515686753846, + "learning_rate": 5.6928513377777e-06, + "loss": 0.53, + "step": 209 + }, + { + "epoch": 0.3825136612021858, + "grad_norm": 1.2457137330041876, + "learning_rate": 5.671420606461956e-06, + "loss": 0.5851, + "step": 210 + }, + { + "epoch": 0.3843351548269581, + "grad_norm": 1.4696601482702962, + "learning_rate": 5.649931589586557e-06, + "loss": 0.654, + "step": 211 + }, + { + "epoch": 0.3861566484517304, + "grad_norm": 1.2987011689556953, + "learning_rate": 5.628385036513676e-06, + "loss": 0.5669, + "step": 212 + }, + { + "epoch": 0.3879781420765027, + "grad_norm": 1.3159510108963395, + "learning_rate": 5.606781698611878e-06, + "loss": 0.5532, + "step": 213 + }, + { + "epoch": 0.38979963570127507, + "grad_norm": 1.1640550154922134, + "learning_rate": 5.585122329229923e-06, + "loss": 0.4804, + "step": 214 + }, + { + "epoch": 0.39162112932604737, + "grad_norm": 1.257926153405232, + "learning_rate": 5.56340768367049e-06, + "loss": 0.5356, + "step": 215 + }, + { + "epoch": 0.39344262295081966, + "grad_norm": 1.2768971849963524, + "learning_rate": 5.541638519163849e-06, + "loss": 0.5141, + "step": 216 + }, + { + "epoch": 0.39526411657559196, + "grad_norm": 1.1748448811474719, + "learning_rate": 5.51981559484144e-06, + "loss": 0.6517, + "step": 217 + }, + { + "epoch": 0.3970856102003643, + "grad_norm": 2.0539459488388427, + "learning_rate": 5.49793967170941e-06, + "loss": 0.5137, + "step": 218 + }, + { + "epoch": 0.3989071038251366, + "grad_norm": 1.3340041273272538, + "learning_rate": 5.476011512622076e-06, + "loss": 0.5224, + "step": 219 + }, + { + "epoch": 0.4007285974499089, + "grad_norm": 1.2742409163320605, + "learning_rate": 5.454031882255319e-06, + "loss": 0.593, + "step": 220 + }, + { + "epoch": 0.40255009107468126, + "grad_norm": 1.4312401458156725, + "learning_rate": 5.43200154707992e-06, + "loss": 0.6052, + "step": 221 + }, + { + "epoch": 0.40437158469945356, + "grad_norm": 1.1725298771962427, + "learning_rate": 5.4099212753348294e-06, + "loss": 0.5926, + "step": 222 + }, + { + "epoch": 0.40619307832422585, + "grad_norm": 1.2329994896181276, + "learning_rate": 5.3877918370003806e-06, + "loss": 0.4412, + "step": 223 + }, + { + "epoch": 0.4080145719489982, + "grad_norm": 1.3077181540697282, + "learning_rate": 5.365614003771439e-06, + "loss": 0.4822, + "step": 224 + }, + { + "epoch": 0.4098360655737705, + "grad_norm": 1.3216563432171085, + "learning_rate": 5.343388549030491e-06, + "loss": 0.5746, + "step": 225 + }, + { + "epoch": 0.4116575591985428, + "grad_norm": 1.4069728629721256, + "learning_rate": 5.321116247820669e-06, + "loss": 0.4688, + "step": 226 + }, + { + "epoch": 0.4134790528233151, + "grad_norm": 1.1054828696912837, + "learning_rate": 5.298797876818734e-06, + "loss": 0.5574, + "step": 227 + }, + { + "epoch": 0.41530054644808745, + "grad_norm": 1.3971039327183563, + "learning_rate": 5.276434214307986e-06, + "loss": 0.5568, + "step": 228 + }, + { + "epoch": 0.41712204007285975, + "grad_norm": 1.2052367676116202, + "learning_rate": 5.2540260401511255e-06, + "loss": 0.4715, + "step": 229 + }, + { + "epoch": 0.41894353369763204, + "grad_norm": 1.2975437539065673, + "learning_rate": 5.231574135763052e-06, + "loss": 0.531, + "step": 230 + }, + { + "epoch": 
0.4207650273224044, + "grad_norm": 1.2072349064171715, + "learning_rate": 5.209079284083626e-06, + "loss": 0.4933, + "step": 231 + }, + { + "epoch": 0.4225865209471767, + "grad_norm": 1.28034314366732, + "learning_rate": 5.186542269550359e-06, + "loss": 0.6263, + "step": 232 + }, + { + "epoch": 0.424408014571949, + "grad_norm": 1.2774088433481947, + "learning_rate": 5.163963878071058e-06, + "loss": 0.5761, + "step": 233 + }, + { + "epoch": 0.4262295081967213, + "grad_norm": 1.2827700082849656, + "learning_rate": 5.141344896996421e-06, + "loss": 0.5472, + "step": 234 + }, + { + "epoch": 0.42805100182149364, + "grad_norm": 1.2399786652373073, + "learning_rate": 5.1186861150925844e-06, + "loss": 0.4721, + "step": 235 + }, + { + "epoch": 0.42987249544626593, + "grad_norm": 1.4866480679484804, + "learning_rate": 5.09598832251361e-06, + "loss": 0.5454, + "step": 236 + }, + { + "epoch": 0.43169398907103823, + "grad_norm": 1.2892625562378888, + "learning_rate": 5.073252310773939e-06, + "loss": 0.522, + "step": 237 + }, + { + "epoch": 0.4335154826958106, + "grad_norm": 1.4228683878206347, + "learning_rate": 5.050478872720782e-06, + "loss": 0.5789, + "step": 238 + }, + { + "epoch": 0.4353369763205829, + "grad_norm": 1.2557222729475248, + "learning_rate": 5.027668802506476e-06, + "loss": 0.542, + "step": 239 + }, + { + "epoch": 0.4371584699453552, + "grad_norm": 1.3719564205247436, + "learning_rate": 5.004822895560794e-06, + "loss": 0.5978, + "step": 240 + }, + { + "epoch": 0.43897996357012753, + "grad_norm": 1.2002491118216863, + "learning_rate": 4.981941948563196e-06, + "loss": 0.567, + "step": 241 + }, + { + "epoch": 0.4408014571948998, + "grad_norm": 1.23705839412736, + "learning_rate": 4.959026759415061e-06, + "loss": 0.5622, + "step": 242 + }, + { + "epoch": 0.4426229508196721, + "grad_norm": 1.2443680094563607, + "learning_rate": 4.936078127211849e-06, + "loss": 0.4652, + "step": 243 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 1.2292169146784966, + "learning_rate": 4.913096852215248e-06, + "loss": 0.4945, + "step": 244 + }, + { + "epoch": 0.44626593806921677, + "grad_norm": 1.2199454755400951, + "learning_rate": 4.890083735825257e-06, + "loss": 0.4975, + "step": 245 + }, + { + "epoch": 0.44808743169398907, + "grad_norm": 1.1351286410957175, + "learning_rate": 4.867039580552247e-06, + "loss": 0.6239, + "step": 246 + }, + { + "epoch": 0.44990892531876137, + "grad_norm": 2.489162682585333, + "learning_rate": 4.843965189988969e-06, + "loss": 0.4521, + "step": 247 + }, + { + "epoch": 0.4517304189435337, + "grad_norm": 1.3760193870465582, + "learning_rate": 4.820861368782537e-06, + "loss": 0.5452, + "step": 248 + }, + { + "epoch": 0.453551912568306, + "grad_norm": 1.9314160205055073, + "learning_rate": 4.79772892260637e-06, + "loss": 0.524, + "step": 249 + }, + { + "epoch": 0.4553734061930783, + "grad_norm": 1.381935096821563, + "learning_rate": 4.774568658132086e-06, + "loss": 0.5054, + "step": 250 + }, + { + "epoch": 0.45719489981785066, + "grad_norm": 1.4937455355331937, + "learning_rate": 4.751381383001386e-06, + "loss": 0.5263, + "step": 251 + }, + { + "epoch": 0.45901639344262296, + "grad_norm": 1.2817536849435798, + "learning_rate": 4.728167905797877e-06, + "loss": 0.4674, + "step": 252 + }, + { + "epoch": 0.46083788706739526, + "grad_norm": 1.2354902761906357, + "learning_rate": 4.7049290360188875e-06, + "loss": 0.4812, + "step": 253 + }, + { + "epoch": 0.46265938069216755, + "grad_norm": 1.278394919844599, + "learning_rate": 4.681665584047227e-06, + "loss": 0.5462, + "step": 254 
+ }, + { + "epoch": 0.4644808743169399, + "grad_norm": 1.2675137056539372, + "learning_rate": 4.658378361122936e-06, + "loss": 0.5193, + "step": 255 + }, + { + "epoch": 0.4663023679417122, + "grad_norm": 1.1337542247765147, + "learning_rate": 4.6350681793149884e-06, + "loss": 0.4399, + "step": 256 + }, + { + "epoch": 0.4681238615664845, + "grad_norm": 1.1926312684911093, + "learning_rate": 4.611735851492984e-06, + "loss": 0.392, + "step": 257 + }, + { + "epoch": 0.46994535519125685, + "grad_norm": 1.5088135476737818, + "learning_rate": 4.588382191298787e-06, + "loss": 0.6285, + "step": 258 + }, + { + "epoch": 0.47176684881602915, + "grad_norm": 1.2008807604194953, + "learning_rate": 4.5650080131181675e-06, + "loss": 0.6111, + "step": 259 + }, + { + "epoch": 0.47358834244080145, + "grad_norm": 1.231409667649599, + "learning_rate": 4.541614132052393e-06, + "loss": 0.5028, + "step": 260 + }, + { + "epoch": 0.47540983606557374, + "grad_norm": 1.0843389080843633, + "learning_rate": 4.51820136388981e-06, + "loss": 0.5245, + "step": 261 + }, + { + "epoch": 0.4772313296903461, + "grad_norm": 1.3503009764858278, + "learning_rate": 4.494770525077391e-06, + "loss": 0.5069, + "step": 262 + }, + { + "epoch": 0.4790528233151184, + "grad_norm": 1.2840200177200862, + "learning_rate": 4.4713224326922655e-06, + "loss": 0.5877, + "step": 263 + }, + { + "epoch": 0.4808743169398907, + "grad_norm": 1.1859994564826624, + "learning_rate": 4.447857904413231e-06, + "loss": 0.5355, + "step": 264 + }, + { + "epoch": 0.48269581056466304, + "grad_norm": 1.1755333908210361, + "learning_rate": 4.424377758492232e-06, + "loss": 0.5262, + "step": 265 + }, + { + "epoch": 0.48451730418943534, + "grad_norm": 1.3928180479700631, + "learning_rate": 4.40088281372583e-06, + "loss": 0.5019, + "step": 266 + }, + { + "epoch": 0.48633879781420764, + "grad_norm": 1.1971102520263333, + "learning_rate": 4.377373889426649e-06, + "loss": 0.5133, + "step": 267 + }, + { + "epoch": 0.48816029143898, + "grad_norm": 1.3847287014410063, + "learning_rate": 4.353851805394808e-06, + "loss": 0.5929, + "step": 268 + }, + { + "epoch": 0.4899817850637523, + "grad_norm": 1.3913642906155013, + "learning_rate": 4.33031738188933e-06, + "loss": 0.5482, + "step": 269 + }, + { + "epoch": 0.4918032786885246, + "grad_norm": 1.2060593356882499, + "learning_rate": 4.306771439599534e-06, + "loss": 0.5505, + "step": 270 + }, + { + "epoch": 0.4936247723132969, + "grad_norm": 1.1459886490704883, + "learning_rate": 4.283214799616428e-06, + "loss": 0.5472, + "step": 271 + }, + { + "epoch": 0.49544626593806923, + "grad_norm": 1.388116383059367, + "learning_rate": 4.259648283404062e-06, + "loss": 0.5006, + "step": 272 + }, + { + "epoch": 0.4972677595628415, + "grad_norm": 1.267084901831605, + "learning_rate": 4.236072712770891e-06, + "loss": 0.5578, + "step": 273 + }, + { + "epoch": 0.4990892531876138, + "grad_norm": 1.2590176689522474, + "learning_rate": 4.2124889098411175e-06, + "loss": 0.4804, + "step": 274 + }, + { + "epoch": 0.5009107468123861, + "grad_norm": 1.2991828184628833, + "learning_rate": 4.1888976970260135e-06, + "loss": 0.4865, + "step": 275 + }, + { + "epoch": 0.5027322404371585, + "grad_norm": 1.3502024483330752, + "learning_rate": 4.165299896995252e-06, + "loss": 0.6084, + "step": 276 + }, + { + "epoch": 0.5045537340619308, + "grad_norm": 1.24662926047003, + "learning_rate": 4.141696332648216e-06, + "loss": 0.5299, + "step": 277 + }, + { + "epoch": 0.5063752276867031, + "grad_norm": 1.1169437339901658, + "learning_rate": 4.118087827085294e-06, + 
"loss": 0.4495, + "step": 278 + }, + { + "epoch": 0.5081967213114754, + "grad_norm": 1.5004394388158784, + "learning_rate": 4.094475203579191e-06, + "loss": 0.5311, + "step": 279 + }, + { + "epoch": 0.5100182149362478, + "grad_norm": 1.383813491474933, + "learning_rate": 4.070859285546209e-06, + "loss": 0.5176, + "step": 280 + }, + { + "epoch": 0.51183970856102, + "grad_norm": 1.4019514520014746, + "learning_rate": 4.047240896517539e-06, + "loss": 0.5066, + "step": 281 + }, + { + "epoch": 0.5136612021857924, + "grad_norm": 1.2338591970281994, + "learning_rate": 4.023620860110533e-06, + "loss": 0.5499, + "step": 282 + }, + { + "epoch": 0.5154826958105647, + "grad_norm": 1.4230250812531824, + "learning_rate": 4e-06, + "loss": 0.5209, + "step": 283 + }, + { + "epoch": 0.517304189435337, + "grad_norm": 1.3829946645846831, + "learning_rate": 3.976379139889467e-06, + "loss": 0.5384, + "step": 284 + }, + { + "epoch": 0.5191256830601093, + "grad_norm": 1.3807362256783324, + "learning_rate": 3.9527591034824616e-06, + "loss": 0.4828, + "step": 285 + }, + { + "epoch": 0.5209471766848816, + "grad_norm": 1.2837504835980802, + "learning_rate": 3.929140714453791e-06, + "loss": 0.5297, + "step": 286 + }, + { + "epoch": 0.5227686703096539, + "grad_norm": 1.2883082935052266, + "learning_rate": 3.9055247964208075e-06, + "loss": 0.5485, + "step": 287 + }, + { + "epoch": 0.5245901639344263, + "grad_norm": 1.2663249100766605, + "learning_rate": 3.8819121729147055e-06, + "loss": 0.4972, + "step": 288 + }, + { + "epoch": 0.5264116575591985, + "grad_norm": 1.325952034342347, + "learning_rate": 3.8583036673517845e-06, + "loss": 0.529, + "step": 289 + }, + { + "epoch": 0.5282331511839709, + "grad_norm": 1.4160215190484486, + "learning_rate": 3.834700103004747e-06, + "loss": 0.4542, + "step": 290 + }, + { + "epoch": 0.5300546448087432, + "grad_norm": 1.2682053508757782, + "learning_rate": 3.8111023029739866e-06, + "loss": 0.4907, + "step": 291 + }, + { + "epoch": 0.5318761384335154, + "grad_norm": 1.1893254878402797, + "learning_rate": 3.787511090158884e-06, + "loss": 0.5084, + "step": 292 + }, + { + "epoch": 0.5336976320582878, + "grad_norm": 1.2969065452294093, + "learning_rate": 3.763927287229109e-06, + "loss": 0.4577, + "step": 293 + }, + { + "epoch": 0.5355191256830601, + "grad_norm": 1.2651027758547007, + "learning_rate": 3.740351716595939e-06, + "loss": 0.6394, + "step": 294 + }, + { + "epoch": 0.5373406193078324, + "grad_norm": 1.3693523974507902, + "learning_rate": 3.7167852003835723e-06, + "loss": 0.5101, + "step": 295 + }, + { + "epoch": 0.5391621129326047, + "grad_norm": 1.2607338758277125, + "learning_rate": 3.6932285604004656e-06, + "loss": 0.5171, + "step": 296 + }, + { + "epoch": 0.5409836065573771, + "grad_norm": 1.3133605575947254, + "learning_rate": 3.669682618110671e-06, + "loss": 0.5532, + "step": 297 + }, + { + "epoch": 0.5428051001821493, + "grad_norm": 1.1776402028496125, + "learning_rate": 3.646148194605191e-06, + "loss": 0.488, + "step": 298 + }, + { + "epoch": 0.5446265938069217, + "grad_norm": 1.3163668976765623, + "learning_rate": 3.622626110573351e-06, + "loss": 0.5082, + "step": 299 + }, + { + "epoch": 0.546448087431694, + "grad_norm": 1.234697706107032, + "learning_rate": 3.5991171862741713e-06, + "loss": 0.6067, + "step": 300 + }, + { + "epoch": 0.5482695810564663, + "grad_norm": 1.4127072359417963, + "learning_rate": 3.575622241507768e-06, + "loss": 0.5798, + "step": 301 + }, + { + "epoch": 0.5500910746812386, + "grad_norm": 1.2117668842288387, + "learning_rate": 
3.5521420955867683e-06, + "loss": 0.5147, + "step": 302 + }, + { + "epoch": 0.5519125683060109, + "grad_norm": 1.2967372119013212, + "learning_rate": 3.5286775673077332e-06, + "loss": 0.5121, + "step": 303 + }, + { + "epoch": 0.5537340619307832, + "grad_norm": 1.2224035940340945, + "learning_rate": 3.505229474922609e-06, + "loss": 0.5522, + "step": 304 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 1.249619173678802, + "learning_rate": 3.481798636110191e-06, + "loss": 0.5408, + "step": 305 + }, + { + "epoch": 0.5573770491803278, + "grad_norm": 1.3457532512289554, + "learning_rate": 3.458385867947607e-06, + "loss": 0.4521, + "step": 306 + }, + { + "epoch": 0.5591985428051002, + "grad_norm": 1.360942386694437, + "learning_rate": 3.434991986881833e-06, + "loss": 0.493, + "step": 307 + }, + { + "epoch": 0.5610200364298725, + "grad_norm": 1.3130428998562595, + "learning_rate": 3.4116178087012136e-06, + "loss": 0.5506, + "step": 308 + }, + { + "epoch": 0.5628415300546448, + "grad_norm": 1.1190533809006658, + "learning_rate": 3.388264148507016e-06, + "loss": 0.5126, + "step": 309 + }, + { + "epoch": 0.5646630236794171, + "grad_norm": 1.2987174512716828, + "learning_rate": 3.3649318206850116e-06, + "loss": 0.4693, + "step": 310 + }, + { + "epoch": 0.5664845173041895, + "grad_norm": 1.199702517512821, + "learning_rate": 3.3416216388770635e-06, + "loss": 0.6163, + "step": 311 + }, + { + "epoch": 0.5683060109289617, + "grad_norm": 1.4350182469129544, + "learning_rate": 3.3183344159527736e-06, + "loss": 0.4995, + "step": 312 + }, + { + "epoch": 0.5701275045537341, + "grad_norm": 1.1958548262022524, + "learning_rate": 3.2950709639811134e-06, + "loss": 0.49, + "step": 313 + }, + { + "epoch": 0.5719489981785064, + "grad_norm": 1.3945811458876904, + "learning_rate": 3.271832094202123e-06, + "loss": 0.5864, + "step": 314 + }, + { + "epoch": 0.5737704918032787, + "grad_norm": 1.4302437878542158, + "learning_rate": 3.2486186169986153e-06, + "loss": 0.5064, + "step": 315 + }, + { + "epoch": 0.575591985428051, + "grad_norm": 1.2460171889983833, + "learning_rate": 3.2254313418679154e-06, + "loss": 0.4993, + "step": 316 + }, + { + "epoch": 0.5774134790528234, + "grad_norm": 1.3262935951428076, + "learning_rate": 3.2022710773936304e-06, + "loss": 0.4744, + "step": 317 + }, + { + "epoch": 0.5792349726775956, + "grad_norm": 1.3389955195972856, + "learning_rate": 3.1791386312174633e-06, + "loss": 0.4928, + "step": 318 + }, + { + "epoch": 0.581056466302368, + "grad_norm": 1.2940542358009448, + "learning_rate": 3.1560348100110315e-06, + "loss": 0.5806, + "step": 319 + }, + { + "epoch": 0.5828779599271403, + "grad_norm": 1.3088940762820873, + "learning_rate": 3.1329604194477535e-06, + "loss": 0.4656, + "step": 320 + }, + { + "epoch": 0.5846994535519126, + "grad_norm": 1.27093825835977, + "learning_rate": 3.1099162641747427e-06, + "loss": 0.4405, + "step": 321 + }, + { + "epoch": 0.5865209471766849, + "grad_norm": 1.2054913034625938, + "learning_rate": 3.0869031477847507e-06, + "loss": 0.5713, + "step": 322 + }, + { + "epoch": 0.5883424408014571, + "grad_norm": 1.2012296656668295, + "learning_rate": 3.0639218727881508e-06, + "loss": 0.5791, + "step": 323 + }, + { + "epoch": 0.5901639344262295, + "grad_norm": 1.2982012651862458, + "learning_rate": 3.04097324058494e-06, + "loss": 0.5778, + "step": 324 + }, + { + "epoch": 0.5919854280510018, + "grad_norm": 1.2933931880238856, + "learning_rate": 3.0180580514368034e-06, + "loss": 0.5219, + "step": 325 + }, + { + "epoch": 0.5938069216757741, + "grad_norm": 
1.4720885088406803, + "learning_rate": 2.9951771044392066e-06, + "loss": 0.5007, + "step": 326 + }, + { + "epoch": 0.5956284153005464, + "grad_norm": 1.4987465964654607, + "learning_rate": 2.972331197493523e-06, + "loss": 0.4449, + "step": 327 + }, + { + "epoch": 0.5974499089253188, + "grad_norm": 1.24446661500654, + "learning_rate": 2.949521127279218e-06, + "loss": 0.5865, + "step": 328 + }, + { + "epoch": 0.599271402550091, + "grad_norm": 1.2063717392200657, + "learning_rate": 2.926747689226062e-06, + "loss": 0.5257, + "step": 329 + }, + { + "epoch": 0.6010928961748634, + "grad_norm": 1.3076236400800156, + "learning_rate": 2.9040116774863896e-06, + "loss": 0.4262, + "step": 330 + }, + { + "epoch": 0.6029143897996357, + "grad_norm": 1.486619018414579, + "learning_rate": 2.881313884907416e-06, + "loss": 0.5119, + "step": 331 + }, + { + "epoch": 0.604735883424408, + "grad_norm": 1.4291961410481424, + "learning_rate": 2.8586551030035797e-06, + "loss": 0.5522, + "step": 332 + }, + { + "epoch": 0.6065573770491803, + "grad_norm": 1.363647670787008, + "learning_rate": 2.836036121928942e-06, + "loss": 0.4966, + "step": 333 + }, + { + "epoch": 0.6083788706739527, + "grad_norm": 1.3510346555153874, + "learning_rate": 2.813457730449641e-06, + "loss": 0.557, + "step": 334 + }, + { + "epoch": 0.6102003642987249, + "grad_norm": 1.3056707121910645, + "learning_rate": 2.790920715916372e-06, + "loss": 0.4749, + "step": 335 + }, + { + "epoch": 0.6120218579234973, + "grad_norm": 1.2674665608771611, + "learning_rate": 2.7684258642369484e-06, + "loss": 0.5118, + "step": 336 + }, + { + "epoch": 0.6138433515482696, + "grad_norm": 1.1704712319064594, + "learning_rate": 2.7459739598488762e-06, + "loss": 0.5549, + "step": 337 + }, + { + "epoch": 0.6156648451730419, + "grad_norm": 1.4094367691998289, + "learning_rate": 2.723565785692013e-06, + "loss": 0.5281, + "step": 338 + }, + { + "epoch": 0.6174863387978142, + "grad_norm": 1.1692398204308077, + "learning_rate": 2.701202123181266e-06, + "loss": 0.5117, + "step": 339 + }, + { + "epoch": 0.6193078324225865, + "grad_norm": 1.1555269052808808, + "learning_rate": 2.6788837521793328e-06, + "loss": 0.5099, + "step": 340 + }, + { + "epoch": 0.6211293260473588, + "grad_norm": 1.2795356877186783, + "learning_rate": 2.6566114509695096e-06, + "loss": 0.5776, + "step": 341 + }, + { + "epoch": 0.6229508196721312, + "grad_norm": 1.402365827706012, + "learning_rate": 2.634385996228561e-06, + "loss": 0.5233, + "step": 342 + }, + { + "epoch": 0.6247723132969034, + "grad_norm": 1.1473627159160211, + "learning_rate": 2.6122081629996195e-06, + "loss": 0.6332, + "step": 343 + }, + { + "epoch": 0.6265938069216758, + "grad_norm": 1.363153716598952, + "learning_rate": 2.5900787246651715e-06, + "loss": 0.5023, + "step": 344 + }, + { + "epoch": 0.6284153005464481, + "grad_norm": 1.3207498781298728, + "learning_rate": 2.567998452920081e-06, + "loss": 0.5304, + "step": 345 + }, + { + "epoch": 0.6302367941712204, + "grad_norm": 1.3224239362488786, + "learning_rate": 2.5459681177446797e-06, + "loss": 0.4926, + "step": 346 + }, + { + "epoch": 0.6320582877959927, + "grad_norm": 1.1481892109560257, + "learning_rate": 2.523988487377924e-06, + "loss": 0.4879, + "step": 347 + }, + { + "epoch": 0.6338797814207651, + "grad_norm": 1.3002026066911114, + "learning_rate": 2.50206032829059e-06, + "loss": 0.5203, + "step": 348 + }, + { + "epoch": 0.6357012750455373, + "grad_norm": 1.278028126213566, + "learning_rate": 2.4801844051585604e-06, + "loss": 0.3582, + "step": 349 + }, + { + "epoch": 
0.6375227686703097, + "grad_norm": 1.2059444131681067, + "learning_rate": 2.4583614808361508e-06, + "loss": 0.6105, + "step": 350 + }, + { + "epoch": 0.639344262295082, + "grad_norm": 1.216009601153044, + "learning_rate": 2.4365923163295083e-06, + "loss": 0.5276, + "step": 351 + }, + { + "epoch": 0.6411657559198543, + "grad_norm": 1.3096172150636922, + "learning_rate": 2.4148776707700775e-06, + "loss": 0.5187, + "step": 352 + }, + { + "epoch": 0.6429872495446266, + "grad_norm": 1.288189929013976, + "learning_rate": 2.393218301388123e-06, + "loss": 0.5577, + "step": 353 + }, + { + "epoch": 0.644808743169399, + "grad_norm": 1.3155373684757863, + "learning_rate": 2.3716149634863244e-06, + "loss": 0.5057, + "step": 354 + }, + { + "epoch": 0.6466302367941712, + "grad_norm": 1.3419874756397705, + "learning_rate": 2.3500684104134433e-06, + "loss": 0.4088, + "step": 355 + }, + { + "epoch": 0.6484517304189436, + "grad_norm": 1.433794079225407, + "learning_rate": 2.328579393538046e-06, + "loss": 0.5522, + "step": 356 + }, + { + "epoch": 0.6502732240437158, + "grad_norm": 1.4381038473896983, + "learning_rate": 2.3071486622223e-06, + "loss": 0.5745, + "step": 357 + }, + { + "epoch": 0.6520947176684881, + "grad_norm": 1.2039623063737521, + "learning_rate": 2.2857769637958554e-06, + "loss": 0.4083, + "step": 358 + }, + { + "epoch": 0.6539162112932605, + "grad_norm": 1.1858586411225656, + "learning_rate": 2.2644650435297675e-06, + "loss": 0.5221, + "step": 359 + }, + { + "epoch": 0.6557377049180327, + "grad_norm": 1.2748116882040539, + "learning_rate": 2.243213644610519e-06, + "loss": 0.5085, + "step": 360 + }, + { + "epoch": 0.6575591985428051, + "grad_norm": 1.1732780097848132, + "learning_rate": 2.2220235081140985e-06, + "loss": 0.4498, + "step": 361 + }, + { + "epoch": 0.6593806921675774, + "grad_norm": 1.4343186803172785, + "learning_rate": 2.2008953729801583e-06, + "loss": 0.4823, + "step": 362 + }, + { + "epoch": 0.6612021857923497, + "grad_norm": 1.496026602053126, + "learning_rate": 2.1798299759862545e-06, + "loss": 0.4367, + "step": 363 + }, + { + "epoch": 0.663023679417122, + "grad_norm": 1.3025991152153253, + "learning_rate": 2.158828051722137e-06, + "loss": 0.571, + "step": 364 + }, + { + "epoch": 0.6648451730418944, + "grad_norm": 1.1193584120775901, + "learning_rate": 2.137890332564147e-06, + "loss": 0.4189, + "step": 365 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 1.2674961696906415, + "learning_rate": 2.117017548649678e-06, + "loss": 0.5501, + "step": 366 + }, + { + "epoch": 0.6666666666666666, + "eval_accuracy": 0.813597109631812, + "eval_accuracy_first_token": 0.9505597014925373, + "eval_accuracy_first_token_all": 0.9743665728256038, + "eval_accuracy_first_token_all_total": 6749, + "eval_accuracy_first_token_calculate": 0.8409090909090909, + "eval_accuracy_first_token_calculate_total": 44, + "eval_accuracy_first_token_execute": 1.0, + "eval_accuracy_first_token_execute_total": 202, + "eval_accuracy_first_token_get": 0.912472647702407, + "eval_accuracy_first_token_get_total": 457, + "eval_accuracy_first_token_python": 0.871356783919598, + "eval_accuracy_first_token_python_total": 995, + "eval_loss": 0.5661724805831909, + "eval_perplexity": 1.1117590100934498, + "eval_runtime": 274.9553, + "eval_samples_per_second": 2.524, + "eval_steps_per_second": 0.633, + "eval_total_number_first_token": 9648, + "step": 366 + }, + { + "epoch": 0.668488160291439, + "grad_norm": 1.2382082369437504, + "learning_rate": 2.0962104278517058e-06, + "loss": 0.4206, + "step": 367 + }, + { + "epoch": 
0.6703096539162113, + "grad_norm": 1.3495235838757789, + "learning_rate": 2.0754696957534105e-06, + "loss": 0.5516, + "step": 368 + }, + { + "epoch": 0.6721311475409836, + "grad_norm": 1.192568387623234, + "learning_rate": 2.0547960756228746e-06, + "loss": 0.4107, + "step": 369 + }, + { + "epoch": 0.6739526411657559, + "grad_norm": 1.180512972457561, + "learning_rate": 2.0341902883878626e-06, + "loss": 0.486, + "step": 370 + }, + { + "epoch": 0.6757741347905283, + "grad_norm": 1.4275500278475863, + "learning_rate": 2.013653052610678e-06, + "loss": 0.4959, + "step": 371 + }, + { + "epoch": 0.6775956284153005, + "grad_norm": 1.2902691782551479, + "learning_rate": 1.993185084463106e-06, + "loss": 0.4299, + "step": 372 + }, + { + "epoch": 0.6794171220400729, + "grad_norm": 1.1323245803277089, + "learning_rate": 1.97278709770144e-06, + "loss": 0.449, + "step": 373 + }, + { + "epoch": 0.6812386156648452, + "grad_norm": 1.1729164825962006, + "learning_rate": 1.952459803641597e-06, + "loss": 0.5054, + "step": 374 + }, + { + "epoch": 0.6830601092896175, + "grad_norm": 1.302619775593189, + "learning_rate": 1.9322039111342977e-06, + "loss": 0.5069, + "step": 375 + }, + { + "epoch": 0.6848816029143898, + "grad_norm": 1.3225743591209795, + "learning_rate": 1.912020126540366e-06, + "loss": 0.4455, + "step": 376 + }, + { + "epoch": 0.6867030965391621, + "grad_norm": 1.292024983046111, + "learning_rate": 1.8919091537060847e-06, + "loss": 0.5328, + "step": 377 + }, + { + "epoch": 0.6885245901639344, + "grad_norm": 1.2278749426015196, + "learning_rate": 1.8718716939386541e-06, + "loss": 0.4648, + "step": 378 + }, + { + "epoch": 0.6903460837887068, + "grad_norm": 1.2977606986012467, + "learning_rate": 1.8519084459817362e-06, + "loss": 0.4924, + "step": 379 + }, + { + "epoch": 0.692167577413479, + "grad_norm": 1.227698057643303, + "learning_rate": 1.83202010599109e-06, + "loss": 0.4792, + "step": 380 + }, + { + "epoch": 0.6939890710382514, + "grad_norm": 1.3176757819527871, + "learning_rate": 1.8122073675102932e-06, + "loss": 0.4661, + "step": 381 + }, + { + "epoch": 0.6958105646630237, + "grad_norm": 1.60497524399408, + "learning_rate": 1.792470921446557e-06, + "loss": 0.4696, + "step": 382 + }, + { + "epoch": 0.697632058287796, + "grad_norm": 1.3498126136904973, + "learning_rate": 1.7728114560466324e-06, + "loss": 0.5222, + "step": 383 + }, + { + "epoch": 0.6994535519125683, + "grad_norm": 1.2558395717712512, + "learning_rate": 1.753229656872815e-06, + "loss": 0.4103, + "step": 384 + }, + { + "epoch": 0.7012750455373407, + "grad_norm": 1.3697944306184828, + "learning_rate": 1.7337262067790319e-06, + "loss": 0.4544, + "step": 385 + }, + { + "epoch": 0.7030965391621129, + "grad_norm": 1.368204116383326, + "learning_rate": 1.7143017858870259e-06, + "loss": 0.4975, + "step": 386 + }, + { + "epoch": 0.7049180327868853, + "grad_norm": 1.2028196112608587, + "learning_rate": 1.6949570715626532e-06, + "loss": 0.5793, + "step": 387 + }, + { + "epoch": 0.7067395264116576, + "grad_norm": 1.2193898504656977, + "learning_rate": 1.675692738392247e-06, + "loss": 0.4841, + "step": 388 + }, + { + "epoch": 0.7085610200364298, + "grad_norm": 1.2121663225442056, + "learning_rate": 1.6565094581591015e-06, + "loss": 0.4891, + "step": 389 + }, + { + "epoch": 0.7103825136612022, + "grad_norm": 1.2920888769630756, + "learning_rate": 1.6374078998200424e-06, + "loss": 0.5242, + "step": 390 + }, + { + "epoch": 0.7122040072859745, + "grad_norm": 1.2474786830204843, + "learning_rate": 1.6183887294820995e-06, + "loss": 0.4941, + "step": 
391 + }, + { + "epoch": 0.7140255009107468, + "grad_norm": 1.2316800527163396, + "learning_rate": 1.5994526103792852e-06, + "loss": 0.4402, + "step": 392 + }, + { + "epoch": 0.7158469945355191, + "grad_norm": 1.267209993054678, + "learning_rate": 1.5806002028494509e-06, + "loss": 0.4665, + "step": 393 + }, + { + "epoch": 0.7176684881602914, + "grad_norm": 1.0939936334202054, + "learning_rate": 1.5618321643112738e-06, + "loss": 0.5385, + "step": 394 + }, + { + "epoch": 0.7194899817850637, + "grad_norm": 1.2229947652634188, + "learning_rate": 1.5431491492413286e-06, + "loss": 0.563, + "step": 395 + }, + { + "epoch": 0.7213114754098361, + "grad_norm": 1.3179598742616503, + "learning_rate": 1.52455180915126e-06, + "loss": 0.5043, + "step": 396 + }, + { + "epoch": 0.7231329690346083, + "grad_norm": 1.177007091506137, + "learning_rate": 1.506040792565066e-06, + "loss": 0.5163, + "step": 397 + }, + { + "epoch": 0.7249544626593807, + "grad_norm": 1.1922238886428833, + "learning_rate": 1.487616744996484e-06, + "loss": 0.4374, + "step": 398 + }, + { + "epoch": 0.726775956284153, + "grad_norm": 1.434536851445206, + "learning_rate": 1.4692803089264772e-06, + "loss": 0.4976, + "step": 399 + }, + { + "epoch": 0.7285974499089253, + "grad_norm": 1.1083154204777248, + "learning_rate": 1.4510321237808377e-06, + "loss": 0.4397, + "step": 400 + }, + { + "epoch": 0.7304189435336976, + "grad_norm": 1.446108741355073, + "learning_rate": 1.4328728259078746e-06, + "loss": 0.4707, + "step": 401 + }, + { + "epoch": 0.73224043715847, + "grad_norm": 1.3655307747592895, + "learning_rate": 1.414803048556236e-06, + "loss": 0.4509, + "step": 402 + }, + { + "epoch": 0.7340619307832422, + "grad_norm": 1.2590237151009611, + "learning_rate": 1.396823421852825e-06, + "loss": 0.4137, + "step": 403 + }, + { + "epoch": 0.7358834244080146, + "grad_norm": 1.3245324404535452, + "learning_rate": 1.3789345727808207e-06, + "loss": 0.5792, + "step": 404 + }, + { + "epoch": 0.7377049180327869, + "grad_norm": 1.2386242097070082, + "learning_rate": 1.3611371251578114e-06, + "loss": 0.5098, + "step": 405 + }, + { + "epoch": 0.7395264116575592, + "grad_norm": 1.1915406492227152, + "learning_rate": 1.3434316996140553e-06, + "loss": 0.5163, + "step": 406 + }, + { + "epoch": 0.7413479052823315, + "grad_norm": 1.4026256649335254, + "learning_rate": 1.3258189135708229e-06, + "loss": 0.4404, + "step": 407 + }, + { + "epoch": 0.7431693989071039, + "grad_norm": 1.2980107514185337, + "learning_rate": 1.3082993812188735e-06, + "loss": 0.54, + "step": 408 + }, + { + "epoch": 0.7449908925318761, + "grad_norm": 1.2715578342804374, + "learning_rate": 1.2908737134970364e-06, + "loss": 0.4448, + "step": 409 + }, + { + "epoch": 0.7468123861566485, + "grad_norm": 1.143848333980239, + "learning_rate": 1.2735425180709039e-06, + "loss": 0.4223, + "step": 410 + }, + { + "epoch": 0.7486338797814208, + "grad_norm": 1.1255231193873407, + "learning_rate": 1.2563063993116482e-06, + "loss": 0.46, + "step": 411 + }, + { + "epoch": 0.7504553734061931, + "grad_norm": 1.3419216896904982, + "learning_rate": 1.239165958274933e-06, + "loss": 0.5361, + "step": 412 + }, + { + "epoch": 0.7522768670309654, + "grad_norm": 1.4023805964702172, + "learning_rate": 1.2221217926799652e-06, + "loss": 0.4397, + "step": 413 + }, + { + "epoch": 0.7540983606557377, + "grad_norm": 1.330897728853757, + "learning_rate": 1.2051744968886489e-06, + "loss": 0.4957, + "step": 414 + }, + { + "epoch": 0.75591985428051, + "grad_norm": 1.1838422813407694, + "learning_rate": 1.1883246618848533e-06, + 
"loss": 0.4462, + "step": 415 + }, + { + "epoch": 0.7577413479052824, + "grad_norm": 1.1723617674919102, + "learning_rate": 1.1715728752538101e-06, + "loss": 0.451, + "step": 416 + }, + { + "epoch": 0.7595628415300546, + "grad_norm": 1.206209172251872, + "learning_rate": 1.1549197211616203e-06, + "loss": 0.415, + "step": 417 + }, + { + "epoch": 0.761384335154827, + "grad_norm": 1.3707028766377027, + "learning_rate": 1.1383657803348835e-06, + "loss": 0.4914, + "step": 418 + }, + { + "epoch": 0.7632058287795993, + "grad_norm": 1.8225393046839982, + "learning_rate": 1.1219116300404486e-06, + "loss": 0.5696, + "step": 419 + }, + { + "epoch": 0.7650273224043715, + "grad_norm": 1.2961370764387552, + "learning_rate": 1.10555784406528e-06, + "loss": 0.4906, + "step": 420 + }, + { + "epoch": 0.7668488160291439, + "grad_norm": 1.273661302678146, + "learning_rate": 1.089304992696455e-06, + "loss": 0.4176, + "step": 421 + }, + { + "epoch": 0.7686703096539163, + "grad_norm": 1.1358792116562773, + "learning_rate": 1.0731536427012695e-06, + "loss": 0.5669, + "step": 422 + }, + { + "epoch": 0.7704918032786885, + "grad_norm": 1.1543809051499938, + "learning_rate": 1.0571043573074736e-06, + "loss": 0.4731, + "step": 423 + }, + { + "epoch": 0.7723132969034608, + "grad_norm": 1.268995763936549, + "learning_rate": 1.041157696183641e-06, + "loss": 0.4944, + "step": 424 + }, + { + "epoch": 0.7741347905282332, + "grad_norm": 1.3502204931718385, + "learning_rate": 1.0253142154196415e-06, + "loss": 0.5632, + "step": 425 + }, + { + "epoch": 0.7759562841530054, + "grad_norm": 1.2861585316767994, + "learning_rate": 1.0095744675072525e-06, + "loss": 0.5321, + "step": 426 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 1.2500169007804636, + "learning_rate": 9.93939001320895e-07, + "loss": 0.5018, + "step": 427 + }, + { + "epoch": 0.7795992714025501, + "grad_norm": 1.1927946674927596, + "learning_rate": 9.784083620984884e-07, + "loss": 0.4495, + "step": 428 + }, + { + "epoch": 0.7814207650273224, + "grad_norm": 1.211453529275201, + "learning_rate": 9.62983091422446e-07, + "loss": 0.459, + "step": 429 + }, + { + "epoch": 0.7832422586520947, + "grad_norm": 1.191077248020614, + "learning_rate": 9.476637272007746e-07, + "loss": 0.5285, + "step": 430 + }, + { + "epoch": 0.785063752276867, + "grad_norm": 1.5578769325659387, + "learning_rate": 9.324508036483303e-07, + "loss": 0.4826, + "step": 431 + }, + { + "epoch": 0.7868852459016393, + "grad_norm": 1.143092830687595, + "learning_rate": 9.173448512681848e-07, + "loss": 0.4689, + "step": 432 + }, + { + "epoch": 0.7887067395264117, + "grad_norm": 1.3178766788423166, + "learning_rate": 9.023463968331238e-07, + "loss": 0.423, + "step": 433 + }, + { + "epoch": 0.7905282331511839, + "grad_norm": 1.270598374226235, + "learning_rate": 8.874559633672754e-07, + "loss": 0.5087, + "step": 434 + }, + { + "epoch": 0.7923497267759563, + "grad_norm": 1.3159704576848927, + "learning_rate": 8.726740701278808e-07, + "loss": 0.4658, + "step": 435 + }, + { + "epoch": 0.7941712204007286, + "grad_norm": 1.308828411323957, + "learning_rate": 8.580012325871773e-07, + "loss": 0.4874, + "step": 436 + }, + { + "epoch": 0.7959927140255009, + "grad_norm": 1.1534207003253993, + "learning_rate": 8.434379624144261e-07, + "loss": 0.559, + "step": 437 + }, + { + "epoch": 0.7978142076502732, + "grad_norm": 1.2149317958628532, + "learning_rate": 8.289847674580702e-07, + "loss": 0.4374, + "step": 438 + }, + { + "epoch": 0.7996357012750456, + "grad_norm": 1.7952195261148596, + "learning_rate": 
8.146421517280226e-07, + "loss": 0.4826, + "step": 439 + }, + { + "epoch": 0.8014571948998178, + "grad_norm": 1.204321359741705, + "learning_rate": 8.004106153780967e-07, + "loss": 0.5445, + "step": 440 + }, + { + "epoch": 0.8032786885245902, + "grad_norm": 1.3038648482659696, + "learning_rate": 7.862906546885559e-07, + "loss": 0.4759, + "step": 441 + }, + { + "epoch": 0.8051001821493625, + "grad_norm": 1.3025645847276193, + "learning_rate": 7.722827620488108e-07, + "loss": 0.4493, + "step": 442 + }, + { + "epoch": 0.8069216757741348, + "grad_norm": 1.3302596060388403, + "learning_rate": 7.583874259402545e-07, + "loss": 0.5191, + "step": 443 + }, + { + "epoch": 0.8087431693989071, + "grad_norm": 1.2390558736887658, + "learning_rate": 7.446051309192203e-07, + "loss": 0.5035, + "step": 444 + }, + { + "epoch": 0.8105646630236795, + "grad_norm": 1.2987933255329953, + "learning_rate": 7.30936357600088e-07, + "loss": 0.5051, + "step": 445 + }, + { + "epoch": 0.8123861566484517, + "grad_norm": 1.3795445751301032, + "learning_rate": 7.173815826385246e-07, + "loss": 0.4551, + "step": 446 + }, + { + "epoch": 0.8142076502732241, + "grad_norm": 1.2599528493900025, + "learning_rate": 7.039412787148586e-07, + "loss": 0.4923, + "step": 447 + }, + { + "epoch": 0.8160291438979964, + "grad_norm": 1.1755103488000105, + "learning_rate": 6.906159145176049e-07, + "loss": 0.5205, + "step": 448 + }, + { + "epoch": 0.8178506375227687, + "grad_norm": 1.301352850179719, + "learning_rate": 6.774059547271087e-07, + "loss": 0.516, + "step": 449 + }, + { + "epoch": 0.819672131147541, + "grad_norm": 1.2962192786547895, + "learning_rate": 6.643118599993518e-07, + "loss": 0.4872, + "step": 450 + }, + { + "epoch": 0.8214936247723132, + "grad_norm": 1.5585419654309247, + "learning_rate": 6.513340869498858e-07, + "loss": 0.5804, + "step": 451 + }, + { + "epoch": 0.8233151183970856, + "grad_norm": 1.387119348514666, + "learning_rate": 6.384730881379048e-07, + "loss": 0.5619, + "step": 452 + }, + { + "epoch": 0.825136612021858, + "grad_norm": 1.4119192925549904, + "learning_rate": 6.257293120504692e-07, + "loss": 0.631, + "step": 453 + }, + { + "epoch": 0.8269581056466302, + "grad_norm": 1.3308860456915488, + "learning_rate": 6.131032030868635e-07, + "loss": 0.485, + "step": 454 + }, + { + "epoch": 0.8287795992714025, + "grad_norm": 1.4307724793695609, + "learning_rate": 6.005952015430993e-07, + "loss": 0.5642, + "step": 455 + }, + { + "epoch": 0.8306010928961749, + "grad_norm": 1.2106456273020985, + "learning_rate": 5.882057435965619e-07, + "loss": 0.4985, + "step": 456 + }, + { + "epoch": 0.8324225865209471, + "grad_norm": 1.3115620293405426, + "learning_rate": 5.759352612907999e-07, + "loss": 0.5307, + "step": 457 + }, + { + "epoch": 0.8342440801457195, + "grad_norm": 1.248365530305497, + "learning_rate": 5.637841825204588e-07, + "loss": 0.5194, + "step": 458 + }, + { + "epoch": 0.8360655737704918, + "grad_norm": 1.2238266091876802, + "learning_rate": 5.517529310163627e-07, + "loss": 0.5482, + "step": 459 + }, + { + "epoch": 0.8378870673952641, + "grad_norm": 1.4210179624401589, + "learning_rate": 5.398419263307281e-07, + "loss": 0.4293, + "step": 460 + }, + { + "epoch": 0.8397085610200364, + "grad_norm": 1.2412519735070302, + "learning_rate": 5.280515838225477e-07, + "loss": 0.4852, + "step": 461 + }, + { + "epoch": 0.8415300546448088, + "grad_norm": 1.2092346961035594, + "learning_rate": 5.163823146430944e-07, + "loss": 0.5908, + "step": 462 + }, + { + "epoch": 0.843351548269581, + "grad_norm": 1.3880323085259179, + 
"learning_rate": 5.048345257215892e-07, + "loss": 0.5084, + "step": 463 + }, + { + "epoch": 0.8451730418943534, + "grad_norm": 1.2676463291192996, + "learning_rate": 4.934086197510088e-07, + "loss": 0.5547, + "step": 464 + }, + { + "epoch": 0.8469945355191257, + "grad_norm": 1.2609901690813994, + "learning_rate": 4.821049951740441e-07, + "loss": 0.5434, + "step": 465 + }, + { + "epoch": 0.848816029143898, + "grad_norm": 1.3054902000162272, + "learning_rate": 4.7092404616920547e-07, + "loss": 0.4394, + "step": 466 + }, + { + "epoch": 0.8506375227686703, + "grad_norm": 1.323905342489299, + "learning_rate": 4.59866162637077e-07, + "loss": 0.5067, + "step": 467 + }, + { + "epoch": 0.8524590163934426, + "grad_norm": 1.4483470172100243, + "learning_rate": 4.4893173018671816e-07, + "loss": 0.4553, + "step": 468 + }, + { + "epoch": 0.8542805100182149, + "grad_norm": 1.2403981869880174, + "learning_rate": 4.3812113012222164e-07, + "loss": 0.5209, + "step": 469 + }, + { + "epoch": 0.8561020036429873, + "grad_norm": 1.8179855460416823, + "learning_rate": 4.2743473942941177e-07, + "loss": 0.4736, + "step": 470 + }, + { + "epoch": 0.8579234972677595, + "grad_norm": 1.2339086882585282, + "learning_rate": 4.168729307626977e-07, + "loss": 0.5098, + "step": 471 + }, + { + "epoch": 0.8597449908925319, + "grad_norm": 1.276189228460651, + "learning_rate": 4.0643607243208455e-07, + "loss": 0.4989, + "step": 472 + }, + { + "epoch": 0.8615664845173042, + "grad_norm": 1.2203026643363581, + "learning_rate": 3.9612452839032384e-07, + "loss": 0.4462, + "step": 473 + }, + { + "epoch": 0.8633879781420765, + "grad_norm": 1.242544747026145, + "learning_rate": 3.859386582202231e-07, + "loss": 0.5232, + "step": 474 + }, + { + "epoch": 0.8652094717668488, + "grad_norm": 1.4374137888249117, + "learning_rate": 3.758788171221079e-07, + "loss": 0.5248, + "step": 475 + }, + { + "epoch": 0.8670309653916212, + "grad_norm": 1.1588668244652476, + "learning_rate": 3.659453559014345e-07, + "loss": 0.4631, + "step": 476 + }, + { + "epoch": 0.8688524590163934, + "grad_norm": 1.3937157924335801, + "learning_rate": 3.561386209565582e-07, + "loss": 0.4675, + "step": 477 + }, + { + "epoch": 0.8706739526411658, + "grad_norm": 1.376102069963406, + "learning_rate": 3.464589542666485e-07, + "loss": 0.458, + "step": 478 + }, + { + "epoch": 0.8724954462659381, + "grad_norm": 1.2706836490870992, + "learning_rate": 3.3690669337976996e-07, + "loss": 0.4851, + "step": 479 + }, + { + "epoch": 0.8743169398907104, + "grad_norm": 1.221114304425684, + "learning_rate": 3.2748217140111e-07, + "loss": 0.5032, + "step": 480 + }, + { + "epoch": 0.8761384335154827, + "grad_norm": 1.3883301701023747, + "learning_rate": 3.1818571698135976e-07, + "loss": 0.4749, + "step": 481 + }, + { + "epoch": 0.8779599271402551, + "grad_norm": 1.1787371549756893, + "learning_rate": 3.0901765430525337e-07, + "loss": 0.5106, + "step": 482 + }, + { + "epoch": 0.8797814207650273, + "grad_norm": 1.346207438421883, + "learning_rate": 2.9997830308027003e-07, + "loss": 0.4383, + "step": 483 + }, + { + "epoch": 0.8816029143897997, + "grad_norm": 1.348993463677863, + "learning_rate": 2.9106797852547483e-07, + "loss": 0.4673, + "step": 484 + }, + { + "epoch": 0.8834244080145719, + "grad_norm": 1.201847671893057, + "learning_rate": 2.8228699136053726e-07, + "loss": 0.5759, + "step": 485 + }, + { + "epoch": 0.8852459016393442, + "grad_norm": 1.340461205562142, + "learning_rate": 2.7363564779488446e-07, + "loss": 0.4465, + "step": 486 + }, + { + "epoch": 0.8870673952641166, + "grad_norm": 
1.1752701027649086, + "learning_rate": 2.6511424951703244e-07, + "loss": 0.4118, + "step": 487 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 1.2789959623625877, + "learning_rate": 2.567230936840632e-07, + "loss": 0.5079, + "step": 488 + }, + { + "epoch": 0.8907103825136612, + "grad_norm": 1.2640082921345366, + "learning_rate": 2.4846247291125897e-07, + "loss": 0.4711, + "step": 489 + }, + { + "epoch": 0.8925318761384335, + "grad_norm": 1.1282519699248084, + "learning_rate": 2.4033267526190057e-07, + "loss": 0.5091, + "step": 490 + }, + { + "epoch": 0.8943533697632058, + "grad_norm": 1.2710706301896888, + "learning_rate": 2.323339842372234e-07, + "loss": 0.4149, + "step": 491 + }, + { + "epoch": 0.8961748633879781, + "grad_norm": 1.4092310204502283, + "learning_rate": 2.2446667876652968e-07, + "loss": 0.5713, + "step": 492 + }, + { + "epoch": 0.8979963570127505, + "grad_norm": 1.1624712007482816, + "learning_rate": 2.1673103319746146e-07, + "loss": 0.4281, + "step": 493 + }, + { + "epoch": 0.8998178506375227, + "grad_norm": 1.2035233792963602, + "learning_rate": 2.0912731728643362e-07, + "loss": 0.4497, + "step": 494 + }, + { + "epoch": 0.9016393442622951, + "grad_norm": 1.2087725899212483, + "learning_rate": 2.0165579618922757e-07, + "loss": 0.4554, + "step": 495 + }, + { + "epoch": 0.9034608378870674, + "grad_norm": 1.3284543377876135, + "learning_rate": 1.943167304517459e-07, + "loss": 0.4693, + "step": 496 + }, + { + "epoch": 0.9052823315118397, + "grad_norm": 1.4011820069689847, + "learning_rate": 1.871103760009234e-07, + "loss": 0.5348, + "step": 497 + }, + { + "epoch": 0.907103825136612, + "grad_norm": 1.2193252171112197, + "learning_rate": 1.8003698413580427e-07, + "loss": 0.4572, + "step": 498 + }, + { + "epoch": 0.9089253187613844, + "grad_norm": 1.1222006814434855, + "learning_rate": 1.7309680151878126e-07, + "loss": 0.5136, + "step": 499 + }, + { + "epoch": 0.9107468123861566, + "grad_norm": 1.2891850954780644, + "learning_rate": 1.6629007016698916e-07, + "loss": 0.4979, + "step": 500 + }, + { + "epoch": 0.912568306010929, + "grad_norm": 1.3322769925178861, + "learning_rate": 1.5961702744386973e-07, + "loss": 0.5423, + "step": 501 + }, + { + "epoch": 0.9143897996357013, + "grad_norm": 1.160971888154378, + "learning_rate": 1.5307790605089045e-07, + "loss": 0.4617, + "step": 502 + }, + { + "epoch": 0.9162112932604736, + "grad_norm": 1.364660127297357, + "learning_rate": 1.4667293401943393e-07, + "loss": 0.4879, + "step": 503 + }, + { + "epoch": 0.9180327868852459, + "grad_norm": 1.220502331061608, + "learning_rate": 1.404023347028418e-07, + "loss": 0.4495, + "step": 504 + }, + { + "epoch": 0.9198542805100182, + "grad_norm": 1.5127357404088118, + "learning_rate": 1.342663267686297e-07, + "loss": 0.4893, + "step": 505 + }, + { + "epoch": 0.9216757741347905, + "grad_norm": 1.0948101040744869, + "learning_rate": 1.2826512419085922e-07, + "loss": 0.4689, + "step": 506 + }, + { + "epoch": 0.9234972677595629, + "grad_norm": 1.2102894214186626, + "learning_rate": 1.223989362426785e-07, + "loss": 0.4728, + "step": 507 + }, + { + "epoch": 0.9253187613843351, + "grad_norm": 1.2964990563502878, + "learning_rate": 1.1666796748902142e-07, + "loss": 0.4591, + "step": 508 + }, + { + "epoch": 0.9271402550091075, + "grad_norm": 1.3179562572508972, + "learning_rate": 1.1107241777947774e-07, + "loss": 0.5085, + "step": 509 + }, + { + "epoch": 0.9289617486338798, + "grad_norm": 1.124362582099017, + "learning_rate": 1.0561248224132091e-07, + "loss": 0.43, + "step": 510 + }, + { + "epoch": 
0.930783242258652, + "grad_norm": 1.263872009096953, + "learning_rate": 1.0028835127270552e-07, + "loss": 0.4993, + "step": 511 + }, + { + "epoch": 0.9326047358834244, + "grad_norm": 1.2997351666975574, + "learning_rate": 9.510021053602679e-08, + "loss": 0.4404, + "step": 512 + }, + { + "epoch": 0.9344262295081968, + "grad_norm": 1.329253107069971, + "learning_rate": 9.004824095144581e-08, + "loss": 0.5285, + "step": 513 + }, + { + "epoch": 0.936247723132969, + "grad_norm": 1.0886350408667627, + "learning_rate": 8.513261869058209e-08, + "loss": 0.4606, + "step": 514 + }, + { + "epoch": 0.9380692167577414, + "grad_norm": 1.2876516793995008, + "learning_rate": 8.035351517036914e-08, + "loss": 0.4777, + "step": 515 + }, + { + "epoch": 0.9398907103825137, + "grad_norm": 1.2692698581081259, + "learning_rate": 7.571109704707623e-08, + "loss": 0.4554, + "step": 516 + }, + { + "epoch": 0.941712204007286, + "grad_norm": 1.2791868173597516, + "learning_rate": 7.120552621049825e-08, + "loss": 0.4488, + "step": 517 + }, + { + "epoch": 0.9435336976320583, + "grad_norm": 1.208557284284356, + "learning_rate": 6.68369597783096e-08, + "loss": 0.4034, + "step": 518 + }, + { + "epoch": 0.9453551912568307, + "grad_norm": 1.2963091541864111, + "learning_rate": 6.260555009058288e-08, + "loss": 0.4838, + "step": 519 + }, + { + "epoch": 0.9471766848816029, + "grad_norm": 1.3727003252206291, + "learning_rate": 5.851144470448144e-08, + "loss": 0.4996, + "step": 520 + }, + { + "epoch": 0.9489981785063752, + "grad_norm": 1.3210931262200563, + "learning_rate": 5.455478638911071e-08, + "loss": 0.5332, + "step": 521 + }, + { + "epoch": 0.9508196721311475, + "grad_norm": 1.2713274273103237, + "learning_rate": 5.073571312053815e-08, + "loss": 0.4783, + "step": 522 + }, + { + "epoch": 0.9526411657559198, + "grad_norm": 1.2497269961588167, + "learning_rate": 4.705435807698555e-08, + "loss": 0.4485, + "step": 523 + }, + { + "epoch": 0.9544626593806922, + "grad_norm": 1.2309482179042086, + "learning_rate": 4.351084963418117e-08, + "loss": 0.4331, + "step": 524 + }, + { + "epoch": 0.9562841530054644, + "grad_norm": 1.1950945289140942, + "learning_rate": 4.010531136088691e-08, + "loss": 0.4604, + "step": 525 + }, + { + "epoch": 0.9581056466302368, + "grad_norm": 1.2931423414382452, + "learning_rate": 3.683786201458439e-08, + "loss": 0.3767, + "step": 526 + }, + { + "epoch": 0.9599271402550091, + "grad_norm": 1.389899139107159, + "learning_rate": 3.370861553733784e-08, + "loss": 0.4897, + "step": 527 + }, + { + "epoch": 0.9617486338797814, + "grad_norm": 1.362732666201115, + "learning_rate": 3.071768105181993e-08, + "loss": 0.499, + "step": 528 + }, + { + "epoch": 0.9635701275045537, + "grad_norm": 1.1771528381852838, + "learning_rate": 2.786516285750373e-08, + "loss": 0.5627, + "step": 529 + }, + { + "epoch": 0.9653916211293261, + "grad_norm": 1.2404159607795244, + "learning_rate": 2.5151160427029582e-08, + "loss": 0.4175, + "step": 530 + }, + { + "epoch": 0.9672131147540983, + "grad_norm": 1.3555377819209309, + "learning_rate": 2.2575768402733232e-08, + "loss": 0.4961, + "step": 531 + }, + { + "epoch": 0.9690346083788707, + "grad_norm": 1.4576298033097894, + "learning_rate": 2.013907659334624e-08, + "loss": 0.5458, + "step": 532 + }, + { + "epoch": 0.970856102003643, + "grad_norm": 1.4545178388397544, + "learning_rate": 1.7841169970866042e-08, + "loss": 0.557, + "step": 533 + }, + { + "epoch": 0.9726775956284153, + "grad_norm": 1.2516530596811049, + "learning_rate": 1.5682128667589e-08, + "loss": 0.5176, + "step": 534 + }, + { 
+ "epoch": 0.9744990892531876, + "grad_norm": 1.395868262099181, + "learning_rate": 1.3662027973320612e-08, + "loss": 0.6025, + "step": 535 + }, + { + "epoch": 0.97632058287796, + "grad_norm": 1.228580089232679, + "learning_rate": 1.1780938332746515e-08, + "loss": 0.4352, + "step": 536 + }, + { + "epoch": 0.9781420765027322, + "grad_norm": 1.307882851191111, + "learning_rate": 1.0038925342977122e-08, + "loss": 0.4912, + "step": 537 + }, + { + "epoch": 0.9799635701275046, + "grad_norm": 1.412138408392618, + "learning_rate": 8.43604975126011e-09, + "loss": 0.5536, + "step": 538 + }, + { + "epoch": 0.9817850637522769, + "grad_norm": 1.3540327476191583, + "learning_rate": 6.972367452863004e-09, + "loss": 0.4909, + "step": 539 + }, + { + "epoch": 0.9836065573770492, + "grad_norm": 1.2531752735063777, + "learning_rate": 5.647929489122738e-09, + "loss": 0.4375, + "step": 540 + }, + { + "epoch": 0.9854280510018215, + "grad_norm": 1.2878472135871981, + "learning_rate": 4.462782045664859e-09, + "loss": 0.4857, + "step": 541 + }, + { + "epoch": 0.9872495446265938, + "grad_norm": 1.2374394809536116, + "learning_rate": 3.4169664507959216e-09, + "loss": 0.4435, + "step": 542 + }, + { + "epoch": 0.9890710382513661, + "grad_norm": 1.3708150206530674, + "learning_rate": 2.5105191740597553e-09, + "loss": 0.5403, + "step": 543 + }, + { + "epoch": 0.9908925318761385, + "grad_norm": 1.1853808948747229, + "learning_rate": 1.7434718249664803e-09, + "loss": 0.4532, + "step": 544 + }, + { + "epoch": 0.9927140255009107, + "grad_norm": 1.190207243267208, + "learning_rate": 1.1158511518902791e-09, + "loss": 0.4696, + "step": 545 + }, + { + "epoch": 0.994535519125683, + "grad_norm": 1.2336375950670702, + "learning_rate": 6.276790411372524e-10, + "loss": 0.4729, + "step": 546 + }, + { + "epoch": 0.9963570127504554, + "grad_norm": 1.1739246355744901, + "learning_rate": 2.789725161806977e-10, + "loss": 0.5413, + "step": 547 + }, + { + "epoch": 0.9981785063752276, + "grad_norm": 1.2297320117715114, + "learning_rate": 6.974373706869486e-11, + "loss": 0.4542, + "step": 548 + }, + { + "epoch": 1.0, + "grad_norm": 1.2871587171589334, + "learning_rate": 0.0, + "loss": 0.4884, + "step": 549 + }, + { + "epoch": 1.0, + "eval_accuracy": 0.8146578886019198, + "eval_accuracy_first_token": 0.9528399668325042, + "eval_accuracy_first_token_all": 0.9712550007408505, + "eval_accuracy_first_token_all_total": 6749, + "eval_accuracy_first_token_calculate": 0.8636363636363636, + "eval_accuracy_first_token_calculate_total": 44, + "eval_accuracy_first_token_execute": 1.0, + "eval_accuracy_first_token_execute_total": 202, + "eval_accuracy_first_token_get": 0.9343544857768052, + "eval_accuracy_first_token_get_total": 457, + "eval_accuracy_first_token_python": 0.8753768844221106, + "eval_accuracy_first_token_python_total": 995, + "eval_loss": 0.5619608163833618, + "eval_perplexity": 1.1109600199882654, + "eval_runtime": 283.0681, + "eval_samples_per_second": 2.452, + "eval_steps_per_second": 0.615, + "eval_total_number_first_token": 9648, + "step": 549 + }, + { + "epoch": 1.0, + "step": 549, + "total_flos": 229794172108800.0, + "train_loss": 0.5332211447652355, + "train_runtime": 12454.5665, + "train_samples_per_second": 0.705, + "train_steps_per_second": 0.044 + } + ], + "logging_steps": 1.0, + "max_steps": 549, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 5.0, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": 
false, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 229794172108800.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}