{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.3535169785169785, "eval_steps": 500, "global_step": 500000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 4.596311569213867, "learning_rate": 6.249999999999999e-07, "loss": 9.2114, "step": 500 }, { "epoch": 0.0, "grad_norm": 5.16270637512207, "learning_rate": 1.2499999999999999e-06, "loss": 7.3081, "step": 1000 }, { "epoch": 0.0, "grad_norm": 3.267263412475586, "learning_rate": 1.875e-06, "loss": 6.1634, "step": 1500 }, { "epoch": 0.01, "grad_norm": 2.1591992378234863, "learning_rate": 2.4999999999999998e-06, "loss": 5.421, "step": 2000 }, { "epoch": 0.01, "grad_norm": 2.160722494125366, "learning_rate": 3.125e-06, "loss": 4.856, "step": 2500 }, { "epoch": 0.01, "grad_norm": 2.330343723297119, "learning_rate": 3.75e-06, "loss": 4.3826, "step": 3000 }, { "epoch": 0.01, "grad_norm": 2.1275618076324463, "learning_rate": 4.3750000000000005e-06, "loss": 3.9848, "step": 3500 }, { "epoch": 0.01, "grad_norm": 2.1402294635772705, "learning_rate": 4.9999999999999996e-06, "loss": 3.6491, "step": 4000 }, { "epoch": 0.01, "grad_norm": 2.159619092941284, "learning_rate": 5.625e-06, "loss": 3.3861, "step": 4500 }, { "epoch": 0.01, "grad_norm": 2.5018796920776367, "learning_rate": 6.25e-06, "loss": 3.1845, "step": 5000 }, { "epoch": 0.01, "grad_norm": 1.9996334314346313, "learning_rate": 6.875e-06, "loss": 3.0335, "step": 5500 }, { "epoch": 0.02, "grad_norm": 2.103320598602295, "learning_rate": 7.5e-06, "loss": 2.9096, "step": 6000 }, { "epoch": 0.02, "grad_norm": 2.025847911834717, "learning_rate": 8.125e-06, "loss": 2.8088, "step": 6500 }, { "epoch": 0.02, "grad_norm": 2.030522108078003, "learning_rate": 8.750000000000001e-06, "loss": 2.7156, "step": 7000 }, { "epoch": 0.02, "grad_norm": 1.992558479309082, "learning_rate": 9.375000000000001e-06, "loss": 2.6262, "step": 7500 }, { "epoch": 0.02, "grad_norm": 2.1512062549591064, "learning_rate": 9.999999999999999e-06, "loss": 2.5432, "step": 8000 }, { "epoch": 0.02, "grad_norm": 2.0734474658966064, "learning_rate": 1.0625e-05, "loss": 2.4722, "step": 8500 }, { "epoch": 0.02, "grad_norm": 1.9478808641433716, "learning_rate": 1.125e-05, "loss": 2.4111, "step": 9000 }, { "epoch": 0.03, "grad_norm": 1.762665033340454, "learning_rate": 1.1874999999999999e-05, "loss": 2.3521, "step": 9500 }, { "epoch": 0.03, "grad_norm": 1.8274019956588745, "learning_rate": 1.25e-05, "loss": 2.3099, "step": 10000 }, { "epoch": 0.03, "grad_norm": 1.905918002128601, "learning_rate": 1.3125e-05, "loss": 2.2629, "step": 10500 }, { "epoch": 0.03, "grad_norm": 1.8081414699554443, "learning_rate": 1.375e-05, "loss": 2.2239, "step": 11000 }, { "epoch": 0.03, "grad_norm": 7.712226867675781, "learning_rate": 1.4375e-05, "loss": 2.1907, "step": 11500 }, { "epoch": 0.03, "grad_norm": 1.6963427066802979, "learning_rate": 1.5e-05, "loss": 2.1602, "step": 12000 }, { "epoch": 0.03, "grad_norm": 1.717537522315979, "learning_rate": 1.5625e-05, "loss": 2.1384, "step": 12500 }, { "epoch": 0.04, "grad_norm": 1.745806336402893, "learning_rate": 1.625e-05, "loss": 2.1061, "step": 13000 }, { "epoch": 0.04, "grad_norm": 1.7633601427078247, "learning_rate": 1.6875e-05, "loss": 2.0838, "step": 13500 }, { "epoch": 0.04, "grad_norm": 1.7061880826950073, "learning_rate": 1.7500000000000002e-05, "loss": 2.0648, "step": 14000 }, { "epoch": 0.04, "grad_norm": 1.7471063137054443, "learning_rate": 1.8125e-05, "loss": 2.0462, "step": 14500 }, { "epoch": 0.04, "grad_norm": 1.705340027809143, "learning_rate": 1.8750000000000002e-05, "loss": 2.0281, "step": 15000 }, { "epoch": 0.04, "grad_norm": 41.675968170166016, "learning_rate": 1.9375e-05, "loss": 2.003, "step": 15500 }, { "epoch": 0.04, "grad_norm": 1.737722396850586, "learning_rate": 1.9999999999999998e-05, "loss": 1.9914, "step": 16000 }, { "epoch": 0.04, "grad_norm": 1.8232406377792358, "learning_rate": 2.0625e-05, "loss": 1.9724, "step": 16500 }, { "epoch": 0.05, "grad_norm": 1.8312487602233887, "learning_rate": 2.125e-05, "loss": 1.9577, "step": 17000 }, { "epoch": 0.05, "grad_norm": 2.025630235671997, "learning_rate": 2.1875e-05, "loss": 1.9411, "step": 17500 }, { "epoch": 0.05, "grad_norm": 1.9454607963562012, "learning_rate": 2.25e-05, "loss": 1.9263, "step": 18000 }, { "epoch": 0.05, "grad_norm": 1.637341856956482, "learning_rate": 2.3125000000000003e-05, "loss": 1.9221, "step": 18500 }, { "epoch": 0.05, "grad_norm": 1.846366286277771, "learning_rate": 2.3749999999999998e-05, "loss": 1.9086, "step": 19000 }, { "epoch": 0.05, "grad_norm": 1.802040457725525, "learning_rate": 2.4375e-05, "loss": 1.8961, "step": 19500 }, { "epoch": 0.05, "grad_norm": 1.7378031015396118, "learning_rate": 2.5e-05, "loss": 1.8893, "step": 20000 }, { "epoch": 0.06, "grad_norm": 1.6410856246948242, "learning_rate": 2.5625e-05, "loss": 1.8752, "step": 20500 }, { "epoch": 0.06, "grad_norm": 1.7153388261795044, "learning_rate": 2.625e-05, "loss": 1.862, "step": 21000 }, { "epoch": 0.06, "grad_norm": 1.6210004091262817, "learning_rate": 2.6875000000000003e-05, "loss": 1.855, "step": 21500 }, { "epoch": 0.06, "grad_norm": 1.6593818664550781, "learning_rate": 2.75e-05, "loss": 1.8478, "step": 22000 }, { "epoch": 0.06, "grad_norm": 1.659287691116333, "learning_rate": 2.8125e-05, "loss": 1.8353, "step": 22500 }, { "epoch": 0.06, "grad_norm": 1.703875184059143, "learning_rate": 2.875e-05, "loss": 1.8288, "step": 23000 }, { "epoch": 0.06, "grad_norm": 1.7122712135314941, "learning_rate": 2.9375e-05, "loss": 1.8289, "step": 23500 }, { "epoch": 0.06, "grad_norm": 1.6744304895401, "learning_rate": 3e-05, "loss": 1.8219, "step": 24000 }, { "epoch": 0.07, "grad_norm": 1.7783963680267334, "learning_rate": 2.9968487394957983e-05, "loss": 1.8141, "step": 24500 }, { "epoch": 0.07, "grad_norm": 1.7388477325439453, "learning_rate": 2.9936974789915968e-05, "loss": 1.805, "step": 25000 }, { "epoch": 0.07, "grad_norm": 1.6574689149856567, "learning_rate": 2.990546218487395e-05, "loss": 1.8005, "step": 25500 }, { "epoch": 0.07, "grad_norm": 1.6803966760635376, "learning_rate": 2.9873949579831935e-05, "loss": 1.7902, "step": 26000 }, { "epoch": 0.07, "grad_norm": 1.6314315795898438, "learning_rate": 2.9842436974789916e-05, "loss": 1.7832, "step": 26500 }, { "epoch": 0.07, "grad_norm": 1.6180912256240845, "learning_rate": 2.98109243697479e-05, "loss": 1.7774, "step": 27000 }, { "epoch": 0.07, "grad_norm": 1.6669533252716064, "learning_rate": 2.9779411764705883e-05, "loss": 1.774, "step": 27500 }, { "epoch": 0.08, "grad_norm": 1.5653916597366333, "learning_rate": 2.9747899159663868e-05, "loss": 1.7673, "step": 28000 }, { "epoch": 0.08, "grad_norm": 1.6632215976715088, "learning_rate": 2.971638655462185e-05, "loss": 1.7639, "step": 28500 }, { "epoch": 0.08, "grad_norm": 1.6262154579162598, "learning_rate": 2.9684873949579835e-05, "loss": 1.757, "step": 29000 }, { "epoch": 0.08, "grad_norm": 4.847783088684082, "learning_rate": 2.9653361344537817e-05, "loss": 1.9286, "step": 29500 }, { "epoch": 0.08, "grad_norm": 2.6416807174682617, "learning_rate": 2.9621848739495802e-05, "loss": 3.7773, "step": 30000 }, { "epoch": 0.08, "grad_norm": 3.4526023864746094, "learning_rate": 2.9590336134453784e-05, "loss": 4.3611, "step": 30500 }, { "epoch": 0.08, "grad_norm": 65.76104736328125, "learning_rate": 2.9558823529411766e-05, "loss": 4.5628, "step": 31000 }, { "epoch": 0.09, "grad_norm": 6.145516395568848, "learning_rate": 2.9527310924369747e-05, "loss": 4.388, "step": 31500 }, { "epoch": 0.09, "grad_norm": 4.991481781005859, "learning_rate": 2.949579831932773e-05, "loss": 4.1991, "step": 32000 }, { "epoch": 0.09, "grad_norm": 2.632403612136841, "learning_rate": 2.9464285714285714e-05, "loss": 3.7935, "step": 32500 }, { "epoch": 0.09, "grad_norm": 3.691666841506958, "learning_rate": 2.9432773109243696e-05, "loss": 3.4704, "step": 33000 }, { "epoch": 0.09, "grad_norm": 14.81291675567627, "learning_rate": 2.940126050420168e-05, "loss": 2.6663, "step": 33500 }, { "epoch": 0.09, "grad_norm": 2.4295215606689453, "learning_rate": 2.9369747899159663e-05, "loss": 2.4661, "step": 34000 }, { "epoch": 0.09, "grad_norm": 52.97163391113281, "learning_rate": 2.9338235294117648e-05, "loss": 2.1129, "step": 34500 }, { "epoch": 0.09, "grad_norm": 2.337153196334839, "learning_rate": 2.930672268907563e-05, "loss": 1.7961, "step": 35000 }, { "epoch": 0.1, "grad_norm": 6.669353008270264, "learning_rate": 2.9275210084033615e-05, "loss": 1.7907, "step": 35500 }, { "epoch": 0.1, "grad_norm": 1.5874249935150146, "learning_rate": 2.9243697478991596e-05, "loss": 1.7663, "step": 36000 }, { "epoch": 0.1, "grad_norm": 1.7114965915679932, "learning_rate": 2.921218487394958e-05, "loss": 1.7439, "step": 36500 }, { "epoch": 0.1, "grad_norm": 1.8134816884994507, "learning_rate": 2.9180672268907563e-05, "loss": 1.7361, "step": 37000 }, { "epoch": 0.1, "grad_norm": 1.505012035369873, "learning_rate": 2.9149159663865545e-05, "loss": 1.7323, "step": 37500 }, { "epoch": 0.1, "grad_norm": 1.6047751903533936, "learning_rate": 2.911764705882353e-05, "loss": 1.7212, "step": 38000 }, { "epoch": 0.1, "grad_norm": 1.5497486591339111, "learning_rate": 2.9086134453781512e-05, "loss": 1.7215, "step": 38500 }, { "epoch": 0.11, "grad_norm": 1.5367647409439087, "learning_rate": 2.9054621848739497e-05, "loss": 1.7027, "step": 39000 }, { "epoch": 0.11, "grad_norm": 4.223250865936279, "learning_rate": 2.902310924369748e-05, "loss": 1.6914, "step": 39500 }, { "epoch": 0.11, "grad_norm": 1.5872981548309326, "learning_rate": 2.8991596638655464e-05, "loss": 1.6878, "step": 40000 }, { "epoch": 0.11, "grad_norm": 1.5480022430419922, "learning_rate": 2.8960084033613446e-05, "loss": 1.6816, "step": 40500 }, { "epoch": 0.11, "grad_norm": 1.5464568138122559, "learning_rate": 2.892857142857143e-05, "loss": 1.6796, "step": 41000 }, { "epoch": 0.11, "grad_norm": 1.557543158531189, "learning_rate": 2.8897058823529413e-05, "loss": 1.6709, "step": 41500 }, { "epoch": 0.11, "grad_norm": 1.5462812185287476, "learning_rate": 2.8865546218487398e-05, "loss": 1.6728, "step": 42000 }, { "epoch": 0.12, "grad_norm": 1.5833927392959595, "learning_rate": 2.883403361344538e-05, "loss": 1.6676, "step": 42500 }, { "epoch": 0.12, "grad_norm": 1.63410222530365, "learning_rate": 2.8802521008403365e-05, "loss": 1.6696, "step": 43000 }, { "epoch": 0.12, "grad_norm": 1.4682618379592896, "learning_rate": 2.8771008403361346e-05, "loss": 1.6693, "step": 43500 }, { "epoch": 0.12, "grad_norm": 1.5386840105056763, "learning_rate": 2.8739495798319328e-05, "loss": 1.6602, "step": 44000 }, { "epoch": 0.12, "grad_norm": 1.5572445392608643, "learning_rate": 2.8707983193277313e-05, "loss": 1.6581, "step": 44500 }, { "epoch": 0.12, "grad_norm": 1.5247888565063477, "learning_rate": 2.8676470588235295e-05, "loss": 1.6546, "step": 45000 }, { "epoch": 0.12, "grad_norm": 1.5297437906265259, "learning_rate": 2.864495798319328e-05, "loss": 1.6467, "step": 45500 }, { "epoch": 0.12, "grad_norm": 1.5252556800842285, "learning_rate": 2.8613445378151262e-05, "loss": 1.6504, "step": 46000 }, { "epoch": 0.13, "grad_norm": 1.4626063108444214, "learning_rate": 2.8581932773109244e-05, "loss": 1.6441, "step": 46500 }, { "epoch": 0.13, "grad_norm": 1.511093020439148, "learning_rate": 2.8550420168067225e-05, "loss": 1.6433, "step": 47000 }, { "epoch": 0.13, "grad_norm": 1.572654366493225, "learning_rate": 2.851890756302521e-05, "loss": 1.6527, "step": 47500 }, { "epoch": 0.13, "grad_norm": 1.5643205642700195, "learning_rate": 2.8487394957983192e-05, "loss": 1.6376, "step": 48000 }, { "epoch": 0.13, "grad_norm": 1.497128963470459, "learning_rate": 2.8455882352941177e-05, "loss": 1.6397, "step": 48500 }, { "epoch": 0.13, "grad_norm": 1.464203953742981, "learning_rate": 2.842436974789916e-05, "loss": 1.6358, "step": 49000 }, { "epoch": 0.13, "grad_norm": 1.8414405584335327, "learning_rate": 2.8392857142857144e-05, "loss": 1.6366, "step": 49500 }, { "epoch": 0.14, "grad_norm": 1.7834322452545166, "learning_rate": 2.8361344537815126e-05, "loss": 1.642, "step": 50000 }, { "epoch": 0.14, "grad_norm": 1.477858304977417, "learning_rate": 2.8329831932773108e-05, "loss": 1.6342, "step": 50500 }, { "epoch": 0.14, "grad_norm": 1.5328236818313599, "learning_rate": 2.8298319327731093e-05, "loss": 1.6333, "step": 51000 }, { "epoch": 0.14, "grad_norm": 1.540300965309143, "learning_rate": 2.8266806722689075e-05, "loss": 1.6352, "step": 51500 }, { "epoch": 0.14, "grad_norm": 1.8767386674880981, "learning_rate": 2.823529411764706e-05, "loss": 1.6328, "step": 52000 }, { "epoch": 0.14, "grad_norm": 1.5387629270553589, "learning_rate": 2.820378151260504e-05, "loss": 1.632, "step": 52500 }, { "epoch": 0.14, "grad_norm": 1.6315770149230957, "learning_rate": 2.8172268907563027e-05, "loss": 1.627, "step": 53000 }, { "epoch": 0.14, "grad_norm": 5.726038455963135, "learning_rate": 2.814075630252101e-05, "loss": 1.6293, "step": 53500 }, { "epoch": 0.15, "grad_norm": 1.5697258710861206, "learning_rate": 2.8109243697478993e-05, "loss": 1.6211, "step": 54000 }, { "epoch": 0.15, "grad_norm": 1.5938401222229004, "learning_rate": 2.8077731092436975e-05, "loss": 1.6196, "step": 54500 }, { "epoch": 0.15, "grad_norm": 1.5256606340408325, "learning_rate": 2.804621848739496e-05, "loss": 1.6177, "step": 55000 }, { "epoch": 0.15, "grad_norm": 2.223390817642212, "learning_rate": 2.8014705882352942e-05, "loss": 1.6246, "step": 55500 }, { "epoch": 0.15, "grad_norm": 1.4948030710220337, "learning_rate": 2.7983193277310927e-05, "loss": 1.6239, "step": 56000 }, { "epoch": 0.15, "grad_norm": 1.5147298574447632, "learning_rate": 2.795168067226891e-05, "loss": 1.6164, "step": 56500 }, { "epoch": 0.15, "grad_norm": 1.5068755149841309, "learning_rate": 2.792016806722689e-05, "loss": 1.612, "step": 57000 }, { "epoch": 0.16, "grad_norm": 1.5074622631072998, "learning_rate": 2.7888655462184876e-05, "loss": 1.6113, "step": 57500 }, { "epoch": 0.16, "grad_norm": 1.4880355596542358, "learning_rate": 2.7857142857142858e-05, "loss": 1.6102, "step": 58000 }, { "epoch": 0.16, "grad_norm": 1.6379941701889038, "learning_rate": 2.7825630252100843e-05, "loss": 1.6084, "step": 58500 }, { "epoch": 0.16, "grad_norm": 1.4973347187042236, "learning_rate": 2.7794117647058824e-05, "loss": 1.6007, "step": 59000 }, { "epoch": 0.16, "grad_norm": 1.5474885702133179, "learning_rate": 2.776260504201681e-05, "loss": 1.6042, "step": 59500 }, { "epoch": 0.16, "grad_norm": 1.602220058441162, "learning_rate": 2.773109243697479e-05, "loss": 1.6106, "step": 60000 }, { "epoch": 0.16, "grad_norm": 1.6185747385025024, "learning_rate": 2.7699579831932776e-05, "loss": 1.6058, "step": 60500 }, { "epoch": 0.17, "grad_norm": 1.56905996799469, "learning_rate": 2.7668067226890758e-05, "loss": 1.6013, "step": 61000 }, { "epoch": 0.17, "grad_norm": 1.5619949102401733, "learning_rate": 2.763655462184874e-05, "loss": 1.6034, "step": 61500 }, { "epoch": 0.17, "grad_norm": 1.504239559173584, "learning_rate": 2.7605042016806722e-05, "loss": 1.6057, "step": 62000 }, { "epoch": 0.17, "grad_norm": 1.4879348278045654, "learning_rate": 2.7573529411764707e-05, "loss": 1.6021, "step": 62500 }, { "epoch": 0.17, "grad_norm": 1.5099623203277588, "learning_rate": 2.754201680672269e-05, "loss": 1.6026, "step": 63000 }, { "epoch": 0.17, "grad_norm": 1.4979091882705688, "learning_rate": 2.751050420168067e-05, "loss": 1.5986, "step": 63500 }, { "epoch": 0.17, "grad_norm": 1.4825040102005005, "learning_rate": 2.7478991596638655e-05, "loss": 1.5957, "step": 64000 }, { "epoch": 0.17, "grad_norm": 1.493453860282898, "learning_rate": 2.7447478991596637e-05, "loss": 1.5989, "step": 64500 }, { "epoch": 0.18, "grad_norm": 1.530388593673706, "learning_rate": 2.7415966386554622e-05, "loss": 1.5953, "step": 65000 }, { "epoch": 0.18, "grad_norm": 1.5459638833999634, "learning_rate": 2.7384453781512604e-05, "loss": 1.5957, "step": 65500 }, { "epoch": 0.18, "grad_norm": 2.0421242713928223, "learning_rate": 2.735294117647059e-05, "loss": 1.5984, "step": 66000 }, { "epoch": 0.18, "grad_norm": 1.4634993076324463, "learning_rate": 2.732142857142857e-05, "loss": 1.5897, "step": 66500 }, { "epoch": 0.18, "grad_norm": 1.530594825744629, "learning_rate": 2.7289915966386556e-05, "loss": 1.5902, "step": 67000 }, { "epoch": 0.18, "grad_norm": 1.5332798957824707, "learning_rate": 2.7258403361344538e-05, "loss": 1.5874, "step": 67500 }, { "epoch": 0.18, "grad_norm": 1.753754734992981, "learning_rate": 2.7226890756302523e-05, "loss": 1.59, "step": 68000 }, { "epoch": 0.19, "grad_norm": 1.5545145273208618, "learning_rate": 2.7195378151260505e-05, "loss": 1.5949, "step": 68500 }, { "epoch": 0.19, "grad_norm": 1.5194141864776611, "learning_rate": 2.716386554621849e-05, "loss": 1.588, "step": 69000 }, { "epoch": 0.19, "grad_norm": 1.532632827758789, "learning_rate": 2.713235294117647e-05, "loss": 1.5918, "step": 69500 }, { "epoch": 0.19, "grad_norm": 1.4970754384994507, "learning_rate": 2.7100840336134453e-05, "loss": 1.5851, "step": 70000 }, { "epoch": 0.19, "grad_norm": 1.4157612323760986, "learning_rate": 2.706932773109244e-05, "loss": 1.5823, "step": 70500 }, { "epoch": 0.19, "grad_norm": 1.5014020204544067, "learning_rate": 2.703781512605042e-05, "loss": 1.5847, "step": 71000 }, { "epoch": 0.19, "grad_norm": 1.4652481079101562, "learning_rate": 2.7006302521008405e-05, "loss": 1.5886, "step": 71500 }, { "epoch": 0.19, "grad_norm": 1.5810528993606567, "learning_rate": 2.6974789915966387e-05, "loss": 1.5805, "step": 72000 }, { "epoch": 0.2, "grad_norm": 1.4908738136291504, "learning_rate": 2.6943277310924372e-05, "loss": 1.5812, "step": 72500 }, { "epoch": 0.2, "grad_norm": 1.4520491361618042, "learning_rate": 2.6911764705882354e-05, "loss": 1.5837, "step": 73000 }, { "epoch": 0.2, "grad_norm": 1.46824049949646, "learning_rate": 2.688025210084034e-05, "loss": 1.5778, "step": 73500 }, { "epoch": 0.2, "grad_norm": 1.5032325983047485, "learning_rate": 2.684873949579832e-05, "loss": 1.5777, "step": 74000 }, { "epoch": 0.2, "grad_norm": 1.5338232517242432, "learning_rate": 2.6817226890756306e-05, "loss": 1.5768, "step": 74500 }, { "epoch": 0.2, "grad_norm": 1.5439281463623047, "learning_rate": 2.6785714285714288e-05, "loss": 1.5782, "step": 75000 }, { "epoch": 0.2, "grad_norm": 1.536665439605713, "learning_rate": 2.675420168067227e-05, "loss": 1.5758, "step": 75500 }, { "epoch": 0.21, "grad_norm": 1.4520212411880493, "learning_rate": 2.6722689075630255e-05, "loss": 1.5732, "step": 76000 }, { "epoch": 0.21, "grad_norm": 1.5352224111557007, "learning_rate": 2.6691176470588233e-05, "loss": 1.5745, "step": 76500 }, { "epoch": 0.21, "grad_norm": 1.4939314126968384, "learning_rate": 2.6659663865546218e-05, "loss": 1.5724, "step": 77000 }, { "epoch": 0.21, "grad_norm": 1.4967976808547974, "learning_rate": 2.66281512605042e-05, "loss": 1.5693, "step": 77500 }, { "epoch": 0.21, "grad_norm": 1.4980648756027222, "learning_rate": 2.6596638655462185e-05, "loss": 1.5721, "step": 78000 }, { "epoch": 0.21, "grad_norm": 1.5700784921646118, "learning_rate": 2.6565126050420167e-05, "loss": 1.5713, "step": 78500 }, { "epoch": 0.21, "grad_norm": 1.5124626159667969, "learning_rate": 2.6533613445378152e-05, "loss": 1.5709, "step": 79000 }, { "epoch": 0.22, "grad_norm": 1.465012788772583, "learning_rate": 2.6502100840336134e-05, "loss": 1.5702, "step": 79500 }, { "epoch": 0.22, "grad_norm": 1.4589452743530273, "learning_rate": 2.647058823529412e-05, "loss": 1.5675, "step": 80000 }, { "epoch": 0.22, "grad_norm": 1.547255516052246, "learning_rate": 2.64390756302521e-05, "loss": 1.567, "step": 80500 }, { "epoch": 0.22, "grad_norm": 1.5208017826080322, "learning_rate": 2.6407563025210086e-05, "loss": 1.5654, "step": 81000 }, { "epoch": 0.22, "grad_norm": 1.563560128211975, "learning_rate": 2.6376050420168067e-05, "loss": 1.5651, "step": 81500 }, { "epoch": 0.22, "grad_norm": 1.4551901817321777, "learning_rate": 2.634453781512605e-05, "loss": 1.5692, "step": 82000 }, { "epoch": 0.22, "grad_norm": 3.783536672592163, "learning_rate": 2.6313025210084034e-05, "loss": 1.5698, "step": 82500 }, { "epoch": 0.22, "grad_norm": 1.5397638082504272, "learning_rate": 2.6281512605042016e-05, "loss": 1.5614, "step": 83000 }, { "epoch": 0.23, "grad_norm": 1.5307060480117798, "learning_rate": 2.625e-05, "loss": 1.5596, "step": 83500 }, { "epoch": 0.23, "grad_norm": 1.5148283243179321, "learning_rate": 2.6218487394957983e-05, "loss": 1.5612, "step": 84000 }, { "epoch": 0.23, "grad_norm": 1.531973958015442, "learning_rate": 2.6186974789915968e-05, "loss": 1.559, "step": 84500 }, { "epoch": 0.23, "grad_norm": 1.5402531623840332, "learning_rate": 2.615546218487395e-05, "loss": 1.5624, "step": 85000 }, { "epoch": 0.23, "grad_norm": 1.486365795135498, "learning_rate": 2.6123949579831935e-05, "loss": 1.5601, "step": 85500 }, { "epoch": 0.23, "grad_norm": 1.513438105583191, "learning_rate": 2.6092436974789917e-05, "loss": 1.5567, "step": 86000 }, { "epoch": 0.23, "grad_norm": 1.5112252235412598, "learning_rate": 2.6060924369747902e-05, "loss": 1.5574, "step": 86500 }, { "epoch": 0.24, "grad_norm": 1.4394776821136475, "learning_rate": 2.6029411764705883e-05, "loss": 1.5562, "step": 87000 }, { "epoch": 0.24, "grad_norm": 1.6592140197753906, "learning_rate": 2.599789915966387e-05, "loss": 1.5551, "step": 87500 }, { "epoch": 0.24, "grad_norm": 1.4790719747543335, "learning_rate": 2.596638655462185e-05, "loss": 1.5544, "step": 88000 }, { "epoch": 0.24, "grad_norm": 1.4369221925735474, "learning_rate": 2.5934873949579832e-05, "loss": 1.5538, "step": 88500 }, { "epoch": 0.24, "grad_norm": 1.5175668001174927, "learning_rate": 2.5903361344537817e-05, "loss": 1.5556, "step": 89000 }, { "epoch": 0.24, "grad_norm": 1.4514554738998413, "learning_rate": 2.58718487394958e-05, "loss": 1.5539, "step": 89500 }, { "epoch": 0.24, "grad_norm": 1.4288485050201416, "learning_rate": 2.5840336134453784e-05, "loss": 1.5525, "step": 90000 }, { "epoch": 0.24, "grad_norm": 1.546531081199646, "learning_rate": 2.5808823529411766e-05, "loss": 1.5527, "step": 90500 }, { "epoch": 0.25, "grad_norm": 1.567368507385254, "learning_rate": 2.5777310924369748e-05, "loss": 1.5491, "step": 91000 }, { "epoch": 0.25, "grad_norm": 1.5126845836639404, "learning_rate": 2.574579831932773e-05, "loss": 1.5504, "step": 91500 }, { "epoch": 0.25, "grad_norm": 1.5570114850997925, "learning_rate": 2.5714285714285714e-05, "loss": 1.5469, "step": 92000 }, { "epoch": 0.25, "grad_norm": 1.4678915739059448, "learning_rate": 2.5682773109243696e-05, "loss": 1.5493, "step": 92500 }, { "epoch": 0.25, "grad_norm": 1.4618594646453857, "learning_rate": 2.565126050420168e-05, "loss": 1.555, "step": 93000 }, { "epoch": 0.25, "grad_norm": 1.5945430994033813, "learning_rate": 2.5619747899159663e-05, "loss": 1.547, "step": 93500 }, { "epoch": 0.25, "grad_norm": 1.4740761518478394, "learning_rate": 2.5588235294117648e-05, "loss": 1.5463, "step": 94000 }, { "epoch": 0.26, "grad_norm": 1.4022290706634521, "learning_rate": 2.555672268907563e-05, "loss": 1.5449, "step": 94500 }, { "epoch": 0.26, "grad_norm": 2.622828722000122, "learning_rate": 2.552521008403361e-05, "loss": 1.55, "step": 95000 }, { "epoch": 0.26, "grad_norm": 1.409568428993225, "learning_rate": 2.5493697478991597e-05, "loss": 1.5436, "step": 95500 }, { "epoch": 0.26, "grad_norm": 1.4889922142028809, "learning_rate": 2.546218487394958e-05, "loss": 1.5441, "step": 96000 }, { "epoch": 0.26, "grad_norm": 1.4589875936508179, "learning_rate": 2.5430672268907564e-05, "loss": 1.5468, "step": 96500 }, { "epoch": 0.26, "grad_norm": 1.4680520296096802, "learning_rate": 2.5399159663865545e-05, "loss": 1.5429, "step": 97000 }, { "epoch": 0.26, "grad_norm": 1.4456883668899536, "learning_rate": 2.536764705882353e-05, "loss": 1.5458, "step": 97500 }, { "epoch": 0.27, "grad_norm": 1.4655406475067139, "learning_rate": 2.5336134453781512e-05, "loss": 1.5399, "step": 98000 }, { "epoch": 0.27, "grad_norm": 7.581863880157471, "learning_rate": 2.5304621848739497e-05, "loss": 1.5423, "step": 98500 }, { "epoch": 0.27, "grad_norm": 1.5289582014083862, "learning_rate": 2.527310924369748e-05, "loss": 1.5434, "step": 99000 }, { "epoch": 0.27, "grad_norm": 1.475637674331665, "learning_rate": 2.5241596638655464e-05, "loss": 1.5415, "step": 99500 }, { "epoch": 0.27, "grad_norm": 1.45746910572052, "learning_rate": 2.5210084033613446e-05, "loss": 1.5401, "step": 100000 }, { "epoch": 0.27, "grad_norm": 1.4924384355545044, "learning_rate": 2.517857142857143e-05, "loss": 1.5382, "step": 100500 }, { "epoch": 0.27, "grad_norm": 1.4440650939941406, "learning_rate": 2.5147058823529413e-05, "loss": 1.539, "step": 101000 }, { "epoch": 0.27, "grad_norm": 1.5022001266479492, "learning_rate": 2.5115546218487395e-05, "loss": 1.5375, "step": 101500 }, { "epoch": 0.28, "grad_norm": 1.4573357105255127, "learning_rate": 2.508403361344538e-05, "loss": 1.5423, "step": 102000 }, { "epoch": 0.28, "grad_norm": 1.4948347806930542, "learning_rate": 2.505252100840336e-05, "loss": 1.538, "step": 102500 }, { "epoch": 0.28, "grad_norm": 1.5028940439224243, "learning_rate": 2.5021008403361347e-05, "loss": 1.5368, "step": 103000 }, { "epoch": 0.28, "grad_norm": 1.510446310043335, "learning_rate": 2.498949579831933e-05, "loss": 1.534, "step": 103500 }, { "epoch": 0.28, "grad_norm": 1.516194462776184, "learning_rate": 2.4957983193277314e-05, "loss": 1.5404, "step": 104000 }, { "epoch": 0.28, "grad_norm": 1.452358365058899, "learning_rate": 2.4926470588235295e-05, "loss": 1.5349, "step": 104500 }, { "epoch": 0.28, "grad_norm": 1.4550226926803589, "learning_rate": 2.489495798319328e-05, "loss": 1.5373, "step": 105000 }, { "epoch": 0.29, "grad_norm": 1.4559545516967773, "learning_rate": 2.4863445378151262e-05, "loss": 1.5341, "step": 105500 }, { "epoch": 0.29, "grad_norm": 1.4436681270599365, "learning_rate": 2.4831932773109244e-05, "loss": 1.5344, "step": 106000 }, { "epoch": 0.29, "grad_norm": 1.4642813205718994, "learning_rate": 2.4800420168067226e-05, "loss": 1.5333, "step": 106500 }, { "epoch": 0.29, "grad_norm": 1.4824906587600708, "learning_rate": 2.476890756302521e-05, "loss": 1.5291, "step": 107000 }, { "epoch": 0.29, "grad_norm": 1.515098214149475, "learning_rate": 2.4737394957983193e-05, "loss": 1.5285, "step": 107500 }, { "epoch": 0.29, "grad_norm": 2.073720693588257, "learning_rate": 2.4705882352941174e-05, "loss": 1.5348, "step": 108000 }, { "epoch": 0.29, "grad_norm": 1.884777545928955, "learning_rate": 2.467436974789916e-05, "loss": 1.5321, "step": 108500 }, { "epoch": 0.3, "grad_norm": 1.4791995286941528, "learning_rate": 2.464285714285714e-05, "loss": 1.5305, "step": 109000 }, { "epoch": 0.3, "grad_norm": 1.4546101093292236, "learning_rate": 2.4611344537815126e-05, "loss": 1.5308, "step": 109500 }, { "epoch": 0.3, "grad_norm": 1.421767234802246, "learning_rate": 2.4579831932773108e-05, "loss": 1.532, "step": 110000 }, { "epoch": 0.3, "grad_norm": 1.476372480392456, "learning_rate": 2.4548319327731093e-05, "loss": 1.5303, "step": 110500 }, { "epoch": 0.3, "grad_norm": 1.4746720790863037, "learning_rate": 2.4516806722689075e-05, "loss": 1.531, "step": 111000 }, { "epoch": 0.3, "grad_norm": 1.486217975616455, "learning_rate": 2.448529411764706e-05, "loss": 1.5277, "step": 111500 }, { "epoch": 0.3, "grad_norm": 1.4249714612960815, "learning_rate": 2.4453781512605042e-05, "loss": 1.525, "step": 112000 }, { "epoch": 0.3, "grad_norm": 1.4237457513809204, "learning_rate": 2.4422268907563027e-05, "loss": 1.5263, "step": 112500 }, { "epoch": 0.31, "grad_norm": 1.4878206253051758, "learning_rate": 2.439075630252101e-05, "loss": 1.5239, "step": 113000 }, { "epoch": 0.31, "grad_norm": 1.4781346321105957, "learning_rate": 2.4359243697478994e-05, "loss": 1.528, "step": 113500 }, { "epoch": 0.31, "grad_norm": 1.4943785667419434, "learning_rate": 2.4327731092436976e-05, "loss": 1.5231, "step": 114000 }, { "epoch": 0.31, "grad_norm": 1.466009497642517, "learning_rate": 2.4296218487394957e-05, "loss": 1.5233, "step": 114500 }, { "epoch": 0.31, "grad_norm": 2.4329051971435547, "learning_rate": 2.4264705882352942e-05, "loss": 1.5266, "step": 115000 }, { "epoch": 0.31, "grad_norm": 1.477039098739624, "learning_rate": 2.4233193277310924e-05, "loss": 1.5278, "step": 115500 }, { "epoch": 0.31, "grad_norm": 1.5693820714950562, "learning_rate": 2.420168067226891e-05, "loss": 1.5254, "step": 116000 }, { "epoch": 0.32, "grad_norm": 1.4393528699874878, "learning_rate": 2.417016806722689e-05, "loss": 1.5236, "step": 116500 }, { "epoch": 0.32, "grad_norm": 1.4845529794692993, "learning_rate": 2.4138655462184876e-05, "loss": 1.5206, "step": 117000 }, { "epoch": 0.32, "grad_norm": 1.476683259010315, "learning_rate": 2.4107142857142858e-05, "loss": 1.5208, "step": 117500 }, { "epoch": 0.32, "grad_norm": 1.428836703300476, "learning_rate": 2.4075630252100843e-05, "loss": 1.5234, "step": 118000 }, { "epoch": 0.32, "grad_norm": 1.449540138244629, "learning_rate": 2.4044117647058825e-05, "loss": 1.5234, "step": 118500 }, { "epoch": 0.32, "grad_norm": 1.4410090446472168, "learning_rate": 2.401260504201681e-05, "loss": 1.5203, "step": 119000 }, { "epoch": 0.32, "grad_norm": 1.4714431762695312, "learning_rate": 2.398109243697479e-05, "loss": 1.5208, "step": 119500 }, { "epoch": 0.32, "grad_norm": 1.469762921333313, "learning_rate": 2.3949579831932777e-05, "loss": 1.524, "step": 120000 }, { "epoch": 0.33, "grad_norm": 1.5507971048355103, "learning_rate": 2.391806722689076e-05, "loss": 1.5224, "step": 120500 }, { "epoch": 0.33, "grad_norm": 1.5093679428100586, "learning_rate": 2.3886554621848737e-05, "loss": 1.5235, "step": 121000 }, { "epoch": 0.33, "grad_norm": 1.492244839668274, "learning_rate": 2.3855042016806722e-05, "loss": 1.5196, "step": 121500 }, { "epoch": 0.33, "grad_norm": 1.4522676467895508, "learning_rate": 2.3823529411764704e-05, "loss": 1.5209, "step": 122000 }, { "epoch": 0.33, "grad_norm": 1.527627944946289, "learning_rate": 2.379201680672269e-05, "loss": 1.5198, "step": 122500 }, { "epoch": 0.33, "grad_norm": 1.488146424293518, "learning_rate": 2.376050420168067e-05, "loss": 1.5165, "step": 123000 }, { "epoch": 0.33, "grad_norm": 1.4484755992889404, "learning_rate": 2.3728991596638656e-05, "loss": 1.5123, "step": 123500 }, { "epoch": 0.34, "grad_norm": 1.5184931755065918, "learning_rate": 2.3697478991596638e-05, "loss": 1.5177, "step": 124000 }, { "epoch": 0.34, "grad_norm": 1.4979966878890991, "learning_rate": 2.3665966386554623e-05, "loss": 1.5193, "step": 124500 }, { "epoch": 0.34, "grad_norm": 1.4858919382095337, "learning_rate": 2.3634453781512604e-05, "loss": 1.5129, "step": 125000 }, { "epoch": 0.34, "grad_norm": 1.6100457906723022, "learning_rate": 2.360294117647059e-05, "loss": 1.5153, "step": 125500 }, { "epoch": 0.34, "grad_norm": 1.4573218822479248, "learning_rate": 2.357142857142857e-05, "loss": 1.5173, "step": 126000 }, { "epoch": 0.34, "grad_norm": 1.4780622720718384, "learning_rate": 2.3539915966386556e-05, "loss": 1.5142, "step": 126500 }, { "epoch": 0.34, "grad_norm": 1.4847768545150757, "learning_rate": 2.3508403361344538e-05, "loss": 1.5123, "step": 127000 }, { "epoch": 0.35, "grad_norm": 1.789902925491333, "learning_rate": 2.347689075630252e-05, "loss": 1.5128, "step": 127500 }, { "epoch": 0.35, "grad_norm": 1.4414323568344116, "learning_rate": 2.3445378151260505e-05, "loss": 1.5112, "step": 128000 }, { "epoch": 0.35, "grad_norm": 1.542536735534668, "learning_rate": 2.3413865546218487e-05, "loss": 1.5132, "step": 128500 }, { "epoch": 0.35, "grad_norm": 1.479336142539978, "learning_rate": 2.3382352941176472e-05, "loss": 1.5091, "step": 129000 }, { "epoch": 0.35, "grad_norm": 1.5068061351776123, "learning_rate": 2.3350840336134454e-05, "loss": 1.5157, "step": 129500 }, { "epoch": 0.35, "grad_norm": 1.5134038925170898, "learning_rate": 2.331932773109244e-05, "loss": 1.5145, "step": 130000 }, { "epoch": 0.35, "grad_norm": 2.804521083831787, "learning_rate": 2.328781512605042e-05, "loss": 1.71, "step": 130500 }, { "epoch": 0.35, "grad_norm": 9.153915405273438, "learning_rate": 2.3256302521008406e-05, "loss": 1.5874, "step": 131000 }, { "epoch": 0.36, "grad_norm": 3.567737579345703, "learning_rate": 2.3224789915966387e-05, "loss": 1.5532, "step": 131500 }, { "epoch": 0.36, "grad_norm": 1.5058925151824951, "learning_rate": 2.3193277310924373e-05, "loss": 1.5241, "step": 132000 }, { "epoch": 0.36, "grad_norm": 1.48910653591156, "learning_rate": 2.3161764705882354e-05, "loss": 1.5197, "step": 132500 }, { "epoch": 0.36, "grad_norm": 1.477921962738037, "learning_rate": 2.313025210084034e-05, "loss": 1.5191, "step": 133000 }, { "epoch": 0.36, "grad_norm": 1.503013014793396, "learning_rate": 2.309873949579832e-05, "loss": 1.5112, "step": 133500 }, { "epoch": 0.36, "grad_norm": 1.457146406173706, "learning_rate": 2.3067226890756303e-05, "loss": 1.5158, "step": 134000 }, { "epoch": 0.36, "grad_norm": 1.8954756259918213, "learning_rate": 2.3035714285714288e-05, "loss": 1.5138, "step": 134500 }, { "epoch": 0.37, "grad_norm": 1.5171183347702026, "learning_rate": 2.300420168067227e-05, "loss": 1.5201, "step": 135000 }, { "epoch": 0.37, "grad_norm": 1.454849362373352, "learning_rate": 2.2972689075630255e-05, "loss": 1.5113, "step": 135500 }, { "epoch": 0.37, "grad_norm": 2.3639023303985596, "learning_rate": 2.2941176470588233e-05, "loss": 1.5089, "step": 136000 }, { "epoch": 0.37, "grad_norm": 1.4599758386611938, "learning_rate": 2.290966386554622e-05, "loss": 1.5099, "step": 136500 }, { "epoch": 0.37, "grad_norm": 1.5151523351669312, "learning_rate": 2.28781512605042e-05, "loss": 1.5077, "step": 137000 }, { "epoch": 0.37, "grad_norm": 1.518723726272583, "learning_rate": 2.2846638655462185e-05, "loss": 1.5097, "step": 137500 }, { "epoch": 0.37, "grad_norm": 1.5430985689163208, "learning_rate": 2.2815126050420167e-05, "loss": 1.5089, "step": 138000 }, { "epoch": 0.37, "grad_norm": 1.468233585357666, "learning_rate": 2.2783613445378152e-05, "loss": 1.5075, "step": 138500 }, { "epoch": 0.38, "grad_norm": 1.540824294090271, "learning_rate": 2.2752100840336134e-05, "loss": 1.5095, "step": 139000 }, { "epoch": 0.38, "grad_norm": 1.4792211055755615, "learning_rate": 2.272058823529412e-05, "loss": 1.5123, "step": 139500 }, { "epoch": 0.38, "grad_norm": 1.4582479000091553, "learning_rate": 2.26890756302521e-05, "loss": 1.5041, "step": 140000 }, { "epoch": 0.38, "grad_norm": 1.4484353065490723, "learning_rate": 2.2657563025210083e-05, "loss": 1.5098, "step": 140500 }, { "epoch": 0.38, "grad_norm": 2.090087413787842, "learning_rate": 2.2626050420168068e-05, "loss": 1.504, "step": 141000 }, { "epoch": 0.38, "grad_norm": 1.5165677070617676, "learning_rate": 2.259453781512605e-05, "loss": 1.5037, "step": 141500 }, { "epoch": 0.38, "grad_norm": 1.4467180967330933, "learning_rate": 2.2563025210084035e-05, "loss": 1.5037, "step": 142000 }, { "epoch": 0.39, "grad_norm": 1.53107750415802, "learning_rate": 2.2531512605042016e-05, "loss": 1.5048, "step": 142500 }, { "epoch": 0.39, "grad_norm": 1.685832142829895, "learning_rate": 2.25e-05, "loss": 1.5051, "step": 143000 }, { "epoch": 0.39, "grad_norm": 1.722901701927185, "learning_rate": 2.2468487394957983e-05, "loss": 1.5038, "step": 143500 }, { "epoch": 0.39, "grad_norm": 1.5191560983657837, "learning_rate": 2.2436974789915968e-05, "loss": 1.5021, "step": 144000 }, { "epoch": 0.39, "grad_norm": 1.6680717468261719, "learning_rate": 2.240546218487395e-05, "loss": 1.5019, "step": 144500 }, { "epoch": 0.39, "grad_norm": 1.5664371252059937, "learning_rate": 2.2373949579831935e-05, "loss": 1.5028, "step": 145000 }, { "epoch": 0.39, "grad_norm": 1.484131932258606, "learning_rate": 2.2342436974789917e-05, "loss": 1.5028, "step": 145500 }, { "epoch": 0.4, "grad_norm": 1.4882657527923584, "learning_rate": 2.2310924369747902e-05, "loss": 1.4993, "step": 146000 }, { "epoch": 0.4, "grad_norm": 1.4583569765090942, "learning_rate": 2.2279411764705884e-05, "loss": 1.5037, "step": 146500 }, { "epoch": 0.4, "grad_norm": 1.559399127960205, "learning_rate": 2.2247899159663866e-05, "loss": 1.4994, "step": 147000 }, { "epoch": 0.4, "grad_norm": 1.537287950515747, "learning_rate": 2.221638655462185e-05, "loss": 1.5008, "step": 147500 }, { "epoch": 0.4, "grad_norm": 1.4840517044067383, "learning_rate": 2.2184873949579832e-05, "loss": 1.5003, "step": 148000 }, { "epoch": 0.4, "grad_norm": 1.6292195320129395, "learning_rate": 2.2153361344537818e-05, "loss": 1.4975, "step": 148500 }, { "epoch": 0.4, "grad_norm": 1.4870771169662476, "learning_rate": 2.21218487394958e-05, "loss": 1.4962, "step": 149000 }, { "epoch": 0.4, "grad_norm": 1.4792907238006592, "learning_rate": 2.2090336134453784e-05, "loss": 1.4978, "step": 149500 }, { "epoch": 0.41, "grad_norm": 1.4179558753967285, "learning_rate": 2.2058823529411766e-05, "loss": 1.5012, "step": 150000 }, { "epoch": 0.41, "grad_norm": 1.4594039916992188, "learning_rate": 2.2027310924369748e-05, "loss": 1.4987, "step": 150500 }, { "epoch": 0.41, "grad_norm": 1.5356736183166504, "learning_rate": 2.199579831932773e-05, "loss": 1.4975, "step": 151000 }, { "epoch": 0.41, "grad_norm": 1.4961708784103394, "learning_rate": 2.1964285714285715e-05, "loss": 1.4966, "step": 151500 }, { "epoch": 0.41, "grad_norm": 1.5061964988708496, "learning_rate": 2.1932773109243697e-05, "loss": 1.4952, "step": 152000 }, { "epoch": 0.41, "grad_norm": 1.4668192863464355, "learning_rate": 2.190126050420168e-05, "loss": 1.4955, "step": 152500 }, { "epoch": 0.41, "grad_norm": 1.520202398300171, "learning_rate": 2.1869747899159663e-05, "loss": 1.4987, "step": 153000 }, { "epoch": 0.42, "grad_norm": 1.5048165321350098, "learning_rate": 2.1838235294117645e-05, "loss": 1.4943, "step": 153500 }, { "epoch": 0.42, "grad_norm": 1.4194804430007935, "learning_rate": 2.180672268907563e-05, "loss": 1.4962, "step": 154000 }, { "epoch": 0.42, "grad_norm": 1.4963053464889526, "learning_rate": 2.1775210084033612e-05, "loss": 1.4939, "step": 154500 }, { "epoch": 0.42, "grad_norm": 1.5189534425735474, "learning_rate": 2.1743697478991597e-05, "loss": 1.4955, "step": 155000 }, { "epoch": 0.42, "grad_norm": 1.844502329826355, "learning_rate": 2.171218487394958e-05, "loss": 1.4932, "step": 155500 }, { "epoch": 0.42, "grad_norm": 1.6127697229385376, "learning_rate": 2.1680672268907564e-05, "loss": 1.4972, "step": 156000 }, { "epoch": 0.42, "grad_norm": 2.39309024810791, "learning_rate": 2.1649159663865546e-05, "loss": 1.4961, "step": 156500 }, { "epoch": 0.43, "grad_norm": 1.7886457443237305, "learning_rate": 2.161764705882353e-05, "loss": 1.4981, "step": 157000 }, { "epoch": 0.43, "grad_norm": 1.5055351257324219, "learning_rate": 2.1586134453781513e-05, "loss": 1.4937, "step": 157500 }, { "epoch": 0.43, "grad_norm": 2.2209436893463135, "learning_rate": 2.1554621848739498e-05, "loss": 1.4958, "step": 158000 }, { "epoch": 0.43, "grad_norm": 1.4863665103912354, "learning_rate": 2.152310924369748e-05, "loss": 1.4937, "step": 158500 }, { "epoch": 0.43, "grad_norm": 1.6290695667266846, "learning_rate": 2.1491596638655465e-05, "loss": 1.4934, "step": 159000 }, { "epoch": 0.43, "grad_norm": 1.5069892406463623, "learning_rate": 2.1460084033613446e-05, "loss": 1.4966, "step": 159500 }, { "epoch": 0.43, "grad_norm": 1.4480432271957397, "learning_rate": 2.1428571428571428e-05, "loss": 1.4928, "step": 160000 }, { "epoch": 0.43, "grad_norm": 1.4599815607070923, "learning_rate": 2.1397058823529413e-05, "loss": 1.4907, "step": 160500 }, { "epoch": 0.44, "grad_norm": 1.5667592287063599, "learning_rate": 2.1365546218487395e-05, "loss": 1.4946, "step": 161000 }, { "epoch": 0.44, "grad_norm": 1.591620683670044, "learning_rate": 2.133403361344538e-05, "loss": 1.4932, "step": 161500 }, { "epoch": 0.44, "grad_norm": 1.4108275175094604, "learning_rate": 2.1302521008403362e-05, "loss": 1.4918, "step": 162000 }, { "epoch": 0.44, "grad_norm": 1.3984153270721436, "learning_rate": 2.1271008403361347e-05, "loss": 1.4912, "step": 162500 }, { "epoch": 0.44, "grad_norm": 1.5187551975250244, "learning_rate": 2.123949579831933e-05, "loss": 1.4896, "step": 163000 }, { "epoch": 0.44, "grad_norm": 1.4671634435653687, "learning_rate": 2.1207983193277314e-05, "loss": 1.4909, "step": 163500 }, { "epoch": 0.44, "grad_norm": 1.5398577451705933, "learning_rate": 2.1176470588235296e-05, "loss": 1.4898, "step": 164000 }, { "epoch": 0.45, "grad_norm": 1.4390913248062134, "learning_rate": 2.114495798319328e-05, "loss": 1.4917, "step": 164500 }, { "epoch": 0.45, "grad_norm": 1.466871976852417, "learning_rate": 2.1113445378151263e-05, "loss": 1.486, "step": 165000 }, { "epoch": 0.45, "grad_norm": 1.4268947839736938, "learning_rate": 2.1081932773109244e-05, "loss": 1.486, "step": 165500 }, { "epoch": 0.45, "grad_norm": 1.473212718963623, "learning_rate": 2.1050420168067226e-05, "loss": 1.4906, "step": 166000 }, { "epoch": 0.45, "grad_norm": 1.4817694425582886, "learning_rate": 2.1018907563025208e-05, "loss": 1.4876, "step": 166500 }, { "epoch": 0.45, "grad_norm": 1.4899072647094727, "learning_rate": 2.0987394957983193e-05, "loss": 1.4853, "step": 167000 }, { "epoch": 0.45, "grad_norm": 1.472068428993225, "learning_rate": 2.0955882352941175e-05, "loss": 1.4859, "step": 167500 }, { "epoch": 0.45, "grad_norm": 1.4609180688858032, "learning_rate": 2.092436974789916e-05, "loss": 1.4867, "step": 168000 }, { "epoch": 0.46, "grad_norm": 1.3884390592575073, "learning_rate": 2.089285714285714e-05, "loss": 1.4845, "step": 168500 }, { "epoch": 0.46, "grad_norm": 1.4505021572113037, "learning_rate": 2.0861344537815127e-05, "loss": 1.4804, "step": 169000 }, { "epoch": 0.46, "grad_norm": 1.4579660892486572, "learning_rate": 2.082983193277311e-05, "loss": 1.4828, "step": 169500 }, { "epoch": 0.46, "grad_norm": 1.4193936586380005, "learning_rate": 2.0798319327731094e-05, "loss": 1.4846, "step": 170000 }, { "epoch": 0.46, "grad_norm": 1.8833608627319336, "learning_rate": 2.0766806722689075e-05, "loss": 1.4832, "step": 170500 }, { "epoch": 0.46, "grad_norm": 1.394463062286377, "learning_rate": 2.073529411764706e-05, "loss": 1.4858, "step": 171000 }, { "epoch": 0.46, "grad_norm": 1.4402869939804077, "learning_rate": 2.0703781512605042e-05, "loss": 1.4853, "step": 171500 }, { "epoch": 0.47, "grad_norm": 1.5677118301391602, "learning_rate": 2.0672268907563024e-05, "loss": 1.4828, "step": 172000 }, { "epoch": 0.47, "grad_norm": 1.412744402885437, "learning_rate": 2.064075630252101e-05, "loss": 1.4861, "step": 172500 }, { "epoch": 0.47, "grad_norm": 1.578121542930603, "learning_rate": 2.060924369747899e-05, "loss": 1.4825, "step": 173000 }, { "epoch": 0.47, "grad_norm": 1.4429398775100708, "learning_rate": 2.0577731092436976e-05, "loss": 1.4806, "step": 173500 }, { "epoch": 0.47, "grad_norm": 1.5229464769363403, "learning_rate": 2.0546218487394958e-05, "loss": 1.4822, "step": 174000 }, { "epoch": 0.47, "grad_norm": 1.533868670463562, "learning_rate": 2.0514705882352943e-05, "loss": 1.4788, "step": 174500 }, { "epoch": 0.47, "grad_norm": 1.4442238807678223, "learning_rate": 2.0483193277310925e-05, "loss": 1.4845, "step": 175000 }, { "epoch": 0.48, "grad_norm": 1.8768386840820312, "learning_rate": 2.045168067226891e-05, "loss": 1.481, "step": 175500 }, { "epoch": 0.48, "grad_norm": 1.5719354152679443, "learning_rate": 2.042016806722689e-05, "loss": 1.4815, "step": 176000 }, { "epoch": 0.48, "grad_norm": 1.6776522397994995, "learning_rate": 2.0388655462184877e-05, "loss": 1.4834, "step": 176500 }, { "epoch": 0.48, "grad_norm": 1.462403416633606, "learning_rate": 2.0357142857142858e-05, "loss": 1.4829, "step": 177000 }, { "epoch": 0.48, "grad_norm": 1.441434621810913, "learning_rate": 2.0325630252100843e-05, "loss": 1.4817, "step": 177500 }, { "epoch": 0.48, "grad_norm": 1.7203949689865112, "learning_rate": 2.0294117647058825e-05, "loss": 1.4819, "step": 178000 }, { "epoch": 0.48, "grad_norm": 1.6117925643920898, "learning_rate": 2.0262605042016807e-05, "loss": 1.48, "step": 178500 }, { "epoch": 0.48, "grad_norm": 1.4840322732925415, "learning_rate": 2.0231092436974792e-05, "loss": 1.4804, "step": 179000 }, { "epoch": 0.49, "grad_norm": 1.4823276996612549, "learning_rate": 2.0199579831932774e-05, "loss": 1.4783, "step": 179500 }, { "epoch": 0.49, "grad_norm": 1.467035174369812, "learning_rate": 2.016806722689076e-05, "loss": 1.4826, "step": 180000 }, { "epoch": 0.49, "grad_norm": 1.4519331455230713, "learning_rate": 2.0136554621848737e-05, "loss": 1.4793, "step": 180500 }, { "epoch": 0.49, "grad_norm": 1.4830392599105835, "learning_rate": 2.0105042016806722e-05, "loss": 1.478, "step": 181000 }, { "epoch": 0.49, "grad_norm": 1.4889652729034424, "learning_rate": 2.0073529411764704e-05, "loss": 1.4825, "step": 181500 }, { "epoch": 0.49, "grad_norm": 1.4417020082473755, "learning_rate": 2.004201680672269e-05, "loss": 1.4781, "step": 182000 }, { "epoch": 0.49, "grad_norm": 1.5612033605575562, "learning_rate": 2.001050420168067e-05, "loss": 1.4749, "step": 182500 }, { "epoch": 0.5, "grad_norm": 1.923521637916565, "learning_rate": 1.9978991596638656e-05, "loss": 1.4742, "step": 183000 }, { "epoch": 0.5, "grad_norm": 1.4759869575500488, "learning_rate": 1.9947478991596638e-05, "loss": 1.4772, "step": 183500 }, { "epoch": 0.5, "grad_norm": 1.4529997110366821, "learning_rate": 1.9915966386554623e-05, "loss": 1.4758, "step": 184000 }, { "epoch": 0.5, "grad_norm": 1.4907563924789429, "learning_rate": 1.9884453781512605e-05, "loss": 1.477, "step": 184500 }, { "epoch": 0.5, "grad_norm": 1.4529681205749512, "learning_rate": 1.9852941176470586e-05, "loss": 1.4754, "step": 185000 }, { "epoch": 0.5, "grad_norm": 1.4950664043426514, "learning_rate": 1.982142857142857e-05, "loss": 1.477, "step": 185500 }, { "epoch": 0.5, "grad_norm": 1.5445144176483154, "learning_rate": 1.9789915966386553e-05, "loss": 1.4763, "step": 186000 }, { "epoch": 0.5, "grad_norm": 2.2947561740875244, "learning_rate": 1.975840336134454e-05, "loss": 1.4771, "step": 186500 }, { "epoch": 0.51, "grad_norm": 1.4762338399887085, "learning_rate": 1.972689075630252e-05, "loss": 1.4748, "step": 187000 }, { "epoch": 0.51, "grad_norm": 1.5006557703018188, "learning_rate": 1.9695378151260505e-05, "loss": 1.474, "step": 187500 }, { "epoch": 0.51, "grad_norm": 1.5126187801361084, "learning_rate": 1.9663865546218487e-05, "loss": 1.4769, "step": 188000 }, { "epoch": 0.51, "grad_norm": 3.9213035106658936, "learning_rate": 1.9632352941176472e-05, "loss": 1.4724, "step": 188500 }, { "epoch": 0.51, "grad_norm": 1.3832660913467407, "learning_rate": 1.9600840336134454e-05, "loss": 1.4743, "step": 189000 }, { "epoch": 0.51, "grad_norm": 1.438021183013916, "learning_rate": 1.956932773109244e-05, "loss": 1.4732, "step": 189500 }, { "epoch": 0.51, "grad_norm": 1.552357792854309, "learning_rate": 1.953781512605042e-05, "loss": 1.4693, "step": 190000 }, { "epoch": 0.52, "grad_norm": 1.4992841482162476, "learning_rate": 1.9506302521008406e-05, "loss": 1.4741, "step": 190500 }, { "epoch": 0.52, "grad_norm": 1.4546705484390259, "learning_rate": 1.9474789915966388e-05, "loss": 1.4709, "step": 191000 }, { "epoch": 0.52, "grad_norm": 1.5536097288131714, "learning_rate": 1.944327731092437e-05, "loss": 1.4715, "step": 191500 }, { "epoch": 0.52, "grad_norm": 1.4430129528045654, "learning_rate": 1.9411764705882355e-05, "loss": 1.4694, "step": 192000 }, { "epoch": 0.52, "grad_norm": 1.4931637048721313, "learning_rate": 1.9380252100840336e-05, "loss": 1.4704, "step": 192500 }, { "epoch": 0.52, "grad_norm": 1.4820243120193481, "learning_rate": 1.934873949579832e-05, "loss": 1.4707, "step": 193000 }, { "epoch": 0.52, "grad_norm": 1.5232768058776855, "learning_rate": 1.9317226890756303e-05, "loss": 1.4692, "step": 193500 }, { "epoch": 0.53, "grad_norm": 1.517333745956421, "learning_rate": 1.928571428571429e-05, "loss": 1.4731, "step": 194000 }, { "epoch": 0.53, "grad_norm": 1.4523952007293701, "learning_rate": 1.925420168067227e-05, "loss": 1.4698, "step": 194500 }, { "epoch": 0.53, "grad_norm": 1.4807761907577515, "learning_rate": 1.9222689075630255e-05, "loss": 1.4719, "step": 195000 }, { "epoch": 0.53, "grad_norm": 1.4389820098876953, "learning_rate": 1.9191176470588234e-05, "loss": 1.4709, "step": 195500 }, { "epoch": 0.53, "grad_norm": 3.7379424571990967, "learning_rate": 1.915966386554622e-05, "loss": 1.4663, "step": 196000 }, { "epoch": 0.53, "grad_norm": 1.4896109104156494, "learning_rate": 1.91281512605042e-05, "loss": 1.4709, "step": 196500 }, { "epoch": 0.53, "grad_norm": 5.979303359985352, "learning_rate": 1.9096638655462186e-05, "loss": 1.4743, "step": 197000 }, { "epoch": 0.53, "grad_norm": 1.4648813009262085, "learning_rate": 1.9065126050420167e-05, "loss": 1.4687, "step": 197500 }, { "epoch": 0.54, "grad_norm": 1.739353895187378, "learning_rate": 1.903361344537815e-05, "loss": 1.4702, "step": 198000 }, { "epoch": 0.54, "grad_norm": 1.4263814687728882, "learning_rate": 1.9002100840336134e-05, "loss": 1.4695, "step": 198500 }, { "epoch": 0.54, "grad_norm": 1.5090336799621582, "learning_rate": 1.8970588235294116e-05, "loss": 1.4667, "step": 199000 }, { "epoch": 0.54, "grad_norm": 1.4606796503067017, "learning_rate": 1.89390756302521e-05, "loss": 1.4665, "step": 199500 }, { "epoch": 0.54, "grad_norm": 1.4979524612426758, "learning_rate": 1.8907563025210083e-05, "loss": 1.4645, "step": 200000 }, { "epoch": 0.54, "grad_norm": 1.5032795667648315, "learning_rate": 1.8876050420168068e-05, "loss": 1.4697, "step": 200500 }, { "epoch": 0.54, "grad_norm": 1.4917629957199097, "learning_rate": 1.884453781512605e-05, "loss": 1.4654, "step": 201000 }, { "epoch": 0.55, "grad_norm": 1.5047801733016968, "learning_rate": 1.8813025210084035e-05, "loss": 1.4665, "step": 201500 }, { "epoch": 0.55, "grad_norm": 1.5550223588943481, "learning_rate": 1.8781512605042017e-05, "loss": 1.4669, "step": 202000 }, { "epoch": 0.55, "grad_norm": 1.4432892799377441, "learning_rate": 1.8750000000000002e-05, "loss": 1.4652, "step": 202500 }, { "epoch": 0.55, "grad_norm": 1.4227643013000488, "learning_rate": 1.8718487394957983e-05, "loss": 1.465, "step": 203000 }, { "epoch": 0.55, "grad_norm": 1.5878413915634155, "learning_rate": 1.868697478991597e-05, "loss": 1.4675, "step": 203500 }, { "epoch": 0.55, "grad_norm": 1.5786782503128052, "learning_rate": 1.865546218487395e-05, "loss": 1.4596, "step": 204000 }, { "epoch": 0.55, "grad_norm": 1.4224051237106323, "learning_rate": 1.8623949579831932e-05, "loss": 1.462, "step": 204500 }, { "epoch": 0.55, "grad_norm": 1.7678115367889404, "learning_rate": 1.8592436974789917e-05, "loss": 1.4614, "step": 205000 }, { "epoch": 0.56, "grad_norm": 1.4170020818710327, "learning_rate": 1.85609243697479e-05, "loss": 1.4649, "step": 205500 }, { "epoch": 0.56, "grad_norm": 1.5474693775177002, "learning_rate": 1.8529411764705884e-05, "loss": 1.464, "step": 206000 }, { "epoch": 0.56, "grad_norm": 1.4655749797821045, "learning_rate": 1.8497899159663866e-05, "loss": 1.4654, "step": 206500 }, { "epoch": 0.56, "grad_norm": 1.6294610500335693, "learning_rate": 1.846638655462185e-05, "loss": 1.4616, "step": 207000 }, { "epoch": 0.56, "grad_norm": 1.4760308265686035, "learning_rate": 1.8434873949579833e-05, "loss": 1.4643, "step": 207500 }, { "epoch": 0.56, "grad_norm": 1.4796357154846191, "learning_rate": 1.8403361344537818e-05, "loss": 1.4659, "step": 208000 }, { "epoch": 0.56, "grad_norm": 1.9592546224594116, "learning_rate": 1.83718487394958e-05, "loss": 1.4611, "step": 208500 }, { "epoch": 0.57, "grad_norm": 1.493324637413025, "learning_rate": 1.8340336134453785e-05, "loss": 1.4626, "step": 209000 }, { "epoch": 0.57, "grad_norm": 1.453369379043579, "learning_rate": 1.8308823529411766e-05, "loss": 1.4603, "step": 209500 }, { "epoch": 0.57, "grad_norm": 1.5146046876907349, "learning_rate": 1.8277310924369748e-05, "loss": 1.4594, "step": 210000 }, { "epoch": 0.57, "grad_norm": 1.424707293510437, "learning_rate": 1.824579831932773e-05, "loss": 1.4631, "step": 210500 }, { "epoch": 0.57, "grad_norm": 1.464998722076416, "learning_rate": 1.8214285714285712e-05, "loss": 1.4617, "step": 211000 }, { "epoch": 0.57, "grad_norm": 1.4314439296722412, "learning_rate": 1.8182773109243697e-05, "loss": 1.4611, "step": 211500 }, { "epoch": 0.57, "grad_norm": 1.4533342123031616, "learning_rate": 1.815126050420168e-05, "loss": 1.4591, "step": 212000 }, { "epoch": 0.58, "grad_norm": 1.5328502655029297, "learning_rate": 1.8119747899159664e-05, "loss": 1.4606, "step": 212500 }, { "epoch": 0.58, "grad_norm": 1.4684851169586182, "learning_rate": 1.8088235294117645e-05, "loss": 1.463, "step": 213000 }, { "epoch": 0.58, "grad_norm": 1.512421727180481, "learning_rate": 1.805672268907563e-05, "loss": 1.4585, "step": 213500 }, { "epoch": 0.58, "grad_norm": 1.5069866180419922, "learning_rate": 1.8025210084033612e-05, "loss": 1.4565, "step": 214000 }, { "epoch": 0.58, "grad_norm": 1.4224152565002441, "learning_rate": 1.7993697478991597e-05, "loss": 1.4575, "step": 214500 }, { "epoch": 0.58, "grad_norm": 1.6329984664916992, "learning_rate": 1.796218487394958e-05, "loss": 1.4541, "step": 215000 }, { "epoch": 0.58, "grad_norm": 1.587007761001587, "learning_rate": 1.7930672268907564e-05, "loss": 1.4572, "step": 215500 }, { "epoch": 0.58, "grad_norm": 1.4805065393447876, "learning_rate": 1.7899159663865546e-05, "loss": 1.4618, "step": 216000 }, { "epoch": 0.59, "grad_norm": 1.517993450164795, "learning_rate": 1.786764705882353e-05, "loss": 1.4538, "step": 216500 }, { "epoch": 0.59, "grad_norm": 1.4399406909942627, "learning_rate": 1.7836134453781513e-05, "loss": 1.4576, "step": 217000 }, { "epoch": 0.59, "grad_norm": 1.4458235502243042, "learning_rate": 1.7804621848739495e-05, "loss": 1.4558, "step": 217500 }, { "epoch": 0.59, "grad_norm": 1.5840320587158203, "learning_rate": 1.777310924369748e-05, "loss": 1.4562, "step": 218000 }, { "epoch": 0.59, "grad_norm": 1.4832299947738647, "learning_rate": 1.774159663865546e-05, "loss": 1.456, "step": 218500 }, { "epoch": 0.59, "grad_norm": 1.4003788232803345, "learning_rate": 1.7710084033613447e-05, "loss": 1.4555, "step": 219000 }, { "epoch": 0.59, "grad_norm": 1.5091036558151245, "learning_rate": 1.767857142857143e-05, "loss": 1.4596, "step": 219500 }, { "epoch": 0.6, "grad_norm": 1.4758837223052979, "learning_rate": 1.7647058823529414e-05, "loss": 1.4566, "step": 220000 }, { "epoch": 0.6, "grad_norm": 1.4372687339782715, "learning_rate": 1.7615546218487395e-05, "loss": 1.4524, "step": 220500 }, { "epoch": 0.6, "grad_norm": 1.4391896724700928, "learning_rate": 1.758403361344538e-05, "loss": 1.4565, "step": 221000 }, { "epoch": 0.6, "grad_norm": 1.4493831396102905, "learning_rate": 1.7552521008403362e-05, "loss": 1.4543, "step": 221500 }, { "epoch": 0.6, "grad_norm": 2.0319833755493164, "learning_rate": 1.7521008403361347e-05, "loss": 1.4536, "step": 222000 }, { "epoch": 0.6, "grad_norm": 1.4861342906951904, "learning_rate": 1.748949579831933e-05, "loss": 1.454, "step": 222500 }, { "epoch": 0.6, "grad_norm": 1.4432348012924194, "learning_rate": 1.7457983193277314e-05, "loss": 1.4546, "step": 223000 }, { "epoch": 0.61, "grad_norm": 1.4457755088806152, "learning_rate": 1.7426470588235296e-05, "loss": 1.4542, "step": 223500 }, { "epoch": 0.61, "grad_norm": 1.4785292148590088, "learning_rate": 1.7394957983193278e-05, "loss": 1.4539, "step": 224000 }, { "epoch": 0.61, "grad_norm": 1.4646965265274048, "learning_rate": 1.7363445378151263e-05, "loss": 1.4557, "step": 224500 }, { "epoch": 0.61, "grad_norm": 1.3340420722961426, "learning_rate": 1.733193277310924e-05, "loss": 1.4512, "step": 225000 }, { "epoch": 0.61, "grad_norm": 1.4864197969436646, "learning_rate": 1.7300420168067226e-05, "loss": 1.4514, "step": 225500 }, { "epoch": 0.61, "grad_norm": 1.441954493522644, "learning_rate": 1.7268907563025208e-05, "loss": 1.4565, "step": 226000 }, { "epoch": 0.61, "grad_norm": 1.4796494245529175, "learning_rate": 1.7237394957983193e-05, "loss": 1.4549, "step": 226500 }, { "epoch": 0.61, "grad_norm": 1.5095195770263672, "learning_rate": 1.7205882352941175e-05, "loss": 1.4538, "step": 227000 }, { "epoch": 0.62, "grad_norm": 1.6988993883132935, "learning_rate": 1.717436974789916e-05, "loss": 1.4552, "step": 227500 }, { "epoch": 0.62, "grad_norm": 1.4422426223754883, "learning_rate": 1.7142857142857142e-05, "loss": 1.4514, "step": 228000 }, { "epoch": 0.62, "grad_norm": 1.4488030672073364, "learning_rate": 1.7111344537815127e-05, "loss": 1.4545, "step": 228500 }, { "epoch": 0.62, "grad_norm": 1.4784460067749023, "learning_rate": 1.707983193277311e-05, "loss": 1.4527, "step": 229000 }, { "epoch": 0.62, "grad_norm": 1.4642586708068848, "learning_rate": 1.7048319327731094e-05, "loss": 1.4483, "step": 229500 }, { "epoch": 0.62, "grad_norm": 1.509343147277832, "learning_rate": 1.7016806722689076e-05, "loss": 1.4543, "step": 230000 }, { "epoch": 0.62, "grad_norm": 1.3862849473953247, "learning_rate": 1.6985294117647057e-05, "loss": 1.4531, "step": 230500 }, { "epoch": 0.63, "grad_norm": 1.4223895072937012, "learning_rate": 1.6953781512605042e-05, "loss": 1.451, "step": 231000 }, { "epoch": 0.63, "grad_norm": 1.4616318941116333, "learning_rate": 1.6922268907563024e-05, "loss": 1.4511, "step": 231500 }, { "epoch": 0.63, "grad_norm": 1.4746378660202026, "learning_rate": 1.689075630252101e-05, "loss": 1.4497, "step": 232000 }, { "epoch": 0.63, "grad_norm": 1.461519479751587, "learning_rate": 1.685924369747899e-05, "loss": 1.4516, "step": 232500 }, { "epoch": 0.63, "grad_norm": 1.3925315141677856, "learning_rate": 1.6827731092436976e-05, "loss": 1.4507, "step": 233000 }, { "epoch": 0.63, "grad_norm": 1.4032963514328003, "learning_rate": 1.6796218487394958e-05, "loss": 1.4497, "step": 233500 }, { "epoch": 0.63, "grad_norm": 1.4162888526916504, "learning_rate": 1.6764705882352943e-05, "loss": 1.4482, "step": 234000 }, { "epoch": 0.63, "grad_norm": 1.3672780990600586, "learning_rate": 1.6733193277310925e-05, "loss": 1.4518, "step": 234500 }, { "epoch": 0.64, "grad_norm": 1.522310733795166, "learning_rate": 1.670168067226891e-05, "loss": 1.4516, "step": 235000 }, { "epoch": 0.64, "grad_norm": 1.3994154930114746, "learning_rate": 1.6670168067226892e-05, "loss": 1.4468, "step": 235500 }, { "epoch": 0.64, "grad_norm": 1.4941591024398804, "learning_rate": 1.6638655462184877e-05, "loss": 1.4491, "step": 236000 }, { "epoch": 0.64, "grad_norm": 1.4521230459213257, "learning_rate": 1.660714285714286e-05, "loss": 1.4475, "step": 236500 }, { "epoch": 0.64, "grad_norm": 1.528152585029602, "learning_rate": 1.657563025210084e-05, "loss": 1.4473, "step": 237000 }, { "epoch": 0.64, "grad_norm": 1.4769060611724854, "learning_rate": 1.6544117647058825e-05, "loss": 1.4463, "step": 237500 }, { "epoch": 0.64, "grad_norm": 1.4506659507751465, "learning_rate": 1.6512605042016807e-05, "loss": 1.4458, "step": 238000 }, { "epoch": 0.65, "grad_norm": 1.491810917854309, "learning_rate": 1.6481092436974792e-05, "loss": 1.4498, "step": 238500 }, { "epoch": 0.65, "grad_norm": 1.4600553512573242, "learning_rate": 1.6449579831932774e-05, "loss": 1.4444, "step": 239000 }, { "epoch": 0.65, "grad_norm": 1.4451686143875122, "learning_rate": 1.641806722689076e-05, "loss": 1.4441, "step": 239500 }, { "epoch": 0.65, "grad_norm": 1.4227120876312256, "learning_rate": 1.6386554621848738e-05, "loss": 1.4448, "step": 240000 }, { "epoch": 0.65, "grad_norm": 1.5668320655822754, "learning_rate": 1.6355042016806723e-05, "loss": 1.4456, "step": 240500 }, { "epoch": 0.65, "grad_norm": 1.3923659324645996, "learning_rate": 1.6323529411764704e-05, "loss": 1.4477, "step": 241000 }, { "epoch": 0.65, "grad_norm": 1.4962598085403442, "learning_rate": 1.629201680672269e-05, "loss": 1.4454, "step": 241500 }, { "epoch": 0.66, "grad_norm": 1.4878734350204468, "learning_rate": 1.626050420168067e-05, "loss": 1.4461, "step": 242000 }, { "epoch": 0.66, "grad_norm": 1.4973180294036865, "learning_rate": 1.6228991596638656e-05, "loss": 1.4464, "step": 242500 }, { "epoch": 0.66, "grad_norm": 1.4737753868103027, "learning_rate": 1.6197478991596638e-05, "loss": 1.444, "step": 243000 }, { "epoch": 0.66, "grad_norm": 1.4609256982803345, "learning_rate": 1.616596638655462e-05, "loss": 1.4479, "step": 243500 }, { "epoch": 0.66, "grad_norm": 1.4048258066177368, "learning_rate": 1.6134453781512605e-05, "loss": 1.4428, "step": 244000 }, { "epoch": 0.66, "grad_norm": 1.399703025817871, "learning_rate": 1.6102941176470587e-05, "loss": 1.4433, "step": 244500 }, { "epoch": 0.66, "grad_norm": 1.5445500612258911, "learning_rate": 1.6071428571428572e-05, "loss": 1.4455, "step": 245000 }, { "epoch": 0.66, "grad_norm": 1.4742292165756226, "learning_rate": 1.6039915966386554e-05, "loss": 1.4428, "step": 245500 }, { "epoch": 0.67, "grad_norm": 1.4535382986068726, "learning_rate": 1.600840336134454e-05, "loss": 1.4453, "step": 246000 }, { "epoch": 0.67, "grad_norm": 1.467373013496399, "learning_rate": 1.597689075630252e-05, "loss": 1.4459, "step": 246500 }, { "epoch": 0.67, "grad_norm": 1.4863603115081787, "learning_rate": 1.5945378151260506e-05, "loss": 1.4444, "step": 247000 }, { "epoch": 0.67, "grad_norm": 1.5373426675796509, "learning_rate": 1.5913865546218487e-05, "loss": 1.4418, "step": 247500 }, { "epoch": 0.67, "grad_norm": 1.4747397899627686, "learning_rate": 1.5882352941176473e-05, "loss": 1.4423, "step": 248000 }, { "epoch": 0.67, "grad_norm": 1.5024008750915527, "learning_rate": 1.5850840336134454e-05, "loss": 1.4466, "step": 248500 }, { "epoch": 0.67, "grad_norm": 1.481330394744873, "learning_rate": 1.581932773109244e-05, "loss": 1.4395, "step": 249000 }, { "epoch": 0.68, "grad_norm": 1.419636607170105, "learning_rate": 1.578781512605042e-05, "loss": 1.4416, "step": 249500 }, { "epoch": 0.68, "grad_norm": 1.4620583057403564, "learning_rate": 1.5756302521008403e-05, "loss": 1.447, "step": 250000 }, { "epoch": 0.68, "grad_norm": 1.4666600227355957, "learning_rate": 1.5724789915966388e-05, "loss": 1.4378, "step": 250500 }, { "epoch": 0.68, "grad_norm": 1.4554154872894287, "learning_rate": 1.569327731092437e-05, "loss": 1.4439, "step": 251000 }, { "epoch": 0.68, "grad_norm": 1.4908123016357422, "learning_rate": 1.5661764705882355e-05, "loss": 1.4427, "step": 251500 }, { "epoch": 0.68, "grad_norm": 1.471479892730713, "learning_rate": 1.5630252100840337e-05, "loss": 1.4433, "step": 252000 }, { "epoch": 0.68, "grad_norm": 1.4541757106781006, "learning_rate": 1.5598739495798322e-05, "loss": 1.4438, "step": 252500 }, { "epoch": 0.68, "grad_norm": 1.7064818143844604, "learning_rate": 1.5567226890756304e-05, "loss": 1.4409, "step": 253000 }, { "epoch": 0.69, "grad_norm": 1.5056750774383545, "learning_rate": 1.553571428571429e-05, "loss": 1.4405, "step": 253500 }, { "epoch": 0.69, "grad_norm": 1.4601994752883911, "learning_rate": 1.550420168067227e-05, "loss": 1.4407, "step": 254000 }, { "epoch": 0.69, "grad_norm": 1.4508180618286133, "learning_rate": 1.5472689075630256e-05, "loss": 1.4471, "step": 254500 }, { "epoch": 0.69, "grad_norm": 1.476529598236084, "learning_rate": 1.5441176470588234e-05, "loss": 1.4416, "step": 255000 }, { "epoch": 0.69, "grad_norm": 1.5242764949798584, "learning_rate": 1.540966386554622e-05, "loss": 1.4406, "step": 255500 }, { "epoch": 0.69, "grad_norm": 1.405678153038025, "learning_rate": 1.53781512605042e-05, "loss": 1.4399, "step": 256000 }, { "epoch": 0.69, "grad_norm": 1.4689253568649292, "learning_rate": 1.5346638655462183e-05, "loss": 1.4409, "step": 256500 }, { "epoch": 0.7, "grad_norm": 1.5302820205688477, "learning_rate": 1.5315126050420168e-05, "loss": 1.4435, "step": 257000 }, { "epoch": 0.7, "grad_norm": 1.4745590686798096, "learning_rate": 1.528361344537815e-05, "loss": 1.4411, "step": 257500 }, { "epoch": 0.7, "grad_norm": 1.5703048706054688, "learning_rate": 1.5252100840336135e-05, "loss": 1.4372, "step": 258000 }, { "epoch": 0.7, "grad_norm": 1.4982346296310425, "learning_rate": 1.5220588235294118e-05, "loss": 1.4342, "step": 258500 }, { "epoch": 0.7, "grad_norm": 1.4562139511108398, "learning_rate": 1.51890756302521e-05, "loss": 1.4403, "step": 259000 }, { "epoch": 0.7, "grad_norm": 1.5004678964614868, "learning_rate": 1.5157563025210083e-05, "loss": 1.4405, "step": 259500 }, { "epoch": 0.7, "grad_norm": 1.4451349973678589, "learning_rate": 1.5126050420168067e-05, "loss": 1.436, "step": 260000 }, { "epoch": 0.71, "grad_norm": 1.420857548713684, "learning_rate": 1.509453781512605e-05, "loss": 1.4402, "step": 260500 }, { "epoch": 0.71, "grad_norm": 1.4772206544876099, "learning_rate": 1.5063025210084034e-05, "loss": 1.4373, "step": 261000 }, { "epoch": 0.71, "grad_norm": 1.4933620691299438, "learning_rate": 1.5031512605042017e-05, "loss": 1.4392, "step": 261500 }, { "epoch": 0.71, "grad_norm": 1.5023765563964844, "learning_rate": 1.5e-05, "loss": 1.438, "step": 262000 }, { "epoch": 0.71, "grad_norm": 1.4560567140579224, "learning_rate": 1.4968487394957984e-05, "loss": 1.439, "step": 262500 }, { "epoch": 0.71, "grad_norm": 1.5497692823410034, "learning_rate": 1.4936974789915967e-05, "loss": 1.4347, "step": 263000 }, { "epoch": 0.71, "grad_norm": 1.5201669931411743, "learning_rate": 1.490546218487395e-05, "loss": 1.4365, "step": 263500 }, { "epoch": 0.71, "grad_norm": 1.4907211065292358, "learning_rate": 1.4873949579831934e-05, "loss": 1.4334, "step": 264000 }, { "epoch": 0.72, "grad_norm": 1.4821357727050781, "learning_rate": 1.4842436974789918e-05, "loss": 1.4361, "step": 264500 }, { "epoch": 0.72, "grad_norm": 1.4968074560165405, "learning_rate": 1.4810924369747901e-05, "loss": 1.4352, "step": 265000 }, { "epoch": 0.72, "grad_norm": 1.475728154182434, "learning_rate": 1.4779411764705883e-05, "loss": 1.4365, "step": 265500 }, { "epoch": 0.72, "grad_norm": 1.560935378074646, "learning_rate": 1.4747899159663864e-05, "loss": 1.4381, "step": 266000 }, { "epoch": 0.72, "grad_norm": 1.4216580390930176, "learning_rate": 1.4716386554621848e-05, "loss": 1.4322, "step": 266500 }, { "epoch": 0.72, "grad_norm": 1.499648094177246, "learning_rate": 1.4684873949579831e-05, "loss": 1.4378, "step": 267000 }, { "epoch": 0.72, "grad_norm": 1.4971799850463867, "learning_rate": 1.4653361344537815e-05, "loss": 1.4334, "step": 267500 }, { "epoch": 0.73, "grad_norm": 1.5106513500213623, "learning_rate": 1.4621848739495798e-05, "loss": 1.4347, "step": 268000 }, { "epoch": 0.73, "grad_norm": 1.488006353378296, "learning_rate": 1.4590336134453782e-05, "loss": 1.4361, "step": 268500 }, { "epoch": 0.73, "grad_norm": 1.484994888305664, "learning_rate": 1.4558823529411765e-05, "loss": 1.4389, "step": 269000 }, { "epoch": 0.73, "grad_norm": 1.4334303140640259, "learning_rate": 1.4527310924369749e-05, "loss": 1.4366, "step": 269500 }, { "epoch": 0.73, "grad_norm": 1.4980212450027466, "learning_rate": 1.4495798319327732e-05, "loss": 1.4335, "step": 270000 }, { "epoch": 0.73, "grad_norm": 1.4758628606796265, "learning_rate": 1.4464285714285715e-05, "loss": 1.4367, "step": 270500 }, { "epoch": 0.73, "grad_norm": 1.4914411306381226, "learning_rate": 1.4432773109243699e-05, "loss": 1.4373, "step": 271000 }, { "epoch": 0.73, "grad_norm": 1.5274006128311157, "learning_rate": 1.4401260504201682e-05, "loss": 1.4364, "step": 271500 }, { "epoch": 0.74, "grad_norm": 1.4571418762207031, "learning_rate": 1.4369747899159664e-05, "loss": 1.4354, "step": 272000 }, { "epoch": 0.74, "grad_norm": 1.5726255178451538, "learning_rate": 1.4338235294117647e-05, "loss": 1.4338, "step": 272500 }, { "epoch": 0.74, "grad_norm": 1.5626286268234253, "learning_rate": 1.4306722689075631e-05, "loss": 1.4345, "step": 273000 }, { "epoch": 0.74, "grad_norm": 1.4581658840179443, "learning_rate": 1.4275210084033613e-05, "loss": 1.4339, "step": 273500 }, { "epoch": 0.74, "grad_norm": 1.4836556911468506, "learning_rate": 1.4243697478991596e-05, "loss": 1.4331, "step": 274000 }, { "epoch": 0.74, "grad_norm": 1.4955805540084839, "learning_rate": 1.421218487394958e-05, "loss": 1.434, "step": 274500 }, { "epoch": 0.74, "grad_norm": 1.5095798969268799, "learning_rate": 1.4180672268907563e-05, "loss": 1.4335, "step": 275000 }, { "epoch": 0.75, "grad_norm": 1.517565131187439, "learning_rate": 1.4149159663865546e-05, "loss": 1.4339, "step": 275500 }, { "epoch": 0.75, "grad_norm": 1.5089333057403564, "learning_rate": 1.411764705882353e-05, "loss": 1.4303, "step": 276000 }, { "epoch": 0.75, "grad_norm": 1.490110993385315, "learning_rate": 1.4086134453781513e-05, "loss": 1.4378, "step": 276500 }, { "epoch": 0.75, "grad_norm": 1.4934676885604858, "learning_rate": 1.4054621848739497e-05, "loss": 1.4309, "step": 277000 }, { "epoch": 0.75, "grad_norm": 1.453904628753662, "learning_rate": 1.402310924369748e-05, "loss": 1.4345, "step": 277500 }, { "epoch": 0.75, "grad_norm": 1.4364333152770996, "learning_rate": 1.3991596638655464e-05, "loss": 1.4347, "step": 278000 }, { "epoch": 0.75, "grad_norm": 1.5105829238891602, "learning_rate": 1.3960084033613445e-05, "loss": 1.4373, "step": 278500 }, { "epoch": 0.76, "grad_norm": 1.5879383087158203, "learning_rate": 1.3928571428571429e-05, "loss": 1.4337, "step": 279000 }, { "epoch": 0.76, "grad_norm": 1.4907859563827515, "learning_rate": 1.3897058823529412e-05, "loss": 1.4378, "step": 279500 }, { "epoch": 0.76, "grad_norm": 1.4965413808822632, "learning_rate": 1.3865546218487396e-05, "loss": 1.4332, "step": 280000 }, { "epoch": 0.76, "grad_norm": 1.4512360095977783, "learning_rate": 1.3834033613445379e-05, "loss": 1.4293, "step": 280500 }, { "epoch": 0.76, "grad_norm": 1.5323312282562256, "learning_rate": 1.3802521008403361e-05, "loss": 1.4348, "step": 281000 }, { "epoch": 0.76, "grad_norm": 1.515937089920044, "learning_rate": 1.3771008403361344e-05, "loss": 1.435, "step": 281500 }, { "epoch": 0.76, "grad_norm": 1.5589243173599243, "learning_rate": 1.3739495798319328e-05, "loss": 1.4276, "step": 282000 }, { "epoch": 0.76, "grad_norm": 1.4904866218566895, "learning_rate": 1.3707983193277311e-05, "loss": 1.4317, "step": 282500 }, { "epoch": 0.77, "grad_norm": 1.4851187467575073, "learning_rate": 1.3676470588235295e-05, "loss": 1.4297, "step": 283000 }, { "epoch": 0.77, "grad_norm": 1.3728834390640259, "learning_rate": 1.3644957983193278e-05, "loss": 1.4322, "step": 283500 }, { "epoch": 0.77, "grad_norm": 1.738533854484558, "learning_rate": 1.3613445378151261e-05, "loss": 1.4293, "step": 284000 }, { "epoch": 0.77, "grad_norm": 1.5092045068740845, "learning_rate": 1.3581932773109245e-05, "loss": 1.4292, "step": 284500 }, { "epoch": 0.77, "grad_norm": 1.5049362182617188, "learning_rate": 1.3550420168067227e-05, "loss": 1.4286, "step": 285000 }, { "epoch": 0.77, "grad_norm": 1.4427067041397095, "learning_rate": 1.351890756302521e-05, "loss": 1.4279, "step": 285500 }, { "epoch": 0.77, "grad_norm": 1.4460445642471313, "learning_rate": 1.3487394957983194e-05, "loss": 1.4301, "step": 286000 }, { "epoch": 0.78, "grad_norm": 1.5012342929840088, "learning_rate": 1.3455882352941177e-05, "loss": 1.4287, "step": 286500 }, { "epoch": 0.78, "grad_norm": 1.4399917125701904, "learning_rate": 1.342436974789916e-05, "loss": 1.4308, "step": 287000 }, { "epoch": 0.78, "grad_norm": 1.4089640378952026, "learning_rate": 1.3392857142857144e-05, "loss": 1.4264, "step": 287500 }, { "epoch": 0.78, "grad_norm": 1.5012991428375244, "learning_rate": 1.3361344537815127e-05, "loss": 1.4296, "step": 288000 }, { "epoch": 0.78, "grad_norm": 1.4144240617752075, "learning_rate": 1.3329831932773109e-05, "loss": 1.4259, "step": 288500 }, { "epoch": 0.78, "grad_norm": 1.4895191192626953, "learning_rate": 1.3298319327731092e-05, "loss": 1.4312, "step": 289000 }, { "epoch": 0.78, "grad_norm": 1.5855236053466797, "learning_rate": 1.3266806722689076e-05, "loss": 1.4275, "step": 289500 }, { "epoch": 0.79, "grad_norm": 1.4119740724563599, "learning_rate": 1.323529411764706e-05, "loss": 1.428, "step": 290000 }, { "epoch": 0.79, "grad_norm": 1.5101768970489502, "learning_rate": 1.3203781512605043e-05, "loss": 1.4289, "step": 290500 }, { "epoch": 0.79, "grad_norm": 1.4803494215011597, "learning_rate": 1.3172268907563025e-05, "loss": 1.4273, "step": 291000 }, { "epoch": 0.79, "grad_norm": 1.5688806772232056, "learning_rate": 1.3140756302521008e-05, "loss": 1.4276, "step": 291500 }, { "epoch": 0.79, "grad_norm": 2.2357559204101562, "learning_rate": 1.3109243697478991e-05, "loss": 1.4294, "step": 292000 }, { "epoch": 0.79, "grad_norm": 1.4668666124343872, "learning_rate": 1.3077731092436975e-05, "loss": 1.4293, "step": 292500 }, { "epoch": 0.79, "grad_norm": 1.46941339969635, "learning_rate": 1.3046218487394958e-05, "loss": 1.4321, "step": 293000 }, { "epoch": 0.79, "grad_norm": 1.633657455444336, "learning_rate": 1.3014705882352942e-05, "loss": 1.4272, "step": 293500 }, { "epoch": 0.8, "grad_norm": 1.6233292818069458, "learning_rate": 1.2983193277310925e-05, "loss": 1.4268, "step": 294000 }, { "epoch": 0.8, "grad_norm": 1.4441863298416138, "learning_rate": 1.2951680672268909e-05, "loss": 1.4262, "step": 294500 }, { "epoch": 0.8, "grad_norm": 1.5020571947097778, "learning_rate": 1.2920168067226892e-05, "loss": 1.4247, "step": 295000 }, { "epoch": 0.8, "grad_norm": 1.476090669631958, "learning_rate": 1.2888655462184874e-05, "loss": 1.426, "step": 295500 }, { "epoch": 0.8, "grad_norm": 1.4784507751464844, "learning_rate": 1.2857142857142857e-05, "loss": 1.4262, "step": 296000 }, { "epoch": 0.8, "grad_norm": 1.4484635591506958, "learning_rate": 1.282563025210084e-05, "loss": 1.426, "step": 296500 }, { "epoch": 0.8, "grad_norm": 1.5106843709945679, "learning_rate": 1.2794117647058824e-05, "loss": 1.4282, "step": 297000 }, { "epoch": 0.81, "grad_norm": 1.401078701019287, "learning_rate": 1.2762605042016806e-05, "loss": 1.4229, "step": 297500 }, { "epoch": 0.81, "grad_norm": 1.4721170663833618, "learning_rate": 1.273109243697479e-05, "loss": 1.4281, "step": 298000 }, { "epoch": 0.81, "grad_norm": 1.5121667385101318, "learning_rate": 1.2699579831932773e-05, "loss": 1.4272, "step": 298500 }, { "epoch": 0.81, "grad_norm": 1.4307163953781128, "learning_rate": 1.2668067226890756e-05, "loss": 1.4269, "step": 299000 }, { "epoch": 0.81, "grad_norm": 1.520992398262024, "learning_rate": 1.263655462184874e-05, "loss": 1.426, "step": 299500 }, { "epoch": 0.81, "grad_norm": 1.4671803712844849, "learning_rate": 1.2605042016806723e-05, "loss": 1.4207, "step": 300000 }, { "epoch": 0.81, "grad_norm": 1.4773739576339722, "learning_rate": 1.2573529411764706e-05, "loss": 1.4248, "step": 300500 }, { "epoch": 0.81, "grad_norm": 1.4782676696777344, "learning_rate": 1.254201680672269e-05, "loss": 1.4265, "step": 301000 }, { "epoch": 0.82, "grad_norm": 1.5411614179611206, "learning_rate": 1.2510504201680673e-05, "loss": 1.4223, "step": 301500 }, { "epoch": 0.82, "grad_norm": 1.4932873249053955, "learning_rate": 1.2478991596638657e-05, "loss": 1.4252, "step": 302000 }, { "epoch": 0.82, "grad_norm": 1.451866626739502, "learning_rate": 1.244747899159664e-05, "loss": 1.4234, "step": 302500 }, { "epoch": 0.82, "grad_norm": 1.4181545972824097, "learning_rate": 1.2415966386554622e-05, "loss": 1.4249, "step": 303000 }, { "epoch": 0.82, "grad_norm": 1.460598349571228, "learning_rate": 1.2384453781512605e-05, "loss": 1.4237, "step": 303500 }, { "epoch": 0.82, "grad_norm": 1.4560647010803223, "learning_rate": 1.2352941176470587e-05, "loss": 1.4199, "step": 304000 }, { "epoch": 0.82, "grad_norm": 1.4535589218139648, "learning_rate": 1.232142857142857e-05, "loss": 1.4248, "step": 304500 }, { "epoch": 0.83, "grad_norm": 1.4643712043762207, "learning_rate": 1.2289915966386554e-05, "loss": 1.4257, "step": 305000 }, { "epoch": 0.83, "grad_norm": 1.5106630325317383, "learning_rate": 1.2258403361344537e-05, "loss": 1.4248, "step": 305500 }, { "epoch": 0.83, "grad_norm": 1.489579439163208, "learning_rate": 1.2226890756302521e-05, "loss": 1.4215, "step": 306000 }, { "epoch": 0.83, "grad_norm": 1.4746323823928833, "learning_rate": 1.2195378151260504e-05, "loss": 1.4202, "step": 306500 }, { "epoch": 0.83, "grad_norm": 1.4702941179275513, "learning_rate": 1.2163865546218488e-05, "loss": 1.4214, "step": 307000 }, { "epoch": 0.83, "grad_norm": 1.5852062702178955, "learning_rate": 1.2132352941176471e-05, "loss": 1.4229, "step": 307500 }, { "epoch": 0.83, "grad_norm": 1.5045883655548096, "learning_rate": 1.2100840336134455e-05, "loss": 1.4245, "step": 308000 }, { "epoch": 0.84, "grad_norm": 1.4635881185531616, "learning_rate": 1.2069327731092438e-05, "loss": 1.425, "step": 308500 }, { "epoch": 0.84, "grad_norm": 1.4574062824249268, "learning_rate": 1.2037815126050422e-05, "loss": 1.4241, "step": 309000 }, { "epoch": 0.84, "grad_norm": 1.4566025733947754, "learning_rate": 1.2006302521008405e-05, "loss": 1.4204, "step": 309500 }, { "epoch": 0.84, "grad_norm": 1.525225281715393, "learning_rate": 1.1974789915966388e-05, "loss": 1.4218, "step": 310000 }, { "epoch": 0.84, "grad_norm": 1.4726413488388062, "learning_rate": 1.1943277310924368e-05, "loss": 1.422, "step": 310500 }, { "epoch": 0.84, "grad_norm": 1.4462370872497559, "learning_rate": 1.1911764705882352e-05, "loss": 1.4174, "step": 311000 }, { "epoch": 0.84, "grad_norm": 1.4930446147918701, "learning_rate": 1.1880252100840335e-05, "loss": 1.4168, "step": 311500 }, { "epoch": 0.84, "grad_norm": 2.050973892211914, "learning_rate": 1.1848739495798319e-05, "loss": 1.4205, "step": 312000 }, { "epoch": 0.85, "grad_norm": 1.514642596244812, "learning_rate": 1.1817226890756302e-05, "loss": 1.42, "step": 312500 }, { "epoch": 0.85, "grad_norm": 1.4417085647583008, "learning_rate": 1.1785714285714286e-05, "loss": 1.42, "step": 313000 }, { "epoch": 0.85, "grad_norm": 1.473029375076294, "learning_rate": 1.1754201680672269e-05, "loss": 1.4228, "step": 313500 }, { "epoch": 0.85, "grad_norm": 1.573533296585083, "learning_rate": 1.1722689075630253e-05, "loss": 1.4193, "step": 314000 }, { "epoch": 0.85, "grad_norm": 1.5040185451507568, "learning_rate": 1.1691176470588236e-05, "loss": 1.4209, "step": 314500 }, { "epoch": 0.85, "grad_norm": 1.472280740737915, "learning_rate": 1.165966386554622e-05, "loss": 1.4203, "step": 315000 }, { "epoch": 0.85, "grad_norm": 1.4371939897537231, "learning_rate": 1.1628151260504203e-05, "loss": 1.4197, "step": 315500 }, { "epoch": 0.86, "grad_norm": 1.74043607711792, "learning_rate": 1.1596638655462186e-05, "loss": 1.4189, "step": 316000 }, { "epoch": 0.86, "grad_norm": 1.5340248346328735, "learning_rate": 1.156512605042017e-05, "loss": 1.4178, "step": 316500 }, { "epoch": 0.86, "grad_norm": 1.4650968313217163, "learning_rate": 1.1533613445378151e-05, "loss": 1.4157, "step": 317000 }, { "epoch": 0.86, "grad_norm": 1.6052621603012085, "learning_rate": 1.1502100840336135e-05, "loss": 1.4221, "step": 317500 }, { "epoch": 0.86, "grad_norm": 1.4934183359146118, "learning_rate": 1.1470588235294117e-05, "loss": 1.4219, "step": 318000 }, { "epoch": 0.86, "grad_norm": 1.6604057550430298, "learning_rate": 1.14390756302521e-05, "loss": 1.4165, "step": 318500 }, { "epoch": 0.86, "grad_norm": 1.448686957359314, "learning_rate": 1.1407563025210084e-05, "loss": 1.4167, "step": 319000 }, { "epoch": 0.86, "grad_norm": 1.4600298404693604, "learning_rate": 1.1376050420168067e-05, "loss": 1.4196, "step": 319500 }, { "epoch": 0.87, "grad_norm": 1.4856675863265991, "learning_rate": 1.134453781512605e-05, "loss": 1.4188, "step": 320000 }, { "epoch": 0.87, "grad_norm": 1.5987657308578491, "learning_rate": 1.1313025210084034e-05, "loss": 1.4176, "step": 320500 }, { "epoch": 0.87, "grad_norm": 1.4707138538360596, "learning_rate": 1.1281512605042017e-05, "loss": 1.4177, "step": 321000 }, { "epoch": 0.87, "grad_norm": 1.4592325687408447, "learning_rate": 1.125e-05, "loss": 1.419, "step": 321500 }, { "epoch": 0.87, "grad_norm": 1.477171540260315, "learning_rate": 1.1218487394957984e-05, "loss": 1.4118, "step": 322000 }, { "epoch": 0.87, "grad_norm": 1.5284925699234009, "learning_rate": 1.1186974789915968e-05, "loss": 1.418, "step": 322500 }, { "epoch": 0.87, "grad_norm": 1.5696572065353394, "learning_rate": 1.1155462184873951e-05, "loss": 1.4175, "step": 323000 }, { "epoch": 0.88, "grad_norm": 1.5421068668365479, "learning_rate": 1.1123949579831933e-05, "loss": 1.4134, "step": 323500 }, { "epoch": 0.88, "grad_norm": 1.5944511890411377, "learning_rate": 1.1092436974789916e-05, "loss": 1.4139, "step": 324000 }, { "epoch": 0.88, "grad_norm": 1.4496880769729614, "learning_rate": 1.10609243697479e-05, "loss": 1.4131, "step": 324500 }, { "epoch": 0.88, "grad_norm": 1.5021952390670776, "learning_rate": 1.1029411764705883e-05, "loss": 1.4144, "step": 325000 }, { "epoch": 0.88, "grad_norm": 1.5261799097061157, "learning_rate": 1.0997899159663865e-05, "loss": 1.4149, "step": 325500 }, { "epoch": 0.88, "grad_norm": 1.396974802017212, "learning_rate": 1.0966386554621848e-05, "loss": 1.4149, "step": 326000 }, { "epoch": 0.88, "grad_norm": 1.561023235321045, "learning_rate": 1.0934873949579832e-05, "loss": 1.4183, "step": 326500 }, { "epoch": 0.89, "grad_norm": 1.509398102760315, "learning_rate": 1.0903361344537815e-05, "loss": 1.4158, "step": 327000 }, { "epoch": 0.89, "grad_norm": 1.5046377182006836, "learning_rate": 1.0871848739495799e-05, "loss": 1.4137, "step": 327500 }, { "epoch": 0.89, "grad_norm": 1.504531979560852, "learning_rate": 1.0840336134453782e-05, "loss": 1.4155, "step": 328000 }, { "epoch": 0.89, "grad_norm": 1.6807337999343872, "learning_rate": 1.0808823529411765e-05, "loss": 1.4161, "step": 328500 }, { "epoch": 0.89, "grad_norm": 1.4374127388000488, "learning_rate": 1.0777310924369749e-05, "loss": 1.4162, "step": 329000 }, { "epoch": 0.89, "grad_norm": 1.4737296104431152, "learning_rate": 1.0745798319327732e-05, "loss": 1.4176, "step": 329500 }, { "epoch": 0.89, "grad_norm": 1.5063775777816772, "learning_rate": 1.0714285714285714e-05, "loss": 1.4128, "step": 330000 }, { "epoch": 0.89, "grad_norm": 1.506156325340271, "learning_rate": 1.0682773109243698e-05, "loss": 1.4176, "step": 330500 }, { "epoch": 0.9, "grad_norm": 1.5394564867019653, "learning_rate": 1.0651260504201681e-05, "loss": 1.4119, "step": 331000 }, { "epoch": 0.9, "grad_norm": 1.4483675956726074, "learning_rate": 1.0619747899159664e-05, "loss": 1.4138, "step": 331500 }, { "epoch": 0.9, "grad_norm": 2.412644147872925, "learning_rate": 1.0588235294117648e-05, "loss": 1.4146, "step": 332000 }, { "epoch": 0.9, "grad_norm": 1.9123421907424927, "learning_rate": 1.0556722689075631e-05, "loss": 1.4194, "step": 332500 }, { "epoch": 0.9, "grad_norm": 1.4911080598831177, "learning_rate": 1.0525210084033613e-05, "loss": 1.418, "step": 333000 }, { "epoch": 0.9, "grad_norm": 1.511194109916687, "learning_rate": 1.0493697478991596e-05, "loss": 1.4114, "step": 333500 }, { "epoch": 0.9, "grad_norm": 1.4733537435531616, "learning_rate": 1.046218487394958e-05, "loss": 1.4149, "step": 334000 }, { "epoch": 0.91, "grad_norm": 1.4742454290390015, "learning_rate": 1.0430672268907563e-05, "loss": 1.4163, "step": 334500 }, { "epoch": 0.91, "grad_norm": 1.4842146635055542, "learning_rate": 1.0399159663865547e-05, "loss": 1.4118, "step": 335000 }, { "epoch": 0.91, "grad_norm": 1.5346875190734863, "learning_rate": 1.036764705882353e-05, "loss": 1.4148, "step": 335500 }, { "epoch": 0.91, "grad_norm": 1.6554747819900513, "learning_rate": 1.0336134453781512e-05, "loss": 1.416, "step": 336000 }, { "epoch": 0.91, "grad_norm": 1.5015145540237427, "learning_rate": 1.0304621848739495e-05, "loss": 1.4146, "step": 336500 }, { "epoch": 0.91, "grad_norm": 1.4634381532669067, "learning_rate": 1.0273109243697479e-05, "loss": 1.4199, "step": 337000 }, { "epoch": 0.91, "grad_norm": 1.7802950143814087, "learning_rate": 1.0241596638655462e-05, "loss": 1.4127, "step": 337500 }, { "epoch": 0.91, "grad_norm": 3.0422604084014893, "learning_rate": 1.0210084033613446e-05, "loss": 1.4121, "step": 338000 }, { "epoch": 0.92, "grad_norm": 1.4957752227783203, "learning_rate": 1.0178571428571429e-05, "loss": 1.4151, "step": 338500 }, { "epoch": 0.92, "grad_norm": 1.6368649005889893, "learning_rate": 1.0147058823529413e-05, "loss": 1.4211, "step": 339000 }, { "epoch": 0.92, "grad_norm": 1.493455410003662, "learning_rate": 1.0115546218487396e-05, "loss": 1.4131, "step": 339500 }, { "epoch": 0.92, "grad_norm": 1.5789108276367188, "learning_rate": 1.008403361344538e-05, "loss": 1.413, "step": 340000 }, { "epoch": 0.92, "grad_norm": 1.4984022378921509, "learning_rate": 1.0052521008403361e-05, "loss": 1.4156, "step": 340500 }, { "epoch": 0.92, "grad_norm": 1.443871021270752, "learning_rate": 1.0021008403361345e-05, "loss": 1.4123, "step": 341000 }, { "epoch": 0.92, "grad_norm": 1.532205581665039, "learning_rate": 9.989495798319328e-06, "loss": 1.4145, "step": 341500 }, { "epoch": 0.93, "grad_norm": 1.487888216972351, "learning_rate": 9.957983193277312e-06, "loss": 1.4132, "step": 342000 }, { "epoch": 0.93, "grad_norm": 1.5009286403656006, "learning_rate": 9.926470588235293e-06, "loss": 1.4132, "step": 342500 }, { "epoch": 0.93, "grad_norm": 1.53665292263031, "learning_rate": 9.894957983193277e-06, "loss": 1.4114, "step": 343000 }, { "epoch": 0.93, "grad_norm": 1.4559004306793213, "learning_rate": 9.86344537815126e-06, "loss": 1.4128, "step": 343500 }, { "epoch": 0.93, "grad_norm": 1.472882628440857, "learning_rate": 9.831932773109244e-06, "loss": 1.4106, "step": 344000 }, { "epoch": 0.93, "grad_norm": 1.528029203414917, "learning_rate": 9.800420168067227e-06, "loss": 1.4133, "step": 344500 }, { "epoch": 0.93, "grad_norm": 1.4509416818618774, "learning_rate": 9.76890756302521e-06, "loss": 1.4099, "step": 345000 }, { "epoch": 0.94, "grad_norm": 1.644581913948059, "learning_rate": 9.737394957983194e-06, "loss": 1.4102, "step": 345500 }, { "epoch": 0.94, "grad_norm": 1.5054335594177246, "learning_rate": 9.705882352941177e-06, "loss": 1.4119, "step": 346000 }, { "epoch": 0.94, "grad_norm": 1.47361421585083, "learning_rate": 9.67436974789916e-06, "loss": 1.4094, "step": 346500 }, { "epoch": 0.94, "grad_norm": 1.461796522140503, "learning_rate": 9.642857142857144e-06, "loss": 1.4108, "step": 347000 }, { "epoch": 0.94, "grad_norm": 1.6115666627883911, "learning_rate": 9.611344537815128e-06, "loss": 1.4096, "step": 347500 }, { "epoch": 0.94, "grad_norm": 1.526082992553711, "learning_rate": 9.57983193277311e-06, "loss": 1.4094, "step": 348000 }, { "epoch": 0.94, "grad_norm": 1.4482905864715576, "learning_rate": 9.548319327731093e-06, "loss": 1.4082, "step": 348500 }, { "epoch": 0.94, "grad_norm": 1.5066174268722534, "learning_rate": 9.516806722689075e-06, "loss": 1.4122, "step": 349000 }, { "epoch": 0.95, "grad_norm": 1.5225650072097778, "learning_rate": 9.485294117647058e-06, "loss": 1.4069, "step": 349500 }, { "epoch": 0.95, "grad_norm": 1.4794243574142456, "learning_rate": 9.453781512605041e-06, "loss": 1.4087, "step": 350000 }, { "epoch": 0.95, "grad_norm": 1.4825611114501953, "learning_rate": 9.422268907563025e-06, "loss": 1.4098, "step": 350500 }, { "epoch": 0.95, "grad_norm": 1.50911283493042, "learning_rate": 9.390756302521008e-06, "loss": 1.4066, "step": 351000 }, { "epoch": 0.95, "grad_norm": 1.5070313215255737, "learning_rate": 9.359243697478992e-06, "loss": 1.4067, "step": 351500 }, { "epoch": 0.95, "grad_norm": 1.4434587955474854, "learning_rate": 9.327731092436975e-06, "loss": 1.4074, "step": 352000 }, { "epoch": 0.95, "grad_norm": 1.4484858512878418, "learning_rate": 9.296218487394959e-06, "loss": 1.4056, "step": 352500 }, { "epoch": 0.96, "grad_norm": 1.6141736507415771, "learning_rate": 9.264705882352942e-06, "loss": 1.4084, "step": 353000 }, { "epoch": 0.96, "grad_norm": 1.4847619533538818, "learning_rate": 9.233193277310925e-06, "loss": 1.4092, "step": 353500 }, { "epoch": 0.96, "grad_norm": 1.4862167835235596, "learning_rate": 9.201680672268909e-06, "loss": 1.4086, "step": 354000 }, { "epoch": 0.96, "grad_norm": 1.5454356670379639, "learning_rate": 9.170168067226892e-06, "loss": 1.4088, "step": 354500 }, { "epoch": 0.96, "grad_norm": 1.4676494598388672, "learning_rate": 9.138655462184874e-06, "loss": 1.4094, "step": 355000 }, { "epoch": 0.96, "grad_norm": 1.4859504699707031, "learning_rate": 9.107142857142856e-06, "loss": 1.4076, "step": 355500 }, { "epoch": 0.96, "grad_norm": 1.499040961265564, "learning_rate": 9.07563025210084e-06, "loss": 1.4104, "step": 356000 }, { "epoch": 0.97, "grad_norm": 1.4864604473114014, "learning_rate": 9.044117647058823e-06, "loss": 1.4061, "step": 356500 }, { "epoch": 0.97, "grad_norm": 1.4507191181182861, "learning_rate": 9.012605042016806e-06, "loss": 1.4062, "step": 357000 }, { "epoch": 0.97, "grad_norm": 1.468526840209961, "learning_rate": 8.98109243697479e-06, "loss": 1.4081, "step": 357500 }, { "epoch": 0.97, "grad_norm": 1.6709305047988892, "learning_rate": 8.949579831932773e-06, "loss": 1.4126, "step": 358000 }, { "epoch": 0.97, "grad_norm": 1.9611443281173706, "learning_rate": 8.918067226890756e-06, "loss": 1.4079, "step": 358500 }, { "epoch": 0.97, "grad_norm": 1.6809275150299072, "learning_rate": 8.88655462184874e-06, "loss": 1.4114, "step": 359000 }, { "epoch": 0.97, "grad_norm": 5.746359825134277, "learning_rate": 8.855042016806723e-06, "loss": 1.4084, "step": 359500 }, { "epoch": 0.97, "grad_norm": 5.197726726531982, "learning_rate": 8.823529411764707e-06, "loss": 1.4066, "step": 360000 }, { "epoch": 0.98, "grad_norm": 1.4346739053726196, "learning_rate": 8.79201680672269e-06, "loss": 1.4066, "step": 360500 }, { "epoch": 0.98, "grad_norm": 1.571542739868164, "learning_rate": 8.760504201680674e-06, "loss": 1.4097, "step": 361000 }, { "epoch": 0.98, "grad_norm": 1.5356281995773315, "learning_rate": 8.728991596638657e-06, "loss": 1.4045, "step": 361500 }, { "epoch": 0.98, "grad_norm": 1.7401924133300781, "learning_rate": 8.697478991596639e-06, "loss": 1.4067, "step": 362000 }, { "epoch": 0.98, "grad_norm": 1.5491187572479248, "learning_rate": 8.66596638655462e-06, "loss": 1.4042, "step": 362500 }, { "epoch": 0.98, "grad_norm": 1.5863696336746216, "learning_rate": 8.634453781512604e-06, "loss": 1.4074, "step": 363000 }, { "epoch": 0.98, "grad_norm": 1.450952410697937, "learning_rate": 8.602941176470587e-06, "loss": 1.4076, "step": 363500 }, { "epoch": 0.99, "grad_norm": 1.5750932693481445, "learning_rate": 8.571428571428571e-06, "loss": 1.41, "step": 364000 }, { "epoch": 0.99, "grad_norm": 1.4661774635314941, "learning_rate": 8.539915966386554e-06, "loss": 1.4091, "step": 364500 }, { "epoch": 0.99, "grad_norm": 1.540864109992981, "learning_rate": 8.508403361344538e-06, "loss": 1.4052, "step": 365000 }, { "epoch": 0.99, "grad_norm": 1.5120595693588257, "learning_rate": 8.476890756302521e-06, "loss": 1.4072, "step": 365500 }, { "epoch": 0.99, "grad_norm": 1.5357037782669067, "learning_rate": 8.445378151260505e-06, "loss": 1.4097, "step": 366000 }, { "epoch": 0.99, "grad_norm": 1.5010443925857544, "learning_rate": 8.413865546218488e-06, "loss": 1.4094, "step": 366500 }, { "epoch": 0.99, "grad_norm": 1.4643309116363525, "learning_rate": 8.382352941176472e-06, "loss": 1.4077, "step": 367000 }, { "epoch": 0.99, "grad_norm": 1.4524095058441162, "learning_rate": 8.350840336134455e-06, "loss": 1.4065, "step": 367500 }, { "epoch": 1.0, "grad_norm": 1.5203324556350708, "learning_rate": 8.319327731092438e-06, "loss": 1.4035, "step": 368000 }, { "epoch": 1.0, "grad_norm": 1.4688167572021484, "learning_rate": 8.28781512605042e-06, "loss": 1.4067, "step": 368500 }, { "epoch": 1.0, "grad_norm": 1.5595752000808716, "learning_rate": 8.256302521008404e-06, "loss": 1.4059, "step": 369000 }, { "epoch": 1.0, "grad_norm": 1.4404747486114502, "learning_rate": 8.224789915966387e-06, "loss": 1.4035, "step": 369500 }, { "epoch": 1.0, "grad_norm": 1.6032897233963013, "learning_rate": 8.193277310924369e-06, "loss": 1.4001, "step": 370000 }, { "epoch": 1.0, "grad_norm": 1.6836262941360474, "learning_rate": 8.161764705882352e-06, "loss": 1.3981, "step": 370500 }, { "epoch": 1.0, "grad_norm": 1.5205241441726685, "learning_rate": 8.130252100840336e-06, "loss": 1.3994, "step": 371000 }, { "epoch": 1.01, "grad_norm": 1.7194490432739258, "learning_rate": 8.098739495798319e-06, "loss": 1.4027, "step": 371500 }, { "epoch": 1.01, "grad_norm": 1.4517977237701416, "learning_rate": 8.067226890756303e-06, "loss": 1.4022, "step": 372000 }, { "epoch": 1.01, "grad_norm": 1.6818935871124268, "learning_rate": 8.035714285714286e-06, "loss": 1.4028, "step": 372500 }, { "epoch": 1.01, "grad_norm": 1.5117074251174927, "learning_rate": 8.00420168067227e-06, "loss": 1.4021, "step": 373000 }, { "epoch": 1.01, "grad_norm": 1.4689205884933472, "learning_rate": 7.972689075630253e-06, "loss": 1.4057, "step": 373500 }, { "epoch": 1.01, "grad_norm": 1.525889277458191, "learning_rate": 7.941176470588236e-06, "loss": 1.4041, "step": 374000 }, { "epoch": 1.01, "grad_norm": 1.4896938800811768, "learning_rate": 7.90966386554622e-06, "loss": 1.4027, "step": 374500 }, { "epoch": 1.02, "grad_norm": 1.4765034914016724, "learning_rate": 7.878151260504201e-06, "loss": 1.4005, "step": 375000 }, { "epoch": 1.02, "grad_norm": 1.5386637449264526, "learning_rate": 7.846638655462185e-06, "loss": 1.397, "step": 375500 }, { "epoch": 1.02, "grad_norm": 1.4808331727981567, "learning_rate": 7.815126050420168e-06, "loss": 1.401, "step": 376000 }, { "epoch": 1.02, "grad_norm": 1.517560362815857, "learning_rate": 7.783613445378152e-06, "loss": 1.4037, "step": 376500 }, { "epoch": 1.02, "grad_norm": 1.6733453273773193, "learning_rate": 7.752100840336135e-06, "loss": 1.3976, "step": 377000 }, { "epoch": 1.02, "grad_norm": 1.480815052986145, "learning_rate": 7.720588235294117e-06, "loss": 1.4, "step": 377500 }, { "epoch": 1.02, "grad_norm": 1.4836503267288208, "learning_rate": 7.6890756302521e-06, "loss": 1.3977, "step": 378000 }, { "epoch": 1.02, "grad_norm": 1.442256212234497, "learning_rate": 7.657563025210084e-06, "loss": 1.399, "step": 378500 }, { "epoch": 1.03, "grad_norm": 1.8496633768081665, "learning_rate": 7.626050420168067e-06, "loss": 1.4038, "step": 379000 }, { "epoch": 1.03, "grad_norm": 1.4886460304260254, "learning_rate": 7.59453781512605e-06, "loss": 1.4061, "step": 379500 }, { "epoch": 1.03, "grad_norm": 1.550764799118042, "learning_rate": 7.563025210084033e-06, "loss": 1.4003, "step": 380000 }, { "epoch": 1.03, "grad_norm": 1.5111615657806396, "learning_rate": 7.531512605042017e-06, "loss": 1.4021, "step": 380500 }, { "epoch": 1.03, "grad_norm": 1.5873339176177979, "learning_rate": 7.5e-06, "loss": 1.4003, "step": 381000 }, { "epoch": 1.03, "grad_norm": 1.5139081478118896, "learning_rate": 7.468487394957984e-06, "loss": 1.3974, "step": 381500 }, { "epoch": 1.03, "grad_norm": 1.4700753688812256, "learning_rate": 7.436974789915967e-06, "loss": 1.4009, "step": 382000 }, { "epoch": 1.04, "grad_norm": 1.4294934272766113, "learning_rate": 7.4054621848739505e-06, "loss": 1.3997, "step": 382500 }, { "epoch": 1.04, "grad_norm": 1.432667851448059, "learning_rate": 7.373949579831932e-06, "loss": 1.3992, "step": 383000 }, { "epoch": 1.04, "grad_norm": 1.6012872457504272, "learning_rate": 7.342436974789916e-06, "loss": 1.3988, "step": 383500 }, { "epoch": 1.04, "grad_norm": 1.5000537633895874, "learning_rate": 7.310924369747899e-06, "loss": 1.399, "step": 384000 }, { "epoch": 1.04, "grad_norm": 1.5064808130264282, "learning_rate": 7.2794117647058826e-06, "loss": 1.4022, "step": 384500 }, { "epoch": 1.04, "grad_norm": 1.5001455545425415, "learning_rate": 7.247899159663866e-06, "loss": 1.3947, "step": 385000 }, { "epoch": 1.04, "grad_norm": 1.4360790252685547, "learning_rate": 7.2163865546218494e-06, "loss": 1.3983, "step": 385500 }, { "epoch": 1.04, "grad_norm": 1.4993146657943726, "learning_rate": 7.184873949579832e-06, "loss": 1.3987, "step": 386000 }, { "epoch": 1.05, "grad_norm": 1.4621449708938599, "learning_rate": 7.1533613445378155e-06, "loss": 1.3974, "step": 386500 }, { "epoch": 1.05, "grad_norm": 1.7409414052963257, "learning_rate": 7.121848739495798e-06, "loss": 1.4004, "step": 387000 }, { "epoch": 1.05, "grad_norm": 1.4486150741577148, "learning_rate": 7.0903361344537815e-06, "loss": 1.3982, "step": 387500 }, { "epoch": 1.05, "grad_norm": 1.5252596139907837, "learning_rate": 7.058823529411765e-06, "loss": 1.4013, "step": 388000 }, { "epoch": 1.05, "grad_norm": 1.4874343872070312, "learning_rate": 7.027310924369748e-06, "loss": 1.3995, "step": 388500 }, { "epoch": 1.05, "grad_norm": 1.5078623294830322, "learning_rate": 6.995798319327732e-06, "loss": 1.3985, "step": 389000 }, { "epoch": 1.05, "grad_norm": 1.5256296396255493, "learning_rate": 6.964285714285714e-06, "loss": 1.4005, "step": 389500 }, { "epoch": 1.06, "grad_norm": 1.5369598865509033, "learning_rate": 6.932773109243698e-06, "loss": 1.3929, "step": 390000 }, { "epoch": 1.06, "grad_norm": 1.4955265522003174, "learning_rate": 6.9012605042016804e-06, "loss": 1.3968, "step": 390500 }, { "epoch": 1.06, "grad_norm": 1.501406192779541, "learning_rate": 6.869747899159664e-06, "loss": 1.3982, "step": 391000 }, { "epoch": 1.06, "grad_norm": 1.5695279836654663, "learning_rate": 6.838235294117647e-06, "loss": 1.3986, "step": 391500 }, { "epoch": 1.06, "grad_norm": 1.590920329093933, "learning_rate": 6.806722689075631e-06, "loss": 1.3989, "step": 392000 }, { "epoch": 1.06, "grad_norm": 1.4469817876815796, "learning_rate": 6.775210084033613e-06, "loss": 1.3958, "step": 392500 }, { "epoch": 1.06, "grad_norm": 1.4517157077789307, "learning_rate": 6.743697478991597e-06, "loss": 1.3948, "step": 393000 }, { "epoch": 1.07, "grad_norm": 1.477184534072876, "learning_rate": 6.71218487394958e-06, "loss": 1.3955, "step": 393500 }, { "epoch": 1.07, "grad_norm": 2.1850063800811768, "learning_rate": 6.680672268907564e-06, "loss": 1.3977, "step": 394000 }, { "epoch": 1.07, "grad_norm": 1.4544538259506226, "learning_rate": 6.649159663865546e-06, "loss": 1.3974, "step": 394500 }, { "epoch": 1.07, "grad_norm": 1.4682557582855225, "learning_rate": 6.61764705882353e-06, "loss": 1.3976, "step": 395000 }, { "epoch": 1.07, "grad_norm": 1.4401472806930542, "learning_rate": 6.586134453781512e-06, "loss": 1.4002, "step": 395500 }, { "epoch": 1.07, "grad_norm": 1.5497291088104248, "learning_rate": 6.554621848739496e-06, "loss": 1.3945, "step": 396000 }, { "epoch": 1.07, "grad_norm": 1.525145173072815, "learning_rate": 6.523109243697479e-06, "loss": 1.4006, "step": 396500 }, { "epoch": 1.07, "grad_norm": 1.5119032859802246, "learning_rate": 6.491596638655463e-06, "loss": 1.3984, "step": 397000 }, { "epoch": 1.08, "grad_norm": 1.7145532369613647, "learning_rate": 6.460084033613446e-06, "loss": 1.398, "step": 397500 }, { "epoch": 1.08, "grad_norm": 1.5175354480743408, "learning_rate": 6.428571428571429e-06, "loss": 1.3971, "step": 398000 }, { "epoch": 1.08, "grad_norm": 1.4529006481170654, "learning_rate": 6.397058823529412e-06, "loss": 1.3986, "step": 398500 }, { "epoch": 1.08, "grad_norm": 1.4779740571975708, "learning_rate": 6.365546218487395e-06, "loss": 1.3985, "step": 399000 }, { "epoch": 1.08, "grad_norm": 1.591557502746582, "learning_rate": 6.334033613445378e-06, "loss": 1.3971, "step": 399500 }, { "epoch": 1.08, "grad_norm": 1.5829887390136719, "learning_rate": 6.3025210084033615e-06, "loss": 1.3989, "step": 400000 }, { "epoch": 1.08, "grad_norm": 1.546576976776123, "learning_rate": 6.271008403361345e-06, "loss": 1.398, "step": 400500 }, { "epoch": 1.09, "grad_norm": 1.4360915422439575, "learning_rate": 6.239495798319328e-06, "loss": 1.3933, "step": 401000 }, { "epoch": 1.09, "grad_norm": 1.555240273475647, "learning_rate": 6.207983193277311e-06, "loss": 1.3964, "step": 401500 }, { "epoch": 1.09, "grad_norm": 1.5486465692520142, "learning_rate": 6.176470588235294e-06, "loss": 1.3922, "step": 402000 }, { "epoch": 1.09, "grad_norm": 1.6140353679656982, "learning_rate": 6.144957983193277e-06, "loss": 1.3941, "step": 402500 }, { "epoch": 1.09, "grad_norm": 1.422938346862793, "learning_rate": 6.1134453781512605e-06, "loss": 1.3946, "step": 403000 }, { "epoch": 1.09, "grad_norm": 1.673789620399475, "learning_rate": 6.081932773109244e-06, "loss": 1.3965, "step": 403500 }, { "epoch": 1.09, "grad_norm": 1.52051842212677, "learning_rate": 6.050420168067227e-06, "loss": 1.3935, "step": 404000 }, { "epoch": 1.09, "grad_norm": 1.5157978534698486, "learning_rate": 6.018907563025211e-06, "loss": 1.3938, "step": 404500 }, { "epoch": 1.1, "grad_norm": 1.5434610843658447, "learning_rate": 5.987394957983194e-06, "loss": 1.3931, "step": 405000 }, { "epoch": 1.1, "grad_norm": 1.7399873733520508, "learning_rate": 5.955882352941176e-06, "loss": 1.3924, "step": 405500 }, { "epoch": 1.1, "grad_norm": 1.482820749282837, "learning_rate": 5.924369747899159e-06, "loss": 1.3923, "step": 406000 }, { "epoch": 1.1, "grad_norm": 4.893394947052002, "learning_rate": 5.892857142857143e-06, "loss": 1.393, "step": 406500 }, { "epoch": 1.1, "grad_norm": 1.538550615310669, "learning_rate": 5.861344537815126e-06, "loss": 1.3938, "step": 407000 }, { "epoch": 1.1, "grad_norm": 1.4997118711471558, "learning_rate": 5.82983193277311e-06, "loss": 1.3934, "step": 407500 }, { "epoch": 1.1, "grad_norm": 1.5265237092971802, "learning_rate": 5.798319327731093e-06, "loss": 1.3915, "step": 408000 }, { "epoch": 1.11, "grad_norm": 1.6841180324554443, "learning_rate": 5.766806722689076e-06, "loss": 1.3946, "step": 408500 }, { "epoch": 1.11, "grad_norm": 1.4722718000411987, "learning_rate": 5.735294117647058e-06, "loss": 1.3949, "step": 409000 }, { "epoch": 1.11, "grad_norm": 2.087042808532715, "learning_rate": 5.703781512605042e-06, "loss": 1.3925, "step": 409500 }, { "epoch": 1.11, "grad_norm": 1.4858590364456177, "learning_rate": 5.672268907563025e-06, "loss": 1.3943, "step": 410000 }, { "epoch": 1.11, "grad_norm": 1.4591546058654785, "learning_rate": 5.640756302521009e-06, "loss": 1.3924, "step": 410500 }, { "epoch": 1.11, "grad_norm": 1.4490437507629395, "learning_rate": 5.609243697478992e-06, "loss": 1.3949, "step": 411000 }, { "epoch": 1.11, "grad_norm": 1.5795851945877075, "learning_rate": 5.5777310924369755e-06, "loss": 1.3951, "step": 411500 }, { "epoch": 1.12, "grad_norm": 1.5447410345077515, "learning_rate": 5.546218487394958e-06, "loss": 1.396, "step": 412000 }, { "epoch": 1.12, "grad_norm": 1.510696530342102, "learning_rate": 5.5147058823529415e-06, "loss": 1.3929, "step": 412500 }, { "epoch": 1.12, "grad_norm": 1.52991783618927, "learning_rate": 5.483193277310924e-06, "loss": 1.393, "step": 413000 }, { "epoch": 1.12, "grad_norm": 1.5724798440933228, "learning_rate": 5.4516806722689076e-06, "loss": 1.3933, "step": 413500 }, { "epoch": 1.12, "grad_norm": 1.9198040962219238, "learning_rate": 5.420168067226891e-06, "loss": 1.3934, "step": 414000 }, { "epoch": 1.12, "grad_norm": 1.5322943925857544, "learning_rate": 5.3886554621848744e-06, "loss": 1.3925, "step": 414500 }, { "epoch": 1.12, "grad_norm": 1.4684040546417236, "learning_rate": 5.357142857142857e-06, "loss": 1.3933, "step": 415000 }, { "epoch": 1.12, "grad_norm": 1.4797214269638062, "learning_rate": 5.3256302521008405e-06, "loss": 1.3925, "step": 415500 }, { "epoch": 1.13, "grad_norm": 1.524305820465088, "learning_rate": 5.294117647058824e-06, "loss": 1.3929, "step": 416000 }, { "epoch": 1.13, "grad_norm": 1.4858139753341675, "learning_rate": 5.2626050420168065e-06, "loss": 1.3881, "step": 416500 }, { "epoch": 1.13, "grad_norm": 1.5586313009262085, "learning_rate": 5.23109243697479e-06, "loss": 1.393, "step": 417000 }, { "epoch": 1.13, "grad_norm": 1.54250168800354, "learning_rate": 5.199579831932773e-06, "loss": 1.3926, "step": 417500 }, { "epoch": 1.13, "grad_norm": 9.902482986450195, "learning_rate": 5.168067226890756e-06, "loss": 1.3923, "step": 418000 }, { "epoch": 1.13, "grad_norm": 3.239046573638916, "learning_rate": 5.136554621848739e-06, "loss": 1.3925, "step": 418500 }, { "epoch": 1.13, "grad_norm": 1.5059127807617188, "learning_rate": 5.105042016806723e-06, "loss": 1.3936, "step": 419000 }, { "epoch": 1.14, "grad_norm": 1.5107486248016357, "learning_rate": 5.073529411764706e-06, "loss": 1.3942, "step": 419500 }, { "epoch": 1.14, "grad_norm": 1.577019214630127, "learning_rate": 5.04201680672269e-06, "loss": 1.3896, "step": 420000 }, { "epoch": 1.14, "grad_norm": 1.4538390636444092, "learning_rate": 5.010504201680672e-06, "loss": 1.387, "step": 420500 }, { "epoch": 1.14, "grad_norm": 1.593549132347107, "learning_rate": 4.978991596638656e-06, "loss": 1.3908, "step": 421000 }, { "epoch": 1.14, "grad_norm": 1.4725204706192017, "learning_rate": 4.947478991596638e-06, "loss": 1.3904, "step": 421500 }, { "epoch": 1.14, "grad_norm": 1.4892488718032837, "learning_rate": 4.915966386554622e-06, "loss": 1.3896, "step": 422000 }, { "epoch": 1.14, "grad_norm": 1.503003478050232, "learning_rate": 4.884453781512605e-06, "loss": 1.3901, "step": 422500 }, { "epoch": 1.15, "grad_norm": 1.5650583505630493, "learning_rate": 4.852941176470589e-06, "loss": 1.3879, "step": 423000 }, { "epoch": 1.15, "grad_norm": 1.5746469497680664, "learning_rate": 4.821428571428572e-06, "loss": 1.3898, "step": 423500 }, { "epoch": 1.15, "grad_norm": 1.4636718034744263, "learning_rate": 4.789915966386555e-06, "loss": 1.3946, "step": 424000 }, { "epoch": 1.15, "grad_norm": 1.5072635412216187, "learning_rate": 4.758403361344537e-06, "loss": 1.3936, "step": 424500 }, { "epoch": 1.15, "grad_norm": 1.9211359024047852, "learning_rate": 4.726890756302521e-06, "loss": 1.3919, "step": 425000 }, { "epoch": 1.15, "grad_norm": 1.6186763048171997, "learning_rate": 4.695378151260504e-06, "loss": 1.3874, "step": 425500 }, { "epoch": 1.15, "grad_norm": 1.6086759567260742, "learning_rate": 4.663865546218488e-06, "loss": 1.3911, "step": 426000 }, { "epoch": 1.15, "grad_norm": 1.4456268548965454, "learning_rate": 4.632352941176471e-06, "loss": 1.3888, "step": 426500 }, { "epoch": 1.16, "grad_norm": 1.5766582489013672, "learning_rate": 4.6008403361344545e-06, "loss": 1.3884, "step": 427000 }, { "epoch": 1.16, "grad_norm": 1.4081532955169678, "learning_rate": 4.569327731092437e-06, "loss": 1.3904, "step": 427500 }, { "epoch": 1.16, "grad_norm": 1.4901301860809326, "learning_rate": 4.53781512605042e-06, "loss": 1.389, "step": 428000 }, { "epoch": 1.16, "grad_norm": 1.5027050971984863, "learning_rate": 4.506302521008403e-06, "loss": 1.3931, "step": 428500 }, { "epoch": 1.16, "grad_norm": 1.4869219064712524, "learning_rate": 4.4747899159663865e-06, "loss": 1.3888, "step": 429000 }, { "epoch": 1.16, "grad_norm": 1.439729928970337, "learning_rate": 4.44327731092437e-06, "loss": 1.3897, "step": 429500 }, { "epoch": 1.16, "grad_norm": 1.5325324535369873, "learning_rate": 4.411764705882353e-06, "loss": 1.3891, "step": 430000 }, { "epoch": 1.17, "grad_norm": 1.5293645858764648, "learning_rate": 4.380252100840337e-06, "loss": 1.3902, "step": 430500 }, { "epoch": 1.17, "grad_norm": 1.4475960731506348, "learning_rate": 4.3487394957983194e-06, "loss": 1.388, "step": 431000 }, { "epoch": 1.17, "grad_norm": 1.5612802505493164, "learning_rate": 4.317226890756302e-06, "loss": 1.3885, "step": 431500 }, { "epoch": 1.17, "grad_norm": 1.682928204536438, "learning_rate": 4.2857142857142855e-06, "loss": 1.3899, "step": 432000 }, { "epoch": 1.17, "grad_norm": 1.5231236219406128, "learning_rate": 4.254201680672269e-06, "loss": 1.3877, "step": 432500 }, { "epoch": 1.17, "grad_norm": 1.446148157119751, "learning_rate": 4.222689075630252e-06, "loss": 1.3901, "step": 433000 }, { "epoch": 1.17, "grad_norm": 1.4778817892074585, "learning_rate": 4.191176470588236e-06, "loss": 1.3865, "step": 433500 }, { "epoch": 1.17, "grad_norm": 1.5888080596923828, "learning_rate": 4.159663865546219e-06, "loss": 1.3872, "step": 434000 }, { "epoch": 1.18, "grad_norm": 1.6371558904647827, "learning_rate": 4.128151260504202e-06, "loss": 1.3893, "step": 434500 }, { "epoch": 1.18, "grad_norm": 1.4442592859268188, "learning_rate": 4.096638655462184e-06, "loss": 1.3934, "step": 435000 }, { "epoch": 1.18, "grad_norm": 1.7637091875076294, "learning_rate": 4.065126050420168e-06, "loss": 1.3892, "step": 435500 }, { "epoch": 1.18, "grad_norm": 1.4838693141937256, "learning_rate": 4.033613445378151e-06, "loss": 1.3866, "step": 436000 }, { "epoch": 1.18, "grad_norm": 1.5558868646621704, "learning_rate": 4.002100840336135e-06, "loss": 1.3914, "step": 436500 }, { "epoch": 1.18, "grad_norm": 1.8331657648086548, "learning_rate": 3.970588235294118e-06, "loss": 1.389, "step": 437000 }, { "epoch": 1.18, "grad_norm": 1.7367424964904785, "learning_rate": 3.939075630252101e-06, "loss": 1.3897, "step": 437500 }, { "epoch": 1.19, "grad_norm": 1.5316094160079956, "learning_rate": 3.907563025210084e-06, "loss": 1.3871, "step": 438000 }, { "epoch": 1.19, "grad_norm": 1.5062899589538574, "learning_rate": 3.876050420168068e-06, "loss": 1.3876, "step": 438500 }, { "epoch": 1.19, "grad_norm": 1.5399343967437744, "learning_rate": 3.84453781512605e-06, "loss": 1.3873, "step": 439000 }, { "epoch": 1.19, "grad_norm": 1.8311206102371216, "learning_rate": 3.8130252100840336e-06, "loss": 1.3886, "step": 439500 }, { "epoch": 1.19, "grad_norm": 1.5011011362075806, "learning_rate": 3.7815126050420167e-06, "loss": 1.3877, "step": 440000 }, { "epoch": 1.19, "grad_norm": 1.5647181272506714, "learning_rate": 3.75e-06, "loss": 1.3895, "step": 440500 }, { "epoch": 1.19, "grad_norm": 1.9663615226745605, "learning_rate": 3.7184873949579835e-06, "loss": 1.3884, "step": 441000 }, { "epoch": 1.2, "grad_norm": 2.4808692932128906, "learning_rate": 3.686974789915966e-06, "loss": 1.3879, "step": 441500 }, { "epoch": 1.2, "grad_norm": 1.4271633625030518, "learning_rate": 3.6554621848739496e-06, "loss": 1.3913, "step": 442000 }, { "epoch": 1.2, "grad_norm": 1.5341715812683105, "learning_rate": 3.623949579831933e-06, "loss": 1.3874, "step": 442500 }, { "epoch": 1.2, "grad_norm": 1.4926517009735107, "learning_rate": 3.592436974789916e-06, "loss": 1.3873, "step": 443000 }, { "epoch": 1.2, "grad_norm": 1.4709627628326416, "learning_rate": 3.560924369747899e-06, "loss": 1.3856, "step": 443500 }, { "epoch": 1.2, "grad_norm": 1.4797513484954834, "learning_rate": 3.5294117647058825e-06, "loss": 1.3874, "step": 444000 }, { "epoch": 1.2, "grad_norm": 1.506548523902893, "learning_rate": 3.497899159663866e-06, "loss": 1.3859, "step": 444500 }, { "epoch": 1.2, "grad_norm": 1.4667857885360718, "learning_rate": 3.466386554621849e-06, "loss": 1.3889, "step": 445000 }, { "epoch": 1.21, "grad_norm": 1.4796762466430664, "learning_rate": 3.434873949579832e-06, "loss": 1.3912, "step": 445500 }, { "epoch": 1.21, "grad_norm": 1.534725546836853, "learning_rate": 3.4033613445378154e-06, "loss": 1.3881, "step": 446000 }, { "epoch": 1.21, "grad_norm": 1.6512054204940796, "learning_rate": 3.3718487394957984e-06, "loss": 1.3874, "step": 446500 }, { "epoch": 1.21, "grad_norm": 1.4926962852478027, "learning_rate": 3.340336134453782e-06, "loss": 1.3844, "step": 447000 }, { "epoch": 1.21, "grad_norm": 1.479819416999817, "learning_rate": 3.308823529411765e-06, "loss": 1.3862, "step": 447500 }, { "epoch": 1.21, "grad_norm": 1.429606318473816, "learning_rate": 3.277310924369748e-06, "loss": 1.3864, "step": 448000 }, { "epoch": 1.21, "grad_norm": 1.526227593421936, "learning_rate": 3.2457983193277313e-06, "loss": 1.388, "step": 448500 }, { "epoch": 1.22, "grad_norm": 1.5270380973815918, "learning_rate": 3.2142857142857143e-06, "loss": 1.3898, "step": 449000 }, { "epoch": 1.22, "grad_norm": 1.6459033489227295, "learning_rate": 3.1827731092436973e-06, "loss": 1.3872, "step": 449500 }, { "epoch": 1.22, "grad_norm": 1.5082780122756958, "learning_rate": 3.1512605042016808e-06, "loss": 1.3864, "step": 450000 }, { "epoch": 1.22, "grad_norm": 1.4675207138061523, "learning_rate": 3.119747899159664e-06, "loss": 1.3858, "step": 450500 }, { "epoch": 1.22, "grad_norm": 1.5487087965011597, "learning_rate": 3.088235294117647e-06, "loss": 1.3859, "step": 451000 }, { "epoch": 1.22, "grad_norm": 1.5166810750961304, "learning_rate": 3.0567226890756302e-06, "loss": 1.3838, "step": 451500 }, { "epoch": 1.22, "grad_norm": 1.4788706302642822, "learning_rate": 3.0252100840336137e-06, "loss": 1.3836, "step": 452000 }, { "epoch": 1.22, "grad_norm": 1.6381962299346924, "learning_rate": 2.993697478991597e-06, "loss": 1.3853, "step": 452500 }, { "epoch": 1.23, "grad_norm": 1.4548882246017456, "learning_rate": 2.9621848739495797e-06, "loss": 1.3878, "step": 453000 }, { "epoch": 1.23, "grad_norm": 1.5543279647827148, "learning_rate": 2.930672268907563e-06, "loss": 1.3885, "step": 453500 }, { "epoch": 1.23, "grad_norm": 1.5119037628173828, "learning_rate": 2.8991596638655466e-06, "loss": 1.3865, "step": 454000 }, { "epoch": 1.23, "grad_norm": 1.5338330268859863, "learning_rate": 2.867647058823529e-06, "loss": 1.3825, "step": 454500 }, { "epoch": 1.23, "grad_norm": 2.100884437561035, "learning_rate": 2.8361344537815126e-06, "loss": 1.3894, "step": 455000 }, { "epoch": 1.23, "grad_norm": 1.4853757619857788, "learning_rate": 2.804621848739496e-06, "loss": 1.385, "step": 455500 }, { "epoch": 1.23, "grad_norm": 1.545937180519104, "learning_rate": 2.773109243697479e-06, "loss": 1.3875, "step": 456000 }, { "epoch": 1.24, "grad_norm": 1.4860107898712158, "learning_rate": 2.741596638655462e-06, "loss": 1.3839, "step": 456500 }, { "epoch": 1.24, "grad_norm": 1.5260435342788696, "learning_rate": 2.7100840336134455e-06, "loss": 1.3815, "step": 457000 }, { "epoch": 1.24, "grad_norm": 1.5752997398376465, "learning_rate": 2.6785714285714285e-06, "loss": 1.3845, "step": 457500 }, { "epoch": 1.24, "grad_norm": 1.5157984495162964, "learning_rate": 2.647058823529412e-06, "loss": 1.3831, "step": 458000 }, { "epoch": 1.24, "grad_norm": 1.5206942558288574, "learning_rate": 2.615546218487395e-06, "loss": 1.3866, "step": 458500 }, { "epoch": 1.24, "grad_norm": 1.524672508239746, "learning_rate": 2.584033613445378e-06, "loss": 1.3869, "step": 459000 }, { "epoch": 1.24, "grad_norm": 6.727693557739258, "learning_rate": 2.5525210084033614e-06, "loss": 1.3805, "step": 459500 }, { "epoch": 1.25, "grad_norm": 1.5827701091766357, "learning_rate": 2.521008403361345e-06, "loss": 1.39, "step": 460000 }, { "epoch": 1.25, "grad_norm": 1.4831866025924683, "learning_rate": 2.489495798319328e-06, "loss": 1.3886, "step": 460500 }, { "epoch": 1.25, "grad_norm": 1.5272330045700073, "learning_rate": 2.457983193277311e-06, "loss": 1.3889, "step": 461000 }, { "epoch": 1.25, "grad_norm": 1.478623628616333, "learning_rate": 2.4264705882352943e-06, "loss": 1.3878, "step": 461500 }, { "epoch": 1.25, "grad_norm": 1.5272207260131836, "learning_rate": 2.3949579831932773e-06, "loss": 1.3834, "step": 462000 }, { "epoch": 1.25, "grad_norm": 1.574120044708252, "learning_rate": 2.3634453781512604e-06, "loss": 1.3852, "step": 462500 }, { "epoch": 1.25, "grad_norm": 1.5751044750213623, "learning_rate": 2.331932773109244e-06, "loss": 1.3829, "step": 463000 }, { "epoch": 1.25, "grad_norm": 1.4704902172088623, "learning_rate": 2.3004201680672272e-06, "loss": 1.3817, "step": 463500 }, { "epoch": 1.26, "grad_norm": 2.406973123550415, "learning_rate": 2.26890756302521e-06, "loss": 1.3872, "step": 464000 }, { "epoch": 1.26, "grad_norm": 1.4869129657745361, "learning_rate": 2.2373949579831933e-06, "loss": 1.3825, "step": 464500 }, { "epoch": 1.26, "grad_norm": 1.5050959587097168, "learning_rate": 2.2058823529411767e-06, "loss": 1.3821, "step": 465000 }, { "epoch": 1.26, "grad_norm": 1.4652327299118042, "learning_rate": 2.1743697478991597e-06, "loss": 1.3831, "step": 465500 }, { "epoch": 1.26, "grad_norm": 1.6011298894882202, "learning_rate": 2.1428571428571427e-06, "loss": 1.3824, "step": 466000 }, { "epoch": 1.26, "grad_norm": 1.589460015296936, "learning_rate": 2.111344537815126e-06, "loss": 1.3816, "step": 466500 }, { "epoch": 1.26, "grad_norm": 1.679612636566162, "learning_rate": 2.0798319327731096e-06, "loss": 1.383, "step": 467000 }, { "epoch": 1.27, "grad_norm": 5.37538480758667, "learning_rate": 2.048319327731092e-06, "loss": 1.3818, "step": 467500 }, { "epoch": 1.27, "grad_norm": 1.5256156921386719, "learning_rate": 2.0168067226890756e-06, "loss": 1.383, "step": 468000 }, { "epoch": 1.27, "grad_norm": 1.546476125717163, "learning_rate": 1.985294117647059e-06, "loss": 1.3841, "step": 468500 }, { "epoch": 1.27, "grad_norm": 1.429592251777649, "learning_rate": 1.953781512605042e-06, "loss": 1.3828, "step": 469000 }, { "epoch": 1.27, "grad_norm": 1.4674160480499268, "learning_rate": 1.922268907563025e-06, "loss": 1.3847, "step": 469500 }, { "epoch": 1.27, "grad_norm": 2.370859384536743, "learning_rate": 1.8907563025210083e-06, "loss": 1.3816, "step": 470000 }, { "epoch": 1.27, "grad_norm": 1.5106278657913208, "learning_rate": 1.8592436974789918e-06, "loss": 1.3783, "step": 470500 }, { "epoch": 1.28, "grad_norm": 1.5777826309204102, "learning_rate": 1.8277310924369748e-06, "loss": 1.3817, "step": 471000 }, { "epoch": 1.28, "grad_norm": 1.4805636405944824, "learning_rate": 1.796218487394958e-06, "loss": 1.3831, "step": 471500 }, { "epoch": 1.28, "grad_norm": 1.5154469013214111, "learning_rate": 1.7647058823529412e-06, "loss": 1.383, "step": 472000 }, { "epoch": 1.28, "grad_norm": 1.54281747341156, "learning_rate": 1.7331932773109245e-06, "loss": 1.3852, "step": 472500 }, { "epoch": 1.28, "grad_norm": 1.7247158288955688, "learning_rate": 1.7016806722689077e-06, "loss": 1.3819, "step": 473000 }, { "epoch": 1.28, "grad_norm": 1.4723429679870605, "learning_rate": 1.670168067226891e-06, "loss": 1.38, "step": 473500 }, { "epoch": 1.28, "grad_norm": 1.5267595052719116, "learning_rate": 1.638655462184874e-06, "loss": 1.3822, "step": 474000 }, { "epoch": 1.28, "grad_norm": 1.566758155822754, "learning_rate": 1.6071428571428572e-06, "loss": 1.3837, "step": 474500 }, { "epoch": 1.29, "grad_norm": 2.029449939727783, "learning_rate": 1.5756302521008404e-06, "loss": 1.3853, "step": 475000 }, { "epoch": 1.29, "grad_norm": 1.4750381708145142, "learning_rate": 1.5441176470588234e-06, "loss": 1.3838, "step": 475500 }, { "epoch": 1.29, "grad_norm": 1.5221339464187622, "learning_rate": 1.5126050420168068e-06, "loss": 1.3859, "step": 476000 }, { "epoch": 1.29, "grad_norm": 1.518754243850708, "learning_rate": 1.4810924369747898e-06, "loss": 1.3783, "step": 476500 }, { "epoch": 1.29, "grad_norm": 1.4300239086151123, "learning_rate": 1.4495798319327733e-06, "loss": 1.3769, "step": 477000 }, { "epoch": 1.29, "grad_norm": 1.5566083192825317, "learning_rate": 1.4180672268907563e-06, "loss": 1.3788, "step": 477500 }, { "epoch": 1.29, "grad_norm": 1.415859580039978, "learning_rate": 1.3865546218487395e-06, "loss": 1.385, "step": 478000 }, { "epoch": 1.3, "grad_norm": 1.4944028854370117, "learning_rate": 1.3550420168067228e-06, "loss": 1.3815, "step": 478500 }, { "epoch": 1.3, "grad_norm": 1.4514822959899902, "learning_rate": 1.323529411764706e-06, "loss": 1.3827, "step": 479000 }, { "epoch": 1.3, "grad_norm": 1.5512882471084595, "learning_rate": 1.292016806722689e-06, "loss": 1.384, "step": 479500 }, { "epoch": 1.3, "grad_norm": 1.574981689453125, "learning_rate": 1.2605042016806724e-06, "loss": 1.382, "step": 480000 }, { "epoch": 1.3, "grad_norm": 1.570827603340149, "learning_rate": 1.2289915966386554e-06, "loss": 1.382, "step": 480500 }, { "epoch": 1.3, "grad_norm": 1.5336010456085205, "learning_rate": 1.1974789915966387e-06, "loss": 1.3803, "step": 481000 }, { "epoch": 1.3, "grad_norm": 1.4452096223831177, "learning_rate": 1.165966386554622e-06, "loss": 1.3804, "step": 481500 }, { "epoch": 1.3, "grad_norm": 1.5529412031173706, "learning_rate": 1.134453781512605e-06, "loss": 1.3813, "step": 482000 }, { "epoch": 1.31, "grad_norm": 1.5553141832351685, "learning_rate": 1.1029411764705884e-06, "loss": 1.3822, "step": 482500 }, { "epoch": 1.31, "grad_norm": 1.5250602960586548, "learning_rate": 1.0714285714285714e-06, "loss": 1.379, "step": 483000 }, { "epoch": 1.31, "grad_norm": 1.4803342819213867, "learning_rate": 1.0399159663865548e-06, "loss": 1.3846, "step": 483500 }, { "epoch": 1.31, "grad_norm": 1.4097282886505127, "learning_rate": 1.0084033613445378e-06, "loss": 1.3855, "step": 484000 }, { "epoch": 1.31, "grad_norm": 1.535632848739624, "learning_rate": 9.76890756302521e-07, "loss": 1.3807, "step": 484500 }, { "epoch": 1.31, "grad_norm": 1.5535025596618652, "learning_rate": 9.453781512605042e-07, "loss": 1.379, "step": 485000 }, { "epoch": 1.31, "grad_norm": 1.5092753171920776, "learning_rate": 9.138655462184874e-07, "loss": 1.3777, "step": 485500 }, { "epoch": 1.32, "grad_norm": 1.5026346445083618, "learning_rate": 8.823529411764706e-07, "loss": 1.3844, "step": 486000 }, { "epoch": 1.32, "grad_norm": 2.1724424362182617, "learning_rate": 8.508403361344538e-07, "loss": 1.3808, "step": 486500 }, { "epoch": 1.32, "grad_norm": 1.5653128623962402, "learning_rate": 8.19327731092437e-07, "loss": 1.3826, "step": 487000 }, { "epoch": 1.32, "grad_norm": 1.8672337532043457, "learning_rate": 7.878151260504202e-07, "loss": 1.3804, "step": 487500 }, { "epoch": 1.32, "grad_norm": 1.5125828981399536, "learning_rate": 7.563025210084034e-07, "loss": 1.3785, "step": 488000 }, { "epoch": 1.32, "grad_norm": 1.5895177125930786, "learning_rate": 7.247899159663866e-07, "loss": 1.3806, "step": 488500 }, { "epoch": 1.32, "grad_norm": 1.505618929862976, "learning_rate": 6.932773109243698e-07, "loss": 1.3822, "step": 489000 }, { "epoch": 1.33, "grad_norm": 1.4767976999282837, "learning_rate": 6.61764705882353e-07, "loss": 1.3809, "step": 489500 }, { "epoch": 1.33, "grad_norm": 1.4713040590286255, "learning_rate": 6.302521008403362e-07, "loss": 1.38, "step": 490000 }, { "epoch": 1.33, "grad_norm": 1.5712190866470337, "learning_rate": 5.987394957983193e-07, "loss": 1.3821, "step": 490500 }, { "epoch": 1.33, "grad_norm": 1.520726203918457, "learning_rate": 5.672268907563025e-07, "loss": 1.3817, "step": 491000 }, { "epoch": 1.33, "grad_norm": 1.4978504180908203, "learning_rate": 5.357142857142857e-07, "loss": 1.3825, "step": 491500 }, { "epoch": 1.33, "grad_norm": 1.5783872604370117, "learning_rate": 5.042016806722689e-07, "loss": 1.3825, "step": 492000 }, { "epoch": 1.33, "grad_norm": 1.5126821994781494, "learning_rate": 4.726890756302521e-07, "loss": 1.3803, "step": 492500 }, { "epoch": 1.33, "grad_norm": 1.4677457809448242, "learning_rate": 4.411764705882353e-07, "loss": 1.3804, "step": 493000 }, { "epoch": 1.34, "grad_norm": 1.5842092037200928, "learning_rate": 4.096638655462185e-07, "loss": 1.3818, "step": 493500 }, { "epoch": 1.34, "grad_norm": 1.5152337551116943, "learning_rate": 3.781512605042017e-07, "loss": 1.3797, "step": 494000 }, { "epoch": 1.34, "grad_norm": 1.5868217945098877, "learning_rate": 3.466386554621849e-07, "loss": 1.3829, "step": 494500 }, { "epoch": 1.34, "grad_norm": 1.4543733596801758, "learning_rate": 3.151260504201681e-07, "loss": 1.3811, "step": 495000 }, { "epoch": 1.34, "grad_norm": 1.5251801013946533, "learning_rate": 2.8361344537815123e-07, "loss": 1.3793, "step": 495500 }, { "epoch": 1.34, "grad_norm": 1.5227956771850586, "learning_rate": 2.5210084033613445e-07, "loss": 1.3848, "step": 496000 }, { "epoch": 1.34, "grad_norm": 1.506102204322815, "learning_rate": 2.2058823529411765e-07, "loss": 1.3789, "step": 496500 }, { "epoch": 1.35, "grad_norm": 1.4776455163955688, "learning_rate": 1.8907563025210085e-07, "loss": 1.3837, "step": 497000 }, { "epoch": 1.35, "grad_norm": 1.5449495315551758, "learning_rate": 1.5756302521008405e-07, "loss": 1.3823, "step": 497500 }, { "epoch": 1.35, "grad_norm": 1.4903110265731812, "learning_rate": 1.2605042016806723e-07, "loss": 1.3816, "step": 498000 }, { "epoch": 1.35, "grad_norm": 1.4964358806610107, "learning_rate": 9.453781512605043e-08, "loss": 1.3783, "step": 498500 }, { "epoch": 1.35, "grad_norm": 1.6141352653503418, "learning_rate": 6.302521008403361e-08, "loss": 1.3819, "step": 499000 }, { "epoch": 1.35, "grad_norm": 1.5006154775619507, "learning_rate": 3.151260504201681e-08, "loss": 1.3771, "step": 499500 }, { "epoch": 1.35, "grad_norm": 1.5279935598373413, "learning_rate": 0.0, "loss": 1.3805, "step": 500000 }, { "epoch": 1.35, "step": 500000, "total_flos": 2.9824904071075946e+19, "train_loss": 1.5473345408935546, "train_runtime": 243315.0329, "train_samples_per_second": 526.067, "train_steps_per_second": 2.055 } ], "logging_steps": 500, "max_steps": 500000, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 10000, "total_flos": 2.9824904071075946e+19, "train_batch_size": 256, "trial_name": null, "trial_params": null }