diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,60951 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 8703, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 4.890166634385125, + "learning_rate": 3.816793893129771e-08, + "loss": 0.8609, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 4.018995826185668, + "learning_rate": 7.633587786259542e-08, + "loss": 0.8911, + "step": 2 + }, + { + "epoch": 0.0, + "grad_norm": 4.508199926906657, + "learning_rate": 1.1450381679389314e-07, + "loss": 0.9919, + "step": 3 + }, + { + "epoch": 0.0, + "grad_norm": 4.0106614528657305, + "learning_rate": 1.5267175572519085e-07, + "loss": 0.8818, + "step": 4 + }, + { + "epoch": 0.0, + "grad_norm": 6.636240967549296, + "learning_rate": 1.9083969465648858e-07, + "loss": 0.7282, + "step": 5 + }, + { + "epoch": 0.0, + "grad_norm": 3.719152783662321, + "learning_rate": 2.2900763358778629e-07, + "loss": 0.8999, + "step": 6 + }, + { + "epoch": 0.0, + "grad_norm": 3.597213351442028, + "learning_rate": 2.67175572519084e-07, + "loss": 0.957, + "step": 7 + }, + { + "epoch": 0.0, + "grad_norm": 1.944866045431503, + "learning_rate": 3.053435114503817e-07, + "loss": 0.9637, + "step": 8 + }, + { + "epoch": 0.0, + "grad_norm": 1.89239455707263, + "learning_rate": 3.4351145038167945e-07, + "loss": 0.9206, + "step": 9 + }, + { + "epoch": 0.0, + "grad_norm": 4.98108745270799, + "learning_rate": 3.8167938931297716e-07, + "loss": 0.8326, + "step": 10 + }, + { + "epoch": 0.0, + "grad_norm": 3.828696327021003, + "learning_rate": 4.1984732824427486e-07, + "loss": 0.9589, + "step": 11 + }, + { + "epoch": 0.0, + "grad_norm": 3.997403458064708, + "learning_rate": 4.5801526717557257e-07, + "loss": 0.9752, + "step": 12 + }, + { + "epoch": 0.0, + "grad_norm": 3.847153101830792, + "learning_rate": 4.961832061068702e-07, + "loss": 0.7562, + "step": 13 + }, + { + "epoch": 0.0, + "grad_norm": 4.023329294672322, + "learning_rate": 5.34351145038168e-07, + "loss": 0.8106, + "step": 14 + }, + { + "epoch": 0.0, + "grad_norm": 3.315806169936015, + "learning_rate": 5.725190839694656e-07, + "loss": 0.8366, + "step": 15 + }, + { + "epoch": 0.0, + "grad_norm": 4.389040742138449, + "learning_rate": 6.106870229007634e-07, + "loss": 0.7775, + "step": 16 + }, + { + "epoch": 0.0, + "grad_norm": 3.2905532425509296, + "learning_rate": 6.48854961832061e-07, + "loss": 0.8314, + "step": 17 + }, + { + "epoch": 0.0, + "grad_norm": 3.384506492690539, + "learning_rate": 6.870229007633589e-07, + "loss": 0.8064, + "step": 18 + }, + { + "epoch": 0.0, + "grad_norm": 3.4455705186879664, + "learning_rate": 7.251908396946565e-07, + "loss": 0.8606, + "step": 19 + }, + { + "epoch": 0.0, + "grad_norm": 3.9725698276763084, + "learning_rate": 7.633587786259543e-07, + "loss": 0.9061, + "step": 20 + }, + { + "epoch": 0.0, + "grad_norm": 3.3974428327769393, + "learning_rate": 8.01526717557252e-07, + "loss": 0.8059, + "step": 21 + }, + { + "epoch": 0.0, + "grad_norm": 4.192106865062429, + "learning_rate": 8.396946564885497e-07, + "loss": 0.8809, + "step": 22 + }, + { + "epoch": 0.0, + "grad_norm": 2.982366905763799, + "learning_rate": 8.778625954198474e-07, + "loss": 0.8447, + "step": 23 + }, + { + "epoch": 0.0, + "grad_norm": 1.7176580466533529, + "learning_rate": 9.160305343511451e-07, + "loss": 0.9067, + "step": 24 + }, + { + "epoch": 0.0, + "grad_norm": 3.8850707390908394, + "learning_rate": 9.54198473282443e-07, + "loss": 0.7836, + "step": 25 + }, + { + "epoch": 0.0, + "grad_norm": 2.790902338373744, + "learning_rate": 9.923664122137404e-07, + "loss": 0.8438, + "step": 26 + }, + { + "epoch": 0.0, + "grad_norm": 2.8786070612893173, + "learning_rate": 1.0305343511450382e-06, + "loss": 0.7237, + "step": 27 + }, + { + "epoch": 0.0, + "grad_norm": 2.8003467080314954, + "learning_rate": 1.068702290076336e-06, + "loss": 0.7366, + "step": 28 + }, + { + "epoch": 0.0, + "grad_norm": 3.2752989644003745, + "learning_rate": 1.1068702290076337e-06, + "loss": 0.936, + "step": 29 + }, + { + "epoch": 0.0, + "grad_norm": 2.5606371594667445, + "learning_rate": 1.1450381679389313e-06, + "loss": 0.78, + "step": 30 + }, + { + "epoch": 0.0, + "grad_norm": 3.136902784546001, + "learning_rate": 1.1832061068702292e-06, + "loss": 0.8018, + "step": 31 + }, + { + "epoch": 0.0, + "grad_norm": 2.57679604702584, + "learning_rate": 1.2213740458015268e-06, + "loss": 0.8801, + "step": 32 + }, + { + "epoch": 0.0, + "grad_norm": 8.245119355469159, + "learning_rate": 1.2595419847328243e-06, + "loss": 0.7583, + "step": 33 + }, + { + "epoch": 0.0, + "grad_norm": 2.4026489118889587, + "learning_rate": 1.297709923664122e-06, + "loss": 0.7171, + "step": 34 + }, + { + "epoch": 0.0, + "grad_norm": 2.428940956414717, + "learning_rate": 1.33587786259542e-06, + "loss": 0.6858, + "step": 35 + }, + { + "epoch": 0.0, + "grad_norm": 2.8484387941028255, + "learning_rate": 1.3740458015267178e-06, + "loss": 0.9102, + "step": 36 + }, + { + "epoch": 0.0, + "grad_norm": 2.9407270086799553, + "learning_rate": 1.4122137404580156e-06, + "loss": 0.808, + "step": 37 + }, + { + "epoch": 0.0, + "grad_norm": 2.4972884035093785, + "learning_rate": 1.450381679389313e-06, + "loss": 0.713, + "step": 38 + }, + { + "epoch": 0.0, + "grad_norm": 3.5162915456359167, + "learning_rate": 1.4885496183206109e-06, + "loss": 0.6245, + "step": 39 + }, + { + "epoch": 0.0, + "grad_norm": 2.312310670357567, + "learning_rate": 1.5267175572519086e-06, + "loss": 0.7021, + "step": 40 + }, + { + "epoch": 0.0, + "grad_norm": 2.8754392083742295, + "learning_rate": 1.5648854961832064e-06, + "loss": 0.6495, + "step": 41 + }, + { + "epoch": 0.0, + "grad_norm": 3.2239917188403506, + "learning_rate": 1.603053435114504e-06, + "loss": 0.702, + "step": 42 + }, + { + "epoch": 0.0, + "grad_norm": 2.676483058379557, + "learning_rate": 1.6412213740458017e-06, + "loss": 0.7584, + "step": 43 + }, + { + "epoch": 0.01, + "grad_norm": 2.805202422080153, + "learning_rate": 1.6793893129770995e-06, + "loss": 0.7396, + "step": 44 + }, + { + "epoch": 0.01, + "grad_norm": 2.383085335865205, + "learning_rate": 1.7175572519083972e-06, + "loss": 0.7019, + "step": 45 + }, + { + "epoch": 0.01, + "grad_norm": 1.9248306809339086, + "learning_rate": 1.7557251908396948e-06, + "loss": 0.7109, + "step": 46 + }, + { + "epoch": 0.01, + "grad_norm": 2.1128613585287783, + "learning_rate": 1.7938931297709925e-06, + "loss": 0.7431, + "step": 47 + }, + { + "epoch": 0.01, + "grad_norm": 2.1156451383954384, + "learning_rate": 1.8320610687022903e-06, + "loss": 0.7432, + "step": 48 + }, + { + "epoch": 0.01, + "grad_norm": 3.664197035688077, + "learning_rate": 1.870229007633588e-06, + "loss": 0.7306, + "step": 49 + }, + { + "epoch": 0.01, + "grad_norm": 3.369083502192842, + "learning_rate": 1.908396946564886e-06, + "loss": 0.615, + "step": 50 + }, + { + "epoch": 0.01, + "grad_norm": 2.1686783449142677, + "learning_rate": 1.946564885496183e-06, + "loss": 0.6047, + "step": 51 + }, + { + "epoch": 0.01, + "grad_norm": 2.675193720413499, + "learning_rate": 1.984732824427481e-06, + "loss": 0.7068, + "step": 52 + }, + { + "epoch": 0.01, + "grad_norm": 3.459552106291937, + "learning_rate": 2.0229007633587786e-06, + "loss": 0.6662, + "step": 53 + }, + { + "epoch": 0.01, + "grad_norm": 2.8768042034160564, + "learning_rate": 2.0610687022900764e-06, + "loss": 0.7876, + "step": 54 + }, + { + "epoch": 0.01, + "grad_norm": 3.1084045313726265, + "learning_rate": 2.099236641221374e-06, + "loss": 0.6153, + "step": 55 + }, + { + "epoch": 0.01, + "grad_norm": 1.588159791258937, + "learning_rate": 2.137404580152672e-06, + "loss": 0.7345, + "step": 56 + }, + { + "epoch": 0.01, + "grad_norm": 3.966400564645722, + "learning_rate": 2.1755725190839697e-06, + "loss": 0.7583, + "step": 57 + }, + { + "epoch": 0.01, + "grad_norm": 2.390132558042109, + "learning_rate": 2.2137404580152674e-06, + "loss": 0.689, + "step": 58 + }, + { + "epoch": 0.01, + "grad_norm": 2.945110784389781, + "learning_rate": 2.2519083969465648e-06, + "loss": 0.7347, + "step": 59 + }, + { + "epoch": 0.01, + "grad_norm": 2.3982587905852006, + "learning_rate": 2.2900763358778625e-06, + "loss": 0.6469, + "step": 60 + }, + { + "epoch": 0.01, + "grad_norm": 3.0933810726730107, + "learning_rate": 2.3282442748091603e-06, + "loss": 0.853, + "step": 61 + }, + { + "epoch": 0.01, + "grad_norm": 3.527176685046032, + "learning_rate": 2.3664122137404585e-06, + "loss": 0.5491, + "step": 62 + }, + { + "epoch": 0.01, + "grad_norm": 2.1702929851115678, + "learning_rate": 2.4045801526717562e-06, + "loss": 0.6383, + "step": 63 + }, + { + "epoch": 0.01, + "grad_norm": 2.066659740815133, + "learning_rate": 2.4427480916030536e-06, + "loss": 0.6553, + "step": 64 + }, + { + "epoch": 0.01, + "grad_norm": 2.1884778761942196, + "learning_rate": 2.4809160305343513e-06, + "loss": 0.6634, + "step": 65 + }, + { + "epoch": 0.01, + "grad_norm": 1.453588640126428, + "learning_rate": 2.5190839694656487e-06, + "loss": 0.8715, + "step": 66 + }, + { + "epoch": 0.01, + "grad_norm": 3.1286451543432965, + "learning_rate": 2.5572519083969464e-06, + "loss": 0.623, + "step": 67 + }, + { + "epoch": 0.01, + "grad_norm": 4.340669242361174, + "learning_rate": 2.595419847328244e-06, + "loss": 0.7098, + "step": 68 + }, + { + "epoch": 0.01, + "grad_norm": 2.4298367262198837, + "learning_rate": 2.633587786259542e-06, + "loss": 0.6573, + "step": 69 + }, + { + "epoch": 0.01, + "grad_norm": 2.4471078307447036, + "learning_rate": 2.67175572519084e-06, + "loss": 0.5916, + "step": 70 + }, + { + "epoch": 0.01, + "grad_norm": 2.2027739797428683, + "learning_rate": 2.709923664122138e-06, + "loss": 0.5689, + "step": 71 + }, + { + "epoch": 0.01, + "grad_norm": 3.007638432464223, + "learning_rate": 2.7480916030534356e-06, + "loss": 0.6749, + "step": 72 + }, + { + "epoch": 0.01, + "grad_norm": 2.0837842414821965, + "learning_rate": 2.7862595419847334e-06, + "loss": 0.6385, + "step": 73 + }, + { + "epoch": 0.01, + "grad_norm": 2.8096065367593126, + "learning_rate": 2.824427480916031e-06, + "loss": 0.7018, + "step": 74 + }, + { + "epoch": 0.01, + "grad_norm": 2.3172792776332614, + "learning_rate": 2.862595419847328e-06, + "loss": 0.5772, + "step": 75 + }, + { + "epoch": 0.01, + "grad_norm": 2.965324143483437, + "learning_rate": 2.900763358778626e-06, + "loss": 0.6377, + "step": 76 + }, + { + "epoch": 0.01, + "grad_norm": 2.6936668913072466, + "learning_rate": 2.938931297709924e-06, + "loss": 0.5813, + "step": 77 + }, + { + "epoch": 0.01, + "grad_norm": 2.418234080082684, + "learning_rate": 2.9770992366412218e-06, + "loss": 0.6751, + "step": 78 + }, + { + "epoch": 0.01, + "grad_norm": 2.3319233899484746, + "learning_rate": 3.0152671755725195e-06, + "loss": 0.6503, + "step": 79 + }, + { + "epoch": 0.01, + "grad_norm": 3.3942978180672085, + "learning_rate": 3.0534351145038173e-06, + "loss": 0.6531, + "step": 80 + }, + { + "epoch": 0.01, + "grad_norm": 2.909082818677905, + "learning_rate": 3.091603053435115e-06, + "loss": 0.7365, + "step": 81 + }, + { + "epoch": 0.01, + "grad_norm": 2.2267671435459437, + "learning_rate": 3.129770992366413e-06, + "loss": 0.61, + "step": 82 + }, + { + "epoch": 0.01, + "grad_norm": 2.0250670457089046, + "learning_rate": 3.1679389312977097e-06, + "loss": 0.6631, + "step": 83 + }, + { + "epoch": 0.01, + "grad_norm": 2.5454647514145714, + "learning_rate": 3.206106870229008e-06, + "loss": 0.5921, + "step": 84 + }, + { + "epoch": 0.01, + "grad_norm": 1.2101376482641772, + "learning_rate": 3.2442748091603056e-06, + "loss": 0.9002, + "step": 85 + }, + { + "epoch": 0.01, + "grad_norm": 1.1290488223839767, + "learning_rate": 3.2824427480916034e-06, + "loss": 0.8682, + "step": 86 + }, + { + "epoch": 0.01, + "grad_norm": 2.895619631338279, + "learning_rate": 3.320610687022901e-06, + "loss": 0.7295, + "step": 87 + }, + { + "epoch": 0.01, + "grad_norm": 1.9264045009895512, + "learning_rate": 3.358778625954199e-06, + "loss": 0.6988, + "step": 88 + }, + { + "epoch": 0.01, + "grad_norm": 2.1640218352358804, + "learning_rate": 3.3969465648854967e-06, + "loss": 0.6314, + "step": 89 + }, + { + "epoch": 0.01, + "grad_norm": 2.1300174045709626, + "learning_rate": 3.4351145038167944e-06, + "loss": 0.5751, + "step": 90 + }, + { + "epoch": 0.01, + "grad_norm": 3.497241911438091, + "learning_rate": 3.473282442748092e-06, + "loss": 0.6143, + "step": 91 + }, + { + "epoch": 0.01, + "grad_norm": 2.7747203732070918, + "learning_rate": 3.5114503816793895e-06, + "loss": 0.5083, + "step": 92 + }, + { + "epoch": 0.01, + "grad_norm": 2.1608759596655642, + "learning_rate": 3.5496183206106873e-06, + "loss": 0.7008, + "step": 93 + }, + { + "epoch": 0.01, + "grad_norm": 2.8148636087730368, + "learning_rate": 3.587786259541985e-06, + "loss": 0.5878, + "step": 94 + }, + { + "epoch": 0.01, + "grad_norm": 1.873025837356046, + "learning_rate": 3.625954198473283e-06, + "loss": 0.7022, + "step": 95 + }, + { + "epoch": 0.01, + "grad_norm": 1.9611886021610716, + "learning_rate": 3.6641221374045806e-06, + "loss": 0.6229, + "step": 96 + }, + { + "epoch": 0.01, + "grad_norm": 2.732768406214643, + "learning_rate": 3.7022900763358783e-06, + "loss": 0.559, + "step": 97 + }, + { + "epoch": 0.01, + "grad_norm": 2.405107944095294, + "learning_rate": 3.740458015267176e-06, + "loss": 0.5774, + "step": 98 + }, + { + "epoch": 0.01, + "grad_norm": 2.1085123456674095, + "learning_rate": 3.778625954198474e-06, + "loss": 0.6115, + "step": 99 + }, + { + "epoch": 0.01, + "grad_norm": 2.3321930179653205, + "learning_rate": 3.816793893129772e-06, + "loss": 0.5939, + "step": 100 + }, + { + "epoch": 0.01, + "grad_norm": 2.1244689736270033, + "learning_rate": 3.8549618320610685e-06, + "loss": 0.5375, + "step": 101 + }, + { + "epoch": 0.01, + "grad_norm": 2.4722976302596957, + "learning_rate": 3.893129770992366e-06, + "loss": 0.4882, + "step": 102 + }, + { + "epoch": 0.01, + "grad_norm": 2.3236423717961716, + "learning_rate": 3.931297709923664e-06, + "loss": 0.5391, + "step": 103 + }, + { + "epoch": 0.01, + "grad_norm": 2.620323160323361, + "learning_rate": 3.969465648854962e-06, + "loss": 0.6207, + "step": 104 + }, + { + "epoch": 0.01, + "grad_norm": 2.9699979151630913, + "learning_rate": 4.0076335877862595e-06, + "loss": 0.7104, + "step": 105 + }, + { + "epoch": 0.01, + "grad_norm": 2.634046358661291, + "learning_rate": 4.045801526717557e-06, + "loss": 0.61, + "step": 106 + }, + { + "epoch": 0.01, + "grad_norm": 2.2638414693594338, + "learning_rate": 4.083969465648855e-06, + "loss": 0.4458, + "step": 107 + }, + { + "epoch": 0.01, + "grad_norm": 1.0383745075393829, + "learning_rate": 4.122137404580153e-06, + "loss": 0.7978, + "step": 108 + }, + { + "epoch": 0.01, + "grad_norm": 3.031294801958909, + "learning_rate": 4.1603053435114506e-06, + "loss": 0.594, + "step": 109 + }, + { + "epoch": 0.01, + "grad_norm": 1.7089036452594197, + "learning_rate": 4.198473282442748e-06, + "loss": 0.6571, + "step": 110 + }, + { + "epoch": 0.01, + "grad_norm": 2.2025902216433675, + "learning_rate": 4.236641221374046e-06, + "loss": 0.688, + "step": 111 + }, + { + "epoch": 0.01, + "grad_norm": 2.067129792466558, + "learning_rate": 4.274809160305344e-06, + "loss": 0.6357, + "step": 112 + }, + { + "epoch": 0.01, + "grad_norm": 2.434797515113872, + "learning_rate": 4.312977099236642e-06, + "loss": 0.6088, + "step": 113 + }, + { + "epoch": 0.01, + "grad_norm": 2.2531317484144475, + "learning_rate": 4.351145038167939e-06, + "loss": 0.5689, + "step": 114 + }, + { + "epoch": 0.01, + "grad_norm": 2.755682999468428, + "learning_rate": 4.389312977099237e-06, + "loss": 0.6887, + "step": 115 + }, + { + "epoch": 0.01, + "grad_norm": 2.6736769324060123, + "learning_rate": 4.427480916030535e-06, + "loss": 0.4289, + "step": 116 + }, + { + "epoch": 0.01, + "grad_norm": 2.359416741890606, + "learning_rate": 4.465648854961833e-06, + "loss": 0.6355, + "step": 117 + }, + { + "epoch": 0.01, + "grad_norm": 2.5295402364438138, + "learning_rate": 4.5038167938931296e-06, + "loss": 0.6899, + "step": 118 + }, + { + "epoch": 0.01, + "grad_norm": 2.478525249113735, + "learning_rate": 4.541984732824427e-06, + "loss": 0.5144, + "step": 119 + }, + { + "epoch": 0.01, + "grad_norm": 2.6421844712036044, + "learning_rate": 4.580152671755725e-06, + "loss": 0.6299, + "step": 120 + }, + { + "epoch": 0.01, + "grad_norm": 2.362963612843913, + "learning_rate": 4.618320610687023e-06, + "loss": 0.6567, + "step": 121 + }, + { + "epoch": 0.01, + "grad_norm": 1.9279529844167191, + "learning_rate": 4.656488549618321e-06, + "loss": 0.6648, + "step": 122 + }, + { + "epoch": 0.01, + "grad_norm": 1.9394517814905574, + "learning_rate": 4.694656488549618e-06, + "loss": 0.692, + "step": 123 + }, + { + "epoch": 0.01, + "grad_norm": 4.714420499303174, + "learning_rate": 4.732824427480917e-06, + "loss": 0.5938, + "step": 124 + }, + { + "epoch": 0.01, + "grad_norm": 2.8280708450828147, + "learning_rate": 4.770992366412215e-06, + "loss": 0.5425, + "step": 125 + }, + { + "epoch": 0.01, + "grad_norm": 3.548074124104295, + "learning_rate": 4.8091603053435125e-06, + "loss": 0.5346, + "step": 126 + }, + { + "epoch": 0.01, + "grad_norm": 11.644079723835075, + "learning_rate": 4.847328244274809e-06, + "loss": 0.6376, + "step": 127 + }, + { + "epoch": 0.01, + "grad_norm": 2.87570972704621, + "learning_rate": 4.885496183206107e-06, + "loss": 0.5417, + "step": 128 + }, + { + "epoch": 0.01, + "grad_norm": 2.2660984143895315, + "learning_rate": 4.923664122137405e-06, + "loss": 0.6728, + "step": 129 + }, + { + "epoch": 0.01, + "grad_norm": 1.9953465116776197, + "learning_rate": 4.961832061068703e-06, + "loss": 0.6377, + "step": 130 + }, + { + "epoch": 0.02, + "grad_norm": 2.080578277369364, + "learning_rate": 5e-06, + "loss": 0.6673, + "step": 131 + }, + { + "epoch": 0.02, + "grad_norm": 1.9444206455609145, + "learning_rate": 5.038167938931297e-06, + "loss": 0.6614, + "step": 132 + }, + { + "epoch": 0.02, + "grad_norm": 2.0632209046595285, + "learning_rate": 5.076335877862596e-06, + "loss": 0.6051, + "step": 133 + }, + { + "epoch": 0.02, + "grad_norm": 7.266717827575863, + "learning_rate": 5.114503816793893e-06, + "loss": 0.6118, + "step": 134 + }, + { + "epoch": 0.02, + "grad_norm": 2.1370183075180718, + "learning_rate": 5.1526717557251914e-06, + "loss": 0.5897, + "step": 135 + }, + { + "epoch": 0.02, + "grad_norm": 2.190800019826852, + "learning_rate": 5.190839694656488e-06, + "loss": 0.5255, + "step": 136 + }, + { + "epoch": 0.02, + "grad_norm": 2.8083730614590907, + "learning_rate": 5.229007633587787e-06, + "loss": 0.5118, + "step": 137 + }, + { + "epoch": 0.02, + "grad_norm": 3.1907072189869132, + "learning_rate": 5.267175572519084e-06, + "loss": 0.6584, + "step": 138 + }, + { + "epoch": 0.02, + "grad_norm": 2.0283430253391495, + "learning_rate": 5.3053435114503825e-06, + "loss": 0.658, + "step": 139 + }, + { + "epoch": 0.02, + "grad_norm": 1.9369334661258206, + "learning_rate": 5.34351145038168e-06, + "loss": 0.4788, + "step": 140 + }, + { + "epoch": 0.02, + "grad_norm": 2.1970211066108503, + "learning_rate": 5.381679389312977e-06, + "loss": 0.6033, + "step": 141 + }, + { + "epoch": 0.02, + "grad_norm": 2.769573750414448, + "learning_rate": 5.419847328244276e-06, + "loss": 0.511, + "step": 142 + }, + { + "epoch": 0.02, + "grad_norm": 3.0832132330439515, + "learning_rate": 5.458015267175573e-06, + "loss": 0.5828, + "step": 143 + }, + { + "epoch": 0.02, + "grad_norm": 1.7745778751743415, + "learning_rate": 5.496183206106871e-06, + "loss": 0.6611, + "step": 144 + }, + { + "epoch": 0.02, + "grad_norm": 2.146549747704848, + "learning_rate": 5.534351145038168e-06, + "loss": 0.66, + "step": 145 + }, + { + "epoch": 0.02, + "grad_norm": 2.199201288842087, + "learning_rate": 5.572519083969467e-06, + "loss": 0.5655, + "step": 146 + }, + { + "epoch": 0.02, + "grad_norm": 2.243549117473414, + "learning_rate": 5.610687022900764e-06, + "loss": 0.5957, + "step": 147 + }, + { + "epoch": 0.02, + "grad_norm": 2.0292520315634683, + "learning_rate": 5.648854961832062e-06, + "loss": 0.5581, + "step": 148 + }, + { + "epoch": 0.02, + "grad_norm": 2.690921987953104, + "learning_rate": 5.687022900763359e-06, + "loss": 0.5743, + "step": 149 + }, + { + "epoch": 0.02, + "grad_norm": 3.1856130218803993, + "learning_rate": 5.725190839694656e-06, + "loss": 0.5946, + "step": 150 + }, + { + "epoch": 0.02, + "grad_norm": 2.472080613760002, + "learning_rate": 5.763358778625955e-06, + "loss": 0.631, + "step": 151 + }, + { + "epoch": 0.02, + "grad_norm": 2.96750544521273, + "learning_rate": 5.801526717557252e-06, + "loss": 0.5649, + "step": 152 + }, + { + "epoch": 0.02, + "grad_norm": 2.700201324810448, + "learning_rate": 5.83969465648855e-06, + "loss": 0.5695, + "step": 153 + }, + { + "epoch": 0.02, + "grad_norm": 2.415854942225705, + "learning_rate": 5.877862595419848e-06, + "loss": 0.5823, + "step": 154 + }, + { + "epoch": 0.02, + "grad_norm": 1.90376096224584, + "learning_rate": 5.916030534351146e-06, + "loss": 0.6253, + "step": 155 + }, + { + "epoch": 0.02, + "grad_norm": 6.626426782028894, + "learning_rate": 5.9541984732824435e-06, + "loss": 0.5143, + "step": 156 + }, + { + "epoch": 0.02, + "grad_norm": 2.3791432741123533, + "learning_rate": 5.992366412213741e-06, + "loss": 0.5803, + "step": 157 + }, + { + "epoch": 0.02, + "grad_norm": 2.205890973911255, + "learning_rate": 6.030534351145039e-06, + "loss": 0.6824, + "step": 158 + }, + { + "epoch": 0.02, + "grad_norm": 1.8596542664171676, + "learning_rate": 6.068702290076336e-06, + "loss": 0.5981, + "step": 159 + }, + { + "epoch": 0.02, + "grad_norm": 4.2277976360702185, + "learning_rate": 6.1068702290076346e-06, + "loss": 0.6759, + "step": 160 + }, + { + "epoch": 0.02, + "grad_norm": 2.2559753041168205, + "learning_rate": 6.1450381679389315e-06, + "loss": 0.6683, + "step": 161 + }, + { + "epoch": 0.02, + "grad_norm": 2.0295168911383654, + "learning_rate": 6.18320610687023e-06, + "loss": 0.6725, + "step": 162 + }, + { + "epoch": 0.02, + "grad_norm": 1.9910566130593819, + "learning_rate": 6.221374045801527e-06, + "loss": 0.702, + "step": 163 + }, + { + "epoch": 0.02, + "grad_norm": 3.2146347813921103, + "learning_rate": 6.259541984732826e-06, + "loss": 0.6066, + "step": 164 + }, + { + "epoch": 0.02, + "grad_norm": 2.4797392947537316, + "learning_rate": 6.2977099236641225e-06, + "loss": 0.6176, + "step": 165 + }, + { + "epoch": 0.02, + "grad_norm": 2.2455278363546824, + "learning_rate": 6.335877862595419e-06, + "loss": 0.6191, + "step": 166 + }, + { + "epoch": 0.02, + "grad_norm": 1.90962756558809, + "learning_rate": 6.374045801526718e-06, + "loss": 0.4593, + "step": 167 + }, + { + "epoch": 0.02, + "grad_norm": 2.2384529189745477, + "learning_rate": 6.412213740458016e-06, + "loss": 0.6232, + "step": 168 + }, + { + "epoch": 0.02, + "grad_norm": 3.2377380490237555, + "learning_rate": 6.4503816793893135e-06, + "loss": 0.5157, + "step": 169 + }, + { + "epoch": 0.02, + "grad_norm": 2.403674760526213, + "learning_rate": 6.488549618320611e-06, + "loss": 0.682, + "step": 170 + }, + { + "epoch": 0.02, + "grad_norm": 2.0848187572084997, + "learning_rate": 6.526717557251909e-06, + "loss": 0.5558, + "step": 171 + }, + { + "epoch": 0.02, + "grad_norm": 1.955984059636698, + "learning_rate": 6.564885496183207e-06, + "loss": 0.5878, + "step": 172 + }, + { + "epoch": 0.02, + "grad_norm": 1.061522538312033, + "learning_rate": 6.6030534351145046e-06, + "loss": 0.7833, + "step": 173 + }, + { + "epoch": 0.02, + "grad_norm": 2.7321834637588127, + "learning_rate": 6.641221374045802e-06, + "loss": 0.54, + "step": 174 + }, + { + "epoch": 0.02, + "grad_norm": 2.0042068571296685, + "learning_rate": 6.679389312977099e-06, + "loss": 0.533, + "step": 175 + }, + { + "epoch": 0.02, + "grad_norm": 2.771325606052799, + "learning_rate": 6.717557251908398e-06, + "loss": 0.5431, + "step": 176 + }, + { + "epoch": 0.02, + "grad_norm": 2.1456729366048592, + "learning_rate": 6.755725190839695e-06, + "loss": 0.6094, + "step": 177 + }, + { + "epoch": 0.02, + "grad_norm": 2.1799578412565372, + "learning_rate": 6.793893129770993e-06, + "loss": 0.6049, + "step": 178 + }, + { + "epoch": 0.02, + "grad_norm": 2.614696765039329, + "learning_rate": 6.83206106870229e-06, + "loss": 0.5772, + "step": 179 + }, + { + "epoch": 0.02, + "grad_norm": 2.1295583526286466, + "learning_rate": 6.870229007633589e-06, + "loss": 0.558, + "step": 180 + }, + { + "epoch": 0.02, + "grad_norm": 2.2013065626882415, + "learning_rate": 6.908396946564886e-06, + "loss": 0.5555, + "step": 181 + }, + { + "epoch": 0.02, + "grad_norm": 2.215697624462296, + "learning_rate": 6.946564885496184e-06, + "loss": 0.5802, + "step": 182 + }, + { + "epoch": 0.02, + "grad_norm": 2.89029255805644, + "learning_rate": 6.984732824427481e-06, + "loss": 0.6081, + "step": 183 + }, + { + "epoch": 0.02, + "grad_norm": 2.1176122493733756, + "learning_rate": 7.022900763358779e-06, + "loss": 0.5276, + "step": 184 + }, + { + "epoch": 0.02, + "grad_norm": 2.3369678727718837, + "learning_rate": 7.061068702290077e-06, + "loss": 0.5832, + "step": 185 + }, + { + "epoch": 0.02, + "grad_norm": 5.758125192525931, + "learning_rate": 7.0992366412213746e-06, + "loss": 0.5762, + "step": 186 + }, + { + "epoch": 0.02, + "grad_norm": 3.145207402153636, + "learning_rate": 7.137404580152672e-06, + "loss": 0.5505, + "step": 187 + }, + { + "epoch": 0.02, + "grad_norm": 3.163194900291638, + "learning_rate": 7.17557251908397e-06, + "loss": 0.6057, + "step": 188 + }, + { + "epoch": 0.02, + "grad_norm": 2.298119938602785, + "learning_rate": 7.213740458015268e-06, + "loss": 0.6132, + "step": 189 + }, + { + "epoch": 0.02, + "grad_norm": 2.5740904622803455, + "learning_rate": 7.251908396946566e-06, + "loss": 0.5028, + "step": 190 + }, + { + "epoch": 0.02, + "grad_norm": 2.7504979431362204, + "learning_rate": 7.290076335877863e-06, + "loss": 0.6823, + "step": 191 + }, + { + "epoch": 0.02, + "grad_norm": 1.7771636257394814, + "learning_rate": 7.328244274809161e-06, + "loss": 0.6498, + "step": 192 + }, + { + "epoch": 0.02, + "grad_norm": 2.740196274248293, + "learning_rate": 7.366412213740458e-06, + "loss": 0.5384, + "step": 193 + }, + { + "epoch": 0.02, + "grad_norm": 2.5915674486945917, + "learning_rate": 7.404580152671757e-06, + "loss": 0.6461, + "step": 194 + }, + { + "epoch": 0.02, + "grad_norm": 2.902793071844288, + "learning_rate": 7.4427480916030536e-06, + "loss": 0.569, + "step": 195 + }, + { + "epoch": 0.02, + "grad_norm": 1.9940485866822664, + "learning_rate": 7.480916030534352e-06, + "loss": 0.5714, + "step": 196 + }, + { + "epoch": 0.02, + "grad_norm": 2.350033204872447, + "learning_rate": 7.519083969465649e-06, + "loss": 0.5913, + "step": 197 + }, + { + "epoch": 0.02, + "grad_norm": 4.324940727123465, + "learning_rate": 7.557251908396948e-06, + "loss": 0.6054, + "step": 198 + }, + { + "epoch": 0.02, + "grad_norm": 1.8656943462922542, + "learning_rate": 7.595419847328245e-06, + "loss": 0.5413, + "step": 199 + }, + { + "epoch": 0.02, + "grad_norm": 2.2045990742282098, + "learning_rate": 7.633587786259543e-06, + "loss": 0.54, + "step": 200 + }, + { + "epoch": 0.02, + "grad_norm": 1.9996523640336585, + "learning_rate": 7.671755725190841e-06, + "loss": 0.5698, + "step": 201 + }, + { + "epoch": 0.02, + "grad_norm": 2.6789297122205507, + "learning_rate": 7.709923664122137e-06, + "loss": 0.5535, + "step": 202 + }, + { + "epoch": 0.02, + "grad_norm": 6.409617792143921, + "learning_rate": 7.748091603053436e-06, + "loss": 0.5816, + "step": 203 + }, + { + "epoch": 0.02, + "grad_norm": 2.4573156676174337, + "learning_rate": 7.786259541984733e-06, + "loss": 0.5612, + "step": 204 + }, + { + "epoch": 0.02, + "grad_norm": 2.5934502129606405, + "learning_rate": 7.824427480916032e-06, + "loss": 0.5842, + "step": 205 + }, + { + "epoch": 0.02, + "grad_norm": 3.4850518085540276, + "learning_rate": 7.862595419847328e-06, + "loss": 0.5914, + "step": 206 + }, + { + "epoch": 0.02, + "grad_norm": 1.8889619954933417, + "learning_rate": 7.900763358778627e-06, + "loss": 0.5697, + "step": 207 + }, + { + "epoch": 0.02, + "grad_norm": 2.282188924360206, + "learning_rate": 7.938931297709924e-06, + "loss": 0.5801, + "step": 208 + }, + { + "epoch": 0.02, + "grad_norm": 2.086007485826392, + "learning_rate": 7.977099236641223e-06, + "loss": 0.4892, + "step": 209 + }, + { + "epoch": 0.02, + "grad_norm": 1.7916431702271172, + "learning_rate": 8.015267175572519e-06, + "loss": 0.5171, + "step": 210 + }, + { + "epoch": 0.02, + "grad_norm": 2.319663191915118, + "learning_rate": 8.053435114503817e-06, + "loss": 0.4724, + "step": 211 + }, + { + "epoch": 0.02, + "grad_norm": 2.187240688307719, + "learning_rate": 8.091603053435115e-06, + "loss": 0.6156, + "step": 212 + }, + { + "epoch": 0.02, + "grad_norm": 3.0433167984071057, + "learning_rate": 8.129770992366412e-06, + "loss": 0.6329, + "step": 213 + }, + { + "epoch": 0.02, + "grad_norm": 6.654672821595755, + "learning_rate": 8.16793893129771e-06, + "loss": 0.5735, + "step": 214 + }, + { + "epoch": 0.02, + "grad_norm": 2.153876898483677, + "learning_rate": 8.206106870229008e-06, + "loss": 0.4993, + "step": 215 + }, + { + "epoch": 0.02, + "grad_norm": 3.9590377121549656, + "learning_rate": 8.244274809160306e-06, + "loss": 0.5775, + "step": 216 + }, + { + "epoch": 0.02, + "grad_norm": 2.0244038668946165, + "learning_rate": 8.282442748091603e-06, + "loss": 0.6411, + "step": 217 + }, + { + "epoch": 0.03, + "grad_norm": 2.0297199850465115, + "learning_rate": 8.320610687022901e-06, + "loss": 0.6323, + "step": 218 + }, + { + "epoch": 0.03, + "grad_norm": 2.419648325354996, + "learning_rate": 8.358778625954199e-06, + "loss": 0.6267, + "step": 219 + }, + { + "epoch": 0.03, + "grad_norm": 1.923402702717104, + "learning_rate": 8.396946564885497e-06, + "loss": 0.546, + "step": 220 + }, + { + "epoch": 0.03, + "grad_norm": 2.2657269922148893, + "learning_rate": 8.435114503816794e-06, + "loss": 0.5049, + "step": 221 + }, + { + "epoch": 0.03, + "grad_norm": 3.094469610542637, + "learning_rate": 8.473282442748092e-06, + "loss": 0.6634, + "step": 222 + }, + { + "epoch": 0.03, + "grad_norm": 2.1265986245033823, + "learning_rate": 8.51145038167939e-06, + "loss": 0.5247, + "step": 223 + }, + { + "epoch": 0.03, + "grad_norm": 2.835965225509286, + "learning_rate": 8.549618320610688e-06, + "loss": 0.3997, + "step": 224 + }, + { + "epoch": 0.03, + "grad_norm": 2.969936874430152, + "learning_rate": 8.587786259541985e-06, + "loss": 0.5997, + "step": 225 + }, + { + "epoch": 0.03, + "grad_norm": 2.7795713607826644, + "learning_rate": 8.625954198473283e-06, + "loss": 0.5052, + "step": 226 + }, + { + "epoch": 0.03, + "grad_norm": 4.763988706293592, + "learning_rate": 8.664122137404581e-06, + "loss": 0.6107, + "step": 227 + }, + { + "epoch": 0.03, + "grad_norm": 2.419915329943222, + "learning_rate": 8.702290076335879e-06, + "loss": 0.5928, + "step": 228 + }, + { + "epoch": 0.03, + "grad_norm": 2.3912195233065687, + "learning_rate": 8.740458015267176e-06, + "loss": 0.5121, + "step": 229 + }, + { + "epoch": 0.03, + "grad_norm": 2.481112229986324, + "learning_rate": 8.778625954198474e-06, + "loss": 0.5493, + "step": 230 + }, + { + "epoch": 0.03, + "grad_norm": 0.9970811468466948, + "learning_rate": 8.816793893129772e-06, + "loss": 0.8413, + "step": 231 + }, + { + "epoch": 0.03, + "grad_norm": 2.6643719746413304, + "learning_rate": 8.85496183206107e-06, + "loss": 0.6671, + "step": 232 + }, + { + "epoch": 0.03, + "grad_norm": 2.4795027111569565, + "learning_rate": 8.893129770992368e-06, + "loss": 0.6332, + "step": 233 + }, + { + "epoch": 0.03, + "grad_norm": 3.320890057855446, + "learning_rate": 8.931297709923665e-06, + "loss": 0.5952, + "step": 234 + }, + { + "epoch": 0.03, + "grad_norm": 2.1581194646705244, + "learning_rate": 8.969465648854963e-06, + "loss": 0.6559, + "step": 235 + }, + { + "epoch": 0.03, + "grad_norm": 2.6637875252727024, + "learning_rate": 9.007633587786259e-06, + "loss": 0.6121, + "step": 236 + }, + { + "epoch": 0.03, + "grad_norm": 2.3427402107652973, + "learning_rate": 9.045801526717559e-06, + "loss": 0.5524, + "step": 237 + }, + { + "epoch": 0.03, + "grad_norm": 2.2473693458726878, + "learning_rate": 9.083969465648855e-06, + "loss": 0.5285, + "step": 238 + }, + { + "epoch": 0.03, + "grad_norm": 2.013251975305994, + "learning_rate": 9.122137404580154e-06, + "loss": 0.5774, + "step": 239 + }, + { + "epoch": 0.03, + "grad_norm": 1.8801664268868556, + "learning_rate": 9.16030534351145e-06, + "loss": 0.4703, + "step": 240 + }, + { + "epoch": 0.03, + "grad_norm": 1.8842559284333256, + "learning_rate": 9.19847328244275e-06, + "loss": 0.648, + "step": 241 + }, + { + "epoch": 0.03, + "grad_norm": 2.2471632106994104, + "learning_rate": 9.236641221374046e-06, + "loss": 0.5878, + "step": 242 + }, + { + "epoch": 0.03, + "grad_norm": 2.3769084743658397, + "learning_rate": 9.274809160305345e-06, + "loss": 0.6307, + "step": 243 + }, + { + "epoch": 0.03, + "grad_norm": 2.4009481548988285, + "learning_rate": 9.312977099236641e-06, + "loss": 0.5087, + "step": 244 + }, + { + "epoch": 0.03, + "grad_norm": 4.9143294407835905, + "learning_rate": 9.351145038167939e-06, + "loss": 0.4744, + "step": 245 + }, + { + "epoch": 0.03, + "grad_norm": 1.3208513550143612, + "learning_rate": 9.389312977099237e-06, + "loss": 0.8089, + "step": 246 + }, + { + "epoch": 0.03, + "grad_norm": 3.04438068976235, + "learning_rate": 9.427480916030534e-06, + "loss": 0.5826, + "step": 247 + }, + { + "epoch": 0.03, + "grad_norm": 7.873175343575843, + "learning_rate": 9.465648854961834e-06, + "loss": 0.579, + "step": 248 + }, + { + "epoch": 0.03, + "grad_norm": 2.024774012889935, + "learning_rate": 9.50381679389313e-06, + "loss": 0.5229, + "step": 249 + }, + { + "epoch": 0.03, + "grad_norm": 1.8831178030571767, + "learning_rate": 9.54198473282443e-06, + "loss": 0.4917, + "step": 250 + }, + { + "epoch": 0.03, + "grad_norm": 3.629240897707987, + "learning_rate": 9.580152671755725e-06, + "loss": 0.5385, + "step": 251 + }, + { + "epoch": 0.03, + "grad_norm": 1.880285496436412, + "learning_rate": 9.618320610687025e-06, + "loss": 0.5479, + "step": 252 + }, + { + "epoch": 0.03, + "grad_norm": 3.1072919562603794, + "learning_rate": 9.656488549618321e-06, + "loss": 0.5006, + "step": 253 + }, + { + "epoch": 0.03, + "grad_norm": 3.153731884165473, + "learning_rate": 9.694656488549619e-06, + "loss": 0.5696, + "step": 254 + }, + { + "epoch": 0.03, + "grad_norm": 2.846966274165905, + "learning_rate": 9.732824427480917e-06, + "loss": 0.6431, + "step": 255 + }, + { + "epoch": 0.03, + "grad_norm": 2.2173124690287693, + "learning_rate": 9.770992366412214e-06, + "loss": 0.5526, + "step": 256 + }, + { + "epoch": 0.03, + "grad_norm": 3.091312726274496, + "learning_rate": 9.809160305343512e-06, + "loss": 0.5499, + "step": 257 + }, + { + "epoch": 0.03, + "grad_norm": 2.654764801407029, + "learning_rate": 9.84732824427481e-06, + "loss": 0.5851, + "step": 258 + }, + { + "epoch": 0.03, + "grad_norm": 1.857701175019791, + "learning_rate": 9.885496183206108e-06, + "loss": 0.6158, + "step": 259 + }, + { + "epoch": 0.03, + "grad_norm": 2.319637965431276, + "learning_rate": 9.923664122137405e-06, + "loss": 0.5985, + "step": 260 + }, + { + "epoch": 0.03, + "grad_norm": 2.3883908579604807, + "learning_rate": 9.961832061068703e-06, + "loss": 0.5516, + "step": 261 + }, + { + "epoch": 0.03, + "grad_norm": 2.57325364523273, + "learning_rate": 1e-05, + "loss": 0.5416, + "step": 262 + }, + { + "epoch": 0.03, + "grad_norm": 3.0248354753736764, + "learning_rate": 9.999999653700435e-06, + "loss": 0.6027, + "step": 263 + }, + { + "epoch": 0.03, + "grad_norm": 1.9730043990472355, + "learning_rate": 9.99999861480178e-06, + "loss": 0.6252, + "step": 264 + }, + { + "epoch": 0.03, + "grad_norm": 2.118427279578174, + "learning_rate": 9.999996883304185e-06, + "loss": 0.5788, + "step": 265 + }, + { + "epoch": 0.03, + "grad_norm": 1.7881154523851575, + "learning_rate": 9.999994459207888e-06, + "loss": 0.6092, + "step": 266 + }, + { + "epoch": 0.03, + "grad_norm": 2.264879859219285, + "learning_rate": 9.999991342513225e-06, + "loss": 0.5427, + "step": 267 + }, + { + "epoch": 0.03, + "grad_norm": 2.048261063151237, + "learning_rate": 9.999987533220625e-06, + "loss": 0.5661, + "step": 268 + }, + { + "epoch": 0.03, + "grad_norm": 2.0040447097897056, + "learning_rate": 9.99998303133062e-06, + "loss": 0.4841, + "step": 269 + }, + { + "epoch": 0.03, + "grad_norm": 2.598392609486478, + "learning_rate": 9.999977836843832e-06, + "loss": 0.5857, + "step": 270 + }, + { + "epoch": 0.03, + "grad_norm": 2.7250674643525077, + "learning_rate": 9.999971949760978e-06, + "loss": 0.6312, + "step": 271 + }, + { + "epoch": 0.03, + "grad_norm": 1.2529182778988164, + "learning_rate": 9.999965370082877e-06, + "loss": 0.7782, + "step": 272 + }, + { + "epoch": 0.03, + "grad_norm": 2.828344703223877, + "learning_rate": 9.999958097810438e-06, + "loss": 0.581, + "step": 273 + }, + { + "epoch": 0.03, + "grad_norm": 2.4503651911763353, + "learning_rate": 9.99995013294467e-06, + "loss": 0.7074, + "step": 274 + }, + { + "epoch": 0.03, + "grad_norm": 2.1203970365001243, + "learning_rate": 9.999941475486676e-06, + "loss": 0.5359, + "step": 275 + }, + { + "epoch": 0.03, + "grad_norm": 3.0760521610611997, + "learning_rate": 9.999932125437653e-06, + "loss": 0.5741, + "step": 276 + }, + { + "epoch": 0.03, + "grad_norm": 2.7191643640792456, + "learning_rate": 9.9999220827989e-06, + "loss": 0.6516, + "step": 277 + }, + { + "epoch": 0.03, + "grad_norm": 7.410014609662457, + "learning_rate": 9.999911347571805e-06, + "loss": 0.4869, + "step": 278 + }, + { + "epoch": 0.03, + "grad_norm": 1.9030209071839324, + "learning_rate": 9.999899919757856e-06, + "loss": 0.6146, + "step": 279 + }, + { + "epoch": 0.03, + "grad_norm": 2.229525366650312, + "learning_rate": 9.999887799358638e-06, + "loss": 0.5973, + "step": 280 + }, + { + "epoch": 0.03, + "grad_norm": 3.1925446294784217, + "learning_rate": 9.999874986375826e-06, + "loss": 0.5777, + "step": 281 + }, + { + "epoch": 0.03, + "grad_norm": 1.9595952562877816, + "learning_rate": 9.999861480811197e-06, + "loss": 0.5594, + "step": 282 + }, + { + "epoch": 0.03, + "grad_norm": 2.6551425234617567, + "learning_rate": 9.999847282666623e-06, + "loss": 0.6058, + "step": 283 + }, + { + "epoch": 0.03, + "grad_norm": 3.476059875470107, + "learning_rate": 9.999832391944069e-06, + "loss": 0.6035, + "step": 284 + }, + { + "epoch": 0.03, + "grad_norm": 1.2523132640825916, + "learning_rate": 9.999816808645598e-06, + "loss": 0.7623, + "step": 285 + }, + { + "epoch": 0.03, + "grad_norm": 2.1409126701603602, + "learning_rate": 9.999800532773367e-06, + "loss": 0.4839, + "step": 286 + }, + { + "epoch": 0.03, + "grad_norm": 2.6160244285665035, + "learning_rate": 9.999783564329634e-06, + "loss": 0.6115, + "step": 287 + }, + { + "epoch": 0.03, + "grad_norm": 2.078427311079447, + "learning_rate": 9.999765903316746e-06, + "loss": 0.589, + "step": 288 + }, + { + "epoch": 0.03, + "grad_norm": 2.0321051193973245, + "learning_rate": 9.999747549737153e-06, + "loss": 0.6369, + "step": 289 + }, + { + "epoch": 0.03, + "grad_norm": 2.6592486244892273, + "learning_rate": 9.999728503593395e-06, + "loss": 0.6332, + "step": 290 + }, + { + "epoch": 0.03, + "grad_norm": 1.7256452984569186, + "learning_rate": 9.999708764888109e-06, + "loss": 0.6107, + "step": 291 + }, + { + "epoch": 0.03, + "grad_norm": 2.1748697429490256, + "learning_rate": 9.999688333624034e-06, + "loss": 0.6385, + "step": 292 + }, + { + "epoch": 0.03, + "grad_norm": 3.1673660380036472, + "learning_rate": 9.999667209803994e-06, + "loss": 0.4918, + "step": 293 + }, + { + "epoch": 0.03, + "grad_norm": 2.040215141055379, + "learning_rate": 9.999645393430918e-06, + "loss": 0.5293, + "step": 294 + }, + { + "epoch": 0.03, + "grad_norm": 2.110658684469769, + "learning_rate": 9.999622884507831e-06, + "loss": 0.5277, + "step": 295 + }, + { + "epoch": 0.03, + "grad_norm": 2.0246698956170928, + "learning_rate": 9.999599683037847e-06, + "loss": 0.6048, + "step": 296 + }, + { + "epoch": 0.03, + "grad_norm": 3.469820273450985, + "learning_rate": 9.999575789024179e-06, + "loss": 0.5437, + "step": 297 + }, + { + "epoch": 0.03, + "grad_norm": 2.461701150196677, + "learning_rate": 9.99955120247014e-06, + "loss": 0.4948, + "step": 298 + }, + { + "epoch": 0.03, + "grad_norm": 1.976559385983663, + "learning_rate": 9.999525923379133e-06, + "loss": 0.5753, + "step": 299 + }, + { + "epoch": 0.03, + "grad_norm": 2.11564352088973, + "learning_rate": 9.999499951754663e-06, + "loss": 0.5357, + "step": 300 + }, + { + "epoch": 0.03, + "grad_norm": 1.9559177558880618, + "learning_rate": 9.999473287600326e-06, + "loss": 0.5466, + "step": 301 + }, + { + "epoch": 0.03, + "grad_norm": 2.321679877882175, + "learning_rate": 9.999445930919813e-06, + "loss": 0.5651, + "step": 302 + }, + { + "epoch": 0.03, + "grad_norm": 2.332491225290583, + "learning_rate": 9.999417881716918e-06, + "loss": 0.4829, + "step": 303 + }, + { + "epoch": 0.03, + "grad_norm": 2.608866943416747, + "learning_rate": 9.999389139995521e-06, + "loss": 0.603, + "step": 304 + }, + { + "epoch": 0.04, + "grad_norm": 2.1026461959102627, + "learning_rate": 9.999359705759607e-06, + "loss": 0.6674, + "step": 305 + }, + { + "epoch": 0.04, + "grad_norm": 1.969653313306091, + "learning_rate": 9.999329579013254e-06, + "loss": 0.5813, + "step": 306 + }, + { + "epoch": 0.04, + "grad_norm": 2.104900216286705, + "learning_rate": 9.999298759760634e-06, + "loss": 0.5826, + "step": 307 + }, + { + "epoch": 0.04, + "grad_norm": 3.197111123007983, + "learning_rate": 9.999267248006013e-06, + "loss": 0.5492, + "step": 308 + }, + { + "epoch": 0.04, + "grad_norm": 2.191814484436333, + "learning_rate": 9.999235043753761e-06, + "loss": 0.55, + "step": 309 + }, + { + "epoch": 0.04, + "grad_norm": 2.0641562916736063, + "learning_rate": 9.999202147008336e-06, + "loss": 0.668, + "step": 310 + }, + { + "epoch": 0.04, + "grad_norm": 2.432220568076151, + "learning_rate": 9.999168557774294e-06, + "loss": 0.4833, + "step": 311 + }, + { + "epoch": 0.04, + "grad_norm": 1.846028110854124, + "learning_rate": 9.999134276056293e-06, + "loss": 0.4844, + "step": 312 + }, + { + "epoch": 0.04, + "grad_norm": 2.2196168761593773, + "learning_rate": 9.999099301859074e-06, + "loss": 0.5817, + "step": 313 + }, + { + "epoch": 0.04, + "grad_norm": 2.0590875049277537, + "learning_rate": 9.999063635187487e-06, + "loss": 0.6188, + "step": 314 + }, + { + "epoch": 0.04, + "grad_norm": 2.3756564260944026, + "learning_rate": 9.999027276046471e-06, + "loss": 0.6247, + "step": 315 + }, + { + "epoch": 0.04, + "grad_norm": 1.9224280943537198, + "learning_rate": 9.998990224441062e-06, + "loss": 0.4924, + "step": 316 + }, + { + "epoch": 0.04, + "grad_norm": 2.2898728158583266, + "learning_rate": 9.998952480376397e-06, + "loss": 0.5806, + "step": 317 + }, + { + "epoch": 0.04, + "grad_norm": 2.2459749555861923, + "learning_rate": 9.998914043857696e-06, + "loss": 0.6464, + "step": 318 + }, + { + "epoch": 0.04, + "grad_norm": 2.2878193590407045, + "learning_rate": 9.998874914890289e-06, + "loss": 0.564, + "step": 319 + }, + { + "epoch": 0.04, + "grad_norm": 3.022189877704837, + "learning_rate": 9.998835093479593e-06, + "loss": 0.6183, + "step": 320 + }, + { + "epoch": 0.04, + "grad_norm": 1.7852386263884152, + "learning_rate": 9.998794579631127e-06, + "loss": 0.7164, + "step": 321 + }, + { + "epoch": 0.04, + "grad_norm": 3.4952600513748306, + "learning_rate": 9.998753373350503e-06, + "loss": 0.6157, + "step": 322 + }, + { + "epoch": 0.04, + "grad_norm": 3.607331735140249, + "learning_rate": 9.998711474643426e-06, + "loss": 0.6304, + "step": 323 + }, + { + "epoch": 0.04, + "grad_norm": 3.199678650417005, + "learning_rate": 9.9986688835157e-06, + "loss": 0.5094, + "step": 324 + }, + { + "epoch": 0.04, + "grad_norm": 1.8105075138624742, + "learning_rate": 9.998625599973228e-06, + "loss": 0.6517, + "step": 325 + }, + { + "epoch": 0.04, + "grad_norm": 3.2766919691830596, + "learning_rate": 9.998581624022004e-06, + "loss": 0.7044, + "step": 326 + }, + { + "epoch": 0.04, + "grad_norm": 2.325303305407908, + "learning_rate": 9.998536955668117e-06, + "loss": 0.5699, + "step": 327 + }, + { + "epoch": 0.04, + "grad_norm": 2.198260263775083, + "learning_rate": 9.99849159491776e-06, + "loss": 0.5399, + "step": 328 + }, + { + "epoch": 0.04, + "grad_norm": 1.8429983667419458, + "learning_rate": 9.99844554177721e-06, + "loss": 0.4809, + "step": 329 + }, + { + "epoch": 0.04, + "grad_norm": 2.278670123122288, + "learning_rate": 9.998398796252851e-06, + "loss": 0.5422, + "step": 330 + }, + { + "epoch": 0.04, + "grad_norm": 2.076292643074287, + "learning_rate": 9.998351358351154e-06, + "loss": 0.6621, + "step": 331 + }, + { + "epoch": 0.04, + "grad_norm": 2.049602712120544, + "learning_rate": 9.998303228078695e-06, + "loss": 0.5686, + "step": 332 + }, + { + "epoch": 0.04, + "grad_norm": 1.9683589965307053, + "learning_rate": 9.998254405442139e-06, + "loss": 0.599, + "step": 333 + }, + { + "epoch": 0.04, + "grad_norm": 2.3829146251777886, + "learning_rate": 9.998204890448247e-06, + "loss": 0.6134, + "step": 334 + }, + { + "epoch": 0.04, + "grad_norm": 1.8621611538553502, + "learning_rate": 9.99815468310388e-06, + "loss": 0.5618, + "step": 335 + }, + { + "epoch": 0.04, + "grad_norm": 1.9501427117240704, + "learning_rate": 9.99810378341599e-06, + "loss": 0.5592, + "step": 336 + }, + { + "epoch": 0.04, + "grad_norm": 2.2173458141975777, + "learning_rate": 9.998052191391633e-06, + "loss": 0.6047, + "step": 337 + }, + { + "epoch": 0.04, + "grad_norm": 2.1083197050433076, + "learning_rate": 9.99799990703795e-06, + "loss": 0.6123, + "step": 338 + }, + { + "epoch": 0.04, + "grad_norm": 2.3886049367478783, + "learning_rate": 9.997946930362186e-06, + "loss": 0.6042, + "step": 339 + }, + { + "epoch": 0.04, + "grad_norm": 2.27622322310789, + "learning_rate": 9.99789326137168e-06, + "loss": 0.6229, + "step": 340 + }, + { + "epoch": 0.04, + "grad_norm": 2.198740791702434, + "learning_rate": 9.997838900073864e-06, + "loss": 0.5225, + "step": 341 + }, + { + "epoch": 0.04, + "grad_norm": 2.648172777903177, + "learning_rate": 9.997783846476268e-06, + "loss": 0.5987, + "step": 342 + }, + { + "epoch": 0.04, + "grad_norm": 1.1818729169978681, + "learning_rate": 9.997728100586522e-06, + "loss": 0.8191, + "step": 343 + }, + { + "epoch": 0.04, + "grad_norm": 2.093180448968955, + "learning_rate": 9.997671662412343e-06, + "loss": 0.5669, + "step": 344 + }, + { + "epoch": 0.04, + "grad_norm": 2.943815523422976, + "learning_rate": 9.997614531961552e-06, + "loss": 0.5554, + "step": 345 + }, + { + "epoch": 0.04, + "grad_norm": 1.8552072142104046, + "learning_rate": 9.99755670924206e-06, + "loss": 0.5835, + "step": 346 + }, + { + "epoch": 0.04, + "grad_norm": 3.5023891592952405, + "learning_rate": 9.99749819426188e-06, + "loss": 0.4249, + "step": 347 + }, + { + "epoch": 0.04, + "grad_norm": 2.270491889889242, + "learning_rate": 9.997438987029115e-06, + "loss": 0.5382, + "step": 348 + }, + { + "epoch": 0.04, + "grad_norm": 2.182820993235715, + "learning_rate": 9.997379087551968e-06, + "loss": 0.6573, + "step": 349 + }, + { + "epoch": 0.04, + "grad_norm": 2.184657188264395, + "learning_rate": 9.997318495838734e-06, + "loss": 0.5796, + "step": 350 + }, + { + "epoch": 0.04, + "grad_norm": 1.868828141168527, + "learning_rate": 9.997257211897808e-06, + "loss": 0.5867, + "step": 351 + }, + { + "epoch": 0.04, + "grad_norm": 2.384406691103516, + "learning_rate": 9.99719523573768e-06, + "loss": 0.5157, + "step": 352 + }, + { + "epoch": 0.04, + "grad_norm": 3.709636548182967, + "learning_rate": 9.997132567366931e-06, + "loss": 0.5179, + "step": 353 + }, + { + "epoch": 0.04, + "grad_norm": 2.331907539317475, + "learning_rate": 9.997069206794246e-06, + "loss": 0.5786, + "step": 354 + }, + { + "epoch": 0.04, + "grad_norm": 2.261363318381477, + "learning_rate": 9.9970051540284e-06, + "loss": 0.5264, + "step": 355 + }, + { + "epoch": 0.04, + "grad_norm": 2.226390362268574, + "learning_rate": 9.996940409078265e-06, + "loss": 0.6287, + "step": 356 + }, + { + "epoch": 0.04, + "grad_norm": 4.062614445723752, + "learning_rate": 9.99687497195281e-06, + "loss": 0.5419, + "step": 357 + }, + { + "epoch": 0.04, + "grad_norm": 2.5967069904878985, + "learning_rate": 9.9968088426611e-06, + "loss": 0.539, + "step": 358 + }, + { + "epoch": 0.04, + "grad_norm": 2.6998291583003247, + "learning_rate": 9.996742021212294e-06, + "loss": 0.6529, + "step": 359 + }, + { + "epoch": 0.04, + "grad_norm": 1.9269182375971687, + "learning_rate": 9.996674507615648e-06, + "loss": 0.6596, + "step": 360 + }, + { + "epoch": 0.04, + "grad_norm": 2.285071141667131, + "learning_rate": 9.996606301880516e-06, + "loss": 0.5829, + "step": 361 + }, + { + "epoch": 0.04, + "grad_norm": 5.293834911033014, + "learning_rate": 9.996537404016345e-06, + "loss": 0.4819, + "step": 362 + }, + { + "epoch": 0.04, + "grad_norm": 2.0478396922772317, + "learning_rate": 9.996467814032675e-06, + "loss": 0.4966, + "step": 363 + }, + { + "epoch": 0.04, + "grad_norm": 2.3024877272670117, + "learning_rate": 9.996397531939152e-06, + "loss": 0.6098, + "step": 364 + }, + { + "epoch": 0.04, + "grad_norm": 2.763621642572886, + "learning_rate": 9.996326557745508e-06, + "loss": 0.5945, + "step": 365 + }, + { + "epoch": 0.04, + "grad_norm": 1.5785769695565834, + "learning_rate": 9.996254891461574e-06, + "loss": 0.5297, + "step": 366 + }, + { + "epoch": 0.04, + "grad_norm": 2.8393543352293973, + "learning_rate": 9.996182533097277e-06, + "loss": 0.6217, + "step": 367 + }, + { + "epoch": 0.04, + "grad_norm": 2.0539379083716467, + "learning_rate": 9.996109482662642e-06, + "loss": 0.6506, + "step": 368 + }, + { + "epoch": 0.04, + "grad_norm": 1.893797567852461, + "learning_rate": 9.996035740167787e-06, + "loss": 0.585, + "step": 369 + }, + { + "epoch": 0.04, + "grad_norm": 3.001101655861041, + "learning_rate": 9.995961305622925e-06, + "loss": 0.566, + "step": 370 + }, + { + "epoch": 0.04, + "grad_norm": 1.8874219676833905, + "learning_rate": 9.995886179038369e-06, + "loss": 0.4936, + "step": 371 + }, + { + "epoch": 0.04, + "grad_norm": 2.027274851009981, + "learning_rate": 9.995810360424526e-06, + "loss": 0.58, + "step": 372 + }, + { + "epoch": 0.04, + "grad_norm": 8.566604108630607, + "learning_rate": 9.995733849791895e-06, + "loss": 0.5217, + "step": 373 + }, + { + "epoch": 0.04, + "grad_norm": 3.0514399791053917, + "learning_rate": 9.995656647151077e-06, + "loss": 0.5771, + "step": 374 + }, + { + "epoch": 0.04, + "grad_norm": 2.3472983920797503, + "learning_rate": 9.995578752512767e-06, + "loss": 0.5467, + "step": 375 + }, + { + "epoch": 0.04, + "grad_norm": 1.5229914916968206, + "learning_rate": 9.995500165887753e-06, + "loss": 0.5958, + "step": 376 + }, + { + "epoch": 0.04, + "grad_norm": 2.3124315722023434, + "learning_rate": 9.995420887286922e-06, + "loss": 0.5041, + "step": 377 + }, + { + "epoch": 0.04, + "grad_norm": 1.7179919951485985, + "learning_rate": 9.995340916721252e-06, + "loss": 0.5433, + "step": 378 + }, + { + "epoch": 0.04, + "grad_norm": 2.059021561522691, + "learning_rate": 9.995260254201826e-06, + "loss": 0.5805, + "step": 379 + }, + { + "epoch": 0.04, + "grad_norm": 1.7593338327926205, + "learning_rate": 9.995178899739813e-06, + "loss": 0.5509, + "step": 380 + }, + { + "epoch": 0.04, + "grad_norm": 2.2984305438292174, + "learning_rate": 9.995096853346486e-06, + "loss": 0.5658, + "step": 381 + }, + { + "epoch": 0.04, + "grad_norm": 1.7423223104243695, + "learning_rate": 9.995014115033207e-06, + "loss": 0.7361, + "step": 382 + }, + { + "epoch": 0.04, + "grad_norm": 2.0950176812170533, + "learning_rate": 9.994930684811439e-06, + "loss": 0.5415, + "step": 383 + }, + { + "epoch": 0.04, + "grad_norm": 2.919678110671075, + "learning_rate": 9.994846562692735e-06, + "loss": 0.5431, + "step": 384 + }, + { + "epoch": 0.04, + "grad_norm": 1.8681446147061849, + "learning_rate": 9.994761748688752e-06, + "loss": 0.5691, + "step": 385 + }, + { + "epoch": 0.04, + "grad_norm": 2.062704405777913, + "learning_rate": 9.994676242811236e-06, + "loss": 0.6474, + "step": 386 + }, + { + "epoch": 0.04, + "grad_norm": 1.8885353129770497, + "learning_rate": 9.994590045072034e-06, + "loss": 0.6155, + "step": 387 + }, + { + "epoch": 0.04, + "grad_norm": 2.1689055124084313, + "learning_rate": 9.994503155483081e-06, + "loss": 0.4243, + "step": 388 + }, + { + "epoch": 0.04, + "grad_norm": 4.733543389171772, + "learning_rate": 9.99441557405642e-06, + "loss": 0.5591, + "step": 389 + }, + { + "epoch": 0.04, + "grad_norm": 2.107080824162522, + "learning_rate": 9.994327300804177e-06, + "loss": 0.6166, + "step": 390 + }, + { + "epoch": 0.04, + "grad_norm": 15.726449859330081, + "learning_rate": 9.99423833573858e-06, + "loss": 0.6016, + "step": 391 + }, + { + "epoch": 0.05, + "grad_norm": 6.728308180095228, + "learning_rate": 9.994148678871953e-06, + "loss": 0.523, + "step": 392 + }, + { + "epoch": 0.05, + "grad_norm": 1.8387161864811226, + "learning_rate": 9.994058330216718e-06, + "loss": 0.5845, + "step": 393 + }, + { + "epoch": 0.05, + "grad_norm": 7.843815542623466, + "learning_rate": 9.993967289785388e-06, + "loss": 0.6234, + "step": 394 + }, + { + "epoch": 0.05, + "grad_norm": 1.0220458623776523, + "learning_rate": 9.993875557590574e-06, + "loss": 0.7276, + "step": 395 + }, + { + "epoch": 0.05, + "grad_norm": 2.6849354741323905, + "learning_rate": 9.993783133644981e-06, + "loss": 0.6013, + "step": 396 + }, + { + "epoch": 0.05, + "grad_norm": 2.3627782083614925, + "learning_rate": 9.993690017961415e-06, + "loss": 0.6327, + "step": 397 + }, + { + "epoch": 0.05, + "grad_norm": 2.3801988820231963, + "learning_rate": 9.993596210552773e-06, + "loss": 0.5637, + "step": 398 + }, + { + "epoch": 0.05, + "grad_norm": 2.838917952864235, + "learning_rate": 9.993501711432047e-06, + "loss": 0.5547, + "step": 399 + }, + { + "epoch": 0.05, + "grad_norm": 2.339429495063228, + "learning_rate": 9.993406520612331e-06, + "loss": 0.536, + "step": 400 + }, + { + "epoch": 0.05, + "grad_norm": 4.606356132851422, + "learning_rate": 9.993310638106808e-06, + "loss": 0.5137, + "step": 401 + }, + { + "epoch": 0.05, + "grad_norm": 1.9492895763624338, + "learning_rate": 9.99321406392876e-06, + "loss": 0.4944, + "step": 402 + }, + { + "epoch": 0.05, + "grad_norm": 2.8675912259371232, + "learning_rate": 9.993116798091565e-06, + "loss": 0.5491, + "step": 403 + }, + { + "epoch": 0.05, + "grad_norm": 3.5594731195330893, + "learning_rate": 9.993018840608695e-06, + "loss": 0.5727, + "step": 404 + }, + { + "epoch": 0.05, + "grad_norm": 2.0021074192983543, + "learning_rate": 9.99292019149372e-06, + "loss": 0.4979, + "step": 405 + }, + { + "epoch": 0.05, + "grad_norm": 2.01303725913655, + "learning_rate": 9.992820850760306e-06, + "loss": 0.6247, + "step": 406 + }, + { + "epoch": 0.05, + "grad_norm": 3.8334361587407724, + "learning_rate": 9.992720818422212e-06, + "loss": 0.6064, + "step": 407 + }, + { + "epoch": 0.05, + "grad_norm": 2.188197789732399, + "learning_rate": 9.992620094493294e-06, + "loss": 0.5418, + "step": 408 + }, + { + "epoch": 0.05, + "grad_norm": 2.0003261952541838, + "learning_rate": 9.992518678987506e-06, + "loss": 0.6508, + "step": 409 + }, + { + "epoch": 0.05, + "grad_norm": 6.960764124327958, + "learning_rate": 9.992416571918896e-06, + "loss": 0.5531, + "step": 410 + }, + { + "epoch": 0.05, + "grad_norm": 2.5373917503926418, + "learning_rate": 9.992313773301607e-06, + "loss": 0.5359, + "step": 411 + }, + { + "epoch": 0.05, + "grad_norm": 2.7579365451438598, + "learning_rate": 9.992210283149878e-06, + "loss": 0.5731, + "step": 412 + }, + { + "epoch": 0.05, + "grad_norm": 2.4042748754043357, + "learning_rate": 9.992106101478046e-06, + "loss": 0.636, + "step": 413 + }, + { + "epoch": 0.05, + "grad_norm": 1.7986792837706107, + "learning_rate": 9.992001228300541e-06, + "loss": 0.602, + "step": 414 + }, + { + "epoch": 0.05, + "grad_norm": 1.8286483782543528, + "learning_rate": 9.991895663631891e-06, + "loss": 0.5572, + "step": 415 + }, + { + "epoch": 0.05, + "grad_norm": 2.2546056882238448, + "learning_rate": 9.991789407486719e-06, + "loss": 0.5046, + "step": 416 + }, + { + "epoch": 0.05, + "grad_norm": 2.0282524147248493, + "learning_rate": 9.99168245987974e-06, + "loss": 0.5858, + "step": 417 + }, + { + "epoch": 0.05, + "grad_norm": 1.7585294508497795, + "learning_rate": 9.991574820825773e-06, + "loss": 0.6255, + "step": 418 + }, + { + "epoch": 0.05, + "grad_norm": 2.6141906569557247, + "learning_rate": 9.991466490339727e-06, + "loss": 0.6995, + "step": 419 + }, + { + "epoch": 0.05, + "grad_norm": 1.1058030398570418, + "learning_rate": 9.991357468436607e-06, + "loss": 0.8318, + "step": 420 + }, + { + "epoch": 0.05, + "grad_norm": 1.9495268892063249, + "learning_rate": 9.991247755131514e-06, + "loss": 0.6128, + "step": 421 + }, + { + "epoch": 0.05, + "grad_norm": 0.9432917880837769, + "learning_rate": 9.991137350439647e-06, + "loss": 0.7418, + "step": 422 + }, + { + "epoch": 0.05, + "grad_norm": 1.7765649438340525, + "learning_rate": 9.991026254376302e-06, + "loss": 0.6068, + "step": 423 + }, + { + "epoch": 0.05, + "grad_norm": 1.8415893778881127, + "learning_rate": 9.990914466956861e-06, + "loss": 0.5946, + "step": 424 + }, + { + "epoch": 0.05, + "grad_norm": 1.9023198943976065, + "learning_rate": 9.990801988196812e-06, + "loss": 0.5789, + "step": 425 + }, + { + "epoch": 0.05, + "grad_norm": 2.873539080210441, + "learning_rate": 9.990688818111739e-06, + "loss": 0.581, + "step": 426 + }, + { + "epoch": 0.05, + "grad_norm": 1.9376791768743804, + "learning_rate": 9.990574956717313e-06, + "loss": 0.5454, + "step": 427 + }, + { + "epoch": 0.05, + "grad_norm": 3.103342281771859, + "learning_rate": 9.99046040402931e-06, + "loss": 0.716, + "step": 428 + }, + { + "epoch": 0.05, + "grad_norm": 1.2618298551056268, + "learning_rate": 9.990345160063594e-06, + "loss": 0.8317, + "step": 429 + }, + { + "epoch": 0.05, + "grad_norm": 1.8074318950085166, + "learning_rate": 9.990229224836131e-06, + "loss": 0.5353, + "step": 430 + }, + { + "epoch": 0.05, + "grad_norm": 2.7202651792713155, + "learning_rate": 9.990112598362982e-06, + "loss": 0.6242, + "step": 431 + }, + { + "epoch": 0.05, + "grad_norm": 2.21084013093517, + "learning_rate": 9.989995280660298e-06, + "loss": 0.5555, + "step": 432 + }, + { + "epoch": 0.05, + "grad_norm": 2.5055722170558843, + "learning_rate": 9.989877271744335e-06, + "loss": 0.5689, + "step": 433 + }, + { + "epoch": 0.05, + "grad_norm": 2.655672632114759, + "learning_rate": 9.989758571631434e-06, + "loss": 0.5412, + "step": 434 + }, + { + "epoch": 0.05, + "grad_norm": 2.1796896497123655, + "learning_rate": 9.989639180338041e-06, + "loss": 0.6623, + "step": 435 + }, + { + "epoch": 0.05, + "grad_norm": 2.035869877449723, + "learning_rate": 9.989519097880693e-06, + "loss": 0.5494, + "step": 436 + }, + { + "epoch": 0.05, + "grad_norm": 1.821996513942337, + "learning_rate": 9.989398324276022e-06, + "loss": 0.5444, + "step": 437 + }, + { + "epoch": 0.05, + "grad_norm": 2.0337328320932437, + "learning_rate": 9.989276859540761e-06, + "loss": 0.522, + "step": 438 + }, + { + "epoch": 0.05, + "grad_norm": 1.8629849412543438, + "learning_rate": 9.989154703691735e-06, + "loss": 0.5428, + "step": 439 + }, + { + "epoch": 0.05, + "grad_norm": 2.950781614154026, + "learning_rate": 9.98903185674586e-06, + "loss": 0.5241, + "step": 440 + }, + { + "epoch": 0.05, + "grad_norm": 0.9777109355800144, + "learning_rate": 9.98890831872016e-06, + "loss": 0.7578, + "step": 441 + }, + { + "epoch": 0.05, + "grad_norm": 1.9589448959919118, + "learning_rate": 9.988784089631742e-06, + "loss": 0.4597, + "step": 442 + }, + { + "epoch": 0.05, + "grad_norm": 2.6582181087938404, + "learning_rate": 9.988659169497816e-06, + "loss": 0.503, + "step": 443 + }, + { + "epoch": 0.05, + "grad_norm": 2.4628060652114927, + "learning_rate": 9.988533558335687e-06, + "loss": 0.4823, + "step": 444 + }, + { + "epoch": 0.05, + "grad_norm": 1.8714000503936454, + "learning_rate": 9.988407256162751e-06, + "loss": 0.5479, + "step": 445 + }, + { + "epoch": 0.05, + "grad_norm": 4.038680229046823, + "learning_rate": 9.988280262996507e-06, + "loss": 0.6211, + "step": 446 + }, + { + "epoch": 0.05, + "grad_norm": 1.9990596903410613, + "learning_rate": 9.988152578854546e-06, + "loss": 0.5205, + "step": 447 + }, + { + "epoch": 0.05, + "grad_norm": 2.2408487160859374, + "learning_rate": 9.988024203754554e-06, + "loss": 0.5349, + "step": 448 + }, + { + "epoch": 0.05, + "grad_norm": 2.345167209951377, + "learning_rate": 9.987895137714312e-06, + "loss": 0.5654, + "step": 449 + }, + { + "epoch": 0.05, + "grad_norm": 1.5886920839224363, + "learning_rate": 9.9877653807517e-06, + "loss": 0.4432, + "step": 450 + }, + { + "epoch": 0.05, + "grad_norm": 1.5227163023034136, + "learning_rate": 9.98763493288469e-06, + "loss": 0.5042, + "step": 451 + }, + { + "epoch": 0.05, + "grad_norm": 1.8525528818109034, + "learning_rate": 9.987503794131358e-06, + "loss": 0.4666, + "step": 452 + }, + { + "epoch": 0.05, + "grad_norm": 1.6459779759608986, + "learning_rate": 9.987371964509859e-06, + "loss": 0.674, + "step": 453 + }, + { + "epoch": 0.05, + "grad_norm": 1.927029726884124, + "learning_rate": 9.98723944403846e-06, + "loss": 0.6311, + "step": 454 + }, + { + "epoch": 0.05, + "grad_norm": 3.5910725050736625, + "learning_rate": 9.987106232735519e-06, + "loss": 0.582, + "step": 455 + }, + { + "epoch": 0.05, + "grad_norm": 2.2551403430256682, + "learning_rate": 9.986972330619485e-06, + "loss": 0.5943, + "step": 456 + }, + { + "epoch": 0.05, + "grad_norm": 1.8514670538312779, + "learning_rate": 9.986837737708907e-06, + "loss": 0.5484, + "step": 457 + }, + { + "epoch": 0.05, + "grad_norm": 2.0741685658042917, + "learning_rate": 9.98670245402243e-06, + "loss": 0.537, + "step": 458 + }, + { + "epoch": 0.05, + "grad_norm": 2.7589610541469263, + "learning_rate": 9.986566479578795e-06, + "loss": 0.4586, + "step": 459 + }, + { + "epoch": 0.05, + "grad_norm": 2.328485838345172, + "learning_rate": 9.986429814396831e-06, + "loss": 0.5975, + "step": 460 + }, + { + "epoch": 0.05, + "grad_norm": 2.339460657509247, + "learning_rate": 9.986292458495474e-06, + "loss": 0.6011, + "step": 461 + }, + { + "epoch": 0.05, + "grad_norm": 4.383369483186488, + "learning_rate": 9.986154411893752e-06, + "loss": 0.4756, + "step": 462 + }, + { + "epoch": 0.05, + "grad_norm": 1.923182102750748, + "learning_rate": 9.986015674610782e-06, + "loss": 0.6031, + "step": 463 + }, + { + "epoch": 0.05, + "grad_norm": 2.7906864713091486, + "learning_rate": 9.985876246665784e-06, + "loss": 0.5685, + "step": 464 + }, + { + "epoch": 0.05, + "grad_norm": 2.0371027699558106, + "learning_rate": 9.985736128078073e-06, + "loss": 0.5506, + "step": 465 + }, + { + "epoch": 0.05, + "grad_norm": 2.0234052447574102, + "learning_rate": 9.985595318867057e-06, + "loss": 0.5514, + "step": 466 + }, + { + "epoch": 0.05, + "grad_norm": 2.8008704431824296, + "learning_rate": 9.985453819052241e-06, + "loss": 0.5111, + "step": 467 + }, + { + "epoch": 0.05, + "grad_norm": 2.2749688087157405, + "learning_rate": 9.985311628653224e-06, + "loss": 0.6589, + "step": 468 + }, + { + "epoch": 0.05, + "grad_norm": 2.248782408003784, + "learning_rate": 9.985168747689706e-06, + "loss": 0.4992, + "step": 469 + }, + { + "epoch": 0.05, + "grad_norm": 2.227040156295523, + "learning_rate": 9.985025176181476e-06, + "loss": 0.7168, + "step": 470 + }, + { + "epoch": 0.05, + "grad_norm": 1.6846112289588329, + "learning_rate": 9.984880914148421e-06, + "loss": 0.6324, + "step": 471 + }, + { + "epoch": 0.05, + "grad_norm": 2.7463735413812818, + "learning_rate": 9.984735961610525e-06, + "loss": 0.6665, + "step": 472 + }, + { + "epoch": 0.05, + "grad_norm": 1.6721300206774572, + "learning_rate": 9.984590318587869e-06, + "loss": 0.4869, + "step": 473 + }, + { + "epoch": 0.05, + "grad_norm": 1.707871299162142, + "learning_rate": 9.984443985100625e-06, + "loss": 0.5766, + "step": 474 + }, + { + "epoch": 0.05, + "grad_norm": 2.4823788121158326, + "learning_rate": 9.984296961169062e-06, + "loss": 0.5238, + "step": 475 + }, + { + "epoch": 0.05, + "grad_norm": 2.3401369073823535, + "learning_rate": 9.984149246813548e-06, + "loss": 0.5448, + "step": 476 + }, + { + "epoch": 0.05, + "grad_norm": 2.9538483696775497, + "learning_rate": 9.984000842054543e-06, + "loss": 0.4805, + "step": 477 + }, + { + "epoch": 0.05, + "grad_norm": 2.080547703494317, + "learning_rate": 9.983851746912605e-06, + "loss": 0.5866, + "step": 478 + }, + { + "epoch": 0.06, + "grad_norm": 2.7923393332855886, + "learning_rate": 9.983701961408386e-06, + "loss": 0.4749, + "step": 479 + }, + { + "epoch": 0.06, + "grad_norm": 2.1611675163842454, + "learning_rate": 9.983551485562635e-06, + "loss": 0.61, + "step": 480 + }, + { + "epoch": 0.06, + "grad_norm": 2.0889012912703855, + "learning_rate": 9.983400319396195e-06, + "loss": 0.5138, + "step": 481 + }, + { + "epoch": 0.06, + "grad_norm": 1.7966073065248644, + "learning_rate": 9.983248462930007e-06, + "loss": 0.4545, + "step": 482 + }, + { + "epoch": 0.06, + "grad_norm": 1.8321383959330082, + "learning_rate": 9.983095916185104e-06, + "loss": 0.6, + "step": 483 + }, + { + "epoch": 0.06, + "grad_norm": 1.9010991046299766, + "learning_rate": 9.982942679182617e-06, + "loss": 0.6006, + "step": 484 + }, + { + "epoch": 0.06, + "grad_norm": 1.6934192548515272, + "learning_rate": 9.982788751943774e-06, + "loss": 0.5647, + "step": 485 + }, + { + "epoch": 0.06, + "grad_norm": 2.1723359435729503, + "learning_rate": 9.982634134489897e-06, + "loss": 0.5833, + "step": 486 + }, + { + "epoch": 0.06, + "grad_norm": 3.443317030687206, + "learning_rate": 9.982478826842402e-06, + "loss": 0.6543, + "step": 487 + }, + { + "epoch": 0.06, + "grad_norm": 8.237864220129367, + "learning_rate": 9.982322829022804e-06, + "loss": 0.6084, + "step": 488 + }, + { + "epoch": 0.06, + "grad_norm": 2.678665386363977, + "learning_rate": 9.98216614105271e-06, + "loss": 0.5761, + "step": 489 + }, + { + "epoch": 0.06, + "grad_norm": 10.664362696197754, + "learning_rate": 9.982008762953823e-06, + "loss": 0.6619, + "step": 490 + }, + { + "epoch": 0.06, + "grad_norm": 3.64478336216436, + "learning_rate": 9.981850694747948e-06, + "loss": 0.567, + "step": 491 + }, + { + "epoch": 0.06, + "grad_norm": 1.8067859424901966, + "learning_rate": 9.981691936456975e-06, + "loss": 0.5719, + "step": 492 + }, + { + "epoch": 0.06, + "grad_norm": 2.3652311385538964, + "learning_rate": 9.9815324881029e-06, + "loss": 0.534, + "step": 493 + }, + { + "epoch": 0.06, + "grad_norm": 2.403731211526843, + "learning_rate": 9.981372349707806e-06, + "loss": 0.5663, + "step": 494 + }, + { + "epoch": 0.06, + "grad_norm": 1.9666329410778112, + "learning_rate": 9.981211521293878e-06, + "loss": 0.5089, + "step": 495 + }, + { + "epoch": 0.06, + "grad_norm": 1.7800792585259921, + "learning_rate": 9.981050002883392e-06, + "loss": 0.514, + "step": 496 + }, + { + "epoch": 0.06, + "grad_norm": 1.8486852927173552, + "learning_rate": 9.980887794498725e-06, + "loss": 0.577, + "step": 497 + }, + { + "epoch": 0.06, + "grad_norm": 2.056354988417175, + "learning_rate": 9.980724896162339e-06, + "loss": 0.4936, + "step": 498 + }, + { + "epoch": 0.06, + "grad_norm": 3.9894753824333082, + "learning_rate": 9.980561307896806e-06, + "loss": 0.5935, + "step": 499 + }, + { + "epoch": 0.06, + "grad_norm": 3.3374831211052096, + "learning_rate": 9.980397029724782e-06, + "loss": 0.5741, + "step": 500 + }, + { + "epoch": 0.06, + "grad_norm": 2.0030874862557044, + "learning_rate": 9.980232061669025e-06, + "loss": 0.5441, + "step": 501 + }, + { + "epoch": 0.06, + "grad_norm": 1.9073596474713068, + "learning_rate": 9.980066403752386e-06, + "loss": 0.5972, + "step": 502 + }, + { + "epoch": 0.06, + "grad_norm": 1.8774714806592239, + "learning_rate": 9.97990005599781e-06, + "loss": 0.5625, + "step": 503 + }, + { + "epoch": 0.06, + "grad_norm": 3.3870696207094775, + "learning_rate": 9.97973301842834e-06, + "loss": 0.5669, + "step": 504 + }, + { + "epoch": 0.06, + "grad_norm": 2.045454811429022, + "learning_rate": 9.979565291067117e-06, + "loss": 0.4997, + "step": 505 + }, + { + "epoch": 0.06, + "grad_norm": 2.846742309221474, + "learning_rate": 9.979396873937372e-06, + "loss": 0.5514, + "step": 506 + }, + { + "epoch": 0.06, + "grad_norm": 2.328248967421797, + "learning_rate": 9.979227767062434e-06, + "loss": 0.5725, + "step": 507 + }, + { + "epoch": 0.06, + "grad_norm": 2.492304612898177, + "learning_rate": 9.979057970465727e-06, + "loss": 0.4954, + "step": 508 + }, + { + "epoch": 0.06, + "grad_norm": 1.8449710513830246, + "learning_rate": 9.978887484170775e-06, + "loss": 0.5348, + "step": 509 + }, + { + "epoch": 0.06, + "grad_norm": 2.249942402600172, + "learning_rate": 9.978716308201188e-06, + "loss": 0.5527, + "step": 510 + }, + { + "epoch": 0.06, + "grad_norm": 2.435225984382256, + "learning_rate": 9.978544442580683e-06, + "loss": 0.5554, + "step": 511 + }, + { + "epoch": 0.06, + "grad_norm": 1.9967300292137642, + "learning_rate": 9.978371887333062e-06, + "loss": 0.6985, + "step": 512 + }, + { + "epoch": 0.06, + "grad_norm": 2.098075739938484, + "learning_rate": 9.97819864248223e-06, + "loss": 0.5478, + "step": 513 + }, + { + "epoch": 0.06, + "grad_norm": 3.0108043273276905, + "learning_rate": 9.978024708052185e-06, + "loss": 0.5786, + "step": 514 + }, + { + "epoch": 0.06, + "grad_norm": 1.8886420429682862, + "learning_rate": 9.97785008406702e-06, + "loss": 0.5368, + "step": 515 + }, + { + "epoch": 0.06, + "grad_norm": 2.4769256597522853, + "learning_rate": 9.977674770550922e-06, + "loss": 0.5564, + "step": 516 + }, + { + "epoch": 0.06, + "grad_norm": 2.0953448724306982, + "learning_rate": 9.977498767528177e-06, + "loss": 0.5797, + "step": 517 + }, + { + "epoch": 0.06, + "grad_norm": 1.9274272714522065, + "learning_rate": 9.977322075023165e-06, + "loss": 0.5284, + "step": 518 + }, + { + "epoch": 0.06, + "grad_norm": 3.290609229407683, + "learning_rate": 9.977144693060364e-06, + "loss": 0.6271, + "step": 519 + }, + { + "epoch": 0.06, + "grad_norm": 2.2994469922299414, + "learning_rate": 9.97696662166434e-06, + "loss": 0.657, + "step": 520 + }, + { + "epoch": 0.06, + "grad_norm": 3.4479291045284794, + "learning_rate": 9.97678786085976e-06, + "loss": 0.5339, + "step": 521 + }, + { + "epoch": 0.06, + "grad_norm": 1.7979868268627963, + "learning_rate": 9.97660841067139e-06, + "loss": 0.6454, + "step": 522 + }, + { + "epoch": 0.06, + "grad_norm": 2.0243030318189406, + "learning_rate": 9.976428271124084e-06, + "loss": 0.5819, + "step": 523 + }, + { + "epoch": 0.06, + "grad_norm": 1.9829068690224358, + "learning_rate": 9.976247442242796e-06, + "loss": 0.5287, + "step": 524 + }, + { + "epoch": 0.06, + "grad_norm": 5.131388712989478, + "learning_rate": 9.976065924052574e-06, + "loss": 0.5158, + "step": 525 + }, + { + "epoch": 0.06, + "grad_norm": 2.448431338062217, + "learning_rate": 9.975883716578563e-06, + "loss": 0.5648, + "step": 526 + }, + { + "epoch": 0.06, + "grad_norm": 2.528208843117592, + "learning_rate": 9.975700819846e-06, + "loss": 0.7358, + "step": 527 + }, + { + "epoch": 0.06, + "grad_norm": 2.0094011920873736, + "learning_rate": 9.975517233880223e-06, + "loss": 0.5864, + "step": 528 + }, + { + "epoch": 0.06, + "grad_norm": 1.7850968093471051, + "learning_rate": 9.975332958706659e-06, + "loss": 0.5882, + "step": 529 + }, + { + "epoch": 0.06, + "grad_norm": 1.9995053929934854, + "learning_rate": 9.975147994350836e-06, + "loss": 0.4404, + "step": 530 + }, + { + "epoch": 0.06, + "grad_norm": 2.263966489162853, + "learning_rate": 9.974962340838375e-06, + "loss": 0.4928, + "step": 531 + }, + { + "epoch": 0.06, + "grad_norm": 3.2152130393922254, + "learning_rate": 9.97477599819499e-06, + "loss": 0.4812, + "step": 532 + }, + { + "epoch": 0.06, + "grad_norm": 1.7523741435856102, + "learning_rate": 9.974588966446498e-06, + "loss": 0.6108, + "step": 533 + }, + { + "epoch": 0.06, + "grad_norm": 1.9739011635537678, + "learning_rate": 9.974401245618804e-06, + "loss": 0.6127, + "step": 534 + }, + { + "epoch": 0.06, + "grad_norm": 2.1225189214538274, + "learning_rate": 9.974212835737908e-06, + "loss": 0.5691, + "step": 535 + }, + { + "epoch": 0.06, + "grad_norm": 2.315986828717084, + "learning_rate": 9.974023736829915e-06, + "loss": 0.5326, + "step": 536 + }, + { + "epoch": 0.06, + "grad_norm": 2.1004171697715934, + "learning_rate": 9.973833948921014e-06, + "loss": 0.6036, + "step": 537 + }, + { + "epoch": 0.06, + "grad_norm": 2.0670416615712317, + "learning_rate": 9.973643472037495e-06, + "loss": 0.538, + "step": 538 + }, + { + "epoch": 0.06, + "grad_norm": 1.841554168594649, + "learning_rate": 9.973452306205745e-06, + "loss": 0.4935, + "step": 539 + }, + { + "epoch": 0.06, + "grad_norm": 2.0343671015390488, + "learning_rate": 9.973260451452242e-06, + "loss": 0.5615, + "step": 540 + }, + { + "epoch": 0.06, + "grad_norm": 2.2651768782888326, + "learning_rate": 9.973067907803564e-06, + "loss": 0.5148, + "step": 541 + }, + { + "epoch": 0.06, + "grad_norm": 2.016895763006731, + "learning_rate": 9.97287467528638e-06, + "loss": 0.5124, + "step": 542 + }, + { + "epoch": 0.06, + "grad_norm": 2.000680555027487, + "learning_rate": 9.972680753927457e-06, + "loss": 0.5512, + "step": 543 + }, + { + "epoch": 0.06, + "grad_norm": 2.0337247130499536, + "learning_rate": 9.972486143753658e-06, + "loss": 0.4864, + "step": 544 + }, + { + "epoch": 0.06, + "grad_norm": 1.901068470180426, + "learning_rate": 9.972290844791939e-06, + "loss": 0.4951, + "step": 545 + }, + { + "epoch": 0.06, + "grad_norm": 1.788125393228748, + "learning_rate": 9.972094857069355e-06, + "loss": 0.5923, + "step": 546 + }, + { + "epoch": 0.06, + "grad_norm": 1.7392139323251832, + "learning_rate": 9.97189818061305e-06, + "loss": 0.5834, + "step": 547 + }, + { + "epoch": 0.06, + "grad_norm": 1.995990366492805, + "learning_rate": 9.971700815450272e-06, + "loss": 0.5108, + "step": 548 + }, + { + "epoch": 0.06, + "grad_norm": 2.124328921818091, + "learning_rate": 9.971502761608356e-06, + "loss": 0.5594, + "step": 549 + }, + { + "epoch": 0.06, + "grad_norm": 1.9822774996278716, + "learning_rate": 9.97130401911474e-06, + "loss": 0.6016, + "step": 550 + }, + { + "epoch": 0.06, + "grad_norm": 2.168248153135091, + "learning_rate": 9.971104587996954e-06, + "loss": 0.5476, + "step": 551 + }, + { + "epoch": 0.06, + "grad_norm": 1.920272165963318, + "learning_rate": 9.97090446828262e-06, + "loss": 0.517, + "step": 552 + }, + { + "epoch": 0.06, + "grad_norm": 1.7401555794657373, + "learning_rate": 9.970703659999459e-06, + "loss": 0.5068, + "step": 553 + }, + { + "epoch": 0.06, + "grad_norm": 2.2512367458212705, + "learning_rate": 9.97050216317529e-06, + "loss": 0.5231, + "step": 554 + }, + { + "epoch": 0.06, + "grad_norm": 1.9294410448670494, + "learning_rate": 9.97029997783802e-06, + "loss": 0.5074, + "step": 555 + }, + { + "epoch": 0.06, + "grad_norm": 2.6861335932259194, + "learning_rate": 9.970097104015661e-06, + "loss": 0.5134, + "step": 556 + }, + { + "epoch": 0.06, + "grad_norm": 1.7293325596022315, + "learning_rate": 9.969893541736308e-06, + "loss": 0.5173, + "step": 557 + }, + { + "epoch": 0.06, + "grad_norm": 2.2897855167340326, + "learning_rate": 9.969689291028166e-06, + "loss": 0.4951, + "step": 558 + }, + { + "epoch": 0.06, + "grad_norm": 1.6993206170666062, + "learning_rate": 9.969484351919523e-06, + "loss": 0.5454, + "step": 559 + }, + { + "epoch": 0.06, + "grad_norm": 0.9847058035952536, + "learning_rate": 9.969278724438768e-06, + "loss": 0.7885, + "step": 560 + }, + { + "epoch": 0.06, + "grad_norm": 2.077648530226415, + "learning_rate": 9.969072408614385e-06, + "loss": 0.6087, + "step": 561 + }, + { + "epoch": 0.06, + "grad_norm": 2.119218554323957, + "learning_rate": 9.968865404474952e-06, + "loss": 0.5378, + "step": 562 + }, + { + "epoch": 0.06, + "grad_norm": 1.9691991776761808, + "learning_rate": 9.968657712049144e-06, + "loss": 0.5568, + "step": 563 + }, + { + "epoch": 0.06, + "grad_norm": 2.445572947457805, + "learning_rate": 9.968449331365732e-06, + "loss": 0.5358, + "step": 564 + }, + { + "epoch": 0.06, + "grad_norm": 2.0075204873900834, + "learning_rate": 9.968240262453577e-06, + "loss": 0.6798, + "step": 565 + }, + { + "epoch": 0.07, + "grad_norm": 1.5988639406998886, + "learning_rate": 9.968030505341642e-06, + "loss": 0.5685, + "step": 566 + }, + { + "epoch": 0.07, + "grad_norm": 1.7955216967813392, + "learning_rate": 9.967820060058982e-06, + "loss": 0.5385, + "step": 567 + }, + { + "epoch": 0.07, + "grad_norm": 1.7126955413396432, + "learning_rate": 9.967608926634748e-06, + "loss": 0.5401, + "step": 568 + }, + { + "epoch": 0.07, + "grad_norm": 1.7935880187862583, + "learning_rate": 9.967397105098187e-06, + "loss": 0.4806, + "step": 569 + }, + { + "epoch": 0.07, + "grad_norm": 1.901029781106834, + "learning_rate": 9.967184595478637e-06, + "loss": 0.53, + "step": 570 + }, + { + "epoch": 0.07, + "grad_norm": 2.4289977395109075, + "learning_rate": 9.966971397805538e-06, + "loss": 0.5637, + "step": 571 + }, + { + "epoch": 0.07, + "grad_norm": 2.1260725695208773, + "learning_rate": 9.966757512108422e-06, + "loss": 0.6531, + "step": 572 + }, + { + "epoch": 0.07, + "grad_norm": 3.095846757003945, + "learning_rate": 9.966542938416916e-06, + "loss": 0.5741, + "step": 573 + }, + { + "epoch": 0.07, + "grad_norm": 1.7959300946548455, + "learning_rate": 9.966327676760741e-06, + "loss": 0.5972, + "step": 574 + }, + { + "epoch": 0.07, + "grad_norm": 2.949517420718697, + "learning_rate": 9.966111727169717e-06, + "loss": 0.6073, + "step": 575 + }, + { + "epoch": 0.07, + "grad_norm": 1.7890006154838127, + "learning_rate": 9.965895089673757e-06, + "loss": 0.616, + "step": 576 + }, + { + "epoch": 0.07, + "grad_norm": 1.703044495347558, + "learning_rate": 9.965677764302869e-06, + "loss": 0.4951, + "step": 577 + }, + { + "epoch": 0.07, + "grad_norm": 1.0557594779185466, + "learning_rate": 9.965459751087156e-06, + "loss": 0.7773, + "step": 578 + }, + { + "epoch": 0.07, + "grad_norm": 4.511284441150251, + "learning_rate": 9.965241050056821e-06, + "loss": 0.5461, + "step": 579 + }, + { + "epoch": 0.07, + "grad_norm": 1.987912906953852, + "learning_rate": 9.965021661242153e-06, + "loss": 0.6269, + "step": 580 + }, + { + "epoch": 0.07, + "grad_norm": 4.163326114097087, + "learning_rate": 9.964801584673548e-06, + "loss": 0.6195, + "step": 581 + }, + { + "epoch": 0.07, + "grad_norm": 1.643189298274756, + "learning_rate": 9.964580820381484e-06, + "loss": 0.5452, + "step": 582 + }, + { + "epoch": 0.07, + "grad_norm": 1.8040576097433332, + "learning_rate": 9.964359368396545e-06, + "loss": 0.4765, + "step": 583 + }, + { + "epoch": 0.07, + "grad_norm": 2.6349169764913407, + "learning_rate": 9.964137228749409e-06, + "loss": 0.5971, + "step": 584 + }, + { + "epoch": 0.07, + "grad_norm": 1.789740294636757, + "learning_rate": 9.963914401470842e-06, + "loss": 0.548, + "step": 585 + }, + { + "epoch": 0.07, + "grad_norm": 2.668166353592765, + "learning_rate": 9.96369088659171e-06, + "loss": 0.4713, + "step": 586 + }, + { + "epoch": 0.07, + "grad_norm": 2.2800896199502083, + "learning_rate": 9.96346668414298e-06, + "loss": 0.5463, + "step": 587 + }, + { + "epoch": 0.07, + "grad_norm": 1.9164841038556226, + "learning_rate": 9.963241794155701e-06, + "loss": 0.5258, + "step": 588 + }, + { + "epoch": 0.07, + "grad_norm": 2.2434150448698325, + "learning_rate": 9.96301621666103e-06, + "loss": 0.5389, + "step": 589 + }, + { + "epoch": 0.07, + "grad_norm": 1.89420857530226, + "learning_rate": 9.962789951690213e-06, + "loss": 0.4505, + "step": 590 + }, + { + "epoch": 0.07, + "grad_norm": 1.200242949607029, + "learning_rate": 9.96256299927459e-06, + "loss": 0.8496, + "step": 591 + }, + { + "epoch": 0.07, + "grad_norm": 1.813874014307887, + "learning_rate": 9.9623353594456e-06, + "loss": 0.4867, + "step": 592 + }, + { + "epoch": 0.07, + "grad_norm": 3.8131360669421115, + "learning_rate": 9.962107032234775e-06, + "loss": 0.4694, + "step": 593 + }, + { + "epoch": 0.07, + "grad_norm": 1.851464975827319, + "learning_rate": 9.961878017673746e-06, + "loss": 0.6189, + "step": 594 + }, + { + "epoch": 0.07, + "grad_norm": 2.01308798272705, + "learning_rate": 9.961648315794231e-06, + "loss": 0.5765, + "step": 595 + }, + { + "epoch": 0.07, + "grad_norm": 2.0047703251731694, + "learning_rate": 9.961417926628051e-06, + "loss": 0.5202, + "step": 596 + }, + { + "epoch": 0.07, + "grad_norm": 1.8702049380112458, + "learning_rate": 9.96118685020712e-06, + "loss": 0.5994, + "step": 597 + }, + { + "epoch": 0.07, + "grad_norm": 2.0908730176025663, + "learning_rate": 9.960955086563447e-06, + "loss": 0.4698, + "step": 598 + }, + { + "epoch": 0.07, + "grad_norm": 1.9250156354503047, + "learning_rate": 9.960722635729131e-06, + "loss": 0.4996, + "step": 599 + }, + { + "epoch": 0.07, + "grad_norm": 1.9327650319495364, + "learning_rate": 9.96048949773638e-06, + "loss": 0.5255, + "step": 600 + }, + { + "epoch": 0.07, + "grad_norm": 2.413015836197884, + "learning_rate": 9.960255672617478e-06, + "loss": 0.5819, + "step": 601 + }, + { + "epoch": 0.07, + "grad_norm": 0.958274460474823, + "learning_rate": 9.96002116040482e-06, + "loss": 0.7668, + "step": 602 + }, + { + "epoch": 0.07, + "grad_norm": 2.5927236709717243, + "learning_rate": 9.959785961130892e-06, + "loss": 0.4978, + "step": 603 + }, + { + "epoch": 0.07, + "grad_norm": 0.9099822747105775, + "learning_rate": 9.95955007482827e-06, + "loss": 0.7573, + "step": 604 + }, + { + "epoch": 0.07, + "grad_norm": 2.01824485284765, + "learning_rate": 9.959313501529633e-06, + "loss": 0.5239, + "step": 605 + }, + { + "epoch": 0.07, + "grad_norm": 2.1559745511651243, + "learning_rate": 9.959076241267747e-06, + "loss": 0.5919, + "step": 606 + }, + { + "epoch": 0.07, + "grad_norm": 2.2206890952663016, + "learning_rate": 9.95883829407548e-06, + "loss": 0.5849, + "step": 607 + }, + { + "epoch": 0.07, + "grad_norm": 2.07008264681777, + "learning_rate": 9.95859965998579e-06, + "loss": 0.514, + "step": 608 + }, + { + "epoch": 0.07, + "grad_norm": 2.1568321442281873, + "learning_rate": 9.958360339031734e-06, + "loss": 0.5549, + "step": 609 + }, + { + "epoch": 0.07, + "grad_norm": 1.7154895186161723, + "learning_rate": 9.958120331246464e-06, + "loss": 0.6352, + "step": 610 + }, + { + "epoch": 0.07, + "grad_norm": 2.0154440267987495, + "learning_rate": 9.957879636663224e-06, + "loss": 0.498, + "step": 611 + }, + { + "epoch": 0.07, + "grad_norm": 2.3563732777713553, + "learning_rate": 9.957638255315354e-06, + "loss": 0.5744, + "step": 612 + }, + { + "epoch": 0.07, + "grad_norm": 2.464455407316683, + "learning_rate": 9.957396187236292e-06, + "loss": 0.532, + "step": 613 + }, + { + "epoch": 0.07, + "grad_norm": 1.7318349809915812, + "learning_rate": 9.95715343245957e-06, + "loss": 0.6437, + "step": 614 + }, + { + "epoch": 0.07, + "grad_norm": 2.444444998095691, + "learning_rate": 9.956909991018813e-06, + "loss": 0.4807, + "step": 615 + }, + { + "epoch": 0.07, + "grad_norm": 2.7174291491536446, + "learning_rate": 9.956665862947743e-06, + "loss": 0.5887, + "step": 616 + }, + { + "epoch": 0.07, + "grad_norm": 2.3789116517547813, + "learning_rate": 9.956421048280174e-06, + "loss": 0.45, + "step": 617 + }, + { + "epoch": 0.07, + "grad_norm": 2.484620723750597, + "learning_rate": 9.956175547050022e-06, + "loss": 0.5533, + "step": 618 + }, + { + "epoch": 0.07, + "grad_norm": 2.122355559497418, + "learning_rate": 9.955929359291291e-06, + "loss": 0.6512, + "step": 619 + }, + { + "epoch": 0.07, + "grad_norm": 1.8120982758466313, + "learning_rate": 9.955682485038084e-06, + "loss": 0.5641, + "step": 620 + }, + { + "epoch": 0.07, + "grad_norm": 2.6795786049320425, + "learning_rate": 9.955434924324596e-06, + "loss": 0.5031, + "step": 621 + }, + { + "epoch": 0.07, + "grad_norm": 2.0326293280023964, + "learning_rate": 9.955186677185122e-06, + "loss": 0.672, + "step": 622 + }, + { + "epoch": 0.07, + "grad_norm": 2.3474402745204173, + "learning_rate": 9.954937743654048e-06, + "loss": 0.5264, + "step": 623 + }, + { + "epoch": 0.07, + "grad_norm": 2.3474965006576283, + "learning_rate": 9.954688123765856e-06, + "loss": 0.526, + "step": 624 + }, + { + "epoch": 0.07, + "grad_norm": 1.2449115807143245, + "learning_rate": 9.954437817555122e-06, + "loss": 0.8155, + "step": 625 + }, + { + "epoch": 0.07, + "grad_norm": 2.605339464482112, + "learning_rate": 9.95418682505652e-06, + "loss": 0.4435, + "step": 626 + }, + { + "epoch": 0.07, + "grad_norm": 2.029216592351817, + "learning_rate": 9.953935146304817e-06, + "loss": 0.5129, + "step": 627 + }, + { + "epoch": 0.07, + "grad_norm": 0.989176654518757, + "learning_rate": 9.953682781334876e-06, + "loss": 0.8205, + "step": 628 + }, + { + "epoch": 0.07, + "grad_norm": 1.7390455433770013, + "learning_rate": 9.953429730181653e-06, + "loss": 0.4952, + "step": 629 + }, + { + "epoch": 0.07, + "grad_norm": 2.0630352549450754, + "learning_rate": 9.953175992880204e-06, + "loss": 0.5644, + "step": 630 + }, + { + "epoch": 0.07, + "grad_norm": 1.8098763223905001, + "learning_rate": 9.952921569465675e-06, + "loss": 0.5738, + "step": 631 + }, + { + "epoch": 0.07, + "grad_norm": 2.134643882066886, + "learning_rate": 9.952666459973304e-06, + "loss": 0.4949, + "step": 632 + }, + { + "epoch": 0.07, + "grad_norm": 2.2757808428129422, + "learning_rate": 9.952410664438436e-06, + "loss": 0.6466, + "step": 633 + }, + { + "epoch": 0.07, + "grad_norm": 1.8313078306214612, + "learning_rate": 9.952154182896499e-06, + "loss": 0.5182, + "step": 634 + }, + { + "epoch": 0.07, + "grad_norm": 2.1764358137619535, + "learning_rate": 9.951897015383023e-06, + "loss": 0.5172, + "step": 635 + }, + { + "epoch": 0.07, + "grad_norm": 2.325567238496445, + "learning_rate": 9.951639161933631e-06, + "loss": 0.4904, + "step": 636 + }, + { + "epoch": 0.07, + "grad_norm": 1.7906520515697446, + "learning_rate": 9.951380622584039e-06, + "loss": 0.5627, + "step": 637 + }, + { + "epoch": 0.07, + "grad_norm": 1.9679335448695763, + "learning_rate": 9.95112139737006e-06, + "loss": 0.5486, + "step": 638 + }, + { + "epoch": 0.07, + "grad_norm": 2.2632138637419326, + "learning_rate": 9.950861486327604e-06, + "loss": 0.5282, + "step": 639 + }, + { + "epoch": 0.07, + "grad_norm": 2.454021479588278, + "learning_rate": 9.950600889492672e-06, + "loss": 0.5918, + "step": 640 + }, + { + "epoch": 0.07, + "grad_norm": 2.5184412986496905, + "learning_rate": 9.950339606901362e-06, + "loss": 0.5734, + "step": 641 + }, + { + "epoch": 0.07, + "grad_norm": 1.7889424889705194, + "learning_rate": 9.950077638589867e-06, + "loss": 0.5436, + "step": 642 + }, + { + "epoch": 0.07, + "grad_norm": 1.8935818395296444, + "learning_rate": 9.949814984594475e-06, + "loss": 0.5078, + "step": 643 + }, + { + "epoch": 0.07, + "grad_norm": 2.4126857663649313, + "learning_rate": 9.949551644951569e-06, + "loss": 0.5145, + "step": 644 + }, + { + "epoch": 0.07, + "grad_norm": 2.2060769275218712, + "learning_rate": 9.949287619697625e-06, + "loss": 0.5834, + "step": 645 + }, + { + "epoch": 0.07, + "grad_norm": 2.0914569325619548, + "learning_rate": 9.94902290886922e-06, + "loss": 0.5225, + "step": 646 + }, + { + "epoch": 0.07, + "grad_norm": 2.871294008503719, + "learning_rate": 9.948757512503015e-06, + "loss": 0.6368, + "step": 647 + }, + { + "epoch": 0.07, + "grad_norm": 1.019332062064822, + "learning_rate": 9.948491430635779e-06, + "loss": 0.7444, + "step": 648 + }, + { + "epoch": 0.07, + "grad_norm": 1.9714214625326023, + "learning_rate": 9.948224663304367e-06, + "loss": 0.5102, + "step": 649 + }, + { + "epoch": 0.07, + "grad_norm": 2.5546874952338325, + "learning_rate": 9.94795721054573e-06, + "loss": 0.5388, + "step": 650 + }, + { + "epoch": 0.07, + "grad_norm": 3.171739076400427, + "learning_rate": 9.94768907239692e-06, + "loss": 0.5669, + "step": 651 + }, + { + "epoch": 0.07, + "grad_norm": 3.6071706335259175, + "learning_rate": 9.947420248895077e-06, + "loss": 0.6496, + "step": 652 + }, + { + "epoch": 0.08, + "grad_norm": 2.045292602663791, + "learning_rate": 9.947150740077436e-06, + "loss": 0.5385, + "step": 653 + }, + { + "epoch": 0.08, + "grad_norm": 1.7594494563789622, + "learning_rate": 9.94688054598133e-06, + "loss": 0.542, + "step": 654 + }, + { + "epoch": 0.08, + "grad_norm": 3.0979323485136585, + "learning_rate": 9.94660966664419e-06, + "loss": 0.5031, + "step": 655 + }, + { + "epoch": 0.08, + "grad_norm": 2.236019032635491, + "learning_rate": 9.946338102103536e-06, + "loss": 0.6364, + "step": 656 + }, + { + "epoch": 0.08, + "grad_norm": 1.971355208204479, + "learning_rate": 9.946065852396984e-06, + "loss": 0.5337, + "step": 657 + }, + { + "epoch": 0.08, + "grad_norm": 3.578291872063506, + "learning_rate": 9.945792917562245e-06, + "loss": 0.5368, + "step": 658 + }, + { + "epoch": 0.08, + "grad_norm": 2.0568071478387573, + "learning_rate": 9.94551929763713e-06, + "loss": 0.5566, + "step": 659 + }, + { + "epoch": 0.08, + "grad_norm": 2.9567326825421567, + "learning_rate": 9.945244992659539e-06, + "loss": 0.5507, + "step": 660 + }, + { + "epoch": 0.08, + "grad_norm": 1.0904525357199466, + "learning_rate": 9.944970002667466e-06, + "loss": 0.7381, + "step": 661 + }, + { + "epoch": 0.08, + "grad_norm": 2.733509053147223, + "learning_rate": 9.944694327699007e-06, + "loss": 0.5748, + "step": 662 + }, + { + "epoch": 0.08, + "grad_norm": 5.556355348898383, + "learning_rate": 9.944417967792343e-06, + "loss": 0.523, + "step": 663 + }, + { + "epoch": 0.08, + "grad_norm": 2.1214029699130985, + "learning_rate": 9.944140922985761e-06, + "loss": 0.4548, + "step": 664 + }, + { + "epoch": 0.08, + "grad_norm": 2.55978427339231, + "learning_rate": 9.943863193317635e-06, + "loss": 0.5298, + "step": 665 + }, + { + "epoch": 0.08, + "grad_norm": 2.2936620226210076, + "learning_rate": 9.943584778826434e-06, + "loss": 0.5022, + "step": 666 + }, + { + "epoch": 0.08, + "grad_norm": 2.20556628878954, + "learning_rate": 9.943305679550727e-06, + "loss": 0.5552, + "step": 667 + }, + { + "epoch": 0.08, + "grad_norm": 0.9831012442096033, + "learning_rate": 9.943025895529174e-06, + "loss": 0.7514, + "step": 668 + }, + { + "epoch": 0.08, + "grad_norm": 3.180583779045343, + "learning_rate": 9.942745426800529e-06, + "loss": 0.5527, + "step": 669 + }, + { + "epoch": 0.08, + "grad_norm": 2.2616320787242095, + "learning_rate": 9.942464273403643e-06, + "loss": 0.5243, + "step": 670 + }, + { + "epoch": 0.08, + "grad_norm": 1.8321299529506032, + "learning_rate": 9.942182435377463e-06, + "loss": 0.5327, + "step": 671 + }, + { + "epoch": 0.08, + "grad_norm": 3.0128739587228384, + "learning_rate": 9.941899912761028e-06, + "loss": 0.4551, + "step": 672 + }, + { + "epoch": 0.08, + "grad_norm": 1.9881347246035441, + "learning_rate": 9.941616705593473e-06, + "loss": 0.4875, + "step": 673 + }, + { + "epoch": 0.08, + "grad_norm": 3.1229868156106777, + "learning_rate": 9.941332813914027e-06, + "loss": 0.4959, + "step": 674 + }, + { + "epoch": 0.08, + "grad_norm": 2.4462541819488464, + "learning_rate": 9.941048237762016e-06, + "loss": 0.604, + "step": 675 + }, + { + "epoch": 0.08, + "grad_norm": 3.0822584649130524, + "learning_rate": 9.94076297717686e-06, + "loss": 0.4954, + "step": 676 + }, + { + "epoch": 0.08, + "grad_norm": 1.9270875399485574, + "learning_rate": 9.94047703219807e-06, + "loss": 0.4783, + "step": 677 + }, + { + "epoch": 0.08, + "grad_norm": 1.9277392122471173, + "learning_rate": 9.94019040286526e-06, + "loss": 0.5306, + "step": 678 + }, + { + "epoch": 0.08, + "grad_norm": 2.098715895988353, + "learning_rate": 9.939903089218129e-06, + "loss": 0.5976, + "step": 679 + }, + { + "epoch": 0.08, + "grad_norm": 1.0620666409701256, + "learning_rate": 9.939615091296479e-06, + "loss": 0.8239, + "step": 680 + }, + { + "epoch": 0.08, + "grad_norm": 1.9029685346928389, + "learning_rate": 9.939326409140201e-06, + "loss": 0.4987, + "step": 681 + }, + { + "epoch": 0.08, + "grad_norm": 1.7605793327274526, + "learning_rate": 9.939037042789284e-06, + "loss": 0.5361, + "step": 682 + }, + { + "epoch": 0.08, + "grad_norm": 1.8372909946594722, + "learning_rate": 9.938746992283812e-06, + "loss": 0.465, + "step": 683 + }, + { + "epoch": 0.08, + "grad_norm": 2.6081343759820523, + "learning_rate": 9.938456257663963e-06, + "loss": 0.4518, + "step": 684 + }, + { + "epoch": 0.08, + "grad_norm": 2.126485581946189, + "learning_rate": 9.938164838970007e-06, + "loss": 0.4543, + "step": 685 + }, + { + "epoch": 0.08, + "grad_norm": 1.9428120973747975, + "learning_rate": 9.937872736242314e-06, + "loss": 0.5348, + "step": 686 + }, + { + "epoch": 0.08, + "grad_norm": 2.0163273214225184, + "learning_rate": 9.937579949521342e-06, + "loss": 0.5541, + "step": 687 + }, + { + "epoch": 0.08, + "grad_norm": 2.057377891791151, + "learning_rate": 9.937286478847655e-06, + "loss": 0.5675, + "step": 688 + }, + { + "epoch": 0.08, + "grad_norm": 2.2465249306335044, + "learning_rate": 9.936992324261898e-06, + "loss": 0.5981, + "step": 689 + }, + { + "epoch": 0.08, + "grad_norm": 1.9262038385341438, + "learning_rate": 9.936697485804818e-06, + "loss": 0.5529, + "step": 690 + }, + { + "epoch": 0.08, + "grad_norm": 2.0024051993436713, + "learning_rate": 9.93640196351726e-06, + "loss": 0.6581, + "step": 691 + }, + { + "epoch": 0.08, + "grad_norm": 8.115884017243316, + "learning_rate": 9.936105757440155e-06, + "loss": 0.4934, + "step": 692 + }, + { + "epoch": 0.08, + "grad_norm": 3.398871482458367, + "learning_rate": 9.935808867614536e-06, + "loss": 0.5358, + "step": 693 + }, + { + "epoch": 0.08, + "grad_norm": 2.007300873595809, + "learning_rate": 9.935511294081528e-06, + "loss": 0.6184, + "step": 694 + }, + { + "epoch": 0.08, + "grad_norm": 1.8477158987942803, + "learning_rate": 9.93521303688235e-06, + "loss": 0.6032, + "step": 695 + }, + { + "epoch": 0.08, + "grad_norm": 1.5343449039322081, + "learning_rate": 9.934914096058317e-06, + "loss": 0.5647, + "step": 696 + }, + { + "epoch": 0.08, + "grad_norm": 1.7804778680838254, + "learning_rate": 9.934614471650838e-06, + "loss": 0.5273, + "step": 697 + }, + { + "epoch": 0.08, + "grad_norm": 2.172940278642374, + "learning_rate": 9.934314163701417e-06, + "loss": 0.6595, + "step": 698 + }, + { + "epoch": 0.08, + "grad_norm": 2.342133488016177, + "learning_rate": 9.934013172251654e-06, + "loss": 0.5122, + "step": 699 + }, + { + "epoch": 0.08, + "grad_norm": 1.8481879425363652, + "learning_rate": 9.93371149734324e-06, + "loss": 0.546, + "step": 700 + }, + { + "epoch": 0.08, + "grad_norm": 1.8531759546579984, + "learning_rate": 9.933409139017963e-06, + "loss": 0.5769, + "step": 701 + }, + { + "epoch": 0.08, + "grad_norm": 1.9451313212524495, + "learning_rate": 9.933106097317707e-06, + "loss": 0.612, + "step": 702 + }, + { + "epoch": 0.08, + "grad_norm": 2.55230137621701, + "learning_rate": 9.93280237228445e-06, + "loss": 0.5932, + "step": 703 + }, + { + "epoch": 0.08, + "grad_norm": 2.044791314072983, + "learning_rate": 9.93249796396026e-06, + "loss": 0.5762, + "step": 704 + }, + { + "epoch": 0.08, + "grad_norm": 1.8104729706868379, + "learning_rate": 9.932192872387309e-06, + "loss": 0.5768, + "step": 705 + }, + { + "epoch": 0.08, + "grad_norm": 1.995672554932377, + "learning_rate": 9.931887097607857e-06, + "loss": 0.6117, + "step": 706 + }, + { + "epoch": 0.08, + "grad_norm": 2.1278005357159704, + "learning_rate": 9.931580639664256e-06, + "loss": 0.4826, + "step": 707 + }, + { + "epoch": 0.08, + "grad_norm": 2.1791385156669003, + "learning_rate": 9.931273498598958e-06, + "loss": 0.4879, + "step": 708 + }, + { + "epoch": 0.08, + "grad_norm": 3.3206698093850275, + "learning_rate": 9.930965674454512e-06, + "loss": 0.5934, + "step": 709 + }, + { + "epoch": 0.08, + "grad_norm": 2.2777966100747844, + "learning_rate": 9.930657167273552e-06, + "loss": 0.4988, + "step": 710 + }, + { + "epoch": 0.08, + "grad_norm": 1.8826281627701882, + "learning_rate": 9.930347977098818e-06, + "loss": 0.5451, + "step": 711 + }, + { + "epoch": 0.08, + "grad_norm": 2.226633155844569, + "learning_rate": 9.930038103973134e-06, + "loss": 0.6042, + "step": 712 + }, + { + "epoch": 0.08, + "grad_norm": 2.1601930532455857, + "learning_rate": 9.929727547939427e-06, + "loss": 0.4846, + "step": 713 + }, + { + "epoch": 0.08, + "grad_norm": 1.8442303257123716, + "learning_rate": 9.929416309040713e-06, + "loss": 0.4684, + "step": 714 + }, + { + "epoch": 0.08, + "grad_norm": 2.063520633778046, + "learning_rate": 9.929104387320107e-06, + "loss": 0.5203, + "step": 715 + }, + { + "epoch": 0.08, + "grad_norm": 2.7980427447754983, + "learning_rate": 9.928791782820814e-06, + "loss": 0.4908, + "step": 716 + }, + { + "epoch": 0.08, + "grad_norm": 2.512675798750525, + "learning_rate": 9.928478495586136e-06, + "loss": 0.5003, + "step": 717 + }, + { + "epoch": 0.08, + "grad_norm": 2.173303590706775, + "learning_rate": 9.928164525659471e-06, + "loss": 0.6356, + "step": 718 + }, + { + "epoch": 0.08, + "grad_norm": 2.1788977791717667, + "learning_rate": 9.92784987308431e-06, + "loss": 0.6497, + "step": 719 + }, + { + "epoch": 0.08, + "grad_norm": 1.826391375036422, + "learning_rate": 9.92753453790424e-06, + "loss": 0.4982, + "step": 720 + }, + { + "epoch": 0.08, + "grad_norm": 2.179762023739226, + "learning_rate": 9.927218520162936e-06, + "loss": 0.5629, + "step": 721 + }, + { + "epoch": 0.08, + "grad_norm": 2.1048496410133914, + "learning_rate": 9.926901819904179e-06, + "loss": 0.594, + "step": 722 + }, + { + "epoch": 0.08, + "grad_norm": 2.660875620099305, + "learning_rate": 9.926584437171833e-06, + "loss": 0.597, + "step": 723 + }, + { + "epoch": 0.08, + "grad_norm": 2.168769712974328, + "learning_rate": 9.926266372009864e-06, + "loss": 0.5471, + "step": 724 + }, + { + "epoch": 0.08, + "grad_norm": 2.3994913270534344, + "learning_rate": 9.925947624462331e-06, + "loss": 0.6612, + "step": 725 + }, + { + "epoch": 0.08, + "grad_norm": 1.8698950319088523, + "learning_rate": 9.925628194573387e-06, + "loss": 0.43, + "step": 726 + }, + { + "epoch": 0.08, + "grad_norm": 2.8823893842736936, + "learning_rate": 9.925308082387278e-06, + "loss": 0.6223, + "step": 727 + }, + { + "epoch": 0.08, + "grad_norm": 2.025879052142635, + "learning_rate": 9.924987287948347e-06, + "loss": 0.6342, + "step": 728 + }, + { + "epoch": 0.08, + "grad_norm": 2.2683153638883216, + "learning_rate": 9.92466581130103e-06, + "loss": 0.5773, + "step": 729 + }, + { + "epoch": 0.08, + "grad_norm": 1.8839728083943041, + "learning_rate": 9.924343652489856e-06, + "loss": 0.574, + "step": 730 + }, + { + "epoch": 0.08, + "grad_norm": 1.8674703703850188, + "learning_rate": 9.924020811559455e-06, + "loss": 0.5426, + "step": 731 + }, + { + "epoch": 0.08, + "grad_norm": 1.8251327391842784, + "learning_rate": 9.923697288554541e-06, + "loss": 0.491, + "step": 732 + }, + { + "epoch": 0.08, + "grad_norm": 2.3171599450221123, + "learning_rate": 9.923373083519932e-06, + "loss": 0.5069, + "step": 733 + }, + { + "epoch": 0.08, + "grad_norm": 1.8393428399702203, + "learning_rate": 9.923048196500537e-06, + "loss": 0.5713, + "step": 734 + }, + { + "epoch": 0.08, + "grad_norm": 1.4959149153027789, + "learning_rate": 9.92272262754136e-06, + "loss": 0.4881, + "step": 735 + }, + { + "epoch": 0.08, + "grad_norm": 2.384836534371826, + "learning_rate": 9.922396376687496e-06, + "loss": 0.5902, + "step": 736 + }, + { + "epoch": 0.08, + "grad_norm": 2.078912666958259, + "learning_rate": 9.922069443984137e-06, + "loss": 0.5072, + "step": 737 + }, + { + "epoch": 0.08, + "grad_norm": 2.230468392380398, + "learning_rate": 9.921741829476574e-06, + "loss": 0.6295, + "step": 738 + }, + { + "epoch": 0.08, + "grad_norm": 2.111905315128315, + "learning_rate": 9.921413533210183e-06, + "loss": 0.5493, + "step": 739 + }, + { + "epoch": 0.09, + "grad_norm": 1.9886021343673919, + "learning_rate": 9.921084555230443e-06, + "loss": 0.5414, + "step": 740 + }, + { + "epoch": 0.09, + "grad_norm": 2.35063839581549, + "learning_rate": 9.920754895582923e-06, + "loss": 0.5427, + "step": 741 + }, + { + "epoch": 0.09, + "grad_norm": 1.6884971089657492, + "learning_rate": 9.920424554313287e-06, + "loss": 0.5673, + "step": 742 + }, + { + "epoch": 0.09, + "grad_norm": 2.2040526045487945, + "learning_rate": 9.920093531467292e-06, + "loss": 0.6176, + "step": 743 + }, + { + "epoch": 0.09, + "grad_norm": 1.830093499337659, + "learning_rate": 9.919761827090794e-06, + "loss": 0.5724, + "step": 744 + }, + { + "epoch": 0.09, + "grad_norm": 1.73956602881056, + "learning_rate": 9.919429441229741e-06, + "loss": 0.6113, + "step": 745 + }, + { + "epoch": 0.09, + "grad_norm": 1.6224037099847817, + "learning_rate": 9.919096373930173e-06, + "loss": 0.5316, + "step": 746 + }, + { + "epoch": 0.09, + "grad_norm": 1.7836489274995084, + "learning_rate": 9.918762625238227e-06, + "loss": 0.6392, + "step": 747 + }, + { + "epoch": 0.09, + "grad_norm": 2.8953826140673886, + "learning_rate": 9.918428195200137e-06, + "loss": 0.4769, + "step": 748 + }, + { + "epoch": 0.09, + "grad_norm": 2.0202531597217592, + "learning_rate": 9.918093083862221e-06, + "loss": 0.5416, + "step": 749 + }, + { + "epoch": 0.09, + "grad_norm": 1.6618625542023637, + "learning_rate": 9.917757291270906e-06, + "loss": 0.5019, + "step": 750 + }, + { + "epoch": 0.09, + "grad_norm": 4.402843337035946, + "learning_rate": 9.917420817472701e-06, + "loss": 0.6129, + "step": 751 + }, + { + "epoch": 0.09, + "grad_norm": 2.1313367286528635, + "learning_rate": 9.917083662514218e-06, + "loss": 0.599, + "step": 752 + }, + { + "epoch": 0.09, + "grad_norm": 1.8259048180525739, + "learning_rate": 9.916745826442155e-06, + "loss": 0.5103, + "step": 753 + }, + { + "epoch": 0.09, + "grad_norm": 1.8131221490423983, + "learning_rate": 9.916407309303315e-06, + "loss": 0.5025, + "step": 754 + }, + { + "epoch": 0.09, + "grad_norm": 2.0687164677680507, + "learning_rate": 9.916068111144584e-06, + "loss": 0.5791, + "step": 755 + }, + { + "epoch": 0.09, + "grad_norm": 2.0056526659623533, + "learning_rate": 9.915728232012948e-06, + "loss": 0.5507, + "step": 756 + }, + { + "epoch": 0.09, + "grad_norm": 2.204055682525048, + "learning_rate": 9.915387671955492e-06, + "loss": 0.4871, + "step": 757 + }, + { + "epoch": 0.09, + "grad_norm": 3.163946230808035, + "learning_rate": 9.915046431019386e-06, + "loss": 0.607, + "step": 758 + }, + { + "epoch": 0.09, + "grad_norm": 1.8721184287224315, + "learning_rate": 9.9147045092519e-06, + "loss": 0.5558, + "step": 759 + }, + { + "epoch": 0.09, + "grad_norm": 2.0394258219841412, + "learning_rate": 9.914361906700395e-06, + "loss": 0.5533, + "step": 760 + }, + { + "epoch": 0.09, + "grad_norm": 2.2110918203526078, + "learning_rate": 9.914018623412332e-06, + "loss": 0.5598, + "step": 761 + }, + { + "epoch": 0.09, + "grad_norm": 1.975737341190045, + "learning_rate": 9.91367465943526e-06, + "loss": 0.5323, + "step": 762 + }, + { + "epoch": 0.09, + "grad_norm": 1.914946697474532, + "learning_rate": 9.913330014816825e-06, + "loss": 0.48, + "step": 763 + }, + { + "epoch": 0.09, + "grad_norm": 2.3215453586939656, + "learning_rate": 9.912984689604767e-06, + "loss": 0.591, + "step": 764 + }, + { + "epoch": 0.09, + "grad_norm": 2.71369353912503, + "learning_rate": 9.91263868384692e-06, + "loss": 0.6416, + "step": 765 + }, + { + "epoch": 0.09, + "grad_norm": 1.8021380576463637, + "learning_rate": 9.912291997591214e-06, + "loss": 0.5271, + "step": 766 + }, + { + "epoch": 0.09, + "grad_norm": 3.501968820056032, + "learning_rate": 9.911944630885673e-06, + "loss": 0.5592, + "step": 767 + }, + { + "epoch": 0.09, + "grad_norm": 1.9508127903138144, + "learning_rate": 9.91159658377841e-06, + "loss": 0.5236, + "step": 768 + }, + { + "epoch": 0.09, + "grad_norm": 2.9344108037566103, + "learning_rate": 9.91124785631764e-06, + "loss": 0.5318, + "step": 769 + }, + { + "epoch": 0.09, + "grad_norm": 1.8071322055126993, + "learning_rate": 9.910898448551667e-06, + "loss": 0.5318, + "step": 770 + }, + { + "epoch": 0.09, + "grad_norm": 1.9819159417999153, + "learning_rate": 9.910548360528894e-06, + "loss": 0.6113, + "step": 771 + }, + { + "epoch": 0.09, + "grad_norm": 2.8651408487871985, + "learning_rate": 9.91019759229781e-06, + "loss": 0.6614, + "step": 772 + }, + { + "epoch": 0.09, + "grad_norm": 1.8931814155451214, + "learning_rate": 9.909846143907007e-06, + "loss": 0.453, + "step": 773 + }, + { + "epoch": 0.09, + "grad_norm": 2.250629188912134, + "learning_rate": 9.909494015405165e-06, + "loss": 0.5246, + "step": 774 + }, + { + "epoch": 0.09, + "grad_norm": 4.356545651971741, + "learning_rate": 9.909141206841063e-06, + "loss": 0.5608, + "step": 775 + }, + { + "epoch": 0.09, + "grad_norm": 3.6067207075793863, + "learning_rate": 9.908787718263573e-06, + "loss": 0.5624, + "step": 776 + }, + { + "epoch": 0.09, + "grad_norm": 2.5715543892755393, + "learning_rate": 9.908433549721657e-06, + "loss": 0.4727, + "step": 777 + }, + { + "epoch": 0.09, + "grad_norm": 1.814019611720708, + "learning_rate": 9.908078701264377e-06, + "loss": 0.5259, + "step": 778 + }, + { + "epoch": 0.09, + "grad_norm": 1.9615486322382332, + "learning_rate": 9.907723172940885e-06, + "loss": 0.4919, + "step": 779 + }, + { + "epoch": 0.09, + "grad_norm": 2.3423456682474426, + "learning_rate": 9.907366964800429e-06, + "loss": 0.6536, + "step": 780 + }, + { + "epoch": 0.09, + "grad_norm": 2.2794434458657546, + "learning_rate": 9.90701007689235e-06, + "loss": 0.5372, + "step": 781 + }, + { + "epoch": 0.09, + "grad_norm": 2.4870751556327084, + "learning_rate": 9.906652509266086e-06, + "loss": 0.5785, + "step": 782 + }, + { + "epoch": 0.09, + "grad_norm": 2.44398977849809, + "learning_rate": 9.906294261971167e-06, + "loss": 0.55, + "step": 783 + }, + { + "epoch": 0.09, + "grad_norm": 2.688777216456219, + "learning_rate": 9.905935335057215e-06, + "loss": 0.5801, + "step": 784 + }, + { + "epoch": 0.09, + "grad_norm": 2.1364305601121654, + "learning_rate": 9.905575728573952e-06, + "loss": 0.5739, + "step": 785 + }, + { + "epoch": 0.09, + "grad_norm": 2.5522044750988018, + "learning_rate": 9.905215442571189e-06, + "loss": 0.5388, + "step": 786 + }, + { + "epoch": 0.09, + "grad_norm": 2.399659223136487, + "learning_rate": 9.904854477098829e-06, + "loss": 0.5485, + "step": 787 + }, + { + "epoch": 0.09, + "grad_norm": 2.221108025977278, + "learning_rate": 9.904492832206879e-06, + "loss": 0.4584, + "step": 788 + }, + { + "epoch": 0.09, + "grad_norm": 2.1366722251738843, + "learning_rate": 9.90413050794543e-06, + "loss": 0.5765, + "step": 789 + }, + { + "epoch": 0.09, + "grad_norm": 2.178803801307301, + "learning_rate": 9.903767504364674e-06, + "loss": 0.4955, + "step": 790 + }, + { + "epoch": 0.09, + "grad_norm": 2.4051784151081432, + "learning_rate": 9.903403821514893e-06, + "loss": 0.5564, + "step": 791 + }, + { + "epoch": 0.09, + "grad_norm": 1.9680915771905254, + "learning_rate": 9.903039459446463e-06, + "loss": 0.5101, + "step": 792 + }, + { + "epoch": 0.09, + "grad_norm": 2.04411908521223, + "learning_rate": 9.902674418209856e-06, + "loss": 0.5609, + "step": 793 + }, + { + "epoch": 0.09, + "grad_norm": 2.255772234723594, + "learning_rate": 9.902308697855638e-06, + "loss": 0.6796, + "step": 794 + }, + { + "epoch": 0.09, + "grad_norm": 2.3993617801407505, + "learning_rate": 9.901942298434469e-06, + "loss": 0.5999, + "step": 795 + }, + { + "epoch": 0.09, + "grad_norm": 2.3125065096164716, + "learning_rate": 9.901575219997101e-06, + "loss": 0.5687, + "step": 796 + }, + { + "epoch": 0.09, + "grad_norm": 1.6085419162717534, + "learning_rate": 9.901207462594383e-06, + "loss": 0.5943, + "step": 797 + }, + { + "epoch": 0.09, + "grad_norm": 2.0920400823301573, + "learning_rate": 9.900839026277256e-06, + "loss": 0.6622, + "step": 798 + }, + { + "epoch": 0.09, + "grad_norm": 2.1861997520184646, + "learning_rate": 9.900469911096756e-06, + "loss": 0.4737, + "step": 799 + }, + { + "epoch": 0.09, + "grad_norm": 4.375693675651978, + "learning_rate": 9.900100117104011e-06, + "loss": 0.602, + "step": 800 + }, + { + "epoch": 0.09, + "grad_norm": 2.2502230168747253, + "learning_rate": 9.899729644350249e-06, + "loss": 0.5393, + "step": 801 + }, + { + "epoch": 0.09, + "grad_norm": 2.114481174134644, + "learning_rate": 9.899358492886784e-06, + "loss": 0.4948, + "step": 802 + }, + { + "epoch": 0.09, + "grad_norm": 1.9471575108186827, + "learning_rate": 9.898986662765029e-06, + "loss": 0.5589, + "step": 803 + }, + { + "epoch": 0.09, + "grad_norm": 2.162272939817687, + "learning_rate": 9.898614154036491e-06, + "loss": 0.5211, + "step": 804 + }, + { + "epoch": 0.09, + "grad_norm": 2.04777132394763, + "learning_rate": 9.898240966752768e-06, + "loss": 0.4473, + "step": 805 + }, + { + "epoch": 0.09, + "grad_norm": 2.4256387684862872, + "learning_rate": 9.897867100965555e-06, + "loss": 0.6285, + "step": 806 + }, + { + "epoch": 0.09, + "grad_norm": 1.9545077404605793, + "learning_rate": 9.89749255672664e-06, + "loss": 0.5509, + "step": 807 + }, + { + "epoch": 0.09, + "grad_norm": 2.930443187041612, + "learning_rate": 9.897117334087904e-06, + "loss": 0.5743, + "step": 808 + }, + { + "epoch": 0.09, + "grad_norm": 1.8622711635673348, + "learning_rate": 9.896741433101322e-06, + "loss": 0.5263, + "step": 809 + }, + { + "epoch": 0.09, + "grad_norm": 2.104415358110688, + "learning_rate": 9.896364853818967e-06, + "loss": 0.5606, + "step": 810 + }, + { + "epoch": 0.09, + "grad_norm": 2.9804163392740417, + "learning_rate": 9.895987596293e-06, + "loss": 0.5242, + "step": 811 + }, + { + "epoch": 0.09, + "grad_norm": 2.1821664235484057, + "learning_rate": 9.895609660575678e-06, + "loss": 0.5937, + "step": 812 + }, + { + "epoch": 0.09, + "grad_norm": 2.047322065001753, + "learning_rate": 9.895231046719354e-06, + "loss": 0.6513, + "step": 813 + }, + { + "epoch": 0.09, + "grad_norm": 2.5285621751661074, + "learning_rate": 9.894851754776473e-06, + "loss": 0.5082, + "step": 814 + }, + { + "epoch": 0.09, + "grad_norm": 2.6050560960277096, + "learning_rate": 9.894471784799575e-06, + "loss": 0.5519, + "step": 815 + }, + { + "epoch": 0.09, + "grad_norm": 2.4468718588917024, + "learning_rate": 9.894091136841294e-06, + "loss": 0.5595, + "step": 816 + }, + { + "epoch": 0.09, + "grad_norm": 2.263929432390896, + "learning_rate": 9.893709810954354e-06, + "loss": 0.6088, + "step": 817 + }, + { + "epoch": 0.09, + "grad_norm": 2.5060644876972598, + "learning_rate": 9.893327807191581e-06, + "loss": 0.5988, + "step": 818 + }, + { + "epoch": 0.09, + "grad_norm": 2.212660303552677, + "learning_rate": 9.892945125605888e-06, + "loss": 0.5156, + "step": 819 + }, + { + "epoch": 0.09, + "grad_norm": 1.629945351422265, + "learning_rate": 9.892561766250284e-06, + "loss": 0.5808, + "step": 820 + }, + { + "epoch": 0.09, + "grad_norm": 2.1968672159859475, + "learning_rate": 9.89217772917787e-06, + "loss": 0.6845, + "step": 821 + }, + { + "epoch": 0.09, + "grad_norm": 2.2512796937148254, + "learning_rate": 9.891793014441844e-06, + "loss": 0.5687, + "step": 822 + }, + { + "epoch": 0.09, + "grad_norm": 2.6830955802885534, + "learning_rate": 9.891407622095498e-06, + "loss": 0.6377, + "step": 823 + }, + { + "epoch": 0.09, + "grad_norm": 1.894746516916812, + "learning_rate": 9.891021552192215e-06, + "loss": 0.6678, + "step": 824 + }, + { + "epoch": 0.09, + "grad_norm": 1.971413953696215, + "learning_rate": 9.890634804785473e-06, + "loss": 0.5345, + "step": 825 + }, + { + "epoch": 0.09, + "grad_norm": 1.9816815986119727, + "learning_rate": 9.890247379928845e-06, + "loss": 0.604, + "step": 826 + }, + { + "epoch": 0.1, + "grad_norm": 1.8958919628301971, + "learning_rate": 9.889859277675999e-06, + "loss": 0.5321, + "step": 827 + }, + { + "epoch": 0.1, + "grad_norm": 2.368763408783505, + "learning_rate": 9.889470498080691e-06, + "loss": 0.4718, + "step": 828 + }, + { + "epoch": 0.1, + "grad_norm": 1.9264470940603833, + "learning_rate": 9.889081041196777e-06, + "loss": 0.4672, + "step": 829 + }, + { + "epoch": 0.1, + "grad_norm": 2.337750119077244, + "learning_rate": 9.888690907078205e-06, + "loss": 0.5995, + "step": 830 + }, + { + "epoch": 0.1, + "grad_norm": 2.134855133374358, + "learning_rate": 9.888300095779013e-06, + "loss": 0.6687, + "step": 831 + }, + { + "epoch": 0.1, + "grad_norm": 2.5573954400696732, + "learning_rate": 9.887908607353341e-06, + "loss": 0.4879, + "step": 832 + }, + { + "epoch": 0.1, + "grad_norm": 1.9280029362037718, + "learning_rate": 9.887516441855413e-06, + "loss": 0.4589, + "step": 833 + }, + { + "epoch": 0.1, + "grad_norm": 2.14199820530583, + "learning_rate": 9.887123599339555e-06, + "loss": 0.5969, + "step": 834 + }, + { + "epoch": 0.1, + "grad_norm": 1.845683916256456, + "learning_rate": 9.886730079860182e-06, + "loss": 0.5413, + "step": 835 + }, + { + "epoch": 0.1, + "grad_norm": 1.6804140424519625, + "learning_rate": 9.886335883471804e-06, + "loss": 0.5706, + "step": 836 + }, + { + "epoch": 0.1, + "grad_norm": 1.7436964357422922, + "learning_rate": 9.885941010229028e-06, + "loss": 0.5755, + "step": 837 + }, + { + "epoch": 0.1, + "grad_norm": 3.030870656340766, + "learning_rate": 9.885545460186548e-06, + "loss": 0.4875, + "step": 838 + }, + { + "epoch": 0.1, + "grad_norm": 4.275484580724343, + "learning_rate": 9.885149233399158e-06, + "loss": 0.4827, + "step": 839 + }, + { + "epoch": 0.1, + "grad_norm": 2.083240178100821, + "learning_rate": 9.884752329921743e-06, + "loss": 0.5062, + "step": 840 + }, + { + "epoch": 0.1, + "grad_norm": 2.1288713566535558, + "learning_rate": 9.88435474980928e-06, + "loss": 0.5906, + "step": 841 + }, + { + "epoch": 0.1, + "grad_norm": 2.0204345005821027, + "learning_rate": 9.883956493116842e-06, + "loss": 0.5751, + "step": 842 + }, + { + "epoch": 0.1, + "grad_norm": 2.6466305426953567, + "learning_rate": 9.883557559899599e-06, + "loss": 0.5665, + "step": 843 + }, + { + "epoch": 0.1, + "grad_norm": 2.8533477132542155, + "learning_rate": 9.883157950212807e-06, + "loss": 0.4665, + "step": 844 + }, + { + "epoch": 0.1, + "grad_norm": 3.4708687595532766, + "learning_rate": 9.882757664111822e-06, + "loss": 0.5441, + "step": 845 + }, + { + "epoch": 0.1, + "grad_norm": 2.2994338199228217, + "learning_rate": 9.882356701652092e-06, + "loss": 0.4874, + "step": 846 + }, + { + "epoch": 0.1, + "grad_norm": 1.9579351519231976, + "learning_rate": 9.881955062889155e-06, + "loss": 0.5224, + "step": 847 + }, + { + "epoch": 0.1, + "grad_norm": 0.9931887200717259, + "learning_rate": 9.88155274787865e-06, + "loss": 0.735, + "step": 848 + }, + { + "epoch": 0.1, + "grad_norm": 2.3310893594085265, + "learning_rate": 9.881149756676302e-06, + "loss": 0.5277, + "step": 849 + }, + { + "epoch": 0.1, + "grad_norm": 3.9117576941358045, + "learning_rate": 9.880746089337938e-06, + "loss": 0.6142, + "step": 850 + }, + { + "epoch": 0.1, + "grad_norm": 2.62749544087934, + "learning_rate": 9.88034174591947e-06, + "loss": 0.6049, + "step": 851 + }, + { + "epoch": 0.1, + "grad_norm": 1.6992686547562001, + "learning_rate": 9.879936726476908e-06, + "loss": 0.5118, + "step": 852 + }, + { + "epoch": 0.1, + "grad_norm": 1.8881631794332134, + "learning_rate": 9.879531031066355e-06, + "loss": 0.5697, + "step": 853 + }, + { + "epoch": 0.1, + "grad_norm": 2.5117542962535326, + "learning_rate": 9.87912465974401e-06, + "loss": 0.7017, + "step": 854 + }, + { + "epoch": 0.1, + "grad_norm": 2.772221473871344, + "learning_rate": 9.878717612566163e-06, + "loss": 0.4996, + "step": 855 + }, + { + "epoch": 0.1, + "grad_norm": 1.7693683763283217, + "learning_rate": 9.878309889589197e-06, + "loss": 0.4482, + "step": 856 + }, + { + "epoch": 0.1, + "grad_norm": 2.083813715152233, + "learning_rate": 9.87790149086959e-06, + "loss": 0.5495, + "step": 857 + }, + { + "epoch": 0.1, + "grad_norm": 2.0428863186336743, + "learning_rate": 9.877492416463913e-06, + "loss": 0.5361, + "step": 858 + }, + { + "epoch": 0.1, + "grad_norm": 2.167621007070851, + "learning_rate": 9.87708266642883e-06, + "loss": 0.446, + "step": 859 + }, + { + "epoch": 0.1, + "grad_norm": 2.0051783378371573, + "learning_rate": 9.876672240821103e-06, + "loss": 0.6286, + "step": 860 + }, + { + "epoch": 0.1, + "grad_norm": 2.7316501723126323, + "learning_rate": 9.87626113969758e-06, + "loss": 0.5663, + "step": 861 + }, + { + "epoch": 0.1, + "grad_norm": 1.9826503785060554, + "learning_rate": 9.87584936311521e-06, + "loss": 0.4846, + "step": 862 + }, + { + "epoch": 0.1, + "grad_norm": 1.88056999337644, + "learning_rate": 9.87543691113103e-06, + "loss": 0.5047, + "step": 863 + }, + { + "epoch": 0.1, + "grad_norm": 3.904497096726396, + "learning_rate": 9.875023783802174e-06, + "loss": 0.6063, + "step": 864 + }, + { + "epoch": 0.1, + "grad_norm": 1.7947849651683292, + "learning_rate": 9.874609981185868e-06, + "loss": 0.5665, + "step": 865 + }, + { + "epoch": 0.1, + "grad_norm": 2.274184819131868, + "learning_rate": 9.87419550333943e-06, + "loss": 0.519, + "step": 866 + }, + { + "epoch": 0.1, + "grad_norm": 2.176736515231948, + "learning_rate": 9.873780350320276e-06, + "loss": 0.5503, + "step": 867 + }, + { + "epoch": 0.1, + "grad_norm": 2.082572846073231, + "learning_rate": 9.873364522185913e-06, + "loss": 0.5768, + "step": 868 + }, + { + "epoch": 0.1, + "grad_norm": 2.0723792133072667, + "learning_rate": 9.87294801899394e-06, + "loss": 0.5119, + "step": 869 + }, + { + "epoch": 0.1, + "grad_norm": 1.9594449988199611, + "learning_rate": 9.872530840802052e-06, + "loss": 0.6024, + "step": 870 + }, + { + "epoch": 0.1, + "grad_norm": 1.6720242689909397, + "learning_rate": 9.872112987668034e-06, + "loss": 0.5155, + "step": 871 + }, + { + "epoch": 0.1, + "grad_norm": 1.5745320908660818, + "learning_rate": 9.87169445964977e-06, + "loss": 0.4948, + "step": 872 + }, + { + "epoch": 0.1, + "grad_norm": 2.3699734020127146, + "learning_rate": 9.871275256805234e-06, + "loss": 0.5871, + "step": 873 + }, + { + "epoch": 0.1, + "grad_norm": 2.447242454615751, + "learning_rate": 9.870855379192492e-06, + "loss": 0.5256, + "step": 874 + }, + { + "epoch": 0.1, + "grad_norm": 1.9619054723644462, + "learning_rate": 9.870434826869707e-06, + "loss": 0.4347, + "step": 875 + }, + { + "epoch": 0.1, + "grad_norm": 2.413207594444614, + "learning_rate": 9.870013599895135e-06, + "loss": 0.5814, + "step": 876 + }, + { + "epoch": 0.1, + "grad_norm": 1.88488074195696, + "learning_rate": 9.86959169832712e-06, + "loss": 0.6093, + "step": 877 + }, + { + "epoch": 0.1, + "grad_norm": 1.1020798272220043, + "learning_rate": 9.869169122224107e-06, + "loss": 0.7854, + "step": 878 + }, + { + "epoch": 0.1, + "grad_norm": 2.0332124961262945, + "learning_rate": 9.86874587164463e-06, + "loss": 0.6893, + "step": 879 + }, + { + "epoch": 0.1, + "grad_norm": 2.0329005627045382, + "learning_rate": 9.86832194664732e-06, + "loss": 0.5611, + "step": 880 + }, + { + "epoch": 0.1, + "grad_norm": 1.6668858806494669, + "learning_rate": 9.867897347290895e-06, + "loss": 0.4961, + "step": 881 + }, + { + "epoch": 0.1, + "grad_norm": 1.96260381361014, + "learning_rate": 9.867472073634175e-06, + "loss": 0.5655, + "step": 882 + }, + { + "epoch": 0.1, + "grad_norm": 2.0521870674033225, + "learning_rate": 9.867046125736066e-06, + "loss": 0.6272, + "step": 883 + }, + { + "epoch": 0.1, + "grad_norm": 2.5514603181433215, + "learning_rate": 9.866619503655569e-06, + "loss": 0.4832, + "step": 884 + }, + { + "epoch": 0.1, + "grad_norm": 2.0541862830110125, + "learning_rate": 9.866192207451781e-06, + "loss": 0.4537, + "step": 885 + }, + { + "epoch": 0.1, + "grad_norm": 3.1940097597358115, + "learning_rate": 9.865764237183894e-06, + "loss": 0.5032, + "step": 886 + }, + { + "epoch": 0.1, + "grad_norm": 2.081322935477042, + "learning_rate": 9.865335592911185e-06, + "loss": 0.5184, + "step": 887 + }, + { + "epoch": 0.1, + "grad_norm": 2.623336425535113, + "learning_rate": 9.864906274693033e-06, + "loss": 0.6915, + "step": 888 + }, + { + "epoch": 0.1, + "grad_norm": 1.859948260015348, + "learning_rate": 9.864476282588908e-06, + "loss": 0.5413, + "step": 889 + }, + { + "epoch": 0.1, + "grad_norm": 2.876717891088719, + "learning_rate": 9.86404561665837e-06, + "loss": 0.4841, + "step": 890 + }, + { + "epoch": 0.1, + "grad_norm": 2.416190909696552, + "learning_rate": 9.863614276961076e-06, + "loss": 0.5213, + "step": 891 + }, + { + "epoch": 0.1, + "grad_norm": 1.8779983877324762, + "learning_rate": 9.863182263556775e-06, + "loss": 0.5661, + "step": 892 + }, + { + "epoch": 0.1, + "grad_norm": 1.93331573223808, + "learning_rate": 9.862749576505307e-06, + "loss": 0.6173, + "step": 893 + }, + { + "epoch": 0.1, + "grad_norm": 5.040800357035209, + "learning_rate": 9.862316215866612e-06, + "loss": 0.6335, + "step": 894 + }, + { + "epoch": 0.1, + "grad_norm": 2.175378755291958, + "learning_rate": 9.861882181700716e-06, + "loss": 0.599, + "step": 895 + }, + { + "epoch": 0.1, + "grad_norm": 2.484611585682195, + "learning_rate": 9.861447474067743e-06, + "loss": 0.5247, + "step": 896 + }, + { + "epoch": 0.1, + "grad_norm": 1.7951302581855397, + "learning_rate": 9.861012093027906e-06, + "loss": 0.5696, + "step": 897 + }, + { + "epoch": 0.1, + "grad_norm": 1.5526691120467024, + "learning_rate": 9.860576038641519e-06, + "loss": 0.4391, + "step": 898 + }, + { + "epoch": 0.1, + "grad_norm": 2.6546786440469714, + "learning_rate": 9.860139310968977e-06, + "loss": 0.5454, + "step": 899 + }, + { + "epoch": 0.1, + "grad_norm": 4.202884123716279, + "learning_rate": 9.859701910070782e-06, + "loss": 0.5243, + "step": 900 + }, + { + "epoch": 0.1, + "grad_norm": 6.139351235017453, + "learning_rate": 9.85926383600752e-06, + "loss": 0.5235, + "step": 901 + }, + { + "epoch": 0.1, + "grad_norm": 2.3337599317768776, + "learning_rate": 9.858825088839875e-06, + "loss": 0.6117, + "step": 902 + }, + { + "epoch": 0.1, + "grad_norm": 1.983665523016773, + "learning_rate": 9.858385668628617e-06, + "loss": 0.61, + "step": 903 + }, + { + "epoch": 0.1, + "grad_norm": 1.9826605737745353, + "learning_rate": 9.85794557543462e-06, + "loss": 0.5841, + "step": 904 + }, + { + "epoch": 0.1, + "grad_norm": 2.0793920356780644, + "learning_rate": 9.85750480931884e-06, + "loss": 0.6438, + "step": 905 + }, + { + "epoch": 0.1, + "grad_norm": 2.2568681660668313, + "learning_rate": 9.857063370342338e-06, + "loss": 0.4774, + "step": 906 + }, + { + "epoch": 0.1, + "grad_norm": 1.8305427229825173, + "learning_rate": 9.856621258566259e-06, + "loss": 0.5997, + "step": 907 + }, + { + "epoch": 0.1, + "grad_norm": 2.08893580885194, + "learning_rate": 9.856178474051845e-06, + "loss": 0.5366, + "step": 908 + }, + { + "epoch": 0.1, + "grad_norm": 9.567135023441297, + "learning_rate": 9.855735016860428e-06, + "loss": 0.624, + "step": 909 + }, + { + "epoch": 0.1, + "grad_norm": 1.8916638741404452, + "learning_rate": 9.85529088705344e-06, + "loss": 0.4536, + "step": 910 + }, + { + "epoch": 0.1, + "grad_norm": 2.1411075703471507, + "learning_rate": 9.854846084692397e-06, + "loss": 0.5809, + "step": 911 + }, + { + "epoch": 0.1, + "grad_norm": 2.357596381735398, + "learning_rate": 9.854400609838916e-06, + "loss": 0.5888, + "step": 912 + }, + { + "epoch": 0.1, + "grad_norm": 1.4936677321580134, + "learning_rate": 9.853954462554703e-06, + "loss": 0.49, + "step": 913 + }, + { + "epoch": 0.11, + "grad_norm": 2.011769222956356, + "learning_rate": 9.853507642901558e-06, + "loss": 0.5093, + "step": 914 + }, + { + "epoch": 0.11, + "grad_norm": 2.9819385128415954, + "learning_rate": 9.853060150941377e-06, + "loss": 0.4335, + "step": 915 + }, + { + "epoch": 0.11, + "grad_norm": 1.675833708963052, + "learning_rate": 9.852611986736144e-06, + "loss": 0.5649, + "step": 916 + }, + { + "epoch": 0.11, + "grad_norm": 2.10717545681445, + "learning_rate": 9.852163150347937e-06, + "loss": 0.5694, + "step": 917 + }, + { + "epoch": 0.11, + "grad_norm": 2.0181411420473, + "learning_rate": 9.851713641838934e-06, + "loss": 0.57, + "step": 918 + }, + { + "epoch": 0.11, + "grad_norm": 1.7055665116202148, + "learning_rate": 9.851263461271394e-06, + "loss": 0.5162, + "step": 919 + }, + { + "epoch": 0.11, + "grad_norm": 2.7282290100619306, + "learning_rate": 9.850812608707683e-06, + "loss": 0.541, + "step": 920 + }, + { + "epoch": 0.11, + "grad_norm": 1.9128856440649002, + "learning_rate": 9.850361084210247e-06, + "loss": 0.5061, + "step": 921 + }, + { + "epoch": 0.11, + "grad_norm": 2.061521210692046, + "learning_rate": 9.849908887841635e-06, + "loss": 0.4863, + "step": 922 + }, + { + "epoch": 0.11, + "grad_norm": 2.259488530051225, + "learning_rate": 9.849456019664486e-06, + "loss": 0.6277, + "step": 923 + }, + { + "epoch": 0.11, + "grad_norm": 2.794885586084605, + "learning_rate": 9.849002479741525e-06, + "loss": 0.4633, + "step": 924 + }, + { + "epoch": 0.11, + "grad_norm": 2.0804153275399906, + "learning_rate": 9.848548268135583e-06, + "loss": 0.5343, + "step": 925 + }, + { + "epoch": 0.11, + "grad_norm": 1.7718272690563932, + "learning_rate": 9.848093384909573e-06, + "loss": 0.5315, + "step": 926 + }, + { + "epoch": 0.11, + "grad_norm": 1.8418474426835068, + "learning_rate": 9.847637830126508e-06, + "loss": 0.6539, + "step": 927 + }, + { + "epoch": 0.11, + "grad_norm": 1.9505070614896556, + "learning_rate": 9.84718160384949e-06, + "loss": 0.4598, + "step": 928 + }, + { + "epoch": 0.11, + "grad_norm": 2.506743969882724, + "learning_rate": 9.846724706141718e-06, + "loss": 0.5976, + "step": 929 + }, + { + "epoch": 0.11, + "grad_norm": 1.0031381787617617, + "learning_rate": 9.846267137066476e-06, + "loss": 0.7782, + "step": 930 + }, + { + "epoch": 0.11, + "grad_norm": 1.5846599054207, + "learning_rate": 9.845808896687152e-06, + "loss": 0.4449, + "step": 931 + }, + { + "epoch": 0.11, + "grad_norm": 1.6276921120644192, + "learning_rate": 9.845349985067218e-06, + "loss": 0.4, + "step": 932 + }, + { + "epoch": 0.11, + "grad_norm": 1.9670455745506532, + "learning_rate": 9.844890402270243e-06, + "loss": 0.5269, + "step": 933 + }, + { + "epoch": 0.11, + "grad_norm": 2.2858606166463047, + "learning_rate": 9.84443014835989e-06, + "loss": 0.4882, + "step": 934 + }, + { + "epoch": 0.11, + "grad_norm": 2.5439573276502268, + "learning_rate": 9.84396922339991e-06, + "loss": 0.4892, + "step": 935 + }, + { + "epoch": 0.11, + "grad_norm": 2.2016239119196155, + "learning_rate": 9.843507627454152e-06, + "loss": 0.6017, + "step": 936 + }, + { + "epoch": 0.11, + "grad_norm": 4.431450848772123, + "learning_rate": 9.843045360586559e-06, + "loss": 0.5045, + "step": 937 + }, + { + "epoch": 0.11, + "grad_norm": 1.8870915149366394, + "learning_rate": 9.842582422861158e-06, + "loss": 0.4405, + "step": 938 + }, + { + "epoch": 0.11, + "grad_norm": 2.1532647667628426, + "learning_rate": 9.842118814342081e-06, + "loss": 0.6124, + "step": 939 + }, + { + "epoch": 0.11, + "grad_norm": 1.92346935831187, + "learning_rate": 9.841654535093544e-06, + "loss": 0.5297, + "step": 940 + }, + { + "epoch": 0.11, + "grad_norm": 2.1723861706838217, + "learning_rate": 9.841189585179859e-06, + "loss": 0.4803, + "step": 941 + }, + { + "epoch": 0.11, + "grad_norm": 1.6258217938948032, + "learning_rate": 9.840723964665432e-06, + "loss": 0.5511, + "step": 942 + }, + { + "epoch": 0.11, + "grad_norm": 1.7738155453002094, + "learning_rate": 9.84025767361476e-06, + "loss": 0.532, + "step": 943 + }, + { + "epoch": 0.11, + "grad_norm": 2.0738336552303536, + "learning_rate": 9.839790712092431e-06, + "loss": 0.6309, + "step": 944 + }, + { + "epoch": 0.11, + "grad_norm": 2.510841463333185, + "learning_rate": 9.839323080163134e-06, + "loss": 0.6706, + "step": 945 + }, + { + "epoch": 0.11, + "grad_norm": 2.4639620806369558, + "learning_rate": 9.838854777891639e-06, + "loss": 0.5444, + "step": 946 + }, + { + "epoch": 0.11, + "grad_norm": 2.1003446221040263, + "learning_rate": 9.83838580534282e-06, + "loss": 0.5484, + "step": 947 + }, + { + "epoch": 0.11, + "grad_norm": 2.219206753720893, + "learning_rate": 9.837916162581638e-06, + "loss": 0.44, + "step": 948 + }, + { + "epoch": 0.11, + "grad_norm": 1.8707519973476032, + "learning_rate": 9.837445849673145e-06, + "loss": 0.5154, + "step": 949 + }, + { + "epoch": 0.11, + "grad_norm": 1.948187874266642, + "learning_rate": 9.836974866682494e-06, + "loss": 0.5119, + "step": 950 + }, + { + "epoch": 0.11, + "grad_norm": 2.914781098342308, + "learning_rate": 9.83650321367492e-06, + "loss": 0.441, + "step": 951 + }, + { + "epoch": 0.11, + "grad_norm": 1.6996017858315513, + "learning_rate": 9.83603089071576e-06, + "loss": 0.4772, + "step": 952 + }, + { + "epoch": 0.11, + "grad_norm": 2.3032078793750927, + "learning_rate": 9.835557897870435e-06, + "loss": 0.5559, + "step": 953 + }, + { + "epoch": 0.11, + "grad_norm": 1.9491132367918018, + "learning_rate": 9.835084235204471e-06, + "loss": 0.4787, + "step": 954 + }, + { + "epoch": 0.11, + "grad_norm": 1.912878126767747, + "learning_rate": 9.834609902783477e-06, + "loss": 0.5808, + "step": 955 + }, + { + "epoch": 0.11, + "grad_norm": 1.7471564771229053, + "learning_rate": 9.834134900673153e-06, + "loss": 0.5492, + "step": 956 + }, + { + "epoch": 0.11, + "grad_norm": 14.441560558354487, + "learning_rate": 9.833659228939302e-06, + "loss": 0.5247, + "step": 957 + }, + { + "epoch": 0.11, + "grad_norm": 2.5143158235820677, + "learning_rate": 9.833182887647811e-06, + "loss": 0.5236, + "step": 958 + }, + { + "epoch": 0.11, + "grad_norm": 5.691288573466352, + "learning_rate": 9.832705876864665e-06, + "loss": 0.4817, + "step": 959 + }, + { + "epoch": 0.11, + "grad_norm": 1.7456129629728314, + "learning_rate": 9.832228196655938e-06, + "loss": 0.5525, + "step": 960 + }, + { + "epoch": 0.11, + "grad_norm": 1.8487467436169844, + "learning_rate": 9.831749847087798e-06, + "loss": 0.6213, + "step": 961 + }, + { + "epoch": 0.11, + "grad_norm": 1.738038570015744, + "learning_rate": 9.831270828226505e-06, + "loss": 0.5483, + "step": 962 + }, + { + "epoch": 0.11, + "grad_norm": 2.0962130298462873, + "learning_rate": 9.830791140138414e-06, + "loss": 0.5434, + "step": 963 + }, + { + "epoch": 0.11, + "grad_norm": 1.9203518131355894, + "learning_rate": 9.830310782889972e-06, + "loss": 0.6674, + "step": 964 + }, + { + "epoch": 0.11, + "grad_norm": 1.7137178676939158, + "learning_rate": 9.829829756547715e-06, + "loss": 0.5118, + "step": 965 + }, + { + "epoch": 0.11, + "grad_norm": 2.6125222980536797, + "learning_rate": 9.829348061178278e-06, + "loss": 0.5765, + "step": 966 + }, + { + "epoch": 0.11, + "grad_norm": 1.9994414209720692, + "learning_rate": 9.828865696848384e-06, + "loss": 0.5677, + "step": 967 + }, + { + "epoch": 0.11, + "grad_norm": 2.025442572970201, + "learning_rate": 9.828382663624849e-06, + "loss": 0.5682, + "step": 968 + }, + { + "epoch": 0.11, + "grad_norm": 4.1175025761190795, + "learning_rate": 9.827898961574584e-06, + "loss": 0.5028, + "step": 969 + }, + { + "epoch": 0.11, + "grad_norm": 1.7298350849749924, + "learning_rate": 9.827414590764593e-06, + "loss": 0.5815, + "step": 970 + }, + { + "epoch": 0.11, + "grad_norm": 1.7746048319465775, + "learning_rate": 9.826929551261968e-06, + "loss": 0.5549, + "step": 971 + }, + { + "epoch": 0.11, + "grad_norm": 3.1380357139233985, + "learning_rate": 9.826443843133898e-06, + "loss": 0.6138, + "step": 972 + }, + { + "epoch": 0.11, + "grad_norm": 2.164530910366711, + "learning_rate": 9.82595746644766e-06, + "loss": 0.5356, + "step": 973 + }, + { + "epoch": 0.11, + "grad_norm": 2.273978627254315, + "learning_rate": 9.825470421270632e-06, + "loss": 0.5469, + "step": 974 + }, + { + "epoch": 0.11, + "grad_norm": 3.6984966506367596, + "learning_rate": 9.824982707670277e-06, + "loss": 0.5147, + "step": 975 + }, + { + "epoch": 0.11, + "grad_norm": 2.299334232263952, + "learning_rate": 9.824494325714154e-06, + "loss": 0.5117, + "step": 976 + }, + { + "epoch": 0.11, + "grad_norm": 2.5743987992660804, + "learning_rate": 9.82400527546991e-06, + "loss": 0.591, + "step": 977 + }, + { + "epoch": 0.11, + "grad_norm": 1.8580755112203982, + "learning_rate": 9.823515557005293e-06, + "loss": 0.512, + "step": 978 + }, + { + "epoch": 0.11, + "grad_norm": 1.8638682917143672, + "learning_rate": 9.823025170388135e-06, + "loss": 0.547, + "step": 979 + }, + { + "epoch": 0.11, + "grad_norm": 3.1109071694765453, + "learning_rate": 9.822534115686367e-06, + "loss": 0.4884, + "step": 980 + }, + { + "epoch": 0.11, + "grad_norm": 1.044077980664823, + "learning_rate": 9.822042392968007e-06, + "loss": 0.7641, + "step": 981 + }, + { + "epoch": 0.11, + "grad_norm": 1.8155165003466838, + "learning_rate": 9.82155000230117e-06, + "loss": 0.5415, + "step": 982 + }, + { + "epoch": 0.11, + "grad_norm": 0.9345341258804145, + "learning_rate": 9.821056943754064e-06, + "loss": 0.7727, + "step": 983 + }, + { + "epoch": 0.11, + "grad_norm": 1.9525081260293182, + "learning_rate": 9.820563217394985e-06, + "loss": 0.4924, + "step": 984 + }, + { + "epoch": 0.11, + "grad_norm": 5.083293606210164, + "learning_rate": 9.820068823292323e-06, + "loss": 0.5863, + "step": 985 + }, + { + "epoch": 0.11, + "grad_norm": 1.7043308629573626, + "learning_rate": 9.819573761514562e-06, + "loss": 0.5528, + "step": 986 + }, + { + "epoch": 0.11, + "grad_norm": 1.947890519378382, + "learning_rate": 9.819078032130278e-06, + "loss": 0.3827, + "step": 987 + }, + { + "epoch": 0.11, + "grad_norm": 1.0580015993585363, + "learning_rate": 9.818581635208141e-06, + "loss": 0.7479, + "step": 988 + }, + { + "epoch": 0.11, + "grad_norm": 1.9249410911869198, + "learning_rate": 9.81808457081691e-06, + "loss": 0.5544, + "step": 989 + }, + { + "epoch": 0.11, + "grad_norm": 2.5582906404529258, + "learning_rate": 9.817586839025439e-06, + "loss": 0.5231, + "step": 990 + }, + { + "epoch": 0.11, + "grad_norm": 2.0019333354053246, + "learning_rate": 9.817088439902673e-06, + "loss": 0.4995, + "step": 991 + }, + { + "epoch": 0.11, + "grad_norm": 2.9761655942962517, + "learning_rate": 9.81658937351765e-06, + "loss": 0.5132, + "step": 992 + }, + { + "epoch": 0.11, + "grad_norm": 2.3176701843037417, + "learning_rate": 9.816089639939503e-06, + "loss": 0.604, + "step": 993 + }, + { + "epoch": 0.11, + "grad_norm": 1.8930942080860391, + "learning_rate": 9.815589239237452e-06, + "loss": 0.5321, + "step": 994 + }, + { + "epoch": 0.11, + "grad_norm": 1.7072557581459284, + "learning_rate": 9.815088171480815e-06, + "loss": 0.5383, + "step": 995 + }, + { + "epoch": 0.11, + "grad_norm": 1.67107596558961, + "learning_rate": 9.814586436738998e-06, + "loss": 0.641, + "step": 996 + }, + { + "epoch": 0.11, + "grad_norm": 2.1096878604045863, + "learning_rate": 9.8140840350815e-06, + "loss": 0.5115, + "step": 997 + }, + { + "epoch": 0.11, + "grad_norm": 1.6414556169990353, + "learning_rate": 9.813580966577916e-06, + "loss": 0.5595, + "step": 998 + }, + { + "epoch": 0.11, + "grad_norm": 1.9958978624521275, + "learning_rate": 9.813077231297931e-06, + "loss": 0.6052, + "step": 999 + }, + { + "epoch": 0.11, + "grad_norm": 3.76854022219441, + "learning_rate": 9.812572829311322e-06, + "loss": 0.5108, + "step": 1000 + }, + { + "epoch": 0.12, + "grad_norm": 1.941674310946483, + "learning_rate": 9.812067760687957e-06, + "loss": 0.689, + "step": 1001 + }, + { + "epoch": 0.12, + "grad_norm": 2.3063096635957803, + "learning_rate": 9.811562025497801e-06, + "loss": 0.5033, + "step": 1002 + }, + { + "epoch": 0.12, + "grad_norm": 3.2500743412745656, + "learning_rate": 9.811055623810906e-06, + "loss": 0.585, + "step": 1003 + }, + { + "epoch": 0.12, + "grad_norm": 2.5046944295941724, + "learning_rate": 9.810548555697419e-06, + "loss": 0.5725, + "step": 1004 + }, + { + "epoch": 0.12, + "grad_norm": 2.353133760303956, + "learning_rate": 9.810040821227577e-06, + "loss": 0.4871, + "step": 1005 + }, + { + "epoch": 0.12, + "grad_norm": 1.4087881231531962, + "learning_rate": 9.809532420471716e-06, + "loss": 0.4698, + "step": 1006 + }, + { + "epoch": 0.12, + "grad_norm": 1.9266873696174018, + "learning_rate": 9.809023353500258e-06, + "loss": 0.4479, + "step": 1007 + }, + { + "epoch": 0.12, + "grad_norm": 1.5945729709139935, + "learning_rate": 9.808513620383715e-06, + "loss": 0.6815, + "step": 1008 + }, + { + "epoch": 0.12, + "grad_norm": 11.22999088069897, + "learning_rate": 9.808003221192701e-06, + "loss": 0.4737, + "step": 1009 + }, + { + "epoch": 0.12, + "grad_norm": 1.856188351218398, + "learning_rate": 9.807492155997913e-06, + "loss": 0.5685, + "step": 1010 + }, + { + "epoch": 0.12, + "grad_norm": 2.3487601762725494, + "learning_rate": 9.806980424870142e-06, + "loss": 0.541, + "step": 1011 + }, + { + "epoch": 0.12, + "grad_norm": 2.3261918867485076, + "learning_rate": 9.806468027880278e-06, + "loss": 0.3967, + "step": 1012 + }, + { + "epoch": 0.12, + "grad_norm": 2.949313300590524, + "learning_rate": 9.805954965099294e-06, + "loss": 0.5587, + "step": 1013 + }, + { + "epoch": 0.12, + "grad_norm": 1.5211627589293955, + "learning_rate": 9.80544123659826e-06, + "loss": 0.498, + "step": 1014 + }, + { + "epoch": 0.12, + "grad_norm": 2.0690696886413193, + "learning_rate": 9.80492684244834e-06, + "loss": 0.5682, + "step": 1015 + }, + { + "epoch": 0.12, + "grad_norm": 1.8183912807707936, + "learning_rate": 9.804411782720786e-06, + "loss": 0.571, + "step": 1016 + }, + { + "epoch": 0.12, + "grad_norm": 1.8692643637832056, + "learning_rate": 9.803896057486942e-06, + "loss": 0.5453, + "step": 1017 + }, + { + "epoch": 0.12, + "grad_norm": 2.5892962792207372, + "learning_rate": 9.803379666818249e-06, + "loss": 0.5541, + "step": 1018 + }, + { + "epoch": 0.12, + "grad_norm": 1.6927202272783397, + "learning_rate": 9.802862610786237e-06, + "loss": 0.4732, + "step": 1019 + }, + { + "epoch": 0.12, + "grad_norm": 1.7437796497847078, + "learning_rate": 9.802344889462528e-06, + "loss": 0.4851, + "step": 1020 + }, + { + "epoch": 0.12, + "grad_norm": 2.8537865279881567, + "learning_rate": 9.801826502918836e-06, + "loss": 0.489, + "step": 1021 + }, + { + "epoch": 0.12, + "grad_norm": 1.0277338882267102, + "learning_rate": 9.80130745122697e-06, + "loss": 0.7973, + "step": 1022 + }, + { + "epoch": 0.12, + "grad_norm": 3.031507196818382, + "learning_rate": 9.800787734458827e-06, + "loss": 0.5246, + "step": 1023 + }, + { + "epoch": 0.12, + "grad_norm": 1.7059599738915348, + "learning_rate": 9.800267352686398e-06, + "loss": 0.6705, + "step": 1024 + }, + { + "epoch": 0.12, + "grad_norm": 1.5229903092206543, + "learning_rate": 9.799746305981766e-06, + "loss": 0.4647, + "step": 1025 + }, + { + "epoch": 0.12, + "grad_norm": 4.759909907868852, + "learning_rate": 9.799224594417109e-06, + "loss": 0.5765, + "step": 1026 + }, + { + "epoch": 0.12, + "grad_norm": 0.9096652099559779, + "learning_rate": 9.79870221806469e-06, + "loss": 0.731, + "step": 1027 + }, + { + "epoch": 0.12, + "grad_norm": 1.903135673912431, + "learning_rate": 9.798179176996873e-06, + "loss": 0.6371, + "step": 1028 + }, + { + "epoch": 0.12, + "grad_norm": 1.8329565257170692, + "learning_rate": 9.797655471286106e-06, + "loss": 0.477, + "step": 1029 + }, + { + "epoch": 0.12, + "grad_norm": 1.6803635725336725, + "learning_rate": 9.797131101004935e-06, + "loss": 0.5654, + "step": 1030 + }, + { + "epoch": 0.12, + "grad_norm": 2.717672175188184, + "learning_rate": 9.796606066225996e-06, + "loss": 0.5536, + "step": 1031 + }, + { + "epoch": 0.12, + "grad_norm": 2.2102161061776915, + "learning_rate": 9.796080367022013e-06, + "loss": 0.5547, + "step": 1032 + }, + { + "epoch": 0.12, + "grad_norm": 2.1385793732095264, + "learning_rate": 9.795554003465809e-06, + "loss": 0.5941, + "step": 1033 + }, + { + "epoch": 0.12, + "grad_norm": 2.0963068014442805, + "learning_rate": 9.795026975630295e-06, + "loss": 0.6031, + "step": 1034 + }, + { + "epoch": 0.12, + "grad_norm": 2.369299885865623, + "learning_rate": 9.794499283588475e-06, + "loss": 0.5379, + "step": 1035 + }, + { + "epoch": 0.12, + "grad_norm": 2.104055127260396, + "learning_rate": 9.793970927413446e-06, + "loss": 0.6338, + "step": 1036 + }, + { + "epoch": 0.12, + "grad_norm": 2.0858628883242756, + "learning_rate": 9.793441907178393e-06, + "loss": 0.6015, + "step": 1037 + }, + { + "epoch": 0.12, + "grad_norm": 2.088834293040691, + "learning_rate": 9.792912222956597e-06, + "loss": 0.6317, + "step": 1038 + }, + { + "epoch": 0.12, + "grad_norm": 2.0129724876822115, + "learning_rate": 9.792381874821431e-06, + "loss": 0.4724, + "step": 1039 + }, + { + "epoch": 0.12, + "grad_norm": 1.7630642652722168, + "learning_rate": 9.791850862846358e-06, + "loss": 0.5239, + "step": 1040 + }, + { + "epoch": 0.12, + "grad_norm": 1.9889206355862572, + "learning_rate": 9.791319187104932e-06, + "loss": 0.4562, + "step": 1041 + }, + { + "epoch": 0.12, + "grad_norm": 1.5286500563588563, + "learning_rate": 9.790786847670803e-06, + "loss": 0.5211, + "step": 1042 + }, + { + "epoch": 0.12, + "grad_norm": 3.74850734360135, + "learning_rate": 9.79025384461771e-06, + "loss": 0.5711, + "step": 1043 + }, + { + "epoch": 0.12, + "grad_norm": 1.653281161769676, + "learning_rate": 9.789720178019483e-06, + "loss": 0.4858, + "step": 1044 + }, + { + "epoch": 0.12, + "grad_norm": 1.7634185634687276, + "learning_rate": 9.789185847950048e-06, + "loss": 0.5161, + "step": 1045 + }, + { + "epoch": 0.12, + "grad_norm": 1.8256191538644848, + "learning_rate": 9.788650854483418e-06, + "loss": 0.5129, + "step": 1046 + }, + { + "epoch": 0.12, + "grad_norm": 1.7558127218892805, + "learning_rate": 9.788115197693702e-06, + "loss": 0.6176, + "step": 1047 + }, + { + "epoch": 0.12, + "grad_norm": 1.975730899011124, + "learning_rate": 9.787578877655097e-06, + "loss": 0.5984, + "step": 1048 + }, + { + "epoch": 0.12, + "grad_norm": 1.9143506115996631, + "learning_rate": 9.787041894441895e-06, + "loss": 0.5623, + "step": 1049 + }, + { + "epoch": 0.12, + "grad_norm": 2.8367601667559863, + "learning_rate": 9.78650424812848e-06, + "loss": 0.4863, + "step": 1050 + }, + { + "epoch": 0.12, + "grad_norm": 2.7923229183103575, + "learning_rate": 9.785965938789324e-06, + "loss": 0.5821, + "step": 1051 + }, + { + "epoch": 0.12, + "grad_norm": 1.7746745276691676, + "learning_rate": 9.785426966498997e-06, + "loss": 0.4848, + "step": 1052 + }, + { + "epoch": 0.12, + "grad_norm": 2.6056208176167677, + "learning_rate": 9.784887331332153e-06, + "loss": 0.5214, + "step": 1053 + }, + { + "epoch": 0.12, + "grad_norm": 1.8536291617414775, + "learning_rate": 9.784347033363548e-06, + "loss": 0.5252, + "step": 1054 + }, + { + "epoch": 0.12, + "grad_norm": 1.7311654246122938, + "learning_rate": 9.783806072668018e-06, + "loss": 0.5526, + "step": 1055 + }, + { + "epoch": 0.12, + "grad_norm": 3.624988059364019, + "learning_rate": 9.7832644493205e-06, + "loss": 0.4739, + "step": 1056 + }, + { + "epoch": 0.12, + "grad_norm": 2.044997115353559, + "learning_rate": 9.782722163396019e-06, + "loss": 0.5295, + "step": 1057 + }, + { + "epoch": 0.12, + "grad_norm": 0.9606906286989311, + "learning_rate": 9.782179214969693e-06, + "loss": 0.7692, + "step": 1058 + }, + { + "epoch": 0.12, + "grad_norm": 2.6145329688566057, + "learning_rate": 9.781635604116731e-06, + "loss": 0.5612, + "step": 1059 + }, + { + "epoch": 0.12, + "grad_norm": 1.723799689000995, + "learning_rate": 9.78109133091243e-06, + "loss": 0.543, + "step": 1060 + }, + { + "epoch": 0.12, + "grad_norm": 1.5940788671836397, + "learning_rate": 9.780546395432188e-06, + "loss": 0.511, + "step": 1061 + }, + { + "epoch": 0.12, + "grad_norm": 1.6206052200842596, + "learning_rate": 9.780000797751489e-06, + "loss": 0.6019, + "step": 1062 + }, + { + "epoch": 0.12, + "grad_norm": 2.5176849367769005, + "learning_rate": 9.779454537945906e-06, + "loss": 0.653, + "step": 1063 + }, + { + "epoch": 0.12, + "grad_norm": 2.158508397858273, + "learning_rate": 9.778907616091108e-06, + "loss": 0.5375, + "step": 1064 + }, + { + "epoch": 0.12, + "grad_norm": 1.8051967241073346, + "learning_rate": 9.778360032262855e-06, + "loss": 0.5596, + "step": 1065 + }, + { + "epoch": 0.12, + "grad_norm": 2.279184037575354, + "learning_rate": 9.777811786536997e-06, + "loss": 0.6517, + "step": 1066 + }, + { + "epoch": 0.12, + "grad_norm": 2.6064123951606217, + "learning_rate": 9.777262878989479e-06, + "loss": 0.5744, + "step": 1067 + }, + { + "epoch": 0.12, + "grad_norm": 1.5453461124778967, + "learning_rate": 9.776713309696335e-06, + "loss": 0.5056, + "step": 1068 + }, + { + "epoch": 0.12, + "grad_norm": 1.7925892276212625, + "learning_rate": 9.77616307873369e-06, + "loss": 0.5721, + "step": 1069 + }, + { + "epoch": 0.12, + "grad_norm": 2.2840604791139123, + "learning_rate": 9.775612186177762e-06, + "loss": 0.5716, + "step": 1070 + }, + { + "epoch": 0.12, + "grad_norm": 1.820901935750855, + "learning_rate": 9.775060632104862e-06, + "loss": 0.5253, + "step": 1071 + }, + { + "epoch": 0.12, + "grad_norm": 2.7963163768707844, + "learning_rate": 9.77450841659139e-06, + "loss": 0.4414, + "step": 1072 + }, + { + "epoch": 0.12, + "grad_norm": 1.9375016601534734, + "learning_rate": 9.77395553971384e-06, + "loss": 0.6388, + "step": 1073 + }, + { + "epoch": 0.12, + "grad_norm": 2.1537371407988473, + "learning_rate": 9.773402001548794e-06, + "loss": 0.4858, + "step": 1074 + }, + { + "epoch": 0.12, + "grad_norm": 1.82832547448926, + "learning_rate": 9.77284780217293e-06, + "loss": 0.5245, + "step": 1075 + }, + { + "epoch": 0.12, + "grad_norm": 1.9660033833108304, + "learning_rate": 9.772292941663015e-06, + "loss": 0.5771, + "step": 1076 + }, + { + "epoch": 0.12, + "grad_norm": 1.9363558696402283, + "learning_rate": 9.771737420095908e-06, + "loss": 0.5608, + "step": 1077 + }, + { + "epoch": 0.12, + "grad_norm": 1.6636530012885635, + "learning_rate": 9.77118123754856e-06, + "loss": 0.5623, + "step": 1078 + }, + { + "epoch": 0.12, + "grad_norm": 1.7082104492170138, + "learning_rate": 9.770624394098015e-06, + "loss": 0.5576, + "step": 1079 + }, + { + "epoch": 0.12, + "grad_norm": 1.953556156305212, + "learning_rate": 9.770066889821403e-06, + "loss": 0.5186, + "step": 1080 + }, + { + "epoch": 0.12, + "grad_norm": 1.652021280426804, + "learning_rate": 9.769508724795953e-06, + "loss": 0.5925, + "step": 1081 + }, + { + "epoch": 0.12, + "grad_norm": 1.866883944372643, + "learning_rate": 9.768949899098981e-06, + "loss": 0.5017, + "step": 1082 + }, + { + "epoch": 0.12, + "grad_norm": 2.8288837934201587, + "learning_rate": 9.768390412807894e-06, + "loss": 0.4962, + "step": 1083 + }, + { + "epoch": 0.12, + "grad_norm": 1.747587610245608, + "learning_rate": 9.767830266000194e-06, + "loss": 0.5369, + "step": 1084 + }, + { + "epoch": 0.12, + "grad_norm": 1.8446737517151766, + "learning_rate": 9.76726945875347e-06, + "loss": 0.6093, + "step": 1085 + }, + { + "epoch": 0.12, + "grad_norm": 1.0528630017289824, + "learning_rate": 9.766707991145407e-06, + "loss": 0.7344, + "step": 1086 + }, + { + "epoch": 0.12, + "grad_norm": 1.498483165262252, + "learning_rate": 9.766145863253778e-06, + "loss": 0.5354, + "step": 1087 + }, + { + "epoch": 0.13, + "grad_norm": 2.525873089272019, + "learning_rate": 9.765583075156451e-06, + "loss": 0.5475, + "step": 1088 + }, + { + "epoch": 0.13, + "grad_norm": 1.8915856670547946, + "learning_rate": 9.76501962693138e-06, + "loss": 0.5054, + "step": 1089 + }, + { + "epoch": 0.13, + "grad_norm": 2.202894083963947, + "learning_rate": 9.764455518656617e-06, + "loss": 0.6191, + "step": 1090 + }, + { + "epoch": 0.13, + "grad_norm": 3.048323948953868, + "learning_rate": 9.7638907504103e-06, + "loss": 0.6221, + "step": 1091 + }, + { + "epoch": 0.13, + "grad_norm": 2.859390596744628, + "learning_rate": 9.763325322270663e-06, + "loss": 0.5485, + "step": 1092 + }, + { + "epoch": 0.13, + "grad_norm": 1.6296253935496907, + "learning_rate": 9.762759234316026e-06, + "loss": 0.5687, + "step": 1093 + }, + { + "epoch": 0.13, + "grad_norm": 2.106137614144312, + "learning_rate": 9.762192486624805e-06, + "loss": 0.5949, + "step": 1094 + }, + { + "epoch": 0.13, + "grad_norm": 18.77649770770319, + "learning_rate": 9.761625079275506e-06, + "loss": 0.561, + "step": 1095 + }, + { + "epoch": 0.13, + "grad_norm": 2.135307080314248, + "learning_rate": 9.761057012346724e-06, + "loss": 0.5587, + "step": 1096 + }, + { + "epoch": 0.13, + "grad_norm": 1.0694752423475116, + "learning_rate": 9.760488285917152e-06, + "loss": 0.8036, + "step": 1097 + }, + { + "epoch": 0.13, + "grad_norm": 1.6986331298285788, + "learning_rate": 9.759918900065564e-06, + "loss": 0.4926, + "step": 1098 + }, + { + "epoch": 0.13, + "grad_norm": 2.0883980159974667, + "learning_rate": 9.759348854870836e-06, + "loss": 0.5514, + "step": 1099 + }, + { + "epoch": 0.13, + "grad_norm": 2.2420830049049103, + "learning_rate": 9.75877815041193e-06, + "loss": 0.4438, + "step": 1100 + }, + { + "epoch": 0.13, + "grad_norm": 1.7496028999049695, + "learning_rate": 9.758206786767897e-06, + "loss": 0.5964, + "step": 1101 + }, + { + "epoch": 0.13, + "grad_norm": 1.8413880447299287, + "learning_rate": 9.757634764017885e-06, + "loss": 0.541, + "step": 1102 + }, + { + "epoch": 0.13, + "grad_norm": 2.8311128352460027, + "learning_rate": 9.75706208224113e-06, + "loss": 0.5505, + "step": 1103 + }, + { + "epoch": 0.13, + "grad_norm": 2.925705189991499, + "learning_rate": 9.756488741516958e-06, + "loss": 0.5832, + "step": 1104 + }, + { + "epoch": 0.13, + "grad_norm": 2.5435021417437853, + "learning_rate": 9.75591474192479e-06, + "loss": 0.5065, + "step": 1105 + }, + { + "epoch": 0.13, + "grad_norm": 1.798812617974512, + "learning_rate": 9.755340083544138e-06, + "loss": 0.4975, + "step": 1106 + }, + { + "epoch": 0.13, + "grad_norm": 1.6728234741591104, + "learning_rate": 9.754764766454598e-06, + "loss": 0.5916, + "step": 1107 + }, + { + "epoch": 0.13, + "grad_norm": 1.6835967492709578, + "learning_rate": 9.754188790735867e-06, + "loss": 0.5969, + "step": 1108 + }, + { + "epoch": 0.13, + "grad_norm": 2.1458369809479296, + "learning_rate": 9.75361215646773e-06, + "loss": 0.5379, + "step": 1109 + }, + { + "epoch": 0.13, + "grad_norm": 1.9384734720719554, + "learning_rate": 9.753034863730058e-06, + "loss": 0.5354, + "step": 1110 + }, + { + "epoch": 0.13, + "grad_norm": 2.9286885421673534, + "learning_rate": 9.752456912602821e-06, + "loss": 0.549, + "step": 1111 + }, + { + "epoch": 0.13, + "grad_norm": 2.0214374862873097, + "learning_rate": 9.751878303166076e-06, + "loss": 0.5268, + "step": 1112 + }, + { + "epoch": 0.13, + "grad_norm": 2.1005791139988825, + "learning_rate": 9.75129903549997e-06, + "loss": 0.5557, + "step": 1113 + }, + { + "epoch": 0.13, + "grad_norm": 1.0012338151437288, + "learning_rate": 9.750719109684746e-06, + "loss": 0.7965, + "step": 1114 + }, + { + "epoch": 0.13, + "grad_norm": 2.1964835138276273, + "learning_rate": 9.750138525800732e-06, + "loss": 0.5391, + "step": 1115 + }, + { + "epoch": 0.13, + "grad_norm": 1.8551273669675743, + "learning_rate": 9.749557283928354e-06, + "loss": 0.5415, + "step": 1116 + }, + { + "epoch": 0.13, + "grad_norm": 2.1977387206178816, + "learning_rate": 9.748975384148123e-06, + "loss": 0.6123, + "step": 1117 + }, + { + "epoch": 0.13, + "grad_norm": 0.8800907563680433, + "learning_rate": 9.748392826540645e-06, + "loss": 0.7572, + "step": 1118 + }, + { + "epoch": 0.13, + "grad_norm": 2.0690865746295914, + "learning_rate": 9.747809611186614e-06, + "loss": 0.565, + "step": 1119 + }, + { + "epoch": 0.13, + "grad_norm": 2.5268434924081653, + "learning_rate": 9.747225738166818e-06, + "loss": 0.5321, + "step": 1120 + }, + { + "epoch": 0.13, + "grad_norm": 3.25232847379261, + "learning_rate": 9.746641207562137e-06, + "loss": 0.5756, + "step": 1121 + }, + { + "epoch": 0.13, + "grad_norm": 2.0827120147142124, + "learning_rate": 9.746056019453536e-06, + "loss": 0.5586, + "step": 1122 + }, + { + "epoch": 0.13, + "grad_norm": 2.1805738741584695, + "learning_rate": 9.745470173922078e-06, + "loss": 0.5393, + "step": 1123 + }, + { + "epoch": 0.13, + "grad_norm": 1.598898635078897, + "learning_rate": 9.744883671048912e-06, + "loss": 0.5129, + "step": 1124 + }, + { + "epoch": 0.13, + "grad_norm": 2.2741894126822686, + "learning_rate": 9.744296510915285e-06, + "loss": 0.4429, + "step": 1125 + }, + { + "epoch": 0.13, + "grad_norm": 1.7305633629946673, + "learning_rate": 9.743708693602526e-06, + "loss": 0.5911, + "step": 1126 + }, + { + "epoch": 0.13, + "grad_norm": 2.0696916797211156, + "learning_rate": 9.743120219192057e-06, + "loss": 0.551, + "step": 1127 + }, + { + "epoch": 0.13, + "grad_norm": 1.7268648989762236, + "learning_rate": 9.7425310877654e-06, + "loss": 0.6358, + "step": 1128 + }, + { + "epoch": 0.13, + "grad_norm": 1.7161246235019867, + "learning_rate": 9.741941299404157e-06, + "loss": 0.4771, + "step": 1129 + }, + { + "epoch": 0.13, + "grad_norm": 2.4477345262586394, + "learning_rate": 9.741350854190028e-06, + "loss": 0.5041, + "step": 1130 + }, + { + "epoch": 0.13, + "grad_norm": 1.8153622433324048, + "learning_rate": 9.740759752204798e-06, + "loss": 0.5874, + "step": 1131 + }, + { + "epoch": 0.13, + "grad_norm": 2.153402684992974, + "learning_rate": 9.74016799353035e-06, + "loss": 0.5805, + "step": 1132 + }, + { + "epoch": 0.13, + "grad_norm": 1.032897770318943, + "learning_rate": 9.739575578248652e-06, + "loss": 0.8022, + "step": 1133 + }, + { + "epoch": 0.13, + "grad_norm": 2.489877394969998, + "learning_rate": 9.738982506441765e-06, + "loss": 0.5259, + "step": 1134 + }, + { + "epoch": 0.13, + "grad_norm": 1.7767384672829438, + "learning_rate": 9.738388778191842e-06, + "loss": 0.4944, + "step": 1135 + }, + { + "epoch": 0.13, + "grad_norm": 2.0808299114652224, + "learning_rate": 9.737794393581125e-06, + "loss": 0.5524, + "step": 1136 + }, + { + "epoch": 0.13, + "grad_norm": 2.0024919052189283, + "learning_rate": 9.737199352691952e-06, + "loss": 0.5834, + "step": 1137 + }, + { + "epoch": 0.13, + "grad_norm": 1.9283759813613832, + "learning_rate": 9.736603655606744e-06, + "loss": 0.5039, + "step": 1138 + }, + { + "epoch": 0.13, + "grad_norm": 1.610968904185747, + "learning_rate": 9.73600730240802e-06, + "loss": 0.5993, + "step": 1139 + }, + { + "epoch": 0.13, + "grad_norm": 2.4684272282164907, + "learning_rate": 9.735410293178382e-06, + "loss": 0.5081, + "step": 1140 + }, + { + "epoch": 0.13, + "grad_norm": 2.3822223808973453, + "learning_rate": 9.73481262800053e-06, + "loss": 0.5757, + "step": 1141 + }, + { + "epoch": 0.13, + "grad_norm": 1.5326342351823343, + "learning_rate": 9.734214306957255e-06, + "loss": 0.5317, + "step": 1142 + }, + { + "epoch": 0.13, + "grad_norm": 1.7957597692658716, + "learning_rate": 9.733615330131432e-06, + "loss": 0.5374, + "step": 1143 + }, + { + "epoch": 0.13, + "grad_norm": 1.8075284868496324, + "learning_rate": 9.733015697606036e-06, + "loss": 0.5659, + "step": 1144 + }, + { + "epoch": 0.13, + "grad_norm": 1.4806346999379807, + "learning_rate": 9.732415409464124e-06, + "loss": 0.5322, + "step": 1145 + }, + { + "epoch": 0.13, + "grad_norm": 12.019984092695893, + "learning_rate": 9.73181446578885e-06, + "loss": 0.5741, + "step": 1146 + }, + { + "epoch": 0.13, + "grad_norm": 2.192169125700534, + "learning_rate": 9.731212866663453e-06, + "loss": 0.5418, + "step": 1147 + }, + { + "epoch": 0.13, + "grad_norm": 1.7722865380127528, + "learning_rate": 9.730610612171272e-06, + "loss": 0.583, + "step": 1148 + }, + { + "epoch": 0.13, + "grad_norm": 1.8945520109358525, + "learning_rate": 9.730007702395728e-06, + "loss": 0.5234, + "step": 1149 + }, + { + "epoch": 0.13, + "grad_norm": 1.6772111050392062, + "learning_rate": 9.729404137420335e-06, + "loss": 0.5266, + "step": 1150 + }, + { + "epoch": 0.13, + "grad_norm": 3.32089407070629, + "learning_rate": 9.7287999173287e-06, + "loss": 0.5213, + "step": 1151 + }, + { + "epoch": 0.13, + "grad_norm": 3.1159783669073793, + "learning_rate": 9.728195042204522e-06, + "loss": 0.5247, + "step": 1152 + }, + { + "epoch": 0.13, + "grad_norm": 1.7223782909228391, + "learning_rate": 9.727589512131583e-06, + "loss": 0.5422, + "step": 1153 + }, + { + "epoch": 0.13, + "grad_norm": 2.2783065543852263, + "learning_rate": 9.726983327193764e-06, + "loss": 0.577, + "step": 1154 + }, + { + "epoch": 0.13, + "grad_norm": 1.9389354648862989, + "learning_rate": 9.726376487475035e-06, + "loss": 0.6023, + "step": 1155 + }, + { + "epoch": 0.13, + "grad_norm": 2.1984364573021744, + "learning_rate": 9.725768993059452e-06, + "loss": 0.5586, + "step": 1156 + }, + { + "epoch": 0.13, + "grad_norm": 2.048867249349575, + "learning_rate": 9.725160844031168e-06, + "loss": 0.6111, + "step": 1157 + }, + { + "epoch": 0.13, + "grad_norm": 1.7932938579703839, + "learning_rate": 9.724552040474421e-06, + "loss": 0.6174, + "step": 1158 + }, + { + "epoch": 0.13, + "grad_norm": 5.354668452481259, + "learning_rate": 9.723942582473545e-06, + "loss": 0.5754, + "step": 1159 + }, + { + "epoch": 0.13, + "grad_norm": 1.8310381019549715, + "learning_rate": 9.723332470112959e-06, + "loss": 0.6152, + "step": 1160 + }, + { + "epoch": 0.13, + "grad_norm": 1.9904506036507443, + "learning_rate": 9.722721703477178e-06, + "loss": 0.5711, + "step": 1161 + }, + { + "epoch": 0.13, + "grad_norm": 1.645971158890774, + "learning_rate": 9.722110282650805e-06, + "loss": 0.5389, + "step": 1162 + }, + { + "epoch": 0.13, + "grad_norm": 2.435873000889471, + "learning_rate": 9.721498207718533e-06, + "loss": 0.4873, + "step": 1163 + }, + { + "epoch": 0.13, + "grad_norm": 2.105386946230954, + "learning_rate": 9.720885478765147e-06, + "loss": 0.565, + "step": 1164 + }, + { + "epoch": 0.13, + "grad_norm": 1.7215944844426054, + "learning_rate": 9.720272095875523e-06, + "loss": 0.5849, + "step": 1165 + }, + { + "epoch": 0.13, + "grad_norm": 1.7952740292959033, + "learning_rate": 9.719658059134624e-06, + "loss": 0.4681, + "step": 1166 + }, + { + "epoch": 0.13, + "grad_norm": 1.8564598935283374, + "learning_rate": 9.719043368627511e-06, + "loss": 0.4824, + "step": 1167 + }, + { + "epoch": 0.13, + "grad_norm": 2.164212558837668, + "learning_rate": 9.718428024439326e-06, + "loss": 0.5809, + "step": 1168 + }, + { + "epoch": 0.13, + "grad_norm": 2.310640636623551, + "learning_rate": 9.717812026655308e-06, + "loss": 0.4249, + "step": 1169 + }, + { + "epoch": 0.13, + "grad_norm": 1.7256183847388988, + "learning_rate": 9.717195375360786e-06, + "loss": 0.5632, + "step": 1170 + }, + { + "epoch": 0.13, + "grad_norm": 1.8898905408970166, + "learning_rate": 9.716578070641178e-06, + "loss": 0.5783, + "step": 1171 + }, + { + "epoch": 0.13, + "grad_norm": 2.5991991034036723, + "learning_rate": 9.715960112581992e-06, + "loss": 0.5394, + "step": 1172 + }, + { + "epoch": 0.13, + "grad_norm": 1.7620975711020934, + "learning_rate": 9.715341501268828e-06, + "loss": 0.5958, + "step": 1173 + }, + { + "epoch": 0.13, + "grad_norm": 1.6573539687568224, + "learning_rate": 9.714722236787377e-06, + "loss": 0.5954, + "step": 1174 + }, + { + "epoch": 0.14, + "grad_norm": 2.0955382122552138, + "learning_rate": 9.714102319223417e-06, + "loss": 0.5094, + "step": 1175 + }, + { + "epoch": 0.14, + "grad_norm": 1.9211957238906248, + "learning_rate": 9.71348174866282e-06, + "loss": 0.5573, + "step": 1176 + }, + { + "epoch": 0.14, + "grad_norm": 2.104246197541215, + "learning_rate": 9.71286052519155e-06, + "loss": 0.5491, + "step": 1177 + }, + { + "epoch": 0.14, + "grad_norm": 2.3347678672012937, + "learning_rate": 9.712238648895655e-06, + "loss": 0.5579, + "step": 1178 + }, + { + "epoch": 0.14, + "grad_norm": 2.0362854768757415, + "learning_rate": 9.711616119861278e-06, + "loss": 0.5169, + "step": 1179 + }, + { + "epoch": 0.14, + "grad_norm": 2.37708880810182, + "learning_rate": 9.710992938174653e-06, + "loss": 0.5069, + "step": 1180 + }, + { + "epoch": 0.14, + "grad_norm": 1.8067092512213696, + "learning_rate": 9.710369103922101e-06, + "loss": 0.5704, + "step": 1181 + }, + { + "epoch": 0.14, + "grad_norm": 1.0319874695569398, + "learning_rate": 9.709744617190039e-06, + "loss": 0.8075, + "step": 1182 + }, + { + "epoch": 0.14, + "grad_norm": 2.1246490879670197, + "learning_rate": 9.709119478064965e-06, + "loss": 0.5285, + "step": 1183 + }, + { + "epoch": 0.14, + "grad_norm": 0.9896056288122467, + "learning_rate": 9.708493686633479e-06, + "loss": 0.818, + "step": 1184 + }, + { + "epoch": 0.14, + "grad_norm": 2.0602780374139904, + "learning_rate": 9.70786724298226e-06, + "loss": 0.5404, + "step": 1185 + }, + { + "epoch": 0.14, + "grad_norm": 2.123247347910237, + "learning_rate": 9.707240147198089e-06, + "loss": 0.6294, + "step": 1186 + }, + { + "epoch": 0.14, + "grad_norm": 1.7564545726358114, + "learning_rate": 9.706612399367828e-06, + "loss": 0.5064, + "step": 1187 + }, + { + "epoch": 0.14, + "grad_norm": 2.0146727228599492, + "learning_rate": 9.705983999578433e-06, + "loss": 0.5689, + "step": 1188 + }, + { + "epoch": 0.14, + "grad_norm": 2.7106021183468805, + "learning_rate": 9.705354947916947e-06, + "loss": 0.5307, + "step": 1189 + }, + { + "epoch": 0.14, + "grad_norm": 1.5624719435084782, + "learning_rate": 9.704725244470509e-06, + "loss": 0.5062, + "step": 1190 + }, + { + "epoch": 0.14, + "grad_norm": 1.7638372444772765, + "learning_rate": 9.704094889326347e-06, + "loss": 0.5707, + "step": 1191 + }, + { + "epoch": 0.14, + "grad_norm": 2.7416476443128306, + "learning_rate": 9.703463882571775e-06, + "loss": 0.4612, + "step": 1192 + }, + { + "epoch": 0.14, + "grad_norm": 1.8788622027847808, + "learning_rate": 9.7028322242942e-06, + "loss": 0.5972, + "step": 1193 + }, + { + "epoch": 0.14, + "grad_norm": 1.5947278035285033, + "learning_rate": 9.70219991458112e-06, + "loss": 0.4544, + "step": 1194 + }, + { + "epoch": 0.14, + "grad_norm": 1.6834339839276224, + "learning_rate": 9.701566953520123e-06, + "loss": 0.4592, + "step": 1195 + }, + { + "epoch": 0.14, + "grad_norm": 1.6980620087124993, + "learning_rate": 9.700933341198885e-06, + "loss": 0.5742, + "step": 1196 + }, + { + "epoch": 0.14, + "grad_norm": 1.9665219047094553, + "learning_rate": 9.700299077705176e-06, + "loss": 0.5163, + "step": 1197 + }, + { + "epoch": 0.14, + "grad_norm": 2.0496349298879504, + "learning_rate": 9.699664163126851e-06, + "loss": 0.4949, + "step": 1198 + }, + { + "epoch": 0.14, + "grad_norm": 1.6394137764220442, + "learning_rate": 9.699028597551862e-06, + "loss": 0.568, + "step": 1199 + }, + { + "epoch": 0.14, + "grad_norm": 1.9774859862632355, + "learning_rate": 9.698392381068244e-06, + "loss": 0.5271, + "step": 1200 + }, + { + "epoch": 0.14, + "grad_norm": 2.0477768094349402, + "learning_rate": 9.697755513764128e-06, + "loss": 0.5543, + "step": 1201 + }, + { + "epoch": 0.14, + "grad_norm": 2.0234679012026042, + "learning_rate": 9.697117995727732e-06, + "loss": 0.5556, + "step": 1202 + }, + { + "epoch": 0.14, + "grad_norm": 1.7828763959907534, + "learning_rate": 9.696479827047364e-06, + "loss": 0.5791, + "step": 1203 + }, + { + "epoch": 0.14, + "grad_norm": 1.5487664238465073, + "learning_rate": 9.695841007811424e-06, + "loss": 0.4605, + "step": 1204 + }, + { + "epoch": 0.14, + "grad_norm": 2.1994498325911764, + "learning_rate": 9.695201538108403e-06, + "loss": 0.5738, + "step": 1205 + }, + { + "epoch": 0.14, + "grad_norm": 2.3968025704719187, + "learning_rate": 9.694561418026875e-06, + "loss": 0.5175, + "step": 1206 + }, + { + "epoch": 0.14, + "grad_norm": 1.635182898671791, + "learning_rate": 9.693920647655515e-06, + "loss": 0.4882, + "step": 1207 + }, + { + "epoch": 0.14, + "grad_norm": 3.377929644036782, + "learning_rate": 9.693279227083079e-06, + "loss": 0.5363, + "step": 1208 + }, + { + "epoch": 0.14, + "grad_norm": 1.8005594942717784, + "learning_rate": 9.692637156398417e-06, + "loss": 0.5271, + "step": 1209 + }, + { + "epoch": 0.14, + "grad_norm": 1.6219754110794222, + "learning_rate": 9.69199443569047e-06, + "loss": 0.5057, + "step": 1210 + }, + { + "epoch": 0.14, + "grad_norm": 2.1370649640349915, + "learning_rate": 9.691351065048266e-06, + "loss": 0.5397, + "step": 1211 + }, + { + "epoch": 0.14, + "grad_norm": 2.9834854095762893, + "learning_rate": 9.690707044560924e-06, + "loss": 0.5886, + "step": 1212 + }, + { + "epoch": 0.14, + "grad_norm": 3.0244444395084003, + "learning_rate": 9.690062374317656e-06, + "loss": 0.5308, + "step": 1213 + }, + { + "epoch": 0.14, + "grad_norm": 1.9977791311500488, + "learning_rate": 9.68941705440776e-06, + "loss": 0.5539, + "step": 1214 + }, + { + "epoch": 0.14, + "grad_norm": 1.8303079553934596, + "learning_rate": 9.688771084920625e-06, + "loss": 0.5131, + "step": 1215 + }, + { + "epoch": 0.14, + "grad_norm": 2.7581418194598397, + "learning_rate": 9.688124465945732e-06, + "loss": 0.5365, + "step": 1216 + }, + { + "epoch": 0.14, + "grad_norm": 2.347117307573303, + "learning_rate": 9.68747719757265e-06, + "loss": 0.4837, + "step": 1217 + }, + { + "epoch": 0.14, + "grad_norm": 2.3103987493003872, + "learning_rate": 9.686829279891037e-06, + "loss": 0.5223, + "step": 1218 + }, + { + "epoch": 0.14, + "grad_norm": 1.2173581871658352, + "learning_rate": 9.686180712990647e-06, + "loss": 0.7106, + "step": 1219 + }, + { + "epoch": 0.14, + "grad_norm": 1.5793123924693708, + "learning_rate": 9.685531496961314e-06, + "loss": 0.5359, + "step": 1220 + }, + { + "epoch": 0.14, + "grad_norm": 1.7801466418323264, + "learning_rate": 9.684881631892971e-06, + "loss": 0.55, + "step": 1221 + }, + { + "epoch": 0.14, + "grad_norm": 1.4745760379179944, + "learning_rate": 9.684231117875634e-06, + "loss": 0.4955, + "step": 1222 + }, + { + "epoch": 0.14, + "grad_norm": 2.0682686278338953, + "learning_rate": 9.683579954999415e-06, + "loss": 0.5503, + "step": 1223 + }, + { + "epoch": 0.14, + "grad_norm": 1.6723597508200834, + "learning_rate": 9.68292814335451e-06, + "loss": 0.4286, + "step": 1224 + }, + { + "epoch": 0.14, + "grad_norm": 1.7455934620170128, + "learning_rate": 9.682275683031213e-06, + "loss": 0.4976, + "step": 1225 + }, + { + "epoch": 0.14, + "grad_norm": 4.386345487079645, + "learning_rate": 9.681622574119898e-06, + "loss": 0.5515, + "step": 1226 + }, + { + "epoch": 0.14, + "grad_norm": 2.2863482924147216, + "learning_rate": 9.680968816711033e-06, + "loss": 0.5484, + "step": 1227 + }, + { + "epoch": 0.14, + "grad_norm": 1.8404935293643752, + "learning_rate": 9.680314410895182e-06, + "loss": 0.5135, + "step": 1228 + }, + { + "epoch": 0.14, + "grad_norm": 1.681102520837694, + "learning_rate": 9.679659356762987e-06, + "loss": 0.4648, + "step": 1229 + }, + { + "epoch": 0.14, + "grad_norm": 1.8005599638084562, + "learning_rate": 9.679003654405188e-06, + "loss": 0.569, + "step": 1230 + }, + { + "epoch": 0.14, + "grad_norm": 1.0065301837428198, + "learning_rate": 9.678347303912615e-06, + "loss": 0.7712, + "step": 1231 + }, + { + "epoch": 0.14, + "grad_norm": 1.4657913817255996, + "learning_rate": 9.677690305376182e-06, + "loss": 0.5298, + "step": 1232 + }, + { + "epoch": 0.14, + "grad_norm": 2.4236628566333467, + "learning_rate": 9.6770326588869e-06, + "loss": 0.4208, + "step": 1233 + }, + { + "epoch": 0.14, + "grad_norm": 1.7502833414627141, + "learning_rate": 9.676374364535864e-06, + "loss": 0.5277, + "step": 1234 + }, + { + "epoch": 0.14, + "grad_norm": 1.8853252144724015, + "learning_rate": 9.67571542241426e-06, + "loss": 0.5274, + "step": 1235 + }, + { + "epoch": 0.14, + "grad_norm": 2.0394596918590175, + "learning_rate": 9.675055832613365e-06, + "loss": 0.5738, + "step": 1236 + }, + { + "epoch": 0.14, + "grad_norm": 1.8305880186273233, + "learning_rate": 9.674395595224546e-06, + "loss": 0.651, + "step": 1237 + }, + { + "epoch": 0.14, + "grad_norm": 5.690649734674144, + "learning_rate": 9.67373471033926e-06, + "loss": 0.5621, + "step": 1238 + }, + { + "epoch": 0.14, + "grad_norm": 1.6784242707116488, + "learning_rate": 9.673073178049051e-06, + "loss": 0.4988, + "step": 1239 + }, + { + "epoch": 0.14, + "grad_norm": 3.741767587848836, + "learning_rate": 9.672410998445553e-06, + "loss": 0.6918, + "step": 1240 + }, + { + "epoch": 0.14, + "grad_norm": 1.5737718714374644, + "learning_rate": 9.671748171620497e-06, + "loss": 0.5579, + "step": 1241 + }, + { + "epoch": 0.14, + "grad_norm": 1.9985290132272966, + "learning_rate": 9.67108469766569e-06, + "loss": 0.5544, + "step": 1242 + }, + { + "epoch": 0.14, + "grad_norm": 1.6771506632645983, + "learning_rate": 9.67042057667304e-06, + "loss": 0.5922, + "step": 1243 + }, + { + "epoch": 0.14, + "grad_norm": 1.8333996955696386, + "learning_rate": 9.669755808734541e-06, + "loss": 0.5705, + "step": 1244 + }, + { + "epoch": 0.14, + "grad_norm": 2.7005159543780555, + "learning_rate": 9.669090393942277e-06, + "loss": 0.5212, + "step": 1245 + }, + { + "epoch": 0.14, + "grad_norm": 1.6265421547783332, + "learning_rate": 9.66842433238842e-06, + "loss": 0.5335, + "step": 1246 + }, + { + "epoch": 0.14, + "grad_norm": 0.8499712273374371, + "learning_rate": 9.667757624165231e-06, + "loss": 0.734, + "step": 1247 + }, + { + "epoch": 0.14, + "grad_norm": 1.55555220403889, + "learning_rate": 9.667090269365066e-06, + "loss": 0.5439, + "step": 1248 + }, + { + "epoch": 0.14, + "grad_norm": 1.8486241773687218, + "learning_rate": 9.666422268080366e-06, + "loss": 0.5254, + "step": 1249 + }, + { + "epoch": 0.14, + "grad_norm": 1.9496732048260292, + "learning_rate": 9.665753620403661e-06, + "loss": 0.6488, + "step": 1250 + }, + { + "epoch": 0.14, + "grad_norm": 1.8081945718074874, + "learning_rate": 9.665084326427575e-06, + "loss": 0.4925, + "step": 1251 + }, + { + "epoch": 0.14, + "grad_norm": 1.6960173732175188, + "learning_rate": 9.664414386244812e-06, + "loss": 0.5717, + "step": 1252 + }, + { + "epoch": 0.14, + "grad_norm": 2.1104227427882822, + "learning_rate": 9.663743799948178e-06, + "loss": 0.621, + "step": 1253 + }, + { + "epoch": 0.14, + "grad_norm": 1.7765506700447908, + "learning_rate": 9.66307256763056e-06, + "loss": 0.5686, + "step": 1254 + }, + { + "epoch": 0.14, + "grad_norm": 1.687176184215572, + "learning_rate": 9.66240068938494e-06, + "loss": 0.5834, + "step": 1255 + }, + { + "epoch": 0.14, + "grad_norm": 1.8520980958277116, + "learning_rate": 9.661728165304381e-06, + "loss": 0.502, + "step": 1256 + }, + { + "epoch": 0.14, + "grad_norm": 1.616056889833274, + "learning_rate": 9.661054995482045e-06, + "loss": 0.558, + "step": 1257 + }, + { + "epoch": 0.14, + "grad_norm": 1.5986388959168787, + "learning_rate": 9.660381180011177e-06, + "loss": 0.4505, + "step": 1258 + }, + { + "epoch": 0.14, + "grad_norm": 2.1478693169988756, + "learning_rate": 9.659706718985118e-06, + "loss": 0.5014, + "step": 1259 + }, + { + "epoch": 0.14, + "grad_norm": 1.8684866220535443, + "learning_rate": 9.65903161249729e-06, + "loss": 0.6017, + "step": 1260 + }, + { + "epoch": 0.14, + "grad_norm": 0.9030299868505797, + "learning_rate": 9.658355860641212e-06, + "loss": 0.722, + "step": 1261 + }, + { + "epoch": 0.15, + "grad_norm": 1.9959181349520354, + "learning_rate": 9.657679463510483e-06, + "loss": 0.5953, + "step": 1262 + }, + { + "epoch": 0.15, + "grad_norm": 1.9569197100920068, + "learning_rate": 9.657002421198805e-06, + "loss": 0.5841, + "step": 1263 + }, + { + "epoch": 0.15, + "grad_norm": 2.565820515759259, + "learning_rate": 9.656324733799955e-06, + "loss": 0.565, + "step": 1264 + }, + { + "epoch": 0.15, + "grad_norm": 1.8092120582056619, + "learning_rate": 9.655646401407813e-06, + "loss": 0.6031, + "step": 1265 + }, + { + "epoch": 0.15, + "grad_norm": 2.000394155729214, + "learning_rate": 9.654967424116335e-06, + "loss": 0.5453, + "step": 1266 + }, + { + "epoch": 0.15, + "grad_norm": 1.7257342102439934, + "learning_rate": 9.654287802019578e-06, + "loss": 0.597, + "step": 1267 + }, + { + "epoch": 0.15, + "grad_norm": 1.7129074037705536, + "learning_rate": 9.653607535211677e-06, + "loss": 0.5042, + "step": 1268 + }, + { + "epoch": 0.15, + "grad_norm": 1.703515189652405, + "learning_rate": 9.65292662378687e-06, + "loss": 0.6198, + "step": 1269 + }, + { + "epoch": 0.15, + "grad_norm": 2.141899223080931, + "learning_rate": 9.652245067839472e-06, + "loss": 0.6045, + "step": 1270 + }, + { + "epoch": 0.15, + "grad_norm": 2.3538866802682046, + "learning_rate": 9.651562867463892e-06, + "loss": 0.5692, + "step": 1271 + }, + { + "epoch": 0.15, + "grad_norm": 1.9302148343853345, + "learning_rate": 9.65088002275463e-06, + "loss": 0.6228, + "step": 1272 + }, + { + "epoch": 0.15, + "grad_norm": 1.9483725325277228, + "learning_rate": 9.650196533806272e-06, + "loss": 0.5384, + "step": 1273 + }, + { + "epoch": 0.15, + "grad_norm": 1.7064954059021051, + "learning_rate": 9.649512400713497e-06, + "loss": 0.6075, + "step": 1274 + }, + { + "epoch": 0.15, + "grad_norm": 1.8465982724463552, + "learning_rate": 9.64882762357107e-06, + "loss": 0.5342, + "step": 1275 + }, + { + "epoch": 0.15, + "grad_norm": 1.447802164711969, + "learning_rate": 9.648142202473844e-06, + "loss": 0.4944, + "step": 1276 + }, + { + "epoch": 0.15, + "grad_norm": 0.9284366628673724, + "learning_rate": 9.647456137516766e-06, + "loss": 0.7562, + "step": 1277 + }, + { + "epoch": 0.15, + "grad_norm": 1.8975927056846607, + "learning_rate": 9.646769428794869e-06, + "loss": 0.5168, + "step": 1278 + }, + { + "epoch": 0.15, + "grad_norm": 4.821651679304054, + "learning_rate": 9.646082076403276e-06, + "loss": 0.504, + "step": 1279 + }, + { + "epoch": 0.15, + "grad_norm": 2.352561983560717, + "learning_rate": 9.645394080437197e-06, + "loss": 0.4853, + "step": 1280 + }, + { + "epoch": 0.15, + "grad_norm": 2.683215841892426, + "learning_rate": 9.644705440991935e-06, + "loss": 0.5744, + "step": 1281 + }, + { + "epoch": 0.15, + "grad_norm": 1.9920554386622165, + "learning_rate": 9.644016158162881e-06, + "loss": 0.5115, + "step": 1282 + }, + { + "epoch": 0.15, + "grad_norm": 2.4687303058955226, + "learning_rate": 9.643326232045512e-06, + "loss": 0.5171, + "step": 1283 + }, + { + "epoch": 0.15, + "grad_norm": 2.2449786537853833, + "learning_rate": 9.642635662735397e-06, + "loss": 0.6039, + "step": 1284 + }, + { + "epoch": 0.15, + "grad_norm": 1.6897826890743508, + "learning_rate": 9.641944450328196e-06, + "loss": 0.5233, + "step": 1285 + }, + { + "epoch": 0.15, + "grad_norm": 1.7833681210350623, + "learning_rate": 9.641252594919653e-06, + "loss": 0.5536, + "step": 1286 + }, + { + "epoch": 0.15, + "grad_norm": 1.6342141858776325, + "learning_rate": 9.640560096605605e-06, + "loss": 0.5006, + "step": 1287 + }, + { + "epoch": 0.15, + "grad_norm": 1.9214249787056155, + "learning_rate": 9.639866955481975e-06, + "loss": 0.5234, + "step": 1288 + }, + { + "epoch": 0.15, + "grad_norm": 0.917029874904909, + "learning_rate": 9.639173171644778e-06, + "loss": 0.795, + "step": 1289 + }, + { + "epoch": 0.15, + "grad_norm": 1.805088008871878, + "learning_rate": 9.638478745190118e-06, + "loss": 0.4273, + "step": 1290 + }, + { + "epoch": 0.15, + "grad_norm": 1.5912050812401852, + "learning_rate": 9.637783676214186e-06, + "loss": 0.5884, + "step": 1291 + }, + { + "epoch": 0.15, + "grad_norm": 2.0056041682416565, + "learning_rate": 9.63708796481326e-06, + "loss": 0.5395, + "step": 1292 + }, + { + "epoch": 0.15, + "grad_norm": 2.408238102553004, + "learning_rate": 9.636391611083712e-06, + "loss": 0.5822, + "step": 1293 + }, + { + "epoch": 0.15, + "grad_norm": 1.7940473401151356, + "learning_rate": 9.635694615122004e-06, + "loss": 0.6245, + "step": 1294 + }, + { + "epoch": 0.15, + "grad_norm": 1.576843151269019, + "learning_rate": 9.63499697702468e-06, + "loss": 0.5923, + "step": 1295 + }, + { + "epoch": 0.15, + "grad_norm": 2.077790163759468, + "learning_rate": 9.634298696888376e-06, + "loss": 0.5623, + "step": 1296 + }, + { + "epoch": 0.15, + "grad_norm": 1.5173566733612762, + "learning_rate": 9.633599774809822e-06, + "loss": 0.497, + "step": 1297 + }, + { + "epoch": 0.15, + "grad_norm": 4.6186170364128305, + "learning_rate": 9.632900210885827e-06, + "loss": 0.5691, + "step": 1298 + }, + { + "epoch": 0.15, + "grad_norm": 1.6991767005197203, + "learning_rate": 9.632200005213299e-06, + "loss": 0.5421, + "step": 1299 + }, + { + "epoch": 0.15, + "grad_norm": 1.7307332167395466, + "learning_rate": 9.631499157889226e-06, + "loss": 0.5393, + "step": 1300 + }, + { + "epoch": 0.15, + "grad_norm": 1.9141365955991985, + "learning_rate": 9.630797669010694e-06, + "loss": 0.4972, + "step": 1301 + }, + { + "epoch": 0.15, + "grad_norm": 1.8219624525909301, + "learning_rate": 9.630095538674871e-06, + "loss": 0.5443, + "step": 1302 + }, + { + "epoch": 0.15, + "grad_norm": 1.8239486039604076, + "learning_rate": 9.629392766979016e-06, + "loss": 0.5789, + "step": 1303 + }, + { + "epoch": 0.15, + "grad_norm": 1.686258332096098, + "learning_rate": 9.628689354020474e-06, + "loss": 0.4382, + "step": 1304 + }, + { + "epoch": 0.15, + "grad_norm": 1.5847306289008554, + "learning_rate": 9.627985299896688e-06, + "loss": 0.4443, + "step": 1305 + }, + { + "epoch": 0.15, + "grad_norm": 2.1747028821828125, + "learning_rate": 9.62728060470518e-06, + "loss": 0.5046, + "step": 1306 + }, + { + "epoch": 0.15, + "grad_norm": 3.091612566972885, + "learning_rate": 9.626575268543561e-06, + "loss": 0.4844, + "step": 1307 + }, + { + "epoch": 0.15, + "grad_norm": 2.0874893210845076, + "learning_rate": 9.62586929150954e-06, + "loss": 0.4793, + "step": 1308 + }, + { + "epoch": 0.15, + "grad_norm": 3.359802442764472, + "learning_rate": 9.625162673700906e-06, + "loss": 0.4877, + "step": 1309 + }, + { + "epoch": 0.15, + "grad_norm": 2.331252042764603, + "learning_rate": 9.624455415215537e-06, + "loss": 0.5427, + "step": 1310 + }, + { + "epoch": 0.15, + "grad_norm": 1.8011672255880258, + "learning_rate": 9.623747516151406e-06, + "loss": 0.4814, + "step": 1311 + }, + { + "epoch": 0.15, + "grad_norm": 1.508310298385075, + "learning_rate": 9.62303897660657e-06, + "loss": 0.4219, + "step": 1312 + }, + { + "epoch": 0.15, + "grad_norm": 1.7003640367450878, + "learning_rate": 9.622329796679175e-06, + "loss": 0.5733, + "step": 1313 + }, + { + "epoch": 0.15, + "grad_norm": 2.1838289810681117, + "learning_rate": 9.621619976467459e-06, + "loss": 0.6036, + "step": 1314 + }, + { + "epoch": 0.15, + "grad_norm": 1.8459241852753459, + "learning_rate": 9.620909516069744e-06, + "loss": 0.4744, + "step": 1315 + }, + { + "epoch": 0.15, + "grad_norm": 2.0316668217648224, + "learning_rate": 9.620198415584441e-06, + "loss": 0.4295, + "step": 1316 + }, + { + "epoch": 0.15, + "grad_norm": 2.1893943314331343, + "learning_rate": 9.619486675110055e-06, + "loss": 0.5323, + "step": 1317 + }, + { + "epoch": 0.15, + "grad_norm": 1.5583331387922668, + "learning_rate": 9.618774294745178e-06, + "loss": 0.4657, + "step": 1318 + }, + { + "epoch": 0.15, + "grad_norm": 1.748098941054415, + "learning_rate": 9.618061274588481e-06, + "loss": 0.5773, + "step": 1319 + }, + { + "epoch": 0.15, + "grad_norm": 1.8781004289935406, + "learning_rate": 9.617347614738738e-06, + "loss": 0.5809, + "step": 1320 + }, + { + "epoch": 0.15, + "grad_norm": 2.5016398856823465, + "learning_rate": 9.616633315294803e-06, + "loss": 0.5738, + "step": 1321 + }, + { + "epoch": 0.15, + "grad_norm": 1.8070642676942736, + "learning_rate": 9.61591837635562e-06, + "loss": 0.593, + "step": 1322 + }, + { + "epoch": 0.15, + "grad_norm": 3.8759527414630104, + "learning_rate": 9.615202798020224e-06, + "loss": 0.6197, + "step": 1323 + }, + { + "epoch": 0.15, + "grad_norm": 1.5907853665896872, + "learning_rate": 9.614486580387737e-06, + "loss": 0.5527, + "step": 1324 + }, + { + "epoch": 0.15, + "grad_norm": 2.4058247734643388, + "learning_rate": 9.613769723557366e-06, + "loss": 0.5374, + "step": 1325 + }, + { + "epoch": 0.15, + "grad_norm": 0.9395281241149795, + "learning_rate": 9.613052227628414e-06, + "loss": 0.7563, + "step": 1326 + }, + { + "epoch": 0.15, + "grad_norm": 2.550992372654103, + "learning_rate": 9.612334092700264e-06, + "loss": 0.581, + "step": 1327 + }, + { + "epoch": 0.15, + "grad_norm": 1.8151426063325695, + "learning_rate": 9.611615318872396e-06, + "loss": 0.5768, + "step": 1328 + }, + { + "epoch": 0.15, + "grad_norm": 2.8696696096733496, + "learning_rate": 9.610895906244373e-06, + "loss": 0.5449, + "step": 1329 + }, + { + "epoch": 0.15, + "grad_norm": 2.160433503560469, + "learning_rate": 9.610175854915846e-06, + "loss": 0.5867, + "step": 1330 + }, + { + "epoch": 0.15, + "grad_norm": 4.661538476041728, + "learning_rate": 9.60945516498656e-06, + "loss": 0.4589, + "step": 1331 + }, + { + "epoch": 0.15, + "grad_norm": 0.9172872587964246, + "learning_rate": 9.608733836556343e-06, + "loss": 0.736, + "step": 1332 + }, + { + "epoch": 0.15, + "grad_norm": 2.0354648502978274, + "learning_rate": 9.608011869725111e-06, + "loss": 0.6312, + "step": 1333 + }, + { + "epoch": 0.15, + "grad_norm": 2.2342857897992436, + "learning_rate": 9.607289264592874e-06, + "loss": 0.5859, + "step": 1334 + }, + { + "epoch": 0.15, + "grad_norm": 2.07927537838627, + "learning_rate": 9.606566021259726e-06, + "loss": 0.4496, + "step": 1335 + }, + { + "epoch": 0.15, + "grad_norm": 0.9153277794732088, + "learning_rate": 9.605842139825851e-06, + "loss": 0.7503, + "step": 1336 + }, + { + "epoch": 0.15, + "grad_norm": 2.036853709615021, + "learning_rate": 9.605117620391522e-06, + "loss": 0.6517, + "step": 1337 + }, + { + "epoch": 0.15, + "grad_norm": 2.1171520480442165, + "learning_rate": 9.604392463057097e-06, + "loss": 0.5511, + "step": 1338 + }, + { + "epoch": 0.15, + "grad_norm": 1.9747557856238598, + "learning_rate": 9.603666667923024e-06, + "loss": 0.6502, + "step": 1339 + }, + { + "epoch": 0.15, + "grad_norm": 2.6538402051067016, + "learning_rate": 9.602940235089841e-06, + "loss": 0.5077, + "step": 1340 + }, + { + "epoch": 0.15, + "grad_norm": 2.135139721275134, + "learning_rate": 9.602213164658177e-06, + "loss": 0.4967, + "step": 1341 + }, + { + "epoch": 0.15, + "grad_norm": 3.2259483627688725, + "learning_rate": 9.60148545672874e-06, + "loss": 0.57, + "step": 1342 + }, + { + "epoch": 0.15, + "grad_norm": 1.8727203899347886, + "learning_rate": 9.600757111402336e-06, + "loss": 0.5683, + "step": 1343 + }, + { + "epoch": 0.15, + "grad_norm": 1.8819895259237334, + "learning_rate": 9.600028128779853e-06, + "loss": 0.6488, + "step": 1344 + }, + { + "epoch": 0.15, + "grad_norm": 1.541759175953644, + "learning_rate": 9.599298508962272e-06, + "loss": 0.6319, + "step": 1345 + }, + { + "epoch": 0.15, + "grad_norm": 2.168890978221971, + "learning_rate": 9.598568252050655e-06, + "loss": 0.5149, + "step": 1346 + }, + { + "epoch": 0.15, + "grad_norm": 1.6705386031984615, + "learning_rate": 9.597837358146163e-06, + "loss": 0.5339, + "step": 1347 + }, + { + "epoch": 0.15, + "grad_norm": 1.6261352861228484, + "learning_rate": 9.597105827350035e-06, + "loss": 0.5364, + "step": 1348 + }, + { + "epoch": 0.16, + "grad_norm": 1.6660173173576758, + "learning_rate": 9.596373659763605e-06, + "loss": 0.5311, + "step": 1349 + }, + { + "epoch": 0.16, + "grad_norm": 1.7296864317719287, + "learning_rate": 9.59564085548829e-06, + "loss": 0.5809, + "step": 1350 + }, + { + "epoch": 0.16, + "grad_norm": 1.9743902359678382, + "learning_rate": 9.594907414625602e-06, + "loss": 0.5152, + "step": 1351 + }, + { + "epoch": 0.16, + "grad_norm": 1.8559917506672614, + "learning_rate": 9.594173337277134e-06, + "loss": 0.6136, + "step": 1352 + }, + { + "epoch": 0.16, + "grad_norm": 1.5856374992614448, + "learning_rate": 9.59343862354457e-06, + "loss": 0.5505, + "step": 1353 + }, + { + "epoch": 0.16, + "grad_norm": 2.12921172670809, + "learning_rate": 9.592703273529684e-06, + "loss": 0.4847, + "step": 1354 + }, + { + "epoch": 0.16, + "grad_norm": 1.9823831914429788, + "learning_rate": 9.591967287334337e-06, + "loss": 0.4316, + "step": 1355 + }, + { + "epoch": 0.16, + "grad_norm": 2.0361992065456316, + "learning_rate": 9.591230665060476e-06, + "loss": 0.5168, + "step": 1356 + }, + { + "epoch": 0.16, + "grad_norm": 0.8389999731052249, + "learning_rate": 9.590493406810138e-06, + "loss": 0.7443, + "step": 1357 + }, + { + "epoch": 0.16, + "grad_norm": 2.061037444653029, + "learning_rate": 9.589755512685451e-06, + "loss": 0.5526, + "step": 1358 + }, + { + "epoch": 0.16, + "grad_norm": 2.303185799440571, + "learning_rate": 9.589016982788622e-06, + "loss": 0.5756, + "step": 1359 + }, + { + "epoch": 0.16, + "grad_norm": 2.22577788698618, + "learning_rate": 9.588277817221956e-06, + "loss": 0.504, + "step": 1360 + }, + { + "epoch": 0.16, + "grad_norm": 2.495231079842881, + "learning_rate": 9.587538016087842e-06, + "loss": 0.6001, + "step": 1361 + }, + { + "epoch": 0.16, + "grad_norm": 1.8828021045574168, + "learning_rate": 9.586797579488758e-06, + "loss": 0.5679, + "step": 1362 + }, + { + "epoch": 0.16, + "grad_norm": 3.6529132646256395, + "learning_rate": 9.586056507527266e-06, + "loss": 0.5296, + "step": 1363 + }, + { + "epoch": 0.16, + "grad_norm": 1.9965361476626804, + "learning_rate": 9.585314800306022e-06, + "loss": 0.5778, + "step": 1364 + }, + { + "epoch": 0.16, + "grad_norm": 1.7630374307097239, + "learning_rate": 9.584572457927766e-06, + "loss": 0.5877, + "step": 1365 + }, + { + "epoch": 0.16, + "grad_norm": 1.9561769411059107, + "learning_rate": 9.583829480495325e-06, + "loss": 0.5537, + "step": 1366 + }, + { + "epoch": 0.16, + "grad_norm": 1.8296568376509585, + "learning_rate": 9.583085868111622e-06, + "loss": 0.5761, + "step": 1367 + }, + { + "epoch": 0.16, + "grad_norm": 2.410707494646512, + "learning_rate": 9.582341620879655e-06, + "loss": 0.4984, + "step": 1368 + }, + { + "epoch": 0.16, + "grad_norm": 1.7337146166604787, + "learning_rate": 9.581596738902521e-06, + "loss": 0.4464, + "step": 1369 + }, + { + "epoch": 0.16, + "grad_norm": 1.7860843511421733, + "learning_rate": 9.580851222283401e-06, + "loss": 0.5717, + "step": 1370 + }, + { + "epoch": 0.16, + "grad_norm": 3.249520606836769, + "learning_rate": 9.580105071125564e-06, + "loss": 0.6088, + "step": 1371 + }, + { + "epoch": 0.16, + "grad_norm": 2.000195050626322, + "learning_rate": 9.579358285532364e-06, + "loss": 0.4956, + "step": 1372 + }, + { + "epoch": 0.16, + "grad_norm": 1.9052364492719027, + "learning_rate": 9.578610865607249e-06, + "loss": 0.5011, + "step": 1373 + }, + { + "epoch": 0.16, + "grad_norm": 2.896794260454515, + "learning_rate": 9.577862811453748e-06, + "loss": 0.4662, + "step": 1374 + }, + { + "epoch": 0.16, + "grad_norm": 1.660013676739649, + "learning_rate": 9.577114123175486e-06, + "loss": 0.5505, + "step": 1375 + }, + { + "epoch": 0.16, + "grad_norm": 3.150120646673957, + "learning_rate": 9.576364800876167e-06, + "loss": 0.501, + "step": 1376 + }, + { + "epoch": 0.16, + "grad_norm": 1.9557497549709233, + "learning_rate": 9.575614844659588e-06, + "loss": 0.6225, + "step": 1377 + }, + { + "epoch": 0.16, + "grad_norm": 1.8280569348577205, + "learning_rate": 9.574864254629634e-06, + "loss": 0.4841, + "step": 1378 + }, + { + "epoch": 0.16, + "grad_norm": 1.9463265637025862, + "learning_rate": 9.574113030890274e-06, + "loss": 0.4751, + "step": 1379 + }, + { + "epoch": 0.16, + "grad_norm": 1.9331697762572653, + "learning_rate": 9.573361173545572e-06, + "loss": 0.4618, + "step": 1380 + }, + { + "epoch": 0.16, + "grad_norm": 1.850508198484521, + "learning_rate": 9.57260868269967e-06, + "loss": 0.5106, + "step": 1381 + }, + { + "epoch": 0.16, + "grad_norm": 2.10499177313721, + "learning_rate": 9.571855558456807e-06, + "loss": 0.6001, + "step": 1382 + }, + { + "epoch": 0.16, + "grad_norm": 2.2096432909627723, + "learning_rate": 9.571101800921304e-06, + "loss": 0.601, + "step": 1383 + }, + { + "epoch": 0.16, + "grad_norm": 1.9332979120207041, + "learning_rate": 9.57034741019757e-06, + "loss": 0.5168, + "step": 1384 + }, + { + "epoch": 0.16, + "grad_norm": 1.0459003983315287, + "learning_rate": 9.569592386390105e-06, + "loss": 0.7835, + "step": 1385 + }, + { + "epoch": 0.16, + "grad_norm": 1.9714242504480233, + "learning_rate": 9.568836729603495e-06, + "loss": 0.5747, + "step": 1386 + }, + { + "epoch": 0.16, + "grad_norm": 11.600528143316716, + "learning_rate": 9.56808043994241e-06, + "loss": 0.577, + "step": 1387 + }, + { + "epoch": 0.16, + "grad_norm": 2.006816380734164, + "learning_rate": 9.567323517511617e-06, + "loss": 0.5495, + "step": 1388 + }, + { + "epoch": 0.16, + "grad_norm": 2.1614670648200107, + "learning_rate": 9.566565962415958e-06, + "loss": 0.6416, + "step": 1389 + }, + { + "epoch": 0.16, + "grad_norm": 2.372382590415793, + "learning_rate": 9.565807774760376e-06, + "loss": 0.4682, + "step": 1390 + }, + { + "epoch": 0.16, + "grad_norm": 2.0571712473570565, + "learning_rate": 9.56504895464989e-06, + "loss": 0.5685, + "step": 1391 + }, + { + "epoch": 0.16, + "grad_norm": 2.1470730720800186, + "learning_rate": 9.564289502189615e-06, + "loss": 0.5494, + "step": 1392 + }, + { + "epoch": 0.16, + "grad_norm": 2.879077283106543, + "learning_rate": 9.563529417484747e-06, + "loss": 0.6456, + "step": 1393 + }, + { + "epoch": 0.16, + "grad_norm": 1.85725869169753, + "learning_rate": 9.562768700640575e-06, + "loss": 0.5873, + "step": 1394 + }, + { + "epoch": 0.16, + "grad_norm": 2.1230601623032794, + "learning_rate": 9.562007351762473e-06, + "loss": 0.5205, + "step": 1395 + }, + { + "epoch": 0.16, + "grad_norm": 2.263549718980947, + "learning_rate": 9.561245370955903e-06, + "loss": 0.531, + "step": 1396 + }, + { + "epoch": 0.16, + "grad_norm": 2.6700140466273794, + "learning_rate": 9.560482758326414e-06, + "loss": 0.516, + "step": 1397 + }, + { + "epoch": 0.16, + "grad_norm": 1.890852254087225, + "learning_rate": 9.559719513979645e-06, + "loss": 0.4899, + "step": 1398 + }, + { + "epoch": 0.16, + "grad_norm": 1.9632468246607881, + "learning_rate": 9.558955638021314e-06, + "loss": 0.5203, + "step": 1399 + }, + { + "epoch": 0.16, + "grad_norm": 1.8608194044045343, + "learning_rate": 9.558191130557242e-06, + "loss": 0.4981, + "step": 1400 + }, + { + "epoch": 0.16, + "grad_norm": 0.941016713952777, + "learning_rate": 9.557425991693323e-06, + "loss": 0.7146, + "step": 1401 + }, + { + "epoch": 0.16, + "grad_norm": 8.479110677104224, + "learning_rate": 9.556660221535545e-06, + "loss": 0.5136, + "step": 1402 + }, + { + "epoch": 0.16, + "grad_norm": 1.854634248025494, + "learning_rate": 9.55589382018998e-06, + "loss": 0.4831, + "step": 1403 + }, + { + "epoch": 0.16, + "grad_norm": 1.821923908562663, + "learning_rate": 9.555126787762796e-06, + "loss": 0.5212, + "step": 1404 + }, + { + "epoch": 0.16, + "grad_norm": 2.2483254989009502, + "learning_rate": 9.554359124360236e-06, + "loss": 0.5665, + "step": 1405 + }, + { + "epoch": 0.16, + "grad_norm": 2.0033334961158134, + "learning_rate": 9.55359083008864e-06, + "loss": 0.5693, + "step": 1406 + }, + { + "epoch": 0.16, + "grad_norm": 0.9040616941878393, + "learning_rate": 9.55282190505443e-06, + "loss": 0.7642, + "step": 1407 + }, + { + "epoch": 0.16, + "grad_norm": 2.4685571795671053, + "learning_rate": 9.552052349364118e-06, + "loss": 0.5584, + "step": 1408 + }, + { + "epoch": 0.16, + "grad_norm": 2.1040518367050782, + "learning_rate": 9.551282163124304e-06, + "loss": 0.5804, + "step": 1409 + }, + { + "epoch": 0.16, + "grad_norm": 1.6710792422808451, + "learning_rate": 9.550511346441674e-06, + "loss": 0.5671, + "step": 1410 + }, + { + "epoch": 0.16, + "grad_norm": 2.191627122312166, + "learning_rate": 9.549739899422998e-06, + "loss": 0.4611, + "step": 1411 + }, + { + "epoch": 0.16, + "grad_norm": 1.705327642401606, + "learning_rate": 9.548967822175142e-06, + "loss": 0.5022, + "step": 1412 + }, + { + "epoch": 0.16, + "grad_norm": 1.8347727946182772, + "learning_rate": 9.548195114805047e-06, + "loss": 0.5471, + "step": 1413 + }, + { + "epoch": 0.16, + "grad_norm": 2.9764802925520053, + "learning_rate": 9.547421777419756e-06, + "loss": 0.4352, + "step": 1414 + }, + { + "epoch": 0.16, + "grad_norm": 1.6946602197003582, + "learning_rate": 9.546647810126388e-06, + "loss": 0.4884, + "step": 1415 + }, + { + "epoch": 0.16, + "grad_norm": 1.9233943474299515, + "learning_rate": 9.545873213032151e-06, + "loss": 0.6501, + "step": 1416 + }, + { + "epoch": 0.16, + "grad_norm": 2.1000008561961976, + "learning_rate": 9.545097986244345e-06, + "loss": 0.6233, + "step": 1417 + }, + { + "epoch": 0.16, + "grad_norm": 3.194788739905355, + "learning_rate": 9.544322129870354e-06, + "loss": 0.524, + "step": 1418 + }, + { + "epoch": 0.16, + "grad_norm": 2.1592055036121645, + "learning_rate": 9.543545644017646e-06, + "loss": 0.5565, + "step": 1419 + }, + { + "epoch": 0.16, + "grad_norm": 2.0296609059043385, + "learning_rate": 9.542768528793784e-06, + "loss": 0.5466, + "step": 1420 + }, + { + "epoch": 0.16, + "grad_norm": 1.729969446737493, + "learning_rate": 9.541990784306414e-06, + "loss": 0.5495, + "step": 1421 + }, + { + "epoch": 0.16, + "grad_norm": 1.7698462955069232, + "learning_rate": 9.541212410663266e-06, + "loss": 0.6134, + "step": 1422 + }, + { + "epoch": 0.16, + "grad_norm": 3.555090294658996, + "learning_rate": 9.54043340797216e-06, + "loss": 0.4947, + "step": 1423 + }, + { + "epoch": 0.16, + "grad_norm": 1.9366223874743607, + "learning_rate": 9.539653776341007e-06, + "loss": 0.5837, + "step": 1424 + }, + { + "epoch": 0.16, + "grad_norm": 2.5921490088478234, + "learning_rate": 9.538873515877797e-06, + "loss": 0.4956, + "step": 1425 + }, + { + "epoch": 0.16, + "grad_norm": 0.9509843780336904, + "learning_rate": 9.538092626690613e-06, + "loss": 0.7544, + "step": 1426 + }, + { + "epoch": 0.16, + "grad_norm": 2.1505978970517656, + "learning_rate": 9.537311108887626e-06, + "loss": 0.5252, + "step": 1427 + }, + { + "epoch": 0.16, + "grad_norm": 2.0597680570537076, + "learning_rate": 9.536528962577092e-06, + "loss": 0.5424, + "step": 1428 + }, + { + "epoch": 0.16, + "grad_norm": 1.8016401978593481, + "learning_rate": 9.535746187867349e-06, + "loss": 0.5543, + "step": 1429 + }, + { + "epoch": 0.16, + "grad_norm": 1.704742676852404, + "learning_rate": 9.53496278486683e-06, + "loss": 0.5873, + "step": 1430 + }, + { + "epoch": 0.16, + "grad_norm": 2.0903093995692044, + "learning_rate": 9.534178753684054e-06, + "loss": 0.53, + "step": 1431 + }, + { + "epoch": 0.16, + "grad_norm": 2.060378143366305, + "learning_rate": 9.533394094427619e-06, + "loss": 0.5089, + "step": 1432 + }, + { + "epoch": 0.16, + "grad_norm": 1.9597813868398153, + "learning_rate": 9.53260880720622e-06, + "loss": 0.5024, + "step": 1433 + }, + { + "epoch": 0.16, + "grad_norm": 2.0713033160538337, + "learning_rate": 9.531822892128637e-06, + "loss": 0.577, + "step": 1434 + }, + { + "epoch": 0.16, + "grad_norm": 2.3398236723030874, + "learning_rate": 9.531036349303729e-06, + "loss": 0.471, + "step": 1435 + }, + { + "epoch": 0.17, + "grad_norm": 1.5449609511427382, + "learning_rate": 9.530249178840452e-06, + "loss": 0.4751, + "step": 1436 + }, + { + "epoch": 0.17, + "grad_norm": 2.2202636298457588, + "learning_rate": 9.529461380847842e-06, + "loss": 0.5656, + "step": 1437 + }, + { + "epoch": 0.17, + "grad_norm": 2.0479970582076232, + "learning_rate": 9.528672955435027e-06, + "loss": 0.5446, + "step": 1438 + }, + { + "epoch": 0.17, + "grad_norm": 1.9239044730878956, + "learning_rate": 9.527883902711219e-06, + "loss": 0.5564, + "step": 1439 + }, + { + "epoch": 0.17, + "grad_norm": 1.6887756653610342, + "learning_rate": 9.527094222785717e-06, + "loss": 0.4266, + "step": 1440 + }, + { + "epoch": 0.17, + "grad_norm": 3.487628035495983, + "learning_rate": 9.526303915767906e-06, + "loss": 0.5408, + "step": 1441 + }, + { + "epoch": 0.17, + "grad_norm": 8.7064563976933, + "learning_rate": 9.525512981767263e-06, + "loss": 0.5075, + "step": 1442 + }, + { + "epoch": 0.17, + "grad_norm": 1.9369214160079387, + "learning_rate": 9.524721420893344e-06, + "loss": 0.543, + "step": 1443 + }, + { + "epoch": 0.17, + "grad_norm": 2.077238591866259, + "learning_rate": 9.523929233255797e-06, + "loss": 0.4671, + "step": 1444 + }, + { + "epoch": 0.17, + "grad_norm": 1.7772843833931513, + "learning_rate": 9.523136418964356e-06, + "loss": 0.5712, + "step": 1445 + }, + { + "epoch": 0.17, + "grad_norm": 2.2758023744504237, + "learning_rate": 9.522342978128843e-06, + "loss": 0.4608, + "step": 1446 + }, + { + "epoch": 0.17, + "grad_norm": 1.9311861858877077, + "learning_rate": 9.521548910859163e-06, + "loss": 0.5748, + "step": 1447 + }, + { + "epoch": 0.17, + "grad_norm": 1.909715683777971, + "learning_rate": 9.520754217265311e-06, + "loss": 0.556, + "step": 1448 + }, + { + "epoch": 0.17, + "grad_norm": 2.1421173885534506, + "learning_rate": 9.519958897457368e-06, + "loss": 0.5377, + "step": 1449 + }, + { + "epoch": 0.17, + "grad_norm": 3.109948896939215, + "learning_rate": 9.519162951545501e-06, + "loss": 0.4631, + "step": 1450 + }, + { + "epoch": 0.17, + "grad_norm": 2.2241729431217605, + "learning_rate": 9.518366379639968e-06, + "loss": 0.6092, + "step": 1451 + }, + { + "epoch": 0.17, + "grad_norm": 1.6252645290041907, + "learning_rate": 9.517569181851103e-06, + "loss": 0.4747, + "step": 1452 + }, + { + "epoch": 0.17, + "grad_norm": 2.2498382162419936, + "learning_rate": 9.516771358289339e-06, + "loss": 0.5081, + "step": 1453 + }, + { + "epoch": 0.17, + "grad_norm": 2.0888134535102534, + "learning_rate": 9.515972909065187e-06, + "loss": 0.7124, + "step": 1454 + }, + { + "epoch": 0.17, + "grad_norm": 0.9467670810506359, + "learning_rate": 9.515173834289253e-06, + "loss": 0.7239, + "step": 1455 + }, + { + "epoch": 0.17, + "grad_norm": 2.1855249702453428, + "learning_rate": 9.51437413407222e-06, + "loss": 0.6236, + "step": 1456 + }, + { + "epoch": 0.17, + "grad_norm": 1.8127158875825493, + "learning_rate": 9.513573808524864e-06, + "loss": 0.519, + "step": 1457 + }, + { + "epoch": 0.17, + "grad_norm": 2.972085765145068, + "learning_rate": 9.512772857758044e-06, + "loss": 0.5843, + "step": 1458 + }, + { + "epoch": 0.17, + "grad_norm": 2.112764304907531, + "learning_rate": 9.511971281882711e-06, + "loss": 0.5616, + "step": 1459 + }, + { + "epoch": 0.17, + "grad_norm": 2.030054421483051, + "learning_rate": 9.511169081009897e-06, + "loss": 0.5482, + "step": 1460 + }, + { + "epoch": 0.17, + "grad_norm": 1.8287901603843935, + "learning_rate": 9.510366255250722e-06, + "loss": 0.5266, + "step": 1461 + }, + { + "epoch": 0.17, + "grad_norm": 2.000496495801951, + "learning_rate": 9.509562804716396e-06, + "loss": 0.5519, + "step": 1462 + }, + { + "epoch": 0.17, + "grad_norm": 1.775526543316991, + "learning_rate": 9.508758729518213e-06, + "loss": 0.6783, + "step": 1463 + }, + { + "epoch": 0.17, + "grad_norm": 1.7284959623174554, + "learning_rate": 9.50795402976755e-06, + "loss": 0.516, + "step": 1464 + }, + { + "epoch": 0.17, + "grad_norm": 2.00937464269589, + "learning_rate": 9.507148705575876e-06, + "loss": 0.3977, + "step": 1465 + }, + { + "epoch": 0.17, + "grad_norm": 1.8227419757331833, + "learning_rate": 9.506342757054744e-06, + "loss": 0.5012, + "step": 1466 + }, + { + "epoch": 0.17, + "grad_norm": 2.11901413287132, + "learning_rate": 9.505536184315793e-06, + "loss": 0.5069, + "step": 1467 + }, + { + "epoch": 0.17, + "grad_norm": 3.53597531641787, + "learning_rate": 9.50472898747075e-06, + "loss": 0.5201, + "step": 1468 + }, + { + "epoch": 0.17, + "grad_norm": 1.8385979069973468, + "learning_rate": 9.50392116663143e-06, + "loss": 0.5532, + "step": 1469 + }, + { + "epoch": 0.17, + "grad_norm": 1.9340394447527698, + "learning_rate": 9.503112721909728e-06, + "loss": 0.5417, + "step": 1470 + }, + { + "epoch": 0.17, + "grad_norm": 1.8211635236483439, + "learning_rate": 9.502303653417631e-06, + "loss": 0.5537, + "step": 1471 + }, + { + "epoch": 0.17, + "grad_norm": 1.8567616213882414, + "learning_rate": 9.501493961267213e-06, + "loss": 0.6387, + "step": 1472 + }, + { + "epoch": 0.17, + "grad_norm": 2.561795593439265, + "learning_rate": 9.500683645570632e-06, + "loss": 0.648, + "step": 1473 + }, + { + "epoch": 0.17, + "grad_norm": 2.1408629257592136, + "learning_rate": 9.499872706440132e-06, + "loss": 0.5465, + "step": 1474 + }, + { + "epoch": 0.17, + "grad_norm": 1.781320494085114, + "learning_rate": 9.499061143988042e-06, + "loss": 0.5453, + "step": 1475 + }, + { + "epoch": 0.17, + "grad_norm": 1.9065847596129448, + "learning_rate": 9.498248958326783e-06, + "loss": 0.6311, + "step": 1476 + }, + { + "epoch": 0.17, + "grad_norm": 1.8454794441091658, + "learning_rate": 9.497436149568858e-06, + "loss": 0.6189, + "step": 1477 + }, + { + "epoch": 0.17, + "grad_norm": 2.054435406232731, + "learning_rate": 9.496622717826855e-06, + "loss": 0.527, + "step": 1478 + }, + { + "epoch": 0.17, + "grad_norm": 1.847704212112147, + "learning_rate": 9.495808663213454e-06, + "loss": 0.4713, + "step": 1479 + }, + { + "epoch": 0.17, + "grad_norm": 1.6161879655352474, + "learning_rate": 9.494993985841414e-06, + "loss": 0.5122, + "step": 1480 + }, + { + "epoch": 0.17, + "grad_norm": 1.6288866221277847, + "learning_rate": 9.494178685823586e-06, + "loss": 0.5416, + "step": 1481 + }, + { + "epoch": 0.17, + "grad_norm": 2.1751590584173246, + "learning_rate": 9.493362763272906e-06, + "loss": 0.531, + "step": 1482 + }, + { + "epoch": 0.17, + "grad_norm": 2.8942983798603406, + "learning_rate": 9.492546218302392e-06, + "loss": 0.536, + "step": 1483 + }, + { + "epoch": 0.17, + "grad_norm": 1.7793768760876623, + "learning_rate": 9.491729051025157e-06, + "loss": 0.6665, + "step": 1484 + }, + { + "epoch": 0.17, + "grad_norm": 1.9606206687667782, + "learning_rate": 9.49091126155439e-06, + "loss": 0.5227, + "step": 1485 + }, + { + "epoch": 0.17, + "grad_norm": 3.0015944592733046, + "learning_rate": 9.490092850003372e-06, + "loss": 0.5494, + "step": 1486 + }, + { + "epoch": 0.17, + "grad_norm": 2.90739247745898, + "learning_rate": 9.489273816485472e-06, + "loss": 0.4668, + "step": 1487 + }, + { + "epoch": 0.17, + "grad_norm": 2.630449565138997, + "learning_rate": 9.488454161114138e-06, + "loss": 0.523, + "step": 1488 + }, + { + "epoch": 0.17, + "grad_norm": 2.257967856576836, + "learning_rate": 9.487633884002914e-06, + "loss": 0.5064, + "step": 1489 + }, + { + "epoch": 0.17, + "grad_norm": 2.06221848584041, + "learning_rate": 9.48681298526542e-06, + "loss": 0.6519, + "step": 1490 + }, + { + "epoch": 0.17, + "grad_norm": 1.5631694786725043, + "learning_rate": 9.48599146501537e-06, + "loss": 0.5348, + "step": 1491 + }, + { + "epoch": 0.17, + "grad_norm": 2.5116495517499198, + "learning_rate": 9.485169323366556e-06, + "loss": 0.6165, + "step": 1492 + }, + { + "epoch": 0.17, + "grad_norm": 4.737551992032248, + "learning_rate": 9.484346560432867e-06, + "loss": 0.5424, + "step": 1493 + }, + { + "epoch": 0.17, + "grad_norm": 4.734580512119994, + "learning_rate": 9.48352317632827e-06, + "loss": 0.5018, + "step": 1494 + }, + { + "epoch": 0.17, + "grad_norm": 2.415610930314321, + "learning_rate": 9.482699171166816e-06, + "loss": 0.5908, + "step": 1495 + }, + { + "epoch": 0.17, + "grad_norm": 1.8142746571788346, + "learning_rate": 9.481874545062651e-06, + "loss": 0.4933, + "step": 1496 + }, + { + "epoch": 0.17, + "grad_norm": 1.771807041567568, + "learning_rate": 9.48104929813e-06, + "loss": 0.5534, + "step": 1497 + }, + { + "epoch": 0.17, + "grad_norm": 0.9252584612298708, + "learning_rate": 9.480223430483176e-06, + "loss": 0.756, + "step": 1498 + }, + { + "epoch": 0.17, + "grad_norm": 2.1829521908306595, + "learning_rate": 9.47939694223658e-06, + "loss": 0.5595, + "step": 1499 + }, + { + "epoch": 0.17, + "grad_norm": 2.321874578093091, + "learning_rate": 9.478569833504694e-06, + "loss": 0.4621, + "step": 1500 + }, + { + "epoch": 0.17, + "grad_norm": 2.2840611952651813, + "learning_rate": 9.47774210440209e-06, + "loss": 0.5139, + "step": 1501 + }, + { + "epoch": 0.17, + "grad_norm": 1.6475694356517705, + "learning_rate": 9.476913755043427e-06, + "loss": 0.5039, + "step": 1502 + }, + { + "epoch": 0.17, + "grad_norm": 1.7602111456002916, + "learning_rate": 9.476084785543444e-06, + "loss": 0.5565, + "step": 1503 + }, + { + "epoch": 0.17, + "grad_norm": 2.0025178336996734, + "learning_rate": 9.475255196016972e-06, + "loss": 0.5418, + "step": 1504 + }, + { + "epoch": 0.17, + "grad_norm": 1.7505514269438753, + "learning_rate": 9.474424986578928e-06, + "loss": 0.4238, + "step": 1505 + }, + { + "epoch": 0.17, + "grad_norm": 2.006269175920709, + "learning_rate": 9.473594157344307e-06, + "loss": 0.6045, + "step": 1506 + }, + { + "epoch": 0.17, + "grad_norm": 2.0209803761513654, + "learning_rate": 9.4727627084282e-06, + "loss": 0.5503, + "step": 1507 + }, + { + "epoch": 0.17, + "grad_norm": 3.4692240938051726, + "learning_rate": 9.471930639945777e-06, + "loss": 0.5782, + "step": 1508 + }, + { + "epoch": 0.17, + "grad_norm": 3.605852202185983, + "learning_rate": 9.471097952012296e-06, + "loss": 0.5746, + "step": 1509 + }, + { + "epoch": 0.17, + "grad_norm": 2.082110555252948, + "learning_rate": 9.4702646447431e-06, + "loss": 0.6455, + "step": 1510 + }, + { + "epoch": 0.17, + "grad_norm": 2.500299430628737, + "learning_rate": 9.46943071825362e-06, + "loss": 0.4985, + "step": 1511 + }, + { + "epoch": 0.17, + "grad_norm": 1.6393743041118873, + "learning_rate": 9.468596172659372e-06, + "loss": 0.6134, + "step": 1512 + }, + { + "epoch": 0.17, + "grad_norm": 4.809760471963379, + "learning_rate": 9.467761008075957e-06, + "loss": 0.4374, + "step": 1513 + }, + { + "epoch": 0.17, + "grad_norm": 2.451962105545382, + "learning_rate": 9.466925224619059e-06, + "loss": 0.5095, + "step": 1514 + }, + { + "epoch": 0.17, + "grad_norm": 2.532438955618862, + "learning_rate": 9.466088822404454e-06, + "loss": 0.552, + "step": 1515 + }, + { + "epoch": 0.17, + "grad_norm": 1.7574621909644783, + "learning_rate": 9.465251801547998e-06, + "loss": 0.4963, + "step": 1516 + }, + { + "epoch": 0.17, + "grad_norm": 1.929622554690285, + "learning_rate": 9.464414162165635e-06, + "loss": 0.486, + "step": 1517 + }, + { + "epoch": 0.17, + "grad_norm": 3.445082309452605, + "learning_rate": 9.463575904373397e-06, + "loss": 0.5817, + "step": 1518 + }, + { + "epoch": 0.17, + "grad_norm": 2.3133468491973166, + "learning_rate": 9.462737028287398e-06, + "loss": 0.5551, + "step": 1519 + }, + { + "epoch": 0.17, + "grad_norm": 1.6198102076375613, + "learning_rate": 9.461897534023838e-06, + "loss": 0.5179, + "step": 1520 + }, + { + "epoch": 0.17, + "grad_norm": 2.116514234181379, + "learning_rate": 9.461057421699004e-06, + "loss": 0.6356, + "step": 1521 + }, + { + "epoch": 0.17, + "grad_norm": 5.349094653833558, + "learning_rate": 9.460216691429271e-06, + "loss": 0.5676, + "step": 1522 + }, + { + "epoch": 0.17, + "grad_norm": 0.9766733759532904, + "learning_rate": 9.459375343331091e-06, + "loss": 0.7801, + "step": 1523 + }, + { + "epoch": 0.18, + "grad_norm": 1.6030340735159212, + "learning_rate": 9.458533377521014e-06, + "loss": 0.517, + "step": 1524 + }, + { + "epoch": 0.18, + "grad_norm": 1.6988472470307165, + "learning_rate": 9.457690794115664e-06, + "loss": 0.5458, + "step": 1525 + }, + { + "epoch": 0.18, + "grad_norm": 2.2751385754829427, + "learning_rate": 9.456847593231758e-06, + "loss": 0.5583, + "step": 1526 + }, + { + "epoch": 0.18, + "grad_norm": 2.385446649891306, + "learning_rate": 9.456003774986096e-06, + "loss": 0.6169, + "step": 1527 + }, + { + "epoch": 0.18, + "grad_norm": 2.169669422604865, + "learning_rate": 9.45515933949556e-06, + "loss": 0.5129, + "step": 1528 + }, + { + "epoch": 0.18, + "grad_norm": 1.7805941627597726, + "learning_rate": 9.454314286877127e-06, + "loss": 0.569, + "step": 1529 + }, + { + "epoch": 0.18, + "grad_norm": 2.6358505985341094, + "learning_rate": 9.45346861724785e-06, + "loss": 0.4823, + "step": 1530 + }, + { + "epoch": 0.18, + "grad_norm": 1.7894858945504952, + "learning_rate": 9.45262233072487e-06, + "loss": 0.5521, + "step": 1531 + }, + { + "epoch": 0.18, + "grad_norm": 2.652156941345048, + "learning_rate": 9.451775427425417e-06, + "loss": 0.5548, + "step": 1532 + }, + { + "epoch": 0.18, + "grad_norm": 2.8043718676510645, + "learning_rate": 9.450927907466803e-06, + "loss": 0.4128, + "step": 1533 + }, + { + "epoch": 0.18, + "grad_norm": 1.9885338788116866, + "learning_rate": 9.450079770966424e-06, + "loss": 0.4633, + "step": 1534 + }, + { + "epoch": 0.18, + "grad_norm": 1.986915889820981, + "learning_rate": 9.449231018041769e-06, + "loss": 0.4737, + "step": 1535 + }, + { + "epoch": 0.18, + "grad_norm": 2.0226185998553654, + "learning_rate": 9.448381648810403e-06, + "loss": 0.4827, + "step": 1536 + }, + { + "epoch": 0.18, + "grad_norm": 1.5378208451849988, + "learning_rate": 9.447531663389982e-06, + "loss": 0.5685, + "step": 1537 + }, + { + "epoch": 0.18, + "grad_norm": 2.1989547700602237, + "learning_rate": 9.446681061898244e-06, + "loss": 0.5481, + "step": 1538 + }, + { + "epoch": 0.18, + "grad_norm": 1.7461483412079766, + "learning_rate": 9.445829844453017e-06, + "loss": 0.5131, + "step": 1539 + }, + { + "epoch": 0.18, + "grad_norm": 1.0038722623007386, + "learning_rate": 9.444978011172207e-06, + "loss": 0.7803, + "step": 1540 + }, + { + "epoch": 0.18, + "grad_norm": 1.7657144239580935, + "learning_rate": 9.444125562173816e-06, + "loss": 0.5738, + "step": 1541 + }, + { + "epoch": 0.18, + "grad_norm": 1.977315713817142, + "learning_rate": 9.443272497575922e-06, + "loss": 0.5779, + "step": 1542 + }, + { + "epoch": 0.18, + "grad_norm": 1.7349565433400973, + "learning_rate": 9.442418817496689e-06, + "loss": 0.5682, + "step": 1543 + }, + { + "epoch": 0.18, + "grad_norm": 3.072917492730993, + "learning_rate": 9.441564522054372e-06, + "loss": 0.4662, + "step": 1544 + }, + { + "epoch": 0.18, + "grad_norm": 1.9603366244114617, + "learning_rate": 9.440709611367308e-06, + "loss": 0.5404, + "step": 1545 + }, + { + "epoch": 0.18, + "grad_norm": 2.3058408812745883, + "learning_rate": 9.439854085553914e-06, + "loss": 0.517, + "step": 1546 + }, + { + "epoch": 0.18, + "grad_norm": 2.942141637564404, + "learning_rate": 9.438997944732705e-06, + "loss": 0.5759, + "step": 1547 + }, + { + "epoch": 0.18, + "grad_norm": 1.888176375381119, + "learning_rate": 9.438141189022267e-06, + "loss": 0.5845, + "step": 1548 + }, + { + "epoch": 0.18, + "grad_norm": 2.7958421091183836, + "learning_rate": 9.43728381854128e-06, + "loss": 0.5848, + "step": 1549 + }, + { + "epoch": 0.18, + "grad_norm": 1.8418586436738982, + "learning_rate": 9.436425833408509e-06, + "loss": 0.5457, + "step": 1550 + }, + { + "epoch": 0.18, + "grad_norm": 2.312040209207688, + "learning_rate": 9.435567233742799e-06, + "loss": 0.4754, + "step": 1551 + }, + { + "epoch": 0.18, + "grad_norm": 1.8213631686188005, + "learning_rate": 9.434708019663085e-06, + "loss": 0.595, + "step": 1552 + }, + { + "epoch": 0.18, + "grad_norm": 1.6958279726854661, + "learning_rate": 9.433848191288384e-06, + "loss": 0.5166, + "step": 1553 + }, + { + "epoch": 0.18, + "grad_norm": 2.6039608029137833, + "learning_rate": 9.432987748737798e-06, + "loss": 0.595, + "step": 1554 + }, + { + "epoch": 0.18, + "grad_norm": 2.444212090530702, + "learning_rate": 9.432126692130518e-06, + "loss": 0.6208, + "step": 1555 + }, + { + "epoch": 0.18, + "grad_norm": 1.914056758420974, + "learning_rate": 9.431265021585816e-06, + "loss": 0.5174, + "step": 1556 + }, + { + "epoch": 0.18, + "grad_norm": 2.5720779055197163, + "learning_rate": 9.430402737223051e-06, + "loss": 0.5405, + "step": 1557 + }, + { + "epoch": 0.18, + "grad_norm": 1.8353091615135224, + "learning_rate": 9.429539839161665e-06, + "loss": 0.6386, + "step": 1558 + }, + { + "epoch": 0.18, + "grad_norm": 2.6682560882299255, + "learning_rate": 9.428676327521189e-06, + "loss": 0.5587, + "step": 1559 + }, + { + "epoch": 0.18, + "grad_norm": 2.2528850932549336, + "learning_rate": 9.427812202421236e-06, + "loss": 0.5145, + "step": 1560 + }, + { + "epoch": 0.18, + "grad_norm": 2.4866252642879645, + "learning_rate": 9.426947463981502e-06, + "loss": 0.6121, + "step": 1561 + }, + { + "epoch": 0.18, + "grad_norm": 1.748776826380648, + "learning_rate": 9.426082112321773e-06, + "loss": 0.6093, + "step": 1562 + }, + { + "epoch": 0.18, + "grad_norm": 4.209771080896768, + "learning_rate": 9.425216147561916e-06, + "loss": 0.5364, + "step": 1563 + }, + { + "epoch": 0.18, + "grad_norm": 0.870438844556082, + "learning_rate": 9.424349569821884e-06, + "loss": 0.7473, + "step": 1564 + }, + { + "epoch": 0.18, + "grad_norm": 4.060885811366647, + "learning_rate": 9.423482379221717e-06, + "loss": 0.4351, + "step": 1565 + }, + { + "epoch": 0.18, + "grad_norm": 1.7458260772542664, + "learning_rate": 9.422614575881536e-06, + "loss": 0.528, + "step": 1566 + }, + { + "epoch": 0.18, + "grad_norm": 1.8680515003621059, + "learning_rate": 9.421746159921553e-06, + "loss": 0.561, + "step": 1567 + }, + { + "epoch": 0.18, + "grad_norm": 1.810134387043382, + "learning_rate": 9.420877131462053e-06, + "loss": 0.5177, + "step": 1568 + }, + { + "epoch": 0.18, + "grad_norm": 2.281103211342526, + "learning_rate": 9.420007490623422e-06, + "loss": 0.5588, + "step": 1569 + }, + { + "epoch": 0.18, + "grad_norm": 2.2006922877456607, + "learning_rate": 9.419137237526116e-06, + "loss": 0.5113, + "step": 1570 + }, + { + "epoch": 0.18, + "grad_norm": 2.137370959466019, + "learning_rate": 9.418266372290689e-06, + "loss": 0.5734, + "step": 1571 + }, + { + "epoch": 0.18, + "grad_norm": 2.390238091119377, + "learning_rate": 9.417394895037768e-06, + "loss": 0.5692, + "step": 1572 + }, + { + "epoch": 0.18, + "grad_norm": 1.7867108325303955, + "learning_rate": 9.416522805888072e-06, + "loss": 0.5484, + "step": 1573 + }, + { + "epoch": 0.18, + "grad_norm": 1.7287156106410118, + "learning_rate": 9.415650104962399e-06, + "loss": 0.5086, + "step": 1574 + }, + { + "epoch": 0.18, + "grad_norm": 1.614209509115682, + "learning_rate": 9.414776792381639e-06, + "loss": 0.4683, + "step": 1575 + }, + { + "epoch": 0.18, + "grad_norm": 1.8526686678311381, + "learning_rate": 9.413902868266764e-06, + "loss": 0.6403, + "step": 1576 + }, + { + "epoch": 0.18, + "grad_norm": 2.1436525981823706, + "learning_rate": 9.413028332738827e-06, + "loss": 0.6158, + "step": 1577 + }, + { + "epoch": 0.18, + "grad_norm": 1.997408998716668, + "learning_rate": 9.41215318591897e-06, + "loss": 0.557, + "step": 1578 + }, + { + "epoch": 0.18, + "grad_norm": 5.682274264355876, + "learning_rate": 9.411277427928419e-06, + "loss": 0.5273, + "step": 1579 + }, + { + "epoch": 0.18, + "grad_norm": 2.9915799640097176, + "learning_rate": 9.410401058888482e-06, + "loss": 0.5122, + "step": 1580 + }, + { + "epoch": 0.18, + "grad_norm": 1.863239011162514, + "learning_rate": 9.409524078920553e-06, + "loss": 0.5355, + "step": 1581 + }, + { + "epoch": 0.18, + "grad_norm": 2.2768149552548866, + "learning_rate": 9.408646488146113e-06, + "loss": 0.5698, + "step": 1582 + }, + { + "epoch": 0.18, + "grad_norm": 2.0316930813079797, + "learning_rate": 9.407768286686726e-06, + "loss": 0.4587, + "step": 1583 + }, + { + "epoch": 0.18, + "grad_norm": 2.185120014444133, + "learning_rate": 9.40688947466404e-06, + "loss": 0.522, + "step": 1584 + }, + { + "epoch": 0.18, + "grad_norm": 1.8355065585628945, + "learning_rate": 9.406010052199786e-06, + "loss": 0.5596, + "step": 1585 + }, + { + "epoch": 0.18, + "grad_norm": 1.7964244886665925, + "learning_rate": 9.405130019415782e-06, + "loss": 0.575, + "step": 1586 + }, + { + "epoch": 0.18, + "grad_norm": 1.6286436422823765, + "learning_rate": 9.404249376433932e-06, + "loss": 0.5221, + "step": 1587 + }, + { + "epoch": 0.18, + "grad_norm": 1.9629959819228031, + "learning_rate": 9.403368123376222e-06, + "loss": 0.5028, + "step": 1588 + }, + { + "epoch": 0.18, + "grad_norm": 1.8467952421348366, + "learning_rate": 9.402486260364721e-06, + "loss": 0.5833, + "step": 1589 + }, + { + "epoch": 0.18, + "grad_norm": 2.2950549190414185, + "learning_rate": 9.401603787521584e-06, + "loss": 0.5262, + "step": 1590 + }, + { + "epoch": 0.18, + "grad_norm": 1.9048919972223175, + "learning_rate": 9.400720704969055e-06, + "loss": 0.4974, + "step": 1591 + }, + { + "epoch": 0.18, + "grad_norm": 2.752228683873074, + "learning_rate": 9.399837012829456e-06, + "loss": 0.5506, + "step": 1592 + }, + { + "epoch": 0.18, + "grad_norm": 3.9953779798529143, + "learning_rate": 9.398952711225195e-06, + "loss": 0.4445, + "step": 1593 + }, + { + "epoch": 0.18, + "grad_norm": 2.2018795280289054, + "learning_rate": 9.398067800278767e-06, + "loss": 0.4596, + "step": 1594 + }, + { + "epoch": 0.18, + "grad_norm": 1.6681465740057362, + "learning_rate": 9.397182280112748e-06, + "loss": 0.463, + "step": 1595 + }, + { + "epoch": 0.18, + "grad_norm": 1.5427933492272992, + "learning_rate": 9.396296150849804e-06, + "loss": 0.4764, + "step": 1596 + }, + { + "epoch": 0.18, + "grad_norm": 2.0167461007259617, + "learning_rate": 9.395409412612677e-06, + "loss": 0.5193, + "step": 1597 + }, + { + "epoch": 0.18, + "grad_norm": 1.7445542085222687, + "learning_rate": 9.394522065524199e-06, + "loss": 0.5209, + "step": 1598 + }, + { + "epoch": 0.18, + "grad_norm": 1.570325645842292, + "learning_rate": 9.393634109707286e-06, + "loss": 0.5666, + "step": 1599 + }, + { + "epoch": 0.18, + "grad_norm": 2.62731632193813, + "learning_rate": 9.392745545284938e-06, + "loss": 0.4471, + "step": 1600 + }, + { + "epoch": 0.18, + "grad_norm": 1.9883075038456708, + "learning_rate": 9.391856372380238e-06, + "loss": 0.5026, + "step": 1601 + }, + { + "epoch": 0.18, + "grad_norm": 2.2141677244465354, + "learning_rate": 9.390966591116351e-06, + "loss": 0.4177, + "step": 1602 + }, + { + "epoch": 0.18, + "grad_norm": 1.9429072382062087, + "learning_rate": 9.390076201616536e-06, + "loss": 0.5277, + "step": 1603 + }, + { + "epoch": 0.18, + "grad_norm": 0.9524843693486186, + "learning_rate": 9.389185204004123e-06, + "loss": 0.776, + "step": 1604 + }, + { + "epoch": 0.18, + "grad_norm": 2.4436621564967007, + "learning_rate": 9.388293598402538e-06, + "loss": 0.5519, + "step": 1605 + }, + { + "epoch": 0.18, + "grad_norm": 2.580810861388631, + "learning_rate": 9.387401384935282e-06, + "loss": 0.4495, + "step": 1606 + }, + { + "epoch": 0.18, + "grad_norm": 1.660961280407153, + "learning_rate": 9.386508563725947e-06, + "loss": 0.486, + "step": 1607 + }, + { + "epoch": 0.18, + "grad_norm": 1.5498932179204852, + "learning_rate": 9.385615134898206e-06, + "loss": 0.4832, + "step": 1608 + }, + { + "epoch": 0.18, + "grad_norm": 1.656627323274859, + "learning_rate": 9.384721098575815e-06, + "loss": 0.6313, + "step": 1609 + }, + { + "epoch": 0.18, + "grad_norm": 3.3286936415258985, + "learning_rate": 9.383826454882618e-06, + "loss": 0.4837, + "step": 1610 + }, + { + "epoch": 0.19, + "grad_norm": 2.242165112596168, + "learning_rate": 9.38293120394254e-06, + "loss": 0.6287, + "step": 1611 + }, + { + "epoch": 0.19, + "grad_norm": 1.9618078407314004, + "learning_rate": 9.38203534587959e-06, + "loss": 0.5228, + "step": 1612 + }, + { + "epoch": 0.19, + "grad_norm": 4.592663318979876, + "learning_rate": 9.381138880817862e-06, + "loss": 0.5712, + "step": 1613 + }, + { + "epoch": 0.19, + "grad_norm": 2.3376267464565847, + "learning_rate": 9.380241808881536e-06, + "loss": 0.6743, + "step": 1614 + }, + { + "epoch": 0.19, + "grad_norm": 1.8649294539568677, + "learning_rate": 9.379344130194873e-06, + "loss": 0.6241, + "step": 1615 + }, + { + "epoch": 0.19, + "grad_norm": 2.1033972423351037, + "learning_rate": 9.378445844882222e-06, + "loss": 0.5421, + "step": 1616 + }, + { + "epoch": 0.19, + "grad_norm": 3.0276002128068393, + "learning_rate": 9.377546953068008e-06, + "loss": 0.5465, + "step": 1617 + }, + { + "epoch": 0.19, + "grad_norm": 1.949817547082076, + "learning_rate": 9.37664745487675e-06, + "loss": 0.4586, + "step": 1618 + }, + { + "epoch": 0.19, + "grad_norm": 2.78858636720677, + "learning_rate": 9.375747350433044e-06, + "loss": 0.6349, + "step": 1619 + }, + { + "epoch": 0.19, + "grad_norm": 1.7454502884578886, + "learning_rate": 9.374846639861573e-06, + "loss": 0.492, + "step": 1620 + }, + { + "epoch": 0.19, + "grad_norm": 1.6503649648430325, + "learning_rate": 9.373945323287102e-06, + "loss": 0.5457, + "step": 1621 + }, + { + "epoch": 0.19, + "grad_norm": 1.8499920739614937, + "learning_rate": 9.373043400834482e-06, + "loss": 0.6042, + "step": 1622 + }, + { + "epoch": 0.19, + "grad_norm": 2.0167520935463745, + "learning_rate": 9.37214087262865e-06, + "loss": 0.5347, + "step": 1623 + }, + { + "epoch": 0.19, + "grad_norm": 1.5100709617349641, + "learning_rate": 9.37123773879462e-06, + "loss": 0.5089, + "step": 1624 + }, + { + "epoch": 0.19, + "grad_norm": 1.8008478191103365, + "learning_rate": 9.370333999457498e-06, + "loss": 0.5735, + "step": 1625 + }, + { + "epoch": 0.19, + "grad_norm": 2.3598057277155364, + "learning_rate": 9.369429654742463e-06, + "loss": 0.6149, + "step": 1626 + }, + { + "epoch": 0.19, + "grad_norm": 2.0311060968027648, + "learning_rate": 9.368524704774793e-06, + "loss": 0.5339, + "step": 1627 + }, + { + "epoch": 0.19, + "grad_norm": 3.6588586436955497, + "learning_rate": 9.367619149679836e-06, + "loss": 0.4918, + "step": 1628 + }, + { + "epoch": 0.19, + "grad_norm": 1.5312167814682616, + "learning_rate": 9.366712989583031e-06, + "loss": 0.4911, + "step": 1629 + }, + { + "epoch": 0.19, + "grad_norm": 1.6772441422717488, + "learning_rate": 9.3658062246099e-06, + "loss": 0.5281, + "step": 1630 + }, + { + "epoch": 0.19, + "grad_norm": 2.2196791581857616, + "learning_rate": 9.364898854886044e-06, + "loss": 0.5219, + "step": 1631 + }, + { + "epoch": 0.19, + "grad_norm": 1.690329251618206, + "learning_rate": 9.363990880537157e-06, + "loss": 0.4907, + "step": 1632 + }, + { + "epoch": 0.19, + "grad_norm": 2.1773979013974643, + "learning_rate": 9.363082301689008e-06, + "loss": 0.702, + "step": 1633 + }, + { + "epoch": 0.19, + "grad_norm": 0.9129287728075911, + "learning_rate": 9.362173118467455e-06, + "loss": 0.7323, + "step": 1634 + }, + { + "epoch": 0.19, + "grad_norm": 2.832091081125249, + "learning_rate": 9.361263330998436e-06, + "loss": 0.5309, + "step": 1635 + }, + { + "epoch": 0.19, + "grad_norm": 1.845024142967831, + "learning_rate": 9.360352939407977e-06, + "loss": 0.5611, + "step": 1636 + }, + { + "epoch": 0.19, + "grad_norm": 1.7155968780584063, + "learning_rate": 9.359441943822185e-06, + "loss": 0.5773, + "step": 1637 + }, + { + "epoch": 0.19, + "grad_norm": 1.940951756223291, + "learning_rate": 9.358530344367247e-06, + "loss": 0.4815, + "step": 1638 + }, + { + "epoch": 0.19, + "grad_norm": 2.4746351623637635, + "learning_rate": 9.357618141169444e-06, + "loss": 0.5174, + "step": 1639 + }, + { + "epoch": 0.19, + "grad_norm": 2.010168292301214, + "learning_rate": 9.35670533435513e-06, + "loss": 0.5211, + "step": 1640 + }, + { + "epoch": 0.19, + "grad_norm": 1.7887664725592907, + "learning_rate": 9.355791924050746e-06, + "loss": 0.503, + "step": 1641 + }, + { + "epoch": 0.19, + "grad_norm": 1.9662081867662127, + "learning_rate": 9.35487791038282e-06, + "loss": 0.5247, + "step": 1642 + }, + { + "epoch": 0.19, + "grad_norm": 1.9667746818100165, + "learning_rate": 9.35396329347796e-06, + "loss": 0.4986, + "step": 1643 + }, + { + "epoch": 0.19, + "grad_norm": 1.755291694452646, + "learning_rate": 9.35304807346286e-06, + "loss": 0.6297, + "step": 1644 + }, + { + "epoch": 0.19, + "grad_norm": 1.5268666469571373, + "learning_rate": 9.352132250464294e-06, + "loss": 0.5353, + "step": 1645 + }, + { + "epoch": 0.19, + "grad_norm": 2.182152519595133, + "learning_rate": 9.351215824609123e-06, + "loss": 0.5012, + "step": 1646 + }, + { + "epoch": 0.19, + "grad_norm": 2.071807968427525, + "learning_rate": 9.350298796024288e-06, + "loss": 0.5629, + "step": 1647 + }, + { + "epoch": 0.19, + "grad_norm": 1.7609081932742119, + "learning_rate": 9.349381164836818e-06, + "loss": 0.527, + "step": 1648 + }, + { + "epoch": 0.19, + "grad_norm": 2.03081093691396, + "learning_rate": 9.348462931173824e-06, + "loss": 0.5172, + "step": 1649 + }, + { + "epoch": 0.19, + "grad_norm": 2.0200849732321906, + "learning_rate": 9.347544095162495e-06, + "loss": 0.6014, + "step": 1650 + }, + { + "epoch": 0.19, + "grad_norm": 2.2375401706846145, + "learning_rate": 9.346624656930113e-06, + "loss": 0.5171, + "step": 1651 + }, + { + "epoch": 0.19, + "grad_norm": 1.598803077642052, + "learning_rate": 9.345704616604036e-06, + "loss": 0.5759, + "step": 1652 + }, + { + "epoch": 0.19, + "grad_norm": 2.1723427389105816, + "learning_rate": 9.344783974311709e-06, + "loss": 0.469, + "step": 1653 + }, + { + "epoch": 0.19, + "grad_norm": 1.7788756360752505, + "learning_rate": 9.343862730180657e-06, + "loss": 0.5035, + "step": 1654 + }, + { + "epoch": 0.19, + "grad_norm": 2.489989827105883, + "learning_rate": 9.342940884338492e-06, + "loss": 0.6048, + "step": 1655 + }, + { + "epoch": 0.19, + "grad_norm": 1.5205891378341054, + "learning_rate": 9.342018436912908e-06, + "loss": 0.5814, + "step": 1656 + }, + { + "epoch": 0.19, + "grad_norm": 2.1676788234413187, + "learning_rate": 9.341095388031684e-06, + "loss": 0.5329, + "step": 1657 + }, + { + "epoch": 0.19, + "grad_norm": 1.9402058529923814, + "learning_rate": 9.340171737822677e-06, + "loss": 0.5218, + "step": 1658 + }, + { + "epoch": 0.19, + "grad_norm": 1.881072234578272, + "learning_rate": 9.339247486413832e-06, + "loss": 0.5825, + "step": 1659 + }, + { + "epoch": 0.19, + "grad_norm": 1.9175369724697766, + "learning_rate": 9.338322633933178e-06, + "loss": 0.4724, + "step": 1660 + }, + { + "epoch": 0.19, + "grad_norm": 20.418053166467853, + "learning_rate": 9.337397180508825e-06, + "loss": 0.4652, + "step": 1661 + }, + { + "epoch": 0.19, + "grad_norm": 1.9262230133338871, + "learning_rate": 9.336471126268965e-06, + "loss": 0.5184, + "step": 1662 + }, + { + "epoch": 0.19, + "grad_norm": 2.0559451459029967, + "learning_rate": 9.335544471341876e-06, + "loss": 0.4706, + "step": 1663 + }, + { + "epoch": 0.19, + "grad_norm": 5.948126151951804, + "learning_rate": 9.334617215855916e-06, + "loss": 0.5583, + "step": 1664 + }, + { + "epoch": 0.19, + "grad_norm": 2.0959032273625624, + "learning_rate": 9.33368935993953e-06, + "loss": 0.5169, + "step": 1665 + }, + { + "epoch": 0.19, + "grad_norm": 1.7551353465570954, + "learning_rate": 9.332760903721248e-06, + "loss": 0.5775, + "step": 1666 + }, + { + "epoch": 0.19, + "grad_norm": 2.015298559665234, + "learning_rate": 9.331831847329674e-06, + "loss": 0.5793, + "step": 1667 + }, + { + "epoch": 0.19, + "grad_norm": 1.7841466972445137, + "learning_rate": 9.3309021908935e-06, + "loss": 0.5258, + "step": 1668 + }, + { + "epoch": 0.19, + "grad_norm": 2.5071785752632825, + "learning_rate": 9.329971934541508e-06, + "loss": 0.5644, + "step": 1669 + }, + { + "epoch": 0.19, + "grad_norm": 2.4689997155356616, + "learning_rate": 9.329041078402553e-06, + "loss": 0.555, + "step": 1670 + }, + { + "epoch": 0.19, + "grad_norm": 1.8628550108931088, + "learning_rate": 9.328109622605579e-06, + "loss": 0.5878, + "step": 1671 + }, + { + "epoch": 0.19, + "grad_norm": 1.83086627279445, + "learning_rate": 9.327177567279608e-06, + "loss": 0.5181, + "step": 1672 + }, + { + "epoch": 0.19, + "grad_norm": 0.890704231586985, + "learning_rate": 9.326244912553749e-06, + "loss": 0.7284, + "step": 1673 + }, + { + "epoch": 0.19, + "grad_norm": 1.85034561764086, + "learning_rate": 9.325311658557195e-06, + "loss": 0.5443, + "step": 1674 + }, + { + "epoch": 0.19, + "grad_norm": 1.7523869508674763, + "learning_rate": 9.32437780541922e-06, + "loss": 0.5438, + "step": 1675 + }, + { + "epoch": 0.19, + "grad_norm": 2.120203311956958, + "learning_rate": 9.323443353269179e-06, + "loss": 0.5628, + "step": 1676 + }, + { + "epoch": 0.19, + "grad_norm": 1.8949944133321155, + "learning_rate": 9.322508302236515e-06, + "loss": 0.4906, + "step": 1677 + }, + { + "epoch": 0.19, + "grad_norm": 1.6104717066554324, + "learning_rate": 9.321572652450749e-06, + "loss": 0.6067, + "step": 1678 + }, + { + "epoch": 0.19, + "grad_norm": 1.907027282582565, + "learning_rate": 9.320636404041487e-06, + "loss": 0.5042, + "step": 1679 + }, + { + "epoch": 0.19, + "grad_norm": 2.556490597953133, + "learning_rate": 9.31969955713842e-06, + "loss": 0.5551, + "step": 1680 + }, + { + "epoch": 0.19, + "grad_norm": 1.7604952199030588, + "learning_rate": 9.318762111871318e-06, + "loss": 0.567, + "step": 1681 + }, + { + "epoch": 0.19, + "grad_norm": 1.9717231670496491, + "learning_rate": 9.317824068370036e-06, + "loss": 0.5675, + "step": 1682 + }, + { + "epoch": 0.19, + "grad_norm": 1.7838151625845646, + "learning_rate": 9.316885426764512e-06, + "loss": 0.4793, + "step": 1683 + }, + { + "epoch": 0.19, + "grad_norm": 2.1260110021628784, + "learning_rate": 9.315946187184765e-06, + "loss": 0.4534, + "step": 1684 + }, + { + "epoch": 0.19, + "grad_norm": 0.9524502686507368, + "learning_rate": 9.315006349760903e-06, + "loss": 0.7155, + "step": 1685 + }, + { + "epoch": 0.19, + "grad_norm": 1.6977042332185168, + "learning_rate": 9.314065914623106e-06, + "loss": 0.5344, + "step": 1686 + }, + { + "epoch": 0.19, + "grad_norm": 1.7123875041222583, + "learning_rate": 9.313124881901648e-06, + "loss": 0.4912, + "step": 1687 + }, + { + "epoch": 0.19, + "grad_norm": 0.8031208988745432, + "learning_rate": 9.312183251726876e-06, + "loss": 0.6784, + "step": 1688 + }, + { + "epoch": 0.19, + "grad_norm": 1.6460891424841892, + "learning_rate": 9.311241024229227e-06, + "loss": 0.5118, + "step": 1689 + }, + { + "epoch": 0.19, + "grad_norm": 1.857826997164157, + "learning_rate": 9.31029819953922e-06, + "loss": 0.5509, + "step": 1690 + }, + { + "epoch": 0.19, + "grad_norm": 2.5030341662851896, + "learning_rate": 9.309354777787452e-06, + "loss": 0.4867, + "step": 1691 + }, + { + "epoch": 0.19, + "grad_norm": 1.4581579820689228, + "learning_rate": 9.308410759104606e-06, + "loss": 0.4773, + "step": 1692 + }, + { + "epoch": 0.19, + "grad_norm": 2.1294693420619235, + "learning_rate": 9.307466143621449e-06, + "loss": 0.4801, + "step": 1693 + }, + { + "epoch": 0.19, + "grad_norm": 1.8206610963410648, + "learning_rate": 9.306520931468828e-06, + "loss": 0.5204, + "step": 1694 + }, + { + "epoch": 0.19, + "grad_norm": 1.7567220059452366, + "learning_rate": 9.305575122777672e-06, + "loss": 0.605, + "step": 1695 + }, + { + "epoch": 0.19, + "grad_norm": 1.7471833702748962, + "learning_rate": 9.304628717678997e-06, + "loss": 0.5759, + "step": 1696 + }, + { + "epoch": 0.19, + "grad_norm": 2.1298582970523965, + "learning_rate": 9.303681716303896e-06, + "loss": 0.5313, + "step": 1697 + }, + { + "epoch": 0.2, + "grad_norm": 2.249811427482779, + "learning_rate": 9.302734118783551e-06, + "loss": 0.569, + "step": 1698 + }, + { + "epoch": 0.2, + "grad_norm": 1.0545328509448202, + "learning_rate": 9.30178592524922e-06, + "loss": 0.8219, + "step": 1699 + }, + { + "epoch": 0.2, + "grad_norm": 1.7480512481127208, + "learning_rate": 9.300837135832249e-06, + "loss": 0.6424, + "step": 1700 + }, + { + "epoch": 0.2, + "grad_norm": 11.671875010015276, + "learning_rate": 9.299887750664062e-06, + "loss": 0.5655, + "step": 1701 + }, + { + "epoch": 0.2, + "grad_norm": 2.140857094371549, + "learning_rate": 9.298937769876168e-06, + "loss": 0.5079, + "step": 1702 + }, + { + "epoch": 0.2, + "grad_norm": 1.9804452311861018, + "learning_rate": 9.29798719360016e-06, + "loss": 0.7256, + "step": 1703 + }, + { + "epoch": 0.2, + "grad_norm": 2.1748119547094826, + "learning_rate": 9.297036021967709e-06, + "loss": 0.4758, + "step": 1704 + }, + { + "epoch": 0.2, + "grad_norm": 4.173287989562393, + "learning_rate": 9.296084255110574e-06, + "loss": 0.6222, + "step": 1705 + }, + { + "epoch": 0.2, + "grad_norm": 2.0189381478411104, + "learning_rate": 9.295131893160591e-06, + "loss": 0.6305, + "step": 1706 + }, + { + "epoch": 0.2, + "grad_norm": 1.6480246533419487, + "learning_rate": 9.294178936249682e-06, + "loss": 0.5096, + "step": 1707 + }, + { + "epoch": 0.2, + "grad_norm": 2.033585701299999, + "learning_rate": 9.29322538450985e-06, + "loss": 0.5171, + "step": 1708 + }, + { + "epoch": 0.2, + "grad_norm": 1.6493562347377557, + "learning_rate": 9.292271238073182e-06, + "loss": 0.5271, + "step": 1709 + }, + { + "epoch": 0.2, + "grad_norm": 1.5186542710373736, + "learning_rate": 9.291316497071847e-06, + "loss": 0.4936, + "step": 1710 + }, + { + "epoch": 0.2, + "grad_norm": 3.8299734116166455, + "learning_rate": 9.290361161638093e-06, + "loss": 0.4407, + "step": 1711 + }, + { + "epoch": 0.2, + "grad_norm": 2.0904959960450777, + "learning_rate": 9.289405231904255e-06, + "loss": 0.4839, + "step": 1712 + }, + { + "epoch": 0.2, + "grad_norm": 2.1669664535299336, + "learning_rate": 9.288448708002743e-06, + "loss": 0.5695, + "step": 1713 + }, + { + "epoch": 0.2, + "grad_norm": 1.8587318527713061, + "learning_rate": 9.287491590066064e-06, + "loss": 0.5323, + "step": 1714 + }, + { + "epoch": 0.2, + "grad_norm": 1.7633797237150333, + "learning_rate": 9.286533878226789e-06, + "loss": 0.6068, + "step": 1715 + }, + { + "epoch": 0.2, + "grad_norm": 2.2555066996358035, + "learning_rate": 9.285575572617586e-06, + "loss": 0.5615, + "step": 1716 + }, + { + "epoch": 0.2, + "grad_norm": 1.949895636328444, + "learning_rate": 9.284616673371196e-06, + "loss": 0.6067, + "step": 1717 + }, + { + "epoch": 0.2, + "grad_norm": 1.6641944382141005, + "learning_rate": 9.283657180620446e-06, + "loss": 0.5577, + "step": 1718 + }, + { + "epoch": 0.2, + "grad_norm": 1.7511235280427524, + "learning_rate": 9.282697094498245e-06, + "loss": 0.6046, + "step": 1719 + }, + { + "epoch": 0.2, + "grad_norm": 4.414553788212881, + "learning_rate": 9.281736415137586e-06, + "loss": 0.462, + "step": 1720 + }, + { + "epoch": 0.2, + "grad_norm": 1.6693156003841387, + "learning_rate": 9.280775142671539e-06, + "loss": 0.4884, + "step": 1721 + }, + { + "epoch": 0.2, + "grad_norm": 1.916775836886097, + "learning_rate": 9.279813277233261e-06, + "loss": 0.6012, + "step": 1722 + }, + { + "epoch": 0.2, + "grad_norm": 1.7620966205766, + "learning_rate": 9.278850818955989e-06, + "loss": 0.5516, + "step": 1723 + }, + { + "epoch": 0.2, + "grad_norm": 2.1150048556025753, + "learning_rate": 9.277887767973044e-06, + "loss": 0.4484, + "step": 1724 + }, + { + "epoch": 0.2, + "grad_norm": 2.38236577808735, + "learning_rate": 9.276924124417825e-06, + "loss": 0.5757, + "step": 1725 + }, + { + "epoch": 0.2, + "grad_norm": 2.3706686831043804, + "learning_rate": 9.275959888423817e-06, + "loss": 0.5205, + "step": 1726 + }, + { + "epoch": 0.2, + "grad_norm": 1.9224324308109333, + "learning_rate": 9.274995060124587e-06, + "loss": 0.4409, + "step": 1727 + }, + { + "epoch": 0.2, + "grad_norm": 4.01397500517764, + "learning_rate": 9.27402963965378e-06, + "loss": 0.6055, + "step": 1728 + }, + { + "epoch": 0.2, + "grad_norm": 2.7855739472627676, + "learning_rate": 9.273063627145129e-06, + "loss": 0.6416, + "step": 1729 + }, + { + "epoch": 0.2, + "grad_norm": 2.1938142516751835, + "learning_rate": 9.272097022732444e-06, + "loss": 0.493, + "step": 1730 + }, + { + "epoch": 0.2, + "grad_norm": 3.2671632617175543, + "learning_rate": 9.271129826549618e-06, + "loss": 0.5706, + "step": 1731 + }, + { + "epoch": 0.2, + "grad_norm": 1.745878934945339, + "learning_rate": 9.27016203873063e-06, + "loss": 0.547, + "step": 1732 + }, + { + "epoch": 0.2, + "grad_norm": 2.2405446983690793, + "learning_rate": 9.269193659409537e-06, + "loss": 0.4102, + "step": 1733 + }, + { + "epoch": 0.2, + "grad_norm": 1.6376638649941833, + "learning_rate": 9.268224688720475e-06, + "loss": 0.4033, + "step": 1734 + }, + { + "epoch": 0.2, + "grad_norm": 1.898115728206501, + "learning_rate": 9.26725512679767e-06, + "loss": 0.517, + "step": 1735 + }, + { + "epoch": 0.2, + "grad_norm": 2.100911007122096, + "learning_rate": 9.266284973775423e-06, + "loss": 0.5908, + "step": 1736 + }, + { + "epoch": 0.2, + "grad_norm": 2.6888217399057366, + "learning_rate": 9.265314229788122e-06, + "loss": 0.4371, + "step": 1737 + }, + { + "epoch": 0.2, + "grad_norm": 3.261139539657666, + "learning_rate": 9.264342894970232e-06, + "loss": 0.5368, + "step": 1738 + }, + { + "epoch": 0.2, + "grad_norm": 2.0070994367713797, + "learning_rate": 9.263370969456303e-06, + "loss": 0.5619, + "step": 1739 + }, + { + "epoch": 0.2, + "grad_norm": 2.233244602205971, + "learning_rate": 9.262398453380964e-06, + "loss": 0.6043, + "step": 1740 + }, + { + "epoch": 0.2, + "grad_norm": 2.1344209362683615, + "learning_rate": 9.261425346878932e-06, + "loss": 0.5722, + "step": 1741 + }, + { + "epoch": 0.2, + "grad_norm": 4.312045015174331, + "learning_rate": 9.260451650084997e-06, + "loss": 0.4329, + "step": 1742 + }, + { + "epoch": 0.2, + "grad_norm": 1.73223118750901, + "learning_rate": 9.259477363134038e-06, + "loss": 0.5158, + "step": 1743 + }, + { + "epoch": 0.2, + "grad_norm": 2.314947432089919, + "learning_rate": 9.258502486161011e-06, + "loss": 0.5371, + "step": 1744 + }, + { + "epoch": 0.2, + "grad_norm": 4.488166647225511, + "learning_rate": 9.25752701930096e-06, + "loss": 0.4757, + "step": 1745 + }, + { + "epoch": 0.2, + "grad_norm": 0.9617956614253966, + "learning_rate": 9.256550962689003e-06, + "loss": 0.735, + "step": 1746 + }, + { + "epoch": 0.2, + "grad_norm": 2.0685417327972937, + "learning_rate": 9.255574316460342e-06, + "loss": 0.5729, + "step": 1747 + }, + { + "epoch": 0.2, + "grad_norm": 1.9397338365648071, + "learning_rate": 9.254597080750268e-06, + "loss": 0.5669, + "step": 1748 + }, + { + "epoch": 0.2, + "grad_norm": 3.140901289905813, + "learning_rate": 9.25361925569414e-06, + "loss": 0.5026, + "step": 1749 + }, + { + "epoch": 0.2, + "grad_norm": 2.1999199991383493, + "learning_rate": 9.25264084142741e-06, + "loss": 0.5072, + "step": 1750 + }, + { + "epoch": 0.2, + "grad_norm": 1.7285888415076514, + "learning_rate": 9.251661838085606e-06, + "loss": 0.604, + "step": 1751 + }, + { + "epoch": 0.2, + "grad_norm": 2.494550271524926, + "learning_rate": 9.250682245804342e-06, + "loss": 0.5594, + "step": 1752 + }, + { + "epoch": 0.2, + "grad_norm": 4.153887537450256, + "learning_rate": 9.249702064719308e-06, + "loss": 0.5721, + "step": 1753 + }, + { + "epoch": 0.2, + "grad_norm": 1.8075900480034057, + "learning_rate": 9.248721294966284e-06, + "loss": 0.6166, + "step": 1754 + }, + { + "epoch": 0.2, + "grad_norm": 2.0685179132930416, + "learning_rate": 9.247739936681118e-06, + "loss": 0.4231, + "step": 1755 + }, + { + "epoch": 0.2, + "grad_norm": 2.522963444088429, + "learning_rate": 9.246757989999754e-06, + "loss": 0.4632, + "step": 1756 + }, + { + "epoch": 0.2, + "grad_norm": 1.7720080104646432, + "learning_rate": 9.245775455058207e-06, + "loss": 0.5268, + "step": 1757 + }, + { + "epoch": 0.2, + "grad_norm": 1.777016192418897, + "learning_rate": 9.24479233199258e-06, + "loss": 0.5514, + "step": 1758 + }, + { + "epoch": 0.2, + "grad_norm": 1.6858353612969699, + "learning_rate": 9.243808620939057e-06, + "loss": 0.6307, + "step": 1759 + }, + { + "epoch": 0.2, + "grad_norm": 1.8225147284944496, + "learning_rate": 9.242824322033895e-06, + "loss": 0.5193, + "step": 1760 + }, + { + "epoch": 0.2, + "grad_norm": 4.238989755907847, + "learning_rate": 9.241839435413445e-06, + "loss": 0.6767, + "step": 1761 + }, + { + "epoch": 0.2, + "grad_norm": 7.657645169142675, + "learning_rate": 9.24085396121413e-06, + "loss": 0.5105, + "step": 1762 + }, + { + "epoch": 0.2, + "grad_norm": 3.472699153319154, + "learning_rate": 9.239867899572459e-06, + "loss": 0.6072, + "step": 1763 + }, + { + "epoch": 0.2, + "grad_norm": 3.0460582834217007, + "learning_rate": 9.238881250625023e-06, + "loss": 0.5136, + "step": 1764 + }, + { + "epoch": 0.2, + "grad_norm": 1.7452890341890717, + "learning_rate": 9.237894014508487e-06, + "loss": 0.5396, + "step": 1765 + }, + { + "epoch": 0.2, + "grad_norm": 4.541047991053986, + "learning_rate": 9.236906191359608e-06, + "loss": 0.4281, + "step": 1766 + }, + { + "epoch": 0.2, + "grad_norm": 1.9218260778816876, + "learning_rate": 9.235917781315217e-06, + "loss": 0.5829, + "step": 1767 + }, + { + "epoch": 0.2, + "grad_norm": 1.6567259338940978, + "learning_rate": 9.23492878451223e-06, + "loss": 0.4657, + "step": 1768 + }, + { + "epoch": 0.2, + "grad_norm": 2.12920076329061, + "learning_rate": 9.233939201087639e-06, + "loss": 0.4404, + "step": 1769 + }, + { + "epoch": 0.2, + "grad_norm": 1.7231129848252806, + "learning_rate": 9.232949031178524e-06, + "loss": 0.4778, + "step": 1770 + }, + { + "epoch": 0.2, + "grad_norm": 1.6687219688921977, + "learning_rate": 9.231958274922042e-06, + "loss": 0.5585, + "step": 1771 + }, + { + "epoch": 0.2, + "grad_norm": 1.8497823571547065, + "learning_rate": 9.230966932455434e-06, + "loss": 0.6104, + "step": 1772 + }, + { + "epoch": 0.2, + "grad_norm": 3.061172769883759, + "learning_rate": 9.22997500391602e-06, + "loss": 0.5214, + "step": 1773 + }, + { + "epoch": 0.2, + "grad_norm": 3.020129858974131, + "learning_rate": 9.228982489441199e-06, + "loss": 0.5454, + "step": 1774 + }, + { + "epoch": 0.2, + "grad_norm": 2.3850469036444344, + "learning_rate": 9.227989389168454e-06, + "loss": 0.5998, + "step": 1775 + }, + { + "epoch": 0.2, + "grad_norm": 2.690129713596542, + "learning_rate": 9.226995703235355e-06, + "loss": 0.6152, + "step": 1776 + }, + { + "epoch": 0.2, + "grad_norm": 1.6039194247374198, + "learning_rate": 9.226001431779543e-06, + "loss": 0.5586, + "step": 1777 + }, + { + "epoch": 0.2, + "grad_norm": 2.025359666134105, + "learning_rate": 9.225006574938745e-06, + "loss": 0.4411, + "step": 1778 + }, + { + "epoch": 0.2, + "grad_norm": 2.137278956157683, + "learning_rate": 9.224011132850765e-06, + "loss": 0.4883, + "step": 1779 + }, + { + "epoch": 0.2, + "grad_norm": 2.050831612541425, + "learning_rate": 9.223015105653497e-06, + "loss": 0.5034, + "step": 1780 + }, + { + "epoch": 0.2, + "grad_norm": 2.332124191851134, + "learning_rate": 9.222018493484907e-06, + "loss": 0.5034, + "step": 1781 + }, + { + "epoch": 0.2, + "grad_norm": 2.151898173779744, + "learning_rate": 9.221021296483047e-06, + "loss": 0.5179, + "step": 1782 + }, + { + "epoch": 0.2, + "grad_norm": 2.0201980247756803, + "learning_rate": 9.220023514786047e-06, + "loss": 0.399, + "step": 1783 + }, + { + "epoch": 0.2, + "grad_norm": 2.1814300163939624, + "learning_rate": 9.219025148532124e-06, + "loss": 0.6111, + "step": 1784 + }, + { + "epoch": 0.21, + "grad_norm": 5.197113075633367, + "learning_rate": 9.218026197859565e-06, + "loss": 0.4652, + "step": 1785 + }, + { + "epoch": 0.21, + "grad_norm": 1.7903012509168792, + "learning_rate": 9.217026662906747e-06, + "loss": 0.5569, + "step": 1786 + }, + { + "epoch": 0.21, + "grad_norm": 0.9326665734843288, + "learning_rate": 9.216026543812129e-06, + "loss": 0.7471, + "step": 1787 + }, + { + "epoch": 0.21, + "grad_norm": 12.126023062470786, + "learning_rate": 9.215025840714243e-06, + "loss": 0.5279, + "step": 1788 + }, + { + "epoch": 0.21, + "grad_norm": 2.0579227510180287, + "learning_rate": 9.214024553751709e-06, + "loss": 0.6602, + "step": 1789 + }, + { + "epoch": 0.21, + "grad_norm": 2.9626414416159172, + "learning_rate": 9.21302268306322e-06, + "loss": 0.5305, + "step": 1790 + }, + { + "epoch": 0.21, + "grad_norm": 2.2437894388769, + "learning_rate": 9.212020228787562e-06, + "loss": 0.4894, + "step": 1791 + }, + { + "epoch": 0.21, + "grad_norm": 3.256827513396298, + "learning_rate": 9.21101719106359e-06, + "loss": 0.4984, + "step": 1792 + }, + { + "epoch": 0.21, + "grad_norm": 1.9117355346291962, + "learning_rate": 9.210013570030246e-06, + "loss": 0.5877, + "step": 1793 + }, + { + "epoch": 0.21, + "grad_norm": 1.9346739038510565, + "learning_rate": 9.209009365826553e-06, + "loss": 0.4944, + "step": 1794 + }, + { + "epoch": 0.21, + "grad_norm": 2.2923896260919716, + "learning_rate": 9.20800457859161e-06, + "loss": 0.5518, + "step": 1795 + }, + { + "epoch": 0.21, + "grad_norm": 1.935080120734737, + "learning_rate": 9.206999208464602e-06, + "loss": 0.5728, + "step": 1796 + }, + { + "epoch": 0.21, + "grad_norm": 2.2387378544723875, + "learning_rate": 9.205993255584793e-06, + "loss": 0.4694, + "step": 1797 + }, + { + "epoch": 0.21, + "grad_norm": 3.6141508127479476, + "learning_rate": 9.204986720091527e-06, + "loss": 0.5316, + "step": 1798 + }, + { + "epoch": 0.21, + "grad_norm": 0.923212355555867, + "learning_rate": 9.203979602124227e-06, + "loss": 0.7379, + "step": 1799 + }, + { + "epoch": 0.21, + "grad_norm": 1.7174323897023884, + "learning_rate": 9.202971901822401e-06, + "loss": 0.5369, + "step": 1800 + }, + { + "epoch": 0.21, + "grad_norm": 2.149912136910846, + "learning_rate": 9.201963619325637e-06, + "loss": 0.5556, + "step": 1801 + }, + { + "epoch": 0.21, + "grad_norm": 1.7152865798518133, + "learning_rate": 9.200954754773598e-06, + "loss": 0.5244, + "step": 1802 + }, + { + "epoch": 0.21, + "grad_norm": 1.9933439146724623, + "learning_rate": 9.199945308306037e-06, + "loss": 0.4657, + "step": 1803 + }, + { + "epoch": 0.21, + "grad_norm": 3.385353556019281, + "learning_rate": 9.198935280062777e-06, + "loss": 0.4397, + "step": 1804 + }, + { + "epoch": 0.21, + "grad_norm": 1.8009881603970972, + "learning_rate": 9.19792467018373e-06, + "loss": 0.6378, + "step": 1805 + }, + { + "epoch": 0.21, + "grad_norm": 2.1227904905286, + "learning_rate": 9.196913478808884e-06, + "loss": 0.6093, + "step": 1806 + }, + { + "epoch": 0.21, + "grad_norm": 1.65541491251585, + "learning_rate": 9.19590170607831e-06, + "loss": 0.55, + "step": 1807 + }, + { + "epoch": 0.21, + "grad_norm": 1.8004198144346177, + "learning_rate": 9.19488935213216e-06, + "loss": 0.5754, + "step": 1808 + }, + { + "epoch": 0.21, + "grad_norm": 1.6392718422746173, + "learning_rate": 9.193876417110663e-06, + "loss": 0.5246, + "step": 1809 + }, + { + "epoch": 0.21, + "grad_norm": 2.6041962325593957, + "learning_rate": 9.19286290115413e-06, + "loss": 0.4805, + "step": 1810 + }, + { + "epoch": 0.21, + "grad_norm": 2.152945962884701, + "learning_rate": 9.191848804402953e-06, + "loss": 0.5872, + "step": 1811 + }, + { + "epoch": 0.21, + "grad_norm": 2.0157569167531815, + "learning_rate": 9.190834126997608e-06, + "loss": 0.5304, + "step": 1812 + }, + { + "epoch": 0.21, + "grad_norm": 1.8933135369308454, + "learning_rate": 9.189818869078646e-06, + "loss": 0.4374, + "step": 1813 + }, + { + "epoch": 0.21, + "grad_norm": 3.0081378690035843, + "learning_rate": 9.188803030786699e-06, + "loss": 0.5237, + "step": 1814 + }, + { + "epoch": 0.21, + "grad_norm": 1.816258066899212, + "learning_rate": 9.18778661226248e-06, + "loss": 0.5353, + "step": 1815 + }, + { + "epoch": 0.21, + "grad_norm": 1.9725331292589412, + "learning_rate": 9.186769613646788e-06, + "loss": 0.5072, + "step": 1816 + }, + { + "epoch": 0.21, + "grad_norm": 1.9242474553624498, + "learning_rate": 9.185752035080493e-06, + "loss": 0.5533, + "step": 1817 + }, + { + "epoch": 0.21, + "grad_norm": 1.8027038075045165, + "learning_rate": 9.184733876704551e-06, + "loss": 0.483, + "step": 1818 + }, + { + "epoch": 0.21, + "grad_norm": 2.488182535391086, + "learning_rate": 9.183715138659996e-06, + "loss": 0.6066, + "step": 1819 + }, + { + "epoch": 0.21, + "grad_norm": 2.0794839724835548, + "learning_rate": 9.182695821087946e-06, + "loss": 0.5022, + "step": 1820 + }, + { + "epoch": 0.21, + "grad_norm": 1.5755164555833303, + "learning_rate": 9.181675924129595e-06, + "loss": 0.5197, + "step": 1821 + }, + { + "epoch": 0.21, + "grad_norm": 0.9283660167109985, + "learning_rate": 9.180655447926219e-06, + "loss": 0.7784, + "step": 1822 + }, + { + "epoch": 0.21, + "grad_norm": 2.2116940209469713, + "learning_rate": 9.179634392619174e-06, + "loss": 0.6143, + "step": 1823 + }, + { + "epoch": 0.21, + "grad_norm": 2.139523940555377, + "learning_rate": 9.178612758349899e-06, + "loss": 0.5217, + "step": 1824 + }, + { + "epoch": 0.21, + "grad_norm": 1.9511526129103196, + "learning_rate": 9.177590545259907e-06, + "loss": 0.4964, + "step": 1825 + }, + { + "epoch": 0.21, + "grad_norm": 0.823612264443561, + "learning_rate": 9.176567753490795e-06, + "loss": 0.6966, + "step": 1826 + }, + { + "epoch": 0.21, + "grad_norm": 2.4795243834549616, + "learning_rate": 9.175544383184243e-06, + "loss": 0.4929, + "step": 1827 + }, + { + "epoch": 0.21, + "grad_norm": 2.2350200431591474, + "learning_rate": 9.174520434482006e-06, + "loss": 0.5514, + "step": 1828 + }, + { + "epoch": 0.21, + "grad_norm": 1.7938537056568982, + "learning_rate": 9.173495907525922e-06, + "loss": 0.5881, + "step": 1829 + }, + { + "epoch": 0.21, + "grad_norm": 2.041122078062016, + "learning_rate": 9.172470802457906e-06, + "loss": 0.5162, + "step": 1830 + }, + { + "epoch": 0.21, + "grad_norm": 1.7715885562248541, + "learning_rate": 9.17144511941996e-06, + "loss": 0.4667, + "step": 1831 + }, + { + "epoch": 0.21, + "grad_norm": 2.208043086641995, + "learning_rate": 9.170418858554156e-06, + "loss": 0.52, + "step": 1832 + }, + { + "epoch": 0.21, + "grad_norm": 1.7486752137742332, + "learning_rate": 9.169392020002655e-06, + "loss": 0.5684, + "step": 1833 + }, + { + "epoch": 0.21, + "grad_norm": 3.2117531554836023, + "learning_rate": 9.168364603907693e-06, + "loss": 0.5671, + "step": 1834 + }, + { + "epoch": 0.21, + "grad_norm": 2.1347463663878847, + "learning_rate": 9.167336610411588e-06, + "loss": 0.4207, + "step": 1835 + }, + { + "epoch": 0.21, + "grad_norm": 1.9142562532132055, + "learning_rate": 9.166308039656737e-06, + "loss": 0.5062, + "step": 1836 + }, + { + "epoch": 0.21, + "grad_norm": 2.007774560296191, + "learning_rate": 9.16527889178562e-06, + "loss": 0.5626, + "step": 1837 + }, + { + "epoch": 0.21, + "grad_norm": 1.492070546092896, + "learning_rate": 9.16424916694079e-06, + "loss": 0.5432, + "step": 1838 + }, + { + "epoch": 0.21, + "grad_norm": 1.987932903431278, + "learning_rate": 9.163218865264889e-06, + "loss": 0.4863, + "step": 1839 + }, + { + "epoch": 0.21, + "grad_norm": 1.9981386912086843, + "learning_rate": 9.162187986900631e-06, + "loss": 0.4791, + "step": 1840 + }, + { + "epoch": 0.21, + "grad_norm": 3.592624829293792, + "learning_rate": 9.161156531990814e-06, + "loss": 0.5567, + "step": 1841 + }, + { + "epoch": 0.21, + "grad_norm": 0.9262053257022613, + "learning_rate": 9.160124500678313e-06, + "loss": 0.7334, + "step": 1842 + }, + { + "epoch": 0.21, + "grad_norm": 1.745789896780652, + "learning_rate": 9.159091893106089e-06, + "loss": 0.597, + "step": 1843 + }, + { + "epoch": 0.21, + "grad_norm": 3.559812399649817, + "learning_rate": 9.158058709417176e-06, + "loss": 0.4866, + "step": 1844 + }, + { + "epoch": 0.21, + "grad_norm": 1.4744705737025416, + "learning_rate": 9.15702494975469e-06, + "loss": 0.5537, + "step": 1845 + }, + { + "epoch": 0.21, + "grad_norm": 1.7076292301830707, + "learning_rate": 9.15599061426183e-06, + "loss": 0.4611, + "step": 1846 + }, + { + "epoch": 0.21, + "grad_norm": 2.216686976791891, + "learning_rate": 9.154955703081868e-06, + "loss": 0.532, + "step": 1847 + }, + { + "epoch": 0.21, + "grad_norm": 2.194850077687543, + "learning_rate": 9.153920216358161e-06, + "loss": 0.5214, + "step": 1848 + }, + { + "epoch": 0.21, + "grad_norm": 1.849176543655242, + "learning_rate": 9.152884154234147e-06, + "loss": 0.5672, + "step": 1849 + }, + { + "epoch": 0.21, + "grad_norm": 2.571982433504018, + "learning_rate": 9.151847516853338e-06, + "loss": 0.5682, + "step": 1850 + }, + { + "epoch": 0.21, + "grad_norm": 1.4476724548427258, + "learning_rate": 9.15081030435933e-06, + "loss": 0.4616, + "step": 1851 + }, + { + "epoch": 0.21, + "grad_norm": 3.170319376968797, + "learning_rate": 9.149772516895798e-06, + "loss": 0.5425, + "step": 1852 + }, + { + "epoch": 0.21, + "grad_norm": 3.698999346373502, + "learning_rate": 9.148734154606497e-06, + "loss": 0.5585, + "step": 1853 + }, + { + "epoch": 0.21, + "grad_norm": 1.6812894428682459, + "learning_rate": 9.147695217635258e-06, + "loss": 0.5393, + "step": 1854 + }, + { + "epoch": 0.21, + "grad_norm": 1.6961757583843824, + "learning_rate": 9.146655706125995e-06, + "loss": 0.5085, + "step": 1855 + }, + { + "epoch": 0.21, + "grad_norm": 7.186448213325977, + "learning_rate": 9.145615620222705e-06, + "loss": 0.492, + "step": 1856 + }, + { + "epoch": 0.21, + "grad_norm": 1.9774798959394002, + "learning_rate": 9.144574960069454e-06, + "loss": 0.5308, + "step": 1857 + }, + { + "epoch": 0.21, + "grad_norm": 1.8391895024784863, + "learning_rate": 9.143533725810398e-06, + "loss": 0.5597, + "step": 1858 + }, + { + "epoch": 0.21, + "grad_norm": 2.4588263380412037, + "learning_rate": 9.142491917589768e-06, + "loss": 0.6383, + "step": 1859 + }, + { + "epoch": 0.21, + "grad_norm": 0.907131153713141, + "learning_rate": 9.141449535551878e-06, + "loss": 0.7582, + "step": 1860 + }, + { + "epoch": 0.21, + "grad_norm": 2.021368692690415, + "learning_rate": 9.140406579841113e-06, + "loss": 0.5481, + "step": 1861 + }, + { + "epoch": 0.21, + "grad_norm": 2.5719269848717543, + "learning_rate": 9.139363050601946e-06, + "loss": 0.467, + "step": 1862 + }, + { + "epoch": 0.21, + "grad_norm": 1.8715011163912674, + "learning_rate": 9.138318947978927e-06, + "loss": 0.5686, + "step": 1863 + }, + { + "epoch": 0.21, + "grad_norm": 38.77794395126524, + "learning_rate": 9.137274272116683e-06, + "loss": 0.5895, + "step": 1864 + }, + { + "epoch": 0.21, + "grad_norm": 1.867779733294635, + "learning_rate": 9.136229023159924e-06, + "loss": 0.4682, + "step": 1865 + }, + { + "epoch": 0.21, + "grad_norm": 0.8127864666263446, + "learning_rate": 9.135183201253436e-06, + "loss": 0.7177, + "step": 1866 + }, + { + "epoch": 0.21, + "grad_norm": 1.8425028393910223, + "learning_rate": 9.134136806542089e-06, + "loss": 0.5679, + "step": 1867 + }, + { + "epoch": 0.21, + "grad_norm": 1.7783589068517192, + "learning_rate": 9.133089839170827e-06, + "loss": 0.5049, + "step": 1868 + }, + { + "epoch": 0.21, + "grad_norm": 5.356543996534519, + "learning_rate": 9.132042299284675e-06, + "loss": 0.4768, + "step": 1869 + }, + { + "epoch": 0.21, + "grad_norm": 0.8374804126372851, + "learning_rate": 9.13099418702874e-06, + "loss": 0.7606, + "step": 1870 + }, + { + "epoch": 0.21, + "grad_norm": 1.8325439443698737, + "learning_rate": 9.129945502548207e-06, + "loss": 0.5195, + "step": 1871 + }, + { + "epoch": 0.22, + "grad_norm": 1.85856708583394, + "learning_rate": 9.128896245988338e-06, + "loss": 0.5826, + "step": 1872 + }, + { + "epoch": 0.22, + "grad_norm": 3.6343025245998666, + "learning_rate": 9.127846417494476e-06, + "loss": 0.5934, + "step": 1873 + }, + { + "epoch": 0.22, + "grad_norm": 2.1865048544002312, + "learning_rate": 9.126796017212043e-06, + "loss": 0.4989, + "step": 1874 + }, + { + "epoch": 0.22, + "grad_norm": 2.9021958568108963, + "learning_rate": 9.12574504528654e-06, + "loss": 0.6248, + "step": 1875 + }, + { + "epoch": 0.22, + "grad_norm": 1.7645176368857158, + "learning_rate": 9.124693501863548e-06, + "loss": 0.4595, + "step": 1876 + }, + { + "epoch": 0.22, + "grad_norm": 2.1204261240904514, + "learning_rate": 9.123641387088728e-06, + "loss": 0.489, + "step": 1877 + }, + { + "epoch": 0.22, + "grad_norm": 2.229406726615884, + "learning_rate": 9.122588701107816e-06, + "loss": 0.4985, + "step": 1878 + }, + { + "epoch": 0.22, + "grad_norm": 1.958105451295423, + "learning_rate": 9.121535444066631e-06, + "loss": 0.5233, + "step": 1879 + }, + { + "epoch": 0.22, + "grad_norm": 0.8092095078807724, + "learning_rate": 9.12048161611107e-06, + "loss": 0.6936, + "step": 1880 + }, + { + "epoch": 0.22, + "grad_norm": 2.1205924474160938, + "learning_rate": 9.11942721738711e-06, + "loss": 0.5167, + "step": 1881 + }, + { + "epoch": 0.22, + "grad_norm": 5.3999100818341494, + "learning_rate": 9.118372248040806e-06, + "loss": 0.5918, + "step": 1882 + }, + { + "epoch": 0.22, + "grad_norm": 1.7527760489266877, + "learning_rate": 9.11731670821829e-06, + "loss": 0.5206, + "step": 1883 + }, + { + "epoch": 0.22, + "grad_norm": 1.7445246125433562, + "learning_rate": 9.116260598065776e-06, + "loss": 0.5597, + "step": 1884 + }, + { + "epoch": 0.22, + "grad_norm": 2.2084289500002594, + "learning_rate": 9.11520391772956e-06, + "loss": 0.4984, + "step": 1885 + }, + { + "epoch": 0.22, + "grad_norm": 1.900056047123036, + "learning_rate": 9.114146667356008e-06, + "loss": 0.5778, + "step": 1886 + }, + { + "epoch": 0.22, + "grad_norm": 2.3733798801148764, + "learning_rate": 9.113088847091572e-06, + "loss": 0.5175, + "step": 1887 + }, + { + "epoch": 0.22, + "grad_norm": 2.2387827326535477, + "learning_rate": 9.112030457082782e-06, + "loss": 0.4892, + "step": 1888 + }, + { + "epoch": 0.22, + "grad_norm": 1.8164512957048708, + "learning_rate": 9.110971497476245e-06, + "loss": 0.5144, + "step": 1889 + }, + { + "epoch": 0.22, + "grad_norm": 2.066164608150523, + "learning_rate": 9.10991196841865e-06, + "loss": 0.5728, + "step": 1890 + }, + { + "epoch": 0.22, + "grad_norm": 0.8826150265260178, + "learning_rate": 9.108851870056759e-06, + "loss": 0.7528, + "step": 1891 + }, + { + "epoch": 0.22, + "grad_norm": 2.2096333128539767, + "learning_rate": 9.107791202537419e-06, + "loss": 0.5318, + "step": 1892 + }, + { + "epoch": 0.22, + "grad_norm": 0.9240021967684576, + "learning_rate": 9.106729966007552e-06, + "loss": 0.7282, + "step": 1893 + }, + { + "epoch": 0.22, + "grad_norm": 2.015164754561475, + "learning_rate": 9.105668160614163e-06, + "loss": 0.4649, + "step": 1894 + }, + { + "epoch": 0.22, + "grad_norm": 2.1511659744168843, + "learning_rate": 9.104605786504332e-06, + "loss": 0.5673, + "step": 1895 + }, + { + "epoch": 0.22, + "grad_norm": 2.358178994873392, + "learning_rate": 9.103542843825217e-06, + "loss": 0.465, + "step": 1896 + }, + { + "epoch": 0.22, + "grad_norm": 2.5521116290661428, + "learning_rate": 9.102479332724058e-06, + "loss": 0.5447, + "step": 1897 + }, + { + "epoch": 0.22, + "grad_norm": 4.10997096003024, + "learning_rate": 9.101415253348173e-06, + "loss": 0.5374, + "step": 1898 + }, + { + "epoch": 0.22, + "grad_norm": 1.9962027174419892, + "learning_rate": 9.100350605844957e-06, + "loss": 0.5406, + "step": 1899 + }, + { + "epoch": 0.22, + "grad_norm": 1.6243297471546538, + "learning_rate": 9.099285390361886e-06, + "loss": 0.5796, + "step": 1900 + }, + { + "epoch": 0.22, + "grad_norm": 2.2121660419747107, + "learning_rate": 9.098219607046511e-06, + "loss": 0.6073, + "step": 1901 + }, + { + "epoch": 0.22, + "grad_norm": 1.9955861573741578, + "learning_rate": 9.097153256046469e-06, + "loss": 0.547, + "step": 1902 + }, + { + "epoch": 0.22, + "grad_norm": 2.1364326659776434, + "learning_rate": 9.096086337509466e-06, + "loss": 0.4824, + "step": 1903 + }, + { + "epoch": 0.22, + "grad_norm": 2.192690978932753, + "learning_rate": 9.095018851583292e-06, + "loss": 0.5849, + "step": 1904 + }, + { + "epoch": 0.22, + "grad_norm": 2.7830087415320337, + "learning_rate": 9.093950798415819e-06, + "loss": 0.5749, + "step": 1905 + }, + { + "epoch": 0.22, + "grad_norm": 1.770271052780923, + "learning_rate": 9.092882178154988e-06, + "loss": 0.4663, + "step": 1906 + }, + { + "epoch": 0.22, + "grad_norm": 1.9445653866631203, + "learning_rate": 9.091812990948827e-06, + "loss": 0.4966, + "step": 1907 + }, + { + "epoch": 0.22, + "grad_norm": 1.021649522757026, + "learning_rate": 9.09074323694544e-06, + "loss": 0.7717, + "step": 1908 + }, + { + "epoch": 0.22, + "grad_norm": 1.8794644831130864, + "learning_rate": 9.089672916293006e-06, + "loss": 0.5512, + "step": 1909 + }, + { + "epoch": 0.22, + "grad_norm": 1.7493547962086655, + "learning_rate": 9.088602029139789e-06, + "loss": 0.5204, + "step": 1910 + }, + { + "epoch": 0.22, + "grad_norm": 1.9728629963329603, + "learning_rate": 9.087530575634127e-06, + "loss": 0.6523, + "step": 1911 + }, + { + "epoch": 0.22, + "grad_norm": 1.9484474067945332, + "learning_rate": 9.086458555924439e-06, + "loss": 0.6191, + "step": 1912 + }, + { + "epoch": 0.22, + "grad_norm": 1.8422586991681948, + "learning_rate": 9.085385970159218e-06, + "loss": 0.5835, + "step": 1913 + }, + { + "epoch": 0.22, + "grad_norm": 1.461546559963499, + "learning_rate": 9.084312818487042e-06, + "loss": 0.4302, + "step": 1914 + }, + { + "epoch": 0.22, + "grad_norm": 1.8370421559220482, + "learning_rate": 9.08323910105656e-06, + "loss": 0.5327, + "step": 1915 + }, + { + "epoch": 0.22, + "grad_norm": 2.3277267708761578, + "learning_rate": 9.082164818016506e-06, + "loss": 0.5237, + "step": 1916 + }, + { + "epoch": 0.22, + "grad_norm": 1.698622440526287, + "learning_rate": 9.081089969515689e-06, + "loss": 0.4815, + "step": 1917 + }, + { + "epoch": 0.22, + "grad_norm": 1.6669862760948075, + "learning_rate": 9.080014555702993e-06, + "loss": 0.5327, + "step": 1918 + }, + { + "epoch": 0.22, + "grad_norm": 1.8013039744692143, + "learning_rate": 9.078938576727393e-06, + "loss": 0.4548, + "step": 1919 + }, + { + "epoch": 0.22, + "grad_norm": 2.1190401318482635, + "learning_rate": 9.077862032737923e-06, + "loss": 0.4389, + "step": 1920 + }, + { + "epoch": 0.22, + "grad_norm": 1.8306882820321422, + "learning_rate": 9.076784923883712e-06, + "loss": 0.6679, + "step": 1921 + }, + { + "epoch": 0.22, + "grad_norm": 1.9206827597398815, + "learning_rate": 9.07570725031396e-06, + "loss": 0.581, + "step": 1922 + }, + { + "epoch": 0.22, + "grad_norm": 2.1044052355430543, + "learning_rate": 9.074629012177946e-06, + "loss": 0.5197, + "step": 1923 + }, + { + "epoch": 0.22, + "grad_norm": 1.8211858826799479, + "learning_rate": 9.073550209625026e-06, + "loss": 0.6589, + "step": 1924 + }, + { + "epoch": 0.22, + "grad_norm": 1.64151622473254, + "learning_rate": 9.072470842804636e-06, + "loss": 0.4113, + "step": 1925 + }, + { + "epoch": 0.22, + "grad_norm": 1.916598599985274, + "learning_rate": 9.071390911866291e-06, + "loss": 0.534, + "step": 1926 + }, + { + "epoch": 0.22, + "grad_norm": 2.231100355315972, + "learning_rate": 9.070310416959582e-06, + "loss": 0.4303, + "step": 1927 + }, + { + "epoch": 0.22, + "grad_norm": 1.5289379666551688, + "learning_rate": 9.06922935823418e-06, + "loss": 0.4689, + "step": 1928 + }, + { + "epoch": 0.22, + "grad_norm": 1.7143397383272994, + "learning_rate": 9.068147735839831e-06, + "loss": 0.5486, + "step": 1929 + }, + { + "epoch": 0.22, + "grad_norm": 3.33881139914712, + "learning_rate": 9.067065549926362e-06, + "loss": 0.5312, + "step": 1930 + }, + { + "epoch": 0.22, + "grad_norm": 1.8960024957907573, + "learning_rate": 9.065982800643679e-06, + "loss": 0.6016, + "step": 1931 + }, + { + "epoch": 0.22, + "grad_norm": 2.785514434261121, + "learning_rate": 9.064899488141761e-06, + "loss": 0.4924, + "step": 1932 + }, + { + "epoch": 0.22, + "grad_norm": 1.8671983616213554, + "learning_rate": 9.06381561257067e-06, + "loss": 0.5363, + "step": 1933 + }, + { + "epoch": 0.22, + "grad_norm": 1.880933228690065, + "learning_rate": 9.062731174080546e-06, + "loss": 0.551, + "step": 1934 + }, + { + "epoch": 0.22, + "grad_norm": 1.9133027538157963, + "learning_rate": 9.061646172821602e-06, + "loss": 0.5565, + "step": 1935 + }, + { + "epoch": 0.22, + "grad_norm": 4.849106292022935, + "learning_rate": 9.060560608944134e-06, + "loss": 0.536, + "step": 1936 + }, + { + "epoch": 0.22, + "grad_norm": 2.213159310538447, + "learning_rate": 9.059474482598513e-06, + "loss": 0.515, + "step": 1937 + }, + { + "epoch": 0.22, + "grad_norm": 1.9411566263430953, + "learning_rate": 9.05838779393519e-06, + "loss": 0.5548, + "step": 1938 + }, + { + "epoch": 0.22, + "grad_norm": 1.9459753390672059, + "learning_rate": 9.057300543104694e-06, + "loss": 0.5633, + "step": 1939 + }, + { + "epoch": 0.22, + "grad_norm": 2.088177135655805, + "learning_rate": 9.05621273025763e-06, + "loss": 0.5007, + "step": 1940 + }, + { + "epoch": 0.22, + "grad_norm": 1.721449262555121, + "learning_rate": 9.05512435554468e-06, + "loss": 0.5571, + "step": 1941 + }, + { + "epoch": 0.22, + "grad_norm": 2.065905032378851, + "learning_rate": 9.054035419116606e-06, + "loss": 0.4857, + "step": 1942 + }, + { + "epoch": 0.22, + "grad_norm": 1.9504759539256835, + "learning_rate": 9.052945921124248e-06, + "loss": 0.4633, + "step": 1943 + }, + { + "epoch": 0.22, + "grad_norm": 1.8560338515869066, + "learning_rate": 9.051855861718524e-06, + "loss": 0.561, + "step": 1944 + }, + { + "epoch": 0.22, + "grad_norm": 2.4671426828258154, + "learning_rate": 9.050765241050428e-06, + "loss": 0.568, + "step": 1945 + }, + { + "epoch": 0.22, + "grad_norm": 1.6397981485264328, + "learning_rate": 9.04967405927103e-06, + "loss": 0.4312, + "step": 1946 + }, + { + "epoch": 0.22, + "grad_norm": 1.9972076886572796, + "learning_rate": 9.048582316531485e-06, + "loss": 0.5349, + "step": 1947 + }, + { + "epoch": 0.22, + "grad_norm": 2.051096277529259, + "learning_rate": 9.047490012983018e-06, + "loss": 0.5323, + "step": 1948 + }, + { + "epoch": 0.22, + "grad_norm": 1.965470589495238, + "learning_rate": 9.046397148776936e-06, + "loss": 0.4902, + "step": 1949 + }, + { + "epoch": 0.22, + "grad_norm": 1.6663527598837038, + "learning_rate": 9.045303724064622e-06, + "loss": 0.6704, + "step": 1950 + }, + { + "epoch": 0.22, + "grad_norm": 2.2056114953419663, + "learning_rate": 9.044209738997536e-06, + "loss": 0.5089, + "step": 1951 + }, + { + "epoch": 0.22, + "grad_norm": 1.721142554293289, + "learning_rate": 9.043115193727217e-06, + "loss": 0.5348, + "step": 1952 + }, + { + "epoch": 0.22, + "grad_norm": 1.0201122793669208, + "learning_rate": 9.042020088405283e-06, + "loss": 0.7489, + "step": 1953 + }, + { + "epoch": 0.22, + "grad_norm": 2.006774542673198, + "learning_rate": 9.040924423183426e-06, + "loss": 0.5338, + "step": 1954 + }, + { + "epoch": 0.22, + "grad_norm": 1.792945940839733, + "learning_rate": 9.039828198213417e-06, + "loss": 0.5401, + "step": 1955 + }, + { + "epoch": 0.22, + "grad_norm": 3.1968251391207456, + "learning_rate": 9.038731413647107e-06, + "loss": 0.4626, + "step": 1956 + }, + { + "epoch": 0.22, + "grad_norm": 1.685138503893445, + "learning_rate": 9.037634069636421e-06, + "loss": 0.5999, + "step": 1957 + }, + { + "epoch": 0.22, + "grad_norm": 2.238135664801939, + "learning_rate": 9.036536166333362e-06, + "loss": 0.5119, + "step": 1958 + }, + { + "epoch": 0.23, + "grad_norm": 2.111904037343309, + "learning_rate": 9.035437703890013e-06, + "loss": 0.5815, + "step": 1959 + }, + { + "epoch": 0.23, + "grad_norm": 0.8229903814935794, + "learning_rate": 9.034338682458532e-06, + "loss": 0.6909, + "step": 1960 + }, + { + "epoch": 0.23, + "grad_norm": 2.6850429482854024, + "learning_rate": 9.033239102191156e-06, + "loss": 0.5839, + "step": 1961 + }, + { + "epoch": 0.23, + "grad_norm": 3.534874169611228, + "learning_rate": 9.032138963240196e-06, + "loss": 0.5452, + "step": 1962 + }, + { + "epoch": 0.23, + "grad_norm": 1.7692708409492117, + "learning_rate": 9.031038265758047e-06, + "loss": 0.5316, + "step": 1963 + }, + { + "epoch": 0.23, + "grad_norm": 3.1581888028680707, + "learning_rate": 9.029937009897176e-06, + "loss": 0.5427, + "step": 1964 + }, + { + "epoch": 0.23, + "grad_norm": 1.912847642581488, + "learning_rate": 9.028835195810129e-06, + "loss": 0.5466, + "step": 1965 + }, + { + "epoch": 0.23, + "grad_norm": 1.6850714608750732, + "learning_rate": 9.027732823649526e-06, + "loss": 0.6231, + "step": 1966 + }, + { + "epoch": 0.23, + "grad_norm": 2.49959562907448, + "learning_rate": 9.026629893568072e-06, + "loss": 0.6016, + "step": 1967 + }, + { + "epoch": 0.23, + "grad_norm": 2.586796918276492, + "learning_rate": 9.02552640571854e-06, + "loss": 0.5038, + "step": 1968 + }, + { + "epoch": 0.23, + "grad_norm": 1.9385622516231051, + "learning_rate": 9.02442236025379e-06, + "loss": 0.6131, + "step": 1969 + }, + { + "epoch": 0.23, + "grad_norm": 2.068435608703494, + "learning_rate": 9.023317757326753e-06, + "loss": 0.5491, + "step": 1970 + }, + { + "epoch": 0.23, + "grad_norm": 4.2004764087517055, + "learning_rate": 9.022212597090434e-06, + "loss": 0.5518, + "step": 1971 + }, + { + "epoch": 0.23, + "grad_norm": 2.5917492754666265, + "learning_rate": 9.021106879697925e-06, + "loss": 0.6244, + "step": 1972 + }, + { + "epoch": 0.23, + "grad_norm": 2.0962273704440277, + "learning_rate": 9.020000605302385e-06, + "loss": 0.5446, + "step": 1973 + }, + { + "epoch": 0.23, + "grad_norm": 1.6645339005009094, + "learning_rate": 9.018893774057061e-06, + "loss": 0.5601, + "step": 1974 + }, + { + "epoch": 0.23, + "grad_norm": 1.8231461292672424, + "learning_rate": 9.017786386115263e-06, + "loss": 0.6358, + "step": 1975 + }, + { + "epoch": 0.23, + "grad_norm": 1.7987163489856062, + "learning_rate": 9.016678441630393e-06, + "loss": 0.5367, + "step": 1976 + }, + { + "epoch": 0.23, + "grad_norm": 2.2369345857824317, + "learning_rate": 9.015569940755922e-06, + "loss": 0.5329, + "step": 1977 + }, + { + "epoch": 0.23, + "grad_norm": 2.728066836602479, + "learning_rate": 9.014460883645398e-06, + "loss": 0.6002, + "step": 1978 + }, + { + "epoch": 0.23, + "grad_norm": 1.6059944357341411, + "learning_rate": 9.013351270452446e-06, + "loss": 0.4742, + "step": 1979 + }, + { + "epoch": 0.23, + "grad_norm": 1.7823855944385212, + "learning_rate": 9.012241101330772e-06, + "loss": 0.5767, + "step": 1980 + }, + { + "epoch": 0.23, + "grad_norm": 2.0201033799993966, + "learning_rate": 9.011130376434157e-06, + "loss": 0.4586, + "step": 1981 + }, + { + "epoch": 0.23, + "grad_norm": 2.0534029345220777, + "learning_rate": 9.010019095916456e-06, + "loss": 0.5922, + "step": 1982 + }, + { + "epoch": 0.23, + "grad_norm": 2.1261607367013187, + "learning_rate": 9.008907259931603e-06, + "loss": 0.5441, + "step": 1983 + }, + { + "epoch": 0.23, + "grad_norm": 2.065925327568663, + "learning_rate": 9.007794868633613e-06, + "loss": 0.5081, + "step": 1984 + }, + { + "epoch": 0.23, + "grad_norm": 2.041424168139034, + "learning_rate": 9.00668192217657e-06, + "loss": 0.5511, + "step": 1985 + }, + { + "epoch": 0.23, + "grad_norm": 2.2997858035561634, + "learning_rate": 9.005568420714643e-06, + "loss": 0.6062, + "step": 1986 + }, + { + "epoch": 0.23, + "grad_norm": 1.8412917708484329, + "learning_rate": 9.00445436440207e-06, + "loss": 0.4952, + "step": 1987 + }, + { + "epoch": 0.23, + "grad_norm": 2.089107395469012, + "learning_rate": 9.003339753393174e-06, + "loss": 0.4571, + "step": 1988 + }, + { + "epoch": 0.23, + "grad_norm": 2.4793857270263975, + "learning_rate": 9.002224587842348e-06, + "loss": 0.5813, + "step": 1989 + }, + { + "epoch": 0.23, + "grad_norm": 1.8213706345617011, + "learning_rate": 9.001108867904066e-06, + "loss": 0.5654, + "step": 1990 + }, + { + "epoch": 0.23, + "grad_norm": 2.213176110546956, + "learning_rate": 8.999992593732876e-06, + "loss": 0.5236, + "step": 1991 + }, + { + "epoch": 0.23, + "grad_norm": 0.9147480873118935, + "learning_rate": 8.998875765483403e-06, + "loss": 0.7171, + "step": 1992 + }, + { + "epoch": 0.23, + "grad_norm": 2.553084948631206, + "learning_rate": 8.997758383310353e-06, + "loss": 0.559, + "step": 1993 + }, + { + "epoch": 0.23, + "grad_norm": 2.299159937606802, + "learning_rate": 8.996640447368505e-06, + "loss": 0.6135, + "step": 1994 + }, + { + "epoch": 0.23, + "grad_norm": 2.2903173319664947, + "learning_rate": 8.995521957812713e-06, + "loss": 0.522, + "step": 1995 + }, + { + "epoch": 0.23, + "grad_norm": 2.7712206512869257, + "learning_rate": 8.994402914797913e-06, + "loss": 0.6154, + "step": 1996 + }, + { + "epoch": 0.23, + "grad_norm": 1.647062710587506, + "learning_rate": 8.993283318479114e-06, + "loss": 0.5562, + "step": 1997 + }, + { + "epoch": 0.23, + "grad_norm": 2.4820557946984594, + "learning_rate": 8.992163169011398e-06, + "loss": 0.5354, + "step": 1998 + }, + { + "epoch": 0.23, + "grad_norm": 1.9501538495177364, + "learning_rate": 8.991042466549934e-06, + "loss": 0.4955, + "step": 1999 + }, + { + "epoch": 0.23, + "grad_norm": 2.5354235506764096, + "learning_rate": 8.989921211249959e-06, + "loss": 0.5564, + "step": 2000 + }, + { + "epoch": 0.23, + "grad_norm": 2.8812252462931816, + "learning_rate": 8.988799403266787e-06, + "loss": 0.5429, + "step": 2001 + }, + { + "epoch": 0.23, + "grad_norm": 2.0598170169677092, + "learning_rate": 8.987677042755813e-06, + "loss": 0.4832, + "step": 2002 + }, + { + "epoch": 0.23, + "grad_norm": 1.7735614800416573, + "learning_rate": 8.986554129872506e-06, + "loss": 0.6027, + "step": 2003 + }, + { + "epoch": 0.23, + "grad_norm": 1.7839874046135726, + "learning_rate": 8.985430664772412e-06, + "loss": 0.3973, + "step": 2004 + }, + { + "epoch": 0.23, + "grad_norm": 4.920618782500927, + "learning_rate": 8.984306647611152e-06, + "loss": 0.4952, + "step": 2005 + }, + { + "epoch": 0.23, + "grad_norm": 2.4874926107984585, + "learning_rate": 8.983182078544426e-06, + "loss": 0.5013, + "step": 2006 + }, + { + "epoch": 0.23, + "grad_norm": 3.870436872918295, + "learning_rate": 8.982056957728007e-06, + "loss": 0.44, + "step": 2007 + }, + { + "epoch": 0.23, + "grad_norm": 2.7663225469186314, + "learning_rate": 8.980931285317748e-06, + "loss": 0.5122, + "step": 2008 + }, + { + "epoch": 0.23, + "grad_norm": 2.787792430643811, + "learning_rate": 8.979805061469578e-06, + "loss": 0.5355, + "step": 2009 + }, + { + "epoch": 0.23, + "grad_norm": 1.7720685138763685, + "learning_rate": 8.978678286339499e-06, + "loss": 0.5106, + "step": 2010 + }, + { + "epoch": 0.23, + "grad_norm": 4.049347466256778, + "learning_rate": 8.977550960083594e-06, + "loss": 0.4953, + "step": 2011 + }, + { + "epoch": 0.23, + "grad_norm": 2.3512648083501, + "learning_rate": 8.976423082858019e-06, + "loss": 0.4878, + "step": 2012 + }, + { + "epoch": 0.23, + "grad_norm": 1.9080251530236207, + "learning_rate": 8.975294654819007e-06, + "loss": 0.5177, + "step": 2013 + }, + { + "epoch": 0.23, + "grad_norm": 1.8155878514484811, + "learning_rate": 8.974165676122868e-06, + "loss": 0.4294, + "step": 2014 + }, + { + "epoch": 0.23, + "grad_norm": 2.0606182954851757, + "learning_rate": 8.973036146925988e-06, + "loss": 0.5454, + "step": 2015 + }, + { + "epoch": 0.23, + "grad_norm": 2.162875142123971, + "learning_rate": 8.971906067384828e-06, + "loss": 0.6403, + "step": 2016 + }, + { + "epoch": 0.23, + "grad_norm": 3.361838190447896, + "learning_rate": 8.970775437655929e-06, + "loss": 0.5584, + "step": 2017 + }, + { + "epoch": 0.23, + "grad_norm": 2.650402602528094, + "learning_rate": 8.969644257895903e-06, + "loss": 0.4942, + "step": 2018 + }, + { + "epoch": 0.23, + "grad_norm": 2.790318007947418, + "learning_rate": 8.968512528261442e-06, + "loss": 0.5611, + "step": 2019 + }, + { + "epoch": 0.23, + "grad_norm": 2.4218513169375613, + "learning_rate": 8.967380248909314e-06, + "loss": 0.5619, + "step": 2020 + }, + { + "epoch": 0.23, + "grad_norm": 2.675111813391207, + "learning_rate": 8.966247419996361e-06, + "loss": 0.5237, + "step": 2021 + }, + { + "epoch": 0.23, + "grad_norm": 1.959342748663956, + "learning_rate": 8.965114041679501e-06, + "loss": 0.5651, + "step": 2022 + }, + { + "epoch": 0.23, + "grad_norm": 2.571613547302985, + "learning_rate": 8.96398011411573e-06, + "loss": 0.5118, + "step": 2023 + }, + { + "epoch": 0.23, + "grad_norm": 2.4611593014253406, + "learning_rate": 8.962845637462124e-06, + "loss": 0.5403, + "step": 2024 + }, + { + "epoch": 0.23, + "grad_norm": 1.9672891699008197, + "learning_rate": 8.961710611875825e-06, + "loss": 0.5571, + "step": 2025 + }, + { + "epoch": 0.23, + "grad_norm": 1.7357065807967837, + "learning_rate": 8.960575037514056e-06, + "loss": 0.5661, + "step": 2026 + }, + { + "epoch": 0.23, + "grad_norm": 2.746420480725577, + "learning_rate": 8.95943891453412e-06, + "loss": 0.4555, + "step": 2027 + }, + { + "epoch": 0.23, + "grad_norm": 2.3178158614336666, + "learning_rate": 8.958302243093393e-06, + "loss": 0.459, + "step": 2028 + }, + { + "epoch": 0.23, + "grad_norm": 1.886116006509517, + "learning_rate": 8.957165023349324e-06, + "loss": 0.5282, + "step": 2029 + }, + { + "epoch": 0.23, + "grad_norm": 1.9081119130058164, + "learning_rate": 8.95602725545944e-06, + "loss": 0.5952, + "step": 2030 + }, + { + "epoch": 0.23, + "grad_norm": 1.8423987633434151, + "learning_rate": 8.954888939581348e-06, + "loss": 0.4258, + "step": 2031 + }, + { + "epoch": 0.23, + "grad_norm": 2.2143503900539963, + "learning_rate": 8.953750075872724e-06, + "loss": 0.5114, + "step": 2032 + }, + { + "epoch": 0.23, + "grad_norm": 2.450856412023587, + "learning_rate": 8.952610664491323e-06, + "loss": 0.4967, + "step": 2033 + }, + { + "epoch": 0.23, + "grad_norm": 1.9805190908088535, + "learning_rate": 8.95147070559498e-06, + "loss": 0.5003, + "step": 2034 + }, + { + "epoch": 0.23, + "grad_norm": 2.677948063167133, + "learning_rate": 8.950330199341596e-06, + "loss": 0.4622, + "step": 2035 + }, + { + "epoch": 0.23, + "grad_norm": 1.9831307116449997, + "learning_rate": 8.94918914588916e-06, + "loss": 0.4746, + "step": 2036 + }, + { + "epoch": 0.23, + "grad_norm": 0.9312163915182219, + "learning_rate": 8.948047545395726e-06, + "loss": 0.7038, + "step": 2037 + }, + { + "epoch": 0.23, + "grad_norm": 1.886471612416682, + "learning_rate": 8.946905398019431e-06, + "loss": 0.5847, + "step": 2038 + }, + { + "epoch": 0.23, + "grad_norm": 2.391104786889129, + "learning_rate": 8.945762703918483e-06, + "loss": 0.4701, + "step": 2039 + }, + { + "epoch": 0.23, + "grad_norm": 2.243904195104066, + "learning_rate": 8.944619463251168e-06, + "loss": 0.6354, + "step": 2040 + }, + { + "epoch": 0.23, + "grad_norm": 2.340947548213193, + "learning_rate": 8.94347567617585e-06, + "loss": 0.6067, + "step": 2041 + }, + { + "epoch": 0.23, + "grad_norm": 1.8960986517902896, + "learning_rate": 8.942331342850963e-06, + "loss": 0.5122, + "step": 2042 + }, + { + "epoch": 0.23, + "grad_norm": 1.8489366828549778, + "learning_rate": 8.941186463435022e-06, + "loss": 0.5327, + "step": 2043 + }, + { + "epoch": 0.23, + "grad_norm": 2.9016716320714124, + "learning_rate": 8.940041038086614e-06, + "loss": 0.5367, + "step": 2044 + }, + { + "epoch": 0.23, + "grad_norm": 4.981146067552342, + "learning_rate": 8.938895066964404e-06, + "loss": 0.5019, + "step": 2045 + }, + { + "epoch": 0.24, + "grad_norm": 2.173745512259389, + "learning_rate": 8.937748550227133e-06, + "loss": 0.5757, + "step": 2046 + }, + { + "epoch": 0.24, + "grad_norm": 1.6241116538216447, + "learning_rate": 8.936601488033612e-06, + "loss": 0.5454, + "step": 2047 + }, + { + "epoch": 0.24, + "grad_norm": 2.6610816194666764, + "learning_rate": 8.935453880542737e-06, + "loss": 0.6177, + "step": 2048 + }, + { + "epoch": 0.24, + "grad_norm": 1.6670705798856436, + "learning_rate": 8.934305727913471e-06, + "loss": 0.5054, + "step": 2049 + }, + { + "epoch": 0.24, + "grad_norm": 1.9224513510586698, + "learning_rate": 8.933157030304857e-06, + "loss": 0.5067, + "step": 2050 + }, + { + "epoch": 0.24, + "grad_norm": 4.018367274285547, + "learning_rate": 8.932007787876013e-06, + "loss": 0.5187, + "step": 2051 + }, + { + "epoch": 0.24, + "grad_norm": 3.748596918233636, + "learning_rate": 8.930858000786131e-06, + "loss": 0.4713, + "step": 2052 + }, + { + "epoch": 0.24, + "grad_norm": 1.831321661548211, + "learning_rate": 8.929707669194481e-06, + "loss": 0.5209, + "step": 2053 + }, + { + "epoch": 0.24, + "grad_norm": 5.001707857952489, + "learning_rate": 8.928556793260403e-06, + "loss": 0.5032, + "step": 2054 + }, + { + "epoch": 0.24, + "grad_norm": 2.430408329174184, + "learning_rate": 8.92740537314332e-06, + "loss": 0.5754, + "step": 2055 + }, + { + "epoch": 0.24, + "grad_norm": 1.8925528705235564, + "learning_rate": 8.926253409002724e-06, + "loss": 0.5093, + "step": 2056 + }, + { + "epoch": 0.24, + "grad_norm": 4.225437857819384, + "learning_rate": 8.925100900998186e-06, + "loss": 0.4646, + "step": 2057 + }, + { + "epoch": 0.24, + "grad_norm": 3.1354516504698555, + "learning_rate": 8.923947849289351e-06, + "loss": 0.5638, + "step": 2058 + }, + { + "epoch": 0.24, + "grad_norm": 1.9132921665934566, + "learning_rate": 8.92279425403594e-06, + "loss": 0.4661, + "step": 2059 + }, + { + "epoch": 0.24, + "grad_norm": 1.6125730492900616, + "learning_rate": 8.921640115397748e-06, + "loss": 0.4956, + "step": 2060 + }, + { + "epoch": 0.24, + "grad_norm": 2.3167689133474267, + "learning_rate": 8.920485433534647e-06, + "loss": 0.5575, + "step": 2061 + }, + { + "epoch": 0.24, + "grad_norm": 2.826781907095146, + "learning_rate": 8.919330208606583e-06, + "loss": 0.6106, + "step": 2062 + }, + { + "epoch": 0.24, + "grad_norm": 2.321674037173777, + "learning_rate": 8.918174440773577e-06, + "loss": 0.4041, + "step": 2063 + }, + { + "epoch": 0.24, + "grad_norm": 2.638864169267249, + "learning_rate": 8.917018130195725e-06, + "loss": 0.4187, + "step": 2064 + }, + { + "epoch": 0.24, + "grad_norm": 11.056884140708432, + "learning_rate": 8.915861277033202e-06, + "loss": 0.5797, + "step": 2065 + }, + { + "epoch": 0.24, + "grad_norm": 2.665890828806845, + "learning_rate": 8.914703881446252e-06, + "loss": 0.5421, + "step": 2066 + }, + { + "epoch": 0.24, + "grad_norm": 2.1259883238062707, + "learning_rate": 8.913545943595198e-06, + "loss": 0.4276, + "step": 2067 + }, + { + "epoch": 0.24, + "grad_norm": 1.6910264743155885, + "learning_rate": 8.912387463640439e-06, + "loss": 0.5054, + "step": 2068 + }, + { + "epoch": 0.24, + "grad_norm": 2.941895797687128, + "learning_rate": 8.911228441742444e-06, + "loss": 0.5703, + "step": 2069 + }, + { + "epoch": 0.24, + "grad_norm": 2.5329820578195132, + "learning_rate": 8.910068878061764e-06, + "loss": 0.5277, + "step": 2070 + }, + { + "epoch": 0.24, + "grad_norm": 1.848208964911629, + "learning_rate": 8.908908772759022e-06, + "loss": 0.5345, + "step": 2071 + }, + { + "epoch": 0.24, + "grad_norm": 2.2046412090931597, + "learning_rate": 8.90774812599491e-06, + "loss": 0.413, + "step": 2072 + }, + { + "epoch": 0.24, + "grad_norm": 1.6467209154823024, + "learning_rate": 8.906586937930208e-06, + "loss": 0.4922, + "step": 2073 + }, + { + "epoch": 0.24, + "grad_norm": 1.8436067594171583, + "learning_rate": 8.905425208725758e-06, + "loss": 0.5338, + "step": 2074 + }, + { + "epoch": 0.24, + "grad_norm": 2.1419270448086674, + "learning_rate": 8.904262938542485e-06, + "loss": 0.5431, + "step": 2075 + }, + { + "epoch": 0.24, + "grad_norm": 1.856177815319082, + "learning_rate": 8.903100127541386e-06, + "loss": 0.5217, + "step": 2076 + }, + { + "epoch": 0.24, + "grad_norm": 1.9119980684578037, + "learning_rate": 8.901936775883535e-06, + "loss": 0.5393, + "step": 2077 + }, + { + "epoch": 0.24, + "grad_norm": 1.7133432676260245, + "learning_rate": 8.900772883730075e-06, + "loss": 0.5472, + "step": 2078 + }, + { + "epoch": 0.24, + "grad_norm": 2.3545169619710604, + "learning_rate": 8.899608451242233e-06, + "loss": 0.5834, + "step": 2079 + }, + { + "epoch": 0.24, + "grad_norm": 1.7687590085648386, + "learning_rate": 8.898443478581302e-06, + "loss": 0.5649, + "step": 2080 + }, + { + "epoch": 0.24, + "grad_norm": 6.268035795443232, + "learning_rate": 8.897277965908657e-06, + "loss": 0.5661, + "step": 2081 + }, + { + "epoch": 0.24, + "grad_norm": 1.7808180118052503, + "learning_rate": 8.896111913385742e-06, + "loss": 0.574, + "step": 2082 + }, + { + "epoch": 0.24, + "grad_norm": 1.6078770321909537, + "learning_rate": 8.89494532117408e-06, + "loss": 0.5393, + "step": 2083 + }, + { + "epoch": 0.24, + "grad_norm": 2.0148917361849326, + "learning_rate": 8.893778189435267e-06, + "loss": 0.4931, + "step": 2084 + }, + { + "epoch": 0.24, + "grad_norm": 4.753782231373047, + "learning_rate": 8.892610518330973e-06, + "loss": 0.5214, + "step": 2085 + }, + { + "epoch": 0.24, + "grad_norm": 1.6471398211881594, + "learning_rate": 8.891442308022946e-06, + "loss": 0.4875, + "step": 2086 + }, + { + "epoch": 0.24, + "grad_norm": 1.7043554882043275, + "learning_rate": 8.890273558673003e-06, + "loss": 0.3684, + "step": 2087 + }, + { + "epoch": 0.24, + "grad_norm": 1.835616868568219, + "learning_rate": 8.889104270443041e-06, + "loss": 0.4754, + "step": 2088 + }, + { + "epoch": 0.24, + "grad_norm": 2.056681168139256, + "learning_rate": 8.887934443495028e-06, + "loss": 0.47, + "step": 2089 + }, + { + "epoch": 0.24, + "grad_norm": 2.3950943575263306, + "learning_rate": 8.88676407799101e-06, + "loss": 0.5694, + "step": 2090 + }, + { + "epoch": 0.24, + "grad_norm": 2.1414395650603617, + "learning_rate": 8.885593174093105e-06, + "loss": 0.4689, + "step": 2091 + }, + { + "epoch": 0.24, + "grad_norm": 5.8822869988179045, + "learning_rate": 8.884421731963506e-06, + "loss": 0.4789, + "step": 2092 + }, + { + "epoch": 0.24, + "grad_norm": 2.5007876769100963, + "learning_rate": 8.883249751764482e-06, + "loss": 0.561, + "step": 2093 + }, + { + "epoch": 0.24, + "grad_norm": 3.0611037260584157, + "learning_rate": 8.882077233658377e-06, + "loss": 0.4767, + "step": 2094 + }, + { + "epoch": 0.24, + "grad_norm": 1.7386468820656826, + "learning_rate": 8.880904177807604e-06, + "loss": 0.5117, + "step": 2095 + }, + { + "epoch": 0.24, + "grad_norm": 0.9437316431529302, + "learning_rate": 8.879730584374655e-06, + "loss": 0.7624, + "step": 2096 + }, + { + "epoch": 0.24, + "grad_norm": 1.9177220870121228, + "learning_rate": 8.8785564535221e-06, + "loss": 0.5588, + "step": 2097 + }, + { + "epoch": 0.24, + "grad_norm": 1.9231574683404207, + "learning_rate": 8.877381785412575e-06, + "loss": 0.5373, + "step": 2098 + }, + { + "epoch": 0.24, + "grad_norm": 2.3243126270036267, + "learning_rate": 8.876206580208798e-06, + "loss": 0.4653, + "step": 2099 + }, + { + "epoch": 0.24, + "grad_norm": 4.650622911641895, + "learning_rate": 8.875030838073557e-06, + "loss": 0.4928, + "step": 2100 + }, + { + "epoch": 0.24, + "grad_norm": 1.6628312074767986, + "learning_rate": 8.873854559169714e-06, + "loss": 0.5065, + "step": 2101 + }, + { + "epoch": 0.24, + "grad_norm": 2.2558874968478078, + "learning_rate": 8.872677743660209e-06, + "loss": 0.434, + "step": 2102 + }, + { + "epoch": 0.24, + "grad_norm": 11.195442737439743, + "learning_rate": 8.871500391708055e-06, + "loss": 0.4853, + "step": 2103 + }, + { + "epoch": 0.24, + "grad_norm": 2.029974243697887, + "learning_rate": 8.870322503476337e-06, + "loss": 0.5586, + "step": 2104 + }, + { + "epoch": 0.24, + "grad_norm": 1.773540159716805, + "learning_rate": 8.869144079128215e-06, + "loss": 0.6171, + "step": 2105 + }, + { + "epoch": 0.24, + "grad_norm": 2.102428276750225, + "learning_rate": 8.867965118826926e-06, + "loss": 0.5663, + "step": 2106 + }, + { + "epoch": 0.24, + "grad_norm": 1.724713434218149, + "learning_rate": 8.866785622735779e-06, + "loss": 0.5822, + "step": 2107 + }, + { + "epoch": 0.24, + "grad_norm": 2.0739950932009736, + "learning_rate": 8.865605591018156e-06, + "loss": 0.6491, + "step": 2108 + }, + { + "epoch": 0.24, + "grad_norm": 3.351888506544126, + "learning_rate": 8.864425023837517e-06, + "loss": 0.5319, + "step": 2109 + }, + { + "epoch": 0.24, + "grad_norm": 2.2844933481250864, + "learning_rate": 8.863243921357394e-06, + "loss": 0.6397, + "step": 2110 + }, + { + "epoch": 0.24, + "grad_norm": 2.6797180404802847, + "learning_rate": 8.862062283741391e-06, + "loss": 0.514, + "step": 2111 + }, + { + "epoch": 0.24, + "grad_norm": 1.8059568255763552, + "learning_rate": 8.86088011115319e-06, + "loss": 0.5067, + "step": 2112 + }, + { + "epoch": 0.24, + "grad_norm": 0.9218594911112099, + "learning_rate": 8.859697403756544e-06, + "loss": 0.7539, + "step": 2113 + }, + { + "epoch": 0.24, + "grad_norm": 2.0891119818489234, + "learning_rate": 8.858514161715281e-06, + "loss": 0.5758, + "step": 2114 + }, + { + "epoch": 0.24, + "grad_norm": 3.5208706710422684, + "learning_rate": 8.857330385193308e-06, + "loss": 0.5146, + "step": 2115 + }, + { + "epoch": 0.24, + "grad_norm": 1.7286589153350835, + "learning_rate": 8.856146074354594e-06, + "loss": 0.3946, + "step": 2116 + }, + { + "epoch": 0.24, + "grad_norm": 2.2660669768271604, + "learning_rate": 8.854961229363197e-06, + "loss": 0.5251, + "step": 2117 + }, + { + "epoch": 0.24, + "grad_norm": 1.8813426087604475, + "learning_rate": 8.853775850383237e-06, + "loss": 0.5208, + "step": 2118 + }, + { + "epoch": 0.24, + "grad_norm": 1.657859256573508, + "learning_rate": 8.852589937578913e-06, + "loss": 0.4701, + "step": 2119 + }, + { + "epoch": 0.24, + "grad_norm": 1.8968142310605052, + "learning_rate": 8.8514034911145e-06, + "loss": 0.5376, + "step": 2120 + }, + { + "epoch": 0.24, + "grad_norm": 2.067689794443828, + "learning_rate": 8.850216511154342e-06, + "loss": 0.5216, + "step": 2121 + }, + { + "epoch": 0.24, + "grad_norm": 2.2943415002118406, + "learning_rate": 8.849028997862858e-06, + "loss": 0.5498, + "step": 2122 + }, + { + "epoch": 0.24, + "grad_norm": 1.8104947919952035, + "learning_rate": 8.847840951404545e-06, + "loss": 0.5608, + "step": 2123 + }, + { + "epoch": 0.24, + "grad_norm": 1.8435694332373755, + "learning_rate": 8.84665237194397e-06, + "loss": 0.495, + "step": 2124 + }, + { + "epoch": 0.24, + "grad_norm": 2.0456575263110457, + "learning_rate": 8.845463259645774e-06, + "loss": 0.5385, + "step": 2125 + }, + { + "epoch": 0.24, + "grad_norm": 2.028474822832002, + "learning_rate": 8.844273614674675e-06, + "loss": 0.5732, + "step": 2126 + }, + { + "epoch": 0.24, + "grad_norm": 2.090798974212873, + "learning_rate": 8.843083437195458e-06, + "loss": 0.6024, + "step": 2127 + }, + { + "epoch": 0.24, + "grad_norm": 1.668936380744818, + "learning_rate": 8.841892727372991e-06, + "loss": 0.4695, + "step": 2128 + }, + { + "epoch": 0.24, + "grad_norm": 2.751565514804749, + "learning_rate": 8.84070148537221e-06, + "loss": 0.5978, + "step": 2129 + }, + { + "epoch": 0.24, + "grad_norm": 2.381452027385268, + "learning_rate": 8.839509711358122e-06, + "loss": 0.4935, + "step": 2130 + }, + { + "epoch": 0.24, + "grad_norm": 1.616691960192636, + "learning_rate": 8.838317405495815e-06, + "loss": 0.4206, + "step": 2131 + }, + { + "epoch": 0.24, + "grad_norm": 1.9391272839328788, + "learning_rate": 8.837124567950446e-06, + "loss": 0.4313, + "step": 2132 + }, + { + "epoch": 0.25, + "grad_norm": 1.940467547341228, + "learning_rate": 8.835931198887247e-06, + "loss": 0.4885, + "step": 2133 + }, + { + "epoch": 0.25, + "grad_norm": 1.735120370072147, + "learning_rate": 8.83473729847152e-06, + "loss": 0.5789, + "step": 2134 + }, + { + "epoch": 0.25, + "grad_norm": 2.308185033562398, + "learning_rate": 8.833542866868649e-06, + "loss": 0.5634, + "step": 2135 + }, + { + "epoch": 0.25, + "grad_norm": 2.476049147621378, + "learning_rate": 8.832347904244082e-06, + "loss": 0.4692, + "step": 2136 + }, + { + "epoch": 0.25, + "grad_norm": 1.8363709659987673, + "learning_rate": 8.83115241076335e-06, + "loss": 0.6006, + "step": 2137 + }, + { + "epoch": 0.25, + "grad_norm": 1.7435882880990088, + "learning_rate": 8.829956386592047e-06, + "loss": 0.4676, + "step": 2138 + }, + { + "epoch": 0.25, + "grad_norm": 1.8961689323625917, + "learning_rate": 8.82875983189585e-06, + "loss": 0.6213, + "step": 2139 + }, + { + "epoch": 0.25, + "grad_norm": 2.467323656668091, + "learning_rate": 8.827562746840506e-06, + "loss": 0.5385, + "step": 2140 + }, + { + "epoch": 0.25, + "grad_norm": 1.9765200786224564, + "learning_rate": 8.82636513159183e-06, + "loss": 0.5186, + "step": 2141 + }, + { + "epoch": 0.25, + "grad_norm": 2.043292190539482, + "learning_rate": 8.825166986315721e-06, + "loss": 0.4912, + "step": 2142 + }, + { + "epoch": 0.25, + "grad_norm": 1.6994357588549216, + "learning_rate": 8.82396831117814e-06, + "loss": 0.4896, + "step": 2143 + }, + { + "epoch": 0.25, + "grad_norm": 1.5122144161829592, + "learning_rate": 8.822769106345135e-06, + "loss": 0.4832, + "step": 2144 + }, + { + "epoch": 0.25, + "grad_norm": 1.8802874125682632, + "learning_rate": 8.821569371982815e-06, + "loss": 0.4965, + "step": 2145 + }, + { + "epoch": 0.25, + "grad_norm": 1.8758906265636286, + "learning_rate": 8.820369108257366e-06, + "loss": 0.6344, + "step": 2146 + }, + { + "epoch": 0.25, + "grad_norm": 2.6135766318094578, + "learning_rate": 8.819168315335051e-06, + "loss": 0.4487, + "step": 2147 + }, + { + "epoch": 0.25, + "grad_norm": 3.465716548330362, + "learning_rate": 8.817966993382202e-06, + "loss": 0.5274, + "step": 2148 + }, + { + "epoch": 0.25, + "grad_norm": 2.2927988617937913, + "learning_rate": 8.816765142565226e-06, + "loss": 0.4902, + "step": 2149 + }, + { + "epoch": 0.25, + "grad_norm": 2.1612341561826502, + "learning_rate": 8.815562763050603e-06, + "loss": 0.5428, + "step": 2150 + }, + { + "epoch": 0.25, + "grad_norm": 2.5270600645006445, + "learning_rate": 8.814359855004889e-06, + "loss": 0.4204, + "step": 2151 + }, + { + "epoch": 0.25, + "grad_norm": 1.944573290441661, + "learning_rate": 8.813156418594706e-06, + "loss": 0.5515, + "step": 2152 + }, + { + "epoch": 0.25, + "grad_norm": 2.4098364177809137, + "learning_rate": 8.811952453986758e-06, + "loss": 0.4848, + "step": 2153 + }, + { + "epoch": 0.25, + "grad_norm": 2.0745868643885954, + "learning_rate": 8.810747961347816e-06, + "loss": 0.4777, + "step": 2154 + }, + { + "epoch": 0.25, + "grad_norm": 1.5357421598195296, + "learning_rate": 8.809542940844727e-06, + "loss": 0.5258, + "step": 2155 + }, + { + "epoch": 0.25, + "grad_norm": 1.9120088992301487, + "learning_rate": 8.808337392644408e-06, + "loss": 0.55, + "step": 2156 + }, + { + "epoch": 0.25, + "grad_norm": 1.5265032109948617, + "learning_rate": 8.807131316913856e-06, + "loss": 0.5494, + "step": 2157 + }, + { + "epoch": 0.25, + "grad_norm": 2.2353621047324186, + "learning_rate": 8.80592471382013e-06, + "loss": 0.5527, + "step": 2158 + }, + { + "epoch": 0.25, + "grad_norm": 2.5875737619756314, + "learning_rate": 8.804717583530373e-06, + "loss": 0.6356, + "step": 2159 + }, + { + "epoch": 0.25, + "grad_norm": 1.9408757215609294, + "learning_rate": 8.803509926211796e-06, + "loss": 0.4903, + "step": 2160 + }, + { + "epoch": 0.25, + "grad_norm": 2.1317363658218245, + "learning_rate": 8.802301742031682e-06, + "loss": 0.5679, + "step": 2161 + }, + { + "epoch": 0.25, + "grad_norm": 0.8986004927487693, + "learning_rate": 8.80109303115739e-06, + "loss": 0.6436, + "step": 2162 + }, + { + "epoch": 0.25, + "grad_norm": 2.7164833239885695, + "learning_rate": 8.799883793756349e-06, + "loss": 0.4657, + "step": 2163 + }, + { + "epoch": 0.25, + "grad_norm": 1.55405859465503, + "learning_rate": 8.798674029996064e-06, + "loss": 0.5128, + "step": 2164 + }, + { + "epoch": 0.25, + "grad_norm": 1.843873193538699, + "learning_rate": 8.79746374004411e-06, + "loss": 0.5871, + "step": 2165 + }, + { + "epoch": 0.25, + "grad_norm": 1.743985771775315, + "learning_rate": 8.796252924068135e-06, + "loss": 0.4725, + "step": 2166 + }, + { + "epoch": 0.25, + "grad_norm": 2.475662863261369, + "learning_rate": 8.795041582235864e-06, + "loss": 0.4312, + "step": 2167 + }, + { + "epoch": 0.25, + "grad_norm": 1.7019028946632286, + "learning_rate": 8.79382971471509e-06, + "loss": 0.5354, + "step": 2168 + }, + { + "epoch": 0.25, + "grad_norm": 0.9559217907331918, + "learning_rate": 8.792617321673682e-06, + "loss": 0.761, + "step": 2169 + }, + { + "epoch": 0.25, + "grad_norm": 1.7933554600853572, + "learning_rate": 8.791404403279577e-06, + "loss": 0.5847, + "step": 2170 + }, + { + "epoch": 0.25, + "grad_norm": 2.77837729947464, + "learning_rate": 8.790190959700793e-06, + "loss": 0.4593, + "step": 2171 + }, + { + "epoch": 0.25, + "grad_norm": 1.5767361983000359, + "learning_rate": 8.788976991105414e-06, + "loss": 0.523, + "step": 2172 + }, + { + "epoch": 0.25, + "grad_norm": 2.097111106381782, + "learning_rate": 8.787762497661598e-06, + "loss": 0.5108, + "step": 2173 + }, + { + "epoch": 0.25, + "grad_norm": 3.0104568643185368, + "learning_rate": 8.786547479537574e-06, + "loss": 0.5184, + "step": 2174 + }, + { + "epoch": 0.25, + "grad_norm": 1.7355274394341116, + "learning_rate": 8.785331936901652e-06, + "loss": 0.4882, + "step": 2175 + }, + { + "epoch": 0.25, + "grad_norm": 0.8474643448813841, + "learning_rate": 8.784115869922206e-06, + "loss": 0.7046, + "step": 2176 + }, + { + "epoch": 0.25, + "grad_norm": 2.6421077280285923, + "learning_rate": 8.782899278767685e-06, + "loss": 0.5535, + "step": 2177 + }, + { + "epoch": 0.25, + "grad_norm": 2.171385895464419, + "learning_rate": 8.78168216360661e-06, + "loss": 0.4936, + "step": 2178 + }, + { + "epoch": 0.25, + "grad_norm": 2.0497486600950787, + "learning_rate": 8.780464524607577e-06, + "loss": 0.5039, + "step": 2179 + }, + { + "epoch": 0.25, + "grad_norm": 1.8961815971980327, + "learning_rate": 8.779246361939253e-06, + "loss": 0.4925, + "step": 2180 + }, + { + "epoch": 0.25, + "grad_norm": 6.885680286055336, + "learning_rate": 8.778027675770378e-06, + "loss": 0.5283, + "step": 2181 + }, + { + "epoch": 0.25, + "grad_norm": 3.7321524773604655, + "learning_rate": 8.776808466269761e-06, + "loss": 0.6226, + "step": 2182 + }, + { + "epoch": 0.25, + "grad_norm": 2.2737846385809206, + "learning_rate": 8.775588733606293e-06, + "loss": 0.5431, + "step": 2183 + }, + { + "epoch": 0.25, + "grad_norm": 2.192855319747856, + "learning_rate": 8.774368477948926e-06, + "loss": 0.5401, + "step": 2184 + }, + { + "epoch": 0.25, + "grad_norm": 2.0143240585962805, + "learning_rate": 8.773147699466692e-06, + "loss": 0.504, + "step": 2185 + }, + { + "epoch": 0.25, + "grad_norm": 2.2946469364563207, + "learning_rate": 8.771926398328691e-06, + "loss": 0.6081, + "step": 2186 + }, + { + "epoch": 0.25, + "grad_norm": 1.6942268083033827, + "learning_rate": 8.770704574704099e-06, + "loss": 0.4941, + "step": 2187 + }, + { + "epoch": 0.25, + "grad_norm": 2.3061919630684082, + "learning_rate": 8.769482228762163e-06, + "loss": 0.4969, + "step": 2188 + }, + { + "epoch": 0.25, + "grad_norm": 1.6007160236864182, + "learning_rate": 8.7682593606722e-06, + "loss": 0.5078, + "step": 2189 + }, + { + "epoch": 0.25, + "grad_norm": 1.8529048852575227, + "learning_rate": 8.767035970603606e-06, + "loss": 0.5192, + "step": 2190 + }, + { + "epoch": 0.25, + "grad_norm": 2.5756828269871974, + "learning_rate": 8.765812058725839e-06, + "loss": 0.5312, + "step": 2191 + }, + { + "epoch": 0.25, + "grad_norm": 2.117732348710527, + "learning_rate": 8.764587625208439e-06, + "loss": 0.5425, + "step": 2192 + }, + { + "epoch": 0.25, + "grad_norm": 1.902167141575285, + "learning_rate": 8.763362670221014e-06, + "loss": 0.5293, + "step": 2193 + }, + { + "epoch": 0.25, + "grad_norm": 1.688685656513306, + "learning_rate": 8.762137193933241e-06, + "loss": 0.4827, + "step": 2194 + }, + { + "epoch": 0.25, + "grad_norm": 2.8012190614592067, + "learning_rate": 8.760911196514879e-06, + "loss": 0.5124, + "step": 2195 + }, + { + "epoch": 0.25, + "grad_norm": 2.0205830095607547, + "learning_rate": 8.759684678135746e-06, + "loss": 0.4967, + "step": 2196 + }, + { + "epoch": 0.25, + "grad_norm": 1.7050477917729272, + "learning_rate": 8.758457638965745e-06, + "loss": 0.4034, + "step": 2197 + }, + { + "epoch": 0.25, + "grad_norm": 2.3596790857427643, + "learning_rate": 8.757230079174843e-06, + "loss": 0.6195, + "step": 2198 + }, + { + "epoch": 0.25, + "grad_norm": 0.8843918112964191, + "learning_rate": 8.75600199893308e-06, + "loss": 0.7156, + "step": 2199 + }, + { + "epoch": 0.25, + "grad_norm": 1.7178972747712413, + "learning_rate": 8.754773398410572e-06, + "loss": 0.5336, + "step": 2200 + }, + { + "epoch": 0.25, + "grad_norm": 2.08424504958815, + "learning_rate": 8.753544277777501e-06, + "loss": 0.5337, + "step": 2201 + }, + { + "epoch": 0.25, + "grad_norm": 2.2678211962938457, + "learning_rate": 8.752314637204129e-06, + "loss": 0.4385, + "step": 2202 + }, + { + "epoch": 0.25, + "grad_norm": 2.0941350796752887, + "learning_rate": 8.751084476860782e-06, + "loss": 0.5387, + "step": 2203 + }, + { + "epoch": 0.25, + "grad_norm": 3.471688181518448, + "learning_rate": 8.749853796917864e-06, + "loss": 0.5868, + "step": 2204 + }, + { + "epoch": 0.25, + "grad_norm": 2.3698609119640057, + "learning_rate": 8.748622597545847e-06, + "loss": 0.4551, + "step": 2205 + }, + { + "epoch": 0.25, + "grad_norm": 2.645698414460901, + "learning_rate": 8.747390878915277e-06, + "loss": 0.5019, + "step": 2206 + }, + { + "epoch": 0.25, + "grad_norm": 1.6839910020961422, + "learning_rate": 8.746158641196771e-06, + "loss": 0.5178, + "step": 2207 + }, + { + "epoch": 0.25, + "grad_norm": 2.480306180187886, + "learning_rate": 8.74492588456102e-06, + "loss": 0.5211, + "step": 2208 + }, + { + "epoch": 0.25, + "grad_norm": 2.2970142696231868, + "learning_rate": 8.743692609178785e-06, + "loss": 0.5558, + "step": 2209 + }, + { + "epoch": 0.25, + "grad_norm": 2.99137543856927, + "learning_rate": 8.742458815220895e-06, + "loss": 0.5267, + "step": 2210 + }, + { + "epoch": 0.25, + "grad_norm": 4.592975607747482, + "learning_rate": 8.74122450285826e-06, + "loss": 0.4719, + "step": 2211 + }, + { + "epoch": 0.25, + "grad_norm": 0.8770709924374213, + "learning_rate": 8.739989672261855e-06, + "loss": 0.7568, + "step": 2212 + }, + { + "epoch": 0.25, + "grad_norm": 1.8140789143636114, + "learning_rate": 8.738754323602728e-06, + "loss": 0.4539, + "step": 2213 + }, + { + "epoch": 0.25, + "grad_norm": 2.231591365591911, + "learning_rate": 8.737518457052e-06, + "loss": 0.4743, + "step": 2214 + }, + { + "epoch": 0.25, + "grad_norm": 2.4534637528693772, + "learning_rate": 8.736282072780863e-06, + "loss": 0.5025, + "step": 2215 + }, + { + "epoch": 0.25, + "grad_norm": 2.155101418896837, + "learning_rate": 8.73504517096058e-06, + "loss": 0.6165, + "step": 2216 + }, + { + "epoch": 0.25, + "grad_norm": 2.059042497669161, + "learning_rate": 8.733807751762486e-06, + "loss": 0.6516, + "step": 2217 + }, + { + "epoch": 0.25, + "grad_norm": 2.3300278755963486, + "learning_rate": 8.73256981535799e-06, + "loss": 0.5414, + "step": 2218 + }, + { + "epoch": 0.25, + "grad_norm": 1.5562905470061332, + "learning_rate": 8.73133136191857e-06, + "loss": 0.4712, + "step": 2219 + }, + { + "epoch": 0.26, + "grad_norm": 2.1734657590601176, + "learning_rate": 8.730092391615776e-06, + "loss": 0.5508, + "step": 2220 + }, + { + "epoch": 0.26, + "grad_norm": 1.7901974916144585, + "learning_rate": 8.728852904621227e-06, + "loss": 0.4144, + "step": 2221 + }, + { + "epoch": 0.26, + "grad_norm": 2.363687218327753, + "learning_rate": 8.727612901106623e-06, + "loss": 0.541, + "step": 2222 + }, + { + "epoch": 0.26, + "grad_norm": 2.144040794938621, + "learning_rate": 8.726372381243726e-06, + "loss": 0.6474, + "step": 2223 + }, + { + "epoch": 0.26, + "grad_norm": 3.056204823364584, + "learning_rate": 8.72513134520437e-06, + "loss": 0.4832, + "step": 2224 + }, + { + "epoch": 0.26, + "grad_norm": 2.8902168082286424, + "learning_rate": 8.723889793160465e-06, + "loss": 0.4652, + "step": 2225 + }, + { + "epoch": 0.26, + "grad_norm": 1.7531965264178277, + "learning_rate": 8.722647725283993e-06, + "loss": 0.5001, + "step": 2226 + }, + { + "epoch": 0.26, + "grad_norm": 3.032572650704824, + "learning_rate": 8.721405141747001e-06, + "loss": 0.6845, + "step": 2227 + }, + { + "epoch": 0.26, + "grad_norm": 1.7632299519330001, + "learning_rate": 8.720162042721614e-06, + "loss": 0.4882, + "step": 2228 + }, + { + "epoch": 0.26, + "grad_norm": 2.148595113215755, + "learning_rate": 8.718918428380025e-06, + "loss": 0.5327, + "step": 2229 + }, + { + "epoch": 0.26, + "grad_norm": 1.6120850373403077, + "learning_rate": 8.7176742988945e-06, + "loss": 0.4537, + "step": 2230 + }, + { + "epoch": 0.26, + "grad_norm": 1.9390430684289897, + "learning_rate": 8.716429654437375e-06, + "loss": 0.5465, + "step": 2231 + }, + { + "epoch": 0.26, + "grad_norm": 1.8441414791594257, + "learning_rate": 8.715184495181057e-06, + "loss": 0.4298, + "step": 2232 + }, + { + "epoch": 0.26, + "grad_norm": 2.2890720875046706, + "learning_rate": 8.713938821298027e-06, + "loss": 0.588, + "step": 2233 + }, + { + "epoch": 0.26, + "grad_norm": 1.7129195311069367, + "learning_rate": 8.712692632960835e-06, + "loss": 0.5146, + "step": 2234 + }, + { + "epoch": 0.26, + "grad_norm": 2.000566059950348, + "learning_rate": 8.711445930342101e-06, + "loss": 0.5246, + "step": 2235 + }, + { + "epoch": 0.26, + "grad_norm": 1.796835364993895, + "learning_rate": 8.710198713614522e-06, + "loss": 0.4904, + "step": 2236 + }, + { + "epoch": 0.26, + "grad_norm": 2.4175145125566138, + "learning_rate": 8.708950982950858e-06, + "loss": 0.5606, + "step": 2237 + }, + { + "epoch": 0.26, + "grad_norm": 1.9912996063405612, + "learning_rate": 8.707702738523948e-06, + "loss": 0.4584, + "step": 2238 + }, + { + "epoch": 0.26, + "grad_norm": 1.7217716365216131, + "learning_rate": 8.706453980506695e-06, + "loss": 0.6064, + "step": 2239 + }, + { + "epoch": 0.26, + "grad_norm": 2.46553766365681, + "learning_rate": 8.70520470907208e-06, + "loss": 0.5136, + "step": 2240 + }, + { + "epoch": 0.26, + "grad_norm": 2.1466332129871484, + "learning_rate": 8.70395492439315e-06, + "loss": 0.5926, + "step": 2241 + }, + { + "epoch": 0.26, + "grad_norm": 1.940713149852987, + "learning_rate": 8.702704626643024e-06, + "loss": 0.5317, + "step": 2242 + }, + { + "epoch": 0.26, + "grad_norm": 5.428791463967903, + "learning_rate": 8.701453815994896e-06, + "loss": 0.4816, + "step": 2243 + }, + { + "epoch": 0.26, + "grad_norm": 2.2641387992645954, + "learning_rate": 8.700202492622025e-06, + "loss": 0.5314, + "step": 2244 + }, + { + "epoch": 0.26, + "grad_norm": 2.6115483631304746, + "learning_rate": 8.698950656697748e-06, + "loss": 0.6277, + "step": 2245 + }, + { + "epoch": 0.26, + "grad_norm": 2.18190515704846, + "learning_rate": 8.697698308395466e-06, + "loss": 0.6492, + "step": 2246 + }, + { + "epoch": 0.26, + "grad_norm": 2.287975324377037, + "learning_rate": 8.696445447888652e-06, + "loss": 0.5203, + "step": 2247 + }, + { + "epoch": 0.26, + "grad_norm": 1.5992541174246715, + "learning_rate": 8.695192075350857e-06, + "loss": 0.5286, + "step": 2248 + }, + { + "epoch": 0.26, + "grad_norm": 1.7529838509972064, + "learning_rate": 8.693938190955698e-06, + "loss": 0.5399, + "step": 2249 + }, + { + "epoch": 0.26, + "grad_norm": 1.8510962430389155, + "learning_rate": 8.692683794876857e-06, + "loss": 0.6027, + "step": 2250 + }, + { + "epoch": 0.26, + "grad_norm": 3.698955514609625, + "learning_rate": 8.691428887288098e-06, + "loss": 0.5755, + "step": 2251 + }, + { + "epoch": 0.26, + "grad_norm": 2.3616661531634584, + "learning_rate": 8.69017346836325e-06, + "loss": 0.4903, + "step": 2252 + }, + { + "epoch": 0.26, + "grad_norm": 1.7542187771166797, + "learning_rate": 8.68891753827621e-06, + "loss": 0.5065, + "step": 2253 + }, + { + "epoch": 0.26, + "grad_norm": 2.125090840983156, + "learning_rate": 8.687661097200952e-06, + "loss": 0.5286, + "step": 2254 + }, + { + "epoch": 0.26, + "grad_norm": 1.8286988748350745, + "learning_rate": 8.686404145311517e-06, + "loss": 0.527, + "step": 2255 + }, + { + "epoch": 0.26, + "grad_norm": 1.7091268135298812, + "learning_rate": 8.68514668278202e-06, + "loss": 0.5475, + "step": 2256 + }, + { + "epoch": 0.26, + "grad_norm": 1.7618261876615084, + "learning_rate": 8.683888709786642e-06, + "loss": 0.4964, + "step": 2257 + }, + { + "epoch": 0.26, + "grad_norm": 3.024407347856764, + "learning_rate": 8.682630226499638e-06, + "loss": 0.4604, + "step": 2258 + }, + { + "epoch": 0.26, + "grad_norm": 1.8878587632013193, + "learning_rate": 8.681371233095334e-06, + "loss": 0.4661, + "step": 2259 + }, + { + "epoch": 0.26, + "grad_norm": 1.6492027137339098, + "learning_rate": 8.680111729748122e-06, + "loss": 0.4795, + "step": 2260 + }, + { + "epoch": 0.26, + "grad_norm": 1.867430839523205, + "learning_rate": 8.678851716632473e-06, + "loss": 0.4868, + "step": 2261 + }, + { + "epoch": 0.26, + "grad_norm": 2.0513182290720677, + "learning_rate": 8.677591193922921e-06, + "loss": 0.4907, + "step": 2262 + }, + { + "epoch": 0.26, + "grad_norm": 1.8850911019899388, + "learning_rate": 8.676330161794073e-06, + "loss": 0.509, + "step": 2263 + }, + { + "epoch": 0.26, + "grad_norm": 2.329901247599794, + "learning_rate": 8.675068620420609e-06, + "loss": 0.5417, + "step": 2264 + }, + { + "epoch": 0.26, + "grad_norm": 1.9145404068485052, + "learning_rate": 8.673806569977274e-06, + "loss": 0.5234, + "step": 2265 + }, + { + "epoch": 0.26, + "grad_norm": 1.8430370504766014, + "learning_rate": 8.67254401063889e-06, + "loss": 0.6163, + "step": 2266 + }, + { + "epoch": 0.26, + "grad_norm": 5.754607210358045, + "learning_rate": 8.671280942580347e-06, + "loss": 0.5156, + "step": 2267 + }, + { + "epoch": 0.26, + "grad_norm": 1.733711129031052, + "learning_rate": 8.670017365976602e-06, + "loss": 0.6012, + "step": 2268 + }, + { + "epoch": 0.26, + "grad_norm": 1.6500512240249596, + "learning_rate": 8.66875328100269e-06, + "loss": 0.5834, + "step": 2269 + }, + { + "epoch": 0.26, + "grad_norm": 3.1601639218713133, + "learning_rate": 8.667488687833705e-06, + "loss": 0.5338, + "step": 2270 + }, + { + "epoch": 0.26, + "grad_norm": 2.4410368829293607, + "learning_rate": 8.666223586644824e-06, + "loss": 0.5383, + "step": 2271 + }, + { + "epoch": 0.26, + "grad_norm": 3.181020542323042, + "learning_rate": 8.664957977611289e-06, + "loss": 0.5309, + "step": 2272 + }, + { + "epoch": 0.26, + "grad_norm": 1.764369803452699, + "learning_rate": 8.663691860908406e-06, + "loss": 0.5299, + "step": 2273 + }, + { + "epoch": 0.26, + "grad_norm": 1.7744264623422263, + "learning_rate": 8.662425236711562e-06, + "loss": 0.5963, + "step": 2274 + }, + { + "epoch": 0.26, + "grad_norm": 1.8626387677397316, + "learning_rate": 8.66115810519621e-06, + "loss": 0.5903, + "step": 2275 + }, + { + "epoch": 0.26, + "grad_norm": 1.8575449438674587, + "learning_rate": 8.65989046653787e-06, + "loss": 0.5327, + "step": 2276 + }, + { + "epoch": 0.26, + "grad_norm": 4.096722196049806, + "learning_rate": 8.658622320912138e-06, + "loss": 0.5095, + "step": 2277 + }, + { + "epoch": 0.26, + "grad_norm": 2.005229864248042, + "learning_rate": 8.657353668494674e-06, + "loss": 0.4656, + "step": 2278 + }, + { + "epoch": 0.26, + "grad_norm": 2.0949252956119375, + "learning_rate": 8.656084509461215e-06, + "loss": 0.5408, + "step": 2279 + }, + { + "epoch": 0.26, + "grad_norm": 1.5034306042431147, + "learning_rate": 8.654814843987563e-06, + "loss": 0.5324, + "step": 2280 + }, + { + "epoch": 0.26, + "grad_norm": 2.0897484214647433, + "learning_rate": 8.653544672249589e-06, + "loss": 0.5591, + "step": 2281 + }, + { + "epoch": 0.26, + "grad_norm": 2.080335148701125, + "learning_rate": 8.652273994423244e-06, + "loss": 0.569, + "step": 2282 + }, + { + "epoch": 0.26, + "grad_norm": 2.008595984347143, + "learning_rate": 8.651002810684535e-06, + "loss": 0.471, + "step": 2283 + }, + { + "epoch": 0.26, + "grad_norm": 3.031010690628849, + "learning_rate": 8.64973112120955e-06, + "loss": 0.5708, + "step": 2284 + }, + { + "epoch": 0.26, + "grad_norm": 0.9948332206862486, + "learning_rate": 8.648458926174441e-06, + "loss": 0.7799, + "step": 2285 + }, + { + "epoch": 0.26, + "grad_norm": 2.0396332652716818, + "learning_rate": 8.647186225755435e-06, + "loss": 0.4751, + "step": 2286 + }, + { + "epoch": 0.26, + "grad_norm": 1.7548931373175751, + "learning_rate": 8.645913020128825e-06, + "loss": 0.5011, + "step": 2287 + }, + { + "epoch": 0.26, + "grad_norm": 2.0038049262412314, + "learning_rate": 8.644639309470975e-06, + "loss": 0.6107, + "step": 2288 + }, + { + "epoch": 0.26, + "grad_norm": 1.8680987325093685, + "learning_rate": 8.643365093958317e-06, + "loss": 0.5834, + "step": 2289 + }, + { + "epoch": 0.26, + "grad_norm": 1.910833530384027, + "learning_rate": 8.64209037376736e-06, + "loss": 0.4698, + "step": 2290 + }, + { + "epoch": 0.26, + "grad_norm": 2.0785730280830625, + "learning_rate": 8.640815149074673e-06, + "loss": 0.5217, + "step": 2291 + }, + { + "epoch": 0.26, + "grad_norm": 2.0848790976934275, + "learning_rate": 8.639539420056902e-06, + "loss": 0.5671, + "step": 2292 + }, + { + "epoch": 0.26, + "grad_norm": 1.8483965979659747, + "learning_rate": 8.638263186890763e-06, + "loss": 0.5116, + "step": 2293 + }, + { + "epoch": 0.26, + "grad_norm": 2.3884745834370387, + "learning_rate": 8.636986449753035e-06, + "loss": 0.5732, + "step": 2294 + }, + { + "epoch": 0.26, + "grad_norm": 1.9756541069138531, + "learning_rate": 8.635709208820576e-06, + "loss": 0.6132, + "step": 2295 + }, + { + "epoch": 0.26, + "grad_norm": 1.93832137629286, + "learning_rate": 8.634431464270308e-06, + "loss": 0.5225, + "step": 2296 + }, + { + "epoch": 0.26, + "grad_norm": 0.8992525614542254, + "learning_rate": 8.63315321627922e-06, + "loss": 0.7111, + "step": 2297 + }, + { + "epoch": 0.26, + "grad_norm": 2.105046768195289, + "learning_rate": 8.63187446502438e-06, + "loss": 0.4815, + "step": 2298 + }, + { + "epoch": 0.26, + "grad_norm": 2.070912678065797, + "learning_rate": 8.630595210682918e-06, + "loss": 0.5886, + "step": 2299 + }, + { + "epoch": 0.26, + "grad_norm": 1.7278541391421343, + "learning_rate": 8.629315453432034e-06, + "loss": 0.5294, + "step": 2300 + }, + { + "epoch": 0.26, + "grad_norm": 2.56117071641269, + "learning_rate": 8.628035193449005e-06, + "loss": 0.4999, + "step": 2301 + }, + { + "epoch": 0.26, + "grad_norm": 1.8518025863754612, + "learning_rate": 8.626754430911169e-06, + "loss": 0.6341, + "step": 2302 + }, + { + "epoch": 0.26, + "grad_norm": 1.7061760875116814, + "learning_rate": 8.625473165995935e-06, + "loss": 0.5476, + "step": 2303 + }, + { + "epoch": 0.26, + "grad_norm": 2.4226513940645313, + "learning_rate": 8.624191398880788e-06, + "loss": 0.5364, + "step": 2304 + }, + { + "epoch": 0.26, + "grad_norm": 1.8058284646508111, + "learning_rate": 8.622909129743275e-06, + "loss": 0.4586, + "step": 2305 + }, + { + "epoch": 0.26, + "grad_norm": 4.328935292027457, + "learning_rate": 8.621626358761018e-06, + "loss": 0.5346, + "step": 2306 + }, + { + "epoch": 0.27, + "grad_norm": 2.4249907731636644, + "learning_rate": 8.620343086111704e-06, + "loss": 0.5571, + "step": 2307 + }, + { + "epoch": 0.27, + "grad_norm": 2.843600469132354, + "learning_rate": 8.619059311973095e-06, + "loss": 0.5594, + "step": 2308 + }, + { + "epoch": 0.27, + "grad_norm": 1.7841753620414624, + "learning_rate": 8.617775036523014e-06, + "loss": 0.5243, + "step": 2309 + }, + { + "epoch": 0.27, + "grad_norm": 1.9806771663835443, + "learning_rate": 8.616490259939364e-06, + "loss": 0.6093, + "step": 2310 + }, + { + "epoch": 0.27, + "grad_norm": 2.10706723258942, + "learning_rate": 8.615204982400108e-06, + "loss": 0.5568, + "step": 2311 + }, + { + "epoch": 0.27, + "grad_norm": 2.107515498865774, + "learning_rate": 8.613919204083286e-06, + "loss": 0.4663, + "step": 2312 + }, + { + "epoch": 0.27, + "grad_norm": 1.8084497120770857, + "learning_rate": 8.612632925166999e-06, + "loss": 0.5211, + "step": 2313 + }, + { + "epoch": 0.27, + "grad_norm": 1.9652682538730981, + "learning_rate": 8.611346145829427e-06, + "loss": 0.5909, + "step": 2314 + }, + { + "epoch": 0.27, + "grad_norm": 1.7769822561053341, + "learning_rate": 8.61005886624881e-06, + "loss": 0.529, + "step": 2315 + }, + { + "epoch": 0.27, + "grad_norm": 1.7625781251490886, + "learning_rate": 8.608771086603466e-06, + "loss": 0.5589, + "step": 2316 + }, + { + "epoch": 0.27, + "grad_norm": 1.7232410926821211, + "learning_rate": 8.607482807071777e-06, + "loss": 0.4574, + "step": 2317 + }, + { + "epoch": 0.27, + "grad_norm": 2.0769521026562483, + "learning_rate": 8.606194027832192e-06, + "loss": 0.5277, + "step": 2318 + }, + { + "epoch": 0.27, + "grad_norm": 2.1481645897457073, + "learning_rate": 8.604904749063237e-06, + "loss": 0.553, + "step": 2319 + }, + { + "epoch": 0.27, + "grad_norm": 1.6829855750905547, + "learning_rate": 8.6036149709435e-06, + "loss": 0.499, + "step": 2320 + }, + { + "epoch": 0.27, + "grad_norm": 1.9474046302458858, + "learning_rate": 8.60232469365164e-06, + "loss": 0.527, + "step": 2321 + }, + { + "epoch": 0.27, + "grad_norm": 2.279835887908195, + "learning_rate": 8.601033917366389e-06, + "loss": 0.5544, + "step": 2322 + }, + { + "epoch": 0.27, + "grad_norm": 1.9463099680217628, + "learning_rate": 8.599742642266544e-06, + "loss": 0.58, + "step": 2323 + }, + { + "epoch": 0.27, + "grad_norm": 0.9580128016449191, + "learning_rate": 8.59845086853097e-06, + "loss": 0.7037, + "step": 2324 + }, + { + "epoch": 0.27, + "grad_norm": 3.24531561197382, + "learning_rate": 8.597158596338605e-06, + "loss": 0.5977, + "step": 2325 + }, + { + "epoch": 0.27, + "grad_norm": 1.8514204764596625, + "learning_rate": 8.595865825868455e-06, + "loss": 0.5622, + "step": 2326 + }, + { + "epoch": 0.27, + "grad_norm": 2.2475751940197632, + "learning_rate": 8.594572557299594e-06, + "loss": 0.5065, + "step": 2327 + }, + { + "epoch": 0.27, + "grad_norm": 0.9158471428342122, + "learning_rate": 8.593278790811164e-06, + "loss": 0.7299, + "step": 2328 + }, + { + "epoch": 0.27, + "grad_norm": 1.813242227312915, + "learning_rate": 8.591984526582378e-06, + "loss": 0.4803, + "step": 2329 + }, + { + "epoch": 0.27, + "grad_norm": 2.645786698014883, + "learning_rate": 8.59068976479252e-06, + "loss": 0.4947, + "step": 2330 + }, + { + "epoch": 0.27, + "grad_norm": 1.93319902528064, + "learning_rate": 8.589394505620935e-06, + "loss": 0.5693, + "step": 2331 + }, + { + "epoch": 0.27, + "grad_norm": 1.9880754684872175, + "learning_rate": 8.588098749247045e-06, + "loss": 0.5421, + "step": 2332 + }, + { + "epoch": 0.27, + "grad_norm": 2.533670935320121, + "learning_rate": 8.586802495850339e-06, + "loss": 0.5184, + "step": 2333 + }, + { + "epoch": 0.27, + "grad_norm": 1.9809616055970773, + "learning_rate": 8.585505745610372e-06, + "loss": 0.575, + "step": 2334 + }, + { + "epoch": 0.27, + "grad_norm": 1.8981138984920811, + "learning_rate": 8.58420849870677e-06, + "loss": 0.4851, + "step": 2335 + }, + { + "epoch": 0.27, + "grad_norm": 2.086166785264737, + "learning_rate": 8.582910755319228e-06, + "loss": 0.6033, + "step": 2336 + }, + { + "epoch": 0.27, + "grad_norm": 1.8260800458352073, + "learning_rate": 8.581612515627509e-06, + "loss": 0.4789, + "step": 2337 + }, + { + "epoch": 0.27, + "grad_norm": 1.659097195306069, + "learning_rate": 8.580313779811444e-06, + "loss": 0.5152, + "step": 2338 + }, + { + "epoch": 0.27, + "grad_norm": 2.18358648363183, + "learning_rate": 8.579014548050934e-06, + "loss": 0.5565, + "step": 2339 + }, + { + "epoch": 0.27, + "grad_norm": 1.5496818491857096, + "learning_rate": 8.57771482052595e-06, + "loss": 0.5464, + "step": 2340 + }, + { + "epoch": 0.27, + "grad_norm": 1.8022724038625273, + "learning_rate": 8.576414597416527e-06, + "loss": 0.5251, + "step": 2341 + }, + { + "epoch": 0.27, + "grad_norm": 1.799153134386961, + "learning_rate": 8.575113878902776e-06, + "loss": 0.5009, + "step": 2342 + }, + { + "epoch": 0.27, + "grad_norm": 2.4829811034135174, + "learning_rate": 8.573812665164867e-06, + "loss": 0.5081, + "step": 2343 + }, + { + "epoch": 0.27, + "grad_norm": 2.4509325387654677, + "learning_rate": 8.572510956383048e-06, + "loss": 0.4958, + "step": 2344 + }, + { + "epoch": 0.27, + "grad_norm": 2.097664324527126, + "learning_rate": 8.57120875273763e-06, + "loss": 0.617, + "step": 2345 + }, + { + "epoch": 0.27, + "grad_norm": 2.348344261473166, + "learning_rate": 8.569906054408994e-06, + "loss": 0.6082, + "step": 2346 + }, + { + "epoch": 0.27, + "grad_norm": 5.672057391635827, + "learning_rate": 8.568602861577589e-06, + "loss": 0.5956, + "step": 2347 + }, + { + "epoch": 0.27, + "grad_norm": 2.0069915450531437, + "learning_rate": 8.567299174423936e-06, + "loss": 0.5148, + "step": 2348 + }, + { + "epoch": 0.27, + "grad_norm": 2.9114641404024724, + "learning_rate": 8.565994993128617e-06, + "loss": 0.4413, + "step": 2349 + }, + { + "epoch": 0.27, + "grad_norm": 2.0249486878362815, + "learning_rate": 8.564690317872289e-06, + "loss": 0.4527, + "step": 2350 + }, + { + "epoch": 0.27, + "grad_norm": 2.330168692292167, + "learning_rate": 8.563385148835677e-06, + "loss": 0.4136, + "step": 2351 + }, + { + "epoch": 0.27, + "grad_norm": 2.4738136289782906, + "learning_rate": 8.562079486199571e-06, + "loss": 0.6065, + "step": 2352 + }, + { + "epoch": 0.27, + "grad_norm": 2.3352898443116574, + "learning_rate": 8.56077333014483e-06, + "loss": 0.5319, + "step": 2353 + }, + { + "epoch": 0.27, + "grad_norm": 2.003509521835223, + "learning_rate": 8.559466680852386e-06, + "loss": 0.4713, + "step": 2354 + }, + { + "epoch": 0.27, + "grad_norm": 1.6677954078827002, + "learning_rate": 8.558159538503234e-06, + "loss": 0.4585, + "step": 2355 + }, + { + "epoch": 0.27, + "grad_norm": 1.8270435954514523, + "learning_rate": 8.556851903278437e-06, + "loss": 0.5158, + "step": 2356 + }, + { + "epoch": 0.27, + "grad_norm": 1.5977877718483098, + "learning_rate": 8.555543775359132e-06, + "loss": 0.4162, + "step": 2357 + }, + { + "epoch": 0.27, + "grad_norm": 1.9839522944195285, + "learning_rate": 8.55423515492652e-06, + "loss": 0.5185, + "step": 2358 + }, + { + "epoch": 0.27, + "grad_norm": 0.9658390005673964, + "learning_rate": 8.552926042161868e-06, + "loss": 0.7499, + "step": 2359 + }, + { + "epoch": 0.27, + "grad_norm": 10.87423004591279, + "learning_rate": 8.551616437246515e-06, + "loss": 0.6412, + "step": 2360 + }, + { + "epoch": 0.27, + "grad_norm": 1.8031405105241216, + "learning_rate": 8.550306340361872e-06, + "loss": 0.5286, + "step": 2361 + }, + { + "epoch": 0.27, + "grad_norm": 1.651957743700889, + "learning_rate": 8.548995751689406e-06, + "loss": 0.4529, + "step": 2362 + }, + { + "epoch": 0.27, + "grad_norm": 1.5661871773717315, + "learning_rate": 8.547684671410665e-06, + "loss": 0.4987, + "step": 2363 + }, + { + "epoch": 0.27, + "grad_norm": 1.9232183513384353, + "learning_rate": 8.54637309970726e-06, + "loss": 0.5896, + "step": 2364 + }, + { + "epoch": 0.27, + "grad_norm": 0.9020394888930264, + "learning_rate": 8.545061036760863e-06, + "loss": 0.7447, + "step": 2365 + }, + { + "epoch": 0.27, + "grad_norm": 1.9091640547450839, + "learning_rate": 8.543748482753229e-06, + "loss": 0.5526, + "step": 2366 + }, + { + "epoch": 0.27, + "grad_norm": 0.8455647781615331, + "learning_rate": 8.542435437866166e-06, + "loss": 0.72, + "step": 2367 + }, + { + "epoch": 0.27, + "grad_norm": 1.6507528582420827, + "learning_rate": 8.541121902281562e-06, + "loss": 0.4673, + "step": 2368 + }, + { + "epoch": 0.27, + "grad_norm": 2.679356007242605, + "learning_rate": 8.539807876181363e-06, + "loss": 0.5614, + "step": 2369 + }, + { + "epoch": 0.27, + "grad_norm": 1.9176614700352168, + "learning_rate": 8.538493359747592e-06, + "loss": 0.4389, + "step": 2370 + }, + { + "epoch": 0.27, + "grad_norm": 3.2720074443151357, + "learning_rate": 8.537178353162334e-06, + "loss": 0.4873, + "step": 2371 + }, + { + "epoch": 0.27, + "grad_norm": 2.888301903142672, + "learning_rate": 8.535862856607742e-06, + "loss": 0.5817, + "step": 2372 + }, + { + "epoch": 0.27, + "grad_norm": 1.7757660656049783, + "learning_rate": 8.534546870266041e-06, + "loss": 0.5896, + "step": 2373 + }, + { + "epoch": 0.27, + "grad_norm": 1.8228889122298935, + "learning_rate": 8.533230394319518e-06, + "loss": 0.5099, + "step": 2374 + }, + { + "epoch": 0.27, + "grad_norm": 0.90402632513311, + "learning_rate": 8.531913428950533e-06, + "loss": 0.7348, + "step": 2375 + }, + { + "epoch": 0.27, + "grad_norm": 1.9428790711454256, + "learning_rate": 8.530595974341512e-06, + "loss": 0.4974, + "step": 2376 + }, + { + "epoch": 0.27, + "grad_norm": 0.8614049628728114, + "learning_rate": 8.529278030674947e-06, + "loss": 0.7531, + "step": 2377 + }, + { + "epoch": 0.27, + "grad_norm": 1.5968356956824492, + "learning_rate": 8.527959598133403e-06, + "loss": 0.4661, + "step": 2378 + }, + { + "epoch": 0.27, + "grad_norm": 2.125853941818109, + "learning_rate": 8.526640676899505e-06, + "loss": 0.5615, + "step": 2379 + }, + { + "epoch": 0.27, + "grad_norm": 2.0342213784685574, + "learning_rate": 8.525321267155952e-06, + "loss": 0.5489, + "step": 2380 + }, + { + "epoch": 0.27, + "grad_norm": 1.795539591551683, + "learning_rate": 8.524001369085506e-06, + "loss": 0.6115, + "step": 2381 + }, + { + "epoch": 0.27, + "grad_norm": 0.8519074764310129, + "learning_rate": 8.522680982871002e-06, + "loss": 0.686, + "step": 2382 + }, + { + "epoch": 0.27, + "grad_norm": 2.188530242911944, + "learning_rate": 8.521360108695339e-06, + "loss": 0.5787, + "step": 2383 + }, + { + "epoch": 0.27, + "grad_norm": 2.872561488825852, + "learning_rate": 8.520038746741482e-06, + "loss": 0.4544, + "step": 2384 + }, + { + "epoch": 0.27, + "grad_norm": 2.013866660018719, + "learning_rate": 8.518716897192469e-06, + "loss": 0.536, + "step": 2385 + }, + { + "epoch": 0.27, + "grad_norm": 1.8494793195374575, + "learning_rate": 8.5173945602314e-06, + "loss": 0.5939, + "step": 2386 + }, + { + "epoch": 0.27, + "grad_norm": 1.9832176665558487, + "learning_rate": 8.516071736041447e-06, + "loss": 0.4244, + "step": 2387 + }, + { + "epoch": 0.27, + "grad_norm": 8.066641057995938, + "learning_rate": 8.514748424805844e-06, + "loss": 0.5044, + "step": 2388 + }, + { + "epoch": 0.27, + "grad_norm": 2.5084331331194893, + "learning_rate": 8.5134246267079e-06, + "loss": 0.5055, + "step": 2389 + }, + { + "epoch": 0.27, + "grad_norm": 3.0971906905476803, + "learning_rate": 8.512100341930985e-06, + "loss": 0.5392, + "step": 2390 + }, + { + "epoch": 0.27, + "grad_norm": 3.2142955934709714, + "learning_rate": 8.510775570658538e-06, + "loss": 0.5431, + "step": 2391 + }, + { + "epoch": 0.27, + "grad_norm": 1.6255438205354569, + "learning_rate": 8.509450313074065e-06, + "loss": 0.4706, + "step": 2392 + }, + { + "epoch": 0.27, + "grad_norm": 3.1440566210055514, + "learning_rate": 8.508124569361147e-06, + "loss": 0.5368, + "step": 2393 + }, + { + "epoch": 0.28, + "grad_norm": 5.580442036293708, + "learning_rate": 8.50679833970342e-06, + "loss": 0.5688, + "step": 2394 + }, + { + "epoch": 0.28, + "grad_norm": 1.7681383725357012, + "learning_rate": 8.505471624284593e-06, + "loss": 0.4565, + "step": 2395 + }, + { + "epoch": 0.28, + "grad_norm": 1.8951924092729338, + "learning_rate": 8.504144423288443e-06, + "loss": 0.5529, + "step": 2396 + }, + { + "epoch": 0.28, + "grad_norm": 2.2937428508779907, + "learning_rate": 8.502816736898816e-06, + "loss": 0.5319, + "step": 2397 + }, + { + "epoch": 0.28, + "grad_norm": 1.7388437012805387, + "learning_rate": 8.50148856529962e-06, + "loss": 0.5622, + "step": 2398 + }, + { + "epoch": 0.28, + "grad_norm": 2.419220188531338, + "learning_rate": 8.500159908674836e-06, + "loss": 0.6096, + "step": 2399 + }, + { + "epoch": 0.28, + "grad_norm": 1.8797098042071638, + "learning_rate": 8.498830767208507e-06, + "loss": 0.4853, + "step": 2400 + }, + { + "epoch": 0.28, + "grad_norm": 2.035074734741205, + "learning_rate": 8.497501141084746e-06, + "loss": 0.5668, + "step": 2401 + }, + { + "epoch": 0.28, + "grad_norm": 2.6767940478062733, + "learning_rate": 8.496171030487734e-06, + "loss": 0.5177, + "step": 2402 + }, + { + "epoch": 0.28, + "grad_norm": 1.7984699733963958, + "learning_rate": 8.494840435601714e-06, + "loss": 0.5678, + "step": 2403 + }, + { + "epoch": 0.28, + "grad_norm": 5.344009074710361, + "learning_rate": 8.493509356611005e-06, + "loss": 0.6263, + "step": 2404 + }, + { + "epoch": 0.28, + "grad_norm": 1.8735917421469612, + "learning_rate": 8.492177793699982e-06, + "loss": 0.5414, + "step": 2405 + }, + { + "epoch": 0.28, + "grad_norm": 1.8241710857775186, + "learning_rate": 8.490845747053098e-06, + "loss": 0.5194, + "step": 2406 + }, + { + "epoch": 0.28, + "grad_norm": 2.2042176118048005, + "learning_rate": 8.489513216854866e-06, + "loss": 0.4905, + "step": 2407 + }, + { + "epoch": 0.28, + "grad_norm": 2.1300988496740922, + "learning_rate": 8.488180203289867e-06, + "loss": 0.5169, + "step": 2408 + }, + { + "epoch": 0.28, + "grad_norm": 4.660090732493645, + "learning_rate": 8.48684670654275e-06, + "loss": 0.5449, + "step": 2409 + }, + { + "epoch": 0.28, + "grad_norm": 2.085723350946313, + "learning_rate": 8.485512726798231e-06, + "loss": 0.4833, + "step": 2410 + }, + { + "epoch": 0.28, + "grad_norm": 1.765503452891893, + "learning_rate": 8.484178264241093e-06, + "loss": 0.4435, + "step": 2411 + }, + { + "epoch": 0.28, + "grad_norm": 1.636353243404151, + "learning_rate": 8.482843319056187e-06, + "loss": 0.4904, + "step": 2412 + }, + { + "epoch": 0.28, + "grad_norm": 2.0386061911027396, + "learning_rate": 8.481507891428425e-06, + "loss": 0.5441, + "step": 2413 + }, + { + "epoch": 0.28, + "grad_norm": 2.1725796535929502, + "learning_rate": 8.480171981542794e-06, + "loss": 0.5793, + "step": 2414 + }, + { + "epoch": 0.28, + "grad_norm": 4.836450667300905, + "learning_rate": 8.478835589584344e-06, + "loss": 0.4223, + "step": 2415 + }, + { + "epoch": 0.28, + "grad_norm": 2.1505424564248234, + "learning_rate": 8.477498715738188e-06, + "loss": 0.4926, + "step": 2416 + }, + { + "epoch": 0.28, + "grad_norm": 1.840454782914558, + "learning_rate": 8.476161360189514e-06, + "loss": 0.4809, + "step": 2417 + }, + { + "epoch": 0.28, + "grad_norm": 1.8818612913039194, + "learning_rate": 8.47482352312357e-06, + "loss": 0.5504, + "step": 2418 + }, + { + "epoch": 0.28, + "grad_norm": 3.4565971716407367, + "learning_rate": 8.473485204725675e-06, + "loss": 0.4423, + "step": 2419 + }, + { + "epoch": 0.28, + "grad_norm": 2.0969523865163753, + "learning_rate": 8.47214640518121e-06, + "loss": 0.5154, + "step": 2420 + }, + { + "epoch": 0.28, + "grad_norm": 2.0695240527446206, + "learning_rate": 8.470807124675626e-06, + "loss": 0.6035, + "step": 2421 + }, + { + "epoch": 0.28, + "grad_norm": 2.0045141237032547, + "learning_rate": 8.46946736339444e-06, + "loss": 0.4661, + "step": 2422 + }, + { + "epoch": 0.28, + "grad_norm": 2.099087722014555, + "learning_rate": 8.468127121523236e-06, + "loss": 0.5171, + "step": 2423 + }, + { + "epoch": 0.28, + "grad_norm": 1.7878960983531456, + "learning_rate": 8.466786399247663e-06, + "loss": 0.4581, + "step": 2424 + }, + { + "epoch": 0.28, + "grad_norm": 1.971256078618696, + "learning_rate": 8.465445196753441e-06, + "loss": 0.5106, + "step": 2425 + }, + { + "epoch": 0.28, + "grad_norm": 2.4516316684893837, + "learning_rate": 8.464103514226349e-06, + "loss": 0.5064, + "step": 2426 + }, + { + "epoch": 0.28, + "grad_norm": 2.319775154737638, + "learning_rate": 8.462761351852238e-06, + "loss": 0.5651, + "step": 2427 + }, + { + "epoch": 0.28, + "grad_norm": 1.8302544208145304, + "learning_rate": 8.461418709817026e-06, + "loss": 0.4807, + "step": 2428 + }, + { + "epoch": 0.28, + "grad_norm": 0.944570195235223, + "learning_rate": 8.460075588306692e-06, + "loss": 0.7355, + "step": 2429 + }, + { + "epoch": 0.28, + "grad_norm": 1.557827092040908, + "learning_rate": 8.458731987507287e-06, + "loss": 0.4978, + "step": 2430 + }, + { + "epoch": 0.28, + "grad_norm": 1.9066442989457917, + "learning_rate": 8.457387907604926e-06, + "loss": 0.5129, + "step": 2431 + }, + { + "epoch": 0.28, + "grad_norm": 1.824117342532886, + "learning_rate": 8.456043348785792e-06, + "loss": 0.4858, + "step": 2432 + }, + { + "epoch": 0.28, + "grad_norm": 2.1823806355264614, + "learning_rate": 8.45469831123613e-06, + "loss": 0.6024, + "step": 2433 + }, + { + "epoch": 0.28, + "grad_norm": 2.718317119718294, + "learning_rate": 8.453352795142259e-06, + "loss": 0.458, + "step": 2434 + }, + { + "epoch": 0.28, + "grad_norm": 2.2935947015750284, + "learning_rate": 8.452006800690554e-06, + "loss": 0.4594, + "step": 2435 + }, + { + "epoch": 0.28, + "grad_norm": 1.981192322218991, + "learning_rate": 8.450660328067467e-06, + "loss": 0.5005, + "step": 2436 + }, + { + "epoch": 0.28, + "grad_norm": 1.9653612682724262, + "learning_rate": 8.449313377459509e-06, + "loss": 0.5971, + "step": 2437 + }, + { + "epoch": 0.28, + "grad_norm": 1.7324664975658497, + "learning_rate": 8.447965949053258e-06, + "loss": 0.534, + "step": 2438 + }, + { + "epoch": 0.28, + "grad_norm": 3.1587540128561247, + "learning_rate": 8.446618043035361e-06, + "loss": 0.552, + "step": 2439 + }, + { + "epoch": 0.28, + "grad_norm": 2.447199903022612, + "learning_rate": 8.44526965959253e-06, + "loss": 0.4602, + "step": 2440 + }, + { + "epoch": 0.28, + "grad_norm": 2.132934600271061, + "learning_rate": 8.443920798911544e-06, + "loss": 0.5594, + "step": 2441 + }, + { + "epoch": 0.28, + "grad_norm": 1.850505870285045, + "learning_rate": 8.442571461179243e-06, + "loss": 0.5521, + "step": 2442 + }, + { + "epoch": 0.28, + "grad_norm": 2.8880157930258608, + "learning_rate": 8.441221646582542e-06, + "loss": 0.6758, + "step": 2443 + }, + { + "epoch": 0.28, + "grad_norm": 1.6200432361001056, + "learning_rate": 8.439871355308413e-06, + "loss": 0.4329, + "step": 2444 + }, + { + "epoch": 0.28, + "grad_norm": 2.3736382223822274, + "learning_rate": 8.438520587543901e-06, + "loss": 0.5428, + "step": 2445 + }, + { + "epoch": 0.28, + "grad_norm": 2.231367064261845, + "learning_rate": 8.43716934347611e-06, + "loss": 0.5207, + "step": 2446 + }, + { + "epoch": 0.28, + "grad_norm": 2.614487878671032, + "learning_rate": 8.43581762329222e-06, + "loss": 0.4072, + "step": 2447 + }, + { + "epoch": 0.28, + "grad_norm": 2.011835224106064, + "learning_rate": 8.434465427179465e-06, + "loss": 0.5087, + "step": 2448 + }, + { + "epoch": 0.28, + "grad_norm": 2.56122732967349, + "learning_rate": 8.433112755325156e-06, + "loss": 0.4995, + "step": 2449 + }, + { + "epoch": 0.28, + "grad_norm": 2.149617913932202, + "learning_rate": 8.431759607916663e-06, + "loss": 0.5051, + "step": 2450 + }, + { + "epoch": 0.28, + "grad_norm": 3.684465741301117, + "learning_rate": 8.430405985141422e-06, + "loss": 0.5627, + "step": 2451 + }, + { + "epoch": 0.28, + "grad_norm": 2.2400415390707393, + "learning_rate": 8.429051887186938e-06, + "loss": 0.5304, + "step": 2452 + }, + { + "epoch": 0.28, + "grad_norm": 1.7080036651392563, + "learning_rate": 8.427697314240783e-06, + "loss": 0.5206, + "step": 2453 + }, + { + "epoch": 0.28, + "grad_norm": 2.019754708991868, + "learning_rate": 8.426342266490588e-06, + "loss": 0.4974, + "step": 2454 + }, + { + "epoch": 0.28, + "grad_norm": 1.7825288062337001, + "learning_rate": 8.424986744124055e-06, + "loss": 0.4479, + "step": 2455 + }, + { + "epoch": 0.28, + "grad_norm": 2.553590391333902, + "learning_rate": 8.423630747328952e-06, + "loss": 0.5018, + "step": 2456 + }, + { + "epoch": 0.28, + "grad_norm": 1.694477646166225, + "learning_rate": 8.422274276293112e-06, + "loss": 0.5876, + "step": 2457 + }, + { + "epoch": 0.28, + "grad_norm": 1.9353920663937663, + "learning_rate": 8.42091733120443e-06, + "loss": 0.5347, + "step": 2458 + }, + { + "epoch": 0.28, + "grad_norm": 1.9125974724980463, + "learning_rate": 8.419559912250873e-06, + "loss": 0.6372, + "step": 2459 + }, + { + "epoch": 0.28, + "grad_norm": 1.6363066527834271, + "learning_rate": 8.41820201962047e-06, + "loss": 0.517, + "step": 2460 + }, + { + "epoch": 0.28, + "grad_norm": 2.770651132381932, + "learning_rate": 8.416843653501314e-06, + "loss": 0.3946, + "step": 2461 + }, + { + "epoch": 0.28, + "grad_norm": 1.7447269718246932, + "learning_rate": 8.415484814081567e-06, + "loss": 0.5645, + "step": 2462 + }, + { + "epoch": 0.28, + "grad_norm": 3.2579212353928275, + "learning_rate": 8.414125501549456e-06, + "loss": 0.5223, + "step": 2463 + }, + { + "epoch": 0.28, + "grad_norm": 2.101878526298239, + "learning_rate": 8.412765716093273e-06, + "loss": 0.5117, + "step": 2464 + }, + { + "epoch": 0.28, + "grad_norm": 1.7959466524879582, + "learning_rate": 8.41140545790137e-06, + "loss": 0.479, + "step": 2465 + }, + { + "epoch": 0.28, + "grad_norm": 1.8632316215309777, + "learning_rate": 8.410044727162177e-06, + "loss": 0.5766, + "step": 2466 + }, + { + "epoch": 0.28, + "grad_norm": 2.30572320937174, + "learning_rate": 8.408683524064178e-06, + "loss": 0.4351, + "step": 2467 + }, + { + "epoch": 0.28, + "grad_norm": 1.9783787516107614, + "learning_rate": 8.407321848795928e-06, + "loss": 0.3994, + "step": 2468 + }, + { + "epoch": 0.28, + "grad_norm": 1.9813384395591465, + "learning_rate": 8.405959701546046e-06, + "loss": 0.5501, + "step": 2469 + }, + { + "epoch": 0.28, + "grad_norm": 2.0587082069470823, + "learning_rate": 8.404597082503216e-06, + "loss": 0.4959, + "step": 2470 + }, + { + "epoch": 0.28, + "grad_norm": 2.0284767220773756, + "learning_rate": 8.403233991856187e-06, + "loss": 0.4635, + "step": 2471 + }, + { + "epoch": 0.28, + "grad_norm": 2.3076544070795073, + "learning_rate": 8.401870429793775e-06, + "loss": 0.5459, + "step": 2472 + }, + { + "epoch": 0.28, + "grad_norm": 2.136617051911346, + "learning_rate": 8.400506396504862e-06, + "loss": 0.4896, + "step": 2473 + }, + { + "epoch": 0.28, + "grad_norm": 1.793823749821937, + "learning_rate": 8.39914189217839e-06, + "loss": 0.4948, + "step": 2474 + }, + { + "epoch": 0.28, + "grad_norm": 1.9877011529799353, + "learning_rate": 8.397776917003373e-06, + "loss": 0.4339, + "step": 2475 + }, + { + "epoch": 0.28, + "grad_norm": 2.1414104109797414, + "learning_rate": 8.396411471168885e-06, + "loss": 0.4235, + "step": 2476 + }, + { + "epoch": 0.28, + "grad_norm": 2.0776739748904367, + "learning_rate": 8.39504555486407e-06, + "loss": 0.4944, + "step": 2477 + }, + { + "epoch": 0.28, + "grad_norm": 2.602588870494956, + "learning_rate": 8.39367916827813e-06, + "loss": 0.4304, + "step": 2478 + }, + { + "epoch": 0.28, + "grad_norm": 1.9596982500983087, + "learning_rate": 8.392312311600342e-06, + "loss": 0.5162, + "step": 2479 + }, + { + "epoch": 0.28, + "grad_norm": 1.9459732395447518, + "learning_rate": 8.39094498502004e-06, + "loss": 0.5254, + "step": 2480 + }, + { + "epoch": 0.29, + "grad_norm": 2.113600715524274, + "learning_rate": 8.389577188726624e-06, + "loss": 0.5244, + "step": 2481 + }, + { + "epoch": 0.29, + "grad_norm": 1.8300524743861903, + "learning_rate": 8.388208922909565e-06, + "loss": 0.6096, + "step": 2482 + }, + { + "epoch": 0.29, + "grad_norm": 1.6704663676474898, + "learning_rate": 8.386840187758392e-06, + "loss": 0.4765, + "step": 2483 + }, + { + "epoch": 0.29, + "grad_norm": 2.283560580900437, + "learning_rate": 8.385470983462702e-06, + "loss": 0.4914, + "step": 2484 + }, + { + "epoch": 0.29, + "grad_norm": 3.000735906709495, + "learning_rate": 8.384101310212159e-06, + "loss": 0.6591, + "step": 2485 + }, + { + "epoch": 0.29, + "grad_norm": 2.186069554083827, + "learning_rate": 8.382731168196488e-06, + "loss": 0.5405, + "step": 2486 + }, + { + "epoch": 0.29, + "grad_norm": 1.7359400705871633, + "learning_rate": 8.381360557605482e-06, + "loss": 0.4225, + "step": 2487 + }, + { + "epoch": 0.29, + "grad_norm": 1.7413768391461313, + "learning_rate": 8.379989478628995e-06, + "loss": 0.4555, + "step": 2488 + }, + { + "epoch": 0.29, + "grad_norm": 1.4667667335876529, + "learning_rate": 8.378617931456954e-06, + "loss": 0.7928, + "step": 2489 + }, + { + "epoch": 0.29, + "grad_norm": 2.735496535214493, + "learning_rate": 8.37724591627934e-06, + "loss": 0.5059, + "step": 2490 + }, + { + "epoch": 0.29, + "grad_norm": 2.839994480530658, + "learning_rate": 8.375873433286208e-06, + "loss": 0.52, + "step": 2491 + }, + { + "epoch": 0.29, + "grad_norm": 1.678638473734695, + "learning_rate": 8.374500482667672e-06, + "loss": 0.5166, + "step": 2492 + }, + { + "epoch": 0.29, + "grad_norm": 2.2162092490751757, + "learning_rate": 8.373127064613915e-06, + "loss": 0.5076, + "step": 2493 + }, + { + "epoch": 0.29, + "grad_norm": 2.3200971074787304, + "learning_rate": 8.371753179315179e-06, + "loss": 0.5531, + "step": 2494 + }, + { + "epoch": 0.29, + "grad_norm": 1.7216342085849654, + "learning_rate": 8.370378826961778e-06, + "loss": 0.4487, + "step": 2495 + }, + { + "epoch": 0.29, + "grad_norm": 2.1423237589216275, + "learning_rate": 8.369004007744087e-06, + "loss": 0.5298, + "step": 2496 + }, + { + "epoch": 0.29, + "grad_norm": 5.645257198526256, + "learning_rate": 8.367628721852543e-06, + "loss": 0.5699, + "step": 2497 + }, + { + "epoch": 0.29, + "grad_norm": 1.8134696604772362, + "learning_rate": 8.36625296947765e-06, + "loss": 0.477, + "step": 2498 + }, + { + "epoch": 0.29, + "grad_norm": 2.5414641186239586, + "learning_rate": 8.36487675080998e-06, + "loss": 0.4389, + "step": 2499 + }, + { + "epoch": 0.29, + "grad_norm": 1.5252367505118072, + "learning_rate": 8.363500066040166e-06, + "loss": 0.4013, + "step": 2500 + }, + { + "epoch": 0.29, + "grad_norm": 2.495368216145309, + "learning_rate": 8.362122915358905e-06, + "loss": 0.528, + "step": 2501 + }, + { + "epoch": 0.29, + "grad_norm": 2.3571232321605056, + "learning_rate": 8.360745298956961e-06, + "loss": 0.5404, + "step": 2502 + }, + { + "epoch": 0.29, + "grad_norm": 3.1125992136789797, + "learning_rate": 8.35936721702516e-06, + "loss": 0.6349, + "step": 2503 + }, + { + "epoch": 0.29, + "grad_norm": 1.8142371765059333, + "learning_rate": 8.357988669754394e-06, + "loss": 0.5174, + "step": 2504 + }, + { + "epoch": 0.29, + "grad_norm": 1.8118932932164502, + "learning_rate": 8.356609657335618e-06, + "loss": 0.5023, + "step": 2505 + }, + { + "epoch": 0.29, + "grad_norm": 2.3076764111032952, + "learning_rate": 8.355230179959854e-06, + "loss": 0.605, + "step": 2506 + }, + { + "epoch": 0.29, + "grad_norm": 1.9235744931715504, + "learning_rate": 8.353850237818186e-06, + "loss": 0.5272, + "step": 2507 + }, + { + "epoch": 0.29, + "grad_norm": 2.078398450030257, + "learning_rate": 8.352469831101766e-06, + "loss": 0.4528, + "step": 2508 + }, + { + "epoch": 0.29, + "grad_norm": 1.9531604578798716, + "learning_rate": 8.351088960001803e-06, + "loss": 0.5168, + "step": 2509 + }, + { + "epoch": 0.29, + "grad_norm": 2.1028441277671313, + "learning_rate": 8.34970762470958e-06, + "loss": 0.5571, + "step": 2510 + }, + { + "epoch": 0.29, + "grad_norm": 2.35250411204121, + "learning_rate": 8.348325825416437e-06, + "loss": 0.4797, + "step": 2511 + }, + { + "epoch": 0.29, + "grad_norm": 2.195447050556698, + "learning_rate": 8.346943562313778e-06, + "loss": 0.6018, + "step": 2512 + }, + { + "epoch": 0.29, + "grad_norm": 2.4852129332469812, + "learning_rate": 8.34556083559308e-06, + "loss": 0.4733, + "step": 2513 + }, + { + "epoch": 0.29, + "grad_norm": 2.060677580368984, + "learning_rate": 8.344177645445873e-06, + "loss": 0.6135, + "step": 2514 + }, + { + "epoch": 0.29, + "grad_norm": 1.7837756150638142, + "learning_rate": 8.342793992063756e-06, + "loss": 0.3539, + "step": 2515 + }, + { + "epoch": 0.29, + "grad_norm": 2.995582646914153, + "learning_rate": 8.341409875638396e-06, + "loss": 0.5847, + "step": 2516 + }, + { + "epoch": 0.29, + "grad_norm": 2.9757836101816952, + "learning_rate": 8.340025296361519e-06, + "loss": 0.4839, + "step": 2517 + }, + { + "epoch": 0.29, + "grad_norm": 2.081990463370359, + "learning_rate": 8.338640254424914e-06, + "loss": 0.5003, + "step": 2518 + }, + { + "epoch": 0.29, + "grad_norm": 1.996171181958468, + "learning_rate": 8.337254750020442e-06, + "loss": 0.5643, + "step": 2519 + }, + { + "epoch": 0.29, + "grad_norm": 1.8140496662868268, + "learning_rate": 8.33586878334002e-06, + "loss": 0.5516, + "step": 2520 + }, + { + "epoch": 0.29, + "grad_norm": 2.3990925075918694, + "learning_rate": 8.334482354575632e-06, + "loss": 0.7757, + "step": 2521 + }, + { + "epoch": 0.29, + "grad_norm": 1.9449977506334513, + "learning_rate": 8.333095463919325e-06, + "loss": 0.5291, + "step": 2522 + }, + { + "epoch": 0.29, + "grad_norm": 1.9491452355773233, + "learning_rate": 8.33170811156321e-06, + "loss": 0.5566, + "step": 2523 + }, + { + "epoch": 0.29, + "grad_norm": 2.5212534621021248, + "learning_rate": 8.330320297699467e-06, + "loss": 0.5415, + "step": 2524 + }, + { + "epoch": 0.29, + "grad_norm": 8.554886217100242, + "learning_rate": 8.328932022520333e-06, + "loss": 0.5234, + "step": 2525 + }, + { + "epoch": 0.29, + "grad_norm": 2.3468339085643706, + "learning_rate": 8.32754328621811e-06, + "loss": 0.5143, + "step": 2526 + }, + { + "epoch": 0.29, + "grad_norm": 2.4750427602865726, + "learning_rate": 8.326154088985167e-06, + "loss": 0.5359, + "step": 2527 + }, + { + "epoch": 0.29, + "grad_norm": 2.930860651733599, + "learning_rate": 8.324764431013939e-06, + "loss": 0.4767, + "step": 2528 + }, + { + "epoch": 0.29, + "grad_norm": 2.1631323063997323, + "learning_rate": 8.323374312496915e-06, + "loss": 0.5287, + "step": 2529 + }, + { + "epoch": 0.29, + "grad_norm": 1.7063873078612262, + "learning_rate": 8.321983733626658e-06, + "loss": 0.5289, + "step": 2530 + }, + { + "epoch": 0.29, + "grad_norm": 2.5864732121922116, + "learning_rate": 8.32059269459579e-06, + "loss": 0.579, + "step": 2531 + }, + { + "epoch": 0.29, + "grad_norm": 3.9602400789474994, + "learning_rate": 8.319201195596997e-06, + "loss": 0.6436, + "step": 2532 + }, + { + "epoch": 0.29, + "grad_norm": 1.8289823560157406, + "learning_rate": 8.317809236823029e-06, + "loss": 0.5489, + "step": 2533 + }, + { + "epoch": 0.29, + "grad_norm": 2.6231852476729802, + "learning_rate": 8.3164168184667e-06, + "loss": 0.5161, + "step": 2534 + }, + { + "epoch": 0.29, + "grad_norm": 2.5366764576126193, + "learning_rate": 8.315023940720887e-06, + "loss": 0.5531, + "step": 2535 + }, + { + "epoch": 0.29, + "grad_norm": 1.9488627060720387, + "learning_rate": 8.313630603778534e-06, + "loss": 0.4573, + "step": 2536 + }, + { + "epoch": 0.29, + "grad_norm": 1.924078240217396, + "learning_rate": 8.31223680783264e-06, + "loss": 0.4541, + "step": 2537 + }, + { + "epoch": 0.29, + "grad_norm": 2.225838060266021, + "learning_rate": 8.310842553076282e-06, + "loss": 0.4971, + "step": 2538 + }, + { + "epoch": 0.29, + "grad_norm": 3.4014459222026394, + "learning_rate": 8.309447839702583e-06, + "loss": 0.5184, + "step": 2539 + }, + { + "epoch": 0.29, + "grad_norm": 3.3570305539406524, + "learning_rate": 8.308052667904743e-06, + "loss": 0.5054, + "step": 2540 + }, + { + "epoch": 0.29, + "grad_norm": 2.057641848920971, + "learning_rate": 8.306657037876022e-06, + "loss": 0.5212, + "step": 2541 + }, + { + "epoch": 0.29, + "grad_norm": 2.4410708190005943, + "learning_rate": 8.30526094980974e-06, + "loss": 0.5786, + "step": 2542 + }, + { + "epoch": 0.29, + "grad_norm": 1.6974692521333536, + "learning_rate": 8.303864403899284e-06, + "loss": 0.588, + "step": 2543 + }, + { + "epoch": 0.29, + "grad_norm": 1.702940776148582, + "learning_rate": 8.302467400338103e-06, + "loss": 0.4504, + "step": 2544 + }, + { + "epoch": 0.29, + "grad_norm": 1.9055720062028387, + "learning_rate": 8.301069939319709e-06, + "loss": 0.5871, + "step": 2545 + }, + { + "epoch": 0.29, + "grad_norm": 2.4313222718474585, + "learning_rate": 8.29967202103768e-06, + "loss": 0.4795, + "step": 2546 + }, + { + "epoch": 0.29, + "grad_norm": 3.1847211600284964, + "learning_rate": 8.298273645685654e-06, + "loss": 0.5752, + "step": 2547 + }, + { + "epoch": 0.29, + "grad_norm": 2.5620276305431986, + "learning_rate": 8.296874813457333e-06, + "loss": 0.5981, + "step": 2548 + }, + { + "epoch": 0.29, + "grad_norm": 2.0790635751117557, + "learning_rate": 8.295475524546483e-06, + "loss": 0.492, + "step": 2549 + }, + { + "epoch": 0.29, + "grad_norm": 1.6700540372456516, + "learning_rate": 8.294075779146937e-06, + "loss": 0.4114, + "step": 2550 + }, + { + "epoch": 0.29, + "grad_norm": 2.1810573750147086, + "learning_rate": 8.292675577452582e-06, + "loss": 0.5181, + "step": 2551 + }, + { + "epoch": 0.29, + "grad_norm": 1.834411970565417, + "learning_rate": 8.291274919657378e-06, + "loss": 0.5127, + "step": 2552 + }, + { + "epoch": 0.29, + "grad_norm": 3.4206051480736432, + "learning_rate": 8.289873805955342e-06, + "loss": 0.4816, + "step": 2553 + }, + { + "epoch": 0.29, + "grad_norm": 1.8007577976221842, + "learning_rate": 8.288472236540556e-06, + "loss": 0.5099, + "step": 2554 + }, + { + "epoch": 0.29, + "grad_norm": 3.17998306811133, + "learning_rate": 8.287070211607164e-06, + "loss": 0.5325, + "step": 2555 + }, + { + "epoch": 0.29, + "grad_norm": 2.8541425438226002, + "learning_rate": 8.285667731349377e-06, + "loss": 0.5163, + "step": 2556 + }, + { + "epoch": 0.29, + "grad_norm": 1.9837987984166925, + "learning_rate": 8.284264795961464e-06, + "loss": 0.4859, + "step": 2557 + }, + { + "epoch": 0.29, + "grad_norm": 2.1085327767185698, + "learning_rate": 8.282861405637763e-06, + "loss": 0.5985, + "step": 2558 + }, + { + "epoch": 0.29, + "grad_norm": 2.197077982294476, + "learning_rate": 8.281457560572665e-06, + "loss": 0.4856, + "step": 2559 + }, + { + "epoch": 0.29, + "grad_norm": 2.0160435763015268, + "learning_rate": 8.280053260960636e-06, + "loss": 0.5432, + "step": 2560 + }, + { + "epoch": 0.29, + "grad_norm": 1.424517128906401, + "learning_rate": 8.278648506996197e-06, + "loss": 0.7695, + "step": 2561 + }, + { + "epoch": 0.29, + "grad_norm": 2.48023049048259, + "learning_rate": 8.277243298873936e-06, + "loss": 0.4928, + "step": 2562 + }, + { + "epoch": 0.29, + "grad_norm": 2.2092091953518698, + "learning_rate": 8.2758376367885e-06, + "loss": 0.4321, + "step": 2563 + }, + { + "epoch": 0.29, + "grad_norm": 2.112813309972629, + "learning_rate": 8.274431520934602e-06, + "loss": 0.4979, + "step": 2564 + }, + { + "epoch": 0.29, + "grad_norm": 1.9282668287096787, + "learning_rate": 8.273024951507017e-06, + "loss": 0.5302, + "step": 2565 + }, + { + "epoch": 0.29, + "grad_norm": 0.8446823223621954, + "learning_rate": 8.271617928700581e-06, + "loss": 0.7312, + "step": 2566 + }, + { + "epoch": 0.29, + "grad_norm": 2.617690319350906, + "learning_rate": 8.270210452710198e-06, + "loss": 0.5737, + "step": 2567 + }, + { + "epoch": 0.3, + "grad_norm": 2.0552496777118328, + "learning_rate": 8.268802523730827e-06, + "loss": 0.5568, + "step": 2568 + }, + { + "epoch": 0.3, + "grad_norm": 2.0232775617679954, + "learning_rate": 8.2673941419575e-06, + "loss": 0.5011, + "step": 2569 + }, + { + "epoch": 0.3, + "grad_norm": 2.024146167493914, + "learning_rate": 8.265985307585301e-06, + "loss": 0.5274, + "step": 2570 + }, + { + "epoch": 0.3, + "grad_norm": 2.3118394486841143, + "learning_rate": 8.264576020809383e-06, + "loss": 0.5356, + "step": 2571 + }, + { + "epoch": 0.3, + "grad_norm": 2.610072451524174, + "learning_rate": 8.26316628182496e-06, + "loss": 0.5092, + "step": 2572 + }, + { + "epoch": 0.3, + "grad_norm": 3.256709912825738, + "learning_rate": 8.261756090827308e-06, + "loss": 0.5468, + "step": 2573 + }, + { + "epoch": 0.3, + "grad_norm": 1.861063601434612, + "learning_rate": 8.260345448011768e-06, + "loss": 0.5414, + "step": 2574 + }, + { + "epoch": 0.3, + "grad_norm": 2.7829656141030537, + "learning_rate": 8.258934353573742e-06, + "loss": 0.5507, + "step": 2575 + }, + { + "epoch": 0.3, + "grad_norm": 2.220009496041894, + "learning_rate": 8.257522807708693e-06, + "loss": 0.5474, + "step": 2576 + }, + { + "epoch": 0.3, + "grad_norm": 1.653471236538373, + "learning_rate": 8.256110810612148e-06, + "loss": 0.5424, + "step": 2577 + }, + { + "epoch": 0.3, + "grad_norm": 1.9681653631174034, + "learning_rate": 8.254698362479698e-06, + "loss": 0.4672, + "step": 2578 + }, + { + "epoch": 0.3, + "grad_norm": 2.0595366649340368, + "learning_rate": 8.253285463506995e-06, + "loss": 0.5706, + "step": 2579 + }, + { + "epoch": 0.3, + "grad_norm": 1.793100338503336, + "learning_rate": 8.251872113889754e-06, + "loss": 0.4998, + "step": 2580 + }, + { + "epoch": 0.3, + "grad_norm": 1.87952322526159, + "learning_rate": 8.250458313823749e-06, + "loss": 0.4832, + "step": 2581 + }, + { + "epoch": 0.3, + "grad_norm": 2.1921515783496046, + "learning_rate": 8.249044063504824e-06, + "loss": 0.4682, + "step": 2582 + }, + { + "epoch": 0.3, + "grad_norm": 2.4465863214620356, + "learning_rate": 8.247629363128876e-06, + "loss": 0.4665, + "step": 2583 + }, + { + "epoch": 0.3, + "grad_norm": 2.074286932433618, + "learning_rate": 8.24621421289187e-06, + "loss": 0.4985, + "step": 2584 + }, + { + "epoch": 0.3, + "grad_norm": 1.9520032705047459, + "learning_rate": 8.244798612989837e-06, + "loss": 0.5981, + "step": 2585 + }, + { + "epoch": 0.3, + "grad_norm": 2.357868501764867, + "learning_rate": 8.24338256361886e-06, + "loss": 0.4091, + "step": 2586 + }, + { + "epoch": 0.3, + "grad_norm": 3.0763479993867717, + "learning_rate": 8.241966064975091e-06, + "loss": 0.5052, + "step": 2587 + }, + { + "epoch": 0.3, + "grad_norm": 2.118292695663577, + "learning_rate": 8.240549117254746e-06, + "loss": 0.4825, + "step": 2588 + }, + { + "epoch": 0.3, + "grad_norm": 2.1767757588181538, + "learning_rate": 8.239131720654099e-06, + "loss": 0.4345, + "step": 2589 + }, + { + "epoch": 0.3, + "grad_norm": 3.023881024929553, + "learning_rate": 8.237713875369485e-06, + "loss": 0.4986, + "step": 2590 + }, + { + "epoch": 0.3, + "grad_norm": 2.4511741843294326, + "learning_rate": 8.236295581597307e-06, + "loss": 0.5393, + "step": 2591 + }, + { + "epoch": 0.3, + "grad_norm": 2.8330080456413795, + "learning_rate": 8.234876839534025e-06, + "loss": 0.4641, + "step": 2592 + }, + { + "epoch": 0.3, + "grad_norm": 5.141542332447975, + "learning_rate": 8.233457649376165e-06, + "loss": 0.5672, + "step": 2593 + }, + { + "epoch": 0.3, + "grad_norm": 1.9362241133091074, + "learning_rate": 8.23203801132031e-06, + "loss": 0.5554, + "step": 2594 + }, + { + "epoch": 0.3, + "grad_norm": 2.2754758547507206, + "learning_rate": 8.230617925563108e-06, + "loss": 0.4555, + "step": 2595 + }, + { + "epoch": 0.3, + "grad_norm": 2.900594829201475, + "learning_rate": 8.229197392301274e-06, + "loss": 0.5462, + "step": 2596 + }, + { + "epoch": 0.3, + "grad_norm": 2.2577907696076145, + "learning_rate": 8.227776411731574e-06, + "loss": 0.479, + "step": 2597 + }, + { + "epoch": 0.3, + "grad_norm": 2.4724473080863394, + "learning_rate": 8.226354984050846e-06, + "loss": 0.5822, + "step": 2598 + }, + { + "epoch": 0.3, + "grad_norm": 3.572981476225515, + "learning_rate": 8.224933109455984e-06, + "loss": 0.5546, + "step": 2599 + }, + { + "epoch": 0.3, + "grad_norm": 1.9099495729709979, + "learning_rate": 8.223510788143946e-06, + "loss": 0.5689, + "step": 2600 + }, + { + "epoch": 0.3, + "grad_norm": 2.4522404105463105, + "learning_rate": 8.222088020311753e-06, + "loss": 0.5573, + "step": 2601 + }, + { + "epoch": 0.3, + "grad_norm": 1.9608137780272412, + "learning_rate": 8.220664806156485e-06, + "loss": 0.4721, + "step": 2602 + }, + { + "epoch": 0.3, + "grad_norm": 2.748922548536042, + "learning_rate": 8.219241145875284e-06, + "loss": 0.4902, + "step": 2603 + }, + { + "epoch": 0.3, + "grad_norm": 2.515391983218839, + "learning_rate": 8.21781703966536e-06, + "loss": 0.5165, + "step": 2604 + }, + { + "epoch": 0.3, + "grad_norm": 1.8026762625518005, + "learning_rate": 8.216392487723974e-06, + "loss": 0.5162, + "step": 2605 + }, + { + "epoch": 0.3, + "grad_norm": 2.525923698956431, + "learning_rate": 8.21496749024846e-06, + "loss": 0.4184, + "step": 2606 + }, + { + "epoch": 0.3, + "grad_norm": 2.690732644247078, + "learning_rate": 8.213542047436207e-06, + "loss": 0.5216, + "step": 2607 + }, + { + "epoch": 0.3, + "grad_norm": 4.932992151033144, + "learning_rate": 8.212116159484663e-06, + "loss": 0.4179, + "step": 2608 + }, + { + "epoch": 0.3, + "grad_norm": 1.8949851989688322, + "learning_rate": 8.210689826591348e-06, + "loss": 0.5248, + "step": 2609 + }, + { + "epoch": 0.3, + "grad_norm": 1.9084182291196878, + "learning_rate": 8.20926304895383e-06, + "loss": 0.4672, + "step": 2610 + }, + { + "epoch": 0.3, + "grad_norm": 2.0229182635627345, + "learning_rate": 8.207835826769754e-06, + "loss": 0.5381, + "step": 2611 + }, + { + "epoch": 0.3, + "grad_norm": 2.368575564456848, + "learning_rate": 8.206408160236814e-06, + "loss": 0.4981, + "step": 2612 + }, + { + "epoch": 0.3, + "grad_norm": 1.80304552071042, + "learning_rate": 8.204980049552771e-06, + "loss": 0.4464, + "step": 2613 + }, + { + "epoch": 0.3, + "grad_norm": 1.8289280162883566, + "learning_rate": 8.203551494915447e-06, + "loss": 0.4563, + "step": 2614 + }, + { + "epoch": 0.3, + "grad_norm": 2.3997514625336303, + "learning_rate": 8.202122496522724e-06, + "loss": 0.5362, + "step": 2615 + }, + { + "epoch": 0.3, + "grad_norm": 5.915580868660194, + "learning_rate": 8.200693054572549e-06, + "loss": 0.454, + "step": 2616 + }, + { + "epoch": 0.3, + "grad_norm": 1.0165465694625697, + "learning_rate": 8.199263169262926e-06, + "loss": 0.7581, + "step": 2617 + }, + { + "epoch": 0.3, + "grad_norm": 2.5175960025874917, + "learning_rate": 8.197832840791921e-06, + "loss": 0.4169, + "step": 2618 + }, + { + "epoch": 0.3, + "grad_norm": 2.309293361666766, + "learning_rate": 8.196402069357667e-06, + "loss": 0.5654, + "step": 2619 + }, + { + "epoch": 0.3, + "grad_norm": 1.913954291665579, + "learning_rate": 8.194970855158351e-06, + "loss": 0.5939, + "step": 2620 + }, + { + "epoch": 0.3, + "grad_norm": 2.5134998548093526, + "learning_rate": 8.193539198392223e-06, + "loss": 0.6259, + "step": 2621 + }, + { + "epoch": 0.3, + "grad_norm": 2.643991665914572, + "learning_rate": 8.192107099257604e-06, + "loss": 0.4816, + "step": 2622 + }, + { + "epoch": 0.3, + "grad_norm": 2.201914171500576, + "learning_rate": 8.190674557952859e-06, + "loss": 0.5591, + "step": 2623 + }, + { + "epoch": 0.3, + "grad_norm": 0.9421403735108577, + "learning_rate": 8.189241574676428e-06, + "loss": 0.7072, + "step": 2624 + }, + { + "epoch": 0.3, + "grad_norm": 2.026448528775754, + "learning_rate": 8.187808149626805e-06, + "loss": 0.5318, + "step": 2625 + }, + { + "epoch": 0.3, + "grad_norm": 2.1255734077205064, + "learning_rate": 8.18637428300255e-06, + "loss": 0.4483, + "step": 2626 + }, + { + "epoch": 0.3, + "grad_norm": 2.3026666589377163, + "learning_rate": 8.184939975002282e-06, + "loss": 0.5126, + "step": 2627 + }, + { + "epoch": 0.3, + "grad_norm": 1.924866681636218, + "learning_rate": 8.183505225824678e-06, + "loss": 0.4764, + "step": 2628 + }, + { + "epoch": 0.3, + "grad_norm": 2.5455062793008416, + "learning_rate": 8.182070035668483e-06, + "loss": 0.4667, + "step": 2629 + }, + { + "epoch": 0.3, + "grad_norm": 2.1014178301836886, + "learning_rate": 8.180634404732499e-06, + "loss": 0.51, + "step": 2630 + }, + { + "epoch": 0.3, + "grad_norm": 1.6326201780747789, + "learning_rate": 8.179198333215588e-06, + "loss": 0.4979, + "step": 2631 + }, + { + "epoch": 0.3, + "grad_norm": 2.1127585387318475, + "learning_rate": 8.177761821316673e-06, + "loss": 0.4186, + "step": 2632 + }, + { + "epoch": 0.3, + "grad_norm": 3.5667817685790095, + "learning_rate": 8.17632486923474e-06, + "loss": 0.4789, + "step": 2633 + }, + { + "epoch": 0.3, + "grad_norm": 3.2282953532535705, + "learning_rate": 8.174887477168838e-06, + "loss": 0.5056, + "step": 2634 + }, + { + "epoch": 0.3, + "grad_norm": 2.2731624794648244, + "learning_rate": 8.173449645318073e-06, + "loss": 0.4992, + "step": 2635 + }, + { + "epoch": 0.3, + "grad_norm": 1.7622816661750307, + "learning_rate": 8.172011373881613e-06, + "loss": 0.4283, + "step": 2636 + }, + { + "epoch": 0.3, + "grad_norm": 1.8124244627370563, + "learning_rate": 8.170572663058685e-06, + "loss": 0.4379, + "step": 2637 + }, + { + "epoch": 0.3, + "grad_norm": 1.812719529052574, + "learning_rate": 8.169133513048581e-06, + "loss": 0.5041, + "step": 2638 + }, + { + "epoch": 0.3, + "grad_norm": 3.046043517141781, + "learning_rate": 8.167693924050654e-06, + "loss": 0.5004, + "step": 2639 + }, + { + "epoch": 0.3, + "grad_norm": 2.141722516858288, + "learning_rate": 8.166253896264313e-06, + "loss": 0.5229, + "step": 2640 + }, + { + "epoch": 0.3, + "grad_norm": 2.31208919722211, + "learning_rate": 8.164813429889028e-06, + "loss": 0.6349, + "step": 2641 + }, + { + "epoch": 0.3, + "grad_norm": 2.191106200937493, + "learning_rate": 8.163372525124337e-06, + "loss": 0.5277, + "step": 2642 + }, + { + "epoch": 0.3, + "grad_norm": 2.578088089010064, + "learning_rate": 8.161931182169831e-06, + "loss": 0.5299, + "step": 2643 + }, + { + "epoch": 0.3, + "grad_norm": 3.4168045269213048, + "learning_rate": 8.160489401225164e-06, + "loss": 0.5204, + "step": 2644 + }, + { + "epoch": 0.3, + "grad_norm": 2.136566182545421, + "learning_rate": 8.159047182490055e-06, + "loss": 0.4372, + "step": 2645 + }, + { + "epoch": 0.3, + "grad_norm": 2.3239674995998194, + "learning_rate": 8.157604526164277e-06, + "loss": 0.5406, + "step": 2646 + }, + { + "epoch": 0.3, + "grad_norm": 1.9675343438418094, + "learning_rate": 8.156161432447667e-06, + "loss": 0.5072, + "step": 2647 + }, + { + "epoch": 0.3, + "grad_norm": 1.9496801825564734, + "learning_rate": 8.154717901540122e-06, + "loss": 0.4905, + "step": 2648 + }, + { + "epoch": 0.3, + "grad_norm": 2.828644232880727, + "learning_rate": 8.153273933641598e-06, + "loss": 0.4398, + "step": 2649 + }, + { + "epoch": 0.3, + "grad_norm": 2.1060654131164345, + "learning_rate": 8.151829528952116e-06, + "loss": 0.5236, + "step": 2650 + }, + { + "epoch": 0.3, + "grad_norm": 0.8552252104557339, + "learning_rate": 8.150384687671754e-06, + "loss": 0.7433, + "step": 2651 + }, + { + "epoch": 0.3, + "grad_norm": 2.3678985640533976, + "learning_rate": 8.148939410000651e-06, + "loss": 0.5357, + "step": 2652 + }, + { + "epoch": 0.3, + "grad_norm": 2.509625024280866, + "learning_rate": 8.147493696139005e-06, + "loss": 0.5589, + "step": 2653 + }, + { + "epoch": 0.3, + "grad_norm": 2.101670409898381, + "learning_rate": 8.146047546287077e-06, + "loss": 0.587, + "step": 2654 + }, + { + "epoch": 0.31, + "grad_norm": 1.9633985830700957, + "learning_rate": 8.144600960645188e-06, + "loss": 0.6321, + "step": 2655 + }, + { + "epoch": 0.31, + "grad_norm": 2.479628384598657, + "learning_rate": 8.143153939413722e-06, + "loss": 0.4181, + "step": 2656 + }, + { + "epoch": 0.31, + "grad_norm": 1.9862100390016741, + "learning_rate": 8.141706482793113e-06, + "loss": 0.5357, + "step": 2657 + }, + { + "epoch": 0.31, + "grad_norm": 2.1669535494021783, + "learning_rate": 8.140258590983867e-06, + "loss": 0.5263, + "step": 2658 + }, + { + "epoch": 0.31, + "grad_norm": 3.2025931739623035, + "learning_rate": 8.138810264186547e-06, + "loss": 0.447, + "step": 2659 + }, + { + "epoch": 0.31, + "grad_norm": 2.6826077503564942, + "learning_rate": 8.137361502601771e-06, + "loss": 0.5164, + "step": 2660 + }, + { + "epoch": 0.31, + "grad_norm": 2.223374492700929, + "learning_rate": 8.135912306430222e-06, + "loss": 0.4507, + "step": 2661 + }, + { + "epoch": 0.31, + "grad_norm": 2.2391538014463412, + "learning_rate": 8.134462675872645e-06, + "loss": 0.4993, + "step": 2662 + }, + { + "epoch": 0.31, + "grad_norm": 2.166778801813421, + "learning_rate": 8.13301261112984e-06, + "loss": 0.5364, + "step": 2663 + }, + { + "epoch": 0.31, + "grad_norm": 1.8973864942428746, + "learning_rate": 8.131562112402673e-06, + "loss": 0.5187, + "step": 2664 + }, + { + "epoch": 0.31, + "grad_norm": 2.5920960687867973, + "learning_rate": 8.130111179892062e-06, + "loss": 0.4947, + "step": 2665 + }, + { + "epoch": 0.31, + "grad_norm": 3.1031256648416106, + "learning_rate": 8.128659813798993e-06, + "loss": 0.5763, + "step": 2666 + }, + { + "epoch": 0.31, + "grad_norm": 1.8341656228175178, + "learning_rate": 8.12720801432451e-06, + "loss": 0.4709, + "step": 2667 + }, + { + "epoch": 0.31, + "grad_norm": 2.3210476864035043, + "learning_rate": 8.125755781669713e-06, + "loss": 0.5819, + "step": 2668 + }, + { + "epoch": 0.31, + "grad_norm": 2.461526355609641, + "learning_rate": 8.124303116035768e-06, + "loss": 0.5239, + "step": 2669 + }, + { + "epoch": 0.31, + "grad_norm": 1.924909982007976, + "learning_rate": 8.122850017623896e-06, + "loss": 0.532, + "step": 2670 + }, + { + "epoch": 0.31, + "grad_norm": 2.4378221968775082, + "learning_rate": 8.121396486635379e-06, + "loss": 0.5152, + "step": 2671 + }, + { + "epoch": 0.31, + "grad_norm": 1.7209751460374247, + "learning_rate": 8.119942523271562e-06, + "loss": 0.5047, + "step": 2672 + }, + { + "epoch": 0.31, + "grad_norm": 2.1142838178598167, + "learning_rate": 8.118488127733848e-06, + "loss": 0.5039, + "step": 2673 + }, + { + "epoch": 0.31, + "grad_norm": 3.0849872068419386, + "learning_rate": 8.117033300223698e-06, + "loss": 0.4756, + "step": 2674 + }, + { + "epoch": 0.31, + "grad_norm": 2.114156491338893, + "learning_rate": 8.115578040942636e-06, + "loss": 0.4856, + "step": 2675 + }, + { + "epoch": 0.31, + "grad_norm": 0.8830685358429977, + "learning_rate": 8.114122350092242e-06, + "loss": 0.7568, + "step": 2676 + }, + { + "epoch": 0.31, + "grad_norm": 2.0350935414465514, + "learning_rate": 8.11266622787416e-06, + "loss": 0.4157, + "step": 2677 + }, + { + "epoch": 0.31, + "grad_norm": 5.6954817884646785, + "learning_rate": 8.11120967449009e-06, + "loss": 0.5191, + "step": 2678 + }, + { + "epoch": 0.31, + "grad_norm": 2.1680373895461678, + "learning_rate": 8.109752690141797e-06, + "loss": 0.42, + "step": 2679 + }, + { + "epoch": 0.31, + "grad_norm": 2.7632888267007614, + "learning_rate": 8.1082952750311e-06, + "loss": 0.584, + "step": 2680 + }, + { + "epoch": 0.31, + "grad_norm": 2.592766198098536, + "learning_rate": 8.106837429359879e-06, + "loss": 0.6261, + "step": 2681 + }, + { + "epoch": 0.31, + "grad_norm": 2.285498419495638, + "learning_rate": 8.105379153330075e-06, + "loss": 0.4616, + "step": 2682 + }, + { + "epoch": 0.31, + "grad_norm": 1.9979798897872922, + "learning_rate": 8.103920447143689e-06, + "loss": 0.488, + "step": 2683 + }, + { + "epoch": 0.31, + "grad_norm": 2.8184448715286545, + "learning_rate": 8.10246131100278e-06, + "loss": 0.5482, + "step": 2684 + }, + { + "epoch": 0.31, + "grad_norm": 3.4075334417898797, + "learning_rate": 8.101001745109466e-06, + "loss": 0.5369, + "step": 2685 + }, + { + "epoch": 0.31, + "grad_norm": 1.998913152808339, + "learning_rate": 8.09954174966593e-06, + "loss": 0.4817, + "step": 2686 + }, + { + "epoch": 0.31, + "grad_norm": 2.7075597433061724, + "learning_rate": 8.098081324874407e-06, + "loss": 0.5357, + "step": 2687 + }, + { + "epoch": 0.31, + "grad_norm": 3.7572738965317525, + "learning_rate": 8.096620470937196e-06, + "loss": 0.5349, + "step": 2688 + }, + { + "epoch": 0.31, + "grad_norm": 4.975533596088515, + "learning_rate": 8.095159188056654e-06, + "loss": 0.4405, + "step": 2689 + }, + { + "epoch": 0.31, + "grad_norm": 8.456773770118364, + "learning_rate": 8.093697476435196e-06, + "loss": 0.4993, + "step": 2690 + }, + { + "epoch": 0.31, + "grad_norm": 2.483942110684858, + "learning_rate": 8.0922353362753e-06, + "loss": 0.5165, + "step": 2691 + }, + { + "epoch": 0.31, + "grad_norm": 2.401080190148116, + "learning_rate": 8.0907727677795e-06, + "loss": 0.481, + "step": 2692 + }, + { + "epoch": 0.31, + "grad_norm": 2.295217433827886, + "learning_rate": 8.089309771150391e-06, + "loss": 0.4102, + "step": 2693 + }, + { + "epoch": 0.31, + "grad_norm": 2.0742333357729574, + "learning_rate": 8.08784634659063e-06, + "loss": 0.5975, + "step": 2694 + }, + { + "epoch": 0.31, + "grad_norm": 0.9327466394488196, + "learning_rate": 8.086382494302927e-06, + "loss": 0.7196, + "step": 2695 + }, + { + "epoch": 0.31, + "grad_norm": 2.3095289021661425, + "learning_rate": 8.084918214490054e-06, + "loss": 0.4926, + "step": 2696 + }, + { + "epoch": 0.31, + "grad_norm": 1.850572146198129, + "learning_rate": 8.083453507354846e-06, + "loss": 0.4379, + "step": 2697 + }, + { + "epoch": 0.31, + "grad_norm": 1.993891193207234, + "learning_rate": 8.081988373100192e-06, + "loss": 0.476, + "step": 2698 + }, + { + "epoch": 0.31, + "grad_norm": 2.812939091119548, + "learning_rate": 8.08052281192904e-06, + "loss": 0.4542, + "step": 2699 + }, + { + "epoch": 0.31, + "grad_norm": 2.3602484852291896, + "learning_rate": 8.079056824044405e-06, + "loss": 0.4792, + "step": 2700 + }, + { + "epoch": 0.31, + "grad_norm": 11.80114180217629, + "learning_rate": 8.077590409649351e-06, + "loss": 0.4957, + "step": 2701 + }, + { + "epoch": 0.31, + "grad_norm": 1.9001716608028922, + "learning_rate": 8.076123568947006e-06, + "loss": 0.5238, + "step": 2702 + }, + { + "epoch": 0.31, + "grad_norm": 2.4884668539980117, + "learning_rate": 8.074656302140558e-06, + "loss": 0.5645, + "step": 2703 + }, + { + "epoch": 0.31, + "grad_norm": 3.1848586805360437, + "learning_rate": 8.07318860943325e-06, + "loss": 0.6117, + "step": 2704 + }, + { + "epoch": 0.31, + "grad_norm": 2.8737626556460176, + "learning_rate": 8.071720491028388e-06, + "loss": 0.4409, + "step": 2705 + }, + { + "epoch": 0.31, + "grad_norm": 2.627295299813453, + "learning_rate": 8.070251947129337e-06, + "loss": 0.521, + "step": 2706 + }, + { + "epoch": 0.31, + "grad_norm": 3.0278307900889434, + "learning_rate": 8.068782977939518e-06, + "loss": 0.551, + "step": 2707 + }, + { + "epoch": 0.31, + "grad_norm": 1.7160545691083138, + "learning_rate": 8.067313583662413e-06, + "loss": 0.5531, + "step": 2708 + }, + { + "epoch": 0.31, + "grad_norm": 2.8424686694430443, + "learning_rate": 8.06584376450156e-06, + "loss": 0.5644, + "step": 2709 + }, + { + "epoch": 0.31, + "grad_norm": 0.873073946817738, + "learning_rate": 8.06437352066056e-06, + "loss": 0.7099, + "step": 2710 + }, + { + "epoch": 0.31, + "grad_norm": 3.435571342920133, + "learning_rate": 8.06290285234307e-06, + "loss": 0.5966, + "step": 2711 + }, + { + "epoch": 0.31, + "grad_norm": 1.9885806864176903, + "learning_rate": 8.061431759752809e-06, + "loss": 0.4403, + "step": 2712 + }, + { + "epoch": 0.31, + "grad_norm": 2.4188466040353287, + "learning_rate": 8.059960243093551e-06, + "loss": 0.4744, + "step": 2713 + }, + { + "epoch": 0.31, + "grad_norm": 2.0671385230118164, + "learning_rate": 8.05848830256913e-06, + "loss": 0.5413, + "step": 2714 + }, + { + "epoch": 0.31, + "grad_norm": 1.9323694346639906, + "learning_rate": 8.057015938383438e-06, + "loss": 0.4844, + "step": 2715 + }, + { + "epoch": 0.31, + "grad_norm": 1.6759499190251672, + "learning_rate": 8.05554315074043e-06, + "loss": 0.4624, + "step": 2716 + }, + { + "epoch": 0.31, + "grad_norm": 2.3345206746226324, + "learning_rate": 8.05406993984411e-06, + "loss": 0.516, + "step": 2717 + }, + { + "epoch": 0.31, + "grad_norm": 3.546702723028085, + "learning_rate": 8.052596305898555e-06, + "loss": 0.3679, + "step": 2718 + }, + { + "epoch": 0.31, + "grad_norm": 2.0698493310403165, + "learning_rate": 8.051122249107885e-06, + "loss": 0.4803, + "step": 2719 + }, + { + "epoch": 0.31, + "grad_norm": 2.298032294374249, + "learning_rate": 8.049647769676291e-06, + "loss": 0.4668, + "step": 2720 + }, + { + "epoch": 0.31, + "grad_norm": 2.8229886794485424, + "learning_rate": 8.048172867808018e-06, + "loss": 0.5083, + "step": 2721 + }, + { + "epoch": 0.31, + "grad_norm": 2.0294921317316788, + "learning_rate": 8.046697543707364e-06, + "loss": 0.5161, + "step": 2722 + }, + { + "epoch": 0.31, + "grad_norm": 2.0472378728028997, + "learning_rate": 8.045221797578698e-06, + "loss": 0.6513, + "step": 2723 + }, + { + "epoch": 0.31, + "grad_norm": 3.13750463813746, + "learning_rate": 8.043745629626433e-06, + "loss": 0.5348, + "step": 2724 + }, + { + "epoch": 0.31, + "grad_norm": 2.8630121861921047, + "learning_rate": 8.04226904005505e-06, + "loss": 0.4848, + "step": 2725 + }, + { + "epoch": 0.31, + "grad_norm": 2.133918818543244, + "learning_rate": 8.040792029069089e-06, + "loss": 0.4513, + "step": 2726 + }, + { + "epoch": 0.31, + "grad_norm": 2.273553407620105, + "learning_rate": 8.039314596873141e-06, + "loss": 0.6088, + "step": 2727 + }, + { + "epoch": 0.31, + "grad_norm": 3.5674563715576912, + "learning_rate": 8.037836743671863e-06, + "loss": 0.5608, + "step": 2728 + }, + { + "epoch": 0.31, + "grad_norm": 4.661128706263857, + "learning_rate": 8.036358469669962e-06, + "loss": 0.4331, + "step": 2729 + }, + { + "epoch": 0.31, + "grad_norm": 1.7991540685105603, + "learning_rate": 8.034879775072215e-06, + "loss": 0.5399, + "step": 2730 + }, + { + "epoch": 0.31, + "grad_norm": 1.800810400125649, + "learning_rate": 8.033400660083448e-06, + "loss": 0.4012, + "step": 2731 + }, + { + "epoch": 0.31, + "grad_norm": 2.0229030226586047, + "learning_rate": 8.031921124908545e-06, + "loss": 0.4888, + "step": 2732 + }, + { + "epoch": 0.31, + "grad_norm": 2.467249055650737, + "learning_rate": 8.030441169752452e-06, + "loss": 0.575, + "step": 2733 + }, + { + "epoch": 0.31, + "grad_norm": 2.248913832923308, + "learning_rate": 8.028960794820176e-06, + "loss": 0.4882, + "step": 2734 + }, + { + "epoch": 0.31, + "grad_norm": 2.404628334176481, + "learning_rate": 8.027480000316773e-06, + "loss": 0.5549, + "step": 2735 + }, + { + "epoch": 0.31, + "grad_norm": 1.8413735537611442, + "learning_rate": 8.025998786447364e-06, + "loss": 0.5055, + "step": 2736 + }, + { + "epoch": 0.31, + "grad_norm": 2.609735715145815, + "learning_rate": 8.024517153417129e-06, + "loss": 0.6208, + "step": 2737 + }, + { + "epoch": 0.31, + "grad_norm": 2.3130414938541075, + "learning_rate": 8.023035101431303e-06, + "loss": 0.595, + "step": 2738 + }, + { + "epoch": 0.31, + "grad_norm": 3.400789501084477, + "learning_rate": 8.021552630695176e-06, + "loss": 0.6384, + "step": 2739 + }, + { + "epoch": 0.31, + "grad_norm": 2.1878091553452323, + "learning_rate": 8.020069741414103e-06, + "loss": 0.414, + "step": 2740 + }, + { + "epoch": 0.31, + "grad_norm": 5.91606653270814, + "learning_rate": 8.018586433793492e-06, + "loss": 0.5733, + "step": 2741 + }, + { + "epoch": 0.32, + "grad_norm": 2.1628194132270306, + "learning_rate": 8.01710270803881e-06, + "loss": 0.5219, + "step": 2742 + }, + { + "epoch": 0.32, + "grad_norm": 1.930665557151176, + "learning_rate": 8.015618564355585e-06, + "loss": 0.56, + "step": 2743 + }, + { + "epoch": 0.32, + "grad_norm": 2.8983264009292906, + "learning_rate": 8.014134002949399e-06, + "loss": 0.4663, + "step": 2744 + }, + { + "epoch": 0.32, + "grad_norm": 2.0660160621890356, + "learning_rate": 8.012649024025892e-06, + "loss": 0.5093, + "step": 2745 + }, + { + "epoch": 0.32, + "grad_norm": 2.482865082761326, + "learning_rate": 8.011163627790765e-06, + "loss": 0.5785, + "step": 2746 + }, + { + "epoch": 0.32, + "grad_norm": 1.9170283553749623, + "learning_rate": 8.009677814449773e-06, + "loss": 0.535, + "step": 2747 + }, + { + "epoch": 0.32, + "grad_norm": 2.512741740230055, + "learning_rate": 8.008191584208732e-06, + "loss": 0.6837, + "step": 2748 + }, + { + "epoch": 0.32, + "grad_norm": 2.7527201018949867, + "learning_rate": 8.006704937273513e-06, + "loss": 0.5023, + "step": 2749 + }, + { + "epoch": 0.32, + "grad_norm": 4.665934520558981, + "learning_rate": 8.005217873850048e-06, + "loss": 0.4276, + "step": 2750 + }, + { + "epoch": 0.32, + "grad_norm": 2.00337124871565, + "learning_rate": 8.003730394144322e-06, + "loss": 0.4756, + "step": 2751 + }, + { + "epoch": 0.32, + "grad_norm": 2.1055865720560973, + "learning_rate": 8.002242498362384e-06, + "loss": 0.631, + "step": 2752 + }, + { + "epoch": 0.32, + "grad_norm": 2.3361706375862212, + "learning_rate": 8.000754186710333e-06, + "loss": 0.508, + "step": 2753 + }, + { + "epoch": 0.32, + "grad_norm": 2.2415534899797693, + "learning_rate": 7.999265459394334e-06, + "loss": 0.5675, + "step": 2754 + }, + { + "epoch": 0.32, + "grad_norm": 3.9876418285797244, + "learning_rate": 7.997776316620603e-06, + "loss": 0.5199, + "step": 2755 + }, + { + "epoch": 0.32, + "grad_norm": 2.1292822971195653, + "learning_rate": 7.996286758595413e-06, + "loss": 0.5067, + "step": 2756 + }, + { + "epoch": 0.32, + "grad_norm": 1.9961904909637553, + "learning_rate": 7.994796785525103e-06, + "loss": 0.5629, + "step": 2757 + }, + { + "epoch": 0.32, + "grad_norm": 2.5422441621330054, + "learning_rate": 7.993306397616061e-06, + "loss": 0.5239, + "step": 2758 + }, + { + "epoch": 0.32, + "grad_norm": 2.339139495398308, + "learning_rate": 7.991815595074733e-06, + "loss": 0.5247, + "step": 2759 + }, + { + "epoch": 0.32, + "grad_norm": 1.9217820361181952, + "learning_rate": 7.990324378107628e-06, + "loss": 0.5198, + "step": 2760 + }, + { + "epoch": 0.32, + "grad_norm": 3.34673339581103, + "learning_rate": 7.98883274692131e-06, + "loss": 0.5867, + "step": 2761 + }, + { + "epoch": 0.32, + "grad_norm": 2.25636343561003, + "learning_rate": 7.987340701722395e-06, + "loss": 0.6272, + "step": 2762 + }, + { + "epoch": 0.32, + "grad_norm": 2.6447224408726435, + "learning_rate": 7.985848242717564e-06, + "loss": 0.5116, + "step": 2763 + }, + { + "epoch": 0.32, + "grad_norm": 2.779417522671597, + "learning_rate": 7.984355370113553e-06, + "loss": 0.5527, + "step": 2764 + }, + { + "epoch": 0.32, + "grad_norm": 0.9295455417084404, + "learning_rate": 7.982862084117152e-06, + "loss": 0.7216, + "step": 2765 + }, + { + "epoch": 0.32, + "grad_norm": 2.928475627983069, + "learning_rate": 7.98136838493521e-06, + "loss": 0.5175, + "step": 2766 + }, + { + "epoch": 0.32, + "grad_norm": 2.054350552554032, + "learning_rate": 7.97987427277464e-06, + "loss": 0.4977, + "step": 2767 + }, + { + "epoch": 0.32, + "grad_norm": 2.3071197434238906, + "learning_rate": 7.978379747842398e-06, + "loss": 0.552, + "step": 2768 + }, + { + "epoch": 0.32, + "grad_norm": 0.8658521399209966, + "learning_rate": 7.97688481034551e-06, + "loss": 0.7213, + "step": 2769 + }, + { + "epoch": 0.32, + "grad_norm": 2.5858970200435887, + "learning_rate": 7.975389460491054e-06, + "loss": 0.4605, + "step": 2770 + }, + { + "epoch": 0.32, + "grad_norm": 1.8410170875727503, + "learning_rate": 7.973893698486166e-06, + "loss": 0.5975, + "step": 2771 + }, + { + "epoch": 0.32, + "grad_norm": 7.281415110380626, + "learning_rate": 7.972397524538036e-06, + "loss": 0.5134, + "step": 2772 + }, + { + "epoch": 0.32, + "grad_norm": 2.1662379473963287, + "learning_rate": 7.970900938853918e-06, + "loss": 0.5741, + "step": 2773 + }, + { + "epoch": 0.32, + "grad_norm": 2.489914144369624, + "learning_rate": 7.969403941641117e-06, + "loss": 0.5369, + "step": 2774 + }, + { + "epoch": 0.32, + "grad_norm": 1.9439346769260484, + "learning_rate": 7.967906533106994e-06, + "loss": 0.5503, + "step": 2775 + }, + { + "epoch": 0.32, + "grad_norm": 2.228612661596518, + "learning_rate": 7.966408713458973e-06, + "loss": 0.5183, + "step": 2776 + }, + { + "epoch": 0.32, + "grad_norm": 1.9583391601775046, + "learning_rate": 7.964910482904532e-06, + "loss": 0.5577, + "step": 2777 + }, + { + "epoch": 0.32, + "grad_norm": 2.172710734632264, + "learning_rate": 7.963411841651202e-06, + "loss": 0.5274, + "step": 2778 + }, + { + "epoch": 0.32, + "grad_norm": 1.8345351998090547, + "learning_rate": 7.961912789906579e-06, + "loss": 0.4859, + "step": 2779 + }, + { + "epoch": 0.32, + "grad_norm": 1.6811891351617008, + "learning_rate": 7.960413327878309e-06, + "loss": 0.5568, + "step": 2780 + }, + { + "epoch": 0.32, + "grad_norm": 2.0586414514304536, + "learning_rate": 7.958913455774097e-06, + "loss": 0.5107, + "step": 2781 + }, + { + "epoch": 0.32, + "grad_norm": 9.476799113416005, + "learning_rate": 7.957413173801706e-06, + "loss": 0.484, + "step": 2782 + }, + { + "epoch": 0.32, + "grad_norm": 2.029924826038305, + "learning_rate": 7.955912482168956e-06, + "loss": 0.5333, + "step": 2783 + }, + { + "epoch": 0.32, + "grad_norm": 2.2184977206129366, + "learning_rate": 7.954411381083717e-06, + "loss": 0.5, + "step": 2784 + }, + { + "epoch": 0.32, + "grad_norm": 2.075023537607446, + "learning_rate": 7.952909870753928e-06, + "loss": 0.4747, + "step": 2785 + }, + { + "epoch": 0.32, + "grad_norm": 0.9026421317046905, + "learning_rate": 7.951407951387575e-06, + "loss": 0.6914, + "step": 2786 + }, + { + "epoch": 0.32, + "grad_norm": 2.754194531422513, + "learning_rate": 7.949905623192702e-06, + "loss": 0.5321, + "step": 2787 + }, + { + "epoch": 0.32, + "grad_norm": 2.870129097248555, + "learning_rate": 7.948402886377415e-06, + "loss": 0.5972, + "step": 2788 + }, + { + "epoch": 0.32, + "grad_norm": 2.4313102873153656, + "learning_rate": 7.94689974114987e-06, + "loss": 0.496, + "step": 2789 + }, + { + "epoch": 0.32, + "grad_norm": 3.2216135299142925, + "learning_rate": 7.945396187718284e-06, + "loss": 0.6397, + "step": 2790 + }, + { + "epoch": 0.32, + "grad_norm": 2.7887964692358387, + "learning_rate": 7.943892226290929e-06, + "loss": 0.5073, + "step": 2791 + }, + { + "epoch": 0.32, + "grad_norm": 2.4775370164035797, + "learning_rate": 7.94238785707613e-06, + "loss": 0.5477, + "step": 2792 + }, + { + "epoch": 0.32, + "grad_norm": 2.3841762780066196, + "learning_rate": 7.940883080282276e-06, + "loss": 0.5021, + "step": 2793 + }, + { + "epoch": 0.32, + "grad_norm": 1.840327680179315, + "learning_rate": 7.939377896117808e-06, + "loss": 0.6436, + "step": 2794 + }, + { + "epoch": 0.32, + "grad_norm": 1.86383657667886, + "learning_rate": 7.937872304791222e-06, + "loss": 0.5248, + "step": 2795 + }, + { + "epoch": 0.32, + "grad_norm": 2.5563136517955662, + "learning_rate": 7.936366306511074e-06, + "loss": 0.4609, + "step": 2796 + }, + { + "epoch": 0.32, + "grad_norm": 2.223963798314906, + "learning_rate": 7.934859901485973e-06, + "loss": 0.5053, + "step": 2797 + }, + { + "epoch": 0.32, + "grad_norm": 1.9513097465351306, + "learning_rate": 7.933353089924586e-06, + "loss": 0.4631, + "step": 2798 + }, + { + "epoch": 0.32, + "grad_norm": 1.7381478779910948, + "learning_rate": 7.93184587203564e-06, + "loss": 0.4679, + "step": 2799 + }, + { + "epoch": 0.32, + "grad_norm": 4.306291311604797, + "learning_rate": 7.93033824802791e-06, + "loss": 0.4704, + "step": 2800 + }, + { + "epoch": 0.32, + "grad_norm": 2.0992274939116955, + "learning_rate": 7.928830218110233e-06, + "loss": 0.5107, + "step": 2801 + }, + { + "epoch": 0.32, + "grad_norm": 1.5565434477154236, + "learning_rate": 7.9273217824915e-06, + "loss": 0.4521, + "step": 2802 + }, + { + "epoch": 0.32, + "grad_norm": 1.5943522873238118, + "learning_rate": 7.925812941380663e-06, + "loss": 0.4312, + "step": 2803 + }, + { + "epoch": 0.32, + "grad_norm": 2.6038082595038707, + "learning_rate": 7.924303694986723e-06, + "loss": 0.5406, + "step": 2804 + }, + { + "epoch": 0.32, + "grad_norm": 2.067468817240631, + "learning_rate": 7.922794043518742e-06, + "loss": 0.578, + "step": 2805 + }, + { + "epoch": 0.32, + "grad_norm": 1.893179960484438, + "learning_rate": 7.921283987185836e-06, + "loss": 0.5362, + "step": 2806 + }, + { + "epoch": 0.32, + "grad_norm": 1.8686549080765698, + "learning_rate": 7.919773526197178e-06, + "loss": 0.4546, + "step": 2807 + }, + { + "epoch": 0.32, + "grad_norm": 2.2669060659135227, + "learning_rate": 7.918262660761999e-06, + "loss": 0.5553, + "step": 2808 + }, + { + "epoch": 0.32, + "grad_norm": 1.7914401011706638, + "learning_rate": 7.916751391089579e-06, + "loss": 0.4995, + "step": 2809 + }, + { + "epoch": 0.32, + "grad_norm": 2.3464973615101257, + "learning_rate": 7.915239717389264e-06, + "loss": 0.495, + "step": 2810 + }, + { + "epoch": 0.32, + "grad_norm": 2.3686119086956654, + "learning_rate": 7.913727639870446e-06, + "loss": 0.5977, + "step": 2811 + }, + { + "epoch": 0.32, + "grad_norm": 2.2303095388299963, + "learning_rate": 7.912215158742581e-06, + "loss": 0.5549, + "step": 2812 + }, + { + "epoch": 0.32, + "grad_norm": 2.197381227343946, + "learning_rate": 7.910702274215176e-06, + "loss": 0.5626, + "step": 2813 + }, + { + "epoch": 0.32, + "grad_norm": 2.567462243597013, + "learning_rate": 7.909188986497797e-06, + "loss": 0.526, + "step": 2814 + }, + { + "epoch": 0.32, + "grad_norm": 1.9303974175298675, + "learning_rate": 7.907675295800062e-06, + "loss": 0.584, + "step": 2815 + }, + { + "epoch": 0.32, + "grad_norm": 7.745199598342414, + "learning_rate": 7.906161202331652e-06, + "loss": 0.4787, + "step": 2816 + }, + { + "epoch": 0.32, + "grad_norm": 1.8187286103340488, + "learning_rate": 7.904646706302292e-06, + "loss": 0.5737, + "step": 2817 + }, + { + "epoch": 0.32, + "grad_norm": 1.6503837972845514, + "learning_rate": 7.903131807921776e-06, + "loss": 0.4773, + "step": 2818 + }, + { + "epoch": 0.32, + "grad_norm": 2.014884374213472, + "learning_rate": 7.901616507399943e-06, + "loss": 0.4838, + "step": 2819 + }, + { + "epoch": 0.32, + "grad_norm": 2.3014735304666822, + "learning_rate": 7.900100804946695e-06, + "loss": 0.5783, + "step": 2820 + }, + { + "epoch": 0.32, + "grad_norm": 1.9541029595367323, + "learning_rate": 7.898584700771984e-06, + "loss": 0.5126, + "step": 2821 + }, + { + "epoch": 0.32, + "grad_norm": 3.7922591750437857, + "learning_rate": 7.897068195085825e-06, + "loss": 0.508, + "step": 2822 + }, + { + "epoch": 0.32, + "grad_norm": 2.333529609122084, + "learning_rate": 7.895551288098278e-06, + "loss": 0.5425, + "step": 2823 + }, + { + "epoch": 0.32, + "grad_norm": 1.6769325083547368, + "learning_rate": 7.894033980019471e-06, + "loss": 0.4845, + "step": 2824 + }, + { + "epoch": 0.32, + "grad_norm": 1.83972346676095, + "learning_rate": 7.892516271059577e-06, + "loss": 0.5696, + "step": 2825 + }, + { + "epoch": 0.32, + "grad_norm": 1.7205715956722716, + "learning_rate": 7.89099816142883e-06, + "loss": 0.4592, + "step": 2826 + }, + { + "epoch": 0.32, + "grad_norm": 1.7138444200888199, + "learning_rate": 7.88947965133752e-06, + "loss": 0.4277, + "step": 2827 + }, + { + "epoch": 0.32, + "grad_norm": 2.3180624398108725, + "learning_rate": 7.887960740995988e-06, + "loss": 0.6045, + "step": 2828 + }, + { + "epoch": 0.33, + "grad_norm": 2.3163712164472154, + "learning_rate": 7.886441430614635e-06, + "loss": 0.55, + "step": 2829 + }, + { + "epoch": 0.33, + "grad_norm": 2.650152235286882, + "learning_rate": 7.884921720403914e-06, + "loss": 0.4923, + "step": 2830 + }, + { + "epoch": 0.33, + "grad_norm": 2.3412441826971344, + "learning_rate": 7.883401610574338e-06, + "loss": 0.5816, + "step": 2831 + }, + { + "epoch": 0.33, + "grad_norm": 2.0717124813637398, + "learning_rate": 7.881881101336467e-06, + "loss": 0.5606, + "step": 2832 + }, + { + "epoch": 0.33, + "grad_norm": 0.8809679457647487, + "learning_rate": 7.880360192900928e-06, + "loss": 0.735, + "step": 2833 + }, + { + "epoch": 0.33, + "grad_norm": 1.795443244861914, + "learning_rate": 7.878838885478393e-06, + "loss": 0.4835, + "step": 2834 + }, + { + "epoch": 0.33, + "grad_norm": 2.0785830319635688, + "learning_rate": 7.877317179279593e-06, + "loss": 0.5664, + "step": 2835 + }, + { + "epoch": 0.33, + "grad_norm": 2.6909259881174155, + "learning_rate": 7.875795074515316e-06, + "loss": 0.3613, + "step": 2836 + }, + { + "epoch": 0.33, + "grad_norm": 2.040848180384166, + "learning_rate": 7.874272571396404e-06, + "loss": 0.446, + "step": 2837 + }, + { + "epoch": 0.33, + "grad_norm": 2.147286690884782, + "learning_rate": 7.872749670133754e-06, + "loss": 0.5337, + "step": 2838 + }, + { + "epoch": 0.33, + "grad_norm": 3.9512494779990552, + "learning_rate": 7.871226370938316e-06, + "loss": 0.516, + "step": 2839 + }, + { + "epoch": 0.33, + "grad_norm": 2.7564960507728684, + "learning_rate": 7.869702674021098e-06, + "loss": 0.4278, + "step": 2840 + }, + { + "epoch": 0.33, + "grad_norm": 2.0586388926302437, + "learning_rate": 7.868178579593165e-06, + "loss": 0.4846, + "step": 2841 + }, + { + "epoch": 0.33, + "grad_norm": 2.0518253062349205, + "learning_rate": 7.86665408786563e-06, + "loss": 0.6266, + "step": 2842 + }, + { + "epoch": 0.33, + "grad_norm": 0.9195303083858667, + "learning_rate": 7.865129199049667e-06, + "loss": 0.7107, + "step": 2843 + }, + { + "epoch": 0.33, + "grad_norm": 2.2937679083383267, + "learning_rate": 7.863603913356505e-06, + "loss": 0.6104, + "step": 2844 + }, + { + "epoch": 0.33, + "grad_norm": 2.6653358538520346, + "learning_rate": 7.862078230997425e-06, + "loss": 0.4646, + "step": 2845 + }, + { + "epoch": 0.33, + "grad_norm": 1.9752400262255085, + "learning_rate": 7.860552152183763e-06, + "loss": 0.5434, + "step": 2846 + }, + { + "epoch": 0.33, + "grad_norm": 2.120938523981115, + "learning_rate": 7.859025677126914e-06, + "loss": 0.4834, + "step": 2847 + }, + { + "epoch": 0.33, + "grad_norm": 1.8307417329711975, + "learning_rate": 7.857498806038321e-06, + "loss": 0.5691, + "step": 2848 + }, + { + "epoch": 0.33, + "grad_norm": 1.8207558744223356, + "learning_rate": 7.85597153912949e-06, + "loss": 0.5926, + "step": 2849 + }, + { + "epoch": 0.33, + "grad_norm": 2.496267171099766, + "learning_rate": 7.854443876611976e-06, + "loss": 0.5429, + "step": 2850 + }, + { + "epoch": 0.33, + "grad_norm": 2.606526518298688, + "learning_rate": 7.852915818697391e-06, + "loss": 0.585, + "step": 2851 + }, + { + "epoch": 0.33, + "grad_norm": 1.222585073046454, + "learning_rate": 7.851387365597401e-06, + "loss": 0.7776, + "step": 2852 + }, + { + "epoch": 0.33, + "grad_norm": 2.490542455946593, + "learning_rate": 7.849858517523725e-06, + "loss": 0.6018, + "step": 2853 + }, + { + "epoch": 0.33, + "grad_norm": 2.963991175454294, + "learning_rate": 7.848329274688143e-06, + "loss": 0.5209, + "step": 2854 + }, + { + "epoch": 0.33, + "grad_norm": 2.04660360536957, + "learning_rate": 7.84679963730248e-06, + "loss": 0.4392, + "step": 2855 + }, + { + "epoch": 0.33, + "grad_norm": 1.9347306604899075, + "learning_rate": 7.845269605578628e-06, + "loss": 0.446, + "step": 2856 + }, + { + "epoch": 0.33, + "grad_norm": 1.8521769781341793, + "learning_rate": 7.84373917972852e-06, + "loss": 0.4683, + "step": 2857 + }, + { + "epoch": 0.33, + "grad_norm": 2.105793466668921, + "learning_rate": 7.842208359964157e-06, + "loss": 0.5612, + "step": 2858 + }, + { + "epoch": 0.33, + "grad_norm": 1.8853773851640279, + "learning_rate": 7.840677146497582e-06, + "loss": 0.4859, + "step": 2859 + }, + { + "epoch": 0.33, + "grad_norm": 4.983068486139454, + "learning_rate": 7.8391455395409e-06, + "loss": 0.6218, + "step": 2860 + }, + { + "epoch": 0.33, + "grad_norm": 2.6540258285639426, + "learning_rate": 7.83761353930627e-06, + "loss": 0.4793, + "step": 2861 + }, + { + "epoch": 0.33, + "grad_norm": 2.0413710881394036, + "learning_rate": 7.836081146005906e-06, + "loss": 0.5717, + "step": 2862 + }, + { + "epoch": 0.33, + "grad_norm": 2.2535283194454703, + "learning_rate": 7.83454835985207e-06, + "loss": 0.4258, + "step": 2863 + }, + { + "epoch": 0.33, + "grad_norm": 2.51213697458294, + "learning_rate": 7.833015181057088e-06, + "loss": 0.5227, + "step": 2864 + }, + { + "epoch": 0.33, + "grad_norm": 0.9837458860826667, + "learning_rate": 7.831481609833333e-06, + "loss": 0.7266, + "step": 2865 + }, + { + "epoch": 0.33, + "grad_norm": 2.5202573275118496, + "learning_rate": 7.829947646393237e-06, + "loss": 0.6385, + "step": 2866 + }, + { + "epoch": 0.33, + "grad_norm": 2.4495264437524926, + "learning_rate": 7.828413290949282e-06, + "loss": 0.4387, + "step": 2867 + }, + { + "epoch": 0.33, + "grad_norm": 1.936619109920517, + "learning_rate": 7.826878543714007e-06, + "loss": 0.6082, + "step": 2868 + }, + { + "epoch": 0.33, + "grad_norm": 1.9382337242367331, + "learning_rate": 7.825343404900008e-06, + "loss": 0.5534, + "step": 2869 + }, + { + "epoch": 0.33, + "grad_norm": 2.4971381755755377, + "learning_rate": 7.823807874719929e-06, + "loss": 0.4565, + "step": 2870 + }, + { + "epoch": 0.33, + "grad_norm": 2.1234957063362, + "learning_rate": 7.82227195338647e-06, + "loss": 0.5726, + "step": 2871 + }, + { + "epoch": 0.33, + "grad_norm": 2.67202790166371, + "learning_rate": 7.820735641112394e-06, + "loss": 0.4659, + "step": 2872 + }, + { + "epoch": 0.33, + "grad_norm": 2.066033714524526, + "learning_rate": 7.819198938110501e-06, + "loss": 0.5954, + "step": 2873 + }, + { + "epoch": 0.33, + "grad_norm": 2.489886034718662, + "learning_rate": 7.817661844593661e-06, + "loss": 0.5677, + "step": 2874 + }, + { + "epoch": 0.33, + "grad_norm": 2.6799771980626135, + "learning_rate": 7.816124360774792e-06, + "loss": 0.5234, + "step": 2875 + }, + { + "epoch": 0.33, + "grad_norm": 2.007174919027362, + "learning_rate": 7.814586486866862e-06, + "loss": 0.4791, + "step": 2876 + }, + { + "epoch": 0.33, + "grad_norm": 2.0221138993847907, + "learning_rate": 7.8130482230829e-06, + "loss": 0.5348, + "step": 2877 + }, + { + "epoch": 0.33, + "grad_norm": 2.0484933471639537, + "learning_rate": 7.811509569635984e-06, + "loss": 0.5348, + "step": 2878 + }, + { + "epoch": 0.33, + "grad_norm": 3.3966521239363003, + "learning_rate": 7.80997052673925e-06, + "loss": 0.4443, + "step": 2879 + }, + { + "epoch": 0.33, + "grad_norm": 6.301959768027492, + "learning_rate": 7.808431094605887e-06, + "loss": 0.5895, + "step": 2880 + }, + { + "epoch": 0.33, + "grad_norm": 1.8765655337998974, + "learning_rate": 7.806891273449134e-06, + "loss": 0.4786, + "step": 2881 + }, + { + "epoch": 0.33, + "grad_norm": 2.1533827667974417, + "learning_rate": 7.80535106348229e-06, + "loss": 0.5444, + "step": 2882 + }, + { + "epoch": 0.33, + "grad_norm": 2.4780372306733187, + "learning_rate": 7.803810464918699e-06, + "loss": 0.5539, + "step": 2883 + }, + { + "epoch": 0.33, + "grad_norm": 2.053174106982524, + "learning_rate": 7.802269477971771e-06, + "loss": 0.4593, + "step": 2884 + }, + { + "epoch": 0.33, + "grad_norm": 3.7452782843645367, + "learning_rate": 7.80072810285496e-06, + "loss": 0.5226, + "step": 2885 + }, + { + "epoch": 0.33, + "grad_norm": 3.1701497276994215, + "learning_rate": 7.799186339781774e-06, + "loss": 0.5726, + "step": 2886 + }, + { + "epoch": 0.33, + "grad_norm": 1.9765197897320606, + "learning_rate": 7.797644188965785e-06, + "loss": 0.5338, + "step": 2887 + }, + { + "epoch": 0.33, + "grad_norm": 2.165322483903886, + "learning_rate": 7.796101650620605e-06, + "loss": 0.4888, + "step": 2888 + }, + { + "epoch": 0.33, + "grad_norm": 1.7802218277226602, + "learning_rate": 7.79455872495991e-06, + "loss": 0.4659, + "step": 2889 + }, + { + "epoch": 0.33, + "grad_norm": 2.078627091424822, + "learning_rate": 7.793015412197424e-06, + "loss": 0.5878, + "step": 2890 + }, + { + "epoch": 0.33, + "grad_norm": 2.1987500174222228, + "learning_rate": 7.791471712546928e-06, + "loss": 0.3887, + "step": 2891 + }, + { + "epoch": 0.33, + "grad_norm": 2.2349132037906094, + "learning_rate": 7.789927626222253e-06, + "loss": 0.5394, + "step": 2892 + }, + { + "epoch": 0.33, + "grad_norm": 1.9491747347423336, + "learning_rate": 7.788383153437286e-06, + "loss": 0.5352, + "step": 2893 + }, + { + "epoch": 0.33, + "grad_norm": 3.1520661463093, + "learning_rate": 7.786838294405968e-06, + "loss": 0.6683, + "step": 2894 + }, + { + "epoch": 0.33, + "grad_norm": 2.0177019206945705, + "learning_rate": 7.78529304934229e-06, + "loss": 0.5486, + "step": 2895 + }, + { + "epoch": 0.33, + "grad_norm": 1.8733395828896169, + "learning_rate": 7.783747418460305e-06, + "loss": 0.5539, + "step": 2896 + }, + { + "epoch": 0.33, + "grad_norm": 1.818741066646174, + "learning_rate": 7.782201401974107e-06, + "loss": 0.4972, + "step": 2897 + }, + { + "epoch": 0.33, + "grad_norm": 2.169323240902594, + "learning_rate": 7.780655000097854e-06, + "loss": 0.561, + "step": 2898 + }, + { + "epoch": 0.33, + "grad_norm": 3.481132685320535, + "learning_rate": 7.779108213045752e-06, + "loss": 0.5459, + "step": 2899 + }, + { + "epoch": 0.33, + "grad_norm": 2.483788380927395, + "learning_rate": 7.777561041032061e-06, + "loss": 0.5005, + "step": 2900 + }, + { + "epoch": 0.33, + "grad_norm": 3.4194458517288875, + "learning_rate": 7.776013484271096e-06, + "loss": 0.4974, + "step": 2901 + }, + { + "epoch": 0.33, + "grad_norm": 2.8954373011375965, + "learning_rate": 7.774465542977224e-06, + "loss": 0.5491, + "step": 2902 + }, + { + "epoch": 0.33, + "grad_norm": 2.1365244094847817, + "learning_rate": 7.772917217364866e-06, + "loss": 0.4709, + "step": 2903 + }, + { + "epoch": 0.33, + "grad_norm": 1.7232532986010074, + "learning_rate": 7.771368507648494e-06, + "loss": 0.5166, + "step": 2904 + }, + { + "epoch": 0.33, + "grad_norm": 3.049455832821686, + "learning_rate": 7.769819414042639e-06, + "loss": 0.5499, + "step": 2905 + }, + { + "epoch": 0.33, + "grad_norm": 2.012171878517844, + "learning_rate": 7.768269936761875e-06, + "loss": 0.5468, + "step": 2906 + }, + { + "epoch": 0.33, + "grad_norm": 2.474627658858279, + "learning_rate": 7.76672007602084e-06, + "loss": 0.5115, + "step": 2907 + }, + { + "epoch": 0.33, + "grad_norm": 2.2474340456451163, + "learning_rate": 7.76516983203422e-06, + "loss": 0.4763, + "step": 2908 + }, + { + "epoch": 0.33, + "grad_norm": 1.98043557433873, + "learning_rate": 7.763619205016754e-06, + "loss": 0.5194, + "step": 2909 + }, + { + "epoch": 0.33, + "grad_norm": 2.228230179586194, + "learning_rate": 7.762068195183234e-06, + "loss": 0.6098, + "step": 2910 + }, + { + "epoch": 0.33, + "grad_norm": 1.860624149234416, + "learning_rate": 7.760516802748506e-06, + "loss": 0.6057, + "step": 2911 + }, + { + "epoch": 0.33, + "grad_norm": 2.132851809397818, + "learning_rate": 7.75896502792747e-06, + "loss": 0.5063, + "step": 2912 + }, + { + "epoch": 0.33, + "grad_norm": 2.044307510309009, + "learning_rate": 7.757412870935074e-06, + "loss": 0.5451, + "step": 2913 + }, + { + "epoch": 0.33, + "grad_norm": 1.639350364709035, + "learning_rate": 7.755860331986326e-06, + "loss": 0.4569, + "step": 2914 + }, + { + "epoch": 0.33, + "grad_norm": 2.2857697468157556, + "learning_rate": 7.75430741129628e-06, + "loss": 0.5052, + "step": 2915 + }, + { + "epoch": 0.34, + "grad_norm": 1.9947975274243404, + "learning_rate": 7.752754109080051e-06, + "loss": 0.5661, + "step": 2916 + }, + { + "epoch": 0.34, + "grad_norm": 1.7206767443045559, + "learning_rate": 7.751200425552801e-06, + "loss": 0.5384, + "step": 2917 + }, + { + "epoch": 0.34, + "grad_norm": 1.8260874410773151, + "learning_rate": 7.749646360929741e-06, + "loss": 0.5626, + "step": 2918 + }, + { + "epoch": 0.34, + "grad_norm": 2.435843437185004, + "learning_rate": 7.748091915426145e-06, + "loss": 0.5373, + "step": 2919 + }, + { + "epoch": 0.34, + "grad_norm": 2.061451427930398, + "learning_rate": 7.746537089257332e-06, + "loss": 0.5114, + "step": 2920 + }, + { + "epoch": 0.34, + "grad_norm": 1.5931050831720555, + "learning_rate": 7.744981882638678e-06, + "loss": 0.4758, + "step": 2921 + }, + { + "epoch": 0.34, + "grad_norm": 3.3228744853219463, + "learning_rate": 7.743426295785608e-06, + "loss": 0.4034, + "step": 2922 + }, + { + "epoch": 0.34, + "grad_norm": 2.1897378820783415, + "learning_rate": 7.741870328913602e-06, + "loss": 0.5802, + "step": 2923 + }, + { + "epoch": 0.34, + "grad_norm": 1.7166339781316216, + "learning_rate": 7.740313982238196e-06, + "loss": 0.5451, + "step": 2924 + }, + { + "epoch": 0.34, + "grad_norm": 1.4997109601899017, + "learning_rate": 7.73875725597497e-06, + "loss": 0.49, + "step": 2925 + }, + { + "epoch": 0.34, + "grad_norm": 1.1203236791245705, + "learning_rate": 7.737200150339564e-06, + "loss": 0.7592, + "step": 2926 + }, + { + "epoch": 0.34, + "grad_norm": 1.742797129883485, + "learning_rate": 7.735642665547667e-06, + "loss": 0.3658, + "step": 2927 + }, + { + "epoch": 0.34, + "grad_norm": 2.3012555458803137, + "learning_rate": 7.734084801815022e-06, + "loss": 0.6232, + "step": 2928 + }, + { + "epoch": 0.34, + "grad_norm": 2.4691819321460167, + "learning_rate": 7.732526559357423e-06, + "loss": 0.561, + "step": 2929 + }, + { + "epoch": 0.34, + "grad_norm": 2.611099599992232, + "learning_rate": 7.730967938390718e-06, + "loss": 0.4972, + "step": 2930 + }, + { + "epoch": 0.34, + "grad_norm": 2.72783732716072, + "learning_rate": 7.729408939130809e-06, + "loss": 0.4314, + "step": 2931 + }, + { + "epoch": 0.34, + "grad_norm": 1.8368836825789712, + "learning_rate": 7.727849561793643e-06, + "loss": 0.4891, + "step": 2932 + }, + { + "epoch": 0.34, + "grad_norm": 1.8445229386917201, + "learning_rate": 7.726289806595231e-06, + "loss": 0.5145, + "step": 2933 + }, + { + "epoch": 0.34, + "grad_norm": 2.128903757136383, + "learning_rate": 7.724729673751628e-06, + "loss": 0.4367, + "step": 2934 + }, + { + "epoch": 0.34, + "grad_norm": 2.5988610619431016, + "learning_rate": 7.72316916347894e-06, + "loss": 0.5272, + "step": 2935 + }, + { + "epoch": 0.34, + "grad_norm": 2.3602582964595364, + "learning_rate": 7.721608275993334e-06, + "loss": 0.5232, + "step": 2936 + }, + { + "epoch": 0.34, + "grad_norm": 2.167494589365408, + "learning_rate": 7.720047011511018e-06, + "loss": 0.5174, + "step": 2937 + }, + { + "epoch": 0.34, + "grad_norm": 2.497134122359307, + "learning_rate": 7.718485370248264e-06, + "loss": 0.5033, + "step": 2938 + }, + { + "epoch": 0.34, + "grad_norm": 1.9261206907560326, + "learning_rate": 7.716923352421385e-06, + "loss": 0.4349, + "step": 2939 + }, + { + "epoch": 0.34, + "grad_norm": 2.383231649251385, + "learning_rate": 7.715360958246753e-06, + "loss": 0.4846, + "step": 2940 + }, + { + "epoch": 0.34, + "grad_norm": 11.890924212846894, + "learning_rate": 7.713798187940794e-06, + "loss": 0.4957, + "step": 2941 + }, + { + "epoch": 0.34, + "grad_norm": 1.8517308708455993, + "learning_rate": 7.712235041719979e-06, + "loss": 0.5856, + "step": 2942 + }, + { + "epoch": 0.34, + "grad_norm": 3.3859422380751445, + "learning_rate": 7.710671519800836e-06, + "loss": 0.545, + "step": 2943 + }, + { + "epoch": 0.34, + "grad_norm": 2.6501928972180138, + "learning_rate": 7.709107622399945e-06, + "loss": 0.5267, + "step": 2944 + }, + { + "epoch": 0.34, + "grad_norm": 3.9882644263317366, + "learning_rate": 7.707543349733932e-06, + "loss": 0.4831, + "step": 2945 + }, + { + "epoch": 0.34, + "grad_norm": 2.8356616527512446, + "learning_rate": 7.705978702019486e-06, + "loss": 0.5728, + "step": 2946 + }, + { + "epoch": 0.34, + "grad_norm": 1.791547352696187, + "learning_rate": 7.704413679473338e-06, + "loss": 0.4833, + "step": 2947 + }, + { + "epoch": 0.34, + "grad_norm": 2.0565533923255033, + "learning_rate": 7.702848282312275e-06, + "loss": 0.4865, + "step": 2948 + }, + { + "epoch": 0.34, + "grad_norm": 1.926220838080352, + "learning_rate": 7.701282510753137e-06, + "loss": 0.4934, + "step": 2949 + }, + { + "epoch": 0.34, + "grad_norm": 2.158863602545339, + "learning_rate": 7.699716365012813e-06, + "loss": 0.5122, + "step": 2950 + }, + { + "epoch": 0.34, + "grad_norm": 1.7349318596712946, + "learning_rate": 7.698149845308245e-06, + "loss": 0.4975, + "step": 2951 + }, + { + "epoch": 0.34, + "grad_norm": 2.0703992921889913, + "learning_rate": 7.696582951856428e-06, + "loss": 0.4926, + "step": 2952 + }, + { + "epoch": 0.34, + "grad_norm": 1.9216645456245265, + "learning_rate": 7.69501568487441e-06, + "loss": 0.536, + "step": 2953 + }, + { + "epoch": 0.34, + "grad_norm": 2.3466368473922428, + "learning_rate": 7.693448044579284e-06, + "loss": 0.5794, + "step": 2954 + }, + { + "epoch": 0.34, + "grad_norm": 2.630253512621041, + "learning_rate": 7.6918800311882e-06, + "loss": 0.5676, + "step": 2955 + }, + { + "epoch": 0.34, + "grad_norm": 1.8924825897882203, + "learning_rate": 7.690311644918362e-06, + "loss": 0.4483, + "step": 2956 + }, + { + "epoch": 0.34, + "grad_norm": 2.322509102806364, + "learning_rate": 7.68874288598702e-06, + "loss": 0.4541, + "step": 2957 + }, + { + "epoch": 0.34, + "grad_norm": 2.4574223382848848, + "learning_rate": 7.687173754611481e-06, + "loss": 0.4947, + "step": 2958 + }, + { + "epoch": 0.34, + "grad_norm": 3.5607622510857704, + "learning_rate": 7.685604251009097e-06, + "loss": 0.5787, + "step": 2959 + }, + { + "epoch": 0.34, + "grad_norm": 2.2096089635268163, + "learning_rate": 7.684034375397278e-06, + "loss": 0.4584, + "step": 2960 + }, + { + "epoch": 0.34, + "grad_norm": 2.2804338401435427, + "learning_rate": 7.682464127993483e-06, + "loss": 0.6141, + "step": 2961 + }, + { + "epoch": 0.34, + "grad_norm": 2.1756159314044514, + "learning_rate": 7.68089350901522e-06, + "loss": 0.47, + "step": 2962 + }, + { + "epoch": 0.34, + "grad_norm": 2.287938095293658, + "learning_rate": 7.679322518680054e-06, + "loss": 0.5433, + "step": 2963 + }, + { + "epoch": 0.34, + "grad_norm": 2.929173897204981, + "learning_rate": 7.677751157205597e-06, + "loss": 0.5351, + "step": 2964 + }, + { + "epoch": 0.34, + "grad_norm": 1.9663847241081105, + "learning_rate": 7.676179424809512e-06, + "loss": 0.3873, + "step": 2965 + }, + { + "epoch": 0.34, + "grad_norm": 1.9227217638591527, + "learning_rate": 7.674607321709517e-06, + "loss": 0.5108, + "step": 2966 + }, + { + "epoch": 0.34, + "grad_norm": 3.243532509885131, + "learning_rate": 7.673034848123379e-06, + "loss": 0.5614, + "step": 2967 + }, + { + "epoch": 0.34, + "grad_norm": 1.7371399234676472, + "learning_rate": 7.67146200426892e-06, + "loss": 0.5275, + "step": 2968 + }, + { + "epoch": 0.34, + "grad_norm": 0.9269322442188325, + "learning_rate": 7.669888790364002e-06, + "loss": 0.7532, + "step": 2969 + }, + { + "epoch": 0.34, + "grad_norm": 1.8511701629478972, + "learning_rate": 7.668315206626554e-06, + "loss": 0.5932, + "step": 2970 + }, + { + "epoch": 0.34, + "grad_norm": 2.064551487376635, + "learning_rate": 7.666741253274545e-06, + "loss": 0.5058, + "step": 2971 + }, + { + "epoch": 0.34, + "grad_norm": 0.874255400463469, + "learning_rate": 7.665166930525999e-06, + "loss": 0.7496, + "step": 2972 + }, + { + "epoch": 0.34, + "grad_norm": 2.200789282396879, + "learning_rate": 7.663592238598992e-06, + "loss": 0.6205, + "step": 2973 + }, + { + "epoch": 0.34, + "grad_norm": 2.0999519128644257, + "learning_rate": 7.66201717771165e-06, + "loss": 0.6294, + "step": 2974 + }, + { + "epoch": 0.34, + "grad_norm": 2.541270548818184, + "learning_rate": 7.660441748082148e-06, + "loss": 0.6071, + "step": 2975 + }, + { + "epoch": 0.34, + "grad_norm": 4.790211970550135, + "learning_rate": 7.658865949928717e-06, + "loss": 0.5314, + "step": 2976 + }, + { + "epoch": 0.34, + "grad_norm": 2.123254001834856, + "learning_rate": 7.657289783469637e-06, + "loss": 0.4634, + "step": 2977 + }, + { + "epoch": 0.34, + "grad_norm": 1.5908427041928235, + "learning_rate": 7.655713248923233e-06, + "loss": 0.4845, + "step": 2978 + }, + { + "epoch": 0.34, + "grad_norm": 0.9049266356984043, + "learning_rate": 7.654136346507892e-06, + "loss": 0.7384, + "step": 2979 + }, + { + "epoch": 0.34, + "grad_norm": 2.6002503760260605, + "learning_rate": 7.652559076442043e-06, + "loss": 0.5255, + "step": 2980 + }, + { + "epoch": 0.34, + "grad_norm": 2.090864433519095, + "learning_rate": 7.650981438944169e-06, + "loss": 0.4484, + "step": 2981 + }, + { + "epoch": 0.34, + "grad_norm": 2.2949779244741935, + "learning_rate": 7.649403434232807e-06, + "loss": 0.5032, + "step": 2982 + }, + { + "epoch": 0.34, + "grad_norm": 2.0295852578748144, + "learning_rate": 7.647825062526539e-06, + "loss": 0.5644, + "step": 2983 + }, + { + "epoch": 0.34, + "grad_norm": 1.8570147353674065, + "learning_rate": 7.646246324044005e-06, + "loss": 0.48, + "step": 2984 + }, + { + "epoch": 0.34, + "grad_norm": 2.2901509684985295, + "learning_rate": 7.644667219003885e-06, + "loss": 0.4782, + "step": 2985 + }, + { + "epoch": 0.34, + "grad_norm": 3.7965285771962836, + "learning_rate": 7.643087747624923e-06, + "loss": 0.5782, + "step": 2986 + }, + { + "epoch": 0.34, + "grad_norm": 1.9451023823398457, + "learning_rate": 7.641507910125901e-06, + "loss": 0.4883, + "step": 2987 + }, + { + "epoch": 0.34, + "grad_norm": 2.0068245925838193, + "learning_rate": 7.639927706725661e-06, + "loss": 0.4565, + "step": 2988 + }, + { + "epoch": 0.34, + "grad_norm": 2.8587125057933247, + "learning_rate": 7.638347137643094e-06, + "loss": 0.5599, + "step": 2989 + }, + { + "epoch": 0.34, + "grad_norm": 2.0719682769291112, + "learning_rate": 7.636766203097137e-06, + "loss": 0.5059, + "step": 2990 + }, + { + "epoch": 0.34, + "grad_norm": 1.8915907712243234, + "learning_rate": 7.635184903306783e-06, + "loss": 0.6083, + "step": 2991 + }, + { + "epoch": 0.34, + "grad_norm": 2.461162270295933, + "learning_rate": 7.633603238491072e-06, + "loss": 0.4568, + "step": 2992 + }, + { + "epoch": 0.34, + "grad_norm": 1.9377791600861256, + "learning_rate": 7.632021208869099e-06, + "loss": 0.5317, + "step": 2993 + }, + { + "epoch": 0.34, + "grad_norm": 2.1755013673172137, + "learning_rate": 7.630438814660002e-06, + "loss": 0.5211, + "step": 2994 + }, + { + "epoch": 0.34, + "grad_norm": 2.041833325931362, + "learning_rate": 7.628856056082976e-06, + "loss": 0.5576, + "step": 2995 + }, + { + "epoch": 0.34, + "grad_norm": 1.8244062603031144, + "learning_rate": 7.6272729333572656e-06, + "loss": 0.4423, + "step": 2996 + }, + { + "epoch": 0.34, + "grad_norm": 1.9113523393591565, + "learning_rate": 7.625689446702162e-06, + "loss": 0.5588, + "step": 2997 + }, + { + "epoch": 0.34, + "grad_norm": 1.7932671127752189, + "learning_rate": 7.6241055963370115e-06, + "loss": 0.4953, + "step": 2998 + }, + { + "epoch": 0.34, + "grad_norm": 2.084520604488172, + "learning_rate": 7.622521382481208e-06, + "loss": 0.5054, + "step": 2999 + }, + { + "epoch": 0.34, + "grad_norm": 2.585848774893426, + "learning_rate": 7.620936805354198e-06, + "loss": 0.5723, + "step": 3000 + }, + { + "epoch": 0.34, + "grad_norm": 2.0960427361570337, + "learning_rate": 7.619351865175475e-06, + "loss": 0.6097, + "step": 3001 + }, + { + "epoch": 0.34, + "grad_norm": 1.8099090711212205, + "learning_rate": 7.617766562164586e-06, + "loss": 0.504, + "step": 3002 + }, + { + "epoch": 0.35, + "grad_norm": 2.3004536172358447, + "learning_rate": 7.6161808965411255e-06, + "loss": 0.5545, + "step": 3003 + }, + { + "epoch": 0.35, + "grad_norm": 0.9187119060330333, + "learning_rate": 7.61459486852474e-06, + "loss": 0.7088, + "step": 3004 + }, + { + "epoch": 0.35, + "grad_norm": 2.1926570913718986, + "learning_rate": 7.6130084783351255e-06, + "loss": 0.5005, + "step": 3005 + }, + { + "epoch": 0.35, + "grad_norm": 26.3348170685861, + "learning_rate": 7.61142172619203e-06, + "loss": 0.5001, + "step": 3006 + }, + { + "epoch": 0.35, + "grad_norm": 1.8228877120025713, + "learning_rate": 7.60983461231525e-06, + "loss": 0.5537, + "step": 3007 + }, + { + "epoch": 0.35, + "grad_norm": 2.934026830395377, + "learning_rate": 7.608247136924631e-06, + "loss": 0.5071, + "step": 3008 + }, + { + "epoch": 0.35, + "grad_norm": 2.428847184622086, + "learning_rate": 7.606659300240069e-06, + "loss": 0.6156, + "step": 3009 + }, + { + "epoch": 0.35, + "grad_norm": 1.9415161648949937, + "learning_rate": 7.605071102481515e-06, + "loss": 0.5086, + "step": 3010 + }, + { + "epoch": 0.35, + "grad_norm": 1.7949844429470543, + "learning_rate": 7.603482543868961e-06, + "loss": 0.4972, + "step": 3011 + }, + { + "epoch": 0.35, + "grad_norm": 2.4293649844133642, + "learning_rate": 7.601893624622457e-06, + "loss": 0.58, + "step": 3012 + }, + { + "epoch": 0.35, + "grad_norm": 1.5984728768570198, + "learning_rate": 7.6003043449620985e-06, + "loss": 0.5248, + "step": 3013 + }, + { + "epoch": 0.35, + "grad_norm": 2.4567970827934973, + "learning_rate": 7.598714705108032e-06, + "loss": 0.4393, + "step": 3014 + }, + { + "epoch": 0.35, + "grad_norm": 2.4661073780936915, + "learning_rate": 7.597124705280457e-06, + "loss": 0.4549, + "step": 3015 + }, + { + "epoch": 0.35, + "grad_norm": 2.1431633035870603, + "learning_rate": 7.595534345699614e-06, + "loss": 0.5268, + "step": 3016 + }, + { + "epoch": 0.35, + "grad_norm": 1.9115151673070327, + "learning_rate": 7.5939436265858045e-06, + "loss": 0.4838, + "step": 3017 + }, + { + "epoch": 0.35, + "grad_norm": 17.118261576765867, + "learning_rate": 7.592352548159374e-06, + "loss": 0.4676, + "step": 3018 + }, + { + "epoch": 0.35, + "grad_norm": 2.8395927292192455, + "learning_rate": 7.590761110640718e-06, + "loss": 0.5961, + "step": 3019 + }, + { + "epoch": 0.35, + "grad_norm": 1.7868876702102325, + "learning_rate": 7.58916931425028e-06, + "loss": 0.5947, + "step": 3020 + }, + { + "epoch": 0.35, + "grad_norm": 1.9374642558959907, + "learning_rate": 7.587577159208558e-06, + "loss": 0.4821, + "step": 3021 + }, + { + "epoch": 0.35, + "grad_norm": 1.9117629781844931, + "learning_rate": 7.585984645736095e-06, + "loss": 0.4922, + "step": 3022 + }, + { + "epoch": 0.35, + "grad_norm": 1.8877449361570138, + "learning_rate": 7.584391774053488e-06, + "loss": 0.554, + "step": 3023 + }, + { + "epoch": 0.35, + "grad_norm": 1.8349479844942105, + "learning_rate": 7.5827985443813786e-06, + "loss": 0.4578, + "step": 3024 + }, + { + "epoch": 0.35, + "grad_norm": 1.816384865181815, + "learning_rate": 7.581204956940463e-06, + "loss": 0.4929, + "step": 3025 + }, + { + "epoch": 0.35, + "grad_norm": 1.726748230334919, + "learning_rate": 7.5796110119514844e-06, + "loss": 0.4977, + "step": 3026 + }, + { + "epoch": 0.35, + "grad_norm": 2.517570093908069, + "learning_rate": 7.578016709635236e-06, + "loss": 0.465, + "step": 3027 + }, + { + "epoch": 0.35, + "grad_norm": 2.013953462256144, + "learning_rate": 7.576422050212557e-06, + "loss": 0.4812, + "step": 3028 + }, + { + "epoch": 0.35, + "grad_norm": 2.2966066388658475, + "learning_rate": 7.574827033904344e-06, + "loss": 0.5277, + "step": 3029 + }, + { + "epoch": 0.35, + "grad_norm": 1.8432401384048425, + "learning_rate": 7.5732316609315345e-06, + "loss": 0.5696, + "step": 3030 + }, + { + "epoch": 0.35, + "grad_norm": 2.4051250639458654, + "learning_rate": 7.571635931515122e-06, + "loss": 0.5518, + "step": 3031 + }, + { + "epoch": 0.35, + "grad_norm": 2.6452145479989446, + "learning_rate": 7.570039845876146e-06, + "loss": 0.4789, + "step": 3032 + }, + { + "epoch": 0.35, + "grad_norm": 2.5612735395296857, + "learning_rate": 7.568443404235694e-06, + "loss": 0.5883, + "step": 3033 + }, + { + "epoch": 0.35, + "grad_norm": 1.828912842747167, + "learning_rate": 7.566846606814909e-06, + "loss": 0.5538, + "step": 3034 + }, + { + "epoch": 0.35, + "grad_norm": 1.7412551767514126, + "learning_rate": 7.565249453834976e-06, + "loss": 0.5182, + "step": 3035 + }, + { + "epoch": 0.35, + "grad_norm": 2.080776631449496, + "learning_rate": 7.563651945517132e-06, + "loss": 0.4987, + "step": 3036 + }, + { + "epoch": 0.35, + "grad_norm": 2.6185454076096106, + "learning_rate": 7.562054082082666e-06, + "loss": 0.5159, + "step": 3037 + }, + { + "epoch": 0.35, + "grad_norm": 2.2219963911960017, + "learning_rate": 7.560455863752911e-06, + "loss": 0.5868, + "step": 3038 + }, + { + "epoch": 0.35, + "grad_norm": 1.013537456890812, + "learning_rate": 7.5588572907492545e-06, + "loss": 0.7644, + "step": 3039 + }, + { + "epoch": 0.35, + "grad_norm": 2.4314280730839473, + "learning_rate": 7.557258363293127e-06, + "loss": 0.6849, + "step": 3040 + }, + { + "epoch": 0.35, + "grad_norm": 2.8380739790951384, + "learning_rate": 7.555659081606016e-06, + "loss": 0.4928, + "step": 3041 + }, + { + "epoch": 0.35, + "grad_norm": 1.8403860002601804, + "learning_rate": 7.55405944590945e-06, + "loss": 0.4819, + "step": 3042 + }, + { + "epoch": 0.35, + "grad_norm": 2.0560123603629687, + "learning_rate": 7.5524594564250144e-06, + "loss": 0.5549, + "step": 3043 + }, + { + "epoch": 0.35, + "grad_norm": 1.8737898680550322, + "learning_rate": 7.550859113374336e-06, + "loss": 0.5082, + "step": 3044 + }, + { + "epoch": 0.35, + "grad_norm": 1.6362494072049818, + "learning_rate": 7.549258416979094e-06, + "loss": 0.4231, + "step": 3045 + }, + { + "epoch": 0.35, + "grad_norm": 2.067393085383789, + "learning_rate": 7.547657367461019e-06, + "loss": 0.512, + "step": 3046 + }, + { + "epoch": 0.35, + "grad_norm": 7.0198623639358795, + "learning_rate": 7.546055965041885e-06, + "loss": 0.6001, + "step": 3047 + }, + { + "epoch": 0.35, + "grad_norm": 3.0966414988714015, + "learning_rate": 7.54445420994352e-06, + "loss": 0.6089, + "step": 3048 + }, + { + "epoch": 0.35, + "grad_norm": 3.2089263817668106, + "learning_rate": 7.542852102387799e-06, + "loss": 0.5655, + "step": 3049 + }, + { + "epoch": 0.35, + "grad_norm": 1.9415941691021346, + "learning_rate": 7.541249642596645e-06, + "loss": 0.5378, + "step": 3050 + }, + { + "epoch": 0.35, + "grad_norm": 2.5719762847019463, + "learning_rate": 7.539646830792031e-06, + "loss": 0.4866, + "step": 3051 + }, + { + "epoch": 0.35, + "grad_norm": 2.320773468786632, + "learning_rate": 7.538043667195979e-06, + "loss": 0.4347, + "step": 3052 + }, + { + "epoch": 0.35, + "grad_norm": 2.6828875222191377, + "learning_rate": 7.536440152030556e-06, + "loss": 0.4795, + "step": 3053 + }, + { + "epoch": 0.35, + "grad_norm": 1.862764114394205, + "learning_rate": 7.534836285517883e-06, + "loss": 0.4713, + "step": 3054 + }, + { + "epoch": 0.35, + "grad_norm": 1.8646135813375817, + "learning_rate": 7.533232067880127e-06, + "loss": 0.4715, + "step": 3055 + }, + { + "epoch": 0.35, + "grad_norm": 1.7378958280272816, + "learning_rate": 7.531627499339504e-06, + "loss": 0.501, + "step": 3056 + }, + { + "epoch": 0.35, + "grad_norm": 0.7869871849195075, + "learning_rate": 7.530022580118278e-06, + "loss": 0.6667, + "step": 3057 + }, + { + "epoch": 0.35, + "grad_norm": 1.900233915189838, + "learning_rate": 7.528417310438762e-06, + "loss": 0.5488, + "step": 3058 + }, + { + "epoch": 0.35, + "grad_norm": 2.1167739385272806, + "learning_rate": 7.526811690523319e-06, + "loss": 0.4699, + "step": 3059 + }, + { + "epoch": 0.35, + "grad_norm": 2.0897236073300878, + "learning_rate": 7.525205720594357e-06, + "loss": 0.5616, + "step": 3060 + }, + { + "epoch": 0.35, + "grad_norm": 1.7832049851391216, + "learning_rate": 7.5235994008743385e-06, + "loss": 0.5226, + "step": 3061 + }, + { + "epoch": 0.35, + "grad_norm": 2.31305898735799, + "learning_rate": 7.521992731585766e-06, + "loss": 0.5587, + "step": 3062 + }, + { + "epoch": 0.35, + "grad_norm": 1.6816633306633766, + "learning_rate": 7.520385712951197e-06, + "loss": 0.5729, + "step": 3063 + }, + { + "epoch": 0.35, + "grad_norm": 2.2866652276576844, + "learning_rate": 7.518778345193236e-06, + "loss": 0.5684, + "step": 3064 + }, + { + "epoch": 0.35, + "grad_norm": 2.344125482088026, + "learning_rate": 7.517170628534536e-06, + "loss": 0.5798, + "step": 3065 + }, + { + "epoch": 0.35, + "grad_norm": 1.9487499593237727, + "learning_rate": 7.515562563197794e-06, + "loss": 0.5593, + "step": 3066 + }, + { + "epoch": 0.35, + "grad_norm": 2.325991750956368, + "learning_rate": 7.5139541494057635e-06, + "loss": 0.4926, + "step": 3067 + }, + { + "epoch": 0.35, + "grad_norm": 2.8550113577945218, + "learning_rate": 7.512345387381239e-06, + "loss": 0.4587, + "step": 3068 + }, + { + "epoch": 0.35, + "grad_norm": 2.055398430362565, + "learning_rate": 7.510736277347067e-06, + "loss": 0.6278, + "step": 3069 + }, + { + "epoch": 0.35, + "grad_norm": 2.3835524235896774, + "learning_rate": 7.50912681952614e-06, + "loss": 0.4817, + "step": 3070 + }, + { + "epoch": 0.35, + "grad_norm": 2.0788569424648284, + "learning_rate": 7.507517014141401e-06, + "loss": 0.5252, + "step": 3071 + }, + { + "epoch": 0.35, + "grad_norm": 2.323438556864119, + "learning_rate": 7.50590686141584e-06, + "loss": 0.5603, + "step": 3072 + }, + { + "epoch": 0.35, + "grad_norm": 1.9882217353541358, + "learning_rate": 7.504296361572494e-06, + "loss": 0.4634, + "step": 3073 + }, + { + "epoch": 0.35, + "grad_norm": 2.1398379051274556, + "learning_rate": 7.502685514834449e-06, + "loss": 0.4196, + "step": 3074 + }, + { + "epoch": 0.35, + "grad_norm": 2.6060615484709055, + "learning_rate": 7.501074321424842e-06, + "loss": 0.5019, + "step": 3075 + }, + { + "epoch": 0.35, + "grad_norm": 1.6815272515601314, + "learning_rate": 7.499462781566851e-06, + "loss": 0.5502, + "step": 3076 + }, + { + "epoch": 0.35, + "grad_norm": 1.544292161334717, + "learning_rate": 7.49785089548371e-06, + "loss": 0.4574, + "step": 3077 + }, + { + "epoch": 0.35, + "grad_norm": 1.7462186380631068, + "learning_rate": 7.496238663398695e-06, + "loss": 0.4485, + "step": 3078 + }, + { + "epoch": 0.35, + "grad_norm": 2.737583939454655, + "learning_rate": 7.494626085535132e-06, + "loss": 0.5146, + "step": 3079 + }, + { + "epoch": 0.35, + "grad_norm": 3.790394097681309, + "learning_rate": 7.493013162116398e-06, + "loss": 0.4545, + "step": 3080 + }, + { + "epoch": 0.35, + "grad_norm": 1.4580651655804155, + "learning_rate": 7.49139989336591e-06, + "loss": 0.6078, + "step": 3081 + }, + { + "epoch": 0.35, + "grad_norm": 1.7449025654937735, + "learning_rate": 7.489786279507142e-06, + "loss": 0.5181, + "step": 3082 + }, + { + "epoch": 0.35, + "grad_norm": 2.442941055781812, + "learning_rate": 7.48817232076361e-06, + "loss": 0.4203, + "step": 3083 + }, + { + "epoch": 0.35, + "grad_norm": 3.1978016641583755, + "learning_rate": 7.486558017358877e-06, + "loss": 0.5042, + "step": 3084 + }, + { + "epoch": 0.35, + "grad_norm": 1.817690979932559, + "learning_rate": 7.484943369516558e-06, + "loss": 0.5954, + "step": 3085 + }, + { + "epoch": 0.35, + "grad_norm": 2.442067351059011, + "learning_rate": 7.4833283774603174e-06, + "loss": 0.4733, + "step": 3086 + }, + { + "epoch": 0.35, + "grad_norm": 2.6067478074928188, + "learning_rate": 7.481713041413857e-06, + "loss": 0.6012, + "step": 3087 + }, + { + "epoch": 0.35, + "grad_norm": 1.8989158828469794, + "learning_rate": 7.480097361600937e-06, + "loss": 0.5297, + "step": 3088 + }, + { + "epoch": 0.35, + "grad_norm": 1.9253868412195685, + "learning_rate": 7.4784813382453595e-06, + "loss": 0.4931, + "step": 3089 + }, + { + "epoch": 0.36, + "grad_norm": 1.8934673982622288, + "learning_rate": 7.476864971570976e-06, + "loss": 0.5084, + "step": 3090 + }, + { + "epoch": 0.36, + "grad_norm": 2.9215448411472478, + "learning_rate": 7.475248261801687e-06, + "loss": 0.5307, + "step": 3091 + }, + { + "epoch": 0.36, + "grad_norm": 2.8266729622017928, + "learning_rate": 7.473631209161436e-06, + "loss": 0.4431, + "step": 3092 + }, + { + "epoch": 0.36, + "grad_norm": 1.748961198995004, + "learning_rate": 7.472013813874219e-06, + "loss": 0.5396, + "step": 3093 + }, + { + "epoch": 0.36, + "grad_norm": 8.12753897676637, + "learning_rate": 7.470396076164078e-06, + "loss": 0.544, + "step": 3094 + }, + { + "epoch": 0.36, + "grad_norm": 1.9251274020674929, + "learning_rate": 7.468777996255099e-06, + "loss": 0.4158, + "step": 3095 + }, + { + "epoch": 0.36, + "grad_norm": 2.3470826798567734, + "learning_rate": 7.46715957437142e-06, + "loss": 0.4367, + "step": 3096 + }, + { + "epoch": 0.36, + "grad_norm": 3.4768803671420305, + "learning_rate": 7.465540810737224e-06, + "loss": 0.4344, + "step": 3097 + }, + { + "epoch": 0.36, + "grad_norm": 1.6092499994121563, + "learning_rate": 7.463921705576741e-06, + "loss": 0.4636, + "step": 3098 + }, + { + "epoch": 0.36, + "grad_norm": 1.5467406565561836, + "learning_rate": 7.4623022591142516e-06, + "loss": 0.5106, + "step": 3099 + }, + { + "epoch": 0.36, + "grad_norm": 1.874925429396424, + "learning_rate": 7.460682471574079e-06, + "loss": 0.5718, + "step": 3100 + }, + { + "epoch": 0.36, + "grad_norm": 2.4800778518046664, + "learning_rate": 7.4590623431805965e-06, + "loss": 0.4719, + "step": 3101 + }, + { + "epoch": 0.36, + "grad_norm": 2.1852046980973134, + "learning_rate": 7.457441874158224e-06, + "loss": 0.489, + "step": 3102 + }, + { + "epoch": 0.36, + "grad_norm": 2.1271942521140907, + "learning_rate": 7.45582106473143e-06, + "loss": 0.5666, + "step": 3103 + }, + { + "epoch": 0.36, + "grad_norm": 2.3588116038229594, + "learning_rate": 7.454199915124727e-06, + "loss": 0.5412, + "step": 3104 + }, + { + "epoch": 0.36, + "grad_norm": 1.7927395601518523, + "learning_rate": 7.452578425562677e-06, + "loss": 0.5562, + "step": 3105 + }, + { + "epoch": 0.36, + "grad_norm": 2.006609263574854, + "learning_rate": 7.450956596269887e-06, + "loss": 0.4832, + "step": 3106 + }, + { + "epoch": 0.36, + "grad_norm": 1.9512060861775211, + "learning_rate": 7.449334427471014e-06, + "loss": 0.5648, + "step": 3107 + }, + { + "epoch": 0.36, + "grad_norm": 2.2965786497543523, + "learning_rate": 7.447711919390761e-06, + "loss": 0.4988, + "step": 3108 + }, + { + "epoch": 0.36, + "grad_norm": 2.111194626432888, + "learning_rate": 7.446089072253877e-06, + "loss": 0.5307, + "step": 3109 + }, + { + "epoch": 0.36, + "grad_norm": 1.9068368380845009, + "learning_rate": 7.444465886285157e-06, + "loss": 0.5114, + "step": 3110 + }, + { + "epoch": 0.36, + "grad_norm": 5.582403869676365, + "learning_rate": 7.4428423617094485e-06, + "loss": 0.5016, + "step": 3111 + }, + { + "epoch": 0.36, + "grad_norm": 5.389292042633605, + "learning_rate": 7.441218498751637e-06, + "loss": 0.5436, + "step": 3112 + }, + { + "epoch": 0.36, + "grad_norm": 2.1219242483238445, + "learning_rate": 7.439594297636663e-06, + "loss": 0.6301, + "step": 3113 + }, + { + "epoch": 0.36, + "grad_norm": 2.267207976764517, + "learning_rate": 7.437969758589508e-06, + "loss": 0.4732, + "step": 3114 + }, + { + "epoch": 0.36, + "grad_norm": 4.687644657326908, + "learning_rate": 7.436344881835205e-06, + "loss": 0.5308, + "step": 3115 + }, + { + "epoch": 0.36, + "grad_norm": 1.809851846195055, + "learning_rate": 7.434719667598831e-06, + "loss": 0.5155, + "step": 3116 + }, + { + "epoch": 0.36, + "grad_norm": 2.951326683921181, + "learning_rate": 7.43309411610551e-06, + "loss": 0.5165, + "step": 3117 + }, + { + "epoch": 0.36, + "grad_norm": 1.9739660998919277, + "learning_rate": 7.431468227580415e-06, + "loss": 0.6536, + "step": 3118 + }, + { + "epoch": 0.36, + "grad_norm": 2.6708568918918405, + "learning_rate": 7.42984200224876e-06, + "loss": 0.5641, + "step": 3119 + }, + { + "epoch": 0.36, + "grad_norm": 1.926609162126331, + "learning_rate": 7.428215440335814e-06, + "loss": 0.5283, + "step": 3120 + }, + { + "epoch": 0.36, + "grad_norm": 2.5048361681299824, + "learning_rate": 7.426588542066885e-06, + "loss": 0.4949, + "step": 3121 + }, + { + "epoch": 0.36, + "grad_norm": 2.816417055949335, + "learning_rate": 7.424961307667331e-06, + "loss": 0.6014, + "step": 3122 + }, + { + "epoch": 0.36, + "grad_norm": 2.431129522000982, + "learning_rate": 7.423333737362558e-06, + "loss": 0.5607, + "step": 3123 + }, + { + "epoch": 0.36, + "grad_norm": 1.9607745394510776, + "learning_rate": 7.421705831378014e-06, + "loss": 0.4455, + "step": 3124 + }, + { + "epoch": 0.36, + "grad_norm": 1.971731739656572, + "learning_rate": 7.420077589939199e-06, + "loss": 0.4937, + "step": 3125 + }, + { + "epoch": 0.36, + "grad_norm": 2.0899099082163612, + "learning_rate": 7.4184490132716534e-06, + "loss": 0.4584, + "step": 3126 + }, + { + "epoch": 0.36, + "grad_norm": 1.9735869363107756, + "learning_rate": 7.41682010160097e-06, + "loss": 0.5652, + "step": 3127 + }, + { + "epoch": 0.36, + "grad_norm": 1.6375971702833931, + "learning_rate": 7.415190855152786e-06, + "loss": 0.5475, + "step": 3128 + }, + { + "epoch": 0.36, + "grad_norm": 2.45023314669999, + "learning_rate": 7.413561274152783e-06, + "loss": 0.4809, + "step": 3129 + }, + { + "epoch": 0.36, + "grad_norm": 2.056422857625645, + "learning_rate": 7.411931358826689e-06, + "loss": 0.5171, + "step": 3130 + }, + { + "epoch": 0.36, + "grad_norm": 1.5054868958806142, + "learning_rate": 7.410301109400281e-06, + "loss": 0.4808, + "step": 3131 + }, + { + "epoch": 0.36, + "grad_norm": 3.3828056300272458, + "learning_rate": 7.4086705260993814e-06, + "loss": 0.519, + "step": 3132 + }, + { + "epoch": 0.36, + "grad_norm": 1.7108145895349829, + "learning_rate": 7.4070396091498575e-06, + "loss": 0.5167, + "step": 3133 + }, + { + "epoch": 0.36, + "grad_norm": 2.1811405072517664, + "learning_rate": 7.405408358777624e-06, + "loss": 0.4639, + "step": 3134 + }, + { + "epoch": 0.36, + "grad_norm": 1.9485573921471266, + "learning_rate": 7.403776775208641e-06, + "loss": 0.5804, + "step": 3135 + }, + { + "epoch": 0.36, + "grad_norm": 2.118709744754571, + "learning_rate": 7.402144858668915e-06, + "loss": 0.5453, + "step": 3136 + }, + { + "epoch": 0.36, + "grad_norm": 0.9540590906381115, + "learning_rate": 7.4005126093845005e-06, + "loss": 0.7287, + "step": 3137 + }, + { + "epoch": 0.36, + "grad_norm": 1.8759706432653271, + "learning_rate": 7.398880027581494e-06, + "loss": 0.606, + "step": 3138 + }, + { + "epoch": 0.36, + "grad_norm": 0.9061848246277939, + "learning_rate": 7.397247113486044e-06, + "loss": 0.738, + "step": 3139 + }, + { + "epoch": 0.36, + "grad_norm": 1.8814007133360884, + "learning_rate": 7.395613867324336e-06, + "loss": 0.4775, + "step": 3140 + }, + { + "epoch": 0.36, + "grad_norm": 1.914763021175095, + "learning_rate": 7.393980289322611e-06, + "loss": 0.5371, + "step": 3141 + }, + { + "epoch": 0.36, + "grad_norm": 1.8351047473956608, + "learning_rate": 7.3923463797071515e-06, + "loss": 0.4919, + "step": 3142 + }, + { + "epoch": 0.36, + "grad_norm": 2.610456681635869, + "learning_rate": 7.390712138704286e-06, + "loss": 0.6067, + "step": 3143 + }, + { + "epoch": 0.36, + "grad_norm": 1.8276151344518674, + "learning_rate": 7.389077566540388e-06, + "loss": 0.4659, + "step": 3144 + }, + { + "epoch": 0.36, + "grad_norm": 2.1450913710434842, + "learning_rate": 7.38744266344188e-06, + "loss": 0.5543, + "step": 3145 + }, + { + "epoch": 0.36, + "grad_norm": 2.3989405299325575, + "learning_rate": 7.385807429635229e-06, + "loss": 0.4398, + "step": 3146 + }, + { + "epoch": 0.36, + "grad_norm": 1.823231640939177, + "learning_rate": 7.384171865346944e-06, + "loss": 0.5893, + "step": 3147 + }, + { + "epoch": 0.36, + "grad_norm": 2.4459555289103547, + "learning_rate": 7.382535970803586e-06, + "loss": 0.5044, + "step": 3148 + }, + { + "epoch": 0.36, + "grad_norm": 2.099212623753445, + "learning_rate": 7.380899746231758e-06, + "loss": 0.525, + "step": 3149 + }, + { + "epoch": 0.36, + "grad_norm": 1.6789875267504135, + "learning_rate": 7.379263191858109e-06, + "loss": 0.5282, + "step": 3150 + }, + { + "epoch": 0.36, + "grad_norm": 1.7343144179524803, + "learning_rate": 7.377626307909336e-06, + "loss": 0.5301, + "step": 3151 + }, + { + "epoch": 0.36, + "grad_norm": 1.8952217035453864, + "learning_rate": 7.375989094612177e-06, + "loss": 0.4757, + "step": 3152 + }, + { + "epoch": 0.36, + "grad_norm": 11.161547708729708, + "learning_rate": 7.374351552193421e-06, + "loss": 0.4779, + "step": 3153 + }, + { + "epoch": 0.36, + "grad_norm": 2.243781677545647, + "learning_rate": 7.372713680879901e-06, + "loss": 0.5456, + "step": 3154 + }, + { + "epoch": 0.36, + "grad_norm": 1.9182894616810096, + "learning_rate": 7.371075480898491e-06, + "loss": 0.5009, + "step": 3155 + }, + { + "epoch": 0.36, + "grad_norm": 1.7326257731793997, + "learning_rate": 7.369436952476116e-06, + "loss": 0.6112, + "step": 3156 + }, + { + "epoch": 0.36, + "grad_norm": 2.031224117428107, + "learning_rate": 7.367798095839745e-06, + "loss": 0.4417, + "step": 3157 + }, + { + "epoch": 0.36, + "grad_norm": 2.726987330858252, + "learning_rate": 7.366158911216391e-06, + "loss": 0.5807, + "step": 3158 + }, + { + "epoch": 0.36, + "grad_norm": 1.9804120454591008, + "learning_rate": 7.364519398833115e-06, + "loss": 0.4744, + "step": 3159 + }, + { + "epoch": 0.36, + "grad_norm": 1.4187826328488, + "learning_rate": 7.3628795589170224e-06, + "loss": 0.4684, + "step": 3160 + }, + { + "epoch": 0.36, + "grad_norm": 1.6058890337316674, + "learning_rate": 7.361239391695261e-06, + "loss": 0.5961, + "step": 3161 + }, + { + "epoch": 0.36, + "grad_norm": 1.9740778067132343, + "learning_rate": 7.35959889739503e-06, + "loss": 0.586, + "step": 3162 + }, + { + "epoch": 0.36, + "grad_norm": 1.9628608964201661, + "learning_rate": 7.357958076243567e-06, + "loss": 0.4806, + "step": 3163 + }, + { + "epoch": 0.36, + "grad_norm": 1.0106988005801811, + "learning_rate": 7.35631692846816e-06, + "loss": 0.7417, + "step": 3164 + }, + { + "epoch": 0.36, + "grad_norm": 2.126958672674491, + "learning_rate": 7.354675454296141e-06, + "loss": 0.5103, + "step": 3165 + }, + { + "epoch": 0.36, + "grad_norm": 1.7176010213065342, + "learning_rate": 7.353033653954885e-06, + "loss": 0.5621, + "step": 3166 + }, + { + "epoch": 0.36, + "grad_norm": 5.695755804760129, + "learning_rate": 7.351391527671815e-06, + "loss": 0.493, + "step": 3167 + }, + { + "epoch": 0.36, + "grad_norm": 1.8645049880854505, + "learning_rate": 7.349749075674399e-06, + "loss": 0.5014, + "step": 3168 + }, + { + "epoch": 0.36, + "grad_norm": 1.718375430885016, + "learning_rate": 7.348106298190145e-06, + "loss": 0.4857, + "step": 3169 + }, + { + "epoch": 0.36, + "grad_norm": 2.038685116778116, + "learning_rate": 7.346463195446617e-06, + "loss": 0.5542, + "step": 3170 + }, + { + "epoch": 0.36, + "grad_norm": 1.7030806662947264, + "learning_rate": 7.3448197676714115e-06, + "loss": 0.5403, + "step": 3171 + }, + { + "epoch": 0.36, + "grad_norm": 1.6220301492539222, + "learning_rate": 7.343176015092177e-06, + "loss": 0.5198, + "step": 3172 + }, + { + "epoch": 0.36, + "grad_norm": 2.9492196877989625, + "learning_rate": 7.341531937936608e-06, + "loss": 0.4857, + "step": 3173 + }, + { + "epoch": 0.36, + "grad_norm": 2.0090274678653395, + "learning_rate": 7.33988753643244e-06, + "loss": 0.6522, + "step": 3174 + }, + { + "epoch": 0.36, + "grad_norm": 2.9027644548490903, + "learning_rate": 7.3382428108074566e-06, + "loss": 0.4974, + "step": 3175 + }, + { + "epoch": 0.36, + "grad_norm": 3.5710390636701534, + "learning_rate": 7.336597761289484e-06, + "loss": 0.5544, + "step": 3176 + }, + { + "epoch": 0.37, + "grad_norm": 2.5275365158756697, + "learning_rate": 7.334952388106393e-06, + "loss": 0.5084, + "step": 3177 + }, + { + "epoch": 0.37, + "grad_norm": 2.0494201519833197, + "learning_rate": 7.3333066914861024e-06, + "loss": 0.4988, + "step": 3178 + }, + { + "epoch": 0.37, + "grad_norm": 2.3934631584707122, + "learning_rate": 7.331660671656574e-06, + "loss": 0.6034, + "step": 3179 + }, + { + "epoch": 0.37, + "grad_norm": 1.723782594664949, + "learning_rate": 7.330014328845813e-06, + "loss": 0.524, + "step": 3180 + }, + { + "epoch": 0.37, + "grad_norm": 3.8114999798888345, + "learning_rate": 7.328367663281869e-06, + "loss": 0.591, + "step": 3181 + }, + { + "epoch": 0.37, + "grad_norm": 1.8561883525509666, + "learning_rate": 7.326720675192841e-06, + "loss": 0.4606, + "step": 3182 + }, + { + "epoch": 0.37, + "grad_norm": 0.94805703412477, + "learning_rate": 7.325073364806867e-06, + "loss": 0.6989, + "step": 3183 + }, + { + "epoch": 0.37, + "grad_norm": 2.973679307183509, + "learning_rate": 7.323425732352134e-06, + "loss": 0.5307, + "step": 3184 + }, + { + "epoch": 0.37, + "grad_norm": 1.775740756994591, + "learning_rate": 7.321777778056871e-06, + "loss": 0.531, + "step": 3185 + }, + { + "epoch": 0.37, + "grad_norm": 2.0502821528170987, + "learning_rate": 7.320129502149353e-06, + "loss": 0.5389, + "step": 3186 + }, + { + "epoch": 0.37, + "grad_norm": 2.2253858743489636, + "learning_rate": 7.318480904857897e-06, + "loss": 0.5612, + "step": 3187 + }, + { + "epoch": 0.37, + "grad_norm": 2.621153424068711, + "learning_rate": 7.316831986410868e-06, + "loss": 0.5588, + "step": 3188 + }, + { + "epoch": 0.37, + "grad_norm": 1.9833168580716858, + "learning_rate": 7.315182747036674e-06, + "loss": 0.4838, + "step": 3189 + }, + { + "epoch": 0.37, + "grad_norm": 0.8524564170062607, + "learning_rate": 7.313533186963767e-06, + "loss": 0.7002, + "step": 3190 + }, + { + "epoch": 0.37, + "grad_norm": 3.03107634064579, + "learning_rate": 7.311883306420643e-06, + "loss": 0.5449, + "step": 3191 + }, + { + "epoch": 0.37, + "grad_norm": 1.8705110042887458, + "learning_rate": 7.310233105635843e-06, + "loss": 0.4446, + "step": 3192 + }, + { + "epoch": 0.37, + "grad_norm": 2.123840300323815, + "learning_rate": 7.308582584837955e-06, + "loss": 0.4796, + "step": 3193 + }, + { + "epoch": 0.37, + "grad_norm": 1.9752455487408522, + "learning_rate": 7.306931744255607e-06, + "loss": 0.5302, + "step": 3194 + }, + { + "epoch": 0.37, + "grad_norm": 1.7101197308004368, + "learning_rate": 7.305280584117471e-06, + "loss": 0.5844, + "step": 3195 + }, + { + "epoch": 0.37, + "grad_norm": 0.8749406624922914, + "learning_rate": 7.303629104652271e-06, + "loss": 0.7354, + "step": 3196 + }, + { + "epoch": 0.37, + "grad_norm": 2.0811736418184013, + "learning_rate": 7.301977306088766e-06, + "loss": 0.5283, + "step": 3197 + }, + { + "epoch": 0.37, + "grad_norm": 1.6971218815444455, + "learning_rate": 7.300325188655762e-06, + "loss": 0.4601, + "step": 3198 + }, + { + "epoch": 0.37, + "grad_norm": 1.852497922082125, + "learning_rate": 7.298672752582111e-06, + "loss": 0.5307, + "step": 3199 + }, + { + "epoch": 0.37, + "grad_norm": 3.550515058265413, + "learning_rate": 7.29701999809671e-06, + "loss": 0.4976, + "step": 3200 + }, + { + "epoch": 0.37, + "grad_norm": 2.0126859120858676, + "learning_rate": 7.295366925428494e-06, + "loss": 0.5404, + "step": 3201 + }, + { + "epoch": 0.37, + "grad_norm": 1.841462450703764, + "learning_rate": 7.293713534806451e-06, + "loss": 0.5051, + "step": 3202 + }, + { + "epoch": 0.37, + "grad_norm": 1.8862402059201049, + "learning_rate": 7.292059826459607e-06, + "loss": 0.5322, + "step": 3203 + }, + { + "epoch": 0.37, + "grad_norm": 2.6034025192903605, + "learning_rate": 7.290405800617033e-06, + "loss": 0.4708, + "step": 3204 + }, + { + "epoch": 0.37, + "grad_norm": 2.238645064785399, + "learning_rate": 7.288751457507844e-06, + "loss": 0.4763, + "step": 3205 + }, + { + "epoch": 0.37, + "grad_norm": 2.049591709278908, + "learning_rate": 7.287096797361197e-06, + "loss": 0.5366, + "step": 3206 + }, + { + "epoch": 0.37, + "grad_norm": 3.532309091186629, + "learning_rate": 7.285441820406301e-06, + "loss": 0.5132, + "step": 3207 + }, + { + "epoch": 0.37, + "grad_norm": 1.863228770837838, + "learning_rate": 7.283786526872398e-06, + "loss": 0.5385, + "step": 3208 + }, + { + "epoch": 0.37, + "grad_norm": 2.868284190461831, + "learning_rate": 7.282130916988781e-06, + "loss": 0.5288, + "step": 3209 + }, + { + "epoch": 0.37, + "grad_norm": 1.6412857947209902, + "learning_rate": 7.280474990984785e-06, + "loss": 0.4763, + "step": 3210 + }, + { + "epoch": 0.37, + "grad_norm": 0.9197115324563732, + "learning_rate": 7.278818749089789e-06, + "loss": 0.7684, + "step": 3211 + }, + { + "epoch": 0.37, + "grad_norm": 2.3460030317762084, + "learning_rate": 7.277162191533213e-06, + "loss": 0.5638, + "step": 3212 + }, + { + "epoch": 0.37, + "grad_norm": 2.6850650615946843, + "learning_rate": 7.275505318544527e-06, + "loss": 0.6287, + "step": 3213 + }, + { + "epoch": 0.37, + "grad_norm": 2.1930545690492353, + "learning_rate": 7.273848130353237e-06, + "loss": 0.4971, + "step": 3214 + }, + { + "epoch": 0.37, + "grad_norm": 3.3505132156274433, + "learning_rate": 7.272190627188897e-06, + "loss": 0.4765, + "step": 3215 + }, + { + "epoch": 0.37, + "grad_norm": 2.2824895546465958, + "learning_rate": 7.270532809281106e-06, + "loss": 0.5819, + "step": 3216 + }, + { + "epoch": 0.37, + "grad_norm": 4.0073510658478195, + "learning_rate": 7.268874676859503e-06, + "loss": 0.4771, + "step": 3217 + }, + { + "epoch": 0.37, + "grad_norm": 1.9652088793024016, + "learning_rate": 7.267216230153773e-06, + "loss": 0.4579, + "step": 3218 + }, + { + "epoch": 0.37, + "grad_norm": 2.0067956395934017, + "learning_rate": 7.265557469393643e-06, + "loss": 0.5462, + "step": 3219 + }, + { + "epoch": 0.37, + "grad_norm": 2.234519215570638, + "learning_rate": 7.263898394808885e-06, + "loss": 0.479, + "step": 3220 + }, + { + "epoch": 0.37, + "grad_norm": 1.8937308113026818, + "learning_rate": 7.262239006629315e-06, + "loss": 0.4489, + "step": 3221 + }, + { + "epoch": 0.37, + "grad_norm": 1.7872955854870838, + "learning_rate": 7.260579305084787e-06, + "loss": 0.4731, + "step": 3222 + }, + { + "epoch": 0.37, + "grad_norm": 2.4114170190253597, + "learning_rate": 7.2589192904052065e-06, + "loss": 0.5488, + "step": 3223 + }, + { + "epoch": 0.37, + "grad_norm": 1.8632435357532933, + "learning_rate": 7.257258962820517e-06, + "loss": 0.4936, + "step": 3224 + }, + { + "epoch": 0.37, + "grad_norm": 1.5889494245152085, + "learning_rate": 7.255598322560707e-06, + "loss": 0.4966, + "step": 3225 + }, + { + "epoch": 0.37, + "grad_norm": 2.4704757724789483, + "learning_rate": 7.253937369855808e-06, + "loss": 0.4387, + "step": 3226 + }, + { + "epoch": 0.37, + "grad_norm": 2.7549965397048326, + "learning_rate": 7.252276104935896e-06, + "loss": 0.5216, + "step": 3227 + }, + { + "epoch": 0.37, + "grad_norm": 2.809916264275648, + "learning_rate": 7.250614528031087e-06, + "loss": 0.5805, + "step": 3228 + }, + { + "epoch": 0.37, + "grad_norm": 1.9489089900126306, + "learning_rate": 7.248952639371543e-06, + "loss": 0.5042, + "step": 3229 + }, + { + "epoch": 0.37, + "grad_norm": 1.8802118945933053, + "learning_rate": 7.247290439187471e-06, + "loss": 0.5728, + "step": 3230 + }, + { + "epoch": 0.37, + "grad_norm": 2.074895522856019, + "learning_rate": 7.245627927709114e-06, + "loss": 0.6152, + "step": 3231 + }, + { + "epoch": 0.37, + "grad_norm": 2.592295403445032, + "learning_rate": 7.243965105166766e-06, + "loss": 0.5559, + "step": 3232 + }, + { + "epoch": 0.37, + "grad_norm": 2.7190186318556324, + "learning_rate": 7.242301971790762e-06, + "loss": 0.5651, + "step": 3233 + }, + { + "epoch": 0.37, + "grad_norm": 2.085765370388405, + "learning_rate": 7.240638527811474e-06, + "loss": 0.5603, + "step": 3234 + }, + { + "epoch": 0.37, + "grad_norm": 1.7110710804156666, + "learning_rate": 7.238974773459328e-06, + "loss": 0.4211, + "step": 3235 + }, + { + "epoch": 0.37, + "grad_norm": 1.8137208200112913, + "learning_rate": 7.237310708964783e-06, + "loss": 0.4364, + "step": 3236 + }, + { + "epoch": 0.37, + "grad_norm": 2.023132542880251, + "learning_rate": 7.235646334558346e-06, + "loss": 0.4862, + "step": 3237 + }, + { + "epoch": 0.37, + "grad_norm": 1.6848600855234326, + "learning_rate": 7.233981650470567e-06, + "loss": 0.4482, + "step": 3238 + }, + { + "epoch": 0.37, + "grad_norm": 1.8868382311741896, + "learning_rate": 7.2323166569320366e-06, + "loss": 0.5264, + "step": 3239 + }, + { + "epoch": 0.37, + "grad_norm": 3.0148909892719913, + "learning_rate": 7.230651354173389e-06, + "loss": 0.5207, + "step": 3240 + }, + { + "epoch": 0.37, + "grad_norm": 1.885194804203804, + "learning_rate": 7.228985742425302e-06, + "loss": 0.5363, + "step": 3241 + }, + { + "epoch": 0.37, + "grad_norm": 1.8144530508181507, + "learning_rate": 7.227319821918496e-06, + "loss": 0.568, + "step": 3242 + }, + { + "epoch": 0.37, + "grad_norm": 2.149548295102153, + "learning_rate": 7.225653592883734e-06, + "loss": 0.5095, + "step": 3243 + }, + { + "epoch": 0.37, + "grad_norm": 2.1772612870531507, + "learning_rate": 7.2239870555518224e-06, + "loss": 0.5496, + "step": 3244 + }, + { + "epoch": 0.37, + "grad_norm": 1.8398368862289578, + "learning_rate": 7.222320210153608e-06, + "loss": 0.475, + "step": 3245 + }, + { + "epoch": 0.37, + "grad_norm": 2.1894193382149774, + "learning_rate": 7.2206530569199855e-06, + "loss": 0.4299, + "step": 3246 + }, + { + "epoch": 0.37, + "grad_norm": 1.9107026587726634, + "learning_rate": 7.218985596081884e-06, + "loss": 0.5145, + "step": 3247 + }, + { + "epoch": 0.37, + "grad_norm": 2.038233063128196, + "learning_rate": 7.217317827870283e-06, + "loss": 0.5319, + "step": 3248 + }, + { + "epoch": 0.37, + "grad_norm": 2.2651182052750625, + "learning_rate": 7.2156497525162e-06, + "loss": 0.4441, + "step": 3249 + }, + { + "epoch": 0.37, + "grad_norm": 2.1163274374191965, + "learning_rate": 7.213981370250698e-06, + "loss": 0.5158, + "step": 3250 + }, + { + "epoch": 0.37, + "grad_norm": 7.5688874956877426, + "learning_rate": 7.212312681304879e-06, + "loss": 0.4803, + "step": 3251 + }, + { + "epoch": 0.37, + "grad_norm": 1.8526500809353896, + "learning_rate": 7.2106436859098904e-06, + "loss": 0.522, + "step": 3252 + }, + { + "epoch": 0.37, + "grad_norm": 1.8788397472015697, + "learning_rate": 7.208974384296921e-06, + "loss": 0.5756, + "step": 3253 + }, + { + "epoch": 0.37, + "grad_norm": 2.9713596369811346, + "learning_rate": 7.207304776697204e-06, + "loss": 0.4271, + "step": 3254 + }, + { + "epoch": 0.37, + "grad_norm": 1.789984616457143, + "learning_rate": 7.205634863342011e-06, + "loss": 0.4678, + "step": 3255 + }, + { + "epoch": 0.37, + "grad_norm": 0.8325535741031778, + "learning_rate": 7.203964644462658e-06, + "loss": 0.7334, + "step": 3256 + }, + { + "epoch": 0.37, + "grad_norm": 2.062446456036115, + "learning_rate": 7.202294120290505e-06, + "loss": 0.4655, + "step": 3257 + }, + { + "epoch": 0.37, + "grad_norm": 1.8397795903457739, + "learning_rate": 7.200623291056952e-06, + "loss": 0.4547, + "step": 3258 + }, + { + "epoch": 0.37, + "grad_norm": 2.003938829803272, + "learning_rate": 7.198952156993441e-06, + "loss": 0.4871, + "step": 3259 + }, + { + "epoch": 0.37, + "grad_norm": 0.89124909279938, + "learning_rate": 7.197280718331459e-06, + "loss": 0.7692, + "step": 3260 + }, + { + "epoch": 0.37, + "grad_norm": 1.9020077513978273, + "learning_rate": 7.195608975302532e-06, + "loss": 0.4372, + "step": 3261 + }, + { + "epoch": 0.37, + "grad_norm": 2.0370801612671094, + "learning_rate": 7.19393692813823e-06, + "loss": 0.5475, + "step": 3262 + }, + { + "epoch": 0.37, + "grad_norm": 1.801995486360701, + "learning_rate": 7.192264577070165e-06, + "loss": 0.4844, + "step": 3263 + }, + { + "epoch": 0.38, + "grad_norm": 2.7835752072198074, + "learning_rate": 7.19059192232999e-06, + "loss": 0.3805, + "step": 3264 + }, + { + "epoch": 0.38, + "grad_norm": 15.601082772477739, + "learning_rate": 7.188918964149402e-06, + "loss": 0.4931, + "step": 3265 + }, + { + "epoch": 0.38, + "grad_norm": 2.012785201442268, + "learning_rate": 7.187245702760137e-06, + "loss": 0.582, + "step": 3266 + }, + { + "epoch": 0.38, + "grad_norm": 1.7760690634172722, + "learning_rate": 7.185572138393977e-06, + "loss": 0.5204, + "step": 3267 + }, + { + "epoch": 0.38, + "grad_norm": 1.9022646832204473, + "learning_rate": 7.183898271282743e-06, + "loss": 0.4624, + "step": 3268 + }, + { + "epoch": 0.38, + "grad_norm": 1.634757795140507, + "learning_rate": 7.182224101658299e-06, + "loss": 0.5325, + "step": 3269 + }, + { + "epoch": 0.38, + "grad_norm": 1.8767580000759363, + "learning_rate": 7.180549629752551e-06, + "loss": 0.5552, + "step": 3270 + }, + { + "epoch": 0.38, + "grad_norm": 1.828377519318436, + "learning_rate": 7.178874855797445e-06, + "loss": 0.4504, + "step": 3271 + }, + { + "epoch": 0.38, + "grad_norm": 2.247467794038215, + "learning_rate": 7.1771997800249715e-06, + "loss": 0.5546, + "step": 3272 + }, + { + "epoch": 0.38, + "grad_norm": 2.19397919747565, + "learning_rate": 7.1755244026671625e-06, + "loss": 0.4685, + "step": 3273 + }, + { + "epoch": 0.38, + "grad_norm": 1.9535158630812461, + "learning_rate": 7.173848723956088e-06, + "loss": 0.4488, + "step": 3274 + }, + { + "epoch": 0.38, + "grad_norm": 2.7406526882238667, + "learning_rate": 7.172172744123867e-06, + "loss": 0.4142, + "step": 3275 + }, + { + "epoch": 0.38, + "grad_norm": 1.8592206525877195, + "learning_rate": 7.170496463402652e-06, + "loss": 0.532, + "step": 3276 + }, + { + "epoch": 0.38, + "grad_norm": 1.8887567127952223, + "learning_rate": 7.1688198820246425e-06, + "loss": 0.4276, + "step": 3277 + }, + { + "epoch": 0.38, + "grad_norm": 2.0180699588180255, + "learning_rate": 7.16714300022208e-06, + "loss": 0.4776, + "step": 3278 + }, + { + "epoch": 0.38, + "grad_norm": 1.8021955720026592, + "learning_rate": 7.165465818227244e-06, + "loss": 0.5022, + "step": 3279 + }, + { + "epoch": 0.38, + "grad_norm": 3.0094149440716094, + "learning_rate": 7.163788336272458e-06, + "loss": 0.47, + "step": 3280 + }, + { + "epoch": 0.38, + "grad_norm": 1.9017665413054776, + "learning_rate": 7.162110554590087e-06, + "loss": 0.5316, + "step": 3281 + }, + { + "epoch": 0.38, + "grad_norm": 2.2026234243488587, + "learning_rate": 7.160432473412535e-06, + "loss": 0.4397, + "step": 3282 + }, + { + "epoch": 0.38, + "grad_norm": 2.356663742354321, + "learning_rate": 7.158754092972252e-06, + "loss": 0.4609, + "step": 3283 + }, + { + "epoch": 0.38, + "grad_norm": 2.0332566541098696, + "learning_rate": 7.157075413501725e-06, + "loss": 0.6345, + "step": 3284 + }, + { + "epoch": 0.38, + "grad_norm": 1.5678032929905938, + "learning_rate": 7.155396435233486e-06, + "loss": 0.4311, + "step": 3285 + }, + { + "epoch": 0.38, + "grad_norm": 1.8825154841327794, + "learning_rate": 7.153717158400107e-06, + "loss": 0.4679, + "step": 3286 + }, + { + "epoch": 0.38, + "grad_norm": 1.8969582820091169, + "learning_rate": 7.152037583234198e-06, + "loss": 0.5157, + "step": 3287 + }, + { + "epoch": 0.38, + "grad_norm": 2.8512636929650883, + "learning_rate": 7.150357709968417e-06, + "loss": 0.418, + "step": 3288 + }, + { + "epoch": 0.38, + "grad_norm": 1.8099448106161273, + "learning_rate": 7.14867753883546e-06, + "loss": 0.5412, + "step": 3289 + }, + { + "epoch": 0.38, + "grad_norm": 2.4534207721145647, + "learning_rate": 7.146997070068062e-06, + "loss": 0.5074, + "step": 3290 + }, + { + "epoch": 0.38, + "grad_norm": 1.7880624734714443, + "learning_rate": 7.145316303898999e-06, + "loss": 0.5573, + "step": 3291 + }, + { + "epoch": 0.38, + "grad_norm": 2.6807351687304894, + "learning_rate": 7.143635240561095e-06, + "loss": 0.4644, + "step": 3292 + }, + { + "epoch": 0.38, + "grad_norm": 2.081068964161852, + "learning_rate": 7.14195388028721e-06, + "loss": 0.5173, + "step": 3293 + }, + { + "epoch": 0.38, + "grad_norm": 2.293983820807733, + "learning_rate": 7.1402722233102425e-06, + "loss": 0.5717, + "step": 3294 + }, + { + "epoch": 0.38, + "grad_norm": 2.0848089250062345, + "learning_rate": 7.138590269863139e-06, + "loss": 0.4972, + "step": 3295 + }, + { + "epoch": 0.38, + "grad_norm": 1.7213707390818764, + "learning_rate": 7.136908020178881e-06, + "loss": 0.5583, + "step": 3296 + }, + { + "epoch": 0.38, + "grad_norm": 2.6598338037195575, + "learning_rate": 7.1352254744904945e-06, + "loss": 0.514, + "step": 3297 + }, + { + "epoch": 0.38, + "grad_norm": 1.734873940679305, + "learning_rate": 7.133542633031044e-06, + "loss": 0.4629, + "step": 3298 + }, + { + "epoch": 0.38, + "grad_norm": 1.9460876505155544, + "learning_rate": 7.131859496033638e-06, + "loss": 0.4747, + "step": 3299 + }, + { + "epoch": 0.38, + "grad_norm": 4.521511835497702, + "learning_rate": 7.130176063731424e-06, + "loss": 0.5332, + "step": 3300 + }, + { + "epoch": 0.38, + "grad_norm": 2.245208563722081, + "learning_rate": 7.128492336357591e-06, + "loss": 0.5573, + "step": 3301 + }, + { + "epoch": 0.38, + "grad_norm": 2.101163022208161, + "learning_rate": 7.126808314145367e-06, + "loss": 0.4659, + "step": 3302 + }, + { + "epoch": 0.38, + "grad_norm": 1.8883660155409427, + "learning_rate": 7.125123997328025e-06, + "loss": 0.488, + "step": 3303 + }, + { + "epoch": 0.38, + "grad_norm": 2.6917231504833827, + "learning_rate": 7.123439386138874e-06, + "loss": 0.4455, + "step": 3304 + }, + { + "epoch": 0.38, + "grad_norm": 2.2753292729551196, + "learning_rate": 7.121754480811268e-06, + "loss": 0.5959, + "step": 3305 + }, + { + "epoch": 0.38, + "grad_norm": 2.351000731916369, + "learning_rate": 7.1200692815785985e-06, + "loss": 0.5244, + "step": 3306 + }, + { + "epoch": 0.38, + "grad_norm": 1.9874802463165762, + "learning_rate": 7.118383788674299e-06, + "loss": 0.5239, + "step": 3307 + }, + { + "epoch": 0.38, + "grad_norm": 4.145282082179849, + "learning_rate": 7.116698002331843e-06, + "loss": 0.5579, + "step": 3308 + }, + { + "epoch": 0.38, + "grad_norm": 1.808496389465618, + "learning_rate": 7.115011922784748e-06, + "loss": 0.4003, + "step": 3309 + }, + { + "epoch": 0.38, + "grad_norm": 1.8347951753979703, + "learning_rate": 7.113325550266568e-06, + "loss": 0.5092, + "step": 3310 + }, + { + "epoch": 0.38, + "grad_norm": 1.6199587495911798, + "learning_rate": 7.111638885010897e-06, + "loss": 0.5317, + "step": 3311 + }, + { + "epoch": 0.38, + "grad_norm": 1.771617775677887, + "learning_rate": 7.109951927251375e-06, + "loss": 0.4248, + "step": 3312 + }, + { + "epoch": 0.38, + "grad_norm": 1.814254324992072, + "learning_rate": 7.108264677221678e-06, + "loss": 0.4322, + "step": 3313 + }, + { + "epoch": 0.38, + "grad_norm": 1.9586739025325208, + "learning_rate": 7.106577135155522e-06, + "loss": 0.5439, + "step": 3314 + }, + { + "epoch": 0.38, + "grad_norm": 2.459431482883816, + "learning_rate": 7.104889301286666e-06, + "loss": 0.516, + "step": 3315 + }, + { + "epoch": 0.38, + "grad_norm": 1.9247091224471418, + "learning_rate": 7.10320117584891e-06, + "loss": 0.4875, + "step": 3316 + }, + { + "epoch": 0.38, + "grad_norm": 2.477395775077742, + "learning_rate": 7.101512759076089e-06, + "loss": 0.53, + "step": 3317 + }, + { + "epoch": 0.38, + "grad_norm": 1.8133035019620085, + "learning_rate": 7.099824051202085e-06, + "loss": 0.4995, + "step": 3318 + }, + { + "epoch": 0.38, + "grad_norm": 1.8168220514325955, + "learning_rate": 7.0981350524608185e-06, + "loss": 0.4669, + "step": 3319 + }, + { + "epoch": 0.38, + "grad_norm": 1.7199759712727876, + "learning_rate": 7.096445763086247e-06, + "loss": 0.4781, + "step": 3320 + }, + { + "epoch": 0.38, + "grad_norm": 2.4478372631076475, + "learning_rate": 7.094756183312372e-06, + "loss": 0.5902, + "step": 3321 + }, + { + "epoch": 0.38, + "grad_norm": 2.445252417717323, + "learning_rate": 7.093066313373233e-06, + "loss": 0.4546, + "step": 3322 + }, + { + "epoch": 0.38, + "grad_norm": 2.134598194439933, + "learning_rate": 7.091376153502911e-06, + "loss": 0.494, + "step": 3323 + }, + { + "epoch": 0.38, + "grad_norm": 2.7549817590983094, + "learning_rate": 7.089685703935527e-06, + "loss": 0.4912, + "step": 3324 + }, + { + "epoch": 0.38, + "grad_norm": 1.873593100951613, + "learning_rate": 7.087994964905241e-06, + "loss": 0.4553, + "step": 3325 + }, + { + "epoch": 0.38, + "grad_norm": 2.726683483613141, + "learning_rate": 7.086303936646252e-06, + "loss": 0.57, + "step": 3326 + }, + { + "epoch": 0.38, + "grad_norm": 2.8760484714681507, + "learning_rate": 7.084612619392806e-06, + "loss": 0.4988, + "step": 3327 + }, + { + "epoch": 0.38, + "grad_norm": 6.071186592474359, + "learning_rate": 7.08292101337918e-06, + "loss": 0.4624, + "step": 3328 + }, + { + "epoch": 0.38, + "grad_norm": 3.6034368668242145, + "learning_rate": 7.081229118839694e-06, + "loss": 0.4847, + "step": 3329 + }, + { + "epoch": 0.38, + "grad_norm": 2.020338677373167, + "learning_rate": 7.079536936008713e-06, + "loss": 0.5451, + "step": 3330 + }, + { + "epoch": 0.38, + "grad_norm": 2.026625897949116, + "learning_rate": 7.077844465120637e-06, + "loss": 0.4378, + "step": 3331 + }, + { + "epoch": 0.38, + "grad_norm": 1.9491601118622488, + "learning_rate": 7.076151706409904e-06, + "loss": 0.5308, + "step": 3332 + }, + { + "epoch": 0.38, + "grad_norm": 1.7985596183329262, + "learning_rate": 7.074458660110996e-06, + "loss": 0.5172, + "step": 3333 + }, + { + "epoch": 0.38, + "grad_norm": 1.6302903652287353, + "learning_rate": 7.072765326458434e-06, + "loss": 0.5382, + "step": 3334 + }, + { + "epoch": 0.38, + "grad_norm": 2.7193246501495403, + "learning_rate": 7.0710717056867795e-06, + "loss": 0.5372, + "step": 3335 + }, + { + "epoch": 0.38, + "grad_norm": 4.378660331233637, + "learning_rate": 7.0693777980306285e-06, + "loss": 0.486, + "step": 3336 + }, + { + "epoch": 0.38, + "grad_norm": 2.231866403492623, + "learning_rate": 7.067683603724624e-06, + "loss": 0.5287, + "step": 3337 + }, + { + "epoch": 0.38, + "grad_norm": 5.829571233886774, + "learning_rate": 7.065989123003446e-06, + "loss": 0.5689, + "step": 3338 + }, + { + "epoch": 0.38, + "grad_norm": 1.6495981190605775, + "learning_rate": 7.064294356101813e-06, + "loss": 0.5163, + "step": 3339 + }, + { + "epoch": 0.38, + "grad_norm": 1.9253678636633127, + "learning_rate": 7.062599303254481e-06, + "loss": 0.4565, + "step": 3340 + }, + { + "epoch": 0.38, + "grad_norm": 1.5668418569876255, + "learning_rate": 7.060903964696253e-06, + "loss": 0.5402, + "step": 3341 + }, + { + "epoch": 0.38, + "grad_norm": 2.215486829168166, + "learning_rate": 7.0592083406619625e-06, + "loss": 0.4735, + "step": 3342 + }, + { + "epoch": 0.38, + "grad_norm": 0.8619760268674858, + "learning_rate": 7.057512431386491e-06, + "loss": 0.7092, + "step": 3343 + }, + { + "epoch": 0.38, + "grad_norm": 1.8582916463553318, + "learning_rate": 7.055816237104753e-06, + "loss": 0.5029, + "step": 3344 + }, + { + "epoch": 0.38, + "grad_norm": 1.7808791953412382, + "learning_rate": 7.054119758051706e-06, + "loss": 0.4882, + "step": 3345 + }, + { + "epoch": 0.38, + "grad_norm": 1.6801590873260863, + "learning_rate": 7.052422994462347e-06, + "loss": 0.5823, + "step": 3346 + }, + { + "epoch": 0.38, + "grad_norm": 1.7823987879297132, + "learning_rate": 7.050725946571709e-06, + "loss": 0.4933, + "step": 3347 + }, + { + "epoch": 0.38, + "grad_norm": 2.184400530057603, + "learning_rate": 7.04902861461487e-06, + "loss": 0.4894, + "step": 3348 + }, + { + "epoch": 0.38, + "grad_norm": 1.774840272074094, + "learning_rate": 7.04733099882694e-06, + "loss": 0.4476, + "step": 3349 + }, + { + "epoch": 0.38, + "grad_norm": 1.9510124060593994, + "learning_rate": 7.045633099443075e-06, + "loss": 0.4145, + "step": 3350 + }, + { + "epoch": 0.39, + "grad_norm": 2.431760183714208, + "learning_rate": 7.043934916698468e-06, + "loss": 0.6732, + "step": 3351 + }, + { + "epoch": 0.39, + "grad_norm": 2.0635805673296357, + "learning_rate": 7.04223645082835e-06, + "loss": 0.5096, + "step": 3352 + }, + { + "epoch": 0.39, + "grad_norm": 1.8403659969769715, + "learning_rate": 7.040537702067993e-06, + "loss": 0.4947, + "step": 3353 + }, + { + "epoch": 0.39, + "grad_norm": 1.7130317839654687, + "learning_rate": 7.038838670652707e-06, + "loss": 0.5062, + "step": 3354 + }, + { + "epoch": 0.39, + "grad_norm": 2.810994776009769, + "learning_rate": 7.037139356817842e-06, + "loss": 0.5007, + "step": 3355 + }, + { + "epoch": 0.39, + "grad_norm": 2.3411930614027816, + "learning_rate": 7.035439760798785e-06, + "loss": 0.5781, + "step": 3356 + }, + { + "epoch": 0.39, + "grad_norm": 2.0230287686438215, + "learning_rate": 7.0337398828309644e-06, + "loss": 0.533, + "step": 3357 + }, + { + "epoch": 0.39, + "grad_norm": 2.4932216934354003, + "learning_rate": 7.032039723149849e-06, + "loss": 0.4499, + "step": 3358 + }, + { + "epoch": 0.39, + "grad_norm": 0.7854868030378267, + "learning_rate": 7.030339281990943e-06, + "loss": 0.6705, + "step": 3359 + }, + { + "epoch": 0.39, + "grad_norm": 1.9843314652864446, + "learning_rate": 7.02863855958979e-06, + "loss": 0.5876, + "step": 3360 + }, + { + "epoch": 0.39, + "grad_norm": 2.3181806647864467, + "learning_rate": 7.0269375561819764e-06, + "loss": 0.5392, + "step": 3361 + }, + { + "epoch": 0.39, + "grad_norm": 1.785424370397955, + "learning_rate": 7.025236272003122e-06, + "loss": 0.5339, + "step": 3362 + }, + { + "epoch": 0.39, + "grad_norm": 2.2542539743269416, + "learning_rate": 7.023534707288891e-06, + "loss": 0.4684, + "step": 3363 + }, + { + "epoch": 0.39, + "grad_norm": 5.359697249439902, + "learning_rate": 7.0218328622749836e-06, + "loss": 0.5553, + "step": 3364 + }, + { + "epoch": 0.39, + "grad_norm": 2.0954780144779774, + "learning_rate": 7.0201307371971375e-06, + "loss": 0.6043, + "step": 3365 + }, + { + "epoch": 0.39, + "grad_norm": 2.0241381270734045, + "learning_rate": 7.0184283322911314e-06, + "loss": 0.6349, + "step": 3366 + }, + { + "epoch": 0.39, + "grad_norm": 10.14007675523266, + "learning_rate": 7.016725647792783e-06, + "loss": 0.4981, + "step": 3367 + }, + { + "epoch": 0.39, + "grad_norm": 1.8713838290459028, + "learning_rate": 7.015022683937947e-06, + "loss": 0.4702, + "step": 3368 + }, + { + "epoch": 0.39, + "grad_norm": 2.2090028356146045, + "learning_rate": 7.013319440962516e-06, + "loss": 0.5631, + "step": 3369 + }, + { + "epoch": 0.39, + "grad_norm": 2.707387595798634, + "learning_rate": 7.011615919102427e-06, + "loss": 0.5185, + "step": 3370 + }, + { + "epoch": 0.39, + "grad_norm": 2.973856408742496, + "learning_rate": 7.009912118593648e-06, + "loss": 0.5069, + "step": 3371 + }, + { + "epoch": 0.39, + "grad_norm": 2.9378565374671552, + "learning_rate": 7.008208039672191e-06, + "loss": 0.5381, + "step": 3372 + }, + { + "epoch": 0.39, + "grad_norm": 4.054627580064713, + "learning_rate": 7.006503682574105e-06, + "loss": 0.5188, + "step": 3373 + }, + { + "epoch": 0.39, + "grad_norm": 1.766154877738583, + "learning_rate": 7.0047990475354755e-06, + "loss": 0.5217, + "step": 3374 + }, + { + "epoch": 0.39, + "grad_norm": 2.2416538809460973, + "learning_rate": 7.00309413479243e-06, + "loss": 0.494, + "step": 3375 + }, + { + "epoch": 0.39, + "grad_norm": 1.8974841364632913, + "learning_rate": 7.001388944581131e-06, + "loss": 0.4418, + "step": 3376 + }, + { + "epoch": 0.39, + "grad_norm": 2.087674603559062, + "learning_rate": 6.999683477137783e-06, + "loss": 0.5272, + "step": 3377 + }, + { + "epoch": 0.39, + "grad_norm": 2.8459375335471115, + "learning_rate": 6.997977732698625e-06, + "loss": 0.5425, + "step": 3378 + }, + { + "epoch": 0.39, + "grad_norm": 4.335732608265989, + "learning_rate": 6.996271711499938e-06, + "loss": 0.5632, + "step": 3379 + }, + { + "epoch": 0.39, + "grad_norm": 1.872561544660747, + "learning_rate": 6.9945654137780384e-06, + "loss": 0.4418, + "step": 3380 + }, + { + "epoch": 0.39, + "grad_norm": 1.9122515933073285, + "learning_rate": 6.992858839769285e-06, + "loss": 0.4891, + "step": 3381 + }, + { + "epoch": 0.39, + "grad_norm": 1.808134879536226, + "learning_rate": 6.99115198971007e-06, + "loss": 0.497, + "step": 3382 + }, + { + "epoch": 0.39, + "grad_norm": 2.9315048228461427, + "learning_rate": 6.989444863836825e-06, + "loss": 0.5139, + "step": 3383 + }, + { + "epoch": 0.39, + "grad_norm": 2.0596979396945376, + "learning_rate": 6.9877374623860215e-06, + "loss": 0.5995, + "step": 3384 + }, + { + "epoch": 0.39, + "grad_norm": 2.064542258563414, + "learning_rate": 6.98602978559417e-06, + "loss": 0.5419, + "step": 3385 + }, + { + "epoch": 0.39, + "grad_norm": 2.369031792877626, + "learning_rate": 6.984321833697817e-06, + "loss": 0.555, + "step": 3386 + }, + { + "epoch": 0.39, + "grad_norm": 1.8853456989645592, + "learning_rate": 6.982613606933547e-06, + "loss": 0.5066, + "step": 3387 + }, + { + "epoch": 0.39, + "grad_norm": 2.278776163921591, + "learning_rate": 6.9809051055379825e-06, + "loss": 0.6361, + "step": 3388 + }, + { + "epoch": 0.39, + "grad_norm": 1.7858077315788579, + "learning_rate": 6.9791963297477875e-06, + "loss": 0.5466, + "step": 3389 + }, + { + "epoch": 0.39, + "grad_norm": 2.9413796084415047, + "learning_rate": 6.97748727979966e-06, + "loss": 0.4507, + "step": 3390 + }, + { + "epoch": 0.39, + "grad_norm": 2.0787315889283353, + "learning_rate": 6.975777955930336e-06, + "loss": 0.599, + "step": 3391 + }, + { + "epoch": 0.39, + "grad_norm": 2.309439684043517, + "learning_rate": 6.974068358376591e-06, + "loss": 0.5042, + "step": 3392 + }, + { + "epoch": 0.39, + "grad_norm": 2.3893742041052435, + "learning_rate": 6.97235848737524e-06, + "loss": 0.5213, + "step": 3393 + }, + { + "epoch": 0.39, + "grad_norm": 1.8943864999761024, + "learning_rate": 6.970648343163133e-06, + "loss": 0.4996, + "step": 3394 + }, + { + "epoch": 0.39, + "grad_norm": 2.769335739512878, + "learning_rate": 6.968937925977158e-06, + "loss": 0.5202, + "step": 3395 + }, + { + "epoch": 0.39, + "grad_norm": 3.2326885589371352, + "learning_rate": 6.967227236054244e-06, + "loss": 0.4951, + "step": 3396 + }, + { + "epoch": 0.39, + "grad_norm": 7.754544319304602, + "learning_rate": 6.9655162736313535e-06, + "loss": 0.4726, + "step": 3397 + }, + { + "epoch": 0.39, + "grad_norm": 1.9703502108468753, + "learning_rate": 6.963805038945488e-06, + "loss": 0.5987, + "step": 3398 + }, + { + "epoch": 0.39, + "grad_norm": 1.964420151277497, + "learning_rate": 6.962093532233689e-06, + "loss": 0.5084, + "step": 3399 + }, + { + "epoch": 0.39, + "grad_norm": 2.7322089983619717, + "learning_rate": 6.9603817537330355e-06, + "loss": 0.4336, + "step": 3400 + }, + { + "epoch": 0.39, + "grad_norm": 2.253488241781959, + "learning_rate": 6.958669703680639e-06, + "loss": 0.4229, + "step": 3401 + }, + { + "epoch": 0.39, + "grad_norm": 2.4461789481107687, + "learning_rate": 6.956957382313656e-06, + "loss": 0.4456, + "step": 3402 + }, + { + "epoch": 0.39, + "grad_norm": 2.180205444191373, + "learning_rate": 6.955244789869274e-06, + "loss": 0.5414, + "step": 3403 + }, + { + "epoch": 0.39, + "grad_norm": 2.003427563603858, + "learning_rate": 6.9535319265847225e-06, + "loss": 0.4573, + "step": 3404 + }, + { + "epoch": 0.39, + "grad_norm": 2.1653592258812395, + "learning_rate": 6.951818792697267e-06, + "loss": 0.5074, + "step": 3405 + }, + { + "epoch": 0.39, + "grad_norm": 1.6388643320473126, + "learning_rate": 6.9501053884442106e-06, + "loss": 0.5501, + "step": 3406 + }, + { + "epoch": 0.39, + "grad_norm": 1.8880422136748156, + "learning_rate": 6.948391714062894e-06, + "loss": 0.5518, + "step": 3407 + }, + { + "epoch": 0.39, + "grad_norm": 1.648228717571414, + "learning_rate": 6.946677769790695e-06, + "loss": 0.5468, + "step": 3408 + }, + { + "epoch": 0.39, + "grad_norm": 1.9643306629666895, + "learning_rate": 6.944963555865028e-06, + "loss": 0.439, + "step": 3409 + }, + { + "epoch": 0.39, + "grad_norm": 2.2701453517850134, + "learning_rate": 6.943249072523344e-06, + "loss": 0.4602, + "step": 3410 + }, + { + "epoch": 0.39, + "grad_norm": 1.7655151445090895, + "learning_rate": 6.941534320003139e-06, + "loss": 0.4444, + "step": 3411 + }, + { + "epoch": 0.39, + "grad_norm": 2.1631394784188576, + "learning_rate": 6.939819298541932e-06, + "loss": 0.5443, + "step": 3412 + }, + { + "epoch": 0.39, + "grad_norm": 1.7957351029559017, + "learning_rate": 6.9381040083772946e-06, + "loss": 0.5593, + "step": 3413 + }, + { + "epoch": 0.39, + "grad_norm": 2.144413007829309, + "learning_rate": 6.936388449746825e-06, + "loss": 0.4586, + "step": 3414 + }, + { + "epoch": 0.39, + "grad_norm": 2.118253158555524, + "learning_rate": 6.934672622888163e-06, + "loss": 0.4744, + "step": 3415 + }, + { + "epoch": 0.39, + "grad_norm": 2.098679053046805, + "learning_rate": 6.932956528038984e-06, + "loss": 0.5025, + "step": 3416 + }, + { + "epoch": 0.39, + "grad_norm": 2.026012934816335, + "learning_rate": 6.931240165437002e-06, + "loss": 0.4754, + "step": 3417 + }, + { + "epoch": 0.39, + "grad_norm": 2.1279189545145365, + "learning_rate": 6.9295235353199665e-06, + "loss": 0.5834, + "step": 3418 + }, + { + "epoch": 0.39, + "grad_norm": 1.9915211969981903, + "learning_rate": 6.927806637925665e-06, + "loss": 0.6684, + "step": 3419 + }, + { + "epoch": 0.39, + "grad_norm": 1.808153379635157, + "learning_rate": 6.926089473491923e-06, + "loss": 0.6168, + "step": 3420 + }, + { + "epoch": 0.39, + "grad_norm": 2.0402742214787764, + "learning_rate": 6.924372042256599e-06, + "loss": 0.6283, + "step": 3421 + }, + { + "epoch": 0.39, + "grad_norm": 2.2188721858116653, + "learning_rate": 6.922654344457594e-06, + "loss": 0.417, + "step": 3422 + }, + { + "epoch": 0.39, + "grad_norm": 2.502934246177953, + "learning_rate": 6.920936380332841e-06, + "loss": 0.5252, + "step": 3423 + }, + { + "epoch": 0.39, + "grad_norm": 2.003982033170949, + "learning_rate": 6.919218150120315e-06, + "loss": 0.4793, + "step": 3424 + }, + { + "epoch": 0.39, + "grad_norm": 2.5994077354030196, + "learning_rate": 6.917499654058023e-06, + "loss": 0.5034, + "step": 3425 + }, + { + "epoch": 0.39, + "grad_norm": 2.4941298696722516, + "learning_rate": 6.91578089238401e-06, + "loss": 0.4829, + "step": 3426 + }, + { + "epoch": 0.39, + "grad_norm": 2.154963348667239, + "learning_rate": 6.91406186533636e-06, + "loss": 0.5715, + "step": 3427 + }, + { + "epoch": 0.39, + "grad_norm": 3.5455535674718277, + "learning_rate": 6.912342573153193e-06, + "loss": 0.5363, + "step": 3428 + }, + { + "epoch": 0.39, + "grad_norm": 2.9893435432691113, + "learning_rate": 6.910623016072662e-06, + "loss": 0.5044, + "step": 3429 + }, + { + "epoch": 0.39, + "grad_norm": 2.916253278679359, + "learning_rate": 6.908903194332963e-06, + "loss": 0.4277, + "step": 3430 + }, + { + "epoch": 0.39, + "grad_norm": 1.9955634076029332, + "learning_rate": 6.907183108172324e-06, + "loss": 0.5436, + "step": 3431 + }, + { + "epoch": 0.39, + "grad_norm": 2.2158499874449897, + "learning_rate": 6.905462757829011e-06, + "loss": 0.475, + "step": 3432 + }, + { + "epoch": 0.39, + "grad_norm": 2.178307891950237, + "learning_rate": 6.9037421435413275e-06, + "loss": 0.4939, + "step": 3433 + }, + { + "epoch": 0.39, + "grad_norm": 4.331011629950725, + "learning_rate": 6.9020212655476116e-06, + "loss": 0.5127, + "step": 3434 + }, + { + "epoch": 0.39, + "grad_norm": 4.553487253834424, + "learning_rate": 6.900300124086239e-06, + "loss": 0.5205, + "step": 3435 + }, + { + "epoch": 0.39, + "grad_norm": 1.8349367637919298, + "learning_rate": 6.898578719395622e-06, + "loss": 0.4996, + "step": 3436 + }, + { + "epoch": 0.39, + "grad_norm": 2.7960297236031453, + "learning_rate": 6.896857051714211e-06, + "loss": 0.5007, + "step": 3437 + }, + { + "epoch": 0.4, + "grad_norm": 1.7197818905948623, + "learning_rate": 6.895135121280488e-06, + "loss": 0.4412, + "step": 3438 + }, + { + "epoch": 0.4, + "grad_norm": 9.297819995590865, + "learning_rate": 6.893412928332979e-06, + "loss": 0.4725, + "step": 3439 + }, + { + "epoch": 0.4, + "grad_norm": 3.0985980814828284, + "learning_rate": 6.891690473110237e-06, + "loss": 0.5328, + "step": 3440 + }, + { + "epoch": 0.4, + "grad_norm": 2.483579898201814, + "learning_rate": 6.889967755850858e-06, + "loss": 0.4426, + "step": 3441 + }, + { + "epoch": 0.4, + "grad_norm": 1.643349768301987, + "learning_rate": 6.888244776793474e-06, + "loss": 0.5251, + "step": 3442 + }, + { + "epoch": 0.4, + "grad_norm": 1.8560864552527307, + "learning_rate": 6.8865215361767515e-06, + "loss": 0.4695, + "step": 3443 + }, + { + "epoch": 0.4, + "grad_norm": 2.136099014732434, + "learning_rate": 6.88479803423939e-06, + "loss": 0.4778, + "step": 3444 + }, + { + "epoch": 0.4, + "grad_norm": 2.1344014437579646, + "learning_rate": 6.883074271220133e-06, + "loss": 0.5328, + "step": 3445 + }, + { + "epoch": 0.4, + "grad_norm": 2.4875853374656764, + "learning_rate": 6.881350247357753e-06, + "loss": 0.5009, + "step": 3446 + }, + { + "epoch": 0.4, + "grad_norm": 1.8230873513323729, + "learning_rate": 6.8796259628910635e-06, + "loss": 0.3822, + "step": 3447 + }, + { + "epoch": 0.4, + "grad_norm": 3.144060516070587, + "learning_rate": 6.87790141805891e-06, + "loss": 0.5541, + "step": 3448 + }, + { + "epoch": 0.4, + "grad_norm": 2.004598245836152, + "learning_rate": 6.8761766131001795e-06, + "loss": 0.4142, + "step": 3449 + }, + { + "epoch": 0.4, + "grad_norm": 2.095755636348988, + "learning_rate": 6.874451548253788e-06, + "loss": 0.5364, + "step": 3450 + }, + { + "epoch": 0.4, + "grad_norm": 2.0105263217559926, + "learning_rate": 6.872726223758692e-06, + "loss": 0.4558, + "step": 3451 + }, + { + "epoch": 0.4, + "grad_norm": 1.9894019796799127, + "learning_rate": 6.871000639853886e-06, + "loss": 0.4566, + "step": 3452 + }, + { + "epoch": 0.4, + "grad_norm": 1.9206441371536123, + "learning_rate": 6.869274796778394e-06, + "loss": 0.5627, + "step": 3453 + }, + { + "epoch": 0.4, + "grad_norm": 1.9996733480670144, + "learning_rate": 6.86754869477128e-06, + "loss": 0.5254, + "step": 3454 + }, + { + "epoch": 0.4, + "grad_norm": 2.4945799916669955, + "learning_rate": 6.865822334071646e-06, + "loss": 0.4843, + "step": 3455 + }, + { + "epoch": 0.4, + "grad_norm": 4.603382215490561, + "learning_rate": 6.864095714918624e-06, + "loss": 0.574, + "step": 3456 + }, + { + "epoch": 0.4, + "grad_norm": 4.913675786018516, + "learning_rate": 6.862368837551387e-06, + "loss": 0.5318, + "step": 3457 + }, + { + "epoch": 0.4, + "grad_norm": 2.0742804757686466, + "learning_rate": 6.860641702209142e-06, + "loss": 0.4984, + "step": 3458 + }, + { + "epoch": 0.4, + "grad_norm": 2.0353713704258203, + "learning_rate": 6.858914309131131e-06, + "loss": 0.5378, + "step": 3459 + }, + { + "epoch": 0.4, + "grad_norm": 2.526278264429891, + "learning_rate": 6.85718665855663e-06, + "loss": 0.5241, + "step": 3460 + }, + { + "epoch": 0.4, + "grad_norm": 1.805414797561438, + "learning_rate": 6.8554587507249555e-06, + "loss": 0.5362, + "step": 3461 + }, + { + "epoch": 0.4, + "grad_norm": 2.262866599877029, + "learning_rate": 6.853730585875458e-06, + "loss": 0.4453, + "step": 3462 + }, + { + "epoch": 0.4, + "grad_norm": 2.297052038857309, + "learning_rate": 6.852002164247519e-06, + "loss": 0.4629, + "step": 3463 + }, + { + "epoch": 0.4, + "grad_norm": 6.429261348960382, + "learning_rate": 6.8502734860805605e-06, + "loss": 0.5512, + "step": 3464 + }, + { + "epoch": 0.4, + "grad_norm": 2.0961285978649347, + "learning_rate": 6.8485445516140405e-06, + "loss": 0.5205, + "step": 3465 + }, + { + "epoch": 0.4, + "grad_norm": 2.4432404905297975, + "learning_rate": 6.846815361087449e-06, + "loss": 0.5336, + "step": 3466 + }, + { + "epoch": 0.4, + "grad_norm": 2.489894439252578, + "learning_rate": 6.845085914740314e-06, + "loss": 0.5266, + "step": 3467 + }, + { + "epoch": 0.4, + "grad_norm": 1.858592888373152, + "learning_rate": 6.8433562128121966e-06, + "loss": 0.4932, + "step": 3468 + }, + { + "epoch": 0.4, + "grad_norm": 2.341560874073902, + "learning_rate": 6.841626255542696e-06, + "loss": 0.4736, + "step": 3469 + }, + { + "epoch": 0.4, + "grad_norm": 1.6685942199324213, + "learning_rate": 6.839896043171446e-06, + "loss": 0.452, + "step": 3470 + }, + { + "epoch": 0.4, + "grad_norm": 1.8714172160577403, + "learning_rate": 6.838165575938114e-06, + "loss": 0.523, + "step": 3471 + }, + { + "epoch": 0.4, + "grad_norm": 2.223435143902185, + "learning_rate": 6.836434854082405e-06, + "loss": 0.5385, + "step": 3472 + }, + { + "epoch": 0.4, + "grad_norm": 1.9897231824328618, + "learning_rate": 6.8347038778440585e-06, + "loss": 0.6362, + "step": 3473 + }, + { + "epoch": 0.4, + "grad_norm": 2.035006947352832, + "learning_rate": 6.83297264746285e-06, + "loss": 0.5629, + "step": 3474 + }, + { + "epoch": 0.4, + "grad_norm": 2.307760585328449, + "learning_rate": 6.831241163178586e-06, + "loss": 0.4724, + "step": 3475 + }, + { + "epoch": 0.4, + "grad_norm": 2.1444844048934115, + "learning_rate": 6.829509425231113e-06, + "loss": 0.5549, + "step": 3476 + }, + { + "epoch": 0.4, + "grad_norm": 2.069914275074382, + "learning_rate": 6.827777433860312e-06, + "loss": 0.4925, + "step": 3477 + }, + { + "epoch": 0.4, + "grad_norm": 2.1854231243681617, + "learning_rate": 6.826045189306099e-06, + "loss": 0.5461, + "step": 3478 + }, + { + "epoch": 0.4, + "grad_norm": 2.1559813400174828, + "learning_rate": 6.8243126918084205e-06, + "loss": 0.5245, + "step": 3479 + }, + { + "epoch": 0.4, + "grad_norm": 2.632282099536249, + "learning_rate": 6.822579941607264e-06, + "loss": 0.495, + "step": 3480 + }, + { + "epoch": 0.4, + "grad_norm": 2.3423916719386395, + "learning_rate": 6.82084693894265e-06, + "loss": 0.4856, + "step": 3481 + }, + { + "epoch": 0.4, + "grad_norm": 1.912904484087553, + "learning_rate": 6.819113684054634e-06, + "loss": 0.5006, + "step": 3482 + }, + { + "epoch": 0.4, + "grad_norm": 2.4091797638086523, + "learning_rate": 6.817380177183306e-06, + "loss": 0.5074, + "step": 3483 + }, + { + "epoch": 0.4, + "grad_norm": 1.9785965166079251, + "learning_rate": 6.815646418568789e-06, + "loss": 0.4868, + "step": 3484 + }, + { + "epoch": 0.4, + "grad_norm": 2.048744995881688, + "learning_rate": 6.813912408451247e-06, + "loss": 0.5939, + "step": 3485 + }, + { + "epoch": 0.4, + "grad_norm": 2.0911700340490222, + "learning_rate": 6.812178147070869e-06, + "loss": 0.5218, + "step": 3486 + }, + { + "epoch": 0.4, + "grad_norm": 2.2747555308835885, + "learning_rate": 6.81044363466789e-06, + "loss": 0.4465, + "step": 3487 + }, + { + "epoch": 0.4, + "grad_norm": 2.094915517223537, + "learning_rate": 6.808708871482572e-06, + "loss": 0.5002, + "step": 3488 + }, + { + "epoch": 0.4, + "grad_norm": 1.8906848722910898, + "learning_rate": 6.806973857755214e-06, + "loss": 0.4358, + "step": 3489 + }, + { + "epoch": 0.4, + "grad_norm": 2.3251912395624648, + "learning_rate": 6.805238593726151e-06, + "loss": 0.4842, + "step": 3490 + }, + { + "epoch": 0.4, + "grad_norm": 2.073795446509601, + "learning_rate": 6.803503079635752e-06, + "loss": 0.5359, + "step": 3491 + }, + { + "epoch": 0.4, + "grad_norm": 2.1740849809324923, + "learning_rate": 6.8017673157244156e-06, + "loss": 0.5325, + "step": 3492 + }, + { + "epoch": 0.4, + "grad_norm": 2.6743320270660744, + "learning_rate": 6.800031302232584e-06, + "loss": 0.619, + "step": 3493 + }, + { + "epoch": 0.4, + "grad_norm": 2.052210682658103, + "learning_rate": 6.798295039400729e-06, + "loss": 0.5002, + "step": 3494 + }, + { + "epoch": 0.4, + "grad_norm": 3.1427699427585734, + "learning_rate": 6.796558527469355e-06, + "loss": 0.4844, + "step": 3495 + }, + { + "epoch": 0.4, + "grad_norm": 1.7137416161296128, + "learning_rate": 6.794821766679006e-06, + "loss": 0.4557, + "step": 3496 + }, + { + "epoch": 0.4, + "grad_norm": 1.6209310768289507, + "learning_rate": 6.793084757270256e-06, + "loss": 0.4675, + "step": 3497 + }, + { + "epoch": 0.4, + "grad_norm": 2.9501731459906053, + "learning_rate": 6.791347499483717e-06, + "loss": 0.4679, + "step": 3498 + }, + { + "epoch": 0.4, + "grad_norm": 1.6582593174997475, + "learning_rate": 6.789609993560032e-06, + "loss": 0.5096, + "step": 3499 + }, + { + "epoch": 0.4, + "grad_norm": 2.193867591080037, + "learning_rate": 6.787872239739882e-06, + "loss": 0.4189, + "step": 3500 + }, + { + "epoch": 0.4, + "grad_norm": 2.2822910212035423, + "learning_rate": 6.786134238263977e-06, + "loss": 0.5389, + "step": 3501 + }, + { + "epoch": 0.4, + "grad_norm": 2.6095517419297294, + "learning_rate": 6.784395989373068e-06, + "loss": 0.4166, + "step": 3502 + }, + { + "epoch": 0.4, + "grad_norm": 2.347207977992443, + "learning_rate": 6.782657493307936e-06, + "loss": 0.5289, + "step": 3503 + }, + { + "epoch": 0.4, + "grad_norm": 2.8859078852986118, + "learning_rate": 6.780918750309395e-06, + "loss": 0.5674, + "step": 3504 + }, + { + "epoch": 0.4, + "grad_norm": 1.9071824286244587, + "learning_rate": 6.7791797606183e-06, + "loss": 0.5492, + "step": 3505 + }, + { + "epoch": 0.4, + "grad_norm": 2.134241113572359, + "learning_rate": 6.77744052447553e-06, + "loss": 0.4575, + "step": 3506 + }, + { + "epoch": 0.4, + "grad_norm": 2.7686059475851854, + "learning_rate": 6.775701042122007e-06, + "loss": 0.5404, + "step": 3507 + }, + { + "epoch": 0.4, + "grad_norm": 1.5706955363113437, + "learning_rate": 6.773961313798685e-06, + "loss": 0.3996, + "step": 3508 + }, + { + "epoch": 0.4, + "grad_norm": 2.626636875653107, + "learning_rate": 6.7722213397465475e-06, + "loss": 0.5042, + "step": 3509 + }, + { + "epoch": 0.4, + "grad_norm": 2.583941264270841, + "learning_rate": 6.770481120206617e-06, + "loss": 0.445, + "step": 3510 + }, + { + "epoch": 0.4, + "grad_norm": 2.8173847825731113, + "learning_rate": 6.768740655419949e-06, + "loss": 0.5191, + "step": 3511 + }, + { + "epoch": 0.4, + "grad_norm": 2.0672022005145423, + "learning_rate": 6.76699994562763e-06, + "loss": 0.4363, + "step": 3512 + }, + { + "epoch": 0.4, + "grad_norm": 2.1977340981728313, + "learning_rate": 6.765258991070787e-06, + "loss": 0.5165, + "step": 3513 + }, + { + "epoch": 0.4, + "grad_norm": 2.6477099652811495, + "learning_rate": 6.763517791990572e-06, + "loss": 0.472, + "step": 3514 + }, + { + "epoch": 0.4, + "grad_norm": 3.0050808588529097, + "learning_rate": 6.7617763486281795e-06, + "loss": 0.5548, + "step": 3515 + }, + { + "epoch": 0.4, + "grad_norm": 1.9639114967009688, + "learning_rate": 6.760034661224831e-06, + "loss": 0.5329, + "step": 3516 + }, + { + "epoch": 0.4, + "grad_norm": 0.8258750637181461, + "learning_rate": 6.758292730021788e-06, + "loss": 0.6946, + "step": 3517 + }, + { + "epoch": 0.4, + "grad_norm": 1.889758770682578, + "learning_rate": 6.756550555260339e-06, + "loss": 0.5132, + "step": 3518 + }, + { + "epoch": 0.4, + "grad_norm": 4.9612032181346875, + "learning_rate": 6.754808137181812e-06, + "loss": 0.5496, + "step": 3519 + }, + { + "epoch": 0.4, + "grad_norm": 2.29602043113293, + "learning_rate": 6.753065476027566e-06, + "loss": 0.4564, + "step": 3520 + }, + { + "epoch": 0.4, + "grad_norm": 2.009534383224343, + "learning_rate": 6.751322572038993e-06, + "loss": 0.5015, + "step": 3521 + }, + { + "epoch": 0.4, + "grad_norm": 1.8233005227356933, + "learning_rate": 6.749579425457522e-06, + "loss": 0.5085, + "step": 3522 + }, + { + "epoch": 0.4, + "grad_norm": 2.102848044348859, + "learning_rate": 6.7478360365246106e-06, + "loss": 0.4057, + "step": 3523 + }, + { + "epoch": 0.4, + "grad_norm": 2.4927043334003782, + "learning_rate": 6.746092405481756e-06, + "loss": 0.4327, + "step": 3524 + }, + { + "epoch": 0.41, + "grad_norm": 2.5182206924926644, + "learning_rate": 6.744348532570482e-06, + "loss": 0.5902, + "step": 3525 + }, + { + "epoch": 0.41, + "grad_norm": 2.630675180211426, + "learning_rate": 6.742604418032353e-06, + "loss": 0.552, + "step": 3526 + }, + { + "epoch": 0.41, + "grad_norm": 2.1684188275145346, + "learning_rate": 6.74086006210896e-06, + "loss": 0.562, + "step": 3527 + }, + { + "epoch": 0.41, + "grad_norm": 5.0464131872586, + "learning_rate": 6.739115465041934e-06, + "loss": 0.4236, + "step": 3528 + }, + { + "epoch": 0.41, + "grad_norm": 1.8377014985372924, + "learning_rate": 6.737370627072934e-06, + "loss": 0.411, + "step": 3529 + }, + { + "epoch": 0.41, + "grad_norm": 1.9744618016684727, + "learning_rate": 6.735625548443656e-06, + "loss": 0.4961, + "step": 3530 + }, + { + "epoch": 0.41, + "grad_norm": 2.033214852066662, + "learning_rate": 6.733880229395828e-06, + "loss": 0.5244, + "step": 3531 + }, + { + "epoch": 0.41, + "grad_norm": 2.6204270685576323, + "learning_rate": 6.732134670171211e-06, + "loss": 0.4187, + "step": 3532 + }, + { + "epoch": 0.41, + "grad_norm": 1.914051754985948, + "learning_rate": 6.730388871011601e-06, + "loss": 0.5338, + "step": 3533 + }, + { + "epoch": 0.41, + "grad_norm": 1.6475926085817167, + "learning_rate": 6.728642832158823e-06, + "loss": 0.578, + "step": 3534 + }, + { + "epoch": 0.41, + "grad_norm": 1.762647803962594, + "learning_rate": 6.726896553854738e-06, + "loss": 0.5096, + "step": 3535 + }, + { + "epoch": 0.41, + "grad_norm": 0.8772609053969875, + "learning_rate": 6.7251500363412425e-06, + "loss": 0.7098, + "step": 3536 + }, + { + "epoch": 0.41, + "grad_norm": 2.0205210120177712, + "learning_rate": 6.723403279860262e-06, + "loss": 0.5067, + "step": 3537 + }, + { + "epoch": 0.41, + "grad_norm": 2.2546565503696363, + "learning_rate": 6.7216562846537584e-06, + "loss": 0.501, + "step": 3538 + }, + { + "epoch": 0.41, + "grad_norm": 2.10002796192174, + "learning_rate": 6.719909050963725e-06, + "loss": 0.4987, + "step": 3539 + }, + { + "epoch": 0.41, + "grad_norm": 1.874818537052079, + "learning_rate": 6.718161579032186e-06, + "loss": 0.5319, + "step": 3540 + }, + { + "epoch": 0.41, + "grad_norm": 1.916160096548118, + "learning_rate": 6.7164138691012035e-06, + "loss": 0.5126, + "step": 3541 + }, + { + "epoch": 0.41, + "grad_norm": 1.9798171070336796, + "learning_rate": 6.714665921412871e-06, + "loss": 0.4996, + "step": 3542 + }, + { + "epoch": 0.41, + "grad_norm": 1.9780709813397543, + "learning_rate": 6.71291773620931e-06, + "loss": 0.4709, + "step": 3543 + }, + { + "epoch": 0.41, + "grad_norm": 2.0919576107326283, + "learning_rate": 6.711169313732682e-06, + "loss": 0.6401, + "step": 3544 + }, + { + "epoch": 0.41, + "grad_norm": 2.2922852903103075, + "learning_rate": 6.709420654225176e-06, + "loss": 0.5517, + "step": 3545 + }, + { + "epoch": 0.41, + "grad_norm": 2.8539758846455174, + "learning_rate": 6.707671757929017e-06, + "loss": 0.5938, + "step": 3546 + }, + { + "epoch": 0.41, + "grad_norm": 2.4354653405420237, + "learning_rate": 6.705922625086464e-06, + "loss": 0.5311, + "step": 3547 + }, + { + "epoch": 0.41, + "grad_norm": 2.0892107892444653, + "learning_rate": 6.704173255939802e-06, + "loss": 0.5119, + "step": 3548 + }, + { + "epoch": 0.41, + "grad_norm": 3.097176300931407, + "learning_rate": 6.702423650731357e-06, + "loss": 0.5103, + "step": 3549 + }, + { + "epoch": 0.41, + "grad_norm": 4.017261819744391, + "learning_rate": 6.700673809703483e-06, + "loss": 0.4445, + "step": 3550 + }, + { + "epoch": 0.41, + "grad_norm": 2.0455477324112703, + "learning_rate": 6.698923733098567e-06, + "loss": 0.5001, + "step": 3551 + }, + { + "epoch": 0.41, + "grad_norm": 2.143977955371404, + "learning_rate": 6.697173421159029e-06, + "loss": 0.5956, + "step": 3552 + }, + { + "epoch": 0.41, + "grad_norm": 1.907655264703219, + "learning_rate": 6.695422874127323e-06, + "loss": 0.4919, + "step": 3553 + }, + { + "epoch": 0.41, + "grad_norm": 1.9116452802190234, + "learning_rate": 6.693672092245934e-06, + "loss": 0.4434, + "step": 3554 + }, + { + "epoch": 0.41, + "grad_norm": 1.8767790016093753, + "learning_rate": 6.69192107575738e-06, + "loss": 0.5277, + "step": 3555 + }, + { + "epoch": 0.41, + "grad_norm": 3.2760262127698647, + "learning_rate": 6.6901698249042125e-06, + "loss": 0.4357, + "step": 3556 + }, + { + "epoch": 0.41, + "grad_norm": 1.806160048395752, + "learning_rate": 6.688418339929013e-06, + "loss": 0.5638, + "step": 3557 + }, + { + "epoch": 0.41, + "grad_norm": 1.9132916256769812, + "learning_rate": 6.686666621074398e-06, + "loss": 0.5239, + "step": 3558 + }, + { + "epoch": 0.41, + "grad_norm": 4.294330487449972, + "learning_rate": 6.684914668583016e-06, + "loss": 0.4701, + "step": 3559 + }, + { + "epoch": 0.41, + "grad_norm": 2.104014515987271, + "learning_rate": 6.683162482697544e-06, + "loss": 0.5176, + "step": 3560 + }, + { + "epoch": 0.41, + "grad_norm": 2.124781326346334, + "learning_rate": 6.681410063660696e-06, + "loss": 0.4375, + "step": 3561 + }, + { + "epoch": 0.41, + "grad_norm": 1.8841532214368404, + "learning_rate": 6.679657411715218e-06, + "loss": 0.4961, + "step": 3562 + }, + { + "epoch": 0.41, + "grad_norm": 2.3435669306674405, + "learning_rate": 6.677904527103887e-06, + "loss": 0.4795, + "step": 3563 + }, + { + "epoch": 0.41, + "grad_norm": 1.9887401121875197, + "learning_rate": 6.67615141006951e-06, + "loss": 0.4865, + "step": 3564 + }, + { + "epoch": 0.41, + "grad_norm": 2.051510051685662, + "learning_rate": 6.674398060854931e-06, + "loss": 0.5532, + "step": 3565 + }, + { + "epoch": 0.41, + "grad_norm": 2.0650627174107883, + "learning_rate": 6.6726444797030225e-06, + "loss": 0.5987, + "step": 3566 + }, + { + "epoch": 0.41, + "grad_norm": 2.009636637490451, + "learning_rate": 6.6708906668566906e-06, + "loss": 0.4522, + "step": 3567 + }, + { + "epoch": 0.41, + "grad_norm": 2.0018100801911562, + "learning_rate": 6.669136622558873e-06, + "loss": 0.4932, + "step": 3568 + }, + { + "epoch": 0.41, + "grad_norm": 2.010070154411886, + "learning_rate": 6.667382347052539e-06, + "loss": 0.5174, + "step": 3569 + }, + { + "epoch": 0.41, + "grad_norm": 1.5741815974182063, + "learning_rate": 6.665627840580693e-06, + "loss": 0.4748, + "step": 3570 + }, + { + "epoch": 0.41, + "grad_norm": 3.1475107559208446, + "learning_rate": 6.663873103386365e-06, + "loss": 0.534, + "step": 3571 + }, + { + "epoch": 0.41, + "grad_norm": 2.040702540317953, + "learning_rate": 6.662118135712623e-06, + "loss": 0.6129, + "step": 3572 + }, + { + "epoch": 0.41, + "grad_norm": 1.8581917288236094, + "learning_rate": 6.660362937802565e-06, + "loss": 0.544, + "step": 3573 + }, + { + "epoch": 0.41, + "grad_norm": 2.095329157755988, + "learning_rate": 6.6586075098993196e-06, + "loss": 0.4411, + "step": 3574 + }, + { + "epoch": 0.41, + "grad_norm": 1.960323019417156, + "learning_rate": 6.65685185224605e-06, + "loss": 0.5614, + "step": 3575 + }, + { + "epoch": 0.41, + "grad_norm": 2.397698994196487, + "learning_rate": 6.655095965085949e-06, + "loss": 0.4431, + "step": 3576 + }, + { + "epoch": 0.41, + "grad_norm": 1.8816801739529165, + "learning_rate": 6.6533398486622426e-06, + "loss": 0.5202, + "step": 3577 + }, + { + "epoch": 0.41, + "grad_norm": 1.693662545886062, + "learning_rate": 6.6515835032181855e-06, + "loss": 0.4835, + "step": 3578 + }, + { + "epoch": 0.41, + "grad_norm": 2.0498649909178264, + "learning_rate": 6.649826928997068e-06, + "loss": 0.5628, + "step": 3579 + }, + { + "epoch": 0.41, + "grad_norm": 2.005088661384491, + "learning_rate": 6.648070126242208e-06, + "loss": 0.5108, + "step": 3580 + }, + { + "epoch": 0.41, + "grad_norm": 2.4096309111837146, + "learning_rate": 6.646313095196961e-06, + "loss": 0.4768, + "step": 3581 + }, + { + "epoch": 0.41, + "grad_norm": 1.86545807768766, + "learning_rate": 6.64455583610471e-06, + "loss": 0.4831, + "step": 3582 + }, + { + "epoch": 0.41, + "grad_norm": 2.4341947955179664, + "learning_rate": 6.642798349208869e-06, + "loss": 0.5218, + "step": 3583 + }, + { + "epoch": 0.41, + "grad_norm": 2.0052273143841948, + "learning_rate": 6.6410406347528855e-06, + "loss": 0.5088, + "step": 3584 + }, + { + "epoch": 0.41, + "grad_norm": 2.280938669416697, + "learning_rate": 6.639282692980238e-06, + "loss": 0.5108, + "step": 3585 + }, + { + "epoch": 0.41, + "grad_norm": 8.115208642837691, + "learning_rate": 6.637524524134434e-06, + "loss": 0.5156, + "step": 3586 + }, + { + "epoch": 0.41, + "grad_norm": 2.7269747037326932, + "learning_rate": 6.635766128459018e-06, + "loss": 0.5422, + "step": 3587 + }, + { + "epoch": 0.41, + "grad_norm": 2.9764975785024523, + "learning_rate": 6.63400750619756e-06, + "loss": 0.4564, + "step": 3588 + }, + { + "epoch": 0.41, + "grad_norm": 2.2619094645667053, + "learning_rate": 6.632248657593667e-06, + "loss": 0.516, + "step": 3589 + }, + { + "epoch": 0.41, + "grad_norm": 2.2670843596428365, + "learning_rate": 6.630489582890971e-06, + "loss": 0.5359, + "step": 3590 + }, + { + "epoch": 0.41, + "grad_norm": 1.6387092591940882, + "learning_rate": 6.6287302823331416e-06, + "loss": 0.4776, + "step": 3591 + }, + { + "epoch": 0.41, + "grad_norm": 2.914563819949806, + "learning_rate": 6.626970756163875e-06, + "loss": 0.5411, + "step": 3592 + }, + { + "epoch": 0.41, + "grad_norm": 3.099125446353, + "learning_rate": 6.625211004626901e-06, + "loss": 0.5732, + "step": 3593 + }, + { + "epoch": 0.41, + "grad_norm": 3.0266478571382875, + "learning_rate": 6.623451027965981e-06, + "loss": 0.5713, + "step": 3594 + }, + { + "epoch": 0.41, + "grad_norm": 0.9074625196503217, + "learning_rate": 6.621690826424905e-06, + "loss": 0.7419, + "step": 3595 + }, + { + "epoch": 0.41, + "grad_norm": 2.6930001157900096, + "learning_rate": 6.619930400247496e-06, + "loss": 0.5184, + "step": 3596 + }, + { + "epoch": 0.41, + "grad_norm": 2.403149628756053, + "learning_rate": 6.6181697496776084e-06, + "loss": 0.5476, + "step": 3597 + }, + { + "epoch": 0.41, + "grad_norm": 2.6753464077936635, + "learning_rate": 6.616408874959128e-06, + "loss": 0.4927, + "step": 3598 + }, + { + "epoch": 0.41, + "grad_norm": 2.011851638293724, + "learning_rate": 6.61464777633597e-06, + "loss": 0.4882, + "step": 3599 + }, + { + "epoch": 0.41, + "grad_norm": 1.8113012556320098, + "learning_rate": 6.612886454052082e-06, + "loss": 0.482, + "step": 3600 + }, + { + "epoch": 0.41, + "grad_norm": 2.0067079901524343, + "learning_rate": 6.611124908351443e-06, + "loss": 0.5304, + "step": 3601 + }, + { + "epoch": 0.41, + "grad_norm": 2.949846810206903, + "learning_rate": 6.609363139478059e-06, + "loss": 0.5669, + "step": 3602 + }, + { + "epoch": 0.41, + "grad_norm": 1.9063583331264744, + "learning_rate": 6.607601147675973e-06, + "loss": 0.5319, + "step": 3603 + }, + { + "epoch": 0.41, + "grad_norm": 3.373132647372629, + "learning_rate": 6.605838933189253e-06, + "loss": 0.4831, + "step": 3604 + }, + { + "epoch": 0.41, + "grad_norm": 1.94091936638512, + "learning_rate": 6.604076496262002e-06, + "loss": 0.5384, + "step": 3605 + }, + { + "epoch": 0.41, + "grad_norm": 2.1061532958443396, + "learning_rate": 6.602313837138353e-06, + "loss": 0.3879, + "step": 3606 + }, + { + "epoch": 0.41, + "grad_norm": 1.8652323968864297, + "learning_rate": 6.600550956062469e-06, + "loss": 0.4835, + "step": 3607 + }, + { + "epoch": 0.41, + "grad_norm": 0.8379089410070378, + "learning_rate": 6.598787853278544e-06, + "loss": 0.726, + "step": 3608 + }, + { + "epoch": 0.41, + "grad_norm": 2.346441402606797, + "learning_rate": 6.597024529030803e-06, + "loss": 0.4963, + "step": 3609 + }, + { + "epoch": 0.41, + "grad_norm": 1.5690653496094755, + "learning_rate": 6.5952609835635e-06, + "loss": 0.5655, + "step": 3610 + }, + { + "epoch": 0.41, + "grad_norm": 2.150094313612813, + "learning_rate": 6.5934972171209224e-06, + "loss": 0.4786, + "step": 3611 + }, + { + "epoch": 0.42, + "grad_norm": 2.751278416364143, + "learning_rate": 6.591733229947387e-06, + "loss": 0.5887, + "step": 3612 + }, + { + "epoch": 0.42, + "grad_norm": 1.9687495721572215, + "learning_rate": 6.589969022287239e-06, + "loss": 0.5172, + "step": 3613 + }, + { + "epoch": 0.42, + "grad_norm": 2.7858800377852178, + "learning_rate": 6.588204594384857e-06, + "loss": 0.5255, + "step": 3614 + }, + { + "epoch": 0.42, + "grad_norm": 2.61345197549771, + "learning_rate": 6.586439946484651e-06, + "loss": 0.4937, + "step": 3615 + }, + { + "epoch": 0.42, + "grad_norm": 2.4301578290487353, + "learning_rate": 6.584675078831057e-06, + "loss": 0.4192, + "step": 3616 + }, + { + "epoch": 0.42, + "grad_norm": 2.0287578369206996, + "learning_rate": 6.582909991668547e-06, + "loss": 0.5568, + "step": 3617 + }, + { + "epoch": 0.42, + "grad_norm": 1.5613951474624528, + "learning_rate": 6.581144685241619e-06, + "loss": 0.4675, + "step": 3618 + }, + { + "epoch": 0.42, + "grad_norm": 1.9929696525055807, + "learning_rate": 6.579379159794802e-06, + "loss": 0.5019, + "step": 3619 + }, + { + "epoch": 0.42, + "grad_norm": 5.795420442194661, + "learning_rate": 6.577613415572658e-06, + "loss": 0.4498, + "step": 3620 + }, + { + "epoch": 0.42, + "grad_norm": 2.2212609121392903, + "learning_rate": 6.575847452819777e-06, + "loss": 0.5215, + "step": 3621 + }, + { + "epoch": 0.42, + "grad_norm": 2.0003312678826775, + "learning_rate": 6.574081271780779e-06, + "loss": 0.5995, + "step": 3622 + }, + { + "epoch": 0.42, + "grad_norm": 1.9922345024127028, + "learning_rate": 6.572314872700316e-06, + "loss": 0.5012, + "step": 3623 + }, + { + "epoch": 0.42, + "grad_norm": 2.446742448296675, + "learning_rate": 6.570548255823071e-06, + "loss": 0.5125, + "step": 3624 + }, + { + "epoch": 0.42, + "grad_norm": 2.0341673993421847, + "learning_rate": 6.568781421393751e-06, + "loss": 0.604, + "step": 3625 + }, + { + "epoch": 0.42, + "grad_norm": 4.063986166195195, + "learning_rate": 6.567014369657102e-06, + "loss": 0.4717, + "step": 3626 + }, + { + "epoch": 0.42, + "grad_norm": 2.2571993278317826, + "learning_rate": 6.565247100857893e-06, + "loss": 0.5973, + "step": 3627 + }, + { + "epoch": 0.42, + "grad_norm": 1.698173396942107, + "learning_rate": 6.563479615240928e-06, + "loss": 0.5376, + "step": 3628 + }, + { + "epoch": 0.42, + "grad_norm": 2.4394806105958624, + "learning_rate": 6.561711913051037e-06, + "loss": 0.5523, + "step": 3629 + }, + { + "epoch": 0.42, + "grad_norm": 3.8721079208835625, + "learning_rate": 6.55994399453308e-06, + "loss": 0.4127, + "step": 3630 + }, + { + "epoch": 0.42, + "grad_norm": 2.9283098952033346, + "learning_rate": 6.558175859931953e-06, + "loss": 0.4671, + "step": 3631 + }, + { + "epoch": 0.42, + "grad_norm": 2.174395118184736, + "learning_rate": 6.556407509492577e-06, + "loss": 0.5461, + "step": 3632 + }, + { + "epoch": 0.42, + "grad_norm": 2.096482074619806, + "learning_rate": 6.554638943459901e-06, + "loss": 0.5073, + "step": 3633 + }, + { + "epoch": 0.42, + "grad_norm": 0.9572624035740518, + "learning_rate": 6.552870162078908e-06, + "loss": 0.7498, + "step": 3634 + }, + { + "epoch": 0.42, + "grad_norm": 1.9455719961594937, + "learning_rate": 6.55110116559461e-06, + "loss": 0.5788, + "step": 3635 + }, + { + "epoch": 0.42, + "grad_norm": 2.5758310580556256, + "learning_rate": 6.5493319542520455e-06, + "loss": 0.4695, + "step": 3636 + }, + { + "epoch": 0.42, + "grad_norm": 1.8744509713540964, + "learning_rate": 6.547562528296287e-06, + "loss": 0.598, + "step": 3637 + }, + { + "epoch": 0.42, + "grad_norm": 2.5164898336705903, + "learning_rate": 6.545792887972436e-06, + "loss": 0.565, + "step": 3638 + }, + { + "epoch": 0.42, + "grad_norm": 1.8707105387309158, + "learning_rate": 6.544023033525622e-06, + "loss": 0.5342, + "step": 3639 + }, + { + "epoch": 0.42, + "grad_norm": 2.147616476187572, + "learning_rate": 6.542252965201005e-06, + "loss": 0.5411, + "step": 3640 + }, + { + "epoch": 0.42, + "grad_norm": 5.379962658006673, + "learning_rate": 6.540482683243774e-06, + "loss": 0.5577, + "step": 3641 + }, + { + "epoch": 0.42, + "grad_norm": 2.672515128821014, + "learning_rate": 6.5387121878991475e-06, + "loss": 0.4223, + "step": 3642 + }, + { + "epoch": 0.42, + "grad_norm": 2.2887278315237816, + "learning_rate": 6.536941479412377e-06, + "loss": 0.4323, + "step": 3643 + }, + { + "epoch": 0.42, + "grad_norm": 1.82695057323536, + "learning_rate": 6.535170558028738e-06, + "loss": 0.4689, + "step": 3644 + }, + { + "epoch": 0.42, + "grad_norm": 1.648996029973481, + "learning_rate": 6.53339942399354e-06, + "loss": 0.4637, + "step": 3645 + }, + { + "epoch": 0.42, + "grad_norm": 1.821127624252996, + "learning_rate": 6.531628077552119e-06, + "loss": 0.5525, + "step": 3646 + }, + { + "epoch": 0.42, + "grad_norm": 1.847745786755322, + "learning_rate": 6.5298565189498415e-06, + "loss": 0.4647, + "step": 3647 + }, + { + "epoch": 0.42, + "grad_norm": 6.038774898396566, + "learning_rate": 6.528084748432104e-06, + "loss": 0.44, + "step": 3648 + }, + { + "epoch": 0.42, + "grad_norm": 1.8695899302635066, + "learning_rate": 6.526312766244331e-06, + "loss": 0.4647, + "step": 3649 + }, + { + "epoch": 0.42, + "grad_norm": 1.8633588340536484, + "learning_rate": 6.52454057263198e-06, + "loss": 0.5182, + "step": 3650 + }, + { + "epoch": 0.42, + "grad_norm": 2.364247034736832, + "learning_rate": 6.522768167840532e-06, + "loss": 0.5467, + "step": 3651 + }, + { + "epoch": 0.42, + "grad_norm": 3.9809139324133564, + "learning_rate": 6.520995552115502e-06, + "loss": 0.6104, + "step": 3652 + }, + { + "epoch": 0.42, + "grad_norm": 2.2045223862495713, + "learning_rate": 6.519222725702431e-06, + "loss": 0.5866, + "step": 3653 + }, + { + "epoch": 0.42, + "grad_norm": 1.9938558985883759, + "learning_rate": 6.517449688846891e-06, + "loss": 0.4443, + "step": 3654 + }, + { + "epoch": 0.42, + "grad_norm": 1.8348027498826385, + "learning_rate": 6.515676441794483e-06, + "loss": 0.434, + "step": 3655 + }, + { + "epoch": 0.42, + "grad_norm": 3.245963541690037, + "learning_rate": 6.513902984790837e-06, + "loss": 0.5046, + "step": 3656 + }, + { + "epoch": 0.42, + "grad_norm": 2.849058738766952, + "learning_rate": 6.5121293180816105e-06, + "loss": 0.4835, + "step": 3657 + }, + { + "epoch": 0.42, + "grad_norm": 2.0078270602152215, + "learning_rate": 6.510355441912493e-06, + "loss": 0.5005, + "step": 3658 + }, + { + "epoch": 0.42, + "grad_norm": 2.3606096710817908, + "learning_rate": 6.508581356529202e-06, + "loss": 0.4694, + "step": 3659 + }, + { + "epoch": 0.42, + "grad_norm": 2.1194421989665924, + "learning_rate": 6.5068070621774844e-06, + "loss": 0.4459, + "step": 3660 + }, + { + "epoch": 0.42, + "grad_norm": 2.0064534227836988, + "learning_rate": 6.5050325591031115e-06, + "loss": 0.4768, + "step": 3661 + }, + { + "epoch": 0.42, + "grad_norm": 2.3046584800140186, + "learning_rate": 6.5032578475518895e-06, + "loss": 0.4392, + "step": 3662 + }, + { + "epoch": 0.42, + "grad_norm": 2.0347630112450745, + "learning_rate": 6.501482927769651e-06, + "loss": 0.6024, + "step": 3663 + }, + { + "epoch": 0.42, + "grad_norm": 2.9713630177246495, + "learning_rate": 6.4997078000022575e-06, + "loss": 0.4754, + "step": 3664 + }, + { + "epoch": 0.42, + "grad_norm": 1.9089597407517178, + "learning_rate": 6.497932464495599e-06, + "loss": 0.5567, + "step": 3665 + }, + { + "epoch": 0.42, + "grad_norm": 1.908761823603585, + "learning_rate": 6.496156921495594e-06, + "loss": 0.6325, + "step": 3666 + }, + { + "epoch": 0.42, + "grad_norm": 1.9686657473336888, + "learning_rate": 6.494381171248193e-06, + "loss": 0.5544, + "step": 3667 + }, + { + "epoch": 0.42, + "grad_norm": 0.827232493903466, + "learning_rate": 6.4926052139993715e-06, + "loss": 0.7387, + "step": 3668 + }, + { + "epoch": 0.42, + "grad_norm": 1.9612875277610882, + "learning_rate": 6.490829049995133e-06, + "loss": 0.5389, + "step": 3669 + }, + { + "epoch": 0.42, + "grad_norm": 1.5884267008321, + "learning_rate": 6.489052679481513e-06, + "loss": 0.419, + "step": 3670 + }, + { + "epoch": 0.42, + "grad_norm": 2.0533336961851147, + "learning_rate": 6.4872761027045735e-06, + "loss": 0.539, + "step": 3671 + }, + { + "epoch": 0.42, + "grad_norm": 2.9597899138807566, + "learning_rate": 6.485499319910405e-06, + "loss": 0.5594, + "step": 3672 + }, + { + "epoch": 0.42, + "grad_norm": 2.0261226240708834, + "learning_rate": 6.4837223313451304e-06, + "loss": 0.5679, + "step": 3673 + }, + { + "epoch": 0.42, + "grad_norm": 1.9244182240950856, + "learning_rate": 6.4819451372548945e-06, + "loss": 0.6026, + "step": 3674 + }, + { + "epoch": 0.42, + "grad_norm": 0.9419572010855228, + "learning_rate": 6.480167737885874e-06, + "loss": 0.759, + "step": 3675 + }, + { + "epoch": 0.42, + "grad_norm": 2.0101465849198905, + "learning_rate": 6.478390133484276e-06, + "loss": 0.5583, + "step": 3676 + }, + { + "epoch": 0.42, + "grad_norm": 2.0882298301855386, + "learning_rate": 6.476612324296332e-06, + "loss": 0.4932, + "step": 3677 + }, + { + "epoch": 0.42, + "grad_norm": 1.8358492213126896, + "learning_rate": 6.474834310568305e-06, + "loss": 0.4862, + "step": 3678 + }, + { + "epoch": 0.42, + "grad_norm": 1.7675586813133983, + "learning_rate": 6.473056092546485e-06, + "loss": 0.5083, + "step": 3679 + }, + { + "epoch": 0.42, + "grad_norm": 2.0811118086280653, + "learning_rate": 6.471277670477189e-06, + "loss": 0.5485, + "step": 3680 + }, + { + "epoch": 0.42, + "grad_norm": 1.938056337446877, + "learning_rate": 6.469499044606765e-06, + "loss": 0.5168, + "step": 3681 + }, + { + "epoch": 0.42, + "grad_norm": 1.7427343905518542, + "learning_rate": 6.467720215181589e-06, + "loss": 0.5109, + "step": 3682 + }, + { + "epoch": 0.42, + "grad_norm": 3.250726862221631, + "learning_rate": 6.4659411824480625e-06, + "loss": 0.4989, + "step": 3683 + }, + { + "epoch": 0.42, + "grad_norm": 2.078709845925251, + "learning_rate": 6.4641619466526166e-06, + "loss": 0.4971, + "step": 3684 + }, + { + "epoch": 0.42, + "grad_norm": 1.974968026225244, + "learning_rate": 6.462382508041714e-06, + "loss": 0.5046, + "step": 3685 + }, + { + "epoch": 0.42, + "grad_norm": 1.6572870003968732, + "learning_rate": 6.460602866861836e-06, + "loss": 0.5854, + "step": 3686 + }, + { + "epoch": 0.42, + "grad_norm": 1.9993132308170822, + "learning_rate": 6.458823023359504e-06, + "loss": 0.4972, + "step": 3687 + }, + { + "epoch": 0.42, + "grad_norm": 2.2077807862892564, + "learning_rate": 6.45704297778126e-06, + "loss": 0.5298, + "step": 3688 + }, + { + "epoch": 0.42, + "grad_norm": 2.4557952690465075, + "learning_rate": 6.455262730373673e-06, + "loss": 0.4567, + "step": 3689 + }, + { + "epoch": 0.42, + "grad_norm": 2.7828039530973063, + "learning_rate": 6.453482281383346e-06, + "loss": 0.5683, + "step": 3690 + }, + { + "epoch": 0.42, + "grad_norm": 2.0117046421260816, + "learning_rate": 6.451701631056905e-06, + "loss": 0.5407, + "step": 3691 + }, + { + "epoch": 0.42, + "grad_norm": 1.6056689280607068, + "learning_rate": 6.449920779641005e-06, + "loss": 0.5058, + "step": 3692 + }, + { + "epoch": 0.42, + "grad_norm": 2.111117675155408, + "learning_rate": 6.4481397273823294e-06, + "loss": 0.4929, + "step": 3693 + }, + { + "epoch": 0.42, + "grad_norm": 1.7061020317167797, + "learning_rate": 6.446358474527592e-06, + "loss": 0.51, + "step": 3694 + }, + { + "epoch": 0.42, + "grad_norm": 2.1274694349672507, + "learning_rate": 6.444577021323528e-06, + "loss": 0.5032, + "step": 3695 + }, + { + "epoch": 0.42, + "grad_norm": 2.231329185570803, + "learning_rate": 6.442795368016904e-06, + "loss": 0.4045, + "step": 3696 + }, + { + "epoch": 0.42, + "grad_norm": 2.309000293890529, + "learning_rate": 6.441013514854517e-06, + "loss": 0.5624, + "step": 3697 + }, + { + "epoch": 0.42, + "grad_norm": 1.6826589101233216, + "learning_rate": 6.439231462083187e-06, + "loss": 0.4769, + "step": 3698 + }, + { + "epoch": 0.43, + "grad_norm": 1.8214224108287267, + "learning_rate": 6.437449209949764e-06, + "loss": 0.4817, + "step": 3699 + }, + { + "epoch": 0.43, + "grad_norm": 0.8737291048765755, + "learning_rate": 6.4356667587011256e-06, + "loss": 0.7071, + "step": 3700 + }, + { + "epoch": 0.43, + "grad_norm": 3.013278207361291, + "learning_rate": 6.4338841085841765e-06, + "loss": 0.56, + "step": 3701 + }, + { + "epoch": 0.43, + "grad_norm": 1.830787167445565, + "learning_rate": 6.432101259845849e-06, + "loss": 0.5148, + "step": 3702 + }, + { + "epoch": 0.43, + "grad_norm": 2.214672267499787, + "learning_rate": 6.430318212733103e-06, + "loss": 0.5129, + "step": 3703 + }, + { + "epoch": 0.43, + "grad_norm": 2.115212816102047, + "learning_rate": 6.428534967492926e-06, + "loss": 0.4162, + "step": 3704 + }, + { + "epoch": 0.43, + "grad_norm": 1.903363189742424, + "learning_rate": 6.426751524372332e-06, + "loss": 0.4495, + "step": 3705 + }, + { + "epoch": 0.43, + "grad_norm": 2.167483502275897, + "learning_rate": 6.4249678836183645e-06, + "loss": 0.5339, + "step": 3706 + }, + { + "epoch": 0.43, + "grad_norm": 2.085569391317056, + "learning_rate": 6.423184045478093e-06, + "loss": 0.4929, + "step": 3707 + }, + { + "epoch": 0.43, + "grad_norm": 1.8896756208843106, + "learning_rate": 6.421400010198613e-06, + "loss": 0.4907, + "step": 3708 + }, + { + "epoch": 0.43, + "grad_norm": 1.8934048982976273, + "learning_rate": 6.419615778027051e-06, + "loss": 0.4848, + "step": 3709 + }, + { + "epoch": 0.43, + "grad_norm": 2.6138143611447546, + "learning_rate": 6.417831349210556e-06, + "loss": 0.5926, + "step": 3710 + }, + { + "epoch": 0.43, + "grad_norm": 1.774365432083248, + "learning_rate": 6.41604672399631e-06, + "loss": 0.4755, + "step": 3711 + }, + { + "epoch": 0.43, + "grad_norm": 2.379687013158279, + "learning_rate": 6.414261902631515e-06, + "loss": 0.5975, + "step": 3712 + }, + { + "epoch": 0.43, + "grad_norm": 1.8288929487264844, + "learning_rate": 6.412476885363407e-06, + "loss": 0.4246, + "step": 3713 + }, + { + "epoch": 0.43, + "grad_norm": 1.9723436058577197, + "learning_rate": 6.410691672439246e-06, + "loss": 0.5106, + "step": 3714 + }, + { + "epoch": 0.43, + "grad_norm": 1.8311296326695852, + "learning_rate": 6.4089062641063175e-06, + "loss": 0.4926, + "step": 3715 + }, + { + "epoch": 0.43, + "grad_norm": 1.737177848721453, + "learning_rate": 6.407120660611938e-06, + "loss": 0.4994, + "step": 3716 + }, + { + "epoch": 0.43, + "grad_norm": 1.7017674137055212, + "learning_rate": 6.40533486220345e-06, + "loss": 0.4946, + "step": 3717 + }, + { + "epoch": 0.43, + "grad_norm": 1.781135263217238, + "learning_rate": 6.403548869128218e-06, + "loss": 0.4597, + "step": 3718 + }, + { + "epoch": 0.43, + "grad_norm": 2.017291109682776, + "learning_rate": 6.401762681633641e-06, + "loss": 0.3983, + "step": 3719 + }, + { + "epoch": 0.43, + "grad_norm": 1.7392309935548593, + "learning_rate": 6.39997629996714e-06, + "loss": 0.4225, + "step": 3720 + }, + { + "epoch": 0.43, + "grad_norm": 2.259780252532224, + "learning_rate": 6.398189724376165e-06, + "loss": 0.6144, + "step": 3721 + }, + { + "epoch": 0.43, + "grad_norm": 2.3282443312515295, + "learning_rate": 6.39640295510819e-06, + "loss": 0.5528, + "step": 3722 + }, + { + "epoch": 0.43, + "grad_norm": 1.803462222240754, + "learning_rate": 6.39461599241072e-06, + "loss": 0.5295, + "step": 3723 + }, + { + "epoch": 0.43, + "grad_norm": 1.9366763207437103, + "learning_rate": 6.392828836531284e-06, + "loss": 0.4919, + "step": 3724 + }, + { + "epoch": 0.43, + "grad_norm": 2.185447776765421, + "learning_rate": 6.39104148771744e-06, + "loss": 0.4678, + "step": 3725 + }, + { + "epoch": 0.43, + "grad_norm": 2.258509865484344, + "learning_rate": 6.389253946216769e-06, + "loss": 0.5785, + "step": 3726 + }, + { + "epoch": 0.43, + "grad_norm": 2.20629079580278, + "learning_rate": 6.387466212276882e-06, + "loss": 0.5292, + "step": 3727 + }, + { + "epoch": 0.43, + "grad_norm": 2.2954860250216815, + "learning_rate": 6.385678286145417e-06, + "loss": 0.6209, + "step": 3728 + }, + { + "epoch": 0.43, + "grad_norm": 1.5767189945645126, + "learning_rate": 6.383890168070035e-06, + "loss": 0.546, + "step": 3729 + }, + { + "epoch": 0.43, + "grad_norm": 2.149308894538845, + "learning_rate": 6.382101858298425e-06, + "loss": 0.5018, + "step": 3730 + }, + { + "epoch": 0.43, + "grad_norm": 1.6840931851694483, + "learning_rate": 6.380313357078307e-06, + "loss": 0.607, + "step": 3731 + }, + { + "epoch": 0.43, + "grad_norm": 1.8772501221990776, + "learning_rate": 6.378524664657421e-06, + "loss": 0.4405, + "step": 3732 + }, + { + "epoch": 0.43, + "grad_norm": 2.1725162050613873, + "learning_rate": 6.376735781283537e-06, + "loss": 0.5541, + "step": 3733 + }, + { + "epoch": 0.43, + "grad_norm": 1.8324553652452498, + "learning_rate": 6.374946707204452e-06, + "loss": 0.4716, + "step": 3734 + }, + { + "epoch": 0.43, + "grad_norm": 1.8863912575458164, + "learning_rate": 6.373157442667985e-06, + "loss": 0.4478, + "step": 3735 + }, + { + "epoch": 0.43, + "grad_norm": 1.6241930444624073, + "learning_rate": 6.37136798792199e-06, + "loss": 0.5152, + "step": 3736 + }, + { + "epoch": 0.43, + "grad_norm": 0.8640154340332881, + "learning_rate": 6.369578343214337e-06, + "loss": 0.6878, + "step": 3737 + }, + { + "epoch": 0.43, + "grad_norm": 2.864004569774421, + "learning_rate": 6.36778850879293e-06, + "loss": 0.4185, + "step": 3738 + }, + { + "epoch": 0.43, + "grad_norm": 2.371126264770795, + "learning_rate": 6.3659984849056965e-06, + "loss": 0.6347, + "step": 3739 + }, + { + "epoch": 0.43, + "grad_norm": 1.8712028043288276, + "learning_rate": 6.3642082718005885e-06, + "loss": 0.5452, + "step": 3740 + }, + { + "epoch": 0.43, + "grad_norm": 2.3567129056905043, + "learning_rate": 6.362417869725586e-06, + "loss": 0.5239, + "step": 3741 + }, + { + "epoch": 0.43, + "grad_norm": 4.64344631950651, + "learning_rate": 6.360627278928697e-06, + "loss": 0.6168, + "step": 3742 + }, + { + "epoch": 0.43, + "grad_norm": 1.8133767567419796, + "learning_rate": 6.358836499657952e-06, + "loss": 0.4792, + "step": 3743 + }, + { + "epoch": 0.43, + "grad_norm": 2.1163142816217655, + "learning_rate": 6.357045532161412e-06, + "loss": 0.5042, + "step": 3744 + }, + { + "epoch": 0.43, + "grad_norm": 2.0854392069726337, + "learning_rate": 6.3552543766871585e-06, + "loss": 0.5545, + "step": 3745 + }, + { + "epoch": 0.43, + "grad_norm": 2.8611455570150977, + "learning_rate": 6.353463033483305e-06, + "loss": 0.4894, + "step": 3746 + }, + { + "epoch": 0.43, + "grad_norm": 2.5649393682942025, + "learning_rate": 6.351671502797986e-06, + "loss": 0.4545, + "step": 3747 + }, + { + "epoch": 0.43, + "grad_norm": 2.350993165228112, + "learning_rate": 6.349879784879364e-06, + "loss": 0.5537, + "step": 3748 + }, + { + "epoch": 0.43, + "grad_norm": 3.528251394467677, + "learning_rate": 6.348087879975627e-06, + "loss": 0.5819, + "step": 3749 + }, + { + "epoch": 0.43, + "grad_norm": 1.8639327564428918, + "learning_rate": 6.3462957883349915e-06, + "loss": 0.5555, + "step": 3750 + }, + { + "epoch": 0.43, + "grad_norm": 3.015975945110439, + "learning_rate": 6.344503510205697e-06, + "loss": 0.5727, + "step": 3751 + }, + { + "epoch": 0.43, + "grad_norm": 1.9868670400549004, + "learning_rate": 6.342711045836008e-06, + "loss": 0.5694, + "step": 3752 + }, + { + "epoch": 0.43, + "grad_norm": 1.7647861416784019, + "learning_rate": 6.34091839547422e-06, + "loss": 0.5391, + "step": 3753 + }, + { + "epoch": 0.43, + "grad_norm": 1.5520820992026816, + "learning_rate": 6.339125559368647e-06, + "loss": 0.5516, + "step": 3754 + }, + { + "epoch": 0.43, + "grad_norm": 1.8324978713053337, + "learning_rate": 6.337332537767632e-06, + "loss": 0.5738, + "step": 3755 + }, + { + "epoch": 0.43, + "grad_norm": 0.8687888923814248, + "learning_rate": 6.3355393309195465e-06, + "loss": 0.7262, + "step": 3756 + }, + { + "epoch": 0.43, + "grad_norm": 2.0095672102802804, + "learning_rate": 6.333745939072784e-06, + "loss": 0.4637, + "step": 3757 + }, + { + "epoch": 0.43, + "grad_norm": 1.173507736904005, + "learning_rate": 6.331952362475765e-06, + "loss": 0.7386, + "step": 3758 + }, + { + "epoch": 0.43, + "grad_norm": 1.91568365322517, + "learning_rate": 6.3301586013769365e-06, + "loss": 0.4145, + "step": 3759 + }, + { + "epoch": 0.43, + "grad_norm": 1.8477133527947616, + "learning_rate": 6.328364656024768e-06, + "loss": 0.4958, + "step": 3760 + }, + { + "epoch": 0.43, + "grad_norm": 1.7953006514160414, + "learning_rate": 6.3265705266677565e-06, + "loss": 0.4897, + "step": 3761 + }, + { + "epoch": 0.43, + "grad_norm": 1.9565416312614845, + "learning_rate": 6.324776213554428e-06, + "loss": 0.4435, + "step": 3762 + }, + { + "epoch": 0.43, + "grad_norm": 2.4445390497054458, + "learning_rate": 6.3229817169333266e-06, + "loss": 0.6382, + "step": 3763 + }, + { + "epoch": 0.43, + "grad_norm": 1.7488835906128346, + "learning_rate": 6.321187037053026e-06, + "loss": 0.4781, + "step": 3764 + }, + { + "epoch": 0.43, + "grad_norm": 1.8741922903853565, + "learning_rate": 6.319392174162125e-06, + "loss": 0.5864, + "step": 3765 + }, + { + "epoch": 0.43, + "grad_norm": 2.2287394702839265, + "learning_rate": 6.317597128509251e-06, + "loss": 0.5154, + "step": 3766 + }, + { + "epoch": 0.43, + "grad_norm": 4.391914166356338, + "learning_rate": 6.3158019003430495e-06, + "loss": 0.5943, + "step": 3767 + }, + { + "epoch": 0.43, + "grad_norm": 1.6914606283809621, + "learning_rate": 6.314006489912197e-06, + "loss": 0.4725, + "step": 3768 + }, + { + "epoch": 0.43, + "grad_norm": 2.8589368004384728, + "learning_rate": 6.3122108974653924e-06, + "loss": 0.5432, + "step": 3769 + }, + { + "epoch": 0.43, + "grad_norm": 2.5149163323222976, + "learning_rate": 6.310415123251364e-06, + "loss": 0.6049, + "step": 3770 + }, + { + "epoch": 0.43, + "grad_norm": 2.7516423580270115, + "learning_rate": 6.308619167518858e-06, + "loss": 0.5448, + "step": 3771 + }, + { + "epoch": 0.43, + "grad_norm": 2.7385148009610685, + "learning_rate": 6.306823030516651e-06, + "loss": 0.4671, + "step": 3772 + }, + { + "epoch": 0.43, + "grad_norm": 2.3928864953584217, + "learning_rate": 6.305026712493545e-06, + "loss": 0.4389, + "step": 3773 + }, + { + "epoch": 0.43, + "grad_norm": 1.8338848402993722, + "learning_rate": 6.3032302136983646e-06, + "loss": 0.5316, + "step": 3774 + }, + { + "epoch": 0.43, + "grad_norm": 2.000738083338805, + "learning_rate": 6.301433534379961e-06, + "loss": 0.5144, + "step": 3775 + }, + { + "epoch": 0.43, + "grad_norm": 2.8774769853039026, + "learning_rate": 6.299636674787208e-06, + "loss": 0.5199, + "step": 3776 + }, + { + "epoch": 0.43, + "grad_norm": 1.7879066582685983, + "learning_rate": 6.297839635169009e-06, + "loss": 0.4689, + "step": 3777 + }, + { + "epoch": 0.43, + "grad_norm": 1.74002426376618, + "learning_rate": 6.29604241577429e-06, + "loss": 0.4719, + "step": 3778 + }, + { + "epoch": 0.43, + "grad_norm": 2.6138881538542127, + "learning_rate": 6.2942450168519996e-06, + "loss": 0.5428, + "step": 3779 + }, + { + "epoch": 0.43, + "grad_norm": 2.3746783202444277, + "learning_rate": 6.292447438651112e-06, + "loss": 0.5243, + "step": 3780 + }, + { + "epoch": 0.43, + "grad_norm": 2.9507871483620898, + "learning_rate": 6.290649681420631e-06, + "loss": 0.4889, + "step": 3781 + }, + { + "epoch": 0.43, + "grad_norm": 2.8901834225395278, + "learning_rate": 6.288851745409578e-06, + "loss": 0.4584, + "step": 3782 + }, + { + "epoch": 0.43, + "grad_norm": 2.583051358633804, + "learning_rate": 6.287053630867006e-06, + "loss": 0.5043, + "step": 3783 + }, + { + "epoch": 0.43, + "grad_norm": 1.8014333270356484, + "learning_rate": 6.285255338041987e-06, + "loss": 0.4925, + "step": 3784 + }, + { + "epoch": 0.43, + "grad_norm": 2.8209157504101245, + "learning_rate": 6.283456867183622e-06, + "loss": 0.5989, + "step": 3785 + }, + { + "epoch": 0.44, + "grad_norm": 2.4362382331503003, + "learning_rate": 6.281658218541032e-06, + "loss": 0.4379, + "step": 3786 + }, + { + "epoch": 0.44, + "grad_norm": 2.0355770411998937, + "learning_rate": 6.27985939236337e-06, + "loss": 0.4979, + "step": 3787 + }, + { + "epoch": 0.44, + "grad_norm": 2.139830492876621, + "learning_rate": 6.278060388899805e-06, + "loss": 0.3805, + "step": 3788 + }, + { + "epoch": 0.44, + "grad_norm": 2.4295413368290024, + "learning_rate": 6.276261208399536e-06, + "loss": 0.5213, + "step": 3789 + }, + { + "epoch": 0.44, + "grad_norm": 2.6167326508652153, + "learning_rate": 6.274461851111787e-06, + "loss": 0.5089, + "step": 3790 + }, + { + "epoch": 0.44, + "grad_norm": 1.7260417984423884, + "learning_rate": 6.272662317285802e-06, + "loss": 0.3962, + "step": 3791 + }, + { + "epoch": 0.44, + "grad_norm": 1.7017241814747766, + "learning_rate": 6.270862607170854e-06, + "loss": 0.4707, + "step": 3792 + }, + { + "epoch": 0.44, + "grad_norm": 2.041661017396624, + "learning_rate": 6.269062721016237e-06, + "loss": 0.4871, + "step": 3793 + }, + { + "epoch": 0.44, + "grad_norm": 1.873489150120475, + "learning_rate": 6.267262659071273e-06, + "loss": 0.4834, + "step": 3794 + }, + { + "epoch": 0.44, + "grad_norm": 2.3659072297258232, + "learning_rate": 6.265462421585304e-06, + "loss": 0.4675, + "step": 3795 + }, + { + "epoch": 0.44, + "grad_norm": 1.9575804561535535, + "learning_rate": 6.2636620088077e-06, + "loss": 0.5542, + "step": 3796 + }, + { + "epoch": 0.44, + "grad_norm": 2.5259495034663435, + "learning_rate": 6.261861420987853e-06, + "loss": 0.5677, + "step": 3797 + }, + { + "epoch": 0.44, + "grad_norm": 3.3645814052454455, + "learning_rate": 6.26006065837518e-06, + "loss": 0.5263, + "step": 3798 + }, + { + "epoch": 0.44, + "grad_norm": 2.275796972363486, + "learning_rate": 6.258259721219125e-06, + "loss": 0.4225, + "step": 3799 + }, + { + "epoch": 0.44, + "grad_norm": 1.806674260234892, + "learning_rate": 6.2564586097691485e-06, + "loss": 0.5176, + "step": 3800 + }, + { + "epoch": 0.44, + "grad_norm": 2.991372843305997, + "learning_rate": 6.2546573242747455e-06, + "loss": 0.511, + "step": 3801 + }, + { + "epoch": 0.44, + "grad_norm": 2.22078520290855, + "learning_rate": 6.252855864985425e-06, + "loss": 0.5603, + "step": 3802 + }, + { + "epoch": 0.44, + "grad_norm": 2.55186548266212, + "learning_rate": 6.251054232150728e-06, + "loss": 0.4304, + "step": 3803 + }, + { + "epoch": 0.44, + "grad_norm": 2.4471899547208684, + "learning_rate": 6.249252426020217e-06, + "loss": 0.5718, + "step": 3804 + }, + { + "epoch": 0.44, + "grad_norm": 1.6293331107990483, + "learning_rate": 6.2474504468434745e-06, + "loss": 0.5235, + "step": 3805 + }, + { + "epoch": 0.44, + "grad_norm": 1.9283970961487356, + "learning_rate": 6.245648294870112e-06, + "loss": 0.5254, + "step": 3806 + }, + { + "epoch": 0.44, + "grad_norm": 1.5080090258814565, + "learning_rate": 6.243845970349764e-06, + "loss": 0.5455, + "step": 3807 + }, + { + "epoch": 0.44, + "grad_norm": 2.4439307809299704, + "learning_rate": 6.242043473532088e-06, + "loss": 0.4383, + "step": 3808 + }, + { + "epoch": 0.44, + "grad_norm": 1.9035877540212305, + "learning_rate": 6.240240804666765e-06, + "loss": 0.4485, + "step": 3809 + }, + { + "epoch": 0.44, + "grad_norm": 2.0610210174697943, + "learning_rate": 6.2384379640034994e-06, + "loss": 0.4874, + "step": 3810 + }, + { + "epoch": 0.44, + "grad_norm": 1.6562402578770676, + "learning_rate": 6.236634951792023e-06, + "loss": 0.5006, + "step": 3811 + }, + { + "epoch": 0.44, + "grad_norm": 1.7588003962779382, + "learning_rate": 6.234831768282088e-06, + "loss": 0.5282, + "step": 3812 + }, + { + "epoch": 0.44, + "grad_norm": 2.2248111168867855, + "learning_rate": 6.2330284137234685e-06, + "loss": 0.5074, + "step": 3813 + }, + { + "epoch": 0.44, + "grad_norm": 1.7488561137757463, + "learning_rate": 6.231224888365968e-06, + "loss": 0.4907, + "step": 3814 + }, + { + "epoch": 0.44, + "grad_norm": 2.6829937035558102, + "learning_rate": 6.229421192459408e-06, + "loss": 0.5216, + "step": 3815 + }, + { + "epoch": 0.44, + "grad_norm": 1.9353687068888177, + "learning_rate": 6.227617326253638e-06, + "loss": 0.5838, + "step": 3816 + }, + { + "epoch": 0.44, + "grad_norm": 1.6451312426968963, + "learning_rate": 6.225813289998528e-06, + "loss": 0.5444, + "step": 3817 + }, + { + "epoch": 0.44, + "grad_norm": 2.7162030540676056, + "learning_rate": 6.224009083943973e-06, + "loss": 0.4303, + "step": 3818 + }, + { + "epoch": 0.44, + "grad_norm": 1.9861443789982134, + "learning_rate": 6.222204708339893e-06, + "loss": 0.4693, + "step": 3819 + }, + { + "epoch": 0.44, + "grad_norm": 1.9965815139772727, + "learning_rate": 6.220400163436228e-06, + "loss": 0.4434, + "step": 3820 + }, + { + "epoch": 0.44, + "grad_norm": 2.305594901880178, + "learning_rate": 6.218595449482945e-06, + "loss": 0.5039, + "step": 3821 + }, + { + "epoch": 0.44, + "grad_norm": 1.7473432684601102, + "learning_rate": 6.21679056673003e-06, + "loss": 0.5352, + "step": 3822 + }, + { + "epoch": 0.44, + "grad_norm": 2.184967971802951, + "learning_rate": 6.2149855154274965e-06, + "loss": 0.4816, + "step": 3823 + }, + { + "epoch": 0.44, + "grad_norm": 0.8565031989209765, + "learning_rate": 6.2131802958253805e-06, + "loss": 0.7019, + "step": 3824 + }, + { + "epoch": 0.44, + "grad_norm": 1.9323659347596283, + "learning_rate": 6.2113749081737396e-06, + "loss": 0.5781, + "step": 3825 + }, + { + "epoch": 0.44, + "grad_norm": 2.2086475064706663, + "learning_rate": 6.209569352722657e-06, + "loss": 0.5641, + "step": 3826 + }, + { + "epoch": 0.44, + "grad_norm": 1.9465680591827748, + "learning_rate": 6.2077636297222355e-06, + "loss": 0.5193, + "step": 3827 + }, + { + "epoch": 0.44, + "grad_norm": 1.4454648734132904, + "learning_rate": 6.2059577394226056e-06, + "loss": 0.4375, + "step": 3828 + }, + { + "epoch": 0.44, + "grad_norm": 1.8245885515136018, + "learning_rate": 6.20415168207392e-06, + "loss": 0.5405, + "step": 3829 + }, + { + "epoch": 0.44, + "grad_norm": 1.7168392268114387, + "learning_rate": 6.202345457926351e-06, + "loss": 0.4489, + "step": 3830 + }, + { + "epoch": 0.44, + "grad_norm": 2.6512890497199604, + "learning_rate": 6.200539067230097e-06, + "loss": 0.6089, + "step": 3831 + }, + { + "epoch": 0.44, + "grad_norm": 1.928512631271115, + "learning_rate": 6.198732510235379e-06, + "loss": 0.5198, + "step": 3832 + }, + { + "epoch": 0.44, + "grad_norm": 2.7641150785719257, + "learning_rate": 6.196925787192443e-06, + "loss": 0.5174, + "step": 3833 + }, + { + "epoch": 0.44, + "grad_norm": 2.5121647751783573, + "learning_rate": 6.195118898351553e-06, + "loss": 0.5276, + "step": 3834 + }, + { + "epoch": 0.44, + "grad_norm": 2.273570010598364, + "learning_rate": 6.193311843963001e-06, + "loss": 0.4879, + "step": 3835 + }, + { + "epoch": 0.44, + "grad_norm": 1.8569612037831926, + "learning_rate": 6.191504624277097e-06, + "loss": 0.5346, + "step": 3836 + }, + { + "epoch": 0.44, + "grad_norm": 3.4690857750330486, + "learning_rate": 6.1896972395441814e-06, + "loss": 0.5943, + "step": 3837 + }, + { + "epoch": 0.44, + "grad_norm": 2.418822478651311, + "learning_rate": 6.187889690014609e-06, + "loss": 0.4427, + "step": 3838 + }, + { + "epoch": 0.44, + "grad_norm": 0.9274803187518802, + "learning_rate": 6.186081975938763e-06, + "loss": 0.7193, + "step": 3839 + }, + { + "epoch": 0.44, + "grad_norm": 2.0402237168644195, + "learning_rate": 6.184274097567047e-06, + "loss": 0.5503, + "step": 3840 + }, + { + "epoch": 0.44, + "grad_norm": 2.5978603816512145, + "learning_rate": 6.1824660551498875e-06, + "loss": 0.5296, + "step": 3841 + }, + { + "epoch": 0.44, + "grad_norm": 2.467422380097626, + "learning_rate": 6.1806578489377345e-06, + "loss": 0.5457, + "step": 3842 + }, + { + "epoch": 0.44, + "grad_norm": 4.093518207155721, + "learning_rate": 6.178849479181061e-06, + "loss": 0.569, + "step": 3843 + }, + { + "epoch": 0.44, + "grad_norm": 1.84911064856627, + "learning_rate": 6.177040946130364e-06, + "loss": 0.5102, + "step": 3844 + }, + { + "epoch": 0.44, + "grad_norm": 2.0113320404640076, + "learning_rate": 6.175232250036157e-06, + "loss": 0.5654, + "step": 3845 + }, + { + "epoch": 0.44, + "grad_norm": 2.2507538000064056, + "learning_rate": 6.173423391148983e-06, + "loss": 0.5763, + "step": 3846 + }, + { + "epoch": 0.44, + "grad_norm": 1.8660644953352767, + "learning_rate": 6.171614369719406e-06, + "loss": 0.5064, + "step": 3847 + }, + { + "epoch": 0.44, + "grad_norm": 2.2868547737774976, + "learning_rate": 6.169805185998006e-06, + "loss": 0.5642, + "step": 3848 + }, + { + "epoch": 0.44, + "grad_norm": 2.27957553249128, + "learning_rate": 6.167995840235396e-06, + "loss": 0.5377, + "step": 3849 + }, + { + "epoch": 0.44, + "grad_norm": 2.475969198433679, + "learning_rate": 6.166186332682203e-06, + "loss": 0.6373, + "step": 3850 + }, + { + "epoch": 0.44, + "grad_norm": 2.171813102639267, + "learning_rate": 6.164376663589082e-06, + "loss": 0.5223, + "step": 3851 + }, + { + "epoch": 0.44, + "grad_norm": 2.4307753912261485, + "learning_rate": 6.162566833206707e-06, + "loss": 0.5823, + "step": 3852 + }, + { + "epoch": 0.44, + "grad_norm": 2.2576737970165253, + "learning_rate": 6.160756841785776e-06, + "loss": 0.4477, + "step": 3853 + }, + { + "epoch": 0.44, + "grad_norm": 2.0771923569351647, + "learning_rate": 6.15894668957701e-06, + "loss": 0.5393, + "step": 3854 + }, + { + "epoch": 0.44, + "grad_norm": 1.7071485325132538, + "learning_rate": 6.157136376831147e-06, + "loss": 0.4766, + "step": 3855 + }, + { + "epoch": 0.44, + "grad_norm": 2.0059189851166965, + "learning_rate": 6.155325903798954e-06, + "loss": 0.5366, + "step": 3856 + }, + { + "epoch": 0.44, + "grad_norm": 2.631687727586758, + "learning_rate": 6.1535152707312174e-06, + "loss": 0.5519, + "step": 3857 + }, + { + "epoch": 0.44, + "grad_norm": 1.9040434323865552, + "learning_rate": 6.151704477878745e-06, + "loss": 0.5294, + "step": 3858 + }, + { + "epoch": 0.44, + "grad_norm": 1.864504990458134, + "learning_rate": 6.149893525492368e-06, + "loss": 0.5515, + "step": 3859 + }, + { + "epoch": 0.44, + "grad_norm": 2.0313320135666597, + "learning_rate": 6.1480824138229375e-06, + "loss": 0.5834, + "step": 3860 + }, + { + "epoch": 0.44, + "grad_norm": 2.1947539900172055, + "learning_rate": 6.1462711431213315e-06, + "loss": 0.4075, + "step": 3861 + }, + { + "epoch": 0.44, + "grad_norm": 3.394372712067144, + "learning_rate": 6.144459713638444e-06, + "loss": 0.5111, + "step": 3862 + }, + { + "epoch": 0.44, + "grad_norm": 3.015999114357917, + "learning_rate": 6.142648125625198e-06, + "loss": 0.4619, + "step": 3863 + }, + { + "epoch": 0.44, + "grad_norm": 1.93096483459641, + "learning_rate": 6.140836379332529e-06, + "loss": 0.4672, + "step": 3864 + }, + { + "epoch": 0.44, + "grad_norm": 1.8856694074513514, + "learning_rate": 6.1390244750114036e-06, + "loss": 0.4368, + "step": 3865 + }, + { + "epoch": 0.44, + "grad_norm": 1.7103978418204258, + "learning_rate": 6.137212412912804e-06, + "loss": 0.4154, + "step": 3866 + }, + { + "epoch": 0.44, + "grad_norm": 2.6423033873328343, + "learning_rate": 6.1354001932877395e-06, + "loss": 0.4603, + "step": 3867 + }, + { + "epoch": 0.44, + "grad_norm": 2.6733956841884265, + "learning_rate": 6.133587816387236e-06, + "loss": 0.4852, + "step": 3868 + }, + { + "epoch": 0.44, + "grad_norm": 3.2377017830208623, + "learning_rate": 6.131775282462345e-06, + "loss": 0.5035, + "step": 3869 + }, + { + "epoch": 0.44, + "grad_norm": 2.946295426927518, + "learning_rate": 6.129962591764137e-06, + "loss": 0.5665, + "step": 3870 + }, + { + "epoch": 0.44, + "grad_norm": 1.8598381292596793, + "learning_rate": 6.128149744543708e-06, + "loss": 0.5054, + "step": 3871 + }, + { + "epoch": 0.44, + "grad_norm": 2.255009701730728, + "learning_rate": 6.12633674105217e-06, + "loss": 0.5304, + "step": 3872 + }, + { + "epoch": 0.45, + "grad_norm": 2.003969247050497, + "learning_rate": 6.124523581540662e-06, + "loss": 0.4773, + "step": 3873 + }, + { + "epoch": 0.45, + "grad_norm": 2.993626931815044, + "learning_rate": 6.122710266260344e-06, + "loss": 0.5387, + "step": 3874 + }, + { + "epoch": 0.45, + "grad_norm": 2.3113266302870112, + "learning_rate": 6.120896795462392e-06, + "loss": 0.5443, + "step": 3875 + }, + { + "epoch": 0.45, + "grad_norm": 2.0324540793999923, + "learning_rate": 6.1190831693980104e-06, + "loss": 0.5958, + "step": 3876 + }, + { + "epoch": 0.45, + "grad_norm": 1.8781463257660689, + "learning_rate": 6.117269388318423e-06, + "loss": 0.4241, + "step": 3877 + }, + { + "epoch": 0.45, + "grad_norm": 1.766528617889605, + "learning_rate": 6.115455452474874e-06, + "loss": 0.5326, + "step": 3878 + }, + { + "epoch": 0.45, + "grad_norm": 2.08696011168214, + "learning_rate": 6.113641362118627e-06, + "loss": 0.4967, + "step": 3879 + }, + { + "epoch": 0.45, + "grad_norm": 2.617142562480076, + "learning_rate": 6.111827117500974e-06, + "loss": 0.6535, + "step": 3880 + }, + { + "epoch": 0.45, + "grad_norm": 2.4597446578833404, + "learning_rate": 6.110012718873219e-06, + "loss": 0.5694, + "step": 3881 + }, + { + "epoch": 0.45, + "grad_norm": 2.338425793615208, + "learning_rate": 6.1081981664866955e-06, + "loss": 0.5326, + "step": 3882 + }, + { + "epoch": 0.45, + "grad_norm": 1.9114017477925207, + "learning_rate": 6.106383460592753e-06, + "loss": 0.4553, + "step": 3883 + }, + { + "epoch": 0.45, + "grad_norm": 3.556580985057182, + "learning_rate": 6.104568601442765e-06, + "loss": 0.5088, + "step": 3884 + }, + { + "epoch": 0.45, + "grad_norm": 2.033415219717344, + "learning_rate": 6.102753589288126e-06, + "loss": 0.5349, + "step": 3885 + }, + { + "epoch": 0.45, + "grad_norm": 1.9303393930953598, + "learning_rate": 6.1009384243802525e-06, + "loss": 0.5533, + "step": 3886 + }, + { + "epoch": 0.45, + "grad_norm": 1.7165561324441685, + "learning_rate": 6.099123106970578e-06, + "loss": 0.5292, + "step": 3887 + }, + { + "epoch": 0.45, + "grad_norm": 1.8476463618228964, + "learning_rate": 6.097307637310561e-06, + "loss": 0.4928, + "step": 3888 + }, + { + "epoch": 0.45, + "grad_norm": 2.2064613316135904, + "learning_rate": 6.095492015651681e-06, + "loss": 0.512, + "step": 3889 + }, + { + "epoch": 0.45, + "grad_norm": 2.7661190702478455, + "learning_rate": 6.093676242245435e-06, + "loss": 0.5571, + "step": 3890 + }, + { + "epoch": 0.45, + "grad_norm": 2.52997232569591, + "learning_rate": 6.0918603173433465e-06, + "loss": 0.454, + "step": 3891 + }, + { + "epoch": 0.45, + "grad_norm": 2.2811725413680155, + "learning_rate": 6.0900442411969555e-06, + "loss": 0.5974, + "step": 3892 + }, + { + "epoch": 0.45, + "grad_norm": 2.7531346178433234, + "learning_rate": 6.088228014057825e-06, + "loss": 0.5644, + "step": 3893 + }, + { + "epoch": 0.45, + "grad_norm": 1.8584947561895628, + "learning_rate": 6.086411636177538e-06, + "loss": 0.4847, + "step": 3894 + }, + { + "epoch": 0.45, + "grad_norm": 3.8414385310343264, + "learning_rate": 6.0845951078077004e-06, + "loss": 0.5338, + "step": 3895 + }, + { + "epoch": 0.45, + "grad_norm": 1.7514750689193765, + "learning_rate": 6.082778429199937e-06, + "loss": 0.5172, + "step": 3896 + }, + { + "epoch": 0.45, + "grad_norm": 2.213959314104493, + "learning_rate": 6.0809616006058915e-06, + "loss": 0.4999, + "step": 3897 + }, + { + "epoch": 0.45, + "grad_norm": 2.2684304661600034, + "learning_rate": 6.079144622277233e-06, + "loss": 0.5539, + "step": 3898 + }, + { + "epoch": 0.45, + "grad_norm": 2.308793211415469, + "learning_rate": 6.077327494465648e-06, + "loss": 0.4763, + "step": 3899 + }, + { + "epoch": 0.45, + "grad_norm": 2.542300597424697, + "learning_rate": 6.075510217422845e-06, + "loss": 0.5467, + "step": 3900 + }, + { + "epoch": 0.45, + "grad_norm": 2.232535642634879, + "learning_rate": 6.073692791400553e-06, + "loss": 0.5943, + "step": 3901 + }, + { + "epoch": 0.45, + "grad_norm": 1.9565365197481344, + "learning_rate": 6.0718752166505214e-06, + "loss": 0.5765, + "step": 3902 + }, + { + "epoch": 0.45, + "grad_norm": 1.9536192156103716, + "learning_rate": 6.0700574934245215e-06, + "loss": 0.5209, + "step": 3903 + }, + { + "epoch": 0.45, + "grad_norm": 2.329778383103884, + "learning_rate": 6.068239621974341e-06, + "loss": 0.5588, + "step": 3904 + }, + { + "epoch": 0.45, + "grad_norm": 3.1984475232309357, + "learning_rate": 6.066421602551796e-06, + "loss": 0.5479, + "step": 3905 + }, + { + "epoch": 0.45, + "grad_norm": 2.6636663151806315, + "learning_rate": 6.064603435408714e-06, + "loss": 0.4796, + "step": 3906 + }, + { + "epoch": 0.45, + "grad_norm": 0.8595379909838388, + "learning_rate": 6.062785120796947e-06, + "loss": 0.7073, + "step": 3907 + }, + { + "epoch": 0.45, + "grad_norm": 0.8501763659093946, + "learning_rate": 6.0609666589683705e-06, + "loss": 0.7138, + "step": 3908 + }, + { + "epoch": 0.45, + "grad_norm": 1.8105520953965444, + "learning_rate": 6.0591480501748765e-06, + "loss": 0.5576, + "step": 3909 + }, + { + "epoch": 0.45, + "grad_norm": 3.7219410778256607, + "learning_rate": 6.057329294668377e-06, + "loss": 0.5004, + "step": 3910 + }, + { + "epoch": 0.45, + "grad_norm": 3.0664917917040366, + "learning_rate": 6.055510392700807e-06, + "loss": 0.5547, + "step": 3911 + }, + { + "epoch": 0.45, + "grad_norm": 1.7630791616942876, + "learning_rate": 6.05369134452412e-06, + "loss": 0.4017, + "step": 3912 + }, + { + "epoch": 0.45, + "grad_norm": 1.781440318369817, + "learning_rate": 6.051872150390293e-06, + "loss": 0.4903, + "step": 3913 + }, + { + "epoch": 0.45, + "grad_norm": 3.0591461032170564, + "learning_rate": 6.0500528105513156e-06, + "loss": 0.5188, + "step": 3914 + }, + { + "epoch": 0.45, + "grad_norm": 3.0386776643555526, + "learning_rate": 6.048233325259205e-06, + "loss": 0.5399, + "step": 3915 + }, + { + "epoch": 0.45, + "grad_norm": 2.4872150763238103, + "learning_rate": 6.046413694765996e-06, + "loss": 0.473, + "step": 3916 + }, + { + "epoch": 0.45, + "grad_norm": 0.8874152573187207, + "learning_rate": 6.044593919323742e-06, + "loss": 0.7141, + "step": 3917 + }, + { + "epoch": 0.45, + "grad_norm": 13.933831816954205, + "learning_rate": 6.04277399918452e-06, + "loss": 0.5209, + "step": 3918 + }, + { + "epoch": 0.45, + "grad_norm": 2.0693774456743887, + "learning_rate": 6.040953934600425e-06, + "loss": 0.4891, + "step": 3919 + }, + { + "epoch": 0.45, + "grad_norm": 2.0545180849312867, + "learning_rate": 6.039133725823571e-06, + "loss": 0.4749, + "step": 3920 + }, + { + "epoch": 0.45, + "grad_norm": 2.0955142470465686, + "learning_rate": 6.037313373106092e-06, + "loss": 0.4954, + "step": 3921 + }, + { + "epoch": 0.45, + "grad_norm": 1.7350661425773517, + "learning_rate": 6.0354928767001465e-06, + "loss": 0.5401, + "step": 3922 + }, + { + "epoch": 0.45, + "grad_norm": 1.7204197049861876, + "learning_rate": 6.033672236857906e-06, + "loss": 0.5187, + "step": 3923 + }, + { + "epoch": 0.45, + "grad_norm": 3.245124687924675, + "learning_rate": 6.031851453831565e-06, + "loss": 0.4353, + "step": 3924 + }, + { + "epoch": 0.45, + "grad_norm": 1.8770269777671564, + "learning_rate": 6.03003052787334e-06, + "loss": 0.4853, + "step": 3925 + }, + { + "epoch": 0.45, + "grad_norm": 2.668819467282734, + "learning_rate": 6.028209459235466e-06, + "loss": 0.4585, + "step": 3926 + }, + { + "epoch": 0.45, + "grad_norm": 3.3989676984328634, + "learning_rate": 6.0263882481701945e-06, + "loss": 0.5283, + "step": 3927 + }, + { + "epoch": 0.45, + "grad_norm": 2.985824212374142, + "learning_rate": 6.024566894929801e-06, + "loss": 0.4527, + "step": 3928 + }, + { + "epoch": 0.45, + "grad_norm": 1.7689977369208556, + "learning_rate": 6.02274539976658e-06, + "loss": 0.4077, + "step": 3929 + }, + { + "epoch": 0.45, + "grad_norm": 2.3400806273556594, + "learning_rate": 6.020923762932841e-06, + "loss": 0.5154, + "step": 3930 + }, + { + "epoch": 0.45, + "grad_norm": 7.318653649764081, + "learning_rate": 6.0191019846809206e-06, + "loss": 0.541, + "step": 3931 + }, + { + "epoch": 0.45, + "grad_norm": 2.18393188501938, + "learning_rate": 6.0172800652631706e-06, + "loss": 0.6133, + "step": 3932 + }, + { + "epoch": 0.45, + "grad_norm": 2.7199326583339434, + "learning_rate": 6.015458004931961e-06, + "loss": 0.4393, + "step": 3933 + }, + { + "epoch": 0.45, + "grad_norm": 2.5962708448455993, + "learning_rate": 6.013635803939684e-06, + "loss": 0.5358, + "step": 3934 + }, + { + "epoch": 0.45, + "grad_norm": 3.2934882179298595, + "learning_rate": 6.011813462538752e-06, + "loss": 0.4911, + "step": 3935 + }, + { + "epoch": 0.45, + "grad_norm": 1.7604511215592078, + "learning_rate": 6.0099909809815925e-06, + "loss": 0.3932, + "step": 3936 + }, + { + "epoch": 0.45, + "grad_norm": 2.6621012690784056, + "learning_rate": 6.008168359520659e-06, + "loss": 0.4924, + "step": 3937 + }, + { + "epoch": 0.45, + "grad_norm": 2.0455475778685943, + "learning_rate": 6.006345598408418e-06, + "loss": 0.6394, + "step": 3938 + }, + { + "epoch": 0.45, + "grad_norm": 1.741692364622363, + "learning_rate": 6.0045226978973614e-06, + "loss": 0.4205, + "step": 3939 + }, + { + "epoch": 0.45, + "grad_norm": 2.1803130784702707, + "learning_rate": 6.002699658239992e-06, + "loss": 0.4323, + "step": 3940 + }, + { + "epoch": 0.45, + "grad_norm": 2.067762500213044, + "learning_rate": 6.0008764796888406e-06, + "loss": 0.5505, + "step": 3941 + }, + { + "epoch": 0.45, + "grad_norm": 2.1648373493138835, + "learning_rate": 5.999053162496453e-06, + "loss": 0.4269, + "step": 3942 + }, + { + "epoch": 0.45, + "grad_norm": 2.4000649531676244, + "learning_rate": 5.997229706915393e-06, + "loss": 0.5466, + "step": 3943 + }, + { + "epoch": 0.45, + "grad_norm": 1.9645038829853294, + "learning_rate": 5.995406113198248e-06, + "loss": 0.4737, + "step": 3944 + }, + { + "epoch": 0.45, + "grad_norm": 1.9625689774836608, + "learning_rate": 5.99358238159762e-06, + "loss": 0.481, + "step": 3945 + }, + { + "epoch": 0.45, + "grad_norm": 2.1628207279062837, + "learning_rate": 5.991758512366133e-06, + "loss": 0.5703, + "step": 3946 + }, + { + "epoch": 0.45, + "grad_norm": 2.1069189848738747, + "learning_rate": 5.989934505756429e-06, + "loss": 0.4677, + "step": 3947 + }, + { + "epoch": 0.45, + "grad_norm": 2.265099512251831, + "learning_rate": 5.988110362021168e-06, + "loss": 0.6182, + "step": 3948 + }, + { + "epoch": 0.45, + "grad_norm": 2.451871978042828, + "learning_rate": 5.986286081413031e-06, + "loss": 0.5586, + "step": 3949 + }, + { + "epoch": 0.45, + "grad_norm": 2.328661493946251, + "learning_rate": 5.984461664184717e-06, + "loss": 0.5488, + "step": 3950 + }, + { + "epoch": 0.45, + "grad_norm": 2.030697768858632, + "learning_rate": 5.982637110588945e-06, + "loss": 0.5583, + "step": 3951 + }, + { + "epoch": 0.45, + "grad_norm": 2.457078791306872, + "learning_rate": 5.980812420878448e-06, + "loss": 0.4943, + "step": 3952 + }, + { + "epoch": 0.45, + "grad_norm": 1.932424277482519, + "learning_rate": 5.978987595305985e-06, + "loss": 0.5248, + "step": 3953 + }, + { + "epoch": 0.45, + "grad_norm": 1.8820036008940983, + "learning_rate": 5.977162634124331e-06, + "loss": 0.4497, + "step": 3954 + }, + { + "epoch": 0.45, + "grad_norm": 2.4890278749172583, + "learning_rate": 5.975337537586278e-06, + "loss": 0.5321, + "step": 3955 + }, + { + "epoch": 0.45, + "grad_norm": 1.7744284105135804, + "learning_rate": 5.973512305944638e-06, + "loss": 0.4956, + "step": 3956 + }, + { + "epoch": 0.45, + "grad_norm": 2.1648847883550255, + "learning_rate": 5.971686939452241e-06, + "loss": 0.4549, + "step": 3957 + }, + { + "epoch": 0.45, + "grad_norm": 1.812123623062125, + "learning_rate": 5.969861438361939e-06, + "loss": 0.4379, + "step": 3958 + }, + { + "epoch": 0.45, + "grad_norm": 1.9994075286171609, + "learning_rate": 5.968035802926598e-06, + "loss": 0.5259, + "step": 3959 + }, + { + "epoch": 0.46, + "grad_norm": 2.7781725784622178, + "learning_rate": 5.966210033399105e-06, + "loss": 0.5522, + "step": 3960 + }, + { + "epoch": 0.46, + "grad_norm": 2.0529129444429435, + "learning_rate": 5.964384130032366e-06, + "loss": 0.4067, + "step": 3961 + }, + { + "epoch": 0.46, + "grad_norm": 2.328488968250573, + "learning_rate": 5.962558093079302e-06, + "loss": 0.4639, + "step": 3962 + }, + { + "epoch": 0.46, + "grad_norm": 2.14096951540091, + "learning_rate": 5.960731922792861e-06, + "loss": 0.5085, + "step": 3963 + }, + { + "epoch": 0.46, + "grad_norm": 1.9016476470604728, + "learning_rate": 5.958905619426e-06, + "loss": 0.5246, + "step": 3964 + }, + { + "epoch": 0.46, + "grad_norm": 1.629847096457984, + "learning_rate": 5.957079183231696e-06, + "loss": 0.4777, + "step": 3965 + }, + { + "epoch": 0.46, + "grad_norm": 1.7373107022831173, + "learning_rate": 5.955252614462952e-06, + "loss": 0.4338, + "step": 3966 + }, + { + "epoch": 0.46, + "grad_norm": 2.046700541707747, + "learning_rate": 5.95342591337278e-06, + "loss": 0.5425, + "step": 3967 + }, + { + "epoch": 0.46, + "grad_norm": 2.2517165246456528, + "learning_rate": 5.951599080214216e-06, + "loss": 0.4851, + "step": 3968 + }, + { + "epoch": 0.46, + "grad_norm": 1.9236733675645439, + "learning_rate": 5.949772115240312e-06, + "loss": 0.5026, + "step": 3969 + }, + { + "epoch": 0.46, + "grad_norm": 2.4237342567495532, + "learning_rate": 5.94794501870414e-06, + "loss": 0.5535, + "step": 3970 + }, + { + "epoch": 0.46, + "grad_norm": 1.6671587830031098, + "learning_rate": 5.946117790858787e-06, + "loss": 0.5378, + "step": 3971 + }, + { + "epoch": 0.46, + "grad_norm": 1.8878139302420236, + "learning_rate": 5.944290431957362e-06, + "loss": 0.5116, + "step": 3972 + }, + { + "epoch": 0.46, + "grad_norm": 2.1318239916070567, + "learning_rate": 5.94246294225299e-06, + "loss": 0.5205, + "step": 3973 + }, + { + "epoch": 0.46, + "grad_norm": 1.8030828161053167, + "learning_rate": 5.940635321998815e-06, + "loss": 0.5034, + "step": 3974 + }, + { + "epoch": 0.46, + "grad_norm": 2.794805107107148, + "learning_rate": 5.938807571447998e-06, + "loss": 0.5509, + "step": 3975 + }, + { + "epoch": 0.46, + "grad_norm": 2.355952607966635, + "learning_rate": 5.9369796908537185e-06, + "loss": 0.5521, + "step": 3976 + }, + { + "epoch": 0.46, + "grad_norm": 2.5096814647070667, + "learning_rate": 5.9351516804691745e-06, + "loss": 0.5208, + "step": 3977 + }, + { + "epoch": 0.46, + "grad_norm": 2.2171385118497535, + "learning_rate": 5.933323540547581e-06, + "loss": 0.5488, + "step": 3978 + }, + { + "epoch": 0.46, + "grad_norm": 3.1007539885191306, + "learning_rate": 5.931495271342173e-06, + "loss": 0.4712, + "step": 3979 + }, + { + "epoch": 0.46, + "grad_norm": 2.088343984488, + "learning_rate": 5.929666873106202e-06, + "loss": 0.4461, + "step": 3980 + }, + { + "epoch": 0.46, + "grad_norm": 0.8458501140332395, + "learning_rate": 5.927838346092936e-06, + "loss": 0.7471, + "step": 3981 + }, + { + "epoch": 0.46, + "grad_norm": 2.948310419100518, + "learning_rate": 5.926009690555663e-06, + "loss": 0.6254, + "step": 3982 + }, + { + "epoch": 0.46, + "grad_norm": 0.8894552595928077, + "learning_rate": 5.924180906747688e-06, + "loss": 0.7081, + "step": 3983 + }, + { + "epoch": 0.46, + "grad_norm": 2.9463515431142673, + "learning_rate": 5.922351994922333e-06, + "loss": 0.4956, + "step": 3984 + }, + { + "epoch": 0.46, + "grad_norm": 2.1200639428898436, + "learning_rate": 5.9205229553329405e-06, + "loss": 0.4816, + "step": 3985 + }, + { + "epoch": 0.46, + "grad_norm": 2.5283295313298106, + "learning_rate": 5.918693788232868e-06, + "loss": 0.5632, + "step": 3986 + }, + { + "epoch": 0.46, + "grad_norm": 1.9344228822113, + "learning_rate": 5.91686449387549e-06, + "loss": 0.4958, + "step": 3987 + }, + { + "epoch": 0.46, + "grad_norm": 3.8288477400796794, + "learning_rate": 5.915035072514202e-06, + "loss": 0.5446, + "step": 3988 + }, + { + "epoch": 0.46, + "grad_norm": 2.33959458882891, + "learning_rate": 5.913205524402415e-06, + "loss": 0.5702, + "step": 3989 + }, + { + "epoch": 0.46, + "grad_norm": 2.5232575043546994, + "learning_rate": 5.9113758497935545e-06, + "loss": 0.4804, + "step": 3990 + }, + { + "epoch": 0.46, + "grad_norm": 1.860874844124119, + "learning_rate": 5.909546048941071e-06, + "loss": 0.5178, + "step": 3991 + }, + { + "epoch": 0.46, + "grad_norm": 0.8720428303040475, + "learning_rate": 5.907716122098424e-06, + "loss": 0.7096, + "step": 3992 + }, + { + "epoch": 0.46, + "grad_norm": 2.172881785234152, + "learning_rate": 5.9058860695191e-06, + "loss": 0.4496, + "step": 3993 + }, + { + "epoch": 0.46, + "grad_norm": 1.4980751016234626, + "learning_rate": 5.904055891456594e-06, + "loss": 0.4602, + "step": 3994 + }, + { + "epoch": 0.46, + "grad_norm": 3.4277576863210166, + "learning_rate": 5.902225588164422e-06, + "loss": 0.4867, + "step": 3995 + }, + { + "epoch": 0.46, + "grad_norm": 3.000118921657069, + "learning_rate": 5.900395159896117e-06, + "loss": 0.4512, + "step": 3996 + }, + { + "epoch": 0.46, + "grad_norm": 1.8354223754231285, + "learning_rate": 5.898564606905231e-06, + "loss": 0.5466, + "step": 3997 + }, + { + "epoch": 0.46, + "grad_norm": 1.9862896887531025, + "learning_rate": 5.896733929445333e-06, + "loss": 0.4749, + "step": 3998 + }, + { + "epoch": 0.46, + "grad_norm": 1.7885771469117557, + "learning_rate": 5.894903127770004e-06, + "loss": 0.4519, + "step": 3999 + }, + { + "epoch": 0.46, + "grad_norm": 1.996893956192921, + "learning_rate": 5.8930722021328505e-06, + "loss": 0.539, + "step": 4000 + }, + { + "epoch": 0.46, + "grad_norm": 2.40417944425607, + "learning_rate": 5.891241152787488e-06, + "loss": 0.5154, + "step": 4001 + }, + { + "epoch": 0.46, + "grad_norm": 2.208862780268627, + "learning_rate": 5.889409979987557e-06, + "loss": 0.5757, + "step": 4002 + }, + { + "epoch": 0.46, + "grad_norm": 2.0610474524954556, + "learning_rate": 5.887578683986709e-06, + "loss": 0.5838, + "step": 4003 + }, + { + "epoch": 0.46, + "grad_norm": 2.0019750915908987, + "learning_rate": 5.885747265038616e-06, + "loss": 0.5483, + "step": 4004 + }, + { + "epoch": 0.46, + "grad_norm": 3.0403085030544705, + "learning_rate": 5.883915723396965e-06, + "loss": 0.5269, + "step": 4005 + }, + { + "epoch": 0.46, + "grad_norm": 2.158196451499147, + "learning_rate": 5.882084059315461e-06, + "loss": 0.5354, + "step": 4006 + }, + { + "epoch": 0.46, + "grad_norm": 1.8841955024468213, + "learning_rate": 5.880252273047826e-06, + "loss": 0.4876, + "step": 4007 + }, + { + "epoch": 0.46, + "grad_norm": 1.835501728419276, + "learning_rate": 5.878420364847799e-06, + "loss": 0.541, + "step": 4008 + }, + { + "epoch": 0.46, + "grad_norm": 2.2182259302627876, + "learning_rate": 5.8765883349691345e-06, + "loss": 0.4574, + "step": 4009 + }, + { + "epoch": 0.46, + "grad_norm": 2.319627149811975, + "learning_rate": 5.874756183665605e-06, + "loss": 0.4995, + "step": 4010 + }, + { + "epoch": 0.46, + "grad_norm": 2.4219634948890376, + "learning_rate": 5.872923911191002e-06, + "loss": 0.4119, + "step": 4011 + }, + { + "epoch": 0.46, + "grad_norm": 2.1819224646555373, + "learning_rate": 5.871091517799129e-06, + "loss": 0.5976, + "step": 4012 + }, + { + "epoch": 0.46, + "grad_norm": 1.9772592227569705, + "learning_rate": 5.8692590037438105e-06, + "loss": 0.4904, + "step": 4013 + }, + { + "epoch": 0.46, + "grad_norm": 2.044041404223672, + "learning_rate": 5.867426369278886e-06, + "loss": 0.4443, + "step": 4014 + }, + { + "epoch": 0.46, + "grad_norm": 2.207962003226654, + "learning_rate": 5.865593614658209e-06, + "loss": 0.4282, + "step": 4015 + }, + { + "epoch": 0.46, + "grad_norm": 2.1994322056962803, + "learning_rate": 5.863760740135657e-06, + "loss": 0.5809, + "step": 4016 + }, + { + "epoch": 0.46, + "grad_norm": 2.2629985434405318, + "learning_rate": 5.861927745965116e-06, + "loss": 0.5015, + "step": 4017 + }, + { + "epoch": 0.46, + "grad_norm": 2.1454385547546098, + "learning_rate": 5.860094632400492e-06, + "loss": 0.4931, + "step": 4018 + }, + { + "epoch": 0.46, + "grad_norm": 2.526214868914331, + "learning_rate": 5.8582613996957085e-06, + "loss": 0.5097, + "step": 4019 + }, + { + "epoch": 0.46, + "grad_norm": 2.1009662785195258, + "learning_rate": 5.856428048104706e-06, + "loss": 0.5104, + "step": 4020 + }, + { + "epoch": 0.46, + "grad_norm": 2.459598459411013, + "learning_rate": 5.854594577881436e-06, + "loss": 0.4262, + "step": 4021 + }, + { + "epoch": 0.46, + "grad_norm": 2.017125998301964, + "learning_rate": 5.852760989279874e-06, + "loss": 0.5284, + "step": 4022 + }, + { + "epoch": 0.46, + "grad_norm": 1.9910137659868243, + "learning_rate": 5.850927282554009e-06, + "loss": 0.5103, + "step": 4023 + }, + { + "epoch": 0.46, + "grad_norm": 3.3089119670440263, + "learning_rate": 5.849093457957844e-06, + "loss": 0.534, + "step": 4024 + }, + { + "epoch": 0.46, + "grad_norm": 2.4765252239383595, + "learning_rate": 5.8472595157454e-06, + "loss": 0.5392, + "step": 4025 + }, + { + "epoch": 0.46, + "grad_norm": 3.058710930789199, + "learning_rate": 5.8454254561707135e-06, + "loss": 0.5104, + "step": 4026 + }, + { + "epoch": 0.46, + "grad_norm": 1.89428021363449, + "learning_rate": 5.84359127948784e-06, + "loss": 0.5429, + "step": 4027 + }, + { + "epoch": 0.46, + "grad_norm": 1.9893551100894353, + "learning_rate": 5.8417569859508485e-06, + "loss": 0.3806, + "step": 4028 + }, + { + "epoch": 0.46, + "grad_norm": 2.582631438874301, + "learning_rate": 5.839922575813824e-06, + "loss": 0.5774, + "step": 4029 + }, + { + "epoch": 0.46, + "grad_norm": 2.0311609250373075, + "learning_rate": 5.838088049330871e-06, + "loss": 0.4786, + "step": 4030 + }, + { + "epoch": 0.46, + "grad_norm": 2.2956484653685814, + "learning_rate": 5.836253406756108e-06, + "loss": 0.5011, + "step": 4031 + }, + { + "epoch": 0.46, + "grad_norm": 2.42925988279104, + "learning_rate": 5.834418648343666e-06, + "loss": 0.5348, + "step": 4032 + }, + { + "epoch": 0.46, + "grad_norm": 2.7690973930786447, + "learning_rate": 5.832583774347697e-06, + "loss": 0.4958, + "step": 4033 + }, + { + "epoch": 0.46, + "grad_norm": 1.8680771943906376, + "learning_rate": 5.830748785022369e-06, + "loss": 0.4211, + "step": 4034 + }, + { + "epoch": 0.46, + "grad_norm": 1.7184966630315233, + "learning_rate": 5.828913680621863e-06, + "loss": 0.6152, + "step": 4035 + }, + { + "epoch": 0.46, + "grad_norm": 1.768834612632359, + "learning_rate": 5.827078461400378e-06, + "loss": 0.5243, + "step": 4036 + }, + { + "epoch": 0.46, + "grad_norm": 1.8511709900862123, + "learning_rate": 5.825243127612127e-06, + "loss": 0.4675, + "step": 4037 + }, + { + "epoch": 0.46, + "grad_norm": 3.615776631265324, + "learning_rate": 5.8234076795113415e-06, + "loss": 0.4936, + "step": 4038 + }, + { + "epoch": 0.46, + "grad_norm": 1.6588891901539942, + "learning_rate": 5.821572117352266e-06, + "loss": 0.4746, + "step": 4039 + }, + { + "epoch": 0.46, + "grad_norm": 1.6905596017734232, + "learning_rate": 5.819736441389164e-06, + "loss": 0.4343, + "step": 4040 + }, + { + "epoch": 0.46, + "grad_norm": 2.049332279341107, + "learning_rate": 5.817900651876311e-06, + "loss": 0.5241, + "step": 4041 + }, + { + "epoch": 0.46, + "grad_norm": 1.4602133257384171, + "learning_rate": 5.816064749068002e-06, + "loss": 0.5168, + "step": 4042 + }, + { + "epoch": 0.46, + "grad_norm": 2.1539153478672253, + "learning_rate": 5.814228733218546e-06, + "loss": 0.5727, + "step": 4043 + }, + { + "epoch": 0.46, + "grad_norm": 1.9788387694478389, + "learning_rate": 5.812392604582265e-06, + "loss": 0.4259, + "step": 4044 + }, + { + "epoch": 0.46, + "grad_norm": 2.188851240621248, + "learning_rate": 5.810556363413502e-06, + "loss": 0.4189, + "step": 4045 + }, + { + "epoch": 0.46, + "grad_norm": 1.8558530111245641, + "learning_rate": 5.808720009966613e-06, + "loss": 0.5138, + "step": 4046 + }, + { + "epoch": 0.47, + "grad_norm": 1.7967470049887913, + "learning_rate": 5.806883544495967e-06, + "loss": 0.4036, + "step": 4047 + }, + { + "epoch": 0.47, + "grad_norm": 1.8023200267365835, + "learning_rate": 5.805046967255954e-06, + "loss": 0.5527, + "step": 4048 + }, + { + "epoch": 0.47, + "grad_norm": 1.8675872581685433, + "learning_rate": 5.8032102785009725e-06, + "loss": 0.4392, + "step": 4049 + }, + { + "epoch": 0.47, + "grad_norm": 2.3813710263068186, + "learning_rate": 5.801373478485443e-06, + "loss": 0.5379, + "step": 4050 + }, + { + "epoch": 0.47, + "grad_norm": 2.7058208306681926, + "learning_rate": 5.799536567463798e-06, + "loss": 0.4979, + "step": 4051 + }, + { + "epoch": 0.47, + "grad_norm": 1.8880369058957132, + "learning_rate": 5.797699545690486e-06, + "loss": 0.511, + "step": 4052 + }, + { + "epoch": 0.47, + "grad_norm": 2.5543445120770203, + "learning_rate": 5.795862413419971e-06, + "loss": 0.5442, + "step": 4053 + }, + { + "epoch": 0.47, + "grad_norm": 0.8802568362331055, + "learning_rate": 5.794025170906733e-06, + "loss": 0.7211, + "step": 4054 + }, + { + "epoch": 0.47, + "grad_norm": 1.8601606667140826, + "learning_rate": 5.792187818405265e-06, + "loss": 0.498, + "step": 4055 + }, + { + "epoch": 0.47, + "grad_norm": 1.9314646790573065, + "learning_rate": 5.7903503561700795e-06, + "loss": 0.5596, + "step": 4056 + }, + { + "epoch": 0.47, + "grad_norm": 3.9196911307073012, + "learning_rate": 5.788512784455697e-06, + "loss": 0.5733, + "step": 4057 + }, + { + "epoch": 0.47, + "grad_norm": 2.2386965796093476, + "learning_rate": 5.786675103516662e-06, + "loss": 0.5662, + "step": 4058 + }, + { + "epoch": 0.47, + "grad_norm": 3.4356402046760577, + "learning_rate": 5.784837313607529e-06, + "loss": 0.5361, + "step": 4059 + }, + { + "epoch": 0.47, + "grad_norm": 1.754407074130776, + "learning_rate": 5.782999414982865e-06, + "loss": 0.45, + "step": 4060 + }, + { + "epoch": 0.47, + "grad_norm": 1.869665092733014, + "learning_rate": 5.781161407897258e-06, + "loss": 0.4561, + "step": 4061 + }, + { + "epoch": 0.47, + "grad_norm": 1.8045093924774072, + "learning_rate": 5.779323292605308e-06, + "loss": 0.3815, + "step": 4062 + }, + { + "epoch": 0.47, + "grad_norm": 1.7541340244559271, + "learning_rate": 5.77748506936163e-06, + "loss": 0.5327, + "step": 4063 + }, + { + "epoch": 0.47, + "grad_norm": 1.9308103687590457, + "learning_rate": 5.775646738420856e-06, + "loss": 0.5062, + "step": 4064 + }, + { + "epoch": 0.47, + "grad_norm": 2.1646673828172354, + "learning_rate": 5.773808300037631e-06, + "loss": 0.5159, + "step": 4065 + }, + { + "epoch": 0.47, + "grad_norm": 2.038305543022963, + "learning_rate": 5.771969754466613e-06, + "loss": 0.5797, + "step": 4066 + }, + { + "epoch": 0.47, + "grad_norm": 1.88308769477512, + "learning_rate": 5.7701311019624785e-06, + "loss": 0.5065, + "step": 4067 + }, + { + "epoch": 0.47, + "grad_norm": 2.048867431426452, + "learning_rate": 5.7682923427799165e-06, + "loss": 0.5306, + "step": 4068 + }, + { + "epoch": 0.47, + "grad_norm": 1.799904162119711, + "learning_rate": 5.766453477173633e-06, + "loss": 0.5325, + "step": 4069 + }, + { + "epoch": 0.47, + "grad_norm": 2.078564736133288, + "learning_rate": 5.764614505398346e-06, + "loss": 0.5057, + "step": 4070 + }, + { + "epoch": 0.47, + "grad_norm": 1.9888277133899182, + "learning_rate": 5.7627754277087896e-06, + "loss": 0.5185, + "step": 4071 + }, + { + "epoch": 0.47, + "grad_norm": 2.18474539742922, + "learning_rate": 5.760936244359715e-06, + "loss": 0.5304, + "step": 4072 + }, + { + "epoch": 0.47, + "grad_norm": 2.06505800815184, + "learning_rate": 5.7590969556058815e-06, + "loss": 0.5176, + "step": 4073 + }, + { + "epoch": 0.47, + "grad_norm": 1.7829550663679807, + "learning_rate": 5.757257561702072e-06, + "loss": 0.4515, + "step": 4074 + }, + { + "epoch": 0.47, + "grad_norm": 2.5537998741685324, + "learning_rate": 5.755418062903074e-06, + "loss": 0.5228, + "step": 4075 + }, + { + "epoch": 0.47, + "grad_norm": 1.5822822561539227, + "learning_rate": 5.753578459463698e-06, + "loss": 0.4262, + "step": 4076 + }, + { + "epoch": 0.47, + "grad_norm": 3.417564992377457, + "learning_rate": 5.751738751638763e-06, + "loss": 0.4744, + "step": 4077 + }, + { + "epoch": 0.47, + "grad_norm": 1.5911961687862044, + "learning_rate": 5.749898939683107e-06, + "loss": 0.4794, + "step": 4078 + }, + { + "epoch": 0.47, + "grad_norm": 4.131929357369358, + "learning_rate": 5.748059023851581e-06, + "loss": 0.4574, + "step": 4079 + }, + { + "epoch": 0.47, + "grad_norm": 1.708163014239714, + "learning_rate": 5.746219004399047e-06, + "loss": 0.5506, + "step": 4080 + }, + { + "epoch": 0.47, + "grad_norm": 1.889848653330532, + "learning_rate": 5.744378881580386e-06, + "loss": 0.4513, + "step": 4081 + }, + { + "epoch": 0.47, + "grad_norm": 1.6772661482636169, + "learning_rate": 5.7425386556504915e-06, + "loss": 0.493, + "step": 4082 + }, + { + "epoch": 0.47, + "grad_norm": 2.077401305791897, + "learning_rate": 5.740698326864271e-06, + "loss": 0.4488, + "step": 4083 + }, + { + "epoch": 0.47, + "grad_norm": 2.452554381863501, + "learning_rate": 5.738857895476646e-06, + "loss": 0.5169, + "step": 4084 + }, + { + "epoch": 0.47, + "grad_norm": 1.7162223227451432, + "learning_rate": 5.737017361742554e-06, + "loss": 0.4311, + "step": 4085 + }, + { + "epoch": 0.47, + "grad_norm": 1.527455321408591, + "learning_rate": 5.735176725916944e-06, + "loss": 0.4526, + "step": 4086 + }, + { + "epoch": 0.47, + "grad_norm": 2.5080522300312116, + "learning_rate": 5.733335988254782e-06, + "loss": 0.5442, + "step": 4087 + }, + { + "epoch": 0.47, + "grad_norm": 1.9202715090747422, + "learning_rate": 5.731495149011045e-06, + "loss": 0.4996, + "step": 4088 + }, + { + "epoch": 0.47, + "grad_norm": 1.7644955861942204, + "learning_rate": 5.729654208440727e-06, + "loss": 0.454, + "step": 4089 + }, + { + "epoch": 0.47, + "grad_norm": 2.595297472188361, + "learning_rate": 5.727813166798836e-06, + "loss": 0.5285, + "step": 4090 + }, + { + "epoch": 0.47, + "grad_norm": 2.0251416974113816, + "learning_rate": 5.7259720243403896e-06, + "loss": 0.4626, + "step": 4091 + }, + { + "epoch": 0.47, + "grad_norm": 1.8700811030811115, + "learning_rate": 5.724130781320424e-06, + "loss": 0.4532, + "step": 4092 + }, + { + "epoch": 0.47, + "grad_norm": 2.0493133875302356, + "learning_rate": 5.722289437993989e-06, + "loss": 0.6167, + "step": 4093 + }, + { + "epoch": 0.47, + "grad_norm": 1.8185319240855067, + "learning_rate": 5.720447994616145e-06, + "loss": 0.4883, + "step": 4094 + }, + { + "epoch": 0.47, + "grad_norm": 2.3752442785658663, + "learning_rate": 5.71860645144197e-06, + "loss": 0.5547, + "step": 4095 + }, + { + "epoch": 0.47, + "grad_norm": 7.219965061180549, + "learning_rate": 5.716764808726554e-06, + "loss": 0.4605, + "step": 4096 + }, + { + "epoch": 0.47, + "grad_norm": 2.1531086527738066, + "learning_rate": 5.714923066725e-06, + "loss": 0.5662, + "step": 4097 + }, + { + "epoch": 0.47, + "grad_norm": 2.4242933567956304, + "learning_rate": 5.713081225692429e-06, + "loss": 0.4251, + "step": 4098 + }, + { + "epoch": 0.47, + "grad_norm": 2.0755982859493183, + "learning_rate": 5.711239285883968e-06, + "loss": 0.5075, + "step": 4099 + }, + { + "epoch": 0.47, + "grad_norm": 2.163603315260941, + "learning_rate": 5.709397247554764e-06, + "loss": 0.4874, + "step": 4100 + }, + { + "epoch": 0.47, + "grad_norm": 2.819603495820722, + "learning_rate": 5.707555110959979e-06, + "loss": 0.6313, + "step": 4101 + }, + { + "epoch": 0.47, + "grad_norm": 1.9863060316053744, + "learning_rate": 5.7057128763547806e-06, + "loss": 0.4618, + "step": 4102 + }, + { + "epoch": 0.47, + "grad_norm": 1.861698265235767, + "learning_rate": 5.703870543994357e-06, + "loss": 0.507, + "step": 4103 + }, + { + "epoch": 0.47, + "grad_norm": 1.8856624138825477, + "learning_rate": 5.7020281141339065e-06, + "loss": 0.5437, + "step": 4104 + }, + { + "epoch": 0.47, + "grad_norm": 2.6991631478477367, + "learning_rate": 5.700185587028644e-06, + "loss": 0.4918, + "step": 4105 + }, + { + "epoch": 0.47, + "grad_norm": 2.1232700414889787, + "learning_rate": 5.698342962933795e-06, + "loss": 0.5827, + "step": 4106 + }, + { + "epoch": 0.47, + "grad_norm": 0.9217145459342841, + "learning_rate": 5.696500242104601e-06, + "loss": 0.6902, + "step": 4107 + }, + { + "epoch": 0.47, + "grad_norm": 2.102765786038736, + "learning_rate": 5.694657424796313e-06, + "loss": 0.4554, + "step": 4108 + }, + { + "epoch": 0.47, + "grad_norm": 2.6488490103289317, + "learning_rate": 5.692814511264199e-06, + "loss": 0.4874, + "step": 4109 + }, + { + "epoch": 0.47, + "grad_norm": 2.3196322521293373, + "learning_rate": 5.6909715017635385e-06, + "loss": 0.4901, + "step": 4110 + }, + { + "epoch": 0.47, + "grad_norm": 1.7787424806913932, + "learning_rate": 5.689128396549626e-06, + "loss": 0.511, + "step": 4111 + }, + { + "epoch": 0.47, + "grad_norm": 2.572026086719596, + "learning_rate": 5.687285195877766e-06, + "loss": 0.4512, + "step": 4112 + }, + { + "epoch": 0.47, + "grad_norm": 1.9988195334038317, + "learning_rate": 5.685441900003281e-06, + "loss": 0.6307, + "step": 4113 + }, + { + "epoch": 0.47, + "grad_norm": 2.1184847284891357, + "learning_rate": 5.683598509181503e-06, + "loss": 0.5759, + "step": 4114 + }, + { + "epoch": 0.47, + "grad_norm": 2.295700217844005, + "learning_rate": 5.681755023667777e-06, + "loss": 0.5746, + "step": 4115 + }, + { + "epoch": 0.47, + "grad_norm": 2.2836223464481344, + "learning_rate": 5.679911443717464e-06, + "loss": 0.5533, + "step": 4116 + }, + { + "epoch": 0.47, + "grad_norm": 2.0281870883366993, + "learning_rate": 5.678067769585935e-06, + "loss": 0.4295, + "step": 4117 + }, + { + "epoch": 0.47, + "grad_norm": 3.2329275839233045, + "learning_rate": 5.676224001528577e-06, + "loss": 0.5842, + "step": 4118 + }, + { + "epoch": 0.47, + "grad_norm": 2.189577601090869, + "learning_rate": 5.674380139800786e-06, + "loss": 0.438, + "step": 4119 + }, + { + "epoch": 0.47, + "grad_norm": 2.656414901277946, + "learning_rate": 5.6725361846579755e-06, + "loss": 0.5253, + "step": 4120 + }, + { + "epoch": 0.47, + "grad_norm": 1.7425987454629568, + "learning_rate": 5.670692136355569e-06, + "loss": 0.481, + "step": 4121 + }, + { + "epoch": 0.47, + "grad_norm": 1.6304100298108062, + "learning_rate": 5.668847995149005e-06, + "loss": 0.5675, + "step": 4122 + }, + { + "epoch": 0.47, + "grad_norm": 2.6019440227691, + "learning_rate": 5.667003761293731e-06, + "loss": 0.5271, + "step": 4123 + }, + { + "epoch": 0.47, + "grad_norm": 2.041535111408343, + "learning_rate": 5.665159435045214e-06, + "loss": 0.4662, + "step": 4124 + }, + { + "epoch": 0.47, + "grad_norm": 0.8839017962855645, + "learning_rate": 5.663315016658925e-06, + "loss": 0.6767, + "step": 4125 + }, + { + "epoch": 0.47, + "grad_norm": 2.5897497689849227, + "learning_rate": 5.661470506390354e-06, + "loss": 0.4292, + "step": 4126 + }, + { + "epoch": 0.47, + "grad_norm": 2.0256400021780188, + "learning_rate": 5.659625904495004e-06, + "loss": 0.602, + "step": 4127 + }, + { + "epoch": 0.47, + "grad_norm": 3.5325215587607035, + "learning_rate": 5.657781211228388e-06, + "loss": 0.5316, + "step": 4128 + }, + { + "epoch": 0.47, + "grad_norm": 1.999266488784331, + "learning_rate": 5.655936426846033e-06, + "loss": 0.4817, + "step": 4129 + }, + { + "epoch": 0.47, + "grad_norm": 2.6807419991481787, + "learning_rate": 5.654091551603478e-06, + "loss": 0.4739, + "step": 4130 + }, + { + "epoch": 0.47, + "grad_norm": 1.961561962990601, + "learning_rate": 5.652246585756274e-06, + "loss": 0.4727, + "step": 4131 + }, + { + "epoch": 0.47, + "grad_norm": 1.7214130324545978, + "learning_rate": 5.6504015295599846e-06, + "loss": 0.5621, + "step": 4132 + }, + { + "epoch": 0.47, + "grad_norm": 0.8168650473240665, + "learning_rate": 5.64855638327019e-06, + "loss": 0.6843, + "step": 4133 + }, + { + "epoch": 0.48, + "grad_norm": 2.071989491518487, + "learning_rate": 5.646711147142477e-06, + "loss": 0.4794, + "step": 4134 + }, + { + "epoch": 0.48, + "grad_norm": 1.6043097567974187, + "learning_rate": 5.644865821432448e-06, + "loss": 0.4729, + "step": 4135 + }, + { + "epoch": 0.48, + "grad_norm": 2.511174266566765, + "learning_rate": 5.643020406395716e-06, + "loss": 0.4514, + "step": 4136 + }, + { + "epoch": 0.48, + "grad_norm": 2.2220030768325048, + "learning_rate": 5.64117490228791e-06, + "loss": 0.4687, + "step": 4137 + }, + { + "epoch": 0.48, + "grad_norm": 3.4090379789284713, + "learning_rate": 5.639329309364667e-06, + "loss": 0.5287, + "step": 4138 + }, + { + "epoch": 0.48, + "grad_norm": 7.202049738729965, + "learning_rate": 5.637483627881639e-06, + "loss": 0.5152, + "step": 4139 + }, + { + "epoch": 0.48, + "grad_norm": 1.9827941804503668, + "learning_rate": 5.635637858094489e-06, + "loss": 0.591, + "step": 4140 + }, + { + "epoch": 0.48, + "grad_norm": 2.5627346021217887, + "learning_rate": 5.633792000258894e-06, + "loss": 0.5133, + "step": 4141 + }, + { + "epoch": 0.48, + "grad_norm": 1.8759455774815856, + "learning_rate": 5.6319460546305404e-06, + "loss": 0.469, + "step": 4142 + }, + { + "epoch": 0.48, + "grad_norm": 2.509325023087706, + "learning_rate": 5.630100021465128e-06, + "loss": 0.5049, + "step": 4143 + }, + { + "epoch": 0.48, + "grad_norm": 2.3080639254461413, + "learning_rate": 5.628253901018371e-06, + "loss": 0.4792, + "step": 4144 + }, + { + "epoch": 0.48, + "grad_norm": 2.0027224557573473, + "learning_rate": 5.626407693545992e-06, + "loss": 0.5526, + "step": 4145 + }, + { + "epoch": 0.48, + "grad_norm": 1.7381523511728132, + "learning_rate": 5.624561399303727e-06, + "loss": 0.5085, + "step": 4146 + }, + { + "epoch": 0.48, + "grad_norm": 1.5077996319332847, + "learning_rate": 5.622715018547325e-06, + "loss": 0.4949, + "step": 4147 + }, + { + "epoch": 0.48, + "grad_norm": 1.8584742885020111, + "learning_rate": 5.620868551532548e-06, + "loss": 0.4701, + "step": 4148 + }, + { + "epoch": 0.48, + "grad_norm": 2.317653547487619, + "learning_rate": 5.619021998515165e-06, + "loss": 0.5021, + "step": 4149 + }, + { + "epoch": 0.48, + "grad_norm": 1.9291006573565688, + "learning_rate": 5.617175359750964e-06, + "loss": 0.515, + "step": 4150 + }, + { + "epoch": 0.48, + "grad_norm": 1.9185328160460615, + "learning_rate": 5.615328635495738e-06, + "loss": 0.5502, + "step": 4151 + }, + { + "epoch": 0.48, + "grad_norm": 1.9276579937636906, + "learning_rate": 5.613481826005296e-06, + "loss": 0.4379, + "step": 4152 + }, + { + "epoch": 0.48, + "grad_norm": 2.6220967379404887, + "learning_rate": 5.6116349315354565e-06, + "loss": 0.5494, + "step": 4153 + }, + { + "epoch": 0.48, + "grad_norm": 2.1948035848535117, + "learning_rate": 5.609787952342054e-06, + "loss": 0.5596, + "step": 4154 + }, + { + "epoch": 0.48, + "grad_norm": 1.9424964079889804, + "learning_rate": 5.607940888680929e-06, + "loss": 0.5069, + "step": 4155 + }, + { + "epoch": 0.48, + "grad_norm": 1.8534538163941456, + "learning_rate": 5.6060937408079374e-06, + "loss": 0.4449, + "step": 4156 + }, + { + "epoch": 0.48, + "grad_norm": 2.088534131861963, + "learning_rate": 5.604246508978947e-06, + "loss": 0.6058, + "step": 4157 + }, + { + "epoch": 0.48, + "grad_norm": 2.366350784511451, + "learning_rate": 5.6023991934498345e-06, + "loss": 0.4441, + "step": 4158 + }, + { + "epoch": 0.48, + "grad_norm": 1.8860825535127888, + "learning_rate": 5.6005517944764895e-06, + "loss": 0.5204, + "step": 4159 + }, + { + "epoch": 0.48, + "grad_norm": 1.851712126653901, + "learning_rate": 5.598704312314813e-06, + "loss": 0.4504, + "step": 4160 + }, + { + "epoch": 0.48, + "grad_norm": 2.033320111303184, + "learning_rate": 5.59685674722072e-06, + "loss": 0.4864, + "step": 4161 + }, + { + "epoch": 0.48, + "grad_norm": 2.238228005155648, + "learning_rate": 5.5950090994501335e-06, + "loss": 0.4531, + "step": 4162 + }, + { + "epoch": 0.48, + "grad_norm": 1.8149911279577937, + "learning_rate": 5.593161369258991e-06, + "loss": 0.4466, + "step": 4163 + }, + { + "epoch": 0.48, + "grad_norm": 2.318340863744374, + "learning_rate": 5.591313556903238e-06, + "loss": 0.5283, + "step": 4164 + }, + { + "epoch": 0.48, + "grad_norm": 1.7543864291233446, + "learning_rate": 5.589465662638831e-06, + "loss": 0.5526, + "step": 4165 + }, + { + "epoch": 0.48, + "grad_norm": 1.7299705014569666, + "learning_rate": 5.587617686721745e-06, + "loss": 0.4967, + "step": 4166 + }, + { + "epoch": 0.48, + "grad_norm": 2.1934163214439772, + "learning_rate": 5.585769629407958e-06, + "loss": 0.4412, + "step": 4167 + }, + { + "epoch": 0.48, + "grad_norm": 1.770499691107071, + "learning_rate": 5.583921490953463e-06, + "loss": 0.4855, + "step": 4168 + }, + { + "epoch": 0.48, + "grad_norm": 1.9553997281729163, + "learning_rate": 5.5820732716142645e-06, + "loss": 0.5158, + "step": 4169 + }, + { + "epoch": 0.48, + "grad_norm": 3.5399882913403236, + "learning_rate": 5.580224971646377e-06, + "loss": 0.5656, + "step": 4170 + }, + { + "epoch": 0.48, + "grad_norm": 2.88482970919887, + "learning_rate": 5.578376591305827e-06, + "loss": 0.5364, + "step": 4171 + }, + { + "epoch": 0.48, + "grad_norm": 1.5189968890444645, + "learning_rate": 5.576528130848652e-06, + "loss": 0.4761, + "step": 4172 + }, + { + "epoch": 0.48, + "grad_norm": 0.8175979832421028, + "learning_rate": 5.5746795905309e-06, + "loss": 0.6874, + "step": 4173 + }, + { + "epoch": 0.48, + "grad_norm": 2.3937920363676315, + "learning_rate": 5.57283097060863e-06, + "loss": 0.522, + "step": 4174 + }, + { + "epoch": 0.48, + "grad_norm": 3.1400877082978993, + "learning_rate": 5.570982271337916e-06, + "loss": 0.4829, + "step": 4175 + }, + { + "epoch": 0.48, + "grad_norm": 1.9870581805467729, + "learning_rate": 5.569133492974834e-06, + "loss": 0.4377, + "step": 4176 + }, + { + "epoch": 0.48, + "grad_norm": 2.403721263461471, + "learning_rate": 5.567284635775479e-06, + "loss": 0.5694, + "step": 4177 + }, + { + "epoch": 0.48, + "grad_norm": 1.7361009979520146, + "learning_rate": 5.565435699995956e-06, + "loss": 0.4694, + "step": 4178 + }, + { + "epoch": 0.48, + "grad_norm": 1.805783916485826, + "learning_rate": 5.5635866858923774e-06, + "loss": 0.3866, + "step": 4179 + }, + { + "epoch": 0.48, + "grad_norm": 0.9363467290094493, + "learning_rate": 5.561737593720867e-06, + "loss": 0.6802, + "step": 4180 + }, + { + "epoch": 0.48, + "grad_norm": 2.75485757712712, + "learning_rate": 5.559888423737564e-06, + "loss": 0.436, + "step": 4181 + }, + { + "epoch": 0.48, + "grad_norm": 2.06935680124501, + "learning_rate": 5.558039176198613e-06, + "loss": 0.4734, + "step": 4182 + }, + { + "epoch": 0.48, + "grad_norm": 2.510465338613576, + "learning_rate": 5.556189851360173e-06, + "loss": 0.4779, + "step": 4183 + }, + { + "epoch": 0.48, + "grad_norm": 1.8589766636356018, + "learning_rate": 5.55434044947841e-06, + "loss": 0.4939, + "step": 4184 + }, + { + "epoch": 0.48, + "grad_norm": 6.460997262452607, + "learning_rate": 5.552490970809504e-06, + "loss": 0.4721, + "step": 4185 + }, + { + "epoch": 0.48, + "grad_norm": 2.077761661602961, + "learning_rate": 5.550641415609646e-06, + "loss": 0.4718, + "step": 4186 + }, + { + "epoch": 0.48, + "grad_norm": 1.892015380372752, + "learning_rate": 5.548791784135034e-06, + "loss": 0.4632, + "step": 4187 + }, + { + "epoch": 0.48, + "grad_norm": 2.183627881211844, + "learning_rate": 5.546942076641877e-06, + "loss": 0.5887, + "step": 4188 + }, + { + "epoch": 0.48, + "grad_norm": 1.803741275940871, + "learning_rate": 5.545092293386399e-06, + "loss": 0.4608, + "step": 4189 + }, + { + "epoch": 0.48, + "grad_norm": 4.468664610427685, + "learning_rate": 5.543242434624832e-06, + "loss": 0.533, + "step": 4190 + }, + { + "epoch": 0.48, + "grad_norm": 2.137967659866638, + "learning_rate": 5.5413925006134165e-06, + "loss": 0.5465, + "step": 4191 + }, + { + "epoch": 0.48, + "grad_norm": 1.8090558621307973, + "learning_rate": 5.539542491608406e-06, + "loss": 0.4436, + "step": 4192 + }, + { + "epoch": 0.48, + "grad_norm": 0.8803895026643748, + "learning_rate": 5.537692407866063e-06, + "loss": 0.6918, + "step": 4193 + }, + { + "epoch": 0.48, + "grad_norm": 2.333408820132182, + "learning_rate": 5.53584224964266e-06, + "loss": 0.4861, + "step": 4194 + }, + { + "epoch": 0.48, + "grad_norm": 2.996113450128624, + "learning_rate": 5.533992017194481e-06, + "loss": 0.5363, + "step": 4195 + }, + { + "epoch": 0.48, + "grad_norm": 1.8025060082184459, + "learning_rate": 5.532141710777822e-06, + "loss": 0.5177, + "step": 4196 + }, + { + "epoch": 0.48, + "grad_norm": 1.926863337854834, + "learning_rate": 5.530291330648985e-06, + "loss": 0.5616, + "step": 4197 + }, + { + "epoch": 0.48, + "grad_norm": 2.420091851965088, + "learning_rate": 5.528440877064284e-06, + "loss": 0.4559, + "step": 4198 + }, + { + "epoch": 0.48, + "grad_norm": 1.9440919188221362, + "learning_rate": 5.526590350280043e-06, + "loss": 0.5827, + "step": 4199 + }, + { + "epoch": 0.48, + "grad_norm": 2.5707012384382986, + "learning_rate": 5.524739750552601e-06, + "loss": 0.4705, + "step": 4200 + }, + { + "epoch": 0.48, + "grad_norm": 3.818261552427796, + "learning_rate": 5.522889078138298e-06, + "loss": 0.4518, + "step": 4201 + }, + { + "epoch": 0.48, + "grad_norm": 1.8722127777476045, + "learning_rate": 5.52103833329349e-06, + "loss": 0.4276, + "step": 4202 + }, + { + "epoch": 0.48, + "grad_norm": 1.9781903859733203, + "learning_rate": 5.5191875162745425e-06, + "loss": 0.575, + "step": 4203 + }, + { + "epoch": 0.48, + "grad_norm": 2.1610015067938004, + "learning_rate": 5.51733662733783e-06, + "loss": 0.5171, + "step": 4204 + }, + { + "epoch": 0.48, + "grad_norm": 1.7079685805933562, + "learning_rate": 5.515485666739739e-06, + "loss": 0.4369, + "step": 4205 + }, + { + "epoch": 0.48, + "grad_norm": 1.6951160336285662, + "learning_rate": 5.513634634736662e-06, + "loss": 0.5575, + "step": 4206 + }, + { + "epoch": 0.48, + "grad_norm": 1.73643320295791, + "learning_rate": 5.511783531585004e-06, + "loss": 0.4252, + "step": 4207 + }, + { + "epoch": 0.48, + "grad_norm": 2.4095701143503256, + "learning_rate": 5.50993235754118e-06, + "loss": 0.5753, + "step": 4208 + }, + { + "epoch": 0.48, + "grad_norm": 2.066039530427085, + "learning_rate": 5.508081112861614e-06, + "loss": 0.4593, + "step": 4209 + }, + { + "epoch": 0.48, + "grad_norm": 0.9531306094377258, + "learning_rate": 5.50622979780274e-06, + "loss": 0.7414, + "step": 4210 + }, + { + "epoch": 0.48, + "grad_norm": 1.8699006315609543, + "learning_rate": 5.504378412621003e-06, + "loss": 0.6138, + "step": 4211 + }, + { + "epoch": 0.48, + "grad_norm": 1.8251540803865427, + "learning_rate": 5.502526957572855e-06, + "loss": 0.555, + "step": 4212 + }, + { + "epoch": 0.48, + "grad_norm": 1.4978626745556893, + "learning_rate": 5.5006754329147595e-06, + "loss": 0.4954, + "step": 4213 + }, + { + "epoch": 0.48, + "grad_norm": 1.804855865537222, + "learning_rate": 5.4988238389031904e-06, + "loss": 0.5597, + "step": 4214 + }, + { + "epoch": 0.48, + "grad_norm": 1.8936785388999215, + "learning_rate": 5.496972175794628e-06, + "loss": 0.4483, + "step": 4215 + }, + { + "epoch": 0.48, + "grad_norm": 1.8216465927204806, + "learning_rate": 5.4951204438455674e-06, + "loss": 0.4991, + "step": 4216 + }, + { + "epoch": 0.48, + "grad_norm": 2.145256387930919, + "learning_rate": 5.49326864331251e-06, + "loss": 0.4945, + "step": 4217 + }, + { + "epoch": 0.48, + "grad_norm": 1.8095099173017117, + "learning_rate": 5.491416774451963e-06, + "loss": 0.5871, + "step": 4218 + }, + { + "epoch": 0.48, + "grad_norm": 2.0609005232464144, + "learning_rate": 5.489564837520451e-06, + "loss": 0.5, + "step": 4219 + }, + { + "epoch": 0.48, + "grad_norm": 1.7541152043472044, + "learning_rate": 5.487712832774502e-06, + "loss": 0.4983, + "step": 4220 + }, + { + "epoch": 0.49, + "grad_norm": 2.1514512289907914, + "learning_rate": 5.485860760470656e-06, + "loss": 0.5166, + "step": 4221 + }, + { + "epoch": 0.49, + "grad_norm": 1.9010564184976764, + "learning_rate": 5.484008620865461e-06, + "loss": 0.4977, + "step": 4222 + }, + { + "epoch": 0.49, + "grad_norm": 2.4878228814801493, + "learning_rate": 5.482156414215476e-06, + "loss": 0.479, + "step": 4223 + }, + { + "epoch": 0.49, + "grad_norm": 2.377831540294386, + "learning_rate": 5.480304140777268e-06, + "loss": 0.4514, + "step": 4224 + }, + { + "epoch": 0.49, + "grad_norm": 1.9775334607957433, + "learning_rate": 5.478451800807414e-06, + "loss": 0.5268, + "step": 4225 + }, + { + "epoch": 0.49, + "grad_norm": 2.1322986645843716, + "learning_rate": 5.4765993945625e-06, + "loss": 0.5763, + "step": 4226 + }, + { + "epoch": 0.49, + "grad_norm": 1.6794139374447141, + "learning_rate": 5.474746922299119e-06, + "loss": 0.5109, + "step": 4227 + }, + { + "epoch": 0.49, + "grad_norm": 1.8470214797830506, + "learning_rate": 5.472894384273877e-06, + "loss": 0.4829, + "step": 4228 + }, + { + "epoch": 0.49, + "grad_norm": 2.179447214431165, + "learning_rate": 5.471041780743388e-06, + "loss": 0.4743, + "step": 4229 + }, + { + "epoch": 0.49, + "grad_norm": 2.07856986214627, + "learning_rate": 5.4691891119642725e-06, + "loss": 0.5479, + "step": 4230 + }, + { + "epoch": 0.49, + "grad_norm": 2.096518014641802, + "learning_rate": 5.467336378193162e-06, + "loss": 0.4831, + "step": 4231 + }, + { + "epoch": 0.49, + "grad_norm": 1.7066353119078104, + "learning_rate": 5.465483579686697e-06, + "loss": 0.5222, + "step": 4232 + }, + { + "epoch": 0.49, + "grad_norm": 1.7466369247851958, + "learning_rate": 5.463630716701528e-06, + "loss": 0.5073, + "step": 4233 + }, + { + "epoch": 0.49, + "grad_norm": 1.7826965887395634, + "learning_rate": 5.461777789494312e-06, + "loss": 0.4965, + "step": 4234 + }, + { + "epoch": 0.49, + "grad_norm": 1.9315768243971463, + "learning_rate": 5.459924798321717e-06, + "loss": 0.502, + "step": 4235 + }, + { + "epoch": 0.49, + "grad_norm": 2.061152777431662, + "learning_rate": 5.458071743440418e-06, + "loss": 0.5807, + "step": 4236 + }, + { + "epoch": 0.49, + "grad_norm": 1.9141693120915213, + "learning_rate": 5.4562186251071e-06, + "loss": 0.4165, + "step": 4237 + }, + { + "epoch": 0.49, + "grad_norm": 2.4984102978562386, + "learning_rate": 5.454365443578457e-06, + "loss": 0.5617, + "step": 4238 + }, + { + "epoch": 0.49, + "grad_norm": 1.7854418412148907, + "learning_rate": 5.452512199111193e-06, + "loss": 0.5487, + "step": 4239 + }, + { + "epoch": 0.49, + "grad_norm": 1.7118796507893006, + "learning_rate": 5.450658891962016e-06, + "loss": 0.4864, + "step": 4240 + }, + { + "epoch": 0.49, + "grad_norm": 2.8075983385958314, + "learning_rate": 5.4488055223876494e-06, + "loss": 0.3922, + "step": 4241 + }, + { + "epoch": 0.49, + "grad_norm": 3.4561413500089495, + "learning_rate": 5.446952090644818e-06, + "loss": 0.4309, + "step": 4242 + }, + { + "epoch": 0.49, + "grad_norm": 2.5781385731931503, + "learning_rate": 5.445098596990261e-06, + "loss": 0.5029, + "step": 4243 + }, + { + "epoch": 0.49, + "grad_norm": 1.8081759585979522, + "learning_rate": 5.443245041680722e-06, + "loss": 0.5132, + "step": 4244 + }, + { + "epoch": 0.49, + "grad_norm": 2.5807753283071673, + "learning_rate": 5.441391424972958e-06, + "loss": 0.4887, + "step": 4245 + }, + { + "epoch": 0.49, + "grad_norm": 1.773516632891921, + "learning_rate": 5.4395377471237295e-06, + "loss": 0.5751, + "step": 4246 + }, + { + "epoch": 0.49, + "grad_norm": 2.0648290134306038, + "learning_rate": 5.43768400838981e-06, + "loss": 0.474, + "step": 4247 + }, + { + "epoch": 0.49, + "grad_norm": 2.224153987683954, + "learning_rate": 5.435830209027977e-06, + "loss": 0.5415, + "step": 4248 + }, + { + "epoch": 0.49, + "grad_norm": 1.6465132193170648, + "learning_rate": 5.433976349295018e-06, + "loss": 0.4725, + "step": 4249 + }, + { + "epoch": 0.49, + "grad_norm": 0.8740972767400428, + "learning_rate": 5.432122429447731e-06, + "loss": 0.6947, + "step": 4250 + }, + { + "epoch": 0.49, + "grad_norm": 2.0403575066044994, + "learning_rate": 5.43026844974292e-06, + "loss": 0.507, + "step": 4251 + }, + { + "epoch": 0.49, + "grad_norm": 2.012382509718183, + "learning_rate": 5.428414410437397e-06, + "loss": 0.4546, + "step": 4252 + }, + { + "epoch": 0.49, + "grad_norm": 1.7782547283467982, + "learning_rate": 5.426560311787984e-06, + "loss": 0.4318, + "step": 4253 + }, + { + "epoch": 0.49, + "grad_norm": 2.1001178258887134, + "learning_rate": 5.4247061540515115e-06, + "loss": 0.526, + "step": 4254 + }, + { + "epoch": 0.49, + "grad_norm": 1.785369659172782, + "learning_rate": 5.422851937484814e-06, + "loss": 0.6257, + "step": 4255 + }, + { + "epoch": 0.49, + "grad_norm": 2.782298455372749, + "learning_rate": 5.420997662344741e-06, + "loss": 0.5945, + "step": 4256 + }, + { + "epoch": 0.49, + "grad_norm": 1.9192264034322384, + "learning_rate": 5.4191433288881445e-06, + "loss": 0.5392, + "step": 4257 + }, + { + "epoch": 0.49, + "grad_norm": 1.9576819276746926, + "learning_rate": 5.417288937371886e-06, + "loss": 0.5385, + "step": 4258 + }, + { + "epoch": 0.49, + "grad_norm": 2.3223863130300475, + "learning_rate": 5.415434488052838e-06, + "loss": 0.3732, + "step": 4259 + }, + { + "epoch": 0.49, + "grad_norm": 2.259740039037036, + "learning_rate": 5.413579981187876e-06, + "loss": 0.5995, + "step": 4260 + }, + { + "epoch": 0.49, + "grad_norm": 1.9748927540697199, + "learning_rate": 5.411725417033886e-06, + "loss": 0.4326, + "step": 4261 + }, + { + "epoch": 0.49, + "grad_norm": 2.0168311808756862, + "learning_rate": 5.409870795847763e-06, + "loss": 0.4861, + "step": 4262 + }, + { + "epoch": 0.49, + "grad_norm": 2.101034914947914, + "learning_rate": 5.408016117886408e-06, + "loss": 0.4539, + "step": 4263 + }, + { + "epoch": 0.49, + "grad_norm": 2.0662258097021082, + "learning_rate": 5.40616138340673e-06, + "loss": 0.4696, + "step": 4264 + }, + { + "epoch": 0.49, + "grad_norm": 1.7375359031607729, + "learning_rate": 5.404306592665649e-06, + "loss": 0.6386, + "step": 4265 + }, + { + "epoch": 0.49, + "grad_norm": 1.701437154831304, + "learning_rate": 5.402451745920089e-06, + "loss": 0.4951, + "step": 4266 + }, + { + "epoch": 0.49, + "grad_norm": 1.7047353217265153, + "learning_rate": 5.400596843426982e-06, + "loss": 0.4027, + "step": 4267 + }, + { + "epoch": 0.49, + "grad_norm": 1.950201280037983, + "learning_rate": 5.398741885443271e-06, + "loss": 0.5135, + "step": 4268 + }, + { + "epoch": 0.49, + "grad_norm": 0.9419896895584683, + "learning_rate": 5.396886872225902e-06, + "loss": 0.7379, + "step": 4269 + }, + { + "epoch": 0.49, + "grad_norm": 2.285664711064563, + "learning_rate": 5.395031804031832e-06, + "loss": 0.5395, + "step": 4270 + }, + { + "epoch": 0.49, + "grad_norm": 1.8750682948924278, + "learning_rate": 5.3931766811180255e-06, + "loss": 0.452, + "step": 4271 + }, + { + "epoch": 0.49, + "grad_norm": 2.179654062503672, + "learning_rate": 5.391321503741454e-06, + "loss": 0.6463, + "step": 4272 + }, + { + "epoch": 0.49, + "grad_norm": 1.9799662770085158, + "learning_rate": 5.3894662721590926e-06, + "loss": 0.4878, + "step": 4273 + }, + { + "epoch": 0.49, + "grad_norm": 1.9257453879214355, + "learning_rate": 5.387610986627933e-06, + "loss": 0.4606, + "step": 4274 + }, + { + "epoch": 0.49, + "grad_norm": 1.8355157905878297, + "learning_rate": 5.385755647404966e-06, + "loss": 0.4925, + "step": 4275 + }, + { + "epoch": 0.49, + "grad_norm": 2.3039730665079845, + "learning_rate": 5.383900254747195e-06, + "loss": 0.6112, + "step": 4276 + }, + { + "epoch": 0.49, + "grad_norm": 3.217442509109483, + "learning_rate": 5.382044808911626e-06, + "loss": 0.487, + "step": 4277 + }, + { + "epoch": 0.49, + "grad_norm": 1.7726503949957795, + "learning_rate": 5.380189310155276e-06, + "loss": 0.4985, + "step": 4278 + }, + { + "epoch": 0.49, + "grad_norm": 1.9436139307883982, + "learning_rate": 5.378333758735168e-06, + "loss": 0.4206, + "step": 4279 + }, + { + "epoch": 0.49, + "grad_norm": 2.068196128346733, + "learning_rate": 5.376478154908335e-06, + "loss": 0.5087, + "step": 4280 + }, + { + "epoch": 0.49, + "grad_norm": 1.9872965090234882, + "learning_rate": 5.374622498931812e-06, + "loss": 0.4316, + "step": 4281 + }, + { + "epoch": 0.49, + "grad_norm": 2.01509428138601, + "learning_rate": 5.372766791062645e-06, + "loss": 0.4593, + "step": 4282 + }, + { + "epoch": 0.49, + "grad_norm": 1.8060045180271909, + "learning_rate": 5.370911031557887e-06, + "loss": 0.3792, + "step": 4283 + }, + { + "epoch": 0.49, + "grad_norm": 2.0466213177281234, + "learning_rate": 5.369055220674597e-06, + "loss": 0.4503, + "step": 4284 + }, + { + "epoch": 0.49, + "grad_norm": 1.936093595553273, + "learning_rate": 5.3671993586698434e-06, + "loss": 0.4687, + "step": 4285 + }, + { + "epoch": 0.49, + "grad_norm": 1.9042011440650939, + "learning_rate": 5.365343445800697e-06, + "loss": 0.5608, + "step": 4286 + }, + { + "epoch": 0.49, + "grad_norm": 2.195929676320687, + "learning_rate": 5.363487482324239e-06, + "loss": 0.4767, + "step": 4287 + }, + { + "epoch": 0.49, + "grad_norm": 2.162688457490507, + "learning_rate": 5.361631468497559e-06, + "loss": 0.4512, + "step": 4288 + }, + { + "epoch": 0.49, + "grad_norm": 2.4730370416897656, + "learning_rate": 5.35977540457775e-06, + "loss": 0.4678, + "step": 4289 + }, + { + "epoch": 0.49, + "grad_norm": 1.9473156565727763, + "learning_rate": 5.3579192908219145e-06, + "loss": 0.4304, + "step": 4290 + }, + { + "epoch": 0.49, + "grad_norm": 2.0396690287040533, + "learning_rate": 5.3560631274871626e-06, + "loss": 0.3768, + "step": 4291 + }, + { + "epoch": 0.49, + "grad_norm": 2.165263328611737, + "learning_rate": 5.354206914830605e-06, + "loss": 0.4551, + "step": 4292 + }, + { + "epoch": 0.49, + "grad_norm": 1.8923391706900512, + "learning_rate": 5.352350653109368e-06, + "loss": 0.4594, + "step": 4293 + }, + { + "epoch": 0.49, + "grad_norm": 2.2714876814163976, + "learning_rate": 5.350494342580581e-06, + "loss": 0.4843, + "step": 4294 + }, + { + "epoch": 0.49, + "grad_norm": 2.235918414202698, + "learning_rate": 5.348637983501377e-06, + "loss": 0.4958, + "step": 4295 + }, + { + "epoch": 0.49, + "grad_norm": 2.9873927183824973, + "learning_rate": 5.3467815761289e-06, + "loss": 0.439, + "step": 4296 + }, + { + "epoch": 0.49, + "grad_norm": 1.7617756632461765, + "learning_rate": 5.344925120720299e-06, + "loss": 0.432, + "step": 4297 + }, + { + "epoch": 0.49, + "grad_norm": 2.3517273847692013, + "learning_rate": 5.343068617532729e-06, + "loss": 0.5884, + "step": 4298 + }, + { + "epoch": 0.49, + "grad_norm": 0.9316001360412404, + "learning_rate": 5.341212066823356e-06, + "loss": 0.7352, + "step": 4299 + }, + { + "epoch": 0.49, + "grad_norm": 2.147101273486869, + "learning_rate": 5.339355468849344e-06, + "loss": 0.557, + "step": 4300 + }, + { + "epoch": 0.49, + "grad_norm": 3.0610002074838554, + "learning_rate": 5.337498823867872e-06, + "loss": 0.5003, + "step": 4301 + }, + { + "epoch": 0.49, + "grad_norm": 2.3205051367957763, + "learning_rate": 5.335642132136124e-06, + "loss": 0.5459, + "step": 4302 + }, + { + "epoch": 0.49, + "grad_norm": 1.8220091671274186, + "learning_rate": 5.333785393911284e-06, + "loss": 0.5629, + "step": 4303 + }, + { + "epoch": 0.49, + "grad_norm": 2.628397747190286, + "learning_rate": 5.331928609450548e-06, + "loss": 0.5112, + "step": 4304 + }, + { + "epoch": 0.49, + "grad_norm": 1.8262453351010755, + "learning_rate": 5.33007177901112e-06, + "loss": 0.4596, + "step": 4305 + }, + { + "epoch": 0.49, + "grad_norm": 2.2186296865060164, + "learning_rate": 5.328214902850205e-06, + "loss": 0.389, + "step": 4306 + }, + { + "epoch": 0.49, + "grad_norm": 2.1208898499289632, + "learning_rate": 5.32635798122502e-06, + "loss": 0.4686, + "step": 4307 + }, + { + "epoch": 0.5, + "grad_norm": 2.888215400637544, + "learning_rate": 5.324501014392782e-06, + "loss": 0.508, + "step": 4308 + }, + { + "epoch": 0.5, + "grad_norm": 1.9549221183415764, + "learning_rate": 5.322644002610722e-06, + "loss": 0.5682, + "step": 4309 + }, + { + "epoch": 0.5, + "grad_norm": 1.8053569026506293, + "learning_rate": 5.32078694613607e-06, + "loss": 0.447, + "step": 4310 + }, + { + "epoch": 0.5, + "grad_norm": 1.7138974462734162, + "learning_rate": 5.318929845226065e-06, + "loss": 0.495, + "step": 4311 + }, + { + "epoch": 0.5, + "grad_norm": 2.1456297340250887, + "learning_rate": 5.317072700137953e-06, + "loss": 0.4689, + "step": 4312 + }, + { + "epoch": 0.5, + "grad_norm": 1.8133898802325854, + "learning_rate": 5.3152155111289874e-06, + "loss": 0.6012, + "step": 4313 + }, + { + "epoch": 0.5, + "grad_norm": 2.649796877364736, + "learning_rate": 5.313358278456422e-06, + "loss": 0.4731, + "step": 4314 + }, + { + "epoch": 0.5, + "grad_norm": 2.084404569254779, + "learning_rate": 5.3115010023775225e-06, + "loss": 0.4889, + "step": 4315 + }, + { + "epoch": 0.5, + "grad_norm": 1.7574113251113324, + "learning_rate": 5.309643683149558e-06, + "loss": 0.4714, + "step": 4316 + }, + { + "epoch": 0.5, + "grad_norm": 2.32321764253821, + "learning_rate": 5.307786321029804e-06, + "loss": 0.4894, + "step": 4317 + }, + { + "epoch": 0.5, + "grad_norm": 2.2052755098665786, + "learning_rate": 5.305928916275544e-06, + "loss": 0.5042, + "step": 4318 + }, + { + "epoch": 0.5, + "grad_norm": 2.3187308264612514, + "learning_rate": 5.304071469144061e-06, + "loss": 0.4639, + "step": 4319 + }, + { + "epoch": 0.5, + "grad_norm": 2.765116291849073, + "learning_rate": 5.302213979892652e-06, + "loss": 0.3894, + "step": 4320 + }, + { + "epoch": 0.5, + "grad_norm": 2.525920744131966, + "learning_rate": 5.300356448778614e-06, + "loss": 0.5832, + "step": 4321 + }, + { + "epoch": 0.5, + "grad_norm": 2.237883535455513, + "learning_rate": 5.298498876059252e-06, + "loss": 0.5571, + "step": 4322 + }, + { + "epoch": 0.5, + "grad_norm": 7.674111048425905, + "learning_rate": 5.29664126199188e-06, + "loss": 0.4623, + "step": 4323 + }, + { + "epoch": 0.5, + "grad_norm": 2.2680513703188048, + "learning_rate": 5.29478360683381e-06, + "loss": 0.4823, + "step": 4324 + }, + { + "epoch": 0.5, + "grad_norm": 3.362862037665259, + "learning_rate": 5.292925910842366e-06, + "loss": 0.5677, + "step": 4325 + }, + { + "epoch": 0.5, + "grad_norm": 1.8700489467813168, + "learning_rate": 5.291068174274876e-06, + "loss": 0.5913, + "step": 4326 + }, + { + "epoch": 0.5, + "grad_norm": 1.9771333603192363, + "learning_rate": 5.289210397388673e-06, + "loss": 0.4329, + "step": 4327 + }, + { + "epoch": 0.5, + "grad_norm": 3.4068578536091305, + "learning_rate": 5.287352580441095e-06, + "loss": 0.5226, + "step": 4328 + }, + { + "epoch": 0.5, + "grad_norm": 2.08893941663053, + "learning_rate": 5.285494723689488e-06, + "loss": 0.4172, + "step": 4329 + }, + { + "epoch": 0.5, + "grad_norm": 2.042582229866781, + "learning_rate": 5.283636827391201e-06, + "loss": 0.4304, + "step": 4330 + }, + { + "epoch": 0.5, + "grad_norm": 2.0324273903259087, + "learning_rate": 5.281778891803591e-06, + "loss": 0.4649, + "step": 4331 + }, + { + "epoch": 0.5, + "grad_norm": 1.536997390166242, + "learning_rate": 5.279920917184016e-06, + "loss": 0.5185, + "step": 4332 + }, + { + "epoch": 0.5, + "grad_norm": 1.6478052513377677, + "learning_rate": 5.278062903789846e-06, + "loss": 0.5101, + "step": 4333 + }, + { + "epoch": 0.5, + "grad_norm": 2.0240575938795122, + "learning_rate": 5.276204851878448e-06, + "loss": 0.5014, + "step": 4334 + }, + { + "epoch": 0.5, + "grad_norm": 1.9664211056797702, + "learning_rate": 5.274346761707204e-06, + "loss": 0.5297, + "step": 4335 + }, + { + "epoch": 0.5, + "grad_norm": 0.8939815045988033, + "learning_rate": 5.272488633533493e-06, + "loss": 0.7381, + "step": 4336 + }, + { + "epoch": 0.5, + "grad_norm": 1.5707312262037825, + "learning_rate": 5.270630467614705e-06, + "loss": 0.5537, + "step": 4337 + }, + { + "epoch": 0.5, + "grad_norm": 3.859466157561985, + "learning_rate": 5.268772264208231e-06, + "loss": 0.5047, + "step": 4338 + }, + { + "epoch": 0.5, + "grad_norm": 1.972332579589227, + "learning_rate": 5.266914023571468e-06, + "loss": 0.4926, + "step": 4339 + }, + { + "epoch": 0.5, + "grad_norm": 2.0496259683604685, + "learning_rate": 5.265055745961821e-06, + "loss": 0.5593, + "step": 4340 + }, + { + "epoch": 0.5, + "grad_norm": 2.112464638054792, + "learning_rate": 5.2631974316367e-06, + "loss": 0.4535, + "step": 4341 + }, + { + "epoch": 0.5, + "grad_norm": 2.034860193759808, + "learning_rate": 5.261339080853514e-06, + "loss": 0.526, + "step": 4342 + }, + { + "epoch": 0.5, + "grad_norm": 1.6356161787167383, + "learning_rate": 5.2594806938696855e-06, + "loss": 0.4621, + "step": 4343 + }, + { + "epoch": 0.5, + "grad_norm": 2.3163697904653744, + "learning_rate": 5.257622270942636e-06, + "loss": 0.5368, + "step": 4344 + }, + { + "epoch": 0.5, + "grad_norm": 1.8011054415475338, + "learning_rate": 5.2557638123297924e-06, + "loss": 0.4875, + "step": 4345 + }, + { + "epoch": 0.5, + "grad_norm": 0.9260275417036818, + "learning_rate": 5.2539053182885916e-06, + "loss": 0.6972, + "step": 4346 + }, + { + "epoch": 0.5, + "grad_norm": 1.8487262902771333, + "learning_rate": 5.252046789076469e-06, + "loss": 0.5379, + "step": 4347 + }, + { + "epoch": 0.5, + "grad_norm": 1.8240406318622295, + "learning_rate": 5.25018822495087e-06, + "loss": 0.415, + "step": 4348 + }, + { + "epoch": 0.5, + "grad_norm": 1.9681706418100264, + "learning_rate": 5.2483296261692405e-06, + "loss": 0.5093, + "step": 4349 + }, + { + "epoch": 0.5, + "grad_norm": 2.0904278415737703, + "learning_rate": 5.246470992989034e-06, + "loss": 0.5288, + "step": 4350 + }, + { + "epoch": 0.5, + "grad_norm": 1.9818633877379053, + "learning_rate": 5.24461232566771e-06, + "loss": 0.4791, + "step": 4351 + }, + { + "epoch": 0.5, + "grad_norm": 1.7374506478528393, + "learning_rate": 5.242753624462728e-06, + "loss": 0.5259, + "step": 4352 + }, + { + "epoch": 0.5, + "grad_norm": 2.422080367277146, + "learning_rate": 5.240894889631556e-06, + "loss": 0.4409, + "step": 4353 + }, + { + "epoch": 0.5, + "grad_norm": 3.734159568443539, + "learning_rate": 5.239036121431664e-06, + "loss": 0.4472, + "step": 4354 + }, + { + "epoch": 0.5, + "grad_norm": 1.871594824009396, + "learning_rate": 5.237177320120532e-06, + "loss": 0.4872, + "step": 4355 + }, + { + "epoch": 0.5, + "grad_norm": 3.873872002614127, + "learning_rate": 5.235318485955638e-06, + "loss": 0.4124, + "step": 4356 + }, + { + "epoch": 0.5, + "grad_norm": 2.6794890196754846, + "learning_rate": 5.233459619194469e-06, + "loss": 0.5986, + "step": 4357 + }, + { + "epoch": 0.5, + "grad_norm": 0.8383798721512503, + "learning_rate": 5.231600720094513e-06, + "loss": 0.6959, + "step": 4358 + }, + { + "epoch": 0.5, + "grad_norm": 1.870038946976061, + "learning_rate": 5.2297417889132655e-06, + "loss": 0.4866, + "step": 4359 + }, + { + "epoch": 0.5, + "grad_norm": 1.6334975065562425, + "learning_rate": 5.227882825908224e-06, + "loss": 0.5275, + "step": 4360 + }, + { + "epoch": 0.5, + "grad_norm": 2.018515835667817, + "learning_rate": 5.226023831336895e-06, + "loss": 0.5197, + "step": 4361 + }, + { + "epoch": 0.5, + "grad_norm": 1.683333857728174, + "learning_rate": 5.224164805456783e-06, + "loss": 0.4603, + "step": 4362 + }, + { + "epoch": 0.5, + "grad_norm": 2.982486207758545, + "learning_rate": 5.222305748525401e-06, + "loss": 0.452, + "step": 4363 + }, + { + "epoch": 0.5, + "grad_norm": 2.029233754403527, + "learning_rate": 5.220446660800264e-06, + "loss": 0.505, + "step": 4364 + }, + { + "epoch": 0.5, + "grad_norm": 3.0474856636766705, + "learning_rate": 5.218587542538895e-06, + "loss": 0.5432, + "step": 4365 + }, + { + "epoch": 0.5, + "grad_norm": 1.684838802235383, + "learning_rate": 5.216728393998818e-06, + "loss": 0.4143, + "step": 4366 + }, + { + "epoch": 0.5, + "grad_norm": 1.9160241418602428, + "learning_rate": 5.214869215437562e-06, + "loss": 0.4501, + "step": 4367 + }, + { + "epoch": 0.5, + "grad_norm": 1.873330938251454, + "learning_rate": 5.2130100071126565e-06, + "loss": 0.5433, + "step": 4368 + }, + { + "epoch": 0.5, + "grad_norm": 2.2401641636346383, + "learning_rate": 5.211150769281645e-06, + "loss": 0.555, + "step": 4369 + }, + { + "epoch": 0.5, + "grad_norm": 2.038017172982198, + "learning_rate": 5.209291502202064e-06, + "loss": 0.5833, + "step": 4370 + }, + { + "epoch": 0.5, + "grad_norm": 1.8120940362954039, + "learning_rate": 5.20743220613146e-06, + "loss": 0.5407, + "step": 4371 + }, + { + "epoch": 0.5, + "grad_norm": 2.065648516569301, + "learning_rate": 5.205572881327383e-06, + "loss": 0.4097, + "step": 4372 + }, + { + "epoch": 0.5, + "grad_norm": 2.270095588239838, + "learning_rate": 5.203713528047386e-06, + "loss": 0.599, + "step": 4373 + }, + { + "epoch": 0.5, + "grad_norm": 2.1391595974591953, + "learning_rate": 5.201854146549027e-06, + "loss": 0.5998, + "step": 4374 + }, + { + "epoch": 0.5, + "grad_norm": 2.437619006601976, + "learning_rate": 5.199994737089868e-06, + "loss": 0.5048, + "step": 4375 + }, + { + "epoch": 0.5, + "grad_norm": 3.5846032549539664, + "learning_rate": 5.198135299927469e-06, + "loss": 0.4505, + "step": 4376 + }, + { + "epoch": 0.5, + "grad_norm": 1.684985479226318, + "learning_rate": 5.196275835319405e-06, + "loss": 0.4815, + "step": 4377 + }, + { + "epoch": 0.5, + "grad_norm": 2.1384131635457635, + "learning_rate": 5.194416343523246e-06, + "loss": 0.4707, + "step": 4378 + }, + { + "epoch": 0.5, + "grad_norm": 2.0767712689451283, + "learning_rate": 5.1925568247965686e-06, + "loss": 0.5008, + "step": 4379 + }, + { + "epoch": 0.5, + "grad_norm": 2.1263901944187285, + "learning_rate": 5.190697279396954e-06, + "loss": 0.4936, + "step": 4380 + }, + { + "epoch": 0.5, + "grad_norm": 2.2794019538675903, + "learning_rate": 5.188837707581983e-06, + "loss": 0.5597, + "step": 4381 + }, + { + "epoch": 0.5, + "grad_norm": 2.09279435308324, + "learning_rate": 5.186978109609248e-06, + "loss": 0.4557, + "step": 4382 + }, + { + "epoch": 0.5, + "grad_norm": 2.6592518939409033, + "learning_rate": 5.185118485736336e-06, + "loss": 0.4075, + "step": 4383 + }, + { + "epoch": 0.5, + "grad_norm": 2.3210002815613073, + "learning_rate": 5.183258836220844e-06, + "loss": 0.4643, + "step": 4384 + }, + { + "epoch": 0.5, + "grad_norm": 3.044384672567464, + "learning_rate": 5.181399161320368e-06, + "loss": 0.5388, + "step": 4385 + }, + { + "epoch": 0.5, + "grad_norm": 2.1456808046138676, + "learning_rate": 5.179539461292514e-06, + "loss": 0.4865, + "step": 4386 + }, + { + "epoch": 0.5, + "grad_norm": 1.6872103740112088, + "learning_rate": 5.177679736394885e-06, + "loss": 0.3983, + "step": 4387 + }, + { + "epoch": 0.5, + "grad_norm": 1.9978209348756895, + "learning_rate": 5.1758199868850875e-06, + "loss": 0.4902, + "step": 4388 + }, + { + "epoch": 0.5, + "grad_norm": 2.1071930925392977, + "learning_rate": 5.173960213020737e-06, + "loss": 0.4628, + "step": 4389 + }, + { + "epoch": 0.5, + "grad_norm": 2.074987885951766, + "learning_rate": 5.172100415059449e-06, + "loss": 0.5643, + "step": 4390 + }, + { + "epoch": 0.5, + "grad_norm": 1.9512771937207587, + "learning_rate": 5.170240593258839e-06, + "loss": 0.5357, + "step": 4391 + }, + { + "epoch": 0.5, + "grad_norm": 2.2705834412404644, + "learning_rate": 5.1683807478765335e-06, + "loss": 0.5121, + "step": 4392 + }, + { + "epoch": 0.5, + "grad_norm": 1.9471872284011937, + "learning_rate": 5.166520879170156e-06, + "loss": 0.4819, + "step": 4393 + }, + { + "epoch": 0.5, + "grad_norm": 2.3250527388533877, + "learning_rate": 5.1646609873973354e-06, + "loss": 0.5091, + "step": 4394 + }, + { + "epoch": 0.5, + "grad_norm": 2.1897222454917453, + "learning_rate": 5.162801072815702e-06, + "loss": 0.5601, + "step": 4395 + }, + { + "epoch": 0.51, + "grad_norm": 2.26402763171055, + "learning_rate": 5.160941135682893e-06, + "loss": 0.6025, + "step": 4396 + }, + { + "epoch": 0.51, + "grad_norm": 2.1002163919247137, + "learning_rate": 5.159081176256545e-06, + "loss": 0.4894, + "step": 4397 + }, + { + "epoch": 0.51, + "grad_norm": 2.1251487603159185, + "learning_rate": 5.157221194794302e-06, + "loss": 0.5199, + "step": 4398 + }, + { + "epoch": 0.51, + "grad_norm": 2.8842010797318722, + "learning_rate": 5.155361191553804e-06, + "loss": 0.6349, + "step": 4399 + }, + { + "epoch": 0.51, + "grad_norm": 1.810397302117786, + "learning_rate": 5.153501166792702e-06, + "loss": 0.4984, + "step": 4400 + }, + { + "epoch": 0.51, + "grad_norm": 2.3760559904731213, + "learning_rate": 5.1516411207686435e-06, + "loss": 0.431, + "step": 4401 + }, + { + "epoch": 0.51, + "grad_norm": 1.9991125693301748, + "learning_rate": 5.1497810537392844e-06, + "loss": 0.4583, + "step": 4402 + }, + { + "epoch": 0.51, + "grad_norm": 2.240526245928486, + "learning_rate": 5.147920965962279e-06, + "loss": 0.6155, + "step": 4403 + }, + { + "epoch": 0.51, + "grad_norm": 4.573512953126507, + "learning_rate": 5.146060857695288e-06, + "loss": 0.454, + "step": 4404 + }, + { + "epoch": 0.51, + "grad_norm": 1.9684184489817413, + "learning_rate": 5.1442007291959715e-06, + "loss": 0.5029, + "step": 4405 + }, + { + "epoch": 0.51, + "grad_norm": 2.222177936080414, + "learning_rate": 5.142340580721995e-06, + "loss": 0.4521, + "step": 4406 + }, + { + "epoch": 0.51, + "grad_norm": 2.084433030978843, + "learning_rate": 5.140480412531027e-06, + "loss": 0.5306, + "step": 4407 + }, + { + "epoch": 0.51, + "grad_norm": 1.8880798612677099, + "learning_rate": 5.138620224880735e-06, + "loss": 0.5455, + "step": 4408 + }, + { + "epoch": 0.51, + "grad_norm": 1.7852614397771662, + "learning_rate": 5.136760018028793e-06, + "loss": 0.5512, + "step": 4409 + }, + { + "epoch": 0.51, + "grad_norm": 2.0460389888499733, + "learning_rate": 5.134899792232879e-06, + "loss": 0.4369, + "step": 4410 + }, + { + "epoch": 0.51, + "grad_norm": 1.953868119269436, + "learning_rate": 5.133039547750669e-06, + "loss": 0.424, + "step": 4411 + }, + { + "epoch": 0.51, + "grad_norm": 1.9390321726996727, + "learning_rate": 5.131179284839843e-06, + "loss": 0.483, + "step": 4412 + }, + { + "epoch": 0.51, + "grad_norm": 2.4585019776191643, + "learning_rate": 5.129319003758085e-06, + "loss": 0.4566, + "step": 4413 + }, + { + "epoch": 0.51, + "grad_norm": 1.9849129612851817, + "learning_rate": 5.1274587047630816e-06, + "loss": 0.4418, + "step": 4414 + }, + { + "epoch": 0.51, + "grad_norm": 1.7482859930494572, + "learning_rate": 5.12559838811252e-06, + "loss": 0.4723, + "step": 4415 + }, + { + "epoch": 0.51, + "grad_norm": 3.0225913574524164, + "learning_rate": 5.1237380540640915e-06, + "loss": 0.5704, + "step": 4416 + }, + { + "epoch": 0.51, + "grad_norm": 2.2120385249626775, + "learning_rate": 5.12187770287549e-06, + "loss": 0.4763, + "step": 4417 + }, + { + "epoch": 0.51, + "grad_norm": 2.209407236134105, + "learning_rate": 5.12001733480441e-06, + "loss": 0.4632, + "step": 4418 + }, + { + "epoch": 0.51, + "grad_norm": 1.5682980079890456, + "learning_rate": 5.118156950108549e-06, + "loss": 0.4961, + "step": 4419 + }, + { + "epoch": 0.51, + "grad_norm": 2.23644487557047, + "learning_rate": 5.116296549045607e-06, + "loss": 0.5267, + "step": 4420 + }, + { + "epoch": 0.51, + "grad_norm": 2.0703623510496376, + "learning_rate": 5.11443613187329e-06, + "loss": 0.5338, + "step": 4421 + }, + { + "epoch": 0.51, + "grad_norm": 1.761164869980416, + "learning_rate": 5.1125756988492985e-06, + "loss": 0.5174, + "step": 4422 + }, + { + "epoch": 0.51, + "grad_norm": 1.9731425486949428, + "learning_rate": 5.110715250231338e-06, + "loss": 0.4494, + "step": 4423 + }, + { + "epoch": 0.51, + "grad_norm": 1.8234457846210086, + "learning_rate": 5.108854786277123e-06, + "loss": 0.4883, + "step": 4424 + }, + { + "epoch": 0.51, + "grad_norm": 2.9658537361969675, + "learning_rate": 5.106994307244361e-06, + "loss": 0.476, + "step": 4425 + }, + { + "epoch": 0.51, + "grad_norm": 2.6050939769434147, + "learning_rate": 5.105133813390766e-06, + "loss": 0.4531, + "step": 4426 + }, + { + "epoch": 0.51, + "grad_norm": 1.857408683426242, + "learning_rate": 5.103273304974054e-06, + "loss": 0.5616, + "step": 4427 + }, + { + "epoch": 0.51, + "grad_norm": 1.746832964508357, + "learning_rate": 5.101412782251942e-06, + "loss": 0.5693, + "step": 4428 + }, + { + "epoch": 0.51, + "grad_norm": 3.1664387648996897, + "learning_rate": 5.099552245482148e-06, + "loss": 0.4356, + "step": 4429 + }, + { + "epoch": 0.51, + "grad_norm": 2.1036049183623673, + "learning_rate": 5.097691694922394e-06, + "loss": 0.5181, + "step": 4430 + }, + { + "epoch": 0.51, + "grad_norm": 2.4064135352355036, + "learning_rate": 5.0958311308304045e-06, + "loss": 0.4886, + "step": 4431 + }, + { + "epoch": 0.51, + "grad_norm": 1.7729229382258174, + "learning_rate": 5.093970553463901e-06, + "loss": 0.4196, + "step": 4432 + }, + { + "epoch": 0.51, + "grad_norm": 2.097947705861859, + "learning_rate": 5.092109963080614e-06, + "loss": 0.4542, + "step": 4433 + }, + { + "epoch": 0.51, + "grad_norm": 5.429005079460941, + "learning_rate": 5.090249359938273e-06, + "loss": 0.554, + "step": 4434 + }, + { + "epoch": 0.51, + "grad_norm": 1.946827344150558, + "learning_rate": 5.088388744294603e-06, + "loss": 0.5755, + "step": 4435 + }, + { + "epoch": 0.51, + "grad_norm": 1.8314475235137133, + "learning_rate": 5.086528116407342e-06, + "loss": 0.5895, + "step": 4436 + }, + { + "epoch": 0.51, + "grad_norm": 2.0860068066893174, + "learning_rate": 5.084667476534221e-06, + "loss": 0.532, + "step": 4437 + }, + { + "epoch": 0.51, + "grad_norm": 2.4383616511966264, + "learning_rate": 5.0828068249329755e-06, + "loss": 0.485, + "step": 4438 + }, + { + "epoch": 0.51, + "grad_norm": 2.766195864271854, + "learning_rate": 5.080946161861342e-06, + "loss": 0.6115, + "step": 4439 + }, + { + "epoch": 0.51, + "grad_norm": 1.6835757337439372, + "learning_rate": 5.0790854875770604e-06, + "loss": 0.4393, + "step": 4440 + }, + { + "epoch": 0.51, + "grad_norm": 1.7456475738059325, + "learning_rate": 5.077224802337872e-06, + "loss": 0.4675, + "step": 4441 + }, + { + "epoch": 0.51, + "grad_norm": 0.8717471764186547, + "learning_rate": 5.075364106401517e-06, + "loss": 0.7052, + "step": 4442 + }, + { + "epoch": 0.51, + "grad_norm": 2.2690095935046366, + "learning_rate": 5.073503400025737e-06, + "loss": 0.5163, + "step": 4443 + }, + { + "epoch": 0.51, + "grad_norm": 2.3592273330822313, + "learning_rate": 5.071642683468281e-06, + "loss": 0.6023, + "step": 4444 + }, + { + "epoch": 0.51, + "grad_norm": 2.04600923860418, + "learning_rate": 5.069781956986894e-06, + "loss": 0.4787, + "step": 4445 + }, + { + "epoch": 0.51, + "grad_norm": 1.9667481473015458, + "learning_rate": 5.0679212208393196e-06, + "loss": 0.5252, + "step": 4446 + }, + { + "epoch": 0.51, + "grad_norm": 2.644024291599729, + "learning_rate": 5.06606047528331e-06, + "loss": 0.5743, + "step": 4447 + }, + { + "epoch": 0.51, + "grad_norm": 2.145402236935919, + "learning_rate": 5.064199720576615e-06, + "loss": 0.4606, + "step": 4448 + }, + { + "epoch": 0.51, + "grad_norm": 2.0620081524132967, + "learning_rate": 5.062338956976986e-06, + "loss": 0.5047, + "step": 4449 + }, + { + "epoch": 0.51, + "grad_norm": 0.8712706130409299, + "learning_rate": 5.060478184742176e-06, + "loss": 0.7158, + "step": 4450 + }, + { + "epoch": 0.51, + "grad_norm": 1.7213720806496773, + "learning_rate": 5.058617404129938e-06, + "loss": 0.4672, + "step": 4451 + }, + { + "epoch": 0.51, + "grad_norm": 2.4133541248976638, + "learning_rate": 5.056756615398026e-06, + "loss": 0.4419, + "step": 4452 + }, + { + "epoch": 0.51, + "grad_norm": 1.9803735824066313, + "learning_rate": 5.0548958188042e-06, + "loss": 0.461, + "step": 4453 + }, + { + "epoch": 0.51, + "grad_norm": 2.1501955369656502, + "learning_rate": 5.053035014606212e-06, + "loss": 0.4827, + "step": 4454 + }, + { + "epoch": 0.51, + "grad_norm": 2.02400132473691, + "learning_rate": 5.051174203061825e-06, + "loss": 0.4974, + "step": 4455 + }, + { + "epoch": 0.51, + "grad_norm": 2.1407765667243654, + "learning_rate": 5.0493133844287955e-06, + "loss": 0.4706, + "step": 4456 + }, + { + "epoch": 0.51, + "grad_norm": 1.9997346287617235, + "learning_rate": 5.047452558964884e-06, + "loss": 0.5207, + "step": 4457 + }, + { + "epoch": 0.51, + "grad_norm": 2.02911970575265, + "learning_rate": 5.045591726927853e-06, + "loss": 0.5419, + "step": 4458 + }, + { + "epoch": 0.51, + "grad_norm": 2.389921304692081, + "learning_rate": 5.043730888575463e-06, + "loss": 0.5249, + "step": 4459 + }, + { + "epoch": 0.51, + "grad_norm": 1.6545975138723845, + "learning_rate": 5.04187004416548e-06, + "loss": 0.4582, + "step": 4460 + }, + { + "epoch": 0.51, + "grad_norm": 2.0461363535972277, + "learning_rate": 5.040009193955664e-06, + "loss": 0.5171, + "step": 4461 + }, + { + "epoch": 0.51, + "grad_norm": 1.9767063965321605, + "learning_rate": 5.0381483382037825e-06, + "loss": 0.5225, + "step": 4462 + }, + { + "epoch": 0.51, + "grad_norm": 2.460285409521772, + "learning_rate": 5.0362874771676e-06, + "loss": 0.5281, + "step": 4463 + }, + { + "epoch": 0.51, + "grad_norm": 2.1709822502896583, + "learning_rate": 5.03442661110488e-06, + "loss": 0.5147, + "step": 4464 + }, + { + "epoch": 0.51, + "grad_norm": 3.8122527028068007, + "learning_rate": 5.032565740273394e-06, + "loss": 0.4825, + "step": 4465 + }, + { + "epoch": 0.51, + "grad_norm": 1.666213591095413, + "learning_rate": 5.030704864930907e-06, + "loss": 0.5902, + "step": 4466 + }, + { + "epoch": 0.51, + "grad_norm": 1.9960950637814083, + "learning_rate": 5.028843985335186e-06, + "loss": 0.4707, + "step": 4467 + }, + { + "epoch": 0.51, + "grad_norm": 1.7580022213779434, + "learning_rate": 5.0269831017440015e-06, + "loss": 0.6086, + "step": 4468 + }, + { + "epoch": 0.51, + "grad_norm": 1.81004448569933, + "learning_rate": 5.025122214415123e-06, + "loss": 0.4495, + "step": 4469 + }, + { + "epoch": 0.51, + "grad_norm": 1.9941524771176407, + "learning_rate": 5.023261323606321e-06, + "loss": 0.5239, + "step": 4470 + }, + { + "epoch": 0.51, + "grad_norm": 2.004711611200936, + "learning_rate": 5.021400429575363e-06, + "loss": 0.4855, + "step": 4471 + }, + { + "epoch": 0.51, + "grad_norm": 2.380617890273587, + "learning_rate": 5.019539532580021e-06, + "loss": 0.5987, + "step": 4472 + }, + { + "epoch": 0.51, + "grad_norm": 2.3860292600304804, + "learning_rate": 5.017678632878067e-06, + "loss": 0.5407, + "step": 4473 + }, + { + "epoch": 0.51, + "grad_norm": 1.8716455927377267, + "learning_rate": 5.015817730727272e-06, + "loss": 0.4558, + "step": 4474 + }, + { + "epoch": 0.51, + "grad_norm": 1.9528274232852458, + "learning_rate": 5.013956826385406e-06, + "loss": 0.4614, + "step": 4475 + }, + { + "epoch": 0.51, + "grad_norm": 2.2529167601616287, + "learning_rate": 5.012095920110245e-06, + "loss": 0.5074, + "step": 4476 + }, + { + "epoch": 0.51, + "grad_norm": 2.486852481886929, + "learning_rate": 5.0102350121595576e-06, + "loss": 0.5603, + "step": 4477 + }, + { + "epoch": 0.51, + "grad_norm": 3.98223603844178, + "learning_rate": 5.008374102791119e-06, + "loss": 0.4898, + "step": 4478 + }, + { + "epoch": 0.51, + "grad_norm": 4.1029245482487635, + "learning_rate": 5.006513192262702e-06, + "loss": 0.3911, + "step": 4479 + }, + { + "epoch": 0.51, + "grad_norm": 1.9095703801728892, + "learning_rate": 5.004652280832077e-06, + "loss": 0.5232, + "step": 4480 + }, + { + "epoch": 0.51, + "grad_norm": 2.1305763811877187, + "learning_rate": 5.002791368757019e-06, + "loss": 0.4502, + "step": 4481 + }, + { + "epoch": 0.51, + "grad_norm": 2.0869937591936347, + "learning_rate": 5.000930456295302e-06, + "loss": 0.5178, + "step": 4482 + }, + { + "epoch": 0.52, + "grad_norm": 1.9623864090465284, + "learning_rate": 4.9990695437046995e-06, + "loss": 0.5864, + "step": 4483 + }, + { + "epoch": 0.52, + "grad_norm": 2.149820792131405, + "learning_rate": 4.997208631242981e-06, + "loss": 0.3687, + "step": 4484 + }, + { + "epoch": 0.52, + "grad_norm": 2.207518213410718, + "learning_rate": 4.995347719167924e-06, + "loss": 0.5253, + "step": 4485 + }, + { + "epoch": 0.52, + "grad_norm": 2.147841557923247, + "learning_rate": 4.9934868077373e-06, + "loss": 0.5243, + "step": 4486 + }, + { + "epoch": 0.52, + "grad_norm": 1.7359989898638204, + "learning_rate": 4.991625897208882e-06, + "loss": 0.4673, + "step": 4487 + }, + { + "epoch": 0.52, + "grad_norm": 1.8249811572087307, + "learning_rate": 4.989764987840442e-06, + "loss": 0.4294, + "step": 4488 + }, + { + "epoch": 0.52, + "grad_norm": 3.797419057797103, + "learning_rate": 4.987904079889756e-06, + "loss": 0.5135, + "step": 4489 + }, + { + "epoch": 0.52, + "grad_norm": 2.040783142534844, + "learning_rate": 4.9860431736145936e-06, + "loss": 0.4545, + "step": 4490 + }, + { + "epoch": 0.52, + "grad_norm": 1.9480582187031563, + "learning_rate": 4.984182269272731e-06, + "loss": 0.4824, + "step": 4491 + }, + { + "epoch": 0.52, + "grad_norm": 1.9628372042110296, + "learning_rate": 4.982321367121935e-06, + "loss": 0.5102, + "step": 4492 + }, + { + "epoch": 0.52, + "grad_norm": 2.7930549930982846, + "learning_rate": 4.980460467419982e-06, + "loss": 0.6359, + "step": 4493 + }, + { + "epoch": 0.52, + "grad_norm": 2.301676637090723, + "learning_rate": 4.97859957042464e-06, + "loss": 0.4286, + "step": 4494 + }, + { + "epoch": 0.52, + "grad_norm": 2.755334521032017, + "learning_rate": 4.976738676393682e-06, + "loss": 0.5428, + "step": 4495 + }, + { + "epoch": 0.52, + "grad_norm": 1.805191950343552, + "learning_rate": 4.9748777855848786e-06, + "loss": 0.4783, + "step": 4496 + }, + { + "epoch": 0.52, + "grad_norm": 5.255393928811061, + "learning_rate": 4.973016898255999e-06, + "loss": 0.4328, + "step": 4497 + }, + { + "epoch": 0.52, + "grad_norm": 2.566190304225009, + "learning_rate": 4.971156014664816e-06, + "loss": 0.4419, + "step": 4498 + }, + { + "epoch": 0.52, + "grad_norm": 5.355518475569609, + "learning_rate": 4.969295135069096e-06, + "loss": 0.4963, + "step": 4499 + }, + { + "epoch": 0.52, + "grad_norm": 1.8207446087422836, + "learning_rate": 4.967434259726608e-06, + "loss": 0.4437, + "step": 4500 + }, + { + "epoch": 0.52, + "grad_norm": 2.4606170727671937, + "learning_rate": 4.965573388895121e-06, + "loss": 0.501, + "step": 4501 + }, + { + "epoch": 0.52, + "grad_norm": 2.0731705456450755, + "learning_rate": 4.963712522832402e-06, + "loss": 0.5974, + "step": 4502 + }, + { + "epoch": 0.52, + "grad_norm": 5.251682506217783, + "learning_rate": 4.96185166179622e-06, + "loss": 0.3679, + "step": 4503 + }, + { + "epoch": 0.52, + "grad_norm": 2.113499774786899, + "learning_rate": 4.959990806044338e-06, + "loss": 0.4769, + "step": 4504 + }, + { + "epoch": 0.52, + "grad_norm": 1.7218646069601113, + "learning_rate": 4.958129955834522e-06, + "loss": 0.5516, + "step": 4505 + }, + { + "epoch": 0.52, + "grad_norm": 3.4019887972656226, + "learning_rate": 4.956269111424537e-06, + "loss": 0.4634, + "step": 4506 + }, + { + "epoch": 0.52, + "grad_norm": 1.906250384427298, + "learning_rate": 4.954408273072148e-06, + "loss": 0.5187, + "step": 4507 + }, + { + "epoch": 0.52, + "grad_norm": 1.7805073601184829, + "learning_rate": 4.952547441035117e-06, + "loss": 0.5198, + "step": 4508 + }, + { + "epoch": 0.52, + "grad_norm": 2.480392726287118, + "learning_rate": 4.950686615571208e-06, + "loss": 0.4868, + "step": 4509 + }, + { + "epoch": 0.52, + "grad_norm": 3.0388878540692223, + "learning_rate": 4.948825796938178e-06, + "loss": 0.53, + "step": 4510 + }, + { + "epoch": 0.52, + "grad_norm": 2.0388311678181474, + "learning_rate": 4.94696498539379e-06, + "loss": 0.3548, + "step": 4511 + }, + { + "epoch": 0.52, + "grad_norm": 2.3788455393514436, + "learning_rate": 4.945104181195803e-06, + "loss": 0.4972, + "step": 4512 + }, + { + "epoch": 0.52, + "grad_norm": 2.1966462424171396, + "learning_rate": 4.9432433846019755e-06, + "loss": 0.4957, + "step": 4513 + }, + { + "epoch": 0.52, + "grad_norm": 2.0229378544039935, + "learning_rate": 4.941382595870065e-06, + "loss": 0.6074, + "step": 4514 + }, + { + "epoch": 0.52, + "grad_norm": 2.963787557226285, + "learning_rate": 4.939521815257826e-06, + "loss": 0.5408, + "step": 4515 + }, + { + "epoch": 0.52, + "grad_norm": 1.7730372046872394, + "learning_rate": 4.937661043023015e-06, + "loss": 0.5389, + "step": 4516 + }, + { + "epoch": 0.52, + "grad_norm": 2.0767720330880635, + "learning_rate": 4.935800279423386e-06, + "loss": 0.5785, + "step": 4517 + }, + { + "epoch": 0.52, + "grad_norm": 2.244031457382286, + "learning_rate": 4.933939524716692e-06, + "loss": 0.559, + "step": 4518 + }, + { + "epoch": 0.52, + "grad_norm": 2.2581850342872487, + "learning_rate": 4.932078779160682e-06, + "loss": 0.4575, + "step": 4519 + }, + { + "epoch": 0.52, + "grad_norm": 1.9445595648487, + "learning_rate": 4.930218043013109e-06, + "loss": 0.4918, + "step": 4520 + }, + { + "epoch": 0.52, + "grad_norm": 1.7615961433681504, + "learning_rate": 4.92835731653172e-06, + "loss": 0.5325, + "step": 4521 + }, + { + "epoch": 0.52, + "grad_norm": 2.268311646347819, + "learning_rate": 4.9264965999742635e-06, + "loss": 0.5027, + "step": 4522 + }, + { + "epoch": 0.52, + "grad_norm": 2.6008891495791047, + "learning_rate": 4.9246358935984854e-06, + "loss": 0.4844, + "step": 4523 + }, + { + "epoch": 0.52, + "grad_norm": 1.8598120213966736, + "learning_rate": 4.922775197662129e-06, + "loss": 0.4984, + "step": 4524 + }, + { + "epoch": 0.52, + "grad_norm": 1.9445755112998133, + "learning_rate": 4.920914512422941e-06, + "loss": 0.4825, + "step": 4525 + }, + { + "epoch": 0.52, + "grad_norm": 0.852023506608896, + "learning_rate": 4.91905383813866e-06, + "loss": 0.7103, + "step": 4526 + }, + { + "epoch": 0.52, + "grad_norm": 2.2574584322351874, + "learning_rate": 4.917193175067026e-06, + "loss": 0.5179, + "step": 4527 + }, + { + "epoch": 0.52, + "grad_norm": 2.1588110019499016, + "learning_rate": 4.915332523465781e-06, + "loss": 0.5071, + "step": 4528 + }, + { + "epoch": 0.52, + "grad_norm": 2.490698435001959, + "learning_rate": 4.91347188359266e-06, + "loss": 0.4511, + "step": 4529 + }, + { + "epoch": 0.52, + "grad_norm": 1.9384555436414646, + "learning_rate": 4.9116112557053976e-06, + "loss": 0.4885, + "step": 4530 + }, + { + "epoch": 0.52, + "grad_norm": 2.0677818202028684, + "learning_rate": 4.909750640061729e-06, + "loss": 0.4725, + "step": 4531 + }, + { + "epoch": 0.52, + "grad_norm": 1.9844513033060593, + "learning_rate": 4.907890036919386e-06, + "loss": 0.4932, + "step": 4532 + }, + { + "epoch": 0.52, + "grad_norm": 2.175757695171607, + "learning_rate": 4.9060294465360994e-06, + "loss": 0.4785, + "step": 4533 + }, + { + "epoch": 0.52, + "grad_norm": 2.3375492718169624, + "learning_rate": 4.904168869169597e-06, + "loss": 0.5052, + "step": 4534 + }, + { + "epoch": 0.52, + "grad_norm": 3.324275346869292, + "learning_rate": 4.902308305077607e-06, + "loss": 0.5379, + "step": 4535 + }, + { + "epoch": 0.52, + "grad_norm": 1.9748633248700438, + "learning_rate": 4.900447754517854e-06, + "loss": 0.5484, + "step": 4536 + }, + { + "epoch": 0.52, + "grad_norm": 1.8697892369333078, + "learning_rate": 4.898587217748059e-06, + "loss": 0.4755, + "step": 4537 + }, + { + "epoch": 0.52, + "grad_norm": 3.6475403723093165, + "learning_rate": 4.896726695025947e-06, + "loss": 0.5286, + "step": 4538 + }, + { + "epoch": 0.52, + "grad_norm": 2.284885069141002, + "learning_rate": 4.894866186609234e-06, + "loss": 0.475, + "step": 4539 + }, + { + "epoch": 0.52, + "grad_norm": 2.04512439950645, + "learning_rate": 4.893005692755639e-06, + "loss": 0.3489, + "step": 4540 + }, + { + "epoch": 0.52, + "grad_norm": 1.7069321309387255, + "learning_rate": 4.8911452137228775e-06, + "loss": 0.4078, + "step": 4541 + }, + { + "epoch": 0.52, + "grad_norm": 2.6585698781506637, + "learning_rate": 4.889284749768663e-06, + "loss": 0.5751, + "step": 4542 + }, + { + "epoch": 0.52, + "grad_norm": 2.609727567666265, + "learning_rate": 4.887424301150705e-06, + "loss": 0.5077, + "step": 4543 + }, + { + "epoch": 0.52, + "grad_norm": 2.0803523414464182, + "learning_rate": 4.885563868126713e-06, + "loss": 0.4209, + "step": 4544 + }, + { + "epoch": 0.52, + "grad_norm": 2.2504738381789946, + "learning_rate": 4.8837034509543935e-06, + "loss": 0.5612, + "step": 4545 + }, + { + "epoch": 0.52, + "grad_norm": 0.8966215837902091, + "learning_rate": 4.881843049891452e-06, + "loss": 0.7617, + "step": 4546 + }, + { + "epoch": 0.52, + "grad_norm": 1.9006807432456274, + "learning_rate": 4.879982665195591e-06, + "loss": 0.5009, + "step": 4547 + }, + { + "epoch": 0.52, + "grad_norm": 2.789301169646061, + "learning_rate": 4.878122297124512e-06, + "loss": 0.6044, + "step": 4548 + }, + { + "epoch": 0.52, + "grad_norm": 3.56379560347693, + "learning_rate": 4.876261945935909e-06, + "loss": 0.4672, + "step": 4549 + }, + { + "epoch": 0.52, + "grad_norm": 1.7069983951925045, + "learning_rate": 4.874401611887481e-06, + "loss": 0.4882, + "step": 4550 + }, + { + "epoch": 0.52, + "grad_norm": 1.906036924900144, + "learning_rate": 4.872541295236919e-06, + "loss": 0.5419, + "step": 4551 + }, + { + "epoch": 0.52, + "grad_norm": 2.066403291549752, + "learning_rate": 4.870680996241916e-06, + "loss": 0.6192, + "step": 4552 + }, + { + "epoch": 0.52, + "grad_norm": 1.8361159966480445, + "learning_rate": 4.8688207151601576e-06, + "loss": 0.547, + "step": 4553 + }, + { + "epoch": 0.52, + "grad_norm": 0.8914070994458013, + "learning_rate": 4.866960452249332e-06, + "loss": 0.6832, + "step": 4554 + }, + { + "epoch": 0.52, + "grad_norm": 2.0316659519095306, + "learning_rate": 4.865100207767121e-06, + "loss": 0.4352, + "step": 4555 + }, + { + "epoch": 0.52, + "grad_norm": 1.7734491793695362, + "learning_rate": 4.863239981971206e-06, + "loss": 0.492, + "step": 4556 + }, + { + "epoch": 0.52, + "grad_norm": 2.2940421480125455, + "learning_rate": 4.861379775119265e-06, + "loss": 0.4796, + "step": 4557 + }, + { + "epoch": 0.52, + "grad_norm": 1.7205222568546756, + "learning_rate": 4.859519587468974e-06, + "loss": 0.4669, + "step": 4558 + }, + { + "epoch": 0.52, + "grad_norm": 2.870807447629867, + "learning_rate": 4.857659419278007e-06, + "loss": 0.5142, + "step": 4559 + }, + { + "epoch": 0.52, + "grad_norm": 2.166466914311534, + "learning_rate": 4.855799270804031e-06, + "loss": 0.5122, + "step": 4560 + }, + { + "epoch": 0.52, + "grad_norm": 2.1895022758410314, + "learning_rate": 4.853939142304714e-06, + "loss": 0.5253, + "step": 4561 + }, + { + "epoch": 0.52, + "grad_norm": 1.9920238045271101, + "learning_rate": 4.852079034037722e-06, + "loss": 0.5338, + "step": 4562 + }, + { + "epoch": 0.52, + "grad_norm": 1.8749201316861372, + "learning_rate": 4.850218946260717e-06, + "loss": 0.5309, + "step": 4563 + }, + { + "epoch": 0.52, + "grad_norm": 2.209211279846637, + "learning_rate": 4.848358879231358e-06, + "loss": 0.4328, + "step": 4564 + }, + { + "epoch": 0.52, + "grad_norm": 2.131699116704348, + "learning_rate": 4.8464988332073e-06, + "loss": 0.5443, + "step": 4565 + }, + { + "epoch": 0.52, + "grad_norm": 2.368353932312741, + "learning_rate": 4.844638808446198e-06, + "loss": 0.5409, + "step": 4566 + }, + { + "epoch": 0.52, + "grad_norm": 3.124976566493968, + "learning_rate": 4.8427788052057e-06, + "loss": 0.5216, + "step": 4567 + }, + { + "epoch": 0.52, + "grad_norm": 2.044246560529135, + "learning_rate": 4.840918823743456e-06, + "loss": 0.5328, + "step": 4568 + }, + { + "epoch": 0.52, + "grad_norm": 2.315500595247249, + "learning_rate": 4.839058864317109e-06, + "loss": 0.5294, + "step": 4569 + }, + { + "epoch": 0.53, + "grad_norm": 2.1159384416707363, + "learning_rate": 4.837198927184299e-06, + "loss": 0.4915, + "step": 4570 + }, + { + "epoch": 0.53, + "grad_norm": 2.1681512116907493, + "learning_rate": 4.835339012602666e-06, + "loss": 0.4837, + "step": 4571 + }, + { + "epoch": 0.53, + "grad_norm": 1.784103190926599, + "learning_rate": 4.833479120829845e-06, + "loss": 0.4699, + "step": 4572 + }, + { + "epoch": 0.53, + "grad_norm": 3.521894210548586, + "learning_rate": 4.8316192521234665e-06, + "loss": 0.4444, + "step": 4573 + }, + { + "epoch": 0.53, + "grad_norm": 2.5842375522375027, + "learning_rate": 4.82975940674116e-06, + "loss": 0.4297, + "step": 4574 + }, + { + "epoch": 0.53, + "grad_norm": 2.232057758251001, + "learning_rate": 4.8278995849405515e-06, + "loss": 0.5342, + "step": 4575 + }, + { + "epoch": 0.53, + "grad_norm": 2.0055726262517743, + "learning_rate": 4.826039786979264e-06, + "loss": 0.4726, + "step": 4576 + }, + { + "epoch": 0.53, + "grad_norm": 2.6532190386300165, + "learning_rate": 4.824180013114914e-06, + "loss": 0.498, + "step": 4577 + }, + { + "epoch": 0.53, + "grad_norm": 2.3489841533554303, + "learning_rate": 4.822320263605118e-06, + "loss": 0.5791, + "step": 4578 + }, + { + "epoch": 0.53, + "grad_norm": 3.2678441188617304, + "learning_rate": 4.820460538707487e-06, + "loss": 0.5331, + "step": 4579 + }, + { + "epoch": 0.53, + "grad_norm": 2.7052086344595665, + "learning_rate": 4.818600838679633e-06, + "loss": 0.4619, + "step": 4580 + }, + { + "epoch": 0.53, + "grad_norm": 2.093347669292747, + "learning_rate": 4.816741163779158e-06, + "loss": 0.5093, + "step": 4581 + }, + { + "epoch": 0.53, + "grad_norm": 1.6861824230104265, + "learning_rate": 4.814881514263666e-06, + "loss": 0.5033, + "step": 4582 + }, + { + "epoch": 0.53, + "grad_norm": 2.067725485917308, + "learning_rate": 4.813021890390754e-06, + "loss": 0.4335, + "step": 4583 + }, + { + "epoch": 0.53, + "grad_norm": 2.4929502613591383, + "learning_rate": 4.811162292418018e-06, + "loss": 0.548, + "step": 4584 + }, + { + "epoch": 0.53, + "grad_norm": 1.6736121022145403, + "learning_rate": 4.809302720603049e-06, + "loss": 0.5388, + "step": 4585 + }, + { + "epoch": 0.53, + "grad_norm": 3.495052710963959, + "learning_rate": 4.807443175203432e-06, + "loss": 0.479, + "step": 4586 + }, + { + "epoch": 0.53, + "grad_norm": 2.014220067342264, + "learning_rate": 4.805583656476755e-06, + "loss": 0.4779, + "step": 4587 + }, + { + "epoch": 0.53, + "grad_norm": 1.99278720798132, + "learning_rate": 4.803724164680596e-06, + "loss": 0.5119, + "step": 4588 + }, + { + "epoch": 0.53, + "grad_norm": 1.9095436280475346, + "learning_rate": 4.801864700072531e-06, + "loss": 0.521, + "step": 4589 + }, + { + "epoch": 0.53, + "grad_norm": 2.433682315782996, + "learning_rate": 4.800005262910135e-06, + "loss": 0.5601, + "step": 4590 + }, + { + "epoch": 0.53, + "grad_norm": 4.480523341064522, + "learning_rate": 4.798145853450973e-06, + "loss": 0.5557, + "step": 4591 + }, + { + "epoch": 0.53, + "grad_norm": 2.596657736476968, + "learning_rate": 4.796286471952615e-06, + "loss": 0.5714, + "step": 4592 + }, + { + "epoch": 0.53, + "grad_norm": 1.826378276861144, + "learning_rate": 4.794427118672619e-06, + "loss": 0.611, + "step": 4593 + }, + { + "epoch": 0.53, + "grad_norm": 2.088002845855787, + "learning_rate": 4.7925677938685425e-06, + "loss": 0.5757, + "step": 4594 + }, + { + "epoch": 0.53, + "grad_norm": 2.553614500472137, + "learning_rate": 4.7907084977979384e-06, + "loss": 0.4282, + "step": 4595 + }, + { + "epoch": 0.53, + "grad_norm": 1.8350135941282417, + "learning_rate": 4.7888492307183575e-06, + "loss": 0.5079, + "step": 4596 + }, + { + "epoch": 0.53, + "grad_norm": 2.1944624542917324, + "learning_rate": 4.786989992887344e-06, + "loss": 0.5607, + "step": 4597 + }, + { + "epoch": 0.53, + "grad_norm": 2.218859580914836, + "learning_rate": 4.785130784562441e-06, + "loss": 0.4979, + "step": 4598 + }, + { + "epoch": 0.53, + "grad_norm": 2.0486123522148705, + "learning_rate": 4.783271606001183e-06, + "loss": 0.4847, + "step": 4599 + }, + { + "epoch": 0.53, + "grad_norm": 2.4057492714364312, + "learning_rate": 4.7814124574611055e-06, + "loss": 0.5351, + "step": 4600 + }, + { + "epoch": 0.53, + "grad_norm": 1.8034945482329836, + "learning_rate": 4.7795533391997365e-06, + "loss": 0.5282, + "step": 4601 + }, + { + "epoch": 0.53, + "grad_norm": 1.8616072470890517, + "learning_rate": 4.777694251474601e-06, + "loss": 0.4732, + "step": 4602 + }, + { + "epoch": 0.53, + "grad_norm": 2.7249869478548603, + "learning_rate": 4.775835194543219e-06, + "loss": 0.4924, + "step": 4603 + }, + { + "epoch": 0.53, + "grad_norm": 2.925363412057559, + "learning_rate": 4.773976168663107e-06, + "loss": 0.4619, + "step": 4604 + }, + { + "epoch": 0.53, + "grad_norm": 3.769554485785394, + "learning_rate": 4.772117174091776e-06, + "loss": 0.4432, + "step": 4605 + }, + { + "epoch": 0.53, + "grad_norm": 2.3072337862537524, + "learning_rate": 4.770258211086735e-06, + "loss": 0.456, + "step": 4606 + }, + { + "epoch": 0.53, + "grad_norm": 1.7009986533953696, + "learning_rate": 4.768399279905489e-06, + "loss": 0.5658, + "step": 4607 + }, + { + "epoch": 0.53, + "grad_norm": 3.0266423701437972, + "learning_rate": 4.766540380805533e-06, + "loss": 0.4838, + "step": 4608 + }, + { + "epoch": 0.53, + "grad_norm": 2.5523636747338307, + "learning_rate": 4.7646815140443625e-06, + "loss": 0.4747, + "step": 4609 + }, + { + "epoch": 0.53, + "grad_norm": 2.544121167235555, + "learning_rate": 4.762822679879469e-06, + "loss": 0.6035, + "step": 4610 + }, + { + "epoch": 0.53, + "grad_norm": 1.677293907848666, + "learning_rate": 4.7609638785683365e-06, + "loss": 0.5121, + "step": 4611 + }, + { + "epoch": 0.53, + "grad_norm": 1.9167508235507689, + "learning_rate": 4.759105110368446e-06, + "loss": 0.4309, + "step": 4612 + }, + { + "epoch": 0.53, + "grad_norm": 2.09105511713776, + "learning_rate": 4.757246375537275e-06, + "loss": 0.5393, + "step": 4613 + }, + { + "epoch": 0.53, + "grad_norm": 1.8331872329457393, + "learning_rate": 4.755387674332292e-06, + "loss": 0.5051, + "step": 4614 + }, + { + "epoch": 0.53, + "grad_norm": 2.3796635600966725, + "learning_rate": 4.753529007010967e-06, + "loss": 0.6322, + "step": 4615 + }, + { + "epoch": 0.53, + "grad_norm": 2.5172924303763975, + "learning_rate": 4.75167037383076e-06, + "loss": 0.5493, + "step": 4616 + }, + { + "epoch": 0.53, + "grad_norm": 1.8632548689985227, + "learning_rate": 4.749811775049131e-06, + "loss": 0.4864, + "step": 4617 + }, + { + "epoch": 0.53, + "grad_norm": 2.1926319622923502, + "learning_rate": 4.7479532109235315e-06, + "loss": 0.5256, + "step": 4618 + }, + { + "epoch": 0.53, + "grad_norm": 1.9258503614453266, + "learning_rate": 4.746094681711409e-06, + "loss": 0.5179, + "step": 4619 + }, + { + "epoch": 0.53, + "grad_norm": 2.0129673265110566, + "learning_rate": 4.744236187670208e-06, + "loss": 0.4516, + "step": 4620 + }, + { + "epoch": 0.53, + "grad_norm": 0.7782092155735626, + "learning_rate": 4.742377729057366e-06, + "loss": 0.6714, + "step": 4621 + }, + { + "epoch": 0.53, + "grad_norm": 2.0880325364239374, + "learning_rate": 4.740519306130315e-06, + "loss": 0.519, + "step": 4622 + }, + { + "epoch": 0.53, + "grad_norm": 2.445679293583082, + "learning_rate": 4.738660919146486e-06, + "loss": 0.5449, + "step": 4623 + }, + { + "epoch": 0.53, + "grad_norm": 1.9905400733509697, + "learning_rate": 4.736802568363301e-06, + "loss": 0.5069, + "step": 4624 + }, + { + "epoch": 0.53, + "grad_norm": 19.65539353140791, + "learning_rate": 4.734944254038178e-06, + "loss": 0.4445, + "step": 4625 + }, + { + "epoch": 0.53, + "grad_norm": 1.844515877516487, + "learning_rate": 4.733085976428535e-06, + "loss": 0.493, + "step": 4626 + }, + { + "epoch": 0.53, + "grad_norm": 2.4405149712545886, + "learning_rate": 4.7312277357917726e-06, + "loss": 0.5364, + "step": 4627 + }, + { + "epoch": 0.53, + "grad_norm": 1.8223939692917552, + "learning_rate": 4.7293695323852975e-06, + "loss": 0.4566, + "step": 4628 + }, + { + "epoch": 0.53, + "grad_norm": 1.9088404911423766, + "learning_rate": 4.7275113664665085e-06, + "loss": 0.4394, + "step": 4629 + }, + { + "epoch": 0.53, + "grad_norm": 1.8776388690910601, + "learning_rate": 4.725653238292798e-06, + "loss": 0.5273, + "step": 4630 + }, + { + "epoch": 0.53, + "grad_norm": 3.820313119531497, + "learning_rate": 4.723795148121553e-06, + "loss": 0.5172, + "step": 4631 + }, + { + "epoch": 0.53, + "grad_norm": 2.4891082787747747, + "learning_rate": 4.721937096210156e-06, + "loss": 0.4589, + "step": 4632 + }, + { + "epoch": 0.53, + "grad_norm": 2.1170687152084224, + "learning_rate": 4.720079082815986e-06, + "loss": 0.4815, + "step": 4633 + }, + { + "epoch": 0.53, + "grad_norm": 1.7654492613414394, + "learning_rate": 4.718221108196412e-06, + "loss": 0.4667, + "step": 4634 + }, + { + "epoch": 0.53, + "grad_norm": 1.7966705051500982, + "learning_rate": 4.7163631726088e-06, + "loss": 0.4898, + "step": 4635 + }, + { + "epoch": 0.53, + "grad_norm": 1.9949175556003313, + "learning_rate": 4.714505276310513e-06, + "loss": 0.5029, + "step": 4636 + }, + { + "epoch": 0.53, + "grad_norm": 3.2710048799635425, + "learning_rate": 4.712647419558906e-06, + "loss": 0.4146, + "step": 4637 + }, + { + "epoch": 0.53, + "grad_norm": 2.1212367985990657, + "learning_rate": 4.710789602611328e-06, + "loss": 0.5525, + "step": 4638 + }, + { + "epoch": 0.53, + "grad_norm": 2.0497723041506144, + "learning_rate": 4.708931825725125e-06, + "loss": 0.5267, + "step": 4639 + }, + { + "epoch": 0.53, + "grad_norm": 0.749457257831849, + "learning_rate": 4.707074089157634e-06, + "loss": 0.6715, + "step": 4640 + }, + { + "epoch": 0.53, + "grad_norm": 2.2923524231462693, + "learning_rate": 4.70521639316619e-06, + "loss": 0.5225, + "step": 4641 + }, + { + "epoch": 0.53, + "grad_norm": 2.038078910233903, + "learning_rate": 4.703358738008121e-06, + "loss": 0.565, + "step": 4642 + }, + { + "epoch": 0.53, + "grad_norm": 3.9779677378659004, + "learning_rate": 4.701501123940749e-06, + "loss": 0.4288, + "step": 4643 + }, + { + "epoch": 0.53, + "grad_norm": 2.782312364187964, + "learning_rate": 4.699643551221388e-06, + "loss": 0.539, + "step": 4644 + }, + { + "epoch": 0.53, + "grad_norm": 2.505084358588231, + "learning_rate": 4.697786020107351e-06, + "loss": 0.566, + "step": 4645 + }, + { + "epoch": 0.53, + "grad_norm": 1.9303263617525803, + "learning_rate": 4.695928530855941e-06, + "loss": 0.4211, + "step": 4646 + }, + { + "epoch": 0.53, + "grad_norm": 1.8939690580516948, + "learning_rate": 4.694071083724459e-06, + "loss": 0.5431, + "step": 4647 + }, + { + "epoch": 0.53, + "grad_norm": 2.0880005195890625, + "learning_rate": 4.692213678970197e-06, + "loss": 0.4942, + "step": 4648 + }, + { + "epoch": 0.53, + "grad_norm": 2.1711645610942836, + "learning_rate": 4.6903563168504435e-06, + "loss": 0.5048, + "step": 4649 + }, + { + "epoch": 0.53, + "grad_norm": 1.9410545037525402, + "learning_rate": 4.688498997622478e-06, + "loss": 0.429, + "step": 4650 + }, + { + "epoch": 0.53, + "grad_norm": 2.1782522782555915, + "learning_rate": 4.686641721543579e-06, + "loss": 0.4882, + "step": 4651 + }, + { + "epoch": 0.53, + "grad_norm": 2.317031666319926, + "learning_rate": 4.684784488871014e-06, + "loss": 0.484, + "step": 4652 + }, + { + "epoch": 0.53, + "grad_norm": 2.280647371521375, + "learning_rate": 4.682927299862048e-06, + "loss": 0.5494, + "step": 4653 + }, + { + "epoch": 0.53, + "grad_norm": 2.252569718288382, + "learning_rate": 4.6810701547739364e-06, + "loss": 0.4461, + "step": 4654 + }, + { + "epoch": 0.53, + "grad_norm": 1.8265080809784993, + "learning_rate": 4.679213053863931e-06, + "loss": 0.5105, + "step": 4655 + }, + { + "epoch": 0.53, + "grad_norm": 0.8756012360122547, + "learning_rate": 4.677355997389279e-06, + "loss": 0.702, + "step": 4656 + }, + { + "epoch": 0.54, + "grad_norm": 7.7162773996819185, + "learning_rate": 4.675498985607217e-06, + "loss": 0.5218, + "step": 4657 + }, + { + "epoch": 0.54, + "grad_norm": 1.6742250478018796, + "learning_rate": 4.673642018774981e-06, + "loss": 0.3734, + "step": 4658 + }, + { + "epoch": 0.54, + "grad_norm": 1.946878799358248, + "learning_rate": 4.671785097149796e-06, + "loss": 0.4704, + "step": 4659 + }, + { + "epoch": 0.54, + "grad_norm": 2.4477476787025316, + "learning_rate": 4.669928220988883e-06, + "loss": 0.4783, + "step": 4660 + }, + { + "epoch": 0.54, + "grad_norm": 2.2596026477671045, + "learning_rate": 4.668071390549454e-06, + "loss": 0.5067, + "step": 4661 + }, + { + "epoch": 0.54, + "grad_norm": 3.5088085501143538, + "learning_rate": 4.666214606088719e-06, + "loss": 0.5386, + "step": 4662 + }, + { + "epoch": 0.54, + "grad_norm": 2.1212458770516136, + "learning_rate": 4.664357867863879e-06, + "loss": 0.5175, + "step": 4663 + }, + { + "epoch": 0.54, + "grad_norm": 1.9348256038658231, + "learning_rate": 4.6625011761321285e-06, + "loss": 0.5458, + "step": 4664 + }, + { + "epoch": 0.54, + "grad_norm": 2.226950920311374, + "learning_rate": 4.6606445311506564e-06, + "loss": 0.5777, + "step": 4665 + }, + { + "epoch": 0.54, + "grad_norm": 6.3178438138762445, + "learning_rate": 4.6587879331766465e-06, + "loss": 0.5639, + "step": 4666 + }, + { + "epoch": 0.54, + "grad_norm": 2.4620681548092764, + "learning_rate": 4.656931382467272e-06, + "loss": 0.5633, + "step": 4667 + }, + { + "epoch": 0.54, + "grad_norm": 2.8010489685360302, + "learning_rate": 4.655074879279703e-06, + "loss": 0.5737, + "step": 4668 + }, + { + "epoch": 0.54, + "grad_norm": 1.8204130338659306, + "learning_rate": 4.653218423871102e-06, + "loss": 0.5318, + "step": 4669 + }, + { + "epoch": 0.54, + "grad_norm": 2.1273226926128994, + "learning_rate": 4.651362016498625e-06, + "loss": 0.5475, + "step": 4670 + }, + { + "epoch": 0.54, + "grad_norm": 2.3686236495776534, + "learning_rate": 4.64950565741942e-06, + "loss": 0.4927, + "step": 4671 + }, + { + "epoch": 0.54, + "grad_norm": 3.8871602855563125, + "learning_rate": 4.647649346890633e-06, + "loss": 0.5704, + "step": 4672 + }, + { + "epoch": 0.54, + "grad_norm": 2.3473391920922544, + "learning_rate": 4.645793085169396e-06, + "loss": 0.5289, + "step": 4673 + }, + { + "epoch": 0.54, + "grad_norm": 2.2052704581082265, + "learning_rate": 4.64393687251284e-06, + "loss": 0.4617, + "step": 4674 + }, + { + "epoch": 0.54, + "grad_norm": 2.461846910497991, + "learning_rate": 4.6420807091780855e-06, + "loss": 0.5598, + "step": 4675 + }, + { + "epoch": 0.54, + "grad_norm": 5.026016128274953, + "learning_rate": 4.640224595422251e-06, + "loss": 0.469, + "step": 4676 + }, + { + "epoch": 0.54, + "grad_norm": 1.6290566132188997, + "learning_rate": 4.638368531502442e-06, + "loss": 0.4079, + "step": 4677 + }, + { + "epoch": 0.54, + "grad_norm": 2.398560750017565, + "learning_rate": 4.636512517675763e-06, + "loss": 0.5602, + "step": 4678 + }, + { + "epoch": 0.54, + "grad_norm": 2.2580981286671573, + "learning_rate": 4.634656554199306e-06, + "loss": 0.5387, + "step": 4679 + }, + { + "epoch": 0.54, + "grad_norm": 3.257152951874033, + "learning_rate": 4.632800641330159e-06, + "loss": 0.5057, + "step": 4680 + }, + { + "epoch": 0.54, + "grad_norm": 2.1334530992862666, + "learning_rate": 4.6309447793254046e-06, + "loss": 0.467, + "step": 4681 + }, + { + "epoch": 0.54, + "grad_norm": 2.220644466957239, + "learning_rate": 4.629088968442115e-06, + "loss": 0.6066, + "step": 4682 + }, + { + "epoch": 0.54, + "grad_norm": 2.0690202270269302, + "learning_rate": 4.627233208937357e-06, + "loss": 0.4983, + "step": 4683 + }, + { + "epoch": 0.54, + "grad_norm": 1.8511330807711688, + "learning_rate": 4.62537750106819e-06, + "loss": 0.5837, + "step": 4684 + }, + { + "epoch": 0.54, + "grad_norm": 1.9143723198718514, + "learning_rate": 4.623521845091667e-06, + "loss": 0.5368, + "step": 4685 + }, + { + "epoch": 0.54, + "grad_norm": 3.025459673779297, + "learning_rate": 4.6216662412648325e-06, + "loss": 0.5198, + "step": 4686 + }, + { + "epoch": 0.54, + "grad_norm": 1.9185796048543928, + "learning_rate": 4.619810689844726e-06, + "loss": 0.4716, + "step": 4687 + }, + { + "epoch": 0.54, + "grad_norm": 2.2758759579198133, + "learning_rate": 4.6179551910883755e-06, + "loss": 0.497, + "step": 4688 + }, + { + "epoch": 0.54, + "grad_norm": 2.0553296301338344, + "learning_rate": 4.616099745252806e-06, + "loss": 0.4888, + "step": 4689 + }, + { + "epoch": 0.54, + "grad_norm": 1.8616828258925835, + "learning_rate": 4.6142443525950345e-06, + "loss": 0.5117, + "step": 4690 + }, + { + "epoch": 0.54, + "grad_norm": 2.4366270225941506, + "learning_rate": 4.612389013372068e-06, + "loss": 0.4398, + "step": 4691 + }, + { + "epoch": 0.54, + "grad_norm": 2.63250381770413, + "learning_rate": 4.610533727840908e-06, + "loss": 0.4308, + "step": 4692 + }, + { + "epoch": 0.54, + "grad_norm": 2.3914458142513424, + "learning_rate": 4.608678496258549e-06, + "loss": 0.5294, + "step": 4693 + }, + { + "epoch": 0.54, + "grad_norm": 3.0856826128306705, + "learning_rate": 4.606823318881977e-06, + "loss": 0.5484, + "step": 4694 + }, + { + "epoch": 0.54, + "grad_norm": 2.8246530527529177, + "learning_rate": 4.6049681959681696e-06, + "loss": 0.5372, + "step": 4695 + }, + { + "epoch": 0.54, + "grad_norm": 2.1132533612609654, + "learning_rate": 4.6031131277741e-06, + "loss": 0.4991, + "step": 4696 + }, + { + "epoch": 0.54, + "grad_norm": 2.155922554314053, + "learning_rate": 4.601258114556731e-06, + "loss": 0.3537, + "step": 4697 + }, + { + "epoch": 0.54, + "grad_norm": 2.327295075291843, + "learning_rate": 4.59940315657302e-06, + "loss": 0.4251, + "step": 4698 + }, + { + "epoch": 0.54, + "grad_norm": 3.314472832819074, + "learning_rate": 4.597548254079913e-06, + "loss": 0.509, + "step": 4699 + }, + { + "epoch": 0.54, + "grad_norm": 1.976844568459057, + "learning_rate": 4.595693407334352e-06, + "loss": 0.4503, + "step": 4700 + }, + { + "epoch": 0.54, + "grad_norm": 2.855313287648476, + "learning_rate": 4.593838616593271e-06, + "loss": 0.5359, + "step": 4701 + }, + { + "epoch": 0.54, + "grad_norm": 1.9107360006439185, + "learning_rate": 4.591983882113594e-06, + "loss": 0.5337, + "step": 4702 + }, + { + "epoch": 0.54, + "grad_norm": 2.603074364611393, + "learning_rate": 4.590129204152239e-06, + "loss": 0.6203, + "step": 4703 + }, + { + "epoch": 0.54, + "grad_norm": 2.2543626661564793, + "learning_rate": 4.588274582966116e-06, + "loss": 0.4764, + "step": 4704 + }, + { + "epoch": 0.54, + "grad_norm": 1.626338927292518, + "learning_rate": 4.586420018812125e-06, + "loss": 0.5262, + "step": 4705 + }, + { + "epoch": 0.54, + "grad_norm": 2.078036401999381, + "learning_rate": 4.5845655119471625e-06, + "loss": 0.4957, + "step": 4706 + }, + { + "epoch": 0.54, + "grad_norm": 3.3990353860007314, + "learning_rate": 4.582711062628114e-06, + "loss": 0.5579, + "step": 4707 + }, + { + "epoch": 0.54, + "grad_norm": 7.773348682252783, + "learning_rate": 4.5808566711118555e-06, + "loss": 0.4949, + "step": 4708 + }, + { + "epoch": 0.54, + "grad_norm": 0.8547930519693662, + "learning_rate": 4.57900233765526e-06, + "loss": 0.7306, + "step": 4709 + }, + { + "epoch": 0.54, + "grad_norm": 3.1824132174941004, + "learning_rate": 4.577148062515186e-06, + "loss": 0.5513, + "step": 4710 + }, + { + "epoch": 0.54, + "grad_norm": 6.523596564302978, + "learning_rate": 4.575293845948492e-06, + "loss": 0.5283, + "step": 4711 + }, + { + "epoch": 0.54, + "grad_norm": 2.392486823652076, + "learning_rate": 4.573439688212018e-06, + "loss": 0.5184, + "step": 4712 + }, + { + "epoch": 0.54, + "grad_norm": 2.503718370629469, + "learning_rate": 4.571585589562606e-06, + "loss": 0.5073, + "step": 4713 + }, + { + "epoch": 0.54, + "grad_norm": 2.489185988165088, + "learning_rate": 4.569731550257083e-06, + "loss": 0.4307, + "step": 4714 + }, + { + "epoch": 0.54, + "grad_norm": 4.121486689174363, + "learning_rate": 4.567877570552272e-06, + "loss": 0.4524, + "step": 4715 + }, + { + "epoch": 0.54, + "grad_norm": 1.9717820302544427, + "learning_rate": 4.566023650704984e-06, + "loss": 0.4249, + "step": 4716 + }, + { + "epoch": 0.54, + "grad_norm": 2.6455486557773136, + "learning_rate": 4.564169790972025e-06, + "loss": 0.5129, + "step": 4717 + }, + { + "epoch": 0.54, + "grad_norm": 2.9945622416665105, + "learning_rate": 4.562315991610192e-06, + "loss": 0.547, + "step": 4718 + }, + { + "epoch": 0.54, + "grad_norm": 2.316971502978805, + "learning_rate": 4.560462252876271e-06, + "loss": 0.4596, + "step": 4719 + }, + { + "epoch": 0.54, + "grad_norm": 1.9399748608543934, + "learning_rate": 4.558608575027043e-06, + "loss": 0.4929, + "step": 4720 + }, + { + "epoch": 0.54, + "grad_norm": 2.0964507806084383, + "learning_rate": 4.5567549583192785e-06, + "loss": 0.4654, + "step": 4721 + }, + { + "epoch": 0.54, + "grad_norm": 2.2831027848607928, + "learning_rate": 4.55490140300974e-06, + "loss": 0.5135, + "step": 4722 + }, + { + "epoch": 0.54, + "grad_norm": 4.56862221232208, + "learning_rate": 4.553047909355183e-06, + "loss": 0.465, + "step": 4723 + }, + { + "epoch": 0.54, + "grad_norm": 1.9180445005728384, + "learning_rate": 4.551194477612351e-06, + "loss": 0.6142, + "step": 4724 + }, + { + "epoch": 0.54, + "grad_norm": 2.3303072432971925, + "learning_rate": 4.549341108037984e-06, + "loss": 0.4185, + "step": 4725 + }, + { + "epoch": 0.54, + "grad_norm": 2.0118920473919855, + "learning_rate": 4.547487800888808e-06, + "loss": 0.4956, + "step": 4726 + }, + { + "epoch": 0.54, + "grad_norm": 0.8972623843282614, + "learning_rate": 4.545634556421542e-06, + "loss": 0.7837, + "step": 4727 + }, + { + "epoch": 0.54, + "grad_norm": 3.0515014862573633, + "learning_rate": 4.543781374892902e-06, + "loss": 0.5154, + "step": 4728 + }, + { + "epoch": 0.54, + "grad_norm": 3.4202463000809455, + "learning_rate": 4.541928256559584e-06, + "loss": 0.5046, + "step": 4729 + }, + { + "epoch": 0.54, + "grad_norm": 2.2805409575343534, + "learning_rate": 4.5400752016782854e-06, + "loss": 0.4745, + "step": 4730 + }, + { + "epoch": 0.54, + "grad_norm": 2.7229789282570054, + "learning_rate": 4.53822221050569e-06, + "loss": 0.5919, + "step": 4731 + }, + { + "epoch": 0.54, + "grad_norm": 2.709486696510282, + "learning_rate": 4.536369283298474e-06, + "loss": 0.5141, + "step": 4732 + }, + { + "epoch": 0.54, + "grad_norm": 2.374102442701293, + "learning_rate": 4.534516420313304e-06, + "loss": 0.5483, + "step": 4733 + }, + { + "epoch": 0.54, + "grad_norm": 2.440598460649114, + "learning_rate": 4.5326636218068394e-06, + "loss": 0.4127, + "step": 4734 + }, + { + "epoch": 0.54, + "grad_norm": 2.210959331333927, + "learning_rate": 4.530810888035729e-06, + "loss": 0.453, + "step": 4735 + }, + { + "epoch": 0.54, + "grad_norm": 2.0923357188717806, + "learning_rate": 4.528958219256613e-06, + "loss": 0.432, + "step": 4736 + }, + { + "epoch": 0.54, + "grad_norm": 3.5166167372329564, + "learning_rate": 4.527105615726124e-06, + "loss": 0.4711, + "step": 4737 + }, + { + "epoch": 0.54, + "grad_norm": 2.1567139485835836, + "learning_rate": 4.525253077700882e-06, + "loss": 0.5106, + "step": 4738 + }, + { + "epoch": 0.54, + "grad_norm": 3.1984622235676508, + "learning_rate": 4.523400605437501e-06, + "loss": 0.4556, + "step": 4739 + }, + { + "epoch": 0.54, + "grad_norm": 2.065987980132794, + "learning_rate": 4.521548199192587e-06, + "loss": 0.4854, + "step": 4740 + }, + { + "epoch": 0.54, + "grad_norm": 2.1206497168478498, + "learning_rate": 4.519695859222733e-06, + "loss": 0.3951, + "step": 4741 + }, + { + "epoch": 0.54, + "grad_norm": 2.15208434725664, + "learning_rate": 4.517843585784525e-06, + "loss": 0.4565, + "step": 4742 + }, + { + "epoch": 0.54, + "grad_norm": 0.7926666702326282, + "learning_rate": 4.515991379134539e-06, + "loss": 0.7006, + "step": 4743 + }, + { + "epoch": 0.55, + "grad_norm": 1.770747463795314, + "learning_rate": 4.514139239529345e-06, + "loss": 0.5386, + "step": 4744 + }, + { + "epoch": 0.55, + "grad_norm": 1.734926305865901, + "learning_rate": 4.512287167225501e-06, + "loss": 0.5087, + "step": 4745 + }, + { + "epoch": 0.55, + "grad_norm": 1.9792870976475687, + "learning_rate": 4.510435162479551e-06, + "loss": 0.5288, + "step": 4746 + }, + { + "epoch": 0.55, + "grad_norm": 2.1211661383539457, + "learning_rate": 4.508583225548039e-06, + "loss": 0.5292, + "step": 4747 + }, + { + "epoch": 0.55, + "grad_norm": 1.8406422465367296, + "learning_rate": 4.506731356687493e-06, + "loss": 0.5636, + "step": 4748 + }, + { + "epoch": 0.55, + "grad_norm": 2.426633954291697, + "learning_rate": 4.504879556154433e-06, + "loss": 0.4705, + "step": 4749 + }, + { + "epoch": 0.55, + "grad_norm": 1.951740043780009, + "learning_rate": 4.5030278242053725e-06, + "loss": 0.5418, + "step": 4750 + }, + { + "epoch": 0.55, + "grad_norm": 2.2739318165608915, + "learning_rate": 4.501176161096811e-06, + "loss": 0.5033, + "step": 4751 + }, + { + "epoch": 0.55, + "grad_norm": 2.260449352853781, + "learning_rate": 4.499324567085242e-06, + "loss": 0.5161, + "step": 4752 + }, + { + "epoch": 0.55, + "grad_norm": 2.0218842622351834, + "learning_rate": 4.497473042427147e-06, + "loss": 0.5561, + "step": 4753 + }, + { + "epoch": 0.55, + "grad_norm": 2.335807955174927, + "learning_rate": 4.495621587378998e-06, + "loss": 0.5447, + "step": 4754 + }, + { + "epoch": 0.55, + "grad_norm": 2.2378513670920257, + "learning_rate": 4.493770202197261e-06, + "loss": 0.5161, + "step": 4755 + }, + { + "epoch": 0.55, + "grad_norm": 2.637832142203877, + "learning_rate": 4.491918887138387e-06, + "loss": 0.586, + "step": 4756 + }, + { + "epoch": 0.55, + "grad_norm": 1.6462058308685985, + "learning_rate": 4.490067642458822e-06, + "loss": 0.5833, + "step": 4757 + }, + { + "epoch": 0.55, + "grad_norm": 2.0283868772182996, + "learning_rate": 4.4882164684149975e-06, + "loss": 0.4125, + "step": 4758 + }, + { + "epoch": 0.55, + "grad_norm": 1.866767667799671, + "learning_rate": 4.4863653652633396e-06, + "loss": 0.5244, + "step": 4759 + }, + { + "epoch": 0.55, + "grad_norm": 2.178485740279844, + "learning_rate": 4.484514333260262e-06, + "loss": 0.5376, + "step": 4760 + }, + { + "epoch": 0.55, + "grad_norm": 2.1838489489424098, + "learning_rate": 4.48266337266217e-06, + "loss": 0.5047, + "step": 4761 + }, + { + "epoch": 0.55, + "grad_norm": 2.2007938235369613, + "learning_rate": 4.480812483725458e-06, + "loss": 0.5146, + "step": 4762 + }, + { + "epoch": 0.55, + "grad_norm": 2.4714701208204457, + "learning_rate": 4.478961666706512e-06, + "loss": 0.5874, + "step": 4763 + }, + { + "epoch": 0.55, + "grad_norm": 7.579022381833512, + "learning_rate": 4.477110921861704e-06, + "loss": 0.5193, + "step": 4764 + }, + { + "epoch": 0.55, + "grad_norm": 1.9462746029049274, + "learning_rate": 4.475260249447401e-06, + "loss": 0.4654, + "step": 4765 + }, + { + "epoch": 0.55, + "grad_norm": 2.3376045446806373, + "learning_rate": 4.473409649719958e-06, + "loss": 0.4594, + "step": 4766 + }, + { + "epoch": 0.55, + "grad_norm": 2.172075675309851, + "learning_rate": 4.471559122935718e-06, + "loss": 0.4822, + "step": 4767 + }, + { + "epoch": 0.55, + "grad_norm": 2.2756047681635656, + "learning_rate": 4.469708669351017e-06, + "loss": 0.5517, + "step": 4768 + }, + { + "epoch": 0.55, + "grad_norm": 2.8420447069086467, + "learning_rate": 4.467858289222179e-06, + "loss": 0.5175, + "step": 4769 + }, + { + "epoch": 0.55, + "grad_norm": 3.4735135764737275, + "learning_rate": 4.4660079828055195e-06, + "loss": 0.4699, + "step": 4770 + }, + { + "epoch": 0.55, + "grad_norm": 2.2244726760047078, + "learning_rate": 4.464157750357341e-06, + "loss": 0.434, + "step": 4771 + }, + { + "epoch": 0.55, + "grad_norm": 2.5712301190342477, + "learning_rate": 4.462307592133938e-06, + "loss": 0.4684, + "step": 4772 + }, + { + "epoch": 0.55, + "grad_norm": 2.871158163194741, + "learning_rate": 4.460457508391595e-06, + "loss": 0.5143, + "step": 4773 + }, + { + "epoch": 0.55, + "grad_norm": 2.8672411278905168, + "learning_rate": 4.458607499386584e-06, + "loss": 0.4884, + "step": 4774 + }, + { + "epoch": 0.55, + "grad_norm": 1.6948690898475525, + "learning_rate": 4.456757565375168e-06, + "loss": 0.527, + "step": 4775 + }, + { + "epoch": 0.55, + "grad_norm": 2.242172624618669, + "learning_rate": 4.4549077066136e-06, + "loss": 0.5039, + "step": 4776 + }, + { + "epoch": 0.55, + "grad_norm": 1.733169128846832, + "learning_rate": 4.4530579233581235e-06, + "loss": 0.5553, + "step": 4777 + }, + { + "epoch": 0.55, + "grad_norm": 2.122524178534542, + "learning_rate": 4.451208215864969e-06, + "loss": 0.4604, + "step": 4778 + }, + { + "epoch": 0.55, + "grad_norm": 2.1732948956991884, + "learning_rate": 4.449358584390357e-06, + "loss": 0.4868, + "step": 4779 + }, + { + "epoch": 0.55, + "grad_norm": 2.089888294581923, + "learning_rate": 4.4475090291904975e-06, + "loss": 0.5874, + "step": 4780 + }, + { + "epoch": 0.55, + "grad_norm": 2.6811187016555498, + "learning_rate": 4.445659550521591e-06, + "loss": 0.4389, + "step": 4781 + }, + { + "epoch": 0.55, + "grad_norm": 5.954889852755012, + "learning_rate": 4.443810148639828e-06, + "loss": 0.483, + "step": 4782 + }, + { + "epoch": 0.55, + "grad_norm": 1.9926765029938955, + "learning_rate": 4.441960823801389e-06, + "loss": 0.471, + "step": 4783 + }, + { + "epoch": 0.55, + "grad_norm": 2.0117538880195527, + "learning_rate": 4.440111576262438e-06, + "loss": 0.5532, + "step": 4784 + }, + { + "epoch": 0.55, + "grad_norm": 2.015242091584923, + "learning_rate": 4.438262406279134e-06, + "loss": 0.4926, + "step": 4785 + }, + { + "epoch": 0.55, + "grad_norm": 2.268574524001044, + "learning_rate": 4.436413314107625e-06, + "loss": 0.5605, + "step": 4786 + }, + { + "epoch": 0.55, + "grad_norm": 2.152610090676576, + "learning_rate": 4.434564300004046e-06, + "loss": 0.4431, + "step": 4787 + }, + { + "epoch": 0.55, + "grad_norm": 2.1769911205351855, + "learning_rate": 4.4327153642245215e-06, + "loss": 0.4975, + "step": 4788 + }, + { + "epoch": 0.55, + "grad_norm": 1.9946367912288085, + "learning_rate": 4.430866507025167e-06, + "loss": 0.4343, + "step": 4789 + }, + { + "epoch": 0.55, + "grad_norm": 1.9959800285746205, + "learning_rate": 4.429017728662086e-06, + "loss": 0.4843, + "step": 4790 + }, + { + "epoch": 0.55, + "grad_norm": 3.585732296804081, + "learning_rate": 4.42716902939137e-06, + "loss": 0.4473, + "step": 4791 + }, + { + "epoch": 0.55, + "grad_norm": 2.014648126681179, + "learning_rate": 4.425320409469101e-06, + "loss": 0.4318, + "step": 4792 + }, + { + "epoch": 0.55, + "grad_norm": 3.038648033470926, + "learning_rate": 4.423471869151348e-06, + "loss": 0.4484, + "step": 4793 + }, + { + "epoch": 0.55, + "grad_norm": 2.1480448448808165, + "learning_rate": 4.421623408694173e-06, + "loss": 0.5539, + "step": 4794 + }, + { + "epoch": 0.55, + "grad_norm": 2.778412486211518, + "learning_rate": 4.419775028353625e-06, + "loss": 0.447, + "step": 4795 + }, + { + "epoch": 0.55, + "grad_norm": 2.074946855336135, + "learning_rate": 4.417926728385738e-06, + "loss": 0.499, + "step": 4796 + }, + { + "epoch": 0.55, + "grad_norm": 2.386830510133657, + "learning_rate": 4.416078509046539e-06, + "loss": 0.4483, + "step": 4797 + }, + { + "epoch": 0.55, + "grad_norm": 2.068768441262766, + "learning_rate": 4.4142303705920446e-06, + "loss": 0.4458, + "step": 4798 + }, + { + "epoch": 0.55, + "grad_norm": 2.3155004263275814, + "learning_rate": 4.412382313278257e-06, + "loss": 0.4361, + "step": 4799 + }, + { + "epoch": 0.55, + "grad_norm": 2.4602844358986777, + "learning_rate": 4.41053433736117e-06, + "loss": 0.5155, + "step": 4800 + }, + { + "epoch": 0.55, + "grad_norm": 2.017909658723393, + "learning_rate": 4.408686443096765e-06, + "loss": 0.4961, + "step": 4801 + }, + { + "epoch": 0.55, + "grad_norm": 4.227753870429378, + "learning_rate": 4.40683863074101e-06, + "loss": 0.486, + "step": 4802 + }, + { + "epoch": 0.55, + "grad_norm": 1.720301005468441, + "learning_rate": 4.404990900549867e-06, + "loss": 0.4686, + "step": 4803 + }, + { + "epoch": 0.55, + "grad_norm": 2.877168920792929, + "learning_rate": 4.403143252779281e-06, + "loss": 0.4592, + "step": 4804 + }, + { + "epoch": 0.55, + "grad_norm": 3.702830868530728, + "learning_rate": 4.401295687685188e-06, + "loss": 0.4838, + "step": 4805 + }, + { + "epoch": 0.55, + "grad_norm": 2.2172314478687962, + "learning_rate": 4.399448205523512e-06, + "loss": 0.6032, + "step": 4806 + }, + { + "epoch": 0.55, + "grad_norm": 2.7116250228966026, + "learning_rate": 4.397600806550167e-06, + "loss": 0.4637, + "step": 4807 + }, + { + "epoch": 0.55, + "grad_norm": 0.8955644342042355, + "learning_rate": 4.395753491021053e-06, + "loss": 0.6679, + "step": 4808 + }, + { + "epoch": 0.55, + "grad_norm": 1.968458394875596, + "learning_rate": 4.3939062591920625e-06, + "loss": 0.4265, + "step": 4809 + }, + { + "epoch": 0.55, + "grad_norm": 2.3346029719916306, + "learning_rate": 4.392059111319071e-06, + "loss": 0.5031, + "step": 4810 + }, + { + "epoch": 0.55, + "grad_norm": 2.041066728892029, + "learning_rate": 4.3902120476579465e-06, + "loss": 0.4575, + "step": 4811 + }, + { + "epoch": 0.55, + "grad_norm": 2.41872507370031, + "learning_rate": 4.388365068464545e-06, + "loss": 0.5538, + "step": 4812 + }, + { + "epoch": 0.55, + "grad_norm": 2.498683019319059, + "learning_rate": 4.386518173994706e-06, + "loss": 0.504, + "step": 4813 + }, + { + "epoch": 0.55, + "grad_norm": 2.7280770441505457, + "learning_rate": 4.3846713645042646e-06, + "loss": 0.5821, + "step": 4814 + }, + { + "epoch": 0.55, + "grad_norm": 6.14104253139992, + "learning_rate": 4.382824640249038e-06, + "loss": 0.4403, + "step": 4815 + }, + { + "epoch": 0.55, + "grad_norm": 2.1232688352886293, + "learning_rate": 4.380978001484836e-06, + "loss": 0.4647, + "step": 4816 + }, + { + "epoch": 0.55, + "grad_norm": 2.1331989964252442, + "learning_rate": 4.3791314484674545e-06, + "loss": 0.6034, + "step": 4817 + }, + { + "epoch": 0.55, + "grad_norm": 1.904910913570794, + "learning_rate": 4.377284981452676e-06, + "loss": 0.4581, + "step": 4818 + }, + { + "epoch": 0.55, + "grad_norm": 2.6381239807927757, + "learning_rate": 4.375438600696274e-06, + "loss": 0.4858, + "step": 4819 + }, + { + "epoch": 0.55, + "grad_norm": 2.9451972938654336, + "learning_rate": 4.3735923064540094e-06, + "loss": 0.5459, + "step": 4820 + }, + { + "epoch": 0.55, + "grad_norm": 2.5070911339774806, + "learning_rate": 4.37174609898163e-06, + "loss": 0.5499, + "step": 4821 + }, + { + "epoch": 0.55, + "grad_norm": 2.286754654617083, + "learning_rate": 4.369899978534873e-06, + "loss": 0.397, + "step": 4822 + }, + { + "epoch": 0.55, + "grad_norm": 1.9290064858255627, + "learning_rate": 4.368053945369461e-06, + "loss": 0.5227, + "step": 4823 + }, + { + "epoch": 0.55, + "grad_norm": 1.8849522859481973, + "learning_rate": 4.366207999741107e-06, + "loss": 0.4666, + "step": 4824 + }, + { + "epoch": 0.55, + "grad_norm": 2.8134678902539303, + "learning_rate": 4.364362141905512e-06, + "loss": 0.4364, + "step": 4825 + }, + { + "epoch": 0.55, + "grad_norm": 1.9787003476236429, + "learning_rate": 4.362516372118362e-06, + "loss": 0.5305, + "step": 4826 + }, + { + "epoch": 0.55, + "grad_norm": 1.8621097400659674, + "learning_rate": 4.360670690635334e-06, + "loss": 0.4453, + "step": 4827 + }, + { + "epoch": 0.55, + "grad_norm": 2.4536719025457447, + "learning_rate": 4.358825097712091e-06, + "loss": 0.4602, + "step": 4828 + }, + { + "epoch": 0.55, + "grad_norm": 1.981973745978072, + "learning_rate": 4.356979593604286e-06, + "loss": 0.5247, + "step": 4829 + }, + { + "epoch": 0.55, + "grad_norm": 2.0018656131336146, + "learning_rate": 4.3551341785675546e-06, + "loss": 0.4626, + "step": 4830 + }, + { + "epoch": 0.56, + "grad_norm": 2.8068196070709868, + "learning_rate": 4.353288852857525e-06, + "loss": 0.4186, + "step": 4831 + }, + { + "epoch": 0.56, + "grad_norm": 0.8816722929981592, + "learning_rate": 4.351443616729812e-06, + "loss": 0.693, + "step": 4832 + }, + { + "epoch": 0.56, + "grad_norm": 0.8971294030527226, + "learning_rate": 4.349598470440016e-06, + "loss": 0.719, + "step": 4833 + }, + { + "epoch": 0.56, + "grad_norm": 2.8088481941079637, + "learning_rate": 4.3477534142437285e-06, + "loss": 0.5122, + "step": 4834 + }, + { + "epoch": 0.56, + "grad_norm": 1.8982260238549702, + "learning_rate": 4.345908448396524e-06, + "loss": 0.5487, + "step": 4835 + }, + { + "epoch": 0.56, + "grad_norm": 2.4202194473250307, + "learning_rate": 4.344063573153969e-06, + "loss": 0.5049, + "step": 4836 + }, + { + "epoch": 0.56, + "grad_norm": 2.480662746767606, + "learning_rate": 4.342218788771614e-06, + "loss": 0.5098, + "step": 4837 + }, + { + "epoch": 0.56, + "grad_norm": 1.6324844766455513, + "learning_rate": 4.340374095504997e-06, + "loss": 0.4112, + "step": 4838 + }, + { + "epoch": 0.56, + "grad_norm": 2.0567558911709756, + "learning_rate": 4.338529493609647e-06, + "loss": 0.4768, + "step": 4839 + }, + { + "epoch": 0.56, + "grad_norm": 2.926953251717545, + "learning_rate": 4.336684983341077e-06, + "loss": 0.4977, + "step": 4840 + }, + { + "epoch": 0.56, + "grad_norm": 3.0242083615998916, + "learning_rate": 4.334840564954789e-06, + "loss": 0.54, + "step": 4841 + }, + { + "epoch": 0.56, + "grad_norm": 4.430326481099695, + "learning_rate": 4.3329962387062704e-06, + "loss": 0.4716, + "step": 4842 + }, + { + "epoch": 0.56, + "grad_norm": 1.6057779262389331, + "learning_rate": 4.331152004850997e-06, + "loss": 0.3888, + "step": 4843 + }, + { + "epoch": 0.56, + "grad_norm": 2.1191325288593066, + "learning_rate": 4.329307863644432e-06, + "loss": 0.5567, + "step": 4844 + }, + { + "epoch": 0.56, + "grad_norm": 2.0972236161890083, + "learning_rate": 4.327463815342025e-06, + "loss": 0.6165, + "step": 4845 + }, + { + "epoch": 0.56, + "grad_norm": 2.04305921417872, + "learning_rate": 4.325619860199216e-06, + "loss": 0.5028, + "step": 4846 + }, + { + "epoch": 0.56, + "grad_norm": 3.064445942496861, + "learning_rate": 4.323775998471426e-06, + "loss": 0.5568, + "step": 4847 + }, + { + "epoch": 0.56, + "grad_norm": 1.9356315393285697, + "learning_rate": 4.321932230414067e-06, + "loss": 0.4771, + "step": 4848 + }, + { + "epoch": 0.56, + "grad_norm": 0.8171768066015725, + "learning_rate": 4.320088556282539e-06, + "loss": 0.6486, + "step": 4849 + }, + { + "epoch": 0.56, + "grad_norm": 2.354030690302065, + "learning_rate": 4.318244976332225e-06, + "loss": 0.5592, + "step": 4850 + }, + { + "epoch": 0.56, + "grad_norm": 1.8994471412906808, + "learning_rate": 4.316401490818499e-06, + "loss": 0.4806, + "step": 4851 + }, + { + "epoch": 0.56, + "grad_norm": 1.957241280352434, + "learning_rate": 4.3145580999967205e-06, + "loss": 0.4902, + "step": 4852 + }, + { + "epoch": 0.56, + "grad_norm": 2.109548970057433, + "learning_rate": 4.312714804122235e-06, + "loss": 0.4725, + "step": 4853 + }, + { + "epoch": 0.56, + "grad_norm": 2.78084388797687, + "learning_rate": 4.310871603450376e-06, + "loss": 0.4891, + "step": 4854 + }, + { + "epoch": 0.56, + "grad_norm": 4.24314626112775, + "learning_rate": 4.309028498236462e-06, + "loss": 0.5026, + "step": 4855 + }, + { + "epoch": 0.56, + "grad_norm": 1.9087017173502536, + "learning_rate": 4.307185488735802e-06, + "loss": 0.5205, + "step": 4856 + }, + { + "epoch": 0.56, + "grad_norm": 2.2605588684019096, + "learning_rate": 4.305342575203688e-06, + "loss": 0.4799, + "step": 4857 + }, + { + "epoch": 0.56, + "grad_norm": 2.3575709518161077, + "learning_rate": 4.3034997578954005e-06, + "loss": 0.5788, + "step": 4858 + }, + { + "epoch": 0.56, + "grad_norm": 2.379705950307545, + "learning_rate": 4.3016570370662055e-06, + "loss": 0.5721, + "step": 4859 + }, + { + "epoch": 0.56, + "grad_norm": 2.8172598934029156, + "learning_rate": 4.299814412971356e-06, + "loss": 0.4542, + "step": 4860 + }, + { + "epoch": 0.56, + "grad_norm": 1.9075095940228246, + "learning_rate": 4.2979718858660935e-06, + "loss": 0.3934, + "step": 4861 + }, + { + "epoch": 0.56, + "grad_norm": 2.0111359194402008, + "learning_rate": 4.296129456005645e-06, + "loss": 0.4677, + "step": 4862 + }, + { + "epoch": 0.56, + "grad_norm": 2.289069426300117, + "learning_rate": 4.294287123645222e-06, + "loss": 0.4561, + "step": 4863 + }, + { + "epoch": 0.56, + "grad_norm": 2.5489572400953158, + "learning_rate": 4.292444889040024e-06, + "loss": 0.6066, + "step": 4864 + }, + { + "epoch": 0.56, + "grad_norm": 2.6466856810242883, + "learning_rate": 4.290602752445237e-06, + "loss": 0.5328, + "step": 4865 + }, + { + "epoch": 0.56, + "grad_norm": 1.7841556348269962, + "learning_rate": 4.288760714116033e-06, + "loss": 0.5067, + "step": 4866 + }, + { + "epoch": 0.56, + "grad_norm": 0.8732661616526366, + "learning_rate": 4.286918774307572e-06, + "loss": 0.668, + "step": 4867 + }, + { + "epoch": 0.56, + "grad_norm": 2.1755388804886433, + "learning_rate": 4.285076933275001e-06, + "loss": 0.4578, + "step": 4868 + }, + { + "epoch": 0.56, + "grad_norm": 1.6093843805861137, + "learning_rate": 4.283235191273448e-06, + "loss": 0.4595, + "step": 4869 + }, + { + "epoch": 0.56, + "grad_norm": 1.8886196596728957, + "learning_rate": 4.281393548558031e-06, + "loss": 0.5814, + "step": 4870 + }, + { + "epoch": 0.56, + "grad_norm": 2.3165741518740535, + "learning_rate": 4.279552005383857e-06, + "loss": 0.4868, + "step": 4871 + }, + { + "epoch": 0.56, + "grad_norm": 2.243643240126361, + "learning_rate": 4.277710562006013e-06, + "loss": 0.5029, + "step": 4872 + }, + { + "epoch": 0.56, + "grad_norm": 1.8190538095569726, + "learning_rate": 4.275869218679577e-06, + "loss": 0.4742, + "step": 4873 + }, + { + "epoch": 0.56, + "grad_norm": 2.6915922524522196, + "learning_rate": 4.274027975659611e-06, + "loss": 0.4619, + "step": 4874 + }, + { + "epoch": 0.56, + "grad_norm": 2.4115100271511696, + "learning_rate": 4.272186833201166e-06, + "loss": 0.5327, + "step": 4875 + }, + { + "epoch": 0.56, + "grad_norm": 3.370105347238914, + "learning_rate": 4.270345791559272e-06, + "loss": 0.4557, + "step": 4876 + }, + { + "epoch": 0.56, + "grad_norm": 2.5742337229062744, + "learning_rate": 4.2685048509889545e-06, + "loss": 0.4928, + "step": 4877 + }, + { + "epoch": 0.56, + "grad_norm": 9.59249573430252, + "learning_rate": 4.266664011745219e-06, + "loss": 0.4208, + "step": 4878 + }, + { + "epoch": 0.56, + "grad_norm": 2.379763245748081, + "learning_rate": 4.264823274083056e-06, + "loss": 0.5031, + "step": 4879 + }, + { + "epoch": 0.56, + "grad_norm": 2.8292054580274524, + "learning_rate": 4.2629826382574485e-06, + "loss": 0.4508, + "step": 4880 + }, + { + "epoch": 0.56, + "grad_norm": 1.9603231027873116, + "learning_rate": 4.261142104523356e-06, + "loss": 0.5729, + "step": 4881 + }, + { + "epoch": 0.56, + "grad_norm": 2.475036885292569, + "learning_rate": 4.259301673135732e-06, + "loss": 0.4342, + "step": 4882 + }, + { + "epoch": 0.56, + "grad_norm": 2.874258557850412, + "learning_rate": 4.25746134434951e-06, + "loss": 0.5302, + "step": 4883 + }, + { + "epoch": 0.56, + "grad_norm": 2.3043173898004117, + "learning_rate": 4.255621118419616e-06, + "loss": 0.4859, + "step": 4884 + }, + { + "epoch": 0.56, + "grad_norm": 2.417472425449803, + "learning_rate": 4.253780995600954e-06, + "loss": 0.5839, + "step": 4885 + }, + { + "epoch": 0.56, + "grad_norm": 2.1353327321355935, + "learning_rate": 4.251940976148421e-06, + "loss": 0.4422, + "step": 4886 + }, + { + "epoch": 0.56, + "grad_norm": 2.131277589157522, + "learning_rate": 4.250101060316895e-06, + "loss": 0.4663, + "step": 4887 + }, + { + "epoch": 0.56, + "grad_norm": 2.905176800252126, + "learning_rate": 4.248261248361238e-06, + "loss": 0.4874, + "step": 4888 + }, + { + "epoch": 0.56, + "grad_norm": 2.9391429980998005, + "learning_rate": 4.246421540536304e-06, + "loss": 0.586, + "step": 4889 + }, + { + "epoch": 0.56, + "grad_norm": 2.5998071869728867, + "learning_rate": 4.244581937096927e-06, + "loss": 0.4853, + "step": 4890 + }, + { + "epoch": 0.56, + "grad_norm": 2.096220858236659, + "learning_rate": 4.2427424382979295e-06, + "loss": 0.5901, + "step": 4891 + }, + { + "epoch": 0.56, + "grad_norm": 3.191849536420475, + "learning_rate": 4.240903044394118e-06, + "loss": 0.4185, + "step": 4892 + }, + { + "epoch": 0.56, + "grad_norm": 2.0076065766678504, + "learning_rate": 4.2390637556402855e-06, + "loss": 0.5511, + "step": 4893 + }, + { + "epoch": 0.56, + "grad_norm": 1.8609089774205958, + "learning_rate": 4.2372245722912096e-06, + "loss": 0.4869, + "step": 4894 + }, + { + "epoch": 0.56, + "grad_norm": 1.8921358984423517, + "learning_rate": 4.2353854946016545e-06, + "loss": 0.5514, + "step": 4895 + }, + { + "epoch": 0.56, + "grad_norm": 1.7325404169828718, + "learning_rate": 4.233546522826368e-06, + "loss": 0.5463, + "step": 4896 + }, + { + "epoch": 0.56, + "grad_norm": 2.3341226046259593, + "learning_rate": 4.231707657220086e-06, + "loss": 0.4918, + "step": 4897 + }, + { + "epoch": 0.56, + "grad_norm": 2.4579173255106137, + "learning_rate": 4.229868898037525e-06, + "loss": 0.5456, + "step": 4898 + }, + { + "epoch": 0.56, + "grad_norm": 2.001502809621097, + "learning_rate": 4.22803024553339e-06, + "loss": 0.5462, + "step": 4899 + }, + { + "epoch": 0.56, + "grad_norm": 2.014203429415695, + "learning_rate": 4.226191699962372e-06, + "loss": 0.5881, + "step": 4900 + }, + { + "epoch": 0.56, + "grad_norm": 0.946634232605974, + "learning_rate": 4.224353261579145e-06, + "loss": 0.6741, + "step": 4901 + }, + { + "epoch": 0.56, + "grad_norm": 1.8804151613845406, + "learning_rate": 4.222514930638371e-06, + "loss": 0.5216, + "step": 4902 + }, + { + "epoch": 0.56, + "grad_norm": 2.2119805316163554, + "learning_rate": 4.220676707394693e-06, + "loss": 0.5555, + "step": 4903 + }, + { + "epoch": 0.56, + "grad_norm": 1.9959437903763895, + "learning_rate": 4.218838592102744e-06, + "loss": 0.4924, + "step": 4904 + }, + { + "epoch": 0.56, + "grad_norm": 1.7905737459839444, + "learning_rate": 4.217000585017137e-06, + "loss": 0.4355, + "step": 4905 + }, + { + "epoch": 0.56, + "grad_norm": 1.8971197224296756, + "learning_rate": 4.215162686392473e-06, + "loss": 0.4892, + "step": 4906 + }, + { + "epoch": 0.56, + "grad_norm": 1.9280343370076989, + "learning_rate": 4.2133248964833395e-06, + "loss": 0.508, + "step": 4907 + }, + { + "epoch": 0.56, + "grad_norm": 2.529311727539601, + "learning_rate": 4.2114872155443035e-06, + "loss": 0.4692, + "step": 4908 + }, + { + "epoch": 0.56, + "grad_norm": 2.7326194097675334, + "learning_rate": 4.209649643829922e-06, + "loss": 0.5205, + "step": 4909 + }, + { + "epoch": 0.56, + "grad_norm": 1.7790444012482636, + "learning_rate": 4.207812181594735e-06, + "loss": 0.4721, + "step": 4910 + }, + { + "epoch": 0.56, + "grad_norm": 2.349879977509902, + "learning_rate": 4.205974829093268e-06, + "loss": 0.5013, + "step": 4911 + }, + { + "epoch": 0.56, + "grad_norm": 1.6718859436935387, + "learning_rate": 4.204137586580029e-06, + "loss": 0.5034, + "step": 4912 + }, + { + "epoch": 0.56, + "grad_norm": 1.7281666435481362, + "learning_rate": 4.2023004543095166e-06, + "loss": 0.4819, + "step": 4913 + }, + { + "epoch": 0.56, + "grad_norm": 1.9850936560538484, + "learning_rate": 4.200463432536205e-06, + "loss": 0.4133, + "step": 4914 + }, + { + "epoch": 0.56, + "grad_norm": 2.029263562374778, + "learning_rate": 4.19862652151456e-06, + "loss": 0.5032, + "step": 4915 + }, + { + "epoch": 0.56, + "grad_norm": 2.5993494665846297, + "learning_rate": 4.19678972149903e-06, + "loss": 0.4864, + "step": 4916 + }, + { + "epoch": 0.56, + "grad_norm": 0.9460637044325306, + "learning_rate": 4.194953032744049e-06, + "loss": 0.7168, + "step": 4917 + }, + { + "epoch": 0.57, + "grad_norm": 2.196981884180265, + "learning_rate": 4.193116455504034e-06, + "loss": 0.5259, + "step": 4918 + }, + { + "epoch": 0.57, + "grad_norm": 2.1542055875153507, + "learning_rate": 4.191279990033389e-06, + "loss": 0.5454, + "step": 4919 + }, + { + "epoch": 0.57, + "grad_norm": 1.8785784731767823, + "learning_rate": 4.189443636586499e-06, + "loss": 0.5814, + "step": 4920 + }, + { + "epoch": 0.57, + "grad_norm": 2.001989828028382, + "learning_rate": 4.187607395417736e-06, + "loss": 0.5132, + "step": 4921 + }, + { + "epoch": 0.57, + "grad_norm": 1.7824156873246804, + "learning_rate": 4.185771266781456e-06, + "loss": 0.4893, + "step": 4922 + }, + { + "epoch": 0.57, + "grad_norm": 2.7748320127775874, + "learning_rate": 4.183935250931999e-06, + "loss": 0.5937, + "step": 4923 + }, + { + "epoch": 0.57, + "grad_norm": 1.8522722545890007, + "learning_rate": 4.18209934812369e-06, + "loss": 0.4281, + "step": 4924 + }, + { + "epoch": 0.57, + "grad_norm": 1.8718730208534284, + "learning_rate": 4.1802635586108376e-06, + "loss": 0.4751, + "step": 4925 + }, + { + "epoch": 0.57, + "grad_norm": 3.913702610122003, + "learning_rate": 4.178427882647735e-06, + "loss": 0.4585, + "step": 4926 + }, + { + "epoch": 0.57, + "grad_norm": 2.223324439855877, + "learning_rate": 4.17659232048866e-06, + "loss": 0.5031, + "step": 4927 + }, + { + "epoch": 0.57, + "grad_norm": 2.2384172410151866, + "learning_rate": 4.174756872387874e-06, + "loss": 0.3995, + "step": 4928 + }, + { + "epoch": 0.57, + "grad_norm": 4.065609620076174, + "learning_rate": 4.172921538599623e-06, + "loss": 0.4498, + "step": 4929 + }, + { + "epoch": 0.57, + "grad_norm": 2.8725773817573677, + "learning_rate": 4.171086319378138e-06, + "loss": 0.4205, + "step": 4930 + }, + { + "epoch": 0.57, + "grad_norm": 1.9859123812513846, + "learning_rate": 4.169251214977632e-06, + "loss": 0.5265, + "step": 4931 + }, + { + "epoch": 0.57, + "grad_norm": 1.91293454616368, + "learning_rate": 4.1674162256523035e-06, + "loss": 0.5223, + "step": 4932 + }, + { + "epoch": 0.57, + "grad_norm": 1.8585811987141765, + "learning_rate": 4.1655813516563355e-06, + "loss": 0.4696, + "step": 4933 + }, + { + "epoch": 0.57, + "grad_norm": 1.9072509953141223, + "learning_rate": 4.163746593243895e-06, + "loss": 0.4282, + "step": 4934 + }, + { + "epoch": 0.57, + "grad_norm": 2.9718203896768194, + "learning_rate": 4.16191195066913e-06, + "loss": 0.579, + "step": 4935 + }, + { + "epoch": 0.57, + "grad_norm": 2.8897660720412377, + "learning_rate": 4.160077424186177e-06, + "loss": 0.534, + "step": 4936 + }, + { + "epoch": 0.57, + "grad_norm": 1.8441561837690763, + "learning_rate": 4.158243014049153e-06, + "loss": 0.4479, + "step": 4937 + }, + { + "epoch": 0.57, + "grad_norm": 1.9273697873552018, + "learning_rate": 4.156408720512162e-06, + "loss": 0.5085, + "step": 4938 + }, + { + "epoch": 0.57, + "grad_norm": 1.9204223135130989, + "learning_rate": 4.154574543829288e-06, + "loss": 0.5095, + "step": 4939 + }, + { + "epoch": 0.57, + "grad_norm": 1.7054908952069434, + "learning_rate": 4.152740484254602e-06, + "loss": 0.5133, + "step": 4940 + }, + { + "epoch": 0.57, + "grad_norm": 1.7949749662504497, + "learning_rate": 4.150906542042157e-06, + "loss": 0.4932, + "step": 4941 + }, + { + "epoch": 0.57, + "grad_norm": 1.9945027114465863, + "learning_rate": 4.1490727174459915e-06, + "loss": 0.5135, + "step": 4942 + }, + { + "epoch": 0.57, + "grad_norm": 2.1558234792945696, + "learning_rate": 4.147239010720125e-06, + "loss": 0.4807, + "step": 4943 + }, + { + "epoch": 0.57, + "grad_norm": 2.0298266155134237, + "learning_rate": 4.145405422118564e-06, + "loss": 0.5205, + "step": 4944 + }, + { + "epoch": 0.57, + "grad_norm": 1.6293622057820634, + "learning_rate": 4.143571951895295e-06, + "loss": 0.4909, + "step": 4945 + }, + { + "epoch": 0.57, + "grad_norm": 3.0428784154186506, + "learning_rate": 4.141738600304292e-06, + "loss": 0.4807, + "step": 4946 + }, + { + "epoch": 0.57, + "grad_norm": 2.2301659759375045, + "learning_rate": 4.13990536759951e-06, + "loss": 0.4982, + "step": 4947 + }, + { + "epoch": 0.57, + "grad_norm": 2.0425291356435835, + "learning_rate": 4.138072254034887e-06, + "loss": 0.5129, + "step": 4948 + }, + { + "epoch": 0.57, + "grad_norm": 2.5506930856377283, + "learning_rate": 4.136239259864345e-06, + "loss": 0.5271, + "step": 4949 + }, + { + "epoch": 0.57, + "grad_norm": 2.551508088294162, + "learning_rate": 4.134406385341792e-06, + "loss": 0.4055, + "step": 4950 + }, + { + "epoch": 0.57, + "grad_norm": 1.7998341094436725, + "learning_rate": 4.132573630721116e-06, + "loss": 0.5095, + "step": 4951 + }, + { + "epoch": 0.57, + "grad_norm": 3.039285514104016, + "learning_rate": 4.130740996256191e-06, + "loss": 0.5509, + "step": 4952 + }, + { + "epoch": 0.57, + "grad_norm": 1.9339287197276769, + "learning_rate": 4.128908482200873e-06, + "loss": 0.3957, + "step": 4953 + }, + { + "epoch": 0.57, + "grad_norm": 2.237322072526917, + "learning_rate": 4.127076088809e-06, + "loss": 0.435, + "step": 4954 + }, + { + "epoch": 0.57, + "grad_norm": 5.051578812904449, + "learning_rate": 4.125243816334396e-06, + "loss": 0.5125, + "step": 4955 + }, + { + "epoch": 0.57, + "grad_norm": 1.7685201898480072, + "learning_rate": 4.123411665030867e-06, + "loss": 0.5, + "step": 4956 + }, + { + "epoch": 0.57, + "grad_norm": 2.491675411776145, + "learning_rate": 4.121579635152203e-06, + "loss": 0.4902, + "step": 4957 + }, + { + "epoch": 0.57, + "grad_norm": 2.1401442915416062, + "learning_rate": 4.119747726952175e-06, + "loss": 0.3505, + "step": 4958 + }, + { + "epoch": 0.57, + "grad_norm": 2.7495705293497226, + "learning_rate": 4.11791594068454e-06, + "loss": 0.4847, + "step": 4959 + }, + { + "epoch": 0.57, + "grad_norm": 1.7507459180621532, + "learning_rate": 4.116084276603036e-06, + "loss": 0.517, + "step": 4960 + }, + { + "epoch": 0.57, + "grad_norm": 0.8124322023785099, + "learning_rate": 4.1142527349613845e-06, + "loss": 0.6717, + "step": 4961 + }, + { + "epoch": 0.57, + "grad_norm": 2.4059524298386106, + "learning_rate": 4.112421316013291e-06, + "loss": 0.498, + "step": 4962 + }, + { + "epoch": 0.57, + "grad_norm": 4.183244760037441, + "learning_rate": 4.110590020012444e-06, + "loss": 0.5584, + "step": 4963 + }, + { + "epoch": 0.57, + "grad_norm": 2.1503158973515872, + "learning_rate": 4.108758847212514e-06, + "loss": 0.5638, + "step": 4964 + }, + { + "epoch": 0.57, + "grad_norm": 1.9541332682791734, + "learning_rate": 4.106927797867153e-06, + "loss": 0.4394, + "step": 4965 + }, + { + "epoch": 0.57, + "grad_norm": 0.8696930627686397, + "learning_rate": 4.105096872229999e-06, + "loss": 0.7121, + "step": 4966 + }, + { + "epoch": 0.57, + "grad_norm": 2.0155083488013403, + "learning_rate": 4.10326607055467e-06, + "loss": 0.555, + "step": 4967 + }, + { + "epoch": 0.57, + "grad_norm": 1.8507965911239483, + "learning_rate": 4.1014353930947705e-06, + "loss": 0.5369, + "step": 4968 + }, + { + "epoch": 0.57, + "grad_norm": 2.706019782779668, + "learning_rate": 4.099604840103884e-06, + "loss": 0.5027, + "step": 4969 + }, + { + "epoch": 0.57, + "grad_norm": 1.832887168198852, + "learning_rate": 4.09777441183558e-06, + "loss": 0.5627, + "step": 4970 + }, + { + "epoch": 0.57, + "grad_norm": 2.6226289689159055, + "learning_rate": 4.095944108543407e-06, + "loss": 0.5838, + "step": 4971 + }, + { + "epoch": 0.57, + "grad_norm": 2.8795268547998067, + "learning_rate": 4.094113930480902e-06, + "loss": 0.4942, + "step": 4972 + }, + { + "epoch": 0.57, + "grad_norm": 2.407372928687027, + "learning_rate": 4.092283877901576e-06, + "loss": 0.4334, + "step": 4973 + }, + { + "epoch": 0.57, + "grad_norm": 2.138581029903175, + "learning_rate": 4.090453951058931e-06, + "loss": 0.5233, + "step": 4974 + }, + { + "epoch": 0.57, + "grad_norm": 2.242861892293095, + "learning_rate": 4.088624150206446e-06, + "loss": 0.5421, + "step": 4975 + }, + { + "epoch": 0.57, + "grad_norm": 1.9590877057840173, + "learning_rate": 4.086794475597588e-06, + "loss": 0.5342, + "step": 4976 + }, + { + "epoch": 0.57, + "grad_norm": 1.9018599238666924, + "learning_rate": 4.084964927485799e-06, + "loss": 0.5087, + "step": 4977 + }, + { + "epoch": 0.57, + "grad_norm": 2.2444186842727274, + "learning_rate": 4.083135506124511e-06, + "loss": 0.4506, + "step": 4978 + }, + { + "epoch": 0.57, + "grad_norm": 1.8154180841638676, + "learning_rate": 4.081306211767133e-06, + "loss": 0.5088, + "step": 4979 + }, + { + "epoch": 0.57, + "grad_norm": 1.941825819966256, + "learning_rate": 4.07947704466706e-06, + "loss": 0.493, + "step": 4980 + }, + { + "epoch": 0.57, + "grad_norm": 2.6202522639039056, + "learning_rate": 4.07764800507767e-06, + "loss": 0.4861, + "step": 4981 + }, + { + "epoch": 0.57, + "grad_norm": 2.145563702525943, + "learning_rate": 4.075819093252315e-06, + "loss": 0.4918, + "step": 4982 + }, + { + "epoch": 0.57, + "grad_norm": 3.5851614081696654, + "learning_rate": 4.07399030944434e-06, + "loss": 0.5234, + "step": 4983 + }, + { + "epoch": 0.57, + "grad_norm": 3.1919231530265737, + "learning_rate": 4.072161653907067e-06, + "loss": 0.5362, + "step": 4984 + }, + { + "epoch": 0.57, + "grad_norm": 2.2291841144662365, + "learning_rate": 4.070333126893801e-06, + "loss": 0.5739, + "step": 4985 + }, + { + "epoch": 0.57, + "grad_norm": 2.711840118797459, + "learning_rate": 4.068504728657829e-06, + "loss": 0.4767, + "step": 4986 + }, + { + "epoch": 0.57, + "grad_norm": 2.1098487339056544, + "learning_rate": 4.06667645945242e-06, + "loss": 0.5056, + "step": 4987 + }, + { + "epoch": 0.57, + "grad_norm": 1.686138157850958, + "learning_rate": 4.064848319530827e-06, + "loss": 0.5041, + "step": 4988 + }, + { + "epoch": 0.57, + "grad_norm": 1.6779998089098929, + "learning_rate": 4.063020309146283e-06, + "loss": 0.4328, + "step": 4989 + }, + { + "epoch": 0.57, + "grad_norm": 2.7715543673413636, + "learning_rate": 4.061192428552003e-06, + "loss": 0.5868, + "step": 4990 + }, + { + "epoch": 0.57, + "grad_norm": 1.87621677957861, + "learning_rate": 4.0593646780011855e-06, + "loss": 0.4041, + "step": 4991 + }, + { + "epoch": 0.57, + "grad_norm": 1.9799952214360097, + "learning_rate": 4.057537057747011e-06, + "loss": 0.3825, + "step": 4992 + }, + { + "epoch": 0.57, + "grad_norm": 5.191962032564414, + "learning_rate": 4.055709568042639e-06, + "loss": 0.5182, + "step": 4993 + }, + { + "epoch": 0.57, + "grad_norm": 2.4056085504771247, + "learning_rate": 4.0538822091412135e-06, + "loss": 0.596, + "step": 4994 + }, + { + "epoch": 0.57, + "grad_norm": 3.1557029997561292, + "learning_rate": 4.052054981295861e-06, + "loss": 0.4718, + "step": 4995 + }, + { + "epoch": 0.57, + "grad_norm": 2.4745503699260496, + "learning_rate": 4.050227884759688e-06, + "loss": 0.4707, + "step": 4996 + }, + { + "epoch": 0.57, + "grad_norm": 1.8731063745074426, + "learning_rate": 4.0484009197857845e-06, + "loss": 0.4856, + "step": 4997 + }, + { + "epoch": 0.57, + "grad_norm": 2.3352027027644082, + "learning_rate": 4.0465740866272226e-06, + "loss": 0.4969, + "step": 4998 + }, + { + "epoch": 0.57, + "grad_norm": 2.1654976337617593, + "learning_rate": 4.044747385537051e-06, + "loss": 0.3864, + "step": 4999 + }, + { + "epoch": 0.57, + "grad_norm": 1.9222933691282333, + "learning_rate": 4.0429208167683055e-06, + "loss": 0.4549, + "step": 5000 + }, + { + "epoch": 0.57, + "grad_norm": 1.8384631792689854, + "learning_rate": 4.041094380574003e-06, + "loss": 0.4744, + "step": 5001 + }, + { + "epoch": 0.57, + "grad_norm": 1.7587667530725555, + "learning_rate": 4.039268077207142e-06, + "loss": 0.5799, + "step": 5002 + }, + { + "epoch": 0.57, + "grad_norm": 4.443458710688763, + "learning_rate": 4.037441906920698e-06, + "loss": 0.534, + "step": 5003 + }, + { + "epoch": 0.57, + "grad_norm": 1.6869593214041998, + "learning_rate": 4.035615869967636e-06, + "loss": 0.4987, + "step": 5004 + }, + { + "epoch": 0.58, + "grad_norm": 2.5245552063171752, + "learning_rate": 4.033789966600897e-06, + "loss": 0.526, + "step": 5005 + }, + { + "epoch": 0.58, + "grad_norm": 2.1928208735367622, + "learning_rate": 4.031964197073403e-06, + "loss": 0.5846, + "step": 5006 + }, + { + "epoch": 0.58, + "grad_norm": 1.9780115053875704, + "learning_rate": 4.0301385616380625e-06, + "loss": 0.4264, + "step": 5007 + }, + { + "epoch": 0.58, + "grad_norm": 2.6959489344436074, + "learning_rate": 4.02831306054776e-06, + "loss": 0.5236, + "step": 5008 + }, + { + "epoch": 0.58, + "grad_norm": 5.137343834748668, + "learning_rate": 4.026487694055363e-06, + "loss": 0.4965, + "step": 5009 + }, + { + "epoch": 0.58, + "grad_norm": 2.063379617003464, + "learning_rate": 4.024662462413723e-06, + "loss": 0.5063, + "step": 5010 + }, + { + "epoch": 0.58, + "grad_norm": 3.00919099193985, + "learning_rate": 4.022837365875669e-06, + "loss": 0.4789, + "step": 5011 + }, + { + "epoch": 0.58, + "grad_norm": 2.2186214029094358, + "learning_rate": 4.021012404694015e-06, + "loss": 0.501, + "step": 5012 + }, + { + "epoch": 0.58, + "grad_norm": 2.164592089088275, + "learning_rate": 4.019187579121554e-06, + "loss": 0.4599, + "step": 5013 + }, + { + "epoch": 0.58, + "grad_norm": 1.4935651214366972, + "learning_rate": 4.017362889411057e-06, + "loss": 0.4958, + "step": 5014 + }, + { + "epoch": 0.58, + "grad_norm": 2.8270027641724713, + "learning_rate": 4.015538335815285e-06, + "loss": 0.5822, + "step": 5015 + }, + { + "epoch": 0.58, + "grad_norm": 1.9497195203509876, + "learning_rate": 4.01371391858697e-06, + "loss": 0.5634, + "step": 5016 + }, + { + "epoch": 0.58, + "grad_norm": 1.9640386680326911, + "learning_rate": 4.011889637978834e-06, + "loss": 0.5563, + "step": 5017 + }, + { + "epoch": 0.58, + "grad_norm": 1.7665206536814326, + "learning_rate": 4.010065494243573e-06, + "loss": 0.5306, + "step": 5018 + }, + { + "epoch": 0.58, + "grad_norm": 2.119895955700955, + "learning_rate": 4.008241487633869e-06, + "loss": 0.5588, + "step": 5019 + }, + { + "epoch": 0.58, + "grad_norm": 1.8848738519774146, + "learning_rate": 4.006417618402382e-06, + "loss": 0.4787, + "step": 5020 + }, + { + "epoch": 0.58, + "grad_norm": 2.159256972604456, + "learning_rate": 4.004593886801754e-06, + "loss": 0.5115, + "step": 5021 + }, + { + "epoch": 0.58, + "grad_norm": 2.298096011661009, + "learning_rate": 4.002770293084608e-06, + "loss": 0.5072, + "step": 5022 + }, + { + "epoch": 0.58, + "grad_norm": 2.058669829855097, + "learning_rate": 4.000946837503549e-06, + "loss": 0.5114, + "step": 5023 + }, + { + "epoch": 0.58, + "grad_norm": 2.1606690275260916, + "learning_rate": 3.99912352031116e-06, + "loss": 0.51, + "step": 5024 + }, + { + "epoch": 0.58, + "grad_norm": 2.8676412652791505, + "learning_rate": 3.997300341760009e-06, + "loss": 0.4045, + "step": 5025 + }, + { + "epoch": 0.58, + "grad_norm": 1.9506671275881255, + "learning_rate": 3.99547730210264e-06, + "loss": 0.5437, + "step": 5026 + }, + { + "epoch": 0.58, + "grad_norm": 1.8974971499162363, + "learning_rate": 3.993654401591582e-06, + "loss": 0.4975, + "step": 5027 + }, + { + "epoch": 0.58, + "grad_norm": 1.5773332696081204, + "learning_rate": 3.991831640479341e-06, + "loss": 0.4072, + "step": 5028 + }, + { + "epoch": 0.58, + "grad_norm": 3.1695677655377916, + "learning_rate": 3.990009019018407e-06, + "loss": 0.4043, + "step": 5029 + }, + { + "epoch": 0.58, + "grad_norm": 2.535829142047127, + "learning_rate": 3.988186537461249e-06, + "loss": 0.4496, + "step": 5030 + }, + { + "epoch": 0.58, + "grad_norm": 2.1793178825187742, + "learning_rate": 3.986364196060317e-06, + "loss": 0.5623, + "step": 5031 + }, + { + "epoch": 0.58, + "grad_norm": 1.8726386936814023, + "learning_rate": 3.984541995068042e-06, + "loss": 0.4801, + "step": 5032 + }, + { + "epoch": 0.58, + "grad_norm": 3.187100837158078, + "learning_rate": 3.982719934736832e-06, + "loss": 0.4795, + "step": 5033 + }, + { + "epoch": 0.58, + "grad_norm": 2.053318075877081, + "learning_rate": 3.980898015319081e-06, + "loss": 0.469, + "step": 5034 + }, + { + "epoch": 0.58, + "grad_norm": 2.750010143799455, + "learning_rate": 3.97907623706716e-06, + "loss": 0.436, + "step": 5035 + }, + { + "epoch": 0.58, + "grad_norm": 3.1557758426491014, + "learning_rate": 3.9772546002334225e-06, + "loss": 0.4734, + "step": 5036 + }, + { + "epoch": 0.58, + "grad_norm": 2.3709809724999213, + "learning_rate": 3.975433105070201e-06, + "loss": 0.4687, + "step": 5037 + }, + { + "epoch": 0.58, + "grad_norm": 2.2677680908771385, + "learning_rate": 3.973611751829806e-06, + "loss": 0.4006, + "step": 5038 + }, + { + "epoch": 0.58, + "grad_norm": 2.050601411160466, + "learning_rate": 3.971790540764536e-06, + "loss": 0.4561, + "step": 5039 + }, + { + "epoch": 0.58, + "grad_norm": 2.096496428736196, + "learning_rate": 3.9699694721266606e-06, + "loss": 0.5403, + "step": 5040 + }, + { + "epoch": 0.58, + "grad_norm": 1.7487509248770061, + "learning_rate": 3.968148546168436e-06, + "loss": 0.4529, + "step": 5041 + }, + { + "epoch": 0.58, + "grad_norm": 0.8886988088483981, + "learning_rate": 3.966327763142096e-06, + "loss": 0.7171, + "step": 5042 + }, + { + "epoch": 0.58, + "grad_norm": 2.247997828679026, + "learning_rate": 3.964507123299855e-06, + "loss": 0.4773, + "step": 5043 + }, + { + "epoch": 0.58, + "grad_norm": 2.0062390587818424, + "learning_rate": 3.962686626893908e-06, + "loss": 0.3555, + "step": 5044 + }, + { + "epoch": 0.58, + "grad_norm": 1.6557572827983393, + "learning_rate": 3.96086627417643e-06, + "loss": 0.4223, + "step": 5045 + }, + { + "epoch": 0.58, + "grad_norm": 1.8692730725750724, + "learning_rate": 3.959046065399575e-06, + "loss": 0.4954, + "step": 5046 + }, + { + "epoch": 0.58, + "grad_norm": 2.4431973607265403, + "learning_rate": 3.95722600081548e-06, + "loss": 0.596, + "step": 5047 + }, + { + "epoch": 0.58, + "grad_norm": 1.9635399328161813, + "learning_rate": 3.955406080676259e-06, + "loss": 0.4882, + "step": 5048 + }, + { + "epoch": 0.58, + "grad_norm": 2.2757223190189193, + "learning_rate": 3.953586305234008e-06, + "loss": 0.5501, + "step": 5049 + }, + { + "epoch": 0.58, + "grad_norm": 2.0410664255160116, + "learning_rate": 3.951766674740798e-06, + "loss": 0.5016, + "step": 5050 + }, + { + "epoch": 0.58, + "grad_norm": 1.9191431290808638, + "learning_rate": 3.949947189448687e-06, + "loss": 0.5311, + "step": 5051 + }, + { + "epoch": 0.58, + "grad_norm": 4.910937651993045, + "learning_rate": 3.94812784960971e-06, + "loss": 0.4287, + "step": 5052 + }, + { + "epoch": 0.58, + "grad_norm": 1.9264101675990217, + "learning_rate": 3.9463086554758804e-06, + "loss": 0.52, + "step": 5053 + }, + { + "epoch": 0.58, + "grad_norm": 1.7792582711392193, + "learning_rate": 3.944489607299193e-06, + "loss": 0.4696, + "step": 5054 + }, + { + "epoch": 0.58, + "grad_norm": 4.305932155770469, + "learning_rate": 3.942670705331624e-06, + "loss": 0.5601, + "step": 5055 + }, + { + "epoch": 0.58, + "grad_norm": 2.066615509475358, + "learning_rate": 3.940851949825124e-06, + "loss": 0.4613, + "step": 5056 + }, + { + "epoch": 0.58, + "grad_norm": 2.2295540689392266, + "learning_rate": 3.939033341031631e-06, + "loss": 0.504, + "step": 5057 + }, + { + "epoch": 0.58, + "grad_norm": 1.8611785447779898, + "learning_rate": 3.937214879203054e-06, + "loss": 0.5114, + "step": 5058 + }, + { + "epoch": 0.58, + "grad_norm": 2.1944821387216105, + "learning_rate": 3.935396564591289e-06, + "loss": 0.5118, + "step": 5059 + }, + { + "epoch": 0.58, + "grad_norm": 2.294748910519658, + "learning_rate": 3.933578397448205e-06, + "loss": 0.5741, + "step": 5060 + }, + { + "epoch": 0.58, + "grad_norm": 2.052986038411296, + "learning_rate": 3.931760378025659e-06, + "loss": 0.5571, + "step": 5061 + }, + { + "epoch": 0.58, + "grad_norm": 3.215132614678951, + "learning_rate": 3.929942506575479e-06, + "loss": 0.4858, + "step": 5062 + }, + { + "epoch": 0.58, + "grad_norm": 1.6748661700030432, + "learning_rate": 3.9281247833494785e-06, + "loss": 0.3865, + "step": 5063 + }, + { + "epoch": 0.58, + "grad_norm": 2.520971946681373, + "learning_rate": 3.926307208599447e-06, + "loss": 0.4741, + "step": 5064 + }, + { + "epoch": 0.58, + "grad_norm": 1.6837228010839616, + "learning_rate": 3.924489782577157e-06, + "loss": 0.4823, + "step": 5065 + }, + { + "epoch": 0.58, + "grad_norm": 1.8876522887850178, + "learning_rate": 3.922672505534354e-06, + "loss": 0.4355, + "step": 5066 + }, + { + "epoch": 0.58, + "grad_norm": 0.9093677652514591, + "learning_rate": 3.92085537772277e-06, + "loss": 0.7104, + "step": 5067 + }, + { + "epoch": 0.58, + "grad_norm": 1.7099539109042323, + "learning_rate": 3.91903839939411e-06, + "loss": 0.4875, + "step": 5068 + }, + { + "epoch": 0.58, + "grad_norm": 2.928228535665067, + "learning_rate": 3.9172215708000655e-06, + "loss": 0.5067, + "step": 5069 + }, + { + "epoch": 0.58, + "grad_norm": 1.7177059068547176, + "learning_rate": 3.915404892192301e-06, + "loss": 0.5575, + "step": 5070 + }, + { + "epoch": 0.58, + "grad_norm": 2.239370624976635, + "learning_rate": 3.9135883638224626e-06, + "loss": 0.5398, + "step": 5071 + }, + { + "epoch": 0.58, + "grad_norm": 2.8464339163562564, + "learning_rate": 3.911771985942177e-06, + "loss": 0.5605, + "step": 5072 + }, + { + "epoch": 0.58, + "grad_norm": 2.0069333490583983, + "learning_rate": 3.909955758803045e-06, + "loss": 0.5188, + "step": 5073 + }, + { + "epoch": 0.58, + "grad_norm": 1.5477613977058782, + "learning_rate": 3.908139682656655e-06, + "loss": 0.4431, + "step": 5074 + }, + { + "epoch": 0.58, + "grad_norm": 4.257987285529607, + "learning_rate": 3.906323757754566e-06, + "loss": 0.5196, + "step": 5075 + }, + { + "epoch": 0.58, + "grad_norm": 2.884613383799756, + "learning_rate": 3.90450798434832e-06, + "loss": 0.4711, + "step": 5076 + }, + { + "epoch": 0.58, + "grad_norm": 2.818265529190095, + "learning_rate": 3.902692362689441e-06, + "loss": 0.5662, + "step": 5077 + }, + { + "epoch": 0.58, + "grad_norm": 1.9670194204144935, + "learning_rate": 3.9008768930294235e-06, + "loss": 0.5179, + "step": 5078 + }, + { + "epoch": 0.58, + "grad_norm": 2.756112943138309, + "learning_rate": 3.899061575619748e-06, + "loss": 0.4144, + "step": 5079 + }, + { + "epoch": 0.58, + "grad_norm": 1.6131250867024352, + "learning_rate": 3.897246410711874e-06, + "loss": 0.4549, + "step": 5080 + }, + { + "epoch": 0.58, + "grad_norm": 2.0914356420400995, + "learning_rate": 3.895431398557235e-06, + "loss": 0.5001, + "step": 5081 + }, + { + "epoch": 0.58, + "grad_norm": 1.8408471056055373, + "learning_rate": 3.893616539407249e-06, + "loss": 0.5751, + "step": 5082 + }, + { + "epoch": 0.58, + "grad_norm": 1.9432495162021517, + "learning_rate": 3.891801833513308e-06, + "loss": 0.5183, + "step": 5083 + }, + { + "epoch": 0.58, + "grad_norm": 1.7052886461016197, + "learning_rate": 3.889987281126784e-06, + "loss": 0.5428, + "step": 5084 + }, + { + "epoch": 0.58, + "grad_norm": 1.7989533558028759, + "learning_rate": 3.8881728824990294e-06, + "loss": 0.4032, + "step": 5085 + }, + { + "epoch": 0.58, + "grad_norm": 1.9358715042143657, + "learning_rate": 3.886358637881375e-06, + "loss": 0.5367, + "step": 5086 + }, + { + "epoch": 0.58, + "grad_norm": 2.096792394730582, + "learning_rate": 3.884544547525129e-06, + "loss": 0.5183, + "step": 5087 + }, + { + "epoch": 0.58, + "grad_norm": 2.1821248662723716, + "learning_rate": 3.882730611681579e-06, + "loss": 0.5262, + "step": 5088 + }, + { + "epoch": 0.58, + "grad_norm": 1.778842648305525, + "learning_rate": 3.88091683060199e-06, + "loss": 0.5247, + "step": 5089 + }, + { + "epoch": 0.58, + "grad_norm": 2.27926603008679, + "learning_rate": 3.87910320453761e-06, + "loss": 0.4334, + "step": 5090 + }, + { + "epoch": 0.58, + "grad_norm": 2.0475838239880115, + "learning_rate": 3.877289733739659e-06, + "loss": 0.5511, + "step": 5091 + }, + { + "epoch": 0.59, + "grad_norm": 2.095930790642349, + "learning_rate": 3.875476418459339e-06, + "loss": 0.4956, + "step": 5092 + }, + { + "epoch": 0.59, + "grad_norm": 1.9226072577876656, + "learning_rate": 3.873663258947831e-06, + "loss": 0.5413, + "step": 5093 + }, + { + "epoch": 0.59, + "grad_norm": 1.7878089645494295, + "learning_rate": 3.871850255456294e-06, + "loss": 0.5152, + "step": 5094 + }, + { + "epoch": 0.59, + "grad_norm": 2.3609283129082823, + "learning_rate": 3.8700374082358635e-06, + "loss": 0.6077, + "step": 5095 + }, + { + "epoch": 0.59, + "grad_norm": 1.8463043222337554, + "learning_rate": 3.868224717537657e-06, + "loss": 0.4491, + "step": 5096 + }, + { + "epoch": 0.59, + "grad_norm": 2.541036960538255, + "learning_rate": 3.8664121836127654e-06, + "loss": 0.4727, + "step": 5097 + }, + { + "epoch": 0.59, + "grad_norm": 2.7865942887703996, + "learning_rate": 3.864599806712261e-06, + "loss": 0.468, + "step": 5098 + }, + { + "epoch": 0.59, + "grad_norm": 1.926914243018918, + "learning_rate": 3.8627875870871975e-06, + "loss": 0.4032, + "step": 5099 + }, + { + "epoch": 0.59, + "grad_norm": 1.9592095537899248, + "learning_rate": 3.860975524988598e-06, + "loss": 0.5282, + "step": 5100 + }, + { + "epoch": 0.59, + "grad_norm": 1.7466609514430285, + "learning_rate": 3.859163620667472e-06, + "loss": 0.4771, + "step": 5101 + }, + { + "epoch": 0.59, + "grad_norm": 2.5709428090125805, + "learning_rate": 3.857351874374805e-06, + "loss": 0.4547, + "step": 5102 + }, + { + "epoch": 0.59, + "grad_norm": 1.8515140593093564, + "learning_rate": 3.8555402863615564e-06, + "loss": 0.5148, + "step": 5103 + }, + { + "epoch": 0.59, + "grad_norm": 3.211628102786537, + "learning_rate": 3.85372885687867e-06, + "loss": 0.4235, + "step": 5104 + }, + { + "epoch": 0.59, + "grad_norm": 2.556255659978394, + "learning_rate": 3.851917586177063e-06, + "loss": 0.478, + "step": 5105 + }, + { + "epoch": 0.59, + "grad_norm": 1.91936878903108, + "learning_rate": 3.850106474507635e-06, + "loss": 0.5042, + "step": 5106 + }, + { + "epoch": 0.59, + "grad_norm": 4.2007554797552755, + "learning_rate": 3.848295522121257e-06, + "loss": 0.4346, + "step": 5107 + }, + { + "epoch": 0.59, + "grad_norm": 3.1206728104682084, + "learning_rate": 3.846484729268784e-06, + "loss": 0.513, + "step": 5108 + }, + { + "epoch": 0.59, + "grad_norm": 1.7945190443348735, + "learning_rate": 3.844674096201047e-06, + "loss": 0.4905, + "step": 5109 + }, + { + "epoch": 0.59, + "grad_norm": 1.7014830190827321, + "learning_rate": 3.842863623168854e-06, + "loss": 0.4868, + "step": 5110 + }, + { + "epoch": 0.59, + "grad_norm": 1.6464996938385885, + "learning_rate": 3.841053310422992e-06, + "loss": 0.5061, + "step": 5111 + }, + { + "epoch": 0.59, + "grad_norm": 1.8463589866513597, + "learning_rate": 3.8392431582142245e-06, + "loss": 0.5389, + "step": 5112 + }, + { + "epoch": 0.59, + "grad_norm": 3.4126482446103203, + "learning_rate": 3.837433166793293e-06, + "loss": 0.6167, + "step": 5113 + }, + { + "epoch": 0.59, + "grad_norm": 3.2183611995152437, + "learning_rate": 3.835623336410919e-06, + "loss": 0.5473, + "step": 5114 + }, + { + "epoch": 0.59, + "grad_norm": 2.1560539760269832, + "learning_rate": 3.833813667317798e-06, + "loss": 0.476, + "step": 5115 + }, + { + "epoch": 0.59, + "grad_norm": 2.362392335973571, + "learning_rate": 3.832004159764608e-06, + "loss": 0.5105, + "step": 5116 + }, + { + "epoch": 0.59, + "grad_norm": 1.950244366474332, + "learning_rate": 3.830194814001997e-06, + "loss": 0.4924, + "step": 5117 + }, + { + "epoch": 0.59, + "grad_norm": 2.11651555226258, + "learning_rate": 3.828385630280598e-06, + "loss": 0.489, + "step": 5118 + }, + { + "epoch": 0.59, + "grad_norm": 2.6478023428787254, + "learning_rate": 3.826576608851018e-06, + "loss": 0.4623, + "step": 5119 + }, + { + "epoch": 0.59, + "grad_norm": 1.789802484321989, + "learning_rate": 3.824767749963844e-06, + "loss": 0.5006, + "step": 5120 + }, + { + "epoch": 0.59, + "grad_norm": 2.2723470621031883, + "learning_rate": 3.822959053869637e-06, + "loss": 0.4139, + "step": 5121 + }, + { + "epoch": 0.59, + "grad_norm": 2.31402964176604, + "learning_rate": 3.8211505208189394e-06, + "loss": 0.5263, + "step": 5122 + }, + { + "epoch": 0.59, + "grad_norm": 1.996856866642223, + "learning_rate": 3.819342151062266e-06, + "loss": 0.5374, + "step": 5123 + }, + { + "epoch": 0.59, + "grad_norm": 2.313077275191408, + "learning_rate": 3.817533944850114e-06, + "loss": 0.3802, + "step": 5124 + }, + { + "epoch": 0.59, + "grad_norm": 1.8891070476286684, + "learning_rate": 3.815725902432955e-06, + "loss": 0.4929, + "step": 5125 + }, + { + "epoch": 0.59, + "grad_norm": 2.0824919662084085, + "learning_rate": 3.8139180240612386e-06, + "loss": 0.5195, + "step": 5126 + }, + { + "epoch": 0.59, + "grad_norm": 2.0232204013723156, + "learning_rate": 3.812110309985392e-06, + "loss": 0.3904, + "step": 5127 + }, + { + "epoch": 0.59, + "grad_norm": 1.6856058055890788, + "learning_rate": 3.81030276045582e-06, + "loss": 0.3562, + "step": 5128 + }, + { + "epoch": 0.59, + "grad_norm": 2.3789232962692366, + "learning_rate": 3.808495375722903e-06, + "loss": 0.4671, + "step": 5129 + }, + { + "epoch": 0.59, + "grad_norm": 2.156310135622822, + "learning_rate": 3.8066881560370007e-06, + "loss": 0.4864, + "step": 5130 + }, + { + "epoch": 0.59, + "grad_norm": 1.7814220610235605, + "learning_rate": 3.804881101648448e-06, + "loss": 0.5279, + "step": 5131 + }, + { + "epoch": 0.59, + "grad_norm": 1.9565177372261564, + "learning_rate": 3.8030742128075584e-06, + "loss": 0.402, + "step": 5132 + }, + { + "epoch": 0.59, + "grad_norm": 3.202462548608363, + "learning_rate": 3.801267489764623e-06, + "loss": 0.5782, + "step": 5133 + }, + { + "epoch": 0.59, + "grad_norm": 1.749075307762249, + "learning_rate": 3.7994609327699055e-06, + "loss": 0.4144, + "step": 5134 + }, + { + "epoch": 0.59, + "grad_norm": 1.904459324409021, + "learning_rate": 3.7976545420736518e-06, + "loss": 0.5036, + "step": 5135 + }, + { + "epoch": 0.59, + "grad_norm": 3.5826028063315793, + "learning_rate": 3.7958483179260823e-06, + "loss": 0.4392, + "step": 5136 + }, + { + "epoch": 0.59, + "grad_norm": 4.313782545564559, + "learning_rate": 3.7940422605773957e-06, + "loss": 0.4324, + "step": 5137 + }, + { + "epoch": 0.59, + "grad_norm": 1.5977554510193337, + "learning_rate": 3.7922363702777666e-06, + "loss": 0.4965, + "step": 5138 + }, + { + "epoch": 0.59, + "grad_norm": 1.5922671810192703, + "learning_rate": 3.7904306472773458e-06, + "loss": 0.4903, + "step": 5139 + }, + { + "epoch": 0.59, + "grad_norm": 1.6634004171854582, + "learning_rate": 3.7886250918262617e-06, + "loss": 0.4924, + "step": 5140 + }, + { + "epoch": 0.59, + "grad_norm": 1.6316297122755117, + "learning_rate": 3.7868197041746207e-06, + "loss": 0.38, + "step": 5141 + }, + { + "epoch": 0.59, + "grad_norm": 2.8601554709795787, + "learning_rate": 3.785014484572505e-06, + "loss": 0.4643, + "step": 5142 + }, + { + "epoch": 0.59, + "grad_norm": 1.649172948373203, + "learning_rate": 3.783209433269972e-06, + "loss": 0.4407, + "step": 5143 + }, + { + "epoch": 0.59, + "grad_norm": 2.1188275988838012, + "learning_rate": 3.781404550517057e-06, + "loss": 0.5649, + "step": 5144 + }, + { + "epoch": 0.59, + "grad_norm": 2.40227538812724, + "learning_rate": 3.7795998365637725e-06, + "loss": 0.4825, + "step": 5145 + }, + { + "epoch": 0.59, + "grad_norm": 1.6758135495213597, + "learning_rate": 3.777795291660107e-06, + "loss": 0.4683, + "step": 5146 + }, + { + "epoch": 0.59, + "grad_norm": 2.1592485223440843, + "learning_rate": 3.775990916056027e-06, + "loss": 0.5533, + "step": 5147 + }, + { + "epoch": 0.59, + "grad_norm": 1.8489728738101892, + "learning_rate": 3.7741867100014726e-06, + "loss": 0.4293, + "step": 5148 + }, + { + "epoch": 0.59, + "grad_norm": 1.8691819261941616, + "learning_rate": 3.7723826737463633e-06, + "loss": 0.5122, + "step": 5149 + }, + { + "epoch": 0.59, + "grad_norm": 2.159434322061529, + "learning_rate": 3.770578807540595e-06, + "loss": 0.5283, + "step": 5150 + }, + { + "epoch": 0.59, + "grad_norm": 1.7440572091893427, + "learning_rate": 3.7687751116340353e-06, + "loss": 0.6008, + "step": 5151 + }, + { + "epoch": 0.59, + "grad_norm": 5.052090379786352, + "learning_rate": 3.766971586276534e-06, + "loss": 0.4302, + "step": 5152 + }, + { + "epoch": 0.59, + "grad_norm": 3.2211415403591745, + "learning_rate": 3.765168231717915e-06, + "loss": 0.6014, + "step": 5153 + }, + { + "epoch": 0.59, + "grad_norm": 2.2118118465653365, + "learning_rate": 3.7633650482079785e-06, + "loss": 0.4986, + "step": 5154 + }, + { + "epoch": 0.59, + "grad_norm": 0.8850217650313964, + "learning_rate": 3.761562035996502e-06, + "loss": 0.7244, + "step": 5155 + }, + { + "epoch": 0.59, + "grad_norm": 2.233231746747674, + "learning_rate": 3.759759195333237e-06, + "loss": 0.4751, + "step": 5156 + }, + { + "epoch": 0.59, + "grad_norm": 1.8174492063917722, + "learning_rate": 3.7579565264679136e-06, + "loss": 0.4645, + "step": 5157 + }, + { + "epoch": 0.59, + "grad_norm": 3.619666157434479, + "learning_rate": 3.756154029650237e-06, + "loss": 0.5249, + "step": 5158 + }, + { + "epoch": 0.59, + "grad_norm": 2.381248703646977, + "learning_rate": 3.7543517051298887e-06, + "loss": 0.549, + "step": 5159 + }, + { + "epoch": 0.59, + "grad_norm": 3.865945174285883, + "learning_rate": 3.7525495531565263e-06, + "loss": 0.5229, + "step": 5160 + }, + { + "epoch": 0.59, + "grad_norm": 2.5743063993222077, + "learning_rate": 3.7507475739797842e-06, + "loss": 0.5264, + "step": 5161 + }, + { + "epoch": 0.59, + "grad_norm": 2.1928359163704343, + "learning_rate": 3.748945767849273e-06, + "loss": 0.5894, + "step": 5162 + }, + { + "epoch": 0.59, + "grad_norm": 2.154537743807666, + "learning_rate": 3.747144135014576e-06, + "loss": 0.4267, + "step": 5163 + }, + { + "epoch": 0.59, + "grad_norm": 1.703758517326541, + "learning_rate": 3.7453426757252566e-06, + "loss": 0.4275, + "step": 5164 + }, + { + "epoch": 0.59, + "grad_norm": 1.7660343323383436, + "learning_rate": 3.743541390230852e-06, + "loss": 0.5147, + "step": 5165 + }, + { + "epoch": 0.59, + "grad_norm": 1.6829350294040817, + "learning_rate": 3.7417402787808766e-06, + "loss": 0.5379, + "step": 5166 + }, + { + "epoch": 0.59, + "grad_norm": 1.83894653512329, + "learning_rate": 3.739939341624821e-06, + "loss": 0.5098, + "step": 5167 + }, + { + "epoch": 0.59, + "grad_norm": 1.8435996809835238, + "learning_rate": 3.7381385790121495e-06, + "loss": 0.4637, + "step": 5168 + }, + { + "epoch": 0.59, + "grad_norm": 2.007474493390019, + "learning_rate": 3.7363379911923027e-06, + "loss": 0.5445, + "step": 5169 + }, + { + "epoch": 0.59, + "grad_norm": 2.8639610983991295, + "learning_rate": 3.7345375784146977e-06, + "loss": 0.4704, + "step": 5170 + }, + { + "epoch": 0.59, + "grad_norm": 1.8486174705138188, + "learning_rate": 3.7327373409287295e-06, + "loss": 0.4389, + "step": 5171 + }, + { + "epoch": 0.59, + "grad_norm": 1.605898149332536, + "learning_rate": 3.730937278983764e-06, + "loss": 0.4896, + "step": 5172 + }, + { + "epoch": 0.59, + "grad_norm": 2.6322092583227494, + "learning_rate": 3.7291373928291475e-06, + "loss": 0.4128, + "step": 5173 + }, + { + "epoch": 0.59, + "grad_norm": 3.2446505742004086, + "learning_rate": 3.7273376827141987e-06, + "loss": 0.5149, + "step": 5174 + }, + { + "epoch": 0.59, + "grad_norm": 2.8744305051259906, + "learning_rate": 3.7255381488882136e-06, + "loss": 0.578, + "step": 5175 + }, + { + "epoch": 0.59, + "grad_norm": 2.3539883885725783, + "learning_rate": 3.723738791600464e-06, + "loss": 0.5754, + "step": 5176 + }, + { + "epoch": 0.59, + "grad_norm": 2.508821851660808, + "learning_rate": 3.721939611100196e-06, + "loss": 0.4888, + "step": 5177 + }, + { + "epoch": 0.59, + "grad_norm": 1.7946251169536143, + "learning_rate": 3.720140607636631e-06, + "loss": 0.485, + "step": 5178 + }, + { + "epoch": 0.6, + "grad_norm": 1.6746555907304834, + "learning_rate": 3.7183417814589685e-06, + "loss": 0.5244, + "step": 5179 + }, + { + "epoch": 0.6, + "grad_norm": 0.9031905011416685, + "learning_rate": 3.7165431328163793e-06, + "loss": 0.7693, + "step": 5180 + }, + { + "epoch": 0.6, + "grad_norm": 2.265614167623082, + "learning_rate": 3.714744661958014e-06, + "loss": 0.5531, + "step": 5181 + }, + { + "epoch": 0.6, + "grad_norm": 2.382694599613371, + "learning_rate": 3.712946369132995e-06, + "loss": 0.5047, + "step": 5182 + }, + { + "epoch": 0.6, + "grad_norm": 1.7571971210916035, + "learning_rate": 3.711148254590422e-06, + "loss": 0.4817, + "step": 5183 + }, + { + "epoch": 0.6, + "grad_norm": 2.187704044167985, + "learning_rate": 3.709350318579371e-06, + "loss": 0.3888, + "step": 5184 + }, + { + "epoch": 0.6, + "grad_norm": 1.9806166480021807, + "learning_rate": 3.7075525613488887e-06, + "loss": 0.5675, + "step": 5185 + }, + { + "epoch": 0.6, + "grad_norm": 3.767025064524522, + "learning_rate": 3.705754983148002e-06, + "loss": 0.4905, + "step": 5186 + }, + { + "epoch": 0.6, + "grad_norm": 1.7851244435095854, + "learning_rate": 3.7039575842257113e-06, + "loss": 0.4357, + "step": 5187 + }, + { + "epoch": 0.6, + "grad_norm": 2.1495033431958683, + "learning_rate": 3.702160364830991e-06, + "loss": 0.4497, + "step": 5188 + }, + { + "epoch": 0.6, + "grad_norm": 1.742273447884854, + "learning_rate": 3.7003633252127925e-06, + "loss": 0.5312, + "step": 5189 + }, + { + "epoch": 0.6, + "grad_norm": 2.5965898641739713, + "learning_rate": 3.6985664656200402e-06, + "loss": 0.4468, + "step": 5190 + }, + { + "epoch": 0.6, + "grad_norm": 1.8302371158109296, + "learning_rate": 3.696769786301637e-06, + "loss": 0.5284, + "step": 5191 + }, + { + "epoch": 0.6, + "grad_norm": 1.763988215639915, + "learning_rate": 3.6949732875064558e-06, + "loss": 0.5422, + "step": 5192 + }, + { + "epoch": 0.6, + "grad_norm": 2.7126095806810517, + "learning_rate": 3.69317696948335e-06, + "loss": 0.4968, + "step": 5193 + }, + { + "epoch": 0.6, + "grad_norm": 1.819678163749761, + "learning_rate": 3.6913808324811434e-06, + "loss": 0.4307, + "step": 5194 + }, + { + "epoch": 0.6, + "grad_norm": 2.355656312401466, + "learning_rate": 3.6895848767486374e-06, + "loss": 0.4794, + "step": 5195 + }, + { + "epoch": 0.6, + "grad_norm": 1.6798276162456356, + "learning_rate": 3.6877891025346067e-06, + "loss": 0.5185, + "step": 5196 + }, + { + "epoch": 0.6, + "grad_norm": 1.9482610910358236, + "learning_rate": 3.685993510087803e-06, + "loss": 0.5332, + "step": 5197 + }, + { + "epoch": 0.6, + "grad_norm": 7.403382179608146, + "learning_rate": 3.6841980996569505e-06, + "loss": 0.4768, + "step": 5198 + }, + { + "epoch": 0.6, + "grad_norm": 2.1244089102150987, + "learning_rate": 3.6824028714907493e-06, + "loss": 0.4539, + "step": 5199 + }, + { + "epoch": 0.6, + "grad_norm": 2.7341090378199593, + "learning_rate": 3.6806078258378764e-06, + "loss": 0.4817, + "step": 5200 + }, + { + "epoch": 0.6, + "grad_norm": 1.6663169375882163, + "learning_rate": 3.678812962946977e-06, + "loss": 0.4364, + "step": 5201 + }, + { + "epoch": 0.6, + "grad_norm": 2.824657750755704, + "learning_rate": 3.677018283066677e-06, + "loss": 0.4722, + "step": 5202 + }, + { + "epoch": 0.6, + "grad_norm": 2.2855630382986285, + "learning_rate": 3.675223786445574e-06, + "loss": 0.495, + "step": 5203 + }, + { + "epoch": 0.6, + "grad_norm": 2.1673325640774666, + "learning_rate": 3.673429473332244e-06, + "loss": 0.5388, + "step": 5204 + }, + { + "epoch": 0.6, + "grad_norm": 1.9498452062317948, + "learning_rate": 3.671635343975234e-06, + "loss": 0.4719, + "step": 5205 + }, + { + "epoch": 0.6, + "grad_norm": 1.6433333714114524, + "learning_rate": 3.669841398623065e-06, + "loss": 0.5031, + "step": 5206 + }, + { + "epoch": 0.6, + "grad_norm": 2.628277434421528, + "learning_rate": 3.668047637524237e-06, + "loss": 0.3821, + "step": 5207 + }, + { + "epoch": 0.6, + "grad_norm": 1.8689921611061533, + "learning_rate": 3.6662540609272175e-06, + "loss": 0.4362, + "step": 5208 + }, + { + "epoch": 0.6, + "grad_norm": 1.9419601774502209, + "learning_rate": 3.664460669080455e-06, + "loss": 0.4884, + "step": 5209 + }, + { + "epoch": 0.6, + "grad_norm": 2.584024743564064, + "learning_rate": 3.6626674622323687e-06, + "loss": 0.5127, + "step": 5210 + }, + { + "epoch": 0.6, + "grad_norm": 2.714152450685024, + "learning_rate": 3.660874440631355e-06, + "loss": 0.4647, + "step": 5211 + }, + { + "epoch": 0.6, + "grad_norm": 2.6935775038699057, + "learning_rate": 3.6590816045257817e-06, + "loss": 0.4615, + "step": 5212 + }, + { + "epoch": 0.6, + "grad_norm": 2.1788554651812073, + "learning_rate": 3.657288954163991e-06, + "loss": 0.5538, + "step": 5213 + }, + { + "epoch": 0.6, + "grad_norm": 2.910937791644827, + "learning_rate": 3.6554964897943033e-06, + "loss": 0.4315, + "step": 5214 + }, + { + "epoch": 0.6, + "grad_norm": 2.1319578237312133, + "learning_rate": 3.653704211665008e-06, + "loss": 0.4977, + "step": 5215 + }, + { + "epoch": 0.6, + "grad_norm": 3.658594734948585, + "learning_rate": 3.651912120024372e-06, + "loss": 0.5164, + "step": 5216 + }, + { + "epoch": 0.6, + "grad_norm": 2.3427357787390792, + "learning_rate": 3.650120215120639e-06, + "loss": 0.4379, + "step": 5217 + }, + { + "epoch": 0.6, + "grad_norm": 1.9183660866668575, + "learning_rate": 3.648328497202017e-06, + "loss": 0.5347, + "step": 5218 + }, + { + "epoch": 0.6, + "grad_norm": 2.0174662668473893, + "learning_rate": 3.646536966516697e-06, + "loss": 0.504, + "step": 5219 + }, + { + "epoch": 0.6, + "grad_norm": 2.407681155344638, + "learning_rate": 3.644745623312843e-06, + "loss": 0.5008, + "step": 5220 + }, + { + "epoch": 0.6, + "grad_norm": 1.9079389311400419, + "learning_rate": 3.64295446783859e-06, + "loss": 0.5657, + "step": 5221 + }, + { + "epoch": 0.6, + "grad_norm": 2.158752138024685, + "learning_rate": 3.6411635003420494e-06, + "loss": 0.4678, + "step": 5222 + }, + { + "epoch": 0.6, + "grad_norm": 2.6414308534385946, + "learning_rate": 3.639372721071305e-06, + "loss": 0.4897, + "step": 5223 + }, + { + "epoch": 0.6, + "grad_norm": 2.2048509211163325, + "learning_rate": 3.6375821302744153e-06, + "loss": 0.4239, + "step": 5224 + }, + { + "epoch": 0.6, + "grad_norm": 2.3043329082204957, + "learning_rate": 3.6357917281994136e-06, + "loss": 0.461, + "step": 5225 + }, + { + "epoch": 0.6, + "grad_norm": 5.787822226487705, + "learning_rate": 3.634001515094305e-06, + "loss": 0.5041, + "step": 5226 + }, + { + "epoch": 0.6, + "grad_norm": 3.9267004856434613, + "learning_rate": 3.6322114912070716e-06, + "loss": 0.5209, + "step": 5227 + }, + { + "epoch": 0.6, + "grad_norm": 2.0120494929271424, + "learning_rate": 3.630421656785664e-06, + "loss": 0.404, + "step": 5228 + }, + { + "epoch": 0.6, + "grad_norm": 1.8675339089767384, + "learning_rate": 3.6286320120780113e-06, + "loss": 0.5213, + "step": 5229 + }, + { + "epoch": 0.6, + "grad_norm": 2.100735100636779, + "learning_rate": 3.6268425573320143e-06, + "loss": 0.4335, + "step": 5230 + }, + { + "epoch": 0.6, + "grad_norm": 1.8252419539835898, + "learning_rate": 3.625053292795549e-06, + "loss": 0.3742, + "step": 5231 + }, + { + "epoch": 0.6, + "grad_norm": 1.7833609339060217, + "learning_rate": 3.6232642187164634e-06, + "loss": 0.4513, + "step": 5232 + }, + { + "epoch": 0.6, + "grad_norm": 2.8100465433653956, + "learning_rate": 3.6214753353425795e-06, + "loss": 0.4088, + "step": 5233 + }, + { + "epoch": 0.6, + "grad_norm": 2.198763580969919, + "learning_rate": 3.619686642921696e-06, + "loss": 0.4848, + "step": 5234 + }, + { + "epoch": 0.6, + "grad_norm": 2.411755264791243, + "learning_rate": 3.6178981417015767e-06, + "loss": 0.4337, + "step": 5235 + }, + { + "epoch": 0.6, + "grad_norm": 1.5973978927569226, + "learning_rate": 3.6161098319299682e-06, + "loss": 0.5213, + "step": 5236 + }, + { + "epoch": 0.6, + "grad_norm": 2.3144953478109858, + "learning_rate": 3.614321713854586e-06, + "loss": 0.4532, + "step": 5237 + }, + { + "epoch": 0.6, + "grad_norm": 2.3255429386702584, + "learning_rate": 3.6125337877231192e-06, + "loss": 0.5384, + "step": 5238 + }, + { + "epoch": 0.6, + "grad_norm": 2.1792170325168225, + "learning_rate": 3.610746053783233e-06, + "loss": 0.4802, + "step": 5239 + }, + { + "epoch": 0.6, + "grad_norm": 2.4945700096072567, + "learning_rate": 3.608958512282562e-06, + "loss": 0.5652, + "step": 5240 + }, + { + "epoch": 0.6, + "grad_norm": 1.904857160561426, + "learning_rate": 3.607171163468717e-06, + "loss": 0.5058, + "step": 5241 + }, + { + "epoch": 0.6, + "grad_norm": 2.152717183942851, + "learning_rate": 3.6053840075892816e-06, + "loss": 0.4417, + "step": 5242 + }, + { + "epoch": 0.6, + "grad_norm": 1.8536091996616813, + "learning_rate": 3.6035970448918117e-06, + "loss": 0.5222, + "step": 5243 + }, + { + "epoch": 0.6, + "grad_norm": 2.784560161519531, + "learning_rate": 3.6018102756238373e-06, + "loss": 0.513, + "step": 5244 + }, + { + "epoch": 0.6, + "grad_norm": 2.081735573007617, + "learning_rate": 3.600023700032861e-06, + "loss": 0.4797, + "step": 5245 + }, + { + "epoch": 0.6, + "grad_norm": 5.559302015633876, + "learning_rate": 3.598237318366361e-06, + "loss": 0.4627, + "step": 5246 + }, + { + "epoch": 0.6, + "grad_norm": 2.438376075944337, + "learning_rate": 3.596451130871783e-06, + "loss": 0.4577, + "step": 5247 + }, + { + "epoch": 0.6, + "grad_norm": 1.9018070366763367, + "learning_rate": 3.594665137796552e-06, + "loss": 0.5193, + "step": 5248 + }, + { + "epoch": 0.6, + "grad_norm": 2.0612480510945765, + "learning_rate": 3.5928793393880623e-06, + "loss": 0.527, + "step": 5249 + }, + { + "epoch": 0.6, + "grad_norm": 1.8937544089156397, + "learning_rate": 3.5910937358936825e-06, + "loss": 0.4952, + "step": 5250 + }, + { + "epoch": 0.6, + "grad_norm": 2.478356072805228, + "learning_rate": 3.5893083275607565e-06, + "loss": 0.4835, + "step": 5251 + }, + { + "epoch": 0.6, + "grad_norm": 3.0567687550395433, + "learning_rate": 3.5875231146365954e-06, + "loss": 0.5392, + "step": 5252 + }, + { + "epoch": 0.6, + "grad_norm": 2.2251454695912534, + "learning_rate": 3.5857380973684876e-06, + "loss": 0.4423, + "step": 5253 + }, + { + "epoch": 0.6, + "grad_norm": 1.7593047091888647, + "learning_rate": 3.5839532760036933e-06, + "loss": 0.3864, + "step": 5254 + }, + { + "epoch": 0.6, + "grad_norm": 3.3671360678996054, + "learning_rate": 3.582168650789446e-06, + "loss": 0.6129, + "step": 5255 + }, + { + "epoch": 0.6, + "grad_norm": 2.289354941784124, + "learning_rate": 3.580384221972951e-06, + "loss": 0.5094, + "step": 5256 + }, + { + "epoch": 0.6, + "grad_norm": 1.816460514694996, + "learning_rate": 3.5785999898013887e-06, + "loss": 0.5085, + "step": 5257 + }, + { + "epoch": 0.6, + "grad_norm": 0.9020533848780438, + "learning_rate": 3.576815954521909e-06, + "loss": 0.7224, + "step": 5258 + }, + { + "epoch": 0.6, + "grad_norm": 2.380320407323962, + "learning_rate": 3.575032116381637e-06, + "loss": 0.5264, + "step": 5259 + }, + { + "epoch": 0.6, + "grad_norm": 1.683611385170924, + "learning_rate": 3.5732484756276693e-06, + "loss": 0.4612, + "step": 5260 + }, + { + "epoch": 0.6, + "grad_norm": 1.8986384435652586, + "learning_rate": 3.5714650325070752e-06, + "loss": 0.5239, + "step": 5261 + }, + { + "epoch": 0.6, + "grad_norm": 2.6789333148109358, + "learning_rate": 3.5696817872668984e-06, + "loss": 0.4833, + "step": 5262 + }, + { + "epoch": 0.6, + "grad_norm": 1.8765901497839967, + "learning_rate": 3.5678987401541522e-06, + "loss": 0.5072, + "step": 5263 + }, + { + "epoch": 0.6, + "grad_norm": 1.7807824253806515, + "learning_rate": 3.5661158914158243e-06, + "loss": 0.5973, + "step": 5264 + }, + { + "epoch": 0.6, + "grad_norm": 2.577665132259173, + "learning_rate": 3.5643332412988753e-06, + "loss": 0.5137, + "step": 5265 + }, + { + "epoch": 0.61, + "grad_norm": 0.8805721493386138, + "learning_rate": 3.562550790050237e-06, + "loss": 0.7122, + "step": 5266 + }, + { + "epoch": 0.61, + "grad_norm": 1.9262208286072058, + "learning_rate": 3.5607685379168145e-06, + "loss": 0.5226, + "step": 5267 + }, + { + "epoch": 0.61, + "grad_norm": 1.9007790602720147, + "learning_rate": 3.558986485145485e-06, + "loss": 0.5135, + "step": 5268 + }, + { + "epoch": 0.61, + "grad_norm": 1.859940978902418, + "learning_rate": 3.5572046319830973e-06, + "loss": 0.5489, + "step": 5269 + }, + { + "epoch": 0.61, + "grad_norm": 3.1312496578856273, + "learning_rate": 3.555422978676474e-06, + "loss": 0.345, + "step": 5270 + }, + { + "epoch": 0.61, + "grad_norm": 2.2400433393134884, + "learning_rate": 3.5536415254724092e-06, + "loss": 0.5206, + "step": 5271 + }, + { + "epoch": 0.61, + "grad_norm": 2.442229092544587, + "learning_rate": 3.551860272617671e-06, + "loss": 0.5175, + "step": 5272 + }, + { + "epoch": 0.61, + "grad_norm": 1.756505030843506, + "learning_rate": 3.5500792203589964e-06, + "loss": 0.4504, + "step": 5273 + }, + { + "epoch": 0.61, + "grad_norm": 3.1227232966055385, + "learning_rate": 3.548298368943097e-06, + "loss": 0.5371, + "step": 5274 + }, + { + "epoch": 0.61, + "grad_norm": 1.800279877107523, + "learning_rate": 3.5465177186166556e-06, + "loss": 0.4504, + "step": 5275 + }, + { + "epoch": 0.61, + "grad_norm": 1.8861029929618702, + "learning_rate": 3.544737269626328e-06, + "loss": 0.4788, + "step": 5276 + }, + { + "epoch": 0.61, + "grad_norm": 3.806270775090061, + "learning_rate": 3.5429570222187424e-06, + "loss": 0.5327, + "step": 5277 + }, + { + "epoch": 0.61, + "grad_norm": 1.851348771162477, + "learning_rate": 3.5411769766404975e-06, + "loss": 0.5723, + "step": 5278 + }, + { + "epoch": 0.61, + "grad_norm": 2.9407529794864073, + "learning_rate": 3.539397133138165e-06, + "loss": 0.53, + "step": 5279 + }, + { + "epoch": 0.61, + "grad_norm": 2.255678561509405, + "learning_rate": 3.5376174919582884e-06, + "loss": 0.4523, + "step": 5280 + }, + { + "epoch": 0.61, + "grad_norm": 2.109333858003371, + "learning_rate": 3.5358380533473834e-06, + "loss": 0.6049, + "step": 5281 + }, + { + "epoch": 0.61, + "grad_norm": 1.9543463872371662, + "learning_rate": 3.5340588175519387e-06, + "loss": 0.5767, + "step": 5282 + }, + { + "epoch": 0.61, + "grad_norm": 1.954789910891778, + "learning_rate": 3.532279784818412e-06, + "loss": 0.5222, + "step": 5283 + }, + { + "epoch": 0.61, + "grad_norm": 4.217246768798719, + "learning_rate": 3.530500955393235e-06, + "loss": 0.4705, + "step": 5284 + }, + { + "epoch": 0.61, + "grad_norm": 2.162742143026045, + "learning_rate": 3.5287223295228135e-06, + "loss": 0.496, + "step": 5285 + }, + { + "epoch": 0.61, + "grad_norm": 2.1952518784120705, + "learning_rate": 3.526943907453518e-06, + "loss": 0.5425, + "step": 5286 + }, + { + "epoch": 0.61, + "grad_norm": 3.0311327233338257, + "learning_rate": 3.5251656894316976e-06, + "loss": 0.5178, + "step": 5287 + }, + { + "epoch": 0.61, + "grad_norm": 2.1793184895728213, + "learning_rate": 3.52338767570367e-06, + "loss": 0.4764, + "step": 5288 + }, + { + "epoch": 0.61, + "grad_norm": 2.5171265720321623, + "learning_rate": 3.521609866515726e-06, + "loss": 0.4561, + "step": 5289 + }, + { + "epoch": 0.61, + "grad_norm": 2.0214141996089565, + "learning_rate": 3.5198322621141268e-06, + "loss": 0.4769, + "step": 5290 + }, + { + "epoch": 0.61, + "grad_norm": 3.467404025753706, + "learning_rate": 3.518054862745107e-06, + "loss": 0.5324, + "step": 5291 + }, + { + "epoch": 0.61, + "grad_norm": 1.9587843756755257, + "learning_rate": 3.5162776686548717e-06, + "loss": 0.5348, + "step": 5292 + }, + { + "epoch": 0.61, + "grad_norm": 9.727065902410251, + "learning_rate": 3.5145006800895952e-06, + "loss": 0.4488, + "step": 5293 + }, + { + "epoch": 0.61, + "grad_norm": 0.7731133394435948, + "learning_rate": 3.512723897295428e-06, + "loss": 0.6526, + "step": 5294 + }, + { + "epoch": 0.61, + "grad_norm": 2.0958243083660206, + "learning_rate": 3.5109473205184886e-06, + "loss": 0.6023, + "step": 5295 + }, + { + "epoch": 0.61, + "grad_norm": 2.5940262490219714, + "learning_rate": 3.509170950004869e-06, + "loss": 0.4935, + "step": 5296 + }, + { + "epoch": 0.61, + "grad_norm": 4.631766107250355, + "learning_rate": 3.5073947860006298e-06, + "loss": 0.4375, + "step": 5297 + }, + { + "epoch": 0.61, + "grad_norm": 1.8206644892940258, + "learning_rate": 3.5056188287518074e-06, + "loss": 0.4782, + "step": 5298 + }, + { + "epoch": 0.61, + "grad_norm": 2.0115250956260056, + "learning_rate": 3.503843078504405e-06, + "loss": 0.5973, + "step": 5299 + }, + { + "epoch": 0.61, + "grad_norm": 2.0210062439668453, + "learning_rate": 3.5020675355044013e-06, + "loss": 0.5092, + "step": 5300 + }, + { + "epoch": 0.61, + "grad_norm": 2.551098705009813, + "learning_rate": 3.500292199997743e-06, + "loss": 0.4806, + "step": 5301 + }, + { + "epoch": 0.61, + "grad_norm": 1.8530320403119205, + "learning_rate": 3.498517072230351e-06, + "loss": 0.5373, + "step": 5302 + }, + { + "epoch": 0.61, + "grad_norm": 2.3319728714856676, + "learning_rate": 3.4967421524481125e-06, + "loss": 0.4574, + "step": 5303 + }, + { + "epoch": 0.61, + "grad_norm": 4.431112093270714, + "learning_rate": 3.49496744089689e-06, + "loss": 0.5058, + "step": 5304 + }, + { + "epoch": 0.61, + "grad_norm": 2.143903065927962, + "learning_rate": 3.493192937822518e-06, + "loss": 0.4619, + "step": 5305 + }, + { + "epoch": 0.61, + "grad_norm": 3.518626801519871, + "learning_rate": 3.491418643470799e-06, + "loss": 0.5322, + "step": 5306 + }, + { + "epoch": 0.61, + "grad_norm": 1.915067079841346, + "learning_rate": 3.489644558087507e-06, + "loss": 0.4577, + "step": 5307 + }, + { + "epoch": 0.61, + "grad_norm": 2.4543087513451565, + "learning_rate": 3.4878706819183903e-06, + "loss": 0.4314, + "step": 5308 + }, + { + "epoch": 0.61, + "grad_norm": 1.5880688068754105, + "learning_rate": 3.4860970152091644e-06, + "loss": 0.5391, + "step": 5309 + }, + { + "epoch": 0.61, + "grad_norm": 1.5817385074194683, + "learning_rate": 3.484323558205518e-06, + "loss": 0.5179, + "step": 5310 + }, + { + "epoch": 0.61, + "grad_norm": 2.4559778435572803, + "learning_rate": 3.48255031115311e-06, + "loss": 0.4055, + "step": 5311 + }, + { + "epoch": 0.61, + "grad_norm": 1.952043817900018, + "learning_rate": 3.480777274297571e-06, + "loss": 0.5661, + "step": 5312 + }, + { + "epoch": 0.61, + "grad_norm": 1.9926922834617962, + "learning_rate": 3.4790044478845e-06, + "loss": 0.4946, + "step": 5313 + }, + { + "epoch": 0.61, + "grad_norm": 1.7823876998327304, + "learning_rate": 3.4772318321594686e-06, + "loss": 0.403, + "step": 5314 + }, + { + "epoch": 0.61, + "grad_norm": 2.339912615938851, + "learning_rate": 3.4754594273680205e-06, + "loss": 0.5297, + "step": 5315 + }, + { + "epoch": 0.61, + "grad_norm": 2.1285387966501754, + "learning_rate": 3.473687233755668e-06, + "loss": 0.5384, + "step": 5316 + }, + { + "epoch": 0.61, + "grad_norm": 7.856271013534417, + "learning_rate": 3.4719152515678967e-06, + "loss": 0.5382, + "step": 5317 + }, + { + "epoch": 0.61, + "grad_norm": 2.1203329103856565, + "learning_rate": 3.470143481050159e-06, + "loss": 0.5993, + "step": 5318 + }, + { + "epoch": 0.61, + "grad_norm": 2.864352373616673, + "learning_rate": 3.468371922447884e-06, + "loss": 0.4734, + "step": 5319 + }, + { + "epoch": 0.61, + "grad_norm": 2.4363359030816873, + "learning_rate": 3.4666005760064624e-06, + "loss": 0.5036, + "step": 5320 + }, + { + "epoch": 0.61, + "grad_norm": 3.8305137052569864, + "learning_rate": 3.4648294419712637e-06, + "loss": 0.4562, + "step": 5321 + }, + { + "epoch": 0.61, + "grad_norm": 1.893200774823598, + "learning_rate": 3.463058520587625e-06, + "loss": 0.464, + "step": 5322 + }, + { + "epoch": 0.61, + "grad_norm": 1.9173374060899253, + "learning_rate": 3.461287812100853e-06, + "loss": 0.4617, + "step": 5323 + }, + { + "epoch": 0.61, + "grad_norm": 3.759629342310016, + "learning_rate": 3.459517316756228e-06, + "loss": 0.4925, + "step": 5324 + }, + { + "epoch": 0.61, + "grad_norm": 1.7412212971525658, + "learning_rate": 3.4577470347989965e-06, + "loss": 0.4454, + "step": 5325 + }, + { + "epoch": 0.61, + "grad_norm": 3.8553309403186957, + "learning_rate": 3.4559769664743792e-06, + "loss": 0.6085, + "step": 5326 + }, + { + "epoch": 0.61, + "grad_norm": 2.436516098349339, + "learning_rate": 3.4542071120275644e-06, + "loss": 0.4478, + "step": 5327 + }, + { + "epoch": 0.61, + "grad_norm": 2.088911213070293, + "learning_rate": 3.4524374717037135e-06, + "loss": 0.462, + "step": 5328 + }, + { + "epoch": 0.61, + "grad_norm": 2.0056973989536324, + "learning_rate": 3.450668045747956e-06, + "loss": 0.5238, + "step": 5329 + }, + { + "epoch": 0.61, + "grad_norm": 2.7648725598491706, + "learning_rate": 3.448898834405392e-06, + "loss": 0.5179, + "step": 5330 + }, + { + "epoch": 0.61, + "grad_norm": 2.358872644271964, + "learning_rate": 3.447129837921094e-06, + "loss": 0.5958, + "step": 5331 + }, + { + "epoch": 0.61, + "grad_norm": 2.1599831027891963, + "learning_rate": 3.4453610565401007e-06, + "loss": 0.4505, + "step": 5332 + }, + { + "epoch": 0.61, + "grad_norm": 1.7822481392512597, + "learning_rate": 3.443592490507425e-06, + "loss": 0.4951, + "step": 5333 + }, + { + "epoch": 0.61, + "grad_norm": 2.2730156458120083, + "learning_rate": 3.441824140068047e-06, + "loss": 0.5518, + "step": 5334 + }, + { + "epoch": 0.61, + "grad_norm": 0.8803813422595941, + "learning_rate": 3.4400560054669196e-06, + "loss": 0.7054, + "step": 5335 + }, + { + "epoch": 0.61, + "grad_norm": 1.6603821356390671, + "learning_rate": 3.4382880869489653e-06, + "loss": 0.4261, + "step": 5336 + }, + { + "epoch": 0.61, + "grad_norm": 1.9452875937859426, + "learning_rate": 3.436520384759075e-06, + "loss": 0.4445, + "step": 5337 + }, + { + "epoch": 0.61, + "grad_norm": 1.9081719436096778, + "learning_rate": 3.4347528991421085e-06, + "loss": 0.5139, + "step": 5338 + }, + { + "epoch": 0.61, + "grad_norm": 2.7761579161634535, + "learning_rate": 3.4329856303429e-06, + "loss": 0.5153, + "step": 5339 + }, + { + "epoch": 0.61, + "grad_norm": 0.8932842459481577, + "learning_rate": 3.43121857860625e-06, + "loss": 0.7029, + "step": 5340 + }, + { + "epoch": 0.61, + "grad_norm": 2.0125564829483054, + "learning_rate": 3.4294517441769314e-06, + "loss": 0.4078, + "step": 5341 + }, + { + "epoch": 0.61, + "grad_norm": 2.0091839556871025, + "learning_rate": 3.4276851272996847e-06, + "loss": 0.5321, + "step": 5342 + }, + { + "epoch": 0.61, + "grad_norm": 2.444473233674903, + "learning_rate": 3.4259187282192217e-06, + "loss": 0.4928, + "step": 5343 + }, + { + "epoch": 0.61, + "grad_norm": 2.1310783503963164, + "learning_rate": 3.424152547180225e-06, + "loss": 0.5343, + "step": 5344 + }, + { + "epoch": 0.61, + "grad_norm": 3.6403547022212046, + "learning_rate": 3.422386584427343e-06, + "loss": 0.4685, + "step": 5345 + }, + { + "epoch": 0.61, + "grad_norm": 1.9191705462152533, + "learning_rate": 3.420620840205199e-06, + "loss": 0.4971, + "step": 5346 + }, + { + "epoch": 0.61, + "grad_norm": 12.429433660315318, + "learning_rate": 3.4188553147583824e-06, + "loss": 0.5284, + "step": 5347 + }, + { + "epoch": 0.61, + "grad_norm": 2.040754745324801, + "learning_rate": 3.417090008331454e-06, + "loss": 0.4996, + "step": 5348 + }, + { + "epoch": 0.61, + "grad_norm": 2.627370388501293, + "learning_rate": 3.4153249211689426e-06, + "loss": 0.5271, + "step": 5349 + }, + { + "epoch": 0.61, + "grad_norm": 7.474877128720193, + "learning_rate": 3.413560053515349e-06, + "loss": 0.469, + "step": 5350 + }, + { + "epoch": 0.61, + "grad_norm": 1.7253322722524558, + "learning_rate": 3.4117954056151435e-06, + "loss": 0.5338, + "step": 5351 + }, + { + "epoch": 0.61, + "grad_norm": 1.978972707055065, + "learning_rate": 3.4100309777127634e-06, + "loss": 0.4693, + "step": 5352 + }, + { + "epoch": 0.62, + "grad_norm": 2.0641287041823566, + "learning_rate": 3.408266770052615e-06, + "loss": 0.5592, + "step": 5353 + }, + { + "epoch": 0.62, + "grad_norm": 2.2327455245286902, + "learning_rate": 3.406502782879079e-06, + "loss": 0.5313, + "step": 5354 + }, + { + "epoch": 0.62, + "grad_norm": 2.2216321827365433, + "learning_rate": 3.404739016436501e-06, + "loss": 0.5033, + "step": 5355 + }, + { + "epoch": 0.62, + "grad_norm": 1.6715385416114652, + "learning_rate": 3.4029754709691976e-06, + "loss": 0.5335, + "step": 5356 + }, + { + "epoch": 0.62, + "grad_norm": 1.7420113626740266, + "learning_rate": 3.401212146721457e-06, + "loss": 0.5183, + "step": 5357 + }, + { + "epoch": 0.62, + "grad_norm": 1.9849758206579096, + "learning_rate": 3.3994490439375318e-06, + "loss": 0.4563, + "step": 5358 + }, + { + "epoch": 0.62, + "grad_norm": 2.256120193531315, + "learning_rate": 3.3976861628616477e-06, + "loss": 0.4245, + "step": 5359 + }, + { + "epoch": 0.62, + "grad_norm": 1.7240707967092326, + "learning_rate": 3.395923503737999e-06, + "loss": 0.4165, + "step": 5360 + }, + { + "epoch": 0.62, + "grad_norm": 2.940443166826805, + "learning_rate": 3.3941610668107482e-06, + "loss": 0.4334, + "step": 5361 + }, + { + "epoch": 0.62, + "grad_norm": 3.0212916783727173, + "learning_rate": 3.392398852324029e-06, + "loss": 0.5134, + "step": 5362 + }, + { + "epoch": 0.62, + "grad_norm": 2.388627719039449, + "learning_rate": 3.3906368605219418e-06, + "loss": 0.4939, + "step": 5363 + }, + { + "epoch": 0.62, + "grad_norm": 2.3388150878217133, + "learning_rate": 3.3888750916485585e-06, + "loss": 0.4605, + "step": 5364 + }, + { + "epoch": 0.62, + "grad_norm": 1.9339088214998041, + "learning_rate": 3.3871135459479176e-06, + "loss": 0.5394, + "step": 5365 + }, + { + "epoch": 0.62, + "grad_norm": 3.7420362780740306, + "learning_rate": 3.38535222366403e-06, + "loss": 0.5325, + "step": 5366 + }, + { + "epoch": 0.62, + "grad_norm": 1.9279599503585507, + "learning_rate": 3.383591125040872e-06, + "loss": 0.4081, + "step": 5367 + }, + { + "epoch": 0.62, + "grad_norm": 2.0373659589629542, + "learning_rate": 3.3818302503223915e-06, + "loss": 0.453, + "step": 5368 + }, + { + "epoch": 0.62, + "grad_norm": 2.0723744168466314, + "learning_rate": 3.380069599752507e-06, + "loss": 0.5241, + "step": 5369 + }, + { + "epoch": 0.62, + "grad_norm": 2.0650209590184887, + "learning_rate": 3.378309173575098e-06, + "loss": 0.4903, + "step": 5370 + }, + { + "epoch": 0.62, + "grad_norm": 1.913256572438015, + "learning_rate": 3.3765489720340215e-06, + "loss": 0.5924, + "step": 5371 + }, + { + "epoch": 0.62, + "grad_norm": 1.861917212430981, + "learning_rate": 3.374788995373101e-06, + "loss": 0.5935, + "step": 5372 + }, + { + "epoch": 0.62, + "grad_norm": 3.2134220949662993, + "learning_rate": 3.3730292438361266e-06, + "loss": 0.5764, + "step": 5373 + }, + { + "epoch": 0.62, + "grad_norm": 2.404813075531006, + "learning_rate": 3.37126971766686e-06, + "loss": 0.4597, + "step": 5374 + }, + { + "epoch": 0.62, + "grad_norm": 0.8568576969483848, + "learning_rate": 3.3695104171090297e-06, + "loss": 0.7125, + "step": 5375 + }, + { + "epoch": 0.62, + "grad_norm": 1.9052219258192944, + "learning_rate": 3.3677513424063345e-06, + "loss": 0.5126, + "step": 5376 + }, + { + "epoch": 0.62, + "grad_norm": 1.8837848044143082, + "learning_rate": 3.365992493802441e-06, + "loss": 0.5549, + "step": 5377 + }, + { + "epoch": 0.62, + "grad_norm": 2.4224271200865855, + "learning_rate": 3.364233871540984e-06, + "loss": 0.5573, + "step": 5378 + }, + { + "epoch": 0.62, + "grad_norm": 2.2066407073879364, + "learning_rate": 3.3624754758655674e-06, + "loss": 0.4866, + "step": 5379 + }, + { + "epoch": 0.62, + "grad_norm": 1.8542094062435068, + "learning_rate": 3.360717307019764e-06, + "loss": 0.4013, + "step": 5380 + }, + { + "epoch": 0.62, + "grad_norm": 1.6822937202408952, + "learning_rate": 3.3589593652471153e-06, + "loss": 0.411, + "step": 5381 + }, + { + "epoch": 0.62, + "grad_norm": 1.7544209945581823, + "learning_rate": 3.3572016507911314e-06, + "loss": 0.6129, + "step": 5382 + }, + { + "epoch": 0.62, + "grad_norm": 2.99018500358122, + "learning_rate": 3.3554441638952904e-06, + "loss": 0.5524, + "step": 5383 + }, + { + "epoch": 0.62, + "grad_norm": 2.2243994422778273, + "learning_rate": 3.3536869048030386e-06, + "loss": 0.4495, + "step": 5384 + }, + { + "epoch": 0.62, + "grad_norm": 1.9126584395974655, + "learning_rate": 3.351929873757792e-06, + "loss": 0.4654, + "step": 5385 + }, + { + "epoch": 0.62, + "grad_norm": 1.976247615458194, + "learning_rate": 3.3501730710029358e-06, + "loss": 0.4626, + "step": 5386 + }, + { + "epoch": 0.62, + "grad_norm": 1.8031343232412036, + "learning_rate": 3.348416496781818e-06, + "loss": 0.4519, + "step": 5387 + }, + { + "epoch": 0.62, + "grad_norm": 2.160944353821932, + "learning_rate": 3.3466601513377604e-06, + "loss": 0.5272, + "step": 5388 + }, + { + "epoch": 0.62, + "grad_norm": 1.6072434873495018, + "learning_rate": 3.3449040349140527e-06, + "loss": 0.4441, + "step": 5389 + }, + { + "epoch": 0.62, + "grad_norm": 1.8543581586058269, + "learning_rate": 3.3431481477539513e-06, + "loss": 0.546, + "step": 5390 + }, + { + "epoch": 0.62, + "grad_norm": 2.0003116429734575, + "learning_rate": 3.3413924901006817e-06, + "loss": 0.4957, + "step": 5391 + }, + { + "epoch": 0.62, + "grad_norm": 1.8716364435227555, + "learning_rate": 3.339637062197437e-06, + "loss": 0.5058, + "step": 5392 + }, + { + "epoch": 0.62, + "grad_norm": 1.6204509686606114, + "learning_rate": 3.337881864287379e-06, + "loss": 0.5113, + "step": 5393 + }, + { + "epoch": 0.62, + "grad_norm": 2.2798535147091936, + "learning_rate": 3.3361268966136367e-06, + "loss": 0.5756, + "step": 5394 + }, + { + "epoch": 0.62, + "grad_norm": 2.828169098382473, + "learning_rate": 3.334372159419309e-06, + "loss": 0.4471, + "step": 5395 + }, + { + "epoch": 0.62, + "grad_norm": 2.4188946170754604, + "learning_rate": 3.3326176529474624e-06, + "loss": 0.4624, + "step": 5396 + }, + { + "epoch": 0.62, + "grad_norm": 2.444168748635672, + "learning_rate": 3.330863377441128e-06, + "loss": 0.6145, + "step": 5397 + }, + { + "epoch": 0.62, + "grad_norm": 2.9246093398897894, + "learning_rate": 3.3291093331433107e-06, + "loss": 0.465, + "step": 5398 + }, + { + "epoch": 0.62, + "grad_norm": 2.4755913948672434, + "learning_rate": 3.3273555202969783e-06, + "loss": 0.502, + "step": 5399 + }, + { + "epoch": 0.62, + "grad_norm": 1.6170016059231933, + "learning_rate": 3.3256019391450696e-06, + "loss": 0.4998, + "step": 5400 + }, + { + "epoch": 0.62, + "grad_norm": 0.8409408123480376, + "learning_rate": 3.32384858993049e-06, + "loss": 0.7317, + "step": 5401 + }, + { + "epoch": 0.62, + "grad_norm": 2.0786923399199586, + "learning_rate": 3.3220954728961143e-06, + "loss": 0.5258, + "step": 5402 + }, + { + "epoch": 0.62, + "grad_norm": 2.4926398827495904, + "learning_rate": 3.320342588284784e-06, + "loss": 0.4232, + "step": 5403 + }, + { + "epoch": 0.62, + "grad_norm": 2.2739164111784627, + "learning_rate": 3.318589936339306e-06, + "loss": 0.5064, + "step": 5404 + }, + { + "epoch": 0.62, + "grad_norm": 2.5387373643599718, + "learning_rate": 3.316837517302459e-06, + "loss": 0.499, + "step": 5405 + }, + { + "epoch": 0.62, + "grad_norm": 2.4477125787975433, + "learning_rate": 3.315085331416987e-06, + "loss": 0.4711, + "step": 5406 + }, + { + "epoch": 0.62, + "grad_norm": 2.078543627241337, + "learning_rate": 3.313333378925604e-06, + "loss": 0.541, + "step": 5407 + }, + { + "epoch": 0.62, + "grad_norm": 3.0096122942064905, + "learning_rate": 3.3115816600709883e-06, + "loss": 0.5223, + "step": 5408 + }, + { + "epoch": 0.62, + "grad_norm": 2.134034070691248, + "learning_rate": 3.3098301750957883e-06, + "loss": 0.5498, + "step": 5409 + }, + { + "epoch": 0.62, + "grad_norm": 1.814331441803216, + "learning_rate": 3.3080789242426202e-06, + "loss": 0.5152, + "step": 5410 + }, + { + "epoch": 0.62, + "grad_norm": 1.5947620033916907, + "learning_rate": 3.3063279077540674e-06, + "loss": 0.4965, + "step": 5411 + }, + { + "epoch": 0.62, + "grad_norm": 2.670422635695194, + "learning_rate": 3.304577125872678e-06, + "loss": 0.5252, + "step": 5412 + }, + { + "epoch": 0.62, + "grad_norm": 0.8481309416108033, + "learning_rate": 3.3028265788409724e-06, + "loss": 0.7209, + "step": 5413 + }, + { + "epoch": 0.62, + "grad_norm": 2.3275015351397403, + "learning_rate": 3.301076266901435e-06, + "loss": 0.5774, + "step": 5414 + }, + { + "epoch": 0.62, + "grad_norm": 1.7723760058240716, + "learning_rate": 3.2993261902965185e-06, + "loss": 0.4969, + "step": 5415 + }, + { + "epoch": 0.62, + "grad_norm": 1.6288110863670269, + "learning_rate": 3.2975763492686446e-06, + "loss": 0.3498, + "step": 5416 + }, + { + "epoch": 0.62, + "grad_norm": 3.3159070163206104, + "learning_rate": 3.295826744060199e-06, + "loss": 0.5038, + "step": 5417 + }, + { + "epoch": 0.62, + "grad_norm": 1.9223193865584487, + "learning_rate": 3.294077374913538e-06, + "loss": 0.5378, + "step": 5418 + }, + { + "epoch": 0.62, + "grad_norm": 2.504274870200266, + "learning_rate": 3.2923282420709834e-06, + "loss": 0.5196, + "step": 5419 + }, + { + "epoch": 0.62, + "grad_norm": 1.8891047086202426, + "learning_rate": 3.2905793457748257e-06, + "loss": 0.4703, + "step": 5420 + }, + { + "epoch": 0.62, + "grad_norm": 2.4163546049434963, + "learning_rate": 3.2888306862673197e-06, + "loss": 0.4626, + "step": 5421 + }, + { + "epoch": 0.62, + "grad_norm": 2.2921702959739036, + "learning_rate": 3.2870822637906917e-06, + "loss": 0.5418, + "step": 5422 + }, + { + "epoch": 0.62, + "grad_norm": 2.044643133074909, + "learning_rate": 3.2853340785871313e-06, + "loss": 0.4955, + "step": 5423 + }, + { + "epoch": 0.62, + "grad_norm": 2.0524347023059963, + "learning_rate": 3.283586130898797e-06, + "loss": 0.5119, + "step": 5424 + }, + { + "epoch": 0.62, + "grad_norm": 2.1986270563403023, + "learning_rate": 3.281838420967815e-06, + "loss": 0.4728, + "step": 5425 + }, + { + "epoch": 0.62, + "grad_norm": 1.7030788540366442, + "learning_rate": 3.280090949036277e-06, + "loss": 0.6014, + "step": 5426 + }, + { + "epoch": 0.62, + "grad_norm": 2.2389674800226254, + "learning_rate": 3.278343715346243e-06, + "loss": 0.5127, + "step": 5427 + }, + { + "epoch": 0.62, + "grad_norm": 1.9624577198445174, + "learning_rate": 3.2765967201397393e-06, + "loss": 0.53, + "step": 5428 + }, + { + "epoch": 0.62, + "grad_norm": 3.060024978832597, + "learning_rate": 3.2748499636587596e-06, + "loss": 0.521, + "step": 5429 + }, + { + "epoch": 0.62, + "grad_norm": 1.8716538441438582, + "learning_rate": 3.2731034461452637e-06, + "loss": 0.4534, + "step": 5430 + }, + { + "epoch": 0.62, + "grad_norm": 2.0004141489829204, + "learning_rate": 3.2713571678411794e-06, + "loss": 0.534, + "step": 5431 + }, + { + "epoch": 0.62, + "grad_norm": 2.0127201001254247, + "learning_rate": 3.2696111289884008e-06, + "loss": 0.5803, + "step": 5432 + }, + { + "epoch": 0.62, + "grad_norm": 1.8434622285235553, + "learning_rate": 3.2678653298287887e-06, + "loss": 0.4429, + "step": 5433 + }, + { + "epoch": 0.62, + "grad_norm": 3.1851690966358803, + "learning_rate": 3.2661197706041715e-06, + "loss": 0.5494, + "step": 5434 + }, + { + "epoch": 0.62, + "grad_norm": 1.9239965932129126, + "learning_rate": 3.2643744515563437e-06, + "loss": 0.4243, + "step": 5435 + }, + { + "epoch": 0.62, + "grad_norm": 1.9364864861194138, + "learning_rate": 3.2626293729270663e-06, + "loss": 0.5061, + "step": 5436 + }, + { + "epoch": 0.62, + "grad_norm": 1.8256759623551713, + "learning_rate": 3.260884534958068e-06, + "loss": 0.4751, + "step": 5437 + }, + { + "epoch": 0.62, + "grad_norm": 2.163303035184231, + "learning_rate": 3.2591399378910416e-06, + "loss": 0.5195, + "step": 5438 + }, + { + "epoch": 0.62, + "grad_norm": 2.419613486368604, + "learning_rate": 3.2573955819676495e-06, + "loss": 0.49, + "step": 5439 + }, + { + "epoch": 0.63, + "grad_norm": 2.492563208030891, + "learning_rate": 3.2556514674295192e-06, + "loss": 0.4921, + "step": 5440 + }, + { + "epoch": 0.63, + "grad_norm": 2.151594633538163, + "learning_rate": 3.2539075945182458e-06, + "loss": 0.6302, + "step": 5441 + }, + { + "epoch": 0.63, + "grad_norm": 2.222809386283672, + "learning_rate": 3.252163963475391e-06, + "loss": 0.4145, + "step": 5442 + }, + { + "epoch": 0.63, + "grad_norm": 0.8629987240356548, + "learning_rate": 3.25042057454248e-06, + "loss": 0.7121, + "step": 5443 + }, + { + "epoch": 0.63, + "grad_norm": 1.5957943659851723, + "learning_rate": 3.248677427961008e-06, + "loss": 0.4833, + "step": 5444 + }, + { + "epoch": 0.63, + "grad_norm": 1.6865892741349249, + "learning_rate": 3.246934523972436e-06, + "loss": 0.4822, + "step": 5445 + }, + { + "epoch": 0.63, + "grad_norm": 2.032940037575656, + "learning_rate": 3.2451918628181887e-06, + "loss": 0.5021, + "step": 5446 + }, + { + "epoch": 0.63, + "grad_norm": 1.9625136227564548, + "learning_rate": 3.2434494447396613e-06, + "loss": 0.4151, + "step": 5447 + }, + { + "epoch": 0.63, + "grad_norm": 2.0234879948969127, + "learning_rate": 3.241707269978213e-06, + "loss": 0.4309, + "step": 5448 + }, + { + "epoch": 0.63, + "grad_norm": 1.9984934808801522, + "learning_rate": 3.239965338775169e-06, + "loss": 0.5347, + "step": 5449 + }, + { + "epoch": 0.63, + "grad_norm": 2.8865959966878423, + "learning_rate": 3.2382236513718213e-06, + "loss": 0.5553, + "step": 5450 + }, + { + "epoch": 0.63, + "grad_norm": 2.917491796949907, + "learning_rate": 3.2364822080094284e-06, + "loss": 0.5164, + "step": 5451 + }, + { + "epoch": 0.63, + "grad_norm": 1.8386424263150054, + "learning_rate": 3.234741008929214e-06, + "loss": 0.4726, + "step": 5452 + }, + { + "epoch": 0.63, + "grad_norm": 3.101511843486334, + "learning_rate": 3.23300005437237e-06, + "loss": 0.4987, + "step": 5453 + }, + { + "epoch": 0.63, + "grad_norm": 2.5834371313578064, + "learning_rate": 3.2312593445800543e-06, + "loss": 0.4833, + "step": 5454 + }, + { + "epoch": 0.63, + "grad_norm": 3.074124067488697, + "learning_rate": 3.229518879793385e-06, + "loss": 0.5185, + "step": 5455 + }, + { + "epoch": 0.63, + "grad_norm": 1.6164772418194375, + "learning_rate": 3.227778660253455e-06, + "loss": 0.4007, + "step": 5456 + }, + { + "epoch": 0.63, + "grad_norm": 2.5970426868171055, + "learning_rate": 3.226038686201317e-06, + "loss": 0.454, + "step": 5457 + }, + { + "epoch": 0.63, + "grad_norm": 2.530440871466244, + "learning_rate": 3.224298957877994e-06, + "loss": 0.481, + "step": 5458 + }, + { + "epoch": 0.63, + "grad_norm": 7.872181662058484, + "learning_rate": 3.222559475524471e-06, + "loss": 0.5199, + "step": 5459 + }, + { + "epoch": 0.63, + "grad_norm": 2.107434746615305, + "learning_rate": 3.2208202393817022e-06, + "loss": 0.4578, + "step": 5460 + }, + { + "epoch": 0.63, + "grad_norm": 2.067788759016398, + "learning_rate": 3.219081249690606e-06, + "loss": 0.5462, + "step": 5461 + }, + { + "epoch": 0.63, + "grad_norm": 2.1036400735618215, + "learning_rate": 3.217342506692066e-06, + "loss": 0.4587, + "step": 5462 + }, + { + "epoch": 0.63, + "grad_norm": 3.0479744856530036, + "learning_rate": 3.2156040106269332e-06, + "loss": 0.4521, + "step": 5463 + }, + { + "epoch": 0.63, + "grad_norm": 2.7140449125220787, + "learning_rate": 3.2138657617360234e-06, + "loss": 0.4796, + "step": 5464 + }, + { + "epoch": 0.63, + "grad_norm": 2.2439638484013273, + "learning_rate": 3.2121277602601196e-06, + "loss": 0.4848, + "step": 5465 + }, + { + "epoch": 0.63, + "grad_norm": 2.796180878793987, + "learning_rate": 3.2103900064399686e-06, + "loss": 0.5593, + "step": 5466 + }, + { + "epoch": 0.63, + "grad_norm": 2.4914069756075645, + "learning_rate": 3.2086525005162835e-06, + "loss": 0.5893, + "step": 5467 + }, + { + "epoch": 0.63, + "grad_norm": 2.18116161977915, + "learning_rate": 3.206915242729744e-06, + "loss": 0.495, + "step": 5468 + }, + { + "epoch": 0.63, + "grad_norm": 1.9842344808766266, + "learning_rate": 3.2051782333209945e-06, + "loss": 0.5285, + "step": 5469 + }, + { + "epoch": 0.63, + "grad_norm": 2.2217710507703954, + "learning_rate": 3.203441472530645e-06, + "loss": 0.4712, + "step": 5470 + }, + { + "epoch": 0.63, + "grad_norm": 2.2199659190950634, + "learning_rate": 3.201704960599274e-06, + "loss": 0.4495, + "step": 5471 + }, + { + "epoch": 0.63, + "grad_norm": 1.9866354678362226, + "learning_rate": 3.1999686977674183e-06, + "loss": 0.4999, + "step": 5472 + }, + { + "epoch": 0.63, + "grad_norm": 2.094767689638826, + "learning_rate": 3.198232684275586e-06, + "loss": 0.5136, + "step": 5473 + }, + { + "epoch": 0.63, + "grad_norm": 2.696260652387171, + "learning_rate": 3.1964969203642513e-06, + "loss": 0.3798, + "step": 5474 + }, + { + "epoch": 0.63, + "grad_norm": 1.743684191845175, + "learning_rate": 3.1947614062738507e-06, + "loss": 0.5076, + "step": 5475 + }, + { + "epoch": 0.63, + "grad_norm": 2.1477505059896025, + "learning_rate": 3.1930261422447874e-06, + "loss": 0.4909, + "step": 5476 + }, + { + "epoch": 0.63, + "grad_norm": 2.110614112493166, + "learning_rate": 3.191291128517429e-06, + "loss": 0.5725, + "step": 5477 + }, + { + "epoch": 0.63, + "grad_norm": 1.793577344726605, + "learning_rate": 3.189556365332111e-06, + "loss": 0.5389, + "step": 5478 + }, + { + "epoch": 0.63, + "grad_norm": 2.48563357490089, + "learning_rate": 3.1878218529291315e-06, + "loss": 0.5038, + "step": 5479 + }, + { + "epoch": 0.63, + "grad_norm": 2.1961424574153425, + "learning_rate": 3.1860875915487557e-06, + "loss": 0.4658, + "step": 5480 + }, + { + "epoch": 0.63, + "grad_norm": 2.176353334602592, + "learning_rate": 3.1843535814312122e-06, + "loss": 0.5073, + "step": 5481 + }, + { + "epoch": 0.63, + "grad_norm": 2.3908775853091595, + "learning_rate": 3.182619822816696e-06, + "loss": 0.5216, + "step": 5482 + }, + { + "epoch": 0.63, + "grad_norm": 1.9661136077323, + "learning_rate": 3.1808863159453675e-06, + "loss": 0.4825, + "step": 5483 + }, + { + "epoch": 0.63, + "grad_norm": 2.047933580453743, + "learning_rate": 3.1791530610573508e-06, + "loss": 0.5584, + "step": 5484 + }, + { + "epoch": 0.63, + "grad_norm": 2.0209465887783704, + "learning_rate": 3.1774200583927365e-06, + "loss": 0.5056, + "step": 5485 + }, + { + "epoch": 0.63, + "grad_norm": 2.5529231465982036, + "learning_rate": 3.1756873081915807e-06, + "loss": 0.4111, + "step": 5486 + }, + { + "epoch": 0.63, + "grad_norm": 2.189266039835674, + "learning_rate": 3.1739548106939044e-06, + "loss": 0.4383, + "step": 5487 + }, + { + "epoch": 0.63, + "grad_norm": 2.2626596865526483, + "learning_rate": 3.17222256613969e-06, + "loss": 0.5876, + "step": 5488 + }, + { + "epoch": 0.63, + "grad_norm": 2.5902035126375864, + "learning_rate": 3.1704905747688885e-06, + "loss": 0.5448, + "step": 5489 + }, + { + "epoch": 0.63, + "grad_norm": 2.089935356967814, + "learning_rate": 3.1687588368214164e-06, + "loss": 0.4845, + "step": 5490 + }, + { + "epoch": 0.63, + "grad_norm": 1.8981239175060673, + "learning_rate": 3.1670273525371528e-06, + "loss": 0.4418, + "step": 5491 + }, + { + "epoch": 0.63, + "grad_norm": 2.0005416692033977, + "learning_rate": 3.1652961221559427e-06, + "loss": 0.5027, + "step": 5492 + }, + { + "epoch": 0.63, + "grad_norm": 2.3380197393451994, + "learning_rate": 3.163565145917596e-06, + "loss": 0.463, + "step": 5493 + }, + { + "epoch": 0.63, + "grad_norm": 0.8669324105633256, + "learning_rate": 3.161834424061887e-06, + "loss": 0.6778, + "step": 5494 + }, + { + "epoch": 0.63, + "grad_norm": 2.7693676977422323, + "learning_rate": 3.1601039568285553e-06, + "loss": 0.5203, + "step": 5495 + }, + { + "epoch": 0.63, + "grad_norm": 2.494471524574695, + "learning_rate": 3.1583737444573048e-06, + "loss": 0.4885, + "step": 5496 + }, + { + "epoch": 0.63, + "grad_norm": 1.986629870835382, + "learning_rate": 3.1566437871878047e-06, + "loss": 0.4557, + "step": 5497 + }, + { + "epoch": 0.63, + "grad_norm": 1.90571627829019, + "learning_rate": 3.154914085259688e-06, + "loss": 0.4967, + "step": 5498 + }, + { + "epoch": 0.63, + "grad_norm": 1.8365470487215445, + "learning_rate": 3.153184638912552e-06, + "loss": 0.4354, + "step": 5499 + }, + { + "epoch": 0.63, + "grad_norm": 2.3045297144048993, + "learning_rate": 3.1514554483859607e-06, + "loss": 0.5078, + "step": 5500 + }, + { + "epoch": 0.63, + "grad_norm": 1.7201320928651003, + "learning_rate": 3.1497265139194403e-06, + "loss": 0.4696, + "step": 5501 + }, + { + "epoch": 0.63, + "grad_norm": 2.475811120035794, + "learning_rate": 3.1479978357524825e-06, + "loss": 0.5005, + "step": 5502 + }, + { + "epoch": 0.63, + "grad_norm": 1.9022446174107563, + "learning_rate": 3.1462694141245436e-06, + "loss": 0.5551, + "step": 5503 + }, + { + "epoch": 0.63, + "grad_norm": 2.1188792851230835, + "learning_rate": 3.1445412492750453e-06, + "loss": 0.4868, + "step": 5504 + }, + { + "epoch": 0.63, + "grad_norm": 1.697497688205146, + "learning_rate": 3.1428133414433716e-06, + "loss": 0.4967, + "step": 5505 + }, + { + "epoch": 0.63, + "grad_norm": 1.9011482899226602, + "learning_rate": 3.141085690868871e-06, + "loss": 0.5391, + "step": 5506 + }, + { + "epoch": 0.63, + "grad_norm": 2.726026376328595, + "learning_rate": 3.13935829779086e-06, + "loss": 0.547, + "step": 5507 + }, + { + "epoch": 0.63, + "grad_norm": 0.8517259292572459, + "learning_rate": 3.1376311624486145e-06, + "loss": 0.6902, + "step": 5508 + }, + { + "epoch": 0.63, + "grad_norm": 2.0326054610223574, + "learning_rate": 3.135904285081377e-06, + "loss": 0.5106, + "step": 5509 + }, + { + "epoch": 0.63, + "grad_norm": 1.845098460560559, + "learning_rate": 3.1341776659283563e-06, + "loss": 0.5586, + "step": 5510 + }, + { + "epoch": 0.63, + "grad_norm": 1.9079263731783191, + "learning_rate": 3.132451305228721e-06, + "loss": 0.5593, + "step": 5511 + }, + { + "epoch": 0.63, + "grad_norm": 1.8264977194364973, + "learning_rate": 3.1307252032216084e-06, + "loss": 0.5123, + "step": 5512 + }, + { + "epoch": 0.63, + "grad_norm": 2.1434414483742126, + "learning_rate": 3.1289993601461164e-06, + "loss": 0.563, + "step": 5513 + }, + { + "epoch": 0.63, + "grad_norm": 2.018507874856822, + "learning_rate": 3.1272737762413085e-06, + "loss": 0.4767, + "step": 5514 + }, + { + "epoch": 0.63, + "grad_norm": 0.8679659914785318, + "learning_rate": 3.1255484517462132e-06, + "loss": 0.7118, + "step": 5515 + }, + { + "epoch": 0.63, + "grad_norm": 1.8612713908822534, + "learning_rate": 3.1238233868998226e-06, + "loss": 0.5348, + "step": 5516 + }, + { + "epoch": 0.63, + "grad_norm": 1.5154200542282614, + "learning_rate": 3.12209858194109e-06, + "loss": 0.3655, + "step": 5517 + }, + { + "epoch": 0.63, + "grad_norm": 2.0114303528931625, + "learning_rate": 3.1203740371089373e-06, + "loss": 0.4145, + "step": 5518 + }, + { + "epoch": 0.63, + "grad_norm": 8.079473163891034, + "learning_rate": 3.1186497526422476e-06, + "loss": 0.4273, + "step": 5519 + }, + { + "epoch": 0.63, + "grad_norm": 1.9035999038090468, + "learning_rate": 3.116925728779868e-06, + "loss": 0.542, + "step": 5520 + }, + { + "epoch": 0.63, + "grad_norm": 0.7937425295291928, + "learning_rate": 3.115201965760612e-06, + "loss": 0.6775, + "step": 5521 + }, + { + "epoch": 0.63, + "grad_norm": 1.9570194616629277, + "learning_rate": 3.113478463823252e-06, + "loss": 0.4436, + "step": 5522 + }, + { + "epoch": 0.63, + "grad_norm": 2.878348273581029, + "learning_rate": 3.1117552232065273e-06, + "loss": 0.5439, + "step": 5523 + }, + { + "epoch": 0.63, + "grad_norm": 2.1837151272103306, + "learning_rate": 3.1100322441491425e-06, + "loss": 0.5855, + "step": 5524 + }, + { + "epoch": 0.63, + "grad_norm": 1.741340177795247, + "learning_rate": 3.1083095268897645e-06, + "loss": 0.4565, + "step": 5525 + }, + { + "epoch": 0.63, + "grad_norm": 1.87273493201481, + "learning_rate": 3.106587071667023e-06, + "loss": 0.5266, + "step": 5526 + }, + { + "epoch": 0.64, + "grad_norm": 2.4403967139063805, + "learning_rate": 3.104864878719513e-06, + "loss": 0.4985, + "step": 5527 + }, + { + "epoch": 0.64, + "grad_norm": 1.8949647271768586, + "learning_rate": 3.103142948285791e-06, + "loss": 0.3935, + "step": 5528 + }, + { + "epoch": 0.64, + "grad_norm": 1.9168976559173336, + "learning_rate": 3.1014212806043794e-06, + "loss": 0.4998, + "step": 5529 + }, + { + "epoch": 0.64, + "grad_norm": 1.7620393697609043, + "learning_rate": 3.0996998759137624e-06, + "loss": 0.5804, + "step": 5530 + }, + { + "epoch": 0.64, + "grad_norm": 1.777334719009669, + "learning_rate": 3.0979787344523905e-06, + "loss": 0.4908, + "step": 5531 + }, + { + "epoch": 0.64, + "grad_norm": 1.9867553883862528, + "learning_rate": 3.096257856458674e-06, + "loss": 0.4528, + "step": 5532 + }, + { + "epoch": 0.64, + "grad_norm": 1.7590566537287158, + "learning_rate": 3.0945372421709897e-06, + "loss": 0.547, + "step": 5533 + }, + { + "epoch": 0.64, + "grad_norm": 1.9427973146101325, + "learning_rate": 3.092816891827677e-06, + "loss": 0.4105, + "step": 5534 + }, + { + "epoch": 0.64, + "grad_norm": 1.8576271553816364, + "learning_rate": 3.0910968056670377e-06, + "loss": 0.5031, + "step": 5535 + }, + { + "epoch": 0.64, + "grad_norm": 2.230890943788785, + "learning_rate": 3.0893769839273385e-06, + "loss": 0.5431, + "step": 5536 + }, + { + "epoch": 0.64, + "grad_norm": 2.0470094683375395, + "learning_rate": 3.0876574268468085e-06, + "loss": 0.4813, + "step": 5537 + }, + { + "epoch": 0.64, + "grad_norm": 2.1381721249714833, + "learning_rate": 3.0859381346636423e-06, + "loss": 0.4614, + "step": 5538 + }, + { + "epoch": 0.64, + "grad_norm": 1.9202944937013042, + "learning_rate": 3.084219107615992e-06, + "loss": 0.5541, + "step": 5539 + }, + { + "epoch": 0.64, + "grad_norm": 1.946420394170314, + "learning_rate": 3.08250034594198e-06, + "loss": 0.4418, + "step": 5540 + }, + { + "epoch": 0.64, + "grad_norm": 2.2691653473809463, + "learning_rate": 3.0807818498796873e-06, + "loss": 0.41, + "step": 5541 + }, + { + "epoch": 0.64, + "grad_norm": 2.557680871722288, + "learning_rate": 3.07906361966716e-06, + "loss": 0.4663, + "step": 5542 + }, + { + "epoch": 0.64, + "grad_norm": 1.7339254295367075, + "learning_rate": 3.077345655542408e-06, + "loss": 0.5292, + "step": 5543 + }, + { + "epoch": 0.64, + "grad_norm": 2.1464965626885126, + "learning_rate": 3.075627957743402e-06, + "loss": 0.4082, + "step": 5544 + }, + { + "epoch": 0.64, + "grad_norm": 2.092898928589037, + "learning_rate": 3.0739105265080793e-06, + "loss": 0.4897, + "step": 5545 + }, + { + "epoch": 0.64, + "grad_norm": 2.531116004392681, + "learning_rate": 3.072193362074337e-06, + "loss": 0.5057, + "step": 5546 + }, + { + "epoch": 0.64, + "grad_norm": 1.8261922761823381, + "learning_rate": 3.0704764646800356e-06, + "loss": 0.475, + "step": 5547 + }, + { + "epoch": 0.64, + "grad_norm": 2.061763038437076, + "learning_rate": 3.068759834563e-06, + "loss": 0.4514, + "step": 5548 + }, + { + "epoch": 0.64, + "grad_norm": 1.9244023425591734, + "learning_rate": 3.067043471961017e-06, + "loss": 0.5022, + "step": 5549 + }, + { + "epoch": 0.64, + "grad_norm": 1.7487493527579148, + "learning_rate": 3.065327377111838e-06, + "loss": 0.4487, + "step": 5550 + }, + { + "epoch": 0.64, + "grad_norm": 1.8113799972949425, + "learning_rate": 3.0636115502531756e-06, + "loss": 0.4591, + "step": 5551 + }, + { + "epoch": 0.64, + "grad_norm": 1.996205997864592, + "learning_rate": 3.0618959916227054e-06, + "loss": 0.505, + "step": 5552 + }, + { + "epoch": 0.64, + "grad_norm": 2.275813189925835, + "learning_rate": 3.0601807014580675e-06, + "loss": 0.5045, + "step": 5553 + }, + { + "epoch": 0.64, + "grad_norm": 1.7863640020421998, + "learning_rate": 3.0584656799968626e-06, + "loss": 0.4414, + "step": 5554 + }, + { + "epoch": 0.64, + "grad_norm": 2.5195021084538376, + "learning_rate": 3.0567509274766573e-06, + "loss": 0.5951, + "step": 5555 + }, + { + "epoch": 0.64, + "grad_norm": 2.0794278893549683, + "learning_rate": 3.055036444134975e-06, + "loss": 0.561, + "step": 5556 + }, + { + "epoch": 0.64, + "grad_norm": 1.7128727653221711, + "learning_rate": 3.0533222302093078e-06, + "loss": 0.4713, + "step": 5557 + }, + { + "epoch": 0.64, + "grad_norm": 2.105368716929038, + "learning_rate": 3.0516082859371077e-06, + "loss": 0.5605, + "step": 5558 + }, + { + "epoch": 0.64, + "grad_norm": 2.1726823942615194, + "learning_rate": 3.0498946115557902e-06, + "loss": 0.5954, + "step": 5559 + }, + { + "epoch": 0.64, + "grad_norm": 1.78515968943789, + "learning_rate": 3.048181207302734e-06, + "loss": 0.5501, + "step": 5560 + }, + { + "epoch": 0.64, + "grad_norm": 3.548231730586014, + "learning_rate": 3.0464680734152783e-06, + "loss": 0.5291, + "step": 5561 + }, + { + "epoch": 0.64, + "grad_norm": 1.9965825456504864, + "learning_rate": 3.0447552101307277e-06, + "loss": 0.4596, + "step": 5562 + }, + { + "epoch": 0.64, + "grad_norm": 1.905289898316301, + "learning_rate": 3.043042617686346e-06, + "loss": 0.5355, + "step": 5563 + }, + { + "epoch": 0.64, + "grad_norm": 4.413282011317972, + "learning_rate": 3.0413302963193613e-06, + "loss": 0.534, + "step": 5564 + }, + { + "epoch": 0.64, + "grad_norm": 10.489726013349605, + "learning_rate": 3.0396182462669653e-06, + "loss": 0.5205, + "step": 5565 + }, + { + "epoch": 0.64, + "grad_norm": 1.9336462050810599, + "learning_rate": 3.0379064677663116e-06, + "loss": 0.5173, + "step": 5566 + }, + { + "epoch": 0.64, + "grad_norm": 4.387608144900776, + "learning_rate": 3.0361949610545134e-06, + "loss": 0.5101, + "step": 5567 + }, + { + "epoch": 0.64, + "grad_norm": 2.6786748747638582, + "learning_rate": 3.034483726368648e-06, + "loss": 0.4581, + "step": 5568 + }, + { + "epoch": 0.64, + "grad_norm": 2.190461940130666, + "learning_rate": 3.032772763945757e-06, + "loss": 0.4302, + "step": 5569 + }, + { + "epoch": 0.64, + "grad_norm": 2.112801727325627, + "learning_rate": 3.0310620740228423e-06, + "loss": 0.4769, + "step": 5570 + }, + { + "epoch": 0.64, + "grad_norm": 2.152726871749323, + "learning_rate": 3.0293516568368674e-06, + "loss": 0.4952, + "step": 5571 + }, + { + "epoch": 0.64, + "grad_norm": 2.4045654305167803, + "learning_rate": 3.027641512624763e-06, + "loss": 0.5409, + "step": 5572 + }, + { + "epoch": 0.64, + "grad_norm": 1.9276201081550612, + "learning_rate": 3.0259316416234108e-06, + "loss": 0.5344, + "step": 5573 + }, + { + "epoch": 0.64, + "grad_norm": 2.052616622214022, + "learning_rate": 3.024222044069667e-06, + "loss": 0.472, + "step": 5574 + }, + { + "epoch": 0.64, + "grad_norm": 2.4088133171919877, + "learning_rate": 3.022512720200342e-06, + "loss": 0.5486, + "step": 5575 + }, + { + "epoch": 0.64, + "grad_norm": 1.9674143461359455, + "learning_rate": 3.0208036702522137e-06, + "loss": 0.57, + "step": 5576 + }, + { + "epoch": 0.64, + "grad_norm": 1.8536634249450537, + "learning_rate": 3.019094894462018e-06, + "loss": 0.499, + "step": 5577 + }, + { + "epoch": 0.64, + "grad_norm": 1.9017673700005961, + "learning_rate": 3.017386393066455e-06, + "loss": 0.5281, + "step": 5578 + }, + { + "epoch": 0.64, + "grad_norm": 1.6885242358406094, + "learning_rate": 3.0156781663021845e-06, + "loss": 0.6116, + "step": 5579 + }, + { + "epoch": 0.64, + "grad_norm": 4.3147518412845445, + "learning_rate": 3.013970214405831e-06, + "loss": 0.5254, + "step": 5580 + }, + { + "epoch": 0.64, + "grad_norm": 1.9401791217959017, + "learning_rate": 3.0122625376139793e-06, + "loss": 0.4231, + "step": 5581 + }, + { + "epoch": 0.64, + "grad_norm": 1.7291908519215604, + "learning_rate": 3.010555136163177e-06, + "loss": 0.4613, + "step": 5582 + }, + { + "epoch": 0.64, + "grad_norm": 1.7713284558622675, + "learning_rate": 3.008848010289932e-06, + "loss": 0.4499, + "step": 5583 + }, + { + "epoch": 0.64, + "grad_norm": 1.7822423304715316, + "learning_rate": 3.0071411602307167e-06, + "loss": 0.496, + "step": 5584 + }, + { + "epoch": 0.64, + "grad_norm": 2.2463531897993114, + "learning_rate": 3.0054345862219615e-06, + "loss": 0.5358, + "step": 5585 + }, + { + "epoch": 0.64, + "grad_norm": 2.8413978291311452, + "learning_rate": 3.003728288500064e-06, + "loss": 0.513, + "step": 5586 + }, + { + "epoch": 0.64, + "grad_norm": 1.792180429678406, + "learning_rate": 3.0020222673013767e-06, + "loss": 0.4822, + "step": 5587 + }, + { + "epoch": 0.64, + "grad_norm": 1.7628663491262724, + "learning_rate": 3.000316522862219e-06, + "loss": 0.4933, + "step": 5588 + }, + { + "epoch": 0.64, + "grad_norm": 2.015449978990122, + "learning_rate": 2.998611055418871e-06, + "loss": 0.4533, + "step": 5589 + }, + { + "epoch": 0.64, + "grad_norm": 2.002256252184459, + "learning_rate": 2.9969058652075722e-06, + "loss": 0.5612, + "step": 5590 + }, + { + "epoch": 0.64, + "grad_norm": 2.2630874976622684, + "learning_rate": 2.9952009524645254e-06, + "loss": 0.453, + "step": 5591 + }, + { + "epoch": 0.64, + "grad_norm": 2.433846682130279, + "learning_rate": 2.993496317425897e-06, + "loss": 0.5015, + "step": 5592 + }, + { + "epoch": 0.64, + "grad_norm": 1.810111399259158, + "learning_rate": 2.99179196032781e-06, + "loss": 0.4463, + "step": 5593 + }, + { + "epoch": 0.64, + "grad_norm": 6.426628072379558, + "learning_rate": 2.9900878814063526e-06, + "loss": 0.4677, + "step": 5594 + }, + { + "epoch": 0.64, + "grad_norm": 2.4928502799682812, + "learning_rate": 2.9883840808975745e-06, + "loss": 0.4738, + "step": 5595 + }, + { + "epoch": 0.64, + "grad_norm": 2.7808878450193655, + "learning_rate": 2.9866805590374846e-06, + "loss": 0.5381, + "step": 5596 + }, + { + "epoch": 0.64, + "grad_norm": 0.8043546142564383, + "learning_rate": 2.9849773160620554e-06, + "loss": 0.6734, + "step": 5597 + }, + { + "epoch": 0.64, + "grad_norm": 2.6821684515836117, + "learning_rate": 2.983274352207218e-06, + "loss": 0.5298, + "step": 5598 + }, + { + "epoch": 0.64, + "grad_norm": 3.529727386334369, + "learning_rate": 2.98157166770887e-06, + "loss": 0.4758, + "step": 5599 + }, + { + "epoch": 0.64, + "grad_norm": 2.159918653984131, + "learning_rate": 2.9798692628028637e-06, + "loss": 0.545, + "step": 5600 + }, + { + "epoch": 0.64, + "grad_norm": 5.192039667581039, + "learning_rate": 2.9781671377250177e-06, + "loss": 0.5305, + "step": 5601 + }, + { + "epoch": 0.64, + "grad_norm": 1.7636665877775353, + "learning_rate": 2.9764652927111092e-06, + "loss": 0.5106, + "step": 5602 + }, + { + "epoch": 0.64, + "grad_norm": 1.9573987250484117, + "learning_rate": 2.974763727996878e-06, + "loss": 0.5559, + "step": 5603 + }, + { + "epoch": 0.64, + "grad_norm": 2.014230716464861, + "learning_rate": 2.9730624438180244e-06, + "loss": 0.5054, + "step": 5604 + }, + { + "epoch": 0.64, + "grad_norm": 2.1315577173684854, + "learning_rate": 2.9713614404102108e-06, + "loss": 0.5395, + "step": 5605 + }, + { + "epoch": 0.64, + "grad_norm": 1.6365137926792894, + "learning_rate": 2.9696607180090596e-06, + "loss": 0.5651, + "step": 5606 + }, + { + "epoch": 0.64, + "grad_norm": 2.968610861132133, + "learning_rate": 2.9679602768501527e-06, + "loss": 0.4986, + "step": 5607 + }, + { + "epoch": 0.64, + "grad_norm": 1.8180774672669044, + "learning_rate": 2.9662601171690364e-06, + "loss": 0.5251, + "step": 5608 + }, + { + "epoch": 0.64, + "grad_norm": 7.55405375533643, + "learning_rate": 2.9645602392012167e-06, + "loss": 0.4422, + "step": 5609 + }, + { + "epoch": 0.64, + "grad_norm": 1.8200873166140532, + "learning_rate": 2.96286064318216e-06, + "loss": 0.5504, + "step": 5610 + }, + { + "epoch": 0.64, + "grad_norm": 2.277965607657865, + "learning_rate": 2.9611613293472953e-06, + "loss": 0.504, + "step": 5611 + }, + { + "epoch": 0.64, + "grad_norm": 2.3345107016301174, + "learning_rate": 2.9594622979320087e-06, + "loss": 0.508, + "step": 5612 + }, + { + "epoch": 0.64, + "grad_norm": 3.383492905719264, + "learning_rate": 2.957763549171651e-06, + "loss": 0.507, + "step": 5613 + }, + { + "epoch": 0.65, + "grad_norm": 2.7011307169589607, + "learning_rate": 2.956065083301533e-06, + "loss": 0.53, + "step": 5614 + }, + { + "epoch": 0.65, + "grad_norm": 1.8740358657998757, + "learning_rate": 2.9543669005569257e-06, + "loss": 0.5109, + "step": 5615 + }, + { + "epoch": 0.65, + "grad_norm": 2.3754266751026205, + "learning_rate": 2.952669001173061e-06, + "loss": 0.4666, + "step": 5616 + }, + { + "epoch": 0.65, + "grad_norm": 2.194253463705244, + "learning_rate": 2.950971385385132e-06, + "loss": 0.5572, + "step": 5617 + }, + { + "epoch": 0.65, + "grad_norm": 1.8873998754943746, + "learning_rate": 2.9492740534282917e-06, + "loss": 0.543, + "step": 5618 + }, + { + "epoch": 0.65, + "grad_norm": 2.055232485591827, + "learning_rate": 2.947577005537654e-06, + "loss": 0.4085, + "step": 5619 + }, + { + "epoch": 0.65, + "grad_norm": 1.9962789172501518, + "learning_rate": 2.9458802419482934e-06, + "loss": 0.4775, + "step": 5620 + }, + { + "epoch": 0.65, + "grad_norm": 1.8066721073113896, + "learning_rate": 2.9441837628952468e-06, + "loss": 0.553, + "step": 5621 + }, + { + "epoch": 0.65, + "grad_norm": 2.2527767843167266, + "learning_rate": 2.942487568613509e-06, + "loss": 0.5608, + "step": 5622 + }, + { + "epoch": 0.65, + "grad_norm": 1.8920301331205227, + "learning_rate": 2.940791659338039e-06, + "loss": 0.4596, + "step": 5623 + }, + { + "epoch": 0.65, + "grad_norm": 2.2021626508148815, + "learning_rate": 2.93909603530375e-06, + "loss": 0.5382, + "step": 5624 + }, + { + "epoch": 0.65, + "grad_norm": 2.2409667496750187, + "learning_rate": 2.9374006967455203e-06, + "loss": 0.4685, + "step": 5625 + }, + { + "epoch": 0.65, + "grad_norm": 1.839341710079866, + "learning_rate": 2.9357056438981894e-06, + "loss": 0.5012, + "step": 5626 + }, + { + "epoch": 0.65, + "grad_norm": 2.0694968623110483, + "learning_rate": 2.9340108769965553e-06, + "loss": 0.5146, + "step": 5627 + }, + { + "epoch": 0.65, + "grad_norm": 2.0164962760058835, + "learning_rate": 2.9323163962753764e-06, + "loss": 0.4064, + "step": 5628 + }, + { + "epoch": 0.65, + "grad_norm": 6.082777147941806, + "learning_rate": 2.930622201969372e-06, + "loss": 0.506, + "step": 5629 + }, + { + "epoch": 0.65, + "grad_norm": 1.886903302637016, + "learning_rate": 2.9289282943132226e-06, + "loss": 0.5085, + "step": 5630 + }, + { + "epoch": 0.65, + "grad_norm": 2.361370364914034, + "learning_rate": 2.927234673541567e-06, + "loss": 0.5499, + "step": 5631 + }, + { + "epoch": 0.65, + "grad_norm": 2.760155113786717, + "learning_rate": 2.9255413398890055e-06, + "loss": 0.4053, + "step": 5632 + }, + { + "epoch": 0.65, + "grad_norm": 2.2837585068520148, + "learning_rate": 2.9238482935900974e-06, + "loss": 0.5029, + "step": 5633 + }, + { + "epoch": 0.65, + "grad_norm": 1.6378144102170837, + "learning_rate": 2.9221555348793646e-06, + "loss": 0.4746, + "step": 5634 + }, + { + "epoch": 0.65, + "grad_norm": 1.7546234545372479, + "learning_rate": 2.9204630639912867e-06, + "loss": 0.4296, + "step": 5635 + }, + { + "epoch": 0.65, + "grad_norm": 1.9472238640172752, + "learning_rate": 2.9187708811603044e-06, + "loss": 0.5231, + "step": 5636 + }, + { + "epoch": 0.65, + "grad_norm": 2.4235417462581457, + "learning_rate": 2.9170789866208216e-06, + "loss": 0.4472, + "step": 5637 + }, + { + "epoch": 0.65, + "grad_norm": 2.4986276900734743, + "learning_rate": 2.915387380607196e-06, + "loss": 0.5465, + "step": 5638 + }, + { + "epoch": 0.65, + "grad_norm": 2.478941897051696, + "learning_rate": 2.9136960633537493e-06, + "loss": 0.5031, + "step": 5639 + }, + { + "epoch": 0.65, + "grad_norm": 2.238295223625105, + "learning_rate": 2.9120050350947614e-06, + "loss": 0.4651, + "step": 5640 + }, + { + "epoch": 0.65, + "grad_norm": 1.7928643623108445, + "learning_rate": 2.910314296064476e-06, + "loss": 0.5198, + "step": 5641 + }, + { + "epoch": 0.65, + "grad_norm": 2.3034484913734308, + "learning_rate": 2.9086238464970896e-06, + "loss": 0.5545, + "step": 5642 + }, + { + "epoch": 0.65, + "grad_norm": 2.353874870908824, + "learning_rate": 2.9069336866267685e-06, + "loss": 0.5515, + "step": 5643 + }, + { + "epoch": 0.65, + "grad_norm": 1.7518219792267016, + "learning_rate": 2.9052438166876305e-06, + "loss": 0.3649, + "step": 5644 + }, + { + "epoch": 0.65, + "grad_norm": 1.897320106620009, + "learning_rate": 2.903554236913754e-06, + "loss": 0.4353, + "step": 5645 + }, + { + "epoch": 0.65, + "grad_norm": 0.8003529443714816, + "learning_rate": 2.901864947539184e-06, + "loss": 0.6799, + "step": 5646 + }, + { + "epoch": 0.65, + "grad_norm": 2.048362992868039, + "learning_rate": 2.900175948797916e-06, + "loss": 0.4855, + "step": 5647 + }, + { + "epoch": 0.65, + "grad_norm": 1.684062112393004, + "learning_rate": 2.8984872409239136e-06, + "loss": 0.4983, + "step": 5648 + }, + { + "epoch": 0.65, + "grad_norm": 2.6400390166018495, + "learning_rate": 2.8967988241510924e-06, + "loss": 0.5321, + "step": 5649 + }, + { + "epoch": 0.65, + "grad_norm": 2.3501465331481017, + "learning_rate": 2.895110698713336e-06, + "loss": 0.4653, + "step": 5650 + }, + { + "epoch": 0.65, + "grad_norm": 1.8975275483326586, + "learning_rate": 2.893422864844479e-06, + "loss": 0.4664, + "step": 5651 + }, + { + "epoch": 0.65, + "grad_norm": 2.126082003963139, + "learning_rate": 2.891735322778324e-06, + "loss": 0.4617, + "step": 5652 + }, + { + "epoch": 0.65, + "grad_norm": 1.7357050787203474, + "learning_rate": 2.890048072748625e-06, + "loss": 0.5149, + "step": 5653 + }, + { + "epoch": 0.65, + "grad_norm": 3.4739901044199195, + "learning_rate": 2.888361114989103e-06, + "loss": 0.4183, + "step": 5654 + }, + { + "epoch": 0.65, + "grad_norm": 2.1827995866571017, + "learning_rate": 2.886674449733432e-06, + "loss": 0.4719, + "step": 5655 + }, + { + "epoch": 0.65, + "grad_norm": 2.8730794251334473, + "learning_rate": 2.8849880772152537e-06, + "loss": 0.4317, + "step": 5656 + }, + { + "epoch": 0.65, + "grad_norm": 2.4079443838520653, + "learning_rate": 2.8833019976681577e-06, + "loss": 0.4901, + "step": 5657 + }, + { + "epoch": 0.65, + "grad_norm": 1.9250668492759944, + "learning_rate": 2.8816162113257033e-06, + "loss": 0.5425, + "step": 5658 + }, + { + "epoch": 0.65, + "grad_norm": 1.7391656383410103, + "learning_rate": 2.879930718421403e-06, + "loss": 0.5909, + "step": 5659 + }, + { + "epoch": 0.65, + "grad_norm": 2.205475936238161, + "learning_rate": 2.8782455191887345e-06, + "loss": 0.5035, + "step": 5660 + }, + { + "epoch": 0.65, + "grad_norm": 2.1627633028304825, + "learning_rate": 2.876560613861127e-06, + "loss": 0.4877, + "step": 5661 + }, + { + "epoch": 0.65, + "grad_norm": 2.0939309266324004, + "learning_rate": 2.874876002671977e-06, + "loss": 0.4919, + "step": 5662 + }, + { + "epoch": 0.65, + "grad_norm": 1.7273308552452442, + "learning_rate": 2.8731916858546335e-06, + "loss": 0.5061, + "step": 5663 + }, + { + "epoch": 0.65, + "grad_norm": 2.621336402765989, + "learning_rate": 2.871507663642411e-06, + "loss": 0.5696, + "step": 5664 + }, + { + "epoch": 0.65, + "grad_norm": 1.9998646592161322, + "learning_rate": 2.8698239362685785e-06, + "loss": 0.5937, + "step": 5665 + }, + { + "epoch": 0.65, + "grad_norm": 2.614144241895868, + "learning_rate": 2.8681405039663636e-06, + "loss": 0.4512, + "step": 5666 + }, + { + "epoch": 0.65, + "grad_norm": 2.4178270945943963, + "learning_rate": 2.8664573669689584e-06, + "loss": 0.5197, + "step": 5667 + }, + { + "epoch": 0.65, + "grad_norm": 0.8032245847710764, + "learning_rate": 2.864774525509507e-06, + "loss": 0.6664, + "step": 5668 + }, + { + "epoch": 0.65, + "grad_norm": 0.8471000327629293, + "learning_rate": 2.863091979821121e-06, + "loss": 0.7335, + "step": 5669 + }, + { + "epoch": 0.65, + "grad_norm": 1.8761578957606613, + "learning_rate": 2.8614097301368616e-06, + "loss": 0.4083, + "step": 5670 + }, + { + "epoch": 0.65, + "grad_norm": 1.8753151967331183, + "learning_rate": 2.859727776689758e-06, + "loss": 0.5161, + "step": 5671 + }, + { + "epoch": 0.65, + "grad_norm": 1.515420821785121, + "learning_rate": 2.85804611971279e-06, + "loss": 0.3771, + "step": 5672 + }, + { + "epoch": 0.65, + "grad_norm": 2.2830445624264146, + "learning_rate": 2.8563647594389067e-06, + "loss": 0.4898, + "step": 5673 + }, + { + "epoch": 0.65, + "grad_norm": 3.2151272323820685, + "learning_rate": 2.8546836961010016e-06, + "loss": 0.5167, + "step": 5674 + }, + { + "epoch": 0.65, + "grad_norm": 1.9515475982274781, + "learning_rate": 2.8530029299319416e-06, + "loss": 0.5451, + "step": 5675 + }, + { + "epoch": 0.65, + "grad_norm": 1.823961450737385, + "learning_rate": 2.8513224611645414e-06, + "loss": 0.412, + "step": 5676 + }, + { + "epoch": 0.65, + "grad_norm": 2.4296216065732197, + "learning_rate": 2.8496422900315834e-06, + "loss": 0.4368, + "step": 5677 + }, + { + "epoch": 0.65, + "grad_norm": 1.8785117590328633, + "learning_rate": 2.8479624167658013e-06, + "loss": 0.4788, + "step": 5678 + }, + { + "epoch": 0.65, + "grad_norm": 2.170129258520768, + "learning_rate": 2.846282841599895e-06, + "loss": 0.5471, + "step": 5679 + }, + { + "epoch": 0.65, + "grad_norm": 1.674894837068322, + "learning_rate": 2.8446035647665136e-06, + "loss": 0.4356, + "step": 5680 + }, + { + "epoch": 0.65, + "grad_norm": 1.7638860126417175, + "learning_rate": 2.8429245864982756e-06, + "loss": 0.5654, + "step": 5681 + }, + { + "epoch": 0.65, + "grad_norm": 1.9252802047097994, + "learning_rate": 2.8412459070277486e-06, + "loss": 0.4529, + "step": 5682 + }, + { + "epoch": 0.65, + "grad_norm": 2.7797247844760844, + "learning_rate": 2.839567526587466e-06, + "loss": 0.5436, + "step": 5683 + }, + { + "epoch": 0.65, + "grad_norm": 2.198085281001469, + "learning_rate": 2.8378894454099158e-06, + "loss": 0.4086, + "step": 5684 + }, + { + "epoch": 0.65, + "grad_norm": 2.1621371642707015, + "learning_rate": 2.8362116637275425e-06, + "loss": 0.4674, + "step": 5685 + }, + { + "epoch": 0.65, + "grad_norm": 2.6402653158014333, + "learning_rate": 2.834534181772758e-06, + "loss": 0.4756, + "step": 5686 + }, + { + "epoch": 0.65, + "grad_norm": 2.23170526549326, + "learning_rate": 2.832856999777921e-06, + "loss": 0.4468, + "step": 5687 + }, + { + "epoch": 0.65, + "grad_norm": 1.8353874480073944, + "learning_rate": 2.8311801179753583e-06, + "loss": 0.3524, + "step": 5688 + }, + { + "epoch": 0.65, + "grad_norm": 2.4148347914274706, + "learning_rate": 2.8295035365973487e-06, + "loss": 0.4478, + "step": 5689 + }, + { + "epoch": 0.65, + "grad_norm": 2.125699964299632, + "learning_rate": 2.827827255876137e-06, + "loss": 0.5036, + "step": 5690 + }, + { + "epoch": 0.65, + "grad_norm": 1.9577728957092018, + "learning_rate": 2.8261512760439136e-06, + "loss": 0.508, + "step": 5691 + }, + { + "epoch": 0.65, + "grad_norm": 2.0656096645437483, + "learning_rate": 2.8244755973328413e-06, + "loss": 0.5216, + "step": 5692 + }, + { + "epoch": 0.65, + "grad_norm": 2.170529465923274, + "learning_rate": 2.8228002199750306e-06, + "loss": 0.5317, + "step": 5693 + }, + { + "epoch": 0.65, + "grad_norm": 2.296225017803758, + "learning_rate": 2.821125144202558e-06, + "loss": 0.5199, + "step": 5694 + }, + { + "epoch": 0.65, + "grad_norm": 5.701663214273712, + "learning_rate": 2.8194503702474505e-06, + "loss": 0.4798, + "step": 5695 + }, + { + "epoch": 0.65, + "grad_norm": 2.292155764430784, + "learning_rate": 2.8177758983417024e-06, + "loss": 0.5405, + "step": 5696 + }, + { + "epoch": 0.65, + "grad_norm": 2.339171185156427, + "learning_rate": 2.8161017287172573e-06, + "loss": 0.5379, + "step": 5697 + }, + { + "epoch": 0.65, + "grad_norm": 3.0378792943286688, + "learning_rate": 2.814427861606024e-06, + "loss": 0.4655, + "step": 5698 + }, + { + "epoch": 0.65, + "grad_norm": 1.7277773840510053, + "learning_rate": 2.8127542972398625e-06, + "loss": 0.457, + "step": 5699 + }, + { + "epoch": 0.65, + "grad_norm": 1.9781406220520874, + "learning_rate": 2.811081035850599e-06, + "loss": 0.5602, + "step": 5700 + }, + { + "epoch": 0.66, + "grad_norm": 4.231285710850122, + "learning_rate": 2.80940807767001e-06, + "loss": 0.5333, + "step": 5701 + }, + { + "epoch": 0.66, + "grad_norm": 1.6892566653724403, + "learning_rate": 2.807735422929836e-06, + "loss": 0.5036, + "step": 5702 + }, + { + "epoch": 0.66, + "grad_norm": 2.1808149104142007, + "learning_rate": 2.8060630718617723e-06, + "loss": 0.6168, + "step": 5703 + }, + { + "epoch": 0.66, + "grad_norm": 1.6602727075015502, + "learning_rate": 2.804391024697469e-06, + "loss": 0.4828, + "step": 5704 + }, + { + "epoch": 0.66, + "grad_norm": 2.6278307415698396, + "learning_rate": 2.8027192816685434e-06, + "loss": 0.4313, + "step": 5705 + }, + { + "epoch": 0.66, + "grad_norm": 2.5519094177781656, + "learning_rate": 2.80104784300656e-06, + "loss": 0.38, + "step": 5706 + }, + { + "epoch": 0.66, + "grad_norm": 2.1714243849884087, + "learning_rate": 2.79937670894305e-06, + "loss": 0.4862, + "step": 5707 + }, + { + "epoch": 0.66, + "grad_norm": 2.070083245800295, + "learning_rate": 2.7977058797094974e-06, + "loss": 0.5087, + "step": 5708 + }, + { + "epoch": 0.66, + "grad_norm": 2.652779830037731, + "learning_rate": 2.7960353555373454e-06, + "loss": 0.5049, + "step": 5709 + }, + { + "epoch": 0.66, + "grad_norm": 1.928166747132015, + "learning_rate": 2.7943651366579917e-06, + "loss": 0.5735, + "step": 5710 + }, + { + "epoch": 0.66, + "grad_norm": 2.025659986101139, + "learning_rate": 2.792695223302799e-06, + "loss": 0.5095, + "step": 5711 + }, + { + "epoch": 0.66, + "grad_norm": 1.6655407474206225, + "learning_rate": 2.79102561570308e-06, + "loss": 0.4599, + "step": 5712 + }, + { + "epoch": 0.66, + "grad_norm": 4.1855641239998915, + "learning_rate": 2.7893563140901125e-06, + "loss": 0.4193, + "step": 5713 + }, + { + "epoch": 0.66, + "grad_norm": 2.079525184061289, + "learning_rate": 2.787687318695123e-06, + "loss": 0.4483, + "step": 5714 + }, + { + "epoch": 0.66, + "grad_norm": 2.2751085126428605, + "learning_rate": 2.786018629749305e-06, + "loss": 0.4702, + "step": 5715 + }, + { + "epoch": 0.66, + "grad_norm": 3.1180719366246534, + "learning_rate": 2.7843502474838015e-06, + "loss": 0.4647, + "step": 5716 + }, + { + "epoch": 0.66, + "grad_norm": 1.9350993030459924, + "learning_rate": 2.782682172129719e-06, + "loss": 0.4894, + "step": 5717 + }, + { + "epoch": 0.66, + "grad_norm": 3.106591020195427, + "learning_rate": 2.7810144039181164e-06, + "loss": 0.4359, + "step": 5718 + }, + { + "epoch": 0.66, + "grad_norm": 2.426518038918427, + "learning_rate": 2.779346943080016e-06, + "loss": 0.5052, + "step": 5719 + }, + { + "epoch": 0.66, + "grad_norm": 2.83520680278635, + "learning_rate": 2.777679789846391e-06, + "loss": 0.4917, + "step": 5720 + }, + { + "epoch": 0.66, + "grad_norm": 1.8041865998521727, + "learning_rate": 2.776012944448178e-06, + "loss": 0.381, + "step": 5721 + }, + { + "epoch": 0.66, + "grad_norm": 1.4271895550513658, + "learning_rate": 2.774346407116265e-06, + "loss": 0.4369, + "step": 5722 + }, + { + "epoch": 0.66, + "grad_norm": 2.0407899555614772, + "learning_rate": 2.7726801780815045e-06, + "loss": 0.627, + "step": 5723 + }, + { + "epoch": 0.66, + "grad_norm": 2.7715934730528438, + "learning_rate": 2.7710142575746998e-06, + "loss": 0.48, + "step": 5724 + }, + { + "epoch": 0.66, + "grad_norm": 2.0530119823439854, + "learning_rate": 2.769348645826614e-06, + "loss": 0.4594, + "step": 5725 + }, + { + "epoch": 0.66, + "grad_norm": 1.9831302503387818, + "learning_rate": 2.767683343067965e-06, + "loss": 0.519, + "step": 5726 + }, + { + "epoch": 0.66, + "grad_norm": 2.3778984137966988, + "learning_rate": 2.766018349529435e-06, + "loss": 0.5877, + "step": 5727 + }, + { + "epoch": 0.66, + "grad_norm": 2.4819785207870035, + "learning_rate": 2.764353665441654e-06, + "loss": 0.5585, + "step": 5728 + }, + { + "epoch": 0.66, + "grad_norm": 3.46865970749228, + "learning_rate": 2.762689291035219e-06, + "loss": 0.4429, + "step": 5729 + }, + { + "epoch": 0.66, + "grad_norm": 1.6748284038011523, + "learning_rate": 2.7610252265406744e-06, + "loss": 0.5222, + "step": 5730 + }, + { + "epoch": 0.66, + "grad_norm": 2.140217051413417, + "learning_rate": 2.7593614721885265e-06, + "loss": 0.5149, + "step": 5731 + }, + { + "epoch": 0.66, + "grad_norm": 2.0383365110845224, + "learning_rate": 2.757698028209241e-06, + "loss": 0.4583, + "step": 5732 + }, + { + "epoch": 0.66, + "grad_norm": 0.8334414182055687, + "learning_rate": 2.756034894833235e-06, + "loss": 0.6696, + "step": 5733 + }, + { + "epoch": 0.66, + "grad_norm": 1.7127542373334697, + "learning_rate": 2.7543720722908882e-06, + "loss": 0.4362, + "step": 5734 + }, + { + "epoch": 0.66, + "grad_norm": 1.65288183957263, + "learning_rate": 2.752709560812531e-06, + "loss": 0.4362, + "step": 5735 + }, + { + "epoch": 0.66, + "grad_norm": 1.9265345807581933, + "learning_rate": 2.751047360628458e-06, + "loss": 0.5227, + "step": 5736 + }, + { + "epoch": 0.66, + "grad_norm": 1.851985849118492, + "learning_rate": 2.749385471968914e-06, + "loss": 0.5566, + "step": 5737 + }, + { + "epoch": 0.66, + "grad_norm": 2.1845187670788384, + "learning_rate": 2.7477238950641055e-06, + "loss": 0.3884, + "step": 5738 + }, + { + "epoch": 0.66, + "grad_norm": 2.569301862306714, + "learning_rate": 2.7460626301441917e-06, + "loss": 0.5418, + "step": 5739 + }, + { + "epoch": 0.66, + "grad_norm": 1.8375097797805147, + "learning_rate": 2.7444016774392933e-06, + "loss": 0.4643, + "step": 5740 + }, + { + "epoch": 0.66, + "grad_norm": 2.616077363807139, + "learning_rate": 2.742741037179484e-06, + "loss": 0.4949, + "step": 5741 + }, + { + "epoch": 0.66, + "grad_norm": 2.403794839636427, + "learning_rate": 2.7410807095947955e-06, + "loss": 0.5082, + "step": 5742 + }, + { + "epoch": 0.66, + "grad_norm": 1.8905499016080287, + "learning_rate": 2.7394206949152135e-06, + "loss": 0.4654, + "step": 5743 + }, + { + "epoch": 0.66, + "grad_norm": 1.7470326377205445, + "learning_rate": 2.7377609933706884e-06, + "loss": 0.4923, + "step": 5744 + }, + { + "epoch": 0.66, + "grad_norm": 2.2042882951814877, + "learning_rate": 2.7361016051911154e-06, + "loss": 0.4332, + "step": 5745 + }, + { + "epoch": 0.66, + "grad_norm": 2.133535347566426, + "learning_rate": 2.734442530606358e-06, + "loss": 0.4246, + "step": 5746 + }, + { + "epoch": 0.66, + "grad_norm": 2.1860478311475227, + "learning_rate": 2.7327837698462276e-06, + "loss": 0.458, + "step": 5747 + }, + { + "epoch": 0.66, + "grad_norm": 2.6519785226670036, + "learning_rate": 2.7311253231404987e-06, + "loss": 0.5615, + "step": 5748 + }, + { + "epoch": 0.66, + "grad_norm": 0.8087182123627396, + "learning_rate": 2.7294671907188964e-06, + "loss": 0.6561, + "step": 5749 + }, + { + "epoch": 0.66, + "grad_norm": 2.049457017965674, + "learning_rate": 2.7278093728111044e-06, + "loss": 0.5714, + "step": 5750 + }, + { + "epoch": 0.66, + "grad_norm": 2.0572112988414655, + "learning_rate": 2.726151869646766e-06, + "loss": 0.4624, + "step": 5751 + }, + { + "epoch": 0.66, + "grad_norm": 2.4465945015305666, + "learning_rate": 2.7244946814554746e-06, + "loss": 0.5179, + "step": 5752 + }, + { + "epoch": 0.66, + "grad_norm": 2.386503856819927, + "learning_rate": 2.722837808466788e-06, + "loss": 0.446, + "step": 5753 + }, + { + "epoch": 0.66, + "grad_norm": 5.1214577234598035, + "learning_rate": 2.7211812509102113e-06, + "loss": 0.4592, + "step": 5754 + }, + { + "epoch": 0.66, + "grad_norm": 1.6921632151016615, + "learning_rate": 2.719525009015216e-06, + "loss": 0.5305, + "step": 5755 + }, + { + "epoch": 0.66, + "grad_norm": 1.6616699300950832, + "learning_rate": 2.7178690830112186e-06, + "loss": 0.5036, + "step": 5756 + }, + { + "epoch": 0.66, + "grad_norm": 1.728119427691811, + "learning_rate": 2.716213473127603e-06, + "loss": 0.6117, + "step": 5757 + }, + { + "epoch": 0.66, + "grad_norm": 2.9705482522973417, + "learning_rate": 2.7145581795937013e-06, + "loss": 0.6019, + "step": 5758 + }, + { + "epoch": 0.66, + "grad_norm": 2.075590138977285, + "learning_rate": 2.7129032026388046e-06, + "loss": 0.4676, + "step": 5759 + }, + { + "epoch": 0.66, + "grad_norm": 2.4937973271565976, + "learning_rate": 2.7112485424921584e-06, + "loss": 0.583, + "step": 5760 + }, + { + "epoch": 0.66, + "grad_norm": 2.3486013301836492, + "learning_rate": 2.70959419938297e-06, + "loss": 0.5287, + "step": 5761 + }, + { + "epoch": 0.66, + "grad_norm": 1.8692078848441247, + "learning_rate": 2.7079401735403938e-06, + "loss": 0.4708, + "step": 5762 + }, + { + "epoch": 0.66, + "grad_norm": 2.171377816131872, + "learning_rate": 2.70628646519355e-06, + "loss": 0.5051, + "step": 5763 + }, + { + "epoch": 0.66, + "grad_norm": 2.1408636203571176, + "learning_rate": 2.7046330745715056e-06, + "loss": 0.5066, + "step": 5764 + }, + { + "epoch": 0.66, + "grad_norm": 1.6010355322494831, + "learning_rate": 2.7029800019032924e-06, + "loss": 0.3531, + "step": 5765 + }, + { + "epoch": 0.66, + "grad_norm": 5.678454681219228, + "learning_rate": 2.7013272474178898e-06, + "loss": 0.4946, + "step": 5766 + }, + { + "epoch": 0.66, + "grad_norm": 1.9396844145546188, + "learning_rate": 2.6996748113442397e-06, + "loss": 0.5542, + "step": 5767 + }, + { + "epoch": 0.66, + "grad_norm": 2.729213259829574, + "learning_rate": 2.698022693911237e-06, + "loss": 0.5462, + "step": 5768 + }, + { + "epoch": 0.66, + "grad_norm": 1.6647278650685282, + "learning_rate": 2.6963708953477296e-06, + "loss": 0.4884, + "step": 5769 + }, + { + "epoch": 0.66, + "grad_norm": 0.8153585174880043, + "learning_rate": 2.6947194158825296e-06, + "loss": 0.6277, + "step": 5770 + }, + { + "epoch": 0.66, + "grad_norm": 2.3980437202311533, + "learning_rate": 2.6930682557443944e-06, + "loss": 0.5091, + "step": 5771 + }, + { + "epoch": 0.66, + "grad_norm": 2.1409167777760354, + "learning_rate": 2.6914174151620467e-06, + "loss": 0.5113, + "step": 5772 + }, + { + "epoch": 0.66, + "grad_norm": 1.5051745171781656, + "learning_rate": 2.6897668943641564e-06, + "loss": 0.4089, + "step": 5773 + }, + { + "epoch": 0.66, + "grad_norm": 1.898037815837164, + "learning_rate": 2.688116693579361e-06, + "loss": 0.5209, + "step": 5774 + }, + { + "epoch": 0.66, + "grad_norm": 2.092589102386741, + "learning_rate": 2.686466813036236e-06, + "loss": 0.4397, + "step": 5775 + }, + { + "epoch": 0.66, + "grad_norm": 1.889973876038591, + "learning_rate": 2.684817252963329e-06, + "loss": 0.4695, + "step": 5776 + }, + { + "epoch": 0.66, + "grad_norm": 1.6749614680492515, + "learning_rate": 2.6831680135891338e-06, + "loss": 0.4861, + "step": 5777 + }, + { + "epoch": 0.66, + "grad_norm": 1.7676565528575214, + "learning_rate": 2.6815190951421054e-06, + "loss": 0.5502, + "step": 5778 + }, + { + "epoch": 0.66, + "grad_norm": 2.7419340407442663, + "learning_rate": 2.6798704978506485e-06, + "loss": 0.472, + "step": 5779 + }, + { + "epoch": 0.66, + "grad_norm": 2.061560569618951, + "learning_rate": 2.678222221943131e-06, + "loss": 0.5639, + "step": 5780 + }, + { + "epoch": 0.66, + "grad_norm": 0.8470544800731576, + "learning_rate": 2.6765742676478666e-06, + "loss": 0.6728, + "step": 5781 + }, + { + "epoch": 0.66, + "grad_norm": 1.6921886413157088, + "learning_rate": 2.6749266351931343e-06, + "loss": 0.5165, + "step": 5782 + }, + { + "epoch": 0.66, + "grad_norm": 2.0420298872441762, + "learning_rate": 2.67327932480716e-06, + "loss": 0.4866, + "step": 5783 + }, + { + "epoch": 0.66, + "grad_norm": 5.232205138935863, + "learning_rate": 2.671632336718132e-06, + "loss": 0.4507, + "step": 5784 + }, + { + "epoch": 0.66, + "grad_norm": 2.525972939570713, + "learning_rate": 2.6699856711541884e-06, + "loss": 0.5002, + "step": 5785 + }, + { + "epoch": 0.66, + "grad_norm": 3.004982462687212, + "learning_rate": 2.6683393283434273e-06, + "loss": 0.469, + "step": 5786 + }, + { + "epoch": 0.66, + "grad_norm": 2.226837945624228, + "learning_rate": 2.6666933085138967e-06, + "loss": 0.4925, + "step": 5787 + }, + { + "epoch": 0.67, + "grad_norm": 2.09694828725871, + "learning_rate": 2.665047611893607e-06, + "loss": 0.5697, + "step": 5788 + }, + { + "epoch": 0.67, + "grad_norm": 1.9859202613885023, + "learning_rate": 2.6634022387105176e-06, + "loss": 0.4209, + "step": 5789 + }, + { + "epoch": 0.67, + "grad_norm": 2.373936364814647, + "learning_rate": 2.661757189192543e-06, + "loss": 0.5267, + "step": 5790 + }, + { + "epoch": 0.67, + "grad_norm": 0.8852222380156823, + "learning_rate": 2.66011246356756e-06, + "loss": 0.6659, + "step": 5791 + }, + { + "epoch": 0.67, + "grad_norm": 2.0026968299210233, + "learning_rate": 2.658468062063394e-06, + "loss": 0.6033, + "step": 5792 + }, + { + "epoch": 0.67, + "grad_norm": 2.018272361637807, + "learning_rate": 2.6568239849078235e-06, + "loss": 0.4932, + "step": 5793 + }, + { + "epoch": 0.67, + "grad_norm": 2.5919971444446572, + "learning_rate": 2.6551802323285915e-06, + "loss": 0.5486, + "step": 5794 + }, + { + "epoch": 0.67, + "grad_norm": 1.8108697082396865, + "learning_rate": 2.6535368045533867e-06, + "loss": 0.5117, + "step": 5795 + }, + { + "epoch": 0.67, + "grad_norm": 1.9686476832251607, + "learning_rate": 2.6518937018098557e-06, + "loss": 0.4797, + "step": 5796 + }, + { + "epoch": 0.67, + "grad_norm": 5.431416297345606, + "learning_rate": 2.6502509243256047e-06, + "loss": 0.4511, + "step": 5797 + }, + { + "epoch": 0.67, + "grad_norm": 7.469366440860353, + "learning_rate": 2.6486084723281856e-06, + "loss": 0.495, + "step": 5798 + }, + { + "epoch": 0.67, + "grad_norm": 1.7812298332147831, + "learning_rate": 2.6469663460451167e-06, + "loss": 0.5248, + "step": 5799 + }, + { + "epoch": 0.67, + "grad_norm": 2.3021248792317532, + "learning_rate": 2.64532454570386e-06, + "loss": 0.4883, + "step": 5800 + }, + { + "epoch": 0.67, + "grad_norm": 2.2069537398883745, + "learning_rate": 2.643683071531841e-06, + "loss": 0.554, + "step": 5801 + }, + { + "epoch": 0.67, + "grad_norm": 2.302472339346388, + "learning_rate": 2.642041923756433e-06, + "loss": 0.4753, + "step": 5802 + }, + { + "epoch": 0.67, + "grad_norm": 2.095700946502289, + "learning_rate": 2.6404011026049714e-06, + "loss": 0.4385, + "step": 5803 + }, + { + "epoch": 0.67, + "grad_norm": 1.6754531588505395, + "learning_rate": 2.6387606083047378e-06, + "loss": 0.4341, + "step": 5804 + }, + { + "epoch": 0.67, + "grad_norm": 2.3009526407201433, + "learning_rate": 2.637120441082979e-06, + "loss": 0.4312, + "step": 5805 + }, + { + "epoch": 0.67, + "grad_norm": 1.6545528519463906, + "learning_rate": 2.6354806011668844e-06, + "loss": 0.3979, + "step": 5806 + }, + { + "epoch": 0.67, + "grad_norm": 1.562064889601809, + "learning_rate": 2.6338410887836093e-06, + "loss": 0.4854, + "step": 5807 + }, + { + "epoch": 0.67, + "grad_norm": 2.8047182126155183, + "learning_rate": 2.632201904160257e-06, + "loss": 0.5709, + "step": 5808 + }, + { + "epoch": 0.67, + "grad_norm": 2.83132552107066, + "learning_rate": 2.630563047523887e-06, + "loss": 0.4505, + "step": 5809 + }, + { + "epoch": 0.67, + "grad_norm": 1.8236078185137223, + "learning_rate": 2.628924519101511e-06, + "loss": 0.5214, + "step": 5810 + }, + { + "epoch": 0.67, + "grad_norm": 1.8599243557362735, + "learning_rate": 2.627286319120102e-06, + "loss": 0.5781, + "step": 5811 + }, + { + "epoch": 0.67, + "grad_norm": 2.134736704623458, + "learning_rate": 2.625648447806579e-06, + "loss": 0.4123, + "step": 5812 + }, + { + "epoch": 0.67, + "grad_norm": 2.4878765522092148, + "learning_rate": 2.6240109053878236e-06, + "loss": 0.4699, + "step": 5813 + }, + { + "epoch": 0.67, + "grad_norm": 2.236279819987923, + "learning_rate": 2.6223736920906668e-06, + "loss": 0.5377, + "step": 5814 + }, + { + "epoch": 0.67, + "grad_norm": 1.7918725670018099, + "learning_rate": 2.6207368081418914e-06, + "loss": 0.468, + "step": 5815 + }, + { + "epoch": 0.67, + "grad_norm": 1.6946205351266566, + "learning_rate": 2.619100253768244e-06, + "loss": 0.4462, + "step": 5816 + }, + { + "epoch": 0.67, + "grad_norm": 2.1603469870523693, + "learning_rate": 2.617464029196415e-06, + "loss": 0.4745, + "step": 5817 + }, + { + "epoch": 0.67, + "grad_norm": 3.036179575006548, + "learning_rate": 2.615828134653058e-06, + "loss": 0.4957, + "step": 5818 + }, + { + "epoch": 0.67, + "grad_norm": 2.5471789419346162, + "learning_rate": 2.6141925703647725e-06, + "loss": 0.5005, + "step": 5819 + }, + { + "epoch": 0.67, + "grad_norm": 2.6142049129505565, + "learning_rate": 2.6125573365581215e-06, + "loss": 0.4582, + "step": 5820 + }, + { + "epoch": 0.67, + "grad_norm": 1.5907906281432374, + "learning_rate": 2.6109224334596117e-06, + "loss": 0.431, + "step": 5821 + }, + { + "epoch": 0.67, + "grad_norm": 1.902475491157895, + "learning_rate": 2.609287861295715e-06, + "loss": 0.5234, + "step": 5822 + }, + { + "epoch": 0.67, + "grad_norm": 1.8977290341967217, + "learning_rate": 2.607653620292848e-06, + "loss": 0.5185, + "step": 5823 + }, + { + "epoch": 0.67, + "grad_norm": 2.283802264046408, + "learning_rate": 2.6060197106773894e-06, + "loss": 0.4382, + "step": 5824 + }, + { + "epoch": 0.67, + "grad_norm": 1.9793385396153687, + "learning_rate": 2.604386132675666e-06, + "loss": 0.5457, + "step": 5825 + }, + { + "epoch": 0.67, + "grad_norm": 2.1103824934844484, + "learning_rate": 2.6027528865139596e-06, + "loss": 0.5486, + "step": 5826 + }, + { + "epoch": 0.67, + "grad_norm": 1.8337591122847308, + "learning_rate": 2.6011199724185067e-06, + "loss": 0.4055, + "step": 5827 + }, + { + "epoch": 0.67, + "grad_norm": 1.6484087504820424, + "learning_rate": 2.5994873906155016e-06, + "loss": 0.454, + "step": 5828 + }, + { + "epoch": 0.67, + "grad_norm": 2.411906422774842, + "learning_rate": 2.597855141331085e-06, + "loss": 0.4936, + "step": 5829 + }, + { + "epoch": 0.67, + "grad_norm": 3.028874857680586, + "learning_rate": 2.596223224791361e-06, + "loss": 0.5427, + "step": 5830 + }, + { + "epoch": 0.67, + "grad_norm": 1.8702025733168324, + "learning_rate": 2.5945916412223772e-06, + "loss": 0.4425, + "step": 5831 + }, + { + "epoch": 0.67, + "grad_norm": 2.054842161873053, + "learning_rate": 2.592960390850144e-06, + "loss": 0.4847, + "step": 5832 + }, + { + "epoch": 0.67, + "grad_norm": 1.9077131100751072, + "learning_rate": 2.591329473900621e-06, + "loss": 0.5182, + "step": 5833 + }, + { + "epoch": 0.67, + "grad_norm": 2.305056716342883, + "learning_rate": 2.58969889059972e-06, + "loss": 0.479, + "step": 5834 + }, + { + "epoch": 0.67, + "grad_norm": 1.7160853774053524, + "learning_rate": 2.5880686411733134e-06, + "loss": 0.5297, + "step": 5835 + }, + { + "epoch": 0.67, + "grad_norm": 2.4322941771733007, + "learning_rate": 2.586438725847219e-06, + "loss": 0.4996, + "step": 5836 + }, + { + "epoch": 0.67, + "grad_norm": 2.317377722869986, + "learning_rate": 2.584809144847216e-06, + "loss": 0.5384, + "step": 5837 + }, + { + "epoch": 0.67, + "grad_norm": 0.8337430252487481, + "learning_rate": 2.58317989839903e-06, + "loss": 0.6971, + "step": 5838 + }, + { + "epoch": 0.67, + "grad_norm": 3.5563754249765176, + "learning_rate": 2.581550986728348e-06, + "loss": 0.5495, + "step": 5839 + }, + { + "epoch": 0.67, + "grad_norm": 3.0247023880593114, + "learning_rate": 2.5799224100608024e-06, + "loss": 0.5733, + "step": 5840 + }, + { + "epoch": 0.67, + "grad_norm": 4.596171253109812, + "learning_rate": 2.578294168621987e-06, + "loss": 0.4468, + "step": 5841 + }, + { + "epoch": 0.67, + "grad_norm": 2.5054261032423866, + "learning_rate": 2.576666262637444e-06, + "loss": 0.5975, + "step": 5842 + }, + { + "epoch": 0.67, + "grad_norm": 2.4848492886658264, + "learning_rate": 2.575038692332671e-06, + "loss": 0.4821, + "step": 5843 + }, + { + "epoch": 0.67, + "grad_norm": 2.455585066117292, + "learning_rate": 2.573411457933116e-06, + "loss": 0.471, + "step": 5844 + }, + { + "epoch": 0.67, + "grad_norm": 3.1585447765360137, + "learning_rate": 2.571784559664188e-06, + "loss": 0.5836, + "step": 5845 + }, + { + "epoch": 0.67, + "grad_norm": 1.8686966842097241, + "learning_rate": 2.570157997751239e-06, + "loss": 0.462, + "step": 5846 + }, + { + "epoch": 0.67, + "grad_norm": 1.9284973952385376, + "learning_rate": 2.5685317724195868e-06, + "loss": 0.5161, + "step": 5847 + }, + { + "epoch": 0.67, + "grad_norm": 1.8089116556951226, + "learning_rate": 2.5669058838944894e-06, + "loss": 0.4898, + "step": 5848 + }, + { + "epoch": 0.67, + "grad_norm": 2.373931924280234, + "learning_rate": 2.56528033240117e-06, + "loss": 0.4507, + "step": 5849 + }, + { + "epoch": 0.67, + "grad_norm": 2.019465234607856, + "learning_rate": 2.563655118164795e-06, + "loss": 0.4243, + "step": 5850 + }, + { + "epoch": 0.67, + "grad_norm": 1.852767570279682, + "learning_rate": 2.562030241410493e-06, + "loss": 0.4221, + "step": 5851 + }, + { + "epoch": 0.67, + "grad_norm": 1.931578519999569, + "learning_rate": 2.5604057023633376e-06, + "loss": 0.5612, + "step": 5852 + }, + { + "epoch": 0.67, + "grad_norm": 1.695620491370757, + "learning_rate": 2.558781501248364e-06, + "loss": 0.4485, + "step": 5853 + }, + { + "epoch": 0.67, + "grad_norm": 2.6636104918798624, + "learning_rate": 2.557157638290554e-06, + "loss": 0.4845, + "step": 5854 + }, + { + "epoch": 0.67, + "grad_norm": 1.7239997659845414, + "learning_rate": 2.555534113714843e-06, + "loss": 0.5544, + "step": 5855 + }, + { + "epoch": 0.67, + "grad_norm": 1.9063928477068948, + "learning_rate": 2.553910927746125e-06, + "loss": 0.4993, + "step": 5856 + }, + { + "epoch": 0.67, + "grad_norm": 2.330906258320501, + "learning_rate": 2.55228808060924e-06, + "loss": 0.5308, + "step": 5857 + }, + { + "epoch": 0.67, + "grad_norm": 1.8683035618913877, + "learning_rate": 2.5506655725289874e-06, + "loss": 0.3372, + "step": 5858 + }, + { + "epoch": 0.67, + "grad_norm": 1.9377851895707632, + "learning_rate": 2.549043403730116e-06, + "loss": 0.4379, + "step": 5859 + }, + { + "epoch": 0.67, + "grad_norm": 1.8151606804033869, + "learning_rate": 2.547421574437327e-06, + "loss": 0.4818, + "step": 5860 + }, + { + "epoch": 0.67, + "grad_norm": 1.8300130293808814, + "learning_rate": 2.5458000848752753e-06, + "loss": 0.5119, + "step": 5861 + }, + { + "epoch": 0.67, + "grad_norm": 2.8444637999850237, + "learning_rate": 2.544178935268573e-06, + "loss": 0.5109, + "step": 5862 + }, + { + "epoch": 0.67, + "grad_norm": 1.8568923950329312, + "learning_rate": 2.542558125841777e-06, + "loss": 0.6191, + "step": 5863 + }, + { + "epoch": 0.67, + "grad_norm": 2.0394325888300284, + "learning_rate": 2.5409376568194056e-06, + "loss": 0.378, + "step": 5864 + }, + { + "epoch": 0.67, + "grad_norm": 1.8811242041766203, + "learning_rate": 2.539317528425922e-06, + "loss": 0.5135, + "step": 5865 + }, + { + "epoch": 0.67, + "grad_norm": 1.6879271225914863, + "learning_rate": 2.5376977408857505e-06, + "loss": 0.5758, + "step": 5866 + }, + { + "epoch": 0.67, + "grad_norm": 2.2391407212159407, + "learning_rate": 2.5360782944232594e-06, + "loss": 0.5673, + "step": 5867 + }, + { + "epoch": 0.67, + "grad_norm": 2.111829363391716, + "learning_rate": 2.5344591892627777e-06, + "loss": 0.4914, + "step": 5868 + }, + { + "epoch": 0.67, + "grad_norm": 2.7997134113210094, + "learning_rate": 2.532840425628581e-06, + "loss": 0.4689, + "step": 5869 + }, + { + "epoch": 0.67, + "grad_norm": 1.965281696844387, + "learning_rate": 2.531222003744902e-06, + "loss": 0.4955, + "step": 5870 + }, + { + "epoch": 0.67, + "grad_norm": 2.149144986415558, + "learning_rate": 2.5296039238359227e-06, + "loss": 0.5781, + "step": 5871 + }, + { + "epoch": 0.67, + "grad_norm": 2.3657996245802226, + "learning_rate": 2.527986186125781e-06, + "loss": 0.5862, + "step": 5872 + }, + { + "epoch": 0.67, + "grad_norm": 2.571738393780249, + "learning_rate": 2.526368790838565e-06, + "loss": 0.4651, + "step": 5873 + }, + { + "epoch": 0.67, + "grad_norm": 2.112110816037857, + "learning_rate": 2.5247517381983137e-06, + "loss": 0.505, + "step": 5874 + }, + { + "epoch": 0.68, + "grad_norm": 1.8384796208281766, + "learning_rate": 2.5231350284290248e-06, + "loss": 0.4904, + "step": 5875 + }, + { + "epoch": 0.68, + "grad_norm": 2.4710005304435163, + "learning_rate": 2.5215186617546426e-06, + "loss": 0.5747, + "step": 5876 + }, + { + "epoch": 0.68, + "grad_norm": 1.7930812546481028, + "learning_rate": 2.519902638399064e-06, + "loss": 0.577, + "step": 5877 + }, + { + "epoch": 0.68, + "grad_norm": 1.7924746273497554, + "learning_rate": 2.518286958586145e-06, + "loss": 0.4963, + "step": 5878 + }, + { + "epoch": 0.68, + "grad_norm": 1.8607580234887013, + "learning_rate": 2.5166716225396864e-06, + "loss": 0.4719, + "step": 5879 + }, + { + "epoch": 0.68, + "grad_norm": 1.758336164022484, + "learning_rate": 2.5150566304834422e-06, + "loss": 0.5133, + "step": 5880 + }, + { + "epoch": 0.68, + "grad_norm": 2.107707681859021, + "learning_rate": 2.513441982641126e-06, + "loss": 0.5154, + "step": 5881 + }, + { + "epoch": 0.68, + "grad_norm": 1.7445000378886628, + "learning_rate": 2.511827679236393e-06, + "loss": 0.5194, + "step": 5882 + }, + { + "epoch": 0.68, + "grad_norm": 2.374957739467125, + "learning_rate": 2.5102137204928604e-06, + "loss": 0.5384, + "step": 5883 + }, + { + "epoch": 0.68, + "grad_norm": 1.583577895043586, + "learning_rate": 2.5086001066340907e-06, + "loss": 0.4747, + "step": 5884 + }, + { + "epoch": 0.68, + "grad_norm": 2.0775569342002007, + "learning_rate": 2.5069868378836048e-06, + "loss": 0.5145, + "step": 5885 + }, + { + "epoch": 0.68, + "grad_norm": 2.924575383942415, + "learning_rate": 2.505373914464868e-06, + "loss": 0.4899, + "step": 5886 + }, + { + "epoch": 0.68, + "grad_norm": 2.318661465013642, + "learning_rate": 2.5037613366013066e-06, + "loss": 0.52, + "step": 5887 + }, + { + "epoch": 0.68, + "grad_norm": 2.1068496097615985, + "learning_rate": 2.5021491045162903e-06, + "loss": 0.491, + "step": 5888 + }, + { + "epoch": 0.68, + "grad_norm": 7.357556737274467, + "learning_rate": 2.50053721843315e-06, + "loss": 0.5215, + "step": 5889 + }, + { + "epoch": 0.68, + "grad_norm": 2.342593945072089, + "learning_rate": 2.4989256785751588e-06, + "loss": 0.3865, + "step": 5890 + }, + { + "epoch": 0.68, + "grad_norm": 2.660316939250707, + "learning_rate": 2.497314485165551e-06, + "loss": 0.5969, + "step": 5891 + }, + { + "epoch": 0.68, + "grad_norm": 3.01227860389893, + "learning_rate": 2.495703638427508e-06, + "loss": 0.4278, + "step": 5892 + }, + { + "epoch": 0.68, + "grad_norm": 2.0886376121392596, + "learning_rate": 2.4940931385841625e-06, + "loss": 0.4287, + "step": 5893 + }, + { + "epoch": 0.68, + "grad_norm": 2.1251409755619926, + "learning_rate": 2.4924829858586e-06, + "loss": 0.4535, + "step": 5894 + }, + { + "epoch": 0.68, + "grad_norm": 2.380658304002521, + "learning_rate": 2.490873180473862e-06, + "loss": 0.6059, + "step": 5895 + }, + { + "epoch": 0.68, + "grad_norm": 0.8390872948657713, + "learning_rate": 2.4892637226529344e-06, + "loss": 0.6989, + "step": 5896 + }, + { + "epoch": 0.68, + "grad_norm": 1.6574633646961692, + "learning_rate": 2.4876546126187622e-06, + "loss": 0.4482, + "step": 5897 + }, + { + "epoch": 0.68, + "grad_norm": 2.727237372904544, + "learning_rate": 2.486045850594239e-06, + "loss": 0.5493, + "step": 5898 + }, + { + "epoch": 0.68, + "grad_norm": 1.8959039023599484, + "learning_rate": 2.484437436802207e-06, + "loss": 0.5254, + "step": 5899 + }, + { + "epoch": 0.68, + "grad_norm": 1.699775917034189, + "learning_rate": 2.482829371465467e-06, + "loss": 0.3697, + "step": 5900 + }, + { + "epoch": 0.68, + "grad_norm": 2.976082662185539, + "learning_rate": 2.4812216548067646e-06, + "loss": 0.4379, + "step": 5901 + }, + { + "epoch": 0.68, + "grad_norm": 2.2099332356892103, + "learning_rate": 2.479614287048805e-06, + "loss": 0.5683, + "step": 5902 + }, + { + "epoch": 0.68, + "grad_norm": 1.8170557459671952, + "learning_rate": 2.4780072684142355e-06, + "loss": 0.507, + "step": 5903 + }, + { + "epoch": 0.68, + "grad_norm": 2.1623949665197424, + "learning_rate": 2.476400599125664e-06, + "loss": 0.4321, + "step": 5904 + }, + { + "epoch": 0.68, + "grad_norm": 1.9566598946561105, + "learning_rate": 2.4747942794056425e-06, + "loss": 0.5124, + "step": 5905 + }, + { + "epoch": 0.68, + "grad_norm": 2.9278617931931055, + "learning_rate": 2.473188309476682e-06, + "loss": 0.5001, + "step": 5906 + }, + { + "epoch": 0.68, + "grad_norm": 2.2009734404072088, + "learning_rate": 2.4715826895612372e-06, + "loss": 0.4426, + "step": 5907 + }, + { + "epoch": 0.68, + "grad_norm": 2.0569089499298148, + "learning_rate": 2.4699774198817228e-06, + "loss": 0.5334, + "step": 5908 + }, + { + "epoch": 0.68, + "grad_norm": 2.4085875062925863, + "learning_rate": 2.4683725006604953e-06, + "loss": 0.4501, + "step": 5909 + }, + { + "epoch": 0.68, + "grad_norm": 2.275463064603917, + "learning_rate": 2.466767932119875e-06, + "loss": 0.4302, + "step": 5910 + }, + { + "epoch": 0.68, + "grad_norm": 2.4495464204722053, + "learning_rate": 2.4651637144821176e-06, + "loss": 0.5357, + "step": 5911 + }, + { + "epoch": 0.68, + "grad_norm": 2.2745231301499715, + "learning_rate": 2.4635598479694455e-06, + "loss": 0.4778, + "step": 5912 + }, + { + "epoch": 0.68, + "grad_norm": 2.95387042349627, + "learning_rate": 2.4619563328040226e-06, + "loss": 0.4717, + "step": 5913 + }, + { + "epoch": 0.68, + "grad_norm": 2.338000430687956, + "learning_rate": 2.4603531692079703e-06, + "loss": 0.4823, + "step": 5914 + }, + { + "epoch": 0.68, + "grad_norm": 2.828970515055067, + "learning_rate": 2.458750357403355e-06, + "loss": 0.6028, + "step": 5915 + }, + { + "epoch": 0.68, + "grad_norm": 2.030293625647091, + "learning_rate": 2.457147897612202e-06, + "loss": 0.4697, + "step": 5916 + }, + { + "epoch": 0.68, + "grad_norm": 2.457324888279669, + "learning_rate": 2.45554579005648e-06, + "loss": 0.4897, + "step": 5917 + }, + { + "epoch": 0.68, + "grad_norm": 7.5149018114247985, + "learning_rate": 2.453944034958117e-06, + "loss": 0.5906, + "step": 5918 + }, + { + "epoch": 0.68, + "grad_norm": 2.111930671468288, + "learning_rate": 2.4523426325389843e-06, + "loss": 0.5763, + "step": 5919 + }, + { + "epoch": 0.68, + "grad_norm": 0.8272807194201066, + "learning_rate": 2.4507415830209076e-06, + "loss": 0.681, + "step": 5920 + }, + { + "epoch": 0.68, + "grad_norm": 2.19870236704104, + "learning_rate": 2.4491408866256667e-06, + "loss": 0.4728, + "step": 5921 + }, + { + "epoch": 0.68, + "grad_norm": 2.1619373629382297, + "learning_rate": 2.447540543574987e-06, + "loss": 0.5149, + "step": 5922 + }, + { + "epoch": 0.68, + "grad_norm": 0.8416824118970133, + "learning_rate": 2.4459405540905505e-06, + "loss": 0.7002, + "step": 5923 + }, + { + "epoch": 0.68, + "grad_norm": 1.787171133351888, + "learning_rate": 2.4443409183939843e-06, + "loss": 0.5861, + "step": 5924 + }, + { + "epoch": 0.68, + "grad_norm": 2.3168382277292356, + "learning_rate": 2.4427416367068733e-06, + "loss": 0.4131, + "step": 5925 + }, + { + "epoch": 0.68, + "grad_norm": 1.8244781585766352, + "learning_rate": 2.4411427092507485e-06, + "loss": 0.4414, + "step": 5926 + }, + { + "epoch": 0.68, + "grad_norm": 3.6917434416431836, + "learning_rate": 2.439544136247092e-06, + "loss": 0.479, + "step": 5927 + }, + { + "epoch": 0.68, + "grad_norm": 1.9822450490727068, + "learning_rate": 2.437945917917336e-06, + "loss": 0.5427, + "step": 5928 + }, + { + "epoch": 0.68, + "grad_norm": 2.3105509495920047, + "learning_rate": 2.4363480544828692e-06, + "loss": 0.3691, + "step": 5929 + }, + { + "epoch": 0.68, + "grad_norm": 1.7290824275493988, + "learning_rate": 2.4347505461650252e-06, + "loss": 0.4979, + "step": 5930 + }, + { + "epoch": 0.68, + "grad_norm": 3.058153629669663, + "learning_rate": 2.433153393185092e-06, + "loss": 0.4774, + "step": 5931 + }, + { + "epoch": 0.68, + "grad_norm": 2.8708556205538653, + "learning_rate": 2.431556595764305e-06, + "loss": 0.4148, + "step": 5932 + }, + { + "epoch": 0.68, + "grad_norm": 2.534797694711912, + "learning_rate": 2.429960154123855e-06, + "loss": 0.5013, + "step": 5933 + }, + { + "epoch": 0.68, + "grad_norm": 0.8311032803583079, + "learning_rate": 2.4283640684848777e-06, + "loss": 0.6469, + "step": 5934 + }, + { + "epoch": 0.68, + "grad_norm": 1.8227949287226102, + "learning_rate": 2.426768339068466e-06, + "loss": 0.4366, + "step": 5935 + }, + { + "epoch": 0.68, + "grad_norm": 2.0716741185958862, + "learning_rate": 2.4251729660956563e-06, + "loss": 0.5072, + "step": 5936 + }, + { + "epoch": 0.68, + "grad_norm": 2.0368618310824407, + "learning_rate": 2.4235779497874433e-06, + "loss": 0.4663, + "step": 5937 + }, + { + "epoch": 0.68, + "grad_norm": 1.9303653734625565, + "learning_rate": 2.4219832903647667e-06, + "loss": 0.4678, + "step": 5938 + }, + { + "epoch": 0.68, + "grad_norm": 1.8610390800924805, + "learning_rate": 2.420388988048516e-06, + "loss": 0.5365, + "step": 5939 + }, + { + "epoch": 0.68, + "grad_norm": 2.2736576976171077, + "learning_rate": 2.418795043059538e-06, + "loss": 0.5545, + "step": 5940 + }, + { + "epoch": 0.68, + "grad_norm": 2.2571788289142782, + "learning_rate": 2.4172014556186214e-06, + "loss": 0.4829, + "step": 5941 + }, + { + "epoch": 0.68, + "grad_norm": 3.4508535760060868, + "learning_rate": 2.4156082259465135e-06, + "loss": 0.5832, + "step": 5942 + }, + { + "epoch": 0.68, + "grad_norm": 1.9465605839816635, + "learning_rate": 2.414015354263907e-06, + "loss": 0.4729, + "step": 5943 + }, + { + "epoch": 0.68, + "grad_norm": 2.198921615696182, + "learning_rate": 2.412422840791446e-06, + "loss": 0.424, + "step": 5944 + }, + { + "epoch": 0.68, + "grad_norm": 2.5284007151253816, + "learning_rate": 2.410830685749722e-06, + "loss": 0.5808, + "step": 5945 + }, + { + "epoch": 0.68, + "grad_norm": 2.0027188132878093, + "learning_rate": 2.4092388893592856e-06, + "loss": 0.4738, + "step": 5946 + }, + { + "epoch": 0.68, + "grad_norm": 2.387721754980416, + "learning_rate": 2.407647451840627e-06, + "loss": 0.3963, + "step": 5947 + }, + { + "epoch": 0.68, + "grad_norm": 2.9071194702160827, + "learning_rate": 2.406056373414197e-06, + "loss": 0.4922, + "step": 5948 + }, + { + "epoch": 0.68, + "grad_norm": 1.9357560730619225, + "learning_rate": 2.404465654300387e-06, + "loss": 0.5345, + "step": 5949 + }, + { + "epoch": 0.68, + "grad_norm": 3.4331672644261437, + "learning_rate": 2.402875294719546e-06, + "loss": 0.424, + "step": 5950 + }, + { + "epoch": 0.68, + "grad_norm": 1.9960314425091281, + "learning_rate": 2.4012852948919685e-06, + "loss": 0.4036, + "step": 5951 + }, + { + "epoch": 0.68, + "grad_norm": 1.9478962258208887, + "learning_rate": 2.399695655037903e-06, + "loss": 0.5203, + "step": 5952 + }, + { + "epoch": 0.68, + "grad_norm": 2.0279117417337145, + "learning_rate": 2.3981063753775437e-06, + "loss": 0.5042, + "step": 5953 + }, + { + "epoch": 0.68, + "grad_norm": 1.5617023860747241, + "learning_rate": 2.39651745613104e-06, + "loss": 0.4255, + "step": 5954 + }, + { + "epoch": 0.68, + "grad_norm": 1.9410015725332481, + "learning_rate": 2.3949288975184852e-06, + "loss": 0.5141, + "step": 5955 + }, + { + "epoch": 0.68, + "grad_norm": 1.8888283057345092, + "learning_rate": 2.393340699759931e-06, + "loss": 0.4686, + "step": 5956 + }, + { + "epoch": 0.68, + "grad_norm": 1.9654893684530157, + "learning_rate": 2.391752863075369e-06, + "loss": 0.5255, + "step": 5957 + }, + { + "epoch": 0.68, + "grad_norm": 1.6293294524419162, + "learning_rate": 2.3901653876847507e-06, + "loss": 0.4412, + "step": 5958 + }, + { + "epoch": 0.68, + "grad_norm": 2.3156004153884533, + "learning_rate": 2.388578273807971e-06, + "loss": 0.5249, + "step": 5959 + }, + { + "epoch": 0.68, + "grad_norm": 1.7606753747130122, + "learning_rate": 2.3869915216648766e-06, + "loss": 0.501, + "step": 5960 + }, + { + "epoch": 0.68, + "grad_norm": 3.401625191080599, + "learning_rate": 2.385405131475262e-06, + "loss": 0.5061, + "step": 5961 + }, + { + "epoch": 0.69, + "grad_norm": 2.4407343178014203, + "learning_rate": 2.3838191034588774e-06, + "loss": 0.5863, + "step": 5962 + }, + { + "epoch": 0.69, + "grad_norm": 2.331471916347791, + "learning_rate": 2.382233437835418e-06, + "loss": 0.3616, + "step": 5963 + }, + { + "epoch": 0.69, + "grad_norm": 1.9093641900961436, + "learning_rate": 2.380648134824527e-06, + "loss": 0.5257, + "step": 5964 + }, + { + "epoch": 0.69, + "grad_norm": 2.1927703336562963, + "learning_rate": 2.379063194645805e-06, + "loss": 0.5113, + "step": 5965 + }, + { + "epoch": 0.69, + "grad_norm": 2.5516660636358113, + "learning_rate": 2.3774786175187932e-06, + "loss": 0.5541, + "step": 5966 + }, + { + "epoch": 0.69, + "grad_norm": 1.671485647554803, + "learning_rate": 2.3758944036629906e-06, + "loss": 0.4763, + "step": 5967 + }, + { + "epoch": 0.69, + "grad_norm": 2.2866112898900117, + "learning_rate": 2.3743105532978396e-06, + "loss": 0.5615, + "step": 5968 + }, + { + "epoch": 0.69, + "grad_norm": 2.8392508435778185, + "learning_rate": 2.372727066642737e-06, + "loss": 0.5083, + "step": 5969 + }, + { + "epoch": 0.69, + "grad_norm": 2.2012259284346043, + "learning_rate": 2.371143943917025e-06, + "loss": 0.5082, + "step": 5970 + }, + { + "epoch": 0.69, + "grad_norm": 3.122949056670715, + "learning_rate": 2.3695611853399997e-06, + "loss": 0.3565, + "step": 5971 + }, + { + "epoch": 0.69, + "grad_norm": 4.172474839746245, + "learning_rate": 2.3679787911309016e-06, + "loss": 0.4987, + "step": 5972 + }, + { + "epoch": 0.69, + "grad_norm": 7.180674593448937, + "learning_rate": 2.366396761508928e-06, + "loss": 0.5726, + "step": 5973 + }, + { + "epoch": 0.69, + "grad_norm": 1.9020073263583017, + "learning_rate": 2.3648150966932163e-06, + "loss": 0.5499, + "step": 5974 + }, + { + "epoch": 0.69, + "grad_norm": 1.9741929707998185, + "learning_rate": 2.363233796902863e-06, + "loss": 0.4759, + "step": 5975 + }, + { + "epoch": 0.69, + "grad_norm": 2.5876454403560585, + "learning_rate": 2.361652862356906e-06, + "loss": 0.4912, + "step": 5976 + }, + { + "epoch": 0.69, + "grad_norm": 2.071676875640921, + "learning_rate": 2.3600722932743407e-06, + "loss": 0.5367, + "step": 5977 + }, + { + "epoch": 0.69, + "grad_norm": 3.0376433727572096, + "learning_rate": 2.3584920898741003e-06, + "loss": 0.4038, + "step": 5978 + }, + { + "epoch": 0.69, + "grad_norm": 2.3827649805043736, + "learning_rate": 2.3569122523750804e-06, + "loss": 0.4669, + "step": 5979 + }, + { + "epoch": 0.69, + "grad_norm": 1.9950054070716192, + "learning_rate": 2.355332780996116e-06, + "loss": 0.5847, + "step": 5980 + }, + { + "epoch": 0.69, + "grad_norm": 1.9782160013929824, + "learning_rate": 2.3537536759559974e-06, + "loss": 0.499, + "step": 5981 + }, + { + "epoch": 0.69, + "grad_norm": 1.8530391521390959, + "learning_rate": 2.3521749374734602e-06, + "loss": 0.5523, + "step": 5982 + }, + { + "epoch": 0.69, + "grad_norm": 2.3967293067775794, + "learning_rate": 2.3505965657671943e-06, + "loss": 0.5352, + "step": 5983 + }, + { + "epoch": 0.69, + "grad_norm": 2.4430489469983714, + "learning_rate": 2.3490185610558324e-06, + "loss": 0.4885, + "step": 5984 + }, + { + "epoch": 0.69, + "grad_norm": 2.3367744425538715, + "learning_rate": 2.347440923557959e-06, + "loss": 0.3917, + "step": 5985 + }, + { + "epoch": 0.69, + "grad_norm": 2.3621241387968226, + "learning_rate": 2.345863653492111e-06, + "loss": 0.5136, + "step": 5986 + }, + { + "epoch": 0.69, + "grad_norm": 1.778825982792816, + "learning_rate": 2.344286751076768e-06, + "loss": 0.3885, + "step": 5987 + }, + { + "epoch": 0.69, + "grad_norm": 2.4161980008752937, + "learning_rate": 2.342710216530366e-06, + "loss": 0.5154, + "step": 5988 + }, + { + "epoch": 0.69, + "grad_norm": 3.397030237078764, + "learning_rate": 2.3411340500712833e-06, + "loss": 0.513, + "step": 5989 + }, + { + "epoch": 0.69, + "grad_norm": 2.1082244913607036, + "learning_rate": 2.339558251917853e-06, + "loss": 0.4425, + "step": 5990 + }, + { + "epoch": 0.69, + "grad_norm": 1.6383116268003521, + "learning_rate": 2.3379828222883504e-06, + "loss": 0.4463, + "step": 5991 + }, + { + "epoch": 0.69, + "grad_norm": 1.5284598906639957, + "learning_rate": 2.336407761401009e-06, + "loss": 0.5159, + "step": 5992 + }, + { + "epoch": 0.69, + "grad_norm": 2.601763888594823, + "learning_rate": 2.3348330694740006e-06, + "loss": 0.4759, + "step": 5993 + }, + { + "epoch": 0.69, + "grad_norm": 0.830866888235839, + "learning_rate": 2.333258746725458e-06, + "loss": 0.6617, + "step": 5994 + }, + { + "epoch": 0.69, + "grad_norm": 2.3371317127458706, + "learning_rate": 2.3316847933734478e-06, + "loss": 0.5051, + "step": 5995 + }, + { + "epoch": 0.69, + "grad_norm": 1.7351471305002308, + "learning_rate": 2.3301112096359996e-06, + "loss": 0.5742, + "step": 5996 + }, + { + "epoch": 0.69, + "grad_norm": 2.5085720921505628, + "learning_rate": 2.3285379957310827e-06, + "loss": 0.4256, + "step": 5997 + }, + { + "epoch": 0.69, + "grad_norm": 2.493315681001551, + "learning_rate": 2.3269651518766217e-06, + "loss": 0.5224, + "step": 5998 + }, + { + "epoch": 0.69, + "grad_norm": 2.3946817070817255, + "learning_rate": 2.3253926782904833e-06, + "loss": 0.4506, + "step": 5999 + }, + { + "epoch": 0.69, + "grad_norm": 2.0483383656849172, + "learning_rate": 2.323820575190489e-06, + "loss": 0.5543, + "step": 6000 + }, + { + "epoch": 0.69, + "grad_norm": 3.4177176319511386, + "learning_rate": 2.322248842794404e-06, + "loss": 0.5128, + "step": 6001 + }, + { + "epoch": 0.69, + "grad_norm": 2.767185215870721, + "learning_rate": 2.320677481319947e-06, + "loss": 0.4102, + "step": 6002 + }, + { + "epoch": 0.69, + "grad_norm": 1.8607950468720453, + "learning_rate": 2.319106490984781e-06, + "loss": 0.4601, + "step": 6003 + }, + { + "epoch": 0.69, + "grad_norm": 1.916011155192851, + "learning_rate": 2.3175358720065183e-06, + "loss": 0.4539, + "step": 6004 + }, + { + "epoch": 0.69, + "grad_norm": 1.9652054173281073, + "learning_rate": 2.3159656246027234e-06, + "loss": 0.4496, + "step": 6005 + }, + { + "epoch": 0.69, + "grad_norm": 2.7717552441787747, + "learning_rate": 2.3143957489909037e-06, + "loss": 0.4218, + "step": 6006 + }, + { + "epoch": 0.69, + "grad_norm": 2.1200393700223628, + "learning_rate": 2.312826245388521e-06, + "loss": 0.5103, + "step": 6007 + }, + { + "epoch": 0.69, + "grad_norm": 2.2541996338483004, + "learning_rate": 2.31125711401298e-06, + "loss": 0.5237, + "step": 6008 + }, + { + "epoch": 0.69, + "grad_norm": 0.7658195017258035, + "learning_rate": 2.3096883550816395e-06, + "loss": 0.649, + "step": 6009 + }, + { + "epoch": 0.69, + "grad_norm": 2.3785627947208035, + "learning_rate": 2.3081199688118e-06, + "loss": 0.4651, + "step": 6010 + }, + { + "epoch": 0.69, + "grad_norm": 2.7022018774681955, + "learning_rate": 2.3065519554207204e-06, + "loss": 0.4521, + "step": 6011 + }, + { + "epoch": 0.69, + "grad_norm": 2.2525920271046274, + "learning_rate": 2.3049843151255933e-06, + "loss": 0.5859, + "step": 6012 + }, + { + "epoch": 0.69, + "grad_norm": 0.8242512190761943, + "learning_rate": 2.303417048143574e-06, + "loss": 0.6502, + "step": 6013 + }, + { + "epoch": 0.69, + "grad_norm": 2.426969402258358, + "learning_rate": 2.3018501546917567e-06, + "loss": 0.4548, + "step": 6014 + }, + { + "epoch": 0.69, + "grad_norm": 2.280137146410003, + "learning_rate": 2.3002836349871897e-06, + "loss": 0.4785, + "step": 6015 + }, + { + "epoch": 0.69, + "grad_norm": 1.8456208434060648, + "learning_rate": 2.298717489246865e-06, + "loss": 0.523, + "step": 6016 + }, + { + "epoch": 0.69, + "grad_norm": 0.8362729219772587, + "learning_rate": 2.297151717687727e-06, + "loss": 0.6838, + "step": 6017 + }, + { + "epoch": 0.69, + "grad_norm": 1.9127212543216146, + "learning_rate": 2.295586320526663e-06, + "loss": 0.5422, + "step": 6018 + }, + { + "epoch": 0.69, + "grad_norm": 1.8800372750483636, + "learning_rate": 2.294021297980516e-06, + "loss": 0.4279, + "step": 6019 + }, + { + "epoch": 0.69, + "grad_norm": 1.9383113619130719, + "learning_rate": 2.2924566502660676e-06, + "loss": 0.4367, + "step": 6020 + }, + { + "epoch": 0.69, + "grad_norm": 2.1918159048233896, + "learning_rate": 2.2908923776000573e-06, + "loss": 0.5104, + "step": 6021 + }, + { + "epoch": 0.69, + "grad_norm": 3.84951228496165, + "learning_rate": 2.289328480199164e-06, + "loss": 0.5931, + "step": 6022 + }, + { + "epoch": 0.69, + "grad_norm": 2.3453592692779086, + "learning_rate": 2.2877649582800216e-06, + "loss": 0.5014, + "step": 6023 + }, + { + "epoch": 0.69, + "grad_norm": 3.2790843728307495, + "learning_rate": 2.2862018120592072e-06, + "loss": 0.4943, + "step": 6024 + }, + { + "epoch": 0.69, + "grad_norm": 2.684177182594862, + "learning_rate": 2.284639041753246e-06, + "loss": 0.6272, + "step": 6025 + }, + { + "epoch": 0.69, + "grad_norm": 2.3795482414524782, + "learning_rate": 2.2830766475786166e-06, + "loss": 0.4736, + "step": 6026 + }, + { + "epoch": 0.69, + "grad_norm": 2.7121226588457445, + "learning_rate": 2.281514629751737e-06, + "loss": 0.4134, + "step": 6027 + }, + { + "epoch": 0.69, + "grad_norm": 2.5526451557591985, + "learning_rate": 2.2799529884889827e-06, + "loss": 0.5957, + "step": 6028 + }, + { + "epoch": 0.69, + "grad_norm": 2.0480610650375564, + "learning_rate": 2.278391724006669e-06, + "loss": 0.4511, + "step": 6029 + }, + { + "epoch": 0.69, + "grad_norm": 2.297968189657318, + "learning_rate": 2.2768308365210616e-06, + "loss": 0.4794, + "step": 6030 + }, + { + "epoch": 0.69, + "grad_norm": 2.2946117563911375, + "learning_rate": 2.275270326248374e-06, + "loss": 0.5546, + "step": 6031 + }, + { + "epoch": 0.69, + "grad_norm": 2.9467766965516904, + "learning_rate": 2.2737101934047707e-06, + "loss": 0.5186, + "step": 6032 + }, + { + "epoch": 0.69, + "grad_norm": 2.7311912367817985, + "learning_rate": 2.2721504382063567e-06, + "loss": 0.515, + "step": 6033 + }, + { + "epoch": 0.69, + "grad_norm": 2.915806268101038, + "learning_rate": 2.270591060869194e-06, + "loss": 0.3993, + "step": 6034 + }, + { + "epoch": 0.69, + "grad_norm": 1.6898429659122811, + "learning_rate": 2.2690320616092826e-06, + "loss": 0.5236, + "step": 6035 + }, + { + "epoch": 0.69, + "grad_norm": 3.0261210795281093, + "learning_rate": 2.267473440642579e-06, + "loss": 0.6032, + "step": 6036 + }, + { + "epoch": 0.69, + "grad_norm": 1.8855461015696047, + "learning_rate": 2.2659151981849793e-06, + "loss": 0.4586, + "step": 6037 + }, + { + "epoch": 0.69, + "grad_norm": 1.7773264561500792, + "learning_rate": 2.2643573344523345e-06, + "loss": 0.4123, + "step": 6038 + }, + { + "epoch": 0.69, + "grad_norm": 1.7245042829614439, + "learning_rate": 2.2627998496604366e-06, + "loss": 0.4524, + "step": 6039 + }, + { + "epoch": 0.69, + "grad_norm": 2.025832286791638, + "learning_rate": 2.2612427440250308e-06, + "loss": 0.5347, + "step": 6040 + }, + { + "epoch": 0.69, + "grad_norm": 4.343866946337479, + "learning_rate": 2.2596860177618034e-06, + "loss": 0.4964, + "step": 6041 + }, + { + "epoch": 0.69, + "grad_norm": 2.6663491910356623, + "learning_rate": 2.2581296710863963e-06, + "loss": 0.4953, + "step": 6042 + }, + { + "epoch": 0.69, + "grad_norm": 1.9073048903471526, + "learning_rate": 2.256573704214393e-06, + "loss": 0.474, + "step": 6043 + }, + { + "epoch": 0.69, + "grad_norm": 1.8172922955327782, + "learning_rate": 2.2550181173613226e-06, + "loss": 0.4244, + "step": 6044 + }, + { + "epoch": 0.69, + "grad_norm": 2.39065109584252, + "learning_rate": 2.253462910742669e-06, + "loss": 0.5052, + "step": 6045 + }, + { + "epoch": 0.69, + "grad_norm": 2.210920749014501, + "learning_rate": 2.2519080845738573e-06, + "loss": 0.4114, + "step": 6046 + }, + { + "epoch": 0.69, + "grad_norm": 1.951174338119239, + "learning_rate": 2.2503536390702603e-06, + "loss": 0.5665, + "step": 6047 + }, + { + "epoch": 0.69, + "grad_norm": 1.8670668256813343, + "learning_rate": 2.248799574447202e-06, + "loss": 0.525, + "step": 6048 + }, + { + "epoch": 0.7, + "grad_norm": 2.6621342417821054, + "learning_rate": 2.2472458909199507e-06, + "loss": 0.5673, + "step": 6049 + }, + { + "epoch": 0.7, + "grad_norm": 5.554834483288534, + "learning_rate": 2.2456925887037194e-06, + "loss": 0.4641, + "step": 6050 + }, + { + "epoch": 0.7, + "grad_norm": 2.2584445192667904, + "learning_rate": 2.2441396680136763e-06, + "loss": 0.5412, + "step": 6051 + }, + { + "epoch": 0.7, + "grad_norm": 2.3865652259470047, + "learning_rate": 2.242587129064927e-06, + "loss": 0.5597, + "step": 6052 + }, + { + "epoch": 0.7, + "grad_norm": 0.8739800679639163, + "learning_rate": 2.2410349720725327e-06, + "loss": 0.6751, + "step": 6053 + }, + { + "epoch": 0.7, + "grad_norm": 2.412909229055911, + "learning_rate": 2.239483197251494e-06, + "loss": 0.5377, + "step": 6054 + }, + { + "epoch": 0.7, + "grad_norm": 2.09882819213224, + "learning_rate": 2.237931804816767e-06, + "loss": 0.5065, + "step": 6055 + }, + { + "epoch": 0.7, + "grad_norm": 2.267491590171007, + "learning_rate": 2.2363807949832463e-06, + "loss": 0.4412, + "step": 6056 + }, + { + "epoch": 0.7, + "grad_norm": 2.120182952040126, + "learning_rate": 2.2348301679657802e-06, + "loss": 0.4912, + "step": 6057 + }, + { + "epoch": 0.7, + "grad_norm": 1.938282558794569, + "learning_rate": 2.233279923979159e-06, + "loss": 0.4771, + "step": 6058 + }, + { + "epoch": 0.7, + "grad_norm": 1.7519178395675385, + "learning_rate": 2.2317300632381256e-06, + "loss": 0.4349, + "step": 6059 + }, + { + "epoch": 0.7, + "grad_norm": 1.9733519960816648, + "learning_rate": 2.230180585957362e-06, + "loss": 0.523, + "step": 6060 + }, + { + "epoch": 0.7, + "grad_norm": 2.1321645463668544, + "learning_rate": 2.2286314923515077e-06, + "loss": 0.5003, + "step": 6061 + }, + { + "epoch": 0.7, + "grad_norm": 1.64313885793828, + "learning_rate": 2.227082782635136e-06, + "loss": 0.3769, + "step": 6062 + }, + { + "epoch": 0.7, + "grad_norm": 3.967640964137952, + "learning_rate": 2.225534457022778e-06, + "loss": 0.558, + "step": 6063 + }, + { + "epoch": 0.7, + "grad_norm": 2.3766480462337616, + "learning_rate": 2.2239865157289046e-06, + "loss": 0.5446, + "step": 6064 + }, + { + "epoch": 0.7, + "grad_norm": 2.050222391588646, + "learning_rate": 2.2224389589679407e-06, + "loss": 0.4218, + "step": 6065 + }, + { + "epoch": 0.7, + "grad_norm": 2.055357287815045, + "learning_rate": 2.220891786954249e-06, + "loss": 0.577, + "step": 6066 + }, + { + "epoch": 0.7, + "grad_norm": 1.8057753645341201, + "learning_rate": 2.2193449999021476e-06, + "loss": 0.4971, + "step": 6067 + }, + { + "epoch": 0.7, + "grad_norm": 2.304950218425839, + "learning_rate": 2.2177985980258946e-06, + "loss": 0.5512, + "step": 6068 + }, + { + "epoch": 0.7, + "grad_norm": 2.0380778208482693, + "learning_rate": 2.216252581539697e-06, + "loss": 0.5207, + "step": 6069 + }, + { + "epoch": 0.7, + "grad_norm": 1.7706933171083448, + "learning_rate": 2.2147069506577107e-06, + "loss": 0.4179, + "step": 6070 + }, + { + "epoch": 0.7, + "grad_norm": 2.4421290502312956, + "learning_rate": 2.2131617055940337e-06, + "loss": 0.5206, + "step": 6071 + }, + { + "epoch": 0.7, + "grad_norm": 2.537186205080983, + "learning_rate": 2.2116168465627162e-06, + "loss": 0.4471, + "step": 6072 + }, + { + "epoch": 0.7, + "grad_norm": 2.0813377832526645, + "learning_rate": 2.2100723737777485e-06, + "loss": 0.4374, + "step": 6073 + }, + { + "epoch": 0.7, + "grad_norm": 2.2356801087220806, + "learning_rate": 2.208528287453074e-06, + "loss": 0.4669, + "step": 6074 + }, + { + "epoch": 0.7, + "grad_norm": 2.2001266916823154, + "learning_rate": 2.206984587802576e-06, + "loss": 0.5854, + "step": 6075 + }, + { + "epoch": 0.7, + "grad_norm": 2.3760486093599322, + "learning_rate": 2.205441275040091e-06, + "loss": 0.5073, + "step": 6076 + }, + { + "epoch": 0.7, + "grad_norm": 2.174556509685616, + "learning_rate": 2.203898349379394e-06, + "loss": 0.4316, + "step": 6077 + }, + { + "epoch": 0.7, + "grad_norm": 2.0495525453349313, + "learning_rate": 2.202355811034218e-06, + "loss": 0.4381, + "step": 6078 + }, + { + "epoch": 0.7, + "grad_norm": 2.127073549575823, + "learning_rate": 2.2008136602182264e-06, + "loss": 0.4969, + "step": 6079 + }, + { + "epoch": 0.7, + "grad_norm": 1.8252337778207794, + "learning_rate": 2.1992718971450432e-06, + "loss": 0.5506, + "step": 6080 + }, + { + "epoch": 0.7, + "grad_norm": 2.1426027074195924, + "learning_rate": 2.197730522028231e-06, + "loss": 0.4884, + "step": 6081 + }, + { + "epoch": 0.7, + "grad_norm": 1.9614088403641654, + "learning_rate": 2.196189535081302e-06, + "loss": 0.4805, + "step": 6082 + }, + { + "epoch": 0.7, + "grad_norm": 2.0584289502178366, + "learning_rate": 2.1946489365177122e-06, + "loss": 0.5545, + "step": 6083 + }, + { + "epoch": 0.7, + "grad_norm": 2.5501003820303723, + "learning_rate": 2.1931087265508674e-06, + "loss": 0.4662, + "step": 6084 + }, + { + "epoch": 0.7, + "grad_norm": 1.8905175041864024, + "learning_rate": 2.191568905394113e-06, + "loss": 0.5319, + "step": 6085 + }, + { + "epoch": 0.7, + "grad_norm": 2.4364584831944076, + "learning_rate": 2.19002947326075e-06, + "loss": 0.4951, + "step": 6086 + }, + { + "epoch": 0.7, + "grad_norm": 2.558004712466623, + "learning_rate": 2.1884904303640155e-06, + "loss": 0.5673, + "step": 6087 + }, + { + "epoch": 0.7, + "grad_norm": 2.4434517789479515, + "learning_rate": 2.1869517769171016e-06, + "loss": 0.5213, + "step": 6088 + }, + { + "epoch": 0.7, + "grad_norm": 1.8107592346694084, + "learning_rate": 2.1854135131331405e-06, + "loss": 0.4808, + "step": 6089 + }, + { + "epoch": 0.7, + "grad_norm": 2.3228733520741947, + "learning_rate": 2.1838756392252098e-06, + "loss": 0.4606, + "step": 6090 + }, + { + "epoch": 0.7, + "grad_norm": 1.8932009453838858, + "learning_rate": 2.1823381554063398e-06, + "loss": 0.4332, + "step": 6091 + }, + { + "epoch": 0.7, + "grad_norm": 2.8639192485031266, + "learning_rate": 2.180801061889499e-06, + "loss": 0.5313, + "step": 6092 + }, + { + "epoch": 0.7, + "grad_norm": 2.3291430238693276, + "learning_rate": 2.1792643588876085e-06, + "loss": 0.5016, + "step": 6093 + }, + { + "epoch": 0.7, + "grad_norm": 2.010188865865139, + "learning_rate": 2.177728046613528e-06, + "loss": 0.5202, + "step": 6094 + }, + { + "epoch": 0.7, + "grad_norm": 5.530577327571859, + "learning_rate": 2.1761921252800737e-06, + "loss": 0.535, + "step": 6095 + }, + { + "epoch": 0.7, + "grad_norm": 1.8917624567800624, + "learning_rate": 2.174656595099994e-06, + "loss": 0.4239, + "step": 6096 + }, + { + "epoch": 0.7, + "grad_norm": 2.3008669801466186, + "learning_rate": 2.1731214562859942e-06, + "loss": 0.4451, + "step": 6097 + }, + { + "epoch": 0.7, + "grad_norm": 1.8878740422483733, + "learning_rate": 2.17158670905072e-06, + "loss": 0.4722, + "step": 6098 + }, + { + "epoch": 0.7, + "grad_norm": 1.883008924914656, + "learning_rate": 2.1700523536067657e-06, + "loss": 0.4749, + "step": 6099 + }, + { + "epoch": 0.7, + "grad_norm": 2.0653937254471395, + "learning_rate": 2.168518390166668e-06, + "loss": 0.5465, + "step": 6100 + }, + { + "epoch": 0.7, + "grad_norm": 1.792381014415643, + "learning_rate": 2.1669848189429136e-06, + "loss": 0.4474, + "step": 6101 + }, + { + "epoch": 0.7, + "grad_norm": 1.9723262229697558, + "learning_rate": 2.1654516401479303e-06, + "loss": 0.5592, + "step": 6102 + }, + { + "epoch": 0.7, + "grad_norm": 2.5850032701320735, + "learning_rate": 2.1639188539940968e-06, + "loss": 0.5041, + "step": 6103 + }, + { + "epoch": 0.7, + "grad_norm": 2.6468278591580296, + "learning_rate": 2.16238646069373e-06, + "loss": 0.5583, + "step": 6104 + }, + { + "epoch": 0.7, + "grad_norm": 2.2249743222432357, + "learning_rate": 2.1608544604591015e-06, + "loss": 0.3791, + "step": 6105 + }, + { + "epoch": 0.7, + "grad_norm": 1.8510480382730115, + "learning_rate": 2.1593228535024193e-06, + "loss": 0.4955, + "step": 6106 + }, + { + "epoch": 0.7, + "grad_norm": 2.390802424236188, + "learning_rate": 2.1577916400358452e-06, + "loss": 0.4436, + "step": 6107 + }, + { + "epoch": 0.7, + "grad_norm": 3.038227746960908, + "learning_rate": 2.1562608202714806e-06, + "loss": 0.5755, + "step": 6108 + }, + { + "epoch": 0.7, + "grad_norm": 1.877853801657469, + "learning_rate": 2.1547303944213733e-06, + "loss": 0.549, + "step": 6109 + }, + { + "epoch": 0.7, + "grad_norm": 2.0373242384895027, + "learning_rate": 2.1532003626975204e-06, + "loss": 0.5007, + "step": 6110 + }, + { + "epoch": 0.7, + "grad_norm": 1.946137803721126, + "learning_rate": 2.1516707253118586e-06, + "loss": 0.5437, + "step": 6111 + }, + { + "epoch": 0.7, + "grad_norm": 1.9595302840442244, + "learning_rate": 2.1501414824762763e-06, + "loss": 0.4117, + "step": 6112 + }, + { + "epoch": 0.7, + "grad_norm": 2.0512688963064463, + "learning_rate": 2.1486126344026027e-06, + "loss": 0.5599, + "step": 6113 + }, + { + "epoch": 0.7, + "grad_norm": 2.162042152817863, + "learning_rate": 2.147084181302612e-06, + "loss": 0.4656, + "step": 6114 + }, + { + "epoch": 0.7, + "grad_norm": 2.0010939703518167, + "learning_rate": 2.145556123388026e-06, + "loss": 0.5207, + "step": 6115 + }, + { + "epoch": 0.7, + "grad_norm": 1.8299497088832897, + "learning_rate": 2.144028460870512e-06, + "loss": 0.5228, + "step": 6116 + }, + { + "epoch": 0.7, + "grad_norm": 0.93564423568438, + "learning_rate": 2.1425011939616795e-06, + "loss": 0.7195, + "step": 6117 + }, + { + "epoch": 0.7, + "grad_norm": 2.1355027857381965, + "learning_rate": 2.1409743228730883e-06, + "loss": 0.5015, + "step": 6118 + }, + { + "epoch": 0.7, + "grad_norm": 2.4675885401992184, + "learning_rate": 2.139447847816237e-06, + "loss": 0.552, + "step": 6119 + }, + { + "epoch": 0.7, + "grad_norm": 1.9722422866494538, + "learning_rate": 2.1379217690025767e-06, + "loss": 0.5809, + "step": 6120 + }, + { + "epoch": 0.7, + "grad_norm": 0.7901429663762329, + "learning_rate": 2.1363960866434947e-06, + "loss": 0.6383, + "step": 6121 + }, + { + "epoch": 0.7, + "grad_norm": 2.058591554757942, + "learning_rate": 2.1348708009503333e-06, + "loss": 0.5086, + "step": 6122 + }, + { + "epoch": 0.7, + "grad_norm": 2.415292567200039, + "learning_rate": 2.1333459121343696e-06, + "loss": 0.5396, + "step": 6123 + }, + { + "epoch": 0.7, + "grad_norm": 2.1241612785901927, + "learning_rate": 2.131821420406836e-06, + "loss": 0.5066, + "step": 6124 + }, + { + "epoch": 0.7, + "grad_norm": 1.9311360003306395, + "learning_rate": 2.1302973259789004e-06, + "loss": 0.4979, + "step": 6125 + }, + { + "epoch": 0.7, + "grad_norm": 2.552352369215579, + "learning_rate": 2.1287736290616845e-06, + "loss": 0.4483, + "step": 6126 + }, + { + "epoch": 0.7, + "grad_norm": 2.340873814636964, + "learning_rate": 2.127250329866248e-06, + "loss": 0.4862, + "step": 6127 + }, + { + "epoch": 0.7, + "grad_norm": 2.498350236841117, + "learning_rate": 2.1257274286035963e-06, + "loss": 0.4683, + "step": 6128 + }, + { + "epoch": 0.7, + "grad_norm": 2.3032079196746844, + "learning_rate": 2.124204925484685e-06, + "loss": 0.5819, + "step": 6129 + }, + { + "epoch": 0.7, + "grad_norm": 2.317560894290641, + "learning_rate": 2.122682820720409e-06, + "loss": 0.5879, + "step": 6130 + }, + { + "epoch": 0.7, + "grad_norm": 2.0970471619186832, + "learning_rate": 2.121161114521609e-06, + "loss": 0.46, + "step": 6131 + }, + { + "epoch": 0.7, + "grad_norm": 3.667528489541149, + "learning_rate": 2.119639807099075e-06, + "loss": 0.4275, + "step": 6132 + }, + { + "epoch": 0.7, + "grad_norm": 2.2976533226879177, + "learning_rate": 2.1181188986635354e-06, + "loss": 0.5435, + "step": 6133 + }, + { + "epoch": 0.7, + "grad_norm": 2.1458132577211506, + "learning_rate": 2.1165983894256647e-06, + "loss": 0.5011, + "step": 6134 + }, + { + "epoch": 0.7, + "grad_norm": 2.0309115832312896, + "learning_rate": 2.1150782795960884e-06, + "loss": 0.5232, + "step": 6135 + }, + { + "epoch": 0.71, + "grad_norm": 1.9981742421359077, + "learning_rate": 2.1135585693853665e-06, + "loss": 0.5973, + "step": 6136 + }, + { + "epoch": 0.71, + "grad_norm": 1.7767445531609898, + "learning_rate": 2.112039259004014e-06, + "loss": 0.4985, + "step": 6137 + }, + { + "epoch": 0.71, + "grad_norm": 2.6687932679965702, + "learning_rate": 2.110520348662481e-06, + "loss": 0.4441, + "step": 6138 + }, + { + "epoch": 0.71, + "grad_norm": 1.7915572571460514, + "learning_rate": 2.109001838571171e-06, + "loss": 0.4352, + "step": 6139 + }, + { + "epoch": 0.71, + "grad_norm": 1.8190769478857969, + "learning_rate": 2.107483728940423e-06, + "loss": 0.4775, + "step": 6140 + }, + { + "epoch": 0.71, + "grad_norm": 1.7629953896006052, + "learning_rate": 2.1059660199805303e-06, + "loss": 0.4981, + "step": 6141 + }, + { + "epoch": 0.71, + "grad_norm": 2.1676216541481166, + "learning_rate": 2.1044487119017215e-06, + "loss": 0.4625, + "step": 6142 + }, + { + "epoch": 0.71, + "grad_norm": 1.6228635225367642, + "learning_rate": 2.1029318049141772e-06, + "loss": 0.4988, + "step": 6143 + }, + { + "epoch": 0.71, + "grad_norm": 2.280088133970521, + "learning_rate": 2.101415299228016e-06, + "loss": 0.4868, + "step": 6144 + }, + { + "epoch": 0.71, + "grad_norm": 1.8920051943358118, + "learning_rate": 2.0998991950533065e-06, + "loss": 0.5052, + "step": 6145 + }, + { + "epoch": 0.71, + "grad_norm": 1.6644437251131567, + "learning_rate": 2.098383492600059e-06, + "loss": 0.4929, + "step": 6146 + }, + { + "epoch": 0.71, + "grad_norm": 3.6837673804815196, + "learning_rate": 2.0968681920782273e-06, + "loss": 0.3802, + "step": 6147 + }, + { + "epoch": 0.71, + "grad_norm": 1.61637877115338, + "learning_rate": 2.095353293697709e-06, + "loss": 0.3433, + "step": 6148 + }, + { + "epoch": 0.71, + "grad_norm": 2.6086714137986147, + "learning_rate": 2.093838797668351e-06, + "loss": 0.5591, + "step": 6149 + }, + { + "epoch": 0.71, + "grad_norm": 1.7556559996098247, + "learning_rate": 2.092324704199938e-06, + "loss": 0.3852, + "step": 6150 + }, + { + "epoch": 0.71, + "grad_norm": 1.8223384893108303, + "learning_rate": 2.0908110135022046e-06, + "loss": 0.4682, + "step": 6151 + }, + { + "epoch": 0.71, + "grad_norm": 2.0452727998168636, + "learning_rate": 2.089297725784824e-06, + "loss": 0.4124, + "step": 6152 + }, + { + "epoch": 0.71, + "grad_norm": 0.8032088942780174, + "learning_rate": 2.08778484125742e-06, + "loss": 0.6352, + "step": 6153 + }, + { + "epoch": 0.71, + "grad_norm": 1.7910021896167811, + "learning_rate": 2.0862723601295557e-06, + "loss": 0.494, + "step": 6154 + }, + { + "epoch": 0.71, + "grad_norm": 1.9810359064632312, + "learning_rate": 2.084760282610738e-06, + "loss": 0.4643, + "step": 6155 + }, + { + "epoch": 0.71, + "grad_norm": 4.0203997226600245, + "learning_rate": 2.083248608910422e-06, + "loss": 0.5457, + "step": 6156 + }, + { + "epoch": 0.71, + "grad_norm": 1.6991263500533322, + "learning_rate": 2.081737339238002e-06, + "loss": 0.4716, + "step": 6157 + }, + { + "epoch": 0.71, + "grad_norm": 2.138702172592572, + "learning_rate": 2.0802264738028223e-06, + "loss": 0.4939, + "step": 6158 + }, + { + "epoch": 0.71, + "grad_norm": 2.2168756925803073, + "learning_rate": 2.0787160128141636e-06, + "loss": 0.5763, + "step": 6159 + }, + { + "epoch": 0.71, + "grad_norm": 2.4799559328896272, + "learning_rate": 2.077205956481259e-06, + "loss": 0.4167, + "step": 6160 + }, + { + "epoch": 0.71, + "grad_norm": 2.4093469195180695, + "learning_rate": 2.075696305013277e-06, + "loss": 0.4457, + "step": 6161 + }, + { + "epoch": 0.71, + "grad_norm": 1.7399593041718693, + "learning_rate": 2.074187058619338e-06, + "loss": 0.4775, + "step": 6162 + }, + { + "epoch": 0.71, + "grad_norm": 2.2001979250913606, + "learning_rate": 2.0726782175085016e-06, + "loss": 0.4942, + "step": 6163 + }, + { + "epoch": 0.71, + "grad_norm": 3.61454728741997, + "learning_rate": 2.071169781889771e-06, + "loss": 0.5114, + "step": 6164 + }, + { + "epoch": 0.71, + "grad_norm": 1.919899000271575, + "learning_rate": 2.069661751972093e-06, + "loss": 0.4536, + "step": 6165 + }, + { + "epoch": 0.71, + "grad_norm": 2.741206933866748, + "learning_rate": 2.068154127964363e-06, + "loss": 0.5249, + "step": 6166 + }, + { + "epoch": 0.71, + "grad_norm": 2.3320957648737783, + "learning_rate": 2.0666469100754143e-06, + "loss": 0.4258, + "step": 6167 + }, + { + "epoch": 0.71, + "grad_norm": 2.077486490704608, + "learning_rate": 2.065140098514029e-06, + "loss": 0.4958, + "step": 6168 + }, + { + "epoch": 0.71, + "grad_norm": 3.32855634515467, + "learning_rate": 2.063633693488927e-06, + "loss": 0.429, + "step": 6169 + }, + { + "epoch": 0.71, + "grad_norm": 1.743553259848041, + "learning_rate": 2.062127695208779e-06, + "loss": 0.5308, + "step": 6170 + }, + { + "epoch": 0.71, + "grad_norm": 2.476298738517536, + "learning_rate": 2.060622103882192e-06, + "loss": 0.5192, + "step": 6171 + }, + { + "epoch": 0.71, + "grad_norm": 1.889008293451934, + "learning_rate": 2.0591169197177244e-06, + "loss": 0.4916, + "step": 6172 + }, + { + "epoch": 0.71, + "grad_norm": 3.4069098636298296, + "learning_rate": 2.0576121429238718e-06, + "loss": 0.4604, + "step": 6173 + }, + { + "epoch": 0.71, + "grad_norm": 1.8822815695214004, + "learning_rate": 2.0561077737090727e-06, + "loss": 0.4753, + "step": 6174 + }, + { + "epoch": 0.71, + "grad_norm": 2.368589933780417, + "learning_rate": 2.0546038122817173e-06, + "loss": 0.5475, + "step": 6175 + }, + { + "epoch": 0.71, + "grad_norm": 2.9100669407726696, + "learning_rate": 2.05310025885013e-06, + "loss": 0.4698, + "step": 6176 + }, + { + "epoch": 0.71, + "grad_norm": 1.997181869328449, + "learning_rate": 2.051597113622586e-06, + "loss": 0.3963, + "step": 6177 + }, + { + "epoch": 0.71, + "grad_norm": 1.7204518723701758, + "learning_rate": 2.0500943768072974e-06, + "loss": 0.5208, + "step": 6178 + }, + { + "epoch": 0.71, + "grad_norm": 1.6637508339897304, + "learning_rate": 2.0485920486124265e-06, + "loss": 0.4078, + "step": 6179 + }, + { + "epoch": 0.71, + "grad_norm": 2.007682168111539, + "learning_rate": 2.0470901292460736e-06, + "loss": 0.4717, + "step": 6180 + }, + { + "epoch": 0.71, + "grad_norm": 2.1329703553079677, + "learning_rate": 2.045588618916285e-06, + "loss": 0.407, + "step": 6181 + }, + { + "epoch": 0.71, + "grad_norm": 2.1834981646381526, + "learning_rate": 2.0440875178310473e-06, + "loss": 0.5337, + "step": 6182 + }, + { + "epoch": 0.71, + "grad_norm": 1.648299323472207, + "learning_rate": 2.0425868261982963e-06, + "loss": 0.4546, + "step": 6183 + }, + { + "epoch": 0.71, + "grad_norm": 1.7842332931902416, + "learning_rate": 2.0410865442259042e-06, + "loss": 0.5344, + "step": 6184 + }, + { + "epoch": 0.71, + "grad_norm": 2.7609632770206933, + "learning_rate": 2.0395866721216935e-06, + "loss": 0.5313, + "step": 6185 + }, + { + "epoch": 0.71, + "grad_norm": 2.4779027234914652, + "learning_rate": 2.038087210093422e-06, + "loss": 0.4022, + "step": 6186 + }, + { + "epoch": 0.71, + "grad_norm": 7.038887924103739, + "learning_rate": 2.036588158348799e-06, + "loss": 0.4693, + "step": 6187 + }, + { + "epoch": 0.71, + "grad_norm": 1.7939348539677558, + "learning_rate": 2.0350895170954693e-06, + "loss": 0.442, + "step": 6188 + }, + { + "epoch": 0.71, + "grad_norm": 2.2302041573320897, + "learning_rate": 2.0335912865410277e-06, + "loss": 0.5002, + "step": 6189 + }, + { + "epoch": 0.71, + "grad_norm": 2.1776709688108418, + "learning_rate": 2.032093466893006e-06, + "loss": 0.4584, + "step": 6190 + }, + { + "epoch": 0.71, + "grad_norm": 2.114152902373883, + "learning_rate": 2.0305960583588853e-06, + "loss": 0.5551, + "step": 6191 + }, + { + "epoch": 0.71, + "grad_norm": 1.8550707282687924, + "learning_rate": 2.0290990611460836e-06, + "loss": 0.5224, + "step": 6192 + }, + { + "epoch": 0.71, + "grad_norm": 1.6579874462889062, + "learning_rate": 2.0276024754619634e-06, + "loss": 0.4969, + "step": 6193 + }, + { + "epoch": 0.71, + "grad_norm": 2.224697023168764, + "learning_rate": 2.026106301513836e-06, + "loss": 0.5534, + "step": 6194 + }, + { + "epoch": 0.71, + "grad_norm": 2.6969206057109427, + "learning_rate": 2.024610539508946e-06, + "loss": 0.4905, + "step": 6195 + }, + { + "epoch": 0.71, + "grad_norm": 2.0992038091722183, + "learning_rate": 2.023115189654491e-06, + "loss": 0.4442, + "step": 6196 + }, + { + "epoch": 0.71, + "grad_norm": 2.2469119318632256, + "learning_rate": 2.0216202521576045e-06, + "loss": 0.431, + "step": 6197 + }, + { + "epoch": 0.71, + "grad_norm": 2.413811938273658, + "learning_rate": 2.0201257272253643e-06, + "loss": 0.4102, + "step": 6198 + }, + { + "epoch": 0.71, + "grad_norm": 2.778002199721246, + "learning_rate": 2.0186316150647913e-06, + "loss": 0.4312, + "step": 6199 + }, + { + "epoch": 0.71, + "grad_norm": 2.1068346712180666, + "learning_rate": 2.017137915882851e-06, + "loss": 0.5234, + "step": 6200 + }, + { + "epoch": 0.71, + "grad_norm": 3.1603520531305187, + "learning_rate": 2.015644629886449e-06, + "loss": 0.4398, + "step": 6201 + }, + { + "epoch": 0.71, + "grad_norm": 1.8462567148752724, + "learning_rate": 2.014151757282438e-06, + "loss": 0.4742, + "step": 6202 + }, + { + "epoch": 0.71, + "grad_norm": 1.8544134266578187, + "learning_rate": 2.012659298277606e-06, + "loss": 0.435, + "step": 6203 + }, + { + "epoch": 0.71, + "grad_norm": 2.421708215378694, + "learning_rate": 2.011167253078693e-06, + "loss": 0.5405, + "step": 6204 + }, + { + "epoch": 0.71, + "grad_norm": 2.1184987231673573, + "learning_rate": 2.0096756218923725e-06, + "loss": 0.4399, + "step": 6205 + }, + { + "epoch": 0.71, + "grad_norm": 1.9447532301703763, + "learning_rate": 2.0081844049252686e-06, + "loss": 0.48, + "step": 6206 + }, + { + "epoch": 0.71, + "grad_norm": 1.6929067632712549, + "learning_rate": 2.0066936023839406e-06, + "loss": 0.4173, + "step": 6207 + }, + { + "epoch": 0.71, + "grad_norm": 0.8269653347921048, + "learning_rate": 2.0052032144748982e-06, + "loss": 0.6632, + "step": 6208 + }, + { + "epoch": 0.71, + "grad_norm": 1.7998500735895029, + "learning_rate": 2.003713241404586e-06, + "loss": 0.4976, + "step": 6209 + }, + { + "epoch": 0.71, + "grad_norm": 2.801614498341954, + "learning_rate": 2.002223683379399e-06, + "loss": 0.5235, + "step": 6210 + }, + { + "epoch": 0.71, + "grad_norm": 1.8173602610340023, + "learning_rate": 2.000734540605666e-06, + "loss": 0.5476, + "step": 6211 + }, + { + "epoch": 0.71, + "grad_norm": 2.7996769122619494, + "learning_rate": 1.999245813289667e-06, + "loss": 0.4999, + "step": 6212 + }, + { + "epoch": 0.71, + "grad_norm": 0.8060480864988084, + "learning_rate": 1.9977575016376177e-06, + "loss": 0.6504, + "step": 6213 + }, + { + "epoch": 0.71, + "grad_norm": 1.722484353457214, + "learning_rate": 1.9962696058556795e-06, + "loss": 0.3862, + "step": 6214 + }, + { + "epoch": 0.71, + "grad_norm": 2.006774140409978, + "learning_rate": 1.9947821261499533e-06, + "loss": 0.4603, + "step": 6215 + }, + { + "epoch": 0.71, + "grad_norm": 2.27598932734707, + "learning_rate": 1.9932950627264884e-06, + "loss": 0.5607, + "step": 6216 + }, + { + "epoch": 0.71, + "grad_norm": 4.355590951741181, + "learning_rate": 1.991808415791269e-06, + "loss": 0.4253, + "step": 6217 + }, + { + "epoch": 0.71, + "grad_norm": 2.3720431276795333, + "learning_rate": 1.9903221855502285e-06, + "loss": 0.5401, + "step": 6218 + }, + { + "epoch": 0.71, + "grad_norm": 2.053626225769318, + "learning_rate": 1.9888363722092376e-06, + "loss": 0.5096, + "step": 6219 + }, + { + "epoch": 0.71, + "grad_norm": 2.2082417304700144, + "learning_rate": 1.987350975974109e-06, + "loss": 0.4925, + "step": 6220 + }, + { + "epoch": 0.71, + "grad_norm": 2.49802811561495, + "learning_rate": 1.9858659970506027e-06, + "loss": 0.5357, + "step": 6221 + }, + { + "epoch": 0.71, + "grad_norm": 1.6328310617516402, + "learning_rate": 1.984381435644415e-06, + "loss": 0.4345, + "step": 6222 + }, + { + "epoch": 0.72, + "grad_norm": 2.4672107895408186, + "learning_rate": 1.982897291961191e-06, + "loss": 0.5537, + "step": 6223 + }, + { + "epoch": 0.72, + "grad_norm": 2.6759273335238465, + "learning_rate": 1.9814135662065093e-06, + "loss": 0.5496, + "step": 6224 + }, + { + "epoch": 0.72, + "grad_norm": 2.5134969914874206, + "learning_rate": 1.9799302585858988e-06, + "loss": 0.4352, + "step": 6225 + }, + { + "epoch": 0.72, + "grad_norm": 1.9424304226522964, + "learning_rate": 1.9784473693048245e-06, + "loss": 0.4412, + "step": 6226 + }, + { + "epoch": 0.72, + "grad_norm": 0.8404046902826372, + "learning_rate": 1.976964898568699e-06, + "loss": 0.6816, + "step": 6227 + }, + { + "epoch": 0.72, + "grad_norm": 1.9337040045585845, + "learning_rate": 1.9754828465828703e-06, + "loss": 0.4588, + "step": 6228 + }, + { + "epoch": 0.72, + "grad_norm": 3.7013419410950563, + "learning_rate": 1.9740012135526358e-06, + "loss": 0.4941, + "step": 6229 + }, + { + "epoch": 0.72, + "grad_norm": 2.0858433522882223, + "learning_rate": 1.972519999683229e-06, + "loss": 0.5073, + "step": 6230 + }, + { + "epoch": 0.72, + "grad_norm": 2.677424612388146, + "learning_rate": 1.9710392051798273e-06, + "loss": 0.4737, + "step": 6231 + }, + { + "epoch": 0.72, + "grad_norm": 2.234617325019333, + "learning_rate": 1.969558830247549e-06, + "loss": 0.3999, + "step": 6232 + }, + { + "epoch": 0.72, + "grad_norm": 1.9047489057037716, + "learning_rate": 1.9680788750914575e-06, + "loss": 0.4204, + "step": 6233 + }, + { + "epoch": 0.72, + "grad_norm": 2.9018951529660804, + "learning_rate": 1.966599339916554e-06, + "loss": 0.4785, + "step": 6234 + }, + { + "epoch": 0.72, + "grad_norm": 0.8087597407239835, + "learning_rate": 1.9651202249277862e-06, + "loss": 0.6718, + "step": 6235 + }, + { + "epoch": 0.72, + "grad_norm": 4.356844274535127, + "learning_rate": 1.9636415303300373e-06, + "loss": 0.4949, + "step": 6236 + }, + { + "epoch": 0.72, + "grad_norm": 0.788468189459655, + "learning_rate": 1.9621632563281394e-06, + "loss": 0.6597, + "step": 6237 + }, + { + "epoch": 0.72, + "grad_norm": 2.4429863758106203, + "learning_rate": 1.960685403126861e-06, + "loss": 0.4895, + "step": 6238 + }, + { + "epoch": 0.72, + "grad_norm": 2.8907398797635535, + "learning_rate": 1.9592079709309126e-06, + "loss": 0.5394, + "step": 6239 + }, + { + "epoch": 0.72, + "grad_norm": 2.3376738180249794, + "learning_rate": 1.9577309599449513e-06, + "loss": 0.5212, + "step": 6240 + }, + { + "epoch": 0.72, + "grad_norm": 3.9821480341518827, + "learning_rate": 1.9562543703735683e-06, + "loss": 0.4829, + "step": 6241 + }, + { + "epoch": 0.72, + "grad_norm": 2.3338223294549216, + "learning_rate": 1.9547782024213047e-06, + "loss": 0.4895, + "step": 6242 + }, + { + "epoch": 0.72, + "grad_norm": 3.153309713192778, + "learning_rate": 1.9533024562926355e-06, + "loss": 0.4806, + "step": 6243 + }, + { + "epoch": 0.72, + "grad_norm": 2.448066731413163, + "learning_rate": 1.9518271321919837e-06, + "loss": 0.4947, + "step": 6244 + }, + { + "epoch": 0.72, + "grad_norm": 2.5103778473413, + "learning_rate": 1.950352230323708e-06, + "loss": 0.4347, + "step": 6245 + }, + { + "epoch": 0.72, + "grad_norm": 2.0725462049302474, + "learning_rate": 1.9488777508921155e-06, + "loss": 0.5094, + "step": 6246 + }, + { + "epoch": 0.72, + "grad_norm": 2.1041108523691614, + "learning_rate": 1.9474036941014473e-06, + "loss": 0.4277, + "step": 6247 + }, + { + "epoch": 0.72, + "grad_norm": 3.862483725065435, + "learning_rate": 1.945930060155892e-06, + "loss": 0.4024, + "step": 6248 + }, + { + "epoch": 0.72, + "grad_norm": 3.007937062358391, + "learning_rate": 1.9444568492595727e-06, + "loss": 0.5861, + "step": 6249 + }, + { + "epoch": 0.72, + "grad_norm": 2.254252500077309, + "learning_rate": 1.942984061616564e-06, + "loss": 0.4892, + "step": 6250 + }, + { + "epoch": 0.72, + "grad_norm": 2.553542929158215, + "learning_rate": 1.941511697430871e-06, + "loss": 0.422, + "step": 6251 + }, + { + "epoch": 0.72, + "grad_norm": 2.9596478313299146, + "learning_rate": 1.9400397569064505e-06, + "loss": 0.4766, + "step": 6252 + }, + { + "epoch": 0.72, + "grad_norm": 2.4194313290806257, + "learning_rate": 1.9385682402471913e-06, + "loss": 0.5291, + "step": 6253 + }, + { + "epoch": 0.72, + "grad_norm": 2.7095678423697396, + "learning_rate": 1.9370971476569308e-06, + "loss": 0.4047, + "step": 6254 + }, + { + "epoch": 0.72, + "grad_norm": 2.252417469180771, + "learning_rate": 1.93562647933944e-06, + "loss": 0.5313, + "step": 6255 + }, + { + "epoch": 0.72, + "grad_norm": 2.119473766716319, + "learning_rate": 1.934156235498442e-06, + "loss": 0.4964, + "step": 6256 + }, + { + "epoch": 0.72, + "grad_norm": 1.9912461949435232, + "learning_rate": 1.93268641633759e-06, + "loss": 0.527, + "step": 6257 + }, + { + "epoch": 0.72, + "grad_norm": 2.1668886299740957, + "learning_rate": 1.931217022060483e-06, + "loss": 0.4919, + "step": 6258 + }, + { + "epoch": 0.72, + "grad_norm": 3.024780992690607, + "learning_rate": 1.929748052870664e-06, + "loss": 0.4308, + "step": 6259 + }, + { + "epoch": 0.72, + "grad_norm": 3.5129359053767404, + "learning_rate": 1.9282795089716116e-06, + "loss": 0.3991, + "step": 6260 + }, + { + "epoch": 0.72, + "grad_norm": 2.513409482021169, + "learning_rate": 1.9268113905667514e-06, + "loss": 0.437, + "step": 6261 + }, + { + "epoch": 0.72, + "grad_norm": 2.251324999915131, + "learning_rate": 1.9253436978594433e-06, + "loss": 0.5511, + "step": 6262 + }, + { + "epoch": 0.72, + "grad_norm": 2.0221517695317925, + "learning_rate": 1.923876431052995e-06, + "loss": 0.5427, + "step": 6263 + }, + { + "epoch": 0.72, + "grad_norm": 1.7938132473863428, + "learning_rate": 1.922409590350651e-06, + "loss": 0.4492, + "step": 6264 + }, + { + "epoch": 0.72, + "grad_norm": 2.5448970597681058, + "learning_rate": 1.9209431759555973e-06, + "loss": 0.4506, + "step": 6265 + }, + { + "epoch": 0.72, + "grad_norm": 2.2834376005333827, + "learning_rate": 1.91947718807096e-06, + "loss": 0.4852, + "step": 6266 + }, + { + "epoch": 0.72, + "grad_norm": 3.5338523539634, + "learning_rate": 1.9180116268998104e-06, + "loss": 0.5126, + "step": 6267 + }, + { + "epoch": 0.72, + "grad_norm": 1.7748821023872656, + "learning_rate": 1.9165464926451556e-06, + "loss": 0.4952, + "step": 6268 + }, + { + "epoch": 0.72, + "grad_norm": 2.3581543930093014, + "learning_rate": 1.9150817855099473e-06, + "loss": 0.5251, + "step": 6269 + }, + { + "epoch": 0.72, + "grad_norm": 10.22389756467088, + "learning_rate": 1.9136175056970747e-06, + "loss": 0.5504, + "step": 6270 + }, + { + "epoch": 0.72, + "grad_norm": 2.2950426306359484, + "learning_rate": 1.9121536534093723e-06, + "loss": 0.5849, + "step": 6271 + }, + { + "epoch": 0.72, + "grad_norm": 1.760639573534165, + "learning_rate": 1.9106902288496087e-06, + "loss": 0.4468, + "step": 6272 + }, + { + "epoch": 0.72, + "grad_norm": 1.8493484806422544, + "learning_rate": 1.9092272322205013e-06, + "loss": 0.4984, + "step": 6273 + }, + { + "epoch": 0.72, + "grad_norm": 3.6911902223139634, + "learning_rate": 1.907764663724701e-06, + "loss": 0.4536, + "step": 6274 + }, + { + "epoch": 0.72, + "grad_norm": 0.8380775182716391, + "learning_rate": 1.9063025235648058e-06, + "loss": 0.6931, + "step": 6275 + }, + { + "epoch": 0.72, + "grad_norm": 1.987491301459409, + "learning_rate": 1.904840811943347e-06, + "loss": 0.4554, + "step": 6276 + }, + { + "epoch": 0.72, + "grad_norm": 3.2733647509690442, + "learning_rate": 1.903379529062805e-06, + "loss": 0.5086, + "step": 6277 + }, + { + "epoch": 0.72, + "grad_norm": 1.8319642662456037, + "learning_rate": 1.901918675125594e-06, + "loss": 0.48, + "step": 6278 + }, + { + "epoch": 0.72, + "grad_norm": 3.252590812641215, + "learning_rate": 1.9004582503340696e-06, + "loss": 0.5294, + "step": 6279 + }, + { + "epoch": 0.72, + "grad_norm": 1.981232683136987, + "learning_rate": 1.8989982548905333e-06, + "loss": 0.4432, + "step": 6280 + }, + { + "epoch": 0.72, + "grad_norm": 1.869064168158839, + "learning_rate": 1.8975386889972218e-06, + "loss": 0.4783, + "step": 6281 + }, + { + "epoch": 0.72, + "grad_norm": 2.7857437200727784, + "learning_rate": 1.8960795528563125e-06, + "loss": 0.6173, + "step": 6282 + }, + { + "epoch": 0.72, + "grad_norm": 2.7624447796642495, + "learning_rate": 1.8946208466699267e-06, + "loss": 0.5138, + "step": 6283 + }, + { + "epoch": 0.72, + "grad_norm": 1.9750768174968976, + "learning_rate": 1.893162570640124e-06, + "loss": 0.4459, + "step": 6284 + }, + { + "epoch": 0.72, + "grad_norm": 2.046137694778906, + "learning_rate": 1.891704724968902e-06, + "loss": 0.5025, + "step": 6285 + }, + { + "epoch": 0.72, + "grad_norm": 3.2206913667324106, + "learning_rate": 1.8902473098582048e-06, + "loss": 0.4478, + "step": 6286 + }, + { + "epoch": 0.72, + "grad_norm": 1.9557844491245842, + "learning_rate": 1.88879032550991e-06, + "loss": 0.4773, + "step": 6287 + }, + { + "epoch": 0.72, + "grad_norm": 2.2490218103674127, + "learning_rate": 1.8873337721258416e-06, + "loss": 0.5222, + "step": 6288 + }, + { + "epoch": 0.72, + "grad_norm": 1.9227280781160936, + "learning_rate": 1.8858776499077592e-06, + "loss": 0.5256, + "step": 6289 + }, + { + "epoch": 0.72, + "grad_norm": 1.713613151354472, + "learning_rate": 1.8844219590573664e-06, + "loss": 0.4946, + "step": 6290 + }, + { + "epoch": 0.72, + "grad_norm": 1.7918477944864535, + "learning_rate": 1.8829666997763023e-06, + "loss": 0.4691, + "step": 6291 + }, + { + "epoch": 0.72, + "grad_norm": 6.125522885475883, + "learning_rate": 1.8815118722661534e-06, + "loss": 0.5789, + "step": 6292 + }, + { + "epoch": 0.72, + "grad_norm": 1.8642218016353034, + "learning_rate": 1.8800574767284379e-06, + "loss": 0.5137, + "step": 6293 + }, + { + "epoch": 0.72, + "grad_norm": 1.622094681514146, + "learning_rate": 1.8786035133646219e-06, + "loss": 0.47, + "step": 6294 + }, + { + "epoch": 0.72, + "grad_norm": 1.656413240169926, + "learning_rate": 1.8771499823761047e-06, + "loss": 0.5391, + "step": 6295 + }, + { + "epoch": 0.72, + "grad_norm": 2.4128879127927987, + "learning_rate": 1.8756968839642332e-06, + "loss": 0.5205, + "step": 6296 + }, + { + "epoch": 0.72, + "grad_norm": 1.853746632585124, + "learning_rate": 1.8742442183302879e-06, + "loss": 0.4701, + "step": 6297 + }, + { + "epoch": 0.72, + "grad_norm": 1.9206821482355807, + "learning_rate": 1.8727919856754922e-06, + "loss": 0.4757, + "step": 6298 + }, + { + "epoch": 0.72, + "grad_norm": 2.0283327894468535, + "learning_rate": 1.8713401862010071e-06, + "loss": 0.5549, + "step": 6299 + }, + { + "epoch": 0.72, + "grad_norm": 5.738924086024506, + "learning_rate": 1.8698888201079395e-06, + "loss": 0.6512, + "step": 6300 + }, + { + "epoch": 0.72, + "grad_norm": 2.0175739596668265, + "learning_rate": 1.8684378875973286e-06, + "loss": 0.5435, + "step": 6301 + }, + { + "epoch": 0.72, + "grad_norm": 2.8987495873792826, + "learning_rate": 1.8669873888701606e-06, + "loss": 0.4949, + "step": 6302 + }, + { + "epoch": 0.72, + "grad_norm": 2.5561852490344634, + "learning_rate": 1.8655373241273572e-06, + "loss": 0.5716, + "step": 6303 + }, + { + "epoch": 0.72, + "grad_norm": 2.29288659667296, + "learning_rate": 1.8640876935697787e-06, + "loss": 0.4937, + "step": 6304 + }, + { + "epoch": 0.72, + "grad_norm": 2.0551766403121445, + "learning_rate": 1.8626384973982314e-06, + "loss": 0.4862, + "step": 6305 + }, + { + "epoch": 0.72, + "grad_norm": 2.7148499191159217, + "learning_rate": 1.861189735813455e-06, + "loss": 0.444, + "step": 6306 + }, + { + "epoch": 0.72, + "grad_norm": 1.622374401215246, + "learning_rate": 1.8597414090161336e-06, + "loss": 0.4473, + "step": 6307 + }, + { + "epoch": 0.72, + "grad_norm": 4.664376767481218, + "learning_rate": 1.8582935172068873e-06, + "loss": 0.4907, + "step": 6308 + }, + { + "epoch": 0.72, + "grad_norm": 4.017427987994815, + "learning_rate": 1.8568460605862797e-06, + "loss": 0.5515, + "step": 6309 + }, + { + "epoch": 0.73, + "grad_norm": 1.8821712065861482, + "learning_rate": 1.8553990393548105e-06, + "loss": 0.4799, + "step": 6310 + }, + { + "epoch": 0.73, + "grad_norm": 1.6390397294263601, + "learning_rate": 1.8539524537129232e-06, + "loss": 0.5953, + "step": 6311 + }, + { + "epoch": 0.73, + "grad_norm": 2.422146080469555, + "learning_rate": 1.8525063038609954e-06, + "loss": 0.5155, + "step": 6312 + }, + { + "epoch": 0.73, + "grad_norm": 2.262202152337742, + "learning_rate": 1.8510605899993505e-06, + "loss": 0.5299, + "step": 6313 + }, + { + "epoch": 0.73, + "grad_norm": 2.1216627723796337, + "learning_rate": 1.8496153123282461e-06, + "loss": 0.4619, + "step": 6314 + }, + { + "epoch": 0.73, + "grad_norm": 2.8825292551519075, + "learning_rate": 1.848170471047886e-06, + "loss": 0.4712, + "step": 6315 + }, + { + "epoch": 0.73, + "grad_norm": 1.957892167108236, + "learning_rate": 1.846726066358403e-06, + "loss": 0.4486, + "step": 6316 + }, + { + "epoch": 0.73, + "grad_norm": 1.7580557316199963, + "learning_rate": 1.8452820984598813e-06, + "loss": 0.5351, + "step": 6317 + }, + { + "epoch": 0.73, + "grad_norm": 2.287818176431958, + "learning_rate": 1.8438385675523346e-06, + "loss": 0.4909, + "step": 6318 + }, + { + "epoch": 0.73, + "grad_norm": 2.0894758133740403, + "learning_rate": 1.8423954738357248e-06, + "loss": 0.5343, + "step": 6319 + }, + { + "epoch": 0.73, + "grad_norm": 3.031781029852434, + "learning_rate": 1.840952817509945e-06, + "loss": 0.5428, + "step": 6320 + }, + { + "epoch": 0.73, + "grad_norm": 1.7938253995116438, + "learning_rate": 1.8395105987748357e-06, + "loss": 0.483, + "step": 6321 + }, + { + "epoch": 0.73, + "grad_norm": 1.783295159369498, + "learning_rate": 1.8380688178301693e-06, + "loss": 0.5477, + "step": 6322 + }, + { + "epoch": 0.73, + "grad_norm": 2.478841883175993, + "learning_rate": 1.8366274748756646e-06, + "loss": 0.4507, + "step": 6323 + }, + { + "epoch": 0.73, + "grad_norm": 2.3859199827465707, + "learning_rate": 1.8351865701109734e-06, + "loss": 0.4569, + "step": 6324 + }, + { + "epoch": 0.73, + "grad_norm": 2.015573209330615, + "learning_rate": 1.8337461037356892e-06, + "loss": 0.3931, + "step": 6325 + }, + { + "epoch": 0.73, + "grad_norm": 1.7337011326667826, + "learning_rate": 1.8323060759493477e-06, + "loss": 0.5, + "step": 6326 + }, + { + "epoch": 0.73, + "grad_norm": 1.7056545177008253, + "learning_rate": 1.8308664869514186e-06, + "loss": 0.4095, + "step": 6327 + }, + { + "epoch": 0.73, + "grad_norm": 2.1783797001360603, + "learning_rate": 1.8294273369413163e-06, + "loss": 0.487, + "step": 6328 + }, + { + "epoch": 0.73, + "grad_norm": 1.984539197433747, + "learning_rate": 1.8279886261183883e-06, + "loss": 0.5071, + "step": 6329 + }, + { + "epoch": 0.73, + "grad_norm": 1.7634494874272522, + "learning_rate": 1.826550354681928e-06, + "loss": 0.5173, + "step": 6330 + }, + { + "epoch": 0.73, + "grad_norm": 2.021276001508078, + "learning_rate": 1.825112522831161e-06, + "loss": 0.4555, + "step": 6331 + }, + { + "epoch": 0.73, + "grad_norm": 1.9082756263584135, + "learning_rate": 1.8236751307652617e-06, + "loss": 0.4669, + "step": 6332 + }, + { + "epoch": 0.73, + "grad_norm": 2.738171703966473, + "learning_rate": 1.8222381786833293e-06, + "loss": 0.5502, + "step": 6333 + }, + { + "epoch": 0.73, + "grad_norm": 0.8385368408201282, + "learning_rate": 1.8208016667844153e-06, + "loss": 0.6684, + "step": 6334 + }, + { + "epoch": 0.73, + "grad_norm": 2.7184776174031064, + "learning_rate": 1.8193655952675027e-06, + "loss": 0.5249, + "step": 6335 + }, + { + "epoch": 0.73, + "grad_norm": 1.7550181437076597, + "learning_rate": 1.8179299643315184e-06, + "loss": 0.5674, + "step": 6336 + }, + { + "epoch": 0.73, + "grad_norm": 2.050740032125593, + "learning_rate": 1.8164947741753225e-06, + "loss": 0.44, + "step": 6337 + }, + { + "epoch": 0.73, + "grad_norm": 2.1031751866405166, + "learning_rate": 1.8150600249977208e-06, + "loss": 0.4578, + "step": 6338 + }, + { + "epoch": 0.73, + "grad_norm": 1.8950367821048322, + "learning_rate": 1.8136257169974507e-06, + "loss": 0.458, + "step": 6339 + }, + { + "epoch": 0.73, + "grad_norm": 1.8715140553921894, + "learning_rate": 1.8121918503731966e-06, + "loss": 0.4117, + "step": 6340 + }, + { + "epoch": 0.73, + "grad_norm": 1.7623193219146276, + "learning_rate": 1.8107584253235733e-06, + "loss": 0.4534, + "step": 6341 + }, + { + "epoch": 0.73, + "grad_norm": 2.162854321557444, + "learning_rate": 1.8093254420471424e-06, + "loss": 0.5155, + "step": 6342 + }, + { + "epoch": 0.73, + "grad_norm": 2.131349867676294, + "learning_rate": 1.8078929007423985e-06, + "loss": 0.538, + "step": 6343 + }, + { + "epoch": 0.73, + "grad_norm": 2.3171937676212875, + "learning_rate": 1.8064608016077756e-06, + "loss": 0.4898, + "step": 6344 + }, + { + "epoch": 0.73, + "grad_norm": 2.1941811052445366, + "learning_rate": 1.8050291448416506e-06, + "loss": 0.5508, + "step": 6345 + }, + { + "epoch": 0.73, + "grad_norm": 2.4249831999730684, + "learning_rate": 1.803597930642334e-06, + "loss": 0.505, + "step": 6346 + }, + { + "epoch": 0.73, + "grad_norm": 1.831641128393604, + "learning_rate": 1.8021671592080796e-06, + "loss": 0.4834, + "step": 6347 + }, + { + "epoch": 0.73, + "grad_norm": 3.83024681700238, + "learning_rate": 1.800736830737077e-06, + "loss": 0.591, + "step": 6348 + }, + { + "epoch": 0.73, + "grad_norm": 2.093736525473045, + "learning_rate": 1.7993069454274537e-06, + "loss": 0.5379, + "step": 6349 + }, + { + "epoch": 0.73, + "grad_norm": 2.226649914206589, + "learning_rate": 1.7978775034772766e-06, + "loss": 0.5277, + "step": 6350 + }, + { + "epoch": 0.73, + "grad_norm": 1.9358468896290768, + "learning_rate": 1.7964485050845548e-06, + "loss": 0.4821, + "step": 6351 + }, + { + "epoch": 0.73, + "grad_norm": 2.354623166387629, + "learning_rate": 1.79501995044723e-06, + "loss": 0.4964, + "step": 6352 + }, + { + "epoch": 0.73, + "grad_norm": 2.601164648841618, + "learning_rate": 1.7935918397631875e-06, + "loss": 0.4614, + "step": 6353 + }, + { + "epoch": 0.73, + "grad_norm": 0.9745288360551739, + "learning_rate": 1.7921641732302463e-06, + "loss": 0.7706, + "step": 6354 + }, + { + "epoch": 0.73, + "grad_norm": 1.6957725828212997, + "learning_rate": 1.7907369510461702e-06, + "loss": 0.5397, + "step": 6355 + }, + { + "epoch": 0.73, + "grad_norm": 1.6100211400013387, + "learning_rate": 1.7893101734086543e-06, + "loss": 0.4746, + "step": 6356 + }, + { + "epoch": 0.73, + "grad_norm": 2.8447170499866847, + "learning_rate": 1.7878838405153388e-06, + "loss": 0.4802, + "step": 6357 + }, + { + "epoch": 0.73, + "grad_norm": 2.3255522690529595, + "learning_rate": 1.7864579525637948e-06, + "loss": 0.5432, + "step": 6358 + }, + { + "epoch": 0.73, + "grad_norm": 1.9970097266407991, + "learning_rate": 1.785032509751541e-06, + "loss": 0.4358, + "step": 6359 + }, + { + "epoch": 0.73, + "grad_norm": 1.858462670711407, + "learning_rate": 1.7836075122760255e-06, + "loss": 0.5524, + "step": 6360 + }, + { + "epoch": 0.73, + "grad_norm": 1.765228617380252, + "learning_rate": 1.7821829603346418e-06, + "loss": 0.4894, + "step": 6361 + }, + { + "epoch": 0.73, + "grad_norm": 1.9915575129798142, + "learning_rate": 1.7807588541247167e-06, + "loss": 0.5333, + "step": 6362 + }, + { + "epoch": 0.73, + "grad_norm": 2.466567466442859, + "learning_rate": 1.7793351938435166e-06, + "loss": 0.504, + "step": 6363 + }, + { + "epoch": 0.73, + "grad_norm": 2.1604306099873614, + "learning_rate": 1.7779119796882489e-06, + "loss": 0.5605, + "step": 6364 + }, + { + "epoch": 0.73, + "grad_norm": 1.972826586225186, + "learning_rate": 1.7764892118560555e-06, + "loss": 0.5054, + "step": 6365 + }, + { + "epoch": 0.73, + "grad_norm": 1.6839004446481611, + "learning_rate": 1.7750668905440166e-06, + "loss": 0.4431, + "step": 6366 + }, + { + "epoch": 0.73, + "grad_norm": 1.914727198177321, + "learning_rate": 1.7736450159491552e-06, + "loss": 0.5, + "step": 6367 + }, + { + "epoch": 0.73, + "grad_norm": 2.377258711989198, + "learning_rate": 1.7722235882684275e-06, + "loss": 0.4811, + "step": 6368 + }, + { + "epoch": 0.73, + "grad_norm": 2.2865486391080005, + "learning_rate": 1.7708026076987273e-06, + "loss": 0.5003, + "step": 6369 + }, + { + "epoch": 0.73, + "grad_norm": 1.9638668258879712, + "learning_rate": 1.7693820744368928e-06, + "loss": 0.5083, + "step": 6370 + }, + { + "epoch": 0.73, + "grad_norm": 2.048879096489691, + "learning_rate": 1.7679619886796917e-06, + "loss": 0.5189, + "step": 6371 + }, + { + "epoch": 0.73, + "grad_norm": 0.8554372389125955, + "learning_rate": 1.7665423506238377e-06, + "loss": 0.6866, + "step": 6372 + }, + { + "epoch": 0.73, + "grad_norm": 1.8075358909319172, + "learning_rate": 1.7651231604659757e-06, + "loss": 0.5364, + "step": 6373 + }, + { + "epoch": 0.73, + "grad_norm": 1.686032530221486, + "learning_rate": 1.7637044184026946e-06, + "loss": 0.502, + "step": 6374 + }, + { + "epoch": 0.73, + "grad_norm": 1.7045689232556733, + "learning_rate": 1.7622861246305156e-06, + "loss": 0.3874, + "step": 6375 + }, + { + "epoch": 0.73, + "grad_norm": 2.522599926988805, + "learning_rate": 1.7608682793459037e-06, + "loss": 0.5155, + "step": 6376 + }, + { + "epoch": 0.73, + "grad_norm": 2.0948567119521013, + "learning_rate": 1.7594508827452545e-06, + "loss": 0.5264, + "step": 6377 + }, + { + "epoch": 0.73, + "grad_norm": 1.8183544113194867, + "learning_rate": 1.7580339350249099e-06, + "loss": 0.5063, + "step": 6378 + }, + { + "epoch": 0.73, + "grad_norm": 1.9572714333319212, + "learning_rate": 1.756617436381141e-06, + "loss": 0.4382, + "step": 6379 + }, + { + "epoch": 0.73, + "grad_norm": 1.8614628196136365, + "learning_rate": 1.7552013870101652e-06, + "loss": 0.4981, + "step": 6380 + }, + { + "epoch": 0.73, + "grad_norm": 1.7257071472198622, + "learning_rate": 1.7537857871081293e-06, + "loss": 0.4084, + "step": 6381 + }, + { + "epoch": 0.73, + "grad_norm": 1.586278891113459, + "learning_rate": 1.752370636871127e-06, + "loss": 0.4939, + "step": 6382 + }, + { + "epoch": 0.73, + "grad_norm": 3.3354946446510554, + "learning_rate": 1.7509559364951783e-06, + "loss": 0.5147, + "step": 6383 + }, + { + "epoch": 0.73, + "grad_norm": 1.9165456395701335, + "learning_rate": 1.7495416861762527e-06, + "loss": 0.4639, + "step": 6384 + }, + { + "epoch": 0.73, + "grad_norm": 1.8607136472667773, + "learning_rate": 1.7481278861102475e-06, + "loss": 0.5229, + "step": 6385 + }, + { + "epoch": 0.73, + "grad_norm": 1.9296542064715891, + "learning_rate": 1.7467145364930066e-06, + "loss": 0.421, + "step": 6386 + }, + { + "epoch": 0.73, + "grad_norm": 2.5224945288851752, + "learning_rate": 1.7453016375203024e-06, + "loss": 0.5117, + "step": 6387 + }, + { + "epoch": 0.73, + "grad_norm": 1.7633416445115957, + "learning_rate": 1.7438891893878534e-06, + "loss": 0.4943, + "step": 6388 + }, + { + "epoch": 0.73, + "grad_norm": 2.2711841442929446, + "learning_rate": 1.7424771922913098e-06, + "loss": 0.5057, + "step": 6389 + }, + { + "epoch": 0.73, + "grad_norm": 1.8239839286176018, + "learning_rate": 1.7410656464262598e-06, + "loss": 0.4879, + "step": 6390 + }, + { + "epoch": 0.73, + "grad_norm": 2.55260194675326, + "learning_rate": 1.7396545519882336e-06, + "loss": 0.537, + "step": 6391 + }, + { + "epoch": 0.73, + "grad_norm": 2.0085794227469926, + "learning_rate": 1.7382439091726927e-06, + "loss": 0.4738, + "step": 6392 + }, + { + "epoch": 0.73, + "grad_norm": 1.872407716447733, + "learning_rate": 1.7368337181750423e-06, + "loss": 0.4844, + "step": 6393 + }, + { + "epoch": 0.73, + "grad_norm": 2.3833575908885747, + "learning_rate": 1.735423979190618e-06, + "loss": 0.4874, + "step": 6394 + }, + { + "epoch": 0.73, + "grad_norm": 1.909705183011182, + "learning_rate": 1.7340146924147005e-06, + "loss": 0.4619, + "step": 6395 + }, + { + "epoch": 0.73, + "grad_norm": 2.345725128941785, + "learning_rate": 1.7326058580425003e-06, + "loss": 0.5255, + "step": 6396 + }, + { + "epoch": 0.74, + "grad_norm": 2.045014748527228, + "learning_rate": 1.7311974762691725e-06, + "loss": 0.5067, + "step": 6397 + }, + { + "epoch": 0.74, + "grad_norm": 3.954162767205877, + "learning_rate": 1.7297895472898024e-06, + "loss": 0.5367, + "step": 6398 + }, + { + "epoch": 0.74, + "grad_norm": 1.6307621442838125, + "learning_rate": 1.7283820712994214e-06, + "loss": 0.4839, + "step": 6399 + }, + { + "epoch": 0.74, + "grad_norm": 2.2618544388805404, + "learning_rate": 1.7269750484929853e-06, + "loss": 0.4794, + "step": 6400 + }, + { + "epoch": 0.74, + "grad_norm": 3.058868031489827, + "learning_rate": 1.7255684790654008e-06, + "loss": 0.5088, + "step": 6401 + }, + { + "epoch": 0.74, + "grad_norm": 1.9609008204360812, + "learning_rate": 1.7241623632115017e-06, + "loss": 0.482, + "step": 6402 + }, + { + "epoch": 0.74, + "grad_norm": 1.7412906977922125, + "learning_rate": 1.722756701126066e-06, + "loss": 0.4823, + "step": 6403 + }, + { + "epoch": 0.74, + "grad_norm": 2.2391597397466776, + "learning_rate": 1.7213514930038028e-06, + "loss": 0.5442, + "step": 6404 + }, + { + "epoch": 0.74, + "grad_norm": 1.8119908409784025, + "learning_rate": 1.7199467390393649e-06, + "loss": 0.4962, + "step": 6405 + }, + { + "epoch": 0.74, + "grad_norm": 2.048155774082454, + "learning_rate": 1.7185424394273347e-06, + "loss": 0.515, + "step": 6406 + }, + { + "epoch": 0.74, + "grad_norm": 2.1866790125883577, + "learning_rate": 1.7171385943622392e-06, + "loss": 0.4608, + "step": 6407 + }, + { + "epoch": 0.74, + "grad_norm": 1.9690426812406312, + "learning_rate": 1.7157352040385372e-06, + "loss": 0.5462, + "step": 6408 + }, + { + "epoch": 0.74, + "grad_norm": 1.8117968748463251, + "learning_rate": 1.7143322686506236e-06, + "loss": 0.4495, + "step": 6409 + }, + { + "epoch": 0.74, + "grad_norm": 2.694165654878887, + "learning_rate": 1.712929788392837e-06, + "loss": 0.5052, + "step": 6410 + }, + { + "epoch": 0.74, + "grad_norm": 2.5453272644743388, + "learning_rate": 1.7115277634594451e-06, + "loss": 0.4388, + "step": 6411 + }, + { + "epoch": 0.74, + "grad_norm": 2.3359065796656604, + "learning_rate": 1.7101261940446601e-06, + "loss": 0.4996, + "step": 6412 + }, + { + "epoch": 0.74, + "grad_norm": 1.6751353193935272, + "learning_rate": 1.7087250803426225e-06, + "loss": 0.4569, + "step": 6413 + }, + { + "epoch": 0.74, + "grad_norm": 5.302166197764301, + "learning_rate": 1.7073244225474184e-06, + "loss": 0.5813, + "step": 6414 + }, + { + "epoch": 0.74, + "grad_norm": 2.877642489009678, + "learning_rate": 1.7059242208530634e-06, + "loss": 0.5277, + "step": 6415 + }, + { + "epoch": 0.74, + "grad_norm": 2.1077683577988853, + "learning_rate": 1.7045244754535185e-06, + "loss": 0.4862, + "step": 6416 + }, + { + "epoch": 0.74, + "grad_norm": 1.6034975711186212, + "learning_rate": 1.7031251865426685e-06, + "loss": 0.4744, + "step": 6417 + }, + { + "epoch": 0.74, + "grad_norm": 2.778621279359413, + "learning_rate": 1.7017263543143486e-06, + "loss": 0.4952, + "step": 6418 + }, + { + "epoch": 0.74, + "grad_norm": 1.4732778910217057, + "learning_rate": 1.7003279789623212e-06, + "loss": 0.4811, + "step": 6419 + }, + { + "epoch": 0.74, + "grad_norm": 1.7266084062224527, + "learning_rate": 1.6989300606802921e-06, + "loss": 0.4854, + "step": 6420 + }, + { + "epoch": 0.74, + "grad_norm": 2.787029287262787, + "learning_rate": 1.697532599661898e-06, + "loss": 0.4182, + "step": 6421 + }, + { + "epoch": 0.74, + "grad_norm": 1.9997075024957107, + "learning_rate": 1.6961355961007176e-06, + "loss": 0.5135, + "step": 6422 + }, + { + "epoch": 0.74, + "grad_norm": 3.7935876039895264, + "learning_rate": 1.6947390501902606e-06, + "loss": 0.5237, + "step": 6423 + }, + { + "epoch": 0.74, + "grad_norm": 2.067730841617121, + "learning_rate": 1.6933429621239795e-06, + "loss": 0.4889, + "step": 6424 + }, + { + "epoch": 0.74, + "grad_norm": 1.991052640745627, + "learning_rate": 1.6919473320952567e-06, + "loss": 0.5638, + "step": 6425 + }, + { + "epoch": 0.74, + "grad_norm": 2.7047905990763956, + "learning_rate": 1.6905521602974183e-06, + "loss": 0.5754, + "step": 6426 + }, + { + "epoch": 0.74, + "grad_norm": 5.04502082485513, + "learning_rate": 1.6891574469237215e-06, + "loss": 0.5033, + "step": 6427 + }, + { + "epoch": 0.74, + "grad_norm": 1.7168090564711342, + "learning_rate": 1.6877631921673598e-06, + "loss": 0.502, + "step": 6428 + }, + { + "epoch": 0.74, + "grad_norm": 2.3544269510995246, + "learning_rate": 1.6863693962214688e-06, + "loss": 0.4742, + "step": 6429 + }, + { + "epoch": 0.74, + "grad_norm": 1.565972907655242, + "learning_rate": 1.6849760592791137e-06, + "loss": 0.467, + "step": 6430 + }, + { + "epoch": 0.74, + "grad_norm": 1.8749970376187846, + "learning_rate": 1.683583181533302e-06, + "loss": 0.4901, + "step": 6431 + }, + { + "epoch": 0.74, + "grad_norm": 1.8284467855369886, + "learning_rate": 1.6821907631769719e-06, + "loss": 0.4852, + "step": 6432 + }, + { + "epoch": 0.74, + "grad_norm": 2.1949053716752425, + "learning_rate": 1.680798804403006e-06, + "loss": 0.4531, + "step": 6433 + }, + { + "epoch": 0.74, + "grad_norm": 2.156298320854908, + "learning_rate": 1.679407305404212e-06, + "loss": 0.5589, + "step": 6434 + }, + { + "epoch": 0.74, + "grad_norm": 2.059707928309894, + "learning_rate": 1.678016266373344e-06, + "loss": 0.5108, + "step": 6435 + }, + { + "epoch": 0.74, + "grad_norm": 1.9102415546504543, + "learning_rate": 1.6766256875030856e-06, + "loss": 0.4959, + "step": 6436 + }, + { + "epoch": 0.74, + "grad_norm": 1.6636396383603198, + "learning_rate": 1.6752355689860634e-06, + "loss": 0.3945, + "step": 6437 + }, + { + "epoch": 0.74, + "grad_norm": 1.9708050477105261, + "learning_rate": 1.6738459110148326e-06, + "loss": 0.5654, + "step": 6438 + }, + { + "epoch": 0.74, + "grad_norm": 3.247662399858331, + "learning_rate": 1.672456713781892e-06, + "loss": 0.5184, + "step": 6439 + }, + { + "epoch": 0.74, + "grad_norm": 3.737273447721273, + "learning_rate": 1.671067977479669e-06, + "loss": 0.4076, + "step": 6440 + }, + { + "epoch": 0.74, + "grad_norm": 0.7819924861146742, + "learning_rate": 1.6696797023005346e-06, + "loss": 0.6929, + "step": 6441 + }, + { + "epoch": 0.74, + "grad_norm": 1.7843069943623606, + "learning_rate": 1.6682918884367899e-06, + "loss": 0.4926, + "step": 6442 + }, + { + "epoch": 0.74, + "grad_norm": 2.6272574211800417, + "learning_rate": 1.6669045360806774e-06, + "loss": 0.4138, + "step": 6443 + }, + { + "epoch": 0.74, + "grad_norm": 2.284159297519438, + "learning_rate": 1.6655176454243694e-06, + "loss": 0.497, + "step": 6444 + }, + { + "epoch": 0.74, + "grad_norm": 2.05139120874132, + "learning_rate": 1.664131216659981e-06, + "loss": 0.4619, + "step": 6445 + }, + { + "epoch": 0.74, + "grad_norm": 1.777785856776366, + "learning_rate": 1.662745249979557e-06, + "loss": 0.4772, + "step": 6446 + }, + { + "epoch": 0.74, + "grad_norm": 2.2672654060794244, + "learning_rate": 1.6613597455750853e-06, + "loss": 0.5381, + "step": 6447 + }, + { + "epoch": 0.74, + "grad_norm": 2.8903462946285106, + "learning_rate": 1.6599747036384829e-06, + "loss": 0.5699, + "step": 6448 + }, + { + "epoch": 0.74, + "grad_norm": 2.0403232238840023, + "learning_rate": 1.6585901243616044e-06, + "loss": 0.4962, + "step": 6449 + }, + { + "epoch": 0.74, + "grad_norm": 3.21318523121185, + "learning_rate": 1.6572060079362444e-06, + "loss": 0.4519, + "step": 6450 + }, + { + "epoch": 0.74, + "grad_norm": 1.817783628267319, + "learning_rate": 1.6558223545541297e-06, + "loss": 0.4784, + "step": 6451 + }, + { + "epoch": 0.74, + "grad_norm": 1.6839627314004544, + "learning_rate": 1.6544391644069218e-06, + "loss": 0.5107, + "step": 6452 + }, + { + "epoch": 0.74, + "grad_norm": 1.5923458401925887, + "learning_rate": 1.6530564376862224e-06, + "loss": 0.5001, + "step": 6453 + }, + { + "epoch": 0.74, + "grad_norm": 2.27004056592196, + "learning_rate": 1.6516741745835658e-06, + "loss": 0.5522, + "step": 6454 + }, + { + "epoch": 0.74, + "grad_norm": 2.345695794352978, + "learning_rate": 1.650292375290421e-06, + "loss": 0.528, + "step": 6455 + }, + { + "epoch": 0.74, + "grad_norm": 1.6512121069329335, + "learning_rate": 1.6489110399981978e-06, + "loss": 0.4474, + "step": 6456 + }, + { + "epoch": 0.74, + "grad_norm": 2.2856920148305546, + "learning_rate": 1.6475301688982353e-06, + "loss": 0.4873, + "step": 6457 + }, + { + "epoch": 0.74, + "grad_norm": 1.861781930697599, + "learning_rate": 1.646149762181815e-06, + "loss": 0.4651, + "step": 6458 + }, + { + "epoch": 0.74, + "grad_norm": 0.8717502600985453, + "learning_rate": 1.644769820040147e-06, + "loss": 0.6936, + "step": 6459 + }, + { + "epoch": 0.74, + "grad_norm": 2.394106193048894, + "learning_rate": 1.6433903426643838e-06, + "loss": 0.5062, + "step": 6460 + }, + { + "epoch": 0.74, + "grad_norm": 2.3339287744302526, + "learning_rate": 1.642011330245607e-06, + "loss": 0.459, + "step": 6461 + }, + { + "epoch": 0.74, + "grad_norm": 1.9225696643733123, + "learning_rate": 1.6406327829748415e-06, + "loss": 0.5465, + "step": 6462 + }, + { + "epoch": 0.74, + "grad_norm": 2.1577958070996566, + "learning_rate": 1.6392547010430388e-06, + "loss": 0.398, + "step": 6463 + }, + { + "epoch": 0.74, + "grad_norm": 2.2035995038236735, + "learning_rate": 1.6378770846410946e-06, + "loss": 0.4616, + "step": 6464 + }, + { + "epoch": 0.74, + "grad_norm": 0.8851861953740503, + "learning_rate": 1.6364999339598332e-06, + "loss": 0.666, + "step": 6465 + }, + { + "epoch": 0.74, + "grad_norm": 2.0225110605363, + "learning_rate": 1.6351232491900193e-06, + "loss": 0.5437, + "step": 6466 + }, + { + "epoch": 0.74, + "grad_norm": 2.2434118631157975, + "learning_rate": 1.6337470305223502e-06, + "loss": 0.4068, + "step": 6467 + }, + { + "epoch": 0.74, + "grad_norm": 1.8505707557928814, + "learning_rate": 1.6323712781474598e-06, + "loss": 0.6022, + "step": 6468 + }, + { + "epoch": 0.74, + "grad_norm": 1.8327671581593565, + "learning_rate": 1.630995992255915e-06, + "loss": 0.4561, + "step": 6469 + }, + { + "epoch": 0.74, + "grad_norm": 2.3938361714602876, + "learning_rate": 1.6296211730382229e-06, + "loss": 0.5554, + "step": 6470 + }, + { + "epoch": 0.74, + "grad_norm": 0.839348439254524, + "learning_rate": 1.6282468206848206e-06, + "loss": 0.6787, + "step": 6471 + }, + { + "epoch": 0.74, + "grad_norm": 0.8063425840487671, + "learning_rate": 1.6268729353860867e-06, + "loss": 0.6805, + "step": 6472 + }, + { + "epoch": 0.74, + "grad_norm": 2.6225881314619857, + "learning_rate": 1.6254995173323296e-06, + "loss": 0.4509, + "step": 6473 + }, + { + "epoch": 0.74, + "grad_norm": 3.065901225285745, + "learning_rate": 1.6241265667137928e-06, + "loss": 0.4508, + "step": 6474 + }, + { + "epoch": 0.74, + "grad_norm": 1.8554665486141357, + "learning_rate": 1.6227540837206613e-06, + "loss": 0.5044, + "step": 6475 + }, + { + "epoch": 0.74, + "grad_norm": 2.1188290272697006, + "learning_rate": 1.6213820685430477e-06, + "loss": 0.5207, + "step": 6476 + }, + { + "epoch": 0.74, + "grad_norm": 1.8769755451955927, + "learning_rate": 1.620010521371006e-06, + "loss": 0.4482, + "step": 6477 + }, + { + "epoch": 0.74, + "grad_norm": 5.235845942195978, + "learning_rate": 1.6186394423945196e-06, + "loss": 0.5343, + "step": 6478 + }, + { + "epoch": 0.74, + "grad_norm": 2.038567248580995, + "learning_rate": 1.617268831803514e-06, + "loss": 0.4926, + "step": 6479 + }, + { + "epoch": 0.74, + "grad_norm": 2.338115106820033, + "learning_rate": 1.615898689787842e-06, + "loss": 0.4469, + "step": 6480 + }, + { + "epoch": 0.74, + "grad_norm": 2.151616747188723, + "learning_rate": 1.6145290165372994e-06, + "loss": 0.4721, + "step": 6481 + }, + { + "epoch": 0.74, + "grad_norm": 0.8494268864176064, + "learning_rate": 1.6131598122416091e-06, + "loss": 0.6913, + "step": 6482 + }, + { + "epoch": 0.74, + "grad_norm": 3.0869226272740136, + "learning_rate": 1.6117910770904365e-06, + "loss": 0.5689, + "step": 6483 + }, + { + "epoch": 0.75, + "grad_norm": 2.241384501135865, + "learning_rate": 1.6104228112733777e-06, + "loss": 0.4783, + "step": 6484 + }, + { + "epoch": 0.75, + "grad_norm": 2.303813222093908, + "learning_rate": 1.6090550149799634e-06, + "loss": 0.4778, + "step": 6485 + }, + { + "epoch": 0.75, + "grad_norm": 2.837938147458567, + "learning_rate": 1.6076876883996595e-06, + "loss": 0.4836, + "step": 6486 + }, + { + "epoch": 0.75, + "grad_norm": 1.7654473971557707, + "learning_rate": 1.6063208317218714e-06, + "loss": 0.4906, + "step": 6487 + }, + { + "epoch": 0.75, + "grad_norm": 1.6045997003348882, + "learning_rate": 1.6049544451359318e-06, + "loss": 0.4385, + "step": 6488 + }, + { + "epoch": 0.75, + "grad_norm": 2.9455510774500184, + "learning_rate": 1.6035885288311164e-06, + "loss": 0.4142, + "step": 6489 + }, + { + "epoch": 0.75, + "grad_norm": 2.220625328969181, + "learning_rate": 1.6022230829966278e-06, + "loss": 0.4961, + "step": 6490 + }, + { + "epoch": 0.75, + "grad_norm": 2.0445280253362896, + "learning_rate": 1.6008581078216111e-06, + "loss": 0.5092, + "step": 6491 + }, + { + "epoch": 0.75, + "grad_norm": 1.9874898014858842, + "learning_rate": 1.5994936034951402e-06, + "loss": 0.4813, + "step": 6492 + }, + { + "epoch": 0.75, + "grad_norm": 2.520446653323418, + "learning_rate": 1.5981295702062255e-06, + "loss": 0.5548, + "step": 6493 + }, + { + "epoch": 0.75, + "grad_norm": 2.0567385458596945, + "learning_rate": 1.5967660081438146e-06, + "loss": 0.4403, + "step": 6494 + }, + { + "epoch": 0.75, + "grad_norm": 2.533717328334331, + "learning_rate": 1.595402917496785e-06, + "loss": 0.4295, + "step": 6495 + }, + { + "epoch": 0.75, + "grad_norm": 2.350417094086101, + "learning_rate": 1.5940402984539554e-06, + "loss": 0.5359, + "step": 6496 + }, + { + "epoch": 0.75, + "grad_norm": 1.6200789870942718, + "learning_rate": 1.5926781512040723e-06, + "loss": 0.4778, + "step": 6497 + }, + { + "epoch": 0.75, + "grad_norm": 3.375887839362443, + "learning_rate": 1.591316475935823e-06, + "loss": 0.4472, + "step": 6498 + }, + { + "epoch": 0.75, + "grad_norm": 1.840483246365416, + "learning_rate": 1.5899552728378231e-06, + "loss": 0.4863, + "step": 6499 + }, + { + "epoch": 0.75, + "grad_norm": 1.565592707319838, + "learning_rate": 1.5885945420986321e-06, + "loss": 0.5255, + "step": 6500 + }, + { + "epoch": 0.75, + "grad_norm": 2.3243601300857955, + "learning_rate": 1.5872342839067305e-06, + "loss": 0.4858, + "step": 6501 + }, + { + "epoch": 0.75, + "grad_norm": 1.7428290057231206, + "learning_rate": 1.5858744984505465e-06, + "loss": 0.3655, + "step": 6502 + }, + { + "epoch": 0.75, + "grad_norm": 2.004284766770113, + "learning_rate": 1.5845151859184338e-06, + "loss": 0.5335, + "step": 6503 + }, + { + "epoch": 0.75, + "grad_norm": 2.9590778431250877, + "learning_rate": 1.5831563464986883e-06, + "loss": 0.5577, + "step": 6504 + }, + { + "epoch": 0.75, + "grad_norm": 1.9471470521338405, + "learning_rate": 1.5817979803795314e-06, + "loss": 0.4656, + "step": 6505 + }, + { + "epoch": 0.75, + "grad_norm": 2.0539476460015305, + "learning_rate": 1.5804400877491282e-06, + "loss": 0.5194, + "step": 6506 + }, + { + "epoch": 0.75, + "grad_norm": 0.9044183481952217, + "learning_rate": 1.57908266879557e-06, + "loss": 0.6776, + "step": 6507 + }, + { + "epoch": 0.75, + "grad_norm": 2.009004259154785, + "learning_rate": 1.5777257237068898e-06, + "loss": 0.5251, + "step": 6508 + }, + { + "epoch": 0.75, + "grad_norm": 1.61949569174443, + "learning_rate": 1.5763692526710484e-06, + "loss": 0.4259, + "step": 6509 + }, + { + "epoch": 0.75, + "grad_norm": 1.8368468941528928, + "learning_rate": 1.5750132558759463e-06, + "loss": 0.4993, + "step": 6510 + }, + { + "epoch": 0.75, + "grad_norm": 1.7350478970366157, + "learning_rate": 1.5736577335094128e-06, + "loss": 0.5492, + "step": 6511 + }, + { + "epoch": 0.75, + "grad_norm": 1.9835185449775052, + "learning_rate": 1.5723026857592184e-06, + "loss": 0.466, + "step": 6512 + }, + { + "epoch": 0.75, + "grad_norm": 2.1710787715551576, + "learning_rate": 1.5709481128130628e-06, + "loss": 0.4599, + "step": 6513 + }, + { + "epoch": 0.75, + "grad_norm": 2.17593040766446, + "learning_rate": 1.5695940148585787e-06, + "loss": 0.4722, + "step": 6514 + }, + { + "epoch": 0.75, + "grad_norm": 1.8489899581094549, + "learning_rate": 1.5682403920833388e-06, + "loss": 0.5329, + "step": 6515 + }, + { + "epoch": 0.75, + "grad_norm": 1.8639757309254745, + "learning_rate": 1.566887244674844e-06, + "loss": 0.4902, + "step": 6516 + }, + { + "epoch": 0.75, + "grad_norm": 1.7582749214117077, + "learning_rate": 1.5655345728205351e-06, + "loss": 0.4425, + "step": 6517 + }, + { + "epoch": 0.75, + "grad_norm": 1.9459301081776175, + "learning_rate": 1.5641823767077824e-06, + "loss": 0.5635, + "step": 6518 + }, + { + "epoch": 0.75, + "grad_norm": 2.0018333615394157, + "learning_rate": 1.5628306565238915e-06, + "loss": 0.5627, + "step": 6519 + }, + { + "epoch": 0.75, + "grad_norm": 2.270063467558181, + "learning_rate": 1.5614794124561017e-06, + "loss": 0.4462, + "step": 6520 + }, + { + "epoch": 0.75, + "grad_norm": 3.0825871551195156, + "learning_rate": 1.560128644691589e-06, + "loss": 0.5102, + "step": 6521 + }, + { + "epoch": 0.75, + "grad_norm": 1.773889060479605, + "learning_rate": 1.5587783534174595e-06, + "loss": 0.433, + "step": 6522 + }, + { + "epoch": 0.75, + "grad_norm": 2.184627689609748, + "learning_rate": 1.5574285388207576e-06, + "loss": 0.401, + "step": 6523 + }, + { + "epoch": 0.75, + "grad_norm": 1.9654382227985172, + "learning_rate": 1.5560792010884574e-06, + "loss": 0.4702, + "step": 6524 + }, + { + "epoch": 0.75, + "grad_norm": 0.7981755577564958, + "learning_rate": 1.554730340407471e-06, + "loss": 0.6933, + "step": 6525 + }, + { + "epoch": 0.75, + "grad_norm": 5.254894237436918, + "learning_rate": 1.553381956964639e-06, + "loss": 0.4319, + "step": 6526 + }, + { + "epoch": 0.75, + "grad_norm": 2.1722644940021274, + "learning_rate": 1.5520340509467435e-06, + "loss": 0.465, + "step": 6527 + }, + { + "epoch": 0.75, + "grad_norm": 2.059492478884718, + "learning_rate": 1.5506866225404926e-06, + "loss": 0.5407, + "step": 6528 + }, + { + "epoch": 0.75, + "grad_norm": 1.8130527865409911, + "learning_rate": 1.5493396719325343e-06, + "loss": 0.5493, + "step": 6529 + }, + { + "epoch": 0.75, + "grad_norm": 3.2083034783499094, + "learning_rate": 1.547993199309446e-06, + "loss": 0.6175, + "step": 6530 + }, + { + "epoch": 0.75, + "grad_norm": 1.9651822619407406, + "learning_rate": 1.546647204857743e-06, + "loss": 0.4722, + "step": 6531 + }, + { + "epoch": 0.75, + "grad_norm": 3.629206748747898, + "learning_rate": 1.545301688763871e-06, + "loss": 0.4409, + "step": 6532 + }, + { + "epoch": 0.75, + "grad_norm": 4.05629937912357, + "learning_rate": 1.5439566512142095e-06, + "loss": 0.5608, + "step": 6533 + }, + { + "epoch": 0.75, + "grad_norm": 2.2203633035077437, + "learning_rate": 1.5426120923950755e-06, + "loss": 0.5351, + "step": 6534 + }, + { + "epoch": 0.75, + "grad_norm": 2.9629628700751978, + "learning_rate": 1.5412680124927154e-06, + "loss": 0.4183, + "step": 6535 + }, + { + "epoch": 0.75, + "grad_norm": 2.1655307757075035, + "learning_rate": 1.5399244116933098e-06, + "loss": 0.5619, + "step": 6536 + }, + { + "epoch": 0.75, + "grad_norm": 2.7060701668836673, + "learning_rate": 1.5385812901829766e-06, + "loss": 0.5226, + "step": 6537 + }, + { + "epoch": 0.75, + "grad_norm": 1.7364926782371983, + "learning_rate": 1.5372386481477641e-06, + "loss": 0.4513, + "step": 6538 + }, + { + "epoch": 0.75, + "grad_norm": 2.63487142639888, + "learning_rate": 1.5358964857736524e-06, + "loss": 0.5477, + "step": 6539 + }, + { + "epoch": 0.75, + "grad_norm": 2.1386593216876606, + "learning_rate": 1.5345548032465613e-06, + "loss": 0.5046, + "step": 6540 + }, + { + "epoch": 0.75, + "grad_norm": 1.8196775447189655, + "learning_rate": 1.5332136007523368e-06, + "loss": 0.4471, + "step": 6541 + }, + { + "epoch": 0.75, + "grad_norm": 2.523946785359498, + "learning_rate": 1.5318728784767656e-06, + "loss": 0.453, + "step": 6542 + }, + { + "epoch": 0.75, + "grad_norm": 1.9099040637123323, + "learning_rate": 1.5305326366055606e-06, + "loss": 0.4545, + "step": 6543 + }, + { + "epoch": 0.75, + "grad_norm": 1.9811563678089645, + "learning_rate": 1.5291928753243757e-06, + "loss": 0.4474, + "step": 6544 + }, + { + "epoch": 0.75, + "grad_norm": 1.8091770421846807, + "learning_rate": 1.5278535948187912e-06, + "loss": 0.5124, + "step": 6545 + }, + { + "epoch": 0.75, + "grad_norm": 2.721423777899966, + "learning_rate": 1.5265147952743263e-06, + "loss": 0.3936, + "step": 6546 + }, + { + "epoch": 0.75, + "grad_norm": 1.9753044279944842, + "learning_rate": 1.5251764768764293e-06, + "loss": 0.4866, + "step": 6547 + }, + { + "epoch": 0.75, + "grad_norm": 2.3621269206789646, + "learning_rate": 1.5238386398104864e-06, + "loss": 0.4053, + "step": 6548 + }, + { + "epoch": 0.75, + "grad_norm": 1.732112313838319, + "learning_rate": 1.5225012842618114e-06, + "loss": 0.3963, + "step": 6549 + }, + { + "epoch": 0.75, + "grad_norm": 1.6981184351366223, + "learning_rate": 1.5211644104156575e-06, + "loss": 0.576, + "step": 6550 + }, + { + "epoch": 0.75, + "grad_norm": 2.407810505562715, + "learning_rate": 1.5198280184572072e-06, + "loss": 0.5632, + "step": 6551 + }, + { + "epoch": 0.75, + "grad_norm": 1.879663918914485, + "learning_rate": 1.518492108571577e-06, + "loss": 0.5241, + "step": 6552 + }, + { + "epoch": 0.75, + "grad_norm": 2.0734233824369928, + "learning_rate": 1.5171566809438154e-06, + "loss": 0.5232, + "step": 6553 + }, + { + "epoch": 0.75, + "grad_norm": 6.02367506864186, + "learning_rate": 1.5158217357589084e-06, + "loss": 0.5619, + "step": 6554 + }, + { + "epoch": 0.75, + "grad_norm": 2.321301776742757, + "learning_rate": 1.5144872732017696e-06, + "loss": 0.555, + "step": 6555 + }, + { + "epoch": 0.75, + "grad_norm": 1.9950068405223962, + "learning_rate": 1.5131532934572517e-06, + "loss": 0.4192, + "step": 6556 + }, + { + "epoch": 0.75, + "grad_norm": 3.917789848600706, + "learning_rate": 1.5118197967101356e-06, + "loss": 0.4867, + "step": 6557 + }, + { + "epoch": 0.75, + "grad_norm": 7.553987917661604, + "learning_rate": 1.5104867831451353e-06, + "loss": 0.4546, + "step": 6558 + }, + { + "epoch": 0.75, + "grad_norm": 2.2052232438341757, + "learning_rate": 1.5091542529469034e-06, + "loss": 0.5113, + "step": 6559 + }, + { + "epoch": 0.75, + "grad_norm": 1.976599209711734, + "learning_rate": 1.507822206300018e-06, + "loss": 0.4657, + "step": 6560 + }, + { + "epoch": 0.75, + "grad_norm": 3.113744940036828, + "learning_rate": 1.506490643388997e-06, + "loss": 0.5725, + "step": 6561 + }, + { + "epoch": 0.75, + "grad_norm": 3.447207902052716, + "learning_rate": 1.5051595643982858e-06, + "loss": 0.5257, + "step": 6562 + }, + { + "epoch": 0.75, + "grad_norm": 2.0297719082331636, + "learning_rate": 1.5038289695122676e-06, + "loss": 0.4612, + "step": 6563 + }, + { + "epoch": 0.75, + "grad_norm": 2.840962244723054, + "learning_rate": 1.502498858915254e-06, + "loss": 0.4838, + "step": 6564 + }, + { + "epoch": 0.75, + "grad_norm": 2.074981335749734, + "learning_rate": 1.501169232791494e-06, + "loss": 0.4652, + "step": 6565 + }, + { + "epoch": 0.75, + "grad_norm": 1.7213026150012902, + "learning_rate": 1.4998400913251637e-06, + "loss": 0.3766, + "step": 6566 + }, + { + "epoch": 0.75, + "grad_norm": 4.373812447332412, + "learning_rate": 1.4985114347003799e-06, + "loss": 0.4282, + "step": 6567 + }, + { + "epoch": 0.75, + "grad_norm": 2.146567323360134, + "learning_rate": 1.4971832631011857e-06, + "loss": 0.5062, + "step": 6568 + }, + { + "epoch": 0.75, + "grad_norm": 1.9561716934132185, + "learning_rate": 1.495855576711559e-06, + "loss": 0.5109, + "step": 6569 + }, + { + "epoch": 0.75, + "grad_norm": 2.180869293050752, + "learning_rate": 1.4945283757154095e-06, + "loss": 0.4952, + "step": 6570 + }, + { + "epoch": 0.76, + "grad_norm": 1.8989991772210126, + "learning_rate": 1.4932016602965838e-06, + "loss": 0.5181, + "step": 6571 + }, + { + "epoch": 0.76, + "grad_norm": 1.9704282840750442, + "learning_rate": 1.4918754306388544e-06, + "loss": 0.446, + "step": 6572 + }, + { + "epoch": 0.76, + "grad_norm": 2.1743100052283038, + "learning_rate": 1.4905496869259351e-06, + "loss": 0.385, + "step": 6573 + }, + { + "epoch": 0.76, + "grad_norm": 1.896113551308374, + "learning_rate": 1.4892244293414636e-06, + "loss": 0.5528, + "step": 6574 + }, + { + "epoch": 0.76, + "grad_norm": 4.0867008139008565, + "learning_rate": 1.4878996580690175e-06, + "loss": 0.3734, + "step": 6575 + }, + { + "epoch": 0.76, + "grad_norm": 2.0302456674221756, + "learning_rate": 1.4865753732921012e-06, + "loss": 0.5046, + "step": 6576 + }, + { + "epoch": 0.76, + "grad_norm": 2.3610913084914906, + "learning_rate": 1.4852515751941565e-06, + "loss": 0.5336, + "step": 6577 + }, + { + "epoch": 0.76, + "grad_norm": 2.109537909584165, + "learning_rate": 1.4839282639585557e-06, + "loss": 0.4624, + "step": 6578 + }, + { + "epoch": 0.76, + "grad_norm": 1.9397038821135526, + "learning_rate": 1.4826054397686008e-06, + "loss": 0.4036, + "step": 6579 + }, + { + "epoch": 0.76, + "grad_norm": 2.2767227633423626, + "learning_rate": 1.4812831028075324e-06, + "loss": 0.4076, + "step": 6580 + }, + { + "epoch": 0.76, + "grad_norm": 1.8132435721433877, + "learning_rate": 1.4799612532585178e-06, + "loss": 0.4231, + "step": 6581 + }, + { + "epoch": 0.76, + "grad_norm": 2.083181386480729, + "learning_rate": 1.4786398913046628e-06, + "loss": 0.4911, + "step": 6582 + }, + { + "epoch": 0.76, + "grad_norm": 1.8136039211205544, + "learning_rate": 1.4773190171289981e-06, + "loss": 0.4411, + "step": 6583 + }, + { + "epoch": 0.76, + "grad_norm": 2.413975949078423, + "learning_rate": 1.4759986309144947e-06, + "loss": 0.4696, + "step": 6584 + }, + { + "epoch": 0.76, + "grad_norm": 2.5380519490564257, + "learning_rate": 1.4746787328440503e-06, + "loss": 0.4677, + "step": 6585 + }, + { + "epoch": 0.76, + "grad_norm": 2.5327498755294644, + "learning_rate": 1.4733593231004972e-06, + "loss": 0.4937, + "step": 6586 + }, + { + "epoch": 0.76, + "grad_norm": 1.834001152876658, + "learning_rate": 1.4720404018665985e-06, + "loss": 0.4485, + "step": 6587 + }, + { + "epoch": 0.76, + "grad_norm": 2.656975754038567, + "learning_rate": 1.4707219693250541e-06, + "loss": 0.4647, + "step": 6588 + }, + { + "epoch": 0.76, + "grad_norm": 1.985685645394257, + "learning_rate": 1.469404025658489e-06, + "loss": 0.534, + "step": 6589 + }, + { + "epoch": 0.76, + "grad_norm": 2.2108764104319367, + "learning_rate": 1.4680865710494691e-06, + "loss": 0.4688, + "step": 6590 + }, + { + "epoch": 0.76, + "grad_norm": 1.929567640616009, + "learning_rate": 1.466769605680483e-06, + "loss": 0.447, + "step": 6591 + }, + { + "epoch": 0.76, + "grad_norm": 0.8315123547880507, + "learning_rate": 1.465453129733962e-06, + "loss": 0.6603, + "step": 6592 + }, + { + "epoch": 0.76, + "grad_norm": 2.7953323448257033, + "learning_rate": 1.4641371433922585e-06, + "loss": 0.4477, + "step": 6593 + }, + { + "epoch": 0.76, + "grad_norm": 2.396355983041216, + "learning_rate": 1.4628216468376677e-06, + "loss": 0.5773, + "step": 6594 + }, + { + "epoch": 0.76, + "grad_norm": 0.8604037469684871, + "learning_rate": 1.461506640252408e-06, + "loss": 0.6612, + "step": 6595 + }, + { + "epoch": 0.76, + "grad_norm": 1.8318393397870936, + "learning_rate": 1.4601921238186374e-06, + "loss": 0.5352, + "step": 6596 + }, + { + "epoch": 0.76, + "grad_norm": 1.8227475262603838, + "learning_rate": 1.4588780977184402e-06, + "loss": 0.3912, + "step": 6597 + }, + { + "epoch": 0.76, + "grad_norm": 2.1349264150938163, + "learning_rate": 1.4575645621338346e-06, + "loss": 0.4929, + "step": 6598 + }, + { + "epoch": 0.76, + "grad_norm": 2.2500825300809826, + "learning_rate": 1.4562515172467734e-06, + "loss": 0.4307, + "step": 6599 + }, + { + "epoch": 0.76, + "grad_norm": 2.4081264347155016, + "learning_rate": 1.454938963239137e-06, + "loss": 0.539, + "step": 6600 + }, + { + "epoch": 0.76, + "grad_norm": 2.4067927275539147, + "learning_rate": 1.4536269002927427e-06, + "loss": 0.4989, + "step": 6601 + }, + { + "epoch": 0.76, + "grad_norm": 1.8227725375285402, + "learning_rate": 1.452315328589336e-06, + "loss": 0.481, + "step": 6602 + }, + { + "epoch": 0.76, + "grad_norm": 0.7991728316623232, + "learning_rate": 1.4510042483105957e-06, + "loss": 0.6959, + "step": 6603 + }, + { + "epoch": 0.76, + "grad_norm": 2.145257616019355, + "learning_rate": 1.44969365963813e-06, + "loss": 0.4675, + "step": 6604 + }, + { + "epoch": 0.76, + "grad_norm": 1.8316462068486614, + "learning_rate": 1.4483835627534858e-06, + "loss": 0.5692, + "step": 6605 + }, + { + "epoch": 0.76, + "grad_norm": 2.531275864182249, + "learning_rate": 1.4470739578381338e-06, + "loss": 0.4897, + "step": 6606 + }, + { + "epoch": 0.76, + "grad_norm": 2.0105876107711502, + "learning_rate": 1.445764845073483e-06, + "loss": 0.5321, + "step": 6607 + }, + { + "epoch": 0.76, + "grad_norm": 5.220177639510594, + "learning_rate": 1.4444562246408689e-06, + "loss": 0.4963, + "step": 6608 + }, + { + "epoch": 0.76, + "grad_norm": 2.6561210678092055, + "learning_rate": 1.4431480967215645e-06, + "loss": 0.4517, + "step": 6609 + }, + { + "epoch": 0.76, + "grad_norm": 1.850792347980298, + "learning_rate": 1.441840461496768e-06, + "loss": 0.3954, + "step": 6610 + }, + { + "epoch": 0.76, + "grad_norm": 2.527090126991325, + "learning_rate": 1.4405333191476157e-06, + "loss": 0.5334, + "step": 6611 + }, + { + "epoch": 0.76, + "grad_norm": 0.8827810382458436, + "learning_rate": 1.43922666985517e-06, + "loss": 0.6992, + "step": 6612 + }, + { + "epoch": 0.76, + "grad_norm": 2.1592274873138466, + "learning_rate": 1.437920513800431e-06, + "loss": 0.5163, + "step": 6613 + }, + { + "epoch": 0.76, + "grad_norm": 1.7869913810243843, + "learning_rate": 1.4366148511643235e-06, + "loss": 0.4591, + "step": 6614 + }, + { + "epoch": 0.76, + "grad_norm": 2.1662034409806004, + "learning_rate": 1.4353096821277118e-06, + "loss": 0.4818, + "step": 6615 + }, + { + "epoch": 0.76, + "grad_norm": 1.6523858417216504, + "learning_rate": 1.434005006871385e-06, + "loss": 0.4172, + "step": 6616 + }, + { + "epoch": 0.76, + "grad_norm": 0.7986419634734306, + "learning_rate": 1.432700825576066e-06, + "loss": 0.6827, + "step": 6617 + }, + { + "epoch": 0.76, + "grad_norm": 2.564660496633666, + "learning_rate": 1.4313971384224117e-06, + "loss": 0.4548, + "step": 6618 + }, + { + "epoch": 0.76, + "grad_norm": 2.606948956125746, + "learning_rate": 1.4300939455910084e-06, + "loss": 0.4268, + "step": 6619 + }, + { + "epoch": 0.76, + "grad_norm": 6.007475510505929, + "learning_rate": 1.428791247262371e-06, + "loss": 0.4881, + "step": 6620 + }, + { + "epoch": 0.76, + "grad_norm": 3.915019256243825, + "learning_rate": 1.4274890436169537e-06, + "loss": 0.5175, + "step": 6621 + }, + { + "epoch": 0.76, + "grad_norm": 13.23176047380593, + "learning_rate": 1.426187334835135e-06, + "loss": 0.4908, + "step": 6622 + }, + { + "epoch": 0.76, + "grad_norm": 2.1606317416565974, + "learning_rate": 1.4248861210972265e-06, + "loss": 0.4951, + "step": 6623 + }, + { + "epoch": 0.76, + "grad_norm": 1.7357673208513777, + "learning_rate": 1.4235854025834744e-06, + "loss": 0.5471, + "step": 6624 + }, + { + "epoch": 0.76, + "grad_norm": 1.916909522410285, + "learning_rate": 1.4222851794740516e-06, + "loss": 0.4742, + "step": 6625 + }, + { + "epoch": 0.76, + "grad_norm": 3.002212867463506, + "learning_rate": 1.4209854519490673e-06, + "loss": 0.522, + "step": 6626 + }, + { + "epoch": 0.76, + "grad_norm": 2.200180532008721, + "learning_rate": 1.4196862201885569e-06, + "loss": 0.4447, + "step": 6627 + }, + { + "epoch": 0.76, + "grad_norm": 2.285593349131978, + "learning_rate": 1.4183874843724927e-06, + "loss": 0.5197, + "step": 6628 + }, + { + "epoch": 0.76, + "grad_norm": 1.8269835896463489, + "learning_rate": 1.4170892446807721e-06, + "loss": 0.5648, + "step": 6629 + }, + { + "epoch": 0.76, + "grad_norm": 2.131528002473121, + "learning_rate": 1.4157915012932306e-06, + "loss": 0.5684, + "step": 6630 + }, + { + "epoch": 0.76, + "grad_norm": 1.9152060192451006, + "learning_rate": 1.4144942543896278e-06, + "loss": 0.4851, + "step": 6631 + }, + { + "epoch": 0.76, + "grad_norm": 2.325849778014444, + "learning_rate": 1.4131975041496615e-06, + "loss": 0.5064, + "step": 6632 + }, + { + "epoch": 0.76, + "grad_norm": 2.6409304012787898, + "learning_rate": 1.4119012507529546e-06, + "loss": 0.4878, + "step": 6633 + }, + { + "epoch": 0.76, + "grad_norm": 1.9891175117290074, + "learning_rate": 1.410605494379066e-06, + "loss": 0.4987, + "step": 6634 + }, + { + "epoch": 0.76, + "grad_norm": 1.8524344984751682, + "learning_rate": 1.4093102352074822e-06, + "loss": 0.5424, + "step": 6635 + }, + { + "epoch": 0.76, + "grad_norm": 1.7056084386475636, + "learning_rate": 1.4080154734176233e-06, + "loss": 0.4137, + "step": 6636 + }, + { + "epoch": 0.76, + "grad_norm": 2.483467293651881, + "learning_rate": 1.406721209188837e-06, + "loss": 0.5231, + "step": 6637 + }, + { + "epoch": 0.76, + "grad_norm": 1.8730483855541245, + "learning_rate": 1.4054274427004083e-06, + "loss": 0.4751, + "step": 6638 + }, + { + "epoch": 0.76, + "grad_norm": 0.884111771177903, + "learning_rate": 1.4041341741315456e-06, + "loss": 0.6868, + "step": 6639 + }, + { + "epoch": 0.76, + "grad_norm": 2.866258976394583, + "learning_rate": 1.4028414036613962e-06, + "loss": 0.4424, + "step": 6640 + }, + { + "epoch": 0.76, + "grad_norm": 2.224037300154087, + "learning_rate": 1.401549131469031e-06, + "loss": 0.5474, + "step": 6641 + }, + { + "epoch": 0.76, + "grad_norm": 2.282468745165187, + "learning_rate": 1.4002573577334583e-06, + "loss": 0.5153, + "step": 6642 + }, + { + "epoch": 0.76, + "grad_norm": 2.4825000530042605, + "learning_rate": 1.3989660826336133e-06, + "loss": 0.4304, + "step": 6643 + }, + { + "epoch": 0.76, + "grad_norm": 1.7171878760574766, + "learning_rate": 1.3976753063483605e-06, + "loss": 0.5521, + "step": 6644 + }, + { + "epoch": 0.76, + "grad_norm": 3.092815378631865, + "learning_rate": 1.3963850290565023e-06, + "loss": 0.4809, + "step": 6645 + }, + { + "epoch": 0.76, + "grad_norm": 2.598373402938719, + "learning_rate": 1.3950952509367644e-06, + "loss": 0.4018, + "step": 6646 + }, + { + "epoch": 0.76, + "grad_norm": 2.1026657426709794, + "learning_rate": 1.3938059721678088e-06, + "loss": 0.5078, + "step": 6647 + }, + { + "epoch": 0.76, + "grad_norm": 1.9541351430320089, + "learning_rate": 1.3925171929282243e-06, + "loss": 0.4653, + "step": 6648 + }, + { + "epoch": 0.76, + "grad_norm": 6.131037081244338, + "learning_rate": 1.3912289133965345e-06, + "loss": 0.5085, + "step": 6649 + }, + { + "epoch": 0.76, + "grad_norm": 3.11137653844253, + "learning_rate": 1.3899411337511897e-06, + "loss": 0.3772, + "step": 6650 + }, + { + "epoch": 0.76, + "grad_norm": 1.877443913788874, + "learning_rate": 1.3886538541705751e-06, + "loss": 0.4421, + "step": 6651 + }, + { + "epoch": 0.76, + "grad_norm": 2.8294019185900634, + "learning_rate": 1.3873670748330025e-06, + "loss": 0.5097, + "step": 6652 + }, + { + "epoch": 0.76, + "grad_norm": 2.4641861458968948, + "learning_rate": 1.3860807959167178e-06, + "loss": 0.3986, + "step": 6653 + }, + { + "epoch": 0.76, + "grad_norm": 1.6126462954653091, + "learning_rate": 1.3847950175998932e-06, + "loss": 0.4066, + "step": 6654 + }, + { + "epoch": 0.76, + "grad_norm": 2.4857846844518905, + "learning_rate": 1.3835097400606384e-06, + "loss": 0.4137, + "step": 6655 + }, + { + "epoch": 0.76, + "grad_norm": 2.295171109763108, + "learning_rate": 1.3822249634769864e-06, + "loss": 0.3941, + "step": 6656 + }, + { + "epoch": 0.76, + "grad_norm": 2.335849644499583, + "learning_rate": 1.3809406880269073e-06, + "loss": 0.4476, + "step": 6657 + }, + { + "epoch": 0.77, + "grad_norm": 1.9184579543243196, + "learning_rate": 1.379656913888296e-06, + "loss": 0.5073, + "step": 6658 + }, + { + "epoch": 0.77, + "grad_norm": 2.4596407911503575, + "learning_rate": 1.3783736412389831e-06, + "loss": 0.4856, + "step": 6659 + }, + { + "epoch": 0.77, + "grad_norm": 2.4712794136793605, + "learning_rate": 1.377090870256725e-06, + "loss": 0.5209, + "step": 6660 + }, + { + "epoch": 0.77, + "grad_norm": 2.374465108561623, + "learning_rate": 1.3758086011192135e-06, + "loss": 0.427, + "step": 6661 + }, + { + "epoch": 0.77, + "grad_norm": 1.8257027227663676, + "learning_rate": 1.3745268340040663e-06, + "loss": 0.4491, + "step": 6662 + }, + { + "epoch": 0.77, + "grad_norm": 1.991417850680566, + "learning_rate": 1.373245569088833e-06, + "loss": 0.5223, + "step": 6663 + }, + { + "epoch": 0.77, + "grad_norm": 2.1221172568523756, + "learning_rate": 1.3719648065509966e-06, + "loss": 0.3985, + "step": 6664 + }, + { + "epoch": 0.77, + "grad_norm": 2.5704922343822907, + "learning_rate": 1.3706845465679658e-06, + "loss": 0.5162, + "step": 6665 + }, + { + "epoch": 0.77, + "grad_norm": 1.7160631421619812, + "learning_rate": 1.3694047893170841e-06, + "loss": 0.4201, + "step": 6666 + }, + { + "epoch": 0.77, + "grad_norm": 1.8697161248998917, + "learning_rate": 1.3681255349756206e-06, + "loss": 0.5343, + "step": 6667 + }, + { + "epoch": 0.77, + "grad_norm": 2.02145798006618, + "learning_rate": 1.3668467837207805e-06, + "loss": 0.584, + "step": 6668 + }, + { + "epoch": 0.77, + "grad_norm": 1.7749988908637504, + "learning_rate": 1.365568535729695e-06, + "loss": 0.5244, + "step": 6669 + }, + { + "epoch": 0.77, + "grad_norm": 1.6333339003228142, + "learning_rate": 1.3642907911794257e-06, + "loss": 0.3855, + "step": 6670 + }, + { + "epoch": 0.77, + "grad_norm": 2.1050267200370243, + "learning_rate": 1.3630135502469655e-06, + "loss": 0.5186, + "step": 6671 + }, + { + "epoch": 0.77, + "grad_norm": 5.171583801043754, + "learning_rate": 1.3617368131092396e-06, + "loss": 0.5437, + "step": 6672 + }, + { + "epoch": 0.77, + "grad_norm": 2.972909725672133, + "learning_rate": 1.3604605799430987e-06, + "loss": 0.4793, + "step": 6673 + }, + { + "epoch": 0.77, + "grad_norm": 2.0314852965674035, + "learning_rate": 1.3591848509253292e-06, + "loss": 0.409, + "step": 6674 + }, + { + "epoch": 0.77, + "grad_norm": 2.1976797939625277, + "learning_rate": 1.357909626232642e-06, + "loss": 0.5483, + "step": 6675 + }, + { + "epoch": 0.77, + "grad_norm": 1.7451618155841675, + "learning_rate": 1.3566349060416845e-06, + "loss": 0.4602, + "step": 6676 + }, + { + "epoch": 0.77, + "grad_norm": 2.271396155610624, + "learning_rate": 1.3553606905290268e-06, + "loss": 0.5312, + "step": 6677 + }, + { + "epoch": 0.77, + "grad_norm": 1.7799878881217612, + "learning_rate": 1.3540869798711765e-06, + "loss": 0.5145, + "step": 6678 + }, + { + "epoch": 0.77, + "grad_norm": 2.0740888029896687, + "learning_rate": 1.352813774244565e-06, + "loss": 0.4343, + "step": 6679 + }, + { + "epoch": 0.77, + "grad_norm": 2.1783691349729897, + "learning_rate": 1.351541073825559e-06, + "loss": 0.5254, + "step": 6680 + }, + { + "epoch": 0.77, + "grad_norm": 3.115331027551632, + "learning_rate": 1.3502688787904505e-06, + "loss": 0.4274, + "step": 6681 + }, + { + "epoch": 0.77, + "grad_norm": 2.180658267193341, + "learning_rate": 1.3489971893154658e-06, + "loss": 0.4842, + "step": 6682 + }, + { + "epoch": 0.77, + "grad_norm": 2.200878522626764, + "learning_rate": 1.3477260055767583e-06, + "loss": 0.5514, + "step": 6683 + }, + { + "epoch": 0.77, + "grad_norm": 4.141751692830821, + "learning_rate": 1.34645532775041e-06, + "loss": 0.4969, + "step": 6684 + }, + { + "epoch": 0.77, + "grad_norm": 2.542104361263541, + "learning_rate": 1.345185156012439e-06, + "loss": 0.4858, + "step": 6685 + }, + { + "epoch": 0.77, + "grad_norm": 6.199353934404737, + "learning_rate": 1.3439154905387869e-06, + "loss": 0.5359, + "step": 6686 + }, + { + "epoch": 0.77, + "grad_norm": 3.0536693787132463, + "learning_rate": 1.3426463315053279e-06, + "loss": 0.4807, + "step": 6687 + }, + { + "epoch": 0.77, + "grad_norm": 2.8629775979087793, + "learning_rate": 1.341377679087864e-06, + "loss": 0.4624, + "step": 6688 + }, + { + "epoch": 0.77, + "grad_norm": 1.8700245662012343, + "learning_rate": 1.3401095334621317e-06, + "loss": 0.4581, + "step": 6689 + }, + { + "epoch": 0.77, + "grad_norm": 1.9597611887978124, + "learning_rate": 1.3388418948037912e-06, + "loss": 0.5094, + "step": 6690 + }, + { + "epoch": 0.77, + "grad_norm": 2.408663139893484, + "learning_rate": 1.3375747632884394e-06, + "loss": 0.4206, + "step": 6691 + }, + { + "epoch": 0.77, + "grad_norm": 2.233119843177849, + "learning_rate": 1.336308139091595e-06, + "loss": 0.5235, + "step": 6692 + }, + { + "epoch": 0.77, + "grad_norm": 1.7859186690040665, + "learning_rate": 1.3350420223887145e-06, + "loss": 0.421, + "step": 6693 + }, + { + "epoch": 0.77, + "grad_norm": 2.845046863870964, + "learning_rate": 1.3337764133551767e-06, + "loss": 0.5359, + "step": 6694 + }, + { + "epoch": 0.77, + "grad_norm": 1.8607617862151697, + "learning_rate": 1.3325113121662964e-06, + "loss": 0.43, + "step": 6695 + }, + { + "epoch": 0.77, + "grad_norm": 1.9771253330996137, + "learning_rate": 1.3312467189973122e-06, + "loss": 0.5041, + "step": 6696 + }, + { + "epoch": 0.77, + "grad_norm": 1.8774328281013428, + "learning_rate": 1.329982634023399e-06, + "loss": 0.3976, + "step": 6697 + }, + { + "epoch": 0.77, + "grad_norm": 1.7853459653958383, + "learning_rate": 1.328719057419654e-06, + "loss": 0.5299, + "step": 6698 + }, + { + "epoch": 0.77, + "grad_norm": 1.7385518925394607, + "learning_rate": 1.327455989361111e-06, + "loss": 0.4364, + "step": 6699 + }, + { + "epoch": 0.77, + "grad_norm": 1.9275880876634222, + "learning_rate": 1.3261934300227263e-06, + "loss": 0.6018, + "step": 6700 + }, + { + "epoch": 0.77, + "grad_norm": 1.9209156302563102, + "learning_rate": 1.3249313795793934e-06, + "loss": 0.5259, + "step": 6701 + }, + { + "epoch": 0.77, + "grad_norm": 1.8747399048716538, + "learning_rate": 1.3236698382059287e-06, + "loss": 0.4488, + "step": 6702 + }, + { + "epoch": 0.77, + "grad_norm": 2.046195900023158, + "learning_rate": 1.3224088060770817e-06, + "loss": 0.5061, + "step": 6703 + }, + { + "epoch": 0.77, + "grad_norm": 1.8702143119137158, + "learning_rate": 1.3211482833675283e-06, + "loss": 0.4952, + "step": 6704 + }, + { + "epoch": 0.77, + "grad_norm": 1.8926954428513714, + "learning_rate": 1.3198882702518789e-06, + "loss": 0.499, + "step": 6705 + }, + { + "epoch": 0.77, + "grad_norm": 2.3506148361316903, + "learning_rate": 1.318628766904667e-06, + "loss": 0.4748, + "step": 6706 + }, + { + "epoch": 0.77, + "grad_norm": 1.9381458265235822, + "learning_rate": 1.3173697735003627e-06, + "loss": 0.5442, + "step": 6707 + }, + { + "epoch": 0.77, + "grad_norm": 2.45641902634138, + "learning_rate": 1.3161112902133594e-06, + "loss": 0.4845, + "step": 6708 + }, + { + "epoch": 0.77, + "grad_norm": 1.911917760111031, + "learning_rate": 1.3148533172179806e-06, + "loss": 0.5563, + "step": 6709 + }, + { + "epoch": 0.77, + "grad_norm": 4.898658814367033, + "learning_rate": 1.3135958546884836e-06, + "loss": 0.4194, + "step": 6710 + }, + { + "epoch": 0.77, + "grad_norm": 2.1850544090484014, + "learning_rate": 1.3123389027990492e-06, + "loss": 0.4339, + "step": 6711 + }, + { + "epoch": 0.77, + "grad_norm": 2.319310540817452, + "learning_rate": 1.3110824617237922e-06, + "loss": 0.4327, + "step": 6712 + }, + { + "epoch": 0.77, + "grad_norm": 2.2309585501826974, + "learning_rate": 1.309826531636752e-06, + "loss": 0.567, + "step": 6713 + }, + { + "epoch": 0.77, + "grad_norm": 1.8230232281134822, + "learning_rate": 1.3085711127119033e-06, + "loss": 0.4394, + "step": 6714 + }, + { + "epoch": 0.77, + "grad_norm": 7.105475986335523, + "learning_rate": 1.3073162051231431e-06, + "loss": 0.5651, + "step": 6715 + }, + { + "epoch": 0.77, + "grad_norm": 0.8639335527642217, + "learning_rate": 1.306061809044304e-06, + "loss": 0.6957, + "step": 6716 + }, + { + "epoch": 0.77, + "grad_norm": 2.0500317119988685, + "learning_rate": 1.3048079246491418e-06, + "loss": 0.4819, + "step": 6717 + }, + { + "epoch": 0.77, + "grad_norm": 2.792999354237437, + "learning_rate": 1.3035545521113473e-06, + "loss": 0.5142, + "step": 6718 + }, + { + "epoch": 0.77, + "grad_norm": 1.580358152576347, + "learning_rate": 1.302301691604535e-06, + "loss": 0.4886, + "step": 6719 + }, + { + "epoch": 0.77, + "grad_norm": 1.8364984487164793, + "learning_rate": 1.3010493433022543e-06, + "loss": 0.3732, + "step": 6720 + }, + { + "epoch": 0.77, + "grad_norm": 2.7054045130686233, + "learning_rate": 1.299797507377975e-06, + "loss": 0.4411, + "step": 6721 + }, + { + "epoch": 0.77, + "grad_norm": 2.2869154653238772, + "learning_rate": 1.298546184005106e-06, + "loss": 0.4411, + "step": 6722 + }, + { + "epoch": 0.77, + "grad_norm": 1.5432253021972209, + "learning_rate": 1.2972953733569764e-06, + "loss": 0.408, + "step": 6723 + }, + { + "epoch": 0.77, + "grad_norm": 1.7813140598872401, + "learning_rate": 1.2960450756068526e-06, + "loss": 0.461, + "step": 6724 + }, + { + "epoch": 0.77, + "grad_norm": 3.4014320847964417, + "learning_rate": 1.2947952909279216e-06, + "loss": 0.4893, + "step": 6725 + }, + { + "epoch": 0.77, + "grad_norm": 1.8590359594943828, + "learning_rate": 1.2935460194933064e-06, + "loss": 0.5247, + "step": 6726 + }, + { + "epoch": 0.77, + "grad_norm": 3.357576698208126, + "learning_rate": 1.2922972614760548e-06, + "loss": 0.5521, + "step": 6727 + }, + { + "epoch": 0.77, + "grad_norm": 2.2039762027947756, + "learning_rate": 1.2910490170491424e-06, + "loss": 0.4539, + "step": 6728 + }, + { + "epoch": 0.77, + "grad_norm": 1.6844148982337923, + "learning_rate": 1.2898012863854797e-06, + "loss": 0.4719, + "step": 6729 + }, + { + "epoch": 0.77, + "grad_norm": 2.9000770987316664, + "learning_rate": 1.2885540696578985e-06, + "loss": 0.4771, + "step": 6730 + }, + { + "epoch": 0.77, + "grad_norm": 2.3177356337333626, + "learning_rate": 1.287307367039166e-06, + "loss": 0.4729, + "step": 6731 + }, + { + "epoch": 0.77, + "grad_norm": 6.475186446131532, + "learning_rate": 1.2860611787019733e-06, + "loss": 0.5234, + "step": 6732 + }, + { + "epoch": 0.77, + "grad_norm": 2.49160385672363, + "learning_rate": 1.2848155048189437e-06, + "loss": 0.5278, + "step": 6733 + }, + { + "epoch": 0.77, + "grad_norm": 2.18391066145214, + "learning_rate": 1.2835703455626253e-06, + "loss": 0.4879, + "step": 6734 + }, + { + "epoch": 0.77, + "grad_norm": 1.7488137517806368, + "learning_rate": 1.2823257011055006e-06, + "loss": 0.4936, + "step": 6735 + }, + { + "epoch": 0.77, + "grad_norm": 1.976012220543827, + "learning_rate": 1.2810815716199748e-06, + "loss": 0.4757, + "step": 6736 + }, + { + "epoch": 0.77, + "grad_norm": 1.8172464920094802, + "learning_rate": 1.2798379572783881e-06, + "loss": 0.5045, + "step": 6737 + }, + { + "epoch": 0.77, + "grad_norm": 1.9450642090119168, + "learning_rate": 1.278594858253e-06, + "loss": 0.4171, + "step": 6738 + }, + { + "epoch": 0.77, + "grad_norm": 1.6726112677378306, + "learning_rate": 1.2773522747160094e-06, + "loss": 0.4592, + "step": 6739 + }, + { + "epoch": 0.77, + "grad_norm": 3.8267315663769557, + "learning_rate": 1.2761102068395353e-06, + "loss": 0.4235, + "step": 6740 + }, + { + "epoch": 0.77, + "grad_norm": 1.955101897526041, + "learning_rate": 1.2748686547956319e-06, + "loss": 0.5214, + "step": 6741 + }, + { + "epoch": 0.77, + "grad_norm": 1.7940065315901625, + "learning_rate": 1.2736276187562757e-06, + "loss": 0.4251, + "step": 6742 + }, + { + "epoch": 0.77, + "grad_norm": 4.267550615914824, + "learning_rate": 1.2723870988933778e-06, + "loss": 0.5193, + "step": 6743 + }, + { + "epoch": 0.77, + "grad_norm": 1.878695188811749, + "learning_rate": 1.271147095378772e-06, + "loss": 0.4575, + "step": 6744 + }, + { + "epoch": 0.78, + "grad_norm": 1.8649535172535148, + "learning_rate": 1.269907608384226e-06, + "loss": 0.4974, + "step": 6745 + }, + { + "epoch": 0.78, + "grad_norm": 2.9159280970011907, + "learning_rate": 1.2686686380814305e-06, + "loss": 0.5062, + "step": 6746 + }, + { + "epoch": 0.78, + "grad_norm": 2.133537628417536, + "learning_rate": 1.2674301846420107e-06, + "loss": 0.5272, + "step": 6747 + }, + { + "epoch": 0.78, + "grad_norm": 2.5401004355469747, + "learning_rate": 1.266192248237515e-06, + "loss": 0.4476, + "step": 6748 + }, + { + "epoch": 0.78, + "grad_norm": 2.9157370839975103, + "learning_rate": 1.2649548290394208e-06, + "loss": 0.4047, + "step": 6749 + }, + { + "epoch": 0.78, + "grad_norm": 3.247453756391527, + "learning_rate": 1.2637179272191386e-06, + "loss": 0.5311, + "step": 6750 + }, + { + "epoch": 0.78, + "grad_norm": 2.1663503576833767, + "learning_rate": 1.2624815429480003e-06, + "loss": 0.547, + "step": 6751 + }, + { + "epoch": 0.78, + "grad_norm": 2.5433187204903076, + "learning_rate": 1.2612456763972724e-06, + "loss": 0.504, + "step": 6752 + }, + { + "epoch": 0.78, + "grad_norm": 1.9063523641437436, + "learning_rate": 1.2600103277381448e-06, + "loss": 0.523, + "step": 6753 + }, + { + "epoch": 0.78, + "grad_norm": 1.9030750621005672, + "learning_rate": 1.2587754971417421e-06, + "loss": 0.5281, + "step": 6754 + }, + { + "epoch": 0.78, + "grad_norm": 0.7819758777386262, + "learning_rate": 1.257541184779106e-06, + "loss": 0.6673, + "step": 6755 + }, + { + "epoch": 0.78, + "grad_norm": 6.668046341226919, + "learning_rate": 1.2563073908212182e-06, + "loss": 0.4839, + "step": 6756 + }, + { + "epoch": 0.78, + "grad_norm": 2.627001906787616, + "learning_rate": 1.2550741154389813e-06, + "loss": 0.5136, + "step": 6757 + }, + { + "epoch": 0.78, + "grad_norm": 1.802223119575306, + "learning_rate": 1.25384135880323e-06, + "loss": 0.5239, + "step": 6758 + }, + { + "epoch": 0.78, + "grad_norm": 2.2767557894425052, + "learning_rate": 1.252609121084724e-06, + "loss": 0.4705, + "step": 6759 + }, + { + "epoch": 0.78, + "grad_norm": 2.263001748897031, + "learning_rate": 1.2513774024541547e-06, + "loss": 0.5737, + "step": 6760 + }, + { + "epoch": 0.78, + "grad_norm": 1.990291190075724, + "learning_rate": 1.2501462030821365e-06, + "loss": 0.5995, + "step": 6761 + }, + { + "epoch": 0.78, + "grad_norm": 2.255397059676647, + "learning_rate": 1.2489155231392187e-06, + "loss": 0.4798, + "step": 6762 + }, + { + "epoch": 0.78, + "grad_norm": 2.3781526185115225, + "learning_rate": 1.2476853627958713e-06, + "loss": 0.4888, + "step": 6763 + }, + { + "epoch": 0.78, + "grad_norm": 2.310530150693741, + "learning_rate": 1.2464557222224994e-06, + "loss": 0.5195, + "step": 6764 + }, + { + "epoch": 0.78, + "grad_norm": 5.014776980740316, + "learning_rate": 1.2452266015894288e-06, + "loss": 0.4814, + "step": 6765 + }, + { + "epoch": 0.78, + "grad_norm": 2.058618529572185, + "learning_rate": 1.2439980010669206e-06, + "loss": 0.4204, + "step": 6766 + }, + { + "epoch": 0.78, + "grad_norm": 2.2861074665934713, + "learning_rate": 1.2427699208251587e-06, + "loss": 0.5205, + "step": 6767 + }, + { + "epoch": 0.78, + "grad_norm": 2.2437829666207953, + "learning_rate": 1.2415423610342548e-06, + "loss": 0.5345, + "step": 6768 + }, + { + "epoch": 0.78, + "grad_norm": 0.7977842659437432, + "learning_rate": 1.2403153218642538e-06, + "loss": 0.6579, + "step": 6769 + }, + { + "epoch": 0.78, + "grad_norm": 2.47932444235642, + "learning_rate": 1.2390888034851223e-06, + "loss": 0.5622, + "step": 6770 + }, + { + "epoch": 0.78, + "grad_norm": 2.2955424000696927, + "learning_rate": 1.2378628060667591e-06, + "loss": 0.5141, + "step": 6771 + }, + { + "epoch": 0.78, + "grad_norm": 2.6490586482640284, + "learning_rate": 1.2366373297789886e-06, + "loss": 0.4911, + "step": 6772 + }, + { + "epoch": 0.78, + "grad_norm": 2.196992848784864, + "learning_rate": 1.2354123747915631e-06, + "loss": 0.4335, + "step": 6773 + }, + { + "epoch": 0.78, + "grad_norm": 1.842891679313481, + "learning_rate": 1.2341879412741625e-06, + "loss": 0.432, + "step": 6774 + }, + { + "epoch": 0.78, + "grad_norm": 2.309189463218756, + "learning_rate": 1.2329640293963968e-06, + "loss": 0.4566, + "step": 6775 + }, + { + "epoch": 0.78, + "grad_norm": 2.478285387005094, + "learning_rate": 1.2317406393278004e-06, + "loss": 0.5637, + "step": 6776 + }, + { + "epoch": 0.78, + "grad_norm": 1.7932254696937506, + "learning_rate": 1.2305177712378391e-06, + "loss": 0.5194, + "step": 6777 + }, + { + "epoch": 0.78, + "grad_norm": 2.120509430267194, + "learning_rate": 1.2292954252959017e-06, + "loss": 0.5533, + "step": 6778 + }, + { + "epoch": 0.78, + "grad_norm": 2.1515692253505447, + "learning_rate": 1.2280736016713107e-06, + "loss": 0.4646, + "step": 6779 + }, + { + "epoch": 0.78, + "grad_norm": 2.224594169473721, + "learning_rate": 1.2268523005333093e-06, + "loss": 0.5846, + "step": 6780 + }, + { + "epoch": 0.78, + "grad_norm": 2.0420015161311666, + "learning_rate": 1.225631522051075e-06, + "loss": 0.5074, + "step": 6781 + }, + { + "epoch": 0.78, + "grad_norm": 2.6798174356296443, + "learning_rate": 1.2244112663937073e-06, + "loss": 0.5912, + "step": 6782 + }, + { + "epoch": 0.78, + "grad_norm": 3.356405234873417, + "learning_rate": 1.2231915337302386e-06, + "loss": 0.5078, + "step": 6783 + }, + { + "epoch": 0.78, + "grad_norm": 2.1991395936904867, + "learning_rate": 1.2219723242296233e-06, + "loss": 0.4549, + "step": 6784 + }, + { + "epoch": 0.78, + "grad_norm": 5.158126118939817, + "learning_rate": 1.2207536380607481e-06, + "loss": 0.4861, + "step": 6785 + }, + { + "epoch": 0.78, + "grad_norm": 2.083604177977479, + "learning_rate": 1.2195354753924248e-06, + "loss": 0.4929, + "step": 6786 + }, + { + "epoch": 0.78, + "grad_norm": 2.1246398835578613, + "learning_rate": 1.2183178363933923e-06, + "loss": 0.3935, + "step": 6787 + }, + { + "epoch": 0.78, + "grad_norm": 3.45148569611535, + "learning_rate": 1.2171007212323171e-06, + "loss": 0.4775, + "step": 6788 + }, + { + "epoch": 0.78, + "grad_norm": 2.638307679309626, + "learning_rate": 1.2158841300777956e-06, + "loss": 0.3952, + "step": 6789 + }, + { + "epoch": 0.78, + "grad_norm": 1.9360154899302933, + "learning_rate": 1.2146680630983482e-06, + "loss": 0.4473, + "step": 6790 + }, + { + "epoch": 0.78, + "grad_norm": 2.148329940155619, + "learning_rate": 1.2134525204624265e-06, + "loss": 0.4843, + "step": 6791 + }, + { + "epoch": 0.78, + "grad_norm": 2.638775584220251, + "learning_rate": 1.2122375023384052e-06, + "loss": 0.4817, + "step": 6792 + }, + { + "epoch": 0.78, + "grad_norm": 2.4414395286657804, + "learning_rate": 1.2110230088945879e-06, + "loss": 0.5892, + "step": 6793 + }, + { + "epoch": 0.78, + "grad_norm": 2.4227825154114564, + "learning_rate": 1.2098090402992085e-06, + "loss": 0.5097, + "step": 6794 + }, + { + "epoch": 0.78, + "grad_norm": 0.8358335621606696, + "learning_rate": 1.2085955967204232e-06, + "loss": 0.6773, + "step": 6795 + }, + { + "epoch": 0.78, + "grad_norm": 1.7419927315017967, + "learning_rate": 1.2073826783263204e-06, + "loss": 0.5391, + "step": 6796 + }, + { + "epoch": 0.78, + "grad_norm": 2.282668667576598, + "learning_rate": 1.2061702852849106e-06, + "loss": 0.3795, + "step": 6797 + }, + { + "epoch": 0.78, + "grad_norm": 4.2122987091039255, + "learning_rate": 1.2049584177641372e-06, + "loss": 0.5424, + "step": 6798 + }, + { + "epoch": 0.78, + "grad_norm": 2.4038558945441806, + "learning_rate": 1.2037470759318648e-06, + "loss": 0.4995, + "step": 6799 + }, + { + "epoch": 0.78, + "grad_norm": 2.2346334215149954, + "learning_rate": 1.2025362599558916e-06, + "loss": 0.4915, + "step": 6800 + }, + { + "epoch": 0.78, + "grad_norm": 2.310228137016606, + "learning_rate": 1.2013259700039364e-06, + "loss": 0.5251, + "step": 6801 + }, + { + "epoch": 0.78, + "grad_norm": 1.8931384870779213, + "learning_rate": 1.2001162062436517e-06, + "loss": 0.3893, + "step": 6802 + }, + { + "epoch": 0.78, + "grad_norm": 2.320937755375561, + "learning_rate": 1.1989069688426103e-06, + "loss": 0.4368, + "step": 6803 + }, + { + "epoch": 0.78, + "grad_norm": 1.8312779568191377, + "learning_rate": 1.19769825796832e-06, + "loss": 0.4747, + "step": 6804 + }, + { + "epoch": 0.78, + "grad_norm": 2.4786233530509816, + "learning_rate": 1.1964900737882057e-06, + "loss": 0.5685, + "step": 6805 + }, + { + "epoch": 0.78, + "grad_norm": 3.761824017245682, + "learning_rate": 1.1952824164696287e-06, + "loss": 0.5017, + "step": 6806 + }, + { + "epoch": 0.78, + "grad_norm": 1.9162395943008483, + "learning_rate": 1.1940752861798711e-06, + "loss": 0.4936, + "step": 6807 + }, + { + "epoch": 0.78, + "grad_norm": 2.3902552252150273, + "learning_rate": 1.192868683086147e-06, + "loss": 0.4478, + "step": 6808 + }, + { + "epoch": 0.78, + "grad_norm": 2.187967322113359, + "learning_rate": 1.1916626073555926e-06, + "loss": 0.5396, + "step": 6809 + }, + { + "epoch": 0.78, + "grad_norm": 2.0656061700738793, + "learning_rate": 1.190457059155275e-06, + "loss": 0.5102, + "step": 6810 + }, + { + "epoch": 0.78, + "grad_norm": 2.7194725515543636, + "learning_rate": 1.1892520386521843e-06, + "loss": 0.4628, + "step": 6811 + }, + { + "epoch": 0.78, + "grad_norm": 2.9011110535247093, + "learning_rate": 1.188047546013243e-06, + "loss": 0.4164, + "step": 6812 + }, + { + "epoch": 0.78, + "grad_norm": 2.485173749848713, + "learning_rate": 1.1868435814052953e-06, + "loss": 0.454, + "step": 6813 + }, + { + "epoch": 0.78, + "grad_norm": 1.962450487849109, + "learning_rate": 1.1856401449951128e-06, + "loss": 0.4923, + "step": 6814 + }, + { + "epoch": 0.78, + "grad_norm": 4.422970149099892, + "learning_rate": 1.1844372369493977e-06, + "loss": 0.4826, + "step": 6815 + }, + { + "epoch": 0.78, + "grad_norm": 1.875345394226263, + "learning_rate": 1.1832348574347747e-06, + "loss": 0.5334, + "step": 6816 + }, + { + "epoch": 0.78, + "grad_norm": 1.746154329079848, + "learning_rate": 1.1820330066178e-06, + "loss": 0.5129, + "step": 6817 + }, + { + "epoch": 0.78, + "grad_norm": 2.2608347470323045, + "learning_rate": 1.1808316846649498e-06, + "loss": 0.4999, + "step": 6818 + }, + { + "epoch": 0.78, + "grad_norm": 2.1486211474879475, + "learning_rate": 1.1796308917426347e-06, + "loss": 0.497, + "step": 6819 + }, + { + "epoch": 0.78, + "grad_norm": 2.80592640680791, + "learning_rate": 1.1784306280171854e-06, + "loss": 0.478, + "step": 6820 + }, + { + "epoch": 0.78, + "grad_norm": 3.6989225133514743, + "learning_rate": 1.1772308936548664e-06, + "loss": 0.4468, + "step": 6821 + }, + { + "epoch": 0.78, + "grad_norm": 2.856001956477636, + "learning_rate": 1.1760316888218594e-06, + "loss": 0.475, + "step": 6822 + }, + { + "epoch": 0.78, + "grad_norm": 1.8653365562727315, + "learning_rate": 1.1748330136842817e-06, + "loss": 0.4231, + "step": 6823 + }, + { + "epoch": 0.78, + "grad_norm": 1.9782036404673067, + "learning_rate": 1.1736348684081705e-06, + "loss": 0.3855, + "step": 6824 + }, + { + "epoch": 0.78, + "grad_norm": 2.2989993737188033, + "learning_rate": 1.1724372531594969e-06, + "loss": 0.5216, + "step": 6825 + }, + { + "epoch": 0.78, + "grad_norm": 2.027458946354153, + "learning_rate": 1.1712401681041502e-06, + "loss": 0.444, + "step": 6826 + }, + { + "epoch": 0.78, + "grad_norm": 2.337849639508019, + "learning_rate": 1.1700436134079535e-06, + "loss": 0.5159, + "step": 6827 + }, + { + "epoch": 0.78, + "grad_norm": 2.3860164571926723, + "learning_rate": 1.1688475892366507e-06, + "loss": 0.5347, + "step": 6828 + }, + { + "epoch": 0.78, + "grad_norm": 2.379605159909446, + "learning_rate": 1.1676520957559179e-06, + "loss": 0.5115, + "step": 6829 + }, + { + "epoch": 0.78, + "grad_norm": 2.270479711868883, + "learning_rate": 1.1664571331313513e-06, + "loss": 0.4693, + "step": 6830 + }, + { + "epoch": 0.78, + "grad_norm": 2.8932128764483234, + "learning_rate": 1.1652627015284806e-06, + "loss": 0.5513, + "step": 6831 + }, + { + "epoch": 0.79, + "grad_norm": 2.1853336048844705, + "learning_rate": 1.1640688011127553e-06, + "loss": 0.5341, + "step": 6832 + }, + { + "epoch": 0.79, + "grad_norm": 2.333743872207169, + "learning_rate": 1.162875432049555e-06, + "loss": 0.5411, + "step": 6833 + }, + { + "epoch": 0.79, + "grad_norm": 1.756120974849827, + "learning_rate": 1.161682594504186e-06, + "loss": 0.5759, + "step": 6834 + }, + { + "epoch": 0.79, + "grad_norm": 2.1276041720392986, + "learning_rate": 1.1604902886418778e-06, + "loss": 0.3545, + "step": 6835 + }, + { + "epoch": 0.79, + "grad_norm": 1.9222165337346127, + "learning_rate": 1.1592985146277914e-06, + "loss": 0.5175, + "step": 6836 + }, + { + "epoch": 0.79, + "grad_norm": 2.147927014709979, + "learning_rate": 1.1581072726270082e-06, + "loss": 0.6192, + "step": 6837 + }, + { + "epoch": 0.79, + "grad_norm": 2.081538766656623, + "learning_rate": 1.1569165628045426e-06, + "loss": 0.3708, + "step": 6838 + }, + { + "epoch": 0.79, + "grad_norm": 2.3135169599905336, + "learning_rate": 1.1557263853253264e-06, + "loss": 0.4965, + "step": 6839 + }, + { + "epoch": 0.79, + "grad_norm": 8.960732314892969, + "learning_rate": 1.1545367403542273e-06, + "loss": 0.5104, + "step": 6840 + }, + { + "epoch": 0.79, + "grad_norm": 3.690353255836699, + "learning_rate": 1.1533476280560308e-06, + "loss": 0.5303, + "step": 6841 + }, + { + "epoch": 0.79, + "grad_norm": 1.7897476631418416, + "learning_rate": 1.1521590485954564e-06, + "loss": 0.5265, + "step": 6842 + }, + { + "epoch": 0.79, + "grad_norm": 2.027332174096245, + "learning_rate": 1.1509710021371428e-06, + "loss": 0.4271, + "step": 6843 + }, + { + "epoch": 0.79, + "grad_norm": 2.045655772207243, + "learning_rate": 1.1497834888456606e-06, + "loss": 0.4733, + "step": 6844 + }, + { + "epoch": 0.79, + "grad_norm": 1.7947768129361377, + "learning_rate": 1.148596508885501e-06, + "loss": 0.4679, + "step": 6845 + }, + { + "epoch": 0.79, + "grad_norm": 2.1073656403914875, + "learning_rate": 1.1474100624210877e-06, + "loss": 0.3981, + "step": 6846 + }, + { + "epoch": 0.79, + "grad_norm": 1.920288539829552, + "learning_rate": 1.1462241496167636e-06, + "loss": 0.4917, + "step": 6847 + }, + { + "epoch": 0.79, + "grad_norm": 2.145477458701994, + "learning_rate": 1.1450387706368043e-06, + "loss": 0.4516, + "step": 6848 + }, + { + "epoch": 0.79, + "grad_norm": 0.819790441987797, + "learning_rate": 1.143853925645405e-06, + "loss": 0.7107, + "step": 6849 + }, + { + "epoch": 0.79, + "grad_norm": 1.8391440810504196, + "learning_rate": 1.142669614806694e-06, + "loss": 0.5009, + "step": 6850 + }, + { + "epoch": 0.79, + "grad_norm": 1.7812400360517238, + "learning_rate": 1.1414858382847198e-06, + "loss": 0.4872, + "step": 6851 + }, + { + "epoch": 0.79, + "grad_norm": 1.7902226741195288, + "learning_rate": 1.1403025962434572e-06, + "loss": 0.4706, + "step": 6852 + }, + { + "epoch": 0.79, + "grad_norm": 2.0661529522006967, + "learning_rate": 1.1391198888468118e-06, + "loss": 0.4459, + "step": 6853 + }, + { + "epoch": 0.79, + "grad_norm": 1.8511199811439056, + "learning_rate": 1.1379377162586097e-06, + "loss": 0.4471, + "step": 6854 + }, + { + "epoch": 0.79, + "grad_norm": 1.722957225090292, + "learning_rate": 1.1367560786426075e-06, + "loss": 0.4583, + "step": 6855 + }, + { + "epoch": 0.79, + "grad_norm": 2.4733571939489982, + "learning_rate": 1.135574976162484e-06, + "loss": 0.4524, + "step": 6856 + }, + { + "epoch": 0.79, + "grad_norm": 1.911453829308716, + "learning_rate": 1.134394408981846e-06, + "loss": 0.4235, + "step": 6857 + }, + { + "epoch": 0.79, + "grad_norm": 2.3720338080075316, + "learning_rate": 1.133214377264223e-06, + "loss": 0.4963, + "step": 6858 + }, + { + "epoch": 0.79, + "grad_norm": 2.133757253736453, + "learning_rate": 1.1320348811730759e-06, + "loss": 0.4636, + "step": 6859 + }, + { + "epoch": 0.79, + "grad_norm": 2.663119311581652, + "learning_rate": 1.1308559208717862e-06, + "loss": 0.5076, + "step": 6860 + }, + { + "epoch": 0.79, + "grad_norm": 2.020318347017507, + "learning_rate": 1.129677496523665e-06, + "loss": 0.5991, + "step": 6861 + }, + { + "epoch": 0.79, + "grad_norm": 2.0212951357422853, + "learning_rate": 1.128499608291946e-06, + "loss": 0.498, + "step": 6862 + }, + { + "epoch": 0.79, + "grad_norm": 2.527734612746698, + "learning_rate": 1.1273222563397911e-06, + "loss": 0.5276, + "step": 6863 + }, + { + "epoch": 0.79, + "grad_norm": 2.710120500290196, + "learning_rate": 1.1261454408302858e-06, + "loss": 0.4525, + "step": 6864 + }, + { + "epoch": 0.79, + "grad_norm": 1.6499088438348073, + "learning_rate": 1.1249691619264447e-06, + "loss": 0.4487, + "step": 6865 + }, + { + "epoch": 0.79, + "grad_norm": 2.4159326361833844, + "learning_rate": 1.1237934197912021e-06, + "loss": 0.4697, + "step": 6866 + }, + { + "epoch": 0.79, + "grad_norm": 3.1854973490643403, + "learning_rate": 1.1226182145874255e-06, + "loss": 0.4908, + "step": 6867 + }, + { + "epoch": 0.79, + "grad_norm": 2.1868808233124906, + "learning_rate": 1.1214435464779006e-06, + "loss": 0.4471, + "step": 6868 + }, + { + "epoch": 0.79, + "grad_norm": 2.5998775298666703, + "learning_rate": 1.1202694156253452e-06, + "loss": 0.4518, + "step": 6869 + }, + { + "epoch": 0.79, + "grad_norm": 2.797325286171102, + "learning_rate": 1.119095822192397e-06, + "loss": 0.484, + "step": 6870 + }, + { + "epoch": 0.79, + "grad_norm": 2.5402210327121946, + "learning_rate": 1.1179227663416248e-06, + "loss": 0.4556, + "step": 6871 + }, + { + "epoch": 0.79, + "grad_norm": 2.156653301030024, + "learning_rate": 1.1167502482355186e-06, + "loss": 0.5674, + "step": 6872 + }, + { + "epoch": 0.79, + "grad_norm": 1.9492662540895498, + "learning_rate": 1.1155782680364952e-06, + "loss": 0.5253, + "step": 6873 + }, + { + "epoch": 0.79, + "grad_norm": 1.9536336601098874, + "learning_rate": 1.1144068259068957e-06, + "loss": 0.4616, + "step": 6874 + }, + { + "epoch": 0.79, + "grad_norm": 2.357512342482728, + "learning_rate": 1.1132359220089917e-06, + "loss": 0.4609, + "step": 6875 + }, + { + "epoch": 0.79, + "grad_norm": 1.6604133194759714, + "learning_rate": 1.1120655565049726e-06, + "loss": 0.4053, + "step": 6876 + }, + { + "epoch": 0.79, + "grad_norm": 2.534929117448477, + "learning_rate": 1.1108957295569611e-06, + "loss": 0.3737, + "step": 6877 + }, + { + "epoch": 0.79, + "grad_norm": 2.1165745640875997, + "learning_rate": 1.1097264413269992e-06, + "loss": 0.4306, + "step": 6878 + }, + { + "epoch": 0.79, + "grad_norm": 2.7134970914880534, + "learning_rate": 1.1085576919770557e-06, + "loss": 0.4518, + "step": 6879 + }, + { + "epoch": 0.79, + "grad_norm": 1.9628344112929053, + "learning_rate": 1.1073894816690277e-06, + "loss": 0.4285, + "step": 6880 + }, + { + "epoch": 0.79, + "grad_norm": 1.8714986414464168, + "learning_rate": 1.1062218105647338e-06, + "loss": 0.4372, + "step": 6881 + }, + { + "epoch": 0.79, + "grad_norm": 1.924438013178842, + "learning_rate": 1.1050546788259208e-06, + "loss": 0.481, + "step": 6882 + }, + { + "epoch": 0.79, + "grad_norm": 1.980846120156529, + "learning_rate": 1.1038880866142582e-06, + "loss": 0.5062, + "step": 6883 + }, + { + "epoch": 0.79, + "grad_norm": 2.026980720851859, + "learning_rate": 1.1027220340913448e-06, + "loss": 0.4165, + "step": 6884 + }, + { + "epoch": 0.79, + "grad_norm": 1.7940762222690434, + "learning_rate": 1.101556521418698e-06, + "loss": 0.4906, + "step": 6885 + }, + { + "epoch": 0.79, + "grad_norm": 2.1970407044966542, + "learning_rate": 1.1003915487577683e-06, + "loss": 0.4923, + "step": 6886 + }, + { + "epoch": 0.79, + "grad_norm": 1.981984242282057, + "learning_rate": 1.099227116269924e-06, + "loss": 0.5154, + "step": 6887 + }, + { + "epoch": 0.79, + "grad_norm": 2.8357228272358075, + "learning_rate": 1.0980632241164663e-06, + "loss": 0.495, + "step": 6888 + }, + { + "epoch": 0.79, + "grad_norm": 1.928662829113934, + "learning_rate": 1.0968998724586143e-06, + "loss": 0.4377, + "step": 6889 + }, + { + "epoch": 0.79, + "grad_norm": 1.8823113009848835, + "learning_rate": 1.0957370614575158e-06, + "loss": 0.5572, + "step": 6890 + }, + { + "epoch": 0.79, + "grad_norm": 2.4705444252994853, + "learning_rate": 1.0945747912742428e-06, + "loss": 0.527, + "step": 6891 + }, + { + "epoch": 0.79, + "grad_norm": 1.899860569424478, + "learning_rate": 1.0934130620697935e-06, + "loss": 0.4854, + "step": 6892 + }, + { + "epoch": 0.79, + "grad_norm": 2.6260547508544922, + "learning_rate": 1.0922518740050896e-06, + "loss": 0.464, + "step": 6893 + }, + { + "epoch": 0.79, + "grad_norm": 2.3701246864996928, + "learning_rate": 1.09109122724098e-06, + "loss": 0.5483, + "step": 6894 + }, + { + "epoch": 0.79, + "grad_norm": 2.026589501504713, + "learning_rate": 1.0899311219382358e-06, + "loss": 0.4764, + "step": 6895 + }, + { + "epoch": 0.79, + "grad_norm": 1.8643949362639638, + "learning_rate": 1.0887715582575565e-06, + "loss": 0.4558, + "step": 6896 + }, + { + "epoch": 0.79, + "grad_norm": 2.466005920746009, + "learning_rate": 1.0876125363595635e-06, + "loss": 0.5115, + "step": 6897 + }, + { + "epoch": 0.79, + "grad_norm": 2.4048158990542214, + "learning_rate": 1.086454056404803e-06, + "loss": 0.4889, + "step": 6898 + }, + { + "epoch": 0.79, + "grad_norm": 2.0050197628860555, + "learning_rate": 1.0852961185537502e-06, + "loss": 0.5716, + "step": 6899 + }, + { + "epoch": 0.79, + "grad_norm": 2.157050295518263, + "learning_rate": 1.0841387229667994e-06, + "loss": 0.3877, + "step": 6900 + }, + { + "epoch": 0.79, + "grad_norm": 1.7425829884828348, + "learning_rate": 1.082981869804276e-06, + "loss": 0.4453, + "step": 6901 + }, + { + "epoch": 0.79, + "grad_norm": 3.1438184694566167, + "learning_rate": 1.0818255592264242e-06, + "loss": 0.4935, + "step": 6902 + }, + { + "epoch": 0.79, + "grad_norm": 1.6676341778685355, + "learning_rate": 1.0806697913934183e-06, + "loss": 0.4217, + "step": 6903 + }, + { + "epoch": 0.79, + "grad_norm": 1.848714942702512, + "learning_rate": 1.0795145664653534e-06, + "loss": 0.4233, + "step": 6904 + }, + { + "epoch": 0.79, + "grad_norm": 1.8788119623056538, + "learning_rate": 1.0783598846022526e-06, + "loss": 0.5946, + "step": 6905 + }, + { + "epoch": 0.79, + "grad_norm": 54.542465475210705, + "learning_rate": 1.0772057459640612e-06, + "loss": 0.4921, + "step": 6906 + }, + { + "epoch": 0.79, + "grad_norm": 1.7726760480017982, + "learning_rate": 1.076052150710651e-06, + "loss": 0.545, + "step": 6907 + }, + { + "epoch": 0.79, + "grad_norm": 2.1422865062916396, + "learning_rate": 1.0748990990018149e-06, + "loss": 0.5227, + "step": 6908 + }, + { + "epoch": 0.79, + "grad_norm": 2.0815910853850275, + "learning_rate": 1.0737465909972778e-06, + "loss": 0.5556, + "step": 6909 + }, + { + "epoch": 0.79, + "grad_norm": 2.346965877095207, + "learning_rate": 1.0725946268566812e-06, + "loss": 0.4844, + "step": 6910 + }, + { + "epoch": 0.79, + "grad_norm": 2.1619557154686233, + "learning_rate": 1.0714432067395985e-06, + "loss": 0.4785, + "step": 6911 + }, + { + "epoch": 0.79, + "grad_norm": 1.8547340764421911, + "learning_rate": 1.0702923308055208e-06, + "loss": 0.498, + "step": 6912 + }, + { + "epoch": 0.79, + "grad_norm": 2.8896719820628562, + "learning_rate": 1.0691419992138697e-06, + "loss": 0.5503, + "step": 6913 + }, + { + "epoch": 0.79, + "grad_norm": 1.8106425014328813, + "learning_rate": 1.067992212123987e-06, + "loss": 0.4525, + "step": 6914 + }, + { + "epoch": 0.79, + "grad_norm": 2.426918064900197, + "learning_rate": 1.0668429696951432e-06, + "loss": 0.5103, + "step": 6915 + }, + { + "epoch": 0.79, + "grad_norm": 2.180739630841152, + "learning_rate": 1.0656942720865303e-06, + "loss": 0.3896, + "step": 6916 + }, + { + "epoch": 0.79, + "grad_norm": 2.7871294050976116, + "learning_rate": 1.0645461194572642e-06, + "loss": 0.5765, + "step": 6917 + }, + { + "epoch": 0.79, + "grad_norm": 2.507997146290191, + "learning_rate": 1.0633985119663886e-06, + "loss": 0.4662, + "step": 6918 + }, + { + "epoch": 0.8, + "grad_norm": 0.8084917282912344, + "learning_rate": 1.0622514497728686e-06, + "loss": 0.6919, + "step": 6919 + }, + { + "epoch": 0.8, + "grad_norm": 1.9346300743401115, + "learning_rate": 1.061104933035597e-06, + "loss": 0.5561, + "step": 6920 + }, + { + "epoch": 0.8, + "grad_norm": 2.002055042925914, + "learning_rate": 1.0599589619133865e-06, + "loss": 0.4863, + "step": 6921 + }, + { + "epoch": 0.8, + "grad_norm": 0.8461546089783812, + "learning_rate": 1.0588135365649804e-06, + "loss": 0.7013, + "step": 6922 + }, + { + "epoch": 0.8, + "grad_norm": 3.2195006398791386, + "learning_rate": 1.0576686571490386e-06, + "loss": 0.468, + "step": 6923 + }, + { + "epoch": 0.8, + "grad_norm": 1.8848874804638946, + "learning_rate": 1.0565243238241525e-06, + "loss": 0.392, + "step": 6924 + }, + { + "epoch": 0.8, + "grad_norm": 1.9736907108445945, + "learning_rate": 1.0553805367488324e-06, + "loss": 0.6129, + "step": 6925 + }, + { + "epoch": 0.8, + "grad_norm": 1.9402108565671452, + "learning_rate": 1.0542372960815189e-06, + "loss": 0.4796, + "step": 6926 + }, + { + "epoch": 0.8, + "grad_norm": 2.5723012541646164, + "learning_rate": 1.0530946019805704e-06, + "loss": 0.4899, + "step": 6927 + }, + { + "epoch": 0.8, + "grad_norm": 2.317020650700444, + "learning_rate": 1.0519524546042754e-06, + "loss": 0.4573, + "step": 6928 + }, + { + "epoch": 0.8, + "grad_norm": 2.153141278992208, + "learning_rate": 1.0508108541108408e-06, + "loss": 0.4583, + "step": 6929 + }, + { + "epoch": 0.8, + "grad_norm": 1.8824673691468163, + "learning_rate": 1.0496698006584044e-06, + "loss": 0.4798, + "step": 6930 + }, + { + "epoch": 0.8, + "grad_norm": 2.3708191379497645, + "learning_rate": 1.0485292944050213e-06, + "loss": 0.4244, + "step": 6931 + }, + { + "epoch": 0.8, + "grad_norm": 2.505625473071346, + "learning_rate": 1.0473893355086773e-06, + "loss": 0.4772, + "step": 6932 + }, + { + "epoch": 0.8, + "grad_norm": 2.245740870319385, + "learning_rate": 1.0462499241272767e-06, + "loss": 0.5423, + "step": 6933 + }, + { + "epoch": 0.8, + "grad_norm": 2.4594798626658765, + "learning_rate": 1.045111060418653e-06, + "loss": 0.594, + "step": 6934 + }, + { + "epoch": 0.8, + "grad_norm": 2.072492417941293, + "learning_rate": 1.0439727445405596e-06, + "loss": 0.5704, + "step": 6935 + }, + { + "epoch": 0.8, + "grad_norm": 2.1201752812378274, + "learning_rate": 1.0428349766506768e-06, + "loss": 0.4473, + "step": 6936 + }, + { + "epoch": 0.8, + "grad_norm": 2.596679220699931, + "learning_rate": 1.0416977569066084e-06, + "loss": 0.5047, + "step": 6937 + }, + { + "epoch": 0.8, + "grad_norm": 3.016111942319278, + "learning_rate": 1.0405610854658794e-06, + "loss": 0.5053, + "step": 6938 + }, + { + "epoch": 0.8, + "grad_norm": 1.973109727123369, + "learning_rate": 1.0394249624859444e-06, + "loss": 0.4764, + "step": 6939 + }, + { + "epoch": 0.8, + "grad_norm": 2.0665456318696362, + "learning_rate": 1.0382893881241773e-06, + "loss": 0.5783, + "step": 6940 + }, + { + "epoch": 0.8, + "grad_norm": 1.9007858580361008, + "learning_rate": 1.0371543625378772e-06, + "loss": 0.4804, + "step": 6941 + }, + { + "epoch": 0.8, + "grad_norm": 2.3689725617246, + "learning_rate": 1.0360198858842695e-06, + "loss": 0.5323, + "step": 6942 + }, + { + "epoch": 0.8, + "grad_norm": 1.786753858207934, + "learning_rate": 1.0348859583205007e-06, + "loss": 0.4684, + "step": 6943 + }, + { + "epoch": 0.8, + "grad_norm": 2.3795322644822012, + "learning_rate": 1.0337525800036409e-06, + "loss": 0.452, + "step": 6944 + }, + { + "epoch": 0.8, + "grad_norm": 1.887280243484445, + "learning_rate": 1.0326197510906876e-06, + "loss": 0.4558, + "step": 6945 + }, + { + "epoch": 0.8, + "grad_norm": 2.285407813230962, + "learning_rate": 1.031487471738558e-06, + "loss": 0.435, + "step": 6946 + }, + { + "epoch": 0.8, + "grad_norm": 3.5980685336818787, + "learning_rate": 1.0303557421040983e-06, + "loss": 0.4886, + "step": 6947 + }, + { + "epoch": 0.8, + "grad_norm": 1.9678972838336406, + "learning_rate": 1.0292245623440722e-06, + "loss": 0.4215, + "step": 6948 + }, + { + "epoch": 0.8, + "grad_norm": 3.2110063860834526, + "learning_rate": 1.0280939326151729e-06, + "loss": 0.5736, + "step": 6949 + }, + { + "epoch": 0.8, + "grad_norm": 2.990120625443097, + "learning_rate": 1.0269638530740128e-06, + "loss": 0.4683, + "step": 6950 + }, + { + "epoch": 0.8, + "grad_norm": 1.7992209540986273, + "learning_rate": 1.0258343238771334e-06, + "loss": 0.4856, + "step": 6951 + }, + { + "epoch": 0.8, + "grad_norm": 2.049716779880409, + "learning_rate": 1.0247053451809935e-06, + "loss": 0.4853, + "step": 6952 + }, + { + "epoch": 0.8, + "grad_norm": 1.930729803373553, + "learning_rate": 1.0235769171419818e-06, + "loss": 0.4806, + "step": 6953 + }, + { + "epoch": 0.8, + "grad_norm": 5.161785309286897, + "learning_rate": 1.0224490399164056e-06, + "loss": 0.4474, + "step": 6954 + }, + { + "epoch": 0.8, + "grad_norm": 2.3264028387010347, + "learning_rate": 1.0213217136605008e-06, + "loss": 0.5402, + "step": 6955 + }, + { + "epoch": 0.8, + "grad_norm": 2.1014553382553105, + "learning_rate": 1.0201949385304233e-06, + "loss": 0.5393, + "step": 6956 + }, + { + "epoch": 0.8, + "grad_norm": 2.6406130800531504, + "learning_rate": 1.0190687146822536e-06, + "loss": 0.4486, + "step": 6957 + }, + { + "epoch": 0.8, + "grad_norm": 2.4837414601393397, + "learning_rate": 1.0179430422719944e-06, + "loss": 0.5196, + "step": 6958 + }, + { + "epoch": 0.8, + "grad_norm": 2.2713310772466926, + "learning_rate": 1.0168179214555767e-06, + "loss": 0.5546, + "step": 6959 + }, + { + "epoch": 0.8, + "grad_norm": 4.139404299533773, + "learning_rate": 1.015693352388849e-06, + "loss": 0.5079, + "step": 6960 + }, + { + "epoch": 0.8, + "grad_norm": 2.3206529771845754, + "learning_rate": 1.0145693352275897e-06, + "loss": 0.5122, + "step": 6961 + }, + { + "epoch": 0.8, + "grad_norm": 8.141156822566217, + "learning_rate": 1.0134458701274957e-06, + "loss": 0.431, + "step": 6962 + }, + { + "epoch": 0.8, + "grad_norm": 11.543361889658675, + "learning_rate": 1.0123229572441884e-06, + "loss": 0.4217, + "step": 6963 + }, + { + "epoch": 0.8, + "grad_norm": 1.810841062441158, + "learning_rate": 1.011200596733215e-06, + "loss": 0.4445, + "step": 6964 + }, + { + "epoch": 0.8, + "grad_norm": 1.6999187249403787, + "learning_rate": 1.010078788750043e-06, + "loss": 0.5488, + "step": 6965 + }, + { + "epoch": 0.8, + "grad_norm": 2.4450110206668136, + "learning_rate": 1.0089575334500674e-06, + "loss": 0.5814, + "step": 6966 + }, + { + "epoch": 0.8, + "grad_norm": 2.3807295950666747, + "learning_rate": 1.0078368309886017e-06, + "loss": 0.4198, + "step": 6967 + }, + { + "epoch": 0.8, + "grad_norm": 1.9905215404513177, + "learning_rate": 1.0067166815208885e-06, + "loss": 0.4378, + "step": 6968 + }, + { + "epoch": 0.8, + "grad_norm": 1.978963913982134, + "learning_rate": 1.0055970852020869e-06, + "loss": 0.5111, + "step": 6969 + }, + { + "epoch": 0.8, + "grad_norm": 2.5712813181336114, + "learning_rate": 1.0044780421872869e-06, + "loss": 0.5221, + "step": 6970 + }, + { + "epoch": 0.8, + "grad_norm": 2.4163569095718094, + "learning_rate": 1.0033595526314948e-06, + "loss": 0.497, + "step": 6971 + }, + { + "epoch": 0.8, + "grad_norm": 2.441038785615688, + "learning_rate": 1.0022416166896471e-06, + "loss": 0.5116, + "step": 6972 + }, + { + "epoch": 0.8, + "grad_norm": 3.230760761112982, + "learning_rate": 1.0011242345165978e-06, + "loss": 0.4375, + "step": 6973 + }, + { + "epoch": 0.8, + "grad_norm": 2.571087343361732, + "learning_rate": 1.0000074062671266e-06, + "loss": 0.4441, + "step": 6974 + }, + { + "epoch": 0.8, + "grad_norm": 2.2293860604637015, + "learning_rate": 9.988911320959361e-07, + "loss": 0.442, + "step": 6975 + }, + { + "epoch": 0.8, + "grad_norm": 1.8853905051124316, + "learning_rate": 9.977754121576538e-07, + "loss": 0.5029, + "step": 6976 + }, + { + "epoch": 0.8, + "grad_norm": 1.9576030434705964, + "learning_rate": 9.96660246606827e-07, + "loss": 0.5137, + "step": 6977 + }, + { + "epoch": 0.8, + "grad_norm": 1.8489505181429395, + "learning_rate": 9.95545635597931e-07, + "loss": 0.5345, + "step": 6978 + }, + { + "epoch": 0.8, + "grad_norm": 2.7472132666535427, + "learning_rate": 9.944315792853583e-07, + "loss": 0.4273, + "step": 6979 + }, + { + "epoch": 0.8, + "grad_norm": 2.3817454598977292, + "learning_rate": 9.93318077823431e-07, + "loss": 0.4812, + "step": 6980 + }, + { + "epoch": 0.8, + "grad_norm": 1.8356873481223022, + "learning_rate": 9.922051313663895e-07, + "loss": 0.4546, + "step": 6981 + }, + { + "epoch": 0.8, + "grad_norm": 2.430939947794239, + "learning_rate": 9.910927400683973e-07, + "loss": 0.5812, + "step": 6982 + }, + { + "epoch": 0.8, + "grad_norm": 2.6871584678850673, + "learning_rate": 9.899809040835463e-07, + "loss": 0.4872, + "step": 6983 + }, + { + "epoch": 0.8, + "grad_norm": 3.175770247155596, + "learning_rate": 9.88869623565844e-07, + "loss": 0.437, + "step": 6984 + }, + { + "epoch": 0.8, + "grad_norm": 2.838049926944601, + "learning_rate": 9.877588986692287e-07, + "loss": 0.541, + "step": 6985 + }, + { + "epoch": 0.8, + "grad_norm": 2.230130952393589, + "learning_rate": 9.866487295475541e-07, + "loss": 0.4317, + "step": 6986 + }, + { + "epoch": 0.8, + "grad_norm": 1.8681483162207415, + "learning_rate": 9.855391163546041e-07, + "loss": 0.5667, + "step": 6987 + }, + { + "epoch": 0.8, + "grad_norm": 1.9918095690909632, + "learning_rate": 9.844300592440786e-07, + "loss": 0.4849, + "step": 6988 + }, + { + "epoch": 0.8, + "grad_norm": 2.566977259145139, + "learning_rate": 9.83321558369607e-07, + "loss": 0.4822, + "step": 6989 + }, + { + "epoch": 0.8, + "grad_norm": 2.4108322474721455, + "learning_rate": 9.822136138847376e-07, + "loss": 0.3626, + "step": 6990 + }, + { + "epoch": 0.8, + "grad_norm": 2.24716410499373, + "learning_rate": 9.811062259429427e-07, + "loss": 0.5606, + "step": 6991 + }, + { + "epoch": 0.8, + "grad_norm": 2.139841354911367, + "learning_rate": 9.799993946976156e-07, + "loss": 0.4896, + "step": 6992 + }, + { + "epoch": 0.8, + "grad_norm": 2.7821758112873707, + "learning_rate": 9.788931203020779e-07, + "loss": 0.4456, + "step": 6993 + }, + { + "epoch": 0.8, + "grad_norm": 5.551450790531179, + "learning_rate": 9.777874029095669e-07, + "loss": 0.5511, + "step": 6994 + }, + { + "epoch": 0.8, + "grad_norm": 1.7574118054142498, + "learning_rate": 9.766822426732498e-07, + "loss": 0.5174, + "step": 6995 + }, + { + "epoch": 0.8, + "grad_norm": 3.689577713743871, + "learning_rate": 9.755776397462097e-07, + "loss": 0.4808, + "step": 6996 + }, + { + "epoch": 0.8, + "grad_norm": 2.090351730834388, + "learning_rate": 9.744735942814598e-07, + "loss": 0.4548, + "step": 6997 + }, + { + "epoch": 0.8, + "grad_norm": 1.851570535551841, + "learning_rate": 9.73370106431929e-07, + "loss": 0.4751, + "step": 6998 + }, + { + "epoch": 0.8, + "grad_norm": 3.685757237897816, + "learning_rate": 9.722671763504748e-07, + "loss": 0.5521, + "step": 6999 + }, + { + "epoch": 0.8, + "grad_norm": 2.3329880438269983, + "learning_rate": 9.711648041898725e-07, + "loss": 0.4475, + "step": 7000 + }, + { + "epoch": 0.8, + "grad_norm": 0.8211388058320978, + "learning_rate": 9.700629901028248e-07, + "loss": 0.6808, + "step": 7001 + }, + { + "epoch": 0.8, + "grad_norm": 3.7025022924990716, + "learning_rate": 9.689617342419537e-07, + "loss": 0.498, + "step": 7002 + }, + { + "epoch": 0.8, + "grad_norm": 0.8618205336584218, + "learning_rate": 9.67861036759804e-07, + "loss": 0.6715, + "step": 7003 + }, + { + "epoch": 0.8, + "grad_norm": 3.757651123589481, + "learning_rate": 9.66760897808846e-07, + "loss": 0.4526, + "step": 7004 + }, + { + "epoch": 0.8, + "grad_norm": 2.0239934404780215, + "learning_rate": 9.65661317541469e-07, + "loss": 0.5025, + "step": 7005 + }, + { + "epoch": 0.81, + "grad_norm": 2.1214484633625643, + "learning_rate": 9.645622961099888e-07, + "loss": 0.5062, + "step": 7006 + }, + { + "epoch": 0.81, + "grad_norm": 2.1421659946204272, + "learning_rate": 9.634638336666403e-07, + "loss": 0.4379, + "step": 7007 + }, + { + "epoch": 0.81, + "grad_norm": 0.8710571435972807, + "learning_rate": 9.623659303635819e-07, + "loss": 0.6844, + "step": 7008 + }, + { + "epoch": 0.81, + "grad_norm": 2.2186878990820427, + "learning_rate": 9.612685863528949e-07, + "loss": 0.5013, + "step": 7009 + }, + { + "epoch": 0.81, + "grad_norm": 2.1765719997345463, + "learning_rate": 9.601718017865847e-07, + "loss": 0.449, + "step": 7010 + }, + { + "epoch": 0.81, + "grad_norm": 1.95680934889087, + "learning_rate": 9.590755768165755e-07, + "loss": 0.5009, + "step": 7011 + }, + { + "epoch": 0.81, + "grad_norm": 6.94023325599711, + "learning_rate": 9.579799115947193e-07, + "loss": 0.5847, + "step": 7012 + }, + { + "epoch": 0.81, + "grad_norm": 1.92846928151658, + "learning_rate": 9.568848062727836e-07, + "loss": 0.4829, + "step": 7013 + }, + { + "epoch": 0.81, + "grad_norm": 3.238418984263425, + "learning_rate": 9.557902610024655e-07, + "loss": 0.4819, + "step": 7014 + }, + { + "epoch": 0.81, + "grad_norm": 1.9366319011502207, + "learning_rate": 9.546962759353794e-07, + "loss": 0.3804, + "step": 7015 + }, + { + "epoch": 0.81, + "grad_norm": 2.388475621553339, + "learning_rate": 9.536028512230655e-07, + "loss": 0.4742, + "step": 7016 + }, + { + "epoch": 0.81, + "grad_norm": 2.1441527452603784, + "learning_rate": 9.525099870169824e-07, + "loss": 0.5296, + "step": 7017 + }, + { + "epoch": 0.81, + "grad_norm": 2.3951285106336235, + "learning_rate": 9.514176834685157e-07, + "loss": 0.4536, + "step": 7018 + }, + { + "epoch": 0.81, + "grad_norm": 1.5776643327558262, + "learning_rate": 9.503259407289695e-07, + "loss": 0.4895, + "step": 7019 + }, + { + "epoch": 0.81, + "grad_norm": 2.62008075437621, + "learning_rate": 9.492347589495737e-07, + "loss": 0.4446, + "step": 7020 + }, + { + "epoch": 0.81, + "grad_norm": 2.031754906439386, + "learning_rate": 9.481441382814776e-07, + "loss": 0.4304, + "step": 7021 + }, + { + "epoch": 0.81, + "grad_norm": 5.9209253818117356, + "learning_rate": 9.470540788757526e-07, + "loss": 0.5019, + "step": 7022 + }, + { + "epoch": 0.81, + "grad_norm": 2.9689679585386908, + "learning_rate": 9.459645808833956e-07, + "loss": 0.5793, + "step": 7023 + }, + { + "epoch": 0.81, + "grad_norm": 1.9025320177411236, + "learning_rate": 9.448756444553226e-07, + "loss": 0.5115, + "step": 7024 + }, + { + "epoch": 0.81, + "grad_norm": 3.0321541978501743, + "learning_rate": 9.437872697423717e-07, + "loss": 0.5404, + "step": 7025 + }, + { + "epoch": 0.81, + "grad_norm": 2.3897363933159563, + "learning_rate": 9.426994568953069e-07, + "loss": 0.5554, + "step": 7026 + }, + { + "epoch": 0.81, + "grad_norm": 3.076949630484575, + "learning_rate": 9.416122060648109e-07, + "loss": 0.4384, + "step": 7027 + }, + { + "epoch": 0.81, + "grad_norm": 1.8868886055212564, + "learning_rate": 9.405255174014876e-07, + "loss": 0.3976, + "step": 7028 + }, + { + "epoch": 0.81, + "grad_norm": 3.451106531534848, + "learning_rate": 9.39439391055868e-07, + "loss": 0.5219, + "step": 7029 + }, + { + "epoch": 0.81, + "grad_norm": 2.342789745505471, + "learning_rate": 9.383538271783993e-07, + "loss": 0.5348, + "step": 7030 + }, + { + "epoch": 0.81, + "grad_norm": 0.8863303992556801, + "learning_rate": 9.372688259194556e-07, + "loss": 0.6751, + "step": 7031 + }, + { + "epoch": 0.81, + "grad_norm": 2.871688100187901, + "learning_rate": 9.361843874293302e-07, + "loss": 0.4663, + "step": 7032 + }, + { + "epoch": 0.81, + "grad_norm": 1.726703894177428, + "learning_rate": 9.351005118582401e-07, + "loss": 0.438, + "step": 7033 + }, + { + "epoch": 0.81, + "grad_norm": 2.1699527537512284, + "learning_rate": 9.340171993563224e-07, + "loss": 0.5172, + "step": 7034 + }, + { + "epoch": 0.81, + "grad_norm": 1.8219710895995123, + "learning_rate": 9.329344500736387e-07, + "loss": 0.4662, + "step": 7035 + }, + { + "epoch": 0.81, + "grad_norm": 1.9764108806576621, + "learning_rate": 9.318522641601696e-07, + "loss": 0.5037, + "step": 7036 + }, + { + "epoch": 0.81, + "grad_norm": 2.1248778128600723, + "learning_rate": 9.307706417658213e-07, + "loss": 0.5511, + "step": 7037 + }, + { + "epoch": 0.81, + "grad_norm": 1.6427497621219986, + "learning_rate": 9.296895830404178e-07, + "loss": 0.6188, + "step": 7038 + }, + { + "epoch": 0.81, + "grad_norm": 2.1166235041367365, + "learning_rate": 9.286090881337096e-07, + "loss": 0.5768, + "step": 7039 + }, + { + "epoch": 0.81, + "grad_norm": 1.6982839443888422, + "learning_rate": 9.275291571953637e-07, + "loss": 0.4529, + "step": 7040 + }, + { + "epoch": 0.81, + "grad_norm": 2.541298453763416, + "learning_rate": 9.264497903749764e-07, + "loss": 0.4675, + "step": 7041 + }, + { + "epoch": 0.81, + "grad_norm": 1.703669809130345, + "learning_rate": 9.253709878220557e-07, + "loss": 0.4815, + "step": 7042 + }, + { + "epoch": 0.81, + "grad_norm": 2.590531413012473, + "learning_rate": 9.242927496860416e-07, + "loss": 0.4211, + "step": 7043 + }, + { + "epoch": 0.81, + "grad_norm": 1.773865878715323, + "learning_rate": 9.232150761162884e-07, + "loss": 0.4711, + "step": 7044 + }, + { + "epoch": 0.81, + "grad_norm": 2.8889318578347654, + "learning_rate": 9.221379672620784e-07, + "loss": 0.472, + "step": 7045 + }, + { + "epoch": 0.81, + "grad_norm": 2.1391464765388766, + "learning_rate": 9.210614232726107e-07, + "loss": 0.5312, + "step": 7046 + }, + { + "epoch": 0.81, + "grad_norm": 2.5623251622112444, + "learning_rate": 9.199854442970068e-07, + "loss": 0.4753, + "step": 7047 + }, + { + "epoch": 0.81, + "grad_norm": 2.373675449431209, + "learning_rate": 9.189100304843135e-07, + "loss": 0.5195, + "step": 7048 + }, + { + "epoch": 0.81, + "grad_norm": 1.817825588650336, + "learning_rate": 9.178351819834952e-07, + "loss": 0.5063, + "step": 7049 + }, + { + "epoch": 0.81, + "grad_norm": 2.0391025133179412, + "learning_rate": 9.167608989434413e-07, + "loss": 0.5285, + "step": 7050 + }, + { + "epoch": 0.81, + "grad_norm": 0.8905690937983424, + "learning_rate": 9.156871815129592e-07, + "loss": 0.6918, + "step": 7051 + }, + { + "epoch": 0.81, + "grad_norm": 3.331994320352603, + "learning_rate": 9.146140298407824e-07, + "loss": 0.4236, + "step": 7052 + }, + { + "epoch": 0.81, + "grad_norm": 2.2558961752035853, + "learning_rate": 9.135414440755619e-07, + "loss": 0.5603, + "step": 7053 + }, + { + "epoch": 0.81, + "grad_norm": 1.8970930755125217, + "learning_rate": 9.124694243658732e-07, + "loss": 0.4776, + "step": 7054 + }, + { + "epoch": 0.81, + "grad_norm": 1.742581628557631, + "learning_rate": 9.113979708602111e-07, + "loss": 0.5159, + "step": 7055 + }, + { + "epoch": 0.81, + "grad_norm": 1.8577410099208607, + "learning_rate": 9.103270837069955e-07, + "loss": 0.4904, + "step": 7056 + }, + { + "epoch": 0.81, + "grad_norm": 2.375162775843596, + "learning_rate": 9.092567630545624e-07, + "loss": 0.4795, + "step": 7057 + }, + { + "epoch": 0.81, + "grad_norm": 2.688642271705832, + "learning_rate": 9.081870090511763e-07, + "loss": 0.4817, + "step": 7058 + }, + { + "epoch": 0.81, + "grad_norm": 0.8957659174873532, + "learning_rate": 9.071178218450144e-07, + "loss": 0.7194, + "step": 7059 + }, + { + "epoch": 0.81, + "grad_norm": 2.0578758123620093, + "learning_rate": 9.060492015841843e-07, + "loss": 0.5513, + "step": 7060 + }, + { + "epoch": 0.81, + "grad_norm": 1.5262024679770978, + "learning_rate": 9.049811484167082e-07, + "loss": 0.469, + "step": 7061 + }, + { + "epoch": 0.81, + "grad_norm": 2.3349400012848576, + "learning_rate": 9.039136624905359e-07, + "loss": 0.3991, + "step": 7062 + }, + { + "epoch": 0.81, + "grad_norm": 3.6870298734876092, + "learning_rate": 9.028467439535321e-07, + "loss": 0.4558, + "step": 7063 + }, + { + "epoch": 0.81, + "grad_norm": 2.2816171306392543, + "learning_rate": 9.017803929534885e-07, + "loss": 0.2933, + "step": 7064 + }, + { + "epoch": 0.81, + "grad_norm": 8.21499383593897, + "learning_rate": 9.007146096381142e-07, + "loss": 0.4905, + "step": 7065 + }, + { + "epoch": 0.81, + "grad_norm": 1.9759448784554605, + "learning_rate": 8.996493941550438e-07, + "loss": 0.4567, + "step": 7066 + }, + { + "epoch": 0.81, + "grad_norm": 1.720270389015571, + "learning_rate": 8.985847466518288e-07, + "loss": 0.4695, + "step": 7067 + }, + { + "epoch": 0.81, + "grad_norm": 1.9371179415351363, + "learning_rate": 8.975206672759429e-07, + "loss": 0.5705, + "step": 7068 + }, + { + "epoch": 0.81, + "grad_norm": 1.9784798254939324, + "learning_rate": 8.964571561747847e-07, + "loss": 0.5348, + "step": 7069 + }, + { + "epoch": 0.81, + "grad_norm": 2.1355817136758852, + "learning_rate": 8.953942134956695e-07, + "loss": 0.4473, + "step": 7070 + }, + { + "epoch": 0.81, + "grad_norm": 1.9644382723443754, + "learning_rate": 8.943318393858381e-07, + "loss": 0.5142, + "step": 7071 + }, + { + "epoch": 0.81, + "grad_norm": 2.249466319865713, + "learning_rate": 8.932700339924477e-07, + "loss": 0.4784, + "step": 7072 + }, + { + "epoch": 0.81, + "grad_norm": 4.345899129478599, + "learning_rate": 8.922087974625826e-07, + "loss": 0.5234, + "step": 7073 + }, + { + "epoch": 0.81, + "grad_norm": 1.6984425009761908, + "learning_rate": 8.911481299432434e-07, + "loss": 0.4393, + "step": 7074 + }, + { + "epoch": 0.81, + "grad_norm": 1.7600225076890579, + "learning_rate": 8.900880315813532e-07, + "loss": 0.4763, + "step": 7075 + }, + { + "epoch": 0.81, + "grad_norm": 2.3842609592823973, + "learning_rate": 8.890285025237561e-07, + "loss": 0.5632, + "step": 7076 + }, + { + "epoch": 0.81, + "grad_norm": 2.6873067275351232, + "learning_rate": 8.879695429172197e-07, + "loss": 0.5483, + "step": 7077 + }, + { + "epoch": 0.81, + "grad_norm": 2.0682223147057854, + "learning_rate": 8.869111529084285e-07, + "loss": 0.6063, + "step": 7078 + }, + { + "epoch": 0.81, + "grad_norm": 2.783422168271318, + "learning_rate": 8.858533326439938e-07, + "loss": 0.5584, + "step": 7079 + }, + { + "epoch": 0.81, + "grad_norm": 2.356151228943602, + "learning_rate": 8.847960822704416e-07, + "loss": 0.4072, + "step": 7080 + }, + { + "epoch": 0.81, + "grad_norm": 1.8749885815061054, + "learning_rate": 8.837394019342244e-07, + "loss": 0.5363, + "step": 7081 + }, + { + "epoch": 0.81, + "grad_norm": 5.425604513229108, + "learning_rate": 8.826832917817107e-07, + "loss": 0.5061, + "step": 7082 + }, + { + "epoch": 0.81, + "grad_norm": 0.8149863183424358, + "learning_rate": 8.816277519591959e-07, + "loss": 0.6774, + "step": 7083 + }, + { + "epoch": 0.81, + "grad_norm": 1.7749192638716431, + "learning_rate": 8.805727826128901e-07, + "loss": 0.4582, + "step": 7084 + }, + { + "epoch": 0.81, + "grad_norm": 1.7441954094730574, + "learning_rate": 8.795183838889304e-07, + "loss": 0.4721, + "step": 7085 + }, + { + "epoch": 0.81, + "grad_norm": 1.9559520341536387, + "learning_rate": 8.784645559333705e-07, + "loss": 0.5098, + "step": 7086 + }, + { + "epoch": 0.81, + "grad_norm": 1.7421057039736922, + "learning_rate": 8.774112988921852e-07, + "loss": 0.4674, + "step": 7087 + }, + { + "epoch": 0.81, + "grad_norm": 2.3574221597582334, + "learning_rate": 8.763586129112739e-07, + "loss": 0.442, + "step": 7088 + }, + { + "epoch": 0.81, + "grad_norm": 2.6381455464775856, + "learning_rate": 8.753064981364523e-07, + "loss": 0.5638, + "step": 7089 + }, + { + "epoch": 0.81, + "grad_norm": 1.7092716554515601, + "learning_rate": 8.74254954713461e-07, + "loss": 0.5396, + "step": 7090 + }, + { + "epoch": 0.81, + "grad_norm": 1.7658416380613209, + "learning_rate": 8.732039827879591e-07, + "loss": 0.4619, + "step": 7091 + }, + { + "epoch": 0.81, + "grad_norm": 1.6604205410467268, + "learning_rate": 8.721535825055266e-07, + "loss": 0.4715, + "step": 7092 + }, + { + "epoch": 0.82, + "grad_norm": 2.325193798997003, + "learning_rate": 8.711037540116635e-07, + "loss": 0.469, + "step": 7093 + }, + { + "epoch": 0.82, + "grad_norm": 1.9721404510586076, + "learning_rate": 8.700544974517944e-07, + "loss": 0.4353, + "step": 7094 + }, + { + "epoch": 0.82, + "grad_norm": 2.067629913134902, + "learning_rate": 8.690058129712603e-07, + "loss": 0.4741, + "step": 7095 + }, + { + "epoch": 0.82, + "grad_norm": 2.2141461427416207, + "learning_rate": 8.679577007153262e-07, + "loss": 0.5423, + "step": 7096 + }, + { + "epoch": 0.82, + "grad_norm": 2.426286036112623, + "learning_rate": 8.669101608291747e-07, + "loss": 0.4429, + "step": 7097 + }, + { + "epoch": 0.82, + "grad_norm": 2.186779115355316, + "learning_rate": 8.658631934579126e-07, + "loss": 0.5433, + "step": 7098 + }, + { + "epoch": 0.82, + "grad_norm": 1.8603791509333492, + "learning_rate": 8.648167987465645e-07, + "loss": 0.4491, + "step": 7099 + }, + { + "epoch": 0.82, + "grad_norm": 1.7274928311168423, + "learning_rate": 8.637709768400776e-07, + "loss": 0.4572, + "step": 7100 + }, + { + "epoch": 0.82, + "grad_norm": 1.9439760164781315, + "learning_rate": 8.627257278833179e-07, + "loss": 0.5436, + "step": 7101 + }, + { + "epoch": 0.82, + "grad_norm": 2.738983886181228, + "learning_rate": 8.61681052021075e-07, + "loss": 0.5225, + "step": 7102 + }, + { + "epoch": 0.82, + "grad_norm": 2.1759089941098884, + "learning_rate": 8.606369493980543e-07, + "loss": 0.4136, + "step": 7103 + }, + { + "epoch": 0.82, + "grad_norm": 1.7673926337731398, + "learning_rate": 8.595934201588879e-07, + "loss": 0.4672, + "step": 7104 + }, + { + "epoch": 0.82, + "grad_norm": 2.2140059409297845, + "learning_rate": 8.58550464448123e-07, + "loss": 0.487, + "step": 7105 + }, + { + "epoch": 0.82, + "grad_norm": 1.6931790570549277, + "learning_rate": 8.575080824102311e-07, + "loss": 0.4321, + "step": 7106 + }, + { + "epoch": 0.82, + "grad_norm": 2.079819975287847, + "learning_rate": 8.564662741896024e-07, + "loss": 0.4554, + "step": 7107 + }, + { + "epoch": 0.82, + "grad_norm": 2.5216672131939792, + "learning_rate": 8.554250399305475e-07, + "loss": 0.531, + "step": 7108 + }, + { + "epoch": 0.82, + "grad_norm": 1.9759464116717769, + "learning_rate": 8.543843797772972e-07, + "loss": 0.4573, + "step": 7109 + }, + { + "epoch": 0.82, + "grad_norm": 3.0123788135809666, + "learning_rate": 8.533442938740055e-07, + "loss": 0.4851, + "step": 7110 + }, + { + "epoch": 0.82, + "grad_norm": 11.10217092208774, + "learning_rate": 8.523047823647429e-07, + "loss": 0.5369, + "step": 7111 + }, + { + "epoch": 0.82, + "grad_norm": 2.6482496956753625, + "learning_rate": 8.512658453935052e-07, + "loss": 0.535, + "step": 7112 + }, + { + "epoch": 0.82, + "grad_norm": 1.6951791794413231, + "learning_rate": 8.502274831042035e-07, + "loss": 0.4698, + "step": 7113 + }, + { + "epoch": 0.82, + "grad_norm": 2.1419048283401603, + "learning_rate": 8.491896956406709e-07, + "loss": 0.3742, + "step": 7114 + }, + { + "epoch": 0.82, + "grad_norm": 2.647288674381827, + "learning_rate": 8.481524831466636e-07, + "loss": 0.4663, + "step": 7115 + }, + { + "epoch": 0.82, + "grad_norm": 2.212816205352019, + "learning_rate": 8.471158457658546e-07, + "loss": 0.4788, + "step": 7116 + }, + { + "epoch": 0.82, + "grad_norm": 2.226411240350637, + "learning_rate": 8.460797836418406e-07, + "loss": 0.4566, + "step": 7117 + }, + { + "epoch": 0.82, + "grad_norm": 2.0047558874321316, + "learning_rate": 8.450442969181339e-07, + "loss": 0.4682, + "step": 7118 + }, + { + "epoch": 0.82, + "grad_norm": 1.928286400598645, + "learning_rate": 8.440093857381726e-07, + "loss": 0.4678, + "step": 7119 + }, + { + "epoch": 0.82, + "grad_norm": 2.049124996554422, + "learning_rate": 8.429750502453104e-07, + "loss": 0.3416, + "step": 7120 + }, + { + "epoch": 0.82, + "grad_norm": 2.167257519010175, + "learning_rate": 8.419412905828256e-07, + "loss": 0.5378, + "step": 7121 + }, + { + "epoch": 0.82, + "grad_norm": 2.0033286249349964, + "learning_rate": 8.409081068939112e-07, + "loss": 0.4851, + "step": 7122 + }, + { + "epoch": 0.82, + "grad_norm": 2.0456320652895945, + "learning_rate": 8.398754993216873e-07, + "loss": 0.474, + "step": 7123 + }, + { + "epoch": 0.82, + "grad_norm": 2.2988666583937296, + "learning_rate": 8.388434680091873e-07, + "loss": 0.4066, + "step": 7124 + }, + { + "epoch": 0.82, + "grad_norm": 2.0193507375383803, + "learning_rate": 8.378120130993717e-07, + "loss": 0.5461, + "step": 7125 + }, + { + "epoch": 0.82, + "grad_norm": 1.8785091086583843, + "learning_rate": 8.367811347351129e-07, + "loss": 0.4863, + "step": 7126 + }, + { + "epoch": 0.82, + "grad_norm": 1.7100357221126279, + "learning_rate": 8.357508330592113e-07, + "loss": 0.4289, + "step": 7127 + }, + { + "epoch": 0.82, + "grad_norm": 2.248096663910823, + "learning_rate": 8.347211082143813e-07, + "loss": 0.5027, + "step": 7128 + }, + { + "epoch": 0.82, + "grad_norm": 0.8806878421355433, + "learning_rate": 8.336919603432641e-07, + "loss": 0.671, + "step": 7129 + }, + { + "epoch": 0.82, + "grad_norm": 0.8075767460880735, + "learning_rate": 8.326633895884129e-07, + "loss": 0.6814, + "step": 7130 + }, + { + "epoch": 0.82, + "grad_norm": 1.8092762032053218, + "learning_rate": 8.31635396092309e-07, + "loss": 0.5631, + "step": 7131 + }, + { + "epoch": 0.82, + "grad_norm": 2.7939449235489127, + "learning_rate": 8.306079799973477e-07, + "loss": 0.5455, + "step": 7132 + }, + { + "epoch": 0.82, + "grad_norm": 2.5401624836141803, + "learning_rate": 8.29581141445846e-07, + "loss": 0.5001, + "step": 7133 + }, + { + "epoch": 0.82, + "grad_norm": 3.253319121927762, + "learning_rate": 8.285548805800431e-07, + "loss": 0.4524, + "step": 7134 + }, + { + "epoch": 0.82, + "grad_norm": 2.734909214092982, + "learning_rate": 8.275291975420946e-07, + "loss": 0.5165, + "step": 7135 + }, + { + "epoch": 0.82, + "grad_norm": 1.773533349665385, + "learning_rate": 8.265040924740798e-07, + "loss": 0.4372, + "step": 7136 + }, + { + "epoch": 0.82, + "grad_norm": 2.18771972982896, + "learning_rate": 8.254795655179943e-07, + "loss": 0.4934, + "step": 7137 + }, + { + "epoch": 0.82, + "grad_norm": 0.8559613934026883, + "learning_rate": 8.24455616815758e-07, + "loss": 0.6579, + "step": 7138 + }, + { + "epoch": 0.82, + "grad_norm": 1.9152957395454688, + "learning_rate": 8.234322465092049e-07, + "loss": 0.551, + "step": 7139 + }, + { + "epoch": 0.82, + "grad_norm": 1.7113550761355243, + "learning_rate": 8.224094547400946e-07, + "loss": 0.3392, + "step": 7140 + }, + { + "epoch": 0.82, + "grad_norm": 1.9014598729376788, + "learning_rate": 8.213872416501018e-07, + "loss": 0.5268, + "step": 7141 + }, + { + "epoch": 0.82, + "grad_norm": 1.8982393778091111, + "learning_rate": 8.203656073808269e-07, + "loss": 0.4163, + "step": 7142 + }, + { + "epoch": 0.82, + "grad_norm": 2.8096820995286262, + "learning_rate": 8.193445520737819e-07, + "loss": 0.6212, + "step": 7143 + }, + { + "epoch": 0.82, + "grad_norm": 3.834729205758222, + "learning_rate": 8.18324075870407e-07, + "loss": 0.4669, + "step": 7144 + }, + { + "epoch": 0.82, + "grad_norm": 1.9465965060091415, + "learning_rate": 8.17304178912055e-07, + "loss": 0.5516, + "step": 7145 + }, + { + "epoch": 0.82, + "grad_norm": 2.5377106618394305, + "learning_rate": 8.162848613400054e-07, + "loss": 0.5824, + "step": 7146 + }, + { + "epoch": 0.82, + "grad_norm": 2.453241493699161, + "learning_rate": 8.152661232954506e-07, + "loss": 0.5529, + "step": 7147 + }, + { + "epoch": 0.82, + "grad_norm": 1.8181703412781312, + "learning_rate": 8.14247964919509e-07, + "loss": 0.5294, + "step": 7148 + }, + { + "epoch": 0.82, + "grad_norm": 1.7323284929595972, + "learning_rate": 8.132303863532126e-07, + "loss": 0.4541, + "step": 7149 + }, + { + "epoch": 0.82, + "grad_norm": 2.103415859691071, + "learning_rate": 8.1221338773752e-07, + "loss": 0.5308, + "step": 7150 + }, + { + "epoch": 0.82, + "grad_norm": 2.613934354221605, + "learning_rate": 8.111969692133032e-07, + "loss": 0.4561, + "step": 7151 + }, + { + "epoch": 0.82, + "grad_norm": 2.7704158575285973, + "learning_rate": 8.101811309213553e-07, + "loss": 0.5544, + "step": 7152 + }, + { + "epoch": 0.82, + "grad_norm": 2.8445106765520443, + "learning_rate": 8.091658730023927e-07, + "loss": 0.4831, + "step": 7153 + }, + { + "epoch": 0.82, + "grad_norm": 2.0784167410747862, + "learning_rate": 8.081511955970461e-07, + "loss": 0.4512, + "step": 7154 + }, + { + "epoch": 0.82, + "grad_norm": 1.992705775146944, + "learning_rate": 8.071370988458715e-07, + "loss": 0.4659, + "step": 7155 + }, + { + "epoch": 0.82, + "grad_norm": 1.9903867730686569, + "learning_rate": 8.061235828893382e-07, + "loss": 0.4863, + "step": 7156 + }, + { + "epoch": 0.82, + "grad_norm": 0.8442058279685288, + "learning_rate": 8.051106478678411e-07, + "loss": 0.6752, + "step": 7157 + }, + { + "epoch": 0.82, + "grad_norm": 2.0109709247903855, + "learning_rate": 8.040982939216891e-07, + "loss": 0.5485, + "step": 7158 + }, + { + "epoch": 0.82, + "grad_norm": 12.34895072254103, + "learning_rate": 8.030865211911176e-07, + "loss": 0.4745, + "step": 7159 + }, + { + "epoch": 0.82, + "grad_norm": 2.122180840698105, + "learning_rate": 8.020753298162715e-07, + "loss": 0.445, + "step": 7160 + }, + { + "epoch": 0.82, + "grad_norm": 1.5367150801455722, + "learning_rate": 8.010647199372246e-07, + "loss": 0.3822, + "step": 7161 + }, + { + "epoch": 0.82, + "grad_norm": 2.090481456205168, + "learning_rate": 8.000546916939644e-07, + "loss": 0.4591, + "step": 7162 + }, + { + "epoch": 0.82, + "grad_norm": 0.8667318194323672, + "learning_rate": 7.990452452264025e-07, + "loss": 0.6822, + "step": 7163 + }, + { + "epoch": 0.82, + "grad_norm": 1.922746520559312, + "learning_rate": 7.980363806743641e-07, + "loss": 0.4893, + "step": 7164 + }, + { + "epoch": 0.82, + "grad_norm": 2.7570605638722427, + "learning_rate": 7.970280981775991e-07, + "loss": 0.533, + "step": 7165 + }, + { + "epoch": 0.82, + "grad_norm": 2.180705708402237, + "learning_rate": 7.960203978757736e-07, + "loss": 0.4946, + "step": 7166 + }, + { + "epoch": 0.82, + "grad_norm": 0.8792795687230016, + "learning_rate": 7.950132799084753e-07, + "loss": 0.6833, + "step": 7167 + }, + { + "epoch": 0.82, + "grad_norm": 1.8132059701644931, + "learning_rate": 7.940067444152078e-07, + "loss": 0.5378, + "step": 7168 + }, + { + "epoch": 0.82, + "grad_norm": 7.8548728903877825, + "learning_rate": 7.930007915353988e-07, + "loss": 0.4892, + "step": 7169 + }, + { + "epoch": 0.82, + "grad_norm": 2.305215098458179, + "learning_rate": 7.919954214083903e-07, + "loss": 0.5132, + "step": 7170 + }, + { + "epoch": 0.82, + "grad_norm": 1.941541227634566, + "learning_rate": 7.909906341734486e-07, + "loss": 0.4447, + "step": 7171 + }, + { + "epoch": 0.82, + "grad_norm": 2.102629072062652, + "learning_rate": 7.899864299697546e-07, + "loss": 0.5014, + "step": 7172 + }, + { + "epoch": 0.82, + "grad_norm": 3.2349095162524355, + "learning_rate": 7.889828089364105e-07, + "loss": 0.5379, + "step": 7173 + }, + { + "epoch": 0.82, + "grad_norm": 2.1837705866064567, + "learning_rate": 7.879797712124399e-07, + "loss": 0.4881, + "step": 7174 + }, + { + "epoch": 0.82, + "grad_norm": 1.9113937243569958, + "learning_rate": 7.8697731693678e-07, + "loss": 0.5547, + "step": 7175 + }, + { + "epoch": 0.82, + "grad_norm": 2.1675205577960854, + "learning_rate": 7.859754462482938e-07, + "loss": 0.476, + "step": 7176 + }, + { + "epoch": 0.82, + "grad_norm": 2.069727601843752, + "learning_rate": 7.849741592857585e-07, + "loss": 0.4878, + "step": 7177 + }, + { + "epoch": 0.82, + "grad_norm": 2.1014293597331877, + "learning_rate": 7.839734561878732e-07, + "loss": 0.5298, + "step": 7178 + }, + { + "epoch": 0.82, + "grad_norm": 2.2550857355073965, + "learning_rate": 7.829733370932529e-07, + "loss": 0.5246, + "step": 7179 + }, + { + "epoch": 0.83, + "grad_norm": 2.2002130063956766, + "learning_rate": 7.819738021404371e-07, + "loss": 0.5188, + "step": 7180 + }, + { + "epoch": 0.83, + "grad_norm": 1.955232269767143, + "learning_rate": 7.809748514678783e-07, + "loss": 0.4627, + "step": 7181 + }, + { + "epoch": 0.83, + "grad_norm": 2.017760987003644, + "learning_rate": 7.799764852139535e-07, + "loss": 0.4037, + "step": 7182 + }, + { + "epoch": 0.83, + "grad_norm": 2.151292242865054, + "learning_rate": 7.789787035169539e-07, + "loss": 0.546, + "step": 7183 + }, + { + "epoch": 0.83, + "grad_norm": 1.6367685862030645, + "learning_rate": 7.779815065150942e-07, + "loss": 0.464, + "step": 7184 + }, + { + "epoch": 0.83, + "grad_norm": 2.2718581227359955, + "learning_rate": 7.769848943465041e-07, + "loss": 0.4998, + "step": 7185 + }, + { + "epoch": 0.83, + "grad_norm": 2.133329482062187, + "learning_rate": 7.75988867149236e-07, + "loss": 0.4041, + "step": 7186 + }, + { + "epoch": 0.83, + "grad_norm": 2.0147540243557125, + "learning_rate": 7.749934250612568e-07, + "loss": 0.5469, + "step": 7187 + }, + { + "epoch": 0.83, + "grad_norm": 2.195133399334986, + "learning_rate": 7.739985682204581e-07, + "loss": 0.4322, + "step": 7188 + }, + { + "epoch": 0.83, + "grad_norm": 2.3687806953539097, + "learning_rate": 7.730042967646451e-07, + "loss": 0.4531, + "step": 7189 + }, + { + "epoch": 0.83, + "grad_norm": 4.559193039701547, + "learning_rate": 7.720106108315451e-07, + "loss": 0.5108, + "step": 7190 + }, + { + "epoch": 0.83, + "grad_norm": 2.069987156628196, + "learning_rate": 7.710175105588036e-07, + "loss": 0.4847, + "step": 7191 + }, + { + "epoch": 0.83, + "grad_norm": 3.056257570932104, + "learning_rate": 7.700249960839823e-07, + "loss": 0.4405, + "step": 7192 + }, + { + "epoch": 0.83, + "grad_norm": 2.1160321978693313, + "learning_rate": 7.690330675445673e-07, + "loss": 0.5208, + "step": 7193 + }, + { + "epoch": 0.83, + "grad_norm": 2.7549145362097103, + "learning_rate": 7.680417250779593e-07, + "loss": 0.4808, + "step": 7194 + }, + { + "epoch": 0.83, + "grad_norm": 1.9161957836107297, + "learning_rate": 7.670509688214766e-07, + "loss": 0.4607, + "step": 7195 + }, + { + "epoch": 0.83, + "grad_norm": 1.8418102874712863, + "learning_rate": 7.660607989123625e-07, + "loss": 0.5038, + "step": 7196 + }, + { + "epoch": 0.83, + "grad_norm": 2.1732809153605315, + "learning_rate": 7.650712154877732e-07, + "loss": 0.4705, + "step": 7197 + }, + { + "epoch": 0.83, + "grad_norm": 2.1347844942114245, + "learning_rate": 7.640822186847841e-07, + "loss": 0.519, + "step": 7198 + }, + { + "epoch": 0.83, + "grad_norm": 1.9066774776618884, + "learning_rate": 7.630938086403933e-07, + "loss": 0.5416, + "step": 7199 + }, + { + "epoch": 0.83, + "grad_norm": 1.8626974855215608, + "learning_rate": 7.621059854915136e-07, + "loss": 0.542, + "step": 7200 + }, + { + "epoch": 0.83, + "grad_norm": 2.527681097273383, + "learning_rate": 7.611187493749794e-07, + "loss": 0.5229, + "step": 7201 + }, + { + "epoch": 0.83, + "grad_norm": 2.498535550130743, + "learning_rate": 7.601321004275414e-07, + "loss": 0.4809, + "step": 7202 + }, + { + "epoch": 0.83, + "grad_norm": 2.2208369051575825, + "learning_rate": 7.591460387858712e-07, + "loss": 0.5338, + "step": 7203 + }, + { + "epoch": 0.83, + "grad_norm": 2.4686269944689885, + "learning_rate": 7.581605645865558e-07, + "loss": 0.5613, + "step": 7204 + }, + { + "epoch": 0.83, + "grad_norm": 1.7285385220055365, + "learning_rate": 7.57175677966106e-07, + "loss": 0.5189, + "step": 7205 + }, + { + "epoch": 0.83, + "grad_norm": 1.913520529194837, + "learning_rate": 7.561913790609449e-07, + "loss": 0.4224, + "step": 7206 + }, + { + "epoch": 0.83, + "grad_norm": 0.8510649091136632, + "learning_rate": 7.5520766800742e-07, + "loss": 0.6425, + "step": 7207 + }, + { + "epoch": 0.83, + "grad_norm": 1.8318864509980899, + "learning_rate": 7.542245449417929e-07, + "loss": 0.4831, + "step": 7208 + }, + { + "epoch": 0.83, + "grad_norm": 2.6374017847328486, + "learning_rate": 7.532420100002486e-07, + "loss": 0.5372, + "step": 7209 + }, + { + "epoch": 0.83, + "grad_norm": 1.9842241519757968, + "learning_rate": 7.522600633188831e-07, + "loss": 0.3895, + "step": 7210 + }, + { + "epoch": 0.83, + "grad_norm": 3.2831521182222896, + "learning_rate": 7.51278705033719e-07, + "loss": 0.4943, + "step": 7211 + }, + { + "epoch": 0.83, + "grad_norm": 5.187244868998728, + "learning_rate": 7.502979352806916e-07, + "loss": 0.5258, + "step": 7212 + }, + { + "epoch": 0.83, + "grad_norm": 2.0148017872374226, + "learning_rate": 7.493177541956592e-07, + "loss": 0.403, + "step": 7213 + }, + { + "epoch": 0.83, + "grad_norm": 3.6706479359769095, + "learning_rate": 7.483381619143942e-07, + "loss": 0.4627, + "step": 7214 + }, + { + "epoch": 0.83, + "grad_norm": 1.8363730324012142, + "learning_rate": 7.473591585725921e-07, + "loss": 0.4652, + "step": 7215 + }, + { + "epoch": 0.83, + "grad_norm": 1.9130070966900155, + "learning_rate": 7.463807443058624e-07, + "loss": 0.4329, + "step": 7216 + }, + { + "epoch": 0.83, + "grad_norm": 2.5028289626930813, + "learning_rate": 7.454029192497342e-07, + "loss": 0.4683, + "step": 7217 + }, + { + "epoch": 0.83, + "grad_norm": 1.997286108115056, + "learning_rate": 7.444256835396579e-07, + "loss": 0.5105, + "step": 7218 + }, + { + "epoch": 0.83, + "grad_norm": 2.143459082216822, + "learning_rate": 7.434490373109976e-07, + "loss": 0.5333, + "step": 7219 + }, + { + "epoch": 0.83, + "grad_norm": 1.981983029765865, + "learning_rate": 7.424729806990411e-07, + "loss": 0.5497, + "step": 7220 + }, + { + "epoch": 0.83, + "grad_norm": 1.8421357048375944, + "learning_rate": 7.414975138389879e-07, + "loss": 0.4144, + "step": 7221 + }, + { + "epoch": 0.83, + "grad_norm": 2.076303628192361, + "learning_rate": 7.40522636865963e-07, + "loss": 0.5216, + "step": 7222 + }, + { + "epoch": 0.83, + "grad_norm": 6.2948685931315005, + "learning_rate": 7.395483499150036e-07, + "loss": 0.3816, + "step": 7223 + }, + { + "epoch": 0.83, + "grad_norm": 1.843025641190779, + "learning_rate": 7.385746531210697e-07, + "loss": 0.5038, + "step": 7224 + }, + { + "epoch": 0.83, + "grad_norm": 2.8560779780918644, + "learning_rate": 7.376015466190362e-07, + "loss": 0.4859, + "step": 7225 + }, + { + "epoch": 0.83, + "grad_norm": 2.0367798672986273, + "learning_rate": 7.366290305436996e-07, + "loss": 0.4701, + "step": 7226 + }, + { + "epoch": 0.83, + "grad_norm": 1.6686742076763643, + "learning_rate": 7.356571050297695e-07, + "loss": 0.4786, + "step": 7227 + }, + { + "epoch": 0.83, + "grad_norm": 1.8883007968056043, + "learning_rate": 7.346857702118798e-07, + "loss": 0.4542, + "step": 7228 + }, + { + "epoch": 0.83, + "grad_norm": 1.6796377307076744, + "learning_rate": 7.337150262245774e-07, + "loss": 0.4212, + "step": 7229 + }, + { + "epoch": 0.83, + "grad_norm": 2.3615960097503157, + "learning_rate": 7.327448732023312e-07, + "loss": 0.5169, + "step": 7230 + }, + { + "epoch": 0.83, + "grad_norm": 2.2561572660804856, + "learning_rate": 7.317753112795256e-07, + "loss": 0.6223, + "step": 7231 + }, + { + "epoch": 0.83, + "grad_norm": 1.827493591725782, + "learning_rate": 7.308063405904653e-07, + "loss": 0.4697, + "step": 7232 + }, + { + "epoch": 0.83, + "grad_norm": 2.2371019032096555, + "learning_rate": 7.298379612693701e-07, + "loss": 0.4984, + "step": 7233 + }, + { + "epoch": 0.83, + "grad_norm": 4.1185151856443065, + "learning_rate": 7.288701734503823e-07, + "loss": 0.5263, + "step": 7234 + }, + { + "epoch": 0.83, + "grad_norm": 0.8242167454328136, + "learning_rate": 7.279029772675572e-07, + "loss": 0.6658, + "step": 7235 + }, + { + "epoch": 0.83, + "grad_norm": 2.333117371326312, + "learning_rate": 7.269363728548723e-07, + "loss": 0.3986, + "step": 7236 + }, + { + "epoch": 0.83, + "grad_norm": 2.330393626976881, + "learning_rate": 7.259703603462215e-07, + "loss": 0.4969, + "step": 7237 + }, + { + "epoch": 0.83, + "grad_norm": 2.2417854671646, + "learning_rate": 7.250049398754144e-07, + "loss": 0.4614, + "step": 7238 + }, + { + "epoch": 0.83, + "grad_norm": 1.8014093345280997, + "learning_rate": 7.240401115761841e-07, + "loss": 0.5091, + "step": 7239 + }, + { + "epoch": 0.83, + "grad_norm": 1.9211338336842472, + "learning_rate": 7.23075875582176e-07, + "loss": 0.4664, + "step": 7240 + }, + { + "epoch": 0.83, + "grad_norm": 2.973058076861141, + "learning_rate": 7.22112232026958e-07, + "loss": 0.5227, + "step": 7241 + }, + { + "epoch": 0.83, + "grad_norm": 3.128538434917711, + "learning_rate": 7.211491810440107e-07, + "loss": 0.495, + "step": 7242 + }, + { + "epoch": 0.83, + "grad_norm": 2.0774347077688655, + "learning_rate": 7.201867227667408e-07, + "loss": 0.3879, + "step": 7243 + }, + { + "epoch": 0.83, + "grad_norm": 2.903181520863112, + "learning_rate": 7.192248573284621e-07, + "loss": 0.5501, + "step": 7244 + }, + { + "epoch": 0.83, + "grad_norm": 1.973632568988977, + "learning_rate": 7.182635848624164e-07, + "loss": 0.5582, + "step": 7245 + }, + { + "epoch": 0.83, + "grad_norm": 2.1575937046995155, + "learning_rate": 7.173029055017555e-07, + "loss": 0.4276, + "step": 7246 + }, + { + "epoch": 0.83, + "grad_norm": 3.4822813595951945, + "learning_rate": 7.163428193795557e-07, + "loss": 0.4861, + "step": 7247 + }, + { + "epoch": 0.83, + "grad_norm": 1.9677855053376065, + "learning_rate": 7.153833266288057e-07, + "loss": 0.4927, + "step": 7248 + }, + { + "epoch": 0.83, + "grad_norm": 1.9754368478743356, + "learning_rate": 7.144244273824164e-07, + "loss": 0.4683, + "step": 7249 + }, + { + "epoch": 0.83, + "grad_norm": 2.036520067071005, + "learning_rate": 7.134661217732114e-07, + "loss": 0.4501, + "step": 7250 + }, + { + "epoch": 0.83, + "grad_norm": 4.310057510790527, + "learning_rate": 7.12508409933938e-07, + "loss": 0.467, + "step": 7251 + }, + { + "epoch": 0.83, + "grad_norm": 2.1583248267490194, + "learning_rate": 7.115512919972562e-07, + "loss": 0.5565, + "step": 7252 + }, + { + "epoch": 0.83, + "grad_norm": 3.1177560350843114, + "learning_rate": 7.105947680957481e-07, + "loss": 0.3885, + "step": 7253 + }, + { + "epoch": 0.83, + "grad_norm": 1.7597421935917266, + "learning_rate": 7.09638838361908e-07, + "loss": 0.4723, + "step": 7254 + }, + { + "epoch": 0.83, + "grad_norm": 2.4650762097638905, + "learning_rate": 7.086835029281541e-07, + "loss": 0.4598, + "step": 7255 + }, + { + "epoch": 0.83, + "grad_norm": 2.182764556918169, + "learning_rate": 7.077287619268186e-07, + "loss": 0.513, + "step": 7256 + }, + { + "epoch": 0.83, + "grad_norm": 2.1153158883086816, + "learning_rate": 7.0677461549015e-07, + "loss": 0.4066, + "step": 7257 + }, + { + "epoch": 0.83, + "grad_norm": 0.8832279179047732, + "learning_rate": 7.058210637503193e-07, + "loss": 0.7148, + "step": 7258 + }, + { + "epoch": 0.83, + "grad_norm": 3.357671654469918, + "learning_rate": 7.048681068394098e-07, + "loss": 0.408, + "step": 7259 + }, + { + "epoch": 0.83, + "grad_norm": 2.335062610667309, + "learning_rate": 7.039157448894279e-07, + "loss": 0.5368, + "step": 7260 + }, + { + "epoch": 0.83, + "grad_norm": 2.2434962596324306, + "learning_rate": 7.029639780322923e-07, + "loss": 0.4847, + "step": 7261 + }, + { + "epoch": 0.83, + "grad_norm": 2.0793097349689256, + "learning_rate": 7.020128063998421e-07, + "loss": 0.4318, + "step": 7262 + }, + { + "epoch": 0.83, + "grad_norm": 2.8175615690636726, + "learning_rate": 7.010622301238329e-07, + "loss": 0.4899, + "step": 7263 + }, + { + "epoch": 0.83, + "grad_norm": 2.3933248161458263, + "learning_rate": 7.001122493359397e-07, + "loss": 0.5284, + "step": 7264 + }, + { + "epoch": 0.83, + "grad_norm": 1.8818318174902324, + "learning_rate": 6.991628641677522e-07, + "loss": 0.5069, + "step": 7265 + }, + { + "epoch": 0.83, + "grad_norm": 1.8001121898212702, + "learning_rate": 6.98214074750781e-07, + "loss": 0.4734, + "step": 7266 + }, + { + "epoch": 0.83, + "grad_norm": 1.8642097293664586, + "learning_rate": 6.972658812164495e-07, + "loss": 0.5144, + "step": 7267 + }, + { + "epoch": 0.84, + "grad_norm": 2.120147867239031, + "learning_rate": 6.963182836961041e-07, + "loss": 0.433, + "step": 7268 + }, + { + "epoch": 0.84, + "grad_norm": 2.036972781559473, + "learning_rate": 6.953712823210035e-07, + "loss": 0.4622, + "step": 7269 + }, + { + "epoch": 0.84, + "grad_norm": 2.291862339002079, + "learning_rate": 6.944248772223289e-07, + "loss": 0.5592, + "step": 7270 + }, + { + "epoch": 0.84, + "grad_norm": 1.8802091831343655, + "learning_rate": 6.934790685311732e-07, + "loss": 0.4569, + "step": 7271 + }, + { + "epoch": 0.84, + "grad_norm": 2.384779987683665, + "learning_rate": 6.925338563785517e-07, + "loss": 0.4725, + "step": 7272 + }, + { + "epoch": 0.84, + "grad_norm": 0.9031285359657009, + "learning_rate": 6.915892408953934e-07, + "loss": 0.7345, + "step": 7273 + }, + { + "epoch": 0.84, + "grad_norm": 1.9233502096819546, + "learning_rate": 6.906452222125487e-07, + "loss": 0.4081, + "step": 7274 + }, + { + "epoch": 0.84, + "grad_norm": 1.7567318430030023, + "learning_rate": 6.897018004607814e-07, + "loss": 0.5376, + "step": 7275 + }, + { + "epoch": 0.84, + "grad_norm": 3.1502653565000953, + "learning_rate": 6.887589757707725e-07, + "loss": 0.5263, + "step": 7276 + }, + { + "epoch": 0.84, + "grad_norm": 1.7787776445745294, + "learning_rate": 6.878167482731251e-07, + "loss": 0.5127, + "step": 7277 + }, + { + "epoch": 0.84, + "grad_norm": 1.7885508151509206, + "learning_rate": 6.868751180983546e-07, + "loss": 0.5484, + "step": 7278 + }, + { + "epoch": 0.84, + "grad_norm": 2.4907070317549804, + "learning_rate": 6.859340853768948e-07, + "loss": 0.481, + "step": 7279 + }, + { + "epoch": 0.84, + "grad_norm": 2.7385753883349415, + "learning_rate": 6.849936502390991e-07, + "loss": 0.5362, + "step": 7280 + }, + { + "epoch": 0.84, + "grad_norm": 2.5130809901777207, + "learning_rate": 6.840538128152358e-07, + "loss": 0.4787, + "step": 7281 + }, + { + "epoch": 0.84, + "grad_norm": 2.203891098524167, + "learning_rate": 6.831145732354893e-07, + "loss": 0.5576, + "step": 7282 + }, + { + "epoch": 0.84, + "grad_norm": 2.1083472086145525, + "learning_rate": 6.821759316299659e-07, + "loss": 0.4538, + "step": 7283 + }, + { + "epoch": 0.84, + "grad_norm": 1.9808281837846622, + "learning_rate": 6.812378881286835e-07, + "loss": 0.4825, + "step": 7284 + }, + { + "epoch": 0.84, + "grad_norm": 2.1055819656301473, + "learning_rate": 6.803004428615817e-07, + "loss": 0.5048, + "step": 7285 + }, + { + "epoch": 0.84, + "grad_norm": 1.8562845569871673, + "learning_rate": 6.793635959585138e-07, + "loss": 0.5021, + "step": 7286 + }, + { + "epoch": 0.84, + "grad_norm": 3.0677075964091864, + "learning_rate": 6.784273475492525e-07, + "loss": 0.4679, + "step": 7287 + }, + { + "epoch": 0.84, + "grad_norm": 1.868580076636986, + "learning_rate": 6.77491697763486e-07, + "loss": 0.5486, + "step": 7288 + }, + { + "epoch": 0.84, + "grad_norm": 1.7362013947820045, + "learning_rate": 6.765566467308216e-07, + "loss": 0.3757, + "step": 7289 + }, + { + "epoch": 0.84, + "grad_norm": 1.5948824632769025, + "learning_rate": 6.756221945807806e-07, + "loss": 0.5286, + "step": 7290 + }, + { + "epoch": 0.84, + "grad_norm": 2.5621107448840275, + "learning_rate": 6.746883414428057e-07, + "loss": 0.5916, + "step": 7291 + }, + { + "epoch": 0.84, + "grad_norm": 2.074388585901026, + "learning_rate": 6.737550874462512e-07, + "loss": 0.4832, + "step": 7292 + }, + { + "epoch": 0.84, + "grad_norm": 1.5517872052451054, + "learning_rate": 6.728224327203936e-07, + "loss": 0.4132, + "step": 7293 + }, + { + "epoch": 0.84, + "grad_norm": 2.0665227334637226, + "learning_rate": 6.718903773944235e-07, + "loss": 0.507, + "step": 7294 + }, + { + "epoch": 0.84, + "grad_norm": 2.0197844704331724, + "learning_rate": 6.709589215974482e-07, + "loss": 0.4449, + "step": 7295 + }, + { + "epoch": 0.84, + "grad_norm": 3.142513687937387, + "learning_rate": 6.700280654584923e-07, + "loss": 0.4468, + "step": 7296 + }, + { + "epoch": 0.84, + "grad_norm": 3.428367670189275, + "learning_rate": 6.690978091065004e-07, + "loss": 0.5605, + "step": 7297 + }, + { + "epoch": 0.84, + "grad_norm": 3.2580809457365927, + "learning_rate": 6.681681526703282e-07, + "loss": 0.3997, + "step": 7298 + }, + { + "epoch": 0.84, + "grad_norm": 1.9850901016927027, + "learning_rate": 6.672390962787545e-07, + "loss": 0.5436, + "step": 7299 + }, + { + "epoch": 0.84, + "grad_norm": 1.935373496211033, + "learning_rate": 6.663106400604696e-07, + "loss": 0.4635, + "step": 7300 + }, + { + "epoch": 0.84, + "grad_norm": 1.8979026651528956, + "learning_rate": 6.653827841440852e-07, + "loss": 0.5728, + "step": 7301 + }, + { + "epoch": 0.84, + "grad_norm": 2.085432877011784, + "learning_rate": 6.644555286581267e-07, + "loss": 0.5542, + "step": 7302 + }, + { + "epoch": 0.84, + "grad_norm": 2.3719931022451353, + "learning_rate": 6.635288737310369e-07, + "loss": 0.4327, + "step": 7303 + }, + { + "epoch": 0.84, + "grad_norm": 3.9702345049432926, + "learning_rate": 6.626028194911771e-07, + "loss": 0.4512, + "step": 7304 + }, + { + "epoch": 0.84, + "grad_norm": 0.892402874265305, + "learning_rate": 6.616773660668224e-07, + "loss": 0.6806, + "step": 7305 + }, + { + "epoch": 0.84, + "grad_norm": 1.7963512602223803, + "learning_rate": 6.607525135861686e-07, + "loss": 0.3657, + "step": 7306 + }, + { + "epoch": 0.84, + "grad_norm": 2.033086489718355, + "learning_rate": 6.598282621773239e-07, + "loss": 0.504, + "step": 7307 + }, + { + "epoch": 0.84, + "grad_norm": 1.8160602966284962, + "learning_rate": 6.589046119683178e-07, + "loss": 0.3991, + "step": 7308 + }, + { + "epoch": 0.84, + "grad_norm": 1.8391129506107051, + "learning_rate": 6.579815630870917e-07, + "loss": 0.5244, + "step": 7309 + }, + { + "epoch": 0.84, + "grad_norm": 1.9279276846684756, + "learning_rate": 6.570591156615086e-07, + "loss": 0.5446, + "step": 7310 + }, + { + "epoch": 0.84, + "grad_norm": 1.9682734346201853, + "learning_rate": 6.561372698193446e-07, + "loss": 0.4732, + "step": 7311 + }, + { + "epoch": 0.84, + "grad_norm": 2.967342666929636, + "learning_rate": 6.552160256882934e-07, + "loss": 0.4819, + "step": 7312 + }, + { + "epoch": 0.84, + "grad_norm": 1.9117496544383945, + "learning_rate": 6.542953833959647e-07, + "loss": 0.5036, + "step": 7313 + }, + { + "epoch": 0.84, + "grad_norm": 3.984548968619642, + "learning_rate": 6.533753430698886e-07, + "loss": 0.4913, + "step": 7314 + }, + { + "epoch": 0.84, + "grad_norm": 2.9382272509102525, + "learning_rate": 6.524559048375051e-07, + "loss": 0.5107, + "step": 7315 + }, + { + "epoch": 0.84, + "grad_norm": 2.3080913144905724, + "learning_rate": 6.515370688261785e-07, + "loss": 0.4449, + "step": 7316 + }, + { + "epoch": 0.84, + "grad_norm": 1.9016757030546967, + "learning_rate": 6.506188351631826e-07, + "loss": 0.4996, + "step": 7317 + }, + { + "epoch": 0.84, + "grad_norm": 5.144887325334697, + "learning_rate": 6.497012039757128e-07, + "loss": 0.4779, + "step": 7318 + }, + { + "epoch": 0.84, + "grad_norm": 2.1351754650895503, + "learning_rate": 6.487841753908785e-07, + "loss": 0.5263, + "step": 7319 + }, + { + "epoch": 0.84, + "grad_norm": 1.8555589757768867, + "learning_rate": 6.478677495357072e-07, + "loss": 0.5147, + "step": 7320 + }, + { + "epoch": 0.84, + "grad_norm": 3.943888422074758, + "learning_rate": 6.469519265371416e-07, + "loss": 0.5326, + "step": 7321 + }, + { + "epoch": 0.84, + "grad_norm": 2.0927064029019484, + "learning_rate": 6.460367065220396e-07, + "loss": 0.4357, + "step": 7322 + }, + { + "epoch": 0.84, + "grad_norm": 1.8481601143699864, + "learning_rate": 6.451220896171806e-07, + "loss": 0.5587, + "step": 7323 + }, + { + "epoch": 0.84, + "grad_norm": 2.0063529725808857, + "learning_rate": 6.442080759492541e-07, + "loss": 0.561, + "step": 7324 + }, + { + "epoch": 0.84, + "grad_norm": 2.161625122387795, + "learning_rate": 6.432946656448719e-07, + "loss": 0.4135, + "step": 7325 + }, + { + "epoch": 0.84, + "grad_norm": 2.0998797580917614, + "learning_rate": 6.423818588305564e-07, + "loss": 0.5521, + "step": 7326 + }, + { + "epoch": 0.84, + "grad_norm": 1.812299048864969, + "learning_rate": 6.414696556327526e-07, + "loss": 0.4679, + "step": 7327 + }, + { + "epoch": 0.84, + "grad_norm": 2.1679924072304866, + "learning_rate": 6.405580561778168e-07, + "loss": 0.4392, + "step": 7328 + }, + { + "epoch": 0.84, + "grad_norm": 2.2402366981500133, + "learning_rate": 6.396470605920241e-07, + "loss": 0.591, + "step": 7329 + }, + { + "epoch": 0.84, + "grad_norm": 1.9692912998689012, + "learning_rate": 6.387366690015645e-07, + "loss": 0.5103, + "step": 7330 + }, + { + "epoch": 0.84, + "grad_norm": 1.7143980078176757, + "learning_rate": 6.378268815325467e-07, + "loss": 0.4535, + "step": 7331 + }, + { + "epoch": 0.84, + "grad_norm": 2.4849000453090535, + "learning_rate": 6.369176983109932e-07, + "loss": 0.4627, + "step": 7332 + }, + { + "epoch": 0.84, + "grad_norm": 5.044473031082685, + "learning_rate": 6.360091194628448e-07, + "loss": 0.4489, + "step": 7333 + }, + { + "epoch": 0.84, + "grad_norm": 1.6810682903497514, + "learning_rate": 6.351011451139566e-07, + "loss": 0.4535, + "step": 7334 + }, + { + "epoch": 0.84, + "grad_norm": 2.850868256136373, + "learning_rate": 6.341937753901029e-07, + "loss": 0.4973, + "step": 7335 + }, + { + "epoch": 0.84, + "grad_norm": 1.8522292941520664, + "learning_rate": 6.332870104169703e-07, + "loss": 0.5491, + "step": 7336 + }, + { + "epoch": 0.84, + "grad_norm": 2.349820739891909, + "learning_rate": 6.323808503201656e-07, + "loss": 0.5185, + "step": 7337 + }, + { + "epoch": 0.84, + "grad_norm": 1.998877229402378, + "learning_rate": 6.314752952252079e-07, + "loss": 0.4089, + "step": 7338 + }, + { + "epoch": 0.84, + "grad_norm": 2.1606313186159842, + "learning_rate": 6.305703452575368e-07, + "loss": 0.4719, + "step": 7339 + }, + { + "epoch": 0.84, + "grad_norm": 2.1783792503470907, + "learning_rate": 6.296660005425048e-07, + "loss": 0.5149, + "step": 7340 + }, + { + "epoch": 0.84, + "grad_norm": 2.0250373912882496, + "learning_rate": 6.287622612053801e-07, + "loss": 0.5143, + "step": 7341 + }, + { + "epoch": 0.84, + "grad_norm": 2.1402418014763884, + "learning_rate": 6.278591273713508e-07, + "loss": 0.4493, + "step": 7342 + }, + { + "epoch": 0.84, + "grad_norm": 2.0539604830382006, + "learning_rate": 6.269565991655174e-07, + "loss": 0.395, + "step": 7343 + }, + { + "epoch": 0.84, + "grad_norm": 1.823952932675824, + "learning_rate": 6.260546767128989e-07, + "loss": 0.4575, + "step": 7344 + }, + { + "epoch": 0.84, + "grad_norm": 1.8877956434440728, + "learning_rate": 6.251533601384296e-07, + "loss": 0.4795, + "step": 7345 + }, + { + "epoch": 0.84, + "grad_norm": 1.8681202966325348, + "learning_rate": 6.242526495669587e-07, + "loss": 0.4477, + "step": 7346 + }, + { + "epoch": 0.84, + "grad_norm": 2.350969096037664, + "learning_rate": 6.233525451232519e-07, + "loss": 0.532, + "step": 7347 + }, + { + "epoch": 0.84, + "grad_norm": 2.5970827149402904, + "learning_rate": 6.224530469319934e-07, + "loss": 0.5717, + "step": 7348 + }, + { + "epoch": 0.84, + "grad_norm": 1.730334697204506, + "learning_rate": 6.215541551177795e-07, + "loss": 0.4345, + "step": 7349 + }, + { + "epoch": 0.84, + "grad_norm": 4.760807513290828, + "learning_rate": 6.206558698051274e-07, + "loss": 0.5329, + "step": 7350 + }, + { + "epoch": 0.84, + "grad_norm": 1.899477012690792, + "learning_rate": 6.197581911184642e-07, + "loss": 0.4678, + "step": 7351 + }, + { + "epoch": 0.84, + "grad_norm": 2.1284397859555195, + "learning_rate": 6.188611191821387e-07, + "loss": 0.481, + "step": 7352 + }, + { + "epoch": 0.84, + "grad_norm": 1.7093903886383504, + "learning_rate": 6.17964654120411e-07, + "loss": 0.441, + "step": 7353 + }, + { + "epoch": 0.84, + "grad_norm": 2.36866324503014, + "learning_rate": 6.170687960574612e-07, + "loss": 0.493, + "step": 7354 + }, + { + "epoch": 0.85, + "grad_norm": 2.344742754319983, + "learning_rate": 6.161735451173822e-07, + "loss": 0.4349, + "step": 7355 + }, + { + "epoch": 0.85, + "grad_norm": 2.0564823050814267, + "learning_rate": 6.15278901424185e-07, + "loss": 0.5004, + "step": 7356 + }, + { + "epoch": 0.85, + "grad_norm": 3.7087505489007735, + "learning_rate": 6.143848651017941e-07, + "loss": 0.4614, + "step": 7357 + }, + { + "epoch": 0.85, + "grad_norm": 1.8809648301136048, + "learning_rate": 6.134914362740535e-07, + "loss": 0.4934, + "step": 7358 + }, + { + "epoch": 0.85, + "grad_norm": 2.1639950624237, + "learning_rate": 6.125986150647179e-07, + "loss": 0.4779, + "step": 7359 + }, + { + "epoch": 0.85, + "grad_norm": 1.826944463172608, + "learning_rate": 6.117064015974633e-07, + "loss": 0.4459, + "step": 7360 + }, + { + "epoch": 0.85, + "grad_norm": 5.04297174490235, + "learning_rate": 6.108147959958776e-07, + "loss": 0.5707, + "step": 7361 + }, + { + "epoch": 0.85, + "grad_norm": 1.9468530383110887, + "learning_rate": 6.099237983834666e-07, + "loss": 0.4852, + "step": 7362 + }, + { + "epoch": 0.85, + "grad_norm": 2.4549057830870034, + "learning_rate": 6.090334088836492e-07, + "loss": 0.5002, + "step": 7363 + }, + { + "epoch": 0.85, + "grad_norm": 2.255534548097752, + "learning_rate": 6.08143627619765e-07, + "loss": 0.4671, + "step": 7364 + }, + { + "epoch": 0.85, + "grad_norm": 1.866377773184388, + "learning_rate": 6.072544547150633e-07, + "loss": 0.4448, + "step": 7365 + }, + { + "epoch": 0.85, + "grad_norm": 2.6404253234514976, + "learning_rate": 6.06365890292715e-07, + "loss": 0.4381, + "step": 7366 + }, + { + "epoch": 0.85, + "grad_norm": 2.1536274923910272, + "learning_rate": 6.054779344758027e-07, + "loss": 0.5694, + "step": 7367 + }, + { + "epoch": 0.85, + "grad_norm": 7.123731391328768, + "learning_rate": 6.045905873873242e-07, + "loss": 0.4282, + "step": 7368 + }, + { + "epoch": 0.85, + "grad_norm": 1.71583646228802, + "learning_rate": 6.037038491501978e-07, + "loss": 0.522, + "step": 7369 + }, + { + "epoch": 0.85, + "grad_norm": 3.6626470094922468, + "learning_rate": 6.028177198872514e-07, + "loss": 0.4022, + "step": 7370 + }, + { + "epoch": 0.85, + "grad_norm": 3.3966502341835882, + "learning_rate": 6.019321997212341e-07, + "loss": 0.5581, + "step": 7371 + }, + { + "epoch": 0.85, + "grad_norm": 2.1022482974203394, + "learning_rate": 6.010472887748053e-07, + "loss": 0.5295, + "step": 7372 + }, + { + "epoch": 0.85, + "grad_norm": 2.133049330434335, + "learning_rate": 6.001629871705455e-07, + "loss": 0.5329, + "step": 7373 + }, + { + "epoch": 0.85, + "grad_norm": 2.1318766430647753, + "learning_rate": 5.992792950309456e-07, + "loss": 0.4958, + "step": 7374 + }, + { + "epoch": 0.85, + "grad_norm": 4.2795485747545445, + "learning_rate": 5.983962124784165e-07, + "loss": 0.4428, + "step": 7375 + }, + { + "epoch": 0.85, + "grad_norm": 2.840424922034586, + "learning_rate": 5.975137396352809e-07, + "loss": 0.5365, + "step": 7376 + }, + { + "epoch": 0.85, + "grad_norm": 2.4705495884182205, + "learning_rate": 5.9663187662378e-07, + "loss": 0.4392, + "step": 7377 + }, + { + "epoch": 0.85, + "grad_norm": 1.7647193124232938, + "learning_rate": 5.957506235660693e-07, + "loss": 0.4282, + "step": 7378 + }, + { + "epoch": 0.85, + "grad_norm": 2.3122739260326632, + "learning_rate": 5.948699805842195e-07, + "loss": 0.5017, + "step": 7379 + }, + { + "epoch": 0.85, + "grad_norm": 2.0337310633161976, + "learning_rate": 5.939899478002153e-07, + "loss": 0.4441, + "step": 7380 + }, + { + "epoch": 0.85, + "grad_norm": 13.309177810807729, + "learning_rate": 5.93110525335962e-07, + "loss": 0.4713, + "step": 7381 + }, + { + "epoch": 0.85, + "grad_norm": 2.275022713390884, + "learning_rate": 5.922317133132744e-07, + "loss": 0.4914, + "step": 7382 + }, + { + "epoch": 0.85, + "grad_norm": 3.258148905905298, + "learning_rate": 5.913535118538871e-07, + "loss": 0.5342, + "step": 7383 + }, + { + "epoch": 0.85, + "grad_norm": 1.8281947079945777, + "learning_rate": 5.904759210794469e-07, + "loss": 0.3753, + "step": 7384 + }, + { + "epoch": 0.85, + "grad_norm": 1.7727812750868706, + "learning_rate": 5.895989411115194e-07, + "loss": 0.5151, + "step": 7385 + }, + { + "epoch": 0.85, + "grad_norm": 3.750507187125224, + "learning_rate": 5.887225720715828e-07, + "loss": 0.4906, + "step": 7386 + }, + { + "epoch": 0.85, + "grad_norm": 2.3939165498474915, + "learning_rate": 5.8784681408103e-07, + "loss": 0.5329, + "step": 7387 + }, + { + "epoch": 0.85, + "grad_norm": 2.847794750890957, + "learning_rate": 5.869716672611741e-07, + "loss": 0.5703, + "step": 7388 + }, + { + "epoch": 0.85, + "grad_norm": 2.1626650434342456, + "learning_rate": 5.86097131733237e-07, + "loss": 0.4765, + "step": 7389 + }, + { + "epoch": 0.85, + "grad_norm": 1.950007852604291, + "learning_rate": 5.852232076183617e-07, + "loss": 0.4654, + "step": 7390 + }, + { + "epoch": 0.85, + "grad_norm": 2.0174853234801198, + "learning_rate": 5.84349895037602e-07, + "loss": 0.409, + "step": 7391 + }, + { + "epoch": 0.85, + "grad_norm": 1.8636683998457557, + "learning_rate": 5.83477194111931e-07, + "loss": 0.5184, + "step": 7392 + }, + { + "epoch": 0.85, + "grad_norm": 1.9209244208404415, + "learning_rate": 5.826051049622334e-07, + "loss": 0.4648, + "step": 7393 + }, + { + "epoch": 0.85, + "grad_norm": 1.961140893703432, + "learning_rate": 5.817336277093121e-07, + "loss": 0.5048, + "step": 7394 + }, + { + "epoch": 0.85, + "grad_norm": 0.8272880542945591, + "learning_rate": 5.808627624738838e-07, + "loss": 0.6636, + "step": 7395 + }, + { + "epoch": 0.85, + "grad_norm": 3.9045614838281164, + "learning_rate": 5.799925093765801e-07, + "loss": 0.5106, + "step": 7396 + }, + { + "epoch": 0.85, + "grad_norm": 1.9259639263360695, + "learning_rate": 5.791228685379474e-07, + "loss": 0.4476, + "step": 7397 + }, + { + "epoch": 0.85, + "grad_norm": 1.9933237910892496, + "learning_rate": 5.782538400784498e-07, + "loss": 0.3991, + "step": 7398 + }, + { + "epoch": 0.85, + "grad_norm": 1.9932924191212764, + "learning_rate": 5.773854241184639e-07, + "loss": 0.4832, + "step": 7399 + }, + { + "epoch": 0.85, + "grad_norm": 1.9699709420561649, + "learning_rate": 5.765176207782841e-07, + "loss": 0.485, + "step": 7400 + }, + { + "epoch": 0.85, + "grad_norm": 2.26833355998099, + "learning_rate": 5.756504301781163e-07, + "loss": 0.5202, + "step": 7401 + }, + { + "epoch": 0.85, + "grad_norm": 2.278133729047044, + "learning_rate": 5.747838524380855e-07, + "loss": 0.4762, + "step": 7402 + }, + { + "epoch": 0.85, + "grad_norm": 2.426927383761171, + "learning_rate": 5.739178876782275e-07, + "loss": 0.4968, + "step": 7403 + }, + { + "epoch": 0.85, + "grad_norm": 2.414839559479861, + "learning_rate": 5.730525360184985e-07, + "loss": 0.5137, + "step": 7404 + }, + { + "epoch": 0.85, + "grad_norm": 2.072038642652952, + "learning_rate": 5.72187797578766e-07, + "loss": 0.4782, + "step": 7405 + }, + { + "epoch": 0.85, + "grad_norm": 1.8059920066218003, + "learning_rate": 5.713236724788113e-07, + "loss": 0.4751, + "step": 7406 + }, + { + "epoch": 0.85, + "grad_norm": 2.428150506744794, + "learning_rate": 5.704601608383353e-07, + "loss": 0.4594, + "step": 7407 + }, + { + "epoch": 0.85, + "grad_norm": 1.640569917083776, + "learning_rate": 5.695972627769497e-07, + "loss": 0.5418, + "step": 7408 + }, + { + "epoch": 0.85, + "grad_norm": 2.1321981168922832, + "learning_rate": 5.687349784141849e-07, + "loss": 0.4839, + "step": 7409 + }, + { + "epoch": 0.85, + "grad_norm": 2.0342254512892333, + "learning_rate": 5.678733078694825e-07, + "loss": 0.4413, + "step": 7410 + }, + { + "epoch": 0.85, + "grad_norm": 2.4091287003184116, + "learning_rate": 5.670122512622022e-07, + "loss": 0.4554, + "step": 7411 + }, + { + "epoch": 0.85, + "grad_norm": 1.7794283360263499, + "learning_rate": 5.66151808711618e-07, + "loss": 0.4437, + "step": 7412 + }, + { + "epoch": 0.85, + "grad_norm": 2.307850046768339, + "learning_rate": 5.652919803369167e-07, + "loss": 0.4991, + "step": 7413 + }, + { + "epoch": 0.85, + "grad_norm": 2.3830578502811623, + "learning_rate": 5.644327662572014e-07, + "loss": 0.4406, + "step": 7414 + }, + { + "epoch": 0.85, + "grad_norm": 1.9691020748667285, + "learning_rate": 5.635741665914918e-07, + "loss": 0.515, + "step": 7415 + }, + { + "epoch": 0.85, + "grad_norm": 2.6626636922513374, + "learning_rate": 5.627161814587195e-07, + "loss": 0.4041, + "step": 7416 + }, + { + "epoch": 0.85, + "grad_norm": 1.974654177270855, + "learning_rate": 5.61858810977734e-07, + "loss": 0.4101, + "step": 7417 + }, + { + "epoch": 0.85, + "grad_norm": 2.001690882768793, + "learning_rate": 5.610020552672968e-07, + "loss": 0.4966, + "step": 7418 + }, + { + "epoch": 0.85, + "grad_norm": 1.7832428661847315, + "learning_rate": 5.601459144460864e-07, + "loss": 0.4855, + "step": 7419 + }, + { + "epoch": 0.85, + "grad_norm": 2.3001500946569484, + "learning_rate": 5.59290388632694e-07, + "loss": 0.4874, + "step": 7420 + }, + { + "epoch": 0.85, + "grad_norm": 1.8782861835794988, + "learning_rate": 5.584354779456291e-07, + "loss": 0.4775, + "step": 7421 + }, + { + "epoch": 0.85, + "grad_norm": 1.9786489134352634, + "learning_rate": 5.575811825033112e-07, + "loss": 0.5153, + "step": 7422 + }, + { + "epoch": 0.85, + "grad_norm": 4.910591231207547, + "learning_rate": 5.567275024240798e-07, + "loss": 0.4797, + "step": 7423 + }, + { + "epoch": 0.85, + "grad_norm": 2.221850131757607, + "learning_rate": 5.558744378261838e-07, + "loss": 0.4794, + "step": 7424 + }, + { + "epoch": 0.85, + "grad_norm": 2.2519760106994378, + "learning_rate": 5.550219888277925e-07, + "loss": 0.5467, + "step": 7425 + }, + { + "epoch": 0.85, + "grad_norm": 2.274203674030171, + "learning_rate": 5.541701555469847e-07, + "loss": 0.4564, + "step": 7426 + }, + { + "epoch": 0.85, + "grad_norm": 2.03904845205746, + "learning_rate": 5.533189381017562e-07, + "loss": 0.4055, + "step": 7427 + }, + { + "epoch": 0.85, + "grad_norm": 10.791902391858443, + "learning_rate": 5.524683366100192e-07, + "loss": 0.4861, + "step": 7428 + }, + { + "epoch": 0.85, + "grad_norm": 2.553883234470799, + "learning_rate": 5.516183511895979e-07, + "loss": 0.506, + "step": 7429 + }, + { + "epoch": 0.85, + "grad_norm": 1.927828753670472, + "learning_rate": 5.507689819582312e-07, + "loss": 0.5527, + "step": 7430 + }, + { + "epoch": 0.85, + "grad_norm": 2.14131738300678, + "learning_rate": 5.499202290335754e-07, + "loss": 0.5234, + "step": 7431 + }, + { + "epoch": 0.85, + "grad_norm": 2.317922969490147, + "learning_rate": 5.490720925331988e-07, + "loss": 0.4588, + "step": 7432 + }, + { + "epoch": 0.85, + "grad_norm": 1.8989339617481684, + "learning_rate": 5.482245725745838e-07, + "loss": 0.5264, + "step": 7433 + }, + { + "epoch": 0.85, + "grad_norm": 1.8727741966306468, + "learning_rate": 5.473776692751315e-07, + "loss": 0.4525, + "step": 7434 + }, + { + "epoch": 0.85, + "grad_norm": 2.201912153173782, + "learning_rate": 5.465313827521518e-07, + "loss": 0.5183, + "step": 7435 + }, + { + "epoch": 0.85, + "grad_norm": 2.910531175184344, + "learning_rate": 5.456857131228743e-07, + "loss": 0.5138, + "step": 7436 + }, + { + "epoch": 0.85, + "grad_norm": 2.0405328944223204, + "learning_rate": 5.448406605044398e-07, + "loss": 0.5227, + "step": 7437 + }, + { + "epoch": 0.85, + "grad_norm": 1.990494301689452, + "learning_rate": 5.439962250139058e-07, + "loss": 0.5237, + "step": 7438 + }, + { + "epoch": 0.85, + "grad_norm": 2.412908144764464, + "learning_rate": 5.431524067682426e-07, + "loss": 0.5102, + "step": 7439 + }, + { + "epoch": 0.85, + "grad_norm": 1.9537980283123841, + "learning_rate": 5.423092058843365e-07, + "loss": 0.4347, + "step": 7440 + }, + { + "epoch": 0.85, + "grad_norm": 2.4212470704707916, + "learning_rate": 5.414666224789866e-07, + "loss": 0.4625, + "step": 7441 + }, + { + "epoch": 0.86, + "grad_norm": 1.8890819629395847, + "learning_rate": 5.406246566689083e-07, + "loss": 0.4552, + "step": 7442 + }, + { + "epoch": 0.86, + "grad_norm": 2.052124001674223, + "learning_rate": 5.397833085707299e-07, + "loss": 0.4724, + "step": 7443 + }, + { + "epoch": 0.86, + "grad_norm": 1.8986847731790975, + "learning_rate": 5.389425783009955e-07, + "loss": 0.4344, + "step": 7444 + }, + { + "epoch": 0.86, + "grad_norm": 2.3410407172732035, + "learning_rate": 5.381024659761624e-07, + "loss": 0.5271, + "step": 7445 + }, + { + "epoch": 0.86, + "grad_norm": 2.656594892094784, + "learning_rate": 5.372629717126033e-07, + "loss": 0.5087, + "step": 7446 + }, + { + "epoch": 0.86, + "grad_norm": 2.288793186539535, + "learning_rate": 5.364240956266031e-07, + "loss": 0.5156, + "step": 7447 + }, + { + "epoch": 0.86, + "grad_norm": 2.008679332605421, + "learning_rate": 5.355858378343653e-07, + "loss": 0.4764, + "step": 7448 + }, + { + "epoch": 0.86, + "grad_norm": 2.96839773703762, + "learning_rate": 5.347481984520031e-07, + "loss": 0.4203, + "step": 7449 + }, + { + "epoch": 0.86, + "grad_norm": 2.275713169463398, + "learning_rate": 5.339111775955475e-07, + "loss": 0.5153, + "step": 7450 + }, + { + "epoch": 0.86, + "grad_norm": 2.48068613428696, + "learning_rate": 5.330747753809423e-07, + "loss": 0.481, + "step": 7451 + }, + { + "epoch": 0.86, + "grad_norm": 2.3287475387582415, + "learning_rate": 5.32238991924045e-07, + "loss": 0.4605, + "step": 7452 + }, + { + "epoch": 0.86, + "grad_norm": 1.5777994765805736, + "learning_rate": 5.314038273406291e-07, + "loss": 0.4057, + "step": 7453 + }, + { + "epoch": 0.86, + "grad_norm": 2.3979543114221613, + "learning_rate": 5.305692817463803e-07, + "loss": 0.5007, + "step": 7454 + }, + { + "epoch": 0.86, + "grad_norm": 3.353294508241976, + "learning_rate": 5.297353552569012e-07, + "loss": 0.5123, + "step": 7455 + }, + { + "epoch": 0.86, + "grad_norm": 2.1625963337692955, + "learning_rate": 5.289020479877055e-07, + "loss": 0.5288, + "step": 7456 + }, + { + "epoch": 0.86, + "grad_norm": 2.8236788696298336, + "learning_rate": 5.280693600542247e-07, + "loss": 0.4559, + "step": 7457 + }, + { + "epoch": 0.86, + "grad_norm": 2.601304463244207, + "learning_rate": 5.27237291571801e-07, + "loss": 0.5463, + "step": 7458 + }, + { + "epoch": 0.86, + "grad_norm": 2.695240440914216, + "learning_rate": 5.264058426556934e-07, + "loss": 0.4991, + "step": 7459 + }, + { + "epoch": 0.86, + "grad_norm": 2.2838188417449214, + "learning_rate": 5.25575013421073e-07, + "loss": 0.5237, + "step": 7460 + }, + { + "epoch": 0.86, + "grad_norm": 1.8865845075275451, + "learning_rate": 5.247448039830277e-07, + "loss": 0.4752, + "step": 7461 + }, + { + "epoch": 0.86, + "grad_norm": 2.817657850117066, + "learning_rate": 5.239152144565557e-07, + "loss": 0.4576, + "step": 7462 + }, + { + "epoch": 0.86, + "grad_norm": 2.0936638824475673, + "learning_rate": 5.230862449565755e-07, + "loss": 0.4935, + "step": 7463 + }, + { + "epoch": 0.86, + "grad_norm": 2.189477321752696, + "learning_rate": 5.222578955979107e-07, + "loss": 0.4369, + "step": 7464 + }, + { + "epoch": 0.86, + "grad_norm": 2.247748327507094, + "learning_rate": 5.214301664953075e-07, + "loss": 0.4046, + "step": 7465 + }, + { + "epoch": 0.86, + "grad_norm": 6.436868510031934, + "learning_rate": 5.206030577634214e-07, + "loss": 0.5248, + "step": 7466 + }, + { + "epoch": 0.86, + "grad_norm": 2.539791197896571, + "learning_rate": 5.19776569516825e-07, + "loss": 0.4151, + "step": 7467 + }, + { + "epoch": 0.86, + "grad_norm": 2.2760368345960984, + "learning_rate": 5.189507018700007e-07, + "loss": 0.4938, + "step": 7468 + }, + { + "epoch": 0.86, + "grad_norm": 4.430810431601701, + "learning_rate": 5.181254549373505e-07, + "loss": 0.4728, + "step": 7469 + }, + { + "epoch": 0.86, + "grad_norm": 2.245732223229314, + "learning_rate": 5.173008288331843e-07, + "loss": 0.517, + "step": 7470 + }, + { + "epoch": 0.86, + "grad_norm": 1.8906805604225645, + "learning_rate": 5.164768236717326e-07, + "loss": 0.5882, + "step": 7471 + }, + { + "epoch": 0.86, + "grad_norm": 1.858442766536668, + "learning_rate": 5.156534395671342e-07, + "loss": 0.5328, + "step": 7472 + }, + { + "epoch": 0.86, + "grad_norm": 1.9210649022042006, + "learning_rate": 5.148306766334438e-07, + "loss": 0.4368, + "step": 7473 + }, + { + "epoch": 0.86, + "grad_norm": 3.602497516123075, + "learning_rate": 5.140085349846324e-07, + "loss": 0.4218, + "step": 7474 + }, + { + "epoch": 0.86, + "grad_norm": 1.8326532634441488, + "learning_rate": 5.131870147345808e-07, + "loss": 0.536, + "step": 7475 + }, + { + "epoch": 0.86, + "grad_norm": 3.1422459040202435, + "learning_rate": 5.123661159970872e-07, + "loss": 0.4156, + "step": 7476 + }, + { + "epoch": 0.86, + "grad_norm": 2.288292535626195, + "learning_rate": 5.115458388858613e-07, + "loss": 0.5201, + "step": 7477 + }, + { + "epoch": 0.86, + "grad_norm": 2.280316333386202, + "learning_rate": 5.107261835145294e-07, + "loss": 0.3821, + "step": 7478 + }, + { + "epoch": 0.86, + "grad_norm": 1.6011649528916359, + "learning_rate": 5.099071499966279e-07, + "loss": 0.4292, + "step": 7479 + }, + { + "epoch": 0.86, + "grad_norm": 1.9634467598732879, + "learning_rate": 5.090887384456127e-07, + "loss": 0.5263, + "step": 7480 + }, + { + "epoch": 0.86, + "grad_norm": 1.854036052821305, + "learning_rate": 5.08270948974845e-07, + "loss": 0.5598, + "step": 7481 + }, + { + "epoch": 0.86, + "grad_norm": 0.8054213757972821, + "learning_rate": 5.074537816976089e-07, + "loss": 0.69, + "step": 7482 + }, + { + "epoch": 0.86, + "grad_norm": 1.832278853643356, + "learning_rate": 5.066372367270955e-07, + "loss": 0.4698, + "step": 7483 + }, + { + "epoch": 0.86, + "grad_norm": 1.668743812932087, + "learning_rate": 5.058213141764151e-07, + "loss": 0.3872, + "step": 7484 + }, + { + "epoch": 0.86, + "grad_norm": 2.292126066413465, + "learning_rate": 5.050060141585866e-07, + "loss": 0.5002, + "step": 7485 + }, + { + "epoch": 0.86, + "grad_norm": 2.637549916453231, + "learning_rate": 5.041913367865475e-07, + "loss": 0.414, + "step": 7486 + }, + { + "epoch": 0.86, + "grad_norm": 2.3914156546030223, + "learning_rate": 5.033772821731447e-07, + "loss": 0.6529, + "step": 7487 + }, + { + "epoch": 0.86, + "grad_norm": 2.004763845281532, + "learning_rate": 5.025638504311431e-07, + "loss": 0.5122, + "step": 7488 + }, + { + "epoch": 0.86, + "grad_norm": 9.40200742298774, + "learning_rate": 5.017510416732169e-07, + "loss": 0.5485, + "step": 7489 + }, + { + "epoch": 0.86, + "grad_norm": 1.7873289842726132, + "learning_rate": 5.009388560119583e-07, + "loss": 0.5399, + "step": 7490 + }, + { + "epoch": 0.86, + "grad_norm": 2.251080739973806, + "learning_rate": 5.0012729355987e-07, + "loss": 0.4036, + "step": 7491 + }, + { + "epoch": 0.86, + "grad_norm": 1.8710136111710667, + "learning_rate": 4.993163544293689e-07, + "loss": 0.3984, + "step": 7492 + }, + { + "epoch": 0.86, + "grad_norm": 3.4801780620277, + "learning_rate": 4.985060387327872e-07, + "loss": 0.4531, + "step": 7493 + }, + { + "epoch": 0.86, + "grad_norm": 2.337404458610312, + "learning_rate": 4.976963465823686e-07, + "loss": 0.5495, + "step": 7494 + }, + { + "epoch": 0.86, + "grad_norm": 1.9616018509901116, + "learning_rate": 4.968872780902739e-07, + "loss": 0.4503, + "step": 7495 + }, + { + "epoch": 0.86, + "grad_norm": 2.1278768332354505, + "learning_rate": 4.960788333685729e-07, + "loss": 0.5188, + "step": 7496 + }, + { + "epoch": 0.86, + "grad_norm": 1.8193805983553766, + "learning_rate": 4.952710125292515e-07, + "loss": 0.5065, + "step": 7497 + }, + { + "epoch": 0.86, + "grad_norm": 2.7420640235334077, + "learning_rate": 4.944638156842086e-07, + "loss": 0.4362, + "step": 7498 + }, + { + "epoch": 0.86, + "grad_norm": 2.715360849479292, + "learning_rate": 4.936572429452585e-07, + "loss": 0.3728, + "step": 7499 + }, + { + "epoch": 0.86, + "grad_norm": 2.35047211516371, + "learning_rate": 4.928512944241259e-07, + "loss": 0.602, + "step": 7500 + }, + { + "epoch": 0.86, + "grad_norm": 2.2032144104208675, + "learning_rate": 4.92045970232452e-07, + "loss": 0.5692, + "step": 7501 + }, + { + "epoch": 0.86, + "grad_norm": 2.315436215093921, + "learning_rate": 4.912412704817882e-07, + "loss": 0.4412, + "step": 7502 + }, + { + "epoch": 0.86, + "grad_norm": 7.561208254704693, + "learning_rate": 4.90437195283604e-07, + "loss": 0.5435, + "step": 7503 + }, + { + "epoch": 0.86, + "grad_norm": 3.1285488672474813, + "learning_rate": 4.896337447492777e-07, + "loss": 0.4126, + "step": 7504 + }, + { + "epoch": 0.86, + "grad_norm": 2.418404871475747, + "learning_rate": 4.888309189901047e-07, + "loss": 0.4225, + "step": 7505 + }, + { + "epoch": 0.86, + "grad_norm": 1.8156476880753913, + "learning_rate": 4.880287181172905e-07, + "loss": 0.4272, + "step": 7506 + }, + { + "epoch": 0.86, + "grad_norm": 3.6184234218254083, + "learning_rate": 4.872271422419572e-07, + "loss": 0.5367, + "step": 7507 + }, + { + "epoch": 0.86, + "grad_norm": 2.4102223705954655, + "learning_rate": 4.864261914751384e-07, + "loss": 0.4896, + "step": 7508 + }, + { + "epoch": 0.86, + "grad_norm": 1.607189392234605, + "learning_rate": 4.856258659277818e-07, + "loss": 0.5388, + "step": 7509 + }, + { + "epoch": 0.86, + "grad_norm": 1.894098615547395, + "learning_rate": 4.848261657107489e-07, + "loss": 0.4778, + "step": 7510 + }, + { + "epoch": 0.86, + "grad_norm": 1.711126982510492, + "learning_rate": 4.840270909348127e-07, + "loss": 0.5282, + "step": 7511 + }, + { + "epoch": 0.86, + "grad_norm": 1.9900422783623928, + "learning_rate": 4.832286417106625e-07, + "loss": 0.4204, + "step": 7512 + }, + { + "epoch": 0.86, + "grad_norm": 2.1723320276684794, + "learning_rate": 4.824308181488979e-07, + "loss": 0.453, + "step": 7513 + }, + { + "epoch": 0.86, + "grad_norm": 2.6545563238851657, + "learning_rate": 4.816336203600335e-07, + "loss": 0.5992, + "step": 7514 + }, + { + "epoch": 0.86, + "grad_norm": 1.9543991706579598, + "learning_rate": 4.808370484544983e-07, + "loss": 0.5535, + "step": 7515 + }, + { + "epoch": 0.86, + "grad_norm": 1.7896131603545111, + "learning_rate": 4.800411025426327e-07, + "loss": 0.515, + "step": 7516 + }, + { + "epoch": 0.86, + "grad_norm": 2.467694598012592, + "learning_rate": 4.792457827346891e-07, + "loss": 0.4574, + "step": 7517 + }, + { + "epoch": 0.86, + "grad_norm": 1.6959573048144598, + "learning_rate": 4.784510891408384e-07, + "loss": 0.4071, + "step": 7518 + }, + { + "epoch": 0.86, + "grad_norm": 2.538551124852352, + "learning_rate": 4.776570218711579e-07, + "loss": 0.603, + "step": 7519 + }, + { + "epoch": 0.86, + "grad_norm": 2.3882086650261427, + "learning_rate": 4.768635810356448e-07, + "loss": 0.5168, + "step": 7520 + }, + { + "epoch": 0.86, + "grad_norm": 2.287792532683033, + "learning_rate": 4.760707667442044e-07, + "loss": 0.5717, + "step": 7521 + }, + { + "epoch": 0.86, + "grad_norm": 2.4629981307283093, + "learning_rate": 4.752785791066583e-07, + "loss": 0.475, + "step": 7522 + }, + { + "epoch": 0.86, + "grad_norm": 2.8116538011486973, + "learning_rate": 4.744870182327388e-07, + "loss": 0.5354, + "step": 7523 + }, + { + "epoch": 0.86, + "grad_norm": 2.5116871383925488, + "learning_rate": 4.736960842320948e-07, + "loss": 0.5967, + "step": 7524 + }, + { + "epoch": 0.86, + "grad_norm": 2.36716815419743, + "learning_rate": 4.729057772142842e-07, + "loss": 0.5885, + "step": 7525 + }, + { + "epoch": 0.86, + "grad_norm": 3.4475358017803974, + "learning_rate": 4.721160972887823e-07, + "loss": 0.4357, + "step": 7526 + }, + { + "epoch": 0.86, + "grad_norm": 2.1782520953726636, + "learning_rate": 4.7132704456497314e-07, + "loss": 0.4876, + "step": 7527 + }, + { + "epoch": 0.86, + "grad_norm": 1.7500905113878773, + "learning_rate": 4.705386191521588e-07, + "loss": 0.5218, + "step": 7528 + }, + { + "epoch": 0.87, + "grad_norm": 2.0629892189182644, + "learning_rate": 4.697508211595492e-07, + "loss": 0.4484, + "step": 7529 + }, + { + "epoch": 0.87, + "grad_norm": 2.321121311455933, + "learning_rate": 4.6896365069627246e-07, + "loss": 0.5005, + "step": 7530 + }, + { + "epoch": 0.87, + "grad_norm": 5.498448567084177, + "learning_rate": 4.6817710787136486e-07, + "loss": 0.4248, + "step": 7531 + }, + { + "epoch": 0.87, + "grad_norm": 3.321289687938903, + "learning_rate": 4.673911927937802e-07, + "loss": 0.502, + "step": 7532 + }, + { + "epoch": 0.87, + "grad_norm": 2.5052951072668868, + "learning_rate": 4.666059055723815e-07, + "loss": 0.4723, + "step": 7533 + }, + { + "epoch": 0.87, + "grad_norm": 1.8895375386432036, + "learning_rate": 4.6582124631594836e-07, + "loss": 0.4204, + "step": 7534 + }, + { + "epoch": 0.87, + "grad_norm": 2.1003799561355723, + "learning_rate": 4.6503721513317004e-07, + "loss": 0.4149, + "step": 7535 + }, + { + "epoch": 0.87, + "grad_norm": 3.010101496357928, + "learning_rate": 4.6425381213265177e-07, + "loss": 0.4661, + "step": 7536 + }, + { + "epoch": 0.87, + "grad_norm": 3.4539598171914836, + "learning_rate": 4.634710374229101e-07, + "loss": 0.3747, + "step": 7537 + }, + { + "epoch": 0.87, + "grad_norm": 2.2007487055175696, + "learning_rate": 4.626888911123739e-07, + "loss": 0.5049, + "step": 7538 + }, + { + "epoch": 0.87, + "grad_norm": 2.6540120031111214, + "learning_rate": 4.619073733093871e-07, + "loss": 0.4898, + "step": 7539 + }, + { + "epoch": 0.87, + "grad_norm": 1.7305761254661562, + "learning_rate": 4.6112648412220404e-07, + "loss": 0.4862, + "step": 7540 + }, + { + "epoch": 0.87, + "grad_norm": 2.4111604569080427, + "learning_rate": 4.6034622365899563e-07, + "loss": 0.5019, + "step": 7541 + }, + { + "epoch": 0.87, + "grad_norm": 2.0405162086909336, + "learning_rate": 4.595665920278408e-07, + "loss": 0.5327, + "step": 7542 + }, + { + "epoch": 0.87, + "grad_norm": 1.9608567758775484, + "learning_rate": 4.587875893367361e-07, + "loss": 0.5201, + "step": 7543 + }, + { + "epoch": 0.87, + "grad_norm": 2.471351893342806, + "learning_rate": 4.5800921569358723e-07, + "loss": 0.4711, + "step": 7544 + }, + { + "epoch": 0.87, + "grad_norm": 2.2555620054457632, + "learning_rate": 4.572314712062159e-07, + "loss": 0.5299, + "step": 7545 + }, + { + "epoch": 0.87, + "grad_norm": 1.8620946231082607, + "learning_rate": 4.564543559823531e-07, + "loss": 0.5069, + "step": 7546 + }, + { + "epoch": 0.87, + "grad_norm": 1.9170212126994397, + "learning_rate": 4.5567787012964826e-07, + "loss": 0.4502, + "step": 7547 + }, + { + "epoch": 0.87, + "grad_norm": 2.5994814476165717, + "learning_rate": 4.549020137556559e-07, + "loss": 0.4379, + "step": 7548 + }, + { + "epoch": 0.87, + "grad_norm": 1.9795954128202704, + "learning_rate": 4.5412678696785007e-07, + "loss": 0.4387, + "step": 7549 + }, + { + "epoch": 0.87, + "grad_norm": 2.018285653542061, + "learning_rate": 4.533521898736132e-07, + "loss": 0.5368, + "step": 7550 + }, + { + "epoch": 0.87, + "grad_norm": 2.0308004086066225, + "learning_rate": 4.52578222580245e-07, + "loss": 0.4131, + "step": 7551 + }, + { + "epoch": 0.87, + "grad_norm": 2.6287344952768934, + "learning_rate": 4.5180488519495246e-07, + "loss": 0.5289, + "step": 7552 + }, + { + "epoch": 0.87, + "grad_norm": 4.009431965692164, + "learning_rate": 4.5103217782486053e-07, + "loss": 0.4215, + "step": 7553 + }, + { + "epoch": 0.87, + "grad_norm": 2.83465563681277, + "learning_rate": 4.5026010057700186e-07, + "loss": 0.5095, + "step": 7554 + }, + { + "epoch": 0.87, + "grad_norm": 1.8910496617226145, + "learning_rate": 4.494886535583276e-07, + "loss": 0.4467, + "step": 7555 + }, + { + "epoch": 0.87, + "grad_norm": 2.4201526145443437, + "learning_rate": 4.487178368756967e-07, + "loss": 0.5583, + "step": 7556 + }, + { + "epoch": 0.87, + "grad_norm": 2.3835953142310613, + "learning_rate": 4.4794765063588207e-07, + "loss": 0.4061, + "step": 7557 + }, + { + "epoch": 0.87, + "grad_norm": 2.1120525521200597, + "learning_rate": 4.471780949455712e-07, + "loss": 0.4898, + "step": 7558 + }, + { + "epoch": 0.87, + "grad_norm": 2.323746980456667, + "learning_rate": 4.4640916991136096e-07, + "loss": 0.4664, + "step": 7559 + }, + { + "epoch": 0.87, + "grad_norm": 0.807541042737632, + "learning_rate": 4.456408756397651e-07, + "loss": 0.6671, + "step": 7560 + }, + { + "epoch": 0.87, + "grad_norm": 1.9930287038756664, + "learning_rate": 4.4487321223720516e-07, + "loss": 0.5729, + "step": 7561 + }, + { + "epoch": 0.87, + "grad_norm": 0.8427141621996793, + "learning_rate": 4.4410617981001934e-07, + "loss": 0.6786, + "step": 7562 + }, + { + "epoch": 0.87, + "grad_norm": 1.9801169210275944, + "learning_rate": 4.433397784644561e-07, + "loss": 0.4996, + "step": 7563 + }, + { + "epoch": 0.87, + "grad_norm": 1.8534931025692385, + "learning_rate": 4.425740083066793e-07, + "loss": 0.4843, + "step": 7564 + }, + { + "epoch": 0.87, + "grad_norm": 1.8228869320205519, + "learning_rate": 4.4180886944275914e-07, + "loss": 0.5617, + "step": 7565 + }, + { + "epoch": 0.87, + "grad_norm": 2.4970343715073633, + "learning_rate": 4.410443619786864e-07, + "loss": 0.526, + "step": 7566 + }, + { + "epoch": 0.87, + "grad_norm": 2.3130563545694187, + "learning_rate": 4.40280486020358e-07, + "loss": 0.5599, + "step": 7567 + }, + { + "epoch": 0.87, + "grad_norm": 2.485456864375333, + "learning_rate": 4.3951724167358766e-07, + "loss": 0.4832, + "step": 7568 + }, + { + "epoch": 0.87, + "grad_norm": 2.5240161688697342, + "learning_rate": 4.3875462904409806e-07, + "loss": 0.495, + "step": 7569 + }, + { + "epoch": 0.87, + "grad_norm": 2.282509741593416, + "learning_rate": 4.379926482375285e-07, + "loss": 0.4779, + "step": 7570 + }, + { + "epoch": 0.87, + "grad_norm": 1.6271917099069408, + "learning_rate": 4.372312993594258e-07, + "loss": 0.3866, + "step": 7571 + }, + { + "epoch": 0.87, + "grad_norm": 1.8691207948266293, + "learning_rate": 4.364705825152543e-07, + "loss": 0.4998, + "step": 7572 + }, + { + "epoch": 0.87, + "grad_norm": 1.9627580119973405, + "learning_rate": 4.357104978103865e-07, + "loss": 0.4986, + "step": 7573 + }, + { + "epoch": 0.87, + "grad_norm": 2.500034605990953, + "learning_rate": 4.34951045350111e-07, + "loss": 0.556, + "step": 7574 + }, + { + "epoch": 0.87, + "grad_norm": 2.12064812133031, + "learning_rate": 4.341922252396258e-07, + "loss": 0.5121, + "step": 7575 + }, + { + "epoch": 0.87, + "grad_norm": 1.830750219903513, + "learning_rate": 4.334340375840418e-07, + "loss": 0.4013, + "step": 7576 + }, + { + "epoch": 0.87, + "grad_norm": 2.1664226726545976, + "learning_rate": 4.3267648248838446e-07, + "loss": 0.5316, + "step": 7577 + }, + { + "epoch": 0.87, + "grad_norm": 2.055989796070457, + "learning_rate": 4.319195600575893e-07, + "loss": 0.4916, + "step": 7578 + }, + { + "epoch": 0.87, + "grad_norm": 2.1064678834924364, + "learning_rate": 4.311632703965063e-07, + "loss": 0.5412, + "step": 7579 + }, + { + "epoch": 0.87, + "grad_norm": 1.9017183348267375, + "learning_rate": 4.3040761360989503e-07, + "loss": 0.4888, + "step": 7580 + }, + { + "epoch": 0.87, + "grad_norm": 1.8082831017060668, + "learning_rate": 4.2965258980243116e-07, + "loss": 0.4364, + "step": 7581 + }, + { + "epoch": 0.87, + "grad_norm": 1.8632897058631996, + "learning_rate": 4.288981990786972e-07, + "loss": 0.4631, + "step": 7582 + }, + { + "epoch": 0.87, + "grad_norm": 2.4351204003025693, + "learning_rate": 4.28144441543194e-07, + "loss": 0.5043, + "step": 7583 + }, + { + "epoch": 0.87, + "grad_norm": 2.0871884040065516, + "learning_rate": 4.273913173003297e-07, + "loss": 0.4692, + "step": 7584 + }, + { + "epoch": 0.87, + "grad_norm": 3.0828053825434742, + "learning_rate": 4.266388264544291e-07, + "loss": 0.506, + "step": 7585 + }, + { + "epoch": 0.87, + "grad_norm": 2.7879559282966846, + "learning_rate": 4.258869691097256e-07, + "loss": 0.4153, + "step": 7586 + }, + { + "epoch": 0.87, + "grad_norm": 2.185177368114945, + "learning_rate": 4.251357453703675e-07, + "loss": 0.4763, + "step": 7587 + }, + { + "epoch": 0.87, + "grad_norm": 3.199721893337713, + "learning_rate": 4.243851553404127e-07, + "loss": 0.5243, + "step": 7588 + }, + { + "epoch": 0.87, + "grad_norm": 2.581961493489355, + "learning_rate": 4.236351991238347e-07, + "loss": 0.4515, + "step": 7589 + }, + { + "epoch": 0.87, + "grad_norm": 2.2269169342791253, + "learning_rate": 4.2288587682451534e-07, + "loss": 0.5182, + "step": 7590 + }, + { + "epoch": 0.87, + "grad_norm": 1.677518784819435, + "learning_rate": 4.221371885462522e-07, + "loss": 0.4909, + "step": 7591 + }, + { + "epoch": 0.87, + "grad_norm": 2.065194191043958, + "learning_rate": 4.2138913439275173e-07, + "loss": 0.4637, + "step": 7592 + }, + { + "epoch": 0.87, + "grad_norm": 0.8198682163949804, + "learning_rate": 4.206417144676367e-07, + "loss": 0.6692, + "step": 7593 + }, + { + "epoch": 0.87, + "grad_norm": 2.1448505436570664, + "learning_rate": 4.1989492887443697e-07, + "loss": 0.5272, + "step": 7594 + }, + { + "epoch": 0.87, + "grad_norm": 1.9841489831245294, + "learning_rate": 4.1914877771659925e-07, + "loss": 0.4991, + "step": 7595 + }, + { + "epoch": 0.87, + "grad_norm": 1.878147052110389, + "learning_rate": 4.1840326109747974e-07, + "loss": 0.424, + "step": 7596 + }, + { + "epoch": 0.87, + "grad_norm": 1.6150963184659404, + "learning_rate": 4.176583791203459e-07, + "loss": 0.5105, + "step": 7597 + }, + { + "epoch": 0.87, + "grad_norm": 2.9396595405128974, + "learning_rate": 4.169141318883807e-07, + "loss": 0.4205, + "step": 7598 + }, + { + "epoch": 0.87, + "grad_norm": 2.520451829596819, + "learning_rate": 4.1617051950467613e-07, + "loss": 0.4737, + "step": 7599 + }, + { + "epoch": 0.87, + "grad_norm": 2.9721295550496296, + "learning_rate": 4.154275420722359e-07, + "loss": 0.4234, + "step": 7600 + }, + { + "epoch": 0.87, + "grad_norm": 2.1640747745046762, + "learning_rate": 4.1468519969397993e-07, + "loss": 0.5124, + "step": 7601 + }, + { + "epoch": 0.87, + "grad_norm": 2.4308831681560097, + "learning_rate": 4.139434924727359e-07, + "loss": 0.469, + "step": 7602 + }, + { + "epoch": 0.87, + "grad_norm": 2.0390990258898096, + "learning_rate": 4.1320242051124395e-07, + "loss": 0.5201, + "step": 7603 + }, + { + "epoch": 0.87, + "grad_norm": 2.704325546544485, + "learning_rate": 4.1246198391215853e-07, + "loss": 0.4716, + "step": 7604 + }, + { + "epoch": 0.87, + "grad_norm": 1.7961369562525837, + "learning_rate": 4.117221827780443e-07, + "loss": 0.4738, + "step": 7605 + }, + { + "epoch": 0.87, + "grad_norm": 2.136810796271047, + "learning_rate": 4.109830172113793e-07, + "loss": 0.4365, + "step": 7606 + }, + { + "epoch": 0.87, + "grad_norm": 1.920080851974724, + "learning_rate": 4.102444873145511e-07, + "loss": 0.4449, + "step": 7607 + }, + { + "epoch": 0.87, + "grad_norm": 1.9729734470636426, + "learning_rate": 4.095065931898623e-07, + "loss": 0.5439, + "step": 7608 + }, + { + "epoch": 0.87, + "grad_norm": 1.7844033086701856, + "learning_rate": 4.0876933493952444e-07, + "loss": 0.5091, + "step": 7609 + }, + { + "epoch": 0.87, + "grad_norm": 2.2201202945661884, + "learning_rate": 4.080327126656641e-07, + "loss": 0.5655, + "step": 7610 + }, + { + "epoch": 0.87, + "grad_norm": 2.4503506877360866, + "learning_rate": 4.0729672647031593e-07, + "loss": 0.4812, + "step": 7611 + }, + { + "epoch": 0.87, + "grad_norm": 1.8543311802120197, + "learning_rate": 4.065613764554305e-07, + "loss": 0.4991, + "step": 7612 + }, + { + "epoch": 0.87, + "grad_norm": 2.210093465820383, + "learning_rate": 4.0582666272286685e-07, + "loss": 0.5658, + "step": 7613 + }, + { + "epoch": 0.87, + "grad_norm": 2.3728375979821306, + "learning_rate": 4.0509258537439866e-07, + "loss": 0.5008, + "step": 7614 + }, + { + "epoch": 0.87, + "grad_norm": 2.632928450192723, + "learning_rate": 4.043591445117101e-07, + "loss": 0.5199, + "step": 7615 + }, + { + "epoch": 0.88, + "grad_norm": 2.9584567907168937, + "learning_rate": 4.0362634023639713e-07, + "loss": 0.4008, + "step": 7616 + }, + { + "epoch": 0.88, + "grad_norm": 2.414719967964127, + "learning_rate": 4.028941726499658e-07, + "loss": 0.5683, + "step": 7617 + }, + { + "epoch": 0.88, + "grad_norm": 2.7871151914851655, + "learning_rate": 4.021626418538388e-07, + "loss": 0.4876, + "step": 7618 + }, + { + "epoch": 0.88, + "grad_norm": 2.315510466823666, + "learning_rate": 4.0143174794934516e-07, + "loss": 0.4824, + "step": 7619 + }, + { + "epoch": 0.88, + "grad_norm": 5.550212452180025, + "learning_rate": 4.007014910377305e-07, + "loss": 0.517, + "step": 7620 + }, + { + "epoch": 0.88, + "grad_norm": 1.9865535921344637, + "learning_rate": 3.999718712201484e-07, + "loss": 0.5212, + "step": 7621 + }, + { + "epoch": 0.88, + "grad_norm": 2.5375720164956674, + "learning_rate": 3.9924288859766514e-07, + "loss": 0.4375, + "step": 7622 + }, + { + "epoch": 0.88, + "grad_norm": 1.9771022495100916, + "learning_rate": 3.985145432712606e-07, + "loss": 0.4561, + "step": 7623 + }, + { + "epoch": 0.88, + "grad_norm": 3.2274032834331448, + "learning_rate": 3.9778683534182403e-07, + "loss": 0.4892, + "step": 7624 + }, + { + "epoch": 0.88, + "grad_norm": 2.4149666658440734, + "learning_rate": 3.9705976491015874e-07, + "loss": 0.5119, + "step": 7625 + }, + { + "epoch": 0.88, + "grad_norm": 1.7399396363204704, + "learning_rate": 3.963333320769769e-07, + "loss": 0.537, + "step": 7626 + }, + { + "epoch": 0.88, + "grad_norm": 2.586987277356396, + "learning_rate": 3.956075369429052e-07, + "loss": 0.5208, + "step": 7627 + }, + { + "epoch": 0.88, + "grad_norm": 0.8703693874369917, + "learning_rate": 3.948823796084789e-07, + "loss": 0.6761, + "step": 7628 + }, + { + "epoch": 0.88, + "grad_norm": 1.5742361972194594, + "learning_rate": 3.941578601741491e-07, + "loss": 0.5332, + "step": 7629 + }, + { + "epoch": 0.88, + "grad_norm": 2.7847256195736128, + "learning_rate": 3.934339787402736e-07, + "loss": 0.4645, + "step": 7630 + }, + { + "epoch": 0.88, + "grad_norm": 1.9681946171498312, + "learning_rate": 3.927107354071269e-07, + "loss": 0.5269, + "step": 7631 + }, + { + "epoch": 0.88, + "grad_norm": 4.177223105812872, + "learning_rate": 3.9198813027488956e-07, + "loss": 0.4951, + "step": 7632 + }, + { + "epoch": 0.88, + "grad_norm": 1.667181161705969, + "learning_rate": 3.9126616344365933e-07, + "loss": 0.4384, + "step": 7633 + }, + { + "epoch": 0.88, + "grad_norm": 2.8102901722703293, + "learning_rate": 3.905448350134411e-07, + "loss": 0.516, + "step": 7634 + }, + { + "epoch": 0.88, + "grad_norm": 2.0195748339134885, + "learning_rate": 3.8982414508415445e-07, + "loss": 0.5234, + "step": 7635 + }, + { + "epoch": 0.88, + "grad_norm": 3.0240742798109825, + "learning_rate": 3.891040937556284e-07, + "loss": 0.4229, + "step": 7636 + }, + { + "epoch": 0.88, + "grad_norm": 1.9735427922366915, + "learning_rate": 3.8838468112760485e-07, + "loss": 0.4627, + "step": 7637 + }, + { + "epoch": 0.88, + "grad_norm": 6.63888436315548, + "learning_rate": 3.876659072997363e-07, + "loss": 0.5075, + "step": 7638 + }, + { + "epoch": 0.88, + "grad_norm": 1.8804080373674346, + "learning_rate": 3.869477723715881e-07, + "loss": 0.6041, + "step": 7639 + }, + { + "epoch": 0.88, + "grad_norm": 1.775846574354268, + "learning_rate": 3.862302764426351e-07, + "loss": 0.5119, + "step": 7640 + }, + { + "epoch": 0.88, + "grad_norm": 2.211944554532105, + "learning_rate": 3.855134196122645e-07, + "loss": 0.4835, + "step": 7641 + }, + { + "epoch": 0.88, + "grad_norm": 2.39585833870452, + "learning_rate": 3.847972019797763e-07, + "loss": 0.4659, + "step": 7642 + }, + { + "epoch": 0.88, + "grad_norm": 2.177169041259043, + "learning_rate": 3.8408162364438004e-07, + "loss": 0.5575, + "step": 7643 + }, + { + "epoch": 0.88, + "grad_norm": 2.4226263043834115, + "learning_rate": 3.833666847051981e-07, + "loss": 0.432, + "step": 7644 + }, + { + "epoch": 0.88, + "grad_norm": 1.9584017361848907, + "learning_rate": 3.826523852612629e-07, + "loss": 0.5794, + "step": 7645 + }, + { + "epoch": 0.88, + "grad_norm": 1.676550378382369, + "learning_rate": 3.8193872541151976e-07, + "loss": 0.461, + "step": 7646 + }, + { + "epoch": 0.88, + "grad_norm": 3.290950066290489, + "learning_rate": 3.812257052548246e-07, + "loss": 0.5006, + "step": 7647 + }, + { + "epoch": 0.88, + "grad_norm": 2.6514198033085647, + "learning_rate": 3.8051332488994565e-07, + "loss": 0.4542, + "step": 7648 + }, + { + "epoch": 0.88, + "grad_norm": 2.0979813018291282, + "learning_rate": 3.798015844155595e-07, + "loss": 0.4694, + "step": 7649 + }, + { + "epoch": 0.88, + "grad_norm": 1.8896330434950261, + "learning_rate": 3.790904839302584e-07, + "loss": 0.4557, + "step": 7650 + }, + { + "epoch": 0.88, + "grad_norm": 2.0573358495363783, + "learning_rate": 3.7838002353254243e-07, + "loss": 0.5122, + "step": 7651 + }, + { + "epoch": 0.88, + "grad_norm": 2.006059148635446, + "learning_rate": 3.776702033208257e-07, + "loss": 0.5127, + "step": 7652 + }, + { + "epoch": 0.88, + "grad_norm": 4.773230512060994, + "learning_rate": 3.7696102339343067e-07, + "loss": 0.4881, + "step": 7653 + }, + { + "epoch": 0.88, + "grad_norm": 7.011246293340883, + "learning_rate": 3.7625248384859536e-07, + "loss": 0.5343, + "step": 7654 + }, + { + "epoch": 0.88, + "grad_norm": 2.736556812428555, + "learning_rate": 3.755445847844641e-07, + "loss": 0.3857, + "step": 7655 + }, + { + "epoch": 0.88, + "grad_norm": 2.613376102658403, + "learning_rate": 3.7483732629909673e-07, + "loss": 0.5046, + "step": 7656 + }, + { + "epoch": 0.88, + "grad_norm": 2.3114825902498857, + "learning_rate": 3.741307084904611e-07, + "loss": 0.535, + "step": 7657 + }, + { + "epoch": 0.88, + "grad_norm": 1.906164401445148, + "learning_rate": 3.734247314564393e-07, + "loss": 0.4092, + "step": 7658 + }, + { + "epoch": 0.88, + "grad_norm": 4.622250686792475, + "learning_rate": 3.727193952948216e-07, + "loss": 0.5649, + "step": 7659 + }, + { + "epoch": 0.88, + "grad_norm": 2.2921323813166135, + "learning_rate": 3.720147001033125e-07, + "loss": 0.451, + "step": 7660 + }, + { + "epoch": 0.88, + "grad_norm": 1.6287354790537265, + "learning_rate": 3.7131064597952517e-07, + "loss": 0.4905, + "step": 7661 + }, + { + "epoch": 0.88, + "grad_norm": 2.0666547816300813, + "learning_rate": 3.706072330209853e-07, + "loss": 0.5872, + "step": 7662 + }, + { + "epoch": 0.88, + "grad_norm": 1.9262064370297467, + "learning_rate": 3.6990446132513014e-07, + "loss": 0.5343, + "step": 7663 + }, + { + "epoch": 0.88, + "grad_norm": 0.8463670618413531, + "learning_rate": 3.6920233098930614e-07, + "loss": 0.6583, + "step": 7664 + }, + { + "epoch": 0.88, + "grad_norm": 2.013682512529784, + "learning_rate": 3.685008421107744e-07, + "loss": 0.3735, + "step": 7665 + }, + { + "epoch": 0.88, + "grad_norm": 1.6217176621340392, + "learning_rate": 3.6779999478670337e-07, + "loss": 0.3438, + "step": 7666 + }, + { + "epoch": 0.88, + "grad_norm": 2.1155802754632727, + "learning_rate": 3.670997891141753e-07, + "loss": 0.4704, + "step": 7667 + }, + { + "epoch": 0.88, + "grad_norm": 2.1504878098407323, + "learning_rate": 3.6640022519018046e-07, + "loss": 0.5195, + "step": 7668 + }, + { + "epoch": 0.88, + "grad_norm": 8.191136715546454, + "learning_rate": 3.657013031116252e-07, + "loss": 0.5064, + "step": 7669 + }, + { + "epoch": 0.88, + "grad_norm": 1.9404872179957902, + "learning_rate": 3.6500302297532154e-07, + "loss": 0.5422, + "step": 7670 + }, + { + "epoch": 0.88, + "grad_norm": 2.160099085520927, + "learning_rate": 3.643053848779976e-07, + "loss": 0.4684, + "step": 7671 + }, + { + "epoch": 0.88, + "grad_norm": 2.122939551053653, + "learning_rate": 3.636083889162878e-07, + "loss": 0.341, + "step": 7672 + }, + { + "epoch": 0.88, + "grad_norm": 2.6812121227885597, + "learning_rate": 3.629120351867416e-07, + "loss": 0.5076, + "step": 7673 + }, + { + "epoch": 0.88, + "grad_norm": 2.3797288627740376, + "learning_rate": 3.6221632378581616e-07, + "loss": 0.5253, + "step": 7674 + }, + { + "epoch": 0.88, + "grad_norm": 2.0319780553520372, + "learning_rate": 3.615212548098834e-07, + "loss": 0.4276, + "step": 7675 + }, + { + "epoch": 0.88, + "grad_norm": 1.868804732116299, + "learning_rate": 3.6082682835522245e-07, + "loss": 0.5022, + "step": 7676 + }, + { + "epoch": 0.88, + "grad_norm": 2.2819979632788727, + "learning_rate": 3.601330445180262e-07, + "loss": 0.4387, + "step": 7677 + }, + { + "epoch": 0.88, + "grad_norm": 2.3076599046546082, + "learning_rate": 3.594399033943963e-07, + "loss": 0.4153, + "step": 7678 + }, + { + "epoch": 0.88, + "grad_norm": 1.9975086667441697, + "learning_rate": 3.5874740508034744e-07, + "loss": 0.4853, + "step": 7679 + }, + { + "epoch": 0.88, + "grad_norm": 3.933047421896532, + "learning_rate": 3.5805554967180467e-07, + "loss": 0.3763, + "step": 7680 + }, + { + "epoch": 0.88, + "grad_norm": 3.1011014465582996, + "learning_rate": 3.5736433726460243e-07, + "loss": 0.4322, + "step": 7681 + }, + { + "epoch": 0.88, + "grad_norm": 2.4366798178867417, + "learning_rate": 3.566737679544885e-07, + "loss": 0.4687, + "step": 7682 + }, + { + "epoch": 0.88, + "grad_norm": 2.1359394526995357, + "learning_rate": 3.5598384183712033e-07, + "loss": 0.4901, + "step": 7683 + }, + { + "epoch": 0.88, + "grad_norm": 2.835377871972897, + "learning_rate": 3.552945590080653e-07, + "loss": 0.5058, + "step": 7684 + }, + { + "epoch": 0.88, + "grad_norm": 1.8637498660216119, + "learning_rate": 3.546059195628038e-07, + "loss": 0.4098, + "step": 7685 + }, + { + "epoch": 0.88, + "grad_norm": 2.3482424971739007, + "learning_rate": 3.5391792359672605e-07, + "loss": 0.5955, + "step": 7686 + }, + { + "epoch": 0.88, + "grad_norm": 2.918424979794592, + "learning_rate": 3.5323057120513206e-07, + "loss": 0.5, + "step": 7687 + }, + { + "epoch": 0.88, + "grad_norm": 2.1253241595733146, + "learning_rate": 3.5254386248323504e-07, + "loss": 0.3799, + "step": 7688 + }, + { + "epoch": 0.88, + "grad_norm": 2.5316001483898267, + "learning_rate": 3.5185779752615615e-07, + "loss": 0.5699, + "step": 7689 + }, + { + "epoch": 0.88, + "grad_norm": 1.9755734816467194, + "learning_rate": 3.5117237642893153e-07, + "loss": 0.4022, + "step": 7690 + }, + { + "epoch": 0.88, + "grad_norm": 2.5152519540655685, + "learning_rate": 3.504875992865031e-07, + "loss": 0.4126, + "step": 7691 + }, + { + "epoch": 0.88, + "grad_norm": 1.8459814563889863, + "learning_rate": 3.4980346619372776e-07, + "loss": 0.4908, + "step": 7692 + }, + { + "epoch": 0.88, + "grad_norm": 2.5237017988286428, + "learning_rate": 3.4911997724537016e-07, + "loss": 0.4812, + "step": 7693 + }, + { + "epoch": 0.88, + "grad_norm": 1.8863619885869625, + "learning_rate": 3.484371325361086e-07, + "loss": 0.5379, + "step": 7694 + }, + { + "epoch": 0.88, + "grad_norm": 1.8008544136479896, + "learning_rate": 3.477549321605289e-07, + "loss": 0.5266, + "step": 7695 + }, + { + "epoch": 0.88, + "grad_norm": 2.095555357636055, + "learning_rate": 3.4707337621313066e-07, + "loss": 0.4798, + "step": 7696 + }, + { + "epoch": 0.88, + "grad_norm": 2.2290302447944317, + "learning_rate": 3.463924647883221e-07, + "loss": 0.3975, + "step": 7697 + }, + { + "epoch": 0.88, + "grad_norm": 1.9273922420198621, + "learning_rate": 3.45712197980424e-07, + "loss": 0.4769, + "step": 7698 + }, + { + "epoch": 0.88, + "grad_norm": 2.1630417183333215, + "learning_rate": 3.450325758836659e-07, + "loss": 0.5193, + "step": 7699 + }, + { + "epoch": 0.88, + "grad_norm": 1.7419965722192794, + "learning_rate": 3.443535985921892e-07, + "loss": 0.5145, + "step": 7700 + }, + { + "epoch": 0.88, + "grad_norm": 2.525127605511081, + "learning_rate": 3.436752662000448e-07, + "loss": 0.458, + "step": 7701 + }, + { + "epoch": 0.88, + "grad_norm": 1.840323738437548, + "learning_rate": 3.4299757880119686e-07, + "loss": 0.4776, + "step": 7702 + }, + { + "epoch": 0.89, + "grad_norm": 1.878628845690861, + "learning_rate": 3.423205364895171e-07, + "loss": 0.4476, + "step": 7703 + }, + { + "epoch": 0.89, + "grad_norm": 1.6400156119136993, + "learning_rate": 3.4164413935879046e-07, + "loss": 0.3143, + "step": 7704 + }, + { + "epoch": 0.89, + "grad_norm": 2.7797403688488327, + "learning_rate": 3.4096838750271087e-07, + "loss": 0.5563, + "step": 7705 + }, + { + "epoch": 0.89, + "grad_norm": 1.8234686096849333, + "learning_rate": 3.4029328101488293e-07, + "loss": 0.5051, + "step": 7706 + }, + { + "epoch": 0.89, + "grad_norm": 3.4755845756027597, + "learning_rate": 3.3961881998882285e-07, + "loss": 0.4257, + "step": 7707 + }, + { + "epoch": 0.89, + "grad_norm": 4.522880705937556, + "learning_rate": 3.3894500451795597e-07, + "loss": 0.4489, + "step": 7708 + }, + { + "epoch": 0.89, + "grad_norm": 2.548021454071435, + "learning_rate": 3.382718346956204e-07, + "loss": 0.4839, + "step": 7709 + }, + { + "epoch": 0.89, + "grad_norm": 2.0246695343609917, + "learning_rate": 3.375993106150621e-07, + "loss": 0.515, + "step": 7710 + }, + { + "epoch": 0.89, + "grad_norm": 1.938437945090546, + "learning_rate": 3.369274323694405e-07, + "loss": 0.5125, + "step": 7711 + }, + { + "epoch": 0.89, + "grad_norm": 2.4190789947974656, + "learning_rate": 3.362562000518227e-07, + "loss": 0.4284, + "step": 7712 + }, + { + "epoch": 0.89, + "grad_norm": 2.1202512731788032, + "learning_rate": 3.355856137551888e-07, + "loss": 0.5927, + "step": 7713 + }, + { + "epoch": 0.89, + "grad_norm": 1.912847185010895, + "learning_rate": 3.3491567357242736e-07, + "loss": 0.4627, + "step": 7714 + }, + { + "epoch": 0.89, + "grad_norm": 3.2200391498301246, + "learning_rate": 3.3424637959633964e-07, + "loss": 0.5148, + "step": 7715 + }, + { + "epoch": 0.89, + "grad_norm": 1.9414270599981116, + "learning_rate": 3.335777319196348e-07, + "loss": 0.4688, + "step": 7716 + }, + { + "epoch": 0.89, + "grad_norm": 0.8458331594935771, + "learning_rate": 3.3290973063493437e-07, + "loss": 0.6902, + "step": 7717 + }, + { + "epoch": 0.89, + "grad_norm": 2.0967144022055346, + "learning_rate": 3.3224237583476924e-07, + "loss": 0.433, + "step": 7718 + }, + { + "epoch": 0.89, + "grad_norm": 2.388217349491806, + "learning_rate": 3.315756676115822e-07, + "loss": 0.5036, + "step": 7719 + }, + { + "epoch": 0.89, + "grad_norm": 2.2631294466185308, + "learning_rate": 3.309096060577244e-07, + "loss": 0.5007, + "step": 7720 + }, + { + "epoch": 0.89, + "grad_norm": 1.9035945867096333, + "learning_rate": 3.3024419126546026e-07, + "loss": 0.4711, + "step": 7721 + }, + { + "epoch": 0.89, + "grad_norm": 2.113104448797073, + "learning_rate": 3.295794233269611e-07, + "loss": 0.4494, + "step": 7722 + }, + { + "epoch": 0.89, + "grad_norm": 1.787173529468723, + "learning_rate": 3.289153023343117e-07, + "loss": 0.4853, + "step": 7723 + }, + { + "epoch": 0.89, + "grad_norm": 1.7812707297385089, + "learning_rate": 3.2825182837950496e-07, + "loss": 0.5351, + "step": 7724 + }, + { + "epoch": 0.89, + "grad_norm": 2.4471721222126113, + "learning_rate": 3.27589001554447e-07, + "loss": 0.4612, + "step": 7725 + }, + { + "epoch": 0.89, + "grad_norm": 1.8333826762653511, + "learning_rate": 3.269268219509508e-07, + "loss": 0.5878, + "step": 7726 + }, + { + "epoch": 0.89, + "grad_norm": 2.2738888541239364, + "learning_rate": 3.26265289660741e-07, + "loss": 0.4697, + "step": 7727 + }, + { + "epoch": 0.89, + "grad_norm": 7.430661640591568, + "learning_rate": 3.2560440477545474e-07, + "loss": 0.5221, + "step": 7728 + }, + { + "epoch": 0.89, + "grad_norm": 2.1560119625114877, + "learning_rate": 3.2494416738663547e-07, + "loss": 0.5069, + "step": 7729 + }, + { + "epoch": 0.89, + "grad_norm": 2.374307837213059, + "learning_rate": 3.2428457758574173e-07, + "loss": 0.4999, + "step": 7730 + }, + { + "epoch": 0.89, + "grad_norm": 2.4206911794475285, + "learning_rate": 3.236256354641376e-07, + "loss": 0.4911, + "step": 7731 + }, + { + "epoch": 0.89, + "grad_norm": 2.859164167260816, + "learning_rate": 3.229673411131007e-07, + "loss": 0.4454, + "step": 7732 + }, + { + "epoch": 0.89, + "grad_norm": 1.990919302956601, + "learning_rate": 3.22309694623818e-07, + "loss": 0.5464, + "step": 7733 + }, + { + "epoch": 0.89, + "grad_norm": 2.2963688270103266, + "learning_rate": 3.216526960873867e-07, + "loss": 0.5367, + "step": 7734 + }, + { + "epoch": 0.89, + "grad_norm": 1.9743035442087487, + "learning_rate": 3.209963455948123e-07, + "loss": 0.54, + "step": 7735 + }, + { + "epoch": 0.89, + "grad_norm": 1.7157558145288492, + "learning_rate": 3.203406432370143e-07, + "loss": 0.4946, + "step": 7736 + }, + { + "epoch": 0.89, + "grad_norm": 1.8511911782957249, + "learning_rate": 3.196855891048195e-07, + "loss": 0.5035, + "step": 7737 + }, + { + "epoch": 0.89, + "grad_norm": 2.0580455456919853, + "learning_rate": 3.19031183288967e-07, + "loss": 0.5628, + "step": 7738 + }, + { + "epoch": 0.89, + "grad_norm": 1.5973663622580487, + "learning_rate": 3.183774258801031e-07, + "loss": 0.4052, + "step": 7739 + }, + { + "epoch": 0.89, + "grad_norm": 2.065618512979532, + "learning_rate": 3.1772431696878827e-07, + "loss": 0.5071, + "step": 7740 + }, + { + "epoch": 0.89, + "grad_norm": 2.123382797318764, + "learning_rate": 3.170718566454889e-07, + "loss": 0.4078, + "step": 7741 + }, + { + "epoch": 0.89, + "grad_norm": 4.7147147594558705, + "learning_rate": 3.164200450005861e-07, + "loss": 0.5081, + "step": 7742 + }, + { + "epoch": 0.89, + "grad_norm": 0.8441308521121934, + "learning_rate": 3.157688821243665e-07, + "loss": 0.6617, + "step": 7743 + }, + { + "epoch": 0.89, + "grad_norm": 2.4911383668396474, + "learning_rate": 3.151183681070308e-07, + "loss": 0.4469, + "step": 7744 + }, + { + "epoch": 0.89, + "grad_norm": 3.418827584094532, + "learning_rate": 3.144685030386874e-07, + "loss": 0.5005, + "step": 7745 + }, + { + "epoch": 0.89, + "grad_norm": 1.9391508504504427, + "learning_rate": 3.1381928700935484e-07, + "loss": 0.4535, + "step": 7746 + }, + { + "epoch": 0.89, + "grad_norm": 2.1615594203551316, + "learning_rate": 3.1317072010896344e-07, + "loss": 0.3997, + "step": 7747 + }, + { + "epoch": 0.89, + "grad_norm": 3.537062001952909, + "learning_rate": 3.1252280242735136e-07, + "loss": 0.4466, + "step": 7748 + }, + { + "epoch": 0.89, + "grad_norm": 2.374381247419036, + "learning_rate": 3.118755340542695e-07, + "loss": 0.5366, + "step": 7749 + }, + { + "epoch": 0.89, + "grad_norm": 5.4153640618899415, + "learning_rate": 3.112289150793768e-07, + "loss": 0.5229, + "step": 7750 + }, + { + "epoch": 0.89, + "grad_norm": 2.1033695379162634, + "learning_rate": 3.105829455922427e-07, + "loss": 0.5169, + "step": 7751 + }, + { + "epoch": 0.89, + "grad_norm": 2.7321611745321257, + "learning_rate": 3.0993762568234554e-07, + "loss": 0.4287, + "step": 7752 + }, + { + "epoch": 0.89, + "grad_norm": 1.7286133803739854, + "learning_rate": 3.092929554390772e-07, + "loss": 0.4292, + "step": 7753 + }, + { + "epoch": 0.89, + "grad_norm": 1.8643817002166243, + "learning_rate": 3.0864893495173575e-07, + "loss": 0.4611, + "step": 7754 + }, + { + "epoch": 0.89, + "grad_norm": 1.929251815169608, + "learning_rate": 3.0800556430953143e-07, + "loss": 0.4239, + "step": 7755 + }, + { + "epoch": 0.89, + "grad_norm": 2.33003959594801, + "learning_rate": 3.073628436015835e-07, + "loss": 0.4121, + "step": 7756 + }, + { + "epoch": 0.89, + "grad_norm": 2.5612847041073423, + "learning_rate": 3.0672077291692193e-07, + "loss": 0.5089, + "step": 7757 + }, + { + "epoch": 0.89, + "grad_norm": 1.9480081914045402, + "learning_rate": 3.0607935234448547e-07, + "loss": 0.5639, + "step": 7758 + }, + { + "epoch": 0.89, + "grad_norm": 2.5149635915060675, + "learning_rate": 3.054385819731248e-07, + "loss": 0.4471, + "step": 7759 + }, + { + "epoch": 0.89, + "grad_norm": 3.0014668209435893, + "learning_rate": 3.047984618915978e-07, + "loss": 0.4622, + "step": 7760 + }, + { + "epoch": 0.89, + "grad_norm": 2.0587821562889888, + "learning_rate": 3.041589921885757e-07, + "loss": 0.549, + "step": 7761 + }, + { + "epoch": 0.89, + "grad_norm": 2.212448212242227, + "learning_rate": 3.0352017295263603e-07, + "loss": 0.4772, + "step": 7762 + }, + { + "epoch": 0.89, + "grad_norm": 2.0854718038892646, + "learning_rate": 3.0288200427226856e-07, + "loss": 0.4436, + "step": 7763 + }, + { + "epoch": 0.89, + "grad_norm": 3.7174992735536927, + "learning_rate": 3.02244486235872e-07, + "loss": 0.5528, + "step": 7764 + }, + { + "epoch": 0.89, + "grad_norm": 2.9997415711691486, + "learning_rate": 3.0160761893175625e-07, + "loss": 0.4978, + "step": 7765 + }, + { + "epoch": 0.89, + "grad_norm": 2.346855821794742, + "learning_rate": 3.009714024481397e-07, + "loss": 0.4614, + "step": 7766 + }, + { + "epoch": 0.89, + "grad_norm": 2.3959352863048253, + "learning_rate": 3.0033583687315013e-07, + "loss": 0.4974, + "step": 7767 + }, + { + "epoch": 0.89, + "grad_norm": 2.5043022999854316, + "learning_rate": 2.997009222948255e-07, + "loss": 0.4509, + "step": 7768 + }, + { + "epoch": 0.89, + "grad_norm": 1.8837659006289813, + "learning_rate": 2.990666588011165e-07, + "loss": 0.4205, + "step": 7769 + }, + { + "epoch": 0.89, + "grad_norm": 2.4228718694801596, + "learning_rate": 2.9843304647987904e-07, + "loss": 0.5316, + "step": 7770 + }, + { + "epoch": 0.89, + "grad_norm": 1.7235001140768424, + "learning_rate": 2.9780008541888116e-07, + "loss": 0.5393, + "step": 7771 + }, + { + "epoch": 0.89, + "grad_norm": 2.074344524395402, + "learning_rate": 2.971677757058017e-07, + "loss": 0.4203, + "step": 7772 + }, + { + "epoch": 0.89, + "grad_norm": 1.9926817095464413, + "learning_rate": 2.9653611742822664e-07, + "loss": 0.5198, + "step": 7773 + }, + { + "epoch": 0.89, + "grad_norm": 2.2100538465240613, + "learning_rate": 2.9590511067365436e-07, + "loss": 0.4147, + "step": 7774 + }, + { + "epoch": 0.89, + "grad_norm": 2.615202009297096, + "learning_rate": 2.9527475552949106e-07, + "loss": 0.4574, + "step": 7775 + }, + { + "epoch": 0.89, + "grad_norm": 2.6596655606151223, + "learning_rate": 2.946450520830546e-07, + "loss": 0.5196, + "step": 7776 + }, + { + "epoch": 0.89, + "grad_norm": 2.770903818471776, + "learning_rate": 2.940160004215692e-07, + "loss": 0.5432, + "step": 7777 + }, + { + "epoch": 0.89, + "grad_norm": 2.402518018017984, + "learning_rate": 2.9338760063217344e-07, + "loss": 0.5162, + "step": 7778 + }, + { + "epoch": 0.89, + "grad_norm": 1.7615263462726805, + "learning_rate": 2.9275985280191153e-07, + "loss": 0.5337, + "step": 7779 + }, + { + "epoch": 0.89, + "grad_norm": 1.9233575331746176, + "learning_rate": 2.921327570177396e-07, + "loss": 0.5657, + "step": 7780 + }, + { + "epoch": 0.89, + "grad_norm": 1.9936759571930225, + "learning_rate": 2.9150631336652245e-07, + "loss": 0.4473, + "step": 7781 + }, + { + "epoch": 0.89, + "grad_norm": 1.9427183915317798, + "learning_rate": 2.908805219350358e-07, + "loss": 0.4751, + "step": 7782 + }, + { + "epoch": 0.89, + "grad_norm": 4.005507008673433, + "learning_rate": 2.902553828099636e-07, + "loss": 0.5409, + "step": 7783 + }, + { + "epoch": 0.89, + "grad_norm": 2.17015499026607, + "learning_rate": 2.896308960779004e-07, + "loss": 0.5925, + "step": 7784 + }, + { + "epoch": 0.89, + "grad_norm": 3.0320142624396254, + "learning_rate": 2.8900706182534874e-07, + "loss": 0.5564, + "step": 7785 + }, + { + "epoch": 0.89, + "grad_norm": 1.7341321731178418, + "learning_rate": 2.8838388013872344e-07, + "loss": 0.449, + "step": 7786 + }, + { + "epoch": 0.89, + "grad_norm": 1.7317662668692784, + "learning_rate": 2.877613511043459e-07, + "loss": 0.5358, + "step": 7787 + }, + { + "epoch": 0.89, + "grad_norm": 1.7777550529279151, + "learning_rate": 2.8713947480845103e-07, + "loss": 0.5052, + "step": 7788 + }, + { + "epoch": 0.89, + "grad_norm": 1.8631783602341898, + "learning_rate": 2.8651825133717894e-07, + "loss": 0.444, + "step": 7789 + }, + { + "epoch": 0.9, + "grad_norm": 2.238934633838346, + "learning_rate": 2.858976807765834e-07, + "loss": 0.4943, + "step": 7790 + }, + { + "epoch": 0.9, + "grad_norm": 2.1387758587545447, + "learning_rate": 2.852777632126241e-07, + "loss": 0.5509, + "step": 7791 + }, + { + "epoch": 0.9, + "grad_norm": 2.5868507757215116, + "learning_rate": 2.8465849873117177e-07, + "loss": 0.5663, + "step": 7792 + }, + { + "epoch": 0.9, + "grad_norm": 2.263252273686556, + "learning_rate": 2.840398874180084e-07, + "loss": 0.5056, + "step": 7793 + }, + { + "epoch": 0.9, + "grad_norm": 2.3541751331306076, + "learning_rate": 2.834219293588225e-07, + "loss": 0.4596, + "step": 7794 + }, + { + "epoch": 0.9, + "grad_norm": 4.6048330471009375, + "learning_rate": 2.828046246392141e-07, + "loss": 0.4431, + "step": 7795 + }, + { + "epoch": 0.9, + "grad_norm": 2.0595725321667384, + "learning_rate": 2.821879733446919e-07, + "loss": 0.5222, + "step": 7796 + }, + { + "epoch": 0.9, + "grad_norm": 2.5483177348497805, + "learning_rate": 2.8157197556067495e-07, + "loss": 0.4616, + "step": 7797 + }, + { + "epoch": 0.9, + "grad_norm": 2.728568095472027, + "learning_rate": 2.8095663137248984e-07, + "loss": 0.4933, + "step": 7798 + }, + { + "epoch": 0.9, + "grad_norm": 3.185031737150029, + "learning_rate": 2.803419408653757e-07, + "loss": 0.4876, + "step": 7799 + }, + { + "epoch": 0.9, + "grad_norm": 2.172349202060047, + "learning_rate": 2.7972790412447827e-07, + "loss": 0.5167, + "step": 7800 + }, + { + "epoch": 0.9, + "grad_norm": 1.976727912092295, + "learning_rate": 2.791145212348539e-07, + "loss": 0.4744, + "step": 7801 + }, + { + "epoch": 0.9, + "grad_norm": 2.2534991986493904, + "learning_rate": 2.785017922814681e-07, + "loss": 0.5464, + "step": 7802 + }, + { + "epoch": 0.9, + "grad_norm": 2.3510773459806895, + "learning_rate": 2.778897173491968e-07, + "loss": 0.5094, + "step": 7803 + }, + { + "epoch": 0.9, + "grad_norm": 4.97130035466109, + "learning_rate": 2.772782965228232e-07, + "loss": 0.4587, + "step": 7804 + }, + { + "epoch": 0.9, + "grad_norm": 3.006764167875854, + "learning_rate": 2.766675298870425e-07, + "loss": 0.4666, + "step": 7805 + }, + { + "epoch": 0.9, + "grad_norm": 2.938958638616173, + "learning_rate": 2.7605741752645686e-07, + "loss": 0.4192, + "step": 7806 + }, + { + "epoch": 0.9, + "grad_norm": 1.7852334010547655, + "learning_rate": 2.7544795952558045e-07, + "loss": 0.3891, + "step": 7807 + }, + { + "epoch": 0.9, + "grad_norm": 2.025615234693282, + "learning_rate": 2.748391559688335e-07, + "loss": 0.5499, + "step": 7808 + }, + { + "epoch": 0.9, + "grad_norm": 3.297855721679448, + "learning_rate": 2.742310069405485e-07, + "loss": 0.5158, + "step": 7809 + }, + { + "epoch": 0.9, + "grad_norm": 1.8545338457820344, + "learning_rate": 2.736235125249664e-07, + "loss": 0.4255, + "step": 7810 + }, + { + "epoch": 0.9, + "grad_norm": 3.07817933727314, + "learning_rate": 2.730166728062361e-07, + "loss": 0.4137, + "step": 7811 + }, + { + "epoch": 0.9, + "grad_norm": 2.4258874537778103, + "learning_rate": 2.7241048786841805e-07, + "loss": 0.5015, + "step": 7812 + }, + { + "epoch": 0.9, + "grad_norm": 1.9924916361666247, + "learning_rate": 2.718049577954796e-07, + "loss": 0.4822, + "step": 7813 + }, + { + "epoch": 0.9, + "grad_norm": 2.5208362841367475, + "learning_rate": 2.7120008267130016e-07, + "loss": 0.5691, + "step": 7814 + }, + { + "epoch": 0.9, + "grad_norm": 2.224433965261639, + "learning_rate": 2.7059586257966565e-07, + "loss": 0.4665, + "step": 7815 + }, + { + "epoch": 0.9, + "grad_norm": 1.9370963190426809, + "learning_rate": 2.699922976042735e-07, + "loss": 0.4578, + "step": 7816 + }, + { + "epoch": 0.9, + "grad_norm": 2.124564876365193, + "learning_rate": 2.693893878287296e-07, + "loss": 0.514, + "step": 7817 + }, + { + "epoch": 0.9, + "grad_norm": 1.7393942931598239, + "learning_rate": 2.687871333365477e-07, + "loss": 0.5096, + "step": 7818 + }, + { + "epoch": 0.9, + "grad_norm": 2.621501975933147, + "learning_rate": 2.6818553421115226e-07, + "loss": 0.5663, + "step": 7819 + }, + { + "epoch": 0.9, + "grad_norm": 2.355057755956894, + "learning_rate": 2.675845905358776e-07, + "loss": 0.4714, + "step": 7820 + }, + { + "epoch": 0.9, + "grad_norm": 2.0595378830192024, + "learning_rate": 2.6698430239396557e-07, + "loss": 0.4783, + "step": 7821 + }, + { + "epoch": 0.9, + "grad_norm": 1.9258348031331385, + "learning_rate": 2.6638466986856847e-07, + "loss": 0.6038, + "step": 7822 + }, + { + "epoch": 0.9, + "grad_norm": 1.7988468341200656, + "learning_rate": 2.6578569304274604e-07, + "loss": 0.5295, + "step": 7823 + }, + { + "epoch": 0.9, + "grad_norm": 1.8827985673437198, + "learning_rate": 2.6518737199947077e-07, + "loss": 0.4528, + "step": 7824 + }, + { + "epoch": 0.9, + "grad_norm": 2.9847250216835213, + "learning_rate": 2.6458970682161964e-07, + "loss": 0.4569, + "step": 7825 + }, + { + "epoch": 0.9, + "grad_norm": 1.8663020254704306, + "learning_rate": 2.6399269759198266e-07, + "loss": 0.5017, + "step": 7826 + }, + { + "epoch": 0.9, + "grad_norm": 1.740935843545659, + "learning_rate": 2.6339634439325634e-07, + "loss": 0.4507, + "step": 7827 + }, + { + "epoch": 0.9, + "grad_norm": 1.7323149699664946, + "learning_rate": 2.6280064730804853e-07, + "loss": 0.5093, + "step": 7828 + }, + { + "epoch": 0.9, + "grad_norm": 1.9615762734735587, + "learning_rate": 2.6220560641887385e-07, + "loss": 0.432, + "step": 7829 + }, + { + "epoch": 0.9, + "grad_norm": 2.099720551705267, + "learning_rate": 2.61611221808159e-07, + "loss": 0.4206, + "step": 7830 + }, + { + "epoch": 0.9, + "grad_norm": 1.9990147881943885, + "learning_rate": 2.610174935582366e-07, + "loss": 0.5038, + "step": 7831 + }, + { + "epoch": 0.9, + "grad_norm": 2.32052907609017, + "learning_rate": 2.604244217513496e-07, + "loss": 0.4088, + "step": 7832 + }, + { + "epoch": 0.9, + "grad_norm": 1.9795931261880877, + "learning_rate": 2.598320064696519e-07, + "loss": 0.579, + "step": 7833 + }, + { + "epoch": 0.9, + "grad_norm": 3.641042032224614, + "learning_rate": 2.592402477952033e-07, + "loss": 0.4506, + "step": 7834 + }, + { + "epoch": 0.9, + "grad_norm": 2.0658239217940304, + "learning_rate": 2.5864914580997327e-07, + "loss": 0.4741, + "step": 7835 + }, + { + "epoch": 0.9, + "grad_norm": 1.7732258501777476, + "learning_rate": 2.580587005958435e-07, + "loss": 0.5072, + "step": 7836 + }, + { + "epoch": 0.9, + "grad_norm": 2.5403787781438294, + "learning_rate": 2.5746891223460135e-07, + "loss": 0.4934, + "step": 7837 + }, + { + "epoch": 0.9, + "grad_norm": 1.8413429257923315, + "learning_rate": 2.568797808079432e-07, + "loss": 0.4593, + "step": 7838 + }, + { + "epoch": 0.9, + "grad_norm": 2.2145572781900977, + "learning_rate": 2.56291306397477e-07, + "loss": 0.5592, + "step": 7839 + }, + { + "epoch": 0.9, + "grad_norm": 2.527526104364451, + "learning_rate": 2.5570348908471653e-07, + "loss": 0.4337, + "step": 7840 + }, + { + "epoch": 0.9, + "grad_norm": 2.208597016000098, + "learning_rate": 2.551163289510877e-07, + "loss": 0.4758, + "step": 7841 + }, + { + "epoch": 0.9, + "grad_norm": 2.305936529770572, + "learning_rate": 2.5452982607792274e-07, + "loss": 0.4881, + "step": 7842 + }, + { + "epoch": 0.9, + "grad_norm": 1.7894452178924711, + "learning_rate": 2.5394398054646494e-07, + "loss": 0.44, + "step": 7843 + }, + { + "epoch": 0.9, + "grad_norm": 1.959332415233702, + "learning_rate": 2.533587924378644e-07, + "loss": 0.513, + "step": 7844 + }, + { + "epoch": 0.9, + "grad_norm": 2.1575747688850315, + "learning_rate": 2.527742618331819e-07, + "loss": 0.563, + "step": 7845 + }, + { + "epoch": 0.9, + "grad_norm": 1.785407727450857, + "learning_rate": 2.5219038881338643e-07, + "loss": 0.4688, + "step": 7846 + }, + { + "epoch": 0.9, + "grad_norm": 2.16238964544465, + "learning_rate": 2.5160717345935616e-07, + "loss": 0.4166, + "step": 7847 + }, + { + "epoch": 0.9, + "grad_norm": 2.056138899657702, + "learning_rate": 2.5102461585187696e-07, + "loss": 0.4324, + "step": 7848 + }, + { + "epoch": 0.9, + "grad_norm": 2.4313422238719458, + "learning_rate": 2.504427160716466e-07, + "loss": 0.6332, + "step": 7849 + }, + { + "epoch": 0.9, + "grad_norm": 2.180746989358462, + "learning_rate": 2.498614741992683e-07, + "loss": 0.4537, + "step": 7850 + }, + { + "epoch": 0.9, + "grad_norm": 1.6698701662664577, + "learning_rate": 2.4928089031525605e-07, + "loss": 0.3459, + "step": 7851 + }, + { + "epoch": 0.9, + "grad_norm": 1.9421465948987424, + "learning_rate": 2.48700964500031e-07, + "loss": 0.4065, + "step": 7852 + }, + { + "epoch": 0.9, + "grad_norm": 1.8940166371692349, + "learning_rate": 2.4812169683392616e-07, + "loss": 0.4605, + "step": 7853 + }, + { + "epoch": 0.9, + "grad_norm": 2.2815199474804584, + "learning_rate": 2.4754308739718013e-07, + "loss": 0.5149, + "step": 7854 + }, + { + "epoch": 0.9, + "grad_norm": 2.227105372137781, + "learning_rate": 2.4696513626994324e-07, + "loss": 0.5106, + "step": 7855 + }, + { + "epoch": 0.9, + "grad_norm": 2.044729410403832, + "learning_rate": 2.463878435322725e-07, + "loss": 0.4892, + "step": 7856 + }, + { + "epoch": 0.9, + "grad_norm": 2.2222546792805877, + "learning_rate": 2.458112092641335e-07, + "loss": 0.4331, + "step": 7857 + }, + { + "epoch": 0.9, + "grad_norm": 1.669096366875629, + "learning_rate": 2.4523523354540336e-07, + "loss": 0.5399, + "step": 7858 + }, + { + "epoch": 0.9, + "grad_norm": 1.9622700239994508, + "learning_rate": 2.4465991645586385e-07, + "loss": 0.4898, + "step": 7859 + }, + { + "epoch": 0.9, + "grad_norm": 1.5205071184664154, + "learning_rate": 2.440852580752101e-07, + "loss": 0.4699, + "step": 7860 + }, + { + "epoch": 0.9, + "grad_norm": 1.8207304980007373, + "learning_rate": 2.435112584830418e-07, + "loss": 0.5299, + "step": 7861 + }, + { + "epoch": 0.9, + "grad_norm": 1.7393924580442095, + "learning_rate": 2.429379177588709e-07, + "loss": 0.3485, + "step": 7862 + }, + { + "epoch": 0.9, + "grad_norm": 1.8259391689984361, + "learning_rate": 2.423652359821155e-07, + "loss": 0.4758, + "step": 7863 + }, + { + "epoch": 0.9, + "grad_norm": 1.848383333352061, + "learning_rate": 2.417932132321038e-07, + "loss": 0.4283, + "step": 7864 + }, + { + "epoch": 0.9, + "grad_norm": 0.8182018735023306, + "learning_rate": 2.412218495880714e-07, + "loss": 0.6072, + "step": 7865 + }, + { + "epoch": 0.9, + "grad_norm": 1.9272712883611105, + "learning_rate": 2.406511451291643e-07, + "loss": 0.5115, + "step": 7866 + }, + { + "epoch": 0.9, + "grad_norm": 2.6451377115610355, + "learning_rate": 2.400810999344361e-07, + "loss": 0.4589, + "step": 7867 + }, + { + "epoch": 0.9, + "grad_norm": 2.082948159586117, + "learning_rate": 2.3951171408285123e-07, + "loss": 0.5206, + "step": 7868 + }, + { + "epoch": 0.9, + "grad_norm": 2.4741513171645533, + "learning_rate": 2.3894298765327726e-07, + "loss": 0.6061, + "step": 7869 + }, + { + "epoch": 0.9, + "grad_norm": 1.813540857185806, + "learning_rate": 2.3837492072449676e-07, + "loss": 0.4257, + "step": 7870 + }, + { + "epoch": 0.9, + "grad_norm": 2.811492498915913, + "learning_rate": 2.378075133751967e-07, + "loss": 0.5986, + "step": 7871 + }, + { + "epoch": 0.9, + "grad_norm": 2.052362304216388, + "learning_rate": 2.3724076568397592e-07, + "loss": 0.5656, + "step": 7872 + }, + { + "epoch": 0.9, + "grad_norm": 0.8146237474627175, + "learning_rate": 2.3667467772933884e-07, + "loss": 0.6639, + "step": 7873 + }, + { + "epoch": 0.9, + "grad_norm": 2.1638717006587993, + "learning_rate": 2.3610924958970105e-07, + "loss": 0.5056, + "step": 7874 + }, + { + "epoch": 0.9, + "grad_norm": 2.6855076845887367, + "learning_rate": 2.3554448134338436e-07, + "loss": 0.4699, + "step": 7875 + }, + { + "epoch": 0.9, + "grad_norm": 1.986137450591559, + "learning_rate": 2.3498037306862066e-07, + "loss": 0.5402, + "step": 7876 + }, + { + "epoch": 0.91, + "grad_norm": 1.6888609446441045, + "learning_rate": 2.3441692484355073e-07, + "loss": 0.4808, + "step": 7877 + }, + { + "epoch": 0.91, + "grad_norm": 2.16569576641359, + "learning_rate": 2.338541367462227e-07, + "loss": 0.5756, + "step": 7878 + }, + { + "epoch": 0.91, + "grad_norm": 3.359891302067057, + "learning_rate": 2.3329200885459425e-07, + "loss": 0.4404, + "step": 7879 + }, + { + "epoch": 0.91, + "grad_norm": 2.1358706472129474, + "learning_rate": 2.3273054124653082e-07, + "loss": 0.3898, + "step": 7880 + }, + { + "epoch": 0.91, + "grad_norm": 2.2483282206129926, + "learning_rate": 2.3216973399980802e-07, + "loss": 0.4378, + "step": 7881 + }, + { + "epoch": 0.91, + "grad_norm": 2.5440668978290772, + "learning_rate": 2.3160958719210647e-07, + "loss": 0.3826, + "step": 7882 + }, + { + "epoch": 0.91, + "grad_norm": 2.288074018557181, + "learning_rate": 2.310501009010202e-07, + "loss": 0.6138, + "step": 7883 + }, + { + "epoch": 0.91, + "grad_norm": 2.3860820512455287, + "learning_rate": 2.3049127520404723e-07, + "loss": 0.4967, + "step": 7884 + }, + { + "epoch": 0.91, + "grad_norm": 3.4394393527133116, + "learning_rate": 2.2993311017859788e-07, + "loss": 0.4829, + "step": 7885 + }, + { + "epoch": 0.91, + "grad_norm": 2.4818134290848963, + "learning_rate": 2.2937560590198638e-07, + "loss": 0.463, + "step": 7886 + }, + { + "epoch": 0.91, + "grad_norm": 1.965274911720076, + "learning_rate": 2.2881876245144097e-07, + "loss": 0.5334, + "step": 7887 + }, + { + "epoch": 0.91, + "grad_norm": 2.456860246091764, + "learning_rate": 2.2826257990409273e-07, + "loss": 0.6101, + "step": 7888 + }, + { + "epoch": 0.91, + "grad_norm": 2.784972688206685, + "learning_rate": 2.277070583369867e-07, + "loss": 0.5243, + "step": 7889 + }, + { + "epoch": 0.91, + "grad_norm": 1.8551967813119554, + "learning_rate": 2.2715219782707131e-07, + "loss": 0.4489, + "step": 7890 + }, + { + "epoch": 0.91, + "grad_norm": 0.835141588016095, + "learning_rate": 2.2659799845120788e-07, + "loss": 0.6437, + "step": 7891 + }, + { + "epoch": 0.91, + "grad_norm": 2.014836539433783, + "learning_rate": 2.2604446028616223e-07, + "loss": 0.4565, + "step": 7892 + }, + { + "epoch": 0.91, + "grad_norm": 2.0700200633516395, + "learning_rate": 2.2549158340861133e-07, + "loss": 0.4155, + "step": 7893 + }, + { + "epoch": 0.91, + "grad_norm": 2.1585587435393943, + "learning_rate": 2.2493936789513892e-07, + "loss": 0.4611, + "step": 7894 + }, + { + "epoch": 0.91, + "grad_norm": 2.039401459473966, + "learning_rate": 2.2438781382223883e-07, + "loss": 0.5237, + "step": 7895 + }, + { + "epoch": 0.91, + "grad_norm": 2.5451554989707046, + "learning_rate": 2.2383692126631163e-07, + "loss": 0.5172, + "step": 7896 + }, + { + "epoch": 0.91, + "grad_norm": 1.8845787032282753, + "learning_rate": 2.2328669030366623e-07, + "loss": 0.5119, + "step": 7897 + }, + { + "epoch": 0.91, + "grad_norm": 2.308412960245356, + "learning_rate": 2.227371210105217e-07, + "loss": 0.4585, + "step": 7898 + }, + { + "epoch": 0.91, + "grad_norm": 1.6656681667150586, + "learning_rate": 2.2218821346300267e-07, + "loss": 0.4303, + "step": 7899 + }, + { + "epoch": 0.91, + "grad_norm": 2.6855180751074736, + "learning_rate": 2.2163996773714612e-07, + "loss": 0.5139, + "step": 7900 + }, + { + "epoch": 0.91, + "grad_norm": 2.9463451324311563, + "learning_rate": 2.2109238390889242e-07, + "loss": 0.5371, + "step": 7901 + }, + { + "epoch": 0.91, + "grad_norm": 2.1599190921216116, + "learning_rate": 2.205454620540959e-07, + "loss": 0.5614, + "step": 7902 + }, + { + "epoch": 0.91, + "grad_norm": 2.034484805269269, + "learning_rate": 2.19999202248512e-07, + "loss": 0.511, + "step": 7903 + }, + { + "epoch": 0.91, + "grad_norm": 1.7256945735437206, + "learning_rate": 2.1945360456781194e-07, + "loss": 0.4428, + "step": 7904 + }, + { + "epoch": 0.91, + "grad_norm": 2.1424955419217793, + "learning_rate": 2.189086690875697e-07, + "loss": 0.4546, + "step": 7905 + }, + { + "epoch": 0.91, + "grad_norm": 1.808120872490734, + "learning_rate": 2.1836439588327152e-07, + "loss": 0.5006, + "step": 7906 + }, + { + "epoch": 0.91, + "grad_norm": 1.99715505987209, + "learning_rate": 2.1782078503030768e-07, + "loss": 0.5577, + "step": 7907 + }, + { + "epoch": 0.91, + "grad_norm": 5.196377525071023, + "learning_rate": 2.1727783660398183e-07, + "loss": 0.4787, + "step": 7908 + }, + { + "epoch": 0.91, + "grad_norm": 2.2807307949400792, + "learning_rate": 2.1673555067950047e-07, + "loss": 0.444, + "step": 7909 + }, + { + "epoch": 0.91, + "grad_norm": 1.7897977102757918, + "learning_rate": 2.1619392733198298e-07, + "loss": 0.3901, + "step": 7910 + }, + { + "epoch": 0.91, + "grad_norm": 1.9262104435629732, + "learning_rate": 2.1565296663645319e-07, + "loss": 0.6008, + "step": 7911 + }, + { + "epoch": 0.91, + "grad_norm": 1.7646105100648166, + "learning_rate": 2.1511266866784674e-07, + "loss": 0.5516, + "step": 7912 + }, + { + "epoch": 0.91, + "grad_norm": 2.0243533104616436, + "learning_rate": 2.1457303350100377e-07, + "loss": 0.4357, + "step": 7913 + }, + { + "epoch": 0.91, + "grad_norm": 2.4053809341574377, + "learning_rate": 2.1403406121067616e-07, + "loss": 0.4965, + "step": 7914 + }, + { + "epoch": 0.91, + "grad_norm": 2.4104894606764375, + "learning_rate": 2.1349575187152138e-07, + "loss": 0.4795, + "step": 7915 + }, + { + "epoch": 0.91, + "grad_norm": 2.290855080983749, + "learning_rate": 2.1295810555810535e-07, + "loss": 0.5292, + "step": 7916 + }, + { + "epoch": 0.91, + "grad_norm": 2.1069852773780116, + "learning_rate": 2.1242112234490407e-07, + "loss": 0.5557, + "step": 7917 + }, + { + "epoch": 0.91, + "grad_norm": 2.0555822051899986, + "learning_rate": 2.118848023062997e-07, + "loss": 0.4313, + "step": 7918 + }, + { + "epoch": 0.91, + "grad_norm": 2.08521513651175, + "learning_rate": 2.113491455165828e-07, + "loss": 0.4186, + "step": 7919 + }, + { + "epoch": 0.91, + "grad_norm": 1.7251146126476526, + "learning_rate": 2.1081415204995291e-07, + "loss": 0.4792, + "step": 7920 + }, + { + "epoch": 0.91, + "grad_norm": 1.7243842526200532, + "learning_rate": 2.1027982198051744e-07, + "loss": 0.4471, + "step": 7921 + }, + { + "epoch": 0.91, + "grad_norm": 1.9857146366430551, + "learning_rate": 2.0974615538229105e-07, + "loss": 0.381, + "step": 7922 + }, + { + "epoch": 0.91, + "grad_norm": 4.0762951873324385, + "learning_rate": 2.0921315232919793e-07, + "loss": 0.5641, + "step": 7923 + }, + { + "epoch": 0.91, + "grad_norm": 2.0703247753060747, + "learning_rate": 2.0868081289506847e-07, + "loss": 0.4934, + "step": 7924 + }, + { + "epoch": 0.91, + "grad_norm": 1.9450966598596273, + "learning_rate": 2.0814913715364372e-07, + "loss": 0.4501, + "step": 7925 + }, + { + "epoch": 0.91, + "grad_norm": 1.7332638109404221, + "learning_rate": 2.0761812517856972e-07, + "loss": 0.5285, + "step": 7926 + }, + { + "epoch": 0.91, + "grad_norm": 2.25691588406782, + "learning_rate": 2.0708777704340376e-07, + "loss": 0.4712, + "step": 7927 + }, + { + "epoch": 0.91, + "grad_norm": 2.5183246050024986, + "learning_rate": 2.0655809282160767e-07, + "loss": 0.5181, + "step": 7928 + }, + { + "epoch": 0.91, + "grad_norm": 0.889796052070294, + "learning_rate": 2.0602907258655546e-07, + "loss": 0.7208, + "step": 7929 + }, + { + "epoch": 0.91, + "grad_norm": 2.3634521607194836, + "learning_rate": 2.0550071641152525e-07, + "loss": 0.5143, + "step": 7930 + }, + { + "epoch": 0.91, + "grad_norm": 2.0103287007531963, + "learning_rate": 2.049730243697057e-07, + "loss": 0.4362, + "step": 7931 + }, + { + "epoch": 0.91, + "grad_norm": 3.063801226644467, + "learning_rate": 2.0444599653419161e-07, + "loss": 0.4527, + "step": 7932 + }, + { + "epoch": 0.91, + "grad_norm": 2.1762019662621235, + "learning_rate": 2.03919632977988e-07, + "loss": 0.4099, + "step": 7933 + }, + { + "epoch": 0.91, + "grad_norm": 1.9649189007307415, + "learning_rate": 2.033939337740065e-07, + "loss": 0.4776, + "step": 7934 + }, + { + "epoch": 0.91, + "grad_norm": 2.231257507852725, + "learning_rate": 2.0286889899506613e-07, + "loss": 0.4694, + "step": 7935 + }, + { + "epoch": 0.91, + "grad_norm": 2.022821007824068, + "learning_rate": 2.0234452871389476e-07, + "loss": 0.4836, + "step": 7936 + }, + { + "epoch": 0.91, + "grad_norm": 1.8361961659004296, + "learning_rate": 2.0182082300312877e-07, + "loss": 0.53, + "step": 7937 + }, + { + "epoch": 0.91, + "grad_norm": 1.938551377997809, + "learning_rate": 2.0129778193531015e-07, + "loss": 0.3781, + "step": 7938 + }, + { + "epoch": 0.91, + "grad_norm": 1.743316207098689, + "learning_rate": 2.007754055828931e-07, + "loss": 0.497, + "step": 7939 + }, + { + "epoch": 0.91, + "grad_norm": 1.9919900875456569, + "learning_rate": 2.002536940182348e-07, + "loss": 0.4792, + "step": 7940 + }, + { + "epoch": 0.91, + "grad_norm": 1.9104387245533985, + "learning_rate": 1.9973264731360354e-07, + "loss": 0.5489, + "step": 7941 + }, + { + "epoch": 0.91, + "grad_norm": 1.9562856943574627, + "learning_rate": 1.992122655411749e-07, + "loss": 0.4595, + "step": 7942 + }, + { + "epoch": 0.91, + "grad_norm": 2.041512491814604, + "learning_rate": 1.986925487730307e-07, + "loss": 0.5324, + "step": 7943 + }, + { + "epoch": 0.91, + "grad_norm": 2.3490432269975887, + "learning_rate": 1.981734970811644e-07, + "loss": 0.5197, + "step": 7944 + }, + { + "epoch": 0.91, + "grad_norm": 2.334460729098768, + "learning_rate": 1.9765511053747243e-07, + "loss": 0.4413, + "step": 7945 + }, + { + "epoch": 0.91, + "grad_norm": 2.194681099494441, + "learning_rate": 1.9713738921376346e-07, + "loss": 0.5379, + "step": 7946 + }, + { + "epoch": 0.91, + "grad_norm": 3.608421774278065, + "learning_rate": 1.9662033318175068e-07, + "loss": 0.4535, + "step": 7947 + }, + { + "epoch": 0.91, + "grad_norm": 2.307409505118636, + "learning_rate": 1.961039425130584e-07, + "loss": 0.5116, + "step": 7948 + }, + { + "epoch": 0.91, + "grad_norm": 2.7055474374068202, + "learning_rate": 1.9558821727921508e-07, + "loss": 0.5067, + "step": 7949 + }, + { + "epoch": 0.91, + "grad_norm": 1.9317621673654353, + "learning_rate": 1.9507315755166068e-07, + "loss": 0.4693, + "step": 7950 + }, + { + "epoch": 0.91, + "grad_norm": 2.024154011725036, + "learning_rate": 1.9455876340173929e-07, + "loss": 0.4662, + "step": 7951 + }, + { + "epoch": 0.91, + "grad_norm": 2.2753951416791844, + "learning_rate": 1.940450349007067e-07, + "loss": 0.4417, + "step": 7952 + }, + { + "epoch": 0.91, + "grad_norm": 2.2203302839309846, + "learning_rate": 1.9353197211972262e-07, + "loss": 0.5659, + "step": 7953 + }, + { + "epoch": 0.91, + "grad_norm": 2.2563893580384287, + "learning_rate": 1.9301957512985802e-07, + "loss": 0.5505, + "step": 7954 + }, + { + "epoch": 0.91, + "grad_norm": 1.8760583035835703, + "learning_rate": 1.9250784400208832e-07, + "loss": 0.4464, + "step": 7955 + }, + { + "epoch": 0.91, + "grad_norm": 0.9990966454915617, + "learning_rate": 1.9199677880730018e-07, + "loss": 0.6996, + "step": 7956 + }, + { + "epoch": 0.91, + "grad_norm": 1.9237599776195486, + "learning_rate": 1.9148637961628468e-07, + "loss": 0.6038, + "step": 7957 + }, + { + "epoch": 0.91, + "grad_norm": 1.6371339210292004, + "learning_rate": 1.9097664649974368e-07, + "loss": 0.4466, + "step": 7958 + }, + { + "epoch": 0.91, + "grad_norm": 1.7662190798850002, + "learning_rate": 1.9046757952828398e-07, + "loss": 0.4156, + "step": 7959 + }, + { + "epoch": 0.91, + "grad_norm": 2.3338822562752646, + "learning_rate": 1.8995917877242308e-07, + "loss": 0.4625, + "step": 7960 + }, + { + "epoch": 0.91, + "grad_norm": 2.2647587853794446, + "learning_rate": 1.8945144430258356e-07, + "loss": 0.4832, + "step": 7961 + }, + { + "epoch": 0.91, + "grad_norm": 3.5206310993709504, + "learning_rate": 1.8894437618909578e-07, + "loss": 0.4999, + "step": 7962 + }, + { + "epoch": 0.91, + "grad_norm": 2.9137003315175436, + "learning_rate": 1.8843797450220024e-07, + "loss": 0.4108, + "step": 7963 + }, + { + "epoch": 0.92, + "grad_norm": 1.9533683001233906, + "learning_rate": 1.8793223931204308e-07, + "loss": 0.4483, + "step": 7964 + }, + { + "epoch": 0.92, + "grad_norm": 3.188862087683525, + "learning_rate": 1.8742717068867877e-07, + "loss": 0.3995, + "step": 7965 + }, + { + "epoch": 0.92, + "grad_norm": 1.9665401382357417, + "learning_rate": 1.869227687020686e-07, + "loss": 0.4365, + "step": 7966 + }, + { + "epoch": 0.92, + "grad_norm": 2.3143608259203163, + "learning_rate": 1.8641903342208389e-07, + "loss": 0.5685, + "step": 7967 + }, + { + "epoch": 0.92, + "grad_norm": 1.9153070641244236, + "learning_rate": 1.8591596491849996e-07, + "loss": 0.4483, + "step": 7968 + }, + { + "epoch": 0.92, + "grad_norm": 2.005173668831575, + "learning_rate": 1.8541356326100436e-07, + "loss": 0.5629, + "step": 7969 + }, + { + "epoch": 0.92, + "grad_norm": 2.292940574260069, + "learning_rate": 1.8491182851918643e-07, + "loss": 0.4789, + "step": 7970 + }, + { + "epoch": 0.92, + "grad_norm": 2.078937021704676, + "learning_rate": 1.8441076076254837e-07, + "loss": 0.574, + "step": 7971 + }, + { + "epoch": 0.92, + "grad_norm": 2.3798440783072388, + "learning_rate": 1.8391036006049744e-07, + "loss": 0.5465, + "step": 7972 + }, + { + "epoch": 0.92, + "grad_norm": 2.833638060508819, + "learning_rate": 1.8341062648234987e-07, + "loss": 0.5143, + "step": 7973 + }, + { + "epoch": 0.92, + "grad_norm": 2.191557370823667, + "learning_rate": 1.8291156009732746e-07, + "loss": 0.5249, + "step": 7974 + }, + { + "epoch": 0.92, + "grad_norm": 1.8036034276767148, + "learning_rate": 1.8241316097456218e-07, + "loss": 0.5334, + "step": 7975 + }, + { + "epoch": 0.92, + "grad_norm": 4.155544703329232, + "learning_rate": 1.8191542918309045e-07, + "loss": 0.3753, + "step": 7976 + }, + { + "epoch": 0.92, + "grad_norm": 3.004205508804817, + "learning_rate": 1.8141836479185993e-07, + "loss": 0.4891, + "step": 7977 + }, + { + "epoch": 0.92, + "grad_norm": 2.1882546701008163, + "learning_rate": 1.8092196786972215e-07, + "loss": 0.4722, + "step": 7978 + }, + { + "epoch": 0.92, + "grad_norm": 1.996178721859881, + "learning_rate": 1.804262384854394e-07, + "loss": 0.4469, + "step": 7979 + }, + { + "epoch": 0.92, + "grad_norm": 1.7781120572922013, + "learning_rate": 1.799311767076789e-07, + "loss": 0.5628, + "step": 7980 + }, + { + "epoch": 0.92, + "grad_norm": 2.7522065403533245, + "learning_rate": 1.7943678260501641e-07, + "loss": 0.441, + "step": 7981 + }, + { + "epoch": 0.92, + "grad_norm": 2.0651646883159995, + "learning_rate": 1.7894305624593655e-07, + "loss": 0.5002, + "step": 7982 + }, + { + "epoch": 0.92, + "grad_norm": 2.568981008222588, + "learning_rate": 1.784499976988291e-07, + "loss": 0.5103, + "step": 7983 + }, + { + "epoch": 0.92, + "grad_norm": 1.7278617113090513, + "learning_rate": 1.7795760703199327e-07, + "loss": 0.5113, + "step": 7984 + }, + { + "epoch": 0.92, + "grad_norm": 2.472571841758644, + "learning_rate": 1.7746588431363397e-07, + "loss": 0.4739, + "step": 7985 + }, + { + "epoch": 0.92, + "grad_norm": 2.5581462119885905, + "learning_rate": 1.7697482961186674e-07, + "loss": 0.5417, + "step": 7986 + }, + { + "epoch": 0.92, + "grad_norm": 1.8780913172927785, + "learning_rate": 1.764844429947088e-07, + "loss": 0.531, + "step": 7987 + }, + { + "epoch": 0.92, + "grad_norm": 1.7570848361516738, + "learning_rate": 1.7599472453009136e-07, + "loss": 0.4832, + "step": 7988 + }, + { + "epoch": 0.92, + "grad_norm": 1.7991695453187564, + "learning_rate": 1.7550567428584852e-07, + "loss": 0.4902, + "step": 7989 + }, + { + "epoch": 0.92, + "grad_norm": 1.9105296270141725, + "learning_rate": 1.7501729232972442e-07, + "loss": 0.4739, + "step": 7990 + }, + { + "epoch": 0.92, + "grad_norm": 1.824306785711125, + "learning_rate": 1.7452957872936881e-07, + "loss": 0.3892, + "step": 7991 + }, + { + "epoch": 0.92, + "grad_norm": 2.166072161654502, + "learning_rate": 1.7404253355234102e-07, + "loss": 0.5336, + "step": 7992 + }, + { + "epoch": 0.92, + "grad_norm": 2.247264251826484, + "learning_rate": 1.7355615686610427e-07, + "loss": 0.4501, + "step": 7993 + }, + { + "epoch": 0.92, + "grad_norm": 2.267710077394332, + "learning_rate": 1.7307044873803414e-07, + "loss": 0.4626, + "step": 7994 + }, + { + "epoch": 0.92, + "grad_norm": 2.076295760548668, + "learning_rate": 1.7258540923540846e-07, + "loss": 0.5351, + "step": 7995 + }, + { + "epoch": 0.92, + "grad_norm": 1.9948699624914832, + "learning_rate": 1.7210103842541626e-07, + "loss": 0.4632, + "step": 7996 + }, + { + "epoch": 0.92, + "grad_norm": 1.9105770342424637, + "learning_rate": 1.7161733637515166e-07, + "loss": 0.5645, + "step": 7997 + }, + { + "epoch": 0.92, + "grad_norm": 2.287209472860106, + "learning_rate": 1.7113430315161772e-07, + "loss": 0.5097, + "step": 7998 + }, + { + "epoch": 0.92, + "grad_norm": 1.957730057335225, + "learning_rate": 1.706519388217237e-07, + "loss": 0.4966, + "step": 7999 + }, + { + "epoch": 0.92, + "grad_norm": 1.896980782921911, + "learning_rate": 1.701702434522856e-07, + "loss": 0.5705, + "step": 8000 + }, + { + "epoch": 0.92, + "grad_norm": 2.3015932772251526, + "learning_rate": 1.6968921711003005e-07, + "loss": 0.4411, + "step": 8001 + }, + { + "epoch": 0.92, + "grad_norm": 2.687678852196777, + "learning_rate": 1.6920885986158707e-07, + "loss": 0.4408, + "step": 8002 + }, + { + "epoch": 0.92, + "grad_norm": 4.78801883923739, + "learning_rate": 1.6872917177349623e-07, + "loss": 0.5845, + "step": 8003 + }, + { + "epoch": 0.92, + "grad_norm": 2.9501255451639143, + "learning_rate": 1.682501529122038e-07, + "loss": 0.45, + "step": 8004 + }, + { + "epoch": 0.92, + "grad_norm": 2.1386630897533303, + "learning_rate": 1.677718033440634e-07, + "loss": 0.4636, + "step": 8005 + }, + { + "epoch": 0.92, + "grad_norm": 3.001747229819958, + "learning_rate": 1.6729412313533534e-07, + "loss": 0.4446, + "step": 8006 + }, + { + "epoch": 0.92, + "grad_norm": 2.028711579311054, + "learning_rate": 1.668171123521889e-07, + "loss": 0.5, + "step": 8007 + }, + { + "epoch": 0.92, + "grad_norm": 1.9993125420999664, + "learning_rate": 1.6634077106069791e-07, + "loss": 0.4531, + "step": 8008 + }, + { + "epoch": 0.92, + "grad_norm": 2.061783550149706, + "learning_rate": 1.6586509932684735e-07, + "loss": 0.466, + "step": 8009 + }, + { + "epoch": 0.92, + "grad_norm": 1.792411794727479, + "learning_rate": 1.6539009721652455e-07, + "loss": 0.4588, + "step": 8010 + }, + { + "epoch": 0.92, + "grad_norm": 2.105670370370146, + "learning_rate": 1.6491576479552962e-07, + "loss": 0.5492, + "step": 8011 + }, + { + "epoch": 0.92, + "grad_norm": 2.179724434910707, + "learning_rate": 1.6444210212956392e-07, + "loss": 0.3773, + "step": 8012 + }, + { + "epoch": 0.92, + "grad_norm": 1.6165921712600508, + "learning_rate": 1.6396910928424216e-07, + "loss": 0.401, + "step": 8013 + }, + { + "epoch": 0.92, + "grad_norm": 2.5996996434133965, + "learning_rate": 1.634967863250808e-07, + "loss": 0.4079, + "step": 8014 + }, + { + "epoch": 0.92, + "grad_norm": 2.3590043042466773, + "learning_rate": 1.6302513331750702e-07, + "loss": 0.3117, + "step": 8015 + }, + { + "epoch": 0.92, + "grad_norm": 1.958467029029524, + "learning_rate": 1.6255415032685406e-07, + "loss": 0.4985, + "step": 8016 + }, + { + "epoch": 0.92, + "grad_norm": 1.9379479335531633, + "learning_rate": 1.6208383741836254e-07, + "loss": 0.5138, + "step": 8017 + }, + { + "epoch": 0.92, + "grad_norm": 1.9022610537063376, + "learning_rate": 1.6161419465717975e-07, + "loss": 0.4968, + "step": 8018 + }, + { + "epoch": 0.92, + "grad_norm": 2.5953016404846974, + "learning_rate": 1.6114522210836091e-07, + "loss": 0.5201, + "step": 8019 + }, + { + "epoch": 0.92, + "grad_norm": 2.965895843505926, + "learning_rate": 1.6067691983686794e-07, + "loss": 0.4837, + "step": 8020 + }, + { + "epoch": 0.92, + "grad_norm": 2.0828353285766883, + "learning_rate": 1.602092879075695e-07, + "loss": 0.43, + "step": 8021 + }, + { + "epoch": 0.92, + "grad_norm": 1.901578612517592, + "learning_rate": 1.5974232638524212e-07, + "loss": 0.4287, + "step": 8022 + }, + { + "epoch": 0.92, + "grad_norm": 2.5527459468832463, + "learning_rate": 1.592760353345696e-07, + "loss": 0.43, + "step": 8023 + }, + { + "epoch": 0.92, + "grad_norm": 1.8200867181013363, + "learning_rate": 1.5881041482014192e-07, + "loss": 0.4889, + "step": 8024 + }, + { + "epoch": 0.92, + "grad_norm": 2.0920180572405096, + "learning_rate": 1.5834546490645696e-07, + "loss": 0.4931, + "step": 8025 + }, + { + "epoch": 0.92, + "grad_norm": 2.255213561270281, + "learning_rate": 1.5788118565792042e-07, + "loss": 0.4483, + "step": 8026 + }, + { + "epoch": 0.92, + "grad_norm": 3.0911784018832433, + "learning_rate": 1.5741757713884253e-07, + "loss": 0.4902, + "step": 8027 + }, + { + "epoch": 0.92, + "grad_norm": 1.7819304773351499, + "learning_rate": 1.569546394134436e-07, + "loss": 0.4614, + "step": 8028 + }, + { + "epoch": 0.92, + "grad_norm": 2.0487995875711533, + "learning_rate": 1.5649237254584838e-07, + "loss": 0.5201, + "step": 8029 + }, + { + "epoch": 0.92, + "grad_norm": 2.2793007325278176, + "learning_rate": 1.560307766000918e-07, + "loss": 0.5048, + "step": 8030 + }, + { + "epoch": 0.92, + "grad_norm": 2.027387421472596, + "learning_rate": 1.5556985164011217e-07, + "loss": 0.5009, + "step": 8031 + }, + { + "epoch": 0.92, + "grad_norm": 1.9782535662943772, + "learning_rate": 1.5510959772975841e-07, + "loss": 0.4431, + "step": 8032 + }, + { + "epoch": 0.92, + "grad_norm": 0.7835028566349568, + "learning_rate": 1.546500149327834e-07, + "loss": 0.645, + "step": 8033 + }, + { + "epoch": 0.92, + "grad_norm": 1.9547264074324655, + "learning_rate": 1.5419110331284904e-07, + "loss": 0.5546, + "step": 8034 + }, + { + "epoch": 0.92, + "grad_norm": 2.5281659046810456, + "learning_rate": 1.5373286293352385e-07, + "loss": 0.4084, + "step": 8035 + }, + { + "epoch": 0.92, + "grad_norm": 1.9748924234422105, + "learning_rate": 1.5327529385828377e-07, + "loss": 0.4805, + "step": 8036 + }, + { + "epoch": 0.92, + "grad_norm": 1.533086763430095, + "learning_rate": 1.5281839615050975e-07, + "loss": 0.3945, + "step": 8037 + }, + { + "epoch": 0.92, + "grad_norm": 2.1843738059619233, + "learning_rate": 1.5236216987349283e-07, + "loss": 0.5754, + "step": 8038 + }, + { + "epoch": 0.92, + "grad_norm": 1.929045406871986, + "learning_rate": 1.5190661509042748e-07, + "loss": 0.5093, + "step": 8039 + }, + { + "epoch": 0.92, + "grad_norm": 2.0385363491766872, + "learning_rate": 1.5145173186441875e-07, + "loss": 0.4116, + "step": 8040 + }, + { + "epoch": 0.92, + "grad_norm": 1.6040494668183753, + "learning_rate": 1.509975202584757e-07, + "loss": 0.4406, + "step": 8041 + }, + { + "epoch": 0.92, + "grad_norm": 2.6057678082783395, + "learning_rate": 1.5054398033551688e-07, + "loss": 0.4755, + "step": 8042 + }, + { + "epoch": 0.92, + "grad_norm": 1.911946231688503, + "learning_rate": 1.5009111215836536e-07, + "loss": 0.4923, + "step": 8043 + }, + { + "epoch": 0.92, + "grad_norm": 2.079354177191774, + "learning_rate": 1.496389157897532e-07, + "loss": 0.5125, + "step": 8044 + }, + { + "epoch": 0.92, + "grad_norm": 2.011808963488335, + "learning_rate": 1.4918739129231863e-07, + "loss": 0.5339, + "step": 8045 + }, + { + "epoch": 0.92, + "grad_norm": 3.2023340088146117, + "learning_rate": 1.4873653872860605e-07, + "loss": 0.4644, + "step": 8046 + }, + { + "epoch": 0.92, + "grad_norm": 1.988131415038431, + "learning_rate": 1.482863581610683e-07, + "loss": 0.4792, + "step": 8047 + }, + { + "epoch": 0.92, + "grad_norm": 2.708896182831012, + "learning_rate": 1.4783684965206323e-07, + "loss": 0.5109, + "step": 8048 + }, + { + "epoch": 0.92, + "grad_norm": 2.145691622547765, + "learning_rate": 1.4738801326385777e-07, + "loss": 0.4751, + "step": 8049 + }, + { + "epoch": 0.92, + "grad_norm": 6.436820363411749, + "learning_rate": 1.4693984905862378e-07, + "loss": 0.4558, + "step": 8050 + }, + { + "epoch": 0.93, + "grad_norm": 1.9115897861693334, + "learning_rate": 1.4649235709844168e-07, + "loss": 0.3913, + "step": 8051 + }, + { + "epoch": 0.93, + "grad_norm": 2.253423538052999, + "learning_rate": 1.4604553744529737e-07, + "loss": 0.5442, + "step": 8052 + }, + { + "epoch": 0.93, + "grad_norm": 4.3090020171053345, + "learning_rate": 1.4559939016108472e-07, + "loss": 0.3653, + "step": 8053 + }, + { + "epoch": 0.93, + "grad_norm": 2.7298989112674725, + "learning_rate": 1.4515391530760426e-07, + "loss": 0.5342, + "step": 8054 + }, + { + "epoch": 0.93, + "grad_norm": 2.057603114024709, + "learning_rate": 1.4470911294656222e-07, + "loss": 0.5192, + "step": 8055 + }, + { + "epoch": 0.93, + "grad_norm": 2.5248043766875514, + "learning_rate": 1.442649831395726e-07, + "loss": 0.5517, + "step": 8056 + }, + { + "epoch": 0.93, + "grad_norm": 2.284828305335089, + "learning_rate": 1.438215259481568e-07, + "loss": 0.4144, + "step": 8057 + }, + { + "epoch": 0.93, + "grad_norm": 0.8905914957021667, + "learning_rate": 1.433787414337412e-07, + "loss": 0.6678, + "step": 8058 + }, + { + "epoch": 0.93, + "grad_norm": 2.71499600710676, + "learning_rate": 1.429366296576623e-07, + "loss": 0.5011, + "step": 8059 + }, + { + "epoch": 0.93, + "grad_norm": 2.0756106205858575, + "learning_rate": 1.4249519068115947e-07, + "loss": 0.5492, + "step": 8060 + }, + { + "epoch": 0.93, + "grad_norm": 2.208101659783904, + "learning_rate": 1.420544245653821e-07, + "loss": 0.5102, + "step": 8061 + }, + { + "epoch": 0.93, + "grad_norm": 2.4389428200649936, + "learning_rate": 1.4161433137138358e-07, + "loss": 0.4644, + "step": 8062 + }, + { + "epoch": 0.93, + "grad_norm": 2.3045402830990773, + "learning_rate": 1.4117491116012683e-07, + "loss": 0.479, + "step": 8063 + }, + { + "epoch": 0.93, + "grad_norm": 1.9541189830213517, + "learning_rate": 1.4073616399248037e-07, + "loss": 0.42, + "step": 8064 + }, + { + "epoch": 0.93, + "grad_norm": 1.6130659098449072, + "learning_rate": 1.4029808992921778e-07, + "loss": 0.4834, + "step": 8065 + }, + { + "epoch": 0.93, + "grad_norm": 2.147447428008245, + "learning_rate": 1.398606890310228e-07, + "loss": 0.4786, + "step": 8066 + }, + { + "epoch": 0.93, + "grad_norm": 2.3567534803800236, + "learning_rate": 1.3942396135848301e-07, + "loss": 0.5011, + "step": 8067 + }, + { + "epoch": 0.93, + "grad_norm": 2.338880318588386, + "learning_rate": 1.3898790697209453e-07, + "loss": 0.5872, + "step": 8068 + }, + { + "epoch": 0.93, + "grad_norm": 2.581096956853325, + "learning_rate": 1.3855252593225842e-07, + "loss": 0.3963, + "step": 8069 + }, + { + "epoch": 0.93, + "grad_norm": 1.770299562492781, + "learning_rate": 1.3811781829928593e-07, + "loss": 0.5404, + "step": 8070 + }, + { + "epoch": 0.93, + "grad_norm": 1.9217966646673075, + "learning_rate": 1.3768378413339e-07, + "loss": 0.5116, + "step": 8071 + }, + { + "epoch": 0.93, + "grad_norm": 1.8189868338312154, + "learning_rate": 1.372504234946942e-07, + "loss": 0.4853, + "step": 8072 + }, + { + "epoch": 0.93, + "grad_norm": 1.9162263331461113, + "learning_rate": 1.3681773644322772e-07, + "loss": 0.5646, + "step": 8073 + }, + { + "epoch": 0.93, + "grad_norm": 2.50214157945727, + "learning_rate": 1.3638572303892594e-07, + "loss": 0.4916, + "step": 8074 + }, + { + "epoch": 0.93, + "grad_norm": 2.0582244282782653, + "learning_rate": 1.3595438334163103e-07, + "loss": 0.4577, + "step": 8075 + }, + { + "epoch": 0.93, + "grad_norm": 0.8823412127574939, + "learning_rate": 1.355237174110935e-07, + "loss": 0.6555, + "step": 8076 + }, + { + "epoch": 0.93, + "grad_norm": 1.7736473569464835, + "learning_rate": 1.3509372530696674e-07, + "loss": 0.5195, + "step": 8077 + }, + { + "epoch": 0.93, + "grad_norm": 2.328138835937736, + "learning_rate": 1.3466440708881534e-07, + "loss": 0.5302, + "step": 8078 + }, + { + "epoch": 0.93, + "grad_norm": 2.225383681527746, + "learning_rate": 1.342357628161073e-07, + "loss": 0.417, + "step": 8079 + }, + { + "epoch": 0.93, + "grad_norm": 2.1613858444869454, + "learning_rate": 1.3380779254821896e-07, + "loss": 0.4258, + "step": 8080 + }, + { + "epoch": 0.93, + "grad_norm": 2.587411829219413, + "learning_rate": 1.3338049634443183e-07, + "loss": 0.5498, + "step": 8081 + }, + { + "epoch": 0.93, + "grad_norm": 2.3632206368946655, + "learning_rate": 1.329538742639358e-07, + "loss": 0.4379, + "step": 8082 + }, + { + "epoch": 0.93, + "grad_norm": 2.490719239649113, + "learning_rate": 1.3252792636582578e-07, + "loss": 0.4835, + "step": 8083 + }, + { + "epoch": 0.93, + "grad_norm": 1.9899899202423035, + "learning_rate": 1.3210265270910516e-07, + "loss": 0.4994, + "step": 8084 + }, + { + "epoch": 0.93, + "grad_norm": 1.8264529311252422, + "learning_rate": 1.3167805335268126e-07, + "loss": 0.5148, + "step": 8085 + }, + { + "epoch": 0.93, + "grad_norm": 1.6789998294119555, + "learning_rate": 1.3125412835537032e-07, + "loss": 0.556, + "step": 8086 + }, + { + "epoch": 0.93, + "grad_norm": 3.4764406271377695, + "learning_rate": 1.3083087777589432e-07, + "loss": 0.532, + "step": 8087 + }, + { + "epoch": 0.93, + "grad_norm": 2.312818347790458, + "learning_rate": 1.3040830167288188e-07, + "loss": 0.5922, + "step": 8088 + }, + { + "epoch": 0.93, + "grad_norm": 2.3705055475791075, + "learning_rate": 1.299864001048673e-07, + "loss": 0.4495, + "step": 8089 + }, + { + "epoch": 0.93, + "grad_norm": 1.6649292996569292, + "learning_rate": 1.295651731302938e-07, + "loss": 0.4586, + "step": 8090 + }, + { + "epoch": 0.93, + "grad_norm": 2.2257675478148604, + "learning_rate": 1.2914462080750923e-07, + "loss": 0.4869, + "step": 8091 + }, + { + "epoch": 0.93, + "grad_norm": 2.4165760911904846, + "learning_rate": 1.2872474319476747e-07, + "loss": 0.4218, + "step": 8092 + }, + { + "epoch": 0.93, + "grad_norm": 2.2242998201781967, + "learning_rate": 1.283055403502309e-07, + "loss": 0.4685, + "step": 8093 + }, + { + "epoch": 0.93, + "grad_norm": 2.0898630633447604, + "learning_rate": 1.2788701233196643e-07, + "loss": 0.5471, + "step": 8094 + }, + { + "epoch": 0.93, + "grad_norm": 2.2808461757542458, + "learning_rate": 1.274691591979499e-07, + "loss": 0.443, + "step": 8095 + }, + { + "epoch": 0.93, + "grad_norm": 2.078243897977429, + "learning_rate": 1.2705198100606052e-07, + "loss": 0.4613, + "step": 8096 + }, + { + "epoch": 0.93, + "grad_norm": 2.3736714805211085, + "learning_rate": 1.2663547781408769e-07, + "loss": 0.3855, + "step": 8097 + }, + { + "epoch": 0.93, + "grad_norm": 2.0280938638639836, + "learning_rate": 1.262196496797241e-07, + "loss": 0.5124, + "step": 8098 + }, + { + "epoch": 0.93, + "grad_norm": 1.7620594821719144, + "learning_rate": 1.2580449666057038e-07, + "loss": 0.4486, + "step": 8099 + }, + { + "epoch": 0.93, + "grad_norm": 1.6836999326179645, + "learning_rate": 1.2539001881413326e-07, + "loss": 0.4806, + "step": 8100 + }, + { + "epoch": 0.93, + "grad_norm": 4.16261666007082, + "learning_rate": 1.2497621619782686e-07, + "loss": 0.4673, + "step": 8101 + }, + { + "epoch": 0.93, + "grad_norm": 3.128580757361907, + "learning_rate": 1.2456308886897028e-07, + "loss": 0.4954, + "step": 8102 + }, + { + "epoch": 0.93, + "grad_norm": 2.4660150799955782, + "learning_rate": 1.2415063688479057e-07, + "loss": 0.5254, + "step": 8103 + }, + { + "epoch": 0.93, + "grad_norm": 2.8492569745892475, + "learning_rate": 1.2373886030242032e-07, + "loss": 0.4658, + "step": 8104 + }, + { + "epoch": 0.93, + "grad_norm": 2.4469744950134027, + "learning_rate": 1.233277591788984e-07, + "loss": 0.428, + "step": 8105 + }, + { + "epoch": 0.93, + "grad_norm": 2.0688889668412185, + "learning_rate": 1.229173335711703e-07, + "loss": 0.4191, + "step": 8106 + }, + { + "epoch": 0.93, + "grad_norm": 1.6850460293375198, + "learning_rate": 1.225075835360884e-07, + "loss": 0.4412, + "step": 8107 + }, + { + "epoch": 0.93, + "grad_norm": 2.027183857434624, + "learning_rate": 1.220985091304111e-07, + "loss": 0.494, + "step": 8108 + }, + { + "epoch": 0.93, + "grad_norm": 2.3193272264515987, + "learning_rate": 1.2169011041080426e-07, + "loss": 0.5424, + "step": 8109 + }, + { + "epoch": 0.93, + "grad_norm": 1.948700376686977, + "learning_rate": 1.2128238743383758e-07, + "loss": 0.526, + "step": 8110 + }, + { + "epoch": 0.93, + "grad_norm": 0.8783381119927861, + "learning_rate": 1.2087534025598979e-07, + "loss": 0.6496, + "step": 8111 + }, + { + "epoch": 0.93, + "grad_norm": 1.9264507623716343, + "learning_rate": 1.2046896893364467e-07, + "loss": 0.3456, + "step": 8112 + }, + { + "epoch": 0.93, + "grad_norm": 1.7001400029963276, + "learning_rate": 1.2006327352309276e-07, + "loss": 0.5803, + "step": 8113 + }, + { + "epoch": 0.93, + "grad_norm": 5.071528182286443, + "learning_rate": 1.1965825408053133e-07, + "loss": 0.4143, + "step": 8114 + }, + { + "epoch": 0.93, + "grad_norm": 1.82997352598845, + "learning_rate": 1.1925391066206272e-07, + "loss": 0.4194, + "step": 8115 + }, + { + "epoch": 0.93, + "grad_norm": 0.8583410944710604, + "learning_rate": 1.1885024332369765e-07, + "loss": 0.6604, + "step": 8116 + }, + { + "epoch": 0.93, + "grad_norm": 1.7939437646106957, + "learning_rate": 1.1844725212135089e-07, + "loss": 0.5033, + "step": 8117 + }, + { + "epoch": 0.93, + "grad_norm": 2.1610446836265407, + "learning_rate": 1.1804493711084553e-07, + "loss": 0.5463, + "step": 8118 + }, + { + "epoch": 0.93, + "grad_norm": 1.6273086423574008, + "learning_rate": 1.1764329834790977e-07, + "loss": 0.5022, + "step": 8119 + }, + { + "epoch": 0.93, + "grad_norm": 2.128296965963352, + "learning_rate": 1.1724233588817913e-07, + "loss": 0.489, + "step": 8120 + }, + { + "epoch": 0.93, + "grad_norm": 2.439341377778607, + "learning_rate": 1.1684204978719416e-07, + "loss": 0.4551, + "step": 8121 + }, + { + "epoch": 0.93, + "grad_norm": 1.9258077805869753, + "learning_rate": 1.1644244010040273e-07, + "loss": 0.4802, + "step": 8122 + }, + { + "epoch": 0.93, + "grad_norm": 2.041682990701367, + "learning_rate": 1.1604350688315836e-07, + "loss": 0.4408, + "step": 8123 + }, + { + "epoch": 0.93, + "grad_norm": 2.1558770680714976, + "learning_rate": 1.1564525019072181e-07, + "loss": 0.4288, + "step": 8124 + }, + { + "epoch": 0.93, + "grad_norm": 1.7986136346332064, + "learning_rate": 1.1524767007825843e-07, + "loss": 0.5222, + "step": 8125 + }, + { + "epoch": 0.93, + "grad_norm": 2.2118192927416396, + "learning_rate": 1.1485076660084249e-07, + "loss": 0.5244, + "step": 8126 + }, + { + "epoch": 0.93, + "grad_norm": 1.576924720495921, + "learning_rate": 1.1445453981345167e-07, + "loss": 0.5054, + "step": 8127 + }, + { + "epoch": 0.93, + "grad_norm": 3.4055570032563813, + "learning_rate": 1.1405898977097263e-07, + "loss": 0.5293, + "step": 8128 + }, + { + "epoch": 0.93, + "grad_norm": 3.1388486439285113, + "learning_rate": 1.1366411652819598e-07, + "loss": 0.5156, + "step": 8129 + }, + { + "epoch": 0.93, + "grad_norm": 1.687929030296724, + "learning_rate": 1.1326992013981852e-07, + "loss": 0.4119, + "step": 8130 + }, + { + "epoch": 0.93, + "grad_norm": 2.337838203617382, + "learning_rate": 1.1287640066044658e-07, + "loss": 0.4676, + "step": 8131 + }, + { + "epoch": 0.93, + "grad_norm": 1.7215105292591768, + "learning_rate": 1.1248355814458822e-07, + "loss": 0.5422, + "step": 8132 + }, + { + "epoch": 0.93, + "grad_norm": 2.241602321479729, + "learning_rate": 1.1209139264666102e-07, + "loss": 0.4502, + "step": 8133 + }, + { + "epoch": 0.93, + "grad_norm": 2.20939546163229, + "learning_rate": 1.1169990422098764e-07, + "loss": 0.5408, + "step": 8134 + }, + { + "epoch": 0.93, + "grad_norm": 2.4460914610358757, + "learning_rate": 1.1130909292179693e-07, + "loss": 0.4632, + "step": 8135 + }, + { + "epoch": 0.93, + "grad_norm": 1.9707081571418674, + "learning_rate": 1.1091895880322334e-07, + "loss": 0.4798, + "step": 8136 + }, + { + "epoch": 0.93, + "grad_norm": 2.149942080882984, + "learning_rate": 1.1052950191930978e-07, + "loss": 0.4869, + "step": 8137 + }, + { + "epoch": 0.94, + "grad_norm": 2.0004859743975216, + "learning_rate": 1.1014072232400196e-07, + "loss": 0.4771, + "step": 8138 + }, + { + "epoch": 0.94, + "grad_norm": 2.0617511363898235, + "learning_rate": 1.0975262007115516e-07, + "loss": 0.494, + "step": 8139 + }, + { + "epoch": 0.94, + "grad_norm": 2.096215152771438, + "learning_rate": 1.0936519521452748e-07, + "loss": 0.5197, + "step": 8140 + }, + { + "epoch": 0.94, + "grad_norm": 3.1247173985396843, + "learning_rate": 1.0897844780778654e-07, + "loss": 0.5415, + "step": 8141 + }, + { + "epoch": 0.94, + "grad_norm": 2.0902380658116426, + "learning_rate": 1.0859237790450284e-07, + "loss": 0.4642, + "step": 8142 + }, + { + "epoch": 0.94, + "grad_norm": 2.3101410486832417, + "learning_rate": 1.082069855581569e-07, + "loss": 0.4522, + "step": 8143 + }, + { + "epoch": 0.94, + "grad_norm": 2.9710199907840242, + "learning_rate": 1.0782227082213104e-07, + "loss": 0.4392, + "step": 8144 + }, + { + "epoch": 0.94, + "grad_norm": 1.8184119849112796, + "learning_rate": 1.0743823374971762e-07, + "loss": 0.4452, + "step": 8145 + }, + { + "epoch": 0.94, + "grad_norm": 2.1703026282756555, + "learning_rate": 1.0705487439411243e-07, + "loss": 0.5118, + "step": 8146 + }, + { + "epoch": 0.94, + "grad_norm": 2.7661611310252745, + "learning_rate": 1.0667219280841857e-07, + "loss": 0.5821, + "step": 8147 + }, + { + "epoch": 0.94, + "grad_norm": 1.7427371760023855, + "learning_rate": 1.062901890456447e-07, + "loss": 0.4471, + "step": 8148 + }, + { + "epoch": 0.94, + "grad_norm": 1.8208886403659923, + "learning_rate": 1.0590886315870685e-07, + "loss": 0.4343, + "step": 8149 + }, + { + "epoch": 0.94, + "grad_norm": 26.994696167644015, + "learning_rate": 1.0552821520042556e-07, + "loss": 0.4926, + "step": 8150 + }, + { + "epoch": 0.94, + "grad_norm": 1.8665526001866652, + "learning_rate": 1.051482452235275e-07, + "loss": 0.5555, + "step": 8151 + }, + { + "epoch": 0.94, + "grad_norm": 1.5980064434509742, + "learning_rate": 1.0476895328064729e-07, + "loss": 0.452, + "step": 8152 + }, + { + "epoch": 0.94, + "grad_norm": 2.4626612061107154, + "learning_rate": 1.0439033942432341e-07, + "loss": 0.493, + "step": 8153 + }, + { + "epoch": 0.94, + "grad_norm": 2.008864849450951, + "learning_rate": 1.0401240370700228e-07, + "loss": 0.5863, + "step": 8154 + }, + { + "epoch": 0.94, + "grad_norm": 2.10335587358992, + "learning_rate": 1.0363514618103477e-07, + "loss": 0.4546, + "step": 8155 + }, + { + "epoch": 0.94, + "grad_norm": 2.307544911497827, + "learning_rate": 1.0325856689867853e-07, + "loss": 0.4889, + "step": 8156 + }, + { + "epoch": 0.94, + "grad_norm": 2.192924263589598, + "learning_rate": 1.0288266591209738e-07, + "loss": 0.4186, + "step": 8157 + }, + { + "epoch": 0.94, + "grad_norm": 2.3445574228265977, + "learning_rate": 1.0250744327336138e-07, + "loss": 0.5138, + "step": 8158 + }, + { + "epoch": 0.94, + "grad_norm": 2.706912809284124, + "learning_rate": 1.0213289903444557e-07, + "loss": 0.5092, + "step": 8159 + }, + { + "epoch": 0.94, + "grad_norm": 2.820597525748506, + "learning_rate": 1.0175903324723291e-07, + "loss": 0.3952, + "step": 8160 + }, + { + "epoch": 0.94, + "grad_norm": 1.8781883247540678, + "learning_rate": 1.0138584596350976e-07, + "loss": 0.5263, + "step": 8161 + }, + { + "epoch": 0.94, + "grad_norm": 2.769212397174568, + "learning_rate": 1.0101333723497198e-07, + "loss": 0.4713, + "step": 8162 + }, + { + "epoch": 0.94, + "grad_norm": 2.706592328887155, + "learning_rate": 1.0064150711321718e-07, + "loss": 0.4013, + "step": 8163 + }, + { + "epoch": 0.94, + "grad_norm": 0.8304667641849134, + "learning_rate": 1.0027035564975252e-07, + "loss": 0.6854, + "step": 8164 + }, + { + "epoch": 0.94, + "grad_norm": 2.1183799627668582, + "learning_rate": 9.989988289598962e-08, + "loss": 0.4873, + "step": 8165 + }, + { + "epoch": 0.94, + "grad_norm": 1.8848187251062787, + "learning_rate": 9.953008890324634e-08, + "loss": 0.5102, + "step": 8166 + }, + { + "epoch": 0.94, + "grad_norm": 2.0322758501280087, + "learning_rate": 9.916097372274558e-08, + "loss": 0.5795, + "step": 8167 + }, + { + "epoch": 0.94, + "grad_norm": 2.1706580031313187, + "learning_rate": 9.879253740561867e-08, + "loss": 0.536, + "step": 8168 + }, + { + "epoch": 0.94, + "grad_norm": 2.0326705961764104, + "learning_rate": 9.84247800029009e-08, + "loss": 0.5295, + "step": 8169 + }, + { + "epoch": 0.94, + "grad_norm": 2.032374106842836, + "learning_rate": 9.805770156553263e-08, + "loss": 0.5612, + "step": 8170 + }, + { + "epoch": 0.94, + "grad_norm": 1.809415950512553, + "learning_rate": 9.769130214436318e-08, + "loss": 0.4451, + "step": 8171 + }, + { + "epoch": 0.94, + "grad_norm": 2.5016152767064646, + "learning_rate": 9.732558179014528e-08, + "loss": 0.4934, + "step": 8172 + }, + { + "epoch": 0.94, + "grad_norm": 2.0134033984882094, + "learning_rate": 9.696054055353843e-08, + "loss": 0.5059, + "step": 8173 + }, + { + "epoch": 0.94, + "grad_norm": 1.7037241977666748, + "learning_rate": 9.659617848510883e-08, + "loss": 0.3832, + "step": 8174 + }, + { + "epoch": 0.94, + "grad_norm": 2.072702739979123, + "learning_rate": 9.623249563532666e-08, + "loss": 0.483, + "step": 8175 + }, + { + "epoch": 0.94, + "grad_norm": 2.0798594034074487, + "learning_rate": 9.586949205456997e-08, + "loss": 0.5168, + "step": 8176 + }, + { + "epoch": 0.94, + "grad_norm": 1.6667071021639077, + "learning_rate": 9.550716779312242e-08, + "loss": 0.5309, + "step": 8177 + }, + { + "epoch": 0.94, + "grad_norm": 1.9226690385661063, + "learning_rate": 9.514552290117162e-08, + "loss": 0.5141, + "step": 8178 + }, + { + "epoch": 0.94, + "grad_norm": 2.1610775118456247, + "learning_rate": 9.478455742881365e-08, + "loss": 0.5141, + "step": 8179 + }, + { + "epoch": 0.94, + "grad_norm": 1.5058743972699011, + "learning_rate": 9.44242714260496e-08, + "loss": 0.487, + "step": 8180 + }, + { + "epoch": 0.94, + "grad_norm": 2.512104274286153, + "learning_rate": 9.406466494278566e-08, + "loss": 0.5097, + "step": 8181 + }, + { + "epoch": 0.94, + "grad_norm": 2.038052809655248, + "learning_rate": 9.37057380288342e-08, + "loss": 0.6662, + "step": 8182 + }, + { + "epoch": 0.94, + "grad_norm": 2.8717268926974873, + "learning_rate": 9.334749073391436e-08, + "loss": 0.4569, + "step": 8183 + }, + { + "epoch": 0.94, + "grad_norm": 1.849468868144424, + "learning_rate": 9.298992310765032e-08, + "loss": 0.4643, + "step": 8184 + }, + { + "epoch": 0.94, + "grad_norm": 2.556247108719828, + "learning_rate": 9.26330351995719e-08, + "loss": 0.4953, + "step": 8185 + }, + { + "epoch": 0.94, + "grad_norm": 1.9799756523816332, + "learning_rate": 9.227682705911567e-08, + "loss": 0.5309, + "step": 8186 + }, + { + "epoch": 0.94, + "grad_norm": 1.8543210327092232, + "learning_rate": 9.192129873562383e-08, + "loss": 0.4953, + "step": 8187 + }, + { + "epoch": 0.94, + "grad_norm": 1.538752010183699, + "learning_rate": 9.156645027834255e-08, + "loss": 0.4215, + "step": 8188 + }, + { + "epoch": 0.94, + "grad_norm": 1.7522510004601333, + "learning_rate": 9.121228173642749e-08, + "loss": 0.4356, + "step": 8189 + }, + { + "epoch": 0.94, + "grad_norm": 2.340639346117667, + "learning_rate": 9.085879315893664e-08, + "loss": 0.4889, + "step": 8190 + }, + { + "epoch": 0.94, + "grad_norm": 1.8741167195516106, + "learning_rate": 9.050598459483528e-08, + "loss": 0.4988, + "step": 8191 + }, + { + "epoch": 0.94, + "grad_norm": 2.97533943183261, + "learning_rate": 9.015385609299431e-08, + "loss": 0.5377, + "step": 8192 + }, + { + "epoch": 0.94, + "grad_norm": 2.672060008411488, + "learning_rate": 8.980240770219139e-08, + "loss": 0.5132, + "step": 8193 + }, + { + "epoch": 0.94, + "grad_norm": 2.1834326861067987, + "learning_rate": 8.945163947110758e-08, + "loss": 0.5576, + "step": 8194 + }, + { + "epoch": 0.94, + "grad_norm": 1.7101987298510644, + "learning_rate": 8.910155144833288e-08, + "loss": 0.3655, + "step": 8195 + }, + { + "epoch": 0.94, + "grad_norm": 1.834269291635032, + "learning_rate": 8.875214368236073e-08, + "loss": 0.4482, + "step": 8196 + }, + { + "epoch": 0.94, + "grad_norm": 2.762876428500223, + "learning_rate": 8.840341622159077e-08, + "loss": 0.4892, + "step": 8197 + }, + { + "epoch": 0.94, + "grad_norm": 2.844746178482963, + "learning_rate": 8.805536911432877e-08, + "loss": 0.5435, + "step": 8198 + }, + { + "epoch": 0.94, + "grad_norm": 3.1227393898759264, + "learning_rate": 8.770800240878619e-08, + "loss": 0.4752, + "step": 8199 + }, + { + "epoch": 0.94, + "grad_norm": 1.9073978175133313, + "learning_rate": 8.736131615308063e-08, + "loss": 0.3858, + "step": 8200 + }, + { + "epoch": 0.94, + "grad_norm": 2.293273129720375, + "learning_rate": 8.701531039523426e-08, + "loss": 0.4441, + "step": 8201 + }, + { + "epoch": 0.94, + "grad_norm": 3.222552394733756, + "learning_rate": 8.666998518317648e-08, + "loss": 0.4865, + "step": 8202 + }, + { + "epoch": 0.94, + "grad_norm": 1.6788054651142348, + "learning_rate": 8.632534056474129e-08, + "loss": 0.5124, + "step": 8203 + }, + { + "epoch": 0.94, + "grad_norm": 0.8581537318400115, + "learning_rate": 8.598137658766881e-08, + "loss": 0.6607, + "step": 8204 + }, + { + "epoch": 0.94, + "grad_norm": 2.1944355909695923, + "learning_rate": 8.563809329960481e-08, + "loss": 0.5083, + "step": 8205 + }, + { + "epoch": 0.94, + "grad_norm": 2.112898432923045, + "learning_rate": 8.529549074810129e-08, + "loss": 0.5304, + "step": 8206 + }, + { + "epoch": 0.94, + "grad_norm": 1.7773799897234068, + "learning_rate": 8.495356898061469e-08, + "loss": 0.425, + "step": 8207 + }, + { + "epoch": 0.94, + "grad_norm": 2.0268704327700857, + "learning_rate": 8.46123280445088e-08, + "loss": 0.4676, + "step": 8208 + }, + { + "epoch": 0.94, + "grad_norm": 2.343903693059684, + "learning_rate": 8.427176798705139e-08, + "loss": 0.5652, + "step": 8209 + }, + { + "epoch": 0.94, + "grad_norm": 2.1152987859795718, + "learning_rate": 8.393188885541748e-08, + "loss": 0.4278, + "step": 8210 + }, + { + "epoch": 0.94, + "grad_norm": 2.1574151463604943, + "learning_rate": 8.359269069668662e-08, + "loss": 0.4443, + "step": 8211 + }, + { + "epoch": 0.94, + "grad_norm": 2.0445715608288832, + "learning_rate": 8.325417355784516e-08, + "loss": 0.4074, + "step": 8212 + }, + { + "epoch": 0.94, + "grad_norm": 1.811679910282104, + "learning_rate": 8.291633748578331e-08, + "loss": 0.495, + "step": 8213 + }, + { + "epoch": 0.94, + "grad_norm": 2.1170193438453664, + "learning_rate": 8.257918252729924e-08, + "loss": 0.4103, + "step": 8214 + }, + { + "epoch": 0.94, + "grad_norm": 2.248550909235313, + "learning_rate": 8.224270872909556e-08, + "loss": 0.5475, + "step": 8215 + }, + { + "epoch": 0.94, + "grad_norm": 5.590481319259768, + "learning_rate": 8.19069161377789e-08, + "loss": 0.4672, + "step": 8216 + }, + { + "epoch": 0.94, + "grad_norm": 1.9362351511910811, + "learning_rate": 8.157180479986538e-08, + "loss": 0.4249, + "step": 8217 + }, + { + "epoch": 0.94, + "grad_norm": 1.5470414448666432, + "learning_rate": 8.123737476177284e-08, + "loss": 0.4729, + "step": 8218 + }, + { + "epoch": 0.94, + "grad_norm": 2.6596123467268318, + "learning_rate": 8.09036260698276e-08, + "loss": 0.4436, + "step": 8219 + }, + { + "epoch": 0.94, + "grad_norm": 1.8623066206762264, + "learning_rate": 8.057055877025988e-08, + "loss": 0.4641, + "step": 8220 + }, + { + "epoch": 0.94, + "grad_norm": 2.1584336283905814, + "learning_rate": 8.023817290920666e-08, + "loss": 0.5372, + "step": 8221 + }, + { + "epoch": 0.94, + "grad_norm": 1.7393935408484684, + "learning_rate": 7.990646853270944e-08, + "loss": 0.4489, + "step": 8222 + }, + { + "epoch": 0.94, + "grad_norm": 2.319954745166295, + "learning_rate": 7.957544568671593e-08, + "loss": 0.4723, + "step": 8223 + }, + { + "epoch": 0.94, + "grad_norm": 1.6144617123171034, + "learning_rate": 7.924510441707945e-08, + "loss": 0.4888, + "step": 8224 + }, + { + "epoch": 0.95, + "grad_norm": 2.6440783132425225, + "learning_rate": 7.891544476955892e-08, + "loss": 0.4689, + "step": 8225 + }, + { + "epoch": 0.95, + "grad_norm": 2.3108294817929007, + "learning_rate": 7.858646678981841e-08, + "loss": 0.5496, + "step": 8226 + }, + { + "epoch": 0.95, + "grad_norm": 1.6537288980063074, + "learning_rate": 7.82581705234281e-08, + "loss": 0.4277, + "step": 8227 + }, + { + "epoch": 0.95, + "grad_norm": 2.077571046408205, + "learning_rate": 7.79305560158633e-08, + "loss": 0.5252, + "step": 8228 + }, + { + "epoch": 0.95, + "grad_norm": 2.0328674661781676, + "learning_rate": 7.760362331250604e-08, + "loss": 0.5546, + "step": 8229 + }, + { + "epoch": 0.95, + "grad_norm": 1.9392274085690162, + "learning_rate": 7.727737245864175e-08, + "loss": 0.5003, + "step": 8230 + }, + { + "epoch": 0.95, + "grad_norm": 2.157190619506827, + "learning_rate": 7.695180349946318e-08, + "loss": 0.4555, + "step": 8231 + }, + { + "epoch": 0.95, + "grad_norm": 6.1867975622490645, + "learning_rate": 7.662691648006815e-08, + "loss": 0.4839, + "step": 8232 + }, + { + "epoch": 0.95, + "grad_norm": 2.369498163987176, + "learning_rate": 7.630271144546008e-08, + "loss": 0.4683, + "step": 8233 + }, + { + "epoch": 0.95, + "grad_norm": 2.45293076996086, + "learning_rate": 7.597918844054753e-08, + "loss": 0.5565, + "step": 8234 + }, + { + "epoch": 0.95, + "grad_norm": 2.053536479159425, + "learning_rate": 7.565634751014517e-08, + "loss": 0.4757, + "step": 8235 + }, + { + "epoch": 0.95, + "grad_norm": 2.2550532845876514, + "learning_rate": 7.533418869897225e-08, + "loss": 0.5403, + "step": 8236 + }, + { + "epoch": 0.95, + "grad_norm": 2.0463940998104366, + "learning_rate": 7.501271205165473e-08, + "loss": 0.5134, + "step": 8237 + }, + { + "epoch": 0.95, + "grad_norm": 2.213458060128385, + "learning_rate": 7.46919176127231e-08, + "loss": 0.5811, + "step": 8238 + }, + { + "epoch": 0.95, + "grad_norm": 2.093498383595694, + "learning_rate": 7.43718054266146e-08, + "loss": 0.4991, + "step": 8239 + }, + { + "epoch": 0.95, + "grad_norm": 1.7322372661029075, + "learning_rate": 7.405237553766986e-08, + "loss": 0.4705, + "step": 8240 + }, + { + "epoch": 0.95, + "grad_norm": 1.6661916541879334, + "learning_rate": 7.373362799013683e-08, + "loss": 0.4979, + "step": 8241 + }, + { + "epoch": 0.95, + "grad_norm": 1.6652458841755704, + "learning_rate": 7.341556282816853e-08, + "loss": 0.4535, + "step": 8242 + }, + { + "epoch": 0.95, + "grad_norm": 4.136477538047666, + "learning_rate": 7.309818009582304e-08, + "loss": 0.4759, + "step": 8243 + }, + { + "epoch": 0.95, + "grad_norm": 2.5367530976434796, + "learning_rate": 7.278147983706463e-08, + "loss": 0.4927, + "step": 8244 + }, + { + "epoch": 0.95, + "grad_norm": 1.9602328214878317, + "learning_rate": 7.246546209576155e-08, + "loss": 0.4948, + "step": 8245 + }, + { + "epoch": 0.95, + "grad_norm": 2.410380220729539, + "learning_rate": 7.21501269156899e-08, + "loss": 0.4527, + "step": 8246 + }, + { + "epoch": 0.95, + "grad_norm": 2.9721338354242834, + "learning_rate": 7.183547434052863e-08, + "loss": 0.5225, + "step": 8247 + }, + { + "epoch": 0.95, + "grad_norm": 2.2233943377118486, + "learning_rate": 7.152150441386452e-08, + "loss": 0.5162, + "step": 8248 + }, + { + "epoch": 0.95, + "grad_norm": 2.389017145371587, + "learning_rate": 7.120821717918724e-08, + "loss": 0.464, + "step": 8249 + }, + { + "epoch": 0.95, + "grad_norm": 2.579128130815504, + "learning_rate": 7.089561267989431e-08, + "loss": 0.5667, + "step": 8250 + }, + { + "epoch": 0.95, + "grad_norm": 1.9191728095600276, + "learning_rate": 7.058369095928719e-08, + "loss": 0.4732, + "step": 8251 + }, + { + "epoch": 0.95, + "grad_norm": 0.8138580393729101, + "learning_rate": 7.027245206057409e-08, + "loss": 0.7062, + "step": 8252 + }, + { + "epoch": 0.95, + "grad_norm": 2.5118539484844935, + "learning_rate": 6.996189602686609e-08, + "loss": 0.4177, + "step": 8253 + }, + { + "epoch": 0.95, + "grad_norm": 2.0104235382452162, + "learning_rate": 6.965202290118323e-08, + "loss": 0.5101, + "step": 8254 + }, + { + "epoch": 0.95, + "grad_norm": 2.115350204816361, + "learning_rate": 6.934283272644782e-08, + "loss": 0.4891, + "step": 8255 + }, + { + "epoch": 0.95, + "grad_norm": 2.272952748064892, + "learning_rate": 6.90343255454895e-08, + "loss": 0.4468, + "step": 8256 + }, + { + "epoch": 0.95, + "grad_norm": 4.609129833312253, + "learning_rate": 6.872650140104186e-08, + "loss": 0.572, + "step": 8257 + }, + { + "epoch": 0.95, + "grad_norm": 2.5063754100614526, + "learning_rate": 6.841936033574526e-08, + "loss": 0.5694, + "step": 8258 + }, + { + "epoch": 0.95, + "grad_norm": 2.279190487247249, + "learning_rate": 6.811290239214452e-08, + "loss": 0.4815, + "step": 8259 + }, + { + "epoch": 0.95, + "grad_norm": 2.525446881570976, + "learning_rate": 6.78071276126907e-08, + "loss": 0.4325, + "step": 8260 + }, + { + "epoch": 0.95, + "grad_norm": 1.9271975965316115, + "learning_rate": 6.750203603973937e-08, + "loss": 0.4595, + "step": 8261 + }, + { + "epoch": 0.95, + "grad_norm": 1.8132687488716681, + "learning_rate": 6.719762771555116e-08, + "loss": 0.4607, + "step": 8262 + }, + { + "epoch": 0.95, + "grad_norm": 1.806652710156521, + "learning_rate": 6.689390268229345e-08, + "loss": 0.3799, + "step": 8263 + }, + { + "epoch": 0.95, + "grad_norm": 2.0368999218445993, + "learning_rate": 6.65908609820376e-08, + "loss": 0.4306, + "step": 8264 + }, + { + "epoch": 0.95, + "grad_norm": 2.511606902321926, + "learning_rate": 6.628850265676167e-08, + "loss": 0.478, + "step": 8265 + }, + { + "epoch": 0.95, + "grad_norm": 1.924056142845482, + "learning_rate": 6.598682774834775e-08, + "loss": 0.5219, + "step": 8266 + }, + { + "epoch": 0.95, + "grad_norm": 4.152242874660278, + "learning_rate": 6.568583629858349e-08, + "loss": 0.454, + "step": 8267 + }, + { + "epoch": 0.95, + "grad_norm": 2.50656737652948, + "learning_rate": 6.538552834916278e-08, + "loss": 0.4911, + "step": 8268 + }, + { + "epoch": 0.95, + "grad_norm": 1.8896848370015829, + "learning_rate": 6.508590394168402e-08, + "loss": 0.407, + "step": 8269 + }, + { + "epoch": 0.95, + "grad_norm": 1.7199146605830398, + "learning_rate": 6.478696311765065e-08, + "loss": 0.4707, + "step": 8270 + }, + { + "epoch": 0.95, + "grad_norm": 1.7173928531057958, + "learning_rate": 6.448870591847289e-08, + "loss": 0.4485, + "step": 8271 + }, + { + "epoch": 0.95, + "grad_norm": 1.9465516788346606, + "learning_rate": 6.419113238546438e-08, + "loss": 0.4874, + "step": 8272 + }, + { + "epoch": 0.95, + "grad_norm": 1.878305572550131, + "learning_rate": 6.3894242559846e-08, + "loss": 0.4274, + "step": 8273 + }, + { + "epoch": 0.95, + "grad_norm": 1.8513453368641881, + "learning_rate": 6.359803648274154e-08, + "loss": 0.5761, + "step": 8274 + }, + { + "epoch": 0.95, + "grad_norm": 1.6468128485972924, + "learning_rate": 6.330251419518263e-08, + "loss": 0.4787, + "step": 8275 + }, + { + "epoch": 0.95, + "grad_norm": 2.3711208574473503, + "learning_rate": 6.300767573810373e-08, + "loss": 0.4408, + "step": 8276 + }, + { + "epoch": 0.95, + "grad_norm": 1.7380019881402677, + "learning_rate": 6.271352115234663e-08, + "loss": 0.4383, + "step": 8277 + }, + { + "epoch": 0.95, + "grad_norm": 2.400774104624241, + "learning_rate": 6.242005047865707e-08, + "loss": 0.4868, + "step": 8278 + }, + { + "epoch": 0.95, + "grad_norm": 2.4139888324158805, + "learning_rate": 6.212726375768751e-08, + "loss": 0.5151, + "step": 8279 + }, + { + "epoch": 0.95, + "grad_norm": 1.9479617748656484, + "learning_rate": 6.183516102999387e-08, + "loss": 0.4939, + "step": 8280 + }, + { + "epoch": 0.95, + "grad_norm": 2.9495927928379975, + "learning_rate": 6.154374233603876e-08, + "loss": 0.4245, + "step": 8281 + }, + { + "epoch": 0.95, + "grad_norm": 2.0464669028443323, + "learning_rate": 6.12530077161888e-08, + "loss": 0.4699, + "step": 8282 + }, + { + "epoch": 0.95, + "grad_norm": 2.1409418423140614, + "learning_rate": 6.09629572107162e-08, + "loss": 0.4791, + "step": 8283 + }, + { + "epoch": 0.95, + "grad_norm": 2.130466763204626, + "learning_rate": 6.067359085980051e-08, + "loss": 0.4834, + "step": 8284 + }, + { + "epoch": 0.95, + "grad_norm": 2.794358424495802, + "learning_rate": 6.038490870352242e-08, + "loss": 0.4945, + "step": 8285 + }, + { + "epoch": 0.95, + "grad_norm": 2.5082685382169236, + "learning_rate": 6.009691078187163e-08, + "loss": 0.4403, + "step": 8286 + }, + { + "epoch": 0.95, + "grad_norm": 2.2014644764366356, + "learning_rate": 5.980959713474122e-08, + "loss": 0.5277, + "step": 8287 + }, + { + "epoch": 0.95, + "grad_norm": 2.1501686601696033, + "learning_rate": 5.95229678019299e-08, + "loss": 0.5127, + "step": 8288 + }, + { + "epoch": 0.95, + "grad_norm": 1.800698362247267, + "learning_rate": 5.9237022823140924e-08, + "loss": 0.372, + "step": 8289 + }, + { + "epoch": 0.95, + "grad_norm": 1.9283674952445404, + "learning_rate": 5.8951762237984264e-08, + "loss": 0.5348, + "step": 8290 + }, + { + "epoch": 0.95, + "grad_norm": 2.529976429818939, + "learning_rate": 5.8667186085973326e-08, + "loss": 0.4561, + "step": 8291 + }, + { + "epoch": 0.95, + "grad_norm": 1.9901692254826935, + "learning_rate": 5.838329440652824e-08, + "loss": 0.4535, + "step": 8292 + }, + { + "epoch": 0.95, + "grad_norm": 2.4536320604389217, + "learning_rate": 5.8100087238972534e-08, + "loss": 0.3832, + "step": 8293 + }, + { + "epoch": 0.95, + "grad_norm": 1.8204082221724118, + "learning_rate": 5.7817564622537626e-08, + "loss": 0.5183, + "step": 8294 + }, + { + "epoch": 0.95, + "grad_norm": 1.6346098918642138, + "learning_rate": 5.75357265963572e-08, + "loss": 0.4043, + "step": 8295 + }, + { + "epoch": 0.95, + "grad_norm": 2.341002831282058, + "learning_rate": 5.7254573199472254e-08, + "loss": 0.4451, + "step": 8296 + }, + { + "epoch": 0.95, + "grad_norm": 2.7549114316318337, + "learning_rate": 5.6974104470827186e-08, + "loss": 0.4879, + "step": 8297 + }, + { + "epoch": 0.95, + "grad_norm": 2.574620651372921, + "learning_rate": 5.6694320449273145e-08, + "loss": 0.5027, + "step": 8298 + }, + { + "epoch": 0.95, + "grad_norm": 2.599635536073557, + "learning_rate": 5.641522117356635e-08, + "loss": 0.5163, + "step": 8299 + }, + { + "epoch": 0.95, + "grad_norm": 1.7737757669358272, + "learning_rate": 5.613680668236588e-08, + "loss": 0.4186, + "step": 8300 + }, + { + "epoch": 0.95, + "grad_norm": 1.9342727566092899, + "learning_rate": 5.5859077014239205e-08, + "loss": 0.4512, + "step": 8301 + }, + { + "epoch": 0.95, + "grad_norm": 2.5314673892545008, + "learning_rate": 5.5582032207656676e-08, + "loss": 0.5119, + "step": 8302 + }, + { + "epoch": 0.95, + "grad_norm": 2.3780497467574446, + "learning_rate": 5.530567230099482e-08, + "loss": 0.4709, + "step": 8303 + }, + { + "epoch": 0.95, + "grad_norm": 2.356193781021113, + "learning_rate": 5.502999733253411e-08, + "loss": 0.5361, + "step": 8304 + }, + { + "epoch": 0.95, + "grad_norm": 1.8040589578286474, + "learning_rate": 5.4755007340462354e-08, + "loss": 0.4968, + "step": 8305 + }, + { + "epoch": 0.95, + "grad_norm": 2.295567632926318, + "learning_rate": 5.448070236287017e-08, + "loss": 0.4163, + "step": 8306 + }, + { + "epoch": 0.95, + "grad_norm": 1.849799273526126, + "learning_rate": 5.420708243775497e-08, + "loss": 0.5169, + "step": 8307 + }, + { + "epoch": 0.95, + "grad_norm": 2.964378658410538, + "learning_rate": 5.3934147603017514e-08, + "loss": 0.5743, + "step": 8308 + }, + { + "epoch": 0.95, + "grad_norm": 1.7737300496357848, + "learning_rate": 5.366189789646592e-08, + "loss": 0.4387, + "step": 8309 + }, + { + "epoch": 0.95, + "grad_norm": 2.0994933560696567, + "learning_rate": 5.339033335581056e-08, + "loss": 0.4968, + "step": 8310 + }, + { + "epoch": 0.95, + "grad_norm": 1.9293592193953497, + "learning_rate": 5.311945401867025e-08, + "loss": 0.5156, + "step": 8311 + }, + { + "epoch": 0.96, + "grad_norm": 1.7107945886635205, + "learning_rate": 5.284925992256551e-08, + "loss": 0.408, + "step": 8312 + }, + { + "epoch": 0.96, + "grad_norm": 2.0218748146858965, + "learning_rate": 5.25797511049253e-08, + "loss": 0.5117, + "step": 8313 + }, + { + "epoch": 0.96, + "grad_norm": 2.4401371717876486, + "learning_rate": 5.231092760308032e-08, + "loss": 0.5121, + "step": 8314 + }, + { + "epoch": 0.96, + "grad_norm": 2.114197273563366, + "learning_rate": 5.2042789454269104e-08, + "loss": 0.5482, + "step": 8315 + }, + { + "epoch": 0.96, + "grad_norm": 2.030794865734462, + "learning_rate": 5.1775336695633616e-08, + "loss": 0.4513, + "step": 8316 + }, + { + "epoch": 0.96, + "grad_norm": 2.0056748547593934, + "learning_rate": 5.150856936422144e-08, + "loss": 0.3973, + "step": 8317 + }, + { + "epoch": 0.96, + "grad_norm": 2.319002857003995, + "learning_rate": 5.124248749698468e-08, + "loss": 0.4725, + "step": 8318 + }, + { + "epoch": 0.96, + "grad_norm": 2.3188070986369174, + "learning_rate": 5.09770911307822e-08, + "loss": 0.5389, + "step": 8319 + }, + { + "epoch": 0.96, + "grad_norm": 1.7582917085029484, + "learning_rate": 5.071238030237569e-08, + "loss": 0.444, + "step": 8320 + }, + { + "epoch": 0.96, + "grad_norm": 4.012920319343171, + "learning_rate": 5.0448355048432484e-08, + "loss": 0.4383, + "step": 8321 + }, + { + "epoch": 0.96, + "grad_norm": 1.9170507531004495, + "learning_rate": 5.018501540552611e-08, + "loss": 0.5265, + "step": 8322 + }, + { + "epoch": 0.96, + "grad_norm": 3.1193905185817394, + "learning_rate": 4.992236141013407e-08, + "loss": 0.3957, + "step": 8323 + }, + { + "epoch": 0.96, + "grad_norm": 2.7264590627593157, + "learning_rate": 4.966039309863946e-08, + "loss": 0.4861, + "step": 8324 + }, + { + "epoch": 0.96, + "grad_norm": 1.7713697854863821, + "learning_rate": 4.939911050732937e-08, + "loss": 0.5064, + "step": 8325 + }, + { + "epoch": 0.96, + "grad_norm": 3.5448806095320884, + "learning_rate": 4.913851367239764e-08, + "loss": 0.4752, + "step": 8326 + }, + { + "epoch": 0.96, + "grad_norm": 2.864661199703402, + "learning_rate": 4.887860262994038e-08, + "loss": 0.3706, + "step": 8327 + }, + { + "epoch": 0.96, + "grad_norm": 2.215967204836886, + "learning_rate": 4.861937741596268e-08, + "loss": 0.5291, + "step": 8328 + }, + { + "epoch": 0.96, + "grad_norm": 2.079927876863055, + "learning_rate": 4.8360838066370266e-08, + "loss": 0.5562, + "step": 8329 + }, + { + "epoch": 0.96, + "grad_norm": 2.243969187991946, + "learning_rate": 4.810298461697727e-08, + "loss": 0.4789, + "step": 8330 + }, + { + "epoch": 0.96, + "grad_norm": 6.014411034545867, + "learning_rate": 4.784581710350123e-08, + "loss": 0.4511, + "step": 8331 + }, + { + "epoch": 0.96, + "grad_norm": 2.5906508418840577, + "learning_rate": 4.758933556156475e-08, + "loss": 0.4787, + "step": 8332 + }, + { + "epoch": 0.96, + "grad_norm": 1.9918233939079075, + "learning_rate": 4.73335400266961e-08, + "loss": 0.5215, + "step": 8333 + }, + { + "epoch": 0.96, + "grad_norm": 1.7545592311875304, + "learning_rate": 4.707843053432748e-08, + "loss": 0.3953, + "step": 8334 + }, + { + "epoch": 0.96, + "grad_norm": 1.8255317064485896, + "learning_rate": 4.6824007119796176e-08, + "loss": 0.4647, + "step": 8335 + }, + { + "epoch": 0.96, + "grad_norm": 1.65089495443273, + "learning_rate": 4.657026981834623e-08, + "loss": 0.4531, + "step": 8336 + }, + { + "epoch": 0.96, + "grad_norm": 2.065329220433614, + "learning_rate": 4.6317218665123953e-08, + "loss": 0.4651, + "step": 8337 + }, + { + "epoch": 0.96, + "grad_norm": 0.8427960822306894, + "learning_rate": 4.606485369518354e-08, + "loss": 0.6722, + "step": 8338 + }, + { + "epoch": 0.96, + "grad_norm": 0.8565782117245647, + "learning_rate": 4.581317494348092e-08, + "loss": 0.6953, + "step": 8339 + }, + { + "epoch": 0.96, + "grad_norm": 1.9944112565467742, + "learning_rate": 4.556218244487876e-08, + "loss": 0.5364, + "step": 8340 + }, + { + "epoch": 0.96, + "grad_norm": 1.8492523932549663, + "learning_rate": 4.53118762341459e-08, + "loss": 0.4648, + "step": 8341 + }, + { + "epoch": 0.96, + "grad_norm": 1.9259408803126086, + "learning_rate": 4.506225634595296e-08, + "loss": 0.5278, + "step": 8342 + }, + { + "epoch": 0.96, + "grad_norm": 2.2960479323297487, + "learning_rate": 4.481332281487838e-08, + "loss": 0.4093, + "step": 8343 + }, + { + "epoch": 0.96, + "grad_norm": 1.8104865082487747, + "learning_rate": 4.4565075675404024e-08, + "loss": 0.5431, + "step": 8344 + }, + { + "epoch": 0.96, + "grad_norm": 1.67742257981651, + "learning_rate": 4.4317514961917387e-08, + "loss": 0.5489, + "step": 8345 + }, + { + "epoch": 0.96, + "grad_norm": 2.493866747800699, + "learning_rate": 4.407064070870992e-08, + "loss": 0.46, + "step": 8346 + }, + { + "epoch": 0.96, + "grad_norm": 3.6316221402586075, + "learning_rate": 4.3824452949978725e-08, + "loss": 0.4902, + "step": 8347 + }, + { + "epoch": 0.96, + "grad_norm": 8.052226605028183, + "learning_rate": 4.357895171982596e-08, + "loss": 0.5052, + "step": 8348 + }, + { + "epoch": 0.96, + "grad_norm": 0.8510981466546902, + "learning_rate": 4.333413705225886e-08, + "loss": 0.7161, + "step": 8349 + }, + { + "epoch": 0.96, + "grad_norm": 2.0769268074883893, + "learning_rate": 4.3090008981187534e-08, + "loss": 0.5383, + "step": 8350 + }, + { + "epoch": 0.96, + "grad_norm": 1.9229641820451802, + "learning_rate": 4.284656754043048e-08, + "loss": 0.4235, + "step": 8351 + }, + { + "epoch": 0.96, + "grad_norm": 2.069245540581593, + "learning_rate": 4.2603812763707956e-08, + "loss": 0.4546, + "step": 8352 + }, + { + "epoch": 0.96, + "grad_norm": 2.7400006153394267, + "learning_rate": 4.2361744684646934e-08, + "loss": 0.4568, + "step": 8353 + }, + { + "epoch": 0.96, + "grad_norm": 2.0192758117837375, + "learning_rate": 4.212036333677783e-08, + "loss": 0.4641, + "step": 8354 + }, + { + "epoch": 0.96, + "grad_norm": 2.2915084454007686, + "learning_rate": 4.187966875353777e-08, + "loss": 0.5357, + "step": 8355 + }, + { + "epoch": 0.96, + "grad_norm": 2.3644591370553076, + "learning_rate": 4.163966096826677e-08, + "loss": 0.5057, + "step": 8356 + }, + { + "epoch": 0.96, + "grad_norm": 1.6612270854985094, + "learning_rate": 4.140034001421156e-08, + "loss": 0.4144, + "step": 8357 + }, + { + "epoch": 0.96, + "grad_norm": 1.6508439030693616, + "learning_rate": 4.116170592452229e-08, + "loss": 0.4143, + "step": 8358 + }, + { + "epoch": 0.96, + "grad_norm": 3.2723744471142227, + "learning_rate": 4.0923758732254734e-08, + "loss": 0.4128, + "step": 8359 + }, + { + "epoch": 0.96, + "grad_norm": 2.2170461936941415, + "learning_rate": 4.068649847036865e-08, + "loss": 0.6117, + "step": 8360 + }, + { + "epoch": 0.96, + "grad_norm": 1.9644081176828636, + "learning_rate": 4.044992517173052e-08, + "loss": 0.4898, + "step": 8361 + }, + { + "epoch": 0.96, + "grad_norm": 2.1337027713314534, + "learning_rate": 4.021403886910913e-08, + "loss": 0.471, + "step": 8362 + }, + { + "epoch": 0.96, + "grad_norm": 4.767649621758456, + "learning_rate": 3.997883959518001e-08, + "loss": 0.4579, + "step": 8363 + }, + { + "epoch": 0.96, + "grad_norm": 2.0886999861838875, + "learning_rate": 3.9744327382523206e-08, + "loss": 0.6288, + "step": 8364 + }, + { + "epoch": 0.96, + "grad_norm": 1.8882376352265282, + "learning_rate": 3.951050226362274e-08, + "loss": 0.4235, + "step": 8365 + }, + { + "epoch": 0.96, + "grad_norm": 2.789232361878686, + "learning_rate": 3.9277364270868813e-08, + "loss": 0.483, + "step": 8366 + }, + { + "epoch": 0.96, + "grad_norm": 2.406925349270383, + "learning_rate": 3.904491343655503e-08, + "loss": 0.5216, + "step": 8367 + }, + { + "epoch": 0.96, + "grad_norm": 0.8018086552200833, + "learning_rate": 3.8813149792880645e-08, + "loss": 0.6529, + "step": 8368 + }, + { + "epoch": 0.96, + "grad_norm": 1.7943811290522491, + "learning_rate": 3.858207337194886e-08, + "loss": 0.4787, + "step": 8369 + }, + { + "epoch": 0.96, + "grad_norm": 2.187189507261587, + "learning_rate": 3.8351684205769625e-08, + "loss": 0.4737, + "step": 8370 + }, + { + "epoch": 0.96, + "grad_norm": 2.198379545315124, + "learning_rate": 3.812198232625519e-08, + "loss": 0.4239, + "step": 8371 + }, + { + "epoch": 0.96, + "grad_norm": 6.643911876224704, + "learning_rate": 3.789296776522455e-08, + "loss": 0.5484, + "step": 8372 + }, + { + "epoch": 0.96, + "grad_norm": 2.0109929193367604, + "learning_rate": 3.7664640554400664e-08, + "loss": 0.4673, + "step": 8373 + }, + { + "epoch": 0.96, + "grad_norm": 2.8846239807661482, + "learning_rate": 3.743700072541101e-08, + "loss": 0.4271, + "step": 8374 + }, + { + "epoch": 0.96, + "grad_norm": 1.9854813325344618, + "learning_rate": 3.721004830978869e-08, + "loss": 0.4916, + "step": 8375 + }, + { + "epoch": 0.96, + "grad_norm": 2.1284789018538683, + "learning_rate": 3.698378333897079e-08, + "loss": 0.5168, + "step": 8376 + }, + { + "epoch": 0.96, + "grad_norm": 2.151850844664872, + "learning_rate": 3.675820584429945e-08, + "loss": 0.5084, + "step": 8377 + }, + { + "epoch": 0.96, + "grad_norm": 1.6475363117531057, + "learning_rate": 3.653331585702247e-08, + "loss": 0.5225, + "step": 8378 + }, + { + "epoch": 0.96, + "grad_norm": 1.6651793597962228, + "learning_rate": 3.630911340828991e-08, + "loss": 0.5083, + "step": 8379 + }, + { + "epoch": 0.96, + "grad_norm": 2.0134997382854154, + "learning_rate": 3.6085598529159714e-08, + "loss": 0.5645, + "step": 8380 + }, + { + "epoch": 0.96, + "grad_norm": 2.128904667769331, + "learning_rate": 3.586277125059268e-08, + "loss": 0.4944, + "step": 8381 + }, + { + "epoch": 0.96, + "grad_norm": 1.6927035545953664, + "learning_rate": 3.5640631603454653e-08, + "loss": 0.466, + "step": 8382 + }, + { + "epoch": 0.96, + "grad_norm": 1.7987987792779336, + "learning_rate": 3.5419179618516596e-08, + "loss": 0.5473, + "step": 8383 + }, + { + "epoch": 0.96, + "grad_norm": 2.2235500287758265, + "learning_rate": 3.519841532645396e-08, + "loss": 0.524, + "step": 8384 + }, + { + "epoch": 0.96, + "grad_norm": 1.8052836410480801, + "learning_rate": 3.497833875784673e-08, + "loss": 0.4734, + "step": 8385 + }, + { + "epoch": 0.96, + "grad_norm": 2.6543562281863915, + "learning_rate": 3.475894994317996e-08, + "loss": 0.4936, + "step": 8386 + }, + { + "epoch": 0.96, + "grad_norm": 0.7577600683884453, + "learning_rate": 3.45402489128438e-08, + "loss": 0.6546, + "step": 8387 + }, + { + "epoch": 0.96, + "grad_norm": 2.4332431479677776, + "learning_rate": 3.432223569713178e-08, + "loss": 0.552, + "step": 8388 + }, + { + "epoch": 0.96, + "grad_norm": 2.390866864673474, + "learning_rate": 3.410491032624419e-08, + "loss": 0.5447, + "step": 8389 + }, + { + "epoch": 0.96, + "grad_norm": 2.17077927121182, + "learning_rate": 3.388827283028362e-08, + "loss": 0.4603, + "step": 8390 + }, + { + "epoch": 0.96, + "grad_norm": 2.7194717503043204, + "learning_rate": 3.367232323925995e-08, + "loss": 0.4518, + "step": 8391 + }, + { + "epoch": 0.96, + "grad_norm": 1.816602315120856, + "learning_rate": 3.3457061583085374e-08, + "loss": 0.4718, + "step": 8392 + }, + { + "epoch": 0.96, + "grad_norm": 2.8277417618324248, + "learning_rate": 3.32424878915788e-08, + "loss": 0.4985, + "step": 8393 + }, + { + "epoch": 0.96, + "grad_norm": 2.5512317915937124, + "learning_rate": 3.3028602194462575e-08, + "loss": 0.4821, + "step": 8394 + }, + { + "epoch": 0.96, + "grad_norm": 0.8852016333211405, + "learning_rate": 3.281540452136356e-08, + "loss": 0.6756, + "step": 8395 + }, + { + "epoch": 0.96, + "grad_norm": 2.0603564883434085, + "learning_rate": 3.260289490181479e-08, + "loss": 0.5182, + "step": 8396 + }, + { + "epoch": 0.96, + "grad_norm": 2.1124459115636793, + "learning_rate": 3.239107336525271e-08, + "loss": 0.5522, + "step": 8397 + }, + { + "epoch": 0.96, + "grad_norm": 2.2708996199617846, + "learning_rate": 3.217993994101831e-08, + "loss": 0.454, + "step": 8398 + }, + { + "epoch": 0.97, + "grad_norm": 2.2983711027818856, + "learning_rate": 3.1969494658358746e-08, + "loss": 0.5477, + "step": 8399 + }, + { + "epoch": 0.97, + "grad_norm": 2.5255894993535972, + "learning_rate": 3.1759737546423476e-08, + "loss": 0.4896, + "step": 8400 + }, + { + "epoch": 0.97, + "grad_norm": 1.7655659795787515, + "learning_rate": 3.155066863426981e-08, + "loss": 0.5487, + "step": 8401 + }, + { + "epoch": 0.97, + "grad_norm": 2.089434539574228, + "learning_rate": 3.1342287950856256e-08, + "loss": 0.3889, + "step": 8402 + }, + { + "epoch": 0.97, + "grad_norm": 2.000821966318182, + "learning_rate": 3.1134595525048606e-08, + "loss": 0.5534, + "step": 8403 + }, + { + "epoch": 0.97, + "grad_norm": 2.6159325657035253, + "learning_rate": 3.092759138561607e-08, + "loss": 0.4524, + "step": 8404 + }, + { + "epoch": 0.97, + "grad_norm": 1.823958876161883, + "learning_rate": 3.0721275561232946e-08, + "loss": 0.4518, + "step": 8405 + }, + { + "epoch": 0.97, + "grad_norm": 1.7726353103165573, + "learning_rate": 3.0515648080478023e-08, + "loss": 0.4692, + "step": 8406 + }, + { + "epoch": 0.97, + "grad_norm": 3.1349416848488594, + "learning_rate": 3.0310708971834635e-08, + "loss": 0.4878, + "step": 8407 + }, + { + "epoch": 0.97, + "grad_norm": 1.8783516817970018, + "learning_rate": 3.010645826369174e-08, + "loss": 0.4874, + "step": 8408 + }, + { + "epoch": 0.97, + "grad_norm": 4.326212082852649, + "learning_rate": 2.990289598434115e-08, + "loss": 0.4293, + "step": 8409 + }, + { + "epoch": 0.97, + "grad_norm": 2.2717858746221826, + "learning_rate": 2.9700022161980312e-08, + "loss": 0.4886, + "step": 8410 + }, + { + "epoch": 0.97, + "grad_norm": 1.6908264024446573, + "learning_rate": 2.9497836824711192e-08, + "loss": 0.5352, + "step": 8411 + }, + { + "epoch": 0.97, + "grad_norm": 2.8391981957765515, + "learning_rate": 2.929634000054138e-08, + "loss": 0.5575, + "step": 8412 + }, + { + "epoch": 0.97, + "grad_norm": 2.3371403026086774, + "learning_rate": 2.909553171738133e-08, + "loss": 0.4999, + "step": 8413 + }, + { + "epoch": 0.97, + "grad_norm": 1.7504321700164074, + "learning_rate": 2.889541200304713e-08, + "loss": 0.4945, + "step": 8414 + }, + { + "epoch": 0.97, + "grad_norm": 2.1208162869757556, + "learning_rate": 2.8695980885259933e-08, + "loss": 0.4417, + "step": 8415 + }, + { + "epoch": 0.97, + "grad_norm": 2.455106333105743, + "learning_rate": 2.8497238391643756e-08, + "loss": 0.5292, + "step": 8416 + }, + { + "epoch": 0.97, + "grad_norm": 2.5402633986817746, + "learning_rate": 2.8299184549729352e-08, + "loss": 0.4951, + "step": 8417 + }, + { + "epoch": 0.97, + "grad_norm": 0.8239994502568786, + "learning_rate": 2.810181938695089e-08, + "loss": 0.6507, + "step": 8418 + }, + { + "epoch": 0.97, + "grad_norm": 1.6206688399795766, + "learning_rate": 2.7905142930647057e-08, + "loss": 0.5395, + "step": 8419 + }, + { + "epoch": 0.97, + "grad_norm": 1.7201942730462734, + "learning_rate": 2.770915520806161e-08, + "loss": 0.4526, + "step": 8420 + }, + { + "epoch": 0.97, + "grad_norm": 2.2308190204431595, + "learning_rate": 2.751385624634284e-08, + "loss": 0.4954, + "step": 8421 + }, + { + "epoch": 0.97, + "grad_norm": 2.4642404555732598, + "learning_rate": 2.7319246072543548e-08, + "loss": 0.5399, + "step": 8422 + }, + { + "epoch": 0.97, + "grad_norm": 1.937547888831758, + "learning_rate": 2.712532471362106e-08, + "loss": 0.4536, + "step": 8423 + }, + { + "epoch": 0.97, + "grad_norm": 2.23933470458382, + "learning_rate": 2.693209219643722e-08, + "loss": 0.4931, + "step": 8424 + }, + { + "epoch": 0.97, + "grad_norm": 2.3441629634171455, + "learning_rate": 2.6739548547758398e-08, + "loss": 0.4677, + "step": 8425 + }, + { + "epoch": 0.97, + "grad_norm": 2.242565192478841, + "learning_rate": 2.6547693794256037e-08, + "loss": 0.5458, + "step": 8426 + }, + { + "epoch": 0.97, + "grad_norm": 2.0644891175962727, + "learning_rate": 2.635652796250554e-08, + "loss": 0.5168, + "step": 8427 + }, + { + "epoch": 0.97, + "grad_norm": 2.0575660061453633, + "learning_rate": 2.6166051078987954e-08, + "loss": 0.4991, + "step": 8428 + }, + { + "epoch": 0.97, + "grad_norm": 2.691731750873132, + "learning_rate": 2.597626317008717e-08, + "loss": 0.4864, + "step": 8429 + }, + { + "epoch": 0.97, + "grad_norm": 1.9080349391019298, + "learning_rate": 2.5787164262092158e-08, + "loss": 0.5258, + "step": 8430 + }, + { + "epoch": 0.97, + "grad_norm": 2.9531367460445592, + "learning_rate": 2.5598754381198632e-08, + "loss": 0.4467, + "step": 8431 + }, + { + "epoch": 0.97, + "grad_norm": 1.822465814819731, + "learning_rate": 2.5411033553503495e-08, + "loss": 0.4288, + "step": 8432 + }, + { + "epoch": 0.97, + "grad_norm": 2.7390680119366433, + "learning_rate": 2.522400180501039e-08, + "loss": 0.4249, + "step": 8433 + }, + { + "epoch": 0.97, + "grad_norm": 10.257068989349596, + "learning_rate": 2.5037659161626925e-08, + "loss": 0.5719, + "step": 8434 + }, + { + "epoch": 0.97, + "grad_norm": 1.8494445754465583, + "learning_rate": 2.4852005649165235e-08, + "loss": 0.54, + "step": 8435 + }, + { + "epoch": 0.97, + "grad_norm": 1.8188644235731277, + "learning_rate": 2.4667041293341964e-08, + "loss": 0.474, + "step": 8436 + }, + { + "epoch": 0.97, + "grad_norm": 2.142761233091188, + "learning_rate": 2.448276611977829e-08, + "loss": 0.5026, + "step": 8437 + }, + { + "epoch": 0.97, + "grad_norm": 2.160623722539833, + "learning_rate": 2.4299180154000456e-08, + "loss": 0.4253, + "step": 8438 + }, + { + "epoch": 0.97, + "grad_norm": 1.9475239563726947, + "learning_rate": 2.4116283421438126e-08, + "loss": 0.3693, + "step": 8439 + }, + { + "epoch": 0.97, + "grad_norm": 2.6578631355556057, + "learning_rate": 2.3934075947426027e-08, + "loss": 0.4346, + "step": 8440 + }, + { + "epoch": 0.97, + "grad_norm": 2.0678799984307927, + "learning_rate": 2.375255775720453e-08, + "loss": 0.4726, + "step": 8441 + }, + { + "epoch": 0.97, + "grad_norm": 5.20676376509795, + "learning_rate": 2.35717288759163e-08, + "loss": 0.541, + "step": 8442 + }, + { + "epoch": 0.97, + "grad_norm": 1.9981210931030393, + "learning_rate": 2.3391589328610743e-08, + "loss": 0.494, + "step": 8443 + }, + { + "epoch": 0.97, + "grad_norm": 1.5814425396576608, + "learning_rate": 2.3212139140239563e-08, + "loss": 0.4527, + "step": 8444 + }, + { + "epoch": 0.97, + "grad_norm": 2.018066009769416, + "learning_rate": 2.3033378335661217e-08, + "loss": 0.5379, + "step": 8445 + }, + { + "epoch": 0.97, + "grad_norm": 2.175053052839372, + "learning_rate": 2.2855306939637557e-08, + "loss": 0.5147, + "step": 8446 + }, + { + "epoch": 0.97, + "grad_norm": 2.0260251094210053, + "learning_rate": 2.267792497683441e-08, + "loss": 0.4859, + "step": 8447 + }, + { + "epoch": 0.97, + "grad_norm": 1.8713588163861292, + "learning_rate": 2.2501232471822675e-08, + "loss": 0.4937, + "step": 8448 + }, + { + "epoch": 0.97, + "grad_norm": 2.604649603649941, + "learning_rate": 2.232522944907889e-08, + "loss": 0.4605, + "step": 8449 + }, + { + "epoch": 0.97, + "grad_norm": 1.965090530245984, + "learning_rate": 2.2149915932981327e-08, + "loss": 0.3869, + "step": 8450 + }, + { + "epoch": 0.97, + "grad_norm": 2.147430737549078, + "learning_rate": 2.1975291947815558e-08, + "loss": 0.4408, + "step": 8451 + }, + { + "epoch": 0.97, + "grad_norm": 1.8104388605531028, + "learning_rate": 2.180135751777057e-08, + "loss": 0.3794, + "step": 8452 + }, + { + "epoch": 0.97, + "grad_norm": 2.109809351145331, + "learning_rate": 2.162811266693876e-08, + "loss": 0.4269, + "step": 8453 + }, + { + "epoch": 0.97, + "grad_norm": 1.9526464006350546, + "learning_rate": 2.1455557419318705e-08, + "loss": 0.4702, + "step": 8454 + }, + { + "epoch": 0.97, + "grad_norm": 2.8359060230630577, + "learning_rate": 2.1283691798812402e-08, + "loss": 0.4791, + "step": 8455 + }, + { + "epoch": 0.97, + "grad_norm": 2.25065276494736, + "learning_rate": 2.1112515829226908e-08, + "loss": 0.4716, + "step": 8456 + }, + { + "epoch": 0.97, + "grad_norm": 2.5567277534554544, + "learning_rate": 2.0942029534273267e-08, + "loss": 0.4761, + "step": 8457 + }, + { + "epoch": 0.97, + "grad_norm": 1.631247685154881, + "learning_rate": 2.077223293756703e-08, + "loss": 0.4911, + "step": 8458 + }, + { + "epoch": 0.97, + "grad_norm": 2.0323116118688063, + "learning_rate": 2.0603126062629374e-08, + "loss": 0.4584, + "step": 8459 + }, + { + "epoch": 0.97, + "grad_norm": 2.3192401821762623, + "learning_rate": 2.0434708932883796e-08, + "loss": 0.4833, + "step": 8460 + }, + { + "epoch": 0.97, + "grad_norm": 2.680906670255701, + "learning_rate": 2.0266981571659405e-08, + "loss": 0.5055, + "step": 8461 + }, + { + "epoch": 0.97, + "grad_norm": 0.8193953670570945, + "learning_rate": 2.0099944002190953e-08, + "loss": 0.7005, + "step": 8462 + }, + { + "epoch": 0.97, + "grad_norm": 1.8741689641931003, + "learning_rate": 1.9933596247615483e-08, + "loss": 0.5364, + "step": 8463 + }, + { + "epoch": 0.97, + "grad_norm": 2.1983409769176583, + "learning_rate": 1.9767938330975678e-08, + "loss": 0.4177, + "step": 8464 + }, + { + "epoch": 0.97, + "grad_norm": 3.167007146401783, + "learning_rate": 1.960297027521818e-08, + "loss": 0.4965, + "step": 8465 + }, + { + "epoch": 0.97, + "grad_norm": 3.870986641745271, + "learning_rate": 1.9438692103194713e-08, + "loss": 0.4802, + "step": 8466 + }, + { + "epoch": 0.97, + "grad_norm": 2.1619573963761702, + "learning_rate": 1.9275103837660956e-08, + "loss": 0.4939, + "step": 8467 + }, + { + "epoch": 0.97, + "grad_norm": 2.054941735966446, + "learning_rate": 1.9112205501277124e-08, + "loss": 0.5166, + "step": 8468 + }, + { + "epoch": 0.97, + "grad_norm": 2.139724149684847, + "learning_rate": 1.8949997116608498e-08, + "loss": 0.5103, + "step": 8469 + }, + { + "epoch": 0.97, + "grad_norm": 2.349340462580863, + "learning_rate": 1.878847870612266e-08, + "loss": 0.4815, + "step": 8470 + }, + { + "epoch": 0.97, + "grad_norm": 2.851937866277887, + "learning_rate": 1.8627650292194488e-08, + "loss": 0.4975, + "step": 8471 + }, + { + "epoch": 0.97, + "grad_norm": 3.51334280521689, + "learning_rate": 1.8467511897101164e-08, + "loss": 0.4954, + "step": 8472 + }, + { + "epoch": 0.97, + "grad_norm": 2.210048224824802, + "learning_rate": 1.83080635430255e-08, + "loss": 0.5644, + "step": 8473 + }, + { + "epoch": 0.97, + "grad_norm": 2.1642360409715606, + "learning_rate": 1.8149305252053716e-08, + "loss": 0.482, + "step": 8474 + }, + { + "epoch": 0.97, + "grad_norm": 2.696092060331628, + "learning_rate": 1.7991237046177667e-08, + "loss": 0.494, + "step": 8475 + }, + { + "epoch": 0.97, + "grad_norm": 1.9634367009493712, + "learning_rate": 1.783385894729206e-08, + "loss": 0.5477, + "step": 8476 + }, + { + "epoch": 0.97, + "grad_norm": 1.9698477278841708, + "learning_rate": 1.767717097719779e-08, + "loss": 0.5269, + "step": 8477 + }, + { + "epoch": 0.97, + "grad_norm": 2.321785312113312, + "learning_rate": 1.7521173157598603e-08, + "loss": 0.5042, + "step": 8478 + }, + { + "epoch": 0.97, + "grad_norm": 4.765743303210131, + "learning_rate": 1.736586551010333e-08, + "loss": 0.4459, + "step": 8479 + }, + { + "epoch": 0.97, + "grad_norm": 2.1481328335478977, + "learning_rate": 1.7211248056225872e-08, + "loss": 0.4413, + "step": 8480 + }, + { + "epoch": 0.97, + "grad_norm": 2.1950317676732265, + "learning_rate": 1.7057320817382984e-08, + "loss": 0.4531, + "step": 8481 + }, + { + "epoch": 0.97, + "grad_norm": 1.8356280333577921, + "learning_rate": 1.6904083814897054e-08, + "loss": 0.4618, + "step": 8482 + }, + { + "epoch": 0.97, + "grad_norm": 2.075591198535836, + "learning_rate": 1.675153706999444e-08, + "loss": 0.4331, + "step": 8483 + }, + { + "epoch": 0.97, + "grad_norm": 1.9990094901101156, + "learning_rate": 1.6599680603805458e-08, + "loss": 0.4703, + "step": 8484 + }, + { + "epoch": 0.97, + "grad_norm": 2.127273916817755, + "learning_rate": 1.6448514437365503e-08, + "loss": 0.496, + "step": 8485 + }, + { + "epoch": 0.98, + "grad_norm": 1.9145256582137946, + "learning_rate": 1.6298038591614497e-08, + "loss": 0.3906, + "step": 8486 + }, + { + "epoch": 0.98, + "grad_norm": 2.0969988143908243, + "learning_rate": 1.614825308739576e-08, + "loss": 0.4669, + "step": 8487 + }, + { + "epoch": 0.98, + "grad_norm": 1.917974179311906, + "learning_rate": 1.59991579454577e-08, + "loss": 0.416, + "step": 8488 + }, + { + "epoch": 0.98, + "grad_norm": 2.3076442465064444, + "learning_rate": 1.5850753186453238e-08, + "loss": 0.4845, + "step": 8489 + }, + { + "epoch": 0.98, + "grad_norm": 1.6636443427063503, + "learning_rate": 1.5703038830939266e-08, + "loss": 0.4655, + "step": 8490 + }, + { + "epoch": 0.98, + "grad_norm": 2.5570495942017426, + "learning_rate": 1.5556014899376637e-08, + "loss": 0.5515, + "step": 8491 + }, + { + "epoch": 0.98, + "grad_norm": 2.5162814625025276, + "learning_rate": 1.540968141213184e-08, + "loss": 0.4226, + "step": 8492 + }, + { + "epoch": 0.98, + "grad_norm": 2.1397169738795956, + "learning_rate": 1.5264038389474768e-08, + "loss": 0.5577, + "step": 8493 + }, + { + "epoch": 0.98, + "grad_norm": 2.60614032565627, + "learning_rate": 1.5119085851579286e-08, + "loss": 0.4528, + "step": 8494 + }, + { + "epoch": 0.98, + "grad_norm": 3.6643033549567723, + "learning_rate": 1.4974823818524885e-08, + "loss": 0.431, + "step": 8495 + }, + { + "epoch": 0.98, + "grad_norm": 2.2926451754578205, + "learning_rate": 1.4831252310294474e-08, + "loss": 0.5129, + "step": 8496 + }, + { + "epoch": 0.98, + "grad_norm": 2.0152383731863117, + "learning_rate": 1.4688371346775477e-08, + "loss": 0.4879, + "step": 8497 + }, + { + "epoch": 0.98, + "grad_norm": 2.421600297366718, + "learning_rate": 1.4546180947759837e-08, + "loss": 0.4992, + "step": 8498 + }, + { + "epoch": 0.98, + "grad_norm": 2.281790832238897, + "learning_rate": 1.4404681132943465e-08, + "loss": 0.4321, + "step": 8499 + }, + { + "epoch": 0.98, + "grad_norm": 1.7485509574901028, + "learning_rate": 1.4263871921927352e-08, + "loss": 0.5278, + "step": 8500 + }, + { + "epoch": 0.98, + "grad_norm": 3.613976016237742, + "learning_rate": 1.4123753334215895e-08, + "loss": 0.4557, + "step": 8501 + }, + { + "epoch": 0.98, + "grad_norm": 2.4195857646804195, + "learning_rate": 1.3984325389219123e-08, + "loss": 0.4454, + "step": 8502 + }, + { + "epoch": 0.98, + "grad_norm": 3.6673902316032474, + "learning_rate": 1.3845588106249364e-08, + "loss": 0.4604, + "step": 8503 + }, + { + "epoch": 0.98, + "grad_norm": 1.8462693168908872, + "learning_rate": 1.370754150452569e-08, + "loss": 0.4589, + "step": 8504 + }, + { + "epoch": 0.98, + "grad_norm": 2.1850044896747827, + "learning_rate": 1.3570185603168917e-08, + "loss": 0.5954, + "step": 8505 + }, + { + "epoch": 0.98, + "grad_norm": 2.3899324423913835, + "learning_rate": 1.3433520421207158e-08, + "loss": 0.4744, + "step": 8506 + }, + { + "epoch": 0.98, + "grad_norm": 2.0112272781875453, + "learning_rate": 1.3297545977569714e-08, + "loss": 0.5041, + "step": 8507 + }, + { + "epoch": 0.98, + "grad_norm": 1.8851436413315588, + "learning_rate": 1.3162262291093187e-08, + "loss": 0.4849, + "step": 8508 + }, + { + "epoch": 0.98, + "grad_norm": 2.357141443423784, + "learning_rate": 1.3027669380515918e-08, + "loss": 0.4167, + "step": 8509 + }, + { + "epoch": 0.98, + "grad_norm": 1.9113891969660923, + "learning_rate": 1.2893767264482438e-08, + "loss": 0.5183, + "step": 8510 + }, + { + "epoch": 0.98, + "grad_norm": 2.5564075672629807, + "learning_rate": 1.2760555961540133e-08, + "loss": 0.3791, + "step": 8511 + }, + { + "epoch": 0.98, + "grad_norm": 2.7676913062937203, + "learning_rate": 1.2628035490142021e-08, + "loss": 0.4722, + "step": 8512 + }, + { + "epoch": 0.98, + "grad_norm": 2.170345819527668, + "learning_rate": 1.2496205868644529e-08, + "loss": 0.4538, + "step": 8513 + }, + { + "epoch": 0.98, + "grad_norm": 2.316262792178341, + "learning_rate": 1.236506711530916e-08, + "loss": 0.5178, + "step": 8514 + }, + { + "epoch": 0.98, + "grad_norm": 2.002275652729228, + "learning_rate": 1.2234619248300273e-08, + "loss": 0.4668, + "step": 8515 + }, + { + "epoch": 0.98, + "grad_norm": 2.0921247522436763, + "learning_rate": 1.2104862285688413e-08, + "loss": 0.4325, + "step": 8516 + }, + { + "epoch": 0.98, + "grad_norm": 4.982930861805802, + "learning_rate": 1.1975796245446425e-08, + "loss": 0.413, + "step": 8517 + }, + { + "epoch": 0.98, + "grad_norm": 2.9727829082129404, + "learning_rate": 1.1847421145453896e-08, + "loss": 0.5543, + "step": 8518 + }, + { + "epoch": 0.98, + "grad_norm": 2.009434385601694, + "learning_rate": 1.171973700349216e-08, + "loss": 0.5987, + "step": 8519 + }, + { + "epoch": 0.98, + "grad_norm": 1.7292943239194842, + "learning_rate": 1.1592743837248731e-08, + "loss": 0.4451, + "step": 8520 + }, + { + "epoch": 0.98, + "grad_norm": 2.044725409114241, + "learning_rate": 1.1466441664314543e-08, + "loss": 0.4616, + "step": 8521 + }, + { + "epoch": 0.98, + "grad_norm": 2.80333876280949, + "learning_rate": 1.1340830502184486e-08, + "loss": 0.3796, + "step": 8522 + }, + { + "epoch": 0.98, + "grad_norm": 1.833313222283905, + "learning_rate": 1.1215910368258531e-08, + "loss": 0.3819, + "step": 8523 + }, + { + "epoch": 0.98, + "grad_norm": 2.3957836306616564, + "learning_rate": 1.1091681279840616e-08, + "loss": 0.5141, + "step": 8524 + }, + { + "epoch": 0.98, + "grad_norm": 1.7694226944397944, + "learning_rate": 1.0968143254139196e-08, + "loss": 0.4345, + "step": 8525 + }, + { + "epoch": 0.98, + "grad_norm": 2.8815356097743163, + "learning_rate": 1.084529630826614e-08, + "loss": 0.5465, + "step": 8526 + }, + { + "epoch": 0.98, + "grad_norm": 1.8265132012478755, + "learning_rate": 1.072314045923839e-08, + "loss": 0.5159, + "step": 8527 + }, + { + "epoch": 0.98, + "grad_norm": 2.150949089204457, + "learning_rate": 1.0601675723977412e-08, + "loss": 0.5081, + "step": 8528 + }, + { + "epoch": 0.98, + "grad_norm": 2.483612270092897, + "learning_rate": 1.0480902119308078e-08, + "loss": 0.4364, + "step": 8529 + }, + { + "epoch": 0.98, + "grad_norm": 1.752178594072906, + "learning_rate": 1.0360819661959786e-08, + "loss": 0.4205, + "step": 8530 + }, + { + "epoch": 0.98, + "grad_norm": 3.1776093438044084, + "learning_rate": 1.0241428368566453e-08, + "loss": 0.5201, + "step": 8531 + }, + { + "epoch": 0.98, + "grad_norm": 1.8303775928671429, + "learning_rate": 1.0122728255666514e-08, + "loss": 0.5272, + "step": 8532 + }, + { + "epoch": 0.98, + "grad_norm": 1.939028692573307, + "learning_rate": 1.0004719339701818e-08, + "loss": 0.5291, + "step": 8533 + }, + { + "epoch": 0.98, + "grad_norm": 2.449724745337555, + "learning_rate": 9.887401637019289e-09, + "loss": 0.5051, + "step": 8534 + }, + { + "epoch": 0.98, + "grad_norm": 2.2797608774893727, + "learning_rate": 9.770775163869262e-09, + "loss": 0.5102, + "step": 8535 + }, + { + "epoch": 0.98, + "grad_norm": 2.4888090943252643, + "learning_rate": 9.654839936407145e-09, + "loss": 0.5772, + "step": 8536 + }, + { + "epoch": 0.98, + "grad_norm": 5.457495641497708, + "learning_rate": 9.539595970692318e-09, + "loss": 0.4646, + "step": 8537 + }, + { + "epoch": 0.98, + "grad_norm": 1.8701443425665243, + "learning_rate": 9.425043282688118e-09, + "loss": 0.4945, + "step": 8538 + }, + { + "epoch": 0.98, + "grad_norm": 2.029551704944981, + "learning_rate": 9.311181888262965e-09, + "loss": 0.5002, + "step": 8539 + }, + { + "epoch": 0.98, + "grad_norm": 2.383623191173553, + "learning_rate": 9.198011803188134e-09, + "loss": 0.4711, + "step": 8540 + }, + { + "epoch": 0.98, + "grad_norm": 2.2909766674051615, + "learning_rate": 9.085533043140526e-09, + "loss": 0.3959, + "step": 8541 + }, + { + "epoch": 0.98, + "grad_norm": 2.2582704344039635, + "learning_rate": 8.973745623699904e-09, + "loss": 0.4068, + "step": 8542 + }, + { + "epoch": 0.98, + "grad_norm": 2.0371618658084167, + "learning_rate": 8.86264956035221e-09, + "loss": 0.4619, + "step": 8543 + }, + { + "epoch": 0.98, + "grad_norm": 2.1135467765767633, + "learning_rate": 8.752244868485692e-09, + "loss": 0.4541, + "step": 8544 + }, + { + "epoch": 0.98, + "grad_norm": 2.2003478395875833, + "learning_rate": 8.64253156339312e-09, + "loss": 0.5296, + "step": 8545 + }, + { + "epoch": 0.98, + "grad_norm": 1.8541614999106166, + "learning_rate": 8.533509660273443e-09, + "loss": 0.4841, + "step": 8546 + }, + { + "epoch": 0.98, + "grad_norm": 2.3349111512738374, + "learning_rate": 8.425179174226806e-09, + "loss": 0.4507, + "step": 8547 + }, + { + "epoch": 0.98, + "grad_norm": 2.0729501813777516, + "learning_rate": 8.317540120260093e-09, + "loss": 0.5069, + "step": 8548 + }, + { + "epoch": 0.98, + "grad_norm": 3.304912080605692, + "learning_rate": 8.210592513283044e-09, + "loss": 0.4546, + "step": 8549 + }, + { + "epoch": 0.98, + "grad_norm": 1.8076757188099863, + "learning_rate": 8.104336368109922e-09, + "loss": 0.4525, + "step": 8550 + }, + { + "epoch": 0.98, + "grad_norm": 1.8024973954871126, + "learning_rate": 7.998771699459506e-09, + "loss": 0.548, + "step": 8551 + }, + { + "epoch": 0.98, + "grad_norm": 1.897601398837334, + "learning_rate": 7.8938985219551e-09, + "loss": 0.5335, + "step": 8552 + }, + { + "epoch": 0.98, + "grad_norm": 2.0794020608788992, + "learning_rate": 7.789716850122863e-09, + "loss": 0.4895, + "step": 8553 + }, + { + "epoch": 0.98, + "grad_norm": 1.9681372173852025, + "learning_rate": 7.686226698394028e-09, + "loss": 0.4632, + "step": 8554 + }, + { + "epoch": 0.98, + "grad_norm": 1.8034034162526709, + "learning_rate": 7.583428081104905e-09, + "loss": 0.4651, + "step": 8555 + }, + { + "epoch": 0.98, + "grad_norm": 2.1226761239874907, + "learning_rate": 7.481321012494658e-09, + "loss": 0.392, + "step": 8556 + }, + { + "epoch": 0.98, + "grad_norm": 2.2948510184863076, + "learning_rate": 7.37990550670642e-09, + "loss": 0.5451, + "step": 8557 + }, + { + "epoch": 0.98, + "grad_norm": 1.6692194908255125, + "learning_rate": 7.279181577789507e-09, + "loss": 0.4258, + "step": 8558 + }, + { + "epoch": 0.98, + "grad_norm": 2.3350585851063483, + "learning_rate": 7.179149239695538e-09, + "loss": 0.3638, + "step": 8559 + }, + { + "epoch": 0.98, + "grad_norm": 3.419721711929588, + "learning_rate": 7.079808506281205e-09, + "loss": 0.5438, + "step": 8560 + }, + { + "epoch": 0.98, + "grad_norm": 2.375983809543111, + "learning_rate": 6.981159391306613e-09, + "loss": 0.4911, + "step": 8561 + }, + { + "epoch": 0.98, + "grad_norm": 4.02420963976272, + "learning_rate": 6.883201908436943e-09, + "loss": 0.475, + "step": 8562 + }, + { + "epoch": 0.98, + "grad_norm": 0.8510713419896108, + "learning_rate": 6.7859360712418945e-09, + "loss": 0.6603, + "step": 8563 + }, + { + "epoch": 0.98, + "grad_norm": 1.8438141280400084, + "learning_rate": 6.689361893193469e-09, + "loss": 0.4068, + "step": 8564 + }, + { + "epoch": 0.98, + "grad_norm": 2.456464460836008, + "learning_rate": 6.593479387669854e-09, + "loss": 0.4216, + "step": 8565 + }, + { + "epoch": 0.98, + "grad_norm": 1.751682333494253, + "learning_rate": 6.498288567953204e-09, + "loss": 0.4706, + "step": 8566 + }, + { + "epoch": 0.98, + "grad_norm": 1.6375181445623097, + "learning_rate": 6.40378944722797e-09, + "loss": 0.4423, + "step": 8567 + }, + { + "epoch": 0.98, + "grad_norm": 2.0754917796511094, + "learning_rate": 6.309982038585349e-09, + "loss": 0.429, + "step": 8568 + }, + { + "epoch": 0.98, + "grad_norm": 3.3267378812199415, + "learning_rate": 6.2168663550188356e-09, + "loss": 0.4595, + "step": 8569 + }, + { + "epoch": 0.98, + "grad_norm": 3.3000545329164703, + "learning_rate": 6.124442409427001e-09, + "loss": 0.4661, + "step": 8570 + }, + { + "epoch": 0.98, + "grad_norm": 2.595311015917194, + "learning_rate": 6.032710214612936e-09, + "loss": 0.5048, + "step": 8571 + }, + { + "epoch": 0.98, + "grad_norm": 3.000507948445932, + "learning_rate": 5.941669783282589e-09, + "loss": 0.5218, + "step": 8572 + }, + { + "epoch": 0.99, + "grad_norm": 2.2721584672003172, + "learning_rate": 5.851321128046983e-09, + "loss": 0.5126, + "step": 8573 + }, + { + "epoch": 0.99, + "grad_norm": 2.6228493234912813, + "learning_rate": 5.761664261421662e-09, + "loss": 0.4873, + "step": 8574 + }, + { + "epoch": 0.99, + "grad_norm": 3.456458132930476, + "learning_rate": 5.6726991958250265e-09, + "loss": 0.552, + "step": 8575 + }, + { + "epoch": 0.99, + "grad_norm": 2.4854409277569296, + "learning_rate": 5.5844259435816615e-09, + "loss": 0.4871, + "step": 8576 + }, + { + "epoch": 0.99, + "grad_norm": 2.1682074798640367, + "learning_rate": 5.496844516918454e-09, + "loss": 0.5335, + "step": 8577 + }, + { + "epoch": 0.99, + "grad_norm": 1.846728805331797, + "learning_rate": 5.409954927966809e-09, + "loss": 0.5168, + "step": 8578 + }, + { + "epoch": 0.99, + "grad_norm": 1.9658518545458625, + "learning_rate": 5.323757188763768e-09, + "loss": 0.5128, + "step": 8579 + }, + { + "epoch": 0.99, + "grad_norm": 1.682218920076119, + "learning_rate": 5.2382513112481105e-09, + "loss": 0.5317, + "step": 8580 + }, + { + "epoch": 0.99, + "grad_norm": 1.8767494447228479, + "learning_rate": 5.153437307265363e-09, + "loss": 0.539, + "step": 8581 + }, + { + "epoch": 0.99, + "grad_norm": 0.8510255968077208, + "learning_rate": 5.0693151885627955e-09, + "loss": 0.6957, + "step": 8582 + }, + { + "epoch": 0.99, + "grad_norm": 1.8085321647330515, + "learning_rate": 4.985884966793864e-09, + "loss": 0.4895, + "step": 8583 + }, + { + "epoch": 0.99, + "grad_norm": 2.0813163187646797, + "learning_rate": 4.903146653515434e-09, + "loss": 0.5301, + "step": 8584 + }, + { + "epoch": 0.99, + "grad_norm": 0.8625512875989088, + "learning_rate": 4.821100260187228e-09, + "loss": 0.6781, + "step": 8585 + }, + { + "epoch": 0.99, + "grad_norm": 2.8171125373989705, + "learning_rate": 4.739745798175155e-09, + "loss": 0.4937, + "step": 8586 + }, + { + "epoch": 0.99, + "grad_norm": 2.6599400124616426, + "learning_rate": 4.659083278748533e-09, + "loss": 0.3579, + "step": 8587 + }, + { + "epoch": 0.99, + "grad_norm": 2.016290464523796, + "learning_rate": 4.5791127130800916e-09, + "loss": 0.4551, + "step": 8588 + }, + { + "epoch": 0.99, + "grad_norm": 2.2060162517301647, + "learning_rate": 4.499834112248191e-09, + "loss": 0.4552, + "step": 8589 + }, + { + "epoch": 0.99, + "grad_norm": 3.1963431740166874, + "learning_rate": 4.421247487233493e-09, + "loss": 0.5383, + "step": 8590 + }, + { + "epoch": 0.99, + "grad_norm": 2.4032892068817984, + "learning_rate": 4.343352848922844e-09, + "loss": 0.4433, + "step": 8591 + }, + { + "epoch": 0.99, + "grad_norm": 2.005096962045565, + "learning_rate": 4.2661502081053905e-09, + "loss": 0.5012, + "step": 8592 + }, + { + "epoch": 0.99, + "grad_norm": 2.035971664649162, + "learning_rate": 4.189639575475912e-09, + "loss": 0.5677, + "step": 8593 + }, + { + "epoch": 0.99, + "grad_norm": 2.152531079340269, + "learning_rate": 4.1138209616320426e-09, + "loss": 0.5031, + "step": 8594 + }, + { + "epoch": 0.99, + "grad_norm": 2.4093864588153306, + "learning_rate": 4.038694377075936e-09, + "loss": 0.5274, + "step": 8595 + }, + { + "epoch": 0.99, + "grad_norm": 2.5620957703645124, + "learning_rate": 3.964259832215378e-09, + "loss": 0.4775, + "step": 8596 + }, + { + "epoch": 0.99, + "grad_norm": 1.8963137399836518, + "learning_rate": 3.890517337359345e-09, + "loss": 0.4439, + "step": 8597 + }, + { + "epoch": 0.99, + "grad_norm": 2.04489372373147, + "learning_rate": 3.8174669027241095e-09, + "loss": 0.5024, + "step": 8598 + }, + { + "epoch": 0.99, + "grad_norm": 2.49376132145976, + "learning_rate": 3.745108538427688e-09, + "loss": 0.4608, + "step": 8599 + }, + { + "epoch": 0.99, + "grad_norm": 2.1156139131456584, + "learning_rate": 3.673442254493731e-09, + "loss": 0.4705, + "step": 8600 + }, + { + "epoch": 0.99, + "grad_norm": 2.075661739913366, + "learning_rate": 3.6024680608487406e-09, + "loss": 0.526, + "step": 8601 + }, + { + "epoch": 0.99, + "grad_norm": 1.953779536786496, + "learning_rate": 3.532185967324853e-09, + "loss": 0.5343, + "step": 8602 + }, + { + "epoch": 0.99, + "grad_norm": 2.077554738644134, + "learning_rate": 3.4625959836570576e-09, + "loss": 0.4168, + "step": 8603 + }, + { + "epoch": 0.99, + "grad_norm": 2.340869385240241, + "learning_rate": 3.3936981194848674e-09, + "loss": 0.4325, + "step": 8604 + }, + { + "epoch": 0.99, + "grad_norm": 1.8055745131885308, + "learning_rate": 3.3254923843523135e-09, + "loss": 0.4636, + "step": 8605 + }, + { + "epoch": 0.99, + "grad_norm": 1.615977719319574, + "learning_rate": 3.257978787706839e-09, + "loss": 0.4568, + "step": 8606 + }, + { + "epoch": 0.99, + "grad_norm": 2.022503086931025, + "learning_rate": 3.1911573389015183e-09, + "loss": 0.4728, + "step": 8607 + }, + { + "epoch": 0.99, + "grad_norm": 1.8038261347112179, + "learning_rate": 3.1250280471906148e-09, + "loss": 0.5022, + "step": 8608 + }, + { + "epoch": 0.99, + "grad_norm": 1.9005098232792654, + "learning_rate": 3.059590921735689e-09, + "loss": 0.4903, + "step": 8609 + }, + { + "epoch": 0.99, + "grad_norm": 2.7121184255497983, + "learning_rate": 2.994845971601157e-09, + "loss": 0.5057, + "step": 8610 + }, + { + "epoch": 0.99, + "grad_norm": 1.8829936499192288, + "learning_rate": 2.930793205754845e-09, + "loss": 0.4401, + "step": 8611 + }, + { + "epoch": 0.99, + "grad_norm": 2.2458740811960114, + "learning_rate": 2.8674326330691005e-09, + "loss": 0.4826, + "step": 8612 + }, + { + "epoch": 0.99, + "grad_norm": 2.5255337539533813, + "learning_rate": 2.804764262321347e-09, + "loss": 0.3954, + "step": 8613 + }, + { + "epoch": 0.99, + "grad_norm": 2.073872778583016, + "learning_rate": 2.7427881021918624e-09, + "loss": 0.4575, + "step": 8614 + }, + { + "epoch": 0.99, + "grad_norm": 1.7573421244613352, + "learning_rate": 2.6815041612665573e-09, + "loss": 0.4923, + "step": 8615 + }, + { + "epoch": 0.99, + "grad_norm": 2.1072704686075476, + "learning_rate": 2.6209124480330863e-09, + "loss": 0.4179, + "step": 8616 + }, + { + "epoch": 0.99, + "grad_norm": 2.339779565883222, + "learning_rate": 2.5610129708852903e-09, + "loss": 0.467, + "step": 8617 + }, + { + "epoch": 0.99, + "grad_norm": 1.976507025797458, + "learning_rate": 2.501805738120422e-09, + "loss": 0.4968, + "step": 8618 + }, + { + "epoch": 0.99, + "grad_norm": 1.7842495301999894, + "learning_rate": 2.443290757940253e-09, + "loss": 0.4336, + "step": 8619 + }, + { + "epoch": 0.99, + "grad_norm": 2.0513403546478757, + "learning_rate": 2.3854680384494123e-09, + "loss": 0.5684, + "step": 8620 + }, + { + "epoch": 0.99, + "grad_norm": 2.0306113134590316, + "learning_rate": 2.3283375876581583e-09, + "loss": 0.471, + "step": 8621 + }, + { + "epoch": 0.99, + "grad_norm": 1.9422893301731392, + "learning_rate": 2.2718994134796056e-09, + "loss": 0.4196, + "step": 8622 + }, + { + "epoch": 0.99, + "grad_norm": 1.8288777836366124, + "learning_rate": 2.2161535237319453e-09, + "loss": 0.4639, + "step": 8623 + }, + { + "epoch": 0.99, + "grad_norm": 2.0518676659970074, + "learning_rate": 2.1610999261373335e-09, + "loss": 0.5486, + "step": 8624 + }, + { + "epoch": 0.99, + "grad_norm": 2.0143869195850015, + "learning_rate": 2.1067386283213366e-09, + "loss": 0.504, + "step": 8625 + }, + { + "epoch": 0.99, + "grad_norm": 2.1311116119542324, + "learning_rate": 2.0530696378145974e-09, + "loss": 0.5689, + "step": 8626 + }, + { + "epoch": 0.99, + "grad_norm": 2.101204366982583, + "learning_rate": 2.000092962050615e-09, + "loss": 0.6446, + "step": 8627 + }, + { + "epoch": 0.99, + "grad_norm": 2.5468030815408875, + "learning_rate": 1.9478086083679627e-09, + "loss": 0.5374, + "step": 8628 + }, + { + "epoch": 0.99, + "grad_norm": 2.361396273221522, + "learning_rate": 1.8962165840097356e-09, + "loss": 0.5044, + "step": 8629 + }, + { + "epoch": 0.99, + "grad_norm": 1.7665549740058983, + "learning_rate": 1.8453168961213297e-09, + "loss": 0.4604, + "step": 8630 + }, + { + "epoch": 0.99, + "grad_norm": 1.9173112353057813, + "learning_rate": 1.795109551754326e-09, + "loss": 0.4643, + "step": 8631 + }, + { + "epoch": 0.99, + "grad_norm": 1.8942448814934543, + "learning_rate": 1.7455945578626065e-09, + "loss": 0.6472, + "step": 8632 + }, + { + "epoch": 0.99, + "grad_norm": 2.639079680908984, + "learning_rate": 1.6967719213056843e-09, + "loss": 0.4571, + "step": 8633 + }, + { + "epoch": 0.99, + "grad_norm": 1.975131564195945, + "learning_rate": 1.6486416488459279e-09, + "loss": 0.5866, + "step": 8634 + }, + { + "epoch": 0.99, + "grad_norm": 1.8141308878069986, + "learning_rate": 1.6012037471507813e-09, + "loss": 0.4341, + "step": 8635 + }, + { + "epoch": 0.99, + "grad_norm": 2.029796067609798, + "learning_rate": 1.554458222791655e-09, + "loss": 0.4774, + "step": 8636 + }, + { + "epoch": 0.99, + "grad_norm": 2.590206471907889, + "learning_rate": 1.5084050822422591e-09, + "loss": 0.5284, + "step": 8637 + }, + { + "epoch": 0.99, + "grad_norm": 1.9410480869798183, + "learning_rate": 1.4630443318836007e-09, + "loss": 0.4055, + "step": 8638 + }, + { + "epoch": 0.99, + "grad_norm": 1.7601269585994348, + "learning_rate": 1.4183759779978768e-09, + "loss": 0.481, + "step": 8639 + }, + { + "epoch": 0.99, + "grad_norm": 2.083915156751448, + "learning_rate": 1.3744000267729152e-09, + "loss": 0.4667, + "step": 8640 + }, + { + "epoch": 0.99, + "grad_norm": 2.0299591777780366, + "learning_rate": 1.3311164843005098e-09, + "loss": 0.5181, + "step": 8641 + }, + { + "epoch": 0.99, + "grad_norm": 2.8870151749360664, + "learning_rate": 1.2885253565758649e-09, + "loss": 0.5497, + "step": 8642 + }, + { + "epoch": 0.99, + "grad_norm": 2.0456848978654145, + "learning_rate": 1.2466266494987056e-09, + "loss": 0.5793, + "step": 8643 + }, + { + "epoch": 0.99, + "grad_norm": 1.932323656091851, + "learning_rate": 1.205420368873278e-09, + "loss": 0.4592, + "step": 8644 + }, + { + "epoch": 0.99, + "grad_norm": 1.9983708576039143, + "learning_rate": 1.1649065204072385e-09, + "loss": 0.5058, + "step": 8645 + }, + { + "epoch": 0.99, + "grad_norm": 2.364473602530412, + "learning_rate": 1.1250851097122096e-09, + "loss": 0.4603, + "step": 8646 + }, + { + "epoch": 0.99, + "grad_norm": 1.7044245168971675, + "learning_rate": 1.0859561423048892e-09, + "loss": 0.4447, + "step": 8647 + }, + { + "epoch": 0.99, + "grad_norm": 2.4729336592649127, + "learning_rate": 1.0475196236053863e-09, + "loss": 0.5199, + "step": 8648 + }, + { + "epoch": 0.99, + "grad_norm": 0.8966801209688295, + "learning_rate": 1.0097755589372204e-09, + "loss": 0.7169, + "step": 8649 + }, + { + "epoch": 0.99, + "grad_norm": 3.161874786591972, + "learning_rate": 9.727239535289867e-10, + "loss": 0.5136, + "step": 8650 + }, + { + "epoch": 0.99, + "grad_norm": 1.9354472703661458, + "learning_rate": 9.363648125132462e-10, + "loss": 0.5284, + "step": 8651 + }, + { + "epoch": 0.99, + "grad_norm": 1.6412937863737416, + "learning_rate": 9.006981409265258e-10, + "loss": 0.5027, + "step": 8652 + }, + { + "epoch": 0.99, + "grad_norm": 3.2178850678866944, + "learning_rate": 8.657239437087628e-10, + "loss": 0.4669, + "step": 8653 + }, + { + "epoch": 0.99, + "grad_norm": 4.963727970450669, + "learning_rate": 8.314422257055255e-10, + "loss": 0.5062, + "step": 8654 + }, + { + "epoch": 0.99, + "grad_norm": 3.3062782219734808, + "learning_rate": 7.978529916646827e-10, + "loss": 0.4759, + "step": 8655 + }, + { + "epoch": 0.99, + "grad_norm": 1.8441119099354049, + "learning_rate": 7.649562462397342e-10, + "loss": 0.383, + "step": 8656 + }, + { + "epoch": 0.99, + "grad_norm": 6.449801381717954, + "learning_rate": 7.327519939870353e-10, + "loss": 0.4437, + "step": 8657 + }, + { + "epoch": 0.99, + "grad_norm": 2.127470098160178, + "learning_rate": 7.012402393674622e-10, + "loss": 0.4213, + "step": 8658 + }, + { + "epoch": 0.99, + "grad_norm": 2.183668209647357, + "learning_rate": 6.704209867464117e-10, + "loss": 0.4882, + "step": 8659 + }, + { + "epoch": 1.0, + "grad_norm": 6.789568882209908, + "learning_rate": 6.402942403926915e-10, + "loss": 0.5107, + "step": 8660 + }, + { + "epoch": 1.0, + "grad_norm": 0.9040750708214508, + "learning_rate": 6.108600044796298e-10, + "loss": 0.6958, + "step": 8661 + }, + { + "epoch": 1.0, + "grad_norm": 2.732296919489568, + "learning_rate": 5.821182830839655e-10, + "loss": 0.4526, + "step": 8662 + }, + { + "epoch": 1.0, + "grad_norm": 1.9694283481010917, + "learning_rate": 5.540690801875137e-10, + "loss": 0.6215, + "step": 8663 + }, + { + "epoch": 1.0, + "grad_norm": 1.8392061804814037, + "learning_rate": 5.267123996754997e-10, + "loss": 0.399, + "step": 8664 + }, + { + "epoch": 1.0, + "grad_norm": 1.9209099817699458, + "learning_rate": 5.000482453376698e-10, + "loss": 0.4623, + "step": 8665 + }, + { + "epoch": 1.0, + "grad_norm": 2.402210225928313, + "learning_rate": 4.740766208666259e-10, + "loss": 0.4203, + "step": 8666 + }, + { + "epoch": 1.0, + "grad_norm": 1.9846695534012289, + "learning_rate": 4.487975298606007e-10, + "loss": 0.5537, + "step": 8667 + }, + { + "epoch": 1.0, + "grad_norm": 2.186052721466957, + "learning_rate": 4.242109758217927e-10, + "loss": 0.4282, + "step": 8668 + }, + { + "epoch": 1.0, + "grad_norm": 3.9878835013958995, + "learning_rate": 4.0031696215470093e-10, + "loss": 0.4387, + "step": 8669 + }, + { + "epoch": 1.0, + "grad_norm": 1.9925442771888353, + "learning_rate": 3.771154921700104e-10, + "loss": 0.4824, + "step": 8670 + }, + { + "epoch": 1.0, + "grad_norm": 2.021271178237182, + "learning_rate": 3.5460656908126166e-10, + "loss": 0.4705, + "step": 8671 + }, + { + "epoch": 1.0, + "grad_norm": 2.3519819289165538, + "learning_rate": 3.327901960065161e-10, + "loss": 0.5725, + "step": 8672 + }, + { + "epoch": 1.0, + "grad_norm": 2.2437279941984687, + "learning_rate": 3.116663759678007e-10, + "loss": 0.4309, + "step": 8673 + }, + { + "epoch": 1.0, + "grad_norm": 2.1463365593084553, + "learning_rate": 2.9123511189110833e-10, + "loss": 0.5002, + "step": 8674 + }, + { + "epoch": 1.0, + "grad_norm": 2.106139316077505, + "learning_rate": 2.714964066063974e-10, + "loss": 0.5088, + "step": 8675 + }, + { + "epoch": 1.0, + "grad_norm": 2.2066937703640463, + "learning_rate": 2.524502628475922e-10, + "loss": 0.55, + "step": 8676 + }, + { + "epoch": 1.0, + "grad_norm": 1.9069622265745285, + "learning_rate": 2.3409668325424795e-10, + "loss": 0.4362, + "step": 8677 + }, + { + "epoch": 1.0, + "grad_norm": 1.8819084356917204, + "learning_rate": 2.1643567036711e-10, + "loss": 0.4415, + "step": 8678 + }, + { + "epoch": 1.0, + "grad_norm": 4.715773637521027, + "learning_rate": 1.9946722663366502e-10, + "loss": 0.4657, + "step": 8679 + }, + { + "epoch": 1.0, + "grad_norm": 0.7911372347873028, + "learning_rate": 1.8319135440425517e-10, + "loss": 0.6461, + "step": 8680 + }, + { + "epoch": 1.0, + "grad_norm": 2.4677066927749842, + "learning_rate": 1.676080559326332e-10, + "loss": 0.4878, + "step": 8681 + }, + { + "epoch": 1.0, + "grad_norm": 2.3928213531300107, + "learning_rate": 1.5271733337818284e-10, + "loss": 0.5157, + "step": 8682 + }, + { + "epoch": 1.0, + "grad_norm": 2.4463703707380082, + "learning_rate": 1.3851918880369853e-10, + "loss": 0.4726, + "step": 8683 + }, + { + "epoch": 1.0, + "grad_norm": 2.416832537824428, + "learning_rate": 1.250136241748301e-10, + "loss": 0.4753, + "step": 8684 + }, + { + "epoch": 1.0, + "grad_norm": 1.9745248537301268, + "learning_rate": 1.1220064136341358e-10, + "loss": 0.5421, + "step": 8685 + }, + { + "epoch": 1.0, + "grad_norm": 2.559466389901249, + "learning_rate": 1.0008024214414048e-10, + "loss": 0.4686, + "step": 8686 + }, + { + "epoch": 1.0, + "grad_norm": 2.4318864684333654, + "learning_rate": 8.865242819566799e-11, + "loss": 0.4927, + "step": 8687 + }, + { + "epoch": 1.0, + "grad_norm": 1.9268296621983754, + "learning_rate": 7.791720110117417e-11, + "loss": 0.5602, + "step": 8688 + }, + { + "epoch": 1.0, + "grad_norm": 2.861678464172052, + "learning_rate": 6.787456234724765e-11, + "loss": 0.5594, + "step": 8689 + }, + { + "epoch": 1.0, + "grad_norm": 3.290861302156981, + "learning_rate": 5.852451332555298e-11, + "loss": 0.4977, + "step": 8690 + }, + { + "epoch": 1.0, + "grad_norm": 2.055235885134363, + "learning_rate": 4.98670553306102e-11, + "loss": 0.5084, + "step": 8691 + }, + { + "epoch": 1.0, + "grad_norm": 1.7596783179811035, + "learning_rate": 4.1902189562570416e-11, + "loss": 0.4557, + "step": 8692 + }, + { + "epoch": 1.0, + "grad_norm": 6.408393892082615, + "learning_rate": 3.462991712388508e-11, + "loss": 0.575, + "step": 8693 + }, + { + "epoch": 1.0, + "grad_norm": 2.2439517753114413, + "learning_rate": 2.8050239022636704e-11, + "loss": 0.5415, + "step": 8694 + }, + { + "epoch": 1.0, + "grad_norm": 2.2654484288330323, + "learning_rate": 2.2163156169208167e-11, + "loss": 0.498, + "step": 8695 + }, + { + "epoch": 1.0, + "grad_norm": 0.8332978729276281, + "learning_rate": 1.69686693801685e-11, + "loss": 0.6542, + "step": 8696 + }, + { + "epoch": 1.0, + "grad_norm": 2.189068784614712, + "learning_rate": 1.2466779374942228e-11, + "loss": 0.4562, + "step": 8697 + }, + { + "epoch": 1.0, + "grad_norm": 2.9263971122915615, + "learning_rate": 8.657486776364465e-12, + "loss": 0.4561, + "step": 8698 + }, + { + "epoch": 1.0, + "grad_norm": 2.4445107736609613, + "learning_rate": 5.5407921123462606e-12, + "loss": 0.4447, + "step": 8699 + }, + { + "epoch": 1.0, + "grad_norm": 6.923642959262328, + "learning_rate": 3.1166958153194813e-12, + "loss": 0.537, + "step": 8700 + }, + { + "epoch": 1.0, + "grad_norm": 1.9357179395376982, + "learning_rate": 1.3851982200163705e-12, + "loss": 0.425, + "step": 8701 + }, + { + "epoch": 1.0, + "grad_norm": 1.621611557708318, + "learning_rate": 3.4629956680021226e-13, + "loss": 0.4618, + "step": 8702 + }, + { + "epoch": 1.0, + "grad_norm": 1.3558951288903296, + "learning_rate": 0.0, + "loss": 0.6268, + "step": 8703 + }, + { + "epoch": 1.0, + "step": 8703, + "total_flos": 2814559681249280.0, + "train_loss": 0.5202090283186973, + "train_runtime": 33769.6613, + "train_samples_per_second": 32.988, + "train_steps_per_second": 0.258 + } + ], + "logging_steps": 1.0, + "max_steps": 8703, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 871, + "total_flos": 2814559681249280.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}