diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,66565 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.222222222222222, + "eval_steps": 2250, + "global_step": 9500, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00044444444444444447, + "grad_norm": 0.7025980353355408, + "learning_rate": 2e-05, + "loss": 3.0289, + "step": 1 + }, + { + "epoch": 0.0008888888888888889, + "grad_norm": 0.8140734434127808, + "learning_rate": 4e-05, + "loss": 3.1428, + "step": 2 + }, + { + "epoch": 0.0013333333333333333, + "grad_norm": 0.8431426286697388, + "learning_rate": 6e-05, + "loss": 2.759, + "step": 3 + }, + { + "epoch": 0.0017777777777777779, + "grad_norm": 0.8204770088195801, + "learning_rate": 8e-05, + "loss": 3.0823, + "step": 4 + }, + { + "epoch": 0.0022222222222222222, + "grad_norm": 0.7714145183563232, + "learning_rate": 0.0001, + "loss": 2.9085, + "step": 5 + }, + { + "epoch": 0.0026666666666666666, + "grad_norm": 0.8233251571655273, + "learning_rate": 0.00012, + "loss": 2.926, + "step": 6 + }, + { + "epoch": 0.003111111111111111, + "grad_norm": 0.919024646282196, + "learning_rate": 0.00014, + "loss": 2.9215, + "step": 7 + }, + { + "epoch": 0.0035555555555555557, + "grad_norm": 0.996406078338623, + "learning_rate": 0.00016, + "loss": 2.6709, + "step": 8 + }, + { + "epoch": 0.004, + "grad_norm": 1.156698226928711, + "learning_rate": 0.00018, + "loss": 2.936, + "step": 9 + }, + { + "epoch": 0.0044444444444444444, + "grad_norm": 1.6368826627731323, + "learning_rate": 0.0002, + "loss": 3.0452, + "step": 10 + }, + { + "epoch": 0.004888888888888889, + "grad_norm": 1.3638646602630615, + "learning_rate": 0.00019998220640569397, + "loss": 2.7221, + "step": 11 + }, + { + "epoch": 0.005333333333333333, + "grad_norm": 1.2367130517959595, + "learning_rate": 0.00019996441281138792, + "loss": 3.1021, + "step": 12 + }, + { + "epoch": 0.0057777777777777775, + "grad_norm": 1.1429595947265625, + "learning_rate": 0.00019994661921708185, + "loss": 2.5428, + "step": 13 + }, + { + "epoch": 0.006222222222222222, + "grad_norm": 1.3077034950256348, + "learning_rate": 0.0001999288256227758, + "loss": 2.7311, + "step": 14 + }, + { + "epoch": 0.006666666666666667, + "grad_norm": 1.2416568994522095, + "learning_rate": 0.00019991103202846976, + "loss": 2.3485, + "step": 15 + }, + { + "epoch": 0.0071111111111111115, + "grad_norm": 1.2917921543121338, + "learning_rate": 0.00019989323843416372, + "loss": 2.6843, + "step": 16 + }, + { + "epoch": 0.007555555555555556, + "grad_norm": 1.388919711112976, + "learning_rate": 0.00019987544483985765, + "loss": 2.5847, + "step": 17 + }, + { + "epoch": 0.008, + "grad_norm": 1.4038573503494263, + "learning_rate": 0.0001998576512455516, + "loss": 2.8317, + "step": 18 + }, + { + "epoch": 0.008444444444444444, + "grad_norm": 1.3480195999145508, + "learning_rate": 0.00019983985765124556, + "loss": 2.9627, + "step": 19 + }, + { + "epoch": 0.008888888888888889, + "grad_norm": 1.1358228921890259, + "learning_rate": 0.0001998220640569395, + "loss": 2.6335, + "step": 20 + }, + { + "epoch": 0.009333333333333334, + "grad_norm": 1.3050692081451416, + "learning_rate": 0.00019980427046263345, + "loss": 2.7682, + "step": 21 + }, + { + "epoch": 0.009777777777777778, + "grad_norm": 1.1249107122421265, + "learning_rate": 0.0001997864768683274, + "loss": 2.5659, + "step": 22 + }, + { + "epoch": 0.010222222222222223, + 
"grad_norm": 2.0670816898345947, + "learning_rate": 0.00019976868327402136, + "loss": 3.6833, + "step": 23 + }, + { + "epoch": 0.010666666666666666, + "grad_norm": 1.3983336687088013, + "learning_rate": 0.00019975088967971532, + "loss": 2.2004, + "step": 24 + }, + { + "epoch": 0.011111111111111112, + "grad_norm": 1.2295548915863037, + "learning_rate": 0.00019973309608540928, + "loss": 2.5752, + "step": 25 + }, + { + "epoch": 0.011555555555555555, + "grad_norm": 1.4657237529754639, + "learning_rate": 0.0001997153024911032, + "loss": 2.5437, + "step": 26 + }, + { + "epoch": 0.012, + "grad_norm": 1.2808438539505005, + "learning_rate": 0.00019969750889679716, + "loss": 2.5708, + "step": 27 + }, + { + "epoch": 0.012444444444444444, + "grad_norm": 1.811769723892212, + "learning_rate": 0.00019967971530249112, + "loss": 2.7687, + "step": 28 + }, + { + "epoch": 0.012888888888888889, + "grad_norm": 1.4724963903427124, + "learning_rate": 0.00019966192170818507, + "loss": 2.4946, + "step": 29 + }, + { + "epoch": 0.013333333333333334, + "grad_norm": 1.31075119972229, + "learning_rate": 0.000199644128113879, + "loss": 2.2154, + "step": 30 + }, + { + "epoch": 0.013777777777777778, + "grad_norm": 1.5601913928985596, + "learning_rate": 0.00019962633451957296, + "loss": 2.9217, + "step": 31 + }, + { + "epoch": 0.014222222222222223, + "grad_norm": 1.3747111558914185, + "learning_rate": 0.00019960854092526692, + "loss": 2.921, + "step": 32 + }, + { + "epoch": 0.014666666666666666, + "grad_norm": 1.3256399631500244, + "learning_rate": 0.00019959074733096085, + "loss": 2.664, + "step": 33 + }, + { + "epoch": 0.015111111111111112, + "grad_norm": 1.421920657157898, + "learning_rate": 0.0001995729537366548, + "loss": 2.4761, + "step": 34 + }, + { + "epoch": 0.015555555555555555, + "grad_norm": 1.6060773134231567, + "learning_rate": 0.00019955516014234876, + "loss": 2.585, + "step": 35 + }, + { + "epoch": 0.016, + "grad_norm": 1.2819411754608154, + "learning_rate": 0.00019953736654804272, + "loss": 2.4084, + "step": 36 + }, + { + "epoch": 0.016444444444444446, + "grad_norm": 1.6595838069915771, + "learning_rate": 0.00019951957295373667, + "loss": 2.9492, + "step": 37 + }, + { + "epoch": 0.016888888888888887, + "grad_norm": 1.7327772378921509, + "learning_rate": 0.00019950177935943063, + "loss": 2.4173, + "step": 38 + }, + { + "epoch": 0.017333333333333333, + "grad_norm": 1.7617017030715942, + "learning_rate": 0.00019948398576512456, + "loss": 2.0839, + "step": 39 + }, + { + "epoch": 0.017777777777777778, + "grad_norm": 1.677414894104004, + "learning_rate": 0.00019946619217081851, + "loss": 2.3126, + "step": 40 + }, + { + "epoch": 0.018222222222222223, + "grad_norm": 1.3741059303283691, + "learning_rate": 0.00019944839857651247, + "loss": 2.5466, + "step": 41 + }, + { + "epoch": 0.018666666666666668, + "grad_norm": 1.7205222845077515, + "learning_rate": 0.00019943060498220643, + "loss": 2.7476, + "step": 42 + }, + { + "epoch": 0.01911111111111111, + "grad_norm": 1.9464102983474731, + "learning_rate": 0.00019941281138790036, + "loss": 2.8767, + "step": 43 + }, + { + "epoch": 0.019555555555555555, + "grad_norm": 1.5541801452636719, + "learning_rate": 0.00019939501779359431, + "loss": 2.4521, + "step": 44 + }, + { + "epoch": 0.02, + "grad_norm": 1.6664501428604126, + "learning_rate": 0.00019937722419928827, + "loss": 2.1682, + "step": 45 + }, + { + "epoch": 0.020444444444444446, + "grad_norm": 1.555294394493103, + "learning_rate": 0.0001993594306049822, + "loss": 2.6263, + "step": 46 + }, + { + "epoch": 
0.020888888888888887, + "grad_norm": 1.8338146209716797, + "learning_rate": 0.00019934163701067616, + "loss": 2.6912, + "step": 47 + }, + { + "epoch": 0.021333333333333333, + "grad_norm": 1.5415942668914795, + "learning_rate": 0.0001993238434163701, + "loss": 2.5132, + "step": 48 + }, + { + "epoch": 0.021777777777777778, + "grad_norm": 1.7458958625793457, + "learning_rate": 0.00019930604982206407, + "loss": 2.6587, + "step": 49 + }, + { + "epoch": 0.022222222222222223, + "grad_norm": 5.483662128448486, + "learning_rate": 0.00019928825622775803, + "loss": 2.7683, + "step": 50 + }, + { + "epoch": 0.02266666666666667, + "grad_norm": 0.7813481688499451, + "learning_rate": 0.00019927046263345198, + "loss": 2.3085, + "step": 51 + }, + { + "epoch": 0.02311111111111111, + "grad_norm": 0.9960127472877502, + "learning_rate": 0.0001992526690391459, + "loss": 2.4272, + "step": 52 + }, + { + "epoch": 0.023555555555555555, + "grad_norm": 1.1653705835342407, + "learning_rate": 0.00019923487544483987, + "loss": 2.7689, + "step": 53 + }, + { + "epoch": 0.024, + "grad_norm": 1.2431868314743042, + "learning_rate": 0.00019921708185053382, + "loss": 2.7339, + "step": 54 + }, + { + "epoch": 0.024444444444444446, + "grad_norm": 1.22008216381073, + "learning_rate": 0.00019919928825622778, + "loss": 2.4097, + "step": 55 + }, + { + "epoch": 0.024888888888888887, + "grad_norm": 1.1046444177627563, + "learning_rate": 0.0001991814946619217, + "loss": 2.5895, + "step": 56 + }, + { + "epoch": 0.025333333333333333, + "grad_norm": 1.812303066253662, + "learning_rate": 0.00019916370106761567, + "loss": 2.7829, + "step": 57 + }, + { + "epoch": 0.025777777777777778, + "grad_norm": 1.241114854812622, + "learning_rate": 0.00019914590747330962, + "loss": 2.4605, + "step": 58 + }, + { + "epoch": 0.026222222222222223, + "grad_norm": 1.2486529350280762, + "learning_rate": 0.00019912811387900355, + "loss": 2.5066, + "step": 59 + }, + { + "epoch": 0.02666666666666667, + "grad_norm": 1.2300881147384644, + "learning_rate": 0.0001991103202846975, + "loss": 2.676, + "step": 60 + }, + { + "epoch": 0.02711111111111111, + "grad_norm": 1.840987205505371, + "learning_rate": 0.00019909252669039147, + "loss": 2.5316, + "step": 61 + }, + { + "epoch": 0.027555555555555555, + "grad_norm": 1.1190531253814697, + "learning_rate": 0.00019907473309608542, + "loss": 2.3772, + "step": 62 + }, + { + "epoch": 0.028, + "grad_norm": 1.255723476409912, + "learning_rate": 0.00019905693950177938, + "loss": 2.4208, + "step": 63 + }, + { + "epoch": 0.028444444444444446, + "grad_norm": 1.0897091627120972, + "learning_rate": 0.00019903914590747334, + "loss": 2.5596, + "step": 64 + }, + { + "epoch": 0.028888888888888888, + "grad_norm": 1.1145886182785034, + "learning_rate": 0.00019902135231316726, + "loss": 2.428, + "step": 65 + }, + { + "epoch": 0.029333333333333333, + "grad_norm": 1.608787178993225, + "learning_rate": 0.00019900355871886122, + "loss": 2.956, + "step": 66 + }, + { + "epoch": 0.029777777777777778, + "grad_norm": 1.2278952598571777, + "learning_rate": 0.00019898576512455518, + "loss": 2.9204, + "step": 67 + }, + { + "epoch": 0.030222222222222223, + "grad_norm": 1.2748639583587646, + "learning_rate": 0.00019896797153024913, + "loss": 2.8291, + "step": 68 + }, + { + "epoch": 0.030666666666666665, + "grad_norm": 1.2492533922195435, + "learning_rate": 0.00019895017793594306, + "loss": 2.8622, + "step": 69 + }, + { + "epoch": 0.03111111111111111, + "grad_norm": 1.2926253080368042, + "learning_rate": 0.00019893238434163702, + "loss": 2.7478, + "step": 70 
+ }, + { + "epoch": 0.03155555555555556, + "grad_norm": 1.654407024383545, + "learning_rate": 0.00019891459074733098, + "loss": 2.4137, + "step": 71 + }, + { + "epoch": 0.032, + "grad_norm": 1.4059809446334839, + "learning_rate": 0.0001988967971530249, + "loss": 3.0662, + "step": 72 + }, + { + "epoch": 0.03244444444444444, + "grad_norm": 1.5277940034866333, + "learning_rate": 0.00019887900355871886, + "loss": 2.4363, + "step": 73 + }, + { + "epoch": 0.03288888888888889, + "grad_norm": 1.141005277633667, + "learning_rate": 0.00019886120996441282, + "loss": 2.4846, + "step": 74 + }, + { + "epoch": 0.03333333333333333, + "grad_norm": 1.2516539096832275, + "learning_rate": 0.00019884341637010678, + "loss": 2.3899, + "step": 75 + }, + { + "epoch": 0.033777777777777775, + "grad_norm": 1.2361774444580078, + "learning_rate": 0.00019882562277580073, + "loss": 2.4988, + "step": 76 + }, + { + "epoch": 0.03422222222222222, + "grad_norm": 1.2464451789855957, + "learning_rate": 0.0001988078291814947, + "loss": 2.4915, + "step": 77 + }, + { + "epoch": 0.034666666666666665, + "grad_norm": 1.488785982131958, + "learning_rate": 0.00019879003558718862, + "loss": 2.6796, + "step": 78 + }, + { + "epoch": 0.035111111111111114, + "grad_norm": 1.3093085289001465, + "learning_rate": 0.00019877224199288257, + "loss": 2.6892, + "step": 79 + }, + { + "epoch": 0.035555555555555556, + "grad_norm": 1.1957430839538574, + "learning_rate": 0.00019875444839857653, + "loss": 2.5761, + "step": 80 + }, + { + "epoch": 0.036, + "grad_norm": 1.3030658960342407, + "learning_rate": 0.00019873665480427046, + "loss": 2.6818, + "step": 81 + }, + { + "epoch": 0.036444444444444446, + "grad_norm": 1.2497376203536987, + "learning_rate": 0.00019871886120996442, + "loss": 2.2396, + "step": 82 + }, + { + "epoch": 0.03688888888888889, + "grad_norm": 1.1968713998794556, + "learning_rate": 0.00019870106761565837, + "loss": 2.6936, + "step": 83 + }, + { + "epoch": 0.037333333333333336, + "grad_norm": 1.5414577722549438, + "learning_rate": 0.00019868327402135233, + "loss": 2.4236, + "step": 84 + }, + { + "epoch": 0.03777777777777778, + "grad_norm": 1.4061691761016846, + "learning_rate": 0.00019866548042704626, + "loss": 2.0842, + "step": 85 + }, + { + "epoch": 0.03822222222222222, + "grad_norm": 1.3697423934936523, + "learning_rate": 0.00019864768683274022, + "loss": 2.6033, + "step": 86 + }, + { + "epoch": 0.03866666666666667, + "grad_norm": 1.4249686002731323, + "learning_rate": 0.00019862989323843417, + "loss": 2.3001, + "step": 87 + }, + { + "epoch": 0.03911111111111111, + "grad_norm": 1.449453592300415, + "learning_rate": 0.00019861209964412813, + "loss": 2.7934, + "step": 88 + }, + { + "epoch": 0.03955555555555555, + "grad_norm": 1.6140450239181519, + "learning_rate": 0.00019859430604982209, + "loss": 2.9546, + "step": 89 + }, + { + "epoch": 0.04, + "grad_norm": 1.3802794218063354, + "learning_rate": 0.00019857651245551604, + "loss": 2.6017, + "step": 90 + }, + { + "epoch": 0.04044444444444444, + "grad_norm": 1.4572910070419312, + "learning_rate": 0.00019855871886120997, + "loss": 2.9772, + "step": 91 + }, + { + "epoch": 0.04088888888888889, + "grad_norm": 1.6315029859542847, + "learning_rate": 0.00019854092526690393, + "loss": 3.0303, + "step": 92 + }, + { + "epoch": 0.04133333333333333, + "grad_norm": 1.6254206895828247, + "learning_rate": 0.00019852313167259788, + "loss": 2.9715, + "step": 93 + }, + { + "epoch": 0.041777777777777775, + "grad_norm": 1.2871061563491821, + "learning_rate": 0.0001985053380782918, + "loss": 2.3368, + "step": 
94 + }, + { + "epoch": 0.042222222222222223, + "grad_norm": 1.4380096197128296, + "learning_rate": 0.00019848754448398577, + "loss": 2.5723, + "step": 95 + }, + { + "epoch": 0.042666666666666665, + "grad_norm": 1.5760232210159302, + "learning_rate": 0.00019846975088967973, + "loss": 2.5501, + "step": 96 + }, + { + "epoch": 0.043111111111111114, + "grad_norm": 1.6527888774871826, + "learning_rate": 0.00019845195729537368, + "loss": 2.6406, + "step": 97 + }, + { + "epoch": 0.043555555555555556, + "grad_norm": 1.8810604810714722, + "learning_rate": 0.0001984341637010676, + "loss": 2.8451, + "step": 98 + }, + { + "epoch": 0.044, + "grad_norm": 2.4290010929107666, + "learning_rate": 0.00019841637010676157, + "loss": 2.3485, + "step": 99 + }, + { + "epoch": 0.044444444444444446, + "grad_norm": 2.3675355911254883, + "learning_rate": 0.00019839857651245553, + "loss": 2.8247, + "step": 100 + }, + { + "epoch": 0.04488888888888889, + "grad_norm": 0.7726467847824097, + "learning_rate": 0.00019838078291814948, + "loss": 2.5495, + "step": 101 + }, + { + "epoch": 0.04533333333333334, + "grad_norm": 1.0265134572982788, + "learning_rate": 0.00019836298932384344, + "loss": 2.7777, + "step": 102 + }, + { + "epoch": 0.04577777777777778, + "grad_norm": 0.9280586838722229, + "learning_rate": 0.00019834519572953737, + "loss": 2.8585, + "step": 103 + }, + { + "epoch": 0.04622222222222222, + "grad_norm": 0.897459089756012, + "learning_rate": 0.00019832740213523132, + "loss": 2.5307, + "step": 104 + }, + { + "epoch": 0.04666666666666667, + "grad_norm": 1.299446702003479, + "learning_rate": 0.00019830960854092528, + "loss": 2.6633, + "step": 105 + }, + { + "epoch": 0.04711111111111111, + "grad_norm": 1.0051794052124023, + "learning_rate": 0.00019829181494661924, + "loss": 2.714, + "step": 106 + }, + { + "epoch": 0.04755555555555555, + "grad_norm": 1.096691370010376, + "learning_rate": 0.00019827402135231317, + "loss": 2.5904, + "step": 107 + }, + { + "epoch": 0.048, + "grad_norm": 0.9809961318969727, + "learning_rate": 0.00019825622775800712, + "loss": 2.7201, + "step": 108 + }, + { + "epoch": 0.04844444444444444, + "grad_norm": 1.2203365564346313, + "learning_rate": 0.00019823843416370108, + "loss": 2.7113, + "step": 109 + }, + { + "epoch": 0.04888888888888889, + "grad_norm": 1.2605012655258179, + "learning_rate": 0.000198220640569395, + "loss": 2.7018, + "step": 110 + }, + { + "epoch": 0.04933333333333333, + "grad_norm": 1.0020304918289185, + "learning_rate": 0.00019820284697508897, + "loss": 2.4043, + "step": 111 + }, + { + "epoch": 0.049777777777777775, + "grad_norm": 0.9287395477294922, + "learning_rate": 0.00019818505338078292, + "loss": 2.1282, + "step": 112 + }, + { + "epoch": 0.050222222222222224, + "grad_norm": 1.0499564409255981, + "learning_rate": 0.00019816725978647688, + "loss": 2.724, + "step": 113 + }, + { + "epoch": 0.050666666666666665, + "grad_norm": 0.9938886165618896, + "learning_rate": 0.00019814946619217083, + "loss": 2.8035, + "step": 114 + }, + { + "epoch": 0.051111111111111114, + "grad_norm": 0.9068772196769714, + "learning_rate": 0.0001981316725978648, + "loss": 2.2173, + "step": 115 + }, + { + "epoch": 0.051555555555555556, + "grad_norm": 0.9217369556427002, + "learning_rate": 0.00019811387900355872, + "loss": 2.266, + "step": 116 + }, + { + "epoch": 0.052, + "grad_norm": 0.9447048306465149, + "learning_rate": 0.00019809608540925268, + "loss": 2.5042, + "step": 117 + }, + { + "epoch": 0.052444444444444446, + "grad_norm": 1.1115142107009888, + "learning_rate": 0.00019807829181494663, + 
"loss": 2.9463, + "step": 118 + }, + { + "epoch": 0.05288888888888889, + "grad_norm": 1.0305101871490479, + "learning_rate": 0.0001980604982206406, + "loss": 2.5201, + "step": 119 + }, + { + "epoch": 0.05333333333333334, + "grad_norm": 1.2101026773452759, + "learning_rate": 0.00019804270462633452, + "loss": 2.6229, + "step": 120 + }, + { + "epoch": 0.05377777777777778, + "grad_norm": 1.163856029510498, + "learning_rate": 0.00019802491103202848, + "loss": 2.8074, + "step": 121 + }, + { + "epoch": 0.05422222222222222, + "grad_norm": 1.2083830833435059, + "learning_rate": 0.00019800711743772243, + "loss": 2.7174, + "step": 122 + }, + { + "epoch": 0.05466666666666667, + "grad_norm": 1.169143795967102, + "learning_rate": 0.00019798932384341636, + "loss": 2.3256, + "step": 123 + }, + { + "epoch": 0.05511111111111111, + "grad_norm": 1.1567578315734863, + "learning_rate": 0.00019797153024911032, + "loss": 2.4596, + "step": 124 + }, + { + "epoch": 0.05555555555555555, + "grad_norm": 1.2948107719421387, + "learning_rate": 0.00019795373665480427, + "loss": 2.5357, + "step": 125 + }, + { + "epoch": 0.056, + "grad_norm": 1.2189359664916992, + "learning_rate": 0.00019793594306049823, + "loss": 2.823, + "step": 126 + }, + { + "epoch": 0.05644444444444444, + "grad_norm": 1.5137442350387573, + "learning_rate": 0.0001979181494661922, + "loss": 3.2187, + "step": 127 + }, + { + "epoch": 0.05688888888888889, + "grad_norm": 1.2950530052185059, + "learning_rate": 0.00019790035587188614, + "loss": 2.6476, + "step": 128 + }, + { + "epoch": 0.05733333333333333, + "grad_norm": 1.1926493644714355, + "learning_rate": 0.00019788256227758007, + "loss": 1.9832, + "step": 129 + }, + { + "epoch": 0.057777777777777775, + "grad_norm": 1.3409109115600586, + "learning_rate": 0.00019786476868327403, + "loss": 2.5699, + "step": 130 + }, + { + "epoch": 0.058222222222222224, + "grad_norm": 1.2740064859390259, + "learning_rate": 0.000197846975088968, + "loss": 2.6332, + "step": 131 + }, + { + "epoch": 0.058666666666666666, + "grad_norm": 1.7328652143478394, + "learning_rate": 0.00019782918149466194, + "loss": 2.946, + "step": 132 + }, + { + "epoch": 0.059111111111111114, + "grad_norm": 1.3805177211761475, + "learning_rate": 0.00019781138790035587, + "loss": 2.7012, + "step": 133 + }, + { + "epoch": 0.059555555555555556, + "grad_norm": 1.3198126554489136, + "learning_rate": 0.00019779359430604983, + "loss": 2.6473, + "step": 134 + }, + { + "epoch": 0.06, + "grad_norm": 1.4987982511520386, + "learning_rate": 0.00019777580071174379, + "loss": 2.8902, + "step": 135 + }, + { + "epoch": 0.060444444444444446, + "grad_norm": 1.4471278190612793, + "learning_rate": 0.00019775800711743772, + "loss": 2.8084, + "step": 136 + }, + { + "epoch": 0.06088888888888889, + "grad_norm": 1.1906874179840088, + "learning_rate": 0.00019774021352313167, + "loss": 2.2734, + "step": 137 + }, + { + "epoch": 0.06133333333333333, + "grad_norm": 1.349488615989685, + "learning_rate": 0.00019772241992882563, + "loss": 2.734, + "step": 138 + }, + { + "epoch": 0.06177777777777778, + "grad_norm": 1.1277025938034058, + "learning_rate": 0.00019770462633451958, + "loss": 1.7254, + "step": 139 + }, + { + "epoch": 0.06222222222222222, + "grad_norm": 1.277053713798523, + "learning_rate": 0.00019768683274021354, + "loss": 2.5841, + "step": 140 + }, + { + "epoch": 0.06266666666666666, + "grad_norm": 1.358282446861267, + "learning_rate": 0.0001976690391459075, + "loss": 2.4615, + "step": 141 + }, + { + "epoch": 0.06311111111111112, + "grad_norm": 1.463334560394287, + 
"learning_rate": 0.00019765124555160143, + "loss": 2.8497, + "step": 142 + }, + { + "epoch": 0.06355555555555556, + "grad_norm": 1.537904143333435, + "learning_rate": 0.00019763345195729538, + "loss": 2.4246, + "step": 143 + }, + { + "epoch": 0.064, + "grad_norm": 1.3648548126220703, + "learning_rate": 0.00019761565836298934, + "loss": 2.6535, + "step": 144 + }, + { + "epoch": 0.06444444444444444, + "grad_norm": 1.3705697059631348, + "learning_rate": 0.0001975978647686833, + "loss": 2.6365, + "step": 145 + }, + { + "epoch": 0.06488888888888888, + "grad_norm": 1.5270709991455078, + "learning_rate": 0.00019758007117437723, + "loss": 2.7863, + "step": 146 + }, + { + "epoch": 0.06533333333333333, + "grad_norm": 1.413665771484375, + "learning_rate": 0.00019756227758007118, + "loss": 2.4596, + "step": 147 + }, + { + "epoch": 0.06577777777777778, + "grad_norm": 1.7925262451171875, + "learning_rate": 0.00019754448398576514, + "loss": 3.2978, + "step": 148 + }, + { + "epoch": 0.06622222222222222, + "grad_norm": 1.5601551532745361, + "learning_rate": 0.00019752669039145907, + "loss": 2.1796, + "step": 149 + }, + { + "epoch": 0.06666666666666667, + "grad_norm": 1.8019471168518066, + "learning_rate": 0.00019750889679715302, + "loss": 2.5945, + "step": 150 + }, + { + "epoch": 0.06711111111111111, + "grad_norm": 0.816262423992157, + "learning_rate": 0.00019749110320284698, + "loss": 2.478, + "step": 151 + }, + { + "epoch": 0.06755555555555555, + "grad_norm": 0.9055171608924866, + "learning_rate": 0.00019747330960854094, + "loss": 2.6226, + "step": 152 + }, + { + "epoch": 0.068, + "grad_norm": 0.8419144153594971, + "learning_rate": 0.0001974555160142349, + "loss": 2.2478, + "step": 153 + }, + { + "epoch": 0.06844444444444445, + "grad_norm": 1.01926589012146, + "learning_rate": 0.00019743772241992885, + "loss": 2.8489, + "step": 154 + }, + { + "epoch": 0.06888888888888889, + "grad_norm": 1.3037279844284058, + "learning_rate": 0.00019741992882562278, + "loss": 3.1187, + "step": 155 + }, + { + "epoch": 0.06933333333333333, + "grad_norm": 1.0188137292861938, + "learning_rate": 0.00019740213523131674, + "loss": 2.6388, + "step": 156 + }, + { + "epoch": 0.06977777777777777, + "grad_norm": 1.1099557876586914, + "learning_rate": 0.0001973843416370107, + "loss": 2.7675, + "step": 157 + }, + { + "epoch": 0.07022222222222223, + "grad_norm": 0.9259713292121887, + "learning_rate": 0.00019736654804270465, + "loss": 2.7232, + "step": 158 + }, + { + "epoch": 0.07066666666666667, + "grad_norm": 1.0099951028823853, + "learning_rate": 0.00019734875444839858, + "loss": 2.3996, + "step": 159 + }, + { + "epoch": 0.07111111111111111, + "grad_norm": 1.045190691947937, + "learning_rate": 0.00019733096085409254, + "loss": 2.5773, + "step": 160 + }, + { + "epoch": 0.07155555555555555, + "grad_norm": 1.1050662994384766, + "learning_rate": 0.0001973131672597865, + "loss": 2.1176, + "step": 161 + }, + { + "epoch": 0.072, + "grad_norm": 1.2864255905151367, + "learning_rate": 0.00019729537366548042, + "loss": 3.2324, + "step": 162 + }, + { + "epoch": 0.07244444444444445, + "grad_norm": 1.0812265872955322, + "learning_rate": 0.00019727758007117438, + "loss": 2.671, + "step": 163 + }, + { + "epoch": 0.07288888888888889, + "grad_norm": 1.0999687910079956, + "learning_rate": 0.00019725978647686833, + "loss": 2.7204, + "step": 164 + }, + { + "epoch": 0.07333333333333333, + "grad_norm": 1.0504425764083862, + "learning_rate": 0.0001972419928825623, + "loss": 2.5729, + "step": 165 + }, + { + "epoch": 0.07377777777777778, + "grad_norm": 
1.059959053993225, + "learning_rate": 0.00019722419928825625, + "loss": 2.4222, + "step": 166 + }, + { + "epoch": 0.07422222222222222, + "grad_norm": 1.0592875480651855, + "learning_rate": 0.0001972064056939502, + "loss": 2.4263, + "step": 167 + }, + { + "epoch": 0.07466666666666667, + "grad_norm": 1.0814061164855957, + "learning_rate": 0.00019718861209964413, + "loss": 2.5768, + "step": 168 + }, + { + "epoch": 0.07511111111111111, + "grad_norm": 1.416944980621338, + "learning_rate": 0.0001971708185053381, + "loss": 3.0183, + "step": 169 + }, + { + "epoch": 0.07555555555555556, + "grad_norm": 1.0500316619873047, + "learning_rate": 0.00019715302491103205, + "loss": 2.6289, + "step": 170 + }, + { + "epoch": 0.076, + "grad_norm": 1.2884352207183838, + "learning_rate": 0.000197135231316726, + "loss": 2.5171, + "step": 171 + }, + { + "epoch": 0.07644444444444444, + "grad_norm": 1.3116530179977417, + "learning_rate": 0.00019711743772241993, + "loss": 3.0034, + "step": 172 + }, + { + "epoch": 0.0768888888888889, + "grad_norm": 1.2504932880401611, + "learning_rate": 0.0001970996441281139, + "loss": 2.5675, + "step": 173 + }, + { + "epoch": 0.07733333333333334, + "grad_norm": 1.1547982692718506, + "learning_rate": 0.00019708185053380785, + "loss": 2.301, + "step": 174 + }, + { + "epoch": 0.07777777777777778, + "grad_norm": 1.1116724014282227, + "learning_rate": 0.00019706405693950177, + "loss": 2.4003, + "step": 175 + }, + { + "epoch": 0.07822222222222222, + "grad_norm": 1.109155297279358, + "learning_rate": 0.00019704626334519573, + "loss": 2.2188, + "step": 176 + }, + { + "epoch": 0.07866666666666666, + "grad_norm": 1.2276736497879028, + "learning_rate": 0.0001970284697508897, + "loss": 2.5917, + "step": 177 + }, + { + "epoch": 0.0791111111111111, + "grad_norm": 1.5495067834854126, + "learning_rate": 0.00019701067615658364, + "loss": 2.5696, + "step": 178 + }, + { + "epoch": 0.07955555555555556, + "grad_norm": 1.2796157598495483, + "learning_rate": 0.0001969928825622776, + "loss": 2.3558, + "step": 179 + }, + { + "epoch": 0.08, + "grad_norm": 1.151888132095337, + "learning_rate": 0.00019697508896797156, + "loss": 1.779, + "step": 180 + }, + { + "epoch": 0.08044444444444444, + "grad_norm": 1.2485597133636475, + "learning_rate": 0.0001969572953736655, + "loss": 2.1977, + "step": 181 + }, + { + "epoch": 0.08088888888888889, + "grad_norm": 1.392452597618103, + "learning_rate": 0.00019693950177935944, + "loss": 2.6898, + "step": 182 + }, + { + "epoch": 0.08133333333333333, + "grad_norm": 1.514426350593567, + "learning_rate": 0.0001969217081850534, + "loss": 2.8083, + "step": 183 + }, + { + "epoch": 0.08177777777777778, + "grad_norm": 1.472489356994629, + "learning_rate": 0.00019690391459074736, + "loss": 2.9328, + "step": 184 + }, + { + "epoch": 0.08222222222222222, + "grad_norm": 1.5749987363815308, + "learning_rate": 0.00019688612099644129, + "loss": 2.3176, + "step": 185 + }, + { + "epoch": 0.08266666666666667, + "grad_norm": 1.5175185203552246, + "learning_rate": 0.00019686832740213524, + "loss": 2.7677, + "step": 186 + }, + { + "epoch": 0.08311111111111111, + "grad_norm": 1.286679983139038, + "learning_rate": 0.0001968505338078292, + "loss": 2.08, + "step": 187 + }, + { + "epoch": 0.08355555555555555, + "grad_norm": 1.4742923974990845, + "learning_rate": 0.00019683274021352313, + "loss": 3.0767, + "step": 188 + }, + { + "epoch": 0.084, + "grad_norm": 1.3634746074676514, + "learning_rate": 0.00019681494661921708, + "loss": 2.423, + "step": 189 + }, + { + "epoch": 0.08444444444444445, + 
"grad_norm": 1.4298174381256104, + "learning_rate": 0.00019679715302491104, + "loss": 2.1539, + "step": 190 + }, + { + "epoch": 0.08488888888888889, + "grad_norm": 1.6024513244628906, + "learning_rate": 0.000196779359430605, + "loss": 2.868, + "step": 191 + }, + { + "epoch": 0.08533333333333333, + "grad_norm": 1.4882041215896606, + "learning_rate": 0.00019676156583629895, + "loss": 2.4238, + "step": 192 + }, + { + "epoch": 0.08577777777777777, + "grad_norm": 1.4021437168121338, + "learning_rate": 0.00019674377224199288, + "loss": 2.6468, + "step": 193 + }, + { + "epoch": 0.08622222222222223, + "grad_norm": 1.5105438232421875, + "learning_rate": 0.00019672597864768684, + "loss": 2.3776, + "step": 194 + }, + { + "epoch": 0.08666666666666667, + "grad_norm": 1.753899097442627, + "learning_rate": 0.0001967081850533808, + "loss": 2.4496, + "step": 195 + }, + { + "epoch": 0.08711111111111111, + "grad_norm": 1.6667553186416626, + "learning_rate": 0.00019669039145907475, + "loss": 2.7937, + "step": 196 + }, + { + "epoch": 0.08755555555555555, + "grad_norm": 1.4890007972717285, + "learning_rate": 0.00019667259786476868, + "loss": 2.7493, + "step": 197 + }, + { + "epoch": 0.088, + "grad_norm": 1.8673200607299805, + "learning_rate": 0.00019665480427046264, + "loss": 2.7379, + "step": 198 + }, + { + "epoch": 0.08844444444444445, + "grad_norm": 2.127183675765991, + "learning_rate": 0.0001966370106761566, + "loss": 3.239, + "step": 199 + }, + { + "epoch": 0.08888888888888889, + "grad_norm": 2.974283218383789, + "learning_rate": 0.00019661921708185052, + "loss": 2.9018, + "step": 200 + }, + { + "epoch": 0.08933333333333333, + "grad_norm": 0.7585410475730896, + "learning_rate": 0.00019660142348754448, + "loss": 2.1856, + "step": 201 + }, + { + "epoch": 0.08977777777777778, + "grad_norm": 0.8838407397270203, + "learning_rate": 0.00019658362989323844, + "loss": 2.353, + "step": 202 + }, + { + "epoch": 0.09022222222222222, + "grad_norm": 0.9938530325889587, + "learning_rate": 0.0001965658362989324, + "loss": 2.5122, + "step": 203 + }, + { + "epoch": 0.09066666666666667, + "grad_norm": 0.8707981109619141, + "learning_rate": 0.00019654804270462635, + "loss": 2.5396, + "step": 204 + }, + { + "epoch": 0.09111111111111111, + "grad_norm": 0.9297839999198914, + "learning_rate": 0.0001965302491103203, + "loss": 2.7153, + "step": 205 + }, + { + "epoch": 0.09155555555555556, + "grad_norm": 1.176153302192688, + "learning_rate": 0.00019651245551601424, + "loss": 2.5667, + "step": 206 + }, + { + "epoch": 0.092, + "grad_norm": 1.1235204935073853, + "learning_rate": 0.0001964946619217082, + "loss": 2.5297, + "step": 207 + }, + { + "epoch": 0.09244444444444444, + "grad_norm": 1.123063564300537, + "learning_rate": 0.00019647686832740215, + "loss": 2.8088, + "step": 208 + }, + { + "epoch": 0.09288888888888888, + "grad_norm": 1.0872026681900024, + "learning_rate": 0.0001964590747330961, + "loss": 2.5135, + "step": 209 + }, + { + "epoch": 0.09333333333333334, + "grad_norm": 1.1103240251541138, + "learning_rate": 0.00019644128113879004, + "loss": 2.6343, + "step": 210 + }, + { + "epoch": 0.09377777777777778, + "grad_norm": 0.998205840587616, + "learning_rate": 0.000196423487544484, + "loss": 2.5504, + "step": 211 + }, + { + "epoch": 0.09422222222222222, + "grad_norm": 0.9748513698577881, + "learning_rate": 0.00019640569395017795, + "loss": 2.2131, + "step": 212 + }, + { + "epoch": 0.09466666666666666, + "grad_norm": 1.0071059465408325, + "learning_rate": 0.00019638790035587188, + "loss": 2.2381, + "step": 213 + }, + { + "epoch": 
0.0951111111111111, + "grad_norm": 1.4692164659500122, + "learning_rate": 0.00019637010676156583, + "loss": 2.6001, + "step": 214 + }, + { + "epoch": 0.09555555555555556, + "grad_norm": 1.2764703035354614, + "learning_rate": 0.0001963523131672598, + "loss": 2.7826, + "step": 215 + }, + { + "epoch": 0.096, + "grad_norm": 1.0466008186340332, + "learning_rate": 0.00019633451957295375, + "loss": 2.7655, + "step": 216 + }, + { + "epoch": 0.09644444444444444, + "grad_norm": 1.1789090633392334, + "learning_rate": 0.0001963167259786477, + "loss": 2.5712, + "step": 217 + }, + { + "epoch": 0.09688888888888889, + "grad_norm": 1.240662932395935, + "learning_rate": 0.00019629893238434166, + "loss": 2.7105, + "step": 218 + }, + { + "epoch": 0.09733333333333333, + "grad_norm": 1.3624532222747803, + "learning_rate": 0.0001962811387900356, + "loss": 2.7762, + "step": 219 + }, + { + "epoch": 0.09777777777777778, + "grad_norm": 1.0563603639602661, + "learning_rate": 0.00019626334519572955, + "loss": 2.3591, + "step": 220 + }, + { + "epoch": 0.09822222222222222, + "grad_norm": 1.0683754682540894, + "learning_rate": 0.0001962455516014235, + "loss": 1.9874, + "step": 221 + }, + { + "epoch": 0.09866666666666667, + "grad_norm": 1.7027884721755981, + "learning_rate": 0.00019622775800711746, + "loss": 2.5819, + "step": 222 + }, + { + "epoch": 0.09911111111111111, + "grad_norm": 1.2697211503982544, + "learning_rate": 0.0001962099644128114, + "loss": 2.5147, + "step": 223 + }, + { + "epoch": 0.09955555555555555, + "grad_norm": 1.1141654253005981, + "learning_rate": 0.00019619217081850534, + "loss": 2.196, + "step": 224 + }, + { + "epoch": 0.1, + "grad_norm": 1.337015986442566, + "learning_rate": 0.0001961743772241993, + "loss": 2.5916, + "step": 225 + }, + { + "epoch": 0.10044444444444445, + "grad_norm": 1.3354969024658203, + "learning_rate": 0.00019615658362989323, + "loss": 2.3433, + "step": 226 + }, + { + "epoch": 0.10088888888888889, + "grad_norm": 1.2302438020706177, + "learning_rate": 0.0001961387900355872, + "loss": 2.3541, + "step": 227 + }, + { + "epoch": 0.10133333333333333, + "grad_norm": 1.8002538681030273, + "learning_rate": 0.00019612099644128114, + "loss": 2.6135, + "step": 228 + }, + { + "epoch": 0.10177777777777777, + "grad_norm": 1.4334473609924316, + "learning_rate": 0.0001961032028469751, + "loss": 2.7333, + "step": 229 + }, + { + "epoch": 0.10222222222222223, + "grad_norm": 1.5954945087432861, + "learning_rate": 0.00019608540925266906, + "loss": 2.6602, + "step": 230 + }, + { + "epoch": 0.10266666666666667, + "grad_norm": 1.3982867002487183, + "learning_rate": 0.000196067615658363, + "loss": 2.7748, + "step": 231 + }, + { + "epoch": 0.10311111111111111, + "grad_norm": 1.322675108909607, + "learning_rate": 0.00019604982206405694, + "loss": 2.4225, + "step": 232 + }, + { + "epoch": 0.10355555555555555, + "grad_norm": 1.3092737197875977, + "learning_rate": 0.0001960320284697509, + "loss": 2.479, + "step": 233 + }, + { + "epoch": 0.104, + "grad_norm": 1.3040847778320312, + "learning_rate": 0.00019601423487544486, + "loss": 2.6299, + "step": 234 + }, + { + "epoch": 0.10444444444444445, + "grad_norm": 1.3706668615341187, + "learning_rate": 0.0001959964412811388, + "loss": 2.2323, + "step": 235 + }, + { + "epoch": 0.10488888888888889, + "grad_norm": 1.5371273756027222, + "learning_rate": 0.00019597864768683274, + "loss": 2.8628, + "step": 236 + }, + { + "epoch": 0.10533333333333333, + "grad_norm": 1.5061933994293213, + "learning_rate": 0.0001959608540925267, + "loss": 2.962, + "step": 237 + }, + { + 
"epoch": 0.10577777777777778, + "grad_norm": 1.4326897859573364, + "learning_rate": 0.00019594306049822065, + "loss": 2.6172, + "step": 238 + }, + { + "epoch": 0.10622222222222222, + "grad_norm": 1.7283401489257812, + "learning_rate": 0.00019592526690391458, + "loss": 3.1021, + "step": 239 + }, + { + "epoch": 0.10666666666666667, + "grad_norm": 1.5328441858291626, + "learning_rate": 0.00019590747330960854, + "loss": 2.9195, + "step": 240 + }, + { + "epoch": 0.10711111111111112, + "grad_norm": 1.4138455390930176, + "learning_rate": 0.0001958896797153025, + "loss": 2.7317, + "step": 241 + }, + { + "epoch": 0.10755555555555556, + "grad_norm": 1.336175560951233, + "learning_rate": 0.00019587188612099645, + "loss": 2.6417, + "step": 242 + }, + { + "epoch": 0.108, + "grad_norm": 1.5993636846542358, + "learning_rate": 0.0001958540925266904, + "loss": 2.5646, + "step": 243 + }, + { + "epoch": 0.10844444444444444, + "grad_norm": 1.464353084564209, + "learning_rate": 0.00019583629893238437, + "loss": 2.6526, + "step": 244 + }, + { + "epoch": 0.10888888888888888, + "grad_norm": 1.731520652770996, + "learning_rate": 0.0001958185053380783, + "loss": 2.5564, + "step": 245 + }, + { + "epoch": 0.10933333333333334, + "grad_norm": 1.7385467290878296, + "learning_rate": 0.00019580071174377225, + "loss": 2.8463, + "step": 246 + }, + { + "epoch": 0.10977777777777778, + "grad_norm": 1.7964988946914673, + "learning_rate": 0.0001957829181494662, + "loss": 2.7856, + "step": 247 + }, + { + "epoch": 0.11022222222222222, + "grad_norm": 1.8664138317108154, + "learning_rate": 0.00019576512455516017, + "loss": 2.6548, + "step": 248 + }, + { + "epoch": 0.11066666666666666, + "grad_norm": 1.7917791604995728, + "learning_rate": 0.0001957473309608541, + "loss": 2.5387, + "step": 249 + }, + { + "epoch": 0.1111111111111111, + "grad_norm": 1.9322112798690796, + "learning_rate": 0.00019572953736654805, + "loss": 1.9883, + "step": 250 + }, + { + "epoch": 0.11155555555555556, + "grad_norm": 0.8669712543487549, + "learning_rate": 0.000195711743772242, + "loss": 2.6931, + "step": 251 + }, + { + "epoch": 0.112, + "grad_norm": 1.020733118057251, + "learning_rate": 0.00019569395017793594, + "loss": 3.1699, + "step": 252 + }, + { + "epoch": 0.11244444444444444, + "grad_norm": 0.8693115711212158, + "learning_rate": 0.0001956761565836299, + "loss": 2.4069, + "step": 253 + }, + { + "epoch": 0.11288888888888889, + "grad_norm": 1.043466567993164, + "learning_rate": 0.00019565836298932385, + "loss": 2.9878, + "step": 254 + }, + { + "epoch": 0.11333333333333333, + "grad_norm": 1.030266284942627, + "learning_rate": 0.0001956405693950178, + "loss": 2.6809, + "step": 255 + }, + { + "epoch": 0.11377777777777778, + "grad_norm": 0.9637128710746765, + "learning_rate": 0.00019562277580071176, + "loss": 2.2864, + "step": 256 + }, + { + "epoch": 0.11422222222222222, + "grad_norm": 1.2764060497283936, + "learning_rate": 0.00019560498220640572, + "loss": 2.5282, + "step": 257 + }, + { + "epoch": 0.11466666666666667, + "grad_norm": 1.0020864009857178, + "learning_rate": 0.00019558718861209965, + "loss": 2.6671, + "step": 258 + }, + { + "epoch": 0.11511111111111111, + "grad_norm": 0.9636043310165405, + "learning_rate": 0.0001955693950177936, + "loss": 2.6084, + "step": 259 + }, + { + "epoch": 0.11555555555555555, + "grad_norm": 0.9684137105941772, + "learning_rate": 0.00019555160142348756, + "loss": 2.6087, + "step": 260 + }, + { + "epoch": 0.116, + "grad_norm": 0.9589288830757141, + "learning_rate": 0.00019553380782918152, + "loss": 2.4366, + "step": 261 
+ }, + { + "epoch": 0.11644444444444445, + "grad_norm": 0.9377467632293701, + "learning_rate": 0.00019551601423487545, + "loss": 2.1606, + "step": 262 + }, + { + "epoch": 0.11688888888888889, + "grad_norm": 1.1523168087005615, + "learning_rate": 0.0001954982206405694, + "loss": 2.8838, + "step": 263 + }, + { + "epoch": 0.11733333333333333, + "grad_norm": 1.0509337186813354, + "learning_rate": 0.00019548042704626336, + "loss": 2.41, + "step": 264 + }, + { + "epoch": 0.11777777777777777, + "grad_norm": 1.2317067384719849, + "learning_rate": 0.0001954626334519573, + "loss": 2.5933, + "step": 265 + }, + { + "epoch": 0.11822222222222223, + "grad_norm": 1.4088350534439087, + "learning_rate": 0.00019544483985765125, + "loss": 2.7486, + "step": 266 + }, + { + "epoch": 0.11866666666666667, + "grad_norm": 1.033850073814392, + "learning_rate": 0.0001954270462633452, + "loss": 2.407, + "step": 267 + }, + { + "epoch": 0.11911111111111111, + "grad_norm": 1.2386589050292969, + "learning_rate": 0.00019540925266903916, + "loss": 2.1679, + "step": 268 + }, + { + "epoch": 0.11955555555555555, + "grad_norm": 1.0948667526245117, + "learning_rate": 0.00019539145907473312, + "loss": 2.4034, + "step": 269 + }, + { + "epoch": 0.12, + "grad_norm": 1.3226970434188843, + "learning_rate": 0.00019537366548042707, + "loss": 2.7977, + "step": 270 + }, + { + "epoch": 0.12044444444444445, + "grad_norm": 1.3416509628295898, + "learning_rate": 0.000195355871886121, + "loss": 2.7933, + "step": 271 + }, + { + "epoch": 0.12088888888888889, + "grad_norm": 1.142828345298767, + "learning_rate": 0.00019533807829181496, + "loss": 2.1703, + "step": 272 + }, + { + "epoch": 0.12133333333333333, + "grad_norm": 1.4413820505142212, + "learning_rate": 0.00019532028469750892, + "loss": 3.1676, + "step": 273 + }, + { + "epoch": 0.12177777777777778, + "grad_norm": 1.3678597211837769, + "learning_rate": 0.00019530249110320287, + "loss": 2.5917, + "step": 274 + }, + { + "epoch": 0.12222222222222222, + "grad_norm": 1.165423035621643, + "learning_rate": 0.0001952846975088968, + "loss": 2.1922, + "step": 275 + }, + { + "epoch": 0.12266666666666666, + "grad_norm": 1.458446741104126, + "learning_rate": 0.00019526690391459076, + "loss": 2.5036, + "step": 276 + }, + { + "epoch": 0.12311111111111112, + "grad_norm": 1.5351003408432007, + "learning_rate": 0.00019524911032028471, + "loss": 2.849, + "step": 277 + }, + { + "epoch": 0.12355555555555556, + "grad_norm": 1.343141794204712, + "learning_rate": 0.00019523131672597864, + "loss": 2.3158, + "step": 278 + }, + { + "epoch": 0.124, + "grad_norm": 1.5747337341308594, + "learning_rate": 0.0001952135231316726, + "loss": 3.3124, + "step": 279 + }, + { + "epoch": 0.12444444444444444, + "grad_norm": 1.538909912109375, + "learning_rate": 0.00019519572953736656, + "loss": 2.7005, + "step": 280 + }, + { + "epoch": 0.12488888888888888, + "grad_norm": 1.4149315357208252, + "learning_rate": 0.0001951779359430605, + "loss": 1.8815, + "step": 281 + }, + { + "epoch": 0.12533333333333332, + "grad_norm": 1.2315411567687988, + "learning_rate": 0.00019516014234875447, + "loss": 2.4296, + "step": 282 + }, + { + "epoch": 0.12577777777777777, + "grad_norm": 1.5550092458724976, + "learning_rate": 0.0001951423487544484, + "loss": 3.0271, + "step": 283 + }, + { + "epoch": 0.12622222222222224, + "grad_norm": 1.3692402839660645, + "learning_rate": 0.00019512455516014236, + "loss": 2.5704, + "step": 284 + }, + { + "epoch": 0.12666666666666668, + "grad_norm": 1.4310396909713745, + "learning_rate": 0.0001951067615658363, + "loss": 
2.4914, + "step": 285 + }, + { + "epoch": 0.12711111111111112, + "grad_norm": 1.283097505569458, + "learning_rate": 0.00019508896797153027, + "loss": 2.31, + "step": 286 + }, + { + "epoch": 0.12755555555555556, + "grad_norm": 1.3749858140945435, + "learning_rate": 0.00019507117437722422, + "loss": 2.2563, + "step": 287 + }, + { + "epoch": 0.128, + "grad_norm": 1.3623768091201782, + "learning_rate": 0.00019505338078291815, + "loss": 2.2465, + "step": 288 + }, + { + "epoch": 0.12844444444444444, + "grad_norm": 1.4308843612670898, + "learning_rate": 0.0001950355871886121, + "loss": 2.6771, + "step": 289 + }, + { + "epoch": 0.1288888888888889, + "grad_norm": 1.568965196609497, + "learning_rate": 0.00019501779359430604, + "loss": 2.3809, + "step": 290 + }, + { + "epoch": 0.12933333333333333, + "grad_norm": 1.429734468460083, + "learning_rate": 0.000195, + "loss": 2.6331, + "step": 291 + }, + { + "epoch": 0.12977777777777777, + "grad_norm": 1.3805055618286133, + "learning_rate": 0.00019498220640569395, + "loss": 2.2138, + "step": 292 + }, + { + "epoch": 0.1302222222222222, + "grad_norm": 1.5100244283676147, + "learning_rate": 0.0001949644128113879, + "loss": 2.1832, + "step": 293 + }, + { + "epoch": 0.13066666666666665, + "grad_norm": 1.2256643772125244, + "learning_rate": 0.00019494661921708187, + "loss": 2.1298, + "step": 294 + }, + { + "epoch": 0.13111111111111112, + "grad_norm": 1.6592442989349365, + "learning_rate": 0.00019492882562277582, + "loss": 2.5226, + "step": 295 + }, + { + "epoch": 0.13155555555555556, + "grad_norm": 1.3523834943771362, + "learning_rate": 0.00019491103202846975, + "loss": 2.3038, + "step": 296 + }, + { + "epoch": 0.132, + "grad_norm": 1.4921329021453857, + "learning_rate": 0.0001948932384341637, + "loss": 2.2571, + "step": 297 + }, + { + "epoch": 0.13244444444444445, + "grad_norm": 1.7550227642059326, + "learning_rate": 0.00019487544483985766, + "loss": 3.1873, + "step": 298 + }, + { + "epoch": 0.1328888888888889, + "grad_norm": 1.6412465572357178, + "learning_rate": 0.00019485765124555162, + "loss": 2.63, + "step": 299 + }, + { + "epoch": 0.13333333333333333, + "grad_norm": 1.9257419109344482, + "learning_rate": 0.00019483985765124558, + "loss": 2.4374, + "step": 300 + }, + { + "epoch": 0.13377777777777777, + "grad_norm": 0.9691830277442932, + "learning_rate": 0.0001948220640569395, + "loss": 3.1218, + "step": 301 + }, + { + "epoch": 0.13422222222222221, + "grad_norm": 0.8326031565666199, + "learning_rate": 0.00019480427046263346, + "loss": 2.5508, + "step": 302 + }, + { + "epoch": 0.13466666666666666, + "grad_norm": 0.9384410381317139, + "learning_rate": 0.0001947864768683274, + "loss": 2.5186, + "step": 303 + }, + { + "epoch": 0.1351111111111111, + "grad_norm": 1.1212788820266724, + "learning_rate": 0.00019476868327402135, + "loss": 2.1688, + "step": 304 + }, + { + "epoch": 0.13555555555555557, + "grad_norm": 0.8847819566726685, + "learning_rate": 0.0001947508896797153, + "loss": 2.4118, + "step": 305 + }, + { + "epoch": 0.136, + "grad_norm": 1.0403448343276978, + "learning_rate": 0.00019473309608540926, + "loss": 2.5872, + "step": 306 + }, + { + "epoch": 0.13644444444444445, + "grad_norm": 1.0033663511276245, + "learning_rate": 0.00019471530249110322, + "loss": 2.4532, + "step": 307 + }, + { + "epoch": 0.1368888888888889, + "grad_norm": 0.9713349342346191, + "learning_rate": 0.00019469750889679718, + "loss": 2.6743, + "step": 308 + }, + { + "epoch": 0.13733333333333334, + "grad_norm": 0.9954231381416321, + "learning_rate": 0.0001946797153024911, + "loss": 
2.2478, + "step": 309 + }, + { + "epoch": 0.13777777777777778, + "grad_norm": 1.4259557723999023, + "learning_rate": 0.00019466192170818506, + "loss": 2.4252, + "step": 310 + }, + { + "epoch": 0.13822222222222222, + "grad_norm": 1.1478148698806763, + "learning_rate": 0.00019464412811387902, + "loss": 2.6959, + "step": 311 + }, + { + "epoch": 0.13866666666666666, + "grad_norm": 0.9570370316505432, + "learning_rate": 0.00019462633451957297, + "loss": 2.4102, + "step": 312 + }, + { + "epoch": 0.1391111111111111, + "grad_norm": 1.0587913990020752, + "learning_rate": 0.0001946085409252669, + "loss": 2.341, + "step": 313 + }, + { + "epoch": 0.13955555555555554, + "grad_norm": 0.9694374203681946, + "learning_rate": 0.00019459074733096086, + "loss": 2.3627, + "step": 314 + }, + { + "epoch": 0.14, + "grad_norm": 1.0712873935699463, + "learning_rate": 0.00019457295373665482, + "loss": 2.7581, + "step": 315 + }, + { + "epoch": 0.14044444444444446, + "grad_norm": 1.1833367347717285, + "learning_rate": 0.00019455516014234875, + "loss": 2.424, + "step": 316 + }, + { + "epoch": 0.1408888888888889, + "grad_norm": 1.1975206136703491, + "learning_rate": 0.0001945373665480427, + "loss": 2.992, + "step": 317 + }, + { + "epoch": 0.14133333333333334, + "grad_norm": 1.165632724761963, + "learning_rate": 0.00019451957295373666, + "loss": 2.3119, + "step": 318 + }, + { + "epoch": 0.14177777777777778, + "grad_norm": 1.2752189636230469, + "learning_rate": 0.00019450177935943062, + "loss": 2.755, + "step": 319 + }, + { + "epoch": 0.14222222222222222, + "grad_norm": 1.0351862907409668, + "learning_rate": 0.00019448398576512457, + "loss": 2.4693, + "step": 320 + }, + { + "epoch": 0.14266666666666666, + "grad_norm": 1.3102034330368042, + "learning_rate": 0.00019446619217081853, + "loss": 3.0192, + "step": 321 + }, + { + "epoch": 0.1431111111111111, + "grad_norm": 1.2251161336898804, + "learning_rate": 0.00019444839857651246, + "loss": 2.6388, + "step": 322 + }, + { + "epoch": 0.14355555555555555, + "grad_norm": 1.147139072418213, + "learning_rate": 0.00019443060498220641, + "loss": 2.1927, + "step": 323 + }, + { + "epoch": 0.144, + "grad_norm": 1.6661100387573242, + "learning_rate": 0.00019441281138790037, + "loss": 2.3797, + "step": 324 + }, + { + "epoch": 0.14444444444444443, + "grad_norm": 1.5821012258529663, + "learning_rate": 0.00019439501779359433, + "loss": 2.6176, + "step": 325 + }, + { + "epoch": 0.1448888888888889, + "grad_norm": 1.4074416160583496, + "learning_rate": 0.00019437722419928826, + "loss": 2.887, + "step": 326 + }, + { + "epoch": 0.14533333333333334, + "grad_norm": 1.1706616878509521, + "learning_rate": 0.0001943594306049822, + "loss": 2.5354, + "step": 327 + }, + { + "epoch": 0.14577777777777778, + "grad_norm": 1.8788220882415771, + "learning_rate": 0.00019434163701067617, + "loss": 3.0704, + "step": 328 + }, + { + "epoch": 0.14622222222222223, + "grad_norm": 1.3559796810150146, + "learning_rate": 0.0001943238434163701, + "loss": 2.5688, + "step": 329 + }, + { + "epoch": 0.14666666666666667, + "grad_norm": 1.3595597743988037, + "learning_rate": 0.00019430604982206406, + "loss": 2.4277, + "step": 330 + }, + { + "epoch": 0.1471111111111111, + "grad_norm": 1.267451286315918, + "learning_rate": 0.000194288256227758, + "loss": 2.6135, + "step": 331 + }, + { + "epoch": 0.14755555555555555, + "grad_norm": 1.2744300365447998, + "learning_rate": 0.00019427046263345197, + "loss": 2.5527, + "step": 332 + }, + { + "epoch": 0.148, + "grad_norm": 1.6571002006530762, + "learning_rate": 0.00019425266903914593, + 
"loss": 2.653, + "step": 333 + }, + { + "epoch": 0.14844444444444443, + "grad_norm": 1.295233130455017, + "learning_rate": 0.00019423487544483988, + "loss": 2.3443, + "step": 334 + }, + { + "epoch": 0.14888888888888888, + "grad_norm": 1.4817813634872437, + "learning_rate": 0.0001942170818505338, + "loss": 2.6859, + "step": 335 + }, + { + "epoch": 0.14933333333333335, + "grad_norm": 1.2812329530715942, + "learning_rate": 0.00019419928825622777, + "loss": 2.5331, + "step": 336 + }, + { + "epoch": 0.1497777777777778, + "grad_norm": 1.2227575778961182, + "learning_rate": 0.00019418149466192172, + "loss": 2.2328, + "step": 337 + }, + { + "epoch": 0.15022222222222223, + "grad_norm": 1.2625856399536133, + "learning_rate": 0.00019416370106761568, + "loss": 2.3834, + "step": 338 + }, + { + "epoch": 0.15066666666666667, + "grad_norm": 1.4927825927734375, + "learning_rate": 0.0001941459074733096, + "loss": 2.66, + "step": 339 + }, + { + "epoch": 0.1511111111111111, + "grad_norm": 1.5926779508590698, + "learning_rate": 0.00019412811387900357, + "loss": 2.8668, + "step": 340 + }, + { + "epoch": 0.15155555555555555, + "grad_norm": 1.4455138444900513, + "learning_rate": 0.00019411032028469752, + "loss": 2.313, + "step": 341 + }, + { + "epoch": 0.152, + "grad_norm": 1.7588112354278564, + "learning_rate": 0.00019409252669039145, + "loss": 3.2341, + "step": 342 + }, + { + "epoch": 0.15244444444444444, + "grad_norm": 1.61067795753479, + "learning_rate": 0.0001940747330960854, + "loss": 3.0195, + "step": 343 + }, + { + "epoch": 0.15288888888888888, + "grad_norm": 1.5478051900863647, + "learning_rate": 0.00019405693950177937, + "loss": 2.5716, + "step": 344 + }, + { + "epoch": 0.15333333333333332, + "grad_norm": 1.436558485031128, + "learning_rate": 0.00019403914590747332, + "loss": 2.4509, + "step": 345 + }, + { + "epoch": 0.1537777777777778, + "grad_norm": 1.6531201601028442, + "learning_rate": 0.00019402135231316728, + "loss": 2.8739, + "step": 346 + }, + { + "epoch": 0.15422222222222223, + "grad_norm": 1.6915215253829956, + "learning_rate": 0.00019400355871886124, + "loss": 2.6791, + "step": 347 + }, + { + "epoch": 0.15466666666666667, + "grad_norm": 1.5415655374526978, + "learning_rate": 0.00019398576512455516, + "loss": 2.8461, + "step": 348 + }, + { + "epoch": 0.15511111111111112, + "grad_norm": 1.5193134546279907, + "learning_rate": 0.00019396797153024912, + "loss": 2.0756, + "step": 349 + }, + { + "epoch": 0.15555555555555556, + "grad_norm": 1.9580684900283813, + "learning_rate": 0.00019395017793594308, + "loss": 2.2343, + "step": 350 + }, + { + "epoch": 0.156, + "grad_norm": 0.9276076555252075, + "learning_rate": 0.00019393238434163703, + "loss": 2.5678, + "step": 351 + }, + { + "epoch": 0.15644444444444444, + "grad_norm": 0.8778754472732544, + "learning_rate": 0.00019391459074733096, + "loss": 2.2734, + "step": 352 + }, + { + "epoch": 0.15688888888888888, + "grad_norm": 0.9423462152481079, + "learning_rate": 0.00019389679715302492, + "loss": 2.3079, + "step": 353 + }, + { + "epoch": 0.15733333333333333, + "grad_norm": 1.1275615692138672, + "learning_rate": 0.00019387900355871888, + "loss": 3.0597, + "step": 354 + }, + { + "epoch": 0.15777777777777777, + "grad_norm": 1.098436713218689, + "learning_rate": 0.0001938612099644128, + "loss": 2.5681, + "step": 355 + }, + { + "epoch": 0.1582222222222222, + "grad_norm": 1.2952444553375244, + "learning_rate": 0.00019384341637010676, + "loss": 3.2901, + "step": 356 + }, + { + "epoch": 0.15866666666666668, + "grad_norm": 0.9615929126739502, + "learning_rate": 
0.00019382562277580072, + "loss": 2.1562, + "step": 357 + }, + { + "epoch": 0.15911111111111112, + "grad_norm": 1.0871940851211548, + "learning_rate": 0.00019380782918149468, + "loss": 2.483, + "step": 358 + }, + { + "epoch": 0.15955555555555556, + "grad_norm": 1.2492108345031738, + "learning_rate": 0.00019379003558718863, + "loss": 2.4314, + "step": 359 + }, + { + "epoch": 0.16, + "grad_norm": 1.184037208557129, + "learning_rate": 0.0001937722419928826, + "loss": 2.5729, + "step": 360 + }, + { + "epoch": 0.16044444444444445, + "grad_norm": 1.07174813747406, + "learning_rate": 0.00019375444839857652, + "loss": 2.4092, + "step": 361 + }, + { + "epoch": 0.1608888888888889, + "grad_norm": 1.034098744392395, + "learning_rate": 0.00019373665480427047, + "loss": 2.3074, + "step": 362 + }, + { + "epoch": 0.16133333333333333, + "grad_norm": 1.224602460861206, + "learning_rate": 0.00019371886120996443, + "loss": 2.5208, + "step": 363 + }, + { + "epoch": 0.16177777777777777, + "grad_norm": 1.1033531427383423, + "learning_rate": 0.0001937010676156584, + "loss": 2.4352, + "step": 364 + }, + { + "epoch": 0.1622222222222222, + "grad_norm": 1.3021239042282104, + "learning_rate": 0.00019368327402135232, + "loss": 3.0264, + "step": 365 + }, + { + "epoch": 0.16266666666666665, + "grad_norm": 1.0758678913116455, + "learning_rate": 0.00019366548042704627, + "loss": 2.1807, + "step": 366 + }, + { + "epoch": 0.16311111111111112, + "grad_norm": 1.262274980545044, + "learning_rate": 0.00019364768683274023, + "loss": 2.0729, + "step": 367 + }, + { + "epoch": 0.16355555555555557, + "grad_norm": 1.1292612552642822, + "learning_rate": 0.00019362989323843416, + "loss": 2.3755, + "step": 368 + }, + { + "epoch": 0.164, + "grad_norm": 1.226879596710205, + "learning_rate": 0.00019361209964412812, + "loss": 2.7226, + "step": 369 + }, + { + "epoch": 0.16444444444444445, + "grad_norm": 1.2748644351959229, + "learning_rate": 0.00019359430604982207, + "loss": 2.8053, + "step": 370 + }, + { + "epoch": 0.1648888888888889, + "grad_norm": 1.2830379009246826, + "learning_rate": 0.00019357651245551603, + "loss": 2.6742, + "step": 371 + }, + { + "epoch": 0.16533333333333333, + "grad_norm": 1.3083009719848633, + "learning_rate": 0.00019355871886120998, + "loss": 2.5176, + "step": 372 + }, + { + "epoch": 0.16577777777777777, + "grad_norm": 1.2429115772247314, + "learning_rate": 0.00019354092526690391, + "loss": 2.4358, + "step": 373 + }, + { + "epoch": 0.16622222222222222, + "grad_norm": 1.3192554712295532, + "learning_rate": 0.00019352313167259787, + "loss": 2.3644, + "step": 374 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 1.1091505289077759, + "learning_rate": 0.00019350533807829183, + "loss": 2.0567, + "step": 375 + }, + { + "epoch": 0.1671111111111111, + "grad_norm": 1.3943992853164673, + "learning_rate": 0.00019348754448398578, + "loss": 2.3812, + "step": 376 + }, + { + "epoch": 0.16755555555555557, + "grad_norm": 1.3048312664031982, + "learning_rate": 0.00019346975088967974, + "loss": 2.614, + "step": 377 + }, + { + "epoch": 0.168, + "grad_norm": 1.2868915796279907, + "learning_rate": 0.00019345195729537367, + "loss": 2.8753, + "step": 378 + }, + { + "epoch": 0.16844444444444445, + "grad_norm": 1.7796956300735474, + "learning_rate": 0.00019343416370106763, + "loss": 2.8598, + "step": 379 + }, + { + "epoch": 0.1688888888888889, + "grad_norm": 1.2406312227249146, + "learning_rate": 0.00019341637010676156, + "loss": 2.3268, + "step": 380 + }, + { + "epoch": 0.16933333333333334, + "grad_norm": 1.1642827987670898, + 
"learning_rate": 0.0001933985765124555, + "loss": 1.8269, + "step": 381 + }, + { + "epoch": 0.16977777777777778, + "grad_norm": 1.325958251953125, + "learning_rate": 0.00019338078291814947, + "loss": 2.6527, + "step": 382 + }, + { + "epoch": 0.17022222222222222, + "grad_norm": 1.4351950883865356, + "learning_rate": 0.00019336298932384343, + "loss": 2.7466, + "step": 383 + }, + { + "epoch": 0.17066666666666666, + "grad_norm": 1.4853794574737549, + "learning_rate": 0.00019334519572953738, + "loss": 2.7099, + "step": 384 + }, + { + "epoch": 0.1711111111111111, + "grad_norm": 1.2756681442260742, + "learning_rate": 0.00019332740213523134, + "loss": 1.9808, + "step": 385 + }, + { + "epoch": 0.17155555555555554, + "grad_norm": 1.567001223564148, + "learning_rate": 0.00019330960854092527, + "loss": 2.6314, + "step": 386 + }, + { + "epoch": 0.172, + "grad_norm": 1.4116157293319702, + "learning_rate": 0.00019329181494661922, + "loss": 2.3277, + "step": 387 + }, + { + "epoch": 0.17244444444444446, + "grad_norm": 1.7159870862960815, + "learning_rate": 0.00019327402135231318, + "loss": 3.2206, + "step": 388 + }, + { + "epoch": 0.1728888888888889, + "grad_norm": 1.3646256923675537, + "learning_rate": 0.00019325622775800714, + "loss": 2.4943, + "step": 389 + }, + { + "epoch": 0.17333333333333334, + "grad_norm": 1.47772216796875, + "learning_rate": 0.0001932384341637011, + "loss": 2.3482, + "step": 390 + }, + { + "epoch": 0.17377777777777778, + "grad_norm": 1.6584293842315674, + "learning_rate": 0.00019322064056939502, + "loss": 2.449, + "step": 391 + }, + { + "epoch": 0.17422222222222222, + "grad_norm": 1.6674505472183228, + "learning_rate": 0.00019320284697508898, + "loss": 2.319, + "step": 392 + }, + { + "epoch": 0.17466666666666666, + "grad_norm": 1.4847460985183716, + "learning_rate": 0.0001931850533807829, + "loss": 2.4977, + "step": 393 + }, + { + "epoch": 0.1751111111111111, + "grad_norm": 1.5762537717819214, + "learning_rate": 0.00019316725978647687, + "loss": 2.5036, + "step": 394 + }, + { + "epoch": 0.17555555555555555, + "grad_norm": 1.731825828552246, + "learning_rate": 0.00019314946619217082, + "loss": 2.9634, + "step": 395 + }, + { + "epoch": 0.176, + "grad_norm": 1.5546553134918213, + "learning_rate": 0.00019313167259786478, + "loss": 2.9042, + "step": 396 + }, + { + "epoch": 0.17644444444444443, + "grad_norm": 1.7094860076904297, + "learning_rate": 0.00019311387900355873, + "loss": 2.6971, + "step": 397 + }, + { + "epoch": 0.1768888888888889, + "grad_norm": 1.4419265985488892, + "learning_rate": 0.0001930960854092527, + "loss": 2.0126, + "step": 398 + }, + { + "epoch": 0.17733333333333334, + "grad_norm": 1.516395926475525, + "learning_rate": 0.00019307829181494662, + "loss": 1.9048, + "step": 399 + }, + { + "epoch": 0.17777777777777778, + "grad_norm": 2.010030508041382, + "learning_rate": 0.00019306049822064058, + "loss": 2.6672, + "step": 400 + }, + { + "epoch": 0.17822222222222223, + "grad_norm": 0.8796574473381042, + "learning_rate": 0.00019304270462633453, + "loss": 2.3451, + "step": 401 + }, + { + "epoch": 0.17866666666666667, + "grad_norm": 1.060291051864624, + "learning_rate": 0.0001930249110320285, + "loss": 1.5039, + "step": 402 + }, + { + "epoch": 0.1791111111111111, + "grad_norm": 0.9270257949829102, + "learning_rate": 0.00019300711743772245, + "loss": 2.3051, + "step": 403 + }, + { + "epoch": 0.17955555555555555, + "grad_norm": 1.0624326467514038, + "learning_rate": 0.00019298932384341638, + "loss": 2.4763, + "step": 404 + }, + { + "epoch": 0.18, + "grad_norm": 
1.396337866783142, + "learning_rate": 0.00019297153024911033, + "loss": 3.0648, + "step": 405 + }, + { + "epoch": 0.18044444444444444, + "grad_norm": 1.185214877128601, + "learning_rate": 0.00019295373665480426, + "loss": 2.5809, + "step": 406 + }, + { + "epoch": 0.18088888888888888, + "grad_norm": 1.2690224647521973, + "learning_rate": 0.00019293594306049822, + "loss": 2.6203, + "step": 407 + }, + { + "epoch": 0.18133333333333335, + "grad_norm": 1.083329439163208, + "learning_rate": 0.00019291814946619217, + "loss": 2.6254, + "step": 408 + }, + { + "epoch": 0.1817777777777778, + "grad_norm": 1.1971805095672607, + "learning_rate": 0.00019290035587188613, + "loss": 2.6854, + "step": 409 + }, + { + "epoch": 0.18222222222222223, + "grad_norm": 1.253471851348877, + "learning_rate": 0.0001928825622775801, + "loss": 2.7219, + "step": 410 + }, + { + "epoch": 0.18266666666666667, + "grad_norm": 1.1367192268371582, + "learning_rate": 0.00019286476868327404, + "loss": 2.6906, + "step": 411 + }, + { + "epoch": 0.1831111111111111, + "grad_norm": 1.1325358152389526, + "learning_rate": 0.00019284697508896797, + "loss": 2.6536, + "step": 412 + }, + { + "epoch": 0.18355555555555556, + "grad_norm": 1.1050721406936646, + "learning_rate": 0.00019282918149466193, + "loss": 2.351, + "step": 413 + }, + { + "epoch": 0.184, + "grad_norm": 1.1547539234161377, + "learning_rate": 0.0001928113879003559, + "loss": 2.3844, + "step": 414 + }, + { + "epoch": 0.18444444444444444, + "grad_norm": 1.0745432376861572, + "learning_rate": 0.00019279359430604984, + "loss": 1.9506, + "step": 415 + }, + { + "epoch": 0.18488888888888888, + "grad_norm": 1.3409316539764404, + "learning_rate": 0.0001927758007117438, + "loss": 3.0778, + "step": 416 + }, + { + "epoch": 0.18533333333333332, + "grad_norm": 1.029353380203247, + "learning_rate": 0.00019275800711743773, + "loss": 2.1147, + "step": 417 + }, + { + "epoch": 0.18577777777777776, + "grad_norm": 1.3205243349075317, + "learning_rate": 0.00019274021352313169, + "loss": 2.706, + "step": 418 + }, + { + "epoch": 0.18622222222222223, + "grad_norm": 1.2287330627441406, + "learning_rate": 0.00019272241992882562, + "loss": 2.9919, + "step": 419 + }, + { + "epoch": 0.18666666666666668, + "grad_norm": 1.2103333473205566, + "learning_rate": 0.00019270462633451957, + "loss": 2.5828, + "step": 420 + }, + { + "epoch": 0.18711111111111112, + "grad_norm": 1.2801356315612793, + "learning_rate": 0.00019268683274021353, + "loss": 2.7884, + "step": 421 + }, + { + "epoch": 0.18755555555555556, + "grad_norm": 1.2993440628051758, + "learning_rate": 0.00019266903914590748, + "loss": 2.3698, + "step": 422 + }, + { + "epoch": 0.188, + "grad_norm": 1.173684000968933, + "learning_rate": 0.00019265124555160144, + "loss": 1.9735, + "step": 423 + }, + { + "epoch": 0.18844444444444444, + "grad_norm": 1.1559852361679077, + "learning_rate": 0.0001926334519572954, + "loss": 2.3973, + "step": 424 + }, + { + "epoch": 0.18888888888888888, + "grad_norm": 1.1459964513778687, + "learning_rate": 0.00019261565836298933, + "loss": 2.2315, + "step": 425 + }, + { + "epoch": 0.18933333333333333, + "grad_norm": 1.2700178623199463, + "learning_rate": 0.00019259786476868328, + "loss": 2.0161, + "step": 426 + }, + { + "epoch": 0.18977777777777777, + "grad_norm": 1.4809290170669556, + "learning_rate": 0.00019258007117437724, + "loss": 2.4066, + "step": 427 + }, + { + "epoch": 0.1902222222222222, + "grad_norm": 1.3454186916351318, + "learning_rate": 0.0001925622775800712, + "loss": 2.5743, + "step": 428 + }, + { + "epoch": 
0.19066666666666668, + "grad_norm": 1.7205144166946411, + "learning_rate": 0.00019254448398576513, + "loss": 2.6614, + "step": 429 + }, + { + "epoch": 0.19111111111111112, + "grad_norm": 1.3243727684020996, + "learning_rate": 0.00019252669039145908, + "loss": 2.5333, + "step": 430 + }, + { + "epoch": 0.19155555555555556, + "grad_norm": 1.282810926437378, + "learning_rate": 0.00019250889679715304, + "loss": 2.4967, + "step": 431 + }, + { + "epoch": 0.192, + "grad_norm": 1.4963980913162231, + "learning_rate": 0.00019249110320284697, + "loss": 2.5604, + "step": 432 + }, + { + "epoch": 0.19244444444444445, + "grad_norm": 1.4316112995147705, + "learning_rate": 0.00019247330960854092, + "loss": 2.5011, + "step": 433 + }, + { + "epoch": 0.1928888888888889, + "grad_norm": 1.550047755241394, + "learning_rate": 0.00019245551601423488, + "loss": 2.2658, + "step": 434 + }, + { + "epoch": 0.19333333333333333, + "grad_norm": 1.3296672105789185, + "learning_rate": 0.00019243772241992884, + "loss": 2.3954, + "step": 435 + }, + { + "epoch": 0.19377777777777777, + "grad_norm": 1.4095767736434937, + "learning_rate": 0.0001924199288256228, + "loss": 2.4617, + "step": 436 + }, + { + "epoch": 0.1942222222222222, + "grad_norm": 1.5941203832626343, + "learning_rate": 0.00019240213523131675, + "loss": 2.4245, + "step": 437 + }, + { + "epoch": 0.19466666666666665, + "grad_norm": 1.427892804145813, + "learning_rate": 0.00019238434163701068, + "loss": 2.6397, + "step": 438 + }, + { + "epoch": 0.19511111111111112, + "grad_norm": 1.512261152267456, + "learning_rate": 0.00019236654804270464, + "loss": 3.0727, + "step": 439 + }, + { + "epoch": 0.19555555555555557, + "grad_norm": 1.3837703466415405, + "learning_rate": 0.0001923487544483986, + "loss": 2.0297, + "step": 440 + }, + { + "epoch": 0.196, + "grad_norm": 1.7951322793960571, + "learning_rate": 0.00019233096085409255, + "loss": 2.9765, + "step": 441 + }, + { + "epoch": 0.19644444444444445, + "grad_norm": 1.418602705001831, + "learning_rate": 0.00019231316725978648, + "loss": 2.4067, + "step": 442 + }, + { + "epoch": 0.1968888888888889, + "grad_norm": 1.8720040321350098, + "learning_rate": 0.00019229537366548044, + "loss": 2.911, + "step": 443 + }, + { + "epoch": 0.19733333333333333, + "grad_norm": 1.337851881980896, + "learning_rate": 0.0001922775800711744, + "loss": 2.2034, + "step": 444 + }, + { + "epoch": 0.19777777777777777, + "grad_norm": 1.770947813987732, + "learning_rate": 0.00019225978647686832, + "loss": 3.1644, + "step": 445 + }, + { + "epoch": 0.19822222222222222, + "grad_norm": 1.5249742269515991, + "learning_rate": 0.00019224199288256228, + "loss": 3.0308, + "step": 446 + }, + { + "epoch": 0.19866666666666666, + "grad_norm": 1.9548603296279907, + "learning_rate": 0.00019222419928825623, + "loss": 3.2289, + "step": 447 + }, + { + "epoch": 0.1991111111111111, + "grad_norm": 1.8034451007843018, + "learning_rate": 0.0001922064056939502, + "loss": 2.9985, + "step": 448 + }, + { + "epoch": 0.19955555555555557, + "grad_norm": 2.0380022525787354, + "learning_rate": 0.00019218861209964415, + "loss": 2.9506, + "step": 449 + }, + { + "epoch": 0.2, + "grad_norm": 2.272326946258545, + "learning_rate": 0.0001921708185053381, + "loss": 2.9212, + "step": 450 + }, + { + "epoch": 0.20044444444444445, + "grad_norm": 0.8263271450996399, + "learning_rate": 0.00019215302491103203, + "loss": 2.646, + "step": 451 + }, + { + "epoch": 0.2008888888888889, + "grad_norm": 0.7942408919334412, + "learning_rate": 0.000192135231316726, + "loss": 2.1999, + "step": 452 + }, + { + 
"epoch": 0.20133333333333334, + "grad_norm": 1.059103012084961, + "learning_rate": 0.00019211743772241995, + "loss": 1.499, + "step": 453 + }, + { + "epoch": 0.20177777777777778, + "grad_norm": 1.2836692333221436, + "learning_rate": 0.0001920996441281139, + "loss": 2.1925, + "step": 454 + }, + { + "epoch": 0.20222222222222222, + "grad_norm": 0.9823219776153564, + "learning_rate": 0.00019208185053380783, + "loss": 2.3254, + "step": 455 + }, + { + "epoch": 0.20266666666666666, + "grad_norm": 1.1425485610961914, + "learning_rate": 0.0001920640569395018, + "loss": 2.6993, + "step": 456 + }, + { + "epoch": 0.2031111111111111, + "grad_norm": 1.2006306648254395, + "learning_rate": 0.00019204626334519575, + "loss": 2.8467, + "step": 457 + }, + { + "epoch": 0.20355555555555555, + "grad_norm": 1.0690330266952515, + "learning_rate": 0.00019202846975088967, + "loss": 2.5, + "step": 458 + }, + { + "epoch": 0.204, + "grad_norm": 1.0791610479354858, + "learning_rate": 0.00019201067615658363, + "loss": 2.6261, + "step": 459 + }, + { + "epoch": 0.20444444444444446, + "grad_norm": 1.281191349029541, + "learning_rate": 0.0001919928825622776, + "loss": 2.6089, + "step": 460 + }, + { + "epoch": 0.2048888888888889, + "grad_norm": 0.9611422419548035, + "learning_rate": 0.00019197508896797154, + "loss": 2.6225, + "step": 461 + }, + { + "epoch": 0.20533333333333334, + "grad_norm": 1.1073009967803955, + "learning_rate": 0.0001919572953736655, + "loss": 2.2157, + "step": 462 + }, + { + "epoch": 0.20577777777777778, + "grad_norm": 1.305037498474121, + "learning_rate": 0.00019193950177935943, + "loss": 3.2172, + "step": 463 + }, + { + "epoch": 0.20622222222222222, + "grad_norm": 1.1567325592041016, + "learning_rate": 0.00019192170818505339, + "loss": 2.4355, + "step": 464 + }, + { + "epoch": 0.20666666666666667, + "grad_norm": 1.2676507234573364, + "learning_rate": 0.00019190391459074734, + "loss": 2.405, + "step": 465 + }, + { + "epoch": 0.2071111111111111, + "grad_norm": 1.1165955066680908, + "learning_rate": 0.0001918861209964413, + "loss": 2.0347, + "step": 466 + }, + { + "epoch": 0.20755555555555555, + "grad_norm": 1.3841233253479004, + "learning_rate": 0.00019186832740213526, + "loss": 2.7864, + "step": 467 + }, + { + "epoch": 0.208, + "grad_norm": 1.4399226903915405, + "learning_rate": 0.00019185053380782919, + "loss": 3.6052, + "step": 468 + }, + { + "epoch": 0.20844444444444443, + "grad_norm": 1.4364230632781982, + "learning_rate": 0.00019183274021352314, + "loss": 2.7003, + "step": 469 + }, + { + "epoch": 0.2088888888888889, + "grad_norm": 1.1740696430206299, + "learning_rate": 0.00019181494661921707, + "loss": 2.1421, + "step": 470 + }, + { + "epoch": 0.20933333333333334, + "grad_norm": 1.5531638860702515, + "learning_rate": 0.00019179715302491103, + "loss": 2.6817, + "step": 471 + }, + { + "epoch": 0.20977777777777779, + "grad_norm": 1.4464926719665527, + "learning_rate": 0.00019177935943060498, + "loss": 2.7101, + "step": 472 + }, + { + "epoch": 0.21022222222222223, + "grad_norm": 1.319682002067566, + "learning_rate": 0.00019176156583629894, + "loss": 2.3335, + "step": 473 + }, + { + "epoch": 0.21066666666666667, + "grad_norm": 1.347642183303833, + "learning_rate": 0.0001917437722419929, + "loss": 2.6819, + "step": 474 + }, + { + "epoch": 0.2111111111111111, + "grad_norm": 1.3280656337738037, + "learning_rate": 0.00019172597864768685, + "loss": 2.7256, + "step": 475 + }, + { + "epoch": 0.21155555555555555, + "grad_norm": 1.4412258863449097, + "learning_rate": 0.00019170818505338078, + "loss": 2.7524, + 
"step": 476 + }, + { + "epoch": 0.212, + "grad_norm": 1.455552577972412, + "learning_rate": 0.00019169039145907474, + "loss": 2.5749, + "step": 477 + }, + { + "epoch": 0.21244444444444444, + "grad_norm": 1.2440650463104248, + "learning_rate": 0.0001916725978647687, + "loss": 2.3623, + "step": 478 + }, + { + "epoch": 0.21288888888888888, + "grad_norm": 1.2427901029586792, + "learning_rate": 0.00019165480427046265, + "loss": 2.3891, + "step": 479 + }, + { + "epoch": 0.21333333333333335, + "grad_norm": 1.2674572467803955, + "learning_rate": 0.0001916370106761566, + "loss": 2.2005, + "step": 480 + }, + { + "epoch": 0.2137777777777778, + "grad_norm": 1.4567019939422607, + "learning_rate": 0.00019161921708185054, + "loss": 2.7974, + "step": 481 + }, + { + "epoch": 0.21422222222222223, + "grad_norm": 1.25277578830719, + "learning_rate": 0.0001916014234875445, + "loss": 2.4577, + "step": 482 + }, + { + "epoch": 0.21466666666666667, + "grad_norm": 1.2980494499206543, + "learning_rate": 0.00019158362989323842, + "loss": 2.4017, + "step": 483 + }, + { + "epoch": 0.21511111111111111, + "grad_norm": 1.5980355739593506, + "learning_rate": 0.00019156583629893238, + "loss": 2.7121, + "step": 484 + }, + { + "epoch": 0.21555555555555556, + "grad_norm": 1.3960875272750854, + "learning_rate": 0.00019154804270462634, + "loss": 2.4198, + "step": 485 + }, + { + "epoch": 0.216, + "grad_norm": 1.5180373191833496, + "learning_rate": 0.0001915302491103203, + "loss": 2.8813, + "step": 486 + }, + { + "epoch": 0.21644444444444444, + "grad_norm": 1.339158058166504, + "learning_rate": 0.00019151245551601425, + "loss": 2.4161, + "step": 487 + }, + { + "epoch": 0.21688888888888888, + "grad_norm": 1.708709955215454, + "learning_rate": 0.0001914946619217082, + "loss": 2.6625, + "step": 488 + }, + { + "epoch": 0.21733333333333332, + "grad_norm": 1.4037717580795288, + "learning_rate": 0.00019147686832740214, + "loss": 2.7222, + "step": 489 + }, + { + "epoch": 0.21777777777777776, + "grad_norm": 1.547869324684143, + "learning_rate": 0.0001914590747330961, + "loss": 2.4619, + "step": 490 + }, + { + "epoch": 0.21822222222222223, + "grad_norm": 2.0343785285949707, + "learning_rate": 0.00019144128113879005, + "loss": 2.8051, + "step": 491 + }, + { + "epoch": 0.21866666666666668, + "grad_norm": 1.5703917741775513, + "learning_rate": 0.000191423487544484, + "loss": 2.7364, + "step": 492 + }, + { + "epoch": 0.21911111111111112, + "grad_norm": 1.4888960123062134, + "learning_rate": 0.00019140569395017796, + "loss": 2.3952, + "step": 493 + }, + { + "epoch": 0.21955555555555556, + "grad_norm": 1.9154101610183716, + "learning_rate": 0.0001913879003558719, + "loss": 3.4184, + "step": 494 + }, + { + "epoch": 0.22, + "grad_norm": 1.8733478784561157, + "learning_rate": 0.00019137010676156585, + "loss": 2.0588, + "step": 495 + }, + { + "epoch": 0.22044444444444444, + "grad_norm": 1.5168616771697998, + "learning_rate": 0.00019135231316725978, + "loss": 2.4865, + "step": 496 + }, + { + "epoch": 0.22088888888888888, + "grad_norm": 1.6080540418624878, + "learning_rate": 0.00019133451957295373, + "loss": 2.6185, + "step": 497 + }, + { + "epoch": 0.22133333333333333, + "grad_norm": 1.8169959783554077, + "learning_rate": 0.0001913167259786477, + "loss": 2.7514, + "step": 498 + }, + { + "epoch": 0.22177777777777777, + "grad_norm": 1.903592824935913, + "learning_rate": 0.00019129893238434165, + "loss": 2.926, + "step": 499 + }, + { + "epoch": 0.2222222222222222, + "grad_norm": 2.2263951301574707, + "learning_rate": 0.0001912811387900356, + "loss": 
3.209, + "step": 500 + }, + { + "epoch": 0.22266666666666668, + "grad_norm": 0.9277405142784119, + "learning_rate": 0.00019126334519572956, + "loss": 2.8295, + "step": 501 + }, + { + "epoch": 0.22311111111111112, + "grad_norm": 0.7616639137268066, + "learning_rate": 0.0001912455516014235, + "loss": 2.3259, + "step": 502 + }, + { + "epoch": 0.22355555555555556, + "grad_norm": 0.863919734954834, + "learning_rate": 0.00019122775800711745, + "loss": 2.6629, + "step": 503 + }, + { + "epoch": 0.224, + "grad_norm": 0.9117692112922668, + "learning_rate": 0.0001912099644128114, + "loss": 2.8012, + "step": 504 + }, + { + "epoch": 0.22444444444444445, + "grad_norm": 0.8689172863960266, + "learning_rate": 0.00019119217081850536, + "loss": 1.8062, + "step": 505 + }, + { + "epoch": 0.2248888888888889, + "grad_norm": 0.9966077208518982, + "learning_rate": 0.00019117437722419932, + "loss": 2.3426, + "step": 506 + }, + { + "epoch": 0.22533333333333333, + "grad_norm": 1.1402056217193604, + "learning_rate": 0.00019115658362989324, + "loss": 2.597, + "step": 507 + }, + { + "epoch": 0.22577777777777777, + "grad_norm": 1.0207332372665405, + "learning_rate": 0.0001911387900355872, + "loss": 2.7771, + "step": 508 + }, + { + "epoch": 0.2262222222222222, + "grad_norm": 1.0293519496917725, + "learning_rate": 0.00019112099644128113, + "loss": 2.2381, + "step": 509 + }, + { + "epoch": 0.22666666666666666, + "grad_norm": 1.0531278848648071, + "learning_rate": 0.0001911032028469751, + "loss": 2.42, + "step": 510 + }, + { + "epoch": 0.22711111111111112, + "grad_norm": 1.2546653747558594, + "learning_rate": 0.00019108540925266904, + "loss": 2.9447, + "step": 511 + }, + { + "epoch": 0.22755555555555557, + "grad_norm": 1.0765845775604248, + "learning_rate": 0.000191067615658363, + "loss": 2.3876, + "step": 512 + }, + { + "epoch": 0.228, + "grad_norm": 1.0224113464355469, + "learning_rate": 0.00019104982206405696, + "loss": 2.5661, + "step": 513 + }, + { + "epoch": 0.22844444444444445, + "grad_norm": 1.1143425703048706, + "learning_rate": 0.0001910320284697509, + "loss": 2.4567, + "step": 514 + }, + { + "epoch": 0.2288888888888889, + "grad_norm": 1.2478740215301514, + "learning_rate": 0.00019101423487544484, + "loss": 2.2325, + "step": 515 + }, + { + "epoch": 0.22933333333333333, + "grad_norm": 1.5122989416122437, + "learning_rate": 0.0001909964412811388, + "loss": 2.7624, + "step": 516 + }, + { + "epoch": 0.22977777777777778, + "grad_norm": 1.2084643840789795, + "learning_rate": 0.00019097864768683276, + "loss": 2.1843, + "step": 517 + }, + { + "epoch": 0.23022222222222222, + "grad_norm": 2.0436813831329346, + "learning_rate": 0.0001909608540925267, + "loss": 2.7025, + "step": 518 + }, + { + "epoch": 0.23066666666666666, + "grad_norm": 1.6114445924758911, + "learning_rate": 0.00019094306049822067, + "loss": 2.5077, + "step": 519 + }, + { + "epoch": 0.2311111111111111, + "grad_norm": 1.163203239440918, + "learning_rate": 0.0001909252669039146, + "loss": 2.6062, + "step": 520 + }, + { + "epoch": 0.23155555555555554, + "grad_norm": 1.3827770948410034, + "learning_rate": 0.00019090747330960855, + "loss": 2.565, + "step": 521 + }, + { + "epoch": 0.232, + "grad_norm": 1.443726658821106, + "learning_rate": 0.00019088967971530248, + "loss": 2.9211, + "step": 522 + }, + { + "epoch": 0.23244444444444445, + "grad_norm": 1.1512651443481445, + "learning_rate": 0.00019087188612099644, + "loss": 2.3893, + "step": 523 + }, + { + "epoch": 0.2328888888888889, + "grad_norm": 1.3335007429122925, + "learning_rate": 0.0001908540925266904, + 
"loss": 2.8188, + "step": 524 + }, + { + "epoch": 0.23333333333333334, + "grad_norm": 1.2235959768295288, + "learning_rate": 0.00019083629893238435, + "loss": 2.2223, + "step": 525 + }, + { + "epoch": 0.23377777777777778, + "grad_norm": 1.3788108825683594, + "learning_rate": 0.0001908185053380783, + "loss": 2.7354, + "step": 526 + }, + { + "epoch": 0.23422222222222222, + "grad_norm": 1.400914192199707, + "learning_rate": 0.00019080071174377227, + "loss": 2.3977, + "step": 527 + }, + { + "epoch": 0.23466666666666666, + "grad_norm": 1.4983909130096436, + "learning_rate": 0.0001907829181494662, + "loss": 2.7346, + "step": 528 + }, + { + "epoch": 0.2351111111111111, + "grad_norm": 1.453970193862915, + "learning_rate": 0.00019076512455516015, + "loss": 2.4243, + "step": 529 + }, + { + "epoch": 0.23555555555555555, + "grad_norm": 1.6744136810302734, + "learning_rate": 0.0001907473309608541, + "loss": 0.8518, + "step": 530 + }, + { + "epoch": 0.236, + "grad_norm": 1.4733753204345703, + "learning_rate": 0.00019072953736654807, + "loss": 2.5426, + "step": 531 + }, + { + "epoch": 0.23644444444444446, + "grad_norm": 1.4669400453567505, + "learning_rate": 0.00019071174377224202, + "loss": 2.527, + "step": 532 + }, + { + "epoch": 0.2368888888888889, + "grad_norm": 1.413023829460144, + "learning_rate": 0.00019069395017793595, + "loss": 2.5775, + "step": 533 + }, + { + "epoch": 0.23733333333333334, + "grad_norm": 1.4842833280563354, + "learning_rate": 0.0001906761565836299, + "loss": 2.3223, + "step": 534 + }, + { + "epoch": 0.23777777777777778, + "grad_norm": 1.6651334762573242, + "learning_rate": 0.00019065836298932384, + "loss": 2.7261, + "step": 535 + }, + { + "epoch": 0.23822222222222222, + "grad_norm": 2.1817784309387207, + "learning_rate": 0.0001906405693950178, + "loss": 2.602, + "step": 536 + }, + { + "epoch": 0.23866666666666667, + "grad_norm": 1.4185001850128174, + "learning_rate": 0.00019062277580071175, + "loss": 2.5273, + "step": 537 + }, + { + "epoch": 0.2391111111111111, + "grad_norm": 1.6372658014297485, + "learning_rate": 0.0001906049822064057, + "loss": 2.7267, + "step": 538 + }, + { + "epoch": 0.23955555555555555, + "grad_norm": 2.114755868911743, + "learning_rate": 0.00019058718861209966, + "loss": 1.3715, + "step": 539 + }, + { + "epoch": 0.24, + "grad_norm": 1.5371288061141968, + "learning_rate": 0.00019056939501779362, + "loss": 2.5673, + "step": 540 + }, + { + "epoch": 0.24044444444444443, + "grad_norm": 1.3606349229812622, + "learning_rate": 0.00019055160142348755, + "loss": 2.5102, + "step": 541 + }, + { + "epoch": 0.2408888888888889, + "grad_norm": 1.7038285732269287, + "learning_rate": 0.0001905338078291815, + "loss": 2.8909, + "step": 542 + }, + { + "epoch": 0.24133333333333334, + "grad_norm": 1.6660969257354736, + "learning_rate": 0.00019051601423487546, + "loss": 2.3934, + "step": 543 + }, + { + "epoch": 0.24177777777777779, + "grad_norm": 1.4915132522583008, + "learning_rate": 0.00019049822064056942, + "loss": 2.6195, + "step": 544 + }, + { + "epoch": 0.24222222222222223, + "grad_norm": 1.606236219406128, + "learning_rate": 0.00019048042704626335, + "loss": 2.5366, + "step": 545 + }, + { + "epoch": 0.24266666666666667, + "grad_norm": 1.6464382410049438, + "learning_rate": 0.0001904626334519573, + "loss": 2.2821, + "step": 546 + }, + { + "epoch": 0.2431111111111111, + "grad_norm": 1.5627448558807373, + "learning_rate": 0.00019044483985765126, + "loss": 2.7557, + "step": 547 + }, + { + "epoch": 0.24355555555555555, + "grad_norm": 1.5537325143814087, + "learning_rate": 
0.0001904270462633452, + "loss": 2.7917, + "step": 548 + }, + { + "epoch": 0.244, + "grad_norm": 2.0328938961029053, + "learning_rate": 0.00019040925266903915, + "loss": 3.1031, + "step": 549 + }, + { + "epoch": 0.24444444444444444, + "grad_norm": 2.5331227779388428, + "learning_rate": 0.0001903914590747331, + "loss": 2.0921, + "step": 550 + }, + { + "epoch": 0.24488888888888888, + "grad_norm": 1.062377691268921, + "learning_rate": 0.00019037366548042706, + "loss": 2.7696, + "step": 551 + }, + { + "epoch": 0.24533333333333332, + "grad_norm": 0.9012869596481323, + "learning_rate": 0.00019035587188612102, + "loss": 1.8978, + "step": 552 + }, + { + "epoch": 0.2457777777777778, + "grad_norm": 0.9942989349365234, + "learning_rate": 0.00019033807829181495, + "loss": 1.9979, + "step": 553 + }, + { + "epoch": 0.24622222222222223, + "grad_norm": 1.0721116065979004, + "learning_rate": 0.0001903202846975089, + "loss": 2.6083, + "step": 554 + }, + { + "epoch": 0.24666666666666667, + "grad_norm": 1.0755621194839478, + "learning_rate": 0.00019030249110320286, + "loss": 2.7578, + "step": 555 + }, + { + "epoch": 0.24711111111111111, + "grad_norm": 1.080788254737854, + "learning_rate": 0.00019028469750889681, + "loss": 2.2836, + "step": 556 + }, + { + "epoch": 0.24755555555555556, + "grad_norm": 1.0383445024490356, + "learning_rate": 0.00019026690391459077, + "loss": 2.4542, + "step": 557 + }, + { + "epoch": 0.248, + "grad_norm": 1.1483523845672607, + "learning_rate": 0.0001902491103202847, + "loss": 2.5689, + "step": 558 + }, + { + "epoch": 0.24844444444444444, + "grad_norm": 1.041678547859192, + "learning_rate": 0.00019023131672597866, + "loss": 2.2751, + "step": 559 + }, + { + "epoch": 0.24888888888888888, + "grad_norm": 1.1849489212036133, + "learning_rate": 0.0001902135231316726, + "loss": 2.6915, + "step": 560 + }, + { + "epoch": 0.24933333333333332, + "grad_norm": 1.1290448904037476, + "learning_rate": 0.00019019572953736654, + "loss": 2.184, + "step": 561 + }, + { + "epoch": 0.24977777777777777, + "grad_norm": 1.2467504739761353, + "learning_rate": 0.0001901779359430605, + "loss": 2.2676, + "step": 562 + }, + { + "epoch": 0.25022222222222223, + "grad_norm": 1.1683002710342407, + "learning_rate": 0.00019016014234875446, + "loss": 2.4178, + "step": 563 + }, + { + "epoch": 0.25066666666666665, + "grad_norm": 1.2386951446533203, + "learning_rate": 0.0001901423487544484, + "loss": 2.4034, + "step": 564 + }, + { + "epoch": 0.2511111111111111, + "grad_norm": 1.259753704071045, + "learning_rate": 0.00019012455516014237, + "loss": 2.5685, + "step": 565 + }, + { + "epoch": 0.25155555555555553, + "grad_norm": 1.5166339874267578, + "learning_rate": 0.0001901067615658363, + "loss": 2.6635, + "step": 566 + }, + { + "epoch": 0.252, + "grad_norm": 1.23752760887146, + "learning_rate": 0.00019008896797153026, + "loss": 2.4881, + "step": 567 + }, + { + "epoch": 0.25244444444444447, + "grad_norm": 1.2667707204818726, + "learning_rate": 0.0001900711743772242, + "loss": 2.8058, + "step": 568 + }, + { + "epoch": 0.2528888888888889, + "grad_norm": 1.8489893674850464, + "learning_rate": 0.00019005338078291817, + "loss": 2.6723, + "step": 569 + }, + { + "epoch": 0.25333333333333335, + "grad_norm": 1.2785292863845825, + "learning_rate": 0.00019003558718861212, + "loss": 2.8666, + "step": 570 + }, + { + "epoch": 0.25377777777777777, + "grad_norm": 1.141205906867981, + "learning_rate": 0.00019001779359430605, + "loss": 2.1319, + "step": 571 + }, + { + "epoch": 0.25422222222222224, + "grad_norm": 1.4991300106048584, + 
"learning_rate": 0.00019, + "loss": 2.3779, + "step": 572 + }, + { + "epoch": 0.25466666666666665, + "grad_norm": 1.2517198324203491, + "learning_rate": 0.00018998220640569394, + "loss": 2.7036, + "step": 573 + }, + { + "epoch": 0.2551111111111111, + "grad_norm": 1.186219334602356, + "learning_rate": 0.0001899644128113879, + "loss": 2.6787, + "step": 574 + }, + { + "epoch": 0.25555555555555554, + "grad_norm": 1.2609152793884277, + "learning_rate": 0.00018994661921708185, + "loss": 2.5269, + "step": 575 + }, + { + "epoch": 0.256, + "grad_norm": 1.4722431898117065, + "learning_rate": 0.0001899288256227758, + "loss": 2.6287, + "step": 576 + }, + { + "epoch": 0.2564444444444444, + "grad_norm": 1.3302136659622192, + "learning_rate": 0.00018991103202846977, + "loss": 2.5439, + "step": 577 + }, + { + "epoch": 0.2568888888888889, + "grad_norm": 1.270352840423584, + "learning_rate": 0.00018989323843416372, + "loss": 2.5176, + "step": 578 + }, + { + "epoch": 0.25733333333333336, + "grad_norm": 1.2411810159683228, + "learning_rate": 0.00018987544483985765, + "loss": 2.423, + "step": 579 + }, + { + "epoch": 0.2577777777777778, + "grad_norm": 1.3175048828125, + "learning_rate": 0.0001898576512455516, + "loss": 2.364, + "step": 580 + }, + { + "epoch": 0.25822222222222224, + "grad_norm": 1.4399092197418213, + "learning_rate": 0.00018983985765124556, + "loss": 2.7466, + "step": 581 + }, + { + "epoch": 0.25866666666666666, + "grad_norm": 1.234508752822876, + "learning_rate": 0.00018982206405693952, + "loss": 2.1863, + "step": 582 + }, + { + "epoch": 0.2591111111111111, + "grad_norm": 1.6190673112869263, + "learning_rate": 0.00018980427046263348, + "loss": 2.4877, + "step": 583 + }, + { + "epoch": 0.25955555555555554, + "grad_norm": 1.159323811531067, + "learning_rate": 0.0001897864768683274, + "loss": 2.2116, + "step": 584 + }, + { + "epoch": 0.26, + "grad_norm": 1.284498929977417, + "learning_rate": 0.00018976868327402136, + "loss": 2.2746, + "step": 585 + }, + { + "epoch": 0.2604444444444444, + "grad_norm": 1.366461992263794, + "learning_rate": 0.0001897508896797153, + "loss": 2.5624, + "step": 586 + }, + { + "epoch": 0.2608888888888889, + "grad_norm": 1.4767354726791382, + "learning_rate": 0.00018973309608540925, + "loss": 2.5998, + "step": 587 + }, + { + "epoch": 0.2613333333333333, + "grad_norm": 1.6025152206420898, + "learning_rate": 0.0001897153024911032, + "loss": 2.1852, + "step": 588 + }, + { + "epoch": 0.2617777777777778, + "grad_norm": 1.54243803024292, + "learning_rate": 0.00018969750889679716, + "loss": 2.9229, + "step": 589 + }, + { + "epoch": 0.26222222222222225, + "grad_norm": 1.440328598022461, + "learning_rate": 0.00018967971530249112, + "loss": 2.3776, + "step": 590 + }, + { + "epoch": 0.26266666666666666, + "grad_norm": 1.5140371322631836, + "learning_rate": 0.00018966192170818508, + "loss": 3.043, + "step": 591 + }, + { + "epoch": 0.26311111111111113, + "grad_norm": 1.7295174598693848, + "learning_rate": 0.000189644128113879, + "loss": 2.8572, + "step": 592 + }, + { + "epoch": 0.26355555555555554, + "grad_norm": 1.5222134590148926, + "learning_rate": 0.00018962633451957296, + "loss": 2.2202, + "step": 593 + }, + { + "epoch": 0.264, + "grad_norm": 1.484958529472351, + "learning_rate": 0.00018960854092526692, + "loss": 2.1051, + "step": 594 + }, + { + "epoch": 0.2644444444444444, + "grad_norm": 1.4371466636657715, + "learning_rate": 0.00018959074733096087, + "loss": 2.301, + "step": 595 + }, + { + "epoch": 0.2648888888888889, + "grad_norm": 1.6050223112106323, + "learning_rate": 
0.00018957295373665483, + "loss": 2.4561, + "step": 596 + }, + { + "epoch": 0.2653333333333333, + "grad_norm": 1.7809783220291138, + "learning_rate": 0.00018955516014234876, + "loss": 2.3243, + "step": 597 + }, + { + "epoch": 0.2657777777777778, + "grad_norm": 1.850594401359558, + "learning_rate": 0.00018953736654804272, + "loss": 2.6873, + "step": 598 + }, + { + "epoch": 0.26622222222222225, + "grad_norm": 1.9856559038162231, + "learning_rate": 0.00018951957295373665, + "loss": 2.3005, + "step": 599 + }, + { + "epoch": 0.26666666666666666, + "grad_norm": 2.3018789291381836, + "learning_rate": 0.0001895017793594306, + "loss": 1.6366, + "step": 600 + }, + { + "epoch": 0.26711111111111113, + "grad_norm": 0.9843171834945679, + "learning_rate": 0.00018948398576512456, + "loss": 2.7333, + "step": 601 + }, + { + "epoch": 0.26755555555555555, + "grad_norm": 0.9362220764160156, + "learning_rate": 0.00018946619217081852, + "loss": 2.8176, + "step": 602 + }, + { + "epoch": 0.268, + "grad_norm": 0.9775174260139465, + "learning_rate": 0.00018944839857651247, + "loss": 2.3173, + "step": 603 + }, + { + "epoch": 0.26844444444444443, + "grad_norm": 1.0477993488311768, + "learning_rate": 0.00018943060498220643, + "loss": 2.0655, + "step": 604 + }, + { + "epoch": 0.2688888888888889, + "grad_norm": 1.0463943481445312, + "learning_rate": 0.00018941281138790036, + "loss": 1.998, + "step": 605 + }, + { + "epoch": 0.2693333333333333, + "grad_norm": 1.0541325807571411, + "learning_rate": 0.00018939501779359431, + "loss": 2.4205, + "step": 606 + }, + { + "epoch": 0.2697777777777778, + "grad_norm": 1.0537536144256592, + "learning_rate": 0.00018937722419928827, + "loss": 2.8228, + "step": 607 + }, + { + "epoch": 0.2702222222222222, + "grad_norm": 1.0244420766830444, + "learning_rate": 0.00018935943060498223, + "loss": 2.3281, + "step": 608 + }, + { + "epoch": 0.27066666666666667, + "grad_norm": 1.3767787218093872, + "learning_rate": 0.00018934163701067618, + "loss": 3.1796, + "step": 609 + }, + { + "epoch": 0.27111111111111114, + "grad_norm": 1.03878915309906, + "learning_rate": 0.0001893238434163701, + "loss": 1.968, + "step": 610 + }, + { + "epoch": 0.27155555555555555, + "grad_norm": 1.1602753400802612, + "learning_rate": 0.00018930604982206407, + "loss": 2.6853, + "step": 611 + }, + { + "epoch": 0.272, + "grad_norm": 1.0449435710906982, + "learning_rate": 0.000189288256227758, + "loss": 2.7955, + "step": 612 + }, + { + "epoch": 0.27244444444444443, + "grad_norm": 1.095615029335022, + "learning_rate": 0.00018927046263345196, + "loss": 2.299, + "step": 613 + }, + { + "epoch": 0.2728888888888889, + "grad_norm": 1.4768877029418945, + "learning_rate": 0.0001892526690391459, + "loss": 3.057, + "step": 614 + }, + { + "epoch": 0.2733333333333333, + "grad_norm": 1.276252269744873, + "learning_rate": 0.00018923487544483987, + "loss": 2.585, + "step": 615 + }, + { + "epoch": 0.2737777777777778, + "grad_norm": 1.2952446937561035, + "learning_rate": 0.00018921708185053383, + "loss": 2.5297, + "step": 616 + }, + { + "epoch": 0.2742222222222222, + "grad_norm": 1.2312525510787964, + "learning_rate": 0.00018919928825622778, + "loss": 2.6543, + "step": 617 + }, + { + "epoch": 0.27466666666666667, + "grad_norm": 1.368359923362732, + "learning_rate": 0.0001891814946619217, + "loss": 2.6951, + "step": 618 + }, + { + "epoch": 0.2751111111111111, + "grad_norm": 1.50626802444458, + "learning_rate": 0.00018916370106761567, + "loss": 2.7336, + "step": 619 + }, + { + "epoch": 0.27555555555555555, + "grad_norm": 1.2678533792495728, + 
"learning_rate": 0.00018914590747330962, + "loss": 2.4208, + "step": 620 + }, + { + "epoch": 0.276, + "grad_norm": 1.167494773864746, + "learning_rate": 0.00018912811387900358, + "loss": 2.1091, + "step": 621 + }, + { + "epoch": 0.27644444444444444, + "grad_norm": 1.12721586227417, + "learning_rate": 0.00018911032028469754, + "loss": 2.0721, + "step": 622 + }, + { + "epoch": 0.2768888888888889, + "grad_norm": 1.306931495666504, + "learning_rate": 0.00018909252669039147, + "loss": 2.4418, + "step": 623 + }, + { + "epoch": 0.2773333333333333, + "grad_norm": 1.8457114696502686, + "learning_rate": 0.00018907473309608542, + "loss": 3.0008, + "step": 624 + }, + { + "epoch": 0.2777777777777778, + "grad_norm": 1.4332703351974487, + "learning_rate": 0.00018905693950177935, + "loss": 2.3379, + "step": 625 + }, + { + "epoch": 0.2782222222222222, + "grad_norm": 1.4976214170455933, + "learning_rate": 0.0001890391459074733, + "loss": 2.8298, + "step": 626 + }, + { + "epoch": 0.2786666666666667, + "grad_norm": 1.3851099014282227, + "learning_rate": 0.00018902135231316727, + "loss": 2.608, + "step": 627 + }, + { + "epoch": 0.2791111111111111, + "grad_norm": 1.3901604413986206, + "learning_rate": 0.00018900355871886122, + "loss": 2.4981, + "step": 628 + }, + { + "epoch": 0.27955555555555556, + "grad_norm": 1.5062224864959717, + "learning_rate": 0.00018898576512455518, + "loss": 2.4534, + "step": 629 + }, + { + "epoch": 0.28, + "grad_norm": 1.3895263671875, + "learning_rate": 0.00018896797153024913, + "loss": 2.5953, + "step": 630 + }, + { + "epoch": 0.28044444444444444, + "grad_norm": 1.5202879905700684, + "learning_rate": 0.00018895017793594306, + "loss": 2.2741, + "step": 631 + }, + { + "epoch": 0.2808888888888889, + "grad_norm": 1.2481839656829834, + "learning_rate": 0.00018893238434163702, + "loss": 2.3242, + "step": 632 + }, + { + "epoch": 0.2813333333333333, + "grad_norm": 1.2710275650024414, + "learning_rate": 0.00018891459074733098, + "loss": 2.1379, + "step": 633 + }, + { + "epoch": 0.2817777777777778, + "grad_norm": 1.3879283666610718, + "learning_rate": 0.00018889679715302493, + "loss": 1.353, + "step": 634 + }, + { + "epoch": 0.2822222222222222, + "grad_norm": 1.5457983016967773, + "learning_rate": 0.0001888790035587189, + "loss": 2.3432, + "step": 635 + }, + { + "epoch": 0.2826666666666667, + "grad_norm": 1.5545676946640015, + "learning_rate": 0.00018886120996441282, + "loss": 2.6554, + "step": 636 + }, + { + "epoch": 0.2831111111111111, + "grad_norm": 1.2374818325042725, + "learning_rate": 0.00018884341637010678, + "loss": 1.4633, + "step": 637 + }, + { + "epoch": 0.28355555555555556, + "grad_norm": 1.3478444814682007, + "learning_rate": 0.0001888256227758007, + "loss": 2.4976, + "step": 638 + }, + { + "epoch": 0.284, + "grad_norm": 1.5220305919647217, + "learning_rate": 0.00018880782918149466, + "loss": 2.6628, + "step": 639 + }, + { + "epoch": 0.28444444444444444, + "grad_norm": 1.6763924360275269, + "learning_rate": 0.00018879003558718862, + "loss": 2.7547, + "step": 640 + }, + { + "epoch": 0.2848888888888889, + "grad_norm": 1.5462572574615479, + "learning_rate": 0.00018877224199288258, + "loss": 2.6934, + "step": 641 + }, + { + "epoch": 0.2853333333333333, + "grad_norm": 1.9419124126434326, + "learning_rate": 0.00018875444839857653, + "loss": 2.5849, + "step": 642 + }, + { + "epoch": 0.2857777777777778, + "grad_norm": 1.6451220512390137, + "learning_rate": 0.00018873665480427046, + "loss": 2.7719, + "step": 643 + }, + { + "epoch": 0.2862222222222222, + "grad_norm": 1.5274759531021118, + 
"learning_rate": 0.00018871886120996442, + "loss": 2.2116, + "step": 644 + }, + { + "epoch": 0.2866666666666667, + "grad_norm": 2.1637439727783203, + "learning_rate": 0.00018870106761565837, + "loss": 3.0492, + "step": 645 + }, + { + "epoch": 0.2871111111111111, + "grad_norm": 1.6823992729187012, + "learning_rate": 0.00018868327402135233, + "loss": 2.7498, + "step": 646 + }, + { + "epoch": 0.28755555555555556, + "grad_norm": 1.695084810256958, + "learning_rate": 0.0001886654804270463, + "loss": 2.6973, + "step": 647 + }, + { + "epoch": 0.288, + "grad_norm": 1.8430315256118774, + "learning_rate": 0.00018864768683274024, + "loss": 1.9394, + "step": 648 + }, + { + "epoch": 0.28844444444444445, + "grad_norm": 2.2120563983917236, + "learning_rate": 0.00018862989323843417, + "loss": 1.9077, + "step": 649 + }, + { + "epoch": 0.28888888888888886, + "grad_norm": 2.309023141860962, + "learning_rate": 0.0001886120996441281, + "loss": 2.15, + "step": 650 + }, + { + "epoch": 0.28933333333333333, + "grad_norm": 1.1258161067962646, + "learning_rate": 0.00018859430604982206, + "loss": 2.9532, + "step": 651 + }, + { + "epoch": 0.2897777777777778, + "grad_norm": 0.9455536603927612, + "learning_rate": 0.00018857651245551602, + "loss": 2.695, + "step": 652 + }, + { + "epoch": 0.2902222222222222, + "grad_norm": 1.228993535041809, + "learning_rate": 0.00018855871886120997, + "loss": 1.3139, + "step": 653 + }, + { + "epoch": 0.2906666666666667, + "grad_norm": 1.0988825559616089, + "learning_rate": 0.00018854092526690393, + "loss": 2.3994, + "step": 654 + }, + { + "epoch": 0.2911111111111111, + "grad_norm": 1.0510218143463135, + "learning_rate": 0.00018852313167259788, + "loss": 2.9534, + "step": 655 + }, + { + "epoch": 0.29155555555555557, + "grad_norm": 1.1386710405349731, + "learning_rate": 0.00018850533807829181, + "loss": 2.2164, + "step": 656 + }, + { + "epoch": 0.292, + "grad_norm": 1.1761900186538696, + "learning_rate": 0.00018848754448398577, + "loss": 2.6014, + "step": 657 + }, + { + "epoch": 0.29244444444444445, + "grad_norm": 1.0478448867797852, + "learning_rate": 0.00018846975088967973, + "loss": 2.497, + "step": 658 + }, + { + "epoch": 0.29288888888888887, + "grad_norm": 1.2289860248565674, + "learning_rate": 0.00018845195729537368, + "loss": 3.0106, + "step": 659 + }, + { + "epoch": 0.29333333333333333, + "grad_norm": 1.2881073951721191, + "learning_rate": 0.00018843416370106764, + "loss": 2.5109, + "step": 660 + }, + { + "epoch": 0.2937777777777778, + "grad_norm": 1.2944267988204956, + "learning_rate": 0.00018841637010676157, + "loss": 2.3959, + "step": 661 + }, + { + "epoch": 0.2942222222222222, + "grad_norm": 1.1457511186599731, + "learning_rate": 0.00018839857651245553, + "loss": 2.5717, + "step": 662 + }, + { + "epoch": 0.2946666666666667, + "grad_norm": 1.288615345954895, + "learning_rate": 0.00018838078291814946, + "loss": 2.6314, + "step": 663 + }, + { + "epoch": 0.2951111111111111, + "grad_norm": 1.153175711631775, + "learning_rate": 0.0001883629893238434, + "loss": 2.1529, + "step": 664 + }, + { + "epoch": 0.29555555555555557, + "grad_norm": 1.0861622095108032, + "learning_rate": 0.00018834519572953737, + "loss": 2.2134, + "step": 665 + }, + { + "epoch": 0.296, + "grad_norm": 1.3027865886688232, + "learning_rate": 0.00018832740213523132, + "loss": 2.496, + "step": 666 + }, + { + "epoch": 0.29644444444444445, + "grad_norm": 1.2084095478057861, + "learning_rate": 0.00018830960854092528, + "loss": 2.6488, + "step": 667 + }, + { + "epoch": 0.29688888888888887, + "grad_norm": 
1.2126069068908691, + "learning_rate": 0.00018829181494661924, + "loss": 2.2976, + "step": 668 + }, + { + "epoch": 0.29733333333333334, + "grad_norm": 1.392379879951477, + "learning_rate": 0.00018827402135231317, + "loss": 2.25, + "step": 669 + }, + { + "epoch": 0.29777777777777775, + "grad_norm": 1.228171944618225, + "learning_rate": 0.00018825622775800712, + "loss": 2.4179, + "step": 670 + }, + { + "epoch": 0.2982222222222222, + "grad_norm": 1.2194924354553223, + "learning_rate": 0.00018823843416370108, + "loss": 2.5822, + "step": 671 + }, + { + "epoch": 0.2986666666666667, + "grad_norm": 1.2600946426391602, + "learning_rate": 0.00018822064056939504, + "loss": 2.3714, + "step": 672 + }, + { + "epoch": 0.2991111111111111, + "grad_norm": 1.2756378650665283, + "learning_rate": 0.000188202846975089, + "loss": 2.3046, + "step": 673 + }, + { + "epoch": 0.2995555555555556, + "grad_norm": 1.4584718942642212, + "learning_rate": 0.00018818505338078292, + "loss": 2.6382, + "step": 674 + }, + { + "epoch": 0.3, + "grad_norm": 1.393619179725647, + "learning_rate": 0.00018816725978647688, + "loss": 2.7167, + "step": 675 + }, + { + "epoch": 0.30044444444444446, + "grad_norm": 1.6865702867507935, + "learning_rate": 0.0001881494661921708, + "loss": 2.446, + "step": 676 + }, + { + "epoch": 0.3008888888888889, + "grad_norm": 1.3696800470352173, + "learning_rate": 0.00018813167259786477, + "loss": 2.5134, + "step": 677 + }, + { + "epoch": 0.30133333333333334, + "grad_norm": 1.3241018056869507, + "learning_rate": 0.00018811387900355872, + "loss": 2.7836, + "step": 678 + }, + { + "epoch": 0.30177777777777776, + "grad_norm": 1.3688435554504395, + "learning_rate": 0.00018809608540925268, + "loss": 2.8028, + "step": 679 + }, + { + "epoch": 0.3022222222222222, + "grad_norm": 2.019115447998047, + "learning_rate": 0.00018807829181494663, + "loss": 2.4958, + "step": 680 + }, + { + "epoch": 0.30266666666666664, + "grad_norm": 1.3393666744232178, + "learning_rate": 0.0001880604982206406, + "loss": 2.4144, + "step": 681 + }, + { + "epoch": 0.3031111111111111, + "grad_norm": 1.5808879137039185, + "learning_rate": 0.00018804270462633452, + "loss": 1.5098, + "step": 682 + }, + { + "epoch": 0.3035555555555556, + "grad_norm": 1.5631835460662842, + "learning_rate": 0.00018802491103202848, + "loss": 2.6174, + "step": 683 + }, + { + "epoch": 0.304, + "grad_norm": 1.338965892791748, + "learning_rate": 0.00018800711743772243, + "loss": 2.933, + "step": 684 + }, + { + "epoch": 0.30444444444444446, + "grad_norm": 1.4270402193069458, + "learning_rate": 0.0001879893238434164, + "loss": 2.4163, + "step": 685 + }, + { + "epoch": 0.3048888888888889, + "grad_norm": 1.6511561870574951, + "learning_rate": 0.00018797153024911035, + "loss": 2.8065, + "step": 686 + }, + { + "epoch": 0.30533333333333335, + "grad_norm": 1.3582799434661865, + "learning_rate": 0.00018795373665480428, + "loss": 2.4156, + "step": 687 + }, + { + "epoch": 0.30577777777777776, + "grad_norm": 1.3298442363739014, + "learning_rate": 0.00018793594306049823, + "loss": 2.7763, + "step": 688 + }, + { + "epoch": 0.30622222222222223, + "grad_norm": 1.5233420133590698, + "learning_rate": 0.00018791814946619216, + "loss": 2.605, + "step": 689 + }, + { + "epoch": 0.30666666666666664, + "grad_norm": 1.3514484167099, + "learning_rate": 0.00018790035587188612, + "loss": 2.6729, + "step": 690 + }, + { + "epoch": 0.3071111111111111, + "grad_norm": 1.5793657302856445, + "learning_rate": 0.00018788256227758007, + "loss": 2.8392, + "step": 691 + }, + { + "epoch": 0.3075555555555556, + 
"grad_norm": 1.590437889099121, + "learning_rate": 0.00018786476868327403, + "loss": 2.445, + "step": 692 + }, + { + "epoch": 0.308, + "grad_norm": 1.5807512998580933, + "learning_rate": 0.000187846975088968, + "loss": 2.9341, + "step": 693 + }, + { + "epoch": 0.30844444444444447, + "grad_norm": 1.5815593004226685, + "learning_rate": 0.00018782918149466194, + "loss": 2.5062, + "step": 694 + }, + { + "epoch": 0.3088888888888889, + "grad_norm": 1.6342856884002686, + "learning_rate": 0.00018781138790035587, + "loss": 2.769, + "step": 695 + }, + { + "epoch": 0.30933333333333335, + "grad_norm": 1.8067598342895508, + "learning_rate": 0.00018779359430604983, + "loss": 2.3512, + "step": 696 + }, + { + "epoch": 0.30977777777777776, + "grad_norm": 1.7684136629104614, + "learning_rate": 0.0001877758007117438, + "loss": 3.2031, + "step": 697 + }, + { + "epoch": 0.31022222222222223, + "grad_norm": 1.7312310934066772, + "learning_rate": 0.00018775800711743774, + "loss": 2.7118, + "step": 698 + }, + { + "epoch": 0.31066666666666665, + "grad_norm": 1.7264273166656494, + "learning_rate": 0.0001877402135231317, + "loss": 2.7106, + "step": 699 + }, + { + "epoch": 0.3111111111111111, + "grad_norm": 1.8946504592895508, + "learning_rate": 0.00018772241992882563, + "loss": 1.8234, + "step": 700 + }, + { + "epoch": 0.31155555555555553, + "grad_norm": 0.897571325302124, + "learning_rate": 0.00018770462633451959, + "loss": 2.4317, + "step": 701 + }, + { + "epoch": 0.312, + "grad_norm": 1.212544560432434, + "learning_rate": 0.00018768683274021351, + "loss": 1.817, + "step": 702 + }, + { + "epoch": 0.31244444444444447, + "grad_norm": 1.4370547533035278, + "learning_rate": 0.00018766903914590747, + "loss": 1.6912, + "step": 703 + }, + { + "epoch": 0.3128888888888889, + "grad_norm": 1.2379952669143677, + "learning_rate": 0.00018765124555160143, + "loss": 2.8551, + "step": 704 + }, + { + "epoch": 0.31333333333333335, + "grad_norm": 1.2043800354003906, + "learning_rate": 0.00018763345195729538, + "loss": 2.2021, + "step": 705 + }, + { + "epoch": 0.31377777777777777, + "grad_norm": 1.1653484106063843, + "learning_rate": 0.00018761565836298934, + "loss": 2.97, + "step": 706 + }, + { + "epoch": 0.31422222222222224, + "grad_norm": 1.2504090070724487, + "learning_rate": 0.0001875978647686833, + "loss": 2.8417, + "step": 707 + }, + { + "epoch": 0.31466666666666665, + "grad_norm": 1.186420202255249, + "learning_rate": 0.00018758007117437723, + "loss": 2.2231, + "step": 708 + }, + { + "epoch": 0.3151111111111111, + "grad_norm": 1.3034793138504028, + "learning_rate": 0.00018756227758007118, + "loss": 2.7367, + "step": 709 + }, + { + "epoch": 0.31555555555555553, + "grad_norm": 1.1502488851547241, + "learning_rate": 0.00018754448398576514, + "loss": 3.0465, + "step": 710 + }, + { + "epoch": 0.316, + "grad_norm": 1.3152366876602173, + "learning_rate": 0.0001875266903914591, + "loss": 3.0754, + "step": 711 + }, + { + "epoch": 0.3164444444444444, + "grad_norm": 1.0930118560791016, + "learning_rate": 0.00018750889679715305, + "loss": 2.3183, + "step": 712 + }, + { + "epoch": 0.3168888888888889, + "grad_norm": 1.3411848545074463, + "learning_rate": 0.00018749110320284698, + "loss": 3.0055, + "step": 713 + }, + { + "epoch": 0.31733333333333336, + "grad_norm": 1.222649097442627, + "learning_rate": 0.00018747330960854094, + "loss": 2.541, + "step": 714 + }, + { + "epoch": 0.31777777777777777, + "grad_norm": 1.3231635093688965, + "learning_rate": 0.00018745551601423487, + "loss": 3.1424, + "step": 715 + }, + { + "epoch": 
0.31822222222222224, + "grad_norm": 1.3029333353042603, + "learning_rate": 0.00018743772241992882, + "loss": 2.736, + "step": 716 + }, + { + "epoch": 0.31866666666666665, + "grad_norm": 1.1651556491851807, + "learning_rate": 0.00018741992882562278, + "loss": 2.4525, + "step": 717 + }, + { + "epoch": 0.3191111111111111, + "grad_norm": 1.1928997039794922, + "learning_rate": 0.00018740213523131674, + "loss": 2.338, + "step": 718 + }, + { + "epoch": 0.31955555555555554, + "grad_norm": 1.2094029188156128, + "learning_rate": 0.0001873843416370107, + "loss": 2.1728, + "step": 719 + }, + { + "epoch": 0.32, + "grad_norm": 1.3674081563949585, + "learning_rate": 0.00018736654804270465, + "loss": 2.7086, + "step": 720 + }, + { + "epoch": 0.3204444444444444, + "grad_norm": 1.2240111827850342, + "learning_rate": 0.00018734875444839858, + "loss": 2.4084, + "step": 721 + }, + { + "epoch": 0.3208888888888889, + "grad_norm": 1.625939965248108, + "learning_rate": 0.00018733096085409254, + "loss": 2.233, + "step": 722 + }, + { + "epoch": 0.32133333333333336, + "grad_norm": 1.572806477546692, + "learning_rate": 0.0001873131672597865, + "loss": 2.8494, + "step": 723 + }, + { + "epoch": 0.3217777777777778, + "grad_norm": 1.4796736240386963, + "learning_rate": 0.00018729537366548045, + "loss": 2.5345, + "step": 724 + }, + { + "epoch": 0.32222222222222224, + "grad_norm": 1.5258103609085083, + "learning_rate": 0.0001872775800711744, + "loss": 2.4471, + "step": 725 + }, + { + "epoch": 0.32266666666666666, + "grad_norm": 1.5818583965301514, + "learning_rate": 0.00018725978647686834, + "loss": 2.9746, + "step": 726 + }, + { + "epoch": 0.3231111111111111, + "grad_norm": 1.275765061378479, + "learning_rate": 0.00018724199288256226, + "loss": 2.4232, + "step": 727 + }, + { + "epoch": 0.32355555555555554, + "grad_norm": 1.3722361326217651, + "learning_rate": 0.00018722419928825622, + "loss": 1.4788, + "step": 728 + }, + { + "epoch": 0.324, + "grad_norm": 1.369632363319397, + "learning_rate": 0.00018720640569395018, + "loss": 2.5712, + "step": 729 + }, + { + "epoch": 0.3244444444444444, + "grad_norm": 1.5968562364578247, + "learning_rate": 0.00018718861209964413, + "loss": 2.3008, + "step": 730 + }, + { + "epoch": 0.3248888888888889, + "grad_norm": 1.6501327753067017, + "learning_rate": 0.0001871708185053381, + "loss": 2.5323, + "step": 731 + }, + { + "epoch": 0.3253333333333333, + "grad_norm": 1.5736616849899292, + "learning_rate": 0.00018715302491103205, + "loss": 2.2086, + "step": 732 + }, + { + "epoch": 0.3257777777777778, + "grad_norm": 1.4434736967086792, + "learning_rate": 0.00018713523131672598, + "loss": 2.613, + "step": 733 + }, + { + "epoch": 0.32622222222222225, + "grad_norm": 1.5532594919204712, + "learning_rate": 0.00018711743772241993, + "loss": 2.4163, + "step": 734 + }, + { + "epoch": 0.32666666666666666, + "grad_norm": 1.4101078510284424, + "learning_rate": 0.0001870996441281139, + "loss": 2.6866, + "step": 735 + }, + { + "epoch": 0.32711111111111113, + "grad_norm": 1.3974218368530273, + "learning_rate": 0.00018708185053380785, + "loss": 2.0701, + "step": 736 + }, + { + "epoch": 0.32755555555555554, + "grad_norm": 1.8705499172210693, + "learning_rate": 0.0001870640569395018, + "loss": 2.5967, + "step": 737 + }, + { + "epoch": 0.328, + "grad_norm": 1.7035057544708252, + "learning_rate": 0.00018704626334519576, + "loss": 2.589, + "step": 738 + }, + { + "epoch": 0.32844444444444443, + "grad_norm": 1.477556824684143, + "learning_rate": 0.0001870284697508897, + "loss": 2.576, + "step": 739 + }, + { + "epoch": 
0.3288888888888889, + "grad_norm": 1.5129868984222412, + "learning_rate": 0.00018701067615658362, + "loss": 2.5122, + "step": 740 + }, + { + "epoch": 0.3293333333333333, + "grad_norm": 1.453865647315979, + "learning_rate": 0.00018699288256227757, + "loss": 2.1861, + "step": 741 + }, + { + "epoch": 0.3297777777777778, + "grad_norm": 1.3834642171859741, + "learning_rate": 0.00018697508896797153, + "loss": 2.3745, + "step": 742 + }, + { + "epoch": 0.3302222222222222, + "grad_norm": 1.610922932624817, + "learning_rate": 0.0001869572953736655, + "loss": 2.6936, + "step": 743 + }, + { + "epoch": 0.33066666666666666, + "grad_norm": 1.550073266029358, + "learning_rate": 0.00018693950177935944, + "loss": 2.3292, + "step": 744 + }, + { + "epoch": 0.33111111111111113, + "grad_norm": 1.5677452087402344, + "learning_rate": 0.0001869217081850534, + "loss": 2.8312, + "step": 745 + }, + { + "epoch": 0.33155555555555555, + "grad_norm": 1.6848689317703247, + "learning_rate": 0.00018690391459074733, + "loss": 2.9414, + "step": 746 + }, + { + "epoch": 0.332, + "grad_norm": 1.5544129610061646, + "learning_rate": 0.00018688612099644129, + "loss": 2.3318, + "step": 747 + }, + { + "epoch": 0.33244444444444443, + "grad_norm": 1.705752968788147, + "learning_rate": 0.00018686832740213524, + "loss": 2.4389, + "step": 748 + }, + { + "epoch": 0.3328888888888889, + "grad_norm": 1.7628095149993896, + "learning_rate": 0.0001868505338078292, + "loss": 2.7991, + "step": 749 + }, + { + "epoch": 0.3333333333333333, + "grad_norm": 2.3342695236206055, + "learning_rate": 0.00018683274021352316, + "loss": 2.8554, + "step": 750 + }, + { + "epoch": 0.3337777777777778, + "grad_norm": 0.9579228758811951, + "learning_rate": 0.0001868149466192171, + "loss": 3.0001, + "step": 751 + }, + { + "epoch": 0.3342222222222222, + "grad_norm": 0.910830020904541, + "learning_rate": 0.00018679715302491104, + "loss": 2.7247, + "step": 752 + }, + { + "epoch": 0.33466666666666667, + "grad_norm": 0.9618215560913086, + "learning_rate": 0.00018677935943060497, + "loss": 2.1627, + "step": 753 + }, + { + "epoch": 0.33511111111111114, + "grad_norm": 1.398019790649414, + "learning_rate": 0.00018676156583629893, + "loss": 2.8774, + "step": 754 + }, + { + "epoch": 0.33555555555555555, + "grad_norm": 0.98659348487854, + "learning_rate": 0.00018674377224199288, + "loss": 2.207, + "step": 755 + }, + { + "epoch": 0.336, + "grad_norm": 1.0544646978378296, + "learning_rate": 0.00018672597864768684, + "loss": 2.5833, + "step": 756 + }, + { + "epoch": 0.33644444444444443, + "grad_norm": 1.1433809995651245, + "learning_rate": 0.0001867081850533808, + "loss": 2.1706, + "step": 757 + }, + { + "epoch": 0.3368888888888889, + "grad_norm": 1.170425295829773, + "learning_rate": 0.00018669039145907475, + "loss": 2.3311, + "step": 758 + }, + { + "epoch": 0.3373333333333333, + "grad_norm": 1.2254000902175903, + "learning_rate": 0.00018667259786476868, + "loss": 2.671, + "step": 759 + }, + { + "epoch": 0.3377777777777778, + "grad_norm": 1.2437927722930908, + "learning_rate": 0.00018665480427046264, + "loss": 2.9828, + "step": 760 + }, + { + "epoch": 0.3382222222222222, + "grad_norm": 1.3588943481445312, + "learning_rate": 0.0001866370106761566, + "loss": 2.8734, + "step": 761 + }, + { + "epoch": 0.33866666666666667, + "grad_norm": 1.5109245777130127, + "learning_rate": 0.00018661921708185055, + "loss": 2.0159, + "step": 762 + }, + { + "epoch": 0.3391111111111111, + "grad_norm": 1.2116190195083618, + "learning_rate": 0.0001866014234875445, + "loss": 2.1226, + "step": 763 + }, + { 
+ "epoch": 0.33955555555555555, + "grad_norm": 1.3050971031188965, + "learning_rate": 0.00018658362989323847, + "loss": 2.2373, + "step": 764 + }, + { + "epoch": 0.34, + "grad_norm": 1.2877930402755737, + "learning_rate": 0.0001865658362989324, + "loss": 2.5477, + "step": 765 + }, + { + "epoch": 0.34044444444444444, + "grad_norm": 1.306307077407837, + "learning_rate": 0.00018654804270462632, + "loss": 2.7654, + "step": 766 + }, + { + "epoch": 0.3408888888888889, + "grad_norm": 1.3450534343719482, + "learning_rate": 0.00018653024911032028, + "loss": 1.9589, + "step": 767 + }, + { + "epoch": 0.3413333333333333, + "grad_norm": 1.574854850769043, + "learning_rate": 0.00018651245551601424, + "loss": 2.778, + "step": 768 + }, + { + "epoch": 0.3417777777777778, + "grad_norm": 1.3935576677322388, + "learning_rate": 0.0001864946619217082, + "loss": 2.8835, + "step": 769 + }, + { + "epoch": 0.3422222222222222, + "grad_norm": 1.169109582901001, + "learning_rate": 0.00018647686832740215, + "loss": 1.8586, + "step": 770 + }, + { + "epoch": 0.3426666666666667, + "grad_norm": 1.6845237016677856, + "learning_rate": 0.0001864590747330961, + "loss": 2.3178, + "step": 771 + }, + { + "epoch": 0.3431111111111111, + "grad_norm": 2.100719690322876, + "learning_rate": 0.00018644128113879004, + "loss": 2.5948, + "step": 772 + }, + { + "epoch": 0.34355555555555556, + "grad_norm": 1.3049522638320923, + "learning_rate": 0.000186423487544484, + "loss": 2.2732, + "step": 773 + }, + { + "epoch": 0.344, + "grad_norm": 1.610150933265686, + "learning_rate": 0.00018640569395017795, + "loss": 2.658, + "step": 774 + }, + { + "epoch": 0.34444444444444444, + "grad_norm": 2.3013813495635986, + "learning_rate": 0.0001863879003558719, + "loss": 1.0201, + "step": 775 + }, + { + "epoch": 0.3448888888888889, + "grad_norm": 1.3802649974822998, + "learning_rate": 0.00018637010676156586, + "loss": 2.4351, + "step": 776 + }, + { + "epoch": 0.3453333333333333, + "grad_norm": 1.4456000328063965, + "learning_rate": 0.0001863523131672598, + "loss": 2.9545, + "step": 777 + }, + { + "epoch": 0.3457777777777778, + "grad_norm": 1.7522518634796143, + "learning_rate": 0.00018633451957295375, + "loss": 2.3526, + "step": 778 + }, + { + "epoch": 0.3462222222222222, + "grad_norm": 1.460486650466919, + "learning_rate": 0.00018631672597864768, + "loss": 2.9393, + "step": 779 + }, + { + "epoch": 0.3466666666666667, + "grad_norm": 1.650462031364441, + "learning_rate": 0.00018629893238434163, + "loss": 2.8534, + "step": 780 + }, + { + "epoch": 0.3471111111111111, + "grad_norm": 1.3861716985702515, + "learning_rate": 0.0001862811387900356, + "loss": 2.7778, + "step": 781 + }, + { + "epoch": 0.34755555555555556, + "grad_norm": 1.4128412008285522, + "learning_rate": 0.00018626334519572955, + "loss": 2.5953, + "step": 782 + }, + { + "epoch": 0.348, + "grad_norm": 1.5334755182266235, + "learning_rate": 0.0001862455516014235, + "loss": 2.4878, + "step": 783 + }, + { + "epoch": 0.34844444444444445, + "grad_norm": 1.4212511777877808, + "learning_rate": 0.00018622775800711746, + "loss": 2.4014, + "step": 784 + }, + { + "epoch": 0.3488888888888889, + "grad_norm": 1.7563406229019165, + "learning_rate": 0.0001862099644128114, + "loss": 2.8054, + "step": 785 + }, + { + "epoch": 0.34933333333333333, + "grad_norm": 1.8490791320800781, + "learning_rate": 0.00018619217081850535, + "loss": 2.5487, + "step": 786 + }, + { + "epoch": 0.3497777777777778, + "grad_norm": 1.55039644241333, + "learning_rate": 0.0001861743772241993, + "loss": 2.1236, + "step": 787 + }, + { + 
"epoch": 0.3502222222222222, + "grad_norm": 1.187299132347107, + "learning_rate": 0.00018615658362989326, + "loss": 1.773, + "step": 788 + }, + { + "epoch": 0.3506666666666667, + "grad_norm": 1.3866082429885864, + "learning_rate": 0.00018613879003558722, + "loss": 2.0484, + "step": 789 + }, + { + "epoch": 0.3511111111111111, + "grad_norm": 1.5214849710464478, + "learning_rate": 0.00018612099644128114, + "loss": 2.5582, + "step": 790 + }, + { + "epoch": 0.35155555555555557, + "grad_norm": 1.9252493381500244, + "learning_rate": 0.0001861032028469751, + "loss": 2.6299, + "step": 791 + }, + { + "epoch": 0.352, + "grad_norm": 2.005993366241455, + "learning_rate": 0.00018608540925266903, + "loss": 2.9747, + "step": 792 + }, + { + "epoch": 0.35244444444444445, + "grad_norm": 1.4414646625518799, + "learning_rate": 0.000186067615658363, + "loss": 2.3433, + "step": 793 + }, + { + "epoch": 0.35288888888888886, + "grad_norm": 1.6313060522079468, + "learning_rate": 0.00018604982206405694, + "loss": 2.4237, + "step": 794 + }, + { + "epoch": 0.35333333333333333, + "grad_norm": 1.917683720588684, + "learning_rate": 0.0001860320284697509, + "loss": 2.7755, + "step": 795 + }, + { + "epoch": 0.3537777777777778, + "grad_norm": 1.681670069694519, + "learning_rate": 0.00018601423487544486, + "loss": 2.5065, + "step": 796 + }, + { + "epoch": 0.3542222222222222, + "grad_norm": 1.6937282085418701, + "learning_rate": 0.0001859964412811388, + "loss": 2.1956, + "step": 797 + }, + { + "epoch": 0.3546666666666667, + "grad_norm": 1.8412083387374878, + "learning_rate": 0.00018597864768683274, + "loss": 2.2921, + "step": 798 + }, + { + "epoch": 0.3551111111111111, + "grad_norm": 2.1753652095794678, + "learning_rate": 0.0001859608540925267, + "loss": 2.0335, + "step": 799 + }, + { + "epoch": 0.35555555555555557, + "grad_norm": 3.113302707672119, + "learning_rate": 0.00018594306049822066, + "loss": 2.3211, + "step": 800 + }, + { + "epoch": 0.356, + "grad_norm": 1.0349335670471191, + "learning_rate": 0.0001859252669039146, + "loss": 2.5146, + "step": 801 + }, + { + "epoch": 0.35644444444444445, + "grad_norm": 1.299102544784546, + "learning_rate": 0.00018590747330960857, + "loss": 2.537, + "step": 802 + }, + { + "epoch": 0.35688888888888887, + "grad_norm": 1.0697323083877563, + "learning_rate": 0.0001858896797153025, + "loss": 2.2026, + "step": 803 + }, + { + "epoch": 0.35733333333333334, + "grad_norm": 1.0610216856002808, + "learning_rate": 0.00018587188612099645, + "loss": 2.6074, + "step": 804 + }, + { + "epoch": 0.35777777777777775, + "grad_norm": 1.3162498474121094, + "learning_rate": 0.00018585409252669038, + "loss": 3.1557, + "step": 805 + }, + { + "epoch": 0.3582222222222222, + "grad_norm": 1.2941645383834839, + "learning_rate": 0.00018583629893238434, + "loss": 2.8547, + "step": 806 + }, + { + "epoch": 0.3586666666666667, + "grad_norm": 1.1461007595062256, + "learning_rate": 0.0001858185053380783, + "loss": 2.7406, + "step": 807 + }, + { + "epoch": 0.3591111111111111, + "grad_norm": 1.1096692085266113, + "learning_rate": 0.00018580071174377225, + "loss": 2.3455, + "step": 808 + }, + { + "epoch": 0.3595555555555556, + "grad_norm": 1.158469796180725, + "learning_rate": 0.0001857829181494662, + "loss": 2.9105, + "step": 809 + }, + { + "epoch": 0.36, + "grad_norm": 1.1534368991851807, + "learning_rate": 0.00018576512455516017, + "loss": 2.6555, + "step": 810 + }, + { + "epoch": 0.36044444444444446, + "grad_norm": 1.1266659498214722, + "learning_rate": 0.0001857473309608541, + "loss": 2.7283, + "step": 811 + }, + { + 
"epoch": 0.36088888888888887, + "grad_norm": 1.1437948942184448, + "learning_rate": 0.00018572953736654805, + "loss": 2.4803, + "step": 812 + }, + { + "epoch": 0.36133333333333334, + "grad_norm": 1.182286262512207, + "learning_rate": 0.000185711743772242, + "loss": 2.6214, + "step": 813 + }, + { + "epoch": 0.36177777777777775, + "grad_norm": 1.253722071647644, + "learning_rate": 0.00018569395017793596, + "loss": 2.6758, + "step": 814 + }, + { + "epoch": 0.3622222222222222, + "grad_norm": 1.2334574460983276, + "learning_rate": 0.00018567615658362992, + "loss": 2.6005, + "step": 815 + }, + { + "epoch": 0.3626666666666667, + "grad_norm": 1.1198886632919312, + "learning_rate": 0.00018565836298932385, + "loss": 2.4263, + "step": 816 + }, + { + "epoch": 0.3631111111111111, + "grad_norm": 1.501847743988037, + "learning_rate": 0.00018564056939501778, + "loss": 3.3744, + "step": 817 + }, + { + "epoch": 0.3635555555555556, + "grad_norm": 1.3934186697006226, + "learning_rate": 0.00018562277580071174, + "loss": 3.2117, + "step": 818 + }, + { + "epoch": 0.364, + "grad_norm": 1.3710157871246338, + "learning_rate": 0.0001856049822064057, + "loss": 1.9331, + "step": 819 + }, + { + "epoch": 0.36444444444444446, + "grad_norm": 1.4316257238388062, + "learning_rate": 0.00018558718861209965, + "loss": 1.9532, + "step": 820 + }, + { + "epoch": 0.3648888888888889, + "grad_norm": 1.8586760759353638, + "learning_rate": 0.0001855693950177936, + "loss": 2.8284, + "step": 821 + }, + { + "epoch": 0.36533333333333334, + "grad_norm": 1.5303040742874146, + "learning_rate": 0.00018555160142348756, + "loss": 2.0609, + "step": 822 + }, + { + "epoch": 0.36577777777777776, + "grad_norm": 1.5688817501068115, + "learning_rate": 0.0001855338078291815, + "loss": 2.5499, + "step": 823 + }, + { + "epoch": 0.3662222222222222, + "grad_norm": 1.3707939386367798, + "learning_rate": 0.00018551601423487545, + "loss": 2.4933, + "step": 824 + }, + { + "epoch": 0.36666666666666664, + "grad_norm": 1.435909628868103, + "learning_rate": 0.0001854982206405694, + "loss": 2.7909, + "step": 825 + }, + { + "epoch": 0.3671111111111111, + "grad_norm": 1.5045204162597656, + "learning_rate": 0.00018548042704626336, + "loss": 2.4644, + "step": 826 + }, + { + "epoch": 0.3675555555555556, + "grad_norm": 1.3811103105545044, + "learning_rate": 0.00018546263345195732, + "loss": 2.1934, + "step": 827 + }, + { + "epoch": 0.368, + "grad_norm": 1.7939069271087646, + "learning_rate": 0.00018544483985765127, + "loss": 2.2179, + "step": 828 + }, + { + "epoch": 0.36844444444444446, + "grad_norm": 1.7151756286621094, + "learning_rate": 0.0001854270462633452, + "loss": 2.9784, + "step": 829 + }, + { + "epoch": 0.3688888888888889, + "grad_norm": 1.6932202577590942, + "learning_rate": 0.00018540925266903913, + "loss": 2.5832, + "step": 830 + }, + { + "epoch": 0.36933333333333335, + "grad_norm": 1.4874944686889648, + "learning_rate": 0.0001853914590747331, + "loss": 2.2795, + "step": 831 + }, + { + "epoch": 0.36977777777777776, + "grad_norm": 1.5106111764907837, + "learning_rate": 0.00018537366548042705, + "loss": 1.6537, + "step": 832 + }, + { + "epoch": 0.37022222222222223, + "grad_norm": 1.5347083806991577, + "learning_rate": 0.000185355871886121, + "loss": 2.5899, + "step": 833 + }, + { + "epoch": 0.37066666666666664, + "grad_norm": 1.2724993228912354, + "learning_rate": 0.00018533807829181496, + "loss": 2.4979, + "step": 834 + }, + { + "epoch": 0.3711111111111111, + "grad_norm": 1.5324300527572632, + "learning_rate": 0.00018532028469750892, + "loss": 2.4469, + 
"step": 835 + }, + { + "epoch": 0.37155555555555553, + "grad_norm": 1.6249970197677612, + "learning_rate": 0.00018530249110320285, + "loss": 2.6245, + "step": 836 + }, + { + "epoch": 0.372, + "grad_norm": 1.3731900453567505, + "learning_rate": 0.0001852846975088968, + "loss": 1.9859, + "step": 837 + }, + { + "epoch": 0.37244444444444447, + "grad_norm": 1.437991976737976, + "learning_rate": 0.00018526690391459076, + "loss": 2.4142, + "step": 838 + }, + { + "epoch": 0.3728888888888889, + "grad_norm": 2.2342700958251953, + "learning_rate": 0.00018524911032028471, + "loss": 1.0456, + "step": 839 + }, + { + "epoch": 0.37333333333333335, + "grad_norm": 1.4727734327316284, + "learning_rate": 0.00018523131672597867, + "loss": 2.3314, + "step": 840 + }, + { + "epoch": 0.37377777777777776, + "grad_norm": 1.6986255645751953, + "learning_rate": 0.00018521352313167263, + "loss": 2.5555, + "step": 841 + }, + { + "epoch": 0.37422222222222223, + "grad_norm": 1.611127495765686, + "learning_rate": 0.00018519572953736656, + "loss": 2.5777, + "step": 842 + }, + { + "epoch": 0.37466666666666665, + "grad_norm": 1.5206453800201416, + "learning_rate": 0.0001851779359430605, + "loss": 2.8207, + "step": 843 + }, + { + "epoch": 0.3751111111111111, + "grad_norm": 1.5014015436172485, + "learning_rate": 0.00018516014234875444, + "loss": 2.2796, + "step": 844 + }, + { + "epoch": 0.37555555555555553, + "grad_norm": 1.9145801067352295, + "learning_rate": 0.0001851423487544484, + "loss": 2.432, + "step": 845 + }, + { + "epoch": 0.376, + "grad_norm": 1.7269951105117798, + "learning_rate": 0.00018512455516014236, + "loss": 2.8725, + "step": 846 + }, + { + "epoch": 0.37644444444444447, + "grad_norm": 2.094856023788452, + "learning_rate": 0.0001851067615658363, + "loss": 2.5228, + "step": 847 + }, + { + "epoch": 0.3768888888888889, + "grad_norm": 4.175536155700684, + "learning_rate": 0.00018508896797153027, + "loss": 1.4835, + "step": 848 + }, + { + "epoch": 0.37733333333333335, + "grad_norm": 2.4652979373931885, + "learning_rate": 0.0001850711743772242, + "loss": 2.4617, + "step": 849 + }, + { + "epoch": 0.37777777777777777, + "grad_norm": 2.9979593753814697, + "learning_rate": 0.00018505338078291815, + "loss": 2.2239, + "step": 850 + }, + { + "epoch": 0.37822222222222224, + "grad_norm": 2.925299644470215, + "learning_rate": 0.0001850355871886121, + "loss": 1.4753, + "step": 851 + }, + { + "epoch": 0.37866666666666665, + "grad_norm": 1.156390905380249, + "learning_rate": 0.00018501779359430607, + "loss": 2.5781, + "step": 852 + }, + { + "epoch": 0.3791111111111111, + "grad_norm": 1.1416361331939697, + "learning_rate": 0.00018500000000000002, + "loss": 2.5683, + "step": 853 + }, + { + "epoch": 0.37955555555555553, + "grad_norm": 1.1092318296432495, + "learning_rate": 0.00018498220640569398, + "loss": 2.6212, + "step": 854 + }, + { + "epoch": 0.38, + "grad_norm": 1.1286892890930176, + "learning_rate": 0.0001849644128113879, + "loss": 2.3329, + "step": 855 + }, + { + "epoch": 0.3804444444444444, + "grad_norm": 1.3141859769821167, + "learning_rate": 0.00018494661921708184, + "loss": 2.6131, + "step": 856 + }, + { + "epoch": 0.3808888888888889, + "grad_norm": 1.1940083503723145, + "learning_rate": 0.0001849288256227758, + "loss": 2.6077, + "step": 857 + }, + { + "epoch": 0.38133333333333336, + "grad_norm": 1.2545088529586792, + "learning_rate": 0.00018491103202846975, + "loss": 2.6905, + "step": 858 + }, + { + "epoch": 0.38177777777777777, + "grad_norm": 1.0562766790390015, + "learning_rate": 0.0001848932384341637, + "loss": 
2.1665, + "step": 859 + }, + { + "epoch": 0.38222222222222224, + "grad_norm": 1.2652605772018433, + "learning_rate": 0.00018487544483985767, + "loss": 2.6562, + "step": 860 + }, + { + "epoch": 0.38266666666666665, + "grad_norm": 1.213104486465454, + "learning_rate": 0.00018485765124555162, + "loss": 2.3212, + "step": 861 + }, + { + "epoch": 0.3831111111111111, + "grad_norm": 1.2592909336090088, + "learning_rate": 0.00018483985765124555, + "loss": 2.2561, + "step": 862 + }, + { + "epoch": 0.38355555555555554, + "grad_norm": 1.3437938690185547, + "learning_rate": 0.0001848220640569395, + "loss": 2.7051, + "step": 863 + }, + { + "epoch": 0.384, + "grad_norm": 1.2356623411178589, + "learning_rate": 0.00018480427046263346, + "loss": 2.1958, + "step": 864 + }, + { + "epoch": 0.3844444444444444, + "grad_norm": 1.6262998580932617, + "learning_rate": 0.00018478647686832742, + "loss": 3.2701, + "step": 865 + }, + { + "epoch": 0.3848888888888889, + "grad_norm": 1.336805820465088, + "learning_rate": 0.00018476868327402138, + "loss": 2.8542, + "step": 866 + }, + { + "epoch": 0.38533333333333336, + "grad_norm": 1.4716001749038696, + "learning_rate": 0.00018475088967971533, + "loss": 2.5937, + "step": 867 + }, + { + "epoch": 0.3857777777777778, + "grad_norm": 1.3492522239685059, + "learning_rate": 0.00018473309608540926, + "loss": 2.7743, + "step": 868 + }, + { + "epoch": 0.38622222222222224, + "grad_norm": 1.2297523021697998, + "learning_rate": 0.0001847153024911032, + "loss": 2.1033, + "step": 869 + }, + { + "epoch": 0.38666666666666666, + "grad_norm": 1.3531607389450073, + "learning_rate": 0.00018469750889679715, + "loss": 2.3351, + "step": 870 + }, + { + "epoch": 0.38711111111111113, + "grad_norm": 1.213259220123291, + "learning_rate": 0.0001846797153024911, + "loss": 2.422, + "step": 871 + }, + { + "epoch": 0.38755555555555554, + "grad_norm": 1.6566977500915527, + "learning_rate": 0.00018466192170818506, + "loss": 3.0473, + "step": 872 + }, + { + "epoch": 0.388, + "grad_norm": 1.389674425125122, + "learning_rate": 0.00018464412811387902, + "loss": 2.6186, + "step": 873 + }, + { + "epoch": 0.3884444444444444, + "grad_norm": 1.4744458198547363, + "learning_rate": 0.00018462633451957298, + "loss": 2.5226, + "step": 874 + }, + { + "epoch": 0.3888888888888889, + "grad_norm": 1.5133804082870483, + "learning_rate": 0.0001846085409252669, + "loss": 1.6426, + "step": 875 + }, + { + "epoch": 0.3893333333333333, + "grad_norm": 1.3070919513702393, + "learning_rate": 0.00018459074733096086, + "loss": 2.5463, + "step": 876 + }, + { + "epoch": 0.3897777777777778, + "grad_norm": 1.5536634922027588, + "learning_rate": 0.00018457295373665482, + "loss": 2.5429, + "step": 877 + }, + { + "epoch": 0.39022222222222225, + "grad_norm": 1.3954426050186157, + "learning_rate": 0.00018455516014234877, + "loss": 2.1144, + "step": 878 + }, + { + "epoch": 0.39066666666666666, + "grad_norm": 1.403937816619873, + "learning_rate": 0.00018453736654804273, + "loss": 2.4386, + "step": 879 + }, + { + "epoch": 0.39111111111111113, + "grad_norm": 1.692806601524353, + "learning_rate": 0.0001845195729537367, + "loss": 2.7535, + "step": 880 + }, + { + "epoch": 0.39155555555555555, + "grad_norm": 2.0360846519470215, + "learning_rate": 0.00018450177935943062, + "loss": 2.3359, + "step": 881 + }, + { + "epoch": 0.392, + "grad_norm": 1.7397620677947998, + "learning_rate": 0.00018448398576512455, + "loss": 2.8038, + "step": 882 + }, + { + "epoch": 0.39244444444444443, + "grad_norm": 1.9279683828353882, + "learning_rate": 0.0001844661921708185, + 
"loss": 2.6394, + "step": 883 + }, + { + "epoch": 0.3928888888888889, + "grad_norm": 1.6761490106582642, + "learning_rate": 0.00018444839857651246, + "loss": 2.6922, + "step": 884 + }, + { + "epoch": 0.3933333333333333, + "grad_norm": 1.7097992897033691, + "learning_rate": 0.00018443060498220642, + "loss": 2.8203, + "step": 885 + }, + { + "epoch": 0.3937777777777778, + "grad_norm": 1.796673059463501, + "learning_rate": 0.00018441281138790037, + "loss": 3.291, + "step": 886 + }, + { + "epoch": 0.3942222222222222, + "grad_norm": 1.646627426147461, + "learning_rate": 0.00018439501779359433, + "loss": 2.6629, + "step": 887 + }, + { + "epoch": 0.39466666666666667, + "grad_norm": 1.7203751802444458, + "learning_rate": 0.00018437722419928826, + "loss": 2.3987, + "step": 888 + }, + { + "epoch": 0.39511111111111114, + "grad_norm": 2.0489413738250732, + "learning_rate": 0.00018435943060498221, + "loss": 2.5578, + "step": 889 + }, + { + "epoch": 0.39555555555555555, + "grad_norm": 1.5938488245010376, + "learning_rate": 0.00018434163701067617, + "loss": 2.1986, + "step": 890 + }, + { + "epoch": 0.396, + "grad_norm": 1.604232907295227, + "learning_rate": 0.00018432384341637013, + "loss": 2.0916, + "step": 891 + }, + { + "epoch": 0.39644444444444443, + "grad_norm": 1.5554643869400024, + "learning_rate": 0.00018430604982206408, + "loss": 2.5466, + "step": 892 + }, + { + "epoch": 0.3968888888888889, + "grad_norm": 1.606425166130066, + "learning_rate": 0.000184288256227758, + "loss": 2.6946, + "step": 893 + }, + { + "epoch": 0.3973333333333333, + "grad_norm": 1.6642916202545166, + "learning_rate": 0.00018427046263345197, + "loss": 2.1499, + "step": 894 + }, + { + "epoch": 0.3977777777777778, + "grad_norm": 1.8029860258102417, + "learning_rate": 0.0001842526690391459, + "loss": 2.6755, + "step": 895 + }, + { + "epoch": 0.3982222222222222, + "grad_norm": 2.077056407928467, + "learning_rate": 0.00018423487544483986, + "loss": 2.7935, + "step": 896 + }, + { + "epoch": 0.39866666666666667, + "grad_norm": 1.7928773164749146, + "learning_rate": 0.0001842170818505338, + "loss": 2.8071, + "step": 897 + }, + { + "epoch": 0.39911111111111114, + "grad_norm": 2.6374940872192383, + "learning_rate": 0.00018419928825622777, + "loss": 3.2532, + "step": 898 + }, + { + "epoch": 0.39955555555555555, + "grad_norm": 2.1440799236297607, + "learning_rate": 0.00018418149466192173, + "loss": 1.447, + "step": 899 + }, + { + "epoch": 0.4, + "grad_norm": 1.737084984779358, + "learning_rate": 0.00018416370106761568, + "loss": 2.3376, + "step": 900 + }, + { + "epoch": 0.40044444444444444, + "grad_norm": 1.0363576412200928, + "learning_rate": 0.0001841459074733096, + "loss": 2.5708, + "step": 901 + }, + { + "epoch": 0.4008888888888889, + "grad_norm": 0.8888387084007263, + "learning_rate": 0.00018412811387900357, + "loss": 2.8373, + "step": 902 + }, + { + "epoch": 0.4013333333333333, + "grad_norm": 0.8826941847801208, + "learning_rate": 0.00018411032028469752, + "loss": 2.6704, + "step": 903 + }, + { + "epoch": 0.4017777777777778, + "grad_norm": 0.9509096741676331, + "learning_rate": 0.00018409252669039148, + "loss": 2.6441, + "step": 904 + }, + { + "epoch": 0.4022222222222222, + "grad_norm": 1.036983847618103, + "learning_rate": 0.00018407473309608544, + "loss": 2.3402, + "step": 905 + }, + { + "epoch": 0.4026666666666667, + "grad_norm": 0.9840334057807922, + "learning_rate": 0.00018405693950177937, + "loss": 2.2159, + "step": 906 + }, + { + "epoch": 0.4031111111111111, + "grad_norm": 1.118037223815918, + "learning_rate": 
0.0001840391459074733, + "loss": 2.4926, + "step": 907 + }, + { + "epoch": 0.40355555555555556, + "grad_norm": 1.1225322484970093, + "learning_rate": 0.00018402135231316725, + "loss": 2.6811, + "step": 908 + }, + { + "epoch": 0.404, + "grad_norm": 1.295507788658142, + "learning_rate": 0.0001840035587188612, + "loss": 2.8815, + "step": 909 + }, + { + "epoch": 0.40444444444444444, + "grad_norm": 1.1872442960739136, + "learning_rate": 0.00018398576512455517, + "loss": 3.0637, + "step": 910 + }, + { + "epoch": 0.4048888888888889, + "grad_norm": 1.2855168581008911, + "learning_rate": 0.00018396797153024912, + "loss": 2.4883, + "step": 911 + }, + { + "epoch": 0.4053333333333333, + "grad_norm": 1.232706904411316, + "learning_rate": 0.00018395017793594308, + "loss": 2.6065, + "step": 912 + }, + { + "epoch": 0.4057777777777778, + "grad_norm": 1.326191782951355, + "learning_rate": 0.000183932384341637, + "loss": 2.0947, + "step": 913 + }, + { + "epoch": 0.4062222222222222, + "grad_norm": 1.2210899591445923, + "learning_rate": 0.00018391459074733096, + "loss": 2.8879, + "step": 914 + }, + { + "epoch": 0.4066666666666667, + "grad_norm": 1.358302354812622, + "learning_rate": 0.00018389679715302492, + "loss": 2.6797, + "step": 915 + }, + { + "epoch": 0.4071111111111111, + "grad_norm": 1.2646130323410034, + "learning_rate": 0.00018387900355871888, + "loss": 2.2242, + "step": 916 + }, + { + "epoch": 0.40755555555555556, + "grad_norm": 1.2646642923355103, + "learning_rate": 0.00018386120996441283, + "loss": 2.5151, + "step": 917 + }, + { + "epoch": 0.408, + "grad_norm": 1.4230983257293701, + "learning_rate": 0.0001838434163701068, + "loss": 1.5127, + "step": 918 + }, + { + "epoch": 0.40844444444444444, + "grad_norm": 1.5309816598892212, + "learning_rate": 0.00018382562277580072, + "loss": 2.9285, + "step": 919 + }, + { + "epoch": 0.4088888888888889, + "grad_norm": 1.2716691493988037, + "learning_rate": 0.00018380782918149465, + "loss": 2.5965, + "step": 920 + }, + { + "epoch": 0.4093333333333333, + "grad_norm": 1.433159589767456, + "learning_rate": 0.0001837900355871886, + "loss": 2.5312, + "step": 921 + }, + { + "epoch": 0.4097777777777778, + "grad_norm": 1.3177905082702637, + "learning_rate": 0.00018377224199288256, + "loss": 2.5805, + "step": 922 + }, + { + "epoch": 0.4102222222222222, + "grad_norm": 1.6881523132324219, + "learning_rate": 0.00018375444839857652, + "loss": 2.5188, + "step": 923 + }, + { + "epoch": 0.4106666666666667, + "grad_norm": 1.5823473930358887, + "learning_rate": 0.00018373665480427047, + "loss": 2.6616, + "step": 924 + }, + { + "epoch": 0.4111111111111111, + "grad_norm": 1.2907118797302246, + "learning_rate": 0.00018371886120996443, + "loss": 2.7342, + "step": 925 + }, + { + "epoch": 0.41155555555555556, + "grad_norm": 1.569952368736267, + "learning_rate": 0.00018370106761565836, + "loss": 2.6745, + "step": 926 + }, + { + "epoch": 0.412, + "grad_norm": 1.2594976425170898, + "learning_rate": 0.00018368327402135232, + "loss": 2.1675, + "step": 927 + }, + { + "epoch": 0.41244444444444445, + "grad_norm": 1.449838638305664, + "learning_rate": 0.00018366548042704627, + "loss": 2.7803, + "step": 928 + }, + { + "epoch": 0.4128888888888889, + "grad_norm": 1.5406020879745483, + "learning_rate": 0.00018364768683274023, + "loss": 3.0986, + "step": 929 + }, + { + "epoch": 0.41333333333333333, + "grad_norm": 1.357981562614441, + "learning_rate": 0.0001836298932384342, + "loss": 2.2642, + "step": 930 + }, + { + "epoch": 0.4137777777777778, + "grad_norm": 1.4212137460708618, + "learning_rate": 
0.00018361209964412814, + "loss": 2.4661, + "step": 931 + }, + { + "epoch": 0.4142222222222222, + "grad_norm": 1.3381963968276978, + "learning_rate": 0.00018359430604982207, + "loss": 2.1015, + "step": 932 + }, + { + "epoch": 0.4146666666666667, + "grad_norm": 1.223344326019287, + "learning_rate": 0.000183576512455516, + "loss": 1.0327, + "step": 933 + }, + { + "epoch": 0.4151111111111111, + "grad_norm": 1.5084744691848755, + "learning_rate": 0.00018355871886120996, + "loss": 3.0321, + "step": 934 + }, + { + "epoch": 0.41555555555555557, + "grad_norm": 1.3545995950698853, + "learning_rate": 0.00018354092526690392, + "loss": 1.6716, + "step": 935 + }, + { + "epoch": 0.416, + "grad_norm": 1.775868535041809, + "learning_rate": 0.00018352313167259787, + "loss": 1.6868, + "step": 936 + }, + { + "epoch": 0.41644444444444445, + "grad_norm": 1.401804804801941, + "learning_rate": 0.00018350533807829183, + "loss": 2.2054, + "step": 937 + }, + { + "epoch": 0.41688888888888886, + "grad_norm": 1.4973540306091309, + "learning_rate": 0.00018348754448398578, + "loss": 2.396, + "step": 938 + }, + { + "epoch": 0.41733333333333333, + "grad_norm": 1.5535446405410767, + "learning_rate": 0.00018346975088967971, + "loss": 2.465, + "step": 939 + }, + { + "epoch": 0.4177777777777778, + "grad_norm": 1.7133632898330688, + "learning_rate": 0.00018345195729537367, + "loss": 2.8616, + "step": 940 + }, + { + "epoch": 0.4182222222222222, + "grad_norm": 1.5524804592132568, + "learning_rate": 0.00018343416370106763, + "loss": 2.1337, + "step": 941 + }, + { + "epoch": 0.4186666666666667, + "grad_norm": 1.4653924703598022, + "learning_rate": 0.00018341637010676158, + "loss": 2.3331, + "step": 942 + }, + { + "epoch": 0.4191111111111111, + "grad_norm": 1.5715734958648682, + "learning_rate": 0.00018339857651245554, + "loss": 2.1571, + "step": 943 + }, + { + "epoch": 0.41955555555555557, + "grad_norm": 1.7893381118774414, + "learning_rate": 0.0001833807829181495, + "loss": 2.7177, + "step": 944 + }, + { + "epoch": 0.42, + "grad_norm": 2.0075345039367676, + "learning_rate": 0.00018336298932384343, + "loss": 2.3181, + "step": 945 + }, + { + "epoch": 0.42044444444444445, + "grad_norm": 2.4787044525146484, + "learning_rate": 0.00018334519572953736, + "loss": 2.8261, + "step": 946 + }, + { + "epoch": 0.42088888888888887, + "grad_norm": 1.778351068496704, + "learning_rate": 0.0001833274021352313, + "loss": 2.7114, + "step": 947 + }, + { + "epoch": 0.42133333333333334, + "grad_norm": 2.3666553497314453, + "learning_rate": 0.00018330960854092527, + "loss": 2.6281, + "step": 948 + }, + { + "epoch": 0.42177777777777775, + "grad_norm": 2.386976718902588, + "learning_rate": 0.00018329181494661922, + "loss": 3.5857, + "step": 949 + }, + { + "epoch": 0.4222222222222222, + "grad_norm": 1.791477918624878, + "learning_rate": 0.00018327402135231318, + "loss": 2.5117, + "step": 950 + }, + { + "epoch": 0.4226666666666667, + "grad_norm": 1.2231998443603516, + "learning_rate": 0.00018325622775800714, + "loss": 2.0187, + "step": 951 + }, + { + "epoch": 0.4231111111111111, + "grad_norm": 1.0212533473968506, + "learning_rate": 0.00018323843416370107, + "loss": 2.8211, + "step": 952 + }, + { + "epoch": 0.4235555555555556, + "grad_norm": 1.1677687168121338, + "learning_rate": 0.00018322064056939502, + "loss": 2.5803, + "step": 953 + }, + { + "epoch": 0.424, + "grad_norm": 1.0932122468948364, + "learning_rate": 0.00018320284697508898, + "loss": 2.5059, + "step": 954 + }, + { + "epoch": 0.42444444444444446, + "grad_norm": 1.079312801361084, + 
"learning_rate": 0.00018318505338078294, + "loss": 2.4724, + "step": 955 + }, + { + "epoch": 0.42488888888888887, + "grad_norm": 1.324885368347168, + "learning_rate": 0.0001831672597864769, + "loss": 2.8136, + "step": 956 + }, + { + "epoch": 0.42533333333333334, + "grad_norm": 1.3072516918182373, + "learning_rate": 0.00018314946619217085, + "loss": 2.9349, + "step": 957 + }, + { + "epoch": 0.42577777777777776, + "grad_norm": 1.3189862966537476, + "learning_rate": 0.00018313167259786478, + "loss": 2.5912, + "step": 958 + }, + { + "epoch": 0.4262222222222222, + "grad_norm": 1.2304902076721191, + "learning_rate": 0.0001831138790035587, + "loss": 2.5022, + "step": 959 + }, + { + "epoch": 0.4266666666666667, + "grad_norm": 1.3272204399108887, + "learning_rate": 0.00018309608540925266, + "loss": 3.0605, + "step": 960 + }, + { + "epoch": 0.4271111111111111, + "grad_norm": 1.2381232976913452, + "learning_rate": 0.00018307829181494662, + "loss": 2.0443, + "step": 961 + }, + { + "epoch": 0.4275555555555556, + "grad_norm": 1.268486499786377, + "learning_rate": 0.00018306049822064058, + "loss": 2.2911, + "step": 962 + }, + { + "epoch": 0.428, + "grad_norm": 1.2793582677841187, + "learning_rate": 0.00018304270462633453, + "loss": 2.8313, + "step": 963 + }, + { + "epoch": 0.42844444444444446, + "grad_norm": 1.3932663202285767, + "learning_rate": 0.0001830249110320285, + "loss": 2.8657, + "step": 964 + }, + { + "epoch": 0.4288888888888889, + "grad_norm": 1.2110832929611206, + "learning_rate": 0.00018300711743772242, + "loss": 2.5214, + "step": 965 + }, + { + "epoch": 0.42933333333333334, + "grad_norm": 1.253836989402771, + "learning_rate": 0.00018298932384341638, + "loss": 2.5128, + "step": 966 + }, + { + "epoch": 0.42977777777777776, + "grad_norm": 1.33391273021698, + "learning_rate": 0.00018297153024911033, + "loss": 2.2437, + "step": 967 + }, + { + "epoch": 0.43022222222222223, + "grad_norm": 1.2081773281097412, + "learning_rate": 0.0001829537366548043, + "loss": 2.5682, + "step": 968 + }, + { + "epoch": 0.43066666666666664, + "grad_norm": 1.3490543365478516, + "learning_rate": 0.00018293594306049825, + "loss": 2.6874, + "step": 969 + }, + { + "epoch": 0.4311111111111111, + "grad_norm": 1.4848097562789917, + "learning_rate": 0.0001829181494661922, + "loss": 2.2042, + "step": 970 + }, + { + "epoch": 0.4315555555555556, + "grad_norm": 1.2465113401412964, + "learning_rate": 0.00018290035587188613, + "loss": 2.0922, + "step": 971 + }, + { + "epoch": 0.432, + "grad_norm": 1.3515832424163818, + "learning_rate": 0.00018288256227758006, + "loss": 2.5685, + "step": 972 + }, + { + "epoch": 0.43244444444444446, + "grad_norm": 1.4319607019424438, + "learning_rate": 0.00018286476868327402, + "loss": 2.0091, + "step": 973 + }, + { + "epoch": 0.4328888888888889, + "grad_norm": 1.680587887763977, + "learning_rate": 0.00018284697508896797, + "loss": 2.6249, + "step": 974 + }, + { + "epoch": 0.43333333333333335, + "grad_norm": 1.3431737422943115, + "learning_rate": 0.00018282918149466193, + "loss": 2.5173, + "step": 975 + }, + { + "epoch": 0.43377777777777776, + "grad_norm": 1.2416516542434692, + "learning_rate": 0.0001828113879003559, + "loss": 2.644, + "step": 976 + }, + { + "epoch": 0.43422222222222223, + "grad_norm": 1.2653284072875977, + "learning_rate": 0.00018279359430604984, + "loss": 2.4739, + "step": 977 + }, + { + "epoch": 0.43466666666666665, + "grad_norm": 1.5131683349609375, + "learning_rate": 0.00018277580071174377, + "loss": 2.8553, + "step": 978 + }, + { + "epoch": 0.4351111111111111, + "grad_norm": 
1.4946473836898804, + "learning_rate": 0.00018275800711743773, + "loss": 2.6073, + "step": 979 + }, + { + "epoch": 0.43555555555555553, + "grad_norm": 1.3179363012313843, + "learning_rate": 0.0001827402135231317, + "loss": 2.7802, + "step": 980 + }, + { + "epoch": 0.436, + "grad_norm": 1.5546060800552368, + "learning_rate": 0.00018272241992882564, + "loss": 2.9142, + "step": 981 + }, + { + "epoch": 0.43644444444444447, + "grad_norm": 1.3515474796295166, + "learning_rate": 0.0001827046263345196, + "loss": 2.4532, + "step": 982 + }, + { + "epoch": 0.4368888888888889, + "grad_norm": 1.3547914028167725, + "learning_rate": 0.00018268683274021356, + "loss": 2.483, + "step": 983 + }, + { + "epoch": 0.43733333333333335, + "grad_norm": 1.2733529806137085, + "learning_rate": 0.00018266903914590749, + "loss": 2.0126, + "step": 984 + }, + { + "epoch": 0.43777777777777777, + "grad_norm": 1.6882998943328857, + "learning_rate": 0.00018265124555160141, + "loss": 3.432, + "step": 985 + }, + { + "epoch": 0.43822222222222224, + "grad_norm": 1.3751314878463745, + "learning_rate": 0.00018263345195729537, + "loss": 2.4604, + "step": 986 + }, + { + "epoch": 0.43866666666666665, + "grad_norm": 1.412575364112854, + "learning_rate": 0.00018261565836298933, + "loss": 2.4426, + "step": 987 + }, + { + "epoch": 0.4391111111111111, + "grad_norm": 1.7669273614883423, + "learning_rate": 0.00018259786476868328, + "loss": 2.5918, + "step": 988 + }, + { + "epoch": 0.43955555555555553, + "grad_norm": 1.605697751045227, + "learning_rate": 0.00018258007117437724, + "loss": 2.7675, + "step": 989 + }, + { + "epoch": 0.44, + "grad_norm": 1.567189335823059, + "learning_rate": 0.0001825622775800712, + "loss": 2.9831, + "step": 990 + }, + { + "epoch": 0.44044444444444447, + "grad_norm": 1.3639848232269287, + "learning_rate": 0.00018254448398576513, + "loss": 1.9593, + "step": 991 + }, + { + "epoch": 0.4408888888888889, + "grad_norm": 1.586616039276123, + "learning_rate": 0.00018252669039145908, + "loss": 2.6094, + "step": 992 + }, + { + "epoch": 0.44133333333333336, + "grad_norm": 1.5296803712844849, + "learning_rate": 0.00018250889679715304, + "loss": 2.219, + "step": 993 + }, + { + "epoch": 0.44177777777777777, + "grad_norm": 1.6999601125717163, + "learning_rate": 0.000182491103202847, + "loss": 2.669, + "step": 994 + }, + { + "epoch": 0.44222222222222224, + "grad_norm": 2.0300962924957275, + "learning_rate": 0.00018247330960854095, + "loss": 2.6897, + "step": 995 + }, + { + "epoch": 0.44266666666666665, + "grad_norm": 1.7834362983703613, + "learning_rate": 0.00018245551601423488, + "loss": 2.3681, + "step": 996 + }, + { + "epoch": 0.4431111111111111, + "grad_norm": 1.7336286306381226, + "learning_rate": 0.0001824377224199288, + "loss": 2.5579, + "step": 997 + }, + { + "epoch": 0.44355555555555554, + "grad_norm": 1.8825653791427612, + "learning_rate": 0.00018241992882562277, + "loss": 2.8049, + "step": 998 + }, + { + "epoch": 0.444, + "grad_norm": 1.9167088270187378, + "learning_rate": 0.00018240213523131672, + "loss": 2.9734, + "step": 999 + }, + { + "epoch": 0.4444444444444444, + "grad_norm": 2.2173099517822266, + "learning_rate": 0.00018238434163701068, + "loss": 2.6596, + "step": 1000 + }, + { + "epoch": 0.4448888888888889, + "grad_norm": 0.9079247117042542, + "learning_rate": 0.00018236654804270464, + "loss": 2.6596, + "step": 1001 + }, + { + "epoch": 0.44533333333333336, + "grad_norm": 0.9305357336997986, + "learning_rate": 0.0001823487544483986, + "loss": 2.6803, + "step": 1002 + }, + { + "epoch": 0.4457777777777778, + 
"grad_norm": 1.031278133392334, + "learning_rate": 0.00018233096085409252, + "loss": 2.5942, + "step": 1003 + }, + { + "epoch": 0.44622222222222224, + "grad_norm": 1.5068715810775757, + "learning_rate": 0.00018231316725978648, + "loss": 1.6077, + "step": 1004 + }, + { + "epoch": 0.44666666666666666, + "grad_norm": 1.17013680934906, + "learning_rate": 0.00018229537366548044, + "loss": 2.4984, + "step": 1005 + }, + { + "epoch": 0.4471111111111111, + "grad_norm": 1.2330650091171265, + "learning_rate": 0.0001822775800711744, + "loss": 2.9625, + "step": 1006 + }, + { + "epoch": 0.44755555555555554, + "grad_norm": 1.1845786571502686, + "learning_rate": 0.00018225978647686835, + "loss": 2.8457, + "step": 1007 + }, + { + "epoch": 0.448, + "grad_norm": 1.144061803817749, + "learning_rate": 0.0001822419928825623, + "loss": 2.0844, + "step": 1008 + }, + { + "epoch": 0.4484444444444444, + "grad_norm": 1.383981466293335, + "learning_rate": 0.00018222419928825624, + "loss": 2.4893, + "step": 1009 + }, + { + "epoch": 0.4488888888888889, + "grad_norm": 1.125385046005249, + "learning_rate": 0.00018220640569395016, + "loss": 2.2932, + "step": 1010 + }, + { + "epoch": 0.4493333333333333, + "grad_norm": 1.2119320631027222, + "learning_rate": 0.00018218861209964412, + "loss": 2.4534, + "step": 1011 + }, + { + "epoch": 0.4497777777777778, + "grad_norm": 1.108948826789856, + "learning_rate": 0.00018217081850533808, + "loss": 2.3211, + "step": 1012 + }, + { + "epoch": 0.45022222222222225, + "grad_norm": 1.3019822835922241, + "learning_rate": 0.00018215302491103203, + "loss": 2.3836, + "step": 1013 + }, + { + "epoch": 0.45066666666666666, + "grad_norm": 1.3383592367172241, + "learning_rate": 0.000182135231316726, + "loss": 2.7794, + "step": 1014 + }, + { + "epoch": 0.45111111111111113, + "grad_norm": 1.2700541019439697, + "learning_rate": 0.00018211743772241995, + "loss": 2.488, + "step": 1015 + }, + { + "epoch": 0.45155555555555554, + "grad_norm": 1.280701756477356, + "learning_rate": 0.00018209964412811388, + "loss": 2.5005, + "step": 1016 + }, + { + "epoch": 0.452, + "grad_norm": 1.1454371213912964, + "learning_rate": 0.00018208185053380783, + "loss": 2.3253, + "step": 1017 + }, + { + "epoch": 0.4524444444444444, + "grad_norm": 1.242236614227295, + "learning_rate": 0.0001820640569395018, + "loss": 2.4167, + "step": 1018 + }, + { + "epoch": 0.4528888888888889, + "grad_norm": 1.2990704774856567, + "learning_rate": 0.00018204626334519575, + "loss": 2.1872, + "step": 1019 + }, + { + "epoch": 0.4533333333333333, + "grad_norm": 1.283494472503662, + "learning_rate": 0.0001820284697508897, + "loss": 2.3171, + "step": 1020 + }, + { + "epoch": 0.4537777777777778, + "grad_norm": 2.063596487045288, + "learning_rate": 0.00018201067615658366, + "loss": 1.681, + "step": 1021 + }, + { + "epoch": 0.45422222222222225, + "grad_norm": 1.3359391689300537, + "learning_rate": 0.0001819928825622776, + "loss": 2.0327, + "step": 1022 + }, + { + "epoch": 0.45466666666666666, + "grad_norm": 1.263917326927185, + "learning_rate": 0.00018197508896797152, + "loss": 2.239, + "step": 1023 + }, + { + "epoch": 0.45511111111111113, + "grad_norm": 1.8083239793777466, + "learning_rate": 0.00018195729537366547, + "loss": 2.7766, + "step": 1024 + }, + { + "epoch": 0.45555555555555555, + "grad_norm": 1.2244737148284912, + "learning_rate": 0.00018193950177935943, + "loss": 2.0234, + "step": 1025 + }, + { + "epoch": 0.456, + "grad_norm": 1.4717793464660645, + "learning_rate": 0.0001819217081850534, + "loss": 2.655, + "step": 1026 + }, + { + "epoch": 
0.45644444444444443, + "grad_norm": 1.5211389064788818, + "learning_rate": 0.00018190391459074734, + "loss": 2.335, + "step": 1027 + }, + { + "epoch": 0.4568888888888889, + "grad_norm": 1.3322489261627197, + "learning_rate": 0.0001818861209964413, + "loss": 2.4195, + "step": 1028 + }, + { + "epoch": 0.4573333333333333, + "grad_norm": 1.3629570007324219, + "learning_rate": 0.00018186832740213523, + "loss": 2.0856, + "step": 1029 + }, + { + "epoch": 0.4577777777777778, + "grad_norm": 1.7404910326004028, + "learning_rate": 0.00018185053380782919, + "loss": 2.4039, + "step": 1030 + }, + { + "epoch": 0.4582222222222222, + "grad_norm": 1.3935850858688354, + "learning_rate": 0.00018183274021352314, + "loss": 2.2005, + "step": 1031 + }, + { + "epoch": 0.45866666666666667, + "grad_norm": 1.3948017358779907, + "learning_rate": 0.0001818149466192171, + "loss": 2.7373, + "step": 1032 + }, + { + "epoch": 0.45911111111111114, + "grad_norm": 1.4649925231933594, + "learning_rate": 0.00018179715302491106, + "loss": 2.4669, + "step": 1033 + }, + { + "epoch": 0.45955555555555555, + "grad_norm": 1.5847502946853638, + "learning_rate": 0.000181779359430605, + "loss": 1.9865, + "step": 1034 + }, + { + "epoch": 0.46, + "grad_norm": 3.006155490875244, + "learning_rate": 0.00018176156583629894, + "loss": 0.4302, + "step": 1035 + }, + { + "epoch": 0.46044444444444443, + "grad_norm": 1.4123693704605103, + "learning_rate": 0.00018174377224199287, + "loss": 2.6673, + "step": 1036 + }, + { + "epoch": 0.4608888888888889, + "grad_norm": 1.4881420135498047, + "learning_rate": 0.00018172597864768683, + "loss": 2.4905, + "step": 1037 + }, + { + "epoch": 0.4613333333333333, + "grad_norm": 1.5692585706710815, + "learning_rate": 0.00018170818505338078, + "loss": 2.5459, + "step": 1038 + }, + { + "epoch": 0.4617777777777778, + "grad_norm": 1.4341565370559692, + "learning_rate": 0.00018169039145907474, + "loss": 2.3733, + "step": 1039 + }, + { + "epoch": 0.4622222222222222, + "grad_norm": 1.3901523351669312, + "learning_rate": 0.0001816725978647687, + "loss": 2.3915, + "step": 1040 + }, + { + "epoch": 0.46266666666666667, + "grad_norm": 1.5324454307556152, + "learning_rate": 0.00018165480427046265, + "loss": 2.3619, + "step": 1041 + }, + { + "epoch": 0.4631111111111111, + "grad_norm": 1.567613959312439, + "learning_rate": 0.00018163701067615658, + "loss": 1.7382, + "step": 1042 + }, + { + "epoch": 0.46355555555555555, + "grad_norm": 1.8476368188858032, + "learning_rate": 0.00018161921708185054, + "loss": 2.8568, + "step": 1043 + }, + { + "epoch": 0.464, + "grad_norm": 1.7850401401519775, + "learning_rate": 0.0001816014234875445, + "loss": 2.773, + "step": 1044 + }, + { + "epoch": 0.46444444444444444, + "grad_norm": 1.4900022745132446, + "learning_rate": 0.00018158362989323845, + "loss": 2.2433, + "step": 1045 + }, + { + "epoch": 0.4648888888888889, + "grad_norm": 1.9751386642456055, + "learning_rate": 0.0001815658362989324, + "loss": 2.9478, + "step": 1046 + }, + { + "epoch": 0.4653333333333333, + "grad_norm": 2.110889196395874, + "learning_rate": 0.00018154804270462637, + "loss": 3.1765, + "step": 1047 + }, + { + "epoch": 0.4657777777777778, + "grad_norm": 1.8263683319091797, + "learning_rate": 0.0001815302491103203, + "loss": 2.7491, + "step": 1048 + }, + { + "epoch": 0.4662222222222222, + "grad_norm": 1.875029444694519, + "learning_rate": 0.00018151245551601422, + "loss": 2.3277, + "step": 1049 + }, + { + "epoch": 0.4666666666666667, + "grad_norm": 2.229357957839966, + "learning_rate": 0.00018149466192170818, + "loss": 2.8278, 
+ "step": 1050 + }, + { + "epoch": 0.4671111111111111, + "grad_norm": 1.0930174589157104, + "learning_rate": 0.00018147686832740214, + "loss": 2.5997, + "step": 1051 + }, + { + "epoch": 0.46755555555555556, + "grad_norm": 1.0300097465515137, + "learning_rate": 0.0001814590747330961, + "loss": 2.647, + "step": 1052 + }, + { + "epoch": 0.468, + "grad_norm": 1.4961392879486084, + "learning_rate": 0.00018144128113879005, + "loss": 1.1492, + "step": 1053 + }, + { + "epoch": 0.46844444444444444, + "grad_norm": 1.1863456964492798, + "learning_rate": 0.000181423487544484, + "loss": 2.4534, + "step": 1054 + }, + { + "epoch": 0.4688888888888889, + "grad_norm": 1.156611442565918, + "learning_rate": 0.00018140569395017794, + "loss": 2.627, + "step": 1055 + }, + { + "epoch": 0.4693333333333333, + "grad_norm": 1.1836708784103394, + "learning_rate": 0.0001813879003558719, + "loss": 2.3464, + "step": 1056 + }, + { + "epoch": 0.4697777777777778, + "grad_norm": 1.2201026678085327, + "learning_rate": 0.00018137010676156585, + "loss": 2.8861, + "step": 1057 + }, + { + "epoch": 0.4702222222222222, + "grad_norm": 2.332244396209717, + "learning_rate": 0.0001813523131672598, + "loss": 1.634, + "step": 1058 + }, + { + "epoch": 0.4706666666666667, + "grad_norm": 1.337121605873108, + "learning_rate": 0.00018133451957295376, + "loss": 1.8932, + "step": 1059 + }, + { + "epoch": 0.4711111111111111, + "grad_norm": 1.2118984460830688, + "learning_rate": 0.00018131672597864772, + "loss": 2.2048, + "step": 1060 + }, + { + "epoch": 0.47155555555555556, + "grad_norm": 1.2091714143753052, + "learning_rate": 0.00018129893238434165, + "loss": 2.3127, + "step": 1061 + }, + { + "epoch": 0.472, + "grad_norm": 1.2021245956420898, + "learning_rate": 0.00018128113879003558, + "loss": 2.6434, + "step": 1062 + }, + { + "epoch": 0.47244444444444444, + "grad_norm": 1.436784267425537, + "learning_rate": 0.00018126334519572953, + "loss": 2.3693, + "step": 1063 + }, + { + "epoch": 0.4728888888888889, + "grad_norm": 2.271524429321289, + "learning_rate": 0.0001812455516014235, + "loss": 2.2956, + "step": 1064 + }, + { + "epoch": 0.47333333333333333, + "grad_norm": 1.162920355796814, + "learning_rate": 0.00018122775800711745, + "loss": 2.1083, + "step": 1065 + }, + { + "epoch": 0.4737777777777778, + "grad_norm": 1.3496499061584473, + "learning_rate": 0.0001812099644128114, + "loss": 2.7196, + "step": 1066 + }, + { + "epoch": 0.4742222222222222, + "grad_norm": 1.6458051204681396, + "learning_rate": 0.00018119217081850536, + "loss": 2.3444, + "step": 1067 + }, + { + "epoch": 0.4746666666666667, + "grad_norm": 1.4568052291870117, + "learning_rate": 0.0001811743772241993, + "loss": 2.3294, + "step": 1068 + }, + { + "epoch": 0.4751111111111111, + "grad_norm": 1.4898393154144287, + "learning_rate": 0.00018115658362989325, + "loss": 2.5576, + "step": 1069 + }, + { + "epoch": 0.47555555555555556, + "grad_norm": 1.3853182792663574, + "learning_rate": 0.0001811387900355872, + "loss": 2.4463, + "step": 1070 + }, + { + "epoch": 0.476, + "grad_norm": 1.521707534790039, + "learning_rate": 0.00018112099644128116, + "loss": 2.7504, + "step": 1071 + }, + { + "epoch": 0.47644444444444445, + "grad_norm": 1.8744828701019287, + "learning_rate": 0.00018110320284697512, + "loss": 2.4272, + "step": 1072 + }, + { + "epoch": 0.47688888888888886, + "grad_norm": 1.4953957796096802, + "learning_rate": 0.00018108540925266907, + "loss": 2.2033, + "step": 1073 + }, + { + "epoch": 0.47733333333333333, + "grad_norm": 1.463110327720642, + "learning_rate": 0.000181067615658363, 
+ "loss": 2.3522, + "step": 1074 + }, + { + "epoch": 0.4777777777777778, + "grad_norm": 1.3929156064987183, + "learning_rate": 0.00018104982206405693, + "loss": 2.5893, + "step": 1075 + }, + { + "epoch": 0.4782222222222222, + "grad_norm": 1.3469513654708862, + "learning_rate": 0.0001810320284697509, + "loss": 2.1896, + "step": 1076 + }, + { + "epoch": 0.4786666666666667, + "grad_norm": 1.3536866903305054, + "learning_rate": 0.00018101423487544484, + "loss": 2.8212, + "step": 1077 + }, + { + "epoch": 0.4791111111111111, + "grad_norm": 1.3798056840896606, + "learning_rate": 0.0001809964412811388, + "loss": 2.7518, + "step": 1078 + }, + { + "epoch": 0.47955555555555557, + "grad_norm": 1.553146243095398, + "learning_rate": 0.00018097864768683276, + "loss": 2.5848, + "step": 1079 + }, + { + "epoch": 0.48, + "grad_norm": 1.532638669013977, + "learning_rate": 0.0001809608540925267, + "loss": 2.3551, + "step": 1080 + }, + { + "epoch": 0.48044444444444445, + "grad_norm": 1.4585469961166382, + "learning_rate": 0.00018094306049822064, + "loss": 2.6695, + "step": 1081 + }, + { + "epoch": 0.48088888888888887, + "grad_norm": 1.6650795936584473, + "learning_rate": 0.0001809252669039146, + "loss": 2.7979, + "step": 1082 + }, + { + "epoch": 0.48133333333333334, + "grad_norm": 1.3776137828826904, + "learning_rate": 0.00018090747330960856, + "loss": 2.1564, + "step": 1083 + }, + { + "epoch": 0.4817777777777778, + "grad_norm": 1.5997897386550903, + "learning_rate": 0.0001808896797153025, + "loss": 2.4603, + "step": 1084 + }, + { + "epoch": 0.4822222222222222, + "grad_norm": 1.5599387884140015, + "learning_rate": 0.00018087188612099647, + "loss": 1.7888, + "step": 1085 + }, + { + "epoch": 0.4826666666666667, + "grad_norm": 1.5563712120056152, + "learning_rate": 0.0001808540925266904, + "loss": 2.438, + "step": 1086 + }, + { + "epoch": 0.4831111111111111, + "grad_norm": 1.3926085233688354, + "learning_rate": 0.00018083629893238433, + "loss": 2.5696, + "step": 1087 + }, + { + "epoch": 0.48355555555555557, + "grad_norm": 1.7727118730545044, + "learning_rate": 0.00018081850533807828, + "loss": 2.2867, + "step": 1088 + }, + { + "epoch": 0.484, + "grad_norm": 1.5934937000274658, + "learning_rate": 0.00018080071174377224, + "loss": 2.6703, + "step": 1089 + }, + { + "epoch": 0.48444444444444446, + "grad_norm": 1.789147973060608, + "learning_rate": 0.0001807829181494662, + "loss": 2.5851, + "step": 1090 + }, + { + "epoch": 0.48488888888888887, + "grad_norm": 1.4976032972335815, + "learning_rate": 0.00018076512455516015, + "loss": 2.1773, + "step": 1091 + }, + { + "epoch": 0.48533333333333334, + "grad_norm": 1.5114315748214722, + "learning_rate": 0.0001807473309608541, + "loss": 2.1908, + "step": 1092 + }, + { + "epoch": 0.48577777777777775, + "grad_norm": 1.5656942129135132, + "learning_rate": 0.00018072953736654804, + "loss": 2.5476, + "step": 1093 + }, + { + "epoch": 0.4862222222222222, + "grad_norm": 1.7908459901809692, + "learning_rate": 0.000180711743772242, + "loss": 2.6867, + "step": 1094 + }, + { + "epoch": 0.4866666666666667, + "grad_norm": 2.176135301589966, + "learning_rate": 0.00018069395017793595, + "loss": 2.0774, + "step": 1095 + }, + { + "epoch": 0.4871111111111111, + "grad_norm": 1.5971789360046387, + "learning_rate": 0.0001806761565836299, + "loss": 1.9446, + "step": 1096 + }, + { + "epoch": 0.4875555555555556, + "grad_norm": 1.709897518157959, + "learning_rate": 0.00018065836298932386, + "loss": 2.4818, + "step": 1097 + }, + { + "epoch": 0.488, + "grad_norm": 1.9650827646255493, + "learning_rate": 
0.00018064056939501782, + "loss": 2.5649, + "step": 1098 + }, + { + "epoch": 0.48844444444444446, + "grad_norm": 1.6556960344314575, + "learning_rate": 0.00018062277580071175, + "loss": 2.4576, + "step": 1099 + }, + { + "epoch": 0.4888888888888889, + "grad_norm": 2.5766797065734863, + "learning_rate": 0.00018060498220640568, + "loss": 2.1069, + "step": 1100 + }, + { + "epoch": 0.48933333333333334, + "grad_norm": 1.075654149055481, + "learning_rate": 0.00018058718861209964, + "loss": 2.5237, + "step": 1101 + }, + { + "epoch": 0.48977777777777776, + "grad_norm": 1.0496094226837158, + "learning_rate": 0.0001805693950177936, + "loss": 2.4905, + "step": 1102 + }, + { + "epoch": 0.4902222222222222, + "grad_norm": 0.9558612704277039, + "learning_rate": 0.00018055160142348755, + "loss": 2.9379, + "step": 1103 + }, + { + "epoch": 0.49066666666666664, + "grad_norm": 1.1676995754241943, + "learning_rate": 0.0001805338078291815, + "loss": 2.7487, + "step": 1104 + }, + { + "epoch": 0.4911111111111111, + "grad_norm": 1.0585354566574097, + "learning_rate": 0.00018051601423487546, + "loss": 2.6956, + "step": 1105 + }, + { + "epoch": 0.4915555555555556, + "grad_norm": 1.1087442636489868, + "learning_rate": 0.0001804982206405694, + "loss": 2.6808, + "step": 1106 + }, + { + "epoch": 0.492, + "grad_norm": 1.053286075592041, + "learning_rate": 0.00018048042704626335, + "loss": 2.2225, + "step": 1107 + }, + { + "epoch": 0.49244444444444446, + "grad_norm": 1.1056700944900513, + "learning_rate": 0.0001804626334519573, + "loss": 2.5699, + "step": 1108 + }, + { + "epoch": 0.4928888888888889, + "grad_norm": 1.198739767074585, + "learning_rate": 0.00018044483985765126, + "loss": 2.552, + "step": 1109 + }, + { + "epoch": 0.49333333333333335, + "grad_norm": 2.429079294204712, + "learning_rate": 0.00018042704626334522, + "loss": 1.7893, + "step": 1110 + }, + { + "epoch": 0.49377777777777776, + "grad_norm": 1.0832264423370361, + "learning_rate": 0.00018040925266903917, + "loss": 2.4136, + "step": 1111 + }, + { + "epoch": 0.49422222222222223, + "grad_norm": 1.0978045463562012, + "learning_rate": 0.0001803914590747331, + "loss": 2.1968, + "step": 1112 + }, + { + "epoch": 0.49466666666666664, + "grad_norm": 1.118681788444519, + "learning_rate": 0.00018037366548042703, + "loss": 2.3322, + "step": 1113 + }, + { + "epoch": 0.4951111111111111, + "grad_norm": 1.1858903169631958, + "learning_rate": 0.000180355871886121, + "loss": 2.5168, + "step": 1114 + }, + { + "epoch": 0.4955555555555556, + "grad_norm": 1.3438916206359863, + "learning_rate": 0.00018033807829181495, + "loss": 2.7303, + "step": 1115 + }, + { + "epoch": 0.496, + "grad_norm": 1.301822543144226, + "learning_rate": 0.0001803202846975089, + "loss": 2.824, + "step": 1116 + }, + { + "epoch": 0.49644444444444447, + "grad_norm": 1.2330950498580933, + "learning_rate": 0.00018030249110320286, + "loss": 3.0469, + "step": 1117 + }, + { + "epoch": 0.4968888888888889, + "grad_norm": 1.5200353860855103, + "learning_rate": 0.00018028469750889682, + "loss": 1.5884, + "step": 1118 + }, + { + "epoch": 0.49733333333333335, + "grad_norm": 1.2964918613433838, + "learning_rate": 0.00018026690391459075, + "loss": 2.4635, + "step": 1119 + }, + { + "epoch": 0.49777777777777776, + "grad_norm": 1.351252555847168, + "learning_rate": 0.0001802491103202847, + "loss": 2.1818, + "step": 1120 + }, + { + "epoch": 0.49822222222222223, + "grad_norm": 1.2766691446304321, + "learning_rate": 0.00018023131672597866, + "loss": 2.387, + "step": 1121 + }, + { + "epoch": 0.49866666666666665, + "grad_norm": 
1.4819822311401367, + "learning_rate": 0.00018021352313167261, + "loss": 2.0414, + "step": 1122 + }, + { + "epoch": 0.4991111111111111, + "grad_norm": 1.5996578931808472, + "learning_rate": 0.00018019572953736657, + "loss": 2.6798, + "step": 1123 + }, + { + "epoch": 0.49955555555555553, + "grad_norm": 1.4682111740112305, + "learning_rate": 0.00018017793594306053, + "loss": 2.6812, + "step": 1124 + }, + { + "epoch": 0.5, + "grad_norm": 1.392949104309082, + "learning_rate": 0.00018016014234875446, + "loss": 2.4617, + "step": 1125 + }, + { + "epoch": 0.5004444444444445, + "grad_norm": 1.4642528295516968, + "learning_rate": 0.0001801423487544484, + "loss": 2.6291, + "step": 1126 + }, + { + "epoch": 0.5008888888888889, + "grad_norm": 1.2145447731018066, + "learning_rate": 0.00018012455516014234, + "loss": 2.0178, + "step": 1127 + }, + { + "epoch": 0.5013333333333333, + "grad_norm": 1.6017488241195679, + "learning_rate": 0.0001801067615658363, + "loss": 2.5999, + "step": 1128 + }, + { + "epoch": 0.5017777777777778, + "grad_norm": 1.3489327430725098, + "learning_rate": 0.00018008896797153026, + "loss": 1.565, + "step": 1129 + }, + { + "epoch": 0.5022222222222222, + "grad_norm": 1.815772533416748, + "learning_rate": 0.0001800711743772242, + "loss": 2.4781, + "step": 1130 + }, + { + "epoch": 0.5026666666666667, + "grad_norm": 1.6084818840026855, + "learning_rate": 0.00018005338078291817, + "loss": 2.2115, + "step": 1131 + }, + { + "epoch": 0.5031111111111111, + "grad_norm": 1.483842372894287, + "learning_rate": 0.0001800355871886121, + "loss": 2.4087, + "step": 1132 + }, + { + "epoch": 0.5035555555555555, + "grad_norm": 1.555029273033142, + "learning_rate": 0.00018001779359430605, + "loss": 2.6442, + "step": 1133 + }, + { + "epoch": 0.504, + "grad_norm": 1.6016467809677124, + "learning_rate": 0.00018, + "loss": 2.3034, + "step": 1134 + }, + { + "epoch": 0.5044444444444445, + "grad_norm": 1.4886064529418945, + "learning_rate": 0.00017998220640569397, + "loss": 2.3137, + "step": 1135 + }, + { + "epoch": 0.5048888888888889, + "grad_norm": 1.590067744255066, + "learning_rate": 0.00017996441281138792, + "loss": 2.507, + "step": 1136 + }, + { + "epoch": 0.5053333333333333, + "grad_norm": 1.2926700115203857, + "learning_rate": 0.00017994661921708188, + "loss": 1.8661, + "step": 1137 + }, + { + "epoch": 0.5057777777777778, + "grad_norm": 1.8830050230026245, + "learning_rate": 0.0001799288256227758, + "loss": 2.7748, + "step": 1138 + }, + { + "epoch": 0.5062222222222222, + "grad_norm": 1.6669584512710571, + "learning_rate": 0.00017991103202846974, + "loss": 2.7596, + "step": 1139 + }, + { + "epoch": 0.5066666666666667, + "grad_norm": 1.7653512954711914, + "learning_rate": 0.0001798932384341637, + "loss": 3.2697, + "step": 1140 + }, + { + "epoch": 0.5071111111111111, + "grad_norm": 1.8505072593688965, + "learning_rate": 0.00017987544483985765, + "loss": 2.9115, + "step": 1141 + }, + { + "epoch": 0.5075555555555555, + "grad_norm": 1.5989995002746582, + "learning_rate": 0.0001798576512455516, + "loss": 2.0211, + "step": 1142 + }, + { + "epoch": 0.508, + "grad_norm": 1.929032802581787, + "learning_rate": 0.00017983985765124557, + "loss": 2.6159, + "step": 1143 + }, + { + "epoch": 0.5084444444444445, + "grad_norm": 1.9541597366333008, + "learning_rate": 0.00017982206405693952, + "loss": 2.6225, + "step": 1144 + }, + { + "epoch": 0.5088888888888888, + "grad_norm": 2.0774333477020264, + "learning_rate": 0.00017980427046263345, + "loss": 2.6007, + "step": 1145 + }, + { + "epoch": 0.5093333333333333, + "grad_norm": 
1.6789966821670532, + "learning_rate": 0.0001797864768683274, + "loss": 2.505, + "step": 1146 + }, + { + "epoch": 0.5097777777777778, + "grad_norm": 1.837697148323059, + "learning_rate": 0.00017976868327402136, + "loss": 3.0319, + "step": 1147 + }, + { + "epoch": 0.5102222222222222, + "grad_norm": 1.7084914445877075, + "learning_rate": 0.00017975088967971532, + "loss": 2.4174, + "step": 1148 + }, + { + "epoch": 0.5106666666666667, + "grad_norm": 2.1682441234588623, + "learning_rate": 0.00017973309608540928, + "loss": 1.1208, + "step": 1149 + }, + { + "epoch": 0.5111111111111111, + "grad_norm": 1.9390794038772583, + "learning_rate": 0.00017971530249110323, + "loss": 2.4768, + "step": 1150 + }, + { + "epoch": 0.5115555555555555, + "grad_norm": 0.9777438044548035, + "learning_rate": 0.00017969750889679716, + "loss": 3.1091, + "step": 1151 + }, + { + "epoch": 0.512, + "grad_norm": 0.9038203358650208, + "learning_rate": 0.0001796797153024911, + "loss": 2.5902, + "step": 1152 + }, + { + "epoch": 0.5124444444444445, + "grad_norm": 1.2871443033218384, + "learning_rate": 0.00017966192170818505, + "loss": 1.4081, + "step": 1153 + }, + { + "epoch": 0.5128888888888888, + "grad_norm": 1.109168529510498, + "learning_rate": 0.000179644128113879, + "loss": 2.5515, + "step": 1154 + }, + { + "epoch": 0.5133333333333333, + "grad_norm": 1.2267260551452637, + "learning_rate": 0.00017962633451957296, + "loss": 2.9597, + "step": 1155 + }, + { + "epoch": 0.5137777777777778, + "grad_norm": 1.304792881011963, + "learning_rate": 0.00017960854092526692, + "loss": 2.4844, + "step": 1156 + }, + { + "epoch": 0.5142222222222222, + "grad_norm": 1.1886632442474365, + "learning_rate": 0.00017959074733096088, + "loss": 2.5076, + "step": 1157 + }, + { + "epoch": 0.5146666666666667, + "grad_norm": 1.3380016088485718, + "learning_rate": 0.0001795729537366548, + "loss": 2.7098, + "step": 1158 + }, + { + "epoch": 0.5151111111111111, + "grad_norm": 2.1414008140563965, + "learning_rate": 0.00017955516014234876, + "loss": 1.6224, + "step": 1159 + }, + { + "epoch": 0.5155555555555555, + "grad_norm": 1.3274937868118286, + "learning_rate": 0.00017953736654804272, + "loss": 2.9532, + "step": 1160 + }, + { + "epoch": 0.516, + "grad_norm": 1.297349214553833, + "learning_rate": 0.00017951957295373667, + "loss": 2.4212, + "step": 1161 + }, + { + "epoch": 0.5164444444444445, + "grad_norm": 1.2180557250976562, + "learning_rate": 0.00017950177935943063, + "loss": 2.6664, + "step": 1162 + }, + { + "epoch": 0.5168888888888888, + "grad_norm": 1.4957740306854248, + "learning_rate": 0.0001794839857651246, + "loss": 2.5514, + "step": 1163 + }, + { + "epoch": 0.5173333333333333, + "grad_norm": 1.2369511127471924, + "learning_rate": 0.00017946619217081852, + "loss": 2.3431, + "step": 1164 + }, + { + "epoch": 0.5177777777777778, + "grad_norm": 1.2156001329421997, + "learning_rate": 0.00017944839857651245, + "loss": 2.5481, + "step": 1165 + }, + { + "epoch": 0.5182222222222223, + "grad_norm": 1.2086858749389648, + "learning_rate": 0.0001794306049822064, + "loss": 2.0433, + "step": 1166 + }, + { + "epoch": 0.5186666666666667, + "grad_norm": 1.3301823139190674, + "learning_rate": 0.00017941281138790036, + "loss": 2.3485, + "step": 1167 + }, + { + "epoch": 0.5191111111111111, + "grad_norm": 1.3325672149658203, + "learning_rate": 0.00017939501779359432, + "loss": 2.6565, + "step": 1168 + }, + { + "epoch": 0.5195555555555555, + "grad_norm": 1.1277254819869995, + "learning_rate": 0.00017937722419928827, + "loss": 2.1358, + "step": 1169 + }, + { + "epoch": 
0.52, + "grad_norm": 1.3675233125686646, + "learning_rate": 0.0001793594306049822, + "loss": 2.7481, + "step": 1170 + }, + { + "epoch": 0.5204444444444445, + "grad_norm": 1.3147132396697998, + "learning_rate": 0.00017934163701067616, + "loss": 2.4171, + "step": 1171 + }, + { + "epoch": 0.5208888888888888, + "grad_norm": 1.2295223474502563, + "learning_rate": 0.00017932384341637011, + "loss": 2.4101, + "step": 1172 + }, + { + "epoch": 0.5213333333333333, + "grad_norm": 1.762349009513855, + "learning_rate": 0.00017930604982206407, + "loss": 3.3649, + "step": 1173 + }, + { + "epoch": 0.5217777777777778, + "grad_norm": 1.4821921586990356, + "learning_rate": 0.00017928825622775803, + "loss": 2.7119, + "step": 1174 + }, + { + "epoch": 0.5222222222222223, + "grad_norm": 1.4601001739501953, + "learning_rate": 0.00017927046263345198, + "loss": 2.1603, + "step": 1175 + }, + { + "epoch": 0.5226666666666666, + "grad_norm": 1.397454857826233, + "learning_rate": 0.0001792526690391459, + "loss": 2.8709, + "step": 1176 + }, + { + "epoch": 0.5231111111111111, + "grad_norm": 1.5645291805267334, + "learning_rate": 0.00017923487544483984, + "loss": 2.264, + "step": 1177 + }, + { + "epoch": 0.5235555555555556, + "grad_norm": 1.3776110410690308, + "learning_rate": 0.0001792170818505338, + "loss": 2.5389, + "step": 1178 + }, + { + "epoch": 0.524, + "grad_norm": 1.2663601636886597, + "learning_rate": 0.00017919928825622776, + "loss": 2.2745, + "step": 1179 + }, + { + "epoch": 0.5244444444444445, + "grad_norm": 1.4239956140518188, + "learning_rate": 0.0001791814946619217, + "loss": 2.7701, + "step": 1180 + }, + { + "epoch": 0.5248888888888888, + "grad_norm": 1.3543609380722046, + "learning_rate": 0.00017916370106761567, + "loss": 2.1195, + "step": 1181 + }, + { + "epoch": 0.5253333333333333, + "grad_norm": 1.3027421236038208, + "learning_rate": 0.00017914590747330963, + "loss": 2.1416, + "step": 1182 + }, + { + "epoch": 0.5257777777777778, + "grad_norm": 1.191349744796753, + "learning_rate": 0.00017912811387900355, + "loss": 1.8744, + "step": 1183 + }, + { + "epoch": 0.5262222222222223, + "grad_norm": 2.0700368881225586, + "learning_rate": 0.0001791103202846975, + "loss": 1.3923, + "step": 1184 + }, + { + "epoch": 0.5266666666666666, + "grad_norm": 1.5801739692687988, + "learning_rate": 0.00017909252669039147, + "loss": 2.7669, + "step": 1185 + }, + { + "epoch": 0.5271111111111111, + "grad_norm": 1.549028992652893, + "learning_rate": 0.00017907473309608542, + "loss": 2.7438, + "step": 1186 + }, + { + "epoch": 0.5275555555555556, + "grad_norm": 1.8961384296417236, + "learning_rate": 0.00017905693950177938, + "loss": 2.4417, + "step": 1187 + }, + { + "epoch": 0.528, + "grad_norm": 1.741623044013977, + "learning_rate": 0.00017903914590747334, + "loss": 2.5803, + "step": 1188 + }, + { + "epoch": 0.5284444444444445, + "grad_norm": 1.3786072731018066, + "learning_rate": 0.00017902135231316727, + "loss": 2.0496, + "step": 1189 + }, + { + "epoch": 0.5288888888888889, + "grad_norm": 1.7181576490402222, + "learning_rate": 0.0001790035587188612, + "loss": 2.5333, + "step": 1190 + }, + { + "epoch": 0.5293333333333333, + "grad_norm": 1.6818015575408936, + "learning_rate": 0.00017898576512455515, + "loss": 1.9528, + "step": 1191 + }, + { + "epoch": 0.5297777777777778, + "grad_norm": 1.7420971393585205, + "learning_rate": 0.0001789679715302491, + "loss": 1.8997, + "step": 1192 + }, + { + "epoch": 0.5302222222222223, + "grad_norm": 1.5828181505203247, + "learning_rate": 0.00017895017793594307, + "loss": 2.5736, + "step": 1193 + 
}, + { + "epoch": 0.5306666666666666, + "grad_norm": 1.89609956741333, + "learning_rate": 0.00017893238434163702, + "loss": 1.9595, + "step": 1194 + }, + { + "epoch": 0.5311111111111111, + "grad_norm": 1.4787846803665161, + "learning_rate": 0.00017891459074733098, + "loss": 2.3573, + "step": 1195 + }, + { + "epoch": 0.5315555555555556, + "grad_norm": 1.931437373161316, + "learning_rate": 0.0001788967971530249, + "loss": 3.1484, + "step": 1196 + }, + { + "epoch": 0.532, + "grad_norm": 1.7234727144241333, + "learning_rate": 0.00017887900355871886, + "loss": 2.5473, + "step": 1197 + }, + { + "epoch": 0.5324444444444445, + "grad_norm": 2.0949268341064453, + "learning_rate": 0.00017886120996441282, + "loss": 2.9174, + "step": 1198 + }, + { + "epoch": 0.5328888888888889, + "grad_norm": 1.990982174873352, + "learning_rate": 0.00017884341637010678, + "loss": 2.9417, + "step": 1199 + }, + { + "epoch": 0.5333333333333333, + "grad_norm": 2.504199981689453, + "learning_rate": 0.00017882562277580073, + "loss": 2.3845, + "step": 1200 + }, + { + "epoch": 0.5337777777777778, + "grad_norm": 1.2785906791687012, + "learning_rate": 0.0001788078291814947, + "loss": 1.7756, + "step": 1201 + }, + { + "epoch": 0.5342222222222223, + "grad_norm": 1.2264808416366577, + "learning_rate": 0.00017879003558718862, + "loss": 2.578, + "step": 1202 + }, + { + "epoch": 0.5346666666666666, + "grad_norm": 1.7778968811035156, + "learning_rate": 0.00017877224199288255, + "loss": 0.1643, + "step": 1203 + }, + { + "epoch": 0.5351111111111111, + "grad_norm": 0.8537470102310181, + "learning_rate": 0.0001787544483985765, + "loss": 1.3101, + "step": 1204 + }, + { + "epoch": 0.5355555555555556, + "grad_norm": 1.3113861083984375, + "learning_rate": 0.00017873665480427046, + "loss": 3.1657, + "step": 1205 + }, + { + "epoch": 0.536, + "grad_norm": 1.2095390558242798, + "learning_rate": 0.00017871886120996442, + "loss": 2.4251, + "step": 1206 + }, + { + "epoch": 0.5364444444444444, + "grad_norm": 1.25002121925354, + "learning_rate": 0.00017870106761565837, + "loss": 2.5725, + "step": 1207 + }, + { + "epoch": 0.5368888888888889, + "grad_norm": 1.3035950660705566, + "learning_rate": 0.00017868327402135233, + "loss": 2.8178, + "step": 1208 + }, + { + "epoch": 0.5373333333333333, + "grad_norm": 1.295156717300415, + "learning_rate": 0.00017866548042704626, + "loss": 2.3121, + "step": 1209 + }, + { + "epoch": 0.5377777777777778, + "grad_norm": 1.2773317098617554, + "learning_rate": 0.00017864768683274022, + "loss": 2.637, + "step": 1210 + }, + { + "epoch": 0.5382222222222223, + "grad_norm": 1.2895100116729736, + "learning_rate": 0.00017862989323843417, + "loss": 2.5377, + "step": 1211 + }, + { + "epoch": 0.5386666666666666, + "grad_norm": 1.4635089635849, + "learning_rate": 0.00017861209964412813, + "loss": 2.134, + "step": 1212 + }, + { + "epoch": 0.5391111111111111, + "grad_norm": 1.3764005899429321, + "learning_rate": 0.0001785943060498221, + "loss": 2.4902, + "step": 1213 + }, + { + "epoch": 0.5395555555555556, + "grad_norm": 1.3150113821029663, + "learning_rate": 0.00017857651245551604, + "loss": 2.8578, + "step": 1214 + }, + { + "epoch": 0.54, + "grad_norm": 2.5147175788879395, + "learning_rate": 0.00017855871886120997, + "loss": 1.2944, + "step": 1215 + }, + { + "epoch": 0.5404444444444444, + "grad_norm": 1.1644126176834106, + "learning_rate": 0.0001785409252669039, + "loss": 2.5019, + "step": 1216 + }, + { + "epoch": 0.5408888888888889, + "grad_norm": 1.375962734222412, + "learning_rate": 0.00017852313167259786, + "loss": 2.5303, + 
"step": 1217 + }, + { + "epoch": 0.5413333333333333, + "grad_norm": 1.2268140316009521, + "learning_rate": 0.00017850533807829182, + "loss": 2.1568, + "step": 1218 + }, + { + "epoch": 0.5417777777777778, + "grad_norm": 1.3434302806854248, + "learning_rate": 0.00017848754448398577, + "loss": 2.1272, + "step": 1219 + }, + { + "epoch": 0.5422222222222223, + "grad_norm": 1.3689292669296265, + "learning_rate": 0.00017846975088967973, + "loss": 2.4022, + "step": 1220 + }, + { + "epoch": 0.5426666666666666, + "grad_norm": 1.6363227367401123, + "learning_rate": 0.00017845195729537368, + "loss": 2.8829, + "step": 1221 + }, + { + "epoch": 0.5431111111111111, + "grad_norm": 1.4127588272094727, + "learning_rate": 0.00017843416370106761, + "loss": 2.699, + "step": 1222 + }, + { + "epoch": 0.5435555555555556, + "grad_norm": 1.462015986442566, + "learning_rate": 0.00017841637010676157, + "loss": 2.8654, + "step": 1223 + }, + { + "epoch": 0.544, + "grad_norm": 1.2841753959655762, + "learning_rate": 0.00017839857651245553, + "loss": 2.2034, + "step": 1224 + }, + { + "epoch": 0.5444444444444444, + "grad_norm": 1.377759575843811, + "learning_rate": 0.00017838078291814948, + "loss": 2.5642, + "step": 1225 + }, + { + "epoch": 0.5448888888888889, + "grad_norm": 1.3649755716323853, + "learning_rate": 0.00017836298932384344, + "loss": 2.7037, + "step": 1226 + }, + { + "epoch": 0.5453333333333333, + "grad_norm": 1.399375319480896, + "learning_rate": 0.0001783451957295374, + "loss": 2.766, + "step": 1227 + }, + { + "epoch": 0.5457777777777778, + "grad_norm": 1.3284432888031006, + "learning_rate": 0.00017832740213523133, + "loss": 2.6953, + "step": 1228 + }, + { + "epoch": 0.5462222222222223, + "grad_norm": 1.5032292604446411, + "learning_rate": 0.00017830960854092526, + "loss": 2.4118, + "step": 1229 + }, + { + "epoch": 0.5466666666666666, + "grad_norm": 1.4157973527908325, + "learning_rate": 0.0001782918149466192, + "loss": 2.6983, + "step": 1230 + }, + { + "epoch": 0.5471111111111111, + "grad_norm": 1.2969857454299927, + "learning_rate": 0.00017827402135231317, + "loss": 2.1972, + "step": 1231 + }, + { + "epoch": 0.5475555555555556, + "grad_norm": 1.4679317474365234, + "learning_rate": 0.00017825622775800712, + "loss": 2.6372, + "step": 1232 + }, + { + "epoch": 0.548, + "grad_norm": 1.451851487159729, + "learning_rate": 0.00017823843416370108, + "loss": 2.5865, + "step": 1233 + }, + { + "epoch": 0.5484444444444444, + "grad_norm": 1.4248473644256592, + "learning_rate": 0.00017822064056939504, + "loss": 2.3113, + "step": 1234 + }, + { + "epoch": 0.5488888888888889, + "grad_norm": 1.5742985010147095, + "learning_rate": 0.00017820284697508897, + "loss": 2.6367, + "step": 1235 + }, + { + "epoch": 0.5493333333333333, + "grad_norm": 1.569652795791626, + "learning_rate": 0.00017818505338078292, + "loss": 1.9242, + "step": 1236 + }, + { + "epoch": 0.5497777777777778, + "grad_norm": 1.8335659503936768, + "learning_rate": 0.00017816725978647688, + "loss": 2.2898, + "step": 1237 + }, + { + "epoch": 0.5502222222222222, + "grad_norm": 1.419884204864502, + "learning_rate": 0.00017814946619217084, + "loss": 2.3061, + "step": 1238 + }, + { + "epoch": 0.5506666666666666, + "grad_norm": 1.5601950883865356, + "learning_rate": 0.0001781316725978648, + "loss": 2.5379, + "step": 1239 + }, + { + "epoch": 0.5511111111111111, + "grad_norm": 1.4974377155303955, + "learning_rate": 0.00017811387900355875, + "loss": 2.5647, + "step": 1240 + }, + { + "epoch": 0.5515555555555556, + "grad_norm": 1.5708105564117432, + "learning_rate": 
0.00017809608540925268, + "loss": 2.8586, + "step": 1241 + }, + { + "epoch": 0.552, + "grad_norm": 1.7998818159103394, + "learning_rate": 0.0001780782918149466, + "loss": 3.0485, + "step": 1242 + }, + { + "epoch": 0.5524444444444444, + "grad_norm": 1.7063149213790894, + "learning_rate": 0.00017806049822064056, + "loss": 2.6647, + "step": 1243 + }, + { + "epoch": 0.5528888888888889, + "grad_norm": 1.4614622592926025, + "learning_rate": 0.00017804270462633452, + "loss": 2.2354, + "step": 1244 + }, + { + "epoch": 0.5533333333333333, + "grad_norm": 1.6693414449691772, + "learning_rate": 0.00017802491103202848, + "loss": 2.1268, + "step": 1245 + }, + { + "epoch": 0.5537777777777778, + "grad_norm": 1.6453592777252197, + "learning_rate": 0.00017800711743772243, + "loss": 2.6987, + "step": 1246 + }, + { + "epoch": 0.5542222222222222, + "grad_norm": 1.7275609970092773, + "learning_rate": 0.0001779893238434164, + "loss": 2.4686, + "step": 1247 + }, + { + "epoch": 0.5546666666666666, + "grad_norm": 1.6857225894927979, + "learning_rate": 0.00017797153024911032, + "loss": 1.6319, + "step": 1248 + }, + { + "epoch": 0.5551111111111111, + "grad_norm": 1.822630524635315, + "learning_rate": 0.00017795373665480428, + "loss": 2.215, + "step": 1249 + }, + { + "epoch": 0.5555555555555556, + "grad_norm": 1.8363982439041138, + "learning_rate": 0.00017793594306049823, + "loss": 1.699, + "step": 1250 + }, + { + "epoch": 0.556, + "grad_norm": 1.0663748979568481, + "learning_rate": 0.0001779181494661922, + "loss": 2.6193, + "step": 1251 + }, + { + "epoch": 0.5564444444444444, + "grad_norm": 1.552035927772522, + "learning_rate": 0.00017790035587188615, + "loss": 1.0698, + "step": 1252 + }, + { + "epoch": 0.5568888888888889, + "grad_norm": 1.1600538492202759, + "learning_rate": 0.0001778825622775801, + "loss": 2.5013, + "step": 1253 + }, + { + "epoch": 0.5573333333333333, + "grad_norm": 1.0800617933273315, + "learning_rate": 0.00017786476868327403, + "loss": 2.5511, + "step": 1254 + }, + { + "epoch": 0.5577777777777778, + "grad_norm": 1.054673433303833, + "learning_rate": 0.00017784697508896796, + "loss": 2.3978, + "step": 1255 + }, + { + "epoch": 0.5582222222222222, + "grad_norm": 0.9816464185714722, + "learning_rate": 0.00017782918149466192, + "loss": 2.0309, + "step": 1256 + }, + { + "epoch": 0.5586666666666666, + "grad_norm": 1.2822006940841675, + "learning_rate": 0.00017781138790035587, + "loss": 2.4745, + "step": 1257 + }, + { + "epoch": 0.5591111111111111, + "grad_norm": 1.2752128839492798, + "learning_rate": 0.00017779359430604983, + "loss": 2.6353, + "step": 1258 + }, + { + "epoch": 0.5595555555555556, + "grad_norm": 1.1391791105270386, + "learning_rate": 0.0001777758007117438, + "loss": 2.6778, + "step": 1259 + }, + { + "epoch": 0.56, + "grad_norm": 1.1968642473220825, + "learning_rate": 0.00017775800711743772, + "loss": 2.9373, + "step": 1260 + }, + { + "epoch": 0.5604444444444444, + "grad_norm": 1.2563437223434448, + "learning_rate": 0.00017774021352313167, + "loss": 2.3472, + "step": 1261 + }, + { + "epoch": 0.5608888888888889, + "grad_norm": 1.2173583507537842, + "learning_rate": 0.00017772241992882563, + "loss": 1.8898, + "step": 1262 + }, + { + "epoch": 0.5613333333333334, + "grad_norm": 1.0731583833694458, + "learning_rate": 0.00017770462633451959, + "loss": 2.4451, + "step": 1263 + }, + { + "epoch": 0.5617777777777778, + "grad_norm": 1.0820194482803345, + "learning_rate": 0.00017768683274021354, + "loss": 2.7584, + "step": 1264 + }, + { + "epoch": 0.5622222222222222, + "grad_norm": 1.2503118515014648, 
+ "learning_rate": 0.0001776690391459075, + "loss": 2.5674, + "step": 1265 + }, + { + "epoch": 0.5626666666666666, + "grad_norm": 1.265372633934021, + "learning_rate": 0.00017765124555160143, + "loss": 2.3988, + "step": 1266 + }, + { + "epoch": 0.5631111111111111, + "grad_norm": 1.1392314434051514, + "learning_rate": 0.00017763345195729536, + "loss": 2.5185, + "step": 1267 + }, + { + "epoch": 0.5635555555555556, + "grad_norm": 1.2027145624160767, + "learning_rate": 0.00017761565836298931, + "loss": 2.5246, + "step": 1268 + }, + { + "epoch": 0.564, + "grad_norm": 2.07536244392395, + "learning_rate": 0.00017759786476868327, + "loss": 2.7272, + "step": 1269 + }, + { + "epoch": 0.5644444444444444, + "grad_norm": 1.5870450735092163, + "learning_rate": 0.00017758007117437723, + "loss": 3.0011, + "step": 1270 + }, + { + "epoch": 0.5648888888888889, + "grad_norm": 1.439990520477295, + "learning_rate": 0.00017756227758007118, + "loss": 2.0491, + "step": 1271 + }, + { + "epoch": 0.5653333333333334, + "grad_norm": 2.632349967956543, + "learning_rate": 0.00017754448398576514, + "loss": 2.4149, + "step": 1272 + }, + { + "epoch": 0.5657777777777778, + "grad_norm": 1.4456804990768433, + "learning_rate": 0.00017752669039145907, + "loss": 2.7718, + "step": 1273 + }, + { + "epoch": 0.5662222222222222, + "grad_norm": 1.3915135860443115, + "learning_rate": 0.00017750889679715303, + "loss": 3.1489, + "step": 1274 + }, + { + "epoch": 0.5666666666666667, + "grad_norm": 1.4057570695877075, + "learning_rate": 0.00017749110320284698, + "loss": 2.7257, + "step": 1275 + }, + { + "epoch": 0.5671111111111111, + "grad_norm": 1.498278260231018, + "learning_rate": 0.00017747330960854094, + "loss": 2.1074, + "step": 1276 + }, + { + "epoch": 0.5675555555555556, + "grad_norm": 1.3324486017227173, + "learning_rate": 0.0001774555160142349, + "loss": 2.5449, + "step": 1277 + }, + { + "epoch": 0.568, + "grad_norm": 1.2378911972045898, + "learning_rate": 0.00017743772241992885, + "loss": 1.8185, + "step": 1278 + }, + { + "epoch": 0.5684444444444444, + "grad_norm": 1.7115696668624878, + "learning_rate": 0.00017741992882562278, + "loss": 3.1832, + "step": 1279 + }, + { + "epoch": 0.5688888888888889, + "grad_norm": 1.381099820137024, + "learning_rate": 0.0001774021352313167, + "loss": 2.3531, + "step": 1280 + }, + { + "epoch": 0.5693333333333334, + "grad_norm": 1.3072692155838013, + "learning_rate": 0.00017738434163701067, + "loss": 2.4594, + "step": 1281 + }, + { + "epoch": 0.5697777777777778, + "grad_norm": 1.5001025199890137, + "learning_rate": 0.00017736654804270462, + "loss": 2.822, + "step": 1282 + }, + { + "epoch": 0.5702222222222222, + "grad_norm": 1.440004587173462, + "learning_rate": 0.00017734875444839858, + "loss": 2.6287, + "step": 1283 + }, + { + "epoch": 0.5706666666666667, + "grad_norm": 1.5406244993209839, + "learning_rate": 0.00017733096085409254, + "loss": 2.4, + "step": 1284 + }, + { + "epoch": 0.5711111111111111, + "grad_norm": 1.3207265138626099, + "learning_rate": 0.0001773131672597865, + "loss": 2.2423, + "step": 1285 + }, + { + "epoch": 0.5715555555555556, + "grad_norm": 1.3449149131774902, + "learning_rate": 0.00017729537366548042, + "loss": 2.1636, + "step": 1286 + }, + { + "epoch": 0.572, + "grad_norm": 1.5409855842590332, + "learning_rate": 0.00017727758007117438, + "loss": 2.5376, + "step": 1287 + }, + { + "epoch": 0.5724444444444444, + "grad_norm": 1.6577012538909912, + "learning_rate": 0.00017725978647686834, + "loss": 2.1492, + "step": 1288 + }, + { + "epoch": 0.5728888888888889, + "grad_norm": 
1.4876697063446045, + "learning_rate": 0.0001772419928825623, + "loss": 2.4557, + "step": 1289 + }, + { + "epoch": 0.5733333333333334, + "grad_norm": 1.600537657737732, + "learning_rate": 0.00017722419928825625, + "loss": 2.3974, + "step": 1290 + }, + { + "epoch": 0.5737777777777778, + "grad_norm": 1.856227159500122, + "learning_rate": 0.0001772064056939502, + "loss": 2.8138, + "step": 1291 + }, + { + "epoch": 0.5742222222222222, + "grad_norm": 1.80988609790802, + "learning_rate": 0.00017718861209964414, + "loss": 1.4204, + "step": 1292 + }, + { + "epoch": 0.5746666666666667, + "grad_norm": 1.5674011707305908, + "learning_rate": 0.00017717081850533806, + "loss": 2.2016, + "step": 1293 + }, + { + "epoch": 0.5751111111111111, + "grad_norm": 1.5148704051971436, + "learning_rate": 0.00017715302491103202, + "loss": 2.607, + "step": 1294 + }, + { + "epoch": 0.5755555555555556, + "grad_norm": 1.924994945526123, + "learning_rate": 0.00017713523131672598, + "loss": 2.69, + "step": 1295 + }, + { + "epoch": 0.576, + "grad_norm": 1.8337801694869995, + "learning_rate": 0.00017711743772241993, + "loss": 2.1814, + "step": 1296 + }, + { + "epoch": 0.5764444444444444, + "grad_norm": 1.7834872007369995, + "learning_rate": 0.0001770996441281139, + "loss": 2.7517, + "step": 1297 + }, + { + "epoch": 0.5768888888888889, + "grad_norm": 1.7494984865188599, + "learning_rate": 0.00017708185053380785, + "loss": 2.2146, + "step": 1298 + }, + { + "epoch": 0.5773333333333334, + "grad_norm": 1.8861663341522217, + "learning_rate": 0.00017706405693950178, + "loss": 2.5008, + "step": 1299 + }, + { + "epoch": 0.5777777777777777, + "grad_norm": 3.3163883686065674, + "learning_rate": 0.00017704626334519573, + "loss": 3.1244, + "step": 1300 + }, + { + "epoch": 0.5782222222222222, + "grad_norm": 1.1315196752548218, + "learning_rate": 0.0001770284697508897, + "loss": 2.7789, + "step": 1301 + }, + { + "epoch": 0.5786666666666667, + "grad_norm": 1.067335844039917, + "learning_rate": 0.00017701067615658365, + "loss": 3.2978, + "step": 1302 + }, + { + "epoch": 0.5791111111111111, + "grad_norm": 1.1568325757980347, + "learning_rate": 0.0001769928825622776, + "loss": 2.8684, + "step": 1303 + }, + { + "epoch": 0.5795555555555556, + "grad_norm": 1.2416324615478516, + "learning_rate": 0.00017697508896797156, + "loss": 3.2694, + "step": 1304 + }, + { + "epoch": 0.58, + "grad_norm": 1.24941086769104, + "learning_rate": 0.0001769572953736655, + "loss": 2.8852, + "step": 1305 + }, + { + "epoch": 0.5804444444444444, + "grad_norm": 1.0672186613082886, + "learning_rate": 0.00017693950177935942, + "loss": 2.4795, + "step": 1306 + }, + { + "epoch": 0.5808888888888889, + "grad_norm": 1.0531431436538696, + "learning_rate": 0.00017692170818505337, + "loss": 2.5932, + "step": 1307 + }, + { + "epoch": 0.5813333333333334, + "grad_norm": 1.2098814249038696, + "learning_rate": 0.00017690391459074733, + "loss": 2.3703, + "step": 1308 + }, + { + "epoch": 0.5817777777777777, + "grad_norm": 1.1954690217971802, + "learning_rate": 0.0001768861209964413, + "loss": 2.5722, + "step": 1309 + }, + { + "epoch": 0.5822222222222222, + "grad_norm": 1.2348884344100952, + "learning_rate": 0.00017686832740213524, + "loss": 2.2636, + "step": 1310 + }, + { + "epoch": 0.5826666666666667, + "grad_norm": 1.145476222038269, + "learning_rate": 0.0001768505338078292, + "loss": 3.0043, + "step": 1311 + }, + { + "epoch": 0.5831111111111111, + "grad_norm": 1.1092824935913086, + "learning_rate": 0.00017683274021352313, + "loss": 2.7379, + "step": 1312 + }, + { + "epoch": 
0.5835555555555556, + "grad_norm": 1.696060061454773, + "learning_rate": 0.00017681494661921709, + "loss": 1.8912, + "step": 1313 + }, + { + "epoch": 0.584, + "grad_norm": 1.3610656261444092, + "learning_rate": 0.00017679715302491104, + "loss": 2.5365, + "step": 1314 + }, + { + "epoch": 0.5844444444444444, + "grad_norm": 1.2558561563491821, + "learning_rate": 0.000176779359430605, + "loss": 2.5177, + "step": 1315 + }, + { + "epoch": 0.5848888888888889, + "grad_norm": 1.0652177333831787, + "learning_rate": 0.00017676156583629896, + "loss": 2.0853, + "step": 1316 + }, + { + "epoch": 0.5853333333333334, + "grad_norm": 1.3081934452056885, + "learning_rate": 0.0001767437722419929, + "loss": 2.6939, + "step": 1317 + }, + { + "epoch": 0.5857777777777777, + "grad_norm": 1.3675099611282349, + "learning_rate": 0.00017672597864768684, + "loss": 2.4198, + "step": 1318 + }, + { + "epoch": 0.5862222222222222, + "grad_norm": 1.2688225507736206, + "learning_rate": 0.00017670818505338077, + "loss": 2.472, + "step": 1319 + }, + { + "epoch": 0.5866666666666667, + "grad_norm": 1.7709518671035767, + "learning_rate": 0.00017669039145907473, + "loss": 1.2899, + "step": 1320 + }, + { + "epoch": 0.5871111111111111, + "grad_norm": 1.2735425233840942, + "learning_rate": 0.00017667259786476868, + "loss": 2.3649, + "step": 1321 + }, + { + "epoch": 0.5875555555555556, + "grad_norm": 1.4274276494979858, + "learning_rate": 0.00017665480427046264, + "loss": 3.0392, + "step": 1322 + }, + { + "epoch": 0.588, + "grad_norm": 1.1845803260803223, + "learning_rate": 0.0001766370106761566, + "loss": 2.2422, + "step": 1323 + }, + { + "epoch": 0.5884444444444444, + "grad_norm": 1.505797028541565, + "learning_rate": 0.00017661921708185055, + "loss": 2.6997, + "step": 1324 + }, + { + "epoch": 0.5888888888888889, + "grad_norm": 1.500378131866455, + "learning_rate": 0.00017660142348754448, + "loss": 2.4131, + "step": 1325 + }, + { + "epoch": 0.5893333333333334, + "grad_norm": 1.666031837463379, + "learning_rate": 0.00017658362989323844, + "loss": 2.4642, + "step": 1326 + }, + { + "epoch": 0.5897777777777777, + "grad_norm": 1.4224402904510498, + "learning_rate": 0.0001765658362989324, + "loss": 2.6931, + "step": 1327 + }, + { + "epoch": 0.5902222222222222, + "grad_norm": 1.5196523666381836, + "learning_rate": 0.00017654804270462635, + "loss": 2.846, + "step": 1328 + }, + { + "epoch": 0.5906666666666667, + "grad_norm": 1.455924391746521, + "learning_rate": 0.0001765302491103203, + "loss": 2.6315, + "step": 1329 + }, + { + "epoch": 0.5911111111111111, + "grad_norm": 2.582533597946167, + "learning_rate": 0.00017651245551601427, + "loss": 1.3039, + "step": 1330 + }, + { + "epoch": 0.5915555555555555, + "grad_norm": 1.7225983142852783, + "learning_rate": 0.0001764946619217082, + "loss": 2.5404, + "step": 1331 + }, + { + "epoch": 0.592, + "grad_norm": 1.50846529006958, + "learning_rate": 0.00017647686832740212, + "loss": 2.2202, + "step": 1332 + }, + { + "epoch": 0.5924444444444444, + "grad_norm": 1.578640103340149, + "learning_rate": 0.00017645907473309608, + "loss": 2.3533, + "step": 1333 + }, + { + "epoch": 0.5928888888888889, + "grad_norm": 1.3282861709594727, + "learning_rate": 0.00017644128113879004, + "loss": 2.2071, + "step": 1334 + }, + { + "epoch": 0.5933333333333334, + "grad_norm": 1.6370536088943481, + "learning_rate": 0.000176423487544484, + "loss": 2.9182, + "step": 1335 + }, + { + "epoch": 0.5937777777777777, + "grad_norm": 1.1675159931182861, + "learning_rate": 0.00017640569395017795, + "loss": 1.177, + "step": 1336 + }, + { + 
"epoch": 0.5942222222222222, + "grad_norm": 1.2795166969299316, + "learning_rate": 0.0001763879003558719, + "loss": 1.9403, + "step": 1337 + }, + { + "epoch": 0.5946666666666667, + "grad_norm": 1.825806975364685, + "learning_rate": 0.00017637010676156584, + "loss": 2.655, + "step": 1338 + }, + { + "epoch": 0.5951111111111111, + "grad_norm": 1.5679066181182861, + "learning_rate": 0.0001763523131672598, + "loss": 2.3021, + "step": 1339 + }, + { + "epoch": 0.5955555555555555, + "grad_norm": 1.534218430519104, + "learning_rate": 0.00017633451957295375, + "loss": 2.5934, + "step": 1340 + }, + { + "epoch": 0.596, + "grad_norm": 1.8854663372039795, + "learning_rate": 0.0001763167259786477, + "loss": 2.6493, + "step": 1341 + }, + { + "epoch": 0.5964444444444444, + "grad_norm": 1.5538815259933472, + "learning_rate": 0.00017629893238434166, + "loss": 2.6666, + "step": 1342 + }, + { + "epoch": 0.5968888888888889, + "grad_norm": 1.702937364578247, + "learning_rate": 0.00017628113879003562, + "loss": 2.6987, + "step": 1343 + }, + { + "epoch": 0.5973333333333334, + "grad_norm": 1.9548336267471313, + "learning_rate": 0.00017626334519572955, + "loss": 2.4912, + "step": 1344 + }, + { + "epoch": 0.5977777777777777, + "grad_norm": 1.5537859201431274, + "learning_rate": 0.00017624555160142348, + "loss": 2.2085, + "step": 1345 + }, + { + "epoch": 0.5982222222222222, + "grad_norm": 1.6140497922897339, + "learning_rate": 0.00017622775800711743, + "loss": 2.6939, + "step": 1346 + }, + { + "epoch": 0.5986666666666667, + "grad_norm": 2.120786666870117, + "learning_rate": 0.0001762099644128114, + "loss": 2.8833, + "step": 1347 + }, + { + "epoch": 0.5991111111111111, + "grad_norm": 1.8668750524520874, + "learning_rate": 0.00017619217081850535, + "loss": 2.412, + "step": 1348 + }, + { + "epoch": 0.5995555555555555, + "grad_norm": 1.6141252517700195, + "learning_rate": 0.0001761743772241993, + "loss": 2.3202, + "step": 1349 + }, + { + "epoch": 0.6, + "grad_norm": 2.0876715183258057, + "learning_rate": 0.00017615658362989323, + "loss": 1.9042, + "step": 1350 + }, + { + "epoch": 0.6004444444444444, + "grad_norm": 0.9330457448959351, + "learning_rate": 0.0001761387900355872, + "loss": 2.1474, + "step": 1351 + }, + { + "epoch": 0.6008888888888889, + "grad_norm": 0.9378175139427185, + "learning_rate": 0.00017612099644128115, + "loss": 2.6252, + "step": 1352 + }, + { + "epoch": 0.6013333333333334, + "grad_norm": 1.2432550191879272, + "learning_rate": 0.0001761032028469751, + "loss": 3.2561, + "step": 1353 + }, + { + "epoch": 0.6017777777777777, + "grad_norm": 1.2766064405441284, + "learning_rate": 0.00017608540925266906, + "loss": 2.0631, + "step": 1354 + }, + { + "epoch": 0.6022222222222222, + "grad_norm": 1.170651912689209, + "learning_rate": 0.00017606761565836301, + "loss": 3.0557, + "step": 1355 + }, + { + "epoch": 0.6026666666666667, + "grad_norm": 1.724684476852417, + "learning_rate": 0.00017604982206405694, + "loss": 1.6797, + "step": 1356 + }, + { + "epoch": 0.6031111111111112, + "grad_norm": 0.9961637258529663, + "learning_rate": 0.00017603202846975087, + "loss": 2.8432, + "step": 1357 + }, + { + "epoch": 0.6035555555555555, + "grad_norm": 1.1883872747421265, + "learning_rate": 0.00017601423487544483, + "loss": 3.071, + "step": 1358 + }, + { + "epoch": 0.604, + "grad_norm": 1.0519766807556152, + "learning_rate": 0.0001759964412811388, + "loss": 2.1003, + "step": 1359 + }, + { + "epoch": 0.6044444444444445, + "grad_norm": 1.0907334089279175, + "learning_rate": 0.00017597864768683274, + "loss": 2.6082, + "step": 1360 
+ }, + { + "epoch": 0.6048888888888889, + "grad_norm": 1.1930400133132935, + "learning_rate": 0.0001759608540925267, + "loss": 2.4901, + "step": 1361 + }, + { + "epoch": 0.6053333333333333, + "grad_norm": 1.688298225402832, + "learning_rate": 0.00017594306049822066, + "loss": 1.7889, + "step": 1362 + }, + { + "epoch": 0.6057777777777777, + "grad_norm": 1.192671537399292, + "learning_rate": 0.00017592526690391459, + "loss": 1.313, + "step": 1363 + }, + { + "epoch": 0.6062222222222222, + "grad_norm": 1.3298887014389038, + "learning_rate": 0.00017590747330960854, + "loss": 3.2103, + "step": 1364 + }, + { + "epoch": 0.6066666666666667, + "grad_norm": 1.1152617931365967, + "learning_rate": 0.0001758896797153025, + "loss": 2.4196, + "step": 1365 + }, + { + "epoch": 0.6071111111111112, + "grad_norm": 1.2884933948516846, + "learning_rate": 0.00017587188612099646, + "loss": 2.6931, + "step": 1366 + }, + { + "epoch": 0.6075555555555555, + "grad_norm": 1.1782528162002563, + "learning_rate": 0.0001758540925266904, + "loss": 2.0241, + "step": 1367 + }, + { + "epoch": 0.608, + "grad_norm": 1.228033423423767, + "learning_rate": 0.00017583629893238437, + "loss": 2.1323, + "step": 1368 + }, + { + "epoch": 0.6084444444444445, + "grad_norm": 1.3437577486038208, + "learning_rate": 0.0001758185053380783, + "loss": 1.3953, + "step": 1369 + }, + { + "epoch": 0.6088888888888889, + "grad_norm": 1.3058631420135498, + "learning_rate": 0.00017580071174377223, + "loss": 2.6964, + "step": 1370 + }, + { + "epoch": 0.6093333333333333, + "grad_norm": 1.219984531402588, + "learning_rate": 0.00017578291814946618, + "loss": 2.2271, + "step": 1371 + }, + { + "epoch": 0.6097777777777778, + "grad_norm": 1.363558292388916, + "learning_rate": 0.00017576512455516014, + "loss": 2.5532, + "step": 1372 + }, + { + "epoch": 0.6102222222222222, + "grad_norm": 1.8681366443634033, + "learning_rate": 0.0001757473309608541, + "loss": 1.3838, + "step": 1373 + }, + { + "epoch": 0.6106666666666667, + "grad_norm": 1.3558433055877686, + "learning_rate": 0.00017572953736654805, + "loss": 2.6416, + "step": 1374 + }, + { + "epoch": 0.6111111111111112, + "grad_norm": 1.3465511798858643, + "learning_rate": 0.000175711743772242, + "loss": 2.3744, + "step": 1375 + }, + { + "epoch": 0.6115555555555555, + "grad_norm": 1.6007107496261597, + "learning_rate": 0.00017569395017793594, + "loss": 2.3307, + "step": 1376 + }, + { + "epoch": 0.612, + "grad_norm": 1.602541208267212, + "learning_rate": 0.0001756761565836299, + "loss": 2.5173, + "step": 1377 + }, + { + "epoch": 0.6124444444444445, + "grad_norm": 1.6473790407180786, + "learning_rate": 0.00017565836298932385, + "loss": 2.6707, + "step": 1378 + }, + { + "epoch": 0.6128888888888889, + "grad_norm": 1.376232385635376, + "learning_rate": 0.0001756405693950178, + "loss": 1.9534, + "step": 1379 + }, + { + "epoch": 0.6133333333333333, + "grad_norm": 1.420652151107788, + "learning_rate": 0.00017562277580071176, + "loss": 2.5401, + "step": 1380 + }, + { + "epoch": 0.6137777777777778, + "grad_norm": 1.1767398118972778, + "learning_rate": 0.00017560498220640572, + "loss": 1.1765, + "step": 1381 + }, + { + "epoch": 0.6142222222222222, + "grad_norm": 1.3465992212295532, + "learning_rate": 0.00017558718861209965, + "loss": 2.0177, + "step": 1382 + }, + { + "epoch": 0.6146666666666667, + "grad_norm": 1.6473318338394165, + "learning_rate": 0.00017556939501779358, + "loss": 2.2605, + "step": 1383 + }, + { + "epoch": 0.6151111111111112, + "grad_norm": 1.4791382551193237, + "learning_rate": 0.00017555160142348754, + 
"loss": 2.494, + "step": 1384 + }, + { + "epoch": 0.6155555555555555, + "grad_norm": 1.4145492315292358, + "learning_rate": 0.0001755338078291815, + "loss": 2.1339, + "step": 1385 + }, + { + "epoch": 0.616, + "grad_norm": 1.6023871898651123, + "learning_rate": 0.00017551601423487545, + "loss": 2.8089, + "step": 1386 + }, + { + "epoch": 0.6164444444444445, + "grad_norm": 1.548842191696167, + "learning_rate": 0.0001754982206405694, + "loss": 2.2959, + "step": 1387 + }, + { + "epoch": 0.6168888888888889, + "grad_norm": 1.6211026906967163, + "learning_rate": 0.00017548042704626336, + "loss": 2.6256, + "step": 1388 + }, + { + "epoch": 0.6173333333333333, + "grad_norm": 1.508934497833252, + "learning_rate": 0.0001754626334519573, + "loss": 2.4794, + "step": 1389 + }, + { + "epoch": 0.6177777777777778, + "grad_norm": 1.704952359199524, + "learning_rate": 0.00017544483985765125, + "loss": 2.6322, + "step": 1390 + }, + { + "epoch": 0.6182222222222222, + "grad_norm": 1.4746431112289429, + "learning_rate": 0.0001754270462633452, + "loss": 2.2247, + "step": 1391 + }, + { + "epoch": 0.6186666666666667, + "grad_norm": 1.5690298080444336, + "learning_rate": 0.00017540925266903916, + "loss": 2.1532, + "step": 1392 + }, + { + "epoch": 0.6191111111111111, + "grad_norm": 1.7075462341308594, + "learning_rate": 0.00017539145907473312, + "loss": 2.864, + "step": 1393 + }, + { + "epoch": 0.6195555555555555, + "grad_norm": 1.909990906715393, + "learning_rate": 0.00017537366548042707, + "loss": 2.8461, + "step": 1394 + }, + { + "epoch": 0.62, + "grad_norm": 1.7880994081497192, + "learning_rate": 0.000175355871886121, + "loss": 2.6034, + "step": 1395 + }, + { + "epoch": 0.6204444444444445, + "grad_norm": 1.8660807609558105, + "learning_rate": 0.00017533807829181493, + "loss": 2.372, + "step": 1396 + }, + { + "epoch": 0.6208888888888889, + "grad_norm": 1.9713836908340454, + "learning_rate": 0.0001753202846975089, + "loss": 2.4238, + "step": 1397 + }, + { + "epoch": 0.6213333333333333, + "grad_norm": 2.378556966781616, + "learning_rate": 0.00017530249110320285, + "loss": 2.9749, + "step": 1398 + }, + { + "epoch": 0.6217777777777778, + "grad_norm": 3.1581037044525146, + "learning_rate": 0.0001752846975088968, + "loss": 0.1631, + "step": 1399 + }, + { + "epoch": 0.6222222222222222, + "grad_norm": 2.725257396697998, + "learning_rate": 0.00017526690391459076, + "loss": 2.3153, + "step": 1400 + }, + { + "epoch": 0.6226666666666667, + "grad_norm": 1.687172770500183, + "learning_rate": 0.00017524911032028472, + "loss": 1.8111, + "step": 1401 + }, + { + "epoch": 0.6231111111111111, + "grad_norm": 1.2486687898635864, + "learning_rate": 0.00017523131672597865, + "loss": 1.6803, + "step": 1402 + }, + { + "epoch": 0.6235555555555555, + "grad_norm": 0.9351308345794678, + "learning_rate": 0.0001752135231316726, + "loss": 2.5876, + "step": 1403 + }, + { + "epoch": 0.624, + "grad_norm": 1.1229432821273804, + "learning_rate": 0.00017519572953736656, + "loss": 2.5936, + "step": 1404 + }, + { + "epoch": 0.6244444444444445, + "grad_norm": 0.9976657629013062, + "learning_rate": 0.00017517793594306051, + "loss": 2.7395, + "step": 1405 + }, + { + "epoch": 0.6248888888888889, + "grad_norm": 0.9780849814414978, + "learning_rate": 0.00017516014234875447, + "loss": 2.1686, + "step": 1406 + }, + { + "epoch": 0.6253333333333333, + "grad_norm": 1.0690516233444214, + "learning_rate": 0.00017514234875444843, + "loss": 2.2365, + "step": 1407 + }, + { + "epoch": 0.6257777777777778, + "grad_norm": 1.1127312183380127, + "learning_rate": 
0.00017512455516014236, + "loss": 2.5104, + "step": 1408 + }, + { + "epoch": 0.6262222222222222, + "grad_norm": 1.1946388483047485, + "learning_rate": 0.00017510676156583629, + "loss": 2.7337, + "step": 1409 + }, + { + "epoch": 0.6266666666666667, + "grad_norm": 1.0641489028930664, + "learning_rate": 0.00017508896797153024, + "loss": 2.5558, + "step": 1410 + }, + { + "epoch": 0.6271111111111111, + "grad_norm": 1.127685546875, + "learning_rate": 0.0001750711743772242, + "loss": 2.517, + "step": 1411 + }, + { + "epoch": 0.6275555555555555, + "grad_norm": 1.2236816883087158, + "learning_rate": 0.00017505338078291816, + "loss": 2.6978, + "step": 1412 + }, + { + "epoch": 0.628, + "grad_norm": 1.4464499950408936, + "learning_rate": 0.0001750355871886121, + "loss": 2.6535, + "step": 1413 + }, + { + "epoch": 0.6284444444444445, + "grad_norm": 1.4766771793365479, + "learning_rate": 0.00017501779359430607, + "loss": 3.3998, + "step": 1414 + }, + { + "epoch": 0.6288888888888889, + "grad_norm": 1.2874871492385864, + "learning_rate": 0.000175, + "loss": 2.3482, + "step": 1415 + }, + { + "epoch": 0.6293333333333333, + "grad_norm": 1.3195867538452148, + "learning_rate": 0.00017498220640569395, + "loss": 2.7828, + "step": 1416 + }, + { + "epoch": 0.6297777777777778, + "grad_norm": 1.2177435159683228, + "learning_rate": 0.0001749644128113879, + "loss": 2.3438, + "step": 1417 + }, + { + "epoch": 0.6302222222222222, + "grad_norm": 1.236802577972412, + "learning_rate": 0.00017494661921708187, + "loss": 2.6197, + "step": 1418 + }, + { + "epoch": 0.6306666666666667, + "grad_norm": 1.363099217414856, + "learning_rate": 0.00017492882562277582, + "loss": 2.6425, + "step": 1419 + }, + { + "epoch": 0.6311111111111111, + "grad_norm": 2.4340715408325195, + "learning_rate": 0.00017491103202846978, + "loss": 1.5395, + "step": 1420 + }, + { + "epoch": 0.6315555555555555, + "grad_norm": 1.425446629524231, + "learning_rate": 0.0001748932384341637, + "loss": 2.6681, + "step": 1421 + }, + { + "epoch": 0.632, + "grad_norm": 1.3142110109329224, + "learning_rate": 0.00017487544483985764, + "loss": 2.53, + "step": 1422 + }, + { + "epoch": 0.6324444444444445, + "grad_norm": 1.947712779045105, + "learning_rate": 0.0001748576512455516, + "loss": 2.8098, + "step": 1423 + }, + { + "epoch": 0.6328888888888888, + "grad_norm": 1.202399730682373, + "learning_rate": 0.00017483985765124555, + "loss": 1.8831, + "step": 1424 + }, + { + "epoch": 0.6333333333333333, + "grad_norm": 1.3699499368667603, + "learning_rate": 0.0001748220640569395, + "loss": 2.9502, + "step": 1425 + }, + { + "epoch": 0.6337777777777778, + "grad_norm": 1.4097647666931152, + "learning_rate": 0.00017480427046263347, + "loss": 2.8757, + "step": 1426 + }, + { + "epoch": 0.6342222222222222, + "grad_norm": 1.3218767642974854, + "learning_rate": 0.00017478647686832742, + "loss": 2.299, + "step": 1427 + }, + { + "epoch": 0.6346666666666667, + "grad_norm": 1.4610822200775146, + "learning_rate": 0.00017476868327402135, + "loss": 2.5177, + "step": 1428 + }, + { + "epoch": 0.6351111111111111, + "grad_norm": 1.65794038772583, + "learning_rate": 0.0001747508896797153, + "loss": 2.8784, + "step": 1429 + }, + { + "epoch": 0.6355555555555555, + "grad_norm": 1.5559744834899902, + "learning_rate": 0.00017473309608540926, + "loss": 2.3513, + "step": 1430 + }, + { + "epoch": 0.636, + "grad_norm": 1.8620177507400513, + "learning_rate": 0.00017471530249110322, + "loss": 0.9563, + "step": 1431 + }, + { + "epoch": 0.6364444444444445, + "grad_norm": 1.3389177322387695, + "learning_rate": 
0.00017469750889679718, + "loss": 1.5891, + "step": 1432 + }, + { + "epoch": 0.6368888888888888, + "grad_norm": 1.3747659921646118, + "learning_rate": 0.00017467971530249113, + "loss": 1.4177, + "step": 1433 + }, + { + "epoch": 0.6373333333333333, + "grad_norm": 1.6347014904022217, + "learning_rate": 0.00017466192170818506, + "loss": 2.359, + "step": 1434 + }, + { + "epoch": 0.6377777777777778, + "grad_norm": 1.6315451860427856, + "learning_rate": 0.000174644128113879, + "loss": 3.0319, + "step": 1435 + }, + { + "epoch": 0.6382222222222222, + "grad_norm": 1.5996496677398682, + "learning_rate": 0.00017462633451957295, + "loss": 2.3993, + "step": 1436 + }, + { + "epoch": 0.6386666666666667, + "grad_norm": 1.7360764741897583, + "learning_rate": 0.0001746085409252669, + "loss": 2.3704, + "step": 1437 + }, + { + "epoch": 0.6391111111111111, + "grad_norm": 1.6707364320755005, + "learning_rate": 0.00017459074733096086, + "loss": 2.4405, + "step": 1438 + }, + { + "epoch": 0.6395555555555555, + "grad_norm": 1.5770982503890991, + "learning_rate": 0.00017457295373665482, + "loss": 2.4132, + "step": 1439 + }, + { + "epoch": 0.64, + "grad_norm": 1.9009085893630981, + "learning_rate": 0.00017455516014234875, + "loss": 2.6834, + "step": 1440 + }, + { + "epoch": 0.6404444444444445, + "grad_norm": 1.7474616765975952, + "learning_rate": 0.0001745373665480427, + "loss": 2.341, + "step": 1441 + }, + { + "epoch": 0.6408888888888888, + "grad_norm": 2.131709337234497, + "learning_rate": 0.00017451957295373666, + "loss": 2.554, + "step": 1442 + }, + { + "epoch": 0.6413333333333333, + "grad_norm": 2.146632194519043, + "learning_rate": 0.00017450177935943062, + "loss": 2.8733, + "step": 1443 + }, + { + "epoch": 0.6417777777777778, + "grad_norm": 2.0064449310302734, + "learning_rate": 0.00017448398576512457, + "loss": 2.4239, + "step": 1444 + }, + { + "epoch": 0.6422222222222222, + "grad_norm": 2.1821370124816895, + "learning_rate": 0.00017446619217081853, + "loss": 3.9109, + "step": 1445 + }, + { + "epoch": 0.6426666666666667, + "grad_norm": 1.7780088186264038, + "learning_rate": 0.00017444839857651246, + "loss": 2.3761, + "step": 1446 + }, + { + "epoch": 0.6431111111111111, + "grad_norm": 1.9316993951797485, + "learning_rate": 0.00017443060498220642, + "loss": 2.977, + "step": 1447 + }, + { + "epoch": 0.6435555555555555, + "grad_norm": 2.113534927368164, + "learning_rate": 0.00017441281138790035, + "loss": 2.6501, + "step": 1448 + }, + { + "epoch": 0.644, + "grad_norm": 2.1450841426849365, + "learning_rate": 0.0001743950177935943, + "loss": 1.5886, + "step": 1449 + }, + { + "epoch": 0.6444444444444445, + "grad_norm": 2.311339855194092, + "learning_rate": 0.00017437722419928826, + "loss": 2.0959, + "step": 1450 + }, + { + "epoch": 0.6448888888888888, + "grad_norm": 3.696924924850464, + "learning_rate": 0.00017435943060498222, + "loss": 1.73, + "step": 1451 + }, + { + "epoch": 0.6453333333333333, + "grad_norm": 1.0169023275375366, + "learning_rate": 0.00017434163701067617, + "loss": 2.5696, + "step": 1452 + }, + { + "epoch": 0.6457777777777778, + "grad_norm": 1.278465747833252, + "learning_rate": 0.0001743238434163701, + "loss": 2.876, + "step": 1453 + }, + { + "epoch": 0.6462222222222223, + "grad_norm": 1.182702898979187, + "learning_rate": 0.00017430604982206406, + "loss": 2.4447, + "step": 1454 + }, + { + "epoch": 0.6466666666666666, + "grad_norm": 1.3612101078033447, + "learning_rate": 0.00017428825622775801, + "loss": 2.5971, + "step": 1455 + }, + { + "epoch": 0.6471111111111111, + "grad_norm": 
1.2513208389282227, + "learning_rate": 0.00017427046263345197, + "loss": 1.7877, + "step": 1456 + }, + { + "epoch": 0.6475555555555556, + "grad_norm": 1.108196496963501, + "learning_rate": 0.00017425266903914593, + "loss": 2.2443, + "step": 1457 + }, + { + "epoch": 0.648, + "grad_norm": 1.3617606163024902, + "learning_rate": 0.00017423487544483988, + "loss": 2.4862, + "step": 1458 + }, + { + "epoch": 0.6484444444444445, + "grad_norm": 1.2257615327835083, + "learning_rate": 0.0001742170818505338, + "loss": 2.6374, + "step": 1459 + }, + { + "epoch": 0.6488888888888888, + "grad_norm": 1.6558245420455933, + "learning_rate": 0.00017419928825622777, + "loss": 1.4984, + "step": 1460 + }, + { + "epoch": 0.6493333333333333, + "grad_norm": 1.308287262916565, + "learning_rate": 0.0001741814946619217, + "loss": 2.5151, + "step": 1461 + }, + { + "epoch": 0.6497777777777778, + "grad_norm": 1.210597038269043, + "learning_rate": 0.00017416370106761566, + "loss": 2.0015, + "step": 1462 + }, + { + "epoch": 0.6502222222222223, + "grad_norm": 1.6040871143341064, + "learning_rate": 0.0001741459074733096, + "loss": 2.5846, + "step": 1463 + }, + { + "epoch": 0.6506666666666666, + "grad_norm": 1.5346297025680542, + "learning_rate": 0.00017412811387900357, + "loss": 2.6826, + "step": 1464 + }, + { + "epoch": 0.6511111111111111, + "grad_norm": 1.5624873638153076, + "learning_rate": 0.00017411032028469752, + "loss": 2.7424, + "step": 1465 + }, + { + "epoch": 0.6515555555555556, + "grad_norm": 1.3819774389266968, + "learning_rate": 0.00017409252669039145, + "loss": 2.7193, + "step": 1466 + }, + { + "epoch": 0.652, + "grad_norm": 1.2688080072402954, + "learning_rate": 0.0001740747330960854, + "loss": 2.2891, + "step": 1467 + }, + { + "epoch": 0.6524444444444445, + "grad_norm": 1.2661856412887573, + "learning_rate": 0.00017405693950177937, + "loss": 1.2741, + "step": 1468 + }, + { + "epoch": 0.6528888888888889, + "grad_norm": 1.215834617614746, + "learning_rate": 0.00017403914590747332, + "loss": 2.3391, + "step": 1469 + }, + { + "epoch": 0.6533333333333333, + "grad_norm": 1.450677752494812, + "learning_rate": 0.00017402135231316728, + "loss": 2.752, + "step": 1470 + }, + { + "epoch": 0.6537777777777778, + "grad_norm": 1.2563883066177368, + "learning_rate": 0.00017400355871886124, + "loss": 2.3388, + "step": 1471 + }, + { + "epoch": 0.6542222222222223, + "grad_norm": 1.5057649612426758, + "learning_rate": 0.00017398576512455517, + "loss": 2.4776, + "step": 1472 + }, + { + "epoch": 0.6546666666666666, + "grad_norm": 1.5081661939620972, + "learning_rate": 0.00017396797153024912, + "loss": 2.4154, + "step": 1473 + }, + { + "epoch": 0.6551111111111111, + "grad_norm": 1.3150933980941772, + "learning_rate": 0.00017395017793594305, + "loss": 2.4713, + "step": 1474 + }, + { + "epoch": 0.6555555555555556, + "grad_norm": 1.830236554145813, + "learning_rate": 0.000173932384341637, + "loss": 2.4647, + "step": 1475 + }, + { + "epoch": 0.656, + "grad_norm": 1.368726372718811, + "learning_rate": 0.00017391459074733097, + "loss": 2.1934, + "step": 1476 + }, + { + "epoch": 0.6564444444444445, + "grad_norm": 1.3172972202301025, + "learning_rate": 0.00017389679715302492, + "loss": 2.269, + "step": 1477 + }, + { + "epoch": 0.6568888888888889, + "grad_norm": 1.7238434553146362, + "learning_rate": 0.00017387900355871888, + "loss": 2.5588, + "step": 1478 + }, + { + "epoch": 0.6573333333333333, + "grad_norm": 1.5723549127578735, + "learning_rate": 0.0001738612099644128, + "loss": 2.3316, + "step": 1479 + }, + { + "epoch": 0.6577777777777778, 
+ "grad_norm": 1.4798550605773926, + "learning_rate": 0.00017384341637010676, + "loss": 2.2944, + "step": 1480 + }, + { + "epoch": 0.6582222222222223, + "grad_norm": 1.411055564880371, + "learning_rate": 0.00017382562277580072, + "loss": 2.5573, + "step": 1481 + }, + { + "epoch": 0.6586666666666666, + "grad_norm": 2.178536891937256, + "learning_rate": 0.00017380782918149468, + "loss": 2.6412, + "step": 1482 + }, + { + "epoch": 0.6591111111111111, + "grad_norm": 1.48294198513031, + "learning_rate": 0.00017379003558718863, + "loss": 2.2673, + "step": 1483 + }, + { + "epoch": 0.6595555555555556, + "grad_norm": 1.597641944885254, + "learning_rate": 0.0001737722419928826, + "loss": 2.1974, + "step": 1484 + }, + { + "epoch": 0.66, + "grad_norm": 1.7174814939498901, + "learning_rate": 0.00017375444839857652, + "loss": 2.7182, + "step": 1485 + }, + { + "epoch": 0.6604444444444444, + "grad_norm": 2.032003879547119, + "learning_rate": 0.00017373665480427045, + "loss": 3.3137, + "step": 1486 + }, + { + "epoch": 0.6608888888888889, + "grad_norm": 1.495666742324829, + "learning_rate": 0.0001737188612099644, + "loss": 1.8833, + "step": 1487 + }, + { + "epoch": 0.6613333333333333, + "grad_norm": 1.6871966123580933, + "learning_rate": 0.00017370106761565836, + "loss": 3.0253, + "step": 1488 + }, + { + "epoch": 0.6617777777777778, + "grad_norm": 1.5694763660430908, + "learning_rate": 0.00017368327402135232, + "loss": 1.9472, + "step": 1489 + }, + { + "epoch": 0.6622222222222223, + "grad_norm": 1.8708395957946777, + "learning_rate": 0.00017366548042704627, + "loss": 2.8555, + "step": 1490 + }, + { + "epoch": 0.6626666666666666, + "grad_norm": 1.8783323764801025, + "learning_rate": 0.00017364768683274023, + "loss": 2.4134, + "step": 1491 + }, + { + "epoch": 0.6631111111111111, + "grad_norm": 2.046388626098633, + "learning_rate": 0.00017362989323843416, + "loss": 2.7402, + "step": 1492 + }, + { + "epoch": 0.6635555555555556, + "grad_norm": 1.5785688161849976, + "learning_rate": 0.00017361209964412812, + "loss": 2.4242, + "step": 1493 + }, + { + "epoch": 0.664, + "grad_norm": 1.7086628675460815, + "learning_rate": 0.00017359430604982207, + "loss": 2.3236, + "step": 1494 + }, + { + "epoch": 0.6644444444444444, + "grad_norm": 1.5549596548080444, + "learning_rate": 0.00017357651245551603, + "loss": 2.23, + "step": 1495 + }, + { + "epoch": 0.6648888888888889, + "grad_norm": 2.255401372909546, + "learning_rate": 0.00017355871886121, + "loss": 3.0353, + "step": 1496 + }, + { + "epoch": 0.6653333333333333, + "grad_norm": 1.643557071685791, + "learning_rate": 0.00017354092526690394, + "loss": 2.4116, + "step": 1497 + }, + { + "epoch": 0.6657777777777778, + "grad_norm": 2.1392343044281006, + "learning_rate": 0.00017352313167259787, + "loss": 3.4415, + "step": 1498 + }, + { + "epoch": 0.6662222222222223, + "grad_norm": 2.6055009365081787, + "learning_rate": 0.0001735053380782918, + "loss": 0.1766, + "step": 1499 + }, + { + "epoch": 0.6666666666666666, + "grad_norm": 2.152284622192383, + "learning_rate": 0.00017348754448398576, + "loss": 2.9102, + "step": 1500 + }, + { + "epoch": 0.6671111111111111, + "grad_norm": 0.9273963570594788, + "learning_rate": 0.00017346975088967971, + "loss": 2.428, + "step": 1501 + }, + { + "epoch": 0.6675555555555556, + "grad_norm": 0.9428668022155762, + "learning_rate": 0.00017345195729537367, + "loss": 2.6534, + "step": 1502 + }, + { + "epoch": 0.668, + "grad_norm": 0.93215012550354, + "learning_rate": 0.00017343416370106763, + "loss": 2.351, + "step": 1503 + }, + { + "epoch": 
0.6684444444444444, + "grad_norm": 1.035394549369812, + "learning_rate": 0.00017341637010676158, + "loss": 1.1251, + "step": 1504 + }, + { + "epoch": 0.6688888888888889, + "grad_norm": 1.0628288984298706, + "learning_rate": 0.00017339857651245551, + "loss": 2.4944, + "step": 1505 + }, + { + "epoch": 0.6693333333333333, + "grad_norm": 1.3216973543167114, + "learning_rate": 0.00017338078291814947, + "loss": 2.5529, + "step": 1506 + }, + { + "epoch": 0.6697777777777778, + "grad_norm": 1.2949331998825073, + "learning_rate": 0.00017336298932384343, + "loss": 2.5878, + "step": 1507 + }, + { + "epoch": 0.6702222222222223, + "grad_norm": 1.3365072011947632, + "learning_rate": 0.00017334519572953738, + "loss": 3.059, + "step": 1508 + }, + { + "epoch": 0.6706666666666666, + "grad_norm": 1.181065320968628, + "learning_rate": 0.00017332740213523134, + "loss": 2.6428, + "step": 1509 + }, + { + "epoch": 0.6711111111111111, + "grad_norm": 1.2061887979507446, + "learning_rate": 0.0001733096085409253, + "loss": 1.4412, + "step": 1510 + }, + { + "epoch": 0.6715555555555556, + "grad_norm": 1.4257198572158813, + "learning_rate": 0.00017329181494661923, + "loss": 2.9952, + "step": 1511 + }, + { + "epoch": 0.672, + "grad_norm": 1.2718660831451416, + "learning_rate": 0.00017327402135231316, + "loss": 2.3034, + "step": 1512 + }, + { + "epoch": 0.6724444444444444, + "grad_norm": 1.2620964050292969, + "learning_rate": 0.0001732562277580071, + "loss": 1.7634, + "step": 1513 + }, + { + "epoch": 0.6728888888888889, + "grad_norm": 1.4162237644195557, + "learning_rate": 0.00017323843416370107, + "loss": 2.2134, + "step": 1514 + }, + { + "epoch": 0.6733333333333333, + "grad_norm": 1.3909696340560913, + "learning_rate": 0.00017322064056939502, + "loss": 2.7267, + "step": 1515 + }, + { + "epoch": 0.6737777777777778, + "grad_norm": 1.563040852546692, + "learning_rate": 0.00017320284697508898, + "loss": 1.83, + "step": 1516 + }, + { + "epoch": 0.6742222222222222, + "grad_norm": 1.417112112045288, + "learning_rate": 0.00017318505338078294, + "loss": 2.1559, + "step": 1517 + }, + { + "epoch": 0.6746666666666666, + "grad_norm": 1.617037057876587, + "learning_rate": 0.00017316725978647687, + "loss": 2.7987, + "step": 1518 + }, + { + "epoch": 0.6751111111111111, + "grad_norm": 1.5133682489395142, + "learning_rate": 0.00017314946619217082, + "loss": 2.5342, + "step": 1519 + }, + { + "epoch": 0.6755555555555556, + "grad_norm": 1.3706659078598022, + "learning_rate": 0.00017313167259786478, + "loss": 2.583, + "step": 1520 + }, + { + "epoch": 0.676, + "grad_norm": 1.600906252861023, + "learning_rate": 0.00017311387900355874, + "loss": 2.7443, + "step": 1521 + }, + { + "epoch": 0.6764444444444444, + "grad_norm": 1.269012689590454, + "learning_rate": 0.0001730960854092527, + "loss": 2.3754, + "step": 1522 + }, + { + "epoch": 0.6768888888888889, + "grad_norm": 1.382144808769226, + "learning_rate": 0.00017307829181494665, + "loss": 2.5199, + "step": 1523 + }, + { + "epoch": 0.6773333333333333, + "grad_norm": 1.71562659740448, + "learning_rate": 0.00017306049822064058, + "loss": 3.278, + "step": 1524 + }, + { + "epoch": 0.6777777777777778, + "grad_norm": 1.2466914653778076, + "learning_rate": 0.0001730427046263345, + "loss": 2.3992, + "step": 1525 + }, + { + "epoch": 0.6782222222222222, + "grad_norm": 1.547672986984253, + "learning_rate": 0.00017302491103202846, + "loss": 2.4622, + "step": 1526 + }, + { + "epoch": 0.6786666666666666, + "grad_norm": 1.5382349491119385, + "learning_rate": 0.00017300711743772242, + "loss": 2.1739, + "step": 
1527 + }, + { + "epoch": 0.6791111111111111, + "grad_norm": 1.4863885641098022, + "learning_rate": 0.00017298932384341638, + "loss": 2.6332, + "step": 1528 + }, + { + "epoch": 0.6795555555555556, + "grad_norm": 1.6594899892807007, + "learning_rate": 0.00017297153024911033, + "loss": 3.0222, + "step": 1529 + }, + { + "epoch": 0.68, + "grad_norm": 1.72464120388031, + "learning_rate": 0.00017295373665480426, + "loss": 2.6409, + "step": 1530 + }, + { + "epoch": 0.6804444444444444, + "grad_norm": 1.5191189050674438, + "learning_rate": 0.00017293594306049822, + "loss": 2.6336, + "step": 1531 + }, + { + "epoch": 0.6808888888888889, + "grad_norm": 1.8320003747940063, + "learning_rate": 0.00017291814946619218, + "loss": 2.7605, + "step": 1532 + }, + { + "epoch": 0.6813333333333333, + "grad_norm": 1.6135400533676147, + "learning_rate": 0.00017290035587188613, + "loss": 2.6308, + "step": 1533 + }, + { + "epoch": 0.6817777777777778, + "grad_norm": 1.8295135498046875, + "learning_rate": 0.0001728825622775801, + "loss": 2.5421, + "step": 1534 + }, + { + "epoch": 0.6822222222222222, + "grad_norm": 1.799838900566101, + "learning_rate": 0.00017286476868327405, + "loss": 3.0082, + "step": 1535 + }, + { + "epoch": 0.6826666666666666, + "grad_norm": 1.427129864692688, + "learning_rate": 0.00017284697508896798, + "loss": 2.5288, + "step": 1536 + }, + { + "epoch": 0.6831111111111111, + "grad_norm": 1.533745527267456, + "learning_rate": 0.00017282918149466193, + "loss": 2.7791, + "step": 1537 + }, + { + "epoch": 0.6835555555555556, + "grad_norm": 1.6388285160064697, + "learning_rate": 0.00017281138790035586, + "loss": 3.325, + "step": 1538 + }, + { + "epoch": 0.684, + "grad_norm": 1.5030821561813354, + "learning_rate": 0.00017279359430604982, + "loss": 2.0334, + "step": 1539 + }, + { + "epoch": 0.6844444444444444, + "grad_norm": 1.2664015293121338, + "learning_rate": 0.00017277580071174377, + "loss": 2.0804, + "step": 1540 + }, + { + "epoch": 0.6848888888888889, + "grad_norm": 1.496182918548584, + "learning_rate": 0.00017275800711743773, + "loss": 1.6106, + "step": 1541 + }, + { + "epoch": 0.6853333333333333, + "grad_norm": 1.7371916770935059, + "learning_rate": 0.0001727402135231317, + "loss": 2.6694, + "step": 1542 + }, + { + "epoch": 0.6857777777777778, + "grad_norm": 1.8223196268081665, + "learning_rate": 0.00017272241992882562, + "loss": 2.5409, + "step": 1543 + }, + { + "epoch": 0.6862222222222222, + "grad_norm": 1.9429682493209839, + "learning_rate": 0.00017270462633451957, + "loss": 2.5704, + "step": 1544 + }, + { + "epoch": 0.6866666666666666, + "grad_norm": 1.7982163429260254, + "learning_rate": 0.00017268683274021353, + "loss": 2.9834, + "step": 1545 + }, + { + "epoch": 0.6871111111111111, + "grad_norm": 1.8245515823364258, + "learning_rate": 0.00017266903914590749, + "loss": 2.6373, + "step": 1546 + }, + { + "epoch": 0.6875555555555556, + "grad_norm": 1.7472467422485352, + "learning_rate": 0.00017265124555160144, + "loss": 2.8218, + "step": 1547 + }, + { + "epoch": 0.688, + "grad_norm": 1.9026468992233276, + "learning_rate": 0.0001726334519572954, + "loss": 3.0626, + "step": 1548 + }, + { + "epoch": 0.6884444444444444, + "grad_norm": 2.187288522720337, + "learning_rate": 0.00017261565836298933, + "loss": 2.3243, + "step": 1549 + }, + { + "epoch": 0.6888888888888889, + "grad_norm": 2.155287742614746, + "learning_rate": 0.00017259786476868329, + "loss": 3.0798, + "step": 1550 + }, + { + "epoch": 0.6893333333333334, + "grad_norm": 2.2100751399993896, + "learning_rate": 0.00017258007117437721, + "loss": 
2.1143, + "step": 1551 + }, + { + "epoch": 0.6897777777777778, + "grad_norm": 1.0366290807724, + "learning_rate": 0.00017256227758007117, + "loss": 2.1292, + "step": 1552 + }, + { + "epoch": 0.6902222222222222, + "grad_norm": 1.2031378746032715, + "learning_rate": 0.00017254448398576513, + "loss": 2.6824, + "step": 1553 + }, + { + "epoch": 0.6906666666666667, + "grad_norm": 1.154441475868225, + "learning_rate": 0.00017252669039145908, + "loss": 2.203, + "step": 1554 + }, + { + "epoch": 0.6911111111111111, + "grad_norm": 1.2247297763824463, + "learning_rate": 0.00017250889679715304, + "loss": 2.8808, + "step": 1555 + }, + { + "epoch": 0.6915555555555556, + "grad_norm": 1.3467925786972046, + "learning_rate": 0.00017249110320284697, + "loss": 2.9562, + "step": 1556 + }, + { + "epoch": 0.692, + "grad_norm": 1.1302120685577393, + "learning_rate": 0.00017247330960854093, + "loss": 2.485, + "step": 1557 + }, + { + "epoch": 0.6924444444444444, + "grad_norm": 1.2643386125564575, + "learning_rate": 0.00017245551601423488, + "loss": 2.572, + "step": 1558 + }, + { + "epoch": 0.6928888888888889, + "grad_norm": 1.353574514389038, + "learning_rate": 0.00017243772241992884, + "loss": 2.671, + "step": 1559 + }, + { + "epoch": 0.6933333333333334, + "grad_norm": 1.7643210887908936, + "learning_rate": 0.0001724199288256228, + "loss": 2.6473, + "step": 1560 + }, + { + "epoch": 0.6937777777777778, + "grad_norm": 1.5903198719024658, + "learning_rate": 0.00017240213523131675, + "loss": 2.2954, + "step": 1561 + }, + { + "epoch": 0.6942222222222222, + "grad_norm": 1.627631425857544, + "learning_rate": 0.00017238434163701068, + "loss": 2.233, + "step": 1562 + }, + { + "epoch": 0.6946666666666667, + "grad_norm": 1.3801125288009644, + "learning_rate": 0.00017236654804270464, + "loss": 2.2972, + "step": 1563 + }, + { + "epoch": 0.6951111111111111, + "grad_norm": 1.451066017150879, + "learning_rate": 0.00017234875444839857, + "loss": 2.013, + "step": 1564 + }, + { + "epoch": 0.6955555555555556, + "grad_norm": 1.316686987876892, + "learning_rate": 0.00017233096085409252, + "loss": 2.3651, + "step": 1565 + }, + { + "epoch": 0.696, + "grad_norm": 1.300595760345459, + "learning_rate": 0.00017231316725978648, + "loss": 2.0519, + "step": 1566 + }, + { + "epoch": 0.6964444444444444, + "grad_norm": 1.4267830848693848, + "learning_rate": 0.00017229537366548044, + "loss": 2.2664, + "step": 1567 + }, + { + "epoch": 0.6968888888888889, + "grad_norm": 1.5179320573806763, + "learning_rate": 0.0001722775800711744, + "loss": 2.3228, + "step": 1568 + }, + { + "epoch": 0.6973333333333334, + "grad_norm": 1.7718604803085327, + "learning_rate": 0.00017225978647686832, + "loss": 2.6723, + "step": 1569 + }, + { + "epoch": 0.6977777777777778, + "grad_norm": 2.2856781482696533, + "learning_rate": 0.00017224199288256228, + "loss": 2.3014, + "step": 1570 + }, + { + "epoch": 0.6982222222222222, + "grad_norm": 1.3740836381912231, + "learning_rate": 0.00017222419928825624, + "loss": 2.7384, + "step": 1571 + }, + { + "epoch": 0.6986666666666667, + "grad_norm": 1.3104565143585205, + "learning_rate": 0.0001722064056939502, + "loss": 2.4267, + "step": 1572 + }, + { + "epoch": 0.6991111111111111, + "grad_norm": 1.5572513341903687, + "learning_rate": 0.00017218861209964415, + "loss": 2.7882, + "step": 1573 + }, + { + "epoch": 0.6995555555555556, + "grad_norm": 1.344378113746643, + "learning_rate": 0.0001721708185053381, + "loss": 2.6669, + "step": 1574 + }, + { + "epoch": 0.7, + "grad_norm": 1.6647869348526, + "learning_rate": 0.00017215302491103203, + 
"loss": 2.5764, + "step": 1575 + }, + { + "epoch": 0.7004444444444444, + "grad_norm": 1.4547927379608154, + "learning_rate": 0.000172135231316726, + "loss": 2.4882, + "step": 1576 + }, + { + "epoch": 0.7008888888888889, + "grad_norm": 1.69290030002594, + "learning_rate": 0.00017211743772241992, + "loss": 2.6278, + "step": 1577 + }, + { + "epoch": 0.7013333333333334, + "grad_norm": 1.5832207202911377, + "learning_rate": 0.00017209964412811388, + "loss": 2.3207, + "step": 1578 + }, + { + "epoch": 0.7017777777777777, + "grad_norm": 1.6772409677505493, + "learning_rate": 0.00017208185053380783, + "loss": 2.9327, + "step": 1579 + }, + { + "epoch": 0.7022222222222222, + "grad_norm": 1.6155133247375488, + "learning_rate": 0.0001720640569395018, + "loss": 2.6978, + "step": 1580 + }, + { + "epoch": 0.7026666666666667, + "grad_norm": 1.2734161615371704, + "learning_rate": 0.00017204626334519575, + "loss": 1.9025, + "step": 1581 + }, + { + "epoch": 0.7031111111111111, + "grad_norm": 1.4908726215362549, + "learning_rate": 0.00017202846975088968, + "loss": 2.1544, + "step": 1582 + }, + { + "epoch": 0.7035555555555556, + "grad_norm": 1.7874783277511597, + "learning_rate": 0.00017201067615658363, + "loss": 3.0701, + "step": 1583 + }, + { + "epoch": 0.704, + "grad_norm": 2.0259952545166016, + "learning_rate": 0.0001719928825622776, + "loss": 2.926, + "step": 1584 + }, + { + "epoch": 0.7044444444444444, + "grad_norm": 1.7444260120391846, + "learning_rate": 0.00017197508896797155, + "loss": 2.2407, + "step": 1585 + }, + { + "epoch": 0.7048888888888889, + "grad_norm": 1.7600386142730713, + "learning_rate": 0.0001719572953736655, + "loss": 2.6499, + "step": 1586 + }, + { + "epoch": 0.7053333333333334, + "grad_norm": 1.7595195770263672, + "learning_rate": 0.00017193950177935946, + "loss": 2.4446, + "step": 1587 + }, + { + "epoch": 0.7057777777777777, + "grad_norm": 1.9496681690216064, + "learning_rate": 0.0001719217081850534, + "loss": 3.0214, + "step": 1588 + }, + { + "epoch": 0.7062222222222222, + "grad_norm": 1.8185930252075195, + "learning_rate": 0.00017190391459074734, + "loss": 2.5126, + "step": 1589 + }, + { + "epoch": 0.7066666666666667, + "grad_norm": 1.7105134725570679, + "learning_rate": 0.00017188612099644127, + "loss": 2.6629, + "step": 1590 + }, + { + "epoch": 0.7071111111111111, + "grad_norm": 1.8199312686920166, + "learning_rate": 0.00017186832740213523, + "loss": 2.2038, + "step": 1591 + }, + { + "epoch": 0.7075555555555556, + "grad_norm": 1.6544042825698853, + "learning_rate": 0.0001718505338078292, + "loss": 2.4603, + "step": 1592 + }, + { + "epoch": 0.708, + "grad_norm": 1.5924146175384521, + "learning_rate": 0.00017183274021352314, + "loss": 2.5588, + "step": 1593 + }, + { + "epoch": 0.7084444444444444, + "grad_norm": 1.6457511186599731, + "learning_rate": 0.0001718149466192171, + "loss": 2.7249, + "step": 1594 + }, + { + "epoch": 0.7088888888888889, + "grad_norm": 1.7341830730438232, + "learning_rate": 0.00017179715302491103, + "loss": 2.7286, + "step": 1595 + }, + { + "epoch": 0.7093333333333334, + "grad_norm": 1.8553625345230103, + "learning_rate": 0.00017177935943060499, + "loss": 3.2606, + "step": 1596 + }, + { + "epoch": 0.7097777777777777, + "grad_norm": 2.046403408050537, + "learning_rate": 0.00017176156583629894, + "loss": 2.7661, + "step": 1597 + }, + { + "epoch": 0.7102222222222222, + "grad_norm": 1.9650744199752808, + "learning_rate": 0.0001717437722419929, + "loss": 2.7783, + "step": 1598 + }, + { + "epoch": 0.7106666666666667, + "grad_norm": 2.4216885566711426, + 
"learning_rate": 0.00017172597864768686, + "loss": 2.6446, + "step": 1599 + }, + { + "epoch": 0.7111111111111111, + "grad_norm": 2.498917818069458, + "learning_rate": 0.0001717081850533808, + "loss": 3.2051, + "step": 1600 + }, + { + "epoch": 0.7115555555555556, + "grad_norm": 1.0230510234832764, + "learning_rate": 0.00017169039145907474, + "loss": 2.7331, + "step": 1601 + }, + { + "epoch": 0.712, + "grad_norm": 1.6314057111740112, + "learning_rate": 0.00017167259786476867, + "loss": 1.3376, + "step": 1602 + }, + { + "epoch": 0.7124444444444444, + "grad_norm": 1.033645510673523, + "learning_rate": 0.00017165480427046263, + "loss": 2.7086, + "step": 1603 + }, + { + "epoch": 0.7128888888888889, + "grad_norm": 1.1229987144470215, + "learning_rate": 0.00017163701067615658, + "loss": 2.2418, + "step": 1604 + }, + { + "epoch": 0.7133333333333334, + "grad_norm": 1.2182966470718384, + "learning_rate": 0.00017161921708185054, + "loss": 2.8067, + "step": 1605 + }, + { + "epoch": 0.7137777777777777, + "grad_norm": 1.102390170097351, + "learning_rate": 0.0001716014234875445, + "loss": 3.0503, + "step": 1606 + }, + { + "epoch": 0.7142222222222222, + "grad_norm": 1.0558178424835205, + "learning_rate": 0.00017158362989323845, + "loss": 3.2076, + "step": 1607 + }, + { + "epoch": 0.7146666666666667, + "grad_norm": 1.236777424812317, + "learning_rate": 0.00017156583629893238, + "loss": 2.8685, + "step": 1608 + }, + { + "epoch": 0.7151111111111111, + "grad_norm": 1.6383613348007202, + "learning_rate": 0.00017154804270462634, + "loss": 2.9057, + "step": 1609 + }, + { + "epoch": 0.7155555555555555, + "grad_norm": 1.3844068050384521, + "learning_rate": 0.0001715302491103203, + "loss": 2.5375, + "step": 1610 + }, + { + "epoch": 0.716, + "grad_norm": 1.1408178806304932, + "learning_rate": 0.00017151245551601425, + "loss": 2.5761, + "step": 1611 + }, + { + "epoch": 0.7164444444444444, + "grad_norm": 1.1641733646392822, + "learning_rate": 0.0001714946619217082, + "loss": 2.6517, + "step": 1612 + }, + { + "epoch": 0.7168888888888889, + "grad_norm": 1.1055686473846436, + "learning_rate": 0.00017147686832740214, + "loss": 2.1878, + "step": 1613 + }, + { + "epoch": 0.7173333333333334, + "grad_norm": 1.1979750394821167, + "learning_rate": 0.0001714590747330961, + "loss": 2.4771, + "step": 1614 + }, + { + "epoch": 0.7177777777777777, + "grad_norm": 1.271674633026123, + "learning_rate": 0.00017144128113879002, + "loss": 2.4917, + "step": 1615 + }, + { + "epoch": 0.7182222222222222, + "grad_norm": 1.3329592943191528, + "learning_rate": 0.00017142348754448398, + "loss": 2.5694, + "step": 1616 + }, + { + "epoch": 0.7186666666666667, + "grad_norm": 1.4913185834884644, + "learning_rate": 0.00017140569395017794, + "loss": 2.8739, + "step": 1617 + }, + { + "epoch": 0.7191111111111111, + "grad_norm": 1.3278918266296387, + "learning_rate": 0.0001713879003558719, + "loss": 2.6695, + "step": 1618 + }, + { + "epoch": 0.7195555555555555, + "grad_norm": 1.5261479616165161, + "learning_rate": 0.00017137010676156585, + "loss": 1.4149, + "step": 1619 + }, + { + "epoch": 0.72, + "grad_norm": 1.4373037815093994, + "learning_rate": 0.00017135231316725978, + "loss": 2.8322, + "step": 1620 + }, + { + "epoch": 0.7204444444444444, + "grad_norm": 1.2355538606643677, + "learning_rate": 0.00017133451957295374, + "loss": 2.1176, + "step": 1621 + }, + { + "epoch": 0.7208888888888889, + "grad_norm": 1.29166579246521, + "learning_rate": 0.0001713167259786477, + "loss": 2.5198, + "step": 1622 + }, + { + "epoch": 0.7213333333333334, + "grad_norm": 
1.6880923509597778, + "learning_rate": 0.00017129893238434165, + "loss": 2.9581, + "step": 1623 + }, + { + "epoch": 0.7217777777777777, + "grad_norm": 1.5091137886047363, + "learning_rate": 0.0001712811387900356, + "loss": 2.2347, + "step": 1624 + }, + { + "epoch": 0.7222222222222222, + "grad_norm": 1.5347869396209717, + "learning_rate": 0.00017126334519572956, + "loss": 2.6514, + "step": 1625 + }, + { + "epoch": 0.7226666666666667, + "grad_norm": 1.6462808847427368, + "learning_rate": 0.0001712455516014235, + "loss": 2.9572, + "step": 1626 + }, + { + "epoch": 0.7231111111111111, + "grad_norm": 1.5206083059310913, + "learning_rate": 0.00017122775800711745, + "loss": 2.6225, + "step": 1627 + }, + { + "epoch": 0.7235555555555555, + "grad_norm": 1.6654325723648071, + "learning_rate": 0.00017120996441281138, + "loss": 2.4817, + "step": 1628 + }, + { + "epoch": 0.724, + "grad_norm": 1.6981536149978638, + "learning_rate": 0.00017119217081850533, + "loss": 2.5816, + "step": 1629 + }, + { + "epoch": 0.7244444444444444, + "grad_norm": 1.5140714645385742, + "learning_rate": 0.0001711743772241993, + "loss": 2.6285, + "step": 1630 + }, + { + "epoch": 0.7248888888888889, + "grad_norm": 1.4249933958053589, + "learning_rate": 0.00017115658362989325, + "loss": 1.8135, + "step": 1631 + }, + { + "epoch": 0.7253333333333334, + "grad_norm": 1.72703218460083, + "learning_rate": 0.0001711387900355872, + "loss": 2.7962, + "step": 1632 + }, + { + "epoch": 0.7257777777777777, + "grad_norm": 1.43552565574646, + "learning_rate": 0.00017112099644128113, + "loss": 2.0849, + "step": 1633 + }, + { + "epoch": 0.7262222222222222, + "grad_norm": 1.728300929069519, + "learning_rate": 0.0001711032028469751, + "loss": 3.2766, + "step": 1634 + }, + { + "epoch": 0.7266666666666667, + "grad_norm": 1.7892253398895264, + "learning_rate": 0.00017108540925266905, + "loss": 2.9754, + "step": 1635 + }, + { + "epoch": 0.7271111111111112, + "grad_norm": 1.731090784072876, + "learning_rate": 0.000171067615658363, + "loss": 2.6769, + "step": 1636 + }, + { + "epoch": 0.7275555555555555, + "grad_norm": 1.4827601909637451, + "learning_rate": 0.00017104982206405696, + "loss": 2.1877, + "step": 1637 + }, + { + "epoch": 0.728, + "grad_norm": 1.4949201345443726, + "learning_rate": 0.00017103202846975091, + "loss": 2.7004, + "step": 1638 + }, + { + "epoch": 0.7284444444444444, + "grad_norm": 1.5235779285430908, + "learning_rate": 0.00017101423487544484, + "loss": 2.451, + "step": 1639 + }, + { + "epoch": 0.7288888888888889, + "grad_norm": 1.7077027559280396, + "learning_rate": 0.0001709964412811388, + "loss": 2.4066, + "step": 1640 + }, + { + "epoch": 0.7293333333333333, + "grad_norm": 1.3838727474212646, + "learning_rate": 0.00017097864768683273, + "loss": 2.0036, + "step": 1641 + }, + { + "epoch": 0.7297777777777777, + "grad_norm": 1.5762803554534912, + "learning_rate": 0.0001709608540925267, + "loss": 2.3423, + "step": 1642 + }, + { + "epoch": 0.7302222222222222, + "grad_norm": 1.5735541582107544, + "learning_rate": 0.00017094306049822064, + "loss": 2.6597, + "step": 1643 + }, + { + "epoch": 0.7306666666666667, + "grad_norm": 2.120513916015625, + "learning_rate": 0.0001709252669039146, + "loss": 3.0132, + "step": 1644 + }, + { + "epoch": 0.7311111111111112, + "grad_norm": 1.8525890111923218, + "learning_rate": 0.00017090747330960856, + "loss": 2.3857, + "step": 1645 + }, + { + "epoch": 0.7315555555555555, + "grad_norm": 1.7836278676986694, + "learning_rate": 0.00017088967971530249, + "loss": 2.9966, + "step": 1646 + }, + { + "epoch": 0.732, + 
"grad_norm": 2.076381206512451, + "learning_rate": 0.00017087188612099644, + "loss": 3.142, + "step": 1647 + }, + { + "epoch": 0.7324444444444445, + "grad_norm": 2.3997738361358643, + "learning_rate": 0.0001708540925266904, + "loss": 3.5624, + "step": 1648 + }, + { + "epoch": 0.7328888888888889, + "grad_norm": 2.19384503364563, + "learning_rate": 0.00017083629893238435, + "loss": 2.0362, + "step": 1649 + }, + { + "epoch": 0.7333333333333333, + "grad_norm": 2.8410866260528564, + "learning_rate": 0.0001708185053380783, + "loss": 2.3181, + "step": 1650 + }, + { + "epoch": 0.7337777777777778, + "grad_norm": 1.1997778415679932, + "learning_rate": 0.00017080071174377227, + "loss": 1.5159, + "step": 1651 + }, + { + "epoch": 0.7342222222222222, + "grad_norm": 1.0796202421188354, + "learning_rate": 0.0001707829181494662, + "loss": 3.136, + "step": 1652 + }, + { + "epoch": 0.7346666666666667, + "grad_norm": 1.2189334630966187, + "learning_rate": 0.00017076512455516015, + "loss": 2.7014, + "step": 1653 + }, + { + "epoch": 0.7351111111111112, + "grad_norm": 1.2097785472869873, + "learning_rate": 0.00017074733096085408, + "loss": 3.3386, + "step": 1654 + }, + { + "epoch": 0.7355555555555555, + "grad_norm": 1.1241984367370605, + "learning_rate": 0.00017072953736654804, + "loss": 2.809, + "step": 1655 + }, + { + "epoch": 0.736, + "grad_norm": 1.315993309020996, + "learning_rate": 0.000170711743772242, + "loss": 2.1582, + "step": 1656 + }, + { + "epoch": 0.7364444444444445, + "grad_norm": 1.2245084047317505, + "learning_rate": 0.00017069395017793595, + "loss": 2.5975, + "step": 1657 + }, + { + "epoch": 0.7368888888888889, + "grad_norm": 1.542858362197876, + "learning_rate": 0.0001706761565836299, + "loss": 1.5061, + "step": 1658 + }, + { + "epoch": 0.7373333333333333, + "grad_norm": 1.0316481590270996, + "learning_rate": 0.00017065836298932384, + "loss": 1.3895, + "step": 1659 + }, + { + "epoch": 0.7377777777777778, + "grad_norm": 1.2059721946716309, + "learning_rate": 0.0001706405693950178, + "loss": 2.5289, + "step": 1660 + }, + { + "epoch": 0.7382222222222222, + "grad_norm": 1.303240418434143, + "learning_rate": 0.00017062277580071175, + "loss": 2.6186, + "step": 1661 + }, + { + "epoch": 0.7386666666666667, + "grad_norm": 1.26139235496521, + "learning_rate": 0.0001706049822064057, + "loss": 2.7619, + "step": 1662 + }, + { + "epoch": 0.7391111111111112, + "grad_norm": 1.4750614166259766, + "learning_rate": 0.00017058718861209966, + "loss": 2.4993, + "step": 1663 + }, + { + "epoch": 0.7395555555555555, + "grad_norm": 1.3910586833953857, + "learning_rate": 0.00017056939501779362, + "loss": 2.2823, + "step": 1664 + }, + { + "epoch": 0.74, + "grad_norm": 1.4160467386245728, + "learning_rate": 0.00017055160142348755, + "loss": 2.102, + "step": 1665 + }, + { + "epoch": 0.7404444444444445, + "grad_norm": 1.1895157098770142, + "learning_rate": 0.0001705338078291815, + "loss": 1.9158, + "step": 1666 + }, + { + "epoch": 0.7408888888888889, + "grad_norm": 1.5959806442260742, + "learning_rate": 0.00017051601423487544, + "loss": 2.5473, + "step": 1667 + }, + { + "epoch": 0.7413333333333333, + "grad_norm": 1.5768413543701172, + "learning_rate": 0.0001704982206405694, + "loss": 1.2562, + "step": 1668 + }, + { + "epoch": 0.7417777777777778, + "grad_norm": 1.4953051805496216, + "learning_rate": 0.00017048042704626335, + "loss": 2.5303, + "step": 1669 + }, + { + "epoch": 0.7422222222222222, + "grad_norm": 1.3557592630386353, + "learning_rate": 0.0001704626334519573, + "loss": 2.2733, + "step": 1670 + }, + { + "epoch": 
0.7426666666666667, + "grad_norm": 1.5126179456710815, + "learning_rate": 0.00017044483985765126, + "loss": 2.7248, + "step": 1671 + }, + { + "epoch": 0.7431111111111111, + "grad_norm": 1.3499473333358765, + "learning_rate": 0.0001704270462633452, + "loss": 2.5348, + "step": 1672 + }, + { + "epoch": 0.7435555555555555, + "grad_norm": 1.2511281967163086, + "learning_rate": 0.00017040925266903915, + "loss": 2.2185, + "step": 1673 + }, + { + "epoch": 0.744, + "grad_norm": 1.4628006219863892, + "learning_rate": 0.0001703914590747331, + "loss": 2.3975, + "step": 1674 + }, + { + "epoch": 0.7444444444444445, + "grad_norm": 1.4203425645828247, + "learning_rate": 0.00017037366548042706, + "loss": 2.1031, + "step": 1675 + }, + { + "epoch": 0.7448888888888889, + "grad_norm": 1.3346225023269653, + "learning_rate": 0.00017035587188612102, + "loss": 1.9704, + "step": 1676 + }, + { + "epoch": 0.7453333333333333, + "grad_norm": 1.3518871068954468, + "learning_rate": 0.00017033807829181497, + "loss": 2.6569, + "step": 1677 + }, + { + "epoch": 0.7457777777777778, + "grad_norm": 1.5000810623168945, + "learning_rate": 0.0001703202846975089, + "loss": 2.1933, + "step": 1678 + }, + { + "epoch": 0.7462222222222222, + "grad_norm": 1.6626880168914795, + "learning_rate": 0.00017030249110320286, + "loss": 2.166, + "step": 1679 + }, + { + "epoch": 0.7466666666666667, + "grad_norm": 1.3020575046539307, + "learning_rate": 0.0001702846975088968, + "loss": 2.4969, + "step": 1680 + }, + { + "epoch": 0.7471111111111111, + "grad_norm": 1.4085545539855957, + "learning_rate": 0.00017026690391459075, + "loss": 2.4218, + "step": 1681 + }, + { + "epoch": 0.7475555555555555, + "grad_norm": 1.861728310585022, + "learning_rate": 0.0001702491103202847, + "loss": 2.2863, + "step": 1682 + }, + { + "epoch": 0.748, + "grad_norm": 1.688712239265442, + "learning_rate": 0.00017023131672597866, + "loss": 2.773, + "step": 1683 + }, + { + "epoch": 0.7484444444444445, + "grad_norm": 1.4581354856491089, + "learning_rate": 0.00017021352313167262, + "loss": 2.4764, + "step": 1684 + }, + { + "epoch": 0.7488888888888889, + "grad_norm": 1.8125518560409546, + "learning_rate": 0.00017019572953736654, + "loss": 2.1238, + "step": 1685 + }, + { + "epoch": 0.7493333333333333, + "grad_norm": 1.5851460695266724, + "learning_rate": 0.0001701779359430605, + "loss": 2.2135, + "step": 1686 + }, + { + "epoch": 0.7497777777777778, + "grad_norm": 1.852522611618042, + "learning_rate": 0.00017016014234875446, + "loss": 2.3781, + "step": 1687 + }, + { + "epoch": 0.7502222222222222, + "grad_norm": 1.500272512435913, + "learning_rate": 0.00017014234875444841, + "loss": 2.2147, + "step": 1688 + }, + { + "epoch": 0.7506666666666667, + "grad_norm": 1.4343912601470947, + "learning_rate": 0.00017012455516014237, + "loss": 2.3782, + "step": 1689 + }, + { + "epoch": 0.7511111111111111, + "grad_norm": 1.8600046634674072, + "learning_rate": 0.00017010676156583633, + "loss": 2.7655, + "step": 1690 + }, + { + "epoch": 0.7515555555555555, + "grad_norm": 1.5437164306640625, + "learning_rate": 0.00017008896797153026, + "loss": 2.3394, + "step": 1691 + }, + { + "epoch": 0.752, + "grad_norm": 1.9030184745788574, + "learning_rate": 0.0001700711743772242, + "loss": 2.7159, + "step": 1692 + }, + { + "epoch": 0.7524444444444445, + "grad_norm": 2.081378698348999, + "learning_rate": 0.00017005338078291814, + "loss": 2.5658, + "step": 1693 + }, + { + "epoch": 0.7528888888888889, + "grad_norm": 1.6532082557678223, + "learning_rate": 0.0001700355871886121, + "loss": 2.3354, + "step": 1694 + }, 
+ { + "epoch": 0.7533333333333333, + "grad_norm": 2.3726096153259277, + "learning_rate": 0.00017001779359430606, + "loss": 3.1188, + "step": 1695 + }, + { + "epoch": 0.7537777777777778, + "grad_norm": 2.014913558959961, + "learning_rate": 0.00017, + "loss": 3.0022, + "step": 1696 + }, + { + "epoch": 0.7542222222222222, + "grad_norm": 2.250953197479248, + "learning_rate": 0.00016998220640569397, + "loss": 2.8569, + "step": 1697 + }, + { + "epoch": 0.7546666666666667, + "grad_norm": 1.7740085124969482, + "learning_rate": 0.0001699644128113879, + "loss": 2.3986, + "step": 1698 + }, + { + "epoch": 0.7551111111111111, + "grad_norm": 1.9951434135437012, + "learning_rate": 0.00016994661921708185, + "loss": 1.5462, + "step": 1699 + }, + { + "epoch": 0.7555555555555555, + "grad_norm": 2.3089957237243652, + "learning_rate": 0.0001699288256227758, + "loss": 2.038, + "step": 1700 + }, + { + "epoch": 0.756, + "grad_norm": 0.9273460507392883, + "learning_rate": 0.00016991103202846977, + "loss": 2.2534, + "step": 1701 + }, + { + "epoch": 0.7564444444444445, + "grad_norm": 1.0741509199142456, + "learning_rate": 0.00016989323843416372, + "loss": 2.6395, + "step": 1702 + }, + { + "epoch": 0.7568888888888889, + "grad_norm": 1.2935476303100586, + "learning_rate": 0.00016987544483985765, + "loss": 2.7484, + "step": 1703 + }, + { + "epoch": 0.7573333333333333, + "grad_norm": 1.2503875494003296, + "learning_rate": 0.0001698576512455516, + "loss": 2.4681, + "step": 1704 + }, + { + "epoch": 0.7577777777777778, + "grad_norm": 1.1031012535095215, + "learning_rate": 0.00016983985765124557, + "loss": 2.5408, + "step": 1705 + }, + { + "epoch": 0.7582222222222222, + "grad_norm": 1.1890628337860107, + "learning_rate": 0.0001698220640569395, + "loss": 2.9032, + "step": 1706 + }, + { + "epoch": 0.7586666666666667, + "grad_norm": 1.1588826179504395, + "learning_rate": 0.00016980427046263345, + "loss": 2.3581, + "step": 1707 + }, + { + "epoch": 0.7591111111111111, + "grad_norm": 1.2451859712600708, + "learning_rate": 0.0001697864768683274, + "loss": 2.4299, + "step": 1708 + }, + { + "epoch": 0.7595555555555555, + "grad_norm": 1.4853380918502808, + "learning_rate": 0.00016976868327402137, + "loss": 3.0938, + "step": 1709 + }, + { + "epoch": 0.76, + "grad_norm": 1.686055302619934, + "learning_rate": 0.0001697508896797153, + "loss": 2.3023, + "step": 1710 + }, + { + "epoch": 0.7604444444444445, + "grad_norm": 1.3711706399917603, + "learning_rate": 0.00016973309608540925, + "loss": 2.6126, + "step": 1711 + }, + { + "epoch": 0.7608888888888888, + "grad_norm": 1.3711973428726196, + "learning_rate": 0.0001697153024911032, + "loss": 2.634, + "step": 1712 + }, + { + "epoch": 0.7613333333333333, + "grad_norm": 1.236276388168335, + "learning_rate": 0.00016969750889679716, + "loss": 2.6303, + "step": 1713 + }, + { + "epoch": 0.7617777777777778, + "grad_norm": 1.2677000761032104, + "learning_rate": 0.00016967971530249112, + "loss": 2.646, + "step": 1714 + }, + { + "epoch": 0.7622222222222222, + "grad_norm": 1.2618008852005005, + "learning_rate": 0.00016966192170818508, + "loss": 2.3963, + "step": 1715 + }, + { + "epoch": 0.7626666666666667, + "grad_norm": 1.3823282718658447, + "learning_rate": 0.000169644128113879, + "loss": 2.6979, + "step": 1716 + }, + { + "epoch": 0.7631111111111111, + "grad_norm": 1.3976502418518066, + "learning_rate": 0.00016962633451957296, + "loss": 1.525, + "step": 1717 + }, + { + "epoch": 0.7635555555555555, + "grad_norm": 1.343619465827942, + "learning_rate": 0.0001696085409252669, + "loss": 2.9, + "step": 1718 
+ }, + { + "epoch": 0.764, + "grad_norm": 1.1925876140594482, + "learning_rate": 0.00016959074733096085, + "loss": 1.9166, + "step": 1719 + }, + { + "epoch": 0.7644444444444445, + "grad_norm": 1.782758355140686, + "learning_rate": 0.0001695729537366548, + "loss": 2.5998, + "step": 1720 + }, + { + "epoch": 0.7648888888888888, + "grad_norm": 1.5515620708465576, + "learning_rate": 0.00016955516014234876, + "loss": 2.1778, + "step": 1721 + }, + { + "epoch": 0.7653333333333333, + "grad_norm": 1.3295077085494995, + "learning_rate": 0.00016953736654804272, + "loss": 2.0455, + "step": 1722 + }, + { + "epoch": 0.7657777777777778, + "grad_norm": 1.6880308389663696, + "learning_rate": 0.00016951957295373665, + "loss": 2.6615, + "step": 1723 + }, + { + "epoch": 0.7662222222222222, + "grad_norm": 1.8657255172729492, + "learning_rate": 0.0001695017793594306, + "loss": 2.5311, + "step": 1724 + }, + { + "epoch": 0.7666666666666667, + "grad_norm": 1.4304696321487427, + "learning_rate": 0.00016948398576512456, + "loss": 1.9907, + "step": 1725 + }, + { + "epoch": 0.7671111111111111, + "grad_norm": 1.3921934366226196, + "learning_rate": 0.00016946619217081852, + "loss": 2.5942, + "step": 1726 + }, + { + "epoch": 0.7675555555555555, + "grad_norm": 1.1713439226150513, + "learning_rate": 0.00016944839857651247, + "loss": 1.329, + "step": 1727 + }, + { + "epoch": 0.768, + "grad_norm": 1.447230577468872, + "learning_rate": 0.00016943060498220643, + "loss": 2.2461, + "step": 1728 + }, + { + "epoch": 0.7684444444444445, + "grad_norm": 1.5526431798934937, + "learning_rate": 0.00016941281138790036, + "loss": 2.5692, + "step": 1729 + }, + { + "epoch": 0.7688888888888888, + "grad_norm": 1.864875078201294, + "learning_rate": 0.00016939501779359432, + "loss": 2.3305, + "step": 1730 + }, + { + "epoch": 0.7693333333333333, + "grad_norm": 1.5413137674331665, + "learning_rate": 0.00016937722419928825, + "loss": 2.4763, + "step": 1731 + }, + { + "epoch": 0.7697777777777778, + "grad_norm": 1.5602953433990479, + "learning_rate": 0.0001693594306049822, + "loss": 2.3397, + "step": 1732 + }, + { + "epoch": 0.7702222222222223, + "grad_norm": 1.5029823780059814, + "learning_rate": 0.00016934163701067616, + "loss": 2.5413, + "step": 1733 + }, + { + "epoch": 0.7706666666666667, + "grad_norm": 1.9721778631210327, + "learning_rate": 0.00016932384341637012, + "loss": 2.3882, + "step": 1734 + }, + { + "epoch": 0.7711111111111111, + "grad_norm": 1.5762044191360474, + "learning_rate": 0.00016930604982206407, + "loss": 2.7452, + "step": 1735 + }, + { + "epoch": 0.7715555555555556, + "grad_norm": 1.6823639869689941, + "learning_rate": 0.000169288256227758, + "loss": 2.706, + "step": 1736 + }, + { + "epoch": 0.772, + "grad_norm": 2.066340446472168, + "learning_rate": 0.00016927046263345196, + "loss": 2.8768, + "step": 1737 + }, + { + "epoch": 0.7724444444444445, + "grad_norm": 1.7574570178985596, + "learning_rate": 0.00016925266903914591, + "loss": 2.9213, + "step": 1738 + }, + { + "epoch": 0.7728888888888888, + "grad_norm": 1.5815603733062744, + "learning_rate": 0.00016923487544483987, + "loss": 2.7557, + "step": 1739 + }, + { + "epoch": 0.7733333333333333, + "grad_norm": 2.00929594039917, + "learning_rate": 0.00016921708185053383, + "loss": 2.828, + "step": 1740 + }, + { + "epoch": 0.7737777777777778, + "grad_norm": 1.5500198602676392, + "learning_rate": 0.00016919928825622778, + "loss": 2.8645, + "step": 1741 + }, + { + "epoch": 0.7742222222222223, + "grad_norm": 1.5847936868667603, + "learning_rate": 0.0001691814946619217, + "loss": 2.4433, 
+ "step": 1742 + }, + { + "epoch": 0.7746666666666666, + "grad_norm": 1.5422377586364746, + "learning_rate": 0.00016916370106761567, + "loss": 2.7777, + "step": 1743 + }, + { + "epoch": 0.7751111111111111, + "grad_norm": 1.4462459087371826, + "learning_rate": 0.0001691459074733096, + "loss": 2.3321, + "step": 1744 + }, + { + "epoch": 0.7755555555555556, + "grad_norm": 2.3052868843078613, + "learning_rate": 0.00016912811387900356, + "loss": 2.3839, + "step": 1745 + }, + { + "epoch": 0.776, + "grad_norm": 1.589308261871338, + "learning_rate": 0.0001691103202846975, + "loss": 2.2261, + "step": 1746 + }, + { + "epoch": 0.7764444444444445, + "grad_norm": 1.5655573606491089, + "learning_rate": 0.00016909252669039147, + "loss": 2.3206, + "step": 1747 + }, + { + "epoch": 0.7768888888888889, + "grad_norm": 1.4442392587661743, + "learning_rate": 0.00016907473309608542, + "loss": 1.8804, + "step": 1748 + }, + { + "epoch": 0.7773333333333333, + "grad_norm": 2.0022120475769043, + "learning_rate": 0.00016905693950177935, + "loss": 3.0496, + "step": 1749 + }, + { + "epoch": 0.7777777777777778, + "grad_norm": 2.1214585304260254, + "learning_rate": 0.0001690391459074733, + "loss": 3.1286, + "step": 1750 + }, + { + "epoch": 0.7782222222222223, + "grad_norm": 1.1207834482192993, + "learning_rate": 0.00016902135231316727, + "loss": 1.5612, + "step": 1751 + }, + { + "epoch": 0.7786666666666666, + "grad_norm": 1.0428500175476074, + "learning_rate": 0.00016900355871886122, + "loss": 2.3335, + "step": 1752 + }, + { + "epoch": 0.7791111111111111, + "grad_norm": 1.0575370788574219, + "learning_rate": 0.00016898576512455518, + "loss": 1.4655, + "step": 1753 + }, + { + "epoch": 0.7795555555555556, + "grad_norm": 1.2163389921188354, + "learning_rate": 0.00016896797153024914, + "loss": 2.3588, + "step": 1754 + }, + { + "epoch": 0.78, + "grad_norm": 1.481124997138977, + "learning_rate": 0.00016895017793594307, + "loss": 2.8144, + "step": 1755 + }, + { + "epoch": 0.7804444444444445, + "grad_norm": 1.3335241079330444, + "learning_rate": 0.00016893238434163702, + "loss": 3.0533, + "step": 1756 + }, + { + "epoch": 0.7808888888888889, + "grad_norm": 1.2501187324523926, + "learning_rate": 0.00016891459074733095, + "loss": 2.7067, + "step": 1757 + }, + { + "epoch": 0.7813333333333333, + "grad_norm": 1.2316926717758179, + "learning_rate": 0.0001688967971530249, + "loss": 3.0781, + "step": 1758 + }, + { + "epoch": 0.7817777777777778, + "grad_norm": 1.4136369228363037, + "learning_rate": 0.00016887900355871886, + "loss": 2.4057, + "step": 1759 + }, + { + "epoch": 0.7822222222222223, + "grad_norm": 1.2159130573272705, + "learning_rate": 0.00016886120996441282, + "loss": 2.3195, + "step": 1760 + }, + { + "epoch": 0.7826666666666666, + "grad_norm": 1.3189160823822021, + "learning_rate": 0.00016884341637010678, + "loss": 2.6697, + "step": 1761 + }, + { + "epoch": 0.7831111111111111, + "grad_norm": 1.2556674480438232, + "learning_rate": 0.0001688256227758007, + "loss": 2.4658, + "step": 1762 + }, + { + "epoch": 0.7835555555555556, + "grad_norm": 1.4505153894424438, + "learning_rate": 0.00016880782918149466, + "loss": 2.4346, + "step": 1763 + }, + { + "epoch": 0.784, + "grad_norm": 1.3776673078536987, + "learning_rate": 0.00016879003558718862, + "loss": 2.2668, + "step": 1764 + }, + { + "epoch": 0.7844444444444445, + "grad_norm": 1.7608091831207275, + "learning_rate": 0.00016877224199288258, + "loss": 1.6074, + "step": 1765 + }, + { + "epoch": 0.7848888888888889, + "grad_norm": 1.6191914081573486, + "learning_rate": 
0.00016875444839857653, + "loss": 2.3601, + "step": 1766 + }, + { + "epoch": 0.7853333333333333, + "grad_norm": 1.4360511302947998, + "learning_rate": 0.0001687366548042705, + "loss": 2.5914, + "step": 1767 + }, + { + "epoch": 0.7857777777777778, + "grad_norm": 1.3812462091445923, + "learning_rate": 0.00016871886120996442, + "loss": 2.5718, + "step": 1768 + }, + { + "epoch": 0.7862222222222223, + "grad_norm": 1.522873878479004, + "learning_rate": 0.00016870106761565838, + "loss": 2.7388, + "step": 1769 + }, + { + "epoch": 0.7866666666666666, + "grad_norm": 1.7307029962539673, + "learning_rate": 0.0001686832740213523, + "loss": 2.0664, + "step": 1770 + }, + { + "epoch": 0.7871111111111111, + "grad_norm": 1.383586049079895, + "learning_rate": 0.00016866548042704626, + "loss": 2.7238, + "step": 1771 + }, + { + "epoch": 0.7875555555555556, + "grad_norm": 1.6344101428985596, + "learning_rate": 0.00016864768683274022, + "loss": 3.0245, + "step": 1772 + }, + { + "epoch": 0.788, + "grad_norm": 1.4926892518997192, + "learning_rate": 0.00016862989323843417, + "loss": 2.4893, + "step": 1773 + }, + { + "epoch": 0.7884444444444444, + "grad_norm": 1.7372292280197144, + "learning_rate": 0.00016861209964412813, + "loss": 2.9537, + "step": 1774 + }, + { + "epoch": 0.7888888888888889, + "grad_norm": 1.3594554662704468, + "learning_rate": 0.00016859430604982206, + "loss": 1.9789, + "step": 1775 + }, + { + "epoch": 0.7893333333333333, + "grad_norm": 1.3864822387695312, + "learning_rate": 0.00016857651245551602, + "loss": 2.585, + "step": 1776 + }, + { + "epoch": 0.7897777777777778, + "grad_norm": 1.5721726417541504, + "learning_rate": 0.00016855871886120997, + "loss": 2.8356, + "step": 1777 + }, + { + "epoch": 0.7902222222222223, + "grad_norm": 1.3060152530670166, + "learning_rate": 0.00016854092526690393, + "loss": 2.2237, + "step": 1778 + }, + { + "epoch": 0.7906666666666666, + "grad_norm": 1.349345326423645, + "learning_rate": 0.0001685231316725979, + "loss": 2.3712, + "step": 1779 + }, + { + "epoch": 0.7911111111111111, + "grad_norm": 1.4352922439575195, + "learning_rate": 0.00016850533807829184, + "loss": 1.6791, + "step": 1780 + }, + { + "epoch": 0.7915555555555556, + "grad_norm": 1.5430026054382324, + "learning_rate": 0.00016848754448398577, + "loss": 2.7579, + "step": 1781 + }, + { + "epoch": 0.792, + "grad_norm": 1.7820968627929688, + "learning_rate": 0.00016846975088967973, + "loss": 3.1443, + "step": 1782 + }, + { + "epoch": 0.7924444444444444, + "grad_norm": 1.3988341093063354, + "learning_rate": 0.00016845195729537366, + "loss": 1.8977, + "step": 1783 + }, + { + "epoch": 0.7928888888888889, + "grad_norm": 1.705902338027954, + "learning_rate": 0.00016843416370106761, + "loss": 2.795, + "step": 1784 + }, + { + "epoch": 0.7933333333333333, + "grad_norm": 1.6256381273269653, + "learning_rate": 0.00016841637010676157, + "loss": 2.6093, + "step": 1785 + }, + { + "epoch": 0.7937777777777778, + "grad_norm": 1.6664106845855713, + "learning_rate": 0.00016839857651245553, + "loss": 2.5954, + "step": 1786 + }, + { + "epoch": 0.7942222222222223, + "grad_norm": 1.5220837593078613, + "learning_rate": 0.00016838078291814948, + "loss": 2.4056, + "step": 1787 + }, + { + "epoch": 0.7946666666666666, + "grad_norm": 2.4615044593811035, + "learning_rate": 0.00016836298932384341, + "loss": 3.1873, + "step": 1788 + }, + { + "epoch": 0.7951111111111111, + "grad_norm": 1.54017174243927, + "learning_rate": 0.00016834519572953737, + "loss": 2.5285, + "step": 1789 + }, + { + "epoch": 0.7955555555555556, + "grad_norm": 
1.5758004188537598, + "learning_rate": 0.00016832740213523133, + "loss": 2.4812, + "step": 1790 + }, + { + "epoch": 0.796, + "grad_norm": 2.027078866958618, + "learning_rate": 0.00016830960854092528, + "loss": 2.5266, + "step": 1791 + }, + { + "epoch": 0.7964444444444444, + "grad_norm": 1.9708378314971924, + "learning_rate": 0.00016829181494661924, + "loss": 2.7944, + "step": 1792 + }, + { + "epoch": 0.7968888888888889, + "grad_norm": 1.4897676706314087, + "learning_rate": 0.00016827402135231317, + "loss": 2.1396, + "step": 1793 + }, + { + "epoch": 0.7973333333333333, + "grad_norm": 1.5909985303878784, + "learning_rate": 0.00016825622775800713, + "loss": 2.1982, + "step": 1794 + }, + { + "epoch": 0.7977777777777778, + "grad_norm": 1.780980110168457, + "learning_rate": 0.00016823843416370108, + "loss": 2.836, + "step": 1795 + }, + { + "epoch": 0.7982222222222223, + "grad_norm": 1.9581319093704224, + "learning_rate": 0.000168220640569395, + "loss": 3.0858, + "step": 1796 + }, + { + "epoch": 0.7986666666666666, + "grad_norm": 2.094820737838745, + "learning_rate": 0.00016820284697508897, + "loss": 3.2028, + "step": 1797 + }, + { + "epoch": 0.7991111111111111, + "grad_norm": 2.108332872390747, + "learning_rate": 0.00016818505338078292, + "loss": 2.1878, + "step": 1798 + }, + { + "epoch": 0.7995555555555556, + "grad_norm": 1.941762089729309, + "learning_rate": 0.00016816725978647688, + "loss": 2.6336, + "step": 1799 + }, + { + "epoch": 0.8, + "grad_norm": 3.6256611347198486, + "learning_rate": 0.0001681494661921708, + "loss": 1.605, + "step": 1800 + }, + { + "epoch": 0.8004444444444444, + "grad_norm": 0.9606013894081116, + "learning_rate": 0.00016813167259786477, + "loss": 2.5109, + "step": 1801 + }, + { + "epoch": 0.8008888888888889, + "grad_norm": 1.1120461225509644, + "learning_rate": 0.00016811387900355872, + "loss": 3.0039, + "step": 1802 + }, + { + "epoch": 0.8013333333333333, + "grad_norm": 0.9408321976661682, + "learning_rate": 0.00016809608540925268, + "loss": 2.1882, + "step": 1803 + }, + { + "epoch": 0.8017777777777778, + "grad_norm": 1.210302472114563, + "learning_rate": 0.00016807829181494664, + "loss": 2.4686, + "step": 1804 + }, + { + "epoch": 0.8022222222222222, + "grad_norm": 1.1656526327133179, + "learning_rate": 0.0001680604982206406, + "loss": 2.546, + "step": 1805 + }, + { + "epoch": 0.8026666666666666, + "grad_norm": 1.2500494718551636, + "learning_rate": 0.00016804270462633452, + "loss": 2.7287, + "step": 1806 + }, + { + "epoch": 0.8031111111111111, + "grad_norm": 1.1803468465805054, + "learning_rate": 0.00016802491103202848, + "loss": 2.7445, + "step": 1807 + }, + { + "epoch": 0.8035555555555556, + "grad_norm": 1.7277123928070068, + "learning_rate": 0.00016800711743772244, + "loss": 1.3246, + "step": 1808 + }, + { + "epoch": 0.804, + "grad_norm": 1.6080886125564575, + "learning_rate": 0.00016798932384341636, + "loss": 2.639, + "step": 1809 + }, + { + "epoch": 0.8044444444444444, + "grad_norm": 1.1558185815811157, + "learning_rate": 0.00016797153024911032, + "loss": 2.5563, + "step": 1810 + }, + { + "epoch": 0.8048888888888889, + "grad_norm": 1.3278673887252808, + "learning_rate": 0.00016795373665480428, + "loss": 2.1776, + "step": 1811 + }, + { + "epoch": 0.8053333333333333, + "grad_norm": 1.2279390096664429, + "learning_rate": 0.00016793594306049823, + "loss": 2.5615, + "step": 1812 + }, + { + "epoch": 0.8057777777777778, + "grad_norm": 1.3550021648406982, + "learning_rate": 0.00016791814946619216, + "loss": 2.432, + "step": 1813 + }, + { + "epoch": 0.8062222222222222, + 
"grad_norm": 1.2661508321762085, + "learning_rate": 0.00016790035587188612, + "loss": 2.4724, + "step": 1814 + }, + { + "epoch": 0.8066666666666666, + "grad_norm": 1.5162663459777832, + "learning_rate": 0.00016788256227758008, + "loss": 2.7064, + "step": 1815 + }, + { + "epoch": 0.8071111111111111, + "grad_norm": 1.526525616645813, + "learning_rate": 0.00016786476868327403, + "loss": 2.7493, + "step": 1816 + }, + { + "epoch": 0.8075555555555556, + "grad_norm": 1.349353551864624, + "learning_rate": 0.000167846975088968, + "loss": 2.175, + "step": 1817 + }, + { + "epoch": 0.808, + "grad_norm": 1.518172025680542, + "learning_rate": 0.00016782918149466195, + "loss": 2.9974, + "step": 1818 + }, + { + "epoch": 0.8084444444444444, + "grad_norm": 1.3705731630325317, + "learning_rate": 0.00016781138790035588, + "loss": 2.691, + "step": 1819 + }, + { + "epoch": 0.8088888888888889, + "grad_norm": 1.4127305746078491, + "learning_rate": 0.00016779359430604983, + "loss": 2.6794, + "step": 1820 + }, + { + "epoch": 0.8093333333333333, + "grad_norm": 1.3548202514648438, + "learning_rate": 0.0001677758007117438, + "loss": 2.3797, + "step": 1821 + }, + { + "epoch": 0.8097777777777778, + "grad_norm": 2.2642502784729004, + "learning_rate": 0.00016775800711743772, + "loss": 2.1909, + "step": 1822 + }, + { + "epoch": 0.8102222222222222, + "grad_norm": 1.6806228160858154, + "learning_rate": 0.00016774021352313167, + "loss": 2.8928, + "step": 1823 + }, + { + "epoch": 0.8106666666666666, + "grad_norm": 1.296937346458435, + "learning_rate": 0.00016772241992882563, + "loss": 2.0109, + "step": 1824 + }, + { + "epoch": 0.8111111111111111, + "grad_norm": 1.387312650680542, + "learning_rate": 0.0001677046263345196, + "loss": 2.2319, + "step": 1825 + }, + { + "epoch": 0.8115555555555556, + "grad_norm": 1.6626933813095093, + "learning_rate": 0.00016768683274021352, + "loss": 2.6542, + "step": 1826 + }, + { + "epoch": 0.812, + "grad_norm": 1.658470869064331, + "learning_rate": 0.00016766903914590747, + "loss": 2.3784, + "step": 1827 + }, + { + "epoch": 0.8124444444444444, + "grad_norm": 1.5517417192459106, + "learning_rate": 0.00016765124555160143, + "loss": 2.6614, + "step": 1828 + }, + { + "epoch": 0.8128888888888889, + "grad_norm": 1.6006346940994263, + "learning_rate": 0.00016763345195729539, + "loss": 3.0233, + "step": 1829 + }, + { + "epoch": 0.8133333333333334, + "grad_norm": 1.503570795059204, + "learning_rate": 0.00016761565836298934, + "loss": 2.2458, + "step": 1830 + }, + { + "epoch": 0.8137777777777778, + "grad_norm": 1.9097468852996826, + "learning_rate": 0.0001675978647686833, + "loss": 2.6981, + "step": 1831 + }, + { + "epoch": 0.8142222222222222, + "grad_norm": 1.504949927330017, + "learning_rate": 0.00016758007117437723, + "loss": 2.8705, + "step": 1832 + }, + { + "epoch": 0.8146666666666667, + "grad_norm": 1.7333391904830933, + "learning_rate": 0.00016756227758007119, + "loss": 2.3296, + "step": 1833 + }, + { + "epoch": 0.8151111111111111, + "grad_norm": 4.087153434753418, + "learning_rate": 0.00016754448398576511, + "loss": 0.2551, + "step": 1834 + }, + { + "epoch": 0.8155555555555556, + "grad_norm": 1.6669485569000244, + "learning_rate": 0.00016752669039145907, + "loss": 2.2058, + "step": 1835 + }, + { + "epoch": 0.816, + "grad_norm": 1.4292755126953125, + "learning_rate": 0.00016750889679715303, + "loss": 2.2523, + "step": 1836 + }, + { + "epoch": 0.8164444444444444, + "grad_norm": 1.538756251335144, + "learning_rate": 0.00016749110320284698, + "loss": 2.3624, + "step": 1837 + }, + { + "epoch": 
0.8168888888888889, + "grad_norm": 1.5922455787658691, + "learning_rate": 0.00016747330960854094, + "loss": 2.6137, + "step": 1838 + }, + { + "epoch": 0.8173333333333334, + "grad_norm": 1.5305728912353516, + "learning_rate": 0.00016745551601423487, + "loss": 2.1719, + "step": 1839 + }, + { + "epoch": 0.8177777777777778, + "grad_norm": 2.139403820037842, + "learning_rate": 0.00016743772241992883, + "loss": 3.0687, + "step": 1840 + }, + { + "epoch": 0.8182222222222222, + "grad_norm": 1.7138938903808594, + "learning_rate": 0.00016741992882562278, + "loss": 2.684, + "step": 1841 + }, + { + "epoch": 0.8186666666666667, + "grad_norm": 2.1638147830963135, + "learning_rate": 0.00016740213523131674, + "loss": 2.8127, + "step": 1842 + }, + { + "epoch": 0.8191111111111111, + "grad_norm": 2.317457914352417, + "learning_rate": 0.0001673843416370107, + "loss": 2.7219, + "step": 1843 + }, + { + "epoch": 0.8195555555555556, + "grad_norm": 1.7413108348846436, + "learning_rate": 0.00016736654804270465, + "loss": 2.6101, + "step": 1844 + }, + { + "epoch": 0.82, + "grad_norm": 2.0976204872131348, + "learning_rate": 0.00016734875444839858, + "loss": 2.5319, + "step": 1845 + }, + { + "epoch": 0.8204444444444444, + "grad_norm": 2.517296075820923, + "learning_rate": 0.00016733096085409254, + "loss": 3.3618, + "step": 1846 + }, + { + "epoch": 0.8208888888888889, + "grad_norm": 2.0704150199890137, + "learning_rate": 0.00016731316725978647, + "loss": 2.4757, + "step": 1847 + }, + { + "epoch": 0.8213333333333334, + "grad_norm": 2.702256917953491, + "learning_rate": 0.00016729537366548042, + "loss": 0.1476, + "step": 1848 + }, + { + "epoch": 0.8217777777777778, + "grad_norm": 1.8367481231689453, + "learning_rate": 0.00016727758007117438, + "loss": 1.8312, + "step": 1849 + }, + { + "epoch": 0.8222222222222222, + "grad_norm": 1.943381428718567, + "learning_rate": 0.00016725978647686834, + "loss": 0.9415, + "step": 1850 + }, + { + "epoch": 0.8226666666666667, + "grad_norm": 1.0285029411315918, + "learning_rate": 0.0001672419928825623, + "loss": 2.5791, + "step": 1851 + }, + { + "epoch": 0.8231111111111111, + "grad_norm": 1.0174617767333984, + "learning_rate": 0.00016722419928825622, + "loss": 2.4917, + "step": 1852 + }, + { + "epoch": 0.8235555555555556, + "grad_norm": 1.1286165714263916, + "learning_rate": 0.00016720640569395018, + "loss": 2.7286, + "step": 1853 + }, + { + "epoch": 0.824, + "grad_norm": 1.2801088094711304, + "learning_rate": 0.00016718861209964414, + "loss": 2.2644, + "step": 1854 + }, + { + "epoch": 0.8244444444444444, + "grad_norm": 1.3122279644012451, + "learning_rate": 0.0001671708185053381, + "loss": 1.8628, + "step": 1855 + }, + { + "epoch": 0.8248888888888889, + "grad_norm": 1.3034883737564087, + "learning_rate": 0.00016715302491103205, + "loss": 2.8352, + "step": 1856 + }, + { + "epoch": 0.8253333333333334, + "grad_norm": 1.3606019020080566, + "learning_rate": 0.000167135231316726, + "loss": 3.1646, + "step": 1857 + }, + { + "epoch": 0.8257777777777778, + "grad_norm": 1.503804087638855, + "learning_rate": 0.00016711743772241993, + "loss": 2.3705, + "step": 1858 + }, + { + "epoch": 0.8262222222222222, + "grad_norm": 1.2869994640350342, + "learning_rate": 0.0001670996441281139, + "loss": 2.587, + "step": 1859 + }, + { + "epoch": 0.8266666666666667, + "grad_norm": 1.376165747642517, + "learning_rate": 0.00016708185053380782, + "loss": 2.2356, + "step": 1860 + }, + { + "epoch": 0.8271111111111111, + "grad_norm": 1.1255593299865723, + "learning_rate": 0.00016706405693950178, + "loss": 2.1244, + 
"step": 1861 + }, + { + "epoch": 0.8275555555555556, + "grad_norm": 1.390615701675415, + "learning_rate": 0.00016704626334519573, + "loss": 2.6707, + "step": 1862 + }, + { + "epoch": 0.828, + "grad_norm": 1.195713996887207, + "learning_rate": 0.0001670284697508897, + "loss": 2.1111, + "step": 1863 + }, + { + "epoch": 0.8284444444444444, + "grad_norm": 1.275231957435608, + "learning_rate": 0.00016701067615658365, + "loss": 2.8402, + "step": 1864 + }, + { + "epoch": 0.8288888888888889, + "grad_norm": 1.294203758239746, + "learning_rate": 0.00016699288256227758, + "loss": 2.9545, + "step": 1865 + }, + { + "epoch": 0.8293333333333334, + "grad_norm": 1.4004563093185425, + "learning_rate": 0.00016697508896797153, + "loss": 2.9775, + "step": 1866 + }, + { + "epoch": 0.8297777777777777, + "grad_norm": 1.3818445205688477, + "learning_rate": 0.0001669572953736655, + "loss": 2.4705, + "step": 1867 + }, + { + "epoch": 0.8302222222222222, + "grad_norm": 1.381339192390442, + "learning_rate": 0.00016693950177935945, + "loss": 2.0993, + "step": 1868 + }, + { + "epoch": 0.8306666666666667, + "grad_norm": 1.2843163013458252, + "learning_rate": 0.0001669217081850534, + "loss": 2.7269, + "step": 1869 + }, + { + "epoch": 0.8311111111111111, + "grad_norm": 1.5089333057403564, + "learning_rate": 0.00016690391459074736, + "loss": 2.442, + "step": 1870 + }, + { + "epoch": 0.8315555555555556, + "grad_norm": 1.5830549001693726, + "learning_rate": 0.0001668861209964413, + "loss": 2.5014, + "step": 1871 + }, + { + "epoch": 0.832, + "grad_norm": 1.250998616218567, + "learning_rate": 0.00016686832740213524, + "loss": 2.3065, + "step": 1872 + }, + { + "epoch": 0.8324444444444444, + "grad_norm": 1.5048855543136597, + "learning_rate": 0.00016685053380782917, + "loss": 2.3963, + "step": 1873 + }, + { + "epoch": 0.8328888888888889, + "grad_norm": 1.3290457725524902, + "learning_rate": 0.00016683274021352313, + "loss": 2.3719, + "step": 1874 + }, + { + "epoch": 0.8333333333333334, + "grad_norm": 1.4442025423049927, + "learning_rate": 0.0001668149466192171, + "loss": 2.5401, + "step": 1875 + }, + { + "epoch": 0.8337777777777777, + "grad_norm": 1.5543490648269653, + "learning_rate": 0.00016679715302491104, + "loss": 2.6803, + "step": 1876 + }, + { + "epoch": 0.8342222222222222, + "grad_norm": 1.3237760066986084, + "learning_rate": 0.000166779359430605, + "loss": 2.4261, + "step": 1877 + }, + { + "epoch": 0.8346666666666667, + "grad_norm": 1.4572439193725586, + "learning_rate": 0.00016676156583629893, + "loss": 2.0907, + "step": 1878 + }, + { + "epoch": 0.8351111111111111, + "grad_norm": 1.3316566944122314, + "learning_rate": 0.00016674377224199289, + "loss": 1.099, + "step": 1879 + }, + { + "epoch": 0.8355555555555556, + "grad_norm": 1.640336513519287, + "learning_rate": 0.00016672597864768684, + "loss": 3.0017, + "step": 1880 + }, + { + "epoch": 0.836, + "grad_norm": 1.6514651775360107, + "learning_rate": 0.0001667081850533808, + "loss": 2.6943, + "step": 1881 + }, + { + "epoch": 0.8364444444444444, + "grad_norm": 1.7922955751419067, + "learning_rate": 0.00016669039145907476, + "loss": 3.0074, + "step": 1882 + }, + { + "epoch": 0.8368888888888889, + "grad_norm": 1.293692708015442, + "learning_rate": 0.00016667259786476868, + "loss": 2.0004, + "step": 1883 + }, + { + "epoch": 0.8373333333333334, + "grad_norm": 1.6649595499038696, + "learning_rate": 0.00016665480427046264, + "loss": 2.8518, + "step": 1884 + }, + { + "epoch": 0.8377777777777777, + "grad_norm": 1.55930757522583, + "learning_rate": 0.0001666370106761566, + "loss": 
2.5283, + "step": 1885 + }, + { + "epoch": 0.8382222222222222, + "grad_norm": 1.808152198791504, + "learning_rate": 0.00016661921708185053, + "loss": 3.0004, + "step": 1886 + }, + { + "epoch": 0.8386666666666667, + "grad_norm": 2.0772571563720703, + "learning_rate": 0.00016660142348754448, + "loss": 2.2274, + "step": 1887 + }, + { + "epoch": 0.8391111111111111, + "grad_norm": 1.5994224548339844, + "learning_rate": 0.00016658362989323844, + "loss": 2.212, + "step": 1888 + }, + { + "epoch": 0.8395555555555556, + "grad_norm": 1.7321178913116455, + "learning_rate": 0.0001665658362989324, + "loss": 2.4917, + "step": 1889 + }, + { + "epoch": 0.84, + "grad_norm": 1.8322235345840454, + "learning_rate": 0.00016654804270462633, + "loss": 3.1259, + "step": 1890 + }, + { + "epoch": 0.8404444444444444, + "grad_norm": 1.6857374906539917, + "learning_rate": 0.00016653024911032028, + "loss": 2.81, + "step": 1891 + }, + { + "epoch": 0.8408888888888889, + "grad_norm": 1.6723679304122925, + "learning_rate": 0.00016651245551601424, + "loss": 2.4271, + "step": 1892 + }, + { + "epoch": 0.8413333333333334, + "grad_norm": 1.2392948865890503, + "learning_rate": 0.0001664946619217082, + "loss": 1.6478, + "step": 1893 + }, + { + "epoch": 0.8417777777777777, + "grad_norm": 1.6006635427474976, + "learning_rate": 0.00016647686832740215, + "loss": 2.7513, + "step": 1894 + }, + { + "epoch": 0.8422222222222222, + "grad_norm": 1.623317003250122, + "learning_rate": 0.0001664590747330961, + "loss": 2.3341, + "step": 1895 + }, + { + "epoch": 0.8426666666666667, + "grad_norm": 1.8542609214782715, + "learning_rate": 0.00016644128113879004, + "loss": 2.7884, + "step": 1896 + }, + { + "epoch": 0.8431111111111111, + "grad_norm": 1.8695321083068848, + "learning_rate": 0.000166423487544484, + "loss": 2.3178, + "step": 1897 + }, + { + "epoch": 0.8435555555555555, + "grad_norm": 1.759669303894043, + "learning_rate": 0.00016640569395017795, + "loss": 2.8503, + "step": 1898 + }, + { + "epoch": 0.844, + "grad_norm": 2.0336124897003174, + "learning_rate": 0.00016638790035587188, + "loss": 2.4729, + "step": 1899 + }, + { + "epoch": 0.8444444444444444, + "grad_norm": 2.0583572387695312, + "learning_rate": 0.00016637010676156584, + "loss": 2.7253, + "step": 1900 + }, + { + "epoch": 0.8448888888888889, + "grad_norm": 0.9357160329818726, + "learning_rate": 0.0001663523131672598, + "loss": 1.1275, + "step": 1901 + }, + { + "epoch": 0.8453333333333334, + "grad_norm": 1.1076958179473877, + "learning_rate": 0.00016633451957295375, + "loss": 2.4995, + "step": 1902 + }, + { + "epoch": 0.8457777777777777, + "grad_norm": 1.0276774168014526, + "learning_rate": 0.00016631672597864768, + "loss": 2.427, + "step": 1903 + }, + { + "epoch": 0.8462222222222222, + "grad_norm": 1.1579983234405518, + "learning_rate": 0.00016629893238434164, + "loss": 2.2964, + "step": 1904 + }, + { + "epoch": 0.8466666666666667, + "grad_norm": 1.100016474723816, + "learning_rate": 0.0001662811387900356, + "loss": 2.4451, + "step": 1905 + }, + { + "epoch": 0.8471111111111111, + "grad_norm": 1.2550407648086548, + "learning_rate": 0.00016626334519572955, + "loss": 2.985, + "step": 1906 + }, + { + "epoch": 0.8475555555555555, + "grad_norm": 1.1398979425430298, + "learning_rate": 0.0001662455516014235, + "loss": 2.3818, + "step": 1907 + }, + { + "epoch": 0.848, + "grad_norm": 1.3666727542877197, + "learning_rate": 0.00016622775800711746, + "loss": 2.3968, + "step": 1908 + }, + { + "epoch": 0.8484444444444444, + "grad_norm": 1.4796351194381714, + "learning_rate": 
0.0001662099644128114, + "loss": 1.2558, + "step": 1909 + }, + { + "epoch": 0.8488888888888889, + "grad_norm": 1.3779754638671875, + "learning_rate": 0.00016619217081850535, + "loss": 3.1081, + "step": 1910 + }, + { + "epoch": 0.8493333333333334, + "grad_norm": 1.4465447664260864, + "learning_rate": 0.0001661743772241993, + "loss": 2.4524, + "step": 1911 + }, + { + "epoch": 0.8497777777777777, + "grad_norm": 1.3205504417419434, + "learning_rate": 0.00016615658362989323, + "loss": 2.7317, + "step": 1912 + }, + { + "epoch": 0.8502222222222222, + "grad_norm": 1.286799669265747, + "learning_rate": 0.0001661387900355872, + "loss": 2.2615, + "step": 1913 + }, + { + "epoch": 0.8506666666666667, + "grad_norm": 1.2320808172225952, + "learning_rate": 0.00016612099644128115, + "loss": 2.5005, + "step": 1914 + }, + { + "epoch": 0.8511111111111112, + "grad_norm": 1.5686407089233398, + "learning_rate": 0.0001661032028469751, + "loss": 2.3068, + "step": 1915 + }, + { + "epoch": 0.8515555555555555, + "grad_norm": 2.0578267574310303, + "learning_rate": 0.00016608540925266903, + "loss": 1.2472, + "step": 1916 + }, + { + "epoch": 0.852, + "grad_norm": 1.3643602132797241, + "learning_rate": 0.000166067615658363, + "loss": 2.6433, + "step": 1917 + }, + { + "epoch": 0.8524444444444444, + "grad_norm": 1.3753222227096558, + "learning_rate": 0.00016604982206405695, + "loss": 2.4551, + "step": 1918 + }, + { + "epoch": 0.8528888888888889, + "grad_norm": 1.2894665002822876, + "learning_rate": 0.0001660320284697509, + "loss": 1.7715, + "step": 1919 + }, + { + "epoch": 0.8533333333333334, + "grad_norm": 1.381399393081665, + "learning_rate": 0.00016601423487544486, + "loss": 2.5613, + "step": 1920 + }, + { + "epoch": 0.8537777777777777, + "grad_norm": 1.5483062267303467, + "learning_rate": 0.00016599644128113881, + "loss": 2.9712, + "step": 1921 + }, + { + "epoch": 0.8542222222222222, + "grad_norm": 1.481539011001587, + "learning_rate": 0.00016597864768683274, + "loss": 2.4221, + "step": 1922 + }, + { + "epoch": 0.8546666666666667, + "grad_norm": 1.316893458366394, + "learning_rate": 0.0001659608540925267, + "loss": 2.0611, + "step": 1923 + }, + { + "epoch": 0.8551111111111112, + "grad_norm": 1.398386836051941, + "learning_rate": 0.00016594306049822066, + "loss": 2.1331, + "step": 1924 + }, + { + "epoch": 0.8555555555555555, + "grad_norm": 1.521558165550232, + "learning_rate": 0.0001659252669039146, + "loss": 2.6778, + "step": 1925 + }, + { + "epoch": 0.856, + "grad_norm": 1.4601832628250122, + "learning_rate": 0.00016590747330960854, + "loss": 2.2369, + "step": 1926 + }, + { + "epoch": 0.8564444444444445, + "grad_norm": 1.6191009283065796, + "learning_rate": 0.0001658896797153025, + "loss": 2.6541, + "step": 1927 + }, + { + "epoch": 0.8568888888888889, + "grad_norm": 1.5223315954208374, + "learning_rate": 0.00016587188612099646, + "loss": 2.3937, + "step": 1928 + }, + { + "epoch": 0.8573333333333333, + "grad_norm": 1.2509064674377441, + "learning_rate": 0.00016585409252669039, + "loss": 1.227, + "step": 1929 + }, + { + "epoch": 0.8577777777777778, + "grad_norm": 1.5808355808258057, + "learning_rate": 0.00016583629893238434, + "loss": 2.3527, + "step": 1930 + }, + { + "epoch": 0.8582222222222222, + "grad_norm": 1.5534776449203491, + "learning_rate": 0.0001658185053380783, + "loss": 2.8214, + "step": 1931 + }, + { + "epoch": 0.8586666666666667, + "grad_norm": 1.675430178642273, + "learning_rate": 0.00016580071174377225, + "loss": 2.6934, + "step": 1932 + }, + { + "epoch": 0.8591111111111112, + "grad_norm": 
1.638614296913147, + "learning_rate": 0.0001657829181494662, + "loss": 2.6275, + "step": 1933 + }, + { + "epoch": 0.8595555555555555, + "grad_norm": 1.4431530237197876, + "learning_rate": 0.00016576512455516017, + "loss": 2.4221, + "step": 1934 + }, + { + "epoch": 0.86, + "grad_norm": 1.532965898513794, + "learning_rate": 0.0001657473309608541, + "loss": 2.1414, + "step": 1935 + }, + { + "epoch": 0.8604444444444445, + "grad_norm": 1.8967840671539307, + "learning_rate": 0.00016572953736654805, + "loss": 2.2568, + "step": 1936 + }, + { + "epoch": 0.8608888888888889, + "grad_norm": 1.6051034927368164, + "learning_rate": 0.000165711743772242, + "loss": 2.6386, + "step": 1937 + }, + { + "epoch": 0.8613333333333333, + "grad_norm": 1.7966537475585938, + "learning_rate": 0.00016569395017793594, + "loss": 2.1053, + "step": 1938 + }, + { + "epoch": 0.8617777777777778, + "grad_norm": 1.6629730463027954, + "learning_rate": 0.0001656761565836299, + "loss": 2.7147, + "step": 1939 + }, + { + "epoch": 0.8622222222222222, + "grad_norm": 1.5632649660110474, + "learning_rate": 0.00016565836298932385, + "loss": 2.3997, + "step": 1940 + }, + { + "epoch": 0.8626666666666667, + "grad_norm": 1.827580451965332, + "learning_rate": 0.0001656405693950178, + "loss": 2.118, + "step": 1941 + }, + { + "epoch": 0.8631111111111112, + "grad_norm": 1.7074384689331055, + "learning_rate": 0.00016562277580071174, + "loss": 2.1276, + "step": 1942 + }, + { + "epoch": 0.8635555555555555, + "grad_norm": 1.6580160856246948, + "learning_rate": 0.0001656049822064057, + "loss": 2.0692, + "step": 1943 + }, + { + "epoch": 0.864, + "grad_norm": 2.2901294231414795, + "learning_rate": 0.00016558718861209965, + "loss": 2.8533, + "step": 1944 + }, + { + "epoch": 0.8644444444444445, + "grad_norm": 1.9296009540557861, + "learning_rate": 0.0001655693950177936, + "loss": 2.5001, + "step": 1945 + }, + { + "epoch": 0.8648888888888889, + "grad_norm": 1.7774969339370728, + "learning_rate": 0.00016555160142348756, + "loss": 3.0877, + "step": 1946 + }, + { + "epoch": 0.8653333333333333, + "grad_norm": 1.7558300495147705, + "learning_rate": 0.00016553380782918152, + "loss": 2.1556, + "step": 1947 + }, + { + "epoch": 0.8657777777777778, + "grad_norm": 1.870627760887146, + "learning_rate": 0.00016551601423487545, + "loss": 1.3417, + "step": 1948 + }, + { + "epoch": 0.8662222222222222, + "grad_norm": 1.396712064743042, + "learning_rate": 0.0001654982206405694, + "loss": 1.0483, + "step": 1949 + }, + { + "epoch": 0.8666666666666667, + "grad_norm": 1.8367741107940674, + "learning_rate": 0.00016548042704626334, + "loss": 2.2391, + "step": 1950 + }, + { + "epoch": 0.8671111111111112, + "grad_norm": 1.0035446882247925, + "learning_rate": 0.0001654626334519573, + "loss": 2.7096, + "step": 1951 + }, + { + "epoch": 0.8675555555555555, + "grad_norm": 1.2445075511932373, + "learning_rate": 0.00016544483985765125, + "loss": 2.5628, + "step": 1952 + }, + { + "epoch": 0.868, + "grad_norm": 1.1393455266952515, + "learning_rate": 0.0001654270462633452, + "loss": 2.8044, + "step": 1953 + }, + { + "epoch": 0.8684444444444445, + "grad_norm": 1.042448878288269, + "learning_rate": 0.00016540925266903916, + "loss": 2.183, + "step": 1954 + }, + { + "epoch": 0.8688888888888889, + "grad_norm": 1.0702394247055054, + "learning_rate": 0.0001653914590747331, + "loss": 2.544, + "step": 1955 + }, + { + "epoch": 0.8693333333333333, + "grad_norm": 1.1012481451034546, + "learning_rate": 0.00016537366548042705, + "loss": 2.4327, + "step": 1956 + }, + { + "epoch": 0.8697777777777778, + 
"grad_norm": 1.0068960189819336, + "learning_rate": 0.000165355871886121, + "loss": 2.1235, + "step": 1957 + }, + { + "epoch": 0.8702222222222222, + "grad_norm": 1.271165370941162, + "learning_rate": 0.00016533807829181496, + "loss": 2.3766, + "step": 1958 + }, + { + "epoch": 0.8706666666666667, + "grad_norm": 1.2693040370941162, + "learning_rate": 0.00016532028469750892, + "loss": 2.355, + "step": 1959 + }, + { + "epoch": 0.8711111111111111, + "grad_norm": 1.28933846950531, + "learning_rate": 0.00016530249110320287, + "loss": 2.1452, + "step": 1960 + }, + { + "epoch": 0.8715555555555555, + "grad_norm": 1.3570892810821533, + "learning_rate": 0.0001652846975088968, + "loss": 2.8351, + "step": 1961 + }, + { + "epoch": 0.872, + "grad_norm": 1.373550534248352, + "learning_rate": 0.00016526690391459076, + "loss": 2.1488, + "step": 1962 + }, + { + "epoch": 0.8724444444444445, + "grad_norm": 1.1201701164245605, + "learning_rate": 0.0001652491103202847, + "loss": 2.1829, + "step": 1963 + }, + { + "epoch": 0.8728888888888889, + "grad_norm": 1.28224778175354, + "learning_rate": 0.00016523131672597865, + "loss": 2.6936, + "step": 1964 + }, + { + "epoch": 0.8733333333333333, + "grad_norm": 1.540144443511963, + "learning_rate": 0.0001652135231316726, + "loss": 2.5752, + "step": 1965 + }, + { + "epoch": 0.8737777777777778, + "grad_norm": 1.3699182271957397, + "learning_rate": 0.00016519572953736656, + "loss": 2.5122, + "step": 1966 + }, + { + "epoch": 0.8742222222222222, + "grad_norm": 1.5557972192764282, + "learning_rate": 0.00016517793594306052, + "loss": 2.2133, + "step": 1967 + }, + { + "epoch": 0.8746666666666667, + "grad_norm": 1.4491480588912964, + "learning_rate": 0.00016516014234875444, + "loss": 2.4586, + "step": 1968 + }, + { + "epoch": 0.8751111111111111, + "grad_norm": 1.3533332347869873, + "learning_rate": 0.0001651423487544484, + "loss": 2.1343, + "step": 1969 + }, + { + "epoch": 0.8755555555555555, + "grad_norm": 1.8715101480484009, + "learning_rate": 0.00016512455516014236, + "loss": 2.5727, + "step": 1970 + }, + { + "epoch": 0.876, + "grad_norm": 1.507683515548706, + "learning_rate": 0.00016510676156583631, + "loss": 1.8529, + "step": 1971 + }, + { + "epoch": 0.8764444444444445, + "grad_norm": 1.381305456161499, + "learning_rate": 0.00016508896797153027, + "loss": 2.2016, + "step": 1972 + }, + { + "epoch": 0.8768888888888889, + "grad_norm": 1.4611423015594482, + "learning_rate": 0.0001650711743772242, + "loss": 2.4748, + "step": 1973 + }, + { + "epoch": 0.8773333333333333, + "grad_norm": 1.5445431470870972, + "learning_rate": 0.00016505338078291816, + "loss": 2.5277, + "step": 1974 + }, + { + "epoch": 0.8777777777777778, + "grad_norm": 1.4540585279464722, + "learning_rate": 0.0001650355871886121, + "loss": 2.4206, + "step": 1975 + }, + { + "epoch": 0.8782222222222222, + "grad_norm": 1.4331034421920776, + "learning_rate": 0.00016501779359430604, + "loss": 2.6362, + "step": 1976 + }, + { + "epoch": 0.8786666666666667, + "grad_norm": 1.668470859527588, + "learning_rate": 0.000165, + "loss": 2.5848, + "step": 1977 + }, + { + "epoch": 0.8791111111111111, + "grad_norm": 1.2265642881393433, + "learning_rate": 0.00016498220640569396, + "loss": 2.0659, + "step": 1978 + }, + { + "epoch": 0.8795555555555555, + "grad_norm": 1.6252071857452393, + "learning_rate": 0.0001649644128113879, + "loss": 2.3045, + "step": 1979 + }, + { + "epoch": 0.88, + "grad_norm": 1.3835192918777466, + "learning_rate": 0.00016494661921708184, + "loss": 2.3903, + "step": 1980 + }, + { + "epoch": 0.8804444444444445, + 
"grad_norm": 1.4605368375778198, + "learning_rate": 0.0001649288256227758, + "loss": 2.7493, + "step": 1981 + }, + { + "epoch": 0.8808888888888889, + "grad_norm": 1.863997459411621, + "learning_rate": 0.00016491103202846975, + "loss": 2.3149, + "step": 1982 + }, + { + "epoch": 0.8813333333333333, + "grad_norm": 1.7310175895690918, + "learning_rate": 0.0001648932384341637, + "loss": 2.7611, + "step": 1983 + }, + { + "epoch": 0.8817777777777778, + "grad_norm": 1.4304709434509277, + "learning_rate": 0.00016487544483985767, + "loss": 1.2887, + "step": 1984 + }, + { + "epoch": 0.8822222222222222, + "grad_norm": 1.2733263969421387, + "learning_rate": 0.00016485765124555162, + "loss": 2.0624, + "step": 1985 + }, + { + "epoch": 0.8826666666666667, + "grad_norm": 1.5163559913635254, + "learning_rate": 0.00016483985765124555, + "loss": 2.7885, + "step": 1986 + }, + { + "epoch": 0.8831111111111111, + "grad_norm": 1.3988975286483765, + "learning_rate": 0.0001648220640569395, + "loss": 2.1499, + "step": 1987 + }, + { + "epoch": 0.8835555555555555, + "grad_norm": 1.324731469154358, + "learning_rate": 0.00016480427046263347, + "loss": 2.1789, + "step": 1988 + }, + { + "epoch": 0.884, + "grad_norm": 1.5569076538085938, + "learning_rate": 0.0001647864768683274, + "loss": 2.4129, + "step": 1989 + }, + { + "epoch": 0.8844444444444445, + "grad_norm": 2.2256200313568115, + "learning_rate": 0.00016476868327402135, + "loss": 2.9758, + "step": 1990 + }, + { + "epoch": 0.8848888888888888, + "grad_norm": 1.666374683380127, + "learning_rate": 0.0001647508896797153, + "loss": 2.7925, + "step": 1991 + }, + { + "epoch": 0.8853333333333333, + "grad_norm": 1.6541733741760254, + "learning_rate": 0.00016473309608540927, + "loss": 2.4019, + "step": 1992 + }, + { + "epoch": 0.8857777777777778, + "grad_norm": 1.5037000179290771, + "learning_rate": 0.0001647153024911032, + "loss": 2.6672, + "step": 1993 + }, + { + "epoch": 0.8862222222222222, + "grad_norm": 1.7169650793075562, + "learning_rate": 0.00016469750889679715, + "loss": 2.7251, + "step": 1994 + }, + { + "epoch": 0.8866666666666667, + "grad_norm": 1.9718469381332397, + "learning_rate": 0.0001646797153024911, + "loss": 2.1821, + "step": 1995 + }, + { + "epoch": 0.8871111111111111, + "grad_norm": 1.751865029335022, + "learning_rate": 0.00016466192170818506, + "loss": 2.6323, + "step": 1996 + }, + { + "epoch": 0.8875555555555555, + "grad_norm": 1.602544903755188, + "learning_rate": 0.00016464412811387902, + "loss": 2.6532, + "step": 1997 + }, + { + "epoch": 0.888, + "grad_norm": 1.865159273147583, + "learning_rate": 0.00016462633451957298, + "loss": 3.023, + "step": 1998 + }, + { + "epoch": 0.8884444444444445, + "grad_norm": 1.7071006298065186, + "learning_rate": 0.0001646085409252669, + "loss": 2.6798, + "step": 1999 + }, + { + "epoch": 0.8888888888888888, + "grad_norm": 2.0052783489227295, + "learning_rate": 0.00016459074733096086, + "loss": 2.7293, + "step": 2000 + }, + { + "epoch": 0.8893333333333333, + "grad_norm": 1.0259556770324707, + "learning_rate": 0.00016457295373665482, + "loss": 2.552, + "step": 2001 + }, + { + "epoch": 0.8897777777777778, + "grad_norm": 1.1333271265029907, + "learning_rate": 0.00016455516014234875, + "loss": 2.8014, + "step": 2002 + }, + { + "epoch": 0.8902222222222222, + "grad_norm": 0.973343551158905, + "learning_rate": 0.0001645373665480427, + "loss": 2.5627, + "step": 2003 + }, + { + "epoch": 0.8906666666666667, + "grad_norm": 0.9888269901275635, + "learning_rate": 0.00016451957295373666, + "loss": 2.2681, + "step": 2004 + }, + { + 
"epoch": 0.8911111111111111, + "grad_norm": 1.2307792901992798, + "learning_rate": 0.00016450177935943062, + "loss": 2.1229, + "step": 2005 + }, + { + "epoch": 0.8915555555555555, + "grad_norm": 1.0843390226364136, + "learning_rate": 0.00016448398576512455, + "loss": 2.3058, + "step": 2006 + }, + { + "epoch": 0.892, + "grad_norm": 1.3246184587478638, + "learning_rate": 0.0001644661921708185, + "loss": 2.5498, + "step": 2007 + }, + { + "epoch": 0.8924444444444445, + "grad_norm": 1.156726360321045, + "learning_rate": 0.00016444839857651246, + "loss": 2.3375, + "step": 2008 + }, + { + "epoch": 0.8928888888888888, + "grad_norm": 1.061392903327942, + "learning_rate": 0.00016443060498220642, + "loss": 1.2548, + "step": 2009 + }, + { + "epoch": 0.8933333333333333, + "grad_norm": 1.357146978378296, + "learning_rate": 0.00016441281138790037, + "loss": 2.8068, + "step": 2010 + }, + { + "epoch": 0.8937777777777778, + "grad_norm": 1.3615721464157104, + "learning_rate": 0.00016439501779359433, + "loss": 2.4011, + "step": 2011 + }, + { + "epoch": 0.8942222222222223, + "grad_norm": 1.3508340120315552, + "learning_rate": 0.00016437722419928826, + "loss": 2.5085, + "step": 2012 + }, + { + "epoch": 0.8946666666666667, + "grad_norm": 1.4771041870117188, + "learning_rate": 0.00016435943060498222, + "loss": 1.8489, + "step": 2013 + }, + { + "epoch": 0.8951111111111111, + "grad_norm": 1.2312934398651123, + "learning_rate": 0.00016434163701067617, + "loss": 1.9961, + "step": 2014 + }, + { + "epoch": 0.8955555555555555, + "grad_norm": 1.308119535446167, + "learning_rate": 0.0001643238434163701, + "loss": 2.6029, + "step": 2015 + }, + { + "epoch": 0.896, + "grad_norm": 1.3931859731674194, + "learning_rate": 0.00016430604982206406, + "loss": 2.5537, + "step": 2016 + }, + { + "epoch": 0.8964444444444445, + "grad_norm": 1.5460842847824097, + "learning_rate": 0.00016428825622775802, + "loss": 2.5682, + "step": 2017 + }, + { + "epoch": 0.8968888888888888, + "grad_norm": 1.4253586530685425, + "learning_rate": 0.00016427046263345197, + "loss": 2.6531, + "step": 2018 + }, + { + "epoch": 0.8973333333333333, + "grad_norm": 1.2461731433868408, + "learning_rate": 0.0001642526690391459, + "loss": 1.9027, + "step": 2019 + }, + { + "epoch": 0.8977777777777778, + "grad_norm": 1.418392539024353, + "learning_rate": 0.00016423487544483986, + "loss": 2.5344, + "step": 2020 + }, + { + "epoch": 0.8982222222222223, + "grad_norm": 1.5666571855545044, + "learning_rate": 0.00016421708185053381, + "loss": 2.4009, + "step": 2021 + }, + { + "epoch": 0.8986666666666666, + "grad_norm": 1.2651710510253906, + "learning_rate": 0.00016419928825622777, + "loss": 2.1727, + "step": 2022 + }, + { + "epoch": 0.8991111111111111, + "grad_norm": 1.4496339559555054, + "learning_rate": 0.00016418149466192173, + "loss": 2.3306, + "step": 2023 + }, + { + "epoch": 0.8995555555555556, + "grad_norm": 1.3731813430786133, + "learning_rate": 0.00016416370106761568, + "loss": 2.0778, + "step": 2024 + }, + { + "epoch": 0.9, + "grad_norm": 1.383135199546814, + "learning_rate": 0.0001641459074733096, + "loss": 2.7001, + "step": 2025 + }, + { + "epoch": 0.9004444444444445, + "grad_norm": 1.2729257345199585, + "learning_rate": 0.00016412811387900357, + "loss": 2.1518, + "step": 2026 + }, + { + "epoch": 0.9008888888888889, + "grad_norm": 1.5172004699707031, + "learning_rate": 0.00016411032028469753, + "loss": 2.4128, + "step": 2027 + }, + { + "epoch": 0.9013333333333333, + "grad_norm": 1.3917080163955688, + "learning_rate": 0.00016409252669039146, + "loss": 1.2132, + 
"step": 2028 + }, + { + "epoch": 0.9017777777777778, + "grad_norm": 1.359440565109253, + "learning_rate": 0.0001640747330960854, + "loss": 2.6389, + "step": 2029 + }, + { + "epoch": 0.9022222222222223, + "grad_norm": 1.6984691619873047, + "learning_rate": 0.00016405693950177937, + "loss": 2.5598, + "step": 2030 + }, + { + "epoch": 0.9026666666666666, + "grad_norm": 1.3481029272079468, + "learning_rate": 0.00016403914590747332, + "loss": 2.2033, + "step": 2031 + }, + { + "epoch": 0.9031111111111111, + "grad_norm": 1.4901320934295654, + "learning_rate": 0.00016402135231316725, + "loss": 2.227, + "step": 2032 + }, + { + "epoch": 0.9035555555555556, + "grad_norm": 1.4756929874420166, + "learning_rate": 0.0001640035587188612, + "loss": 2.3098, + "step": 2033 + }, + { + "epoch": 0.904, + "grad_norm": 2.4183623790740967, + "learning_rate": 0.00016398576512455517, + "loss": 2.2787, + "step": 2034 + }, + { + "epoch": 0.9044444444444445, + "grad_norm": 1.7991214990615845, + "learning_rate": 0.00016396797153024912, + "loss": 2.3717, + "step": 2035 + }, + { + "epoch": 0.9048888888888889, + "grad_norm": 1.7338757514953613, + "learning_rate": 0.00016395017793594308, + "loss": 2.9113, + "step": 2036 + }, + { + "epoch": 0.9053333333333333, + "grad_norm": 1.5208733081817627, + "learning_rate": 0.00016393238434163704, + "loss": 2.0556, + "step": 2037 + }, + { + "epoch": 0.9057777777777778, + "grad_norm": 1.5961337089538574, + "learning_rate": 0.00016391459074733097, + "loss": 2.093, + "step": 2038 + }, + { + "epoch": 0.9062222222222223, + "grad_norm": 1.6487394571304321, + "learning_rate": 0.00016389679715302492, + "loss": 2.6833, + "step": 2039 + }, + { + "epoch": 0.9066666666666666, + "grad_norm": 2.891885995864868, + "learning_rate": 0.00016387900355871888, + "loss": 1.6798, + "step": 2040 + }, + { + "epoch": 0.9071111111111111, + "grad_norm": 1.8462321758270264, + "learning_rate": 0.0001638612099644128, + "loss": 2.5937, + "step": 2041 + }, + { + "epoch": 0.9075555555555556, + "grad_norm": 1.7322661876678467, + "learning_rate": 0.00016384341637010676, + "loss": 2.7494, + "step": 2042 + }, + { + "epoch": 0.908, + "grad_norm": 1.7559343576431274, + "learning_rate": 0.00016382562277580072, + "loss": 2.7197, + "step": 2043 + }, + { + "epoch": 0.9084444444444445, + "grad_norm": 1.7972688674926758, + "learning_rate": 0.00016380782918149468, + "loss": 2.2508, + "step": 2044 + }, + { + "epoch": 0.9088888888888889, + "grad_norm": 1.5934430360794067, + "learning_rate": 0.0001637900355871886, + "loss": 2.5825, + "step": 2045 + }, + { + "epoch": 0.9093333333333333, + "grad_norm": 2.1732285022735596, + "learning_rate": 0.00016377224199288256, + "loss": 2.5181, + "step": 2046 + }, + { + "epoch": 0.9097777777777778, + "grad_norm": 1.8256020545959473, + "learning_rate": 0.00016375444839857652, + "loss": 2.4343, + "step": 2047 + }, + { + "epoch": 0.9102222222222223, + "grad_norm": 1.8094704151153564, + "learning_rate": 0.00016373665480427048, + "loss": 2.4485, + "step": 2048 + }, + { + "epoch": 0.9106666666666666, + "grad_norm": 2.246121644973755, + "learning_rate": 0.00016371886120996443, + "loss": 3.1074, + "step": 2049 + }, + { + "epoch": 0.9111111111111111, + "grad_norm": 3.0540249347686768, + "learning_rate": 0.0001637010676156584, + "loss": 1.8492, + "step": 2050 + }, + { + "epoch": 0.9115555555555556, + "grad_norm": 0.9966058731079102, + "learning_rate": 0.00016368327402135232, + "loss": 2.6141, + "step": 2051 + }, + { + "epoch": 0.912, + "grad_norm": 1.0565382242202759, + "learning_rate": 0.00016366548042704628, 
+ "loss": 2.5051, + "step": 2052 + }, + { + "epoch": 0.9124444444444444, + "grad_norm": 0.999159574508667, + "learning_rate": 0.00016364768683274023, + "loss": 1.2208, + "step": 2053 + }, + { + "epoch": 0.9128888888888889, + "grad_norm": 1.0446531772613525, + "learning_rate": 0.00016362989323843416, + "loss": 2.403, + "step": 2054 + }, + { + "epoch": 0.9133333333333333, + "grad_norm": 1.7847496271133423, + "learning_rate": 0.00016361209964412812, + "loss": 1.4985, + "step": 2055 + }, + { + "epoch": 0.9137777777777778, + "grad_norm": 1.1442463397979736, + "learning_rate": 0.00016359430604982207, + "loss": 2.8388, + "step": 2056 + }, + { + "epoch": 0.9142222222222223, + "grad_norm": 1.1940288543701172, + "learning_rate": 0.00016357651245551603, + "loss": 2.7245, + "step": 2057 + }, + { + "epoch": 0.9146666666666666, + "grad_norm": 1.3934929370880127, + "learning_rate": 0.00016355871886120996, + "loss": 2.8598, + "step": 2058 + }, + { + "epoch": 0.9151111111111111, + "grad_norm": 1.361688256263733, + "learning_rate": 0.00016354092526690392, + "loss": 2.7173, + "step": 2059 + }, + { + "epoch": 0.9155555555555556, + "grad_norm": 1.3823915719985962, + "learning_rate": 0.00016352313167259787, + "loss": 1.5828, + "step": 2060 + }, + { + "epoch": 0.916, + "grad_norm": 1.3563628196716309, + "learning_rate": 0.00016350533807829183, + "loss": 2.5457, + "step": 2061 + }, + { + "epoch": 0.9164444444444444, + "grad_norm": 1.2287131547927856, + "learning_rate": 0.00016348754448398579, + "loss": 2.9229, + "step": 2062 + }, + { + "epoch": 0.9168888888888889, + "grad_norm": 1.1974775791168213, + "learning_rate": 0.00016346975088967972, + "loss": 2.2515, + "step": 2063 + }, + { + "epoch": 0.9173333333333333, + "grad_norm": 1.5387582778930664, + "learning_rate": 0.00016345195729537367, + "loss": 1.6172, + "step": 2064 + }, + { + "epoch": 0.9177777777777778, + "grad_norm": 1.3966673612594604, + "learning_rate": 0.00016343416370106763, + "loss": 2.771, + "step": 2065 + }, + { + "epoch": 0.9182222222222223, + "grad_norm": 1.7705440521240234, + "learning_rate": 0.00016341637010676156, + "loss": 1.446, + "step": 2066 + }, + { + "epoch": 0.9186666666666666, + "grad_norm": 1.4579976797103882, + "learning_rate": 0.00016339857651245551, + "loss": 1.8905, + "step": 2067 + }, + { + "epoch": 0.9191111111111111, + "grad_norm": 1.4872655868530273, + "learning_rate": 0.00016338078291814947, + "loss": 2.2954, + "step": 2068 + }, + { + "epoch": 0.9195555555555556, + "grad_norm": 1.5094295740127563, + "learning_rate": 0.00016336298932384343, + "loss": 3.1148, + "step": 2069 + }, + { + "epoch": 0.92, + "grad_norm": 1.5641443729400635, + "learning_rate": 0.00016334519572953736, + "loss": 2.7561, + "step": 2070 + }, + { + "epoch": 0.9204444444444444, + "grad_norm": 1.4170724153518677, + "learning_rate": 0.00016332740213523131, + "loss": 2.2233, + "step": 2071 + }, + { + "epoch": 0.9208888888888889, + "grad_norm": 1.633217215538025, + "learning_rate": 0.00016330960854092527, + "loss": 2.9578, + "step": 2072 + }, + { + "epoch": 0.9213333333333333, + "grad_norm": 1.2382259368896484, + "learning_rate": 0.00016329181494661923, + "loss": 1.4224, + "step": 2073 + }, + { + "epoch": 0.9217777777777778, + "grad_norm": 1.9555550813674927, + "learning_rate": 0.00016327402135231318, + "loss": 2.4531, + "step": 2074 + }, + { + "epoch": 0.9222222222222223, + "grad_norm": 1.4946098327636719, + "learning_rate": 0.00016325622775800714, + "loss": 1.3729, + "step": 2075 + }, + { + "epoch": 0.9226666666666666, + "grad_norm": 0.9218053817749023, + 
"learning_rate": 0.00016323843416370107, + "loss": 1.1931, + "step": 2076 + }, + { + "epoch": 0.9231111111111111, + "grad_norm": 1.8889461755752563, + "learning_rate": 0.00016322064056939503, + "loss": 2.8331, + "step": 2077 + }, + { + "epoch": 0.9235555555555556, + "grad_norm": 2.0126349925994873, + "learning_rate": 0.00016320284697508898, + "loss": 2.0097, + "step": 2078 + }, + { + "epoch": 0.924, + "grad_norm": 1.5034337043762207, + "learning_rate": 0.0001631850533807829, + "loss": 2.4002, + "step": 2079 + }, + { + "epoch": 0.9244444444444444, + "grad_norm": 1.582135558128357, + "learning_rate": 0.00016316725978647687, + "loss": 2.4081, + "step": 2080 + }, + { + "epoch": 0.9248888888888889, + "grad_norm": 1.4720321893692017, + "learning_rate": 0.00016314946619217082, + "loss": 2.0486, + "step": 2081 + }, + { + "epoch": 0.9253333333333333, + "grad_norm": 1.524876594543457, + "learning_rate": 0.00016313167259786478, + "loss": 2.4884, + "step": 2082 + }, + { + "epoch": 0.9257777777777778, + "grad_norm": 1.3611582517623901, + "learning_rate": 0.0001631138790035587, + "loss": 2.2896, + "step": 2083 + }, + { + "epoch": 0.9262222222222222, + "grad_norm": 1.6695072650909424, + "learning_rate": 0.00016309608540925267, + "loss": 2.5879, + "step": 2084 + }, + { + "epoch": 0.9266666666666666, + "grad_norm": 1.5357855558395386, + "learning_rate": 0.00016307829181494662, + "loss": 2.4094, + "step": 2085 + }, + { + "epoch": 0.9271111111111111, + "grad_norm": 1.6165261268615723, + "learning_rate": 0.00016306049822064058, + "loss": 2.3728, + "step": 2086 + }, + { + "epoch": 0.9275555555555556, + "grad_norm": 1.7907167673110962, + "learning_rate": 0.00016304270462633454, + "loss": 2.4651, + "step": 2087 + }, + { + "epoch": 0.928, + "grad_norm": 1.6630196571350098, + "learning_rate": 0.0001630249110320285, + "loss": 2.2993, + "step": 2088 + }, + { + "epoch": 0.9284444444444444, + "grad_norm": 1.7729859352111816, + "learning_rate": 0.00016300711743772242, + "loss": 3.0367, + "step": 2089 + }, + { + "epoch": 0.9288888888888889, + "grad_norm": 1.6440625190734863, + "learning_rate": 0.00016298932384341638, + "loss": 2.4056, + "step": 2090 + }, + { + "epoch": 0.9293333333333333, + "grad_norm": 1.8084213733673096, + "learning_rate": 0.00016297153024911034, + "loss": 2.5752, + "step": 2091 + }, + { + "epoch": 0.9297777777777778, + "grad_norm": 1.6393537521362305, + "learning_rate": 0.00016295373665480426, + "loss": 2.2847, + "step": 2092 + }, + { + "epoch": 0.9302222222222222, + "grad_norm": 1.849129319190979, + "learning_rate": 0.00016293594306049822, + "loss": 2.8678, + "step": 2093 + }, + { + "epoch": 0.9306666666666666, + "grad_norm": 2.1448423862457275, + "learning_rate": 0.00016291814946619218, + "loss": 3.2427, + "step": 2094 + }, + { + "epoch": 0.9311111111111111, + "grad_norm": 1.7885196208953857, + "learning_rate": 0.00016290035587188613, + "loss": 2.0247, + "step": 2095 + }, + { + "epoch": 0.9315555555555556, + "grad_norm": 1.889359474182129, + "learning_rate": 0.00016288256227758006, + "loss": 2.492, + "step": 2096 + }, + { + "epoch": 0.932, + "grad_norm": 1.7645171880722046, + "learning_rate": 0.00016286476868327402, + "loss": 2.6633, + "step": 2097 + }, + { + "epoch": 0.9324444444444444, + "grad_norm": 1.768557071685791, + "learning_rate": 0.00016284697508896798, + "loss": 2.4075, + "step": 2098 + }, + { + "epoch": 0.9328888888888889, + "grad_norm": 2.301161766052246, + "learning_rate": 0.00016282918149466193, + "loss": 2.7808, + "step": 2099 + }, + { + "epoch": 0.9333333333333333, + "grad_norm": 
2.5840353965759277, + "learning_rate": 0.0001628113879003559, + "loss": 3.1963, + "step": 2100 + }, + { + "epoch": 0.9337777777777778, + "grad_norm": 1.097058653831482, + "learning_rate": 0.00016279359430604985, + "loss": 2.4867, + "step": 2101 + }, + { + "epoch": 0.9342222222222222, + "grad_norm": 0.9999614357948303, + "learning_rate": 0.00016277580071174378, + "loss": 1.277, + "step": 2102 + }, + { + "epoch": 0.9346666666666666, + "grad_norm": 0.8796145915985107, + "learning_rate": 0.00016275800711743773, + "loss": 1.4156, + "step": 2103 + }, + { + "epoch": 0.9351111111111111, + "grad_norm": 1.1534805297851562, + "learning_rate": 0.0001627402135231317, + "loss": 2.3798, + "step": 2104 + }, + { + "epoch": 0.9355555555555556, + "grad_norm": 1.3361473083496094, + "learning_rate": 0.00016272241992882562, + "loss": 2.6389, + "step": 2105 + }, + { + "epoch": 0.936, + "grad_norm": 1.3026865720748901, + "learning_rate": 0.00016270462633451957, + "loss": 2.9566, + "step": 2106 + }, + { + "epoch": 0.9364444444444444, + "grad_norm": 1.2137173414230347, + "learning_rate": 0.00016268683274021353, + "loss": 2.7551, + "step": 2107 + }, + { + "epoch": 0.9368888888888889, + "grad_norm": 1.1544345617294312, + "learning_rate": 0.0001626690391459075, + "loss": 1.6071, + "step": 2108 + }, + { + "epoch": 0.9373333333333334, + "grad_norm": 1.3754730224609375, + "learning_rate": 0.00016265124555160142, + "loss": 2.681, + "step": 2109 + }, + { + "epoch": 0.9377777777777778, + "grad_norm": 1.2891576290130615, + "learning_rate": 0.00016263345195729537, + "loss": 2.5061, + "step": 2110 + }, + { + "epoch": 0.9382222222222222, + "grad_norm": 1.6436229944229126, + "learning_rate": 0.00016261565836298933, + "loss": 2.7442, + "step": 2111 + }, + { + "epoch": 0.9386666666666666, + "grad_norm": 1.6582046747207642, + "learning_rate": 0.00016259786476868329, + "loss": 2.4712, + "step": 2112 + }, + { + "epoch": 0.9391111111111111, + "grad_norm": 1.338775873184204, + "learning_rate": 0.00016258007117437724, + "loss": 2.2104, + "step": 2113 + }, + { + "epoch": 0.9395555555555556, + "grad_norm": 1.4280105829238892, + "learning_rate": 0.0001625622775800712, + "loss": 1.706, + "step": 2114 + }, + { + "epoch": 0.94, + "grad_norm": 1.3031154870986938, + "learning_rate": 0.00016254448398576513, + "loss": 2.9918, + "step": 2115 + }, + { + "epoch": 0.9404444444444444, + "grad_norm": 1.6146162748336792, + "learning_rate": 0.00016252669039145908, + "loss": 2.577, + "step": 2116 + }, + { + "epoch": 0.9408888888888889, + "grad_norm": 1.5383062362670898, + "learning_rate": 0.00016250889679715304, + "loss": 2.38, + "step": 2117 + }, + { + "epoch": 0.9413333333333334, + "grad_norm": 1.5242427587509155, + "learning_rate": 0.00016249110320284697, + "loss": 2.5093, + "step": 2118 + }, + { + "epoch": 0.9417777777777778, + "grad_norm": 1.5081580877304077, + "learning_rate": 0.00016247330960854093, + "loss": 2.983, + "step": 2119 + }, + { + "epoch": 0.9422222222222222, + "grad_norm": 1.362468957901001, + "learning_rate": 0.00016245551601423488, + "loss": 2.4969, + "step": 2120 + }, + { + "epoch": 0.9426666666666667, + "grad_norm": 1.5495905876159668, + "learning_rate": 0.00016243772241992884, + "loss": 2.474, + "step": 2121 + }, + { + "epoch": 0.9431111111111111, + "grad_norm": 1.6289684772491455, + "learning_rate": 0.00016241992882562277, + "loss": 2.7819, + "step": 2122 + }, + { + "epoch": 0.9435555555555556, + "grad_norm": 1.3928167819976807, + "learning_rate": 0.00016240213523131673, + "loss": 2.3204, + "step": 2123 + }, + { + "epoch": 0.944, 
+ "grad_norm": 1.5994818210601807, + "learning_rate": 0.00016238434163701068, + "loss": 2.783, + "step": 2124 + }, + { + "epoch": 0.9444444444444444, + "grad_norm": 1.4788901805877686, + "learning_rate": 0.00016236654804270464, + "loss": 2.4959, + "step": 2125 + }, + { + "epoch": 0.9448888888888889, + "grad_norm": 1.7189639806747437, + "learning_rate": 0.0001623487544483986, + "loss": 2.7708, + "step": 2126 + }, + { + "epoch": 0.9453333333333334, + "grad_norm": 1.6642398834228516, + "learning_rate": 0.00016233096085409255, + "loss": 1.4068, + "step": 2127 + }, + { + "epoch": 0.9457777777777778, + "grad_norm": 1.9017895460128784, + "learning_rate": 0.00016231316725978648, + "loss": 2.0995, + "step": 2128 + }, + { + "epoch": 0.9462222222222222, + "grad_norm": 1.4762321710586548, + "learning_rate": 0.00016229537366548044, + "loss": 2.8266, + "step": 2129 + }, + { + "epoch": 0.9466666666666667, + "grad_norm": 1.1698795557022095, + "learning_rate": 0.0001622775800711744, + "loss": 1.2059, + "step": 2130 + }, + { + "epoch": 0.9471111111111111, + "grad_norm": 1.7860333919525146, + "learning_rate": 0.00016225978647686832, + "loss": 3.0256, + "step": 2131 + }, + { + "epoch": 0.9475555555555556, + "grad_norm": 1.6017791032791138, + "learning_rate": 0.00016224199288256228, + "loss": 3.0768, + "step": 2132 + }, + { + "epoch": 0.948, + "grad_norm": 1.6588813066482544, + "learning_rate": 0.00016222419928825624, + "loss": 2.8438, + "step": 2133 + }, + { + "epoch": 0.9484444444444444, + "grad_norm": 1.97148597240448, + "learning_rate": 0.0001622064056939502, + "loss": 2.4861, + "step": 2134 + }, + { + "epoch": 0.9488888888888889, + "grad_norm": 1.5532220602035522, + "learning_rate": 0.00016218861209964412, + "loss": 2.999, + "step": 2135 + }, + { + "epoch": 0.9493333333333334, + "grad_norm": 1.6381109952926636, + "learning_rate": 0.00016217081850533808, + "loss": 2.6297, + "step": 2136 + }, + { + "epoch": 0.9497777777777778, + "grad_norm": 1.6247540712356567, + "learning_rate": 0.00016215302491103204, + "loss": 2.0808, + "step": 2137 + }, + { + "epoch": 0.9502222222222222, + "grad_norm": 2.061701536178589, + "learning_rate": 0.000162135231316726, + "loss": 2.9111, + "step": 2138 + }, + { + "epoch": 0.9506666666666667, + "grad_norm": 1.9049525260925293, + "learning_rate": 0.00016211743772241995, + "loss": 1.179, + "step": 2139 + }, + { + "epoch": 0.9511111111111111, + "grad_norm": 1.4355841875076294, + "learning_rate": 0.0001620996441281139, + "loss": 2.2306, + "step": 2140 + }, + { + "epoch": 0.9515555555555556, + "grad_norm": 2.1435563564300537, + "learning_rate": 0.00016208185053380783, + "loss": 3.1722, + "step": 2141 + }, + { + "epoch": 0.952, + "grad_norm": 1.7206003665924072, + "learning_rate": 0.0001620640569395018, + "loss": 2.6664, + "step": 2142 + }, + { + "epoch": 0.9524444444444444, + "grad_norm": 1.7475922107696533, + "learning_rate": 0.00016204626334519575, + "loss": 2.2984, + "step": 2143 + }, + { + "epoch": 0.9528888888888889, + "grad_norm": 1.429494857788086, + "learning_rate": 0.00016202846975088968, + "loss": 2.0885, + "step": 2144 + }, + { + "epoch": 0.9533333333333334, + "grad_norm": 1.6629289388656616, + "learning_rate": 0.00016201067615658363, + "loss": 2.312, + "step": 2145 + }, + { + "epoch": 0.9537777777777777, + "grad_norm": 1.6623343229293823, + "learning_rate": 0.0001619928825622776, + "loss": 2.3646, + "step": 2146 + }, + { + "epoch": 0.9542222222222222, + "grad_norm": 2.0395777225494385, + "learning_rate": 0.00016197508896797155, + "loss": 2.4601, + "step": 2147 + }, + { + 
"epoch": 0.9546666666666667, + "grad_norm": 2.142592191696167, + "learning_rate": 0.00016195729537366548, + "loss": 2.5194, + "step": 2148 + }, + { + "epoch": 0.9551111111111111, + "grad_norm": 1.8677221536636353, + "learning_rate": 0.00016193950177935943, + "loss": 2.3944, + "step": 2149 + }, + { + "epoch": 0.9555555555555556, + "grad_norm": 1.8945562839508057, + "learning_rate": 0.0001619217081850534, + "loss": 2.6568, + "step": 2150 + }, + { + "epoch": 0.956, + "grad_norm": 1.6855690479278564, + "learning_rate": 0.00016190391459074735, + "loss": 3.2555, + "step": 2151 + }, + { + "epoch": 0.9564444444444444, + "grad_norm": 1.2461864948272705, + "learning_rate": 0.0001618861209964413, + "loss": 2.8541, + "step": 2152 + }, + { + "epoch": 0.9568888888888889, + "grad_norm": 1.181875467300415, + "learning_rate": 0.00016186832740213523, + "loss": 2.6893, + "step": 2153 + }, + { + "epoch": 0.9573333333333334, + "grad_norm": 1.1439327001571655, + "learning_rate": 0.0001618505338078292, + "loss": 2.8082, + "step": 2154 + }, + { + "epoch": 0.9577777777777777, + "grad_norm": 1.2180849313735962, + "learning_rate": 0.00016183274021352314, + "loss": 2.4687, + "step": 2155 + }, + { + "epoch": 0.9582222222222222, + "grad_norm": 1.2314586639404297, + "learning_rate": 0.0001618149466192171, + "loss": 2.1729, + "step": 2156 + }, + { + "epoch": 0.9586666666666667, + "grad_norm": 1.3291667699813843, + "learning_rate": 0.00016179715302491103, + "loss": 2.5573, + "step": 2157 + }, + { + "epoch": 0.9591111111111111, + "grad_norm": 1.359408974647522, + "learning_rate": 0.000161779359430605, + "loss": 2.2368, + "step": 2158 + }, + { + "epoch": 0.9595555555555556, + "grad_norm": 1.2051069736480713, + "learning_rate": 0.00016176156583629894, + "loss": 2.2714, + "step": 2159 + }, + { + "epoch": 0.96, + "grad_norm": 1.4360554218292236, + "learning_rate": 0.00016174377224199287, + "loss": 2.4253, + "step": 2160 + }, + { + "epoch": 0.9604444444444444, + "grad_norm": 1.3218653202056885, + "learning_rate": 0.00016172597864768683, + "loss": 2.38, + "step": 2161 + }, + { + "epoch": 0.9608888888888889, + "grad_norm": 1.658354640007019, + "learning_rate": 0.00016170818505338079, + "loss": 3.0823, + "step": 2162 + }, + { + "epoch": 0.9613333333333334, + "grad_norm": 1.390121579170227, + "learning_rate": 0.00016169039145907474, + "loss": 2.6402, + "step": 2163 + }, + { + "epoch": 0.9617777777777777, + "grad_norm": 1.279600977897644, + "learning_rate": 0.0001616725978647687, + "loss": 2.6816, + "step": 2164 + }, + { + "epoch": 0.9622222222222222, + "grad_norm": 1.3409185409545898, + "learning_rate": 0.00016165480427046266, + "loss": 2.0361, + "step": 2165 + }, + { + "epoch": 0.9626666666666667, + "grad_norm": 1.7012516260147095, + "learning_rate": 0.00016163701067615658, + "loss": 2.7103, + "step": 2166 + }, + { + "epoch": 0.9631111111111111, + "grad_norm": 1.6114073991775513, + "learning_rate": 0.00016161921708185054, + "loss": 2.2603, + "step": 2167 + }, + { + "epoch": 0.9635555555555556, + "grad_norm": 1.3906176090240479, + "learning_rate": 0.0001616014234875445, + "loss": 2.4579, + "step": 2168 + }, + { + "epoch": 0.964, + "grad_norm": 1.3002898693084717, + "learning_rate": 0.00016158362989323845, + "loss": 2.3782, + "step": 2169 + }, + { + "epoch": 0.9644444444444444, + "grad_norm": 1.4082340002059937, + "learning_rate": 0.00016156583629893238, + "loss": 2.6653, + "step": 2170 + }, + { + "epoch": 0.9648888888888889, + "grad_norm": 1.4760489463806152, + "learning_rate": 0.00016154804270462634, + "loss": 2.7057, + "step": 
2171 + }, + { + "epoch": 0.9653333333333334, + "grad_norm": 1.411620020866394, + "learning_rate": 0.0001615302491103203, + "loss": 2.3573, + "step": 2172 + }, + { + "epoch": 0.9657777777777777, + "grad_norm": 1.4951653480529785, + "learning_rate": 0.00016151245551601423, + "loss": 2.9025, + "step": 2173 + }, + { + "epoch": 0.9662222222222222, + "grad_norm": 1.6565364599227905, + "learning_rate": 0.00016149466192170818, + "loss": 3.272, + "step": 2174 + }, + { + "epoch": 0.9666666666666667, + "grad_norm": 1.4833595752716064, + "learning_rate": 0.00016147686832740214, + "loss": 2.6249, + "step": 2175 + }, + { + "epoch": 0.9671111111111111, + "grad_norm": 1.5216375589370728, + "learning_rate": 0.0001614590747330961, + "loss": 2.4307, + "step": 2176 + }, + { + "epoch": 0.9675555555555555, + "grad_norm": 1.8078597784042358, + "learning_rate": 0.00016144128113879005, + "loss": 2.629, + "step": 2177 + }, + { + "epoch": 0.968, + "grad_norm": 1.560192584991455, + "learning_rate": 0.000161423487544484, + "loss": 2.6558, + "step": 2178 + }, + { + "epoch": 0.9684444444444444, + "grad_norm": 1.6416150331497192, + "learning_rate": 0.00016140569395017794, + "loss": 2.3076, + "step": 2179 + }, + { + "epoch": 0.9688888888888889, + "grad_norm": 1.6443932056427002, + "learning_rate": 0.0001613879003558719, + "loss": 2.9646, + "step": 2180 + }, + { + "epoch": 0.9693333333333334, + "grad_norm": 2.160329580307007, + "learning_rate": 0.00016137010676156585, + "loss": 2.0787, + "step": 2181 + }, + { + "epoch": 0.9697777777777777, + "grad_norm": 1.5181187391281128, + "learning_rate": 0.00016135231316725978, + "loss": 2.5944, + "step": 2182 + }, + { + "epoch": 0.9702222222222222, + "grad_norm": 1.5956158638000488, + "learning_rate": 0.00016133451957295374, + "loss": 2.5744, + "step": 2183 + }, + { + "epoch": 0.9706666666666667, + "grad_norm": 1.6073002815246582, + "learning_rate": 0.0001613167259786477, + "loss": 2.2124, + "step": 2184 + }, + { + "epoch": 0.9711111111111111, + "grad_norm": 1.5423829555511475, + "learning_rate": 0.00016129893238434165, + "loss": 2.5337, + "step": 2185 + }, + { + "epoch": 0.9715555555555555, + "grad_norm": 1.5740853548049927, + "learning_rate": 0.00016128113879003558, + "loss": 2.7245, + "step": 2186 + }, + { + "epoch": 0.972, + "grad_norm": 1.5205440521240234, + "learning_rate": 0.00016126334519572954, + "loss": 2.6604, + "step": 2187 + }, + { + "epoch": 0.9724444444444444, + "grad_norm": 1.425803303718567, + "learning_rate": 0.0001612455516014235, + "loss": 2.2333, + "step": 2188 + }, + { + "epoch": 0.9728888888888889, + "grad_norm": 1.6136490106582642, + "learning_rate": 0.00016122775800711745, + "loss": 2.2972, + "step": 2189 + }, + { + "epoch": 0.9733333333333334, + "grad_norm": 2.0137991905212402, + "learning_rate": 0.0001612099644128114, + "loss": 1.3431, + "step": 2190 + }, + { + "epoch": 0.9737777777777777, + "grad_norm": 1.7414988279342651, + "learning_rate": 0.00016119217081850536, + "loss": 2.8453, + "step": 2191 + }, + { + "epoch": 0.9742222222222222, + "grad_norm": 1.7436699867248535, + "learning_rate": 0.0001611743772241993, + "loss": 2.6342, + "step": 2192 + }, + { + "epoch": 0.9746666666666667, + "grad_norm": 2.1652956008911133, + "learning_rate": 0.00016115658362989325, + "loss": 3.4064, + "step": 2193 + }, + { + "epoch": 0.9751111111111112, + "grad_norm": 1.4634653329849243, + "learning_rate": 0.0001611387900355872, + "loss": 2.2654, + "step": 2194 + }, + { + "epoch": 0.9755555555555555, + "grad_norm": 1.380988359451294, + "learning_rate": 0.00016112099644128113, + 
"loss": 1.9905, + "step": 2195 + }, + { + "epoch": 0.976, + "grad_norm": 1.78019380569458, + "learning_rate": 0.0001611032028469751, + "loss": 2.7685, + "step": 2196 + }, + { + "epoch": 0.9764444444444444, + "grad_norm": 2.219177722930908, + "learning_rate": 0.00016108540925266905, + "loss": 2.8879, + "step": 2197 + }, + { + "epoch": 0.9768888888888889, + "grad_norm": 2.295215129852295, + "learning_rate": 0.000161067615658363, + "loss": 2.7534, + "step": 2198 + }, + { + "epoch": 0.9773333333333334, + "grad_norm": 2.250352144241333, + "learning_rate": 0.00016104982206405693, + "loss": 2.7764, + "step": 2199 + }, + { + "epoch": 0.9777777777777777, + "grad_norm": 2.36488938331604, + "learning_rate": 0.0001610320284697509, + "loss": 3.1503, + "step": 2200 + }, + { + "epoch": 0.9782222222222222, + "grad_norm": 1.0454128980636597, + "learning_rate": 0.00016101423487544485, + "loss": 1.2486, + "step": 2201 + }, + { + "epoch": 0.9786666666666667, + "grad_norm": 1.087558388710022, + "learning_rate": 0.0001609964412811388, + "loss": 2.7034, + "step": 2202 + }, + { + "epoch": 0.9791111111111112, + "grad_norm": 1.1958833932876587, + "learning_rate": 0.00016097864768683276, + "loss": 2.5241, + "step": 2203 + }, + { + "epoch": 0.9795555555555555, + "grad_norm": 1.2436286211013794, + "learning_rate": 0.00016096085409252671, + "loss": 2.5663, + "step": 2204 + }, + { + "epoch": 0.98, + "grad_norm": 1.29501211643219, + "learning_rate": 0.00016094306049822064, + "loss": 2.2681, + "step": 2205 + }, + { + "epoch": 0.9804444444444445, + "grad_norm": 1.4029202461242676, + "learning_rate": 0.0001609252669039146, + "loss": 2.6442, + "step": 2206 + }, + { + "epoch": 0.9808888888888889, + "grad_norm": 1.2167294025421143, + "learning_rate": 0.00016090747330960856, + "loss": 2.3042, + "step": 2207 + }, + { + "epoch": 0.9813333333333333, + "grad_norm": 1.1777758598327637, + "learning_rate": 0.00016088967971530249, + "loss": 2.3734, + "step": 2208 + }, + { + "epoch": 0.9817777777777777, + "grad_norm": 1.352673053741455, + "learning_rate": 0.00016087188612099644, + "loss": 2.5139, + "step": 2209 + }, + { + "epoch": 0.9822222222222222, + "grad_norm": 1.5402523279190063, + "learning_rate": 0.0001608540925266904, + "loss": 2.4843, + "step": 2210 + }, + { + "epoch": 0.9826666666666667, + "grad_norm": 1.278908371925354, + "learning_rate": 0.00016083629893238436, + "loss": 2.6108, + "step": 2211 + }, + { + "epoch": 0.9831111111111112, + "grad_norm": 1.1538258790969849, + "learning_rate": 0.00016081850533807829, + "loss": 2.221, + "step": 2212 + }, + { + "epoch": 0.9835555555555555, + "grad_norm": 1.3440600633621216, + "learning_rate": 0.00016080071174377224, + "loss": 2.1903, + "step": 2213 + }, + { + "epoch": 0.984, + "grad_norm": 1.4237117767333984, + "learning_rate": 0.0001607829181494662, + "loss": 2.6641, + "step": 2214 + }, + { + "epoch": 0.9844444444444445, + "grad_norm": 1.4718806743621826, + "learning_rate": 0.00016076512455516015, + "loss": 1.0382, + "step": 2215 + }, + { + "epoch": 0.9848888888888889, + "grad_norm": 1.376482605934143, + "learning_rate": 0.0001607473309608541, + "loss": 2.2212, + "step": 2216 + }, + { + "epoch": 0.9853333333333333, + "grad_norm": 1.2039564847946167, + "learning_rate": 0.00016072953736654807, + "loss": 1.969, + "step": 2217 + }, + { + "epoch": 0.9857777777777778, + "grad_norm": 1.4104335308074951, + "learning_rate": 0.000160711743772242, + "loss": 2.5837, + "step": 2218 + }, + { + "epoch": 0.9862222222222222, + "grad_norm": 1.5590723752975464, + "learning_rate": 
0.00016069395017793595, + "loss": 2.5401, + "step": 2219 + }, + { + "epoch": 0.9866666666666667, + "grad_norm": 1.4591701030731201, + "learning_rate": 0.0001606761565836299, + "loss": 2.1405, + "step": 2220 + }, + { + "epoch": 0.9871111111111112, + "grad_norm": 1.6289016008377075, + "learning_rate": 0.00016065836298932384, + "loss": 2.6935, + "step": 2221 + }, + { + "epoch": 0.9875555555555555, + "grad_norm": 1.431257963180542, + "learning_rate": 0.0001606405693950178, + "loss": 2.7326, + "step": 2222 + }, + { + "epoch": 0.988, + "grad_norm": 1.7325876951217651, + "learning_rate": 0.00016062277580071175, + "loss": 2.2347, + "step": 2223 + }, + { + "epoch": 0.9884444444444445, + "grad_norm": 1.4045909643173218, + "learning_rate": 0.0001606049822064057, + "loss": 2.4795, + "step": 2224 + }, + { + "epoch": 0.9888888888888889, + "grad_norm": 1.5909349918365479, + "learning_rate": 0.00016058718861209964, + "loss": 2.89, + "step": 2225 + }, + { + "epoch": 0.9893333333333333, + "grad_norm": 1.7079018354415894, + "learning_rate": 0.0001605693950177936, + "loss": 2.9235, + "step": 2226 + }, + { + "epoch": 0.9897777777777778, + "grad_norm": 1.401973843574524, + "learning_rate": 0.00016055160142348755, + "loss": 2.5211, + "step": 2227 + }, + { + "epoch": 0.9902222222222222, + "grad_norm": 1.7690025568008423, + "learning_rate": 0.0001605338078291815, + "loss": 2.8388, + "step": 2228 + }, + { + "epoch": 0.9906666666666667, + "grad_norm": 1.3721446990966797, + "learning_rate": 0.00016051601423487546, + "loss": 2.6306, + "step": 2229 + }, + { + "epoch": 0.9911111111111112, + "grad_norm": 1.9013787508010864, + "learning_rate": 0.00016049822064056942, + "loss": 3.1249, + "step": 2230 + }, + { + "epoch": 0.9915555555555555, + "grad_norm": 1.5793306827545166, + "learning_rate": 0.00016048042704626335, + "loss": 0.0805, + "step": 2231 + }, + { + "epoch": 0.992, + "grad_norm": 1.6632827520370483, + "learning_rate": 0.0001604626334519573, + "loss": 2.1838, + "step": 2232 + }, + { + "epoch": 0.9924444444444445, + "grad_norm": 1.837686538696289, + "learning_rate": 0.00016044483985765126, + "loss": 2.245, + "step": 2233 + }, + { + "epoch": 0.9928888888888889, + "grad_norm": 2.039071559906006, + "learning_rate": 0.0001604270462633452, + "loss": 2.713, + "step": 2234 + }, + { + "epoch": 0.9933333333333333, + "grad_norm": 1.6307156085968018, + "learning_rate": 0.00016040925266903915, + "loss": 3.0165, + "step": 2235 + }, + { + "epoch": 0.9937777777777778, + "grad_norm": 1.4582569599151611, + "learning_rate": 0.0001603914590747331, + "loss": 2.3939, + "step": 2236 + }, + { + "epoch": 0.9942222222222222, + "grad_norm": 1.3615283966064453, + "learning_rate": 0.00016037366548042706, + "loss": 1.9555, + "step": 2237 + }, + { + "epoch": 0.9946666666666667, + "grad_norm": 1.5992584228515625, + "learning_rate": 0.000160355871886121, + "loss": 2.4956, + "step": 2238 + }, + { + "epoch": 0.9951111111111111, + "grad_norm": 1.8983911275863647, + "learning_rate": 0.00016033807829181495, + "loss": 2.6738, + "step": 2239 + }, + { + "epoch": 0.9955555555555555, + "grad_norm": 1.622286081314087, + "learning_rate": 0.0001603202846975089, + "loss": 2.2483, + "step": 2240 + }, + { + "epoch": 0.996, + "grad_norm": 1.2114402055740356, + "learning_rate": 0.00016030249110320286, + "loss": 1.3111, + "step": 2241 + }, + { + "epoch": 0.9964444444444445, + "grad_norm": 1.6320134401321411, + "learning_rate": 0.00016028469750889682, + "loss": 2.7794, + "step": 2242 + }, + { + "epoch": 0.9968888888888889, + "grad_norm": 1.804682970046997, + 
"learning_rate": 0.00016026690391459075, + "loss": 2.515, + "step": 2243 + }, + { + "epoch": 0.9973333333333333, + "grad_norm": 2.0232090950012207, + "learning_rate": 0.0001602491103202847, + "loss": 3.3673, + "step": 2244 + }, + { + "epoch": 0.9977777777777778, + "grad_norm": 1.852048635482788, + "learning_rate": 0.00016023131672597866, + "loss": 2.7809, + "step": 2245 + }, + { + "epoch": 0.9982222222222222, + "grad_norm": 1.7070189714431763, + "learning_rate": 0.00016021352313167262, + "loss": 2.2775, + "step": 2246 + }, + { + "epoch": 0.9986666666666667, + "grad_norm": 1.6251046657562256, + "learning_rate": 0.00016019572953736655, + "loss": 2.3802, + "step": 2247 + }, + { + "epoch": 0.9991111111111111, + "grad_norm": 1.7044910192489624, + "learning_rate": 0.0001601779359430605, + "loss": 2.5114, + "step": 2248 + }, + { + "epoch": 0.9995555555555555, + "grad_norm": 2.653667688369751, + "learning_rate": 0.00016016014234875446, + "loss": 1.5299, + "step": 2249 + }, + { + "epoch": 1.0, + "grad_norm": 2.5051262378692627, + "learning_rate": 0.0001601423487544484, + "loss": 1.7248, + "step": 2250 + }, + { + "epoch": 1.0, + "eval_loss": 2.410618305206299, + "eval_runtime": 47.6193, + "eval_samples_per_second": 10.5, + "eval_steps_per_second": 10.5, + "step": 2250 + }, + { + "epoch": 1.0004444444444445, + "grad_norm": 0.9968487620353699, + "learning_rate": 0.00016012455516014234, + "loss": 2.3568, + "step": 2251 + }, + { + "epoch": 1.000888888888889, + "grad_norm": 1.1485272645950317, + "learning_rate": 0.0001601067615658363, + "loss": 2.6016, + "step": 2252 + }, + { + "epoch": 1.0013333333333334, + "grad_norm": 0.9918361902236938, + "learning_rate": 0.00016008896797153026, + "loss": 2.1317, + "step": 2253 + }, + { + "epoch": 1.0017777777777779, + "grad_norm": 1.443756103515625, + "learning_rate": 0.00016007117437722421, + "loss": 1.3992, + "step": 2254 + }, + { + "epoch": 1.0022222222222221, + "grad_norm": 1.2466870546340942, + "learning_rate": 0.00016005338078291817, + "loss": 2.7594, + "step": 2255 + }, + { + "epoch": 1.0026666666666666, + "grad_norm": 1.1465460062026978, + "learning_rate": 0.0001600355871886121, + "loss": 2.3712, + "step": 2256 + }, + { + "epoch": 1.003111111111111, + "grad_norm": 1.0530627965927124, + "learning_rate": 0.00016001779359430606, + "loss": 1.8851, + "step": 2257 + }, + { + "epoch": 1.0035555555555555, + "grad_norm": 1.195264458656311, + "learning_rate": 0.00016, + "loss": 2.2701, + "step": 2258 + }, + { + "epoch": 1.004, + "grad_norm": 1.2500135898590088, + "learning_rate": 0.00015998220640569397, + "loss": 2.167, + "step": 2259 + }, + { + "epoch": 1.0044444444444445, + "grad_norm": 1.24211585521698, + "learning_rate": 0.0001599644128113879, + "loss": 2.3523, + "step": 2260 + }, + { + "epoch": 1.004888888888889, + "grad_norm": 1.3692959547042847, + "learning_rate": 0.00015994661921708186, + "loss": 2.5483, + "step": 2261 + }, + { + "epoch": 1.0053333333333334, + "grad_norm": 1.1393674612045288, + "learning_rate": 0.0001599288256227758, + "loss": 2.3064, + "step": 2262 + }, + { + "epoch": 1.0057777777777779, + "grad_norm": 1.341461181640625, + "learning_rate": 0.00015991103202846974, + "loss": 2.0089, + "step": 2263 + }, + { + "epoch": 1.0062222222222221, + "grad_norm": 1.434979796409607, + "learning_rate": 0.0001598932384341637, + "loss": 1.9006, + "step": 2264 + }, + { + "epoch": 1.0066666666666666, + "grad_norm": 1.4398572444915771, + "learning_rate": 0.00015987544483985765, + "loss": 2.1331, + "step": 2265 + }, + { + "epoch": 1.007111111111111, + "grad_norm": 
1.2505806684494019, + "learning_rate": 0.0001598576512455516, + "loss": 2.059, + "step": 2266 + }, + { + "epoch": 1.0075555555555555, + "grad_norm": 1.3361196517944336, + "learning_rate": 0.00015983985765124557, + "loss": 1.2448, + "step": 2267 + }, + { + "epoch": 1.008, + "grad_norm": 1.3557100296020508, + "learning_rate": 0.00015982206405693952, + "loss": 2.2836, + "step": 2268 + }, + { + "epoch": 1.0084444444444445, + "grad_norm": 1.4562923908233643, + "learning_rate": 0.00015980427046263345, + "loss": 2.4052, + "step": 2269 + }, + { + "epoch": 1.008888888888889, + "grad_norm": 1.8324874639511108, + "learning_rate": 0.0001597864768683274, + "loss": 1.0363, + "step": 2270 + }, + { + "epoch": 1.0093333333333334, + "grad_norm": 1.3576056957244873, + "learning_rate": 0.00015976868327402137, + "loss": 2.5, + "step": 2271 + }, + { + "epoch": 1.0097777777777779, + "grad_norm": 1.369577407836914, + "learning_rate": 0.00015975088967971532, + "loss": 2.4154, + "step": 2272 + }, + { + "epoch": 1.0102222222222221, + "grad_norm": 1.7282054424285889, + "learning_rate": 0.00015973309608540925, + "loss": 2.4238, + "step": 2273 + }, + { + "epoch": 1.0106666666666666, + "grad_norm": 1.4440052509307861, + "learning_rate": 0.0001597153024911032, + "loss": 1.8114, + "step": 2274 + }, + { + "epoch": 1.011111111111111, + "grad_norm": 1.8842155933380127, + "learning_rate": 0.00015969750889679717, + "loss": 2.5752, + "step": 2275 + }, + { + "epoch": 1.0115555555555555, + "grad_norm": 1.6671173572540283, + "learning_rate": 0.0001596797153024911, + "loss": 2.3995, + "step": 2276 + }, + { + "epoch": 1.012, + "grad_norm": 1.6246118545532227, + "learning_rate": 0.00015966192170818505, + "loss": 2.4257, + "step": 2277 + }, + { + "epoch": 1.0124444444444445, + "grad_norm": 1.416971206665039, + "learning_rate": 0.000159644128113879, + "loss": 1.9455, + "step": 2278 + }, + { + "epoch": 1.012888888888889, + "grad_norm": 1.754091739654541, + "learning_rate": 0.00015962633451957296, + "loss": 2.6238, + "step": 2279 + }, + { + "epoch": 1.0133333333333334, + "grad_norm": 1.3849114179611206, + "learning_rate": 0.00015960854092526692, + "loss": 1.5943, + "step": 2280 + }, + { + "epoch": 1.0137777777777779, + "grad_norm": 1.6501544713974, + "learning_rate": 0.00015959074733096088, + "loss": 1.9212, + "step": 2281 + }, + { + "epoch": 1.0142222222222221, + "grad_norm": 1.7187528610229492, + "learning_rate": 0.0001595729537366548, + "loss": 1.7647, + "step": 2282 + }, + { + "epoch": 1.0146666666666666, + "grad_norm": 1.8686177730560303, + "learning_rate": 0.00015955516014234876, + "loss": 2.4814, + "step": 2283 + }, + { + "epoch": 1.015111111111111, + "grad_norm": 1.5445845127105713, + "learning_rate": 0.00015953736654804272, + "loss": 1.9214, + "step": 2284 + }, + { + "epoch": 1.0155555555555555, + "grad_norm": 1.7692326307296753, + "learning_rate": 0.00015951957295373668, + "loss": 2.1594, + "step": 2285 + }, + { + "epoch": 1.016, + "grad_norm": 1.6517410278320312, + "learning_rate": 0.0001595017793594306, + "loss": 2.1724, + "step": 2286 + }, + { + "epoch": 1.0164444444444445, + "grad_norm": 1.2496143579483032, + "learning_rate": 0.00015948398576512456, + "loss": 0.7354, + "step": 2287 + }, + { + "epoch": 1.016888888888889, + "grad_norm": 1.6711918115615845, + "learning_rate": 0.00015946619217081852, + "loss": 2.1376, + "step": 2288 + }, + { + "epoch": 1.0173333333333334, + "grad_norm": 1.8869587182998657, + "learning_rate": 0.00015944839857651245, + "loss": 2.2878, + "step": 2289 + }, + { + "epoch": 1.0177777777777777, + 
"grad_norm": 1.5782763957977295, + "learning_rate": 0.0001594306049822064, + "loss": 1.9263, + "step": 2290 + }, + { + "epoch": 1.0182222222222221, + "grad_norm": 1.8840655088424683, + "learning_rate": 0.00015941281138790036, + "loss": 2.4303, + "step": 2291 + }, + { + "epoch": 1.0186666666666666, + "grad_norm": 2.064854383468628, + "learning_rate": 0.00015939501779359432, + "loss": 2.246, + "step": 2292 + }, + { + "epoch": 1.019111111111111, + "grad_norm": 1.704014539718628, + "learning_rate": 0.00015937722419928827, + "loss": 1.6609, + "step": 2293 + }, + { + "epoch": 1.0195555555555555, + "grad_norm": 1.7326053380966187, + "learning_rate": 0.00015935943060498223, + "loss": 2.0684, + "step": 2294 + }, + { + "epoch": 1.02, + "grad_norm": 1.9503422975540161, + "learning_rate": 0.00015934163701067616, + "loss": 2.1345, + "step": 2295 + }, + { + "epoch": 1.0204444444444445, + "grad_norm": 2.1505517959594727, + "learning_rate": 0.00015932384341637012, + "loss": 2.2523, + "step": 2296 + }, + { + "epoch": 1.020888888888889, + "grad_norm": 2.059180736541748, + "learning_rate": 0.00015930604982206407, + "loss": 1.956, + "step": 2297 + }, + { + "epoch": 1.0213333333333334, + "grad_norm": 1.7710636854171753, + "learning_rate": 0.000159288256227758, + "loss": 1.8129, + "step": 2298 + }, + { + "epoch": 1.0217777777777777, + "grad_norm": 1.9192954301834106, + "learning_rate": 0.00015927046263345196, + "loss": 2.004, + "step": 2299 + }, + { + "epoch": 1.0222222222222221, + "grad_norm": 3.243239402770996, + "learning_rate": 0.00015925266903914591, + "loss": 1.0487, + "step": 2300 + }, + { + "epoch": 1.0226666666666666, + "grad_norm": 1.2613039016723633, + "learning_rate": 0.00015923487544483987, + "loss": 2.2974, + "step": 2301 + }, + { + "epoch": 1.023111111111111, + "grad_norm": 1.9154945611953735, + "learning_rate": 0.0001592170818505338, + "loss": 1.0126, + "step": 2302 + }, + { + "epoch": 1.0235555555555556, + "grad_norm": 1.2044062614440918, + "learning_rate": 0.00015919928825622776, + "loss": 0.5583, + "step": 2303 + }, + { + "epoch": 1.024, + "grad_norm": 1.3576868772506714, + "learning_rate": 0.00015918149466192171, + "loss": 2.2222, + "step": 2304 + }, + { + "epoch": 1.0244444444444445, + "grad_norm": 1.372114658355713, + "learning_rate": 0.00015916370106761567, + "loss": 2.2053, + "step": 2305 + }, + { + "epoch": 1.024888888888889, + "grad_norm": 1.9428937435150146, + "learning_rate": 0.00015914590747330963, + "loss": 1.9576, + "step": 2306 + }, + { + "epoch": 1.0253333333333334, + "grad_norm": 1.2017688751220703, + "learning_rate": 0.00015912811387900358, + "loss": 2.264, + "step": 2307 + }, + { + "epoch": 1.0257777777777777, + "grad_norm": 1.3299150466918945, + "learning_rate": 0.0001591103202846975, + "loss": 1.8914, + "step": 2308 + }, + { + "epoch": 1.0262222222222221, + "grad_norm": 1.3821178674697876, + "learning_rate": 0.00015909252669039147, + "loss": 2.7482, + "step": 2309 + }, + { + "epoch": 1.0266666666666666, + "grad_norm": 1.5535212755203247, + "learning_rate": 0.00015907473309608543, + "loss": 2.3124, + "step": 2310 + }, + { + "epoch": 1.027111111111111, + "grad_norm": 1.2688461542129517, + "learning_rate": 0.00015905693950177936, + "loss": 2.0699, + "step": 2311 + }, + { + "epoch": 1.0275555555555556, + "grad_norm": 1.5910311937332153, + "learning_rate": 0.0001590391459074733, + "loss": 2.258, + "step": 2312 + }, + { + "epoch": 1.028, + "grad_norm": 1.4131362438201904, + "learning_rate": 0.00015902135231316727, + "loss": 1.8134, + "step": 2313 + }, + { + "epoch": 
1.0284444444444445, + "grad_norm": 1.4447015523910522, + "learning_rate": 0.00015900355871886122, + "loss": 1.4502, + "step": 2314 + }, + { + "epoch": 1.028888888888889, + "grad_norm": 1.6646491289138794, + "learning_rate": 0.00015898576512455515, + "loss": 2.3019, + "step": 2315 + }, + { + "epoch": 1.0293333333333334, + "grad_norm": 1.4541774988174438, + "learning_rate": 0.0001589679715302491, + "loss": 2.2436, + "step": 2316 + }, + { + "epoch": 1.0297777777777777, + "grad_norm": 1.4342156648635864, + "learning_rate": 0.00015895017793594307, + "loss": 2.3697, + "step": 2317 + }, + { + "epoch": 1.0302222222222222, + "grad_norm": 1.6885074377059937, + "learning_rate": 0.00015893238434163702, + "loss": 2.3556, + "step": 2318 + }, + { + "epoch": 1.0306666666666666, + "grad_norm": 2.0182912349700928, + "learning_rate": 0.00015891459074733098, + "loss": 2.886, + "step": 2319 + }, + { + "epoch": 1.031111111111111, + "grad_norm": 1.5477981567382812, + "learning_rate": 0.00015889679715302494, + "loss": 2.3829, + "step": 2320 + }, + { + "epoch": 1.0315555555555556, + "grad_norm": 1.4278366565704346, + "learning_rate": 0.00015887900355871887, + "loss": 1.8171, + "step": 2321 + }, + { + "epoch": 1.032, + "grad_norm": 1.5529868602752686, + "learning_rate": 0.00015886120996441282, + "loss": 1.9937, + "step": 2322 + }, + { + "epoch": 1.0324444444444445, + "grad_norm": 1.7762391567230225, + "learning_rate": 0.00015884341637010678, + "loss": 1.9945, + "step": 2323 + }, + { + "epoch": 1.032888888888889, + "grad_norm": 1.378474473953247, + "learning_rate": 0.0001588256227758007, + "loss": 1.907, + "step": 2324 + }, + { + "epoch": 1.0333333333333334, + "grad_norm": 1.6256483793258667, + "learning_rate": 0.00015880782918149466, + "loss": 2.2908, + "step": 2325 + }, + { + "epoch": 1.0337777777777777, + "grad_norm": 1.5568405389785767, + "learning_rate": 0.00015879003558718862, + "loss": 2.2006, + "step": 2326 + }, + { + "epoch": 1.0342222222222222, + "grad_norm": 1.5443711280822754, + "learning_rate": 0.00015877224199288258, + "loss": 1.892, + "step": 2327 + }, + { + "epoch": 1.0346666666666666, + "grad_norm": 1.584693193435669, + "learning_rate": 0.0001587544483985765, + "loss": 2.2243, + "step": 2328 + }, + { + "epoch": 1.035111111111111, + "grad_norm": 1.7238883972167969, + "learning_rate": 0.00015873665480427046, + "loss": 1.8239, + "step": 2329 + }, + { + "epoch": 1.0355555555555556, + "grad_norm": 1.7188372611999512, + "learning_rate": 0.00015871886120996442, + "loss": 2.1286, + "step": 2330 + }, + { + "epoch": 1.036, + "grad_norm": 1.8142226934432983, + "learning_rate": 0.00015870106761565838, + "loss": 2.342, + "step": 2331 + }, + { + "epoch": 1.0364444444444445, + "grad_norm": 1.9462339878082275, + "learning_rate": 0.00015868327402135233, + "loss": 2.3758, + "step": 2332 + }, + { + "epoch": 1.036888888888889, + "grad_norm": 1.4883403778076172, + "learning_rate": 0.00015866548042704626, + "loss": 1.6063, + "step": 2333 + }, + { + "epoch": 1.0373333333333334, + "grad_norm": 1.6989222764968872, + "learning_rate": 0.00015864768683274022, + "loss": 2.2398, + "step": 2334 + }, + { + "epoch": 1.0377777777777777, + "grad_norm": 1.6604561805725098, + "learning_rate": 0.00015862989323843418, + "loss": 2.2341, + "step": 2335 + }, + { + "epoch": 1.0382222222222222, + "grad_norm": 2.0556907653808594, + "learning_rate": 0.00015861209964412813, + "loss": 2.4467, + "step": 2336 + }, + { + "epoch": 1.0386666666666666, + "grad_norm": 2.043485164642334, + "learning_rate": 0.00015859430604982206, + "loss": 1.7722, + 
"step": 2337 + }, + { + "epoch": 1.039111111111111, + "grad_norm": 1.7736142873764038, + "learning_rate": 0.00015857651245551602, + "loss": 1.9208, + "step": 2338 + }, + { + "epoch": 1.0395555555555556, + "grad_norm": 2.0610883235931396, + "learning_rate": 0.00015855871886120997, + "loss": 2.8929, + "step": 2339 + }, + { + "epoch": 1.04, + "grad_norm": 1.8629289865493774, + "learning_rate": 0.0001585409252669039, + "loss": 1.9888, + "step": 2340 + }, + { + "epoch": 1.0404444444444445, + "grad_norm": 2.201791286468506, + "learning_rate": 0.00015852313167259786, + "loss": 2.4039, + "step": 2341 + }, + { + "epoch": 1.040888888888889, + "grad_norm": 2.3649518489837646, + "learning_rate": 0.00015850533807829182, + "loss": 2.342, + "step": 2342 + }, + { + "epoch": 1.0413333333333332, + "grad_norm": 1.763653039932251, + "learning_rate": 0.00015848754448398577, + "loss": 1.7666, + "step": 2343 + }, + { + "epoch": 1.0417777777777777, + "grad_norm": 1.721071481704712, + "learning_rate": 0.00015846975088967973, + "loss": 2.2088, + "step": 2344 + }, + { + "epoch": 1.0422222222222222, + "grad_norm": 2.0251195430755615, + "learning_rate": 0.00015845195729537369, + "loss": 2.3982, + "step": 2345 + }, + { + "epoch": 1.0426666666666666, + "grad_norm": 1.797646403312683, + "learning_rate": 0.00015843416370106762, + "loss": 2.2807, + "step": 2346 + }, + { + "epoch": 1.043111111111111, + "grad_norm": 2.336357831954956, + "learning_rate": 0.00015841637010676157, + "loss": 2.7199, + "step": 2347 + }, + { + "epoch": 1.0435555555555556, + "grad_norm": 2.0868053436279297, + "learning_rate": 0.00015839857651245553, + "loss": 1.9292, + "step": 2348 + }, + { + "epoch": 1.044, + "grad_norm": 2.3412818908691406, + "learning_rate": 0.00015838078291814949, + "loss": 2.1445, + "step": 2349 + }, + { + "epoch": 1.0444444444444445, + "grad_norm": 2.016115188598633, + "learning_rate": 0.00015836298932384341, + "loss": 1.079, + "step": 2350 + }, + { + "epoch": 1.044888888888889, + "grad_norm": 1.2892199754714966, + "learning_rate": 0.00015834519572953737, + "loss": 1.1067, + "step": 2351 + }, + { + "epoch": 1.0453333333333332, + "grad_norm": 1.37278151512146, + "learning_rate": 0.00015832740213523133, + "loss": 2.208, + "step": 2352 + }, + { + "epoch": 1.0457777777777777, + "grad_norm": 1.2415770292282104, + "learning_rate": 0.00015830960854092526, + "loss": 2.2114, + "step": 2353 + }, + { + "epoch": 1.0462222222222222, + "grad_norm": 1.232061505317688, + "learning_rate": 0.0001582918149466192, + "loss": 1.4155, + "step": 2354 + }, + { + "epoch": 1.0466666666666666, + "grad_norm": 1.177029013633728, + "learning_rate": 0.00015827402135231317, + "loss": 2.5071, + "step": 2355 + }, + { + "epoch": 1.047111111111111, + "grad_norm": 1.4585161209106445, + "learning_rate": 0.00015825622775800713, + "loss": 2.3271, + "step": 2356 + }, + { + "epoch": 1.0475555555555556, + "grad_norm": 1.424669623374939, + "learning_rate": 0.00015823843416370108, + "loss": 2.1732, + "step": 2357 + }, + { + "epoch": 1.048, + "grad_norm": 1.3325684070587158, + "learning_rate": 0.00015822064056939504, + "loss": 2.2668, + "step": 2358 + }, + { + "epoch": 1.0484444444444445, + "grad_norm": 1.5595697164535522, + "learning_rate": 0.00015820284697508897, + "loss": 2.3379, + "step": 2359 + }, + { + "epoch": 1.048888888888889, + "grad_norm": 1.6011629104614258, + "learning_rate": 0.00015818505338078293, + "loss": 2.8927, + "step": 2360 + }, + { + "epoch": 1.0493333333333332, + "grad_norm": 1.100995421409607, + "learning_rate": 0.00015816725978647688, + "loss": 
1.6169, + "step": 2361 + }, + { + "epoch": 1.0497777777777777, + "grad_norm": 1.3697389364242554, + "learning_rate": 0.00015814946619217084, + "loss": 2.3766, + "step": 2362 + }, + { + "epoch": 1.0502222222222222, + "grad_norm": 1.332924485206604, + "learning_rate": 0.00015813167259786477, + "loss": 1.7657, + "step": 2363 + }, + { + "epoch": 1.0506666666666666, + "grad_norm": 1.3072422742843628, + "learning_rate": 0.00015811387900355872, + "loss": 2.0262, + "step": 2364 + }, + { + "epoch": 1.051111111111111, + "grad_norm": 1.370421290397644, + "learning_rate": 0.00015809608540925268, + "loss": 1.8601, + "step": 2365 + }, + { + "epoch": 1.0515555555555556, + "grad_norm": 1.580460786819458, + "learning_rate": 0.0001580782918149466, + "loss": 1.3883, + "step": 2366 + }, + { + "epoch": 1.052, + "grad_norm": 1.478049635887146, + "learning_rate": 0.00015806049822064057, + "loss": 2.3788, + "step": 2367 + }, + { + "epoch": 1.0524444444444445, + "grad_norm": 1.421947717666626, + "learning_rate": 0.00015804270462633452, + "loss": 1.4549, + "step": 2368 + }, + { + "epoch": 1.052888888888889, + "grad_norm": 1.768334984779358, + "learning_rate": 0.00015802491103202848, + "loss": 1.0313, + "step": 2369 + }, + { + "epoch": 1.0533333333333332, + "grad_norm": 1.5838056802749634, + "learning_rate": 0.00015800711743772244, + "loss": 2.3386, + "step": 2370 + }, + { + "epoch": 1.0537777777777777, + "grad_norm": 1.6991932392120361, + "learning_rate": 0.0001579893238434164, + "loss": 2.5764, + "step": 2371 + }, + { + "epoch": 1.0542222222222222, + "grad_norm": 1.4423344135284424, + "learning_rate": 0.00015797153024911032, + "loss": 1.9598, + "step": 2372 + }, + { + "epoch": 1.0546666666666666, + "grad_norm": 1.508663535118103, + "learning_rate": 0.00015795373665480428, + "loss": 2.3974, + "step": 2373 + }, + { + "epoch": 1.055111111111111, + "grad_norm": 1.6960604190826416, + "learning_rate": 0.00015793594306049823, + "loss": 2.2544, + "step": 2374 + }, + { + "epoch": 1.0555555555555556, + "grad_norm": 1.786561131477356, + "learning_rate": 0.0001579181494661922, + "loss": 1.688, + "step": 2375 + }, + { + "epoch": 1.056, + "grad_norm": 1.505338191986084, + "learning_rate": 0.00015790035587188612, + "loss": 2.2269, + "step": 2376 + }, + { + "epoch": 1.0564444444444445, + "grad_norm": 1.4534296989440918, + "learning_rate": 0.00015788256227758008, + "loss": 1.9174, + "step": 2377 + }, + { + "epoch": 1.056888888888889, + "grad_norm": 1.5155029296875, + "learning_rate": 0.00015786476868327403, + "loss": 2.2544, + "step": 2378 + }, + { + "epoch": 1.0573333333333332, + "grad_norm": 1.8670555353164673, + "learning_rate": 0.00015784697508896796, + "loss": 2.4167, + "step": 2379 + }, + { + "epoch": 1.0577777777777777, + "grad_norm": 1.633664846420288, + "learning_rate": 0.00015782918149466192, + "loss": 1.8265, + "step": 2380 + }, + { + "epoch": 1.0582222222222222, + "grad_norm": 1.5484849214553833, + "learning_rate": 0.00015781138790035588, + "loss": 2.1692, + "step": 2381 + }, + { + "epoch": 1.0586666666666666, + "grad_norm": 1.8765406608581543, + "learning_rate": 0.00015779359430604983, + "loss": 0.8998, + "step": 2382 + }, + { + "epoch": 1.0591111111111111, + "grad_norm": 1.6616917848587036, + "learning_rate": 0.0001577758007117438, + "loss": 2.3774, + "step": 2383 + }, + { + "epoch": 1.0595555555555556, + "grad_norm": 1.5309672355651855, + "learning_rate": 0.00015775800711743775, + "loss": 1.8152, + "step": 2384 + }, + { + "epoch": 1.06, + "grad_norm": 1.7901145219802856, + "learning_rate": 0.00015774021352313168, + 
"loss": 2.6988, + "step": 2385 + }, + { + "epoch": 1.0604444444444445, + "grad_norm": 1.5736534595489502, + "learning_rate": 0.00015772241992882563, + "loss": 1.9631, + "step": 2386 + }, + { + "epoch": 1.060888888888889, + "grad_norm": 1.8793672323226929, + "learning_rate": 0.0001577046263345196, + "loss": 2.7282, + "step": 2387 + }, + { + "epoch": 1.0613333333333332, + "grad_norm": 1.6914716958999634, + "learning_rate": 0.00015768683274021354, + "loss": 2.1709, + "step": 2388 + }, + { + "epoch": 1.0617777777777777, + "grad_norm": 1.847061038017273, + "learning_rate": 0.00015766903914590747, + "loss": 2.1721, + "step": 2389 + }, + { + "epoch": 1.0622222222222222, + "grad_norm": 1.698413610458374, + "learning_rate": 0.00015765124555160143, + "loss": 1.6681, + "step": 2390 + }, + { + "epoch": 1.0626666666666666, + "grad_norm": 1.9005299806594849, + "learning_rate": 0.0001576334519572954, + "loss": 1.9767, + "step": 2391 + }, + { + "epoch": 1.0631111111111111, + "grad_norm": 1.9315385818481445, + "learning_rate": 0.00015761565836298932, + "loss": 2.4512, + "step": 2392 + }, + { + "epoch": 1.0635555555555556, + "grad_norm": 1.114691972732544, + "learning_rate": 0.00015759786476868327, + "loss": 0.8507, + "step": 2393 + }, + { + "epoch": 1.064, + "grad_norm": 1.4050822257995605, + "learning_rate": 0.00015758007117437723, + "loss": 1.3638, + "step": 2394 + }, + { + "epoch": 1.0644444444444445, + "grad_norm": 1.3316043615341187, + "learning_rate": 0.00015756227758007119, + "loss": 1.1736, + "step": 2395 + }, + { + "epoch": 1.064888888888889, + "grad_norm": 1.950830340385437, + "learning_rate": 0.00015754448398576514, + "loss": 2.0624, + "step": 2396 + }, + { + "epoch": 1.0653333333333332, + "grad_norm": 1.9094411134719849, + "learning_rate": 0.0001575266903914591, + "loss": 2.4114, + "step": 2397 + }, + { + "epoch": 1.0657777777777777, + "grad_norm": 2.046294927597046, + "learning_rate": 0.00015750889679715303, + "loss": 2.32, + "step": 2398 + }, + { + "epoch": 1.0662222222222222, + "grad_norm": 2.1060452461242676, + "learning_rate": 0.00015749110320284698, + "loss": 2.7453, + "step": 2399 + }, + { + "epoch": 1.0666666666666667, + "grad_norm": 3.819446325302124, + "learning_rate": 0.00015747330960854094, + "loss": 0.8665, + "step": 2400 + }, + { + "epoch": 1.0671111111111111, + "grad_norm": 1.0843795537948608, + "learning_rate": 0.0001574555160142349, + "loss": 2.5427, + "step": 2401 + }, + { + "epoch": 1.0675555555555556, + "grad_norm": 1.219653606414795, + "learning_rate": 0.00015743772241992883, + "loss": 0.8994, + "step": 2402 + }, + { + "epoch": 1.068, + "grad_norm": 1.363571047782898, + "learning_rate": 0.00015741992882562278, + "loss": 2.6447, + "step": 2403 + }, + { + "epoch": 1.0684444444444445, + "grad_norm": 1.192108154296875, + "learning_rate": 0.00015740213523131674, + "loss": 2.1144, + "step": 2404 + }, + { + "epoch": 1.068888888888889, + "grad_norm": 1.539057731628418, + "learning_rate": 0.00015738434163701067, + "loss": 2.487, + "step": 2405 + }, + { + "epoch": 1.0693333333333332, + "grad_norm": 1.3476603031158447, + "learning_rate": 0.00015736654804270463, + "loss": 2.207, + "step": 2406 + }, + { + "epoch": 1.0697777777777777, + "grad_norm": 1.304561972618103, + "learning_rate": 0.00015734875444839858, + "loss": 2.0535, + "step": 2407 + }, + { + "epoch": 1.0702222222222222, + "grad_norm": 1.4313247203826904, + "learning_rate": 0.00015733096085409254, + "loss": 2.5509, + "step": 2408 + }, + { + "epoch": 1.0706666666666667, + "grad_norm": 1.3336185216903687, + "learning_rate": 
0.0001573131672597865, + "loss": 1.8019, + "step": 2409 + }, + { + "epoch": 1.0711111111111111, + "grad_norm": 1.3517190217971802, + "learning_rate": 0.00015729537366548045, + "loss": 2.188, + "step": 2410 + }, + { + "epoch": 1.0715555555555556, + "grad_norm": 1.563821792602539, + "learning_rate": 0.00015727758007117438, + "loss": 2.2798, + "step": 2411 + }, + { + "epoch": 1.072, + "grad_norm": 1.5215498208999634, + "learning_rate": 0.00015725978647686834, + "loss": 2.454, + "step": 2412 + }, + { + "epoch": 1.0724444444444445, + "grad_norm": 1.465469479560852, + "learning_rate": 0.0001572419928825623, + "loss": 1.8113, + "step": 2413 + }, + { + "epoch": 1.072888888888889, + "grad_norm": 1.5329351425170898, + "learning_rate": 0.00015722419928825622, + "loss": 2.3624, + "step": 2414 + }, + { + "epoch": 1.0733333333333333, + "grad_norm": 1.7038596868515015, + "learning_rate": 0.00015720640569395018, + "loss": 0.1039, + "step": 2415 + }, + { + "epoch": 1.0737777777777777, + "grad_norm": 1.4838898181915283, + "learning_rate": 0.00015718861209964414, + "loss": 1.8683, + "step": 2416 + }, + { + "epoch": 1.0742222222222222, + "grad_norm": 1.7675329446792603, + "learning_rate": 0.0001571708185053381, + "loss": 2.3259, + "step": 2417 + }, + { + "epoch": 1.0746666666666667, + "grad_norm": 1.687468409538269, + "learning_rate": 0.00015715302491103202, + "loss": 2.6304, + "step": 2418 + }, + { + "epoch": 1.0751111111111111, + "grad_norm": 1.5204507112503052, + "learning_rate": 0.00015713523131672598, + "loss": 2.5227, + "step": 2419 + }, + { + "epoch": 1.0755555555555556, + "grad_norm": 2.2087671756744385, + "learning_rate": 0.00015711743772241994, + "loss": 1.0911, + "step": 2420 + }, + { + "epoch": 1.076, + "grad_norm": 1.5000768899917603, + "learning_rate": 0.0001570996441281139, + "loss": 2.3532, + "step": 2421 + }, + { + "epoch": 1.0764444444444445, + "grad_norm": 1.5198241472244263, + "learning_rate": 0.00015708185053380785, + "loss": 2.0702, + "step": 2422 + }, + { + "epoch": 1.076888888888889, + "grad_norm": 1.6272002458572388, + "learning_rate": 0.00015706405693950178, + "loss": 2.0378, + "step": 2423 + }, + { + "epoch": 1.0773333333333333, + "grad_norm": 1.5990360975265503, + "learning_rate": 0.00015704626334519573, + "loss": 2.2515, + "step": 2424 + }, + { + "epoch": 1.0777777777777777, + "grad_norm": 1.4444339275360107, + "learning_rate": 0.0001570284697508897, + "loss": 2.2619, + "step": 2425 + }, + { + "epoch": 1.0782222222222222, + "grad_norm": 1.8956879377365112, + "learning_rate": 0.00015701067615658365, + "loss": 2.8021, + "step": 2426 + }, + { + "epoch": 1.0786666666666667, + "grad_norm": 1.6425714492797852, + "learning_rate": 0.00015699288256227758, + "loss": 2.0961, + "step": 2427 + }, + { + "epoch": 1.0791111111111111, + "grad_norm": 1.7819446325302124, + "learning_rate": 0.00015697508896797153, + "loss": 2.2295, + "step": 2428 + }, + { + "epoch": 1.0795555555555556, + "grad_norm": 1.8124161958694458, + "learning_rate": 0.0001569572953736655, + "loss": 2.3429, + "step": 2429 + }, + { + "epoch": 1.08, + "grad_norm": 1.7560713291168213, + "learning_rate": 0.00015693950177935942, + "loss": 2.0773, + "step": 2430 + }, + { + "epoch": 1.0804444444444445, + "grad_norm": 1.647606611251831, + "learning_rate": 0.00015692170818505338, + "loss": 1.9757, + "step": 2431 + }, + { + "epoch": 1.0808888888888888, + "grad_norm": 1.6525201797485352, + "learning_rate": 0.00015690391459074733, + "loss": 1.8019, + "step": 2432 + }, + { + "epoch": 1.0813333333333333, + "grad_norm": 1.7084051370620728, + 
"learning_rate": 0.0001568861209964413, + "loss": 2.6304, + "step": 2433 + }, + { + "epoch": 1.0817777777777777, + "grad_norm": 1.3071404695510864, + "learning_rate": 0.00015686832740213525, + "loss": 1.1213, + "step": 2434 + }, + { + "epoch": 1.0822222222222222, + "grad_norm": 1.7664408683776855, + "learning_rate": 0.0001568505338078292, + "loss": 2.335, + "step": 2435 + }, + { + "epoch": 1.0826666666666667, + "grad_norm": 1.7795616388320923, + "learning_rate": 0.00015683274021352313, + "loss": 1.8807, + "step": 2436 + }, + { + "epoch": 1.0831111111111111, + "grad_norm": 1.9509518146514893, + "learning_rate": 0.0001568149466192171, + "loss": 2.2775, + "step": 2437 + }, + { + "epoch": 1.0835555555555556, + "grad_norm": 1.7835257053375244, + "learning_rate": 0.00015679715302491104, + "loss": 1.8702, + "step": 2438 + }, + { + "epoch": 1.084, + "grad_norm": 1.7957788705825806, + "learning_rate": 0.000156779359430605, + "loss": 2.4921, + "step": 2439 + }, + { + "epoch": 1.0844444444444445, + "grad_norm": 1.2208243608474731, + "learning_rate": 0.00015676156583629893, + "loss": 1.2614, + "step": 2440 + }, + { + "epoch": 1.0848888888888888, + "grad_norm": 1.8217169046401978, + "learning_rate": 0.0001567437722419929, + "loss": 2.0923, + "step": 2441 + }, + { + "epoch": 1.0853333333333333, + "grad_norm": 2.509866952896118, + "learning_rate": 0.00015672597864768684, + "loss": 2.5114, + "step": 2442 + }, + { + "epoch": 1.0857777777777777, + "grad_norm": 2.0780751705169678, + "learning_rate": 0.00015670818505338077, + "loss": 2.5196, + "step": 2443 + }, + { + "epoch": 1.0862222222222222, + "grad_norm": 1.780432105064392, + "learning_rate": 0.00015669039145907473, + "loss": 1.6573, + "step": 2444 + }, + { + "epoch": 1.0866666666666667, + "grad_norm": 1.7413227558135986, + "learning_rate": 0.00015667259786476869, + "loss": 1.6994, + "step": 2445 + }, + { + "epoch": 1.0871111111111111, + "grad_norm": 2.0534093379974365, + "learning_rate": 0.00015665480427046264, + "loss": 2.5252, + "step": 2446 + }, + { + "epoch": 1.0875555555555556, + "grad_norm": 1.8891476392745972, + "learning_rate": 0.0001566370106761566, + "loss": 1.7454, + "step": 2447 + }, + { + "epoch": 1.088, + "grad_norm": 2.5236616134643555, + "learning_rate": 0.00015661921708185056, + "loss": 2.7, + "step": 2448 + }, + { + "epoch": 1.0884444444444445, + "grad_norm": 2.130950689315796, + "learning_rate": 0.00015660142348754448, + "loss": 1.9737, + "step": 2449 + }, + { + "epoch": 1.0888888888888888, + "grad_norm": 3.0445713996887207, + "learning_rate": 0.00015658362989323844, + "loss": 1.5697, + "step": 2450 + }, + { + "epoch": 1.0893333333333333, + "grad_norm": 1.1887680292129517, + "learning_rate": 0.0001565658362989324, + "loss": 2.4287, + "step": 2451 + }, + { + "epoch": 1.0897777777777777, + "grad_norm": 1.3546632528305054, + "learning_rate": 0.00015654804270462635, + "loss": 1.377, + "step": 2452 + }, + { + "epoch": 1.0902222222222222, + "grad_norm": 1.6304256916046143, + "learning_rate": 0.00015653024911032028, + "loss": 2.4908, + "step": 2453 + }, + { + "epoch": 1.0906666666666667, + "grad_norm": 1.516430139541626, + "learning_rate": 0.00015651245551601424, + "loss": 2.0657, + "step": 2454 + }, + { + "epoch": 1.0911111111111111, + "grad_norm": 1.5727593898773193, + "learning_rate": 0.0001564946619217082, + "loss": 2.0812, + "step": 2455 + }, + { + "epoch": 1.0915555555555556, + "grad_norm": 1.2789214849472046, + "learning_rate": 0.00015647686832740213, + "loss": 2.01, + "step": 2456 + }, + { + "epoch": 1.092, + "grad_norm": 
1.4972316026687622, + "learning_rate": 0.00015645907473309608, + "loss": 1.0983, + "step": 2457 + }, + { + "epoch": 1.0924444444444443, + "grad_norm": 1.3972692489624023, + "learning_rate": 0.00015644128113879004, + "loss": 2.2204, + "step": 2458 + }, + { + "epoch": 1.0928888888888888, + "grad_norm": 1.5354390144348145, + "learning_rate": 0.000156423487544484, + "loss": 2.3677, + "step": 2459 + }, + { + "epoch": 1.0933333333333333, + "grad_norm": 1.5079275369644165, + "learning_rate": 0.00015640569395017795, + "loss": 2.4312, + "step": 2460 + }, + { + "epoch": 1.0937777777777777, + "grad_norm": 1.4273076057434082, + "learning_rate": 0.0001563879003558719, + "loss": 2.1158, + "step": 2461 + }, + { + "epoch": 1.0942222222222222, + "grad_norm": 1.5340080261230469, + "learning_rate": 0.00015637010676156584, + "loss": 2.559, + "step": 2462 + }, + { + "epoch": 1.0946666666666667, + "grad_norm": 1.5617725849151611, + "learning_rate": 0.0001563523131672598, + "loss": 2.5294, + "step": 2463 + }, + { + "epoch": 1.0951111111111111, + "grad_norm": 1.6314741373062134, + "learning_rate": 0.00015633451957295375, + "loss": 2.1713, + "step": 2464 + }, + { + "epoch": 1.0955555555555556, + "grad_norm": 1.460752010345459, + "learning_rate": 0.0001563167259786477, + "loss": 2.3558, + "step": 2465 + }, + { + "epoch": 1.096, + "grad_norm": 1.428756833076477, + "learning_rate": 0.00015629893238434164, + "loss": 2.2738, + "step": 2466 + }, + { + "epoch": 1.0964444444444443, + "grad_norm": 1.7158453464508057, + "learning_rate": 0.0001562811387900356, + "loss": 2.172, + "step": 2467 + }, + { + "epoch": 1.0968888888888888, + "grad_norm": 1.5051125288009644, + "learning_rate": 0.00015626334519572955, + "loss": 1.9741, + "step": 2468 + }, + { + "epoch": 1.0973333333333333, + "grad_norm": 1.4402563571929932, + "learning_rate": 0.00015624555160142348, + "loss": 2.2152, + "step": 2469 + }, + { + "epoch": 1.0977777777777777, + "grad_norm": 1.6247109174728394, + "learning_rate": 0.00015622775800711744, + "loss": 2.1543, + "step": 2470 + }, + { + "epoch": 1.0982222222222222, + "grad_norm": 1.6644169092178345, + "learning_rate": 0.0001562099644128114, + "loss": 2.2489, + "step": 2471 + }, + { + "epoch": 1.0986666666666667, + "grad_norm": 2.04425048828125, + "learning_rate": 0.00015619217081850535, + "loss": 2.2436, + "step": 2472 + }, + { + "epoch": 1.0991111111111111, + "grad_norm": 1.86391282081604, + "learning_rate": 0.0001561743772241993, + "loss": 1.8856, + "step": 2473 + }, + { + "epoch": 1.0995555555555556, + "grad_norm": 1.5092231035232544, + "learning_rate": 0.00015615658362989326, + "loss": 2.0027, + "step": 2474 + }, + { + "epoch": 1.1, + "grad_norm": 1.5313433408737183, + "learning_rate": 0.0001561387900355872, + "loss": 1.4636, + "step": 2475 + }, + { + "epoch": 1.1004444444444443, + "grad_norm": 1.7613354921340942, + "learning_rate": 0.00015612099644128115, + "loss": 2.3526, + "step": 2476 + }, + { + "epoch": 1.1008888888888888, + "grad_norm": 1.567148208618164, + "learning_rate": 0.0001561032028469751, + "loss": 1.8585, + "step": 2477 + }, + { + "epoch": 1.1013333333333333, + "grad_norm": 2.1191651821136475, + "learning_rate": 0.00015608540925266906, + "loss": 2.188, + "step": 2478 + }, + { + "epoch": 1.1017777777777777, + "grad_norm": 1.6111822128295898, + "learning_rate": 0.000156067615658363, + "loss": 2.0623, + "step": 2479 + }, + { + "epoch": 1.1022222222222222, + "grad_norm": 1.5612345933914185, + "learning_rate": 0.00015604982206405695, + "loss": 1.881, + "step": 2480 + }, + { + "epoch": 
1.1026666666666667, + "grad_norm": 1.9181139469146729, + "learning_rate": 0.0001560320284697509, + "loss": 2.4053, + "step": 2481 + }, + { + "epoch": 1.1031111111111112, + "grad_norm": 1.5462487936019897, + "learning_rate": 0.00015601423487544483, + "loss": 1.7245, + "step": 2482 + }, + { + "epoch": 1.1035555555555556, + "grad_norm": 1.9112005233764648, + "learning_rate": 0.0001559964412811388, + "loss": 2.0651, + "step": 2483 + }, + { + "epoch": 1.104, + "grad_norm": 1.8536262512207031, + "learning_rate": 0.00015597864768683274, + "loss": 2.2609, + "step": 2484 + }, + { + "epoch": 1.1044444444444443, + "grad_norm": 1.6455966234207153, + "learning_rate": 0.0001559608540925267, + "loss": 1.8761, + "step": 2485 + }, + { + "epoch": 1.1048888888888888, + "grad_norm": 1.9066351652145386, + "learning_rate": 0.00015594306049822066, + "loss": 1.9272, + "step": 2486 + }, + { + "epoch": 1.1053333333333333, + "grad_norm": 1.5465588569641113, + "learning_rate": 0.00015592526690391461, + "loss": 1.6961, + "step": 2487 + }, + { + "epoch": 1.1057777777777777, + "grad_norm": 2.180607318878174, + "learning_rate": 0.00015590747330960854, + "loss": 2.4785, + "step": 2488 + }, + { + "epoch": 1.1062222222222222, + "grad_norm": 1.9382919073104858, + "learning_rate": 0.0001558896797153025, + "loss": 1.754, + "step": 2489 + }, + { + "epoch": 1.1066666666666667, + "grad_norm": 1.8899612426757812, + "learning_rate": 0.00015587188612099646, + "loss": 2.2687, + "step": 2490 + }, + { + "epoch": 1.1071111111111112, + "grad_norm": 1.633954644203186, + "learning_rate": 0.0001558540925266904, + "loss": 1.9051, + "step": 2491 + }, + { + "epoch": 1.1075555555555556, + "grad_norm": 3.747358560562134, + "learning_rate": 0.00015583629893238434, + "loss": 1.9931, + "step": 2492 + }, + { + "epoch": 1.108, + "grad_norm": 2.0736889839172363, + "learning_rate": 0.0001558185053380783, + "loss": 2.3253, + "step": 2493 + }, + { + "epoch": 1.1084444444444443, + "grad_norm": 1.9722511768341064, + "learning_rate": 0.00015580071174377226, + "loss": 2.2268, + "step": 2494 + }, + { + "epoch": 1.1088888888888888, + "grad_norm": 1.9055894613265991, + "learning_rate": 0.00015578291814946619, + "loss": 2.0168, + "step": 2495 + }, + { + "epoch": 1.1093333333333333, + "grad_norm": 2.424161672592163, + "learning_rate": 0.00015576512455516014, + "loss": 2.4325, + "step": 2496 + }, + { + "epoch": 1.1097777777777778, + "grad_norm": 1.9778692722320557, + "learning_rate": 0.0001557473309608541, + "loss": 1.8312, + "step": 2497 + }, + { + "epoch": 1.1102222222222222, + "grad_norm": 2.590223550796509, + "learning_rate": 0.00015572953736654805, + "loss": 2.6732, + "step": 2498 + }, + { + "epoch": 1.1106666666666667, + "grad_norm": 1.8567228317260742, + "learning_rate": 0.000155711743772242, + "loss": 0.7758, + "step": 2499 + }, + { + "epoch": 1.1111111111111112, + "grad_norm": 2.436675786972046, + "learning_rate": 0.00015569395017793597, + "loss": 2.0896, + "step": 2500 + }, + { + "epoch": 1.1115555555555556, + "grad_norm": 1.4165303707122803, + "learning_rate": 0.0001556761565836299, + "loss": 2.3637, + "step": 2501 + }, + { + "epoch": 1.112, + "grad_norm": 1.173225998878479, + "learning_rate": 0.00015565836298932385, + "loss": 2.1477, + "step": 2502 + }, + { + "epoch": 1.1124444444444443, + "grad_norm": 1.1812031269073486, + "learning_rate": 0.0001556405693950178, + "loss": 2.5036, + "step": 2503 + }, + { + "epoch": 1.1128888888888888, + "grad_norm": 1.4120737314224243, + "learning_rate": 0.00015562277580071177, + "loss": 2.1574, + "step": 2504 + }, + { 
+ "epoch": 1.1133333333333333, + "grad_norm": 1.3986128568649292, + "learning_rate": 0.0001556049822064057, + "loss": 1.8443, + "step": 2505 + }, + { + "epoch": 1.1137777777777778, + "grad_norm": 1.6244175434112549, + "learning_rate": 0.00015558718861209965, + "loss": 2.7789, + "step": 2506 + }, + { + "epoch": 1.1142222222222222, + "grad_norm": 1.4272732734680176, + "learning_rate": 0.0001555693950177936, + "loss": 2.2718, + "step": 2507 + }, + { + "epoch": 1.1146666666666667, + "grad_norm": 1.3611066341400146, + "learning_rate": 0.00015555160142348754, + "loss": 2.3992, + "step": 2508 + }, + { + "epoch": 1.1151111111111112, + "grad_norm": 1.751434087753296, + "learning_rate": 0.0001555338078291815, + "loss": 2.1827, + "step": 2509 + }, + { + "epoch": 1.1155555555555556, + "grad_norm": 1.3341114521026611, + "learning_rate": 0.00015551601423487545, + "loss": 1.985, + "step": 2510 + }, + { + "epoch": 1.116, + "grad_norm": 1.3708674907684326, + "learning_rate": 0.0001554982206405694, + "loss": 2.0507, + "step": 2511 + }, + { + "epoch": 1.1164444444444444, + "grad_norm": 1.4740220308303833, + "learning_rate": 0.00015548042704626336, + "loss": 2.1808, + "step": 2512 + }, + { + "epoch": 1.1168888888888888, + "grad_norm": 1.5105361938476562, + "learning_rate": 0.0001554626334519573, + "loss": 2.055, + "step": 2513 + }, + { + "epoch": 1.1173333333333333, + "grad_norm": 1.4184283018112183, + "learning_rate": 0.00015544483985765125, + "loss": 2.0523, + "step": 2514 + }, + { + "epoch": 1.1177777777777778, + "grad_norm": 1.5358822345733643, + "learning_rate": 0.0001554270462633452, + "loss": 2.073, + "step": 2515 + }, + { + "epoch": 1.1182222222222222, + "grad_norm": 1.636608362197876, + "learning_rate": 0.00015540925266903916, + "loss": 2.0907, + "step": 2516 + }, + { + "epoch": 1.1186666666666667, + "grad_norm": 1.7694652080535889, + "learning_rate": 0.00015539145907473312, + "loss": 2.4836, + "step": 2517 + }, + { + "epoch": 1.1191111111111112, + "grad_norm": 1.6159368753433228, + "learning_rate": 0.00015537366548042705, + "loss": 1.9878, + "step": 2518 + }, + { + "epoch": 1.1195555555555556, + "grad_norm": 2.006478786468506, + "learning_rate": 0.000155355871886121, + "loss": 2.6112, + "step": 2519 + }, + { + "epoch": 1.12, + "grad_norm": 1.530470371246338, + "learning_rate": 0.00015533807829181493, + "loss": 1.8775, + "step": 2520 + }, + { + "epoch": 1.1204444444444444, + "grad_norm": 1.8092018365859985, + "learning_rate": 0.0001553202846975089, + "loss": 2.0667, + "step": 2521 + }, + { + "epoch": 1.1208888888888888, + "grad_norm": 1.5595135688781738, + "learning_rate": 0.00015530249110320285, + "loss": 2.7584, + "step": 2522 + }, + { + "epoch": 1.1213333333333333, + "grad_norm": 1.5130575895309448, + "learning_rate": 0.0001552846975088968, + "loss": 2.3097, + "step": 2523 + }, + { + "epoch": 1.1217777777777778, + "grad_norm": 1.7195639610290527, + "learning_rate": 0.00015526690391459076, + "loss": 2.3219, + "step": 2524 + }, + { + "epoch": 1.1222222222222222, + "grad_norm": 1.7365368604660034, + "learning_rate": 0.00015524911032028472, + "loss": 1.8011, + "step": 2525 + }, + { + "epoch": 1.1226666666666667, + "grad_norm": 1.6279082298278809, + "learning_rate": 0.00015523131672597865, + "loss": 1.9524, + "step": 2526 + }, + { + "epoch": 1.1231111111111112, + "grad_norm": 1.9169297218322754, + "learning_rate": 0.0001552135231316726, + "loss": 2.3763, + "step": 2527 + }, + { + "epoch": 1.1235555555555556, + "grad_norm": 1.688704490661621, + "learning_rate": 0.00015519572953736656, + "loss": 2.1569, 
+ "step": 2528 + }, + { + "epoch": 1.124, + "grad_norm": 1.2597405910491943, + "learning_rate": 0.00015517793594306052, + "loss": 1.3028, + "step": 2529 + }, + { + "epoch": 1.1244444444444444, + "grad_norm": 1.07369863986969, + "learning_rate": 0.00015516014234875445, + "loss": 0.8657, + "step": 2530 + }, + { + "epoch": 1.1248888888888888, + "grad_norm": 1.3311814069747925, + "learning_rate": 0.0001551423487544484, + "loss": 1.1018, + "step": 2531 + }, + { + "epoch": 1.1253333333333333, + "grad_norm": 1.7124340534210205, + "learning_rate": 0.00015512455516014236, + "loss": 2.2348, + "step": 2532 + }, + { + "epoch": 1.1257777777777778, + "grad_norm": 1.6865593194961548, + "learning_rate": 0.0001551067615658363, + "loss": 2.095, + "step": 2533 + }, + { + "epoch": 1.1262222222222222, + "grad_norm": 2.115900754928589, + "learning_rate": 0.00015508896797153024, + "loss": 2.2804, + "step": 2534 + }, + { + "epoch": 1.1266666666666667, + "grad_norm": 1.720361590385437, + "learning_rate": 0.0001550711743772242, + "loss": 2.125, + "step": 2535 + }, + { + "epoch": 1.1271111111111112, + "grad_norm": 1.8980624675750732, + "learning_rate": 0.00015505338078291816, + "loss": 2.2915, + "step": 2536 + }, + { + "epoch": 1.1275555555555556, + "grad_norm": 2.2021002769470215, + "learning_rate": 0.00015503558718861211, + "loss": 2.2995, + "step": 2537 + }, + { + "epoch": 1.1280000000000001, + "grad_norm": 1.804162621498108, + "learning_rate": 0.00015501779359430607, + "loss": 2.1668, + "step": 2538 + }, + { + "epoch": 1.1284444444444444, + "grad_norm": 1.6780054569244385, + "learning_rate": 0.000155, + "loss": 1.7612, + "step": 2539 + }, + { + "epoch": 1.1288888888888888, + "grad_norm": 2.008111000061035, + "learning_rate": 0.00015498220640569396, + "loss": 2.2902, + "step": 2540 + }, + { + "epoch": 1.1293333333333333, + "grad_norm": 2.181689500808716, + "learning_rate": 0.0001549644128113879, + "loss": 2.3495, + "step": 2541 + }, + { + "epoch": 1.1297777777777778, + "grad_norm": 2.064223289489746, + "learning_rate": 0.00015494661921708187, + "loss": 1.9178, + "step": 2542 + }, + { + "epoch": 1.1302222222222222, + "grad_norm": 2.1660356521606445, + "learning_rate": 0.0001549288256227758, + "loss": 2.6801, + "step": 2543 + }, + { + "epoch": 1.1306666666666667, + "grad_norm": 2.031355142593384, + "learning_rate": 0.00015491103202846976, + "loss": 2.2943, + "step": 2544 + }, + { + "epoch": 1.1311111111111112, + "grad_norm": 2.2684144973754883, + "learning_rate": 0.0001548932384341637, + "loss": 2.5049, + "step": 2545 + }, + { + "epoch": 1.1315555555555556, + "grad_norm": 2.3677666187286377, + "learning_rate": 0.00015487544483985764, + "loss": 2.6018, + "step": 2546 + }, + { + "epoch": 1.1320000000000001, + "grad_norm": 2.2659435272216797, + "learning_rate": 0.0001548576512455516, + "loss": 2.3025, + "step": 2547 + }, + { + "epoch": 1.1324444444444444, + "grad_norm": 2.0491015911102295, + "learning_rate": 0.00015483985765124555, + "loss": 2.3657, + "step": 2548 + }, + { + "epoch": 1.1328888888888888, + "grad_norm": 1.3263907432556152, + "learning_rate": 0.0001548220640569395, + "loss": 0.88, + "step": 2549 + }, + { + "epoch": 1.1333333333333333, + "grad_norm": 2.2537829875946045, + "learning_rate": 0.00015480427046263347, + "loss": 1.195, + "step": 2550 + }, + { + "epoch": 1.1337777777777778, + "grad_norm": 1.338564395904541, + "learning_rate": 0.00015478647686832742, + "loss": 2.866, + "step": 2551 + }, + { + "epoch": 1.1342222222222222, + "grad_norm": 1.3049834966659546, + "learning_rate": 0.00015476868327402135, 
+ "loss": 2.7732, + "step": 2552 + }, + { + "epoch": 1.1346666666666667, + "grad_norm": 0.8578532338142395, + "learning_rate": 0.0001547508896797153, + "loss": 1.0418, + "step": 2553 + }, + { + "epoch": 1.1351111111111112, + "grad_norm": 1.3947099447250366, + "learning_rate": 0.00015473309608540927, + "loss": 2.3232, + "step": 2554 + }, + { + "epoch": 1.1355555555555557, + "grad_norm": 1.4263209104537964, + "learning_rate": 0.00015471530249110322, + "loss": 2.4567, + "step": 2555 + }, + { + "epoch": 1.1360000000000001, + "grad_norm": 1.4634780883789062, + "learning_rate": 0.00015469750889679715, + "loss": 2.2001, + "step": 2556 + }, + { + "epoch": 1.1364444444444444, + "grad_norm": 1.4137287139892578, + "learning_rate": 0.0001546797153024911, + "loss": 2.3393, + "step": 2557 + }, + { + "epoch": 1.1368888888888888, + "grad_norm": 1.3310433626174927, + "learning_rate": 0.00015466192170818507, + "loss": 2.1137, + "step": 2558 + }, + { + "epoch": 1.1373333333333333, + "grad_norm": 1.6760700941085815, + "learning_rate": 0.000154644128113879, + "loss": 2.6479, + "step": 2559 + }, + { + "epoch": 1.1377777777777778, + "grad_norm": 1.6810277700424194, + "learning_rate": 0.00015462633451957295, + "loss": 1.7997, + "step": 2560 + }, + { + "epoch": 1.1382222222222222, + "grad_norm": 1.40380859375, + "learning_rate": 0.0001546085409252669, + "loss": 2.1069, + "step": 2561 + }, + { + "epoch": 1.1386666666666667, + "grad_norm": 1.451357126235962, + "learning_rate": 0.00015459074733096086, + "loss": 2.0879, + "step": 2562 + }, + { + "epoch": 1.1391111111111112, + "grad_norm": 1.7103229761123657, + "learning_rate": 0.00015457295373665482, + "loss": 2.1198, + "step": 2563 + }, + { + "epoch": 1.1395555555555554, + "grad_norm": 1.3014283180236816, + "learning_rate": 0.00015455516014234878, + "loss": 1.4409, + "step": 2564 + }, + { + "epoch": 1.1400000000000001, + "grad_norm": 1.7110216617584229, + "learning_rate": 0.0001545373665480427, + "loss": 2.064, + "step": 2565 + }, + { + "epoch": 1.1404444444444444, + "grad_norm": 1.2959778308868408, + "learning_rate": 0.00015451957295373666, + "loss": 1.9788, + "step": 2566 + }, + { + "epoch": 1.1408888888888888, + "grad_norm": 1.339138388633728, + "learning_rate": 0.00015450177935943062, + "loss": 1.525, + "step": 2567 + }, + { + "epoch": 1.1413333333333333, + "grad_norm": 2.068941116333008, + "learning_rate": 0.00015448398576512458, + "loss": 2.5298, + "step": 2568 + }, + { + "epoch": 1.1417777777777778, + "grad_norm": 1.6485881805419922, + "learning_rate": 0.0001544661921708185, + "loss": 2.0371, + "step": 2569 + }, + { + "epoch": 1.1422222222222222, + "grad_norm": 1.6411560773849487, + "learning_rate": 0.00015444839857651246, + "loss": 2.1563, + "step": 2570 + }, + { + "epoch": 1.1426666666666667, + "grad_norm": 1.8316292762756348, + "learning_rate": 0.00015443060498220642, + "loss": 1.9724, + "step": 2571 + }, + { + "epoch": 1.1431111111111112, + "grad_norm": 1.3947020769119263, + "learning_rate": 0.00015441281138790035, + "loss": 2.1488, + "step": 2572 + }, + { + "epoch": 1.1435555555555554, + "grad_norm": 1.722806692123413, + "learning_rate": 0.0001543950177935943, + "loss": 1.9657, + "step": 2573 + }, + { + "epoch": 1.144, + "grad_norm": 1.944720983505249, + "learning_rate": 0.00015437722419928826, + "loss": 2.0344, + "step": 2574 + }, + { + "epoch": 1.1444444444444444, + "grad_norm": 1.69381582736969, + "learning_rate": 0.00015435943060498222, + "loss": 1.6078, + "step": 2575 + }, + { + "epoch": 1.1448888888888888, + "grad_norm": 1.678240180015564, + 
"learning_rate": 0.00015434163701067617, + "loss": 2.2514, + "step": 2576 + }, + { + "epoch": 1.1453333333333333, + "grad_norm": 1.8483023643493652, + "learning_rate": 0.00015432384341637013, + "loss": 2.3157, + "step": 2577 + }, + { + "epoch": 1.1457777777777778, + "grad_norm": 1.6956913471221924, + "learning_rate": 0.00015430604982206406, + "loss": 2.2523, + "step": 2578 + }, + { + "epoch": 1.1462222222222223, + "grad_norm": 1.861174464225769, + "learning_rate": 0.00015428825622775802, + "loss": 2.5922, + "step": 2579 + }, + { + "epoch": 1.1466666666666667, + "grad_norm": 1.6170905828475952, + "learning_rate": 0.00015427046263345197, + "loss": 1.7942, + "step": 2580 + }, + { + "epoch": 1.1471111111111112, + "grad_norm": 1.5658963918685913, + "learning_rate": 0.00015425266903914593, + "loss": 2.0127, + "step": 2581 + }, + { + "epoch": 1.1475555555555554, + "grad_norm": 1.7916998863220215, + "learning_rate": 0.00015423487544483986, + "loss": 2.2693, + "step": 2582 + }, + { + "epoch": 1.148, + "grad_norm": 1.8371453285217285, + "learning_rate": 0.00015421708185053381, + "loss": 1.8348, + "step": 2583 + }, + { + "epoch": 1.1484444444444444, + "grad_norm": 1.862358808517456, + "learning_rate": 0.00015419928825622777, + "loss": 2.536, + "step": 2584 + }, + { + "epoch": 1.1488888888888888, + "grad_norm": 2.036752700805664, + "learning_rate": 0.0001541814946619217, + "loss": 2.4652, + "step": 2585 + }, + { + "epoch": 1.1493333333333333, + "grad_norm": 2.268584966659546, + "learning_rate": 0.00015416370106761566, + "loss": 2.2486, + "step": 2586 + }, + { + "epoch": 1.1497777777777778, + "grad_norm": 1.7090932130813599, + "learning_rate": 0.00015414590747330961, + "loss": 1.95, + "step": 2587 + }, + { + "epoch": 1.1502222222222223, + "grad_norm": 1.6534048318862915, + "learning_rate": 0.00015412811387900357, + "loss": 1.301, + "step": 2588 + }, + { + "epoch": 1.1506666666666667, + "grad_norm": 1.7545114755630493, + "learning_rate": 0.00015411032028469753, + "loss": 1.753, + "step": 2589 + }, + { + "epoch": 1.1511111111111112, + "grad_norm": 2.4128382205963135, + "learning_rate": 0.00015409252669039148, + "loss": 2.5481, + "step": 2590 + }, + { + "epoch": 1.1515555555555554, + "grad_norm": 1.815370798110962, + "learning_rate": 0.0001540747330960854, + "loss": 2.091, + "step": 2591 + }, + { + "epoch": 1.152, + "grad_norm": 2.445251941680908, + "learning_rate": 0.00015405693950177937, + "loss": 2.1765, + "step": 2592 + }, + { + "epoch": 1.1524444444444444, + "grad_norm": 1.9091752767562866, + "learning_rate": 0.00015403914590747333, + "loss": 1.5047, + "step": 2593 + }, + { + "epoch": 1.1528888888888889, + "grad_norm": 2.3893678188323975, + "learning_rate": 0.00015402135231316728, + "loss": 2.2924, + "step": 2594 + }, + { + "epoch": 1.1533333333333333, + "grad_norm": 2.1791248321533203, + "learning_rate": 0.0001540035587188612, + "loss": 2.1891, + "step": 2595 + }, + { + "epoch": 1.1537777777777778, + "grad_norm": 1.9411416053771973, + "learning_rate": 0.00015398576512455517, + "loss": 2.2953, + "step": 2596 + }, + { + "epoch": 1.1542222222222223, + "grad_norm": 2.478189468383789, + "learning_rate": 0.00015396797153024912, + "loss": 1.9645, + "step": 2597 + }, + { + "epoch": 1.1546666666666667, + "grad_norm": 2.280930280685425, + "learning_rate": 0.00015395017793594305, + "loss": 2.2849, + "step": 2598 + }, + { + "epoch": 1.1551111111111112, + "grad_norm": 2.5622832775115967, + "learning_rate": 0.000153932384341637, + "loss": 1.4624, + "step": 2599 + }, + { + "epoch": 1.1555555555555554, + 
"grad_norm": 2.4844248294830322, + "learning_rate": 0.00015391459074733097, + "loss": 1.5791, + "step": 2600 + }, + { + "epoch": 1.156, + "grad_norm": 1.2558754682540894, + "learning_rate": 0.00015389679715302492, + "loss": 2.402, + "step": 2601 + }, + { + "epoch": 1.1564444444444444, + "grad_norm": 1.8104875087738037, + "learning_rate": 0.00015387900355871888, + "loss": 1.0876, + "step": 2602 + }, + { + "epoch": 1.1568888888888889, + "grad_norm": 1.2649706602096558, + "learning_rate": 0.0001538612099644128, + "loss": 2.1157, + "step": 2603 + }, + { + "epoch": 1.1573333333333333, + "grad_norm": 1.4647830724716187, + "learning_rate": 0.00015384341637010677, + "loss": 2.2213, + "step": 2604 + }, + { + "epoch": 1.1577777777777778, + "grad_norm": 1.4536770582199097, + "learning_rate": 0.00015382562277580072, + "loss": 2.0763, + "step": 2605 + }, + { + "epoch": 1.1582222222222223, + "grad_norm": 1.4016244411468506, + "learning_rate": 0.00015380782918149468, + "loss": 2.3909, + "step": 2606 + }, + { + "epoch": 1.1586666666666667, + "grad_norm": 1.4496042728424072, + "learning_rate": 0.00015379003558718864, + "loss": 1.9667, + "step": 2607 + }, + { + "epoch": 1.1591111111111112, + "grad_norm": 1.384210228919983, + "learning_rate": 0.00015377224199288256, + "loss": 2.8226, + "step": 2608 + }, + { + "epoch": 1.1595555555555555, + "grad_norm": 1.5709418058395386, + "learning_rate": 0.00015375444839857652, + "loss": 2.7716, + "step": 2609 + }, + { + "epoch": 1.16, + "grad_norm": 1.6072601079940796, + "learning_rate": 0.00015373665480427045, + "loss": 2.2285, + "step": 2610 + }, + { + "epoch": 1.1604444444444444, + "grad_norm": 1.4224820137023926, + "learning_rate": 0.0001537188612099644, + "loss": 2.049, + "step": 2611 + }, + { + "epoch": 1.1608888888888889, + "grad_norm": 1.5850938558578491, + "learning_rate": 0.00015370106761565836, + "loss": 2.326, + "step": 2612 + }, + { + "epoch": 1.1613333333333333, + "grad_norm": 1.3699077367782593, + "learning_rate": 0.00015368327402135232, + "loss": 2.1054, + "step": 2613 + }, + { + "epoch": 1.1617777777777778, + "grad_norm": 1.5815706253051758, + "learning_rate": 0.00015366548042704628, + "loss": 2.4236, + "step": 2614 + }, + { + "epoch": 1.1622222222222223, + "grad_norm": 1.3918722867965698, + "learning_rate": 0.00015364768683274023, + "loss": 2.2995, + "step": 2615 + }, + { + "epoch": 1.1626666666666667, + "grad_norm": 1.4517208337783813, + "learning_rate": 0.00015362989323843416, + "loss": 1.8738, + "step": 2616 + }, + { + "epoch": 1.1631111111111112, + "grad_norm": 1.84287691116333, + "learning_rate": 0.00015361209964412812, + "loss": 1.3763, + "step": 2617 + }, + { + "epoch": 1.1635555555555555, + "grad_norm": 1.6634745597839355, + "learning_rate": 0.00015359430604982208, + "loss": 2.3855, + "step": 2618 + }, + { + "epoch": 1.164, + "grad_norm": 1.544952154159546, + "learning_rate": 0.00015357651245551603, + "loss": 1.9593, + "step": 2619 + }, + { + "epoch": 1.1644444444444444, + "grad_norm": 1.4931992292404175, + "learning_rate": 0.00015355871886121, + "loss": 2.1309, + "step": 2620 + }, + { + "epoch": 1.1648888888888889, + "grad_norm": 1.7971177101135254, + "learning_rate": 0.00015354092526690392, + "loss": 2.2084, + "step": 2621 + }, + { + "epoch": 1.1653333333333333, + "grad_norm": 1.4012914896011353, + "learning_rate": 0.00015352313167259787, + "loss": 1.5844, + "step": 2622 + }, + { + "epoch": 1.1657777777777778, + "grad_norm": 1.5000579357147217, + "learning_rate": 0.0001535053380782918, + "loss": 1.8046, + "step": 2623 + }, + { + "epoch": 
1.1662222222222223, + "grad_norm": 1.830424427986145, + "learning_rate": 0.00015348754448398576, + "loss": 2.6892, + "step": 2624 + }, + { + "epoch": 1.1666666666666667, + "grad_norm": 1.5387390851974487, + "learning_rate": 0.00015346975088967972, + "loss": 1.9499, + "step": 2625 + }, + { + "epoch": 1.1671111111111112, + "grad_norm": 1.5739384889602661, + "learning_rate": 0.00015345195729537367, + "loss": 2.0548, + "step": 2626 + }, + { + "epoch": 1.1675555555555555, + "grad_norm": 1.7389553785324097, + "learning_rate": 0.00015343416370106763, + "loss": 2.3472, + "step": 2627 + }, + { + "epoch": 1.168, + "grad_norm": 1.8690800666809082, + "learning_rate": 0.00015341637010676159, + "loss": 2.5385, + "step": 2628 + }, + { + "epoch": 1.1684444444444444, + "grad_norm": 1.6705001592636108, + "learning_rate": 0.00015339857651245552, + "loss": 2.2232, + "step": 2629 + }, + { + "epoch": 1.1688888888888889, + "grad_norm": 1.811651587486267, + "learning_rate": 0.00015338078291814947, + "loss": 2.3896, + "step": 2630 + }, + { + "epoch": 1.1693333333333333, + "grad_norm": 2.812932252883911, + "learning_rate": 0.00015336298932384343, + "loss": 2.3391, + "step": 2631 + }, + { + "epoch": 1.1697777777777778, + "grad_norm": 1.434435248374939, + "learning_rate": 0.00015334519572953739, + "loss": 0.9854, + "step": 2632 + }, + { + "epoch": 1.1702222222222223, + "grad_norm": 1.7616302967071533, + "learning_rate": 0.00015332740213523134, + "loss": 2.0482, + "step": 2633 + }, + { + "epoch": 1.1706666666666667, + "grad_norm": 2.4221534729003906, + "learning_rate": 0.00015330960854092527, + "loss": 2.338, + "step": 2634 + }, + { + "epoch": 1.1711111111111112, + "grad_norm": 1.9369844198226929, + "learning_rate": 0.00015329181494661923, + "loss": 1.8924, + "step": 2635 + }, + { + "epoch": 1.1715555555555555, + "grad_norm": 2.09136700630188, + "learning_rate": 0.00015327402135231316, + "loss": 2.5067, + "step": 2636 + }, + { + "epoch": 1.172, + "grad_norm": 1.7432854175567627, + "learning_rate": 0.0001532562277580071, + "loss": 1.6748, + "step": 2637 + }, + { + "epoch": 1.1724444444444444, + "grad_norm": 1.8707739114761353, + "learning_rate": 0.00015323843416370107, + "loss": 2.3218, + "step": 2638 + }, + { + "epoch": 1.1728888888888889, + "grad_norm": 1.9009658098220825, + "learning_rate": 0.00015322064056939503, + "loss": 2.1614, + "step": 2639 + }, + { + "epoch": 1.1733333333333333, + "grad_norm": 2.321162462234497, + "learning_rate": 0.00015320284697508898, + "loss": 2.0851, + "step": 2640 + }, + { + "epoch": 1.1737777777777778, + "grad_norm": 1.7875491380691528, + "learning_rate": 0.00015318505338078294, + "loss": 1.7379, + "step": 2641 + }, + { + "epoch": 1.1742222222222223, + "grad_norm": 1.9961577653884888, + "learning_rate": 0.00015316725978647687, + "loss": 2.4488, + "step": 2642 + }, + { + "epoch": 1.1746666666666667, + "grad_norm": 2.089043617248535, + "learning_rate": 0.00015314946619217083, + "loss": 2.2587, + "step": 2643 + }, + { + "epoch": 1.1751111111111112, + "grad_norm": 2.016988754272461, + "learning_rate": 0.00015313167259786478, + "loss": 2.3505, + "step": 2644 + }, + { + "epoch": 1.1755555555555555, + "grad_norm": 3.9953866004943848, + "learning_rate": 0.00015311387900355874, + "loss": 2.6823, + "step": 2645 + }, + { + "epoch": 1.176, + "grad_norm": 2.324265956878662, + "learning_rate": 0.00015309608540925267, + "loss": 2.0942, + "step": 2646 + }, + { + "epoch": 1.1764444444444444, + "grad_norm": 1.8716621398925781, + "learning_rate": 0.00015307829181494662, + "loss": 1.8976, + "step": 2647 + 
}, + { + "epoch": 1.1768888888888889, + "grad_norm": 2.1721549034118652, + "learning_rate": 0.00015306049822064058, + "loss": 2.3372, + "step": 2648 + }, + { + "epoch": 1.1773333333333333, + "grad_norm": 2.4310812950134277, + "learning_rate": 0.0001530427046263345, + "loss": 2.3681, + "step": 2649 + }, + { + "epoch": 1.1777777777777778, + "grad_norm": 2.1429104804992676, + "learning_rate": 0.00015302491103202847, + "loss": 1.3194, + "step": 2650 + }, + { + "epoch": 1.1782222222222223, + "grad_norm": 1.4743183851242065, + "learning_rate": 0.00015300711743772242, + "loss": 2.7132, + "step": 2651 + }, + { + "epoch": 1.1786666666666668, + "grad_norm": 1.449602723121643, + "learning_rate": 0.00015298932384341638, + "loss": 2.0043, + "step": 2652 + }, + { + "epoch": 1.1791111111111112, + "grad_norm": 1.5418530702590942, + "learning_rate": 0.00015297153024911034, + "loss": 2.4505, + "step": 2653 + }, + { + "epoch": 1.1795555555555555, + "grad_norm": 1.408302664756775, + "learning_rate": 0.0001529537366548043, + "loss": 2.1342, + "step": 2654 + }, + { + "epoch": 1.18, + "grad_norm": 1.6608649492263794, + "learning_rate": 0.00015293594306049822, + "loss": 2.3212, + "step": 2655 + }, + { + "epoch": 1.1804444444444444, + "grad_norm": 1.560037612915039, + "learning_rate": 0.00015291814946619218, + "loss": 2.9029, + "step": 2656 + }, + { + "epoch": 1.1808888888888889, + "grad_norm": 1.5058655738830566, + "learning_rate": 0.00015290035587188613, + "loss": 2.5045, + "step": 2657 + }, + { + "epoch": 1.1813333333333333, + "grad_norm": 1.5224006175994873, + "learning_rate": 0.0001528825622775801, + "loss": 2.2639, + "step": 2658 + }, + { + "epoch": 1.1817777777777778, + "grad_norm": 1.611315131187439, + "learning_rate": 0.00015286476868327402, + "loss": 2.594, + "step": 2659 + }, + { + "epoch": 1.1822222222222223, + "grad_norm": 1.6001996994018555, + "learning_rate": 0.00015284697508896798, + "loss": 2.6139, + "step": 2660 + }, + { + "epoch": 1.1826666666666668, + "grad_norm": 1.5929144620895386, + "learning_rate": 0.00015282918149466193, + "loss": 2.6785, + "step": 2661 + }, + { + "epoch": 1.1831111111111112, + "grad_norm": 1.6971992254257202, + "learning_rate": 0.00015281138790035586, + "loss": 2.364, + "step": 2662 + }, + { + "epoch": 1.1835555555555555, + "grad_norm": 1.468849539756775, + "learning_rate": 0.00015279359430604982, + "loss": 1.773, + "step": 2663 + }, + { + "epoch": 1.184, + "grad_norm": 1.352769374847412, + "learning_rate": 0.00015277580071174378, + "loss": 2.5955, + "step": 2664 + }, + { + "epoch": 1.1844444444444444, + "grad_norm": 1.4742112159729004, + "learning_rate": 0.00015275800711743773, + "loss": 1.9913, + "step": 2665 + }, + { + "epoch": 1.1848888888888889, + "grad_norm": 1.3926454782485962, + "learning_rate": 0.0001527402135231317, + "loss": 1.8751, + "step": 2666 + }, + { + "epoch": 1.1853333333333333, + "grad_norm": 1.6356984376907349, + "learning_rate": 0.00015272241992882565, + "loss": 2.353, + "step": 2667 + }, + { + "epoch": 1.1857777777777778, + "grad_norm": 1.5044867992401123, + "learning_rate": 0.00015270462633451958, + "loss": 1.988, + "step": 2668 + }, + { + "epoch": 1.1862222222222223, + "grad_norm": 1.8624123334884644, + "learning_rate": 0.00015268683274021353, + "loss": 2.5895, + "step": 2669 + }, + { + "epoch": 1.1866666666666668, + "grad_norm": 1.6830346584320068, + "learning_rate": 0.0001526690391459075, + "loss": 1.3593, + "step": 2670 + }, + { + "epoch": 1.1871111111111112, + "grad_norm": 1.6659824848175049, + "learning_rate": 0.00015265124555160144, + 
"loss": 1.8984, + "step": 2671 + }, + { + "epoch": 1.1875555555555555, + "grad_norm": 1.5918940305709839, + "learning_rate": 0.00015263345195729537, + "loss": 2.0415, + "step": 2672 + }, + { + "epoch": 1.188, + "grad_norm": 1.8598551750183105, + "learning_rate": 0.00015261565836298933, + "loss": 2.1718, + "step": 2673 + }, + { + "epoch": 1.1884444444444444, + "grad_norm": 1.764703631401062, + "learning_rate": 0.0001525978647686833, + "loss": 2.0413, + "step": 2674 + }, + { + "epoch": 1.1888888888888889, + "grad_norm": 1.7580000162124634, + "learning_rate": 0.00015258007117437722, + "loss": 2.1583, + "step": 2675 + }, + { + "epoch": 1.1893333333333334, + "grad_norm": 1.7123738527297974, + "learning_rate": 0.00015256227758007117, + "loss": 2.0192, + "step": 2676 + }, + { + "epoch": 1.1897777777777778, + "grad_norm": 2.88462233543396, + "learning_rate": 0.00015254448398576513, + "loss": 1.9526, + "step": 2677 + }, + { + "epoch": 1.1902222222222223, + "grad_norm": 1.6699835062026978, + "learning_rate": 0.00015252669039145909, + "loss": 2.0907, + "step": 2678 + }, + { + "epoch": 1.1906666666666668, + "grad_norm": 2.0208330154418945, + "learning_rate": 0.00015250889679715304, + "loss": 2.3712, + "step": 2679 + }, + { + "epoch": 1.1911111111111112, + "grad_norm": 1.9458266496658325, + "learning_rate": 0.000152491103202847, + "loss": 2.1126, + "step": 2680 + }, + { + "epoch": 1.1915555555555555, + "grad_norm": 1.6024980545043945, + "learning_rate": 0.00015247330960854093, + "loss": 1.8988, + "step": 2681 + }, + { + "epoch": 1.192, + "grad_norm": 1.7899705171585083, + "learning_rate": 0.00015245551601423488, + "loss": 2.0417, + "step": 2682 + }, + { + "epoch": 1.1924444444444444, + "grad_norm": 1.6227293014526367, + "learning_rate": 0.00015243772241992884, + "loss": 2.1364, + "step": 2683 + }, + { + "epoch": 1.1928888888888889, + "grad_norm": 1.7193636894226074, + "learning_rate": 0.0001524199288256228, + "loss": 1.9889, + "step": 2684 + }, + { + "epoch": 1.1933333333333334, + "grad_norm": 1.7960073947906494, + "learning_rate": 0.00015240213523131673, + "loss": 2.0971, + "step": 2685 + }, + { + "epoch": 1.1937777777777778, + "grad_norm": 1.8026853799819946, + "learning_rate": 0.00015238434163701068, + "loss": 2.0781, + "step": 2686 + }, + { + "epoch": 1.1942222222222223, + "grad_norm": 1.0487536191940308, + "learning_rate": 0.00015236654804270464, + "loss": 0.076, + "step": 2687 + }, + { + "epoch": 1.1946666666666665, + "grad_norm": 2.0273492336273193, + "learning_rate": 0.00015234875444839857, + "loss": 2.2794, + "step": 2688 + }, + { + "epoch": 1.1951111111111112, + "grad_norm": 1.8268475532531738, + "learning_rate": 0.00015233096085409253, + "loss": 1.9992, + "step": 2689 + }, + { + "epoch": 1.1955555555555555, + "grad_norm": 1.6929086446762085, + "learning_rate": 0.00015231316725978648, + "loss": 2.0807, + "step": 2690 + }, + { + "epoch": 1.196, + "grad_norm": 2.158275842666626, + "learning_rate": 0.00015229537366548044, + "loss": 2.2911, + "step": 2691 + }, + { + "epoch": 1.1964444444444444, + "grad_norm": 1.98186457157135, + "learning_rate": 0.0001522775800711744, + "loss": 2.0243, + "step": 2692 + }, + { + "epoch": 1.196888888888889, + "grad_norm": 1.696062445640564, + "learning_rate": 0.00015225978647686832, + "loss": 1.8214, + "step": 2693 + }, + { + "epoch": 1.1973333333333334, + "grad_norm": 2.215367078781128, + "learning_rate": 0.00015224199288256228, + "loss": 2.1777, + "step": 2694 + }, + { + "epoch": 1.1977777777777778, + "grad_norm": 2.0742318630218506, + "learning_rate": 
0.00015222419928825624, + "loss": 2.1899, + "step": 2695 + }, + { + "epoch": 1.1982222222222223, + "grad_norm": 2.0556631088256836, + "learning_rate": 0.0001522064056939502, + "loss": 2.1347, + "step": 2696 + }, + { + "epoch": 1.1986666666666665, + "grad_norm": 2.068554162979126, + "learning_rate": 0.00015218861209964415, + "loss": 1.8877, + "step": 2697 + }, + { + "epoch": 1.199111111111111, + "grad_norm": 2.118912696838379, + "learning_rate": 0.00015217081850533808, + "loss": 2.1662, + "step": 2698 + }, + { + "epoch": 1.1995555555555555, + "grad_norm": 1.5802162885665894, + "learning_rate": 0.00015215302491103204, + "loss": 0.963, + "step": 2699 + }, + { + "epoch": 1.2, + "grad_norm": 2.575432062149048, + "learning_rate": 0.00015213523131672597, + "loss": 2.3078, + "step": 2700 + }, + { + "epoch": 1.2004444444444444, + "grad_norm": 1.573493480682373, + "learning_rate": 0.00015211743772241992, + "loss": 1.3132, + "step": 2701 + }, + { + "epoch": 1.200888888888889, + "grad_norm": 1.0483604669570923, + "learning_rate": 0.00015209964412811388, + "loss": 1.0147, + "step": 2702 + }, + { + "epoch": 1.2013333333333334, + "grad_norm": 0.8470864295959473, + "learning_rate": 0.00015208185053380784, + "loss": 0.9255, + "step": 2703 + }, + { + "epoch": 1.2017777777777778, + "grad_norm": 1.5103946924209595, + "learning_rate": 0.0001520640569395018, + "loss": 1.7368, + "step": 2704 + }, + { + "epoch": 1.2022222222222223, + "grad_norm": 1.2146368026733398, + "learning_rate": 0.00015204626334519575, + "loss": 1.5886, + "step": 2705 + }, + { + "epoch": 1.2026666666666666, + "grad_norm": 1.3831676244735718, + "learning_rate": 0.00015202846975088968, + "loss": 2.1525, + "step": 2706 + }, + { + "epoch": 1.203111111111111, + "grad_norm": 1.5528510808944702, + "learning_rate": 0.00015201067615658363, + "loss": 2.5657, + "step": 2707 + }, + { + "epoch": 1.2035555555555555, + "grad_norm": 1.551809549331665, + "learning_rate": 0.0001519928825622776, + "loss": 2.2547, + "step": 2708 + }, + { + "epoch": 1.204, + "grad_norm": 1.305998682975769, + "learning_rate": 0.00015197508896797155, + "loss": 1.3105, + "step": 2709 + }, + { + "epoch": 1.2044444444444444, + "grad_norm": 1.868577480316162, + "learning_rate": 0.0001519572953736655, + "loss": 2.6362, + "step": 2710 + }, + { + "epoch": 1.204888888888889, + "grad_norm": 1.6573137044906616, + "learning_rate": 0.00015193950177935943, + "loss": 2.3689, + "step": 2711 + }, + { + "epoch": 1.2053333333333334, + "grad_norm": 1.4828029870986938, + "learning_rate": 0.0001519217081850534, + "loss": 1.2541, + "step": 2712 + }, + { + "epoch": 1.2057777777777778, + "grad_norm": 1.4731237888336182, + "learning_rate": 0.00015190391459074732, + "loss": 2.0792, + "step": 2713 + }, + { + "epoch": 1.2062222222222223, + "grad_norm": 1.6513289213180542, + "learning_rate": 0.00015188612099644128, + "loss": 2.2811, + "step": 2714 + }, + { + "epoch": 1.2066666666666666, + "grad_norm": 1.5226035118103027, + "learning_rate": 0.00015186832740213523, + "loss": 2.117, + "step": 2715 + }, + { + "epoch": 1.207111111111111, + "grad_norm": 1.672688603401184, + "learning_rate": 0.0001518505338078292, + "loss": 2.0124, + "step": 2716 + }, + { + "epoch": 1.2075555555555555, + "grad_norm": 1.6700776815414429, + "learning_rate": 0.00015183274021352315, + "loss": 2.0039, + "step": 2717 + }, + { + "epoch": 1.208, + "grad_norm": 1.613197922706604, + "learning_rate": 0.0001518149466192171, + "loss": 2.182, + "step": 2718 + }, + { + "epoch": 1.2084444444444444, + "grad_norm": 1.8841910362243652, + 
"learning_rate": 0.00015179715302491103, + "loss": 2.4719, + "step": 2719 + }, + { + "epoch": 1.208888888888889, + "grad_norm": 1.5783162117004395, + "learning_rate": 0.000151779359430605, + "loss": 2.2804, + "step": 2720 + }, + { + "epoch": 1.2093333333333334, + "grad_norm": 1.5242904424667358, + "learning_rate": 0.00015176156583629894, + "loss": 2.322, + "step": 2721 + }, + { + "epoch": 1.2097777777777778, + "grad_norm": 1.4535586833953857, + "learning_rate": 0.0001517437722419929, + "loss": 1.9133, + "step": 2722 + }, + { + "epoch": 1.2102222222222223, + "grad_norm": 2.0285212993621826, + "learning_rate": 0.00015172597864768686, + "loss": 2.0526, + "step": 2723 + }, + { + "epoch": 1.2106666666666666, + "grad_norm": 1.6039782762527466, + "learning_rate": 0.0001517081850533808, + "loss": 1.8729, + "step": 2724 + }, + { + "epoch": 1.211111111111111, + "grad_norm": 1.4324172735214233, + "learning_rate": 0.00015169039145907474, + "loss": 1.6432, + "step": 2725 + }, + { + "epoch": 1.2115555555555555, + "grad_norm": 1.6351962089538574, + "learning_rate": 0.00015167259786476867, + "loss": 2.4551, + "step": 2726 + }, + { + "epoch": 1.212, + "grad_norm": 1.7832766771316528, + "learning_rate": 0.00015165480427046263, + "loss": 2.1844, + "step": 2727 + }, + { + "epoch": 1.2124444444444444, + "grad_norm": 1.8114533424377441, + "learning_rate": 0.00015163701067615659, + "loss": 2.1506, + "step": 2728 + }, + { + "epoch": 1.212888888888889, + "grad_norm": 1.875593900680542, + "learning_rate": 0.00015161921708185054, + "loss": 2.0954, + "step": 2729 + }, + { + "epoch": 1.2133333333333334, + "grad_norm": 1.673852562904358, + "learning_rate": 0.0001516014234875445, + "loss": 2.1643, + "step": 2730 + }, + { + "epoch": 1.2137777777777778, + "grad_norm": 1.6588380336761475, + "learning_rate": 0.00015158362989323845, + "loss": 1.6013, + "step": 2731 + }, + { + "epoch": 1.2142222222222223, + "grad_norm": 1.8047765493392944, + "learning_rate": 0.00015156583629893238, + "loss": 2.441, + "step": 2732 + }, + { + "epoch": 1.2146666666666666, + "grad_norm": 1.8153445720672607, + "learning_rate": 0.00015154804270462634, + "loss": 2.2575, + "step": 2733 + }, + { + "epoch": 1.215111111111111, + "grad_norm": 1.9475229978561401, + "learning_rate": 0.0001515302491103203, + "loss": 2.0499, + "step": 2734 + }, + { + "epoch": 1.2155555555555555, + "grad_norm": 1.8662759065628052, + "learning_rate": 0.00015151245551601425, + "loss": 1.9091, + "step": 2735 + }, + { + "epoch": 1.216, + "grad_norm": 1.9955482482910156, + "learning_rate": 0.0001514946619217082, + "loss": 2.5757, + "step": 2736 + }, + { + "epoch": 1.2164444444444444, + "grad_norm": 1.897420883178711, + "learning_rate": 0.00015147686832740214, + "loss": 1.8366, + "step": 2737 + }, + { + "epoch": 1.216888888888889, + "grad_norm": 1.6787134408950806, + "learning_rate": 0.0001514590747330961, + "loss": 0.962, + "step": 2738 + }, + { + "epoch": 1.2173333333333334, + "grad_norm": 1.790507197380066, + "learning_rate": 0.00015144128113879003, + "loss": 2.1025, + "step": 2739 + }, + { + "epoch": 1.2177777777777778, + "grad_norm": 2.025376558303833, + "learning_rate": 0.00015142348754448398, + "loss": 2.4571, + "step": 2740 + }, + { + "epoch": 1.2182222222222223, + "grad_norm": 2.094409942626953, + "learning_rate": 0.00015140569395017794, + "loss": 2.2626, + "step": 2741 + }, + { + "epoch": 1.2186666666666666, + "grad_norm": 2.1918323040008545, + "learning_rate": 0.0001513879003558719, + "loss": 2.4293, + "step": 2742 + }, + { + "epoch": 1.219111111111111, + "grad_norm": 
2.2895400524139404, + "learning_rate": 0.00015137010676156585, + "loss": 2.4093, + "step": 2743 + }, + { + "epoch": 1.2195555555555555, + "grad_norm": 2.0072021484375, + "learning_rate": 0.0001513523131672598, + "loss": 2.3826, + "step": 2744 + }, + { + "epoch": 1.22, + "grad_norm": 2.0313949584960938, + "learning_rate": 0.00015133451957295374, + "loss": 1.6368, + "step": 2745 + }, + { + "epoch": 1.2204444444444444, + "grad_norm": 2.9698612689971924, + "learning_rate": 0.0001513167259786477, + "loss": 2.5953, + "step": 2746 + }, + { + "epoch": 1.220888888888889, + "grad_norm": 2.604888439178467, + "learning_rate": 0.00015129893238434165, + "loss": 2.9221, + "step": 2747 + }, + { + "epoch": 1.2213333333333334, + "grad_norm": 2.023358106613159, + "learning_rate": 0.0001512811387900356, + "loss": 1.8182, + "step": 2748 + }, + { + "epoch": 1.2217777777777779, + "grad_norm": 2.1045310497283936, + "learning_rate": 0.00015126334519572956, + "loss": 1.6869, + "step": 2749 + }, + { + "epoch": 1.2222222222222223, + "grad_norm": 2.9360511302948, + "learning_rate": 0.0001512455516014235, + "loss": 2.0189, + "step": 2750 + }, + { + "epoch": 1.2226666666666666, + "grad_norm": 1.2831294536590576, + "learning_rate": 0.00015122775800711745, + "loss": 2.4483, + "step": 2751 + }, + { + "epoch": 1.223111111111111, + "grad_norm": 1.1311028003692627, + "learning_rate": 0.00015120996441281138, + "loss": 1.2781, + "step": 2752 + }, + { + "epoch": 1.2235555555555555, + "grad_norm": 1.4160290956497192, + "learning_rate": 0.00015119217081850534, + "loss": 2.2221, + "step": 2753 + }, + { + "epoch": 1.224, + "grad_norm": 1.3835599422454834, + "learning_rate": 0.0001511743772241993, + "loss": 2.2433, + "step": 2754 + }, + { + "epoch": 1.2244444444444444, + "grad_norm": 1.6253594160079956, + "learning_rate": 0.00015115658362989325, + "loss": 1.8161, + "step": 2755 + }, + { + "epoch": 1.224888888888889, + "grad_norm": 1.227462649345398, + "learning_rate": 0.0001511387900355872, + "loss": 1.9292, + "step": 2756 + }, + { + "epoch": 1.2253333333333334, + "grad_norm": 1.425794005393982, + "learning_rate": 0.00015112099644128116, + "loss": 1.6751, + "step": 2757 + }, + { + "epoch": 1.2257777777777779, + "grad_norm": 0.8352449536323547, + "learning_rate": 0.0001511032028469751, + "loss": 0.042, + "step": 2758 + }, + { + "epoch": 1.2262222222222223, + "grad_norm": 1.41720449924469, + "learning_rate": 0.00015108540925266905, + "loss": 2.2062, + "step": 2759 + }, + { + "epoch": 1.2266666666666666, + "grad_norm": 1.4105916023254395, + "learning_rate": 0.000151067615658363, + "loss": 1.8583, + "step": 2760 + }, + { + "epoch": 1.227111111111111, + "grad_norm": 1.653696060180664, + "learning_rate": 0.00015104982206405696, + "loss": 1.4183, + "step": 2761 + }, + { + "epoch": 1.2275555555555555, + "grad_norm": 1.5550695657730103, + "learning_rate": 0.0001510320284697509, + "loss": 2.0958, + "step": 2762 + }, + { + "epoch": 1.228, + "grad_norm": 1.5534552335739136, + "learning_rate": 0.00015101423487544485, + "loss": 1.1471, + "step": 2763 + }, + { + "epoch": 1.2284444444444444, + "grad_norm": 1.956020474433899, + "learning_rate": 0.0001509964412811388, + "loss": 2.408, + "step": 2764 + }, + { + "epoch": 1.228888888888889, + "grad_norm": 1.5008649826049805, + "learning_rate": 0.00015097864768683273, + "loss": 1.9145, + "step": 2765 + }, + { + "epoch": 1.2293333333333334, + "grad_norm": 1.4730578660964966, + "learning_rate": 0.0001509608540925267, + "loss": 1.8439, + "step": 2766 + }, + { + "epoch": 1.2297777777777779, + "grad_norm": 
1.7233079671859741, + "learning_rate": 0.00015094306049822064, + "loss": 2.4389, + "step": 2767 + }, + { + "epoch": 1.2302222222222223, + "grad_norm": 1.4450547695159912, + "learning_rate": 0.0001509252669039146, + "loss": 2.1717, + "step": 2768 + }, + { + "epoch": 1.2306666666666666, + "grad_norm": 1.7250124216079712, + "learning_rate": 0.00015090747330960856, + "loss": 2.198, + "step": 2769 + }, + { + "epoch": 1.231111111111111, + "grad_norm": 2.009876012802124, + "learning_rate": 0.00015088967971530251, + "loss": 2.6704, + "step": 2770 + }, + { + "epoch": 1.2315555555555555, + "grad_norm": 1.9580446481704712, + "learning_rate": 0.00015087188612099644, + "loss": 1.803, + "step": 2771 + }, + { + "epoch": 1.232, + "grad_norm": 1.7807945013046265, + "learning_rate": 0.0001508540925266904, + "loss": 2.401, + "step": 2772 + }, + { + "epoch": 1.2324444444444445, + "grad_norm": 1.9631510972976685, + "learning_rate": 0.00015083629893238436, + "loss": 2.762, + "step": 2773 + }, + { + "epoch": 1.232888888888889, + "grad_norm": 1.7688226699829102, + "learning_rate": 0.0001508185053380783, + "loss": 2.5009, + "step": 2774 + }, + { + "epoch": 1.2333333333333334, + "grad_norm": 1.898895263671875, + "learning_rate": 0.00015080071174377224, + "loss": 2.1815, + "step": 2775 + }, + { + "epoch": 1.2337777777777779, + "grad_norm": 1.748230218887329, + "learning_rate": 0.0001507829181494662, + "loss": 2.1702, + "step": 2776 + }, + { + "epoch": 1.2342222222222223, + "grad_norm": 1.829336404800415, + "learning_rate": 0.00015076512455516016, + "loss": 2.6297, + "step": 2777 + }, + { + "epoch": 1.2346666666666666, + "grad_norm": 1.7293047904968262, + "learning_rate": 0.00015074733096085409, + "loss": 2.0303, + "step": 2778 + }, + { + "epoch": 1.235111111111111, + "grad_norm": 1.6625522375106812, + "learning_rate": 0.00015072953736654804, + "loss": 1.9354, + "step": 2779 + }, + { + "epoch": 1.2355555555555555, + "grad_norm": 1.653939127922058, + "learning_rate": 0.000150711743772242, + "loss": 1.7279, + "step": 2780 + }, + { + "epoch": 1.236, + "grad_norm": 1.7402019500732422, + "learning_rate": 0.00015069395017793595, + "loss": 1.8008, + "step": 2781 + }, + { + "epoch": 1.2364444444444445, + "grad_norm": 1.5978055000305176, + "learning_rate": 0.0001506761565836299, + "loss": 2.0594, + "step": 2782 + }, + { + "epoch": 1.236888888888889, + "grad_norm": 1.8129159212112427, + "learning_rate": 0.00015065836298932384, + "loss": 1.8207, + "step": 2783 + }, + { + "epoch": 1.2373333333333334, + "grad_norm": 1.7434604167938232, + "learning_rate": 0.0001506405693950178, + "loss": 1.9957, + "step": 2784 + }, + { + "epoch": 1.2377777777777779, + "grad_norm": 1.829714059829712, + "learning_rate": 0.00015062277580071175, + "loss": 2.2293, + "step": 2785 + }, + { + "epoch": 1.2382222222222223, + "grad_norm": 2.231995105743408, + "learning_rate": 0.0001506049822064057, + "loss": 2.5919, + "step": 2786 + }, + { + "epoch": 1.2386666666666666, + "grad_norm": 2.2278225421905518, + "learning_rate": 0.00015058718861209967, + "loss": 2.299, + "step": 2787 + }, + { + "epoch": 1.239111111111111, + "grad_norm": 1.5640493631362915, + "learning_rate": 0.0001505693950177936, + "loss": 1.2429, + "step": 2788 + }, + { + "epoch": 1.2395555555555555, + "grad_norm": 1.987496256828308, + "learning_rate": 0.00015055160142348755, + "loss": 1.1852, + "step": 2789 + }, + { + "epoch": 1.24, + "grad_norm": 1.910750389099121, + "learning_rate": 0.00015053380782918148, + "loss": 1.7426, + "step": 2790 + }, + { + "epoch": 1.2404444444444445, + "grad_norm": 
1.7743321657180786, + "learning_rate": 0.00015051601423487544, + "loss": 1.7287, + "step": 2791 + }, + { + "epoch": 1.240888888888889, + "grad_norm": 1.9322333335876465, + "learning_rate": 0.0001504982206405694, + "loss": 2.3926, + "step": 2792 + }, + { + "epoch": 1.2413333333333334, + "grad_norm": 1.9463812112808228, + "learning_rate": 0.00015048042704626335, + "loss": 2.2058, + "step": 2793 + }, + { + "epoch": 1.2417777777777779, + "grad_norm": 1.9254072904586792, + "learning_rate": 0.0001504626334519573, + "loss": 2.167, + "step": 2794 + }, + { + "epoch": 1.2422222222222223, + "grad_norm": 2.315269947052002, + "learning_rate": 0.00015044483985765126, + "loss": 1.988, + "step": 2795 + }, + { + "epoch": 1.2426666666666666, + "grad_norm": 2.6116104125976562, + "learning_rate": 0.0001504270462633452, + "loss": 2.5243, + "step": 2796 + }, + { + "epoch": 1.243111111111111, + "grad_norm": 2.500777244567871, + "learning_rate": 0.00015040925266903915, + "loss": 2.4406, + "step": 2797 + }, + { + "epoch": 1.2435555555555555, + "grad_norm": 2.5886335372924805, + "learning_rate": 0.0001503914590747331, + "loss": 1.7912, + "step": 2798 + }, + { + "epoch": 1.244, + "grad_norm": 2.798053503036499, + "learning_rate": 0.00015037366548042706, + "loss": 1.7156, + "step": 2799 + }, + { + "epoch": 1.2444444444444445, + "grad_norm": 3.800767660140991, + "learning_rate": 0.00015035587188612102, + "loss": 1.2951, + "step": 2800 + }, + { + "epoch": 1.244888888888889, + "grad_norm": 1.2219860553741455, + "learning_rate": 0.00015033807829181495, + "loss": 2.3513, + "step": 2801 + }, + { + "epoch": 1.2453333333333334, + "grad_norm": 1.1778558492660522, + "learning_rate": 0.0001503202846975089, + "loss": 2.2511, + "step": 2802 + }, + { + "epoch": 1.2457777777777779, + "grad_norm": 1.369297981262207, + "learning_rate": 0.00015030249110320283, + "loss": 1.935, + "step": 2803 + }, + { + "epoch": 1.2462222222222223, + "grad_norm": 1.6750556230545044, + "learning_rate": 0.0001502846975088968, + "loss": 2.3057, + "step": 2804 + }, + { + "epoch": 1.2466666666666666, + "grad_norm": 1.4354381561279297, + "learning_rate": 0.00015026690391459075, + "loss": 2.4099, + "step": 2805 + }, + { + "epoch": 1.247111111111111, + "grad_norm": 1.3664571046829224, + "learning_rate": 0.0001502491103202847, + "loss": 2.0848, + "step": 2806 + }, + { + "epoch": 1.2475555555555555, + "grad_norm": 1.3316177129745483, + "learning_rate": 0.00015023131672597866, + "loss": 2.1252, + "step": 2807 + }, + { + "epoch": 1.248, + "grad_norm": 1.3470760583877563, + "learning_rate": 0.00015021352313167262, + "loss": 2.3662, + "step": 2808 + }, + { + "epoch": 1.2484444444444445, + "grad_norm": 1.4858183860778809, + "learning_rate": 0.00015019572953736655, + "loss": 2.4822, + "step": 2809 + }, + { + "epoch": 1.248888888888889, + "grad_norm": 1.4876043796539307, + "learning_rate": 0.0001501779359430605, + "loss": 2.2044, + "step": 2810 + }, + { + "epoch": 1.2493333333333334, + "grad_norm": 1.807070016860962, + "learning_rate": 0.00015016014234875446, + "loss": 2.438, + "step": 2811 + }, + { + "epoch": 1.2497777777777777, + "grad_norm": 1.5529999732971191, + "learning_rate": 0.00015014234875444842, + "loss": 2.2415, + "step": 2812 + }, + { + "epoch": 1.2502222222222223, + "grad_norm": 1.5677090883255005, + "learning_rate": 0.00015012455516014237, + "loss": 2.1466, + "step": 2813 + }, + { + "epoch": 1.2506666666666666, + "grad_norm": 1.4437453746795654, + "learning_rate": 0.0001501067615658363, + "loss": 2.1075, + "step": 2814 + }, + { + "epoch": 
1.251111111111111, + "grad_norm": 1.7834696769714355, + "learning_rate": 0.00015008896797153026, + "loss": 2.6351, + "step": 2815 + }, + { + "epoch": 1.2515555555555555, + "grad_norm": 1.593764305114746, + "learning_rate": 0.0001500711743772242, + "loss": 2.1957, + "step": 2816 + }, + { + "epoch": 1.252, + "grad_norm": 1.5799425840377808, + "learning_rate": 0.00015005338078291814, + "loss": 2.7452, + "step": 2817 + }, + { + "epoch": 1.2524444444444445, + "grad_norm": 1.6034505367279053, + "learning_rate": 0.0001500355871886121, + "loss": 1.7773, + "step": 2818 + }, + { + "epoch": 1.252888888888889, + "grad_norm": 1.1954715251922607, + "learning_rate": 0.00015001779359430606, + "loss": 1.204, + "step": 2819 + }, + { + "epoch": 1.2533333333333334, + "grad_norm": 1.638249158859253, + "learning_rate": 0.00015000000000000001, + "loss": 2.394, + "step": 2820 + }, + { + "epoch": 1.2537777777777777, + "grad_norm": 1.640615463256836, + "learning_rate": 0.00014998220640569397, + "loss": 2.6767, + "step": 2821 + }, + { + "epoch": 1.2542222222222223, + "grad_norm": 1.6124838590621948, + "learning_rate": 0.0001499644128113879, + "loss": 2.2059, + "step": 2822 + }, + { + "epoch": 1.2546666666666666, + "grad_norm": 1.6705976724624634, + "learning_rate": 0.00014994661921708186, + "loss": 2.2517, + "step": 2823 + }, + { + "epoch": 1.255111111111111, + "grad_norm": 1.4649361371994019, + "learning_rate": 0.0001499288256227758, + "loss": 1.9224, + "step": 2824 + }, + { + "epoch": 1.2555555555555555, + "grad_norm": 1.701545238494873, + "learning_rate": 0.00014991103202846977, + "loss": 2.2677, + "step": 2825 + }, + { + "epoch": 1.256, + "grad_norm": 1.726928949356079, + "learning_rate": 0.00014989323843416373, + "loss": 2.1658, + "step": 2826 + }, + { + "epoch": 1.2564444444444445, + "grad_norm": 1.6811003684997559, + "learning_rate": 0.00014987544483985766, + "loss": 2.0167, + "step": 2827 + }, + { + "epoch": 1.256888888888889, + "grad_norm": 1.8641170263290405, + "learning_rate": 0.0001498576512455516, + "loss": 1.7901, + "step": 2828 + }, + { + "epoch": 1.2573333333333334, + "grad_norm": 1.6056395769119263, + "learning_rate": 0.00014983985765124554, + "loss": 1.6597, + "step": 2829 + }, + { + "epoch": 1.2577777777777777, + "grad_norm": 1.687373399734497, + "learning_rate": 0.0001498220640569395, + "loss": 1.9804, + "step": 2830 + }, + { + "epoch": 1.2582222222222224, + "grad_norm": 1.6241012811660767, + "learning_rate": 0.00014980427046263345, + "loss": 2.0725, + "step": 2831 + }, + { + "epoch": 1.2586666666666666, + "grad_norm": 1.4386781454086304, + "learning_rate": 0.0001497864768683274, + "loss": 1.1918, + "step": 2832 + }, + { + "epoch": 1.259111111111111, + "grad_norm": 1.7372790575027466, + "learning_rate": 0.00014976868327402137, + "loss": 2.2247, + "step": 2833 + }, + { + "epoch": 1.2595555555555555, + "grad_norm": 1.8010145425796509, + "learning_rate": 0.00014975088967971532, + "loss": 2.1944, + "step": 2834 + }, + { + "epoch": 1.26, + "grad_norm": 1.9382820129394531, + "learning_rate": 0.00014973309608540925, + "loss": 2.3833, + "step": 2835 + }, + { + "epoch": 1.2604444444444445, + "grad_norm": 1.9000599384307861, + "learning_rate": 0.0001497153024911032, + "loss": 1.7084, + "step": 2836 + }, + { + "epoch": 1.260888888888889, + "grad_norm": 2.1467716693878174, + "learning_rate": 0.00014969750889679717, + "loss": 2.0741, + "step": 2837 + }, + { + "epoch": 1.2613333333333334, + "grad_norm": 2.0739872455596924, + "learning_rate": 0.00014967971530249112, + "loss": 2.2452, + "step": 2838 + }, + { + 
"epoch": 1.2617777777777777, + "grad_norm": 1.7084413766860962, + "learning_rate": 0.00014966192170818508, + "loss": 1.3884, + "step": 2839 + }, + { + "epoch": 1.2622222222222224, + "grad_norm": 1.855447769165039, + "learning_rate": 0.000149644128113879, + "loss": 1.9037, + "step": 2840 + }, + { + "epoch": 1.2626666666666666, + "grad_norm": 1.8656028509140015, + "learning_rate": 0.00014962633451957296, + "loss": 1.7658, + "step": 2841 + }, + { + "epoch": 1.263111111111111, + "grad_norm": 2.142399549484253, + "learning_rate": 0.0001496085409252669, + "loss": 1.9569, + "step": 2842 + }, + { + "epoch": 1.2635555555555555, + "grad_norm": 1.9603620767593384, + "learning_rate": 0.00014959074733096085, + "loss": 1.9807, + "step": 2843 + }, + { + "epoch": 1.264, + "grad_norm": 2.183345317840576, + "learning_rate": 0.0001495729537366548, + "loss": 2.4544, + "step": 2844 + }, + { + "epoch": 1.2644444444444445, + "grad_norm": 2.205909490585327, + "learning_rate": 0.00014955516014234876, + "loss": 2.4811, + "step": 2845 + }, + { + "epoch": 1.264888888888889, + "grad_norm": 2.540581226348877, + "learning_rate": 0.00014953736654804272, + "loss": 2.9558, + "step": 2846 + }, + { + "epoch": 1.2653333333333334, + "grad_norm": 2.1151061058044434, + "learning_rate": 0.00014951957295373668, + "loss": 1.7681, + "step": 2847 + }, + { + "epoch": 1.2657777777777777, + "grad_norm": 2.5562145709991455, + "learning_rate": 0.0001495017793594306, + "loss": 2.3364, + "step": 2848 + }, + { + "epoch": 1.2662222222222224, + "grad_norm": 2.720233201980591, + "learning_rate": 0.00014948398576512456, + "loss": 1.7419, + "step": 2849 + }, + { + "epoch": 1.2666666666666666, + "grad_norm": 2.397717237472534, + "learning_rate": 0.00014946619217081852, + "loss": 1.9456, + "step": 2850 + }, + { + "epoch": 1.267111111111111, + "grad_norm": 1.2284705638885498, + "learning_rate": 0.00014944839857651248, + "loss": 2.54, + "step": 2851 + }, + { + "epoch": 1.2675555555555555, + "grad_norm": 1.3442673683166504, + "learning_rate": 0.00014943060498220643, + "loss": 2.1183, + "step": 2852 + }, + { + "epoch": 1.268, + "grad_norm": 1.3058741092681885, + "learning_rate": 0.00014941281138790036, + "loss": 2.0397, + "step": 2853 + }, + { + "epoch": 1.2684444444444445, + "grad_norm": 1.4303501844406128, + "learning_rate": 0.00014939501779359432, + "loss": 2.9471, + "step": 2854 + }, + { + "epoch": 1.268888888888889, + "grad_norm": 1.456242561340332, + "learning_rate": 0.00014937722419928825, + "loss": 2.1462, + "step": 2855 + }, + { + "epoch": 1.2693333333333334, + "grad_norm": 1.5883921384811401, + "learning_rate": 0.0001493594306049822, + "loss": 2.7315, + "step": 2856 + }, + { + "epoch": 1.2697777777777777, + "grad_norm": 1.4734134674072266, + "learning_rate": 0.00014934163701067616, + "loss": 1.95, + "step": 2857 + }, + { + "epoch": 1.2702222222222221, + "grad_norm": 1.4658904075622559, + "learning_rate": 0.00014932384341637012, + "loss": 2.5269, + "step": 2858 + }, + { + "epoch": 1.2706666666666666, + "grad_norm": 1.253811240196228, + "learning_rate": 0.00014930604982206407, + "loss": 1.3081, + "step": 2859 + }, + { + "epoch": 1.271111111111111, + "grad_norm": 1.178534984588623, + "learning_rate": 0.00014928825622775803, + "loss": 1.2465, + "step": 2860 + }, + { + "epoch": 1.2715555555555556, + "grad_norm": 1.6904065608978271, + "learning_rate": 0.00014927046263345196, + "loss": 2.314, + "step": 2861 + }, + { + "epoch": 1.272, + "grad_norm": 1.5300809144973755, + "learning_rate": 0.00014925266903914592, + "loss": 2.0894, + "step": 2862 + }, + 
{ + "epoch": 1.2724444444444445, + "grad_norm": 1.5079879760742188, + "learning_rate": 0.00014923487544483987, + "loss": 2.1044, + "step": 2863 + }, + { + "epoch": 1.272888888888889, + "grad_norm": 1.88065505027771, + "learning_rate": 0.00014921708185053383, + "loss": 2.1615, + "step": 2864 + }, + { + "epoch": 1.2733333333333334, + "grad_norm": 1.3186235427856445, + "learning_rate": 0.00014919928825622779, + "loss": 0.7738, + "step": 2865 + }, + { + "epoch": 1.2737777777777777, + "grad_norm": 2.2539889812469482, + "learning_rate": 0.00014918149466192171, + "loss": 2.2635, + "step": 2866 + }, + { + "epoch": 1.2742222222222221, + "grad_norm": 1.5537022352218628, + "learning_rate": 0.00014916370106761567, + "loss": 1.9448, + "step": 2867 + }, + { + "epoch": 1.2746666666666666, + "grad_norm": 1.676327109336853, + "learning_rate": 0.0001491459074733096, + "loss": 2.2653, + "step": 2868 + }, + { + "epoch": 1.275111111111111, + "grad_norm": 1.697751760482788, + "learning_rate": 0.00014912811387900356, + "loss": 2.2573, + "step": 2869 + }, + { + "epoch": 1.2755555555555556, + "grad_norm": 1.7857534885406494, + "learning_rate": 0.00014911032028469751, + "loss": 2.6627, + "step": 2870 + }, + { + "epoch": 1.276, + "grad_norm": 1.6772701740264893, + "learning_rate": 0.00014909252669039147, + "loss": 2.1949, + "step": 2871 + }, + { + "epoch": 1.2764444444444445, + "grad_norm": 1.546369194984436, + "learning_rate": 0.00014907473309608543, + "loss": 1.5786, + "step": 2872 + }, + { + "epoch": 1.276888888888889, + "grad_norm": 1.6441593170166016, + "learning_rate": 0.00014905693950177936, + "loss": 1.9288, + "step": 2873 + }, + { + "epoch": 1.2773333333333334, + "grad_norm": 1.613301396369934, + "learning_rate": 0.0001490391459074733, + "loss": 1.9617, + "step": 2874 + }, + { + "epoch": 1.2777777777777777, + "grad_norm": 2.057661771774292, + "learning_rate": 0.00014902135231316727, + "loss": 2.3796, + "step": 2875 + }, + { + "epoch": 1.2782222222222221, + "grad_norm": 2.0095505714416504, + "learning_rate": 0.00014900355871886123, + "loss": 2.1544, + "step": 2876 + }, + { + "epoch": 1.2786666666666666, + "grad_norm": 1.9702578783035278, + "learning_rate": 0.00014898576512455518, + "loss": 2.562, + "step": 2877 + }, + { + "epoch": 1.279111111111111, + "grad_norm": 1.7147190570831299, + "learning_rate": 0.00014896797153024914, + "loss": 2.1862, + "step": 2878 + }, + { + "epoch": 1.2795555555555556, + "grad_norm": 1.5411655902862549, + "learning_rate": 0.00014895017793594307, + "loss": 2.0377, + "step": 2879 + }, + { + "epoch": 1.28, + "grad_norm": 1.9299793243408203, + "learning_rate": 0.000148932384341637, + "loss": 2.4939, + "step": 2880 + }, + { + "epoch": 1.2804444444444445, + "grad_norm": 1.9820499420166016, + "learning_rate": 0.00014891459074733095, + "loss": 2.4094, + "step": 2881 + }, + { + "epoch": 1.280888888888889, + "grad_norm": 1.9093626737594604, + "learning_rate": 0.0001488967971530249, + "loss": 1.8737, + "step": 2882 + }, + { + "epoch": 1.2813333333333334, + "grad_norm": 1.7828611135482788, + "learning_rate": 0.00014887900355871887, + "loss": 2.3426, + "step": 2883 + }, + { + "epoch": 1.2817777777777777, + "grad_norm": 2.0726230144500732, + "learning_rate": 0.00014886120996441282, + "loss": 2.2123, + "step": 2884 + }, + { + "epoch": 1.2822222222222222, + "grad_norm": 1.8538103103637695, + "learning_rate": 0.00014884341637010678, + "loss": 1.9047, + "step": 2885 + }, + { + "epoch": 1.2826666666666666, + "grad_norm": 1.746737003326416, + "learning_rate": 0.0001488256227758007, + "loss": 1.693, 
+ "step": 2886 + }, + { + "epoch": 1.283111111111111, + "grad_norm": 1.8844788074493408, + "learning_rate": 0.00014880782918149467, + "loss": 2.0716, + "step": 2887 + }, + { + "epoch": 1.2835555555555556, + "grad_norm": 1.7158288955688477, + "learning_rate": 0.00014879003558718862, + "loss": 1.9545, + "step": 2888 + }, + { + "epoch": 1.284, + "grad_norm": 1.926275610923767, + "learning_rate": 0.00014877224199288258, + "loss": 1.8723, + "step": 2889 + }, + { + "epoch": 1.2844444444444445, + "grad_norm": 2.2394421100616455, + "learning_rate": 0.00014875444839857654, + "loss": 2.3997, + "step": 2890 + }, + { + "epoch": 1.284888888888889, + "grad_norm": 1.957261323928833, + "learning_rate": 0.00014873665480427046, + "loss": 2.1043, + "step": 2891 + }, + { + "epoch": 1.2853333333333334, + "grad_norm": 2.291721820831299, + "learning_rate": 0.00014871886120996442, + "loss": 2.213, + "step": 2892 + }, + { + "epoch": 1.2857777777777777, + "grad_norm": 1.8697887659072876, + "learning_rate": 0.00014870106761565835, + "loss": 1.8325, + "step": 2893 + }, + { + "epoch": 1.2862222222222222, + "grad_norm": 1.8223001956939697, + "learning_rate": 0.0001486832740213523, + "loss": 1.8901, + "step": 2894 + }, + { + "epoch": 1.2866666666666666, + "grad_norm": 1.9460232257843018, + "learning_rate": 0.00014866548042704626, + "loss": 2.0251, + "step": 2895 + }, + { + "epoch": 1.287111111111111, + "grad_norm": 2.430386543273926, + "learning_rate": 0.00014864768683274022, + "loss": 2.591, + "step": 2896 + }, + { + "epoch": 1.2875555555555556, + "grad_norm": 2.0571846961975098, + "learning_rate": 0.00014862989323843418, + "loss": 2.0458, + "step": 2897 + }, + { + "epoch": 1.288, + "grad_norm": 2.013607978820801, + "learning_rate": 0.00014861209964412813, + "loss": 1.9527, + "step": 2898 + }, + { + "epoch": 1.2884444444444445, + "grad_norm": 1.809848427772522, + "learning_rate": 0.00014859430604982206, + "loss": 1.2951, + "step": 2899 + }, + { + "epoch": 1.2888888888888888, + "grad_norm": 2.80146861076355, + "learning_rate": 0.00014857651245551602, + "loss": 1.8434, + "step": 2900 + }, + { + "epoch": 1.2893333333333334, + "grad_norm": 0.43238645792007446, + "learning_rate": 0.00014855871886120998, + "loss": 0.0395, + "step": 2901 + }, + { + "epoch": 1.2897777777777777, + "grad_norm": 1.3477979898452759, + "learning_rate": 0.00014854092526690393, + "loss": 2.1978, + "step": 2902 + }, + { + "epoch": 1.2902222222222222, + "grad_norm": 1.4748413562774658, + "learning_rate": 0.0001485231316725979, + "loss": 2.2269, + "step": 2903 + }, + { + "epoch": 1.2906666666666666, + "grad_norm": 1.8670775890350342, + "learning_rate": 0.00014850533807829182, + "loss": 2.5789, + "step": 2904 + }, + { + "epoch": 1.291111111111111, + "grad_norm": 1.5996267795562744, + "learning_rate": 0.00014848754448398577, + "loss": 2.0898, + "step": 2905 + }, + { + "epoch": 1.2915555555555556, + "grad_norm": 1.5340416431427002, + "learning_rate": 0.0001484697508896797, + "loss": 2.0439, + "step": 2906 + }, + { + "epoch": 1.292, + "grad_norm": 1.7059005498886108, + "learning_rate": 0.00014845195729537366, + "loss": 2.6905, + "step": 2907 + }, + { + "epoch": 1.2924444444444445, + "grad_norm": 1.3028349876403809, + "learning_rate": 0.00014843416370106762, + "loss": 1.1318, + "step": 2908 + }, + { + "epoch": 1.2928888888888888, + "grad_norm": 1.6985855102539062, + "learning_rate": 0.00014841637010676157, + "loss": 2.2406, + "step": 2909 + }, + { + "epoch": 1.2933333333333334, + "grad_norm": 1.416417121887207, + "learning_rate": 0.00014839857651245553, + 
"loss": 2.1449, + "step": 2910 + }, + { + "epoch": 1.2937777777777777, + "grad_norm": 1.791305422782898, + "learning_rate": 0.00014838078291814949, + "loss": 2.4623, + "step": 2911 + }, + { + "epoch": 1.2942222222222222, + "grad_norm": 1.3889151811599731, + "learning_rate": 0.00014836298932384342, + "loss": 1.965, + "step": 2912 + }, + { + "epoch": 1.2946666666666666, + "grad_norm": 1.8636940717697144, + "learning_rate": 0.00014834519572953737, + "loss": 2.9771, + "step": 2913 + }, + { + "epoch": 1.295111111111111, + "grad_norm": 1.9207584857940674, + "learning_rate": 0.00014832740213523133, + "loss": 2.3587, + "step": 2914 + }, + { + "epoch": 1.2955555555555556, + "grad_norm": 1.6041591167449951, + "learning_rate": 0.00014830960854092528, + "loss": 1.986, + "step": 2915 + }, + { + "epoch": 1.296, + "grad_norm": 1.9507296085357666, + "learning_rate": 0.00014829181494661924, + "loss": 2.7387, + "step": 2916 + }, + { + "epoch": 1.2964444444444445, + "grad_norm": 1.7080721855163574, + "learning_rate": 0.00014827402135231317, + "loss": 2.0368, + "step": 2917 + }, + { + "epoch": 1.2968888888888888, + "grad_norm": 1.7469477653503418, + "learning_rate": 0.00014825622775800713, + "loss": 2.0248, + "step": 2918 + }, + { + "epoch": 1.2973333333333334, + "grad_norm": 1.622348666191101, + "learning_rate": 0.00014823843416370106, + "loss": 2.3361, + "step": 2919 + }, + { + "epoch": 1.2977777777777777, + "grad_norm": 1.9793723821640015, + "learning_rate": 0.000148220640569395, + "loss": 2.7984, + "step": 2920 + }, + { + "epoch": 1.2982222222222222, + "grad_norm": 1.6501868963241577, + "learning_rate": 0.00014820284697508897, + "loss": 1.9731, + "step": 2921 + }, + { + "epoch": 1.2986666666666666, + "grad_norm": 1.7134915590286255, + "learning_rate": 0.00014818505338078293, + "loss": 2.3017, + "step": 2922 + }, + { + "epoch": 1.299111111111111, + "grad_norm": 1.6116629838943481, + "learning_rate": 0.00014816725978647688, + "loss": 2.0448, + "step": 2923 + }, + { + "epoch": 1.2995555555555556, + "grad_norm": 1.7651207447052002, + "learning_rate": 0.00014814946619217084, + "loss": 2.2993, + "step": 2924 + }, + { + "epoch": 1.3, + "grad_norm": 1.5447845458984375, + "learning_rate": 0.00014813167259786477, + "loss": 1.6308, + "step": 2925 + }, + { + "epoch": 1.3004444444444445, + "grad_norm": 1.505743145942688, + "learning_rate": 0.00014811387900355873, + "loss": 1.54, + "step": 2926 + }, + { + "epoch": 1.3008888888888888, + "grad_norm": 1.8078949451446533, + "learning_rate": 0.00014809608540925268, + "loss": 2.3919, + "step": 2927 + }, + { + "epoch": 1.3013333333333335, + "grad_norm": 1.799320101737976, + "learning_rate": 0.00014807829181494664, + "loss": 2.7337, + "step": 2928 + }, + { + "epoch": 1.3017777777777777, + "grad_norm": 1.7614754438400269, + "learning_rate": 0.0001480604982206406, + "loss": 2.1453, + "step": 2929 + }, + { + "epoch": 1.3022222222222222, + "grad_norm": 1.8952438831329346, + "learning_rate": 0.00014804270462633452, + "loss": 2.2258, + "step": 2930 + }, + { + "epoch": 1.3026666666666666, + "grad_norm": 1.7075976133346558, + "learning_rate": 0.00014802491103202848, + "loss": 2.0048, + "step": 2931 + }, + { + "epoch": 1.303111111111111, + "grad_norm": 1.847962737083435, + "learning_rate": 0.0001480071174377224, + "loss": 1.7736, + "step": 2932 + }, + { + "epoch": 1.3035555555555556, + "grad_norm": 1.7101470232009888, + "learning_rate": 0.00014798932384341637, + "loss": 2.0625, + "step": 2933 + }, + { + "epoch": 1.304, + "grad_norm": 1.760359764099121, + "learning_rate": 
0.00014797153024911032, + "loss": 1.7883, + "step": 2934 + }, + { + "epoch": 1.3044444444444445, + "grad_norm": 2.244199275970459, + "learning_rate": 0.00014795373665480428, + "loss": 2.3975, + "step": 2935 + }, + { + "epoch": 1.3048888888888888, + "grad_norm": 2.010221242904663, + "learning_rate": 0.00014793594306049824, + "loss": 2.2006, + "step": 2936 + }, + { + "epoch": 1.3053333333333335, + "grad_norm": 1.8990859985351562, + "learning_rate": 0.0001479181494661922, + "loss": 2.0714, + "step": 2937 + }, + { + "epoch": 1.3057777777777777, + "grad_norm": 1.8823907375335693, + "learning_rate": 0.00014790035587188612, + "loss": 1.9876, + "step": 2938 + }, + { + "epoch": 1.3062222222222222, + "grad_norm": 1.7615153789520264, + "learning_rate": 0.00014788256227758008, + "loss": 2.2601, + "step": 2939 + }, + { + "epoch": 1.3066666666666666, + "grad_norm": 2.3797504901885986, + "learning_rate": 0.00014786476868327403, + "loss": 2.3143, + "step": 2940 + }, + { + "epoch": 1.3071111111111111, + "grad_norm": 2.3076117038726807, + "learning_rate": 0.000147846975088968, + "loss": 2.4184, + "step": 2941 + }, + { + "epoch": 1.3075555555555556, + "grad_norm": 2.3064980506896973, + "learning_rate": 0.00014782918149466195, + "loss": 2.0238, + "step": 2942 + }, + { + "epoch": 1.308, + "grad_norm": 2.125981330871582, + "learning_rate": 0.00014781138790035588, + "loss": 2.4634, + "step": 2943 + }, + { + "epoch": 1.3084444444444445, + "grad_norm": 2.4080536365509033, + "learning_rate": 0.00014779359430604983, + "loss": 2.204, + "step": 2944 + }, + { + "epoch": 1.3088888888888888, + "grad_norm": 2.146500587463379, + "learning_rate": 0.00014777580071174376, + "loss": 2.2429, + "step": 2945 + }, + { + "epoch": 1.3093333333333335, + "grad_norm": 2.469111680984497, + "learning_rate": 0.00014775800711743772, + "loss": 2.4771, + "step": 2946 + }, + { + "epoch": 1.3097777777777777, + "grad_norm": 2.244917392730713, + "learning_rate": 0.00014774021352313168, + "loss": 1.8265, + "step": 2947 + }, + { + "epoch": 1.3102222222222222, + "grad_norm": 2.1437273025512695, + "learning_rate": 0.00014772241992882563, + "loss": 2.496, + "step": 2948 + }, + { + "epoch": 1.3106666666666666, + "grad_norm": 2.3928580284118652, + "learning_rate": 0.0001477046263345196, + "loss": 1.9276, + "step": 2949 + }, + { + "epoch": 1.3111111111111111, + "grad_norm": 3.0038251876831055, + "learning_rate": 0.00014768683274021355, + "loss": 2.0514, + "step": 2950 + }, + { + "epoch": 1.3115555555555556, + "grad_norm": 1.4333771467208862, + "learning_rate": 0.00014766903914590747, + "loss": 2.8184, + "step": 2951 + }, + { + "epoch": 1.312, + "grad_norm": 1.254792332649231, + "learning_rate": 0.00014765124555160143, + "loss": 2.1764, + "step": 2952 + }, + { + "epoch": 1.3124444444444445, + "grad_norm": 1.2323510646820068, + "learning_rate": 0.0001476334519572954, + "loss": 2.4008, + "step": 2953 + }, + { + "epoch": 1.3128888888888888, + "grad_norm": 1.026353120803833, + "learning_rate": 0.00014761565836298934, + "loss": 1.6053, + "step": 2954 + }, + { + "epoch": 1.3133333333333335, + "grad_norm": 1.8161970376968384, + "learning_rate": 0.0001475978647686833, + "loss": 2.4387, + "step": 2955 + }, + { + "epoch": 1.3137777777777777, + "grad_norm": 1.568258285522461, + "learning_rate": 0.00014758007117437723, + "loss": 2.1573, + "step": 2956 + }, + { + "epoch": 1.3142222222222222, + "grad_norm": 1.747540831565857, + "learning_rate": 0.00014756227758007116, + "loss": 2.3005, + "step": 2957 + }, + { + "epoch": 1.3146666666666667, + "grad_norm": 
1.9220967292785645, + "learning_rate": 0.00014754448398576512, + "loss": 2.517, + "step": 2958 + }, + { + "epoch": 1.3151111111111111, + "grad_norm": 1.7257778644561768, + "learning_rate": 0.00014752669039145907, + "loss": 2.576, + "step": 2959 + }, + { + "epoch": 1.3155555555555556, + "grad_norm": 1.6477985382080078, + "learning_rate": 0.00014750889679715303, + "loss": 2.1995, + "step": 2960 + }, + { + "epoch": 1.316, + "grad_norm": 1.5505272150039673, + "learning_rate": 0.00014749110320284699, + "loss": 2.346, + "step": 2961 + }, + { + "epoch": 1.3164444444444445, + "grad_norm": 1.4658108949661255, + "learning_rate": 0.00014747330960854094, + "loss": 2.0652, + "step": 2962 + }, + { + "epoch": 1.3168888888888888, + "grad_norm": 1.694881558418274, + "learning_rate": 0.00014745551601423487, + "loss": 2.1266, + "step": 2963 + }, + { + "epoch": 1.3173333333333335, + "grad_norm": 1.606266736984253, + "learning_rate": 0.00014743772241992883, + "loss": 2.0245, + "step": 2964 + }, + { + "epoch": 1.3177777777777777, + "grad_norm": 1.483372688293457, + "learning_rate": 0.00014741992882562278, + "loss": 2.0158, + "step": 2965 + }, + { + "epoch": 1.3182222222222222, + "grad_norm": 1.510193943977356, + "learning_rate": 0.00014740213523131674, + "loss": 1.8094, + "step": 2966 + }, + { + "epoch": 1.3186666666666667, + "grad_norm": 1.2403696775436401, + "learning_rate": 0.0001473843416370107, + "loss": 1.1705, + "step": 2967 + }, + { + "epoch": 1.3191111111111111, + "grad_norm": 1.4912583827972412, + "learning_rate": 0.00014736654804270465, + "loss": 1.9255, + "step": 2968 + }, + { + "epoch": 1.3195555555555556, + "grad_norm": 1.7279419898986816, + "learning_rate": 0.00014734875444839858, + "loss": 1.2055, + "step": 2969 + }, + { + "epoch": 1.32, + "grad_norm": 1.8031781911849976, + "learning_rate": 0.0001473309608540925, + "loss": 2.2159, + "step": 2970 + }, + { + "epoch": 1.3204444444444445, + "grad_norm": 2.015742778778076, + "learning_rate": 0.00014731316725978647, + "loss": 2.5995, + "step": 2971 + }, + { + "epoch": 1.3208888888888888, + "grad_norm": 1.7225654125213623, + "learning_rate": 0.00014729537366548043, + "loss": 2.2011, + "step": 2972 + }, + { + "epoch": 1.3213333333333335, + "grad_norm": 1.881436824798584, + "learning_rate": 0.00014727758007117438, + "loss": 2.1361, + "step": 2973 + }, + { + "epoch": 1.3217777777777777, + "grad_norm": 1.8075528144836426, + "learning_rate": 0.00014725978647686834, + "loss": 2.4812, + "step": 2974 + }, + { + "epoch": 1.3222222222222222, + "grad_norm": 1.7048109769821167, + "learning_rate": 0.0001472419928825623, + "loss": 2.2206, + "step": 2975 + }, + { + "epoch": 1.3226666666666667, + "grad_norm": 1.8709800243377686, + "learning_rate": 0.00014722419928825622, + "loss": 2.0549, + "step": 2976 + }, + { + "epoch": 1.3231111111111111, + "grad_norm": 1.6973004341125488, + "learning_rate": 0.00014720640569395018, + "loss": 2.0749, + "step": 2977 + }, + { + "epoch": 1.3235555555555556, + "grad_norm": 2.0399484634399414, + "learning_rate": 0.00014718861209964414, + "loss": 2.59, + "step": 2978 + }, + { + "epoch": 1.324, + "grad_norm": 1.7553602457046509, + "learning_rate": 0.0001471708185053381, + "loss": 1.4675, + "step": 2979 + }, + { + "epoch": 1.3244444444444445, + "grad_norm": 1.8353267908096313, + "learning_rate": 0.00014715302491103205, + "loss": 2.1814, + "step": 2980 + }, + { + "epoch": 1.3248888888888888, + "grad_norm": 1.942821741104126, + "learning_rate": 0.000147135231316726, + "loss": 1.7173, + "step": 2981 + }, + { + "epoch": 1.3253333333333333, + 
"grad_norm": 2.025289535522461, + "learning_rate": 0.00014711743772241994, + "loss": 2.4813, + "step": 2982 + }, + { + "epoch": 1.3257777777777777, + "grad_norm": 1.890375018119812, + "learning_rate": 0.00014709964412811387, + "loss": 2.2062, + "step": 2983 + }, + { + "epoch": 1.3262222222222222, + "grad_norm": 1.8974454402923584, + "learning_rate": 0.00014708185053380782, + "loss": 2.3105, + "step": 2984 + }, + { + "epoch": 1.3266666666666667, + "grad_norm": 1.7386442422866821, + "learning_rate": 0.00014706405693950178, + "loss": 1.9486, + "step": 2985 + }, + { + "epoch": 1.3271111111111111, + "grad_norm": 1.974223017692566, + "learning_rate": 0.00014704626334519574, + "loss": 2.5025, + "step": 2986 + }, + { + "epoch": 1.3275555555555556, + "grad_norm": 2.720777988433838, + "learning_rate": 0.0001470284697508897, + "loss": 2.2163, + "step": 2987 + }, + { + "epoch": 1.328, + "grad_norm": 2.455169200897217, + "learning_rate": 0.00014701067615658365, + "loss": 2.2537, + "step": 2988 + }, + { + "epoch": 1.3284444444444445, + "grad_norm": 1.8268187046051025, + "learning_rate": 0.00014699288256227758, + "loss": 1.3468, + "step": 2989 + }, + { + "epoch": 1.3288888888888888, + "grad_norm": 1.7359619140625, + "learning_rate": 0.00014697508896797153, + "loss": 2.2044, + "step": 2990 + }, + { + "epoch": 1.3293333333333333, + "grad_norm": 2.3060410022735596, + "learning_rate": 0.0001469572953736655, + "loss": 2.1335, + "step": 2991 + }, + { + "epoch": 1.3297777777777777, + "grad_norm": 2.0797529220581055, + "learning_rate": 0.00014693950177935945, + "loss": 2.4904, + "step": 2992 + }, + { + "epoch": 1.3302222222222222, + "grad_norm": 2.0389015674591064, + "learning_rate": 0.0001469217081850534, + "loss": 2.1793, + "step": 2993 + }, + { + "epoch": 1.3306666666666667, + "grad_norm": 1.99758780002594, + "learning_rate": 0.00014690391459074736, + "loss": 1.7031, + "step": 2994 + }, + { + "epoch": 1.3311111111111111, + "grad_norm": 2.201939582824707, + "learning_rate": 0.0001468861209964413, + "loss": 2.254, + "step": 2995 + }, + { + "epoch": 1.3315555555555556, + "grad_norm": 2.535717010498047, + "learning_rate": 0.00014686832740213522, + "loss": 2.2282, + "step": 2996 + }, + { + "epoch": 1.332, + "grad_norm": 2.139512062072754, + "learning_rate": 0.00014685053380782918, + "loss": 2.1161, + "step": 2997 + }, + { + "epoch": 1.3324444444444445, + "grad_norm": 2.589766502380371, + "learning_rate": 0.00014683274021352313, + "loss": 2.2188, + "step": 2998 + }, + { + "epoch": 1.3328888888888888, + "grad_norm": 2.3444082736968994, + "learning_rate": 0.0001468149466192171, + "loss": 2.3078, + "step": 2999 + }, + { + "epoch": 1.3333333333333333, + "grad_norm": 2.9983866214752197, + "learning_rate": 0.00014679715302491105, + "loss": 1.9401, + "step": 3000 + }, + { + "epoch": 1.3337777777777777, + "grad_norm": 1.0303854942321777, + "learning_rate": 0.000146779359430605, + "loss": 1.3335, + "step": 3001 + }, + { + "epoch": 1.3342222222222222, + "grad_norm": 1.3651078939437866, + "learning_rate": 0.00014676156583629893, + "loss": 2.7932, + "step": 3002 + }, + { + "epoch": 1.3346666666666667, + "grad_norm": 1.4741727113723755, + "learning_rate": 0.0001467437722419929, + "loss": 2.4145, + "step": 3003 + }, + { + "epoch": 1.3351111111111111, + "grad_norm": 1.308189868927002, + "learning_rate": 0.00014672597864768684, + "loss": 2.492, + "step": 3004 + }, + { + "epoch": 1.3355555555555556, + "grad_norm": 1.5847667455673218, + "learning_rate": 0.0001467081850533808, + "loss": 2.1946, + "step": 3005 + }, + { + "epoch": 
1.336, + "grad_norm": 1.329132318496704, + "learning_rate": 0.00014669039145907476, + "loss": 1.5467, + "step": 3006 + }, + { + "epoch": 1.3364444444444445, + "grad_norm": 1.514951229095459, + "learning_rate": 0.00014667259786476869, + "loss": 2.048, + "step": 3007 + }, + { + "epoch": 1.3368888888888888, + "grad_norm": 1.4534947872161865, + "learning_rate": 0.00014665480427046264, + "loss": 1.3235, + "step": 3008 + }, + { + "epoch": 1.3373333333333333, + "grad_norm": 1.4798189401626587, + "learning_rate": 0.00014663701067615657, + "loss": 2.0619, + "step": 3009 + }, + { + "epoch": 1.3377777777777777, + "grad_norm": 1.5956952571868896, + "learning_rate": 0.00014661921708185053, + "loss": 2.325, + "step": 3010 + }, + { + "epoch": 1.3382222222222222, + "grad_norm": 1.8076585531234741, + "learning_rate": 0.00014660142348754449, + "loss": 2.8001, + "step": 3011 + }, + { + "epoch": 1.3386666666666667, + "grad_norm": 1.6839090585708618, + "learning_rate": 0.00014658362989323844, + "loss": 2.4422, + "step": 3012 + }, + { + "epoch": 1.3391111111111111, + "grad_norm": 1.4808694124221802, + "learning_rate": 0.0001465658362989324, + "loss": 2.2948, + "step": 3013 + }, + { + "epoch": 1.3395555555555556, + "grad_norm": 1.645921230316162, + "learning_rate": 0.00014654804270462635, + "loss": 2.1147, + "step": 3014 + }, + { + "epoch": 1.34, + "grad_norm": 1.490327000617981, + "learning_rate": 0.00014653024911032028, + "loss": 1.5932, + "step": 3015 + }, + { + "epoch": 1.3404444444444445, + "grad_norm": 1.7233227491378784, + "learning_rate": 0.00014651245551601424, + "loss": 2.5041, + "step": 3016 + }, + { + "epoch": 1.3408888888888888, + "grad_norm": 1.904835820198059, + "learning_rate": 0.0001464946619217082, + "loss": 2.4865, + "step": 3017 + }, + { + "epoch": 1.3413333333333333, + "grad_norm": 1.3651695251464844, + "learning_rate": 0.00014647686832740215, + "loss": 1.6397, + "step": 3018 + }, + { + "epoch": 1.3417777777777777, + "grad_norm": 1.8083473443984985, + "learning_rate": 0.0001464590747330961, + "loss": 2.0558, + "step": 3019 + }, + { + "epoch": 1.3422222222222222, + "grad_norm": 1.4256452322006226, + "learning_rate": 0.00014644128113879004, + "loss": 1.4262, + "step": 3020 + }, + { + "epoch": 1.3426666666666667, + "grad_norm": 1.686566948890686, + "learning_rate": 0.000146423487544484, + "loss": 2.235, + "step": 3021 + }, + { + "epoch": 1.3431111111111111, + "grad_norm": 1.6832934617996216, + "learning_rate": 0.00014640569395017793, + "loss": 2.3787, + "step": 3022 + }, + { + "epoch": 1.3435555555555556, + "grad_norm": 5.573652267456055, + "learning_rate": 0.00014638790035587188, + "loss": 2.2192, + "step": 3023 + }, + { + "epoch": 1.3439999999999999, + "grad_norm": 1.8312151432037354, + "learning_rate": 0.00014637010676156584, + "loss": 2.4134, + "step": 3024 + }, + { + "epoch": 1.3444444444444446, + "grad_norm": 2.171259641647339, + "learning_rate": 0.0001463523131672598, + "loss": 2.5597, + "step": 3025 + }, + { + "epoch": 1.3448888888888888, + "grad_norm": 1.7489248514175415, + "learning_rate": 0.00014633451957295375, + "loss": 2.0362, + "step": 3026 + }, + { + "epoch": 1.3453333333333333, + "grad_norm": 1.915249228477478, + "learning_rate": 0.0001463167259786477, + "loss": 2.0456, + "step": 3027 + }, + { + "epoch": 1.3457777777777777, + "grad_norm": 2.186251640319824, + "learning_rate": 0.00014629893238434164, + "loss": 2.448, + "step": 3028 + }, + { + "epoch": 1.3462222222222222, + "grad_norm": 2.2437589168548584, + "learning_rate": 0.0001462811387900356, + "loss": 0.093, + "step": 3029 
+ }, + { + "epoch": 1.3466666666666667, + "grad_norm": 1.4108742475509644, + "learning_rate": 0.00014626334519572955, + "loss": 1.0545, + "step": 3030 + }, + { + "epoch": 1.3471111111111111, + "grad_norm": 1.1790105104446411, + "learning_rate": 0.0001462455516014235, + "loss": 0.8722, + "step": 3031 + }, + { + "epoch": 1.3475555555555556, + "grad_norm": 1.8908590078353882, + "learning_rate": 0.00014622775800711746, + "loss": 2.3223, + "step": 3032 + }, + { + "epoch": 1.3479999999999999, + "grad_norm": 1.7565504312515259, + "learning_rate": 0.0001462099644128114, + "loss": 2.0448, + "step": 3033 + }, + { + "epoch": 1.3484444444444446, + "grad_norm": 2.0749552249908447, + "learning_rate": 0.00014619217081850535, + "loss": 2.4472, + "step": 3034 + }, + { + "epoch": 1.3488888888888888, + "grad_norm": 1.9328750371932983, + "learning_rate": 0.00014617437722419928, + "loss": 2.3066, + "step": 3035 + }, + { + "epoch": 1.3493333333333333, + "grad_norm": 1.9503514766693115, + "learning_rate": 0.00014615658362989324, + "loss": 2.1583, + "step": 3036 + }, + { + "epoch": 1.3497777777777777, + "grad_norm": 2.1976773738861084, + "learning_rate": 0.0001461387900355872, + "loss": 2.6388, + "step": 3037 + }, + { + "epoch": 1.3502222222222222, + "grad_norm": 1.8023751974105835, + "learning_rate": 0.00014612099644128115, + "loss": 2.0306, + "step": 3038 + }, + { + "epoch": 1.3506666666666667, + "grad_norm": 2.1533122062683105, + "learning_rate": 0.0001461032028469751, + "loss": 2.7645, + "step": 3039 + }, + { + "epoch": 1.3511111111111112, + "grad_norm": 1.889941930770874, + "learning_rate": 0.00014608540925266906, + "loss": 2.2246, + "step": 3040 + }, + { + "epoch": 1.3515555555555556, + "grad_norm": 2.390805244445801, + "learning_rate": 0.000146067615658363, + "loss": 2.8298, + "step": 3041 + }, + { + "epoch": 1.3519999999999999, + "grad_norm": 2.4162545204162598, + "learning_rate": 0.00014604982206405695, + "loss": 2.1917, + "step": 3042 + }, + { + "epoch": 1.3524444444444446, + "grad_norm": 2.0249195098876953, + "learning_rate": 0.0001460320284697509, + "loss": 2.6536, + "step": 3043 + }, + { + "epoch": 1.3528888888888888, + "grad_norm": 1.7109678983688354, + "learning_rate": 0.00014601423487544486, + "loss": 1.6031, + "step": 3044 + }, + { + "epoch": 1.3533333333333333, + "grad_norm": 1.919968843460083, + "learning_rate": 0.00014599644128113882, + "loss": 2.3276, + "step": 3045 + }, + { + "epoch": 1.3537777777777777, + "grad_norm": 2.2520694732666016, + "learning_rate": 0.00014597864768683275, + "loss": 2.5268, + "step": 3046 + }, + { + "epoch": 1.3542222222222222, + "grad_norm": 2.1359968185424805, + "learning_rate": 0.00014596085409252668, + "loss": 2.3626, + "step": 3047 + }, + { + "epoch": 1.3546666666666667, + "grad_norm": 2.3673102855682373, + "learning_rate": 0.00014594306049822063, + "loss": 2.5428, + "step": 3048 + }, + { + "epoch": 1.3551111111111112, + "grad_norm": 2.3437814712524414, + "learning_rate": 0.0001459252669039146, + "loss": 2.5333, + "step": 3049 + }, + { + "epoch": 1.3555555555555556, + "grad_norm": 1.5844188928604126, + "learning_rate": 0.00014590747330960854, + "loss": 0.678, + "step": 3050 + }, + { + "epoch": 1.3559999999999999, + "grad_norm": 0.8454328775405884, + "learning_rate": 0.0001458896797153025, + "loss": 0.0259, + "step": 3051 + }, + { + "epoch": 1.3564444444444446, + "grad_norm": 1.2323882579803467, + "learning_rate": 0.00014587188612099646, + "loss": 2.408, + "step": 3052 + }, + { + "epoch": 1.3568888888888888, + "grad_norm": 1.4226750135421753, + "learning_rate": 
0.0001458540925266904, + "loss": 2.3492, + "step": 3053 + }, + { + "epoch": 1.3573333333333333, + "grad_norm": 1.3918040990829468, + "learning_rate": 0.00014583629893238434, + "loss": 2.2151, + "step": 3054 + }, + { + "epoch": 1.3577777777777778, + "grad_norm": 1.5795356035232544, + "learning_rate": 0.0001458185053380783, + "loss": 2.5257, + "step": 3055 + }, + { + "epoch": 1.3582222222222222, + "grad_norm": 1.3857512474060059, + "learning_rate": 0.00014580071174377226, + "loss": 2.3031, + "step": 3056 + }, + { + "epoch": 1.3586666666666667, + "grad_norm": 1.50767183303833, + "learning_rate": 0.0001457829181494662, + "loss": 1.606, + "step": 3057 + }, + { + "epoch": 1.3591111111111112, + "grad_norm": 1.5515854358673096, + "learning_rate": 0.00014576512455516017, + "loss": 2.4394, + "step": 3058 + }, + { + "epoch": 1.3595555555555556, + "grad_norm": 1.567515254020691, + "learning_rate": 0.0001457473309608541, + "loss": 2.7249, + "step": 3059 + }, + { + "epoch": 1.3599999999999999, + "grad_norm": 1.5846283435821533, + "learning_rate": 0.00014572953736654803, + "loss": 1.8917, + "step": 3060 + }, + { + "epoch": 1.3604444444444446, + "grad_norm": 1.4666316509246826, + "learning_rate": 0.00014571174377224198, + "loss": 2.1971, + "step": 3061 + }, + { + "epoch": 1.3608888888888888, + "grad_norm": 1.708336353302002, + "learning_rate": 0.00014569395017793594, + "loss": 2.177, + "step": 3062 + }, + { + "epoch": 1.3613333333333333, + "grad_norm": 1.7200583219528198, + "learning_rate": 0.0001456761565836299, + "loss": 2.3996, + "step": 3063 + }, + { + "epoch": 1.3617777777777778, + "grad_norm": 1.7250936031341553, + "learning_rate": 0.00014565836298932385, + "loss": 2.4772, + "step": 3064 + }, + { + "epoch": 1.3622222222222222, + "grad_norm": 1.6220717430114746, + "learning_rate": 0.0001456405693950178, + "loss": 1.7843, + "step": 3065 + }, + { + "epoch": 1.3626666666666667, + "grad_norm": 1.2669525146484375, + "learning_rate": 0.00014562277580071174, + "loss": 1.0277, + "step": 3066 + }, + { + "epoch": 1.3631111111111112, + "grad_norm": 1.7346656322479248, + "learning_rate": 0.0001456049822064057, + "loss": 1.7548, + "step": 3067 + }, + { + "epoch": 1.3635555555555556, + "grad_norm": 1.718342900276184, + "learning_rate": 0.00014558718861209965, + "loss": 1.7593, + "step": 3068 + }, + { + "epoch": 1.3639999999999999, + "grad_norm": 1.738747239112854, + "learning_rate": 0.0001455693950177936, + "loss": 2.1453, + "step": 3069 + }, + { + "epoch": 1.3644444444444446, + "grad_norm": 1.6312371492385864, + "learning_rate": 0.00014555160142348757, + "loss": 2.1307, + "step": 3070 + }, + { + "epoch": 1.3648888888888888, + "grad_norm": 1.9798495769500732, + "learning_rate": 0.00014553380782918152, + "loss": 1.9325, + "step": 3071 + }, + { + "epoch": 1.3653333333333333, + "grad_norm": 1.8481535911560059, + "learning_rate": 0.00014551601423487545, + "loss": 2.0689, + "step": 3072 + }, + { + "epoch": 1.3657777777777778, + "grad_norm": 1.7567691802978516, + "learning_rate": 0.00014549822064056938, + "loss": 1.8361, + "step": 3073 + }, + { + "epoch": 1.3662222222222222, + "grad_norm": 1.9224940538406372, + "learning_rate": 0.00014548042704626334, + "loss": 2.2146, + "step": 3074 + }, + { + "epoch": 1.3666666666666667, + "grad_norm": 2.118567705154419, + "learning_rate": 0.0001454626334519573, + "loss": 2.7325, + "step": 3075 + }, + { + "epoch": 1.3671111111111112, + "grad_norm": 1.6111234426498413, + "learning_rate": 0.00014544483985765125, + "loss": 1.5926, + "step": 3076 + }, + { + "epoch": 1.3675555555555556, + 
"grad_norm": 1.8418885469436646, + "learning_rate": 0.0001454270462633452, + "loss": 2.1703, + "step": 3077 + }, + { + "epoch": 1.3679999999999999, + "grad_norm": 3.9272093772888184, + "learning_rate": 0.00014540925266903916, + "loss": 0.0625, + "step": 3078 + }, + { + "epoch": 1.3684444444444446, + "grad_norm": 1.4438681602478027, + "learning_rate": 0.0001453914590747331, + "loss": 1.2624, + "step": 3079 + }, + { + "epoch": 1.3688888888888888, + "grad_norm": 2.313140392303467, + "learning_rate": 0.00014537366548042705, + "loss": 2.459, + "step": 3080 + }, + { + "epoch": 1.3693333333333333, + "grad_norm": 1.765426754951477, + "learning_rate": 0.000145355871886121, + "loss": 1.0009, + "step": 3081 + }, + { + "epoch": 1.3697777777777778, + "grad_norm": 1.7897223234176636, + "learning_rate": 0.00014533807829181496, + "loss": 1.6343, + "step": 3082 + }, + { + "epoch": 1.3702222222222222, + "grad_norm": 1.9620853662490845, + "learning_rate": 0.00014532028469750892, + "loss": 1.9386, + "step": 3083 + }, + { + "epoch": 1.3706666666666667, + "grad_norm": 2.0378286838531494, + "learning_rate": 0.00014530249110320288, + "loss": 1.7725, + "step": 3084 + }, + { + "epoch": 1.3711111111111112, + "grad_norm": 1.9394813776016235, + "learning_rate": 0.0001452846975088968, + "loss": 2.1801, + "step": 3085 + }, + { + "epoch": 1.3715555555555556, + "grad_norm": 2.0343053340911865, + "learning_rate": 0.00014526690391459073, + "loss": 2.0472, + "step": 3086 + }, + { + "epoch": 1.3719999999999999, + "grad_norm": 2.085235595703125, + "learning_rate": 0.0001452491103202847, + "loss": 1.969, + "step": 3087 + }, + { + "epoch": 1.3724444444444446, + "grad_norm": 1.9965012073516846, + "learning_rate": 0.00014523131672597865, + "loss": 2.3486, + "step": 3088 + }, + { + "epoch": 1.3728888888888888, + "grad_norm": 1.9986323118209839, + "learning_rate": 0.0001452135231316726, + "loss": 2.2368, + "step": 3089 + }, + { + "epoch": 1.3733333333333333, + "grad_norm": 2.003603935241699, + "learning_rate": 0.00014519572953736656, + "loss": 2.0619, + "step": 3090 + }, + { + "epoch": 1.3737777777777778, + "grad_norm": 2.0343897342681885, + "learning_rate": 0.00014517793594306052, + "loss": 2.2089, + "step": 3091 + }, + { + "epoch": 1.3742222222222222, + "grad_norm": 2.0502665042877197, + "learning_rate": 0.00014516014234875445, + "loss": 2.3439, + "step": 3092 + }, + { + "epoch": 1.3746666666666667, + "grad_norm": 2.019620895385742, + "learning_rate": 0.0001451423487544484, + "loss": 2.2632, + "step": 3093 + }, + { + "epoch": 1.3751111111111112, + "grad_norm": 1.9464764595031738, + "learning_rate": 0.00014512455516014236, + "loss": 1.7843, + "step": 3094 + }, + { + "epoch": 1.3755555555555556, + "grad_norm": 2.3599178791046143, + "learning_rate": 0.00014510676156583632, + "loss": 2.2186, + "step": 3095 + }, + { + "epoch": 1.376, + "grad_norm": 2.2975494861602783, + "learning_rate": 0.00014508896797153027, + "loss": 2.5936, + "step": 3096 + }, + { + "epoch": 1.3764444444444446, + "grad_norm": 2.219733476638794, + "learning_rate": 0.00014507117437722423, + "loss": 2.0997, + "step": 3097 + }, + { + "epoch": 1.3768888888888888, + "grad_norm": 2.035273790359497, + "learning_rate": 0.00014505338078291816, + "loss": 2.0009, + "step": 3098 + }, + { + "epoch": 1.3773333333333333, + "grad_norm": 2.999622106552124, + "learning_rate": 0.0001450355871886121, + "loss": 2.7129, + "step": 3099 + }, + { + "epoch": 1.3777777777777778, + "grad_norm": 2.15091872215271, + "learning_rate": 0.00014501779359430604, + "loss": 2.1994, + "step": 3100 + }, + 
{ + "epoch": 1.3782222222222222, + "grad_norm": 1.0955103635787964, + "learning_rate": 0.000145, + "loss": 1.416, + "step": 3101 + }, + { + "epoch": 1.3786666666666667, + "grad_norm": 1.3353731632232666, + "learning_rate": 0.00014498220640569396, + "loss": 2.4926, + "step": 3102 + }, + { + "epoch": 1.3791111111111112, + "grad_norm": 1.4232149124145508, + "learning_rate": 0.00014496441281138791, + "loss": 2.5539, + "step": 3103 + }, + { + "epoch": 1.3795555555555556, + "grad_norm": 1.4904459714889526, + "learning_rate": 0.00014494661921708187, + "loss": 2.5516, + "step": 3104 + }, + { + "epoch": 1.38, + "grad_norm": 1.351136565208435, + "learning_rate": 0.0001449288256227758, + "loss": 1.9835, + "step": 3105 + }, + { + "epoch": 1.3804444444444444, + "grad_norm": 1.4860031604766846, + "learning_rate": 0.00014491103202846976, + "loss": 1.9876, + "step": 3106 + }, + { + "epoch": 1.3808888888888888, + "grad_norm": 1.582760214805603, + "learning_rate": 0.0001448932384341637, + "loss": 2.6782, + "step": 3107 + }, + { + "epoch": 1.3813333333333333, + "grad_norm": 1.3775660991668701, + "learning_rate": 0.00014487544483985767, + "loss": 1.6808, + "step": 3108 + }, + { + "epoch": 1.3817777777777778, + "grad_norm": 1.6874479055404663, + "learning_rate": 0.00014485765124555163, + "loss": 2.6314, + "step": 3109 + }, + { + "epoch": 1.3822222222222222, + "grad_norm": 1.7144551277160645, + "learning_rate": 0.00014483985765124558, + "loss": 2.7101, + "step": 3110 + }, + { + "epoch": 1.3826666666666667, + "grad_norm": 1.5098775625228882, + "learning_rate": 0.0001448220640569395, + "loss": 1.8929, + "step": 3111 + }, + { + "epoch": 1.3831111111111112, + "grad_norm": 1.653300404548645, + "learning_rate": 0.00014480427046263344, + "loss": 2.1654, + "step": 3112 + }, + { + "epoch": 1.3835555555555556, + "grad_norm": 1.4902502298355103, + "learning_rate": 0.0001447864768683274, + "loss": 2.3472, + "step": 3113 + }, + { + "epoch": 1.384, + "grad_norm": 1.75850248336792, + "learning_rate": 0.00014476868327402135, + "loss": 1.99, + "step": 3114 + }, + { + "epoch": 1.3844444444444444, + "grad_norm": 1.5602999925613403, + "learning_rate": 0.0001447508896797153, + "loss": 1.8835, + "step": 3115 + }, + { + "epoch": 1.3848888888888888, + "grad_norm": 1.6873457431793213, + "learning_rate": 0.00014473309608540927, + "loss": 1.9495, + "step": 3116 + }, + { + "epoch": 1.3853333333333333, + "grad_norm": 1.1590341329574585, + "learning_rate": 0.00014471530249110322, + "loss": 1.2124, + "step": 3117 + }, + { + "epoch": 1.3857777777777778, + "grad_norm": 1.7708086967468262, + "learning_rate": 0.00014469750889679715, + "loss": 2.0342, + "step": 3118 + }, + { + "epoch": 1.3862222222222222, + "grad_norm": 1.969315767288208, + "learning_rate": 0.0001446797153024911, + "loss": 2.5779, + "step": 3119 + }, + { + "epoch": 1.3866666666666667, + "grad_norm": 1.6105482578277588, + "learning_rate": 0.00014466192170818507, + "loss": 2.2028, + "step": 3120 + }, + { + "epoch": 1.3871111111111112, + "grad_norm": 1.7451056241989136, + "learning_rate": 0.00014464412811387902, + "loss": 2.1619, + "step": 3121 + }, + { + "epoch": 1.3875555555555557, + "grad_norm": 1.5029910802841187, + "learning_rate": 0.00014462633451957298, + "loss": 2.2189, + "step": 3122 + }, + { + "epoch": 1.388, + "grad_norm": 1.8065159320831299, + "learning_rate": 0.0001446085409252669, + "loss": 2.52, + "step": 3123 + }, + { + "epoch": 1.3884444444444444, + "grad_norm": 1.7965675592422485, + "learning_rate": 0.00014459074733096086, + "loss": 2.6056, + "step": 3124 + }, + { 
+ "epoch": 1.3888888888888888, + "grad_norm": 1.6334154605865479, + "learning_rate": 0.0001445729537366548, + "loss": 2.2306, + "step": 3125 + }, + { + "epoch": 1.3893333333333333, + "grad_norm": 2.191298484802246, + "learning_rate": 0.00014455516014234875, + "loss": 2.3345, + "step": 3126 + }, + { + "epoch": 1.3897777777777778, + "grad_norm": 2.035778045654297, + "learning_rate": 0.0001445373665480427, + "loss": 2.2855, + "step": 3127 + }, + { + "epoch": 1.3902222222222222, + "grad_norm": 1.8941333293914795, + "learning_rate": 0.00014451957295373666, + "loss": 2.0976, + "step": 3128 + }, + { + "epoch": 1.3906666666666667, + "grad_norm": 1.8983358144760132, + "learning_rate": 0.00014450177935943062, + "loss": 2.1321, + "step": 3129 + }, + { + "epoch": 1.3911111111111112, + "grad_norm": 1.8651962280273438, + "learning_rate": 0.00014448398576512458, + "loss": 2.2432, + "step": 3130 + }, + { + "epoch": 1.3915555555555557, + "grad_norm": 1.7466819286346436, + "learning_rate": 0.0001444661921708185, + "loss": 1.2187, + "step": 3131 + }, + { + "epoch": 1.392, + "grad_norm": 1.9504824876785278, + "learning_rate": 0.00014444839857651246, + "loss": 1.0134, + "step": 3132 + }, + { + "epoch": 1.3924444444444444, + "grad_norm": 2.349276065826416, + "learning_rate": 0.00014443060498220642, + "loss": 2.3251, + "step": 3133 + }, + { + "epoch": 1.3928888888888888, + "grad_norm": 2.0128836631774902, + "learning_rate": 0.00014441281138790038, + "loss": 1.8844, + "step": 3134 + }, + { + "epoch": 1.3933333333333333, + "grad_norm": 1.8258366584777832, + "learning_rate": 0.00014439501779359433, + "loss": 2.1697, + "step": 3135 + }, + { + "epoch": 1.3937777777777778, + "grad_norm": 1.9964505434036255, + "learning_rate": 0.00014437722419928826, + "loss": 2.0272, + "step": 3136 + }, + { + "epoch": 1.3942222222222223, + "grad_norm": 2.2089779376983643, + "learning_rate": 0.0001443594306049822, + "loss": 2.4377, + "step": 3137 + }, + { + "epoch": 1.3946666666666667, + "grad_norm": 1.9052916765213013, + "learning_rate": 0.00014434163701067615, + "loss": 1.9119, + "step": 3138 + }, + { + "epoch": 1.3951111111111112, + "grad_norm": 2.0558083057403564, + "learning_rate": 0.0001443238434163701, + "loss": 2.0866, + "step": 3139 + }, + { + "epoch": 1.3955555555555557, + "grad_norm": 1.9824244976043701, + "learning_rate": 0.00014430604982206406, + "loss": 2.1271, + "step": 3140 + }, + { + "epoch": 1.396, + "grad_norm": 2.383279800415039, + "learning_rate": 0.00014428825622775802, + "loss": 2.5828, + "step": 3141 + }, + { + "epoch": 1.3964444444444444, + "grad_norm": 2.1160545349121094, + "learning_rate": 0.00014427046263345197, + "loss": 1.8748, + "step": 3142 + }, + { + "epoch": 1.3968888888888888, + "grad_norm": 1.8280696868896484, + "learning_rate": 0.0001442526690391459, + "loss": 2.0148, + "step": 3143 + }, + { + "epoch": 1.3973333333333333, + "grad_norm": 1.842757225036621, + "learning_rate": 0.00014423487544483986, + "loss": 1.9273, + "step": 3144 + }, + { + "epoch": 1.3977777777777778, + "grad_norm": 1.878212809562683, + "learning_rate": 0.00014421708185053382, + "loss": 1.8468, + "step": 3145 + }, + { + "epoch": 1.3982222222222223, + "grad_norm": 2.176372766494751, + "learning_rate": 0.00014419928825622777, + "loss": 1.7978, + "step": 3146 + }, + { + "epoch": 1.3986666666666667, + "grad_norm": 2.247149705886841, + "learning_rate": 0.00014418149466192173, + "loss": 2.298, + "step": 3147 + }, + { + "epoch": 1.3991111111111112, + "grad_norm": 2.320523977279663, + "learning_rate": 0.00014416370106761569, + "loss": 
2.3023, + "step": 3148 + }, + { + "epoch": 1.3995555555555557, + "grad_norm": 2.5072226524353027, + "learning_rate": 0.00014414590747330961, + "loss": 2.0168, + "step": 3149 + }, + { + "epoch": 1.4, + "grad_norm": 3.0211939811706543, + "learning_rate": 0.00014412811387900354, + "loss": 2.7225, + "step": 3150 + }, + { + "epoch": 1.4004444444444444, + "grad_norm": 1.3240656852722168, + "learning_rate": 0.0001441103202846975, + "loss": 2.5294, + "step": 3151 + }, + { + "epoch": 1.4008888888888889, + "grad_norm": 1.478697657585144, + "learning_rate": 0.00014409252669039146, + "loss": 2.2188, + "step": 3152 + }, + { + "epoch": 1.4013333333333333, + "grad_norm": 1.3129013776779175, + "learning_rate": 0.0001440747330960854, + "loss": 2.188, + "step": 3153 + }, + { + "epoch": 1.4017777777777778, + "grad_norm": 1.450279951095581, + "learning_rate": 0.00014405693950177937, + "loss": 2.3634, + "step": 3154 + }, + { + "epoch": 1.4022222222222223, + "grad_norm": 1.3494071960449219, + "learning_rate": 0.00014403914590747333, + "loss": 2.2367, + "step": 3155 + }, + { + "epoch": 1.4026666666666667, + "grad_norm": 1.4874467849731445, + "learning_rate": 0.00014402135231316726, + "loss": 2.548, + "step": 3156 + }, + { + "epoch": 1.403111111111111, + "grad_norm": 1.3934712409973145, + "learning_rate": 0.0001440035587188612, + "loss": 1.5867, + "step": 3157 + }, + { + "epoch": 1.4035555555555557, + "grad_norm": 1.5048962831497192, + "learning_rate": 0.00014398576512455517, + "loss": 2.5955, + "step": 3158 + }, + { + "epoch": 1.404, + "grad_norm": 1.5615451335906982, + "learning_rate": 0.00014396797153024913, + "loss": 2.1421, + "step": 3159 + }, + { + "epoch": 1.4044444444444444, + "grad_norm": 1.5293431282043457, + "learning_rate": 0.00014395017793594308, + "loss": 2.3808, + "step": 3160 + }, + { + "epoch": 1.4048888888888889, + "grad_norm": 1.3629491329193115, + "learning_rate": 0.00014393238434163704, + "loss": 1.8447, + "step": 3161 + }, + { + "epoch": 1.4053333333333333, + "grad_norm": 1.76398766040802, + "learning_rate": 0.00014391459074733097, + "loss": 1.8923, + "step": 3162 + }, + { + "epoch": 1.4057777777777778, + "grad_norm": 1.6600054502487183, + "learning_rate": 0.0001438967971530249, + "loss": 2.0265, + "step": 3163 + }, + { + "epoch": 1.4062222222222223, + "grad_norm": 1.6202727556228638, + "learning_rate": 0.00014387900355871885, + "loss": 2.2756, + "step": 3164 + }, + { + "epoch": 1.4066666666666667, + "grad_norm": 1.749403953552246, + "learning_rate": 0.0001438612099644128, + "loss": 2.5024, + "step": 3165 + }, + { + "epoch": 1.407111111111111, + "grad_norm": 1.7654697895050049, + "learning_rate": 0.00014384341637010677, + "loss": 2.4529, + "step": 3166 + }, + { + "epoch": 1.4075555555555557, + "grad_norm": 1.5884429216384888, + "learning_rate": 0.00014382562277580072, + "loss": 1.8367, + "step": 3167 + }, + { + "epoch": 1.408, + "grad_norm": 1.5916013717651367, + "learning_rate": 0.00014380782918149468, + "loss": 1.8217, + "step": 3168 + }, + { + "epoch": 1.4084444444444444, + "grad_norm": 1.7130736112594604, + "learning_rate": 0.0001437900355871886, + "loss": 2.2377, + "step": 3169 + }, + { + "epoch": 1.4088888888888889, + "grad_norm": 1.05029296875, + "learning_rate": 0.00014377224199288257, + "loss": 0.5225, + "step": 3170 + }, + { + "epoch": 1.4093333333333333, + "grad_norm": 1.631998062133789, + "learning_rate": 0.00014375444839857652, + "loss": 2.1113, + "step": 3171 + }, + { + "epoch": 1.4097777777777778, + "grad_norm": 1.6177490949630737, + "learning_rate": 0.00014373665480427048, + 
"loss": 2.0905, + "step": 3172 + }, + { + "epoch": 1.4102222222222223, + "grad_norm": 1.727180004119873, + "learning_rate": 0.00014371886120996443, + "loss": 2.2642, + "step": 3173 + }, + { + "epoch": 1.4106666666666667, + "grad_norm": 1.755303144454956, + "learning_rate": 0.0001437010676156584, + "loss": 2.2187, + "step": 3174 + }, + { + "epoch": 1.411111111111111, + "grad_norm": 1.6000373363494873, + "learning_rate": 0.00014368327402135232, + "loss": 1.8188, + "step": 3175 + }, + { + "epoch": 1.4115555555555557, + "grad_norm": 2.0754306316375732, + "learning_rate": 0.00014366548042704625, + "loss": 2.0832, + "step": 3176 + }, + { + "epoch": 1.412, + "grad_norm": 1.7767425775527954, + "learning_rate": 0.0001436476868327402, + "loss": 2.1773, + "step": 3177 + }, + { + "epoch": 1.4124444444444444, + "grad_norm": 2.1510021686553955, + "learning_rate": 0.00014362989323843416, + "loss": 2.1337, + "step": 3178 + }, + { + "epoch": 1.4128888888888889, + "grad_norm": 1.9618239402770996, + "learning_rate": 0.00014361209964412812, + "loss": 2.108, + "step": 3179 + }, + { + "epoch": 1.4133333333333333, + "grad_norm": 1.8377591371536255, + "learning_rate": 0.00014359430604982208, + "loss": 1.7457, + "step": 3180 + }, + { + "epoch": 1.4137777777777778, + "grad_norm": 2.0039236545562744, + "learning_rate": 0.00014357651245551603, + "loss": 2.4462, + "step": 3181 + }, + { + "epoch": 1.4142222222222223, + "grad_norm": 1.9533127546310425, + "learning_rate": 0.00014355871886120996, + "loss": 2.0373, + "step": 3182 + }, + { + "epoch": 1.4146666666666667, + "grad_norm": 2.212468385696411, + "learning_rate": 0.00014354092526690392, + "loss": 2.2246, + "step": 3183 + }, + { + "epoch": 1.415111111111111, + "grad_norm": 1.4578838348388672, + "learning_rate": 0.00014352313167259788, + "loss": 1.1584, + "step": 3184 + }, + { + "epoch": 1.4155555555555557, + "grad_norm": 2.0764267444610596, + "learning_rate": 0.00014350533807829183, + "loss": 1.3613, + "step": 3185 + }, + { + "epoch": 1.416, + "grad_norm": 2.053358316421509, + "learning_rate": 0.0001434875444839858, + "loss": 2.5823, + "step": 3186 + }, + { + "epoch": 1.4164444444444444, + "grad_norm": 0.93979811668396, + "learning_rate": 0.00014346975088967974, + "loss": 0.0561, + "step": 3187 + }, + { + "epoch": 1.4168888888888889, + "grad_norm": 1.6901674270629883, + "learning_rate": 0.00014345195729537367, + "loss": 1.9685, + "step": 3188 + }, + { + "epoch": 1.4173333333333333, + "grad_norm": 1.8892921209335327, + "learning_rate": 0.0001434341637010676, + "loss": 2.1895, + "step": 3189 + }, + { + "epoch": 1.4177777777777778, + "grad_norm": 2.0869650840759277, + "learning_rate": 0.00014341637010676156, + "loss": 2.1289, + "step": 3190 + }, + { + "epoch": 1.4182222222222223, + "grad_norm": 2.154538154602051, + "learning_rate": 0.00014339857651245552, + "loss": 2.005, + "step": 3191 + }, + { + "epoch": 1.4186666666666667, + "grad_norm": 2.2069180011749268, + "learning_rate": 0.00014338078291814947, + "loss": 2.2602, + "step": 3192 + }, + { + "epoch": 1.419111111111111, + "grad_norm": 2.003593921661377, + "learning_rate": 0.00014336298932384343, + "loss": 1.9024, + "step": 3193 + }, + { + "epoch": 1.4195555555555557, + "grad_norm": 2.1326658725738525, + "learning_rate": 0.00014334519572953739, + "loss": 2.3338, + "step": 3194 + }, + { + "epoch": 1.42, + "grad_norm": 2.287719488143921, + "learning_rate": 0.00014332740213523132, + "loss": 2.1353, + "step": 3195 + }, + { + "epoch": 1.4204444444444444, + "grad_norm": 1.7974603176116943, + "learning_rate": 
0.00014330960854092527, + "loss": 1.9235, + "step": 3196 + }, + { + "epoch": 1.4208888888888889, + "grad_norm": 2.7026476860046387, + "learning_rate": 0.00014329181494661923, + "loss": 1.8728, + "step": 3197 + }, + { + "epoch": 1.4213333333333333, + "grad_norm": 2.0487453937530518, + "learning_rate": 0.00014327402135231318, + "loss": 2.3309, + "step": 3198 + }, + { + "epoch": 1.4217777777777778, + "grad_norm": 2.1622159481048584, + "learning_rate": 0.00014325622775800714, + "loss": 2.104, + "step": 3199 + }, + { + "epoch": 1.4222222222222223, + "grad_norm": 2.4706900119781494, + "learning_rate": 0.0001432384341637011, + "loss": 2.2835, + "step": 3200 + }, + { + "epoch": 1.4226666666666667, + "grad_norm": 0.8910221457481384, + "learning_rate": 0.00014322064056939503, + "loss": 1.2837, + "step": 3201 + }, + { + "epoch": 1.423111111111111, + "grad_norm": 1.3645347356796265, + "learning_rate": 0.00014320284697508896, + "loss": 2.7213, + "step": 3202 + }, + { + "epoch": 1.4235555555555557, + "grad_norm": 1.3697624206542969, + "learning_rate": 0.0001431850533807829, + "loss": 2.6312, + "step": 3203 + }, + { + "epoch": 1.424, + "grad_norm": 1.5852277278900146, + "learning_rate": 0.00014316725978647687, + "loss": 2.8356, + "step": 3204 + }, + { + "epoch": 1.4244444444444444, + "grad_norm": 1.2539130449295044, + "learning_rate": 0.00014314946619217083, + "loss": 1.7608, + "step": 3205 + }, + { + "epoch": 1.4248888888888889, + "grad_norm": 1.5661532878875732, + "learning_rate": 0.00014313167259786478, + "loss": 2.0956, + "step": 3206 + }, + { + "epoch": 1.4253333333333333, + "grad_norm": 1.6269443035125732, + "learning_rate": 0.00014311387900355874, + "loss": 2.2012, + "step": 3207 + }, + { + "epoch": 1.4257777777777778, + "grad_norm": 1.4714109897613525, + "learning_rate": 0.00014309608540925267, + "loss": 2.3546, + "step": 3208 + }, + { + "epoch": 1.4262222222222223, + "grad_norm": 1.7439886331558228, + "learning_rate": 0.00014307829181494662, + "loss": 2.7, + "step": 3209 + }, + { + "epoch": 1.4266666666666667, + "grad_norm": 1.4173275232315063, + "learning_rate": 0.00014306049822064058, + "loss": 2.0922, + "step": 3210 + }, + { + "epoch": 1.427111111111111, + "grad_norm": 1.5306942462921143, + "learning_rate": 0.00014304270462633454, + "loss": 2.0992, + "step": 3211 + }, + { + "epoch": 1.4275555555555557, + "grad_norm": 1.797987461090088, + "learning_rate": 0.0001430249110320285, + "loss": 2.9628, + "step": 3212 + }, + { + "epoch": 1.428, + "grad_norm": 2.1177406311035156, + "learning_rate": 0.00014300711743772245, + "loss": 2.1528, + "step": 3213 + }, + { + "epoch": 1.4284444444444444, + "grad_norm": 1.593675136566162, + "learning_rate": 0.00014298932384341638, + "loss": 2.5579, + "step": 3214 + }, + { + "epoch": 1.4288888888888889, + "grad_norm": 1.5105654001235962, + "learning_rate": 0.0001429715302491103, + "loss": 2.1294, + "step": 3215 + }, + { + "epoch": 1.4293333333333333, + "grad_norm": 1.520651936531067, + "learning_rate": 0.00014295373665480427, + "loss": 1.8752, + "step": 3216 + }, + { + "epoch": 1.4297777777777778, + "grad_norm": 1.612784504890442, + "learning_rate": 0.00014293594306049822, + "loss": 2.0609, + "step": 3217 + }, + { + "epoch": 1.4302222222222223, + "grad_norm": 1.5184054374694824, + "learning_rate": 0.00014291814946619218, + "loss": 1.4772, + "step": 3218 + }, + { + "epoch": 1.4306666666666668, + "grad_norm": 1.46523916721344, + "learning_rate": 0.00014290035587188614, + "loss": 1.9029, + "step": 3219 + }, + { + "epoch": 1.431111111111111, + "grad_norm": 
1.4900418519973755, + "learning_rate": 0.0001428825622775801, + "loss": 1.5782, + "step": 3220 + }, + { + "epoch": 1.4315555555555557, + "grad_norm": 1.8551801443099976, + "learning_rate": 0.00014286476868327402, + "loss": 2.47, + "step": 3221 + }, + { + "epoch": 1.432, + "grad_norm": 1.8610374927520752, + "learning_rate": 0.00014284697508896798, + "loss": 2.5916, + "step": 3222 + }, + { + "epoch": 1.4324444444444444, + "grad_norm": 1.73556649684906, + "learning_rate": 0.00014282918149466193, + "loss": 2.2463, + "step": 3223 + }, + { + "epoch": 1.4328888888888889, + "grad_norm": 1.574223279953003, + "learning_rate": 0.0001428113879003559, + "loss": 1.7704, + "step": 3224 + }, + { + "epoch": 1.4333333333333333, + "grad_norm": 2.1118242740631104, + "learning_rate": 0.00014279359430604985, + "loss": 1.9923, + "step": 3225 + }, + { + "epoch": 1.4337777777777778, + "grad_norm": 2.1607296466827393, + "learning_rate": 0.00014277580071174378, + "loss": 2.2045, + "step": 3226 + }, + { + "epoch": 1.4342222222222223, + "grad_norm": 1.532531499862671, + "learning_rate": 0.0001427580071174377, + "loss": 1.904, + "step": 3227 + }, + { + "epoch": 1.4346666666666668, + "grad_norm": 2.332968235015869, + "learning_rate": 0.00014274021352313166, + "loss": 2.3031, + "step": 3228 + }, + { + "epoch": 1.435111111111111, + "grad_norm": 1.7639070749282837, + "learning_rate": 0.00014272241992882562, + "loss": 2.3637, + "step": 3229 + }, + { + "epoch": 1.4355555555555555, + "grad_norm": 2.296191453933716, + "learning_rate": 0.00014270462633451958, + "loss": 1.664, + "step": 3230 + }, + { + "epoch": 1.436, + "grad_norm": 1.685143232345581, + "learning_rate": 0.00014268683274021353, + "loss": 1.9173, + "step": 3231 + }, + { + "epoch": 1.4364444444444444, + "grad_norm": 1.8601534366607666, + "learning_rate": 0.0001426690391459075, + "loss": 2.0097, + "step": 3232 + }, + { + "epoch": 1.4368888888888889, + "grad_norm": 1.818623423576355, + "learning_rate": 0.00014265124555160142, + "loss": 1.8948, + "step": 3233 + }, + { + "epoch": 1.4373333333333334, + "grad_norm": 2.0175423622131348, + "learning_rate": 0.00014263345195729537, + "loss": 2.5014, + "step": 3234 + }, + { + "epoch": 1.4377777777777778, + "grad_norm": 1.874712586402893, + "learning_rate": 0.00014261565836298933, + "loss": 2.4529, + "step": 3235 + }, + { + "epoch": 1.4382222222222223, + "grad_norm": 2.350339889526367, + "learning_rate": 0.0001425978647686833, + "loss": 2.5295, + "step": 3236 + }, + { + "epoch": 1.4386666666666668, + "grad_norm": 1.8386290073394775, + "learning_rate": 0.00014258007117437724, + "loss": 2.4049, + "step": 3237 + }, + { + "epoch": 1.439111111111111, + "grad_norm": 1.9711859226226807, + "learning_rate": 0.0001425622775800712, + "loss": 2.1427, + "step": 3238 + }, + { + "epoch": 1.4395555555555555, + "grad_norm": 1.9588954448699951, + "learning_rate": 0.00014254448398576513, + "loss": 2.0168, + "step": 3239 + }, + { + "epoch": 1.44, + "grad_norm": 2.025226593017578, + "learning_rate": 0.00014252669039145906, + "loss": 1.9483, + "step": 3240 + }, + { + "epoch": 1.4404444444444444, + "grad_norm": 2.1897053718566895, + "learning_rate": 0.00014250889679715302, + "loss": 2.3079, + "step": 3241 + }, + { + "epoch": 1.4408888888888889, + "grad_norm": 1.9101537466049194, + "learning_rate": 0.00014249110320284697, + "loss": 1.5792, + "step": 3242 + }, + { + "epoch": 1.4413333333333334, + "grad_norm": 2.3152666091918945, + "learning_rate": 0.00014247330960854093, + "loss": 2.6493, + "step": 3243 + }, + { + "epoch": 1.4417777777777778, + 
"grad_norm": 3.443295478820801, + "learning_rate": 0.00014245551601423489, + "loss": 2.0788, + "step": 3244 + }, + { + "epoch": 1.4422222222222223, + "grad_norm": 2.00852370262146, + "learning_rate": 0.00014243772241992884, + "loss": 2.2241, + "step": 3245 + }, + { + "epoch": 1.4426666666666668, + "grad_norm": 2.1195225715637207, + "learning_rate": 0.00014241992882562277, + "loss": 2.2187, + "step": 3246 + }, + { + "epoch": 1.443111111111111, + "grad_norm": 2.060398817062378, + "learning_rate": 0.00014240213523131673, + "loss": 1.73, + "step": 3247 + }, + { + "epoch": 1.4435555555555555, + "grad_norm": 2.193606376647949, + "learning_rate": 0.00014238434163701068, + "loss": 2.5191, + "step": 3248 + }, + { + "epoch": 1.444, + "grad_norm": 2.3782386779785156, + "learning_rate": 0.00014236654804270464, + "loss": 2.3936, + "step": 3249 + }, + { + "epoch": 1.4444444444444444, + "grad_norm": 3.235896348953247, + "learning_rate": 0.0001423487544483986, + "loss": 2.299, + "step": 3250 + }, + { + "epoch": 1.444888888888889, + "grad_norm": 1.3519978523254395, + "learning_rate": 0.00014233096085409255, + "loss": 2.6515, + "step": 3251 + }, + { + "epoch": 1.4453333333333334, + "grad_norm": 1.7427107095718384, + "learning_rate": 0.00014231316725978648, + "loss": 2.758, + "step": 3252 + }, + { + "epoch": 1.4457777777777778, + "grad_norm": 1.2374142408370972, + "learning_rate": 0.0001422953736654804, + "loss": 2.4892, + "step": 3253 + }, + { + "epoch": 1.4462222222222223, + "grad_norm": 1.4171687364578247, + "learning_rate": 0.00014227758007117437, + "loss": 2.5246, + "step": 3254 + }, + { + "epoch": 1.4466666666666668, + "grad_norm": 1.9048703908920288, + "learning_rate": 0.00014225978647686833, + "loss": 2.5152, + "step": 3255 + }, + { + "epoch": 1.447111111111111, + "grad_norm": 1.6656217575073242, + "learning_rate": 0.00014224199288256228, + "loss": 2.3669, + "step": 3256 + }, + { + "epoch": 1.4475555555555555, + "grad_norm": 1.9828662872314453, + "learning_rate": 0.00014222419928825624, + "loss": 2.5827, + "step": 3257 + }, + { + "epoch": 1.448, + "grad_norm": 1.593224287033081, + "learning_rate": 0.0001422064056939502, + "loss": 2.3798, + "step": 3258 + }, + { + "epoch": 1.4484444444444444, + "grad_norm": 1.7066659927368164, + "learning_rate": 0.00014218861209964412, + "loss": 2.5127, + "step": 3259 + }, + { + "epoch": 1.448888888888889, + "grad_norm": 1.5722315311431885, + "learning_rate": 0.00014217081850533808, + "loss": 1.6095, + "step": 3260 + }, + { + "epoch": 1.4493333333333334, + "grad_norm": 1.5399599075317383, + "learning_rate": 0.00014215302491103204, + "loss": 2.2268, + "step": 3261 + }, + { + "epoch": 1.4497777777777778, + "grad_norm": 2.0712404251098633, + "learning_rate": 0.000142135231316726, + "loss": 1.9943, + "step": 3262 + }, + { + "epoch": 1.4502222222222223, + "grad_norm": 1.9809646606445312, + "learning_rate": 0.00014211743772241995, + "loss": 2.8128, + "step": 3263 + }, + { + "epoch": 1.4506666666666668, + "grad_norm": 1.5712720155715942, + "learning_rate": 0.0001420996441281139, + "loss": 2.3628, + "step": 3264 + }, + { + "epoch": 1.451111111111111, + "grad_norm": 1.8636808395385742, + "learning_rate": 0.00014208185053380784, + "loss": 2.5732, + "step": 3265 + }, + { + "epoch": 1.4515555555555555, + "grad_norm": 1.9150162935256958, + "learning_rate": 0.00014206405693950177, + "loss": 2.2289, + "step": 3266 + }, + { + "epoch": 1.452, + "grad_norm": 1.132552146911621, + "learning_rate": 0.00014204626334519572, + "loss": 0.6418, + "step": 3267 + }, + { + "epoch": 
1.4524444444444444, + "grad_norm": 1.7738691568374634, + "learning_rate": 0.00014202846975088968, + "loss": 2.2081, + "step": 3268 + }, + { + "epoch": 1.452888888888889, + "grad_norm": 1.6669455766677856, + "learning_rate": 0.00014201067615658364, + "loss": 1.912, + "step": 3269 + }, + { + "epoch": 1.4533333333333334, + "grad_norm": 1.720966100692749, + "learning_rate": 0.0001419928825622776, + "loss": 2.7566, + "step": 3270 + }, + { + "epoch": 1.4537777777777778, + "grad_norm": 1.593934178352356, + "learning_rate": 0.00014197508896797155, + "loss": 2.076, + "step": 3271 + }, + { + "epoch": 1.4542222222222223, + "grad_norm": 1.9219107627868652, + "learning_rate": 0.00014195729537366548, + "loss": 2.3327, + "step": 3272 + }, + { + "epoch": 1.4546666666666668, + "grad_norm": 1.694810390472412, + "learning_rate": 0.00014193950177935943, + "loss": 2.2732, + "step": 3273 + }, + { + "epoch": 1.455111111111111, + "grad_norm": 2.009329319000244, + "learning_rate": 0.0001419217081850534, + "loss": 1.7474, + "step": 3274 + }, + { + "epoch": 1.4555555555555555, + "grad_norm": 1.8578834533691406, + "learning_rate": 0.00014190391459074735, + "loss": 2.2934, + "step": 3275 + }, + { + "epoch": 1.456, + "grad_norm": 1.8823623657226562, + "learning_rate": 0.0001418861209964413, + "loss": 2.2761, + "step": 3276 + }, + { + "epoch": 1.4564444444444444, + "grad_norm": 1.817597508430481, + "learning_rate": 0.00014186832740213526, + "loss": 1.9425, + "step": 3277 + }, + { + "epoch": 1.456888888888889, + "grad_norm": 2.0119707584381104, + "learning_rate": 0.0001418505338078292, + "loss": 2.2159, + "step": 3278 + }, + { + "epoch": 1.4573333333333334, + "grad_norm": 1.5498830080032349, + "learning_rate": 0.00014183274021352312, + "loss": 1.4838, + "step": 3279 + }, + { + "epoch": 1.4577777777777778, + "grad_norm": 1.7245508432388306, + "learning_rate": 0.00014181494661921708, + "loss": 2.362, + "step": 3280 + }, + { + "epoch": 1.458222222222222, + "grad_norm": 1.6693973541259766, + "learning_rate": 0.00014179715302491103, + "loss": 2.0165, + "step": 3281 + }, + { + "epoch": 1.4586666666666668, + "grad_norm": 1.8107409477233887, + "learning_rate": 0.000141779359430605, + "loss": 2.0808, + "step": 3282 + }, + { + "epoch": 1.459111111111111, + "grad_norm": 1.8532699346542358, + "learning_rate": 0.00014176156583629894, + "loss": 2.2752, + "step": 3283 + }, + { + "epoch": 1.4595555555555555, + "grad_norm": 1.8230167627334595, + "learning_rate": 0.0001417437722419929, + "loss": 2.1949, + "step": 3284 + }, + { + "epoch": 1.46, + "grad_norm": 2.083483934402466, + "learning_rate": 0.00014172597864768683, + "loss": 2.4207, + "step": 3285 + }, + { + "epoch": 1.4604444444444444, + "grad_norm": 2.386479377746582, + "learning_rate": 0.0001417081850533808, + "loss": 2.4775, + "step": 3286 + }, + { + "epoch": 1.460888888888889, + "grad_norm": 1.8848226070404053, + "learning_rate": 0.00014169039145907474, + "loss": 2.1974, + "step": 3287 + }, + { + "epoch": 1.4613333333333334, + "grad_norm": 1.7829835414886475, + "learning_rate": 0.0001416725978647687, + "loss": 1.9449, + "step": 3288 + }, + { + "epoch": 1.4617777777777778, + "grad_norm": 1.645369529724121, + "learning_rate": 0.00014165480427046266, + "loss": 1.5355, + "step": 3289 + }, + { + "epoch": 1.462222222222222, + "grad_norm": 2.256089687347412, + "learning_rate": 0.0001416370106761566, + "loss": 2.5316, + "step": 3290 + }, + { + "epoch": 1.4626666666666668, + "grad_norm": 2.052887201309204, + "learning_rate": 0.00014161921708185054, + "loss": 2.3761, + "step": 3291 + }, + 
{ + "epoch": 1.463111111111111, + "grad_norm": 2.1068687438964844, + "learning_rate": 0.00014160142348754447, + "loss": 0.9723, + "step": 3292 + }, + { + "epoch": 1.4635555555555555, + "grad_norm": 2.4331436157226562, + "learning_rate": 0.00014158362989323843, + "loss": 2.1379, + "step": 3293 + }, + { + "epoch": 1.464, + "grad_norm": 1.9529526233673096, + "learning_rate": 0.00014156583629893239, + "loss": 2.0011, + "step": 3294 + }, + { + "epoch": 1.4644444444444444, + "grad_norm": 2.1232481002807617, + "learning_rate": 0.00014154804270462634, + "loss": 1.8544, + "step": 3295 + }, + { + "epoch": 1.464888888888889, + "grad_norm": 2.0463287830352783, + "learning_rate": 0.0001415302491103203, + "loss": 2.1042, + "step": 3296 + }, + { + "epoch": 1.4653333333333334, + "grad_norm": 2.2398715019226074, + "learning_rate": 0.00014151245551601425, + "loss": 2.389, + "step": 3297 + }, + { + "epoch": 1.4657777777777778, + "grad_norm": 2.3587806224823, + "learning_rate": 0.00014149466192170818, + "loss": 2.2351, + "step": 3298 + }, + { + "epoch": 1.466222222222222, + "grad_norm": 2.272650957107544, + "learning_rate": 0.00014147686832740214, + "loss": 2.1772, + "step": 3299 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 1.661880373954773, + "learning_rate": 0.0001414590747330961, + "loss": 1.2369, + "step": 3300 + }, + { + "epoch": 1.467111111111111, + "grad_norm": 1.3112317323684692, + "learning_rate": 0.00014144128113879005, + "loss": 2.3523, + "step": 3301 + }, + { + "epoch": 1.4675555555555555, + "grad_norm": 1.5207730531692505, + "learning_rate": 0.000141423487544484, + "loss": 2.3686, + "step": 3302 + }, + { + "epoch": 1.468, + "grad_norm": 1.6390271186828613, + "learning_rate": 0.00014140569395017797, + "loss": 0.0503, + "step": 3303 + }, + { + "epoch": 1.4684444444444444, + "grad_norm": 1.316325068473816, + "learning_rate": 0.0001413879003558719, + "loss": 2.2079, + "step": 3304 + }, + { + "epoch": 1.468888888888889, + "grad_norm": 1.6727405786514282, + "learning_rate": 0.00014137010676156583, + "loss": 2.8864, + "step": 3305 + }, + { + "epoch": 1.4693333333333334, + "grad_norm": 1.413974404335022, + "learning_rate": 0.00014135231316725978, + "loss": 1.4162, + "step": 3306 + }, + { + "epoch": 1.4697777777777778, + "grad_norm": 1.4699324369430542, + "learning_rate": 0.00014133451957295374, + "loss": 2.0043, + "step": 3307 + }, + { + "epoch": 1.470222222222222, + "grad_norm": 1.9841383695602417, + "learning_rate": 0.0001413167259786477, + "loss": 2.5486, + "step": 3308 + }, + { + "epoch": 1.4706666666666668, + "grad_norm": 1.5038107633590698, + "learning_rate": 0.00014129893238434165, + "loss": 2.3013, + "step": 3309 + }, + { + "epoch": 1.471111111111111, + "grad_norm": 1.5381580591201782, + "learning_rate": 0.0001412811387900356, + "loss": 2.1017, + "step": 3310 + }, + { + "epoch": 1.4715555555555555, + "grad_norm": 1.4826030731201172, + "learning_rate": 0.00014126334519572954, + "loss": 2.4797, + "step": 3311 + }, + { + "epoch": 1.472, + "grad_norm": 1.4599792957305908, + "learning_rate": 0.0001412455516014235, + "loss": 1.9834, + "step": 3312 + }, + { + "epoch": 1.4724444444444444, + "grad_norm": 1.4552773237228394, + "learning_rate": 0.00014122775800711745, + "loss": 2.2435, + "step": 3313 + }, + { + "epoch": 1.472888888888889, + "grad_norm": 1.4118120670318604, + "learning_rate": 0.0001412099644128114, + "loss": 1.735, + "step": 3314 + }, + { + "epoch": 1.4733333333333334, + "grad_norm": 1.5949891805648804, + "learning_rate": 0.00014119217081850536, + "loss": 1.9896, + "step": 3315 + 
}, + { + "epoch": 1.4737777777777779, + "grad_norm": 1.7730779647827148, + "learning_rate": 0.0001411743772241993, + "loss": 2.0223, + "step": 3316 + }, + { + "epoch": 1.474222222222222, + "grad_norm": 1.7242622375488281, + "learning_rate": 0.00014115658362989322, + "loss": 2.3709, + "step": 3317 + }, + { + "epoch": 1.4746666666666668, + "grad_norm": 1.8402231931686401, + "learning_rate": 0.00014113879003558718, + "loss": 2.3489, + "step": 3318 + }, + { + "epoch": 1.475111111111111, + "grad_norm": 1.328906774520874, + "learning_rate": 0.00014112099644128113, + "loss": 1.0891, + "step": 3319 + }, + { + "epoch": 1.4755555555555555, + "grad_norm": 1.8179643154144287, + "learning_rate": 0.0001411032028469751, + "loss": 2.6636, + "step": 3320 + }, + { + "epoch": 1.476, + "grad_norm": 1.6425268650054932, + "learning_rate": 0.00014108540925266905, + "loss": 1.9906, + "step": 3321 + }, + { + "epoch": 1.4764444444444444, + "grad_norm": 1.7107009887695312, + "learning_rate": 0.000141067615658363, + "loss": 2.066, + "step": 3322 + }, + { + "epoch": 1.476888888888889, + "grad_norm": 1.7221518754959106, + "learning_rate": 0.00014104982206405693, + "loss": 2.4627, + "step": 3323 + }, + { + "epoch": 1.4773333333333334, + "grad_norm": 1.9194080829620361, + "learning_rate": 0.0001410320284697509, + "loss": 2.2223, + "step": 3324 + }, + { + "epoch": 1.4777777777777779, + "grad_norm": 1.7251501083374023, + "learning_rate": 0.00014101423487544485, + "loss": 1.902, + "step": 3325 + }, + { + "epoch": 1.478222222222222, + "grad_norm": 1.7051130533218384, + "learning_rate": 0.0001409964412811388, + "loss": 2.2801, + "step": 3326 + }, + { + "epoch": 1.4786666666666668, + "grad_norm": 2.0462424755096436, + "learning_rate": 0.00014097864768683276, + "loss": 2.2611, + "step": 3327 + }, + { + "epoch": 1.479111111111111, + "grad_norm": 1.6997382640838623, + "learning_rate": 0.00014096085409252672, + "loss": 2.0672, + "step": 3328 + }, + { + "epoch": 1.4795555555555555, + "grad_norm": 1.790878415107727, + "learning_rate": 0.00014094306049822065, + "loss": 1.866, + "step": 3329 + }, + { + "epoch": 1.48, + "grad_norm": 2.1614255905151367, + "learning_rate": 0.00014092526690391458, + "loss": 2.691, + "step": 3330 + }, + { + "epoch": 1.4804444444444445, + "grad_norm": 1.456119418144226, + "learning_rate": 0.00014090747330960853, + "loss": 0.8112, + "step": 3331 + }, + { + "epoch": 1.480888888888889, + "grad_norm": 1.8652554750442505, + "learning_rate": 0.0001408896797153025, + "loss": 1.9741, + "step": 3332 + }, + { + "epoch": 1.4813333333333334, + "grad_norm": 1.9750827550888062, + "learning_rate": 0.00014087188612099644, + "loss": 2.1358, + "step": 3333 + }, + { + "epoch": 1.4817777777777779, + "grad_norm": 2.0224530696868896, + "learning_rate": 0.0001408540925266904, + "loss": 2.2299, + "step": 3334 + }, + { + "epoch": 1.482222222222222, + "grad_norm": 2.1135432720184326, + "learning_rate": 0.00014083629893238436, + "loss": 2.2098, + "step": 3335 + }, + { + "epoch": 1.4826666666666668, + "grad_norm": 1.8072072267532349, + "learning_rate": 0.0001408185053380783, + "loss": 1.9179, + "step": 3336 + }, + { + "epoch": 1.483111111111111, + "grad_norm": 2.2474746704101562, + "learning_rate": 0.00014080071174377224, + "loss": 2.6121, + "step": 3337 + }, + { + "epoch": 1.4835555555555555, + "grad_norm": 1.997774600982666, + "learning_rate": 0.0001407829181494662, + "loss": 1.8134, + "step": 3338 + }, + { + "epoch": 1.484, + "grad_norm": 1.8671613931655884, + "learning_rate": 0.00014076512455516016, + "loss": 1.8315, + "step": 
3339 + }, + { + "epoch": 1.4844444444444445, + "grad_norm": 1.809183955192566, + "learning_rate": 0.0001407473309608541, + "loss": 1.9693, + "step": 3340 + }, + { + "epoch": 1.484888888888889, + "grad_norm": 2.1424343585968018, + "learning_rate": 0.00014072953736654807, + "loss": 1.7883, + "step": 3341 + }, + { + "epoch": 1.4853333333333334, + "grad_norm": 2.0938260555267334, + "learning_rate": 0.000140711743772242, + "loss": 2.2282, + "step": 3342 + }, + { + "epoch": 1.4857777777777779, + "grad_norm": 2.3834707736968994, + "learning_rate": 0.00014069395017793593, + "loss": 2.1236, + "step": 3343 + }, + { + "epoch": 1.4862222222222221, + "grad_norm": 2.215244770050049, + "learning_rate": 0.00014067615658362988, + "loss": 2.3009, + "step": 3344 + }, + { + "epoch": 1.4866666666666668, + "grad_norm": 1.9744479656219482, + "learning_rate": 0.00014065836298932384, + "loss": 2.0397, + "step": 3345 + }, + { + "epoch": 1.487111111111111, + "grad_norm": 2.0358409881591797, + "learning_rate": 0.0001406405693950178, + "loss": 1.7655, + "step": 3346 + }, + { + "epoch": 1.4875555555555555, + "grad_norm": 2.4060535430908203, + "learning_rate": 0.00014062277580071175, + "loss": 2.1293, + "step": 3347 + }, + { + "epoch": 1.488, + "grad_norm": 2.8066680431365967, + "learning_rate": 0.0001406049822064057, + "loss": 2.1486, + "step": 3348 + }, + { + "epoch": 1.4884444444444445, + "grad_norm": 2.352820873260498, + "learning_rate": 0.00014058718861209964, + "loss": 1.4603, + "step": 3349 + }, + { + "epoch": 1.488888888888889, + "grad_norm": 2.9096052646636963, + "learning_rate": 0.0001405693950177936, + "loss": 2.8709, + "step": 3350 + }, + { + "epoch": 1.4893333333333334, + "grad_norm": 1.335901141166687, + "learning_rate": 0.00014055160142348755, + "loss": 2.7017, + "step": 3351 + }, + { + "epoch": 1.4897777777777779, + "grad_norm": 1.4894367456436157, + "learning_rate": 0.0001405338078291815, + "loss": 2.3371, + "step": 3352 + }, + { + "epoch": 1.4902222222222221, + "grad_norm": 1.6448569297790527, + "learning_rate": 0.00014051601423487547, + "loss": 2.3155, + "step": 3353 + }, + { + "epoch": 1.4906666666666666, + "grad_norm": 1.5843652486801147, + "learning_rate": 0.00014049822064056942, + "loss": 2.0601, + "step": 3354 + }, + { + "epoch": 1.491111111111111, + "grad_norm": 1.6974892616271973, + "learning_rate": 0.00014048042704626335, + "loss": 2.651, + "step": 3355 + }, + { + "epoch": 1.4915555555555555, + "grad_norm": 1.653384804725647, + "learning_rate": 0.00014046263345195728, + "loss": 2.0151, + "step": 3356 + }, + { + "epoch": 1.492, + "grad_norm": 1.6719948053359985, + "learning_rate": 0.00014044483985765124, + "loss": 2.2734, + "step": 3357 + }, + { + "epoch": 1.4924444444444445, + "grad_norm": 1.239280104637146, + "learning_rate": 0.0001404270462633452, + "loss": 0.9993, + "step": 3358 + }, + { + "epoch": 1.492888888888889, + "grad_norm": 1.8192747831344604, + "learning_rate": 0.00014040925266903915, + "loss": 2.4339, + "step": 3359 + }, + { + "epoch": 1.4933333333333334, + "grad_norm": 1.7065457105636597, + "learning_rate": 0.0001403914590747331, + "loss": 2.1031, + "step": 3360 + }, + { + "epoch": 1.4937777777777779, + "grad_norm": 1.8094873428344727, + "learning_rate": 0.00014037366548042706, + "loss": 2.3386, + "step": 3361 + }, + { + "epoch": 1.4942222222222221, + "grad_norm": 1.534125566482544, + "learning_rate": 0.000140355871886121, + "loss": 1.7805, + "step": 3362 + }, + { + "epoch": 1.4946666666666666, + "grad_norm": 1.619163990020752, + "learning_rate": 0.00014033807829181495, + "loss": 
2.4424, + "step": 3363 + }, + { + "epoch": 1.495111111111111, + "grad_norm": 1.7250571250915527, + "learning_rate": 0.0001403202846975089, + "loss": 2.0668, + "step": 3364 + }, + { + "epoch": 1.4955555555555555, + "grad_norm": 1.4816625118255615, + "learning_rate": 0.00014030249110320286, + "loss": 2.0556, + "step": 3365 + }, + { + "epoch": 1.496, + "grad_norm": 1.794979214668274, + "learning_rate": 0.00014028469750889682, + "loss": 1.9561, + "step": 3366 + }, + { + "epoch": 1.4964444444444445, + "grad_norm": 1.855759859085083, + "learning_rate": 0.00014026690391459078, + "loss": 2.3782, + "step": 3367 + }, + { + "epoch": 1.496888888888889, + "grad_norm": 1.706645131111145, + "learning_rate": 0.0001402491103202847, + "loss": 2.3613, + "step": 3368 + }, + { + "epoch": 1.4973333333333334, + "grad_norm": 1.6514323949813843, + "learning_rate": 0.00014023131672597863, + "loss": 2.3841, + "step": 3369 + }, + { + "epoch": 1.4977777777777779, + "grad_norm": 1.9299840927124023, + "learning_rate": 0.0001402135231316726, + "loss": 1.8054, + "step": 3370 + }, + { + "epoch": 1.4982222222222221, + "grad_norm": 1.72597336769104, + "learning_rate": 0.00014019572953736655, + "loss": 2.2748, + "step": 3371 + }, + { + "epoch": 1.4986666666666666, + "grad_norm": 1.791800618171692, + "learning_rate": 0.0001401779359430605, + "loss": 1.8032, + "step": 3372 + }, + { + "epoch": 1.499111111111111, + "grad_norm": 1.779707431793213, + "learning_rate": 0.00014016014234875446, + "loss": 2.1661, + "step": 3373 + }, + { + "epoch": 1.4995555555555555, + "grad_norm": 1.7183303833007812, + "learning_rate": 0.00014014234875444842, + "loss": 1.9356, + "step": 3374 + }, + { + "epoch": 1.5, + "grad_norm": 1.637531042098999, + "learning_rate": 0.00014012455516014235, + "loss": 1.9201, + "step": 3375 + }, + { + "epoch": 1.5004444444444445, + "grad_norm": 1.4911421537399292, + "learning_rate": 0.0001401067615658363, + "loss": 1.2015, + "step": 3376 + }, + { + "epoch": 1.500888888888889, + "grad_norm": 1.6465941667556763, + "learning_rate": 0.00014008896797153026, + "loss": 1.8771, + "step": 3377 + }, + { + "epoch": 1.5013333333333332, + "grad_norm": 1.7987436056137085, + "learning_rate": 0.00014007117437722422, + "loss": 2.0635, + "step": 3378 + }, + { + "epoch": 1.5017777777777779, + "grad_norm": 1.6817963123321533, + "learning_rate": 0.00014005338078291817, + "loss": 1.8847, + "step": 3379 + }, + { + "epoch": 1.5022222222222221, + "grad_norm": 1.6715887784957886, + "learning_rate": 0.00014003558718861213, + "loss": 1.8009, + "step": 3380 + }, + { + "epoch": 1.5026666666666668, + "grad_norm": 2.189204216003418, + "learning_rate": 0.00014001779359430606, + "loss": 2.1705, + "step": 3381 + }, + { + "epoch": 1.503111111111111, + "grad_norm": 1.7621302604675293, + "learning_rate": 0.00014, + "loss": 2.3973, + "step": 3382 + }, + { + "epoch": 1.5035555555555555, + "grad_norm": 1.9194334745407104, + "learning_rate": 0.00013998220640569394, + "loss": 2.1278, + "step": 3383 + }, + { + "epoch": 1.504, + "grad_norm": 2.001845121383667, + "learning_rate": 0.0001399644128113879, + "loss": 2.6398, + "step": 3384 + }, + { + "epoch": 1.5044444444444445, + "grad_norm": 1.9402610063552856, + "learning_rate": 0.00013994661921708186, + "loss": 1.8872, + "step": 3385 + }, + { + "epoch": 1.504888888888889, + "grad_norm": 1.9671640396118164, + "learning_rate": 0.00013992882562277581, + "loss": 2.1836, + "step": 3386 + }, + { + "epoch": 1.5053333333333332, + "grad_norm": 2.1770575046539307, + "learning_rate": 0.00013991103202846977, + "loss": 2.1509, 
+ "step": 3387 + }, + { + "epoch": 1.5057777777777779, + "grad_norm": 2.059763193130493, + "learning_rate": 0.0001398932384341637, + "loss": 2.3197, + "step": 3388 + }, + { + "epoch": 1.5062222222222221, + "grad_norm": 1.899730920791626, + "learning_rate": 0.00013987544483985766, + "loss": 1.9101, + "step": 3389 + }, + { + "epoch": 1.5066666666666668, + "grad_norm": 1.6671397686004639, + "learning_rate": 0.0001398576512455516, + "loss": 1.8031, + "step": 3390 + }, + { + "epoch": 1.507111111111111, + "grad_norm": 2.2725398540496826, + "learning_rate": 0.00013983985765124557, + "loss": 2.2501, + "step": 3391 + }, + { + "epoch": 1.5075555555555555, + "grad_norm": 2.0739402770996094, + "learning_rate": 0.00013982206405693953, + "loss": 2.5371, + "step": 3392 + }, + { + "epoch": 1.508, + "grad_norm": 2.2705914974212646, + "learning_rate": 0.00013980427046263348, + "loss": 2.1084, + "step": 3393 + }, + { + "epoch": 1.5084444444444445, + "grad_norm": 1.981615662574768, + "learning_rate": 0.0001397864768683274, + "loss": 1.9832, + "step": 3394 + }, + { + "epoch": 1.508888888888889, + "grad_norm": 2.1815102100372314, + "learning_rate": 0.00013976868327402134, + "loss": 2.1987, + "step": 3395 + }, + { + "epoch": 1.5093333333333332, + "grad_norm": 1.9982526302337646, + "learning_rate": 0.0001397508896797153, + "loss": 2.1131, + "step": 3396 + }, + { + "epoch": 1.5097777777777779, + "grad_norm": 2.5864624977111816, + "learning_rate": 0.00013973309608540925, + "loss": 2.7484, + "step": 3397 + }, + { + "epoch": 1.5102222222222221, + "grad_norm": 2.7735939025878906, + "learning_rate": 0.0001397153024911032, + "loss": 2.8055, + "step": 3398 + }, + { + "epoch": 1.5106666666666668, + "grad_norm": 2.547844648361206, + "learning_rate": 0.00013969750889679717, + "loss": 2.8584, + "step": 3399 + }, + { + "epoch": 1.511111111111111, + "grad_norm": 3.193415880203247, + "learning_rate": 0.0001396797153024911, + "loss": 1.5981, + "step": 3400 + }, + { + "epoch": 1.5115555555555555, + "grad_norm": 1.2286021709442139, + "learning_rate": 0.00013966192170818505, + "loss": 2.0801, + "step": 3401 + }, + { + "epoch": 1.512, + "grad_norm": 1.3546948432922363, + "learning_rate": 0.000139644128113879, + "loss": 2.2488, + "step": 3402 + }, + { + "epoch": 1.5124444444444445, + "grad_norm": 1.3589816093444824, + "learning_rate": 0.00013962633451957297, + "loss": 2.5642, + "step": 3403 + }, + { + "epoch": 1.512888888888889, + "grad_norm": 0.9513995051383972, + "learning_rate": 0.00013960854092526692, + "loss": 0.5388, + "step": 3404 + }, + { + "epoch": 1.5133333333333332, + "grad_norm": 1.2379084825515747, + "learning_rate": 0.00013959074733096088, + "loss": 1.2631, + "step": 3405 + }, + { + "epoch": 1.5137777777777779, + "grad_norm": 1.4364656209945679, + "learning_rate": 0.0001395729537366548, + "loss": 2.2967, + "step": 3406 + }, + { + "epoch": 1.5142222222222221, + "grad_norm": 1.6733119487762451, + "learning_rate": 0.00013955516014234874, + "loss": 2.4779, + "step": 3407 + }, + { + "epoch": 1.5146666666666668, + "grad_norm": 1.715740442276001, + "learning_rate": 0.0001395373665480427, + "loss": 2.4899, + "step": 3408 + }, + { + "epoch": 1.515111111111111, + "grad_norm": 1.6031004190444946, + "learning_rate": 0.00013951957295373665, + "loss": 2.3027, + "step": 3409 + }, + { + "epoch": 1.5155555555555555, + "grad_norm": 1.6325712203979492, + "learning_rate": 0.0001395017793594306, + "loss": 1.7408, + "step": 3410 + }, + { + "epoch": 1.516, + "grad_norm": 1.7524162530899048, + "learning_rate": 0.00013948398576512456, + "loss": 
2.1623, + "step": 3411 + }, + { + "epoch": 1.5164444444444445, + "grad_norm": 1.493108868598938, + "learning_rate": 0.00013946619217081852, + "loss": 2.1662, + "step": 3412 + }, + { + "epoch": 1.516888888888889, + "grad_norm": 1.7417209148406982, + "learning_rate": 0.00013944839857651245, + "loss": 2.3913, + "step": 3413 + }, + { + "epoch": 1.5173333333333332, + "grad_norm": 1.671183705329895, + "learning_rate": 0.0001394306049822064, + "loss": 2.4024, + "step": 3414 + }, + { + "epoch": 1.517777777777778, + "grad_norm": 1.567742943763733, + "learning_rate": 0.00013941281138790036, + "loss": 1.9279, + "step": 3415 + }, + { + "epoch": 1.5182222222222221, + "grad_norm": 1.7501814365386963, + "learning_rate": 0.00013939501779359432, + "loss": 1.9253, + "step": 3416 + }, + { + "epoch": 1.5186666666666668, + "grad_norm": 1.5426925420761108, + "learning_rate": 0.00013937722419928828, + "loss": 2.021, + "step": 3417 + }, + { + "epoch": 1.519111111111111, + "grad_norm": 1.4181113243103027, + "learning_rate": 0.00013935943060498223, + "loss": 1.803, + "step": 3418 + }, + { + "epoch": 1.5195555555555555, + "grad_norm": 1.6199541091918945, + "learning_rate": 0.00013934163701067616, + "loss": 1.8904, + "step": 3419 + }, + { + "epoch": 1.52, + "grad_norm": 1.6284871101379395, + "learning_rate": 0.0001393238434163701, + "loss": 2.1618, + "step": 3420 + }, + { + "epoch": 1.5204444444444445, + "grad_norm": 2.004983425140381, + "learning_rate": 0.00013930604982206405, + "loss": 2.4985, + "step": 3421 + }, + { + "epoch": 1.520888888888889, + "grad_norm": 1.890509843826294, + "learning_rate": 0.000139288256227758, + "loss": 2.0206, + "step": 3422 + }, + { + "epoch": 1.5213333333333332, + "grad_norm": 2.1789512634277344, + "learning_rate": 0.00013927046263345196, + "loss": 1.9493, + "step": 3423 + }, + { + "epoch": 1.521777777777778, + "grad_norm": 1.6540831327438354, + "learning_rate": 0.00013925266903914592, + "loss": 2.0598, + "step": 3424 + }, + { + "epoch": 1.5222222222222221, + "grad_norm": 2.1028473377227783, + "learning_rate": 0.00013923487544483987, + "loss": 2.4147, + "step": 3425 + }, + { + "epoch": 1.5226666666666666, + "grad_norm": 2.1696505546569824, + "learning_rate": 0.0001392170818505338, + "loss": 2.3895, + "step": 3426 + }, + { + "epoch": 1.523111111111111, + "grad_norm": 2.0171515941619873, + "learning_rate": 0.00013919928825622776, + "loss": 2.0155, + "step": 3427 + }, + { + "epoch": 1.5235555555555556, + "grad_norm": 2.0088951587677, + "learning_rate": 0.00013918149466192172, + "loss": 2.1746, + "step": 3428 + }, + { + "epoch": 1.524, + "grad_norm": 1.8271888494491577, + "learning_rate": 0.00013916370106761567, + "loss": 2.2367, + "step": 3429 + }, + { + "epoch": 1.5244444444444445, + "grad_norm": 2.0240986347198486, + "learning_rate": 0.00013914590747330963, + "loss": 2.5284, + "step": 3430 + }, + { + "epoch": 1.524888888888889, + "grad_norm": 1.7613776922225952, + "learning_rate": 0.00013912811387900359, + "loss": 1.8526, + "step": 3431 + }, + { + "epoch": 1.5253333333333332, + "grad_norm": 2.417299509048462, + "learning_rate": 0.00013911032028469751, + "loss": 2.6705, + "step": 3432 + }, + { + "epoch": 1.525777777777778, + "grad_norm": 2.0129454135894775, + "learning_rate": 0.00013909252669039144, + "loss": 1.8315, + "step": 3433 + }, + { + "epoch": 1.5262222222222221, + "grad_norm": 2.165886640548706, + "learning_rate": 0.0001390747330960854, + "loss": 2.5886, + "step": 3434 + }, + { + "epoch": 1.5266666666666666, + "grad_norm": 2.1423709392547607, + "learning_rate": 
0.00013905693950177936, + "loss": 1.9441, + "step": 3435 + }, + { + "epoch": 1.527111111111111, + "grad_norm": 1.9875420331954956, + "learning_rate": 0.0001390391459074733, + "loss": 2.1317, + "step": 3436 + }, + { + "epoch": 1.5275555555555556, + "grad_norm": 2.0450797080993652, + "learning_rate": 0.00013902135231316727, + "loss": 2.0809, + "step": 3437 + }, + { + "epoch": 1.528, + "grad_norm": 2.3254358768463135, + "learning_rate": 0.00013900355871886123, + "loss": 2.5318, + "step": 3438 + }, + { + "epoch": 1.5284444444444445, + "grad_norm": 2.041480779647827, + "learning_rate": 0.00013898576512455516, + "loss": 1.9015, + "step": 3439 + }, + { + "epoch": 1.528888888888889, + "grad_norm": 1.6092534065246582, + "learning_rate": 0.0001389679715302491, + "loss": 1.5966, + "step": 3440 + }, + { + "epoch": 1.5293333333333332, + "grad_norm": 2.0477304458618164, + "learning_rate": 0.00013895017793594307, + "loss": 2.1176, + "step": 3441 + }, + { + "epoch": 1.529777777777778, + "grad_norm": 2.8084466457366943, + "learning_rate": 0.00013893238434163703, + "loss": 2.5453, + "step": 3442 + }, + { + "epoch": 1.5302222222222222, + "grad_norm": 2.2902886867523193, + "learning_rate": 0.00013891459074733098, + "loss": 2.0898, + "step": 3443 + }, + { + "epoch": 1.5306666666666666, + "grad_norm": 2.420135259628296, + "learning_rate": 0.00013889679715302494, + "loss": 2.3878, + "step": 3444 + }, + { + "epoch": 1.531111111111111, + "grad_norm": 3.70841646194458, + "learning_rate": 0.00013887900355871887, + "loss": 2.4154, + "step": 3445 + }, + { + "epoch": 1.5315555555555556, + "grad_norm": 2.6717746257781982, + "learning_rate": 0.0001388612099644128, + "loss": 2.1492, + "step": 3446 + }, + { + "epoch": 1.532, + "grad_norm": 2.6898179054260254, + "learning_rate": 0.00013884341637010675, + "loss": 2.1948, + "step": 3447 + }, + { + "epoch": 1.5324444444444445, + "grad_norm": 2.706569194793701, + "learning_rate": 0.0001388256227758007, + "loss": 2.5001, + "step": 3448 + }, + { + "epoch": 1.532888888888889, + "grad_norm": 2.5174717903137207, + "learning_rate": 0.00013880782918149467, + "loss": 2.0004, + "step": 3449 + }, + { + "epoch": 1.5333333333333332, + "grad_norm": 2.5441904067993164, + "learning_rate": 0.00013879003558718862, + "loss": 2.0386, + "step": 3450 + }, + { + "epoch": 1.533777777777778, + "grad_norm": 1.2441266775131226, + "learning_rate": 0.00013877224199288258, + "loss": 2.3708, + "step": 3451 + }, + { + "epoch": 1.5342222222222222, + "grad_norm": 1.6138944625854492, + "learning_rate": 0.0001387544483985765, + "loss": 2.2968, + "step": 3452 + }, + { + "epoch": 1.5346666666666666, + "grad_norm": 1.7886635065078735, + "learning_rate": 0.00013873665480427047, + "loss": 2.3651, + "step": 3453 + }, + { + "epoch": 1.535111111111111, + "grad_norm": 1.5253329277038574, + "learning_rate": 0.00013871886120996442, + "loss": 2.409, + "step": 3454 + }, + { + "epoch": 1.5355555555555556, + "grad_norm": 1.6435463428497314, + "learning_rate": 0.00013870106761565838, + "loss": 2.1088, + "step": 3455 + }, + { + "epoch": 1.536, + "grad_norm": 1.7751388549804688, + "learning_rate": 0.00013868327402135233, + "loss": 1.8769, + "step": 3456 + }, + { + "epoch": 1.5364444444444443, + "grad_norm": 1.4366142749786377, + "learning_rate": 0.0001386654804270463, + "loss": 1.3134, + "step": 3457 + }, + { + "epoch": 1.536888888888889, + "grad_norm": 1.9377721548080444, + "learning_rate": 0.00013864768683274022, + "loss": 2.3063, + "step": 3458 + }, + { + "epoch": 1.5373333333333332, + "grad_norm": 2.1702098846435547, + 
"learning_rate": 0.00013862989323843415, + "loss": 1.8274, + "step": 3459 + }, + { + "epoch": 1.537777777777778, + "grad_norm": 1.566851019859314, + "learning_rate": 0.0001386120996441281, + "loss": 2.2409, + "step": 3460 + }, + { + "epoch": 1.5382222222222222, + "grad_norm": 1.5840020179748535, + "learning_rate": 0.00013859430604982206, + "loss": 2.676, + "step": 3461 + }, + { + "epoch": 1.5386666666666666, + "grad_norm": 1.6341493129730225, + "learning_rate": 0.00013857651245551602, + "loss": 1.7889, + "step": 3462 + }, + { + "epoch": 1.539111111111111, + "grad_norm": 1.810591220855713, + "learning_rate": 0.00013855871886120998, + "loss": 2.312, + "step": 3463 + }, + { + "epoch": 1.5395555555555556, + "grad_norm": 1.6743979454040527, + "learning_rate": 0.00013854092526690393, + "loss": 2.1647, + "step": 3464 + }, + { + "epoch": 1.54, + "grad_norm": 1.6081904172897339, + "learning_rate": 0.00013852313167259786, + "loss": 2.3812, + "step": 3465 + }, + { + "epoch": 1.5404444444444443, + "grad_norm": 1.6036180257797241, + "learning_rate": 0.00013850533807829182, + "loss": 2.1979, + "step": 3466 + }, + { + "epoch": 1.540888888888889, + "grad_norm": 1.4913078546524048, + "learning_rate": 0.00013848754448398578, + "loss": 2.0216, + "step": 3467 + }, + { + "epoch": 1.5413333333333332, + "grad_norm": 1.6572915315628052, + "learning_rate": 0.00013846975088967973, + "loss": 2.0528, + "step": 3468 + }, + { + "epoch": 1.541777777777778, + "grad_norm": 2.2356021404266357, + "learning_rate": 0.0001384519572953737, + "loss": 2.1856, + "step": 3469 + }, + { + "epoch": 1.5422222222222222, + "grad_norm": 1.9612963199615479, + "learning_rate": 0.00013843416370106764, + "loss": 2.4308, + "step": 3470 + }, + { + "epoch": 1.5426666666666666, + "grad_norm": 1.8817795515060425, + "learning_rate": 0.00013841637010676157, + "loss": 2.1989, + "step": 3471 + }, + { + "epoch": 1.543111111111111, + "grad_norm": 1.5524616241455078, + "learning_rate": 0.0001383985765124555, + "loss": 1.9362, + "step": 3472 + }, + { + "epoch": 1.5435555555555556, + "grad_norm": 1.8420137166976929, + "learning_rate": 0.00013838078291814946, + "loss": 2.0204, + "step": 3473 + }, + { + "epoch": 1.544, + "grad_norm": 1.6466306447982788, + "learning_rate": 0.00013836298932384342, + "loss": 2.0033, + "step": 3474 + }, + { + "epoch": 1.5444444444444443, + "grad_norm": 1.7015817165374756, + "learning_rate": 0.00013834519572953737, + "loss": 1.986, + "step": 3475 + }, + { + "epoch": 1.544888888888889, + "grad_norm": 2.1093199253082275, + "learning_rate": 0.00013832740213523133, + "loss": 2.3153, + "step": 3476 + }, + { + "epoch": 1.5453333333333332, + "grad_norm": 1.966652274131775, + "learning_rate": 0.00013830960854092529, + "loss": 2.2379, + "step": 3477 + }, + { + "epoch": 1.545777777777778, + "grad_norm": 1.2320265769958496, + "learning_rate": 0.00013829181494661922, + "loss": 0.9829, + "step": 3478 + }, + { + "epoch": 1.5462222222222222, + "grad_norm": 2.0316035747528076, + "learning_rate": 0.00013827402135231317, + "loss": 2.2463, + "step": 3479 + }, + { + "epoch": 1.5466666666666666, + "grad_norm": 1.9726709127426147, + "learning_rate": 0.00013825622775800713, + "loss": 1.8361, + "step": 3480 + }, + { + "epoch": 1.547111111111111, + "grad_norm": 1.76983642578125, + "learning_rate": 0.00013823843416370108, + "loss": 2.151, + "step": 3481 + }, + { + "epoch": 1.5475555555555556, + "grad_norm": 1.8415701389312744, + "learning_rate": 0.00013822064056939504, + "loss": 2.0344, + "step": 3482 + }, + { + "epoch": 1.548, + "grad_norm": 
2.0059757232666016, + "learning_rate": 0.000138202846975089, + "loss": 2.1534, + "step": 3483 + }, + { + "epoch": 1.5484444444444443, + "grad_norm": 1.8996038436889648, + "learning_rate": 0.00013818505338078293, + "loss": 2.3525, + "step": 3484 + }, + { + "epoch": 1.548888888888889, + "grad_norm": 2.0750856399536133, + "learning_rate": 0.00013816725978647686, + "loss": 2.0834, + "step": 3485 + }, + { + "epoch": 1.5493333333333332, + "grad_norm": 2.5925469398498535, + "learning_rate": 0.0001381494661921708, + "loss": 2.6209, + "step": 3486 + }, + { + "epoch": 1.549777777777778, + "grad_norm": 2.2502434253692627, + "learning_rate": 0.00013813167259786477, + "loss": 2.4877, + "step": 3487 + }, + { + "epoch": 1.5502222222222222, + "grad_norm": 2.1007752418518066, + "learning_rate": 0.00013811387900355873, + "loss": 2.164, + "step": 3488 + }, + { + "epoch": 1.5506666666666666, + "grad_norm": 2.5511112213134766, + "learning_rate": 0.00013809608540925268, + "loss": 1.7419, + "step": 3489 + }, + { + "epoch": 1.551111111111111, + "grad_norm": 2.076032876968384, + "learning_rate": 0.0001380782918149466, + "loss": 2.2016, + "step": 3490 + }, + { + "epoch": 1.5515555555555556, + "grad_norm": 2.3464345932006836, + "learning_rate": 0.00013806049822064057, + "loss": 2.828, + "step": 3491 + }, + { + "epoch": 1.552, + "grad_norm": 2.0291099548339844, + "learning_rate": 0.00013804270462633452, + "loss": 1.6438, + "step": 3492 + }, + { + "epoch": 1.5524444444444443, + "grad_norm": 2.3025505542755127, + "learning_rate": 0.00013802491103202848, + "loss": 1.8687, + "step": 3493 + }, + { + "epoch": 1.552888888888889, + "grad_norm": 2.7315847873687744, + "learning_rate": 0.00013800711743772244, + "loss": 2.6435, + "step": 3494 + }, + { + "epoch": 1.5533333333333332, + "grad_norm": 2.493013620376587, + "learning_rate": 0.0001379893238434164, + "loss": 2.3683, + "step": 3495 + }, + { + "epoch": 1.553777777777778, + "grad_norm": 2.1778078079223633, + "learning_rate": 0.00013797153024911032, + "loss": 2.2781, + "step": 3496 + }, + { + "epoch": 1.5542222222222222, + "grad_norm": 2.7922496795654297, + "learning_rate": 0.00013795373665480425, + "loss": 2.0727, + "step": 3497 + }, + { + "epoch": 1.5546666666666666, + "grad_norm": 2.76652193069458, + "learning_rate": 0.0001379359430604982, + "loss": 1.6055, + "step": 3498 + }, + { + "epoch": 1.555111111111111, + "grad_norm": 2.2120234966278076, + "learning_rate": 0.00013791814946619217, + "loss": 1.9275, + "step": 3499 + }, + { + "epoch": 1.5555555555555556, + "grad_norm": 3.5710928440093994, + "learning_rate": 0.00013790035587188612, + "loss": 2.7359, + "step": 3500 + }, + { + "epoch": 1.556, + "grad_norm": 1.3622146844863892, + "learning_rate": 0.00013788256227758008, + "loss": 2.8565, + "step": 3501 + }, + { + "epoch": 1.5564444444444443, + "grad_norm": 1.3293352127075195, + "learning_rate": 0.00013786476868327404, + "loss": 2.1254, + "step": 3502 + }, + { + "epoch": 1.556888888888889, + "grad_norm": 1.4484037160873413, + "learning_rate": 0.00013784697508896797, + "loss": 2.5264, + "step": 3503 + }, + { + "epoch": 1.5573333333333332, + "grad_norm": 1.5083986520767212, + "learning_rate": 0.00013782918149466192, + "loss": 2.212, + "step": 3504 + }, + { + "epoch": 1.557777777777778, + "grad_norm": 1.6028261184692383, + "learning_rate": 0.00013781138790035588, + "loss": 2.1341, + "step": 3505 + }, + { + "epoch": 1.5582222222222222, + "grad_norm": 1.4829325675964355, + "learning_rate": 0.00013779359430604983, + "loss": 1.8982, + "step": 3506 + }, + { + "epoch": 
1.5586666666666666, + "grad_norm": 1.533338189125061, + "learning_rate": 0.0001377758007117438, + "loss": 2.2298, + "step": 3507 + }, + { + "epoch": 1.5591111111111111, + "grad_norm": 1.5170714855194092, + "learning_rate": 0.00013775800711743775, + "loss": 1.9902, + "step": 3508 + }, + { + "epoch": 1.5595555555555556, + "grad_norm": 1.5905641317367554, + "learning_rate": 0.00013774021352313168, + "loss": 2.1633, + "step": 3509 + }, + { + "epoch": 1.56, + "grad_norm": 1.601283311843872, + "learning_rate": 0.0001377224199288256, + "loss": 2.1138, + "step": 3510 + }, + { + "epoch": 1.5604444444444443, + "grad_norm": 1.5424052476882935, + "learning_rate": 0.00013770462633451956, + "loss": 1.9056, + "step": 3511 + }, + { + "epoch": 1.560888888888889, + "grad_norm": 1.7833845615386963, + "learning_rate": 0.00013768683274021352, + "loss": 2.0569, + "step": 3512 + }, + { + "epoch": 1.5613333333333332, + "grad_norm": 1.729130506515503, + "learning_rate": 0.00013766903914590748, + "loss": 2.2351, + "step": 3513 + }, + { + "epoch": 1.561777777777778, + "grad_norm": 1.7116620540618896, + "learning_rate": 0.00013765124555160143, + "loss": 2.7517, + "step": 3514 + }, + { + "epoch": 1.5622222222222222, + "grad_norm": 1.9057334661483765, + "learning_rate": 0.0001376334519572954, + "loss": 2.7395, + "step": 3515 + }, + { + "epoch": 1.5626666666666666, + "grad_norm": 1.9144783020019531, + "learning_rate": 0.00013761565836298932, + "loss": 2.2624, + "step": 3516 + }, + { + "epoch": 1.5631111111111111, + "grad_norm": 1.756967306137085, + "learning_rate": 0.00013759786476868327, + "loss": 1.5217, + "step": 3517 + }, + { + "epoch": 1.5635555555555556, + "grad_norm": 2.028092861175537, + "learning_rate": 0.00013758007117437723, + "loss": 2.6578, + "step": 3518 + }, + { + "epoch": 1.564, + "grad_norm": 1.8625141382217407, + "learning_rate": 0.0001375622775800712, + "loss": 2.4186, + "step": 3519 + }, + { + "epoch": 1.5644444444444443, + "grad_norm": 2.1919872760772705, + "learning_rate": 0.00013754448398576514, + "loss": 2.2043, + "step": 3520 + }, + { + "epoch": 1.564888888888889, + "grad_norm": 1.709939956665039, + "learning_rate": 0.0001375266903914591, + "loss": 2.3172, + "step": 3521 + }, + { + "epoch": 1.5653333333333332, + "grad_norm": 2.1514196395874023, + "learning_rate": 0.00013750889679715303, + "loss": 2.4595, + "step": 3522 + }, + { + "epoch": 1.565777777777778, + "grad_norm": 1.8870500326156616, + "learning_rate": 0.00013749110320284696, + "loss": 2.6702, + "step": 3523 + }, + { + "epoch": 1.5662222222222222, + "grad_norm": 1.7913631200790405, + "learning_rate": 0.00013747330960854092, + "loss": 2.3106, + "step": 3524 + }, + { + "epoch": 1.5666666666666667, + "grad_norm": 1.8012170791625977, + "learning_rate": 0.00013745551601423487, + "loss": 1.7666, + "step": 3525 + }, + { + "epoch": 1.5671111111111111, + "grad_norm": 2.001098871231079, + "learning_rate": 0.00013743772241992883, + "loss": 2.1891, + "step": 3526 + }, + { + "epoch": 1.5675555555555556, + "grad_norm": 1.766248345375061, + "learning_rate": 0.00013741992882562279, + "loss": 2.1883, + "step": 3527 + }, + { + "epoch": 1.568, + "grad_norm": 1.6869791746139526, + "learning_rate": 0.00013740213523131674, + "loss": 1.8203, + "step": 3528 + }, + { + "epoch": 1.5684444444444443, + "grad_norm": 1.724373459815979, + "learning_rate": 0.00013738434163701067, + "loss": 1.8375, + "step": 3529 + }, + { + "epoch": 1.568888888888889, + "grad_norm": 1.7977555990219116, + "learning_rate": 0.00013736654804270463, + "loss": 2.2153, + "step": 3530 + }, + { 
+ "epoch": 1.5693333333333332, + "grad_norm": 1.736857295036316, + "learning_rate": 0.00013734875444839858, + "loss": 1.7471, + "step": 3531 + }, + { + "epoch": 1.569777777777778, + "grad_norm": 1.7593656778335571, + "learning_rate": 0.00013733096085409254, + "loss": 1.7479, + "step": 3532 + }, + { + "epoch": 1.5702222222222222, + "grad_norm": 1.9257147312164307, + "learning_rate": 0.0001373131672597865, + "loss": 1.9489, + "step": 3533 + }, + { + "epoch": 1.5706666666666667, + "grad_norm": 2.0456788539886475, + "learning_rate": 0.00013729537366548045, + "loss": 2.1165, + "step": 3534 + }, + { + "epoch": 1.5711111111111111, + "grad_norm": 2.0241599082946777, + "learning_rate": 0.00013727758007117438, + "loss": 2.3391, + "step": 3535 + }, + { + "epoch": 1.5715555555555556, + "grad_norm": 2.1118271350860596, + "learning_rate": 0.0001372597864768683, + "loss": 2.3182, + "step": 3536 + }, + { + "epoch": 1.572, + "grad_norm": 2.060579776763916, + "learning_rate": 0.00013724199288256227, + "loss": 2.3438, + "step": 3537 + }, + { + "epoch": 1.5724444444444443, + "grad_norm": 1.868186354637146, + "learning_rate": 0.00013722419928825623, + "loss": 1.67, + "step": 3538 + }, + { + "epoch": 1.572888888888889, + "grad_norm": 2.3484630584716797, + "learning_rate": 0.00013720640569395018, + "loss": 2.2858, + "step": 3539 + }, + { + "epoch": 1.5733333333333333, + "grad_norm": 2.5575010776519775, + "learning_rate": 0.00013718861209964414, + "loss": 2.4188, + "step": 3540 + }, + { + "epoch": 1.573777777777778, + "grad_norm": 1.9896149635314941, + "learning_rate": 0.0001371708185053381, + "loss": 1.6996, + "step": 3541 + }, + { + "epoch": 1.5742222222222222, + "grad_norm": 2.069122791290283, + "learning_rate": 0.00013715302491103202, + "loss": 1.705, + "step": 3542 + }, + { + "epoch": 1.5746666666666667, + "grad_norm": 2.1964266300201416, + "learning_rate": 0.00013713523131672598, + "loss": 1.8919, + "step": 3543 + }, + { + "epoch": 1.5751111111111111, + "grad_norm": 2.8693044185638428, + "learning_rate": 0.00013711743772241994, + "loss": 2.5328, + "step": 3544 + }, + { + "epoch": 1.5755555555555556, + "grad_norm": 2.714355707168579, + "learning_rate": 0.0001370996441281139, + "loss": 2.4895, + "step": 3545 + }, + { + "epoch": 1.576, + "grad_norm": 2.324647903442383, + "learning_rate": 0.00013708185053380785, + "loss": 2.1343, + "step": 3546 + }, + { + "epoch": 1.5764444444444443, + "grad_norm": 2.0678861141204834, + "learning_rate": 0.0001370640569395018, + "loss": 2.1356, + "step": 3547 + }, + { + "epoch": 1.576888888888889, + "grad_norm": 1.9118576049804688, + "learning_rate": 0.00013704626334519574, + "loss": 1.1119, + "step": 3548 + }, + { + "epoch": 1.5773333333333333, + "grad_norm": 0.4530702829360962, + "learning_rate": 0.00013702846975088967, + "loss": 0.078, + "step": 3549 + }, + { + "epoch": 1.5777777777777777, + "grad_norm": 2.177344560623169, + "learning_rate": 0.00013701067615658362, + "loss": 1.7089, + "step": 3550 + }, + { + "epoch": 1.5782222222222222, + "grad_norm": 1.2385826110839844, + "learning_rate": 0.00013699288256227758, + "loss": 2.394, + "step": 3551 + }, + { + "epoch": 1.5786666666666667, + "grad_norm": 1.412482500076294, + "learning_rate": 0.00013697508896797154, + "loss": 2.4647, + "step": 3552 + }, + { + "epoch": 1.5791111111111111, + "grad_norm": 1.1037476062774658, + "learning_rate": 0.0001369572953736655, + "loss": 1.245, + "step": 3553 + }, + { + "epoch": 1.5795555555555556, + "grad_norm": 1.6340032815933228, + "learning_rate": 0.00013693950177935945, + "loss": 1.5744, + 
"step": 3554 + }, + { + "epoch": 1.58, + "grad_norm": 1.600129246711731, + "learning_rate": 0.00013692170818505338, + "loss": 2.1649, + "step": 3555 + }, + { + "epoch": 1.5804444444444443, + "grad_norm": 1.5471206903457642, + "learning_rate": 0.00013690391459074733, + "loss": 1.8665, + "step": 3556 + }, + { + "epoch": 1.580888888888889, + "grad_norm": 1.4040687084197998, + "learning_rate": 0.0001368861209964413, + "loss": 2.1955, + "step": 3557 + }, + { + "epoch": 1.5813333333333333, + "grad_norm": 1.490673542022705, + "learning_rate": 0.00013686832740213525, + "loss": 2.0096, + "step": 3558 + }, + { + "epoch": 1.5817777777777777, + "grad_norm": 1.6409083604812622, + "learning_rate": 0.0001368505338078292, + "loss": 2.2744, + "step": 3559 + }, + { + "epoch": 1.5822222222222222, + "grad_norm": 1.574512004852295, + "learning_rate": 0.00013683274021352316, + "loss": 1.9902, + "step": 3560 + }, + { + "epoch": 1.5826666666666667, + "grad_norm": 1.7083110809326172, + "learning_rate": 0.0001368149466192171, + "loss": 1.6893, + "step": 3561 + }, + { + "epoch": 1.5831111111111111, + "grad_norm": 1.8733265399932861, + "learning_rate": 0.00013679715302491102, + "loss": 2.2119, + "step": 3562 + }, + { + "epoch": 1.5835555555555556, + "grad_norm": 1.8871500492095947, + "learning_rate": 0.00013677935943060498, + "loss": 2.3291, + "step": 3563 + }, + { + "epoch": 1.584, + "grad_norm": 1.5511809587478638, + "learning_rate": 0.00013676156583629893, + "loss": 2.2039, + "step": 3564 + }, + { + "epoch": 1.5844444444444443, + "grad_norm": 1.8675401210784912, + "learning_rate": 0.0001367437722419929, + "loss": 2.0337, + "step": 3565 + }, + { + "epoch": 1.584888888888889, + "grad_norm": 1.8697566986083984, + "learning_rate": 0.00013672597864768684, + "loss": 1.5566, + "step": 3566 + }, + { + "epoch": 1.5853333333333333, + "grad_norm": 1.7386986017227173, + "learning_rate": 0.0001367081850533808, + "loss": 2.0596, + "step": 3567 + }, + { + "epoch": 1.5857777777777777, + "grad_norm": 1.9721729755401611, + "learning_rate": 0.00013669039145907473, + "loss": 2.6456, + "step": 3568 + }, + { + "epoch": 1.5862222222222222, + "grad_norm": 1.8694285154342651, + "learning_rate": 0.0001366725978647687, + "loss": 2.3915, + "step": 3569 + }, + { + "epoch": 1.5866666666666667, + "grad_norm": 2.148411512374878, + "learning_rate": 0.00013665480427046264, + "loss": 2.6203, + "step": 3570 + }, + { + "epoch": 1.5871111111111111, + "grad_norm": 1.9500958919525146, + "learning_rate": 0.0001366370106761566, + "loss": 2.3018, + "step": 3571 + }, + { + "epoch": 1.5875555555555556, + "grad_norm": 1.888933777809143, + "learning_rate": 0.00013661921708185056, + "loss": 2.2713, + "step": 3572 + }, + { + "epoch": 1.588, + "grad_norm": 2.1509106159210205, + "learning_rate": 0.0001366014234875445, + "loss": 2.2562, + "step": 3573 + }, + { + "epoch": 1.5884444444444443, + "grad_norm": 1.9218084812164307, + "learning_rate": 0.00013658362989323844, + "loss": 1.9106, + "step": 3574 + }, + { + "epoch": 1.588888888888889, + "grad_norm": 2.161742687225342, + "learning_rate": 0.00013656583629893237, + "loss": 2.061, + "step": 3575 + }, + { + "epoch": 1.5893333333333333, + "grad_norm": 1.4747031927108765, + "learning_rate": 0.00013654804270462633, + "loss": 1.0917, + "step": 3576 + }, + { + "epoch": 1.5897777777777777, + "grad_norm": 2.0540664196014404, + "learning_rate": 0.00013653024911032029, + "loss": 2.2196, + "step": 3577 + }, + { + "epoch": 1.5902222222222222, + "grad_norm": 2.018321990966797, + "learning_rate": 0.00013651245551601424, + "loss": 
2.4654, + "step": 3578 + }, + { + "epoch": 1.5906666666666667, + "grad_norm": 2.201575517654419, + "learning_rate": 0.0001364946619217082, + "loss": 2.1311, + "step": 3579 + }, + { + "epoch": 1.5911111111111111, + "grad_norm": 2.108130931854248, + "learning_rate": 0.00013647686832740213, + "loss": 2.4114, + "step": 3580 + }, + { + "epoch": 1.5915555555555554, + "grad_norm": 1.782263159751892, + "learning_rate": 0.00013645907473309608, + "loss": 1.9585, + "step": 3581 + }, + { + "epoch": 1.592, + "grad_norm": 2.01802134513855, + "learning_rate": 0.00013644128113879004, + "loss": 2.4353, + "step": 3582 + }, + { + "epoch": 1.5924444444444443, + "grad_norm": 2.023480176925659, + "learning_rate": 0.000136423487544484, + "loss": 1.6961, + "step": 3583 + }, + { + "epoch": 1.592888888888889, + "grad_norm": 2.0900518894195557, + "learning_rate": 0.00013640569395017795, + "loss": 2.1796, + "step": 3584 + }, + { + "epoch": 1.5933333333333333, + "grad_norm": 2.238590717315674, + "learning_rate": 0.0001363879003558719, + "loss": 2.2932, + "step": 3585 + }, + { + "epoch": 1.5937777777777777, + "grad_norm": 2.2192628383636475, + "learning_rate": 0.00013637010676156584, + "loss": 1.9587, + "step": 3586 + }, + { + "epoch": 1.5942222222222222, + "grad_norm": 2.1162943840026855, + "learning_rate": 0.00013635231316725977, + "loss": 2.5576, + "step": 3587 + }, + { + "epoch": 1.5946666666666667, + "grad_norm": 2.580493927001953, + "learning_rate": 0.00013633451957295373, + "loss": 2.4377, + "step": 3588 + }, + { + "epoch": 1.5951111111111111, + "grad_norm": 2.192513942718506, + "learning_rate": 0.00013631672597864768, + "loss": 2.2521, + "step": 3589 + }, + { + "epoch": 1.5955555555555554, + "grad_norm": 2.2190680503845215, + "learning_rate": 0.00013629893238434164, + "loss": 2.0167, + "step": 3590 + }, + { + "epoch": 1.596, + "grad_norm": 2.1189939975738525, + "learning_rate": 0.0001362811387900356, + "loss": 2.1514, + "step": 3591 + }, + { + "epoch": 1.5964444444444443, + "grad_norm": 1.9854724407196045, + "learning_rate": 0.00013626334519572955, + "loss": 2.4261, + "step": 3592 + }, + { + "epoch": 1.596888888888889, + "grad_norm": 2.232863426208496, + "learning_rate": 0.00013624555160142348, + "loss": 2.0627, + "step": 3593 + }, + { + "epoch": 1.5973333333333333, + "grad_norm": 2.1362996101379395, + "learning_rate": 0.00013622775800711744, + "loss": 2.0867, + "step": 3594 + }, + { + "epoch": 1.5977777777777777, + "grad_norm": 2.116586923599243, + "learning_rate": 0.0001362099644128114, + "loss": 2.0974, + "step": 3595 + }, + { + "epoch": 1.5982222222222222, + "grad_norm": 2.2818892002105713, + "learning_rate": 0.00013619217081850535, + "loss": 2.4957, + "step": 3596 + }, + { + "epoch": 1.5986666666666667, + "grad_norm": 2.1921818256378174, + "learning_rate": 0.0001361743772241993, + "loss": 2.0431, + "step": 3597 + }, + { + "epoch": 1.5991111111111111, + "grad_norm": 2.915421724319458, + "learning_rate": 0.00013615658362989326, + "loss": 2.8215, + "step": 3598 + }, + { + "epoch": 1.5995555555555554, + "grad_norm": 2.097731590270996, + "learning_rate": 0.0001361387900355872, + "loss": 1.9497, + "step": 3599 + }, + { + "epoch": 1.6, + "grad_norm": 2.766552209854126, + "learning_rate": 0.00013612099644128112, + "loss": 2.1285, + "step": 3600 + }, + { + "epoch": 1.6004444444444443, + "grad_norm": 1.9238742589950562, + "learning_rate": 0.00013610320284697508, + "loss": 2.5526, + "step": 3601 + }, + { + "epoch": 1.600888888888889, + "grad_norm": 1.162853717803955, + "learning_rate": 0.00013608540925266903, + 
"loss": 1.0674, + "step": 3602 + }, + { + "epoch": 1.6013333333333333, + "grad_norm": 1.4226114749908447, + "learning_rate": 0.000136067615658363, + "loss": 2.2656, + "step": 3603 + }, + { + "epoch": 1.6017777777777777, + "grad_norm": 1.5394558906555176, + "learning_rate": 0.00013604982206405695, + "loss": 2.1054, + "step": 3604 + }, + { + "epoch": 1.6022222222222222, + "grad_norm": 1.7394784688949585, + "learning_rate": 0.0001360320284697509, + "loss": 2.3422, + "step": 3605 + }, + { + "epoch": 1.6026666666666667, + "grad_norm": 1.7149112224578857, + "learning_rate": 0.00013601423487544483, + "loss": 2.4442, + "step": 3606 + }, + { + "epoch": 1.6031111111111112, + "grad_norm": 1.8628418445587158, + "learning_rate": 0.0001359964412811388, + "loss": 2.5289, + "step": 3607 + }, + { + "epoch": 1.6035555555555554, + "grad_norm": 1.5874834060668945, + "learning_rate": 0.00013597864768683275, + "loss": 2.4502, + "step": 3608 + }, + { + "epoch": 1.604, + "grad_norm": 1.6472437381744385, + "learning_rate": 0.0001359608540925267, + "loss": 2.1657, + "step": 3609 + }, + { + "epoch": 1.6044444444444443, + "grad_norm": 1.950655221939087, + "learning_rate": 0.00013594306049822066, + "loss": 1.9845, + "step": 3610 + }, + { + "epoch": 1.604888888888889, + "grad_norm": 1.5699740648269653, + "learning_rate": 0.00013592526690391462, + "loss": 2.5056, + "step": 3611 + }, + { + "epoch": 1.6053333333333333, + "grad_norm": 1.7249257564544678, + "learning_rate": 0.00013590747330960855, + "loss": 2.4613, + "step": 3612 + }, + { + "epoch": 1.6057777777777777, + "grad_norm": 1.6163465976715088, + "learning_rate": 0.00013588967971530248, + "loss": 2.2756, + "step": 3613 + }, + { + "epoch": 1.6062222222222222, + "grad_norm": 1.6546217203140259, + "learning_rate": 0.00013587188612099643, + "loss": 2.0813, + "step": 3614 + }, + { + "epoch": 1.6066666666666667, + "grad_norm": 1.7059136629104614, + "learning_rate": 0.0001358540925266904, + "loss": 2.2557, + "step": 3615 + }, + { + "epoch": 1.6071111111111112, + "grad_norm": 1.9633029699325562, + "learning_rate": 0.00013583629893238434, + "loss": 2.3581, + "step": 3616 + }, + { + "epoch": 1.6075555555555554, + "grad_norm": 1.5104560852050781, + "learning_rate": 0.0001358185053380783, + "loss": 1.8789, + "step": 3617 + }, + { + "epoch": 1.608, + "grad_norm": 1.7537540197372437, + "learning_rate": 0.00013580071174377226, + "loss": 2.1946, + "step": 3618 + }, + { + "epoch": 1.6084444444444443, + "grad_norm": 1.8772600889205933, + "learning_rate": 0.0001357829181494662, + "loss": 2.3868, + "step": 3619 + }, + { + "epoch": 1.608888888888889, + "grad_norm": 1.7722148895263672, + "learning_rate": 0.00013576512455516014, + "loss": 2.0009, + "step": 3620 + }, + { + "epoch": 1.6093333333333333, + "grad_norm": 1.662315011024475, + "learning_rate": 0.0001357473309608541, + "loss": 1.721, + "step": 3621 + }, + { + "epoch": 1.6097777777777778, + "grad_norm": 1.9207539558410645, + "learning_rate": 0.00013572953736654806, + "loss": 2.4827, + "step": 3622 + }, + { + "epoch": 1.6102222222222222, + "grad_norm": 1.8683892488479614, + "learning_rate": 0.000135711743772242, + "loss": 2.2517, + "step": 3623 + }, + { + "epoch": 1.6106666666666667, + "grad_norm": 1.8369477987289429, + "learning_rate": 0.00013569395017793597, + "loss": 2.0752, + "step": 3624 + }, + { + "epoch": 1.6111111111111112, + "grad_norm": 1.9077363014221191, + "learning_rate": 0.0001356761565836299, + "loss": 1.9962, + "step": 3625 + }, + { + "epoch": 1.6115555555555554, + "grad_norm": 1.9185972213745117, + "learning_rate": 
0.00013565836298932383, + "loss": 1.8832, + "step": 3626 + }, + { + "epoch": 1.612, + "grad_norm": 1.9970186948776245, + "learning_rate": 0.00013564056939501778, + "loss": 2.2276, + "step": 3627 + }, + { + "epoch": 1.6124444444444443, + "grad_norm": 1.8423793315887451, + "learning_rate": 0.00013562277580071174, + "loss": 2.3652, + "step": 3628 + }, + { + "epoch": 1.612888888888889, + "grad_norm": 2.101058006286621, + "learning_rate": 0.0001356049822064057, + "loss": 2.3492, + "step": 3629 + }, + { + "epoch": 1.6133333333333333, + "grad_norm": 2.483633279800415, + "learning_rate": 0.00013558718861209965, + "loss": 2.4914, + "step": 3630 + }, + { + "epoch": 1.6137777777777778, + "grad_norm": 2.1490073204040527, + "learning_rate": 0.0001355693950177936, + "loss": 1.7586, + "step": 3631 + }, + { + "epoch": 1.6142222222222222, + "grad_norm": 2.103785276412964, + "learning_rate": 0.00013555160142348754, + "loss": 2.379, + "step": 3632 + }, + { + "epoch": 1.6146666666666667, + "grad_norm": 2.0692965984344482, + "learning_rate": 0.0001355338078291815, + "loss": 2.0689, + "step": 3633 + }, + { + "epoch": 1.6151111111111112, + "grad_norm": 2.025420665740967, + "learning_rate": 0.00013551601423487545, + "loss": 2.1845, + "step": 3634 + }, + { + "epoch": 1.6155555555555554, + "grad_norm": 1.7488460540771484, + "learning_rate": 0.0001354982206405694, + "loss": 1.6538, + "step": 3635 + }, + { + "epoch": 1.616, + "grad_norm": 2.308826446533203, + "learning_rate": 0.00013548042704626337, + "loss": 2.2989, + "step": 3636 + }, + { + "epoch": 1.6164444444444444, + "grad_norm": 2.220733404159546, + "learning_rate": 0.00013546263345195732, + "loss": 2.6318, + "step": 3637 + }, + { + "epoch": 1.616888888888889, + "grad_norm": 2.038041830062866, + "learning_rate": 0.00013544483985765125, + "loss": 1.8199, + "step": 3638 + }, + { + "epoch": 1.6173333333333333, + "grad_norm": 2.3639976978302, + "learning_rate": 0.00013542704626334518, + "loss": 1.7709, + "step": 3639 + }, + { + "epoch": 1.6177777777777778, + "grad_norm": 2.1797988414764404, + "learning_rate": 0.00013540925266903914, + "loss": 1.9726, + "step": 3640 + }, + { + "epoch": 1.6182222222222222, + "grad_norm": 2.5964488983154297, + "learning_rate": 0.0001353914590747331, + "loss": 2.5678, + "step": 3641 + }, + { + "epoch": 1.6186666666666667, + "grad_norm": 2.0529944896698, + "learning_rate": 0.00013537366548042705, + "loss": 1.9994, + "step": 3642 + }, + { + "epoch": 1.6191111111111112, + "grad_norm": 1.9986488819122314, + "learning_rate": 0.000135355871886121, + "loss": 2.1701, + "step": 3643 + }, + { + "epoch": 1.6195555555555554, + "grad_norm": 2.469334840774536, + "learning_rate": 0.00013533807829181496, + "loss": 2.5633, + "step": 3644 + }, + { + "epoch": 1.62, + "grad_norm": 2.4147121906280518, + "learning_rate": 0.0001353202846975089, + "loss": 2.2908, + "step": 3645 + }, + { + "epoch": 1.6204444444444444, + "grad_norm": 2.1340951919555664, + "learning_rate": 0.00013530249110320285, + "loss": 1.8365, + "step": 3646 + }, + { + "epoch": 1.620888888888889, + "grad_norm": 2.6548309326171875, + "learning_rate": 0.0001352846975088968, + "loss": 2.6132, + "step": 3647 + }, + { + "epoch": 1.6213333333333333, + "grad_norm": 2.923912286758423, + "learning_rate": 0.00013526690391459076, + "loss": 2.6588, + "step": 3648 + }, + { + "epoch": 1.6217777777777778, + "grad_norm": 2.134868621826172, + "learning_rate": 0.00013524911032028472, + "loss": 0.9781, + "step": 3649 + }, + { + "epoch": 1.6222222222222222, + "grad_norm": 3.1165969371795654, + "learning_rate": 
0.00013523131672597868, + "loss": 1.6002, + "step": 3650 + }, + { + "epoch": 1.6226666666666667, + "grad_norm": 1.4649471044540405, + "learning_rate": 0.0001352135231316726, + "loss": 0.9932, + "step": 3651 + }, + { + "epoch": 1.6231111111111112, + "grad_norm": 1.4319626092910767, + "learning_rate": 0.00013519572953736653, + "loss": 2.754, + "step": 3652 + }, + { + "epoch": 1.6235555555555554, + "grad_norm": 1.6543512344360352, + "learning_rate": 0.0001351779359430605, + "loss": 2.3402, + "step": 3653 + }, + { + "epoch": 1.624, + "grad_norm": 1.540387511253357, + "learning_rate": 0.00013516014234875445, + "loss": 2.5623, + "step": 3654 + }, + { + "epoch": 1.6244444444444444, + "grad_norm": 1.4166914224624634, + "learning_rate": 0.0001351423487544484, + "loss": 1.6858, + "step": 3655 + }, + { + "epoch": 1.624888888888889, + "grad_norm": 2.0138063430786133, + "learning_rate": 0.00013512455516014236, + "loss": 2.6032, + "step": 3656 + }, + { + "epoch": 1.6253333333333333, + "grad_norm": 1.8225046396255493, + "learning_rate": 0.00013510676156583632, + "loss": 2.6059, + "step": 3657 + }, + { + "epoch": 1.6257777777777778, + "grad_norm": 1.8486915826797485, + "learning_rate": 0.00013508896797153025, + "loss": 2.212, + "step": 3658 + }, + { + "epoch": 1.6262222222222222, + "grad_norm": 1.6416893005371094, + "learning_rate": 0.0001350711743772242, + "loss": 2.4992, + "step": 3659 + }, + { + "epoch": 1.6266666666666667, + "grad_norm": 1.5142600536346436, + "learning_rate": 0.00013505338078291816, + "loss": 2.5075, + "step": 3660 + }, + { + "epoch": 1.6271111111111112, + "grad_norm": 1.2331926822662354, + "learning_rate": 0.00013503558718861212, + "loss": 0.9244, + "step": 3661 + }, + { + "epoch": 1.6275555555555554, + "grad_norm": 1.6997432708740234, + "learning_rate": 0.00013501779359430607, + "loss": 2.1417, + "step": 3662 + }, + { + "epoch": 1.6280000000000001, + "grad_norm": 1.8140586614608765, + "learning_rate": 0.00013500000000000003, + "loss": 2.3071, + "step": 3663 + }, + { + "epoch": 1.6284444444444444, + "grad_norm": 1.7522242069244385, + "learning_rate": 0.00013498220640569396, + "loss": 1.7328, + "step": 3664 + }, + { + "epoch": 1.628888888888889, + "grad_norm": 1.7086604833602905, + "learning_rate": 0.0001349644128113879, + "loss": 2.15, + "step": 3665 + }, + { + "epoch": 1.6293333333333333, + "grad_norm": 1.699811577796936, + "learning_rate": 0.00013494661921708184, + "loss": 2.1629, + "step": 3666 + }, + { + "epoch": 1.6297777777777778, + "grad_norm": 1.6811929941177368, + "learning_rate": 0.0001349288256227758, + "loss": 1.7282, + "step": 3667 + }, + { + "epoch": 1.6302222222222222, + "grad_norm": 1.8707060813903809, + "learning_rate": 0.00013491103202846976, + "loss": 2.4604, + "step": 3668 + }, + { + "epoch": 1.6306666666666667, + "grad_norm": 1.7901276350021362, + "learning_rate": 0.00013489323843416371, + "loss": 2.1453, + "step": 3669 + }, + { + "epoch": 1.6311111111111112, + "grad_norm": 1.7635587453842163, + "learning_rate": 0.00013487544483985764, + "loss": 2.688, + "step": 3670 + }, + { + "epoch": 1.6315555555555554, + "grad_norm": 2.1348886489868164, + "learning_rate": 0.0001348576512455516, + "loss": 2.6815, + "step": 3671 + }, + { + "epoch": 1.6320000000000001, + "grad_norm": 1.738974690437317, + "learning_rate": 0.00013483985765124556, + "loss": 2.1772, + "step": 3672 + }, + { + "epoch": 1.6324444444444444, + "grad_norm": 1.7756551504135132, + "learning_rate": 0.0001348220640569395, + "loss": 2.3101, + "step": 3673 + }, + { + "epoch": 1.6328888888888888, + "grad_norm": 
1.9206149578094482, + "learning_rate": 0.00013480427046263347, + "loss": 2.318, + "step": 3674 + }, + { + "epoch": 1.6333333333333333, + "grad_norm": 1.740354299545288, + "learning_rate": 0.00013478647686832743, + "loss": 1.6665, + "step": 3675 + }, + { + "epoch": 1.6337777777777778, + "grad_norm": 1.8040590286254883, + "learning_rate": 0.00013476868327402135, + "loss": 1.9037, + "step": 3676 + }, + { + "epoch": 1.6342222222222222, + "grad_norm": 1.8580551147460938, + "learning_rate": 0.0001347508896797153, + "loss": 1.9794, + "step": 3677 + }, + { + "epoch": 1.6346666666666667, + "grad_norm": 1.9417858123779297, + "learning_rate": 0.00013473309608540924, + "loss": 2.0177, + "step": 3678 + }, + { + "epoch": 1.6351111111111112, + "grad_norm": 1.7505145072937012, + "learning_rate": 0.0001347153024911032, + "loss": 2.1137, + "step": 3679 + }, + { + "epoch": 1.6355555555555554, + "grad_norm": 2.044389486312866, + "learning_rate": 0.00013469750889679715, + "loss": 2.5816, + "step": 3680 + }, + { + "epoch": 1.6360000000000001, + "grad_norm": 1.7209917306900024, + "learning_rate": 0.0001346797153024911, + "loss": 1.2114, + "step": 3681 + }, + { + "epoch": 1.6364444444444444, + "grad_norm": 0.24984845519065857, + "learning_rate": 0.00013466192170818507, + "loss": 0.037, + "step": 3682 + }, + { + "epoch": 1.6368888888888888, + "grad_norm": 2.0459017753601074, + "learning_rate": 0.000134644128113879, + "loss": 2.2279, + "step": 3683 + }, + { + "epoch": 1.6373333333333333, + "grad_norm": 2.004143714904785, + "learning_rate": 0.00013462633451957295, + "loss": 2.1229, + "step": 3684 + }, + { + "epoch": 1.6377777777777778, + "grad_norm": 1.9758665561676025, + "learning_rate": 0.0001346085409252669, + "loss": 2.0448, + "step": 3685 + }, + { + "epoch": 1.6382222222222222, + "grad_norm": 2.130927324295044, + "learning_rate": 0.00013459074733096087, + "loss": 2.2224, + "step": 3686 + }, + { + "epoch": 1.6386666666666667, + "grad_norm": 2.007413148880005, + "learning_rate": 0.00013457295373665482, + "loss": 2.2341, + "step": 3687 + }, + { + "epoch": 1.6391111111111112, + "grad_norm": 2.422636032104492, + "learning_rate": 0.00013455516014234878, + "loss": 2.4104, + "step": 3688 + }, + { + "epoch": 1.6395555555555554, + "grad_norm": 2.4192779064178467, + "learning_rate": 0.0001345373665480427, + "loss": 2.3782, + "step": 3689 + }, + { + "epoch": 1.6400000000000001, + "grad_norm": 1.9421988725662231, + "learning_rate": 0.00013451957295373666, + "loss": 2.1848, + "step": 3690 + }, + { + "epoch": 1.6404444444444444, + "grad_norm": 2.0658280849456787, + "learning_rate": 0.0001345017793594306, + "loss": 1.9037, + "step": 3691 + }, + { + "epoch": 1.6408888888888888, + "grad_norm": 2.1715986728668213, + "learning_rate": 0.00013448398576512455, + "loss": 2.5529, + "step": 3692 + }, + { + "epoch": 1.6413333333333333, + "grad_norm": 2.3295042514801025, + "learning_rate": 0.0001344661921708185, + "loss": 2.2945, + "step": 3693 + }, + { + "epoch": 1.6417777777777778, + "grad_norm": 2.211021661758423, + "learning_rate": 0.00013444839857651246, + "loss": 2.2324, + "step": 3694 + }, + { + "epoch": 1.6422222222222222, + "grad_norm": 2.5660417079925537, + "learning_rate": 0.00013443060498220642, + "loss": 2.6198, + "step": 3695 + }, + { + "epoch": 1.6426666666666667, + "grad_norm": 2.0584888458251953, + "learning_rate": 0.00013441281138790035, + "loss": 1.9456, + "step": 3696 + }, + { + "epoch": 1.6431111111111112, + "grad_norm": 2.6596102714538574, + "learning_rate": 0.0001343950177935943, + "loss": 3.0825, + "step": 3697 + 
}, + { + "epoch": 1.6435555555555554, + "grad_norm": 2.368485689163208, + "learning_rate": 0.00013437722419928826, + "loss": 2.4511, + "step": 3698 + }, + { + "epoch": 1.6440000000000001, + "grad_norm": 2.7313435077667236, + "learning_rate": 0.00013435943060498222, + "loss": 1.0798, + "step": 3699 + }, + { + "epoch": 1.6444444444444444, + "grad_norm": 2.3031601905822754, + "learning_rate": 0.00013434163701067618, + "loss": 2.4437, + "step": 3700 + }, + { + "epoch": 1.6448888888888888, + "grad_norm": 1.3070570230484009, + "learning_rate": 0.00013432384341637013, + "loss": 2.4191, + "step": 3701 + }, + { + "epoch": 1.6453333333333333, + "grad_norm": 1.4076443910598755, + "learning_rate": 0.00013430604982206406, + "loss": 2.2877, + "step": 3702 + }, + { + "epoch": 1.6457777777777778, + "grad_norm": 1.5221251249313354, + "learning_rate": 0.000134288256227758, + "loss": 2.6039, + "step": 3703 + }, + { + "epoch": 1.6462222222222223, + "grad_norm": 1.4921314716339111, + "learning_rate": 0.00013427046263345195, + "loss": 2.7059, + "step": 3704 + }, + { + "epoch": 1.6466666666666665, + "grad_norm": 1.7720656394958496, + "learning_rate": 0.0001342526690391459, + "loss": 2.9481, + "step": 3705 + }, + { + "epoch": 1.6471111111111112, + "grad_norm": 1.6238889694213867, + "learning_rate": 0.00013423487544483986, + "loss": 1.6842, + "step": 3706 + }, + { + "epoch": 1.6475555555555554, + "grad_norm": 1.5820468664169312, + "learning_rate": 0.00013421708185053382, + "loss": 2.4521, + "step": 3707 + }, + { + "epoch": 1.6480000000000001, + "grad_norm": 1.584951639175415, + "learning_rate": 0.00013419928825622777, + "loss": 1.9391, + "step": 3708 + }, + { + "epoch": 1.6484444444444444, + "grad_norm": 1.8263479471206665, + "learning_rate": 0.0001341814946619217, + "loss": 2.4315, + "step": 3709 + }, + { + "epoch": 1.6488888888888888, + "grad_norm": 1.4678611755371094, + "learning_rate": 0.00013416370106761566, + "loss": 1.9397, + "step": 3710 + }, + { + "epoch": 1.6493333333333333, + "grad_norm": 1.4494836330413818, + "learning_rate": 0.00013414590747330962, + "loss": 1.6722, + "step": 3711 + }, + { + "epoch": 1.6497777777777778, + "grad_norm": 1.7007122039794922, + "learning_rate": 0.00013412811387900357, + "loss": 2.412, + "step": 3712 + }, + { + "epoch": 1.6502222222222223, + "grad_norm": 1.7339609861373901, + "learning_rate": 0.00013411032028469753, + "loss": 2.1592, + "step": 3713 + }, + { + "epoch": 1.6506666666666665, + "grad_norm": 1.8747652769088745, + "learning_rate": 0.00013409252669039148, + "loss": 2.8591, + "step": 3714 + }, + { + "epoch": 1.6511111111111112, + "grad_norm": 1.7554471492767334, + "learning_rate": 0.00013407473309608541, + "loss": 2.3202, + "step": 3715 + }, + { + "epoch": 1.6515555555555554, + "grad_norm": 1.5907261371612549, + "learning_rate": 0.00013405693950177934, + "loss": 1.7988, + "step": 3716 + }, + { + "epoch": 1.6520000000000001, + "grad_norm": 1.1176869869232178, + "learning_rate": 0.0001340391459074733, + "loss": 0.8156, + "step": 3717 + }, + { + "epoch": 1.6524444444444444, + "grad_norm": 1.6439223289489746, + "learning_rate": 0.00013402135231316726, + "loss": 2.4484, + "step": 3718 + }, + { + "epoch": 1.6528888888888889, + "grad_norm": 1.4052650928497314, + "learning_rate": 0.0001340035587188612, + "loss": 1.775, + "step": 3719 + }, + { + "epoch": 1.6533333333333333, + "grad_norm": 1.7691340446472168, + "learning_rate": 0.00013398576512455517, + "loss": 2.0681, + "step": 3720 + }, + { + "epoch": 1.6537777777777778, + "grad_norm": 1.88296377658844, + "learning_rate": 
0.00013396797153024913, + "loss": 2.1395, + "step": 3721 + }, + { + "epoch": 1.6542222222222223, + "grad_norm": 1.7832571268081665, + "learning_rate": 0.00013395017793594306, + "loss": 1.7583, + "step": 3722 + }, + { + "epoch": 1.6546666666666665, + "grad_norm": 1.8587702512741089, + "learning_rate": 0.000133932384341637, + "loss": 1.9886, + "step": 3723 + }, + { + "epoch": 1.6551111111111112, + "grad_norm": 1.7117775678634644, + "learning_rate": 0.00013391459074733097, + "loss": 2.3072, + "step": 3724 + }, + { + "epoch": 1.6555555555555554, + "grad_norm": 1.7575953006744385, + "learning_rate": 0.00013389679715302493, + "loss": 2.2971, + "step": 3725 + }, + { + "epoch": 1.6560000000000001, + "grad_norm": 1.789642333984375, + "learning_rate": 0.00013387900355871888, + "loss": 1.8977, + "step": 3726 + }, + { + "epoch": 1.6564444444444444, + "grad_norm": 1.8398528099060059, + "learning_rate": 0.00013386120996441284, + "loss": 2.1343, + "step": 3727 + }, + { + "epoch": 1.6568888888888889, + "grad_norm": 1.2515662908554077, + "learning_rate": 0.00013384341637010677, + "loss": 0.9636, + "step": 3728 + }, + { + "epoch": 1.6573333333333333, + "grad_norm": 2.197495937347412, + "learning_rate": 0.0001338256227758007, + "loss": 2.1068, + "step": 3729 + }, + { + "epoch": 1.6577777777777778, + "grad_norm": 1.9065452814102173, + "learning_rate": 0.00013380782918149465, + "loss": 2.1467, + "step": 3730 + }, + { + "epoch": 1.6582222222222223, + "grad_norm": 2.0440008640289307, + "learning_rate": 0.0001337900355871886, + "loss": 2.2524, + "step": 3731 + }, + { + "epoch": 1.6586666666666665, + "grad_norm": 2.092015266418457, + "learning_rate": 0.00013377224199288257, + "loss": 1.9448, + "step": 3732 + }, + { + "epoch": 1.6591111111111112, + "grad_norm": 2.170003652572632, + "learning_rate": 0.00013375444839857652, + "loss": 2.7868, + "step": 3733 + }, + { + "epoch": 1.6595555555555555, + "grad_norm": 2.460923910140991, + "learning_rate": 0.00013373665480427048, + "loss": 2.7106, + "step": 3734 + }, + { + "epoch": 1.6600000000000001, + "grad_norm": 2.117558717727661, + "learning_rate": 0.0001337188612099644, + "loss": 1.8442, + "step": 3735 + }, + { + "epoch": 1.6604444444444444, + "grad_norm": 1.9140143394470215, + "learning_rate": 0.00013370106761565837, + "loss": 1.7407, + "step": 3736 + }, + { + "epoch": 1.6608888888888889, + "grad_norm": 2.178762912750244, + "learning_rate": 0.00013368327402135232, + "loss": 2.0877, + "step": 3737 + }, + { + "epoch": 1.6613333333333333, + "grad_norm": 1.4736453294754028, + "learning_rate": 0.00013366548042704628, + "loss": 1.2792, + "step": 3738 + }, + { + "epoch": 1.6617777777777778, + "grad_norm": 2.096111536026001, + "learning_rate": 0.00013364768683274023, + "loss": 2.0112, + "step": 3739 + }, + { + "epoch": 1.6622222222222223, + "grad_norm": 2.047461986541748, + "learning_rate": 0.0001336298932384342, + "loss": 2.578, + "step": 3740 + }, + { + "epoch": 1.6626666666666665, + "grad_norm": 2.3218650817871094, + "learning_rate": 0.00013361209964412812, + "loss": 2.2947, + "step": 3741 + }, + { + "epoch": 1.6631111111111112, + "grad_norm": 2.492847204208374, + "learning_rate": 0.00013359430604982205, + "loss": 2.4653, + "step": 3742 + }, + { + "epoch": 1.6635555555555555, + "grad_norm": 2.1130130290985107, + "learning_rate": 0.000133576512455516, + "loss": 1.9604, + "step": 3743 + }, + { + "epoch": 1.6640000000000001, + "grad_norm": 2.3125598430633545, + "learning_rate": 0.00013355871886120996, + "loss": 2.0361, + "step": 3744 + }, + { + "epoch": 1.6644444444444444, + 
"grad_norm": 2.435710906982422, + "learning_rate": 0.00013354092526690392, + "loss": 2.6202, + "step": 3745 + }, + { + "epoch": 1.6648888888888889, + "grad_norm": 2.7004244327545166, + "learning_rate": 0.00013352313167259788, + "loss": 2.2756, + "step": 3746 + }, + { + "epoch": 1.6653333333333333, + "grad_norm": 1.9285310506820679, + "learning_rate": 0.00013350533807829183, + "loss": 1.8988, + "step": 3747 + }, + { + "epoch": 1.6657777777777778, + "grad_norm": 2.2662885189056396, + "learning_rate": 0.00013348754448398576, + "loss": 2.2077, + "step": 3748 + }, + { + "epoch": 1.6662222222222223, + "grad_norm": 2.3850321769714355, + "learning_rate": 0.00013346975088967972, + "loss": 2.4601, + "step": 3749 + }, + { + "epoch": 1.6666666666666665, + "grad_norm": 2.6833248138427734, + "learning_rate": 0.00013345195729537367, + "loss": 1.1392, + "step": 3750 + }, + { + "epoch": 1.6671111111111112, + "grad_norm": 1.3608639240264893, + "learning_rate": 0.00013343416370106763, + "loss": 2.1485, + "step": 3751 + }, + { + "epoch": 1.6675555555555555, + "grad_norm": 1.7300606966018677, + "learning_rate": 0.0001334163701067616, + "loss": 2.5823, + "step": 3752 + }, + { + "epoch": 1.6680000000000001, + "grad_norm": 1.7243146896362305, + "learning_rate": 0.00013339857651245554, + "loss": 2.4296, + "step": 3753 + }, + { + "epoch": 1.6684444444444444, + "grad_norm": 1.6703439950942993, + "learning_rate": 0.00013338078291814947, + "loss": 2.1775, + "step": 3754 + }, + { + "epoch": 1.6688888888888889, + "grad_norm": 1.81732976436615, + "learning_rate": 0.0001333629893238434, + "loss": 2.548, + "step": 3755 + }, + { + "epoch": 1.6693333333333333, + "grad_norm": 1.9755940437316895, + "learning_rate": 0.00013334519572953736, + "loss": 2.2344, + "step": 3756 + }, + { + "epoch": 1.6697777777777778, + "grad_norm": 1.8577018976211548, + "learning_rate": 0.00013332740213523132, + "loss": 2.247, + "step": 3757 + }, + { + "epoch": 1.6702222222222223, + "grad_norm": 1.7116265296936035, + "learning_rate": 0.00013330960854092527, + "loss": 2.1821, + "step": 3758 + }, + { + "epoch": 1.6706666666666665, + "grad_norm": 1.8977397680282593, + "learning_rate": 0.00013329181494661923, + "loss": 2.1784, + "step": 3759 + }, + { + "epoch": 1.6711111111111112, + "grad_norm": 2.1667838096618652, + "learning_rate": 0.00013327402135231316, + "loss": 2.4484, + "step": 3760 + }, + { + "epoch": 1.6715555555555555, + "grad_norm": 1.9198535680770874, + "learning_rate": 0.00013325622775800712, + "loss": 2.5515, + "step": 3761 + }, + { + "epoch": 1.6720000000000002, + "grad_norm": 1.7330615520477295, + "learning_rate": 0.00013323843416370107, + "loss": 2.4671, + "step": 3762 + }, + { + "epoch": 1.6724444444444444, + "grad_norm": 1.9315346479415894, + "learning_rate": 0.00013322064056939503, + "loss": 1.9458, + "step": 3763 + }, + { + "epoch": 1.6728888888888889, + "grad_norm": 2.0418102741241455, + "learning_rate": 0.00013320284697508898, + "loss": 2.3197, + "step": 3764 + }, + { + "epoch": 1.6733333333333333, + "grad_norm": 1.8241764307022095, + "learning_rate": 0.00013318505338078294, + "loss": 2.5491, + "step": 3765 + }, + { + "epoch": 1.6737777777777778, + "grad_norm": 1.6973588466644287, + "learning_rate": 0.00013316725978647687, + "loss": 2.1914, + "step": 3766 + }, + { + "epoch": 1.6742222222222223, + "grad_norm": 2.0757534503936768, + "learning_rate": 0.00013314946619217083, + "loss": 2.9653, + "step": 3767 + }, + { + "epoch": 1.6746666666666665, + "grad_norm": 1.850299596786499, + "learning_rate": 0.00013313167259786476, + "loss": 
2.241, + "step": 3768 + }, + { + "epoch": 1.6751111111111112, + "grad_norm": 1.8038710355758667, + "learning_rate": 0.0001331138790035587, + "loss": 2.2839, + "step": 3769 + }, + { + "epoch": 1.6755555555555555, + "grad_norm": 1.7989375591278076, + "learning_rate": 0.00013309608540925267, + "loss": 2.1376, + "step": 3770 + }, + { + "epoch": 1.6760000000000002, + "grad_norm": 1.8544578552246094, + "learning_rate": 0.00013307829181494663, + "loss": 2.1846, + "step": 3771 + }, + { + "epoch": 1.6764444444444444, + "grad_norm": 1.7627222537994385, + "learning_rate": 0.00013306049822064058, + "loss": 1.9189, + "step": 3772 + }, + { + "epoch": 1.6768888888888889, + "grad_norm": 1.8061233758926392, + "learning_rate": 0.0001330427046263345, + "loss": 1.9897, + "step": 3773 + }, + { + "epoch": 1.6773333333333333, + "grad_norm": 1.7060072422027588, + "learning_rate": 0.00013302491103202847, + "loss": 2.0127, + "step": 3774 + }, + { + "epoch": 1.6777777777777778, + "grad_norm": 2.054374933242798, + "learning_rate": 0.00013300711743772242, + "loss": 2.2547, + "step": 3775 + }, + { + "epoch": 1.6782222222222223, + "grad_norm": 1.944670557975769, + "learning_rate": 0.00013298932384341638, + "loss": 2.1802, + "step": 3776 + }, + { + "epoch": 1.6786666666666665, + "grad_norm": 2.326748847961426, + "learning_rate": 0.00013297153024911034, + "loss": 2.3484, + "step": 3777 + }, + { + "epoch": 1.6791111111111112, + "grad_norm": 1.707459807395935, + "learning_rate": 0.0001329537366548043, + "loss": 1.8628, + "step": 3778 + }, + { + "epoch": 1.6795555555555555, + "grad_norm": 2.1038479804992676, + "learning_rate": 0.00013293594306049822, + "loss": 1.6272, + "step": 3779 + }, + { + "epoch": 1.6800000000000002, + "grad_norm": 2.0666604042053223, + "learning_rate": 0.00013291814946619218, + "loss": 2.3157, + "step": 3780 + }, + { + "epoch": 1.6804444444444444, + "grad_norm": 1.8454957008361816, + "learning_rate": 0.0001329003558718861, + "loss": 2.0967, + "step": 3781 + }, + { + "epoch": 1.6808888888888889, + "grad_norm": 1.9296761751174927, + "learning_rate": 0.00013288256227758007, + "loss": 2.1812, + "step": 3782 + }, + { + "epoch": 1.6813333333333333, + "grad_norm": 1.9812657833099365, + "learning_rate": 0.00013286476868327402, + "loss": 2.0514, + "step": 3783 + }, + { + "epoch": 1.6817777777777778, + "grad_norm": 1.7936532497406006, + "learning_rate": 0.00013284697508896798, + "loss": 1.9351, + "step": 3784 + }, + { + "epoch": 1.6822222222222223, + "grad_norm": 2.2539501190185547, + "learning_rate": 0.00013282918149466194, + "loss": 2.297, + "step": 3785 + }, + { + "epoch": 1.6826666666666665, + "grad_norm": 2.0437049865722656, + "learning_rate": 0.00013281138790035586, + "loss": 1.9794, + "step": 3786 + }, + { + "epoch": 1.6831111111111112, + "grad_norm": 2.1592202186584473, + "learning_rate": 0.00013279359430604982, + "loss": 2.2224, + "step": 3787 + }, + { + "epoch": 1.6835555555555555, + "grad_norm": 2.0359416007995605, + "learning_rate": 0.00013277580071174378, + "loss": 2.0624, + "step": 3788 + }, + { + "epoch": 1.6840000000000002, + "grad_norm": 2.133168935775757, + "learning_rate": 0.00013275800711743773, + "loss": 2.0215, + "step": 3789 + }, + { + "epoch": 1.6844444444444444, + "grad_norm": 2.4152679443359375, + "learning_rate": 0.0001327402135231317, + "loss": 2.2511, + "step": 3790 + }, + { + "epoch": 1.6848888888888889, + "grad_norm": 2.4660964012145996, + "learning_rate": 0.00013272241992882565, + "loss": 2.6581, + "step": 3791 + }, + { + "epoch": 1.6853333333333333, + "grad_norm": 
2.309948205947876, + "learning_rate": 0.00013270462633451958, + "loss": 2.3009, + "step": 3792 + }, + { + "epoch": 1.6857777777777778, + "grad_norm": 2.3204185962677, + "learning_rate": 0.00013268683274021353, + "loss": 2.0474, + "step": 3793 + }, + { + "epoch": 1.6862222222222223, + "grad_norm": 2.021592855453491, + "learning_rate": 0.00013266903914590746, + "loss": 1.6188, + "step": 3794 + }, + { + "epoch": 1.6866666666666665, + "grad_norm": 2.1225037574768066, + "learning_rate": 0.00013265124555160142, + "loss": 1.7655, + "step": 3795 + }, + { + "epoch": 1.6871111111111112, + "grad_norm": 2.4528112411499023, + "learning_rate": 0.00013263345195729538, + "loss": 2.146, + "step": 3796 + }, + { + "epoch": 1.6875555555555555, + "grad_norm": 2.4885880947113037, + "learning_rate": 0.00013261565836298933, + "loss": 2.6485, + "step": 3797 + }, + { + "epoch": 1.688, + "grad_norm": 2.7207860946655273, + "learning_rate": 0.0001325978647686833, + "loss": 2.7009, + "step": 3798 + }, + { + "epoch": 1.6884444444444444, + "grad_norm": 3.098876476287842, + "learning_rate": 0.00013258007117437722, + "loss": 1.9231, + "step": 3799 + }, + { + "epoch": 1.6888888888888889, + "grad_norm": 3.4644060134887695, + "learning_rate": 0.00013256227758007117, + "loss": 2.3836, + "step": 3800 + }, + { + "epoch": 1.6893333333333334, + "grad_norm": 1.1060537099838257, + "learning_rate": 0.00013254448398576513, + "loss": 1.2927, + "step": 3801 + }, + { + "epoch": 1.6897777777777778, + "grad_norm": 1.0716097354888916, + "learning_rate": 0.0001325266903914591, + "loss": 1.1274, + "step": 3802 + }, + { + "epoch": 1.6902222222222223, + "grad_norm": 1.659468412399292, + "learning_rate": 0.00013250889679715304, + "loss": 2.5655, + "step": 3803 + }, + { + "epoch": 1.6906666666666665, + "grad_norm": 1.416318416595459, + "learning_rate": 0.000132491103202847, + "loss": 2.0306, + "step": 3804 + }, + { + "epoch": 1.6911111111111112, + "grad_norm": 2.2710070610046387, + "learning_rate": 0.00013247330960854093, + "loss": 2.178, + "step": 3805 + }, + { + "epoch": 1.6915555555555555, + "grad_norm": 1.5939732789993286, + "learning_rate": 0.00013245551601423489, + "loss": 2.3397, + "step": 3806 + }, + { + "epoch": 1.692, + "grad_norm": 1.590168833732605, + "learning_rate": 0.00013243772241992882, + "loss": 2.2752, + "step": 3807 + }, + { + "epoch": 1.6924444444444444, + "grad_norm": 1.1703904867172241, + "learning_rate": 0.00013241992882562277, + "loss": 1.0365, + "step": 3808 + }, + { + "epoch": 1.6928888888888889, + "grad_norm": 1.6481446027755737, + "learning_rate": 0.00013240213523131673, + "loss": 2.2892, + "step": 3809 + }, + { + "epoch": 1.6933333333333334, + "grad_norm": 1.8637354373931885, + "learning_rate": 0.00013238434163701069, + "loss": 2.4122, + "step": 3810 + }, + { + "epoch": 1.6937777777777778, + "grad_norm": 1.6943954229354858, + "learning_rate": 0.00013236654804270464, + "loss": 2.4287, + "step": 3811 + }, + { + "epoch": 1.6942222222222223, + "grad_norm": 1.9139350652694702, + "learning_rate": 0.00013234875444839857, + "loss": 2.038, + "step": 3812 + }, + { + "epoch": 1.6946666666666665, + "grad_norm": 1.6153117418289185, + "learning_rate": 0.00013233096085409253, + "loss": 1.8512, + "step": 3813 + }, + { + "epoch": 1.6951111111111112, + "grad_norm": 1.473202109336853, + "learning_rate": 0.00013231316725978648, + "loss": 2.0833, + "step": 3814 + }, + { + "epoch": 1.6955555555555555, + "grad_norm": 1.6183151006698608, + "learning_rate": 0.00013229537366548044, + "loss": 1.797, + "step": 3815 + }, + { + "epoch": 1.696, + 
"grad_norm": 1.648177981376648, + "learning_rate": 0.0001322775800711744, + "loss": 1.9791, + "step": 3816 + }, + { + "epoch": 1.6964444444444444, + "grad_norm": 1.758804202079773, + "learning_rate": 0.00013225978647686835, + "loss": 2.5341, + "step": 3817 + }, + { + "epoch": 1.696888888888889, + "grad_norm": 1.5626602172851562, + "learning_rate": 0.00013224199288256228, + "loss": 1.5439, + "step": 3818 + }, + { + "epoch": 1.6973333333333334, + "grad_norm": 1.9132189750671387, + "learning_rate": 0.0001322241992882562, + "loss": 2.4955, + "step": 3819 + }, + { + "epoch": 1.6977777777777778, + "grad_norm": 1.975059986114502, + "learning_rate": 0.00013220640569395017, + "loss": 2.4941, + "step": 3820 + }, + { + "epoch": 1.6982222222222223, + "grad_norm": 1.8108739852905273, + "learning_rate": 0.00013218861209964413, + "loss": 1.7566, + "step": 3821 + }, + { + "epoch": 1.6986666666666665, + "grad_norm": 1.5749622583389282, + "learning_rate": 0.00013217081850533808, + "loss": 2.0191, + "step": 3822 + }, + { + "epoch": 1.6991111111111112, + "grad_norm": 2.2732627391815186, + "learning_rate": 0.00013215302491103204, + "loss": 1.8025, + "step": 3823 + }, + { + "epoch": 1.6995555555555555, + "grad_norm": 1.6251347064971924, + "learning_rate": 0.000132135231316726, + "loss": 1.8303, + "step": 3824 + }, + { + "epoch": 1.7, + "grad_norm": 1.9713494777679443, + "learning_rate": 0.00013211743772241992, + "loss": 2.1631, + "step": 3825 + }, + { + "epoch": 1.7004444444444444, + "grad_norm": 1.9133716821670532, + "learning_rate": 0.00013209964412811388, + "loss": 2.3184, + "step": 3826 + }, + { + "epoch": 1.700888888888889, + "grad_norm": 2.2946348190307617, + "learning_rate": 0.00013208185053380784, + "loss": 2.5479, + "step": 3827 + }, + { + "epoch": 1.7013333333333334, + "grad_norm": 1.764487862586975, + "learning_rate": 0.0001320640569395018, + "loss": 1.7169, + "step": 3828 + }, + { + "epoch": 1.7017777777777776, + "grad_norm": 1.9306504726409912, + "learning_rate": 0.00013204626334519575, + "loss": 1.8107, + "step": 3829 + }, + { + "epoch": 1.7022222222222223, + "grad_norm": 1.9167112112045288, + "learning_rate": 0.0001320284697508897, + "loss": 1.7252, + "step": 3830 + }, + { + "epoch": 1.7026666666666666, + "grad_norm": 2.143211603164673, + "learning_rate": 0.00013201067615658364, + "loss": 2.3653, + "step": 3831 + }, + { + "epoch": 1.7031111111111112, + "grad_norm": 2.1837661266326904, + "learning_rate": 0.00013199288256227757, + "loss": 2.3616, + "step": 3832 + }, + { + "epoch": 1.7035555555555555, + "grad_norm": 2.0470423698425293, + "learning_rate": 0.00013197508896797152, + "loss": 1.9548, + "step": 3833 + }, + { + "epoch": 1.704, + "grad_norm": 2.4029338359832764, + "learning_rate": 0.00013195729537366548, + "loss": 2.0182, + "step": 3834 + }, + { + "epoch": 1.7044444444444444, + "grad_norm": 2.1079888343811035, + "learning_rate": 0.00013193950177935944, + "loss": 2.0754, + "step": 3835 + }, + { + "epoch": 1.704888888888889, + "grad_norm": 2.2155072689056396, + "learning_rate": 0.0001319217081850534, + "loss": 2.2782, + "step": 3836 + }, + { + "epoch": 1.7053333333333334, + "grad_norm": 2.1504364013671875, + "learning_rate": 0.00013190391459074735, + "loss": 2.1015, + "step": 3837 + }, + { + "epoch": 1.7057777777777776, + "grad_norm": 2.214022159576416, + "learning_rate": 0.00013188612099644128, + "loss": 2.2518, + "step": 3838 + }, + { + "epoch": 1.7062222222222223, + "grad_norm": 2.3961539268493652, + "learning_rate": 0.00013186832740213523, + "loss": 1.7802, + "step": 3839 + }, + { + 
"epoch": 1.7066666666666666, + "grad_norm": 2.034930467605591, + "learning_rate": 0.0001318505338078292, + "loss": 2.2164, + "step": 3840 + }, + { + "epoch": 1.7071111111111112, + "grad_norm": 2.0269289016723633, + "learning_rate": 0.00013183274021352315, + "loss": 1.7755, + "step": 3841 + }, + { + "epoch": 1.7075555555555555, + "grad_norm": 2.0814435482025146, + "learning_rate": 0.0001318149466192171, + "loss": 1.9628, + "step": 3842 + }, + { + "epoch": 1.708, + "grad_norm": 2.4242076873779297, + "learning_rate": 0.00013179715302491106, + "loss": 2.4269, + "step": 3843 + }, + { + "epoch": 1.7084444444444444, + "grad_norm": 2.6244444847106934, + "learning_rate": 0.000131779359430605, + "loss": 2.279, + "step": 3844 + }, + { + "epoch": 1.708888888888889, + "grad_norm": 2.184532403945923, + "learning_rate": 0.00013176156583629892, + "loss": 2.0163, + "step": 3845 + }, + { + "epoch": 1.7093333333333334, + "grad_norm": 2.2187845706939697, + "learning_rate": 0.00013174377224199288, + "loss": 2.226, + "step": 3846 + }, + { + "epoch": 1.7097777777777776, + "grad_norm": 2.7310564517974854, + "learning_rate": 0.00013172597864768683, + "loss": 2.1964, + "step": 3847 + }, + { + "epoch": 1.7102222222222223, + "grad_norm": 2.8237147331237793, + "learning_rate": 0.0001317081850533808, + "loss": 2.5273, + "step": 3848 + }, + { + "epoch": 1.7106666666666666, + "grad_norm": 1.9283852577209473, + "learning_rate": 0.00013169039145907474, + "loss": 1.3825, + "step": 3849 + }, + { + "epoch": 1.7111111111111112, + "grad_norm": 2.044787883758545, + "learning_rate": 0.00013167259786476867, + "loss": 0.2311, + "step": 3850 + }, + { + "epoch": 1.7115555555555555, + "grad_norm": 1.3595030307769775, + "learning_rate": 0.00013165480427046263, + "loss": 1.8332, + "step": 3851 + }, + { + "epoch": 1.712, + "grad_norm": 1.7082335948944092, + "learning_rate": 0.0001316370106761566, + "loss": 1.3219, + "step": 3852 + }, + { + "epoch": 1.7124444444444444, + "grad_norm": 1.4628937244415283, + "learning_rate": 0.00013161921708185054, + "loss": 2.1888, + "step": 3853 + }, + { + "epoch": 1.712888888888889, + "grad_norm": 1.4949939250946045, + "learning_rate": 0.0001316014234875445, + "loss": 2.2021, + "step": 3854 + }, + { + "epoch": 1.7133333333333334, + "grad_norm": 1.4384099245071411, + "learning_rate": 0.00013158362989323846, + "loss": 1.9055, + "step": 3855 + }, + { + "epoch": 1.7137777777777776, + "grad_norm": 1.6663846969604492, + "learning_rate": 0.00013156583629893239, + "loss": 2.4469, + "step": 3856 + }, + { + "epoch": 1.7142222222222223, + "grad_norm": 1.5963435173034668, + "learning_rate": 0.00013154804270462634, + "loss": 2.1661, + "step": 3857 + }, + { + "epoch": 1.7146666666666666, + "grad_norm": 1.9488998651504517, + "learning_rate": 0.00013153024911032027, + "loss": 2.2052, + "step": 3858 + }, + { + "epoch": 1.7151111111111113, + "grad_norm": 1.6803126335144043, + "learning_rate": 0.00013151245551601423, + "loss": 2.437, + "step": 3859 + }, + { + "epoch": 1.7155555555555555, + "grad_norm": 1.6857129335403442, + "learning_rate": 0.00013149466192170818, + "loss": 1.8197, + "step": 3860 + }, + { + "epoch": 1.716, + "grad_norm": 1.7986105680465698, + "learning_rate": 0.00013147686832740214, + "loss": 2.2436, + "step": 3861 + }, + { + "epoch": 1.7164444444444444, + "grad_norm": 1.6643034219741821, + "learning_rate": 0.0001314590747330961, + "loss": 1.9625, + "step": 3862 + }, + { + "epoch": 1.716888888888889, + "grad_norm": 1.7694244384765625, + "learning_rate": 0.00013144128113879003, + "loss": 2.2189, + "step": 
3863 + }, + { + "epoch": 1.7173333333333334, + "grad_norm": 1.951474905014038, + "learning_rate": 0.00013142348754448398, + "loss": 2.4388, + "step": 3864 + }, + { + "epoch": 1.7177777777777776, + "grad_norm": 1.966758370399475, + "learning_rate": 0.00013140569395017794, + "loss": 2.5255, + "step": 3865 + }, + { + "epoch": 1.7182222222222223, + "grad_norm": 1.5251381397247314, + "learning_rate": 0.0001313879003558719, + "loss": 1.4708, + "step": 3866 + }, + { + "epoch": 1.7186666666666666, + "grad_norm": 1.851199984550476, + "learning_rate": 0.00013137010676156585, + "loss": 2.4586, + "step": 3867 + }, + { + "epoch": 1.7191111111111113, + "grad_norm": 1.7658207416534424, + "learning_rate": 0.0001313523131672598, + "loss": 2.5013, + "step": 3868 + }, + { + "epoch": 1.7195555555555555, + "grad_norm": 2.4436843395233154, + "learning_rate": 0.00013133451957295374, + "loss": 2.5335, + "step": 3869 + }, + { + "epoch": 1.72, + "grad_norm": 1.8331066370010376, + "learning_rate": 0.0001313167259786477, + "loss": 2.3018, + "step": 3870 + }, + { + "epoch": 1.7204444444444444, + "grad_norm": 1.725740671157837, + "learning_rate": 0.00013129893238434163, + "loss": 2.4409, + "step": 3871 + }, + { + "epoch": 1.720888888888889, + "grad_norm": 2.027172088623047, + "learning_rate": 0.00013128113879003558, + "loss": 2.1246, + "step": 3872 + }, + { + "epoch": 1.7213333333333334, + "grad_norm": 1.6568158864974976, + "learning_rate": 0.00013126334519572954, + "loss": 1.8242, + "step": 3873 + }, + { + "epoch": 1.7217777777777776, + "grad_norm": 1.8668732643127441, + "learning_rate": 0.0001312455516014235, + "loss": 2.0447, + "step": 3874 + }, + { + "epoch": 1.7222222222222223, + "grad_norm": 1.9975069761276245, + "learning_rate": 0.00013122775800711745, + "loss": 2.1747, + "step": 3875 + }, + { + "epoch": 1.7226666666666666, + "grad_norm": 1.9104905128479004, + "learning_rate": 0.00013120996441281138, + "loss": 2.248, + "step": 3876 + }, + { + "epoch": 1.7231111111111113, + "grad_norm": 2.3680083751678467, + "learning_rate": 0.00013119217081850534, + "loss": 2.4361, + "step": 3877 + }, + { + "epoch": 1.7235555555555555, + "grad_norm": 2.0153682231903076, + "learning_rate": 0.0001311743772241993, + "loss": 1.9933, + "step": 3878 + }, + { + "epoch": 1.724, + "grad_norm": 2.112910032272339, + "learning_rate": 0.00013115658362989325, + "loss": 2.4397, + "step": 3879 + }, + { + "epoch": 1.7244444444444444, + "grad_norm": 1.9228010177612305, + "learning_rate": 0.0001311387900355872, + "loss": 2.7384, + "step": 3880 + }, + { + "epoch": 1.724888888888889, + "grad_norm": 2.054799795150757, + "learning_rate": 0.00013112099644128116, + "loss": 2.09, + "step": 3881 + }, + { + "epoch": 1.7253333333333334, + "grad_norm": 2.12286376953125, + "learning_rate": 0.0001311032028469751, + "loss": 2.0431, + "step": 3882 + }, + { + "epoch": 1.7257777777777776, + "grad_norm": 2.27078914642334, + "learning_rate": 0.00013108540925266905, + "loss": 2.5563, + "step": 3883 + }, + { + "epoch": 1.7262222222222223, + "grad_norm": 1.996488332748413, + "learning_rate": 0.00013106761565836298, + "loss": 2.4713, + "step": 3884 + }, + { + "epoch": 1.7266666666666666, + "grad_norm": 2.3658390045166016, + "learning_rate": 0.00013104982206405693, + "loss": 2.1061, + "step": 3885 + }, + { + "epoch": 1.7271111111111113, + "grad_norm": 1.9138381481170654, + "learning_rate": 0.0001310320284697509, + "loss": 2.6341, + "step": 3886 + }, + { + "epoch": 1.7275555555555555, + "grad_norm": 1.9364850521087646, + "learning_rate": 0.00013101423487544485, + "loss": 
2.4424, + "step": 3887 + }, + { + "epoch": 1.728, + "grad_norm": 2.730329751968384, + "learning_rate": 0.0001309964412811388, + "loss": 2.0531, + "step": 3888 + }, + { + "epoch": 1.7284444444444444, + "grad_norm": 2.1814632415771484, + "learning_rate": 0.00013097864768683273, + "loss": 2.3215, + "step": 3889 + }, + { + "epoch": 1.728888888888889, + "grad_norm": 2.137681484222412, + "learning_rate": 0.0001309608540925267, + "loss": 2.4898, + "step": 3890 + }, + { + "epoch": 1.7293333333333334, + "grad_norm": 2.490191698074341, + "learning_rate": 0.00013094306049822065, + "loss": 2.15, + "step": 3891 + }, + { + "epoch": 1.7297777777777776, + "grad_norm": 2.2890820503234863, + "learning_rate": 0.0001309252669039146, + "loss": 1.6876, + "step": 3892 + }, + { + "epoch": 1.7302222222222223, + "grad_norm": 1.9590189456939697, + "learning_rate": 0.00013090747330960856, + "loss": 1.9625, + "step": 3893 + }, + { + "epoch": 1.7306666666666666, + "grad_norm": 2.1057655811309814, + "learning_rate": 0.00013088967971530252, + "loss": 2.198, + "step": 3894 + }, + { + "epoch": 1.7311111111111113, + "grad_norm": 2.1267521381378174, + "learning_rate": 0.00013087188612099645, + "loss": 2.2824, + "step": 3895 + }, + { + "epoch": 1.7315555555555555, + "grad_norm": 1.9178341627120972, + "learning_rate": 0.0001308540925266904, + "loss": 1.9127, + "step": 3896 + }, + { + "epoch": 1.732, + "grad_norm": 2.151017189025879, + "learning_rate": 0.00013083629893238433, + "loss": 2.0591, + "step": 3897 + }, + { + "epoch": 1.7324444444444445, + "grad_norm": 2.1666769981384277, + "learning_rate": 0.0001308185053380783, + "loss": 2.1367, + "step": 3898 + }, + { + "epoch": 1.732888888888889, + "grad_norm": 2.2003917694091797, + "learning_rate": 0.00013080071174377224, + "loss": 0.9107, + "step": 3899 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 2.9451475143432617, + "learning_rate": 0.0001307829181494662, + "loss": 2.487, + "step": 3900 + }, + { + "epoch": 1.7337777777777776, + "grad_norm": 1.6602669954299927, + "learning_rate": 0.00013076512455516016, + "loss": 2.9994, + "step": 3901 + }, + { + "epoch": 1.7342222222222223, + "grad_norm": 1.3574813604354858, + "learning_rate": 0.0001307473309608541, + "loss": 2.2298, + "step": 3902 + }, + { + "epoch": 1.7346666666666666, + "grad_norm": 1.5164433717727661, + "learning_rate": 0.00013072953736654804, + "loss": 2.0965, + "step": 3903 + }, + { + "epoch": 1.7351111111111113, + "grad_norm": 1.5202592611312866, + "learning_rate": 0.000130711743772242, + "loss": 2.3827, + "step": 3904 + }, + { + "epoch": 1.7355555555555555, + "grad_norm": 1.3015217781066895, + "learning_rate": 0.00013069395017793596, + "loss": 1.6272, + "step": 3905 + }, + { + "epoch": 1.736, + "grad_norm": 1.5526891946792603, + "learning_rate": 0.0001306761565836299, + "loss": 2.4231, + "step": 3906 + }, + { + "epoch": 1.7364444444444445, + "grad_norm": 1.6718441247940063, + "learning_rate": 0.00013065836298932387, + "loss": 2.8115, + "step": 3907 + }, + { + "epoch": 1.736888888888889, + "grad_norm": 1.7403420209884644, + "learning_rate": 0.0001306405693950178, + "loss": 2.7689, + "step": 3908 + }, + { + "epoch": 1.7373333333333334, + "grad_norm": 1.7474772930145264, + "learning_rate": 0.00013062277580071176, + "loss": 2.6111, + "step": 3909 + }, + { + "epoch": 1.7377777777777776, + "grad_norm": 1.6219923496246338, + "learning_rate": 0.00013060498220640568, + "loss": 2.5719, + "step": 3910 + }, + { + "epoch": 1.7382222222222223, + "grad_norm": 1.893513798713684, + "learning_rate": 0.00013058718861209964, + 
"loss": 2.7495, + "step": 3911 + }, + { + "epoch": 1.7386666666666666, + "grad_norm": 1.917389154434204, + "learning_rate": 0.0001305693950177936, + "loss": 2.4973, + "step": 3912 + }, + { + "epoch": 1.7391111111111113, + "grad_norm": 1.699096918106079, + "learning_rate": 0.00013055160142348755, + "loss": 1.7678, + "step": 3913 + }, + { + "epoch": 1.7395555555555555, + "grad_norm": 1.8528611660003662, + "learning_rate": 0.0001305338078291815, + "loss": 1.8641, + "step": 3914 + }, + { + "epoch": 1.74, + "grad_norm": 1.741129994392395, + "learning_rate": 0.00013051601423487544, + "loss": 2.0575, + "step": 3915 + }, + { + "epoch": 1.7404444444444445, + "grad_norm": 1.7191723585128784, + "learning_rate": 0.0001304982206405694, + "loss": 2.1448, + "step": 3916 + }, + { + "epoch": 1.740888888888889, + "grad_norm": 1.8906670808792114, + "learning_rate": 0.00013048042704626335, + "loss": 2.3253, + "step": 3917 + }, + { + "epoch": 1.7413333333333334, + "grad_norm": 1.799043893814087, + "learning_rate": 0.0001304626334519573, + "loss": 2.2234, + "step": 3918 + }, + { + "epoch": 1.7417777777777776, + "grad_norm": 1.6781656742095947, + "learning_rate": 0.00013044483985765127, + "loss": 1.9896, + "step": 3919 + }, + { + "epoch": 1.7422222222222223, + "grad_norm": 1.7165361642837524, + "learning_rate": 0.00013042704626334522, + "loss": 2.0458, + "step": 3920 + }, + { + "epoch": 1.7426666666666666, + "grad_norm": 1.9061387777328491, + "learning_rate": 0.00013040925266903915, + "loss": 2.4378, + "step": 3921 + }, + { + "epoch": 1.743111111111111, + "grad_norm": 2.1114823818206787, + "learning_rate": 0.0001303914590747331, + "loss": 2.4924, + "step": 3922 + }, + { + "epoch": 1.7435555555555555, + "grad_norm": 1.9921493530273438, + "learning_rate": 0.00013037366548042704, + "loss": 2.5657, + "step": 3923 + }, + { + "epoch": 1.744, + "grad_norm": 1.8910974264144897, + "learning_rate": 0.000130355871886121, + "loss": 2.0381, + "step": 3924 + }, + { + "epoch": 1.7444444444444445, + "grad_norm": 2.049863815307617, + "learning_rate": 0.00013033807829181495, + "loss": 1.9867, + "step": 3925 + }, + { + "epoch": 1.744888888888889, + "grad_norm": 1.9373703002929688, + "learning_rate": 0.0001303202846975089, + "loss": 2.3285, + "step": 3926 + }, + { + "epoch": 1.7453333333333334, + "grad_norm": 1.5811951160430908, + "learning_rate": 0.00013030249110320286, + "loss": 1.1489, + "step": 3927 + }, + { + "epoch": 1.7457777777777777, + "grad_norm": 2.237574577331543, + "learning_rate": 0.0001302846975088968, + "loss": 2.8071, + "step": 3928 + }, + { + "epoch": 1.7462222222222223, + "grad_norm": 1.7456693649291992, + "learning_rate": 0.00013026690391459075, + "loss": 1.5628, + "step": 3929 + }, + { + "epoch": 1.7466666666666666, + "grad_norm": 1.9937740564346313, + "learning_rate": 0.0001302491103202847, + "loss": 2.2982, + "step": 3930 + }, + { + "epoch": 1.747111111111111, + "grad_norm": 1.7053123712539673, + "learning_rate": 0.00013023131672597866, + "loss": 1.0467, + "step": 3931 + }, + { + "epoch": 1.7475555555555555, + "grad_norm": 2.3195912837982178, + "learning_rate": 0.00013021352313167262, + "loss": 2.5614, + "step": 3932 + }, + { + "epoch": 1.748, + "grad_norm": 2.1397838592529297, + "learning_rate": 0.00013019572953736655, + "loss": 1.9569, + "step": 3933 + }, + { + "epoch": 1.7484444444444445, + "grad_norm": 1.823757290840149, + "learning_rate": 0.0001301779359430605, + "loss": 1.8498, + "step": 3934 + }, + { + "epoch": 1.748888888888889, + "grad_norm": 1.9765830039978027, + "learning_rate": 
0.00013016014234875443, + "loss": 2.2709, + "step": 3935 + }, + { + "epoch": 1.7493333333333334, + "grad_norm": 2.0341432094573975, + "learning_rate": 0.0001301423487544484, + "loss": 1.8853, + "step": 3936 + }, + { + "epoch": 1.7497777777777777, + "grad_norm": 1.8640036582946777, + "learning_rate": 0.00013012455516014235, + "loss": 1.9018, + "step": 3937 + }, + { + "epoch": 1.7502222222222223, + "grad_norm": 1.3533066511154175, + "learning_rate": 0.0001301067615658363, + "loss": 0.9755, + "step": 3938 + }, + { + "epoch": 1.7506666666666666, + "grad_norm": 2.0747861862182617, + "learning_rate": 0.00013008896797153026, + "loss": 2.1518, + "step": 3939 + }, + { + "epoch": 1.751111111111111, + "grad_norm": 2.1353771686553955, + "learning_rate": 0.0001300711743772242, + "loss": 2.3496, + "step": 3940 + }, + { + "epoch": 1.7515555555555555, + "grad_norm": 2.6912460327148438, + "learning_rate": 0.00013005338078291815, + "loss": 2.2062, + "step": 3941 + }, + { + "epoch": 1.752, + "grad_norm": 1.905840277671814, + "learning_rate": 0.0001300355871886121, + "loss": 2.0854, + "step": 3942 + }, + { + "epoch": 1.7524444444444445, + "grad_norm": 2.2967617511749268, + "learning_rate": 0.00013001779359430606, + "loss": 1.8793, + "step": 3943 + }, + { + "epoch": 1.752888888888889, + "grad_norm": 2.267254590988159, + "learning_rate": 0.00013000000000000002, + "loss": 1.8769, + "step": 3944 + }, + { + "epoch": 1.7533333333333334, + "grad_norm": 2.3356738090515137, + "learning_rate": 0.00012998220640569397, + "loss": 1.7641, + "step": 3945 + }, + { + "epoch": 1.7537777777777777, + "grad_norm": 2.722877264022827, + "learning_rate": 0.0001299644128113879, + "loss": 2.2982, + "step": 3946 + }, + { + "epoch": 1.7542222222222223, + "grad_norm": 2.4450271129608154, + "learning_rate": 0.00012994661921708186, + "loss": 2.1468, + "step": 3947 + }, + { + "epoch": 1.7546666666666666, + "grad_norm": 2.8750321865081787, + "learning_rate": 0.0001299288256227758, + "loss": 2.4269, + "step": 3948 + }, + { + "epoch": 1.755111111111111, + "grad_norm": 2.6799416542053223, + "learning_rate": 0.00012991103202846974, + "loss": 1.6945, + "step": 3949 + }, + { + "epoch": 1.7555555555555555, + "grad_norm": 3.447089433670044, + "learning_rate": 0.0001298932384341637, + "loss": 2.7562, + "step": 3950 + }, + { + "epoch": 1.756, + "grad_norm": 1.3063669204711914, + "learning_rate": 0.00012987544483985766, + "loss": 2.1046, + "step": 3951 + }, + { + "epoch": 1.7564444444444445, + "grad_norm": 1.4739446640014648, + "learning_rate": 0.0001298576512455516, + "loss": 2.3863, + "step": 3952 + }, + { + "epoch": 1.756888888888889, + "grad_norm": 1.543544888496399, + "learning_rate": 0.00012983985765124554, + "loss": 2.3732, + "step": 3953 + }, + { + "epoch": 1.7573333333333334, + "grad_norm": 1.6399084329605103, + "learning_rate": 0.0001298220640569395, + "loss": 2.5028, + "step": 3954 + }, + { + "epoch": 1.7577777777777777, + "grad_norm": 1.5914816856384277, + "learning_rate": 0.00012980427046263346, + "loss": 2.2064, + "step": 3955 + }, + { + "epoch": 1.7582222222222224, + "grad_norm": 1.6115366220474243, + "learning_rate": 0.0001297864768683274, + "loss": 2.2451, + "step": 3956 + }, + { + "epoch": 1.7586666666666666, + "grad_norm": 1.5162943601608276, + "learning_rate": 0.00012976868327402137, + "loss": 2.1785, + "step": 3957 + }, + { + "epoch": 1.759111111111111, + "grad_norm": 1.6043645143508911, + "learning_rate": 0.00012975088967971533, + "loss": 2.2091, + "step": 3958 + }, + { + "epoch": 1.7595555555555555, + "grad_norm": 
1.4509108066558838, + "learning_rate": 0.00012973309608540925, + "loss": 2.3536, + "step": 3959 + }, + { + "epoch": 1.76, + "grad_norm": 1.6944358348846436, + "learning_rate": 0.0001297153024911032, + "loss": 2.5589, + "step": 3960 + }, + { + "epoch": 1.7604444444444445, + "grad_norm": 1.5533844232559204, + "learning_rate": 0.00012969750889679714, + "loss": 2.4115, + "step": 3961 + }, + { + "epoch": 1.7608888888888887, + "grad_norm": 1.4054555892944336, + "learning_rate": 0.0001296797153024911, + "loss": 2.1334, + "step": 3962 + }, + { + "epoch": 1.7613333333333334, + "grad_norm": 1.6660244464874268, + "learning_rate": 0.00012966192170818505, + "loss": 2.3195, + "step": 3963 + }, + { + "epoch": 1.7617777777777777, + "grad_norm": 1.5399365425109863, + "learning_rate": 0.000129644128113879, + "loss": 2.0814, + "step": 3964 + }, + { + "epoch": 1.7622222222222224, + "grad_norm": 1.5576121807098389, + "learning_rate": 0.00012962633451957297, + "loss": 1.8001, + "step": 3965 + }, + { + "epoch": 1.7626666666666666, + "grad_norm": 1.8270119428634644, + "learning_rate": 0.0001296085409252669, + "loss": 1.9568, + "step": 3966 + }, + { + "epoch": 1.763111111111111, + "grad_norm": 1.5569310188293457, + "learning_rate": 0.00012959074733096085, + "loss": 2.1702, + "step": 3967 + }, + { + "epoch": 1.7635555555555555, + "grad_norm": 1.7133228778839111, + "learning_rate": 0.0001295729537366548, + "loss": 2.4522, + "step": 3968 + }, + { + "epoch": 1.764, + "grad_norm": 1.6561294794082642, + "learning_rate": 0.00012955516014234877, + "loss": 2.0854, + "step": 3969 + }, + { + "epoch": 1.7644444444444445, + "grad_norm": 2.1431984901428223, + "learning_rate": 0.00012953736654804272, + "loss": 1.9774, + "step": 3970 + }, + { + "epoch": 1.7648888888888887, + "grad_norm": 1.6542658805847168, + "learning_rate": 0.00012951957295373668, + "loss": 1.9308, + "step": 3971 + }, + { + "epoch": 1.7653333333333334, + "grad_norm": 1.7912119626998901, + "learning_rate": 0.0001295017793594306, + "loss": 1.9613, + "step": 3972 + }, + { + "epoch": 1.7657777777777777, + "grad_norm": 1.2500418424606323, + "learning_rate": 0.00012948398576512456, + "loss": 0.6403, + "step": 3973 + }, + { + "epoch": 1.7662222222222224, + "grad_norm": 1.623761534690857, + "learning_rate": 0.0001294661921708185, + "loss": 2.2582, + "step": 3974 + }, + { + "epoch": 1.7666666666666666, + "grad_norm": 1.6567007303237915, + "learning_rate": 0.00012944839857651245, + "loss": 1.8474, + "step": 3975 + }, + { + "epoch": 1.767111111111111, + "grad_norm": 1.7294385433197021, + "learning_rate": 0.0001294306049822064, + "loss": 1.9536, + "step": 3976 + }, + { + "epoch": 1.7675555555555555, + "grad_norm": 1.9673961400985718, + "learning_rate": 0.00012941281138790036, + "loss": 2.0788, + "step": 3977 + }, + { + "epoch": 1.768, + "grad_norm": 2.372072696685791, + "learning_rate": 0.00012939501779359432, + "loss": 2.1401, + "step": 3978 + }, + { + "epoch": 1.7684444444444445, + "grad_norm": 1.8470436334609985, + "learning_rate": 0.00012937722419928825, + "loss": 2.2509, + "step": 3979 + }, + { + "epoch": 1.7688888888888887, + "grad_norm": 1.7825738191604614, + "learning_rate": 0.0001293594306049822, + "loss": 2.1858, + "step": 3980 + }, + { + "epoch": 1.7693333333333334, + "grad_norm": 1.4077835083007812, + "learning_rate": 0.00012934163701067616, + "loss": 1.2364, + "step": 3981 + }, + { + "epoch": 1.7697777777777777, + "grad_norm": 2.0429136753082275, + "learning_rate": 0.00012932384341637012, + "loss": 2.2663, + "step": 3982 + }, + { + "epoch": 1.7702222222222224, 
+ "grad_norm": 1.2380872964859009, + "learning_rate": 0.00012930604982206408, + "loss": 0.6942, + "step": 3983 + }, + { + "epoch": 1.7706666666666666, + "grad_norm": 2.0053813457489014, + "learning_rate": 0.00012928825622775803, + "loss": 2.1342, + "step": 3984 + }, + { + "epoch": 1.771111111111111, + "grad_norm": 1.8979686498641968, + "learning_rate": 0.00012927046263345196, + "loss": 2.4219, + "step": 3985 + }, + { + "epoch": 1.7715555555555556, + "grad_norm": 2.177061080932617, + "learning_rate": 0.00012925266903914592, + "loss": 2.2856, + "step": 3986 + }, + { + "epoch": 1.772, + "grad_norm": 2.1877963542938232, + "learning_rate": 0.00012923487544483985, + "loss": 2.0785, + "step": 3987 + }, + { + "epoch": 1.7724444444444445, + "grad_norm": 1.8126049041748047, + "learning_rate": 0.0001292170818505338, + "loss": 1.8393, + "step": 3988 + }, + { + "epoch": 1.7728888888888887, + "grad_norm": 2.09319806098938, + "learning_rate": 0.00012919928825622776, + "loss": 2.5437, + "step": 3989 + }, + { + "epoch": 1.7733333333333334, + "grad_norm": 1.2192652225494385, + "learning_rate": 0.00012918149466192172, + "loss": 1.0593, + "step": 3990 + }, + { + "epoch": 1.7737777777777777, + "grad_norm": 2.0721795558929443, + "learning_rate": 0.00012916370106761567, + "loss": 2.3111, + "step": 3991 + }, + { + "epoch": 1.7742222222222224, + "grad_norm": 1.9391847848892212, + "learning_rate": 0.0001291459074733096, + "loss": 1.9213, + "step": 3992 + }, + { + "epoch": 1.7746666666666666, + "grad_norm": 2.3300163745880127, + "learning_rate": 0.00012912811387900356, + "loss": 2.4885, + "step": 3993 + }, + { + "epoch": 1.775111111111111, + "grad_norm": 2.414308786392212, + "learning_rate": 0.00012911032028469752, + "loss": 2.4184, + "step": 3994 + }, + { + "epoch": 1.7755555555555556, + "grad_norm": 2.2488865852355957, + "learning_rate": 0.00012909252669039147, + "loss": 2.0251, + "step": 3995 + }, + { + "epoch": 1.776, + "grad_norm": 3.426772117614746, + "learning_rate": 0.00012907473309608543, + "loss": 2.7753, + "step": 3996 + }, + { + "epoch": 1.7764444444444445, + "grad_norm": 2.1739494800567627, + "learning_rate": 0.00012905693950177938, + "loss": 2.0393, + "step": 3997 + }, + { + "epoch": 1.7768888888888887, + "grad_norm": 2.434339761734009, + "learning_rate": 0.00012903914590747331, + "loss": 2.2456, + "step": 3998 + }, + { + "epoch": 1.7773333333333334, + "grad_norm": 2.3951499462127686, + "learning_rate": 0.00012902135231316727, + "loss": 2.292, + "step": 3999 + }, + { + "epoch": 1.7777777777777777, + "grad_norm": 2.0735788345336914, + "learning_rate": 0.0001290035587188612, + "loss": 1.3612, + "step": 4000 + }, + { + "epoch": 1.7782222222222224, + "grad_norm": 0.45426124334335327, + "learning_rate": 0.00012898576512455516, + "loss": 0.0292, + "step": 4001 + }, + { + "epoch": 1.7786666666666666, + "grad_norm": 1.561895728111267, + "learning_rate": 0.0001289679715302491, + "loss": 2.864, + "step": 4002 + }, + { + "epoch": 1.779111111111111, + "grad_norm": 1.1529381275177002, + "learning_rate": 0.00012895017793594307, + "loss": 1.1453, + "step": 4003 + }, + { + "epoch": 1.7795555555555556, + "grad_norm": 1.7909576892852783, + "learning_rate": 0.00012893238434163703, + "loss": 2.1009, + "step": 4004 + }, + { + "epoch": 1.78, + "grad_norm": 1.8325178623199463, + "learning_rate": 0.00012891459074733096, + "loss": 2.4447, + "step": 4005 + }, + { + "epoch": 1.7804444444444445, + "grad_norm": 1.695643663406372, + "learning_rate": 0.0001288967971530249, + "loss": 2.5286, + "step": 4006 + }, + { + "epoch": 
1.7808888888888887, + "grad_norm": 1.6463677883148193, + "learning_rate": 0.00012887900355871887, + "loss": 2.2066, + "step": 4007 + }, + { + "epoch": 1.7813333333333334, + "grad_norm": 1.7361533641815186, + "learning_rate": 0.00012886120996441282, + "loss": 2.2354, + "step": 4008 + }, + { + "epoch": 1.7817777777777777, + "grad_norm": 2.212310791015625, + "learning_rate": 0.00012884341637010678, + "loss": 2.1203, + "step": 4009 + }, + { + "epoch": 1.7822222222222224, + "grad_norm": 1.4698344469070435, + "learning_rate": 0.00012882562277580074, + "loss": 1.8529, + "step": 4010 + }, + { + "epoch": 1.7826666666666666, + "grad_norm": 1.823663592338562, + "learning_rate": 0.00012880782918149467, + "loss": 2.2639, + "step": 4011 + }, + { + "epoch": 1.783111111111111, + "grad_norm": 1.7825374603271484, + "learning_rate": 0.00012879003558718862, + "loss": 1.9933, + "step": 4012 + }, + { + "epoch": 1.7835555555555556, + "grad_norm": 1.8066425323486328, + "learning_rate": 0.00012877224199288255, + "loss": 2.1136, + "step": 4013 + }, + { + "epoch": 1.784, + "grad_norm": 2.2551443576812744, + "learning_rate": 0.0001287544483985765, + "loss": 1.9374, + "step": 4014 + }, + { + "epoch": 1.7844444444444445, + "grad_norm": 1.7912770509719849, + "learning_rate": 0.00012873665480427047, + "loss": 2.2331, + "step": 4015 + }, + { + "epoch": 1.7848888888888887, + "grad_norm": 1.5528522729873657, + "learning_rate": 0.00012871886120996442, + "loss": 1.9012, + "step": 4016 + }, + { + "epoch": 1.7853333333333334, + "grad_norm": 1.6881825923919678, + "learning_rate": 0.00012870106761565838, + "loss": 2.061, + "step": 4017 + }, + { + "epoch": 1.7857777777777777, + "grad_norm": 1.2570315599441528, + "learning_rate": 0.0001286832740213523, + "loss": 1.2893, + "step": 4018 + }, + { + "epoch": 1.7862222222222224, + "grad_norm": 1.4894715547561646, + "learning_rate": 0.00012866548042704627, + "loss": 1.7158, + "step": 4019 + }, + { + "epoch": 1.7866666666666666, + "grad_norm": 1.969714879989624, + "learning_rate": 0.00012864768683274022, + "loss": 2.3733, + "step": 4020 + }, + { + "epoch": 1.787111111111111, + "grad_norm": 1.8792929649353027, + "learning_rate": 0.00012862989323843418, + "loss": 2.1233, + "step": 4021 + }, + { + "epoch": 1.7875555555555556, + "grad_norm": 1.8268121480941772, + "learning_rate": 0.00012861209964412813, + "loss": 2.136, + "step": 4022 + }, + { + "epoch": 1.788, + "grad_norm": 2.0907411575317383, + "learning_rate": 0.00012859430604982206, + "loss": 2.3892, + "step": 4023 + }, + { + "epoch": 1.7884444444444445, + "grad_norm": 1.901108741760254, + "learning_rate": 0.00012857651245551602, + "loss": 1.938, + "step": 4024 + }, + { + "epoch": 1.7888888888888888, + "grad_norm": 1.818852186203003, + "learning_rate": 0.00012855871886120998, + "loss": 2.5912, + "step": 4025 + }, + { + "epoch": 1.7893333333333334, + "grad_norm": 1.6948503255844116, + "learning_rate": 0.0001285409252669039, + "loss": 1.8355, + "step": 4026 + }, + { + "epoch": 1.7897777777777777, + "grad_norm": 2.053222417831421, + "learning_rate": 0.00012852313167259786, + "loss": 1.7958, + "step": 4027 + }, + { + "epoch": 1.7902222222222224, + "grad_norm": 1.777777910232544, + "learning_rate": 0.00012850533807829182, + "loss": 2.276, + "step": 4028 + }, + { + "epoch": 1.7906666666666666, + "grad_norm": 2.1247165203094482, + "learning_rate": 0.00012848754448398578, + "loss": 2.275, + "step": 4029 + }, + { + "epoch": 1.791111111111111, + "grad_norm": 2.1670467853546143, + "learning_rate": 0.0001284697508896797, + "loss": 2.1813, + "step": 
4030 + }, + { + "epoch": 1.7915555555555556, + "grad_norm": 1.7745897769927979, + "learning_rate": 0.00012845195729537366, + "loss": 1.7079, + "step": 4031 + }, + { + "epoch": 1.792, + "grad_norm": 1.98904287815094, + "learning_rate": 0.00012843416370106762, + "loss": 1.9803, + "step": 4032 + }, + { + "epoch": 1.7924444444444445, + "grad_norm": 1.8329356908798218, + "learning_rate": 0.00012841637010676157, + "loss": 2.3103, + "step": 4033 + }, + { + "epoch": 1.7928888888888888, + "grad_norm": 1.814437985420227, + "learning_rate": 0.00012839857651245553, + "loss": 2.0819, + "step": 4034 + }, + { + "epoch": 1.7933333333333334, + "grad_norm": 1.9947712421417236, + "learning_rate": 0.0001283807829181495, + "loss": 2.1793, + "step": 4035 + }, + { + "epoch": 1.7937777777777777, + "grad_norm": 1.7744113206863403, + "learning_rate": 0.00012836298932384342, + "loss": 1.4961, + "step": 4036 + }, + { + "epoch": 1.7942222222222224, + "grad_norm": 1.941453456878662, + "learning_rate": 0.00012834519572953737, + "loss": 1.5453, + "step": 4037 + }, + { + "epoch": 1.7946666666666666, + "grad_norm": 1.993354320526123, + "learning_rate": 0.00012832740213523133, + "loss": 2.5863, + "step": 4038 + }, + { + "epoch": 1.795111111111111, + "grad_norm": 1.9085299968719482, + "learning_rate": 0.00012830960854092526, + "loss": 2.0983, + "step": 4039 + }, + { + "epoch": 1.7955555555555556, + "grad_norm": 2.2730631828308105, + "learning_rate": 0.00012829181494661922, + "loss": 2.894, + "step": 4040 + }, + { + "epoch": 1.796, + "grad_norm": 2.3365797996520996, + "learning_rate": 0.00012827402135231317, + "loss": 2.123, + "step": 4041 + }, + { + "epoch": 1.7964444444444445, + "grad_norm": 1.8728188276290894, + "learning_rate": 0.00012825622775800713, + "loss": 1.4073, + "step": 4042 + }, + { + "epoch": 1.7968888888888888, + "grad_norm": 2.5472781658172607, + "learning_rate": 0.00012823843416370106, + "loss": 2.2789, + "step": 4043 + }, + { + "epoch": 1.7973333333333334, + "grad_norm": 2.269137382507324, + "learning_rate": 0.00012822064056939501, + "loss": 2.3703, + "step": 4044 + }, + { + "epoch": 1.7977777777777777, + "grad_norm": 2.4079058170318604, + "learning_rate": 0.00012820284697508897, + "loss": 1.6054, + "step": 4045 + }, + { + "epoch": 1.7982222222222224, + "grad_norm": 2.692018747329712, + "learning_rate": 0.00012818505338078293, + "loss": 2.6721, + "step": 4046 + }, + { + "epoch": 1.7986666666666666, + "grad_norm": 2.388993740081787, + "learning_rate": 0.00012816725978647688, + "loss": 2.0374, + "step": 4047 + }, + { + "epoch": 1.799111111111111, + "grad_norm": 1.563175082206726, + "learning_rate": 0.00012814946619217084, + "loss": 0.9111, + "step": 4048 + }, + { + "epoch": 1.7995555555555556, + "grad_norm": 2.669541835784912, + "learning_rate": 0.00012813167259786477, + "loss": 1.4868, + "step": 4049 + }, + { + "epoch": 1.8, + "grad_norm": 2.8668127059936523, + "learning_rate": 0.00012811387900355873, + "loss": 1.8904, + "step": 4050 + }, + { + "epoch": 1.8004444444444445, + "grad_norm": 1.4953787326812744, + "learning_rate": 0.00012809608540925266, + "loss": 2.3084, + "step": 4051 + }, + { + "epoch": 1.8008888888888888, + "grad_norm": 1.6110552549362183, + "learning_rate": 0.0001280782918149466, + "loss": 2.9876, + "step": 4052 + }, + { + "epoch": 1.8013333333333335, + "grad_norm": 1.4815409183502197, + "learning_rate": 0.00012806049822064057, + "loss": 2.3245, + "step": 4053 + }, + { + "epoch": 1.8017777777777777, + "grad_norm": 1.6469542980194092, + "learning_rate": 0.00012804270462633453, + "loss": 
2.4534, + "step": 4054 + }, + { + "epoch": 1.8022222222222222, + "grad_norm": 1.7720386981964111, + "learning_rate": 0.00012802491103202848, + "loss": 2.5591, + "step": 4055 + }, + { + "epoch": 1.8026666666666666, + "grad_norm": 1.607649564743042, + "learning_rate": 0.0001280071174377224, + "loss": 2.3919, + "step": 4056 + }, + { + "epoch": 1.803111111111111, + "grad_norm": 1.521120548248291, + "learning_rate": 0.00012798932384341637, + "loss": 2.292, + "step": 4057 + }, + { + "epoch": 1.8035555555555556, + "grad_norm": 1.7461004257202148, + "learning_rate": 0.00012797153024911032, + "loss": 2.1695, + "step": 4058 + }, + { + "epoch": 1.804, + "grad_norm": 1.6158878803253174, + "learning_rate": 0.00012795373665480428, + "loss": 1.4256, + "step": 4059 + }, + { + "epoch": 1.8044444444444445, + "grad_norm": 1.6743505001068115, + "learning_rate": 0.00012793594306049824, + "loss": 1.9986, + "step": 4060 + }, + { + "epoch": 1.8048888888888888, + "grad_norm": 1.6149520874023438, + "learning_rate": 0.0001279181494661922, + "loss": 2.5781, + "step": 4061 + }, + { + "epoch": 1.8053333333333335, + "grad_norm": 1.5467309951782227, + "learning_rate": 0.00012790035587188612, + "loss": 1.7904, + "step": 4062 + }, + { + "epoch": 1.8057777777777777, + "grad_norm": 1.7630541324615479, + "learning_rate": 0.00012788256227758008, + "loss": 2.2801, + "step": 4063 + }, + { + "epoch": 1.8062222222222222, + "grad_norm": 1.8055700063705444, + "learning_rate": 0.000127864768683274, + "loss": 1.9704, + "step": 4064 + }, + { + "epoch": 1.8066666666666666, + "grad_norm": 1.7958096265792847, + "learning_rate": 0.00012784697508896797, + "loss": 2.3531, + "step": 4065 + }, + { + "epoch": 1.8071111111111111, + "grad_norm": 1.426638126373291, + "learning_rate": 0.00012782918149466192, + "loss": 1.2682, + "step": 4066 + }, + { + "epoch": 1.8075555555555556, + "grad_norm": 1.6399476528167725, + "learning_rate": 0.00012781138790035588, + "loss": 2.1429, + "step": 4067 + }, + { + "epoch": 1.808, + "grad_norm": 1.7020010948181152, + "learning_rate": 0.00012779359430604984, + "loss": 2.2423, + "step": 4068 + }, + { + "epoch": 1.8084444444444445, + "grad_norm": 1.5817952156066895, + "learning_rate": 0.00012777580071174376, + "loss": 1.7548, + "step": 4069 + }, + { + "epoch": 1.8088888888888888, + "grad_norm": 1.6301820278167725, + "learning_rate": 0.00012775800711743772, + "loss": 2.1788, + "step": 4070 + }, + { + "epoch": 1.8093333333333335, + "grad_norm": 1.7955340147018433, + "learning_rate": 0.00012774021352313168, + "loss": 1.756, + "step": 4071 + }, + { + "epoch": 1.8097777777777777, + "grad_norm": 1.5359266996383667, + "learning_rate": 0.00012772241992882563, + "loss": 1.714, + "step": 4072 + }, + { + "epoch": 1.8102222222222222, + "grad_norm": 1.6445212364196777, + "learning_rate": 0.0001277046263345196, + "loss": 1.9434, + "step": 4073 + }, + { + "epoch": 1.8106666666666666, + "grad_norm": 1.811740517616272, + "learning_rate": 0.00012768683274021355, + "loss": 1.7521, + "step": 4074 + }, + { + "epoch": 1.8111111111111111, + "grad_norm": 1.9442163705825806, + "learning_rate": 0.00012766903914590748, + "loss": 2.284, + "step": 4075 + }, + { + "epoch": 1.8115555555555556, + "grad_norm": 2.383162498474121, + "learning_rate": 0.00012765124555160143, + "loss": 2.1859, + "step": 4076 + }, + { + "epoch": 1.812, + "grad_norm": 1.7321439981460571, + "learning_rate": 0.00012763345195729536, + "loss": 1.6559, + "step": 4077 + }, + { + "epoch": 1.8124444444444445, + "grad_norm": 1.9843666553497314, + "learning_rate": 
0.00012761565836298932, + "loss": 2.1682, + "step": 4078 + }, + { + "epoch": 1.8128888888888888, + "grad_norm": 1.7194164991378784, + "learning_rate": 0.00012759786476868328, + "loss": 1.4098, + "step": 4079 + }, + { + "epoch": 1.8133333333333335, + "grad_norm": 1.5130424499511719, + "learning_rate": 0.00012758007117437723, + "loss": 1.1193, + "step": 4080 + }, + { + "epoch": 1.8137777777777777, + "grad_norm": 2.0162298679351807, + "learning_rate": 0.0001275622775800712, + "loss": 2.1374, + "step": 4081 + }, + { + "epoch": 1.8142222222222222, + "grad_norm": 2.095752716064453, + "learning_rate": 0.00012754448398576512, + "loss": 1.7003, + "step": 4082 + }, + { + "epoch": 1.8146666666666667, + "grad_norm": 1.4571044445037842, + "learning_rate": 0.00012752669039145907, + "loss": 0.9885, + "step": 4083 + }, + { + "epoch": 1.8151111111111111, + "grad_norm": 2.0335910320281982, + "learning_rate": 0.00012750889679715303, + "loss": 1.7632, + "step": 4084 + }, + { + "epoch": 1.8155555555555556, + "grad_norm": 2.1240885257720947, + "learning_rate": 0.000127491103202847, + "loss": 1.9801, + "step": 4085 + }, + { + "epoch": 1.8159999999999998, + "grad_norm": 2.1505091190338135, + "learning_rate": 0.00012747330960854094, + "loss": 2.513, + "step": 4086 + }, + { + "epoch": 1.8164444444444445, + "grad_norm": 2.2950072288513184, + "learning_rate": 0.0001274555160142349, + "loss": 1.9917, + "step": 4087 + }, + { + "epoch": 1.8168888888888888, + "grad_norm": 2.2907001972198486, + "learning_rate": 0.00012743772241992883, + "loss": 2.3145, + "step": 4088 + }, + { + "epoch": 1.8173333333333335, + "grad_norm": 2.215815782546997, + "learning_rate": 0.00012741992882562279, + "loss": 1.8032, + "step": 4089 + }, + { + "epoch": 1.8177777777777777, + "grad_norm": 2.2718591690063477, + "learning_rate": 0.00012740213523131672, + "loss": 2.2278, + "step": 4090 + }, + { + "epoch": 1.8182222222222222, + "grad_norm": 2.494612455368042, + "learning_rate": 0.00012738434163701067, + "loss": 2.8879, + "step": 4091 + }, + { + "epoch": 1.8186666666666667, + "grad_norm": 2.505629539489746, + "learning_rate": 0.00012736654804270463, + "loss": 2.0987, + "step": 4092 + }, + { + "epoch": 1.8191111111111111, + "grad_norm": 2.47916841506958, + "learning_rate": 0.00012734875444839859, + "loss": 2.3081, + "step": 4093 + }, + { + "epoch": 1.8195555555555556, + "grad_norm": 2.0762839317321777, + "learning_rate": 0.00012733096085409254, + "loss": 1.929, + "step": 4094 + }, + { + "epoch": 1.8199999999999998, + "grad_norm": 2.3247344493865967, + "learning_rate": 0.00012731316725978647, + "loss": 2.4938, + "step": 4095 + }, + { + "epoch": 1.8204444444444445, + "grad_norm": 2.481536865234375, + "learning_rate": 0.00012729537366548043, + "loss": 2.1279, + "step": 4096 + }, + { + "epoch": 1.8208888888888888, + "grad_norm": 2.1468632221221924, + "learning_rate": 0.00012727758007117438, + "loss": 2.2491, + "step": 4097 + }, + { + "epoch": 1.8213333333333335, + "grad_norm": 2.509892702102661, + "learning_rate": 0.00012725978647686834, + "loss": 2.1366, + "step": 4098 + }, + { + "epoch": 1.8217777777777777, + "grad_norm": 2.757918119430542, + "learning_rate": 0.0001272419928825623, + "loss": 2.3787, + "step": 4099 + }, + { + "epoch": 1.8222222222222222, + "grad_norm": 2.95184326171875, + "learning_rate": 0.00012722419928825625, + "loss": 1.7402, + "step": 4100 + }, + { + "epoch": 1.8226666666666667, + "grad_norm": 1.111022710800171, + "learning_rate": 0.00012720640569395018, + "loss": 1.3282, + "step": 4101 + }, + { + "epoch": 1.8231111111111111, + 
"grad_norm": 1.570564866065979, + "learning_rate": 0.00012718861209964414, + "loss": 2.5706, + "step": 4102 + }, + { + "epoch": 1.8235555555555556, + "grad_norm": 1.5850750207901, + "learning_rate": 0.00012717081850533807, + "loss": 2.603, + "step": 4103 + }, + { + "epoch": 1.8239999999999998, + "grad_norm": 1.873236060142517, + "learning_rate": 0.00012715302491103203, + "loss": 2.306, + "step": 4104 + }, + { + "epoch": 1.8244444444444445, + "grad_norm": 1.5911369323730469, + "learning_rate": 0.00012713523131672598, + "loss": 2.377, + "step": 4105 + }, + { + "epoch": 1.8248888888888888, + "grad_norm": 1.6573972702026367, + "learning_rate": 0.00012711743772241994, + "loss": 2.1528, + "step": 4106 + }, + { + "epoch": 1.8253333333333335, + "grad_norm": 1.76816987991333, + "learning_rate": 0.0001270996441281139, + "loss": 1.9276, + "step": 4107 + }, + { + "epoch": 1.8257777777777777, + "grad_norm": 1.6484469175338745, + "learning_rate": 0.00012708185053380782, + "loss": 2.2868, + "step": 4108 + }, + { + "epoch": 1.8262222222222222, + "grad_norm": 1.8483304977416992, + "learning_rate": 0.00012706405693950178, + "loss": 1.8456, + "step": 4109 + }, + { + "epoch": 1.8266666666666667, + "grad_norm": 1.5337408781051636, + "learning_rate": 0.00012704626334519574, + "loss": 1.3085, + "step": 4110 + }, + { + "epoch": 1.8271111111111111, + "grad_norm": 1.832377314567566, + "learning_rate": 0.0001270284697508897, + "loss": 2.0178, + "step": 4111 + }, + { + "epoch": 1.8275555555555556, + "grad_norm": 1.631611943244934, + "learning_rate": 0.00012701067615658365, + "loss": 2.3294, + "step": 4112 + }, + { + "epoch": 1.8279999999999998, + "grad_norm": 1.6167796850204468, + "learning_rate": 0.00012699288256227758, + "loss": 2.2195, + "step": 4113 + }, + { + "epoch": 1.8284444444444445, + "grad_norm": 1.6683152914047241, + "learning_rate": 0.00012697508896797154, + "loss": 2.4356, + "step": 4114 + }, + { + "epoch": 1.8288888888888888, + "grad_norm": 1.8200979232788086, + "learning_rate": 0.0001269572953736655, + "loss": 2.0209, + "step": 4115 + }, + { + "epoch": 1.8293333333333335, + "grad_norm": 1.7240800857543945, + "learning_rate": 0.00012693950177935942, + "loss": 2.5672, + "step": 4116 + }, + { + "epoch": 1.8297777777777777, + "grad_norm": 1.9097305536270142, + "learning_rate": 0.00012692170818505338, + "loss": 2.4372, + "step": 4117 + }, + { + "epoch": 1.8302222222222222, + "grad_norm": 1.1662497520446777, + "learning_rate": 0.00012690391459074733, + "loss": 1.1567, + "step": 4118 + }, + { + "epoch": 1.8306666666666667, + "grad_norm": 1.9544744491577148, + "learning_rate": 0.0001268861209964413, + "loss": 2.2734, + "step": 4119 + }, + { + "epoch": 1.8311111111111111, + "grad_norm": 1.7287496328353882, + "learning_rate": 0.00012686832740213522, + "loss": 1.9544, + "step": 4120 + }, + { + "epoch": 1.8315555555555556, + "grad_norm": 1.6461305618286133, + "learning_rate": 0.00012685053380782918, + "loss": 2.2844, + "step": 4121 + }, + { + "epoch": 1.8319999999999999, + "grad_norm": 1.8667876720428467, + "learning_rate": 0.00012683274021352313, + "loss": 2.483, + "step": 4122 + }, + { + "epoch": 1.8324444444444445, + "grad_norm": 1.8884638547897339, + "learning_rate": 0.0001268149466192171, + "loss": 2.2412, + "step": 4123 + }, + { + "epoch": 1.8328888888888888, + "grad_norm": 1.9503480195999146, + "learning_rate": 0.00012679715302491105, + "loss": 2.3155, + "step": 4124 + }, + { + "epoch": 1.8333333333333335, + "grad_norm": 1.864048719406128, + "learning_rate": 0.000126779359430605, + "loss": 1.9597, + 
"step": 4125 + }, + { + "epoch": 1.8337777777777777, + "grad_norm": 1.8108760118484497, + "learning_rate": 0.00012676156583629893, + "loss": 2.4295, + "step": 4126 + }, + { + "epoch": 1.8342222222222222, + "grad_norm": 2.0015783309936523, + "learning_rate": 0.0001267437722419929, + "loss": 2.2783, + "step": 4127 + }, + { + "epoch": 1.8346666666666667, + "grad_norm": 1.8798251152038574, + "learning_rate": 0.00012672597864768685, + "loss": 1.6982, + "step": 4128 + }, + { + "epoch": 1.8351111111111111, + "grad_norm": 2.052774667739868, + "learning_rate": 0.00012670818505338078, + "loss": 2.3289, + "step": 4129 + }, + { + "epoch": 1.8355555555555556, + "grad_norm": 2.0394935607910156, + "learning_rate": 0.00012669039145907473, + "loss": 2.3255, + "step": 4130 + }, + { + "epoch": 1.8359999999999999, + "grad_norm": 3.176072120666504, + "learning_rate": 0.0001266725978647687, + "loss": 2.1502, + "step": 4131 + }, + { + "epoch": 1.8364444444444445, + "grad_norm": 2.176759719848633, + "learning_rate": 0.00012665480427046264, + "loss": 2.0181, + "step": 4132 + }, + { + "epoch": 1.8368888888888888, + "grad_norm": 1.9604226350784302, + "learning_rate": 0.00012663701067615657, + "loss": 2.3706, + "step": 4133 + }, + { + "epoch": 1.8373333333333335, + "grad_norm": 1.9100964069366455, + "learning_rate": 0.00012661921708185053, + "loss": 2.1983, + "step": 4134 + }, + { + "epoch": 1.8377777777777777, + "grad_norm": 1.7293579578399658, + "learning_rate": 0.0001266014234875445, + "loss": 2.0016, + "step": 4135 + }, + { + "epoch": 1.8382222222222222, + "grad_norm": 1.660583257675171, + "learning_rate": 0.00012658362989323844, + "loss": 1.5306, + "step": 4136 + }, + { + "epoch": 1.8386666666666667, + "grad_norm": 1.858089804649353, + "learning_rate": 0.0001265658362989324, + "loss": 2.1515, + "step": 4137 + }, + { + "epoch": 1.8391111111111111, + "grad_norm": 2.4940619468688965, + "learning_rate": 0.00012654804270462636, + "loss": 1.9427, + "step": 4138 + }, + { + "epoch": 1.8395555555555556, + "grad_norm": 2.1511716842651367, + "learning_rate": 0.00012653024911032029, + "loss": 2.0729, + "step": 4139 + }, + { + "epoch": 1.8399999999999999, + "grad_norm": 2.469057321548462, + "learning_rate": 0.00012651245551601424, + "loss": 1.8632, + "step": 4140 + }, + { + "epoch": 1.8404444444444445, + "grad_norm": 2.173260450363159, + "learning_rate": 0.0001264946619217082, + "loss": 2.422, + "step": 4141 + }, + { + "epoch": 1.8408888888888888, + "grad_norm": 2.3541669845581055, + "learning_rate": 0.00012647686832740213, + "loss": 2.3772, + "step": 4142 + }, + { + "epoch": 1.8413333333333335, + "grad_norm": 2.6083996295928955, + "learning_rate": 0.00012645907473309608, + "loss": 2.3829, + "step": 4143 + }, + { + "epoch": 1.8417777777777777, + "grad_norm": 2.0515520572662354, + "learning_rate": 0.00012644128113879004, + "loss": 2.0062, + "step": 4144 + }, + { + "epoch": 1.8422222222222222, + "grad_norm": 2.331029176712036, + "learning_rate": 0.000126423487544484, + "loss": 2.1876, + "step": 4145 + }, + { + "epoch": 1.8426666666666667, + "grad_norm": 2.6274282932281494, + "learning_rate": 0.00012640569395017793, + "loss": 2.344, + "step": 4146 + }, + { + "epoch": 1.8431111111111111, + "grad_norm": 2.3808679580688477, + "learning_rate": 0.00012638790035587188, + "loss": 1.9633, + "step": 4147 + }, + { + "epoch": 1.8435555555555556, + "grad_norm": 2.2616419792175293, + "learning_rate": 0.00012637010676156584, + "loss": 2.2526, + "step": 4148 + }, + { + "epoch": 1.8439999999999999, + "grad_norm": 1.2193570137023926, + 
"learning_rate": 0.0001263523131672598, + "loss": 0.1276, + "step": 4149 + }, + { + "epoch": 1.8444444444444446, + "grad_norm": 2.9776499271392822, + "learning_rate": 0.00012633451957295375, + "loss": 1.8211, + "step": 4150 + }, + { + "epoch": 1.8448888888888888, + "grad_norm": 1.406093955039978, + "learning_rate": 0.0001263167259786477, + "loss": 2.3213, + "step": 4151 + }, + { + "epoch": 1.8453333333333335, + "grad_norm": 1.0631603002548218, + "learning_rate": 0.00012629893238434164, + "loss": 1.0766, + "step": 4152 + }, + { + "epoch": 1.8457777777777777, + "grad_norm": 1.5336110591888428, + "learning_rate": 0.0001262811387900356, + "loss": 2.4137, + "step": 4153 + }, + { + "epoch": 1.8462222222222222, + "grad_norm": 1.60658597946167, + "learning_rate": 0.00012626334519572955, + "loss": 2.7207, + "step": 4154 + }, + { + "epoch": 1.8466666666666667, + "grad_norm": 1.6107380390167236, + "learning_rate": 0.00012624555160142348, + "loss": 2.6709, + "step": 4155 + }, + { + "epoch": 1.8471111111111111, + "grad_norm": 1.7542976140975952, + "learning_rate": 0.00012622775800711744, + "loss": 2.6682, + "step": 4156 + }, + { + "epoch": 1.8475555555555556, + "grad_norm": 1.7753915786743164, + "learning_rate": 0.0001262099644128114, + "loss": 2.0563, + "step": 4157 + }, + { + "epoch": 1.8479999999999999, + "grad_norm": 1.0910627841949463, + "learning_rate": 0.00012619217081850535, + "loss": 1.2284, + "step": 4158 + }, + { + "epoch": 1.8484444444444446, + "grad_norm": 1.4845707416534424, + "learning_rate": 0.00012617437722419928, + "loss": 1.7377, + "step": 4159 + }, + { + "epoch": 1.8488888888888888, + "grad_norm": 1.4500175714492798, + "learning_rate": 0.00012615658362989324, + "loss": 1.5395, + "step": 4160 + }, + { + "epoch": 1.8493333333333335, + "grad_norm": 1.6412715911865234, + "learning_rate": 0.0001261387900355872, + "loss": 1.5817, + "step": 4161 + }, + { + "epoch": 1.8497777777777777, + "grad_norm": 1.6677457094192505, + "learning_rate": 0.00012612099644128115, + "loss": 2.4675, + "step": 4162 + }, + { + "epoch": 1.8502222222222222, + "grad_norm": 1.9067307710647583, + "learning_rate": 0.0001261032028469751, + "loss": 2.383, + "step": 4163 + }, + { + "epoch": 1.8506666666666667, + "grad_norm": 1.6068315505981445, + "learning_rate": 0.00012608540925266906, + "loss": 1.9027, + "step": 4164 + }, + { + "epoch": 1.8511111111111112, + "grad_norm": 1.8001701831817627, + "learning_rate": 0.000126067615658363, + "loss": 2.42, + "step": 4165 + }, + { + "epoch": 1.8515555555555556, + "grad_norm": 2.3257319927215576, + "learning_rate": 0.00012604982206405695, + "loss": 2.0886, + "step": 4166 + }, + { + "epoch": 1.8519999999999999, + "grad_norm": 2.088073492050171, + "learning_rate": 0.00012603202846975088, + "loss": 2.7623, + "step": 4167 + }, + { + "epoch": 1.8524444444444446, + "grad_norm": 1.0101107358932495, + "learning_rate": 0.00012601423487544483, + "loss": 0.7091, + "step": 4168 + }, + { + "epoch": 1.8528888888888888, + "grad_norm": 1.636488676071167, + "learning_rate": 0.0001259964412811388, + "loss": 2.0673, + "step": 4169 + }, + { + "epoch": 1.8533333333333335, + "grad_norm": 1.791585922241211, + "learning_rate": 0.00012597864768683275, + "loss": 2.1496, + "step": 4170 + }, + { + "epoch": 1.8537777777777777, + "grad_norm": 1.8314164876937866, + "learning_rate": 0.0001259608540925267, + "loss": 2.2932, + "step": 4171 + }, + { + "epoch": 1.8542222222222222, + "grad_norm": 1.7913572788238525, + "learning_rate": 0.00012594306049822063, + "loss": 1.8432, + "step": 4172 + }, + { + "epoch": 
1.8546666666666667, + "grad_norm": 1.7947697639465332, + "learning_rate": 0.0001259252669039146, + "loss": 1.7726, + "step": 4173 + }, + { + "epoch": 1.8551111111111112, + "grad_norm": 1.8670748472213745, + "learning_rate": 0.00012590747330960855, + "loss": 2.0514, + "step": 4174 + }, + { + "epoch": 1.8555555555555556, + "grad_norm": 1.9150646924972534, + "learning_rate": 0.0001258896797153025, + "loss": 2.2755, + "step": 4175 + }, + { + "epoch": 1.8559999999999999, + "grad_norm": 1.9982563257217407, + "learning_rate": 0.00012587188612099646, + "loss": 2.4126, + "step": 4176 + }, + { + "epoch": 1.8564444444444446, + "grad_norm": 1.914477825164795, + "learning_rate": 0.00012585409252669042, + "loss": 1.7552, + "step": 4177 + }, + { + "epoch": 1.8568888888888888, + "grad_norm": 2.1474571228027344, + "learning_rate": 0.00012583629893238435, + "loss": 2.5033, + "step": 4178 + }, + { + "epoch": 1.8573333333333333, + "grad_norm": 1.845629334449768, + "learning_rate": 0.0001258185053380783, + "loss": 2.109, + "step": 4179 + }, + { + "epoch": 1.8577777777777778, + "grad_norm": 2.0772738456726074, + "learning_rate": 0.00012580071174377223, + "loss": 1.5434, + "step": 4180 + }, + { + "epoch": 1.8582222222222222, + "grad_norm": 1.9606627225875854, + "learning_rate": 0.0001257829181494662, + "loss": 2.1156, + "step": 4181 + }, + { + "epoch": 1.8586666666666667, + "grad_norm": 2.0049219131469727, + "learning_rate": 0.00012576512455516014, + "loss": 1.9324, + "step": 4182 + }, + { + "epoch": 1.8591111111111112, + "grad_norm": 1.535262107849121, + "learning_rate": 0.0001257473309608541, + "loss": 1.2178, + "step": 4183 + }, + { + "epoch": 1.8595555555555556, + "grad_norm": 2.2102348804473877, + "learning_rate": 0.00012572953736654806, + "loss": 2.8349, + "step": 4184 + }, + { + "epoch": 1.8599999999999999, + "grad_norm": 1.7623968124389648, + "learning_rate": 0.000125711743772242, + "loss": 1.5631, + "step": 4185 + }, + { + "epoch": 1.8604444444444446, + "grad_norm": 2.135024070739746, + "learning_rate": 0.00012569395017793594, + "loss": 2.233, + "step": 4186 + }, + { + "epoch": 1.8608888888888888, + "grad_norm": 2.3472445011138916, + "learning_rate": 0.0001256761565836299, + "loss": 2.658, + "step": 4187 + }, + { + "epoch": 1.8613333333333333, + "grad_norm": 2.1595356464385986, + "learning_rate": 0.00012565836298932386, + "loss": 2.0647, + "step": 4188 + }, + { + "epoch": 1.8617777777777778, + "grad_norm": 2.6070942878723145, + "learning_rate": 0.0001256405693950178, + "loss": 2.0941, + "step": 4189 + }, + { + "epoch": 1.8622222222222222, + "grad_norm": 2.0253846645355225, + "learning_rate": 0.00012562277580071177, + "loss": 1.9656, + "step": 4190 + }, + { + "epoch": 1.8626666666666667, + "grad_norm": 2.2132487297058105, + "learning_rate": 0.0001256049822064057, + "loss": 2.3254, + "step": 4191 + }, + { + "epoch": 1.8631111111111112, + "grad_norm": 2.6254289150238037, + "learning_rate": 0.00012558718861209966, + "loss": 3.0945, + "step": 4192 + }, + { + "epoch": 1.8635555555555556, + "grad_norm": 2.6178314685821533, + "learning_rate": 0.00012556939501779358, + "loss": 2.216, + "step": 4193 + }, + { + "epoch": 1.8639999999999999, + "grad_norm": 2.047734498977661, + "learning_rate": 0.00012555160142348754, + "loss": 1.6887, + "step": 4194 + }, + { + "epoch": 1.8644444444444446, + "grad_norm": 3.436713695526123, + "learning_rate": 0.0001255338078291815, + "loss": 1.5871, + "step": 4195 + }, + { + "epoch": 1.8648888888888888, + "grad_norm": 2.280249834060669, + "learning_rate": 0.00012551601423487545, + 
"loss": 2.2639, + "step": 4196 + }, + { + "epoch": 1.8653333333333333, + "grad_norm": 2.708115816116333, + "learning_rate": 0.0001254982206405694, + "loss": 2.4468, + "step": 4197 + }, + { + "epoch": 1.8657777777777778, + "grad_norm": 2.6053898334503174, + "learning_rate": 0.00012548042704626334, + "loss": 2.3777, + "step": 4198 + }, + { + "epoch": 1.8662222222222222, + "grad_norm": 1.6615748405456543, + "learning_rate": 0.0001254626334519573, + "loss": 0.9062, + "step": 4199 + }, + { + "epoch": 1.8666666666666667, + "grad_norm": 2.4906649589538574, + "learning_rate": 0.00012544483985765125, + "loss": 0.8929, + "step": 4200 + }, + { + "epoch": 1.8671111111111112, + "grad_norm": 1.4890950918197632, + "learning_rate": 0.0001254270462633452, + "loss": 2.7543, + "step": 4201 + }, + { + "epoch": 1.8675555555555556, + "grad_norm": 1.5864791870117188, + "learning_rate": 0.00012540925266903917, + "loss": 2.6892, + "step": 4202 + }, + { + "epoch": 1.8679999999999999, + "grad_norm": 1.612914800643921, + "learning_rate": 0.0001253914590747331, + "loss": 1.7013, + "step": 4203 + }, + { + "epoch": 1.8684444444444446, + "grad_norm": 1.4477015733718872, + "learning_rate": 0.00012537366548042705, + "loss": 1.879, + "step": 4204 + }, + { + "epoch": 1.8688888888888888, + "grad_norm": 1.9278892278671265, + "learning_rate": 0.000125355871886121, + "loss": 2.1627, + "step": 4205 + }, + { + "epoch": 1.8693333333333333, + "grad_norm": 1.7229881286621094, + "learning_rate": 0.00012533807829181494, + "loss": 2.3671, + "step": 4206 + }, + { + "epoch": 1.8697777777777778, + "grad_norm": 1.8617095947265625, + "learning_rate": 0.0001253202846975089, + "loss": 2.6608, + "step": 4207 + }, + { + "epoch": 1.8702222222222222, + "grad_norm": 1.8363666534423828, + "learning_rate": 0.00012530249110320285, + "loss": 2.059, + "step": 4208 + }, + { + "epoch": 1.8706666666666667, + "grad_norm": 1.920870304107666, + "learning_rate": 0.0001252846975088968, + "loss": 1.7415, + "step": 4209 + }, + { + "epoch": 1.871111111111111, + "grad_norm": 1.610530138015747, + "learning_rate": 0.00012526690391459074, + "loss": 2.2202, + "step": 4210 + }, + { + "epoch": 1.8715555555555556, + "grad_norm": 1.8126658201217651, + "learning_rate": 0.0001252491103202847, + "loss": 2.2095, + "step": 4211 + }, + { + "epoch": 1.8719999999999999, + "grad_norm": 2.0825304985046387, + "learning_rate": 0.00012523131672597865, + "loss": 2.3952, + "step": 4212 + }, + { + "epoch": 1.8724444444444446, + "grad_norm": 1.455892562866211, + "learning_rate": 0.0001252135231316726, + "loss": 1.4388, + "step": 4213 + }, + { + "epoch": 1.8728888888888888, + "grad_norm": 1.9848651885986328, + "learning_rate": 0.00012519572953736656, + "loss": 2.325, + "step": 4214 + }, + { + "epoch": 1.8733333333333333, + "grad_norm": 1.972327709197998, + "learning_rate": 0.00012517793594306052, + "loss": 1.9154, + "step": 4215 + }, + { + "epoch": 1.8737777777777778, + "grad_norm": 1.8076159954071045, + "learning_rate": 0.00012516014234875445, + "loss": 2.4675, + "step": 4216 + }, + { + "epoch": 1.8742222222222222, + "grad_norm": 1.6413958072662354, + "learning_rate": 0.0001251423487544484, + "loss": 1.5596, + "step": 4217 + }, + { + "epoch": 1.8746666666666667, + "grad_norm": 1.9646254777908325, + "learning_rate": 0.00012512455516014236, + "loss": 2.1716, + "step": 4218 + }, + { + "epoch": 1.875111111111111, + "grad_norm": 1.7197006940841675, + "learning_rate": 0.0001251067615658363, + "loss": 2.0816, + "step": 4219 + }, + { + "epoch": 1.8755555555555556, + "grad_norm": 
1.8920749425888062, + "learning_rate": 0.00012508896797153025, + "loss": 2.2979, + "step": 4220 + }, + { + "epoch": 1.876, + "grad_norm": 1.8623499870300293, + "learning_rate": 0.0001250711743772242, + "loss": 2.5679, + "step": 4221 + }, + { + "epoch": 1.8764444444444446, + "grad_norm": 2.2921416759490967, + "learning_rate": 0.00012505338078291816, + "loss": 2.4083, + "step": 4222 + }, + { + "epoch": 1.8768888888888888, + "grad_norm": 1.984156847000122, + "learning_rate": 0.0001250355871886121, + "loss": 2.6118, + "step": 4223 + }, + { + "epoch": 1.8773333333333333, + "grad_norm": 1.681645154953003, + "learning_rate": 0.00012501779359430605, + "loss": 1.4962, + "step": 4224 + }, + { + "epoch": 1.8777777777777778, + "grad_norm": 2.018657684326172, + "learning_rate": 0.000125, + "loss": 2.2015, + "step": 4225 + }, + { + "epoch": 1.8782222222222222, + "grad_norm": 1.8145815134048462, + "learning_rate": 0.00012498220640569396, + "loss": 1.4773, + "step": 4226 + }, + { + "epoch": 1.8786666666666667, + "grad_norm": 1.8681118488311768, + "learning_rate": 0.00012496441281138792, + "loss": 2.0218, + "step": 4227 + }, + { + "epoch": 1.879111111111111, + "grad_norm": 2.0949301719665527, + "learning_rate": 0.00012494661921708187, + "loss": 2.1287, + "step": 4228 + }, + { + "epoch": 1.8795555555555556, + "grad_norm": 1.3150235414505005, + "learning_rate": 0.0001249288256227758, + "loss": 1.1873, + "step": 4229 + }, + { + "epoch": 1.88, + "grad_norm": 2.1858744621276855, + "learning_rate": 0.00012491103202846976, + "loss": 2.2247, + "step": 4230 + }, + { + "epoch": 1.8804444444444446, + "grad_norm": 2.0682497024536133, + "learning_rate": 0.00012489323843416371, + "loss": 2.2693, + "step": 4231 + }, + { + "epoch": 1.8808888888888888, + "grad_norm": 2.008765697479248, + "learning_rate": 0.00012487544483985764, + "loss": 1.8305, + "step": 4232 + }, + { + "epoch": 1.8813333333333333, + "grad_norm": 2.322118043899536, + "learning_rate": 0.0001248576512455516, + "loss": 2.6863, + "step": 4233 + }, + { + "epoch": 1.8817777777777778, + "grad_norm": 2.1211628913879395, + "learning_rate": 0.00012483985765124556, + "loss": 2.3278, + "step": 4234 + }, + { + "epoch": 1.8822222222222222, + "grad_norm": 2.126739501953125, + "learning_rate": 0.0001248220640569395, + "loss": 1.9615, + "step": 4235 + }, + { + "epoch": 1.8826666666666667, + "grad_norm": 1.4139108657836914, + "learning_rate": 0.00012480427046263344, + "loss": 1.3694, + "step": 4236 + }, + { + "epoch": 1.883111111111111, + "grad_norm": 2.1995489597320557, + "learning_rate": 0.0001247864768683274, + "loss": 2.1418, + "step": 4237 + }, + { + "epoch": 1.8835555555555556, + "grad_norm": 2.4655182361602783, + "learning_rate": 0.00012476868327402136, + "loss": 2.497, + "step": 4238 + }, + { + "epoch": 1.884, + "grad_norm": 2.2003283500671387, + "learning_rate": 0.0001247508896797153, + "loss": 2.6386, + "step": 4239 + }, + { + "epoch": 1.8844444444444446, + "grad_norm": 2.3272600173950195, + "learning_rate": 0.00012473309608540927, + "loss": 2.2253, + "step": 4240 + }, + { + "epoch": 1.8848888888888888, + "grad_norm": 1.7986118793487549, + "learning_rate": 0.00012471530249110323, + "loss": 2.112, + "step": 4241 + }, + { + "epoch": 1.8853333333333333, + "grad_norm": 2.1452407836914062, + "learning_rate": 0.00012469750889679715, + "loss": 1.9534, + "step": 4242 + }, + { + "epoch": 1.8857777777777778, + "grad_norm": 2.466782808303833, + "learning_rate": 0.0001246797153024911, + "loss": 2.5373, + "step": 4243 + }, + { + "epoch": 1.8862222222222222, + "grad_norm": 
2.473870277404785, + "learning_rate": 0.00012466192170818507, + "loss": 2.505, + "step": 4244 + }, + { + "epoch": 1.8866666666666667, + "grad_norm": 2.5043163299560547, + "learning_rate": 0.000124644128113879, + "loss": 2.4174, + "step": 4245 + }, + { + "epoch": 1.887111111111111, + "grad_norm": 2.2881720066070557, + "learning_rate": 0.00012462633451957295, + "loss": 2.4042, + "step": 4246 + }, + { + "epoch": 1.8875555555555557, + "grad_norm": 2.357645034790039, + "learning_rate": 0.0001246085409252669, + "loss": 2.1379, + "step": 4247 + }, + { + "epoch": 1.888, + "grad_norm": 2.677143096923828, + "learning_rate": 0.00012459074733096087, + "loss": 2.0372, + "step": 4248 + }, + { + "epoch": 1.8884444444444446, + "grad_norm": 2.4684627056121826, + "learning_rate": 0.0001245729537366548, + "loss": 1.7057, + "step": 4249 + }, + { + "epoch": 1.8888888888888888, + "grad_norm": 2.5674924850463867, + "learning_rate": 0.00012455516014234875, + "loss": 1.4432, + "step": 4250 + }, + { + "epoch": 1.8893333333333333, + "grad_norm": 1.4329222440719604, + "learning_rate": 0.0001245373665480427, + "loss": 2.3156, + "step": 4251 + }, + { + "epoch": 1.8897777777777778, + "grad_norm": 1.45597505569458, + "learning_rate": 0.00012451957295373667, + "loss": 2.4051, + "step": 4252 + }, + { + "epoch": 1.8902222222222222, + "grad_norm": 1.2270104885101318, + "learning_rate": 0.00012450177935943062, + "loss": 1.7157, + "step": 4253 + }, + { + "epoch": 1.8906666666666667, + "grad_norm": 1.1220571994781494, + "learning_rate": 0.00012448398576512458, + "loss": 1.2594, + "step": 4254 + }, + { + "epoch": 1.891111111111111, + "grad_norm": 1.5439119338989258, + "learning_rate": 0.0001244661921708185, + "loss": 2.2097, + "step": 4255 + }, + { + "epoch": 1.8915555555555557, + "grad_norm": 1.6956852674484253, + "learning_rate": 0.00012444839857651246, + "loss": 2.6823, + "step": 4256 + }, + { + "epoch": 1.892, + "grad_norm": 1.640121579170227, + "learning_rate": 0.00012443060498220642, + "loss": 2.2303, + "step": 4257 + }, + { + "epoch": 1.8924444444444446, + "grad_norm": 1.3527717590332031, + "learning_rate": 0.00012441281138790035, + "loss": 1.1705, + "step": 4258 + }, + { + "epoch": 1.8928888888888888, + "grad_norm": 1.7290159463882446, + "learning_rate": 0.0001243950177935943, + "loss": 2.4764, + "step": 4259 + }, + { + "epoch": 1.8933333333333333, + "grad_norm": 1.5144662857055664, + "learning_rate": 0.00012437722419928826, + "loss": 1.8495, + "step": 4260 + }, + { + "epoch": 1.8937777777777778, + "grad_norm": 1.8776041269302368, + "learning_rate": 0.00012435943060498222, + "loss": 2.7901, + "step": 4261 + }, + { + "epoch": 1.8942222222222223, + "grad_norm": 1.9808104038238525, + "learning_rate": 0.00012434163701067615, + "loss": 2.3308, + "step": 4262 + }, + { + "epoch": 1.8946666666666667, + "grad_norm": 1.6954816579818726, + "learning_rate": 0.0001243238434163701, + "loss": 2.1066, + "step": 4263 + }, + { + "epoch": 1.895111111111111, + "grad_norm": 2.016079902648926, + "learning_rate": 0.00012430604982206406, + "loss": 2.4023, + "step": 4264 + }, + { + "epoch": 1.8955555555555557, + "grad_norm": 1.6305691003799438, + "learning_rate": 0.00012428825622775802, + "loss": 1.8707, + "step": 4265 + }, + { + "epoch": 1.896, + "grad_norm": 1.8553330898284912, + "learning_rate": 0.00012427046263345198, + "loss": 2.0848, + "step": 4266 + }, + { + "epoch": 1.8964444444444446, + "grad_norm": 1.7080594301223755, + "learning_rate": 0.00012425266903914593, + "loss": 2.0636, + "step": 4267 + }, + { + "epoch": 1.8968888888888888, + 
"grad_norm": 1.9433776140213013, + "learning_rate": 0.00012423487544483986, + "loss": 2.1638, + "step": 4268 + }, + { + "epoch": 1.8973333333333333, + "grad_norm": 1.7864351272583008, + "learning_rate": 0.00012421708185053382, + "loss": 2.1109, + "step": 4269 + }, + { + "epoch": 1.8977777777777778, + "grad_norm": 1.667856216430664, + "learning_rate": 0.00012419928825622777, + "loss": 1.4533, + "step": 4270 + }, + { + "epoch": 1.8982222222222223, + "grad_norm": 1.8229117393493652, + "learning_rate": 0.0001241814946619217, + "loss": 2.093, + "step": 4271 + }, + { + "epoch": 1.8986666666666667, + "grad_norm": 2.1975040435791016, + "learning_rate": 0.00012416370106761566, + "loss": 2.6783, + "step": 4272 + }, + { + "epoch": 1.899111111111111, + "grad_norm": 1.8947041034698486, + "learning_rate": 0.00012414590747330962, + "loss": 2.4428, + "step": 4273 + }, + { + "epoch": 1.8995555555555557, + "grad_norm": 1.981955647468567, + "learning_rate": 0.00012412811387900357, + "loss": 1.9542, + "step": 4274 + }, + { + "epoch": 1.9, + "grad_norm": 2.057514190673828, + "learning_rate": 0.0001241103202846975, + "loss": 2.4705, + "step": 4275 + }, + { + "epoch": 1.9004444444444446, + "grad_norm": 1.9923932552337646, + "learning_rate": 0.00012409252669039146, + "loss": 2.1799, + "step": 4276 + }, + { + "epoch": 1.9008888888888889, + "grad_norm": 1.9740736484527588, + "learning_rate": 0.00012407473309608542, + "loss": 2.401, + "step": 4277 + }, + { + "epoch": 1.9013333333333333, + "grad_norm": 0.5957589149475098, + "learning_rate": 0.00012405693950177937, + "loss": 0.0373, + "step": 4278 + }, + { + "epoch": 1.9017777777777778, + "grad_norm": 1.835414171218872, + "learning_rate": 0.00012403914590747333, + "loss": 2.2666, + "step": 4279 + }, + { + "epoch": 1.9022222222222223, + "grad_norm": 1.9342777729034424, + "learning_rate": 0.00012402135231316728, + "loss": 2.1608, + "step": 4280 + }, + { + "epoch": 1.9026666666666667, + "grad_norm": 1.760406732559204, + "learning_rate": 0.00012400355871886121, + "loss": 1.7764, + "step": 4281 + }, + { + "epoch": 1.903111111111111, + "grad_norm": 2.0876519680023193, + "learning_rate": 0.00012398576512455517, + "loss": 2.6303, + "step": 4282 + }, + { + "epoch": 1.9035555555555557, + "grad_norm": 2.0983736515045166, + "learning_rate": 0.00012396797153024913, + "loss": 2.28, + "step": 4283 + }, + { + "epoch": 1.904, + "grad_norm": 2.1319432258605957, + "learning_rate": 0.00012395017793594306, + "loss": 2.5054, + "step": 4284 + }, + { + "epoch": 1.9044444444444446, + "grad_norm": 1.9509506225585938, + "learning_rate": 0.000123932384341637, + "loss": 1.8634, + "step": 4285 + }, + { + "epoch": 1.9048888888888889, + "grad_norm": 2.6779415607452393, + "learning_rate": 0.00012391459074733097, + "loss": 2.8077, + "step": 4286 + }, + { + "epoch": 1.9053333333333333, + "grad_norm": 1.5328880548477173, + "learning_rate": 0.00012389679715302493, + "loss": 1.1721, + "step": 4287 + }, + { + "epoch": 1.9057777777777778, + "grad_norm": 2.5803184509277344, + "learning_rate": 0.00012387900355871886, + "loss": 2.3973, + "step": 4288 + }, + { + "epoch": 1.9062222222222223, + "grad_norm": 1.985290765762329, + "learning_rate": 0.0001238612099644128, + "loss": 2.2365, + "step": 4289 + }, + { + "epoch": 1.9066666666666667, + "grad_norm": 2.207040309906006, + "learning_rate": 0.00012384341637010677, + "loss": 1.5546, + "step": 4290 + }, + { + "epoch": 1.907111111111111, + "grad_norm": 2.6116459369659424, + "learning_rate": 0.00012382562277580072, + "loss": 2.4088, + "step": 4291 + }, + { + "epoch": 
1.9075555555555557, + "grad_norm": 1.6348541975021362, + "learning_rate": 0.00012380782918149468, + "loss": 1.5725, + "step": 4292 + }, + { + "epoch": 1.908, + "grad_norm": 2.5747082233428955, + "learning_rate": 0.0001237900355871886, + "loss": 2.5069, + "step": 4293 + }, + { + "epoch": 1.9084444444444446, + "grad_norm": 2.236910104751587, + "learning_rate": 0.00012377224199288257, + "loss": 2.2303, + "step": 4294 + }, + { + "epoch": 1.9088888888888889, + "grad_norm": 2.4665608406066895, + "learning_rate": 0.00012375444839857652, + "loss": 2.0439, + "step": 4295 + }, + { + "epoch": 1.9093333333333333, + "grad_norm": 2.041987895965576, + "learning_rate": 0.00012373665480427045, + "loss": 2.1251, + "step": 4296 + }, + { + "epoch": 1.9097777777777778, + "grad_norm": 2.454289436340332, + "learning_rate": 0.0001237188612099644, + "loss": 2.5853, + "step": 4297 + }, + { + "epoch": 1.9102222222222223, + "grad_norm": 2.1545984745025635, + "learning_rate": 0.00012370106761565837, + "loss": 2.0791, + "step": 4298 + }, + { + "epoch": 1.9106666666666667, + "grad_norm": 2.7019107341766357, + "learning_rate": 0.00012368327402135232, + "loss": 2.4134, + "step": 4299 + }, + { + "epoch": 1.911111111111111, + "grad_norm": 3.2024686336517334, + "learning_rate": 0.00012366548042704625, + "loss": 1.6449, + "step": 4300 + }, + { + "epoch": 1.9115555555555557, + "grad_norm": 1.3786087036132812, + "learning_rate": 0.0001236476868327402, + "loss": 2.5061, + "step": 4301 + }, + { + "epoch": 1.912, + "grad_norm": 1.5719784498214722, + "learning_rate": 0.00012362989323843417, + "loss": 2.3814, + "step": 4302 + }, + { + "epoch": 1.9124444444444444, + "grad_norm": 1.4920408725738525, + "learning_rate": 0.00012361209964412812, + "loss": 2.0447, + "step": 4303 + }, + { + "epoch": 1.9128888888888889, + "grad_norm": 1.747694730758667, + "learning_rate": 0.00012359430604982208, + "loss": 2.8776, + "step": 4304 + }, + { + "epoch": 1.9133333333333333, + "grad_norm": 1.8610203266143799, + "learning_rate": 0.00012357651245551603, + "loss": 2.3367, + "step": 4305 + }, + { + "epoch": 1.9137777777777778, + "grad_norm": 1.5181238651275635, + "learning_rate": 0.00012355871886120996, + "loss": 1.9326, + "step": 4306 + }, + { + "epoch": 1.9142222222222223, + "grad_norm": 1.707848072052002, + "learning_rate": 0.00012354092526690392, + "loss": 2.3507, + "step": 4307 + }, + { + "epoch": 1.9146666666666667, + "grad_norm": 1.7067019939422607, + "learning_rate": 0.00012352313167259788, + "loss": 2.0366, + "step": 4308 + }, + { + "epoch": 1.915111111111111, + "grad_norm": 1.7387741804122925, + "learning_rate": 0.0001235053380782918, + "loss": 2.0162, + "step": 4309 + }, + { + "epoch": 1.9155555555555557, + "grad_norm": 1.7039339542388916, + "learning_rate": 0.00012348754448398576, + "loss": 2.1688, + "step": 4310 + }, + { + "epoch": 1.916, + "grad_norm": 1.6669498682022095, + "learning_rate": 0.00012346975088967972, + "loss": 2.2707, + "step": 4311 + }, + { + "epoch": 1.9164444444444444, + "grad_norm": 1.6338768005371094, + "learning_rate": 0.00012345195729537368, + "loss": 2.0841, + "step": 4312 + }, + { + "epoch": 1.9168888888888889, + "grad_norm": 1.6878036260604858, + "learning_rate": 0.0001234341637010676, + "loss": 2.2946, + "step": 4313 + }, + { + "epoch": 1.9173333333333333, + "grad_norm": 1.7214369773864746, + "learning_rate": 0.00012341637010676156, + "loss": 1.9289, + "step": 4314 + }, + { + "epoch": 1.9177777777777778, + "grad_norm": 1.6168575286865234, + "learning_rate": 0.00012339857651245552, + "loss": 1.9991, + "step": 4315 + 
}, + { + "epoch": 1.9182222222222223, + "grad_norm": 1.788225531578064, + "learning_rate": 0.00012338078291814947, + "loss": 2.3562, + "step": 4316 + }, + { + "epoch": 1.9186666666666667, + "grad_norm": 1.5336823463439941, + "learning_rate": 0.00012336298932384343, + "loss": 1.7089, + "step": 4317 + }, + { + "epoch": 1.919111111111111, + "grad_norm": 1.8361992835998535, + "learning_rate": 0.0001233451957295374, + "loss": 2.0708, + "step": 4318 + }, + { + "epoch": 1.9195555555555557, + "grad_norm": 1.857429027557373, + "learning_rate": 0.00012332740213523132, + "loss": 1.7238, + "step": 4319 + }, + { + "epoch": 1.92, + "grad_norm": 1.6928489208221436, + "learning_rate": 0.00012330960854092527, + "loss": 2.0781, + "step": 4320 + }, + { + "epoch": 1.9204444444444444, + "grad_norm": 1.6085439920425415, + "learning_rate": 0.00012329181494661923, + "loss": 1.37, + "step": 4321 + }, + { + "epoch": 1.9208888888888889, + "grad_norm": 1.9759423732757568, + "learning_rate": 0.00012327402135231316, + "loss": 2.561, + "step": 4322 + }, + { + "epoch": 1.9213333333333333, + "grad_norm": 1.8976612091064453, + "learning_rate": 0.00012325622775800712, + "loss": 2.1077, + "step": 4323 + }, + { + "epoch": 1.9217777777777778, + "grad_norm": 2.0411934852600098, + "learning_rate": 0.00012323843416370107, + "loss": 2.4446, + "step": 4324 + }, + { + "epoch": 1.9222222222222223, + "grad_norm": 2.429276704788208, + "learning_rate": 0.00012322064056939503, + "loss": 2.5635, + "step": 4325 + }, + { + "epoch": 1.9226666666666667, + "grad_norm": 2.421410322189331, + "learning_rate": 0.00012320284697508896, + "loss": 2.9828, + "step": 4326 + }, + { + "epoch": 1.923111111111111, + "grad_norm": 2.055605173110962, + "learning_rate": 0.00012318505338078291, + "loss": 2.5132, + "step": 4327 + }, + { + "epoch": 1.9235555555555557, + "grad_norm": 1.587828278541565, + "learning_rate": 0.00012316725978647687, + "loss": 1.3512, + "step": 4328 + }, + { + "epoch": 1.924, + "grad_norm": 2.128390312194824, + "learning_rate": 0.00012314946619217083, + "loss": 2.1455, + "step": 4329 + }, + { + "epoch": 1.9244444444444444, + "grad_norm": 1.7967475652694702, + "learning_rate": 0.00012313167259786478, + "loss": 0.8184, + "step": 4330 + }, + { + "epoch": 1.9248888888888889, + "grad_norm": 2.1603779792785645, + "learning_rate": 0.00012311387900355874, + "loss": 2.2261, + "step": 4331 + }, + { + "epoch": 1.9253333333333333, + "grad_norm": 2.0636863708496094, + "learning_rate": 0.00012309608540925267, + "loss": 2.0121, + "step": 4332 + }, + { + "epoch": 1.9257777777777778, + "grad_norm": 2.0725672245025635, + "learning_rate": 0.00012307829181494663, + "loss": 2.276, + "step": 4333 + }, + { + "epoch": 1.926222222222222, + "grad_norm": 2.1373484134674072, + "learning_rate": 0.00012306049822064058, + "loss": 2.3134, + "step": 4334 + }, + { + "epoch": 1.9266666666666667, + "grad_norm": 1.7675268650054932, + "learning_rate": 0.0001230427046263345, + "loss": 1.5304, + "step": 4335 + }, + { + "epoch": 1.927111111111111, + "grad_norm": 1.9493167400360107, + "learning_rate": 0.00012302491103202847, + "loss": 1.8928, + "step": 4336 + }, + { + "epoch": 1.9275555555555557, + "grad_norm": 2.15889835357666, + "learning_rate": 0.00012300711743772243, + "loss": 2.0092, + "step": 4337 + }, + { + "epoch": 1.928, + "grad_norm": 2.0056865215301514, + "learning_rate": 0.00012298932384341638, + "loss": 2.3798, + "step": 4338 + }, + { + "epoch": 1.9284444444444444, + "grad_norm": 2.266223907470703, + "learning_rate": 0.0001229715302491103, + "loss": 2.3884, + 
"step": 4339 + }, + { + "epoch": 1.9288888888888889, + "grad_norm": 2.173867702484131, + "learning_rate": 0.00012295373665480427, + "loss": 2.5138, + "step": 4340 + }, + { + "epoch": 1.9293333333333333, + "grad_norm": 2.383648633956909, + "learning_rate": 0.00012293594306049822, + "loss": 2.4544, + "step": 4341 + }, + { + "epoch": 1.9297777777777778, + "grad_norm": 1.9824879169464111, + "learning_rate": 0.00012291814946619218, + "loss": 1.7544, + "step": 4342 + }, + { + "epoch": 1.930222222222222, + "grad_norm": 2.24977707862854, + "learning_rate": 0.00012290035587188614, + "loss": 2.9336, + "step": 4343 + }, + { + "epoch": 1.9306666666666668, + "grad_norm": 2.6044070720672607, + "learning_rate": 0.0001228825622775801, + "loss": 2.0662, + "step": 4344 + }, + { + "epoch": 1.931111111111111, + "grad_norm": 2.6351845264434814, + "learning_rate": 0.00012286476868327402, + "loss": 2.554, + "step": 4345 + }, + { + "epoch": 1.9315555555555557, + "grad_norm": 2.308326005935669, + "learning_rate": 0.00012284697508896798, + "loss": 2.0536, + "step": 4346 + }, + { + "epoch": 1.932, + "grad_norm": 2.2656610012054443, + "learning_rate": 0.00012282918149466194, + "loss": 2.3636, + "step": 4347 + }, + { + "epoch": 1.9324444444444444, + "grad_norm": 2.7390570640563965, + "learning_rate": 0.00012281138790035587, + "loss": 2.2897, + "step": 4348 + }, + { + "epoch": 1.9328888888888889, + "grad_norm": 2.2677807807922363, + "learning_rate": 0.00012279359430604982, + "loss": 2.1377, + "step": 4349 + }, + { + "epoch": 1.9333333333333333, + "grad_norm": 3.099202871322632, + "learning_rate": 0.00012277580071174378, + "loss": 2.5962, + "step": 4350 + }, + { + "epoch": 1.9337777777777778, + "grad_norm": 0.9823408126831055, + "learning_rate": 0.00012275800711743774, + "loss": 1.15, + "step": 4351 + }, + { + "epoch": 1.934222222222222, + "grad_norm": 1.5499428510665894, + "learning_rate": 0.00012274021352313166, + "loss": 1.9964, + "step": 4352 + }, + { + "epoch": 1.9346666666666668, + "grad_norm": 1.5423551797866821, + "learning_rate": 0.00012272241992882562, + "loss": 2.1596, + "step": 4353 + }, + { + "epoch": 1.935111111111111, + "grad_norm": 1.57822847366333, + "learning_rate": 0.00012270462633451958, + "loss": 2.2386, + "step": 4354 + }, + { + "epoch": 1.9355555555555557, + "grad_norm": 1.5377904176712036, + "learning_rate": 0.00012268683274021353, + "loss": 2.0222, + "step": 4355 + }, + { + "epoch": 1.936, + "grad_norm": 1.53287935256958, + "learning_rate": 0.0001226690391459075, + "loss": 1.9897, + "step": 4356 + }, + { + "epoch": 1.9364444444444444, + "grad_norm": 1.51986563205719, + "learning_rate": 0.00012265124555160145, + "loss": 2.0826, + "step": 4357 + }, + { + "epoch": 1.9368888888888889, + "grad_norm": 1.7066446542739868, + "learning_rate": 0.00012263345195729538, + "loss": 1.8589, + "step": 4358 + }, + { + "epoch": 1.9373333333333334, + "grad_norm": 1.175493836402893, + "learning_rate": 0.00012261565836298933, + "loss": 1.0915, + "step": 4359 + }, + { + "epoch": 1.9377777777777778, + "grad_norm": 1.7773221731185913, + "learning_rate": 0.0001225978647686833, + "loss": 1.8631, + "step": 4360 + }, + { + "epoch": 1.938222222222222, + "grad_norm": 2.198300361633301, + "learning_rate": 0.00012258007117437722, + "loss": 2.6825, + "step": 4361 + }, + { + "epoch": 1.9386666666666668, + "grad_norm": 1.8990510702133179, + "learning_rate": 0.00012256227758007118, + "loss": 2.8493, + "step": 4362 + }, + { + "epoch": 1.939111111111111, + "grad_norm": 1.874263882637024, + "learning_rate": 0.00012254448398576513, + 
"loss": 2.588, + "step": 4363 + }, + { + "epoch": 1.9395555555555557, + "grad_norm": 1.9859611988067627, + "learning_rate": 0.0001225266903914591, + "loss": 2.3982, + "step": 4364 + }, + { + "epoch": 1.94, + "grad_norm": 1.6862070560455322, + "learning_rate": 0.00012250889679715302, + "loss": 2.0444, + "step": 4365 + }, + { + "epoch": 1.9404444444444444, + "grad_norm": 1.7006193399429321, + "learning_rate": 0.00012249110320284697, + "loss": 1.6788, + "step": 4366 + }, + { + "epoch": 1.9408888888888889, + "grad_norm": 1.6702203750610352, + "learning_rate": 0.00012247330960854093, + "loss": 2.0756, + "step": 4367 + }, + { + "epoch": 1.9413333333333334, + "grad_norm": 2.062119483947754, + "learning_rate": 0.0001224555160142349, + "loss": 3.1842, + "step": 4368 + }, + { + "epoch": 1.9417777777777778, + "grad_norm": 1.8628273010253906, + "learning_rate": 0.00012243772241992884, + "loss": 2.0465, + "step": 4369 + }, + { + "epoch": 1.942222222222222, + "grad_norm": 1.9983272552490234, + "learning_rate": 0.0001224199288256228, + "loss": 1.8614, + "step": 4370 + }, + { + "epoch": 1.9426666666666668, + "grad_norm": 1.8302124738693237, + "learning_rate": 0.00012240213523131673, + "loss": 2.2356, + "step": 4371 + }, + { + "epoch": 1.943111111111111, + "grad_norm": 2.0615060329437256, + "learning_rate": 0.00012238434163701069, + "loss": 2.6518, + "step": 4372 + }, + { + "epoch": 1.9435555555555557, + "grad_norm": 1.957165002822876, + "learning_rate": 0.00012236654804270464, + "loss": 2.5132, + "step": 4373 + }, + { + "epoch": 1.944, + "grad_norm": 2.0981383323669434, + "learning_rate": 0.00012234875444839857, + "loss": 2.5369, + "step": 4374 + }, + { + "epoch": 1.9444444444444444, + "grad_norm": 2.0047876834869385, + "learning_rate": 0.00012233096085409253, + "loss": 1.8372, + "step": 4375 + }, + { + "epoch": 1.944888888888889, + "grad_norm": 1.8745818138122559, + "learning_rate": 0.00012231316725978649, + "loss": 2.0336, + "step": 4376 + }, + { + "epoch": 1.9453333333333334, + "grad_norm": 1.9470521211624146, + "learning_rate": 0.00012229537366548044, + "loss": 2.0855, + "step": 4377 + }, + { + "epoch": 1.9457777777777778, + "grad_norm": 1.992088794708252, + "learning_rate": 0.00012227758007117437, + "loss": 2.2125, + "step": 4378 + }, + { + "epoch": 1.946222222222222, + "grad_norm": 1.8258486986160278, + "learning_rate": 0.00012225978647686833, + "loss": 1.8891, + "step": 4379 + }, + { + "epoch": 1.9466666666666668, + "grad_norm": 1.850938081741333, + "learning_rate": 0.00012224199288256228, + "loss": 2.3489, + "step": 4380 + }, + { + "epoch": 1.947111111111111, + "grad_norm": 2.0930421352386475, + "learning_rate": 0.00012222419928825624, + "loss": 2.173, + "step": 4381 + }, + { + "epoch": 1.9475555555555557, + "grad_norm": 1.875168800354004, + "learning_rate": 0.0001222064056939502, + "loss": 1.7742, + "step": 4382 + }, + { + "epoch": 1.948, + "grad_norm": 2.2607669830322266, + "learning_rate": 0.00012218861209964413, + "loss": 2.6387, + "step": 4383 + }, + { + "epoch": 1.9484444444444444, + "grad_norm": 1.710083246231079, + "learning_rate": 0.00012217081850533808, + "loss": 2.132, + "step": 4384 + }, + { + "epoch": 1.948888888888889, + "grad_norm": 2.2312233448028564, + "learning_rate": 0.00012215302491103204, + "loss": 1.6657, + "step": 4385 + }, + { + "epoch": 1.9493333333333334, + "grad_norm": 2.3804073333740234, + "learning_rate": 0.000122135231316726, + "loss": 2.277, + "step": 4386 + }, + { + "epoch": 1.9497777777777778, + "grad_norm": 2.0216658115386963, + "learning_rate": 
0.00012211743772241993, + "loss": 2.4888, + "step": 4387 + }, + { + "epoch": 1.950222222222222, + "grad_norm": 1.6631717681884766, + "learning_rate": 0.00012209964412811388, + "loss": 1.218, + "step": 4388 + }, + { + "epoch": 1.9506666666666668, + "grad_norm": 2.17562198638916, + "learning_rate": 0.00012208185053380784, + "loss": 1.9006, + "step": 4389 + }, + { + "epoch": 1.951111111111111, + "grad_norm": 1.9813545942306519, + "learning_rate": 0.00012206405693950178, + "loss": 1.9596, + "step": 4390 + }, + { + "epoch": 1.9515555555555557, + "grad_norm": 1.9598819017410278, + "learning_rate": 0.00012204626334519574, + "loss": 2.5526, + "step": 4391 + }, + { + "epoch": 1.952, + "grad_norm": 2.731766700744629, + "learning_rate": 0.00012202846975088968, + "loss": 2.2122, + "step": 4392 + }, + { + "epoch": 1.9524444444444444, + "grad_norm": 2.2050271034240723, + "learning_rate": 0.00012201067615658364, + "loss": 2.17, + "step": 4393 + }, + { + "epoch": 1.952888888888889, + "grad_norm": 2.014303684234619, + "learning_rate": 0.0001219928825622776, + "loss": 2.0293, + "step": 4394 + }, + { + "epoch": 1.9533333333333334, + "grad_norm": 2.2613487243652344, + "learning_rate": 0.00012197508896797154, + "loss": 2.4478, + "step": 4395 + }, + { + "epoch": 1.9537777777777778, + "grad_norm": 2.023850917816162, + "learning_rate": 0.00012195729537366549, + "loss": 1.7262, + "step": 4396 + }, + { + "epoch": 1.954222222222222, + "grad_norm": 1.9727839231491089, + "learning_rate": 0.00012193950177935945, + "loss": 2.1561, + "step": 4397 + }, + { + "epoch": 1.9546666666666668, + "grad_norm": 2.2552168369293213, + "learning_rate": 0.00012192170818505339, + "loss": 2.3178, + "step": 4398 + }, + { + "epoch": 1.955111111111111, + "grad_norm": 2.353135108947754, + "learning_rate": 0.00012190391459074735, + "loss": 2.2246, + "step": 4399 + }, + { + "epoch": 1.9555555555555557, + "grad_norm": 2.8717939853668213, + "learning_rate": 0.00012188612099644128, + "loss": 2.3769, + "step": 4400 + }, + { + "epoch": 1.956, + "grad_norm": 1.117507815361023, + "learning_rate": 0.00012186832740213523, + "loss": 1.2481, + "step": 4401 + }, + { + "epoch": 1.9564444444444444, + "grad_norm": 1.0877759456634521, + "learning_rate": 0.00012185053380782918, + "loss": 0.9556, + "step": 4402 + }, + { + "epoch": 1.956888888888889, + "grad_norm": 1.1263483762741089, + "learning_rate": 0.00012183274021352313, + "loss": 1.1806, + "step": 4403 + }, + { + "epoch": 1.9573333333333334, + "grad_norm": 1.5997364521026611, + "learning_rate": 0.00012181494661921709, + "loss": 2.2267, + "step": 4404 + }, + { + "epoch": 1.9577777777777778, + "grad_norm": 1.4710240364074707, + "learning_rate": 0.00012179715302491103, + "loss": 2.4633, + "step": 4405 + }, + { + "epoch": 1.958222222222222, + "grad_norm": 1.5954550504684448, + "learning_rate": 0.00012177935943060499, + "loss": 2.3169, + "step": 4406 + }, + { + "epoch": 1.9586666666666668, + "grad_norm": 1.688092589378357, + "learning_rate": 0.00012176156583629895, + "loss": 2.6058, + "step": 4407 + }, + { + "epoch": 1.959111111111111, + "grad_norm": 3.413558006286621, + "learning_rate": 0.00012174377224199289, + "loss": 1.9166, + "step": 4408 + }, + { + "epoch": 1.9595555555555557, + "grad_norm": 1.7245512008666992, + "learning_rate": 0.00012172597864768685, + "loss": 2.4489, + "step": 4409 + }, + { + "epoch": 1.96, + "grad_norm": 1.8946932554244995, + "learning_rate": 0.0001217081850533808, + "loss": 2.1216, + "step": 4410 + }, + { + "epoch": 1.9604444444444444, + "grad_norm": 1.766736626625061, + 
"learning_rate": 0.00012169039145907475, + "loss": 2.4328, + "step": 4411 + }, + { + "epoch": 1.960888888888889, + "grad_norm": 1.7106037139892578, + "learning_rate": 0.00012167259786476868, + "loss": 2.0669, + "step": 4412 + }, + { + "epoch": 1.9613333333333334, + "grad_norm": 1.758056879043579, + "learning_rate": 0.00012165480427046263, + "loss": 2.306, + "step": 4413 + }, + { + "epoch": 1.9617777777777778, + "grad_norm": 1.796230673789978, + "learning_rate": 0.00012163701067615659, + "loss": 2.3531, + "step": 4414 + }, + { + "epoch": 1.962222222222222, + "grad_norm": 1.641489863395691, + "learning_rate": 0.00012161921708185053, + "loss": 1.9206, + "step": 4415 + }, + { + "epoch": 1.9626666666666668, + "grad_norm": 1.558814287185669, + "learning_rate": 0.00012160142348754449, + "loss": 1.7989, + "step": 4416 + }, + { + "epoch": 1.963111111111111, + "grad_norm": 1.724256992340088, + "learning_rate": 0.00012158362989323844, + "loss": 2.0509, + "step": 4417 + }, + { + "epoch": 1.9635555555555557, + "grad_norm": 1.6848506927490234, + "learning_rate": 0.00012156583629893239, + "loss": 2.4329, + "step": 4418 + }, + { + "epoch": 1.964, + "grad_norm": 1.874718427658081, + "learning_rate": 0.00012154804270462634, + "loss": 2.2301, + "step": 4419 + }, + { + "epoch": 1.9644444444444444, + "grad_norm": 1.9206674098968506, + "learning_rate": 0.0001215302491103203, + "loss": 2.2543, + "step": 4420 + }, + { + "epoch": 1.964888888888889, + "grad_norm": 1.9313912391662598, + "learning_rate": 0.00012151245551601424, + "loss": 2.0722, + "step": 4421 + }, + { + "epoch": 1.9653333333333334, + "grad_norm": 2.0314974784851074, + "learning_rate": 0.0001214946619217082, + "loss": 2.2565, + "step": 4422 + }, + { + "epoch": 1.9657777777777778, + "grad_norm": 1.886389970779419, + "learning_rate": 0.00012147686832740214, + "loss": 2.3392, + "step": 4423 + }, + { + "epoch": 1.966222222222222, + "grad_norm": 2.023409843444824, + "learning_rate": 0.0001214590747330961, + "loss": 2.5448, + "step": 4424 + }, + { + "epoch": 1.9666666666666668, + "grad_norm": 1.7405117750167847, + "learning_rate": 0.00012144128113879003, + "loss": 1.7628, + "step": 4425 + }, + { + "epoch": 1.967111111111111, + "grad_norm": 1.9717170000076294, + "learning_rate": 0.00012142348754448398, + "loss": 2.027, + "step": 4426 + }, + { + "epoch": 1.9675555555555555, + "grad_norm": 1.9732880592346191, + "learning_rate": 0.00012140569395017794, + "loss": 2.3524, + "step": 4427 + }, + { + "epoch": 1.968, + "grad_norm": 1.9365227222442627, + "learning_rate": 0.00012138790035587188, + "loss": 1.9627, + "step": 4428 + }, + { + "epoch": 1.9684444444444444, + "grad_norm": 2.359199285507202, + "learning_rate": 0.00012137010676156584, + "loss": 2.1723, + "step": 4429 + }, + { + "epoch": 1.968888888888889, + "grad_norm": 2.102796792984009, + "learning_rate": 0.00012135231316725978, + "loss": 2.388, + "step": 4430 + }, + { + "epoch": 1.9693333333333334, + "grad_norm": 2.1342241764068604, + "learning_rate": 0.00012133451957295374, + "loss": 2.6232, + "step": 4431 + }, + { + "epoch": 1.9697777777777778, + "grad_norm": 2.0955238342285156, + "learning_rate": 0.0001213167259786477, + "loss": 2.0068, + "step": 4432 + }, + { + "epoch": 1.970222222222222, + "grad_norm": 2.0554394721984863, + "learning_rate": 0.00012129893238434164, + "loss": 2.0677, + "step": 4433 + }, + { + "epoch": 1.9706666666666668, + "grad_norm": 2.020298957824707, + "learning_rate": 0.0001212811387900356, + "loss": 1.9219, + "step": 4434 + }, + { + "epoch": 1.971111111111111, + "grad_norm": 
1.943292498588562, + "learning_rate": 0.00012126334519572955, + "loss": 2.1289, + "step": 4435 + }, + { + "epoch": 1.9715555555555555, + "grad_norm": 1.9721516370773315, + "learning_rate": 0.0001212455516014235, + "loss": 1.7805, + "step": 4436 + }, + { + "epoch": 1.972, + "grad_norm": 1.828384518623352, + "learning_rate": 0.00012122775800711745, + "loss": 1.8164, + "step": 4437 + }, + { + "epoch": 1.9724444444444444, + "grad_norm": 2.0130600929260254, + "learning_rate": 0.00012120996441281138, + "loss": 1.8864, + "step": 4438 + }, + { + "epoch": 1.972888888888889, + "grad_norm": 2.123643398284912, + "learning_rate": 0.00012119217081850534, + "loss": 1.9729, + "step": 4439 + }, + { + "epoch": 1.9733333333333334, + "grad_norm": 2.224832534790039, + "learning_rate": 0.00012117437722419928, + "loss": 2.3063, + "step": 4440 + }, + { + "epoch": 1.9737777777777779, + "grad_norm": 2.5760409832000732, + "learning_rate": 0.00012115658362989324, + "loss": 3.1176, + "step": 4441 + }, + { + "epoch": 1.974222222222222, + "grad_norm": 1.936394214630127, + "learning_rate": 0.0001211387900355872, + "loss": 1.9176, + "step": 4442 + }, + { + "epoch": 1.9746666666666668, + "grad_norm": 2.0854177474975586, + "learning_rate": 0.00012112099644128114, + "loss": 2.125, + "step": 4443 + }, + { + "epoch": 1.975111111111111, + "grad_norm": 2.2257535457611084, + "learning_rate": 0.00012110320284697509, + "loss": 2.1138, + "step": 4444 + }, + { + "epoch": 1.9755555555555555, + "grad_norm": 2.3724265098571777, + "learning_rate": 0.00012108540925266905, + "loss": 2.2497, + "step": 4445 + }, + { + "epoch": 1.976, + "grad_norm": 1.9573043584823608, + "learning_rate": 0.00012106761565836299, + "loss": 1.7342, + "step": 4446 + }, + { + "epoch": 1.9764444444444444, + "grad_norm": 2.2017226219177246, + "learning_rate": 0.00012104982206405695, + "loss": 2.292, + "step": 4447 + }, + { + "epoch": 1.976888888888889, + "grad_norm": 2.3729429244995117, + "learning_rate": 0.0001210320284697509, + "loss": 2.853, + "step": 4448 + }, + { + "epoch": 1.9773333333333334, + "grad_norm": 2.4285669326782227, + "learning_rate": 0.00012101423487544485, + "loss": 2.0607, + "step": 4449 + }, + { + "epoch": 1.9777777777777779, + "grad_norm": 2.3270487785339355, + "learning_rate": 0.0001209964412811388, + "loss": 1.2632, + "step": 4450 + }, + { + "epoch": 1.978222222222222, + "grad_norm": 1.3772460222244263, + "learning_rate": 0.00012097864768683273, + "loss": 2.7924, + "step": 4451 + }, + { + "epoch": 1.9786666666666668, + "grad_norm": 1.451092004776001, + "learning_rate": 0.00012096085409252669, + "loss": 2.6418, + "step": 4452 + }, + { + "epoch": 1.979111111111111, + "grad_norm": 1.4491418600082397, + "learning_rate": 0.00012094306049822063, + "loss": 2.5647, + "step": 4453 + }, + { + "epoch": 1.9795555555555555, + "grad_norm": 1.4295765161514282, + "learning_rate": 0.00012092526690391459, + "loss": 2.092, + "step": 4454 + }, + { + "epoch": 1.98, + "grad_norm": 1.1193150281906128, + "learning_rate": 0.00012090747330960855, + "loss": 0.9929, + "step": 4455 + }, + { + "epoch": 1.9804444444444445, + "grad_norm": 1.6379578113555908, + "learning_rate": 0.00012088967971530249, + "loss": 2.621, + "step": 4456 + }, + { + "epoch": 1.980888888888889, + "grad_norm": 1.7126623392105103, + "learning_rate": 0.00012087188612099645, + "loss": 1.7564, + "step": 4457 + }, + { + "epoch": 1.9813333333333332, + "grad_norm": 1.670942783355713, + "learning_rate": 0.0001208540925266904, + "loss": 2.1591, + "step": 4458 + }, + { + "epoch": 1.9817777777777779, + 
"grad_norm": 1.9480079412460327, + "learning_rate": 0.00012083629893238435, + "loss": 2.5495, + "step": 4459 + }, + { + "epoch": 1.982222222222222, + "grad_norm": 1.5117992162704468, + "learning_rate": 0.0001208185053380783, + "loss": 2.1216, + "step": 4460 + }, + { + "epoch": 1.9826666666666668, + "grad_norm": 1.5676404237747192, + "learning_rate": 0.00012080071174377226, + "loss": 2.2749, + "step": 4461 + }, + { + "epoch": 1.983111111111111, + "grad_norm": 1.228750467300415, + "learning_rate": 0.0001207829181494662, + "loss": 0.9559, + "step": 4462 + }, + { + "epoch": 1.9835555555555555, + "grad_norm": 1.8625293970108032, + "learning_rate": 0.00012076512455516016, + "loss": 2.1203, + "step": 4463 + }, + { + "epoch": 1.984, + "grad_norm": 1.5544917583465576, + "learning_rate": 0.00012074733096085409, + "loss": 2.1986, + "step": 4464 + }, + { + "epoch": 1.9844444444444445, + "grad_norm": 1.8329989910125732, + "learning_rate": 0.00012072953736654804, + "loss": 2.1676, + "step": 4465 + }, + { + "epoch": 1.984888888888889, + "grad_norm": 1.6483327150344849, + "learning_rate": 0.00012071174377224199, + "loss": 2.383, + "step": 4466 + }, + { + "epoch": 1.9853333333333332, + "grad_norm": 1.6952513456344604, + "learning_rate": 0.00012069395017793594, + "loss": 2.0415, + "step": 4467 + }, + { + "epoch": 1.9857777777777779, + "grad_norm": 1.7458889484405518, + "learning_rate": 0.0001206761565836299, + "loss": 2.0539, + "step": 4468 + }, + { + "epoch": 1.9862222222222221, + "grad_norm": 1.8751850128173828, + "learning_rate": 0.00012065836298932384, + "loss": 2.391, + "step": 4469 + }, + { + "epoch": 1.9866666666666668, + "grad_norm": 1.6705282926559448, + "learning_rate": 0.0001206405693950178, + "loss": 1.8026, + "step": 4470 + }, + { + "epoch": 1.987111111111111, + "grad_norm": 1.6303818225860596, + "learning_rate": 0.00012062277580071176, + "loss": 1.899, + "step": 4471 + }, + { + "epoch": 1.9875555555555555, + "grad_norm": 1.7543621063232422, + "learning_rate": 0.0001206049822064057, + "loss": 1.9756, + "step": 4472 + }, + { + "epoch": 1.988, + "grad_norm": 1.8945670127868652, + "learning_rate": 0.00012058718861209966, + "loss": 2.1054, + "step": 4473 + }, + { + "epoch": 1.9884444444444445, + "grad_norm": 1.76500403881073, + "learning_rate": 0.00012056939501779361, + "loss": 2.2563, + "step": 4474 + }, + { + "epoch": 1.988888888888889, + "grad_norm": 1.8483823537826538, + "learning_rate": 0.00012055160142348755, + "loss": 2.243, + "step": 4475 + }, + { + "epoch": 1.9893333333333332, + "grad_norm": 1.7314825057983398, + "learning_rate": 0.00012053380782918151, + "loss": 2.4037, + "step": 4476 + }, + { + "epoch": 1.9897777777777779, + "grad_norm": 1.545645833015442, + "learning_rate": 0.00012051601423487544, + "loss": 1.1406, + "step": 4477 + }, + { + "epoch": 1.9902222222222221, + "grad_norm": 1.576292634010315, + "learning_rate": 0.0001204982206405694, + "loss": 1.7195, + "step": 4478 + }, + { + "epoch": 1.9906666666666668, + "grad_norm": 1.6278116703033447, + "learning_rate": 0.00012048042704626334, + "loss": 1.8675, + "step": 4479 + }, + { + "epoch": 1.991111111111111, + "grad_norm": 1.6416962146759033, + "learning_rate": 0.0001204626334519573, + "loss": 1.9954, + "step": 4480 + }, + { + "epoch": 1.9915555555555555, + "grad_norm": 1.7250615358352661, + "learning_rate": 0.00012044483985765125, + "loss": 2.2059, + "step": 4481 + }, + { + "epoch": 1.992, + "grad_norm": 1.973854899406433, + "learning_rate": 0.0001204270462633452, + "loss": 2.4063, + "step": 4482 + }, + { + "epoch": 
1.9924444444444445, + "grad_norm": 2.003932476043701, + "learning_rate": 0.00012040925266903915, + "loss": 1.9116, + "step": 4483 + }, + { + "epoch": 1.992888888888889, + "grad_norm": 1.8206032514572144, + "learning_rate": 0.00012039145907473311, + "loss": 1.9458, + "step": 4484 + }, + { + "epoch": 1.9933333333333332, + "grad_norm": 2.3211958408355713, + "learning_rate": 0.00012037366548042705, + "loss": 2.565, + "step": 4485 + }, + { + "epoch": 1.9937777777777779, + "grad_norm": 2.1076741218566895, + "learning_rate": 0.00012035587188612101, + "loss": 2.1175, + "step": 4486 + }, + { + "epoch": 1.9942222222222221, + "grad_norm": 1.6778833866119385, + "learning_rate": 0.00012033807829181497, + "loss": 1.5964, + "step": 4487 + }, + { + "epoch": 1.9946666666666668, + "grad_norm": 2.298328399658203, + "learning_rate": 0.00012032028469750891, + "loss": 2.1779, + "step": 4488 + }, + { + "epoch": 1.995111111111111, + "grad_norm": 1.8983428478240967, + "learning_rate": 0.00012030249110320286, + "loss": 2.0109, + "step": 4489 + }, + { + "epoch": 1.9955555555555555, + "grad_norm": 1.9424333572387695, + "learning_rate": 0.0001202846975088968, + "loss": 1.6167, + "step": 4490 + }, + { + "epoch": 1.996, + "grad_norm": 1.9415746927261353, + "learning_rate": 0.00012026690391459075, + "loss": 1.7478, + "step": 4491 + }, + { + "epoch": 1.9964444444444445, + "grad_norm": 1.8782165050506592, + "learning_rate": 0.0001202491103202847, + "loss": 2.08, + "step": 4492 + }, + { + "epoch": 1.996888888888889, + "grad_norm": 2.077409267425537, + "learning_rate": 0.00012023131672597865, + "loss": 1.9588, + "step": 4493 + }, + { + "epoch": 1.9973333333333332, + "grad_norm": 2.3767199516296387, + "learning_rate": 0.0001202135231316726, + "loss": 2.4939, + "step": 4494 + }, + { + "epoch": 1.9977777777777779, + "grad_norm": 2.2348458766937256, + "learning_rate": 0.00012019572953736655, + "loss": 2.3313, + "step": 4495 + }, + { + "epoch": 1.9982222222222221, + "grad_norm": 2.509856700897217, + "learning_rate": 0.0001201779359430605, + "loss": 2.5201, + "step": 4496 + }, + { + "epoch": 1.9986666666666668, + "grad_norm": 2.5316574573516846, + "learning_rate": 0.00012016014234875446, + "loss": 2.3262, + "step": 4497 + }, + { + "epoch": 1.999111111111111, + "grad_norm": 2.4037058353424072, + "learning_rate": 0.0001201423487544484, + "loss": 1.8568, + "step": 4498 + }, + { + "epoch": 1.9995555555555555, + "grad_norm": 2.8537352085113525, + "learning_rate": 0.00012012455516014236, + "loss": 2.8726, + "step": 4499 + }, + { + "epoch": 2.0, + "grad_norm": 3.3729751110076904, + "learning_rate": 0.00012010676156583632, + "loss": 2.0103, + "step": 4500 + }, + { + "epoch": 2.0, + "eval_loss": 2.4265496730804443, + "eval_runtime": 47.2809, + "eval_samples_per_second": 10.575, + "eval_steps_per_second": 10.575, + "step": 4500 + }, + { + "epoch": 2.0004444444444442, + "grad_norm": 0.8853614926338196, + "learning_rate": 0.00012008896797153026, + "loss": 0.8435, + "step": 4501 + }, + { + "epoch": 2.000888888888889, + "grad_norm": 1.3319120407104492, + "learning_rate": 0.00012007117437722422, + "loss": 2.0388, + "step": 4502 + }, + { + "epoch": 2.001333333333333, + "grad_norm": 1.6479343175888062, + "learning_rate": 0.00012005338078291815, + "loss": 1.9234, + "step": 4503 + }, + { + "epoch": 2.001777777777778, + "grad_norm": 1.6244651079177856, + "learning_rate": 0.0001200355871886121, + "loss": 2.0868, + "step": 4504 + }, + { + "epoch": 2.002222222222222, + "grad_norm": 1.5395774841308594, + "learning_rate": 0.00012001779359430605, + "loss": 
2.1305, + "step": 4505 + }, + { + "epoch": 2.002666666666667, + "grad_norm": 1.7427664995193481, + "learning_rate": 0.00012, + "loss": 2.3715, + "step": 4506 + }, + { + "epoch": 2.003111111111111, + "grad_norm": 1.600896954536438, + "learning_rate": 0.00011998220640569396, + "loss": 1.9859, + "step": 4507 + }, + { + "epoch": 2.0035555555555558, + "grad_norm": 1.6297886371612549, + "learning_rate": 0.0001199644128113879, + "loss": 1.4048, + "step": 4508 + }, + { + "epoch": 2.004, + "grad_norm": 1.7642384767532349, + "learning_rate": 0.00011994661921708186, + "loss": 2.2935, + "step": 4509 + }, + { + "epoch": 2.0044444444444443, + "grad_norm": 1.5079247951507568, + "learning_rate": 0.00011992882562277582, + "loss": 1.7682, + "step": 4510 + }, + { + "epoch": 2.004888888888889, + "grad_norm": 1.8100050687789917, + "learning_rate": 0.00011991103202846976, + "loss": 1.9194, + "step": 4511 + }, + { + "epoch": 2.005333333333333, + "grad_norm": 1.9946025609970093, + "learning_rate": 0.00011989323843416371, + "loss": 1.9621, + "step": 4512 + }, + { + "epoch": 2.005777777777778, + "grad_norm": 1.6905927658081055, + "learning_rate": 0.00011987544483985766, + "loss": 2.0369, + "step": 4513 + }, + { + "epoch": 2.006222222222222, + "grad_norm": 1.6684443950653076, + "learning_rate": 0.00011985765124555161, + "loss": 1.8459, + "step": 4514 + }, + { + "epoch": 2.006666666666667, + "grad_norm": 2.0483903884887695, + "learning_rate": 0.00011983985765124557, + "loss": 1.9101, + "step": 4515 + }, + { + "epoch": 2.007111111111111, + "grad_norm": 2.2682251930236816, + "learning_rate": 0.0001198220640569395, + "loss": 1.5367, + "step": 4516 + }, + { + "epoch": 2.0075555555555558, + "grad_norm": 1.442750096321106, + "learning_rate": 0.00011980427046263346, + "loss": 0.6781, + "step": 4517 + }, + { + "epoch": 2.008, + "grad_norm": 2.0190680027008057, + "learning_rate": 0.0001197864768683274, + "loss": 1.9999, + "step": 4518 + }, + { + "epoch": 2.0084444444444443, + "grad_norm": 2.084582567214966, + "learning_rate": 0.00011976868327402136, + "loss": 2.1547, + "step": 4519 + }, + { + "epoch": 2.008888888888889, + "grad_norm": 2.291092872619629, + "learning_rate": 0.0001197508896797153, + "loss": 1.982, + "step": 4520 + }, + { + "epoch": 2.009333333333333, + "grad_norm": 2.4098877906799316, + "learning_rate": 0.00011973309608540926, + "loss": 1.9975, + "step": 4521 + }, + { + "epoch": 2.009777777777778, + "grad_norm": 2.174729585647583, + "learning_rate": 0.00011971530249110321, + "loss": 1.8608, + "step": 4522 + }, + { + "epoch": 2.010222222222222, + "grad_norm": 2.5682456493377686, + "learning_rate": 0.00011969750889679716, + "loss": 1.8649, + "step": 4523 + }, + { + "epoch": 2.010666666666667, + "grad_norm": 1.9961142539978027, + "learning_rate": 0.00011967971530249111, + "loss": 1.5078, + "step": 4524 + }, + { + "epoch": 2.011111111111111, + "grad_norm": 2.4178638458251953, + "learning_rate": 0.00011966192170818507, + "loss": 1.7468, + "step": 4525 + }, + { + "epoch": 2.0115555555555558, + "grad_norm": 2.30759334564209, + "learning_rate": 0.00011964412811387901, + "loss": 1.8143, + "step": 4526 + }, + { + "epoch": 2.012, + "grad_norm": 1.8575639724731445, + "learning_rate": 0.00011962633451957297, + "loss": 1.2505, + "step": 4527 + }, + { + "epoch": 2.0124444444444443, + "grad_norm": 2.4838311672210693, + "learning_rate": 0.0001196085409252669, + "loss": 1.9343, + "step": 4528 + }, + { + "epoch": 2.012888888888889, + "grad_norm": 2.2968297004699707, + "learning_rate": 0.00011959074733096085, + "loss": 1.8051, + 
"step": 4529 + }, + { + "epoch": 2.013333333333333, + "grad_norm": 2.502021074295044, + "learning_rate": 0.0001195729537366548, + "loss": 1.5853, + "step": 4530 + }, + { + "epoch": 2.013777777777778, + "grad_norm": 1.7160247564315796, + "learning_rate": 0.00011955516014234875, + "loss": 0.898, + "step": 4531 + }, + { + "epoch": 2.014222222222222, + "grad_norm": 2.109179973602295, + "learning_rate": 0.00011953736654804271, + "loss": 1.3351, + "step": 4532 + }, + { + "epoch": 2.014666666666667, + "grad_norm": 2.4689619541168213, + "learning_rate": 0.00011951957295373665, + "loss": 1.8757, + "step": 4533 + }, + { + "epoch": 2.015111111111111, + "grad_norm": 3.084038257598877, + "learning_rate": 0.00011950177935943061, + "loss": 2.2736, + "step": 4534 + }, + { + "epoch": 2.0155555555555558, + "grad_norm": 3.0082006454467773, + "learning_rate": 0.00011948398576512457, + "loss": 2.0318, + "step": 4535 + }, + { + "epoch": 2.016, + "grad_norm": 2.722201108932495, + "learning_rate": 0.00011946619217081851, + "loss": 1.7501, + "step": 4536 + }, + { + "epoch": 2.0164444444444443, + "grad_norm": 2.107433795928955, + "learning_rate": 0.00011944839857651246, + "loss": 1.4507, + "step": 4537 + }, + { + "epoch": 2.016888888888889, + "grad_norm": 2.8475327491760254, + "learning_rate": 0.00011943060498220642, + "loss": 1.8524, + "step": 4538 + }, + { + "epoch": 2.017333333333333, + "grad_norm": 2.276287078857422, + "learning_rate": 0.00011941281138790036, + "loss": 1.5575, + "step": 4539 + }, + { + "epoch": 2.017777777777778, + "grad_norm": 2.9798543453216553, + "learning_rate": 0.00011939501779359432, + "loss": 1.8226, + "step": 4540 + }, + { + "epoch": 2.018222222222222, + "grad_norm": 2.7004785537719727, + "learning_rate": 0.00011937722419928825, + "loss": 1.6149, + "step": 4541 + }, + { + "epoch": 2.018666666666667, + "grad_norm": 2.5577023029327393, + "learning_rate": 0.0001193594306049822, + "loss": 1.6913, + "step": 4542 + }, + { + "epoch": 2.019111111111111, + "grad_norm": 2.958678960800171, + "learning_rate": 0.00011934163701067615, + "loss": 1.9823, + "step": 4543 + }, + { + "epoch": 2.0195555555555558, + "grad_norm": 2.862009048461914, + "learning_rate": 0.0001193238434163701, + "loss": 2.1328, + "step": 4544 + }, + { + "epoch": 2.02, + "grad_norm": 2.533651113510132, + "learning_rate": 0.00011930604982206406, + "loss": 1.2886, + "step": 4545 + }, + { + "epoch": 2.0204444444444443, + "grad_norm": 2.7307307720184326, + "learning_rate": 0.000119288256227758, + "loss": 1.4595, + "step": 4546 + }, + { + "epoch": 2.020888888888889, + "grad_norm": 2.3221077919006348, + "learning_rate": 0.00011927046263345196, + "loss": 1.4627, + "step": 4547 + }, + { + "epoch": 2.021333333333333, + "grad_norm": 2.621258497238159, + "learning_rate": 0.00011925266903914592, + "loss": 1.9821, + "step": 4548 + }, + { + "epoch": 2.021777777777778, + "grad_norm": 3.0992743968963623, + "learning_rate": 0.00011923487544483986, + "loss": 1.9199, + "step": 4549 + }, + { + "epoch": 2.022222222222222, + "grad_norm": 2.962254524230957, + "learning_rate": 0.00011921708185053382, + "loss": 1.2242, + "step": 4550 + }, + { + "epoch": 2.022666666666667, + "grad_norm": 1.6041865348815918, + "learning_rate": 0.00011919928825622777, + "loss": 1.3209, + "step": 4551 + }, + { + "epoch": 2.023111111111111, + "grad_norm": 1.467262864112854, + "learning_rate": 0.00011918149466192172, + "loss": 1.234, + "step": 4552 + }, + { + "epoch": 2.0235555555555558, + "grad_norm": 1.7941569089889526, + "learning_rate": 0.00011916370106761567, + "loss": 
2.2728, + "step": 4553 + }, + { + "epoch": 2.024, + "grad_norm": 1.5521703958511353, + "learning_rate": 0.0001191459074733096, + "loss": 1.8525, + "step": 4554 + }, + { + "epoch": 2.0244444444444443, + "grad_norm": 1.894801378250122, + "learning_rate": 0.00011912811387900356, + "loss": 2.1562, + "step": 4555 + }, + { + "epoch": 2.024888888888889, + "grad_norm": 1.414631962776184, + "learning_rate": 0.0001191103202846975, + "loss": 0.9182, + "step": 4556 + }, + { + "epoch": 2.025333333333333, + "grad_norm": 2.034543752670288, + "learning_rate": 0.00011909252669039146, + "loss": 2.1913, + "step": 4557 + }, + { + "epoch": 2.025777777777778, + "grad_norm": 1.8976682424545288, + "learning_rate": 0.00011907473309608542, + "loss": 2.0936, + "step": 4558 + }, + { + "epoch": 2.026222222222222, + "grad_norm": 1.6398429870605469, + "learning_rate": 0.00011905693950177936, + "loss": 1.7521, + "step": 4559 + }, + { + "epoch": 2.026666666666667, + "grad_norm": 1.8975718021392822, + "learning_rate": 0.00011903914590747332, + "loss": 1.6575, + "step": 4560 + }, + { + "epoch": 2.027111111111111, + "grad_norm": 1.5204815864562988, + "learning_rate": 0.00011902135231316727, + "loss": 0.9841, + "step": 4561 + }, + { + "epoch": 2.0275555555555558, + "grad_norm": 1.710460901260376, + "learning_rate": 0.00011900355871886121, + "loss": 1.4803, + "step": 4562 + }, + { + "epoch": 2.028, + "grad_norm": 1.8635836839675903, + "learning_rate": 0.00011898576512455517, + "loss": 2.0793, + "step": 4563 + }, + { + "epoch": 2.0284444444444443, + "grad_norm": 1.871050238609314, + "learning_rate": 0.00011896797153024913, + "loss": 1.0645, + "step": 4564 + }, + { + "epoch": 2.028888888888889, + "grad_norm": 2.071890115737915, + "learning_rate": 0.00011895017793594307, + "loss": 1.9001, + "step": 4565 + }, + { + "epoch": 2.029333333333333, + "grad_norm": 2.0963134765625, + "learning_rate": 0.00011893238434163703, + "loss": 2.1881, + "step": 4566 + }, + { + "epoch": 2.029777777777778, + "grad_norm": 1.9067516326904297, + "learning_rate": 0.00011891459074733096, + "loss": 1.7577, + "step": 4567 + }, + { + "epoch": 2.030222222222222, + "grad_norm": 2.036006212234497, + "learning_rate": 0.00011889679715302491, + "loss": 2.1429, + "step": 4568 + }, + { + "epoch": 2.030666666666667, + "grad_norm": 2.4452297687530518, + "learning_rate": 0.00011887900355871886, + "loss": 2.1539, + "step": 4569 + }, + { + "epoch": 2.031111111111111, + "grad_norm": 2.6038591861724854, + "learning_rate": 0.00011886120996441281, + "loss": 2.0575, + "step": 4570 + }, + { + "epoch": 2.0315555555555553, + "grad_norm": 1.8687160015106201, + "learning_rate": 0.00011884341637010677, + "loss": 1.633, + "step": 4571 + }, + { + "epoch": 2.032, + "grad_norm": 2.6096248626708984, + "learning_rate": 0.00011882562277580071, + "loss": 1.9688, + "step": 4572 + }, + { + "epoch": 2.0324444444444443, + "grad_norm": 1.9733986854553223, + "learning_rate": 0.00011880782918149467, + "loss": 2.0018, + "step": 4573 + }, + { + "epoch": 2.032888888888889, + "grad_norm": 2.4421169757843018, + "learning_rate": 0.00011879003558718862, + "loss": 1.9457, + "step": 4574 + }, + { + "epoch": 2.033333333333333, + "grad_norm": 2.1608901023864746, + "learning_rate": 0.00011877224199288257, + "loss": 1.5918, + "step": 4575 + }, + { + "epoch": 2.033777777777778, + "grad_norm": 1.7584596872329712, + "learning_rate": 0.00011875444839857652, + "loss": 1.5401, + "step": 4576 + }, + { + "epoch": 2.034222222222222, + "grad_norm": 3.489712953567505, + "learning_rate": 0.00011873665480427048, + "loss": 
0.9426, + "step": 4577 + }, + { + "epoch": 2.034666666666667, + "grad_norm": 2.114403486251831, + "learning_rate": 0.00011871886120996442, + "loss": 1.8606, + "step": 4578 + }, + { + "epoch": 2.035111111111111, + "grad_norm": 2.2650794982910156, + "learning_rate": 0.00011870106761565838, + "loss": 1.6157, + "step": 4579 + }, + { + "epoch": 2.0355555555555553, + "grad_norm": 2.404672145843506, + "learning_rate": 0.00011868327402135231, + "loss": 1.9152, + "step": 4580 + }, + { + "epoch": 2.036, + "grad_norm": 2.237191677093506, + "learning_rate": 0.00011866548042704627, + "loss": 1.8395, + "step": 4581 + }, + { + "epoch": 2.0364444444444443, + "grad_norm": 2.234955072402954, + "learning_rate": 0.00011864768683274021, + "loss": 1.7004, + "step": 4582 + }, + { + "epoch": 2.036888888888889, + "grad_norm": 1.999866247177124, + "learning_rate": 0.00011862989323843417, + "loss": 1.33, + "step": 4583 + }, + { + "epoch": 2.037333333333333, + "grad_norm": 2.5568530559539795, + "learning_rate": 0.00011861209964412812, + "loss": 1.7804, + "step": 4584 + }, + { + "epoch": 2.037777777777778, + "grad_norm": 2.2619681358337402, + "learning_rate": 0.00011859430604982206, + "loss": 1.7402, + "step": 4585 + }, + { + "epoch": 2.038222222222222, + "grad_norm": 2.6120476722717285, + "learning_rate": 0.00011857651245551602, + "loss": 1.6572, + "step": 4586 + }, + { + "epoch": 2.038666666666667, + "grad_norm": 2.367854595184326, + "learning_rate": 0.00011855871886120998, + "loss": 1.3876, + "step": 4587 + }, + { + "epoch": 2.039111111111111, + "grad_norm": 2.1282546520233154, + "learning_rate": 0.00011854092526690392, + "loss": 1.5421, + "step": 4588 + }, + { + "epoch": 2.0395555555555553, + "grad_norm": 2.2529513835906982, + "learning_rate": 0.00011852313167259788, + "loss": 1.2083, + "step": 4589 + }, + { + "epoch": 2.04, + "grad_norm": 2.7248787879943848, + "learning_rate": 0.00011850533807829183, + "loss": 1.6816, + "step": 4590 + }, + { + "epoch": 2.0404444444444443, + "grad_norm": 3.0865466594696045, + "learning_rate": 0.00011848754448398578, + "loss": 1.5498, + "step": 4591 + }, + { + "epoch": 2.040888888888889, + "grad_norm": 3.441319465637207, + "learning_rate": 0.00011846975088967973, + "loss": 1.5423, + "step": 4592 + }, + { + "epoch": 2.041333333333333, + "grad_norm": 2.960602283477783, + "learning_rate": 0.00011845195729537366, + "loss": 1.7773, + "step": 4593 + }, + { + "epoch": 2.041777777777778, + "grad_norm": 3.269716501235962, + "learning_rate": 0.00011843416370106762, + "loss": 2.1755, + "step": 4594 + }, + { + "epoch": 2.042222222222222, + "grad_norm": 3.3861782550811768, + "learning_rate": 0.00011841637010676156, + "loss": 1.8699, + "step": 4595 + }, + { + "epoch": 2.042666666666667, + "grad_norm": 2.951418399810791, + "learning_rate": 0.00011839857651245552, + "loss": 1.9355, + "step": 4596 + }, + { + "epoch": 2.043111111111111, + "grad_norm": 2.9961771965026855, + "learning_rate": 0.00011838078291814948, + "loss": 2.1783, + "step": 4597 + }, + { + "epoch": 2.0435555555555553, + "grad_norm": 2.833582639694214, + "learning_rate": 0.00011836298932384342, + "loss": 1.8636, + "step": 4598 + }, + { + "epoch": 2.044, + "grad_norm": 3.0772078037261963, + "learning_rate": 0.00011834519572953737, + "loss": 1.4656, + "step": 4599 + }, + { + "epoch": 2.0444444444444443, + "grad_norm": 3.595390558242798, + "learning_rate": 0.00011832740213523133, + "loss": 1.5303, + "step": 4600 + }, + { + "epoch": 2.044888888888889, + "grad_norm": 1.3135249614715576, + "learning_rate": 0.00011830960854092527, + "loss": 
1.0165, + "step": 4601 + }, + { + "epoch": 2.0453333333333332, + "grad_norm": 1.4123620986938477, + "learning_rate": 0.00011829181494661923, + "loss": 1.1516, + "step": 4602 + }, + { + "epoch": 2.045777777777778, + "grad_norm": 2.0700747966766357, + "learning_rate": 0.00011827402135231317, + "loss": 1.506, + "step": 4603 + }, + { + "epoch": 2.046222222222222, + "grad_norm": 1.950496792793274, + "learning_rate": 0.00011825622775800713, + "loss": 1.8448, + "step": 4604 + }, + { + "epoch": 2.046666666666667, + "grad_norm": 2.0276780128479004, + "learning_rate": 0.00011823843416370109, + "loss": 1.9754, + "step": 4605 + }, + { + "epoch": 2.047111111111111, + "grad_norm": 2.1765434741973877, + "learning_rate": 0.00011822064056939502, + "loss": 2.5343, + "step": 4606 + }, + { + "epoch": 2.0475555555555554, + "grad_norm": 1.9355638027191162, + "learning_rate": 0.00011820284697508896, + "loss": 2.0598, + "step": 4607 + }, + { + "epoch": 2.048, + "grad_norm": 2.012378215789795, + "learning_rate": 0.00011818505338078292, + "loss": 1.7933, + "step": 4608 + }, + { + "epoch": 2.0484444444444443, + "grad_norm": 2.179774045944214, + "learning_rate": 0.00011816725978647687, + "loss": 2.1403, + "step": 4609 + }, + { + "epoch": 2.048888888888889, + "grad_norm": 2.4090864658355713, + "learning_rate": 0.00011814946619217081, + "loss": 1.648, + "step": 4610 + }, + { + "epoch": 2.0493333333333332, + "grad_norm": 2.0735795497894287, + "learning_rate": 0.00011813167259786477, + "loss": 1.7304, + "step": 4611 + }, + { + "epoch": 2.049777777777778, + "grad_norm": 2.167476177215576, + "learning_rate": 0.00011811387900355873, + "loss": 1.7997, + "step": 4612 + }, + { + "epoch": 2.050222222222222, + "grad_norm": 1.9912680387496948, + "learning_rate": 0.00011809608540925267, + "loss": 1.4525, + "step": 4613 + }, + { + "epoch": 2.050666666666667, + "grad_norm": 2.363731622695923, + "learning_rate": 0.00011807829181494663, + "loss": 2.1657, + "step": 4614 + }, + { + "epoch": 2.051111111111111, + "grad_norm": 1.7121641635894775, + "learning_rate": 0.00011806049822064058, + "loss": 1.3718, + "step": 4615 + }, + { + "epoch": 2.0515555555555554, + "grad_norm": 1.7621139287948608, + "learning_rate": 0.00011804270462633453, + "loss": 1.3004, + "step": 4616 + }, + { + "epoch": 2.052, + "grad_norm": 2.833408832550049, + "learning_rate": 0.00011802491103202848, + "loss": 1.7775, + "step": 4617 + }, + { + "epoch": 2.0524444444444443, + "grad_norm": 1.8854105472564697, + "learning_rate": 0.00011800711743772244, + "loss": 1.608, + "step": 4618 + }, + { + "epoch": 2.052888888888889, + "grad_norm": 2.169327974319458, + "learning_rate": 0.00011798932384341637, + "loss": 1.9171, + "step": 4619 + }, + { + "epoch": 2.0533333333333332, + "grad_norm": 2.554474115371704, + "learning_rate": 0.00011797153024911031, + "loss": 1.3938, + "step": 4620 + }, + { + "epoch": 2.053777777777778, + "grad_norm": 1.969563364982605, + "learning_rate": 0.00011795373665480427, + "loss": 1.7774, + "step": 4621 + }, + { + "epoch": 2.054222222222222, + "grad_norm": 1.854287028312683, + "learning_rate": 0.00011793594306049822, + "loss": 1.8198, + "step": 4622 + }, + { + "epoch": 2.054666666666667, + "grad_norm": 2.1573734283447266, + "learning_rate": 0.00011791814946619217, + "loss": 1.7923, + "step": 4623 + }, + { + "epoch": 2.055111111111111, + "grad_norm": 1.8452686071395874, + "learning_rate": 0.00011790035587188612, + "loss": 1.2323, + "step": 4624 + }, + { + "epoch": 2.0555555555555554, + "grad_norm": 2.0869951248168945, + "learning_rate": 
0.00011788256227758008, + "loss": 1.9431, + "step": 4625 + }, + { + "epoch": 2.056, + "grad_norm": 1.9938613176345825, + "learning_rate": 0.00011786476868327402, + "loss": 1.5775, + "step": 4626 + }, + { + "epoch": 2.0564444444444443, + "grad_norm": 2.0701723098754883, + "learning_rate": 0.00011784697508896798, + "loss": 1.465, + "step": 4627 + }, + { + "epoch": 2.056888888888889, + "grad_norm": 2.5349276065826416, + "learning_rate": 0.00011782918149466194, + "loss": 2.0339, + "step": 4628 + }, + { + "epoch": 2.0573333333333332, + "grad_norm": 1.9578278064727783, + "learning_rate": 0.00011781138790035588, + "loss": 1.8306, + "step": 4629 + }, + { + "epoch": 2.057777777777778, + "grad_norm": 2.2757606506347656, + "learning_rate": 0.00011779359430604984, + "loss": 1.5594, + "step": 4630 + }, + { + "epoch": 2.058222222222222, + "grad_norm": 2.5828166007995605, + "learning_rate": 0.00011777580071174379, + "loss": 2.4208, + "step": 4631 + }, + { + "epoch": 2.058666666666667, + "grad_norm": 2.063826084136963, + "learning_rate": 0.00011775800711743772, + "loss": 1.3296, + "step": 4632 + }, + { + "epoch": 2.059111111111111, + "grad_norm": 2.9016664028167725, + "learning_rate": 0.00011774021352313167, + "loss": 2.1661, + "step": 4633 + }, + { + "epoch": 2.0595555555555554, + "grad_norm": 2.3844025135040283, + "learning_rate": 0.00011772241992882562, + "loss": 1.6781, + "step": 4634 + }, + { + "epoch": 2.06, + "grad_norm": 1.9740389585494995, + "learning_rate": 0.00011770462633451958, + "loss": 1.4587, + "step": 4635 + }, + { + "epoch": 2.0604444444444443, + "grad_norm": 2.022944688796997, + "learning_rate": 0.00011768683274021352, + "loss": 1.5967, + "step": 4636 + }, + { + "epoch": 2.060888888888889, + "grad_norm": 2.8368794918060303, + "learning_rate": 0.00011766903914590748, + "loss": 1.9038, + "step": 4637 + }, + { + "epoch": 2.0613333333333332, + "grad_norm": 2.7894749641418457, + "learning_rate": 0.00011765124555160143, + "loss": 1.5072, + "step": 4638 + }, + { + "epoch": 2.061777777777778, + "grad_norm": 2.564565896987915, + "learning_rate": 0.00011763345195729538, + "loss": 1.4213, + "step": 4639 + }, + { + "epoch": 2.062222222222222, + "grad_norm": 2.5650315284729004, + "learning_rate": 0.00011761565836298933, + "loss": 1.6088, + "step": 4640 + }, + { + "epoch": 2.062666666666667, + "grad_norm": 1.4882378578186035, + "learning_rate": 0.00011759786476868329, + "loss": 0.7152, + "step": 4641 + }, + { + "epoch": 2.063111111111111, + "grad_norm": 3.052170991897583, + "learning_rate": 0.00011758007117437723, + "loss": 1.7892, + "step": 4642 + }, + { + "epoch": 2.0635555555555554, + "grad_norm": 3.016031265258789, + "learning_rate": 0.00011756227758007119, + "loss": 2.0574, + "step": 4643 + }, + { + "epoch": 2.064, + "grad_norm": 2.477534532546997, + "learning_rate": 0.00011754448398576512, + "loss": 1.4391, + "step": 4644 + }, + { + "epoch": 2.0644444444444443, + "grad_norm": 2.703592300415039, + "learning_rate": 0.00011752669039145908, + "loss": 1.782, + "step": 4645 + }, + { + "epoch": 2.064888888888889, + "grad_norm": 2.8277475833892822, + "learning_rate": 0.00011750889679715302, + "loss": 2.0831, + "step": 4646 + }, + { + "epoch": 2.0653333333333332, + "grad_norm": 2.78928542137146, + "learning_rate": 0.00011749110320284697, + "loss": 1.3444, + "step": 4647 + }, + { + "epoch": 2.065777777777778, + "grad_norm": 2.8295578956604004, + "learning_rate": 0.00011747330960854093, + "loss": 1.4441, + "step": 4648 + }, + { + "epoch": 2.066222222222222, + "grad_norm": 1.9804134368896484, + 
"learning_rate": 0.00011745551601423487, + "loss": 0.8381, + "step": 4649 + }, + { + "epoch": 2.066666666666667, + "grad_norm": 3.729863166809082, + "learning_rate": 0.00011743772241992883, + "loss": 1.7128, + "step": 4650 + }, + { + "epoch": 2.067111111111111, + "grad_norm": 2.222588062286377, + "learning_rate": 0.00011741992882562279, + "loss": 0.9648, + "step": 4651 + }, + { + "epoch": 2.0675555555555554, + "grad_norm": 1.8874247074127197, + "learning_rate": 0.00011740213523131673, + "loss": 1.9391, + "step": 4652 + }, + { + "epoch": 2.068, + "grad_norm": 1.757788062095642, + "learning_rate": 0.00011738434163701069, + "loss": 1.8273, + "step": 4653 + }, + { + "epoch": 2.0684444444444443, + "grad_norm": 2.6937432289123535, + "learning_rate": 0.00011736654804270464, + "loss": 2.0317, + "step": 4654 + }, + { + "epoch": 2.068888888888889, + "grad_norm": 2.1685352325439453, + "learning_rate": 0.00011734875444839859, + "loss": 1.957, + "step": 4655 + }, + { + "epoch": 2.0693333333333332, + "grad_norm": 1.865443229675293, + "learning_rate": 0.00011733096085409254, + "loss": 1.8245, + "step": 4656 + }, + { + "epoch": 2.069777777777778, + "grad_norm": 1.8847582340240479, + "learning_rate": 0.00011731316725978647, + "loss": 2.0589, + "step": 4657 + }, + { + "epoch": 2.070222222222222, + "grad_norm": 2.063577890396118, + "learning_rate": 0.00011729537366548043, + "loss": 1.9999, + "step": 4658 + }, + { + "epoch": 2.070666666666667, + "grad_norm": 2.3595352172851562, + "learning_rate": 0.00011727758007117437, + "loss": 1.2466, + "step": 4659 + }, + { + "epoch": 2.071111111111111, + "grad_norm": 1.9096697568893433, + "learning_rate": 0.00011725978647686833, + "loss": 1.5026, + "step": 4660 + }, + { + "epoch": 2.0715555555555554, + "grad_norm": 2.064755439758301, + "learning_rate": 0.00011724199288256228, + "loss": 1.7534, + "step": 4661 + }, + { + "epoch": 2.072, + "grad_norm": 1.523712158203125, + "learning_rate": 0.00011722419928825623, + "loss": 1.2692, + "step": 4662 + }, + { + "epoch": 2.0724444444444443, + "grad_norm": 2.223828077316284, + "learning_rate": 0.00011720640569395018, + "loss": 1.8284, + "step": 4663 + }, + { + "epoch": 2.072888888888889, + "grad_norm": 2.1838390827178955, + "learning_rate": 0.00011718861209964414, + "loss": 1.6618, + "step": 4664 + }, + { + "epoch": 2.0733333333333333, + "grad_norm": 1.8496938943862915, + "learning_rate": 0.00011717081850533808, + "loss": 1.9343, + "step": 4665 + }, + { + "epoch": 2.073777777777778, + "grad_norm": 2.063234806060791, + "learning_rate": 0.00011715302491103204, + "loss": 1.4225, + "step": 4666 + }, + { + "epoch": 2.074222222222222, + "grad_norm": 2.388313055038452, + "learning_rate": 0.000117135231316726, + "loss": 1.5016, + "step": 4667 + }, + { + "epoch": 2.074666666666667, + "grad_norm": 2.2325823307037354, + "learning_rate": 0.00011711743772241994, + "loss": 1.7935, + "step": 4668 + }, + { + "epoch": 2.075111111111111, + "grad_norm": 1.600996732711792, + "learning_rate": 0.0001170996441281139, + "loss": 1.0136, + "step": 4669 + }, + { + "epoch": 2.0755555555555554, + "grad_norm": 2.17900013923645, + "learning_rate": 0.00011708185053380783, + "loss": 1.8612, + "step": 4670 + }, + { + "epoch": 2.076, + "grad_norm": 2.1239895820617676, + "learning_rate": 0.00011706405693950178, + "loss": 2.083, + "step": 4671 + }, + { + "epoch": 2.0764444444444443, + "grad_norm": 1.9274380207061768, + "learning_rate": 0.00011704626334519572, + "loss": 1.789, + "step": 4672 + }, + { + "epoch": 2.076888888888889, + "grad_norm": 2.4279444217681885, + 
"learning_rate": 0.00011702846975088968, + "loss": 2.0962, + "step": 4673 + }, + { + "epoch": 2.0773333333333333, + "grad_norm": 2.049731969833374, + "learning_rate": 0.00011701067615658364, + "loss": 1.529, + "step": 4674 + }, + { + "epoch": 2.077777777777778, + "grad_norm": 2.452834367752075, + "learning_rate": 0.00011699288256227758, + "loss": 1.5339, + "step": 4675 + }, + { + "epoch": 2.078222222222222, + "grad_norm": 2.3302857875823975, + "learning_rate": 0.00011697508896797154, + "loss": 1.8732, + "step": 4676 + }, + { + "epoch": 2.078666666666667, + "grad_norm": 2.1098620891571045, + "learning_rate": 0.0001169572953736655, + "loss": 1.4732, + "step": 4677 + }, + { + "epoch": 2.079111111111111, + "grad_norm": 2.499377489089966, + "learning_rate": 0.00011693950177935944, + "loss": 1.5989, + "step": 4678 + }, + { + "epoch": 2.0795555555555554, + "grad_norm": 2.321789503097534, + "learning_rate": 0.00011692170818505339, + "loss": 1.9255, + "step": 4679 + }, + { + "epoch": 2.08, + "grad_norm": 1.7443134784698486, + "learning_rate": 0.00011690391459074735, + "loss": 0.6187, + "step": 4680 + }, + { + "epoch": 2.0804444444444443, + "grad_norm": 2.2512123584747314, + "learning_rate": 0.00011688612099644129, + "loss": 1.8004, + "step": 4681 + }, + { + "epoch": 2.080888888888889, + "grad_norm": 2.09832763671875, + "learning_rate": 0.00011686832740213525, + "loss": 1.3556, + "step": 4682 + }, + { + "epoch": 2.0813333333333333, + "grad_norm": 2.038501262664795, + "learning_rate": 0.00011685053380782918, + "loss": 1.1602, + "step": 4683 + }, + { + "epoch": 2.081777777777778, + "grad_norm": 2.386502504348755, + "learning_rate": 0.00011683274021352313, + "loss": 1.5816, + "step": 4684 + }, + { + "epoch": 2.082222222222222, + "grad_norm": 2.359687328338623, + "learning_rate": 0.00011681494661921708, + "loss": 1.7559, + "step": 4685 + }, + { + "epoch": 2.0826666666666664, + "grad_norm": 2.628209114074707, + "learning_rate": 0.00011679715302491103, + "loss": 1.7236, + "step": 4686 + }, + { + "epoch": 2.083111111111111, + "grad_norm": 2.598487615585327, + "learning_rate": 0.00011677935943060499, + "loss": 2.2871, + "step": 4687 + }, + { + "epoch": 2.0835555555555554, + "grad_norm": 2.430295944213867, + "learning_rate": 0.00011676156583629893, + "loss": 1.3882, + "step": 4688 + }, + { + "epoch": 2.084, + "grad_norm": 2.897634506225586, + "learning_rate": 0.00011674377224199289, + "loss": 1.9332, + "step": 4689 + }, + { + "epoch": 2.0844444444444443, + "grad_norm": 3.5412042140960693, + "learning_rate": 0.00011672597864768685, + "loss": 1.7709, + "step": 4690 + }, + { + "epoch": 2.084888888888889, + "grad_norm": 2.518477201461792, + "learning_rate": 0.00011670818505338079, + "loss": 1.8739, + "step": 4691 + }, + { + "epoch": 2.0853333333333333, + "grad_norm": 2.6702585220336914, + "learning_rate": 0.00011669039145907475, + "loss": 1.5968, + "step": 4692 + }, + { + "epoch": 2.085777777777778, + "grad_norm": 2.3972411155700684, + "learning_rate": 0.00011667259786476869, + "loss": 1.2457, + "step": 4693 + }, + { + "epoch": 2.086222222222222, + "grad_norm": 3.0265438556671143, + "learning_rate": 0.00011665480427046265, + "loss": 2.0656, + "step": 4694 + }, + { + "epoch": 2.086666666666667, + "grad_norm": 2.678575038909912, + "learning_rate": 0.0001166370106761566, + "loss": 1.3855, + "step": 4695 + }, + { + "epoch": 2.087111111111111, + "grad_norm": 2.815953016281128, + "learning_rate": 0.00011661921708185053, + "loss": 1.5628, + "step": 4696 + }, + { + "epoch": 2.0875555555555554, + "grad_norm": 
3.07003116607666, + "learning_rate": 0.00011660142348754447, + "loss": 1.8748, + "step": 4697 + }, + { + "epoch": 2.088, + "grad_norm": 2.8302512168884277, + "learning_rate": 0.00011658362989323843, + "loss": 1.9202, + "step": 4698 + }, + { + "epoch": 2.0884444444444443, + "grad_norm": 2.839108467102051, + "learning_rate": 0.00011656583629893239, + "loss": 1.5236, + "step": 4699 + }, + { + "epoch": 2.088888888888889, + "grad_norm": 2.802971124649048, + "learning_rate": 0.00011654804270462633, + "loss": 1.2329, + "step": 4700 + }, + { + "epoch": 2.0893333333333333, + "grad_norm": 1.8777737617492676, + "learning_rate": 0.00011653024911032029, + "loss": 2.2772, + "step": 4701 + }, + { + "epoch": 2.089777777777778, + "grad_norm": 2.0970041751861572, + "learning_rate": 0.00011651245551601424, + "loss": 1.5844, + "step": 4702 + }, + { + "epoch": 2.090222222222222, + "grad_norm": 2.249812126159668, + "learning_rate": 0.00011649466192170819, + "loss": 1.9234, + "step": 4703 + }, + { + "epoch": 2.0906666666666665, + "grad_norm": 2.0751559734344482, + "learning_rate": 0.00011647686832740214, + "loss": 1.9922, + "step": 4704 + }, + { + "epoch": 2.091111111111111, + "grad_norm": 3.0285685062408447, + "learning_rate": 0.0001164590747330961, + "loss": 1.4631, + "step": 4705 + }, + { + "epoch": 2.0915555555555554, + "grad_norm": 1.9764230251312256, + "learning_rate": 0.00011644128113879004, + "loss": 1.3146, + "step": 4706 + }, + { + "epoch": 2.092, + "grad_norm": 2.0233638286590576, + "learning_rate": 0.000116423487544484, + "loss": 1.7718, + "step": 4707 + }, + { + "epoch": 2.0924444444444443, + "grad_norm": 2.1841344833374023, + "learning_rate": 0.00011640569395017796, + "loss": 2.2122, + "step": 4708 + }, + { + "epoch": 2.092888888888889, + "grad_norm": 2.00598406791687, + "learning_rate": 0.00011638790035587188, + "loss": 1.7922, + "step": 4709 + }, + { + "epoch": 2.0933333333333333, + "grad_norm": 2.4193332195281982, + "learning_rate": 0.00011637010676156583, + "loss": 1.916, + "step": 4710 + }, + { + "epoch": 2.093777777777778, + "grad_norm": 2.1774141788482666, + "learning_rate": 0.00011635231316725978, + "loss": 2.0158, + "step": 4711 + }, + { + "epoch": 2.094222222222222, + "grad_norm": 2.0492541790008545, + "learning_rate": 0.00011633451957295374, + "loss": 2.1842, + "step": 4712 + }, + { + "epoch": 2.0946666666666665, + "grad_norm": 2.008819818496704, + "learning_rate": 0.00011631672597864768, + "loss": 1.6566, + "step": 4713 + }, + { + "epoch": 2.095111111111111, + "grad_norm": 1.877214789390564, + "learning_rate": 0.00011629893238434164, + "loss": 1.9144, + "step": 4714 + }, + { + "epoch": 2.0955555555555554, + "grad_norm": 3.5000193119049072, + "learning_rate": 0.0001162811387900356, + "loss": 2.7184, + "step": 4715 + }, + { + "epoch": 2.096, + "grad_norm": 2.240647554397583, + "learning_rate": 0.00011626334519572954, + "loss": 2.0603, + "step": 4716 + }, + { + "epoch": 2.0964444444444443, + "grad_norm": 1.966137409210205, + "learning_rate": 0.0001162455516014235, + "loss": 1.7733, + "step": 4717 + }, + { + "epoch": 2.096888888888889, + "grad_norm": 1.9724191427230835, + "learning_rate": 0.00011622775800711745, + "loss": 1.8949, + "step": 4718 + }, + { + "epoch": 2.0973333333333333, + "grad_norm": 2.057591199874878, + "learning_rate": 0.0001162099644128114, + "loss": 1.943, + "step": 4719 + }, + { + "epoch": 2.097777777777778, + "grad_norm": 2.395439624786377, + "learning_rate": 0.00011619217081850535, + "loss": 1.8616, + "step": 4720 + }, + { + "epoch": 2.098222222222222, + "grad_norm": 
2.374725580215454, + "learning_rate": 0.00011617437722419931, + "loss": 1.5945, + "step": 4721 + }, + { + "epoch": 2.0986666666666665, + "grad_norm": 2.0969061851501465, + "learning_rate": 0.00011615658362989324, + "loss": 1.7964, + "step": 4722 + }, + { + "epoch": 2.099111111111111, + "grad_norm": 2.3275258541107178, + "learning_rate": 0.00011613879003558718, + "loss": 1.9125, + "step": 4723 + }, + { + "epoch": 2.0995555555555554, + "grad_norm": 2.1269102096557617, + "learning_rate": 0.00011612099644128114, + "loss": 1.2444, + "step": 4724 + }, + { + "epoch": 2.1, + "grad_norm": 2.2072010040283203, + "learning_rate": 0.0001161032028469751, + "loss": 1.4687, + "step": 4725 + }, + { + "epoch": 2.1004444444444443, + "grad_norm": 2.1687400341033936, + "learning_rate": 0.00011608540925266904, + "loss": 1.8108, + "step": 4726 + }, + { + "epoch": 2.100888888888889, + "grad_norm": 2.358961820602417, + "learning_rate": 0.00011606761565836299, + "loss": 1.7005, + "step": 4727 + }, + { + "epoch": 2.1013333333333333, + "grad_norm": 2.413325786590576, + "learning_rate": 0.00011604982206405695, + "loss": 1.9044, + "step": 4728 + }, + { + "epoch": 2.101777777777778, + "grad_norm": 2.0373430252075195, + "learning_rate": 0.00011603202846975089, + "loss": 1.395, + "step": 4729 + }, + { + "epoch": 2.102222222222222, + "grad_norm": 2.424889326095581, + "learning_rate": 0.00011601423487544485, + "loss": 1.7084, + "step": 4730 + }, + { + "epoch": 2.1026666666666665, + "grad_norm": 2.3407599925994873, + "learning_rate": 0.0001159964412811388, + "loss": 1.3036, + "step": 4731 + }, + { + "epoch": 2.103111111111111, + "grad_norm": 2.332714080810547, + "learning_rate": 0.00011597864768683275, + "loss": 1.7091, + "step": 4732 + }, + { + "epoch": 2.1035555555555554, + "grad_norm": 2.412674903869629, + "learning_rate": 0.0001159608540925267, + "loss": 1.5195, + "step": 4733 + }, + { + "epoch": 2.104, + "grad_norm": 2.552485942840576, + "learning_rate": 0.00011594306049822066, + "loss": 1.2986, + "step": 4734 + }, + { + "epoch": 2.1044444444444443, + "grad_norm": 2.3344991207122803, + "learning_rate": 0.00011592526690391459, + "loss": 1.5909, + "step": 4735 + }, + { + "epoch": 2.104888888888889, + "grad_norm": 2.5788843631744385, + "learning_rate": 0.00011590747330960853, + "loss": 1.7849, + "step": 4736 + }, + { + "epoch": 2.1053333333333333, + "grad_norm": 2.2648143768310547, + "learning_rate": 0.00011588967971530249, + "loss": 1.688, + "step": 4737 + }, + { + "epoch": 2.105777777777778, + "grad_norm": 1.9284850358963013, + "learning_rate": 0.00011587188612099645, + "loss": 0.7604, + "step": 4738 + }, + { + "epoch": 2.106222222222222, + "grad_norm": 2.7157113552093506, + "learning_rate": 0.00011585409252669039, + "loss": 1.6809, + "step": 4739 + }, + { + "epoch": 2.1066666666666665, + "grad_norm": 2.454627752304077, + "learning_rate": 0.00011583629893238435, + "loss": 1.5076, + "step": 4740 + }, + { + "epoch": 2.107111111111111, + "grad_norm": 2.6363182067871094, + "learning_rate": 0.0001158185053380783, + "loss": 1.5113, + "step": 4741 + }, + { + "epoch": 2.1075555555555554, + "grad_norm": 2.9892001152038574, + "learning_rate": 0.00011580071174377225, + "loss": 1.453, + "step": 4742 + }, + { + "epoch": 2.108, + "grad_norm": 2.927412509918213, + "learning_rate": 0.0001157829181494662, + "loss": 1.7116, + "step": 4743 + }, + { + "epoch": 2.1084444444444443, + "grad_norm": 2.7504935264587402, + "learning_rate": 0.00011576512455516016, + "loss": 1.5148, + "step": 4744 + }, + { + "epoch": 2.108888888888889, + 
"grad_norm": 2.982175588607788, + "learning_rate": 0.0001157473309608541, + "loss": 1.7693, + "step": 4745 + }, + { + "epoch": 2.1093333333333333, + "grad_norm": 2.9303767681121826, + "learning_rate": 0.00011572953736654806, + "loss": 1.6931, + "step": 4746 + }, + { + "epoch": 2.109777777777778, + "grad_norm": 3.9879038333892822, + "learning_rate": 0.00011571174377224201, + "loss": 1.9336, + "step": 4747 + }, + { + "epoch": 2.110222222222222, + "grad_norm": 3.65157413482666, + "learning_rate": 0.00011569395017793594, + "loss": 1.7332, + "step": 4748 + }, + { + "epoch": 2.1106666666666665, + "grad_norm": 3.0835530757904053, + "learning_rate": 0.00011567615658362989, + "loss": 1.9205, + "step": 4749 + }, + { + "epoch": 2.111111111111111, + "grad_norm": 3.5538792610168457, + "learning_rate": 0.00011565836298932384, + "loss": 0.9, + "step": 4750 + }, + { + "epoch": 2.1115555555555554, + "grad_norm": 1.4265509843826294, + "learning_rate": 0.0001156405693950178, + "loss": 1.01, + "step": 4751 + }, + { + "epoch": 2.112, + "grad_norm": 1.9871463775634766, + "learning_rate": 0.00011562277580071174, + "loss": 2.5466, + "step": 4752 + }, + { + "epoch": 2.1124444444444443, + "grad_norm": 2.88714337348938, + "learning_rate": 0.0001156049822064057, + "loss": 1.1221, + "step": 4753 + }, + { + "epoch": 2.112888888888889, + "grad_norm": 1.92192804813385, + "learning_rate": 0.00011558718861209966, + "loss": 1.7343, + "step": 4754 + }, + { + "epoch": 2.1133333333333333, + "grad_norm": 2.3003766536712646, + "learning_rate": 0.0001155693950177936, + "loss": 2.0344, + "step": 4755 + }, + { + "epoch": 2.113777777777778, + "grad_norm": 2.1846070289611816, + "learning_rate": 0.00011555160142348756, + "loss": 1.9369, + "step": 4756 + }, + { + "epoch": 2.1142222222222222, + "grad_norm": 2.2077274322509766, + "learning_rate": 0.00011553380782918151, + "loss": 1.8763, + "step": 4757 + }, + { + "epoch": 2.1146666666666665, + "grad_norm": 2.247318983078003, + "learning_rate": 0.00011551601423487545, + "loss": 1.6496, + "step": 4758 + }, + { + "epoch": 2.115111111111111, + "grad_norm": 2.483921527862549, + "learning_rate": 0.00011549822064056941, + "loss": 2.0657, + "step": 4759 + }, + { + "epoch": 2.1155555555555554, + "grad_norm": 2.4575605392456055, + "learning_rate": 0.00011548042704626334, + "loss": 1.9344, + "step": 4760 + }, + { + "epoch": 2.116, + "grad_norm": 2.4088830947875977, + "learning_rate": 0.0001154626334519573, + "loss": 1.8533, + "step": 4761 + }, + { + "epoch": 2.1164444444444444, + "grad_norm": 2.2537829875946045, + "learning_rate": 0.00011544483985765124, + "loss": 1.8034, + "step": 4762 + }, + { + "epoch": 2.116888888888889, + "grad_norm": 2.119475841522217, + "learning_rate": 0.0001154270462633452, + "loss": 1.9903, + "step": 4763 + }, + { + "epoch": 2.1173333333333333, + "grad_norm": 2.0730443000793457, + "learning_rate": 0.00011540925266903915, + "loss": 1.6459, + "step": 4764 + }, + { + "epoch": 2.117777777777778, + "grad_norm": 1.8100666999816895, + "learning_rate": 0.0001153914590747331, + "loss": 1.7129, + "step": 4765 + }, + { + "epoch": 2.1182222222222222, + "grad_norm": 1.7500770092010498, + "learning_rate": 0.00011537366548042705, + "loss": 1.5328, + "step": 4766 + }, + { + "epoch": 2.1186666666666665, + "grad_norm": 2.3155996799468994, + "learning_rate": 0.00011535587188612101, + "loss": 1.3889, + "step": 4767 + }, + { + "epoch": 2.119111111111111, + "grad_norm": 2.4421792030334473, + "learning_rate": 0.00011533807829181495, + "loss": 2.0543, + "step": 4768 + }, + { + "epoch": 
2.1195555555555554, + "grad_norm": 2.4871792793273926, + "learning_rate": 0.00011532028469750891, + "loss": 2.2072, + "step": 4769 + }, + { + "epoch": 2.12, + "grad_norm": 2.081446647644043, + "learning_rate": 0.00011530249110320286, + "loss": 1.4294, + "step": 4770 + }, + { + "epoch": 2.1204444444444444, + "grad_norm": 2.19559383392334, + "learning_rate": 0.00011528469750889681, + "loss": 1.9015, + "step": 4771 + }, + { + "epoch": 2.120888888888889, + "grad_norm": 2.453030586242676, + "learning_rate": 0.00011526690391459076, + "loss": 1.8428, + "step": 4772 + }, + { + "epoch": 2.1213333333333333, + "grad_norm": 2.7875795364379883, + "learning_rate": 0.0001152491103202847, + "loss": 1.5894, + "step": 4773 + }, + { + "epoch": 2.121777777777778, + "grad_norm": 2.395458459854126, + "learning_rate": 0.00011523131672597865, + "loss": 1.5671, + "step": 4774 + }, + { + "epoch": 2.1222222222222222, + "grad_norm": 2.4182310104370117, + "learning_rate": 0.0001152135231316726, + "loss": 1.6568, + "step": 4775 + }, + { + "epoch": 2.1226666666666665, + "grad_norm": 2.46071195602417, + "learning_rate": 0.00011519572953736655, + "loss": 1.8421, + "step": 4776 + }, + { + "epoch": 2.123111111111111, + "grad_norm": 2.5873522758483887, + "learning_rate": 0.0001151779359430605, + "loss": 1.5898, + "step": 4777 + }, + { + "epoch": 2.1235555555555554, + "grad_norm": 1.7765636444091797, + "learning_rate": 0.00011516014234875445, + "loss": 0.7924, + "step": 4778 + }, + { + "epoch": 2.124, + "grad_norm": 2.2877538204193115, + "learning_rate": 0.0001151423487544484, + "loss": 1.9232, + "step": 4779 + }, + { + "epoch": 2.1244444444444444, + "grad_norm": 2.445138692855835, + "learning_rate": 0.00011512455516014236, + "loss": 2.0002, + "step": 4780 + }, + { + "epoch": 2.124888888888889, + "grad_norm": 2.173436403274536, + "learning_rate": 0.0001151067615658363, + "loss": 1.6479, + "step": 4781 + }, + { + "epoch": 2.1253333333333333, + "grad_norm": 2.5610175132751465, + "learning_rate": 0.00011508896797153026, + "loss": 1.4669, + "step": 4782 + }, + { + "epoch": 2.1257777777777775, + "grad_norm": 2.4170966148376465, + "learning_rate": 0.0001150711743772242, + "loss": 1.7575, + "step": 4783 + }, + { + "epoch": 2.1262222222222222, + "grad_norm": 2.7330989837646484, + "learning_rate": 0.00011505338078291816, + "loss": 1.8737, + "step": 4784 + }, + { + "epoch": 2.1266666666666665, + "grad_norm": 2.4493215084075928, + "learning_rate": 0.00011503558718861212, + "loss": 1.2428, + "step": 4785 + }, + { + "epoch": 2.127111111111111, + "grad_norm": 2.54807710647583, + "learning_rate": 0.00011501779359430605, + "loss": 1.675, + "step": 4786 + }, + { + "epoch": 2.1275555555555554, + "grad_norm": 3.092026472091675, + "learning_rate": 0.00011499999999999999, + "loss": 1.8129, + "step": 4787 + }, + { + "epoch": 2.128, + "grad_norm": 3.01973819732666, + "learning_rate": 0.00011498220640569395, + "loss": 1.9959, + "step": 4788 + }, + { + "epoch": 2.1284444444444444, + "grad_norm": 2.2665798664093018, + "learning_rate": 0.0001149644128113879, + "loss": 1.1747, + "step": 4789 + }, + { + "epoch": 2.128888888888889, + "grad_norm": 3.0983033180236816, + "learning_rate": 0.00011494661921708185, + "loss": 1.9808, + "step": 4790 + }, + { + "epoch": 2.1293333333333333, + "grad_norm": 2.7604308128356934, + "learning_rate": 0.0001149288256227758, + "loss": 1.7185, + "step": 4791 + }, + { + "epoch": 2.129777777777778, + "grad_norm": 2.712104558944702, + "learning_rate": 0.00011491103202846976, + "loss": 1.3634, + "step": 4792 + }, + { + "epoch": 
2.1302222222222222, + "grad_norm": 2.7724192142486572, + "learning_rate": 0.0001148932384341637, + "loss": 1.2231, + "step": 4793 + }, + { + "epoch": 2.1306666666666665, + "grad_norm": 3.0472283363342285, + "learning_rate": 0.00011487544483985766, + "loss": 1.7844, + "step": 4794 + }, + { + "epoch": 2.131111111111111, + "grad_norm": 2.9061384201049805, + "learning_rate": 0.00011485765124555161, + "loss": 1.7998, + "step": 4795 + }, + { + "epoch": 2.1315555555555554, + "grad_norm": 3.207463502883911, + "learning_rate": 0.00011483985765124556, + "loss": 1.8822, + "step": 4796 + }, + { + "epoch": 2.132, + "grad_norm": 3.0004780292510986, + "learning_rate": 0.00011482206405693951, + "loss": 1.6778, + "step": 4797 + }, + { + "epoch": 2.1324444444444444, + "grad_norm": 3.055454730987549, + "learning_rate": 0.00011480427046263347, + "loss": 1.7763, + "step": 4798 + }, + { + "epoch": 2.132888888888889, + "grad_norm": 3.3208091259002686, + "learning_rate": 0.0001147864768683274, + "loss": 1.8971, + "step": 4799 + }, + { + "epoch": 2.1333333333333333, + "grad_norm": 2.958496570587158, + "learning_rate": 0.00011476868327402134, + "loss": 1.3253, + "step": 4800 + }, + { + "epoch": 2.1337777777777776, + "grad_norm": 1.970202922821045, + "learning_rate": 0.0001147508896797153, + "loss": 2.5294, + "step": 4801 + }, + { + "epoch": 2.1342222222222222, + "grad_norm": 1.807246446609497, + "learning_rate": 0.00011473309608540926, + "loss": 2.6724, + "step": 4802 + }, + { + "epoch": 2.1346666666666665, + "grad_norm": 1.7409089803695679, + "learning_rate": 0.0001147153024911032, + "loss": 1.8308, + "step": 4803 + }, + { + "epoch": 2.135111111111111, + "grad_norm": 1.3472347259521484, + "learning_rate": 0.00011469750889679716, + "loss": 1.1779, + "step": 4804 + }, + { + "epoch": 2.1355555555555554, + "grad_norm": 2.0794670581817627, + "learning_rate": 0.00011467971530249111, + "loss": 1.8861, + "step": 4805 + }, + { + "epoch": 2.136, + "grad_norm": 1.9827806949615479, + "learning_rate": 0.00011466192170818505, + "loss": 2.0878, + "step": 4806 + }, + { + "epoch": 2.1364444444444444, + "grad_norm": 1.8904576301574707, + "learning_rate": 0.00011464412811387901, + "loss": 1.7428, + "step": 4807 + }, + { + "epoch": 2.136888888888889, + "grad_norm": 1.7913202047348022, + "learning_rate": 0.00011462633451957297, + "loss": 1.2166, + "step": 4808 + }, + { + "epoch": 2.1373333333333333, + "grad_norm": 2.015347957611084, + "learning_rate": 0.00011460854092526691, + "loss": 1.8428, + "step": 4809 + }, + { + "epoch": 2.137777777777778, + "grad_norm": 2.0501961708068848, + "learning_rate": 0.00011459074733096087, + "loss": 2.1464, + "step": 4810 + }, + { + "epoch": 2.1382222222222222, + "grad_norm": 2.050212860107422, + "learning_rate": 0.00011457295373665482, + "loss": 2.0002, + "step": 4811 + }, + { + "epoch": 2.1386666666666665, + "grad_norm": 2.134122133255005, + "learning_rate": 0.00011455516014234875, + "loss": 2.1944, + "step": 4812 + }, + { + "epoch": 2.139111111111111, + "grad_norm": 1.8899743556976318, + "learning_rate": 0.0001145373665480427, + "loss": 1.6606, + "step": 4813 + }, + { + "epoch": 2.1395555555555554, + "grad_norm": 2.5020689964294434, + "learning_rate": 0.00011451957295373665, + "loss": 1.2598, + "step": 4814 + }, + { + "epoch": 2.14, + "grad_norm": 1.9991487264633179, + "learning_rate": 0.00011450177935943061, + "loss": 1.9065, + "step": 4815 + }, + { + "epoch": 2.1404444444444444, + "grad_norm": 2.101409912109375, + "learning_rate": 0.00011448398576512455, + "loss": 2.0605, + "step": 4816 + }, + { + 
"epoch": 2.140888888888889, + "grad_norm": 1.888168454170227, + "learning_rate": 0.00011446619217081851, + "loss": 1.7381, + "step": 4817 + }, + { + "epoch": 2.1413333333333333, + "grad_norm": 1.9398518800735474, + "learning_rate": 0.00011444839857651247, + "loss": 1.6413, + "step": 4818 + }, + { + "epoch": 2.1417777777777776, + "grad_norm": 2.3134799003601074, + "learning_rate": 0.00011443060498220641, + "loss": 1.6505, + "step": 4819 + }, + { + "epoch": 2.1422222222222222, + "grad_norm": 2.276531934738159, + "learning_rate": 0.00011441281138790036, + "loss": 2.3464, + "step": 4820 + }, + { + "epoch": 2.1426666666666665, + "grad_norm": 1.9262102842330933, + "learning_rate": 0.00011439501779359432, + "loss": 1.7171, + "step": 4821 + }, + { + "epoch": 2.143111111111111, + "grad_norm": 1.9879403114318848, + "learning_rate": 0.00011437722419928826, + "loss": 1.3029, + "step": 4822 + }, + { + "epoch": 2.1435555555555554, + "grad_norm": 2.1766903400421143, + "learning_rate": 0.00011435943060498222, + "loss": 1.7259, + "step": 4823 + }, + { + "epoch": 2.144, + "grad_norm": 2.2997825145721436, + "learning_rate": 0.00011434163701067618, + "loss": 2.3198, + "step": 4824 + }, + { + "epoch": 2.1444444444444444, + "grad_norm": 2.1982157230377197, + "learning_rate": 0.0001143238434163701, + "loss": 1.913, + "step": 4825 + }, + { + "epoch": 2.144888888888889, + "grad_norm": 2.041598320007324, + "learning_rate": 0.00011430604982206405, + "loss": 1.473, + "step": 4826 + }, + { + "epoch": 2.1453333333333333, + "grad_norm": 2.7166502475738525, + "learning_rate": 0.000114288256227758, + "loss": 1.6377, + "step": 4827 + }, + { + "epoch": 2.145777777777778, + "grad_norm": 2.4108223915100098, + "learning_rate": 0.00011427046263345196, + "loss": 1.5567, + "step": 4828 + }, + { + "epoch": 2.1462222222222223, + "grad_norm": 2.168083429336548, + "learning_rate": 0.0001142526690391459, + "loss": 1.4504, + "step": 4829 + }, + { + "epoch": 2.1466666666666665, + "grad_norm": 2.364166498184204, + "learning_rate": 0.00011423487544483986, + "loss": 2.1665, + "step": 4830 + }, + { + "epoch": 2.147111111111111, + "grad_norm": 2.303101062774658, + "learning_rate": 0.00011421708185053382, + "loss": 1.7004, + "step": 4831 + }, + { + "epoch": 2.1475555555555554, + "grad_norm": 2.227083921432495, + "learning_rate": 0.00011419928825622776, + "loss": 0.8074, + "step": 4832 + }, + { + "epoch": 2.148, + "grad_norm": 2.1052846908569336, + "learning_rate": 0.00011418149466192172, + "loss": 1.6949, + "step": 4833 + }, + { + "epoch": 2.1484444444444444, + "grad_norm": 2.4847774505615234, + "learning_rate": 0.00011416370106761567, + "loss": 1.601, + "step": 4834 + }, + { + "epoch": 2.148888888888889, + "grad_norm": 1.6619476079940796, + "learning_rate": 0.00011414590747330962, + "loss": 0.8843, + "step": 4835 + }, + { + "epoch": 2.1493333333333333, + "grad_norm": 2.540987491607666, + "learning_rate": 0.00011412811387900357, + "loss": 1.5859, + "step": 4836 + }, + { + "epoch": 2.1497777777777776, + "grad_norm": 2.535325765609741, + "learning_rate": 0.00011411032028469753, + "loss": 1.8662, + "step": 4837 + }, + { + "epoch": 2.1502222222222223, + "grad_norm": 2.593508243560791, + "learning_rate": 0.00011409252669039146, + "loss": 1.5427, + "step": 4838 + }, + { + "epoch": 2.1506666666666665, + "grad_norm": 2.8630247116088867, + "learning_rate": 0.0001140747330960854, + "loss": 1.5968, + "step": 4839 + }, + { + "epoch": 2.151111111111111, + "grad_norm": 2.9739978313446045, + "learning_rate": 0.00011405693950177936, + "loss": 1.7853, + 
"step": 4840 + }, + { + "epoch": 2.1515555555555554, + "grad_norm": 2.3142545223236084, + "learning_rate": 0.00011403914590747332, + "loss": 1.4719, + "step": 4841 + }, + { + "epoch": 2.152, + "grad_norm": 2.915494203567505, + "learning_rate": 0.00011402135231316726, + "loss": 1.4225, + "step": 4842 + }, + { + "epoch": 2.1524444444444444, + "grad_norm": 2.4613630771636963, + "learning_rate": 0.00011400355871886121, + "loss": 1.8753, + "step": 4843 + }, + { + "epoch": 2.152888888888889, + "grad_norm": 3.434368133544922, + "learning_rate": 0.00011398576512455517, + "loss": 1.4475, + "step": 4844 + }, + { + "epoch": 2.1533333333333333, + "grad_norm": 3.4849135875701904, + "learning_rate": 0.00011396797153024911, + "loss": 2.0161, + "step": 4845 + }, + { + "epoch": 2.153777777777778, + "grad_norm": 2.9939815998077393, + "learning_rate": 0.00011395017793594307, + "loss": 1.7339, + "step": 4846 + }, + { + "epoch": 2.1542222222222223, + "grad_norm": 2.8146097660064697, + "learning_rate": 0.00011393238434163703, + "loss": 1.5392, + "step": 4847 + }, + { + "epoch": 2.1546666666666665, + "grad_norm": 4.10907506942749, + "learning_rate": 0.00011391459074733097, + "loss": 1.2541, + "step": 4848 + }, + { + "epoch": 2.155111111111111, + "grad_norm": 2.995576858520508, + "learning_rate": 0.00011389679715302493, + "loss": 1.668, + "step": 4849 + }, + { + "epoch": 2.1555555555555554, + "grad_norm": 0.45589274168014526, + "learning_rate": 0.00011387900355871888, + "loss": 0.0619, + "step": 4850 + }, + { + "epoch": 2.156, + "grad_norm": 1.8959424495697021, + "learning_rate": 0.00011386120996441281, + "loss": 2.3939, + "step": 4851 + }, + { + "epoch": 2.1564444444444444, + "grad_norm": 1.9614112377166748, + "learning_rate": 0.00011384341637010676, + "loss": 1.8642, + "step": 4852 + }, + { + "epoch": 2.156888888888889, + "grad_norm": 2.2174315452575684, + "learning_rate": 0.00011382562277580071, + "loss": 1.6341, + "step": 4853 + }, + { + "epoch": 2.1573333333333333, + "grad_norm": 2.617152452468872, + "learning_rate": 0.00011380782918149467, + "loss": 1.9807, + "step": 4854 + }, + { + "epoch": 2.1577777777777776, + "grad_norm": 1.9597722291946411, + "learning_rate": 0.00011379003558718861, + "loss": 1.5573, + "step": 4855 + }, + { + "epoch": 2.1582222222222223, + "grad_norm": 2.3918566703796387, + "learning_rate": 0.00011377224199288257, + "loss": 2.0388, + "step": 4856 + }, + { + "epoch": 2.1586666666666665, + "grad_norm": 2.0133910179138184, + "learning_rate": 0.00011375444839857652, + "loss": 2.0006, + "step": 4857 + }, + { + "epoch": 2.159111111111111, + "grad_norm": 2.1372923851013184, + "learning_rate": 0.00011373665480427047, + "loss": 1.8971, + "step": 4858 + }, + { + "epoch": 2.1595555555555555, + "grad_norm": 2.200821876525879, + "learning_rate": 0.00011371886120996442, + "loss": 1.8821, + "step": 4859 + }, + { + "epoch": 2.16, + "grad_norm": 2.4604291915893555, + "learning_rate": 0.00011370106761565838, + "loss": 1.9079, + "step": 4860 + }, + { + "epoch": 2.1604444444444444, + "grad_norm": 2.0270156860351562, + "learning_rate": 0.00011368327402135232, + "loss": 1.4849, + "step": 4861 + }, + { + "epoch": 2.160888888888889, + "grad_norm": 2.796276807785034, + "learning_rate": 0.00011366548042704628, + "loss": 2.0916, + "step": 4862 + }, + { + "epoch": 2.1613333333333333, + "grad_norm": 2.0537173748016357, + "learning_rate": 0.00011364768683274024, + "loss": 1.7261, + "step": 4863 + }, + { + "epoch": 2.1617777777777776, + "grad_norm": 1.9119226932525635, + "learning_rate": 0.00011362989323843417, + 
"loss": 1.5883, + "step": 4864 + }, + { + "epoch": 2.1622222222222223, + "grad_norm": 2.016460657119751, + "learning_rate": 0.00011361209964412811, + "loss": 2.0473, + "step": 4865 + }, + { + "epoch": 2.1626666666666665, + "grad_norm": 2.056744337081909, + "learning_rate": 0.00011359430604982207, + "loss": 1.5667, + "step": 4866 + }, + { + "epoch": 2.163111111111111, + "grad_norm": 2.0856573581695557, + "learning_rate": 0.00011357651245551602, + "loss": 2.1636, + "step": 4867 + }, + { + "epoch": 2.1635555555555555, + "grad_norm": 2.0556750297546387, + "learning_rate": 0.00011355871886120996, + "loss": 1.6923, + "step": 4868 + }, + { + "epoch": 2.164, + "grad_norm": 1.8435472249984741, + "learning_rate": 0.00011354092526690392, + "loss": 1.4836, + "step": 4869 + }, + { + "epoch": 2.1644444444444444, + "grad_norm": 1.7105246782302856, + "learning_rate": 0.00011352313167259788, + "loss": 1.2481, + "step": 4870 + }, + { + "epoch": 2.164888888888889, + "grad_norm": 1.367600679397583, + "learning_rate": 0.00011350533807829182, + "loss": 0.6025, + "step": 4871 + }, + { + "epoch": 2.1653333333333333, + "grad_norm": 2.135531425476074, + "learning_rate": 0.00011348754448398578, + "loss": 1.6163, + "step": 4872 + }, + { + "epoch": 2.1657777777777776, + "grad_norm": 1.9856308698654175, + "learning_rate": 0.00011346975088967972, + "loss": 1.6504, + "step": 4873 + }, + { + "epoch": 2.1662222222222223, + "grad_norm": 2.070834159851074, + "learning_rate": 0.00011345195729537368, + "loss": 1.7144, + "step": 4874 + }, + { + "epoch": 2.1666666666666665, + "grad_norm": 2.081918716430664, + "learning_rate": 0.00011343416370106763, + "loss": 1.6878, + "step": 4875 + }, + { + "epoch": 2.167111111111111, + "grad_norm": 2.117114305496216, + "learning_rate": 0.00011341637010676156, + "loss": 1.5885, + "step": 4876 + }, + { + "epoch": 2.1675555555555555, + "grad_norm": 2.200413227081299, + "learning_rate": 0.0001133985765124555, + "loss": 1.4892, + "step": 4877 + }, + { + "epoch": 2.168, + "grad_norm": 2.2609503269195557, + "learning_rate": 0.00011338078291814946, + "loss": 1.5828, + "step": 4878 + }, + { + "epoch": 2.1684444444444444, + "grad_norm": 2.2752559185028076, + "learning_rate": 0.00011336298932384342, + "loss": 1.6549, + "step": 4879 + }, + { + "epoch": 2.168888888888889, + "grad_norm": 2.138413667678833, + "learning_rate": 0.00011334519572953736, + "loss": 1.241, + "step": 4880 + }, + { + "epoch": 2.1693333333333333, + "grad_norm": 2.5910072326660156, + "learning_rate": 0.00011332740213523132, + "loss": 1.4842, + "step": 4881 + }, + { + "epoch": 2.1697777777777776, + "grad_norm": 2.1112563610076904, + "learning_rate": 0.00011330960854092527, + "loss": 1.5217, + "step": 4882 + }, + { + "epoch": 2.1702222222222223, + "grad_norm": 2.3650734424591064, + "learning_rate": 0.00011329181494661922, + "loss": 1.6714, + "step": 4883 + }, + { + "epoch": 2.1706666666666665, + "grad_norm": 2.035025119781494, + "learning_rate": 0.00011327402135231317, + "loss": 1.2853, + "step": 4884 + }, + { + "epoch": 2.171111111111111, + "grad_norm": 2.171036958694458, + "learning_rate": 0.00011325622775800713, + "loss": 1.4231, + "step": 4885 + }, + { + "epoch": 2.1715555555555555, + "grad_norm": 3.245302200317383, + "learning_rate": 0.00011323843416370107, + "loss": 1.7397, + "step": 4886 + }, + { + "epoch": 2.172, + "grad_norm": 3.072438955307007, + "learning_rate": 0.00011322064056939503, + "loss": 2.2672, + "step": 4887 + }, + { + "epoch": 2.1724444444444444, + "grad_norm": 2.8883869647979736, + "learning_rate": 
0.00011320284697508899, + "loss": 1.7494, + "step": 4888 + }, + { + "epoch": 2.172888888888889, + "grad_norm": 2.5595476627349854, + "learning_rate": 0.00011318505338078292, + "loss": 1.3625, + "step": 4889 + }, + { + "epoch": 2.1733333333333333, + "grad_norm": 2.532860517501831, + "learning_rate": 0.00011316725978647686, + "loss": 1.0884, + "step": 4890 + }, + { + "epoch": 2.1737777777777776, + "grad_norm": 3.10447359085083, + "learning_rate": 0.00011314946619217082, + "loss": 2.1665, + "step": 4891 + }, + { + "epoch": 2.1742222222222223, + "grad_norm": 2.6869397163391113, + "learning_rate": 0.00011313167259786477, + "loss": 1.644, + "step": 4892 + }, + { + "epoch": 2.1746666666666665, + "grad_norm": 2.8844075202941895, + "learning_rate": 0.00011311387900355871, + "loss": 1.5886, + "step": 4893 + }, + { + "epoch": 2.175111111111111, + "grad_norm": 2.6155810356140137, + "learning_rate": 0.00011309608540925267, + "loss": 1.4173, + "step": 4894 + }, + { + "epoch": 2.1755555555555555, + "grad_norm": 3.163278341293335, + "learning_rate": 0.00011307829181494663, + "loss": 1.5141, + "step": 4895 + }, + { + "epoch": 2.176, + "grad_norm": 2.6014788150787354, + "learning_rate": 0.00011306049822064057, + "loss": 1.3882, + "step": 4896 + }, + { + "epoch": 2.1764444444444444, + "grad_norm": 3.472792387008667, + "learning_rate": 0.00011304270462633453, + "loss": 1.8972, + "step": 4897 + }, + { + "epoch": 2.176888888888889, + "grad_norm": 2.989288568496704, + "learning_rate": 0.00011302491103202848, + "loss": 1.7598, + "step": 4898 + }, + { + "epoch": 2.1773333333333333, + "grad_norm": 3.674435615539551, + "learning_rate": 0.00011300711743772243, + "loss": 1.5909, + "step": 4899 + }, + { + "epoch": 2.1777777777777776, + "grad_norm": 2.7188007831573486, + "learning_rate": 0.00011298932384341638, + "loss": 0.8713, + "step": 4900 + }, + { + "epoch": 2.1782222222222223, + "grad_norm": 1.8986291885375977, + "learning_rate": 0.00011297153024911034, + "loss": 2.2709, + "step": 4901 + }, + { + "epoch": 2.1786666666666665, + "grad_norm": 1.9453548192977905, + "learning_rate": 0.00011295373665480427, + "loss": 2.0065, + "step": 4902 + }, + { + "epoch": 2.179111111111111, + "grad_norm": 1.927510380744934, + "learning_rate": 0.00011293594306049821, + "loss": 2.1494, + "step": 4903 + }, + { + "epoch": 2.1795555555555555, + "grad_norm": 2.2795588970184326, + "learning_rate": 0.00011291814946619217, + "loss": 1.9723, + "step": 4904 + }, + { + "epoch": 2.18, + "grad_norm": 2.4390902519226074, + "learning_rate": 0.00011290035587188612, + "loss": 2.4457, + "step": 4905 + }, + { + "epoch": 2.1804444444444444, + "grad_norm": 2.1050994396209717, + "learning_rate": 0.00011288256227758007, + "loss": 1.3167, + "step": 4906 + }, + { + "epoch": 2.180888888888889, + "grad_norm": 2.186591386795044, + "learning_rate": 0.00011286476868327402, + "loss": 1.9273, + "step": 4907 + }, + { + "epoch": 2.1813333333333333, + "grad_norm": 2.417327404022217, + "learning_rate": 0.00011284697508896798, + "loss": 1.9586, + "step": 4908 + }, + { + "epoch": 2.1817777777777776, + "grad_norm": 2.4069387912750244, + "learning_rate": 0.00011282918149466192, + "loss": 1.3856, + "step": 4909 + }, + { + "epoch": 2.1822222222222223, + "grad_norm": 2.5273990631103516, + "learning_rate": 0.00011281138790035588, + "loss": 1.8896, + "step": 4910 + }, + { + "epoch": 2.1826666666666665, + "grad_norm": 2.1475119590759277, + "learning_rate": 0.00011279359430604984, + "loss": 1.9951, + "step": 4911 + }, + { + "epoch": 2.1831111111111112, + "grad_norm": 
2.5976321697235107, + "learning_rate": 0.00011277580071174378, + "loss": 1.6981, + "step": 4912 + }, + { + "epoch": 2.1835555555555555, + "grad_norm": 1.4629095792770386, + "learning_rate": 0.00011275800711743774, + "loss": 0.9384, + "step": 4913 + }, + { + "epoch": 2.184, + "grad_norm": 2.086716413497925, + "learning_rate": 0.00011274021352313169, + "loss": 1.5494, + "step": 4914 + }, + { + "epoch": 2.1844444444444444, + "grad_norm": 1.9124023914337158, + "learning_rate": 0.00011272241992882562, + "loss": 1.5172, + "step": 4915 + }, + { + "epoch": 2.1848888888888887, + "grad_norm": 2.046513080596924, + "learning_rate": 0.00011270462633451956, + "loss": 1.8116, + "step": 4916 + }, + { + "epoch": 2.1853333333333333, + "grad_norm": 2.0074009895324707, + "learning_rate": 0.00011268683274021352, + "loss": 1.2871, + "step": 4917 + }, + { + "epoch": 2.1857777777777776, + "grad_norm": 2.108560800552368, + "learning_rate": 0.00011266903914590748, + "loss": 1.8694, + "step": 4918 + }, + { + "epoch": 2.1862222222222223, + "grad_norm": 2.6234912872314453, + "learning_rate": 0.00011265124555160142, + "loss": 1.577, + "step": 4919 + }, + { + "epoch": 2.1866666666666665, + "grad_norm": 2.239198684692383, + "learning_rate": 0.00011263345195729538, + "loss": 2.0347, + "step": 4920 + }, + { + "epoch": 2.1871111111111112, + "grad_norm": 2.2819833755493164, + "learning_rate": 0.00011261565836298933, + "loss": 2.3103, + "step": 4921 + }, + { + "epoch": 2.1875555555555555, + "grad_norm": 2.2331910133361816, + "learning_rate": 0.00011259786476868328, + "loss": 1.6046, + "step": 4922 + }, + { + "epoch": 2.188, + "grad_norm": 2.1115944385528564, + "learning_rate": 0.00011258007117437723, + "loss": 1.8757, + "step": 4923 + }, + { + "epoch": 2.1884444444444444, + "grad_norm": 2.104067087173462, + "learning_rate": 0.00011256227758007119, + "loss": 1.6774, + "step": 4924 + }, + { + "epoch": 2.188888888888889, + "grad_norm": 2.385721206665039, + "learning_rate": 0.00011254448398576513, + "loss": 2.1468, + "step": 4925 + }, + { + "epoch": 2.1893333333333334, + "grad_norm": 2.554507255554199, + "learning_rate": 0.00011252669039145909, + "loss": 2.0144, + "step": 4926 + }, + { + "epoch": 2.1897777777777776, + "grad_norm": 2.23824143409729, + "learning_rate": 0.00011250889679715305, + "loss": 1.7106, + "step": 4927 + }, + { + "epoch": 2.1902222222222223, + "grad_norm": 2.3488128185272217, + "learning_rate": 0.00011249110320284698, + "loss": 1.8532, + "step": 4928 + }, + { + "epoch": 2.1906666666666665, + "grad_norm": 2.960286855697632, + "learning_rate": 0.00011247330960854092, + "loss": 1.6043, + "step": 4929 + }, + { + "epoch": 2.1911111111111112, + "grad_norm": 2.2605531215667725, + "learning_rate": 0.00011245551601423487, + "loss": 1.4308, + "step": 4930 + }, + { + "epoch": 2.1915555555555555, + "grad_norm": 2.7191028594970703, + "learning_rate": 0.00011243772241992883, + "loss": 1.4713, + "step": 4931 + }, + { + "epoch": 2.192, + "grad_norm": 2.499677896499634, + "learning_rate": 0.00011241992882562277, + "loss": 1.4607, + "step": 4932 + }, + { + "epoch": 2.1924444444444444, + "grad_norm": 2.556260347366333, + "learning_rate": 0.00011240213523131673, + "loss": 1.4645, + "step": 4933 + }, + { + "epoch": 2.1928888888888887, + "grad_norm": 2.443964958190918, + "learning_rate": 0.00011238434163701069, + "loss": 1.3057, + "step": 4934 + }, + { + "epoch": 2.1933333333333334, + "grad_norm": 2.543210744857788, + "learning_rate": 0.00011236654804270463, + "loss": 2.0736, + "step": 4935 + }, + { + "epoch": 2.1937777777777776, 
+ "grad_norm": 2.3704352378845215, + "learning_rate": 0.00011234875444839859, + "loss": 1.3827, + "step": 4936 + }, + { + "epoch": 2.1942222222222223, + "grad_norm": 2.49611234664917, + "learning_rate": 0.00011233096085409254, + "loss": 1.7015, + "step": 4937 + }, + { + "epoch": 2.1946666666666665, + "grad_norm": 2.5088584423065186, + "learning_rate": 0.00011231316725978649, + "loss": 1.7995, + "step": 4938 + }, + { + "epoch": 2.1951111111111112, + "grad_norm": 2.3908095359802246, + "learning_rate": 0.00011229537366548044, + "loss": 1.5912, + "step": 4939 + }, + { + "epoch": 2.1955555555555555, + "grad_norm": 2.597078323364258, + "learning_rate": 0.0001122775800711744, + "loss": 1.5497, + "step": 4940 + }, + { + "epoch": 2.196, + "grad_norm": 2.943357467651367, + "learning_rate": 0.00011225978647686833, + "loss": 2.0209, + "step": 4941 + }, + { + "epoch": 2.1964444444444444, + "grad_norm": 3.4762535095214844, + "learning_rate": 0.00011224199288256227, + "loss": 2.0581, + "step": 4942 + }, + { + "epoch": 2.196888888888889, + "grad_norm": 2.8447139263153076, + "learning_rate": 0.00011222419928825623, + "loss": 1.42, + "step": 4943 + }, + { + "epoch": 2.1973333333333334, + "grad_norm": 2.7726380825042725, + "learning_rate": 0.00011220640569395018, + "loss": 1.608, + "step": 4944 + }, + { + "epoch": 2.1977777777777776, + "grad_norm": 2.942455768585205, + "learning_rate": 0.00011218861209964413, + "loss": 1.6582, + "step": 4945 + }, + { + "epoch": 2.1982222222222223, + "grad_norm": 3.3560426235198975, + "learning_rate": 0.00011217081850533808, + "loss": 1.897, + "step": 4946 + }, + { + "epoch": 2.1986666666666665, + "grad_norm": 3.4621267318725586, + "learning_rate": 0.00011215302491103204, + "loss": 2.2004, + "step": 4947 + }, + { + "epoch": 2.1991111111111112, + "grad_norm": 2.5123441219329834, + "learning_rate": 0.00011213523131672598, + "loss": 1.0752, + "step": 4948 + }, + { + "epoch": 2.1995555555555555, + "grad_norm": 1.6749694347381592, + "learning_rate": 0.00011211743772241994, + "loss": 0.6226, + "step": 4949 + }, + { + "epoch": 2.2, + "grad_norm": 3.319334030151367, + "learning_rate": 0.0001120996441281139, + "loss": 1.4356, + "step": 4950 + }, + { + "epoch": 2.2004444444444444, + "grad_norm": 1.5259418487548828, + "learning_rate": 0.00011208185053380784, + "loss": 1.6631, + "step": 4951 + }, + { + "epoch": 2.2008888888888887, + "grad_norm": 1.3049883842468262, + "learning_rate": 0.0001120640569395018, + "loss": 0.9819, + "step": 4952 + }, + { + "epoch": 2.2013333333333334, + "grad_norm": 2.1342673301696777, + "learning_rate": 0.00011204626334519575, + "loss": 1.929, + "step": 4953 + }, + { + "epoch": 2.2017777777777776, + "grad_norm": 1.989028811454773, + "learning_rate": 0.00011202846975088968, + "loss": 2.0505, + "step": 4954 + }, + { + "epoch": 2.2022222222222223, + "grad_norm": 2.049135208129883, + "learning_rate": 0.00011201067615658362, + "loss": 1.8818, + "step": 4955 + }, + { + "epoch": 2.2026666666666666, + "grad_norm": 2.263387441635132, + "learning_rate": 0.00011199288256227758, + "loss": 2.0014, + "step": 4956 + }, + { + "epoch": 2.2031111111111112, + "grad_norm": 2.11501407623291, + "learning_rate": 0.00011197508896797154, + "loss": 1.9057, + "step": 4957 + }, + { + "epoch": 2.2035555555555555, + "grad_norm": 1.9304909706115723, + "learning_rate": 0.00011195729537366548, + "loss": 1.867, + "step": 4958 + }, + { + "epoch": 2.204, + "grad_norm": 2.2471048831939697, + "learning_rate": 0.00011193950177935944, + "loss": 1.7576, + "step": 4959 + }, + { + "epoch": 
2.2044444444444444, + "grad_norm": 2.4755563735961914, + "learning_rate": 0.0001119217081850534, + "loss": 1.6393, + "step": 4960 + }, + { + "epoch": 2.204888888888889, + "grad_norm": 2.0645923614501953, + "learning_rate": 0.00011190391459074734, + "loss": 1.4799, + "step": 4961 + }, + { + "epoch": 2.2053333333333334, + "grad_norm": 2.2446157932281494, + "learning_rate": 0.00011188612099644129, + "loss": 2.2068, + "step": 4962 + }, + { + "epoch": 2.2057777777777776, + "grad_norm": 2.309330701828003, + "learning_rate": 0.00011186832740213524, + "loss": 2.1778, + "step": 4963 + }, + { + "epoch": 2.2062222222222223, + "grad_norm": 1.9780677556991577, + "learning_rate": 0.00011185053380782919, + "loss": 1.8793, + "step": 4964 + }, + { + "epoch": 2.2066666666666666, + "grad_norm": 2.061521530151367, + "learning_rate": 0.00011183274021352315, + "loss": 1.2263, + "step": 4965 + }, + { + "epoch": 2.2071111111111112, + "grad_norm": 2.245497703552246, + "learning_rate": 0.00011181494661921709, + "loss": 1.9673, + "step": 4966 + }, + { + "epoch": 2.2075555555555555, + "grad_norm": 2.020643472671509, + "learning_rate": 0.00011179715302491102, + "loss": 1.9154, + "step": 4967 + }, + { + "epoch": 2.208, + "grad_norm": 2.15903902053833, + "learning_rate": 0.00011177935943060498, + "loss": 1.8257, + "step": 4968 + }, + { + "epoch": 2.2084444444444444, + "grad_norm": 2.490280866622925, + "learning_rate": 0.00011176156583629893, + "loss": 1.9819, + "step": 4969 + }, + { + "epoch": 2.2088888888888887, + "grad_norm": 2.4871490001678467, + "learning_rate": 0.00011174377224199288, + "loss": 1.946, + "step": 4970 + }, + { + "epoch": 2.2093333333333334, + "grad_norm": 2.053659200668335, + "learning_rate": 0.00011172597864768683, + "loss": 1.8871, + "step": 4971 + }, + { + "epoch": 2.2097777777777776, + "grad_norm": 2.2057316303253174, + "learning_rate": 0.00011170818505338079, + "loss": 1.5619, + "step": 4972 + }, + { + "epoch": 2.2102222222222223, + "grad_norm": 1.9269914627075195, + "learning_rate": 0.00011169039145907473, + "loss": 1.3407, + "step": 4973 + }, + { + "epoch": 2.2106666666666666, + "grad_norm": 2.1930150985717773, + "learning_rate": 0.00011167259786476869, + "loss": 1.7184, + "step": 4974 + }, + { + "epoch": 2.2111111111111112, + "grad_norm": 2.1484861373901367, + "learning_rate": 0.00011165480427046265, + "loss": 1.8792, + "step": 4975 + }, + { + "epoch": 2.2115555555555555, + "grad_norm": 2.3635623455047607, + "learning_rate": 0.00011163701067615659, + "loss": 1.8599, + "step": 4976 + }, + { + "epoch": 2.212, + "grad_norm": 2.525038003921509, + "learning_rate": 0.00011161921708185055, + "loss": 1.9892, + "step": 4977 + }, + { + "epoch": 2.2124444444444444, + "grad_norm": 2.2718634605407715, + "learning_rate": 0.0001116014234875445, + "loss": 1.6329, + "step": 4978 + }, + { + "epoch": 2.2128888888888887, + "grad_norm": 1.9843499660491943, + "learning_rate": 0.00011158362989323844, + "loss": 1.2032, + "step": 4979 + }, + { + "epoch": 2.2133333333333334, + "grad_norm": 2.85638165473938, + "learning_rate": 0.00011156583629893237, + "loss": 2.0707, + "step": 4980 + }, + { + "epoch": 2.2137777777777776, + "grad_norm": 2.3343589305877686, + "learning_rate": 0.00011154804270462633, + "loss": 1.3783, + "step": 4981 + }, + { + "epoch": 2.2142222222222223, + "grad_norm": 2.3680782318115234, + "learning_rate": 0.00011153024911032029, + "loss": 1.4898, + "step": 4982 + }, + { + "epoch": 2.2146666666666666, + "grad_norm": 1.55228853225708, + "learning_rate": 0.00011151245551601423, + "loss": 0.6674, + 
"step": 4983 + }, + { + "epoch": 2.2151111111111113, + "grad_norm": 2.7255640029907227, + "learning_rate": 0.00011149466192170819, + "loss": 1.7476, + "step": 4984 + }, + { + "epoch": 2.2155555555555555, + "grad_norm": 2.6943490505218506, + "learning_rate": 0.00011147686832740214, + "loss": 1.5847, + "step": 4985 + }, + { + "epoch": 2.216, + "grad_norm": 2.6895389556884766, + "learning_rate": 0.00011145907473309609, + "loss": 1.8233, + "step": 4986 + }, + { + "epoch": 2.2164444444444444, + "grad_norm": 2.645171642303467, + "learning_rate": 0.00011144128113879004, + "loss": 1.7922, + "step": 4987 + }, + { + "epoch": 2.2168888888888887, + "grad_norm": 3.0169036388397217, + "learning_rate": 0.000111423487544484, + "loss": 1.8521, + "step": 4988 + }, + { + "epoch": 2.2173333333333334, + "grad_norm": 2.563162088394165, + "learning_rate": 0.00011140569395017794, + "loss": 1.654, + "step": 4989 + }, + { + "epoch": 2.2177777777777776, + "grad_norm": 2.9162724018096924, + "learning_rate": 0.0001113879003558719, + "loss": 1.8161, + "step": 4990 + }, + { + "epoch": 2.2182222222222223, + "grad_norm": 2.8812615871429443, + "learning_rate": 0.00011137010676156586, + "loss": 1.8481, + "step": 4991 + }, + { + "epoch": 2.2186666666666666, + "grad_norm": 3.00016188621521, + "learning_rate": 0.00011135231316725978, + "loss": 1.7091, + "step": 4992 + }, + { + "epoch": 2.2191111111111113, + "grad_norm": 2.6278302669525146, + "learning_rate": 0.00011133451957295373, + "loss": 1.7687, + "step": 4993 + }, + { + "epoch": 2.2195555555555555, + "grad_norm": 2.530658006668091, + "learning_rate": 0.00011131672597864768, + "loss": 1.6654, + "step": 4994 + }, + { + "epoch": 2.22, + "grad_norm": 2.8739798069000244, + "learning_rate": 0.00011129893238434164, + "loss": 1.3435, + "step": 4995 + }, + { + "epoch": 2.2204444444444444, + "grad_norm": 2.9799602031707764, + "learning_rate": 0.00011128113879003558, + "loss": 1.642, + "step": 4996 + }, + { + "epoch": 2.2208888888888887, + "grad_norm": 2.8478050231933594, + "learning_rate": 0.00011126334519572954, + "loss": 1.6135, + "step": 4997 + }, + { + "epoch": 2.2213333333333334, + "grad_norm": 3.293814182281494, + "learning_rate": 0.0001112455516014235, + "loss": 1.8149, + "step": 4998 + }, + { + "epoch": 2.2217777777777776, + "grad_norm": 2.8683934211730957, + "learning_rate": 0.00011122775800711744, + "loss": 1.3027, + "step": 4999 + }, + { + "epoch": 2.2222222222222223, + "grad_norm": 3.7593607902526855, + "learning_rate": 0.0001112099644128114, + "loss": 1.6055, + "step": 5000 + }, + { + "epoch": 2.2226666666666666, + "grad_norm": 1.727084755897522, + "learning_rate": 0.00011119217081850535, + "loss": 1.8026, + "step": 5001 + }, + { + "epoch": 2.2231111111111113, + "grad_norm": 1.792447805404663, + "learning_rate": 0.0001111743772241993, + "loss": 1.9238, + "step": 5002 + }, + { + "epoch": 2.2235555555555555, + "grad_norm": 1.3714830875396729, + "learning_rate": 0.00011115658362989325, + "loss": 0.1835, + "step": 5003 + }, + { + "epoch": 2.224, + "grad_norm": 2.177612066268921, + "learning_rate": 0.00011113879003558721, + "loss": 2.0674, + "step": 5004 + }, + { + "epoch": 2.2244444444444444, + "grad_norm": 2.2636287212371826, + "learning_rate": 0.00011112099644128114, + "loss": 2.0529, + "step": 5005 + }, + { + "epoch": 2.2248888888888887, + "grad_norm": 2.174473285675049, + "learning_rate": 0.00011110320284697508, + "loss": 1.7144, + "step": 5006 + }, + { + "epoch": 2.2253333333333334, + "grad_norm": 2.079563617706299, + "learning_rate": 0.00011108540925266904, + "loss": 
1.7053, + "step": 5007 + }, + { + "epoch": 2.2257777777777776, + "grad_norm": 2.0495669841766357, + "learning_rate": 0.000111067615658363, + "loss": 1.4467, + "step": 5008 + }, + { + "epoch": 2.2262222222222223, + "grad_norm": 2.1744115352630615, + "learning_rate": 0.00011104982206405694, + "loss": 1.6289, + "step": 5009 + }, + { + "epoch": 2.2266666666666666, + "grad_norm": 2.245788335800171, + "learning_rate": 0.00011103202846975089, + "loss": 2.0664, + "step": 5010 + }, + { + "epoch": 2.2271111111111113, + "grad_norm": 2.4176290035247803, + "learning_rate": 0.00011101423487544485, + "loss": 1.986, + "step": 5011 + }, + { + "epoch": 2.2275555555555555, + "grad_norm": 2.397927761077881, + "learning_rate": 0.00011099644128113879, + "loss": 1.8439, + "step": 5012 + }, + { + "epoch": 2.228, + "grad_norm": 2.479949951171875, + "learning_rate": 0.00011097864768683275, + "loss": 1.7942, + "step": 5013 + }, + { + "epoch": 2.2284444444444444, + "grad_norm": 2.213899850845337, + "learning_rate": 0.0001109608540925267, + "loss": 1.8292, + "step": 5014 + }, + { + "epoch": 2.2288888888888887, + "grad_norm": 2.117074728012085, + "learning_rate": 0.00011094306049822065, + "loss": 1.8464, + "step": 5015 + }, + { + "epoch": 2.2293333333333334, + "grad_norm": 2.4568095207214355, + "learning_rate": 0.0001109252669039146, + "loss": 1.7872, + "step": 5016 + }, + { + "epoch": 2.2297777777777776, + "grad_norm": 2.0903635025024414, + "learning_rate": 0.00011090747330960856, + "loss": 1.4369, + "step": 5017 + }, + { + "epoch": 2.2302222222222223, + "grad_norm": 2.1136090755462646, + "learning_rate": 0.00011088967971530249, + "loss": 1.8091, + "step": 5018 + }, + { + "epoch": 2.2306666666666666, + "grad_norm": 1.9191817045211792, + "learning_rate": 0.00011087188612099643, + "loss": 1.9579, + "step": 5019 + }, + { + "epoch": 2.2311111111111113, + "grad_norm": 2.3440165519714355, + "learning_rate": 0.00011085409252669039, + "loss": 1.9823, + "step": 5020 + }, + { + "epoch": 2.2315555555555555, + "grad_norm": 2.085297107696533, + "learning_rate": 0.00011083629893238435, + "loss": 1.8584, + "step": 5021 + }, + { + "epoch": 2.232, + "grad_norm": 2.513420820236206, + "learning_rate": 0.00011081850533807829, + "loss": 2.2256, + "step": 5022 + }, + { + "epoch": 2.2324444444444445, + "grad_norm": 2.3045380115509033, + "learning_rate": 0.00011080071174377225, + "loss": 1.4638, + "step": 5023 + }, + { + "epoch": 2.2328888888888887, + "grad_norm": 2.3060178756713867, + "learning_rate": 0.0001107829181494662, + "loss": 1.8818, + "step": 5024 + }, + { + "epoch": 2.2333333333333334, + "grad_norm": 2.0273396968841553, + "learning_rate": 0.00011076512455516015, + "loss": 1.5125, + "step": 5025 + }, + { + "epoch": 2.2337777777777776, + "grad_norm": 2.183797597885132, + "learning_rate": 0.0001107473309608541, + "loss": 1.4318, + "step": 5026 + }, + { + "epoch": 2.2342222222222223, + "grad_norm": 2.43717098236084, + "learning_rate": 0.00011072953736654806, + "loss": 1.7804, + "step": 5027 + }, + { + "epoch": 2.2346666666666666, + "grad_norm": 2.276435375213623, + "learning_rate": 0.000110711743772242, + "loss": 1.5594, + "step": 5028 + }, + { + "epoch": 2.2351111111111113, + "grad_norm": 2.2554962635040283, + "learning_rate": 0.00011069395017793596, + "loss": 1.4094, + "step": 5029 + }, + { + "epoch": 2.2355555555555555, + "grad_norm": 2.3153090476989746, + "learning_rate": 0.00011067615658362991, + "loss": 1.3073, + "step": 5030 + }, + { + "epoch": 2.2359999999999998, + "grad_norm": 2.353789806365967, + "learning_rate": 
0.00011065836298932384, + "loss": 1.5179, + "step": 5031 + }, + { + "epoch": 2.2364444444444445, + "grad_norm": 2.6622314453125, + "learning_rate": 0.00011064056939501779, + "loss": 1.9487, + "step": 5032 + }, + { + "epoch": 2.2368888888888887, + "grad_norm": 2.1066958904266357, + "learning_rate": 0.00011062277580071174, + "loss": 1.3317, + "step": 5033 + }, + { + "epoch": 2.2373333333333334, + "grad_norm": 2.2060763835906982, + "learning_rate": 0.0001106049822064057, + "loss": 0.9029, + "step": 5034 + }, + { + "epoch": 2.2377777777777776, + "grad_norm": 0.20760299265384674, + "learning_rate": 0.00011058718861209964, + "loss": 0.037, + "step": 5035 + }, + { + "epoch": 2.2382222222222223, + "grad_norm": 2.2269294261932373, + "learning_rate": 0.0001105693950177936, + "loss": 1.4259, + "step": 5036 + }, + { + "epoch": 2.2386666666666666, + "grad_norm": 2.511350631713867, + "learning_rate": 0.00011055160142348756, + "loss": 2.0862, + "step": 5037 + }, + { + "epoch": 2.2391111111111113, + "grad_norm": 2.4969749450683594, + "learning_rate": 0.0001105338078291815, + "loss": 1.4302, + "step": 5038 + }, + { + "epoch": 2.2395555555555555, + "grad_norm": 2.8733372688293457, + "learning_rate": 0.00011051601423487546, + "loss": 1.4739, + "step": 5039 + }, + { + "epoch": 2.24, + "grad_norm": 2.5586845874786377, + "learning_rate": 0.00011049822064056941, + "loss": 1.6102, + "step": 5040 + }, + { + "epoch": 2.2404444444444445, + "grad_norm": 3.037733554840088, + "learning_rate": 0.00011048042704626335, + "loss": 1.6267, + "step": 5041 + }, + { + "epoch": 2.2408888888888887, + "grad_norm": 2.6196389198303223, + "learning_rate": 0.00011046263345195731, + "loss": 1.3955, + "step": 5042 + }, + { + "epoch": 2.2413333333333334, + "grad_norm": 2.608210325241089, + "learning_rate": 0.00011044483985765127, + "loss": 1.8584, + "step": 5043 + }, + { + "epoch": 2.2417777777777776, + "grad_norm": 2.4374005794525146, + "learning_rate": 0.0001104270462633452, + "loss": 1.1662, + "step": 5044 + }, + { + "epoch": 2.2422222222222223, + "grad_norm": 2.6647913455963135, + "learning_rate": 0.00011040925266903914, + "loss": 1.7048, + "step": 5045 + }, + { + "epoch": 2.2426666666666666, + "grad_norm": 3.091082811355591, + "learning_rate": 0.0001103914590747331, + "loss": 1.6592, + "step": 5046 + }, + { + "epoch": 2.2431111111111113, + "grad_norm": 3.078878164291382, + "learning_rate": 0.00011037366548042705, + "loss": 1.6601, + "step": 5047 + }, + { + "epoch": 2.2435555555555555, + "grad_norm": 2.757037401199341, + "learning_rate": 0.000110355871886121, + "loss": 1.3657, + "step": 5048 + }, + { + "epoch": 2.2439999999999998, + "grad_norm": 4.261408805847168, + "learning_rate": 0.00011033807829181495, + "loss": 1.2643, + "step": 5049 + }, + { + "epoch": 2.2444444444444445, + "grad_norm": 4.232182025909424, + "learning_rate": 0.00011032028469750891, + "loss": 1.125, + "step": 5050 + }, + { + "epoch": 2.2448888888888887, + "grad_norm": 1.8265682458877563, + "learning_rate": 0.00011030249110320285, + "loss": 2.396, + "step": 5051 + }, + { + "epoch": 2.2453333333333334, + "grad_norm": 2.1252670288085938, + "learning_rate": 0.00011028469750889681, + "loss": 2.0516, + "step": 5052 + }, + { + "epoch": 2.2457777777777777, + "grad_norm": 2.073094129562378, + "learning_rate": 0.00011026690391459075, + "loss": 2.2738, + "step": 5053 + }, + { + "epoch": 2.2462222222222223, + "grad_norm": 2.2928173542022705, + "learning_rate": 0.00011024911032028471, + "loss": 1.8305, + "step": 5054 + }, + { + "epoch": 2.2466666666666666, + "grad_norm": 
2.276385545730591, + "learning_rate": 0.00011023131672597866, + "loss": 2.1467, + "step": 5055 + }, + { + "epoch": 2.2471111111111113, + "grad_norm": 1.8952126502990723, + "learning_rate": 0.00011021352313167261, + "loss": 1.6379, + "step": 5056 + }, + { + "epoch": 2.2475555555555555, + "grad_norm": 2.017606735229492, + "learning_rate": 0.00011019572953736654, + "loss": 2.0341, + "step": 5057 + }, + { + "epoch": 2.248, + "grad_norm": 2.055772542953491, + "learning_rate": 0.00011017793594306049, + "loss": 2.0079, + "step": 5058 + }, + { + "epoch": 2.2484444444444445, + "grad_norm": 2.386974334716797, + "learning_rate": 0.00011016014234875445, + "loss": 2.0906, + "step": 5059 + }, + { + "epoch": 2.2488888888888887, + "grad_norm": 2.5190494060516357, + "learning_rate": 0.00011014234875444839, + "loss": 0.848, + "step": 5060 + }, + { + "epoch": 2.2493333333333334, + "grad_norm": 1.9555598497390747, + "learning_rate": 0.00011012455516014235, + "loss": 1.8228, + "step": 5061 + }, + { + "epoch": 2.2497777777777777, + "grad_norm": 2.0160269737243652, + "learning_rate": 0.0001101067615658363, + "loss": 1.8248, + "step": 5062 + }, + { + "epoch": 2.2502222222222223, + "grad_norm": 1.9360467195510864, + "learning_rate": 0.00011008896797153025, + "loss": 1.5145, + "step": 5063 + }, + { + "epoch": 2.2506666666666666, + "grad_norm": 2.0160787105560303, + "learning_rate": 0.0001100711743772242, + "loss": 1.72, + "step": 5064 + }, + { + "epoch": 2.2511111111111113, + "grad_norm": 2.07167649269104, + "learning_rate": 0.00011005338078291816, + "loss": 1.5055, + "step": 5065 + }, + { + "epoch": 2.2515555555555555, + "grad_norm": 2.21230149269104, + "learning_rate": 0.0001100355871886121, + "loss": 1.9066, + "step": 5066 + }, + { + "epoch": 2.252, + "grad_norm": 1.537184476852417, + "learning_rate": 0.00011001779359430606, + "loss": 0.8466, + "step": 5067 + }, + { + "epoch": 2.2524444444444445, + "grad_norm": 1.7883505821228027, + "learning_rate": 0.00011000000000000002, + "loss": 1.1587, + "step": 5068 + }, + { + "epoch": 2.2528888888888887, + "grad_norm": 2.396481990814209, + "learning_rate": 0.00010998220640569396, + "loss": 2.3124, + "step": 5069 + }, + { + "epoch": 2.2533333333333334, + "grad_norm": 2.4314522743225098, + "learning_rate": 0.00010996441281138789, + "loss": 1.9213, + "step": 5070 + }, + { + "epoch": 2.2537777777777777, + "grad_norm": 2.3176066875457764, + "learning_rate": 0.00010994661921708185, + "loss": 1.7673, + "step": 5071 + }, + { + "epoch": 2.2542222222222223, + "grad_norm": 2.6162097454071045, + "learning_rate": 0.0001099288256227758, + "loss": 1.8331, + "step": 5072 + }, + { + "epoch": 2.2546666666666666, + "grad_norm": 2.079402446746826, + "learning_rate": 0.00010991103202846975, + "loss": 1.7033, + "step": 5073 + }, + { + "epoch": 2.2551111111111113, + "grad_norm": 2.6413557529449463, + "learning_rate": 0.0001098932384341637, + "loss": 2.1742, + "step": 5074 + }, + { + "epoch": 2.2555555555555555, + "grad_norm": 2.4262351989746094, + "learning_rate": 0.00010987544483985766, + "loss": 1.8914, + "step": 5075 + }, + { + "epoch": 2.2560000000000002, + "grad_norm": 2.247837543487549, + "learning_rate": 0.0001098576512455516, + "loss": 1.7544, + "step": 5076 + }, + { + "epoch": 2.2564444444444445, + "grad_norm": 2.4542994499206543, + "learning_rate": 0.00010983985765124556, + "loss": 1.5624, + "step": 5077 + }, + { + "epoch": 2.2568888888888887, + "grad_norm": 2.1061196327209473, + "learning_rate": 0.00010982206405693951, + "loss": 1.4963, + "step": 5078 + }, + { + "epoch": 
2.2573333333333334, + "grad_norm": 2.599663257598877, + "learning_rate": 0.00010980427046263346, + "loss": 2.1344, + "step": 5079 + }, + { + "epoch": 2.2577777777777777, + "grad_norm": 2.2885448932647705, + "learning_rate": 0.00010978647686832741, + "loss": 1.6295, + "step": 5080 + }, + { + "epoch": 2.2582222222222224, + "grad_norm": 2.3721494674682617, + "learning_rate": 0.00010976868327402137, + "loss": 1.7371, + "step": 5081 + }, + { + "epoch": 2.2586666666666666, + "grad_norm": 2.5894291400909424, + "learning_rate": 0.00010975088967971531, + "loss": 1.7291, + "step": 5082 + }, + { + "epoch": 2.2591111111111113, + "grad_norm": 2.5275352001190186, + "learning_rate": 0.00010973309608540924, + "loss": 1.6438, + "step": 5083 + }, + { + "epoch": 2.2595555555555555, + "grad_norm": 2.664001941680908, + "learning_rate": 0.0001097153024911032, + "loss": 1.561, + "step": 5084 + }, + { + "epoch": 2.26, + "grad_norm": 2.6688332557678223, + "learning_rate": 0.00010969750889679716, + "loss": 1.6883, + "step": 5085 + }, + { + "epoch": 2.2604444444444445, + "grad_norm": 3.350391387939453, + "learning_rate": 0.0001096797153024911, + "loss": 1.7163, + "step": 5086 + }, + { + "epoch": 2.2608888888888887, + "grad_norm": 2.204253911972046, + "learning_rate": 0.00010966192170818506, + "loss": 1.3111, + "step": 5087 + }, + { + "epoch": 2.2613333333333334, + "grad_norm": 2.883998394012451, + "learning_rate": 0.00010964412811387901, + "loss": 2.3049, + "step": 5088 + }, + { + "epoch": 2.2617777777777777, + "grad_norm": 3.0937411785125732, + "learning_rate": 0.00010962633451957295, + "loss": 2.2407, + "step": 5089 + }, + { + "epoch": 2.2622222222222224, + "grad_norm": 2.6656394004821777, + "learning_rate": 0.00010960854092526691, + "loss": 1.4929, + "step": 5090 + }, + { + "epoch": 2.2626666666666666, + "grad_norm": 2.6025304794311523, + "learning_rate": 0.00010959074733096087, + "loss": 1.6213, + "step": 5091 + }, + { + "epoch": 2.2631111111111113, + "grad_norm": 2.685457706451416, + "learning_rate": 0.00010957295373665481, + "loss": 1.5779, + "step": 5092 + }, + { + "epoch": 2.2635555555555555, + "grad_norm": 2.978306531906128, + "learning_rate": 0.00010955516014234877, + "loss": 1.9549, + "step": 5093 + }, + { + "epoch": 2.2640000000000002, + "grad_norm": 3.0156309604644775, + "learning_rate": 0.00010953736654804272, + "loss": 1.837, + "step": 5094 + }, + { + "epoch": 2.2644444444444445, + "grad_norm": 2.647887706756592, + "learning_rate": 0.00010951957295373667, + "loss": 1.496, + "step": 5095 + }, + { + "epoch": 2.2648888888888887, + "grad_norm": 3.1577885150909424, + "learning_rate": 0.0001095017793594306, + "loss": 1.7638, + "step": 5096 + }, + { + "epoch": 2.2653333333333334, + "grad_norm": 3.2039244174957275, + "learning_rate": 0.00010948398576512455, + "loss": 1.8427, + "step": 5097 + }, + { + "epoch": 2.2657777777777777, + "grad_norm": 3.6146154403686523, + "learning_rate": 0.00010946619217081851, + "loss": 1.961, + "step": 5098 + }, + { + "epoch": 2.2662222222222224, + "grad_norm": 2.9038968086242676, + "learning_rate": 0.00010944839857651245, + "loss": 1.731, + "step": 5099 + }, + { + "epoch": 2.2666666666666666, + "grad_norm": 1.7584317922592163, + "learning_rate": 0.00010943060498220641, + "loss": 0.5946, + "step": 5100 + }, + { + "epoch": 2.2671111111111113, + "grad_norm": 1.3620789051055908, + "learning_rate": 0.00010941281138790037, + "loss": 1.1835, + "step": 5101 + }, + { + "epoch": 2.2675555555555555, + "grad_norm": 1.787398338317871, + "learning_rate": 0.00010939501779359431, + "loss": 
2.3021, + "step": 5102 + }, + { + "epoch": 2.268, + "grad_norm": 2.0760881900787354, + "learning_rate": 0.00010937722419928826, + "loss": 2.3895, + "step": 5103 + }, + { + "epoch": 2.2684444444444445, + "grad_norm": 1.6183782815933228, + "learning_rate": 0.00010935943060498222, + "loss": 1.7555, + "step": 5104 + }, + { + "epoch": 2.2688888888888887, + "grad_norm": 2.2935304641723633, + "learning_rate": 0.00010934163701067616, + "loss": 2.1392, + "step": 5105 + }, + { + "epoch": 2.2693333333333334, + "grad_norm": 2.3295974731445312, + "learning_rate": 0.00010932384341637012, + "loss": 1.9799, + "step": 5106 + }, + { + "epoch": 2.2697777777777777, + "grad_norm": 2.144185781478882, + "learning_rate": 0.00010930604982206408, + "loss": 1.6646, + "step": 5107 + }, + { + "epoch": 2.2702222222222224, + "grad_norm": 1.8681271076202393, + "learning_rate": 0.000109288256227758, + "loss": 1.7238, + "step": 5108 + }, + { + "epoch": 2.2706666666666666, + "grad_norm": 1.9623098373413086, + "learning_rate": 0.00010927046263345195, + "loss": 1.5286, + "step": 5109 + }, + { + "epoch": 2.2711111111111113, + "grad_norm": 2.0515475273132324, + "learning_rate": 0.0001092526690391459, + "loss": 1.853, + "step": 5110 + }, + { + "epoch": 2.2715555555555556, + "grad_norm": 2.088438034057617, + "learning_rate": 0.00010923487544483986, + "loss": 1.907, + "step": 5111 + }, + { + "epoch": 2.2720000000000002, + "grad_norm": 2.4442410469055176, + "learning_rate": 0.0001092170818505338, + "loss": 2.044, + "step": 5112 + }, + { + "epoch": 2.2724444444444445, + "grad_norm": 2.190702199935913, + "learning_rate": 0.00010919928825622776, + "loss": 2.2709, + "step": 5113 + }, + { + "epoch": 2.2728888888888887, + "grad_norm": 2.703242778778076, + "learning_rate": 0.00010918149466192172, + "loss": 2.2026, + "step": 5114 + }, + { + "epoch": 2.2733333333333334, + "grad_norm": 2.1000590324401855, + "learning_rate": 0.00010916370106761566, + "loss": 1.552, + "step": 5115 + }, + { + "epoch": 2.2737777777777777, + "grad_norm": 2.5693249702453613, + "learning_rate": 0.00010914590747330962, + "loss": 2.2501, + "step": 5116 + }, + { + "epoch": 2.2742222222222224, + "grad_norm": 2.3385515213012695, + "learning_rate": 0.00010912811387900357, + "loss": 2.283, + "step": 5117 + }, + { + "epoch": 2.2746666666666666, + "grad_norm": 1.8569934368133545, + "learning_rate": 0.00010911032028469752, + "loss": 1.5728, + "step": 5118 + }, + { + "epoch": 2.2751111111111113, + "grad_norm": 2.1497058868408203, + "learning_rate": 0.00010909252669039147, + "loss": 1.6939, + "step": 5119 + }, + { + "epoch": 2.2755555555555556, + "grad_norm": 2.399209499359131, + "learning_rate": 0.00010907473309608543, + "loss": 2.1534, + "step": 5120 + }, + { + "epoch": 2.276, + "grad_norm": 2.810572624206543, + "learning_rate": 0.00010905693950177936, + "loss": 1.8042, + "step": 5121 + }, + { + "epoch": 2.2764444444444445, + "grad_norm": 2.422611713409424, + "learning_rate": 0.0001090391459074733, + "loss": 1.7979, + "step": 5122 + }, + { + "epoch": 2.2768888888888887, + "grad_norm": 2.5435221195220947, + "learning_rate": 0.00010902135231316726, + "loss": 1.8007, + "step": 5123 + }, + { + "epoch": 2.2773333333333334, + "grad_norm": 2.7877004146575928, + "learning_rate": 0.00010900355871886122, + "loss": 1.9082, + "step": 5124 + }, + { + "epoch": 2.2777777777777777, + "grad_norm": 2.4548027515411377, + "learning_rate": 0.00010898576512455516, + "loss": 1.4372, + "step": 5125 + }, + { + "epoch": 2.2782222222222224, + "grad_norm": 2.1647870540618896, + "learning_rate": 
0.00010896797153024911, + "loss": 1.5144, + "step": 5126 + }, + { + "epoch": 2.2786666666666666, + "grad_norm": 2.2049877643585205, + "learning_rate": 0.00010895017793594307, + "loss": 1.278, + "step": 5127 + }, + { + "epoch": 2.279111111111111, + "grad_norm": 2.6493771076202393, + "learning_rate": 0.00010893238434163701, + "loss": 2.1244, + "step": 5128 + }, + { + "epoch": 2.2795555555555556, + "grad_norm": 2.0453760623931885, + "learning_rate": 0.00010891459074733097, + "loss": 0.8898, + "step": 5129 + }, + { + "epoch": 2.2800000000000002, + "grad_norm": 2.6239845752716064, + "learning_rate": 0.00010889679715302493, + "loss": 1.7447, + "step": 5130 + }, + { + "epoch": 2.2804444444444445, + "grad_norm": 1.8818942308425903, + "learning_rate": 0.00010887900355871887, + "loss": 0.8104, + "step": 5131 + }, + { + "epoch": 2.2808888888888887, + "grad_norm": 2.51438045501709, + "learning_rate": 0.00010886120996441283, + "loss": 1.4221, + "step": 5132 + }, + { + "epoch": 2.2813333333333334, + "grad_norm": 2.23756742477417, + "learning_rate": 0.00010884341637010678, + "loss": 1.3229, + "step": 5133 + }, + { + "epoch": 2.2817777777777777, + "grad_norm": 2.578627109527588, + "learning_rate": 0.00010882562277580071, + "loss": 1.679, + "step": 5134 + }, + { + "epoch": 2.2822222222222224, + "grad_norm": 2.4972848892211914, + "learning_rate": 0.00010880782918149466, + "loss": 1.0042, + "step": 5135 + }, + { + "epoch": 2.2826666666666666, + "grad_norm": 2.4864046573638916, + "learning_rate": 0.00010879003558718861, + "loss": 1.6573, + "step": 5136 + }, + { + "epoch": 2.2831111111111113, + "grad_norm": 1.8569329977035522, + "learning_rate": 0.00010877224199288257, + "loss": 0.6083, + "step": 5137 + }, + { + "epoch": 2.2835555555555556, + "grad_norm": 2.6191885471343994, + "learning_rate": 0.00010875444839857651, + "loss": 1.7911, + "step": 5138 + }, + { + "epoch": 2.284, + "grad_norm": 2.874465227127075, + "learning_rate": 0.00010873665480427047, + "loss": 1.9264, + "step": 5139 + }, + { + "epoch": 2.2844444444444445, + "grad_norm": 2.813215494155884, + "learning_rate": 0.00010871886120996441, + "loss": 1.6019, + "step": 5140 + }, + { + "epoch": 2.2848888888888887, + "grad_norm": 2.852952003479004, + "learning_rate": 0.00010870106761565837, + "loss": 2.0002, + "step": 5141 + }, + { + "epoch": 2.2853333333333334, + "grad_norm": 3.3380825519561768, + "learning_rate": 0.00010868327402135232, + "loss": 1.6685, + "step": 5142 + }, + { + "epoch": 2.2857777777777777, + "grad_norm": 2.840141534805298, + "learning_rate": 0.00010866548042704627, + "loss": 1.6248, + "step": 5143 + }, + { + "epoch": 2.2862222222222224, + "grad_norm": 2.88301420211792, + "learning_rate": 0.00010864768683274022, + "loss": 1.3737, + "step": 5144 + }, + { + "epoch": 2.2866666666666666, + "grad_norm": 3.1075048446655273, + "learning_rate": 0.00010862989323843418, + "loss": 1.678, + "step": 5145 + }, + { + "epoch": 2.287111111111111, + "grad_norm": 3.333651542663574, + "learning_rate": 0.00010861209964412812, + "loss": 2.0464, + "step": 5146 + }, + { + "epoch": 2.2875555555555556, + "grad_norm": 4.473095417022705, + "learning_rate": 0.00010859430604982205, + "loss": 1.6364, + "step": 5147 + }, + { + "epoch": 2.288, + "grad_norm": 2.844916343688965, + "learning_rate": 0.00010857651245551601, + "loss": 1.8926, + "step": 5148 + }, + { + "epoch": 2.2884444444444445, + "grad_norm": 5.156615257263184, + "learning_rate": 0.00010855871886120997, + "loss": 1.9084, + "step": 5149 + }, + { + "epoch": 2.2888888888888888, + "grad_norm": 
3.648468494415283, + "learning_rate": 0.00010854092526690391, + "loss": 0.7005, + "step": 5150 + }, + { + "epoch": 2.2893333333333334, + "grad_norm": 2.095301628112793, + "learning_rate": 0.00010852313167259786, + "loss": 1.952, + "step": 5151 + }, + { + "epoch": 2.2897777777777777, + "grad_norm": 1.9257947206497192, + "learning_rate": 0.00010850533807829182, + "loss": 1.4345, + "step": 5152 + }, + { + "epoch": 2.2902222222222224, + "grad_norm": 2.2104406356811523, + "learning_rate": 0.00010848754448398576, + "loss": 1.9751, + "step": 5153 + }, + { + "epoch": 2.2906666666666666, + "grad_norm": 1.96810781955719, + "learning_rate": 0.00010846975088967972, + "loss": 2.024, + "step": 5154 + }, + { + "epoch": 2.2911111111111113, + "grad_norm": 2.201118230819702, + "learning_rate": 0.00010845195729537368, + "loss": 1.8283, + "step": 5155 + }, + { + "epoch": 2.2915555555555556, + "grad_norm": 2.238771438598633, + "learning_rate": 0.00010843416370106762, + "loss": 1.6468, + "step": 5156 + }, + { + "epoch": 2.292, + "grad_norm": 2.4652698040008545, + "learning_rate": 0.00010841637010676158, + "loss": 2.0724, + "step": 5157 + }, + { + "epoch": 2.2924444444444445, + "grad_norm": 1.8881862163543701, + "learning_rate": 0.00010839857651245553, + "loss": 1.2482, + "step": 5158 + }, + { + "epoch": 2.2928888888888888, + "grad_norm": 2.0697972774505615, + "learning_rate": 0.00010838078291814948, + "loss": 2.2032, + "step": 5159 + }, + { + "epoch": 2.2933333333333334, + "grad_norm": 2.31854510307312, + "learning_rate": 0.0001083629893238434, + "loss": 1.9085, + "step": 5160 + }, + { + "epoch": 2.2937777777777777, + "grad_norm": 2.013096332550049, + "learning_rate": 0.00010834519572953736, + "loss": 1.2551, + "step": 5161 + }, + { + "epoch": 2.2942222222222224, + "grad_norm": 2.2634682655334473, + "learning_rate": 0.00010832740213523132, + "loss": 1.5858, + "step": 5162 + }, + { + "epoch": 2.2946666666666666, + "grad_norm": 2.027539014816284, + "learning_rate": 0.00010830960854092526, + "loss": 1.3721, + "step": 5163 + }, + { + "epoch": 2.295111111111111, + "grad_norm": 2.623805284500122, + "learning_rate": 0.00010829181494661922, + "loss": 2.0648, + "step": 5164 + }, + { + "epoch": 2.2955555555555556, + "grad_norm": 1.4081056118011475, + "learning_rate": 0.00010827402135231317, + "loss": 0.8012, + "step": 5165 + }, + { + "epoch": 2.296, + "grad_norm": 1.6935027837753296, + "learning_rate": 0.00010825622775800712, + "loss": 1.0633, + "step": 5166 + }, + { + "epoch": 2.2964444444444445, + "grad_norm": 2.5808827877044678, + "learning_rate": 0.00010823843416370107, + "loss": 1.3108, + "step": 5167 + }, + { + "epoch": 2.2968888888888888, + "grad_norm": 2.5715770721435547, + "learning_rate": 0.00010822064056939503, + "loss": 1.5686, + "step": 5168 + }, + { + "epoch": 2.2973333333333334, + "grad_norm": 2.925513505935669, + "learning_rate": 0.00010820284697508897, + "loss": 1.9211, + "step": 5169 + }, + { + "epoch": 2.2977777777777777, + "grad_norm": 2.3410723209381104, + "learning_rate": 0.00010818505338078293, + "loss": 1.7527, + "step": 5170 + }, + { + "epoch": 2.2982222222222224, + "grad_norm": 2.2690529823303223, + "learning_rate": 0.00010816725978647689, + "loss": 1.5536, + "step": 5171 + }, + { + "epoch": 2.2986666666666666, + "grad_norm": 2.3402512073516846, + "learning_rate": 0.00010814946619217083, + "loss": 1.6756, + "step": 5172 + }, + { + "epoch": 2.2991111111111113, + "grad_norm": 2.567690849304199, + "learning_rate": 0.00010813167259786476, + "loss": 2.1322, + "step": 5173 + }, + { + "epoch": 
2.2995555555555556, + "grad_norm": 2.5583088397979736, + "learning_rate": 0.00010811387900355872, + "loss": 1.7518, + "step": 5174 + }, + { + "epoch": 2.3, + "grad_norm": 2.2459707260131836, + "learning_rate": 0.00010809608540925267, + "loss": 1.4129, + "step": 5175 + }, + { + "epoch": 2.3004444444444445, + "grad_norm": 3.016340970993042, + "learning_rate": 0.00010807829181494661, + "loss": 1.8386, + "step": 5176 + }, + { + "epoch": 2.3008888888888888, + "grad_norm": 2.6692094802856445, + "learning_rate": 0.00010806049822064057, + "loss": 1.5559, + "step": 5177 + }, + { + "epoch": 2.3013333333333335, + "grad_norm": 2.3022642135620117, + "learning_rate": 0.00010804270462633453, + "loss": 1.4982, + "step": 5178 + }, + { + "epoch": 2.3017777777777777, + "grad_norm": 2.4145843982696533, + "learning_rate": 0.00010802491103202847, + "loss": 1.8678, + "step": 5179 + }, + { + "epoch": 2.3022222222222224, + "grad_norm": 2.2984983921051025, + "learning_rate": 0.00010800711743772243, + "loss": 1.606, + "step": 5180 + }, + { + "epoch": 2.3026666666666666, + "grad_norm": 2.6932058334350586, + "learning_rate": 0.00010798932384341638, + "loss": 1.5845, + "step": 5181 + }, + { + "epoch": 2.303111111111111, + "grad_norm": 2.256645441055298, + "learning_rate": 0.00010797153024911033, + "loss": 1.7118, + "step": 5182 + }, + { + "epoch": 2.3035555555555556, + "grad_norm": 2.5675642490386963, + "learning_rate": 0.00010795373665480428, + "loss": 1.3647, + "step": 5183 + }, + { + "epoch": 2.304, + "grad_norm": 2.930628776550293, + "learning_rate": 0.00010793594306049824, + "loss": 1.4687, + "step": 5184 + }, + { + "epoch": 2.3044444444444445, + "grad_norm": 3.3996012210845947, + "learning_rate": 0.00010791814946619218, + "loss": 1.5153, + "step": 5185 + }, + { + "epoch": 2.3048888888888888, + "grad_norm": 4.883398532867432, + "learning_rate": 0.00010790035587188611, + "loss": 1.8317, + "step": 5186 + }, + { + "epoch": 2.3053333333333335, + "grad_norm": 2.2228214740753174, + "learning_rate": 0.00010788256227758007, + "loss": 1.1171, + "step": 5187 + }, + { + "epoch": 2.3057777777777777, + "grad_norm": 2.6813435554504395, + "learning_rate": 0.00010786476868327402, + "loss": 1.5585, + "step": 5188 + }, + { + "epoch": 2.3062222222222224, + "grad_norm": 2.6681740283966064, + "learning_rate": 0.00010784697508896797, + "loss": 1.6806, + "step": 5189 + }, + { + "epoch": 2.3066666666666666, + "grad_norm": 2.8957736492156982, + "learning_rate": 0.00010782918149466192, + "loss": 1.8597, + "step": 5190 + }, + { + "epoch": 2.3071111111111113, + "grad_norm": 2.954521656036377, + "learning_rate": 0.00010781138790035588, + "loss": 1.6892, + "step": 5191 + }, + { + "epoch": 2.3075555555555556, + "grad_norm": 3.0856049060821533, + "learning_rate": 0.00010779359430604982, + "loss": 1.5892, + "step": 5192 + }, + { + "epoch": 2.308, + "grad_norm": 3.5431087017059326, + "learning_rate": 0.00010777580071174378, + "loss": 1.9527, + "step": 5193 + }, + { + "epoch": 2.3084444444444445, + "grad_norm": 2.8147850036621094, + "learning_rate": 0.00010775800711743774, + "loss": 1.6474, + "step": 5194 + }, + { + "epoch": 2.3088888888888888, + "grad_norm": 2.9207980632781982, + "learning_rate": 0.00010774021352313168, + "loss": 1.7115, + "step": 5195 + }, + { + "epoch": 2.3093333333333335, + "grad_norm": 1.972179651260376, + "learning_rate": 0.00010772241992882564, + "loss": 0.7474, + "step": 5196 + }, + { + "epoch": 2.3097777777777777, + "grad_norm": 3.6194729804992676, + "learning_rate": 0.00010770462633451959, + "loss": 1.577, + "step": 5197 
+ }, + { + "epoch": 2.3102222222222224, + "grad_norm": 3.8124332427978516, + "learning_rate": 0.00010768683274021354, + "loss": 1.6077, + "step": 5198 + }, + { + "epoch": 2.3106666666666666, + "grad_norm": 4.198216438293457, + "learning_rate": 0.00010766903914590746, + "loss": 2.2859, + "step": 5199 + }, + { + "epoch": 2.311111111111111, + "grad_norm": 3.1311981678009033, + "learning_rate": 0.00010765124555160142, + "loss": 1.2557, + "step": 5200 + }, + { + "epoch": 2.3115555555555556, + "grad_norm": 1.368379831314087, + "learning_rate": 0.00010763345195729538, + "loss": 1.233, + "step": 5201 + }, + { + "epoch": 2.312, + "grad_norm": 2.020923376083374, + "learning_rate": 0.00010761565836298932, + "loss": 1.9648, + "step": 5202 + }, + { + "epoch": 2.3124444444444445, + "grad_norm": 2.088132619857788, + "learning_rate": 0.00010759786476868328, + "loss": 2.2838, + "step": 5203 + }, + { + "epoch": 2.3128888888888888, + "grad_norm": 2.1785659790039062, + "learning_rate": 0.00010758007117437723, + "loss": 2.1716, + "step": 5204 + }, + { + "epoch": 2.3133333333333335, + "grad_norm": 2.086555004119873, + "learning_rate": 0.00010756227758007118, + "loss": 1.9242, + "step": 5205 + }, + { + "epoch": 2.3137777777777777, + "grad_norm": 2.2909762859344482, + "learning_rate": 0.00010754448398576513, + "loss": 2.3151, + "step": 5206 + }, + { + "epoch": 2.3142222222222224, + "grad_norm": 2.3706085681915283, + "learning_rate": 0.00010752669039145909, + "loss": 2.0996, + "step": 5207 + }, + { + "epoch": 2.3146666666666667, + "grad_norm": 2.0024471282958984, + "learning_rate": 0.00010750889679715303, + "loss": 1.8437, + "step": 5208 + }, + { + "epoch": 2.3151111111111113, + "grad_norm": 2.22165584564209, + "learning_rate": 0.00010749110320284699, + "loss": 1.6399, + "step": 5209 + }, + { + "epoch": 2.3155555555555556, + "grad_norm": 2.4060609340667725, + "learning_rate": 0.00010747330960854095, + "loss": 1.4666, + "step": 5210 + }, + { + "epoch": 2.316, + "grad_norm": 0.6937952041625977, + "learning_rate": 0.00010745551601423489, + "loss": 0.0337, + "step": 5211 + }, + { + "epoch": 2.3164444444444445, + "grad_norm": 2.10048508644104, + "learning_rate": 0.00010743772241992882, + "loss": 2.2067, + "step": 5212 + }, + { + "epoch": 2.3168888888888888, + "grad_norm": 2.3263375759124756, + "learning_rate": 0.00010741992882562277, + "loss": 2.0357, + "step": 5213 + }, + { + "epoch": 2.3173333333333335, + "grad_norm": 1.9152144193649292, + "learning_rate": 0.00010740213523131673, + "loss": 1.9073, + "step": 5214 + }, + { + "epoch": 2.3177777777777777, + "grad_norm": 2.3582661151885986, + "learning_rate": 0.00010738434163701067, + "loss": 1.9668, + "step": 5215 + }, + { + "epoch": 2.3182222222222224, + "grad_norm": 2.1141693592071533, + "learning_rate": 0.00010736654804270463, + "loss": 1.6874, + "step": 5216 + }, + { + "epoch": 2.3186666666666667, + "grad_norm": 2.0202839374542236, + "learning_rate": 0.00010734875444839859, + "loss": 1.7318, + "step": 5217 + }, + { + "epoch": 2.319111111111111, + "grad_norm": 2.4021458625793457, + "learning_rate": 0.00010733096085409253, + "loss": 1.9969, + "step": 5218 + }, + { + "epoch": 2.3195555555555556, + "grad_norm": 1.3853273391723633, + "learning_rate": 0.00010731316725978649, + "loss": 0.8318, + "step": 5219 + }, + { + "epoch": 2.32, + "grad_norm": 1.6327898502349854, + "learning_rate": 0.00010729537366548044, + "loss": 1.3863, + "step": 5220 + }, + { + "epoch": 2.3204444444444445, + "grad_norm": 2.009892463684082, + "learning_rate": 0.00010727758007117439, + "loss": 
1.5846, + "step": 5221 + }, + { + "epoch": 2.320888888888889, + "grad_norm": 2.0592620372772217, + "learning_rate": 0.00010725978647686834, + "loss": 1.6894, + "step": 5222 + }, + { + "epoch": 2.3213333333333335, + "grad_norm": 2.1385364532470703, + "learning_rate": 0.0001072419928825623, + "loss": 1.6496, + "step": 5223 + }, + { + "epoch": 2.3217777777777777, + "grad_norm": 2.383230209350586, + "learning_rate": 0.00010722419928825623, + "loss": 1.707, + "step": 5224 + }, + { + "epoch": 2.3222222222222224, + "grad_norm": 2.0533084869384766, + "learning_rate": 0.00010720640569395017, + "loss": 1.4989, + "step": 5225 + }, + { + "epoch": 2.3226666666666667, + "grad_norm": 2.2495386600494385, + "learning_rate": 0.00010718861209964413, + "loss": 1.7552, + "step": 5226 + }, + { + "epoch": 2.3231111111111113, + "grad_norm": 2.7372584342956543, + "learning_rate": 0.00010717081850533808, + "loss": 1.5627, + "step": 5227 + }, + { + "epoch": 2.3235555555555556, + "grad_norm": 2.1765596866607666, + "learning_rate": 0.00010715302491103203, + "loss": 1.2697, + "step": 5228 + }, + { + "epoch": 2.324, + "grad_norm": 2.3925368785858154, + "learning_rate": 0.00010713523131672598, + "loss": 1.8882, + "step": 5229 + }, + { + "epoch": 2.3244444444444445, + "grad_norm": 3.3124077320098877, + "learning_rate": 0.00010711743772241993, + "loss": 1.5998, + "step": 5230 + }, + { + "epoch": 2.324888888888889, + "grad_norm": 2.3769872188568115, + "learning_rate": 0.00010709964412811388, + "loss": 1.8299, + "step": 5231 + }, + { + "epoch": 2.3253333333333335, + "grad_norm": 3.1028130054473877, + "learning_rate": 0.00010708185053380784, + "loss": 2.4035, + "step": 5232 + }, + { + "epoch": 2.3257777777777777, + "grad_norm": 2.543940544128418, + "learning_rate": 0.00010706405693950178, + "loss": 1.2689, + "step": 5233 + }, + { + "epoch": 2.3262222222222224, + "grad_norm": 2.7707502841949463, + "learning_rate": 0.00010704626334519574, + "loss": 1.9459, + "step": 5234 + }, + { + "epoch": 2.3266666666666667, + "grad_norm": 2.408712863922119, + "learning_rate": 0.0001070284697508897, + "loss": 1.7291, + "step": 5235 + }, + { + "epoch": 2.327111111111111, + "grad_norm": 3.0360755920410156, + "learning_rate": 0.00010701067615658364, + "loss": 2.274, + "step": 5236 + }, + { + "epoch": 2.3275555555555556, + "grad_norm": 2.5077056884765625, + "learning_rate": 0.00010699288256227757, + "loss": 1.5385, + "step": 5237 + }, + { + "epoch": 2.328, + "grad_norm": 3.110048770904541, + "learning_rate": 0.00010697508896797152, + "loss": 1.7831, + "step": 5238 + }, + { + "epoch": 2.3284444444444445, + "grad_norm": 2.6556756496429443, + "learning_rate": 0.00010695729537366548, + "loss": 1.9516, + "step": 5239 + }, + { + "epoch": 2.328888888888889, + "grad_norm": 3.066831350326538, + "learning_rate": 0.00010693950177935942, + "loss": 1.9794, + "step": 5240 + }, + { + "epoch": 2.3293333333333335, + "grad_norm": 2.9944040775299072, + "learning_rate": 0.00010692170818505338, + "loss": 1.2826, + "step": 5241 + }, + { + "epoch": 2.3297777777777777, + "grad_norm": 2.9339687824249268, + "learning_rate": 0.00010690391459074734, + "loss": 1.6169, + "step": 5242 + }, + { + "epoch": 2.330222222222222, + "grad_norm": 2.9092490673065186, + "learning_rate": 0.00010688612099644128, + "loss": 1.6369, + "step": 5243 + }, + { + "epoch": 2.3306666666666667, + "grad_norm": 3.148411512374878, + "learning_rate": 0.00010686832740213524, + "loss": 1.7855, + "step": 5244 + }, + { + "epoch": 2.3311111111111114, + "grad_norm": 2.889481544494629, + "learning_rate": 
0.00010685053380782919, + "loss": 1.8247, + "step": 5245 + }, + { + "epoch": 2.3315555555555556, + "grad_norm": 3.1188857555389404, + "learning_rate": 0.00010683274021352314, + "loss": 1.301, + "step": 5246 + }, + { + "epoch": 2.332, + "grad_norm": 2.914860486984253, + "learning_rate": 0.00010681494661921709, + "loss": 1.5213, + "step": 5247 + }, + { + "epoch": 2.3324444444444445, + "grad_norm": 3.193268060684204, + "learning_rate": 0.00010679715302491105, + "loss": 1.5691, + "step": 5248 + }, + { + "epoch": 2.332888888888889, + "grad_norm": 2.0643980503082275, + "learning_rate": 0.00010677935943060499, + "loss": 0.7877, + "step": 5249 + }, + { + "epoch": 2.3333333333333335, + "grad_norm": 3.4626474380493164, + "learning_rate": 0.00010676156583629892, + "loss": 1.5895, + "step": 5250 + }, + { + "epoch": 2.3337777777777777, + "grad_norm": 1.721472144126892, + "learning_rate": 0.00010674377224199288, + "loss": 1.8599, + "step": 5251 + }, + { + "epoch": 2.3342222222222224, + "grad_norm": 2.4358739852905273, + "learning_rate": 0.00010672597864768683, + "loss": 2.4073, + "step": 5252 + }, + { + "epoch": 2.3346666666666667, + "grad_norm": 2.0674357414245605, + "learning_rate": 0.00010670818505338078, + "loss": 1.4055, + "step": 5253 + }, + { + "epoch": 2.335111111111111, + "grad_norm": 1.9118478298187256, + "learning_rate": 0.00010669039145907473, + "loss": 1.3337, + "step": 5254 + }, + { + "epoch": 2.3355555555555556, + "grad_norm": 2.8397645950317383, + "learning_rate": 0.00010667259786476869, + "loss": 2.0588, + "step": 5255 + }, + { + "epoch": 2.336, + "grad_norm": 2.2740516662597656, + "learning_rate": 0.00010665480427046263, + "loss": 1.9389, + "step": 5256 + }, + { + "epoch": 2.3364444444444445, + "grad_norm": 1.9148417711257935, + "learning_rate": 0.00010663701067615659, + "loss": 1.1185, + "step": 5257 + }, + { + "epoch": 2.336888888888889, + "grad_norm": 1.7196369171142578, + "learning_rate": 0.00010661921708185055, + "loss": 0.9252, + "step": 5258 + }, + { + "epoch": 2.3373333333333335, + "grad_norm": 2.5018043518066406, + "learning_rate": 0.00010660142348754449, + "loss": 2.1507, + "step": 5259 + }, + { + "epoch": 2.3377777777777777, + "grad_norm": 2.2617380619049072, + "learning_rate": 0.00010658362989323845, + "loss": 1.6547, + "step": 5260 + }, + { + "epoch": 2.338222222222222, + "grad_norm": 2.1879079341888428, + "learning_rate": 0.0001065658362989324, + "loss": 1.8825, + "step": 5261 + }, + { + "epoch": 2.3386666666666667, + "grad_norm": 2.5784716606140137, + "learning_rate": 0.00010654804270462634, + "loss": 2.1602, + "step": 5262 + }, + { + "epoch": 2.339111111111111, + "grad_norm": 2.2693116664886475, + "learning_rate": 0.00010653024911032027, + "loss": 1.818, + "step": 5263 + }, + { + "epoch": 2.3395555555555556, + "grad_norm": 2.2588093280792236, + "learning_rate": 0.00010651245551601423, + "loss": 2.0916, + "step": 5264 + }, + { + "epoch": 2.34, + "grad_norm": 2.266087055206299, + "learning_rate": 0.00010649466192170819, + "loss": 1.8388, + "step": 5265 + }, + { + "epoch": 2.3404444444444445, + "grad_norm": 2.1709249019622803, + "learning_rate": 0.00010647686832740213, + "loss": 1.9512, + "step": 5266 + }, + { + "epoch": 2.340888888888889, + "grad_norm": 1.934448003768921, + "learning_rate": 0.00010645907473309609, + "loss": 1.5891, + "step": 5267 + }, + { + "epoch": 2.3413333333333335, + "grad_norm": 1.595613956451416, + "learning_rate": 0.00010644128113879004, + "loss": 0.9631, + "step": 5268 + }, + { + "epoch": 2.3417777777777777, + "grad_norm": 2.1244924068450928, + 
"learning_rate": 0.00010642348754448399, + "loss": 1.49, + "step": 5269 + }, + { + "epoch": 2.3422222222222224, + "grad_norm": 2.107415199279785, + "learning_rate": 0.00010640569395017794, + "loss": 1.2837, + "step": 5270 + }, + { + "epoch": 2.3426666666666667, + "grad_norm": 2.0290896892547607, + "learning_rate": 0.0001063879003558719, + "loss": 1.426, + "step": 5271 + }, + { + "epoch": 2.343111111111111, + "grad_norm": 1.9935804605484009, + "learning_rate": 0.00010637010676156584, + "loss": 1.6032, + "step": 5272 + }, + { + "epoch": 2.3435555555555556, + "grad_norm": 2.401331663131714, + "learning_rate": 0.0001063523131672598, + "loss": 1.745, + "step": 5273 + }, + { + "epoch": 2.344, + "grad_norm": 2.32720685005188, + "learning_rate": 0.00010633451957295375, + "loss": 2.1566, + "step": 5274 + }, + { + "epoch": 2.3444444444444446, + "grad_norm": 2.5163567066192627, + "learning_rate": 0.0001063167259786477, + "loss": 1.8647, + "step": 5275 + }, + { + "epoch": 2.344888888888889, + "grad_norm": 2.6666061878204346, + "learning_rate": 0.00010629893238434163, + "loss": 2.0087, + "step": 5276 + }, + { + "epoch": 2.3453333333333335, + "grad_norm": 2.972123622894287, + "learning_rate": 0.00010628113879003558, + "loss": 2.0574, + "step": 5277 + }, + { + "epoch": 2.3457777777777777, + "grad_norm": 2.4128878116607666, + "learning_rate": 0.00010626334519572954, + "loss": 1.4681, + "step": 5278 + }, + { + "epoch": 2.346222222222222, + "grad_norm": 2.2778286933898926, + "learning_rate": 0.00010624555160142348, + "loss": 1.6174, + "step": 5279 + }, + { + "epoch": 2.3466666666666667, + "grad_norm": 2.547231435775757, + "learning_rate": 0.00010622775800711744, + "loss": 1.6653, + "step": 5280 + }, + { + "epoch": 2.347111111111111, + "grad_norm": 2.483854055404663, + "learning_rate": 0.0001062099644128114, + "loss": 1.5825, + "step": 5281 + }, + { + "epoch": 2.3475555555555556, + "grad_norm": 2.559788465499878, + "learning_rate": 0.00010619217081850534, + "loss": 2.0953, + "step": 5282 + }, + { + "epoch": 2.348, + "grad_norm": 3.0145204067230225, + "learning_rate": 0.0001061743772241993, + "loss": 1.9542, + "step": 5283 + }, + { + "epoch": 2.3484444444444446, + "grad_norm": 2.558265209197998, + "learning_rate": 0.00010615658362989325, + "loss": 1.8425, + "step": 5284 + }, + { + "epoch": 2.348888888888889, + "grad_norm": 2.9224250316619873, + "learning_rate": 0.0001061387900355872, + "loss": 1.947, + "step": 5285 + }, + { + "epoch": 2.3493333333333335, + "grad_norm": 2.7229421138763428, + "learning_rate": 0.00010612099644128115, + "loss": 1.305, + "step": 5286 + }, + { + "epoch": 2.3497777777777777, + "grad_norm": 2.2449328899383545, + "learning_rate": 0.00010610320284697511, + "loss": 1.4746, + "step": 5287 + }, + { + "epoch": 2.3502222222222224, + "grad_norm": 2.956235885620117, + "learning_rate": 0.00010608540925266905, + "loss": 1.9641, + "step": 5288 + }, + { + "epoch": 2.3506666666666667, + "grad_norm": 2.308121681213379, + "learning_rate": 0.00010606761565836298, + "loss": 1.4521, + "step": 5289 + }, + { + "epoch": 2.351111111111111, + "grad_norm": 2.414666175842285, + "learning_rate": 0.00010604982206405694, + "loss": 1.0793, + "step": 5290 + }, + { + "epoch": 2.3515555555555556, + "grad_norm": 3.1062090396881104, + "learning_rate": 0.0001060320284697509, + "loss": 1.5819, + "step": 5291 + }, + { + "epoch": 2.352, + "grad_norm": 2.759904146194458, + "learning_rate": 0.00010601423487544484, + "loss": 2.2146, + "step": 5292 + }, + { + "epoch": 2.3524444444444446, + "grad_norm": 3.40423321723938, + 
"learning_rate": 0.00010599644128113879, + "loss": 1.8593, + "step": 5293 + }, + { + "epoch": 2.352888888888889, + "grad_norm": 3.157247543334961, + "learning_rate": 0.00010597864768683275, + "loss": 1.9475, + "step": 5294 + }, + { + "epoch": 2.3533333333333335, + "grad_norm": 2.931468963623047, + "learning_rate": 0.00010596085409252669, + "loss": 1.9819, + "step": 5295 + }, + { + "epoch": 2.3537777777777777, + "grad_norm": 2.865544557571411, + "learning_rate": 0.00010594306049822065, + "loss": 1.7654, + "step": 5296 + }, + { + "epoch": 2.354222222222222, + "grad_norm": 3.2829089164733887, + "learning_rate": 0.0001059252669039146, + "loss": 1.7053, + "step": 5297 + }, + { + "epoch": 2.3546666666666667, + "grad_norm": 2.915627956390381, + "learning_rate": 0.00010590747330960855, + "loss": 1.6627, + "step": 5298 + }, + { + "epoch": 2.355111111111111, + "grad_norm": 2.891852617263794, + "learning_rate": 0.0001058896797153025, + "loss": 1.4539, + "step": 5299 + }, + { + "epoch": 2.3555555555555556, + "grad_norm": 3.9856643676757812, + "learning_rate": 0.00010587188612099646, + "loss": 1.6694, + "step": 5300 + }, + { + "epoch": 2.356, + "grad_norm": 1.8876063823699951, + "learning_rate": 0.0001058540925266904, + "loss": 1.9182, + "step": 5301 + }, + { + "epoch": 2.3564444444444446, + "grad_norm": 1.3082079887390137, + "learning_rate": 0.00010583629893238433, + "loss": 0.945, + "step": 5302 + }, + { + "epoch": 2.356888888888889, + "grad_norm": 1.5244293212890625, + "learning_rate": 0.00010581850533807829, + "loss": 0.8277, + "step": 5303 + }, + { + "epoch": 2.3573333333333335, + "grad_norm": 2.210871458053589, + "learning_rate": 0.00010580071174377225, + "loss": 1.937, + "step": 5304 + }, + { + "epoch": 2.3577777777777778, + "grad_norm": 2.4062864780426025, + "learning_rate": 0.00010578291814946619, + "loss": 1.9807, + "step": 5305 + }, + { + "epoch": 2.3582222222222224, + "grad_norm": 2.0000367164611816, + "learning_rate": 0.00010576512455516015, + "loss": 1.4079, + "step": 5306 + }, + { + "epoch": 2.3586666666666667, + "grad_norm": 1.9247461557388306, + "learning_rate": 0.0001057473309608541, + "loss": 1.7219, + "step": 5307 + }, + { + "epoch": 2.359111111111111, + "grad_norm": 2.2574825286865234, + "learning_rate": 0.00010572953736654805, + "loss": 1.7616, + "step": 5308 + }, + { + "epoch": 2.3595555555555556, + "grad_norm": 2.254293203353882, + "learning_rate": 0.000105711743772242, + "loss": 2.0128, + "step": 5309 + }, + { + "epoch": 2.36, + "grad_norm": 2.4639298915863037, + "learning_rate": 0.00010569395017793596, + "loss": 1.9739, + "step": 5310 + }, + { + "epoch": 2.3604444444444446, + "grad_norm": 2.103483200073242, + "learning_rate": 0.0001056761565836299, + "loss": 1.91, + "step": 5311 + }, + { + "epoch": 2.360888888888889, + "grad_norm": 0.13134007155895233, + "learning_rate": 0.00010565836298932386, + "loss": 0.0197, + "step": 5312 + }, + { + "epoch": 2.3613333333333335, + "grad_norm": 2.1380348205566406, + "learning_rate": 0.00010564056939501781, + "loss": 1.7965, + "step": 5313 + }, + { + "epoch": 2.3617777777777778, + "grad_norm": 1.9932422637939453, + "learning_rate": 0.00010562277580071176, + "loss": 1.6092, + "step": 5314 + }, + { + "epoch": 2.362222222222222, + "grad_norm": 2.254859209060669, + "learning_rate": 0.00010560498220640569, + "loss": 2.1432, + "step": 5315 + }, + { + "epoch": 2.3626666666666667, + "grad_norm": 2.0134313106536865, + "learning_rate": 0.00010558718861209964, + "loss": 1.8426, + "step": 5316 + }, + { + "epoch": 2.363111111111111, + "grad_norm": 
2.157883644104004, + "learning_rate": 0.0001055693950177936, + "loss": 1.5952, + "step": 5317 + }, + { + "epoch": 2.3635555555555556, + "grad_norm": 2.2276217937469482, + "learning_rate": 0.00010555160142348754, + "loss": 1.8253, + "step": 5318 + }, + { + "epoch": 2.364, + "grad_norm": 2.406975269317627, + "learning_rate": 0.0001055338078291815, + "loss": 1.7489, + "step": 5319 + }, + { + "epoch": 2.3644444444444446, + "grad_norm": 2.4040451049804688, + "learning_rate": 0.00010551601423487544, + "loss": 1.7381, + "step": 5320 + }, + { + "epoch": 2.364888888888889, + "grad_norm": 2.3204426765441895, + "learning_rate": 0.0001054982206405694, + "loss": 2.0069, + "step": 5321 + }, + { + "epoch": 2.3653333333333335, + "grad_norm": 2.3594791889190674, + "learning_rate": 0.00010548042704626336, + "loss": 1.6934, + "step": 5322 + }, + { + "epoch": 2.3657777777777778, + "grad_norm": 2.970160961151123, + "learning_rate": 0.0001054626334519573, + "loss": 2.0672, + "step": 5323 + }, + { + "epoch": 2.3662222222222224, + "grad_norm": 2.4029760360717773, + "learning_rate": 0.00010544483985765125, + "loss": 1.6066, + "step": 5324 + }, + { + "epoch": 2.3666666666666667, + "grad_norm": 2.390634059906006, + "learning_rate": 0.00010542704626334521, + "loss": 1.8098, + "step": 5325 + }, + { + "epoch": 2.367111111111111, + "grad_norm": 2.158510208129883, + "learning_rate": 0.00010540925266903915, + "loss": 1.4622, + "step": 5326 + }, + { + "epoch": 2.3675555555555556, + "grad_norm": 2.2908811569213867, + "learning_rate": 0.00010539145907473311, + "loss": 1.4533, + "step": 5327 + }, + { + "epoch": 2.368, + "grad_norm": 2.348654270172119, + "learning_rate": 0.00010537366548042704, + "loss": 1.7244, + "step": 5328 + }, + { + "epoch": 2.3684444444444446, + "grad_norm": 2.7863059043884277, + "learning_rate": 0.000105355871886121, + "loss": 1.9031, + "step": 5329 + }, + { + "epoch": 2.368888888888889, + "grad_norm": 1.919249415397644, + "learning_rate": 0.00010533807829181494, + "loss": 0.9351, + "step": 5330 + }, + { + "epoch": 2.3693333333333335, + "grad_norm": 2.1779160499572754, + "learning_rate": 0.0001053202846975089, + "loss": 1.311, + "step": 5331 + }, + { + "epoch": 2.3697777777777778, + "grad_norm": 2.9580140113830566, + "learning_rate": 0.00010530249110320285, + "loss": 1.8562, + "step": 5332 + }, + { + "epoch": 2.370222222222222, + "grad_norm": 2.4959969520568848, + "learning_rate": 0.0001052846975088968, + "loss": 1.3581, + "step": 5333 + }, + { + "epoch": 2.3706666666666667, + "grad_norm": 2.6963963508605957, + "learning_rate": 0.00010526690391459075, + "loss": 2.3081, + "step": 5334 + }, + { + "epoch": 2.371111111111111, + "grad_norm": 2.9075794219970703, + "learning_rate": 0.00010524911032028471, + "loss": 2.2261, + "step": 5335 + }, + { + "epoch": 2.3715555555555556, + "grad_norm": 2.346102714538574, + "learning_rate": 0.00010523131672597865, + "loss": 1.4546, + "step": 5336 + }, + { + "epoch": 2.372, + "grad_norm": 2.860933542251587, + "learning_rate": 0.00010521352313167261, + "loss": 1.6149, + "step": 5337 + }, + { + "epoch": 2.3724444444444446, + "grad_norm": 3.0181329250335693, + "learning_rate": 0.00010519572953736656, + "loss": 1.9698, + "step": 5338 + }, + { + "epoch": 2.372888888888889, + "grad_norm": 3.008528709411621, + "learning_rate": 0.00010517793594306051, + "loss": 1.943, + "step": 5339 + }, + { + "epoch": 2.3733333333333335, + "grad_norm": 2.5461809635162354, + "learning_rate": 0.00010516014234875444, + "loss": 1.473, + "step": 5340 + }, + { + "epoch": 2.3737777777777778, + 
"grad_norm": 2.90517258644104, + "learning_rate": 0.00010514234875444839, + "loss": 2.0846, + "step": 5341 + }, + { + "epoch": 2.3742222222222225, + "grad_norm": 2.84635329246521, + "learning_rate": 0.00010512455516014235, + "loss": 1.8785, + "step": 5342 + }, + { + "epoch": 2.3746666666666667, + "grad_norm": 2.7728958129882812, + "learning_rate": 0.00010510676156583629, + "loss": 1.4901, + "step": 5343 + }, + { + "epoch": 2.375111111111111, + "grad_norm": 2.4480459690093994, + "learning_rate": 0.00010508896797153025, + "loss": 1.4275, + "step": 5344 + }, + { + "epoch": 2.3755555555555556, + "grad_norm": 3.176147937774658, + "learning_rate": 0.0001050711743772242, + "loss": 1.3497, + "step": 5345 + }, + { + "epoch": 2.376, + "grad_norm": 2.7006847858428955, + "learning_rate": 0.00010505338078291815, + "loss": 1.2245, + "step": 5346 + }, + { + "epoch": 2.3764444444444446, + "grad_norm": 3.114924192428589, + "learning_rate": 0.0001050355871886121, + "loss": 1.6906, + "step": 5347 + }, + { + "epoch": 2.376888888888889, + "grad_norm": 3.2181596755981445, + "learning_rate": 0.00010501779359430606, + "loss": 1.4348, + "step": 5348 + }, + { + "epoch": 2.3773333333333335, + "grad_norm": 2.9679977893829346, + "learning_rate": 0.000105, + "loss": 1.8438, + "step": 5349 + }, + { + "epoch": 2.3777777777777778, + "grad_norm": 3.126516342163086, + "learning_rate": 0.00010498220640569396, + "loss": 1.0786, + "step": 5350 + }, + { + "epoch": 2.378222222222222, + "grad_norm": 1.8322924375534058, + "learning_rate": 0.00010496441281138792, + "loss": 2.0924, + "step": 5351 + }, + { + "epoch": 2.3786666666666667, + "grad_norm": 1.7690398693084717, + "learning_rate": 0.00010494661921708186, + "loss": 2.0291, + "step": 5352 + }, + { + "epoch": 2.379111111111111, + "grad_norm": 0.14878343045711517, + "learning_rate": 0.00010492882562277579, + "loss": 0.0181, + "step": 5353 + }, + { + "epoch": 2.3795555555555556, + "grad_norm": 2.2192952632904053, + "learning_rate": 0.00010491103202846975, + "loss": 1.5731, + "step": 5354 + }, + { + "epoch": 2.38, + "grad_norm": 2.1451058387756348, + "learning_rate": 0.0001048932384341637, + "loss": 1.7856, + "step": 5355 + }, + { + "epoch": 2.3804444444444446, + "grad_norm": 2.5920441150665283, + "learning_rate": 0.00010487544483985765, + "loss": 2.0455, + "step": 5356 + }, + { + "epoch": 2.380888888888889, + "grad_norm": 2.153883218765259, + "learning_rate": 0.0001048576512455516, + "loss": 1.6193, + "step": 5357 + }, + { + "epoch": 2.3813333333333335, + "grad_norm": 2.2498092651367188, + "learning_rate": 0.00010483985765124556, + "loss": 2.1478, + "step": 5358 + }, + { + "epoch": 2.3817777777777778, + "grad_norm": 2.189810037612915, + "learning_rate": 0.0001048220640569395, + "loss": 1.8984, + "step": 5359 + }, + { + "epoch": 2.3822222222222225, + "grad_norm": 2.1763858795166016, + "learning_rate": 0.00010480427046263346, + "loss": 1.6781, + "step": 5360 + }, + { + "epoch": 2.3826666666666667, + "grad_norm": 2.4611992835998535, + "learning_rate": 0.00010478647686832741, + "loss": 1.9078, + "step": 5361 + }, + { + "epoch": 2.383111111111111, + "grad_norm": 2.345245122909546, + "learning_rate": 0.00010476868327402136, + "loss": 1.9821, + "step": 5362 + }, + { + "epoch": 2.3835555555555556, + "grad_norm": 2.14660906791687, + "learning_rate": 0.00010475088967971531, + "loss": 1.4488, + "step": 5363 + }, + { + "epoch": 2.384, + "grad_norm": 2.06569504737854, + "learning_rate": 0.00010473309608540927, + "loss": 1.908, + "step": 5364 + }, + { + "epoch": 2.3844444444444446, + 
"grad_norm": 2.5986487865448, + "learning_rate": 0.00010471530249110321, + "loss": 2.0425, + "step": 5365 + }, + { + "epoch": 2.384888888888889, + "grad_norm": 2.309596061706543, + "learning_rate": 0.00010469750889679714, + "loss": 2.0211, + "step": 5366 + }, + { + "epoch": 2.3853333333333335, + "grad_norm": 2.2031354904174805, + "learning_rate": 0.0001046797153024911, + "loss": 1.793, + "step": 5367 + }, + { + "epoch": 2.3857777777777778, + "grad_norm": 2.5349233150482178, + "learning_rate": 0.00010466192170818506, + "loss": 2.126, + "step": 5368 + }, + { + "epoch": 2.386222222222222, + "grad_norm": 1.8240305185317993, + "learning_rate": 0.000104644128113879, + "loss": 1.3644, + "step": 5369 + }, + { + "epoch": 2.3866666666666667, + "grad_norm": 1.6334868669509888, + "learning_rate": 0.00010462633451957296, + "loss": 1.0386, + "step": 5370 + }, + { + "epoch": 2.387111111111111, + "grad_norm": 2.6212308406829834, + "learning_rate": 0.00010460854092526691, + "loss": 2.2779, + "step": 5371 + }, + { + "epoch": 2.3875555555555557, + "grad_norm": 2.4640159606933594, + "learning_rate": 0.00010459074733096085, + "loss": 2.1993, + "step": 5372 + }, + { + "epoch": 2.388, + "grad_norm": 2.5300076007843018, + "learning_rate": 0.00010457295373665481, + "loss": 2.1011, + "step": 5373 + }, + { + "epoch": 2.3884444444444446, + "grad_norm": 2.257607936859131, + "learning_rate": 0.00010455516014234877, + "loss": 1.6381, + "step": 5374 + }, + { + "epoch": 2.388888888888889, + "grad_norm": 2.7849531173706055, + "learning_rate": 0.00010453736654804271, + "loss": 1.4969, + "step": 5375 + }, + { + "epoch": 2.389333333333333, + "grad_norm": 2.2761592864990234, + "learning_rate": 0.00010451957295373667, + "loss": 1.4029, + "step": 5376 + }, + { + "epoch": 2.389777777777778, + "grad_norm": 2.228327512741089, + "learning_rate": 0.00010450177935943062, + "loss": 1.8044, + "step": 5377 + }, + { + "epoch": 2.3902222222222225, + "grad_norm": 1.8073506355285645, + "learning_rate": 0.00010448398576512457, + "loss": 0.9291, + "step": 5378 + }, + { + "epoch": 2.3906666666666667, + "grad_norm": 2.5977213382720947, + "learning_rate": 0.0001044661921708185, + "loss": 1.585, + "step": 5379 + }, + { + "epoch": 2.391111111111111, + "grad_norm": 2.726691246032715, + "learning_rate": 0.00010444839857651245, + "loss": 1.6747, + "step": 5380 + }, + { + "epoch": 2.3915555555555557, + "grad_norm": 2.13911509513855, + "learning_rate": 0.00010443060498220641, + "loss": 1.531, + "step": 5381 + }, + { + "epoch": 2.392, + "grad_norm": 2.832296133041382, + "learning_rate": 0.00010441281138790035, + "loss": 1.7403, + "step": 5382 + }, + { + "epoch": 2.3924444444444446, + "grad_norm": 2.713308811187744, + "learning_rate": 0.00010439501779359431, + "loss": 2.0669, + "step": 5383 + }, + { + "epoch": 2.392888888888889, + "grad_norm": 2.503474473953247, + "learning_rate": 0.00010437722419928826, + "loss": 1.8383, + "step": 5384 + }, + { + "epoch": 2.3933333333333335, + "grad_norm": 2.878114700317383, + "learning_rate": 0.00010435943060498221, + "loss": 1.7423, + "step": 5385 + }, + { + "epoch": 2.393777777777778, + "grad_norm": 3.174804449081421, + "learning_rate": 0.00010434163701067616, + "loss": 1.5972, + "step": 5386 + }, + { + "epoch": 2.394222222222222, + "grad_norm": 2.129746675491333, + "learning_rate": 0.00010432384341637012, + "loss": 1.0781, + "step": 5387 + }, + { + "epoch": 2.3946666666666667, + "grad_norm": 2.780766487121582, + "learning_rate": 0.00010430604982206406, + "loss": 1.649, + "step": 5388 + }, + { + "epoch": 
2.395111111111111, + "grad_norm": 2.8491053581237793, + "learning_rate": 0.00010428825622775802, + "loss": 1.6903, + "step": 5389 + }, + { + "epoch": 2.3955555555555557, + "grad_norm": 2.460873603820801, + "learning_rate": 0.00010427046263345198, + "loss": 1.3277, + "step": 5390 + }, + { + "epoch": 2.396, + "grad_norm": 2.9226863384246826, + "learning_rate": 0.00010425266903914592, + "loss": 1.5445, + "step": 5391 + }, + { + "epoch": 2.3964444444444446, + "grad_norm": 3.39444899559021, + "learning_rate": 0.00010423487544483985, + "loss": 1.6876, + "step": 5392 + }, + { + "epoch": 2.396888888888889, + "grad_norm": 3.0782647132873535, + "learning_rate": 0.0001042170818505338, + "loss": 1.6426, + "step": 5393 + }, + { + "epoch": 2.397333333333333, + "grad_norm": 2.9284658432006836, + "learning_rate": 0.00010419928825622776, + "loss": 1.5015, + "step": 5394 + }, + { + "epoch": 2.397777777777778, + "grad_norm": 3.772498369216919, + "learning_rate": 0.0001041814946619217, + "loss": 1.734, + "step": 5395 + }, + { + "epoch": 2.398222222222222, + "grad_norm": 3.4858293533325195, + "learning_rate": 0.00010416370106761566, + "loss": 2.1559, + "step": 5396 + }, + { + "epoch": 2.3986666666666667, + "grad_norm": 2.8593242168426514, + "learning_rate": 0.00010414590747330962, + "loss": 1.2558, + "step": 5397 + }, + { + "epoch": 2.399111111111111, + "grad_norm": 3.39560866355896, + "learning_rate": 0.00010412811387900356, + "loss": 1.726, + "step": 5398 + }, + { + "epoch": 2.3995555555555557, + "grad_norm": 2.1574625968933105, + "learning_rate": 0.00010411032028469752, + "loss": 0.8812, + "step": 5399 + }, + { + "epoch": 2.4, + "grad_norm": 4.058824062347412, + "learning_rate": 0.00010409252669039147, + "loss": 1.1959, + "step": 5400 + }, + { + "epoch": 2.4004444444444446, + "grad_norm": 1.906567931175232, + "learning_rate": 0.00010407473309608542, + "loss": 2.2807, + "step": 5401 + }, + { + "epoch": 2.400888888888889, + "grad_norm": 1.9457437992095947, + "learning_rate": 0.00010405693950177937, + "loss": 1.7017, + "step": 5402 + }, + { + "epoch": 2.4013333333333335, + "grad_norm": 2.0136795043945312, + "learning_rate": 0.00010403914590747333, + "loss": 2.0448, + "step": 5403 + }, + { + "epoch": 2.401777777777778, + "grad_norm": 2.323181390762329, + "learning_rate": 0.00010402135231316727, + "loss": 1.8618, + "step": 5404 + }, + { + "epoch": 2.402222222222222, + "grad_norm": 1.999695897102356, + "learning_rate": 0.0001040035587188612, + "loss": 1.8949, + "step": 5405 + }, + { + "epoch": 2.4026666666666667, + "grad_norm": 2.2009153366088867, + "learning_rate": 0.00010398576512455516, + "loss": 1.7722, + "step": 5406 + }, + { + "epoch": 2.403111111111111, + "grad_norm": 2.2078299522399902, + "learning_rate": 0.00010396797153024912, + "loss": 2.0193, + "step": 5407 + }, + { + "epoch": 2.4035555555555557, + "grad_norm": 2.5770654678344727, + "learning_rate": 0.00010395017793594306, + "loss": 2.0849, + "step": 5408 + }, + { + "epoch": 2.404, + "grad_norm": 2.2066149711608887, + "learning_rate": 0.00010393238434163701, + "loss": 1.5842, + "step": 5409 + }, + { + "epoch": 2.4044444444444446, + "grad_norm": 2.5766196250915527, + "learning_rate": 0.00010391459074733096, + "loss": 2.3273, + "step": 5410 + }, + { + "epoch": 2.404888888888889, + "grad_norm": 2.1553375720977783, + "learning_rate": 0.00010389679715302491, + "loss": 1.7654, + "step": 5411 + }, + { + "epoch": 2.405333333333333, + "grad_norm": 2.179262638092041, + "learning_rate": 0.00010387900355871887, + "loss": 1.8465, + "step": 5412 + }, + { + "epoch": 
2.405777777777778, + "grad_norm": 2.3855044841766357, + "learning_rate": 0.00010386120996441281, + "loss": 1.5526, + "step": 5413 + }, + { + "epoch": 2.406222222222222, + "grad_norm": 1.9066109657287598, + "learning_rate": 0.00010384341637010677, + "loss": 1.1865, + "step": 5414 + }, + { + "epoch": 2.4066666666666667, + "grad_norm": 2.4043421745300293, + "learning_rate": 0.00010382562277580073, + "loss": 2.1115, + "step": 5415 + }, + { + "epoch": 2.407111111111111, + "grad_norm": 2.303219795227051, + "learning_rate": 0.00010380782918149467, + "loss": 1.6968, + "step": 5416 + }, + { + "epoch": 2.4075555555555557, + "grad_norm": 2.519510507583618, + "learning_rate": 0.00010379003558718863, + "loss": 1.4451, + "step": 5417 + }, + { + "epoch": 2.408, + "grad_norm": 2.0006396770477295, + "learning_rate": 0.00010377224199288256, + "loss": 1.6501, + "step": 5418 + }, + { + "epoch": 2.4084444444444446, + "grad_norm": 2.359020709991455, + "learning_rate": 0.00010375444839857651, + "loss": 2.1004, + "step": 5419 + }, + { + "epoch": 2.408888888888889, + "grad_norm": 2.581718683242798, + "learning_rate": 0.00010373665480427045, + "loss": 1.5921, + "step": 5420 + }, + { + "epoch": 2.4093333333333335, + "grad_norm": 2.2149415016174316, + "learning_rate": 0.00010371886120996441, + "loss": 1.4495, + "step": 5421 + }, + { + "epoch": 2.409777777777778, + "grad_norm": 2.1162991523742676, + "learning_rate": 0.00010370106761565837, + "loss": 1.7689, + "step": 5422 + }, + { + "epoch": 2.410222222222222, + "grad_norm": 2.3050718307495117, + "learning_rate": 0.00010368327402135231, + "loss": 1.7421, + "step": 5423 + }, + { + "epoch": 2.4106666666666667, + "grad_norm": 2.368021011352539, + "learning_rate": 0.00010366548042704627, + "loss": 1.8718, + "step": 5424 + }, + { + "epoch": 2.411111111111111, + "grad_norm": 2.574089765548706, + "learning_rate": 0.00010364768683274022, + "loss": 1.8538, + "step": 5425 + }, + { + "epoch": 2.4115555555555557, + "grad_norm": 2.1765310764312744, + "learning_rate": 0.00010362989323843417, + "loss": 1.6152, + "step": 5426 + }, + { + "epoch": 2.412, + "grad_norm": 2.298729419708252, + "learning_rate": 0.00010361209964412812, + "loss": 1.6085, + "step": 5427 + }, + { + "epoch": 2.4124444444444446, + "grad_norm": 2.547469139099121, + "learning_rate": 0.00010359430604982208, + "loss": 1.7991, + "step": 5428 + }, + { + "epoch": 2.412888888888889, + "grad_norm": 2.315479040145874, + "learning_rate": 0.00010357651245551602, + "loss": 1.5592, + "step": 5429 + }, + { + "epoch": 2.413333333333333, + "grad_norm": 2.59236478805542, + "learning_rate": 0.00010355871886120998, + "loss": 1.4449, + "step": 5430 + }, + { + "epoch": 2.413777777777778, + "grad_norm": 2.6196041107177734, + "learning_rate": 0.00010354092526690391, + "loss": 1.4048, + "step": 5431 + }, + { + "epoch": 2.414222222222222, + "grad_norm": 2.8616275787353516, + "learning_rate": 0.00010352313167259787, + "loss": 2.1669, + "step": 5432 + }, + { + "epoch": 2.4146666666666667, + "grad_norm": 2.7045931816101074, + "learning_rate": 0.00010350533807829181, + "loss": 1.8538, + "step": 5433 + }, + { + "epoch": 2.415111111111111, + "grad_norm": 2.812784194946289, + "learning_rate": 0.00010348754448398576, + "loss": 1.7349, + "step": 5434 + }, + { + "epoch": 2.4155555555555557, + "grad_norm": 2.665402889251709, + "learning_rate": 0.00010346975088967972, + "loss": 1.671, + "step": 5435 + }, + { + "epoch": 2.416, + "grad_norm": 2.632174253463745, + "learning_rate": 0.00010345195729537366, + "loss": 1.297, + "step": 5436 + }, + { + 
"epoch": 2.4164444444444446, + "grad_norm": 2.9285874366760254, + "learning_rate": 0.00010343416370106762, + "loss": 1.6447, + "step": 5437 + }, + { + "epoch": 2.416888888888889, + "grad_norm": 2.873795747756958, + "learning_rate": 0.00010341637010676158, + "loss": 1.6297, + "step": 5438 + }, + { + "epoch": 2.4173333333333336, + "grad_norm": 1.693901538848877, + "learning_rate": 0.00010339857651245552, + "loss": 0.571, + "step": 5439 + }, + { + "epoch": 2.417777777777778, + "grad_norm": 2.4902453422546387, + "learning_rate": 0.00010338078291814948, + "loss": 1.7092, + "step": 5440 + }, + { + "epoch": 2.418222222222222, + "grad_norm": 2.657254457473755, + "learning_rate": 0.00010336298932384343, + "loss": 1.2449, + "step": 5441 + }, + { + "epoch": 2.4186666666666667, + "grad_norm": 2.572938919067383, + "learning_rate": 0.00010334519572953738, + "loss": 1.6842, + "step": 5442 + }, + { + "epoch": 2.419111111111111, + "grad_norm": 3.0407369136810303, + "learning_rate": 0.00010332740213523133, + "loss": 1.7859, + "step": 5443 + }, + { + "epoch": 2.4195555555555557, + "grad_norm": 2.9764583110809326, + "learning_rate": 0.00010330960854092526, + "loss": 1.6893, + "step": 5444 + }, + { + "epoch": 2.42, + "grad_norm": 2.7912721633911133, + "learning_rate": 0.00010329181494661922, + "loss": 1.5315, + "step": 5445 + }, + { + "epoch": 2.4204444444444446, + "grad_norm": 3.436552047729492, + "learning_rate": 0.00010327402135231316, + "loss": 1.6444, + "step": 5446 + }, + { + "epoch": 2.420888888888889, + "grad_norm": 3.2824277877807617, + "learning_rate": 0.00010325622775800712, + "loss": 1.6908, + "step": 5447 + }, + { + "epoch": 2.421333333333333, + "grad_norm": 3.5586631298065186, + "learning_rate": 0.00010323843416370107, + "loss": 1.9766, + "step": 5448 + }, + { + "epoch": 2.421777777777778, + "grad_norm": 3.6402716636657715, + "learning_rate": 0.00010322064056939502, + "loss": 1.8899, + "step": 5449 + }, + { + "epoch": 2.422222222222222, + "grad_norm": 3.806692600250244, + "learning_rate": 0.00010320284697508897, + "loss": 1.7844, + "step": 5450 + }, + { + "epoch": 2.4226666666666667, + "grad_norm": 1.4669278860092163, + "learning_rate": 0.00010318505338078293, + "loss": 0.98, + "step": 5451 + }, + { + "epoch": 2.423111111111111, + "grad_norm": 1.7053353786468506, + "learning_rate": 0.00010316725978647687, + "loss": 2.1493, + "step": 5452 + }, + { + "epoch": 2.4235555555555557, + "grad_norm": 2.3028388023376465, + "learning_rate": 0.00010314946619217083, + "loss": 2.0852, + "step": 5453 + }, + { + "epoch": 2.424, + "grad_norm": 1.9444522857666016, + "learning_rate": 0.00010313167259786479, + "loss": 1.6929, + "step": 5454 + }, + { + "epoch": 2.4244444444444446, + "grad_norm": 2.3324930667877197, + "learning_rate": 0.00010311387900355873, + "loss": 1.9656, + "step": 5455 + }, + { + "epoch": 2.424888888888889, + "grad_norm": 2.4078588485717773, + "learning_rate": 0.00010309608540925266, + "loss": 1.8105, + "step": 5456 + }, + { + "epoch": 2.4253333333333336, + "grad_norm": 2.183835983276367, + "learning_rate": 0.00010307829181494661, + "loss": 1.921, + "step": 5457 + }, + { + "epoch": 2.425777777777778, + "grad_norm": 2.3134961128234863, + "learning_rate": 0.00010306049822064057, + "loss": 1.8601, + "step": 5458 + }, + { + "epoch": 2.426222222222222, + "grad_norm": 2.484114408493042, + "learning_rate": 0.00010304270462633451, + "loss": 1.701, + "step": 5459 + }, + { + "epoch": 2.4266666666666667, + "grad_norm": 2.430379867553711, + "learning_rate": 0.00010302491103202847, + "loss": 1.9341, + "step": 
5460 + }, + { + "epoch": 2.427111111111111, + "grad_norm": 2.4869649410247803, + "learning_rate": 0.00010300711743772243, + "loss": 1.9392, + "step": 5461 + }, + { + "epoch": 2.4275555555555557, + "grad_norm": 2.402344226837158, + "learning_rate": 0.00010298932384341637, + "loss": 1.8691, + "step": 5462 + }, + { + "epoch": 2.428, + "grad_norm": 1.7501614093780518, + "learning_rate": 0.00010297153024911033, + "loss": 1.085, + "step": 5463 + }, + { + "epoch": 2.4284444444444446, + "grad_norm": 2.223562002182007, + "learning_rate": 0.00010295373665480428, + "loss": 1.3557, + "step": 5464 + }, + { + "epoch": 2.428888888888889, + "grad_norm": 2.1328225135803223, + "learning_rate": 0.00010293594306049823, + "loss": 1.7067, + "step": 5465 + }, + { + "epoch": 2.429333333333333, + "grad_norm": 2.1772732734680176, + "learning_rate": 0.00010291814946619218, + "loss": 1.5118, + "step": 5466 + }, + { + "epoch": 2.429777777777778, + "grad_norm": 2.320878505706787, + "learning_rate": 0.00010290035587188614, + "loss": 1.3153, + "step": 5467 + }, + { + "epoch": 2.430222222222222, + "grad_norm": 2.253594398498535, + "learning_rate": 0.00010288256227758008, + "loss": 1.8138, + "step": 5468 + }, + { + "epoch": 2.4306666666666668, + "grad_norm": 2.6910340785980225, + "learning_rate": 0.00010286476868327401, + "loss": 2.0876, + "step": 5469 + }, + { + "epoch": 2.431111111111111, + "grad_norm": 2.565636396408081, + "learning_rate": 0.00010284697508896797, + "loss": 1.9388, + "step": 5470 + }, + { + "epoch": 2.4315555555555557, + "grad_norm": 2.60259747505188, + "learning_rate": 0.00010282918149466192, + "loss": 1.5759, + "step": 5471 + }, + { + "epoch": 2.432, + "grad_norm": 2.3194119930267334, + "learning_rate": 0.00010281138790035587, + "loss": 1.8334, + "step": 5472 + }, + { + "epoch": 2.4324444444444446, + "grad_norm": 2.5380783081054688, + "learning_rate": 0.00010279359430604982, + "loss": 1.7694, + "step": 5473 + }, + { + "epoch": 2.432888888888889, + "grad_norm": 2.5655555725097656, + "learning_rate": 0.00010277580071174378, + "loss": 2.0182, + "step": 5474 + }, + { + "epoch": 2.4333333333333336, + "grad_norm": 2.371737241744995, + "learning_rate": 0.00010275800711743772, + "loss": 1.805, + "step": 5475 + }, + { + "epoch": 2.433777777777778, + "grad_norm": 2.5527260303497314, + "learning_rate": 0.00010274021352313168, + "loss": 1.7959, + "step": 5476 + }, + { + "epoch": 2.434222222222222, + "grad_norm": 2.364063262939453, + "learning_rate": 0.00010272241992882564, + "loss": 1.4738, + "step": 5477 + }, + { + "epoch": 2.4346666666666668, + "grad_norm": 3.0714645385742188, + "learning_rate": 0.00010270462633451958, + "loss": 1.7449, + "step": 5478 + }, + { + "epoch": 2.435111111111111, + "grad_norm": 2.63435959815979, + "learning_rate": 0.00010268683274021354, + "loss": 1.9282, + "step": 5479 + }, + { + "epoch": 2.4355555555555557, + "grad_norm": 2.5400550365448, + "learning_rate": 0.00010266903914590749, + "loss": 1.199, + "step": 5480 + }, + { + "epoch": 2.436, + "grad_norm": 2.6152279376983643, + "learning_rate": 0.00010265124555160144, + "loss": 1.7645, + "step": 5481 + }, + { + "epoch": 2.4364444444444446, + "grad_norm": 3.2764761447906494, + "learning_rate": 0.00010263345195729536, + "loss": 1.8478, + "step": 5482 + }, + { + "epoch": 2.436888888888889, + "grad_norm": 2.65189790725708, + "learning_rate": 0.00010261565836298932, + "loss": 1.969, + "step": 5483 + }, + { + "epoch": 2.437333333333333, + "grad_norm": 2.8644859790802, + "learning_rate": 0.00010259786476868328, + "loss": 1.5599, + "step": 5484 
+ }, + { + "epoch": 2.437777777777778, + "grad_norm": 2.7728471755981445, + "learning_rate": 0.00010258007117437722, + "loss": 1.4802, + "step": 5485 + }, + { + "epoch": 2.438222222222222, + "grad_norm": 3.231962203979492, + "learning_rate": 0.00010256227758007118, + "loss": 1.8151, + "step": 5486 + }, + { + "epoch": 2.4386666666666668, + "grad_norm": 2.6833078861236572, + "learning_rate": 0.00010254448398576513, + "loss": 1.4914, + "step": 5487 + }, + { + "epoch": 2.439111111111111, + "grad_norm": 2.837379217147827, + "learning_rate": 0.00010252669039145908, + "loss": 2.1413, + "step": 5488 + }, + { + "epoch": 2.4395555555555557, + "grad_norm": 2.6518139839172363, + "learning_rate": 0.00010250889679715303, + "loss": 1.5023, + "step": 5489 + }, + { + "epoch": 2.44, + "grad_norm": 2.0156285762786865, + "learning_rate": 0.00010249110320284699, + "loss": 0.9281, + "step": 5490 + }, + { + "epoch": 2.4404444444444446, + "grad_norm": 3.069772958755493, + "learning_rate": 0.00010247330960854093, + "loss": 2.1119, + "step": 5491 + }, + { + "epoch": 2.440888888888889, + "grad_norm": 3.10864520072937, + "learning_rate": 0.00010245551601423489, + "loss": 1.5862, + "step": 5492 + }, + { + "epoch": 2.4413333333333336, + "grad_norm": 2.859609365463257, + "learning_rate": 0.00010243772241992885, + "loss": 1.7583, + "step": 5493 + }, + { + "epoch": 2.441777777777778, + "grad_norm": 2.6531405448913574, + "learning_rate": 0.00010241992882562279, + "loss": 1.4576, + "step": 5494 + }, + { + "epoch": 2.442222222222222, + "grad_norm": 3.0856566429138184, + "learning_rate": 0.00010240213523131672, + "loss": 1.6381, + "step": 5495 + }, + { + "epoch": 2.4426666666666668, + "grad_norm": 3.7810099124908447, + "learning_rate": 0.00010238434163701067, + "loss": 2.2101, + "step": 5496 + }, + { + "epoch": 2.443111111111111, + "grad_norm": 3.085653066635132, + "learning_rate": 0.00010236654804270463, + "loss": 1.7354, + "step": 5497 + }, + { + "epoch": 2.4435555555555557, + "grad_norm": 2.9411280155181885, + "learning_rate": 0.00010234875444839857, + "loss": 1.1896, + "step": 5498 + }, + { + "epoch": 2.444, + "grad_norm": 2.1418721675872803, + "learning_rate": 0.00010233096085409253, + "loss": 1.0698, + "step": 5499 + }, + { + "epoch": 2.4444444444444446, + "grad_norm": 2.725583076477051, + "learning_rate": 0.00010231316725978647, + "loss": 0.6198, + "step": 5500 + }, + { + "epoch": 2.444888888888889, + "grad_norm": 1.92898690700531, + "learning_rate": 0.00010229537366548043, + "loss": 2.5948, + "step": 5501 + }, + { + "epoch": 2.445333333333333, + "grad_norm": 1.2224948406219482, + "learning_rate": 0.00010227758007117439, + "loss": 1.0915, + "step": 5502 + }, + { + "epoch": 2.445777777777778, + "grad_norm": 1.8230127096176147, + "learning_rate": 0.00010225978647686833, + "loss": 2.0617, + "step": 5503 + }, + { + "epoch": 2.446222222222222, + "grad_norm": 1.861149787902832, + "learning_rate": 0.00010224199288256229, + "loss": 1.6798, + "step": 5504 + }, + { + "epoch": 2.4466666666666668, + "grad_norm": 1.9252985715866089, + "learning_rate": 0.00010222419928825624, + "loss": 1.9089, + "step": 5505 + }, + { + "epoch": 2.447111111111111, + "grad_norm": 2.1263620853424072, + "learning_rate": 0.00010220640569395019, + "loss": 2.0751, + "step": 5506 + }, + { + "epoch": 2.4475555555555557, + "grad_norm": 2.2096424102783203, + "learning_rate": 0.00010218861209964414, + "loss": 2.0726, + "step": 5507 + }, + { + "epoch": 2.448, + "grad_norm": 2.455075263977051, + "learning_rate": 0.00010217081850533807, + "loss": 2.2006, + "step": 
5508 + }, + { + "epoch": 2.448444444444444, + "grad_norm": 2.1597304344177246, + "learning_rate": 0.00010215302491103203, + "loss": 1.5901, + "step": 5509 + }, + { + "epoch": 2.448888888888889, + "grad_norm": 2.4968745708465576, + "learning_rate": 0.00010213523131672597, + "loss": 1.8659, + "step": 5510 + }, + { + "epoch": 2.449333333333333, + "grad_norm": 2.074950695037842, + "learning_rate": 0.00010211743772241993, + "loss": 2.1795, + "step": 5511 + }, + { + "epoch": 2.449777777777778, + "grad_norm": 2.339791774749756, + "learning_rate": 0.00010209964412811388, + "loss": 1.5882, + "step": 5512 + }, + { + "epoch": 2.450222222222222, + "grad_norm": 2.3237624168395996, + "learning_rate": 0.00010208185053380783, + "loss": 1.4691, + "step": 5513 + }, + { + "epoch": 2.4506666666666668, + "grad_norm": 2.4327070713043213, + "learning_rate": 0.00010206405693950178, + "loss": 1.8784, + "step": 5514 + }, + { + "epoch": 2.451111111111111, + "grad_norm": 2.7746636867523193, + "learning_rate": 0.00010204626334519574, + "loss": 1.7476, + "step": 5515 + }, + { + "epoch": 2.4515555555555557, + "grad_norm": 1.4151685237884521, + "learning_rate": 0.00010202846975088968, + "loss": 0.7311, + "step": 5516 + }, + { + "epoch": 2.452, + "grad_norm": 2.369919776916504, + "learning_rate": 0.00010201067615658364, + "loss": 1.8284, + "step": 5517 + }, + { + "epoch": 2.4524444444444446, + "grad_norm": 2.3550519943237305, + "learning_rate": 0.0001019928825622776, + "loss": 1.4522, + "step": 5518 + }, + { + "epoch": 2.452888888888889, + "grad_norm": 2.1800880432128906, + "learning_rate": 0.00010197508896797154, + "loss": 1.7927, + "step": 5519 + }, + { + "epoch": 2.453333333333333, + "grad_norm": 1.9889439344406128, + "learning_rate": 0.0001019572953736655, + "loss": 1.4447, + "step": 5520 + }, + { + "epoch": 2.453777777777778, + "grad_norm": 2.0936267375946045, + "learning_rate": 0.00010193950177935942, + "loss": 1.6783, + "step": 5521 + }, + { + "epoch": 2.454222222222222, + "grad_norm": 2.577599287033081, + "learning_rate": 0.00010192170818505338, + "loss": 2.0707, + "step": 5522 + }, + { + "epoch": 2.4546666666666668, + "grad_norm": 2.550760507583618, + "learning_rate": 0.00010190391459074732, + "loss": 1.8725, + "step": 5523 + }, + { + "epoch": 2.455111111111111, + "grad_norm": 2.492544651031494, + "learning_rate": 0.00010188612099644128, + "loss": 1.8554, + "step": 5524 + }, + { + "epoch": 2.4555555555555557, + "grad_norm": 2.88720703125, + "learning_rate": 0.00010186832740213524, + "loss": 1.9373, + "step": 5525 + }, + { + "epoch": 2.456, + "grad_norm": 2.423215627670288, + "learning_rate": 0.00010185053380782918, + "loss": 1.8004, + "step": 5526 + }, + { + "epoch": 2.456444444444444, + "grad_norm": 2.4208219051361084, + "learning_rate": 0.00010183274021352314, + "loss": 2.0648, + "step": 5527 + }, + { + "epoch": 2.456888888888889, + "grad_norm": 2.6025261878967285, + "learning_rate": 0.00010181494661921709, + "loss": 1.7313, + "step": 5528 + }, + { + "epoch": 2.457333333333333, + "grad_norm": 1.9392775297164917, + "learning_rate": 0.00010179715302491104, + "loss": 1.209, + "step": 5529 + }, + { + "epoch": 2.457777777777778, + "grad_norm": 2.2096610069274902, + "learning_rate": 0.00010177935943060499, + "loss": 1.6588, + "step": 5530 + }, + { + "epoch": 2.458222222222222, + "grad_norm": 2.5169079303741455, + "learning_rate": 0.00010176156583629895, + "loss": 1.6311, + "step": 5531 + }, + { + "epoch": 2.458666666666667, + "grad_norm": 2.4798898696899414, + "learning_rate": 0.00010174377224199289, + "loss": 
1.4915, + "step": 5532 + }, + { + "epoch": 2.459111111111111, + "grad_norm": 2.8230345249176025, + "learning_rate": 0.00010172597864768685, + "loss": 1.7837, + "step": 5533 + }, + { + "epoch": 2.4595555555555557, + "grad_norm": 2.252345561981201, + "learning_rate": 0.00010170818505338078, + "loss": 1.694, + "step": 5534 + }, + { + "epoch": 2.46, + "grad_norm": 2.8912289142608643, + "learning_rate": 0.00010169039145907473, + "loss": 2.1645, + "step": 5535 + }, + { + "epoch": 2.4604444444444447, + "grad_norm": 2.01261305809021, + "learning_rate": 0.00010167259786476868, + "loss": 0.7449, + "step": 5536 + }, + { + "epoch": 2.460888888888889, + "grad_norm": 2.8200204372406006, + "learning_rate": 0.00010165480427046263, + "loss": 1.5755, + "step": 5537 + }, + { + "epoch": 2.461333333333333, + "grad_norm": 2.4388539791107178, + "learning_rate": 0.00010163701067615659, + "loss": 1.7643, + "step": 5538 + }, + { + "epoch": 2.461777777777778, + "grad_norm": 2.9990639686584473, + "learning_rate": 0.00010161921708185053, + "loss": 1.7626, + "step": 5539 + }, + { + "epoch": 2.462222222222222, + "grad_norm": 2.477353811264038, + "learning_rate": 0.00010160142348754449, + "loss": 1.3518, + "step": 5540 + }, + { + "epoch": 2.462666666666667, + "grad_norm": 2.8211402893066406, + "learning_rate": 0.00010158362989323845, + "loss": 1.5751, + "step": 5541 + }, + { + "epoch": 2.463111111111111, + "grad_norm": 2.7907443046569824, + "learning_rate": 0.00010156583629893239, + "loss": 2.068, + "step": 5542 + }, + { + "epoch": 2.4635555555555557, + "grad_norm": 2.866947650909424, + "learning_rate": 0.00010154804270462635, + "loss": 1.593, + "step": 5543 + }, + { + "epoch": 2.464, + "grad_norm": 3.0840697288513184, + "learning_rate": 0.0001015302491103203, + "loss": 1.6982, + "step": 5544 + }, + { + "epoch": 2.464444444444444, + "grad_norm": 2.5934946537017822, + "learning_rate": 0.00010151245551601424, + "loss": 1.6589, + "step": 5545 + }, + { + "epoch": 2.464888888888889, + "grad_norm": 3.505406141281128, + "learning_rate": 0.0001014946619217082, + "loss": 1.4979, + "step": 5546 + }, + { + "epoch": 2.465333333333333, + "grad_norm": 3.8022685050964355, + "learning_rate": 0.00010147686832740213, + "loss": 2.0308, + "step": 5547 + }, + { + "epoch": 2.465777777777778, + "grad_norm": 3.6010377407073975, + "learning_rate": 0.00010145907473309609, + "loss": 1.9663, + "step": 5548 + }, + { + "epoch": 2.466222222222222, + "grad_norm": 3.2680211067199707, + "learning_rate": 0.00010144128113879003, + "loss": 2.0005, + "step": 5549 + }, + { + "epoch": 2.466666666666667, + "grad_norm": 4.470943927764893, + "learning_rate": 0.00010142348754448399, + "loss": 1.4878, + "step": 5550 + }, + { + "epoch": 2.467111111111111, + "grad_norm": 1.9788882732391357, + "learning_rate": 0.00010140569395017794, + "loss": 1.9776, + "step": 5551 + }, + { + "epoch": 2.4675555555555557, + "grad_norm": 1.8858304023742676, + "learning_rate": 0.00010138790035587189, + "loss": 2.3771, + "step": 5552 + }, + { + "epoch": 2.468, + "grad_norm": 2.0019304752349854, + "learning_rate": 0.00010137010676156584, + "loss": 1.513, + "step": 5553 + }, + { + "epoch": 2.4684444444444447, + "grad_norm": 2.7883894443511963, + "learning_rate": 0.0001013523131672598, + "loss": 2.1866, + "step": 5554 + }, + { + "epoch": 2.468888888888889, + "grad_norm": 2.3653924465179443, + "learning_rate": 0.00010133451957295374, + "loss": 1.6028, + "step": 5555 + }, + { + "epoch": 2.469333333333333, + "grad_norm": 2.258758544921875, + "learning_rate": 0.0001013167259786477, + "loss": 
1.8427, + "step": 5556 + }, + { + "epoch": 2.469777777777778, + "grad_norm": 2.4392614364624023, + "learning_rate": 0.00010129893238434165, + "loss": 1.4794, + "step": 5557 + }, + { + "epoch": 2.470222222222222, + "grad_norm": 2.4021196365356445, + "learning_rate": 0.0001012811387900356, + "loss": 1.7046, + "step": 5558 + }, + { + "epoch": 2.470666666666667, + "grad_norm": 2.3524880409240723, + "learning_rate": 0.00010126334519572955, + "loss": 1.5135, + "step": 5559 + }, + { + "epoch": 2.471111111111111, + "grad_norm": 2.1956753730773926, + "learning_rate": 0.00010124555160142348, + "loss": 1.4995, + "step": 5560 + }, + { + "epoch": 2.4715555555555557, + "grad_norm": 2.2904982566833496, + "learning_rate": 0.00010122775800711744, + "loss": 1.7763, + "step": 5561 + }, + { + "epoch": 2.472, + "grad_norm": 2.2968485355377197, + "learning_rate": 0.00010120996441281138, + "loss": 1.882, + "step": 5562 + }, + { + "epoch": 2.4724444444444442, + "grad_norm": 2.505476951599121, + "learning_rate": 0.00010119217081850534, + "loss": 1.928, + "step": 5563 + }, + { + "epoch": 2.472888888888889, + "grad_norm": 2.5836219787597656, + "learning_rate": 0.0001011743772241993, + "loss": 2.0092, + "step": 5564 + }, + { + "epoch": 2.473333333333333, + "grad_norm": 2.1713929176330566, + "learning_rate": 0.00010115658362989324, + "loss": 1.6092, + "step": 5565 + }, + { + "epoch": 2.473777777777778, + "grad_norm": 2.1562206745147705, + "learning_rate": 0.0001011387900355872, + "loss": 1.1912, + "step": 5566 + }, + { + "epoch": 2.474222222222222, + "grad_norm": 2.7238199710845947, + "learning_rate": 0.00010112099644128115, + "loss": 1.8131, + "step": 5567 + }, + { + "epoch": 2.474666666666667, + "grad_norm": 2.7333614826202393, + "learning_rate": 0.0001011032028469751, + "loss": 2.0719, + "step": 5568 + }, + { + "epoch": 2.475111111111111, + "grad_norm": 1.9092731475830078, + "learning_rate": 0.00010108540925266905, + "loss": 1.5397, + "step": 5569 + }, + { + "epoch": 2.4755555555555557, + "grad_norm": 2.2644593715667725, + "learning_rate": 0.00010106761565836301, + "loss": 1.5289, + "step": 5570 + }, + { + "epoch": 2.476, + "grad_norm": 2.569396734237671, + "learning_rate": 0.00010104982206405695, + "loss": 1.8126, + "step": 5571 + }, + { + "epoch": 2.4764444444444447, + "grad_norm": 2.5156378746032715, + "learning_rate": 0.00010103202846975088, + "loss": 1.542, + "step": 5572 + }, + { + "epoch": 2.476888888888889, + "grad_norm": 2.716794013977051, + "learning_rate": 0.00010101423487544484, + "loss": 1.9589, + "step": 5573 + }, + { + "epoch": 2.477333333333333, + "grad_norm": 2.614689826965332, + "learning_rate": 0.0001009964412811388, + "loss": 2.0686, + "step": 5574 + }, + { + "epoch": 2.477777777777778, + "grad_norm": 3.7736783027648926, + "learning_rate": 0.00010097864768683274, + "loss": 1.7161, + "step": 5575 + }, + { + "epoch": 2.478222222222222, + "grad_norm": 2.7454824447631836, + "learning_rate": 0.00010096085409252669, + "loss": 2.2613, + "step": 5576 + }, + { + "epoch": 2.478666666666667, + "grad_norm": 2.346919298171997, + "learning_rate": 0.00010094306049822065, + "loss": 2.0465, + "step": 5577 + }, + { + "epoch": 2.479111111111111, + "grad_norm": 2.593102216720581, + "learning_rate": 0.00010092526690391459, + "loss": 1.9165, + "step": 5578 + }, + { + "epoch": 2.4795555555555557, + "grad_norm": 2.560601234436035, + "learning_rate": 0.00010090747330960855, + "loss": 1.8571, + "step": 5579 + }, + { + "epoch": 2.48, + "grad_norm": 2.8583500385284424, + "learning_rate": 0.0001008896797153025, + "loss": 
1.9994, + "step": 5580 + }, + { + "epoch": 2.4804444444444442, + "grad_norm": 2.5867176055908203, + "learning_rate": 0.00010087188612099645, + "loss": 1.6889, + "step": 5581 + }, + { + "epoch": 2.480888888888889, + "grad_norm": 6.32763147354126, + "learning_rate": 0.0001008540925266904, + "loss": 1.3415, + "step": 5582 + }, + { + "epoch": 2.481333333333333, + "grad_norm": 2.52264666557312, + "learning_rate": 0.00010083629893238435, + "loss": 1.8925, + "step": 5583 + }, + { + "epoch": 2.481777777777778, + "grad_norm": 2.677546977996826, + "learning_rate": 0.0001008185053380783, + "loss": 2.1266, + "step": 5584 + }, + { + "epoch": 2.482222222222222, + "grad_norm": 2.429607391357422, + "learning_rate": 0.00010080071174377223, + "loss": 1.6861, + "step": 5585 + }, + { + "epoch": 2.482666666666667, + "grad_norm": 2.5945723056793213, + "learning_rate": 0.00010078291814946619, + "loss": 1.3189, + "step": 5586 + }, + { + "epoch": 2.483111111111111, + "grad_norm": 2.5813865661621094, + "learning_rate": 0.00010076512455516015, + "loss": 1.6405, + "step": 5587 + }, + { + "epoch": 2.4835555555555557, + "grad_norm": 2.870296001434326, + "learning_rate": 0.00010074733096085409, + "loss": 1.6053, + "step": 5588 + }, + { + "epoch": 2.484, + "grad_norm": 3.1349499225616455, + "learning_rate": 0.00010072953736654805, + "loss": 1.7858, + "step": 5589 + }, + { + "epoch": 2.4844444444444447, + "grad_norm": 2.9594576358795166, + "learning_rate": 0.00010071174377224199, + "loss": 1.4855, + "step": 5590 + }, + { + "epoch": 2.484888888888889, + "grad_norm": 2.6506004333496094, + "learning_rate": 0.00010069395017793595, + "loss": 1.4352, + "step": 5591 + }, + { + "epoch": 2.485333333333333, + "grad_norm": 2.739713191986084, + "learning_rate": 0.0001006761565836299, + "loss": 1.5386, + "step": 5592 + }, + { + "epoch": 2.485777777777778, + "grad_norm": 2.450587034225464, + "learning_rate": 0.00010065836298932384, + "loss": 1.3184, + "step": 5593 + }, + { + "epoch": 2.486222222222222, + "grad_norm": 3.1459028720855713, + "learning_rate": 0.0001006405693950178, + "loss": 1.6811, + "step": 5594 + }, + { + "epoch": 2.486666666666667, + "grad_norm": 3.288677215576172, + "learning_rate": 0.00010062277580071176, + "loss": 1.7088, + "step": 5595 + }, + { + "epoch": 2.487111111111111, + "grad_norm": 3.5362205505371094, + "learning_rate": 0.0001006049822064057, + "loss": 1.4808, + "step": 5596 + }, + { + "epoch": 2.4875555555555557, + "grad_norm": 3.3700575828552246, + "learning_rate": 0.00010058718861209966, + "loss": 1.9626, + "step": 5597 + }, + { + "epoch": 2.488, + "grad_norm": 5.008380889892578, + "learning_rate": 0.00010056939501779359, + "loss": 1.8212, + "step": 5598 + }, + { + "epoch": 2.4884444444444442, + "grad_norm": 3.4378342628479004, + "learning_rate": 0.00010055160142348754, + "loss": 1.8416, + "step": 5599 + }, + { + "epoch": 2.488888888888889, + "grad_norm": 3.6014068126678467, + "learning_rate": 0.00010053380782918149, + "loss": 1.8809, + "step": 5600 + }, + { + "epoch": 2.489333333333333, + "grad_norm": 1.8102363348007202, + "learning_rate": 0.00010051601423487544, + "loss": 2.2235, + "step": 5601 + }, + { + "epoch": 2.489777777777778, + "grad_norm": 1.5259066820144653, + "learning_rate": 0.0001004982206405694, + "loss": 1.4081, + "step": 5602 + }, + { + "epoch": 2.490222222222222, + "grad_norm": 1.8424686193466187, + "learning_rate": 0.00010048042704626334, + "loss": 2.124, + "step": 5603 + }, + { + "epoch": 2.490666666666667, + "grad_norm": 2.1134793758392334, + "learning_rate": 0.0001004626334519573, + 
"loss": 2.0344, + "step": 5604 + }, + { + "epoch": 2.491111111111111, + "grad_norm": 1.8464834690093994, + "learning_rate": 0.00010044483985765125, + "loss": 1.4534, + "step": 5605 + }, + { + "epoch": 2.4915555555555557, + "grad_norm": 2.2974958419799805, + "learning_rate": 0.0001004270462633452, + "loss": 1.936, + "step": 5606 + }, + { + "epoch": 2.492, + "grad_norm": 2.2461228370666504, + "learning_rate": 0.00010040925266903915, + "loss": 1.9357, + "step": 5607 + }, + { + "epoch": 2.4924444444444447, + "grad_norm": 2.2530407905578613, + "learning_rate": 0.00010039145907473311, + "loss": 2.0635, + "step": 5608 + }, + { + "epoch": 2.492888888888889, + "grad_norm": 1.6925837993621826, + "learning_rate": 0.00010037366548042705, + "loss": 0.9727, + "step": 5609 + }, + { + "epoch": 2.493333333333333, + "grad_norm": 1.5781002044677734, + "learning_rate": 0.00010035587188612101, + "loss": 1.0898, + "step": 5610 + }, + { + "epoch": 2.493777777777778, + "grad_norm": 2.560060501098633, + "learning_rate": 0.00010033807829181494, + "loss": 1.6789, + "step": 5611 + }, + { + "epoch": 2.494222222222222, + "grad_norm": 2.309338092803955, + "learning_rate": 0.0001003202846975089, + "loss": 1.7006, + "step": 5612 + }, + { + "epoch": 2.494666666666667, + "grad_norm": 2.1079952716827393, + "learning_rate": 0.00010030249110320284, + "loss": 1.9404, + "step": 5613 + }, + { + "epoch": 2.495111111111111, + "grad_norm": 2.587090253829956, + "learning_rate": 0.0001002846975088968, + "loss": 2.0718, + "step": 5614 + }, + { + "epoch": 2.4955555555555557, + "grad_norm": 2.6044342517852783, + "learning_rate": 0.00010026690391459075, + "loss": 1.4223, + "step": 5615 + }, + { + "epoch": 2.496, + "grad_norm": 2.266907215118408, + "learning_rate": 0.0001002491103202847, + "loss": 1.8536, + "step": 5616 + }, + { + "epoch": 2.4964444444444442, + "grad_norm": 2.3977086544036865, + "learning_rate": 0.00010023131672597865, + "loss": 1.5307, + "step": 5617 + }, + { + "epoch": 2.496888888888889, + "grad_norm": 2.1894779205322266, + "learning_rate": 0.00010021352313167261, + "loss": 1.6952, + "step": 5618 + }, + { + "epoch": 2.497333333333333, + "grad_norm": 2.3074259757995605, + "learning_rate": 0.00010019572953736655, + "loss": 1.7691, + "step": 5619 + }, + { + "epoch": 2.497777777777778, + "grad_norm": 2.2768285274505615, + "learning_rate": 0.00010017793594306051, + "loss": 1.696, + "step": 5620 + }, + { + "epoch": 2.498222222222222, + "grad_norm": 2.4698994159698486, + "learning_rate": 0.00010016014234875446, + "loss": 1.4205, + "step": 5621 + }, + { + "epoch": 2.498666666666667, + "grad_norm": 2.3182806968688965, + "learning_rate": 0.00010014234875444841, + "loss": 1.7386, + "step": 5622 + }, + { + "epoch": 2.499111111111111, + "grad_norm": 2.212176561355591, + "learning_rate": 0.00010012455516014236, + "loss": 1.4822, + "step": 5623 + }, + { + "epoch": 2.4995555555555553, + "grad_norm": 2.8632688522338867, + "learning_rate": 0.00010010676156583629, + "loss": 1.9683, + "step": 5624 + }, + { + "epoch": 2.5, + "grad_norm": 2.298098087310791, + "learning_rate": 0.00010008896797153025, + "loss": 1.865, + "step": 5625 + }, + { + "epoch": 2.5004444444444447, + "grad_norm": 1.8092749118804932, + "learning_rate": 0.00010007117437722419, + "loss": 0.9924, + "step": 5626 + }, + { + "epoch": 2.500888888888889, + "grad_norm": 3.5151941776275635, + "learning_rate": 0.00010005338078291815, + "loss": 2.314, + "step": 5627 + }, + { + "epoch": 2.501333333333333, + "grad_norm": 1.888319730758667, + "learning_rate": 0.0001000355871886121, + 
"loss": 0.9312, + "step": 5628 + }, + { + "epoch": 2.501777777777778, + "grad_norm": 3.430162191390991, + "learning_rate": 0.00010001779359430605, + "loss": 1.7719, + "step": 5629 + }, + { + "epoch": 2.502222222222222, + "grad_norm": 2.4954025745391846, + "learning_rate": 0.0001, + "loss": 1.4474, + "step": 5630 + }, + { + "epoch": 2.502666666666667, + "grad_norm": 2.7440812587738037, + "learning_rate": 9.998220640569396e-05, + "loss": 2.1656, + "step": 5631 + }, + { + "epoch": 2.503111111111111, + "grad_norm": 2.9336841106414795, + "learning_rate": 9.99644128113879e-05, + "loss": 1.767, + "step": 5632 + }, + { + "epoch": 2.5035555555555558, + "grad_norm": 2.7770421504974365, + "learning_rate": 9.994661921708186e-05, + "loss": 1.8719, + "step": 5633 + }, + { + "epoch": 2.504, + "grad_norm": 3.4708054065704346, + "learning_rate": 9.99288256227758e-05, + "loss": 2.0225, + "step": 5634 + }, + { + "epoch": 2.5044444444444443, + "grad_norm": 3.865091323852539, + "learning_rate": 9.991103202846975e-05, + "loss": 1.9943, + "step": 5635 + }, + { + "epoch": 2.504888888888889, + "grad_norm": 2.653648853302002, + "learning_rate": 9.98932384341637e-05, + "loss": 1.7085, + "step": 5636 + }, + { + "epoch": 2.505333333333333, + "grad_norm": 3.2496232986450195, + "learning_rate": 9.987544483985766e-05, + "loss": 1.5987, + "step": 5637 + }, + { + "epoch": 2.505777777777778, + "grad_norm": 2.717987537384033, + "learning_rate": 9.98576512455516e-05, + "loss": 1.5691, + "step": 5638 + }, + { + "epoch": 2.506222222222222, + "grad_norm": 2.918710470199585, + "learning_rate": 9.983985765124556e-05, + "loss": 1.9691, + "step": 5639 + }, + { + "epoch": 2.506666666666667, + "grad_norm": 3.100358247756958, + "learning_rate": 9.98220640569395e-05, + "loss": 1.907, + "step": 5640 + }, + { + "epoch": 2.507111111111111, + "grad_norm": 2.9432833194732666, + "learning_rate": 9.980427046263346e-05, + "loss": 1.9512, + "step": 5641 + }, + { + "epoch": 2.5075555555555553, + "grad_norm": 3.8566741943359375, + "learning_rate": 9.97864768683274e-05, + "loss": 2.0891, + "step": 5642 + }, + { + "epoch": 2.508, + "grad_norm": 2.724581003189087, + "learning_rate": 9.976868327402136e-05, + "loss": 1.4336, + "step": 5643 + }, + { + "epoch": 2.5084444444444447, + "grad_norm": 2.8649446964263916, + "learning_rate": 9.975088967971531e-05, + "loss": 1.8689, + "step": 5644 + }, + { + "epoch": 2.508888888888889, + "grad_norm": 2.8804287910461426, + "learning_rate": 9.973309608540926e-05, + "loss": 1.902, + "step": 5645 + }, + { + "epoch": 2.509333333333333, + "grad_norm": 2.8557331562042236, + "learning_rate": 9.971530249110321e-05, + "loss": 1.7273, + "step": 5646 + }, + { + "epoch": 2.509777777777778, + "grad_norm": 3.825040340423584, + "learning_rate": 9.969750889679716e-05, + "loss": 2.0143, + "step": 5647 + }, + { + "epoch": 2.510222222222222, + "grad_norm": 2.0822556018829346, + "learning_rate": 9.96797153024911e-05, + "loss": 0.7991, + "step": 5648 + }, + { + "epoch": 2.510666666666667, + "grad_norm": 4.02184534072876, + "learning_rate": 9.966192170818506e-05, + "loss": 1.6318, + "step": 5649 + }, + { + "epoch": 2.511111111111111, + "grad_norm": 3.868868589401245, + "learning_rate": 9.964412811387901e-05, + "loss": 1.7922, + "step": 5650 + }, + { + "epoch": 2.5115555555555558, + "grad_norm": 1.9891340732574463, + "learning_rate": 9.962633451957296e-05, + "loss": 1.9876, + "step": 5651 + }, + { + "epoch": 2.512, + "grad_norm": 2.9037976264953613, + "learning_rate": 9.960854092526691e-05, + "loss": 2.0403, + "step": 5652 + }, + { + 
"epoch": 2.5124444444444443, + "grad_norm": 2.0764451026916504, + "learning_rate": 9.959074733096086e-05, + "loss": 2.0316, + "step": 5653 + }, + { + "epoch": 2.512888888888889, + "grad_norm": 2.249202251434326, + "learning_rate": 9.957295373665481e-05, + "loss": 1.8978, + "step": 5654 + }, + { + "epoch": 2.513333333333333, + "grad_norm": 2.4839189052581787, + "learning_rate": 9.955516014234875e-05, + "loss": 2.0246, + "step": 5655 + }, + { + "epoch": 2.513777777777778, + "grad_norm": 2.4105618000030518, + "learning_rate": 9.953736654804271e-05, + "loss": 1.8856, + "step": 5656 + }, + { + "epoch": 2.514222222222222, + "grad_norm": 1.5155545473098755, + "learning_rate": 9.951957295373667e-05, + "loss": 1.0195, + "step": 5657 + }, + { + "epoch": 2.514666666666667, + "grad_norm": 2.674628973007202, + "learning_rate": 9.950177935943061e-05, + "loss": 2.1171, + "step": 5658 + }, + { + "epoch": 2.515111111111111, + "grad_norm": 2.4236528873443604, + "learning_rate": 9.948398576512457e-05, + "loss": 1.9027, + "step": 5659 + }, + { + "epoch": 2.5155555555555553, + "grad_norm": 2.4038853645324707, + "learning_rate": 9.946619217081851e-05, + "loss": 1.5235, + "step": 5660 + }, + { + "epoch": 2.516, + "grad_norm": 2.637871026992798, + "learning_rate": 9.944839857651245e-05, + "loss": 2.012, + "step": 5661 + }, + { + "epoch": 2.5164444444444447, + "grad_norm": 2.1370859146118164, + "learning_rate": 9.943060498220641e-05, + "loss": 1.6912, + "step": 5662 + }, + { + "epoch": 2.516888888888889, + "grad_norm": 1.37175452709198, + "learning_rate": 9.941281138790037e-05, + "loss": 0.8939, + "step": 5663 + }, + { + "epoch": 2.517333333333333, + "grad_norm": 2.60074520111084, + "learning_rate": 9.939501779359431e-05, + "loss": 2.1094, + "step": 5664 + }, + { + "epoch": 2.517777777777778, + "grad_norm": 2.421234607696533, + "learning_rate": 9.937722419928827e-05, + "loss": 2.2255, + "step": 5665 + }, + { + "epoch": 2.518222222222222, + "grad_norm": 1.9047367572784424, + "learning_rate": 9.935943060498221e-05, + "loss": 1.5189, + "step": 5666 + }, + { + "epoch": 2.518666666666667, + "grad_norm": 2.2271924018859863, + "learning_rate": 9.934163701067616e-05, + "loss": 1.6326, + "step": 5667 + }, + { + "epoch": 2.519111111111111, + "grad_norm": 2.0416018962860107, + "learning_rate": 9.932384341637011e-05, + "loss": 1.1365, + "step": 5668 + }, + { + "epoch": 2.5195555555555558, + "grad_norm": 2.345722198486328, + "learning_rate": 9.930604982206406e-05, + "loss": 1.771, + "step": 5669 + }, + { + "epoch": 2.52, + "grad_norm": 2.515684127807617, + "learning_rate": 9.928825622775802e-05, + "loss": 1.5751, + "step": 5670 + }, + { + "epoch": 2.5204444444444443, + "grad_norm": 2.2543230056762695, + "learning_rate": 9.927046263345196e-05, + "loss": 1.505, + "step": 5671 + }, + { + "epoch": 2.520888888888889, + "grad_norm": 2.895059585571289, + "learning_rate": 9.92526690391459e-05, + "loss": 2.2744, + "step": 5672 + }, + { + "epoch": 2.521333333333333, + "grad_norm": 2.4043800830841064, + "learning_rate": 9.923487544483986e-05, + "loss": 1.5109, + "step": 5673 + }, + { + "epoch": 2.521777777777778, + "grad_norm": 2.604374885559082, + "learning_rate": 9.92170818505338e-05, + "loss": 1.5665, + "step": 5674 + }, + { + "epoch": 2.522222222222222, + "grad_norm": 2.587221145629883, + "learning_rate": 9.919928825622776e-05, + "loss": 1.7969, + "step": 5675 + }, + { + "epoch": 2.522666666666667, + "grad_norm": 2.516834259033203, + "learning_rate": 9.918149466192172e-05, + "loss": 1.5522, + "step": 5676 + }, + { + "epoch": 
2.523111111111111, + "grad_norm": 2.5173187255859375, + "learning_rate": 9.916370106761566e-05, + "loss": 1.8983, + "step": 5677 + }, + { + "epoch": 2.5235555555555553, + "grad_norm": 3.197408676147461, + "learning_rate": 9.914590747330962e-05, + "loss": 2.0039, + "step": 5678 + }, + { + "epoch": 2.524, + "grad_norm": 2.5664446353912354, + "learning_rate": 9.912811387900356e-05, + "loss": 1.7915, + "step": 5679 + }, + { + "epoch": 2.5244444444444447, + "grad_norm": 2.8082244396209717, + "learning_rate": 9.91103202846975e-05, + "loss": 2.2351, + "step": 5680 + }, + { + "epoch": 2.524888888888889, + "grad_norm": 2.381884813308716, + "learning_rate": 9.909252669039146e-05, + "loss": 1.3298, + "step": 5681 + }, + { + "epoch": 2.525333333333333, + "grad_norm": 2.747676134109497, + "learning_rate": 9.907473309608542e-05, + "loss": 1.758, + "step": 5682 + }, + { + "epoch": 2.525777777777778, + "grad_norm": 1.99376380443573, + "learning_rate": 9.905693950177936e-05, + "loss": 0.815, + "step": 5683 + }, + { + "epoch": 2.526222222222222, + "grad_norm": 2.863327980041504, + "learning_rate": 9.903914590747332e-05, + "loss": 1.9873, + "step": 5684 + }, + { + "epoch": 2.5266666666666664, + "grad_norm": 2.060922861099243, + "learning_rate": 9.902135231316726e-05, + "loss": 1.206, + "step": 5685 + }, + { + "epoch": 2.527111111111111, + "grad_norm": 2.845263719558716, + "learning_rate": 9.900355871886122e-05, + "loss": 1.7081, + "step": 5686 + }, + { + "epoch": 2.5275555555555558, + "grad_norm": 2.831609010696411, + "learning_rate": 9.898576512455516e-05, + "loss": 1.9307, + "step": 5687 + }, + { + "epoch": 2.528, + "grad_norm": 2.4219014644622803, + "learning_rate": 9.896797153024912e-05, + "loss": 1.2403, + "step": 5688 + }, + { + "epoch": 2.5284444444444443, + "grad_norm": 3.4156317710876465, + "learning_rate": 9.895017793594307e-05, + "loss": 1.9255, + "step": 5689 + }, + { + "epoch": 2.528888888888889, + "grad_norm": 3.083696126937866, + "learning_rate": 9.893238434163702e-05, + "loss": 1.5899, + "step": 5690 + }, + { + "epoch": 2.529333333333333, + "grad_norm": 3.043922185897827, + "learning_rate": 9.891459074733097e-05, + "loss": 1.9652, + "step": 5691 + }, + { + "epoch": 2.529777777777778, + "grad_norm": 2.8423397541046143, + "learning_rate": 9.889679715302491e-05, + "loss": 1.7007, + "step": 5692 + }, + { + "epoch": 2.530222222222222, + "grad_norm": 3.143592596054077, + "learning_rate": 9.887900355871886e-05, + "loss": 1.613, + "step": 5693 + }, + { + "epoch": 2.530666666666667, + "grad_norm": 3.120520830154419, + "learning_rate": 9.886120996441281e-05, + "loss": 1.5702, + "step": 5694 + }, + { + "epoch": 2.531111111111111, + "grad_norm": 3.2891790866851807, + "learning_rate": 9.884341637010677e-05, + "loss": 2.051, + "step": 5695 + }, + { + "epoch": 2.5315555555555553, + "grad_norm": 2.3981990814208984, + "learning_rate": 9.882562277580071e-05, + "loss": 1.1692, + "step": 5696 + }, + { + "epoch": 2.532, + "grad_norm": 2.9288384914398193, + "learning_rate": 9.880782918149467e-05, + "loss": 1.5118, + "step": 5697 + }, + { + "epoch": 2.5324444444444447, + "grad_norm": 2.981546640396118, + "learning_rate": 9.879003558718861e-05, + "loss": 1.6747, + "step": 5698 + }, + { + "epoch": 2.532888888888889, + "grad_norm": 3.5307023525238037, + "learning_rate": 9.877224199288257e-05, + "loss": 1.7474, + "step": 5699 + }, + { + "epoch": 2.533333333333333, + "grad_norm": 5.0346832275390625, + "learning_rate": 9.875444839857651e-05, + "loss": 1.8528, + "step": 5700 + }, + { + "epoch": 2.533777777777778, + 
"grad_norm": 1.5125519037246704, + "learning_rate": 9.873665480427047e-05, + "loss": 1.1769, + "step": 5701 + }, + { + "epoch": 2.534222222222222, + "grad_norm": 1.7165669202804565, + "learning_rate": 9.871886120996443e-05, + "loss": 1.086, + "step": 5702 + }, + { + "epoch": 2.5346666666666664, + "grad_norm": 1.4426929950714111, + "learning_rate": 9.870106761565837e-05, + "loss": 1.1691, + "step": 5703 + }, + { + "epoch": 2.535111111111111, + "grad_norm": 2.2641258239746094, + "learning_rate": 9.868327402135232e-05, + "loss": 2.1713, + "step": 5704 + }, + { + "epoch": 2.535555555555556, + "grad_norm": 2.1632139682769775, + "learning_rate": 9.866548042704627e-05, + "loss": 2.2283, + "step": 5705 + }, + { + "epoch": 2.536, + "grad_norm": 1.8564096689224243, + "learning_rate": 9.864768683274021e-05, + "loss": 1.0078, + "step": 5706 + }, + { + "epoch": 2.5364444444444443, + "grad_norm": 2.37397837638855, + "learning_rate": 9.862989323843417e-05, + "loss": 2.1491, + "step": 5707 + }, + { + "epoch": 2.536888888888889, + "grad_norm": 2.2556726932525635, + "learning_rate": 9.861209964412812e-05, + "loss": 2.0101, + "step": 5708 + }, + { + "epoch": 2.537333333333333, + "grad_norm": 2.226167678833008, + "learning_rate": 9.859430604982207e-05, + "loss": 1.8431, + "step": 5709 + }, + { + "epoch": 2.537777777777778, + "grad_norm": 2.111975908279419, + "learning_rate": 9.857651245551602e-05, + "loss": 1.9402, + "step": 5710 + }, + { + "epoch": 2.538222222222222, + "grad_norm": 2.208085060119629, + "learning_rate": 9.855871886120997e-05, + "loss": 1.8334, + "step": 5711 + }, + { + "epoch": 2.538666666666667, + "grad_norm": 2.404080867767334, + "learning_rate": 9.854092526690392e-05, + "loss": 2.0223, + "step": 5712 + }, + { + "epoch": 2.539111111111111, + "grad_norm": 2.602574586868286, + "learning_rate": 9.852313167259787e-05, + "loss": 2.0228, + "step": 5713 + }, + { + "epoch": 2.5395555555555553, + "grad_norm": 2.617043972015381, + "learning_rate": 9.850533807829182e-05, + "loss": 1.7762, + "step": 5714 + }, + { + "epoch": 2.54, + "grad_norm": 2.011544704437256, + "learning_rate": 9.848754448398578e-05, + "loss": 1.5084, + "step": 5715 + }, + { + "epoch": 2.5404444444444443, + "grad_norm": 2.117644786834717, + "learning_rate": 9.846975088967972e-05, + "loss": 1.6064, + "step": 5716 + }, + { + "epoch": 2.540888888888889, + "grad_norm": 2.198139190673828, + "learning_rate": 9.845195729537368e-05, + "loss": 1.6679, + "step": 5717 + }, + { + "epoch": 2.541333333333333, + "grad_norm": 1.5015946626663208, + "learning_rate": 9.843416370106762e-05, + "loss": 0.9008, + "step": 5718 + }, + { + "epoch": 2.541777777777778, + "grad_norm": 2.3386454582214355, + "learning_rate": 9.841637010676156e-05, + "loss": 1.665, + "step": 5719 + }, + { + "epoch": 2.542222222222222, + "grad_norm": 2.063887357711792, + "learning_rate": 9.839857651245552e-05, + "loss": 1.4986, + "step": 5720 + }, + { + "epoch": 2.5426666666666664, + "grad_norm": 2.3388946056365967, + "learning_rate": 9.838078291814948e-05, + "loss": 2.0517, + "step": 5721 + }, + { + "epoch": 2.543111111111111, + "grad_norm": 2.838296890258789, + "learning_rate": 9.836298932384342e-05, + "loss": 1.8289, + "step": 5722 + }, + { + "epoch": 2.543555555555556, + "grad_norm": 2.528554677963257, + "learning_rate": 9.834519572953738e-05, + "loss": 2.0108, + "step": 5723 + }, + { + "epoch": 2.544, + "grad_norm": 3.278167963027954, + "learning_rate": 9.832740213523132e-05, + "loss": 0.9733, + "step": 5724 + }, + { + "epoch": 2.5444444444444443, + "grad_norm": 
2.756471633911133, + "learning_rate": 9.830960854092526e-05, + "loss": 1.7284, + "step": 5725 + }, + { + "epoch": 2.544888888888889, + "grad_norm": 2.6623575687408447, + "learning_rate": 9.829181494661922e-05, + "loss": 1.8741, + "step": 5726 + }, + { + "epoch": 2.5453333333333332, + "grad_norm": 2.228694438934326, + "learning_rate": 9.827402135231318e-05, + "loss": 1.5795, + "step": 5727 + }, + { + "epoch": 2.545777777777778, + "grad_norm": 1.9252150058746338, + "learning_rate": 9.825622775800712e-05, + "loss": 1.038, + "step": 5728 + }, + { + "epoch": 2.546222222222222, + "grad_norm": 2.6613807678222656, + "learning_rate": 9.823843416370107e-05, + "loss": 1.9298, + "step": 5729 + }, + { + "epoch": 2.546666666666667, + "grad_norm": 2.613647222518921, + "learning_rate": 9.822064056939502e-05, + "loss": 1.8275, + "step": 5730 + }, + { + "epoch": 2.547111111111111, + "grad_norm": 2.9346303939819336, + "learning_rate": 9.820284697508897e-05, + "loss": 1.9397, + "step": 5731 + }, + { + "epoch": 2.5475555555555554, + "grad_norm": 2.4454238414764404, + "learning_rate": 9.818505338078292e-05, + "loss": 1.6681, + "step": 5732 + }, + { + "epoch": 2.548, + "grad_norm": 2.5040907859802246, + "learning_rate": 9.816725978647687e-05, + "loss": 1.2844, + "step": 5733 + }, + { + "epoch": 2.5484444444444443, + "grad_norm": 2.631068706512451, + "learning_rate": 9.814946619217083e-05, + "loss": 1.482, + "step": 5734 + }, + { + "epoch": 2.548888888888889, + "grad_norm": 2.5751149654388428, + "learning_rate": 9.813167259786477e-05, + "loss": 1.7342, + "step": 5735 + }, + { + "epoch": 2.5493333333333332, + "grad_norm": 2.755535840988159, + "learning_rate": 9.811387900355873e-05, + "loss": 1.775, + "step": 5736 + }, + { + "epoch": 2.549777777777778, + "grad_norm": 3.007052421569824, + "learning_rate": 9.809608540925267e-05, + "loss": 1.8263, + "step": 5737 + }, + { + "epoch": 2.550222222222222, + "grad_norm": 2.7909440994262695, + "learning_rate": 9.807829181494662e-05, + "loss": 1.7401, + "step": 5738 + }, + { + "epoch": 2.5506666666666664, + "grad_norm": 3.113389492034912, + "learning_rate": 9.806049822064057e-05, + "loss": 1.4822, + "step": 5739 + }, + { + "epoch": 2.551111111111111, + "grad_norm": 3.607086420059204, + "learning_rate": 9.804270462633453e-05, + "loss": 2.3029, + "step": 5740 + }, + { + "epoch": 2.551555555555556, + "grad_norm": 2.5248825550079346, + "learning_rate": 9.802491103202847e-05, + "loss": 1.6365, + "step": 5741 + }, + { + "epoch": 2.552, + "grad_norm": 3.1107113361358643, + "learning_rate": 9.800711743772243e-05, + "loss": 1.6821, + "step": 5742 + }, + { + "epoch": 2.5524444444444443, + "grad_norm": 3.095695734024048, + "learning_rate": 9.798932384341637e-05, + "loss": 1.9611, + "step": 5743 + }, + { + "epoch": 2.552888888888889, + "grad_norm": 3.119293212890625, + "learning_rate": 9.797153024911033e-05, + "loss": 1.8032, + "step": 5744 + }, + { + "epoch": 2.5533333333333332, + "grad_norm": 3.1017251014709473, + "learning_rate": 9.795373665480427e-05, + "loss": 1.3616, + "step": 5745 + }, + { + "epoch": 2.553777777777778, + "grad_norm": 3.225269317626953, + "learning_rate": 9.793594306049823e-05, + "loss": 1.6475, + "step": 5746 + }, + { + "epoch": 2.554222222222222, + "grad_norm": 3.819324254989624, + "learning_rate": 9.791814946619218e-05, + "loss": 1.8154, + "step": 5747 + }, + { + "epoch": 2.554666666666667, + "grad_norm": 1.9647773504257202, + "learning_rate": 9.790035587188613e-05, + "loss": 0.7431, + "step": 5748 + }, + { + "epoch": 2.555111111111111, + "grad_norm": 
3.4745988845825195, + "learning_rate": 9.788256227758008e-05, + "loss": 1.7861, + "step": 5749 + }, + { + "epoch": 2.5555555555555554, + "grad_norm": 4.805171489715576, + "learning_rate": 9.786476868327403e-05, + "loss": 1.0196, + "step": 5750 + }, + { + "epoch": 2.556, + "grad_norm": 1.983173131942749, + "learning_rate": 9.784697508896797e-05, + "loss": 2.2983, + "step": 5751 + }, + { + "epoch": 2.5564444444444443, + "grad_norm": 1.9651044607162476, + "learning_rate": 9.782918149466193e-05, + "loss": 1.9459, + "step": 5752 + }, + { + "epoch": 2.556888888888889, + "grad_norm": 2.155700445175171, + "learning_rate": 9.781138790035588e-05, + "loss": 2.1677, + "step": 5753 + }, + { + "epoch": 2.5573333333333332, + "grad_norm": 0.1379898637533188, + "learning_rate": 9.779359430604982e-05, + "loss": 0.0167, + "step": 5754 + }, + { + "epoch": 2.557777777777778, + "grad_norm": 2.319441080093384, + "learning_rate": 9.777580071174378e-05, + "loss": 2.1024, + "step": 5755 + }, + { + "epoch": 2.558222222222222, + "grad_norm": 2.3348388671875, + "learning_rate": 9.775800711743772e-05, + "loss": 2.4375, + "step": 5756 + }, + { + "epoch": 2.5586666666666664, + "grad_norm": 2.42160964012146, + "learning_rate": 9.774021352313168e-05, + "loss": 2.0312, + "step": 5757 + }, + { + "epoch": 2.559111111111111, + "grad_norm": 2.4215610027313232, + "learning_rate": 9.772241992882562e-05, + "loss": 2.1268, + "step": 5758 + }, + { + "epoch": 2.559555555555556, + "grad_norm": 1.9341044425964355, + "learning_rate": 9.770462633451958e-05, + "loss": 1.6431, + "step": 5759 + }, + { + "epoch": 2.56, + "grad_norm": 2.569185972213745, + "learning_rate": 9.768683274021354e-05, + "loss": 1.9818, + "step": 5760 + }, + { + "epoch": 2.5604444444444443, + "grad_norm": 2.380908727645874, + "learning_rate": 9.766903914590748e-05, + "loss": 1.6281, + "step": 5761 + }, + { + "epoch": 2.560888888888889, + "grad_norm": 2.445352554321289, + "learning_rate": 9.765124555160144e-05, + "loss": 1.7746, + "step": 5762 + }, + { + "epoch": 2.5613333333333332, + "grad_norm": 1.9633126258850098, + "learning_rate": 9.763345195729538e-05, + "loss": 1.2796, + "step": 5763 + }, + { + "epoch": 2.561777777777778, + "grad_norm": 2.4921209812164307, + "learning_rate": 9.761565836298932e-05, + "loss": 2.1531, + "step": 5764 + }, + { + "epoch": 2.562222222222222, + "grad_norm": 1.6959223747253418, + "learning_rate": 9.759786476868328e-05, + "loss": 0.8272, + "step": 5765 + }, + { + "epoch": 2.562666666666667, + "grad_norm": 2.389455556869507, + "learning_rate": 9.758007117437723e-05, + "loss": 1.7277, + "step": 5766 + }, + { + "epoch": 2.563111111111111, + "grad_norm": 1.7974770069122314, + "learning_rate": 9.756227758007118e-05, + "loss": 1.1353, + "step": 5767 + }, + { + "epoch": 2.5635555555555554, + "grad_norm": 2.2047617435455322, + "learning_rate": 9.754448398576513e-05, + "loss": 1.7808, + "step": 5768 + }, + { + "epoch": 2.564, + "grad_norm": 3.0488312244415283, + "learning_rate": 9.752669039145908e-05, + "loss": 2.2577, + "step": 5769 + }, + { + "epoch": 2.5644444444444443, + "grad_norm": 2.719773054122925, + "learning_rate": 9.750889679715302e-05, + "loss": 2.1927, + "step": 5770 + }, + { + "epoch": 2.564888888888889, + "grad_norm": 2.654473304748535, + "learning_rate": 9.749110320284698e-05, + "loss": 1.8773, + "step": 5771 + }, + { + "epoch": 2.5653333333333332, + "grad_norm": 2.6189746856689453, + "learning_rate": 9.747330960854093e-05, + "loss": 1.5834, + "step": 5772 + }, + { + "epoch": 2.565777777777778, + "grad_norm": 2.9932594299316406, + 
"learning_rate": 9.745551601423488e-05, + "loss": 2.3835, + "step": 5773 + }, + { + "epoch": 2.566222222222222, + "grad_norm": 2.437887668609619, + "learning_rate": 9.743772241992883e-05, + "loss": 1.7243, + "step": 5774 + }, + { + "epoch": 2.5666666666666664, + "grad_norm": 2.175389051437378, + "learning_rate": 9.741992882562279e-05, + "loss": 1.4495, + "step": 5775 + }, + { + "epoch": 2.567111111111111, + "grad_norm": 2.9918575286865234, + "learning_rate": 9.740213523131673e-05, + "loss": 2.0336, + "step": 5776 + }, + { + "epoch": 2.567555555555556, + "grad_norm": 2.525195360183716, + "learning_rate": 9.738434163701067e-05, + "loss": 1.6138, + "step": 5777 + }, + { + "epoch": 2.568, + "grad_norm": 2.7021284103393555, + "learning_rate": 9.736654804270463e-05, + "loss": 1.7927, + "step": 5778 + }, + { + "epoch": 2.5684444444444443, + "grad_norm": 2.420149087905884, + "learning_rate": 9.734875444839859e-05, + "loss": 1.7339, + "step": 5779 + }, + { + "epoch": 2.568888888888889, + "grad_norm": 2.3807191848754883, + "learning_rate": 9.733096085409253e-05, + "loss": 0.6999, + "step": 5780 + }, + { + "epoch": 2.5693333333333332, + "grad_norm": 2.4973058700561523, + "learning_rate": 9.731316725978649e-05, + "loss": 1.6894, + "step": 5781 + }, + { + "epoch": 2.569777777777778, + "grad_norm": 2.848585844039917, + "learning_rate": 9.729537366548043e-05, + "loss": 1.9012, + "step": 5782 + }, + { + "epoch": 2.570222222222222, + "grad_norm": 2.616931676864624, + "learning_rate": 9.727758007117437e-05, + "loss": 1.5713, + "step": 5783 + }, + { + "epoch": 2.570666666666667, + "grad_norm": 2.4025752544403076, + "learning_rate": 9.725978647686833e-05, + "loss": 1.6347, + "step": 5784 + }, + { + "epoch": 2.571111111111111, + "grad_norm": 2.759446620941162, + "learning_rate": 9.724199288256229e-05, + "loss": 1.5939, + "step": 5785 + }, + { + "epoch": 2.5715555555555554, + "grad_norm": 2.421267032623291, + "learning_rate": 9.722419928825623e-05, + "loss": 1.3653, + "step": 5786 + }, + { + "epoch": 2.572, + "grad_norm": 3.161170482635498, + "learning_rate": 9.720640569395019e-05, + "loss": 1.5433, + "step": 5787 + }, + { + "epoch": 2.5724444444444443, + "grad_norm": 3.038508653640747, + "learning_rate": 9.718861209964413e-05, + "loss": 1.4943, + "step": 5788 + }, + { + "epoch": 2.572888888888889, + "grad_norm": 2.3798742294311523, + "learning_rate": 9.717081850533809e-05, + "loss": 1.1891, + "step": 5789 + }, + { + "epoch": 2.5733333333333333, + "grad_norm": 2.9611308574676514, + "learning_rate": 9.715302491103203e-05, + "loss": 1.6984, + "step": 5790 + }, + { + "epoch": 2.573777777777778, + "grad_norm": 3.015956401824951, + "learning_rate": 9.713523131672598e-05, + "loss": 1.6971, + "step": 5791 + }, + { + "epoch": 2.574222222222222, + "grad_norm": 2.819864273071289, + "learning_rate": 9.711743772241994e-05, + "loss": 1.5978, + "step": 5792 + }, + { + "epoch": 2.5746666666666664, + "grad_norm": 3.813222885131836, + "learning_rate": 9.709964412811388e-05, + "loss": 1.9956, + "step": 5793 + }, + { + "epoch": 2.575111111111111, + "grad_norm": 2.9637928009033203, + "learning_rate": 9.708185053380784e-05, + "loss": 1.7197, + "step": 5794 + }, + { + "epoch": 2.575555555555556, + "grad_norm": 3.4697177410125732, + "learning_rate": 9.706405693950178e-05, + "loss": 1.3374, + "step": 5795 + }, + { + "epoch": 2.576, + "grad_norm": 3.505645751953125, + "learning_rate": 9.704626334519573e-05, + "loss": 1.9452, + "step": 5796 + }, + { + "epoch": 2.5764444444444443, + "grad_norm": 3.6183602809906006, + "learning_rate": 
9.702846975088968e-05, + "loss": 1.6057, + "step": 5797 + }, + { + "epoch": 2.576888888888889, + "grad_norm": 3.725675106048584, + "learning_rate": 9.701067615658364e-05, + "loss": 1.6473, + "step": 5798 + }, + { + "epoch": 2.5773333333333333, + "grad_norm": 2.1364715099334717, + "learning_rate": 9.699288256227758e-05, + "loss": 0.8211, + "step": 5799 + }, + { + "epoch": 2.5777777777777775, + "grad_norm": 4.8051934242248535, + "learning_rate": 9.697508896797154e-05, + "loss": 1.2673, + "step": 5800 + }, + { + "epoch": 2.578222222222222, + "grad_norm": 1.737740397453308, + "learning_rate": 9.695729537366548e-05, + "loss": 2.0515, + "step": 5801 + }, + { + "epoch": 2.578666666666667, + "grad_norm": 1.431738018989563, + "learning_rate": 9.693950177935944e-05, + "loss": 1.0424, + "step": 5802 + }, + { + "epoch": 2.579111111111111, + "grad_norm": 1.8884087800979614, + "learning_rate": 9.692170818505338e-05, + "loss": 2.2214, + "step": 5803 + }, + { + "epoch": 2.5795555555555554, + "grad_norm": 2.2000203132629395, + "learning_rate": 9.690391459074734e-05, + "loss": 2.3403, + "step": 5804 + }, + { + "epoch": 2.58, + "grad_norm": 1.952431559562683, + "learning_rate": 9.68861209964413e-05, + "loss": 2.2942, + "step": 5805 + }, + { + "epoch": 2.5804444444444443, + "grad_norm": 2.162529706954956, + "learning_rate": 9.686832740213524e-05, + "loss": 2.1921, + "step": 5806 + }, + { + "epoch": 2.580888888888889, + "grad_norm": 2.1220896244049072, + "learning_rate": 9.68505338078292e-05, + "loss": 1.914, + "step": 5807 + }, + { + "epoch": 2.5813333333333333, + "grad_norm": 1.9332010746002197, + "learning_rate": 9.683274021352314e-05, + "loss": 1.4451, + "step": 5808 + }, + { + "epoch": 2.581777777777778, + "grad_norm": 2.8427624702453613, + "learning_rate": 9.681494661921708e-05, + "loss": 2.0637, + "step": 5809 + }, + { + "epoch": 2.582222222222222, + "grad_norm": 1.7098627090454102, + "learning_rate": 9.679715302491104e-05, + "loss": 1.2616, + "step": 5810 + }, + { + "epoch": 2.5826666666666664, + "grad_norm": 2.2705390453338623, + "learning_rate": 9.677935943060499e-05, + "loss": 1.9668, + "step": 5811 + }, + { + "epoch": 2.583111111111111, + "grad_norm": 2.204101800918579, + "learning_rate": 9.676156583629894e-05, + "loss": 1.8447, + "step": 5812 + }, + { + "epoch": 2.583555555555556, + "grad_norm": 2.294118642807007, + "learning_rate": 9.674377224199289e-05, + "loss": 1.7681, + "step": 5813 + }, + { + "epoch": 2.584, + "grad_norm": 2.175595760345459, + "learning_rate": 9.672597864768683e-05, + "loss": 1.9735, + "step": 5814 + }, + { + "epoch": 2.5844444444444443, + "grad_norm": 2.1814606189727783, + "learning_rate": 9.670818505338078e-05, + "loss": 1.709, + "step": 5815 + }, + { + "epoch": 2.584888888888889, + "grad_norm": 2.43196702003479, + "learning_rate": 9.669039145907473e-05, + "loss": 2.0574, + "step": 5816 + }, + { + "epoch": 2.5853333333333333, + "grad_norm": 2.466120958328247, + "learning_rate": 9.667259786476869e-05, + "loss": 2.2044, + "step": 5817 + }, + { + "epoch": 2.5857777777777775, + "grad_norm": 1.9092581272125244, + "learning_rate": 9.665480427046263e-05, + "loss": 0.5758, + "step": 5818 + }, + { + "epoch": 2.586222222222222, + "grad_norm": 2.337799310684204, + "learning_rate": 9.663701067615659e-05, + "loss": 2.1761, + "step": 5819 + }, + { + "epoch": 2.586666666666667, + "grad_norm": 2.0676565170288086, + "learning_rate": 9.661921708185055e-05, + "loss": 1.7785, + "step": 5820 + }, + { + "epoch": 2.587111111111111, + "grad_norm": 2.228407382965088, + "learning_rate": 
9.660142348754449e-05, + "loss": 1.3814, + "step": 5821 + }, + { + "epoch": 2.5875555555555554, + "grad_norm": 2.3954479694366455, + "learning_rate": 9.658362989323843e-05, + "loss": 1.3558, + "step": 5822 + }, + { + "epoch": 2.588, + "grad_norm": 2.1830084323883057, + "learning_rate": 9.656583629893239e-05, + "loss": 1.8961, + "step": 5823 + }, + { + "epoch": 2.5884444444444443, + "grad_norm": 2.57344913482666, + "learning_rate": 9.654804270462635e-05, + "loss": 1.8005, + "step": 5824 + }, + { + "epoch": 2.588888888888889, + "grad_norm": 2.3932812213897705, + "learning_rate": 9.653024911032029e-05, + "loss": 1.824, + "step": 5825 + }, + { + "epoch": 2.5893333333333333, + "grad_norm": 2.067878007888794, + "learning_rate": 9.651245551601425e-05, + "loss": 1.7157, + "step": 5826 + }, + { + "epoch": 2.589777777777778, + "grad_norm": 2.737030267715454, + "learning_rate": 9.649466192170819e-05, + "loss": 1.3283, + "step": 5827 + }, + { + "epoch": 2.590222222222222, + "grad_norm": 2.7902398109436035, + "learning_rate": 9.647686832740213e-05, + "loss": 1.8529, + "step": 5828 + }, + { + "epoch": 2.5906666666666665, + "grad_norm": 2.5697999000549316, + "learning_rate": 9.645907473309609e-05, + "loss": 1.7706, + "step": 5829 + }, + { + "epoch": 2.591111111111111, + "grad_norm": 2.486321449279785, + "learning_rate": 9.644128113879004e-05, + "loss": 1.4964, + "step": 5830 + }, + { + "epoch": 2.5915555555555554, + "grad_norm": 2.440593719482422, + "learning_rate": 9.642348754448399e-05, + "loss": 1.732, + "step": 5831 + }, + { + "epoch": 2.592, + "grad_norm": 2.2928342819213867, + "learning_rate": 9.640569395017794e-05, + "loss": 1.0141, + "step": 5832 + }, + { + "epoch": 2.5924444444444443, + "grad_norm": 2.5680768489837646, + "learning_rate": 9.63879003558719e-05, + "loss": 1.3784, + "step": 5833 + }, + { + "epoch": 2.592888888888889, + "grad_norm": 2.402845621109009, + "learning_rate": 9.637010676156584e-05, + "loss": 1.7085, + "step": 5834 + }, + { + "epoch": 2.5933333333333333, + "grad_norm": 1.819101333618164, + "learning_rate": 9.635231316725979e-05, + "loss": 0.8828, + "step": 5835 + }, + { + "epoch": 2.5937777777777775, + "grad_norm": 2.9509596824645996, + "learning_rate": 9.633451957295374e-05, + "loss": 1.4475, + "step": 5836 + }, + { + "epoch": 2.594222222222222, + "grad_norm": 2.3825151920318604, + "learning_rate": 9.63167259786477e-05, + "loss": 1.2385, + "step": 5837 + }, + { + "epoch": 2.594666666666667, + "grad_norm": 3.0059218406677246, + "learning_rate": 9.629893238434164e-05, + "loss": 1.6829, + "step": 5838 + }, + { + "epoch": 2.595111111111111, + "grad_norm": 3.277036190032959, + "learning_rate": 9.62811387900356e-05, + "loss": 1.6309, + "step": 5839 + }, + { + "epoch": 2.5955555555555554, + "grad_norm": 2.9145870208740234, + "learning_rate": 9.626334519572954e-05, + "loss": 2.0603, + "step": 5840 + }, + { + "epoch": 2.596, + "grad_norm": 2.803736686706543, + "learning_rate": 9.624555160142348e-05, + "loss": 1.6505, + "step": 5841 + }, + { + "epoch": 2.5964444444444443, + "grad_norm": 3.3242056369781494, + "learning_rate": 9.622775800711744e-05, + "loss": 1.9847, + "step": 5842 + }, + { + "epoch": 2.596888888888889, + "grad_norm": 3.6770615577697754, + "learning_rate": 9.62099644128114e-05, + "loss": 1.4115, + "step": 5843 + }, + { + "epoch": 2.5973333333333333, + "grad_norm": 2.989746332168579, + "learning_rate": 9.619217081850534e-05, + "loss": 1.7387, + "step": 5844 + }, + { + "epoch": 2.597777777777778, + "grad_norm": 3.3722546100616455, + "learning_rate": 
9.61743772241993e-05, + "loss": 1.6478, + "step": 5845 + }, + { + "epoch": 2.598222222222222, + "grad_norm": 3.2518842220306396, + "learning_rate": 9.615658362989324e-05, + "loss": 1.1471, + "step": 5846 + }, + { + "epoch": 2.5986666666666665, + "grad_norm": 3.7409679889678955, + "learning_rate": 9.61387900355872e-05, + "loss": 1.9251, + "step": 5847 + }, + { + "epoch": 2.599111111111111, + "grad_norm": 3.091366767883301, + "learning_rate": 9.612099644128114e-05, + "loss": 1.5747, + "step": 5848 + }, + { + "epoch": 2.5995555555555554, + "grad_norm": 3.8686630725860596, + "learning_rate": 9.61032028469751e-05, + "loss": 1.0119, + "step": 5849 + }, + { + "epoch": 2.6, + "grad_norm": 2.7347426414489746, + "learning_rate": 9.608540925266905e-05, + "loss": 0.4375, + "step": 5850 + }, + { + "epoch": 2.6004444444444443, + "grad_norm": 1.894957184791565, + "learning_rate": 9.6067615658363e-05, + "loss": 2.3036, + "step": 5851 + }, + { + "epoch": 2.600888888888889, + "grad_norm": 1.9998407363891602, + "learning_rate": 9.604982206405695e-05, + "loss": 2.2295, + "step": 5852 + }, + { + "epoch": 2.6013333333333333, + "grad_norm": 2.1077568531036377, + "learning_rate": 9.60320284697509e-05, + "loss": 1.085, + "step": 5853 + }, + { + "epoch": 2.6017777777777775, + "grad_norm": 2.0301358699798584, + "learning_rate": 9.601423487544484e-05, + "loss": 1.1873, + "step": 5854 + }, + { + "epoch": 2.602222222222222, + "grad_norm": 2.020034074783325, + "learning_rate": 9.59964412811388e-05, + "loss": 1.7447, + "step": 5855 + }, + { + "epoch": 2.602666666666667, + "grad_norm": 2.428344964981079, + "learning_rate": 9.597864768683275e-05, + "loss": 2.155, + "step": 5856 + }, + { + "epoch": 2.603111111111111, + "grad_norm": 2.2994885444641113, + "learning_rate": 9.596085409252669e-05, + "loss": 2.0446, + "step": 5857 + }, + { + "epoch": 2.6035555555555554, + "grad_norm": 2.4764015674591064, + "learning_rate": 9.594306049822065e-05, + "loss": 1.8314, + "step": 5858 + }, + { + "epoch": 2.604, + "grad_norm": 2.4663918018341064, + "learning_rate": 9.592526690391459e-05, + "loss": 1.6086, + "step": 5859 + }, + { + "epoch": 2.6044444444444443, + "grad_norm": 2.3457717895507812, + "learning_rate": 9.590747330960854e-05, + "loss": 1.8214, + "step": 5860 + }, + { + "epoch": 2.604888888888889, + "grad_norm": 1.8627485036849976, + "learning_rate": 9.588967971530249e-05, + "loss": 1.225, + "step": 5861 + }, + { + "epoch": 2.6053333333333333, + "grad_norm": 2.610481023788452, + "learning_rate": 9.587188612099645e-05, + "loss": 1.8318, + "step": 5862 + }, + { + "epoch": 2.605777777777778, + "grad_norm": 2.533219337463379, + "learning_rate": 9.585409252669039e-05, + "loss": 1.8934, + "step": 5863 + }, + { + "epoch": 2.606222222222222, + "grad_norm": 2.506080389022827, + "learning_rate": 9.583629893238435e-05, + "loss": 2.0397, + "step": 5864 + }, + { + "epoch": 2.6066666666666665, + "grad_norm": 2.50927734375, + "learning_rate": 9.58185053380783e-05, + "loss": 1.8956, + "step": 5865 + }, + { + "epoch": 2.607111111111111, + "grad_norm": 2.1902568340301514, + "learning_rate": 9.580071174377225e-05, + "loss": 1.7044, + "step": 5866 + }, + { + "epoch": 2.6075555555555554, + "grad_norm": 2.695941925048828, + "learning_rate": 9.578291814946619e-05, + "loss": 1.936, + "step": 5867 + }, + { + "epoch": 2.608, + "grad_norm": 2.4527158737182617, + "learning_rate": 9.576512455516015e-05, + "loss": 1.4728, + "step": 5868 + }, + { + "epoch": 2.6084444444444443, + "grad_norm": 2.4402294158935547, + "learning_rate": 9.57473309608541e-05, + 
"loss": 1.6587, + "step": 5869 + }, + { + "epoch": 2.608888888888889, + "grad_norm": 2.2047953605651855, + "learning_rate": 9.572953736654805e-05, + "loss": 1.5533, + "step": 5870 + }, + { + "epoch": 2.6093333333333333, + "grad_norm": 2.1342928409576416, + "learning_rate": 9.5711743772242e-05, + "loss": 1.6553, + "step": 5871 + }, + { + "epoch": 2.6097777777777775, + "grad_norm": 2.7137703895568848, + "learning_rate": 9.569395017793595e-05, + "loss": 2.0063, + "step": 5872 + }, + { + "epoch": 2.610222222222222, + "grad_norm": 2.3683669567108154, + "learning_rate": 9.567615658362989e-05, + "loss": 1.4502, + "step": 5873 + }, + { + "epoch": 2.610666666666667, + "grad_norm": 2.7055866718292236, + "learning_rate": 9.565836298932385e-05, + "loss": 2.0081, + "step": 5874 + }, + { + "epoch": 2.611111111111111, + "grad_norm": 2.435533046722412, + "learning_rate": 9.56405693950178e-05, + "loss": 1.4356, + "step": 5875 + }, + { + "epoch": 2.6115555555555554, + "grad_norm": 2.407223701477051, + "learning_rate": 9.562277580071174e-05, + "loss": 1.4788, + "step": 5876 + }, + { + "epoch": 2.612, + "grad_norm": 2.53997540473938, + "learning_rate": 9.56049822064057e-05, + "loss": 1.6948, + "step": 5877 + }, + { + "epoch": 2.6124444444444443, + "grad_norm": 2.5272786617279053, + "learning_rate": 9.558718861209966e-05, + "loss": 2.0246, + "step": 5878 + }, + { + "epoch": 2.612888888888889, + "grad_norm": 2.6360082626342773, + "learning_rate": 9.55693950177936e-05, + "loss": 1.6889, + "step": 5879 + }, + { + "epoch": 2.6133333333333333, + "grad_norm": 2.879133462905884, + "learning_rate": 9.555160142348754e-05, + "loss": 1.7153, + "step": 5880 + }, + { + "epoch": 2.613777777777778, + "grad_norm": 1.858364224433899, + "learning_rate": 9.55338078291815e-05, + "loss": 0.8528, + "step": 5881 + }, + { + "epoch": 2.6142222222222222, + "grad_norm": 2.2637181282043457, + "learning_rate": 9.551601423487546e-05, + "loss": 1.3998, + "step": 5882 + }, + { + "epoch": 2.6146666666666665, + "grad_norm": 2.4725327491760254, + "learning_rate": 9.54982206405694e-05, + "loss": 1.8587, + "step": 5883 + }, + { + "epoch": 2.615111111111111, + "grad_norm": 2.9442496299743652, + "learning_rate": 9.548042704626336e-05, + "loss": 2.0215, + "step": 5884 + }, + { + "epoch": 2.6155555555555554, + "grad_norm": 2.771085739135742, + "learning_rate": 9.54626334519573e-05, + "loss": 1.7668, + "step": 5885 + }, + { + "epoch": 2.616, + "grad_norm": 3.2240281105041504, + "learning_rate": 9.544483985765124e-05, + "loss": 1.9279, + "step": 5886 + }, + { + "epoch": 2.6164444444444444, + "grad_norm": 2.726102590560913, + "learning_rate": 9.54270462633452e-05, + "loss": 1.5733, + "step": 5887 + }, + { + "epoch": 2.616888888888889, + "grad_norm": 2.668149709701538, + "learning_rate": 9.540925266903915e-05, + "loss": 1.562, + "step": 5888 + }, + { + "epoch": 2.6173333333333333, + "grad_norm": 2.835635185241699, + "learning_rate": 9.53914590747331e-05, + "loss": 1.7907, + "step": 5889 + }, + { + "epoch": 2.6177777777777775, + "grad_norm": 3.4420056343078613, + "learning_rate": 9.537366548042705e-05, + "loss": 1.7993, + "step": 5890 + }, + { + "epoch": 2.6182222222222222, + "grad_norm": 3.626722574234009, + "learning_rate": 9.535587188612101e-05, + "loss": 2.1257, + "step": 5891 + }, + { + "epoch": 2.618666666666667, + "grad_norm": 2.6178202629089355, + "learning_rate": 9.533807829181495e-05, + "loss": 1.3251, + "step": 5892 + }, + { + "epoch": 2.619111111111111, + "grad_norm": 2.7256970405578613, + "learning_rate": 9.53202846975089e-05, + "loss": 
1.5753, + "step": 5893 + }, + { + "epoch": 2.6195555555555554, + "grad_norm": 3.222346782684326, + "learning_rate": 9.530249110320285e-05, + "loss": 1.928, + "step": 5894 + }, + { + "epoch": 2.62, + "grad_norm": 3.2846624851226807, + "learning_rate": 9.528469750889681e-05, + "loss": 1.8586, + "step": 5895 + }, + { + "epoch": 2.6204444444444444, + "grad_norm": 3.4086952209472656, + "learning_rate": 9.526690391459075e-05, + "loss": 1.5629, + "step": 5896 + }, + { + "epoch": 2.620888888888889, + "grad_norm": 3.3256282806396484, + "learning_rate": 9.524911032028471e-05, + "loss": 1.6927, + "step": 5897 + }, + { + "epoch": 2.6213333333333333, + "grad_norm": 3.0743277072906494, + "learning_rate": 9.523131672597865e-05, + "loss": 1.4407, + "step": 5898 + }, + { + "epoch": 2.621777777777778, + "grad_norm": 5.507002353668213, + "learning_rate": 9.52135231316726e-05, + "loss": 1.2145, + "step": 5899 + }, + { + "epoch": 2.6222222222222222, + "grad_norm": 4.734825134277344, + "learning_rate": 9.519572953736655e-05, + "loss": 1.7574, + "step": 5900 + }, + { + "epoch": 2.6226666666666665, + "grad_norm": 1.4950133562088013, + "learning_rate": 9.517793594306051e-05, + "loss": 1.0372, + "step": 5901 + }, + { + "epoch": 2.623111111111111, + "grad_norm": 2.024033784866333, + "learning_rate": 9.516014234875445e-05, + "loss": 1.9618, + "step": 5902 + }, + { + "epoch": 2.6235555555555554, + "grad_norm": 2.259506940841675, + "learning_rate": 9.514234875444841e-05, + "loss": 1.7976, + "step": 5903 + }, + { + "epoch": 2.624, + "grad_norm": 2.1952743530273438, + "learning_rate": 9.512455516014235e-05, + "loss": 2.5925, + "step": 5904 + }, + { + "epoch": 2.6244444444444444, + "grad_norm": 2.353161096572876, + "learning_rate": 9.51067615658363e-05, + "loss": 1.9936, + "step": 5905 + }, + { + "epoch": 2.624888888888889, + "grad_norm": 1.892688274383545, + "learning_rate": 9.508896797153025e-05, + "loss": 1.7493, + "step": 5906 + }, + { + "epoch": 2.6253333333333333, + "grad_norm": 2.5873072147369385, + "learning_rate": 9.50711743772242e-05, + "loss": 1.8485, + "step": 5907 + }, + { + "epoch": 2.6257777777777775, + "grad_norm": 1.6135746240615845, + "learning_rate": 9.505338078291815e-05, + "loss": 0.9499, + "step": 5908 + }, + { + "epoch": 2.6262222222222222, + "grad_norm": 2.211524486541748, + "learning_rate": 9.50355871886121e-05, + "loss": 1.6318, + "step": 5909 + }, + { + "epoch": 2.626666666666667, + "grad_norm": 2.18351411819458, + "learning_rate": 9.501779359430606e-05, + "loss": 1.6961, + "step": 5910 + }, + { + "epoch": 2.627111111111111, + "grad_norm": 2.3403427600860596, + "learning_rate": 9.5e-05, + "loss": 1.2993, + "step": 5911 + }, + { + "epoch": 2.6275555555555554, + "grad_norm": 2.16619610786438, + "learning_rate": 9.498220640569395e-05, + "loss": 1.5, + "step": 5912 + }, + { + "epoch": 2.628, + "grad_norm": 2.4509599208831787, + "learning_rate": 9.49644128113879e-05, + "loss": 2.1853, + "step": 5913 + }, + { + "epoch": 2.6284444444444444, + "grad_norm": 2.2944180965423584, + "learning_rate": 9.494661921708186e-05, + "loss": 1.4259, + "step": 5914 + }, + { + "epoch": 2.628888888888889, + "grad_norm": 2.5902411937713623, + "learning_rate": 9.49288256227758e-05, + "loss": 1.5032, + "step": 5915 + }, + { + "epoch": 2.6293333333333333, + "grad_norm": 2.6031863689422607, + "learning_rate": 9.491103202846976e-05, + "loss": 1.9135, + "step": 5916 + }, + { + "epoch": 2.629777777777778, + "grad_norm": 2.3265883922576904, + "learning_rate": 9.48932384341637e-05, + "loss": 1.2576, + "step": 5917 + }, + { + 
"epoch": 2.6302222222222222, + "grad_norm": 2.358459949493408, + "learning_rate": 9.487544483985765e-05, + "loss": 1.8601, + "step": 5918 + }, + { + "epoch": 2.6306666666666665, + "grad_norm": 2.6963040828704834, + "learning_rate": 9.48576512455516e-05, + "loss": 2.0838, + "step": 5919 + }, + { + "epoch": 2.631111111111111, + "grad_norm": 2.430453300476074, + "learning_rate": 9.483985765124556e-05, + "loss": 1.6023, + "step": 5920 + }, + { + "epoch": 2.6315555555555554, + "grad_norm": 2.707322120666504, + "learning_rate": 9.48220640569395e-05, + "loss": 1.5959, + "step": 5921 + }, + { + "epoch": 2.632, + "grad_norm": 2.6505823135375977, + "learning_rate": 9.480427046263346e-05, + "loss": 1.9604, + "step": 5922 + }, + { + "epoch": 2.6324444444444444, + "grad_norm": 2.96512770652771, + "learning_rate": 9.478647686832742e-05, + "loss": 1.7178, + "step": 5923 + }, + { + "epoch": 2.632888888888889, + "grad_norm": 2.6181640625, + "learning_rate": 9.476868327402136e-05, + "loss": 2.2189, + "step": 5924 + }, + { + "epoch": 2.6333333333333333, + "grad_norm": 2.878074884414673, + "learning_rate": 9.47508896797153e-05, + "loss": 1.4863, + "step": 5925 + }, + { + "epoch": 2.6337777777777776, + "grad_norm": 2.7391178607940674, + "learning_rate": 9.473309608540926e-05, + "loss": 1.5675, + "step": 5926 + }, + { + "epoch": 2.6342222222222222, + "grad_norm": 2.5607688426971436, + "learning_rate": 9.471530249110321e-05, + "loss": 1.6423, + "step": 5927 + }, + { + "epoch": 2.634666666666667, + "grad_norm": 2.78110408782959, + "learning_rate": 9.469750889679716e-05, + "loss": 1.7438, + "step": 5928 + }, + { + "epoch": 2.635111111111111, + "grad_norm": 2.7865052223205566, + "learning_rate": 9.467971530249111e-05, + "loss": 1.8183, + "step": 5929 + }, + { + "epoch": 2.6355555555555554, + "grad_norm": 2.619302749633789, + "learning_rate": 9.466192170818506e-05, + "loss": 1.4989, + "step": 5930 + }, + { + "epoch": 2.636, + "grad_norm": 2.7835965156555176, + "learning_rate": 9.4644128113879e-05, + "loss": 1.3163, + "step": 5931 + }, + { + "epoch": 2.6364444444444444, + "grad_norm": 2.692561149597168, + "learning_rate": 9.462633451957296e-05, + "loss": 1.4886, + "step": 5932 + }, + { + "epoch": 2.6368888888888886, + "grad_norm": 2.836426019668579, + "learning_rate": 9.460854092526691e-05, + "loss": 1.9111, + "step": 5933 + }, + { + "epoch": 2.6373333333333333, + "grad_norm": 2.8386855125427246, + "learning_rate": 9.459074733096086e-05, + "loss": 1.8409, + "step": 5934 + }, + { + "epoch": 2.637777777777778, + "grad_norm": 2.6482274532318115, + "learning_rate": 9.457295373665481e-05, + "loss": 1.6835, + "step": 5935 + }, + { + "epoch": 2.6382222222222222, + "grad_norm": 2.3308165073394775, + "learning_rate": 9.455516014234877e-05, + "loss": 1.2158, + "step": 5936 + }, + { + "epoch": 2.6386666666666665, + "grad_norm": 2.602177619934082, + "learning_rate": 9.453736654804271e-05, + "loss": 1.8307, + "step": 5937 + }, + { + "epoch": 2.639111111111111, + "grad_norm": 3.2326931953430176, + "learning_rate": 9.451957295373665e-05, + "loss": 2.0871, + "step": 5938 + }, + { + "epoch": 2.6395555555555554, + "grad_norm": 2.9063456058502197, + "learning_rate": 9.450177935943061e-05, + "loss": 1.7372, + "step": 5939 + }, + { + "epoch": 2.64, + "grad_norm": 2.6239306926727295, + "learning_rate": 9.448398576512457e-05, + "loss": 1.5927, + "step": 5940 + }, + { + "epoch": 2.6404444444444444, + "grad_norm": 3.01645827293396, + "learning_rate": 9.446619217081851e-05, + "loss": 1.5774, + "step": 5941 + }, + { + "epoch": 
2.640888888888889, + "grad_norm": 3.1588776111602783, + "learning_rate": 9.444839857651247e-05, + "loss": 1.6249, + "step": 5942 + }, + { + "epoch": 2.6413333333333333, + "grad_norm": 3.4573142528533936, + "learning_rate": 9.443060498220641e-05, + "loss": 1.8496, + "step": 5943 + }, + { + "epoch": 2.6417777777777776, + "grad_norm": 2.9585986137390137, + "learning_rate": 9.441281138790035e-05, + "loss": 2.2489, + "step": 5944 + }, + { + "epoch": 2.6422222222222222, + "grad_norm": 3.1667559146881104, + "learning_rate": 9.439501779359431e-05, + "loss": 1.7358, + "step": 5945 + }, + { + "epoch": 2.642666666666667, + "grad_norm": 3.757124900817871, + "learning_rate": 9.437722419928827e-05, + "loss": 2.1035, + "step": 5946 + }, + { + "epoch": 2.643111111111111, + "grad_norm": 3.6398861408233643, + "learning_rate": 9.435943060498221e-05, + "loss": 2.1867, + "step": 5947 + }, + { + "epoch": 2.6435555555555554, + "grad_norm": 3.7026424407958984, + "learning_rate": 9.434163701067617e-05, + "loss": 1.5336, + "step": 5948 + }, + { + "epoch": 2.644, + "grad_norm": 3.198451519012451, + "learning_rate": 9.432384341637012e-05, + "loss": 0.8215, + "step": 5949 + }, + { + "epoch": 2.6444444444444444, + "grad_norm": 3.469071865081787, + "learning_rate": 9.430604982206405e-05, + "loss": 0.8383, + "step": 5950 + }, + { + "epoch": 2.6448888888888886, + "grad_norm": 1.5619441270828247, + "learning_rate": 9.428825622775801e-05, + "loss": 2.0473, + "step": 5951 + }, + { + "epoch": 2.6453333333333333, + "grad_norm": 1.3054723739624023, + "learning_rate": 9.427046263345196e-05, + "loss": 1.0846, + "step": 5952 + }, + { + "epoch": 2.645777777777778, + "grad_norm": 1.9514864683151245, + "learning_rate": 9.425266903914591e-05, + "loss": 1.7394, + "step": 5953 + }, + { + "epoch": 2.6462222222222223, + "grad_norm": 2.2087137699127197, + "learning_rate": 9.423487544483986e-05, + "loss": 1.6852, + "step": 5954 + }, + { + "epoch": 2.6466666666666665, + "grad_norm": 2.1699979305267334, + "learning_rate": 9.421708185053382e-05, + "loss": 1.6228, + "step": 5955 + }, + { + "epoch": 2.647111111111111, + "grad_norm": 2.410844564437866, + "learning_rate": 9.419928825622776e-05, + "loss": 2.018, + "step": 5956 + }, + { + "epoch": 2.6475555555555554, + "grad_norm": 2.351001024246216, + "learning_rate": 9.41814946619217e-05, + "loss": 1.5701, + "step": 5957 + }, + { + "epoch": 2.648, + "grad_norm": 2.365635633468628, + "learning_rate": 9.416370106761566e-05, + "loss": 1.8983, + "step": 5958 + }, + { + "epoch": 2.6484444444444444, + "grad_norm": 1.956677794456482, + "learning_rate": 9.414590747330962e-05, + "loss": 1.0431, + "step": 5959 + }, + { + "epoch": 2.648888888888889, + "grad_norm": 2.353614330291748, + "learning_rate": 9.412811387900356e-05, + "loss": 1.9778, + "step": 5960 + }, + { + "epoch": 2.6493333333333333, + "grad_norm": 2.23882794380188, + "learning_rate": 9.411032028469752e-05, + "loss": 1.4755, + "step": 5961 + }, + { + "epoch": 2.6497777777777776, + "grad_norm": 2.3019869327545166, + "learning_rate": 9.409252669039146e-05, + "loss": 1.1742, + "step": 5962 + }, + { + "epoch": 2.6502222222222223, + "grad_norm": 2.3776917457580566, + "learning_rate": 9.40747330960854e-05, + "loss": 1.6219, + "step": 5963 + }, + { + "epoch": 2.6506666666666665, + "grad_norm": 2.3942043781280518, + "learning_rate": 9.405693950177936e-05, + "loss": 1.7453, + "step": 5964 + }, + { + "epoch": 2.651111111111111, + "grad_norm": 2.4723472595214844, + "learning_rate": 9.403914590747332e-05, + "loss": 2.0211, + "step": 5965 + }, + { + "epoch": 
2.6515555555555554, + "grad_norm": 2.4571118354797363, + "learning_rate": 9.402135231316726e-05, + "loss": 1.6631, + "step": 5966 + }, + { + "epoch": 2.652, + "grad_norm": 2.0202927589416504, + "learning_rate": 9.400355871886122e-05, + "loss": 1.291, + "step": 5967 + }, + { + "epoch": 2.6524444444444444, + "grad_norm": 2.624063014984131, + "learning_rate": 9.398576512455517e-05, + "loss": 1.9158, + "step": 5968 + }, + { + "epoch": 2.6528888888888886, + "grad_norm": 2.2154154777526855, + "learning_rate": 9.396797153024912e-05, + "loss": 1.46, + "step": 5969 + }, + { + "epoch": 2.6533333333333333, + "grad_norm": 2.480447769165039, + "learning_rate": 9.395017793594306e-05, + "loss": 2.152, + "step": 5970 + }, + { + "epoch": 2.653777777777778, + "grad_norm": 2.2933542728424072, + "learning_rate": 9.393238434163702e-05, + "loss": 1.5246, + "step": 5971 + }, + { + "epoch": 2.6542222222222223, + "grad_norm": 2.259690523147583, + "learning_rate": 9.391459074733097e-05, + "loss": 1.8234, + "step": 5972 + }, + { + "epoch": 2.6546666666666665, + "grad_norm": 2.219409942626953, + "learning_rate": 9.389679715302492e-05, + "loss": 1.8107, + "step": 5973 + }, + { + "epoch": 2.655111111111111, + "grad_norm": 2.7897937297821045, + "learning_rate": 9.387900355871887e-05, + "loss": 1.7732, + "step": 5974 + }, + { + "epoch": 2.6555555555555554, + "grad_norm": 2.3130133152008057, + "learning_rate": 9.386120996441281e-05, + "loss": 1.5716, + "step": 5975 + }, + { + "epoch": 2.656, + "grad_norm": 2.908433437347412, + "learning_rate": 9.384341637010676e-05, + "loss": 1.9881, + "step": 5976 + }, + { + "epoch": 2.6564444444444444, + "grad_norm": 2.9439632892608643, + "learning_rate": 9.382562277580071e-05, + "loss": 1.7524, + "step": 5977 + }, + { + "epoch": 2.656888888888889, + "grad_norm": 2.63321852684021, + "learning_rate": 9.380782918149467e-05, + "loss": 1.9221, + "step": 5978 + }, + { + "epoch": 2.6573333333333333, + "grad_norm": 2.665898323059082, + "learning_rate": 9.379003558718861e-05, + "loss": 1.9193, + "step": 5979 + }, + { + "epoch": 2.6577777777777776, + "grad_norm": 2.8060519695281982, + "learning_rate": 9.377224199288257e-05, + "loss": 1.7567, + "step": 5980 + }, + { + "epoch": 2.6582222222222223, + "grad_norm": 2.898313045501709, + "learning_rate": 9.375444839857653e-05, + "loss": 1.9132, + "step": 5981 + }, + { + "epoch": 2.6586666666666665, + "grad_norm": 3.1064798831939697, + "learning_rate": 9.373665480427047e-05, + "loss": 1.5257, + "step": 5982 + }, + { + "epoch": 2.659111111111111, + "grad_norm": 3.1904420852661133, + "learning_rate": 9.371886120996441e-05, + "loss": 2.2091, + "step": 5983 + }, + { + "epoch": 2.6595555555555555, + "grad_norm": 2.535499095916748, + "learning_rate": 9.370106761565837e-05, + "loss": 1.5115, + "step": 5984 + }, + { + "epoch": 2.66, + "grad_norm": 2.571568727493286, + "learning_rate": 9.368327402135233e-05, + "loss": 1.7336, + "step": 5985 + }, + { + "epoch": 2.6604444444444444, + "grad_norm": 2.7381021976470947, + "learning_rate": 9.366548042704627e-05, + "loss": 1.7122, + "step": 5986 + }, + { + "epoch": 2.6608888888888886, + "grad_norm": 2.603264331817627, + "learning_rate": 9.364768683274022e-05, + "loss": 1.6372, + "step": 5987 + }, + { + "epoch": 2.6613333333333333, + "grad_norm": 2.825221300125122, + "learning_rate": 9.362989323843417e-05, + "loss": 1.8266, + "step": 5988 + }, + { + "epoch": 2.661777777777778, + "grad_norm": 3.076425313949585, + "learning_rate": 9.361209964412811e-05, + "loss": 1.8103, + "step": 5989 + }, + { + "epoch": 
2.6622222222222223, + "grad_norm": 3.15580677986145, + "learning_rate": 9.359430604982207e-05, + "loss": 1.6719, + "step": 5990 + }, + { + "epoch": 2.6626666666666665, + "grad_norm": 3.181922674179077, + "learning_rate": 9.357651245551602e-05, + "loss": 1.8815, + "step": 5991 + }, + { + "epoch": 2.663111111111111, + "grad_norm": 2.974426746368408, + "learning_rate": 9.355871886120997e-05, + "loss": 1.6728, + "step": 5992 + }, + { + "epoch": 2.6635555555555555, + "grad_norm": 2.7631449699401855, + "learning_rate": 9.354092526690392e-05, + "loss": 1.5703, + "step": 5993 + }, + { + "epoch": 2.664, + "grad_norm": 3.3828341960906982, + "learning_rate": 9.352313167259788e-05, + "loss": 1.6526, + "step": 5994 + }, + { + "epoch": 2.6644444444444444, + "grad_norm": 3.690214157104492, + "learning_rate": 9.350533807829181e-05, + "loss": 2.2105, + "step": 5995 + }, + { + "epoch": 2.664888888888889, + "grad_norm": 4.263453960418701, + "learning_rate": 9.348754448398577e-05, + "loss": 1.8527, + "step": 5996 + }, + { + "epoch": 2.6653333333333333, + "grad_norm": 2.7068159580230713, + "learning_rate": 9.346975088967972e-05, + "loss": 0.7449, + "step": 5997 + }, + { + "epoch": 2.6657777777777776, + "grad_norm": 2.991706609725952, + "learning_rate": 9.345195729537366e-05, + "loss": 1.0355, + "step": 5998 + }, + { + "epoch": 2.6662222222222223, + "grad_norm": 4.530425071716309, + "learning_rate": 9.343416370106762e-05, + "loss": 1.912, + "step": 5999 + }, + { + "epoch": 2.6666666666666665, + "grad_norm": 4.280130386352539, + "learning_rate": 9.341637010676158e-05, + "loss": 1.333, + "step": 6000 + }, + { + "epoch": 2.667111111111111, + "grad_norm": 1.9667905569076538, + "learning_rate": 9.339857651245552e-05, + "loss": 2.5096, + "step": 6001 + }, + { + "epoch": 2.6675555555555555, + "grad_norm": 1.9078987836837769, + "learning_rate": 9.338078291814946e-05, + "loss": 2.1031, + "step": 6002 + }, + { + "epoch": 2.668, + "grad_norm": 2.1769826412200928, + "learning_rate": 9.336298932384342e-05, + "loss": 2.0237, + "step": 6003 + }, + { + "epoch": 2.6684444444444444, + "grad_norm": 0.29694122076034546, + "learning_rate": 9.334519572953738e-05, + "loss": 0.0224, + "step": 6004 + }, + { + "epoch": 2.6688888888888886, + "grad_norm": 1.7639079093933105, + "learning_rate": 9.332740213523132e-05, + "loss": 0.9714, + "step": 6005 + }, + { + "epoch": 2.6693333333333333, + "grad_norm": 2.3218326568603516, + "learning_rate": 9.330960854092528e-05, + "loss": 2.0642, + "step": 6006 + }, + { + "epoch": 2.669777777777778, + "grad_norm": 2.195533514022827, + "learning_rate": 9.329181494661923e-05, + "loss": 1.1775, + "step": 6007 + }, + { + "epoch": 2.6702222222222223, + "grad_norm": 1.75465989112854, + "learning_rate": 9.327402135231316e-05, + "loss": 0.9689, + "step": 6008 + }, + { + "epoch": 2.6706666666666665, + "grad_norm": 2.281090259552002, + "learning_rate": 9.325622775800712e-05, + "loss": 1.977, + "step": 6009 + }, + { + "epoch": 2.671111111111111, + "grad_norm": 1.779270052909851, + "learning_rate": 9.323843416370108e-05, + "loss": 0.882, + "step": 6010 + }, + { + "epoch": 2.6715555555555555, + "grad_norm": 2.8689680099487305, + "learning_rate": 9.322064056939502e-05, + "loss": 2.0544, + "step": 6011 + }, + { + "epoch": 2.672, + "grad_norm": 2.2885069847106934, + "learning_rate": 9.320284697508897e-05, + "loss": 1.6721, + "step": 6012 + }, + { + "epoch": 2.6724444444444444, + "grad_norm": 2.3824915885925293, + "learning_rate": 9.318505338078293e-05, + "loss": 1.7404, + "step": 6013 + }, + { + "epoch": 
2.672888888888889, + "grad_norm": 2.4072515964508057, + "learning_rate": 9.316725978647687e-05, + "loss": 1.7259, + "step": 6014 + }, + { + "epoch": 2.6733333333333333, + "grad_norm": 2.6987950801849365, + "learning_rate": 9.314946619217082e-05, + "loss": 1.9788, + "step": 6015 + }, + { + "epoch": 2.6737777777777776, + "grad_norm": 2.521949052810669, + "learning_rate": 9.313167259786477e-05, + "loss": 1.4704, + "step": 6016 + }, + { + "epoch": 2.6742222222222223, + "grad_norm": 3.2929980754852295, + "learning_rate": 9.311387900355873e-05, + "loss": 2.1755, + "step": 6017 + }, + { + "epoch": 2.6746666666666665, + "grad_norm": 1.4418671131134033, + "learning_rate": 9.309608540925267e-05, + "loss": 0.817, + "step": 6018 + }, + { + "epoch": 2.675111111111111, + "grad_norm": 1.5588750839233398, + "learning_rate": 9.307829181494663e-05, + "loss": 1.0768, + "step": 6019 + }, + { + "epoch": 2.6755555555555555, + "grad_norm": 2.279388189315796, + "learning_rate": 9.306049822064057e-05, + "loss": 1.2211, + "step": 6020 + }, + { + "epoch": 2.676, + "grad_norm": 2.7521657943725586, + "learning_rate": 9.304270462633452e-05, + "loss": 1.7047, + "step": 6021 + }, + { + "epoch": 2.6764444444444444, + "grad_norm": 2.4141414165496826, + "learning_rate": 9.302491103202847e-05, + "loss": 1.6488, + "step": 6022 + }, + { + "epoch": 2.6768888888888887, + "grad_norm": 2.966360330581665, + "learning_rate": 9.300711743772243e-05, + "loss": 1.8208, + "step": 6023 + }, + { + "epoch": 2.6773333333333333, + "grad_norm": 2.5755410194396973, + "learning_rate": 9.298932384341637e-05, + "loss": 2.2004, + "step": 6024 + }, + { + "epoch": 2.677777777777778, + "grad_norm": 2.3477094173431396, + "learning_rate": 9.297153024911033e-05, + "loss": 1.9106, + "step": 6025 + }, + { + "epoch": 2.6782222222222223, + "grad_norm": 3.050549030303955, + "learning_rate": 9.295373665480428e-05, + "loss": 2.0783, + "step": 6026 + }, + { + "epoch": 2.6786666666666665, + "grad_norm": 2.4758310317993164, + "learning_rate": 9.293594306049823e-05, + "loss": 1.7121, + "step": 6027 + }, + { + "epoch": 2.679111111111111, + "grad_norm": 2.427130699157715, + "learning_rate": 9.291814946619217e-05, + "loss": 1.9215, + "step": 6028 + }, + { + "epoch": 2.6795555555555555, + "grad_norm": 2.3932747840881348, + "learning_rate": 9.290035587188613e-05, + "loss": 1.7158, + "step": 6029 + }, + { + "epoch": 2.68, + "grad_norm": 2.1773719787597656, + "learning_rate": 9.288256227758008e-05, + "loss": 1.6804, + "step": 6030 + }, + { + "epoch": 2.6804444444444444, + "grad_norm": 2.6314051151275635, + "learning_rate": 9.286476868327403e-05, + "loss": 1.5102, + "step": 6031 + }, + { + "epoch": 2.680888888888889, + "grad_norm": 2.660897970199585, + "learning_rate": 9.284697508896798e-05, + "loss": 1.9421, + "step": 6032 + }, + { + "epoch": 2.6813333333333333, + "grad_norm": 3.031437873840332, + "learning_rate": 9.282918149466193e-05, + "loss": 1.9283, + "step": 6033 + }, + { + "epoch": 2.6817777777777776, + "grad_norm": 2.821213722229004, + "learning_rate": 9.281138790035587e-05, + "loss": 1.7759, + "step": 6034 + }, + { + "epoch": 2.6822222222222223, + "grad_norm": 2.7068347930908203, + "learning_rate": 9.279359430604982e-05, + "loss": 1.7589, + "step": 6035 + }, + { + "epoch": 2.6826666666666665, + "grad_norm": 2.7685983180999756, + "learning_rate": 9.277580071174378e-05, + "loss": 2.0632, + "step": 6036 + }, + { + "epoch": 2.6831111111111112, + "grad_norm": 2.5510952472686768, + "learning_rate": 9.275800711743772e-05, + "loss": 1.6365, + "step": 6037 + }, + { + 
"epoch": 2.6835555555555555, + "grad_norm": 2.7311320304870605, + "learning_rate": 9.274021352313168e-05, + "loss": 1.5591, + "step": 6038 + }, + { + "epoch": 2.684, + "grad_norm": 3.251199722290039, + "learning_rate": 9.272241992882564e-05, + "loss": 2.0724, + "step": 6039 + }, + { + "epoch": 2.6844444444444444, + "grad_norm": 2.5799219608306885, + "learning_rate": 9.270462633451957e-05, + "loss": 1.643, + "step": 6040 + }, + { + "epoch": 2.6848888888888887, + "grad_norm": 3.4391684532165527, + "learning_rate": 9.268683274021352e-05, + "loss": 1.8439, + "step": 6041 + }, + { + "epoch": 2.6853333333333333, + "grad_norm": 3.380760908126831, + "learning_rate": 9.266903914590748e-05, + "loss": 2.3897, + "step": 6042 + }, + { + "epoch": 2.685777777777778, + "grad_norm": 2.8219289779663086, + "learning_rate": 9.265124555160142e-05, + "loss": 1.5193, + "step": 6043 + }, + { + "epoch": 2.6862222222222223, + "grad_norm": 2.5288350582122803, + "learning_rate": 9.263345195729538e-05, + "loss": 1.3944, + "step": 6044 + }, + { + "epoch": 2.6866666666666665, + "grad_norm": 3.494917154312134, + "learning_rate": 9.261565836298934e-05, + "loss": 1.5498, + "step": 6045 + }, + { + "epoch": 2.6871111111111112, + "grad_norm": 3.129040241241455, + "learning_rate": 9.259786476868328e-05, + "loss": 1.6695, + "step": 6046 + }, + { + "epoch": 2.6875555555555555, + "grad_norm": 3.235013484954834, + "learning_rate": 9.258007117437722e-05, + "loss": 1.9297, + "step": 6047 + }, + { + "epoch": 2.6879999999999997, + "grad_norm": 3.288318634033203, + "learning_rate": 9.256227758007118e-05, + "loss": 1.7766, + "step": 6048 + }, + { + "epoch": 2.6884444444444444, + "grad_norm": 3.6393327713012695, + "learning_rate": 9.254448398576513e-05, + "loss": 2.0743, + "step": 6049 + }, + { + "epoch": 2.688888888888889, + "grad_norm": 3.2492709159851074, + "learning_rate": 9.252669039145908e-05, + "loss": 0.7984, + "step": 6050 + }, + { + "epoch": 2.6893333333333334, + "grad_norm": 1.417654037475586, + "learning_rate": 9.250889679715303e-05, + "loss": 1.1905, + "step": 6051 + }, + { + "epoch": 2.6897777777777776, + "grad_norm": 2.2296252250671387, + "learning_rate": 9.249110320284699e-05, + "loss": 1.9266, + "step": 6052 + }, + { + "epoch": 2.6902222222222223, + "grad_norm": 2.10345721244812, + "learning_rate": 9.247330960854092e-05, + "loss": 2.199, + "step": 6053 + }, + { + "epoch": 2.6906666666666665, + "grad_norm": 2.360987901687622, + "learning_rate": 9.245551601423488e-05, + "loss": 2.2995, + "step": 6054 + }, + { + "epoch": 2.6911111111111112, + "grad_norm": 2.832897186279297, + "learning_rate": 9.243772241992883e-05, + "loss": 1.8118, + "step": 6055 + }, + { + "epoch": 2.6915555555555555, + "grad_norm": 2.3768863677978516, + "learning_rate": 9.241992882562278e-05, + "loss": 2.1399, + "step": 6056 + }, + { + "epoch": 2.692, + "grad_norm": 2.3208935260772705, + "learning_rate": 9.240213523131673e-05, + "loss": 2.2196, + "step": 6057 + }, + { + "epoch": 2.6924444444444444, + "grad_norm": 2.1701810359954834, + "learning_rate": 9.238434163701069e-05, + "loss": 2.0644, + "step": 6058 + }, + { + "epoch": 2.6928888888888887, + "grad_norm": 2.7813336849212646, + "learning_rate": 9.236654804270463e-05, + "loss": 2.1576, + "step": 6059 + }, + { + "epoch": 2.6933333333333334, + "grad_norm": 2.3689005374908447, + "learning_rate": 9.234875444839857e-05, + "loss": 2.148, + "step": 6060 + }, + { + "epoch": 2.693777777777778, + "grad_norm": 2.3087990283966064, + "learning_rate": 9.233096085409253e-05, + "loss": 2.139, + "step": 6061 + }, + { 
+ "epoch": 2.6942222222222223, + "grad_norm": 2.087191581726074, + "learning_rate": 9.231316725978649e-05, + "loss": 1.7637, + "step": 6062 + }, + { + "epoch": 2.6946666666666665, + "grad_norm": 2.2079708576202393, + "learning_rate": 9.229537366548043e-05, + "loss": 2.0657, + "step": 6063 + }, + { + "epoch": 2.6951111111111112, + "grad_norm": 2.13736629486084, + "learning_rate": 9.227758007117439e-05, + "loss": 2.0342, + "step": 6064 + }, + { + "epoch": 2.6955555555555555, + "grad_norm": 2.382661819458008, + "learning_rate": 9.225978647686834e-05, + "loss": 1.5989, + "step": 6065 + }, + { + "epoch": 2.6959999999999997, + "grad_norm": 2.3359639644622803, + "learning_rate": 9.224199288256227e-05, + "loss": 2.0982, + "step": 6066 + }, + { + "epoch": 2.6964444444444444, + "grad_norm": 2.5619516372680664, + "learning_rate": 9.222419928825623e-05, + "loss": 1.9643, + "step": 6067 + }, + { + "epoch": 2.696888888888889, + "grad_norm": 2.3712518215179443, + "learning_rate": 9.220640569395019e-05, + "loss": 1.9981, + "step": 6068 + }, + { + "epoch": 2.6973333333333334, + "grad_norm": 2.2327232360839844, + "learning_rate": 9.218861209964413e-05, + "loss": 1.081, + "step": 6069 + }, + { + "epoch": 2.6977777777777776, + "grad_norm": 2.4085304737091064, + "learning_rate": 9.217081850533809e-05, + "loss": 1.8645, + "step": 6070 + }, + { + "epoch": 2.6982222222222223, + "grad_norm": 2.5223472118377686, + "learning_rate": 9.215302491103204e-05, + "loss": 1.9147, + "step": 6071 + }, + { + "epoch": 2.6986666666666665, + "grad_norm": 2.6552350521087646, + "learning_rate": 9.213523131672598e-05, + "loss": 2.2025, + "step": 6072 + }, + { + "epoch": 2.6991111111111112, + "grad_norm": 2.412116289138794, + "learning_rate": 9.211743772241993e-05, + "loss": 1.7127, + "step": 6073 + }, + { + "epoch": 2.6995555555555555, + "grad_norm": 2.261444091796875, + "learning_rate": 9.209964412811388e-05, + "loss": 1.4406, + "step": 6074 + }, + { + "epoch": 2.7, + "grad_norm": 2.534367561340332, + "learning_rate": 9.208185053380784e-05, + "loss": 1.7541, + "step": 6075 + }, + { + "epoch": 2.7004444444444444, + "grad_norm": 2.575896978378296, + "learning_rate": 9.206405693950178e-05, + "loss": 1.9618, + "step": 6076 + }, + { + "epoch": 2.7008888888888887, + "grad_norm": 2.7344939708709717, + "learning_rate": 9.204626334519574e-05, + "loss": 1.9426, + "step": 6077 + }, + { + "epoch": 2.7013333333333334, + "grad_norm": 2.7714240550994873, + "learning_rate": 9.202846975088968e-05, + "loss": 1.8616, + "step": 6078 + }, + { + "epoch": 2.7017777777777776, + "grad_norm": 2.9522223472595215, + "learning_rate": 9.201067615658363e-05, + "loss": 2.3087, + "step": 6079 + }, + { + "epoch": 2.7022222222222223, + "grad_norm": 2.702958345413208, + "learning_rate": 9.199288256227758e-05, + "loss": 2.236, + "step": 6080 + }, + { + "epoch": 2.7026666666666666, + "grad_norm": 1.8355095386505127, + "learning_rate": 9.197508896797154e-05, + "loss": 0.6777, + "step": 6081 + }, + { + "epoch": 2.7031111111111112, + "grad_norm": 2.5094728469848633, + "learning_rate": 9.195729537366548e-05, + "loss": 1.8198, + "step": 6082 + }, + { + "epoch": 2.7035555555555555, + "grad_norm": 2.6528875827789307, + "learning_rate": 9.193950177935944e-05, + "loss": 1.7715, + "step": 6083 + }, + { + "epoch": 2.7039999999999997, + "grad_norm": 2.9194884300231934, + "learning_rate": 9.19217081850534e-05, + "loss": 1.7147, + "step": 6084 + }, + { + "epoch": 2.7044444444444444, + "grad_norm": 2.9441230297088623, + "learning_rate": 9.190391459074732e-05, + "loss": 1.854, + 
"step": 6085 + }, + { + "epoch": 2.704888888888889, + "grad_norm": 2.6675310134887695, + "learning_rate": 9.188612099644128e-05, + "loss": 2.0047, + "step": 6086 + }, + { + "epoch": 2.7053333333333334, + "grad_norm": 2.501265048980713, + "learning_rate": 9.186832740213524e-05, + "loss": 1.5176, + "step": 6087 + }, + { + "epoch": 2.7057777777777776, + "grad_norm": 2.112760066986084, + "learning_rate": 9.185053380782918e-05, + "loss": 1.0436, + "step": 6088 + }, + { + "epoch": 2.7062222222222223, + "grad_norm": 2.7593295574188232, + "learning_rate": 9.183274021352314e-05, + "loss": 1.6187, + "step": 6089 + }, + { + "epoch": 2.7066666666666666, + "grad_norm": 2.7610411643981934, + "learning_rate": 9.18149466192171e-05, + "loss": 1.7194, + "step": 6090 + }, + { + "epoch": 2.7071111111111112, + "grad_norm": 2.309563398361206, + "learning_rate": 9.179715302491104e-05, + "loss": 1.2155, + "step": 6091 + }, + { + "epoch": 2.7075555555555555, + "grad_norm": 3.307159900665283, + "learning_rate": 9.177935943060498e-05, + "loss": 1.9251, + "step": 6092 + }, + { + "epoch": 2.708, + "grad_norm": 2.4298038482666016, + "learning_rate": 9.176156583629894e-05, + "loss": 1.2146, + "step": 6093 + }, + { + "epoch": 2.7084444444444444, + "grad_norm": 2.78938627243042, + "learning_rate": 9.174377224199289e-05, + "loss": 1.7818, + "step": 6094 + }, + { + "epoch": 2.7088888888888887, + "grad_norm": 2.8825080394744873, + "learning_rate": 9.172597864768684e-05, + "loss": 1.6426, + "step": 6095 + }, + { + "epoch": 2.7093333333333334, + "grad_norm": 3.2043745517730713, + "learning_rate": 9.170818505338079e-05, + "loss": 1.5032, + "step": 6096 + }, + { + "epoch": 2.7097777777777776, + "grad_norm": 3.1680638790130615, + "learning_rate": 9.169039145907475e-05, + "loss": 1.3996, + "step": 6097 + }, + { + "epoch": 2.7102222222222223, + "grad_norm": 3.6439359188079834, + "learning_rate": 9.167259786476868e-05, + "loss": 2.1672, + "step": 6098 + }, + { + "epoch": 2.7106666666666666, + "grad_norm": 4.421741485595703, + "learning_rate": 9.165480427046263e-05, + "loss": 2.4516, + "step": 6099 + }, + { + "epoch": 2.7111111111111112, + "grad_norm": 3.2291154861450195, + "learning_rate": 9.163701067615659e-05, + "loss": 1.6698, + "step": 6100 + }, + { + "epoch": 2.7115555555555555, + "grad_norm": 1.8090177774429321, + "learning_rate": 9.161921708185053e-05, + "loss": 2.1445, + "step": 6101 + }, + { + "epoch": 2.7119999999999997, + "grad_norm": 1.2429306507110596, + "learning_rate": 9.160142348754449e-05, + "loss": 1.2912, + "step": 6102 + }, + { + "epoch": 2.7124444444444444, + "grad_norm": 2.093599319458008, + "learning_rate": 9.158362989323845e-05, + "loss": 2.4831, + "step": 6103 + }, + { + "epoch": 2.712888888888889, + "grad_norm": 1.8647911548614502, + "learning_rate": 9.156583629893239e-05, + "loss": 0.9684, + "step": 6104 + }, + { + "epoch": 2.7133333333333334, + "grad_norm": 2.1396780014038086, + "learning_rate": 9.154804270462633e-05, + "loss": 2.1317, + "step": 6105 + }, + { + "epoch": 2.7137777777777776, + "grad_norm": 2.1516218185424805, + "learning_rate": 9.153024911032029e-05, + "loss": 1.7869, + "step": 6106 + }, + { + "epoch": 2.7142222222222223, + "grad_norm": 2.357754945755005, + "learning_rate": 9.151245551601425e-05, + "loss": 2.3442, + "step": 6107 + }, + { + "epoch": 2.7146666666666666, + "grad_norm": 2.1616320610046387, + "learning_rate": 9.149466192170819e-05, + "loss": 2.2035, + "step": 6108 + }, + { + "epoch": 2.7151111111111113, + "grad_norm": 2.2545979022979736, + "learning_rate": 9.147686832740214e-05, 
+ "loss": 2.3456, + "step": 6109 + }, + { + "epoch": 2.7155555555555555, + "grad_norm": 1.8658015727996826, + "learning_rate": 9.14590747330961e-05, + "loss": 1.5012, + "step": 6110 + }, + { + "epoch": 2.716, + "grad_norm": 2.5703420639038086, + "learning_rate": 9.144128113879003e-05, + "loss": 1.8163, + "step": 6111 + }, + { + "epoch": 2.7164444444444444, + "grad_norm": 2.278449296951294, + "learning_rate": 9.142348754448399e-05, + "loss": 1.7411, + "step": 6112 + }, + { + "epoch": 2.7168888888888887, + "grad_norm": 2.5653324127197266, + "learning_rate": 9.140569395017794e-05, + "loss": 1.8087, + "step": 6113 + }, + { + "epoch": 2.7173333333333334, + "grad_norm": 2.3541901111602783, + "learning_rate": 9.138790035587189e-05, + "loss": 2.0437, + "step": 6114 + }, + { + "epoch": 2.7177777777777776, + "grad_norm": 2.4183175563812256, + "learning_rate": 9.137010676156584e-05, + "loss": 1.6396, + "step": 6115 + }, + { + "epoch": 2.7182222222222223, + "grad_norm": 2.6298437118530273, + "learning_rate": 9.13523131672598e-05, + "loss": 2.3171, + "step": 6116 + }, + { + "epoch": 2.7186666666666666, + "grad_norm": 2.7562639713287354, + "learning_rate": 9.133451957295374e-05, + "loss": 2.0566, + "step": 6117 + }, + { + "epoch": 2.7191111111111113, + "grad_norm": 2.4366018772125244, + "learning_rate": 9.131672597864769e-05, + "loss": 1.9631, + "step": 6118 + }, + { + "epoch": 2.7195555555555555, + "grad_norm": 2.1214277744293213, + "learning_rate": 9.129893238434164e-05, + "loss": 1.3662, + "step": 6119 + }, + { + "epoch": 2.7199999999999998, + "grad_norm": 2.162672996520996, + "learning_rate": 9.12811387900356e-05, + "loss": 1.3789, + "step": 6120 + }, + { + "epoch": 2.7204444444444444, + "grad_norm": 2.6844053268432617, + "learning_rate": 9.126334519572954e-05, + "loss": 1.8149, + "step": 6121 + }, + { + "epoch": 2.720888888888889, + "grad_norm": 2.491919994354248, + "learning_rate": 9.12455516014235e-05, + "loss": 1.8213, + "step": 6122 + }, + { + "epoch": 2.7213333333333334, + "grad_norm": 2.4091458320617676, + "learning_rate": 9.122775800711744e-05, + "loss": 1.8815, + "step": 6123 + }, + { + "epoch": 2.7217777777777776, + "grad_norm": 2.2828023433685303, + "learning_rate": 9.120996441281138e-05, + "loss": 1.4552, + "step": 6124 + }, + { + "epoch": 2.7222222222222223, + "grad_norm": 2.2405014038085938, + "learning_rate": 9.119217081850534e-05, + "loss": 1.3707, + "step": 6125 + }, + { + "epoch": 2.7226666666666666, + "grad_norm": 2.6642508506774902, + "learning_rate": 9.11743772241993e-05, + "loss": 1.7702, + "step": 6126 + }, + { + "epoch": 2.7231111111111113, + "grad_norm": 2.5781748294830322, + "learning_rate": 9.115658362989324e-05, + "loss": 1.7319, + "step": 6127 + }, + { + "epoch": 2.7235555555555555, + "grad_norm": 2.8042945861816406, + "learning_rate": 9.11387900355872e-05, + "loss": 1.8082, + "step": 6128 + }, + { + "epoch": 2.724, + "grad_norm": 1.7119746208190918, + "learning_rate": 9.112099644128115e-05, + "loss": 0.7126, + "step": 6129 + }, + { + "epoch": 2.7244444444444444, + "grad_norm": 2.4615836143493652, + "learning_rate": 9.110320284697508e-05, + "loss": 1.6634, + "step": 6130 + }, + { + "epoch": 2.7248888888888887, + "grad_norm": 2.9589643478393555, + "learning_rate": 9.108540925266904e-05, + "loss": 1.829, + "step": 6131 + }, + { + "epoch": 2.7253333333333334, + "grad_norm": 2.783677577972412, + "learning_rate": 9.1067615658363e-05, + "loss": 1.98, + "step": 6132 + }, + { + "epoch": 2.7257777777777776, + "grad_norm": 3.0595808029174805, + "learning_rate": 
9.104982206405694e-05, + "loss": 1.7831, + "step": 6133 + }, + { + "epoch": 2.7262222222222223, + "grad_norm": 2.033073902130127, + "learning_rate": 9.10320284697509e-05, + "loss": 0.923, + "step": 6134 + }, + { + "epoch": 2.7266666666666666, + "grad_norm": 3.1585965156555176, + "learning_rate": 9.101423487544485e-05, + "loss": 1.735, + "step": 6135 + }, + { + "epoch": 2.7271111111111113, + "grad_norm": 2.5209286212921143, + "learning_rate": 9.09964412811388e-05, + "loss": 1.5229, + "step": 6136 + }, + { + "epoch": 2.7275555555555555, + "grad_norm": 2.977914810180664, + "learning_rate": 9.097864768683274e-05, + "loss": 2.0039, + "step": 6137 + }, + { + "epoch": 2.7279999999999998, + "grad_norm": 2.497366189956665, + "learning_rate": 9.09608540925267e-05, + "loss": 1.4511, + "step": 6138 + }, + { + "epoch": 2.7284444444444444, + "grad_norm": 2.936041831970215, + "learning_rate": 9.094306049822065e-05, + "loss": 1.9052, + "step": 6139 + }, + { + "epoch": 2.728888888888889, + "grad_norm": 3.006460428237915, + "learning_rate": 9.092526690391459e-05, + "loss": 1.758, + "step": 6140 + }, + { + "epoch": 2.7293333333333334, + "grad_norm": 3.10183048248291, + "learning_rate": 9.090747330960855e-05, + "loss": 1.6804, + "step": 6141 + }, + { + "epoch": 2.7297777777777776, + "grad_norm": 3.5119240283966064, + "learning_rate": 9.08896797153025e-05, + "loss": 1.763, + "step": 6142 + }, + { + "epoch": 2.7302222222222223, + "grad_norm": 3.4593982696533203, + "learning_rate": 9.087188612099644e-05, + "loss": 2.3246, + "step": 6143 + }, + { + "epoch": 2.7306666666666666, + "grad_norm": 2.949693202972412, + "learning_rate": 9.085409252669039e-05, + "loss": 1.4612, + "step": 6144 + }, + { + "epoch": 2.7311111111111113, + "grad_norm": 3.316683769226074, + "learning_rate": 9.083629893238435e-05, + "loss": 2.0814, + "step": 6145 + }, + { + "epoch": 2.7315555555555555, + "grad_norm": 2.9927797317504883, + "learning_rate": 9.081850533807829e-05, + "loss": 1.5698, + "step": 6146 + }, + { + "epoch": 2.732, + "grad_norm": 2.82143497467041, + "learning_rate": 9.080071174377225e-05, + "loss": 1.6885, + "step": 6147 + }, + { + "epoch": 2.7324444444444445, + "grad_norm": 3.127837657928467, + "learning_rate": 9.07829181494662e-05, + "loss": 1.7723, + "step": 6148 + }, + { + "epoch": 2.7328888888888887, + "grad_norm": 3.9703283309936523, + "learning_rate": 9.076512455516015e-05, + "loss": 2.0333, + "step": 6149 + }, + { + "epoch": 2.7333333333333334, + "grad_norm": 2.2306530475616455, + "learning_rate": 9.074733096085409e-05, + "loss": 0.5828, + "step": 6150 + }, + { + "epoch": 2.7337777777777776, + "grad_norm": 2.2018139362335205, + "learning_rate": 9.072953736654805e-05, + "loss": 2.6287, + "step": 6151 + }, + { + "epoch": 2.7342222222222223, + "grad_norm": 1.7515716552734375, + "learning_rate": 9.0711743772242e-05, + "loss": 1.8294, + "step": 6152 + }, + { + "epoch": 2.7346666666666666, + "grad_norm": 1.612664818763733, + "learning_rate": 9.069395017793595e-05, + "loss": 1.3168, + "step": 6153 + }, + { + "epoch": 2.7351111111111113, + "grad_norm": 2.2013888359069824, + "learning_rate": 9.06761565836299e-05, + "loss": 2.3534, + "step": 6154 + }, + { + "epoch": 2.7355555555555555, + "grad_norm": 2.403697967529297, + "learning_rate": 9.065836298932386e-05, + "loss": 2.0122, + "step": 6155 + }, + { + "epoch": 2.7359999999999998, + "grad_norm": 2.392401695251465, + "learning_rate": 9.064056939501779e-05, + "loss": 2.1935, + "step": 6156 + }, + { + "epoch": 2.7364444444444445, + "grad_norm": 2.547696828842163, + 
"learning_rate": 9.062277580071175e-05, + "loss": 2.1032, + "step": 6157 + }, + { + "epoch": 2.736888888888889, + "grad_norm": 2.0454320907592773, + "learning_rate": 9.06049822064057e-05, + "loss": 1.6723, + "step": 6158 + }, + { + "epoch": 2.7373333333333334, + "grad_norm": 2.0025997161865234, + "learning_rate": 9.058718861209964e-05, + "loss": 1.5211, + "step": 6159 + }, + { + "epoch": 2.7377777777777776, + "grad_norm": 2.7177071571350098, + "learning_rate": 9.05693950177936e-05, + "loss": 2.2681, + "step": 6160 + }, + { + "epoch": 2.7382222222222223, + "grad_norm": 2.377166986465454, + "learning_rate": 9.055160142348756e-05, + "loss": 1.6563, + "step": 6161 + }, + { + "epoch": 2.7386666666666666, + "grad_norm": 2.444869041442871, + "learning_rate": 9.05338078291815e-05, + "loss": 1.7362, + "step": 6162 + }, + { + "epoch": 2.7391111111111113, + "grad_norm": 2.2301559448242188, + "learning_rate": 9.051601423487544e-05, + "loss": 2.0245, + "step": 6163 + }, + { + "epoch": 2.7395555555555555, + "grad_norm": 2.3340377807617188, + "learning_rate": 9.04982206405694e-05, + "loss": 1.6158, + "step": 6164 + }, + { + "epoch": 2.74, + "grad_norm": 2.1984641551971436, + "learning_rate": 9.048042704626336e-05, + "loss": 1.6976, + "step": 6165 + }, + { + "epoch": 2.7404444444444445, + "grad_norm": 2.684303045272827, + "learning_rate": 9.04626334519573e-05, + "loss": 1.9596, + "step": 6166 + }, + { + "epoch": 2.7408888888888887, + "grad_norm": 2.037320852279663, + "learning_rate": 9.044483985765126e-05, + "loss": 1.5683, + "step": 6167 + }, + { + "epoch": 2.7413333333333334, + "grad_norm": 1.9725054502487183, + "learning_rate": 9.04270462633452e-05, + "loss": 1.5862, + "step": 6168 + }, + { + "epoch": 2.7417777777777776, + "grad_norm": 2.6273114681243896, + "learning_rate": 9.040925266903914e-05, + "loss": 2.0218, + "step": 6169 + }, + { + "epoch": 2.7422222222222223, + "grad_norm": 2.4630777835845947, + "learning_rate": 9.03914590747331e-05, + "loss": 1.7888, + "step": 6170 + }, + { + "epoch": 2.7426666666666666, + "grad_norm": 2.1881697177886963, + "learning_rate": 9.037366548042705e-05, + "loss": 1.811, + "step": 6171 + }, + { + "epoch": 2.7431111111111113, + "grad_norm": 2.804276943206787, + "learning_rate": 9.0355871886121e-05, + "loss": 2.1978, + "step": 6172 + }, + { + "epoch": 2.7435555555555555, + "grad_norm": 2.1326301097869873, + "learning_rate": 9.033807829181495e-05, + "loss": 1.7793, + "step": 6173 + }, + { + "epoch": 2.7439999999999998, + "grad_norm": 2.142714023590088, + "learning_rate": 9.032028469750891e-05, + "loss": 1.824, + "step": 6174 + }, + { + "epoch": 2.7444444444444445, + "grad_norm": 2.4953362941741943, + "learning_rate": 9.030249110320284e-05, + "loss": 1.922, + "step": 6175 + }, + { + "epoch": 2.744888888888889, + "grad_norm": 2.264841318130493, + "learning_rate": 9.02846975088968e-05, + "loss": 1.7474, + "step": 6176 + }, + { + "epoch": 2.7453333333333334, + "grad_norm": 2.464749813079834, + "learning_rate": 9.026690391459075e-05, + "loss": 1.7516, + "step": 6177 + }, + { + "epoch": 2.7457777777777777, + "grad_norm": 2.3408308029174805, + "learning_rate": 9.02491103202847e-05, + "loss": 1.6946, + "step": 6178 + }, + { + "epoch": 2.7462222222222223, + "grad_norm": 2.4677340984344482, + "learning_rate": 9.023131672597865e-05, + "loss": 1.6011, + "step": 6179 + }, + { + "epoch": 2.7466666666666666, + "grad_norm": 2.2355053424835205, + "learning_rate": 9.021352313167261e-05, + "loss": 1.5883, + "step": 6180 + }, + { + "epoch": 2.747111111111111, + "grad_norm": 
0.23143476247787476, + "learning_rate": 9.019572953736655e-05, + "loss": 0.0327, + "step": 6181 + }, + { + "epoch": 2.7475555555555555, + "grad_norm": 2.9940497875213623, + "learning_rate": 9.01779359430605e-05, + "loss": 1.9261, + "step": 6182 + }, + { + "epoch": 2.748, + "grad_norm": 2.4658970832824707, + "learning_rate": 9.016014234875445e-05, + "loss": 1.8213, + "step": 6183 + }, + { + "epoch": 2.7484444444444445, + "grad_norm": 3.212437152862549, + "learning_rate": 9.014234875444841e-05, + "loss": 1.511, + "step": 6184 + }, + { + "epoch": 2.7488888888888887, + "grad_norm": 2.720136880874634, + "learning_rate": 9.012455516014235e-05, + "loss": 1.5774, + "step": 6185 + }, + { + "epoch": 2.7493333333333334, + "grad_norm": 2.8537580966949463, + "learning_rate": 9.010676156583631e-05, + "loss": 1.8448, + "step": 6186 + }, + { + "epoch": 2.7497777777777777, + "grad_norm": 2.702603578567505, + "learning_rate": 9.008896797153026e-05, + "loss": 1.3248, + "step": 6187 + }, + { + "epoch": 2.7502222222222223, + "grad_norm": 2.6414220333099365, + "learning_rate": 9.00711743772242e-05, + "loss": 1.7348, + "step": 6188 + }, + { + "epoch": 2.7506666666666666, + "grad_norm": 2.9192988872528076, + "learning_rate": 9.005338078291815e-05, + "loss": 1.4816, + "step": 6189 + }, + { + "epoch": 2.7511111111111113, + "grad_norm": 2.888636589050293, + "learning_rate": 9.00355871886121e-05, + "loss": 1.5585, + "step": 6190 + }, + { + "epoch": 2.7515555555555555, + "grad_norm": 3.021221399307251, + "learning_rate": 9.001779359430605e-05, + "loss": 1.8431, + "step": 6191 + }, + { + "epoch": 2.752, + "grad_norm": 2.7811830043792725, + "learning_rate": 9e-05, + "loss": 1.7267, + "step": 6192 + }, + { + "epoch": 2.7524444444444445, + "grad_norm": 3.7655017375946045, + "learning_rate": 8.998220640569396e-05, + "loss": 1.9514, + "step": 6193 + }, + { + "epoch": 2.752888888888889, + "grad_norm": 2.969604015350342, + "learning_rate": 8.99644128113879e-05, + "loss": 1.8387, + "step": 6194 + }, + { + "epoch": 2.7533333333333334, + "grad_norm": 3.2840607166290283, + "learning_rate": 8.994661921708185e-05, + "loss": 1.4239, + "step": 6195 + }, + { + "epoch": 2.7537777777777777, + "grad_norm": 2.914557695388794, + "learning_rate": 8.99288256227758e-05, + "loss": 1.7825, + "step": 6196 + }, + { + "epoch": 2.7542222222222223, + "grad_norm": 3.4805359840393066, + "learning_rate": 8.991103202846976e-05, + "loss": 1.8883, + "step": 6197 + }, + { + "epoch": 2.7546666666666666, + "grad_norm": 3.3305397033691406, + "learning_rate": 8.98932384341637e-05, + "loss": 1.3508, + "step": 6198 + }, + { + "epoch": 2.755111111111111, + "grad_norm": 3.2841057777404785, + "learning_rate": 8.987544483985766e-05, + "loss": 1.4563, + "step": 6199 + }, + { + "epoch": 2.7555555555555555, + "grad_norm": 2.9176838397979736, + "learning_rate": 8.985765124555162e-05, + "loss": 1.0079, + "step": 6200 + }, + { + "epoch": 2.7560000000000002, + "grad_norm": 1.1757615804672241, + "learning_rate": 8.983985765124555e-05, + "loss": 1.0498, + "step": 6201 + }, + { + "epoch": 2.7564444444444445, + "grad_norm": 1.7046676874160767, + "learning_rate": 8.98220640569395e-05, + "loss": 1.9004, + "step": 6202 + }, + { + "epoch": 2.7568888888888887, + "grad_norm": 1.9083878993988037, + "learning_rate": 8.980427046263346e-05, + "loss": 2.1245, + "step": 6203 + }, + { + "epoch": 2.7573333333333334, + "grad_norm": 2.188979148864746, + "learning_rate": 8.97864768683274e-05, + "loss": 2.0422, + "step": 6204 + }, + { + "epoch": 2.7577777777777777, + "grad_norm": 
2.305520534515381, + "learning_rate": 8.976868327402136e-05, + "loss": 1.8755, + "step": 6205 + }, + { + "epoch": 2.7582222222222224, + "grad_norm": 2.1911003589630127, + "learning_rate": 8.975088967971532e-05, + "loss": 1.9326, + "step": 6206 + }, + { + "epoch": 2.7586666666666666, + "grad_norm": 1.9062891006469727, + "learning_rate": 8.973309608540926e-05, + "loss": 1.7775, + "step": 6207 + }, + { + "epoch": 2.7591111111111113, + "grad_norm": 2.4695067405700684, + "learning_rate": 8.97153024911032e-05, + "loss": 2.0425, + "step": 6208 + }, + { + "epoch": 2.7595555555555555, + "grad_norm": 2.0994985103607178, + "learning_rate": 8.969750889679716e-05, + "loss": 1.6857, + "step": 6209 + }, + { + "epoch": 2.76, + "grad_norm": 2.3211324214935303, + "learning_rate": 8.96797153024911e-05, + "loss": 2.2616, + "step": 6210 + }, + { + "epoch": 2.7604444444444445, + "grad_norm": 2.4278671741485596, + "learning_rate": 8.966192170818506e-05, + "loss": 1.8835, + "step": 6211 + }, + { + "epoch": 2.7608888888888887, + "grad_norm": 2.776087999343872, + "learning_rate": 8.964412811387901e-05, + "loss": 1.4898, + "step": 6212 + }, + { + "epoch": 2.7613333333333334, + "grad_norm": 2.4669578075408936, + "learning_rate": 8.962633451957296e-05, + "loss": 1.993, + "step": 6213 + }, + { + "epoch": 2.7617777777777777, + "grad_norm": 2.3764331340789795, + "learning_rate": 8.96085409252669e-05, + "loss": 1.6464, + "step": 6214 + }, + { + "epoch": 2.7622222222222224, + "grad_norm": 2.225770950317383, + "learning_rate": 8.959074733096086e-05, + "loss": 1.9112, + "step": 6215 + }, + { + "epoch": 2.7626666666666666, + "grad_norm": 2.581855535507202, + "learning_rate": 8.957295373665481e-05, + "loss": 1.7821, + "step": 6216 + }, + { + "epoch": 2.763111111111111, + "grad_norm": 2.1973445415496826, + "learning_rate": 8.955516014234876e-05, + "loss": 1.4587, + "step": 6217 + }, + { + "epoch": 2.7635555555555555, + "grad_norm": 2.4556639194488525, + "learning_rate": 8.953736654804271e-05, + "loss": 2.1099, + "step": 6218 + }, + { + "epoch": 2.7640000000000002, + "grad_norm": 1.6619460582733154, + "learning_rate": 8.951957295373667e-05, + "loss": 0.7801, + "step": 6219 + }, + { + "epoch": 2.7644444444444445, + "grad_norm": 2.5685040950775146, + "learning_rate": 8.95017793594306e-05, + "loss": 1.572, + "step": 6220 + }, + { + "epoch": 2.7648888888888887, + "grad_norm": 2.160173177719116, + "learning_rate": 8.948398576512455e-05, + "loss": 1.5836, + "step": 6221 + }, + { + "epoch": 2.7653333333333334, + "grad_norm": 2.594742774963379, + "learning_rate": 8.946619217081851e-05, + "loss": 1.307, + "step": 6222 + }, + { + "epoch": 2.7657777777777777, + "grad_norm": 2.604459285736084, + "learning_rate": 8.944839857651245e-05, + "loss": 1.8215, + "step": 6223 + }, + { + "epoch": 2.7662222222222224, + "grad_norm": 2.575556516647339, + "learning_rate": 8.943060498220641e-05, + "loss": 1.7588, + "step": 6224 + }, + { + "epoch": 2.7666666666666666, + "grad_norm": 2.51969051361084, + "learning_rate": 8.941281138790037e-05, + "loss": 1.8821, + "step": 6225 + }, + { + "epoch": 2.7671111111111113, + "grad_norm": 2.5474319458007812, + "learning_rate": 8.939501779359431e-05, + "loss": 2.1172, + "step": 6226 + }, + { + "epoch": 2.7675555555555555, + "grad_norm": 2.6223058700561523, + "learning_rate": 8.937722419928825e-05, + "loss": 1.7894, + "step": 6227 + }, + { + "epoch": 2.768, + "grad_norm": 2.7866673469543457, + "learning_rate": 8.935943060498221e-05, + "loss": 1.7887, + "step": 6228 + }, + { + "epoch": 2.7684444444444445, + "grad_norm": 
0.3962222933769226, + "learning_rate": 8.934163701067617e-05, + "loss": 0.0359, + "step": 6229 + }, + { + "epoch": 2.7688888888888887, + "grad_norm": 2.9397222995758057, + "learning_rate": 8.932384341637011e-05, + "loss": 1.8361, + "step": 6230 + }, + { + "epoch": 2.7693333333333334, + "grad_norm": 2.984018087387085, + "learning_rate": 8.930604982206407e-05, + "loss": 1.7147, + "step": 6231 + }, + { + "epoch": 2.7697777777777777, + "grad_norm": 3.337759017944336, + "learning_rate": 8.928825622775802e-05, + "loss": 2.0495, + "step": 6232 + }, + { + "epoch": 2.7702222222222224, + "grad_norm": 2.786367654800415, + "learning_rate": 8.927046263345195e-05, + "loss": 1.7017, + "step": 6233 + }, + { + "epoch": 2.7706666666666666, + "grad_norm": 3.031346321105957, + "learning_rate": 8.925266903914591e-05, + "loss": 1.8949, + "step": 6234 + }, + { + "epoch": 2.771111111111111, + "grad_norm": 2.8684723377227783, + "learning_rate": 8.923487544483986e-05, + "loss": 1.7939, + "step": 6235 + }, + { + "epoch": 2.7715555555555556, + "grad_norm": 2.4864673614501953, + "learning_rate": 8.921708185053381e-05, + "loss": 1.6988, + "step": 6236 + }, + { + "epoch": 2.7720000000000002, + "grad_norm": 2.5959644317626953, + "learning_rate": 8.919928825622776e-05, + "loss": 1.4515, + "step": 6237 + }, + { + "epoch": 2.7724444444444445, + "grad_norm": 2.3537802696228027, + "learning_rate": 8.918149466192172e-05, + "loss": 1.2161, + "step": 6238 + }, + { + "epoch": 2.7728888888888887, + "grad_norm": 2.6737425327301025, + "learning_rate": 8.916370106761566e-05, + "loss": 1.6259, + "step": 6239 + }, + { + "epoch": 2.7733333333333334, + "grad_norm": 2.907355785369873, + "learning_rate": 8.91459074733096e-05, + "loss": 1.846, + "step": 6240 + }, + { + "epoch": 2.7737777777777777, + "grad_norm": 3.1080589294433594, + "learning_rate": 8.912811387900356e-05, + "loss": 1.718, + "step": 6241 + }, + { + "epoch": 2.7742222222222224, + "grad_norm": 3.7171342372894287, + "learning_rate": 8.911032028469752e-05, + "loss": 1.7999, + "step": 6242 + }, + { + "epoch": 2.7746666666666666, + "grad_norm": 2.974966049194336, + "learning_rate": 8.909252669039146e-05, + "loss": 1.5202, + "step": 6243 + }, + { + "epoch": 2.7751111111111113, + "grad_norm": 2.403938055038452, + "learning_rate": 8.907473309608542e-05, + "loss": 1.3251, + "step": 6244 + }, + { + "epoch": 2.7755555555555556, + "grad_norm": 3.240013599395752, + "learning_rate": 8.905693950177937e-05, + "loss": 1.1542, + "step": 6245 + }, + { + "epoch": 2.776, + "grad_norm": 2.923494815826416, + "learning_rate": 8.90391459074733e-05, + "loss": 1.5198, + "step": 6246 + }, + { + "epoch": 2.7764444444444445, + "grad_norm": 3.5100812911987305, + "learning_rate": 8.902135231316726e-05, + "loss": 1.4171, + "step": 6247 + }, + { + "epoch": 2.7768888888888887, + "grad_norm": 2.885017156600952, + "learning_rate": 8.900355871886122e-05, + "loss": 1.4852, + "step": 6248 + }, + { + "epoch": 2.7773333333333334, + "grad_norm": 3.7107977867126465, + "learning_rate": 8.898576512455516e-05, + "loss": 1.3071, + "step": 6249 + }, + { + "epoch": 2.7777777777777777, + "grad_norm": 3.349717378616333, + "learning_rate": 8.896797153024912e-05, + "loss": 1.5356, + "step": 6250 + }, + { + "epoch": 2.7782222222222224, + "grad_norm": 2.166078805923462, + "learning_rate": 8.895017793594307e-05, + "loss": 1.9864, + "step": 6251 + }, + { + "epoch": 2.7786666666666666, + "grad_norm": 2.238597869873047, + "learning_rate": 8.893238434163702e-05, + "loss": 1.9695, + "step": 6252 + }, + { + "epoch": 2.779111111111111, + 
"grad_norm": 1.4127635955810547, + "learning_rate": 8.891459074733096e-05, + "loss": 1.1453, + "step": 6253 + }, + { + "epoch": 2.7795555555555556, + "grad_norm": 2.074450969696045, + "learning_rate": 8.889679715302492e-05, + "loss": 1.5515, + "step": 6254 + }, + { + "epoch": 2.7800000000000002, + "grad_norm": 2.3815135955810547, + "learning_rate": 8.887900355871886e-05, + "loss": 1.9383, + "step": 6255 + }, + { + "epoch": 2.7804444444444445, + "grad_norm": 2.3668477535247803, + "learning_rate": 8.886120996441281e-05, + "loss": 1.685, + "step": 6256 + }, + { + "epoch": 2.7808888888888887, + "grad_norm": 2.168121576309204, + "learning_rate": 8.884341637010677e-05, + "loss": 1.783, + "step": 6257 + }, + { + "epoch": 2.7813333333333334, + "grad_norm": 1.5242462158203125, + "learning_rate": 8.882562277580071e-05, + "loss": 1.0202, + "step": 6258 + }, + { + "epoch": 2.7817777777777777, + "grad_norm": 2.2029366493225098, + "learning_rate": 8.880782918149466e-05, + "loss": 1.4853, + "step": 6259 + }, + { + "epoch": 2.7822222222222224, + "grad_norm": 2.45417857170105, + "learning_rate": 8.879003558718861e-05, + "loss": 1.9181, + "step": 6260 + }, + { + "epoch": 2.7826666666666666, + "grad_norm": 2.351133346557617, + "learning_rate": 8.877224199288257e-05, + "loss": 1.9803, + "step": 6261 + }, + { + "epoch": 2.7831111111111113, + "grad_norm": 2.16433048248291, + "learning_rate": 8.875444839857651e-05, + "loss": 1.7061, + "step": 6262 + }, + { + "epoch": 2.7835555555555556, + "grad_norm": 2.3350791931152344, + "learning_rate": 8.873665480427047e-05, + "loss": 1.7437, + "step": 6263 + }, + { + "epoch": 2.784, + "grad_norm": 2.5073392391204834, + "learning_rate": 8.871886120996443e-05, + "loss": 1.6962, + "step": 6264 + }, + { + "epoch": 2.7844444444444445, + "grad_norm": 2.518251657485962, + "learning_rate": 8.870106761565836e-05, + "loss": 1.6035, + "step": 6265 + }, + { + "epoch": 2.7848888888888887, + "grad_norm": 2.514759063720703, + "learning_rate": 8.868327402135231e-05, + "loss": 1.9147, + "step": 6266 + }, + { + "epoch": 2.7853333333333334, + "grad_norm": 2.542900562286377, + "learning_rate": 8.866548042704627e-05, + "loss": 1.8066, + "step": 6267 + }, + { + "epoch": 2.7857777777777777, + "grad_norm": 2.330296754837036, + "learning_rate": 8.864768683274021e-05, + "loss": 1.7774, + "step": 6268 + }, + { + "epoch": 2.7862222222222224, + "grad_norm": 2.444782018661499, + "learning_rate": 8.862989323843417e-05, + "loss": 2.0453, + "step": 6269 + }, + { + "epoch": 2.7866666666666666, + "grad_norm": 2.7158892154693604, + "learning_rate": 8.861209964412812e-05, + "loss": 2.2444, + "step": 6270 + }, + { + "epoch": 2.787111111111111, + "grad_norm": 2.286931037902832, + "learning_rate": 8.859430604982207e-05, + "loss": 1.1165, + "step": 6271 + }, + { + "epoch": 2.7875555555555556, + "grad_norm": 2.6693062782287598, + "learning_rate": 8.857651245551601e-05, + "loss": 1.8122, + "step": 6272 + }, + { + "epoch": 2.7880000000000003, + "grad_norm": 2.527357816696167, + "learning_rate": 8.855871886120997e-05, + "loss": 1.9374, + "step": 6273 + }, + { + "epoch": 2.7884444444444445, + "grad_norm": 2.357048273086548, + "learning_rate": 8.854092526690392e-05, + "loss": 1.9746, + "step": 6274 + }, + { + "epoch": 2.7888888888888888, + "grad_norm": 2.5850024223327637, + "learning_rate": 8.852313167259787e-05, + "loss": 1.3167, + "step": 6275 + }, + { + "epoch": 2.7893333333333334, + "grad_norm": 2.3246774673461914, + "learning_rate": 8.850533807829182e-05, + "loss": 1.5308, + "step": 6276 + }, + { + "epoch": 
2.7897777777777777, + "grad_norm": 2.6649742126464844, + "learning_rate": 8.848754448398578e-05, + "loss": 1.9587, + "step": 6277 + }, + { + "epoch": 2.7902222222222224, + "grad_norm": 2.920469284057617, + "learning_rate": 8.846975088967971e-05, + "loss": 2.0161, + "step": 6278 + }, + { + "epoch": 2.7906666666666666, + "grad_norm": 2.4961137771606445, + "learning_rate": 8.845195729537367e-05, + "loss": 1.7213, + "step": 6279 + }, + { + "epoch": 2.7911111111111113, + "grad_norm": 2.586489677429199, + "learning_rate": 8.843416370106762e-05, + "loss": 1.6672, + "step": 6280 + }, + { + "epoch": 2.7915555555555556, + "grad_norm": 2.7179059982299805, + "learning_rate": 8.841637010676156e-05, + "loss": 1.6846, + "step": 6281 + }, + { + "epoch": 2.792, + "grad_norm": 2.3709304332733154, + "learning_rate": 8.839857651245552e-05, + "loss": 1.6412, + "step": 6282 + }, + { + "epoch": 2.7924444444444445, + "grad_norm": 2.750460624694824, + "learning_rate": 8.838078291814948e-05, + "loss": 1.928, + "step": 6283 + }, + { + "epoch": 2.7928888888888888, + "grad_norm": 2.755523681640625, + "learning_rate": 8.836298932384342e-05, + "loss": 1.3791, + "step": 6284 + }, + { + "epoch": 2.7933333333333334, + "grad_norm": 6.7336554527282715, + "learning_rate": 8.834519572953736e-05, + "loss": 1.7842, + "step": 6285 + }, + { + "epoch": 2.7937777777777777, + "grad_norm": 1.775144100189209, + "learning_rate": 8.832740213523132e-05, + "loss": 0.8977, + "step": 6286 + }, + { + "epoch": 2.7942222222222224, + "grad_norm": 1.6157419681549072, + "learning_rate": 8.830960854092528e-05, + "loss": 0.6727, + "step": 6287 + }, + { + "epoch": 2.7946666666666666, + "grad_norm": 3.1187615394592285, + "learning_rate": 8.829181494661922e-05, + "loss": 2.2613, + "step": 6288 + }, + { + "epoch": 2.795111111111111, + "grad_norm": 2.994696617126465, + "learning_rate": 8.827402135231318e-05, + "loss": 1.5852, + "step": 6289 + }, + { + "epoch": 2.7955555555555556, + "grad_norm": 2.502854585647583, + "learning_rate": 8.825622775800713e-05, + "loss": 1.5286, + "step": 6290 + }, + { + "epoch": 2.7960000000000003, + "grad_norm": 2.853635549545288, + "learning_rate": 8.823843416370106e-05, + "loss": 1.7994, + "step": 6291 + }, + { + "epoch": 2.7964444444444445, + "grad_norm": 4.293439865112305, + "learning_rate": 8.822064056939502e-05, + "loss": 2.0785, + "step": 6292 + }, + { + "epoch": 2.7968888888888888, + "grad_norm": 3.213901996612549, + "learning_rate": 8.820284697508897e-05, + "loss": 1.9483, + "step": 6293 + }, + { + "epoch": 2.7973333333333334, + "grad_norm": 3.6944658756256104, + "learning_rate": 8.818505338078292e-05, + "loss": 2.1967, + "step": 6294 + }, + { + "epoch": 2.7977777777777777, + "grad_norm": 2.9698970317840576, + "learning_rate": 8.816725978647687e-05, + "loss": 1.5886, + "step": 6295 + }, + { + "epoch": 2.7982222222222224, + "grad_norm": 3.9858527183532715, + "learning_rate": 8.814946619217083e-05, + "loss": 1.9551, + "step": 6296 + }, + { + "epoch": 2.7986666666666666, + "grad_norm": 4.04840612411499, + "learning_rate": 8.813167259786477e-05, + "loss": 2.0034, + "step": 6297 + }, + { + "epoch": 2.7991111111111113, + "grad_norm": 3.7091634273529053, + "learning_rate": 8.811387900355872e-05, + "loss": 2.1336, + "step": 6298 + }, + { + "epoch": 2.7995555555555556, + "grad_norm": 3.552135944366455, + "learning_rate": 8.809608540925267e-05, + "loss": 1.8169, + "step": 6299 + }, + { + "epoch": 2.8, + "grad_norm": 2.049909830093384, + "learning_rate": 8.807829181494662e-05, + "loss": 0.6878, + "step": 6300 + }, + { + 
"epoch": 2.8004444444444445, + "grad_norm": 3.8162107467651367, + "learning_rate": 8.806049822064057e-05, + "loss": 2.3236, + "step": 6301 + }, + { + "epoch": 2.8008888888888888, + "grad_norm": 1.9239376783370972, + "learning_rate": 8.804270462633453e-05, + "loss": 1.5104, + "step": 6302 + }, + { + "epoch": 2.8013333333333335, + "grad_norm": 1.867793083190918, + "learning_rate": 8.802491103202847e-05, + "loss": 1.936, + "step": 6303 + }, + { + "epoch": 2.8017777777777777, + "grad_norm": 2.062577486038208, + "learning_rate": 8.800711743772242e-05, + "loss": 1.8487, + "step": 6304 + }, + { + "epoch": 2.8022222222222224, + "grad_norm": 2.304241895675659, + "learning_rate": 8.798932384341637e-05, + "loss": 1.9238, + "step": 6305 + }, + { + "epoch": 2.8026666666666666, + "grad_norm": 2.0936267375946045, + "learning_rate": 8.797153024911033e-05, + "loss": 1.6966, + "step": 6306 + }, + { + "epoch": 2.803111111111111, + "grad_norm": 2.325967788696289, + "learning_rate": 8.795373665480427e-05, + "loss": 2.0191, + "step": 6307 + }, + { + "epoch": 2.8035555555555556, + "grad_norm": 2.175997495651245, + "learning_rate": 8.793594306049823e-05, + "loss": 1.5685, + "step": 6308 + }, + { + "epoch": 2.8040000000000003, + "grad_norm": 1.519380807876587, + "learning_rate": 8.791814946619218e-05, + "loss": 0.9142, + "step": 6309 + }, + { + "epoch": 2.8044444444444445, + "grad_norm": 2.5699596405029297, + "learning_rate": 8.790035587188611e-05, + "loss": 2.3034, + "step": 6310 + }, + { + "epoch": 2.8048888888888888, + "grad_norm": 2.172449827194214, + "learning_rate": 8.788256227758007e-05, + "loss": 1.7681, + "step": 6311 + }, + { + "epoch": 2.8053333333333335, + "grad_norm": 2.67712140083313, + "learning_rate": 8.786476868327403e-05, + "loss": 2.1584, + "step": 6312 + }, + { + "epoch": 2.8057777777777777, + "grad_norm": 2.426130771636963, + "learning_rate": 8.784697508896797e-05, + "loss": 1.7722, + "step": 6313 + }, + { + "epoch": 2.806222222222222, + "grad_norm": 2.3680663108825684, + "learning_rate": 8.782918149466193e-05, + "loss": 1.886, + "step": 6314 + }, + { + "epoch": 2.8066666666666666, + "grad_norm": 2.257155179977417, + "learning_rate": 8.781138790035588e-05, + "loss": 1.7808, + "step": 6315 + }, + { + "epoch": 2.8071111111111113, + "grad_norm": 2.636523485183716, + "learning_rate": 8.779359430604983e-05, + "loss": 1.7881, + "step": 6316 + }, + { + "epoch": 2.8075555555555556, + "grad_norm": 2.501422882080078, + "learning_rate": 8.777580071174377e-05, + "loss": 1.814, + "step": 6317 + }, + { + "epoch": 2.808, + "grad_norm": 2.323500394821167, + "learning_rate": 8.775800711743772e-05, + "loss": 1.7802, + "step": 6318 + }, + { + "epoch": 2.8084444444444445, + "grad_norm": 1.7834057807922363, + "learning_rate": 8.774021352313168e-05, + "loss": 0.901, + "step": 6319 + }, + { + "epoch": 2.8088888888888888, + "grad_norm": 1.7829649448394775, + "learning_rate": 8.772241992882562e-05, + "loss": 0.9174, + "step": 6320 + }, + { + "epoch": 2.8093333333333335, + "grad_norm": 2.242635488510132, + "learning_rate": 8.770462633451958e-05, + "loss": 1.7682, + "step": 6321 + }, + { + "epoch": 2.8097777777777777, + "grad_norm": 2.7776167392730713, + "learning_rate": 8.768683274021354e-05, + "loss": 2.1089, + "step": 6322 + }, + { + "epoch": 2.8102222222222224, + "grad_norm": 2.151515483856201, + "learning_rate": 8.766903914590747e-05, + "loss": 1.4301, + "step": 6323 + }, + { + "epoch": 2.8106666666666666, + "grad_norm": 2.3415439128875732, + "learning_rate": 8.765124555160142e-05, + "loss": 1.8939, + "step": 6324 
+ }, + { + "epoch": 2.811111111111111, + "grad_norm": 2.3634235858917236, + "learning_rate": 8.763345195729538e-05, + "loss": 1.9442, + "step": 6325 + }, + { + "epoch": 2.8115555555555556, + "grad_norm": 2.3634817600250244, + "learning_rate": 8.761565836298932e-05, + "loss": 1.3034, + "step": 6326 + }, + { + "epoch": 2.8120000000000003, + "grad_norm": 2.2050936222076416, + "learning_rate": 8.759786476868328e-05, + "loss": 1.4394, + "step": 6327 + }, + { + "epoch": 2.8124444444444445, + "grad_norm": 2.491987466812134, + "learning_rate": 8.758007117437724e-05, + "loss": 1.4429, + "step": 6328 + }, + { + "epoch": 2.8128888888888888, + "grad_norm": 2.7539877891540527, + "learning_rate": 8.756227758007118e-05, + "loss": 1.8103, + "step": 6329 + }, + { + "epoch": 2.8133333333333335, + "grad_norm": 1.8611799478530884, + "learning_rate": 8.754448398576512e-05, + "loss": 0.8885, + "step": 6330 + }, + { + "epoch": 2.8137777777777777, + "grad_norm": 2.7642264366149902, + "learning_rate": 8.752669039145908e-05, + "loss": 1.6687, + "step": 6331 + }, + { + "epoch": 2.814222222222222, + "grad_norm": 2.5059311389923096, + "learning_rate": 8.750889679715303e-05, + "loss": 1.5068, + "step": 6332 + }, + { + "epoch": 2.8146666666666667, + "grad_norm": 2.761054515838623, + "learning_rate": 8.749110320284698e-05, + "loss": 1.7512, + "step": 6333 + }, + { + "epoch": 2.8151111111111113, + "grad_norm": 2.5050809383392334, + "learning_rate": 8.747330960854093e-05, + "loss": 1.4633, + "step": 6334 + }, + { + "epoch": 2.8155555555555556, + "grad_norm": 3.096000909805298, + "learning_rate": 8.745551601423489e-05, + "loss": 1.7558, + "step": 6335 + }, + { + "epoch": 2.816, + "grad_norm": 3.1274073123931885, + "learning_rate": 8.743772241992882e-05, + "loss": 1.614, + "step": 6336 + }, + { + "epoch": 2.8164444444444445, + "grad_norm": 3.165975332260132, + "learning_rate": 8.741992882562278e-05, + "loss": 1.5489, + "step": 6337 + }, + { + "epoch": 2.8168888888888888, + "grad_norm": 3.016789674758911, + "learning_rate": 8.740213523131673e-05, + "loss": 1.8568, + "step": 6338 + }, + { + "epoch": 2.8173333333333335, + "grad_norm": 2.219804525375366, + "learning_rate": 8.738434163701068e-05, + "loss": 1.0983, + "step": 6339 + }, + { + "epoch": 2.8177777777777777, + "grad_norm": 3.128307819366455, + "learning_rate": 8.736654804270463e-05, + "loss": 1.4682, + "step": 6340 + }, + { + "epoch": 2.8182222222222224, + "grad_norm": 3.149846076965332, + "learning_rate": 8.734875444839859e-05, + "loss": 1.8692, + "step": 6341 + }, + { + "epoch": 2.8186666666666667, + "grad_norm": 2.897365093231201, + "learning_rate": 8.733096085409253e-05, + "loss": 1.5963, + "step": 6342 + }, + { + "epoch": 2.819111111111111, + "grad_norm": 3.354923963546753, + "learning_rate": 8.731316725978647e-05, + "loss": 1.8655, + "step": 6343 + }, + { + "epoch": 2.8195555555555556, + "grad_norm": 3.33359432220459, + "learning_rate": 8.729537366548043e-05, + "loss": 1.8005, + "step": 6344 + }, + { + "epoch": 2.82, + "grad_norm": 2.5689620971679688, + "learning_rate": 8.727758007117437e-05, + "loss": 1.5202, + "step": 6345 + }, + { + "epoch": 2.8204444444444445, + "grad_norm": 4.082332134246826, + "learning_rate": 8.725978647686833e-05, + "loss": 2.2171, + "step": 6346 + }, + { + "epoch": 2.820888888888889, + "grad_norm": 3.0966172218322754, + "learning_rate": 8.724199288256229e-05, + "loss": 1.9997, + "step": 6347 + }, + { + "epoch": 2.8213333333333335, + "grad_norm": 3.0002849102020264, + "learning_rate": 8.722419928825623e-05, + "loss": 1.488, + "step": 6348 
+ }, + { + "epoch": 2.8217777777777777, + "grad_norm": 4.207763671875, + "learning_rate": 8.720640569395017e-05, + "loss": 1.2479, + "step": 6349 + }, + { + "epoch": 2.822222222222222, + "grad_norm": 2.4575085639953613, + "learning_rate": 8.718861209964413e-05, + "loss": 0.5755, + "step": 6350 + }, + { + "epoch": 2.8226666666666667, + "grad_norm": 1.834578275680542, + "learning_rate": 8.717081850533809e-05, + "loss": 2.0906, + "step": 6351 + }, + { + "epoch": 2.8231111111111113, + "grad_norm": 1.9886993169784546, + "learning_rate": 8.715302491103203e-05, + "loss": 1.6937, + "step": 6352 + }, + { + "epoch": 2.8235555555555556, + "grad_norm": 1.45979905128479, + "learning_rate": 8.713523131672599e-05, + "loss": 1.3237, + "step": 6353 + }, + { + "epoch": 2.824, + "grad_norm": 2.220140218734741, + "learning_rate": 8.711743772241994e-05, + "loss": 2.1938, + "step": 6354 + }, + { + "epoch": 2.8244444444444445, + "grad_norm": 2.36525821685791, + "learning_rate": 8.709964412811388e-05, + "loss": 1.7316, + "step": 6355 + }, + { + "epoch": 2.824888888888889, + "grad_norm": 2.672027587890625, + "learning_rate": 8.708185053380783e-05, + "loss": 2.1021, + "step": 6356 + }, + { + "epoch": 2.8253333333333335, + "grad_norm": 2.566497325897217, + "learning_rate": 8.706405693950178e-05, + "loss": 2.0366, + "step": 6357 + }, + { + "epoch": 2.8257777777777777, + "grad_norm": 2.3866355419158936, + "learning_rate": 8.704626334519573e-05, + "loss": 1.8904, + "step": 6358 + }, + { + "epoch": 2.8262222222222224, + "grad_norm": 2.158238172531128, + "learning_rate": 8.702846975088968e-05, + "loss": 1.9666, + "step": 6359 + }, + { + "epoch": 2.8266666666666667, + "grad_norm": 2.4062252044677734, + "learning_rate": 8.701067615658364e-05, + "loss": 1.7557, + "step": 6360 + }, + { + "epoch": 2.827111111111111, + "grad_norm": 2.215730905532837, + "learning_rate": 8.699288256227758e-05, + "loss": 1.0731, + "step": 6361 + }, + { + "epoch": 2.8275555555555556, + "grad_norm": 2.172459125518799, + "learning_rate": 8.697508896797153e-05, + "loss": 1.7543, + "step": 6362 + }, + { + "epoch": 2.828, + "grad_norm": 2.4174609184265137, + "learning_rate": 8.695729537366548e-05, + "loss": 1.5526, + "step": 6363 + }, + { + "epoch": 2.8284444444444445, + "grad_norm": 1.6709802150726318, + "learning_rate": 8.693950177935944e-05, + "loss": 0.8726, + "step": 6364 + }, + { + "epoch": 2.828888888888889, + "grad_norm": 2.830564022064209, + "learning_rate": 8.692170818505338e-05, + "loss": 1.9783, + "step": 6365 + }, + { + "epoch": 2.8293333333333335, + "grad_norm": 2.5049219131469727, + "learning_rate": 8.690391459074734e-05, + "loss": 1.7796, + "step": 6366 + }, + { + "epoch": 2.8297777777777777, + "grad_norm": 2.455636739730835, + "learning_rate": 8.68861209964413e-05, + "loss": 1.807, + "step": 6367 + }, + { + "epoch": 2.830222222222222, + "grad_norm": 2.934166431427002, + "learning_rate": 8.686832740213522e-05, + "loss": 1.4921, + "step": 6368 + }, + { + "epoch": 2.8306666666666667, + "grad_norm": 3.1317198276519775, + "learning_rate": 8.685053380782918e-05, + "loss": 1.7484, + "step": 6369 + }, + { + "epoch": 2.8311111111111114, + "grad_norm": 3.014157295227051, + "learning_rate": 8.683274021352314e-05, + "loss": 2.2238, + "step": 6370 + }, + { + "epoch": 2.8315555555555556, + "grad_norm": 2.56988787651062, + "learning_rate": 8.681494661921708e-05, + "loss": 1.6546, + "step": 6371 + }, + { + "epoch": 2.832, + "grad_norm": 2.6320784091949463, + "learning_rate": 8.679715302491104e-05, + "loss": 2.1411, + "step": 6372 + }, + { + "epoch": 
2.8324444444444445, + "grad_norm": 2.682128667831421, + "learning_rate": 8.6779359430605e-05, + "loss": 2.1698, + "step": 6373 + }, + { + "epoch": 2.832888888888889, + "grad_norm": 2.754392623901367, + "learning_rate": 8.676156583629894e-05, + "loss": 1.8696, + "step": 6374 + }, + { + "epoch": 2.8333333333333335, + "grad_norm": 2.4099462032318115, + "learning_rate": 8.674377224199288e-05, + "loss": 1.8406, + "step": 6375 + }, + { + "epoch": 2.8337777777777777, + "grad_norm": 3.0007071495056152, + "learning_rate": 8.672597864768684e-05, + "loss": 1.7611, + "step": 6376 + }, + { + "epoch": 2.8342222222222224, + "grad_norm": 3.2685530185699463, + "learning_rate": 8.670818505338079e-05, + "loss": 1.8421, + "step": 6377 + }, + { + "epoch": 2.8346666666666667, + "grad_norm": 2.7052693367004395, + "learning_rate": 8.669039145907474e-05, + "loss": 1.6633, + "step": 6378 + }, + { + "epoch": 2.835111111111111, + "grad_norm": 2.628025531768799, + "learning_rate": 8.667259786476869e-05, + "loss": 1.7274, + "step": 6379 + }, + { + "epoch": 2.8355555555555556, + "grad_norm": 1.5739624500274658, + "learning_rate": 8.665480427046265e-05, + "loss": 0.7554, + "step": 6380 + }, + { + "epoch": 2.836, + "grad_norm": 1.9430078268051147, + "learning_rate": 8.663701067615658e-05, + "loss": 0.8761, + "step": 6381 + }, + { + "epoch": 2.8364444444444445, + "grad_norm": 2.2438039779663086, + "learning_rate": 8.661921708185053e-05, + "loss": 1.1933, + "step": 6382 + }, + { + "epoch": 2.836888888888889, + "grad_norm": 2.6071414947509766, + "learning_rate": 8.660142348754449e-05, + "loss": 1.7854, + "step": 6383 + }, + { + "epoch": 2.8373333333333335, + "grad_norm": 1.6424260139465332, + "learning_rate": 8.658362989323843e-05, + "loss": 0.7366, + "step": 6384 + }, + { + "epoch": 2.8377777777777777, + "grad_norm": 2.981468439102173, + "learning_rate": 8.656583629893239e-05, + "loss": 1.7676, + "step": 6385 + }, + { + "epoch": 2.838222222222222, + "grad_norm": 2.440037965774536, + "learning_rate": 8.654804270462635e-05, + "loss": 1.5232, + "step": 6386 + }, + { + "epoch": 2.8386666666666667, + "grad_norm": 2.5505833625793457, + "learning_rate": 8.653024911032029e-05, + "loss": 1.4548, + "step": 6387 + }, + { + "epoch": 2.8391111111111114, + "grad_norm": 2.7578847408294678, + "learning_rate": 8.651245551601423e-05, + "loss": 1.5028, + "step": 6388 + }, + { + "epoch": 2.8395555555555556, + "grad_norm": 2.750519275665283, + "learning_rate": 8.649466192170819e-05, + "loss": 1.7589, + "step": 6389 + }, + { + "epoch": 2.84, + "grad_norm": 2.7024030685424805, + "learning_rate": 8.647686832740213e-05, + "loss": 1.6456, + "step": 6390 + }, + { + "epoch": 2.8404444444444445, + "grad_norm": 2.8694217205047607, + "learning_rate": 8.645907473309609e-05, + "loss": 1.0521, + "step": 6391 + }, + { + "epoch": 2.840888888888889, + "grad_norm": 2.930608034133911, + "learning_rate": 8.644128113879004e-05, + "loss": 1.6561, + "step": 6392 + }, + { + "epoch": 2.8413333333333335, + "grad_norm": 2.9115495681762695, + "learning_rate": 8.642348754448399e-05, + "loss": 1.8796, + "step": 6393 + }, + { + "epoch": 2.8417777777777777, + "grad_norm": 3.2313528060913086, + "learning_rate": 8.640569395017793e-05, + "loss": 1.8615, + "step": 6394 + }, + { + "epoch": 2.8422222222222224, + "grad_norm": 2.823005437850952, + "learning_rate": 8.638790035587189e-05, + "loss": 1.7883, + "step": 6395 + }, + { + "epoch": 2.8426666666666667, + "grad_norm": 3.2536568641662598, + "learning_rate": 8.637010676156584e-05, + "loss": 1.734, + "step": 6396 + }, + { + 
"epoch": 2.843111111111111, + "grad_norm": 3.4909675121307373, + "learning_rate": 8.635231316725979e-05, + "loss": 1.5882, + "step": 6397 + }, + { + "epoch": 2.8435555555555556, + "grad_norm": 3.702479124069214, + "learning_rate": 8.633451957295374e-05, + "loss": 1.7106, + "step": 6398 + }, + { + "epoch": 2.844, + "grad_norm": 3.9514353275299072, + "learning_rate": 8.63167259786477e-05, + "loss": 2.0991, + "step": 6399 + }, + { + "epoch": 2.8444444444444446, + "grad_norm": 3.16878080368042, + "learning_rate": 8.629893238434164e-05, + "loss": 1.7365, + "step": 6400 + }, + { + "epoch": 2.844888888888889, + "grad_norm": 1.7129290103912354, + "learning_rate": 8.628113879003559e-05, + "loss": 2.2009, + "step": 6401 + }, + { + "epoch": 2.8453333333333335, + "grad_norm": 1.914031744003296, + "learning_rate": 8.626334519572954e-05, + "loss": 2.0489, + "step": 6402 + }, + { + "epoch": 2.8457777777777777, + "grad_norm": 2.1656267642974854, + "learning_rate": 8.624555160142348e-05, + "loss": 2.4192, + "step": 6403 + }, + { + "epoch": 2.846222222222222, + "grad_norm": 2.315261125564575, + "learning_rate": 8.622775800711744e-05, + "loss": 1.8947, + "step": 6404 + }, + { + "epoch": 2.8466666666666667, + "grad_norm": 2.6250321865081787, + "learning_rate": 8.62099644128114e-05, + "loss": 2.3402, + "step": 6405 + }, + { + "epoch": 2.8471111111111114, + "grad_norm": 2.3993537425994873, + "learning_rate": 8.619217081850534e-05, + "loss": 1.9262, + "step": 6406 + }, + { + "epoch": 2.8475555555555556, + "grad_norm": 2.0765573978424072, + "learning_rate": 8.617437722419928e-05, + "loss": 2.1101, + "step": 6407 + }, + { + "epoch": 2.848, + "grad_norm": 2.398111343383789, + "learning_rate": 8.615658362989324e-05, + "loss": 1.8188, + "step": 6408 + }, + { + "epoch": 2.8484444444444446, + "grad_norm": 2.267061471939087, + "learning_rate": 8.61387900355872e-05, + "loss": 2.3754, + "step": 6409 + }, + { + "epoch": 2.848888888888889, + "grad_norm": 2.5441832542419434, + "learning_rate": 8.612099644128114e-05, + "loss": 1.6375, + "step": 6410 + }, + { + "epoch": 2.8493333333333335, + "grad_norm": 2.2044379711151123, + "learning_rate": 8.61032028469751e-05, + "loss": 1.8951, + "step": 6411 + }, + { + "epoch": 2.8497777777777777, + "grad_norm": 2.2039835453033447, + "learning_rate": 8.608540925266905e-05, + "loss": 1.8844, + "step": 6412 + }, + { + "epoch": 2.8502222222222224, + "grad_norm": 2.50587797164917, + "learning_rate": 8.6067615658363e-05, + "loss": 1.9177, + "step": 6413 + }, + { + "epoch": 2.8506666666666667, + "grad_norm": 2.2638602256774902, + "learning_rate": 8.604982206405694e-05, + "loss": 1.5041, + "step": 6414 + }, + { + "epoch": 2.851111111111111, + "grad_norm": 2.9379706382751465, + "learning_rate": 8.60320284697509e-05, + "loss": 1.5956, + "step": 6415 + }, + { + "epoch": 2.8515555555555556, + "grad_norm": 2.415709972381592, + "learning_rate": 8.601423487544484e-05, + "loss": 1.5924, + "step": 6416 + }, + { + "epoch": 2.852, + "grad_norm": 2.4809000492095947, + "learning_rate": 8.59964412811388e-05, + "loss": 1.9965, + "step": 6417 + }, + { + "epoch": 2.8524444444444446, + "grad_norm": 2.643976926803589, + "learning_rate": 8.597864768683275e-05, + "loss": 2.1711, + "step": 6418 + }, + { + "epoch": 2.852888888888889, + "grad_norm": 2.432825803756714, + "learning_rate": 8.59608540925267e-05, + "loss": 1.4952, + "step": 6419 + }, + { + "epoch": 2.8533333333333335, + "grad_norm": 2.853224277496338, + "learning_rate": 8.594306049822064e-05, + "loss": 1.5422, + "step": 6420 + }, + { + "epoch": 
2.8537777777777777, + "grad_norm": 2.624593496322632, + "learning_rate": 8.59252669039146e-05, + "loss": 1.5306, + "step": 6421 + }, + { + "epoch": 2.854222222222222, + "grad_norm": 2.337578296661377, + "learning_rate": 8.590747330960855e-05, + "loss": 1.6708, + "step": 6422 + }, + { + "epoch": 2.8546666666666667, + "grad_norm": 2.62070369720459, + "learning_rate": 8.588967971530249e-05, + "loss": 1.799, + "step": 6423 + }, + { + "epoch": 2.8551111111111114, + "grad_norm": 2.670642852783203, + "learning_rate": 8.587188612099645e-05, + "loss": 1.7654, + "step": 6424 + }, + { + "epoch": 2.8555555555555556, + "grad_norm": 2.330082416534424, + "learning_rate": 8.58540925266904e-05, + "loss": 1.4492, + "step": 6425 + }, + { + "epoch": 2.856, + "grad_norm": 2.2589471340179443, + "learning_rate": 8.583629893238434e-05, + "loss": 1.5373, + "step": 6426 + }, + { + "epoch": 2.8564444444444446, + "grad_norm": 2.6059587001800537, + "learning_rate": 8.581850533807829e-05, + "loss": 1.8362, + "step": 6427 + }, + { + "epoch": 2.856888888888889, + "grad_norm": 2.6167008876800537, + "learning_rate": 8.580071174377225e-05, + "loss": 1.7661, + "step": 6428 + }, + { + "epoch": 2.857333333333333, + "grad_norm": 2.616682291030884, + "learning_rate": 8.578291814946619e-05, + "loss": 1.8365, + "step": 6429 + }, + { + "epoch": 2.8577777777777778, + "grad_norm": 0.22476230561733246, + "learning_rate": 8.576512455516015e-05, + "loss": 0.0266, + "step": 6430 + }, + { + "epoch": 2.8582222222222224, + "grad_norm": 1.9581819772720337, + "learning_rate": 8.57473309608541e-05, + "loss": 1.0284, + "step": 6431 + }, + { + "epoch": 2.8586666666666667, + "grad_norm": 2.870471477508545, + "learning_rate": 8.572953736654805e-05, + "loss": 1.8517, + "step": 6432 + }, + { + "epoch": 2.859111111111111, + "grad_norm": 2.1929726600646973, + "learning_rate": 8.571174377224199e-05, + "loss": 1.1532, + "step": 6433 + }, + { + "epoch": 2.8595555555555556, + "grad_norm": 2.5478527545928955, + "learning_rate": 8.569395017793595e-05, + "loss": 1.6376, + "step": 6434 + }, + { + "epoch": 2.86, + "grad_norm": 3.627499580383301, + "learning_rate": 8.567615658362989e-05, + "loss": 1.6505, + "step": 6435 + }, + { + "epoch": 2.8604444444444446, + "grad_norm": 2.877537727355957, + "learning_rate": 8.565836298932385e-05, + "loss": 2.0833, + "step": 6436 + }, + { + "epoch": 2.860888888888889, + "grad_norm": 2.8161423206329346, + "learning_rate": 8.56405693950178e-05, + "loss": 1.7967, + "step": 6437 + }, + { + "epoch": 2.8613333333333335, + "grad_norm": 3.0182766914367676, + "learning_rate": 8.562277580071175e-05, + "loss": 1.8528, + "step": 6438 + }, + { + "epoch": 2.8617777777777778, + "grad_norm": 2.9146299362182617, + "learning_rate": 8.560498220640569e-05, + "loss": 1.3118, + "step": 6439 + }, + { + "epoch": 2.862222222222222, + "grad_norm": 4.448184967041016, + "learning_rate": 8.558718861209964e-05, + "loss": 1.9648, + "step": 6440 + }, + { + "epoch": 2.8626666666666667, + "grad_norm": 3.721683979034424, + "learning_rate": 8.55693950177936e-05, + "loss": 2.0082, + "step": 6441 + }, + { + "epoch": 2.8631111111111114, + "grad_norm": 2.6923537254333496, + "learning_rate": 8.555160142348754e-05, + "loss": 1.5117, + "step": 6442 + }, + { + "epoch": 2.8635555555555556, + "grad_norm": 3.0055980682373047, + "learning_rate": 8.55338078291815e-05, + "loss": 1.688, + "step": 6443 + }, + { + "epoch": 2.864, + "grad_norm": 2.95833158493042, + "learning_rate": 8.551601423487546e-05, + "loss": 1.8496, + "step": 6444 + }, + { + "epoch": 2.8644444444444446, + 
"grad_norm": 3.0208351612091064, + "learning_rate": 8.54982206405694e-05, + "loss": 1.6707, + "step": 6445 + }, + { + "epoch": 2.864888888888889, + "grad_norm": 3.0486018657684326, + "learning_rate": 8.548042704626334e-05, + "loss": 1.6162, + "step": 6446 + }, + { + "epoch": 2.865333333333333, + "grad_norm": 4.173706531524658, + "learning_rate": 8.54626334519573e-05, + "loss": 2.0012, + "step": 6447 + }, + { + "epoch": 2.8657777777777778, + "grad_norm": 3.727224349975586, + "learning_rate": 8.544483985765124e-05, + "loss": 2.3882, + "step": 6448 + }, + { + "epoch": 2.8662222222222224, + "grad_norm": 3.276524782180786, + "learning_rate": 8.54270462633452e-05, + "loss": 1.3258, + "step": 6449 + }, + { + "epoch": 2.8666666666666667, + "grad_norm": 3.5537030696868896, + "learning_rate": 8.540925266903916e-05, + "loss": 0.7531, + "step": 6450 + }, + { + "epoch": 2.867111111111111, + "grad_norm": 2.074697732925415, + "learning_rate": 8.53914590747331e-05, + "loss": 2.3748, + "step": 6451 + }, + { + "epoch": 2.8675555555555556, + "grad_norm": 1.607412576675415, + "learning_rate": 8.537366548042704e-05, + "loss": 1.1406, + "step": 6452 + }, + { + "epoch": 2.868, + "grad_norm": 1.7138793468475342, + "learning_rate": 8.5355871886121e-05, + "loss": 1.1559, + "step": 6453 + }, + { + "epoch": 2.8684444444444446, + "grad_norm": 2.1833138465881348, + "learning_rate": 8.533807829181495e-05, + "loss": 1.4804, + "step": 6454 + }, + { + "epoch": 2.868888888888889, + "grad_norm": 2.5808310508728027, + "learning_rate": 8.53202846975089e-05, + "loss": 2.0459, + "step": 6455 + }, + { + "epoch": 2.8693333333333335, + "grad_norm": 2.10170316696167, + "learning_rate": 8.530249110320285e-05, + "loss": 1.5515, + "step": 6456 + }, + { + "epoch": 2.8697777777777778, + "grad_norm": 2.5783615112304688, + "learning_rate": 8.528469750889681e-05, + "loss": 1.7325, + "step": 6457 + }, + { + "epoch": 2.870222222222222, + "grad_norm": 1.9656144380569458, + "learning_rate": 8.526690391459075e-05, + "loss": 1.6021, + "step": 6458 + }, + { + "epoch": 2.8706666666666667, + "grad_norm": 2.899205446243286, + "learning_rate": 8.52491103202847e-05, + "loss": 1.5858, + "step": 6459 + }, + { + "epoch": 2.871111111111111, + "grad_norm": 2.2554736137390137, + "learning_rate": 8.523131672597865e-05, + "loss": 1.8979, + "step": 6460 + }, + { + "epoch": 2.8715555555555556, + "grad_norm": 2.1474039554595947, + "learning_rate": 8.52135231316726e-05, + "loss": 1.6935, + "step": 6461 + }, + { + "epoch": 2.872, + "grad_norm": 2.697214365005493, + "learning_rate": 8.519572953736655e-05, + "loss": 1.746, + "step": 6462 + }, + { + "epoch": 2.8724444444444446, + "grad_norm": 2.4106836318969727, + "learning_rate": 8.517793594306051e-05, + "loss": 1.7322, + "step": 6463 + }, + { + "epoch": 2.872888888888889, + "grad_norm": 2.5296225547790527, + "learning_rate": 8.516014234875445e-05, + "loss": 1.6172, + "step": 6464 + }, + { + "epoch": 2.873333333333333, + "grad_norm": 2.625232696533203, + "learning_rate": 8.51423487544484e-05, + "loss": 1.9533, + "step": 6465 + }, + { + "epoch": 2.8737777777777778, + "grad_norm": 2.728658676147461, + "learning_rate": 8.512455516014235e-05, + "loss": 1.8563, + "step": 6466 + }, + { + "epoch": 2.8742222222222225, + "grad_norm": 2.2425432205200195, + "learning_rate": 8.510676156583631e-05, + "loss": 1.1704, + "step": 6467 + }, + { + "epoch": 2.8746666666666667, + "grad_norm": 1.9965929985046387, + "learning_rate": 8.508896797153025e-05, + "loss": 1.2467, + "step": 6468 + }, + { + "epoch": 2.875111111111111, + 
"grad_norm": 2.5652081966400146, + "learning_rate": 8.507117437722421e-05, + "loss": 1.9512, + "step": 6469 + }, + { + "epoch": 2.8755555555555556, + "grad_norm": 2.329965591430664, + "learning_rate": 8.505338078291816e-05, + "loss": 1.81, + "step": 6470 + }, + { + "epoch": 2.876, + "grad_norm": 3.135545015335083, + "learning_rate": 8.50355871886121e-05, + "loss": 1.9133, + "step": 6471 + }, + { + "epoch": 2.8764444444444446, + "grad_norm": 2.618896722793579, + "learning_rate": 8.501779359430605e-05, + "loss": 2.2505, + "step": 6472 + }, + { + "epoch": 2.876888888888889, + "grad_norm": 3.150949001312256, + "learning_rate": 8.5e-05, + "loss": 2.0441, + "step": 6473 + }, + { + "epoch": 2.8773333333333335, + "grad_norm": 2.217580556869507, + "learning_rate": 8.498220640569395e-05, + "loss": 1.4439, + "step": 6474 + }, + { + "epoch": 2.8777777777777778, + "grad_norm": 2.7368547916412354, + "learning_rate": 8.49644128113879e-05, + "loss": 2.0908, + "step": 6475 + }, + { + "epoch": 2.878222222222222, + "grad_norm": 2.5103628635406494, + "learning_rate": 8.494661921708186e-05, + "loss": 1.9196, + "step": 6476 + }, + { + "epoch": 2.8786666666666667, + "grad_norm": 2.70851469039917, + "learning_rate": 8.49288256227758e-05, + "loss": 1.9561, + "step": 6477 + }, + { + "epoch": 2.879111111111111, + "grad_norm": 2.451458692550659, + "learning_rate": 8.491103202846975e-05, + "loss": 1.6853, + "step": 6478 + }, + { + "epoch": 2.8795555555555556, + "grad_norm": 3.1393275260925293, + "learning_rate": 8.48932384341637e-05, + "loss": 1.797, + "step": 6479 + }, + { + "epoch": 2.88, + "grad_norm": 2.9535908699035645, + "learning_rate": 8.487544483985765e-05, + "loss": 2.0018, + "step": 6480 + }, + { + "epoch": 2.8804444444444446, + "grad_norm": 2.7212975025177, + "learning_rate": 8.48576512455516e-05, + "loss": 1.7088, + "step": 6481 + }, + { + "epoch": 2.880888888888889, + "grad_norm": 2.8284823894500732, + "learning_rate": 8.483985765124556e-05, + "loss": 1.9613, + "step": 6482 + }, + { + "epoch": 2.881333333333333, + "grad_norm": 3.322567939758301, + "learning_rate": 8.48220640569395e-05, + "loss": 1.8317, + "step": 6483 + }, + { + "epoch": 2.8817777777777778, + "grad_norm": 2.870798349380493, + "learning_rate": 8.480427046263345e-05, + "loss": 1.6794, + "step": 6484 + }, + { + "epoch": 2.8822222222222225, + "grad_norm": 2.9603869915008545, + "learning_rate": 8.47864768683274e-05, + "loss": 1.9088, + "step": 6485 + }, + { + "epoch": 2.8826666666666667, + "grad_norm": 3.2760636806488037, + "learning_rate": 8.476868327402136e-05, + "loss": 1.3813, + "step": 6486 + }, + { + "epoch": 2.883111111111111, + "grad_norm": 2.696504831314087, + "learning_rate": 8.47508896797153e-05, + "loss": 1.692, + "step": 6487 + }, + { + "epoch": 2.8835555555555556, + "grad_norm": 3.375922679901123, + "learning_rate": 8.473309608540926e-05, + "loss": 1.5721, + "step": 6488 + }, + { + "epoch": 2.884, + "grad_norm": 2.901848316192627, + "learning_rate": 8.471530249110322e-05, + "loss": 1.589, + "step": 6489 + }, + { + "epoch": 2.8844444444444446, + "grad_norm": 2.677187919616699, + "learning_rate": 8.469750889679716e-05, + "loss": 2.0427, + "step": 6490 + }, + { + "epoch": 2.884888888888889, + "grad_norm": 3.2531144618988037, + "learning_rate": 8.46797153024911e-05, + "loss": 1.8249, + "step": 6491 + }, + { + "epoch": 2.8853333333333335, + "grad_norm": 3.3353047370910645, + "learning_rate": 8.466192170818506e-05, + "loss": 1.3361, + "step": 6492 + }, + { + "epoch": 2.8857777777777778, + "grad_norm": 2.4956469535827637, + 
"learning_rate": 8.4644128113879e-05, + "loss": 1.3096, + "step": 6493 + }, + { + "epoch": 2.886222222222222, + "grad_norm": 3.014362096786499, + "learning_rate": 8.462633451957296e-05, + "loss": 1.5515, + "step": 6494 + }, + { + "epoch": 2.8866666666666667, + "grad_norm": 2.6748619079589844, + "learning_rate": 8.460854092526691e-05, + "loss": 1.5532, + "step": 6495 + }, + { + "epoch": 2.887111111111111, + "grad_norm": 3.3072657585144043, + "learning_rate": 8.459074733096086e-05, + "loss": 1.744, + "step": 6496 + }, + { + "epoch": 2.8875555555555557, + "grad_norm": 3.1664316654205322, + "learning_rate": 8.45729537366548e-05, + "loss": 1.5507, + "step": 6497 + }, + { + "epoch": 2.888, + "grad_norm": 3.5255908966064453, + "learning_rate": 8.455516014234876e-05, + "loss": 2.1625, + "step": 6498 + }, + { + "epoch": 2.8884444444444446, + "grad_norm": 0.49165022373199463, + "learning_rate": 8.453736654804271e-05, + "loss": 0.0612, + "step": 6499 + }, + { + "epoch": 2.888888888888889, + "grad_norm": 4.066702365875244, + "learning_rate": 8.451957295373666e-05, + "loss": 1.5139, + "step": 6500 + }, + { + "epoch": 2.889333333333333, + "grad_norm": 1.581194519996643, + "learning_rate": 8.450177935943061e-05, + "loss": 1.1063, + "step": 6501 + }, + { + "epoch": 2.889777777777778, + "grad_norm": 1.7735909223556519, + "learning_rate": 8.448398576512457e-05, + "loss": 1.9774, + "step": 6502 + }, + { + "epoch": 2.8902222222222225, + "grad_norm": 1.8580666780471802, + "learning_rate": 8.446619217081851e-05, + "loss": 2.2711, + "step": 6503 + }, + { + "epoch": 2.8906666666666667, + "grad_norm": 2.2903764247894287, + "learning_rate": 8.444839857651245e-05, + "loss": 2.2313, + "step": 6504 + }, + { + "epoch": 2.891111111111111, + "grad_norm": 2.7461414337158203, + "learning_rate": 8.443060498220641e-05, + "loss": 2.6372, + "step": 6505 + }, + { + "epoch": 2.8915555555555557, + "grad_norm": 2.5035669803619385, + "learning_rate": 8.441281138790035e-05, + "loss": 1.8447, + "step": 6506 + }, + { + "epoch": 2.892, + "grad_norm": 2.2642126083374023, + "learning_rate": 8.439501779359431e-05, + "loss": 2.3071, + "step": 6507 + }, + { + "epoch": 2.8924444444444446, + "grad_norm": 2.4640629291534424, + "learning_rate": 8.437722419928827e-05, + "loss": 2.0594, + "step": 6508 + }, + { + "epoch": 2.892888888888889, + "grad_norm": 2.5078279972076416, + "learning_rate": 8.435943060498221e-05, + "loss": 2.0226, + "step": 6509 + }, + { + "epoch": 2.8933333333333335, + "grad_norm": 2.1550204753875732, + "learning_rate": 8.434163701067615e-05, + "loss": 1.803, + "step": 6510 + }, + { + "epoch": 2.893777777777778, + "grad_norm": 2.478727340698242, + "learning_rate": 8.432384341637011e-05, + "loss": 1.9638, + "step": 6511 + }, + { + "epoch": 2.894222222222222, + "grad_norm": 2.432596206665039, + "learning_rate": 8.430604982206407e-05, + "loss": 1.8639, + "step": 6512 + }, + { + "epoch": 2.8946666666666667, + "grad_norm": 2.1648998260498047, + "learning_rate": 8.428825622775801e-05, + "loss": 1.8943, + "step": 6513 + }, + { + "epoch": 2.895111111111111, + "grad_norm": 2.2655625343322754, + "learning_rate": 8.427046263345197e-05, + "loss": 1.4152, + "step": 6514 + }, + { + "epoch": 2.8955555555555557, + "grad_norm": 2.6261086463928223, + "learning_rate": 8.425266903914592e-05, + "loss": 1.9381, + "step": 6515 + }, + { + "epoch": 2.896, + "grad_norm": 2.0679445266723633, + "learning_rate": 8.423487544483986e-05, + "loss": 1.5273, + "step": 6516 + }, + { + "epoch": 2.8964444444444446, + "grad_norm": 2.227121591567993, + 
"learning_rate": 8.421708185053381e-05, + "loss": 1.5115, + "step": 6517 + }, + { + "epoch": 2.896888888888889, + "grad_norm": 2.8133530616760254, + "learning_rate": 8.419928825622776e-05, + "loss": 2.1005, + "step": 6518 + }, + { + "epoch": 2.897333333333333, + "grad_norm": 2.5659241676330566, + "learning_rate": 8.418149466192171e-05, + "loss": 1.6425, + "step": 6519 + }, + { + "epoch": 2.897777777777778, + "grad_norm": 3.140711784362793, + "learning_rate": 8.416370106761566e-05, + "loss": 1.7234, + "step": 6520 + }, + { + "epoch": 2.8982222222222225, + "grad_norm": 2.281583786010742, + "learning_rate": 8.414590747330962e-05, + "loss": 1.4164, + "step": 6521 + }, + { + "epoch": 2.8986666666666667, + "grad_norm": 2.2396039962768555, + "learning_rate": 8.412811387900356e-05, + "loss": 1.6061, + "step": 6522 + }, + { + "epoch": 2.899111111111111, + "grad_norm": 2.5794193744659424, + "learning_rate": 8.41103202846975e-05, + "loss": 1.8131, + "step": 6523 + }, + { + "epoch": 2.8995555555555557, + "grad_norm": 2.583009958267212, + "learning_rate": 8.409252669039146e-05, + "loss": 1.6304, + "step": 6524 + }, + { + "epoch": 2.9, + "grad_norm": 2.1925716400146484, + "learning_rate": 8.40747330960854e-05, + "loss": 1.4056, + "step": 6525 + }, + { + "epoch": 2.9004444444444446, + "grad_norm": 2.4621171951293945, + "learning_rate": 8.405693950177936e-05, + "loss": 1.5926, + "step": 6526 + }, + { + "epoch": 2.900888888888889, + "grad_norm": 2.397620439529419, + "learning_rate": 8.403914590747332e-05, + "loss": 1.5116, + "step": 6527 + }, + { + "epoch": 2.9013333333333335, + "grad_norm": 2.841352939605713, + "learning_rate": 8.402135231316726e-05, + "loss": 1.5999, + "step": 6528 + }, + { + "epoch": 2.901777777777778, + "grad_norm": 3.072946548461914, + "learning_rate": 8.400355871886122e-05, + "loss": 2.002, + "step": 6529 + }, + { + "epoch": 2.902222222222222, + "grad_norm": 2.5697319507598877, + "learning_rate": 8.398576512455516e-05, + "loss": 1.8486, + "step": 6530 + }, + { + "epoch": 2.9026666666666667, + "grad_norm": 3.085151195526123, + "learning_rate": 8.396797153024912e-05, + "loss": 1.6757, + "step": 6531 + }, + { + "epoch": 2.903111111111111, + "grad_norm": 2.6995904445648193, + "learning_rate": 8.395017793594306e-05, + "loss": 1.4552, + "step": 6532 + }, + { + "epoch": 2.9035555555555557, + "grad_norm": 3.3730642795562744, + "learning_rate": 8.393238434163702e-05, + "loss": 1.8132, + "step": 6533 + }, + { + "epoch": 2.904, + "grad_norm": 2.572873115539551, + "learning_rate": 8.391459074733097e-05, + "loss": 1.8587, + "step": 6534 + }, + { + "epoch": 2.9044444444444446, + "grad_norm": 3.1449742317199707, + "learning_rate": 8.389679715302492e-05, + "loss": 1.3689, + "step": 6535 + }, + { + "epoch": 2.904888888888889, + "grad_norm": 2.7788970470428467, + "learning_rate": 8.387900355871886e-05, + "loss": 1.4992, + "step": 6536 + }, + { + "epoch": 2.905333333333333, + "grad_norm": 2.823796510696411, + "learning_rate": 8.386120996441282e-05, + "loss": 1.9288, + "step": 6537 + }, + { + "epoch": 2.905777777777778, + "grad_norm": 3.0071866512298584, + "learning_rate": 8.384341637010676e-05, + "loss": 2.0833, + "step": 6538 + }, + { + "epoch": 2.9062222222222225, + "grad_norm": 2.019472360610962, + "learning_rate": 8.382562277580071e-05, + "loss": 0.9878, + "step": 6539 + }, + { + "epoch": 2.9066666666666667, + "grad_norm": 2.4632411003112793, + "learning_rate": 8.380782918149467e-05, + "loss": 1.6176, + "step": 6540 + }, + { + "epoch": 2.907111111111111, + "grad_norm": 3.062143564224243, + 
"learning_rate": 8.379003558718861e-05, + "loss": 2.1748, + "step": 6541 + }, + { + "epoch": 2.9075555555555557, + "grad_norm": 4.060853958129883, + "learning_rate": 8.377224199288256e-05, + "loss": 1.9903, + "step": 6542 + }, + { + "epoch": 2.908, + "grad_norm": 3.1725289821624756, + "learning_rate": 8.375444839857651e-05, + "loss": 1.8678, + "step": 6543 + }, + { + "epoch": 2.9084444444444446, + "grad_norm": 3.047879934310913, + "learning_rate": 8.373665480427047e-05, + "loss": 1.6334, + "step": 6544 + }, + { + "epoch": 2.908888888888889, + "grad_norm": 3.0653295516967773, + "learning_rate": 8.371886120996441e-05, + "loss": 1.8163, + "step": 6545 + }, + { + "epoch": 2.9093333333333335, + "grad_norm": 3.4780588150024414, + "learning_rate": 8.370106761565837e-05, + "loss": 1.6996, + "step": 6546 + }, + { + "epoch": 2.909777777777778, + "grad_norm": 3.106790781021118, + "learning_rate": 8.368327402135233e-05, + "loss": 1.5692, + "step": 6547 + }, + { + "epoch": 2.910222222222222, + "grad_norm": 3.5817697048187256, + "learning_rate": 8.366548042704627e-05, + "loss": 2.3844, + "step": 6548 + }, + { + "epoch": 2.9106666666666667, + "grad_norm": 3.6129720211029053, + "learning_rate": 8.364768683274021e-05, + "loss": 1.8793, + "step": 6549 + }, + { + "epoch": 2.911111111111111, + "grad_norm": 3.0888235569000244, + "learning_rate": 8.362989323843417e-05, + "loss": 0.7461, + "step": 6550 + }, + { + "epoch": 2.9115555555555557, + "grad_norm": 1.7484018802642822, + "learning_rate": 8.361209964412811e-05, + "loss": 1.9615, + "step": 6551 + }, + { + "epoch": 2.912, + "grad_norm": 2.0905039310455322, + "learning_rate": 8.359430604982207e-05, + "loss": 2.2685, + "step": 6552 + }, + { + "epoch": 2.9124444444444446, + "grad_norm": 2.227344512939453, + "learning_rate": 8.357651245551602e-05, + "loss": 1.8803, + "step": 6553 + }, + { + "epoch": 2.912888888888889, + "grad_norm": 2.469000816345215, + "learning_rate": 8.355871886120997e-05, + "loss": 2.3178, + "step": 6554 + }, + { + "epoch": 2.913333333333333, + "grad_norm": 2.261646270751953, + "learning_rate": 8.354092526690391e-05, + "loss": 2.118, + "step": 6555 + }, + { + "epoch": 2.913777777777778, + "grad_norm": 2.2827672958374023, + "learning_rate": 8.352313167259787e-05, + "loss": 1.5105, + "step": 6556 + }, + { + "epoch": 2.9142222222222225, + "grad_norm": 2.4409070014953613, + "learning_rate": 8.350533807829182e-05, + "loss": 2.1229, + "step": 6557 + }, + { + "epoch": 2.9146666666666667, + "grad_norm": 1.7336750030517578, + "learning_rate": 8.348754448398577e-05, + "loss": 0.9735, + "step": 6558 + }, + { + "epoch": 2.915111111111111, + "grad_norm": 2.462920904159546, + "learning_rate": 8.346975088967972e-05, + "loss": 1.6474, + "step": 6559 + }, + { + "epoch": 2.9155555555555557, + "grad_norm": 2.5200788974761963, + "learning_rate": 8.345195729537368e-05, + "loss": 1.8454, + "step": 6560 + }, + { + "epoch": 2.916, + "grad_norm": 2.2653720378875732, + "learning_rate": 8.343416370106762e-05, + "loss": 1.7404, + "step": 6561 + }, + { + "epoch": 2.916444444444444, + "grad_norm": 2.323324680328369, + "learning_rate": 8.341637010676157e-05, + "loss": 1.7726, + "step": 6562 + }, + { + "epoch": 2.916888888888889, + "grad_norm": 2.575195074081421, + "learning_rate": 8.339857651245552e-05, + "loss": 2.0535, + "step": 6563 + }, + { + "epoch": 2.9173333333333336, + "grad_norm": 2.4426960945129395, + "learning_rate": 8.338078291814946e-05, + "loss": 1.8601, + "step": 6564 + }, + { + "epoch": 2.917777777777778, + "grad_norm": 2.3300108909606934, + 
"learning_rate": 8.336298932384342e-05, + "loss": 1.7842, + "step": 6565 + }, + { + "epoch": 2.918222222222222, + "grad_norm": 2.5958733558654785, + "learning_rate": 8.334519572953738e-05, + "loss": 1.9596, + "step": 6566 + }, + { + "epoch": 2.9186666666666667, + "grad_norm": 2.2606828212738037, + "learning_rate": 8.332740213523132e-05, + "loss": 1.2543, + "step": 6567 + }, + { + "epoch": 2.919111111111111, + "grad_norm": 2.573596954345703, + "learning_rate": 8.330960854092526e-05, + "loss": 1.6507, + "step": 6568 + }, + { + "epoch": 2.9195555555555557, + "grad_norm": 1.9967552423477173, + "learning_rate": 8.329181494661922e-05, + "loss": 1.3919, + "step": 6569 + }, + { + "epoch": 2.92, + "grad_norm": 2.615650177001953, + "learning_rate": 8.327402135231316e-05, + "loss": 1.8423, + "step": 6570 + }, + { + "epoch": 2.9204444444444446, + "grad_norm": 2.7236530780792236, + "learning_rate": 8.325622775800712e-05, + "loss": 2.3231, + "step": 6571 + }, + { + "epoch": 2.920888888888889, + "grad_norm": 2.5713248252868652, + "learning_rate": 8.323843416370108e-05, + "loss": 1.869, + "step": 6572 + }, + { + "epoch": 2.921333333333333, + "grad_norm": 2.9785003662109375, + "learning_rate": 8.322064056939502e-05, + "loss": 2.2974, + "step": 6573 + }, + { + "epoch": 2.921777777777778, + "grad_norm": 2.4110934734344482, + "learning_rate": 8.320284697508898e-05, + "loss": 1.2225, + "step": 6574 + }, + { + "epoch": 2.9222222222222225, + "grad_norm": 2.372670888900757, + "learning_rate": 8.318505338078292e-05, + "loss": 1.2217, + "step": 6575 + }, + { + "epoch": 2.9226666666666667, + "grad_norm": 2.6958694458007812, + "learning_rate": 8.316725978647687e-05, + "loss": 1.913, + "step": 6576 + }, + { + "epoch": 2.923111111111111, + "grad_norm": 2.6148412227630615, + "learning_rate": 8.314946619217082e-05, + "loss": 1.4505, + "step": 6577 + }, + { + "epoch": 2.9235555555555557, + "grad_norm": 2.7632627487182617, + "learning_rate": 8.313167259786477e-05, + "loss": 2.0089, + "step": 6578 + }, + { + "epoch": 2.924, + "grad_norm": 2.448899984359741, + "learning_rate": 8.311387900355873e-05, + "loss": 1.7222, + "step": 6579 + }, + { + "epoch": 2.924444444444444, + "grad_norm": 1.7573479413986206, + "learning_rate": 8.309608540925267e-05, + "loss": 0.8537, + "step": 6580 + }, + { + "epoch": 2.924888888888889, + "grad_norm": 2.8462624549865723, + "learning_rate": 8.307829181494662e-05, + "loss": 2.0423, + "step": 6581 + }, + { + "epoch": 2.9253333333333336, + "grad_norm": 1.8303264379501343, + "learning_rate": 8.306049822064057e-05, + "loss": 0.9714, + "step": 6582 + }, + { + "epoch": 2.925777777777778, + "grad_norm": 2.0662248134613037, + "learning_rate": 8.304270462633452e-05, + "loss": 0.9561, + "step": 6583 + }, + { + "epoch": 2.926222222222222, + "grad_norm": 2.5108532905578613, + "learning_rate": 8.302491103202847e-05, + "loss": 1.4271, + "step": 6584 + }, + { + "epoch": 2.9266666666666667, + "grad_norm": 2.5698022842407227, + "learning_rate": 8.300711743772243e-05, + "loss": 1.4, + "step": 6585 + }, + { + "epoch": 2.927111111111111, + "grad_norm": 2.503103494644165, + "learning_rate": 8.298932384341637e-05, + "loss": 1.2983, + "step": 6586 + }, + { + "epoch": 2.9275555555555557, + "grad_norm": 2.902477264404297, + "learning_rate": 8.297153024911033e-05, + "loss": 1.7343, + "step": 6587 + }, + { + "epoch": 2.928, + "grad_norm": 2.991421937942505, + "learning_rate": 8.295373665480427e-05, + "loss": 1.6752, + "step": 6588 + }, + { + "epoch": 2.9284444444444446, + "grad_norm": 2.4898111820220947, + "learning_rate": 
8.293594306049823e-05, + "loss": 1.7385, + "step": 6589 + }, + { + "epoch": 2.928888888888889, + "grad_norm": 3.5942859649658203, + "learning_rate": 8.291814946619217e-05, + "loss": 2.0857, + "step": 6590 + }, + { + "epoch": 2.929333333333333, + "grad_norm": 1.9334756135940552, + "learning_rate": 8.290035587188613e-05, + "loss": 0.77, + "step": 6591 + }, + { + "epoch": 2.929777777777778, + "grad_norm": 3.102285385131836, + "learning_rate": 8.288256227758008e-05, + "loss": 1.8683, + "step": 6592 + }, + { + "epoch": 2.930222222222222, + "grad_norm": 2.8912463188171387, + "learning_rate": 8.286476868327403e-05, + "loss": 1.7117, + "step": 6593 + }, + { + "epoch": 2.9306666666666668, + "grad_norm": 3.238525152206421, + "learning_rate": 8.284697508896797e-05, + "loss": 2.01, + "step": 6594 + }, + { + "epoch": 2.931111111111111, + "grad_norm": 2.9973912239074707, + "learning_rate": 8.282918149466193e-05, + "loss": 1.7764, + "step": 6595 + }, + { + "epoch": 2.9315555555555557, + "grad_norm": 3.1456546783447266, + "learning_rate": 8.281138790035587e-05, + "loss": 2.0722, + "step": 6596 + }, + { + "epoch": 2.932, + "grad_norm": 2.999476671218872, + "learning_rate": 8.279359430604983e-05, + "loss": 1.7548, + "step": 6597 + }, + { + "epoch": 2.932444444444444, + "grad_norm": 3.8145954608917236, + "learning_rate": 8.277580071174378e-05, + "loss": 1.6423, + "step": 6598 + }, + { + "epoch": 2.932888888888889, + "grad_norm": 5.341122150421143, + "learning_rate": 8.275800711743773e-05, + "loss": 1.1883, + "step": 6599 + }, + { + "epoch": 2.9333333333333336, + "grad_norm": 3.1695616245269775, + "learning_rate": 8.274021352313167e-05, + "loss": 0.7776, + "step": 6600 + }, + { + "epoch": 2.933777777777778, + "grad_norm": 2.0281214714050293, + "learning_rate": 8.272241992882562e-05, + "loss": 1.7799, + "step": 6601 + }, + { + "epoch": 2.934222222222222, + "grad_norm": 2.2131547927856445, + "learning_rate": 8.270462633451958e-05, + "loss": 1.9286, + "step": 6602 + }, + { + "epoch": 2.9346666666666668, + "grad_norm": 2.3528523445129395, + "learning_rate": 8.268683274021352e-05, + "loss": 1.5521, + "step": 6603 + }, + { + "epoch": 2.935111111111111, + "grad_norm": 2.04451584815979, + "learning_rate": 8.266903914590748e-05, + "loss": 1.7594, + "step": 6604 + }, + { + "epoch": 2.9355555555555557, + "grad_norm": 0.19427204132080078, + "learning_rate": 8.265124555160144e-05, + "loss": 0.0238, + "step": 6605 + }, + { + "epoch": 2.936, + "grad_norm": 2.289008855819702, + "learning_rate": 8.263345195729538e-05, + "loss": 1.6694, + "step": 6606 + }, + { + "epoch": 2.9364444444444446, + "grad_norm": 3.0516107082366943, + "learning_rate": 8.261565836298932e-05, + "loss": 1.9453, + "step": 6607 + }, + { + "epoch": 2.936888888888889, + "grad_norm": 2.20947265625, + "learning_rate": 8.259786476868328e-05, + "loss": 1.7946, + "step": 6608 + }, + { + "epoch": 2.937333333333333, + "grad_norm": 2.571842670440674, + "learning_rate": 8.258007117437722e-05, + "loss": 1.6298, + "step": 6609 + }, + { + "epoch": 2.937777777777778, + "grad_norm": 2.839123249053955, + "learning_rate": 8.256227758007118e-05, + "loss": 1.9534, + "step": 6610 + }, + { + "epoch": 2.938222222222222, + "grad_norm": 2.8716540336608887, + "learning_rate": 8.254448398576514e-05, + "loss": 1.8971, + "step": 6611 + }, + { + "epoch": 2.9386666666666668, + "grad_norm": 2.287142276763916, + "learning_rate": 8.252669039145908e-05, + "loss": 1.73, + "step": 6612 + }, + { + "epoch": 2.939111111111111, + "grad_norm": 2.8752803802490234, + "learning_rate": 
8.250889679715302e-05, + "loss": 1.7428, + "step": 6613 + }, + { + "epoch": 2.9395555555555557, + "grad_norm": 2.5958893299102783, + "learning_rate": 8.249110320284698e-05, + "loss": 1.9177, + "step": 6614 + }, + { + "epoch": 2.94, + "grad_norm": 2.100194215774536, + "learning_rate": 8.247330960854092e-05, + "loss": 1.4296, + "step": 6615 + }, + { + "epoch": 2.940444444444444, + "grad_norm": 2.6018195152282715, + "learning_rate": 8.245551601423488e-05, + "loss": 1.7576, + "step": 6616 + }, + { + "epoch": 2.940888888888889, + "grad_norm": 2.3454983234405518, + "learning_rate": 8.243772241992883e-05, + "loss": 1.5927, + "step": 6617 + }, + { + "epoch": 2.9413333333333336, + "grad_norm": 2.5776448249816895, + "learning_rate": 8.241992882562278e-05, + "loss": 1.7602, + "step": 6618 + }, + { + "epoch": 2.941777777777778, + "grad_norm": 2.55011248588562, + "learning_rate": 8.240213523131673e-05, + "loss": 1.4901, + "step": 6619 + }, + { + "epoch": 2.942222222222222, + "grad_norm": 2.298241376876831, + "learning_rate": 8.238434163701068e-05, + "loss": 1.5664, + "step": 6620 + }, + { + "epoch": 2.9426666666666668, + "grad_norm": 2.403535842895508, + "learning_rate": 8.236654804270463e-05, + "loss": 1.7368, + "step": 6621 + }, + { + "epoch": 2.943111111111111, + "grad_norm": 2.3302366733551025, + "learning_rate": 8.234875444839858e-05, + "loss": 1.5256, + "step": 6622 + }, + { + "epoch": 2.9435555555555557, + "grad_norm": 2.4017152786254883, + "learning_rate": 8.233096085409253e-05, + "loss": 1.9532, + "step": 6623 + }, + { + "epoch": 2.944, + "grad_norm": 1.9871573448181152, + "learning_rate": 8.231316725978649e-05, + "loss": 1.2034, + "step": 6624 + }, + { + "epoch": 2.9444444444444446, + "grad_norm": 2.877697229385376, + "learning_rate": 8.229537366548043e-05, + "loss": 1.7799, + "step": 6625 + }, + { + "epoch": 2.944888888888889, + "grad_norm": 2.7882204055786133, + "learning_rate": 8.227758007117437e-05, + "loss": 1.8585, + "step": 6626 + }, + { + "epoch": 2.945333333333333, + "grad_norm": 3.066232442855835, + "learning_rate": 8.225978647686833e-05, + "loss": 1.9722, + "step": 6627 + }, + { + "epoch": 2.945777777777778, + "grad_norm": 2.9001948833465576, + "learning_rate": 8.224199288256227e-05, + "loss": 2.0303, + "step": 6628 + }, + { + "epoch": 2.946222222222222, + "grad_norm": 3.1734731197357178, + "learning_rate": 8.222419928825623e-05, + "loss": 1.8917, + "step": 6629 + }, + { + "epoch": 2.9466666666666668, + "grad_norm": 1.936790108680725, + "learning_rate": 8.220640569395019e-05, + "loss": 0.9073, + "step": 6630 + }, + { + "epoch": 2.947111111111111, + "grad_norm": 2.223129987716675, + "learning_rate": 8.218861209964413e-05, + "loss": 0.9589, + "step": 6631 + }, + { + "epoch": 2.9475555555555557, + "grad_norm": 1.784337043762207, + "learning_rate": 8.217081850533809e-05, + "loss": 0.7341, + "step": 6632 + }, + { + "epoch": 2.948, + "grad_norm": 2.703338146209717, + "learning_rate": 8.215302491103203e-05, + "loss": 1.5117, + "step": 6633 + }, + { + "epoch": 2.948444444444444, + "grad_norm": 2.924023389816284, + "learning_rate": 8.213523131672599e-05, + "loss": 1.8323, + "step": 6634 + }, + { + "epoch": 2.948888888888889, + "grad_norm": 3.084257125854492, + "learning_rate": 8.211743772241993e-05, + "loss": 1.7585, + "step": 6635 + }, + { + "epoch": 2.9493333333333336, + "grad_norm": 2.6331820487976074, + "learning_rate": 8.209964412811389e-05, + "loss": 1.5509, + "step": 6636 + }, + { + "epoch": 2.949777777777778, + "grad_norm": 2.6916754245758057, + "learning_rate": 
8.208185053380784e-05, + "loss": 1.6519, + "step": 6637 + }, + { + "epoch": 2.950222222222222, + "grad_norm": 3.28885555267334, + "learning_rate": 8.206405693950178e-05, + "loss": 1.9601, + "step": 6638 + }, + { + "epoch": 2.9506666666666668, + "grad_norm": 3.135986328125, + "learning_rate": 8.204626334519573e-05, + "loss": 1.9278, + "step": 6639 + }, + { + "epoch": 2.951111111111111, + "grad_norm": 2.845036745071411, + "learning_rate": 8.202846975088968e-05, + "loss": 1.611, + "step": 6640 + }, + { + "epoch": 2.9515555555555557, + "grad_norm": 2.581521987915039, + "learning_rate": 8.201067615658363e-05, + "loss": 1.4946, + "step": 6641 + }, + { + "epoch": 2.952, + "grad_norm": 2.706934928894043, + "learning_rate": 8.199288256227758e-05, + "loss": 1.6828, + "step": 6642 + }, + { + "epoch": 2.9524444444444446, + "grad_norm": 3.34147572517395, + "learning_rate": 8.197508896797154e-05, + "loss": 2.12, + "step": 6643 + }, + { + "epoch": 2.952888888888889, + "grad_norm": 3.1140594482421875, + "learning_rate": 8.195729537366548e-05, + "loss": 2.0541, + "step": 6644 + }, + { + "epoch": 2.953333333333333, + "grad_norm": 3.713144540786743, + "learning_rate": 8.193950177935944e-05, + "loss": 2.1426, + "step": 6645 + }, + { + "epoch": 2.953777777777778, + "grad_norm": 3.481236219406128, + "learning_rate": 8.192170818505338e-05, + "loss": 1.4833, + "step": 6646 + }, + { + "epoch": 2.954222222222222, + "grad_norm": 3.1434216499328613, + "learning_rate": 8.190391459074734e-05, + "loss": 1.8463, + "step": 6647 + }, + { + "epoch": 2.9546666666666668, + "grad_norm": 3.119110107421875, + "learning_rate": 8.188612099644128e-05, + "loss": 1.4728, + "step": 6648 + }, + { + "epoch": 2.955111111111111, + "grad_norm": 0.43881502747535706, + "learning_rate": 8.186832740213524e-05, + "loss": 0.0622, + "step": 6649 + }, + { + "epoch": 2.9555555555555557, + "grad_norm": 3.7765581607818604, + "learning_rate": 8.18505338078292e-05, + "loss": 1.5129, + "step": 6650 + }, + { + "epoch": 2.956, + "grad_norm": 2.1686058044433594, + "learning_rate": 8.183274021352314e-05, + "loss": 2.1518, + "step": 6651 + }, + { + "epoch": 2.956444444444444, + "grad_norm": 2.1492674350738525, + "learning_rate": 8.181494661921708e-05, + "loss": 1.6543, + "step": 6652 + }, + { + "epoch": 2.956888888888889, + "grad_norm": 2.0771119594573975, + "learning_rate": 8.179715302491104e-05, + "loss": 2.1437, + "step": 6653 + }, + { + "epoch": 2.9573333333333336, + "grad_norm": 2.154829978942871, + "learning_rate": 8.177935943060498e-05, + "loss": 2.0731, + "step": 6654 + }, + { + "epoch": 2.957777777777778, + "grad_norm": 2.0446035861968994, + "learning_rate": 8.176156583629894e-05, + "loss": 2.1788, + "step": 6655 + }, + { + "epoch": 2.958222222222222, + "grad_norm": 2.4353065490722656, + "learning_rate": 8.174377224199289e-05, + "loss": 1.9073, + "step": 6656 + }, + { + "epoch": 2.958666666666667, + "grad_norm": 2.2374634742736816, + "learning_rate": 8.172597864768684e-05, + "loss": 1.9468, + "step": 6657 + }, + { + "epoch": 2.959111111111111, + "grad_norm": 1.9991698265075684, + "learning_rate": 8.170818505338078e-05, + "loss": 2.0529, + "step": 6658 + }, + { + "epoch": 2.9595555555555557, + "grad_norm": 2.3634281158447266, + "learning_rate": 8.169039145907474e-05, + "loss": 2.2038, + "step": 6659 + }, + { + "epoch": 2.96, + "grad_norm": 2.3970882892608643, + "learning_rate": 8.167259786476868e-05, + "loss": 2.1983, + "step": 6660 + }, + { + "epoch": 2.9604444444444447, + "grad_norm": 2.5298354625701904, + "learning_rate": 8.165480427046264e-05, + 
"loss": 2.1612, + "step": 6661 + }, + { + "epoch": 2.960888888888889, + "grad_norm": 2.4006104469299316, + "learning_rate": 8.163701067615659e-05, + "loss": 2.0669, + "step": 6662 + }, + { + "epoch": 2.961333333333333, + "grad_norm": 2.5116260051727295, + "learning_rate": 8.161921708185053e-05, + "loss": 1.9285, + "step": 6663 + }, + { + "epoch": 2.961777777777778, + "grad_norm": 2.3565287590026855, + "learning_rate": 8.160142348754449e-05, + "loss": 1.9953, + "step": 6664 + }, + { + "epoch": 2.962222222222222, + "grad_norm": 2.253330945968628, + "learning_rate": 8.158362989323843e-05, + "loss": 1.5129, + "step": 6665 + }, + { + "epoch": 2.962666666666667, + "grad_norm": 2.49210524559021, + "learning_rate": 8.156583629893239e-05, + "loss": 1.9875, + "step": 6666 + }, + { + "epoch": 2.963111111111111, + "grad_norm": 2.353780508041382, + "learning_rate": 8.154804270462633e-05, + "loss": 1.471, + "step": 6667 + }, + { + "epoch": 2.9635555555555557, + "grad_norm": 2.679733991622925, + "learning_rate": 8.153024911032029e-05, + "loss": 1.8062, + "step": 6668 + }, + { + "epoch": 2.964, + "grad_norm": 2.1088638305664062, + "learning_rate": 8.151245551601425e-05, + "loss": 1.6653, + "step": 6669 + }, + { + "epoch": 2.964444444444444, + "grad_norm": 2.271273612976074, + "learning_rate": 8.149466192170819e-05, + "loss": 1.8267, + "step": 6670 + }, + { + "epoch": 2.964888888888889, + "grad_norm": 2.468289613723755, + "learning_rate": 8.147686832740213e-05, + "loss": 1.5659, + "step": 6671 + }, + { + "epoch": 2.9653333333333336, + "grad_norm": 2.2520124912261963, + "learning_rate": 8.145907473309609e-05, + "loss": 1.6409, + "step": 6672 + }, + { + "epoch": 2.965777777777778, + "grad_norm": 2.9224207401275635, + "learning_rate": 8.144128113879003e-05, + "loss": 1.7884, + "step": 6673 + }, + { + "epoch": 2.966222222222222, + "grad_norm": 2.306745767593384, + "learning_rate": 8.142348754448399e-05, + "loss": 1.8612, + "step": 6674 + }, + { + "epoch": 2.966666666666667, + "grad_norm": 2.707267999649048, + "learning_rate": 8.140569395017794e-05, + "loss": 1.6955, + "step": 6675 + }, + { + "epoch": 2.967111111111111, + "grad_norm": 2.6562561988830566, + "learning_rate": 8.138790035587189e-05, + "loss": 1.8765, + "step": 6676 + }, + { + "epoch": 2.9675555555555553, + "grad_norm": 2.344191789627075, + "learning_rate": 8.137010676156584e-05, + "loss": 1.3519, + "step": 6677 + }, + { + "epoch": 2.968, + "grad_norm": 2.481653928756714, + "learning_rate": 8.135231316725979e-05, + "loss": 1.8528, + "step": 6678 + }, + { + "epoch": 2.9684444444444447, + "grad_norm": 2.491582155227661, + "learning_rate": 8.133451957295374e-05, + "loss": 1.8404, + "step": 6679 + }, + { + "epoch": 2.968888888888889, + "grad_norm": 2.41845440864563, + "learning_rate": 8.131672597864769e-05, + "loss": 1.413, + "step": 6680 + }, + { + "epoch": 2.969333333333333, + "grad_norm": 2.9708704948425293, + "learning_rate": 8.129893238434164e-05, + "loss": 2.0273, + "step": 6681 + }, + { + "epoch": 2.969777777777778, + "grad_norm": 3.0216286182403564, + "learning_rate": 8.12811387900356e-05, + "loss": 2.0485, + "step": 6682 + }, + { + "epoch": 2.970222222222222, + "grad_norm": 2.9172136783599854, + "learning_rate": 8.126334519572954e-05, + "loss": 1.5057, + "step": 6683 + }, + { + "epoch": 2.970666666666667, + "grad_norm": 3.159728765487671, + "learning_rate": 8.124555160142349e-05, + "loss": 1.9017, + "step": 6684 + }, + { + "epoch": 2.971111111111111, + "grad_norm": 2.4506280422210693, + "learning_rate": 8.122775800711744e-05, + "loss": 1.4581, 
+ "step": 6685 + }, + { + "epoch": 2.9715555555555557, + "grad_norm": 2.9856271743774414, + "learning_rate": 8.120996441281138e-05, + "loss": 1.591, + "step": 6686 + }, + { + "epoch": 2.972, + "grad_norm": 3.172899007797241, + "learning_rate": 8.119217081850534e-05, + "loss": 1.7744, + "step": 6687 + }, + { + "epoch": 2.9724444444444442, + "grad_norm": 2.1155688762664795, + "learning_rate": 8.11743772241993e-05, + "loss": 1.1155, + "step": 6688 + }, + { + "epoch": 2.972888888888889, + "grad_norm": 2.4878904819488525, + "learning_rate": 8.115658362989324e-05, + "loss": 1.6012, + "step": 6689 + }, + { + "epoch": 2.9733333333333336, + "grad_norm": 2.819701671600342, + "learning_rate": 8.11387900355872e-05, + "loss": 1.6376, + "step": 6690 + }, + { + "epoch": 2.973777777777778, + "grad_norm": 3.1864094734191895, + "learning_rate": 8.112099644128114e-05, + "loss": 1.846, + "step": 6691 + }, + { + "epoch": 2.974222222222222, + "grad_norm": 3.0991835594177246, + "learning_rate": 8.11032028469751e-05, + "loss": 2.155, + "step": 6692 + }, + { + "epoch": 2.974666666666667, + "grad_norm": 2.6021008491516113, + "learning_rate": 8.108540925266904e-05, + "loss": 1.366, + "step": 6693 + }, + { + "epoch": 2.975111111111111, + "grad_norm": 3.0120420455932617, + "learning_rate": 8.1067615658363e-05, + "loss": 1.7095, + "step": 6694 + }, + { + "epoch": 2.9755555555555553, + "grad_norm": 3.54189395904541, + "learning_rate": 8.104982206405695e-05, + "loss": 1.5697, + "step": 6695 + }, + { + "epoch": 2.976, + "grad_norm": 2.7960903644561768, + "learning_rate": 8.10320284697509e-05, + "loss": 1.5675, + "step": 6696 + }, + { + "epoch": 2.9764444444444447, + "grad_norm": 3.8344061374664307, + "learning_rate": 8.101423487544484e-05, + "loss": 1.8995, + "step": 6697 + }, + { + "epoch": 2.976888888888889, + "grad_norm": 3.2357256412506104, + "learning_rate": 8.09964412811388e-05, + "loss": 1.6873, + "step": 6698 + }, + { + "epoch": 2.977333333333333, + "grad_norm": 4.216027736663818, + "learning_rate": 8.097864768683274e-05, + "loss": 1.9569, + "step": 6699 + }, + { + "epoch": 2.977777777777778, + "grad_norm": 4.249828338623047, + "learning_rate": 8.09608540925267e-05, + "loss": 1.9055, + "step": 6700 + }, + { + "epoch": 2.978222222222222, + "grad_norm": 2.1459178924560547, + "learning_rate": 8.094306049822065e-05, + "loss": 2.1627, + "step": 6701 + }, + { + "epoch": 2.978666666666667, + "grad_norm": 1.4327117204666138, + "learning_rate": 8.09252669039146e-05, + "loss": 1.133, + "step": 6702 + }, + { + "epoch": 2.979111111111111, + "grad_norm": 1.9758896827697754, + "learning_rate": 8.090747330960855e-05, + "loss": 2.7843, + "step": 6703 + }, + { + "epoch": 2.9795555555555557, + "grad_norm": 2.312091112136841, + "learning_rate": 8.08896797153025e-05, + "loss": 2.1245, + "step": 6704 + }, + { + "epoch": 2.98, + "grad_norm": 1.7319886684417725, + "learning_rate": 8.087188612099644e-05, + "loss": 0.4393, + "step": 6705 + }, + { + "epoch": 2.9804444444444442, + "grad_norm": 2.626613140106201, + "learning_rate": 8.085409252669039e-05, + "loss": 2.6569, + "step": 6706 + }, + { + "epoch": 2.980888888888889, + "grad_norm": 2.209878444671631, + "learning_rate": 8.083629893238435e-05, + "loss": 2.1272, + "step": 6707 + }, + { + "epoch": 2.981333333333333, + "grad_norm": 2.3179056644439697, + "learning_rate": 8.081850533807829e-05, + "loss": 1.7923, + "step": 6708 + }, + { + "epoch": 2.981777777777778, + "grad_norm": 2.408010959625244, + "learning_rate": 8.080071174377225e-05, + "loss": 1.7156, + "step": 6709 + }, + { + 
"epoch": 2.982222222222222, + "grad_norm": 3.3281619548797607, + "learning_rate": 8.078291814946619e-05, + "loss": 1.9335, + "step": 6710 + }, + { + "epoch": 2.982666666666667, + "grad_norm": 2.58201003074646, + "learning_rate": 8.076512455516015e-05, + "loss": 1.6612, + "step": 6711 + }, + { + "epoch": 2.983111111111111, + "grad_norm": 2.454719066619873, + "learning_rate": 8.074733096085409e-05, + "loss": 2.0895, + "step": 6712 + }, + { + "epoch": 2.9835555555555553, + "grad_norm": 2.9061319828033447, + "learning_rate": 8.072953736654805e-05, + "loss": 2.1351, + "step": 6713 + }, + { + "epoch": 2.984, + "grad_norm": 2.6297855377197266, + "learning_rate": 8.0711743772242e-05, + "loss": 1.849, + "step": 6714 + }, + { + "epoch": 2.9844444444444447, + "grad_norm": 2.431936264038086, + "learning_rate": 8.069395017793595e-05, + "loss": 1.809, + "step": 6715 + }, + { + "epoch": 2.984888888888889, + "grad_norm": 1.5601791143417358, + "learning_rate": 8.067615658362989e-05, + "loss": 0.7057, + "step": 6716 + }, + { + "epoch": 2.985333333333333, + "grad_norm": 2.3893916606903076, + "learning_rate": 8.065836298932385e-05, + "loss": 1.859, + "step": 6717 + }, + { + "epoch": 2.985777777777778, + "grad_norm": 2.8812737464904785, + "learning_rate": 8.064056939501779e-05, + "loss": 1.4955, + "step": 6718 + }, + { + "epoch": 2.986222222222222, + "grad_norm": 2.6631298065185547, + "learning_rate": 8.062277580071175e-05, + "loss": 1.8091, + "step": 6719 + }, + { + "epoch": 2.986666666666667, + "grad_norm": 2.2545714378356934, + "learning_rate": 8.06049822064057e-05, + "loss": 1.7845, + "step": 6720 + }, + { + "epoch": 2.987111111111111, + "grad_norm": 2.667125940322876, + "learning_rate": 8.058718861209965e-05, + "loss": 1.9743, + "step": 6721 + }, + { + "epoch": 2.9875555555555557, + "grad_norm": 2.8963820934295654, + "learning_rate": 8.05693950177936e-05, + "loss": 2.2487, + "step": 6722 + }, + { + "epoch": 2.988, + "grad_norm": 2.3376080989837646, + "learning_rate": 8.055160142348754e-05, + "loss": 1.7657, + "step": 6723 + }, + { + "epoch": 2.9884444444444442, + "grad_norm": 2.4739911556243896, + "learning_rate": 8.05338078291815e-05, + "loss": 1.4618, + "step": 6724 + }, + { + "epoch": 2.988888888888889, + "grad_norm": 2.836094856262207, + "learning_rate": 8.051601423487544e-05, + "loss": 1.8076, + "step": 6725 + }, + { + "epoch": 2.989333333333333, + "grad_norm": 2.611704111099243, + "learning_rate": 8.04982206405694e-05, + "loss": 1.8282, + "step": 6726 + }, + { + "epoch": 2.989777777777778, + "grad_norm": 2.016409158706665, + "learning_rate": 8.048042704626336e-05, + "loss": 1.579, + "step": 6727 + }, + { + "epoch": 2.990222222222222, + "grad_norm": 3.172990083694458, + "learning_rate": 8.04626334519573e-05, + "loss": 1.9213, + "step": 6728 + }, + { + "epoch": 2.990666666666667, + "grad_norm": 2.6530048847198486, + "learning_rate": 8.044483985765124e-05, + "loss": 1.3573, + "step": 6729 + }, + { + "epoch": 2.991111111111111, + "grad_norm": 2.725369453430176, + "learning_rate": 8.04270462633452e-05, + "loss": 1.7038, + "step": 6730 + }, + { + "epoch": 2.9915555555555553, + "grad_norm": 2.7324156761169434, + "learning_rate": 8.040925266903914e-05, + "loss": 1.9726, + "step": 6731 + }, + { + "epoch": 2.992, + "grad_norm": 2.893840789794922, + "learning_rate": 8.03914590747331e-05, + "loss": 1.5817, + "step": 6732 + }, + { + "epoch": 2.9924444444444447, + "grad_norm": 3.1419413089752197, + "learning_rate": 8.037366548042706e-05, + "loss": 1.5595, + "step": 6733 + }, + { + "epoch": 2.992888888888889, + 
"grad_norm": 2.4272377490997314, + "learning_rate": 8.0355871886121e-05, + "loss": 1.338, + "step": 6734 + }, + { + "epoch": 2.993333333333333, + "grad_norm": 2.944011688232422, + "learning_rate": 8.033807829181496e-05, + "loss": 1.7456, + "step": 6735 + }, + { + "epoch": 2.993777777777778, + "grad_norm": 2.8351387977600098, + "learning_rate": 8.03202846975089e-05, + "loss": 1.6632, + "step": 6736 + }, + { + "epoch": 2.994222222222222, + "grad_norm": 3.1518852710723877, + "learning_rate": 8.030249110320285e-05, + "loss": 0.9622, + "step": 6737 + }, + { + "epoch": 2.994666666666667, + "grad_norm": 2.9322409629821777, + "learning_rate": 8.02846975088968e-05, + "loss": 1.1134, + "step": 6738 + }, + { + "epoch": 2.995111111111111, + "grad_norm": 2.9083549976348877, + "learning_rate": 8.026690391459075e-05, + "loss": 1.7185, + "step": 6739 + }, + { + "epoch": 2.9955555555555557, + "grad_norm": 2.974486827850342, + "learning_rate": 8.024911032028471e-05, + "loss": 1.9998, + "step": 6740 + }, + { + "epoch": 2.996, + "grad_norm": 2.9648959636688232, + "learning_rate": 8.023131672597865e-05, + "loss": 1.7564, + "step": 6741 + }, + { + "epoch": 2.9964444444444442, + "grad_norm": 3.1087772846221924, + "learning_rate": 8.02135231316726e-05, + "loss": 1.4747, + "step": 6742 + }, + { + "epoch": 2.996888888888889, + "grad_norm": 3.156559944152832, + "learning_rate": 8.019572953736655e-05, + "loss": 1.6082, + "step": 6743 + }, + { + "epoch": 2.997333333333333, + "grad_norm": 2.8267040252685547, + "learning_rate": 8.01779359430605e-05, + "loss": 1.7786, + "step": 6744 + }, + { + "epoch": 2.997777777777778, + "grad_norm": 3.1161437034606934, + "learning_rate": 8.016014234875445e-05, + "loss": 1.8029, + "step": 6745 + }, + { + "epoch": 2.998222222222222, + "grad_norm": 2.9387221336364746, + "learning_rate": 8.014234875444841e-05, + "loss": 1.4219, + "step": 6746 + }, + { + "epoch": 2.998666666666667, + "grad_norm": 3.949208974838257, + "learning_rate": 8.012455516014235e-05, + "loss": 2.0522, + "step": 6747 + }, + { + "epoch": 2.999111111111111, + "grad_norm": 3.7779977321624756, + "learning_rate": 8.010676156583631e-05, + "loss": 1.7612, + "step": 6748 + }, + { + "epoch": 2.9995555555555553, + "grad_norm": 3.2595958709716797, + "learning_rate": 8.008896797153025e-05, + "loss": 1.5605, + "step": 6749 + }, + { + "epoch": 3.0, + "grad_norm": 3.8709285259246826, + "learning_rate": 8.00711743772242e-05, + "loss": 1.2562, + "step": 6750 + }, + { + "epoch": 3.0, + "eval_loss": 2.560321807861328, + "eval_runtime": 47.1336, + "eval_samples_per_second": 10.608, + "eval_steps_per_second": 10.608, + "step": 6750 + }, + { + "epoch": 3.0004444444444442, + "grad_norm": 1.5255669355392456, + "learning_rate": 8.005338078291815e-05, + "loss": 1.4736, + "step": 6751 + }, + { + "epoch": 3.000888888888889, + "grad_norm": 1.8357324600219727, + "learning_rate": 8.003558718861211e-05, + "loss": 1.8316, + "step": 6752 + }, + { + "epoch": 3.001333333333333, + "grad_norm": 2.2091808319091797, + "learning_rate": 8.001779359430605e-05, + "loss": 1.994, + "step": 6753 + }, + { + "epoch": 3.001777777777778, + "grad_norm": 2.113086462020874, + "learning_rate": 8e-05, + "loss": 2.0161, + "step": 6754 + }, + { + "epoch": 3.002222222222222, + "grad_norm": 2.0146021842956543, + "learning_rate": 7.998220640569395e-05, + "loss": 1.3508, + "step": 6755 + }, + { + "epoch": 3.002666666666667, + "grad_norm": 2.153592348098755, + "learning_rate": 7.99644128113879e-05, + "loss": 1.2604, + "step": 6756 + }, + { + "epoch": 3.003111111111111, + 
"grad_norm": 1.5062990188598633, + "learning_rate": 7.994661921708185e-05, + "loss": 1.0908, + "step": 6757 + }, + { + "epoch": 3.0035555555555558, + "grad_norm": 2.1184024810791016, + "learning_rate": 7.99288256227758e-05, + "loss": 1.2938, + "step": 6758 + }, + { + "epoch": 3.004, + "grad_norm": 2.0012667179107666, + "learning_rate": 7.991103202846976e-05, + "loss": 1.3528, + "step": 6759 + }, + { + "epoch": 3.0044444444444443, + "grad_norm": 2.275223731994629, + "learning_rate": 7.98932384341637e-05, + "loss": 1.2119, + "step": 6760 + }, + { + "epoch": 3.004888888888889, + "grad_norm": 2.465026617050171, + "learning_rate": 7.987544483985766e-05, + "loss": 1.4336, + "step": 6761 + }, + { + "epoch": 3.005333333333333, + "grad_norm": 2.5878005027770996, + "learning_rate": 7.98576512455516e-05, + "loss": 1.4849, + "step": 6762 + }, + { + "epoch": 3.005777777777778, + "grad_norm": 2.4724271297454834, + "learning_rate": 7.983985765124555e-05, + "loss": 1.033, + "step": 6763 + }, + { + "epoch": 3.006222222222222, + "grad_norm": 2.9463562965393066, + "learning_rate": 7.98220640569395e-05, + "loss": 1.4481, + "step": 6764 + }, + { + "epoch": 3.006666666666667, + "grad_norm": 3.1810200214385986, + "learning_rate": 7.980427046263346e-05, + "loss": 0.9846, + "step": 6765 + }, + { + "epoch": 3.007111111111111, + "grad_norm": 2.737637758255005, + "learning_rate": 7.97864768683274e-05, + "loss": 1.0059, + "step": 6766 + }, + { + "epoch": 3.0075555555555558, + "grad_norm": 2.819664239883423, + "learning_rate": 7.976868327402136e-05, + "loss": 1.331, + "step": 6767 + }, + { + "epoch": 3.008, + "grad_norm": 2.880838632583618, + "learning_rate": 7.97508896797153e-05, + "loss": 1.6169, + "step": 6768 + }, + { + "epoch": 3.0084444444444443, + "grad_norm": 2.998161792755127, + "learning_rate": 7.973309608540926e-05, + "loss": 1.3927, + "step": 6769 + }, + { + "epoch": 3.008888888888889, + "grad_norm": 2.9646785259246826, + "learning_rate": 7.97153024911032e-05, + "loss": 1.1182, + "step": 6770 + }, + { + "epoch": 3.009333333333333, + "grad_norm": 3.8950388431549072, + "learning_rate": 7.969750889679716e-05, + "loss": 1.0995, + "step": 6771 + }, + { + "epoch": 3.009777777777778, + "grad_norm": 3.958792209625244, + "learning_rate": 7.967971530249112e-05, + "loss": 1.3026, + "step": 6772 + }, + { + "epoch": 3.010222222222222, + "grad_norm": 3.3509862422943115, + "learning_rate": 7.966192170818506e-05, + "loss": 1.511, + "step": 6773 + }, + { + "epoch": 3.010666666666667, + "grad_norm": 3.338815689086914, + "learning_rate": 7.9644128113879e-05, + "loss": 1.1785, + "step": 6774 + }, + { + "epoch": 3.011111111111111, + "grad_norm": 4.509990692138672, + "learning_rate": 7.962633451957296e-05, + "loss": 1.4605, + "step": 6775 + }, + { + "epoch": 3.0115555555555558, + "grad_norm": 3.72403883934021, + "learning_rate": 7.96085409252669e-05, + "loss": 1.3251, + "step": 6776 + }, + { + "epoch": 3.012, + "grad_norm": 3.0874853134155273, + "learning_rate": 7.959074733096086e-05, + "loss": 1.424, + "step": 6777 + }, + { + "epoch": 3.0124444444444443, + "grad_norm": 2.400561809539795, + "learning_rate": 7.957295373665481e-05, + "loss": 0.6489, + "step": 6778 + }, + { + "epoch": 3.012888888888889, + "grad_norm": 3.6095080375671387, + "learning_rate": 7.955516014234876e-05, + "loss": 0.9895, + "step": 6779 + }, + { + "epoch": 3.013333333333333, + "grad_norm": 4.059244632720947, + "learning_rate": 7.953736654804271e-05, + "loss": 1.0755, + "step": 6780 + }, + { + "epoch": 3.013777777777778, + "grad_norm": 3.3951244354248047, + 
"learning_rate": 7.951957295373666e-05, + "loss": 1.4593, + "step": 6781 + }, + { + "epoch": 3.014222222222222, + "grad_norm": 3.264514207839966, + "learning_rate": 7.950177935943061e-05, + "loss": 1.1751, + "step": 6782 + }, + { + "epoch": 3.014666666666667, + "grad_norm": 3.5838840007781982, + "learning_rate": 7.948398576512456e-05, + "loss": 1.2873, + "step": 6783 + }, + { + "epoch": 3.015111111111111, + "grad_norm": 3.029461622238159, + "learning_rate": 7.946619217081851e-05, + "loss": 0.5727, + "step": 6784 + }, + { + "epoch": 3.0155555555555558, + "grad_norm": 2.3278002738952637, + "learning_rate": 7.944839857651247e-05, + "loss": 0.5224, + "step": 6785 + }, + { + "epoch": 3.016, + "grad_norm": 3.3948307037353516, + "learning_rate": 7.943060498220641e-05, + "loss": 1.5969, + "step": 6786 + }, + { + "epoch": 3.0164444444444443, + "grad_norm": 3.4269049167633057, + "learning_rate": 7.941281138790035e-05, + "loss": 1.3439, + "step": 6787 + }, + { + "epoch": 3.016888888888889, + "grad_norm": 4.021193504333496, + "learning_rate": 7.939501779359431e-05, + "loss": 1.429, + "step": 6788 + }, + { + "epoch": 3.017333333333333, + "grad_norm": 3.9195425510406494, + "learning_rate": 7.937722419928825e-05, + "loss": 1.3733, + "step": 6789 + }, + { + "epoch": 3.017777777777778, + "grad_norm": 3.872466564178467, + "learning_rate": 7.935943060498221e-05, + "loss": 1.1731, + "step": 6790 + }, + { + "epoch": 3.018222222222222, + "grad_norm": 4.006219387054443, + "learning_rate": 7.934163701067617e-05, + "loss": 1.5755, + "step": 6791 + }, + { + "epoch": 3.018666666666667, + "grad_norm": 4.188656330108643, + "learning_rate": 7.932384341637011e-05, + "loss": 1.4132, + "step": 6792 + }, + { + "epoch": 3.019111111111111, + "grad_norm": 3.21732234954834, + "learning_rate": 7.930604982206407e-05, + "loss": 0.9631, + "step": 6793 + }, + { + "epoch": 3.0195555555555558, + "grad_norm": 2.5914382934570312, + "learning_rate": 7.928825622775801e-05, + "loss": 0.9383, + "step": 6794 + }, + { + "epoch": 3.02, + "grad_norm": 3.1577656269073486, + "learning_rate": 7.927046263345195e-05, + "loss": 1.1981, + "step": 6795 + }, + { + "epoch": 3.0204444444444443, + "grad_norm": 3.7935643196105957, + "learning_rate": 7.925266903914591e-05, + "loss": 1.2181, + "step": 6796 + }, + { + "epoch": 3.020888888888889, + "grad_norm": 4.230493068695068, + "learning_rate": 7.923487544483986e-05, + "loss": 1.467, + "step": 6797 + }, + { + "epoch": 3.021333333333333, + "grad_norm": 4.8975982666015625, + "learning_rate": 7.921708185053381e-05, + "loss": 1.0094, + "step": 6798 + }, + { + "epoch": 3.021777777777778, + "grad_norm": 3.1656551361083984, + "learning_rate": 7.919928825622776e-05, + "loss": 0.5155, + "step": 6799 + }, + { + "epoch": 3.022222222222222, + "grad_norm": 4.159629821777344, + "learning_rate": 7.918149466192171e-05, + "loss": 0.1859, + "step": 6800 + }, + { + "epoch": 3.022666666666667, + "grad_norm": 3.048837900161743, + "learning_rate": 7.916370106761566e-05, + "loss": 0.8377, + "step": 6801 + }, + { + "epoch": 3.023111111111111, + "grad_norm": 2.2959177494049072, + "learning_rate": 7.91459074733096e-05, + "loss": 1.6241, + "step": 6802 + }, + { + "epoch": 3.0235555555555558, + "grad_norm": 1.9068602323532104, + "learning_rate": 7.912811387900356e-05, + "loss": 0.0712, + "step": 6803 + }, + { + "epoch": 3.024, + "grad_norm": 1.6793782711029053, + "learning_rate": 7.911032028469752e-05, + "loss": 0.5563, + "step": 6804 + }, + { + "epoch": 3.0244444444444443, + "grad_norm": 2.5858163833618164, + "learning_rate": 
7.909252669039146e-05, + "loss": 1.3781, + "step": 6805 + }, + { + "epoch": 3.024888888888889, + "grad_norm": 2.612051010131836, + "learning_rate": 7.907473309608542e-05, + "loss": 1.5614, + "step": 6806 + }, + { + "epoch": 3.025333333333333, + "grad_norm": 2.8191487789154053, + "learning_rate": 7.905693950177936e-05, + "loss": 1.3573, + "step": 6807 + }, + { + "epoch": 3.025777777777778, + "grad_norm": 2.706998109817505, + "learning_rate": 7.90391459074733e-05, + "loss": 1.6064, + "step": 6808 + }, + { + "epoch": 3.026222222222222, + "grad_norm": 2.847705841064453, + "learning_rate": 7.902135231316726e-05, + "loss": 1.0591, + "step": 6809 + }, + { + "epoch": 3.026666666666667, + "grad_norm": 2.685861825942993, + "learning_rate": 7.900355871886122e-05, + "loss": 1.5385, + "step": 6810 + }, + { + "epoch": 3.027111111111111, + "grad_norm": 3.0858583450317383, + "learning_rate": 7.898576512455516e-05, + "loss": 1.4013, + "step": 6811 + }, + { + "epoch": 3.0275555555555558, + "grad_norm": 2.825329065322876, + "learning_rate": 7.896797153024912e-05, + "loss": 1.4406, + "step": 6812 + }, + { + "epoch": 3.028, + "grad_norm": 3.5686569213867188, + "learning_rate": 7.895017793594306e-05, + "loss": 1.3895, + "step": 6813 + }, + { + "epoch": 3.0284444444444443, + "grad_norm": 3.122117280960083, + "learning_rate": 7.893238434163702e-05, + "loss": 1.3254, + "step": 6814 + }, + { + "epoch": 3.028888888888889, + "grad_norm": 2.970949649810791, + "learning_rate": 7.891459074733096e-05, + "loss": 1.4918, + "step": 6815 + }, + { + "epoch": 3.029333333333333, + "grad_norm": 2.8904595375061035, + "learning_rate": 7.889679715302492e-05, + "loss": 1.2927, + "step": 6816 + }, + { + "epoch": 3.029777777777778, + "grad_norm": 3.0297486782073975, + "learning_rate": 7.887900355871887e-05, + "loss": 1.4865, + "step": 6817 + }, + { + "epoch": 3.030222222222222, + "grad_norm": 3.1951310634613037, + "learning_rate": 7.886120996441282e-05, + "loss": 1.1547, + "step": 6818 + }, + { + "epoch": 3.030666666666667, + "grad_norm": 1.9806190729141235, + "learning_rate": 7.884341637010677e-05, + "loss": 0.5659, + "step": 6819 + }, + { + "epoch": 3.031111111111111, + "grad_norm": 2.957610845565796, + "learning_rate": 7.882562277580072e-05, + "loss": 1.3614, + "step": 6820 + }, + { + "epoch": 3.0315555555555553, + "grad_norm": 3.6964633464813232, + "learning_rate": 7.880782918149466e-05, + "loss": 1.7276, + "step": 6821 + }, + { + "epoch": 3.032, + "grad_norm": 3.108067512512207, + "learning_rate": 7.879003558718861e-05, + "loss": 1.4918, + "step": 6822 + }, + { + "epoch": 3.0324444444444443, + "grad_norm": 2.6663601398468018, + "learning_rate": 7.877224199288257e-05, + "loss": 1.233, + "step": 6823 + }, + { + "epoch": 3.032888888888889, + "grad_norm": 3.010274648666382, + "learning_rate": 7.875444839857651e-05, + "loss": 1.3845, + "step": 6824 + }, + { + "epoch": 3.033333333333333, + "grad_norm": 3.353402614593506, + "learning_rate": 7.873665480427047e-05, + "loss": 1.3772, + "step": 6825 + }, + { + "epoch": 3.033777777777778, + "grad_norm": 2.9852728843688965, + "learning_rate": 7.871886120996441e-05, + "loss": 1.5947, + "step": 6826 + }, + { + "epoch": 3.034222222222222, + "grad_norm": 2.881147861480713, + "learning_rate": 7.870106761565837e-05, + "loss": 1.3783, + "step": 6827 + }, + { + "epoch": 3.034666666666667, + "grad_norm": 3.402742385864258, + "learning_rate": 7.868327402135231e-05, + "loss": 1.1179, + "step": 6828 + }, + { + "epoch": 3.035111111111111, + "grad_norm": 2.7964463233947754, + "learning_rate": 
7.866548042704627e-05, + "loss": 0.6328, + "step": 6829 + }, + { + "epoch": 3.0355555555555553, + "grad_norm": 3.21201229095459, + "learning_rate": 7.864768683274023e-05, + "loss": 1.4121, + "step": 6830 + }, + { + "epoch": 3.036, + "grad_norm": 2.9257187843322754, + "learning_rate": 7.862989323843417e-05, + "loss": 0.7511, + "step": 6831 + }, + { + "epoch": 3.0364444444444443, + "grad_norm": 3.565912961959839, + "learning_rate": 7.861209964412811e-05, + "loss": 1.5339, + "step": 6832 + }, + { + "epoch": 3.036888888888889, + "grad_norm": 2.8367793560028076, + "learning_rate": 7.859430604982207e-05, + "loss": 1.2353, + "step": 6833 + }, + { + "epoch": 3.037333333333333, + "grad_norm": 3.7711472511291504, + "learning_rate": 7.857651245551601e-05, + "loss": 1.4139, + "step": 6834 + }, + { + "epoch": 3.037777777777778, + "grad_norm": 2.9545300006866455, + "learning_rate": 7.855871886120997e-05, + "loss": 1.3022, + "step": 6835 + }, + { + "epoch": 3.038222222222222, + "grad_norm": 3.401104688644409, + "learning_rate": 7.854092526690392e-05, + "loss": 1.3024, + "step": 6836 + }, + { + "epoch": 3.038666666666667, + "grad_norm": 3.2514753341674805, + "learning_rate": 7.852313167259787e-05, + "loss": 0.9559, + "step": 6837 + }, + { + "epoch": 3.039111111111111, + "grad_norm": 4.7731547355651855, + "learning_rate": 7.850533807829182e-05, + "loss": 1.3061, + "step": 6838 + }, + { + "epoch": 3.0395555555555553, + "grad_norm": 3.2436811923980713, + "learning_rate": 7.848754448398577e-05, + "loss": 1.0786, + "step": 6839 + }, + { + "epoch": 3.04, + "grad_norm": 3.0958991050720215, + "learning_rate": 7.846975088967971e-05, + "loss": 1.2044, + "step": 6840 + }, + { + "epoch": 3.0404444444444443, + "grad_norm": 3.618880271911621, + "learning_rate": 7.845195729537367e-05, + "loss": 1.0556, + "step": 6841 + }, + { + "epoch": 3.040888888888889, + "grad_norm": 4.0584001541137695, + "learning_rate": 7.843416370106762e-05, + "loss": 1.1659, + "step": 6842 + }, + { + "epoch": 3.041333333333333, + "grad_norm": 4.1572418212890625, + "learning_rate": 7.841637010676157e-05, + "loss": 1.0593, + "step": 6843 + }, + { + "epoch": 3.041777777777778, + "grad_norm": 3.093219757080078, + "learning_rate": 7.839857651245552e-05, + "loss": 1.1444, + "step": 6844 + }, + { + "epoch": 3.042222222222222, + "grad_norm": 4.556105613708496, + "learning_rate": 7.838078291814947e-05, + "loss": 1.3215, + "step": 6845 + }, + { + "epoch": 3.042666666666667, + "grad_norm": 3.609612226486206, + "learning_rate": 7.836298932384342e-05, + "loss": 1.3856, + "step": 6846 + }, + { + "epoch": 3.043111111111111, + "grad_norm": 4.959066867828369, + "learning_rate": 7.834519572953736e-05, + "loss": 1.1098, + "step": 6847 + }, + { + "epoch": 3.0435555555555553, + "grad_norm": 3.45310640335083, + "learning_rate": 7.832740213523132e-05, + "loss": 0.7384, + "step": 6848 + }, + { + "epoch": 3.044, + "grad_norm": 5.438348770141602, + "learning_rate": 7.830960854092528e-05, + "loss": 1.2785, + "step": 6849 + }, + { + "epoch": 3.0444444444444443, + "grad_norm": 6.3002119064331055, + "learning_rate": 7.829181494661922e-05, + "loss": 0.9269, + "step": 6850 + }, + { + "epoch": 3.044888888888889, + "grad_norm": 2.523265838623047, + "learning_rate": 7.827402135231318e-05, + "loss": 1.9499, + "step": 6851 + }, + { + "epoch": 3.0453333333333332, + "grad_norm": 2.234962224960327, + "learning_rate": 7.825622775800712e-05, + "loss": 1.646, + "step": 6852 + }, + { + "epoch": 3.045777777777778, + "grad_norm": 2.712780475616455, + "learning_rate": 7.823843416370106e-05, + 
"loss": 1.793, + "step": 6853 + }, + { + "epoch": 3.046222222222222, + "grad_norm": 2.686671018600464, + "learning_rate": 7.822064056939502e-05, + "loss": 1.5653, + "step": 6854 + }, + { + "epoch": 3.046666666666667, + "grad_norm": 2.907186269760132, + "learning_rate": 7.820284697508898e-05, + "loss": 1.5827, + "step": 6855 + }, + { + "epoch": 3.047111111111111, + "grad_norm": 2.4521493911743164, + "learning_rate": 7.818505338078292e-05, + "loss": 1.5815, + "step": 6856 + }, + { + "epoch": 3.0475555555555554, + "grad_norm": 2.803241491317749, + "learning_rate": 7.816725978647688e-05, + "loss": 1.5903, + "step": 6857 + }, + { + "epoch": 3.048, + "grad_norm": 2.856114625930786, + "learning_rate": 7.814946619217082e-05, + "loss": 1.3927, + "step": 6858 + }, + { + "epoch": 3.0484444444444443, + "grad_norm": 3.376676082611084, + "learning_rate": 7.813167259786477e-05, + "loss": 1.561, + "step": 6859 + }, + { + "epoch": 3.048888888888889, + "grad_norm": 3.10386061668396, + "learning_rate": 7.811387900355872e-05, + "loss": 1.473, + "step": 6860 + }, + { + "epoch": 3.0493333333333332, + "grad_norm": 3.0056633949279785, + "learning_rate": 7.809608540925267e-05, + "loss": 1.3637, + "step": 6861 + }, + { + "epoch": 3.049777777777778, + "grad_norm": 3.1798784732818604, + "learning_rate": 7.807829181494663e-05, + "loss": 1.7298, + "step": 6862 + }, + { + "epoch": 3.050222222222222, + "grad_norm": 3.0518245697021484, + "learning_rate": 7.806049822064057e-05, + "loss": 1.2918, + "step": 6863 + }, + { + "epoch": 3.050666666666667, + "grad_norm": 2.996016502380371, + "learning_rate": 7.804270462633453e-05, + "loss": 1.5182, + "step": 6864 + }, + { + "epoch": 3.051111111111111, + "grad_norm": 3.0778775215148926, + "learning_rate": 7.802491103202847e-05, + "loss": 1.7565, + "step": 6865 + }, + { + "epoch": 3.0515555555555554, + "grad_norm": 2.45369553565979, + "learning_rate": 7.800711743772242e-05, + "loss": 0.9105, + "step": 6866 + }, + { + "epoch": 3.052, + "grad_norm": 3.1584999561309814, + "learning_rate": 7.798932384341637e-05, + "loss": 1.5315, + "step": 6867 + }, + { + "epoch": 3.0524444444444443, + "grad_norm": 2.7491531372070312, + "learning_rate": 7.797153024911033e-05, + "loss": 1.2398, + "step": 6868 + }, + { + "epoch": 3.052888888888889, + "grad_norm": 3.181427478790283, + "learning_rate": 7.795373665480427e-05, + "loss": 1.4826, + "step": 6869 + }, + { + "epoch": 3.0533333333333332, + "grad_norm": 2.462205648422241, + "learning_rate": 7.793594306049823e-05, + "loss": 0.7215, + "step": 6870 + }, + { + "epoch": 3.053777777777778, + "grad_norm": 2.8350601196289062, + "learning_rate": 7.791814946619217e-05, + "loss": 1.4129, + "step": 6871 + }, + { + "epoch": 3.054222222222222, + "grad_norm": 3.099984645843506, + "learning_rate": 7.790035587188613e-05, + "loss": 1.1478, + "step": 6872 + }, + { + "epoch": 3.054666666666667, + "grad_norm": 2.893434524536133, + "learning_rate": 7.788256227758007e-05, + "loss": 1.4698, + "step": 6873 + }, + { + "epoch": 3.055111111111111, + "grad_norm": 3.185155153274536, + "learning_rate": 7.786476868327403e-05, + "loss": 1.6558, + "step": 6874 + }, + { + "epoch": 3.0555555555555554, + "grad_norm": 2.3959739208221436, + "learning_rate": 7.784697508896798e-05, + "loss": 1.266, + "step": 6875 + }, + { + "epoch": 3.056, + "grad_norm": 3.058983564376831, + "learning_rate": 7.782918149466193e-05, + "loss": 1.3393, + "step": 6876 + }, + { + "epoch": 3.0564444444444443, + "grad_norm": 3.080317974090576, + "learning_rate": 7.781138790035588e-05, + "loss": 1.5509, + "step": 
6877 + }, + { + "epoch": 3.056888888888889, + "grad_norm": 3.090117931365967, + "learning_rate": 7.779359430604983e-05, + "loss": 1.2201, + "step": 6878 + }, + { + "epoch": 3.0573333333333332, + "grad_norm": 2.1614432334899902, + "learning_rate": 7.777580071174377e-05, + "loss": 0.6346, + "step": 6879 + }, + { + "epoch": 3.057777777777778, + "grad_norm": 2.7697033882141113, + "learning_rate": 7.775800711743773e-05, + "loss": 1.1247, + "step": 6880 + }, + { + "epoch": 3.058222222222222, + "grad_norm": 3.1787707805633545, + "learning_rate": 7.774021352313168e-05, + "loss": 1.0795, + "step": 6881 + }, + { + "epoch": 3.058666666666667, + "grad_norm": 3.6657590866088867, + "learning_rate": 7.772241992882563e-05, + "loss": 1.6009, + "step": 6882 + }, + { + "epoch": 3.059111111111111, + "grad_norm": 3.2929611206054688, + "learning_rate": 7.770462633451958e-05, + "loss": 1.4993, + "step": 6883 + }, + { + "epoch": 3.0595555555555554, + "grad_norm": 3.753507137298584, + "learning_rate": 7.768683274021352e-05, + "loss": 1.6565, + "step": 6884 + }, + { + "epoch": 3.06, + "grad_norm": 2.848525047302246, + "learning_rate": 7.766903914590747e-05, + "loss": 0.9654, + "step": 6885 + }, + { + "epoch": 3.0604444444444443, + "grad_norm": 3.355612277984619, + "learning_rate": 7.765124555160142e-05, + "loss": 1.0126, + "step": 6886 + }, + { + "epoch": 3.060888888888889, + "grad_norm": 3.229212522506714, + "learning_rate": 7.763345195729538e-05, + "loss": 0.7328, + "step": 6887 + }, + { + "epoch": 3.0613333333333332, + "grad_norm": 4.031661033630371, + "learning_rate": 7.761565836298932e-05, + "loss": 1.3381, + "step": 6888 + }, + { + "epoch": 3.061777777777778, + "grad_norm": 3.2130744457244873, + "learning_rate": 7.759786476868328e-05, + "loss": 1.1694, + "step": 6889 + }, + { + "epoch": 3.062222222222222, + "grad_norm": 4.325170040130615, + "learning_rate": 7.758007117437722e-05, + "loss": 0.9918, + "step": 6890 + }, + { + "epoch": 3.062666666666667, + "grad_norm": 3.6601080894470215, + "learning_rate": 7.756227758007118e-05, + "loss": 1.6125, + "step": 6891 + }, + { + "epoch": 3.063111111111111, + "grad_norm": 2.9753174781799316, + "learning_rate": 7.754448398576512e-05, + "loss": 0.8867, + "step": 6892 + }, + { + "epoch": 3.0635555555555554, + "grad_norm": 4.441684246063232, + "learning_rate": 7.752669039145908e-05, + "loss": 1.6672, + "step": 6893 + }, + { + "epoch": 3.064, + "grad_norm": 5.101708889007568, + "learning_rate": 7.750889679715304e-05, + "loss": 1.3812, + "step": 6894 + }, + { + "epoch": 3.0644444444444443, + "grad_norm": 4.84489631652832, + "learning_rate": 7.749110320284698e-05, + "loss": 1.4592, + "step": 6895 + }, + { + "epoch": 3.064888888888889, + "grad_norm": 4.635469913482666, + "learning_rate": 7.747330960854093e-05, + "loss": 1.2783, + "step": 6896 + }, + { + "epoch": 3.0653333333333332, + "grad_norm": 4.737509727478027, + "learning_rate": 7.745551601423488e-05, + "loss": 1.1961, + "step": 6897 + }, + { + "epoch": 3.065777777777778, + "grad_norm": 5.450290203094482, + "learning_rate": 7.743772241992882e-05, + "loss": 0.9652, + "step": 6898 + }, + { + "epoch": 3.066222222222222, + "grad_norm": 4.097311973571777, + "learning_rate": 7.741992882562278e-05, + "loss": 0.9152, + "step": 6899 + }, + { + "epoch": 3.066666666666667, + "grad_norm": 4.8908257484436035, + "learning_rate": 7.740213523131673e-05, + "loss": 0.9301, + "step": 6900 + }, + { + "epoch": 3.067111111111111, + "grad_norm": 1.8603249788284302, + "learning_rate": 7.738434163701068e-05, + "loss": 0.848, + "step": 6901 + }, + 
{ + "epoch": 3.0675555555555554, + "grad_norm": 2.392153263092041, + "learning_rate": 7.736654804270463e-05, + "loss": 1.9415, + "step": 6902 + }, + { + "epoch": 3.068, + "grad_norm": 2.698664426803589, + "learning_rate": 7.734875444839858e-05, + "loss": 1.7879, + "step": 6903 + }, + { + "epoch": 3.0684444444444443, + "grad_norm": 2.744234561920166, + "learning_rate": 7.733096085409253e-05, + "loss": 1.7097, + "step": 6904 + }, + { + "epoch": 3.068888888888889, + "grad_norm": 3.156621217727661, + "learning_rate": 7.731316725978648e-05, + "loss": 2.1794, + "step": 6905 + }, + { + "epoch": 3.0693333333333332, + "grad_norm": 2.774522066116333, + "learning_rate": 7.729537366548043e-05, + "loss": 1.6727, + "step": 6906 + }, + { + "epoch": 3.069777777777778, + "grad_norm": 3.170665740966797, + "learning_rate": 7.727758007117439e-05, + "loss": 0.7644, + "step": 6907 + }, + { + "epoch": 3.070222222222222, + "grad_norm": 2.518483877182007, + "learning_rate": 7.725978647686833e-05, + "loss": 1.337, + "step": 6908 + }, + { + "epoch": 3.070666666666667, + "grad_norm": 2.7955567836761475, + "learning_rate": 7.724199288256229e-05, + "loss": 1.1453, + "step": 6909 + }, + { + "epoch": 3.071111111111111, + "grad_norm": 3.288102865219116, + "learning_rate": 7.722419928825623e-05, + "loss": 1.2716, + "step": 6910 + }, + { + "epoch": 3.0715555555555554, + "grad_norm": 3.349630355834961, + "learning_rate": 7.720640569395017e-05, + "loss": 1.716, + "step": 6911 + }, + { + "epoch": 3.072, + "grad_norm": 2.8400745391845703, + "learning_rate": 7.718861209964413e-05, + "loss": 1.2617, + "step": 6912 + }, + { + "epoch": 3.0724444444444443, + "grad_norm": 2.843729257583618, + "learning_rate": 7.717081850533809e-05, + "loss": 1.4572, + "step": 6913 + }, + { + "epoch": 3.072888888888889, + "grad_norm": 3.0647592544555664, + "learning_rate": 7.715302491103203e-05, + "loss": 1.5921, + "step": 6914 + }, + { + "epoch": 3.0733333333333333, + "grad_norm": 3.1365675926208496, + "learning_rate": 7.713523131672599e-05, + "loss": 1.5678, + "step": 6915 + }, + { + "epoch": 3.073777777777778, + "grad_norm": 2.7604968547821045, + "learning_rate": 7.711743772241993e-05, + "loss": 1.3843, + "step": 6916 + }, + { + "epoch": 3.074222222222222, + "grad_norm": 2.654712438583374, + "learning_rate": 7.709964412811389e-05, + "loss": 1.0891, + "step": 6917 + }, + { + "epoch": 3.074666666666667, + "grad_norm": 3.091094732284546, + "learning_rate": 7.708185053380783e-05, + "loss": 1.4574, + "step": 6918 + }, + { + "epoch": 3.075111111111111, + "grad_norm": 2.8697140216827393, + "learning_rate": 7.706405693950179e-05, + "loss": 1.2856, + "step": 6919 + }, + { + "epoch": 3.0755555555555554, + "grad_norm": 4.098965644836426, + "learning_rate": 7.704626334519574e-05, + "loss": 1.2447, + "step": 6920 + }, + { + "epoch": 3.076, + "grad_norm": 3.0717883110046387, + "learning_rate": 7.702846975088968e-05, + "loss": 1.2924, + "step": 6921 + }, + { + "epoch": 3.0764444444444443, + "grad_norm": 3.0707571506500244, + "learning_rate": 7.701067615658364e-05, + "loss": 1.3306, + "step": 6922 + }, + { + "epoch": 3.076888888888889, + "grad_norm": 3.0397446155548096, + "learning_rate": 7.699288256227758e-05, + "loss": 1.3013, + "step": 6923 + }, + { + "epoch": 3.0773333333333333, + "grad_norm": 3.693847894668579, + "learning_rate": 7.697508896797153e-05, + "loss": 1.4978, + "step": 6924 + }, + { + "epoch": 3.077777777777778, + "grad_norm": 3.112276077270508, + "learning_rate": 7.695729537366548e-05, + "loss": 1.4023, + "step": 6925 + }, + { + "epoch": 
3.078222222222222, + "grad_norm": 3.29516863822937, + "learning_rate": 7.693950177935944e-05, + "loss": 1.2205, + "step": 6926 + }, + { + "epoch": 3.078666666666667, + "grad_norm": 2.247548818588257, + "learning_rate": 7.692170818505338e-05, + "loss": 0.6385, + "step": 6927 + }, + { + "epoch": 3.079111111111111, + "grad_norm": 2.3756346702575684, + "learning_rate": 7.690391459074734e-05, + "loss": 0.7201, + "step": 6928 + }, + { + "epoch": 3.0795555555555554, + "grad_norm": 3.00430965423584, + "learning_rate": 7.688612099644128e-05, + "loss": 1.0356, + "step": 6929 + }, + { + "epoch": 3.08, + "grad_norm": 2.532029390335083, + "learning_rate": 7.686832740213523e-05, + "loss": 0.5134, + "step": 6930 + }, + { + "epoch": 3.0804444444444443, + "grad_norm": 0.30651673674583435, + "learning_rate": 7.685053380782918e-05, + "loss": 0.0335, + "step": 6931 + }, + { + "epoch": 3.080888888888889, + "grad_norm": 2.980158567428589, + "learning_rate": 7.683274021352314e-05, + "loss": 1.0148, + "step": 6932 + }, + { + "epoch": 3.0813333333333333, + "grad_norm": 3.9379446506500244, + "learning_rate": 7.681494661921708e-05, + "loss": 1.354, + "step": 6933 + }, + { + "epoch": 3.081777777777778, + "grad_norm": 3.313826560974121, + "learning_rate": 7.679715302491104e-05, + "loss": 1.5082, + "step": 6934 + }, + { + "epoch": 3.082222222222222, + "grad_norm": 2.866286039352417, + "learning_rate": 7.6779359430605e-05, + "loss": 0.7554, + "step": 6935 + }, + { + "epoch": 3.0826666666666664, + "grad_norm": 3.264652967453003, + "learning_rate": 7.676156583629894e-05, + "loss": 1.0634, + "step": 6936 + }, + { + "epoch": 3.083111111111111, + "grad_norm": 3.0582849979400635, + "learning_rate": 7.674377224199288e-05, + "loss": 0.9423, + "step": 6937 + }, + { + "epoch": 3.0835555555555554, + "grad_norm": 4.915375709533691, + "learning_rate": 7.672597864768684e-05, + "loss": 1.1836, + "step": 6938 + }, + { + "epoch": 3.084, + "grad_norm": 3.4640719890594482, + "learning_rate": 7.670818505338079e-05, + "loss": 1.2714, + "step": 6939 + }, + { + "epoch": 3.0844444444444443, + "grad_norm": 3.8669791221618652, + "learning_rate": 7.669039145907474e-05, + "loss": 1.0252, + "step": 6940 + }, + { + "epoch": 3.084888888888889, + "grad_norm": 4.150646686553955, + "learning_rate": 7.667259786476869e-05, + "loss": 1.432, + "step": 6941 + }, + { + "epoch": 3.0853333333333333, + "grad_norm": 3.675527334213257, + "learning_rate": 7.665480427046264e-05, + "loss": 1.2161, + "step": 6942 + }, + { + "epoch": 3.085777777777778, + "grad_norm": 4.122688293457031, + "learning_rate": 7.663701067615658e-05, + "loss": 1.2763, + "step": 6943 + }, + { + "epoch": 3.086222222222222, + "grad_norm": 3.758718967437744, + "learning_rate": 7.661921708185053e-05, + "loss": 1.2185, + "step": 6944 + }, + { + "epoch": 3.086666666666667, + "grad_norm": 4.356784343719482, + "learning_rate": 7.660142348754449e-05, + "loss": 1.4009, + "step": 6945 + }, + { + "epoch": 3.087111111111111, + "grad_norm": 4.016665458679199, + "learning_rate": 7.658362989323843e-05, + "loss": 0.9262, + "step": 6946 + }, + { + "epoch": 3.0875555555555554, + "grad_norm": 6.008724689483643, + "learning_rate": 7.656583629893239e-05, + "loss": 1.37, + "step": 6947 + }, + { + "epoch": 3.088, + "grad_norm": 5.219587326049805, + "learning_rate": 7.654804270462633e-05, + "loss": 1.1014, + "step": 6948 + }, + { + "epoch": 3.0884444444444443, + "grad_norm": 3.348543643951416, + "learning_rate": 7.653024911032029e-05, + "loss": 0.3617, + "step": 6949 + }, + { + "epoch": 3.088888888888889, + 
"grad_norm": 5.084573745727539, + "learning_rate": 7.651245551601423e-05, + "loss": 0.6737, + "step": 6950 + }, + { + "epoch": 3.0893333333333333, + "grad_norm": 2.369049072265625, + "learning_rate": 7.649466192170819e-05, + "loss": 1.7119, + "step": 6951 + }, + { + "epoch": 3.089777777777778, + "grad_norm": 2.740426540374756, + "learning_rate": 7.647686832740215e-05, + "loss": 1.6545, + "step": 6952 + }, + { + "epoch": 3.090222222222222, + "grad_norm": 2.695539712905884, + "learning_rate": 7.645907473309609e-05, + "loss": 1.7265, + "step": 6953 + }, + { + "epoch": 3.0906666666666665, + "grad_norm": 2.5847222805023193, + "learning_rate": 7.644128113879005e-05, + "loss": 1.1412, + "step": 6954 + }, + { + "epoch": 3.091111111111111, + "grad_norm": 2.5503435134887695, + "learning_rate": 7.642348754448399e-05, + "loss": 1.3825, + "step": 6955 + }, + { + "epoch": 3.0915555555555554, + "grad_norm": 3.02182674407959, + "learning_rate": 7.640569395017793e-05, + "loss": 1.3872, + "step": 6956 + }, + { + "epoch": 3.092, + "grad_norm": 2.3122494220733643, + "learning_rate": 7.638790035587189e-05, + "loss": 0.9554, + "step": 6957 + }, + { + "epoch": 3.0924444444444443, + "grad_norm": 2.8011865615844727, + "learning_rate": 7.637010676156584e-05, + "loss": 1.2296, + "step": 6958 + }, + { + "epoch": 3.092888888888889, + "grad_norm": 3.4765398502349854, + "learning_rate": 7.635231316725979e-05, + "loss": 2.2308, + "step": 6959 + }, + { + "epoch": 3.0933333333333333, + "grad_norm": 2.9357285499572754, + "learning_rate": 7.633451957295374e-05, + "loss": 1.394, + "step": 6960 + }, + { + "epoch": 3.093777777777778, + "grad_norm": 3.6758627891540527, + "learning_rate": 7.631672597864769e-05, + "loss": 1.7402, + "step": 6961 + }, + { + "epoch": 3.094222222222222, + "grad_norm": 3.1817829608917236, + "learning_rate": 7.629893238434164e-05, + "loss": 1.432, + "step": 6962 + }, + { + "epoch": 3.0946666666666665, + "grad_norm": 3.500157594680786, + "learning_rate": 7.628113879003559e-05, + "loss": 1.7906, + "step": 6963 + }, + { + "epoch": 3.095111111111111, + "grad_norm": 3.7627017498016357, + "learning_rate": 7.626334519572954e-05, + "loss": 1.7225, + "step": 6964 + }, + { + "epoch": 3.0955555555555554, + "grad_norm": 2.646817445755005, + "learning_rate": 7.62455516014235e-05, + "loss": 1.3691, + "step": 6965 + }, + { + "epoch": 3.096, + "grad_norm": 2.6752378940582275, + "learning_rate": 7.622775800711744e-05, + "loss": 1.4794, + "step": 6966 + }, + { + "epoch": 3.0964444444444443, + "grad_norm": 2.979374885559082, + "learning_rate": 7.62099644128114e-05, + "loss": 1.0324, + "step": 6967 + }, + { + "epoch": 3.096888888888889, + "grad_norm": 2.8177530765533447, + "learning_rate": 7.619217081850534e-05, + "loss": 1.5979, + "step": 6968 + }, + { + "epoch": 3.0973333333333333, + "grad_norm": 2.7737972736358643, + "learning_rate": 7.617437722419928e-05, + "loss": 1.138, + "step": 6969 + }, + { + "epoch": 3.097777777777778, + "grad_norm": 3.1239850521087646, + "learning_rate": 7.615658362989324e-05, + "loss": 1.4938, + "step": 6970 + }, + { + "epoch": 3.098222222222222, + "grad_norm": 3.2741284370422363, + "learning_rate": 7.61387900355872e-05, + "loss": 1.0675, + "step": 6971 + }, + { + "epoch": 3.0986666666666665, + "grad_norm": 2.790844678878784, + "learning_rate": 7.612099644128114e-05, + "loss": 1.2557, + "step": 6972 + }, + { + "epoch": 3.099111111111111, + "grad_norm": 2.8968873023986816, + "learning_rate": 7.61032028469751e-05, + "loss": 1.259, + "step": 6973 + }, + { + "epoch": 3.0995555555555554, + 
"grad_norm": 3.275770664215088, + "learning_rate": 7.608540925266904e-05, + "loss": 1.2327, + "step": 6974 + }, + { + "epoch": 3.1, + "grad_norm": 3.203305721282959, + "learning_rate": 7.606761565836298e-05, + "loss": 0.9974, + "step": 6975 + }, + { + "epoch": 3.1004444444444443, + "grad_norm": 3.2165777683258057, + "learning_rate": 7.604982206405694e-05, + "loss": 1.6345, + "step": 6976 + }, + { + "epoch": 3.100888888888889, + "grad_norm": 3.0916032791137695, + "learning_rate": 7.60320284697509e-05, + "loss": 1.2841, + "step": 6977 + }, + { + "epoch": 3.1013333333333333, + "grad_norm": 3.5910959243774414, + "learning_rate": 7.601423487544484e-05, + "loss": 0.96, + "step": 6978 + }, + { + "epoch": 3.101777777777778, + "grad_norm": 3.80975341796875, + "learning_rate": 7.59964412811388e-05, + "loss": 1.5965, + "step": 6979 + }, + { + "epoch": 3.102222222222222, + "grad_norm": 2.8560242652893066, + "learning_rate": 7.597864768683275e-05, + "loss": 0.8661, + "step": 6980 + }, + { + "epoch": 3.1026666666666665, + "grad_norm": 4.064279079437256, + "learning_rate": 7.59608540925267e-05, + "loss": 1.3318, + "step": 6981 + }, + { + "epoch": 3.103111111111111, + "grad_norm": 3.475475311279297, + "learning_rate": 7.594306049822064e-05, + "loss": 0.9713, + "step": 6982 + }, + { + "epoch": 3.1035555555555554, + "grad_norm": 3.4135968685150146, + "learning_rate": 7.59252669039146e-05, + "loss": 1.1763, + "step": 6983 + }, + { + "epoch": 3.104, + "grad_norm": 2.9875054359436035, + "learning_rate": 7.590747330960855e-05, + "loss": 1.3583, + "step": 6984 + }, + { + "epoch": 3.1044444444444443, + "grad_norm": 3.5660154819488525, + "learning_rate": 7.58896797153025e-05, + "loss": 1.2142, + "step": 6985 + }, + { + "epoch": 3.104888888888889, + "grad_norm": 3.5056378841400146, + "learning_rate": 7.587188612099645e-05, + "loss": 1.3169, + "step": 6986 + }, + { + "epoch": 3.1053333333333333, + "grad_norm": 2.358790636062622, + "learning_rate": 7.58540925266904e-05, + "loss": 0.488, + "step": 6987 + }, + { + "epoch": 3.105777777777778, + "grad_norm": 3.0565037727355957, + "learning_rate": 7.583629893238434e-05, + "loss": 1.1014, + "step": 6988 + }, + { + "epoch": 3.106222222222222, + "grad_norm": 2.9908316135406494, + "learning_rate": 7.581850533807829e-05, + "loss": 1.0469, + "step": 6989 + }, + { + "epoch": 3.1066666666666665, + "grad_norm": 3.8466875553131104, + "learning_rate": 7.580071174377225e-05, + "loss": 0.9868, + "step": 6990 + }, + { + "epoch": 3.107111111111111, + "grad_norm": 4.6302032470703125, + "learning_rate": 7.578291814946619e-05, + "loss": 1.6431, + "step": 6991 + }, + { + "epoch": 3.1075555555555554, + "grad_norm": 3.230506420135498, + "learning_rate": 7.576512455516015e-05, + "loss": 0.8402, + "step": 6992 + }, + { + "epoch": 3.108, + "grad_norm": 3.8830463886260986, + "learning_rate": 7.57473309608541e-05, + "loss": 1.1773, + "step": 6993 + }, + { + "epoch": 3.1084444444444443, + "grad_norm": 4.235727310180664, + "learning_rate": 7.572953736654805e-05, + "loss": 1.2941, + "step": 6994 + }, + { + "epoch": 3.108888888888889, + "grad_norm": 4.56505012512207, + "learning_rate": 7.571174377224199e-05, + "loss": 1.1328, + "step": 6995 + }, + { + "epoch": 3.1093333333333333, + "grad_norm": 3.8152174949645996, + "learning_rate": 7.569395017793595e-05, + "loss": 0.8993, + "step": 6996 + }, + { + "epoch": 3.109777777777778, + "grad_norm": 5.836917877197266, + "learning_rate": 7.56761565836299e-05, + "loss": 1.6419, + "step": 6997 + }, + { + "epoch": 3.110222222222222, + "grad_norm": 
3.6817240715026855, + "learning_rate": 7.565836298932385e-05, + "loss": 0.995, + "step": 6998 + }, + { + "epoch": 3.1106666666666665, + "grad_norm": 4.596709251403809, + "learning_rate": 7.56405693950178e-05, + "loss": 0.7939, + "step": 6999 + }, + { + "epoch": 3.111111111111111, + "grad_norm": 3.930978775024414, + "learning_rate": 7.562277580071175e-05, + "loss": 0.7911, + "step": 7000 + }, + { + "epoch": 3.1115555555555554, + "grad_norm": 2.6198768615722656, + "learning_rate": 7.560498220640569e-05, + "loss": 1.8534, + "step": 7001 + }, + { + "epoch": 3.112, + "grad_norm": 2.364030599594116, + "learning_rate": 7.558718861209965e-05, + "loss": 2.0188, + "step": 7002 + }, + { + "epoch": 3.1124444444444443, + "grad_norm": 2.803751230239868, + "learning_rate": 7.55693950177936e-05, + "loss": 1.405, + "step": 7003 + }, + { + "epoch": 3.112888888888889, + "grad_norm": 2.8711655139923096, + "learning_rate": 7.555160142348755e-05, + "loss": 1.3464, + "step": 7004 + }, + { + "epoch": 3.1133333333333333, + "grad_norm": 2.2702465057373047, + "learning_rate": 7.55338078291815e-05, + "loss": 1.1326, + "step": 7005 + }, + { + "epoch": 3.113777777777778, + "grad_norm": 3.279500961303711, + "learning_rate": 7.551601423487544e-05, + "loss": 1.7044, + "step": 7006 + }, + { + "epoch": 3.1142222222222222, + "grad_norm": 3.2445764541625977, + "learning_rate": 7.54982206405694e-05, + "loss": 1.5965, + "step": 7007 + }, + { + "epoch": 3.1146666666666665, + "grad_norm": 3.1852023601531982, + "learning_rate": 7.548042704626334e-05, + "loss": 1.7852, + "step": 7008 + }, + { + "epoch": 3.115111111111111, + "grad_norm": 2.400352716445923, + "learning_rate": 7.54626334519573e-05, + "loss": 0.6979, + "step": 7009 + }, + { + "epoch": 3.1155555555555554, + "grad_norm": 3.9273557662963867, + "learning_rate": 7.544483985765126e-05, + "loss": 1.7671, + "step": 7010 + }, + { + "epoch": 3.116, + "grad_norm": 3.297231674194336, + "learning_rate": 7.54270462633452e-05, + "loss": 1.6285, + "step": 7011 + }, + { + "epoch": 3.1164444444444444, + "grad_norm": 2.7438652515411377, + "learning_rate": 7.540925266903916e-05, + "loss": 1.2797, + "step": 7012 + }, + { + "epoch": 3.116888888888889, + "grad_norm": 2.7498419284820557, + "learning_rate": 7.53914590747331e-05, + "loss": 1.4858, + "step": 7013 + }, + { + "epoch": 3.1173333333333333, + "grad_norm": 2.6676368713378906, + "learning_rate": 7.537366548042704e-05, + "loss": 1.3337, + "step": 7014 + }, + { + "epoch": 3.117777777777778, + "grad_norm": 2.901026487350464, + "learning_rate": 7.5355871886121e-05, + "loss": 1.0484, + "step": 7015 + }, + { + "epoch": 3.1182222222222222, + "grad_norm": 2.0546693801879883, + "learning_rate": 7.533807829181496e-05, + "loss": 0.693, + "step": 7016 + }, + { + "epoch": 3.1186666666666665, + "grad_norm": 2.8888492584228516, + "learning_rate": 7.53202846975089e-05, + "loss": 1.1957, + "step": 7017 + }, + { + "epoch": 3.119111111111111, + "grad_norm": 3.6129257678985596, + "learning_rate": 7.530249110320285e-05, + "loss": 1.4137, + "step": 7018 + }, + { + "epoch": 3.1195555555555554, + "grad_norm": 2.8861212730407715, + "learning_rate": 7.52846975088968e-05, + "loss": 1.3716, + "step": 7019 + }, + { + "epoch": 3.12, + "grad_norm": 2.638096809387207, + "learning_rate": 7.526690391459074e-05, + "loss": 1.1059, + "step": 7020 + }, + { + "epoch": 3.1204444444444444, + "grad_norm": 3.651658773422241, + "learning_rate": 7.52491103202847e-05, + "loss": 1.4312, + "step": 7021 + }, + { + "epoch": 3.120888888888889, + "grad_norm": 3.251763343811035, + 
"learning_rate": 7.523131672597865e-05, + "loss": 1.2436, + "step": 7022 + }, + { + "epoch": 3.1213333333333333, + "grad_norm": 3.0787482261657715, + "learning_rate": 7.52135231316726e-05, + "loss": 1.6314, + "step": 7023 + }, + { + "epoch": 3.121777777777778, + "grad_norm": 3.3005053997039795, + "learning_rate": 7.519572953736655e-05, + "loss": 1.3528, + "step": 7024 + }, + { + "epoch": 3.1222222222222222, + "grad_norm": 3.0210680961608887, + "learning_rate": 7.517793594306051e-05, + "loss": 1.028, + "step": 7025 + }, + { + "epoch": 3.1226666666666665, + "grad_norm": 3.53727126121521, + "learning_rate": 7.516014234875445e-05, + "loss": 1.2015, + "step": 7026 + }, + { + "epoch": 3.123111111111111, + "grad_norm": 3.3059957027435303, + "learning_rate": 7.51423487544484e-05, + "loss": 1.1474, + "step": 7027 + }, + { + "epoch": 3.1235555555555554, + "grad_norm": 3.240772008895874, + "learning_rate": 7.512455516014235e-05, + "loss": 1.1528, + "step": 7028 + }, + { + "epoch": 3.124, + "grad_norm": 2.7922959327697754, + "learning_rate": 7.510676156583631e-05, + "loss": 1.1426, + "step": 7029 + }, + { + "epoch": 3.1244444444444444, + "grad_norm": 3.379770278930664, + "learning_rate": 7.508896797153025e-05, + "loss": 1.3263, + "step": 7030 + }, + { + "epoch": 3.124888888888889, + "grad_norm": 2.729382038116455, + "learning_rate": 7.507117437722421e-05, + "loss": 1.1248, + "step": 7031 + }, + { + "epoch": 3.1253333333333333, + "grad_norm": 3.164997100830078, + "learning_rate": 7.505338078291815e-05, + "loss": 1.3626, + "step": 7032 + }, + { + "epoch": 3.1257777777777775, + "grad_norm": 2.980705499649048, + "learning_rate": 7.50355871886121e-05, + "loss": 1.0401, + "step": 7033 + }, + { + "epoch": 3.1262222222222222, + "grad_norm": 3.0273003578186035, + "learning_rate": 7.501779359430605e-05, + "loss": 1.0748, + "step": 7034 + }, + { + "epoch": 3.1266666666666665, + "grad_norm": 3.4305291175842285, + "learning_rate": 7.500000000000001e-05, + "loss": 1.38, + "step": 7035 + }, + { + "epoch": 3.127111111111111, + "grad_norm": 3.338956594467163, + "learning_rate": 7.498220640569395e-05, + "loss": 0.9866, + "step": 7036 + }, + { + "epoch": 3.1275555555555554, + "grad_norm": 3.5987491607666016, + "learning_rate": 7.49644128113879e-05, + "loss": 1.1284, + "step": 7037 + }, + { + "epoch": 3.128, + "grad_norm": 3.742375135421753, + "learning_rate": 7.494661921708186e-05, + "loss": 1.4609, + "step": 7038 + }, + { + "epoch": 3.1284444444444444, + "grad_norm": 3.7690203189849854, + "learning_rate": 7.49288256227758e-05, + "loss": 1.5164, + "step": 7039 + }, + { + "epoch": 3.128888888888889, + "grad_norm": 4.437491416931152, + "learning_rate": 7.491103202846975e-05, + "loss": 1.1423, + "step": 7040 + }, + { + "epoch": 3.1293333333333333, + "grad_norm": 3.674255609512329, + "learning_rate": 7.48932384341637e-05, + "loss": 1.3641, + "step": 7041 + }, + { + "epoch": 3.129777777777778, + "grad_norm": 3.8117153644561768, + "learning_rate": 7.487544483985766e-05, + "loss": 1.0165, + "step": 7042 + }, + { + "epoch": 3.1302222222222222, + "grad_norm": 3.8670129776000977, + "learning_rate": 7.48576512455516e-05, + "loss": 1.0165, + "step": 7043 + }, + { + "epoch": 3.1306666666666665, + "grad_norm": 3.9163060188293457, + "learning_rate": 7.483985765124556e-05, + "loss": 0.9119, + "step": 7044 + }, + { + "epoch": 3.131111111111111, + "grad_norm": 4.793590545654297, + "learning_rate": 7.48220640569395e-05, + "loss": 1.3223, + "step": 7045 + }, + { + "epoch": 3.1315555555555554, + "grad_norm": 5.119454383850098, + 
"learning_rate": 7.480427046263345e-05, + "loss": 1.1945, + "step": 7046 + }, + { + "epoch": 3.132, + "grad_norm": 4.798033714294434, + "learning_rate": 7.47864768683274e-05, + "loss": 1.407, + "step": 7047 + }, + { + "epoch": 3.1324444444444444, + "grad_norm": 2.8798067569732666, + "learning_rate": 7.476868327402136e-05, + "loss": 0.682, + "step": 7048 + }, + { + "epoch": 3.132888888888889, + "grad_norm": 2.7945594787597656, + "learning_rate": 7.47508896797153e-05, + "loss": 0.5815, + "step": 7049 + }, + { + "epoch": 3.1333333333333333, + "grad_norm": 4.523054122924805, + "learning_rate": 7.473309608540926e-05, + "loss": 0.7363, + "step": 7050 + }, + { + "epoch": 3.1337777777777776, + "grad_norm": 2.011254072189331, + "learning_rate": 7.471530249110322e-05, + "loss": 0.9446, + "step": 7051 + }, + { + "epoch": 3.1342222222222222, + "grad_norm": 1.6838473081588745, + "learning_rate": 7.469750889679716e-05, + "loss": 0.86, + "step": 7052 + }, + { + "epoch": 3.1346666666666665, + "grad_norm": 2.2713677883148193, + "learning_rate": 7.46797153024911e-05, + "loss": 1.7424, + "step": 7053 + }, + { + "epoch": 3.135111111111111, + "grad_norm": 2.4536499977111816, + "learning_rate": 7.466192170818506e-05, + "loss": 1.5837, + "step": 7054 + }, + { + "epoch": 3.1355555555555554, + "grad_norm": 1.8696529865264893, + "learning_rate": 7.464412811387901e-05, + "loss": 0.6377, + "step": 7055 + }, + { + "epoch": 3.136, + "grad_norm": 3.031843900680542, + "learning_rate": 7.462633451957296e-05, + "loss": 1.8678, + "step": 7056 + }, + { + "epoch": 3.1364444444444444, + "grad_norm": 2.593820333480835, + "learning_rate": 7.460854092526691e-05, + "loss": 1.4144, + "step": 7057 + }, + { + "epoch": 3.136888888888889, + "grad_norm": 2.432314872741699, + "learning_rate": 7.459074733096086e-05, + "loss": 1.5448, + "step": 7058 + }, + { + "epoch": 3.1373333333333333, + "grad_norm": 2.6551852226257324, + "learning_rate": 7.45729537366548e-05, + "loss": 1.3579, + "step": 7059 + }, + { + "epoch": 3.137777777777778, + "grad_norm": 3.1164588928222656, + "learning_rate": 7.455516014234876e-05, + "loss": 1.927, + "step": 7060 + }, + { + "epoch": 3.1382222222222222, + "grad_norm": 2.72015643119812, + "learning_rate": 7.453736654804271e-05, + "loss": 1.3657, + "step": 7061 + }, + { + "epoch": 3.1386666666666665, + "grad_norm": 3.068243980407715, + "learning_rate": 7.451957295373666e-05, + "loss": 1.4239, + "step": 7062 + }, + { + "epoch": 3.139111111111111, + "grad_norm": 2.426748514175415, + "learning_rate": 7.450177935943061e-05, + "loss": 1.1847, + "step": 7063 + }, + { + "epoch": 3.1395555555555554, + "grad_norm": 2.8654985427856445, + "learning_rate": 7.448398576512457e-05, + "loss": 1.5308, + "step": 7064 + }, + { + "epoch": 3.14, + "grad_norm": 1.9712382555007935, + "learning_rate": 7.44661921708185e-05, + "loss": 0.7233, + "step": 7065 + }, + { + "epoch": 3.1404444444444444, + "grad_norm": 1.8847380876541138, + "learning_rate": 7.444839857651246e-05, + "loss": 0.7311, + "step": 7066 + }, + { + "epoch": 3.140888888888889, + "grad_norm": 2.814565658569336, + "learning_rate": 7.443060498220641e-05, + "loss": 1.1762, + "step": 7067 + }, + { + "epoch": 3.1413333333333333, + "grad_norm": 2.49889874458313, + "learning_rate": 7.441281138790035e-05, + "loss": 1.2439, + "step": 7068 + }, + { + "epoch": 3.1417777777777776, + "grad_norm": 3.1838126182556152, + "learning_rate": 7.439501779359431e-05, + "loss": 1.3337, + "step": 7069 + }, + { + "epoch": 3.1422222222222222, + "grad_norm": 2.6192221641540527, + "learning_rate": 
7.437722419928827e-05, + "loss": 1.065, + "step": 7070 + }, + { + "epoch": 3.1426666666666665, + "grad_norm": 2.363414764404297, + "learning_rate": 7.435943060498221e-05, + "loss": 1.0256, + "step": 7071 + }, + { + "epoch": 3.143111111111111, + "grad_norm": 1.8646711111068726, + "learning_rate": 7.434163701067615e-05, + "loss": 0.6336, + "step": 7072 + }, + { + "epoch": 3.1435555555555554, + "grad_norm": 2.434340238571167, + "learning_rate": 7.432384341637011e-05, + "loss": 1.0433, + "step": 7073 + }, + { + "epoch": 3.144, + "grad_norm": 3.6076550483703613, + "learning_rate": 7.430604982206407e-05, + "loss": 1.1586, + "step": 7074 + }, + { + "epoch": 3.1444444444444444, + "grad_norm": 3.699741840362549, + "learning_rate": 7.428825622775801e-05, + "loss": 1.5315, + "step": 7075 + }, + { + "epoch": 3.144888888888889, + "grad_norm": 3.15507435798645, + "learning_rate": 7.427046263345197e-05, + "loss": 1.3789, + "step": 7076 + }, + { + "epoch": 3.1453333333333333, + "grad_norm": 2.9796347618103027, + "learning_rate": 7.425266903914591e-05, + "loss": 1.083, + "step": 7077 + }, + { + "epoch": 3.145777777777778, + "grad_norm": 2.9139842987060547, + "learning_rate": 7.423487544483985e-05, + "loss": 1.3829, + "step": 7078 + }, + { + "epoch": 3.1462222222222223, + "grad_norm": 3.6654200553894043, + "learning_rate": 7.421708185053381e-05, + "loss": 1.2362, + "step": 7079 + }, + { + "epoch": 3.1466666666666665, + "grad_norm": 3.540692090988159, + "learning_rate": 7.419928825622776e-05, + "loss": 1.4958, + "step": 7080 + }, + { + "epoch": 3.147111111111111, + "grad_norm": 3.061079740524292, + "learning_rate": 7.418149466192171e-05, + "loss": 1.317, + "step": 7081 + }, + { + "epoch": 3.1475555555555554, + "grad_norm": 3.567028522491455, + "learning_rate": 7.416370106761566e-05, + "loss": 1.3618, + "step": 7082 + }, + { + "epoch": 3.148, + "grad_norm": 3.034331798553467, + "learning_rate": 7.414590747330962e-05, + "loss": 0.9736, + "step": 7083 + }, + { + "epoch": 3.1484444444444444, + "grad_norm": 0.4216180741786957, + "learning_rate": 7.412811387900356e-05, + "loss": 0.0315, + "step": 7084 + }, + { + "epoch": 3.148888888888889, + "grad_norm": 2.6628007888793945, + "learning_rate": 7.41103202846975e-05, + "loss": 0.8623, + "step": 7085 + }, + { + "epoch": 3.1493333333333333, + "grad_norm": 3.0110785961151123, + "learning_rate": 7.409252669039146e-05, + "loss": 1.0985, + "step": 7086 + }, + { + "epoch": 3.1497777777777776, + "grad_norm": 3.624995231628418, + "learning_rate": 7.407473309608542e-05, + "loss": 1.1393, + "step": 7087 + }, + { + "epoch": 3.1502222222222223, + "grad_norm": 4.227591514587402, + "learning_rate": 7.405693950177936e-05, + "loss": 1.0868, + "step": 7088 + }, + { + "epoch": 3.1506666666666665, + "grad_norm": 3.4621124267578125, + "learning_rate": 7.403914590747332e-05, + "loss": 1.427, + "step": 7089 + }, + { + "epoch": 3.151111111111111, + "grad_norm": 3.3510584831237793, + "learning_rate": 7.402135231316726e-05, + "loss": 1.4212, + "step": 7090 + }, + { + "epoch": 3.1515555555555554, + "grad_norm": 3.500433921813965, + "learning_rate": 7.40035587188612e-05, + "loss": 1.1449, + "step": 7091 + }, + { + "epoch": 3.152, + "grad_norm": 2.601447582244873, + "learning_rate": 7.398576512455516e-05, + "loss": 0.8084, + "step": 7092 + }, + { + "epoch": 3.1524444444444444, + "grad_norm": 3.0802958011627197, + "learning_rate": 7.396797153024912e-05, + "loss": 1.1565, + "step": 7093 + }, + { + "epoch": 3.152888888888889, + "grad_norm": 3.7664129734039307, + "learning_rate": 
7.395017793594306e-05, + "loss": 1.3433, + "step": 7094 + }, + { + "epoch": 3.1533333333333333, + "grad_norm": 3.885650157928467, + "learning_rate": 7.393238434163702e-05, + "loss": 1.2266, + "step": 7095 + }, + { + "epoch": 3.153777777777778, + "grad_norm": 5.502748489379883, + "learning_rate": 7.391459074733097e-05, + "loss": 1.8502, + "step": 7096 + }, + { + "epoch": 3.1542222222222223, + "grad_norm": 3.670057535171509, + "learning_rate": 7.389679715302492e-05, + "loss": 1.4684, + "step": 7097 + }, + { + "epoch": 3.1546666666666665, + "grad_norm": 4.47036075592041, + "learning_rate": 7.387900355871886e-05, + "loss": 1.183, + "step": 7098 + }, + { + "epoch": 3.155111111111111, + "grad_norm": 0.5041842460632324, + "learning_rate": 7.386120996441282e-05, + "loss": 0.0461, + "step": 7099 + }, + { + "epoch": 3.1555555555555554, + "grad_norm": 3.492530107498169, + "learning_rate": 7.384341637010677e-05, + "loss": 0.8281, + "step": 7100 + }, + { + "epoch": 3.156, + "grad_norm": 2.3296926021575928, + "learning_rate": 7.382562277580072e-05, + "loss": 1.9542, + "step": 7101 + }, + { + "epoch": 3.1564444444444444, + "grad_norm": 2.73773455619812, + "learning_rate": 7.380782918149467e-05, + "loss": 1.7659, + "step": 7102 + }, + { + "epoch": 3.156888888888889, + "grad_norm": 2.8689823150634766, + "learning_rate": 7.379003558718862e-05, + "loss": 1.7431, + "step": 7103 + }, + { + "epoch": 3.1573333333333333, + "grad_norm": 2.7786591053009033, + "learning_rate": 7.377224199288256e-05, + "loss": 1.6562, + "step": 7104 + }, + { + "epoch": 3.1577777777777776, + "grad_norm": 3.037637710571289, + "learning_rate": 7.375444839857651e-05, + "loss": 1.746, + "step": 7105 + }, + { + "epoch": 3.1582222222222223, + "grad_norm": 3.2228243350982666, + "learning_rate": 7.373665480427047e-05, + "loss": 1.8074, + "step": 7106 + }, + { + "epoch": 3.1586666666666665, + "grad_norm": 2.8026838302612305, + "learning_rate": 7.371886120996441e-05, + "loss": 1.786, + "step": 7107 + }, + { + "epoch": 3.159111111111111, + "grad_norm": 3.4075815677642822, + "learning_rate": 7.370106761565837e-05, + "loss": 1.6417, + "step": 7108 + }, + { + "epoch": 3.1595555555555555, + "grad_norm": 2.9315574169158936, + "learning_rate": 7.368327402135233e-05, + "loss": 1.3821, + "step": 7109 + }, + { + "epoch": 3.16, + "grad_norm": 3.2183048725128174, + "learning_rate": 7.366548042704626e-05, + "loss": 1.3795, + "step": 7110 + }, + { + "epoch": 3.1604444444444444, + "grad_norm": 2.97540020942688, + "learning_rate": 7.364768683274021e-05, + "loss": 1.6864, + "step": 7111 + }, + { + "epoch": 3.160888888888889, + "grad_norm": 3.3829736709594727, + "learning_rate": 7.362989323843417e-05, + "loss": 1.5294, + "step": 7112 + }, + { + "epoch": 3.1613333333333333, + "grad_norm": 3.1063239574432373, + "learning_rate": 7.361209964412811e-05, + "loss": 1.451, + "step": 7113 + }, + { + "epoch": 3.1617777777777776, + "grad_norm": 3.2091407775878906, + "learning_rate": 7.359430604982207e-05, + "loss": 1.1878, + "step": 7114 + }, + { + "epoch": 3.1622222222222223, + "grad_norm": 3.398361921310425, + "learning_rate": 7.357651245551603e-05, + "loss": 1.5108, + "step": 7115 + }, + { + "epoch": 3.1626666666666665, + "grad_norm": 3.1675875186920166, + "learning_rate": 7.355871886120997e-05, + "loss": 1.294, + "step": 7116 + }, + { + "epoch": 3.163111111111111, + "grad_norm": 2.810655117034912, + "learning_rate": 7.354092526690391e-05, + "loss": 1.2984, + "step": 7117 + }, + { + "epoch": 3.1635555555555555, + "grad_norm": 3.2484841346740723, + "learning_rate": 
7.352313167259787e-05, + "loss": 1.1205, + "step": 7118 + }, + { + "epoch": 3.164, + "grad_norm": 3.3352835178375244, + "learning_rate": 7.350533807829182e-05, + "loss": 1.5339, + "step": 7119 + }, + { + "epoch": 3.1644444444444444, + "grad_norm": 3.3871214389801025, + "learning_rate": 7.348754448398577e-05, + "loss": 1.3406, + "step": 7120 + }, + { + "epoch": 3.164888888888889, + "grad_norm": 3.104323148727417, + "learning_rate": 7.346975088967972e-05, + "loss": 1.636, + "step": 7121 + }, + { + "epoch": 3.1653333333333333, + "grad_norm": 2.902137041091919, + "learning_rate": 7.345195729537368e-05, + "loss": 1.4295, + "step": 7122 + }, + { + "epoch": 3.1657777777777776, + "grad_norm": 2.7861342430114746, + "learning_rate": 7.343416370106761e-05, + "loss": 0.9665, + "step": 7123 + }, + { + "epoch": 3.1662222222222223, + "grad_norm": 2.9004876613616943, + "learning_rate": 7.341637010676157e-05, + "loss": 1.2588, + "step": 7124 + }, + { + "epoch": 3.1666666666666665, + "grad_norm": 2.671241521835327, + "learning_rate": 7.339857651245552e-05, + "loss": 1.2189, + "step": 7125 + }, + { + "epoch": 3.167111111111111, + "grad_norm": 3.0016653537750244, + "learning_rate": 7.338078291814947e-05, + "loss": 0.9757, + "step": 7126 + }, + { + "epoch": 3.1675555555555555, + "grad_norm": 3.0266857147216797, + "learning_rate": 7.336298932384342e-05, + "loss": 1.2218, + "step": 7127 + }, + { + "epoch": 3.168, + "grad_norm": 2.868429183959961, + "learning_rate": 7.334519572953738e-05, + "loss": 0.8355, + "step": 7128 + }, + { + "epoch": 3.1684444444444444, + "grad_norm": 3.6476175785064697, + "learning_rate": 7.332740213523132e-05, + "loss": 0.864, + "step": 7129 + }, + { + "epoch": 3.168888888888889, + "grad_norm": 3.347186803817749, + "learning_rate": 7.330960854092526e-05, + "loss": 1.1723, + "step": 7130 + }, + { + "epoch": 3.1693333333333333, + "grad_norm": 3.4044735431671143, + "learning_rate": 7.329181494661922e-05, + "loss": 1.0311, + "step": 7131 + }, + { + "epoch": 3.1697777777777776, + "grad_norm": 2.955648422241211, + "learning_rate": 7.327402135231318e-05, + "loss": 1.1082, + "step": 7132 + }, + { + "epoch": 3.1702222222222223, + "grad_norm": 3.7140543460845947, + "learning_rate": 7.325622775800712e-05, + "loss": 0.9562, + "step": 7133 + }, + { + "epoch": 3.1706666666666665, + "grad_norm": 2.378147840499878, + "learning_rate": 7.323843416370108e-05, + "loss": 0.697, + "step": 7134 + }, + { + "epoch": 3.171111111111111, + "grad_norm": 3.5638556480407715, + "learning_rate": 7.322064056939502e-05, + "loss": 1.2156, + "step": 7135 + }, + { + "epoch": 3.1715555555555555, + "grad_norm": 3.8616840839385986, + "learning_rate": 7.320284697508896e-05, + "loss": 0.9394, + "step": 7136 + }, + { + "epoch": 3.172, + "grad_norm": 3.537442684173584, + "learning_rate": 7.318505338078292e-05, + "loss": 1.3647, + "step": 7137 + }, + { + "epoch": 3.1724444444444444, + "grad_norm": 3.341238260269165, + "learning_rate": 7.316725978647688e-05, + "loss": 1.0796, + "step": 7138 + }, + { + "epoch": 3.172888888888889, + "grad_norm": 3.2897703647613525, + "learning_rate": 7.314946619217082e-05, + "loss": 0.9163, + "step": 7139 + }, + { + "epoch": 3.1733333333333333, + "grad_norm": 3.147260904312134, + "learning_rate": 7.313167259786478e-05, + "loss": 0.9676, + "step": 7140 + }, + { + "epoch": 3.1737777777777776, + "grad_norm": 4.187511444091797, + "learning_rate": 7.311387900355873e-05, + "loss": 1.3926, + "step": 7141 + }, + { + "epoch": 3.1742222222222223, + "grad_norm": 4.7306809425354, + "learning_rate": 
7.309608540925267e-05, + "loss": 1.3427, + "step": 7142 + }, + { + "epoch": 3.1746666666666665, + "grad_norm": 3.0592243671417236, + "learning_rate": 7.307829181494662e-05, + "loss": 0.7532, + "step": 7143 + }, + { + "epoch": 3.175111111111111, + "grad_norm": 4.022464275360107, + "learning_rate": 7.306049822064057e-05, + "loss": 1.3008, + "step": 7144 + }, + { + "epoch": 3.1755555555555555, + "grad_norm": 4.482253551483154, + "learning_rate": 7.304270462633453e-05, + "loss": 1.1332, + "step": 7145 + }, + { + "epoch": 3.176, + "grad_norm": 3.72058367729187, + "learning_rate": 7.302491103202847e-05, + "loss": 0.7426, + "step": 7146 + }, + { + "epoch": 3.1764444444444444, + "grad_norm": 4.921838760375977, + "learning_rate": 7.300711743772243e-05, + "loss": 1.0258, + "step": 7147 + }, + { + "epoch": 3.176888888888889, + "grad_norm": 4.758439540863037, + "learning_rate": 7.298932384341637e-05, + "loss": 1.3546, + "step": 7148 + }, + { + "epoch": 3.1773333333333333, + "grad_norm": 3.1380600929260254, + "learning_rate": 7.297153024911032e-05, + "loss": 1.0218, + "step": 7149 + }, + { + "epoch": 3.1777777777777776, + "grad_norm": 7.19141149520874, + "learning_rate": 7.295373665480427e-05, + "loss": 1.2103, + "step": 7150 + }, + { + "epoch": 3.1782222222222223, + "grad_norm": 1.9877086877822876, + "learning_rate": 7.293594306049823e-05, + "loss": 1.8951, + "step": 7151 + }, + { + "epoch": 3.1786666666666665, + "grad_norm": 2.509326934814453, + "learning_rate": 7.291814946619217e-05, + "loss": 1.9882, + "step": 7152 + }, + { + "epoch": 3.179111111111111, + "grad_norm": 2.56880521774292, + "learning_rate": 7.290035587188613e-05, + "loss": 1.8781, + "step": 7153 + }, + { + "epoch": 3.1795555555555555, + "grad_norm": 2.692734718322754, + "learning_rate": 7.288256227758008e-05, + "loss": 1.9204, + "step": 7154 + }, + { + "epoch": 3.18, + "grad_norm": 2.635511875152588, + "learning_rate": 7.286476868327401e-05, + "loss": 1.1913, + "step": 7155 + }, + { + "epoch": 3.1804444444444444, + "grad_norm": 2.8718338012695312, + "learning_rate": 7.284697508896797e-05, + "loss": 1.432, + "step": 7156 + }, + { + "epoch": 3.180888888888889, + "grad_norm": 3.3866138458251953, + "learning_rate": 7.282918149466193e-05, + "loss": 1.7453, + "step": 7157 + }, + { + "epoch": 3.1813333333333333, + "grad_norm": 3.44844913482666, + "learning_rate": 7.281138790035587e-05, + "loss": 1.3587, + "step": 7158 + }, + { + "epoch": 3.1817777777777776, + "grad_norm": 2.8075125217437744, + "learning_rate": 7.279359430604983e-05, + "loss": 1.5954, + "step": 7159 + }, + { + "epoch": 3.1822222222222223, + "grad_norm": 2.6318461894989014, + "learning_rate": 7.277580071174378e-05, + "loss": 1.2991, + "step": 7160 + }, + { + "epoch": 3.1826666666666665, + "grad_norm": 2.9282736778259277, + "learning_rate": 7.275800711743773e-05, + "loss": 1.5801, + "step": 7161 + }, + { + "epoch": 3.1831111111111112, + "grad_norm": 2.870149850845337, + "learning_rate": 7.274021352313167e-05, + "loss": 0.8409, + "step": 7162 + }, + { + "epoch": 3.1835555555555555, + "grad_norm": 2.717829942703247, + "learning_rate": 7.272241992882563e-05, + "loss": 0.994, + "step": 7163 + }, + { + "epoch": 3.184, + "grad_norm": 2.9092955589294434, + "learning_rate": 7.270462633451958e-05, + "loss": 1.6491, + "step": 7164 + }, + { + "epoch": 3.1844444444444444, + "grad_norm": 2.3932883739471436, + "learning_rate": 7.268683274021352e-05, + "loss": 0.8707, + "step": 7165 + }, + { + "epoch": 3.1848888888888887, + "grad_norm": 3.078213691711426, + "learning_rate": 
7.266903914590748e-05, + "loss": 1.0287, + "step": 7166 + }, + { + "epoch": 3.1853333333333333, + "grad_norm": 3.1520655155181885, + "learning_rate": 7.265124555160144e-05, + "loss": 1.1343, + "step": 7167 + }, + { + "epoch": 3.1857777777777776, + "grad_norm": 3.593202829360962, + "learning_rate": 7.263345195729537e-05, + "loss": 1.276, + "step": 7168 + }, + { + "epoch": 3.1862222222222223, + "grad_norm": 2.6163017749786377, + "learning_rate": 7.261565836298932e-05, + "loss": 1.226, + "step": 7169 + }, + { + "epoch": 3.1866666666666665, + "grad_norm": 2.836480140686035, + "learning_rate": 7.259786476868328e-05, + "loss": 1.2427, + "step": 7170 + }, + { + "epoch": 3.1871111111111112, + "grad_norm": 3.4099297523498535, + "learning_rate": 7.258007117437722e-05, + "loss": 1.4251, + "step": 7171 + }, + { + "epoch": 3.1875555555555555, + "grad_norm": 2.73268461227417, + "learning_rate": 7.256227758007118e-05, + "loss": 1.0855, + "step": 7172 + }, + { + "epoch": 3.188, + "grad_norm": 2.9737985134124756, + "learning_rate": 7.254448398576514e-05, + "loss": 1.451, + "step": 7173 + }, + { + "epoch": 3.1884444444444444, + "grad_norm": 3.1273388862609863, + "learning_rate": 7.252669039145908e-05, + "loss": 1.6291, + "step": 7174 + }, + { + "epoch": 3.188888888888889, + "grad_norm": 3.3678598403930664, + "learning_rate": 7.250889679715302e-05, + "loss": 1.6578, + "step": 7175 + }, + { + "epoch": 3.1893333333333334, + "grad_norm": 3.497072458267212, + "learning_rate": 7.249110320284698e-05, + "loss": 1.4176, + "step": 7176 + }, + { + "epoch": 3.1897777777777776, + "grad_norm": 3.735827922821045, + "learning_rate": 7.247330960854094e-05, + "loss": 1.5746, + "step": 7177 + }, + { + "epoch": 3.1902222222222223, + "grad_norm": 2.853653907775879, + "learning_rate": 7.245551601423488e-05, + "loss": 1.1089, + "step": 7178 + }, + { + "epoch": 3.1906666666666665, + "grad_norm": 3.3315930366516113, + "learning_rate": 7.243772241992883e-05, + "loss": 1.1311, + "step": 7179 + }, + { + "epoch": 3.1911111111111112, + "grad_norm": 2.9672279357910156, + "learning_rate": 7.241992882562279e-05, + "loss": 1.3171, + "step": 7180 + }, + { + "epoch": 3.1915555555555555, + "grad_norm": 3.4035391807556152, + "learning_rate": 7.240213523131672e-05, + "loss": 0.9667, + "step": 7181 + }, + { + "epoch": 3.192, + "grad_norm": 3.72255539894104, + "learning_rate": 7.238434163701068e-05, + "loss": 1.36, + "step": 7182 + }, + { + "epoch": 3.1924444444444444, + "grad_norm": 3.1913297176361084, + "learning_rate": 7.236654804270463e-05, + "loss": 1.0758, + "step": 7183 + }, + { + "epoch": 3.1928888888888887, + "grad_norm": 2.7058324813842773, + "learning_rate": 7.234875444839858e-05, + "loss": 0.6656, + "step": 7184 + }, + { + "epoch": 3.1933333333333334, + "grad_norm": 3.929481029510498, + "learning_rate": 7.233096085409253e-05, + "loss": 1.6081, + "step": 7185 + }, + { + "epoch": 3.1937777777777776, + "grad_norm": 3.4291040897369385, + "learning_rate": 7.231316725978649e-05, + "loss": 1.363, + "step": 7186 + }, + { + "epoch": 3.1942222222222223, + "grad_norm": 3.8611419200897217, + "learning_rate": 7.229537366548043e-05, + "loss": 1.6286, + "step": 7187 + }, + { + "epoch": 3.1946666666666665, + "grad_norm": 2.4565515518188477, + "learning_rate": 7.227758007117438e-05, + "loss": 0.4925, + "step": 7188 + }, + { + "epoch": 3.1951111111111112, + "grad_norm": 2.0368804931640625, + "learning_rate": 7.225978647686833e-05, + "loss": 0.5928, + "step": 7189 + }, + { + "epoch": 3.1955555555555555, + "grad_norm": 3.8311848640441895, + 
"learning_rate": 7.224199288256229e-05, + "loss": 1.3479, + "step": 7190 + }, + { + "epoch": 3.196, + "grad_norm": 4.187136173248291, + "learning_rate": 7.222419928825623e-05, + "loss": 0.9687, + "step": 7191 + }, + { + "epoch": 3.1964444444444444, + "grad_norm": 3.8171749114990234, + "learning_rate": 7.220640569395019e-05, + "loss": 1.265, + "step": 7192 + }, + { + "epoch": 3.196888888888889, + "grad_norm": 3.7234129905700684, + "learning_rate": 7.218861209964413e-05, + "loss": 1.1282, + "step": 7193 + }, + { + "epoch": 3.1973333333333334, + "grad_norm": 4.388239860534668, + "learning_rate": 7.217081850533807e-05, + "loss": 1.2207, + "step": 7194 + }, + { + "epoch": 3.1977777777777776, + "grad_norm": 3.327991247177124, + "learning_rate": 7.215302491103203e-05, + "loss": 1.1965, + "step": 7195 + }, + { + "epoch": 3.1982222222222223, + "grad_norm": 4.21957540512085, + "learning_rate": 7.213523131672599e-05, + "loss": 1.4889, + "step": 7196 + }, + { + "epoch": 3.1986666666666665, + "grad_norm": 4.019267559051514, + "learning_rate": 7.211743772241993e-05, + "loss": 1.3072, + "step": 7197 + }, + { + "epoch": 3.1991111111111112, + "grad_norm": 3.848752975463867, + "learning_rate": 7.209964412811389e-05, + "loss": 1.0561, + "step": 7198 + }, + { + "epoch": 3.1995555555555555, + "grad_norm": 3.2341089248657227, + "learning_rate": 7.208185053380784e-05, + "loss": 0.545, + "step": 7199 + }, + { + "epoch": 3.2, + "grad_norm": 3.097386598587036, + "learning_rate": 7.206405693950177e-05, + "loss": 0.2964, + "step": 7200 + }, + { + "epoch": 3.2004444444444444, + "grad_norm": 2.824582099914551, + "learning_rate": 7.204626334519573e-05, + "loss": 1.753, + "step": 7201 + }, + { + "epoch": 3.2008888888888887, + "grad_norm": 2.1013436317443848, + "learning_rate": 7.202846975088968e-05, + "loss": 0.606, + "step": 7202 + }, + { + "epoch": 3.2013333333333334, + "grad_norm": 2.8657641410827637, + "learning_rate": 7.201067615658363e-05, + "loss": 1.8523, + "step": 7203 + }, + { + "epoch": 3.2017777777777776, + "grad_norm": 2.5109121799468994, + "learning_rate": 7.199288256227758e-05, + "loss": 1.1317, + "step": 7204 + }, + { + "epoch": 3.2022222222222223, + "grad_norm": 3.1287527084350586, + "learning_rate": 7.197508896797154e-05, + "loss": 1.8141, + "step": 7205 + }, + { + "epoch": 3.2026666666666666, + "grad_norm": 3.054079294204712, + "learning_rate": 7.195729537366548e-05, + "loss": 1.8931, + "step": 7206 + }, + { + "epoch": 3.2031111111111112, + "grad_norm": 3.2487199306488037, + "learning_rate": 7.193950177935943e-05, + "loss": 1.4648, + "step": 7207 + }, + { + "epoch": 3.2035555555555555, + "grad_norm": 2.855591058731079, + "learning_rate": 7.192170818505338e-05, + "loss": 1.2971, + "step": 7208 + }, + { + "epoch": 3.204, + "grad_norm": 2.5365679264068604, + "learning_rate": 7.190391459074734e-05, + "loss": 1.2072, + "step": 7209 + }, + { + "epoch": 3.2044444444444444, + "grad_norm": 3.0599007606506348, + "learning_rate": 7.188612099644128e-05, + "loss": 1.6128, + "step": 7210 + }, + { + "epoch": 3.204888888888889, + "grad_norm": 3.139268159866333, + "learning_rate": 7.186832740213524e-05, + "loss": 1.2958, + "step": 7211 + }, + { + "epoch": 3.2053333333333334, + "grad_norm": 3.227274179458618, + "learning_rate": 7.18505338078292e-05, + "loss": 1.4077, + "step": 7212 + }, + { + "epoch": 3.2057777777777776, + "grad_norm": 3.3886468410491943, + "learning_rate": 7.183274021352313e-05, + "loss": 1.5784, + "step": 7213 + }, + { + "epoch": 3.2062222222222223, + "grad_norm": 3.387749195098877, + "learning_rate": 
7.181494661921708e-05, + "loss": 1.4535, + "step": 7214 + }, + { + "epoch": 3.2066666666666666, + "grad_norm": 2.8726046085357666, + "learning_rate": 7.179715302491104e-05, + "loss": 1.1744, + "step": 7215 + }, + { + "epoch": 3.2071111111111112, + "grad_norm": 2.2621724605560303, + "learning_rate": 7.177935943060498e-05, + "loss": 0.6728, + "step": 7216 + }, + { + "epoch": 3.2075555555555555, + "grad_norm": 3.217412233352661, + "learning_rate": 7.176156583629894e-05, + "loss": 1.6073, + "step": 7217 + }, + { + "epoch": 3.208, + "grad_norm": 2.7585723400115967, + "learning_rate": 7.17437722419929e-05, + "loss": 0.9378, + "step": 7218 + }, + { + "epoch": 3.2084444444444444, + "grad_norm": 3.4076976776123047, + "learning_rate": 7.172597864768684e-05, + "loss": 1.3989, + "step": 7219 + }, + { + "epoch": 3.2088888888888887, + "grad_norm": 2.9216151237487793, + "learning_rate": 7.170818505338078e-05, + "loss": 1.206, + "step": 7220 + }, + { + "epoch": 3.2093333333333334, + "grad_norm": 3.7456247806549072, + "learning_rate": 7.169039145907474e-05, + "loss": 1.8443, + "step": 7221 + }, + { + "epoch": 3.2097777777777776, + "grad_norm": 2.7865076065063477, + "learning_rate": 7.167259786476869e-05, + "loss": 0.9977, + "step": 7222 + }, + { + "epoch": 3.2102222222222223, + "grad_norm": 3.0865001678466797, + "learning_rate": 7.165480427046264e-05, + "loss": 1.3247, + "step": 7223 + }, + { + "epoch": 3.2106666666666666, + "grad_norm": 3.52168607711792, + "learning_rate": 7.163701067615659e-05, + "loss": 1.0324, + "step": 7224 + }, + { + "epoch": 3.2111111111111112, + "grad_norm": 4.151998519897461, + "learning_rate": 7.161921708185055e-05, + "loss": 1.5058, + "step": 7225 + }, + { + "epoch": 3.2115555555555555, + "grad_norm": 3.7610089778900146, + "learning_rate": 7.160142348754448e-05, + "loss": 1.8216, + "step": 7226 + }, + { + "epoch": 3.212, + "grad_norm": 2.9817423820495605, + "learning_rate": 7.158362989323843e-05, + "loss": 1.3767, + "step": 7227 + }, + { + "epoch": 3.2124444444444444, + "grad_norm": 2.8566582202911377, + "learning_rate": 7.156583629893239e-05, + "loss": 0.9962, + "step": 7228 + }, + { + "epoch": 3.2128888888888887, + "grad_norm": 2.9715373516082764, + "learning_rate": 7.154804270462633e-05, + "loss": 1.4952, + "step": 7229 + }, + { + "epoch": 3.2133333333333334, + "grad_norm": 3.730404853820801, + "learning_rate": 7.153024911032029e-05, + "loss": 1.5204, + "step": 7230 + }, + { + "epoch": 3.2137777777777776, + "grad_norm": 3.011878490447998, + "learning_rate": 7.151245551601425e-05, + "loss": 1.1504, + "step": 7231 + }, + { + "epoch": 3.2142222222222223, + "grad_norm": 3.1973698139190674, + "learning_rate": 7.149466192170819e-05, + "loss": 0.9828, + "step": 7232 + }, + { + "epoch": 3.2146666666666666, + "grad_norm": 3.3064098358154297, + "learning_rate": 7.147686832740213e-05, + "loss": 1.5142, + "step": 7233 + }, + { + "epoch": 3.2151111111111113, + "grad_norm": 2.9979686737060547, + "learning_rate": 7.145907473309609e-05, + "loss": 1.1844, + "step": 7234 + }, + { + "epoch": 3.2155555555555555, + "grad_norm": 3.717355489730835, + "learning_rate": 7.144128113879005e-05, + "loss": 1.7117, + "step": 7235 + }, + { + "epoch": 3.216, + "grad_norm": 4.831557273864746, + "learning_rate": 7.142348754448399e-05, + "loss": 1.553, + "step": 7236 + }, + { + "epoch": 3.2164444444444444, + "grad_norm": 2.6884803771972656, + "learning_rate": 7.140569395017795e-05, + "loss": 0.7039, + "step": 7237 + }, + { + "epoch": 3.2168888888888887, + "grad_norm": 3.3653738498687744, + "learning_rate": 
7.138790035587189e-05, + "loss": 1.144, + "step": 7238 + }, + { + "epoch": 3.2173333333333334, + "grad_norm": 3.712017774581909, + "learning_rate": 7.137010676156583e-05, + "loss": 1.2988, + "step": 7239 + }, + { + "epoch": 3.2177777777777776, + "grad_norm": 4.0276360511779785, + "learning_rate": 7.135231316725979e-05, + "loss": 1.0672, + "step": 7240 + }, + { + "epoch": 3.2182222222222223, + "grad_norm": 3.8651421070098877, + "learning_rate": 7.133451957295374e-05, + "loss": 1.5012, + "step": 7241 + }, + { + "epoch": 3.2186666666666666, + "grad_norm": 3.2855637073516846, + "learning_rate": 7.131672597864769e-05, + "loss": 0.9562, + "step": 7242 + }, + { + "epoch": 3.2191111111111113, + "grad_norm": 3.601529598236084, + "learning_rate": 7.129893238434164e-05, + "loss": 1.2631, + "step": 7243 + }, + { + "epoch": 3.2195555555555555, + "grad_norm": 3.80122709274292, + "learning_rate": 7.12811387900356e-05, + "loss": 1.2177, + "step": 7244 + }, + { + "epoch": 3.22, + "grad_norm": 4.019015789031982, + "learning_rate": 7.126334519572953e-05, + "loss": 1.2818, + "step": 7245 + }, + { + "epoch": 3.2204444444444444, + "grad_norm": 3.7049124240875244, + "learning_rate": 7.124555160142349e-05, + "loss": 0.9796, + "step": 7246 + }, + { + "epoch": 3.2208888888888887, + "grad_norm": 4.283298969268799, + "learning_rate": 7.122775800711744e-05, + "loss": 1.1318, + "step": 7247 + }, + { + "epoch": 3.2213333333333334, + "grad_norm": 5.531714916229248, + "learning_rate": 7.120996441281139e-05, + "loss": 1.4307, + "step": 7248 + }, + { + "epoch": 3.2217777777777776, + "grad_norm": 3.2561047077178955, + "learning_rate": 7.119217081850534e-05, + "loss": 1.09, + "step": 7249 + }, + { + "epoch": 3.2222222222222223, + "grad_norm": 4.089896202087402, + "learning_rate": 7.11743772241993e-05, + "loss": 0.5095, + "step": 7250 + }, + { + "epoch": 3.2226666666666666, + "grad_norm": 2.2933907508850098, + "learning_rate": 7.115658362989324e-05, + "loss": 2.2543, + "step": 7251 + }, + { + "epoch": 3.2231111111111113, + "grad_norm": 2.559863567352295, + "learning_rate": 7.113879003558718e-05, + "loss": 1.9739, + "step": 7252 + }, + { + "epoch": 3.2235555555555555, + "grad_norm": 1.9117332696914673, + "learning_rate": 7.112099644128114e-05, + "loss": 0.9344, + "step": 7253 + }, + { + "epoch": 3.224, + "grad_norm": 2.9356801509857178, + "learning_rate": 7.11032028469751e-05, + "loss": 1.5221, + "step": 7254 + }, + { + "epoch": 3.2244444444444444, + "grad_norm": 2.627389430999756, + "learning_rate": 7.108540925266904e-05, + "loss": 1.4828, + "step": 7255 + }, + { + "epoch": 3.2248888888888887, + "grad_norm": 3.1114463806152344, + "learning_rate": 7.1067615658363e-05, + "loss": 1.4934, + "step": 7256 + }, + { + "epoch": 3.2253333333333334, + "grad_norm": 2.5716917514801025, + "learning_rate": 7.104982206405695e-05, + "loss": 1.0995, + "step": 7257 + }, + { + "epoch": 3.2257777777777776, + "grad_norm": 2.8480474948883057, + "learning_rate": 7.103202846975088e-05, + "loss": 1.7053, + "step": 7258 + }, + { + "epoch": 3.2262222222222223, + "grad_norm": 2.725034475326538, + "learning_rate": 7.101423487544484e-05, + "loss": 1.1406, + "step": 7259 + }, + { + "epoch": 3.2266666666666666, + "grad_norm": 2.4350764751434326, + "learning_rate": 7.09964412811388e-05, + "loss": 1.2509, + "step": 7260 + }, + { + "epoch": 3.2271111111111113, + "grad_norm": 3.65389347076416, + "learning_rate": 7.097864768683274e-05, + "loss": 2.3713, + "step": 7261 + }, + { + "epoch": 3.2275555555555555, + "grad_norm": 3.542847156524658, + "learning_rate": 
7.09608540925267e-05, + "loss": 1.5322, + "step": 7262 + }, + { + "epoch": 3.228, + "grad_norm": 2.748013496398926, + "learning_rate": 7.094306049822065e-05, + "loss": 1.3331, + "step": 7263 + }, + { + "epoch": 3.2284444444444444, + "grad_norm": 3.1858341693878174, + "learning_rate": 7.09252669039146e-05, + "loss": 1.7167, + "step": 7264 + }, + { + "epoch": 3.2288888888888887, + "grad_norm": 2.994159698486328, + "learning_rate": 7.090747330960854e-05, + "loss": 1.0373, + "step": 7265 + }, + { + "epoch": 3.2293333333333334, + "grad_norm": 2.560145616531372, + "learning_rate": 7.08896797153025e-05, + "loss": 0.9071, + "step": 7266 + }, + { + "epoch": 3.2297777777777776, + "grad_norm": 3.167236566543579, + "learning_rate": 7.087188612099645e-05, + "loss": 1.317, + "step": 7267 + }, + { + "epoch": 3.2302222222222223, + "grad_norm": 2.4366395473480225, + "learning_rate": 7.08540925266904e-05, + "loss": 0.8397, + "step": 7268 + }, + { + "epoch": 3.2306666666666666, + "grad_norm": 3.0970094203948975, + "learning_rate": 7.083629893238435e-05, + "loss": 1.2824, + "step": 7269 + }, + { + "epoch": 3.2311111111111113, + "grad_norm": 3.605058431625366, + "learning_rate": 7.08185053380783e-05, + "loss": 1.4352, + "step": 7270 + }, + { + "epoch": 3.2315555555555555, + "grad_norm": 3.437645673751831, + "learning_rate": 7.080071174377224e-05, + "loss": 1.4561, + "step": 7271 + }, + { + "epoch": 3.232, + "grad_norm": 2.9223389625549316, + "learning_rate": 7.078291814946619e-05, + "loss": 1.146, + "step": 7272 + }, + { + "epoch": 3.2324444444444445, + "grad_norm": 4.0127129554748535, + "learning_rate": 7.076512455516015e-05, + "loss": 1.5512, + "step": 7273 + }, + { + "epoch": 3.2328888888888887, + "grad_norm": 3.0993196964263916, + "learning_rate": 7.074733096085409e-05, + "loss": 1.178, + "step": 7274 + }, + { + "epoch": 3.2333333333333334, + "grad_norm": 3.2388551235198975, + "learning_rate": 7.072953736654805e-05, + "loss": 1.1, + "step": 7275 + }, + { + "epoch": 3.2337777777777776, + "grad_norm": 3.486222505569458, + "learning_rate": 7.0711743772242e-05, + "loss": 1.5717, + "step": 7276 + }, + { + "epoch": 3.2342222222222223, + "grad_norm": 3.0996615886688232, + "learning_rate": 7.069395017793595e-05, + "loss": 1.1912, + "step": 7277 + }, + { + "epoch": 3.2346666666666666, + "grad_norm": 3.7221884727478027, + "learning_rate": 7.067615658362989e-05, + "loss": 1.3382, + "step": 7278 + }, + { + "epoch": 3.2351111111111113, + "grad_norm": 2.7222108840942383, + "learning_rate": 7.065836298932385e-05, + "loss": 1.0546, + "step": 7279 + }, + { + "epoch": 3.2355555555555555, + "grad_norm": 3.1601808071136475, + "learning_rate": 7.06405693950178e-05, + "loss": 1.0409, + "step": 7280 + }, + { + "epoch": 3.2359999999999998, + "grad_norm": 3.1560659408569336, + "learning_rate": 7.062277580071175e-05, + "loss": 1.3843, + "step": 7281 + }, + { + "epoch": 3.2364444444444445, + "grad_norm": 2.998133897781372, + "learning_rate": 7.06049822064057e-05, + "loss": 1.227, + "step": 7282 + }, + { + "epoch": 3.2368888888888887, + "grad_norm": 2.981759548187256, + "learning_rate": 7.058718861209965e-05, + "loss": 1.0669, + "step": 7283 + }, + { + "epoch": 3.2373333333333334, + "grad_norm": 4.712173938751221, + "learning_rate": 7.056939501779359e-05, + "loss": 1.5214, + "step": 7284 + }, + { + "epoch": 3.2377777777777776, + "grad_norm": 3.807121515274048, + "learning_rate": 7.055160142348755e-05, + "loss": 1.1862, + "step": 7285 + }, + { + "epoch": 3.2382222222222223, + "grad_norm": 3.189521312713623, + "learning_rate": 
7.05338078291815e-05, + "loss": 1.0769, + "step": 7286 + }, + { + "epoch": 3.2386666666666666, + "grad_norm": 3.0515081882476807, + "learning_rate": 7.051601423487545e-05, + "loss": 0.9761, + "step": 7287 + }, + { + "epoch": 3.2391111111111113, + "grad_norm": 3.679802894592285, + "learning_rate": 7.04982206405694e-05, + "loss": 1.1306, + "step": 7288 + }, + { + "epoch": 3.2395555555555555, + "grad_norm": 3.8492064476013184, + "learning_rate": 7.048042704626336e-05, + "loss": 1.0187, + "step": 7289 + }, + { + "epoch": 3.24, + "grad_norm": 4.128783226013184, + "learning_rate": 7.046263345195729e-05, + "loss": 1.1335, + "step": 7290 + }, + { + "epoch": 3.2404444444444445, + "grad_norm": 3.4628500938415527, + "learning_rate": 7.044483985765124e-05, + "loss": 1.4144, + "step": 7291 + }, + { + "epoch": 3.2408888888888887, + "grad_norm": 3.7098937034606934, + "learning_rate": 7.04270462633452e-05, + "loss": 1.0089, + "step": 7292 + }, + { + "epoch": 3.2413333333333334, + "grad_norm": 3.4453952312469482, + "learning_rate": 7.040925266903914e-05, + "loss": 1.1287, + "step": 7293 + }, + { + "epoch": 3.2417777777777776, + "grad_norm": 4.36504602432251, + "learning_rate": 7.03914590747331e-05, + "loss": 1.435, + "step": 7294 + }, + { + "epoch": 3.2422222222222223, + "grad_norm": 4.6257476806640625, + "learning_rate": 7.037366548042706e-05, + "loss": 1.1206, + "step": 7295 + }, + { + "epoch": 3.2426666666666666, + "grad_norm": 5.652115821838379, + "learning_rate": 7.0355871886121e-05, + "loss": 1.1406, + "step": 7296 + }, + { + "epoch": 3.2431111111111113, + "grad_norm": 3.9133806228637695, + "learning_rate": 7.033807829181494e-05, + "loss": 0.8279, + "step": 7297 + }, + { + "epoch": 3.2435555555555555, + "grad_norm": 5.016626834869385, + "learning_rate": 7.03202846975089e-05, + "loss": 1.7871, + "step": 7298 + }, + { + "epoch": 3.2439999999999998, + "grad_norm": 2.0724196434020996, + "learning_rate": 7.030249110320286e-05, + "loss": 0.4856, + "step": 7299 + }, + { + "epoch": 3.2444444444444445, + "grad_norm": 5.151663303375244, + "learning_rate": 7.02846975088968e-05, + "loss": 1.195, + "step": 7300 + }, + { + "epoch": 3.2448888888888887, + "grad_norm": 1.3501733541488647, + "learning_rate": 7.026690391459075e-05, + "loss": 0.0231, + "step": 7301 + }, + { + "epoch": 3.2453333333333334, + "grad_norm": 2.257030963897705, + "learning_rate": 7.024911032028471e-05, + "loss": 1.4925, + "step": 7302 + }, + { + "epoch": 3.2457777777777777, + "grad_norm": 2.3157708644866943, + "learning_rate": 7.023131672597864e-05, + "loss": 1.3873, + "step": 7303 + }, + { + "epoch": 3.2462222222222223, + "grad_norm": 2.4795522689819336, + "learning_rate": 7.02135231316726e-05, + "loss": 1.7858, + "step": 7304 + }, + { + "epoch": 3.2466666666666666, + "grad_norm": 3.000107765197754, + "learning_rate": 7.019572953736655e-05, + "loss": 2.1163, + "step": 7305 + }, + { + "epoch": 3.2471111111111113, + "grad_norm": 3.0592353343963623, + "learning_rate": 7.01779359430605e-05, + "loss": 1.615, + "step": 7306 + }, + { + "epoch": 3.2475555555555555, + "grad_norm": 3.055565357208252, + "learning_rate": 7.016014234875445e-05, + "loss": 1.4699, + "step": 7307 + }, + { + "epoch": 3.248, + "grad_norm": 2.8485829830169678, + "learning_rate": 7.014234875444841e-05, + "loss": 1.61, + "step": 7308 + }, + { + "epoch": 3.2484444444444445, + "grad_norm": 0.6298081278800964, + "learning_rate": 7.012455516014235e-05, + "loss": 0.0241, + "step": 7309 + }, + { + "epoch": 3.2488888888888887, + "grad_norm": 3.0662970542907715, + "learning_rate": 
7.01067615658363e-05, + "loss": 1.7241, + "step": 7310 + }, + { + "epoch": 3.2493333333333334, + "grad_norm": 4.107135772705078, + "learning_rate": 7.008896797153025e-05, + "loss": 2.0917, + "step": 7311 + }, + { + "epoch": 3.2497777777777777, + "grad_norm": 3.6882572174072266, + "learning_rate": 7.007117437722421e-05, + "loss": 1.5678, + "step": 7312 + }, + { + "epoch": 3.2502222222222223, + "grad_norm": 2.740238904953003, + "learning_rate": 7.005338078291815e-05, + "loss": 1.3232, + "step": 7313 + }, + { + "epoch": 3.2506666666666666, + "grad_norm": 3.1968138217926025, + "learning_rate": 7.003558718861211e-05, + "loss": 1.3232, + "step": 7314 + }, + { + "epoch": 3.2511111111111113, + "grad_norm": 3.4555680751800537, + "learning_rate": 7.001779359430606e-05, + "loss": 1.6254, + "step": 7315 + }, + { + "epoch": 3.2515555555555555, + "grad_norm": 3.412783145904541, + "learning_rate": 7e-05, + "loss": 1.671, + "step": 7316 + }, + { + "epoch": 3.252, + "grad_norm": 3.239433765411377, + "learning_rate": 6.998220640569395e-05, + "loss": 1.4749, + "step": 7317 + }, + { + "epoch": 3.2524444444444445, + "grad_norm": 2.9741363525390625, + "learning_rate": 6.996441281138791e-05, + "loss": 0.8289, + "step": 7318 + }, + { + "epoch": 3.2528888888888887, + "grad_norm": 3.045731782913208, + "learning_rate": 6.994661921708185e-05, + "loss": 1.6571, + "step": 7319 + }, + { + "epoch": 3.2533333333333334, + "grad_norm": 3.0729551315307617, + "learning_rate": 6.99288256227758e-05, + "loss": 1.4405, + "step": 7320 + }, + { + "epoch": 3.2537777777777777, + "grad_norm": 2.7762842178344727, + "learning_rate": 6.991103202846976e-05, + "loss": 1.1644, + "step": 7321 + }, + { + "epoch": 3.2542222222222223, + "grad_norm": 2.8989222049713135, + "learning_rate": 6.98932384341637e-05, + "loss": 1.2091, + "step": 7322 + }, + { + "epoch": 3.2546666666666666, + "grad_norm": 3.4842469692230225, + "learning_rate": 6.987544483985765e-05, + "loss": 1.489, + "step": 7323 + }, + { + "epoch": 3.2551111111111113, + "grad_norm": 3.5377140045166016, + "learning_rate": 6.98576512455516e-05, + "loss": 0.9816, + "step": 7324 + }, + { + "epoch": 3.2555555555555555, + "grad_norm": 3.346153974533081, + "learning_rate": 6.983985765124555e-05, + "loss": 1.6473, + "step": 7325 + }, + { + "epoch": 3.2560000000000002, + "grad_norm": 3.840273380279541, + "learning_rate": 6.98220640569395e-05, + "loss": 1.509, + "step": 7326 + }, + { + "epoch": 3.2564444444444445, + "grad_norm": 3.4088332653045654, + "learning_rate": 6.980427046263346e-05, + "loss": 1.1997, + "step": 7327 + }, + { + "epoch": 3.2568888888888887, + "grad_norm": 2.9059813022613525, + "learning_rate": 6.97864768683274e-05, + "loss": 1.133, + "step": 7328 + }, + { + "epoch": 3.2573333333333334, + "grad_norm": 3.1663708686828613, + "learning_rate": 6.976868327402135e-05, + "loss": 1.2817, + "step": 7329 + }, + { + "epoch": 3.2577777777777777, + "grad_norm": 3.088623523712158, + "learning_rate": 6.97508896797153e-05, + "loss": 1.142, + "step": 7330 + }, + { + "epoch": 3.2582222222222224, + "grad_norm": 2.9888076782226562, + "learning_rate": 6.973309608540926e-05, + "loss": 0.8191, + "step": 7331 + }, + { + "epoch": 3.2586666666666666, + "grad_norm": 3.0160036087036133, + "learning_rate": 6.97153024911032e-05, + "loss": 0.8145, + "step": 7332 + }, + { + "epoch": 3.2591111111111113, + "grad_norm": 4.1365885734558105, + "learning_rate": 6.969750889679716e-05, + "loss": 1.2396, + "step": 7333 + }, + { + "epoch": 3.2595555555555555, + "grad_norm": 4.00795316696167, + "learning_rate": 
6.967971530249112e-05, + "loss": 1.7597, + "step": 7334 + }, + { + "epoch": 3.26, + "grad_norm": 3.9610869884490967, + "learning_rate": 6.966192170818505e-05, + "loss": 1.4981, + "step": 7335 + }, + { + "epoch": 3.2604444444444445, + "grad_norm": 2.927777051925659, + "learning_rate": 6.9644128113879e-05, + "loss": 0.9722, + "step": 7336 + }, + { + "epoch": 3.2608888888888887, + "grad_norm": 3.2167749404907227, + "learning_rate": 6.962633451957296e-05, + "loss": 1.1545, + "step": 7337 + }, + { + "epoch": 3.2613333333333334, + "grad_norm": 3.5486435890197754, + "learning_rate": 6.96085409252669e-05, + "loss": 1.2762, + "step": 7338 + }, + { + "epoch": 3.2617777777777777, + "grad_norm": 2.870244026184082, + "learning_rate": 6.959074733096086e-05, + "loss": 0.8667, + "step": 7339 + }, + { + "epoch": 3.2622222222222224, + "grad_norm": 3.2309038639068604, + "learning_rate": 6.957295373665481e-05, + "loss": 1.0299, + "step": 7340 + }, + { + "epoch": 3.2626666666666666, + "grad_norm": 3.7159862518310547, + "learning_rate": 6.955516014234876e-05, + "loss": 1.2253, + "step": 7341 + }, + { + "epoch": 3.2631111111111113, + "grad_norm": 4.002220630645752, + "learning_rate": 6.95373665480427e-05, + "loss": 1.0, + "step": 7342 + }, + { + "epoch": 3.2635555555555555, + "grad_norm": 3.580116033554077, + "learning_rate": 6.951957295373666e-05, + "loss": 1.1418, + "step": 7343 + }, + { + "epoch": 3.2640000000000002, + "grad_norm": 3.863196849822998, + "learning_rate": 6.950177935943061e-05, + "loss": 1.2002, + "step": 7344 + }, + { + "epoch": 3.2644444444444445, + "grad_norm": 3.9584481716156006, + "learning_rate": 6.948398576512456e-05, + "loss": 1.1874, + "step": 7345 + }, + { + "epoch": 3.2648888888888887, + "grad_norm": 4.6558403968811035, + "learning_rate": 6.946619217081851e-05, + "loss": 1.1067, + "step": 7346 + }, + { + "epoch": 3.2653333333333334, + "grad_norm": 5.399629592895508, + "learning_rate": 6.944839857651247e-05, + "loss": 1.3862, + "step": 7347 + }, + { + "epoch": 3.2657777777777777, + "grad_norm": 4.66273307800293, + "learning_rate": 6.94306049822064e-05, + "loss": 1.7355, + "step": 7348 + }, + { + "epoch": 3.2662222222222224, + "grad_norm": 4.567107200622559, + "learning_rate": 6.941281138790035e-05, + "loss": 1.4238, + "step": 7349 + }, + { + "epoch": 3.2666666666666666, + "grad_norm": 3.7617597579956055, + "learning_rate": 6.939501779359431e-05, + "loss": 0.5293, + "step": 7350 + }, + { + "epoch": 3.2671111111111113, + "grad_norm": 2.0919718742370605, + "learning_rate": 6.937722419928825e-05, + "loss": 0.9802, + "step": 7351 + }, + { + "epoch": 3.2675555555555555, + "grad_norm": 2.003411293029785, + "learning_rate": 6.935943060498221e-05, + "loss": 0.7958, + "step": 7352 + }, + { + "epoch": 3.268, + "grad_norm": 2.802001714706421, + "learning_rate": 6.934163701067617e-05, + "loss": 1.5791, + "step": 7353 + }, + { + "epoch": 3.2684444444444445, + "grad_norm": 2.7242465019226074, + "learning_rate": 6.932384341637011e-05, + "loss": 1.2121, + "step": 7354 + }, + { + "epoch": 3.2688888888888887, + "grad_norm": 3.243849039077759, + "learning_rate": 6.930604982206405e-05, + "loss": 1.4581, + "step": 7355 + }, + { + "epoch": 3.2693333333333334, + "grad_norm": 2.92507004737854, + "learning_rate": 6.928825622775801e-05, + "loss": 0.9443, + "step": 7356 + }, + { + "epoch": 3.2697777777777777, + "grad_norm": 3.716792345046997, + "learning_rate": 6.927046263345197e-05, + "loss": 1.8721, + "step": 7357 + }, + { + "epoch": 3.2702222222222224, + "grad_norm": 3.27911639213562, + "learning_rate": 
6.925266903914591e-05, + "loss": 1.8725, + "step": 7358 + }, + { + "epoch": 3.2706666666666666, + "grad_norm": 2.785362720489502, + "learning_rate": 6.923487544483987e-05, + "loss": 1.0283, + "step": 7359 + }, + { + "epoch": 3.2711111111111113, + "grad_norm": 3.401243209838867, + "learning_rate": 6.921708185053382e-05, + "loss": 1.4464, + "step": 7360 + }, + { + "epoch": 3.2715555555555556, + "grad_norm": 3.113215923309326, + "learning_rate": 6.919928825622775e-05, + "loss": 1.6867, + "step": 7361 + }, + { + "epoch": 3.2720000000000002, + "grad_norm": 3.3423285484313965, + "learning_rate": 6.918149466192171e-05, + "loss": 1.2452, + "step": 7362 + }, + { + "epoch": 3.2724444444444445, + "grad_norm": 2.914412021636963, + "learning_rate": 6.916370106761566e-05, + "loss": 1.0074, + "step": 7363 + }, + { + "epoch": 3.2728888888888887, + "grad_norm": 2.851064920425415, + "learning_rate": 6.914590747330961e-05, + "loss": 1.2674, + "step": 7364 + }, + { + "epoch": 3.2733333333333334, + "grad_norm": 3.0711374282836914, + "learning_rate": 6.912811387900356e-05, + "loss": 1.3246, + "step": 7365 + }, + { + "epoch": 3.2737777777777777, + "grad_norm": 3.009274959564209, + "learning_rate": 6.911032028469752e-05, + "loss": 1.3831, + "step": 7366 + }, + { + "epoch": 3.2742222222222224, + "grad_norm": 3.4454495906829834, + "learning_rate": 6.909252669039146e-05, + "loss": 1.5029, + "step": 7367 + }, + { + "epoch": 3.2746666666666666, + "grad_norm": 3.192911386489868, + "learning_rate": 6.90747330960854e-05, + "loss": 1.3722, + "step": 7368 + }, + { + "epoch": 3.2751111111111113, + "grad_norm": 3.413494110107422, + "learning_rate": 6.905693950177936e-05, + "loss": 1.3843, + "step": 7369 + }, + { + "epoch": 3.2755555555555556, + "grad_norm": 2.5494213104248047, + "learning_rate": 6.90391459074733e-05, + "loss": 0.9536, + "step": 7370 + }, + { + "epoch": 3.276, + "grad_norm": 3.192929267883301, + "learning_rate": 6.902135231316726e-05, + "loss": 1.687, + "step": 7371 + }, + { + "epoch": 3.2764444444444445, + "grad_norm": 3.70212459564209, + "learning_rate": 6.900355871886122e-05, + "loss": 1.5884, + "step": 7372 + }, + { + "epoch": 3.2768888888888887, + "grad_norm": 3.3133480548858643, + "learning_rate": 6.898576512455516e-05, + "loss": 1.5349, + "step": 7373 + }, + { + "epoch": 3.2773333333333334, + "grad_norm": 3.146245241165161, + "learning_rate": 6.89679715302491e-05, + "loss": 1.259, + "step": 7374 + }, + { + "epoch": 3.2777777777777777, + "grad_norm": 3.329066753387451, + "learning_rate": 6.895017793594306e-05, + "loss": 1.4778, + "step": 7375 + }, + { + "epoch": 3.2782222222222224, + "grad_norm": 3.4649317264556885, + "learning_rate": 6.893238434163702e-05, + "loss": 1.4338, + "step": 7376 + }, + { + "epoch": 3.2786666666666666, + "grad_norm": 3.8039920330047607, + "learning_rate": 6.891459074733096e-05, + "loss": 1.6371, + "step": 7377 + }, + { + "epoch": 3.279111111111111, + "grad_norm": 3.138587474822998, + "learning_rate": 6.889679715302492e-05, + "loss": 1.2055, + "step": 7378 + }, + { + "epoch": 3.2795555555555556, + "grad_norm": 3.8496856689453125, + "learning_rate": 6.887900355871887e-05, + "loss": 1.1513, + "step": 7379 + }, + { + "epoch": 3.2800000000000002, + "grad_norm": 1.128354787826538, + "learning_rate": 6.88612099644128e-05, + "loss": 0.039, + "step": 7380 + }, + { + "epoch": 3.2804444444444445, + "grad_norm": 3.4761149883270264, + "learning_rate": 6.884341637010676e-05, + "loss": 1.6024, + "step": 7381 + }, + { + "epoch": 3.2808888888888887, + "grad_norm": 3.339864492416382, + 
"learning_rate": 6.882562277580072e-05, + "loss": 1.3504, + "step": 7382 + }, + { + "epoch": 3.2813333333333334, + "grad_norm": 2.745142936706543, + "learning_rate": 6.880782918149466e-05, + "loss": 0.8555, + "step": 7383 + }, + { + "epoch": 3.2817777777777777, + "grad_norm": 4.043299198150635, + "learning_rate": 6.879003558718862e-05, + "loss": 1.5035, + "step": 7384 + }, + { + "epoch": 3.2822222222222224, + "grad_norm": 3.773738384246826, + "learning_rate": 6.877224199288257e-05, + "loss": 1.1627, + "step": 7385 + }, + { + "epoch": 3.2826666666666666, + "grad_norm": 3.9867827892303467, + "learning_rate": 6.875444839857652e-05, + "loss": 1.5756, + "step": 7386 + }, + { + "epoch": 3.2831111111111113, + "grad_norm": 3.7746942043304443, + "learning_rate": 6.873665480427046e-05, + "loss": 1.4665, + "step": 7387 + }, + { + "epoch": 3.2835555555555556, + "grad_norm": 3.536393642425537, + "learning_rate": 6.871886120996441e-05, + "loss": 1.0364, + "step": 7388 + }, + { + "epoch": 3.284, + "grad_norm": 3.190711498260498, + "learning_rate": 6.870106761565837e-05, + "loss": 0.9868, + "step": 7389 + }, + { + "epoch": 3.2844444444444445, + "grad_norm": 3.708233594894409, + "learning_rate": 6.868327402135231e-05, + "loss": 1.3567, + "step": 7390 + }, + { + "epoch": 3.2848888888888887, + "grad_norm": 3.1745657920837402, + "learning_rate": 6.866548042704627e-05, + "loss": 1.048, + "step": 7391 + }, + { + "epoch": 3.2853333333333334, + "grad_norm": 3.826011896133423, + "learning_rate": 6.864768683274023e-05, + "loss": 1.1933, + "step": 7392 + }, + { + "epoch": 3.2857777777777777, + "grad_norm": 3.8659939765930176, + "learning_rate": 6.862989323843416e-05, + "loss": 1.0345, + "step": 7393 + }, + { + "epoch": 3.2862222222222224, + "grad_norm": 4.651473522186279, + "learning_rate": 6.861209964412811e-05, + "loss": 1.0049, + "step": 7394 + }, + { + "epoch": 3.2866666666666666, + "grad_norm": 4.106167316436768, + "learning_rate": 6.859430604982207e-05, + "loss": 0.9622, + "step": 7395 + }, + { + "epoch": 3.287111111111111, + "grad_norm": 4.003452301025391, + "learning_rate": 6.857651245551601e-05, + "loss": 1.0904, + "step": 7396 + }, + { + "epoch": 3.2875555555555556, + "grad_norm": 0.4317832887172699, + "learning_rate": 6.855871886120997e-05, + "loss": 0.0452, + "step": 7397 + }, + { + "epoch": 3.288, + "grad_norm": 4.068361759185791, + "learning_rate": 6.854092526690393e-05, + "loss": 1.0543, + "step": 7398 + }, + { + "epoch": 3.2884444444444445, + "grad_norm": 3.885014772415161, + "learning_rate": 6.852313167259787e-05, + "loss": 0.5935, + "step": 7399 + }, + { + "epoch": 3.2888888888888888, + "grad_norm": 5.79363489151001, + "learning_rate": 6.850533807829181e-05, + "loss": 0.9945, + "step": 7400 + }, + { + "epoch": 3.2893333333333334, + "grad_norm": 2.343862295150757, + "learning_rate": 6.848754448398577e-05, + "loss": 1.873, + "step": 7401 + }, + { + "epoch": 3.2897777777777777, + "grad_norm": 2.8761746883392334, + "learning_rate": 6.846975088967972e-05, + "loss": 2.2376, + "step": 7402 + }, + { + "epoch": 3.2902222222222224, + "grad_norm": 2.3083155155181885, + "learning_rate": 6.845195729537367e-05, + "loss": 0.9427, + "step": 7403 + }, + { + "epoch": 3.2906666666666666, + "grad_norm": 2.915044069290161, + "learning_rate": 6.843416370106762e-05, + "loss": 1.3453, + "step": 7404 + }, + { + "epoch": 3.2911111111111113, + "grad_norm": 3.3559882640838623, + "learning_rate": 6.841637010676158e-05, + "loss": 2.2773, + "step": 7405 + }, + { + "epoch": 3.2915555555555556, + "grad_norm": 2.8695106506347656, + 
"learning_rate": 6.839857651245551e-05, + "loss": 1.099, + "step": 7406 + }, + { + "epoch": 3.292, + "grad_norm": 3.35927414894104, + "learning_rate": 6.838078291814947e-05, + "loss": 1.7923, + "step": 7407 + }, + { + "epoch": 3.2924444444444445, + "grad_norm": 3.227517604827881, + "learning_rate": 6.836298932384342e-05, + "loss": 1.651, + "step": 7408 + }, + { + "epoch": 3.2928888888888888, + "grad_norm": 3.0707974433898926, + "learning_rate": 6.834519572953737e-05, + "loss": 1.5187, + "step": 7409 + }, + { + "epoch": 3.2933333333333334, + "grad_norm": 2.92110538482666, + "learning_rate": 6.832740213523132e-05, + "loss": 1.5467, + "step": 7410 + }, + { + "epoch": 3.2937777777777777, + "grad_norm": 3.0438144207000732, + "learning_rate": 6.830960854092528e-05, + "loss": 1.6164, + "step": 7411 + }, + { + "epoch": 3.2942222222222224, + "grad_norm": 3.135681629180908, + "learning_rate": 6.829181494661922e-05, + "loss": 1.3172, + "step": 7412 + }, + { + "epoch": 3.2946666666666666, + "grad_norm": 3.0080184936523438, + "learning_rate": 6.827402135231316e-05, + "loss": 1.5289, + "step": 7413 + }, + { + "epoch": 3.295111111111111, + "grad_norm": 2.6062517166137695, + "learning_rate": 6.825622775800712e-05, + "loss": 1.47, + "step": 7414 + }, + { + "epoch": 3.2955555555555556, + "grad_norm": 3.1638424396514893, + "learning_rate": 6.823843416370106e-05, + "loss": 1.5606, + "step": 7415 + }, + { + "epoch": 3.296, + "grad_norm": 3.3744962215423584, + "learning_rate": 6.822064056939502e-05, + "loss": 1.6891, + "step": 7416 + }, + { + "epoch": 3.2964444444444445, + "grad_norm": 3.1398794651031494, + "learning_rate": 6.820284697508898e-05, + "loss": 1.4498, + "step": 7417 + }, + { + "epoch": 3.2968888888888888, + "grad_norm": 2.805931806564331, + "learning_rate": 6.818505338078292e-05, + "loss": 1.0785, + "step": 7418 + }, + { + "epoch": 3.2973333333333334, + "grad_norm": 2.9776318073272705, + "learning_rate": 6.816725978647686e-05, + "loss": 1.5049, + "step": 7419 + }, + { + "epoch": 3.2977777777777777, + "grad_norm": 2.0422849655151367, + "learning_rate": 6.814946619217082e-05, + "loss": 0.6512, + "step": 7420 + }, + { + "epoch": 3.2982222222222224, + "grad_norm": 3.6055731773376465, + "learning_rate": 6.813167259786478e-05, + "loss": 1.4551, + "step": 7421 + }, + { + "epoch": 3.2986666666666666, + "grad_norm": 3.5920650959014893, + "learning_rate": 6.811387900355872e-05, + "loss": 1.2965, + "step": 7422 + }, + { + "epoch": 3.2991111111111113, + "grad_norm": 2.8194003105163574, + "learning_rate": 6.809608540925268e-05, + "loss": 1.102, + "step": 7423 + }, + { + "epoch": 3.2995555555555556, + "grad_norm": 3.0865092277526855, + "learning_rate": 6.807829181494663e-05, + "loss": 1.0238, + "step": 7424 + }, + { + "epoch": 3.3, + "grad_norm": 3.17631459236145, + "learning_rate": 6.806049822064056e-05, + "loss": 1.3226, + "step": 7425 + }, + { + "epoch": 3.3004444444444445, + "grad_norm": 3.16072678565979, + "learning_rate": 6.804270462633452e-05, + "loss": 1.5295, + "step": 7426 + }, + { + "epoch": 3.3008888888888888, + "grad_norm": 2.93965482711792, + "learning_rate": 6.802491103202847e-05, + "loss": 0.8391, + "step": 7427 + }, + { + "epoch": 3.3013333333333335, + "grad_norm": 3.356229543685913, + "learning_rate": 6.800711743772242e-05, + "loss": 1.6525, + "step": 7428 + }, + { + "epoch": 3.3017777777777777, + "grad_norm": 3.251253128051758, + "learning_rate": 6.798932384341637e-05, + "loss": 1.2885, + "step": 7429 + }, + { + "epoch": 3.3022222222222224, + "grad_norm": 3.1769371032714844, + "learning_rate": 
6.797153024911033e-05, + "loss": 1.0414, + "step": 7430 + }, + { + "epoch": 3.3026666666666666, + "grad_norm": 3.4345128536224365, + "learning_rate": 6.795373665480427e-05, + "loss": 1.0469, + "step": 7431 + }, + { + "epoch": 3.303111111111111, + "grad_norm": 3.2680447101593018, + "learning_rate": 6.793594306049822e-05, + "loss": 1.3976, + "step": 7432 + }, + { + "epoch": 3.3035555555555556, + "grad_norm": 3.0672709941864014, + "learning_rate": 6.791814946619217e-05, + "loss": 1.2144, + "step": 7433 + }, + { + "epoch": 3.304, + "grad_norm": 3.2540392875671387, + "learning_rate": 6.790035587188613e-05, + "loss": 1.1354, + "step": 7434 + }, + { + "epoch": 3.3044444444444445, + "grad_norm": 3.3583109378814697, + "learning_rate": 6.788256227758007e-05, + "loss": 1.2996, + "step": 7435 + }, + { + "epoch": 3.3048888888888888, + "grad_norm": 4.012465476989746, + "learning_rate": 6.786476868327403e-05, + "loss": 1.5203, + "step": 7436 + }, + { + "epoch": 3.3053333333333335, + "grad_norm": 3.8637561798095703, + "learning_rate": 6.784697508896798e-05, + "loss": 1.3783, + "step": 7437 + }, + { + "epoch": 3.3057777777777777, + "grad_norm": 3.8466806411743164, + "learning_rate": 6.782918149466191e-05, + "loss": 1.3447, + "step": 7438 + }, + { + "epoch": 3.3062222222222224, + "grad_norm": 3.448899269104004, + "learning_rate": 6.781138790035587e-05, + "loss": 1.1285, + "step": 7439 + }, + { + "epoch": 3.3066666666666666, + "grad_norm": 3.3083012104034424, + "learning_rate": 6.779359430604983e-05, + "loss": 0.8823, + "step": 7440 + }, + { + "epoch": 3.3071111111111113, + "grad_norm": 4.381979465484619, + "learning_rate": 6.777580071174377e-05, + "loss": 1.1432, + "step": 7441 + }, + { + "epoch": 3.3075555555555556, + "grad_norm": 4.388772964477539, + "learning_rate": 6.775800711743773e-05, + "loss": 1.2684, + "step": 7442 + }, + { + "epoch": 3.308, + "grad_norm": 3.7288894653320312, + "learning_rate": 6.774021352313168e-05, + "loss": 1.035, + "step": 7443 + }, + { + "epoch": 3.3084444444444445, + "grad_norm": 4.472944736480713, + "learning_rate": 6.772241992882563e-05, + "loss": 1.217, + "step": 7444 + }, + { + "epoch": 3.3088888888888888, + "grad_norm": 4.284884929656982, + "learning_rate": 6.770462633451957e-05, + "loss": 1.237, + "step": 7445 + }, + { + "epoch": 3.3093333333333335, + "grad_norm": 6.807945251464844, + "learning_rate": 6.768683274021353e-05, + "loss": 1.2017, + "step": 7446 + }, + { + "epoch": 3.3097777777777777, + "grad_norm": 4.781992435455322, + "learning_rate": 6.766903914590748e-05, + "loss": 1.32, + "step": 7447 + }, + { + "epoch": 3.3102222222222224, + "grad_norm": 3.9543511867523193, + "learning_rate": 6.765124555160142e-05, + "loss": 1.0203, + "step": 7448 + }, + { + "epoch": 3.3106666666666666, + "grad_norm": 4.435871124267578, + "learning_rate": 6.763345195729538e-05, + "loss": 1.1069, + "step": 7449 + }, + { + "epoch": 3.311111111111111, + "grad_norm": 4.147932529449463, + "learning_rate": 6.761565836298934e-05, + "loss": 0.6211, + "step": 7450 + }, + { + "epoch": 3.3115555555555556, + "grad_norm": 2.8902626037597656, + "learning_rate": 6.759786476868327e-05, + "loss": 1.9548, + "step": 7451 + }, + { + "epoch": 3.312, + "grad_norm": 1.825205683708191, + "learning_rate": 6.758007117437722e-05, + "loss": 0.6829, + "step": 7452 + }, + { + "epoch": 3.3124444444444445, + "grad_norm": 3.156799077987671, + "learning_rate": 6.756227758007118e-05, + "loss": 1.5275, + "step": 7453 + }, + { + "epoch": 3.3128888888888888, + "grad_norm": 3.1396076679229736, + "learning_rate": 
6.754448398576512e-05, + "loss": 1.4951, + "step": 7454 + }, + { + "epoch": 3.3133333333333335, + "grad_norm": 3.152575731277466, + "learning_rate": 6.752669039145908e-05, + "loss": 0.9246, + "step": 7455 + }, + { + "epoch": 3.3137777777777777, + "grad_norm": 3.4386932849884033, + "learning_rate": 6.750889679715304e-05, + "loss": 1.5932, + "step": 7456 + }, + { + "epoch": 3.3142222222222224, + "grad_norm": 3.287623882293701, + "learning_rate": 6.749110320284698e-05, + "loss": 1.3995, + "step": 7457 + }, + { + "epoch": 3.3146666666666667, + "grad_norm": 3.267625570297241, + "learning_rate": 6.747330960854092e-05, + "loss": 1.4844, + "step": 7458 + }, + { + "epoch": 3.3151111111111113, + "grad_norm": 2.988413095474243, + "learning_rate": 6.745551601423488e-05, + "loss": 1.1216, + "step": 7459 + }, + { + "epoch": 3.3155555555555556, + "grad_norm": 2.892350196838379, + "learning_rate": 6.743772241992882e-05, + "loss": 1.415, + "step": 7460 + }, + { + "epoch": 3.316, + "grad_norm": 3.1864426136016846, + "learning_rate": 6.741992882562278e-05, + "loss": 1.4968, + "step": 7461 + }, + { + "epoch": 3.3164444444444445, + "grad_norm": 2.9494309425354004, + "learning_rate": 6.740213523131673e-05, + "loss": 1.1238, + "step": 7462 + }, + { + "epoch": 3.3168888888888888, + "grad_norm": 2.9198689460754395, + "learning_rate": 6.738434163701068e-05, + "loss": 1.5899, + "step": 7463 + }, + { + "epoch": 3.3173333333333335, + "grad_norm": 3.371006965637207, + "learning_rate": 6.736654804270462e-05, + "loss": 1.564, + "step": 7464 + }, + { + "epoch": 3.3177777777777777, + "grad_norm": 2.8129475116729736, + "learning_rate": 6.734875444839858e-05, + "loss": 1.0162, + "step": 7465 + }, + { + "epoch": 3.3182222222222224, + "grad_norm": 3.1402010917663574, + "learning_rate": 6.733096085409253e-05, + "loss": 1.3211, + "step": 7466 + }, + { + "epoch": 3.3186666666666667, + "grad_norm": 2.9242656230926514, + "learning_rate": 6.731316725978648e-05, + "loss": 1.2605, + "step": 7467 + }, + { + "epoch": 3.319111111111111, + "grad_norm": 3.019500255584717, + "learning_rate": 6.729537366548043e-05, + "loss": 1.3583, + "step": 7468 + }, + { + "epoch": 3.3195555555555556, + "grad_norm": 3.6432340145111084, + "learning_rate": 6.727758007117439e-05, + "loss": 1.4444, + "step": 7469 + }, + { + "epoch": 3.32, + "grad_norm": 3.2907845973968506, + "learning_rate": 6.725978647686833e-05, + "loss": 1.1577, + "step": 7470 + }, + { + "epoch": 3.3204444444444445, + "grad_norm": 2.75240421295166, + "learning_rate": 6.724199288256228e-05, + "loss": 1.5465, + "step": 7471 + }, + { + "epoch": 3.320888888888889, + "grad_norm": 3.2636733055114746, + "learning_rate": 6.722419928825623e-05, + "loss": 1.212, + "step": 7472 + }, + { + "epoch": 3.3213333333333335, + "grad_norm": 3.171222448348999, + "learning_rate": 6.720640569395017e-05, + "loss": 1.3743, + "step": 7473 + }, + { + "epoch": 3.3217777777777777, + "grad_norm": 2.8894879817962646, + "learning_rate": 6.718861209964413e-05, + "loss": 1.5747, + "step": 7474 + }, + { + "epoch": 3.3222222222222224, + "grad_norm": 3.2905118465423584, + "learning_rate": 6.717081850533809e-05, + "loss": 1.6193, + "step": 7475 + }, + { + "epoch": 3.3226666666666667, + "grad_norm": 3.048412799835205, + "learning_rate": 6.715302491103203e-05, + "loss": 1.1528, + "step": 7476 + }, + { + "epoch": 3.3231111111111113, + "grad_norm": 3.6255054473876953, + "learning_rate": 6.713523131672597e-05, + "loss": 1.3157, + "step": 7477 + }, + { + "epoch": 3.3235555555555556, + "grad_norm": 3.5292255878448486, + 
"learning_rate": 6.711743772241993e-05, + "loss": 1.2811, + "step": 7478 + }, + { + "epoch": 3.324, + "grad_norm": 2.83103346824646, + "learning_rate": 6.709964412811389e-05, + "loss": 0.9067, + "step": 7479 + }, + { + "epoch": 3.3244444444444445, + "grad_norm": 3.3375396728515625, + "learning_rate": 6.708185053380783e-05, + "loss": 1.307, + "step": 7480 + }, + { + "epoch": 3.324888888888889, + "grad_norm": 2.8192222118377686, + "learning_rate": 6.706405693950179e-05, + "loss": 1.0012, + "step": 7481 + }, + { + "epoch": 3.3253333333333335, + "grad_norm": 3.162932872772217, + "learning_rate": 6.704626334519574e-05, + "loss": 1.2155, + "step": 7482 + }, + { + "epoch": 3.3257777777777777, + "grad_norm": 4.290798187255859, + "learning_rate": 6.702846975088967e-05, + "loss": 1.3119, + "step": 7483 + }, + { + "epoch": 3.3262222222222224, + "grad_norm": 3.4641189575195312, + "learning_rate": 6.701067615658363e-05, + "loss": 1.0565, + "step": 7484 + }, + { + "epoch": 3.3266666666666667, + "grad_norm": 4.372963905334473, + "learning_rate": 6.699288256227758e-05, + "loss": 1.5057, + "step": 7485 + }, + { + "epoch": 3.327111111111111, + "grad_norm": 3.908487319946289, + "learning_rate": 6.697508896797153e-05, + "loss": 1.2567, + "step": 7486 + }, + { + "epoch": 3.3275555555555556, + "grad_norm": 3.5218594074249268, + "learning_rate": 6.695729537366548e-05, + "loss": 1.039, + "step": 7487 + }, + { + "epoch": 3.328, + "grad_norm": 4.635097980499268, + "learning_rate": 6.693950177935944e-05, + "loss": 1.1867, + "step": 7488 + }, + { + "epoch": 3.3284444444444445, + "grad_norm": 3.447298526763916, + "learning_rate": 6.692170818505338e-05, + "loss": 1.1312, + "step": 7489 + }, + { + "epoch": 3.328888888888889, + "grad_norm": 3.3791463375091553, + "learning_rate": 6.690391459074733e-05, + "loss": 0.8666, + "step": 7490 + }, + { + "epoch": 3.3293333333333335, + "grad_norm": 2.8901917934417725, + "learning_rate": 6.688612099644128e-05, + "loss": 1.1193, + "step": 7491 + }, + { + "epoch": 3.3297777777777777, + "grad_norm": 3.7830843925476074, + "learning_rate": 6.686832740213524e-05, + "loss": 1.152, + "step": 7492 + }, + { + "epoch": 3.330222222222222, + "grad_norm": 3.2505948543548584, + "learning_rate": 6.685053380782918e-05, + "loss": 0.9152, + "step": 7493 + }, + { + "epoch": 3.3306666666666667, + "grad_norm": 4.339881896972656, + "learning_rate": 6.683274021352314e-05, + "loss": 1.3848, + "step": 7494 + }, + { + "epoch": 3.3311111111111114, + "grad_norm": 4.330918312072754, + "learning_rate": 6.68149466192171e-05, + "loss": 1.5097, + "step": 7495 + }, + { + "epoch": 3.3315555555555556, + "grad_norm": 3.9478585720062256, + "learning_rate": 6.679715302491103e-05, + "loss": 1.2268, + "step": 7496 + }, + { + "epoch": 3.332, + "grad_norm": 4.484266757965088, + "learning_rate": 6.677935943060498e-05, + "loss": 1.1452, + "step": 7497 + }, + { + "epoch": 3.3324444444444445, + "grad_norm": 3.884369373321533, + "learning_rate": 6.676156583629894e-05, + "loss": 1.3011, + "step": 7498 + }, + { + "epoch": 3.332888888888889, + "grad_norm": 5.073292255401611, + "learning_rate": 6.674377224199288e-05, + "loss": 1.4814, + "step": 7499 + }, + { + "epoch": 3.3333333333333335, + "grad_norm": 4.949814796447754, + "learning_rate": 6.672597864768684e-05, + "loss": 0.5286, + "step": 7500 + }, + { + "epoch": 3.3337777777777777, + "grad_norm": 2.3786208629608154, + "learning_rate": 6.67081850533808e-05, + "loss": 1.8626, + "step": 7501 + }, + { + "epoch": 3.3342222222222224, + "grad_norm": 2.6522817611694336, + "learning_rate": 
6.669039145907474e-05, + "loss": 2.0315, + "step": 7502 + }, + { + "epoch": 3.3346666666666667, + "grad_norm": 2.5052194595336914, + "learning_rate": 6.667259786476868e-05, + "loss": 1.5958, + "step": 7503 + }, + { + "epoch": 3.335111111111111, + "grad_norm": 2.0900769233703613, + "learning_rate": 6.665480427046264e-05, + "loss": 1.0634, + "step": 7504 + }, + { + "epoch": 3.3355555555555556, + "grad_norm": 3.4959940910339355, + "learning_rate": 6.663701067615658e-05, + "loss": 1.9428, + "step": 7505 + }, + { + "epoch": 3.336, + "grad_norm": 2.677666187286377, + "learning_rate": 6.661921708185054e-05, + "loss": 1.0751, + "step": 7506 + }, + { + "epoch": 3.3364444444444445, + "grad_norm": 3.592830181121826, + "learning_rate": 6.660142348754449e-05, + "loss": 2.0699, + "step": 7507 + }, + { + "epoch": 3.336888888888889, + "grad_norm": 2.6081550121307373, + "learning_rate": 6.658362989323844e-05, + "loss": 1.142, + "step": 7508 + }, + { + "epoch": 3.3373333333333335, + "grad_norm": 1.7961082458496094, + "learning_rate": 6.656583629893238e-05, + "loss": 0.5108, + "step": 7509 + }, + { + "epoch": 3.3377777777777777, + "grad_norm": 3.024773359298706, + "learning_rate": 6.654804270462633e-05, + "loss": 1.2942, + "step": 7510 + }, + { + "epoch": 3.338222222222222, + "grad_norm": 3.3666229248046875, + "learning_rate": 6.653024911032029e-05, + "loss": 1.7867, + "step": 7511 + }, + { + "epoch": 3.3386666666666667, + "grad_norm": 3.1930816173553467, + "learning_rate": 6.651245551601423e-05, + "loss": 1.3751, + "step": 7512 + }, + { + "epoch": 3.339111111111111, + "grad_norm": 2.655470609664917, + "learning_rate": 6.649466192170819e-05, + "loss": 1.6016, + "step": 7513 + }, + { + "epoch": 3.3395555555555556, + "grad_norm": 3.1630845069885254, + "learning_rate": 6.647686832740215e-05, + "loss": 1.3575, + "step": 7514 + }, + { + "epoch": 3.34, + "grad_norm": 3.315333843231201, + "learning_rate": 6.645907473309609e-05, + "loss": 1.2997, + "step": 7515 + }, + { + "epoch": 3.3404444444444445, + "grad_norm": 2.40342378616333, + "learning_rate": 6.644128113879003e-05, + "loss": 0.9741, + "step": 7516 + }, + { + "epoch": 3.340888888888889, + "grad_norm": 0.24996036291122437, + "learning_rate": 6.642348754448399e-05, + "loss": 0.0318, + "step": 7517 + }, + { + "epoch": 3.3413333333333335, + "grad_norm": 2.008341073989868, + "learning_rate": 6.640569395017793e-05, + "loss": 0.5644, + "step": 7518 + }, + { + "epoch": 3.3417777777777777, + "grad_norm": 3.1700870990753174, + "learning_rate": 6.638790035587189e-05, + "loss": 1.164, + "step": 7519 + }, + { + "epoch": 3.3422222222222224, + "grad_norm": 3.5381534099578857, + "learning_rate": 6.637010676156585e-05, + "loss": 1.6558, + "step": 7520 + }, + { + "epoch": 3.3426666666666667, + "grad_norm": 3.9801483154296875, + "learning_rate": 6.635231316725979e-05, + "loss": 1.6166, + "step": 7521 + }, + { + "epoch": 3.343111111111111, + "grad_norm": 3.240372896194458, + "learning_rate": 6.633451957295373e-05, + "loss": 1.3277, + "step": 7522 + }, + { + "epoch": 3.3435555555555556, + "grad_norm": 3.508293390274048, + "learning_rate": 6.631672597864769e-05, + "loss": 1.3398, + "step": 7523 + }, + { + "epoch": 3.344, + "grad_norm": 3.4153871536254883, + "learning_rate": 6.629893238434164e-05, + "loss": 1.1003, + "step": 7524 + }, + { + "epoch": 3.3444444444444446, + "grad_norm": 3.2748870849609375, + "learning_rate": 6.628113879003559e-05, + "loss": 1.3643, + "step": 7525 + }, + { + "epoch": 3.344888888888889, + "grad_norm": 3.4981982707977295, + "learning_rate": 
6.626334519572954e-05, + "loss": 1.3478, + "step": 7526 + }, + { + "epoch": 3.3453333333333335, + "grad_norm": 3.316903591156006, + "learning_rate": 6.62455516014235e-05, + "loss": 1.2191, + "step": 7527 + }, + { + "epoch": 3.3457777777777777, + "grad_norm": 3.2292816638946533, + "learning_rate": 6.622775800711744e-05, + "loss": 1.4199, + "step": 7528 + }, + { + "epoch": 3.346222222222222, + "grad_norm": 3.875464677810669, + "learning_rate": 6.620996441281139e-05, + "loss": 1.4645, + "step": 7529 + }, + { + "epoch": 3.3466666666666667, + "grad_norm": 3.4380528926849365, + "learning_rate": 6.619217081850534e-05, + "loss": 1.2479, + "step": 7530 + }, + { + "epoch": 3.347111111111111, + "grad_norm": 3.452080726623535, + "learning_rate": 6.617437722419929e-05, + "loss": 1.1636, + "step": 7531 + }, + { + "epoch": 3.3475555555555556, + "grad_norm": 3.524420738220215, + "learning_rate": 6.615658362989324e-05, + "loss": 1.2439, + "step": 7532 + }, + { + "epoch": 3.348, + "grad_norm": 3.042452096939087, + "learning_rate": 6.61387900355872e-05, + "loss": 1.1566, + "step": 7533 + }, + { + "epoch": 3.3484444444444446, + "grad_norm": 3.027488946914673, + "learning_rate": 6.612099644128114e-05, + "loss": 1.1191, + "step": 7534 + }, + { + "epoch": 3.348888888888889, + "grad_norm": 3.0509836673736572, + "learning_rate": 6.610320284697508e-05, + "loss": 1.0986, + "step": 7535 + }, + { + "epoch": 3.3493333333333335, + "grad_norm": 4.5463080406188965, + "learning_rate": 6.608540925266904e-05, + "loss": 1.5118, + "step": 7536 + }, + { + "epoch": 3.3497777777777777, + "grad_norm": 2.0029096603393555, + "learning_rate": 6.6067615658363e-05, + "loss": 0.5587, + "step": 7537 + }, + { + "epoch": 3.3502222222222224, + "grad_norm": 4.032919406890869, + "learning_rate": 6.604982206405694e-05, + "loss": 1.5668, + "step": 7538 + }, + { + "epoch": 3.3506666666666667, + "grad_norm": 3.9945425987243652, + "learning_rate": 6.60320284697509e-05, + "loss": 1.3243, + "step": 7539 + }, + { + "epoch": 3.351111111111111, + "grad_norm": 3.5870978832244873, + "learning_rate": 6.601423487544485e-05, + "loss": 1.1531, + "step": 7540 + }, + { + "epoch": 3.3515555555555556, + "grad_norm": 3.745569944381714, + "learning_rate": 6.599644128113878e-05, + "loss": 1.2786, + "step": 7541 + }, + { + "epoch": 3.352, + "grad_norm": 3.2785604000091553, + "learning_rate": 6.597864768683274e-05, + "loss": 1.0538, + "step": 7542 + }, + { + "epoch": 3.3524444444444446, + "grad_norm": 4.0194621086120605, + "learning_rate": 6.59608540925267e-05, + "loss": 1.2927, + "step": 7543 + }, + { + "epoch": 3.352888888888889, + "grad_norm": 4.010869026184082, + "learning_rate": 6.594306049822064e-05, + "loss": 1.2408, + "step": 7544 + }, + { + "epoch": 3.3533333333333335, + "grad_norm": 4.548504829406738, + "learning_rate": 6.59252669039146e-05, + "loss": 0.8367, + "step": 7545 + }, + { + "epoch": 3.3537777777777777, + "grad_norm": 3.896096706390381, + "learning_rate": 6.590747330960855e-05, + "loss": 0.8111, + "step": 7546 + }, + { + "epoch": 3.354222222222222, + "grad_norm": 4.7822957038879395, + "learning_rate": 6.58896797153025e-05, + "loss": 1.4794, + "step": 7547 + }, + { + "epoch": 3.3546666666666667, + "grad_norm": 4.28626823425293, + "learning_rate": 6.587188612099644e-05, + "loss": 1.0438, + "step": 7548 + }, + { + "epoch": 3.355111111111111, + "grad_norm": 4.136358737945557, + "learning_rate": 6.58540925266904e-05, + "loss": 0.9042, + "step": 7549 + }, + { + "epoch": 3.3555555555555556, + "grad_norm": 3.873260021209717, + "learning_rate": 
6.583629893238434e-05, + "loss": 0.8692, + "step": 7550 + }, + { + "epoch": 3.356, + "grad_norm": 2.481637716293335, + "learning_rate": 6.58185053380783e-05, + "loss": 2.0427, + "step": 7551 + }, + { + "epoch": 3.3564444444444446, + "grad_norm": 2.9505603313446045, + "learning_rate": 6.580071174377225e-05, + "loss": 1.5954, + "step": 7552 + }, + { + "epoch": 3.356888888888889, + "grad_norm": 2.6145901679992676, + "learning_rate": 6.578291814946619e-05, + "loss": 1.3394, + "step": 7553 + }, + { + "epoch": 3.3573333333333335, + "grad_norm": 2.786313533782959, + "learning_rate": 6.576512455516014e-05, + "loss": 1.376, + "step": 7554 + }, + { + "epoch": 3.3577777777777778, + "grad_norm": 3.3209388256073, + "learning_rate": 6.574733096085409e-05, + "loss": 1.4978, + "step": 7555 + }, + { + "epoch": 3.3582222222222224, + "grad_norm": 3.2253782749176025, + "learning_rate": 6.572953736654805e-05, + "loss": 1.345, + "step": 7556 + }, + { + "epoch": 3.3586666666666667, + "grad_norm": 2.5213868618011475, + "learning_rate": 6.571174377224199e-05, + "loss": 1.0464, + "step": 7557 + }, + { + "epoch": 3.359111111111111, + "grad_norm": 3.1049644947052, + "learning_rate": 6.569395017793595e-05, + "loss": 1.2791, + "step": 7558 + }, + { + "epoch": 3.3595555555555556, + "grad_norm": 3.2726714611053467, + "learning_rate": 6.56761565836299e-05, + "loss": 1.1688, + "step": 7559 + }, + { + "epoch": 3.36, + "grad_norm": 3.0971052646636963, + "learning_rate": 6.565836298932385e-05, + "loss": 1.1853, + "step": 7560 + }, + { + "epoch": 3.3604444444444446, + "grad_norm": 3.0031280517578125, + "learning_rate": 6.564056939501779e-05, + "loss": 1.5328, + "step": 7561 + }, + { + "epoch": 3.360888888888889, + "grad_norm": 4.010336399078369, + "learning_rate": 6.562277580071175e-05, + "loss": 1.3606, + "step": 7562 + }, + { + "epoch": 3.3613333333333335, + "grad_norm": 3.0197978019714355, + "learning_rate": 6.560498220640569e-05, + "loss": 1.5253, + "step": 7563 + }, + { + "epoch": 3.3617777777777778, + "grad_norm": 3.6670336723327637, + "learning_rate": 6.558718861209965e-05, + "loss": 2.0248, + "step": 7564 + }, + { + "epoch": 3.362222222222222, + "grad_norm": 3.820261001586914, + "learning_rate": 6.55693950177936e-05, + "loss": 1.6484, + "step": 7565 + }, + { + "epoch": 3.3626666666666667, + "grad_norm": 3.1288888454437256, + "learning_rate": 6.555160142348755e-05, + "loss": 1.1557, + "step": 7566 + }, + { + "epoch": 3.363111111111111, + "grad_norm": 3.1067354679107666, + "learning_rate": 6.553380782918149e-05, + "loss": 0.9204, + "step": 7567 + }, + { + "epoch": 3.3635555555555556, + "grad_norm": 3.1598074436187744, + "learning_rate": 6.551601423487545e-05, + "loss": 1.193, + "step": 7568 + }, + { + "epoch": 3.364, + "grad_norm": 2.919339656829834, + "learning_rate": 6.54982206405694e-05, + "loss": 1.3601, + "step": 7569 + }, + { + "epoch": 3.3644444444444446, + "grad_norm": 3.384775161743164, + "learning_rate": 6.548042704626335e-05, + "loss": 1.3367, + "step": 7570 + }, + { + "epoch": 3.364888888888889, + "grad_norm": 3.7233974933624268, + "learning_rate": 6.54626334519573e-05, + "loss": 1.748, + "step": 7571 + }, + { + "epoch": 3.3653333333333335, + "grad_norm": 3.6494290828704834, + "learning_rate": 6.544483985765126e-05, + "loss": 1.646, + "step": 7572 + }, + { + "epoch": 3.3657777777777778, + "grad_norm": 3.263012409210205, + "learning_rate": 6.54270462633452e-05, + "loss": 0.9094, + "step": 7573 + }, + { + "epoch": 3.3662222222222224, + "grad_norm": 3.7953481674194336, + "learning_rate": 6.540925266903914e-05, + 
"loss": 1.4884, + "step": 7574 + }, + { + "epoch": 3.3666666666666667, + "grad_norm": 2.8919782638549805, + "learning_rate": 6.53914590747331e-05, + "loss": 1.121, + "step": 7575 + }, + { + "epoch": 3.367111111111111, + "grad_norm": 2.639195680618286, + "learning_rate": 6.537366548042704e-05, + "loss": 0.5362, + "step": 7576 + }, + { + "epoch": 3.3675555555555556, + "grad_norm": 3.4763965606689453, + "learning_rate": 6.5355871886121e-05, + "loss": 1.3417, + "step": 7577 + }, + { + "epoch": 3.368, + "grad_norm": 2.803255558013916, + "learning_rate": 6.533807829181496e-05, + "loss": 1.0939, + "step": 7578 + }, + { + "epoch": 3.3684444444444446, + "grad_norm": 3.6555652618408203, + "learning_rate": 6.53202846975089e-05, + "loss": 1.3298, + "step": 7579 + }, + { + "epoch": 3.368888888888889, + "grad_norm": 3.3960537910461426, + "learning_rate": 6.530249110320284e-05, + "loss": 1.1507, + "step": 7580 + }, + { + "epoch": 3.3693333333333335, + "grad_norm": 2.8318889141082764, + "learning_rate": 6.52846975088968e-05, + "loss": 0.9001, + "step": 7581 + }, + { + "epoch": 3.3697777777777778, + "grad_norm": 3.948547601699829, + "learning_rate": 6.526690391459076e-05, + "loss": 0.7694, + "step": 7582 + }, + { + "epoch": 3.370222222222222, + "grad_norm": 3.6763882637023926, + "learning_rate": 6.52491103202847e-05, + "loss": 0.9653, + "step": 7583 + }, + { + "epoch": 3.3706666666666667, + "grad_norm": 4.624429225921631, + "learning_rate": 6.523131672597865e-05, + "loss": 1.2327, + "step": 7584 + }, + { + "epoch": 3.371111111111111, + "grad_norm": 3.7554173469543457, + "learning_rate": 6.521352313167261e-05, + "loss": 1.0042, + "step": 7585 + }, + { + "epoch": 3.3715555555555556, + "grad_norm": 2.8825485706329346, + "learning_rate": 6.519572953736655e-05, + "loss": 0.8331, + "step": 7586 + }, + { + "epoch": 3.372, + "grad_norm": 4.5201849937438965, + "learning_rate": 6.51779359430605e-05, + "loss": 1.8612, + "step": 7587 + }, + { + "epoch": 3.3724444444444446, + "grad_norm": 3.1740362644195557, + "learning_rate": 6.516014234875445e-05, + "loss": 1.1159, + "step": 7588 + }, + { + "epoch": 3.372888888888889, + "grad_norm": 3.3097000122070312, + "learning_rate": 6.51423487544484e-05, + "loss": 1.3252, + "step": 7589 + }, + { + "epoch": 3.3733333333333335, + "grad_norm": 3.417339563369751, + "learning_rate": 6.512455516014235e-05, + "loss": 0.9649, + "step": 7590 + }, + { + "epoch": 3.3737777777777778, + "grad_norm": 4.091862201690674, + "learning_rate": 6.510676156583631e-05, + "loss": 1.5291, + "step": 7591 + }, + { + "epoch": 3.3742222222222225, + "grad_norm": 5.507560729980469, + "learning_rate": 6.508896797153025e-05, + "loss": 0.6388, + "step": 7592 + }, + { + "epoch": 3.3746666666666667, + "grad_norm": 5.815156936645508, + "learning_rate": 6.50711743772242e-05, + "loss": 1.1697, + "step": 7593 + }, + { + "epoch": 3.375111111111111, + "grad_norm": 3.993685245513916, + "learning_rate": 6.505338078291815e-05, + "loss": 1.3822, + "step": 7594 + }, + { + "epoch": 3.3755555555555556, + "grad_norm": 3.4094078540802, + "learning_rate": 6.50355871886121e-05, + "loss": 0.8347, + "step": 7595 + }, + { + "epoch": 3.376, + "grad_norm": 3.818934202194214, + "learning_rate": 6.501779359430605e-05, + "loss": 1.0501, + "step": 7596 + }, + { + "epoch": 3.3764444444444446, + "grad_norm": 5.371906280517578, + "learning_rate": 6.500000000000001e-05, + "loss": 1.3774, + "step": 7597 + }, + { + "epoch": 3.376888888888889, + "grad_norm": 4.580707550048828, + "learning_rate": 6.498220640569395e-05, + "loss": 1.2469, + "step": 
7598 + }, + { + "epoch": 3.3773333333333335, + "grad_norm": 5.118098258972168, + "learning_rate": 6.49644128113879e-05, + "loss": 1.7323, + "step": 7599 + }, + { + "epoch": 3.3777777777777778, + "grad_norm": 4.494351387023926, + "learning_rate": 6.494661921708185e-05, + "loss": 1.426, + "step": 7600 + }, + { + "epoch": 3.378222222222222, + "grad_norm": 1.999345302581787, + "learning_rate": 6.49288256227758e-05, + "loss": 1.0275, + "step": 7601 + }, + { + "epoch": 3.3786666666666667, + "grad_norm": 2.187124729156494, + "learning_rate": 6.491103202846975e-05, + "loss": 0.8653, + "step": 7602 + }, + { + "epoch": 3.379111111111111, + "grad_norm": 3.221345901489258, + "learning_rate": 6.48932384341637e-05, + "loss": 1.444, + "step": 7603 + }, + { + "epoch": 3.3795555555555556, + "grad_norm": 2.575716495513916, + "learning_rate": 6.487544483985766e-05, + "loss": 1.2003, + "step": 7604 + }, + { + "epoch": 3.38, + "grad_norm": 3.7870845794677734, + "learning_rate": 6.48576512455516e-05, + "loss": 1.7092, + "step": 7605 + }, + { + "epoch": 3.3804444444444446, + "grad_norm": 2.4028003215789795, + "learning_rate": 6.483985765124555e-05, + "loss": 0.777, + "step": 7606 + }, + { + "epoch": 3.380888888888889, + "grad_norm": 2.2200560569763184, + "learning_rate": 6.48220640569395e-05, + "loss": 0.8364, + "step": 7607 + }, + { + "epoch": 3.3813333333333335, + "grad_norm": 3.2974655628204346, + "learning_rate": 6.480427046263345e-05, + "loss": 1.5625, + "step": 7608 + }, + { + "epoch": 3.3817777777777778, + "grad_norm": 3.614440441131592, + "learning_rate": 6.47864768683274e-05, + "loss": 1.6034, + "step": 7609 + }, + { + "epoch": 3.3822222222222225, + "grad_norm": 2.842564582824707, + "learning_rate": 6.476868327402136e-05, + "loss": 1.3101, + "step": 7610 + }, + { + "epoch": 3.3826666666666667, + "grad_norm": 3.6041154861450195, + "learning_rate": 6.47508896797153e-05, + "loss": 1.7356, + "step": 7611 + }, + { + "epoch": 3.383111111111111, + "grad_norm": 2.605125665664673, + "learning_rate": 6.473309608540925e-05, + "loss": 0.7838, + "step": 7612 + }, + { + "epoch": 3.3835555555555556, + "grad_norm": 3.500422477722168, + "learning_rate": 6.47153024911032e-05, + "loss": 1.4996, + "step": 7613 + }, + { + "epoch": 3.384, + "grad_norm": 2.843087673187256, + "learning_rate": 6.469750889679716e-05, + "loss": 1.1637, + "step": 7614 + }, + { + "epoch": 3.3844444444444446, + "grad_norm": 2.8948328495025635, + "learning_rate": 6.46797153024911e-05, + "loss": 1.3661, + "step": 7615 + }, + { + "epoch": 3.384888888888889, + "grad_norm": 3.8482465744018555, + "learning_rate": 6.466192170818506e-05, + "loss": 1.4079, + "step": 7616 + }, + { + "epoch": 3.3853333333333335, + "grad_norm": 3.166999101638794, + "learning_rate": 6.464412811387902e-05, + "loss": 1.2834, + "step": 7617 + }, + { + "epoch": 3.3857777777777778, + "grad_norm": 2.966557264328003, + "learning_rate": 6.462633451957296e-05, + "loss": 1.0525, + "step": 7618 + }, + { + "epoch": 3.386222222222222, + "grad_norm": 3.0287766456604004, + "learning_rate": 6.46085409252669e-05, + "loss": 1.0802, + "step": 7619 + }, + { + "epoch": 3.3866666666666667, + "grad_norm": 3.048327684402466, + "learning_rate": 6.459074733096086e-05, + "loss": 1.3006, + "step": 7620 + }, + { + "epoch": 3.387111111111111, + "grad_norm": 2.9743454456329346, + "learning_rate": 6.45729537366548e-05, + "loss": 1.1943, + "step": 7621 + }, + { + "epoch": 3.3875555555555557, + "grad_norm": 3.471954345703125, + "learning_rate": 6.455516014234876e-05, + "loss": 1.1794, + "step": 7622 + }, + { + 
"epoch": 3.388, + "grad_norm": 3.0449366569519043, + "learning_rate": 6.453736654804271e-05, + "loss": 1.1742, + "step": 7623 + }, + { + "epoch": 3.3884444444444446, + "grad_norm": 3.310814142227173, + "learning_rate": 6.451957295373666e-05, + "loss": 1.1978, + "step": 7624 + }, + { + "epoch": 3.388888888888889, + "grad_norm": 3.090825319290161, + "learning_rate": 6.45017793594306e-05, + "loss": 1.2543, + "step": 7625 + }, + { + "epoch": 3.389333333333333, + "grad_norm": 3.573758363723755, + "learning_rate": 6.448398576512456e-05, + "loss": 1.0293, + "step": 7626 + }, + { + "epoch": 3.389777777777778, + "grad_norm": 3.0098912715911865, + "learning_rate": 6.446619217081851e-05, + "loss": 1.0174, + "step": 7627 + }, + { + "epoch": 3.3902222222222225, + "grad_norm": 3.734511375427246, + "learning_rate": 6.444839857651246e-05, + "loss": 1.4176, + "step": 7628 + }, + { + "epoch": 3.3906666666666667, + "grad_norm": 3.394307851791382, + "learning_rate": 6.443060498220641e-05, + "loss": 1.4112, + "step": 7629 + }, + { + "epoch": 3.391111111111111, + "grad_norm": 3.520249605178833, + "learning_rate": 6.441281138790037e-05, + "loss": 1.3007, + "step": 7630 + }, + { + "epoch": 3.3915555555555557, + "grad_norm": 3.260222911834717, + "learning_rate": 6.439501779359431e-05, + "loss": 1.1919, + "step": 7631 + }, + { + "epoch": 3.392, + "grad_norm": 3.4656856060028076, + "learning_rate": 6.437722419928825e-05, + "loss": 1.1526, + "step": 7632 + }, + { + "epoch": 3.3924444444444446, + "grad_norm": 3.670583963394165, + "learning_rate": 6.435943060498221e-05, + "loss": 1.222, + "step": 7633 + }, + { + "epoch": 3.392888888888889, + "grad_norm": 3.5100245475769043, + "learning_rate": 6.434163701067615e-05, + "loss": 1.4112, + "step": 7634 + }, + { + "epoch": 3.3933333333333335, + "grad_norm": 3.7049787044525146, + "learning_rate": 6.432384341637011e-05, + "loss": 1.3744, + "step": 7635 + }, + { + "epoch": 3.393777777777778, + "grad_norm": 3.5180797576904297, + "learning_rate": 6.430604982206407e-05, + "loss": 1.0369, + "step": 7636 + }, + { + "epoch": 3.394222222222222, + "grad_norm": 3.888057231903076, + "learning_rate": 6.428825622775801e-05, + "loss": 1.3029, + "step": 7637 + }, + { + "epoch": 3.3946666666666667, + "grad_norm": 2.7446529865264893, + "learning_rate": 6.427046263345195e-05, + "loss": 1.0311, + "step": 7638 + }, + { + "epoch": 3.395111111111111, + "grad_norm": 4.01584005355835, + "learning_rate": 6.425266903914591e-05, + "loss": 1.0342, + "step": 7639 + }, + { + "epoch": 3.3955555555555557, + "grad_norm": 4.280939102172852, + "learning_rate": 6.423487544483985e-05, + "loss": 1.4491, + "step": 7640 + }, + { + "epoch": 3.396, + "grad_norm": 4.160184383392334, + "learning_rate": 6.421708185053381e-05, + "loss": 1.8072, + "step": 7641 + }, + { + "epoch": 3.3964444444444446, + "grad_norm": 3.48480486869812, + "learning_rate": 6.419928825622777e-05, + "loss": 1.0331, + "step": 7642 + }, + { + "epoch": 3.396888888888889, + "grad_norm": 3.321204423904419, + "learning_rate": 6.418149466192171e-05, + "loss": 0.9976, + "step": 7643 + }, + { + "epoch": 3.397333333333333, + "grad_norm": 3.8095834255218506, + "learning_rate": 6.416370106761567e-05, + "loss": 1.3911, + "step": 7644 + }, + { + "epoch": 3.397777777777778, + "grad_norm": 4.368819236755371, + "learning_rate": 6.414590747330961e-05, + "loss": 1.1837, + "step": 7645 + }, + { + "epoch": 3.398222222222222, + "grad_norm": 4.469260215759277, + "learning_rate": 6.412811387900356e-05, + "loss": 1.0485, + "step": 7646 + }, + { + "epoch": 
3.3986666666666667, + "grad_norm": 4.630403518676758, + "learning_rate": 6.411032028469751e-05, + "loss": 1.3338, + "step": 7647 + }, + { + "epoch": 3.399111111111111, + "grad_norm": 4.27545166015625, + "learning_rate": 6.409252669039146e-05, + "loss": 1.071, + "step": 7648 + }, + { + "epoch": 3.3995555555555557, + "grad_norm": 5.530716896057129, + "learning_rate": 6.407473309608542e-05, + "loss": 1.4461, + "step": 7649 + }, + { + "epoch": 3.4, + "grad_norm": 5.1273193359375, + "learning_rate": 6.405693950177936e-05, + "loss": 0.9624, + "step": 7650 + }, + { + "epoch": 3.4004444444444446, + "grad_norm": 2.5767862796783447, + "learning_rate": 6.40391459074733e-05, + "loss": 1.6648, + "step": 7651 + }, + { + "epoch": 3.400888888888889, + "grad_norm": 2.5170185565948486, + "learning_rate": 6.402135231316726e-05, + "loss": 1.7532, + "step": 7652 + }, + { + "epoch": 3.4013333333333335, + "grad_norm": 2.523386240005493, + "learning_rate": 6.40035587188612e-05, + "loss": 1.7704, + "step": 7653 + }, + { + "epoch": 3.401777777777778, + "grad_norm": 2.007690668106079, + "learning_rate": 6.398576512455516e-05, + "loss": 0.8182, + "step": 7654 + }, + { + "epoch": 3.402222222222222, + "grad_norm": 2.8901143074035645, + "learning_rate": 6.396797153024912e-05, + "loss": 1.8572, + "step": 7655 + }, + { + "epoch": 3.4026666666666667, + "grad_norm": 3.179241895675659, + "learning_rate": 6.395017793594306e-05, + "loss": 1.5891, + "step": 7656 + }, + { + "epoch": 3.403111111111111, + "grad_norm": 3.0536370277404785, + "learning_rate": 6.3932384341637e-05, + "loss": 1.4515, + "step": 7657 + }, + { + "epoch": 3.4035555555555557, + "grad_norm": 2.9210469722747803, + "learning_rate": 6.391459074733096e-05, + "loss": 1.8337, + "step": 7658 + }, + { + "epoch": 3.404, + "grad_norm": 3.83801531791687, + "learning_rate": 6.389679715302492e-05, + "loss": 1.8063, + "step": 7659 + }, + { + "epoch": 3.4044444444444446, + "grad_norm": 2.360499382019043, + "learning_rate": 6.387900355871886e-05, + "loss": 0.762, + "step": 7660 + }, + { + "epoch": 3.404888888888889, + "grad_norm": 3.5909371376037598, + "learning_rate": 6.386120996441282e-05, + "loss": 1.4551, + "step": 7661 + }, + { + "epoch": 3.405333333333333, + "grad_norm": 3.15690541267395, + "learning_rate": 6.384341637010677e-05, + "loss": 1.387, + "step": 7662 + }, + { + "epoch": 3.405777777777778, + "grad_norm": 3.711642026901245, + "learning_rate": 6.382562277580072e-05, + "loss": 1.6299, + "step": 7663 + }, + { + "epoch": 3.406222222222222, + "grad_norm": 3.540889263153076, + "learning_rate": 6.380782918149466e-05, + "loss": 1.5186, + "step": 7664 + }, + { + "epoch": 3.4066666666666667, + "grad_norm": 3.2987284660339355, + "learning_rate": 6.379003558718862e-05, + "loss": 1.0771, + "step": 7665 + }, + { + "epoch": 3.407111111111111, + "grad_norm": 3.3886878490448, + "learning_rate": 6.377224199288256e-05, + "loss": 1.8004, + "step": 7666 + }, + { + "epoch": 3.4075555555555557, + "grad_norm": 2.921910524368286, + "learning_rate": 6.375444839857652e-05, + "loss": 1.3376, + "step": 7667 + }, + { + "epoch": 3.408, + "grad_norm": 0.2479649782180786, + "learning_rate": 6.373665480427047e-05, + "loss": 0.0287, + "step": 7668 + }, + { + "epoch": 3.4084444444444446, + "grad_norm": 2.7667505741119385, + "learning_rate": 6.371886120996441e-05, + "loss": 1.1423, + "step": 7669 + }, + { + "epoch": 3.408888888888889, + "grad_norm": 2.919297933578491, + "learning_rate": 6.370106761565836e-05, + "loss": 1.5937, + "step": 7670 + }, + { + "epoch": 3.4093333333333335, + "grad_norm": 
3.060208559036255, + "learning_rate": 6.368327402135231e-05, + "loss": 1.3546, + "step": 7671 + }, + { + "epoch": 3.409777777777778, + "grad_norm": 3.8419125080108643, + "learning_rate": 6.366548042704627e-05, + "loss": 1.6202, + "step": 7672 + }, + { + "epoch": 3.410222222222222, + "grad_norm": 3.074181318283081, + "learning_rate": 6.364768683274021e-05, + "loss": 1.1265, + "step": 7673 + }, + { + "epoch": 3.4106666666666667, + "grad_norm": 3.2371325492858887, + "learning_rate": 6.362989323843417e-05, + "loss": 0.9814, + "step": 7674 + }, + { + "epoch": 3.411111111111111, + "grad_norm": 3.5532615184783936, + "learning_rate": 6.361209964412813e-05, + "loss": 1.5752, + "step": 7675 + }, + { + "epoch": 3.4115555555555557, + "grad_norm": 2.725200653076172, + "learning_rate": 6.359430604982207e-05, + "loss": 0.9369, + "step": 7676 + }, + { + "epoch": 3.412, + "grad_norm": 2.41279673576355, + "learning_rate": 6.357651245551601e-05, + "loss": 1.2278, + "step": 7677 + }, + { + "epoch": 3.4124444444444446, + "grad_norm": 3.5397229194641113, + "learning_rate": 6.355871886120997e-05, + "loss": 1.1223, + "step": 7678 + }, + { + "epoch": 3.412888888888889, + "grad_norm": 3.2247607707977295, + "learning_rate": 6.354092526690391e-05, + "loss": 1.6677, + "step": 7679 + }, + { + "epoch": 3.413333333333333, + "grad_norm": 3.0843541622161865, + "learning_rate": 6.352313167259787e-05, + "loss": 1.3358, + "step": 7680 + }, + { + "epoch": 3.413777777777778, + "grad_norm": 3.328740358352661, + "learning_rate": 6.350533807829183e-05, + "loss": 1.5565, + "step": 7681 + }, + { + "epoch": 3.414222222222222, + "grad_norm": 2.958048105239868, + "learning_rate": 6.348754448398577e-05, + "loss": 1.2256, + "step": 7682 + }, + { + "epoch": 3.4146666666666667, + "grad_norm": 0.26679933071136475, + "learning_rate": 6.346975088967971e-05, + "loss": 0.0342, + "step": 7683 + }, + { + "epoch": 3.415111111111111, + "grad_norm": 4.010861396789551, + "learning_rate": 6.345195729537367e-05, + "loss": 1.9807, + "step": 7684 + }, + { + "epoch": 3.4155555555555557, + "grad_norm": 3.461297035217285, + "learning_rate": 6.343416370106761e-05, + "loss": 1.4831, + "step": 7685 + }, + { + "epoch": 3.416, + "grad_norm": 3.9611904621124268, + "learning_rate": 6.341637010676157e-05, + "loss": 0.9961, + "step": 7686 + }, + { + "epoch": 3.4164444444444446, + "grad_norm": 3.7136943340301514, + "learning_rate": 6.339857651245552e-05, + "loss": 1.3796, + "step": 7687 + }, + { + "epoch": 3.416888888888889, + "grad_norm": 3.2812464237213135, + "learning_rate": 6.338078291814947e-05, + "loss": 1.3365, + "step": 7688 + }, + { + "epoch": 3.4173333333333336, + "grad_norm": 3.156381130218506, + "learning_rate": 6.336298932384342e-05, + "loss": 1.0016, + "step": 7689 + }, + { + "epoch": 3.417777777777778, + "grad_norm": 1.95352041721344, + "learning_rate": 6.334519572953737e-05, + "loss": 0.5333, + "step": 7690 + }, + { + "epoch": 3.418222222222222, + "grad_norm": 3.408022880554199, + "learning_rate": 6.332740213523132e-05, + "loss": 1.2858, + "step": 7691 + }, + { + "epoch": 3.4186666666666667, + "grad_norm": 4.81541633605957, + "learning_rate": 6.330960854092527e-05, + "loss": 1.3646, + "step": 7692 + }, + { + "epoch": 3.419111111111111, + "grad_norm": 3.8997044563293457, + "learning_rate": 6.329181494661922e-05, + "loss": 1.4012, + "step": 7693 + }, + { + "epoch": 3.4195555555555557, + "grad_norm": 3.6768176555633545, + "learning_rate": 6.327402135231318e-05, + "loss": 1.3647, + "step": 7694 + }, + { + "epoch": 3.42, + "grad_norm": 3.490180015563965, + 
"learning_rate": 6.325622775800712e-05, + "loss": 0.9646, + "step": 7695 + }, + { + "epoch": 3.4204444444444446, + "grad_norm": 3.8013601303100586, + "learning_rate": 6.323843416370106e-05, + "loss": 1.5347, + "step": 7696 + }, + { + "epoch": 3.420888888888889, + "grad_norm": 4.499445915222168, + "learning_rate": 6.322064056939502e-05, + "loss": 1.0899, + "step": 7697 + }, + { + "epoch": 3.421333333333333, + "grad_norm": 3.4003989696502686, + "learning_rate": 6.320284697508896e-05, + "loss": 0.9259, + "step": 7698 + }, + { + "epoch": 3.421777777777778, + "grad_norm": 4.400297164916992, + "learning_rate": 6.318505338078292e-05, + "loss": 1.6985, + "step": 7699 + }, + { + "epoch": 3.422222222222222, + "grad_norm": 4.983489513397217, + "learning_rate": 6.316725978647688e-05, + "loss": 1.1409, + "step": 7700 + }, + { + "epoch": 3.4226666666666667, + "grad_norm": 2.452336311340332, + "learning_rate": 6.314946619217082e-05, + "loss": 1.7351, + "step": 7701 + }, + { + "epoch": 3.423111111111111, + "grad_norm": 2.2893435955047607, + "learning_rate": 6.313167259786478e-05, + "loss": 2.1632, + "step": 7702 + }, + { + "epoch": 3.4235555555555557, + "grad_norm": 2.175917625427246, + "learning_rate": 6.311387900355872e-05, + "loss": 1.0005, + "step": 7703 + }, + { + "epoch": 3.424, + "grad_norm": 2.591916084289551, + "learning_rate": 6.309608540925268e-05, + "loss": 1.5803, + "step": 7704 + }, + { + "epoch": 3.4244444444444446, + "grad_norm": 2.7328264713287354, + "learning_rate": 6.307829181494662e-05, + "loss": 1.6264, + "step": 7705 + }, + { + "epoch": 3.424888888888889, + "grad_norm": 2.771141767501831, + "learning_rate": 6.306049822064057e-05, + "loss": 1.3346, + "step": 7706 + }, + { + "epoch": 3.4253333333333336, + "grad_norm": 3.2554972171783447, + "learning_rate": 6.304270462633453e-05, + "loss": 1.3667, + "step": 7707 + }, + { + "epoch": 3.425777777777778, + "grad_norm": 4.142671585083008, + "learning_rate": 6.302491103202847e-05, + "loss": 1.5744, + "step": 7708 + }, + { + "epoch": 3.426222222222222, + "grad_norm": 3.247718095779419, + "learning_rate": 6.300711743772242e-05, + "loss": 1.2485, + "step": 7709 + }, + { + "epoch": 3.4266666666666667, + "grad_norm": 3.285029649734497, + "learning_rate": 6.298932384341637e-05, + "loss": 1.3671, + "step": 7710 + }, + { + "epoch": 3.427111111111111, + "grad_norm": 2.8541810512542725, + "learning_rate": 6.297153024911032e-05, + "loss": 1.0453, + "step": 7711 + }, + { + "epoch": 3.4275555555555557, + "grad_norm": 3.167170286178589, + "learning_rate": 6.295373665480427e-05, + "loss": 1.1664, + "step": 7712 + }, + { + "epoch": 3.428, + "grad_norm": 3.222472667694092, + "learning_rate": 6.293594306049823e-05, + "loss": 1.0299, + "step": 7713 + }, + { + "epoch": 3.4284444444444446, + "grad_norm": 3.7111828327178955, + "learning_rate": 6.291814946619217e-05, + "loss": 1.3693, + "step": 7714 + }, + { + "epoch": 3.428888888888889, + "grad_norm": 3.3316597938537598, + "learning_rate": 6.290035587188612e-05, + "loss": 1.7048, + "step": 7715 + }, + { + "epoch": 3.429333333333333, + "grad_norm": 2.9972000122070312, + "learning_rate": 6.288256227758007e-05, + "loss": 1.1209, + "step": 7716 + }, + { + "epoch": 3.429777777777778, + "grad_norm": 3.5281057357788086, + "learning_rate": 6.286476868327403e-05, + "loss": 1.6927, + "step": 7717 + }, + { + "epoch": 3.430222222222222, + "grad_norm": 3.553957939147949, + "learning_rate": 6.284697508896797e-05, + "loss": 1.5864, + "step": 7718 + }, + { + "epoch": 3.4306666666666668, + "grad_norm": 4.420113563537598, + 
"learning_rate": 6.282918149466193e-05, + "loss": 1.5708, + "step": 7719 + }, + { + "epoch": 3.431111111111111, + "grad_norm": 4.024765968322754, + "learning_rate": 6.281138790035588e-05, + "loss": 1.3665, + "step": 7720 + }, + { + "epoch": 3.4315555555555557, + "grad_norm": 3.158511161804199, + "learning_rate": 6.279359430604983e-05, + "loss": 1.1667, + "step": 7721 + }, + { + "epoch": 3.432, + "grad_norm": 3.0785789489746094, + "learning_rate": 6.277580071174377e-05, + "loss": 1.1982, + "step": 7722 + }, + { + "epoch": 3.4324444444444446, + "grad_norm": 2.7140047550201416, + "learning_rate": 6.275800711743773e-05, + "loss": 1.1619, + "step": 7723 + }, + { + "epoch": 3.432888888888889, + "grad_norm": 3.3508801460266113, + "learning_rate": 6.274021352313167e-05, + "loss": 1.3535, + "step": 7724 + }, + { + "epoch": 3.4333333333333336, + "grad_norm": 2.8375468254089355, + "learning_rate": 6.272241992882563e-05, + "loss": 1.1456, + "step": 7725 + }, + { + "epoch": 3.433777777777778, + "grad_norm": 2.967170000076294, + "learning_rate": 6.270462633451958e-05, + "loss": 1.1639, + "step": 7726 + }, + { + "epoch": 3.434222222222222, + "grad_norm": 3.734205484390259, + "learning_rate": 6.268683274021353e-05, + "loss": 1.4107, + "step": 7727 + }, + { + "epoch": 3.4346666666666668, + "grad_norm": 3.417689085006714, + "learning_rate": 6.266903914590747e-05, + "loss": 1.2238, + "step": 7728 + }, + { + "epoch": 3.435111111111111, + "grad_norm": 3.4484238624572754, + "learning_rate": 6.265124555160143e-05, + "loss": 1.1604, + "step": 7729 + }, + { + "epoch": 3.4355555555555557, + "grad_norm": 1.4519230127334595, + "learning_rate": 6.263345195729537e-05, + "loss": 0.2928, + "step": 7730 + }, + { + "epoch": 3.436, + "grad_norm": 3.699568748474121, + "learning_rate": 6.261565836298932e-05, + "loss": 1.5317, + "step": 7731 + }, + { + "epoch": 3.4364444444444446, + "grad_norm": 3.111086845397949, + "learning_rate": 6.259786476868328e-05, + "loss": 1.3173, + "step": 7732 + }, + { + "epoch": 3.436888888888889, + "grad_norm": 2.9475185871124268, + "learning_rate": 6.258007117437722e-05, + "loss": 0.8653, + "step": 7733 + }, + { + "epoch": 3.437333333333333, + "grad_norm": 3.5937764644622803, + "learning_rate": 6.256227758007118e-05, + "loss": 1.4628, + "step": 7734 + }, + { + "epoch": 3.437777777777778, + "grad_norm": 3.6315152645111084, + "learning_rate": 6.254448398576512e-05, + "loss": 1.6076, + "step": 7735 + }, + { + "epoch": 3.438222222222222, + "grad_norm": 3.4667177200317383, + "learning_rate": 6.252669039145908e-05, + "loss": 1.4787, + "step": 7736 + }, + { + "epoch": 3.4386666666666668, + "grad_norm": 3.2693827152252197, + "learning_rate": 6.250889679715302e-05, + "loss": 1.1539, + "step": 7737 + }, + { + "epoch": 3.439111111111111, + "grad_norm": 3.5391995906829834, + "learning_rate": 6.249110320284698e-05, + "loss": 1.1326, + "step": 7738 + }, + { + "epoch": 3.4395555555555557, + "grad_norm": 4.569096088409424, + "learning_rate": 6.247330960854094e-05, + "loss": 1.5917, + "step": 7739 + }, + { + "epoch": 3.44, + "grad_norm": 3.872166633605957, + "learning_rate": 6.245551601423488e-05, + "loss": 1.3355, + "step": 7740 + }, + { + "epoch": 3.4404444444444446, + "grad_norm": 3.8824992179870605, + "learning_rate": 6.243772241992882e-05, + "loss": 1.5446, + "step": 7741 + }, + { + "epoch": 3.440888888888889, + "grad_norm": 3.592456579208374, + "learning_rate": 6.241992882562278e-05, + "loss": 0.9791, + "step": 7742 + }, + { + "epoch": 3.4413333333333336, + "grad_norm": 3.4050142765045166, + 
"learning_rate": 6.240213523131672e-05, + "loss": 0.8251, + "step": 7743 + }, + { + "epoch": 3.441777777777778, + "grad_norm": 3.6496431827545166, + "learning_rate": 6.238434163701068e-05, + "loss": 1.2421, + "step": 7744 + }, + { + "epoch": 3.442222222222222, + "grad_norm": 4.084912300109863, + "learning_rate": 6.236654804270463e-05, + "loss": 1.1315, + "step": 7745 + }, + { + "epoch": 3.4426666666666668, + "grad_norm": 4.4305033683776855, + "learning_rate": 6.234875444839858e-05, + "loss": 0.8433, + "step": 7746 + }, + { + "epoch": 3.443111111111111, + "grad_norm": 4.542752742767334, + "learning_rate": 6.233096085409253e-05, + "loss": 1.3728, + "step": 7747 + }, + { + "epoch": 3.4435555555555557, + "grad_norm": 4.393490791320801, + "learning_rate": 6.231316725978648e-05, + "loss": 1.4556, + "step": 7748 + }, + { + "epoch": 3.444, + "grad_norm": 4.165998458862305, + "learning_rate": 6.229537366548043e-05, + "loss": 0.8499, + "step": 7749 + }, + { + "epoch": 3.4444444444444446, + "grad_norm": 3.2756288051605225, + "learning_rate": 6.227758007117438e-05, + "loss": 0.59, + "step": 7750 + }, + { + "epoch": 3.444888888888889, + "grad_norm": 1.687354564666748, + "learning_rate": 6.225978647686833e-05, + "loss": 0.7765, + "step": 7751 + }, + { + "epoch": 3.445333333333333, + "grad_norm": 2.0984833240509033, + "learning_rate": 6.224199288256229e-05, + "loss": 1.1706, + "step": 7752 + }, + { + "epoch": 3.445777777777778, + "grad_norm": 1.5732635259628296, + "learning_rate": 6.222419928825623e-05, + "loss": 0.4636, + "step": 7753 + }, + { + "epoch": 3.446222222222222, + "grad_norm": 2.6701159477233887, + "learning_rate": 6.220640569395018e-05, + "loss": 1.3432, + "step": 7754 + }, + { + "epoch": 3.4466666666666668, + "grad_norm": 2.630073308944702, + "learning_rate": 6.218861209964413e-05, + "loss": 1.229, + "step": 7755 + }, + { + "epoch": 3.447111111111111, + "grad_norm": 3.405998945236206, + "learning_rate": 6.217081850533807e-05, + "loss": 1.6136, + "step": 7756 + }, + { + "epoch": 3.4475555555555557, + "grad_norm": 3.1982181072235107, + "learning_rate": 6.215302491103203e-05, + "loss": 1.4239, + "step": 7757 + }, + { + "epoch": 3.448, + "grad_norm": 3.283080816268921, + "learning_rate": 6.213523131672599e-05, + "loss": 1.5826, + "step": 7758 + }, + { + "epoch": 3.448444444444444, + "grad_norm": 3.861956834793091, + "learning_rate": 6.211743772241993e-05, + "loss": 1.9406, + "step": 7759 + }, + { + "epoch": 3.448888888888889, + "grad_norm": 3.5665135383605957, + "learning_rate": 6.209964412811389e-05, + "loss": 1.5776, + "step": 7760 + }, + { + "epoch": 3.449333333333333, + "grad_norm": 3.1784329414367676, + "learning_rate": 6.208185053380783e-05, + "loss": 1.1607, + "step": 7761 + }, + { + "epoch": 3.449777777777778, + "grad_norm": 3.31244158744812, + "learning_rate": 6.206405693950179e-05, + "loss": 1.2746, + "step": 7762 + }, + { + "epoch": 3.450222222222222, + "grad_norm": 3.8500120639801025, + "learning_rate": 6.204626334519573e-05, + "loss": 1.4645, + "step": 7763 + }, + { + "epoch": 3.4506666666666668, + "grad_norm": 3.650425434112549, + "learning_rate": 6.202846975088969e-05, + "loss": 1.5679, + "step": 7764 + }, + { + "epoch": 3.451111111111111, + "grad_norm": 3.314197063446045, + "learning_rate": 6.201067615658364e-05, + "loss": 1.236, + "step": 7765 + }, + { + "epoch": 3.4515555555555557, + "grad_norm": 3.924046277999878, + "learning_rate": 6.199288256227759e-05, + "loss": 1.4479, + "step": 7766 + }, + { + "epoch": 3.452, + "grad_norm": 4.836942672729492, + "learning_rate": 
6.197508896797153e-05, + "loss": 1.4685, + "step": 7767 + }, + { + "epoch": 3.4524444444444446, + "grad_norm": 3.9449543952941895, + "learning_rate": 6.195729537366548e-05, + "loss": 1.5383, + "step": 7768 + }, + { + "epoch": 3.452888888888889, + "grad_norm": 3.259953737258911, + "learning_rate": 6.193950177935943e-05, + "loss": 1.3374, + "step": 7769 + }, + { + "epoch": 3.453333333333333, + "grad_norm": 3.4236366748809814, + "learning_rate": 6.192170818505338e-05, + "loss": 1.4552, + "step": 7770 + }, + { + "epoch": 3.453777777777778, + "grad_norm": 3.391941547393799, + "learning_rate": 6.190391459074734e-05, + "loss": 1.3679, + "step": 7771 + }, + { + "epoch": 3.454222222222222, + "grad_norm": 2.923438787460327, + "learning_rate": 6.188612099644128e-05, + "loss": 1.0185, + "step": 7772 + }, + { + "epoch": 3.4546666666666668, + "grad_norm": 3.636385202407837, + "learning_rate": 6.186832740213523e-05, + "loss": 1.2137, + "step": 7773 + }, + { + "epoch": 3.455111111111111, + "grad_norm": 3.6334786415100098, + "learning_rate": 6.185053380782918e-05, + "loss": 1.2733, + "step": 7774 + }, + { + "epoch": 3.4555555555555557, + "grad_norm": 3.618476152420044, + "learning_rate": 6.183274021352313e-05, + "loss": 1.4823, + "step": 7775 + }, + { + "epoch": 3.456, + "grad_norm": 3.8086905479431152, + "learning_rate": 6.181494661921708e-05, + "loss": 1.2488, + "step": 7776 + }, + { + "epoch": 3.456444444444444, + "grad_norm": 4.449453353881836, + "learning_rate": 6.179715302491104e-05, + "loss": 1.2205, + "step": 7777 + }, + { + "epoch": 3.456888888888889, + "grad_norm": 2.2830700874328613, + "learning_rate": 6.177935943060498e-05, + "loss": 0.576, + "step": 7778 + }, + { + "epoch": 3.457333333333333, + "grad_norm": 3.2258355617523193, + "learning_rate": 6.176156583629894e-05, + "loss": 1.0031, + "step": 7779 + }, + { + "epoch": 3.457777777777778, + "grad_norm": 3.3974528312683105, + "learning_rate": 6.174377224199288e-05, + "loss": 1.2962, + "step": 7780 + }, + { + "epoch": 3.458222222222222, + "grad_norm": 3.8626692295074463, + "learning_rate": 6.172597864768684e-05, + "loss": 1.663, + "step": 7781 + }, + { + "epoch": 3.458666666666667, + "grad_norm": 2.1425042152404785, + "learning_rate": 6.170818505338078e-05, + "loss": 0.5912, + "step": 7782 + }, + { + "epoch": 3.459111111111111, + "grad_norm": 3.64699649810791, + "learning_rate": 6.169039145907474e-05, + "loss": 1.5683, + "step": 7783 + }, + { + "epoch": 3.4595555555555557, + "grad_norm": 3.0158698558807373, + "learning_rate": 6.16725978647687e-05, + "loss": 1.1897, + "step": 7784 + }, + { + "epoch": 3.46, + "grad_norm": 2.8843777179718018, + "learning_rate": 6.165480427046264e-05, + "loss": 0.9849, + "step": 7785 + }, + { + "epoch": 3.4604444444444447, + "grad_norm": 4.387118339538574, + "learning_rate": 6.163701067615658e-05, + "loss": 1.2511, + "step": 7786 + }, + { + "epoch": 3.460888888888889, + "grad_norm": 3.0603246688842773, + "learning_rate": 6.161921708185054e-05, + "loss": 1.0083, + "step": 7787 + }, + { + "epoch": 3.461333333333333, + "grad_norm": 3.8462274074554443, + "learning_rate": 6.160142348754448e-05, + "loss": 1.4729, + "step": 7788 + }, + { + "epoch": 3.461777777777778, + "grad_norm": 3.917243719100952, + "learning_rate": 6.158362989323844e-05, + "loss": 1.104, + "step": 7789 + }, + { + "epoch": 3.462222222222222, + "grad_norm": 4.193759441375732, + "learning_rate": 6.156583629893239e-05, + "loss": 1.4846, + "step": 7790 + }, + { + "epoch": 3.462666666666667, + "grad_norm": 3.3156356811523438, + "learning_rate": 
6.154804270462634e-05, + "loss": 0.9882, + "step": 7791 + }, + { + "epoch": 3.463111111111111, + "grad_norm": 4.5232062339782715, + "learning_rate": 6.153024911032029e-05, + "loss": 1.1761, + "step": 7792 + }, + { + "epoch": 3.4635555555555557, + "grad_norm": 3.9281444549560547, + "learning_rate": 6.151245551601423e-05, + "loss": 1.3836, + "step": 7793 + }, + { + "epoch": 3.464, + "grad_norm": 5.008172035217285, + "learning_rate": 6.149466192170819e-05, + "loss": 1.4874, + "step": 7794 + }, + { + "epoch": 3.464444444444444, + "grad_norm": 5.387003421783447, + "learning_rate": 6.147686832740213e-05, + "loss": 1.0767, + "step": 7795 + }, + { + "epoch": 3.464888888888889, + "grad_norm": 5.167206764221191, + "learning_rate": 6.145907473309609e-05, + "loss": 1.3537, + "step": 7796 + }, + { + "epoch": 3.465333333333333, + "grad_norm": 3.742565870285034, + "learning_rate": 6.144128113879005e-05, + "loss": 0.7384, + "step": 7797 + }, + { + "epoch": 3.465777777777778, + "grad_norm": 3.991476058959961, + "learning_rate": 6.142348754448399e-05, + "loss": 1.1434, + "step": 7798 + }, + { + "epoch": 3.466222222222222, + "grad_norm": 2.9769797325134277, + "learning_rate": 6.140569395017793e-05, + "loss": 0.496, + "step": 7799 + }, + { + "epoch": 3.466666666666667, + "grad_norm": 3.5614140033721924, + "learning_rate": 6.138790035587189e-05, + "loss": 0.4358, + "step": 7800 + }, + { + "epoch": 3.467111111111111, + "grad_norm": 2.5742027759552, + "learning_rate": 6.137010676156583e-05, + "loss": 2.0281, + "step": 7801 + }, + { + "epoch": 3.4675555555555557, + "grad_norm": 2.53623366355896, + "learning_rate": 6.135231316725979e-05, + "loss": 1.6929, + "step": 7802 + }, + { + "epoch": 3.468, + "grad_norm": 2.944135904312134, + "learning_rate": 6.133451957295375e-05, + "loss": 1.7063, + "step": 7803 + }, + { + "epoch": 3.4684444444444447, + "grad_norm": 2.7259559631347656, + "learning_rate": 6.131672597864769e-05, + "loss": 1.2452, + "step": 7804 + }, + { + "epoch": 3.468888888888889, + "grad_norm": 3.2671380043029785, + "learning_rate": 6.129893238434164e-05, + "loss": 1.7628, + "step": 7805 + }, + { + "epoch": 3.469333333333333, + "grad_norm": 2.8245341777801514, + "learning_rate": 6.128113879003559e-05, + "loss": 1.6624, + "step": 7806 + }, + { + "epoch": 3.469777777777778, + "grad_norm": 2.8411941528320312, + "learning_rate": 6.126334519572954e-05, + "loss": 1.4877, + "step": 7807 + }, + { + "epoch": 3.470222222222222, + "grad_norm": 3.039693593978882, + "learning_rate": 6.124555160142349e-05, + "loss": 1.8058, + "step": 7808 + }, + { + "epoch": 3.470666666666667, + "grad_norm": 3.047823667526245, + "learning_rate": 6.122775800711744e-05, + "loss": 1.0683, + "step": 7809 + }, + { + "epoch": 3.471111111111111, + "grad_norm": 3.6758055686950684, + "learning_rate": 6.12099644128114e-05, + "loss": 1.3514, + "step": 7810 + }, + { + "epoch": 3.4715555555555557, + "grad_norm": 2.9851505756378174, + "learning_rate": 6.119217081850534e-05, + "loss": 1.1167, + "step": 7811 + }, + { + "epoch": 3.472, + "grad_norm": 2.0843656063079834, + "learning_rate": 6.117437722419929e-05, + "loss": 0.533, + "step": 7812 + }, + { + "epoch": 3.4724444444444442, + "grad_norm": 3.570765733718872, + "learning_rate": 6.115658362989324e-05, + "loss": 1.5857, + "step": 7813 + }, + { + "epoch": 3.472888888888889, + "grad_norm": 2.951054573059082, + "learning_rate": 6.113879003558719e-05, + "loss": 0.9994, + "step": 7814 + }, + { + "epoch": 3.473333333333333, + "grad_norm": 3.410421133041382, + "learning_rate": 6.112099644128114e-05, + 
"loss": 1.492, + "step": 7815 + }, + { + "epoch": 3.473777777777778, + "grad_norm": 3.0904409885406494, + "learning_rate": 6.11032028469751e-05, + "loss": 1.6493, + "step": 7816 + }, + { + "epoch": 3.474222222222222, + "grad_norm": 3.2813360691070557, + "learning_rate": 6.108540925266904e-05, + "loss": 1.2129, + "step": 7817 + }, + { + "epoch": 3.474666666666667, + "grad_norm": 3.62602162361145, + "learning_rate": 6.1067615658363e-05, + "loss": 1.2179, + "step": 7818 + }, + { + "epoch": 3.475111111111111, + "grad_norm": 2.9935662746429443, + "learning_rate": 6.104982206405694e-05, + "loss": 1.2209, + "step": 7819 + }, + { + "epoch": 3.4755555555555557, + "grad_norm": 3.7781035900115967, + "learning_rate": 6.103202846975089e-05, + "loss": 1.8109, + "step": 7820 + }, + { + "epoch": 3.476, + "grad_norm": 3.0393505096435547, + "learning_rate": 6.101423487544484e-05, + "loss": 1.3712, + "step": 7821 + }, + { + "epoch": 3.4764444444444447, + "grad_norm": 3.436675786972046, + "learning_rate": 6.09964412811388e-05, + "loss": 1.1229, + "step": 7822 + }, + { + "epoch": 3.476888888888889, + "grad_norm": 3.587766170501709, + "learning_rate": 6.0978647686832747e-05, + "loss": 1.4697, + "step": 7823 + }, + { + "epoch": 3.477333333333333, + "grad_norm": 3.541997194290161, + "learning_rate": 6.0960854092526696e-05, + "loss": 0.978, + "step": 7824 + }, + { + "epoch": 3.477777777777778, + "grad_norm": 4.022648334503174, + "learning_rate": 6.094306049822064e-05, + "loss": 1.1844, + "step": 7825 + }, + { + "epoch": 3.478222222222222, + "grad_norm": 3.1931285858154297, + "learning_rate": 6.092526690391459e-05, + "loss": 1.3828, + "step": 7826 + }, + { + "epoch": 3.478666666666667, + "grad_norm": 2.8355612754821777, + "learning_rate": 6.0907473309608545e-05, + "loss": 0.8536, + "step": 7827 + }, + { + "epoch": 3.479111111111111, + "grad_norm": 3.4258205890655518, + "learning_rate": 6.0889679715302495e-05, + "loss": 1.2377, + "step": 7828 + }, + { + "epoch": 3.4795555555555557, + "grad_norm": 3.4359145164489746, + "learning_rate": 6.0871886120996445e-05, + "loss": 1.2237, + "step": 7829 + }, + { + "epoch": 3.48, + "grad_norm": 3.928776502609253, + "learning_rate": 6.08540925266904e-05, + "loss": 1.4612, + "step": 7830 + }, + { + "epoch": 3.4804444444444442, + "grad_norm": 3.3484604358673096, + "learning_rate": 6.083629893238434e-05, + "loss": 1.3294, + "step": 7831 + }, + { + "epoch": 3.480888888888889, + "grad_norm": 4.023479461669922, + "learning_rate": 6.0818505338078294e-05, + "loss": 1.1446, + "step": 7832 + }, + { + "epoch": 3.481333333333333, + "grad_norm": 4.194918632507324, + "learning_rate": 6.0800711743772244e-05, + "loss": 1.5604, + "step": 7833 + }, + { + "epoch": 3.481777777777778, + "grad_norm": 3.498831033706665, + "learning_rate": 6.0782918149466193e-05, + "loss": 1.2972, + "step": 7834 + }, + { + "epoch": 3.482222222222222, + "grad_norm": 3.6610026359558105, + "learning_rate": 6.076512455516015e-05, + "loss": 1.0893, + "step": 7835 + }, + { + "epoch": 3.482666666666667, + "grad_norm": 3.50296950340271, + "learning_rate": 6.07473309608541e-05, + "loss": 1.252, + "step": 7836 + }, + { + "epoch": 3.483111111111111, + "grad_norm": 3.684080123901367, + "learning_rate": 6.072953736654805e-05, + "loss": 1.3696, + "step": 7837 + }, + { + "epoch": 3.4835555555555557, + "grad_norm": 3.986010789871216, + "learning_rate": 6.071174377224199e-05, + "loss": 1.3597, + "step": 7838 + }, + { + "epoch": 3.484, + "grad_norm": 3.7796928882598877, + "learning_rate": 6.069395017793594e-05, + "loss": 1.2141, + "step": 
7839 + }, + { + "epoch": 3.4844444444444447, + "grad_norm": 3.9027299880981445, + "learning_rate": 6.067615658362989e-05, + "loss": 1.2926, + "step": 7840 + }, + { + "epoch": 3.484888888888889, + "grad_norm": 4.090729236602783, + "learning_rate": 6.065836298932385e-05, + "loss": 1.3292, + "step": 7841 + }, + { + "epoch": 3.485333333333333, + "grad_norm": 3.7317214012145996, + "learning_rate": 6.06405693950178e-05, + "loss": 1.2942, + "step": 7842 + }, + { + "epoch": 3.485777777777778, + "grad_norm": 5.53169584274292, + "learning_rate": 6.062277580071175e-05, + "loss": 1.1903, + "step": 7843 + }, + { + "epoch": 3.486222222222222, + "grad_norm": 4.516229629516602, + "learning_rate": 6.060498220640569e-05, + "loss": 1.246, + "step": 7844 + }, + { + "epoch": 3.486666666666667, + "grad_norm": 4.402373313903809, + "learning_rate": 6.058718861209964e-05, + "loss": 1.4393, + "step": 7845 + }, + { + "epoch": 3.487111111111111, + "grad_norm": 5.0335540771484375, + "learning_rate": 6.05693950177936e-05, + "loss": 0.9533, + "step": 7846 + }, + { + "epoch": 3.4875555555555557, + "grad_norm": 4.421757698059082, + "learning_rate": 6.0551601423487547e-05, + "loss": 1.1198, + "step": 7847 + }, + { + "epoch": 3.488, + "grad_norm": 3.6150312423706055, + "learning_rate": 6.0533807829181496e-05, + "loss": 1.3501, + "step": 7848 + }, + { + "epoch": 3.4884444444444442, + "grad_norm": 3.798158645629883, + "learning_rate": 6.051601423487545e-05, + "loss": 0.9775, + "step": 7849 + }, + { + "epoch": 3.488888888888889, + "grad_norm": 4.989626407623291, + "learning_rate": 6.04982206405694e-05, + "loss": 0.7092, + "step": 7850 + }, + { + "epoch": 3.489333333333333, + "grad_norm": 2.5157763957977295, + "learning_rate": 6.0480427046263345e-05, + "loss": 1.8447, + "step": 7851 + }, + { + "epoch": 3.489777777777778, + "grad_norm": 1.8457545042037964, + "learning_rate": 6.0462633451957295e-05, + "loss": 1.0517, + "step": 7852 + }, + { + "epoch": 3.490222222222222, + "grad_norm": 2.611934185028076, + "learning_rate": 6.0444839857651245e-05, + "loss": 1.78, + "step": 7853 + }, + { + "epoch": 3.490666666666667, + "grad_norm": 0.1728007197380066, + "learning_rate": 6.04270462633452e-05, + "loss": 0.0174, + "step": 7854 + }, + { + "epoch": 3.491111111111111, + "grad_norm": 2.6706738471984863, + "learning_rate": 6.040925266903915e-05, + "loss": 1.605, + "step": 7855 + }, + { + "epoch": 3.4915555555555557, + "grad_norm": 2.728020429611206, + "learning_rate": 6.03914590747331e-05, + "loss": 1.5846, + "step": 7856 + }, + { + "epoch": 3.492, + "grad_norm": 3.039168357849121, + "learning_rate": 6.0373665480427044e-05, + "loss": 1.7239, + "step": 7857 + }, + { + "epoch": 3.4924444444444447, + "grad_norm": 2.816784381866455, + "learning_rate": 6.0355871886120994e-05, + "loss": 1.4047, + "step": 7858 + }, + { + "epoch": 3.492888888888889, + "grad_norm": 2.839526653289795, + "learning_rate": 6.033807829181495e-05, + "loss": 1.4536, + "step": 7859 + }, + { + "epoch": 3.493333333333333, + "grad_norm": 3.2044901847839355, + "learning_rate": 6.03202846975089e-05, + "loss": 1.322, + "step": 7860 + }, + { + "epoch": 3.493777777777778, + "grad_norm": 3.872880458831787, + "learning_rate": 6.030249110320285e-05, + "loss": 1.7948, + "step": 7861 + }, + { + "epoch": 3.494222222222222, + "grad_norm": 2.9369702339172363, + "learning_rate": 6.0284697508896806e-05, + "loss": 1.1952, + "step": 7862 + }, + { + "epoch": 3.494666666666667, + "grad_norm": 2.457634687423706, + "learning_rate": 6.0266903914590756e-05, + "loss": 1.1545, + "step": 7863 + }, + { 
+ "epoch": 3.495111111111111, + "grad_norm": 3.318244457244873, + "learning_rate": 6.02491103202847e-05, + "loss": 1.6481, + "step": 7864 + }, + { + "epoch": 3.4955555555555557, + "grad_norm": 3.1333770751953125, + "learning_rate": 6.023131672597865e-05, + "loss": 1.4185, + "step": 7865 + }, + { + "epoch": 3.496, + "grad_norm": 3.627277135848999, + "learning_rate": 6.02135231316726e-05, + "loss": 1.4683, + "step": 7866 + }, + { + "epoch": 3.4964444444444442, + "grad_norm": 2.975083827972412, + "learning_rate": 6.0195729537366555e-05, + "loss": 1.5541, + "step": 7867 + }, + { + "epoch": 3.496888888888889, + "grad_norm": 2.3885438442230225, + "learning_rate": 6.0177935943060504e-05, + "loss": 1.0314, + "step": 7868 + }, + { + "epoch": 3.497333333333333, + "grad_norm": 3.9095265865325928, + "learning_rate": 6.0160142348754454e-05, + "loss": 1.6982, + "step": 7869 + }, + { + "epoch": 3.497777777777778, + "grad_norm": 5.884151935577393, + "learning_rate": 6.01423487544484e-05, + "loss": 1.5449, + "step": 7870 + }, + { + "epoch": 3.498222222222222, + "grad_norm": 3.240220785140991, + "learning_rate": 6.012455516014235e-05, + "loss": 1.2482, + "step": 7871 + }, + { + "epoch": 3.498666666666667, + "grad_norm": 2.9373934268951416, + "learning_rate": 6.01067615658363e-05, + "loss": 1.3222, + "step": 7872 + }, + { + "epoch": 3.499111111111111, + "grad_norm": 3.125972270965576, + "learning_rate": 6.008896797153025e-05, + "loss": 1.4957, + "step": 7873 + }, + { + "epoch": 3.4995555555555553, + "grad_norm": 3.1987497806549072, + "learning_rate": 6.00711743772242e-05, + "loss": 1.5033, + "step": 7874 + }, + { + "epoch": 3.5, + "grad_norm": 2.531674861907959, + "learning_rate": 6.005338078291816e-05, + "loss": 1.2043, + "step": 7875 + }, + { + "epoch": 3.5004444444444447, + "grad_norm": 3.2194294929504395, + "learning_rate": 6.003558718861211e-05, + "loss": 1.4242, + "step": 7876 + }, + { + "epoch": 3.500888888888889, + "grad_norm": 3.215784788131714, + "learning_rate": 6.001779359430605e-05, + "loss": 1.5275, + "step": 7877 + }, + { + "epoch": 3.501333333333333, + "grad_norm": 2.9531593322753906, + "learning_rate": 6e-05, + "loss": 1.3093, + "step": 7878 + }, + { + "epoch": 3.501777777777778, + "grad_norm": 3.470592737197876, + "learning_rate": 5.998220640569395e-05, + "loss": 1.5293, + "step": 7879 + }, + { + "epoch": 3.502222222222222, + "grad_norm": 3.2422022819519043, + "learning_rate": 5.996441281138791e-05, + "loss": 1.4052, + "step": 7880 + }, + { + "epoch": 3.502666666666667, + "grad_norm": 3.3211963176727295, + "learning_rate": 5.994661921708186e-05, + "loss": 1.304, + "step": 7881 + }, + { + "epoch": 3.503111111111111, + "grad_norm": 3.2284364700317383, + "learning_rate": 5.992882562277581e-05, + "loss": 1.1494, + "step": 7882 + }, + { + "epoch": 3.5035555555555558, + "grad_norm": 3.206890821456909, + "learning_rate": 5.991103202846975e-05, + "loss": 1.296, + "step": 7883 + }, + { + "epoch": 3.504, + "grad_norm": 2.572016716003418, + "learning_rate": 5.98932384341637e-05, + "loss": 0.7784, + "step": 7884 + }, + { + "epoch": 3.5044444444444443, + "grad_norm": 3.5026330947875977, + "learning_rate": 5.987544483985765e-05, + "loss": 1.3138, + "step": 7885 + }, + { + "epoch": 3.504888888888889, + "grad_norm": 3.5628318786621094, + "learning_rate": 5.9857651245551606e-05, + "loss": 1.1652, + "step": 7886 + }, + { + "epoch": 3.505333333333333, + "grad_norm": 3.418849468231201, + "learning_rate": 5.9839857651245556e-05, + "loss": 1.4964, + "step": 7887 + }, + { + "epoch": 3.505777777777778, + 
"grad_norm": 4.8035454750061035, + "learning_rate": 5.9822064056939506e-05, + "loss": 1.8758, + "step": 7888 + }, + { + "epoch": 3.506222222222222, + "grad_norm": 3.9143970012664795, + "learning_rate": 5.980427046263345e-05, + "loss": 1.4264, + "step": 7889 + }, + { + "epoch": 3.506666666666667, + "grad_norm": 3.6827564239501953, + "learning_rate": 5.97864768683274e-05, + "loss": 1.5077, + "step": 7890 + }, + { + "epoch": 3.507111111111111, + "grad_norm": 4.183266639709473, + "learning_rate": 5.9768683274021355e-05, + "loss": 1.2283, + "step": 7891 + }, + { + "epoch": 3.5075555555555553, + "grad_norm": 3.9071977138519287, + "learning_rate": 5.9750889679715304e-05, + "loss": 1.2634, + "step": 7892 + }, + { + "epoch": 3.508, + "grad_norm": 3.631640672683716, + "learning_rate": 5.9733096085409254e-05, + "loss": 1.5404, + "step": 7893 + }, + { + "epoch": 3.5084444444444447, + "grad_norm": 3.552340269088745, + "learning_rate": 5.971530249110321e-05, + "loss": 1.2201, + "step": 7894 + }, + { + "epoch": 3.508888888888889, + "grad_norm": 4.531553268432617, + "learning_rate": 5.969750889679716e-05, + "loss": 1.4324, + "step": 7895 + }, + { + "epoch": 3.509333333333333, + "grad_norm": 3.7588775157928467, + "learning_rate": 5.96797153024911e-05, + "loss": 1.2748, + "step": 7896 + }, + { + "epoch": 3.509777777777778, + "grad_norm": 4.838202476501465, + "learning_rate": 5.966192170818505e-05, + "loss": 1.1187, + "step": 7897 + }, + { + "epoch": 3.510222222222222, + "grad_norm": 4.4196367263793945, + "learning_rate": 5.9644128113879e-05, + "loss": 1.0293, + "step": 7898 + }, + { + "epoch": 3.510666666666667, + "grad_norm": 3.9595701694488525, + "learning_rate": 5.962633451957296e-05, + "loss": 1.232, + "step": 7899 + }, + { + "epoch": 3.511111111111111, + "grad_norm": 5.525210380554199, + "learning_rate": 5.960854092526691e-05, + "loss": 0.8993, + "step": 7900 + }, + { + "epoch": 3.5115555555555558, + "grad_norm": 2.474489212036133, + "learning_rate": 5.959074733096086e-05, + "loss": 1.8906, + "step": 7901 + }, + { + "epoch": 3.512, + "grad_norm": 1.5867375135421753, + "learning_rate": 5.95729537366548e-05, + "loss": 0.3189, + "step": 7902 + }, + { + "epoch": 3.5124444444444443, + "grad_norm": 2.7589077949523926, + "learning_rate": 5.955516014234875e-05, + "loss": 1.5188, + "step": 7903 + }, + { + "epoch": 3.512888888888889, + "grad_norm": 3.1739702224731445, + "learning_rate": 5.953736654804271e-05, + "loss": 1.4973, + "step": 7904 + }, + { + "epoch": 3.513333333333333, + "grad_norm": 3.220743179321289, + "learning_rate": 5.951957295373666e-05, + "loss": 1.7824, + "step": 7905 + }, + { + "epoch": 3.513777777777778, + "grad_norm": 3.1341934204101562, + "learning_rate": 5.950177935943061e-05, + "loss": 1.53, + "step": 7906 + }, + { + "epoch": 3.514222222222222, + "grad_norm": 2.954864501953125, + "learning_rate": 5.9483985765124564e-05, + "loss": 1.0464, + "step": 7907 + }, + { + "epoch": 3.514666666666667, + "grad_norm": 2.6394574642181396, + "learning_rate": 5.9466192170818513e-05, + "loss": 1.4441, + "step": 7908 + }, + { + "epoch": 3.515111111111111, + "grad_norm": 3.118455648422241, + "learning_rate": 5.9448398576512456e-05, + "loss": 1.438, + "step": 7909 + }, + { + "epoch": 3.5155555555555553, + "grad_norm": 2.812499523162842, + "learning_rate": 5.9430604982206406e-05, + "loss": 1.32, + "step": 7910 + }, + { + "epoch": 3.516, + "grad_norm": 3.96679425239563, + "learning_rate": 5.9412811387900356e-05, + "loss": 1.5107, + "step": 7911 + }, + { + "epoch": 3.5164444444444447, + "grad_norm": 
3.8176608085632324, + "learning_rate": 5.939501779359431e-05, + "loss": 1.7569, + "step": 7912 + }, + { + "epoch": 3.516888888888889, + "grad_norm": 3.084055185317993, + "learning_rate": 5.937722419928826e-05, + "loss": 1.4477, + "step": 7913 + }, + { + "epoch": 3.517333333333333, + "grad_norm": 3.462454319000244, + "learning_rate": 5.935943060498221e-05, + "loss": 1.5585, + "step": 7914 + }, + { + "epoch": 3.517777777777778, + "grad_norm": 2.181887149810791, + "learning_rate": 5.9341637010676155e-05, + "loss": 0.452, + "step": 7915 + }, + { + "epoch": 3.518222222222222, + "grad_norm": 3.213517189025879, + "learning_rate": 5.9323843416370104e-05, + "loss": 1.5827, + "step": 7916 + }, + { + "epoch": 3.518666666666667, + "grad_norm": 3.1706907749176025, + "learning_rate": 5.930604982206406e-05, + "loss": 1.1167, + "step": 7917 + }, + { + "epoch": 3.519111111111111, + "grad_norm": 3.325066566467285, + "learning_rate": 5.928825622775801e-05, + "loss": 1.2163, + "step": 7918 + }, + { + "epoch": 3.5195555555555558, + "grad_norm": 3.789510488510132, + "learning_rate": 5.927046263345196e-05, + "loss": 1.5674, + "step": 7919 + }, + { + "epoch": 3.52, + "grad_norm": 3.43265962600708, + "learning_rate": 5.925266903914592e-05, + "loss": 1.3059, + "step": 7920 + }, + { + "epoch": 3.5204444444444443, + "grad_norm": 3.1737518310546875, + "learning_rate": 5.923487544483987e-05, + "loss": 1.1704, + "step": 7921 + }, + { + "epoch": 3.520888888888889, + "grad_norm": 3.2082409858703613, + "learning_rate": 5.921708185053381e-05, + "loss": 1.5402, + "step": 7922 + }, + { + "epoch": 3.521333333333333, + "grad_norm": 3.5814785957336426, + "learning_rate": 5.919928825622776e-05, + "loss": 1.4676, + "step": 7923 + }, + { + "epoch": 3.521777777777778, + "grad_norm": 3.379570722579956, + "learning_rate": 5.918149466192171e-05, + "loss": 1.1777, + "step": 7924 + }, + { + "epoch": 3.522222222222222, + "grad_norm": 3.4962823390960693, + "learning_rate": 5.9163701067615666e-05, + "loss": 1.6759, + "step": 7925 + }, + { + "epoch": 3.522666666666667, + "grad_norm": 3.593132495880127, + "learning_rate": 5.9145907473309615e-05, + "loss": 1.5066, + "step": 7926 + }, + { + "epoch": 3.523111111111111, + "grad_norm": 3.335500478744507, + "learning_rate": 5.9128113879003565e-05, + "loss": 0.9421, + "step": 7927 + }, + { + "epoch": 3.5235555555555553, + "grad_norm": 3.0314064025878906, + "learning_rate": 5.911032028469751e-05, + "loss": 1.0284, + "step": 7928 + }, + { + "epoch": 3.524, + "grad_norm": 3.651460886001587, + "learning_rate": 5.909252669039146e-05, + "loss": 1.3157, + "step": 7929 + }, + { + "epoch": 3.5244444444444447, + "grad_norm": 3.4120965003967285, + "learning_rate": 5.907473309608541e-05, + "loss": 1.4894, + "step": 7930 + }, + { + "epoch": 3.524888888888889, + "grad_norm": 4.027737140655518, + "learning_rate": 5.9056939501779364e-05, + "loss": 1.6357, + "step": 7931 + }, + { + "epoch": 3.525333333333333, + "grad_norm": 2.242091178894043, + "learning_rate": 5.9039145907473314e-05, + "loss": 0.5504, + "step": 7932 + }, + { + "epoch": 3.525777777777778, + "grad_norm": 3.149824380874634, + "learning_rate": 5.902135231316726e-05, + "loss": 0.8101, + "step": 7933 + }, + { + "epoch": 3.526222222222222, + "grad_norm": 2.0692434310913086, + "learning_rate": 5.900355871886122e-05, + "loss": 0.4684, + "step": 7934 + }, + { + "epoch": 3.5266666666666664, + "grad_norm": 3.153052806854248, + "learning_rate": 5.8985765124555156e-05, + "loss": 1.1421, + "step": 7935 + }, + { + "epoch": 3.527111111111111, + "grad_norm": 
3.0566887855529785, + "learning_rate": 5.896797153024911e-05, + "loss": 1.0727, + "step": 7936 + }, + { + "epoch": 3.5275555555555558, + "grad_norm": 3.488640546798706, + "learning_rate": 5.895017793594306e-05, + "loss": 1.2473, + "step": 7937 + }, + { + "epoch": 3.528, + "grad_norm": 3.4758243560791016, + "learning_rate": 5.893238434163701e-05, + "loss": 0.9908, + "step": 7938 + }, + { + "epoch": 3.5284444444444443, + "grad_norm": 5.093916416168213, + "learning_rate": 5.891459074733097e-05, + "loss": 1.1238, + "step": 7939 + }, + { + "epoch": 3.528888888888889, + "grad_norm": 3.6023635864257812, + "learning_rate": 5.889679715302492e-05, + "loss": 0.8337, + "step": 7940 + }, + { + "epoch": 3.529333333333333, + "grad_norm": 3.5062379837036133, + "learning_rate": 5.887900355871886e-05, + "loss": 1.244, + "step": 7941 + }, + { + "epoch": 3.529777777777778, + "grad_norm": 3.3245818614959717, + "learning_rate": 5.886120996441281e-05, + "loss": 1.0635, + "step": 7942 + }, + { + "epoch": 3.530222222222222, + "grad_norm": 3.718571662902832, + "learning_rate": 5.884341637010676e-05, + "loss": 1.2694, + "step": 7943 + }, + { + "epoch": 3.530666666666667, + "grad_norm": 4.398813247680664, + "learning_rate": 5.882562277580072e-05, + "loss": 1.2429, + "step": 7944 + }, + { + "epoch": 3.531111111111111, + "grad_norm": 3.6693432331085205, + "learning_rate": 5.880782918149467e-05, + "loss": 1.0828, + "step": 7945 + }, + { + "epoch": 3.5315555555555553, + "grad_norm": 4.250223159790039, + "learning_rate": 5.8790035587188616e-05, + "loss": 1.6145, + "step": 7946 + }, + { + "epoch": 3.532, + "grad_norm": 2.441047191619873, + "learning_rate": 5.877224199288256e-05, + "loss": 0.279, + "step": 7947 + }, + { + "epoch": 3.5324444444444447, + "grad_norm": 4.091504096984863, + "learning_rate": 5.875444839857651e-05, + "loss": 0.6715, + "step": 7948 + }, + { + "epoch": 3.532888888888889, + "grad_norm": 5.657823085784912, + "learning_rate": 5.8736654804270466e-05, + "loss": 1.2939, + "step": 7949 + }, + { + "epoch": 3.533333333333333, + "grad_norm": 4.15081787109375, + "learning_rate": 5.8718861209964415e-05, + "loss": 0.4457, + "step": 7950 + }, + { + "epoch": 3.533777777777778, + "grad_norm": 2.6434662342071533, + "learning_rate": 5.8701067615658365e-05, + "loss": 2.3563, + "step": 7951 + }, + { + "epoch": 3.534222222222222, + "grad_norm": 2.4858129024505615, + "learning_rate": 5.868327402135232e-05, + "loss": 1.8796, + "step": 7952 + }, + { + "epoch": 3.5346666666666664, + "grad_norm": 3.0322587490081787, + "learning_rate": 5.866548042704627e-05, + "loss": 1.9928, + "step": 7953 + }, + { + "epoch": 3.535111111111111, + "grad_norm": 2.7141330242156982, + "learning_rate": 5.8647686832740214e-05, + "loss": 1.6356, + "step": 7954 + }, + { + "epoch": 3.535555555555556, + "grad_norm": 3.070211172103882, + "learning_rate": 5.8629893238434164e-05, + "loss": 1.3506, + "step": 7955 + }, + { + "epoch": 3.536, + "grad_norm": 3.548081159591675, + "learning_rate": 5.8612099644128114e-05, + "loss": 1.8977, + "step": 7956 + }, + { + "epoch": 3.5364444444444443, + "grad_norm": 3.189389228820801, + "learning_rate": 5.859430604982207e-05, + "loss": 1.5622, + "step": 7957 + }, + { + "epoch": 3.536888888888889, + "grad_norm": 3.3881306648254395, + "learning_rate": 5.857651245551602e-05, + "loss": 1.7807, + "step": 7958 + }, + { + "epoch": 3.537333333333333, + "grad_norm": 2.211007595062256, + "learning_rate": 5.855871886120997e-05, + "loss": 0.688, + "step": 7959 + }, + { + "epoch": 3.537777777777778, + "grad_norm": 3.352668523788452, 
+ "learning_rate": 5.854092526690391e-05, + "loss": 1.8142, + "step": 7960 + }, + { + "epoch": 3.538222222222222, + "grad_norm": 3.3707339763641357, + "learning_rate": 5.852313167259786e-05, + "loss": 1.3783, + "step": 7961 + }, + { + "epoch": 3.538666666666667, + "grad_norm": 3.5272951126098633, + "learning_rate": 5.850533807829182e-05, + "loss": 1.5301, + "step": 7962 + }, + { + "epoch": 3.539111111111111, + "grad_norm": 3.348961353302002, + "learning_rate": 5.848754448398577e-05, + "loss": 1.2704, + "step": 7963 + }, + { + "epoch": 3.5395555555555553, + "grad_norm": 3.5585920810699463, + "learning_rate": 5.846975088967972e-05, + "loss": 1.3461, + "step": 7964 + }, + { + "epoch": 3.54, + "grad_norm": 3.355560064315796, + "learning_rate": 5.8451957295373675e-05, + "loss": 1.4307, + "step": 7965 + }, + { + "epoch": 3.5404444444444443, + "grad_norm": 3.4020283222198486, + "learning_rate": 5.8434163701067624e-05, + "loss": 1.4312, + "step": 7966 + }, + { + "epoch": 3.540888888888889, + "grad_norm": 3.8027524948120117, + "learning_rate": 5.841637010676157e-05, + "loss": 1.6558, + "step": 7967 + }, + { + "epoch": 3.541333333333333, + "grad_norm": 3.7693047523498535, + "learning_rate": 5.839857651245552e-05, + "loss": 1.2358, + "step": 7968 + }, + { + "epoch": 3.541777777777778, + "grad_norm": 0.21360933780670166, + "learning_rate": 5.838078291814947e-05, + "loss": 0.0254, + "step": 7969 + }, + { + "epoch": 3.542222222222222, + "grad_norm": 2.69741153717041, + "learning_rate": 5.836298932384342e-05, + "loss": 1.2167, + "step": 7970 + }, + { + "epoch": 3.5426666666666664, + "grad_norm": 2.8727669715881348, + "learning_rate": 5.834519572953737e-05, + "loss": 1.0769, + "step": 7971 + }, + { + "epoch": 3.543111111111111, + "grad_norm": 2.91815447807312, + "learning_rate": 5.832740213523132e-05, + "loss": 1.0958, + "step": 7972 + }, + { + "epoch": 3.543555555555556, + "grad_norm": 3.0180909633636475, + "learning_rate": 5.8309608540925266e-05, + "loss": 1.0416, + "step": 7973 + }, + { + "epoch": 3.544, + "grad_norm": 3.282245635986328, + "learning_rate": 5.8291814946619215e-05, + "loss": 1.1943, + "step": 7974 + }, + { + "epoch": 3.5444444444444443, + "grad_norm": 3.464338779449463, + "learning_rate": 5.8274021352313165e-05, + "loss": 1.088, + "step": 7975 + }, + { + "epoch": 3.544888888888889, + "grad_norm": 3.1577398777008057, + "learning_rate": 5.825622775800712e-05, + "loss": 1.3338, + "step": 7976 + }, + { + "epoch": 3.5453333333333332, + "grad_norm": 3.2722463607788086, + "learning_rate": 5.823843416370107e-05, + "loss": 1.2914, + "step": 7977 + }, + { + "epoch": 3.545777777777778, + "grad_norm": 3.671210527420044, + "learning_rate": 5.822064056939502e-05, + "loss": 1.3975, + "step": 7978 + }, + { + "epoch": 3.546222222222222, + "grad_norm": 3.583202600479126, + "learning_rate": 5.820284697508898e-05, + "loss": 1.2501, + "step": 7979 + }, + { + "epoch": 3.546666666666667, + "grad_norm": 2.0527262687683105, + "learning_rate": 5.8185053380782914e-05, + "loss": 0.5888, + "step": 7980 + }, + { + "epoch": 3.547111111111111, + "grad_norm": 3.583556890487671, + "learning_rate": 5.816725978647687e-05, + "loss": 1.0753, + "step": 7981 + }, + { + "epoch": 3.5475555555555554, + "grad_norm": 3.821152687072754, + "learning_rate": 5.814946619217082e-05, + "loss": 1.311, + "step": 7982 + }, + { + "epoch": 3.548, + "grad_norm": 3.369014263153076, + "learning_rate": 5.813167259786477e-05, + "loss": 1.2148, + "step": 7983 + }, + { + "epoch": 3.5484444444444443, + "grad_norm": 3.3493340015411377, + 
"learning_rate": 5.8113879003558726e-05, + "loss": 1.1789, + "step": 7984 + }, + { + "epoch": 3.548888888888889, + "grad_norm": 2.961345672607422, + "learning_rate": 5.8096085409252676e-05, + "loss": 1.0513, + "step": 7985 + }, + { + "epoch": 3.5493333333333332, + "grad_norm": 3.708880662918091, + "learning_rate": 5.807829181494662e-05, + "loss": 1.3949, + "step": 7986 + }, + { + "epoch": 3.549777777777778, + "grad_norm": 3.2943549156188965, + "learning_rate": 5.806049822064057e-05, + "loss": 1.1287, + "step": 7987 + }, + { + "epoch": 3.550222222222222, + "grad_norm": 3.0104446411132812, + "learning_rate": 5.804270462633452e-05, + "loss": 1.121, + "step": 7988 + }, + { + "epoch": 3.5506666666666664, + "grad_norm": 3.7163071632385254, + "learning_rate": 5.8024911032028475e-05, + "loss": 1.0807, + "step": 7989 + }, + { + "epoch": 3.551111111111111, + "grad_norm": 3.9415080547332764, + "learning_rate": 5.8007117437722425e-05, + "loss": 1.3248, + "step": 7990 + }, + { + "epoch": 3.551555555555556, + "grad_norm": 4.1947245597839355, + "learning_rate": 5.7989323843416374e-05, + "loss": 1.5827, + "step": 7991 + }, + { + "epoch": 3.552, + "grad_norm": 4.378093719482422, + "learning_rate": 5.797153024911033e-05, + "loss": 1.2251, + "step": 7992 + }, + { + "epoch": 3.5524444444444443, + "grad_norm": 3.898223638534546, + "learning_rate": 5.795373665480427e-05, + "loss": 1.4066, + "step": 7993 + }, + { + "epoch": 3.552888888888889, + "grad_norm": 4.775320529937744, + "learning_rate": 5.7935943060498223e-05, + "loss": 1.2938, + "step": 7994 + }, + { + "epoch": 3.5533333333333332, + "grad_norm": 4.527971267700195, + "learning_rate": 5.791814946619217e-05, + "loss": 1.3705, + "step": 7995 + }, + { + "epoch": 3.553777777777778, + "grad_norm": 4.258374214172363, + "learning_rate": 5.790035587188612e-05, + "loss": 1.2344, + "step": 7996 + }, + { + "epoch": 3.554222222222222, + "grad_norm": 3.4799654483795166, + "learning_rate": 5.788256227758008e-05, + "loss": 1.1858, + "step": 7997 + }, + { + "epoch": 3.554666666666667, + "grad_norm": 5.774474620819092, + "learning_rate": 5.786476868327403e-05, + "loss": 1.6127, + "step": 7998 + }, + { + "epoch": 3.555111111111111, + "grad_norm": 4.93737268447876, + "learning_rate": 5.784697508896797e-05, + "loss": 1.2544, + "step": 7999 + }, + { + "epoch": 3.5555555555555554, + "grad_norm": 3.601134777069092, + "learning_rate": 5.782918149466192e-05, + "loss": 0.5138, + "step": 8000 + }, + { + "epoch": 3.556, + "grad_norm": 2.7162842750549316, + "learning_rate": 5.781138790035587e-05, + "loss": 2.1305, + "step": 8001 + }, + { + "epoch": 3.5564444444444443, + "grad_norm": 2.303110361099243, + "learning_rate": 5.779359430604983e-05, + "loss": 1.6483, + "step": 8002 + }, + { + "epoch": 3.556888888888889, + "grad_norm": 2.755901575088501, + "learning_rate": 5.777580071174378e-05, + "loss": 1.8632, + "step": 8003 + }, + { + "epoch": 3.5573333333333332, + "grad_norm": 1.7043906450271606, + "learning_rate": 5.775800711743773e-05, + "loss": 0.9941, + "step": 8004 + }, + { + "epoch": 3.557777777777778, + "grad_norm": 1.6068474054336548, + "learning_rate": 5.774021352313167e-05, + "loss": 0.6672, + "step": 8005 + }, + { + "epoch": 3.558222222222222, + "grad_norm": 2.530790328979492, + "learning_rate": 5.772241992882562e-05, + "loss": 1.2947, + "step": 8006 + }, + { + "epoch": 3.5586666666666664, + "grad_norm": 3.187232732772827, + "learning_rate": 5.7704626334519577e-05, + "loss": 1.9547, + "step": 8007 + }, + { + "epoch": 3.559111111111111, + "grad_norm": 2.78572416305542, + 
"learning_rate": 5.7686832740213526e-05, + "loss": 1.5949, + "step": 8008 + }, + { + "epoch": 3.559555555555556, + "grad_norm": 3.9255530834198, + "learning_rate": 5.7669039145907476e-05, + "loss": 1.6875, + "step": 8009 + }, + { + "epoch": 3.56, + "grad_norm": 2.8568902015686035, + "learning_rate": 5.765124555160143e-05, + "loss": 1.7145, + "step": 8010 + }, + { + "epoch": 3.5604444444444443, + "grad_norm": 2.9281044006347656, + "learning_rate": 5.763345195729538e-05, + "loss": 1.518, + "step": 8011 + }, + { + "epoch": 3.560888888888889, + "grad_norm": 2.7695422172546387, + "learning_rate": 5.7615658362989325e-05, + "loss": 1.665, + "step": 8012 + }, + { + "epoch": 3.5613333333333332, + "grad_norm": 3.807720899581909, + "learning_rate": 5.7597864768683275e-05, + "loss": 1.395, + "step": 8013 + }, + { + "epoch": 3.561777777777778, + "grad_norm": 3.4705801010131836, + "learning_rate": 5.7580071174377225e-05, + "loss": 1.4524, + "step": 8014 + }, + { + "epoch": 3.562222222222222, + "grad_norm": 2.88387131690979, + "learning_rate": 5.756227758007118e-05, + "loss": 1.1103, + "step": 8015 + }, + { + "epoch": 3.562666666666667, + "grad_norm": 2.9393246173858643, + "learning_rate": 5.754448398576513e-05, + "loss": 1.0856, + "step": 8016 + }, + { + "epoch": 3.563111111111111, + "grad_norm": 3.0343334674835205, + "learning_rate": 5.752669039145908e-05, + "loss": 1.3709, + "step": 8017 + }, + { + "epoch": 3.5635555555555554, + "grad_norm": 2.985978126525879, + "learning_rate": 5.7508896797153023e-05, + "loss": 1.36, + "step": 8018 + }, + { + "epoch": 3.564, + "grad_norm": 3.5497658252716064, + "learning_rate": 5.749110320284697e-05, + "loss": 1.4632, + "step": 8019 + }, + { + "epoch": 3.5644444444444443, + "grad_norm": 2.939556837081909, + "learning_rate": 5.747330960854092e-05, + "loss": 1.417, + "step": 8020 + }, + { + "epoch": 3.564888888888889, + "grad_norm": 2.9384403228759766, + "learning_rate": 5.745551601423488e-05, + "loss": 0.9369, + "step": 8021 + }, + { + "epoch": 3.5653333333333332, + "grad_norm": 3.302537441253662, + "learning_rate": 5.743772241992883e-05, + "loss": 1.592, + "step": 8022 + }, + { + "epoch": 3.565777777777778, + "grad_norm": 3.0296859741210938, + "learning_rate": 5.741992882562278e-05, + "loss": 1.3002, + "step": 8023 + }, + { + "epoch": 3.566222222222222, + "grad_norm": 2.4445323944091797, + "learning_rate": 5.7402135231316735e-05, + "loss": 1.0328, + "step": 8024 + }, + { + "epoch": 3.5666666666666664, + "grad_norm": 2.847886562347412, + "learning_rate": 5.738434163701067e-05, + "loss": 1.151, + "step": 8025 + }, + { + "epoch": 3.567111111111111, + "grad_norm": 3.6882436275482178, + "learning_rate": 5.736654804270463e-05, + "loss": 1.717, + "step": 8026 + }, + { + "epoch": 3.567555555555556, + "grad_norm": 2.9185895919799805, + "learning_rate": 5.734875444839858e-05, + "loss": 1.4205, + "step": 8027 + }, + { + "epoch": 3.568, + "grad_norm": 3.1053783893585205, + "learning_rate": 5.733096085409253e-05, + "loss": 1.2341, + "step": 8028 + }, + { + "epoch": 3.5684444444444443, + "grad_norm": 3.3123791217803955, + "learning_rate": 5.7313167259786484e-05, + "loss": 1.1749, + "step": 8029 + }, + { + "epoch": 3.568888888888889, + "grad_norm": 3.7100796699523926, + "learning_rate": 5.7295373665480434e-05, + "loss": 1.2908, + "step": 8030 + }, + { + "epoch": 3.5693333333333332, + "grad_norm": 2.9911487102508545, + "learning_rate": 5.727758007117438e-05, + "loss": 0.8336, + "step": 8031 + }, + { + "epoch": 3.569777777777778, + "grad_norm": 4.015035629272461, + "learning_rate": 
5.7259786476868326e-05, + "loss": 1.8666, + "step": 8032 + }, + { + "epoch": 3.570222222222222, + "grad_norm": 3.5821290016174316, + "learning_rate": 5.7241992882562276e-05, + "loss": 1.1474, + "step": 8033 + }, + { + "epoch": 3.570666666666667, + "grad_norm": 3.538398265838623, + "learning_rate": 5.722419928825623e-05, + "loss": 0.8516, + "step": 8034 + }, + { + "epoch": 3.571111111111111, + "grad_norm": 3.570612907409668, + "learning_rate": 5.720640569395018e-05, + "loss": 1.4082, + "step": 8035 + }, + { + "epoch": 3.5715555555555554, + "grad_norm": 2.5072994232177734, + "learning_rate": 5.718861209964413e-05, + "loss": 0.6298, + "step": 8036 + }, + { + "epoch": 3.572, + "grad_norm": 3.3981175422668457, + "learning_rate": 5.717081850533809e-05, + "loss": 1.1501, + "step": 8037 + }, + { + "epoch": 3.5724444444444443, + "grad_norm": 4.341430187225342, + "learning_rate": 5.7153024911032025e-05, + "loss": 1.4794, + "step": 8038 + }, + { + "epoch": 3.572888888888889, + "grad_norm": 4.046430587768555, + "learning_rate": 5.713523131672598e-05, + "loss": 1.3761, + "step": 8039 + }, + { + "epoch": 3.5733333333333333, + "grad_norm": 3.6594674587249756, + "learning_rate": 5.711743772241993e-05, + "loss": 1.5613, + "step": 8040 + }, + { + "epoch": 3.573777777777778, + "grad_norm": 4.007656574249268, + "learning_rate": 5.709964412811388e-05, + "loss": 1.3619, + "step": 8041 + }, + { + "epoch": 3.574222222222222, + "grad_norm": 4.279335021972656, + "learning_rate": 5.708185053380784e-05, + "loss": 1.6566, + "step": 8042 + }, + { + "epoch": 3.5746666666666664, + "grad_norm": 3.4057838916778564, + "learning_rate": 5.706405693950179e-05, + "loss": 1.2244, + "step": 8043 + }, + { + "epoch": 3.575111111111111, + "grad_norm": 3.479970932006836, + "learning_rate": 5.704626334519573e-05, + "loss": 1.4622, + "step": 8044 + }, + { + "epoch": 3.575555555555556, + "grad_norm": 3.1348230838775635, + "learning_rate": 5.702846975088968e-05, + "loss": 1.2252, + "step": 8045 + }, + { + "epoch": 3.576, + "grad_norm": 3.5639426708221436, + "learning_rate": 5.701067615658363e-05, + "loss": 1.0362, + "step": 8046 + }, + { + "epoch": 3.5764444444444443, + "grad_norm": 3.8750410079956055, + "learning_rate": 5.6992882562277586e-05, + "loss": 0.9873, + "step": 8047 + }, + { + "epoch": 3.576888888888889, + "grad_norm": 4.268735885620117, + "learning_rate": 5.6975088967971535e-05, + "loss": 1.7215, + "step": 8048 + }, + { + "epoch": 3.5773333333333333, + "grad_norm": 3.788827657699585, + "learning_rate": 5.6957295373665485e-05, + "loss": 1.0994, + "step": 8049 + }, + { + "epoch": 3.5777777777777775, + "grad_norm": 3.688361644744873, + "learning_rate": 5.693950177935944e-05, + "loss": 0.2493, + "step": 8050 + }, + { + "epoch": 3.578222222222222, + "grad_norm": 2.1738009452819824, + "learning_rate": 5.692170818505338e-05, + "loss": 1.7641, + "step": 8051 + }, + { + "epoch": 3.578666666666667, + "grad_norm": 2.439173698425293, + "learning_rate": 5.6903914590747334e-05, + "loss": 2.0032, + "step": 8052 + }, + { + "epoch": 3.579111111111111, + "grad_norm": 2.087841749191284, + "learning_rate": 5.6886120996441284e-05, + "loss": 1.0669, + "step": 8053 + }, + { + "epoch": 3.5795555555555554, + "grad_norm": 2.6237778663635254, + "learning_rate": 5.6868327402135234e-05, + "loss": 1.5711, + "step": 8054 + }, + { + "epoch": 3.58, + "grad_norm": 2.9692187309265137, + "learning_rate": 5.685053380782919e-05, + "loss": 1.553, + "step": 8055 + }, + { + "epoch": 3.5804444444444443, + "grad_norm": 2.481410264968872, + "learning_rate": 
5.683274021352314e-05, + "loss": 0.8767, + "step": 8056 + }, + { + "epoch": 3.580888888888889, + "grad_norm": 3.197645425796509, + "learning_rate": 5.681494661921708e-05, + "loss": 1.7859, + "step": 8057 + }, + { + "epoch": 3.5813333333333333, + "grad_norm": 3.208965301513672, + "learning_rate": 5.679715302491103e-05, + "loss": 1.5878, + "step": 8058 + }, + { + "epoch": 3.581777777777778, + "grad_norm": 3.1050381660461426, + "learning_rate": 5.677935943060498e-05, + "loss": 1.4308, + "step": 8059 + }, + { + "epoch": 3.582222222222222, + "grad_norm": 3.836418867111206, + "learning_rate": 5.676156583629894e-05, + "loss": 1.6935, + "step": 8060 + }, + { + "epoch": 3.5826666666666664, + "grad_norm": 3.386842727661133, + "learning_rate": 5.674377224199289e-05, + "loss": 1.5557, + "step": 8061 + }, + { + "epoch": 3.583111111111111, + "grad_norm": 2.919567108154297, + "learning_rate": 5.672597864768684e-05, + "loss": 1.1336, + "step": 8062 + }, + { + "epoch": 3.583555555555556, + "grad_norm": 3.271778106689453, + "learning_rate": 5.670818505338078e-05, + "loss": 1.3239, + "step": 8063 + }, + { + "epoch": 3.584, + "grad_norm": 4.067761421203613, + "learning_rate": 5.669039145907473e-05, + "loss": 1.423, + "step": 8064 + }, + { + "epoch": 3.5844444444444443, + "grad_norm": 3.229156970977783, + "learning_rate": 5.667259786476868e-05, + "loss": 1.3032, + "step": 8065 + }, + { + "epoch": 3.584888888888889, + "grad_norm": 3.7134037017822266, + "learning_rate": 5.665480427046264e-05, + "loss": 1.2842, + "step": 8066 + }, + { + "epoch": 3.5853333333333333, + "grad_norm": 3.65313982963562, + "learning_rate": 5.663701067615659e-05, + "loss": 1.4233, + "step": 8067 + }, + { + "epoch": 3.5857777777777775, + "grad_norm": 3.398033857345581, + "learning_rate": 5.661921708185054e-05, + "loss": 1.3633, + "step": 8068 + }, + { + "epoch": 3.586222222222222, + "grad_norm": 3.1917221546173096, + "learning_rate": 5.660142348754449e-05, + "loss": 1.0808, + "step": 8069 + }, + { + "epoch": 3.586666666666667, + "grad_norm": 3.6049952507019043, + "learning_rate": 5.658362989323843e-05, + "loss": 1.7095, + "step": 8070 + }, + { + "epoch": 3.587111111111111, + "grad_norm": 3.4335100650787354, + "learning_rate": 5.6565836298932386e-05, + "loss": 1.3466, + "step": 8071 + }, + { + "epoch": 3.5875555555555554, + "grad_norm": 3.463630437850952, + "learning_rate": 5.6548042704626336e-05, + "loss": 1.2838, + "step": 8072 + }, + { + "epoch": 3.588, + "grad_norm": 3.933293104171753, + "learning_rate": 5.6530249110320285e-05, + "loss": 1.4277, + "step": 8073 + }, + { + "epoch": 3.5884444444444443, + "grad_norm": 3.340559482574463, + "learning_rate": 5.651245551601424e-05, + "loss": 1.2219, + "step": 8074 + }, + { + "epoch": 3.588888888888889, + "grad_norm": 4.030251979827881, + "learning_rate": 5.649466192170819e-05, + "loss": 1.4176, + "step": 8075 + }, + { + "epoch": 3.5893333333333333, + "grad_norm": 3.1213886737823486, + "learning_rate": 5.6476868327402134e-05, + "loss": 1.2696, + "step": 8076 + }, + { + "epoch": 3.589777777777778, + "grad_norm": 2.9783213138580322, + "learning_rate": 5.6459074733096084e-05, + "loss": 1.2995, + "step": 8077 + }, + { + "epoch": 3.590222222222222, + "grad_norm": 2.2861247062683105, + "learning_rate": 5.6441281138790034e-05, + "loss": 0.642, + "step": 8078 + }, + { + "epoch": 3.5906666666666665, + "grad_norm": 3.1233749389648438, + "learning_rate": 5.642348754448399e-05, + "loss": 0.9259, + "step": 8079 + }, + { + "epoch": 3.591111111111111, + "grad_norm": 3.5224452018737793, + "learning_rate": 
5.640569395017794e-05, + "loss": 1.4473, + "step": 8080 + }, + { + "epoch": 3.5915555555555554, + "grad_norm": 3.7169413566589355, + "learning_rate": 5.638790035587189e-05, + "loss": 1.1774, + "step": 8081 + }, + { + "epoch": 3.592, + "grad_norm": 3.676466226577759, + "learning_rate": 5.6370106761565846e-05, + "loss": 1.4331, + "step": 8082 + }, + { + "epoch": 3.5924444444444443, + "grad_norm": 3.7899556159973145, + "learning_rate": 5.635231316725978e-05, + "loss": 1.6973, + "step": 8083 + }, + { + "epoch": 3.592888888888889, + "grad_norm": 4.713149070739746, + "learning_rate": 5.633451957295374e-05, + "loss": 1.4462, + "step": 8084 + }, + { + "epoch": 3.5933333333333333, + "grad_norm": 3.4206809997558594, + "learning_rate": 5.631672597864769e-05, + "loss": 1.2807, + "step": 8085 + }, + { + "epoch": 3.5937777777777775, + "grad_norm": 3.8811004161834717, + "learning_rate": 5.629893238434164e-05, + "loss": 1.2656, + "step": 8086 + }, + { + "epoch": 3.594222222222222, + "grad_norm": 3.997771978378296, + "learning_rate": 5.6281138790035595e-05, + "loss": 1.5387, + "step": 8087 + }, + { + "epoch": 3.594666666666667, + "grad_norm": 3.7929420471191406, + "learning_rate": 5.6263345195729545e-05, + "loss": 1.2682, + "step": 8088 + }, + { + "epoch": 3.595111111111111, + "grad_norm": 3.0101442337036133, + "learning_rate": 5.624555160142349e-05, + "loss": 0.7153, + "step": 8089 + }, + { + "epoch": 3.5955555555555554, + "grad_norm": 3.637249708175659, + "learning_rate": 5.622775800711744e-05, + "loss": 1.3984, + "step": 8090 + }, + { + "epoch": 3.596, + "grad_norm": 4.056656360626221, + "learning_rate": 5.620996441281139e-05, + "loss": 1.3663, + "step": 8091 + }, + { + "epoch": 3.5964444444444443, + "grad_norm": 3.1671206951141357, + "learning_rate": 5.6192170818505344e-05, + "loss": 1.1746, + "step": 8092 + }, + { + "epoch": 3.596888888888889, + "grad_norm": 4.801858901977539, + "learning_rate": 5.617437722419929e-05, + "loss": 1.7342, + "step": 8093 + }, + { + "epoch": 3.5973333333333333, + "grad_norm": 3.729170322418213, + "learning_rate": 5.615658362989324e-05, + "loss": 0.959, + "step": 8094 + }, + { + "epoch": 3.597777777777778, + "grad_norm": 3.9978530406951904, + "learning_rate": 5.61387900355872e-05, + "loss": 1.2262, + "step": 8095 + }, + { + "epoch": 3.598222222222222, + "grad_norm": 3.4558112621307373, + "learning_rate": 5.6120996441281136e-05, + "loss": 1.0928, + "step": 8096 + }, + { + "epoch": 3.5986666666666665, + "grad_norm": 3.660371780395508, + "learning_rate": 5.610320284697509e-05, + "loss": 1.1768, + "step": 8097 + }, + { + "epoch": 3.599111111111111, + "grad_norm": 5.200618743896484, + "learning_rate": 5.608540925266904e-05, + "loss": 1.0061, + "step": 8098 + }, + { + "epoch": 3.5995555555555554, + "grad_norm": 4.098397731781006, + "learning_rate": 5.606761565836299e-05, + "loss": 1.1386, + "step": 8099 + }, + { + "epoch": 3.6, + "grad_norm": 2.963202714920044, + "learning_rate": 5.604982206405695e-05, + "loss": 0.4808, + "step": 8100 + }, + { + "epoch": 3.6004444444444443, + "grad_norm": 1.6309106349945068, + "learning_rate": 5.60320284697509e-05, + "loss": 0.8084, + "step": 8101 + }, + { + "epoch": 3.600888888888889, + "grad_norm": 3.205033779144287, + "learning_rate": 5.601423487544484e-05, + "loss": 1.2626, + "step": 8102 + }, + { + "epoch": 3.6013333333333333, + "grad_norm": 3.2114498615264893, + "learning_rate": 5.599644128113879e-05, + "loss": 1.5981, + "step": 8103 + }, + { + "epoch": 3.6017777777777775, + "grad_norm": 3.0003700256347656, + "learning_rate": 
5.597864768683274e-05, + "loss": 1.3144, + "step": 8104 + }, + { + "epoch": 3.602222222222222, + "grad_norm": 3.0486159324645996, + "learning_rate": 5.59608540925267e-05, + "loss": 1.3489, + "step": 8105 + }, + { + "epoch": 3.602666666666667, + "grad_norm": 2.8934648036956787, + "learning_rate": 5.5943060498220646e-05, + "loss": 1.316, + "step": 8106 + }, + { + "epoch": 3.603111111111111, + "grad_norm": 3.1762866973876953, + "learning_rate": 5.5925266903914596e-05, + "loss": 1.259, + "step": 8107 + }, + { + "epoch": 3.6035555555555554, + "grad_norm": 2.9977829456329346, + "learning_rate": 5.5907473309608546e-05, + "loss": 1.0371, + "step": 8108 + }, + { + "epoch": 3.604, + "grad_norm": 3.3716068267822266, + "learning_rate": 5.588967971530249e-05, + "loss": 1.4329, + "step": 8109 + }, + { + "epoch": 3.6044444444444443, + "grad_norm": 3.4693422317504883, + "learning_rate": 5.587188612099644e-05, + "loss": 1.3105, + "step": 8110 + }, + { + "epoch": 3.604888888888889, + "grad_norm": 3.0525338649749756, + "learning_rate": 5.5854092526690395e-05, + "loss": 1.3395, + "step": 8111 + }, + { + "epoch": 3.6053333333333333, + "grad_norm": 3.2203667163848877, + "learning_rate": 5.5836298932384345e-05, + "loss": 1.4343, + "step": 8112 + }, + { + "epoch": 3.605777777777778, + "grad_norm": 3.3371267318725586, + "learning_rate": 5.5818505338078294e-05, + "loss": 1.3877, + "step": 8113 + }, + { + "epoch": 3.606222222222222, + "grad_norm": 4.093338489532471, + "learning_rate": 5.580071174377225e-05, + "loss": 1.558, + "step": 8114 + }, + { + "epoch": 3.6066666666666665, + "grad_norm": 3.266066312789917, + "learning_rate": 5.578291814946619e-05, + "loss": 0.9591, + "step": 8115 + }, + { + "epoch": 3.607111111111111, + "grad_norm": 3.600327730178833, + "learning_rate": 5.5765124555160144e-05, + "loss": 1.4223, + "step": 8116 + }, + { + "epoch": 3.6075555555555554, + "grad_norm": 3.469787359237671, + "learning_rate": 5.574733096085409e-05, + "loss": 1.1383, + "step": 8117 + }, + { + "epoch": 3.608, + "grad_norm": 3.263157844543457, + "learning_rate": 5.572953736654804e-05, + "loss": 1.5695, + "step": 8118 + }, + { + "epoch": 3.6084444444444443, + "grad_norm": 2.616588830947876, + "learning_rate": 5.5711743772242e-05, + "loss": 0.9983, + "step": 8119 + }, + { + "epoch": 3.608888888888889, + "grad_norm": 3.373994827270508, + "learning_rate": 5.569395017793595e-05, + "loss": 1.3782, + "step": 8120 + }, + { + "epoch": 3.6093333333333333, + "grad_norm": 3.624722957611084, + "learning_rate": 5.567615658362989e-05, + "loss": 1.3283, + "step": 8121 + }, + { + "epoch": 3.6097777777777775, + "grad_norm": 3.547624349594116, + "learning_rate": 5.565836298932384e-05, + "loss": 0.9186, + "step": 8122 + }, + { + "epoch": 3.610222222222222, + "grad_norm": 3.4890434741973877, + "learning_rate": 5.564056939501779e-05, + "loss": 1.514, + "step": 8123 + }, + { + "epoch": 3.610666666666667, + "grad_norm": 3.2042863368988037, + "learning_rate": 5.562277580071175e-05, + "loss": 1.2029, + "step": 8124 + }, + { + "epoch": 3.611111111111111, + "grad_norm": 3.498734712600708, + "learning_rate": 5.56049822064057e-05, + "loss": 1.2113, + "step": 8125 + }, + { + "epoch": 3.6115555555555554, + "grad_norm": 3.9023139476776123, + "learning_rate": 5.558718861209965e-05, + "loss": 1.1575, + "step": 8126 + }, + { + "epoch": 3.612, + "grad_norm": 3.4858694076538086, + "learning_rate": 5.5569395017793604e-05, + "loss": 1.3176, + "step": 8127 + }, + { + "epoch": 3.6124444444444443, + "grad_norm": 3.3926501274108887, + "learning_rate": 
5.555160142348754e-05, + "loss": 1.52, + "step": 8128 + }, + { + "epoch": 3.612888888888889, + "grad_norm": 3.7569804191589355, + "learning_rate": 5.55338078291815e-05, + "loss": 1.2774, + "step": 8129 + }, + { + "epoch": 3.6133333333333333, + "grad_norm": 3.865650177001953, + "learning_rate": 5.5516014234875446e-05, + "loss": 1.4652, + "step": 8130 + }, + { + "epoch": 3.613777777777778, + "grad_norm": 3.499619483947754, + "learning_rate": 5.5498220640569396e-05, + "loss": 1.3073, + "step": 8131 + }, + { + "epoch": 3.6142222222222222, + "grad_norm": 3.366881847381592, + "learning_rate": 5.548042704626335e-05, + "loss": 1.1339, + "step": 8132 + }, + { + "epoch": 3.6146666666666665, + "grad_norm": 3.8024709224700928, + "learning_rate": 5.54626334519573e-05, + "loss": 1.4588, + "step": 8133 + }, + { + "epoch": 3.615111111111111, + "grad_norm": 4.1196160316467285, + "learning_rate": 5.5444839857651245e-05, + "loss": 1.2886, + "step": 8134 + }, + { + "epoch": 3.6155555555555554, + "grad_norm": 3.782890796661377, + "learning_rate": 5.5427046263345195e-05, + "loss": 1.1498, + "step": 8135 + }, + { + "epoch": 3.616, + "grad_norm": 3.4527840614318848, + "learning_rate": 5.5409252669039145e-05, + "loss": 1.4733, + "step": 8136 + }, + { + "epoch": 3.6164444444444444, + "grad_norm": 3.2366058826446533, + "learning_rate": 5.53914590747331e-05, + "loss": 1.0925, + "step": 8137 + }, + { + "epoch": 3.616888888888889, + "grad_norm": 2.8542094230651855, + "learning_rate": 5.537366548042705e-05, + "loss": 0.7504, + "step": 8138 + }, + { + "epoch": 3.6173333333333333, + "grad_norm": 3.7491934299468994, + "learning_rate": 5.5355871886121e-05, + "loss": 1.4432, + "step": 8139 + }, + { + "epoch": 3.6177777777777775, + "grad_norm": 3.4385619163513184, + "learning_rate": 5.533807829181496e-05, + "loss": 0.8893, + "step": 8140 + }, + { + "epoch": 3.6182222222222222, + "grad_norm": 4.420047283172607, + "learning_rate": 5.5320284697508893e-05, + "loss": 1.8875, + "step": 8141 + }, + { + "epoch": 3.618666666666667, + "grad_norm": 4.307175636291504, + "learning_rate": 5.530249110320285e-05, + "loss": 1.1576, + "step": 8142 + }, + { + "epoch": 3.619111111111111, + "grad_norm": 3.8173797130584717, + "learning_rate": 5.52846975088968e-05, + "loss": 1.5327, + "step": 8143 + }, + { + "epoch": 3.6195555555555554, + "grad_norm": 4.208653926849365, + "learning_rate": 5.526690391459075e-05, + "loss": 0.8899, + "step": 8144 + }, + { + "epoch": 3.62, + "grad_norm": 4.356147766113281, + "learning_rate": 5.5249110320284706e-05, + "loss": 1.4298, + "step": 8145 + }, + { + "epoch": 3.6204444444444444, + "grad_norm": 3.923570394515991, + "learning_rate": 5.5231316725978656e-05, + "loss": 1.0489, + "step": 8146 + }, + { + "epoch": 3.620888888888889, + "grad_norm": 5.018566608428955, + "learning_rate": 5.52135231316726e-05, + "loss": 1.1978, + "step": 8147 + }, + { + "epoch": 3.6213333333333333, + "grad_norm": 3.8071272373199463, + "learning_rate": 5.519572953736655e-05, + "loss": 0.5866, + "step": 8148 + }, + { + "epoch": 3.621777777777778, + "grad_norm": 5.471420764923096, + "learning_rate": 5.51779359430605e-05, + "loss": 1.0767, + "step": 8149 + }, + { + "epoch": 3.6222222222222222, + "grad_norm": 4.0738606452941895, + "learning_rate": 5.5160142348754454e-05, + "loss": 0.5901, + "step": 8150 + }, + { + "epoch": 3.6226666666666665, + "grad_norm": 2.3456101417541504, + "learning_rate": 5.5142348754448404e-05, + "loss": 1.7003, + "step": 8151 + }, + { + "epoch": 3.623111111111111, + "grad_norm": 2.777620315551758, + "learning_rate": 
5.5124555160142354e-05, + "loss": 1.9537, + "step": 8152 + }, + { + "epoch": 3.6235555555555554, + "grad_norm": 2.4514338970184326, + "learning_rate": 5.5106761565836304e-05, + "loss": 1.5011, + "step": 8153 + }, + { + "epoch": 3.624, + "grad_norm": 2.7258999347686768, + "learning_rate": 5.5088967971530247e-05, + "loss": 1.7209, + "step": 8154 + }, + { + "epoch": 3.6244444444444444, + "grad_norm": 2.463881492614746, + "learning_rate": 5.5071174377224196e-05, + "loss": 1.3477, + "step": 8155 + }, + { + "epoch": 3.624888888888889, + "grad_norm": 2.6786627769470215, + "learning_rate": 5.505338078291815e-05, + "loss": 1.4005, + "step": 8156 + }, + { + "epoch": 3.6253333333333333, + "grad_norm": 3.139360189437866, + "learning_rate": 5.50355871886121e-05, + "loss": 1.9946, + "step": 8157 + }, + { + "epoch": 3.6257777777777775, + "grad_norm": 2.5752453804016113, + "learning_rate": 5.501779359430605e-05, + "loss": 1.3608, + "step": 8158 + }, + { + "epoch": 3.6262222222222222, + "grad_norm": 3.0296263694763184, + "learning_rate": 5.500000000000001e-05, + "loss": 1.5828, + "step": 8159 + }, + { + "epoch": 3.626666666666667, + "grad_norm": 3.2608470916748047, + "learning_rate": 5.4982206405693945e-05, + "loss": 1.9236, + "step": 8160 + }, + { + "epoch": 3.627111111111111, + "grad_norm": 2.6645302772521973, + "learning_rate": 5.49644128113879e-05, + "loss": 1.4359, + "step": 8161 + }, + { + "epoch": 3.6275555555555554, + "grad_norm": 3.320736885070801, + "learning_rate": 5.494661921708185e-05, + "loss": 1.5185, + "step": 8162 + }, + { + "epoch": 3.628, + "grad_norm": 2.807404041290283, + "learning_rate": 5.49288256227758e-05, + "loss": 1.4373, + "step": 8163 + }, + { + "epoch": 3.6284444444444444, + "grad_norm": 2.9995763301849365, + "learning_rate": 5.491103202846976e-05, + "loss": 1.4947, + "step": 8164 + }, + { + "epoch": 3.628888888888889, + "grad_norm": 2.971586227416992, + "learning_rate": 5.489323843416371e-05, + "loss": 1.1496, + "step": 8165 + }, + { + "epoch": 3.6293333333333333, + "grad_norm": 3.100977659225464, + "learning_rate": 5.487544483985766e-05, + "loss": 1.1148, + "step": 8166 + }, + { + "epoch": 3.629777777777778, + "grad_norm": 3.1417391300201416, + "learning_rate": 5.48576512455516e-05, + "loss": 1.6336, + "step": 8167 + }, + { + "epoch": 3.6302222222222222, + "grad_norm": 3.098752975463867, + "learning_rate": 5.483985765124555e-05, + "loss": 1.2825, + "step": 8168 + }, + { + "epoch": 3.6306666666666665, + "grad_norm": 3.1631276607513428, + "learning_rate": 5.4822064056939506e-05, + "loss": 1.8255, + "step": 8169 + }, + { + "epoch": 3.631111111111111, + "grad_norm": 3.218193769454956, + "learning_rate": 5.4804270462633456e-05, + "loss": 1.6269, + "step": 8170 + }, + { + "epoch": 3.6315555555555554, + "grad_norm": 3.31066632270813, + "learning_rate": 5.4786476868327405e-05, + "loss": 1.3839, + "step": 8171 + }, + { + "epoch": 3.632, + "grad_norm": 4.438895225524902, + "learning_rate": 5.476868327402136e-05, + "loss": 1.9307, + "step": 8172 + }, + { + "epoch": 3.6324444444444444, + "grad_norm": 3.1939444541931152, + "learning_rate": 5.47508896797153e-05, + "loss": 1.3621, + "step": 8173 + }, + { + "epoch": 3.632888888888889, + "grad_norm": 2.8579087257385254, + "learning_rate": 5.4733096085409255e-05, + "loss": 0.8286, + "step": 8174 + }, + { + "epoch": 3.6333333333333333, + "grad_norm": 3.163404941558838, + "learning_rate": 5.4715302491103204e-05, + "loss": 1.3634, + "step": 8175 + }, + { + "epoch": 3.6337777777777776, + "grad_norm": 3.0559070110321045, + "learning_rate": 
5.4697508896797154e-05, + "loss": 1.2169, + "step": 8176 + }, + { + "epoch": 3.6342222222222222, + "grad_norm": 3.1324009895324707, + "learning_rate": 5.467971530249111e-05, + "loss": 1.1629, + "step": 8177 + }, + { + "epoch": 3.634666666666667, + "grad_norm": 2.9012513160705566, + "learning_rate": 5.466192170818506e-05, + "loss": 0.8936, + "step": 8178 + }, + { + "epoch": 3.635111111111111, + "grad_norm": 3.157163381576538, + "learning_rate": 5.4644128113879e-05, + "loss": 1.2233, + "step": 8179 + }, + { + "epoch": 3.6355555555555554, + "grad_norm": 3.3618364334106445, + "learning_rate": 5.462633451957295e-05, + "loss": 1.3142, + "step": 8180 + }, + { + "epoch": 3.636, + "grad_norm": 3.3610551357269287, + "learning_rate": 5.46085409252669e-05, + "loss": 0.9224, + "step": 8181 + }, + { + "epoch": 3.6364444444444444, + "grad_norm": 3.8400661945343018, + "learning_rate": 5.459074733096086e-05, + "loss": 1.2544, + "step": 8182 + }, + { + "epoch": 3.6368888888888886, + "grad_norm": 3.3344969749450684, + "learning_rate": 5.457295373665481e-05, + "loss": 1.2641, + "step": 8183 + }, + { + "epoch": 3.6373333333333333, + "grad_norm": 3.121795415878296, + "learning_rate": 5.455516014234876e-05, + "loss": 1.1398, + "step": 8184 + }, + { + "epoch": 3.637777777777778, + "grad_norm": 4.248197555541992, + "learning_rate": 5.4537366548042715e-05, + "loss": 1.0294, + "step": 8185 + }, + { + "epoch": 3.6382222222222222, + "grad_norm": 3.8400375843048096, + "learning_rate": 5.451957295373665e-05, + "loss": 1.2639, + "step": 8186 + }, + { + "epoch": 3.6386666666666665, + "grad_norm": 3.7360832691192627, + "learning_rate": 5.450177935943061e-05, + "loss": 0.9932, + "step": 8187 + }, + { + "epoch": 3.639111111111111, + "grad_norm": 3.7453458309173584, + "learning_rate": 5.448398576512456e-05, + "loss": 1.2704, + "step": 8188 + }, + { + "epoch": 3.6395555555555554, + "grad_norm": 3.8645150661468506, + "learning_rate": 5.446619217081851e-05, + "loss": 0.682, + "step": 8189 + }, + { + "epoch": 3.64, + "grad_norm": 3.455167055130005, + "learning_rate": 5.4448398576512464e-05, + "loss": 1.3446, + "step": 8190 + }, + { + "epoch": 3.6404444444444444, + "grad_norm": 3.7151405811309814, + "learning_rate": 5.443060498220641e-05, + "loss": 1.0171, + "step": 8191 + }, + { + "epoch": 3.640888888888889, + "grad_norm": 4.202325820922852, + "learning_rate": 5.4412811387900356e-05, + "loss": 1.26, + "step": 8192 + }, + { + "epoch": 3.6413333333333333, + "grad_norm": 4.061198711395264, + "learning_rate": 5.4395017793594306e-05, + "loss": 0.9919, + "step": 8193 + }, + { + "epoch": 3.6417777777777776, + "grad_norm": 4.6272501945495605, + "learning_rate": 5.4377224199288256e-05, + "loss": 1.2987, + "step": 8194 + }, + { + "epoch": 3.6422222222222222, + "grad_norm": 4.520051956176758, + "learning_rate": 5.4359430604982205e-05, + "loss": 1.0514, + "step": 8195 + }, + { + "epoch": 3.642666666666667, + "grad_norm": 3.6569225788116455, + "learning_rate": 5.434163701067616e-05, + "loss": 1.2133, + "step": 8196 + }, + { + "epoch": 3.643111111111111, + "grad_norm": 4.312341213226318, + "learning_rate": 5.432384341637011e-05, + "loss": 1.1944, + "step": 8197 + }, + { + "epoch": 3.6435555555555554, + "grad_norm": 4.46543025970459, + "learning_rate": 5.430604982206406e-05, + "loss": 1.3214, + "step": 8198 + }, + { + "epoch": 3.644, + "grad_norm": 5.549198627471924, + "learning_rate": 5.4288256227758004e-05, + "loss": 0.8821, + "step": 8199 + }, + { + "epoch": 3.6444444444444444, + "grad_norm": 4.581550121307373, + "learning_rate": 
5.4270462633451954e-05, + "loss": 1.044, + "step": 8200 + }, + { + "epoch": 3.6448888888888886, + "grad_norm": 2.513296127319336, + "learning_rate": 5.425266903914591e-05, + "loss": 0.9842, + "step": 8201 + }, + { + "epoch": 3.6453333333333333, + "grad_norm": 2.9860239028930664, + "learning_rate": 5.423487544483986e-05, + "loss": 2.2302, + "step": 8202 + }, + { + "epoch": 3.645777777777778, + "grad_norm": 2.648104429244995, + "learning_rate": 5.421708185053381e-05, + "loss": 1.7132, + "step": 8203 + }, + { + "epoch": 3.6462222222222223, + "grad_norm": 2.892163038253784, + "learning_rate": 5.4199288256227767e-05, + "loss": 1.3049, + "step": 8204 + }, + { + "epoch": 3.6466666666666665, + "grad_norm": 3.270510196685791, + "learning_rate": 5.41814946619217e-05, + "loss": 1.8626, + "step": 8205 + }, + { + "epoch": 3.647111111111111, + "grad_norm": 3.1879770755767822, + "learning_rate": 5.416370106761566e-05, + "loss": 1.513, + "step": 8206 + }, + { + "epoch": 3.6475555555555554, + "grad_norm": 2.4823553562164307, + "learning_rate": 5.414590747330961e-05, + "loss": 0.8951, + "step": 8207 + }, + { + "epoch": 3.648, + "grad_norm": 2.580904722213745, + "learning_rate": 5.412811387900356e-05, + "loss": 1.0414, + "step": 8208 + }, + { + "epoch": 3.6484444444444444, + "grad_norm": 3.6623318195343018, + "learning_rate": 5.4110320284697515e-05, + "loss": 1.6687, + "step": 8209 + }, + { + "epoch": 3.648888888888889, + "grad_norm": 3.58868408203125, + "learning_rate": 5.4092526690391465e-05, + "loss": 1.4268, + "step": 8210 + }, + { + "epoch": 3.6493333333333333, + "grad_norm": 3.2081754207611084, + "learning_rate": 5.4074733096085415e-05, + "loss": 1.6204, + "step": 8211 + }, + { + "epoch": 3.6497777777777776, + "grad_norm": 3.3098785877227783, + "learning_rate": 5.405693950177936e-05, + "loss": 1.1347, + "step": 8212 + }, + { + "epoch": 3.6502222222222223, + "grad_norm": 3.5921061038970947, + "learning_rate": 5.403914590747331e-05, + "loss": 1.5662, + "step": 8213 + }, + { + "epoch": 3.6506666666666665, + "grad_norm": 3.2691173553466797, + "learning_rate": 5.4021352313167264e-05, + "loss": 1.2848, + "step": 8214 + }, + { + "epoch": 3.651111111111111, + "grad_norm": 3.106048583984375, + "learning_rate": 5.4003558718861213e-05, + "loss": 0.8942, + "step": 8215 + }, + { + "epoch": 3.6515555555555554, + "grad_norm": 6.022765636444092, + "learning_rate": 5.398576512455516e-05, + "loss": 1.1837, + "step": 8216 + }, + { + "epoch": 3.652, + "grad_norm": 3.3533272743225098, + "learning_rate": 5.396797153024912e-05, + "loss": 1.3667, + "step": 8217 + }, + { + "epoch": 3.6524444444444444, + "grad_norm": 3.1057679653167725, + "learning_rate": 5.3950177935943056e-05, + "loss": 1.3119, + "step": 8218 + }, + { + "epoch": 3.6528888888888886, + "grad_norm": 2.8598716259002686, + "learning_rate": 5.393238434163701e-05, + "loss": 1.0251, + "step": 8219 + }, + { + "epoch": 3.6533333333333333, + "grad_norm": 3.3327109813690186, + "learning_rate": 5.391459074733096e-05, + "loss": 1.1523, + "step": 8220 + }, + { + "epoch": 3.653777777777778, + "grad_norm": 2.8015899658203125, + "learning_rate": 5.389679715302491e-05, + "loss": 1.087, + "step": 8221 + }, + { + "epoch": 3.6542222222222223, + "grad_norm": 3.4534077644348145, + "learning_rate": 5.387900355871887e-05, + "loss": 1.1916, + "step": 8222 + }, + { + "epoch": 3.6546666666666665, + "grad_norm": 2.8814008235931396, + "learning_rate": 5.386120996441282e-05, + "loss": 1.1617, + "step": 8223 + }, + { + "epoch": 3.655111111111111, + "grad_norm": 3.4058141708374023, + 
"learning_rate": 5.384341637010677e-05, + "loss": 1.2716, + "step": 8224 + }, + { + "epoch": 3.6555555555555554, + "grad_norm": 3.901968479156494, + "learning_rate": 5.382562277580071e-05, + "loss": 1.5236, + "step": 8225 + }, + { + "epoch": 3.656, + "grad_norm": 2.169243097305298, + "learning_rate": 5.380782918149466e-05, + "loss": 0.5998, + "step": 8226 + }, + { + "epoch": 3.6564444444444444, + "grad_norm": 3.659686803817749, + "learning_rate": 5.379003558718862e-05, + "loss": 1.4257, + "step": 8227 + }, + { + "epoch": 3.656888888888889, + "grad_norm": 3.8471503257751465, + "learning_rate": 5.3772241992882567e-05, + "loss": 1.3645, + "step": 8228 + }, + { + "epoch": 3.6573333333333333, + "grad_norm": 2.410559892654419, + "learning_rate": 5.3754448398576516e-05, + "loss": 0.6455, + "step": 8229 + }, + { + "epoch": 3.6577777777777776, + "grad_norm": 3.5442912578582764, + "learning_rate": 5.373665480427047e-05, + "loss": 1.3689, + "step": 8230 + }, + { + "epoch": 3.6582222222222223, + "grad_norm": 3.4644775390625, + "learning_rate": 5.371886120996441e-05, + "loss": 1.197, + "step": 8231 + }, + { + "epoch": 3.6586666666666665, + "grad_norm": 3.7939751148223877, + "learning_rate": 5.3701067615658365e-05, + "loss": 1.1387, + "step": 8232 + }, + { + "epoch": 3.659111111111111, + "grad_norm": 3.8991305828094482, + "learning_rate": 5.3683274021352315e-05, + "loss": 1.1945, + "step": 8233 + }, + { + "epoch": 3.6595555555555555, + "grad_norm": 2.8764808177948, + "learning_rate": 5.3665480427046265e-05, + "loss": 0.8908, + "step": 8234 + }, + { + "epoch": 3.66, + "grad_norm": 3.422370433807373, + "learning_rate": 5.364768683274022e-05, + "loss": 1.0134, + "step": 8235 + }, + { + "epoch": 3.6604444444444444, + "grad_norm": 4.0114874839782715, + "learning_rate": 5.362989323843417e-05, + "loss": 1.0199, + "step": 8236 + }, + { + "epoch": 3.6608888888888886, + "grad_norm": 4.0204362869262695, + "learning_rate": 5.3612099644128114e-05, + "loss": 1.447, + "step": 8237 + }, + { + "epoch": 3.6613333333333333, + "grad_norm": 3.3613667488098145, + "learning_rate": 5.3594306049822064e-05, + "loss": 0.8356, + "step": 8238 + }, + { + "epoch": 3.661777777777778, + "grad_norm": 5.9143571853637695, + "learning_rate": 5.3576512455516014e-05, + "loss": 1.2582, + "step": 8239 + }, + { + "epoch": 3.6622222222222223, + "grad_norm": 3.864518404006958, + "learning_rate": 5.355871886120996e-05, + "loss": 1.2546, + "step": 8240 + }, + { + "epoch": 3.6626666666666665, + "grad_norm": 3.446545362472534, + "learning_rate": 5.354092526690392e-05, + "loss": 0.9876, + "step": 8241 + }, + { + "epoch": 3.663111111111111, + "grad_norm": 4.813685417175293, + "learning_rate": 5.352313167259787e-05, + "loss": 1.1929, + "step": 8242 + }, + { + "epoch": 3.6635555555555555, + "grad_norm": 4.656696319580078, + "learning_rate": 5.350533807829182e-05, + "loss": 1.0747, + "step": 8243 + }, + { + "epoch": 3.664, + "grad_norm": 4.323572158813477, + "learning_rate": 5.348754448398576e-05, + "loss": 1.2275, + "step": 8244 + }, + { + "epoch": 3.6644444444444444, + "grad_norm": 3.6625609397888184, + "learning_rate": 5.346975088967971e-05, + "loss": 1.0316, + "step": 8245 + }, + { + "epoch": 3.664888888888889, + "grad_norm": 4.840065002441406, + "learning_rate": 5.345195729537367e-05, + "loss": 1.3223, + "step": 8246 + }, + { + "epoch": 3.6653333333333333, + "grad_norm": 4.664702415466309, + "learning_rate": 5.343416370106762e-05, + "loss": 1.4522, + "step": 8247 + }, + { + "epoch": 3.6657777777777776, + "grad_norm": 4.857150077819824, + 
"learning_rate": 5.341637010676157e-05, + "loss": 1.0819, + "step": 8248 + }, + { + "epoch": 3.6662222222222223, + "grad_norm": 10.743447303771973, + "learning_rate": 5.3398576512455524e-05, + "loss": 1.3214, + "step": 8249 + }, + { + "epoch": 3.6666666666666665, + "grad_norm": 4.244997024536133, + "learning_rate": 5.338078291814946e-05, + "loss": 0.3447, + "step": 8250 + }, + { + "epoch": 3.667111111111111, + "grad_norm": 2.1878910064697266, + "learning_rate": 5.336298932384342e-05, + "loss": 1.7063, + "step": 8251 + }, + { + "epoch": 3.6675555555555555, + "grad_norm": 2.090615749359131, + "learning_rate": 5.334519572953737e-05, + "loss": 0.9182, + "step": 8252 + }, + { + "epoch": 3.668, + "grad_norm": 2.9112021923065186, + "learning_rate": 5.3327402135231316e-05, + "loss": 1.575, + "step": 8253 + }, + { + "epoch": 3.6684444444444444, + "grad_norm": 3.180088996887207, + "learning_rate": 5.330960854092527e-05, + "loss": 1.6393, + "step": 8254 + }, + { + "epoch": 3.6688888888888886, + "grad_norm": 3.059612512588501, + "learning_rate": 5.329181494661922e-05, + "loss": 1.631, + "step": 8255 + }, + { + "epoch": 3.6693333333333333, + "grad_norm": 3.003873825073242, + "learning_rate": 5.327402135231317e-05, + "loss": 1.4932, + "step": 8256 + }, + { + "epoch": 3.669777777777778, + "grad_norm": 3.1199257373809814, + "learning_rate": 5.3256227758007115e-05, + "loss": 1.5315, + "step": 8257 + }, + { + "epoch": 3.6702222222222223, + "grad_norm": 2.0866105556488037, + "learning_rate": 5.3238434163701065e-05, + "loss": 0.8337, + "step": 8258 + }, + { + "epoch": 3.6706666666666665, + "grad_norm": 3.190763473510742, + "learning_rate": 5.322064056939502e-05, + "loss": 1.4458, + "step": 8259 + }, + { + "epoch": 3.671111111111111, + "grad_norm": 3.141622304916382, + "learning_rate": 5.320284697508897e-05, + "loss": 1.4373, + "step": 8260 + }, + { + "epoch": 3.6715555555555555, + "grad_norm": 3.208282232284546, + "learning_rate": 5.318505338078292e-05, + "loss": 1.2228, + "step": 8261 + }, + { + "epoch": 3.672, + "grad_norm": 3.1854054927825928, + "learning_rate": 5.316725978647688e-05, + "loss": 1.1494, + "step": 8262 + }, + { + "epoch": 3.6724444444444444, + "grad_norm": 3.755186080932617, + "learning_rate": 5.3149466192170814e-05, + "loss": 1.5219, + "step": 8263 + }, + { + "epoch": 3.672888888888889, + "grad_norm": 3.6373298168182373, + "learning_rate": 5.313167259786477e-05, + "loss": 1.2791, + "step": 8264 + }, + { + "epoch": 3.6733333333333333, + "grad_norm": 3.869194984436035, + "learning_rate": 5.311387900355872e-05, + "loss": 1.854, + "step": 8265 + }, + { + "epoch": 3.6737777777777776, + "grad_norm": 3.1836750507354736, + "learning_rate": 5.309608540925267e-05, + "loss": 1.3653, + "step": 8266 + }, + { + "epoch": 3.6742222222222223, + "grad_norm": 3.2657806873321533, + "learning_rate": 5.3078291814946626e-05, + "loss": 1.5467, + "step": 8267 + }, + { + "epoch": 3.6746666666666665, + "grad_norm": 1.893934726715088, + "learning_rate": 5.3060498220640576e-05, + "loss": 0.7488, + "step": 8268 + }, + { + "epoch": 3.675111111111111, + "grad_norm": 3.510976552963257, + "learning_rate": 5.3042704626334526e-05, + "loss": 1.148, + "step": 8269 + }, + { + "epoch": 3.6755555555555555, + "grad_norm": 2.769212007522583, + "learning_rate": 5.302491103202847e-05, + "loss": 0.9423, + "step": 8270 + }, + { + "epoch": 3.676, + "grad_norm": 3.1665525436401367, + "learning_rate": 5.300711743772242e-05, + "loss": 1.9088, + "step": 8271 + }, + { + "epoch": 3.6764444444444444, + "grad_norm": 3.130796432495117, + 
"learning_rate": 5.2989323843416375e-05, + "loss": 1.2455, + "step": 8272 + }, + { + "epoch": 3.6768888888888887, + "grad_norm": 3.2460105419158936, + "learning_rate": 5.2971530249110324e-05, + "loss": 1.2274, + "step": 8273 + }, + { + "epoch": 3.6773333333333333, + "grad_norm": 3.6489906311035156, + "learning_rate": 5.2953736654804274e-05, + "loss": 1.3301, + "step": 8274 + }, + { + "epoch": 3.677777777777778, + "grad_norm": 3.5762670040130615, + "learning_rate": 5.293594306049823e-05, + "loss": 1.9313, + "step": 8275 + }, + { + "epoch": 3.6782222222222223, + "grad_norm": 3.458677053451538, + "learning_rate": 5.291814946619217e-05, + "loss": 1.0342, + "step": 8276 + }, + { + "epoch": 3.6786666666666665, + "grad_norm": 3.369166612625122, + "learning_rate": 5.290035587188612e-05, + "loss": 1.7108, + "step": 8277 + }, + { + "epoch": 3.679111111111111, + "grad_norm": 2.5164918899536133, + "learning_rate": 5.288256227758007e-05, + "loss": 0.6384, + "step": 8278 + }, + { + "epoch": 3.6795555555555555, + "grad_norm": 4.323822975158691, + "learning_rate": 5.286476868327402e-05, + "loss": 1.6491, + "step": 8279 + }, + { + "epoch": 3.68, + "grad_norm": 3.677583932876587, + "learning_rate": 5.284697508896798e-05, + "loss": 0.9492, + "step": 8280 + }, + { + "epoch": 3.6804444444444444, + "grad_norm": 4.226683616638184, + "learning_rate": 5.282918149466193e-05, + "loss": 1.3614, + "step": 8281 + }, + { + "epoch": 3.680888888888889, + "grad_norm": 3.287234306335449, + "learning_rate": 5.281138790035588e-05, + "loss": 1.3625, + "step": 8282 + }, + { + "epoch": 3.6813333333333333, + "grad_norm": 3.5409836769104004, + "learning_rate": 5.279359430604982e-05, + "loss": 1.4074, + "step": 8283 + }, + { + "epoch": 3.6817777777777776, + "grad_norm": 3.3012094497680664, + "learning_rate": 5.277580071174377e-05, + "loss": 1.1894, + "step": 8284 + }, + { + "epoch": 3.6822222222222223, + "grad_norm": 3.455919027328491, + "learning_rate": 5.275800711743772e-05, + "loss": 1.3859, + "step": 8285 + }, + { + "epoch": 3.6826666666666665, + "grad_norm": 3.6948232650756836, + "learning_rate": 5.274021352313168e-05, + "loss": 1.1195, + "step": 8286 + }, + { + "epoch": 3.6831111111111112, + "grad_norm": 3.614387273788452, + "learning_rate": 5.272241992882563e-05, + "loss": 1.3584, + "step": 8287 + }, + { + "epoch": 3.6835555555555555, + "grad_norm": 3.2227156162261963, + "learning_rate": 5.270462633451958e-05, + "loss": 0.3771, + "step": 8288 + }, + { + "epoch": 3.684, + "grad_norm": 2.9594337940216064, + "learning_rate": 5.268683274021352e-05, + "loss": 1.0337, + "step": 8289 + }, + { + "epoch": 3.6844444444444444, + "grad_norm": 3.800976276397705, + "learning_rate": 5.266903914590747e-05, + "loss": 0.7237, + "step": 8290 + }, + { + "epoch": 3.6848888888888887, + "grad_norm": 4.719809055328369, + "learning_rate": 5.2651245551601426e-05, + "loss": 1.2393, + "step": 8291 + }, + { + "epoch": 3.6853333333333333, + "grad_norm": 4.721984386444092, + "learning_rate": 5.2633451957295376e-05, + "loss": 1.2577, + "step": 8292 + }, + { + "epoch": 3.685777777777778, + "grad_norm": 3.1835551261901855, + "learning_rate": 5.2615658362989326e-05, + "loss": 0.9904, + "step": 8293 + }, + { + "epoch": 3.6862222222222223, + "grad_norm": 4.421992301940918, + "learning_rate": 5.259786476868328e-05, + "loss": 1.4825, + "step": 8294 + }, + { + "epoch": 3.6866666666666665, + "grad_norm": 4.140617370605469, + "learning_rate": 5.258007117437722e-05, + "loss": 1.1782, + "step": 8295 + }, + { + "epoch": 3.6871111111111112, + "grad_norm": 
3.6903188228607178, + "learning_rate": 5.2562277580071175e-05, + "loss": 1.5041, + "step": 8296 + }, + { + "epoch": 3.6875555555555555, + "grad_norm": 3.7069318294525146, + "learning_rate": 5.2544483985765124e-05, + "loss": 1.1797, + "step": 8297 + }, + { + "epoch": 3.6879999999999997, + "grad_norm": 4.413390636444092, + "learning_rate": 5.2526690391459074e-05, + "loss": 0.9219, + "step": 8298 + }, + { + "epoch": 3.6884444444444444, + "grad_norm": 5.5441508293151855, + "learning_rate": 5.250889679715303e-05, + "loss": 1.0509, + "step": 8299 + }, + { + "epoch": 3.688888888888889, + "grad_norm": 0.8163884282112122, + "learning_rate": 5.249110320284698e-05, + "loss": 0.0719, + "step": 8300 + }, + { + "epoch": 3.6893333333333334, + "grad_norm": 1.9359767436981201, + "learning_rate": 5.247330960854093e-05, + "loss": 0.8754, + "step": 8301 + }, + { + "epoch": 3.6897777777777776, + "grad_norm": 2.754122495651245, + "learning_rate": 5.245551601423487e-05, + "loss": 1.6614, + "step": 8302 + }, + { + "epoch": 3.6902222222222223, + "grad_norm": 2.6773335933685303, + "learning_rate": 5.243772241992882e-05, + "loss": 1.7607, + "step": 8303 + }, + { + "epoch": 3.6906666666666665, + "grad_norm": 2.679157257080078, + "learning_rate": 5.241992882562278e-05, + "loss": 1.3, + "step": 8304 + }, + { + "epoch": 3.6911111111111112, + "grad_norm": 2.7156100273132324, + "learning_rate": 5.240213523131673e-05, + "loss": 1.5878, + "step": 8305 + }, + { + "epoch": 3.6915555555555555, + "grad_norm": 3.0638999938964844, + "learning_rate": 5.238434163701068e-05, + "loss": 2.0461, + "step": 8306 + }, + { + "epoch": 3.692, + "grad_norm": 2.997682571411133, + "learning_rate": 5.2366548042704635e-05, + "loss": 1.6546, + "step": 8307 + }, + { + "epoch": 3.6924444444444444, + "grad_norm": 3.044121265411377, + "learning_rate": 5.234875444839857e-05, + "loss": 1.6228, + "step": 8308 + }, + { + "epoch": 3.6928888888888887, + "grad_norm": 2.8701446056365967, + "learning_rate": 5.233096085409253e-05, + "loss": 1.7426, + "step": 8309 + }, + { + "epoch": 3.6933333333333334, + "grad_norm": 3.174617052078247, + "learning_rate": 5.231316725978648e-05, + "loss": 1.5458, + "step": 8310 + }, + { + "epoch": 3.693777777777778, + "grad_norm": 3.201929807662964, + "learning_rate": 5.229537366548043e-05, + "loss": 1.6514, + "step": 8311 + }, + { + "epoch": 3.6942222222222223, + "grad_norm": 3.799694538116455, + "learning_rate": 5.2277580071174384e-05, + "loss": 1.6626, + "step": 8312 + }, + { + "epoch": 3.6946666666666665, + "grad_norm": 3.5478615760803223, + "learning_rate": 5.2259786476868334e-05, + "loss": 1.3328, + "step": 8313 + }, + { + "epoch": 3.6951111111111112, + "grad_norm": 3.048062562942505, + "learning_rate": 5.224199288256228e-05, + "loss": 1.4886, + "step": 8314 + }, + { + "epoch": 3.6955555555555555, + "grad_norm": 3.5230629444122314, + "learning_rate": 5.2224199288256226e-05, + "loss": 1.2701, + "step": 8315 + }, + { + "epoch": 3.6959999999999997, + "grad_norm": 3.156364917755127, + "learning_rate": 5.2206405693950176e-05, + "loss": 1.1824, + "step": 8316 + }, + { + "epoch": 3.6964444444444444, + "grad_norm": 3.9449214935302734, + "learning_rate": 5.218861209964413e-05, + "loss": 1.2558, + "step": 8317 + }, + { + "epoch": 3.696888888888889, + "grad_norm": 3.572063684463501, + "learning_rate": 5.217081850533808e-05, + "loss": 1.4913, + "step": 8318 + }, + { + "epoch": 3.6973333333333334, + "grad_norm": 3.8648767471313477, + "learning_rate": 5.215302491103203e-05, + "loss": 1.6746, + "step": 8319 + }, + { + "epoch": 
3.6977777777777776, + "grad_norm": 3.326974391937256, + "learning_rate": 5.213523131672599e-05, + "loss": 1.213, + "step": 8320 + }, + { + "epoch": 3.6982222222222223, + "grad_norm": 3.392534017562866, + "learning_rate": 5.2117437722419925e-05, + "loss": 1.272, + "step": 8321 + }, + { + "epoch": 3.6986666666666665, + "grad_norm": 3.3309099674224854, + "learning_rate": 5.209964412811388e-05, + "loss": 1.559, + "step": 8322 + }, + { + "epoch": 3.6991111111111112, + "grad_norm": 3.49074387550354, + "learning_rate": 5.208185053380783e-05, + "loss": 1.2227, + "step": 8323 + }, + { + "epoch": 3.6995555555555555, + "grad_norm": 3.63338041305542, + "learning_rate": 5.206405693950178e-05, + "loss": 1.2891, + "step": 8324 + }, + { + "epoch": 3.7, + "grad_norm": 3.5115768909454346, + "learning_rate": 5.204626334519574e-05, + "loss": 1.3273, + "step": 8325 + }, + { + "epoch": 3.7004444444444444, + "grad_norm": 3.8019955158233643, + "learning_rate": 5.202846975088969e-05, + "loss": 1.0889, + "step": 8326 + }, + { + "epoch": 3.7008888888888887, + "grad_norm": 3.4950296878814697, + "learning_rate": 5.2010676156583636e-05, + "loss": 1.379, + "step": 8327 + }, + { + "epoch": 3.7013333333333334, + "grad_norm": 3.5437874794006348, + "learning_rate": 5.199288256227758e-05, + "loss": 1.4179, + "step": 8328 + }, + { + "epoch": 3.7017777777777776, + "grad_norm": 3.734877109527588, + "learning_rate": 5.197508896797153e-05, + "loss": 1.0075, + "step": 8329 + }, + { + "epoch": 3.7022222222222223, + "grad_norm": 2.877346992492676, + "learning_rate": 5.195729537366548e-05, + "loss": 1.1345, + "step": 8330 + }, + { + "epoch": 3.7026666666666666, + "grad_norm": 3.68837571144104, + "learning_rate": 5.1939501779359435e-05, + "loss": 1.176, + "step": 8331 + }, + { + "epoch": 3.7031111111111112, + "grad_norm": 3.2419826984405518, + "learning_rate": 5.1921708185053385e-05, + "loss": 0.9544, + "step": 8332 + }, + { + "epoch": 3.7035555555555555, + "grad_norm": 3.9483325481414795, + "learning_rate": 5.1903914590747335e-05, + "loss": 1.0054, + "step": 8333 + }, + { + "epoch": 3.7039999999999997, + "grad_norm": 3.8043711185455322, + "learning_rate": 5.188612099644128e-05, + "loss": 1.1455, + "step": 8334 + }, + { + "epoch": 3.7044444444444444, + "grad_norm": 3.818793535232544, + "learning_rate": 5.186832740213523e-05, + "loss": 1.0459, + "step": 8335 + }, + { + "epoch": 3.704888888888889, + "grad_norm": 3.5451431274414062, + "learning_rate": 5.1850533807829184e-05, + "loss": 1.2172, + "step": 8336 + }, + { + "epoch": 3.7053333333333334, + "grad_norm": 3.9264607429504395, + "learning_rate": 5.1832740213523134e-05, + "loss": 1.5576, + "step": 8337 + }, + { + "epoch": 3.7057777777777776, + "grad_norm": 3.3462579250335693, + "learning_rate": 5.1814946619217083e-05, + "loss": 1.1177, + "step": 8338 + }, + { + "epoch": 3.7062222222222223, + "grad_norm": 3.5663697719573975, + "learning_rate": 5.179715302491104e-05, + "loss": 1.4554, + "step": 8339 + }, + { + "epoch": 3.7066666666666666, + "grad_norm": 3.879847288131714, + "learning_rate": 5.177935943060499e-05, + "loss": 1.3961, + "step": 8340 + }, + { + "epoch": 3.7071111111111112, + "grad_norm": 4.285848617553711, + "learning_rate": 5.176156583629893e-05, + "loss": 1.485, + "step": 8341 + }, + { + "epoch": 3.7075555555555555, + "grad_norm": 4.327765464782715, + "learning_rate": 5.174377224199288e-05, + "loss": 1.439, + "step": 8342 + }, + { + "epoch": 3.708, + "grad_norm": 3.3615176677703857, + "learning_rate": 5.172597864768683e-05, + "loss": 1.0735, + "step": 8343 + }, + { + 
"epoch": 3.7084444444444444, + "grad_norm": 5.214451789855957, + "learning_rate": 5.170818505338079e-05, + "loss": 1.2868, + "step": 8344 + }, + { + "epoch": 3.7088888888888887, + "grad_norm": 3.7910678386688232, + "learning_rate": 5.169039145907474e-05, + "loss": 1.2784, + "step": 8345 + }, + { + "epoch": 3.7093333333333334, + "grad_norm": 3.738447666168213, + "learning_rate": 5.167259786476869e-05, + "loss": 1.1879, + "step": 8346 + }, + { + "epoch": 3.7097777777777776, + "grad_norm": 4.8904643058776855, + "learning_rate": 5.165480427046263e-05, + "loss": 1.3139, + "step": 8347 + }, + { + "epoch": 3.7102222222222223, + "grad_norm": 5.133594036102295, + "learning_rate": 5.163701067615658e-05, + "loss": 1.5525, + "step": 8348 + }, + { + "epoch": 3.7106666666666666, + "grad_norm": 7.878839492797852, + "learning_rate": 5.161921708185054e-05, + "loss": 1.2252, + "step": 8349 + }, + { + "epoch": 3.7111111111111112, + "grad_norm": 3.2023541927337646, + "learning_rate": 5.160142348754449e-05, + "loss": 1.0264, + "step": 8350 + }, + { + "epoch": 3.7115555555555555, + "grad_norm": 2.548800230026245, + "learning_rate": 5.1583629893238437e-05, + "loss": 1.7163, + "step": 8351 + }, + { + "epoch": 3.7119999999999997, + "grad_norm": 2.4168503284454346, + "learning_rate": 5.156583629893239e-05, + "loss": 1.3757, + "step": 8352 + }, + { + "epoch": 3.7124444444444444, + "grad_norm": 2.8642630577087402, + "learning_rate": 5.154804270462633e-05, + "loss": 1.593, + "step": 8353 + }, + { + "epoch": 3.712888888888889, + "grad_norm": 3.071199893951416, + "learning_rate": 5.1530249110320286e-05, + "loss": 1.929, + "step": 8354 + }, + { + "epoch": 3.7133333333333334, + "grad_norm": 2.835141897201538, + "learning_rate": 5.1512455516014235e-05, + "loss": 1.5476, + "step": 8355 + }, + { + "epoch": 3.7137777777777776, + "grad_norm": 3.1856870651245117, + "learning_rate": 5.1494661921708185e-05, + "loss": 1.4111, + "step": 8356 + }, + { + "epoch": 3.7142222222222223, + "grad_norm": 3.279014825820923, + "learning_rate": 5.147686832740214e-05, + "loss": 1.6864, + "step": 8357 + }, + { + "epoch": 3.7146666666666666, + "grad_norm": 2.251955270767212, + "learning_rate": 5.145907473309609e-05, + "loss": 0.8535, + "step": 8358 + }, + { + "epoch": 3.7151111111111113, + "grad_norm": 3.4706077575683594, + "learning_rate": 5.144128113879004e-05, + "loss": 1.4023, + "step": 8359 + }, + { + "epoch": 3.7155555555555555, + "grad_norm": 3.1662933826446533, + "learning_rate": 5.1423487544483984e-05, + "loss": 1.1449, + "step": 8360 + }, + { + "epoch": 3.716, + "grad_norm": 3.3553576469421387, + "learning_rate": 5.1405693950177934e-05, + "loss": 1.1429, + "step": 8361 + }, + { + "epoch": 3.7164444444444444, + "grad_norm": 4.185884952545166, + "learning_rate": 5.138790035587189e-05, + "loss": 1.6231, + "step": 8362 + }, + { + "epoch": 3.7168888888888887, + "grad_norm": 3.4784839153289795, + "learning_rate": 5.137010676156584e-05, + "loss": 1.7953, + "step": 8363 + }, + { + "epoch": 3.7173333333333334, + "grad_norm": 3.0565781593322754, + "learning_rate": 5.135231316725979e-05, + "loss": 1.5965, + "step": 8364 + }, + { + "epoch": 3.7177777777777776, + "grad_norm": 3.630974054336548, + "learning_rate": 5.1334519572953746e-05, + "loss": 1.0787, + "step": 8365 + }, + { + "epoch": 3.7182222222222223, + "grad_norm": 3.7152397632598877, + "learning_rate": 5.131672597864768e-05, + "loss": 1.783, + "step": 8366 + }, + { + "epoch": 3.7186666666666666, + "grad_norm": 2.838580846786499, + "learning_rate": 5.129893238434164e-05, + "loss": 0.9311, + 
"step": 8367 + }, + { + "epoch": 3.7191111111111113, + "grad_norm": 3.2233829498291016, + "learning_rate": 5.128113879003559e-05, + "loss": 1.3966, + "step": 8368 + }, + { + "epoch": 3.7195555555555555, + "grad_norm": 3.2140471935272217, + "learning_rate": 5.126334519572954e-05, + "loss": 1.4011, + "step": 8369 + }, + { + "epoch": 3.7199999999999998, + "grad_norm": 3.5495688915252686, + "learning_rate": 5.1245551601423495e-05, + "loss": 1.5965, + "step": 8370 + }, + { + "epoch": 3.7204444444444444, + "grad_norm": 4.187902450561523, + "learning_rate": 5.1227758007117445e-05, + "loss": 1.4036, + "step": 8371 + }, + { + "epoch": 3.720888888888889, + "grad_norm": 3.8422319889068604, + "learning_rate": 5.1209964412811394e-05, + "loss": 1.5535, + "step": 8372 + }, + { + "epoch": 3.7213333333333334, + "grad_norm": 3.2021896839141846, + "learning_rate": 5.119217081850534e-05, + "loss": 1.4836, + "step": 8373 + }, + { + "epoch": 3.7217777777777776, + "grad_norm": 3.2228951454162598, + "learning_rate": 5.117437722419929e-05, + "loss": 1.2058, + "step": 8374 + }, + { + "epoch": 3.7222222222222223, + "grad_norm": 3.7099387645721436, + "learning_rate": 5.115658362989324e-05, + "loss": 1.2875, + "step": 8375 + }, + { + "epoch": 3.7226666666666666, + "grad_norm": 2.979400157928467, + "learning_rate": 5.113879003558719e-05, + "loss": 1.1885, + "step": 8376 + }, + { + "epoch": 3.7231111111111113, + "grad_norm": 3.7015509605407715, + "learning_rate": 5.112099644128114e-05, + "loss": 1.467, + "step": 8377 + }, + { + "epoch": 3.7235555555555555, + "grad_norm": 4.023457050323486, + "learning_rate": 5.110320284697509e-05, + "loss": 1.5333, + "step": 8378 + }, + { + "epoch": 3.724, + "grad_norm": 2.9902260303497314, + "learning_rate": 5.1085409252669036e-05, + "loss": 1.2314, + "step": 8379 + }, + { + "epoch": 3.7244444444444444, + "grad_norm": 4.326292991638184, + "learning_rate": 5.1067615658362985e-05, + "loss": 1.9829, + "step": 8380 + }, + { + "epoch": 3.7248888888888887, + "grad_norm": 3.4808270931243896, + "learning_rate": 5.104982206405694e-05, + "loss": 1.5185, + "step": 8381 + }, + { + "epoch": 3.7253333333333334, + "grad_norm": 3.8214287757873535, + "learning_rate": 5.103202846975089e-05, + "loss": 1.6433, + "step": 8382 + }, + { + "epoch": 3.7257777777777776, + "grad_norm": 3.6692192554473877, + "learning_rate": 5.101423487544484e-05, + "loss": 1.4233, + "step": 8383 + }, + { + "epoch": 3.7262222222222223, + "grad_norm": 2.361046552658081, + "learning_rate": 5.09964412811388e-05, + "loss": 0.7065, + "step": 8384 + }, + { + "epoch": 3.7266666666666666, + "grad_norm": 3.3511815071105957, + "learning_rate": 5.097864768683275e-05, + "loss": 1.0584, + "step": 8385 + }, + { + "epoch": 3.7271111111111113, + "grad_norm": 3.851069211959839, + "learning_rate": 5.096085409252669e-05, + "loss": 1.5869, + "step": 8386 + }, + { + "epoch": 3.7275555555555555, + "grad_norm": 3.2693023681640625, + "learning_rate": 5.094306049822064e-05, + "loss": 1.0885, + "step": 8387 + }, + { + "epoch": 3.7279999999999998, + "grad_norm": 3.3839292526245117, + "learning_rate": 5.092526690391459e-05, + "loss": 1.2459, + "step": 8388 + }, + { + "epoch": 3.7284444444444444, + "grad_norm": 4.055683135986328, + "learning_rate": 5.0907473309608546e-05, + "loss": 1.1619, + "step": 8389 + }, + { + "epoch": 3.728888888888889, + "grad_norm": 4.578985691070557, + "learning_rate": 5.0889679715302496e-05, + "loss": 1.3901, + "step": 8390 + }, + { + "epoch": 3.7293333333333334, + "grad_norm": 3.6490328311920166, + "learning_rate": 
5.0871886120996446e-05, + "loss": 1.2829, + "step": 8391 + }, + { + "epoch": 3.7297777777777776, + "grad_norm": 4.452314853668213, + "learning_rate": 5.085409252669039e-05, + "loss": 1.9056, + "step": 8392 + }, + { + "epoch": 3.7302222222222223, + "grad_norm": 2.537201166152954, + "learning_rate": 5.083629893238434e-05, + "loss": 0.9342, + "step": 8393 + }, + { + "epoch": 3.7306666666666666, + "grad_norm": 3.178445339202881, + "learning_rate": 5.0818505338078295e-05, + "loss": 1.0333, + "step": 8394 + }, + { + "epoch": 3.7311111111111113, + "grad_norm": 3.890375852584839, + "learning_rate": 5.0800711743772245e-05, + "loss": 1.2319, + "step": 8395 + }, + { + "epoch": 3.7315555555555555, + "grad_norm": 3.9328713417053223, + "learning_rate": 5.0782918149466194e-05, + "loss": 0.9856, + "step": 8396 + }, + { + "epoch": 3.732, + "grad_norm": 4.24088191986084, + "learning_rate": 5.076512455516015e-05, + "loss": 1.2177, + "step": 8397 + }, + { + "epoch": 3.7324444444444445, + "grad_norm": 3.290562629699707, + "learning_rate": 5.07473309608541e-05, + "loss": 0.7268, + "step": 8398 + }, + { + "epoch": 3.7328888888888887, + "grad_norm": 3.8136353492736816, + "learning_rate": 5.0729537366548043e-05, + "loss": 0.3112, + "step": 8399 + }, + { + "epoch": 3.7333333333333334, + "grad_norm": 4.7547454833984375, + "learning_rate": 5.071174377224199e-05, + "loss": 0.8184, + "step": 8400 + }, + { + "epoch": 3.7337777777777776, + "grad_norm": 2.6292288303375244, + "learning_rate": 5.069395017793594e-05, + "loss": 1.9555, + "step": 8401 + }, + { + "epoch": 3.7342222222222223, + "grad_norm": 1.9523777961730957, + "learning_rate": 5.06761565836299e-05, + "loss": 0.8654, + "step": 8402 + }, + { + "epoch": 3.7346666666666666, + "grad_norm": 2.6126413345336914, + "learning_rate": 5.065836298932385e-05, + "loss": 1.7074, + "step": 8403 + }, + { + "epoch": 3.7351111111111113, + "grad_norm": 2.737042188644409, + "learning_rate": 5.06405693950178e-05, + "loss": 1.7787, + "step": 8404 + }, + { + "epoch": 3.7355555555555555, + "grad_norm": 3.067366600036621, + "learning_rate": 5.062277580071174e-05, + "loss": 1.4087, + "step": 8405 + }, + { + "epoch": 3.7359999999999998, + "grad_norm": 3.5647099018096924, + "learning_rate": 5.060498220640569e-05, + "loss": 1.8064, + "step": 8406 + }, + { + "epoch": 3.7364444444444445, + "grad_norm": 3.5738532543182373, + "learning_rate": 5.058718861209965e-05, + "loss": 2.0468, + "step": 8407 + }, + { + "epoch": 3.736888888888889, + "grad_norm": 3.4049813747406006, + "learning_rate": 5.05693950177936e-05, + "loss": 1.33, + "step": 8408 + }, + { + "epoch": 3.7373333333333334, + "grad_norm": 3.1586861610412598, + "learning_rate": 5.055160142348755e-05, + "loss": 1.7597, + "step": 8409 + }, + { + "epoch": 3.7377777777777776, + "grad_norm": 2.9543251991271973, + "learning_rate": 5.0533807829181504e-05, + "loss": 1.3858, + "step": 8410 + }, + { + "epoch": 3.7382222222222223, + "grad_norm": 3.1417078971862793, + "learning_rate": 5.051601423487544e-05, + "loss": 1.8369, + "step": 8411 + }, + { + "epoch": 3.7386666666666666, + "grad_norm": 3.0961074829101562, + "learning_rate": 5.04982206405694e-05, + "loss": 1.4527, + "step": 8412 + }, + { + "epoch": 3.7391111111111113, + "grad_norm": 3.145967483520508, + "learning_rate": 5.0480427046263346e-05, + "loss": 1.2984, + "step": 8413 + }, + { + "epoch": 3.7395555555555555, + "grad_norm": 2.759549856185913, + "learning_rate": 5.0462633451957296e-05, + "loss": 0.936, + "step": 8414 + }, + { + "epoch": 3.74, + "grad_norm": 2.0562026500701904, + 
"learning_rate": 5.044483985765125e-05, + "loss": 0.7822, + "step": 8415 + }, + { + "epoch": 3.7404444444444445, + "grad_norm": 3.5271592140197754, + "learning_rate": 5.04270462633452e-05, + "loss": 1.9599, + "step": 8416 + }, + { + "epoch": 3.7408888888888887, + "grad_norm": 4.152881145477295, + "learning_rate": 5.040925266903915e-05, + "loss": 1.5676, + "step": 8417 + }, + { + "epoch": 3.7413333333333334, + "grad_norm": 3.7143032550811768, + "learning_rate": 5.0391459074733095e-05, + "loss": 1.4384, + "step": 8418 + }, + { + "epoch": 3.7417777777777776, + "grad_norm": 3.6692559719085693, + "learning_rate": 5.0373665480427045e-05, + "loss": 1.0879, + "step": 8419 + }, + { + "epoch": 3.7422222222222223, + "grad_norm": 2.8461532592773438, + "learning_rate": 5.0355871886120994e-05, + "loss": 1.2018, + "step": 8420 + }, + { + "epoch": 3.7426666666666666, + "grad_norm": 3.5240817070007324, + "learning_rate": 5.033807829181495e-05, + "loss": 1.7161, + "step": 8421 + }, + { + "epoch": 3.7431111111111113, + "grad_norm": 3.98225998878479, + "learning_rate": 5.03202846975089e-05, + "loss": 1.5325, + "step": 8422 + }, + { + "epoch": 3.7435555555555555, + "grad_norm": 3.3189501762390137, + "learning_rate": 5.030249110320285e-05, + "loss": 1.6055, + "step": 8423 + }, + { + "epoch": 3.7439999999999998, + "grad_norm": 3.6065752506256104, + "learning_rate": 5.028469750889679e-05, + "loss": 1.6128, + "step": 8424 + }, + { + "epoch": 3.7444444444444445, + "grad_norm": 3.7471060752868652, + "learning_rate": 5.026690391459074e-05, + "loss": 1.1217, + "step": 8425 + }, + { + "epoch": 3.744888888888889, + "grad_norm": 3.301255702972412, + "learning_rate": 5.02491103202847e-05, + "loss": 1.1631, + "step": 8426 + }, + { + "epoch": 3.7453333333333334, + "grad_norm": 3.3452343940734863, + "learning_rate": 5.023131672597865e-05, + "loss": 1.2224, + "step": 8427 + }, + { + "epoch": 3.7457777777777777, + "grad_norm": 3.3727502822875977, + "learning_rate": 5.02135231316726e-05, + "loss": 1.1533, + "step": 8428 + }, + { + "epoch": 3.7462222222222223, + "grad_norm": 3.6482954025268555, + "learning_rate": 5.0195729537366555e-05, + "loss": 1.4724, + "step": 8429 + }, + { + "epoch": 3.7466666666666666, + "grad_norm": 3.6111557483673096, + "learning_rate": 5.0177935943060505e-05, + "loss": 1.2438, + "step": 8430 + }, + { + "epoch": 3.747111111111111, + "grad_norm": 0.2538396120071411, + "learning_rate": 5.016014234875445e-05, + "loss": 0.0322, + "step": 8431 + }, + { + "epoch": 3.7475555555555555, + "grad_norm": 3.558535575866699, + "learning_rate": 5.01423487544484e-05, + "loss": 1.6036, + "step": 8432 + }, + { + "epoch": 3.748, + "grad_norm": 3.5707340240478516, + "learning_rate": 5.012455516014235e-05, + "loss": 1.0665, + "step": 8433 + }, + { + "epoch": 3.7484444444444445, + "grad_norm": 3.5678842067718506, + "learning_rate": 5.0106761565836304e-05, + "loss": 0.9684, + "step": 8434 + }, + { + "epoch": 3.7488888888888887, + "grad_norm": 3.770473003387451, + "learning_rate": 5.0088967971530254e-05, + "loss": 1.1987, + "step": 8435 + }, + { + "epoch": 3.7493333333333334, + "grad_norm": 3.830598831176758, + "learning_rate": 5.0071174377224204e-05, + "loss": 1.2741, + "step": 8436 + }, + { + "epoch": 3.7497777777777777, + "grad_norm": 3.5863218307495117, + "learning_rate": 5.0053380782918146e-05, + "loss": 1.7064, + "step": 8437 + }, + { + "epoch": 3.7502222222222223, + "grad_norm": 4.064740180969238, + "learning_rate": 5.0035587188612096e-05, + "loss": 1.5107, + "step": 8438 + }, + { + "epoch": 3.7506666666666666, + 
"grad_norm": 3.272308588027954, + "learning_rate": 5.001779359430605e-05, + "loss": 1.0924, + "step": 8439 + }, + { + "epoch": 3.7511111111111113, + "grad_norm": 3.53114914894104, + "learning_rate": 5e-05, + "loss": 1.1262, + "step": 8440 + }, + { + "epoch": 3.7515555555555555, + "grad_norm": 3.6443865299224854, + "learning_rate": 4.998220640569395e-05, + "loss": 1.2448, + "step": 8441 + }, + { + "epoch": 3.752, + "grad_norm": 4.559260845184326, + "learning_rate": 4.99644128113879e-05, + "loss": 1.0831, + "step": 8442 + }, + { + "epoch": 3.7524444444444445, + "grad_norm": 4.263270378112793, + "learning_rate": 4.994661921708185e-05, + "loss": 1.2406, + "step": 8443 + }, + { + "epoch": 3.752888888888889, + "grad_norm": 4.247504234313965, + "learning_rate": 4.99288256227758e-05, + "loss": 1.2629, + "step": 8444 + }, + { + "epoch": 3.7533333333333334, + "grad_norm": 3.752406358718872, + "learning_rate": 4.991103202846975e-05, + "loss": 1.1891, + "step": 8445 + }, + { + "epoch": 3.7537777777777777, + "grad_norm": 3.750194787979126, + "learning_rate": 4.98932384341637e-05, + "loss": 1.3531, + "step": 8446 + }, + { + "epoch": 3.7542222222222223, + "grad_norm": 5.040890216827393, + "learning_rate": 4.987544483985766e-05, + "loss": 1.2088, + "step": 8447 + }, + { + "epoch": 3.7546666666666666, + "grad_norm": 4.1202006340026855, + "learning_rate": 4.985765124555161e-05, + "loss": 1.2779, + "step": 8448 + }, + { + "epoch": 3.755111111111111, + "grad_norm": 4.093197345733643, + "learning_rate": 4.983985765124555e-05, + "loss": 0.832, + "step": 8449 + }, + { + "epoch": 3.7555555555555555, + "grad_norm": 4.263852119445801, + "learning_rate": 4.9822064056939506e-05, + "loss": 0.4735, + "step": 8450 + }, + { + "epoch": 3.7560000000000002, + "grad_norm": 1.3852638006210327, + "learning_rate": 4.9804270462633456e-05, + "loss": 0.6947, + "step": 8451 + }, + { + "epoch": 3.7564444444444445, + "grad_norm": 2.5553395748138428, + "learning_rate": 4.9786476868327406e-05, + "loss": 1.6021, + "step": 8452 + }, + { + "epoch": 3.7568888888888887, + "grad_norm": 2.2025933265686035, + "learning_rate": 4.9768683274021356e-05, + "loss": 1.1253, + "step": 8453 + }, + { + "epoch": 3.7573333333333334, + "grad_norm": 2.739720582962036, + "learning_rate": 4.9750889679715305e-05, + "loss": 1.8841, + "step": 8454 + }, + { + "epoch": 3.7577777777777777, + "grad_norm": 3.2897424697875977, + "learning_rate": 4.9733096085409255e-05, + "loss": 1.8541, + "step": 8455 + }, + { + "epoch": 3.7582222222222224, + "grad_norm": 3.067814350128174, + "learning_rate": 4.9715302491103205e-05, + "loss": 1.4044, + "step": 8456 + }, + { + "epoch": 3.7586666666666666, + "grad_norm": 2.3474533557891846, + "learning_rate": 4.9697508896797154e-05, + "loss": 1.1692, + "step": 8457 + }, + { + "epoch": 3.7591111111111113, + "grad_norm": 3.4697251319885254, + "learning_rate": 4.9679715302491104e-05, + "loss": 1.6912, + "step": 8458 + }, + { + "epoch": 3.7595555555555555, + "grad_norm": 3.053978443145752, + "learning_rate": 4.9661921708185054e-05, + "loss": 1.4072, + "step": 8459 + }, + { + "epoch": 3.76, + "grad_norm": 3.327169179916382, + "learning_rate": 4.964412811387901e-05, + "loss": 1.6578, + "step": 8460 + }, + { + "epoch": 3.7604444444444445, + "grad_norm": 3.0925164222717285, + "learning_rate": 4.962633451957295e-05, + "loss": 1.6706, + "step": 8461 + }, + { + "epoch": 3.7608888888888887, + "grad_norm": 2.676941394805908, + "learning_rate": 4.96085409252669e-05, + "loss": 1.1866, + "step": 8462 + }, + { + "epoch": 3.7613333333333334, + 
"grad_norm": 3.976247787475586, + "learning_rate": 4.959074733096086e-05, + "loss": 1.4686, + "step": 8463 + }, + { + "epoch": 3.7617777777777777, + "grad_norm": 3.166390895843506, + "learning_rate": 4.957295373665481e-05, + "loss": 1.1084, + "step": 8464 + }, + { + "epoch": 3.7622222222222224, + "grad_norm": 3.5380375385284424, + "learning_rate": 4.955516014234875e-05, + "loss": 1.5746, + "step": 8465 + }, + { + "epoch": 3.7626666666666666, + "grad_norm": 2.127175807952881, + "learning_rate": 4.953736654804271e-05, + "loss": 0.4779, + "step": 8466 + }, + { + "epoch": 3.763111111111111, + "grad_norm": 3.756263494491577, + "learning_rate": 4.951957295373666e-05, + "loss": 1.2304, + "step": 8467 + }, + { + "epoch": 3.7635555555555555, + "grad_norm": 3.4877758026123047, + "learning_rate": 4.950177935943061e-05, + "loss": 1.7547, + "step": 8468 + }, + { + "epoch": 3.7640000000000002, + "grad_norm": 3.6990151405334473, + "learning_rate": 4.948398576512456e-05, + "loss": 1.4276, + "step": 8469 + }, + { + "epoch": 3.7644444444444445, + "grad_norm": 3.083824634552002, + "learning_rate": 4.946619217081851e-05, + "loss": 1.3949, + "step": 8470 + }, + { + "epoch": 3.7648888888888887, + "grad_norm": 3.808980703353882, + "learning_rate": 4.944839857651246e-05, + "loss": 1.247, + "step": 8471 + }, + { + "epoch": 3.7653333333333334, + "grad_norm": 3.5250051021575928, + "learning_rate": 4.943060498220641e-05, + "loss": 1.6016, + "step": 8472 + }, + { + "epoch": 3.7657777777777777, + "grad_norm": 3.2846179008483887, + "learning_rate": 4.941281138790036e-05, + "loss": 1.1513, + "step": 8473 + }, + { + "epoch": 3.7662222222222224, + "grad_norm": 3.1756033897399902, + "learning_rate": 4.9395017793594306e-05, + "loss": 1.2079, + "step": 8474 + }, + { + "epoch": 3.7666666666666666, + "grad_norm": 3.1101205348968506, + "learning_rate": 4.9377224199288256e-05, + "loss": 1.1411, + "step": 8475 + }, + { + "epoch": 3.7671111111111113, + "grad_norm": 3.436415433883667, + "learning_rate": 4.935943060498221e-05, + "loss": 1.2356, + "step": 8476 + }, + { + "epoch": 3.7675555555555555, + "grad_norm": 3.413377285003662, + "learning_rate": 4.934163701067616e-05, + "loss": 1.6142, + "step": 8477 + }, + { + "epoch": 3.768, + "grad_norm": 3.4654288291931152, + "learning_rate": 4.9323843416370105e-05, + "loss": 1.4022, + "step": 8478 + }, + { + "epoch": 3.7684444444444445, + "grad_norm": 1.9338456392288208, + "learning_rate": 4.930604982206406e-05, + "loss": 0.5082, + "step": 8479 + }, + { + "epoch": 3.7688888888888887, + "grad_norm": 3.8473315238952637, + "learning_rate": 4.928825622775801e-05, + "loss": 1.4336, + "step": 8480 + }, + { + "epoch": 3.7693333333333334, + "grad_norm": 2.3992502689361572, + "learning_rate": 4.927046263345196e-05, + "loss": 0.7091, + "step": 8481 + }, + { + "epoch": 3.7697777777777777, + "grad_norm": 3.7076449394226074, + "learning_rate": 4.925266903914591e-05, + "loss": 1.5299, + "step": 8482 + }, + { + "epoch": 3.7702222222222224, + "grad_norm": 3.1709039211273193, + "learning_rate": 4.923487544483986e-05, + "loss": 1.0703, + "step": 8483 + }, + { + "epoch": 3.7706666666666666, + "grad_norm": 3.58585262298584, + "learning_rate": 4.921708185053381e-05, + "loss": 1.3857, + "step": 8484 + }, + { + "epoch": 3.771111111111111, + "grad_norm": 3.486786365509033, + "learning_rate": 4.919928825622776e-05, + "loss": 1.5706, + "step": 8485 + }, + { + "epoch": 3.7715555555555556, + "grad_norm": 3.2322800159454346, + "learning_rate": 4.918149466192171e-05, + "loss": 0.9788, + "step": 8486 + }, + { + "epoch": 
3.7720000000000002, + "grad_norm": 3.4824700355529785, + "learning_rate": 4.916370106761566e-05, + "loss": 1.3818, + "step": 8487 + }, + { + "epoch": 3.7724444444444445, + "grad_norm": 3.4638094902038574, + "learning_rate": 4.914590747330961e-05, + "loss": 1.3499, + "step": 8488 + }, + { + "epoch": 3.7728888888888887, + "grad_norm": 3.505972146987915, + "learning_rate": 4.912811387900356e-05, + "loss": 1.0351, + "step": 8489 + }, + { + "epoch": 3.7733333333333334, + "grad_norm": 3.177283525466919, + "learning_rate": 4.911032028469751e-05, + "loss": 1.4239, + "step": 8490 + }, + { + "epoch": 3.7737777777777777, + "grad_norm": 3.8296144008636475, + "learning_rate": 4.909252669039146e-05, + "loss": 1.449, + "step": 8491 + }, + { + "epoch": 3.7742222222222224, + "grad_norm": 3.507258415222168, + "learning_rate": 4.9074733096085415e-05, + "loss": 1.1, + "step": 8492 + }, + { + "epoch": 3.7746666666666666, + "grad_norm": 3.143571615219116, + "learning_rate": 4.9056939501779365e-05, + "loss": 0.668, + "step": 8493 + }, + { + "epoch": 3.7751111111111113, + "grad_norm": 4.362837314605713, + "learning_rate": 4.903914590747331e-05, + "loss": 1.2331, + "step": 8494 + }, + { + "epoch": 3.7755555555555556, + "grad_norm": 4.254435062408447, + "learning_rate": 4.9021352313167264e-05, + "loss": 1.0221, + "step": 8495 + }, + { + "epoch": 3.776, + "grad_norm": 3.589878797531128, + "learning_rate": 4.9003558718861214e-05, + "loss": 0.9752, + "step": 8496 + }, + { + "epoch": 3.7764444444444445, + "grad_norm": 4.932192325592041, + "learning_rate": 4.8985765124555164e-05, + "loss": 1.1548, + "step": 8497 + }, + { + "epoch": 3.7768888888888887, + "grad_norm": 4.631229400634766, + "learning_rate": 4.896797153024911e-05, + "loss": 1.2203, + "step": 8498 + }, + { + "epoch": 3.7773333333333334, + "grad_norm": 5.158580780029297, + "learning_rate": 4.895017793594306e-05, + "loss": 1.5583, + "step": 8499 + }, + { + "epoch": 3.7777777777777777, + "grad_norm": 4.34719181060791, + "learning_rate": 4.893238434163701e-05, + "loss": 0.8711, + "step": 8500 + }, + { + "epoch": 3.7782222222222224, + "grad_norm": 1.7901962995529175, + "learning_rate": 4.891459074733096e-05, + "loss": 0.969, + "step": 8501 + }, + { + "epoch": 3.7786666666666666, + "grad_norm": 2.875005006790161, + "learning_rate": 4.889679715302491e-05, + "loss": 1.8006, + "step": 8502 + }, + { + "epoch": 3.779111111111111, + "grad_norm": 0.1947576105594635, + "learning_rate": 4.887900355871886e-05, + "loss": 0.0163, + "step": 8503 + }, + { + "epoch": 3.7795555555555556, + "grad_norm": 2.6576716899871826, + "learning_rate": 4.886120996441281e-05, + "loss": 1.585, + "step": 8504 + }, + { + "epoch": 3.7800000000000002, + "grad_norm": 2.818967342376709, + "learning_rate": 4.884341637010677e-05, + "loss": 1.658, + "step": 8505 + }, + { + "epoch": 3.7804444444444445, + "grad_norm": 3.126274585723877, + "learning_rate": 4.882562277580072e-05, + "loss": 1.2724, + "step": 8506 + }, + { + "epoch": 3.7808888888888887, + "grad_norm": 3.2297914028167725, + "learning_rate": 4.880782918149466e-05, + "loss": 1.7334, + "step": 8507 + }, + { + "epoch": 3.7813333333333334, + "grad_norm": 3.0050487518310547, + "learning_rate": 4.879003558718862e-05, + "loss": 1.734, + "step": 8508 + }, + { + "epoch": 3.7817777777777777, + "grad_norm": 3.3488566875457764, + "learning_rate": 4.877224199288257e-05, + "loss": 1.7469, + "step": 8509 + }, + { + "epoch": 3.7822222222222224, + "grad_norm": 2.3252031803131104, + "learning_rate": 4.875444839857651e-05, + "loss": 0.6167, + "step": 8510 + }, + 
{ + "epoch": 3.7826666666666666, + "grad_norm": 3.154351234436035, + "learning_rate": 4.8736654804270466e-05, + "loss": 1.5694, + "step": 8511 + }, + { + "epoch": 3.7831111111111113, + "grad_norm": 3.574852705001831, + "learning_rate": 4.8718861209964416e-05, + "loss": 1.4769, + "step": 8512 + }, + { + "epoch": 3.7835555555555556, + "grad_norm": 2.932189464569092, + "learning_rate": 4.8701067615658366e-05, + "loss": 1.2641, + "step": 8513 + }, + { + "epoch": 3.784, + "grad_norm": 3.2192270755767822, + "learning_rate": 4.8683274021352316e-05, + "loss": 1.5916, + "step": 8514 + }, + { + "epoch": 3.7844444444444445, + "grad_norm": 3.2948851585388184, + "learning_rate": 4.8665480427046265e-05, + "loss": 1.4189, + "step": 8515 + }, + { + "epoch": 3.7848888888888887, + "grad_norm": 3.3815078735351562, + "learning_rate": 4.8647686832740215e-05, + "loss": 1.5318, + "step": 8516 + }, + { + "epoch": 3.7853333333333334, + "grad_norm": 3.34417724609375, + "learning_rate": 4.8629893238434165e-05, + "loss": 1.5307, + "step": 8517 + }, + { + "epoch": 3.7857777777777777, + "grad_norm": 3.700117826461792, + "learning_rate": 4.8612099644128115e-05, + "loss": 0.8523, + "step": 8518 + }, + { + "epoch": 3.7862222222222224, + "grad_norm": 3.057548999786377, + "learning_rate": 4.8594306049822064e-05, + "loss": 1.1474, + "step": 8519 + }, + { + "epoch": 3.7866666666666666, + "grad_norm": 3.9406673908233643, + "learning_rate": 4.8576512455516014e-05, + "loss": 1.5789, + "step": 8520 + }, + { + "epoch": 3.787111111111111, + "grad_norm": 2.2049951553344727, + "learning_rate": 4.855871886120997e-05, + "loss": 0.9952, + "step": 8521 + }, + { + "epoch": 3.7875555555555556, + "grad_norm": 3.1826930046081543, + "learning_rate": 4.854092526690392e-05, + "loss": 1.1047, + "step": 8522 + }, + { + "epoch": 3.7880000000000003, + "grad_norm": 5.1929192543029785, + "learning_rate": 4.852313167259786e-05, + "loss": 1.3763, + "step": 8523 + }, + { + "epoch": 3.7884444444444445, + "grad_norm": 3.3125691413879395, + "learning_rate": 4.850533807829182e-05, + "loss": 1.5083, + "step": 8524 + }, + { + "epoch": 3.7888888888888888, + "grad_norm": 2.958933115005493, + "learning_rate": 4.848754448398577e-05, + "loss": 0.9042, + "step": 8525 + }, + { + "epoch": 3.7893333333333334, + "grad_norm": 3.5505728721618652, + "learning_rate": 4.846975088967972e-05, + "loss": 1.4218, + "step": 8526 + }, + { + "epoch": 3.7897777777777777, + "grad_norm": 3.6742985248565674, + "learning_rate": 4.845195729537367e-05, + "loss": 1.1994, + "step": 8527 + }, + { + "epoch": 3.7902222222222224, + "grad_norm": 3.1948156356811523, + "learning_rate": 4.843416370106762e-05, + "loss": 1.439, + "step": 8528 + }, + { + "epoch": 3.7906666666666666, + "grad_norm": 3.452651262283325, + "learning_rate": 4.841637010676157e-05, + "loss": 0.9603, + "step": 8529 + }, + { + "epoch": 3.7911111111111113, + "grad_norm": 3.6245038509368896, + "learning_rate": 4.839857651245552e-05, + "loss": 1.3753, + "step": 8530 + }, + { + "epoch": 3.7915555555555556, + "grad_norm": 3.7631113529205322, + "learning_rate": 4.838078291814947e-05, + "loss": 1.3403, + "step": 8531 + }, + { + "epoch": 3.792, + "grad_norm": 3.130178213119507, + "learning_rate": 4.836298932384342e-05, + "loss": 1.4048, + "step": 8532 + }, + { + "epoch": 3.7924444444444445, + "grad_norm": 4.133440017700195, + "learning_rate": 4.834519572953737e-05, + "loss": 1.1999, + "step": 8533 + }, + { + "epoch": 3.7928888888888888, + "grad_norm": 3.3631365299224854, + "learning_rate": 4.832740213523132e-05, + "loss": 1.1746, + 
"step": 8534 + }, + { + "epoch": 3.7933333333333334, + "grad_norm": 1.9450854063034058, + "learning_rate": 4.830960854092527e-05, + "loss": 0.3845, + "step": 8535 + }, + { + "epoch": 3.7937777777777777, + "grad_norm": 3.2177698612213135, + "learning_rate": 4.8291814946619216e-05, + "loss": 0.9918, + "step": 8536 + }, + { + "epoch": 3.7942222222222224, + "grad_norm": 3.5220441818237305, + "learning_rate": 4.827402135231317e-05, + "loss": 1.4969, + "step": 8537 + }, + { + "epoch": 3.7946666666666666, + "grad_norm": 4.0864996910095215, + "learning_rate": 4.825622775800712e-05, + "loss": 1.7111, + "step": 8538 + }, + { + "epoch": 3.795111111111111, + "grad_norm": 3.846219062805176, + "learning_rate": 4.8238434163701065e-05, + "loss": 1.1624, + "step": 8539 + }, + { + "epoch": 3.7955555555555556, + "grad_norm": 3.6705775260925293, + "learning_rate": 4.822064056939502e-05, + "loss": 1.132, + "step": 8540 + }, + { + "epoch": 3.7960000000000003, + "grad_norm": 3.94295334815979, + "learning_rate": 4.820284697508897e-05, + "loss": 1.6029, + "step": 8541 + }, + { + "epoch": 3.7964444444444445, + "grad_norm": 4.677298069000244, + "learning_rate": 4.818505338078292e-05, + "loss": 1.2734, + "step": 8542 + }, + { + "epoch": 3.7968888888888888, + "grad_norm": 3.895962715148926, + "learning_rate": 4.816725978647687e-05, + "loss": 0.8572, + "step": 8543 + }, + { + "epoch": 3.7973333333333334, + "grad_norm": 3.5614423751831055, + "learning_rate": 4.814946619217082e-05, + "loss": 1.3358, + "step": 8544 + }, + { + "epoch": 3.7977777777777777, + "grad_norm": 4.679959774017334, + "learning_rate": 4.813167259786477e-05, + "loss": 1.301, + "step": 8545 + }, + { + "epoch": 3.7982222222222224, + "grad_norm": 3.1577718257904053, + "learning_rate": 4.811387900355872e-05, + "loss": 0.7731, + "step": 8546 + }, + { + "epoch": 3.7986666666666666, + "grad_norm": 3.898137331008911, + "learning_rate": 4.809608540925267e-05, + "loss": 1.1303, + "step": 8547 + }, + { + "epoch": 3.7991111111111113, + "grad_norm": 4.067081928253174, + "learning_rate": 4.807829181494662e-05, + "loss": 1.231, + "step": 8548 + }, + { + "epoch": 3.7995555555555556, + "grad_norm": 3.7059855461120605, + "learning_rate": 4.806049822064057e-05, + "loss": 1.2194, + "step": 8549 + }, + { + "epoch": 3.8, + "grad_norm": 4.398168563842773, + "learning_rate": 4.8042704626334526e-05, + "loss": 1.011, + "step": 8550 + }, + { + "epoch": 3.8004444444444445, + "grad_norm": 2.875924587249756, + "learning_rate": 4.8024911032028476e-05, + "loss": 1.9846, + "step": 8551 + }, + { + "epoch": 3.8008888888888888, + "grad_norm": 2.9650471210479736, + "learning_rate": 4.800711743772242e-05, + "loss": 2.1224, + "step": 8552 + }, + { + "epoch": 3.8013333333333335, + "grad_norm": 2.5322835445404053, + "learning_rate": 4.7989323843416375e-05, + "loss": 1.3393, + "step": 8553 + }, + { + "epoch": 3.8017777777777777, + "grad_norm": 3.003859281539917, + "learning_rate": 4.7971530249110325e-05, + "loss": 1.2544, + "step": 8554 + }, + { + "epoch": 3.8022222222222224, + "grad_norm": 3.0419626235961914, + "learning_rate": 4.795373665480427e-05, + "loss": 1.7736, + "step": 8555 + }, + { + "epoch": 3.8026666666666666, + "grad_norm": 3.1113429069519043, + "learning_rate": 4.7935943060498224e-05, + "loss": 1.4109, + "step": 8556 + }, + { + "epoch": 3.803111111111111, + "grad_norm": 3.09995698928833, + "learning_rate": 4.7918149466192174e-05, + "loss": 1.862, + "step": 8557 + }, + { + "epoch": 3.8035555555555556, + "grad_norm": 2.92777156829834, + "learning_rate": 4.7900355871886124e-05, + 
"loss": 1.3818, + "step": 8558 + }, + { + "epoch": 3.8040000000000003, + "grad_norm": 3.054277181625366, + "learning_rate": 4.7882562277580073e-05, + "loss": 1.6397, + "step": 8559 + }, + { + "epoch": 3.8044444444444445, + "grad_norm": 3.6034891605377197, + "learning_rate": 4.786476868327402e-05, + "loss": 1.8047, + "step": 8560 + }, + { + "epoch": 3.8048888888888888, + "grad_norm": 3.8269503116607666, + "learning_rate": 4.784697508896797e-05, + "loss": 1.6447, + "step": 8561 + }, + { + "epoch": 3.8053333333333335, + "grad_norm": 3.4392542839050293, + "learning_rate": 4.782918149466192e-05, + "loss": 1.5413, + "step": 8562 + }, + { + "epoch": 3.8057777777777777, + "grad_norm": 3.8955986499786377, + "learning_rate": 4.781138790035587e-05, + "loss": 1.7783, + "step": 8563 + }, + { + "epoch": 3.806222222222222, + "grad_norm": 3.504478693008423, + "learning_rate": 4.779359430604983e-05, + "loss": 1.0836, + "step": 8564 + }, + { + "epoch": 3.8066666666666666, + "grad_norm": 2.9569408893585205, + "learning_rate": 4.777580071174377e-05, + "loss": 1.1422, + "step": 8565 + }, + { + "epoch": 3.8071111111111113, + "grad_norm": 3.3336431980133057, + "learning_rate": 4.775800711743773e-05, + "loss": 1.6741, + "step": 8566 + }, + { + "epoch": 3.8075555555555556, + "grad_norm": 3.0466670989990234, + "learning_rate": 4.774021352313168e-05, + "loss": 1.207, + "step": 8567 + }, + { + "epoch": 3.808, + "grad_norm": 3.735114574432373, + "learning_rate": 4.772241992882562e-05, + "loss": 1.0772, + "step": 8568 + }, + { + "epoch": 3.8084444444444445, + "grad_norm": 3.699998617172241, + "learning_rate": 4.770462633451958e-05, + "loss": 1.3742, + "step": 8569 + }, + { + "epoch": 3.8088888888888888, + "grad_norm": 3.7992329597473145, + "learning_rate": 4.768683274021353e-05, + "loss": 1.3175, + "step": 8570 + }, + { + "epoch": 3.8093333333333335, + "grad_norm": 2.9106605052948, + "learning_rate": 4.766903914590748e-05, + "loss": 1.1899, + "step": 8571 + }, + { + "epoch": 3.8097777777777777, + "grad_norm": 3.2745654582977295, + "learning_rate": 4.7651245551601427e-05, + "loss": 1.0309, + "step": 8572 + }, + { + "epoch": 3.8102222222222224, + "grad_norm": 4.083169937133789, + "learning_rate": 4.7633451957295376e-05, + "loss": 1.5096, + "step": 8573 + }, + { + "epoch": 3.8106666666666666, + "grad_norm": 3.3155415058135986, + "learning_rate": 4.7615658362989326e-05, + "loss": 1.3212, + "step": 8574 + }, + { + "epoch": 3.811111111111111, + "grad_norm": 3.864699363708496, + "learning_rate": 4.7597864768683276e-05, + "loss": 1.3274, + "step": 8575 + }, + { + "epoch": 3.8115555555555556, + "grad_norm": 3.174386739730835, + "learning_rate": 4.7580071174377225e-05, + "loss": 1.1286, + "step": 8576 + }, + { + "epoch": 3.8120000000000003, + "grad_norm": 2.8707237243652344, + "learning_rate": 4.7562277580071175e-05, + "loss": 0.837, + "step": 8577 + }, + { + "epoch": 3.8124444444444445, + "grad_norm": 3.41292405128479, + "learning_rate": 4.7544483985765125e-05, + "loss": 1.3842, + "step": 8578 + }, + { + "epoch": 3.8128888888888888, + "grad_norm": 3.6212575435638428, + "learning_rate": 4.7526690391459075e-05, + "loss": 1.4985, + "step": 8579 + }, + { + "epoch": 3.8133333333333335, + "grad_norm": 3.4787771701812744, + "learning_rate": 4.750889679715303e-05, + "loss": 1.2398, + "step": 8580 + }, + { + "epoch": 3.8137777777777777, + "grad_norm": 3.5631048679351807, + "learning_rate": 4.7491103202846974e-05, + "loss": 1.2581, + "step": 8581 + }, + { + "epoch": 3.814222222222222, + "grad_norm": 3.253061056137085, + "learning_rate": 
4.747330960854093e-05, + "loss": 0.8728, + "step": 8582 + }, + { + "epoch": 3.8146666666666667, + "grad_norm": 3.8624134063720703, + "learning_rate": 4.745551601423488e-05, + "loss": 1.3563, + "step": 8583 + }, + { + "epoch": 3.8151111111111113, + "grad_norm": 3.724658966064453, + "learning_rate": 4.743772241992882e-05, + "loss": 1.2391, + "step": 8584 + }, + { + "epoch": 3.8155555555555556, + "grad_norm": 4.286252975463867, + "learning_rate": 4.741992882562278e-05, + "loss": 1.6731, + "step": 8585 + }, + { + "epoch": 3.816, + "grad_norm": 4.29559326171875, + "learning_rate": 4.740213523131673e-05, + "loss": 1.3075, + "step": 8586 + }, + { + "epoch": 3.8164444444444445, + "grad_norm": 4.753545761108398, + "learning_rate": 4.738434163701068e-05, + "loss": 1.5687, + "step": 8587 + }, + { + "epoch": 3.8168888888888888, + "grad_norm": 2.7094476222991943, + "learning_rate": 4.736654804270463e-05, + "loss": 0.731, + "step": 8588 + }, + { + "epoch": 3.8173333333333335, + "grad_norm": 3.8449037075042725, + "learning_rate": 4.734875444839858e-05, + "loss": 1.4633, + "step": 8589 + }, + { + "epoch": 3.8177777777777777, + "grad_norm": 3.7161953449249268, + "learning_rate": 4.733096085409253e-05, + "loss": 0.9841, + "step": 8590 + }, + { + "epoch": 3.8182222222222224, + "grad_norm": 4.3916449546813965, + "learning_rate": 4.731316725978648e-05, + "loss": 1.2136, + "step": 8591 + }, + { + "epoch": 3.8186666666666667, + "grad_norm": 2.985351800918579, + "learning_rate": 4.729537366548043e-05, + "loss": 0.7911, + "step": 8592 + }, + { + "epoch": 3.819111111111111, + "grad_norm": 3.5371320247650146, + "learning_rate": 4.7277580071174384e-05, + "loss": 1.0655, + "step": 8593 + }, + { + "epoch": 3.8195555555555556, + "grad_norm": 9.645051956176758, + "learning_rate": 4.725978647686833e-05, + "loss": 1.1596, + "step": 8594 + }, + { + "epoch": 3.82, + "grad_norm": 4.049137115478516, + "learning_rate": 4.7241992882562284e-05, + "loss": 1.3832, + "step": 8595 + }, + { + "epoch": 3.8204444444444445, + "grad_norm": 3.475752592086792, + "learning_rate": 4.7224199288256233e-05, + "loss": 1.171, + "step": 8596 + }, + { + "epoch": 3.820888888888889, + "grad_norm": 4.638784408569336, + "learning_rate": 4.7206405693950176e-05, + "loss": 1.3144, + "step": 8597 + }, + { + "epoch": 3.8213333333333335, + "grad_norm": 4.170307636260986, + "learning_rate": 4.718861209964413e-05, + "loss": 1.1174, + "step": 8598 + }, + { + "epoch": 3.8217777777777777, + "grad_norm": 4.556606292724609, + "learning_rate": 4.717081850533808e-05, + "loss": 1.1181, + "step": 8599 + }, + { + "epoch": 3.822222222222222, + "grad_norm": 4.029767990112305, + "learning_rate": 4.7153024911032026e-05, + "loss": 0.8181, + "step": 8600 + }, + { + "epoch": 3.8226666666666667, + "grad_norm": 2.57649827003479, + "learning_rate": 4.713523131672598e-05, + "loss": 2.2343, + "step": 8601 + }, + { + "epoch": 3.8231111111111113, + "grad_norm": 3.349898338317871, + "learning_rate": 4.711743772241993e-05, + "loss": 1.74, + "step": 8602 + }, + { + "epoch": 3.8235555555555556, + "grad_norm": 3.0327224731445312, + "learning_rate": 4.709964412811388e-05, + "loss": 1.4284, + "step": 8603 + }, + { + "epoch": 3.824, + "grad_norm": 3.228236436843872, + "learning_rate": 4.708185053380783e-05, + "loss": 1.6167, + "step": 8604 + }, + { + "epoch": 3.8244444444444445, + "grad_norm": 3.0474486351013184, + "learning_rate": 4.706405693950178e-05, + "loss": 1.2462, + "step": 8605 + }, + { + "epoch": 3.824888888888889, + "grad_norm": 3.509099006652832, + "learning_rate": 
4.704626334519573e-05, + "loss": 1.7278, + "step": 8606 + }, + { + "epoch": 3.8253333333333335, + "grad_norm": 2.920429229736328, + "learning_rate": 4.702846975088968e-05, + "loss": 1.5507, + "step": 8607 + }, + { + "epoch": 3.8257777777777777, + "grad_norm": 1.919208288192749, + "learning_rate": 4.701067615658363e-05, + "loss": 0.6798, + "step": 8608 + }, + { + "epoch": 3.8262222222222224, + "grad_norm": 3.108910322189331, + "learning_rate": 4.699288256227759e-05, + "loss": 0.9496, + "step": 8609 + }, + { + "epoch": 3.8266666666666667, + "grad_norm": 3.4903645515441895, + "learning_rate": 4.697508896797153e-05, + "loss": 1.2665, + "step": 8610 + }, + { + "epoch": 3.827111111111111, + "grad_norm": 3.292814254760742, + "learning_rate": 4.6957295373665486e-05, + "loss": 1.7923, + "step": 8611 + }, + { + "epoch": 3.8275555555555556, + "grad_norm": 2.9696640968322754, + "learning_rate": 4.6939501779359436e-05, + "loss": 1.1515, + "step": 8612 + }, + { + "epoch": 3.828, + "grad_norm": 2.911243438720703, + "learning_rate": 4.692170818505338e-05, + "loss": 1.2357, + "step": 8613 + }, + { + "epoch": 3.8284444444444445, + "grad_norm": 3.220693588256836, + "learning_rate": 4.6903914590747335e-05, + "loss": 1.6873, + "step": 8614 + }, + { + "epoch": 3.828888888888889, + "grad_norm": 3.2728185653686523, + "learning_rate": 4.6886120996441285e-05, + "loss": 0.9055, + "step": 8615 + }, + { + "epoch": 3.8293333333333335, + "grad_norm": 3.27384090423584, + "learning_rate": 4.6868327402135235e-05, + "loss": 1.1947, + "step": 8616 + }, + { + "epoch": 3.8297777777777777, + "grad_norm": 3.1152114868164062, + "learning_rate": 4.6850533807829184e-05, + "loss": 1.2603, + "step": 8617 + }, + { + "epoch": 3.830222222222222, + "grad_norm": 2.450045108795166, + "learning_rate": 4.6832740213523134e-05, + "loss": 0.7963, + "step": 8618 + }, + { + "epoch": 3.8306666666666667, + "grad_norm": 3.4387762546539307, + "learning_rate": 4.6814946619217084e-05, + "loss": 1.3703, + "step": 8619 + }, + { + "epoch": 3.8311111111111114, + "grad_norm": 2.993537187576294, + "learning_rate": 4.6797153024911034e-05, + "loss": 1.0733, + "step": 8620 + }, + { + "epoch": 3.8315555555555556, + "grad_norm": 3.041719436645508, + "learning_rate": 4.677935943060498e-05, + "loss": 1.1477, + "step": 8621 + }, + { + "epoch": 3.832, + "grad_norm": 3.9076027870178223, + "learning_rate": 4.676156583629894e-05, + "loss": 1.3451, + "step": 8622 + }, + { + "epoch": 3.8324444444444445, + "grad_norm": 3.3174734115600586, + "learning_rate": 4.674377224199288e-05, + "loss": 0.8637, + "step": 8623 + }, + { + "epoch": 3.832888888888889, + "grad_norm": 3.7948451042175293, + "learning_rate": 4.672597864768683e-05, + "loss": 1.4609, + "step": 8624 + }, + { + "epoch": 3.8333333333333335, + "grad_norm": 3.765836477279663, + "learning_rate": 4.670818505338079e-05, + "loss": 1.1835, + "step": 8625 + }, + { + "epoch": 3.8337777777777777, + "grad_norm": 3.9058310985565186, + "learning_rate": 4.669039145907473e-05, + "loss": 1.5075, + "step": 8626 + }, + { + "epoch": 3.8342222222222224, + "grad_norm": 3.738953113555908, + "learning_rate": 4.667259786476869e-05, + "loss": 1.2718, + "step": 8627 + }, + { + "epoch": 3.8346666666666667, + "grad_norm": 2.5701253414154053, + "learning_rate": 4.665480427046264e-05, + "loss": 0.6246, + "step": 8628 + }, + { + "epoch": 3.835111111111111, + "grad_norm": 4.04362678527832, + "learning_rate": 4.663701067615658e-05, + "loss": 1.4515, + "step": 8629 + }, + { + "epoch": 3.8355555555555556, + "grad_norm": 1.7729761600494385, + 
"learning_rate": 4.661921708185054e-05, + "loss": 0.4256, + "step": 8630 + }, + { + "epoch": 3.836, + "grad_norm": 3.309948444366455, + "learning_rate": 4.660142348754449e-05, + "loss": 0.6741, + "step": 8631 + }, + { + "epoch": 3.8364444444444445, + "grad_norm": 3.3855819702148438, + "learning_rate": 4.658362989323844e-05, + "loss": 1.4831, + "step": 8632 + }, + { + "epoch": 3.836888888888889, + "grad_norm": 3.421257734298706, + "learning_rate": 4.656583629893239e-05, + "loss": 1.3999, + "step": 8633 + }, + { + "epoch": 3.8373333333333335, + "grad_norm": 3.1086058616638184, + "learning_rate": 4.6548042704626336e-05, + "loss": 0.8159, + "step": 8634 + }, + { + "epoch": 3.8377777777777777, + "grad_norm": 3.4166300296783447, + "learning_rate": 4.6530249110320286e-05, + "loss": 1.3853, + "step": 8635 + }, + { + "epoch": 3.838222222222222, + "grad_norm": 4.6446051597595215, + "learning_rate": 4.6512455516014236e-05, + "loss": 1.4196, + "step": 8636 + }, + { + "epoch": 3.8386666666666667, + "grad_norm": 3.4511923789978027, + "learning_rate": 4.6494661921708186e-05, + "loss": 1.4235, + "step": 8637 + }, + { + "epoch": 3.8391111111111114, + "grad_norm": 4.0160980224609375, + "learning_rate": 4.647686832740214e-05, + "loss": 1.1172, + "step": 8638 + }, + { + "epoch": 3.8395555555555556, + "grad_norm": 3.6638247966766357, + "learning_rate": 4.6459074733096085e-05, + "loss": 0.8709, + "step": 8639 + }, + { + "epoch": 3.84, + "grad_norm": 3.9915387630462646, + "learning_rate": 4.644128113879004e-05, + "loss": 1.3121, + "step": 8640 + }, + { + "epoch": 3.8404444444444445, + "grad_norm": 3.331205368041992, + "learning_rate": 4.642348754448399e-05, + "loss": 0.9015, + "step": 8641 + }, + { + "epoch": 3.840888888888889, + "grad_norm": 3.3656539916992188, + "learning_rate": 4.6405693950177934e-05, + "loss": 1.0394, + "step": 8642 + }, + { + "epoch": 3.8413333333333335, + "grad_norm": 3.8786230087280273, + "learning_rate": 4.638790035587189e-05, + "loss": 1.1873, + "step": 8643 + }, + { + "epoch": 3.8417777777777777, + "grad_norm": 4.622972011566162, + "learning_rate": 4.637010676156584e-05, + "loss": 1.1365, + "step": 8644 + }, + { + "epoch": 3.8422222222222224, + "grad_norm": 3.3173351287841797, + "learning_rate": 4.635231316725978e-05, + "loss": 1.0542, + "step": 8645 + }, + { + "epoch": 3.8426666666666667, + "grad_norm": 4.794419765472412, + "learning_rate": 4.633451957295374e-05, + "loss": 1.4618, + "step": 8646 + }, + { + "epoch": 3.843111111111111, + "grad_norm": 4.264388084411621, + "learning_rate": 4.631672597864769e-05, + "loss": 0.9792, + "step": 8647 + }, + { + "epoch": 3.8435555555555556, + "grad_norm": 5.3533501625061035, + "learning_rate": 4.629893238434164e-05, + "loss": 1.031, + "step": 8648 + }, + { + "epoch": 3.844, + "grad_norm": 3.8206562995910645, + "learning_rate": 4.628113879003559e-05, + "loss": 0.9972, + "step": 8649 + }, + { + "epoch": 3.8444444444444446, + "grad_norm": 3.9211204051971436, + "learning_rate": 4.626334519572954e-05, + "loss": 0.4228, + "step": 8650 + }, + { + "epoch": 3.844888888888889, + "grad_norm": 2.347119092941284, + "learning_rate": 4.6245551601423495e-05, + "loss": 1.5638, + "step": 8651 + }, + { + "epoch": 3.8453333333333335, + "grad_norm": 2.3549845218658447, + "learning_rate": 4.622775800711744e-05, + "loss": 1.7397, + "step": 8652 + }, + { + "epoch": 3.8457777777777777, + "grad_norm": 2.7358062267303467, + "learning_rate": 4.620996441281139e-05, + "loss": 1.5028, + "step": 8653 + }, + { + "epoch": 3.846222222222222, + "grad_norm": 3.0936498641967773, + 
"learning_rate": 4.6192170818505344e-05, + "loss": 1.7485, + "step": 8654 + }, + { + "epoch": 3.8466666666666667, + "grad_norm": 1.712854266166687, + "learning_rate": 4.617437722419929e-05, + "loss": 0.8592, + "step": 8655 + }, + { + "epoch": 3.8471111111111114, + "grad_norm": 3.0746419429779053, + "learning_rate": 4.6156583629893244e-05, + "loss": 1.7988, + "step": 8656 + }, + { + "epoch": 3.8475555555555556, + "grad_norm": 2.9364185333251953, + "learning_rate": 4.6138790035587194e-05, + "loss": 1.5508, + "step": 8657 + }, + { + "epoch": 3.848, + "grad_norm": 3.4229776859283447, + "learning_rate": 4.6120996441281137e-05, + "loss": 1.9598, + "step": 8658 + }, + { + "epoch": 3.8484444444444446, + "grad_norm": 3.2707581520080566, + "learning_rate": 4.610320284697509e-05, + "loss": 2.055, + "step": 8659 + }, + { + "epoch": 3.848888888888889, + "grad_norm": 3.055335521697998, + "learning_rate": 4.608540925266904e-05, + "loss": 1.3324, + "step": 8660 + }, + { + "epoch": 3.8493333333333335, + "grad_norm": 3.291973114013672, + "learning_rate": 4.606761565836299e-05, + "loss": 2.014, + "step": 8661 + }, + { + "epoch": 3.8497777777777777, + "grad_norm": 2.739182233810425, + "learning_rate": 4.604982206405694e-05, + "loss": 0.6788, + "step": 8662 + }, + { + "epoch": 3.8502222222222224, + "grad_norm": 2.2576968669891357, + "learning_rate": 4.603202846975089e-05, + "loss": 0.7666, + "step": 8663 + }, + { + "epoch": 3.8506666666666667, + "grad_norm": 3.3199477195739746, + "learning_rate": 4.601423487544484e-05, + "loss": 1.4767, + "step": 8664 + }, + { + "epoch": 3.851111111111111, + "grad_norm": 3.386324405670166, + "learning_rate": 4.599644128113879e-05, + "loss": 1.273, + "step": 8665 + }, + { + "epoch": 3.8515555555555556, + "grad_norm": 3.1465635299682617, + "learning_rate": 4.597864768683274e-05, + "loss": 1.5099, + "step": 8666 + }, + { + "epoch": 3.852, + "grad_norm": 3.1457812786102295, + "learning_rate": 4.59608540925267e-05, + "loss": 1.2209, + "step": 8667 + }, + { + "epoch": 3.8524444444444446, + "grad_norm": 3.257748603820801, + "learning_rate": 4.594306049822064e-05, + "loss": 1.5008, + "step": 8668 + }, + { + "epoch": 3.852888888888889, + "grad_norm": 3.149172306060791, + "learning_rate": 4.592526690391459e-05, + "loss": 1.2685, + "step": 8669 + }, + { + "epoch": 3.8533333333333335, + "grad_norm": 2.9013049602508545, + "learning_rate": 4.590747330960855e-05, + "loss": 1.0847, + "step": 8670 + }, + { + "epoch": 3.8537777777777777, + "grad_norm": 2.962352752685547, + "learning_rate": 4.588967971530249e-05, + "loss": 0.8992, + "step": 8671 + }, + { + "epoch": 3.854222222222222, + "grad_norm": 3.3367416858673096, + "learning_rate": 4.5871886120996446e-05, + "loss": 1.6202, + "step": 8672 + }, + { + "epoch": 3.8546666666666667, + "grad_norm": 3.3775758743286133, + "learning_rate": 4.5854092526690396e-05, + "loss": 1.727, + "step": 8673 + }, + { + "epoch": 3.8551111111111114, + "grad_norm": 1.9687055349349976, + "learning_rate": 4.583629893238434e-05, + "loss": 0.7593, + "step": 8674 + }, + { + "epoch": 3.8555555555555556, + "grad_norm": 3.5084187984466553, + "learning_rate": 4.5818505338078295e-05, + "loss": 1.3926, + "step": 8675 + }, + { + "epoch": 3.856, + "grad_norm": 3.405186176300049, + "learning_rate": 4.5800711743772245e-05, + "loss": 1.5413, + "step": 8676 + }, + { + "epoch": 3.8564444444444446, + "grad_norm": 3.67044734954834, + "learning_rate": 4.5782918149466195e-05, + "loss": 1.5604, + "step": 8677 + }, + { + "epoch": 3.856888888888889, + "grad_norm": 3.343731641769409, + 
"learning_rate": 4.5765124555160144e-05, + "loss": 1.4601, + "step": 8678 + }, + { + "epoch": 3.857333333333333, + "grad_norm": 3.770385265350342, + "learning_rate": 4.5747330960854094e-05, + "loss": 1.3078, + "step": 8679 + }, + { + "epoch": 3.8577777777777778, + "grad_norm": 3.8169827461242676, + "learning_rate": 4.572953736654805e-05, + "loss": 1.362, + "step": 8680 + }, + { + "epoch": 3.8582222222222224, + "grad_norm": 3.3232243061065674, + "learning_rate": 4.5711743772241994e-05, + "loss": 1.4338, + "step": 8681 + }, + { + "epoch": 3.8586666666666667, + "grad_norm": 4.413866996765137, + "learning_rate": 4.569395017793594e-05, + "loss": 1.5802, + "step": 8682 + }, + { + "epoch": 3.859111111111111, + "grad_norm": 3.514523983001709, + "learning_rate": 4.56761565836299e-05, + "loss": 1.5358, + "step": 8683 + }, + { + "epoch": 3.8595555555555556, + "grad_norm": 3.5461461544036865, + "learning_rate": 4.565836298932384e-05, + "loss": 1.3908, + "step": 8684 + }, + { + "epoch": 3.86, + "grad_norm": 3.1903622150421143, + "learning_rate": 4.56405693950178e-05, + "loss": 1.0759, + "step": 8685 + }, + { + "epoch": 3.8604444444444446, + "grad_norm": 3.541867971420288, + "learning_rate": 4.562277580071175e-05, + "loss": 1.4128, + "step": 8686 + }, + { + "epoch": 3.860888888888889, + "grad_norm": 3.281956434249878, + "learning_rate": 4.560498220640569e-05, + "loss": 1.2347, + "step": 8687 + }, + { + "epoch": 3.8613333333333335, + "grad_norm": 3.273998498916626, + "learning_rate": 4.558718861209965e-05, + "loss": 0.9553, + "step": 8688 + }, + { + "epoch": 3.8617777777777778, + "grad_norm": 3.5180327892303467, + "learning_rate": 4.55693950177936e-05, + "loss": 1.2825, + "step": 8689 + }, + { + "epoch": 3.862222222222222, + "grad_norm": 3.3110413551330566, + "learning_rate": 4.555160142348754e-05, + "loss": 1.1399, + "step": 8690 + }, + { + "epoch": 3.8626666666666667, + "grad_norm": 3.2750625610351562, + "learning_rate": 4.55338078291815e-05, + "loss": 1.181, + "step": 8691 + }, + { + "epoch": 3.8631111111111114, + "grad_norm": 3.3777711391448975, + "learning_rate": 4.551601423487545e-05, + "loss": 0.9839, + "step": 8692 + }, + { + "epoch": 3.8635555555555556, + "grad_norm": 4.099118232727051, + "learning_rate": 4.54982206405694e-05, + "loss": 1.184, + "step": 8693 + }, + { + "epoch": 3.864, + "grad_norm": 3.8149666786193848, + "learning_rate": 4.548042704626335e-05, + "loss": 1.387, + "step": 8694 + }, + { + "epoch": 3.8644444444444446, + "grad_norm": 3.8572885990142822, + "learning_rate": 4.5462633451957297e-05, + "loss": 1.1251, + "step": 8695 + }, + { + "epoch": 3.864888888888889, + "grad_norm": 3.872878313064575, + "learning_rate": 4.544483985765125e-05, + "loss": 1.2452, + "step": 8696 + }, + { + "epoch": 3.865333333333333, + "grad_norm": 4.542336940765381, + "learning_rate": 4.5427046263345196e-05, + "loss": 1.1995, + "step": 8697 + }, + { + "epoch": 3.8657777777777778, + "grad_norm": 4.309566974639893, + "learning_rate": 4.5409252669039146e-05, + "loss": 1.4818, + "step": 8698 + }, + { + "epoch": 3.8662222222222224, + "grad_norm": 4.477492332458496, + "learning_rate": 4.53914590747331e-05, + "loss": 1.4093, + "step": 8699 + }, + { + "epoch": 3.8666666666666667, + "grad_norm": 3.625142812728882, + "learning_rate": 4.5373665480427045e-05, + "loss": 0.8632, + "step": 8700 + }, + { + "epoch": 3.867111111111111, + "grad_norm": 1.706745982170105, + "learning_rate": 4.5355871886121e-05, + "loss": 1.0516, + "step": 8701 + }, + { + "epoch": 3.8675555555555556, + "grad_norm": 2.5270049571990967, + 
"learning_rate": 4.533807829181495e-05, + "loss": 1.6525, + "step": 8702 + }, + { + "epoch": 3.868, + "grad_norm": 1.587844729423523, + "learning_rate": 4.5320284697508894e-05, + "loss": 0.7329, + "step": 8703 + }, + { + "epoch": 3.8684444444444446, + "grad_norm": 3.1046266555786133, + "learning_rate": 4.530249110320285e-05, + "loss": 1.6955, + "step": 8704 + }, + { + "epoch": 3.868888888888889, + "grad_norm": 3.3074681758880615, + "learning_rate": 4.52846975088968e-05, + "loss": 1.6316, + "step": 8705 + }, + { + "epoch": 3.8693333333333335, + "grad_norm": 3.674323320388794, + "learning_rate": 4.526690391459075e-05, + "loss": 2.3296, + "step": 8706 + }, + { + "epoch": 3.8697777777777778, + "grad_norm": 3.3690025806427, + "learning_rate": 4.52491103202847e-05, + "loss": 1.4658, + "step": 8707 + }, + { + "epoch": 3.870222222222222, + "grad_norm": 3.0607759952545166, + "learning_rate": 4.523131672597865e-05, + "loss": 1.8122, + "step": 8708 + }, + { + "epoch": 3.8706666666666667, + "grad_norm": 3.5003387928009033, + "learning_rate": 4.52135231316726e-05, + "loss": 1.5672, + "step": 8709 + }, + { + "epoch": 3.871111111111111, + "grad_norm": 3.6459617614746094, + "learning_rate": 4.519572953736655e-05, + "loss": 1.6405, + "step": 8710 + }, + { + "epoch": 3.8715555555555556, + "grad_norm": 3.704782247543335, + "learning_rate": 4.51779359430605e-05, + "loss": 1.7152, + "step": 8711 + }, + { + "epoch": 3.872, + "grad_norm": 3.561216115951538, + "learning_rate": 4.5160142348754455e-05, + "loss": 1.2059, + "step": 8712 + }, + { + "epoch": 3.8724444444444446, + "grad_norm": 3.5210413932800293, + "learning_rate": 4.51423487544484e-05, + "loss": 1.4903, + "step": 8713 + }, + { + "epoch": 3.872888888888889, + "grad_norm": 3.023265838623047, + "learning_rate": 4.512455516014235e-05, + "loss": 1.3212, + "step": 8714 + }, + { + "epoch": 3.873333333333333, + "grad_norm": 3.663731098175049, + "learning_rate": 4.5106761565836305e-05, + "loss": 1.3391, + "step": 8715 + }, + { + "epoch": 3.8737777777777778, + "grad_norm": 3.64522385597229, + "learning_rate": 4.508896797153025e-05, + "loss": 0.937, + "step": 8716 + }, + { + "epoch": 3.8742222222222225, + "grad_norm": 3.8014652729034424, + "learning_rate": 4.5071174377224204e-05, + "loss": 1.5124, + "step": 8717 + }, + { + "epoch": 3.8746666666666667, + "grad_norm": 2.626232624053955, + "learning_rate": 4.5053380782918154e-05, + "loss": 0.9118, + "step": 8718 + }, + { + "epoch": 3.875111111111111, + "grad_norm": 3.8472952842712402, + "learning_rate": 4.50355871886121e-05, + "loss": 1.2868, + "step": 8719 + }, + { + "epoch": 3.8755555555555556, + "grad_norm": 3.5172226428985596, + "learning_rate": 4.501779359430605e-05, + "loss": 1.4457, + "step": 8720 + }, + { + "epoch": 3.876, + "grad_norm": 2.7889018058776855, + "learning_rate": 4.5e-05, + "loss": 1.2991, + "step": 8721 + }, + { + "epoch": 3.8764444444444446, + "grad_norm": 3.4194562435150146, + "learning_rate": 4.498220640569395e-05, + "loss": 1.5408, + "step": 8722 + }, + { + "epoch": 3.876888888888889, + "grad_norm": 3.4279887676239014, + "learning_rate": 4.49644128113879e-05, + "loss": 1.0715, + "step": 8723 + }, + { + "epoch": 3.8773333333333335, + "grad_norm": 3.7102408409118652, + "learning_rate": 4.494661921708185e-05, + "loss": 1.19, + "step": 8724 + }, + { + "epoch": 3.8777777777777778, + "grad_norm": 4.325709342956543, + "learning_rate": 4.492882562277581e-05, + "loss": 0.7462, + "step": 8725 + }, + { + "epoch": 3.878222222222222, + "grad_norm": 3.521773099899292, + "learning_rate": 
4.491103202846975e-05, + "loss": 0.9747, + "step": 8726 + }, + { + "epoch": 3.8786666666666667, + "grad_norm": 3.7296512126922607, + "learning_rate": 4.48932384341637e-05, + "loss": 1.3466, + "step": 8727 + }, + { + "epoch": 3.879111111111111, + "grad_norm": 2.833895444869995, + "learning_rate": 4.487544483985766e-05, + "loss": 1.0873, + "step": 8728 + }, + { + "epoch": 3.8795555555555556, + "grad_norm": 3.184312105178833, + "learning_rate": 4.48576512455516e-05, + "loss": 1.184, + "step": 8729 + }, + { + "epoch": 3.88, + "grad_norm": 2.265151023864746, + "learning_rate": 4.483985765124555e-05, + "loss": 0.5063, + "step": 8730 + }, + { + "epoch": 3.8804444444444446, + "grad_norm": 3.3480918407440186, + "learning_rate": 4.482206405693951e-05, + "loss": 1.0596, + "step": 8731 + }, + { + "epoch": 3.880888888888889, + "grad_norm": 4.041034698486328, + "learning_rate": 4.480427046263345e-05, + "loss": 1.4421, + "step": 8732 + }, + { + "epoch": 3.881333333333333, + "grad_norm": 2.776762008666992, + "learning_rate": 4.4786476868327406e-05, + "loss": 0.9752, + "step": 8733 + }, + { + "epoch": 3.8817777777777778, + "grad_norm": 3.5926032066345215, + "learning_rate": 4.4768683274021356e-05, + "loss": 1.1643, + "step": 8734 + }, + { + "epoch": 3.8822222222222225, + "grad_norm": 3.4924111366271973, + "learning_rate": 4.47508896797153e-05, + "loss": 1.1958, + "step": 8735 + }, + { + "epoch": 3.8826666666666667, + "grad_norm": 3.821939468383789, + "learning_rate": 4.4733096085409255e-05, + "loss": 1.3903, + "step": 8736 + }, + { + "epoch": 3.883111111111111, + "grad_norm": 2.8820078372955322, + "learning_rate": 4.4715302491103205e-05, + "loss": 0.5837, + "step": 8737 + }, + { + "epoch": 3.8835555555555556, + "grad_norm": 4.466034889221191, + "learning_rate": 4.4697508896797155e-05, + "loss": 1.4463, + "step": 8738 + }, + { + "epoch": 3.884, + "grad_norm": 3.6252734661102295, + "learning_rate": 4.4679715302491105e-05, + "loss": 1.1598, + "step": 8739 + }, + { + "epoch": 3.8844444444444446, + "grad_norm": 3.406493902206421, + "learning_rate": 4.4661921708185054e-05, + "loss": 0.9051, + "step": 8740 + }, + { + "epoch": 3.884888888888889, + "grad_norm": 3.9739794731140137, + "learning_rate": 4.464412811387901e-05, + "loss": 1.7437, + "step": 8741 + }, + { + "epoch": 3.8853333333333335, + "grad_norm": 4.158414840698242, + "learning_rate": 4.4626334519572954e-05, + "loss": 1.689, + "step": 8742 + }, + { + "epoch": 3.8857777777777778, + "grad_norm": 4.537271022796631, + "learning_rate": 4.4608540925266903e-05, + "loss": 1.1898, + "step": 8743 + }, + { + "epoch": 3.886222222222222, + "grad_norm": 2.962108612060547, + "learning_rate": 4.459074733096086e-05, + "loss": 1.0265, + "step": 8744 + }, + { + "epoch": 3.8866666666666667, + "grad_norm": 3.3965911865234375, + "learning_rate": 4.45729537366548e-05, + "loss": 1.1665, + "step": 8745 + }, + { + "epoch": 3.887111111111111, + "grad_norm": 4.494457244873047, + "learning_rate": 4.455516014234876e-05, + "loss": 1.4225, + "step": 8746 + }, + { + "epoch": 3.8875555555555557, + "grad_norm": 4.060354709625244, + "learning_rate": 4.453736654804271e-05, + "loss": 1.1413, + "step": 8747 + }, + { + "epoch": 3.888, + "grad_norm": 4.178462982177734, + "learning_rate": 4.451957295373665e-05, + "loss": 1.4544, + "step": 8748 + }, + { + "epoch": 3.8884444444444446, + "grad_norm": 3.7196671962738037, + "learning_rate": 4.450177935943061e-05, + "loss": 1.0486, + "step": 8749 + }, + { + "epoch": 3.888888888888889, + "grad_norm": 5.249838352203369, + "learning_rate": 
4.448398576512456e-05, + "loss": 1.2359, + "step": 8750 + }, + { + "epoch": 3.889333333333333, + "grad_norm": 2.497725248336792, + "learning_rate": 4.446619217081851e-05, + "loss": 1.4867, + "step": 8751 + }, + { + "epoch": 3.889777777777778, + "grad_norm": 2.9607138633728027, + "learning_rate": 4.444839857651246e-05, + "loss": 1.9622, + "step": 8752 + }, + { + "epoch": 3.8902222222222225, + "grad_norm": 2.8709583282470703, + "learning_rate": 4.443060498220641e-05, + "loss": 1.7152, + "step": 8753 + }, + { + "epoch": 3.8906666666666667, + "grad_norm": 2.264155387878418, + "learning_rate": 4.441281138790036e-05, + "loss": 1.1801, + "step": 8754 + }, + { + "epoch": 3.891111111111111, + "grad_norm": 2.909691333770752, + "learning_rate": 4.439501779359431e-05, + "loss": 1.4905, + "step": 8755 + }, + { + "epoch": 3.8915555555555557, + "grad_norm": 2.7087597846984863, + "learning_rate": 4.437722419928826e-05, + "loss": 1.0053, + "step": 8756 + }, + { + "epoch": 3.892, + "grad_norm": 3.0195884704589844, + "learning_rate": 4.435943060498221e-05, + "loss": 1.4873, + "step": 8757 + }, + { + "epoch": 3.8924444444444446, + "grad_norm": 1.9923820495605469, + "learning_rate": 4.4341637010676156e-05, + "loss": 0.7845, + "step": 8758 + }, + { + "epoch": 3.892888888888889, + "grad_norm": 3.6413817405700684, + "learning_rate": 4.4323843416370106e-05, + "loss": 1.5665, + "step": 8759 + }, + { + "epoch": 3.8933333333333335, + "grad_norm": 3.3323585987091064, + "learning_rate": 4.430604982206406e-05, + "loss": 1.3828, + "step": 8760 + }, + { + "epoch": 3.893777777777778, + "grad_norm": 3.166923999786377, + "learning_rate": 4.4288256227758005e-05, + "loss": 1.5736, + "step": 8761 + }, + { + "epoch": 3.894222222222222, + "grad_norm": 3.236964464187622, + "learning_rate": 4.427046263345196e-05, + "loss": 1.5058, + "step": 8762 + }, + { + "epoch": 3.8946666666666667, + "grad_norm": 3.7442686557769775, + "learning_rate": 4.425266903914591e-05, + "loss": 1.1915, + "step": 8763 + }, + { + "epoch": 3.895111111111111, + "grad_norm": 3.836761236190796, + "learning_rate": 4.4234875444839854e-05, + "loss": 1.8305, + "step": 8764 + }, + { + "epoch": 3.8955555555555557, + "grad_norm": 3.648864984512329, + "learning_rate": 4.421708185053381e-05, + "loss": 1.4003, + "step": 8765 + }, + { + "epoch": 3.896, + "grad_norm": 3.3367178440093994, + "learning_rate": 4.419928825622776e-05, + "loss": 1.5771, + "step": 8766 + }, + { + "epoch": 3.8964444444444446, + "grad_norm": 4.111900806427002, + "learning_rate": 4.418149466192171e-05, + "loss": 1.5626, + "step": 8767 + }, + { + "epoch": 3.896888888888889, + "grad_norm": 3.5727264881134033, + "learning_rate": 4.416370106761566e-05, + "loss": 1.5124, + "step": 8768 + }, + { + "epoch": 3.897333333333333, + "grad_norm": 3.073716640472412, + "learning_rate": 4.414590747330961e-05, + "loss": 1.3135, + "step": 8769 + }, + { + "epoch": 3.897777777777778, + "grad_norm": 3.225069046020508, + "learning_rate": 4.4128113879003566e-05, + "loss": 1.4713, + "step": 8770 + }, + { + "epoch": 3.8982222222222225, + "grad_norm": 3.486926555633545, + "learning_rate": 4.411032028469751e-05, + "loss": 1.2325, + "step": 8771 + }, + { + "epoch": 3.8986666666666667, + "grad_norm": 3.7756948471069336, + "learning_rate": 4.409252669039146e-05, + "loss": 1.4494, + "step": 8772 + }, + { + "epoch": 3.899111111111111, + "grad_norm": 3.8202269077301025, + "learning_rate": 4.4074733096085415e-05, + "loss": 1.3107, + "step": 8773 + }, + { + "epoch": 3.8995555555555557, + "grad_norm": 3.3835747241973877, + 
"learning_rate": 4.405693950177936e-05, + "loss": 1.5086, + "step": 8774 + }, + { + "epoch": 3.9, + "grad_norm": 3.0347068309783936, + "learning_rate": 4.403914590747331e-05, + "loss": 1.3952, + "step": 8775 + }, + { + "epoch": 3.9004444444444446, + "grad_norm": 3.6462948322296143, + "learning_rate": 4.4021352313167265e-05, + "loss": 1.0433, + "step": 8776 + }, + { + "epoch": 3.900888888888889, + "grad_norm": 4.498116970062256, + "learning_rate": 4.400355871886121e-05, + "loss": 1.5242, + "step": 8777 + }, + { + "epoch": 3.9013333333333335, + "grad_norm": 3.007981061935425, + "learning_rate": 4.3985765124555164e-05, + "loss": 1.2649, + "step": 8778 + }, + { + "epoch": 3.901777777777778, + "grad_norm": 4.1256208419799805, + "learning_rate": 4.3967971530249114e-05, + "loss": 0.9533, + "step": 8779 + }, + { + "epoch": 3.902222222222222, + "grad_norm": 3.121368169784546, + "learning_rate": 4.395017793594306e-05, + "loss": 1.1291, + "step": 8780 + }, + { + "epoch": 3.9026666666666667, + "grad_norm": 2.9976303577423096, + "learning_rate": 4.393238434163701e-05, + "loss": 1.0546, + "step": 8781 + }, + { + "epoch": 3.903111111111111, + "grad_norm": 3.8373613357543945, + "learning_rate": 4.391459074733096e-05, + "loss": 1.2271, + "step": 8782 + }, + { + "epoch": 3.9035555555555557, + "grad_norm": 3.4057092666625977, + "learning_rate": 4.389679715302491e-05, + "loss": 1.0357, + "step": 8783 + }, + { + "epoch": 3.904, + "grad_norm": 3.2696046829223633, + "learning_rate": 4.387900355871886e-05, + "loss": 1.0851, + "step": 8784 + }, + { + "epoch": 3.9044444444444446, + "grad_norm": 3.9882287979125977, + "learning_rate": 4.386120996441281e-05, + "loss": 0.895, + "step": 8785 + }, + { + "epoch": 3.904888888888889, + "grad_norm": 2.950498104095459, + "learning_rate": 4.384341637010677e-05, + "loss": 1.1437, + "step": 8786 + }, + { + "epoch": 3.905333333333333, + "grad_norm": 3.5203466415405273, + "learning_rate": 4.382562277580071e-05, + "loss": 1.3823, + "step": 8787 + }, + { + "epoch": 3.905777777777778, + "grad_norm": 2.380671262741089, + "learning_rate": 4.380782918149466e-05, + "loss": 0.7028, + "step": 8788 + }, + { + "epoch": 3.9062222222222225, + "grad_norm": 3.788663625717163, + "learning_rate": 4.379003558718862e-05, + "loss": 1.3234, + "step": 8789 + }, + { + "epoch": 3.9066666666666667, + "grad_norm": 2.707402467727661, + "learning_rate": 4.377224199288256e-05, + "loss": 0.5875, + "step": 8790 + }, + { + "epoch": 3.907111111111111, + "grad_norm": 3.8801896572113037, + "learning_rate": 4.375444839857652e-05, + "loss": 1.3117, + "step": 8791 + }, + { + "epoch": 3.9075555555555557, + "grad_norm": 3.4378976821899414, + "learning_rate": 4.373665480427047e-05, + "loss": 1.6461, + "step": 8792 + }, + { + "epoch": 3.908, + "grad_norm": 4.15399169921875, + "learning_rate": 4.371886120996441e-05, + "loss": 1.313, + "step": 8793 + }, + { + "epoch": 3.9084444444444446, + "grad_norm": 5.039565086364746, + "learning_rate": 4.3701067615658366e-05, + "loss": 1.3966, + "step": 8794 + }, + { + "epoch": 3.908888888888889, + "grad_norm": 4.0032854080200195, + "learning_rate": 4.3683274021352316e-05, + "loss": 1.4609, + "step": 8795 + }, + { + "epoch": 3.9093333333333335, + "grad_norm": 3.7062151432037354, + "learning_rate": 4.3665480427046266e-05, + "loss": 1.0876, + "step": 8796 + }, + { + "epoch": 3.909777777777778, + "grad_norm": 4.596202373504639, + "learning_rate": 4.3647686832740216e-05, + "loss": 1.5062, + "step": 8797 + }, + { + "epoch": 3.910222222222222, + "grad_norm": 4.020972728729248, + 
"learning_rate": 4.3629893238434165e-05, + "loss": 1.3268, + "step": 8798 + }, + { + "epoch": 3.9106666666666667, + "grad_norm": 4.46404504776001, + "learning_rate": 4.3612099644128115e-05, + "loss": 0.916, + "step": 8799 + }, + { + "epoch": 3.911111111111111, + "grad_norm": 3.5452182292938232, + "learning_rate": 4.3594306049822065e-05, + "loss": 0.8356, + "step": 8800 + }, + { + "epoch": 3.9115555555555557, + "grad_norm": 2.5292141437530518, + "learning_rate": 4.3576512455516014e-05, + "loss": 2.0216, + "step": 8801 + }, + { + "epoch": 3.912, + "grad_norm": 2.2973062992095947, + "learning_rate": 4.355871886120997e-05, + "loss": 0.8748, + "step": 8802 + }, + { + "epoch": 3.9124444444444446, + "grad_norm": 2.9960880279541016, + "learning_rate": 4.3540925266903914e-05, + "loss": 1.6939, + "step": 8803 + }, + { + "epoch": 3.912888888888889, + "grad_norm": 2.8162448406219482, + "learning_rate": 4.3523131672597864e-05, + "loss": 1.1804, + "step": 8804 + }, + { + "epoch": 3.913333333333333, + "grad_norm": 3.2202110290527344, + "learning_rate": 4.350533807829182e-05, + "loss": 1.2968, + "step": 8805 + }, + { + "epoch": 3.913777777777778, + "grad_norm": 3.30061936378479, + "learning_rate": 4.348754448398576e-05, + "loss": 1.4436, + "step": 8806 + }, + { + "epoch": 3.9142222222222225, + "grad_norm": 3.3244714736938477, + "learning_rate": 4.346975088967972e-05, + "loss": 1.6709, + "step": 8807 + }, + { + "epoch": 3.9146666666666667, + "grad_norm": 3.8059160709381104, + "learning_rate": 4.345195729537367e-05, + "loss": 1.5652, + "step": 8808 + }, + { + "epoch": 3.915111111111111, + "grad_norm": 4.244156360626221, + "learning_rate": 4.343416370106761e-05, + "loss": 1.6886, + "step": 8809 + }, + { + "epoch": 3.9155555555555557, + "grad_norm": 3.246370553970337, + "learning_rate": 4.341637010676157e-05, + "loss": 1.2196, + "step": 8810 + }, + { + "epoch": 3.916, + "grad_norm": 3.3758699893951416, + "learning_rate": 4.339857651245552e-05, + "loss": 1.3343, + "step": 8811 + }, + { + "epoch": 3.916444444444444, + "grad_norm": 3.9745421409606934, + "learning_rate": 4.338078291814947e-05, + "loss": 1.5243, + "step": 8812 + }, + { + "epoch": 3.916888888888889, + "grad_norm": 2.7803597450256348, + "learning_rate": 4.336298932384342e-05, + "loss": 0.7635, + "step": 8813 + }, + { + "epoch": 3.9173333333333336, + "grad_norm": 2.751984119415283, + "learning_rate": 4.334519572953737e-05, + "loss": 1.0825, + "step": 8814 + }, + { + "epoch": 3.917777777777778, + "grad_norm": 3.5463430881500244, + "learning_rate": 4.3327402135231324e-05, + "loss": 0.7509, + "step": 8815 + }, + { + "epoch": 3.918222222222222, + "grad_norm": 2.5409305095672607, + "learning_rate": 4.330960854092527e-05, + "loss": 0.7807, + "step": 8816 + }, + { + "epoch": 3.9186666666666667, + "grad_norm": 3.2667789459228516, + "learning_rate": 4.329181494661922e-05, + "loss": 1.3171, + "step": 8817 + }, + { + "epoch": 3.919111111111111, + "grad_norm": 2.9117629528045654, + "learning_rate": 4.327402135231317e-05, + "loss": 1.1148, + "step": 8818 + }, + { + "epoch": 3.9195555555555557, + "grad_norm": 3.8055434226989746, + "learning_rate": 4.3256227758007116e-05, + "loss": 1.6964, + "step": 8819 + }, + { + "epoch": 3.92, + "grad_norm": 3.8291947841644287, + "learning_rate": 4.3238434163701066e-05, + "loss": 1.5032, + "step": 8820 + }, + { + "epoch": 3.9204444444444446, + "grad_norm": 3.757000207901001, + "learning_rate": 4.322064056939502e-05, + "loss": 1.4395, + "step": 8821 + }, + { + "epoch": 3.920888888888889, + "grad_norm": 3.474553108215332, + 
"learning_rate": 4.3202846975088965e-05, + "loss": 1.2536, + "step": 8822 + }, + { + "epoch": 3.921333333333333, + "grad_norm": 3.879840612411499, + "learning_rate": 4.318505338078292e-05, + "loss": 1.3524, + "step": 8823 + }, + { + "epoch": 3.921777777777778, + "grad_norm": 3.432889699935913, + "learning_rate": 4.316725978647687e-05, + "loss": 1.5825, + "step": 8824 + }, + { + "epoch": 3.9222222222222225, + "grad_norm": 3.799661159515381, + "learning_rate": 4.314946619217082e-05, + "loss": 1.4398, + "step": 8825 + }, + { + "epoch": 3.9226666666666667, + "grad_norm": 3.7226650714874268, + "learning_rate": 4.313167259786477e-05, + "loss": 1.1822, + "step": 8826 + }, + { + "epoch": 3.923111111111111, + "grad_norm": 3.7417545318603516, + "learning_rate": 4.311387900355872e-05, + "loss": 1.4244, + "step": 8827 + }, + { + "epoch": 3.9235555555555557, + "grad_norm": 3.5519144535064697, + "learning_rate": 4.309608540925267e-05, + "loss": 1.4199, + "step": 8828 + }, + { + "epoch": 3.924, + "grad_norm": 3.4887189865112305, + "learning_rate": 4.307829181494662e-05, + "loss": 1.4381, + "step": 8829 + }, + { + "epoch": 3.924444444444444, + "grad_norm": 3.223085880279541, + "learning_rate": 4.306049822064057e-05, + "loss": 1.0015, + "step": 8830 + }, + { + "epoch": 3.924888888888889, + "grad_norm": 3.3802876472473145, + "learning_rate": 4.3042704626334526e-05, + "loss": 1.0103, + "step": 8831 + }, + { + "epoch": 3.9253333333333336, + "grad_norm": 3.1447699069976807, + "learning_rate": 4.302491103202847e-05, + "loss": 1.2326, + "step": 8832 + }, + { + "epoch": 3.925777777777778, + "grad_norm": 3.206096887588501, + "learning_rate": 4.300711743772242e-05, + "loss": 0.9133, + "step": 8833 + }, + { + "epoch": 3.926222222222222, + "grad_norm": 3.67698335647583, + "learning_rate": 4.2989323843416376e-05, + "loss": 1.1838, + "step": 8834 + }, + { + "epoch": 3.9266666666666667, + "grad_norm": 3.849926233291626, + "learning_rate": 4.297153024911032e-05, + "loss": 1.1519, + "step": 8835 + }, + { + "epoch": 3.927111111111111, + "grad_norm": 3.2638113498687744, + "learning_rate": 4.2953736654804275e-05, + "loss": 1.0861, + "step": 8836 + }, + { + "epoch": 3.9275555555555557, + "grad_norm": 2.141021728515625, + "learning_rate": 4.2935943060498225e-05, + "loss": 0.3943, + "step": 8837 + }, + { + "epoch": 3.928, + "grad_norm": 2.9865190982818604, + "learning_rate": 4.291814946619217e-05, + "loss": 0.9882, + "step": 8838 + }, + { + "epoch": 3.9284444444444446, + "grad_norm": 3.7992167472839355, + "learning_rate": 4.2900355871886124e-05, + "loss": 1.3604, + "step": 8839 + }, + { + "epoch": 3.928888888888889, + "grad_norm": 5.145820617675781, + "learning_rate": 4.2882562277580074e-05, + "loss": 1.3215, + "step": 8840 + }, + { + "epoch": 3.929333333333333, + "grad_norm": 3.7761855125427246, + "learning_rate": 4.2864768683274024e-05, + "loss": 1.3284, + "step": 8841 + }, + { + "epoch": 3.929777777777778, + "grad_norm": 4.058622360229492, + "learning_rate": 4.284697508896797e-05, + "loss": 1.3971, + "step": 8842 + }, + { + "epoch": 3.930222222222222, + "grad_norm": 4.228296279907227, + "learning_rate": 4.282918149466192e-05, + "loss": 1.4811, + "step": 8843 + }, + { + "epoch": 3.9306666666666668, + "grad_norm": 4.4582085609436035, + "learning_rate": 4.281138790035587e-05, + "loss": 1.2761, + "step": 8844 + }, + { + "epoch": 3.931111111111111, + "grad_norm": 3.634289026260376, + "learning_rate": 4.279359430604982e-05, + "loss": 0.9966, + "step": 8845 + }, + { + "epoch": 3.9315555555555557, + "grad_norm": 5.011377334594727, + 
"learning_rate": 4.277580071174377e-05, + "loss": 1.1816, + "step": 8846 + }, + { + "epoch": 3.932, + "grad_norm": 4.198838233947754, + "learning_rate": 4.275800711743773e-05, + "loss": 1.4124, + "step": 8847 + }, + { + "epoch": 3.932444444444444, + "grad_norm": 4.471510887145996, + "learning_rate": 4.274021352313167e-05, + "loss": 1.0095, + "step": 8848 + }, + { + "epoch": 3.932888888888889, + "grad_norm": 3.8801698684692383, + "learning_rate": 4.272241992882562e-05, + "loss": 0.4851, + "step": 8849 + }, + { + "epoch": 3.9333333333333336, + "grad_norm": 3.203237533569336, + "learning_rate": 4.270462633451958e-05, + "loss": 0.411, + "step": 8850 + }, + { + "epoch": 3.933777777777778, + "grad_norm": 2.3682878017425537, + "learning_rate": 4.268683274021352e-05, + "loss": 2.0329, + "step": 8851 + }, + { + "epoch": 3.934222222222222, + "grad_norm": 2.3607728481292725, + "learning_rate": 4.266903914590748e-05, + "loss": 1.0491, + "step": 8852 + }, + { + "epoch": 3.9346666666666668, + "grad_norm": 3.4332642555236816, + "learning_rate": 4.265124555160143e-05, + "loss": 1.532, + "step": 8853 + }, + { + "epoch": 3.935111111111111, + "grad_norm": 3.4008309841156006, + "learning_rate": 4.263345195729538e-05, + "loss": 1.6618, + "step": 8854 + }, + { + "epoch": 3.9355555555555557, + "grad_norm": 3.161529779434204, + "learning_rate": 4.2615658362989326e-05, + "loss": 1.4769, + "step": 8855 + }, + { + "epoch": 3.936, + "grad_norm": 3.2563178539276123, + "learning_rate": 4.2597864768683276e-05, + "loss": 1.8546, + "step": 8856 + }, + { + "epoch": 3.9364444444444446, + "grad_norm": 3.0360910892486572, + "learning_rate": 4.2580071174377226e-05, + "loss": 1.0507, + "step": 8857 + }, + { + "epoch": 3.936888888888889, + "grad_norm": 3.976771116256714, + "learning_rate": 4.2562277580071176e-05, + "loss": 1.6975, + "step": 8858 + }, + { + "epoch": 3.937333333333333, + "grad_norm": 3.122298002243042, + "learning_rate": 4.2544483985765125e-05, + "loss": 1.4874, + "step": 8859 + }, + { + "epoch": 3.937777777777778, + "grad_norm": 3.1202025413513184, + "learning_rate": 4.252669039145908e-05, + "loss": 1.4433, + "step": 8860 + }, + { + "epoch": 3.938222222222222, + "grad_norm": 4.454637050628662, + "learning_rate": 4.2508896797153025e-05, + "loss": 1.6434, + "step": 8861 + }, + { + "epoch": 3.9386666666666668, + "grad_norm": 3.968618392944336, + "learning_rate": 4.2491103202846975e-05, + "loss": 1.365, + "step": 8862 + }, + { + "epoch": 3.939111111111111, + "grad_norm": 3.396158456802368, + "learning_rate": 4.247330960854093e-05, + "loss": 1.464, + "step": 8863 + }, + { + "epoch": 3.9395555555555557, + "grad_norm": 2.9130702018737793, + "learning_rate": 4.2455516014234874e-05, + "loss": 0.9835, + "step": 8864 + }, + { + "epoch": 3.94, + "grad_norm": 3.9278974533081055, + "learning_rate": 4.2437722419928824e-05, + "loss": 1.585, + "step": 8865 + }, + { + "epoch": 3.940444444444444, + "grad_norm": 4.043193817138672, + "learning_rate": 4.241992882562278e-05, + "loss": 1.4772, + "step": 8866 + }, + { + "epoch": 3.940888888888889, + "grad_norm": 3.436844825744629, + "learning_rate": 4.240213523131672e-05, + "loss": 1.0931, + "step": 8867 + }, + { + "epoch": 3.9413333333333336, + "grad_norm": 3.061730146408081, + "learning_rate": 4.238434163701068e-05, + "loss": 1.3839, + "step": 8868 + }, + { + "epoch": 3.941777777777778, + "grad_norm": 3.5630033016204834, + "learning_rate": 4.236654804270463e-05, + "loss": 1.4972, + "step": 8869 + }, + { + "epoch": 3.942222222222222, + "grad_norm": 5.172977924346924, + "learning_rate": 
4.234875444839858e-05, + "loss": 1.0713, + "step": 8870 + }, + { + "epoch": 3.9426666666666668, + "grad_norm": 3.7209513187408447, + "learning_rate": 4.233096085409253e-05, + "loss": 1.5315, + "step": 8871 + }, + { + "epoch": 3.943111111111111, + "grad_norm": 3.7008655071258545, + "learning_rate": 4.231316725978648e-05, + "loss": 1.2798, + "step": 8872 + }, + { + "epoch": 3.9435555555555557, + "grad_norm": 3.4888625144958496, + "learning_rate": 4.229537366548043e-05, + "loss": 1.139, + "step": 8873 + }, + { + "epoch": 3.944, + "grad_norm": 3.5707790851593018, + "learning_rate": 4.227758007117438e-05, + "loss": 1.2187, + "step": 8874 + }, + { + "epoch": 3.9444444444444446, + "grad_norm": 3.2470076084136963, + "learning_rate": 4.225978647686833e-05, + "loss": 1.5406, + "step": 8875 + }, + { + "epoch": 3.944888888888889, + "grad_norm": 3.3586134910583496, + "learning_rate": 4.2241992882562284e-05, + "loss": 1.3913, + "step": 8876 + }, + { + "epoch": 3.945333333333333, + "grad_norm": 4.539661407470703, + "learning_rate": 4.222419928825623e-05, + "loss": 1.8586, + "step": 8877 + }, + { + "epoch": 3.945777777777778, + "grad_norm": 3.6496472358703613, + "learning_rate": 4.220640569395018e-05, + "loss": 1.5811, + "step": 8878 + }, + { + "epoch": 3.946222222222222, + "grad_norm": 2.9896080493927, + "learning_rate": 4.218861209964413e-05, + "loss": 1.1142, + "step": 8879 + }, + { + "epoch": 3.9466666666666668, + "grad_norm": 4.495405673980713, + "learning_rate": 4.2170818505338076e-05, + "loss": 1.316, + "step": 8880 + }, + { + "epoch": 3.947111111111111, + "grad_norm": 2.01225209236145, + "learning_rate": 4.215302491103203e-05, + "loss": 0.4757, + "step": 8881 + }, + { + "epoch": 3.9475555555555557, + "grad_norm": 2.771531343460083, + "learning_rate": 4.213523131672598e-05, + "loss": 0.4966, + "step": 8882 + }, + { + "epoch": 3.948, + "grad_norm": 3.772594928741455, + "learning_rate": 4.211743772241993e-05, + "loss": 1.0786, + "step": 8883 + }, + { + "epoch": 3.948444444444444, + "grad_norm": 3.722163438796997, + "learning_rate": 4.209964412811388e-05, + "loss": 1.3533, + "step": 8884 + }, + { + "epoch": 3.948888888888889, + "grad_norm": 4.219447135925293, + "learning_rate": 4.208185053380783e-05, + "loss": 1.7263, + "step": 8885 + }, + { + "epoch": 3.9493333333333336, + "grad_norm": 3.999436855316162, + "learning_rate": 4.206405693950178e-05, + "loss": 1.3957, + "step": 8886 + }, + { + "epoch": 3.949777777777778, + "grad_norm": 3.6108932495117188, + "learning_rate": 4.204626334519573e-05, + "loss": 1.2958, + "step": 8887 + }, + { + "epoch": 3.950222222222222, + "grad_norm": 3.92063570022583, + "learning_rate": 4.202846975088968e-05, + "loss": 1.4854, + "step": 8888 + }, + { + "epoch": 3.9506666666666668, + "grad_norm": 3.5336453914642334, + "learning_rate": 4.201067615658363e-05, + "loss": 1.0695, + "step": 8889 + }, + { + "epoch": 3.951111111111111, + "grad_norm": 4.354097366333008, + "learning_rate": 4.199288256227758e-05, + "loss": 1.4669, + "step": 8890 + }, + { + "epoch": 3.9515555555555557, + "grad_norm": 4.140068054199219, + "learning_rate": 4.197508896797153e-05, + "loss": 1.292, + "step": 8891 + }, + { + "epoch": 3.952, + "grad_norm": 4.120720863342285, + "learning_rate": 4.1957295373665486e-05, + "loss": 1.2629, + "step": 8892 + }, + { + "epoch": 3.9524444444444446, + "grad_norm": 4.0067901611328125, + "learning_rate": 4.193950177935943e-05, + "loss": 1.318, + "step": 8893 + }, + { + "epoch": 3.952888888888889, + "grad_norm": 3.640927314758301, + "learning_rate": 4.192170818505338e-05, + 
"loss": 0.8191, + "step": 8894 + }, + { + "epoch": 3.953333333333333, + "grad_norm": 3.907336473464966, + "learning_rate": 4.1903914590747336e-05, + "loss": 1.0409, + "step": 8895 + }, + { + "epoch": 3.953777777777778, + "grad_norm": 4.239986419677734, + "learning_rate": 4.188612099644128e-05, + "loss": 1.4091, + "step": 8896 + }, + { + "epoch": 3.954222222222222, + "grad_norm": 4.236495494842529, + "learning_rate": 4.1868327402135235e-05, + "loss": 1.1437, + "step": 8897 + }, + { + "epoch": 3.9546666666666668, + "grad_norm": 3.6864097118377686, + "learning_rate": 4.1850533807829185e-05, + "loss": 1.0221, + "step": 8898 + }, + { + "epoch": 3.955111111111111, + "grad_norm": 5.639880180358887, + "learning_rate": 4.1832740213523135e-05, + "loss": 1.8105, + "step": 8899 + }, + { + "epoch": 3.9555555555555557, + "grad_norm": 3.3932342529296875, + "learning_rate": 4.1814946619217084e-05, + "loss": 0.5343, + "step": 8900 + }, + { + "epoch": 3.956, + "grad_norm": 2.1324033737182617, + "learning_rate": 4.1797153024911034e-05, + "loss": 2.0662, + "step": 8901 + }, + { + "epoch": 3.956444444444444, + "grad_norm": 2.3241565227508545, + "learning_rate": 4.1779359430604984e-05, + "loss": 1.8588, + "step": 8902 + }, + { + "epoch": 3.956888888888889, + "grad_norm": 2.4009382724761963, + "learning_rate": 4.1761565836298933e-05, + "loss": 2.0112, + "step": 8903 + }, + { + "epoch": 3.9573333333333336, + "grad_norm": 2.540668487548828, + "learning_rate": 4.174377224199288e-05, + "loss": 1.4803, + "step": 8904 + }, + { + "epoch": 3.957777777777778, + "grad_norm": 2.241854190826416, + "learning_rate": 4.172597864768684e-05, + "loss": 1.6004, + "step": 8905 + }, + { + "epoch": 3.958222222222222, + "grad_norm": 3.1838226318359375, + "learning_rate": 4.170818505338078e-05, + "loss": 1.6044, + "step": 8906 + }, + { + "epoch": 3.958666666666667, + "grad_norm": 2.7227938175201416, + "learning_rate": 4.169039145907473e-05, + "loss": 1.3226, + "step": 8907 + }, + { + "epoch": 3.959111111111111, + "grad_norm": 2.933244466781616, + "learning_rate": 4.167259786476869e-05, + "loss": 1.3356, + "step": 8908 + }, + { + "epoch": 3.9595555555555557, + "grad_norm": 2.794614315032959, + "learning_rate": 4.165480427046263e-05, + "loss": 1.3394, + "step": 8909 + }, + { + "epoch": 3.96, + "grad_norm": 3.1257002353668213, + "learning_rate": 4.163701067615658e-05, + "loss": 1.6075, + "step": 8910 + }, + { + "epoch": 3.9604444444444447, + "grad_norm": 3.743485927581787, + "learning_rate": 4.161921708185054e-05, + "loss": 1.9586, + "step": 8911 + }, + { + "epoch": 3.960888888888889, + "grad_norm": 2.8176426887512207, + "learning_rate": 4.160142348754449e-05, + "loss": 1.7985, + "step": 8912 + }, + { + "epoch": 3.961333333333333, + "grad_norm": 3.1608290672302246, + "learning_rate": 4.158362989323844e-05, + "loss": 1.729, + "step": 8913 + }, + { + "epoch": 3.961777777777778, + "grad_norm": 3.7492446899414062, + "learning_rate": 4.156583629893239e-05, + "loss": 1.606, + "step": 8914 + }, + { + "epoch": 3.962222222222222, + "grad_norm": 3.0357353687286377, + "learning_rate": 4.154804270462634e-05, + "loss": 1.2235, + "step": 8915 + }, + { + "epoch": 3.962666666666667, + "grad_norm": 3.4608840942382812, + "learning_rate": 4.1530249110320287e-05, + "loss": 1.6648, + "step": 8916 + }, + { + "epoch": 3.963111111111111, + "grad_norm": 3.740095615386963, + "learning_rate": 4.1512455516014236e-05, + "loss": 1.4451, + "step": 8917 + }, + { + "epoch": 3.9635555555555557, + "grad_norm": 3.2377631664276123, + "learning_rate": 4.1494661921708186e-05, 
+ "loss": 1.3791, + "step": 8918 + }, + { + "epoch": 3.964, + "grad_norm": 2.5673611164093018, + "learning_rate": 4.1476868327402136e-05, + "loss": 0.6982, + "step": 8919 + }, + { + "epoch": 3.964444444444444, + "grad_norm": 3.1579864025115967, + "learning_rate": 4.1459074733096085e-05, + "loss": 1.1644, + "step": 8920 + }, + { + "epoch": 3.964888888888889, + "grad_norm": 3.7532455921173096, + "learning_rate": 4.144128113879004e-05, + "loss": 1.3643, + "step": 8921 + }, + { + "epoch": 3.9653333333333336, + "grad_norm": 3.0965757369995117, + "learning_rate": 4.1423487544483985e-05, + "loss": 1.5047, + "step": 8922 + }, + { + "epoch": 3.965777777777778, + "grad_norm": 3.5118284225463867, + "learning_rate": 4.1405693950177935e-05, + "loss": 1.7549, + "step": 8923 + }, + { + "epoch": 3.966222222222222, + "grad_norm": 3.520890712738037, + "learning_rate": 4.138790035587189e-05, + "loss": 1.4312, + "step": 8924 + }, + { + "epoch": 3.966666666666667, + "grad_norm": 3.2968950271606445, + "learning_rate": 4.1370106761565834e-05, + "loss": 1.1051, + "step": 8925 + }, + { + "epoch": 3.967111111111111, + "grad_norm": 3.584757089614868, + "learning_rate": 4.135231316725979e-05, + "loss": 1.4383, + "step": 8926 + }, + { + "epoch": 3.9675555555555553, + "grad_norm": 2.9149937629699707, + "learning_rate": 4.133451957295374e-05, + "loss": 0.7798, + "step": 8927 + }, + { + "epoch": 3.968, + "grad_norm": 4.206515312194824, + "learning_rate": 4.131672597864769e-05, + "loss": 1.2928, + "step": 8928 + }, + { + "epoch": 3.9684444444444447, + "grad_norm": 3.031240224838257, + "learning_rate": 4.129893238434164e-05, + "loss": 1.0482, + "step": 8929 + }, + { + "epoch": 3.968888888888889, + "grad_norm": 3.6411449909210205, + "learning_rate": 4.128113879003559e-05, + "loss": 1.0632, + "step": 8930 + }, + { + "epoch": 3.969333333333333, + "grad_norm": 3.8553061485290527, + "learning_rate": 4.126334519572954e-05, + "loss": 1.5348, + "step": 8931 + }, + { + "epoch": 3.969777777777778, + "grad_norm": 3.2213852405548096, + "learning_rate": 4.124555160142349e-05, + "loss": 1.4563, + "step": 8932 + }, + { + "epoch": 3.970222222222222, + "grad_norm": 3.013375759124756, + "learning_rate": 4.122775800711744e-05, + "loss": 1.1047, + "step": 8933 + }, + { + "epoch": 3.970666666666667, + "grad_norm": 3.3184897899627686, + "learning_rate": 4.120996441281139e-05, + "loss": 1.2822, + "step": 8934 + }, + { + "epoch": 3.971111111111111, + "grad_norm": 3.349609375, + "learning_rate": 4.119217081850534e-05, + "loss": 1.4031, + "step": 8935 + }, + { + "epoch": 3.9715555555555557, + "grad_norm": 3.315196990966797, + "learning_rate": 4.117437722419929e-05, + "loss": 1.3917, + "step": 8936 + }, + { + "epoch": 3.972, + "grad_norm": 3.5079121589660645, + "learning_rate": 4.1156583629893244e-05, + "loss": 1.3935, + "step": 8937 + }, + { + "epoch": 3.9724444444444442, + "grad_norm": 3.8228821754455566, + "learning_rate": 4.113879003558719e-05, + "loss": 1.2339, + "step": 8938 + }, + { + "epoch": 3.972888888888889, + "grad_norm": 3.724152088165283, + "learning_rate": 4.112099644128114e-05, + "loss": 1.03, + "step": 8939 + }, + { + "epoch": 3.9733333333333336, + "grad_norm": 3.229318141937256, + "learning_rate": 4.1103202846975093e-05, + "loss": 0.7204, + "step": 8940 + }, + { + "epoch": 3.973777777777778, + "grad_norm": 4.946290493011475, + "learning_rate": 4.108540925266904e-05, + "loss": 1.647, + "step": 8941 + }, + { + "epoch": 3.974222222222222, + "grad_norm": 4.947329521179199, + "learning_rate": 4.106761565836299e-05, + "loss": 1.4321, + 
"step": 8942 + }, + { + "epoch": 3.974666666666667, + "grad_norm": 3.631192684173584, + "learning_rate": 4.104982206405694e-05, + "loss": 1.2327, + "step": 8943 + }, + { + "epoch": 3.975111111111111, + "grad_norm": 3.9020490646362305, + "learning_rate": 4.103202846975089e-05, + "loss": 1.3794, + "step": 8944 + }, + { + "epoch": 3.9755555555555553, + "grad_norm": 4.423495769500732, + "learning_rate": 4.101423487544484e-05, + "loss": 1.5862, + "step": 8945 + }, + { + "epoch": 3.976, + "grad_norm": 4.301177024841309, + "learning_rate": 4.099644128113879e-05, + "loss": 0.9748, + "step": 8946 + }, + { + "epoch": 3.9764444444444447, + "grad_norm": 3.9020419120788574, + "learning_rate": 4.097864768683274e-05, + "loss": 0.873, + "step": 8947 + }, + { + "epoch": 3.976888888888889, + "grad_norm": 4.565361022949219, + "learning_rate": 4.096085409252669e-05, + "loss": 0.9833, + "step": 8948 + }, + { + "epoch": 3.977333333333333, + "grad_norm": 3.912226676940918, + "learning_rate": 4.094306049822064e-05, + "loss": 1.3495, + "step": 8949 + }, + { + "epoch": 3.977777777777778, + "grad_norm": 5.293661594390869, + "learning_rate": 4.09252669039146e-05, + "loss": 1.3508, + "step": 8950 + }, + { + "epoch": 3.978222222222222, + "grad_norm": 0.17634160816669464, + "learning_rate": 4.090747330960854e-05, + "loss": 0.0163, + "step": 8951 + }, + { + "epoch": 3.978666666666667, + "grad_norm": 2.6012349128723145, + "learning_rate": 4.088967971530249e-05, + "loss": 1.7285, + "step": 8952 + }, + { + "epoch": 3.979111111111111, + "grad_norm": 2.6007161140441895, + "learning_rate": 4.0871886120996447e-05, + "loss": 0.9842, + "step": 8953 + }, + { + "epoch": 3.9795555555555557, + "grad_norm": 2.962765693664551, + "learning_rate": 4.085409252669039e-05, + "loss": 1.6334, + "step": 8954 + }, + { + "epoch": 3.98, + "grad_norm": 3.1964375972747803, + "learning_rate": 4.083629893238434e-05, + "loss": 1.7721, + "step": 8955 + }, + { + "epoch": 3.9804444444444442, + "grad_norm": 3.102666139602661, + "learning_rate": 4.0818505338078296e-05, + "loss": 1.5754, + "step": 8956 + }, + { + "epoch": 3.980888888888889, + "grad_norm": 3.193918466567993, + "learning_rate": 4.0800711743772245e-05, + "loss": 2.0491, + "step": 8957 + }, + { + "epoch": 3.981333333333333, + "grad_norm": 2.9397614002227783, + "learning_rate": 4.0782918149466195e-05, + "loss": 1.044, + "step": 8958 + }, + { + "epoch": 3.981777777777778, + "grad_norm": 3.4079766273498535, + "learning_rate": 4.0765124555160145e-05, + "loss": 1.0677, + "step": 8959 + }, + { + "epoch": 3.982222222222222, + "grad_norm": 3.466888666152954, + "learning_rate": 4.0747330960854095e-05, + "loss": 1.5662, + "step": 8960 + }, + { + "epoch": 3.982666666666667, + "grad_norm": 2.9022140502929688, + "learning_rate": 4.0729537366548044e-05, + "loss": 1.1119, + "step": 8961 + }, + { + "epoch": 3.983111111111111, + "grad_norm": 3.6132586002349854, + "learning_rate": 4.0711743772241994e-05, + "loss": 1.4747, + "step": 8962 + }, + { + "epoch": 3.9835555555555553, + "grad_norm": 3.143056631088257, + "learning_rate": 4.0693950177935944e-05, + "loss": 1.6553, + "step": 8963 + }, + { + "epoch": 3.984, + "grad_norm": 3.3530349731445312, + "learning_rate": 4.0676156583629894e-05, + "loss": 1.1596, + "step": 8964 + }, + { + "epoch": 3.9844444444444447, + "grad_norm": 3.0167648792266846, + "learning_rate": 4.065836298932384e-05, + "loss": 1.0196, + "step": 8965 + }, + { + "epoch": 3.984888888888889, + "grad_norm": 2.661754846572876, + "learning_rate": 4.06405693950178e-05, + "loss": 0.8042, + "step": 8966 + 
}, + { + "epoch": 3.985333333333333, + "grad_norm": 4.018869400024414, + "learning_rate": 4.062277580071174e-05, + "loss": 1.6149, + "step": 8967 + }, + { + "epoch": 3.985777777777778, + "grad_norm": 3.3034558296203613, + "learning_rate": 4.060498220640569e-05, + "loss": 1.0005, + "step": 8968 + }, + { + "epoch": 3.986222222222222, + "grad_norm": 3.277484178543091, + "learning_rate": 4.058718861209965e-05, + "loss": 1.4676, + "step": 8969 + }, + { + "epoch": 3.986666666666667, + "grad_norm": 2.8795299530029297, + "learning_rate": 4.05693950177936e-05, + "loss": 1.0455, + "step": 8970 + }, + { + "epoch": 3.987111111111111, + "grad_norm": 3.6743695735931396, + "learning_rate": 4.055160142348755e-05, + "loss": 1.3309, + "step": 8971 + }, + { + "epoch": 3.9875555555555557, + "grad_norm": 2.998657464981079, + "learning_rate": 4.05338078291815e-05, + "loss": 1.3431, + "step": 8972 + }, + { + "epoch": 3.988, + "grad_norm": 3.6148931980133057, + "learning_rate": 4.051601423487545e-05, + "loss": 1.167, + "step": 8973 + }, + { + "epoch": 3.9884444444444442, + "grad_norm": 3.467222213745117, + "learning_rate": 4.04982206405694e-05, + "loss": 1.2744, + "step": 8974 + }, + { + "epoch": 3.988888888888889, + "grad_norm": 3.517871618270874, + "learning_rate": 4.048042704626335e-05, + "loss": 1.4644, + "step": 8975 + }, + { + "epoch": 3.989333333333333, + "grad_norm": 3.028439998626709, + "learning_rate": 4.04626334519573e-05, + "loss": 0.8676, + "step": 8976 + }, + { + "epoch": 3.989777777777778, + "grad_norm": 3.8672285079956055, + "learning_rate": 4.044483985765125e-05, + "loss": 1.3043, + "step": 8977 + }, + { + "epoch": 3.990222222222222, + "grad_norm": 3.13926362991333, + "learning_rate": 4.0427046263345196e-05, + "loss": 1.0397, + "step": 8978 + }, + { + "epoch": 3.990666666666667, + "grad_norm": 3.7574684619903564, + "learning_rate": 4.0409252669039146e-05, + "loss": 1.2111, + "step": 8979 + }, + { + "epoch": 3.991111111111111, + "grad_norm": 3.468540906906128, + "learning_rate": 4.0391459074733096e-05, + "loss": 1.4037, + "step": 8980 + }, + { + "epoch": 3.9915555555555553, + "grad_norm": 2.5756216049194336, + "learning_rate": 4.0373665480427046e-05, + "loss": 0.6104, + "step": 8981 + }, + { + "epoch": 3.992, + "grad_norm": 4.789612293243408, + "learning_rate": 4.0355871886121e-05, + "loss": 1.3617, + "step": 8982 + }, + { + "epoch": 3.9924444444444447, + "grad_norm": 4.018329620361328, + "learning_rate": 4.0338078291814945e-05, + "loss": 1.3874, + "step": 8983 + }, + { + "epoch": 3.992888888888889, + "grad_norm": 3.488252639770508, + "learning_rate": 4.0320284697508895e-05, + "loss": 1.2444, + "step": 8984 + }, + { + "epoch": 3.993333333333333, + "grad_norm": 3.8759312629699707, + "learning_rate": 4.030249110320285e-05, + "loss": 1.1931, + "step": 8985 + }, + { + "epoch": 3.993777777777778, + "grad_norm": 2.9700896739959717, + "learning_rate": 4.02846975088968e-05, + "loss": 0.9424, + "step": 8986 + }, + { + "epoch": 3.994222222222222, + "grad_norm": 3.435788154602051, + "learning_rate": 4.026690391459075e-05, + "loss": 1.1437, + "step": 8987 + }, + { + "epoch": 3.994666666666667, + "grad_norm": 3.1736974716186523, + "learning_rate": 4.02491103202847e-05, + "loss": 1.014, + "step": 8988 + }, + { + "epoch": 3.995111111111111, + "grad_norm": 3.615291118621826, + "learning_rate": 4.023131672597865e-05, + "loss": 1.4715, + "step": 8989 + }, + { + "epoch": 3.9955555555555557, + "grad_norm": 3.378284215927124, + "learning_rate": 4.02135231316726e-05, + "loss": 1.2936, + "step": 8990 + }, + { + "epoch": 
3.996, + "grad_norm": 3.3950233459472656, + "learning_rate": 4.019572953736655e-05, + "loss": 1.1708, + "step": 8991 + }, + { + "epoch": 3.9964444444444442, + "grad_norm": 3.4889776706695557, + "learning_rate": 4.01779359430605e-05, + "loss": 1.192, + "step": 8992 + }, + { + "epoch": 3.996888888888889, + "grad_norm": 3.3020076751708984, + "learning_rate": 4.016014234875445e-05, + "loss": 0.991, + "step": 8993 + }, + { + "epoch": 3.997333333333333, + "grad_norm": 4.46552038192749, + "learning_rate": 4.01423487544484e-05, + "loss": 1.4608, + "step": 8994 + }, + { + "epoch": 3.997777777777778, + "grad_norm": 4.276278495788574, + "learning_rate": 4.0124555160142355e-05, + "loss": 1.2873, + "step": 8995 + }, + { + "epoch": 3.998222222222222, + "grad_norm": 3.8281071186065674, + "learning_rate": 4.01067615658363e-05, + "loss": 1.461, + "step": 8996 + }, + { + "epoch": 3.998666666666667, + "grad_norm": 5.66308069229126, + "learning_rate": 4.008896797153025e-05, + "loss": 1.2658, + "step": 8997 + }, + { + "epoch": 3.999111111111111, + "grad_norm": 4.838554382324219, + "learning_rate": 4.0071174377224204e-05, + "loss": 1.175, + "step": 8998 + }, + { + "epoch": 3.9995555555555553, + "grad_norm": 4.6421990394592285, + "learning_rate": 4.0053380782918154e-05, + "loss": 1.2439, + "step": 8999 + }, + { + "epoch": 4.0, + "grad_norm": 3.736137628555298, + "learning_rate": 4.00355871886121e-05, + "loss": 0.6085, + "step": 9000 + }, + { + "epoch": 4.0, + "eval_loss": 2.81941556930542, + "eval_runtime": 47.453, + "eval_samples_per_second": 10.537, + "eval_steps_per_second": 10.537, + "step": 9000 + }, + { + "epoch": 4.000444444444445, + "grad_norm": 2.5390360355377197, + "learning_rate": 4.0017793594306054e-05, + "loss": 1.3083, + "step": 9001 + }, + { + "epoch": 4.0008888888888885, + "grad_norm": 2.7254271507263184, + "learning_rate": 4e-05, + "loss": 0.5198, + "step": 9002 + }, + { + "epoch": 4.001333333333333, + "grad_norm": 2.502013683319092, + "learning_rate": 3.998220640569395e-05, + "loss": 1.3684, + "step": 9003 + }, + { + "epoch": 4.001777777777778, + "grad_norm": 2.1942808628082275, + "learning_rate": 3.99644128113879e-05, + "loss": 0.9158, + "step": 9004 + }, + { + "epoch": 4.002222222222223, + "grad_norm": 2.4862608909606934, + "learning_rate": 3.994661921708185e-05, + "loss": 1.1732, + "step": 9005 + }, + { + "epoch": 4.002666666666666, + "grad_norm": 2.521996259689331, + "learning_rate": 3.99288256227758e-05, + "loss": 0.9865, + "step": 9006 + }, + { + "epoch": 4.003111111111111, + "grad_norm": 2.7241384983062744, + "learning_rate": 3.991103202846975e-05, + "loss": 0.9629, + "step": 9007 + }, + { + "epoch": 4.003555555555556, + "grad_norm": 2.860523223876953, + "learning_rate": 3.98932384341637e-05, + "loss": 0.9462, + "step": 9008 + }, + { + "epoch": 4.004, + "grad_norm": 2.424485921859741, + "learning_rate": 3.987544483985765e-05, + "loss": 0.762, + "step": 9009 + }, + { + "epoch": 4.004444444444444, + "grad_norm": 3.2008659839630127, + "learning_rate": 3.98576512455516e-05, + "loss": 1.2424, + "step": 9010 + }, + { + "epoch": 4.004888888888889, + "grad_norm": 3.4407267570495605, + "learning_rate": 3.983985765124556e-05, + "loss": 0.9112, + "step": 9011 + }, + { + "epoch": 4.005333333333334, + "grad_norm": 2.80182147026062, + "learning_rate": 3.98220640569395e-05, + "loss": 1.006, + "step": 9012 + }, + { + "epoch": 4.005777777777777, + "grad_norm": 3.254213571548462, + "learning_rate": 3.980427046263345e-05, + "loss": 1.0918, + "step": 9013 + }, + { + "epoch": 4.006222222222222, + 
"grad_norm": 3.264662027359009, + "learning_rate": 3.978647686832741e-05, + "loss": 1.248, + "step": 9014 + }, + { + "epoch": 4.006666666666667, + "grad_norm": 2.7406811714172363, + "learning_rate": 3.9768683274021356e-05, + "loss": 0.7658, + "step": 9015 + }, + { + "epoch": 4.0071111111111115, + "grad_norm": 3.4597320556640625, + "learning_rate": 3.9750889679715306e-05, + "loss": 0.9957, + "step": 9016 + }, + { + "epoch": 4.007555555555555, + "grad_norm": 3.379284620285034, + "learning_rate": 3.9733096085409256e-05, + "loss": 1.1522, + "step": 9017 + }, + { + "epoch": 4.008, + "grad_norm": 2.9539339542388916, + "learning_rate": 3.9715302491103206e-05, + "loss": 0.8746, + "step": 9018 + }, + { + "epoch": 4.008444444444445, + "grad_norm": 3.017645835876465, + "learning_rate": 3.9697508896797155e-05, + "loss": 0.7478, + "step": 9019 + }, + { + "epoch": 4.0088888888888885, + "grad_norm": 3.781395196914673, + "learning_rate": 3.9679715302491105e-05, + "loss": 1.0097, + "step": 9020 + }, + { + "epoch": 4.009333333333333, + "grad_norm": 3.5405569076538086, + "learning_rate": 3.9661921708185055e-05, + "loss": 0.7791, + "step": 9021 + }, + { + "epoch": 4.009777777777778, + "grad_norm": 4.20705509185791, + "learning_rate": 3.9644128113879004e-05, + "loss": 0.98, + "step": 9022 + }, + { + "epoch": 4.010222222222223, + "grad_norm": 3.9584195613861084, + "learning_rate": 3.9626334519572954e-05, + "loss": 0.6952, + "step": 9023 + }, + { + "epoch": 4.010666666666666, + "grad_norm": 4.0206522941589355, + "learning_rate": 3.9608540925266904e-05, + "loss": 1.0842, + "step": 9024 + }, + { + "epoch": 4.011111111111111, + "grad_norm": 4.080977439880371, + "learning_rate": 3.9590747330960854e-05, + "loss": 0.7917, + "step": 9025 + }, + { + "epoch": 4.011555555555556, + "grad_norm": 4.305196285247803, + "learning_rate": 3.95729537366548e-05, + "loss": 1.0147, + "step": 9026 + }, + { + "epoch": 4.012, + "grad_norm": 3.7779810428619385, + "learning_rate": 3.955516014234876e-05, + "loss": 0.764, + "step": 9027 + }, + { + "epoch": 4.012444444444444, + "grad_norm": 3.0636918544769287, + "learning_rate": 3.953736654804271e-05, + "loss": 0.5695, + "step": 9028 + }, + { + "epoch": 4.012888888888889, + "grad_norm": 3.5221712589263916, + "learning_rate": 3.951957295373665e-05, + "loss": 0.7406, + "step": 9029 + }, + { + "epoch": 4.013333333333334, + "grad_norm": 4.276998996734619, + "learning_rate": 3.950177935943061e-05, + "loss": 0.8573, + "step": 9030 + }, + { + "epoch": 4.0137777777777774, + "grad_norm": 5.169942855834961, + "learning_rate": 3.948398576512456e-05, + "loss": 1.1206, + "step": 9031 + }, + { + "epoch": 4.014222222222222, + "grad_norm": 3.651606798171997, + "learning_rate": 3.946619217081851e-05, + "loss": 0.6189, + "step": 9032 + }, + { + "epoch": 4.014666666666667, + "grad_norm": 4.276918411254883, + "learning_rate": 3.944839857651246e-05, + "loss": 1.2343, + "step": 9033 + }, + { + "epoch": 4.0151111111111115, + "grad_norm": 3.116567850112915, + "learning_rate": 3.943060498220641e-05, + "loss": 0.5037, + "step": 9034 + }, + { + "epoch": 4.015555555555555, + "grad_norm": 4.222073078155518, + "learning_rate": 3.941281138790036e-05, + "loss": 0.9321, + "step": 9035 + }, + { + "epoch": 4.016, + "grad_norm": 3.9893438816070557, + "learning_rate": 3.939501779359431e-05, + "loss": 1.0099, + "step": 9036 + }, + { + "epoch": 4.016444444444445, + "grad_norm": 4.333362102508545, + "learning_rate": 3.937722419928826e-05, + "loss": 1.0413, + "step": 9037 + }, + { + "epoch": 4.0168888888888885, + "grad_norm": 
3.694157361984253, + "learning_rate": 3.935943060498221e-05, + "loss": 0.6196, + "step": 9038 + }, + { + "epoch": 4.017333333333333, + "grad_norm": 4.171072006225586, + "learning_rate": 3.9341637010676157e-05, + "loss": 0.6015, + "step": 9039 + }, + { + "epoch": 4.017777777777778, + "grad_norm": 4.184826374053955, + "learning_rate": 3.932384341637011e-05, + "loss": 0.9599, + "step": 9040 + }, + { + "epoch": 4.018222222222223, + "grad_norm": 3.0356812477111816, + "learning_rate": 3.9306049822064056e-05, + "loss": 0.5598, + "step": 9041 + }, + { + "epoch": 4.018666666666666, + "grad_norm": 5.620950222015381, + "learning_rate": 3.9288256227758006e-05, + "loss": 0.7523, + "step": 9042 + }, + { + "epoch": 4.019111111111111, + "grad_norm": 3.5355186462402344, + "learning_rate": 3.927046263345196e-05, + "loss": 0.5461, + "step": 9043 + }, + { + "epoch": 4.019555555555556, + "grad_norm": 4.568739414215088, + "learning_rate": 3.925266903914591e-05, + "loss": 0.8424, + "step": 9044 + }, + { + "epoch": 4.02, + "grad_norm": 3.9112837314605713, + "learning_rate": 3.9234875444839855e-05, + "loss": 0.6756, + "step": 9045 + }, + { + "epoch": 4.020444444444444, + "grad_norm": 4.544727325439453, + "learning_rate": 3.921708185053381e-05, + "loss": 0.7065, + "step": 9046 + }, + { + "epoch": 4.020888888888889, + "grad_norm": 4.089946746826172, + "learning_rate": 3.919928825622776e-05, + "loss": 1.122, + "step": 9047 + }, + { + "epoch": 4.021333333333334, + "grad_norm": 5.61737060546875, + "learning_rate": 3.918149466192171e-05, + "loss": 0.4246, + "step": 9048 + }, + { + "epoch": 4.0217777777777775, + "grad_norm": 4.636961460113525, + "learning_rate": 3.916370106761566e-05, + "loss": 0.4196, + "step": 9049 + }, + { + "epoch": 4.022222222222222, + "grad_norm": 7.075558662414551, + "learning_rate": 3.914590747330961e-05, + "loss": 0.5413, + "step": 9050 + }, + { + "epoch": 4.022666666666667, + "grad_norm": 2.6508331298828125, + "learning_rate": 3.912811387900356e-05, + "loss": 1.1433, + "step": 9051 + }, + { + "epoch": 4.0231111111111115, + "grad_norm": 2.6102499961853027, + "learning_rate": 3.911032028469751e-05, + "loss": 1.4944, + "step": 9052 + }, + { + "epoch": 4.023555555555555, + "grad_norm": 2.353135108947754, + "learning_rate": 3.909252669039146e-05, + "loss": 0.6206, + "step": 9053 + }, + { + "epoch": 4.024, + "grad_norm": 2.1263370513916016, + "learning_rate": 3.907473309608541e-05, + "loss": 0.9243, + "step": 9054 + }, + { + "epoch": 4.024444444444445, + "grad_norm": 3.224517583847046, + "learning_rate": 3.905693950177936e-05, + "loss": 1.5233, + "step": 9055 + }, + { + "epoch": 4.0248888888888885, + "grad_norm": 3.488454580307007, + "learning_rate": 3.9039145907473315e-05, + "loss": 1.5544, + "step": 9056 + }, + { + "epoch": 4.025333333333333, + "grad_norm": 3.6243624687194824, + "learning_rate": 3.9021352313167265e-05, + "loss": 1.1879, + "step": 9057 + }, + { + "epoch": 4.025777777777778, + "grad_norm": 3.8051021099090576, + "learning_rate": 3.900355871886121e-05, + "loss": 1.3218, + "step": 9058 + }, + { + "epoch": 4.026222222222223, + "grad_norm": 3.4981696605682373, + "learning_rate": 3.8985765124555164e-05, + "loss": 1.0695, + "step": 9059 + }, + { + "epoch": 4.026666666666666, + "grad_norm": 3.527284860610962, + "learning_rate": 3.8967971530249114e-05, + "loss": 1.0332, + "step": 9060 + }, + { + "epoch": 4.027111111111111, + "grad_norm": 3.228292942047119, + "learning_rate": 3.8950177935943064e-05, + "loss": 1.0043, + "step": 9061 + }, + { + "epoch": 4.027555555555556, + "grad_norm": 
3.101989984512329, + "learning_rate": 3.8932384341637014e-05, + "loss": 0.7491, + "step": 9062 + }, + { + "epoch": 4.028, + "grad_norm": 3.7942066192626953, + "learning_rate": 3.891459074733096e-05, + "loss": 1.1937, + "step": 9063 + }, + { + "epoch": 4.028444444444444, + "grad_norm": 4.187127590179443, + "learning_rate": 3.889679715302491e-05, + "loss": 1.0301, + "step": 9064 + }, + { + "epoch": 4.028888888888889, + "grad_norm": 3.1858954429626465, + "learning_rate": 3.887900355871886e-05, + "loss": 0.8335, + "step": 9065 + }, + { + "epoch": 4.029333333333334, + "grad_norm": 3.0691826343536377, + "learning_rate": 3.886120996441281e-05, + "loss": 0.8928, + "step": 9066 + }, + { + "epoch": 4.0297777777777775, + "grad_norm": 2.3211822509765625, + "learning_rate": 3.884341637010676e-05, + "loss": 0.3684, + "step": 9067 + }, + { + "epoch": 4.030222222222222, + "grad_norm": 3.79524302482605, + "learning_rate": 3.882562277580071e-05, + "loss": 1.2104, + "step": 9068 + }, + { + "epoch": 4.030666666666667, + "grad_norm": 4.332894325256348, + "learning_rate": 3.880782918149466e-05, + "loss": 1.0843, + "step": 9069 + }, + { + "epoch": 4.0311111111111115, + "grad_norm": 4.316042423248291, + "learning_rate": 3.879003558718861e-05, + "loss": 0.5951, + "step": 9070 + }, + { + "epoch": 4.031555555555555, + "grad_norm": 3.6957225799560547, + "learning_rate": 3.877224199288256e-05, + "loss": 1.0805, + "step": 9071 + }, + { + "epoch": 4.032, + "grad_norm": 4.04442834854126, + "learning_rate": 3.875444839857652e-05, + "loss": 1.1406, + "step": 9072 + }, + { + "epoch": 4.032444444444445, + "grad_norm": 3.6291229724884033, + "learning_rate": 3.873665480427047e-05, + "loss": 1.2136, + "step": 9073 + }, + { + "epoch": 4.0328888888888885, + "grad_norm": 3.7658095359802246, + "learning_rate": 3.871886120996441e-05, + "loss": 0.9391, + "step": 9074 + }, + { + "epoch": 4.033333333333333, + "grad_norm": 4.630577564239502, + "learning_rate": 3.870106761565837e-05, + "loss": 0.9134, + "step": 9075 + }, + { + "epoch": 4.033777777777778, + "grad_norm": 4.5560407638549805, + "learning_rate": 3.8683274021352317e-05, + "loss": 1.164, + "step": 9076 + }, + { + "epoch": 4.034222222222223, + "grad_norm": 3.639586925506592, + "learning_rate": 3.8665480427046266e-05, + "loss": 0.7082, + "step": 9077 + }, + { + "epoch": 4.034666666666666, + "grad_norm": 3.4507131576538086, + "learning_rate": 3.8647686832740216e-05, + "loss": 1.0112, + "step": 9078 + }, + { + "epoch": 4.035111111111111, + "grad_norm": 3.1812744140625, + "learning_rate": 3.8629893238434166e-05, + "loss": 0.5256, + "step": 9079 + }, + { + "epoch": 4.035555555555556, + "grad_norm": 3.704305410385132, + "learning_rate": 3.8612099644128115e-05, + "loss": 0.7616, + "step": 9080 + }, + { + "epoch": 4.036, + "grad_norm": 2.5341804027557373, + "learning_rate": 3.8594306049822065e-05, + "loss": 0.4988, + "step": 9081 + }, + { + "epoch": 4.036444444444444, + "grad_norm": 3.8586108684539795, + "learning_rate": 3.8576512455516015e-05, + "loss": 0.7027, + "step": 9082 + }, + { + "epoch": 4.036888888888889, + "grad_norm": 3.8743813037872314, + "learning_rate": 3.8558718861209965e-05, + "loss": 1.023, + "step": 9083 + }, + { + "epoch": 4.037333333333334, + "grad_norm": 3.394871711730957, + "learning_rate": 3.8540925266903914e-05, + "loss": 1.0268, + "step": 9084 + }, + { + "epoch": 4.0377777777777775, + "grad_norm": 4.589004993438721, + "learning_rate": 3.852313167259787e-05, + "loss": 1.0115, + "step": 9085 + }, + { + "epoch": 4.038222222222222, + "grad_norm": 
3.1262948513031006, + "learning_rate": 3.850533807829182e-05, + "loss": 0.576, + "step": 9086 + }, + { + "epoch": 4.038666666666667, + "grad_norm": 4.138561725616455, + "learning_rate": 3.8487544483985763e-05, + "loss": 0.9221, + "step": 9087 + }, + { + "epoch": 4.0391111111111115, + "grad_norm": 3.105081796646118, + "learning_rate": 3.846975088967972e-05, + "loss": 0.4531, + "step": 9088 + }, + { + "epoch": 4.039555555555555, + "grad_norm": 4.928419589996338, + "learning_rate": 3.845195729537367e-05, + "loss": 0.906, + "step": 9089 + }, + { + "epoch": 4.04, + "grad_norm": 4.220268249511719, + "learning_rate": 3.843416370106761e-05, + "loss": 0.8529, + "step": 9090 + }, + { + "epoch": 4.040444444444445, + "grad_norm": 5.022706031799316, + "learning_rate": 3.841637010676157e-05, + "loss": 0.9518, + "step": 9091 + }, + { + "epoch": 4.0408888888888885, + "grad_norm": 3.5744247436523438, + "learning_rate": 3.839857651245552e-05, + "loss": 0.8885, + "step": 9092 + }, + { + "epoch": 4.041333333333333, + "grad_norm": 3.773207902908325, + "learning_rate": 3.838078291814947e-05, + "loss": 0.7402, + "step": 9093 + }, + { + "epoch": 4.041777777777778, + "grad_norm": 3.8730592727661133, + "learning_rate": 3.836298932384342e-05, + "loss": 0.8627, + "step": 9094 + }, + { + "epoch": 4.042222222222223, + "grad_norm": 4.191495895385742, + "learning_rate": 3.834519572953737e-05, + "loss": 0.6384, + "step": 9095 + }, + { + "epoch": 4.042666666666666, + "grad_norm": 4.794586658477783, + "learning_rate": 3.832740213523132e-05, + "loss": 0.9203, + "step": 9096 + }, + { + "epoch": 4.043111111111111, + "grad_norm": 4.144529819488525, + "learning_rate": 3.830960854092527e-05, + "loss": 0.7881, + "step": 9097 + }, + { + "epoch": 4.043555555555556, + "grad_norm": 8.298238754272461, + "learning_rate": 3.829181494661922e-05, + "loss": 1.375, + "step": 9098 + }, + { + "epoch": 4.044, + "grad_norm": 3.5953052043914795, + "learning_rate": 3.827402135231317e-05, + "loss": 0.3941, + "step": 9099 + }, + { + "epoch": 4.044444444444444, + "grad_norm": 10.296087265014648, + "learning_rate": 3.825622775800712e-05, + "loss": 0.2957, + "step": 9100 + }, + { + "epoch": 4.044888888888889, + "grad_norm": 3.059195041656494, + "learning_rate": 3.823843416370107e-05, + "loss": 1.8857, + "step": 9101 + }, + { + "epoch": 4.045333333333334, + "grad_norm": 2.8062288761138916, + "learning_rate": 3.822064056939502e-05, + "loss": 1.7158, + "step": 9102 + }, + { + "epoch": 4.0457777777777775, + "grad_norm": 1.8304457664489746, + "learning_rate": 3.8202846975088966e-05, + "loss": 0.4758, + "step": 9103 + }, + { + "epoch": 4.046222222222222, + "grad_norm": 2.9134559631347656, + "learning_rate": 3.818505338078292e-05, + "loss": 1.2599, + "step": 9104 + }, + { + "epoch": 4.046666666666667, + "grad_norm": 2.96635365486145, + "learning_rate": 3.816725978647687e-05, + "loss": 1.1921, + "step": 9105 + }, + { + "epoch": 4.0471111111111115, + "grad_norm": 2.9679348468780518, + "learning_rate": 3.814946619217082e-05, + "loss": 1.0036, + "step": 9106 + }, + { + "epoch": 4.047555555555555, + "grad_norm": 3.5697379112243652, + "learning_rate": 3.813167259786477e-05, + "loss": 1.3876, + "step": 9107 + }, + { + "epoch": 4.048, + "grad_norm": 2.98809814453125, + "learning_rate": 3.811387900355872e-05, + "loss": 0.7762, + "step": 9108 + }, + { + "epoch": 4.048444444444445, + "grad_norm": 3.473585367202759, + "learning_rate": 3.809608540925267e-05, + "loss": 1.2346, + "step": 9109 + }, + { + "epoch": 4.0488888888888885, + "grad_norm": 3.487264394760132, + 
"learning_rate": 3.807829181494662e-05, + "loss": 1.0737, + "step": 9110 + }, + { + "epoch": 4.049333333333333, + "grad_norm": 3.552971839904785, + "learning_rate": 3.806049822064057e-05, + "loss": 1.1126, + "step": 9111 + }, + { + "epoch": 4.049777777777778, + "grad_norm": 4.056872844696045, + "learning_rate": 3.804270462633452e-05, + "loss": 1.2205, + "step": 9112 + }, + { + "epoch": 4.050222222222223, + "grad_norm": 3.7317447662353516, + "learning_rate": 3.802491103202847e-05, + "loss": 1.0883, + "step": 9113 + }, + { + "epoch": 4.050666666666666, + "grad_norm": 3.8739066123962402, + "learning_rate": 3.800711743772242e-05, + "loss": 0.8847, + "step": 9114 + }, + { + "epoch": 4.051111111111111, + "grad_norm": 3.574202060699463, + "learning_rate": 3.7989323843416376e-05, + "loss": 0.7981, + "step": 9115 + }, + { + "epoch": 4.051555555555556, + "grad_norm": 3.6287660598754883, + "learning_rate": 3.797153024911032e-05, + "loss": 1.0176, + "step": 9116 + }, + { + "epoch": 4.052, + "grad_norm": 3.372129201889038, + "learning_rate": 3.7953736654804275e-05, + "loss": 0.8463, + "step": 9117 + }, + { + "epoch": 4.052444444444444, + "grad_norm": 0.2473270297050476, + "learning_rate": 3.7935943060498225e-05, + "loss": 0.0263, + "step": 9118 + }, + { + "epoch": 4.052888888888889, + "grad_norm": 2.3387582302093506, + "learning_rate": 3.791814946619217e-05, + "loss": 0.4731, + "step": 9119 + }, + { + "epoch": 4.053333333333334, + "grad_norm": 3.4713640213012695, + "learning_rate": 3.7900355871886125e-05, + "loss": 1.0173, + "step": 9120 + }, + { + "epoch": 4.0537777777777775, + "grad_norm": 4.791953086853027, + "learning_rate": 3.7882562277580074e-05, + "loss": 1.3301, + "step": 9121 + }, + { + "epoch": 4.054222222222222, + "grad_norm": 3.886340618133545, + "learning_rate": 3.7864768683274024e-05, + "loss": 0.759, + "step": 9122 + }, + { + "epoch": 4.054666666666667, + "grad_norm": 3.3495802879333496, + "learning_rate": 3.7846975088967974e-05, + "loss": 0.8179, + "step": 9123 + }, + { + "epoch": 4.0551111111111116, + "grad_norm": 3.790015935897827, + "learning_rate": 3.7829181494661923e-05, + "loss": 1.1811, + "step": 9124 + }, + { + "epoch": 4.055555555555555, + "grad_norm": 4.013554096221924, + "learning_rate": 3.781138790035587e-05, + "loss": 0.7972, + "step": 9125 + }, + { + "epoch": 4.056, + "grad_norm": 4.106941223144531, + "learning_rate": 3.779359430604982e-05, + "loss": 1.1104, + "step": 9126 + }, + { + "epoch": 4.056444444444445, + "grad_norm": 4.0450334548950195, + "learning_rate": 3.777580071174377e-05, + "loss": 1.0491, + "step": 9127 + }, + { + "epoch": 4.0568888888888885, + "grad_norm": 4.481957912445068, + "learning_rate": 3.775800711743772e-05, + "loss": 1.1531, + "step": 9128 + }, + { + "epoch": 4.057333333333333, + "grad_norm": 3.5693199634552, + "learning_rate": 3.774021352313167e-05, + "loss": 0.8621, + "step": 9129 + }, + { + "epoch": 4.057777777777778, + "grad_norm": 2.899524688720703, + "learning_rate": 3.772241992882563e-05, + "loss": 0.5678, + "step": 9130 + }, + { + "epoch": 4.058222222222223, + "grad_norm": 4.416593551635742, + "learning_rate": 3.770462633451958e-05, + "loss": 1.0094, + "step": 9131 + }, + { + "epoch": 4.058666666666666, + "grad_norm": 4.252343654632568, + "learning_rate": 3.768683274021352e-05, + "loss": 0.7924, + "step": 9132 + }, + { + "epoch": 4.059111111111111, + "grad_norm": 4.362743377685547, + "learning_rate": 3.766903914590748e-05, + "loss": 0.5001, + "step": 9133 + }, + { + "epoch": 4.059555555555556, + "grad_norm": 4.057370662689209, + 
"learning_rate": 3.765124555160143e-05, + "loss": 0.8535, + "step": 9134 + }, + { + "epoch": 4.06, + "grad_norm": 4.266189098358154, + "learning_rate": 3.763345195729537e-05, + "loss": 0.7122, + "step": 9135 + }, + { + "epoch": 4.060444444444444, + "grad_norm": 3.547168016433716, + "learning_rate": 3.761565836298933e-05, + "loss": 0.6885, + "step": 9136 + }, + { + "epoch": 4.060888888888889, + "grad_norm": 3.209439992904663, + "learning_rate": 3.759786476868328e-05, + "loss": 0.7069, + "step": 9137 + }, + { + "epoch": 4.061333333333334, + "grad_norm": 3.9220142364501953, + "learning_rate": 3.7580071174377226e-05, + "loss": 0.8905, + "step": 9138 + }, + { + "epoch": 4.0617777777777775, + "grad_norm": 3.8512017726898193, + "learning_rate": 3.7562277580071176e-05, + "loss": 0.9789, + "step": 9139 + }, + { + "epoch": 4.062222222222222, + "grad_norm": 4.788553714752197, + "learning_rate": 3.7544483985765126e-05, + "loss": 1.0029, + "step": 9140 + }, + { + "epoch": 4.062666666666667, + "grad_norm": 5.606046676635742, + "learning_rate": 3.7526690391459076e-05, + "loss": 0.781, + "step": 9141 + }, + { + "epoch": 4.063111111111111, + "grad_norm": 4.540804386138916, + "learning_rate": 3.7508896797153025e-05, + "loss": 1.0023, + "step": 9142 + }, + { + "epoch": 4.063555555555555, + "grad_norm": 4.766351699829102, + "learning_rate": 3.7491103202846975e-05, + "loss": 1.0042, + "step": 9143 + }, + { + "epoch": 4.064, + "grad_norm": 4.029290199279785, + "learning_rate": 3.747330960854093e-05, + "loss": 0.7966, + "step": 9144 + }, + { + "epoch": 4.064444444444445, + "grad_norm": 6.49293327331543, + "learning_rate": 3.7455516014234874e-05, + "loss": 0.6414, + "step": 9145 + }, + { + "epoch": 4.0648888888888886, + "grad_norm": 4.462486267089844, + "learning_rate": 3.743772241992883e-05, + "loss": 0.9538, + "step": 9146 + }, + { + "epoch": 4.065333333333333, + "grad_norm": 3.4527547359466553, + "learning_rate": 3.741992882562278e-05, + "loss": 0.4922, + "step": 9147 + }, + { + "epoch": 4.065777777777778, + "grad_norm": 3.8521132469177246, + "learning_rate": 3.7402135231316724e-05, + "loss": 0.7803, + "step": 9148 + }, + { + "epoch": 4.066222222222223, + "grad_norm": 3.7171268463134766, + "learning_rate": 3.738434163701068e-05, + "loss": 0.6302, + "step": 9149 + }, + { + "epoch": 4.066666666666666, + "grad_norm": 2.2403533458709717, + "learning_rate": 3.736654804270463e-05, + "loss": 0.2649, + "step": 9150 + }, + { + "epoch": 4.067111111111111, + "grad_norm": 2.9199023246765137, + "learning_rate": 3.734875444839858e-05, + "loss": 1.3195, + "step": 9151 + }, + { + "epoch": 4.067555555555556, + "grad_norm": 3.4877567291259766, + "learning_rate": 3.733096085409253e-05, + "loss": 1.3656, + "step": 9152 + }, + { + "epoch": 4.068, + "grad_norm": 2.9192590713500977, + "learning_rate": 3.731316725978648e-05, + "loss": 1.1056, + "step": 9153 + }, + { + "epoch": 4.068444444444444, + "grad_norm": 4.004071235656738, + "learning_rate": 3.729537366548043e-05, + "loss": 1.1344, + "step": 9154 + }, + { + "epoch": 4.068888888888889, + "grad_norm": 3.422654628753662, + "learning_rate": 3.727758007117438e-05, + "loss": 0.9968, + "step": 9155 + }, + { + "epoch": 4.069333333333334, + "grad_norm": 3.3041553497314453, + "learning_rate": 3.725978647686833e-05, + "loss": 1.4082, + "step": 9156 + }, + { + "epoch": 4.0697777777777775, + "grad_norm": 3.1231582164764404, + "learning_rate": 3.7241992882562285e-05, + "loss": 1.2489, + "step": 9157 + }, + { + "epoch": 4.070222222222222, + "grad_norm": 2.934847593307495, + "learning_rate": 
3.722419928825623e-05, + "loss": 1.0434, + "step": 9158 + }, + { + "epoch": 4.070666666666667, + "grad_norm": 4.84507942199707, + "learning_rate": 3.720640569395018e-05, + "loss": 1.56, + "step": 9159 + }, + { + "epoch": 4.071111111111111, + "grad_norm": 4.400614261627197, + "learning_rate": 3.7188612099644134e-05, + "loss": 1.1331, + "step": 9160 + }, + { + "epoch": 4.071555555555555, + "grad_norm": 4.824821949005127, + "learning_rate": 3.717081850533808e-05, + "loss": 0.8231, + "step": 9161 + }, + { + "epoch": 4.072, + "grad_norm": 3.7322332859039307, + "learning_rate": 3.715302491103203e-05, + "loss": 1.0261, + "step": 9162 + }, + { + "epoch": 4.072444444444445, + "grad_norm": 3.8609957695007324, + "learning_rate": 3.713523131672598e-05, + "loss": 0.9616, + "step": 9163 + }, + { + "epoch": 4.072888888888889, + "grad_norm": 5.574636936187744, + "learning_rate": 3.7117437722419926e-05, + "loss": 1.2415, + "step": 9164 + }, + { + "epoch": 4.073333333333333, + "grad_norm": 4.391664028167725, + "learning_rate": 3.709964412811388e-05, + "loss": 1.3483, + "step": 9165 + }, + { + "epoch": 4.073777777777778, + "grad_norm": 3.788674831390381, + "learning_rate": 3.708185053380783e-05, + "loss": 1.0001, + "step": 9166 + }, + { + "epoch": 4.074222222222223, + "grad_norm": 4.3154072761535645, + "learning_rate": 3.706405693950178e-05, + "loss": 1.3682, + "step": 9167 + }, + { + "epoch": 4.074666666666666, + "grad_norm": 3.440797805786133, + "learning_rate": 3.704626334519573e-05, + "loss": 0.8641, + "step": 9168 + }, + { + "epoch": 4.075111111111111, + "grad_norm": 4.154467582702637, + "learning_rate": 3.702846975088968e-05, + "loss": 0.9788, + "step": 9169 + }, + { + "epoch": 4.075555555555556, + "grad_norm": 3.506791591644287, + "learning_rate": 3.701067615658363e-05, + "loss": 0.9756, + "step": 9170 + }, + { + "epoch": 4.076, + "grad_norm": 4.369688987731934, + "learning_rate": 3.699288256227758e-05, + "loss": 1.4468, + "step": 9171 + }, + { + "epoch": 4.076444444444444, + "grad_norm": 4.314336776733398, + "learning_rate": 3.697508896797153e-05, + "loss": 0.9896, + "step": 9172 + }, + { + "epoch": 4.076888888888889, + "grad_norm": 3.7983624935150146, + "learning_rate": 3.695729537366549e-05, + "loss": 1.0006, + "step": 9173 + }, + { + "epoch": 4.077333333333334, + "grad_norm": 3.259927749633789, + "learning_rate": 3.693950177935943e-05, + "loss": 0.7188, + "step": 9174 + }, + { + "epoch": 4.0777777777777775, + "grad_norm": 3.296593189239502, + "learning_rate": 3.6921708185053386e-05, + "loss": 0.5057, + "step": 9175 + }, + { + "epoch": 4.078222222222222, + "grad_norm": 3.704531669616699, + "learning_rate": 3.6903914590747336e-05, + "loss": 0.7359, + "step": 9176 + }, + { + "epoch": 4.078666666666667, + "grad_norm": 3.4545626640319824, + "learning_rate": 3.688612099644128e-05, + "loss": 0.784, + "step": 9177 + }, + { + "epoch": 4.079111111111111, + "grad_norm": 3.940635919570923, + "learning_rate": 3.6868327402135236e-05, + "loss": 0.9062, + "step": 9178 + }, + { + "epoch": 4.079555555555555, + "grad_norm": 3.7405149936676025, + "learning_rate": 3.6850533807829185e-05, + "loss": 1.1758, + "step": 9179 + }, + { + "epoch": 4.08, + "grad_norm": 3.7189319133758545, + "learning_rate": 3.683274021352313e-05, + "loss": 1.0273, + "step": 9180 + }, + { + "epoch": 4.080444444444445, + "grad_norm": 3.3661651611328125, + "learning_rate": 3.6814946619217085e-05, + "loss": 0.5712, + "step": 9181 + }, + { + "epoch": 4.080888888888889, + "grad_norm": 4.403657913208008, + "learning_rate": 3.6797153024911034e-05, + 
"loss": 1.1332, + "step": 9182 + }, + { + "epoch": 4.081333333333333, + "grad_norm": 3.4443655014038086, + "learning_rate": 3.6779359430604984e-05, + "loss": 1.0131, + "step": 9183 + }, + { + "epoch": 4.081777777777778, + "grad_norm": 6.885471343994141, + "learning_rate": 3.6761565836298934e-05, + "loss": 0.9145, + "step": 9184 + }, + { + "epoch": 4.082222222222223, + "grad_norm": 4.67835807800293, + "learning_rate": 3.6743772241992884e-05, + "loss": 0.8626, + "step": 9185 + }, + { + "epoch": 4.082666666666666, + "grad_norm": 4.347279071807861, + "learning_rate": 3.672597864768684e-05, + "loss": 0.753, + "step": 9186 + }, + { + "epoch": 4.083111111111111, + "grad_norm": 4.374654293060303, + "learning_rate": 3.670818505338078e-05, + "loss": 0.8974, + "step": 9187 + }, + { + "epoch": 4.083555555555556, + "grad_norm": 3.8928260803222656, + "learning_rate": 3.669039145907473e-05, + "loss": 0.8108, + "step": 9188 + }, + { + "epoch": 4.084, + "grad_norm": 5.292436122894287, + "learning_rate": 3.667259786476869e-05, + "loss": 0.9941, + "step": 9189 + }, + { + "epoch": 4.084444444444444, + "grad_norm": 4.306451320648193, + "learning_rate": 3.665480427046263e-05, + "loss": 1.0679, + "step": 9190 + }, + { + "epoch": 4.084888888888889, + "grad_norm": 4.150672435760498, + "learning_rate": 3.663701067615659e-05, + "loss": 0.7973, + "step": 9191 + }, + { + "epoch": 4.085333333333334, + "grad_norm": 3.7112274169921875, + "learning_rate": 3.661921708185054e-05, + "loss": 0.7108, + "step": 9192 + }, + { + "epoch": 4.0857777777777775, + "grad_norm": 4.035175323486328, + "learning_rate": 3.660142348754448e-05, + "loss": 0.8418, + "step": 9193 + }, + { + "epoch": 4.086222222222222, + "grad_norm": 4.420337677001953, + "learning_rate": 3.658362989323844e-05, + "loss": 0.8937, + "step": 9194 + }, + { + "epoch": 4.086666666666667, + "grad_norm": 4.49367618560791, + "learning_rate": 3.656583629893239e-05, + "loss": 1.1035, + "step": 9195 + }, + { + "epoch": 4.087111111111111, + "grad_norm": 5.570310115814209, + "learning_rate": 3.654804270462634e-05, + "loss": 0.7943, + "step": 9196 + }, + { + "epoch": 4.087555555555555, + "grad_norm": 7.375543117523193, + "learning_rate": 3.653024911032029e-05, + "loss": 1.0006, + "step": 9197 + }, + { + "epoch": 4.088, + "grad_norm": 3.2930474281311035, + "learning_rate": 3.651245551601424e-05, + "loss": 0.3692, + "step": 9198 + }, + { + "epoch": 4.088444444444445, + "grad_norm": 2.375638961791992, + "learning_rate": 3.6494661921708186e-05, + "loss": 0.151, + "step": 9199 + }, + { + "epoch": 4.088888888888889, + "grad_norm": 3.8127799034118652, + "learning_rate": 3.6476868327402136e-05, + "loss": 0.318, + "step": 9200 + }, + { + "epoch": 4.089333333333333, + "grad_norm": 0.2315405160188675, + "learning_rate": 3.6459074733096086e-05, + "loss": 0.0145, + "step": 9201 + }, + { + "epoch": 4.089777777777778, + "grad_norm": 2.930769920349121, + "learning_rate": 3.644128113879004e-05, + "loss": 1.5272, + "step": 9202 + }, + { + "epoch": 4.090222222222223, + "grad_norm": 2.6000430583953857, + "learning_rate": 3.6423487544483985e-05, + "loss": 1.2693, + "step": 9203 + }, + { + "epoch": 4.0906666666666665, + "grad_norm": 3.497532367706299, + "learning_rate": 3.6405693950177935e-05, + "loss": 1.5588, + "step": 9204 + }, + { + "epoch": 4.091111111111111, + "grad_norm": 3.512805223464966, + "learning_rate": 3.638790035587189e-05, + "loss": 1.3456, + "step": 9205 + }, + { + "epoch": 4.091555555555556, + "grad_norm": 4.103816032409668, + "learning_rate": 3.6370106761565835e-05, + "loss": 
0.9345, + "step": 9206 + }, + { + "epoch": 4.092, + "grad_norm": 3.1070504188537598, + "learning_rate": 3.635231316725979e-05, + "loss": 1.1946, + "step": 9207 + }, + { + "epoch": 4.092444444444444, + "grad_norm": 3.410989999771118, + "learning_rate": 3.633451957295374e-05, + "loss": 1.0064, + "step": 9208 + }, + { + "epoch": 4.092888888888889, + "grad_norm": 2.981509208679199, + "learning_rate": 3.6316725978647684e-05, + "loss": 0.8163, + "step": 9209 + }, + { + "epoch": 4.093333333333334, + "grad_norm": 2.2597262859344482, + "learning_rate": 3.629893238434164e-05, + "loss": 0.5533, + "step": 9210 + }, + { + "epoch": 4.0937777777777775, + "grad_norm": 2.4587838649749756, + "learning_rate": 3.628113879003559e-05, + "loss": 0.6522, + "step": 9211 + }, + { + "epoch": 4.094222222222222, + "grad_norm": 3.2743875980377197, + "learning_rate": 3.626334519572954e-05, + "loss": 0.9227, + "step": 9212 + }, + { + "epoch": 4.094666666666667, + "grad_norm": 3.8894238471984863, + "learning_rate": 3.624555160142349e-05, + "loss": 1.2398, + "step": 9213 + }, + { + "epoch": 4.095111111111111, + "grad_norm": 4.327610969543457, + "learning_rate": 3.622775800711744e-05, + "loss": 1.296, + "step": 9214 + }, + { + "epoch": 4.095555555555555, + "grad_norm": 3.491788864135742, + "learning_rate": 3.6209964412811396e-05, + "loss": 0.9961, + "step": 9215 + }, + { + "epoch": 4.096, + "grad_norm": 4.091619968414307, + "learning_rate": 3.619217081850534e-05, + "loss": 1.0178, + "step": 9216 + }, + { + "epoch": 4.096444444444445, + "grad_norm": 5.12504243850708, + "learning_rate": 3.617437722419929e-05, + "loss": 0.9988, + "step": 9217 + }, + { + "epoch": 4.096888888888889, + "grad_norm": 5.116464614868164, + "learning_rate": 3.6156583629893245e-05, + "loss": 1.3894, + "step": 9218 + }, + { + "epoch": 4.097333333333333, + "grad_norm": 3.9038383960723877, + "learning_rate": 3.613879003558719e-05, + "loss": 1.2835, + "step": 9219 + }, + { + "epoch": 4.097777777777778, + "grad_norm": 4.344372272491455, + "learning_rate": 3.6120996441281144e-05, + "loss": 1.0891, + "step": 9220 + }, + { + "epoch": 4.098222222222223, + "grad_norm": 4.03400993347168, + "learning_rate": 3.6103202846975094e-05, + "loss": 1.1626, + "step": 9221 + }, + { + "epoch": 4.0986666666666665, + "grad_norm": 4.054080486297607, + "learning_rate": 3.608540925266904e-05, + "loss": 1.0427, + "step": 9222 + }, + { + "epoch": 4.099111111111111, + "grad_norm": 4.079352855682373, + "learning_rate": 3.606761565836299e-05, + "loss": 1.0579, + "step": 9223 + }, + { + "epoch": 4.099555555555556, + "grad_norm": 3.899838924407959, + "learning_rate": 3.604982206405694e-05, + "loss": 0.7473, + "step": 9224 + }, + { + "epoch": 4.1, + "grad_norm": 3.900310754776001, + "learning_rate": 3.6032028469750886e-05, + "loss": 1.0974, + "step": 9225 + }, + { + "epoch": 4.100444444444444, + "grad_norm": 3.1842355728149414, + "learning_rate": 3.601423487544484e-05, + "loss": 0.7394, + "step": 9226 + }, + { + "epoch": 4.100888888888889, + "grad_norm": 3.721182346343994, + "learning_rate": 3.599644128113879e-05, + "loss": 0.7716, + "step": 9227 + }, + { + "epoch": 4.101333333333334, + "grad_norm": 5.3094353675842285, + "learning_rate": 3.597864768683274e-05, + "loss": 0.9151, + "step": 9228 + }, + { + "epoch": 4.1017777777777775, + "grad_norm": 3.914445638656616, + "learning_rate": 3.596085409252669e-05, + "loss": 1.0772, + "step": 9229 + }, + { + "epoch": 4.102222222222222, + "grad_norm": 4.377343654632568, + "learning_rate": 3.594306049822064e-05, + "loss": 0.9153, + "step": 9230 + 
}, + { + "epoch": 4.102666666666667, + "grad_norm": 3.879788637161255, + "learning_rate": 3.59252669039146e-05, + "loss": 0.6954, + "step": 9231 + }, + { + "epoch": 4.103111111111111, + "grad_norm": 4.046523571014404, + "learning_rate": 3.590747330960854e-05, + "loss": 0.8208, + "step": 9232 + }, + { + "epoch": 4.103555555555555, + "grad_norm": 4.044562339782715, + "learning_rate": 3.588967971530249e-05, + "loss": 0.9976, + "step": 9233 + }, + { + "epoch": 4.104, + "grad_norm": 3.7858481407165527, + "learning_rate": 3.587188612099645e-05, + "loss": 0.9062, + "step": 9234 + }, + { + "epoch": 4.104444444444445, + "grad_norm": 2.641073226928711, + "learning_rate": 3.585409252669039e-05, + "loss": 0.4488, + "step": 9235 + }, + { + "epoch": 4.104888888888889, + "grad_norm": 3.873842239379883, + "learning_rate": 3.5836298932384346e-05, + "loss": 0.8443, + "step": 9236 + }, + { + "epoch": 4.105333333333333, + "grad_norm": 3.6468522548675537, + "learning_rate": 3.5818505338078296e-05, + "loss": 0.8935, + "step": 9237 + }, + { + "epoch": 4.105777777777778, + "grad_norm": 4.058321952819824, + "learning_rate": 3.580071174377224e-05, + "loss": 0.7038, + "step": 9238 + }, + { + "epoch": 4.106222222222222, + "grad_norm": 4.622478008270264, + "learning_rate": 3.5782918149466196e-05, + "loss": 1.1684, + "step": 9239 + }, + { + "epoch": 4.1066666666666665, + "grad_norm": 3.3128762245178223, + "learning_rate": 3.5765124555160145e-05, + "loss": 0.6763, + "step": 9240 + }, + { + "epoch": 4.107111111111111, + "grad_norm": 3.4522128105163574, + "learning_rate": 3.5747330960854095e-05, + "loss": 0.7716, + "step": 9241 + }, + { + "epoch": 4.107555555555556, + "grad_norm": 4.44683837890625, + "learning_rate": 3.5729537366548045e-05, + "loss": 1.36, + "step": 9242 + }, + { + "epoch": 4.108, + "grad_norm": 4.440934181213379, + "learning_rate": 3.5711743772241995e-05, + "loss": 0.9069, + "step": 9243 + }, + { + "epoch": 4.108444444444444, + "grad_norm": 3.85774302482605, + "learning_rate": 3.5693950177935944e-05, + "loss": 0.7827, + "step": 9244 + }, + { + "epoch": 4.108888888888889, + "grad_norm": 5.362185955047607, + "learning_rate": 3.5676156583629894e-05, + "loss": 1.2746, + "step": 9245 + }, + { + "epoch": 4.109333333333334, + "grad_norm": 3.934819221496582, + "learning_rate": 3.5658362989323844e-05, + "loss": 0.7316, + "step": 9246 + }, + { + "epoch": 4.1097777777777775, + "grad_norm": 4.532813549041748, + "learning_rate": 3.56405693950178e-05, + "loss": 0.6302, + "step": 9247 + }, + { + "epoch": 4.110222222222222, + "grad_norm": 4.29311990737915, + "learning_rate": 3.562277580071174e-05, + "loss": 0.5404, + "step": 9248 + }, + { + "epoch": 4.110666666666667, + "grad_norm": 3.867619276046753, + "learning_rate": 3.560498220640569e-05, + "loss": 0.8902, + "step": 9249 + }, + { + "epoch": 4.111111111111111, + "grad_norm": 5.4382805824279785, + "learning_rate": 3.558718861209965e-05, + "loss": 0.8051, + "step": 9250 + }, + { + "epoch": 4.111555555555555, + "grad_norm": 1.7659415006637573, + "learning_rate": 3.556939501779359e-05, + "loss": 0.8157, + "step": 9251 + }, + { + "epoch": 4.112, + "grad_norm": 2.5079727172851562, + "learning_rate": 3.555160142348755e-05, + "loss": 1.3793, + "step": 9252 + }, + { + "epoch": 4.112444444444445, + "grad_norm": 3.136521100997925, + "learning_rate": 3.55338078291815e-05, + "loss": 1.223, + "step": 9253 + }, + { + "epoch": 4.112888888888889, + "grad_norm": 3.1797564029693604, + "learning_rate": 3.551601423487544e-05, + "loss": 1.2796, + "step": 9254 + }, + { + "epoch": 
4.113333333333333, + "grad_norm": 3.613799810409546, + "learning_rate": 3.54982206405694e-05, + "loss": 1.1687, + "step": 9255 + }, + { + "epoch": 4.113777777777778, + "grad_norm": 3.8610999584198, + "learning_rate": 3.548042704626335e-05, + "loss": 0.957, + "step": 9256 + }, + { + "epoch": 4.114222222222223, + "grad_norm": 3.316948890686035, + "learning_rate": 3.54626334519573e-05, + "loss": 1.4552, + "step": 9257 + }, + { + "epoch": 4.1146666666666665, + "grad_norm": 3.1068124771118164, + "learning_rate": 3.544483985765125e-05, + "loss": 0.9491, + "step": 9258 + }, + { + "epoch": 4.115111111111111, + "grad_norm": 3.8049232959747314, + "learning_rate": 3.54270462633452e-05, + "loss": 1.0955, + "step": 9259 + }, + { + "epoch": 4.115555555555556, + "grad_norm": 3.263183355331421, + "learning_rate": 3.540925266903915e-05, + "loss": 0.8825, + "step": 9260 + }, + { + "epoch": 4.116, + "grad_norm": 3.227997303009033, + "learning_rate": 3.5391459074733096e-05, + "loss": 1.1426, + "step": 9261 + }, + { + "epoch": 4.116444444444444, + "grad_norm": 3.8824005126953125, + "learning_rate": 3.5373665480427046e-05, + "loss": 1.0386, + "step": 9262 + }, + { + "epoch": 4.116888888888889, + "grad_norm": 4.455204486846924, + "learning_rate": 3.5355871886121e-05, + "loss": 1.0883, + "step": 9263 + }, + { + "epoch": 4.117333333333334, + "grad_norm": 3.621575355529785, + "learning_rate": 3.5338078291814945e-05, + "loss": 1.4027, + "step": 9264 + }, + { + "epoch": 4.1177777777777775, + "grad_norm": 3.8370847702026367, + "learning_rate": 3.53202846975089e-05, + "loss": 1.212, + "step": 9265 + }, + { + "epoch": 4.118222222222222, + "grad_norm": 3.060487747192383, + "learning_rate": 3.530249110320285e-05, + "loss": 0.8656, + "step": 9266 + }, + { + "epoch": 4.118666666666667, + "grad_norm": 3.36088228225708, + "learning_rate": 3.5284697508896795e-05, + "loss": 0.7663, + "step": 9267 + }, + { + "epoch": 4.119111111111111, + "grad_norm": 3.684251308441162, + "learning_rate": 3.526690391459075e-05, + "loss": 0.9489, + "step": 9268 + }, + { + "epoch": 4.119555555555555, + "grad_norm": 4.51201868057251, + "learning_rate": 3.52491103202847e-05, + "loss": 1.1845, + "step": 9269 + }, + { + "epoch": 4.12, + "grad_norm": 4.410719394683838, + "learning_rate": 3.5231316725978644e-05, + "loss": 1.702, + "step": 9270 + }, + { + "epoch": 4.120444444444445, + "grad_norm": 3.741725444793701, + "learning_rate": 3.52135231316726e-05, + "loss": 0.7647, + "step": 9271 + }, + { + "epoch": 4.120888888888889, + "grad_norm": 4.066876411437988, + "learning_rate": 3.519572953736655e-05, + "loss": 1.046, + "step": 9272 + }, + { + "epoch": 4.121333333333333, + "grad_norm": 3.6181998252868652, + "learning_rate": 3.51779359430605e-05, + "loss": 1.0439, + "step": 9273 + }, + { + "epoch": 4.121777777777778, + "grad_norm": 4.158766746520996, + "learning_rate": 3.516014234875445e-05, + "loss": 0.8574, + "step": 9274 + }, + { + "epoch": 4.122222222222222, + "grad_norm": 3.483020067214966, + "learning_rate": 3.51423487544484e-05, + "loss": 0.9312, + "step": 9275 + }, + { + "epoch": 4.1226666666666665, + "grad_norm": 3.8150248527526855, + "learning_rate": 3.5124555160142356e-05, + "loss": 0.7764, + "step": 9276 + }, + { + "epoch": 4.123111111111111, + "grad_norm": 4.526205062866211, + "learning_rate": 3.51067615658363e-05, + "loss": 0.9631, + "step": 9277 + }, + { + "epoch": 4.123555555555556, + "grad_norm": 3.8249216079711914, + "learning_rate": 3.508896797153025e-05, + "loss": 0.7817, + "step": 9278 + }, + { + "epoch": 4.124, + "grad_norm": 
3.743093729019165, + "learning_rate": 3.5071174377224205e-05, + "loss": 0.9439, + "step": 9279 + }, + { + "epoch": 4.124444444444444, + "grad_norm": 4.130136966705322, + "learning_rate": 3.505338078291815e-05, + "loss": 0.8861, + "step": 9280 + }, + { + "epoch": 4.124888888888889, + "grad_norm": 4.320974826812744, + "learning_rate": 3.5035587188612104e-05, + "loss": 0.9908, + "step": 9281 + }, + { + "epoch": 4.125333333333334, + "grad_norm": 4.112794876098633, + "learning_rate": 3.5017793594306054e-05, + "loss": 0.8566, + "step": 9282 + }, + { + "epoch": 4.1257777777777775, + "grad_norm": 4.176024913787842, + "learning_rate": 3.5e-05, + "loss": 0.9719, + "step": 9283 + }, + { + "epoch": 4.126222222222222, + "grad_norm": 3.905268669128418, + "learning_rate": 3.4982206405693953e-05, + "loss": 0.8443, + "step": 9284 + }, + { + "epoch": 4.126666666666667, + "grad_norm": 4.007937431335449, + "learning_rate": 3.49644128113879e-05, + "loss": 0.6952, + "step": 9285 + }, + { + "epoch": 4.127111111111111, + "grad_norm": 4.117457866668701, + "learning_rate": 3.494661921708185e-05, + "loss": 1.1521, + "step": 9286 + }, + { + "epoch": 4.127555555555555, + "grad_norm": 4.604625225067139, + "learning_rate": 3.49288256227758e-05, + "loss": 0.7057, + "step": 9287 + }, + { + "epoch": 4.128, + "grad_norm": 4.086756706237793, + "learning_rate": 3.491103202846975e-05, + "loss": 0.7674, + "step": 9288 + }, + { + "epoch": 4.128444444444445, + "grad_norm": 3.6481192111968994, + "learning_rate": 3.48932384341637e-05, + "loss": 0.5105, + "step": 9289 + }, + { + "epoch": 4.128888888888889, + "grad_norm": 3.9914069175720215, + "learning_rate": 3.487544483985765e-05, + "loss": 0.8233, + "step": 9290 + }, + { + "epoch": 4.129333333333333, + "grad_norm": 4.540968418121338, + "learning_rate": 3.48576512455516e-05, + "loss": 1.2001, + "step": 9291 + }, + { + "epoch": 4.129777777777778, + "grad_norm": 4.1771769523620605, + "learning_rate": 3.483985765124556e-05, + "loss": 0.759, + "step": 9292 + }, + { + "epoch": 4.130222222222222, + "grad_norm": 4.386975288391113, + "learning_rate": 3.48220640569395e-05, + "loss": 0.8304, + "step": 9293 + }, + { + "epoch": 4.1306666666666665, + "grad_norm": 3.9730632305145264, + "learning_rate": 3.480427046263345e-05, + "loss": 0.7385, + "step": 9294 + }, + { + "epoch": 4.131111111111111, + "grad_norm": 3.7220194339752197, + "learning_rate": 3.478647686832741e-05, + "loss": 0.8544, + "step": 9295 + }, + { + "epoch": 4.131555555555556, + "grad_norm": 3.411055564880371, + "learning_rate": 3.476868327402135e-05, + "loss": 0.7001, + "step": 9296 + }, + { + "epoch": 4.132, + "grad_norm": 4.14982795715332, + "learning_rate": 3.4750889679715307e-05, + "loss": 0.9112, + "step": 9297 + }, + { + "epoch": 4.132444444444444, + "grad_norm": 4.247723579406738, + "learning_rate": 3.4733096085409256e-05, + "loss": 0.8633, + "step": 9298 + }, + { + "epoch": 4.132888888888889, + "grad_norm": 5.790535926818848, + "learning_rate": 3.47153024911032e-05, + "loss": 0.6881, + "step": 9299 + }, + { + "epoch": 4.133333333333334, + "grad_norm": 3.3628830909729004, + "learning_rate": 3.4697508896797156e-05, + "loss": 0.5777, + "step": 9300 + }, + { + "epoch": 4.1337777777777776, + "grad_norm": 2.9413001537323, + "learning_rate": 3.4679715302491105e-05, + "loss": 1.5624, + "step": 9301 + }, + { + "epoch": 4.134222222222222, + "grad_norm": 2.9262659549713135, + "learning_rate": 3.4661921708185055e-05, + "loss": 1.451, + "step": 9302 + }, + { + "epoch": 4.134666666666667, + "grad_norm": 2.8405566215515137, + 
"learning_rate": 3.4644128113879005e-05, + "loss": 1.6277, + "step": 9303 + }, + { + "epoch": 4.135111111111111, + "grad_norm": 1.9037202596664429, + "learning_rate": 3.4626334519572955e-05, + "loss": 0.3974, + "step": 9304 + }, + { + "epoch": 4.135555555555555, + "grad_norm": 3.3218352794647217, + "learning_rate": 3.460854092526691e-05, + "loss": 1.4065, + "step": 9305 + }, + { + "epoch": 4.136, + "grad_norm": 3.091264009475708, + "learning_rate": 3.4590747330960854e-05, + "loss": 1.1051, + "step": 9306 + }, + { + "epoch": 4.136444444444445, + "grad_norm": 3.341606616973877, + "learning_rate": 3.4572953736654804e-05, + "loss": 1.2214, + "step": 9307 + }, + { + "epoch": 4.136888888888889, + "grad_norm": 3.4468531608581543, + "learning_rate": 3.455516014234876e-05, + "loss": 1.1427, + "step": 9308 + }, + { + "epoch": 4.137333333333333, + "grad_norm": 3.611560583114624, + "learning_rate": 3.45373665480427e-05, + "loss": 1.0753, + "step": 9309 + }, + { + "epoch": 4.137777777777778, + "grad_norm": 4.02140474319458, + "learning_rate": 3.451957295373665e-05, + "loss": 1.0676, + "step": 9310 + }, + { + "epoch": 4.138222222222222, + "grad_norm": 3.795001268386841, + "learning_rate": 3.450177935943061e-05, + "loss": 1.0637, + "step": 9311 + }, + { + "epoch": 4.1386666666666665, + "grad_norm": 3.871553659439087, + "learning_rate": 3.448398576512455e-05, + "loss": 1.1937, + "step": 9312 + }, + { + "epoch": 4.139111111111111, + "grad_norm": 3.7541697025299072, + "learning_rate": 3.446619217081851e-05, + "loss": 0.9619, + "step": 9313 + }, + { + "epoch": 4.139555555555556, + "grad_norm": 3.9635727405548096, + "learning_rate": 3.444839857651246e-05, + "loss": 0.9555, + "step": 9314 + }, + { + "epoch": 4.14, + "grad_norm": 4.11905574798584, + "learning_rate": 3.44306049822064e-05, + "loss": 1.1721, + "step": 9315 + }, + { + "epoch": 4.140444444444444, + "grad_norm": 4.109530448913574, + "learning_rate": 3.441281138790036e-05, + "loss": 1.0309, + "step": 9316 + }, + { + "epoch": 4.140888888888889, + "grad_norm": 4.221639633178711, + "learning_rate": 3.439501779359431e-05, + "loss": 0.8867, + "step": 9317 + }, + { + "epoch": 4.141333333333334, + "grad_norm": 4.1241068840026855, + "learning_rate": 3.437722419928826e-05, + "loss": 1.2, + "step": 9318 + }, + { + "epoch": 4.141777777777778, + "grad_norm": 3.115713596343994, + "learning_rate": 3.435943060498221e-05, + "loss": 0.8329, + "step": 9319 + }, + { + "epoch": 4.142222222222222, + "grad_norm": 4.011024475097656, + "learning_rate": 3.434163701067616e-05, + "loss": 0.9659, + "step": 9320 + }, + { + "epoch": 4.142666666666667, + "grad_norm": 4.23065185546875, + "learning_rate": 3.4323843416370113e-05, + "loss": 0.834, + "step": 9321 + }, + { + "epoch": 4.143111111111111, + "grad_norm": 4.004124641418457, + "learning_rate": 3.4306049822064056e-05, + "loss": 0.8905, + "step": 9322 + }, + { + "epoch": 4.143555555555555, + "grad_norm": 4.145106315612793, + "learning_rate": 3.4288256227758006e-05, + "loss": 0.8814, + "step": 9323 + }, + { + "epoch": 4.144, + "grad_norm": 3.7468135356903076, + "learning_rate": 3.427046263345196e-05, + "loss": 1.143, + "step": 9324 + }, + { + "epoch": 4.144444444444445, + "grad_norm": 4.9056596755981445, + "learning_rate": 3.4252669039145906e-05, + "loss": 0.8847, + "step": 9325 + }, + { + "epoch": 4.144888888888889, + "grad_norm": 3.6427161693573, + "learning_rate": 3.423487544483986e-05, + "loss": 0.7846, + "step": 9326 + }, + { + "epoch": 4.145333333333333, + "grad_norm": 3.6018316745758057, + "learning_rate": 
3.421708185053381e-05, + "loss": 1.1534, + "step": 9327 + }, + { + "epoch": 4.145777777777778, + "grad_norm": 4.335727214813232, + "learning_rate": 3.4199288256227755e-05, + "loss": 0.7984, + "step": 9328 + }, + { + "epoch": 4.146222222222222, + "grad_norm": 2.9448020458221436, + "learning_rate": 3.418149466192171e-05, + "loss": 0.6594, + "step": 9329 + }, + { + "epoch": 4.1466666666666665, + "grad_norm": 5.207244396209717, + "learning_rate": 3.416370106761566e-05, + "loss": 0.9542, + "step": 9330 + }, + { + "epoch": 4.147111111111111, + "grad_norm": 4.200593948364258, + "learning_rate": 3.414590747330961e-05, + "loss": 0.7464, + "step": 9331 + }, + { + "epoch": 4.147555555555556, + "grad_norm": 3.8014614582061768, + "learning_rate": 3.412811387900356e-05, + "loss": 0.8468, + "step": 9332 + }, + { + "epoch": 4.148, + "grad_norm": 4.140382289886475, + "learning_rate": 3.411032028469751e-05, + "loss": 1.0457, + "step": 9333 + }, + { + "epoch": 4.148444444444444, + "grad_norm": 4.095797061920166, + "learning_rate": 3.409252669039146e-05, + "loss": 0.8541, + "step": 9334 + }, + { + "epoch": 4.148888888888889, + "grad_norm": 4.9063825607299805, + "learning_rate": 3.407473309608541e-05, + "loss": 1.012, + "step": 9335 + }, + { + "epoch": 4.149333333333334, + "grad_norm": 5.420862197875977, + "learning_rate": 3.405693950177936e-05, + "loss": 1.0987, + "step": 9336 + }, + { + "epoch": 4.149777777777778, + "grad_norm": 4.148375988006592, + "learning_rate": 3.4039145907473316e-05, + "loss": 0.7677, + "step": 9337 + }, + { + "epoch": 4.150222222222222, + "grad_norm": 4.134524345397949, + "learning_rate": 3.402135231316726e-05, + "loss": 0.7619, + "step": 9338 + }, + { + "epoch": 4.150666666666667, + "grad_norm": 3.2314364910125732, + "learning_rate": 3.400355871886121e-05, + "loss": 0.6115, + "step": 9339 + }, + { + "epoch": 4.151111111111111, + "grad_norm": 4.153497219085693, + "learning_rate": 3.3985765124555165e-05, + "loss": 1.0227, + "step": 9340 + }, + { + "epoch": 4.151555555555555, + "grad_norm": 5.603724002838135, + "learning_rate": 3.396797153024911e-05, + "loss": 1.2158, + "step": 9341 + }, + { + "epoch": 4.152, + "grad_norm": 4.0254716873168945, + "learning_rate": 3.3950177935943064e-05, + "loss": 0.9139, + "step": 9342 + }, + { + "epoch": 4.152444444444445, + "grad_norm": 3.7253804206848145, + "learning_rate": 3.3932384341637014e-05, + "loss": 0.6698, + "step": 9343 + }, + { + "epoch": 4.152888888888889, + "grad_norm": 3.8816728591918945, + "learning_rate": 3.391459074733096e-05, + "loss": 0.5557, + "step": 9344 + }, + { + "epoch": 4.153333333333333, + "grad_norm": 4.364738941192627, + "learning_rate": 3.3896797153024914e-05, + "loss": 0.7052, + "step": 9345 + }, + { + "epoch": 4.153777777777778, + "grad_norm": 4.066445827484131, + "learning_rate": 3.387900355871886e-05, + "loss": 0.6252, + "step": 9346 + }, + { + "epoch": 4.154222222222222, + "grad_norm": 5.738955497741699, + "learning_rate": 3.386120996441281e-05, + "loss": 1.3448, + "step": 9347 + }, + { + "epoch": 4.1546666666666665, + "grad_norm": 4.564332962036133, + "learning_rate": 3.384341637010676e-05, + "loss": 0.5821, + "step": 9348 + }, + { + "epoch": 4.155111111111111, + "grad_norm": 4.382009983062744, + "learning_rate": 3.382562277580071e-05, + "loss": 0.7055, + "step": 9349 + }, + { + "epoch": 4.155555555555556, + "grad_norm": 3.8401434421539307, + "learning_rate": 3.380782918149467e-05, + "loss": 0.4056, + "step": 9350 + }, + { + "epoch": 4.156, + "grad_norm": 2.941746711730957, + "learning_rate": 3.379003558718861e-05, 
+ "loss": 1.3041, + "step": 9351 + }, + { + "epoch": 4.156444444444444, + "grad_norm": 3.0022342205047607, + "learning_rate": 3.377224199288256e-05, + "loss": 1.2223, + "step": 9352 + }, + { + "epoch": 4.156888888888889, + "grad_norm": 3.1641345024108887, + "learning_rate": 3.375444839857652e-05, + "loss": 1.1629, + "step": 9353 + }, + { + "epoch": 4.157333333333334, + "grad_norm": 3.8065378665924072, + "learning_rate": 3.373665480427046e-05, + "loss": 1.7449, + "step": 9354 + }, + { + "epoch": 4.157777777777778, + "grad_norm": 3.80554461479187, + "learning_rate": 3.371886120996441e-05, + "loss": 1.2545, + "step": 9355 + }, + { + "epoch": 4.158222222222222, + "grad_norm": 3.1130597591400146, + "learning_rate": 3.370106761565837e-05, + "loss": 0.8787, + "step": 9356 + }, + { + "epoch": 4.158666666666667, + "grad_norm": 3.7158026695251465, + "learning_rate": 3.368327402135231e-05, + "loss": 1.171, + "step": 9357 + }, + { + "epoch": 4.159111111111111, + "grad_norm": 3.599818468093872, + "learning_rate": 3.366548042704627e-05, + "loss": 1.0849, + "step": 9358 + }, + { + "epoch": 4.1595555555555555, + "grad_norm": 3.1732017993927, + "learning_rate": 3.3647686832740216e-05, + "loss": 0.9308, + "step": 9359 + }, + { + "epoch": 4.16, + "grad_norm": 3.3031604290008545, + "learning_rate": 3.3629893238434166e-05, + "loss": 0.8818, + "step": 9360 + }, + { + "epoch": 4.160444444444445, + "grad_norm": 3.976465940475464, + "learning_rate": 3.3612099644128116e-05, + "loss": 1.2361, + "step": 9361 + }, + { + "epoch": 4.160888888888889, + "grad_norm": 3.4227919578552246, + "learning_rate": 3.3594306049822066e-05, + "loss": 0.9153, + "step": 9362 + }, + { + "epoch": 4.161333333333333, + "grad_norm": 4.071769714355469, + "learning_rate": 3.3576512455516015e-05, + "loss": 1.1655, + "step": 9363 + }, + { + "epoch": 4.161777777777778, + "grad_norm": 3.795288324356079, + "learning_rate": 3.3558718861209965e-05, + "loss": 0.9261, + "step": 9364 + }, + { + "epoch": 4.162222222222222, + "grad_norm": 4.52875280380249, + "learning_rate": 3.3540925266903915e-05, + "loss": 1.2094, + "step": 9365 + }, + { + "epoch": 4.1626666666666665, + "grad_norm": 3.7887825965881348, + "learning_rate": 3.352313167259787e-05, + "loss": 0.9441, + "step": 9366 + }, + { + "epoch": 4.163111111111111, + "grad_norm": 3.6934292316436768, + "learning_rate": 3.3505338078291814e-05, + "loss": 1.0685, + "step": 9367 + }, + { + "epoch": 4.163555555555556, + "grad_norm": 4.112295150756836, + "learning_rate": 3.3487544483985764e-05, + "loss": 0.8924, + "step": 9368 + }, + { + "epoch": 4.164, + "grad_norm": 4.231710910797119, + "learning_rate": 3.346975088967972e-05, + "loss": 1.2001, + "step": 9369 + }, + { + "epoch": 4.164444444444444, + "grad_norm": 4.923578262329102, + "learning_rate": 3.345195729537366e-05, + "loss": 1.2478, + "step": 9370 + }, + { + "epoch": 4.164888888888889, + "grad_norm": 4.2798895835876465, + "learning_rate": 3.343416370106762e-05, + "loss": 0.9576, + "step": 9371 + }, + { + "epoch": 4.165333333333333, + "grad_norm": 3.816706895828247, + "learning_rate": 3.341637010676157e-05, + "loss": 0.9874, + "step": 9372 + }, + { + "epoch": 4.165777777777778, + "grad_norm": 4.367424011230469, + "learning_rate": 3.339857651245551e-05, + "loss": 0.93, + "step": 9373 + }, + { + "epoch": 4.166222222222222, + "grad_norm": 3.3792166709899902, + "learning_rate": 3.338078291814947e-05, + "loss": 0.7842, + "step": 9374 + }, + { + "epoch": 4.166666666666667, + "grad_norm": 4.096624374389648, + "learning_rate": 3.336298932384342e-05, + "loss": 
0.6063, + "step": 9375 + }, + { + "epoch": 4.167111111111111, + "grad_norm": 3.6580982208251953, + "learning_rate": 3.334519572953737e-05, + "loss": 1.2232, + "step": 9376 + }, + { + "epoch": 4.1675555555555555, + "grad_norm": 4.178760528564453, + "learning_rate": 3.332740213523132e-05, + "loss": 1.1404, + "step": 9377 + }, + { + "epoch": 4.168, + "grad_norm": 2.8333945274353027, + "learning_rate": 3.330960854092527e-05, + "loss": 0.3454, + "step": 9378 + }, + { + "epoch": 4.168444444444445, + "grad_norm": 4.03084135055542, + "learning_rate": 3.329181494661922e-05, + "loss": 0.9051, + "step": 9379 + }, + { + "epoch": 4.168888888888889, + "grad_norm": 4.452610015869141, + "learning_rate": 3.327402135231317e-05, + "loss": 0.8157, + "step": 9380 + }, + { + "epoch": 4.169333333333333, + "grad_norm": 3.308134078979492, + "learning_rate": 3.325622775800712e-05, + "loss": 0.5715, + "step": 9381 + }, + { + "epoch": 4.169777777777778, + "grad_norm": 5.042409896850586, + "learning_rate": 3.3238434163701074e-05, + "loss": 0.9263, + "step": 9382 + }, + { + "epoch": 4.170222222222222, + "grad_norm": 5.0167646408081055, + "learning_rate": 3.3220640569395016e-05, + "loss": 1.4425, + "step": 9383 + }, + { + "epoch": 4.1706666666666665, + "grad_norm": 3.9779274463653564, + "learning_rate": 3.3202846975088966e-05, + "loss": 0.8751, + "step": 9384 + }, + { + "epoch": 4.171111111111111, + "grad_norm": 3.9376795291900635, + "learning_rate": 3.318505338078292e-05, + "loss": 0.9257, + "step": 9385 + }, + { + "epoch": 4.171555555555556, + "grad_norm": 6.37144136428833, + "learning_rate": 3.3167259786476866e-05, + "loss": 0.7895, + "step": 9386 + }, + { + "epoch": 4.172, + "grad_norm": 3.9572556018829346, + "learning_rate": 3.314946619217082e-05, + "loss": 0.8676, + "step": 9387 + }, + { + "epoch": 4.172444444444444, + "grad_norm": 4.135096073150635, + "learning_rate": 3.313167259786477e-05, + "loss": 0.977, + "step": 9388 + }, + { + "epoch": 4.172888888888889, + "grad_norm": 4.981659412384033, + "learning_rate": 3.311387900355872e-05, + "loss": 0.8796, + "step": 9389 + }, + { + "epoch": 4.173333333333334, + "grad_norm": 1.5012166500091553, + "learning_rate": 3.309608540925267e-05, + "loss": 0.1896, + "step": 9390 + }, + { + "epoch": 4.173777777777778, + "grad_norm": 3.713898181915283, + "learning_rate": 3.307829181494662e-05, + "loss": 0.9846, + "step": 9391 + }, + { + "epoch": 4.174222222222222, + "grad_norm": 4.347773551940918, + "learning_rate": 3.306049822064057e-05, + "loss": 0.9879, + "step": 9392 + }, + { + "epoch": 4.174666666666667, + "grad_norm": 4.975940227508545, + "learning_rate": 3.304270462633452e-05, + "loss": 0.8384, + "step": 9393 + }, + { + "epoch": 4.175111111111111, + "grad_norm": 4.148822784423828, + "learning_rate": 3.302491103202847e-05, + "loss": 0.621, + "step": 9394 + }, + { + "epoch": 4.1755555555555555, + "grad_norm": 3.4947614669799805, + "learning_rate": 3.300711743772243e-05, + "loss": 0.6469, + "step": 9395 + }, + { + "epoch": 4.176, + "grad_norm": 3.335649251937866, + "learning_rate": 3.298932384341637e-05, + "loss": 0.5926, + "step": 9396 + }, + { + "epoch": 4.176444444444445, + "grad_norm": 5.286258220672607, + "learning_rate": 3.297153024911032e-05, + "loss": 1.1108, + "step": 9397 + }, + { + "epoch": 4.176888888888889, + "grad_norm": 4.690072059631348, + "learning_rate": 3.2953736654804276e-05, + "loss": 0.9287, + "step": 9398 + }, + { + "epoch": 4.177333333333333, + "grad_norm": 5.190115928649902, + "learning_rate": 3.293594306049822e-05, + "loss": 0.8765, + "step": 9399 + 
}, + { + "epoch": 4.177777777777778, + "grad_norm": 5.4480719566345215, + "learning_rate": 3.291814946619217e-05, + "loss": 0.8428, + "step": 9400 + }, + { + "epoch": 4.178222222222222, + "grad_norm": 1.7260178327560425, + "learning_rate": 3.2900355871886125e-05, + "loss": 0.6587, + "step": 9401 + }, + { + "epoch": 4.1786666666666665, + "grad_norm": 3.0524849891662598, + "learning_rate": 3.288256227758007e-05, + "loss": 1.6461, + "step": 9402 + }, + { + "epoch": 4.179111111111111, + "grad_norm": 3.80712890625, + "learning_rate": 3.2864768683274024e-05, + "loss": 1.5275, + "step": 9403 + }, + { + "epoch": 4.179555555555556, + "grad_norm": 3.202948808670044, + "learning_rate": 3.2846975088967974e-05, + "loss": 1.101, + "step": 9404 + }, + { + "epoch": 4.18, + "grad_norm": 3.789433002471924, + "learning_rate": 3.2829181494661924e-05, + "loss": 1.4379, + "step": 9405 + }, + { + "epoch": 4.180444444444444, + "grad_norm": 3.376875162124634, + "learning_rate": 3.2811387900355874e-05, + "loss": 1.099, + "step": 9406 + }, + { + "epoch": 4.180888888888889, + "grad_norm": 3.4530889987945557, + "learning_rate": 3.279359430604982e-05, + "loss": 1.5196, + "step": 9407 + }, + { + "epoch": 4.181333333333333, + "grad_norm": 4.1841607093811035, + "learning_rate": 3.277580071174377e-05, + "loss": 1.3571, + "step": 9408 + }, + { + "epoch": 4.181777777777778, + "grad_norm": 4.057764053344727, + "learning_rate": 3.275800711743772e-05, + "loss": 1.0025, + "step": 9409 + }, + { + "epoch": 4.182222222222222, + "grad_norm": 3.6850550174713135, + "learning_rate": 3.274021352313167e-05, + "loss": 1.0223, + "step": 9410 + }, + { + "epoch": 4.182666666666667, + "grad_norm": 3.958878517150879, + "learning_rate": 3.272241992882563e-05, + "loss": 1.1901, + "step": 9411 + }, + { + "epoch": 4.183111111111111, + "grad_norm": 4.859387397766113, + "learning_rate": 3.270462633451957e-05, + "loss": 0.9504, + "step": 9412 + }, + { + "epoch": 4.1835555555555555, + "grad_norm": 2.9225330352783203, + "learning_rate": 3.268683274021352e-05, + "loss": 0.5792, + "step": 9413 + }, + { + "epoch": 4.184, + "grad_norm": 3.700796127319336, + "learning_rate": 3.266903914590748e-05, + "loss": 0.5557, + "step": 9414 + }, + { + "epoch": 4.184444444444445, + "grad_norm": 2.9793245792388916, + "learning_rate": 3.265124555160142e-05, + "loss": 0.5418, + "step": 9415 + }, + { + "epoch": 4.184888888888889, + "grad_norm": 4.47357702255249, + "learning_rate": 3.263345195729538e-05, + "loss": 1.2536, + "step": 9416 + }, + { + "epoch": 4.185333333333333, + "grad_norm": 3.908679485321045, + "learning_rate": 3.261565836298933e-05, + "loss": 1.2, + "step": 9417 + }, + { + "epoch": 4.185777777777778, + "grad_norm": 3.9256107807159424, + "learning_rate": 3.259786476868328e-05, + "loss": 0.7822, + "step": 9418 + }, + { + "epoch": 4.186222222222222, + "grad_norm": 4.371975421905518, + "learning_rate": 3.258007117437723e-05, + "loss": 0.8867, + "step": 9419 + }, + { + "epoch": 4.1866666666666665, + "grad_norm": 3.3875746726989746, + "learning_rate": 3.2562277580071177e-05, + "loss": 0.8726, + "step": 9420 + }, + { + "epoch": 4.187111111111111, + "grad_norm": 4.142739295959473, + "learning_rate": 3.2544483985765126e-05, + "loss": 0.6472, + "step": 9421 + }, + { + "epoch": 4.187555555555556, + "grad_norm": 3.9463632106781006, + "learning_rate": 3.2526690391459076e-05, + "loss": 1.1603, + "step": 9422 + }, + { + "epoch": 4.188, + "grad_norm": 5.386812210083008, + "learning_rate": 3.2508896797153026e-05, + "loss": 0.9674, + "step": 9423 + }, + { + "epoch": 
4.188444444444444, + "grad_norm": 5.0619001388549805, + "learning_rate": 3.2491103202846975e-05, + "loss": 1.3021, + "step": 9424 + }, + { + "epoch": 4.188888888888889, + "grad_norm": 4.491568565368652, + "learning_rate": 3.2473309608540925e-05, + "loss": 1.1475, + "step": 9425 + }, + { + "epoch": 4.189333333333333, + "grad_norm": 4.122431755065918, + "learning_rate": 3.2455516014234875e-05, + "loss": 0.8021, + "step": 9426 + }, + { + "epoch": 4.189777777777778, + "grad_norm": 4.491034507751465, + "learning_rate": 3.243772241992883e-05, + "loss": 1.0464, + "step": 9427 + }, + { + "epoch": 4.190222222222222, + "grad_norm": 3.6117091178894043, + "learning_rate": 3.2419928825622774e-05, + "loss": 0.8858, + "step": 9428 + }, + { + "epoch": 4.190666666666667, + "grad_norm": 5.300388813018799, + "learning_rate": 3.2402135231316724e-05, + "loss": 1.0057, + "step": 9429 + }, + { + "epoch": 4.191111111111111, + "grad_norm": 3.5130858421325684, + "learning_rate": 3.238434163701068e-05, + "loss": 0.4761, + "step": 9430 + }, + { + "epoch": 4.1915555555555555, + "grad_norm": 4.297408103942871, + "learning_rate": 3.2366548042704623e-05, + "loss": 0.8196, + "step": 9431 + }, + { + "epoch": 4.192, + "grad_norm": 4.353086471557617, + "learning_rate": 3.234875444839858e-05, + "loss": 1.031, + "step": 9432 + }, + { + "epoch": 4.192444444444445, + "grad_norm": 3.7016918659210205, + "learning_rate": 3.233096085409253e-05, + "loss": 0.8314, + "step": 9433 + }, + { + "epoch": 4.192888888888889, + "grad_norm": 3.4741709232330322, + "learning_rate": 3.231316725978648e-05, + "loss": 0.894, + "step": 9434 + }, + { + "epoch": 4.193333333333333, + "grad_norm": 3.2105934619903564, + "learning_rate": 3.229537366548043e-05, + "loss": 0.5393, + "step": 9435 + }, + { + "epoch": 4.193777777777778, + "grad_norm": 4.7250447273254395, + "learning_rate": 3.227758007117438e-05, + "loss": 1.047, + "step": 9436 + }, + { + "epoch": 4.194222222222222, + "grad_norm": 4.391071796417236, + "learning_rate": 3.225978647686833e-05, + "loss": 0.7677, + "step": 9437 + }, + { + "epoch": 4.1946666666666665, + "grad_norm": 2.0137879848480225, + "learning_rate": 3.224199288256228e-05, + "loss": 0.2664, + "step": 9438 + }, + { + "epoch": 4.195111111111111, + "grad_norm": 4.647293567657471, + "learning_rate": 3.222419928825623e-05, + "loss": 0.6575, + "step": 9439 + }, + { + "epoch": 4.195555555555556, + "grad_norm": 3.5285840034484863, + "learning_rate": 3.2206405693950184e-05, + "loss": 0.8251, + "step": 9440 + }, + { + "epoch": 4.196, + "grad_norm": 4.494802474975586, + "learning_rate": 3.218861209964413e-05, + "loss": 0.7617, + "step": 9441 + }, + { + "epoch": 4.196444444444444, + "grad_norm": 3.300382614135742, + "learning_rate": 3.217081850533808e-05, + "loss": 0.8305, + "step": 9442 + }, + { + "epoch": 4.196888888888889, + "grad_norm": 5.507649898529053, + "learning_rate": 3.2153024911032034e-05, + "loss": 0.8494, + "step": 9443 + }, + { + "epoch": 4.197333333333333, + "grad_norm": 4.006106853485107, + "learning_rate": 3.2135231316725977e-05, + "loss": 0.6544, + "step": 9444 + }, + { + "epoch": 4.197777777777778, + "grad_norm": 4.67263126373291, + "learning_rate": 3.2117437722419926e-05, + "loss": 0.9304, + "step": 9445 + }, + { + "epoch": 4.198222222222222, + "grad_norm": 4.910307884216309, + "learning_rate": 3.209964412811388e-05, + "loss": 0.8954, + "step": 9446 + }, + { + "epoch": 4.198666666666667, + "grad_norm": 6.133388996124268, + "learning_rate": 3.208185053380783e-05, + "loss": 0.6887, + "step": 9447 + }, + { + "epoch": 
4.199111111111111, + "grad_norm": 4.447690010070801, + "learning_rate": 3.206405693950178e-05, + "loss": 0.9454, + "step": 9448 + }, + { + "epoch": 4.1995555555555555, + "grad_norm": 4.388811111450195, + "learning_rate": 3.204626334519573e-05, + "loss": 0.8632, + "step": 9449 + }, + { + "epoch": 4.2, + "grad_norm": 3.5809247493743896, + "learning_rate": 3.202846975088968e-05, + "loss": 0.1799, + "step": 9450 + }, + { + "epoch": 4.200444444444445, + "grad_norm": 2.599862575531006, + "learning_rate": 3.201067615658363e-05, + "loss": 1.5166, + "step": 9451 + }, + { + "epoch": 4.200888888888889, + "grad_norm": 3.278106451034546, + "learning_rate": 3.199288256227758e-05, + "loss": 1.6669, + "step": 9452 + }, + { + "epoch": 4.201333333333333, + "grad_norm": 2.277113676071167, + "learning_rate": 3.197508896797153e-05, + "loss": 0.5748, + "step": 9453 + }, + { + "epoch": 4.201777777777778, + "grad_norm": 3.8572428226470947, + "learning_rate": 3.195729537366548e-05, + "loss": 1.4989, + "step": 9454 + }, + { + "epoch": 4.202222222222222, + "grad_norm": 3.595043659210205, + "learning_rate": 3.193950177935943e-05, + "loss": 1.0113, + "step": 9455 + }, + { + "epoch": 4.2026666666666666, + "grad_norm": 3.511258840560913, + "learning_rate": 3.192170818505339e-05, + "loss": 1.4261, + "step": 9456 + }, + { + "epoch": 4.203111111111111, + "grad_norm": 3.4274392127990723, + "learning_rate": 3.190391459074733e-05, + "loss": 1.0837, + "step": 9457 + }, + { + "epoch": 4.203555555555556, + "grad_norm": 3.7889490127563477, + "learning_rate": 3.188612099644128e-05, + "loss": 1.3672, + "step": 9458 + }, + { + "epoch": 4.204, + "grad_norm": 3.413290500640869, + "learning_rate": 3.1868327402135236e-05, + "loss": 1.0793, + "step": 9459 + }, + { + "epoch": 4.204444444444444, + "grad_norm": 3.5661888122558594, + "learning_rate": 3.185053380782918e-05, + "loss": 1.704, + "step": 9460 + }, + { + "epoch": 4.204888888888889, + "grad_norm": 3.6765122413635254, + "learning_rate": 3.1832740213523135e-05, + "loss": 1.1704, + "step": 9461 + }, + { + "epoch": 4.205333333333333, + "grad_norm": 3.350315570831299, + "learning_rate": 3.1814946619217085e-05, + "loss": 1.5331, + "step": 9462 + }, + { + "epoch": 4.205777777777778, + "grad_norm": 3.8041248321533203, + "learning_rate": 3.1797153024911035e-05, + "loss": 1.1945, + "step": 9463 + }, + { + "epoch": 4.206222222222222, + "grad_norm": 3.120882987976074, + "learning_rate": 3.1779359430604985e-05, + "loss": 0.9308, + "step": 9464 + }, + { + "epoch": 4.206666666666667, + "grad_norm": 3.490065574645996, + "learning_rate": 3.1761565836298934e-05, + "loss": 1.0944, + "step": 9465 + }, + { + "epoch": 4.207111111111111, + "grad_norm": 3.804657459259033, + "learning_rate": 3.1743772241992884e-05, + "loss": 1.2981, + "step": 9466 + }, + { + "epoch": 4.2075555555555555, + "grad_norm": 4.033463954925537, + "learning_rate": 3.1725978647686834e-05, + "loss": 0.8921, + "step": 9467 + }, + { + "epoch": 4.208, + "grad_norm": 3.6122589111328125, + "learning_rate": 3.1708185053380783e-05, + "loss": 0.8588, + "step": 9468 + }, + { + "epoch": 4.208444444444444, + "grad_norm": 4.304235935211182, + "learning_rate": 3.169039145907473e-05, + "loss": 1.2107, + "step": 9469 + }, + { + "epoch": 4.208888888888889, + "grad_norm": 4.344990253448486, + "learning_rate": 3.167259786476868e-05, + "loss": 1.1185, + "step": 9470 + }, + { + "epoch": 4.209333333333333, + "grad_norm": 3.5502078533172607, + "learning_rate": 3.165480427046263e-05, + "loss": 1.0842, + "step": 9471 + }, + { + "epoch": 4.209777777777778, + 
"grad_norm": 4.145521640777588, + "learning_rate": 3.163701067615659e-05, + "loss": 1.2853, + "step": 9472 + }, + { + "epoch": 4.210222222222222, + "grad_norm": 3.878098964691162, + "learning_rate": 3.161921708185053e-05, + "loss": 0.8788, + "step": 9473 + }, + { + "epoch": 4.210666666666667, + "grad_norm": 3.4567415714263916, + "learning_rate": 3.160142348754448e-05, + "loss": 0.9139, + "step": 9474 + }, + { + "epoch": 4.211111111111111, + "grad_norm": 3.87986421585083, + "learning_rate": 3.158362989323844e-05, + "loss": 0.7227, + "step": 9475 + }, + { + "epoch": 4.211555555555556, + "grad_norm": 5.376968860626221, + "learning_rate": 3.156583629893239e-05, + "loss": 1.4257, + "step": 9476 + }, + { + "epoch": 4.212, + "grad_norm": 5.025868892669678, + "learning_rate": 3.154804270462634e-05, + "loss": 0.8064, + "step": 9477 + }, + { + "epoch": 4.212444444444444, + "grad_norm": 3.2380011081695557, + "learning_rate": 3.153024911032029e-05, + "loss": 0.6622, + "step": 9478 + }, + { + "epoch": 4.212888888888889, + "grad_norm": 4.465753555297852, + "learning_rate": 3.151245551601424e-05, + "loss": 0.9084, + "step": 9479 + }, + { + "epoch": 4.213333333333333, + "grad_norm": 4.312685489654541, + "learning_rate": 3.149466192170819e-05, + "loss": 1.0134, + "step": 9480 + }, + { + "epoch": 4.213777777777778, + "grad_norm": 3.1639740467071533, + "learning_rate": 3.147686832740214e-05, + "loss": 0.6682, + "step": 9481 + }, + { + "epoch": 4.214222222222222, + "grad_norm": 3.7357571125030518, + "learning_rate": 3.1459074733096086e-05, + "loss": 0.8452, + "step": 9482 + }, + { + "epoch": 4.214666666666667, + "grad_norm": 2.6543757915496826, + "learning_rate": 3.1441281138790036e-05, + "loss": 0.5144, + "step": 9483 + }, + { + "epoch": 4.215111111111111, + "grad_norm": 4.193578243255615, + "learning_rate": 3.1423487544483986e-05, + "loss": 0.6794, + "step": 9484 + }, + { + "epoch": 4.2155555555555555, + "grad_norm": 3.771239757537842, + "learning_rate": 3.140569395017794e-05, + "loss": 0.7003, + "step": 9485 + }, + { + "epoch": 4.216, + "grad_norm": 4.657698631286621, + "learning_rate": 3.1387900355871885e-05, + "loss": 1.0667, + "step": 9486 + }, + { + "epoch": 4.216444444444445, + "grad_norm": 3.7996761798858643, + "learning_rate": 3.1370106761565835e-05, + "loss": 0.7508, + "step": 9487 + }, + { + "epoch": 4.216888888888889, + "grad_norm": 3.673656940460205, + "learning_rate": 3.135231316725979e-05, + "loss": 0.9042, + "step": 9488 + }, + { + "epoch": 4.217333333333333, + "grad_norm": 4.215774059295654, + "learning_rate": 3.1334519572953734e-05, + "loss": 0.5879, + "step": 9489 + }, + { + "epoch": 4.217777777777778, + "grad_norm": 5.385339260101318, + "learning_rate": 3.1316725978647684e-05, + "loss": 1.1556, + "step": 9490 + }, + { + "epoch": 4.218222222222222, + "grad_norm": 5.48359489440918, + "learning_rate": 3.129893238434164e-05, + "loss": 0.8134, + "step": 9491 + }, + { + "epoch": 4.218666666666667, + "grad_norm": 4.134088516235352, + "learning_rate": 3.128113879003559e-05, + "loss": 0.9353, + "step": 9492 + }, + { + "epoch": 4.219111111111111, + "grad_norm": 4.422386646270752, + "learning_rate": 3.126334519572954e-05, + "loss": 0.7743, + "step": 9493 + }, + { + "epoch": 4.219555555555556, + "grad_norm": 4.430136680603027, + "learning_rate": 3.124555160142349e-05, + "loss": 0.8493, + "step": 9494 + }, + { + "epoch": 4.22, + "grad_norm": 4.236592769622803, + "learning_rate": 3.122775800711744e-05, + "loss": 0.6579, + "step": 9495 + }, + { + "epoch": 4.220444444444444, + "grad_norm": 
3.639747142791748, + "learning_rate": 3.120996441281139e-05, + "loss": 0.756, + "step": 9496 + }, + { + "epoch": 4.220888888888889, + "grad_norm": 4.818484783172607, + "learning_rate": 3.119217081850534e-05, + "loss": 1.1835, + "step": 9497 + }, + { + "epoch": 4.221333333333333, + "grad_norm": 4.322108745574951, + "learning_rate": 3.117437722419929e-05, + "loss": 0.6088, + "step": 9498 + }, + { + "epoch": 4.221777777777778, + "grad_norm": 4.885183334350586, + "learning_rate": 3.115658362989324e-05, + "loss": 0.8394, + "step": 9499 + }, + { + "epoch": 4.222222222222222, + "grad_norm": 0.4897964298725128, + "learning_rate": 3.113879003558719e-05, + "loss": 0.0723, + "step": 9500 + } + ], + "logging_steps": 1, + "max_steps": 11250, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 7.299541558648013e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}