diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,16914 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.24999189937074312, + "eval_steps": 2411, + "global_step": 2411, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00010368805448807264, + "grad_norm": 3.078125, + "learning_rate": 2e-05, + "loss": 11.9266, + "step": 1 + }, + { + "epoch": 0.00010368805448807264, + "eval_loss": 11.923038482666016, + "eval_runtime": 0.4444, + "eval_samples_per_second": 335.305, + "eval_steps_per_second": 15.753, + "step": 1 + }, + { + "epoch": 0.00020737610897614527, + "grad_norm": 3.046875, + "learning_rate": 4e-05, + "loss": 11.9246, + "step": 2 + }, + { + "epoch": 0.0003110641634642179, + "grad_norm": 3.171875, + "learning_rate": 6e-05, + "loss": 11.8897, + "step": 3 + }, + { + "epoch": 0.00041475221795229055, + "grad_norm": 2.890625, + "learning_rate": 8e-05, + "loss": 11.7637, + "step": 4 + }, + { + "epoch": 0.0005184402724403631, + "grad_norm": 2.6875, + "learning_rate": 0.0001, + "loss": 11.6167, + "step": 5 + }, + { + "epoch": 0.0006221283269284358, + "grad_norm": 2.5625, + "learning_rate": 0.00012, + "loss": 11.3739, + "step": 6 + }, + { + "epoch": 0.0007258163814165084, + "grad_norm": 2.53125, + "learning_rate": 0.00014, + "loss": 11.1311, + "step": 7 + }, + { + "epoch": 0.0008295044359045811, + "grad_norm": 2.359375, + "learning_rate": 0.00016, + "loss": 10.8909, + "step": 8 + }, + { + "epoch": 0.0009331924903926537, + "grad_norm": 2.21875, + "learning_rate": 0.00018, + "loss": 10.6657, + "step": 9 + }, + { + "epoch": 0.0010368805448807263, + "grad_norm": 2.125, + "learning_rate": 0.0002, + "loss": 10.4763, + "step": 10 + }, + { + "epoch": 0.001140568599368799, + "grad_norm": 2.078125, + "learning_rate": 0.00019999999941005286, + "loss": 10.2975, + "step": 11 + }, + { + "epoch": 0.0012442566538568716, + "grad_norm": 2.015625, + "learning_rate": 0.00019999999764021143, + "loss": 10.1245, + "step": 12 + }, + { + "epoch": 0.0013479447083449443, + "grad_norm": 2.0, + "learning_rate": 0.00019999999469047573, + "loss": 9.9439, + "step": 13 + }, + { + "epoch": 0.0014516327628330168, + "grad_norm": 2.015625, + "learning_rate": 0.0001999999905608458, + "loss": 9.7362, + "step": 14 + }, + { + "epoch": 0.0015553208173210895, + "grad_norm": 1.9609375, + "learning_rate": 0.00019999998525132166, + "loss": 9.5586, + "step": 15 + }, + { + "epoch": 0.0016590088718091622, + "grad_norm": 1.875, + "learning_rate": 0.00019999997876190344, + "loss": 9.4056, + "step": 16 + }, + { + "epoch": 0.0017626969262972347, + "grad_norm": 1.8515625, + "learning_rate": 0.00019999997109259115, + "loss": 9.2022, + "step": 17 + }, + { + "epoch": 0.0018663849807853074, + "grad_norm": 1.7890625, + "learning_rate": 0.00019999996224338487, + "loss": 9.0599, + "step": 18 + }, + { + "epoch": 0.00197007303527338, + "grad_norm": 1.734375, + "learning_rate": 0.0001999999522142848, + "loss": 8.9008, + "step": 19 + }, + { + "epoch": 0.0020737610897614525, + "grad_norm": 1.6171875, + "learning_rate": 0.000199999941005291, + "loss": 8.8079, + "step": 20 + }, + { + "epoch": 0.0021774491442495252, + "grad_norm": 1.5625, + "learning_rate": 0.00019999992861640355, + "loss": 8.654, + "step": 21 + }, + { + "epoch": 0.002281137198737598, + "grad_norm": 1.46875, + "learning_rate": 0.0001999999150476227, + "loss": 8.5387, + "step": 22 + }, + { + "epoch": 0.0023848252532256706, + "grad_norm": 1.390625, + "learning_rate": 0.0001999999002989485, + "loss": 8.4015, + "step": 23 + }, + { + "epoch": 0.0024885133077137433, + "grad_norm": 1.2890625, + "learning_rate": 0.00019999988437038123, + "loss": 8.3069, + "step": 24 + }, + { + "epoch": 0.002592201362201816, + "grad_norm": 1.2109375, + "learning_rate": 0.00019999986726192102, + "loss": 8.1671, + "step": 25 + }, + { + "epoch": 0.0026958894166898887, + "grad_norm": 1.0546875, + "learning_rate": 0.00019999984897356806, + "loss": 8.1281, + "step": 26 + }, + { + "epoch": 0.002799577471177961, + "grad_norm": 0.921875, + "learning_rate": 0.0001999998295053226, + "loss": 8.0346, + "step": 27 + }, + { + "epoch": 0.0029032655256660336, + "grad_norm": 0.7890625, + "learning_rate": 0.00019999980885718487, + "loss": 7.9803, + "step": 28 + }, + { + "epoch": 0.0030069535801541063, + "grad_norm": 0.67578125, + "learning_rate": 0.00019999978702915508, + "loss": 7.9035, + "step": 29 + }, + { + "epoch": 0.003110641634642179, + "grad_norm": 0.58984375, + "learning_rate": 0.0001999997640212335, + "loss": 7.8359, + "step": 30 + }, + { + "epoch": 0.0032143296891302517, + "grad_norm": 0.470703125, + "learning_rate": 0.00019999973983342043, + "loss": 7.8463, + "step": 31 + }, + { + "epoch": 0.0033180177436183244, + "grad_norm": 0.4140625, + "learning_rate": 0.0001999997144657161, + "loss": 7.7655, + "step": 32 + }, + { + "epoch": 0.003421705798106397, + "grad_norm": 0.33984375, + "learning_rate": 0.0001999996879181209, + "loss": 7.7699, + "step": 33 + }, + { + "epoch": 0.0035253938525944693, + "grad_norm": 0.296875, + "learning_rate": 0.00019999966019063506, + "loss": 7.7232, + "step": 34 + }, + { + "epoch": 0.003629081907082542, + "grad_norm": 0.28515625, + "learning_rate": 0.00019999963128325892, + "loss": 7.658, + "step": 35 + }, + { + "epoch": 0.0037327699615706147, + "grad_norm": 0.263671875, + "learning_rate": 0.00019999960119599283, + "loss": 7.6972, + "step": 36 + }, + { + "epoch": 0.0038364580160586874, + "grad_norm": 0.232421875, + "learning_rate": 0.00019999956992883716, + "loss": 7.6441, + "step": 37 + }, + { + "epoch": 0.00394014607054676, + "grad_norm": 0.296875, + "learning_rate": 0.00019999953748179228, + "loss": 7.5872, + "step": 38 + }, + { + "epoch": 0.004043834125034833, + "grad_norm": 0.37890625, + "learning_rate": 0.00019999950385485855, + "loss": 7.5796, + "step": 39 + }, + { + "epoch": 0.004147522179522905, + "grad_norm": 0.26171875, + "learning_rate": 0.00019999946904803638, + "loss": 7.5814, + "step": 40 + }, + { + "epoch": 0.004251210234010978, + "grad_norm": 0.255859375, + "learning_rate": 0.00019999943306132621, + "loss": 7.5596, + "step": 41 + }, + { + "epoch": 0.0043548982884990504, + "grad_norm": 0.275390625, + "learning_rate": 0.00019999939589472837, + "loss": 7.5181, + "step": 42 + }, + { + "epoch": 0.004458586342987124, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019999935754824342, + "loss": 7.4972, + "step": 43 + }, + { + "epoch": 0.004562274397475196, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019999931802187172, + "loss": 7.4948, + "step": 44 + }, + { + "epoch": 0.004665962451963268, + "grad_norm": 0.2578125, + "learning_rate": 0.0001999992773156138, + "loss": 7.4437, + "step": 45 + }, + { + "epoch": 0.004769650506451341, + "grad_norm": 0.240234375, + "learning_rate": 0.0001999992354294701, + "loss": 7.4184, + "step": 46 + }, + { + "epoch": 0.0048733385609394135, + "grad_norm": 0.2421875, + "learning_rate": 0.00019999919236344114, + "loss": 7.3949, + "step": 47 + }, + { + "epoch": 0.004977026615427487, + "grad_norm": 0.2451171875, + "learning_rate": 0.00019999914811752738, + "loss": 7.3632, + "step": 48 + }, + { + "epoch": 0.005080714669915559, + "grad_norm": 0.28125, + "learning_rate": 0.00019999910269172938, + "loss": 7.3576, + "step": 49 + }, + { + "epoch": 0.005184402724403632, + "grad_norm": 0.232421875, + "learning_rate": 0.0001999990560860477, + "loss": 7.3231, + "step": 50 + }, + { + "epoch": 0.005288090778891704, + "grad_norm": 0.263671875, + "learning_rate": 0.00019999900830048283, + "loss": 7.3035, + "step": 51 + }, + { + "epoch": 0.005391778833379777, + "grad_norm": 0.306640625, + "learning_rate": 0.0001999989593350354, + "loss": 7.2858, + "step": 52 + }, + { + "epoch": 0.00549546688786785, + "grad_norm": 0.265625, + "learning_rate": 0.00019999890918970592, + "loss": 7.2276, + "step": 53 + }, + { + "epoch": 0.005599154942355922, + "grad_norm": 0.2314453125, + "learning_rate": 0.00019999885786449505, + "loss": 7.2519, + "step": 54 + }, + { + "epoch": 0.005702842996843995, + "grad_norm": 0.302734375, + "learning_rate": 0.00019999880535940333, + "loss": 7.2018, + "step": 55 + }, + { + "epoch": 0.005806531051332067, + "grad_norm": 0.2392578125, + "learning_rate": 0.00019999875167443142, + "loss": 7.1873, + "step": 56 + }, + { + "epoch": 0.00591021910582014, + "grad_norm": 0.2470703125, + "learning_rate": 0.00019999869680957993, + "loss": 7.1665, + "step": 57 + }, + { + "epoch": 0.006013907160308213, + "grad_norm": 0.310546875, + "learning_rate": 0.00019999864076484955, + "loss": 7.1518, + "step": 58 + }, + { + "epoch": 0.006117595214796286, + "grad_norm": 0.376953125, + "learning_rate": 0.0001999985835402409, + "loss": 7.1188, + "step": 59 + }, + { + "epoch": 0.006221283269284358, + "grad_norm": 0.4375, + "learning_rate": 0.00019999852513575466, + "loss": 7.0633, + "step": 60 + }, + { + "epoch": 0.00632497132377243, + "grad_norm": 0.376953125, + "learning_rate": 0.00019999846555139152, + "loss": 7.0708, + "step": 61 + }, + { + "epoch": 0.006428659378260503, + "grad_norm": 0.296875, + "learning_rate": 0.0001999984047871522, + "loss": 7.0265, + "step": 62 + }, + { + "epoch": 0.006532347432748576, + "grad_norm": 0.40625, + "learning_rate": 0.0001999983428430374, + "loss": 6.976, + "step": 63 + }, + { + "epoch": 0.006636035487236649, + "grad_norm": 0.47265625, + "learning_rate": 0.00019999827971904787, + "loss": 6.9527, + "step": 64 + }, + { + "epoch": 0.006739723541724721, + "grad_norm": 0.357421875, + "learning_rate": 0.00019999821541518437, + "loss": 6.9225, + "step": 65 + }, + { + "epoch": 0.006843411596212794, + "grad_norm": 0.28125, + "learning_rate": 0.00019999814993144755, + "loss": 6.9846, + "step": 66 + }, + { + "epoch": 0.0069470996507008664, + "grad_norm": 0.310546875, + "learning_rate": 0.00019999808326783835, + "loss": 6.9026, + "step": 67 + }, + { + "epoch": 0.007050787705188939, + "grad_norm": 0.298828125, + "learning_rate": 0.00019999801542435743, + "loss": 6.8856, + "step": 68 + }, + { + "epoch": 0.007154475759677012, + "grad_norm": 0.32421875, + "learning_rate": 0.00019999794640100562, + "loss": 6.8605, + "step": 69 + }, + { + "epoch": 0.007258163814165084, + "grad_norm": 0.251953125, + "learning_rate": 0.00019999787619778375, + "loss": 6.8081, + "step": 70 + }, + { + "epoch": 0.007361851868653157, + "grad_norm": 0.380859375, + "learning_rate": 0.00019999780481469266, + "loss": 6.8199, + "step": 71 + }, + { + "epoch": 0.0074655399231412295, + "grad_norm": 0.48828125, + "learning_rate": 0.00019999773225173314, + "loss": 6.8462, + "step": 72 + }, + { + "epoch": 0.007569227977629303, + "grad_norm": 0.60546875, + "learning_rate": 0.00019999765850890614, + "loss": 6.7978, + "step": 73 + }, + { + "epoch": 0.007672916032117375, + "grad_norm": 0.5625, + "learning_rate": 0.0001999975835862124, + "loss": 6.7541, + "step": 74 + }, + { + "epoch": 0.007776604086605447, + "grad_norm": 0.4296875, + "learning_rate": 0.00019999750748365294, + "loss": 6.7211, + "step": 75 + }, + { + "epoch": 0.00788029214109352, + "grad_norm": 0.453125, + "learning_rate": 0.00019999743020122855, + "loss": 6.7321, + "step": 76 + }, + { + "epoch": 0.007983980195581592, + "grad_norm": 0.48828125, + "learning_rate": 0.0001999973517389402, + "loss": 6.7423, + "step": 77 + }, + { + "epoch": 0.008087668250069666, + "grad_norm": 0.515625, + "learning_rate": 0.00019999727209678883, + "loss": 6.6788, + "step": 78 + }, + { + "epoch": 0.008191356304557739, + "grad_norm": 0.5625, + "learning_rate": 0.0001999971912747753, + "loss": 6.689, + "step": 79 + }, + { + "epoch": 0.00829504435904581, + "grad_norm": 0.416015625, + "learning_rate": 0.00019999710927290064, + "loss": 6.6457, + "step": 80 + }, + { + "epoch": 0.008398732413533883, + "grad_norm": 0.453125, + "learning_rate": 0.00019999702609116578, + "loss": 6.6439, + "step": 81 + }, + { + "epoch": 0.008502420468021956, + "grad_norm": 0.56640625, + "learning_rate": 0.00019999694172957174, + "loss": 6.6209, + "step": 82 + }, + { + "epoch": 0.008606108522510028, + "grad_norm": 0.578125, + "learning_rate": 0.00019999685618811948, + "loss": 6.5961, + "step": 83 + }, + { + "epoch": 0.008709796576998101, + "grad_norm": 0.5546875, + "learning_rate": 0.00019999676946681, + "loss": 6.6656, + "step": 84 + }, + { + "epoch": 0.008813484631486174, + "grad_norm": 0.412109375, + "learning_rate": 0.00019999668156564436, + "loss": 6.5868, + "step": 85 + }, + { + "epoch": 0.008917172685974247, + "grad_norm": 0.443359375, + "learning_rate": 0.00019999659248462357, + "loss": 6.6077, + "step": 86 + }, + { + "epoch": 0.009020860740462319, + "grad_norm": 0.38671875, + "learning_rate": 0.0001999965022237487, + "loss": 6.5545, + "step": 87 + }, + { + "epoch": 0.009124548794950392, + "grad_norm": 0.318359375, + "learning_rate": 0.00019999641078302077, + "loss": 6.5389, + "step": 88 + }, + { + "epoch": 0.009228236849438465, + "grad_norm": 0.36328125, + "learning_rate": 0.00019999631816244095, + "loss": 6.5265, + "step": 89 + }, + { + "epoch": 0.009331924903926536, + "grad_norm": 0.5078125, + "learning_rate": 0.00019999622436201025, + "loss": 6.5339, + "step": 90 + }, + { + "epoch": 0.00943561295841461, + "grad_norm": 0.578125, + "learning_rate": 0.0001999961293817298, + "loss": 6.5092, + "step": 91 + }, + { + "epoch": 0.009539301012902682, + "grad_norm": 0.447265625, + "learning_rate": 0.0001999960332216007, + "loss": 6.4901, + "step": 92 + }, + { + "epoch": 0.009642989067390756, + "grad_norm": 0.6015625, + "learning_rate": 0.00019999593588162414, + "loss": 6.5021, + "step": 93 + }, + { + "epoch": 0.009746677121878827, + "grad_norm": 0.59765625, + "learning_rate": 0.00019999583736180122, + "loss": 6.4337, + "step": 94 + }, + { + "epoch": 0.0098503651763669, + "grad_norm": 0.4296875, + "learning_rate": 0.00019999573766213313, + "loss": 6.4328, + "step": 95 + }, + { + "epoch": 0.009954053230854973, + "grad_norm": 0.640625, + "learning_rate": 0.00019999563678262106, + "loss": 6.4444, + "step": 96 + }, + { + "epoch": 0.010057741285343045, + "grad_norm": 0.71484375, + "learning_rate": 0.00019999553472326614, + "loss": 6.4165, + "step": 97 + }, + { + "epoch": 0.010161429339831118, + "grad_norm": 1.4140625, + "learning_rate": 0.0001999954314840696, + "loss": 6.4282, + "step": 98 + }, + { + "epoch": 0.01026511739431919, + "grad_norm": 1.1875, + "learning_rate": 0.0001999953270650327, + "loss": 6.4431, + "step": 99 + }, + { + "epoch": 0.010368805448807264, + "grad_norm": 0.8515625, + "learning_rate": 0.00019999522146615662, + "loss": 6.3568, + "step": 100 + }, + { + "epoch": 0.010472493503295335, + "grad_norm": 0.69921875, + "learning_rate": 0.00019999511468744263, + "loss": 6.3853, + "step": 101 + }, + { + "epoch": 0.010576181557783408, + "grad_norm": 0.94921875, + "learning_rate": 0.000199995006728892, + "loss": 6.4344, + "step": 102 + }, + { + "epoch": 0.010679869612271482, + "grad_norm": 0.74609375, + "learning_rate": 0.000199994897590506, + "loss": 6.3957, + "step": 103 + }, + { + "epoch": 0.010783557666759555, + "grad_norm": 0.60546875, + "learning_rate": 0.00019999478727228588, + "loss": 6.3347, + "step": 104 + }, + { + "epoch": 0.010887245721247626, + "grad_norm": 0.5546875, + "learning_rate": 0.00019999467577423296, + "loss": 6.3382, + "step": 105 + }, + { + "epoch": 0.0109909337757357, + "grad_norm": 0.62890625, + "learning_rate": 0.0001999945630963486, + "loss": 6.3785, + "step": 106 + }, + { + "epoch": 0.011094621830223772, + "grad_norm": 0.439453125, + "learning_rate": 0.00019999444923863405, + "loss": 6.3325, + "step": 107 + }, + { + "epoch": 0.011198309884711844, + "grad_norm": 0.482421875, + "learning_rate": 0.00019999433420109073, + "loss": 6.3663, + "step": 108 + }, + { + "epoch": 0.011301997939199917, + "grad_norm": 0.41796875, + "learning_rate": 0.00019999421798371997, + "loss": 6.2954, + "step": 109 + }, + { + "epoch": 0.01140568599368799, + "grad_norm": 0.474609375, + "learning_rate": 0.00019999410058652313, + "loss": 6.2503, + "step": 110 + }, + { + "epoch": 0.011509374048176063, + "grad_norm": 0.396484375, + "learning_rate": 0.00019999398200950158, + "loss": 6.2885, + "step": 111 + }, + { + "epoch": 0.011613062102664135, + "grad_norm": 0.462890625, + "learning_rate": 0.00019999386225265676, + "loss": 6.3214, + "step": 112 + }, + { + "epoch": 0.011716750157152208, + "grad_norm": 0.314453125, + "learning_rate": 0.00019999374131599007, + "loss": 6.2841, + "step": 113 + }, + { + "epoch": 0.01182043821164028, + "grad_norm": 0.39453125, + "learning_rate": 0.00019999361919950293, + "loss": 6.29, + "step": 114 + }, + { + "epoch": 0.011924126266128352, + "grad_norm": 0.376953125, + "learning_rate": 0.00019999349590319677, + "loss": 6.2106, + "step": 115 + }, + { + "epoch": 0.012027814320616425, + "grad_norm": 0.4296875, + "learning_rate": 0.00019999337142707305, + "loss": 6.219, + "step": 116 + }, + { + "epoch": 0.012131502375104498, + "grad_norm": 0.482421875, + "learning_rate": 0.00019999324577113324, + "loss": 6.2419, + "step": 117 + }, + { + "epoch": 0.012235190429592572, + "grad_norm": 0.76953125, + "learning_rate": 0.00019999311893537883, + "loss": 6.2168, + "step": 118 + }, + { + "epoch": 0.012338878484080643, + "grad_norm": 1.515625, + "learning_rate": 0.00019999299091981134, + "loss": 6.2602, + "step": 119 + }, + { + "epoch": 0.012442566538568716, + "grad_norm": 1.15625, + "learning_rate": 0.00019999286172443223, + "loss": 6.2084, + "step": 120 + }, + { + "epoch": 0.01254625459305679, + "grad_norm": 0.890625, + "learning_rate": 0.00019999273134924307, + "loss": 6.2672, + "step": 121 + }, + { + "epoch": 0.01264994264754486, + "grad_norm": 1.1796875, + "learning_rate": 0.00019999259979424535, + "loss": 6.2597, + "step": 122 + }, + { + "epoch": 0.012753630702032934, + "grad_norm": 0.76171875, + "learning_rate": 0.00019999246705944068, + "loss": 6.1437, + "step": 123 + }, + { + "epoch": 0.012857318756521007, + "grad_norm": 0.97265625, + "learning_rate": 0.00019999233314483056, + "loss": 6.216, + "step": 124 + }, + { + "epoch": 0.01296100681100908, + "grad_norm": 0.87109375, + "learning_rate": 0.00019999219805041663, + "loss": 6.1778, + "step": 125 + }, + { + "epoch": 0.013064694865497151, + "grad_norm": 0.8203125, + "learning_rate": 0.00019999206177620047, + "loss": 6.1466, + "step": 126 + }, + { + "epoch": 0.013168382919985224, + "grad_norm": 0.74609375, + "learning_rate": 0.00019999192432218363, + "loss": 6.1517, + "step": 127 + }, + { + "epoch": 0.013272070974473298, + "grad_norm": 0.67578125, + "learning_rate": 0.00019999178568836783, + "loss": 6.1833, + "step": 128 + }, + { + "epoch": 0.013375759028961369, + "grad_norm": 0.6640625, + "learning_rate": 0.00019999164587475464, + "loss": 6.177, + "step": 129 + }, + { + "epoch": 0.013479447083449442, + "grad_norm": 0.57421875, + "learning_rate": 0.0001999915048813457, + "loss": 6.19, + "step": 130 + }, + { + "epoch": 0.013583135137937515, + "grad_norm": 0.6875, + "learning_rate": 0.0001999913627081427, + "loss": 6.1394, + "step": 131 + }, + { + "epoch": 0.013686823192425588, + "grad_norm": 0.578125, + "learning_rate": 0.00019999121935514736, + "loss": 6.1704, + "step": 132 + }, + { + "epoch": 0.01379051124691366, + "grad_norm": 0.57421875, + "learning_rate": 0.00019999107482236128, + "loss": 6.1321, + "step": 133 + }, + { + "epoch": 0.013894199301401733, + "grad_norm": 0.55078125, + "learning_rate": 0.00019999092910978625, + "loss": 6.1726, + "step": 134 + }, + { + "epoch": 0.013997887355889806, + "grad_norm": 0.5703125, + "learning_rate": 0.00019999078221742393, + "loss": 6.1438, + "step": 135 + }, + { + "epoch": 0.014101575410377877, + "grad_norm": 0.5703125, + "learning_rate": 0.00019999063414527607, + "loss": 6.1395, + "step": 136 + }, + { + "epoch": 0.01420526346486595, + "grad_norm": 0.6875, + "learning_rate": 0.00019999048489334443, + "loss": 6.0961, + "step": 137 + }, + { + "epoch": 0.014308951519354024, + "grad_norm": 0.80859375, + "learning_rate": 0.00019999033446163077, + "loss": 6.1165, + "step": 138 + }, + { + "epoch": 0.014412639573842097, + "grad_norm": 1.171875, + "learning_rate": 0.00019999018285013685, + "loss": 6.1087, + "step": 139 + }, + { + "epoch": 0.014516327628330168, + "grad_norm": 1.421875, + "learning_rate": 0.00019999003005886446, + "loss": 6.0861, + "step": 140 + }, + { + "epoch": 0.014620015682818241, + "grad_norm": 0.47265625, + "learning_rate": 0.00019998987608781544, + "loss": 6.1238, + "step": 141 + }, + { + "epoch": 0.014723703737306314, + "grad_norm": 1.734375, + "learning_rate": 0.00019998972093699153, + "loss": 6.0859, + "step": 142 + }, + { + "epoch": 0.014827391791794386, + "grad_norm": 0.97265625, + "learning_rate": 0.00019998956460639465, + "loss": 6.1074, + "step": 143 + }, + { + "epoch": 0.014931079846282459, + "grad_norm": 0.8046875, + "learning_rate": 0.00019998940709602657, + "loss": 6.1206, + "step": 144 + }, + { + "epoch": 0.015034767900770532, + "grad_norm": 1.265625, + "learning_rate": 0.00019998924840588917, + "loss": 6.0488, + "step": 145 + }, + { + "epoch": 0.015138455955258605, + "grad_norm": 0.8984375, + "learning_rate": 0.00019998908853598434, + "loss": 6.058, + "step": 146 + }, + { + "epoch": 0.015242144009746677, + "grad_norm": 0.65625, + "learning_rate": 0.000199988927486314, + "loss": 6.0474, + "step": 147 + }, + { + "epoch": 0.01534583206423475, + "grad_norm": 0.6328125, + "learning_rate": 0.00019998876525687998, + "loss": 6.0019, + "step": 148 + }, + { + "epoch": 0.015449520118722823, + "grad_norm": 0.7578125, + "learning_rate": 0.0001999886018476842, + "loss": 6.047, + "step": 149 + }, + { + "epoch": 0.015553208173210894, + "grad_norm": 0.80859375, + "learning_rate": 0.0001999884372587286, + "loss": 6.0433, + "step": 150 + }, + { + "epoch": 0.015656896227698967, + "grad_norm": 0.66796875, + "learning_rate": 0.0001999882714900152, + "loss": 6.0162, + "step": 151 + }, + { + "epoch": 0.01576058428218704, + "grad_norm": 0.50390625, + "learning_rate": 0.00019998810454154584, + "loss": 6.0574, + "step": 152 + }, + { + "epoch": 0.015864272336675114, + "grad_norm": 0.69140625, + "learning_rate": 0.00019998793641332256, + "loss": 6.0328, + "step": 153 + }, + { + "epoch": 0.015967960391163185, + "grad_norm": 0.515625, + "learning_rate": 0.0001999877671053473, + "loss": 5.9934, + "step": 154 + }, + { + "epoch": 0.01607164844565126, + "grad_norm": 0.45703125, + "learning_rate": 0.0001999875966176221, + "loss": 6.0257, + "step": 155 + }, + { + "epoch": 0.01617533650013933, + "grad_norm": 0.49609375, + "learning_rate": 0.00019998742495014896, + "loss": 5.9716, + "step": 156 + }, + { + "epoch": 0.016279024554627403, + "grad_norm": 0.57421875, + "learning_rate": 0.0001999872521029299, + "loss": 6.011, + "step": 157 + }, + { + "epoch": 0.016382712609115477, + "grad_norm": 0.373046875, + "learning_rate": 0.0001999870780759669, + "loss": 5.986, + "step": 158 + }, + { + "epoch": 0.01648640066360355, + "grad_norm": 0.4375, + "learning_rate": 0.0001999869028692621, + "loss": 5.9172, + "step": 159 + }, + { + "epoch": 0.01659008871809162, + "grad_norm": 0.65625, + "learning_rate": 0.00019998672648281757, + "loss": 5.9745, + "step": 160 + }, + { + "epoch": 0.016693776772579695, + "grad_norm": 0.81640625, + "learning_rate": 0.0001999865489166353, + "loss": 5.9591, + "step": 161 + }, + { + "epoch": 0.016797464827067767, + "grad_norm": 1.1171875, + "learning_rate": 0.00019998637017071752, + "loss": 5.982, + "step": 162 + }, + { + "epoch": 0.016901152881555838, + "grad_norm": 1.578125, + "learning_rate": 0.0001999861902450662, + "loss": 5.9767, + "step": 163 + }, + { + "epoch": 0.017004840936043913, + "grad_norm": 0.87890625, + "learning_rate": 0.00019998600913968356, + "loss": 6.0064, + "step": 164 + }, + { + "epoch": 0.017108528990531984, + "grad_norm": 0.84375, + "learning_rate": 0.00019998582685457165, + "loss": 5.9651, + "step": 165 + }, + { + "epoch": 0.017212217045020056, + "grad_norm": 0.9921875, + "learning_rate": 0.00019998564338973273, + "loss": 5.9246, + "step": 166 + }, + { + "epoch": 0.01731590509950813, + "grad_norm": 1.015625, + "learning_rate": 0.00019998545874516888, + "loss": 5.907, + "step": 167 + }, + { + "epoch": 0.017419593153996202, + "grad_norm": 1.1015625, + "learning_rate": 0.00019998527292088228, + "loss": 5.9681, + "step": 168 + }, + { + "epoch": 0.017523281208484277, + "grad_norm": 0.96875, + "learning_rate": 0.00019998508591687522, + "loss": 5.962, + "step": 169 + }, + { + "epoch": 0.017626969262972348, + "grad_norm": 0.9609375, + "learning_rate": 0.00019998489773314976, + "loss": 5.8911, + "step": 170 + }, + { + "epoch": 0.01773065731746042, + "grad_norm": 0.8359375, + "learning_rate": 0.00019998470836970827, + "loss": 5.924, + "step": 171 + }, + { + "epoch": 0.017834345371948494, + "grad_norm": 0.78125, + "learning_rate": 0.00019998451782655282, + "loss": 5.9342, + "step": 172 + }, + { + "epoch": 0.017938033426436566, + "grad_norm": 0.921875, + "learning_rate": 0.00019998432610368583, + "loss": 5.9439, + "step": 173 + }, + { + "epoch": 0.018041721480924637, + "grad_norm": 0.7734375, + "learning_rate": 0.00019998413320110943, + "loss": 5.9079, + "step": 174 + }, + { + "epoch": 0.018145409535412712, + "grad_norm": 0.62890625, + "learning_rate": 0.00019998393911882598, + "loss": 5.9049, + "step": 175 + }, + { + "epoch": 0.018249097589900783, + "grad_norm": 0.59765625, + "learning_rate": 0.0001999837438568377, + "loss": 5.8602, + "step": 176 + }, + { + "epoch": 0.018352785644388855, + "grad_norm": 0.51953125, + "learning_rate": 0.00019998354741514694, + "loss": 5.9309, + "step": 177 + }, + { + "epoch": 0.01845647369887693, + "grad_norm": 0.5625, + "learning_rate": 0.00019998334979375604, + "loss": 5.9288, + "step": 178 + }, + { + "epoch": 0.018560161753365, + "grad_norm": 0.5, + "learning_rate": 0.00019998315099266728, + "loss": 5.8735, + "step": 179 + }, + { + "epoch": 0.018663849807853072, + "grad_norm": 0.3671875, + "learning_rate": 0.000199982951011883, + "loss": 5.8988, + "step": 180 + }, + { + "epoch": 0.018767537862341147, + "grad_norm": 0.50390625, + "learning_rate": 0.0001999827498514056, + "loss": 5.8772, + "step": 181 + }, + { + "epoch": 0.01887122591682922, + "grad_norm": 0.69921875, + "learning_rate": 0.00019998254751123746, + "loss": 5.8569, + "step": 182 + }, + { + "epoch": 0.018974913971317293, + "grad_norm": 0.63671875, + "learning_rate": 0.00019998234399138092, + "loss": 5.8623, + "step": 183 + }, + { + "epoch": 0.019078602025805365, + "grad_norm": 0.5, + "learning_rate": 0.00019998213929183842, + "loss": 5.832, + "step": 184 + }, + { + "epoch": 0.019182290080293436, + "grad_norm": 0.453125, + "learning_rate": 0.00019998193341261238, + "loss": 5.8703, + "step": 185 + }, + { + "epoch": 0.01928597813478151, + "grad_norm": 0.52734375, + "learning_rate": 0.00019998172635370516, + "loss": 5.8666, + "step": 186 + }, + { + "epoch": 0.019389666189269582, + "grad_norm": 0.3671875, + "learning_rate": 0.00019998151811511928, + "loss": 5.8473, + "step": 187 + }, + { + "epoch": 0.019493354243757654, + "grad_norm": 0.498046875, + "learning_rate": 0.00019998130869685717, + "loss": 5.8832, + "step": 188 + }, + { + "epoch": 0.01959704229824573, + "grad_norm": 0.60546875, + "learning_rate": 0.00019998109809892133, + "loss": 5.8562, + "step": 189 + }, + { + "epoch": 0.0197007303527338, + "grad_norm": 0.474609375, + "learning_rate": 0.00019998088632131419, + "loss": 5.8587, + "step": 190 + }, + { + "epoch": 0.01980441840722187, + "grad_norm": 0.53515625, + "learning_rate": 0.00019998067336403827, + "loss": 5.8556, + "step": 191 + }, + { + "epoch": 0.019908106461709946, + "grad_norm": 0.76171875, + "learning_rate": 0.0001999804592270961, + "loss": 5.8503, + "step": 192 + }, + { + "epoch": 0.020011794516198018, + "grad_norm": 1.0390625, + "learning_rate": 0.0001999802439104902, + "loss": 5.8315, + "step": 193 + }, + { + "epoch": 0.02011548257068609, + "grad_norm": 1.671875, + "learning_rate": 0.0001999800274142231, + "loss": 5.7988, + "step": 194 + }, + { + "epoch": 0.020219170625174164, + "grad_norm": 0.453125, + "learning_rate": 0.00019997980973829736, + "loss": 5.7978, + "step": 195 + }, + { + "epoch": 0.020322858679662235, + "grad_norm": 2.46875, + "learning_rate": 0.00019997959088271554, + "loss": 5.8234, + "step": 196 + }, + { + "epoch": 0.02042654673415031, + "grad_norm": 0.9375, + "learning_rate": 0.00019997937084748025, + "loss": 5.8187, + "step": 197 + }, + { + "epoch": 0.02053023478863838, + "grad_norm": 3.75, + "learning_rate": 0.00019997914963259405, + "loss": 5.8793, + "step": 198 + }, + { + "epoch": 0.020633922843126453, + "grad_norm": 3.390625, + "learning_rate": 0.00019997892723805957, + "loss": 5.8885, + "step": 199 + }, + { + "epoch": 0.020737610897614528, + "grad_norm": 1.4765625, + "learning_rate": 0.00019997870366387943, + "loss": 5.8362, + "step": 200 + }, + { + "epoch": 0.0208412989521026, + "grad_norm": 3.171875, + "learning_rate": 0.00019997847891005627, + "loss": 5.8813, + "step": 201 + }, + { + "epoch": 0.02094498700659067, + "grad_norm": 2.4375, + "learning_rate": 0.00019997825297659273, + "loss": 5.9057, + "step": 202 + }, + { + "epoch": 0.021048675061078746, + "grad_norm": 2.03125, + "learning_rate": 0.0001999780258634915, + "loss": 5.8437, + "step": 203 + }, + { + "epoch": 0.021152363115566817, + "grad_norm": 1.1171875, + "learning_rate": 0.00019997779757075526, + "loss": 5.8491, + "step": 204 + }, + { + "epoch": 0.02125605117005489, + "grad_norm": 1.765625, + "learning_rate": 0.0001999775680983867, + "loss": 5.8136, + "step": 205 + }, + { + "epoch": 0.021359739224542963, + "grad_norm": 1.0703125, + "learning_rate": 0.00019997733744638846, + "loss": 5.8221, + "step": 206 + }, + { + "epoch": 0.021463427279031035, + "grad_norm": 1.6171875, + "learning_rate": 0.00019997710561476335, + "loss": 5.8324, + "step": 207 + }, + { + "epoch": 0.02156711533351911, + "grad_norm": 1.1171875, + "learning_rate": 0.0001999768726035141, + "loss": 5.8158, + "step": 208 + }, + { + "epoch": 0.02167080338800718, + "grad_norm": 1.2578125, + "learning_rate": 0.00019997663841264337, + "loss": 5.8085, + "step": 209 + }, + { + "epoch": 0.021774491442495252, + "grad_norm": 0.89453125, + "learning_rate": 0.00019997640304215402, + "loss": 5.807, + "step": 210 + }, + { + "epoch": 0.021878179496983327, + "grad_norm": 1.0703125, + "learning_rate": 0.0001999761664920488, + "loss": 5.8282, + "step": 211 + }, + { + "epoch": 0.0219818675514714, + "grad_norm": 0.7578125, + "learning_rate": 0.0001999759287623305, + "loss": 5.7731, + "step": 212 + }, + { + "epoch": 0.02208555560595947, + "grad_norm": 0.8515625, + "learning_rate": 0.0001999756898530019, + "loss": 5.7734, + "step": 213 + }, + { + "epoch": 0.022189243660447545, + "grad_norm": 0.6953125, + "learning_rate": 0.00019997544976406588, + "loss": 5.7714, + "step": 214 + }, + { + "epoch": 0.022292931714935616, + "grad_norm": 0.7109375, + "learning_rate": 0.00019997520849552517, + "loss": 5.7947, + "step": 215 + }, + { + "epoch": 0.022396619769423688, + "grad_norm": 0.6875, + "learning_rate": 0.00019997496604738272, + "loss": 5.8244, + "step": 216 + }, + { + "epoch": 0.022500307823911762, + "grad_norm": 0.6796875, + "learning_rate": 0.00019997472241964134, + "loss": 5.7605, + "step": 217 + }, + { + "epoch": 0.022603995878399834, + "grad_norm": 0.65234375, + "learning_rate": 0.00019997447761230393, + "loss": 5.7564, + "step": 218 + }, + { + "epoch": 0.022707683932887905, + "grad_norm": 0.57421875, + "learning_rate": 0.00019997423162537335, + "loss": 5.7313, + "step": 219 + }, + { + "epoch": 0.02281137198737598, + "grad_norm": 0.60546875, + "learning_rate": 0.00019997398445885248, + "loss": 5.7635, + "step": 220 + }, + { + "epoch": 0.02291506004186405, + "grad_norm": 0.5234375, + "learning_rate": 0.00019997373611274432, + "loss": 5.7636, + "step": 221 + }, + { + "epoch": 0.023018748096352126, + "grad_norm": 0.58203125, + "learning_rate": 0.00019997348658705173, + "loss": 5.7049, + "step": 222 + }, + { + "epoch": 0.023122436150840198, + "grad_norm": 0.5390625, + "learning_rate": 0.00019997323588177767, + "loss": 5.7628, + "step": 223 + }, + { + "epoch": 0.02322612420532827, + "grad_norm": 0.478515625, + "learning_rate": 0.0001999729839969251, + "loss": 5.7761, + "step": 224 + }, + { + "epoch": 0.023329812259816344, + "grad_norm": 0.490234375, + "learning_rate": 0.000199972730932497, + "loss": 5.772, + "step": 225 + }, + { + "epoch": 0.023433500314304415, + "grad_norm": 0.45703125, + "learning_rate": 0.00019997247668849638, + "loss": 5.7357, + "step": 226 + }, + { + "epoch": 0.023537188368792487, + "grad_norm": 0.54296875, + "learning_rate": 0.00019997222126492617, + "loss": 5.75, + "step": 227 + }, + { + "epoch": 0.02364087642328056, + "grad_norm": 0.4453125, + "learning_rate": 0.00019997196466178943, + "loss": 5.7343, + "step": 228 + }, + { + "epoch": 0.023744564477768633, + "grad_norm": 0.408203125, + "learning_rate": 0.00019997170687908919, + "loss": 5.7152, + "step": 229 + }, + { + "epoch": 0.023848252532256704, + "grad_norm": 0.4296875, + "learning_rate": 0.00019997144791682848, + "loss": 5.734, + "step": 230 + }, + { + "epoch": 0.02395194058674478, + "grad_norm": 0.392578125, + "learning_rate": 0.00019997118777501037, + "loss": 5.7307, + "step": 231 + }, + { + "epoch": 0.02405562864123285, + "grad_norm": 0.45703125, + "learning_rate": 0.0001999709264536379, + "loss": 5.7429, + "step": 232 + }, + { + "epoch": 0.024159316695720922, + "grad_norm": 0.458984375, + "learning_rate": 0.00019997066395271418, + "loss": 5.7384, + "step": 233 + }, + { + "epoch": 0.024263004750208997, + "grad_norm": 0.330078125, + "learning_rate": 0.00019997040027224232, + "loss": 5.7171, + "step": 234 + }, + { + "epoch": 0.024366692804697068, + "grad_norm": 0.400390625, + "learning_rate": 0.00019997013541222538, + "loss": 5.7003, + "step": 235 + }, + { + "epoch": 0.024470380859185143, + "grad_norm": 0.33203125, + "learning_rate": 0.00019996986937266653, + "loss": 5.6908, + "step": 236 + }, + { + "epoch": 0.024574068913673214, + "grad_norm": 0.33203125, + "learning_rate": 0.0001999696021535689, + "loss": 5.6583, + "step": 237 + }, + { + "epoch": 0.024677756968161286, + "grad_norm": 0.34375, + "learning_rate": 0.00019996933375493562, + "loss": 5.724, + "step": 238 + }, + { + "epoch": 0.02478144502264936, + "grad_norm": 0.333984375, + "learning_rate": 0.0001999690641767699, + "loss": 5.6948, + "step": 239 + }, + { + "epoch": 0.024885133077137432, + "grad_norm": 0.341796875, + "learning_rate": 0.00019996879341907487, + "loss": 5.7105, + "step": 240 + }, + { + "epoch": 0.024988821131625504, + "grad_norm": 0.41015625, + "learning_rate": 0.00019996852148185373, + "loss": 5.6769, + "step": 241 + }, + { + "epoch": 0.02509250918611358, + "grad_norm": 0.294921875, + "learning_rate": 0.00019996824836510975, + "loss": 5.671, + "step": 242 + }, + { + "epoch": 0.02519619724060165, + "grad_norm": 0.365234375, + "learning_rate": 0.0001999679740688461, + "loss": 5.6549, + "step": 243 + }, + { + "epoch": 0.02529988529508972, + "grad_norm": 0.447265625, + "learning_rate": 0.000199967698593066, + "loss": 5.6671, + "step": 244 + }, + { + "epoch": 0.025403573349577796, + "grad_norm": 0.392578125, + "learning_rate": 0.00019996742193777273, + "loss": 5.6164, + "step": 245 + }, + { + "epoch": 0.025507261404065867, + "grad_norm": 0.5, + "learning_rate": 0.00019996714410296958, + "loss": 5.647, + "step": 246 + }, + { + "epoch": 0.02561094945855394, + "grad_norm": 0.498046875, + "learning_rate": 0.0001999668650886598, + "loss": 5.64, + "step": 247 + }, + { + "epoch": 0.025714637513042014, + "grad_norm": 0.69140625, + "learning_rate": 0.00019996658489484666, + "loss": 5.6899, + "step": 248 + }, + { + "epoch": 0.025818325567530085, + "grad_norm": 0.921875, + "learning_rate": 0.00019996630352153353, + "loss": 5.6688, + "step": 249 + }, + { + "epoch": 0.02592201362201816, + "grad_norm": 1.78125, + "learning_rate": 0.0001999660209687236, + "loss": 5.6829, + "step": 250 + }, + { + "epoch": 0.02602570167650623, + "grad_norm": 0.6484375, + "learning_rate": 0.00019996573723642035, + "loss": 5.6252, + "step": 251 + }, + { + "epoch": 0.026129389730994303, + "grad_norm": 0.6015625, + "learning_rate": 0.00019996545232462708, + "loss": 5.6089, + "step": 252 + }, + { + "epoch": 0.026233077785482378, + "grad_norm": 1.4765625, + "learning_rate": 0.00019996516623334713, + "loss": 5.672, + "step": 253 + }, + { + "epoch": 0.02633676583997045, + "grad_norm": 1.046875, + "learning_rate": 0.00019996487896258388, + "loss": 5.6516, + "step": 254 + }, + { + "epoch": 0.02644045389445852, + "grad_norm": 1.1484375, + "learning_rate": 0.0001999645905123407, + "loss": 5.608, + "step": 255 + }, + { + "epoch": 0.026544141948946595, + "grad_norm": 0.87890625, + "learning_rate": 0.00019996430088262108, + "loss": 5.6102, + "step": 256 + }, + { + "epoch": 0.026647830003434667, + "grad_norm": 0.54296875, + "learning_rate": 0.00019996401007342832, + "loss": 5.6053, + "step": 257 + }, + { + "epoch": 0.026751518057922738, + "grad_norm": 0.65625, + "learning_rate": 0.00019996371808476596, + "loss": 5.6318, + "step": 258 + }, + { + "epoch": 0.026855206112410813, + "grad_norm": 0.796875, + "learning_rate": 0.00019996342491663733, + "loss": 5.612, + "step": 259 + }, + { + "epoch": 0.026958894166898884, + "grad_norm": 0.6171875, + "learning_rate": 0.000199963130569046, + "loss": 5.6529, + "step": 260 + }, + { + "epoch": 0.02706258222138696, + "grad_norm": 0.5390625, + "learning_rate": 0.00019996283504199538, + "loss": 5.5798, + "step": 261 + }, + { + "epoch": 0.02716627027587503, + "grad_norm": 0.62109375, + "learning_rate": 0.00019996253833548896, + "loss": 5.6042, + "step": 262 + }, + { + "epoch": 0.027269958330363102, + "grad_norm": 0.59375, + "learning_rate": 0.00019996224044953028, + "loss": 5.6064, + "step": 263 + }, + { + "epoch": 0.027373646384851177, + "grad_norm": 0.447265625, + "learning_rate": 0.0001999619413841228, + "loss": 5.6279, + "step": 264 + }, + { + "epoch": 0.027477334439339248, + "grad_norm": 0.51953125, + "learning_rate": 0.00019996164113927008, + "loss": 5.595, + "step": 265 + }, + { + "epoch": 0.02758102249382732, + "grad_norm": 0.46875, + "learning_rate": 0.00019996133971497568, + "loss": 5.6144, + "step": 266 + }, + { + "epoch": 0.027684710548315394, + "grad_norm": 0.48828125, + "learning_rate": 0.00019996103711124313, + "loss": 5.6075, + "step": 267 + }, + { + "epoch": 0.027788398602803466, + "grad_norm": 0.55078125, + "learning_rate": 0.000199960733328076, + "loss": 5.5957, + "step": 268 + }, + { + "epoch": 0.027892086657291537, + "grad_norm": 0.392578125, + "learning_rate": 0.00019996042836547786, + "loss": 5.5676, + "step": 269 + }, + { + "epoch": 0.027995774711779612, + "grad_norm": 0.435546875, + "learning_rate": 0.00019996012222345236, + "loss": 5.606, + "step": 270 + }, + { + "epoch": 0.028099462766267683, + "grad_norm": 0.5, + "learning_rate": 0.00019995981490200304, + "loss": 5.5437, + "step": 271 + }, + { + "epoch": 0.028203150820755755, + "grad_norm": 0.41796875, + "learning_rate": 0.0001999595064011336, + "loss": 5.5388, + "step": 272 + }, + { + "epoch": 0.02830683887524383, + "grad_norm": 0.5703125, + "learning_rate": 0.00019995919672084763, + "loss": 5.5548, + "step": 273 + }, + { + "epoch": 0.0284105269297319, + "grad_norm": 0.70703125, + "learning_rate": 0.0001999588858611488, + "loss": 5.5774, + "step": 274 + }, + { + "epoch": 0.028514214984219976, + "grad_norm": 0.63671875, + "learning_rate": 0.00019995857382204083, + "loss": 5.5795, + "step": 275 + }, + { + "epoch": 0.028617903038708047, + "grad_norm": 0.57421875, + "learning_rate": 0.00019995826060352728, + "loss": 5.5806, + "step": 276 + }, + { + "epoch": 0.02872159109319612, + "grad_norm": 0.70703125, + "learning_rate": 0.00019995794620561195, + "loss": 5.5304, + "step": 277 + }, + { + "epoch": 0.028825279147684194, + "grad_norm": 0.7421875, + "learning_rate": 0.0001999576306282985, + "loss": 5.594, + "step": 278 + }, + { + "epoch": 0.028928967202172265, + "grad_norm": 0.79296875, + "learning_rate": 0.00019995731387159067, + "loss": 5.5603, + "step": 279 + }, + { + "epoch": 0.029032655256660336, + "grad_norm": 1.0390625, + "learning_rate": 0.0001999569959354922, + "loss": 5.5512, + "step": 280 + }, + { + "epoch": 0.02913634331114841, + "grad_norm": 1.3125, + "learning_rate": 0.00019995667682000683, + "loss": 5.5618, + "step": 281 + }, + { + "epoch": 0.029240031365636483, + "grad_norm": 0.69921875, + "learning_rate": 0.00019995635652513835, + "loss": 5.5426, + "step": 282 + }, + { + "epoch": 0.029343719420124554, + "grad_norm": 0.69921875, + "learning_rate": 0.0001999560350508905, + "loss": 5.5263, + "step": 283 + }, + { + "epoch": 0.02944740747461263, + "grad_norm": 1.0859375, + "learning_rate": 0.0001999557123972671, + "loss": 5.5721, + "step": 284 + }, + { + "epoch": 0.0295510955291007, + "grad_norm": 1.53125, + "learning_rate": 0.00019995538856427196, + "loss": 5.5413, + "step": 285 + }, + { + "epoch": 0.02965478358358877, + "grad_norm": 0.546875, + "learning_rate": 0.00019995506355190889, + "loss": 5.5277, + "step": 286 + }, + { + "epoch": 0.029758471638076846, + "grad_norm": 1.34375, + "learning_rate": 0.00019995473736018172, + "loss": 5.5505, + "step": 287 + }, + { + "epoch": 0.029862159692564918, + "grad_norm": 1.0078125, + "learning_rate": 0.00019995440998909431, + "loss": 5.5775, + "step": 288 + }, + { + "epoch": 0.029965847747052993, + "grad_norm": 1.140625, + "learning_rate": 0.00019995408143865052, + "loss": 5.5016, + "step": 289 + }, + { + "epoch": 0.030069535801541064, + "grad_norm": 1.265625, + "learning_rate": 0.00019995375170885424, + "loss": 5.5683, + "step": 290 + }, + { + "epoch": 0.030173223856029135, + "grad_norm": 0.671875, + "learning_rate": 0.00019995342079970932, + "loss": 5.5217, + "step": 291 + }, + { + "epoch": 0.03027691191051721, + "grad_norm": 0.73828125, + "learning_rate": 0.00019995308871121971, + "loss": 5.562, + "step": 292 + }, + { + "epoch": 0.03038059996500528, + "grad_norm": 0.7109375, + "learning_rate": 0.00019995275544338928, + "loss": 5.5251, + "step": 293 + }, + { + "epoch": 0.030484288019493353, + "grad_norm": 0.875, + "learning_rate": 0.00019995242099622203, + "loss": 5.5147, + "step": 294 + }, + { + "epoch": 0.030587976073981428, + "grad_norm": 1.15625, + "learning_rate": 0.00019995208536972183, + "loss": 5.4956, + "step": 295 + }, + { + "epoch": 0.0306916641284695, + "grad_norm": 0.703125, + "learning_rate": 0.0001999517485638927, + "loss": 5.4974, + "step": 296 + }, + { + "epoch": 0.03079535218295757, + "grad_norm": 0.77734375, + "learning_rate": 0.00019995141057873857, + "loss": 5.4782, + "step": 297 + }, + { + "epoch": 0.030899040237445646, + "grad_norm": 0.9609375, + "learning_rate": 0.00019995107141426347, + "loss": 5.5044, + "step": 298 + }, + { + "epoch": 0.031002728291933717, + "grad_norm": 1.1328125, + "learning_rate": 0.00019995073107047134, + "loss": 5.5123, + "step": 299 + }, + { + "epoch": 0.03110641634642179, + "grad_norm": 0.82421875, + "learning_rate": 0.0001999503895473663, + "loss": 5.5219, + "step": 300 + }, + { + "epoch": 0.031210104400909863, + "grad_norm": 0.953125, + "learning_rate": 0.00019995004684495227, + "loss": 5.5151, + "step": 301 + }, + { + "epoch": 0.031313792455397935, + "grad_norm": 1.15625, + "learning_rate": 0.00019994970296323335, + "loss": 5.5151, + "step": 302 + }, + { + "epoch": 0.031417480509886006, + "grad_norm": 0.90234375, + "learning_rate": 0.00019994935790221358, + "loss": 5.5093, + "step": 303 + }, + { + "epoch": 0.03152116856437408, + "grad_norm": 0.83984375, + "learning_rate": 0.00019994901166189708, + "loss": 5.5052, + "step": 304 + }, + { + "epoch": 0.031624856618862156, + "grad_norm": 1.09375, + "learning_rate": 0.00019994866424228783, + "loss": 5.4662, + "step": 305 + }, + { + "epoch": 0.03172854467335023, + "grad_norm": 0.8671875, + "learning_rate": 0.00019994831564339004, + "loss": 5.4841, + "step": 306 + }, + { + "epoch": 0.0318322327278383, + "grad_norm": 0.79296875, + "learning_rate": 0.00019994796586520773, + "loss": 5.4731, + "step": 307 + }, + { + "epoch": 0.03193592078232637, + "grad_norm": 0.85546875, + "learning_rate": 0.00019994761490774513, + "loss": 5.4515, + "step": 308 + }, + { + "epoch": 0.03203960883681444, + "grad_norm": 0.66015625, + "learning_rate": 0.00019994726277100628, + "loss": 5.519, + "step": 309 + }, + { + "epoch": 0.03214329689130252, + "grad_norm": 0.62109375, + "learning_rate": 0.0001999469094549954, + "loss": 5.5203, + "step": 310 + }, + { + "epoch": 0.03224698494579059, + "grad_norm": 0.78515625, + "learning_rate": 0.0001999465549597166, + "loss": 5.5374, + "step": 311 + }, + { + "epoch": 0.03235067300027866, + "grad_norm": 0.62890625, + "learning_rate": 0.00019994619928517416, + "loss": 5.508, + "step": 312 + }, + { + "epoch": 0.032454361054766734, + "grad_norm": 0.6953125, + "learning_rate": 0.00019994584243137218, + "loss": 5.4988, + "step": 313 + }, + { + "epoch": 0.032558049109254805, + "grad_norm": 0.68359375, + "learning_rate": 0.00019994548439831487, + "loss": 5.4792, + "step": 314 + }, + { + "epoch": 0.03266173716374288, + "grad_norm": 0.462890625, + "learning_rate": 0.00019994512518600654, + "loss": 5.4493, + "step": 315 + }, + { + "epoch": 0.032765425218230955, + "grad_norm": 0.69921875, + "learning_rate": 0.0001999447647944514, + "loss": 5.4875, + "step": 316 + }, + { + "epoch": 0.032869113272719026, + "grad_norm": 0.55078125, + "learning_rate": 0.00019994440322365363, + "loss": 5.5292, + "step": 317 + }, + { + "epoch": 0.0329728013272071, + "grad_norm": 0.55859375, + "learning_rate": 0.00019994404047361756, + "loss": 5.4906, + "step": 318 + }, + { + "epoch": 0.03307648938169517, + "grad_norm": 0.404296875, + "learning_rate": 0.00019994367654434746, + "loss": 5.461, + "step": 319 + }, + { + "epoch": 0.03318017743618324, + "grad_norm": 0.470703125, + "learning_rate": 0.00019994331143584763, + "loss": 5.439, + "step": 320 + }, + { + "epoch": 0.03328386549067131, + "grad_norm": 0.515625, + "learning_rate": 0.00019994294514812238, + "loss": 5.451, + "step": 321 + }, + { + "epoch": 0.03338755354515939, + "grad_norm": 0.5703125, + "learning_rate": 0.00019994257768117602, + "loss": 5.4436, + "step": 322 + }, + { + "epoch": 0.03349124159964746, + "grad_norm": 0.609375, + "learning_rate": 0.0001999422090350129, + "loss": 5.4283, + "step": 323 + }, + { + "epoch": 0.03359492965413553, + "grad_norm": 0.6875, + "learning_rate": 0.0001999418392096373, + "loss": 5.4542, + "step": 324 + }, + { + "epoch": 0.033698617708623604, + "grad_norm": 0.5625, + "learning_rate": 0.0001999414682050537, + "loss": 5.4371, + "step": 325 + }, + { + "epoch": 0.033802305763111676, + "grad_norm": 0.431640625, + "learning_rate": 0.00019994109602126638, + "loss": 5.4293, + "step": 326 + }, + { + "epoch": 0.033905993817599754, + "grad_norm": 0.59765625, + "learning_rate": 0.00019994072265827977, + "loss": 5.4883, + "step": 327 + }, + { + "epoch": 0.034009681872087826, + "grad_norm": 0.66015625, + "learning_rate": 0.0001999403481160983, + "loss": 5.4215, + "step": 328 + }, + { + "epoch": 0.0341133699265759, + "grad_norm": 0.80859375, + "learning_rate": 0.00019993997239472634, + "loss": 5.4048, + "step": 329 + }, + { + "epoch": 0.03421705798106397, + "grad_norm": 1.0703125, + "learning_rate": 0.00019993959549416835, + "loss": 5.4454, + "step": 330 + }, + { + "epoch": 0.03432074603555204, + "grad_norm": 1.4921875, + "learning_rate": 0.00019993921741442877, + "loss": 5.4441, + "step": 331 + }, + { + "epoch": 0.03442443409004011, + "grad_norm": 0.56640625, + "learning_rate": 0.0001999388381555121, + "loss": 5.4147, + "step": 332 + }, + { + "epoch": 0.03452812214452819, + "grad_norm": 0.92578125, + "learning_rate": 0.00019993845771742276, + "loss": 5.4387, + "step": 333 + }, + { + "epoch": 0.03463181019901626, + "grad_norm": 1.921875, + "learning_rate": 0.00019993807610016524, + "loss": 5.3842, + "step": 334 + }, + { + "epoch": 0.03473549825350433, + "grad_norm": 0.921875, + "learning_rate": 0.00019993769330374408, + "loss": 5.4189, + "step": 335 + }, + { + "epoch": 0.034839186307992404, + "grad_norm": 5.53125, + "learning_rate": 0.00019993730932816377, + "loss": 5.5256, + "step": 336 + }, + { + "epoch": 0.034942874362480475, + "grad_norm": 5.0, + "learning_rate": 0.00019993692417342884, + "loss": 5.5175, + "step": 337 + }, + { + "epoch": 0.03504656241696855, + "grad_norm": 1.28125, + "learning_rate": 0.00019993653783954388, + "loss": 5.4119, + "step": 338 + }, + { + "epoch": 0.035150250471456625, + "grad_norm": 3.4375, + "learning_rate": 0.00019993615032651337, + "loss": 5.5242, + "step": 339 + }, + { + "epoch": 0.035253938525944696, + "grad_norm": 3.15625, + "learning_rate": 0.00019993576163434193, + "loss": 5.53, + "step": 340 + }, + { + "epoch": 0.03535762658043277, + "grad_norm": 1.5546875, + "learning_rate": 0.00019993537176303416, + "loss": 5.4046, + "step": 341 + }, + { + "epoch": 0.03546131463492084, + "grad_norm": 2.59375, + "learning_rate": 0.00019993498071259463, + "loss": 5.4244, + "step": 342 + }, + { + "epoch": 0.03556500268940891, + "grad_norm": 2.28125, + "learning_rate": 0.00019993458848302796, + "loss": 5.4798, + "step": 343 + }, + { + "epoch": 0.03566869074389699, + "grad_norm": 1.484375, + "learning_rate": 0.00019993419507433876, + "loss": 5.4492, + "step": 344 + }, + { + "epoch": 0.03577237879838506, + "grad_norm": 1.2109375, + "learning_rate": 0.00019993380048653175, + "loss": 5.4628, + "step": 345 + }, + { + "epoch": 0.03587606685287313, + "grad_norm": 1.34375, + "learning_rate": 0.0001999334047196115, + "loss": 5.4155, + "step": 346 + }, + { + "epoch": 0.0359797549073612, + "grad_norm": 1.2578125, + "learning_rate": 0.00019993300777358268, + "loss": 5.4204, + "step": 347 + }, + { + "epoch": 0.036083442961849274, + "grad_norm": 0.93359375, + "learning_rate": 0.00019993260964845, + "loss": 5.448, + "step": 348 + }, + { + "epoch": 0.03618713101633735, + "grad_norm": 1.046875, + "learning_rate": 0.0001999322103442182, + "loss": 5.4505, + "step": 349 + }, + { + "epoch": 0.036290819070825424, + "grad_norm": 1.109375, + "learning_rate": 0.00019993180986089192, + "loss": 5.4001, + "step": 350 + }, + { + "epoch": 0.036394507125313495, + "grad_norm": 0.96875, + "learning_rate": 0.00019993140819847595, + "loss": 5.3998, + "step": 351 + }, + { + "epoch": 0.03649819517980157, + "grad_norm": 0.97265625, + "learning_rate": 0.00019993100535697496, + "loss": 5.4476, + "step": 352 + }, + { + "epoch": 0.03660188323428964, + "grad_norm": 0.99609375, + "learning_rate": 0.00019993060133639376, + "loss": 5.4657, + "step": 353 + }, + { + "epoch": 0.03670557128877771, + "grad_norm": 0.99609375, + "learning_rate": 0.00019993019613673708, + "loss": 5.4182, + "step": 354 + }, + { + "epoch": 0.03680925934326579, + "grad_norm": 0.7578125, + "learning_rate": 0.00019992978975800972, + "loss": 5.4388, + "step": 355 + }, + { + "epoch": 0.03691294739775386, + "grad_norm": 0.76171875, + "learning_rate": 0.0001999293822002165, + "loss": 5.4333, + "step": 356 + }, + { + "epoch": 0.03701663545224193, + "grad_norm": 0.75, + "learning_rate": 0.00019992897346336218, + "loss": 5.4078, + "step": 357 + }, + { + "epoch": 0.03712032350673, + "grad_norm": 0.81640625, + "learning_rate": 0.00019992856354745158, + "loss": 5.3843, + "step": 358 + }, + { + "epoch": 0.03722401156121807, + "grad_norm": 0.6640625, + "learning_rate": 0.00019992815245248958, + "loss": 5.4116, + "step": 359 + }, + { + "epoch": 0.037327699615706145, + "grad_norm": 0.62109375, + "learning_rate": 0.000199927740178481, + "loss": 5.3862, + "step": 360 + }, + { + "epoch": 0.03743138767019422, + "grad_norm": 0.52734375, + "learning_rate": 0.00019992732672543073, + "loss": 5.3852, + "step": 361 + }, + { + "epoch": 0.037535075724682294, + "grad_norm": 0.6640625, + "learning_rate": 0.00019992691209334362, + "loss": 5.3804, + "step": 362 + }, + { + "epoch": 0.037638763779170366, + "grad_norm": 0.494140625, + "learning_rate": 0.0001999264962822246, + "loss": 5.3745, + "step": 363 + }, + { + "epoch": 0.03774245183365844, + "grad_norm": 0.58203125, + "learning_rate": 0.00019992607929207853, + "loss": 5.3948, + "step": 364 + }, + { + "epoch": 0.03784613988814651, + "grad_norm": 0.48828125, + "learning_rate": 0.00019992566112291034, + "loss": 5.384, + "step": 365 + }, + { + "epoch": 0.03794982794263459, + "grad_norm": 0.51953125, + "learning_rate": 0.000199925241774725, + "loss": 5.3606, + "step": 366 + }, + { + "epoch": 0.03805351599712266, + "grad_norm": 0.5703125, + "learning_rate": 0.0001999248212475274, + "loss": 5.4083, + "step": 367 + }, + { + "epoch": 0.03815720405161073, + "grad_norm": 0.439453125, + "learning_rate": 0.00019992439954132256, + "loss": 5.3636, + "step": 368 + }, + { + "epoch": 0.0382608921060988, + "grad_norm": 0.55078125, + "learning_rate": 0.00019992397665611543, + "loss": 5.3986, + "step": 369 + }, + { + "epoch": 0.03836458016058687, + "grad_norm": 0.46484375, + "learning_rate": 0.00019992355259191097, + "loss": 5.4029, + "step": 370 + }, + { + "epoch": 0.038468268215074944, + "grad_norm": 0.51171875, + "learning_rate": 0.00019992312734871425, + "loss": 5.405, + "step": 371 + }, + { + "epoch": 0.03857195626956302, + "grad_norm": 0.484375, + "learning_rate": 0.00019992270092653022, + "loss": 5.3392, + "step": 372 + }, + { + "epoch": 0.038675644324051094, + "grad_norm": 0.48828125, + "learning_rate": 0.00019992227332536397, + "loss": 5.3901, + "step": 373 + }, + { + "epoch": 0.038779332378539165, + "grad_norm": 0.439453125, + "learning_rate": 0.00019992184454522053, + "loss": 5.3694, + "step": 374 + }, + { + "epoch": 0.038883020433027236, + "grad_norm": 0.5078125, + "learning_rate": 0.0001999214145861049, + "loss": 5.359, + "step": 375 + }, + { + "epoch": 0.03898670848751531, + "grad_norm": 0.4609375, + "learning_rate": 0.00019992098344802223, + "loss": 5.3545, + "step": 376 + }, + { + "epoch": 0.039090396542003386, + "grad_norm": 0.41796875, + "learning_rate": 0.00019992055113097755, + "loss": 5.3735, + "step": 377 + }, + { + "epoch": 0.03919408459649146, + "grad_norm": 0.46484375, + "learning_rate": 0.000199920117634976, + "loss": 5.3751, + "step": 378 + }, + { + "epoch": 0.03929777265097953, + "grad_norm": 0.447265625, + "learning_rate": 0.0001999196829600227, + "loss": 5.2899, + "step": 379 + }, + { + "epoch": 0.0394014607054676, + "grad_norm": 0.478515625, + "learning_rate": 0.00019991924710612276, + "loss": 5.3484, + "step": 380 + }, + { + "epoch": 0.03950514875995567, + "grad_norm": 0.39453125, + "learning_rate": 0.00019991881007328131, + "loss": 5.3171, + "step": 381 + }, + { + "epoch": 0.03960883681444374, + "grad_norm": 0.494140625, + "learning_rate": 0.0001999183718615035, + "loss": 5.3399, + "step": 382 + }, + { + "epoch": 0.03971252486893182, + "grad_norm": 0.345703125, + "learning_rate": 0.00019991793247079457, + "loss": 5.3487, + "step": 383 + }, + { + "epoch": 0.03981621292341989, + "grad_norm": 0.416015625, + "learning_rate": 0.00019991749190115962, + "loss": 5.335, + "step": 384 + }, + { + "epoch": 0.039919900977907964, + "grad_norm": 0.435546875, + "learning_rate": 0.0001999170501526039, + "loss": 5.333, + "step": 385 + }, + { + "epoch": 0.040023589032396036, + "grad_norm": 0.439453125, + "learning_rate": 0.00019991660722513258, + "loss": 5.3132, + "step": 386 + }, + { + "epoch": 0.04012727708688411, + "grad_norm": 0.365234375, + "learning_rate": 0.00019991616311875092, + "loss": 5.3834, + "step": 387 + }, + { + "epoch": 0.04023096514137218, + "grad_norm": 0.443359375, + "learning_rate": 0.00019991571783346416, + "loss": 5.3623, + "step": 388 + }, + { + "epoch": 0.04033465319586026, + "grad_norm": 0.50390625, + "learning_rate": 0.00019991527136927753, + "loss": 5.3597, + "step": 389 + }, + { + "epoch": 0.04043834125034833, + "grad_norm": 0.439453125, + "learning_rate": 0.00019991482372619634, + "loss": 5.3423, + "step": 390 + }, + { + "epoch": 0.0405420293048364, + "grad_norm": 0.421875, + "learning_rate": 0.0001999143749042258, + "loss": 5.3152, + "step": 391 + }, + { + "epoch": 0.04064571735932447, + "grad_norm": 0.57421875, + "learning_rate": 0.00019991392490337128, + "loss": 5.3344, + "step": 392 + }, + { + "epoch": 0.04074940541381254, + "grad_norm": 0.55859375, + "learning_rate": 0.00019991347372363806, + "loss": 5.3016, + "step": 393 + }, + { + "epoch": 0.04085309346830062, + "grad_norm": 0.451171875, + "learning_rate": 0.00019991302136503148, + "loss": 5.3684, + "step": 394 + }, + { + "epoch": 0.04095678152278869, + "grad_norm": 0.447265625, + "learning_rate": 0.00019991256782755684, + "loss": 5.3299, + "step": 395 + }, + { + "epoch": 0.04106046957727676, + "grad_norm": 0.56640625, + "learning_rate": 0.0001999121131112195, + "loss": 5.2799, + "step": 396 + }, + { + "epoch": 0.041164157631764835, + "grad_norm": 0.486328125, + "learning_rate": 0.00019991165721602484, + "loss": 5.3184, + "step": 397 + }, + { + "epoch": 0.041267845686252906, + "grad_norm": 0.5546875, + "learning_rate": 0.00019991120014197828, + "loss": 5.3399, + "step": 398 + }, + { + "epoch": 0.04137153374074098, + "grad_norm": 0.46875, + "learning_rate": 0.00019991074188908513, + "loss": 5.2717, + "step": 399 + }, + { + "epoch": 0.041475221795229056, + "grad_norm": 0.546875, + "learning_rate": 0.00019991028245735083, + "loss": 5.344, + "step": 400 + }, + { + "epoch": 0.04157890984971713, + "grad_norm": 0.5703125, + "learning_rate": 0.00019990982184678086, + "loss": 5.3392, + "step": 401 + }, + { + "epoch": 0.0416825979042052, + "grad_norm": 0.48046875, + "learning_rate": 0.00019990936005738052, + "loss": 5.262, + "step": 402 + }, + { + "epoch": 0.04178628595869327, + "grad_norm": 0.69140625, + "learning_rate": 0.0001999088970891554, + "loss": 5.3249, + "step": 403 + }, + { + "epoch": 0.04188997401318134, + "grad_norm": 0.53515625, + "learning_rate": 0.00019990843294211087, + "loss": 5.3172, + "step": 404 + }, + { + "epoch": 0.04199366206766942, + "grad_norm": 0.578125, + "learning_rate": 0.00019990796761625246, + "loss": 5.3205, + "step": 405 + }, + { + "epoch": 0.04209735012215749, + "grad_norm": 0.5859375, + "learning_rate": 0.00019990750111158564, + "loss": 5.3259, + "step": 406 + }, + { + "epoch": 0.04220103817664556, + "grad_norm": 0.5625, + "learning_rate": 0.0001999070334281159, + "loss": 5.3271, + "step": 407 + }, + { + "epoch": 0.042304726231133634, + "grad_norm": 0.478515625, + "learning_rate": 0.00019990656456584876, + "loss": 5.3035, + "step": 408 + }, + { + "epoch": 0.042408414285621705, + "grad_norm": 0.51171875, + "learning_rate": 0.0001999060945247898, + "loss": 5.2843, + "step": 409 + }, + { + "epoch": 0.04251210234010978, + "grad_norm": 0.57421875, + "learning_rate": 0.0001999056233049445, + "loss": 5.2869, + "step": 410 + }, + { + "epoch": 0.042615790394597855, + "grad_norm": 0.5703125, + "learning_rate": 0.00019990515090631848, + "loss": 5.2991, + "step": 411 + }, + { + "epoch": 0.042719478449085926, + "grad_norm": 0.70703125, + "learning_rate": 0.00019990467732891725, + "loss": 5.2575, + "step": 412 + }, + { + "epoch": 0.042823166503574, + "grad_norm": 0.6484375, + "learning_rate": 0.00019990420257274643, + "loss": 5.3164, + "step": 413 + }, + { + "epoch": 0.04292685455806207, + "grad_norm": 0.6015625, + "learning_rate": 0.00019990372663781166, + "loss": 5.2825, + "step": 414 + }, + { + "epoch": 0.04303054261255014, + "grad_norm": 0.625, + "learning_rate": 0.00019990324952411846, + "loss": 5.2654, + "step": 415 + }, + { + "epoch": 0.04313423066703822, + "grad_norm": 0.53515625, + "learning_rate": 0.00019990277123167258, + "loss": 5.2965, + "step": 416 + }, + { + "epoch": 0.04323791872152629, + "grad_norm": 0.62109375, + "learning_rate": 0.00019990229176047958, + "loss": 5.2539, + "step": 417 + }, + { + "epoch": 0.04334160677601436, + "grad_norm": 0.71484375, + "learning_rate": 0.0001999018111105451, + "loss": 5.2908, + "step": 418 + }, + { + "epoch": 0.04344529483050243, + "grad_norm": 0.828125, + "learning_rate": 0.0001999013292818749, + "loss": 5.2979, + "step": 419 + }, + { + "epoch": 0.043548982884990504, + "grad_norm": 0.7578125, + "learning_rate": 0.0001999008462744746, + "loss": 5.2747, + "step": 420 + }, + { + "epoch": 0.043652670939478576, + "grad_norm": 0.4765625, + "learning_rate": 0.00019990036208834992, + "loss": 5.3147, + "step": 421 + }, + { + "epoch": 0.043756358993966654, + "grad_norm": 0.486328125, + "learning_rate": 0.00019989987672350656, + "loss": 5.2847, + "step": 422 + }, + { + "epoch": 0.043860047048454726, + "grad_norm": 0.67578125, + "learning_rate": 0.00019989939017995024, + "loss": 5.2892, + "step": 423 + }, + { + "epoch": 0.0439637351029428, + "grad_norm": 0.73046875, + "learning_rate": 0.00019989890245768673, + "loss": 5.2348, + "step": 424 + }, + { + "epoch": 0.04406742315743087, + "grad_norm": 0.70703125, + "learning_rate": 0.00019989841355672178, + "loss": 5.2523, + "step": 425 + }, + { + "epoch": 0.04417111121191894, + "grad_norm": 0.58984375, + "learning_rate": 0.00019989792347706114, + "loss": 5.2648, + "step": 426 + }, + { + "epoch": 0.04427479926640701, + "grad_norm": 0.51953125, + "learning_rate": 0.00019989743221871057, + "loss": 5.2998, + "step": 427 + }, + { + "epoch": 0.04437848732089509, + "grad_norm": 0.66015625, + "learning_rate": 0.00019989693978167595, + "loss": 5.1773, + "step": 428 + }, + { + "epoch": 0.04448217537538316, + "grad_norm": 0.58203125, + "learning_rate": 0.00019989644616596298, + "loss": 5.2602, + "step": 429 + }, + { + "epoch": 0.04458586342987123, + "grad_norm": 0.5625, + "learning_rate": 0.00019989595137157758, + "loss": 5.2278, + "step": 430 + }, + { + "epoch": 0.044689551484359304, + "grad_norm": 0.6484375, + "learning_rate": 0.0001998954553985255, + "loss": 5.2847, + "step": 431 + }, + { + "epoch": 0.044793239538847375, + "grad_norm": 0.8828125, + "learning_rate": 0.0001998949582468127, + "loss": 5.2977, + "step": 432 + }, + { + "epoch": 0.04489692759333545, + "grad_norm": 1.0625, + "learning_rate": 0.00019989445991644496, + "loss": 5.2542, + "step": 433 + }, + { + "epoch": 0.045000615647823525, + "grad_norm": 1.421875, + "learning_rate": 0.00019989396040742818, + "loss": 5.2308, + "step": 434 + }, + { + "epoch": 0.045104303702311596, + "grad_norm": 0.81640625, + "learning_rate": 0.00019989345971976828, + "loss": 5.2806, + "step": 435 + }, + { + "epoch": 0.04520799175679967, + "grad_norm": 0.9140625, + "learning_rate": 0.00019989295785347112, + "loss": 5.2863, + "step": 436 + }, + { + "epoch": 0.04531167981128774, + "grad_norm": 1.21875, + "learning_rate": 0.00019989245480854265, + "loss": 5.1997, + "step": 437 + }, + { + "epoch": 0.04541536786577581, + "grad_norm": 0.88671875, + "learning_rate": 0.00019989195058498882, + "loss": 5.264, + "step": 438 + }, + { + "epoch": 0.04551905592026389, + "grad_norm": 0.89453125, + "learning_rate": 0.00019989144518281558, + "loss": 5.2664, + "step": 439 + }, + { + "epoch": 0.04562274397475196, + "grad_norm": 0.84375, + "learning_rate": 0.00019989093860202885, + "loss": 5.2437, + "step": 440 + }, + { + "epoch": 0.04572643202924003, + "grad_norm": 0.63671875, + "learning_rate": 0.00019989043084263464, + "loss": 5.2767, + "step": 441 + }, + { + "epoch": 0.0458301200837281, + "grad_norm": 0.79296875, + "learning_rate": 0.00019988992190463894, + "loss": 5.2763, + "step": 442 + }, + { + "epoch": 0.045933808138216174, + "grad_norm": 0.9375, + "learning_rate": 0.00019988941178804775, + "loss": 5.24, + "step": 443 + }, + { + "epoch": 0.04603749619270425, + "grad_norm": 1.0625, + "learning_rate": 0.00019988890049286705, + "loss": 5.2116, + "step": 444 + }, + { + "epoch": 0.046141184247192324, + "grad_norm": 1.34375, + "learning_rate": 0.00019988838801910297, + "loss": 5.269, + "step": 445 + }, + { + "epoch": 0.046244872301680395, + "grad_norm": 0.63671875, + "learning_rate": 0.00019988787436676147, + "loss": 5.2098, + "step": 446 + }, + { + "epoch": 0.04634856035616847, + "grad_norm": 0.765625, + "learning_rate": 0.00019988735953584862, + "loss": 5.2502, + "step": 447 + }, + { + "epoch": 0.04645224841065654, + "grad_norm": 1.203125, + "learning_rate": 0.00019988684352637056, + "loss": 5.2842, + "step": 448 + }, + { + "epoch": 0.04655593646514461, + "grad_norm": 1.015625, + "learning_rate": 0.0001998863263383333, + "loss": 5.2292, + "step": 449 + }, + { + "epoch": 0.04665962451963269, + "grad_norm": 1.1953125, + "learning_rate": 0.00019988580797174297, + "loss": 5.2584, + "step": 450 + }, + { + "epoch": 0.04676331257412076, + "grad_norm": 0.69140625, + "learning_rate": 0.0001998852884266057, + "loss": 5.1916, + "step": 451 + }, + { + "epoch": 0.04686700062860883, + "grad_norm": 0.79296875, + "learning_rate": 0.00019988476770292762, + "loss": 5.2374, + "step": 452 + }, + { + "epoch": 0.0469706886830969, + "grad_norm": 1.21875, + "learning_rate": 0.00019988424580071485, + "loss": 5.2478, + "step": 453 + }, + { + "epoch": 0.04707437673758497, + "grad_norm": 0.75, + "learning_rate": 0.00019988372271997356, + "loss": 5.2032, + "step": 454 + }, + { + "epoch": 0.04717806479207305, + "grad_norm": 0.7265625, + "learning_rate": 0.0001998831984607099, + "loss": 5.2208, + "step": 455 + }, + { + "epoch": 0.04728175284656112, + "grad_norm": 0.71484375, + "learning_rate": 0.00019988267302293013, + "loss": 5.2389, + "step": 456 + }, + { + "epoch": 0.047385440901049194, + "grad_norm": 0.703125, + "learning_rate": 0.00019988214640664036, + "loss": 5.2707, + "step": 457 + }, + { + "epoch": 0.047489128955537266, + "grad_norm": 0.796875, + "learning_rate": 0.00019988161861184687, + "loss": 5.221, + "step": 458 + }, + { + "epoch": 0.04759281701002534, + "grad_norm": 0.515625, + "learning_rate": 0.00019988108963855586, + "loss": 5.1917, + "step": 459 + }, + { + "epoch": 0.04769650506451341, + "grad_norm": 0.61328125, + "learning_rate": 0.00019988055948677355, + "loss": 5.2225, + "step": 460 + }, + { + "epoch": 0.04780019311900149, + "grad_norm": 0.57421875, + "learning_rate": 0.00019988002815650622, + "loss": 5.1813, + "step": 461 + }, + { + "epoch": 0.04790388117348956, + "grad_norm": 0.72265625, + "learning_rate": 0.00019987949564776014, + "loss": 5.2486, + "step": 462 + }, + { + "epoch": 0.04800756922797763, + "grad_norm": 0.83984375, + "learning_rate": 0.0001998789619605416, + "loss": 5.2211, + "step": 463 + }, + { + "epoch": 0.0481112572824657, + "grad_norm": 0.81640625, + "learning_rate": 0.00019987842709485686, + "loss": 5.2175, + "step": 464 + }, + { + "epoch": 0.04821494533695377, + "grad_norm": 0.6796875, + "learning_rate": 0.0001998778910507123, + "loss": 5.2004, + "step": 465 + }, + { + "epoch": 0.048318633391441844, + "grad_norm": 0.51953125, + "learning_rate": 0.00019987735382811416, + "loss": 5.2169, + "step": 466 + }, + { + "epoch": 0.04842232144592992, + "grad_norm": 0.45703125, + "learning_rate": 0.0001998768154270688, + "loss": 5.165, + "step": 467 + }, + { + "epoch": 0.048526009500417994, + "grad_norm": 0.53515625, + "learning_rate": 0.00019987627584758263, + "loss": 5.2305, + "step": 468 + }, + { + "epoch": 0.048629697554906065, + "grad_norm": 0.439453125, + "learning_rate": 0.00019987573508966199, + "loss": 5.2377, + "step": 469 + }, + { + "epoch": 0.048733385609394136, + "grad_norm": 0.5234375, + "learning_rate": 0.00019987519315331324, + "loss": 5.2023, + "step": 470 + }, + { + "epoch": 0.04883707366388221, + "grad_norm": 0.49609375, + "learning_rate": 0.00019987465003854275, + "loss": 5.2201, + "step": 471 + }, + { + "epoch": 0.048940761718370286, + "grad_norm": 0.59765625, + "learning_rate": 0.000199874105745357, + "loss": 5.1884, + "step": 472 + }, + { + "epoch": 0.04904444977285836, + "grad_norm": 0.4609375, + "learning_rate": 0.00019987356027376238, + "loss": 5.1698, + "step": 473 + }, + { + "epoch": 0.04914813782734643, + "grad_norm": 0.46875, + "learning_rate": 0.0001998730136237653, + "loss": 5.2097, + "step": 474 + }, + { + "epoch": 0.0492518258818345, + "grad_norm": 0.609375, + "learning_rate": 0.00019987246579537222, + "loss": 5.2154, + "step": 475 + }, + { + "epoch": 0.04935551393632257, + "grad_norm": 0.6328125, + "learning_rate": 0.00019987191678858964, + "loss": 5.1997, + "step": 476 + }, + { + "epoch": 0.04945920199081064, + "grad_norm": 0.62890625, + "learning_rate": 0.00019987136660342398, + "loss": 5.2161, + "step": 477 + }, + { + "epoch": 0.04956289004529872, + "grad_norm": 0.6875, + "learning_rate": 0.00019987081523988178, + "loss": 5.2113, + "step": 478 + }, + { + "epoch": 0.04966657809978679, + "grad_norm": 0.68359375, + "learning_rate": 0.00019987026269796952, + "loss": 5.1944, + "step": 479 + }, + { + "epoch": 0.049770266154274864, + "grad_norm": 0.57421875, + "learning_rate": 0.00019986970897769375, + "loss": 5.1795, + "step": 480 + }, + { + "epoch": 0.049873954208762936, + "grad_norm": 0.64453125, + "learning_rate": 0.00019986915407906096, + "loss": 5.2348, + "step": 481 + }, + { + "epoch": 0.04997764226325101, + "grad_norm": 0.83203125, + "learning_rate": 0.00019986859800207772, + "loss": 5.1926, + "step": 482 + }, + { + "epoch": 0.050081330317739085, + "grad_norm": 0.87890625, + "learning_rate": 0.00019986804074675058, + "loss": 5.2065, + "step": 483 + }, + { + "epoch": 0.05018501837222716, + "grad_norm": 0.76953125, + "learning_rate": 0.00019986748231308615, + "loss": 5.1956, + "step": 484 + }, + { + "epoch": 0.05028870642671523, + "grad_norm": 0.66796875, + "learning_rate": 0.00019986692270109098, + "loss": 5.1497, + "step": 485 + }, + { + "epoch": 0.0503923944812033, + "grad_norm": 0.6796875, + "learning_rate": 0.00019986636191077168, + "loss": 5.1756, + "step": 486 + }, + { + "epoch": 0.05049608253569137, + "grad_norm": 0.76953125, + "learning_rate": 0.00019986579994213486, + "loss": 5.2124, + "step": 487 + }, + { + "epoch": 0.05059977059017944, + "grad_norm": 0.9375, + "learning_rate": 0.00019986523679518722, + "loss": 5.1792, + "step": 488 + }, + { + "epoch": 0.05070345864466752, + "grad_norm": 1.09375, + "learning_rate": 0.00019986467246993527, + "loss": 5.1996, + "step": 489 + }, + { + "epoch": 0.05080714669915559, + "grad_norm": 0.99609375, + "learning_rate": 0.0001998641069663858, + "loss": 5.2116, + "step": 490 + }, + { + "epoch": 0.05091083475364366, + "grad_norm": 1.1875, + "learning_rate": 0.00019986354028454542, + "loss": 5.1607, + "step": 491 + }, + { + "epoch": 0.051014522808131735, + "grad_norm": 0.8828125, + "learning_rate": 0.0001998629724244208, + "loss": 5.1851, + "step": 492 + }, + { + "epoch": 0.051118210862619806, + "grad_norm": 0.890625, + "learning_rate": 0.00019986240338601869, + "loss": 5.196, + "step": 493 + }, + { + "epoch": 0.05122189891710788, + "grad_norm": 1.1953125, + "learning_rate": 0.00019986183316934576, + "loss": 5.1985, + "step": 494 + }, + { + "epoch": 0.051325586971595956, + "grad_norm": 0.96484375, + "learning_rate": 0.0001998612617744088, + "loss": 5.1647, + "step": 495 + }, + { + "epoch": 0.05142927502608403, + "grad_norm": 0.97265625, + "learning_rate": 0.0001998606892012145, + "loss": 5.205, + "step": 496 + }, + { + "epoch": 0.0515329630805721, + "grad_norm": 1.1171875, + "learning_rate": 0.00019986011544976956, + "loss": 5.2331, + "step": 497 + }, + { + "epoch": 0.05163665113506017, + "grad_norm": 0.87890625, + "learning_rate": 0.00019985954052008085, + "loss": 5.1625, + "step": 498 + }, + { + "epoch": 0.05174033918954824, + "grad_norm": 0.7578125, + "learning_rate": 0.00019985896441215514, + "loss": 5.1805, + "step": 499 + }, + { + "epoch": 0.05184402724403632, + "grad_norm": 0.7421875, + "learning_rate": 0.0001998583871259992, + "loss": 5.1823, + "step": 500 + }, + { + "epoch": 0.05194771529852439, + "grad_norm": 0.71484375, + "learning_rate": 0.00019985780866161985, + "loss": 5.1178, + "step": 501 + }, + { + "epoch": 0.05205140335301246, + "grad_norm": 0.76953125, + "learning_rate": 0.00019985722901902389, + "loss": 5.1614, + "step": 502 + }, + { + "epoch": 0.052155091407500534, + "grad_norm": 0.78515625, + "learning_rate": 0.0001998566481982182, + "loss": 5.178, + "step": 503 + }, + { + "epoch": 0.052258779461988605, + "grad_norm": 0.80078125, + "learning_rate": 0.0001998560661992096, + "loss": 5.1482, + "step": 504 + }, + { + "epoch": 0.05236246751647668, + "grad_norm": 0.74609375, + "learning_rate": 0.00019985548302200497, + "loss": 5.1702, + "step": 505 + }, + { + "epoch": 0.052466155570964755, + "grad_norm": 0.78515625, + "learning_rate": 0.0001998548986666112, + "loss": 5.165, + "step": 506 + }, + { + "epoch": 0.052569843625452826, + "grad_norm": 0.7421875, + "learning_rate": 0.00019985431313303517, + "loss": 5.1466, + "step": 507 + }, + { + "epoch": 0.0526735316799409, + "grad_norm": 0.5546875, + "learning_rate": 0.00019985372642128383, + "loss": 5.183, + "step": 508 + }, + { + "epoch": 0.05277721973442897, + "grad_norm": 0.6328125, + "learning_rate": 0.00019985313853136403, + "loss": 5.1417, + "step": 509 + }, + { + "epoch": 0.05288090778891704, + "grad_norm": 0.69140625, + "learning_rate": 0.00019985254946328274, + "loss": 5.1574, + "step": 510 + }, + { + "epoch": 0.05298459584340512, + "grad_norm": 0.625, + "learning_rate": 0.00019985195921704696, + "loss": 5.1722, + "step": 511 + }, + { + "epoch": 0.05308828389789319, + "grad_norm": 0.58203125, + "learning_rate": 0.0001998513677926636, + "loss": 5.163, + "step": 512 + }, + { + "epoch": 0.05319197195238126, + "grad_norm": 0.69140625, + "learning_rate": 0.0001998507751901396, + "loss": 5.1497, + "step": 513 + }, + { + "epoch": 0.05329566000686933, + "grad_norm": 0.69140625, + "learning_rate": 0.000199850181409482, + "loss": 5.1216, + "step": 514 + }, + { + "epoch": 0.053399348061357405, + "grad_norm": 0.8359375, + "learning_rate": 0.00019984958645069786, + "loss": 5.1793, + "step": 515 + }, + { + "epoch": 0.053503036115845476, + "grad_norm": 0.9453125, + "learning_rate": 0.0001998489903137941, + "loss": 5.1748, + "step": 516 + }, + { + "epoch": 0.053606724170333554, + "grad_norm": 1.1484375, + "learning_rate": 0.0001998483929987778, + "loss": 5.1428, + "step": 517 + }, + { + "epoch": 0.053710412224821626, + "grad_norm": 0.875, + "learning_rate": 0.00019984779450565605, + "loss": 5.1818, + "step": 518 + }, + { + "epoch": 0.0538141002793097, + "grad_norm": 0.77734375, + "learning_rate": 0.00019984719483443587, + "loss": 5.1619, + "step": 519 + }, + { + "epoch": 0.05391778833379777, + "grad_norm": 0.796875, + "learning_rate": 0.0001998465939851243, + "loss": 5.1613, + "step": 520 + }, + { + "epoch": 0.05402147638828584, + "grad_norm": 0.82421875, + "learning_rate": 0.00019984599195772845, + "loss": 5.1229, + "step": 521 + }, + { + "epoch": 0.05412516444277392, + "grad_norm": 0.9453125, + "learning_rate": 0.00019984538875225547, + "loss": 5.1587, + "step": 522 + }, + { + "epoch": 0.05422885249726199, + "grad_norm": 1.21875, + "learning_rate": 0.00019984478436871244, + "loss": 5.163, + "step": 523 + }, + { + "epoch": 0.05433254055175006, + "grad_norm": 0.86328125, + "learning_rate": 0.00019984417880710646, + "loss": 5.1997, + "step": 524 + }, + { + "epoch": 0.05443622860623813, + "grad_norm": 0.85546875, + "learning_rate": 0.00019984357206744474, + "loss": 5.1593, + "step": 525 + }, + { + "epoch": 0.054539916660726204, + "grad_norm": 0.81640625, + "learning_rate": 0.0001998429641497344, + "loss": 5.1307, + "step": 526 + }, + { + "epoch": 0.054643604715214275, + "grad_norm": 0.6015625, + "learning_rate": 0.00019984235505398262, + "loss": 5.0949, + "step": 527 + }, + { + "epoch": 0.05474729276970235, + "grad_norm": 0.63671875, + "learning_rate": 0.0001998417447801966, + "loss": 5.1336, + "step": 528 + }, + { + "epoch": 0.054850980824190425, + "grad_norm": 0.63671875, + "learning_rate": 0.00019984113332838352, + "loss": 5.1222, + "step": 529 + }, + { + "epoch": 0.054954668878678496, + "grad_norm": 0.7578125, + "learning_rate": 0.0001998405206985506, + "loss": 5.1132, + "step": 530 + }, + { + "epoch": 0.05505835693316657, + "grad_norm": 0.77734375, + "learning_rate": 0.00019983990689070508, + "loss": 5.1254, + "step": 531 + }, + { + "epoch": 0.05516204498765464, + "grad_norm": 0.671875, + "learning_rate": 0.00019983929190485423, + "loss": 5.1434, + "step": 532 + }, + { + "epoch": 0.05526573304214271, + "grad_norm": 0.67578125, + "learning_rate": 0.0001998386757410052, + "loss": 5.0891, + "step": 533 + }, + { + "epoch": 0.05536942109663079, + "grad_norm": 0.69921875, + "learning_rate": 0.0001998380583991654, + "loss": 5.144, + "step": 534 + }, + { + "epoch": 0.05547310915111886, + "grad_norm": 0.71484375, + "learning_rate": 0.00019983743987934198, + "loss": 5.1256, + "step": 535 + }, + { + "epoch": 0.05557679720560693, + "grad_norm": 0.546875, + "learning_rate": 0.00019983682018154234, + "loss": 5.1397, + "step": 536 + }, + { + "epoch": 0.055680485260095, + "grad_norm": 0.58984375, + "learning_rate": 0.00019983619930577374, + "loss": 5.1786, + "step": 537 + }, + { + "epoch": 0.055784173314583074, + "grad_norm": 0.478515625, + "learning_rate": 0.00019983557725204352, + "loss": 5.1161, + "step": 538 + }, + { + "epoch": 0.05588786136907115, + "grad_norm": 0.54296875, + "learning_rate": 0.00019983495402035902, + "loss": 5.1185, + "step": 539 + }, + { + "epoch": 0.055991549423559224, + "grad_norm": 0.546875, + "learning_rate": 0.0001998343296107276, + "loss": 5.1027, + "step": 540 + }, + { + "epoch": 0.056095237478047295, + "grad_norm": 0.50390625, + "learning_rate": 0.0001998337040231566, + "loss": 5.1329, + "step": 541 + }, + { + "epoch": 0.05619892553253537, + "grad_norm": 0.49609375, + "learning_rate": 0.00019983307725765346, + "loss": 5.15, + "step": 542 + }, + { + "epoch": 0.05630261358702344, + "grad_norm": 0.625, + "learning_rate": 0.0001998324493142255, + "loss": 5.1271, + "step": 543 + }, + { + "epoch": 0.05640630164151151, + "grad_norm": 0.703125, + "learning_rate": 0.00019983182019288017, + "loss": 5.1476, + "step": 544 + }, + { + "epoch": 0.05650998969599959, + "grad_norm": 0.734375, + "learning_rate": 0.0001998311898936249, + "loss": 5.0857, + "step": 545 + }, + { + "epoch": 0.05661367775048766, + "grad_norm": 0.578125, + "learning_rate": 0.0001998305584164671, + "loss": 5.1089, + "step": 546 + }, + { + "epoch": 0.05671736580497573, + "grad_norm": 0.482421875, + "learning_rate": 0.00019982992576141425, + "loss": 5.1258, + "step": 547 + }, + { + "epoch": 0.0568210538594638, + "grad_norm": 0.59765625, + "learning_rate": 0.0001998292919284738, + "loss": 5.1405, + "step": 548 + }, + { + "epoch": 0.05692474191395187, + "grad_norm": 0.59765625, + "learning_rate": 0.00019982865691765323, + "loss": 5.1548, + "step": 549 + }, + { + "epoch": 0.05702842996843995, + "grad_norm": 0.71875, + "learning_rate": 0.00019982802072896004, + "loss": 5.0671, + "step": 550 + }, + { + "epoch": 0.05713211802292802, + "grad_norm": 0.66796875, + "learning_rate": 0.00019982738336240172, + "loss": 5.1215, + "step": 551 + }, + { + "epoch": 0.057235806077416095, + "grad_norm": 0.55859375, + "learning_rate": 0.0001998267448179858, + "loss": 5.0941, + "step": 552 + }, + { + "epoch": 0.057339494131904166, + "grad_norm": 0.6015625, + "learning_rate": 0.00019982610509571979, + "loss": 5.1049, + "step": 553 + }, + { + "epoch": 0.05744318218639224, + "grad_norm": 0.58203125, + "learning_rate": 0.0001998254641956113, + "loss": 5.1052, + "step": 554 + }, + { + "epoch": 0.05754687024088031, + "grad_norm": 0.67578125, + "learning_rate": 0.0001998248221176678, + "loss": 5.0879, + "step": 555 + }, + { + "epoch": 0.05765055829536839, + "grad_norm": 0.6796875, + "learning_rate": 0.00019982417886189698, + "loss": 5.1172, + "step": 556 + }, + { + "epoch": 0.05775424634985646, + "grad_norm": 0.5859375, + "learning_rate": 0.00019982353442830634, + "loss": 5.0876, + "step": 557 + }, + { + "epoch": 0.05785793440434453, + "grad_norm": 0.515625, + "learning_rate": 0.00019982288881690349, + "loss": 5.1051, + "step": 558 + }, + { + "epoch": 0.0579616224588326, + "grad_norm": 0.59765625, + "learning_rate": 0.00019982224202769611, + "loss": 5.0873, + "step": 559 + }, + { + "epoch": 0.05806531051332067, + "grad_norm": 0.66015625, + "learning_rate": 0.00019982159406069176, + "loss": 5.0969, + "step": 560 + }, + { + "epoch": 0.058168998567808744, + "grad_norm": 0.6328125, + "learning_rate": 0.00019982094491589813, + "loss": 5.1099, + "step": 561 + }, + { + "epoch": 0.05827268662229682, + "grad_norm": 0.6328125, + "learning_rate": 0.00019982029459332287, + "loss": 5.0659, + "step": 562 + }, + { + "epoch": 0.058376374676784894, + "grad_norm": 0.5625, + "learning_rate": 0.00019981964309297363, + "loss": 5.0939, + "step": 563 + }, + { + "epoch": 0.058480062731272965, + "grad_norm": 0.5859375, + "learning_rate": 0.00019981899041485813, + "loss": 5.1099, + "step": 564 + }, + { + "epoch": 0.058583750785761037, + "grad_norm": 0.6484375, + "learning_rate": 0.00019981833655898404, + "loss": 5.1289, + "step": 565 + }, + { + "epoch": 0.05868743884024911, + "grad_norm": 0.56640625, + "learning_rate": 0.00019981768152535913, + "loss": 5.0745, + "step": 566 + }, + { + "epoch": 0.058791126894737186, + "grad_norm": 0.59765625, + "learning_rate": 0.00019981702531399106, + "loss": 5.1127, + "step": 567 + }, + { + "epoch": 0.05889481494922526, + "grad_norm": 0.671875, + "learning_rate": 0.0001998163679248876, + "loss": 5.0662, + "step": 568 + }, + { + "epoch": 0.05899850300371333, + "grad_norm": 0.73046875, + "learning_rate": 0.0001998157093580565, + "loss": 5.109, + "step": 569 + }, + { + "epoch": 0.0591021910582014, + "grad_norm": 0.95703125, + "learning_rate": 0.00019981504961350558, + "loss": 5.0697, + "step": 570 + }, + { + "epoch": 0.05920587911268947, + "grad_norm": 1.1484375, + "learning_rate": 0.00019981438869124256, + "loss": 5.0217, + "step": 571 + }, + { + "epoch": 0.05930956716717754, + "grad_norm": 0.765625, + "learning_rate": 0.00019981372659127523, + "loss": 5.0847, + "step": 572 + }, + { + "epoch": 0.05941325522166562, + "grad_norm": 0.5078125, + "learning_rate": 0.00019981306331361148, + "loss": 5.0614, + "step": 573 + }, + { + "epoch": 0.05951694327615369, + "grad_norm": 0.74609375, + "learning_rate": 0.00019981239885825906, + "loss": 5.0795, + "step": 574 + }, + { + "epoch": 0.059620631330641764, + "grad_norm": 0.7734375, + "learning_rate": 0.00019981173322522586, + "loss": 5.0716, + "step": 575 + }, + { + "epoch": 0.059724319385129836, + "grad_norm": 0.62109375, + "learning_rate": 0.00019981106641451973, + "loss": 5.0803, + "step": 576 + }, + { + "epoch": 0.05982800743961791, + "grad_norm": 0.59765625, + "learning_rate": 0.0001998103984261485, + "loss": 5.0795, + "step": 577 + }, + { + "epoch": 0.059931695494105985, + "grad_norm": 0.62109375, + "learning_rate": 0.00019980972926012005, + "loss": 5.0748, + "step": 578 + }, + { + "epoch": 0.06003538354859406, + "grad_norm": 0.6328125, + "learning_rate": 0.0001998090589164423, + "loss": 5.0885, + "step": 579 + }, + { + "epoch": 0.06013907160308213, + "grad_norm": 0.671875, + "learning_rate": 0.0001998083873951232, + "loss": 5.0466, + "step": 580 + }, + { + "epoch": 0.0602427596575702, + "grad_norm": 0.65625, + "learning_rate": 0.00019980771469617058, + "loss": 5.0745, + "step": 581 + }, + { + "epoch": 0.06034644771205827, + "grad_norm": 0.625, + "learning_rate": 0.00019980704081959248, + "loss": 5.0909, + "step": 582 + }, + { + "epoch": 0.06045013576654634, + "grad_norm": 0.640625, + "learning_rate": 0.00019980636576539678, + "loss": 5.017, + "step": 583 + }, + { + "epoch": 0.06055382382103442, + "grad_norm": 0.63671875, + "learning_rate": 0.00019980568953359144, + "loss": 5.1101, + "step": 584 + }, + { + "epoch": 0.06065751187552249, + "grad_norm": 0.859375, + "learning_rate": 0.00019980501212418447, + "loss": 5.0365, + "step": 585 + }, + { + "epoch": 0.06076119993001056, + "grad_norm": 1.3515625, + "learning_rate": 0.0001998043335371839, + "loss": 5.07, + "step": 586 + }, + { + "epoch": 0.060864887984498635, + "grad_norm": 0.7734375, + "learning_rate": 0.00019980365377259763, + "loss": 5.0933, + "step": 587 + }, + { + "epoch": 0.060968576038986706, + "grad_norm": 0.921875, + "learning_rate": 0.00019980297283043379, + "loss": 5.1316, + "step": 588 + }, + { + "epoch": 0.061072264093474785, + "grad_norm": 1.3359375, + "learning_rate": 0.00019980229071070037, + "loss": 5.0325, + "step": 589 + }, + { + "epoch": 0.061175952147962856, + "grad_norm": 0.9375, + "learning_rate": 0.00019980160741340537, + "loss": 5.1035, + "step": 590 + }, + { + "epoch": 0.06127964020245093, + "grad_norm": 1.6328125, + "learning_rate": 0.0001998009229385569, + "loss": 5.0554, + "step": 591 + }, + { + "epoch": 0.061383328256939, + "grad_norm": 0.6484375, + "learning_rate": 0.00019980023728616305, + "loss": 5.0741, + "step": 592 + }, + { + "epoch": 0.06148701631142707, + "grad_norm": 2.171875, + "learning_rate": 0.0001997995504562319, + "loss": 5.083, + "step": 593 + }, + { + "epoch": 0.06159070436591514, + "grad_norm": 1.296875, + "learning_rate": 0.00019979886244877158, + "loss": 5.0724, + "step": 594 + }, + { + "epoch": 0.06169439242040322, + "grad_norm": 3.359375, + "learning_rate": 0.00019979817326379012, + "loss": 5.1534, + "step": 595 + }, + { + "epoch": 0.06179808047489129, + "grad_norm": 3.109375, + "learning_rate": 0.00019979748290129573, + "loss": 5.1388, + "step": 596 + }, + { + "epoch": 0.06190176852937936, + "grad_norm": 1.3984375, + "learning_rate": 0.00019979679136129653, + "loss": 5.0704, + "step": 597 + }, + { + "epoch": 0.062005456583867434, + "grad_norm": 2.40625, + "learning_rate": 0.00019979609864380067, + "loss": 5.1138, + "step": 598 + }, + { + "epoch": 0.062109144638355505, + "grad_norm": 2.265625, + "learning_rate": 0.00019979540474881634, + "loss": 5.1004, + "step": 599 + }, + { + "epoch": 0.06221283269284358, + "grad_norm": 1.2890625, + "learning_rate": 0.00019979470967635172, + "loss": 5.0521, + "step": 600 + }, + { + "epoch": 0.062316520747331655, + "grad_norm": 1.796875, + "learning_rate": 0.00019979401342641503, + "loss": 5.0985, + "step": 601 + }, + { + "epoch": 0.062420208801819727, + "grad_norm": 1.5, + "learning_rate": 0.00019979331599901445, + "loss": 5.0998, + "step": 602 + }, + { + "epoch": 0.0625238968563078, + "grad_norm": 1.078125, + "learning_rate": 0.00019979261739415825, + "loss": 5.1008, + "step": 603 + }, + { + "epoch": 0.06262758491079587, + "grad_norm": 2.265625, + "learning_rate": 0.00019979191761185466, + "loss": 5.088, + "step": 604 + }, + { + "epoch": 0.06273127296528394, + "grad_norm": 1.5625, + "learning_rate": 0.00019979121665211186, + "loss": 5.1075, + "step": 605 + }, + { + "epoch": 0.06283496101977201, + "grad_norm": 2.25, + "learning_rate": 0.00019979051451493826, + "loss": 5.1131, + "step": 606 + }, + { + "epoch": 0.06293864907426008, + "grad_norm": 1.375, + "learning_rate": 0.00019978981120034203, + "loss": 5.0863, + "step": 607 + }, + { + "epoch": 0.06304233712874815, + "grad_norm": 2.875, + "learning_rate": 0.0001997891067083315, + "loss": 5.126, + "step": 608 + }, + { + "epoch": 0.06314602518323624, + "grad_norm": 2.453125, + "learning_rate": 0.00019978840103891505, + "loss": 5.1418, + "step": 609 + }, + { + "epoch": 0.06324971323772431, + "grad_norm": 1.484375, + "learning_rate": 0.0001997876941921009, + "loss": 5.0454, + "step": 610 + }, + { + "epoch": 0.06335340129221238, + "grad_norm": 1.421875, + "learning_rate": 0.00019978698616789745, + "loss": 5.0991, + "step": 611 + }, + { + "epoch": 0.06345708934670045, + "grad_norm": 1.4296875, + "learning_rate": 0.00019978627696631306, + "loss": 5.0266, + "step": 612 + }, + { + "epoch": 0.06356077740118853, + "grad_norm": 1.3203125, + "learning_rate": 0.00019978556658735606, + "loss": 5.096, + "step": 613 + }, + { + "epoch": 0.0636644654556766, + "grad_norm": 1.140625, + "learning_rate": 0.00019978485503103485, + "loss": 5.0584, + "step": 614 + }, + { + "epoch": 0.06376815351016467, + "grad_norm": 1.3125, + "learning_rate": 0.0001997841422973578, + "loss": 5.092, + "step": 615 + }, + { + "epoch": 0.06387184156465274, + "grad_norm": 1.1484375, + "learning_rate": 0.00019978342838633344, + "loss": 5.1038, + "step": 616 + }, + { + "epoch": 0.06397552961914081, + "grad_norm": 1.1015625, + "learning_rate": 0.00019978271329797003, + "loss": 5.0346, + "step": 617 + }, + { + "epoch": 0.06407921767362888, + "grad_norm": 0.9921875, + "learning_rate": 0.00019978199703227608, + "loss": 5.0628, + "step": 618 + }, + { + "epoch": 0.06418290572811695, + "grad_norm": 0.96484375, + "learning_rate": 0.00019978127958926006, + "loss": 5.1023, + "step": 619 + }, + { + "epoch": 0.06428659378260504, + "grad_norm": 0.89453125, + "learning_rate": 0.00019978056096893042, + "loss": 5.0591, + "step": 620 + }, + { + "epoch": 0.06439028183709311, + "grad_norm": 0.98046875, + "learning_rate": 0.0001997798411712956, + "loss": 5.0737, + "step": 621 + }, + { + "epoch": 0.06449396989158118, + "grad_norm": 0.84765625, + "learning_rate": 0.00019977912019636415, + "loss": 5.0839, + "step": 622 + }, + { + "epoch": 0.06459765794606925, + "grad_norm": 0.7421875, + "learning_rate": 0.00019977839804414456, + "loss": 5.0516, + "step": 623 + }, + { + "epoch": 0.06470134600055732, + "grad_norm": 0.76171875, + "learning_rate": 0.00019977767471464531, + "loss": 5.0687, + "step": 624 + }, + { + "epoch": 0.0648050340550454, + "grad_norm": 0.671875, + "learning_rate": 0.00019977695020787498, + "loss": 5.0761, + "step": 625 + }, + { + "epoch": 0.06490872210953347, + "grad_norm": 0.76171875, + "learning_rate": 0.00019977622452384212, + "loss": 5.0706, + "step": 626 + }, + { + "epoch": 0.06501241016402154, + "grad_norm": 0.609375, + "learning_rate": 0.00019977549766255528, + "loss": 5.0099, + "step": 627 + }, + { + "epoch": 0.06511609821850961, + "grad_norm": 0.66015625, + "learning_rate": 0.00019977476962402304, + "loss": 5.0794, + "step": 628 + }, + { + "epoch": 0.06521978627299768, + "grad_norm": 0.69140625, + "learning_rate": 0.00019977404040825395, + "loss": 5.0676, + "step": 629 + }, + { + "epoch": 0.06532347432748575, + "grad_norm": 0.7265625, + "learning_rate": 0.0001997733100152567, + "loss": 5.0754, + "step": 630 + }, + { + "epoch": 0.06542716238197384, + "grad_norm": 0.66015625, + "learning_rate": 0.0001997725784450398, + "loss": 5.0106, + "step": 631 + }, + { + "epoch": 0.06553085043646191, + "grad_norm": 0.474609375, + "learning_rate": 0.000199771845697612, + "loss": 5.0163, + "step": 632 + }, + { + "epoch": 0.06563453849094998, + "grad_norm": 0.64453125, + "learning_rate": 0.00019977111177298183, + "loss": 5.0573, + "step": 633 + }, + { + "epoch": 0.06573822654543805, + "grad_norm": 0.53125, + "learning_rate": 0.00019977037667115802, + "loss": 5.0315, + "step": 634 + }, + { + "epoch": 0.06584191459992612, + "grad_norm": 0.5078125, + "learning_rate": 0.00019976964039214923, + "loss": 5.0168, + "step": 635 + }, + { + "epoch": 0.0659456026544142, + "grad_norm": 0.5078125, + "learning_rate": 0.00019976890293596416, + "loss": 5.0454, + "step": 636 + }, + { + "epoch": 0.06604929070890227, + "grad_norm": 0.5546875, + "learning_rate": 0.00019976816430261146, + "loss": 5.0578, + "step": 637 + }, + { + "epoch": 0.06615297876339034, + "grad_norm": 0.498046875, + "learning_rate": 0.00019976742449209992, + "loss": 5.0527, + "step": 638 + }, + { + "epoch": 0.06625666681787841, + "grad_norm": 0.46484375, + "learning_rate": 0.0001997666835044382, + "loss": 5.083, + "step": 639 + }, + { + "epoch": 0.06636035487236648, + "grad_norm": 0.6171875, + "learning_rate": 0.00019976594133963512, + "loss": 5.0622, + "step": 640 + }, + { + "epoch": 0.06646404292685455, + "grad_norm": 0.46484375, + "learning_rate": 0.00019976519799769931, + "loss": 5.0684, + "step": 641 + }, + { + "epoch": 0.06656773098134262, + "grad_norm": 0.5703125, + "learning_rate": 0.00019976445347863968, + "loss": 5.0087, + "step": 642 + }, + { + "epoch": 0.06667141903583071, + "grad_norm": 0.546875, + "learning_rate": 0.00019976370778246495, + "loss": 5.0554, + "step": 643 + }, + { + "epoch": 0.06677510709031878, + "grad_norm": 0.44140625, + "learning_rate": 0.0001997629609091839, + "loss": 5.0249, + "step": 644 + }, + { + "epoch": 0.06687879514480685, + "grad_norm": 0.470703125, + "learning_rate": 0.0001997622128588054, + "loss": 5.0804, + "step": 645 + }, + { + "epoch": 0.06698248319929492, + "grad_norm": 0.47265625, + "learning_rate": 0.0001997614636313382, + "loss": 5.0207, + "step": 646 + }, + { + "epoch": 0.067086171253783, + "grad_norm": 0.52734375, + "learning_rate": 0.0001997607132267912, + "loss": 5.0279, + "step": 647 + }, + { + "epoch": 0.06718985930827107, + "grad_norm": 0.4453125, + "learning_rate": 0.00019975996164517325, + "loss": 5.062, + "step": 648 + }, + { + "epoch": 0.06729354736275914, + "grad_norm": 0.50390625, + "learning_rate": 0.00019975920888649318, + "loss": 4.9891, + "step": 649 + }, + { + "epoch": 0.06739723541724721, + "grad_norm": 0.443359375, + "learning_rate": 0.00019975845495075992, + "loss": 5.0609, + "step": 650 + }, + { + "epoch": 0.06750092347173528, + "grad_norm": 0.5078125, + "learning_rate": 0.0001997576998379823, + "loss": 5.0621, + "step": 651 + }, + { + "epoch": 0.06760461152622335, + "grad_norm": 0.419921875, + "learning_rate": 0.0001997569435481693, + "loss": 4.9655, + "step": 652 + }, + { + "epoch": 0.06770829958071142, + "grad_norm": 0.466796875, + "learning_rate": 0.00019975618608132983, + "loss": 5.0336, + "step": 653 + }, + { + "epoch": 0.06781198763519951, + "grad_norm": 0.5, + "learning_rate": 0.0001997554274374728, + "loss": 5.0162, + "step": 654 + }, + { + "epoch": 0.06791567568968758, + "grad_norm": 0.490234375, + "learning_rate": 0.00019975466761660714, + "loss": 4.9897, + "step": 655 + }, + { + "epoch": 0.06801936374417565, + "grad_norm": 0.6484375, + "learning_rate": 0.00019975390661874188, + "loss": 4.9694, + "step": 656 + }, + { + "epoch": 0.06812305179866372, + "grad_norm": 0.6640625, + "learning_rate": 0.00019975314444388597, + "loss": 5.0425, + "step": 657 + }, + { + "epoch": 0.0682267398531518, + "grad_norm": 0.5234375, + "learning_rate": 0.00019975238109204836, + "loss": 5.0513, + "step": 658 + }, + { + "epoch": 0.06833042790763987, + "grad_norm": 0.50390625, + "learning_rate": 0.00019975161656323812, + "loss": 5.0141, + "step": 659 + }, + { + "epoch": 0.06843411596212794, + "grad_norm": 0.5625, + "learning_rate": 0.00019975085085746427, + "loss": 5.0405, + "step": 660 + }, + { + "epoch": 0.06853780401661601, + "grad_norm": 0.51171875, + "learning_rate": 0.00019975008397473578, + "loss": 5.0309, + "step": 661 + }, + { + "epoch": 0.06864149207110408, + "grad_norm": 0.44921875, + "learning_rate": 0.00019974931591506176, + "loss": 5.0045, + "step": 662 + }, + { + "epoch": 0.06874518012559215, + "grad_norm": 0.58203125, + "learning_rate": 0.00019974854667845126, + "loss": 5.0045, + "step": 663 + }, + { + "epoch": 0.06884886818008022, + "grad_norm": 0.55859375, + "learning_rate": 0.00019974777626491334, + "loss": 5.0137, + "step": 664 + }, + { + "epoch": 0.06895255623456831, + "grad_norm": 0.439453125, + "learning_rate": 0.0001997470046744571, + "loss": 5.0363, + "step": 665 + }, + { + "epoch": 0.06905624428905638, + "grad_norm": 0.55078125, + "learning_rate": 0.00019974623190709164, + "loss": 5.052, + "step": 666 + }, + { + "epoch": 0.06915993234354445, + "grad_norm": 0.6015625, + "learning_rate": 0.00019974545796282606, + "loss": 5.0489, + "step": 667 + }, + { + "epoch": 0.06926362039803252, + "grad_norm": 0.51171875, + "learning_rate": 0.00019974468284166954, + "loss": 5.0173, + "step": 668 + }, + { + "epoch": 0.06936730845252059, + "grad_norm": 0.5234375, + "learning_rate": 0.0001997439065436312, + "loss": 4.9908, + "step": 669 + }, + { + "epoch": 0.06947099650700866, + "grad_norm": 0.68359375, + "learning_rate": 0.00019974312906872018, + "loss": 5.024, + "step": 670 + }, + { + "epoch": 0.06957468456149674, + "grad_norm": 0.66796875, + "learning_rate": 0.00019974235041694566, + "loss": 5.0214, + "step": 671 + }, + { + "epoch": 0.06967837261598481, + "grad_norm": 0.5625, + "learning_rate": 0.00019974157058831685, + "loss": 5.0328, + "step": 672 + }, + { + "epoch": 0.06978206067047288, + "grad_norm": 0.51171875, + "learning_rate": 0.00019974078958284294, + "loss": 4.9868, + "step": 673 + }, + { + "epoch": 0.06988574872496095, + "grad_norm": 0.67578125, + "learning_rate": 0.00019974000740053316, + "loss": 4.9927, + "step": 674 + }, + { + "epoch": 0.06998943677944902, + "grad_norm": 0.6328125, + "learning_rate": 0.0001997392240413967, + "loss": 5.0442, + "step": 675 + }, + { + "epoch": 0.0700931248339371, + "grad_norm": 0.53515625, + "learning_rate": 0.0001997384395054428, + "loss": 5.0179, + "step": 676 + }, + { + "epoch": 0.07019681288842518, + "grad_norm": 0.58203125, + "learning_rate": 0.00019973765379268082, + "loss": 5.0022, + "step": 677 + }, + { + "epoch": 0.07030050094291325, + "grad_norm": 0.7109375, + "learning_rate": 0.00019973686690311987, + "loss": 5.0517, + "step": 678 + }, + { + "epoch": 0.07040418899740132, + "grad_norm": 0.458984375, + "learning_rate": 0.00019973607883676936, + "loss": 4.9883, + "step": 679 + }, + { + "epoch": 0.07050787705188939, + "grad_norm": 0.546875, + "learning_rate": 0.00019973528959363855, + "loss": 4.9936, + "step": 680 + }, + { + "epoch": 0.07061156510637746, + "grad_norm": 0.62890625, + "learning_rate": 0.00019973449917373674, + "loss": 5.0083, + "step": 681 + }, + { + "epoch": 0.07071525316086553, + "grad_norm": 0.53515625, + "learning_rate": 0.00019973370757707325, + "loss": 5.0072, + "step": 682 + }, + { + "epoch": 0.0708189412153536, + "grad_norm": 0.546875, + "learning_rate": 0.00019973291480365743, + "loss": 5.0331, + "step": 683 + }, + { + "epoch": 0.07092262926984168, + "grad_norm": 0.60546875, + "learning_rate": 0.00019973212085349867, + "loss": 4.9926, + "step": 684 + }, + { + "epoch": 0.07102631732432975, + "grad_norm": 0.67578125, + "learning_rate": 0.00019973132572660628, + "loss": 4.9922, + "step": 685 + }, + { + "epoch": 0.07113000537881782, + "grad_norm": 0.6015625, + "learning_rate": 0.00019973052942298967, + "loss": 5.0278, + "step": 686 + }, + { + "epoch": 0.0712336934333059, + "grad_norm": 0.6171875, + "learning_rate": 0.00019972973194265823, + "loss": 5.0312, + "step": 687 + }, + { + "epoch": 0.07133738148779398, + "grad_norm": 0.6796875, + "learning_rate": 0.00019972893328562137, + "loss": 4.9927, + "step": 688 + }, + { + "epoch": 0.07144106954228205, + "grad_norm": 0.48046875, + "learning_rate": 0.00019972813345188852, + "loss": 5.0116, + "step": 689 + }, + { + "epoch": 0.07154475759677012, + "grad_norm": 0.494140625, + "learning_rate": 0.00019972733244146912, + "loss": 4.9578, + "step": 690 + }, + { + "epoch": 0.07164844565125819, + "grad_norm": 0.58984375, + "learning_rate": 0.00019972653025437261, + "loss": 4.9803, + "step": 691 + }, + { + "epoch": 0.07175213370574626, + "grad_norm": 0.58984375, + "learning_rate": 0.00019972572689060846, + "loss": 4.9453, + "step": 692 + }, + { + "epoch": 0.07185582176023433, + "grad_norm": 0.474609375, + "learning_rate": 0.00019972492235018616, + "loss": 4.9737, + "step": 693 + }, + { + "epoch": 0.0719595098147224, + "grad_norm": 0.58203125, + "learning_rate": 0.00019972411663311517, + "loss": 5.0081, + "step": 694 + }, + { + "epoch": 0.07206319786921048, + "grad_norm": 0.7421875, + "learning_rate": 0.00019972330973940503, + "loss": 4.9832, + "step": 695 + }, + { + "epoch": 0.07216688592369855, + "grad_norm": 0.6875, + "learning_rate": 0.00019972250166906523, + "loss": 5.0042, + "step": 696 + }, + { + "epoch": 0.07227057397818662, + "grad_norm": 0.640625, + "learning_rate": 0.0001997216924221053, + "loss": 5.0134, + "step": 697 + }, + { + "epoch": 0.0723742620326747, + "grad_norm": 0.6015625, + "learning_rate": 0.00019972088199853488, + "loss": 4.9701, + "step": 698 + }, + { + "epoch": 0.07247795008716278, + "grad_norm": 0.58984375, + "learning_rate": 0.0001997200703983634, + "loss": 5.0043, + "step": 699 + }, + { + "epoch": 0.07258163814165085, + "grad_norm": 0.5234375, + "learning_rate": 0.00019971925762160054, + "loss": 5.0103, + "step": 700 + }, + { + "epoch": 0.07268532619613892, + "grad_norm": 0.55078125, + "learning_rate": 0.0001997184436682558, + "loss": 4.9916, + "step": 701 + }, + { + "epoch": 0.07278901425062699, + "grad_norm": 0.59375, + "learning_rate": 0.00019971762853833886, + "loss": 4.9837, + "step": 702 + }, + { + "epoch": 0.07289270230511506, + "grad_norm": 0.625, + "learning_rate": 0.0001997168122318593, + "loss": 4.9554, + "step": 703 + }, + { + "epoch": 0.07299639035960313, + "grad_norm": 0.609375, + "learning_rate": 0.0001997159947488268, + "loss": 5.0302, + "step": 704 + }, + { + "epoch": 0.0731000784140912, + "grad_norm": 0.58984375, + "learning_rate": 0.00019971517608925092, + "loss": 5.0219, + "step": 705 + }, + { + "epoch": 0.07320376646857928, + "grad_norm": 0.63671875, + "learning_rate": 0.00019971435625314139, + "loss": 4.9801, + "step": 706 + }, + { + "epoch": 0.07330745452306735, + "grad_norm": 0.5859375, + "learning_rate": 0.00019971353524050783, + "loss": 5.0094, + "step": 707 + }, + { + "epoch": 0.07341114257755542, + "grad_norm": 0.6796875, + "learning_rate": 0.00019971271305135998, + "loss": 4.9897, + "step": 708 + }, + { + "epoch": 0.07351483063204349, + "grad_norm": 0.8828125, + "learning_rate": 0.00019971188968570752, + "loss": 4.9852, + "step": 709 + }, + { + "epoch": 0.07361851868653158, + "grad_norm": 1.2265625, + "learning_rate": 0.00019971106514356018, + "loss": 5.0171, + "step": 710 + }, + { + "epoch": 0.07372220674101965, + "grad_norm": 1.0703125, + "learning_rate": 0.00019971023942492763, + "loss": 5.022, + "step": 711 + }, + { + "epoch": 0.07382589479550772, + "grad_norm": 0.8828125, + "learning_rate": 0.00019970941252981964, + "loss": 4.9833, + "step": 712 + }, + { + "epoch": 0.07392958284999579, + "grad_norm": 0.6171875, + "learning_rate": 0.00019970858445824603, + "loss": 5.025, + "step": 713 + }, + { + "epoch": 0.07403327090448386, + "grad_norm": 0.59375, + "learning_rate": 0.0001997077552102165, + "loss": 4.9984, + "step": 714 + }, + { + "epoch": 0.07413695895897193, + "grad_norm": 0.89453125, + "learning_rate": 0.00019970692478574084, + "loss": 4.9837, + "step": 715 + }, + { + "epoch": 0.07424064701346, + "grad_norm": 0.98046875, + "learning_rate": 0.00019970609318482887, + "loss": 5.0337, + "step": 716 + }, + { + "epoch": 0.07434433506794808, + "grad_norm": 1.0859375, + "learning_rate": 0.0001997052604074904, + "loss": 5.0083, + "step": 717 + }, + { + "epoch": 0.07444802312243615, + "grad_norm": 0.9375, + "learning_rate": 0.00019970442645373526, + "loss": 4.99, + "step": 718 + }, + { + "epoch": 0.07455171117692422, + "grad_norm": 1.0078125, + "learning_rate": 0.00019970359132357327, + "loss": 4.9583, + "step": 719 + }, + { + "epoch": 0.07465539923141229, + "grad_norm": 1.1796875, + "learning_rate": 0.0001997027550170143, + "loss": 4.9729, + "step": 720 + }, + { + "epoch": 0.07475908728590037, + "grad_norm": 0.6875, + "learning_rate": 0.0001997019175340682, + "loss": 5.0005, + "step": 721 + }, + { + "epoch": 0.07486277534038845, + "grad_norm": 0.7578125, + "learning_rate": 0.00019970107887474486, + "loss": 4.9606, + "step": 722 + }, + { + "epoch": 0.07496646339487652, + "grad_norm": 1.1796875, + "learning_rate": 0.0001997002390390542, + "loss": 4.9606, + "step": 723 + }, + { + "epoch": 0.07507015144936459, + "grad_norm": 1.0703125, + "learning_rate": 0.00019969939802700606, + "loss": 4.9956, + "step": 724 + }, + { + "epoch": 0.07517383950385266, + "grad_norm": 0.875, + "learning_rate": 0.00019969855583861046, + "loss": 5.0123, + "step": 725 + }, + { + "epoch": 0.07527752755834073, + "grad_norm": 0.8671875, + "learning_rate": 0.00019969771247387724, + "loss": 5.0062, + "step": 726 + }, + { + "epoch": 0.0753812156128288, + "grad_norm": 1.0625, + "learning_rate": 0.00019969686793281643, + "loss": 4.9654, + "step": 727 + }, + { + "epoch": 0.07548490366731687, + "grad_norm": 1.0625, + "learning_rate": 0.00019969602221543798, + "loss": 5.0011, + "step": 728 + }, + { + "epoch": 0.07558859172180495, + "grad_norm": 0.765625, + "learning_rate": 0.00019969517532175183, + "loss": 4.9748, + "step": 729 + }, + { + "epoch": 0.07569227977629302, + "grad_norm": 0.8828125, + "learning_rate": 0.000199694327251768, + "loss": 4.9883, + "step": 730 + }, + { + "epoch": 0.07579596783078109, + "grad_norm": 1.140625, + "learning_rate": 0.00019969347800549646, + "loss": 4.9907, + "step": 731 + }, + { + "epoch": 0.07589965588526917, + "grad_norm": 0.8984375, + "learning_rate": 0.0001996926275829473, + "loss": 4.9906, + "step": 732 + }, + { + "epoch": 0.07600334393975725, + "grad_norm": 0.6875, + "learning_rate": 0.0001996917759841305, + "loss": 4.9916, + "step": 733 + }, + { + "epoch": 0.07610703199424532, + "grad_norm": 0.82421875, + "learning_rate": 0.0001996909232090561, + "loss": 4.9873, + "step": 734 + }, + { + "epoch": 0.07621072004873339, + "grad_norm": 0.83984375, + "learning_rate": 0.0001996900692577342, + "loss": 5.0099, + "step": 735 + }, + { + "epoch": 0.07631440810322146, + "grad_norm": 0.9140625, + "learning_rate": 0.00019968921413017487, + "loss": 5.0039, + "step": 736 + }, + { + "epoch": 0.07641809615770953, + "grad_norm": 0.75, + "learning_rate": 0.0001996883578263882, + "loss": 4.9653, + "step": 737 + }, + { + "epoch": 0.0765217842121976, + "grad_norm": 0.734375, + "learning_rate": 0.00019968750034638427, + "loss": 4.967, + "step": 738 + }, + { + "epoch": 0.07662547226668567, + "grad_norm": 0.7109375, + "learning_rate": 0.0001996866416901732, + "loss": 4.9534, + "step": 739 + }, + { + "epoch": 0.07672916032117374, + "grad_norm": 0.6953125, + "learning_rate": 0.00019968578185776515, + "loss": 4.9588, + "step": 740 + }, + { + "epoch": 0.07683284837566182, + "grad_norm": 0.828125, + "learning_rate": 0.0001996849208491702, + "loss": 4.9746, + "step": 741 + }, + { + "epoch": 0.07693653643014989, + "grad_norm": 0.859375, + "learning_rate": 0.0001996840586643986, + "loss": 4.9714, + "step": 742 + }, + { + "epoch": 0.07704022448463797, + "grad_norm": 0.7265625, + "learning_rate": 0.00019968319530346048, + "loss": 4.9626, + "step": 743 + }, + { + "epoch": 0.07714391253912604, + "grad_norm": 0.80078125, + "learning_rate": 0.000199682330766366, + "loss": 4.9415, + "step": 744 + }, + { + "epoch": 0.07724760059361412, + "grad_norm": 0.84375, + "learning_rate": 0.0001996814650531254, + "loss": 4.9414, + "step": 745 + }, + { + "epoch": 0.07735128864810219, + "grad_norm": 1.15625, + "learning_rate": 0.00019968059816374888, + "loss": 4.9692, + "step": 746 + }, + { + "epoch": 0.07745497670259026, + "grad_norm": 0.79296875, + "learning_rate": 0.00019967973009824664, + "loss": 4.9797, + "step": 747 + }, + { + "epoch": 0.07755866475707833, + "grad_norm": 0.734375, + "learning_rate": 0.000199678860856629, + "loss": 4.9844, + "step": 748 + }, + { + "epoch": 0.0776623528115664, + "grad_norm": 0.90625, + "learning_rate": 0.00019967799043890615, + "loss": 4.982, + "step": 749 + }, + { + "epoch": 0.07776604086605447, + "grad_norm": 0.82421875, + "learning_rate": 0.00019967711884508839, + "loss": 4.9981, + "step": 750 + }, + { + "epoch": 0.07786972892054254, + "grad_norm": 0.87109375, + "learning_rate": 0.00019967624607518595, + "loss": 4.9928, + "step": 751 + }, + { + "epoch": 0.07797341697503062, + "grad_norm": 1.078125, + "learning_rate": 0.0001996753721292092, + "loss": 4.9345, + "step": 752 + }, + { + "epoch": 0.07807710502951869, + "grad_norm": 0.765625, + "learning_rate": 0.0001996744970071684, + "loss": 4.9615, + "step": 753 + }, + { + "epoch": 0.07818079308400677, + "grad_norm": 0.84375, + "learning_rate": 0.0001996736207090739, + "loss": 5.0009, + "step": 754 + }, + { + "epoch": 0.07828448113849484, + "grad_norm": 0.85546875, + "learning_rate": 0.00019967274323493605, + "loss": 4.9854, + "step": 755 + }, + { + "epoch": 0.07838816919298291, + "grad_norm": 0.61328125, + "learning_rate": 0.0001996718645847652, + "loss": 4.9598, + "step": 756 + }, + { + "epoch": 0.07849185724747099, + "grad_norm": 0.7890625, + "learning_rate": 0.0001996709847585717, + "loss": 4.9534, + "step": 757 + }, + { + "epoch": 0.07859554530195906, + "grad_norm": 0.95703125, + "learning_rate": 0.0001996701037563659, + "loss": 4.9808, + "step": 758 + }, + { + "epoch": 0.07869923335644713, + "grad_norm": 1.0703125, + "learning_rate": 0.00019966922157815825, + "loss": 4.9569, + "step": 759 + }, + { + "epoch": 0.0788029214109352, + "grad_norm": 0.94140625, + "learning_rate": 0.00019966833822395916, + "loss": 4.9297, + "step": 760 + }, + { + "epoch": 0.07890660946542327, + "grad_norm": 0.91015625, + "learning_rate": 0.000199667453693779, + "loss": 4.9103, + "step": 761 + }, + { + "epoch": 0.07901029751991134, + "grad_norm": 0.7109375, + "learning_rate": 0.00019966656798762827, + "loss": 4.9669, + "step": 762 + }, + { + "epoch": 0.07911398557439941, + "grad_norm": 0.60546875, + "learning_rate": 0.00019966568110551736, + "loss": 4.9627, + "step": 763 + }, + { + "epoch": 0.07921767362888749, + "grad_norm": 0.60546875, + "learning_rate": 0.0001996647930474568, + "loss": 4.9544, + "step": 764 + }, + { + "epoch": 0.07932136168337557, + "grad_norm": 0.7734375, + "learning_rate": 0.000199663903813457, + "loss": 4.9443, + "step": 765 + }, + { + "epoch": 0.07942504973786364, + "grad_norm": 0.6796875, + "learning_rate": 0.00019966301340352852, + "loss": 4.9739, + "step": 766 + }, + { + "epoch": 0.07952873779235171, + "grad_norm": 0.578125, + "learning_rate": 0.00019966212181768178, + "loss": 4.9593, + "step": 767 + }, + { + "epoch": 0.07963242584683979, + "grad_norm": 0.77734375, + "learning_rate": 0.0001996612290559274, + "loss": 4.9728, + "step": 768 + }, + { + "epoch": 0.07973611390132786, + "grad_norm": 0.65234375, + "learning_rate": 0.00019966033511827584, + "loss": 4.968, + "step": 769 + }, + { + "epoch": 0.07983980195581593, + "grad_norm": 0.58203125, + "learning_rate": 0.00019965944000473768, + "loss": 4.9428, + "step": 770 + }, + { + "epoch": 0.079943490010304, + "grad_norm": 0.6796875, + "learning_rate": 0.00019965854371532346, + "loss": 4.9784, + "step": 771 + }, + { + "epoch": 0.08004717806479207, + "grad_norm": 0.62890625, + "learning_rate": 0.00019965764625004377, + "loss": 4.9521, + "step": 772 + }, + { + "epoch": 0.08015086611928014, + "grad_norm": 0.5234375, + "learning_rate": 0.0001996567476089092, + "loss": 4.9318, + "step": 773 + }, + { + "epoch": 0.08025455417376821, + "grad_norm": 0.55859375, + "learning_rate": 0.00019965584779193035, + "loss": 4.9945, + "step": 774 + }, + { + "epoch": 0.08035824222825629, + "grad_norm": 0.55859375, + "learning_rate": 0.00019965494679911782, + "loss": 4.9496, + "step": 775 + }, + { + "epoch": 0.08046193028274436, + "grad_norm": 0.427734375, + "learning_rate": 0.0001996540446304823, + "loss": 4.9248, + "step": 776 + }, + { + "epoch": 0.08056561833723244, + "grad_norm": 0.625, + "learning_rate": 0.00019965314128603435, + "loss": 4.9327, + "step": 777 + }, + { + "epoch": 0.08066930639172051, + "grad_norm": 0.5546875, + "learning_rate": 0.00019965223676578472, + "loss": 4.9228, + "step": 778 + }, + { + "epoch": 0.08077299444620858, + "grad_norm": 0.546875, + "learning_rate": 0.00019965133106974396, + "loss": 4.9292, + "step": 779 + }, + { + "epoch": 0.08087668250069666, + "grad_norm": 0.54296875, + "learning_rate": 0.00019965042419792288, + "loss": 4.944, + "step": 780 + }, + { + "epoch": 0.08098037055518473, + "grad_norm": 0.51171875, + "learning_rate": 0.00019964951615033215, + "loss": 4.9587, + "step": 781 + }, + { + "epoch": 0.0810840586096728, + "grad_norm": 0.6328125, + "learning_rate": 0.0001996486069269824, + "loss": 4.9611, + "step": 782 + }, + { + "epoch": 0.08118774666416087, + "grad_norm": 0.61328125, + "learning_rate": 0.00019964769652788448, + "loss": 4.9236, + "step": 783 + }, + { + "epoch": 0.08129143471864894, + "grad_norm": 0.63671875, + "learning_rate": 0.00019964678495304906, + "loss": 4.9575, + "step": 784 + }, + { + "epoch": 0.08139512277313701, + "grad_norm": 0.55859375, + "learning_rate": 0.00019964587220248686, + "loss": 4.9343, + "step": 785 + }, + { + "epoch": 0.08149881082762508, + "grad_norm": 0.5390625, + "learning_rate": 0.00019964495827620875, + "loss": 4.9315, + "step": 786 + }, + { + "epoch": 0.08160249888211316, + "grad_norm": 0.640625, + "learning_rate": 0.0001996440431742254, + "loss": 4.9526, + "step": 787 + }, + { + "epoch": 0.08170618693660124, + "grad_norm": 0.56640625, + "learning_rate": 0.00019964312689654777, + "loss": 4.9687, + "step": 788 + }, + { + "epoch": 0.08180987499108931, + "grad_norm": 0.5390625, + "learning_rate": 0.0001996422094431865, + "loss": 4.9541, + "step": 789 + }, + { + "epoch": 0.08191356304557738, + "grad_norm": 0.515625, + "learning_rate": 0.0001996412908141525, + "loss": 5.007, + "step": 790 + }, + { + "epoch": 0.08201725110006546, + "grad_norm": 0.65625, + "learning_rate": 0.0001996403710094566, + "loss": 4.8851, + "step": 791 + }, + { + "epoch": 0.08212093915455353, + "grad_norm": 0.6484375, + "learning_rate": 0.00019963945002910964, + "loss": 4.8957, + "step": 792 + }, + { + "epoch": 0.0822246272090416, + "grad_norm": 0.6015625, + "learning_rate": 0.0001996385278731225, + "loss": 4.9079, + "step": 793 + }, + { + "epoch": 0.08232831526352967, + "grad_norm": 0.546875, + "learning_rate": 0.00019963760454150603, + "loss": 4.9504, + "step": 794 + }, + { + "epoch": 0.08243200331801774, + "grad_norm": 0.6640625, + "learning_rate": 0.0001996366800342712, + "loss": 4.9559, + "step": 795 + }, + { + "epoch": 0.08253569137250581, + "grad_norm": 0.6328125, + "learning_rate": 0.0001996357543514288, + "loss": 4.9396, + "step": 796 + }, + { + "epoch": 0.08263937942699388, + "grad_norm": 0.51171875, + "learning_rate": 0.00019963482749298984, + "loss": 4.9287, + "step": 797 + }, + { + "epoch": 0.08274306748148196, + "grad_norm": 0.71484375, + "learning_rate": 0.00019963389945896527, + "loss": 4.922, + "step": 798 + }, + { + "epoch": 0.08284675553597004, + "grad_norm": 0.73046875, + "learning_rate": 0.00019963297024936595, + "loss": 4.9211, + "step": 799 + }, + { + "epoch": 0.08295044359045811, + "grad_norm": 0.87109375, + "learning_rate": 0.00019963203986420296, + "loss": 4.9202, + "step": 800 + }, + { + "epoch": 0.08305413164494618, + "grad_norm": 0.98828125, + "learning_rate": 0.00019963110830348714, + "loss": 4.9276, + "step": 801 + }, + { + "epoch": 0.08315781969943425, + "grad_norm": 1.2109375, + "learning_rate": 0.00019963017556722963, + "loss": 4.8891, + "step": 802 + }, + { + "epoch": 0.08326150775392233, + "grad_norm": 0.9375, + "learning_rate": 0.0001996292416554413, + "loss": 4.9741, + "step": 803 + }, + { + "epoch": 0.0833651958084104, + "grad_norm": 0.78515625, + "learning_rate": 0.0001996283065681333, + "loss": 4.9151, + "step": 804 + }, + { + "epoch": 0.08346888386289847, + "grad_norm": 0.6796875, + "learning_rate": 0.00019962737030531654, + "loss": 4.9354, + "step": 805 + }, + { + "epoch": 0.08357257191738654, + "grad_norm": 0.80859375, + "learning_rate": 0.00019962643286700215, + "loss": 4.9499, + "step": 806 + }, + { + "epoch": 0.08367625997187461, + "grad_norm": 0.79296875, + "learning_rate": 0.00019962549425320112, + "loss": 4.9544, + "step": 807 + }, + { + "epoch": 0.08377994802636268, + "grad_norm": 0.72265625, + "learning_rate": 0.00019962455446392461, + "loss": 4.9092, + "step": 808 + }, + { + "epoch": 0.08388363608085075, + "grad_norm": 0.74609375, + "learning_rate": 0.00019962361349918365, + "loss": 4.9512, + "step": 809 + }, + { + "epoch": 0.08398732413533884, + "grad_norm": 0.6171875, + "learning_rate": 0.00019962267135898936, + "loss": 4.9327, + "step": 810 + }, + { + "epoch": 0.08409101218982691, + "grad_norm": 0.7109375, + "learning_rate": 0.00019962172804335285, + "loss": 4.944, + "step": 811 + }, + { + "epoch": 0.08419470024431498, + "grad_norm": 0.796875, + "learning_rate": 0.00019962078355228525, + "loss": 4.8733, + "step": 812 + }, + { + "epoch": 0.08429838829880305, + "grad_norm": 0.765625, + "learning_rate": 0.0001996198378857977, + "loss": 4.901, + "step": 813 + }, + { + "epoch": 0.08440207635329113, + "grad_norm": 0.65625, + "learning_rate": 0.00019961889104390138, + "loss": 4.9424, + "step": 814 + }, + { + "epoch": 0.0845057644077792, + "grad_norm": 0.62890625, + "learning_rate": 0.00019961794302660746, + "loss": 4.9486, + "step": 815 + }, + { + "epoch": 0.08460945246226727, + "grad_norm": 0.73046875, + "learning_rate": 0.00019961699383392708, + "loss": 4.9393, + "step": 816 + }, + { + "epoch": 0.08471314051675534, + "grad_norm": 0.6953125, + "learning_rate": 0.0001996160434658715, + "loss": 4.9351, + "step": 817 + }, + { + "epoch": 0.08481682857124341, + "grad_norm": 0.62890625, + "learning_rate": 0.0001996150919224519, + "loss": 4.9562, + "step": 818 + }, + { + "epoch": 0.08492051662573148, + "grad_norm": 0.61328125, + "learning_rate": 0.00019961413920367948, + "loss": 4.9268, + "step": 819 + }, + { + "epoch": 0.08502420468021955, + "grad_norm": 0.73046875, + "learning_rate": 0.00019961318530956556, + "loss": 4.9037, + "step": 820 + }, + { + "epoch": 0.08512789273470764, + "grad_norm": 0.625, + "learning_rate": 0.00019961223024012132, + "loss": 4.9261, + "step": 821 + }, + { + "epoch": 0.08523158078919571, + "grad_norm": 0.640625, + "learning_rate": 0.0001996112739953581, + "loss": 4.8719, + "step": 822 + }, + { + "epoch": 0.08533526884368378, + "grad_norm": 0.7109375, + "learning_rate": 0.00019961031657528708, + "loss": 4.8848, + "step": 823 + }, + { + "epoch": 0.08543895689817185, + "grad_norm": 0.63671875, + "learning_rate": 0.00019960935797991967, + "loss": 4.9255, + "step": 824 + }, + { + "epoch": 0.08554264495265992, + "grad_norm": 0.65234375, + "learning_rate": 0.0001996083982092671, + "loss": 4.8943, + "step": 825 + }, + { + "epoch": 0.085646333007148, + "grad_norm": 0.65234375, + "learning_rate": 0.00019960743726334072, + "loss": 4.8836, + "step": 826 + }, + { + "epoch": 0.08575002106163607, + "grad_norm": 0.62109375, + "learning_rate": 0.0001996064751421519, + "loss": 4.9091, + "step": 827 + }, + { + "epoch": 0.08585370911612414, + "grad_norm": 0.5234375, + "learning_rate": 0.00019960551184571192, + "loss": 4.8901, + "step": 828 + }, + { + "epoch": 0.08595739717061221, + "grad_norm": 0.52734375, + "learning_rate": 0.00019960454737403223, + "loss": 4.9074, + "step": 829 + }, + { + "epoch": 0.08606108522510028, + "grad_norm": 0.59765625, + "learning_rate": 0.00019960358172712412, + "loss": 4.9147, + "step": 830 + }, + { + "epoch": 0.08616477327958835, + "grad_norm": 0.470703125, + "learning_rate": 0.00019960261490499907, + "loss": 4.8972, + "step": 831 + }, + { + "epoch": 0.08626846133407644, + "grad_norm": 0.5234375, + "learning_rate": 0.00019960164690766843, + "loss": 4.9044, + "step": 832 + }, + { + "epoch": 0.08637214938856451, + "grad_norm": 0.6171875, + "learning_rate": 0.00019960067773514364, + "loss": 4.9212, + "step": 833 + }, + { + "epoch": 0.08647583744305258, + "grad_norm": 0.5546875, + "learning_rate": 0.00019959970738743613, + "loss": 4.9064, + "step": 834 + }, + { + "epoch": 0.08657952549754065, + "grad_norm": 0.5859375, + "learning_rate": 0.00019959873586455738, + "loss": 4.8741, + "step": 835 + }, + { + "epoch": 0.08668321355202872, + "grad_norm": 0.7578125, + "learning_rate": 0.0001995977631665188, + "loss": 4.9145, + "step": 836 + }, + { + "epoch": 0.0867869016065168, + "grad_norm": 0.80859375, + "learning_rate": 0.0001995967892933319, + "loss": 4.9278, + "step": 837 + }, + { + "epoch": 0.08689058966100487, + "grad_norm": 0.74609375, + "learning_rate": 0.00019959581424500817, + "loss": 4.9188, + "step": 838 + }, + { + "epoch": 0.08699427771549294, + "grad_norm": 0.6953125, + "learning_rate": 0.00019959483802155912, + "loss": 4.9492, + "step": 839 + }, + { + "epoch": 0.08709796576998101, + "grad_norm": 0.74609375, + "learning_rate": 0.00019959386062299626, + "loss": 4.9078, + "step": 840 + }, + { + "epoch": 0.08720165382446908, + "grad_norm": 0.89453125, + "learning_rate": 0.0001995928820493311, + "loss": 4.898, + "step": 841 + }, + { + "epoch": 0.08730534187895715, + "grad_norm": 1.078125, + "learning_rate": 0.00019959190230057518, + "loss": 4.9058, + "step": 842 + }, + { + "epoch": 0.08740902993344522, + "grad_norm": 1.203125, + "learning_rate": 0.00019959092137674013, + "loss": 4.8931, + "step": 843 + }, + { + "epoch": 0.08751271798793331, + "grad_norm": 0.828125, + "learning_rate": 0.0001995899392778375, + "loss": 4.9151, + "step": 844 + }, + { + "epoch": 0.08761640604242138, + "grad_norm": 0.703125, + "learning_rate": 0.0001995889560038788, + "loss": 4.9288, + "step": 845 + }, + { + "epoch": 0.08772009409690945, + "grad_norm": 0.6875, + "learning_rate": 0.0001995879715548757, + "loss": 4.8856, + "step": 846 + }, + { + "epoch": 0.08782378215139752, + "grad_norm": 1.046875, + "learning_rate": 0.00019958698593083981, + "loss": 4.8822, + "step": 847 + }, + { + "epoch": 0.0879274702058856, + "grad_norm": 1.1640625, + "learning_rate": 0.00019958599913178277, + "loss": 4.9602, + "step": 848 + }, + { + "epoch": 0.08803115826037367, + "grad_norm": 0.859375, + "learning_rate": 0.00019958501115771622, + "loss": 4.9163, + "step": 849 + }, + { + "epoch": 0.08813484631486174, + "grad_norm": 0.83203125, + "learning_rate": 0.00019958402200865178, + "loss": 4.918, + "step": 850 + }, + { + "epoch": 0.08823853436934981, + "grad_norm": 0.77734375, + "learning_rate": 0.00019958303168460115, + "loss": 4.8722, + "step": 851 + }, + { + "epoch": 0.08834222242383788, + "grad_norm": 0.81640625, + "learning_rate": 0.000199582040185576, + "loss": 4.8877, + "step": 852 + }, + { + "epoch": 0.08844591047832595, + "grad_norm": 0.85546875, + "learning_rate": 0.00019958104751158806, + "loss": 4.905, + "step": 853 + }, + { + "epoch": 0.08854959853281402, + "grad_norm": 0.70703125, + "learning_rate": 0.00019958005366264901, + "loss": 4.8835, + "step": 854 + }, + { + "epoch": 0.08865328658730211, + "grad_norm": 0.6171875, + "learning_rate": 0.0001995790586387706, + "loss": 4.8851, + "step": 855 + }, + { + "epoch": 0.08875697464179018, + "grad_norm": 0.57421875, + "learning_rate": 0.00019957806243996453, + "loss": 4.8804, + "step": 856 + }, + { + "epoch": 0.08886066269627825, + "grad_norm": 0.6328125, + "learning_rate": 0.0001995770650662426, + "loss": 4.9273, + "step": 857 + }, + { + "epoch": 0.08896435075076632, + "grad_norm": 0.7421875, + "learning_rate": 0.00019957606651761656, + "loss": 4.872, + "step": 858 + }, + { + "epoch": 0.0890680388052544, + "grad_norm": 0.78125, + "learning_rate": 0.00019957506679409818, + "loss": 4.9288, + "step": 859 + }, + { + "epoch": 0.08917172685974246, + "grad_norm": 0.75390625, + "learning_rate": 0.00019957406589569927, + "loss": 4.8692, + "step": 860 + }, + { + "epoch": 0.08927541491423054, + "grad_norm": 0.76171875, + "learning_rate": 0.00019957306382243167, + "loss": 4.9116, + "step": 861 + }, + { + "epoch": 0.08937910296871861, + "grad_norm": 0.7578125, + "learning_rate": 0.00019957206057430712, + "loss": 4.8783, + "step": 862 + }, + { + "epoch": 0.08948279102320668, + "grad_norm": 0.8671875, + "learning_rate": 0.00019957105615133754, + "loss": 4.8542, + "step": 863 + }, + { + "epoch": 0.08958647907769475, + "grad_norm": 0.67578125, + "learning_rate": 0.00019957005055353474, + "loss": 4.8961, + "step": 864 + }, + { + "epoch": 0.08969016713218282, + "grad_norm": 0.50390625, + "learning_rate": 0.0001995690437809106, + "loss": 4.848, + "step": 865 + }, + { + "epoch": 0.0897938551866709, + "grad_norm": 0.765625, + "learning_rate": 0.00019956803583347696, + "loss": 4.8486, + "step": 866 + }, + { + "epoch": 0.08989754324115898, + "grad_norm": 0.80859375, + "learning_rate": 0.0001995670267112458, + "loss": 4.8902, + "step": 867 + }, + { + "epoch": 0.09000123129564705, + "grad_norm": 0.65234375, + "learning_rate": 0.00019956601641422892, + "loss": 4.8824, + "step": 868 + }, + { + "epoch": 0.09010491935013512, + "grad_norm": 0.6953125, + "learning_rate": 0.00019956500494243832, + "loss": 4.8438, + "step": 869 + }, + { + "epoch": 0.09020860740462319, + "grad_norm": 0.59765625, + "learning_rate": 0.00019956399229588588, + "loss": 4.9038, + "step": 870 + }, + { + "epoch": 0.09031229545911126, + "grad_norm": 0.65234375, + "learning_rate": 0.0001995629784745836, + "loss": 4.8895, + "step": 871 + }, + { + "epoch": 0.09041598351359934, + "grad_norm": 0.6953125, + "learning_rate": 0.0001995619634785434, + "loss": 4.9058, + "step": 872 + }, + { + "epoch": 0.0905196715680874, + "grad_norm": 0.59765625, + "learning_rate": 0.0001995609473077773, + "loss": 4.8509, + "step": 873 + }, + { + "epoch": 0.09062335962257548, + "grad_norm": 0.625, + "learning_rate": 0.00019955992996229728, + "loss": 4.8988, + "step": 874 + }, + { + "epoch": 0.09072704767706355, + "grad_norm": 0.65625, + "learning_rate": 0.00019955891144211524, + "loss": 4.9242, + "step": 875 + }, + { + "epoch": 0.09083073573155162, + "grad_norm": 0.57421875, + "learning_rate": 0.00019955789174724338, + "loss": 4.8703, + "step": 876 + }, + { + "epoch": 0.0909344237860397, + "grad_norm": 0.7890625, + "learning_rate": 0.00019955687087769357, + "loss": 4.8834, + "step": 877 + }, + { + "epoch": 0.09103811184052778, + "grad_norm": 0.92578125, + "learning_rate": 0.00019955584883347792, + "loss": 4.9105, + "step": 878 + }, + { + "epoch": 0.09114179989501585, + "grad_norm": 0.859375, + "learning_rate": 0.00019955482561460853, + "loss": 4.9056, + "step": 879 + }, + { + "epoch": 0.09124548794950392, + "grad_norm": 0.94921875, + "learning_rate": 0.00019955380122109738, + "loss": 4.8805, + "step": 880 + }, + { + "epoch": 0.09134917600399199, + "grad_norm": 0.95703125, + "learning_rate": 0.00019955277565295666, + "loss": 4.8183, + "step": 881 + }, + { + "epoch": 0.09145286405848006, + "grad_norm": 0.90234375, + "learning_rate": 0.00019955174891019838, + "loss": 4.8363, + "step": 882 + }, + { + "epoch": 0.09155655211296813, + "grad_norm": 1.0390625, + "learning_rate": 0.00019955072099283472, + "loss": 4.8429, + "step": 883 + }, + { + "epoch": 0.0916602401674562, + "grad_norm": 1.0703125, + "learning_rate": 0.00019954969190087777, + "loss": 4.8869, + "step": 884 + }, + { + "epoch": 0.09176392822194428, + "grad_norm": 0.875, + "learning_rate": 0.0001995486616343397, + "loss": 4.9302, + "step": 885 + }, + { + "epoch": 0.09186761627643235, + "grad_norm": 1.015625, + "learning_rate": 0.00019954763019323265, + "loss": 4.8797, + "step": 886 + }, + { + "epoch": 0.09197130433092042, + "grad_norm": 1.21875, + "learning_rate": 0.00019954659757756877, + "loss": 4.8558, + "step": 887 + }, + { + "epoch": 0.0920749923854085, + "grad_norm": 0.5859375, + "learning_rate": 0.00019954556378736028, + "loss": 4.9109, + "step": 888 + }, + { + "epoch": 0.09217868043989658, + "grad_norm": 0.8515625, + "learning_rate": 0.00019954452882261933, + "loss": 4.8751, + "step": 889 + }, + { + "epoch": 0.09228236849438465, + "grad_norm": 1.3125, + "learning_rate": 0.0001995434926833582, + "loss": 4.8356, + "step": 890 + }, + { + "epoch": 0.09238605654887272, + "grad_norm": 0.66796875, + "learning_rate": 0.00019954245536958908, + "loss": 4.8736, + "step": 891 + }, + { + "epoch": 0.09248974460336079, + "grad_norm": 0.96875, + "learning_rate": 0.00019954141688132419, + "loss": 4.922, + "step": 892 + }, + { + "epoch": 0.09259343265784886, + "grad_norm": 1.1875, + "learning_rate": 0.0001995403772185758, + "loss": 4.8539, + "step": 893 + }, + { + "epoch": 0.09269712071233693, + "grad_norm": 0.80859375, + "learning_rate": 0.00019953933638135616, + "loss": 4.9001, + "step": 894 + }, + { + "epoch": 0.092800808766825, + "grad_norm": 1.078125, + "learning_rate": 0.00019953829436967759, + "loss": 4.8676, + "step": 895 + }, + { + "epoch": 0.09290449682131308, + "grad_norm": 0.94140625, + "learning_rate": 0.00019953725118355235, + "loss": 4.8616, + "step": 896 + }, + { + "epoch": 0.09300818487580115, + "grad_norm": 0.84765625, + "learning_rate": 0.00019953620682299278, + "loss": 4.8614, + "step": 897 + }, + { + "epoch": 0.09311187293028922, + "grad_norm": 0.9921875, + "learning_rate": 0.0001995351612880112, + "loss": 4.8647, + "step": 898 + }, + { + "epoch": 0.0932155609847773, + "grad_norm": 1.0625, + "learning_rate": 0.0001995341145786199, + "loss": 4.8408, + "step": 899 + }, + { + "epoch": 0.09331924903926538, + "grad_norm": 1.09375, + "learning_rate": 0.00019953306669483127, + "loss": 4.9216, + "step": 900 + }, + { + "epoch": 0.09342293709375345, + "grad_norm": 0.859375, + "learning_rate": 0.00019953201763665766, + "loss": 4.8706, + "step": 901 + }, + { + "epoch": 0.09352662514824152, + "grad_norm": 0.96484375, + "learning_rate": 0.00019953096740411144, + "loss": 4.8723, + "step": 902 + }, + { + "epoch": 0.09363031320272959, + "grad_norm": 0.96484375, + "learning_rate": 0.00019952991599720503, + "loss": 4.8612, + "step": 903 + }, + { + "epoch": 0.09373400125721766, + "grad_norm": 0.8125, + "learning_rate": 0.0001995288634159508, + "loss": 4.9012, + "step": 904 + }, + { + "epoch": 0.09383768931170573, + "grad_norm": 1.125, + "learning_rate": 0.00019952780966036123, + "loss": 4.8422, + "step": 905 + }, + { + "epoch": 0.0939413773661938, + "grad_norm": 0.64453125, + "learning_rate": 0.00019952675473044868, + "loss": 4.8762, + "step": 906 + }, + { + "epoch": 0.09404506542068188, + "grad_norm": 1.0078125, + "learning_rate": 0.00019952569862622562, + "loss": 4.8799, + "step": 907 + }, + { + "epoch": 0.09414875347516995, + "grad_norm": 0.87890625, + "learning_rate": 0.00019952464134770454, + "loss": 4.8731, + "step": 908 + }, + { + "epoch": 0.09425244152965802, + "grad_norm": 0.9453125, + "learning_rate": 0.0001995235828948979, + "loss": 4.8628, + "step": 909 + }, + { + "epoch": 0.0943561295841461, + "grad_norm": 0.96484375, + "learning_rate": 0.00019952252326781815, + "loss": 4.8959, + "step": 910 + }, + { + "epoch": 0.09445981763863417, + "grad_norm": 0.9296875, + "learning_rate": 0.00019952146246647785, + "loss": 4.8658, + "step": 911 + }, + { + "epoch": 0.09456350569312225, + "grad_norm": 0.8828125, + "learning_rate": 0.0001995204004908895, + "loss": 4.8525, + "step": 912 + }, + { + "epoch": 0.09466719374761032, + "grad_norm": 0.68359375, + "learning_rate": 0.0001995193373410656, + "loss": 4.8731, + "step": 913 + }, + { + "epoch": 0.09477088180209839, + "grad_norm": 0.765625, + "learning_rate": 0.00019951827301701872, + "loss": 4.862, + "step": 914 + }, + { + "epoch": 0.09487456985658646, + "grad_norm": 0.77734375, + "learning_rate": 0.00019951720751876142, + "loss": 4.8848, + "step": 915 + }, + { + "epoch": 0.09497825791107453, + "grad_norm": 0.66796875, + "learning_rate": 0.00019951614084630625, + "loss": 4.8816, + "step": 916 + }, + { + "epoch": 0.0950819459655626, + "grad_norm": 0.7890625, + "learning_rate": 0.00019951507299966585, + "loss": 4.8559, + "step": 917 + }, + { + "epoch": 0.09518563402005067, + "grad_norm": 0.63671875, + "learning_rate": 0.00019951400397885273, + "loss": 4.8733, + "step": 918 + }, + { + "epoch": 0.09528932207453875, + "grad_norm": 0.8359375, + "learning_rate": 0.00019951293378387962, + "loss": 4.7999, + "step": 919 + }, + { + "epoch": 0.09539301012902682, + "grad_norm": 0.8203125, + "learning_rate": 0.000199511862414759, + "loss": 4.829, + "step": 920 + }, + { + "epoch": 0.09549669818351489, + "grad_norm": 0.62890625, + "learning_rate": 0.00019951078987150365, + "loss": 4.893, + "step": 921 + }, + { + "epoch": 0.09560038623800297, + "grad_norm": 0.78515625, + "learning_rate": 0.00019950971615412616, + "loss": 4.853, + "step": 922 + }, + { + "epoch": 0.09570407429249105, + "grad_norm": 0.7578125, + "learning_rate": 0.00019950864126263917, + "loss": 4.8412, + "step": 923 + }, + { + "epoch": 0.09580776234697912, + "grad_norm": 0.76171875, + "learning_rate": 0.00019950756519705544, + "loss": 4.8648, + "step": 924 + }, + { + "epoch": 0.09591145040146719, + "grad_norm": 0.6953125, + "learning_rate": 0.0001995064879573876, + "loss": 4.8454, + "step": 925 + }, + { + "epoch": 0.09601513845595526, + "grad_norm": 0.68359375, + "learning_rate": 0.0001995054095436484, + "loss": 4.8425, + "step": 926 + }, + { + "epoch": 0.09611882651044333, + "grad_norm": 0.6015625, + "learning_rate": 0.00019950432995585054, + "loss": 4.8581, + "step": 927 + }, + { + "epoch": 0.0962225145649314, + "grad_norm": 0.73046875, + "learning_rate": 0.00019950324919400676, + "loss": 4.8878, + "step": 928 + }, + { + "epoch": 0.09632620261941947, + "grad_norm": 0.79296875, + "learning_rate": 0.00019950216725812982, + "loss": 4.7899, + "step": 929 + }, + { + "epoch": 0.09642989067390755, + "grad_norm": 0.5625, + "learning_rate": 0.0001995010841482325, + "loss": 4.8928, + "step": 930 + }, + { + "epoch": 0.09653357872839562, + "grad_norm": 0.6328125, + "learning_rate": 0.00019949999986432757, + "loss": 4.9347, + "step": 931 + }, + { + "epoch": 0.09663726678288369, + "grad_norm": 0.67578125, + "learning_rate": 0.0001994989144064278, + "loss": 4.8777, + "step": 932 + }, + { + "epoch": 0.09674095483737177, + "grad_norm": 0.6328125, + "learning_rate": 0.00019949782777454602, + "loss": 4.9026, + "step": 933 + }, + { + "epoch": 0.09684464289185984, + "grad_norm": 0.71484375, + "learning_rate": 0.00019949673996869506, + "loss": 4.8867, + "step": 934 + }, + { + "epoch": 0.09694833094634792, + "grad_norm": 0.59375, + "learning_rate": 0.00019949565098888771, + "loss": 4.847, + "step": 935 + }, + { + "epoch": 0.09705201900083599, + "grad_norm": 0.60546875, + "learning_rate": 0.00019949456083513686, + "loss": 4.8488, + "step": 936 + }, + { + "epoch": 0.09715570705532406, + "grad_norm": 0.578125, + "learning_rate": 0.00019949346950745537, + "loss": 4.8131, + "step": 937 + }, + { + "epoch": 0.09725939510981213, + "grad_norm": 0.51171875, + "learning_rate": 0.0001994923770058561, + "loss": 4.8802, + "step": 938 + }, + { + "epoch": 0.0973630831643002, + "grad_norm": 0.5546875, + "learning_rate": 0.00019949128333035198, + "loss": 4.8309, + "step": 939 + }, + { + "epoch": 0.09746677121878827, + "grad_norm": 0.6640625, + "learning_rate": 0.00019949018848095586, + "loss": 4.8319, + "step": 940 + }, + { + "epoch": 0.09757045927327634, + "grad_norm": 0.6015625, + "learning_rate": 0.00019948909245768066, + "loss": 4.8677, + "step": 941 + }, + { + "epoch": 0.09767414732776442, + "grad_norm": 0.625, + "learning_rate": 0.00019948799526053938, + "loss": 4.8561, + "step": 942 + }, + { + "epoch": 0.09777783538225249, + "grad_norm": 0.7265625, + "learning_rate": 0.00019948689688954489, + "loss": 4.8883, + "step": 943 + }, + { + "epoch": 0.09788152343674057, + "grad_norm": 0.62890625, + "learning_rate": 0.00019948579734471017, + "loss": 4.826, + "step": 944 + }, + { + "epoch": 0.09798521149122864, + "grad_norm": 0.5625, + "learning_rate": 0.00019948469662604823, + "loss": 4.7702, + "step": 945 + }, + { + "epoch": 0.09808889954571672, + "grad_norm": 0.55859375, + "learning_rate": 0.00019948359473357202, + "loss": 4.8859, + "step": 946 + }, + { + "epoch": 0.09819258760020479, + "grad_norm": 0.70703125, + "learning_rate": 0.00019948249166729454, + "loss": 4.8573, + "step": 947 + }, + { + "epoch": 0.09829627565469286, + "grad_norm": 0.51171875, + "learning_rate": 0.0001994813874272288, + "loss": 4.8528, + "step": 948 + }, + { + "epoch": 0.09839996370918093, + "grad_norm": 0.6171875, + "learning_rate": 0.00019948028201338788, + "loss": 4.9022, + "step": 949 + }, + { + "epoch": 0.098503651763669, + "grad_norm": 0.578125, + "learning_rate": 0.00019947917542578478, + "loss": 4.8382, + "step": 950 + }, + { + "epoch": 0.09860733981815707, + "grad_norm": 0.640625, + "learning_rate": 0.00019947806766443255, + "loss": 4.8507, + "step": 951 + }, + { + "epoch": 0.09871102787264514, + "grad_norm": 0.65234375, + "learning_rate": 0.0001994769587293443, + "loss": 4.7939, + "step": 952 + }, + { + "epoch": 0.09881471592713321, + "grad_norm": 0.5546875, + "learning_rate": 0.00019947584862053307, + "loss": 4.8567, + "step": 953 + }, + { + "epoch": 0.09891840398162129, + "grad_norm": 0.640625, + "learning_rate": 0.00019947473733801196, + "loss": 4.8055, + "step": 954 + }, + { + "epoch": 0.09902209203610937, + "grad_norm": 0.76953125, + "learning_rate": 0.00019947362488179413, + "loss": 4.8541, + "step": 955 + }, + { + "epoch": 0.09912578009059744, + "grad_norm": 0.89453125, + "learning_rate": 0.00019947251125189264, + "loss": 4.8555, + "step": 956 + }, + { + "epoch": 0.09922946814508551, + "grad_norm": 0.9296875, + "learning_rate": 0.0001994713964483207, + "loss": 4.8593, + "step": 957 + }, + { + "epoch": 0.09933315619957359, + "grad_norm": 0.796875, + "learning_rate": 0.00019947028047109143, + "loss": 4.8419, + "step": 958 + }, + { + "epoch": 0.09943684425406166, + "grad_norm": 0.81640625, + "learning_rate": 0.00019946916332021797, + "loss": 4.8051, + "step": 959 + }, + { + "epoch": 0.09954053230854973, + "grad_norm": 0.98046875, + "learning_rate": 0.00019946804499571354, + "loss": 4.8403, + "step": 960 + }, + { + "epoch": 0.0996442203630378, + "grad_norm": 1.15625, + "learning_rate": 0.00019946692549759133, + "loss": 4.8334, + "step": 961 + }, + { + "epoch": 0.09974790841752587, + "grad_norm": 0.96484375, + "learning_rate": 0.00019946580482586452, + "loss": 4.8314, + "step": 962 + }, + { + "epoch": 0.09985159647201394, + "grad_norm": 1.0546875, + "learning_rate": 0.00019946468298054636, + "loss": 4.8393, + "step": 963 + }, + { + "epoch": 0.09995528452650201, + "grad_norm": 1.046875, + "learning_rate": 0.00019946355996165006, + "loss": 4.8274, + "step": 964 + }, + { + "epoch": 0.10005897258099009, + "grad_norm": 1.0, + "learning_rate": 0.00019946243576918893, + "loss": 4.844, + "step": 965 + }, + { + "epoch": 0.10016266063547817, + "grad_norm": 0.90625, + "learning_rate": 0.00019946131040317618, + "loss": 4.8438, + "step": 966 + }, + { + "epoch": 0.10026634868996624, + "grad_norm": 0.76171875, + "learning_rate": 0.00019946018386362508, + "loss": 4.8389, + "step": 967 + }, + { + "epoch": 0.10037003674445431, + "grad_norm": 0.7734375, + "learning_rate": 0.00019945905615054898, + "loss": 4.822, + "step": 968 + }, + { + "epoch": 0.10047372479894238, + "grad_norm": 0.8203125, + "learning_rate": 0.00019945792726396114, + "loss": 4.8085, + "step": 969 + }, + { + "epoch": 0.10057741285343046, + "grad_norm": 0.890625, + "learning_rate": 0.00019945679720387486, + "loss": 4.8656, + "step": 970 + }, + { + "epoch": 0.10068110090791853, + "grad_norm": 1.03125, + "learning_rate": 0.00019945566597030353, + "loss": 4.8728, + "step": 971 + }, + { + "epoch": 0.1007847889624066, + "grad_norm": 0.9765625, + "learning_rate": 0.00019945453356326045, + "loss": 4.8481, + "step": 972 + }, + { + "epoch": 0.10088847701689467, + "grad_norm": 0.80859375, + "learning_rate": 0.00019945339998275903, + "loss": 4.8336, + "step": 973 + }, + { + "epoch": 0.10099216507138274, + "grad_norm": 0.71875, + "learning_rate": 0.0001994522652288126, + "loss": 4.8323, + "step": 974 + }, + { + "epoch": 0.10109585312587081, + "grad_norm": 0.81640625, + "learning_rate": 0.00019945112930143456, + "loss": 4.8381, + "step": 975 + }, + { + "epoch": 0.10119954118035888, + "grad_norm": 0.88671875, + "learning_rate": 0.00019944999220063834, + "loss": 4.8525, + "step": 976 + }, + { + "epoch": 0.10130322923484697, + "grad_norm": 0.87109375, + "learning_rate": 0.00019944885392643734, + "loss": 4.8475, + "step": 977 + }, + { + "epoch": 0.10140691728933504, + "grad_norm": 0.890625, + "learning_rate": 0.00019944771447884496, + "loss": 4.8167, + "step": 978 + }, + { + "epoch": 0.10151060534382311, + "grad_norm": 0.94921875, + "learning_rate": 0.00019944657385787467, + "loss": 4.8381, + "step": 979 + }, + { + "epoch": 0.10161429339831118, + "grad_norm": 0.93359375, + "learning_rate": 0.00019944543206353995, + "loss": 4.8668, + "step": 980 + }, + { + "epoch": 0.10171798145279926, + "grad_norm": 0.80078125, + "learning_rate": 0.00019944428909585423, + "loss": 4.8108, + "step": 981 + }, + { + "epoch": 0.10182166950728733, + "grad_norm": 0.8515625, + "learning_rate": 0.00019944314495483104, + "loss": 4.8041, + "step": 982 + }, + { + "epoch": 0.1019253575617754, + "grad_norm": 1.015625, + "learning_rate": 0.0001994419996404838, + "loss": 4.7916, + "step": 983 + }, + { + "epoch": 0.10202904561626347, + "grad_norm": 1.0546875, + "learning_rate": 0.00019944085315282614, + "loss": 4.8384, + "step": 984 + }, + { + "epoch": 0.10213273367075154, + "grad_norm": 1.09375, + "learning_rate": 0.0001994397054918715, + "loss": 4.8389, + "step": 985 + }, + { + "epoch": 0.10223642172523961, + "grad_norm": 0.7734375, + "learning_rate": 0.00019943855665763345, + "loss": 4.8711, + "step": 986 + }, + { + "epoch": 0.10234010977972768, + "grad_norm": 0.66796875, + "learning_rate": 0.00019943740665012553, + "loss": 4.8159, + "step": 987 + }, + { + "epoch": 0.10244379783421576, + "grad_norm": 1.0234375, + "learning_rate": 0.00019943625546936134, + "loss": 4.7974, + "step": 988 + }, + { + "epoch": 0.10254748588870384, + "grad_norm": 0.9609375, + "learning_rate": 0.00019943510311535445, + "loss": 4.8683, + "step": 989 + }, + { + "epoch": 0.10265117394319191, + "grad_norm": 0.734375, + "learning_rate": 0.00019943394958811842, + "loss": 4.8304, + "step": 990 + }, + { + "epoch": 0.10275486199767998, + "grad_norm": 0.62109375, + "learning_rate": 0.00019943279488766693, + "loss": 4.8283, + "step": 991 + }, + { + "epoch": 0.10285855005216805, + "grad_norm": 0.80859375, + "learning_rate": 0.00019943163901401355, + "loss": 4.8337, + "step": 992 + }, + { + "epoch": 0.10296223810665613, + "grad_norm": 0.79296875, + "learning_rate": 0.0001994304819671719, + "loss": 4.8154, + "step": 993 + }, + { + "epoch": 0.1030659261611442, + "grad_norm": 0.57421875, + "learning_rate": 0.0001994293237471557, + "loss": 4.8173, + "step": 994 + }, + { + "epoch": 0.10316961421563227, + "grad_norm": 0.73046875, + "learning_rate": 0.0001994281643539786, + "loss": 4.8074, + "step": 995 + }, + { + "epoch": 0.10327330227012034, + "grad_norm": 0.671875, + "learning_rate": 0.00019942700378765423, + "loss": 4.8224, + "step": 996 + }, + { + "epoch": 0.10337699032460841, + "grad_norm": 0.58203125, + "learning_rate": 0.00019942584204819632, + "loss": 4.8471, + "step": 997 + }, + { + "epoch": 0.10348067837909648, + "grad_norm": 0.64453125, + "learning_rate": 0.00019942467913561859, + "loss": 4.776, + "step": 998 + }, + { + "epoch": 0.10358436643358455, + "grad_norm": 0.68359375, + "learning_rate": 0.0001994235150499347, + "loss": 4.8177, + "step": 999 + }, + { + "epoch": 0.10368805448807264, + "grad_norm": 0.8515625, + "learning_rate": 0.00019942234979115848, + "loss": 4.8716, + "step": 1000 + }, + { + "epoch": 0.10379174254256071, + "grad_norm": 0.70703125, + "learning_rate": 0.0001994211833593036, + "loss": 4.8509, + "step": 1001 + }, + { + "epoch": 0.10389543059704878, + "grad_norm": 0.5, + "learning_rate": 0.00019942001575438384, + "loss": 4.7899, + "step": 1002 + }, + { + "epoch": 0.10399911865153685, + "grad_norm": 0.65625, + "learning_rate": 0.00019941884697641298, + "loss": 4.8424, + "step": 1003 + }, + { + "epoch": 0.10410280670602493, + "grad_norm": 0.63671875, + "learning_rate": 0.00019941767702540483, + "loss": 4.8242, + "step": 1004 + }, + { + "epoch": 0.104206494760513, + "grad_norm": 0.69921875, + "learning_rate": 0.0001994165059013732, + "loss": 4.8327, + "step": 1005 + }, + { + "epoch": 0.10431018281500107, + "grad_norm": 0.640625, + "learning_rate": 0.00019941533360433184, + "loss": 4.8073, + "step": 1006 + }, + { + "epoch": 0.10441387086948914, + "grad_norm": 0.56640625, + "learning_rate": 0.00019941416013429468, + "loss": 4.82, + "step": 1007 + }, + { + "epoch": 0.10451755892397721, + "grad_norm": 0.8046875, + "learning_rate": 0.0001994129854912755, + "loss": 4.8072, + "step": 1008 + }, + { + "epoch": 0.10462124697846528, + "grad_norm": 0.6640625, + "learning_rate": 0.00019941180967528817, + "loss": 4.8173, + "step": 1009 + }, + { + "epoch": 0.10472493503295335, + "grad_norm": 0.609375, + "learning_rate": 0.00019941063268634655, + "loss": 4.7941, + "step": 1010 + }, + { + "epoch": 0.10482862308744144, + "grad_norm": 0.859375, + "learning_rate": 0.00019940945452446456, + "loss": 4.8366, + "step": 1011 + }, + { + "epoch": 0.10493231114192951, + "grad_norm": 0.7890625, + "learning_rate": 0.0001994082751896561, + "loss": 4.832, + "step": 1012 + }, + { + "epoch": 0.10503599919641758, + "grad_norm": 0.68359375, + "learning_rate": 0.00019940709468193509, + "loss": 4.8127, + "step": 1013 + }, + { + "epoch": 0.10513968725090565, + "grad_norm": 0.734375, + "learning_rate": 0.00019940591300131538, + "loss": 4.8416, + "step": 1014 + }, + { + "epoch": 0.10524337530539372, + "grad_norm": 0.68359375, + "learning_rate": 0.00019940473014781103, + "loss": 4.8164, + "step": 1015 + }, + { + "epoch": 0.1053470633598818, + "grad_norm": 0.84765625, + "learning_rate": 0.0001994035461214359, + "loss": 4.8007, + "step": 1016 + }, + { + "epoch": 0.10545075141436987, + "grad_norm": 0.8125, + "learning_rate": 0.00019940236092220404, + "loss": 4.7715, + "step": 1017 + }, + { + "epoch": 0.10555443946885794, + "grad_norm": 0.77734375, + "learning_rate": 0.00019940117455012935, + "loss": 4.8128, + "step": 1018 + }, + { + "epoch": 0.10565812752334601, + "grad_norm": 0.58203125, + "learning_rate": 0.00019939998700522587, + "loss": 4.8322, + "step": 1019 + }, + { + "epoch": 0.10576181557783408, + "grad_norm": 0.79296875, + "learning_rate": 0.00019939879828750768, + "loss": 4.7856, + "step": 1020 + }, + { + "epoch": 0.10586550363232215, + "grad_norm": 0.6875, + "learning_rate": 0.00019939760839698866, + "loss": 4.7796, + "step": 1021 + }, + { + "epoch": 0.10596919168681024, + "grad_norm": 0.65625, + "learning_rate": 0.00019939641733368298, + "loss": 4.812, + "step": 1022 + }, + { + "epoch": 0.10607287974129831, + "grad_norm": 0.796875, + "learning_rate": 0.00019939522509760462, + "loss": 4.8104, + "step": 1023 + }, + { + "epoch": 0.10617656779578638, + "grad_norm": 0.8125, + "learning_rate": 0.0001993940316887677, + "loss": 4.8058, + "step": 1024 + }, + { + "epoch": 0.10628025585027445, + "grad_norm": 0.8359375, + "learning_rate": 0.00019939283710718623, + "loss": 4.8323, + "step": 1025 + }, + { + "epoch": 0.10638394390476252, + "grad_norm": 0.92578125, + "learning_rate": 0.00019939164135287433, + "loss": 4.8172, + "step": 1026 + }, + { + "epoch": 0.1064876319592506, + "grad_norm": 0.7109375, + "learning_rate": 0.00019939044442584617, + "loss": 4.8447, + "step": 1027 + }, + { + "epoch": 0.10659132001373867, + "grad_norm": 0.703125, + "learning_rate": 0.00019938924632611582, + "loss": 4.8457, + "step": 1028 + }, + { + "epoch": 0.10669500806822674, + "grad_norm": 0.69921875, + "learning_rate": 0.00019938804705369741, + "loss": 4.8359, + "step": 1029 + }, + { + "epoch": 0.10679869612271481, + "grad_norm": 0.67578125, + "learning_rate": 0.00019938684660860513, + "loss": 4.8462, + "step": 1030 + }, + { + "epoch": 0.10690238417720288, + "grad_norm": 0.69921875, + "learning_rate": 0.00019938564499085305, + "loss": 4.853, + "step": 1031 + }, + { + "epoch": 0.10700607223169095, + "grad_norm": 0.7421875, + "learning_rate": 0.00019938444220045547, + "loss": 4.7956, + "step": 1032 + }, + { + "epoch": 0.10710976028617904, + "grad_norm": 0.59765625, + "learning_rate": 0.0001993832382374265, + "loss": 4.8215, + "step": 1033 + }, + { + "epoch": 0.10721344834066711, + "grad_norm": 0.77734375, + "learning_rate": 0.0001993820331017804, + "loss": 4.8004, + "step": 1034 + }, + { + "epoch": 0.10731713639515518, + "grad_norm": 0.796875, + "learning_rate": 0.00019938082679353132, + "loss": 4.8516, + "step": 1035 + }, + { + "epoch": 0.10742082444964325, + "grad_norm": 0.80859375, + "learning_rate": 0.00019937961931269357, + "loss": 4.7898, + "step": 1036 + }, + { + "epoch": 0.10752451250413132, + "grad_norm": 0.625, + "learning_rate": 0.00019937841065928135, + "loss": 4.8164, + "step": 1037 + }, + { + "epoch": 0.1076282005586194, + "grad_norm": 0.6015625, + "learning_rate": 0.00019937720083330893, + "loss": 4.8158, + "step": 1038 + }, + { + "epoch": 0.10773188861310747, + "grad_norm": 0.65234375, + "learning_rate": 0.00019937598983479058, + "loss": 4.8225, + "step": 1039 + }, + { + "epoch": 0.10783557666759554, + "grad_norm": 0.703125, + "learning_rate": 0.0001993747776637406, + "loss": 4.7566, + "step": 1040 + }, + { + "epoch": 0.10793926472208361, + "grad_norm": 0.62890625, + "learning_rate": 0.00019937356432017328, + "loss": 4.8108, + "step": 1041 + }, + { + "epoch": 0.10804295277657168, + "grad_norm": 0.640625, + "learning_rate": 0.00019937234980410296, + "loss": 4.7864, + "step": 1042 + }, + { + "epoch": 0.10814664083105975, + "grad_norm": 0.7109375, + "learning_rate": 0.00019937113411554395, + "loss": 4.8389, + "step": 1043 + }, + { + "epoch": 0.10825032888554784, + "grad_norm": 0.6953125, + "learning_rate": 0.00019936991725451057, + "loss": 4.8236, + "step": 1044 + }, + { + "epoch": 0.10835401694003591, + "grad_norm": 0.69921875, + "learning_rate": 0.00019936869922101727, + "loss": 4.8259, + "step": 1045 + }, + { + "epoch": 0.10845770499452398, + "grad_norm": 0.69921875, + "learning_rate": 0.0001993674800150783, + "loss": 4.7816, + "step": 1046 + }, + { + "epoch": 0.10856139304901205, + "grad_norm": 0.66796875, + "learning_rate": 0.00019936625963670813, + "loss": 4.7689, + "step": 1047 + }, + { + "epoch": 0.10866508110350012, + "grad_norm": 0.7734375, + "learning_rate": 0.0001993650380859211, + "loss": 4.8243, + "step": 1048 + }, + { + "epoch": 0.1087687691579882, + "grad_norm": 0.828125, + "learning_rate": 0.00019936381536273166, + "loss": 4.828, + "step": 1049 + }, + { + "epoch": 0.10887245721247626, + "grad_norm": 0.87109375, + "learning_rate": 0.00019936259146715425, + "loss": 4.8372, + "step": 1050 + }, + { + "epoch": 0.10897614526696434, + "grad_norm": 0.89453125, + "learning_rate": 0.0001993613663992033, + "loss": 4.7794, + "step": 1051 + }, + { + "epoch": 0.10907983332145241, + "grad_norm": 0.7890625, + "learning_rate": 0.00019936014015889321, + "loss": 4.7735, + "step": 1052 + }, + { + "epoch": 0.10918352137594048, + "grad_norm": 0.78125, + "learning_rate": 0.00019935891274623852, + "loss": 4.7367, + "step": 1053 + }, + { + "epoch": 0.10928720943042855, + "grad_norm": 0.76953125, + "learning_rate": 0.00019935768416125371, + "loss": 4.8041, + "step": 1054 + }, + { + "epoch": 0.10939089748491662, + "grad_norm": 0.875, + "learning_rate": 0.00019935645440395322, + "loss": 4.8382, + "step": 1055 + }, + { + "epoch": 0.1094945855394047, + "grad_norm": 0.93359375, + "learning_rate": 0.0001993552234743516, + "loss": 4.7804, + "step": 1056 + }, + { + "epoch": 0.10959827359389278, + "grad_norm": 0.77734375, + "learning_rate": 0.00019935399137246338, + "loss": 4.8278, + "step": 1057 + }, + { + "epoch": 0.10970196164838085, + "grad_norm": 0.68359375, + "learning_rate": 0.00019935275809830307, + "loss": 4.7946, + "step": 1058 + }, + { + "epoch": 0.10980564970286892, + "grad_norm": 0.7265625, + "learning_rate": 0.00019935152365188526, + "loss": 4.8324, + "step": 1059 + }, + { + "epoch": 0.10990933775735699, + "grad_norm": 0.65234375, + "learning_rate": 0.00019935028803322444, + "loss": 4.7871, + "step": 1060 + }, + { + "epoch": 0.11001302581184506, + "grad_norm": 0.671875, + "learning_rate": 0.00019934905124233528, + "loss": 4.7859, + "step": 1061 + }, + { + "epoch": 0.11011671386633314, + "grad_norm": 0.75, + "learning_rate": 0.00019934781327923232, + "loss": 4.7975, + "step": 1062 + }, + { + "epoch": 0.1102204019208212, + "grad_norm": 0.921875, + "learning_rate": 0.00019934657414393016, + "loss": 4.785, + "step": 1063 + }, + { + "epoch": 0.11032408997530928, + "grad_norm": 0.890625, + "learning_rate": 0.00019934533383644348, + "loss": 4.8142, + "step": 1064 + }, + { + "epoch": 0.11042777802979735, + "grad_norm": 0.99609375, + "learning_rate": 0.00019934409235678683, + "loss": 4.8027, + "step": 1065 + }, + { + "epoch": 0.11053146608428542, + "grad_norm": 1.046875, + "learning_rate": 0.00019934284970497492, + "loss": 4.8281, + "step": 1066 + }, + { + "epoch": 0.1106351541387735, + "grad_norm": 0.80078125, + "learning_rate": 0.00019934160588102242, + "loss": 4.8019, + "step": 1067 + }, + { + "epoch": 0.11073884219326158, + "grad_norm": 0.9609375, + "learning_rate": 0.00019934036088494394, + "loss": 4.7709, + "step": 1068 + }, + { + "epoch": 0.11084253024774965, + "grad_norm": 1.0, + "learning_rate": 0.00019933911471675423, + "loss": 4.7505, + "step": 1069 + }, + { + "epoch": 0.11094621830223772, + "grad_norm": 0.9375, + "learning_rate": 0.00019933786737646797, + "loss": 4.8155, + "step": 1070 + }, + { + "epoch": 0.11104990635672579, + "grad_norm": 1.203125, + "learning_rate": 0.00019933661886409988, + "loss": 4.8044, + "step": 1071 + }, + { + "epoch": 0.11115359441121386, + "grad_norm": 0.9140625, + "learning_rate": 0.00019933536917966468, + "loss": 4.8024, + "step": 1072 + }, + { + "epoch": 0.11125728246570193, + "grad_norm": 0.94140625, + "learning_rate": 0.00019933411832317712, + "loss": 4.7923, + "step": 1073 + }, + { + "epoch": 0.11136097052019, + "grad_norm": 0.8828125, + "learning_rate": 0.000199332866294652, + "loss": 4.7929, + "step": 1074 + }, + { + "epoch": 0.11146465857467808, + "grad_norm": 0.9296875, + "learning_rate": 0.00019933161309410402, + "loss": 4.8082, + "step": 1075 + }, + { + "epoch": 0.11156834662916615, + "grad_norm": 1.0703125, + "learning_rate": 0.00019933035872154802, + "loss": 4.7484, + "step": 1076 + }, + { + "epoch": 0.11167203468365422, + "grad_norm": 1.046875, + "learning_rate": 0.00019932910317699878, + "loss": 4.7837, + "step": 1077 + }, + { + "epoch": 0.1117757227381423, + "grad_norm": 0.96484375, + "learning_rate": 0.00019932784646047112, + "loss": 4.8484, + "step": 1078 + }, + { + "epoch": 0.11187941079263038, + "grad_norm": 0.89453125, + "learning_rate": 0.00019932658857197986, + "loss": 4.8276, + "step": 1079 + }, + { + "epoch": 0.11198309884711845, + "grad_norm": 0.765625, + "learning_rate": 0.00019932532951153986, + "loss": 4.7676, + "step": 1080 + }, + { + "epoch": 0.11208678690160652, + "grad_norm": 0.8203125, + "learning_rate": 0.00019932406927916595, + "loss": 4.8098, + "step": 1081 + }, + { + "epoch": 0.11219047495609459, + "grad_norm": 0.82421875, + "learning_rate": 0.000199322807874873, + "loss": 4.8138, + "step": 1082 + }, + { + "epoch": 0.11229416301058266, + "grad_norm": 0.75390625, + "learning_rate": 0.00019932154529867595, + "loss": 4.8027, + "step": 1083 + }, + { + "epoch": 0.11239785106507073, + "grad_norm": 0.828125, + "learning_rate": 0.00019932028155058963, + "loss": 4.8091, + "step": 1084 + }, + { + "epoch": 0.1125015391195588, + "grad_norm": 0.95703125, + "learning_rate": 0.00019931901663062894, + "loss": 4.787, + "step": 1085 + }, + { + "epoch": 0.11260522717404688, + "grad_norm": 0.8125, + "learning_rate": 0.00019931775053880888, + "loss": 4.7998, + "step": 1086 + }, + { + "epoch": 0.11270891522853495, + "grad_norm": 0.86328125, + "learning_rate": 0.00019931648327514435, + "loss": 4.819, + "step": 1087 + }, + { + "epoch": 0.11281260328302302, + "grad_norm": 0.9375, + "learning_rate": 0.0001993152148396503, + "loss": 4.7823, + "step": 1088 + }, + { + "epoch": 0.1129162913375111, + "grad_norm": 0.77734375, + "learning_rate": 0.00019931394523234165, + "loss": 4.804, + "step": 1089 + }, + { + "epoch": 0.11301997939199918, + "grad_norm": 0.65625, + "learning_rate": 0.00019931267445323346, + "loss": 4.8188, + "step": 1090 + }, + { + "epoch": 0.11312366744648725, + "grad_norm": 0.85546875, + "learning_rate": 0.00019931140250234068, + "loss": 4.8397, + "step": 1091 + }, + { + "epoch": 0.11322735550097532, + "grad_norm": 0.7578125, + "learning_rate": 0.00019931012937967834, + "loss": 4.7646, + "step": 1092 + }, + { + "epoch": 0.11333104355546339, + "grad_norm": 0.75390625, + "learning_rate": 0.00019930885508526145, + "loss": 4.8125, + "step": 1093 + }, + { + "epoch": 0.11343473160995146, + "grad_norm": 0.81640625, + "learning_rate": 0.000199307579619105, + "loss": 4.7948, + "step": 1094 + }, + { + "epoch": 0.11353841966443953, + "grad_norm": 0.66015625, + "learning_rate": 0.00019930630298122415, + "loss": 4.7597, + "step": 1095 + }, + { + "epoch": 0.1136421077189276, + "grad_norm": 0.66796875, + "learning_rate": 0.00019930502517163386, + "loss": 4.7938, + "step": 1096 + }, + { + "epoch": 0.11374579577341568, + "grad_norm": 0.83984375, + "learning_rate": 0.00019930374619034927, + "loss": 4.8352, + "step": 1097 + }, + { + "epoch": 0.11384948382790375, + "grad_norm": 0.81640625, + "learning_rate": 0.0001993024660373854, + "loss": 4.8116, + "step": 1098 + }, + { + "epoch": 0.11395317188239182, + "grad_norm": 0.734375, + "learning_rate": 0.00019930118471275744, + "loss": 4.7617, + "step": 1099 + }, + { + "epoch": 0.1140568599368799, + "grad_norm": 0.77734375, + "learning_rate": 0.00019929990221648043, + "loss": 4.7716, + "step": 1100 + }, + { + "epoch": 0.11416054799136797, + "grad_norm": 0.76171875, + "learning_rate": 0.00019929861854856956, + "loss": 4.8149, + "step": 1101 + }, + { + "epoch": 0.11426423604585605, + "grad_norm": 0.73828125, + "learning_rate": 0.00019929733370903995, + "loss": 4.7617, + "step": 1102 + }, + { + "epoch": 0.11436792410034412, + "grad_norm": 0.6484375, + "learning_rate": 0.00019929604769790675, + "loss": 4.7973, + "step": 1103 + }, + { + "epoch": 0.11447161215483219, + "grad_norm": 0.640625, + "learning_rate": 0.00019929476051518518, + "loss": 4.8084, + "step": 1104 + }, + { + "epoch": 0.11457530020932026, + "grad_norm": 0.734375, + "learning_rate": 0.00019929347216089037, + "loss": 4.8131, + "step": 1105 + }, + { + "epoch": 0.11467898826380833, + "grad_norm": 0.8203125, + "learning_rate": 0.00019929218263503752, + "loss": 4.7745, + "step": 1106 + }, + { + "epoch": 0.1147826763182964, + "grad_norm": 0.90625, + "learning_rate": 0.00019929089193764193, + "loss": 4.7861, + "step": 1107 + }, + { + "epoch": 0.11488636437278447, + "grad_norm": 0.9296875, + "learning_rate": 0.00019928960006871874, + "loss": 4.7646, + "step": 1108 + }, + { + "epoch": 0.11499005242727255, + "grad_norm": 0.6953125, + "learning_rate": 0.00019928830702828322, + "loss": 4.8098, + "step": 1109 + }, + { + "epoch": 0.11509374048176062, + "grad_norm": 0.62109375, + "learning_rate": 0.00019928701281635066, + "loss": 4.7654, + "step": 1110 + }, + { + "epoch": 0.1151974285362487, + "grad_norm": 0.84375, + "learning_rate": 0.00019928571743293625, + "loss": 4.7427, + "step": 1111 + }, + { + "epoch": 0.11530111659073677, + "grad_norm": 1.046875, + "learning_rate": 0.00019928442087805537, + "loss": 4.8149, + "step": 1112 + }, + { + "epoch": 0.11540480464522485, + "grad_norm": 0.75, + "learning_rate": 0.00019928312315172324, + "loss": 4.8035, + "step": 1113 + }, + { + "epoch": 0.11550849269971292, + "grad_norm": 0.640625, + "learning_rate": 0.0001992818242539552, + "loss": 4.7691, + "step": 1114 + }, + { + "epoch": 0.11561218075420099, + "grad_norm": 0.734375, + "learning_rate": 0.0001992805241847666, + "loss": 4.7827, + "step": 1115 + }, + { + "epoch": 0.11571586880868906, + "grad_norm": 0.91796875, + "learning_rate": 0.00019927922294417277, + "loss": 4.7606, + "step": 1116 + }, + { + "epoch": 0.11581955686317713, + "grad_norm": 0.921875, + "learning_rate": 0.00019927792053218903, + "loss": 4.7606, + "step": 1117 + }, + { + "epoch": 0.1159232449176652, + "grad_norm": 0.828125, + "learning_rate": 0.00019927661694883075, + "loss": 4.7667, + "step": 1118 + }, + { + "epoch": 0.11602693297215327, + "grad_norm": 0.84375, + "learning_rate": 0.00019927531219411337, + "loss": 4.7663, + "step": 1119 + }, + { + "epoch": 0.11613062102664135, + "grad_norm": 0.65234375, + "learning_rate": 0.00019927400626805223, + "loss": 4.7812, + "step": 1120 + }, + { + "epoch": 0.11623430908112942, + "grad_norm": 0.76953125, + "learning_rate": 0.00019927269917066273, + "loss": 4.7531, + "step": 1121 + }, + { + "epoch": 0.11633799713561749, + "grad_norm": 0.74609375, + "learning_rate": 0.00019927139090196035, + "loss": 4.7844, + "step": 1122 + }, + { + "epoch": 0.11644168519010557, + "grad_norm": 0.69921875, + "learning_rate": 0.00019927008146196048, + "loss": 4.7869, + "step": 1123 + }, + { + "epoch": 0.11654537324459364, + "grad_norm": 0.80859375, + "learning_rate": 0.0001992687708506786, + "loss": 4.78, + "step": 1124 + }, + { + "epoch": 0.11664906129908172, + "grad_norm": 0.6953125, + "learning_rate": 0.00019926745906813012, + "loss": 4.7729, + "step": 1125 + }, + { + "epoch": 0.11675274935356979, + "grad_norm": 0.69140625, + "learning_rate": 0.00019926614611433057, + "loss": 4.8032, + "step": 1126 + }, + { + "epoch": 0.11685643740805786, + "grad_norm": 0.66796875, + "learning_rate": 0.0001992648319892954, + "loss": 4.7693, + "step": 1127 + }, + { + "epoch": 0.11696012546254593, + "grad_norm": 0.6953125, + "learning_rate": 0.00019926351669304017, + "loss": 4.7452, + "step": 1128 + }, + { + "epoch": 0.117063813517034, + "grad_norm": 0.6640625, + "learning_rate": 0.00019926220022558036, + "loss": 4.7493, + "step": 1129 + }, + { + "epoch": 0.11716750157152207, + "grad_norm": 0.62890625, + "learning_rate": 0.00019926088258693153, + "loss": 4.7705, + "step": 1130 + }, + { + "epoch": 0.11727118962601014, + "grad_norm": 0.703125, + "learning_rate": 0.0001992595637771092, + "loss": 4.7883, + "step": 1131 + }, + { + "epoch": 0.11737487768049822, + "grad_norm": 0.71484375, + "learning_rate": 0.0001992582437961289, + "loss": 4.767, + "step": 1132 + }, + { + "epoch": 0.11747856573498629, + "grad_norm": 0.67578125, + "learning_rate": 0.00019925692264400629, + "loss": 4.7713, + "step": 1133 + }, + { + "epoch": 0.11758225378947437, + "grad_norm": 0.68359375, + "learning_rate": 0.0001992556003207569, + "loss": 4.7332, + "step": 1134 + }, + { + "epoch": 0.11768594184396244, + "grad_norm": 0.77734375, + "learning_rate": 0.00019925427682639636, + "loss": 4.7582, + "step": 1135 + }, + { + "epoch": 0.11778962989845052, + "grad_norm": 0.72265625, + "learning_rate": 0.00019925295216094023, + "loss": 4.8117, + "step": 1136 + }, + { + "epoch": 0.11789331795293859, + "grad_norm": 0.609375, + "learning_rate": 0.00019925162632440419, + "loss": 4.7745, + "step": 1137 + }, + { + "epoch": 0.11799700600742666, + "grad_norm": 0.6015625, + "learning_rate": 0.0001992502993168039, + "loss": 4.7928, + "step": 1138 + }, + { + "epoch": 0.11810069406191473, + "grad_norm": 0.671875, + "learning_rate": 0.00019924897113815496, + "loss": 4.7954, + "step": 1139 + }, + { + "epoch": 0.1182043821164028, + "grad_norm": 0.65234375, + "learning_rate": 0.0001992476417884731, + "loss": 4.7575, + "step": 1140 + }, + { + "epoch": 0.11830807017089087, + "grad_norm": 0.62109375, + "learning_rate": 0.00019924631126777396, + "loss": 4.7496, + "step": 1141 + }, + { + "epoch": 0.11841175822537894, + "grad_norm": 0.65625, + "learning_rate": 0.00019924497957607326, + "loss": 4.7866, + "step": 1142 + }, + { + "epoch": 0.11851544627986701, + "grad_norm": 0.84375, + "learning_rate": 0.00019924364671338672, + "loss": 4.773, + "step": 1143 + }, + { + "epoch": 0.11861913433435509, + "grad_norm": 0.6953125, + "learning_rate": 0.00019924231267973004, + "loss": 4.7703, + "step": 1144 + }, + { + "epoch": 0.11872282238884317, + "grad_norm": 0.6875, + "learning_rate": 0.00019924097747511896, + "loss": 4.7405, + "step": 1145 + }, + { + "epoch": 0.11882651044333124, + "grad_norm": 0.79296875, + "learning_rate": 0.00019923964109956925, + "loss": 4.7476, + "step": 1146 + }, + { + "epoch": 0.11893019849781931, + "grad_norm": 0.828125, + "learning_rate": 0.00019923830355309669, + "loss": 4.7496, + "step": 1147 + }, + { + "epoch": 0.11903388655230739, + "grad_norm": 0.91015625, + "learning_rate": 0.00019923696483571703, + "loss": 4.7838, + "step": 1148 + }, + { + "epoch": 0.11913757460679546, + "grad_norm": 0.8828125, + "learning_rate": 0.00019923562494744611, + "loss": 4.7803, + "step": 1149 + }, + { + "epoch": 0.11924126266128353, + "grad_norm": 0.59375, + "learning_rate": 0.0001992342838882997, + "loss": 4.7226, + "step": 1150 + }, + { + "epoch": 0.1193449507157716, + "grad_norm": 0.68359375, + "learning_rate": 0.00019923294165829364, + "loss": 4.7638, + "step": 1151 + }, + { + "epoch": 0.11944863877025967, + "grad_norm": 0.7890625, + "learning_rate": 0.00019923159825744376, + "loss": 4.7377, + "step": 1152 + }, + { + "epoch": 0.11955232682474774, + "grad_norm": 0.71484375, + "learning_rate": 0.0001992302536857659, + "loss": 4.7789, + "step": 1153 + }, + { + "epoch": 0.11965601487923581, + "grad_norm": 0.59375, + "learning_rate": 0.00019922890794327595, + "loss": 4.7689, + "step": 1154 + }, + { + "epoch": 0.11975970293372389, + "grad_norm": 0.7265625, + "learning_rate": 0.00019922756102998975, + "loss": 4.7883, + "step": 1155 + }, + { + "epoch": 0.11986339098821197, + "grad_norm": 0.65234375, + "learning_rate": 0.00019922621294592326, + "loss": 4.7345, + "step": 1156 + }, + { + "epoch": 0.11996707904270004, + "grad_norm": 0.72265625, + "learning_rate": 0.00019922486369109232, + "loss": 4.7087, + "step": 1157 + }, + { + "epoch": 0.12007076709718811, + "grad_norm": 0.875, + "learning_rate": 0.00019922351326551288, + "loss": 4.764, + "step": 1158 + }, + { + "epoch": 0.12017445515167618, + "grad_norm": 0.84765625, + "learning_rate": 0.00019922216166920088, + "loss": 4.751, + "step": 1159 + }, + { + "epoch": 0.12027814320616426, + "grad_norm": 0.6875, + "learning_rate": 0.00019922080890217222, + "loss": 4.7546, + "step": 1160 + }, + { + "epoch": 0.12038183126065233, + "grad_norm": 0.61328125, + "learning_rate": 0.00019921945496444293, + "loss": 4.7621, + "step": 1161 + }, + { + "epoch": 0.1204855193151404, + "grad_norm": 0.70703125, + "learning_rate": 0.00019921809985602894, + "loss": 4.729, + "step": 1162 + }, + { + "epoch": 0.12058920736962847, + "grad_norm": 0.71875, + "learning_rate": 0.00019921674357694624, + "loss": 4.8148, + "step": 1163 + }, + { + "epoch": 0.12069289542411654, + "grad_norm": 0.60546875, + "learning_rate": 0.00019921538612721084, + "loss": 4.7172, + "step": 1164 + }, + { + "epoch": 0.12079658347860461, + "grad_norm": 0.83984375, + "learning_rate": 0.0001992140275068388, + "loss": 4.7725, + "step": 1165 + }, + { + "epoch": 0.12090027153309268, + "grad_norm": 0.7578125, + "learning_rate": 0.00019921266771584604, + "loss": 4.7636, + "step": 1166 + }, + { + "epoch": 0.12100395958758077, + "grad_norm": 0.609375, + "learning_rate": 0.0001992113067542487, + "loss": 4.7482, + "step": 1167 + }, + { + "epoch": 0.12110764764206884, + "grad_norm": 0.72265625, + "learning_rate": 0.00019920994462206283, + "loss": 4.7445, + "step": 1168 + }, + { + "epoch": 0.12121133569655691, + "grad_norm": 0.76171875, + "learning_rate": 0.0001992085813193045, + "loss": 4.8108, + "step": 1169 + }, + { + "epoch": 0.12131502375104498, + "grad_norm": 0.6875, + "learning_rate": 0.00019920721684598975, + "loss": 4.7589, + "step": 1170 + }, + { + "epoch": 0.12141871180553306, + "grad_norm": 0.5703125, + "learning_rate": 0.0001992058512021347, + "loss": 4.7789, + "step": 1171 + }, + { + "epoch": 0.12152239986002113, + "grad_norm": 0.6171875, + "learning_rate": 0.0001992044843877555, + "loss": 4.757, + "step": 1172 + }, + { + "epoch": 0.1216260879145092, + "grad_norm": 0.7421875, + "learning_rate": 0.00019920311640286823, + "loss": 4.7454, + "step": 1173 + }, + { + "epoch": 0.12172977596899727, + "grad_norm": 0.66796875, + "learning_rate": 0.00019920174724748902, + "loss": 4.7181, + "step": 1174 + }, + { + "epoch": 0.12183346402348534, + "grad_norm": 0.55859375, + "learning_rate": 0.00019920037692163409, + "loss": 4.7726, + "step": 1175 + }, + { + "epoch": 0.12193715207797341, + "grad_norm": 0.609375, + "learning_rate": 0.00019919900542531956, + "loss": 4.7689, + "step": 1176 + }, + { + "epoch": 0.12204084013246148, + "grad_norm": 0.66015625, + "learning_rate": 0.00019919763275856164, + "loss": 4.7677, + "step": 1177 + }, + { + "epoch": 0.12214452818694957, + "grad_norm": 0.671875, + "learning_rate": 0.0001991962589213765, + "loss": 4.77, + "step": 1178 + }, + { + "epoch": 0.12224821624143764, + "grad_norm": 0.55078125, + "learning_rate": 0.00019919488391378034, + "loss": 4.7638, + "step": 1179 + }, + { + "epoch": 0.12235190429592571, + "grad_norm": 0.6171875, + "learning_rate": 0.0001991935077357894, + "loss": 4.7877, + "step": 1180 + }, + { + "epoch": 0.12245559235041378, + "grad_norm": 0.609375, + "learning_rate": 0.00019919213038741996, + "loss": 4.7555, + "step": 1181 + }, + { + "epoch": 0.12255928040490185, + "grad_norm": 0.71484375, + "learning_rate": 0.00019919075186868824, + "loss": 4.7536, + "step": 1182 + }, + { + "epoch": 0.12266296845938993, + "grad_norm": 0.703125, + "learning_rate": 0.00019918937217961043, + "loss": 4.7577, + "step": 1183 + }, + { + "epoch": 0.122766656513878, + "grad_norm": 0.578125, + "learning_rate": 0.0001991879913202029, + "loss": 4.7453, + "step": 1184 + }, + { + "epoch": 0.12287034456836607, + "grad_norm": 0.5625, + "learning_rate": 0.00019918660929048196, + "loss": 4.8053, + "step": 1185 + }, + { + "epoch": 0.12297403262285414, + "grad_norm": 0.61328125, + "learning_rate": 0.00019918522609046387, + "loss": 4.7143, + "step": 1186 + }, + { + "epoch": 0.12307772067734221, + "grad_norm": 0.609375, + "learning_rate": 0.00019918384172016494, + "loss": 4.7592, + "step": 1187 + }, + { + "epoch": 0.12318140873183028, + "grad_norm": 0.6484375, + "learning_rate": 0.0001991824561796015, + "loss": 4.7616, + "step": 1188 + }, + { + "epoch": 0.12328509678631835, + "grad_norm": 0.65625, + "learning_rate": 0.00019918106946878995, + "loss": 4.7261, + "step": 1189 + }, + { + "epoch": 0.12338878484080644, + "grad_norm": 0.68359375, + "learning_rate": 0.00019917968158774657, + "loss": 4.7459, + "step": 1190 + }, + { + "epoch": 0.12349247289529451, + "grad_norm": 0.71484375, + "learning_rate": 0.00019917829253648784, + "loss": 4.7121, + "step": 1191 + }, + { + "epoch": 0.12359616094978258, + "grad_norm": 0.66796875, + "learning_rate": 0.00019917690231503006, + "loss": 4.7695, + "step": 1192 + }, + { + "epoch": 0.12369984900427065, + "grad_norm": 0.5703125, + "learning_rate": 0.00019917551092338967, + "loss": 4.768, + "step": 1193 + }, + { + "epoch": 0.12380353705875873, + "grad_norm": 0.61328125, + "learning_rate": 0.00019917411836158308, + "loss": 4.7858, + "step": 1194 + }, + { + "epoch": 0.1239072251132468, + "grad_norm": 0.69140625, + "learning_rate": 0.00019917272462962674, + "loss": 4.776, + "step": 1195 + }, + { + "epoch": 0.12401091316773487, + "grad_norm": 0.63671875, + "learning_rate": 0.0001991713297275371, + "loss": 4.77, + "step": 1196 + }, + { + "epoch": 0.12411460122222294, + "grad_norm": 0.6640625, + "learning_rate": 0.00019916993365533056, + "loss": 4.7629, + "step": 1197 + }, + { + "epoch": 0.12421828927671101, + "grad_norm": 0.76953125, + "learning_rate": 0.00019916853641302365, + "loss": 4.7412, + "step": 1198 + }, + { + "epoch": 0.12432197733119908, + "grad_norm": 0.8671875, + "learning_rate": 0.0001991671380006328, + "loss": 4.7251, + "step": 1199 + }, + { + "epoch": 0.12442566538568715, + "grad_norm": 0.75, + "learning_rate": 0.0001991657384181746, + "loss": 4.7247, + "step": 1200 + }, + { + "epoch": 0.12452935344017524, + "grad_norm": 0.79296875, + "learning_rate": 0.00019916433766566547, + "loss": 4.744, + "step": 1201 + }, + { + "epoch": 0.12463304149466331, + "grad_norm": 0.73046875, + "learning_rate": 0.000199162935743122, + "loss": 4.7852, + "step": 1202 + }, + { + "epoch": 0.12473672954915138, + "grad_norm": 0.80859375, + "learning_rate": 0.0001991615326505607, + "loss": 4.76, + "step": 1203 + }, + { + "epoch": 0.12484041760363945, + "grad_norm": 0.95703125, + "learning_rate": 0.00019916012838799813, + "loss": 4.7716, + "step": 1204 + }, + { + "epoch": 0.12494410565812752, + "grad_norm": 1.0546875, + "learning_rate": 0.00019915872295545086, + "loss": 4.7749, + "step": 1205 + }, + { + "epoch": 0.1250477937126156, + "grad_norm": 1.0, + "learning_rate": 0.00019915731635293548, + "loss": 4.7514, + "step": 1206 + }, + { + "epoch": 0.12515148176710367, + "grad_norm": 1.1484375, + "learning_rate": 0.00019915590858046858, + "loss": 4.765, + "step": 1207 + }, + { + "epoch": 0.12525516982159174, + "grad_norm": 0.90625, + "learning_rate": 0.00019915449963806676, + "loss": 4.7111, + "step": 1208 + }, + { + "epoch": 0.1253588578760798, + "grad_norm": 1.4140625, + "learning_rate": 0.0001991530895257467, + "loss": 4.806, + "step": 1209 + }, + { + "epoch": 0.12546254593056788, + "grad_norm": 0.734375, + "learning_rate": 0.00019915167824352493, + "loss": 4.7864, + "step": 1210 + }, + { + "epoch": 0.12556623398505595, + "grad_norm": 1.28125, + "learning_rate": 0.0001991502657914182, + "loss": 4.7845, + "step": 1211 + }, + { + "epoch": 0.12566992203954402, + "grad_norm": 1.078125, + "learning_rate": 0.00019914885216944312, + "loss": 4.7299, + "step": 1212 + }, + { + "epoch": 0.1257736100940321, + "grad_norm": 1.25, + "learning_rate": 0.0001991474373776164, + "loss": 4.7712, + "step": 1213 + }, + { + "epoch": 0.12587729814852017, + "grad_norm": 0.74609375, + "learning_rate": 0.00019914602141595476, + "loss": 4.7366, + "step": 1214 + }, + { + "epoch": 0.12598098620300824, + "grad_norm": 1.40625, + "learning_rate": 0.0001991446042844748, + "loss": 4.7595, + "step": 1215 + }, + { + "epoch": 0.1260846742574963, + "grad_norm": 0.828125, + "learning_rate": 0.00019914318598319338, + "loss": 4.7051, + "step": 1216 + }, + { + "epoch": 0.12618836231198438, + "grad_norm": 1.609375, + "learning_rate": 0.0001991417665121271, + "loss": 4.7561, + "step": 1217 + }, + { + "epoch": 0.12629205036647248, + "grad_norm": 1.09375, + "learning_rate": 0.0001991403458712928, + "loss": 4.7613, + "step": 1218 + }, + { + "epoch": 0.12639573842096055, + "grad_norm": 2.3125, + "learning_rate": 0.00019913892406070723, + "loss": 4.7776, + "step": 1219 + }, + { + "epoch": 0.12649942647544862, + "grad_norm": 2.0625, + "learning_rate": 0.00019913750108038715, + "loss": 4.7397, + "step": 1220 + }, + { + "epoch": 0.1266031145299367, + "grad_norm": 1.4140625, + "learning_rate": 0.00019913607693034934, + "loss": 4.7404, + "step": 1221 + }, + { + "epoch": 0.12670680258442477, + "grad_norm": 1.4921875, + "learning_rate": 0.0001991346516106106, + "loss": 4.7672, + "step": 1222 + }, + { + "epoch": 0.12681049063891284, + "grad_norm": 1.2734375, + "learning_rate": 0.00019913322512118774, + "loss": 4.7471, + "step": 1223 + }, + { + "epoch": 0.1269141786934009, + "grad_norm": 1.578125, + "learning_rate": 0.00019913179746209765, + "loss": 4.8062, + "step": 1224 + }, + { + "epoch": 0.12701786674788898, + "grad_norm": 1.2890625, + "learning_rate": 0.00019913036863335713, + "loss": 4.776, + "step": 1225 + }, + { + "epoch": 0.12712155480237705, + "grad_norm": 1.5390625, + "learning_rate": 0.00019912893863498305, + "loss": 4.7452, + "step": 1226 + }, + { + "epoch": 0.12722524285686512, + "grad_norm": 1.3125, + "learning_rate": 0.00019912750746699226, + "loss": 4.7816, + "step": 1227 + }, + { + "epoch": 0.1273289309113532, + "grad_norm": 1.234375, + "learning_rate": 0.0001991260751294017, + "loss": 4.719, + "step": 1228 + }, + { + "epoch": 0.12743261896584127, + "grad_norm": 1.21875, + "learning_rate": 0.00019912464162222818, + "loss": 4.7678, + "step": 1229 + }, + { + "epoch": 0.12753630702032934, + "grad_norm": 1.140625, + "learning_rate": 0.0001991232069454887, + "loss": 4.7393, + "step": 1230 + }, + { + "epoch": 0.1276399950748174, + "grad_norm": 0.9453125, + "learning_rate": 0.00019912177109920016, + "loss": 4.7238, + "step": 1231 + }, + { + "epoch": 0.12774368312930548, + "grad_norm": 1.1328125, + "learning_rate": 0.0001991203340833795, + "loss": 4.7208, + "step": 1232 + }, + { + "epoch": 0.12784737118379355, + "grad_norm": 1.0546875, + "learning_rate": 0.00019911889589804366, + "loss": 4.7451, + "step": 1233 + }, + { + "epoch": 0.12795105923828162, + "grad_norm": 0.90234375, + "learning_rate": 0.00019911745654320963, + "loss": 4.7744, + "step": 1234 + }, + { + "epoch": 0.1280547472927697, + "grad_norm": 1.1015625, + "learning_rate": 0.00019911601601889438, + "loss": 4.7203, + "step": 1235 + }, + { + "epoch": 0.12815843534725777, + "grad_norm": 0.78515625, + "learning_rate": 0.0001991145743251149, + "loss": 4.7109, + "step": 1236 + }, + { + "epoch": 0.12826212340174584, + "grad_norm": 0.984375, + "learning_rate": 0.00019911313146188823, + "loss": 4.7083, + "step": 1237 + }, + { + "epoch": 0.1283658114562339, + "grad_norm": 1.1875, + "learning_rate": 0.00019911168742923138, + "loss": 4.711, + "step": 1238 + }, + { + "epoch": 0.12846949951072198, + "grad_norm": 0.82421875, + "learning_rate": 0.00019911024222716138, + "loss": 4.754, + "step": 1239 + }, + { + "epoch": 0.12857318756521008, + "grad_norm": 0.83984375, + "learning_rate": 0.0001991087958556953, + "loss": 4.7559, + "step": 1240 + }, + { + "epoch": 0.12867687561969815, + "grad_norm": 0.75, + "learning_rate": 0.00019910734831485015, + "loss": 4.7452, + "step": 1241 + }, + { + "epoch": 0.12878056367418622, + "grad_norm": 0.69921875, + "learning_rate": 0.00019910589960464304, + "loss": 4.7699, + "step": 1242 + }, + { + "epoch": 0.1288842517286743, + "grad_norm": 0.80078125, + "learning_rate": 0.00019910444972509112, + "loss": 4.7216, + "step": 1243 + }, + { + "epoch": 0.12898793978316236, + "grad_norm": 0.67578125, + "learning_rate": 0.00019910299867621146, + "loss": 4.7461, + "step": 1244 + }, + { + "epoch": 0.12909162783765044, + "grad_norm": 0.70703125, + "learning_rate": 0.00019910154645802112, + "loss": 4.7232, + "step": 1245 + }, + { + "epoch": 0.1291953158921385, + "grad_norm": 0.82421875, + "learning_rate": 0.00019910009307053735, + "loss": 4.7075, + "step": 1246 + }, + { + "epoch": 0.12929900394662658, + "grad_norm": 0.71484375, + "learning_rate": 0.00019909863851377718, + "loss": 4.7002, + "step": 1247 + }, + { + "epoch": 0.12940269200111465, + "grad_norm": 0.73828125, + "learning_rate": 0.00019909718278775785, + "loss": 4.734, + "step": 1248 + }, + { + "epoch": 0.12950638005560272, + "grad_norm": 0.6015625, + "learning_rate": 0.0001990957258924965, + "loss": 4.731, + "step": 1249 + }, + { + "epoch": 0.1296100681100908, + "grad_norm": 0.640625, + "learning_rate": 0.00019909426782801037, + "loss": 4.7217, + "step": 1250 + }, + { + "epoch": 0.12971375616457886, + "grad_norm": 0.67578125, + "learning_rate": 0.00019909280859431658, + "loss": 4.723, + "step": 1251 + }, + { + "epoch": 0.12981744421906694, + "grad_norm": 0.59765625, + "learning_rate": 0.00019909134819143243, + "loss": 4.7431, + "step": 1252 + }, + { + "epoch": 0.129921132273555, + "grad_norm": 0.6328125, + "learning_rate": 0.00019908988661937512, + "loss": 4.7613, + "step": 1253 + }, + { + "epoch": 0.13002482032804308, + "grad_norm": 0.66796875, + "learning_rate": 0.00019908842387816183, + "loss": 4.7186, + "step": 1254 + }, + { + "epoch": 0.13012850838253115, + "grad_norm": 0.578125, + "learning_rate": 0.00019908695996780993, + "loss": 4.7484, + "step": 1255 + }, + { + "epoch": 0.13023219643701922, + "grad_norm": 0.56640625, + "learning_rate": 0.00019908549488833663, + "loss": 4.7378, + "step": 1256 + }, + { + "epoch": 0.1303358844915073, + "grad_norm": 0.6640625, + "learning_rate": 0.00019908402863975925, + "loss": 4.7338, + "step": 1257 + }, + { + "epoch": 0.13043957254599536, + "grad_norm": 0.515625, + "learning_rate": 0.00019908256122209506, + "loss": 4.7363, + "step": 1258 + }, + { + "epoch": 0.13054326060048344, + "grad_norm": 0.64453125, + "learning_rate": 0.00019908109263536134, + "loss": 4.7391, + "step": 1259 + }, + { + "epoch": 0.1306469486549715, + "grad_norm": 0.6015625, + "learning_rate": 0.00019907962287957548, + "loss": 4.7361, + "step": 1260 + }, + { + "epoch": 0.13075063670945958, + "grad_norm": 0.6328125, + "learning_rate": 0.00019907815195475484, + "loss": 4.7187, + "step": 1261 + }, + { + "epoch": 0.13085432476394768, + "grad_norm": 0.5390625, + "learning_rate": 0.0001990766798609167, + "loss": 4.7298, + "step": 1262 + }, + { + "epoch": 0.13095801281843575, + "grad_norm": 0.61328125, + "learning_rate": 0.0001990752065980785, + "loss": 4.7505, + "step": 1263 + }, + { + "epoch": 0.13106170087292382, + "grad_norm": 0.57421875, + "learning_rate": 0.00019907373216625755, + "loss": 4.6917, + "step": 1264 + }, + { + "epoch": 0.1311653889274119, + "grad_norm": 0.466796875, + "learning_rate": 0.0001990722565654713, + "loss": 4.7232, + "step": 1265 + }, + { + "epoch": 0.13126907698189996, + "grad_norm": 0.578125, + "learning_rate": 0.00019907077979573713, + "loss": 4.722, + "step": 1266 + }, + { + "epoch": 0.13137276503638803, + "grad_norm": 0.54296875, + "learning_rate": 0.0001990693018570725, + "loss": 4.734, + "step": 1267 + }, + { + "epoch": 0.1314764530908761, + "grad_norm": 0.51953125, + "learning_rate": 0.00019906782274949482, + "loss": 4.7565, + "step": 1268 + }, + { + "epoch": 0.13158014114536418, + "grad_norm": 0.58984375, + "learning_rate": 0.0001990663424730216, + "loss": 4.7149, + "step": 1269 + }, + { + "epoch": 0.13168382919985225, + "grad_norm": 0.5390625, + "learning_rate": 0.0001990648610276702, + "loss": 4.7146, + "step": 1270 + }, + { + "epoch": 0.13178751725434032, + "grad_norm": 0.546875, + "learning_rate": 0.00019906337841345818, + "loss": 4.7258, + "step": 1271 + }, + { + "epoch": 0.1318912053088284, + "grad_norm": 0.498046875, + "learning_rate": 0.00019906189463040299, + "loss": 4.7316, + "step": 1272 + }, + { + "epoch": 0.13199489336331646, + "grad_norm": 0.5859375, + "learning_rate": 0.00019906040967852215, + "loss": 4.7284, + "step": 1273 + }, + { + "epoch": 0.13209858141780453, + "grad_norm": 0.59375, + "learning_rate": 0.0001990589235578332, + "loss": 4.7609, + "step": 1274 + }, + { + "epoch": 0.1322022694722926, + "grad_norm": 0.63671875, + "learning_rate": 0.00019905743626835368, + "loss": 4.7654, + "step": 1275 + }, + { + "epoch": 0.13230595752678068, + "grad_norm": 0.59765625, + "learning_rate": 0.0001990559478101011, + "loss": 4.7341, + "step": 1276 + }, + { + "epoch": 0.13240964558126875, + "grad_norm": 0.6640625, + "learning_rate": 0.00019905445818309305, + "loss": 4.7512, + "step": 1277 + }, + { + "epoch": 0.13251333363575682, + "grad_norm": 0.57421875, + "learning_rate": 0.00019905296738734709, + "loss": 4.6994, + "step": 1278 + }, + { + "epoch": 0.1326170216902449, + "grad_norm": 0.69921875, + "learning_rate": 0.00019905147542288086, + "loss": 4.7447, + "step": 1279 + }, + { + "epoch": 0.13272070974473296, + "grad_norm": 0.8046875, + "learning_rate": 0.0001990499822897119, + "loss": 4.7525, + "step": 1280 + }, + { + "epoch": 0.13282439779922103, + "grad_norm": 0.73046875, + "learning_rate": 0.00019904848798785781, + "loss": 4.7446, + "step": 1281 + }, + { + "epoch": 0.1329280858537091, + "grad_norm": 0.671875, + "learning_rate": 0.00019904699251733628, + "loss": 4.7214, + "step": 1282 + }, + { + "epoch": 0.13303177390819718, + "grad_norm": 0.6484375, + "learning_rate": 0.00019904549587816494, + "loss": 4.7364, + "step": 1283 + }, + { + "epoch": 0.13313546196268525, + "grad_norm": 0.7265625, + "learning_rate": 0.00019904399807036145, + "loss": 4.7707, + "step": 1284 + }, + { + "epoch": 0.13323915001717335, + "grad_norm": 0.73046875, + "learning_rate": 0.00019904249909394347, + "loss": 4.7137, + "step": 1285 + }, + { + "epoch": 0.13334283807166142, + "grad_norm": 0.671875, + "learning_rate": 0.0001990409989489287, + "loss": 4.7243, + "step": 1286 + }, + { + "epoch": 0.1334465261261495, + "grad_norm": 0.625, + "learning_rate": 0.00019903949763533483, + "loss": 4.7646, + "step": 1287 + }, + { + "epoch": 0.13355021418063756, + "grad_norm": 0.6015625, + "learning_rate": 0.00019903799515317956, + "loss": 4.6683, + "step": 1288 + }, + { + "epoch": 0.13365390223512563, + "grad_norm": 0.55859375, + "learning_rate": 0.00019903649150248068, + "loss": 4.7398, + "step": 1289 + }, + { + "epoch": 0.1337575902896137, + "grad_norm": 0.62109375, + "learning_rate": 0.00019903498668325583, + "loss": 4.7181, + "step": 1290 + }, + { + "epoch": 0.13386127834410178, + "grad_norm": 0.50390625, + "learning_rate": 0.00019903348069552285, + "loss": 4.7208, + "step": 1291 + }, + { + "epoch": 0.13396496639858985, + "grad_norm": 0.5703125, + "learning_rate": 0.0001990319735392995, + "loss": 4.7317, + "step": 1292 + }, + { + "epoch": 0.13406865445307792, + "grad_norm": 0.67578125, + "learning_rate": 0.00019903046521460352, + "loss": 4.7149, + "step": 1293 + }, + { + "epoch": 0.134172342507566, + "grad_norm": 0.5234375, + "learning_rate": 0.00019902895572145274, + "loss": 4.6968, + "step": 1294 + }, + { + "epoch": 0.13427603056205406, + "grad_norm": 0.55078125, + "learning_rate": 0.00019902744505986494, + "loss": 4.7335, + "step": 1295 + }, + { + "epoch": 0.13437971861654213, + "grad_norm": 0.60546875, + "learning_rate": 0.00019902593322985797, + "loss": 4.6848, + "step": 1296 + }, + { + "epoch": 0.1344834066710302, + "grad_norm": 0.57421875, + "learning_rate": 0.0001990244202314497, + "loss": 4.7019, + "step": 1297 + }, + { + "epoch": 0.13458709472551827, + "grad_norm": 0.6328125, + "learning_rate": 0.0001990229060646579, + "loss": 4.7149, + "step": 1298 + }, + { + "epoch": 0.13469078278000635, + "grad_norm": 0.75390625, + "learning_rate": 0.0001990213907295005, + "loss": 4.7474, + "step": 1299 + }, + { + "epoch": 0.13479447083449442, + "grad_norm": 0.6796875, + "learning_rate": 0.00019901987422599535, + "loss": 4.694, + "step": 1300 + }, + { + "epoch": 0.1348981588889825, + "grad_norm": 0.6015625, + "learning_rate": 0.00019901835655416038, + "loss": 4.677, + "step": 1301 + }, + { + "epoch": 0.13500184694347056, + "grad_norm": 0.63671875, + "learning_rate": 0.00019901683771401344, + "loss": 4.7183, + "step": 1302 + }, + { + "epoch": 0.13510553499795863, + "grad_norm": 0.62890625, + "learning_rate": 0.0001990153177055725, + "loss": 4.7433, + "step": 1303 + }, + { + "epoch": 0.1352092230524467, + "grad_norm": 0.65234375, + "learning_rate": 0.00019901379652885543, + "loss": 4.7238, + "step": 1304 + }, + { + "epoch": 0.13531291110693477, + "grad_norm": 0.69921875, + "learning_rate": 0.00019901227418388028, + "loss": 4.7238, + "step": 1305 + }, + { + "epoch": 0.13541659916142285, + "grad_norm": 0.53125, + "learning_rate": 0.00019901075067066493, + "loss": 4.7488, + "step": 1306 + }, + { + "epoch": 0.13552028721591095, + "grad_norm": 0.6953125, + "learning_rate": 0.00019900922598922738, + "loss": 4.7578, + "step": 1307 + }, + { + "epoch": 0.13562397527039902, + "grad_norm": 0.74609375, + "learning_rate": 0.00019900770013958562, + "loss": 4.7248, + "step": 1308 + }, + { + "epoch": 0.1357276633248871, + "grad_norm": 0.6015625, + "learning_rate": 0.00019900617312175768, + "loss": 4.6949, + "step": 1309 + }, + { + "epoch": 0.13583135137937516, + "grad_norm": 0.5625, + "learning_rate": 0.00019900464493576153, + "loss": 4.7147, + "step": 1310 + }, + { + "epoch": 0.13593503943386323, + "grad_norm": 0.63671875, + "learning_rate": 0.00019900311558161522, + "loss": 4.7156, + "step": 1311 + }, + { + "epoch": 0.1360387274883513, + "grad_norm": 0.64453125, + "learning_rate": 0.00019900158505933678, + "loss": 4.7342, + "step": 1312 + }, + { + "epoch": 0.13614241554283937, + "grad_norm": 0.55078125, + "learning_rate": 0.0001990000533689443, + "loss": 4.7456, + "step": 1313 + }, + { + "epoch": 0.13624610359732744, + "grad_norm": 0.6796875, + "learning_rate": 0.00019899852051045587, + "loss": 4.7326, + "step": 1314 + }, + { + "epoch": 0.13634979165181552, + "grad_norm": 0.71484375, + "learning_rate": 0.00019899698648388948, + "loss": 4.7254, + "step": 1315 + }, + { + "epoch": 0.1364534797063036, + "grad_norm": 0.5703125, + "learning_rate": 0.00019899545128926333, + "loss": 4.7298, + "step": 1316 + }, + { + "epoch": 0.13655716776079166, + "grad_norm": 0.5625, + "learning_rate": 0.00019899391492659551, + "loss": 4.7243, + "step": 1317 + }, + { + "epoch": 0.13666085581527973, + "grad_norm": 0.59375, + "learning_rate": 0.00019899237739590413, + "loss": 4.723, + "step": 1318 + }, + { + "epoch": 0.1367645438697678, + "grad_norm": 0.54296875, + "learning_rate": 0.00019899083869720735, + "loss": 4.7191, + "step": 1319 + }, + { + "epoch": 0.13686823192425587, + "grad_norm": 0.546875, + "learning_rate": 0.00019898929883052326, + "loss": 4.6991, + "step": 1320 + }, + { + "epoch": 0.13697191997874394, + "grad_norm": 0.71875, + "learning_rate": 0.0001989877577958701, + "loss": 4.6526, + "step": 1321 + }, + { + "epoch": 0.13707560803323202, + "grad_norm": 0.5625, + "learning_rate": 0.00019898621559326607, + "loss": 4.6876, + "step": 1322 + }, + { + "epoch": 0.1371792960877201, + "grad_norm": 0.51953125, + "learning_rate": 0.00019898467222272928, + "loss": 4.6841, + "step": 1323 + }, + { + "epoch": 0.13728298414220816, + "grad_norm": 0.80078125, + "learning_rate": 0.00019898312768427802, + "loss": 4.677, + "step": 1324 + }, + { + "epoch": 0.13738667219669623, + "grad_norm": 0.83203125, + "learning_rate": 0.00019898158197793046, + "loss": 4.7136, + "step": 1325 + }, + { + "epoch": 0.1374903602511843, + "grad_norm": 0.6484375, + "learning_rate": 0.00019898003510370488, + "loss": 4.6611, + "step": 1326 + }, + { + "epoch": 0.13759404830567237, + "grad_norm": 0.7109375, + "learning_rate": 0.0001989784870616195, + "loss": 4.6735, + "step": 1327 + }, + { + "epoch": 0.13769773636016044, + "grad_norm": 0.8515625, + "learning_rate": 0.00019897693785169261, + "loss": 4.7221, + "step": 1328 + }, + { + "epoch": 0.13780142441464854, + "grad_norm": 0.79296875, + "learning_rate": 0.00019897538747394247, + "loss": 4.6936, + "step": 1329 + }, + { + "epoch": 0.13790511246913661, + "grad_norm": 0.64453125, + "learning_rate": 0.00019897383592838738, + "loss": 4.7377, + "step": 1330 + }, + { + "epoch": 0.1380088005236247, + "grad_norm": 0.6796875, + "learning_rate": 0.00019897228321504563, + "loss": 4.7499, + "step": 1331 + }, + { + "epoch": 0.13811248857811276, + "grad_norm": 0.9609375, + "learning_rate": 0.00019897072933393559, + "loss": 4.705, + "step": 1332 + }, + { + "epoch": 0.13821617663260083, + "grad_norm": 1.0390625, + "learning_rate": 0.00019896917428507553, + "loss": 4.719, + "step": 1333 + }, + { + "epoch": 0.1383198646870889, + "grad_norm": 1.0546875, + "learning_rate": 0.00019896761806848385, + "loss": 4.7109, + "step": 1334 + }, + { + "epoch": 0.13842355274157697, + "grad_norm": 1.109375, + "learning_rate": 0.00019896606068417887, + "loss": 4.7314, + "step": 1335 + }, + { + "epoch": 0.13852724079606504, + "grad_norm": 1.140625, + "learning_rate": 0.000198964502132179, + "loss": 4.7162, + "step": 1336 + }, + { + "epoch": 0.13863092885055311, + "grad_norm": 0.77734375, + "learning_rate": 0.00019896294241250262, + "loss": 4.7474, + "step": 1337 + }, + { + "epoch": 0.13873461690504119, + "grad_norm": 0.67578125, + "learning_rate": 0.00019896138152516812, + "loss": 4.6801, + "step": 1338 + }, + { + "epoch": 0.13883830495952926, + "grad_norm": 0.86328125, + "learning_rate": 0.00019895981947019392, + "loss": 4.6727, + "step": 1339 + }, + { + "epoch": 0.13894199301401733, + "grad_norm": 1.09375, + "learning_rate": 0.00019895825624759845, + "loss": 4.7344, + "step": 1340 + }, + { + "epoch": 0.1390456810685054, + "grad_norm": 0.9453125, + "learning_rate": 0.00019895669185740017, + "loss": 4.737, + "step": 1341 + }, + { + "epoch": 0.13914936912299347, + "grad_norm": 1.015625, + "learning_rate": 0.00019895512629961753, + "loss": 4.6886, + "step": 1342 + }, + { + "epoch": 0.13925305717748154, + "grad_norm": 0.88671875, + "learning_rate": 0.000198953559574269, + "loss": 4.658, + "step": 1343 + }, + { + "epoch": 0.13935674523196961, + "grad_norm": 0.57421875, + "learning_rate": 0.00019895199168137306, + "loss": 4.6986, + "step": 1344 + }, + { + "epoch": 0.13946043328645769, + "grad_norm": 0.78515625, + "learning_rate": 0.0001989504226209482, + "loss": 4.7303, + "step": 1345 + }, + { + "epoch": 0.13956412134094576, + "grad_norm": 1.0859375, + "learning_rate": 0.00019894885239301298, + "loss": 4.6842, + "step": 1346 + }, + { + "epoch": 0.13966780939543383, + "grad_norm": 0.78125, + "learning_rate": 0.0001989472809975859, + "loss": 4.6811, + "step": 1347 + }, + { + "epoch": 0.1397714974499219, + "grad_norm": 0.72265625, + "learning_rate": 0.00019894570843468544, + "loss": 4.713, + "step": 1348 + }, + { + "epoch": 0.13987518550440997, + "grad_norm": 0.98828125, + "learning_rate": 0.00019894413470433026, + "loss": 4.6777, + "step": 1349 + }, + { + "epoch": 0.13997887355889804, + "grad_norm": 0.875, + "learning_rate": 0.00019894255980653887, + "loss": 4.7175, + "step": 1350 + }, + { + "epoch": 0.14008256161338611, + "grad_norm": 0.953125, + "learning_rate": 0.00019894098374132987, + "loss": 4.7008, + "step": 1351 + }, + { + "epoch": 0.1401862496678742, + "grad_norm": 1.359375, + "learning_rate": 0.0001989394065087218, + "loss": 4.76, + "step": 1352 + }, + { + "epoch": 0.14028993772236228, + "grad_norm": 0.74609375, + "learning_rate": 0.00019893782810873338, + "loss": 4.7276, + "step": 1353 + }, + { + "epoch": 0.14039362577685036, + "grad_norm": 1.4296875, + "learning_rate": 0.00019893624854138312, + "loss": 4.7048, + "step": 1354 + }, + { + "epoch": 0.14049731383133843, + "grad_norm": 0.8046875, + "learning_rate": 0.00019893466780668972, + "loss": 4.6713, + "step": 1355 + }, + { + "epoch": 0.1406010018858265, + "grad_norm": 1.4609375, + "learning_rate": 0.00019893308590467185, + "loss": 4.7421, + "step": 1356 + }, + { + "epoch": 0.14070468994031457, + "grad_norm": 0.88671875, + "learning_rate": 0.0001989315028353481, + "loss": 4.7306, + "step": 1357 + }, + { + "epoch": 0.14080837799480264, + "grad_norm": 1.7109375, + "learning_rate": 0.00019892991859873723, + "loss": 4.7135, + "step": 1358 + }, + { + "epoch": 0.1409120660492907, + "grad_norm": 1.3125, + "learning_rate": 0.00019892833319485787, + "loss": 4.7376, + "step": 1359 + }, + { + "epoch": 0.14101575410377878, + "grad_norm": 2.25, + "learning_rate": 0.00019892674662372876, + "loss": 4.7339, + "step": 1360 + }, + { + "epoch": 0.14111944215826686, + "grad_norm": 2.078125, + "learning_rate": 0.0001989251588853686, + "loss": 4.717, + "step": 1361 + }, + { + "epoch": 0.14122313021275493, + "grad_norm": 1.359375, + "learning_rate": 0.00019892356997979613, + "loss": 4.7236, + "step": 1362 + }, + { + "epoch": 0.141326818267243, + "grad_norm": 1.59375, + "learning_rate": 0.0001989219799070301, + "loss": 4.6997, + "step": 1363 + }, + { + "epoch": 0.14143050632173107, + "grad_norm": 1.2578125, + "learning_rate": 0.00019892038866708932, + "loss": 4.6979, + "step": 1364 + }, + { + "epoch": 0.14153419437621914, + "grad_norm": 2.1875, + "learning_rate": 0.00019891879625999245, + "loss": 4.6499, + "step": 1365 + }, + { + "epoch": 0.1416378824307072, + "grad_norm": 1.7890625, + "learning_rate": 0.00019891720268575837, + "loss": 4.7374, + "step": 1366 + }, + { + "epoch": 0.14174157048519528, + "grad_norm": 2.109375, + "learning_rate": 0.00019891560794440587, + "loss": 4.6959, + "step": 1367 + }, + { + "epoch": 0.14184525853968336, + "grad_norm": 1.3828125, + "learning_rate": 0.00019891401203595374, + "loss": 4.7039, + "step": 1368 + }, + { + "epoch": 0.14194894659417143, + "grad_norm": 2.609375, + "learning_rate": 0.00019891241496042082, + "loss": 4.7391, + "step": 1369 + }, + { + "epoch": 0.1420526346486595, + "grad_norm": 2.40625, + "learning_rate": 0.000198910816717826, + "loss": 4.7635, + "step": 1370 + }, + { + "epoch": 0.14215632270314757, + "grad_norm": 1.6171875, + "learning_rate": 0.00019890921730818806, + "loss": 4.7121, + "step": 1371 + }, + { + "epoch": 0.14226001075763564, + "grad_norm": 1.6171875, + "learning_rate": 0.00019890761673152591, + "loss": 4.7292, + "step": 1372 + }, + { + "epoch": 0.1423636988121237, + "grad_norm": 1.453125, + "learning_rate": 0.00019890601498785844, + "loss": 4.71, + "step": 1373 + }, + { + "epoch": 0.1424673868666118, + "grad_norm": 1.421875, + "learning_rate": 0.00019890441207720454, + "loss": 4.7202, + "step": 1374 + }, + { + "epoch": 0.14257107492109988, + "grad_norm": 1.25, + "learning_rate": 0.0001989028079995831, + "loss": 4.7506, + "step": 1375 + }, + { + "epoch": 0.14267476297558795, + "grad_norm": 1.5390625, + "learning_rate": 0.00019890120275501308, + "loss": 4.684, + "step": 1376 + }, + { + "epoch": 0.14277845103007603, + "grad_norm": 1.0703125, + "learning_rate": 0.00019889959634351344, + "loss": 4.736, + "step": 1377 + }, + { + "epoch": 0.1428821390845641, + "grad_norm": 2.40625, + "learning_rate": 0.0001988979887651031, + "loss": 4.7732, + "step": 1378 + }, + { + "epoch": 0.14298582713905217, + "grad_norm": 2.03125, + "learning_rate": 0.00019889638001980103, + "loss": 4.7442, + "step": 1379 + }, + { + "epoch": 0.14308951519354024, + "grad_norm": 1.7890625, + "learning_rate": 0.00019889477010762618, + "loss": 4.6948, + "step": 1380 + }, + { + "epoch": 0.1431932032480283, + "grad_norm": 1.53125, + "learning_rate": 0.00019889315902859762, + "loss": 4.7068, + "step": 1381 + }, + { + "epoch": 0.14329689130251638, + "grad_norm": 1.7265625, + "learning_rate": 0.0001988915467827343, + "loss": 4.7583, + "step": 1382 + }, + { + "epoch": 0.14340057935700445, + "grad_norm": 1.3359375, + "learning_rate": 0.00019888993337005526, + "loss": 4.7032, + "step": 1383 + }, + { + "epoch": 0.14350426741149253, + "grad_norm": 1.5859375, + "learning_rate": 0.00019888831879057953, + "loss": 4.7053, + "step": 1384 + }, + { + "epoch": 0.1436079554659806, + "grad_norm": 1.0859375, + "learning_rate": 0.00019888670304432619, + "loss": 4.6836, + "step": 1385 + }, + { + "epoch": 0.14371164352046867, + "grad_norm": 1.8828125, + "learning_rate": 0.00019888508613131426, + "loss": 4.7571, + "step": 1386 + }, + { + "epoch": 0.14381533157495674, + "grad_norm": 1.3515625, + "learning_rate": 0.00019888346805156283, + "loss": 4.6962, + "step": 1387 + }, + { + "epoch": 0.1439190196294448, + "grad_norm": 2.1875, + "learning_rate": 0.00019888184880509103, + "loss": 4.7161, + "step": 1388 + }, + { + "epoch": 0.14402270768393288, + "grad_norm": 2.03125, + "learning_rate": 0.00019888022839191792, + "loss": 4.704, + "step": 1389 + }, + { + "epoch": 0.14412639573842095, + "grad_norm": 1.5390625, + "learning_rate": 0.00019887860681206266, + "loss": 4.7299, + "step": 1390 + }, + { + "epoch": 0.14423008379290903, + "grad_norm": 1.5390625, + "learning_rate": 0.00019887698406554431, + "loss": 4.7452, + "step": 1391 + }, + { + "epoch": 0.1443337718473971, + "grad_norm": 1.28125, + "learning_rate": 0.00019887536015238212, + "loss": 4.7157, + "step": 1392 + }, + { + "epoch": 0.14443745990188517, + "grad_norm": 1.171875, + "learning_rate": 0.00019887373507259518, + "loss": 4.6886, + "step": 1393 + }, + { + "epoch": 0.14454114795637324, + "grad_norm": 1.2734375, + "learning_rate": 0.00019887210882620266, + "loss": 4.7412, + "step": 1394 + }, + { + "epoch": 0.1446448360108613, + "grad_norm": 1.09375, + "learning_rate": 0.00019887048141322376, + "loss": 4.7263, + "step": 1395 + }, + { + "epoch": 0.1447485240653494, + "grad_norm": 1.109375, + "learning_rate": 0.00019886885283367772, + "loss": 4.7014, + "step": 1396 + }, + { + "epoch": 0.14485221211983748, + "grad_norm": 0.77734375, + "learning_rate": 0.00019886722308758373, + "loss": 4.6781, + "step": 1397 + }, + { + "epoch": 0.14495590017432555, + "grad_norm": 1.125, + "learning_rate": 0.00019886559217496098, + "loss": 4.6926, + "step": 1398 + }, + { + "epoch": 0.14505958822881362, + "grad_norm": 0.84375, + "learning_rate": 0.00019886396009582876, + "loss": 4.7069, + "step": 1399 + }, + { + "epoch": 0.1451632762833017, + "grad_norm": 0.78515625, + "learning_rate": 0.00019886232685020633, + "loss": 4.6784, + "step": 1400 + }, + { + "epoch": 0.14526696433778977, + "grad_norm": 0.87109375, + "learning_rate": 0.00019886069243811293, + "loss": 4.6576, + "step": 1401 + }, + { + "epoch": 0.14537065239227784, + "grad_norm": 0.7421875, + "learning_rate": 0.0001988590568595679, + "loss": 4.6882, + "step": 1402 + }, + { + "epoch": 0.1454743404467659, + "grad_norm": 0.703125, + "learning_rate": 0.00019885742011459045, + "loss": 4.7041, + "step": 1403 + }, + { + "epoch": 0.14557802850125398, + "grad_norm": 0.70703125, + "learning_rate": 0.00019885578220319995, + "loss": 4.714, + "step": 1404 + }, + { + "epoch": 0.14568171655574205, + "grad_norm": 0.6953125, + "learning_rate": 0.00019885414312541573, + "loss": 4.6943, + "step": 1405 + }, + { + "epoch": 0.14578540461023012, + "grad_norm": 0.61328125, + "learning_rate": 0.00019885250288125713, + "loss": 4.6866, + "step": 1406 + }, + { + "epoch": 0.1458890926647182, + "grad_norm": 0.71875, + "learning_rate": 0.00019885086147074344, + "loss": 4.6937, + "step": 1407 + }, + { + "epoch": 0.14599278071920627, + "grad_norm": 0.63671875, + "learning_rate": 0.0001988492188938941, + "loss": 4.7627, + "step": 1408 + }, + { + "epoch": 0.14609646877369434, + "grad_norm": 0.640625, + "learning_rate": 0.00019884757515072844, + "loss": 4.7113, + "step": 1409 + }, + { + "epoch": 0.1462001568281824, + "grad_norm": 0.640625, + "learning_rate": 0.00019884593024126592, + "loss": 4.6828, + "step": 1410 + }, + { + "epoch": 0.14630384488267048, + "grad_norm": 0.56640625, + "learning_rate": 0.0001988442841655259, + "loss": 4.7033, + "step": 1411 + }, + { + "epoch": 0.14640753293715855, + "grad_norm": 0.68359375, + "learning_rate": 0.00019884263692352777, + "loss": 4.7208, + "step": 1412 + }, + { + "epoch": 0.14651122099164662, + "grad_norm": 0.59765625, + "learning_rate": 0.00019884098851529104, + "loss": 4.7189, + "step": 1413 + }, + { + "epoch": 0.1466149090461347, + "grad_norm": 0.65625, + "learning_rate": 0.00019883933894083514, + "loss": 4.6782, + "step": 1414 + }, + { + "epoch": 0.14671859710062277, + "grad_norm": 0.67578125, + "learning_rate": 0.00019883768820017948, + "loss": 4.7375, + "step": 1415 + }, + { + "epoch": 0.14682228515511084, + "grad_norm": 0.68359375, + "learning_rate": 0.0001988360362933436, + "loss": 4.7187, + "step": 1416 + }, + { + "epoch": 0.1469259732095989, + "grad_norm": 0.61328125, + "learning_rate": 0.00019883438322034695, + "loss": 4.7011, + "step": 1417 + }, + { + "epoch": 0.14702966126408698, + "grad_norm": 0.54296875, + "learning_rate": 0.00019883272898120905, + "loss": 4.7445, + "step": 1418 + }, + { + "epoch": 0.14713334931857508, + "grad_norm": 0.65234375, + "learning_rate": 0.00019883107357594943, + "loss": 4.724, + "step": 1419 + }, + { + "epoch": 0.14723703737306315, + "grad_norm": 0.625, + "learning_rate": 0.0001988294170045876, + "loss": 4.6845, + "step": 1420 + }, + { + "epoch": 0.14734072542755122, + "grad_norm": 0.6015625, + "learning_rate": 0.00019882775926714313, + "loss": 4.7319, + "step": 1421 + }, + { + "epoch": 0.1474444134820393, + "grad_norm": 0.6328125, + "learning_rate": 0.00019882610036363557, + "loss": 4.71, + "step": 1422 + }, + { + "epoch": 0.14754810153652737, + "grad_norm": 0.828125, + "learning_rate": 0.00019882444029408448, + "loss": 4.7078, + "step": 1423 + }, + { + "epoch": 0.14765178959101544, + "grad_norm": 0.6640625, + "learning_rate": 0.00019882277905850946, + "loss": 4.6876, + "step": 1424 + }, + { + "epoch": 0.1477554776455035, + "grad_norm": 0.58984375, + "learning_rate": 0.00019882111665693011, + "loss": 4.6975, + "step": 1425 + }, + { + "epoch": 0.14785916569999158, + "grad_norm": 0.77734375, + "learning_rate": 0.00019881945308936603, + "loss": 4.6452, + "step": 1426 + }, + { + "epoch": 0.14796285375447965, + "grad_norm": 0.75390625, + "learning_rate": 0.00019881778835583686, + "loss": 4.7131, + "step": 1427 + }, + { + "epoch": 0.14806654180896772, + "grad_norm": 0.54296875, + "learning_rate": 0.00019881612245636226, + "loss": 4.681, + "step": 1428 + }, + { + "epoch": 0.1481702298634558, + "grad_norm": 0.66015625, + "learning_rate": 0.00019881445539096185, + "loss": 4.7163, + "step": 1429 + }, + { + "epoch": 0.14827391791794386, + "grad_norm": 0.73828125, + "learning_rate": 0.00019881278715965534, + "loss": 4.7312, + "step": 1430 + }, + { + "epoch": 0.14837760597243194, + "grad_norm": 0.6484375, + "learning_rate": 0.00019881111776246234, + "loss": 4.7217, + "step": 1431 + }, + { + "epoch": 0.14848129402692, + "grad_norm": 0.74609375, + "learning_rate": 0.00019880944719940263, + "loss": 4.7345, + "step": 1432 + }, + { + "epoch": 0.14858498208140808, + "grad_norm": 0.71484375, + "learning_rate": 0.0001988077754704959, + "loss": 4.6661, + "step": 1433 + }, + { + "epoch": 0.14868867013589615, + "grad_norm": 0.68359375, + "learning_rate": 0.0001988061025757619, + "loss": 4.662, + "step": 1434 + }, + { + "epoch": 0.14879235819038422, + "grad_norm": 0.94921875, + "learning_rate": 0.00019880442851522029, + "loss": 4.7057, + "step": 1435 + }, + { + "epoch": 0.1488960462448723, + "grad_norm": 0.89453125, + "learning_rate": 0.00019880275328889083, + "loss": 4.6554, + "step": 1436 + }, + { + "epoch": 0.14899973429936036, + "grad_norm": 0.76953125, + "learning_rate": 0.00019880107689679337, + "loss": 4.673, + "step": 1437 + }, + { + "epoch": 0.14910342235384844, + "grad_norm": 0.6796875, + "learning_rate": 0.00019879939933894762, + "loss": 4.6325, + "step": 1438 + }, + { + "epoch": 0.1492071104083365, + "grad_norm": 0.8515625, + "learning_rate": 0.0001987977206153734, + "loss": 4.7093, + "step": 1439 + }, + { + "epoch": 0.14931079846282458, + "grad_norm": 0.71875, + "learning_rate": 0.0001987960407260905, + "loss": 4.6711, + "step": 1440 + }, + { + "epoch": 0.14941448651731268, + "grad_norm": 0.6484375, + "learning_rate": 0.00019879435967111876, + "loss": 4.6034, + "step": 1441 + }, + { + "epoch": 0.14951817457180075, + "grad_norm": 0.71875, + "learning_rate": 0.000198792677450478, + "loss": 4.713, + "step": 1442 + }, + { + "epoch": 0.14962186262628882, + "grad_norm": 0.84375, + "learning_rate": 0.00019879099406418807, + "loss": 4.7042, + "step": 1443 + }, + { + "epoch": 0.1497255506807769, + "grad_norm": 0.5703125, + "learning_rate": 0.00019878930951226887, + "loss": 4.6731, + "step": 1444 + }, + { + "epoch": 0.14982923873526496, + "grad_norm": 0.69140625, + "learning_rate": 0.00019878762379474022, + "loss": 4.6662, + "step": 1445 + }, + { + "epoch": 0.14993292678975303, + "grad_norm": 0.71484375, + "learning_rate": 0.00019878593691162203, + "loss": 4.6774, + "step": 1446 + }, + { + "epoch": 0.1500366148442411, + "grad_norm": 0.78515625, + "learning_rate": 0.00019878424886293422, + "loss": 4.6983, + "step": 1447 + }, + { + "epoch": 0.15014030289872918, + "grad_norm": 0.70703125, + "learning_rate": 0.00019878255964869666, + "loss": 4.6673, + "step": 1448 + }, + { + "epoch": 0.15024399095321725, + "grad_norm": 0.62890625, + "learning_rate": 0.00019878086926892934, + "loss": 4.6991, + "step": 1449 + }, + { + "epoch": 0.15034767900770532, + "grad_norm": 0.671875, + "learning_rate": 0.00019877917772365215, + "loss": 4.7173, + "step": 1450 + }, + { + "epoch": 0.1504513670621934, + "grad_norm": 0.796875, + "learning_rate": 0.0001987774850128851, + "loss": 4.7385, + "step": 1451 + }, + { + "epoch": 0.15055505511668146, + "grad_norm": 0.68359375, + "learning_rate": 0.00019877579113664816, + "loss": 4.7129, + "step": 1452 + }, + { + "epoch": 0.15065874317116953, + "grad_norm": 0.64453125, + "learning_rate": 0.00019877409609496126, + "loss": 4.6592, + "step": 1453 + }, + { + "epoch": 0.1507624312256576, + "grad_norm": 0.69140625, + "learning_rate": 0.00019877239988784444, + "loss": 4.7102, + "step": 1454 + }, + { + "epoch": 0.15086611928014568, + "grad_norm": 0.640625, + "learning_rate": 0.00019877070251531772, + "loss": 4.7239, + "step": 1455 + }, + { + "epoch": 0.15096980733463375, + "grad_norm": 0.66015625, + "learning_rate": 0.0001987690039774011, + "loss": 4.6529, + "step": 1456 + }, + { + "epoch": 0.15107349538912182, + "grad_norm": 0.98828125, + "learning_rate": 0.00019876730427411467, + "loss": 4.7155, + "step": 1457 + }, + { + "epoch": 0.1511771834436099, + "grad_norm": 1.109375, + "learning_rate": 0.00019876560340547844, + "loss": 4.7025, + "step": 1458 + }, + { + "epoch": 0.15128087149809796, + "grad_norm": 0.69921875, + "learning_rate": 0.00019876390137151247, + "loss": 4.6933, + "step": 1459 + }, + { + "epoch": 0.15138455955258603, + "grad_norm": 0.67578125, + "learning_rate": 0.00019876219817223687, + "loss": 4.6941, + "step": 1460 + }, + { + "epoch": 0.1514882476070741, + "grad_norm": 0.8046875, + "learning_rate": 0.00019876049380767173, + "loss": 4.6765, + "step": 1461 + }, + { + "epoch": 0.15159193566156218, + "grad_norm": 0.87109375, + "learning_rate": 0.00019875878827783713, + "loss": 4.6645, + "step": 1462 + }, + { + "epoch": 0.15169562371605028, + "grad_norm": 0.93359375, + "learning_rate": 0.00019875708158275326, + "loss": 4.6656, + "step": 1463 + }, + { + "epoch": 0.15179931177053835, + "grad_norm": 0.7265625, + "learning_rate": 0.00019875537372244023, + "loss": 4.6743, + "step": 1464 + }, + { + "epoch": 0.15190299982502642, + "grad_norm": 0.5625, + "learning_rate": 0.00019875366469691814, + "loss": 4.7085, + "step": 1465 + }, + { + "epoch": 0.1520066878795145, + "grad_norm": 0.703125, + "learning_rate": 0.0001987519545062072, + "loss": 4.7214, + "step": 1466 + }, + { + "epoch": 0.15211037593400256, + "grad_norm": 0.73828125, + "learning_rate": 0.0001987502431503276, + "loss": 4.6776, + "step": 1467 + }, + { + "epoch": 0.15221406398849063, + "grad_norm": 0.6015625, + "learning_rate": 0.0001987485306292995, + "loss": 4.7105, + "step": 1468 + }, + { + "epoch": 0.1523177520429787, + "grad_norm": 0.63671875, + "learning_rate": 0.0001987468169431431, + "loss": 4.685, + "step": 1469 + }, + { + "epoch": 0.15242144009746678, + "grad_norm": 0.59765625, + "learning_rate": 0.0001987451020918787, + "loss": 4.6538, + "step": 1470 + }, + { + "epoch": 0.15252512815195485, + "grad_norm": 0.58984375, + "learning_rate": 0.00019874338607552642, + "loss": 4.6843, + "step": 1471 + }, + { + "epoch": 0.15262881620644292, + "grad_norm": 0.62109375, + "learning_rate": 0.00019874166889410658, + "loss": 4.6666, + "step": 1472 + }, + { + "epoch": 0.152732504260931, + "grad_norm": 0.546875, + "learning_rate": 0.0001987399505476394, + "loss": 4.6916, + "step": 1473 + }, + { + "epoch": 0.15283619231541906, + "grad_norm": 0.6171875, + "learning_rate": 0.0001987382310361452, + "loss": 4.6385, + "step": 1474 + }, + { + "epoch": 0.15293988036990713, + "grad_norm": 0.6328125, + "learning_rate": 0.00019873651035964425, + "loss": 4.6691, + "step": 1475 + }, + { + "epoch": 0.1530435684243952, + "grad_norm": 0.482421875, + "learning_rate": 0.00019873478851815683, + "loss": 4.6766, + "step": 1476 + }, + { + "epoch": 0.15314725647888328, + "grad_norm": 0.5546875, + "learning_rate": 0.00019873306551170328, + "loss": 4.6835, + "step": 1477 + }, + { + "epoch": 0.15325094453337135, + "grad_norm": 0.58203125, + "learning_rate": 0.00019873134134030393, + "loss": 4.6331, + "step": 1478 + }, + { + "epoch": 0.15335463258785942, + "grad_norm": 0.4609375, + "learning_rate": 0.0001987296160039791, + "loss": 4.6557, + "step": 1479 + }, + { + "epoch": 0.1534583206423475, + "grad_norm": 0.5390625, + "learning_rate": 0.00019872788950274918, + "loss": 4.6338, + "step": 1480 + }, + { + "epoch": 0.15356200869683556, + "grad_norm": 0.5234375, + "learning_rate": 0.00019872616183663451, + "loss": 4.6594, + "step": 1481 + }, + { + "epoch": 0.15366569675132363, + "grad_norm": 0.609375, + "learning_rate": 0.00019872443300565548, + "loss": 4.6934, + "step": 1482 + }, + { + "epoch": 0.1537693848058117, + "grad_norm": 0.55859375, + "learning_rate": 0.0001987227030098325, + "loss": 4.664, + "step": 1483 + }, + { + "epoch": 0.15387307286029978, + "grad_norm": 0.46875, + "learning_rate": 0.000198720971849186, + "loss": 4.6917, + "step": 1484 + }, + { + "epoch": 0.15397676091478785, + "grad_norm": 0.58984375, + "learning_rate": 0.00019871923952373635, + "loss": 4.6441, + "step": 1485 + }, + { + "epoch": 0.15408044896927595, + "grad_norm": 0.5078125, + "learning_rate": 0.00019871750603350408, + "loss": 4.6871, + "step": 1486 + }, + { + "epoch": 0.15418413702376402, + "grad_norm": 0.57421875, + "learning_rate": 0.00019871577137850954, + "loss": 4.6767, + "step": 1487 + }, + { + "epoch": 0.1542878250782521, + "grad_norm": 0.61328125, + "learning_rate": 0.00019871403555877327, + "loss": 4.6942, + "step": 1488 + }, + { + "epoch": 0.15439151313274016, + "grad_norm": 0.6015625, + "learning_rate": 0.00019871229857431572, + "loss": 4.6879, + "step": 1489 + }, + { + "epoch": 0.15449520118722823, + "grad_norm": 0.57421875, + "learning_rate": 0.0001987105604251574, + "loss": 4.6799, + "step": 1490 + }, + { + "epoch": 0.1545988892417163, + "grad_norm": 0.5234375, + "learning_rate": 0.0001987088211113188, + "loss": 4.6938, + "step": 1491 + }, + { + "epoch": 0.15470257729620437, + "grad_norm": 0.6171875, + "learning_rate": 0.00019870708063282044, + "loss": 4.6613, + "step": 1492 + }, + { + "epoch": 0.15480626535069245, + "grad_norm": 0.61328125, + "learning_rate": 0.00019870533898968287, + "loss": 4.6938, + "step": 1493 + }, + { + "epoch": 0.15490995340518052, + "grad_norm": 0.51171875, + "learning_rate": 0.00019870359618192663, + "loss": 4.6946, + "step": 1494 + }, + { + "epoch": 0.1550136414596686, + "grad_norm": 0.59375, + "learning_rate": 0.0001987018522095723, + "loss": 4.6413, + "step": 1495 + }, + { + "epoch": 0.15511732951415666, + "grad_norm": 0.5625, + "learning_rate": 0.00019870010707264045, + "loss": 4.6721, + "step": 1496 + }, + { + "epoch": 0.15522101756864473, + "grad_norm": 0.474609375, + "learning_rate": 0.00019869836077115164, + "loss": 4.7004, + "step": 1497 + }, + { + "epoch": 0.1553247056231328, + "grad_norm": 0.52734375, + "learning_rate": 0.00019869661330512654, + "loss": 4.6585, + "step": 1498 + }, + { + "epoch": 0.15542839367762087, + "grad_norm": 0.5390625, + "learning_rate": 0.0001986948646745857, + "loss": 4.6284, + "step": 1499 + }, + { + "epoch": 0.15553208173210895, + "grad_norm": 0.474609375, + "learning_rate": 0.0001986931148795498, + "loss": 4.6635, + "step": 1500 + }, + { + "epoch": 0.15563576978659702, + "grad_norm": 0.5390625, + "learning_rate": 0.00019869136392003945, + "loss": 4.6898, + "step": 1501 + }, + { + "epoch": 0.1557394578410851, + "grad_norm": 0.51953125, + "learning_rate": 0.00019868961179607536, + "loss": 4.7494, + "step": 1502 + }, + { + "epoch": 0.15584314589557316, + "grad_norm": 0.5, + "learning_rate": 0.00019868785850767813, + "loss": 4.6805, + "step": 1503 + }, + { + "epoch": 0.15594683395006123, + "grad_norm": 0.478515625, + "learning_rate": 0.0001986861040548685, + "loss": 4.6183, + "step": 1504 + }, + { + "epoch": 0.1560505220045493, + "grad_norm": 0.498046875, + "learning_rate": 0.00019868434843766717, + "loss": 4.6631, + "step": 1505 + }, + { + "epoch": 0.15615421005903737, + "grad_norm": 0.48046875, + "learning_rate": 0.00019868259165609482, + "loss": 4.6455, + "step": 1506 + }, + { + "epoch": 0.15625789811352545, + "grad_norm": 0.486328125, + "learning_rate": 0.00019868083371017223, + "loss": 4.6403, + "step": 1507 + }, + { + "epoch": 0.15636158616801354, + "grad_norm": 0.5234375, + "learning_rate": 0.0001986790745999201, + "loss": 4.6782, + "step": 1508 + }, + { + "epoch": 0.15646527422250162, + "grad_norm": 0.54296875, + "learning_rate": 0.00019867731432535922, + "loss": 4.6598, + "step": 1509 + }, + { + "epoch": 0.1565689622769897, + "grad_norm": 0.578125, + "learning_rate": 0.0001986755528865103, + "loss": 4.6547, + "step": 1510 + }, + { + "epoch": 0.15667265033147776, + "grad_norm": 0.53125, + "learning_rate": 0.00019867379028339416, + "loss": 4.6414, + "step": 1511 + }, + { + "epoch": 0.15677633838596583, + "grad_norm": 0.56640625, + "learning_rate": 0.0001986720265160316, + "loss": 4.6567, + "step": 1512 + }, + { + "epoch": 0.1568800264404539, + "grad_norm": 0.70703125, + "learning_rate": 0.00019867026158444344, + "loss": 4.6425, + "step": 1513 + }, + { + "epoch": 0.15698371449494197, + "grad_norm": 0.66015625, + "learning_rate": 0.0001986684954886505, + "loss": 4.6946, + "step": 1514 + }, + { + "epoch": 0.15708740254943004, + "grad_norm": 0.6328125, + "learning_rate": 0.0001986667282286736, + "loss": 4.693, + "step": 1515 + }, + { + "epoch": 0.15719109060391812, + "grad_norm": 0.65625, + "learning_rate": 0.0001986649598045336, + "loss": 4.673, + "step": 1516 + }, + { + "epoch": 0.1572947786584062, + "grad_norm": 0.640625, + "learning_rate": 0.00019866319021625138, + "loss": 4.6624, + "step": 1517 + }, + { + "epoch": 0.15739846671289426, + "grad_norm": 0.75390625, + "learning_rate": 0.0001986614194638478, + "loss": 4.6387, + "step": 1518 + }, + { + "epoch": 0.15750215476738233, + "grad_norm": 0.84765625, + "learning_rate": 0.00019865964754734377, + "loss": 4.6426, + "step": 1519 + }, + { + "epoch": 0.1576058428218704, + "grad_norm": 0.65625, + "learning_rate": 0.00019865787446676016, + "loss": 4.6452, + "step": 1520 + }, + { + "epoch": 0.15770953087635847, + "grad_norm": 0.65234375, + "learning_rate": 0.00019865610022211795, + "loss": 4.6182, + "step": 1521 + }, + { + "epoch": 0.15781321893084654, + "grad_norm": 0.8359375, + "learning_rate": 0.000198654324813438, + "loss": 4.6999, + "step": 1522 + }, + { + "epoch": 0.15791690698533462, + "grad_norm": 0.890625, + "learning_rate": 0.00019865254824074136, + "loss": 4.6654, + "step": 1523 + }, + { + "epoch": 0.1580205950398227, + "grad_norm": 0.87890625, + "learning_rate": 0.0001986507705040489, + "loss": 4.7078, + "step": 1524 + }, + { + "epoch": 0.15812428309431076, + "grad_norm": 0.65234375, + "learning_rate": 0.0001986489916033816, + "loss": 4.7079, + "step": 1525 + }, + { + "epoch": 0.15822797114879883, + "grad_norm": 0.8125, + "learning_rate": 0.0001986472115387605, + "loss": 4.6934, + "step": 1526 + }, + { + "epoch": 0.1583316592032869, + "grad_norm": 1.015625, + "learning_rate": 0.00019864543031020658, + "loss": 4.6808, + "step": 1527 + }, + { + "epoch": 0.15843534725777497, + "grad_norm": 0.75390625, + "learning_rate": 0.00019864364791774084, + "loss": 4.6659, + "step": 1528 + }, + { + "epoch": 0.15853903531226304, + "grad_norm": 0.6953125, + "learning_rate": 0.00019864186436138433, + "loss": 4.6941, + "step": 1529 + }, + { + "epoch": 0.15864272336675114, + "grad_norm": 0.70703125, + "learning_rate": 0.0001986400796411581, + "loss": 4.682, + "step": 1530 + }, + { + "epoch": 0.15874641142123921, + "grad_norm": 0.734375, + "learning_rate": 0.0001986382937570832, + "loss": 4.6594, + "step": 1531 + }, + { + "epoch": 0.15885009947572729, + "grad_norm": 0.79296875, + "learning_rate": 0.0001986365067091807, + "loss": 4.681, + "step": 1532 + }, + { + "epoch": 0.15895378753021536, + "grad_norm": 0.73828125, + "learning_rate": 0.00019863471849747167, + "loss": 4.7219, + "step": 1533 + }, + { + "epoch": 0.15905747558470343, + "grad_norm": 0.828125, + "learning_rate": 0.0001986329291219772, + "loss": 4.6721, + "step": 1534 + }, + { + "epoch": 0.1591611636391915, + "grad_norm": 0.6171875, + "learning_rate": 0.00019863113858271846, + "loss": 4.6567, + "step": 1535 + }, + { + "epoch": 0.15926485169367957, + "grad_norm": 0.63671875, + "learning_rate": 0.00019862934687971655, + "loss": 4.6987, + "step": 1536 + }, + { + "epoch": 0.15936853974816764, + "grad_norm": 0.78125, + "learning_rate": 0.00019862755401299257, + "loss": 4.6722, + "step": 1537 + }, + { + "epoch": 0.1594722278026557, + "grad_norm": 0.85546875, + "learning_rate": 0.00019862575998256773, + "loss": 4.6555, + "step": 1538 + }, + { + "epoch": 0.15957591585714379, + "grad_norm": 0.82421875, + "learning_rate": 0.00019862396478846316, + "loss": 4.6425, + "step": 1539 + }, + { + "epoch": 0.15967960391163186, + "grad_norm": 0.796875, + "learning_rate": 0.0001986221684307001, + "loss": 4.6574, + "step": 1540 + }, + { + "epoch": 0.15978329196611993, + "grad_norm": 0.765625, + "learning_rate": 0.00019862037090929966, + "loss": 4.6743, + "step": 1541 + }, + { + "epoch": 0.159886980020608, + "grad_norm": 0.8046875, + "learning_rate": 0.00019861857222428308, + "loss": 4.6539, + "step": 1542 + }, + { + "epoch": 0.15999066807509607, + "grad_norm": 0.80859375, + "learning_rate": 0.00019861677237567162, + "loss": 4.6581, + "step": 1543 + }, + { + "epoch": 0.16009435612958414, + "grad_norm": 0.7890625, + "learning_rate": 0.00019861497136348648, + "loss": 4.6538, + "step": 1544 + }, + { + "epoch": 0.1601980441840722, + "grad_norm": 0.77734375, + "learning_rate": 0.0001986131691877489, + "loss": 4.6637, + "step": 1545 + }, + { + "epoch": 0.16030173223856028, + "grad_norm": 0.703125, + "learning_rate": 0.00019861136584848019, + "loss": 4.6621, + "step": 1546 + }, + { + "epoch": 0.16040542029304836, + "grad_norm": 0.7578125, + "learning_rate": 0.0001986095613457016, + "loss": 4.6788, + "step": 1547 + }, + { + "epoch": 0.16050910834753643, + "grad_norm": 0.84765625, + "learning_rate": 0.0001986077556794344, + "loss": 4.6588, + "step": 1548 + }, + { + "epoch": 0.1606127964020245, + "grad_norm": 0.93359375, + "learning_rate": 0.00019860594884969993, + "loss": 4.6647, + "step": 1549 + }, + { + "epoch": 0.16071648445651257, + "grad_norm": 1.0625, + "learning_rate": 0.0001986041408565195, + "loss": 4.6423, + "step": 1550 + }, + { + "epoch": 0.16082017251100064, + "grad_norm": 0.890625, + "learning_rate": 0.0001986023316999144, + "loss": 4.6748, + "step": 1551 + }, + { + "epoch": 0.1609238605654887, + "grad_norm": 0.6953125, + "learning_rate": 0.00019860052137990605, + "loss": 4.6763, + "step": 1552 + }, + { + "epoch": 0.1610275486199768, + "grad_norm": 0.84765625, + "learning_rate": 0.00019859870989651576, + "loss": 4.6478, + "step": 1553 + }, + { + "epoch": 0.16113123667446488, + "grad_norm": 0.96875, + "learning_rate": 0.00019859689724976488, + "loss": 4.6574, + "step": 1554 + }, + { + "epoch": 0.16123492472895296, + "grad_norm": 0.953125, + "learning_rate": 0.0001985950834396749, + "loss": 4.6907, + "step": 1555 + }, + { + "epoch": 0.16133861278344103, + "grad_norm": 0.91796875, + "learning_rate": 0.00019859326846626712, + "loss": 4.6621, + "step": 1556 + }, + { + "epoch": 0.1614423008379291, + "grad_norm": 0.83984375, + "learning_rate": 0.000198591452329563, + "loss": 4.6589, + "step": 1557 + }, + { + "epoch": 0.16154598889241717, + "grad_norm": 0.69921875, + "learning_rate": 0.00019858963502958395, + "loss": 4.7071, + "step": 1558 + }, + { + "epoch": 0.16164967694690524, + "grad_norm": 0.71875, + "learning_rate": 0.00019858781656635142, + "loss": 4.6513, + "step": 1559 + }, + { + "epoch": 0.1617533650013933, + "grad_norm": 0.94140625, + "learning_rate": 0.00019858599693988688, + "loss": 4.6913, + "step": 1560 + }, + { + "epoch": 0.16185705305588138, + "grad_norm": 1.2265625, + "learning_rate": 0.00019858417615021176, + "loss": 4.6505, + "step": 1561 + }, + { + "epoch": 0.16196074111036945, + "grad_norm": 0.765625, + "learning_rate": 0.00019858235419734758, + "loss": 4.6975, + "step": 1562 + }, + { + "epoch": 0.16206442916485753, + "grad_norm": 1.0546875, + "learning_rate": 0.00019858053108131585, + "loss": 4.6967, + "step": 1563 + }, + { + "epoch": 0.1621681172193456, + "grad_norm": 1.203125, + "learning_rate": 0.00019857870680213804, + "loss": 4.6619, + "step": 1564 + }, + { + "epoch": 0.16227180527383367, + "grad_norm": 0.81640625, + "learning_rate": 0.0001985768813598357, + "loss": 4.6586, + "step": 1565 + }, + { + "epoch": 0.16237549332832174, + "grad_norm": 0.8984375, + "learning_rate": 0.00019857505475443033, + "loss": 4.6451, + "step": 1566 + }, + { + "epoch": 0.1624791813828098, + "grad_norm": 0.9921875, + "learning_rate": 0.00019857322698594353, + "loss": 4.6634, + "step": 1567 + }, + { + "epoch": 0.16258286943729788, + "grad_norm": 0.87890625, + "learning_rate": 0.00019857139805439688, + "loss": 4.6611, + "step": 1568 + }, + { + "epoch": 0.16268655749178595, + "grad_norm": 0.671875, + "learning_rate": 0.0001985695679598119, + "loss": 4.6922, + "step": 1569 + }, + { + "epoch": 0.16279024554627403, + "grad_norm": 0.8359375, + "learning_rate": 0.0001985677367022102, + "loss": 4.5804, + "step": 1570 + }, + { + "epoch": 0.1628939336007621, + "grad_norm": 0.9140625, + "learning_rate": 0.00019856590428161342, + "loss": 4.6266, + "step": 1571 + }, + { + "epoch": 0.16299762165525017, + "grad_norm": 0.85546875, + "learning_rate": 0.00019856407069804316, + "loss": 4.6377, + "step": 1572 + }, + { + "epoch": 0.16310130970973824, + "grad_norm": 0.83203125, + "learning_rate": 0.00019856223595152104, + "loss": 4.6668, + "step": 1573 + }, + { + "epoch": 0.1632049977642263, + "grad_norm": 0.77734375, + "learning_rate": 0.0001985604000420687, + "loss": 4.6548, + "step": 1574 + }, + { + "epoch": 0.1633086858187144, + "grad_norm": 0.73046875, + "learning_rate": 0.00019855856296970784, + "loss": 4.7052, + "step": 1575 + }, + { + "epoch": 0.16341237387320248, + "grad_norm": 0.69140625, + "learning_rate": 0.00019855672473446012, + "loss": 4.6898, + "step": 1576 + }, + { + "epoch": 0.16351606192769055, + "grad_norm": 0.796875, + "learning_rate": 0.00019855488533634724, + "loss": 4.6651, + "step": 1577 + }, + { + "epoch": 0.16361974998217862, + "grad_norm": 0.79296875, + "learning_rate": 0.00019855304477539085, + "loss": 4.6913, + "step": 1578 + }, + { + "epoch": 0.1637234380366667, + "grad_norm": 0.828125, + "learning_rate": 0.00019855120305161273, + "loss": 4.6939, + "step": 1579 + }, + { + "epoch": 0.16382712609115477, + "grad_norm": 0.7734375, + "learning_rate": 0.0001985493601650346, + "loss": 4.677, + "step": 1580 + }, + { + "epoch": 0.16393081414564284, + "grad_norm": 0.69921875, + "learning_rate": 0.0001985475161156782, + "loss": 4.6388, + "step": 1581 + }, + { + "epoch": 0.1640345022001309, + "grad_norm": 0.7421875, + "learning_rate": 0.00019854567090356526, + "loss": 4.6185, + "step": 1582 + }, + { + "epoch": 0.16413819025461898, + "grad_norm": 0.90625, + "learning_rate": 0.00019854382452871757, + "loss": 4.6091, + "step": 1583 + }, + { + "epoch": 0.16424187830910705, + "grad_norm": 0.86328125, + "learning_rate": 0.00019854197699115692, + "loss": 4.683, + "step": 1584 + }, + { + "epoch": 0.16434556636359512, + "grad_norm": 0.6875, + "learning_rate": 0.0001985401282909051, + "loss": 4.6445, + "step": 1585 + }, + { + "epoch": 0.1644492544180832, + "grad_norm": 0.7890625, + "learning_rate": 0.00019853827842798393, + "loss": 4.6586, + "step": 1586 + }, + { + "epoch": 0.16455294247257127, + "grad_norm": 0.63671875, + "learning_rate": 0.00019853642740241522, + "loss": 4.6508, + "step": 1587 + }, + { + "epoch": 0.16465663052705934, + "grad_norm": 0.7265625, + "learning_rate": 0.00019853457521422084, + "loss": 4.6936, + "step": 1588 + }, + { + "epoch": 0.1647603185815474, + "grad_norm": 0.6015625, + "learning_rate": 0.00019853272186342262, + "loss": 4.639, + "step": 1589 + }, + { + "epoch": 0.16486400663603548, + "grad_norm": 0.59765625, + "learning_rate": 0.00019853086735004247, + "loss": 4.6583, + "step": 1590 + }, + { + "epoch": 0.16496769469052355, + "grad_norm": 0.72265625, + "learning_rate": 0.0001985290116741022, + "loss": 4.6363, + "step": 1591 + }, + { + "epoch": 0.16507138274501162, + "grad_norm": 0.7421875, + "learning_rate": 0.00019852715483562375, + "loss": 4.6199, + "step": 1592 + }, + { + "epoch": 0.1651750707994997, + "grad_norm": 0.7578125, + "learning_rate": 0.00019852529683462902, + "loss": 4.6458, + "step": 1593 + }, + { + "epoch": 0.16527875885398777, + "grad_norm": 0.62109375, + "learning_rate": 0.00019852343767113993, + "loss": 4.6503, + "step": 1594 + }, + { + "epoch": 0.16538244690847584, + "grad_norm": 0.66015625, + "learning_rate": 0.00019852157734517843, + "loss": 4.6741, + "step": 1595 + }, + { + "epoch": 0.1654861349629639, + "grad_norm": 0.6484375, + "learning_rate": 0.00019851971585676644, + "loss": 4.6378, + "step": 1596 + }, + { + "epoch": 0.165589823017452, + "grad_norm": 0.7109375, + "learning_rate": 0.00019851785320592596, + "loss": 4.6136, + "step": 1597 + }, + { + "epoch": 0.16569351107194008, + "grad_norm": 0.70703125, + "learning_rate": 0.00019851598939267894, + "loss": 4.6617, + "step": 1598 + }, + { + "epoch": 0.16579719912642815, + "grad_norm": 0.62890625, + "learning_rate": 0.0001985141244170474, + "loss": 4.67, + "step": 1599 + }, + { + "epoch": 0.16590088718091622, + "grad_norm": 0.65234375, + "learning_rate": 0.0001985122582790533, + "loss": 4.6461, + "step": 1600 + }, + { + "epoch": 0.1660045752354043, + "grad_norm": 0.69921875, + "learning_rate": 0.00019851039097871872, + "loss": 4.6634, + "step": 1601 + }, + { + "epoch": 0.16610826328989237, + "grad_norm": 0.62890625, + "learning_rate": 0.00019850852251606562, + "loss": 4.6191, + "step": 1602 + }, + { + "epoch": 0.16621195134438044, + "grad_norm": 0.8125, + "learning_rate": 0.0001985066528911161, + "loss": 4.6047, + "step": 1603 + }, + { + "epoch": 0.1663156393988685, + "grad_norm": 0.83203125, + "learning_rate": 0.00019850478210389218, + "loss": 4.6388, + "step": 1604 + }, + { + "epoch": 0.16641932745335658, + "grad_norm": 0.7578125, + "learning_rate": 0.00019850291015441598, + "loss": 4.6735, + "step": 1605 + }, + { + "epoch": 0.16652301550784465, + "grad_norm": 0.73828125, + "learning_rate": 0.00019850103704270957, + "loss": 4.6201, + "step": 1606 + }, + { + "epoch": 0.16662670356233272, + "grad_norm": 0.66015625, + "learning_rate": 0.000198499162768795, + "loss": 4.6704, + "step": 1607 + }, + { + "epoch": 0.1667303916168208, + "grad_norm": 0.8203125, + "learning_rate": 0.00019849728733269446, + "loss": 4.6382, + "step": 1608 + }, + { + "epoch": 0.16683407967130887, + "grad_norm": 0.97265625, + "learning_rate": 0.00019849541073443006, + "loss": 4.6594, + "step": 1609 + }, + { + "epoch": 0.16693776772579694, + "grad_norm": 0.91015625, + "learning_rate": 0.00019849353297402388, + "loss": 4.6754, + "step": 1610 + }, + { + "epoch": 0.167041455780285, + "grad_norm": 0.9453125, + "learning_rate": 0.00019849165405149818, + "loss": 4.6522, + "step": 1611 + }, + { + "epoch": 0.16714514383477308, + "grad_norm": 1.0703125, + "learning_rate": 0.00019848977396687504, + "loss": 4.636, + "step": 1612 + }, + { + "epoch": 0.16724883188926115, + "grad_norm": 0.96484375, + "learning_rate": 0.00019848789272017668, + "loss": 4.6058, + "step": 1613 + }, + { + "epoch": 0.16735251994374922, + "grad_norm": 0.78515625, + "learning_rate": 0.0001984860103114253, + "loss": 4.6336, + "step": 1614 + }, + { + "epoch": 0.1674562079982373, + "grad_norm": 0.75, + "learning_rate": 0.00019848412674064306, + "loss": 4.6957, + "step": 1615 + }, + { + "epoch": 0.16755989605272537, + "grad_norm": 0.640625, + "learning_rate": 0.00019848224200785228, + "loss": 4.6593, + "step": 1616 + }, + { + "epoch": 0.16766358410721344, + "grad_norm": 0.81640625, + "learning_rate": 0.00019848035611307513, + "loss": 4.6497, + "step": 1617 + }, + { + "epoch": 0.1677672721617015, + "grad_norm": 1.1640625, + "learning_rate": 0.00019847846905633385, + "loss": 4.598, + "step": 1618 + }, + { + "epoch": 0.16787096021618958, + "grad_norm": 0.90234375, + "learning_rate": 0.00019847658083765076, + "loss": 4.6646, + "step": 1619 + }, + { + "epoch": 0.16797464827067768, + "grad_norm": 0.62890625, + "learning_rate": 0.0001984746914570481, + "loss": 4.6261, + "step": 1620 + }, + { + "epoch": 0.16807833632516575, + "grad_norm": 0.7890625, + "learning_rate": 0.0001984728009145482, + "loss": 4.6637, + "step": 1621 + }, + { + "epoch": 0.16818202437965382, + "grad_norm": 0.9609375, + "learning_rate": 0.0001984709092101733, + "loss": 4.6613, + "step": 1622 + }, + { + "epoch": 0.1682857124341419, + "grad_norm": 0.81640625, + "learning_rate": 0.00019846901634394576, + "loss": 4.6403, + "step": 1623 + }, + { + "epoch": 0.16838940048862996, + "grad_norm": 0.73828125, + "learning_rate": 0.00019846712231588796, + "loss": 4.647, + "step": 1624 + }, + { + "epoch": 0.16849308854311804, + "grad_norm": 0.69140625, + "learning_rate": 0.00019846522712602216, + "loss": 4.6536, + "step": 1625 + }, + { + "epoch": 0.1685967765976061, + "grad_norm": 0.75, + "learning_rate": 0.00019846333077437077, + "loss": 4.6393, + "step": 1626 + }, + { + "epoch": 0.16870046465209418, + "grad_norm": 0.80859375, + "learning_rate": 0.00019846143326095615, + "loss": 4.6717, + "step": 1627 + }, + { + "epoch": 0.16880415270658225, + "grad_norm": 0.69921875, + "learning_rate": 0.0001984595345858007, + "loss": 4.6327, + "step": 1628 + }, + { + "epoch": 0.16890784076107032, + "grad_norm": 0.69921875, + "learning_rate": 0.00019845763474892681, + "loss": 4.6776, + "step": 1629 + }, + { + "epoch": 0.1690115288155584, + "grad_norm": 0.66796875, + "learning_rate": 0.00019845573375035694, + "loss": 4.6468, + "step": 1630 + }, + { + "epoch": 0.16911521687004646, + "grad_norm": 0.78515625, + "learning_rate": 0.00019845383159011347, + "loss": 4.6192, + "step": 1631 + }, + { + "epoch": 0.16921890492453454, + "grad_norm": 0.7421875, + "learning_rate": 0.00019845192826821884, + "loss": 4.6868, + "step": 1632 + }, + { + "epoch": 0.1693225929790226, + "grad_norm": 0.74609375, + "learning_rate": 0.00019845002378469554, + "loss": 4.6603, + "step": 1633 + }, + { + "epoch": 0.16942628103351068, + "grad_norm": 0.83984375, + "learning_rate": 0.000198448118139566, + "loss": 4.6291, + "step": 1634 + }, + { + "epoch": 0.16952996908799875, + "grad_norm": 0.6796875, + "learning_rate": 0.00019844621133285276, + "loss": 4.5767, + "step": 1635 + }, + { + "epoch": 0.16963365714248682, + "grad_norm": 0.6875, + "learning_rate": 0.0001984443033645783, + "loss": 4.571, + "step": 1636 + }, + { + "epoch": 0.1697373451969749, + "grad_norm": 0.75390625, + "learning_rate": 0.00019844239423476507, + "loss": 4.6399, + "step": 1637 + }, + { + "epoch": 0.16984103325146296, + "grad_norm": 0.78515625, + "learning_rate": 0.00019844048394343568, + "loss": 4.6592, + "step": 1638 + }, + { + "epoch": 0.16994472130595104, + "grad_norm": 0.6875, + "learning_rate": 0.00019843857249061264, + "loss": 4.6768, + "step": 1639 + }, + { + "epoch": 0.1700484093604391, + "grad_norm": 0.7265625, + "learning_rate": 0.00019843665987631849, + "loss": 4.671, + "step": 1640 + }, + { + "epoch": 0.17015209741492718, + "grad_norm": 0.71484375, + "learning_rate": 0.00019843474610057576, + "loss": 4.6523, + "step": 1641 + }, + { + "epoch": 0.17025578546941528, + "grad_norm": 0.6953125, + "learning_rate": 0.00019843283116340713, + "loss": 4.6186, + "step": 1642 + }, + { + "epoch": 0.17035947352390335, + "grad_norm": 0.6875, + "learning_rate": 0.00019843091506483514, + "loss": 4.6413, + "step": 1643 + }, + { + "epoch": 0.17046316157839142, + "grad_norm": 0.73046875, + "learning_rate": 0.00019842899780488237, + "loss": 4.66, + "step": 1644 + }, + { + "epoch": 0.1705668496328795, + "grad_norm": 0.796875, + "learning_rate": 0.0001984270793835715, + "loss": 4.6728, + "step": 1645 + }, + { + "epoch": 0.17067053768736756, + "grad_norm": 0.59375, + "learning_rate": 0.00019842515980092514, + "loss": 4.6588, + "step": 1646 + }, + { + "epoch": 0.17077422574185563, + "grad_norm": 0.73046875, + "learning_rate": 0.00019842323905696588, + "loss": 4.6629, + "step": 1647 + }, + { + "epoch": 0.1708779137963437, + "grad_norm": 0.76171875, + "learning_rate": 0.0001984213171517165, + "loss": 4.6732, + "step": 1648 + }, + { + "epoch": 0.17098160185083178, + "grad_norm": 0.73046875, + "learning_rate": 0.00019841939408519958, + "loss": 4.6467, + "step": 1649 + }, + { + "epoch": 0.17108528990531985, + "grad_norm": 0.62109375, + "learning_rate": 0.00019841746985743786, + "loss": 4.6683, + "step": 1650 + }, + { + "epoch": 0.17118897795980792, + "grad_norm": 0.625, + "learning_rate": 0.00019841554446845404, + "loss": 4.6328, + "step": 1651 + }, + { + "epoch": 0.171292666014296, + "grad_norm": 0.73828125, + "learning_rate": 0.0001984136179182708, + "loss": 4.6829, + "step": 1652 + }, + { + "epoch": 0.17139635406878406, + "grad_norm": 0.65234375, + "learning_rate": 0.0001984116902069109, + "loss": 4.6603, + "step": 1653 + }, + { + "epoch": 0.17150004212327213, + "grad_norm": 0.6875, + "learning_rate": 0.00019840976133439706, + "loss": 4.6468, + "step": 1654 + }, + { + "epoch": 0.1716037301777602, + "grad_norm": 0.77734375, + "learning_rate": 0.00019840783130075206, + "loss": 4.6419, + "step": 1655 + }, + { + "epoch": 0.17170741823224828, + "grad_norm": 0.78125, + "learning_rate": 0.0001984059001059987, + "loss": 4.5958, + "step": 1656 + }, + { + "epoch": 0.17181110628673635, + "grad_norm": 0.7734375, + "learning_rate": 0.00019840396775015976, + "loss": 4.6597, + "step": 1657 + }, + { + "epoch": 0.17191479434122442, + "grad_norm": 0.8046875, + "learning_rate": 0.000198402034233258, + "loss": 4.6056, + "step": 1658 + }, + { + "epoch": 0.1720184823957125, + "grad_norm": 0.875, + "learning_rate": 0.00019840009955531622, + "loss": 4.6602, + "step": 1659 + }, + { + "epoch": 0.17212217045020056, + "grad_norm": 1.0234375, + "learning_rate": 0.0001983981637163573, + "loss": 4.6517, + "step": 1660 + }, + { + "epoch": 0.17222585850468863, + "grad_norm": 0.80078125, + "learning_rate": 0.00019839622671640405, + "loss": 4.6581, + "step": 1661 + }, + { + "epoch": 0.1723295465591767, + "grad_norm": 0.9921875, + "learning_rate": 0.00019839428855547935, + "loss": 4.6584, + "step": 1662 + }, + { + "epoch": 0.17243323461366478, + "grad_norm": 0.91015625, + "learning_rate": 0.00019839234923360604, + "loss": 4.6582, + "step": 1663 + }, + { + "epoch": 0.17253692266815288, + "grad_norm": 0.75390625, + "learning_rate": 0.00019839040875080702, + "loss": 4.6181, + "step": 1664 + }, + { + "epoch": 0.17264061072264095, + "grad_norm": 0.76171875, + "learning_rate": 0.0001983884671071052, + "loss": 4.6752, + "step": 1665 + }, + { + "epoch": 0.17274429877712902, + "grad_norm": 0.80078125, + "learning_rate": 0.00019838652430252346, + "loss": 4.6136, + "step": 1666 + }, + { + "epoch": 0.1728479868316171, + "grad_norm": 0.953125, + "learning_rate": 0.0001983845803370847, + "loss": 4.6409, + "step": 1667 + }, + { + "epoch": 0.17295167488610516, + "grad_norm": 0.90234375, + "learning_rate": 0.00019838263521081191, + "loss": 4.578, + "step": 1668 + }, + { + "epoch": 0.17305536294059323, + "grad_norm": 0.73046875, + "learning_rate": 0.000198380688923728, + "loss": 4.6316, + "step": 1669 + }, + { + "epoch": 0.1731590509950813, + "grad_norm": 0.8359375, + "learning_rate": 0.000198378741475856, + "loss": 4.6198, + "step": 1670 + }, + { + "epoch": 0.17326273904956938, + "grad_norm": 0.8671875, + "learning_rate": 0.0001983767928672188, + "loss": 4.6566, + "step": 1671 + }, + { + "epoch": 0.17336642710405745, + "grad_norm": 0.62890625, + "learning_rate": 0.00019837484309783945, + "loss": 4.6642, + "step": 1672 + }, + { + "epoch": 0.17347011515854552, + "grad_norm": 0.69140625, + "learning_rate": 0.00019837289216774093, + "loss": 4.6507, + "step": 1673 + }, + { + "epoch": 0.1735738032130336, + "grad_norm": 0.83203125, + "learning_rate": 0.0001983709400769463, + "loss": 4.5731, + "step": 1674 + }, + { + "epoch": 0.17367749126752166, + "grad_norm": 0.765625, + "learning_rate": 0.00019836898682547852, + "loss": 4.6674, + "step": 1675 + }, + { + "epoch": 0.17378117932200973, + "grad_norm": 0.69921875, + "learning_rate": 0.00019836703241336067, + "loss": 4.6437, + "step": 1676 + }, + { + "epoch": 0.1738848673764978, + "grad_norm": 0.703125, + "learning_rate": 0.00019836507684061584, + "loss": 4.6252, + "step": 1677 + }, + { + "epoch": 0.17398855543098588, + "grad_norm": 0.8125, + "learning_rate": 0.00019836312010726708, + "loss": 4.6167, + "step": 1678 + }, + { + "epoch": 0.17409224348547395, + "grad_norm": 1.0703125, + "learning_rate": 0.00019836116221333747, + "loss": 4.6435, + "step": 1679 + }, + { + "epoch": 0.17419593153996202, + "grad_norm": 0.93359375, + "learning_rate": 0.0001983592031588501, + "loss": 4.6064, + "step": 1680 + }, + { + "epoch": 0.1742996195944501, + "grad_norm": 0.70703125, + "learning_rate": 0.00019835724294382814, + "loss": 4.5754, + "step": 1681 + }, + { + "epoch": 0.17440330764893816, + "grad_norm": 0.6171875, + "learning_rate": 0.00019835528156829466, + "loss": 4.5984, + "step": 1682 + }, + { + "epoch": 0.17450699570342623, + "grad_norm": 0.76171875, + "learning_rate": 0.00019835331903227284, + "loss": 4.6164, + "step": 1683 + }, + { + "epoch": 0.1746106837579143, + "grad_norm": 0.84375, + "learning_rate": 0.0001983513553357858, + "loss": 4.6206, + "step": 1684 + }, + { + "epoch": 0.17471437181240237, + "grad_norm": 0.68359375, + "learning_rate": 0.00019834939047885675, + "loss": 4.5944, + "step": 1685 + }, + { + "epoch": 0.17481805986689045, + "grad_norm": 0.78515625, + "learning_rate": 0.0001983474244615088, + "loss": 4.6713, + "step": 1686 + }, + { + "epoch": 0.17492174792137855, + "grad_norm": 0.8359375, + "learning_rate": 0.00019834545728376527, + "loss": 4.646, + "step": 1687 + }, + { + "epoch": 0.17502543597586662, + "grad_norm": 0.71875, + "learning_rate": 0.00019834348894564924, + "loss": 4.6358, + "step": 1688 + }, + { + "epoch": 0.1751291240303547, + "grad_norm": 0.66015625, + "learning_rate": 0.00019834151944718404, + "loss": 4.6095, + "step": 1689 + }, + { + "epoch": 0.17523281208484276, + "grad_norm": 0.7890625, + "learning_rate": 0.00019833954878839283, + "loss": 4.6115, + "step": 1690 + }, + { + "epoch": 0.17533650013933083, + "grad_norm": 0.796875, + "learning_rate": 0.0001983375769692989, + "loss": 4.6866, + "step": 1691 + }, + { + "epoch": 0.1754401881938189, + "grad_norm": 0.78125, + "learning_rate": 0.00019833560398992552, + "loss": 4.6569, + "step": 1692 + }, + { + "epoch": 0.17554387624830697, + "grad_norm": 0.66015625, + "learning_rate": 0.00019833362985029594, + "loss": 4.6235, + "step": 1693 + }, + { + "epoch": 0.17564756430279505, + "grad_norm": 0.81640625, + "learning_rate": 0.0001983316545504335, + "loss": 4.6602, + "step": 1694 + }, + { + "epoch": 0.17575125235728312, + "grad_norm": 0.875, + "learning_rate": 0.00019832967809036144, + "loss": 4.6386, + "step": 1695 + }, + { + "epoch": 0.1758549404117712, + "grad_norm": 0.76171875, + "learning_rate": 0.00019832770047010316, + "loss": 4.6023, + "step": 1696 + }, + { + "epoch": 0.17595862846625926, + "grad_norm": 0.67578125, + "learning_rate": 0.00019832572168968193, + "loss": 4.6724, + "step": 1697 + }, + { + "epoch": 0.17606231652074733, + "grad_norm": 0.83203125, + "learning_rate": 0.00019832374174912111, + "loss": 4.5923, + "step": 1698 + }, + { + "epoch": 0.1761660045752354, + "grad_norm": 0.76953125, + "learning_rate": 0.00019832176064844408, + "loss": 4.6562, + "step": 1699 + }, + { + "epoch": 0.17626969262972347, + "grad_norm": 0.6640625, + "learning_rate": 0.00019831977838767422, + "loss": 4.5973, + "step": 1700 + }, + { + "epoch": 0.17637338068421154, + "grad_norm": 0.859375, + "learning_rate": 0.0001983177949668349, + "loss": 4.6508, + "step": 1701 + }, + { + "epoch": 0.17647706873869962, + "grad_norm": 0.78515625, + "learning_rate": 0.0001983158103859495, + "loss": 4.643, + "step": 1702 + }, + { + "epoch": 0.1765807567931877, + "grad_norm": 0.546875, + "learning_rate": 0.00019831382464504147, + "loss": 4.6174, + "step": 1703 + }, + { + "epoch": 0.17668444484767576, + "grad_norm": 0.64453125, + "learning_rate": 0.00019831183774413424, + "loss": 4.6328, + "step": 1704 + }, + { + "epoch": 0.17678813290216383, + "grad_norm": 0.6171875, + "learning_rate": 0.00019830984968325122, + "loss": 4.6157, + "step": 1705 + }, + { + "epoch": 0.1768918209566519, + "grad_norm": 0.5703125, + "learning_rate": 0.00019830786046241592, + "loss": 4.6173, + "step": 1706 + }, + { + "epoch": 0.17699550901113997, + "grad_norm": 0.671875, + "learning_rate": 0.00019830587008165177, + "loss": 4.606, + "step": 1707 + }, + { + "epoch": 0.17709919706562804, + "grad_norm": 0.65234375, + "learning_rate": 0.0001983038785409823, + "loss": 4.5933, + "step": 1708 + }, + { + "epoch": 0.17720288512011614, + "grad_norm": 0.640625, + "learning_rate": 0.00019830188584043094, + "loss": 4.6165, + "step": 1709 + }, + { + "epoch": 0.17730657317460422, + "grad_norm": 0.68359375, + "learning_rate": 0.00019829989198002124, + "loss": 4.6634, + "step": 1710 + }, + { + "epoch": 0.1774102612290923, + "grad_norm": 0.59375, + "learning_rate": 0.00019829789695977672, + "loss": 4.5776, + "step": 1711 + }, + { + "epoch": 0.17751394928358036, + "grad_norm": 0.69140625, + "learning_rate": 0.00019829590077972094, + "loss": 4.6129, + "step": 1712 + }, + { + "epoch": 0.17761763733806843, + "grad_norm": 0.8671875, + "learning_rate": 0.00019829390343987743, + "loss": 4.5699, + "step": 1713 + }, + { + "epoch": 0.1777213253925565, + "grad_norm": 0.921875, + "learning_rate": 0.00019829190494026974, + "loss": 4.6348, + "step": 1714 + }, + { + "epoch": 0.17782501344704457, + "grad_norm": 0.87109375, + "learning_rate": 0.00019828990528092147, + "loss": 4.635, + "step": 1715 + }, + { + "epoch": 0.17792870150153264, + "grad_norm": 0.828125, + "learning_rate": 0.00019828790446185622, + "loss": 4.6516, + "step": 1716 + }, + { + "epoch": 0.17803238955602071, + "grad_norm": 0.7890625, + "learning_rate": 0.0001982859024830976, + "loss": 4.6052, + "step": 1717 + }, + { + "epoch": 0.1781360776105088, + "grad_norm": 0.6953125, + "learning_rate": 0.0001982838993446692, + "loss": 4.6519, + "step": 1718 + }, + { + "epoch": 0.17823976566499686, + "grad_norm": 0.63671875, + "learning_rate": 0.00019828189504659472, + "loss": 4.6294, + "step": 1719 + }, + { + "epoch": 0.17834345371948493, + "grad_norm": 0.5546875, + "learning_rate": 0.00019827988958889776, + "loss": 4.6044, + "step": 1720 + }, + { + "epoch": 0.178447141773973, + "grad_norm": 0.640625, + "learning_rate": 0.00019827788297160196, + "loss": 4.6074, + "step": 1721 + }, + { + "epoch": 0.17855082982846107, + "grad_norm": 0.67578125, + "learning_rate": 0.00019827587519473107, + "loss": 4.6163, + "step": 1722 + }, + { + "epoch": 0.17865451788294914, + "grad_norm": 0.69140625, + "learning_rate": 0.00019827386625830871, + "loss": 4.6524, + "step": 1723 + }, + { + "epoch": 0.17875820593743721, + "grad_norm": 0.66015625, + "learning_rate": 0.0001982718561623586, + "loss": 4.5932, + "step": 1724 + }, + { + "epoch": 0.17886189399192529, + "grad_norm": 0.59375, + "learning_rate": 0.00019826984490690447, + "loss": 4.5631, + "step": 1725 + }, + { + "epoch": 0.17896558204641336, + "grad_norm": 0.69921875, + "learning_rate": 0.00019826783249197004, + "loss": 4.6035, + "step": 1726 + }, + { + "epoch": 0.17906927010090143, + "grad_norm": 0.7421875, + "learning_rate": 0.00019826581891757908, + "loss": 4.6006, + "step": 1727 + }, + { + "epoch": 0.1791729581553895, + "grad_norm": 0.71875, + "learning_rate": 0.00019826380418375532, + "loss": 4.6126, + "step": 1728 + }, + { + "epoch": 0.17927664620987757, + "grad_norm": 0.6171875, + "learning_rate": 0.00019826178829052254, + "loss": 4.6404, + "step": 1729 + }, + { + "epoch": 0.17938033426436564, + "grad_norm": 0.68359375, + "learning_rate": 0.0001982597712379045, + "loss": 4.6289, + "step": 1730 + }, + { + "epoch": 0.17948402231885374, + "grad_norm": 0.6328125, + "learning_rate": 0.00019825775302592503, + "loss": 4.6162, + "step": 1731 + }, + { + "epoch": 0.1795877103733418, + "grad_norm": 0.71875, + "learning_rate": 0.00019825573365460798, + "loss": 4.615, + "step": 1732 + }, + { + "epoch": 0.17969139842782988, + "grad_norm": 0.76171875, + "learning_rate": 0.0001982537131239771, + "loss": 4.6037, + "step": 1733 + }, + { + "epoch": 0.17979508648231796, + "grad_norm": 0.63671875, + "learning_rate": 0.00019825169143405623, + "loss": 4.6038, + "step": 1734 + }, + { + "epoch": 0.17989877453680603, + "grad_norm": 0.62109375, + "learning_rate": 0.00019824966858486933, + "loss": 4.6219, + "step": 1735 + }, + { + "epoch": 0.1800024625912941, + "grad_norm": 0.7734375, + "learning_rate": 0.00019824764457644016, + "loss": 4.6097, + "step": 1736 + }, + { + "epoch": 0.18010615064578217, + "grad_norm": 0.74609375, + "learning_rate": 0.00019824561940879262, + "loss": 4.5909, + "step": 1737 + }, + { + "epoch": 0.18020983870027024, + "grad_norm": 0.70703125, + "learning_rate": 0.00019824359308195068, + "loss": 4.5772, + "step": 1738 + }, + { + "epoch": 0.1803135267547583, + "grad_norm": 0.6328125, + "learning_rate": 0.00019824156559593813, + "loss": 4.6719, + "step": 1739 + }, + { + "epoch": 0.18041721480924638, + "grad_norm": 0.65234375, + "learning_rate": 0.00019823953695077896, + "loss": 4.5728, + "step": 1740 + }, + { + "epoch": 0.18052090286373446, + "grad_norm": 0.7109375, + "learning_rate": 0.0001982375071464971, + "loss": 4.6433, + "step": 1741 + }, + { + "epoch": 0.18062459091822253, + "grad_norm": 0.74609375, + "learning_rate": 0.00019823547618311654, + "loss": 4.6155, + "step": 1742 + }, + { + "epoch": 0.1807282789727106, + "grad_norm": 0.7421875, + "learning_rate": 0.00019823344406066115, + "loss": 4.6704, + "step": 1743 + }, + { + "epoch": 0.18083196702719867, + "grad_norm": 0.88671875, + "learning_rate": 0.00019823141077915496, + "loss": 4.6568, + "step": 1744 + }, + { + "epoch": 0.18093565508168674, + "grad_norm": 1.046875, + "learning_rate": 0.00019822937633862198, + "loss": 4.6189, + "step": 1745 + }, + { + "epoch": 0.1810393431361748, + "grad_norm": 0.84765625, + "learning_rate": 0.00019822734073908618, + "loss": 4.631, + "step": 1746 + }, + { + "epoch": 0.18114303119066288, + "grad_norm": 0.63671875, + "learning_rate": 0.0001982253039805716, + "loss": 4.5911, + "step": 1747 + }, + { + "epoch": 0.18124671924515096, + "grad_norm": 0.9453125, + "learning_rate": 0.00019822326606310227, + "loss": 4.6091, + "step": 1748 + }, + { + "epoch": 0.18135040729963903, + "grad_norm": 1.125, + "learning_rate": 0.0001982212269867022, + "loss": 4.5828, + "step": 1749 + }, + { + "epoch": 0.1814540953541271, + "grad_norm": 0.8671875, + "learning_rate": 0.00019821918675139548, + "loss": 4.6346, + "step": 1750 + }, + { + "epoch": 0.18155778340861517, + "grad_norm": 0.79296875, + "learning_rate": 0.0001982171453572062, + "loss": 4.6333, + "step": 1751 + }, + { + "epoch": 0.18166147146310324, + "grad_norm": 0.86328125, + "learning_rate": 0.00019821510280415837, + "loss": 4.658, + "step": 1752 + }, + { + "epoch": 0.1817651595175913, + "grad_norm": 0.76953125, + "learning_rate": 0.00019821305909227622, + "loss": 4.6648, + "step": 1753 + }, + { + "epoch": 0.1818688475720794, + "grad_norm": 0.87109375, + "learning_rate": 0.00019821101422158374, + "loss": 4.6353, + "step": 1754 + }, + { + "epoch": 0.18197253562656748, + "grad_norm": 0.94921875, + "learning_rate": 0.00019820896819210514, + "loss": 4.5773, + "step": 1755 + }, + { + "epoch": 0.18207622368105555, + "grad_norm": 1.015625, + "learning_rate": 0.0001982069210038645, + "loss": 4.5819, + "step": 1756 + }, + { + "epoch": 0.18217991173554363, + "grad_norm": 1.1484375, + "learning_rate": 0.00019820487265688602, + "loss": 4.6235, + "step": 1757 + }, + { + "epoch": 0.1822835997900317, + "grad_norm": 0.79296875, + "learning_rate": 0.00019820282315119382, + "loss": 4.6436, + "step": 1758 + }, + { + "epoch": 0.18238728784451977, + "grad_norm": 0.8359375, + "learning_rate": 0.0001982007724868121, + "loss": 4.6404, + "step": 1759 + }, + { + "epoch": 0.18249097589900784, + "grad_norm": 0.94921875, + "learning_rate": 0.00019819872066376512, + "loss": 4.6505, + "step": 1760 + }, + { + "epoch": 0.1825946639534959, + "grad_norm": 1.0703125, + "learning_rate": 0.000198196667682077, + "loss": 4.6425, + "step": 1761 + }, + { + "epoch": 0.18269835200798398, + "grad_norm": 1.140625, + "learning_rate": 0.00019819461354177205, + "loss": 4.666, + "step": 1762 + }, + { + "epoch": 0.18280204006247205, + "grad_norm": 0.890625, + "learning_rate": 0.0001981925582428744, + "loss": 4.5952, + "step": 1763 + }, + { + "epoch": 0.18290572811696013, + "grad_norm": 0.8046875, + "learning_rate": 0.0001981905017854084, + "loss": 4.6084, + "step": 1764 + }, + { + "epoch": 0.1830094161714482, + "grad_norm": 0.828125, + "learning_rate": 0.00019818844416939822, + "loss": 4.6354, + "step": 1765 + }, + { + "epoch": 0.18311310422593627, + "grad_norm": 0.94921875, + "learning_rate": 0.00019818638539486822, + "loss": 4.6241, + "step": 1766 + }, + { + "epoch": 0.18321679228042434, + "grad_norm": 1.2890625, + "learning_rate": 0.00019818432546184266, + "loss": 4.6408, + "step": 1767 + }, + { + "epoch": 0.1833204803349124, + "grad_norm": 0.796875, + "learning_rate": 0.00019818226437034583, + "loss": 4.5669, + "step": 1768 + }, + { + "epoch": 0.18342416838940048, + "grad_norm": 0.85546875, + "learning_rate": 0.0001981802021204021, + "loss": 4.6347, + "step": 1769 + }, + { + "epoch": 0.18352785644388855, + "grad_norm": 1.0703125, + "learning_rate": 0.00019817813871203573, + "loss": 4.6193, + "step": 1770 + }, + { + "epoch": 0.18363154449837663, + "grad_norm": 1.2109375, + "learning_rate": 0.00019817607414527108, + "loss": 4.6221, + "step": 1771 + }, + { + "epoch": 0.1837352325528647, + "grad_norm": 0.7734375, + "learning_rate": 0.00019817400842013258, + "loss": 4.6161, + "step": 1772 + }, + { + "epoch": 0.18383892060735277, + "grad_norm": 0.890625, + "learning_rate": 0.00019817194153664455, + "loss": 4.5937, + "step": 1773 + }, + { + "epoch": 0.18394260866184084, + "grad_norm": 0.7421875, + "learning_rate": 0.00019816987349483136, + "loss": 4.6, + "step": 1774 + }, + { + "epoch": 0.1840462967163289, + "grad_norm": 0.9375, + "learning_rate": 0.00019816780429471743, + "loss": 4.6343, + "step": 1775 + }, + { + "epoch": 0.184149984770817, + "grad_norm": 0.83203125, + "learning_rate": 0.0001981657339363272, + "loss": 4.592, + "step": 1776 + }, + { + "epoch": 0.18425367282530508, + "grad_norm": 0.82421875, + "learning_rate": 0.00019816366241968506, + "loss": 4.6029, + "step": 1777 + }, + { + "epoch": 0.18435736087979315, + "grad_norm": 0.703125, + "learning_rate": 0.00019816158974481548, + "loss": 4.6375, + "step": 1778 + }, + { + "epoch": 0.18446104893428122, + "grad_norm": 0.66796875, + "learning_rate": 0.0001981595159117429, + "loss": 4.6048, + "step": 1779 + }, + { + "epoch": 0.1845647369887693, + "grad_norm": 0.79296875, + "learning_rate": 0.0001981574409204918, + "loss": 4.5995, + "step": 1780 + }, + { + "epoch": 0.18466842504325737, + "grad_norm": 0.76171875, + "learning_rate": 0.00019815536477108662, + "loss": 4.6003, + "step": 1781 + }, + { + "epoch": 0.18477211309774544, + "grad_norm": 0.6328125, + "learning_rate": 0.00019815328746355192, + "loss": 4.6188, + "step": 1782 + }, + { + "epoch": 0.1848758011522335, + "grad_norm": 0.62109375, + "learning_rate": 0.00019815120899791216, + "loss": 4.6491, + "step": 1783 + }, + { + "epoch": 0.18497948920672158, + "grad_norm": 0.76953125, + "learning_rate": 0.0001981491293741919, + "loss": 4.6405, + "step": 1784 + }, + { + "epoch": 0.18508317726120965, + "grad_norm": 0.6796875, + "learning_rate": 0.00019814704859241565, + "loss": 4.6342, + "step": 1785 + }, + { + "epoch": 0.18518686531569772, + "grad_norm": 0.640625, + "learning_rate": 0.00019814496665260798, + "loss": 4.6543, + "step": 1786 + }, + { + "epoch": 0.1852905533701858, + "grad_norm": 0.71484375, + "learning_rate": 0.00019814288355479346, + "loss": 4.6173, + "step": 1787 + }, + { + "epoch": 0.18539424142467387, + "grad_norm": 0.83984375, + "learning_rate": 0.00019814079929899662, + "loss": 4.6002, + "step": 1788 + }, + { + "epoch": 0.18549792947916194, + "grad_norm": 0.58984375, + "learning_rate": 0.00019813871388524215, + "loss": 4.5986, + "step": 1789 + }, + { + "epoch": 0.18560161753365, + "grad_norm": 0.6171875, + "learning_rate": 0.00019813662731355452, + "loss": 4.6107, + "step": 1790 + }, + { + "epoch": 0.18570530558813808, + "grad_norm": 0.7578125, + "learning_rate": 0.00019813453958395847, + "loss": 4.6364, + "step": 1791 + }, + { + "epoch": 0.18580899364262615, + "grad_norm": 0.75, + "learning_rate": 0.00019813245069647857, + "loss": 4.5867, + "step": 1792 + }, + { + "epoch": 0.18591268169711422, + "grad_norm": 0.7578125, + "learning_rate": 0.0001981303606511395, + "loss": 4.5961, + "step": 1793 + }, + { + "epoch": 0.1860163697516023, + "grad_norm": 0.62109375, + "learning_rate": 0.00019812826944796586, + "loss": 4.6213, + "step": 1794 + }, + { + "epoch": 0.18612005780609037, + "grad_norm": 0.94921875, + "learning_rate": 0.00019812617708698241, + "loss": 4.5809, + "step": 1795 + }, + { + "epoch": 0.18622374586057844, + "grad_norm": 1.0703125, + "learning_rate": 0.00019812408356821378, + "loss": 4.6406, + "step": 1796 + }, + { + "epoch": 0.1863274339150665, + "grad_norm": 0.69140625, + "learning_rate": 0.00019812198889168468, + "loss": 4.6107, + "step": 1797 + }, + { + "epoch": 0.1864311219695546, + "grad_norm": 0.6484375, + "learning_rate": 0.00019811989305741984, + "loss": 4.5858, + "step": 1798 + }, + { + "epoch": 0.18653481002404268, + "grad_norm": 0.8046875, + "learning_rate": 0.00019811779606544397, + "loss": 4.6217, + "step": 1799 + }, + { + "epoch": 0.18663849807853075, + "grad_norm": 0.6796875, + "learning_rate": 0.00019811569791578182, + "loss": 4.5735, + "step": 1800 + }, + { + "epoch": 0.18674218613301882, + "grad_norm": 0.5859375, + "learning_rate": 0.00019811359860845814, + "loss": 4.6024, + "step": 1801 + }, + { + "epoch": 0.1868458741875069, + "grad_norm": 0.640625, + "learning_rate": 0.00019811149814349773, + "loss": 4.5761, + "step": 1802 + }, + { + "epoch": 0.18694956224199497, + "grad_norm": 0.58203125, + "learning_rate": 0.00019810939652092533, + "loss": 4.5857, + "step": 1803 + }, + { + "epoch": 0.18705325029648304, + "grad_norm": 0.6015625, + "learning_rate": 0.00019810729374076575, + "loss": 4.633, + "step": 1804 + }, + { + "epoch": 0.1871569383509711, + "grad_norm": 0.6953125, + "learning_rate": 0.00019810518980304385, + "loss": 4.628, + "step": 1805 + }, + { + "epoch": 0.18726062640545918, + "grad_norm": 0.62109375, + "learning_rate": 0.00019810308470778436, + "loss": 4.5949, + "step": 1806 + }, + { + "epoch": 0.18736431445994725, + "grad_norm": 0.71484375, + "learning_rate": 0.0001981009784550122, + "loss": 4.5659, + "step": 1807 + }, + { + "epoch": 0.18746800251443532, + "grad_norm": 0.69921875, + "learning_rate": 0.0001980988710447522, + "loss": 4.5788, + "step": 1808 + }, + { + "epoch": 0.1875716905689234, + "grad_norm": 0.64453125, + "learning_rate": 0.0001980967624770292, + "loss": 4.5945, + "step": 1809 + }, + { + "epoch": 0.18767537862341147, + "grad_norm": 0.73046875, + "learning_rate": 0.0001980946527518681, + "loss": 4.6004, + "step": 1810 + }, + { + "epoch": 0.18777906667789954, + "grad_norm": 0.6796875, + "learning_rate": 0.0001980925418692938, + "loss": 4.6061, + "step": 1811 + }, + { + "epoch": 0.1878827547323876, + "grad_norm": 0.64453125, + "learning_rate": 0.00019809042982933117, + "loss": 4.6141, + "step": 1812 + }, + { + "epoch": 0.18798644278687568, + "grad_norm": 0.609375, + "learning_rate": 0.00019808831663200517, + "loss": 4.641, + "step": 1813 + }, + { + "epoch": 0.18809013084136375, + "grad_norm": 0.59765625, + "learning_rate": 0.0001980862022773407, + "loss": 4.542, + "step": 1814 + }, + { + "epoch": 0.18819381889585182, + "grad_norm": 0.69140625, + "learning_rate": 0.00019808408676536275, + "loss": 4.587, + "step": 1815 + }, + { + "epoch": 0.1882975069503399, + "grad_norm": 0.5859375, + "learning_rate": 0.00019808197009609624, + "loss": 4.6407, + "step": 1816 + }, + { + "epoch": 0.18840119500482796, + "grad_norm": 0.58203125, + "learning_rate": 0.00019807985226956616, + "loss": 4.6117, + "step": 1817 + }, + { + "epoch": 0.18850488305931604, + "grad_norm": 0.63671875, + "learning_rate": 0.0001980777332857975, + "loss": 4.6044, + "step": 1818 + }, + { + "epoch": 0.1886085711138041, + "grad_norm": 0.609375, + "learning_rate": 0.00019807561314481525, + "loss": 4.5919, + "step": 1819 + }, + { + "epoch": 0.1887122591682922, + "grad_norm": 0.53515625, + "learning_rate": 0.00019807349184664447, + "loss": 4.5995, + "step": 1820 + }, + { + "epoch": 0.18881594722278028, + "grad_norm": 0.71875, + "learning_rate": 0.0001980713693913101, + "loss": 4.6098, + "step": 1821 + }, + { + "epoch": 0.18891963527726835, + "grad_norm": 0.67578125, + "learning_rate": 0.0001980692457788373, + "loss": 4.6336, + "step": 1822 + }, + { + "epoch": 0.18902332333175642, + "grad_norm": 0.67578125, + "learning_rate": 0.00019806712100925103, + "loss": 4.5602, + "step": 1823 + }, + { + "epoch": 0.1891270113862445, + "grad_norm": 0.7109375, + "learning_rate": 0.00019806499508257636, + "loss": 4.585, + "step": 1824 + }, + { + "epoch": 0.18923069944073256, + "grad_norm": 0.66796875, + "learning_rate": 0.00019806286799883846, + "loss": 4.6508, + "step": 1825 + }, + { + "epoch": 0.18933438749522064, + "grad_norm": 0.58203125, + "learning_rate": 0.00019806073975806235, + "loss": 4.5661, + "step": 1826 + }, + { + "epoch": 0.1894380755497087, + "grad_norm": 0.6015625, + "learning_rate": 0.00019805861036027318, + "loss": 4.6343, + "step": 1827 + }, + { + "epoch": 0.18954176360419678, + "grad_norm": 0.765625, + "learning_rate": 0.00019805647980549606, + "loss": 4.6045, + "step": 1828 + }, + { + "epoch": 0.18964545165868485, + "grad_norm": 0.7578125, + "learning_rate": 0.0001980543480937561, + "loss": 4.6495, + "step": 1829 + }, + { + "epoch": 0.18974913971317292, + "grad_norm": 0.6171875, + "learning_rate": 0.0001980522152250785, + "loss": 4.5932, + "step": 1830 + }, + { + "epoch": 0.189852827767661, + "grad_norm": 0.5625, + "learning_rate": 0.00019805008119948842, + "loss": 4.6119, + "step": 1831 + }, + { + "epoch": 0.18995651582214906, + "grad_norm": 0.859375, + "learning_rate": 0.000198047946017011, + "loss": 4.6007, + "step": 1832 + }, + { + "epoch": 0.19006020387663713, + "grad_norm": 0.9609375, + "learning_rate": 0.00019804580967767147, + "loss": 4.5951, + "step": 1833 + }, + { + "epoch": 0.1901638919311252, + "grad_norm": 0.9609375, + "learning_rate": 0.00019804367218149505, + "loss": 4.5742, + "step": 1834 + }, + { + "epoch": 0.19026757998561328, + "grad_norm": 0.8125, + "learning_rate": 0.0001980415335285069, + "loss": 4.5971, + "step": 1835 + }, + { + "epoch": 0.19037126804010135, + "grad_norm": 0.69140625, + "learning_rate": 0.0001980393937187323, + "loss": 4.6173, + "step": 1836 + }, + { + "epoch": 0.19047495609458942, + "grad_norm": 0.72265625, + "learning_rate": 0.00019803725275219648, + "loss": 4.6155, + "step": 1837 + }, + { + "epoch": 0.1905786441490775, + "grad_norm": 0.80859375, + "learning_rate": 0.0001980351106289247, + "loss": 4.5651, + "step": 1838 + }, + { + "epoch": 0.19068233220356556, + "grad_norm": 0.62890625, + "learning_rate": 0.00019803296734894227, + "loss": 4.6264, + "step": 1839 + }, + { + "epoch": 0.19078602025805363, + "grad_norm": 0.68359375, + "learning_rate": 0.00019803082291227443, + "loss": 4.599, + "step": 1840 + }, + { + "epoch": 0.1908897083125417, + "grad_norm": 0.89453125, + "learning_rate": 0.0001980286773189465, + "loss": 4.5985, + "step": 1841 + }, + { + "epoch": 0.19099339636702978, + "grad_norm": 0.84375, + "learning_rate": 0.0001980265305689838, + "loss": 4.5839, + "step": 1842 + }, + { + "epoch": 0.19109708442151788, + "grad_norm": 0.7265625, + "learning_rate": 0.0001980243826624117, + "loss": 4.5921, + "step": 1843 + }, + { + "epoch": 0.19120077247600595, + "grad_norm": 0.91796875, + "learning_rate": 0.00019802223359925545, + "loss": 4.6503, + "step": 1844 + }, + { + "epoch": 0.19130446053049402, + "grad_norm": 1.0546875, + "learning_rate": 0.00019802008337954047, + "loss": 4.5806, + "step": 1845 + }, + { + "epoch": 0.1914081485849821, + "grad_norm": 0.97265625, + "learning_rate": 0.00019801793200329213, + "loss": 4.5883, + "step": 1846 + }, + { + "epoch": 0.19151183663947016, + "grad_norm": 0.765625, + "learning_rate": 0.00019801577947053579, + "loss": 4.6491, + "step": 1847 + }, + { + "epoch": 0.19161552469395823, + "grad_norm": 0.66015625, + "learning_rate": 0.00019801362578129685, + "loss": 4.6137, + "step": 1848 + }, + { + "epoch": 0.1917192127484463, + "grad_norm": 0.90234375, + "learning_rate": 0.00019801147093560076, + "loss": 4.6222, + "step": 1849 + }, + { + "epoch": 0.19182290080293438, + "grad_norm": 0.921875, + "learning_rate": 0.00019800931493347288, + "loss": 4.6001, + "step": 1850 + }, + { + "epoch": 0.19192658885742245, + "grad_norm": 0.79296875, + "learning_rate": 0.00019800715777493872, + "loss": 4.6157, + "step": 1851 + }, + { + "epoch": 0.19203027691191052, + "grad_norm": 0.7578125, + "learning_rate": 0.00019800499946002366, + "loss": 4.581, + "step": 1852 + }, + { + "epoch": 0.1921339649663986, + "grad_norm": 0.78125, + "learning_rate": 0.00019800283998875324, + "loss": 4.6079, + "step": 1853 + }, + { + "epoch": 0.19223765302088666, + "grad_norm": 0.75390625, + "learning_rate": 0.00019800067936115288, + "loss": 4.577, + "step": 1854 + }, + { + "epoch": 0.19234134107537473, + "grad_norm": 0.72265625, + "learning_rate": 0.00019799851757724808, + "loss": 4.575, + "step": 1855 + }, + { + "epoch": 0.1924450291298628, + "grad_norm": 0.58984375, + "learning_rate": 0.00019799635463706438, + "loss": 4.6081, + "step": 1856 + }, + { + "epoch": 0.19254871718435088, + "grad_norm": 0.71484375, + "learning_rate": 0.00019799419054062728, + "loss": 4.6058, + "step": 1857 + }, + { + "epoch": 0.19265240523883895, + "grad_norm": 0.515625, + "learning_rate": 0.00019799202528796231, + "loss": 4.618, + "step": 1858 + }, + { + "epoch": 0.19275609329332702, + "grad_norm": 0.63671875, + "learning_rate": 0.00019798985887909502, + "loss": 4.6002, + "step": 1859 + }, + { + "epoch": 0.1928597813478151, + "grad_norm": 0.61328125, + "learning_rate": 0.00019798769131405098, + "loss": 4.6159, + "step": 1860 + }, + { + "epoch": 0.19296346940230316, + "grad_norm": 0.640625, + "learning_rate": 0.00019798552259285579, + "loss": 4.5643, + "step": 1861 + }, + { + "epoch": 0.19306715745679123, + "grad_norm": 0.625, + "learning_rate": 0.000197983352715535, + "loss": 4.6061, + "step": 1862 + }, + { + "epoch": 0.1931708455112793, + "grad_norm": 0.6640625, + "learning_rate": 0.0001979811816821142, + "loss": 4.5693, + "step": 1863 + }, + { + "epoch": 0.19327453356576738, + "grad_norm": 0.61328125, + "learning_rate": 0.000197979009492619, + "loss": 4.6038, + "step": 1864 + }, + { + "epoch": 0.19337822162025547, + "grad_norm": 0.8125, + "learning_rate": 0.00019797683614707512, + "loss": 4.5976, + "step": 1865 + }, + { + "epoch": 0.19348190967474355, + "grad_norm": 0.7890625, + "learning_rate": 0.0001979746616455081, + "loss": 4.5936, + "step": 1866 + }, + { + "epoch": 0.19358559772923162, + "grad_norm": 0.8359375, + "learning_rate": 0.00019797248598794364, + "loss": 4.5944, + "step": 1867 + }, + { + "epoch": 0.1936892857837197, + "grad_norm": 0.9296875, + "learning_rate": 0.0001979703091744074, + "loss": 4.6682, + "step": 1868 + }, + { + "epoch": 0.19379297383820776, + "grad_norm": 0.92578125, + "learning_rate": 0.00019796813120492507, + "loss": 4.562, + "step": 1869 + }, + { + "epoch": 0.19389666189269583, + "grad_norm": 0.8671875, + "learning_rate": 0.00019796595207952238, + "loss": 4.5784, + "step": 1870 + }, + { + "epoch": 0.1940003499471839, + "grad_norm": 0.78125, + "learning_rate": 0.000197963771798225, + "loss": 4.5825, + "step": 1871 + }, + { + "epoch": 0.19410403800167197, + "grad_norm": 0.66015625, + "learning_rate": 0.00019796159036105865, + "loss": 4.6164, + "step": 1872 + }, + { + "epoch": 0.19420772605616005, + "grad_norm": 0.74609375, + "learning_rate": 0.00019795940776804906, + "loss": 4.5685, + "step": 1873 + }, + { + "epoch": 0.19431141411064812, + "grad_norm": 0.796875, + "learning_rate": 0.00019795722401922205, + "loss": 4.6074, + "step": 1874 + }, + { + "epoch": 0.1944151021651362, + "grad_norm": 0.8046875, + "learning_rate": 0.0001979550391146033, + "loss": 4.5823, + "step": 1875 + }, + { + "epoch": 0.19451879021962426, + "grad_norm": 0.73828125, + "learning_rate": 0.0001979528530542187, + "loss": 4.6129, + "step": 1876 + }, + { + "epoch": 0.19462247827411233, + "grad_norm": 0.75390625, + "learning_rate": 0.00019795066583809393, + "loss": 4.5628, + "step": 1877 + }, + { + "epoch": 0.1947261663286004, + "grad_norm": 0.75390625, + "learning_rate": 0.00019794847746625483, + "loss": 4.5917, + "step": 1878 + }, + { + "epoch": 0.19482985438308847, + "grad_norm": 0.64453125, + "learning_rate": 0.00019794628793872723, + "loss": 4.6206, + "step": 1879 + }, + { + "epoch": 0.19493354243757655, + "grad_norm": 0.64453125, + "learning_rate": 0.00019794409725553699, + "loss": 4.5926, + "step": 1880 + }, + { + "epoch": 0.19503723049206462, + "grad_norm": 0.71484375, + "learning_rate": 0.00019794190541670993, + "loss": 4.6335, + "step": 1881 + }, + { + "epoch": 0.1951409185465527, + "grad_norm": 0.8984375, + "learning_rate": 0.0001979397124222719, + "loss": 4.6138, + "step": 1882 + }, + { + "epoch": 0.19524460660104076, + "grad_norm": 0.765625, + "learning_rate": 0.0001979375182722488, + "loss": 4.622, + "step": 1883 + }, + { + "epoch": 0.19534829465552883, + "grad_norm": 0.796875, + "learning_rate": 0.0001979353229666665, + "loss": 4.5461, + "step": 1884 + }, + { + "epoch": 0.1954519827100169, + "grad_norm": 0.78515625, + "learning_rate": 0.00019793312650555093, + "loss": 4.5757, + "step": 1885 + }, + { + "epoch": 0.19555567076450497, + "grad_norm": 0.88671875, + "learning_rate": 0.00019793092888892799, + "loss": 4.6087, + "step": 1886 + }, + { + "epoch": 0.19565935881899307, + "grad_norm": 0.94921875, + "learning_rate": 0.00019792873011682357, + "loss": 4.5903, + "step": 1887 + }, + { + "epoch": 0.19576304687348114, + "grad_norm": 0.8515625, + "learning_rate": 0.0001979265301892637, + "loss": 4.5752, + "step": 1888 + }, + { + "epoch": 0.19586673492796922, + "grad_norm": 0.83203125, + "learning_rate": 0.00019792432910627425, + "loss": 4.5782, + "step": 1889 + }, + { + "epoch": 0.1959704229824573, + "grad_norm": 0.81640625, + "learning_rate": 0.00019792212686788122, + "loss": 4.5527, + "step": 1890 + }, + { + "epoch": 0.19607411103694536, + "grad_norm": 0.98046875, + "learning_rate": 0.00019791992347411064, + "loss": 4.6206, + "step": 1891 + }, + { + "epoch": 0.19617779909143343, + "grad_norm": 1.28125, + "learning_rate": 0.00019791771892498843, + "loss": 4.6059, + "step": 1892 + }, + { + "epoch": 0.1962814871459215, + "grad_norm": 0.8125, + "learning_rate": 0.00019791551322054067, + "loss": 4.5715, + "step": 1893 + }, + { + "epoch": 0.19638517520040957, + "grad_norm": 0.86328125, + "learning_rate": 0.00019791330636079332, + "loss": 4.5888, + "step": 1894 + }, + { + "epoch": 0.19648886325489764, + "grad_norm": 0.921875, + "learning_rate": 0.0001979110983457725, + "loss": 4.5763, + "step": 1895 + }, + { + "epoch": 0.19659255130938572, + "grad_norm": 0.95703125, + "learning_rate": 0.0001979088891755042, + "loss": 4.5754, + "step": 1896 + }, + { + "epoch": 0.1966962393638738, + "grad_norm": 0.88671875, + "learning_rate": 0.00019790667885001448, + "loss": 4.6025, + "step": 1897 + }, + { + "epoch": 0.19679992741836186, + "grad_norm": 0.8671875, + "learning_rate": 0.00019790446736932946, + "loss": 4.6139, + "step": 1898 + }, + { + "epoch": 0.19690361547284993, + "grad_norm": 0.96875, + "learning_rate": 0.0001979022547334752, + "loss": 4.5304, + "step": 1899 + }, + { + "epoch": 0.197007303527338, + "grad_norm": 1.015625, + "learning_rate": 0.00019790004094247783, + "loss": 4.6188, + "step": 1900 + }, + { + "epoch": 0.19711099158182607, + "grad_norm": 0.81640625, + "learning_rate": 0.00019789782599636348, + "loss": 4.6019, + "step": 1901 + }, + { + "epoch": 0.19721467963631414, + "grad_norm": 0.8359375, + "learning_rate": 0.00019789560989515824, + "loss": 4.5996, + "step": 1902 + }, + { + "epoch": 0.19731836769080222, + "grad_norm": 1.1015625, + "learning_rate": 0.0001978933926388883, + "loss": 4.585, + "step": 1903 + }, + { + "epoch": 0.1974220557452903, + "grad_norm": 0.8828125, + "learning_rate": 0.0001978911742275798, + "loss": 4.6249, + "step": 1904 + }, + { + "epoch": 0.19752574379977836, + "grad_norm": 0.83984375, + "learning_rate": 0.0001978889546612589, + "loss": 4.5999, + "step": 1905 + }, + { + "epoch": 0.19762943185426643, + "grad_norm": 0.83984375, + "learning_rate": 0.00019788673393995182, + "loss": 4.6135, + "step": 1906 + }, + { + "epoch": 0.1977331199087545, + "grad_norm": 0.77734375, + "learning_rate": 0.00019788451206368475, + "loss": 4.6007, + "step": 1907 + }, + { + "epoch": 0.19783680796324257, + "grad_norm": 0.83984375, + "learning_rate": 0.00019788228903248393, + "loss": 4.6105, + "step": 1908 + }, + { + "epoch": 0.19794049601773064, + "grad_norm": 0.76171875, + "learning_rate": 0.00019788006484637553, + "loss": 4.6072, + "step": 1909 + }, + { + "epoch": 0.19804418407221874, + "grad_norm": 0.76171875, + "learning_rate": 0.00019787783950538587, + "loss": 4.59, + "step": 1910 + }, + { + "epoch": 0.19814787212670681, + "grad_norm": 0.78515625, + "learning_rate": 0.0001978756130095411, + "loss": 4.5976, + "step": 1911 + }, + { + "epoch": 0.19825156018119489, + "grad_norm": 0.6953125, + "learning_rate": 0.0001978733853588676, + "loss": 4.5981, + "step": 1912 + }, + { + "epoch": 0.19835524823568296, + "grad_norm": 0.6875, + "learning_rate": 0.00019787115655339163, + "loss": 4.6151, + "step": 1913 + }, + { + "epoch": 0.19845893629017103, + "grad_norm": 0.765625, + "learning_rate": 0.00019786892659313945, + "loss": 4.6266, + "step": 1914 + }, + { + "epoch": 0.1985626243446591, + "grad_norm": 0.70703125, + "learning_rate": 0.00019786669547813737, + "loss": 4.6112, + "step": 1915 + }, + { + "epoch": 0.19866631239914717, + "grad_norm": 0.72265625, + "learning_rate": 0.00019786446320841172, + "loss": 4.5899, + "step": 1916 + }, + { + "epoch": 0.19877000045363524, + "grad_norm": 0.84765625, + "learning_rate": 0.00019786222978398889, + "loss": 4.546, + "step": 1917 + }, + { + "epoch": 0.19887368850812331, + "grad_norm": 0.734375, + "learning_rate": 0.00019785999520489518, + "loss": 4.537, + "step": 1918 + }, + { + "epoch": 0.19897737656261139, + "grad_norm": 0.68359375, + "learning_rate": 0.00019785775947115696, + "loss": 4.5717, + "step": 1919 + }, + { + "epoch": 0.19908106461709946, + "grad_norm": 0.8828125, + "learning_rate": 0.00019785552258280064, + "loss": 4.5215, + "step": 1920 + }, + { + "epoch": 0.19918475267158753, + "grad_norm": 1.1171875, + "learning_rate": 0.00019785328453985257, + "loss": 4.6153, + "step": 1921 + }, + { + "epoch": 0.1992884407260756, + "grad_norm": 0.6875, + "learning_rate": 0.00019785104534233918, + "loss": 4.5823, + "step": 1922 + }, + { + "epoch": 0.19939212878056367, + "grad_norm": 0.73046875, + "learning_rate": 0.00019784880499028692, + "loss": 4.5774, + "step": 1923 + }, + { + "epoch": 0.19949581683505174, + "grad_norm": 1.0078125, + "learning_rate": 0.00019784656348372214, + "loss": 4.6047, + "step": 1924 + }, + { + "epoch": 0.1995995048895398, + "grad_norm": 0.87890625, + "learning_rate": 0.00019784432082267138, + "loss": 4.612, + "step": 1925 + }, + { + "epoch": 0.19970319294402789, + "grad_norm": 0.83984375, + "learning_rate": 0.00019784207700716103, + "loss": 4.6276, + "step": 1926 + }, + { + "epoch": 0.19980688099851596, + "grad_norm": 0.73828125, + "learning_rate": 0.00019783983203721758, + "loss": 4.5811, + "step": 1927 + }, + { + "epoch": 0.19991056905300403, + "grad_norm": 0.953125, + "learning_rate": 0.0001978375859128676, + "loss": 4.5809, + "step": 1928 + }, + { + "epoch": 0.2000142571074921, + "grad_norm": 1.0859375, + "learning_rate": 0.0001978353386341375, + "loss": 4.6231, + "step": 1929 + }, + { + "epoch": 0.20011794516198017, + "grad_norm": 0.796875, + "learning_rate": 0.00019783309020105375, + "loss": 4.5642, + "step": 1930 + }, + { + "epoch": 0.20022163321646824, + "grad_norm": 0.78515625, + "learning_rate": 0.00019783084061364303, + "loss": 4.581, + "step": 1931 + }, + { + "epoch": 0.20032532127095634, + "grad_norm": 1.1875, + "learning_rate": 0.00019782858987193178, + "loss": 4.5868, + "step": 1932 + }, + { + "epoch": 0.2004290093254444, + "grad_norm": 0.81640625, + "learning_rate": 0.00019782633797594659, + "loss": 4.617, + "step": 1933 + }, + { + "epoch": 0.20053269737993248, + "grad_norm": 0.8046875, + "learning_rate": 0.00019782408492571399, + "loss": 4.6109, + "step": 1934 + }, + { + "epoch": 0.20063638543442056, + "grad_norm": 1.0234375, + "learning_rate": 0.00019782183072126062, + "loss": 4.6252, + "step": 1935 + }, + { + "epoch": 0.20074007348890863, + "grad_norm": 1.0625, + "learning_rate": 0.00019781957536261303, + "loss": 4.6283, + "step": 1936 + }, + { + "epoch": 0.2008437615433967, + "grad_norm": 0.99609375, + "learning_rate": 0.00019781731884979786, + "loss": 4.606, + "step": 1937 + }, + { + "epoch": 0.20094744959788477, + "grad_norm": 0.86328125, + "learning_rate": 0.00019781506118284173, + "loss": 4.6014, + "step": 1938 + }, + { + "epoch": 0.20105113765237284, + "grad_norm": 0.7421875, + "learning_rate": 0.00019781280236177127, + "loss": 4.6413, + "step": 1939 + }, + { + "epoch": 0.2011548257068609, + "grad_norm": 0.7734375, + "learning_rate": 0.0001978105423866131, + "loss": 4.5813, + "step": 1940 + }, + { + "epoch": 0.20125851376134898, + "grad_norm": 0.66796875, + "learning_rate": 0.00019780828125739398, + "loss": 4.5541, + "step": 1941 + }, + { + "epoch": 0.20136220181583706, + "grad_norm": 0.8515625, + "learning_rate": 0.0001978060189741405, + "loss": 4.6159, + "step": 1942 + }, + { + "epoch": 0.20146588987032513, + "grad_norm": 0.97265625, + "learning_rate": 0.00019780375553687937, + "loss": 4.6638, + "step": 1943 + }, + { + "epoch": 0.2015695779248132, + "grad_norm": 0.96484375, + "learning_rate": 0.00019780149094563733, + "loss": 4.6151, + "step": 1944 + }, + { + "epoch": 0.20167326597930127, + "grad_norm": 0.95703125, + "learning_rate": 0.00019779922520044108, + "loss": 4.5835, + "step": 1945 + }, + { + "epoch": 0.20177695403378934, + "grad_norm": 0.96875, + "learning_rate": 0.00019779695830131732, + "loss": 4.5769, + "step": 1946 + }, + { + "epoch": 0.2018806420882774, + "grad_norm": 0.96875, + "learning_rate": 0.00019779469024829285, + "loss": 4.6143, + "step": 1947 + }, + { + "epoch": 0.20198433014276548, + "grad_norm": 1.1796875, + "learning_rate": 0.0001977924210413944, + "loss": 4.604, + "step": 1948 + }, + { + "epoch": 0.20208801819725355, + "grad_norm": 0.81640625, + "learning_rate": 0.00019779015068064877, + "loss": 4.5636, + "step": 1949 + }, + { + "epoch": 0.20219170625174163, + "grad_norm": 0.91796875, + "learning_rate": 0.00019778787916608273, + "loss": 4.5581, + "step": 1950 + }, + { + "epoch": 0.2022953943062297, + "grad_norm": 1.1171875, + "learning_rate": 0.00019778560649772305, + "loss": 4.5624, + "step": 1951 + }, + { + "epoch": 0.20239908236071777, + "grad_norm": 0.77734375, + "learning_rate": 0.00019778333267559658, + "loss": 4.6108, + "step": 1952 + }, + { + "epoch": 0.20250277041520584, + "grad_norm": 1.015625, + "learning_rate": 0.00019778105769973018, + "loss": 4.5904, + "step": 1953 + }, + { + "epoch": 0.20260645846969394, + "grad_norm": 1.3203125, + "learning_rate": 0.00019777878157015063, + "loss": 4.5902, + "step": 1954 + }, + { + "epoch": 0.202710146524182, + "grad_norm": 0.67578125, + "learning_rate": 0.00019777650428688483, + "loss": 4.6012, + "step": 1955 + }, + { + "epoch": 0.20281383457867008, + "grad_norm": 1.515625, + "learning_rate": 0.00019777422584995965, + "loss": 4.544, + "step": 1956 + }, + { + "epoch": 0.20291752263315815, + "grad_norm": 0.76171875, + "learning_rate": 0.00019777194625940193, + "loss": 4.6165, + "step": 1957 + }, + { + "epoch": 0.20302121068764623, + "grad_norm": 1.59375, + "learning_rate": 0.00019776966551523858, + "loss": 4.5806, + "step": 1958 + }, + { + "epoch": 0.2031248987421343, + "grad_norm": 0.9140625, + "learning_rate": 0.00019776738361749655, + "loss": 4.5912, + "step": 1959 + }, + { + "epoch": 0.20322858679662237, + "grad_norm": 1.9375, + "learning_rate": 0.00019776510056620272, + "loss": 4.6098, + "step": 1960 + }, + { + "epoch": 0.20333227485111044, + "grad_norm": 1.453125, + "learning_rate": 0.00019776281636138407, + "loss": 4.5712, + "step": 1961 + }, + { + "epoch": 0.2034359629055985, + "grad_norm": 2.65625, + "learning_rate": 0.0001977605310030675, + "loss": 4.6034, + "step": 1962 + }, + { + "epoch": 0.20353965096008658, + "grad_norm": 2.484375, + "learning_rate": 0.00019775824449128003, + "loss": 4.6355, + "step": 1963 + }, + { + "epoch": 0.20364333901457465, + "grad_norm": 1.40625, + "learning_rate": 0.0001977559568260486, + "loss": 4.5732, + "step": 1964 + }, + { + "epoch": 0.20374702706906272, + "grad_norm": 1.90625, + "learning_rate": 0.0001977536680074002, + "loss": 4.5894, + "step": 1965 + }, + { + "epoch": 0.2038507151235508, + "grad_norm": 1.59375, + "learning_rate": 0.00019775137803536186, + "loss": 4.6234, + "step": 1966 + }, + { + "epoch": 0.20395440317803887, + "grad_norm": 1.859375, + "learning_rate": 0.00019774908690996056, + "loss": 4.646, + "step": 1967 + }, + { + "epoch": 0.20405809123252694, + "grad_norm": 1.6875, + "learning_rate": 0.0001977467946312234, + "loss": 4.518, + "step": 1968 + }, + { + "epoch": 0.204161779287015, + "grad_norm": 1.5546875, + "learning_rate": 0.00019774450119917737, + "loss": 4.6341, + "step": 1969 + }, + { + "epoch": 0.20426546734150308, + "grad_norm": 1.28125, + "learning_rate": 0.00019774220661384956, + "loss": 4.5852, + "step": 1970 + }, + { + "epoch": 0.20436915539599115, + "grad_norm": 1.5, + "learning_rate": 0.000197739910875267, + "loss": 4.5918, + "step": 1971 + }, + { + "epoch": 0.20447284345047922, + "grad_norm": 1.078125, + "learning_rate": 0.00019773761398345682, + "loss": 4.5398, + "step": 1972 + }, + { + "epoch": 0.2045765315049673, + "grad_norm": 1.4375, + "learning_rate": 0.00019773531593844613, + "loss": 4.5249, + "step": 1973 + }, + { + "epoch": 0.20468021955945537, + "grad_norm": 1.2421875, + "learning_rate": 0.00019773301674026197, + "loss": 4.5895, + "step": 1974 + }, + { + "epoch": 0.20478390761394344, + "grad_norm": 1.234375, + "learning_rate": 0.00019773071638893157, + "loss": 4.6123, + "step": 1975 + }, + { + "epoch": 0.2048875956684315, + "grad_norm": 1.2265625, + "learning_rate": 0.00019772841488448198, + "loss": 4.5979, + "step": 1976 + }, + { + "epoch": 0.2049912837229196, + "grad_norm": 0.90234375, + "learning_rate": 0.00019772611222694045, + "loss": 4.5542, + "step": 1977 + }, + { + "epoch": 0.20509497177740768, + "grad_norm": 1.1015625, + "learning_rate": 0.00019772380841633406, + "loss": 4.5651, + "step": 1978 + }, + { + "epoch": 0.20519865983189575, + "grad_norm": 0.83984375, + "learning_rate": 0.00019772150345269003, + "loss": 4.5454, + "step": 1979 + }, + { + "epoch": 0.20530234788638382, + "grad_norm": 1.0859375, + "learning_rate": 0.00019771919733603557, + "loss": 4.6146, + "step": 1980 + }, + { + "epoch": 0.2054060359408719, + "grad_norm": 0.921875, + "learning_rate": 0.00019771689006639785, + "loss": 4.5667, + "step": 1981 + }, + { + "epoch": 0.20550972399535997, + "grad_norm": 0.9765625, + "learning_rate": 0.00019771458164380415, + "loss": 4.5801, + "step": 1982 + }, + { + "epoch": 0.20561341204984804, + "grad_norm": 1.234375, + "learning_rate": 0.00019771227206828167, + "loss": 4.6104, + "step": 1983 + }, + { + "epoch": 0.2057171001043361, + "grad_norm": 0.84765625, + "learning_rate": 0.00019770996133985767, + "loss": 4.6062, + "step": 1984 + }, + { + "epoch": 0.20582078815882418, + "grad_norm": 1.5, + "learning_rate": 0.00019770764945855937, + "loss": 4.5738, + "step": 1985 + }, + { + "epoch": 0.20592447621331225, + "grad_norm": 0.98828125, + "learning_rate": 0.00019770533642441413, + "loss": 4.5966, + "step": 1986 + }, + { + "epoch": 0.20602816426780032, + "grad_norm": 2.078125, + "learning_rate": 0.0001977030222374492, + "loss": 4.6273, + "step": 1987 + }, + { + "epoch": 0.2061318523222884, + "grad_norm": 1.765625, + "learning_rate": 0.00019770070689769184, + "loss": 4.5446, + "step": 1988 + }, + { + "epoch": 0.20623554037677647, + "grad_norm": 1.921875, + "learning_rate": 0.00019769839040516946, + "loss": 4.5785, + "step": 1989 + }, + { + "epoch": 0.20633922843126454, + "grad_norm": 1.7578125, + "learning_rate": 0.00019769607275990934, + "loss": 4.5787, + "step": 1990 + }, + { + "epoch": 0.2064429164857526, + "grad_norm": 1.546875, + "learning_rate": 0.00019769375396193881, + "loss": 4.5644, + "step": 1991 + }, + { + "epoch": 0.20654660454024068, + "grad_norm": 1.421875, + "learning_rate": 0.00019769143401128525, + "loss": 4.5818, + "step": 1992 + }, + { + "epoch": 0.20665029259472875, + "grad_norm": 1.3203125, + "learning_rate": 0.00019768911290797604, + "loss": 4.5749, + "step": 1993 + }, + { + "epoch": 0.20675398064921682, + "grad_norm": 1.2890625, + "learning_rate": 0.00019768679065203855, + "loss": 4.5796, + "step": 1994 + }, + { + "epoch": 0.2068576687037049, + "grad_norm": 1.03125, + "learning_rate": 0.00019768446724350024, + "loss": 4.5841, + "step": 1995 + }, + { + "epoch": 0.20696135675819297, + "grad_norm": 1.4453125, + "learning_rate": 0.00019768214268238842, + "loss": 4.569, + "step": 1996 + }, + { + "epoch": 0.20706504481268104, + "grad_norm": 1.046875, + "learning_rate": 0.00019767981696873057, + "loss": 4.6043, + "step": 1997 + }, + { + "epoch": 0.2071687328671691, + "grad_norm": 1.7890625, + "learning_rate": 0.00019767749010255416, + "loss": 4.6222, + "step": 1998 + }, + { + "epoch": 0.2072724209216572, + "grad_norm": 1.3984375, + "learning_rate": 0.0001976751620838866, + "loss": 4.5898, + "step": 1999 + }, + { + "epoch": 0.20737610897614528, + "grad_norm": 2.09375, + "learning_rate": 0.00019767283291275537, + "loss": 4.5915, + "step": 2000 + }, + { + "epoch": 0.20747979703063335, + "grad_norm": 1.640625, + "learning_rate": 0.00019767050258918798, + "loss": 4.5657, + "step": 2001 + }, + { + "epoch": 0.20758348508512142, + "grad_norm": 2.0, + "learning_rate": 0.00019766817111321186, + "loss": 4.6221, + "step": 2002 + }, + { + "epoch": 0.2076871731396095, + "grad_norm": 1.7265625, + "learning_rate": 0.0001976658384848546, + "loss": 4.6072, + "step": 2003 + }, + { + "epoch": 0.20779086119409756, + "grad_norm": 1.796875, + "learning_rate": 0.00019766350470414365, + "loss": 4.576, + "step": 2004 + }, + { + "epoch": 0.20789454924858564, + "grad_norm": 1.6328125, + "learning_rate": 0.00019766116977110661, + "loss": 4.5551, + "step": 2005 + }, + { + "epoch": 0.2079982373030737, + "grad_norm": 1.6015625, + "learning_rate": 0.000197658833685771, + "loss": 4.5634, + "step": 2006 + }, + { + "epoch": 0.20810192535756178, + "grad_norm": 1.4375, + "learning_rate": 0.00019765649644816436, + "loss": 4.5981, + "step": 2007 + }, + { + "epoch": 0.20820561341204985, + "grad_norm": 1.6484375, + "learning_rate": 0.0001976541580583143, + "loss": 4.5633, + "step": 2008 + }, + { + "epoch": 0.20830930146653792, + "grad_norm": 1.3828125, + "learning_rate": 0.0001976518185162484, + "loss": 4.6089, + "step": 2009 + }, + { + "epoch": 0.208412989521026, + "grad_norm": 1.8125, + "learning_rate": 0.00019764947782199426, + "loss": 4.5937, + "step": 2010 + }, + { + "epoch": 0.20851667757551406, + "grad_norm": 1.4140625, + "learning_rate": 0.00019764713597557952, + "loss": 4.5695, + "step": 2011 + }, + { + "epoch": 0.20862036563000214, + "grad_norm": 2.078125, + "learning_rate": 0.00019764479297703177, + "loss": 4.6169, + "step": 2012 + }, + { + "epoch": 0.2087240536844902, + "grad_norm": 1.8984375, + "learning_rate": 0.00019764244882637867, + "loss": 4.6056, + "step": 2013 + }, + { + "epoch": 0.20882774173897828, + "grad_norm": 1.40625, + "learning_rate": 0.00019764010352364792, + "loss": 4.5951, + "step": 2014 + }, + { + "epoch": 0.20893142979346635, + "grad_norm": 1.3046875, + "learning_rate": 0.00019763775706886714, + "loss": 4.5894, + "step": 2015 + }, + { + "epoch": 0.20903511784795442, + "grad_norm": 1.375, + "learning_rate": 0.00019763540946206404, + "loss": 4.6161, + "step": 2016 + }, + { + "epoch": 0.2091388059024425, + "grad_norm": 1.0625, + "learning_rate": 0.00019763306070326632, + "loss": 4.5434, + "step": 2017 + }, + { + "epoch": 0.20924249395693056, + "grad_norm": 1.7734375, + "learning_rate": 0.00019763071079250164, + "loss": 4.5784, + "step": 2018 + }, + { + "epoch": 0.20934618201141864, + "grad_norm": 1.453125, + "learning_rate": 0.00019762835972979783, + "loss": 4.5505, + "step": 2019 + }, + { + "epoch": 0.2094498700659067, + "grad_norm": 1.671875, + "learning_rate": 0.00019762600751518255, + "loss": 4.5706, + "step": 2020 + }, + { + "epoch": 0.2095535581203948, + "grad_norm": 1.46875, + "learning_rate": 0.00019762365414868356, + "loss": 4.5687, + "step": 2021 + }, + { + "epoch": 0.20965724617488288, + "grad_norm": 1.78125, + "learning_rate": 0.00019762129963032867, + "loss": 4.5933, + "step": 2022 + }, + { + "epoch": 0.20976093422937095, + "grad_norm": 1.4609375, + "learning_rate": 0.0001976189439601456, + "loss": 4.6114, + "step": 2023 + }, + { + "epoch": 0.20986462228385902, + "grad_norm": 1.8203125, + "learning_rate": 0.0001976165871381622, + "loss": 4.5804, + "step": 2024 + }, + { + "epoch": 0.2099683103383471, + "grad_norm": 1.5625, + "learning_rate": 0.0001976142291644063, + "loss": 4.6148, + "step": 2025 + }, + { + "epoch": 0.21007199839283516, + "grad_norm": 1.8828125, + "learning_rate": 0.00019761187003890563, + "loss": 4.5602, + "step": 2026 + }, + { + "epoch": 0.21017568644732323, + "grad_norm": 1.6796875, + "learning_rate": 0.0001976095097616881, + "loss": 4.6301, + "step": 2027 + }, + { + "epoch": 0.2102793745018113, + "grad_norm": 1.5703125, + "learning_rate": 0.00019760714833278148, + "loss": 4.5808, + "step": 2028 + }, + { + "epoch": 0.21038306255629938, + "grad_norm": 1.4453125, + "learning_rate": 0.00019760478575221372, + "loss": 4.5701, + "step": 2029 + }, + { + "epoch": 0.21048675061078745, + "grad_norm": 1.40625, + "learning_rate": 0.00019760242202001267, + "loss": 4.5741, + "step": 2030 + }, + { + "epoch": 0.21059043866527552, + "grad_norm": 1.1875, + "learning_rate": 0.00019760005713620623, + "loss": 4.5976, + "step": 2031 + }, + { + "epoch": 0.2106941267197636, + "grad_norm": 1.3359375, + "learning_rate": 0.00019759769110082223, + "loss": 4.6137, + "step": 2032 + }, + { + "epoch": 0.21079781477425166, + "grad_norm": 1.0859375, + "learning_rate": 0.00019759532391388867, + "loss": 4.5868, + "step": 2033 + }, + { + "epoch": 0.21090150282873973, + "grad_norm": 1.8125, + "learning_rate": 0.00019759295557543344, + "loss": 4.5967, + "step": 2034 + }, + { + "epoch": 0.2110051908832278, + "grad_norm": 1.421875, + "learning_rate": 0.0001975905860854845, + "loss": 4.5903, + "step": 2035 + }, + { + "epoch": 0.21110887893771588, + "grad_norm": 2.015625, + "learning_rate": 0.0001975882154440698, + "loss": 4.6173, + "step": 2036 + }, + { + "epoch": 0.21121256699220395, + "grad_norm": 1.8984375, + "learning_rate": 0.00019758584365121734, + "loss": 4.6118, + "step": 2037 + }, + { + "epoch": 0.21131625504669202, + "grad_norm": 1.4609375, + "learning_rate": 0.00019758347070695503, + "loss": 4.5663, + "step": 2038 + }, + { + "epoch": 0.2114199431011801, + "grad_norm": 1.4375, + "learning_rate": 0.00019758109661131092, + "loss": 4.5646, + "step": 2039 + }, + { + "epoch": 0.21152363115566816, + "grad_norm": 1.296875, + "learning_rate": 0.00019757872136431305, + "loss": 4.6037, + "step": 2040 + }, + { + "epoch": 0.21162731921015623, + "grad_norm": 1.1640625, + "learning_rate": 0.0001975763449659894, + "loss": 4.5981, + "step": 2041 + }, + { + "epoch": 0.2117310072646443, + "grad_norm": 1.09375, + "learning_rate": 0.00019757396741636803, + "loss": 4.6128, + "step": 2042 + }, + { + "epoch": 0.21183469531913238, + "grad_norm": 1.1875, + "learning_rate": 0.00019757158871547699, + "loss": 4.523, + "step": 2043 + }, + { + "epoch": 0.21193838337362048, + "grad_norm": 0.94921875, + "learning_rate": 0.00019756920886334432, + "loss": 4.5688, + "step": 2044 + }, + { + "epoch": 0.21204207142810855, + "grad_norm": 1.3671875, + "learning_rate": 0.00019756682785999812, + "loss": 4.5693, + "step": 2045 + }, + { + "epoch": 0.21214575948259662, + "grad_norm": 0.96875, + "learning_rate": 0.0001975644457054665, + "loss": 4.5807, + "step": 2046 + }, + { + "epoch": 0.2122494475370847, + "grad_norm": 1.7578125, + "learning_rate": 0.00019756206239977751, + "loss": 4.5522, + "step": 2047 + }, + { + "epoch": 0.21235313559157276, + "grad_norm": 1.453125, + "learning_rate": 0.00019755967794295938, + "loss": 4.5546, + "step": 2048 + }, + { + "epoch": 0.21245682364606083, + "grad_norm": 1.65625, + "learning_rate": 0.00019755729233504013, + "loss": 4.573, + "step": 2049 + }, + { + "epoch": 0.2125605117005489, + "grad_norm": 1.390625, + "learning_rate": 0.00019755490557604795, + "loss": 4.5695, + "step": 2050 + }, + { + "epoch": 0.21266419975503698, + "grad_norm": 1.859375, + "learning_rate": 0.000197552517666011, + "loss": 4.5793, + "step": 2051 + }, + { + "epoch": 0.21276788780952505, + "grad_norm": 1.4375, + "learning_rate": 0.00019755012860495747, + "loss": 4.601, + "step": 2052 + }, + { + "epoch": 0.21287157586401312, + "grad_norm": 2.109375, + "learning_rate": 0.00019754773839291556, + "loss": 4.6343, + "step": 2053 + }, + { + "epoch": 0.2129752639185012, + "grad_norm": 1.96875, + "learning_rate": 0.00019754534702991344, + "loss": 4.6069, + "step": 2054 + }, + { + "epoch": 0.21307895197298926, + "grad_norm": 1.25, + "learning_rate": 0.00019754295451597932, + "loss": 4.5174, + "step": 2055 + }, + { + "epoch": 0.21318264002747733, + "grad_norm": 1.296875, + "learning_rate": 0.00019754056085114144, + "loss": 4.5313, + "step": 2056 + }, + { + "epoch": 0.2132863280819654, + "grad_norm": 1.15625, + "learning_rate": 0.0001975381660354281, + "loss": 4.5743, + "step": 2057 + }, + { + "epoch": 0.21339001613645348, + "grad_norm": 1.140625, + "learning_rate": 0.00019753577006886744, + "loss": 4.5637, + "step": 2058 + }, + { + "epoch": 0.21349370419094155, + "grad_norm": 1.1171875, + "learning_rate": 0.0001975333729514878, + "loss": 4.5662, + "step": 2059 + }, + { + "epoch": 0.21359739224542962, + "grad_norm": 0.984375, + "learning_rate": 0.00019753097468331747, + "loss": 4.5688, + "step": 2060 + }, + { + "epoch": 0.2137010802999177, + "grad_norm": 1.15625, + "learning_rate": 0.00019752857526438472, + "loss": 4.5697, + "step": 2061 + }, + { + "epoch": 0.21380476835440576, + "grad_norm": 0.83203125, + "learning_rate": 0.0001975261746947179, + "loss": 4.6017, + "step": 2062 + }, + { + "epoch": 0.21390845640889383, + "grad_norm": 1.203125, + "learning_rate": 0.0001975237729743453, + "loss": 4.5543, + "step": 2063 + }, + { + "epoch": 0.2140121444633819, + "grad_norm": 0.859375, + "learning_rate": 0.00019752137010329527, + "loss": 4.5657, + "step": 2064 + }, + { + "epoch": 0.21411583251786998, + "grad_norm": 1.4609375, + "learning_rate": 0.00019751896608159614, + "loss": 4.5717, + "step": 2065 + }, + { + "epoch": 0.21421952057235807, + "grad_norm": 1.1015625, + "learning_rate": 0.0001975165609092763, + "loss": 4.5827, + "step": 2066 + }, + { + "epoch": 0.21432320862684615, + "grad_norm": 1.90625, + "learning_rate": 0.00019751415458636414, + "loss": 4.5571, + "step": 2067 + }, + { + "epoch": 0.21442689668133422, + "grad_norm": 1.90625, + "learning_rate": 0.000197511747112888, + "loss": 4.6079, + "step": 2068 + }, + { + "epoch": 0.2145305847358223, + "grad_norm": 0.91796875, + "learning_rate": 0.00019750933848887634, + "loss": 4.5673, + "step": 2069 + }, + { + "epoch": 0.21463427279031036, + "grad_norm": 1.5390625, + "learning_rate": 0.00019750692871435755, + "loss": 4.5605, + "step": 2070 + }, + { + "epoch": 0.21473796084479843, + "grad_norm": 1.1015625, + "learning_rate": 0.00019750451778936007, + "loss": 4.5711, + "step": 2071 + }, + { + "epoch": 0.2148416488992865, + "grad_norm": 1.578125, + "learning_rate": 0.00019750210571391232, + "loss": 4.563, + "step": 2072 + }, + { + "epoch": 0.21494533695377457, + "grad_norm": 1.5078125, + "learning_rate": 0.00019749969248804283, + "loss": 4.5555, + "step": 2073 + }, + { + "epoch": 0.21504902500826265, + "grad_norm": 1.015625, + "learning_rate": 0.00019749727811178, + "loss": 4.589, + "step": 2074 + }, + { + "epoch": 0.21515271306275072, + "grad_norm": 1.421875, + "learning_rate": 0.00019749486258515238, + "loss": 4.5742, + "step": 2075 + }, + { + "epoch": 0.2152564011172388, + "grad_norm": 1.0859375, + "learning_rate": 0.00019749244590818842, + "loss": 4.5774, + "step": 2076 + }, + { + "epoch": 0.21536008917172686, + "grad_norm": 1.7421875, + "learning_rate": 0.00019749002808091667, + "loss": 4.5611, + "step": 2077 + }, + { + "epoch": 0.21546377722621493, + "grad_norm": 1.5234375, + "learning_rate": 0.00019748760910336558, + "loss": 4.551, + "step": 2078 + }, + { + "epoch": 0.215567465280703, + "grad_norm": 1.53125, + "learning_rate": 0.00019748518897556383, + "loss": 4.6089, + "step": 2079 + }, + { + "epoch": 0.21567115333519107, + "grad_norm": 1.1796875, + "learning_rate": 0.00019748276769753983, + "loss": 4.5747, + "step": 2080 + }, + { + "epoch": 0.21577484138967915, + "grad_norm": 1.515625, + "learning_rate": 0.00019748034526932226, + "loss": 4.5606, + "step": 2081 + }, + { + "epoch": 0.21587852944416722, + "grad_norm": 0.9375, + "learning_rate": 0.00019747792169093963, + "loss": 4.5935, + "step": 2082 + }, + { + "epoch": 0.2159822174986553, + "grad_norm": 1.8125, + "learning_rate": 0.00019747549696242059, + "loss": 4.6101, + "step": 2083 + }, + { + "epoch": 0.21608590555314336, + "grad_norm": 1.59375, + "learning_rate": 0.0001974730710837937, + "loss": 4.5796, + "step": 2084 + }, + { + "epoch": 0.21618959360763143, + "grad_norm": 1.7890625, + "learning_rate": 0.00019747064405508763, + "loss": 4.5739, + "step": 2085 + }, + { + "epoch": 0.2162932816621195, + "grad_norm": 1.5390625, + "learning_rate": 0.00019746821587633099, + "loss": 4.5887, + "step": 2086 + }, + { + "epoch": 0.21639696971660757, + "grad_norm": 1.8359375, + "learning_rate": 0.0001974657865475524, + "loss": 4.544, + "step": 2087 + }, + { + "epoch": 0.21650065777109567, + "grad_norm": 1.6875, + "learning_rate": 0.00019746335606878054, + "loss": 4.5873, + "step": 2088 + }, + { + "epoch": 0.21660434582558374, + "grad_norm": 1.625, + "learning_rate": 0.00019746092444004412, + "loss": 4.6125, + "step": 2089 + }, + { + "epoch": 0.21670803388007182, + "grad_norm": 1.453125, + "learning_rate": 0.00019745849166137185, + "loss": 4.5585, + "step": 2090 + }, + { + "epoch": 0.2168117219345599, + "grad_norm": 1.453125, + "learning_rate": 0.00019745605773279236, + "loss": 4.5969, + "step": 2091 + }, + { + "epoch": 0.21691540998904796, + "grad_norm": 1.203125, + "learning_rate": 0.00019745362265433442, + "loss": 4.5379, + "step": 2092 + }, + { + "epoch": 0.21701909804353603, + "grad_norm": 1.7578125, + "learning_rate": 0.00019745118642602673, + "loss": 4.5126, + "step": 2093 + }, + { + "epoch": 0.2171227860980241, + "grad_norm": 1.484375, + "learning_rate": 0.00019744874904789806, + "loss": 4.5135, + "step": 2094 + }, + { + "epoch": 0.21722647415251217, + "grad_norm": 1.8359375, + "learning_rate": 0.00019744631051997718, + "loss": 4.5868, + "step": 2095 + }, + { + "epoch": 0.21733016220700024, + "grad_norm": 1.640625, + "learning_rate": 0.0001974438708422928, + "loss": 4.5675, + "step": 2096 + }, + { + "epoch": 0.21743385026148832, + "grad_norm": 1.453125, + "learning_rate": 0.00019744143001487378, + "loss": 4.5875, + "step": 2097 + }, + { + "epoch": 0.2175375383159764, + "grad_norm": 1.34375, + "learning_rate": 0.00019743898803774884, + "loss": 4.6141, + "step": 2098 + }, + { + "epoch": 0.21764122637046446, + "grad_norm": 1.4140625, + "learning_rate": 0.0001974365449109469, + "loss": 4.5761, + "step": 2099 + }, + { + "epoch": 0.21774491442495253, + "grad_norm": 1.2734375, + "learning_rate": 0.0001974341006344967, + "loss": 4.5581, + "step": 2100 + }, + { + "epoch": 0.2178486024794406, + "grad_norm": 1.1484375, + "learning_rate": 0.0001974316552084271, + "loss": 4.5847, + "step": 2101 + }, + { + "epoch": 0.21795229053392867, + "grad_norm": 1.7109375, + "learning_rate": 0.00019742920863276693, + "loss": 4.5691, + "step": 2102 + }, + { + "epoch": 0.21805597858841674, + "grad_norm": 1.234375, + "learning_rate": 0.00019742676090754512, + "loss": 4.5477, + "step": 2103 + }, + { + "epoch": 0.21815966664290481, + "grad_norm": 2.421875, + "learning_rate": 0.0001974243120327905, + "loss": 4.5926, + "step": 2104 + }, + { + "epoch": 0.2182633546973929, + "grad_norm": 2.1875, + "learning_rate": 0.000197421862008532, + "loss": 4.588, + "step": 2105 + }, + { + "epoch": 0.21836704275188096, + "grad_norm": 1.5703125, + "learning_rate": 0.0001974194108347985, + "loss": 4.5484, + "step": 2106 + }, + { + "epoch": 0.21847073080636903, + "grad_norm": 1.515625, + "learning_rate": 0.00019741695851161893, + "loss": 4.6028, + "step": 2107 + }, + { + "epoch": 0.2185744188608571, + "grad_norm": 1.6171875, + "learning_rate": 0.00019741450503902222, + "loss": 4.6113, + "step": 2108 + }, + { + "epoch": 0.21867810691534517, + "grad_norm": 1.1171875, + "learning_rate": 0.00019741205041703733, + "loss": 4.5785, + "step": 2109 + }, + { + "epoch": 0.21878179496983324, + "grad_norm": 2.265625, + "learning_rate": 0.0001974095946456932, + "loss": 4.5964, + "step": 2110 + }, + { + "epoch": 0.21888548302432134, + "grad_norm": 2.09375, + "learning_rate": 0.0001974071377250188, + "loss": 4.5467, + "step": 2111 + }, + { + "epoch": 0.2189891710788094, + "grad_norm": 1.34375, + "learning_rate": 0.0001974046796550432, + "loss": 4.5953, + "step": 2112 + }, + { + "epoch": 0.21909285913329749, + "grad_norm": 1.3359375, + "learning_rate": 0.00019740222043579527, + "loss": 4.6259, + "step": 2113 + }, + { + "epoch": 0.21919654718778556, + "grad_norm": 1.421875, + "learning_rate": 0.00019739976006730414, + "loss": 4.5235, + "step": 2114 + }, + { + "epoch": 0.21930023524227363, + "grad_norm": 1.140625, + "learning_rate": 0.0001973972985495988, + "loss": 4.5858, + "step": 2115 + }, + { + "epoch": 0.2194039232967617, + "grad_norm": 1.84375, + "learning_rate": 0.00019739483588270828, + "loss": 4.5866, + "step": 2116 + }, + { + "epoch": 0.21950761135124977, + "grad_norm": 1.484375, + "learning_rate": 0.00019739237206666164, + "loss": 4.6022, + "step": 2117 + }, + { + "epoch": 0.21961129940573784, + "grad_norm": 1.8828125, + "learning_rate": 0.00019738990710148796, + "loss": 4.6095, + "step": 2118 + }, + { + "epoch": 0.2197149874602259, + "grad_norm": 1.7265625, + "learning_rate": 0.00019738744098721632, + "loss": 4.5528, + "step": 2119 + }, + { + "epoch": 0.21981867551471398, + "grad_norm": 1.6796875, + "learning_rate": 0.00019738497372387586, + "loss": 4.5543, + "step": 2120 + }, + { + "epoch": 0.21992236356920206, + "grad_norm": 1.4453125, + "learning_rate": 0.0001973825053114956, + "loss": 4.5641, + "step": 2121 + }, + { + "epoch": 0.22002605162369013, + "grad_norm": 1.75, + "learning_rate": 0.00019738003575010474, + "loss": 4.5842, + "step": 2122 + }, + { + "epoch": 0.2201297396781782, + "grad_norm": 1.3203125, + "learning_rate": 0.0001973775650397324, + "loss": 4.5878, + "step": 2123 + }, + { + "epoch": 0.22023342773266627, + "grad_norm": 2.21875, + "learning_rate": 0.0001973750931804077, + "loss": 4.5831, + "step": 2124 + }, + { + "epoch": 0.22033711578715434, + "grad_norm": 1.984375, + "learning_rate": 0.00019737262017215982, + "loss": 4.5747, + "step": 2125 + }, + { + "epoch": 0.2204408038416424, + "grad_norm": 1.546875, + "learning_rate": 0.000197370146015018, + "loss": 4.5351, + "step": 2126 + }, + { + "epoch": 0.22054449189613048, + "grad_norm": 1.5390625, + "learning_rate": 0.00019736767070901133, + "loss": 4.5474, + "step": 2127 + }, + { + "epoch": 0.22064817995061856, + "grad_norm": 1.34375, + "learning_rate": 0.00019736519425416908, + "loss": 4.6022, + "step": 2128 + }, + { + "epoch": 0.22075186800510663, + "grad_norm": 1.21875, + "learning_rate": 0.00019736271665052047, + "loss": 4.5988, + "step": 2129 + }, + { + "epoch": 0.2208555560595947, + "grad_norm": 1.5390625, + "learning_rate": 0.00019736023789809472, + "loss": 4.5532, + "step": 2130 + }, + { + "epoch": 0.22095924411408277, + "grad_norm": 1.296875, + "learning_rate": 0.0001973577579969211, + "loss": 4.5863, + "step": 2131 + }, + { + "epoch": 0.22106293216857084, + "grad_norm": 2.0, + "learning_rate": 0.0001973552769470288, + "loss": 4.5876, + "step": 2132 + }, + { + "epoch": 0.22116662022305894, + "grad_norm": 1.8125, + "learning_rate": 0.00019735279474844718, + "loss": 4.6137, + "step": 2133 + }, + { + "epoch": 0.221270308277547, + "grad_norm": 1.4375, + "learning_rate": 0.00019735031140120547, + "loss": 4.5873, + "step": 2134 + }, + { + "epoch": 0.22137399633203508, + "grad_norm": 1.3515625, + "learning_rate": 0.00019734782690533298, + "loss": 4.605, + "step": 2135 + }, + { + "epoch": 0.22147768438652315, + "grad_norm": 1.4921875, + "learning_rate": 0.00019734534126085904, + "loss": 4.603, + "step": 2136 + }, + { + "epoch": 0.22158137244101123, + "grad_norm": 1.34375, + "learning_rate": 0.00019734285446781297, + "loss": 4.5563, + "step": 2137 + }, + { + "epoch": 0.2216850604954993, + "grad_norm": 1.5078125, + "learning_rate": 0.00019734036652622412, + "loss": 4.5683, + "step": 2138 + }, + { + "epoch": 0.22178874854998737, + "grad_norm": 1.421875, + "learning_rate": 0.00019733787743612185, + "loss": 4.5973, + "step": 2139 + }, + { + "epoch": 0.22189243660447544, + "grad_norm": 1.40625, + "learning_rate": 0.00019733538719753552, + "loss": 4.6036, + "step": 2140 + }, + { + "epoch": 0.2219961246589635, + "grad_norm": 1.2109375, + "learning_rate": 0.00019733289581049448, + "loss": 4.5687, + "step": 2141 + }, + { + "epoch": 0.22209981271345158, + "grad_norm": 1.421875, + "learning_rate": 0.00019733040327502815, + "loss": 4.6214, + "step": 2142 + }, + { + "epoch": 0.22220350076793965, + "grad_norm": 1.203125, + "learning_rate": 0.000197327909591166, + "loss": 4.5759, + "step": 2143 + }, + { + "epoch": 0.22230718882242773, + "grad_norm": 1.5078125, + "learning_rate": 0.00019732541475893733, + "loss": 4.5674, + "step": 2144 + }, + { + "epoch": 0.2224108768769158, + "grad_norm": 1.3125, + "learning_rate": 0.00019732291877837165, + "loss": 4.5504, + "step": 2145 + }, + { + "epoch": 0.22251456493140387, + "grad_norm": 1.4296875, + "learning_rate": 0.00019732042164949845, + "loss": 4.5623, + "step": 2146 + }, + { + "epoch": 0.22261825298589194, + "grad_norm": 1.1875, + "learning_rate": 0.0001973179233723471, + "loss": 4.5639, + "step": 2147 + }, + { + "epoch": 0.22272194104038, + "grad_norm": 1.3671875, + "learning_rate": 0.00019731542394694715, + "loss": 4.5888, + "step": 2148 + }, + { + "epoch": 0.22282562909486808, + "grad_norm": 1.171875, + "learning_rate": 0.00019731292337332807, + "loss": 4.5787, + "step": 2149 + }, + { + "epoch": 0.22292931714935615, + "grad_norm": 1.46875, + "learning_rate": 0.00019731042165151936, + "loss": 4.5718, + "step": 2150 + }, + { + "epoch": 0.22303300520384423, + "grad_norm": 1.21875, + "learning_rate": 0.00019730791878155052, + "loss": 4.5511, + "step": 2151 + }, + { + "epoch": 0.2231366932583323, + "grad_norm": 1.484375, + "learning_rate": 0.0001973054147634511, + "loss": 4.588, + "step": 2152 + }, + { + "epoch": 0.22324038131282037, + "grad_norm": 1.4140625, + "learning_rate": 0.00019730290959725063, + "loss": 4.5352, + "step": 2153 + }, + { + "epoch": 0.22334406936730844, + "grad_norm": 1.484375, + "learning_rate": 0.0001973004032829787, + "loss": 4.5747, + "step": 2154 + }, + { + "epoch": 0.22344775742179654, + "grad_norm": 1.3046875, + "learning_rate": 0.00019729789582066486, + "loss": 4.5886, + "step": 2155 + }, + { + "epoch": 0.2235514454762846, + "grad_norm": 1.46875, + "learning_rate": 0.0001972953872103387, + "loss": 4.5761, + "step": 2156 + }, + { + "epoch": 0.22365513353077268, + "grad_norm": 1.25, + "learning_rate": 0.0001972928774520298, + "loss": 4.5831, + "step": 2157 + }, + { + "epoch": 0.22375882158526075, + "grad_norm": 1.5078125, + "learning_rate": 0.0001972903665457678, + "loss": 4.5881, + "step": 2158 + }, + { + "epoch": 0.22386250963974882, + "grad_norm": 1.2421875, + "learning_rate": 0.00019728785449158232, + "loss": 4.5763, + "step": 2159 + }, + { + "epoch": 0.2239661976942369, + "grad_norm": 1.546875, + "learning_rate": 0.00019728534128950299, + "loss": 4.5638, + "step": 2160 + }, + { + "epoch": 0.22406988574872497, + "grad_norm": 1.1171875, + "learning_rate": 0.00019728282693955946, + "loss": 4.5464, + "step": 2161 + }, + { + "epoch": 0.22417357380321304, + "grad_norm": 1.78125, + "learning_rate": 0.00019728031144178142, + "loss": 4.589, + "step": 2162 + }, + { + "epoch": 0.2242772618577011, + "grad_norm": 1.4296875, + "learning_rate": 0.00019727779479619852, + "loss": 4.5857, + "step": 2163 + }, + { + "epoch": 0.22438094991218918, + "grad_norm": 1.984375, + "learning_rate": 0.00019727527700284046, + "loss": 4.5437, + "step": 2164 + }, + { + "epoch": 0.22448463796667725, + "grad_norm": 1.7890625, + "learning_rate": 0.00019727275806173696, + "loss": 4.573, + "step": 2165 + }, + { + "epoch": 0.22458832602116532, + "grad_norm": 1.421875, + "learning_rate": 0.00019727023797291778, + "loss": 4.5802, + "step": 2166 + }, + { + "epoch": 0.2246920140756534, + "grad_norm": 1.359375, + "learning_rate": 0.00019726771673641256, + "loss": 4.5805, + "step": 2167 + }, + { + "epoch": 0.22479570213014147, + "grad_norm": 1.65625, + "learning_rate": 0.00019726519435225113, + "loss": 4.593, + "step": 2168 + }, + { + "epoch": 0.22489939018462954, + "grad_norm": 1.4296875, + "learning_rate": 0.0001972626708204632, + "loss": 4.5521, + "step": 2169 + }, + { + "epoch": 0.2250030782391176, + "grad_norm": 1.546875, + "learning_rate": 0.00019726014614107856, + "loss": 4.5509, + "step": 2170 + }, + { + "epoch": 0.22510676629360568, + "grad_norm": 1.484375, + "learning_rate": 0.00019725762031412702, + "loss": 4.5501, + "step": 2171 + }, + { + "epoch": 0.22521045434809375, + "grad_norm": 1.515625, + "learning_rate": 0.0001972550933396384, + "loss": 4.6034, + "step": 2172 + }, + { + "epoch": 0.22531414240258182, + "grad_norm": 1.328125, + "learning_rate": 0.0001972525652176424, + "loss": 4.5989, + "step": 2173 + }, + { + "epoch": 0.2254178304570699, + "grad_norm": 1.4921875, + "learning_rate": 0.000197250035948169, + "loss": 4.6182, + "step": 2174 + }, + { + "epoch": 0.22552151851155797, + "grad_norm": 1.296875, + "learning_rate": 0.00019724750553124794, + "loss": 4.54, + "step": 2175 + }, + { + "epoch": 0.22562520656604604, + "grad_norm": 1.5625, + "learning_rate": 0.00019724497396690911, + "loss": 4.6102, + "step": 2176 + }, + { + "epoch": 0.2257288946205341, + "grad_norm": 1.3984375, + "learning_rate": 0.0001972424412551824, + "loss": 4.6087, + "step": 2177 + }, + { + "epoch": 0.2258325826750222, + "grad_norm": 1.484375, + "learning_rate": 0.00019723990739609765, + "loss": 4.5766, + "step": 2178 + }, + { + "epoch": 0.22593627072951028, + "grad_norm": 1.3046875, + "learning_rate": 0.0001972373723896848, + "loss": 4.5922, + "step": 2179 + }, + { + "epoch": 0.22603995878399835, + "grad_norm": 1.5625, + "learning_rate": 0.0001972348362359737, + "loss": 4.6109, + "step": 2180 + }, + { + "epoch": 0.22614364683848642, + "grad_norm": 1.296875, + "learning_rate": 0.00019723229893499436, + "loss": 4.5587, + "step": 2181 + }, + { + "epoch": 0.2262473348929745, + "grad_norm": 1.6796875, + "learning_rate": 0.00019722976048677668, + "loss": 4.6016, + "step": 2182 + }, + { + "epoch": 0.22635102294746257, + "grad_norm": 1.46875, + "learning_rate": 0.00019722722089135058, + "loss": 4.6004, + "step": 2183 + }, + { + "epoch": 0.22645471100195064, + "grad_norm": 1.40625, + "learning_rate": 0.00019722468014874602, + "loss": 4.5439, + "step": 2184 + }, + { + "epoch": 0.2265583990564387, + "grad_norm": 1.328125, + "learning_rate": 0.00019722213825899306, + "loss": 4.538, + "step": 2185 + }, + { + "epoch": 0.22666208711092678, + "grad_norm": 1.421875, + "learning_rate": 0.0001972195952221216, + "loss": 4.5763, + "step": 2186 + }, + { + "epoch": 0.22676577516541485, + "grad_norm": 1.2421875, + "learning_rate": 0.00019721705103816167, + "loss": 4.5971, + "step": 2187 + }, + { + "epoch": 0.22686946321990292, + "grad_norm": 1.3671875, + "learning_rate": 0.00019721450570714332, + "loss": 4.5941, + "step": 2188 + }, + { + "epoch": 0.226973151274391, + "grad_norm": 1.2578125, + "learning_rate": 0.00019721195922909658, + "loss": 4.5869, + "step": 2189 + }, + { + "epoch": 0.22707683932887907, + "grad_norm": 1.515625, + "learning_rate": 0.00019720941160405146, + "loss": 4.5875, + "step": 2190 + }, + { + "epoch": 0.22718052738336714, + "grad_norm": 1.2421875, + "learning_rate": 0.00019720686283203803, + "loss": 4.5523, + "step": 2191 + }, + { + "epoch": 0.2272842154378552, + "grad_norm": 1.5078125, + "learning_rate": 0.0001972043129130864, + "loss": 4.5619, + "step": 2192 + }, + { + "epoch": 0.22738790349234328, + "grad_norm": 1.2890625, + "learning_rate": 0.00019720176184722662, + "loss": 4.55, + "step": 2193 + }, + { + "epoch": 0.22749159154683135, + "grad_norm": 1.9375, + "learning_rate": 0.00019719920963448876, + "loss": 4.5597, + "step": 2194 + }, + { + "epoch": 0.22759527960131942, + "grad_norm": 1.609375, + "learning_rate": 0.000197196656274903, + "loss": 4.5254, + "step": 2195 + }, + { + "epoch": 0.2276989676558075, + "grad_norm": 1.6875, + "learning_rate": 0.00019719410176849943, + "loss": 4.5352, + "step": 2196 + }, + { + "epoch": 0.22780265571029557, + "grad_norm": 1.484375, + "learning_rate": 0.0001971915461153082, + "loss": 4.5494, + "step": 2197 + }, + { + "epoch": 0.22790634376478364, + "grad_norm": 1.421875, + "learning_rate": 0.00019718898931535948, + "loss": 4.5682, + "step": 2198 + }, + { + "epoch": 0.2280100318192717, + "grad_norm": 1.2734375, + "learning_rate": 0.0001971864313686834, + "loss": 4.562, + "step": 2199 + }, + { + "epoch": 0.2281137198737598, + "grad_norm": 1.5, + "learning_rate": 0.00019718387227531014, + "loss": 4.5403, + "step": 2200 + }, + { + "epoch": 0.22821740792824788, + "grad_norm": 1.21875, + "learning_rate": 0.00019718131203526996, + "loss": 4.5693, + "step": 2201 + }, + { + "epoch": 0.22832109598273595, + "grad_norm": 1.4140625, + "learning_rate": 0.00019717875064859298, + "loss": 4.5906, + "step": 2202 + }, + { + "epoch": 0.22842478403722402, + "grad_norm": 1.203125, + "learning_rate": 0.0001971761881153095, + "loss": 4.535, + "step": 2203 + }, + { + "epoch": 0.2285284720917121, + "grad_norm": 1.6171875, + "learning_rate": 0.0001971736244354497, + "loss": 4.5743, + "step": 2204 + }, + { + "epoch": 0.22863216014620016, + "grad_norm": 1.2421875, + "learning_rate": 0.00019717105960904386, + "loss": 4.5418, + "step": 2205 + }, + { + "epoch": 0.22873584820068824, + "grad_norm": 1.7421875, + "learning_rate": 0.00019716849363612222, + "loss": 4.581, + "step": 2206 + }, + { + "epoch": 0.2288395362551763, + "grad_norm": 1.375, + "learning_rate": 0.00019716592651671506, + "loss": 4.5945, + "step": 2207 + }, + { + "epoch": 0.22894322430966438, + "grad_norm": 1.90625, + "learning_rate": 0.00019716335825085269, + "loss": 4.5853, + "step": 2208 + }, + { + "epoch": 0.22904691236415245, + "grad_norm": 1.8359375, + "learning_rate": 0.0001971607888385654, + "loss": 4.574, + "step": 2209 + }, + { + "epoch": 0.22915060041864052, + "grad_norm": 1.4140625, + "learning_rate": 0.0001971582182798835, + "loss": 4.5358, + "step": 2210 + }, + { + "epoch": 0.2292542884731286, + "grad_norm": 1.34375, + "learning_rate": 0.0001971556465748373, + "loss": 4.5675, + "step": 2211 + }, + { + "epoch": 0.22935797652761666, + "grad_norm": 1.546875, + "learning_rate": 0.0001971530737234572, + "loss": 4.5541, + "step": 2212 + }, + { + "epoch": 0.22946166458210474, + "grad_norm": 1.328125, + "learning_rate": 0.00019715049972577353, + "loss": 4.5301, + "step": 2213 + }, + { + "epoch": 0.2295653526365928, + "grad_norm": 1.5703125, + "learning_rate": 0.00019714792458181663, + "loss": 4.5643, + "step": 2214 + }, + { + "epoch": 0.22966904069108088, + "grad_norm": 1.4140625, + "learning_rate": 0.00019714534829161693, + "loss": 4.5358, + "step": 2215 + }, + { + "epoch": 0.22977272874556895, + "grad_norm": 1.4609375, + "learning_rate": 0.0001971427708552048, + "loss": 4.5537, + "step": 2216 + }, + { + "epoch": 0.22987641680005702, + "grad_norm": 1.3046875, + "learning_rate": 0.00019714019227261067, + "loss": 4.593, + "step": 2217 + }, + { + "epoch": 0.2299801048545451, + "grad_norm": 1.59375, + "learning_rate": 0.00019713761254386495, + "loss": 4.5416, + "step": 2218 + }, + { + "epoch": 0.23008379290903316, + "grad_norm": 1.375, + "learning_rate": 0.00019713503166899807, + "loss": 4.5677, + "step": 2219 + }, + { + "epoch": 0.23018748096352123, + "grad_norm": 1.5390625, + "learning_rate": 0.0001971324496480405, + "loss": 4.5802, + "step": 2220 + }, + { + "epoch": 0.2302911690180093, + "grad_norm": 1.4609375, + "learning_rate": 0.0001971298664810227, + "loss": 4.516, + "step": 2221 + }, + { + "epoch": 0.2303948570724974, + "grad_norm": 1.3828125, + "learning_rate": 0.00019712728216797514, + "loss": 4.5601, + "step": 2222 + }, + { + "epoch": 0.23049854512698548, + "grad_norm": 1.1796875, + "learning_rate": 0.0001971246967089283, + "loss": 4.5543, + "step": 2223 + }, + { + "epoch": 0.23060223318147355, + "grad_norm": 1.515625, + "learning_rate": 0.00019712211010391274, + "loss": 4.5348, + "step": 2224 + }, + { + "epoch": 0.23070592123596162, + "grad_norm": 1.2734375, + "learning_rate": 0.0001971195223529589, + "loss": 4.5637, + "step": 2225 + }, + { + "epoch": 0.2308096092904497, + "grad_norm": 1.90625, + "learning_rate": 0.00019711693345609739, + "loss": 4.608, + "step": 2226 + }, + { + "epoch": 0.23091329734493776, + "grad_norm": 1.671875, + "learning_rate": 0.0001971143434133587, + "loss": 4.5851, + "step": 2227 + }, + { + "epoch": 0.23101698539942583, + "grad_norm": 1.7109375, + "learning_rate": 0.00019711175222477344, + "loss": 4.5839, + "step": 2228 + }, + { + "epoch": 0.2311206734539139, + "grad_norm": 1.546875, + "learning_rate": 0.00019710915989037213, + "loss": 4.5727, + "step": 2229 + }, + { + "epoch": 0.23122436150840198, + "grad_norm": 1.390625, + "learning_rate": 0.0001971065664101854, + "loss": 4.5646, + "step": 2230 + }, + { + "epoch": 0.23132804956289005, + "grad_norm": 1.3828125, + "learning_rate": 0.00019710397178424383, + "loss": 4.5607, + "step": 2231 + }, + { + "epoch": 0.23143173761737812, + "grad_norm": 1.1875, + "learning_rate": 0.00019710137601257804, + "loss": 4.5645, + "step": 2232 + }, + { + "epoch": 0.2315354256718662, + "grad_norm": 1.1015625, + "learning_rate": 0.00019709877909521864, + "loss": 4.5555, + "step": 2233 + }, + { + "epoch": 0.23163911372635426, + "grad_norm": 1.1171875, + "learning_rate": 0.0001970961810321963, + "loss": 4.5427, + "step": 2234 + }, + { + "epoch": 0.23174280178084233, + "grad_norm": 0.9140625, + "learning_rate": 0.00019709358182354162, + "loss": 4.5674, + "step": 2235 + }, + { + "epoch": 0.2318464898353304, + "grad_norm": 1.171875, + "learning_rate": 0.00019709098146928535, + "loss": 4.53, + "step": 2236 + }, + { + "epoch": 0.23195017788981848, + "grad_norm": 0.8515625, + "learning_rate": 0.0001970883799694581, + "loss": 4.5445, + "step": 2237 + }, + { + "epoch": 0.23205386594430655, + "grad_norm": 1.375, + "learning_rate": 0.00019708577732409062, + "loss": 4.5639, + "step": 2238 + }, + { + "epoch": 0.23215755399879462, + "grad_norm": 1.0703125, + "learning_rate": 0.00019708317353321357, + "loss": 4.5738, + "step": 2239 + }, + { + "epoch": 0.2322612420532827, + "grad_norm": 1.609375, + "learning_rate": 0.0001970805685968577, + "loss": 4.5423, + "step": 2240 + }, + { + "epoch": 0.23236493010777076, + "grad_norm": 1.5546875, + "learning_rate": 0.00019707796251505375, + "loss": 4.5786, + "step": 2241 + }, + { + "epoch": 0.23246861816225883, + "grad_norm": 1.0078125, + "learning_rate": 0.00019707535528783244, + "loss": 4.562, + "step": 2242 + }, + { + "epoch": 0.2325723062167469, + "grad_norm": 1.09375, + "learning_rate": 0.00019707274691522456, + "loss": 4.547, + "step": 2243 + }, + { + "epoch": 0.23267599427123498, + "grad_norm": 0.99609375, + "learning_rate": 0.0001970701373972609, + "loss": 4.5492, + "step": 2244 + }, + { + "epoch": 0.23277968232572308, + "grad_norm": 0.9296875, + "learning_rate": 0.00019706752673397218, + "loss": 4.5297, + "step": 2245 + }, + { + "epoch": 0.23288337038021115, + "grad_norm": 0.90625, + "learning_rate": 0.00019706491492538927, + "loss": 4.5933, + "step": 2246 + }, + { + "epoch": 0.23298705843469922, + "grad_norm": 0.8984375, + "learning_rate": 0.00019706230197154298, + "loss": 4.5682, + "step": 2247 + }, + { + "epoch": 0.2330907464891873, + "grad_norm": 0.83984375, + "learning_rate": 0.00019705968787246412, + "loss": 4.5609, + "step": 2248 + }, + { + "epoch": 0.23319443454367536, + "grad_norm": 0.828125, + "learning_rate": 0.00019705707262818354, + "loss": 4.5367, + "step": 2249 + }, + { + "epoch": 0.23329812259816343, + "grad_norm": 0.80078125, + "learning_rate": 0.0001970544562387321, + "loss": 4.5668, + "step": 2250 + }, + { + "epoch": 0.2334018106526515, + "grad_norm": 0.78515625, + "learning_rate": 0.00019705183870414062, + "loss": 4.5505, + "step": 2251 + }, + { + "epoch": 0.23350549870713957, + "grad_norm": 0.73828125, + "learning_rate": 0.00019704922002444008, + "loss": 4.5626, + "step": 2252 + }, + { + "epoch": 0.23360918676162765, + "grad_norm": 0.71875, + "learning_rate": 0.00019704660019966133, + "loss": 4.5656, + "step": 2253 + }, + { + "epoch": 0.23371287481611572, + "grad_norm": 0.6953125, + "learning_rate": 0.00019704397922983526, + "loss": 4.5666, + "step": 2254 + }, + { + "epoch": 0.2338165628706038, + "grad_norm": 0.6328125, + "learning_rate": 0.00019704135711499286, + "loss": 4.5267, + "step": 2255 + }, + { + "epoch": 0.23392025092509186, + "grad_norm": 0.6328125, + "learning_rate": 0.00019703873385516497, + "loss": 4.5764, + "step": 2256 + }, + { + "epoch": 0.23402393897957993, + "grad_norm": 0.6328125, + "learning_rate": 0.0001970361094503826, + "loss": 4.5261, + "step": 2257 + }, + { + "epoch": 0.234127627034068, + "grad_norm": 0.62109375, + "learning_rate": 0.00019703348390067674, + "loss": 4.5832, + "step": 2258 + }, + { + "epoch": 0.23423131508855607, + "grad_norm": 0.6171875, + "learning_rate": 0.0001970308572060783, + "loss": 4.5381, + "step": 2259 + }, + { + "epoch": 0.23433500314304415, + "grad_norm": 0.62890625, + "learning_rate": 0.00019702822936661836, + "loss": 4.5449, + "step": 2260 + }, + { + "epoch": 0.23443869119753222, + "grad_norm": 0.609375, + "learning_rate": 0.00019702560038232782, + "loss": 4.5032, + "step": 2261 + }, + { + "epoch": 0.2345423792520203, + "grad_norm": 0.609375, + "learning_rate": 0.0001970229702532378, + "loss": 4.5321, + "step": 2262 + }, + { + "epoch": 0.23464606730650836, + "grad_norm": 0.60546875, + "learning_rate": 0.00019702033897937927, + "loss": 4.5649, + "step": 2263 + }, + { + "epoch": 0.23474975536099643, + "grad_norm": 0.5390625, + "learning_rate": 0.0001970177065607833, + "loss": 4.6028, + "step": 2264 + }, + { + "epoch": 0.2348534434154845, + "grad_norm": 0.55859375, + "learning_rate": 0.00019701507299748095, + "loss": 4.539, + "step": 2265 + }, + { + "epoch": 0.23495713146997257, + "grad_norm": 0.5390625, + "learning_rate": 0.00019701243828950329, + "loss": 4.5593, + "step": 2266 + }, + { + "epoch": 0.23506081952446067, + "grad_norm": 0.5703125, + "learning_rate": 0.0001970098024368814, + "loss": 4.5882, + "step": 2267 + }, + { + "epoch": 0.23516450757894874, + "grad_norm": 0.55859375, + "learning_rate": 0.00019700716543964638, + "loss": 4.5798, + "step": 2268 + }, + { + "epoch": 0.23526819563343682, + "grad_norm": 0.56640625, + "learning_rate": 0.00019700452729782934, + "loss": 4.5523, + "step": 2269 + }, + { + "epoch": 0.2353718836879249, + "grad_norm": 0.59375, + "learning_rate": 0.0001970018880114614, + "loss": 4.5734, + "step": 2270 + }, + { + "epoch": 0.23547557174241296, + "grad_norm": 0.5546875, + "learning_rate": 0.00019699924758057377, + "loss": 4.5664, + "step": 2271 + }, + { + "epoch": 0.23557925979690103, + "grad_norm": 0.5703125, + "learning_rate": 0.00019699660600519753, + "loss": 4.557, + "step": 2272 + }, + { + "epoch": 0.2356829478513891, + "grad_norm": 0.58203125, + "learning_rate": 0.00019699396328536384, + "loss": 4.5388, + "step": 2273 + }, + { + "epoch": 0.23578663590587717, + "grad_norm": 0.59765625, + "learning_rate": 0.00019699131942110397, + "loss": 4.5299, + "step": 2274 + }, + { + "epoch": 0.23589032396036524, + "grad_norm": 0.6015625, + "learning_rate": 0.000196988674412449, + "loss": 4.5497, + "step": 2275 + }, + { + "epoch": 0.23599401201485332, + "grad_norm": 0.58203125, + "learning_rate": 0.0001969860282594302, + "loss": 4.5566, + "step": 2276 + }, + { + "epoch": 0.2360977000693414, + "grad_norm": 0.578125, + "learning_rate": 0.00019698338096207883, + "loss": 4.5407, + "step": 2277 + }, + { + "epoch": 0.23620138812382946, + "grad_norm": 0.5625, + "learning_rate": 0.00019698073252042605, + "loss": 4.5606, + "step": 2278 + }, + { + "epoch": 0.23630507617831753, + "grad_norm": 0.58984375, + "learning_rate": 0.00019697808293450312, + "loss": 4.5585, + "step": 2279 + }, + { + "epoch": 0.2364087642328056, + "grad_norm": 0.482421875, + "learning_rate": 0.00019697543220434133, + "loss": 4.5512, + "step": 2280 + }, + { + "epoch": 0.23651245228729367, + "grad_norm": 0.58203125, + "learning_rate": 0.00019697278032997198, + "loss": 4.5865, + "step": 2281 + }, + { + "epoch": 0.23661614034178174, + "grad_norm": 0.51171875, + "learning_rate": 0.0001969701273114263, + "loss": 4.5634, + "step": 2282 + }, + { + "epoch": 0.23671982839626982, + "grad_norm": 0.5703125, + "learning_rate": 0.0001969674731487356, + "loss": 4.5198, + "step": 2283 + }, + { + "epoch": 0.2368235164507579, + "grad_norm": 0.53515625, + "learning_rate": 0.00019696481784193127, + "loss": 4.4959, + "step": 2284 + }, + { + "epoch": 0.23692720450524596, + "grad_norm": 0.5234375, + "learning_rate": 0.00019696216139104453, + "loss": 4.5441, + "step": 2285 + }, + { + "epoch": 0.23703089255973403, + "grad_norm": 0.52734375, + "learning_rate": 0.00019695950379610682, + "loss": 4.5769, + "step": 2286 + }, + { + "epoch": 0.2371345806142221, + "grad_norm": 0.5, + "learning_rate": 0.00019695684505714942, + "loss": 4.5464, + "step": 2287 + }, + { + "epoch": 0.23723826866871017, + "grad_norm": 0.5546875, + "learning_rate": 0.00019695418517420377, + "loss": 4.545, + "step": 2288 + }, + { + "epoch": 0.23734195672319827, + "grad_norm": 0.5078125, + "learning_rate": 0.0001969515241473012, + "loss": 4.5357, + "step": 2289 + }, + { + "epoch": 0.23744564477768634, + "grad_norm": 0.5, + "learning_rate": 0.00019694886197647312, + "loss": 4.5784, + "step": 2290 + }, + { + "epoch": 0.23754933283217441, + "grad_norm": 0.482421875, + "learning_rate": 0.00019694619866175098, + "loss": 4.5894, + "step": 2291 + }, + { + "epoch": 0.23765302088666249, + "grad_norm": 0.52734375, + "learning_rate": 0.00019694353420316615, + "loss": 4.5586, + "step": 2292 + }, + { + "epoch": 0.23775670894115056, + "grad_norm": 0.53515625, + "learning_rate": 0.0001969408686007501, + "loss": 4.5119, + "step": 2293 + }, + { + "epoch": 0.23786039699563863, + "grad_norm": 0.53125, + "learning_rate": 0.00019693820185453427, + "loss": 4.5694, + "step": 2294 + }, + { + "epoch": 0.2379640850501267, + "grad_norm": 0.5078125, + "learning_rate": 0.00019693553396455012, + "loss": 4.5731, + "step": 2295 + }, + { + "epoch": 0.23806777310461477, + "grad_norm": 0.546875, + "learning_rate": 0.0001969328649308291, + "loss": 4.5255, + "step": 2296 + }, + { + "epoch": 0.23817146115910284, + "grad_norm": 0.5859375, + "learning_rate": 0.0001969301947534028, + "loss": 4.5547, + "step": 2297 + }, + { + "epoch": 0.23827514921359091, + "grad_norm": 0.53515625, + "learning_rate": 0.00019692752343230264, + "loss": 4.5426, + "step": 2298 + }, + { + "epoch": 0.23837883726807899, + "grad_norm": 0.5859375, + "learning_rate": 0.00019692485096756016, + "loss": 4.551, + "step": 2299 + }, + { + "epoch": 0.23848252532256706, + "grad_norm": 0.59765625, + "learning_rate": 0.0001969221773592069, + "loss": 4.5627, + "step": 2300 + }, + { + "epoch": 0.23858621337705513, + "grad_norm": 0.609375, + "learning_rate": 0.00019691950260727437, + "loss": 4.5379, + "step": 2301 + }, + { + "epoch": 0.2386899014315432, + "grad_norm": 0.56640625, + "learning_rate": 0.00019691682671179415, + "loss": 4.5418, + "step": 2302 + }, + { + "epoch": 0.23879358948603127, + "grad_norm": 0.5859375, + "learning_rate": 0.00019691414967279786, + "loss": 4.5495, + "step": 2303 + }, + { + "epoch": 0.23889727754051934, + "grad_norm": 0.578125, + "learning_rate": 0.00019691147149031703, + "loss": 4.557, + "step": 2304 + }, + { + "epoch": 0.23900096559500741, + "grad_norm": 0.61328125, + "learning_rate": 0.00019690879216438325, + "loss": 4.5889, + "step": 2305 + }, + { + "epoch": 0.23910465364949549, + "grad_norm": 0.5859375, + "learning_rate": 0.0001969061116950282, + "loss": 4.539, + "step": 2306 + }, + { + "epoch": 0.23920834170398356, + "grad_norm": 0.57421875, + "learning_rate": 0.00019690343008228343, + "loss": 4.5705, + "step": 2307 + }, + { + "epoch": 0.23931202975847163, + "grad_norm": 0.63671875, + "learning_rate": 0.00019690074732618066, + "loss": 4.5375, + "step": 2308 + }, + { + "epoch": 0.2394157178129597, + "grad_norm": 0.5703125, + "learning_rate": 0.00019689806342675147, + "loss": 4.5483, + "step": 2309 + }, + { + "epoch": 0.23951940586744777, + "grad_norm": 0.625, + "learning_rate": 0.00019689537838402758, + "loss": 4.5793, + "step": 2310 + }, + { + "epoch": 0.23962309392193584, + "grad_norm": 0.58203125, + "learning_rate": 0.0001968926921980406, + "loss": 4.5669, + "step": 2311 + }, + { + "epoch": 0.23972678197642394, + "grad_norm": 0.5625, + "learning_rate": 0.00019689000486882235, + "loss": 4.5548, + "step": 2312 + }, + { + "epoch": 0.239830470030912, + "grad_norm": 0.5703125, + "learning_rate": 0.00019688731639640438, + "loss": 4.5066, + "step": 2313 + }, + { + "epoch": 0.23993415808540008, + "grad_norm": 0.59765625, + "learning_rate": 0.00019688462678081852, + "loss": 4.5549, + "step": 2314 + }, + { + "epoch": 0.24003784613988816, + "grad_norm": 0.59765625, + "learning_rate": 0.00019688193602209652, + "loss": 4.5162, + "step": 2315 + }, + { + "epoch": 0.24014153419437623, + "grad_norm": 0.54296875, + "learning_rate": 0.00019687924412027004, + "loss": 4.5942, + "step": 2316 + }, + { + "epoch": 0.2402452222488643, + "grad_norm": 0.59765625, + "learning_rate": 0.00019687655107537087, + "loss": 4.5106, + "step": 2317 + }, + { + "epoch": 0.24034891030335237, + "grad_norm": 0.5390625, + "learning_rate": 0.00019687385688743083, + "loss": 4.5898, + "step": 2318 + }, + { + "epoch": 0.24045259835784044, + "grad_norm": 0.50390625, + "learning_rate": 0.00019687116155648167, + "loss": 4.5338, + "step": 2319 + }, + { + "epoch": 0.2405562864123285, + "grad_norm": 0.53125, + "learning_rate": 0.00019686846508255518, + "loss": 4.5561, + "step": 2320 + }, + { + "epoch": 0.24065997446681658, + "grad_norm": 0.5234375, + "learning_rate": 0.00019686576746568321, + "loss": 4.5322, + "step": 2321 + }, + { + "epoch": 0.24076366252130466, + "grad_norm": 0.59375, + "learning_rate": 0.0001968630687058976, + "loss": 4.556, + "step": 2322 + }, + { + "epoch": 0.24086735057579273, + "grad_norm": 0.486328125, + "learning_rate": 0.00019686036880323012, + "loss": 4.5641, + "step": 2323 + }, + { + "epoch": 0.2409710386302808, + "grad_norm": 0.6171875, + "learning_rate": 0.00019685766775771272, + "loss": 4.541, + "step": 2324 + }, + { + "epoch": 0.24107472668476887, + "grad_norm": 0.5, + "learning_rate": 0.00019685496556937722, + "loss": 4.5167, + "step": 2325 + }, + { + "epoch": 0.24117841473925694, + "grad_norm": 0.56640625, + "learning_rate": 0.0001968522622382555, + "loss": 4.5524, + "step": 2326 + }, + { + "epoch": 0.241282102793745, + "grad_norm": 0.4765625, + "learning_rate": 0.00019684955776437947, + "loss": 4.5259, + "step": 2327 + }, + { + "epoch": 0.24138579084823308, + "grad_norm": 0.53125, + "learning_rate": 0.000196846852147781, + "loss": 4.5491, + "step": 2328 + }, + { + "epoch": 0.24148947890272116, + "grad_norm": 0.55078125, + "learning_rate": 0.00019684414538849207, + "loss": 4.5132, + "step": 2329 + }, + { + "epoch": 0.24159316695720923, + "grad_norm": 0.50390625, + "learning_rate": 0.00019684143748654458, + "loss": 4.5794, + "step": 2330 + }, + { + "epoch": 0.2416968550116973, + "grad_norm": 0.671875, + "learning_rate": 0.00019683872844197052, + "loss": 4.5889, + "step": 2331 + }, + { + "epoch": 0.24180054306618537, + "grad_norm": 0.578125, + "learning_rate": 0.0001968360182548018, + "loss": 4.5713, + "step": 2332 + }, + { + "epoch": 0.24190423112067344, + "grad_norm": 0.5859375, + "learning_rate": 0.00019683330692507042, + "loss": 4.5571, + "step": 2333 + }, + { + "epoch": 0.24200791917516154, + "grad_norm": 0.56640625, + "learning_rate": 0.00019683059445280837, + "loss": 4.5484, + "step": 2334 + }, + { + "epoch": 0.2421116072296496, + "grad_norm": 0.5859375, + "learning_rate": 0.00019682788083804771, + "loss": 4.538, + "step": 2335 + }, + { + "epoch": 0.24221529528413768, + "grad_norm": 0.6015625, + "learning_rate": 0.00019682516608082037, + "loss": 4.5748, + "step": 2336 + }, + { + "epoch": 0.24231898333862575, + "grad_norm": 0.52734375, + "learning_rate": 0.00019682245018115842, + "loss": 4.5281, + "step": 2337 + }, + { + "epoch": 0.24242267139311383, + "grad_norm": 0.53515625, + "learning_rate": 0.0001968197331390939, + "loss": 4.5424, + "step": 2338 + }, + { + "epoch": 0.2425263594476019, + "grad_norm": 0.5234375, + "learning_rate": 0.00019681701495465889, + "loss": 4.5263, + "step": 2339 + }, + { + "epoch": 0.24263004750208997, + "grad_norm": 0.5546875, + "learning_rate": 0.00019681429562788542, + "loss": 4.5765, + "step": 2340 + }, + { + "epoch": 0.24273373555657804, + "grad_norm": 0.58984375, + "learning_rate": 0.00019681157515880564, + "loss": 4.4784, + "step": 2341 + }, + { + "epoch": 0.2428374236110661, + "grad_norm": 0.51953125, + "learning_rate": 0.00019680885354745158, + "loss": 4.542, + "step": 2342 + }, + { + "epoch": 0.24294111166555418, + "grad_norm": 0.52734375, + "learning_rate": 0.00019680613079385537, + "loss": 4.5242, + "step": 2343 + }, + { + "epoch": 0.24304479972004225, + "grad_norm": 0.52734375, + "learning_rate": 0.00019680340689804914, + "loss": 4.5195, + "step": 2344 + }, + { + "epoch": 0.24314848777453033, + "grad_norm": 0.54296875, + "learning_rate": 0.00019680068186006506, + "loss": 4.5474, + "step": 2345 + }, + { + "epoch": 0.2432521758290184, + "grad_norm": 0.470703125, + "learning_rate": 0.00019679795567993527, + "loss": 4.5701, + "step": 2346 + }, + { + "epoch": 0.24335586388350647, + "grad_norm": 0.52734375, + "learning_rate": 0.00019679522835769188, + "loss": 4.5311, + "step": 2347 + }, + { + "epoch": 0.24345955193799454, + "grad_norm": 0.439453125, + "learning_rate": 0.00019679249989336715, + "loss": 4.5522, + "step": 2348 + }, + { + "epoch": 0.2435632399924826, + "grad_norm": 0.56640625, + "learning_rate": 0.00019678977028699318, + "loss": 4.5616, + "step": 2349 + }, + { + "epoch": 0.24366692804697068, + "grad_norm": 0.45703125, + "learning_rate": 0.0001967870395386023, + "loss": 4.5, + "step": 2350 + }, + { + "epoch": 0.24377061610145875, + "grad_norm": 0.55859375, + "learning_rate": 0.00019678430764822661, + "loss": 4.5487, + "step": 2351 + }, + { + "epoch": 0.24387430415594682, + "grad_norm": 0.466796875, + "learning_rate": 0.00019678157461589844, + "loss": 4.522, + "step": 2352 + }, + { + "epoch": 0.2439779922104349, + "grad_norm": 0.609375, + "learning_rate": 0.00019677884044164997, + "loss": 4.5293, + "step": 2353 + }, + { + "epoch": 0.24408168026492297, + "grad_norm": 0.5, + "learning_rate": 0.00019677610512551348, + "loss": 4.5629, + "step": 2354 + }, + { + "epoch": 0.24418536831941104, + "grad_norm": 0.53125, + "learning_rate": 0.00019677336866752123, + "loss": 4.5529, + "step": 2355 + }, + { + "epoch": 0.24428905637389914, + "grad_norm": 0.482421875, + "learning_rate": 0.00019677063106770555, + "loss": 4.5488, + "step": 2356 + }, + { + "epoch": 0.2443927444283872, + "grad_norm": 0.52734375, + "learning_rate": 0.00019676789232609868, + "loss": 4.5474, + "step": 2357 + }, + { + "epoch": 0.24449643248287528, + "grad_norm": 0.515625, + "learning_rate": 0.000196765152442733, + "loss": 4.5178, + "step": 2358 + }, + { + "epoch": 0.24460012053736335, + "grad_norm": 0.5078125, + "learning_rate": 0.0001967624114176408, + "loss": 4.5141, + "step": 2359 + }, + { + "epoch": 0.24470380859185142, + "grad_norm": 0.51953125, + "learning_rate": 0.00019675966925085443, + "loss": 4.4834, + "step": 2360 + }, + { + "epoch": 0.2448074966463395, + "grad_norm": 0.470703125, + "learning_rate": 0.00019675692594240624, + "loss": 4.5927, + "step": 2361 + }, + { + "epoch": 0.24491118470082757, + "grad_norm": 0.45703125, + "learning_rate": 0.0001967541814923286, + "loss": 4.5491, + "step": 2362 + }, + { + "epoch": 0.24501487275531564, + "grad_norm": 0.490234375, + "learning_rate": 0.00019675143590065387, + "loss": 4.5038, + "step": 2363 + }, + { + "epoch": 0.2451185608098037, + "grad_norm": 0.44921875, + "learning_rate": 0.00019674868916741452, + "loss": 4.5872, + "step": 2364 + }, + { + "epoch": 0.24522224886429178, + "grad_norm": 0.51953125, + "learning_rate": 0.00019674594129264286, + "loss": 4.5441, + "step": 2365 + }, + { + "epoch": 0.24532593691877985, + "grad_norm": 0.4921875, + "learning_rate": 0.0001967431922763714, + "loss": 4.5074, + "step": 2366 + }, + { + "epoch": 0.24542962497326792, + "grad_norm": 0.546875, + "learning_rate": 0.00019674044211863247, + "loss": 4.4877, + "step": 2367 + }, + { + "epoch": 0.245533313027756, + "grad_norm": 0.55078125, + "learning_rate": 0.00019673769081945863, + "loss": 4.527, + "step": 2368 + }, + { + "epoch": 0.24563700108224407, + "grad_norm": 0.59375, + "learning_rate": 0.00019673493837888228, + "loss": 4.5584, + "step": 2369 + }, + { + "epoch": 0.24574068913673214, + "grad_norm": 0.5703125, + "learning_rate": 0.00019673218479693592, + "loss": 4.5505, + "step": 2370 + }, + { + "epoch": 0.2458443771912202, + "grad_norm": 0.5703125, + "learning_rate": 0.00019672943007365202, + "loss": 4.5845, + "step": 2371 + }, + { + "epoch": 0.24594806524570828, + "grad_norm": 0.625, + "learning_rate": 0.00019672667420906308, + "loss": 4.5538, + "step": 2372 + }, + { + "epoch": 0.24605175330019635, + "grad_norm": 0.5703125, + "learning_rate": 0.00019672391720320165, + "loss": 4.523, + "step": 2373 + }, + { + "epoch": 0.24615544135468442, + "grad_norm": 0.62109375, + "learning_rate": 0.00019672115905610023, + "loss": 4.5312, + "step": 2374 + }, + { + "epoch": 0.2462591294091725, + "grad_norm": 0.5859375, + "learning_rate": 0.00019671839976779138, + "loss": 4.5019, + "step": 2375 + }, + { + "epoch": 0.24636281746366057, + "grad_norm": 0.57421875, + "learning_rate": 0.00019671563933830767, + "loss": 4.5387, + "step": 2376 + }, + { + "epoch": 0.24646650551814864, + "grad_norm": 0.62890625, + "learning_rate": 0.0001967128777676816, + "loss": 4.5441, + "step": 2377 + }, + { + "epoch": 0.2465701935726367, + "grad_norm": 0.60546875, + "learning_rate": 0.00019671011505594581, + "loss": 4.5023, + "step": 2378 + }, + { + "epoch": 0.2466738816271248, + "grad_norm": 0.490234375, + "learning_rate": 0.0001967073512031329, + "loss": 4.5643, + "step": 2379 + }, + { + "epoch": 0.24677756968161288, + "grad_norm": 0.5859375, + "learning_rate": 0.00019670458620927548, + "loss": 4.5538, + "step": 2380 + }, + { + "epoch": 0.24688125773610095, + "grad_norm": 0.55078125, + "learning_rate": 0.00019670182007440614, + "loss": 4.583, + "step": 2381 + }, + { + "epoch": 0.24698494579058902, + "grad_norm": 0.55078125, + "learning_rate": 0.0001966990527985576, + "loss": 4.5283, + "step": 2382 + }, + { + "epoch": 0.2470886338450771, + "grad_norm": 0.58984375, + "learning_rate": 0.0001966962843817624, + "loss": 4.5397, + "step": 2383 + }, + { + "epoch": 0.24719232189956516, + "grad_norm": 0.515625, + "learning_rate": 0.00019669351482405324, + "loss": 4.5366, + "step": 2384 + }, + { + "epoch": 0.24729600995405324, + "grad_norm": 0.71875, + "learning_rate": 0.00019669074412546284, + "loss": 4.5615, + "step": 2385 + }, + { + "epoch": 0.2473996980085413, + "grad_norm": 0.5078125, + "learning_rate": 0.0001966879722860239, + "loss": 4.5026, + "step": 2386 + }, + { + "epoch": 0.24750338606302938, + "grad_norm": 0.71484375, + "learning_rate": 0.00019668519930576904, + "loss": 4.5545, + "step": 2387 + }, + { + "epoch": 0.24760707411751745, + "grad_norm": 0.6484375, + "learning_rate": 0.00019668242518473106, + "loss": 4.5279, + "step": 2388 + }, + { + "epoch": 0.24771076217200552, + "grad_norm": 0.56640625, + "learning_rate": 0.00019667964992294264, + "loss": 4.5575, + "step": 2389 + }, + { + "epoch": 0.2478144502264936, + "grad_norm": 0.6015625, + "learning_rate": 0.00019667687352043655, + "loss": 4.5308, + "step": 2390 + }, + { + "epoch": 0.24791813828098166, + "grad_norm": 0.6015625, + "learning_rate": 0.00019667409597724553, + "loss": 4.5585, + "step": 2391 + }, + { + "epoch": 0.24802182633546974, + "grad_norm": 0.5625, + "learning_rate": 0.0001966713172934024, + "loss": 4.5285, + "step": 2392 + }, + { + "epoch": 0.2481255143899578, + "grad_norm": 0.59765625, + "learning_rate": 0.00019666853746893987, + "loss": 4.4994, + "step": 2393 + }, + { + "epoch": 0.24822920244444588, + "grad_norm": 0.5546875, + "learning_rate": 0.00019666575650389084, + "loss": 4.491, + "step": 2394 + }, + { + "epoch": 0.24833289049893395, + "grad_norm": 0.53515625, + "learning_rate": 0.000196662974398288, + "loss": 4.4838, + "step": 2395 + }, + { + "epoch": 0.24843657855342202, + "grad_norm": 0.53125, + "learning_rate": 0.0001966601911521643, + "loss": 4.5466, + "step": 2396 + }, + { + "epoch": 0.2485402666079101, + "grad_norm": 0.59765625, + "learning_rate": 0.00019665740676555246, + "loss": 4.5693, + "step": 2397 + }, + { + "epoch": 0.24864395466239816, + "grad_norm": 0.53515625, + "learning_rate": 0.00019665462123848545, + "loss": 4.5713, + "step": 2398 + }, + { + "epoch": 0.24874764271688624, + "grad_norm": 0.59765625, + "learning_rate": 0.00019665183457099602, + "loss": 4.5254, + "step": 2399 + }, + { + "epoch": 0.2488513307713743, + "grad_norm": 0.53125, + "learning_rate": 0.00019664904676311716, + "loss": 4.4666, + "step": 2400 + }, + { + "epoch": 0.2489550188258624, + "grad_norm": 0.57421875, + "learning_rate": 0.00019664625781488167, + "loss": 4.4776, + "step": 2401 + }, + { + "epoch": 0.24905870688035048, + "grad_norm": 0.56640625, + "learning_rate": 0.00019664346772632252, + "loss": 4.5156, + "step": 2402 + }, + { + "epoch": 0.24916239493483855, + "grad_norm": 0.56640625, + "learning_rate": 0.0001966406764974726, + "loss": 4.5436, + "step": 2403 + }, + { + "epoch": 0.24926608298932662, + "grad_norm": 0.54296875, + "learning_rate": 0.00019663788412836483, + "loss": 4.4748, + "step": 2404 + }, + { + "epoch": 0.2493697710438147, + "grad_norm": 0.60546875, + "learning_rate": 0.0001966350906190322, + "loss": 4.5189, + "step": 2405 + }, + { + "epoch": 0.24947345909830276, + "grad_norm": 0.6796875, + "learning_rate": 0.00019663229596950766, + "loss": 4.4902, + "step": 2406 + }, + { + "epoch": 0.24957714715279083, + "grad_norm": 0.59375, + "learning_rate": 0.00019662950017982416, + "loss": 4.5363, + "step": 2407 + }, + { + "epoch": 0.2496808352072789, + "grad_norm": 0.55078125, + "learning_rate": 0.00019662670325001468, + "loss": 4.5453, + "step": 2408 + }, + { + "epoch": 0.24978452326176698, + "grad_norm": 0.578125, + "learning_rate": 0.00019662390518011228, + "loss": 4.5477, + "step": 2409 + }, + { + "epoch": 0.24988821131625505, + "grad_norm": 0.49609375, + "learning_rate": 0.0001966211059701499, + "loss": 4.5409, + "step": 2410 + }, + { + "epoch": 0.24999189937074312, + "grad_norm": 0.61328125, + "learning_rate": 0.0001966183056201606, + "loss": 4.525, + "step": 2411 + }, + { + "epoch": 0.24999189937074312, + "eval_loss": 4.54582405090332, + "eval_runtime": 0.4393, + "eval_samples_per_second": 339.196, + "eval_steps_per_second": 15.935, + "step": 2411 + } + ], + "logging_steps": 1, + "max_steps": 28932, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 2411, + "total_flos": 3.4729528980175585e+18, + "train_batch_size": 3, + "trial_name": null, + "trial_params": null +}