diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6134 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.4999552075681293, + "eval_steps": 500, + "global_step": 872, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005733431279451024, + "grad_norm": 343.8302001953125, + "learning_rate": 3.7037037037037036e-07, + "loss": 15.7056, + "step": 1 + }, + { + "epoch": 0.0011466862558902047, + "grad_norm": 354.2492370605469, + "learning_rate": 7.407407407407407e-07, + "loss": 15.7685, + "step": 2 + }, + { + "epoch": 0.0017200293838353072, + "grad_norm": 332.5870666503906, + "learning_rate": 1.111111111111111e-06, + "loss": 15.7735, + "step": 3 + }, + { + "epoch": 0.0022933725117804094, + "grad_norm": 337.5965576171875, + "learning_rate": 1.4814814814814815e-06, + "loss": 15.6978, + "step": 4 + }, + { + "epoch": 0.002866715639725512, + "grad_norm": 337.68743896484375, + "learning_rate": 1.8518518518518519e-06, + "loss": 15.7328, + "step": 5 + }, + { + "epoch": 0.0034400587676706143, + "grad_norm": 352.12841796875, + "learning_rate": 2.222222222222222e-06, + "loss": 15.7606, + "step": 6 + }, + { + "epoch": 0.004013401895615717, + "grad_norm": 350.4210510253906, + "learning_rate": 2.5925925925925925e-06, + "loss": 15.7352, + "step": 7 + }, + { + "epoch": 0.004586745023560819, + "grad_norm": 326.6189270019531, + "learning_rate": 2.962962962962963e-06, + "loss": 15.7233, + "step": 8 + }, + { + "epoch": 0.005160088151505922, + "grad_norm": 337.8202819824219, + "learning_rate": 3.3333333333333333e-06, + "loss": 15.7473, + "step": 9 + }, + { + "epoch": 0.005733431279451024, + "grad_norm": 346.01373291015625, + "learning_rate": 3.7037037037037037e-06, + "loss": 15.7525, + "step": 10 + }, + { + "epoch": 0.006306774407396127, + "grad_norm": 345.3333435058594, + "learning_rate": 4.074074074074074e-06, + "loss": 15.7312, + "step": 11 + }, + { + "epoch": 0.006880117535341229, + "grad_norm": 366.6058044433594, + "learning_rate": 4.444444444444444e-06, + "loss": 15.8701, + "step": 12 + }, + { + "epoch": 0.0074534606632863315, + "grad_norm": 343.2021484375, + "learning_rate": 4.814814814814815e-06, + "loss": 15.7247, + "step": 13 + }, + { + "epoch": 0.008026803791231434, + "grad_norm": 355.9106750488281, + "learning_rate": 5.185185185185185e-06, + "loss": 15.8606, + "step": 14 + }, + { + "epoch": 0.008600146919176536, + "grad_norm": 348.240966796875, + "learning_rate": 5.555555555555557e-06, + "loss": 15.7669, + "step": 15 + }, + { + "epoch": 0.009173490047121638, + "grad_norm": 338.352783203125, + "learning_rate": 5.925925925925926e-06, + "loss": 15.7275, + "step": 16 + }, + { + "epoch": 0.00974683317506674, + "grad_norm": 345.3967590332031, + "learning_rate": 6.296296296296297e-06, + "loss": 15.6908, + "step": 17 + }, + { + "epoch": 0.010320176303011843, + "grad_norm": 337.52459716796875, + "learning_rate": 6.666666666666667e-06, + "loss": 15.7063, + "step": 18 + }, + { + "epoch": 0.010893519430956946, + "grad_norm": 348.9382019042969, + "learning_rate": 7.0370370370370375e-06, + "loss": 15.7312, + "step": 19 + }, + { + "epoch": 0.011466862558902047, + "grad_norm": 367.71954345703125, + "learning_rate": 7.4074074074074075e-06, + "loss": 15.6506, + "step": 20 + }, + { + "epoch": 0.01204020568684715, + "grad_norm": 340.8507080078125, + "learning_rate": 7.77777777777778e-06, + "loss": 15.6737, + "step": 21 + }, + { + "epoch": 0.012613548814792253, + "grad_norm": 342.04400634765625, + "learning_rate": 8.148148148148148e-06, + "loss": 15.7023, + "step": 22 + }, + { + "epoch": 0.013186891942737354, + "grad_norm": 341.126708984375, + "learning_rate": 8.518518518518519e-06, + "loss": 15.7475, + "step": 23 + }, + { + "epoch": 0.013760235070682457, + "grad_norm": 355.2649841308594, + "learning_rate": 8.888888888888888e-06, + "loss": 15.719, + "step": 24 + }, + { + "epoch": 0.01433357819862756, + "grad_norm": 332.40753173828125, + "learning_rate": 9.25925925925926e-06, + "loss": 15.7316, + "step": 25 + }, + { + "epoch": 0.014906921326572663, + "grad_norm": 337.4178771972656, + "learning_rate": 9.62962962962963e-06, + "loss": 15.7153, + "step": 26 + }, + { + "epoch": 0.015480264454517764, + "grad_norm": 337.7605895996094, + "learning_rate": 1e-05, + "loss": 15.6421, + "step": 27 + }, + { + "epoch": 0.016053607582462867, + "grad_norm": 339.8226013183594, + "learning_rate": 9.999965443811378e-06, + "loss": 15.7325, + "step": 28 + }, + { + "epoch": 0.01662695071040797, + "grad_norm": 338.6639709472656, + "learning_rate": 9.999861775723162e-06, + "loss": 15.5632, + "step": 29 + }, + { + "epoch": 0.017200293838353073, + "grad_norm": 313.73358154296875, + "learning_rate": 9.999688997168301e-06, + "loss": 15.6312, + "step": 30 + }, + { + "epoch": 0.017773636966298176, + "grad_norm": 324.01812744140625, + "learning_rate": 9.999447110535026e-06, + "loss": 15.568, + "step": 31 + }, + { + "epoch": 0.018346980094243275, + "grad_norm": 350.6103820800781, + "learning_rate": 9.999136119166803e-06, + "loss": 15.6763, + "step": 32 + }, + { + "epoch": 0.018920323222188378, + "grad_norm": 345.98101806640625, + "learning_rate": 9.998756027362308e-06, + "loss": 15.6779, + "step": 33 + }, + { + "epoch": 0.01949366635013348, + "grad_norm": 332.9476013183594, + "learning_rate": 9.99830684037535e-06, + "loss": 15.5666, + "step": 34 + }, + { + "epoch": 0.020067009478078584, + "grad_norm": 323.1610412597656, + "learning_rate": 9.9977885644148e-06, + "loss": 15.5645, + "step": 35 + }, + { + "epoch": 0.020640352606023687, + "grad_norm": 345.10504150390625, + "learning_rate": 9.997201206644522e-06, + "loss": 15.6663, + "step": 36 + }, + { + "epoch": 0.02121369573396879, + "grad_norm": 327.5580749511719, + "learning_rate": 9.99654477518325e-06, + "loss": 15.5561, + "step": 37 + }, + { + "epoch": 0.021787038861913893, + "grad_norm": 321.0745849609375, + "learning_rate": 9.995819279104494e-06, + "loss": 15.6465, + "step": 38 + }, + { + "epoch": 0.022360381989858992, + "grad_norm": 331.85736083984375, + "learning_rate": 9.995024728436402e-06, + "loss": 15.6326, + "step": 39 + }, + { + "epoch": 0.022933725117804095, + "grad_norm": 317.6231994628906, + "learning_rate": 9.994161134161635e-06, + "loss": 15.5944, + "step": 40 + }, + { + "epoch": 0.023507068245749198, + "grad_norm": 329.2905578613281, + "learning_rate": 9.993228508217201e-06, + "loss": 15.6057, + "step": 41 + }, + { + "epoch": 0.0240804113736943, + "grad_norm": 331.61407470703125, + "learning_rate": 9.9922268634943e-06, + "loss": 15.5034, + "step": 42 + }, + { + "epoch": 0.024653754501639404, + "grad_norm": 308.8927917480469, + "learning_rate": 9.991156213838143e-06, + "loss": 15.5626, + "step": 43 + }, + { + "epoch": 0.025227097629584506, + "grad_norm": 331.08624267578125, + "learning_rate": 9.990016574047757e-06, + "loss": 15.628, + "step": 44 + }, + { + "epoch": 0.02580044075752961, + "grad_norm": 343.9646911621094, + "learning_rate": 9.988807959875785e-06, + "loss": 15.52, + "step": 45 + }, + { + "epoch": 0.02637378388547471, + "grad_norm": 317.4025573730469, + "learning_rate": 9.987530388028269e-06, + "loss": 15.5872, + "step": 46 + }, + { + "epoch": 0.02694712701341981, + "grad_norm": 322.307373046875, + "learning_rate": 9.986183876164412e-06, + "loss": 15.4988, + "step": 47 + }, + { + "epoch": 0.027520470141364915, + "grad_norm": 317.954833984375, + "learning_rate": 9.984768442896342e-06, + "loss": 15.4338, + "step": 48 + }, + { + "epoch": 0.028093813269310017, + "grad_norm": 316.19512939453125, + "learning_rate": 9.983284107788852e-06, + "loss": 15.464, + "step": 49 + }, + { + "epoch": 0.02866715639725512, + "grad_norm": 309.9515686035156, + "learning_rate": 9.981730891359123e-06, + "loss": 15.4762, + "step": 50 + }, + { + "epoch": 0.029240499525200223, + "grad_norm": 325.8763427734375, + "learning_rate": 9.980108815076456e-06, + "loss": 15.4914, + "step": 51 + }, + { + "epoch": 0.029813842653145326, + "grad_norm": 310.78424072265625, + "learning_rate": 9.978417901361958e-06, + "loss": 15.5108, + "step": 52 + }, + { + "epoch": 0.030387185781090426, + "grad_norm": 335.9707946777344, + "learning_rate": 9.976658173588244e-06, + "loss": 15.5588, + "step": 53 + }, + { + "epoch": 0.03096052890903553, + "grad_norm": 325.359375, + "learning_rate": 9.974829656079106e-06, + "loss": 15.5329, + "step": 54 + }, + { + "epoch": 0.03153387203698063, + "grad_norm": 313.94293212890625, + "learning_rate": 9.972932374109184e-06, + "loss": 15.4339, + "step": 55 + }, + { + "epoch": 0.032107215164925734, + "grad_norm": 328.7725830078125, + "learning_rate": 9.97096635390361e-06, + "loss": 15.5493, + "step": 56 + }, + { + "epoch": 0.03268055829287084, + "grad_norm": 329.2198791503906, + "learning_rate": 9.968931622637652e-06, + "loss": 15.5452, + "step": 57 + }, + { + "epoch": 0.03325390142081594, + "grad_norm": 323.4937438964844, + "learning_rate": 9.966828208436332e-06, + "loss": 15.4639, + "step": 58 + }, + { + "epoch": 0.03382724454876104, + "grad_norm": 318.2495422363281, + "learning_rate": 9.96465614037404e-06, + "loss": 15.4536, + "step": 59 + }, + { + "epoch": 0.034400587676706146, + "grad_norm": 315.8673095703125, + "learning_rate": 9.962415448474134e-06, + "loss": 15.4971, + "step": 60 + }, + { + "epoch": 0.03497393080465125, + "grad_norm": 313.1468505859375, + "learning_rate": 9.960106163708522e-06, + "loss": 15.4236, + "step": 61 + }, + { + "epoch": 0.03554727393259635, + "grad_norm": 311.7138977050781, + "learning_rate": 9.95772831799724e-06, + "loss": 15.4216, + "step": 62 + }, + { + "epoch": 0.03612061706054145, + "grad_norm": 306.1082763671875, + "learning_rate": 9.955281944207998e-06, + "loss": 15.4497, + "step": 63 + }, + { + "epoch": 0.03669396018848655, + "grad_norm": 320.2241516113281, + "learning_rate": 9.95276707615574e-06, + "loss": 15.4434, + "step": 64 + }, + { + "epoch": 0.03726730331643165, + "grad_norm": 314.4595642089844, + "learning_rate": 9.950183748602164e-06, + "loss": 15.4662, + "step": 65 + }, + { + "epoch": 0.037840646444376756, + "grad_norm": 329.10784912109375, + "learning_rate": 9.947531997255256e-06, + "loss": 15.4808, + "step": 66 + }, + { + "epoch": 0.03841398957232186, + "grad_norm": 312.20135498046875, + "learning_rate": 9.944811858768782e-06, + "loss": 15.4633, + "step": 67 + }, + { + "epoch": 0.03898733270026696, + "grad_norm": 331.2897033691406, + "learning_rate": 9.94202337074179e-06, + "loss": 15.4861, + "step": 68 + }, + { + "epoch": 0.039560675828212065, + "grad_norm": 325.3959655761719, + "learning_rate": 9.939166571718086e-06, + "loss": 15.349, + "step": 69 + }, + { + "epoch": 0.04013401895615717, + "grad_norm": 312.45513916015625, + "learning_rate": 9.936241501185706e-06, + "loss": 15.476, + "step": 70 + }, + { + "epoch": 0.04070736208410227, + "grad_norm": 318.12322998046875, + "learning_rate": 9.933248199576366e-06, + "loss": 15.5243, + "step": 71 + }, + { + "epoch": 0.041280705212047374, + "grad_norm": 313.1653137207031, + "learning_rate": 9.930186708264902e-06, + "loss": 15.4566, + "step": 72 + }, + { + "epoch": 0.041854048339992476, + "grad_norm": 310.0314636230469, + "learning_rate": 9.927057069568704e-06, + "loss": 15.4365, + "step": 73 + }, + { + "epoch": 0.04242739146793758, + "grad_norm": 319.7293701171875, + "learning_rate": 9.923859326747125e-06, + "loss": 15.4605, + "step": 74 + }, + { + "epoch": 0.04300073459588268, + "grad_norm": 306.68524169921875, + "learning_rate": 9.920593524000887e-06, + "loss": 15.3812, + "step": 75 + }, + { + "epoch": 0.043574077723827785, + "grad_norm": 309.94500732421875, + "learning_rate": 9.917259706471469e-06, + "loss": 15.3971, + "step": 76 + }, + { + "epoch": 0.04414742085177288, + "grad_norm": 321.57647705078125, + "learning_rate": 9.913857920240481e-06, + "loss": 15.471, + "step": 77 + }, + { + "epoch": 0.044720763979717984, + "grad_norm": 310.59991455078125, + "learning_rate": 9.91038821232903e-06, + "loss": 15.4669, + "step": 78 + }, + { + "epoch": 0.04529410710766309, + "grad_norm": 298.2730407714844, + "learning_rate": 9.906850630697068e-06, + "loss": 15.4534, + "step": 79 + }, + { + "epoch": 0.04586745023560819, + "grad_norm": 303.1147766113281, + "learning_rate": 9.903245224242732e-06, + "loss": 15.3767, + "step": 80 + }, + { + "epoch": 0.04644079336355329, + "grad_norm": 299.9115905761719, + "learning_rate": 9.899572042801662e-06, + "loss": 15.3181, + "step": 81 + }, + { + "epoch": 0.047014136491498396, + "grad_norm": 299.8761901855469, + "learning_rate": 9.895831137146319e-06, + "loss": 15.3273, + "step": 82 + }, + { + "epoch": 0.0475874796194435, + "grad_norm": 305.44244384765625, + "learning_rate": 9.89202255898528e-06, + "loss": 15.3504, + "step": 83 + }, + { + "epoch": 0.0481608227473886, + "grad_norm": 302.8594055175781, + "learning_rate": 9.888146360962523e-06, + "loss": 15.4113, + "step": 84 + }, + { + "epoch": 0.048734165875333704, + "grad_norm": 310.83587646484375, + "learning_rate": 9.8842025966567e-06, + "loss": 15.4274, + "step": 85 + }, + { + "epoch": 0.04930750900327881, + "grad_norm": 292.6897277832031, + "learning_rate": 9.880191320580396e-06, + "loss": 15.2777, + "step": 86 + }, + { + "epoch": 0.04988085213122391, + "grad_norm": 308.5329895019531, + "learning_rate": 9.876112588179378e-06, + "loss": 15.3073, + "step": 87 + }, + { + "epoch": 0.05045419525916901, + "grad_norm": 295.7265319824219, + "learning_rate": 9.87196645583182e-06, + "loss": 15.3201, + "step": 88 + }, + { + "epoch": 0.051027538387114116, + "grad_norm": 300.5785827636719, + "learning_rate": 9.86775298084754e-06, + "loss": 15.4455, + "step": 89 + }, + { + "epoch": 0.05160088151505922, + "grad_norm": 293.6327819824219, + "learning_rate": 9.863472221467189e-06, + "loss": 15.4047, + "step": 90 + }, + { + "epoch": 0.052174224643004315, + "grad_norm": 314.74468994140625, + "learning_rate": 9.85912423686146e-06, + "loss": 15.4229, + "step": 91 + }, + { + "epoch": 0.05274756777094942, + "grad_norm": 292.306640625, + "learning_rate": 9.854709087130261e-06, + "loss": 15.3212, + "step": 92 + }, + { + "epoch": 0.05332091089889452, + "grad_norm": 321.5920715332031, + "learning_rate": 9.850226833301893e-06, + "loss": 15.3655, + "step": 93 + }, + { + "epoch": 0.05389425402683962, + "grad_norm": 288.67535400390625, + "learning_rate": 9.8456775373322e-06, + "loss": 15.3491, + "step": 94 + }, + { + "epoch": 0.054467597154784726, + "grad_norm": 301.61151123046875, + "learning_rate": 9.841061262103713e-06, + "loss": 15.4396, + "step": 95 + }, + { + "epoch": 0.05504094028272983, + "grad_norm": 291.6568908691406, + "learning_rate": 9.836378071424782e-06, + "loss": 15.3401, + "step": 96 + }, + { + "epoch": 0.05561428341067493, + "grad_norm": 292.19915771484375, + "learning_rate": 9.831628030028698e-06, + "loss": 15.3169, + "step": 97 + }, + { + "epoch": 0.056187626538620035, + "grad_norm": 291.9767150878906, + "learning_rate": 9.826811203572785e-06, + "loss": 15.3443, + "step": 98 + }, + { + "epoch": 0.05676096966656514, + "grad_norm": 304.70599365234375, + "learning_rate": 9.821927658637518e-06, + "loss": 15.3755, + "step": 99 + }, + { + "epoch": 0.05733431279451024, + "grad_norm": 302.005859375, + "learning_rate": 9.81697746272557e-06, + "loss": 15.332, + "step": 100 + }, + { + "epoch": 0.057907655922455344, + "grad_norm": 302.4617004394531, + "learning_rate": 9.811960684260907e-06, + "loss": 15.4224, + "step": 101 + }, + { + "epoch": 0.058480999050400446, + "grad_norm": 298.9280700683594, + "learning_rate": 9.80687739258782e-06, + "loss": 15.377, + "step": 102 + }, + { + "epoch": 0.05905434217834555, + "grad_norm": 287.72869873046875, + "learning_rate": 9.801727657969988e-06, + "loss": 15.3631, + "step": 103 + }, + { + "epoch": 0.05962768530629065, + "grad_norm": 275.61376953125, + "learning_rate": 9.796511551589492e-06, + "loss": 15.2808, + "step": 104 + }, + { + "epoch": 0.06020102843423575, + "grad_norm": 283.3959655761719, + "learning_rate": 9.791229145545832e-06, + "loss": 15.3636, + "step": 105 + }, + { + "epoch": 0.06077437156218085, + "grad_norm": 295.7449035644531, + "learning_rate": 9.785880512854937e-06, + "loss": 15.2886, + "step": 106 + }, + { + "epoch": 0.061347714690125954, + "grad_norm": 287.8319091796875, + "learning_rate": 9.78046572744815e-06, + "loss": 15.2978, + "step": 107 + }, + { + "epoch": 0.06192105781807106, + "grad_norm": 294.1200256347656, + "learning_rate": 9.77498486417121e-06, + "loss": 15.2803, + "step": 108 + }, + { + "epoch": 0.06249440094601616, + "grad_norm": 281.8334655761719, + "learning_rate": 9.769437998783216e-06, + "loss": 15.3278, + "step": 109 + }, + { + "epoch": 0.06306774407396126, + "grad_norm": 283.81732177734375, + "learning_rate": 9.763825207955577e-06, + "loss": 15.2408, + "step": 110 + }, + { + "epoch": 0.06364108720190637, + "grad_norm": 289.8335876464844, + "learning_rate": 9.758146569270957e-06, + "loss": 15.2072, + "step": 111 + }, + { + "epoch": 0.06421443032985147, + "grad_norm": 283.79541015625, + "learning_rate": 9.7524021612222e-06, + "loss": 15.2841, + "step": 112 + }, + { + "epoch": 0.06478777345779657, + "grad_norm": 278.183349609375, + "learning_rate": 9.746592063211247e-06, + "loss": 15.2678, + "step": 113 + }, + { + "epoch": 0.06536111658574167, + "grad_norm": 285.3382568359375, + "learning_rate": 9.74071635554803e-06, + "loss": 15.2402, + "step": 114 + }, + { + "epoch": 0.06593445971368678, + "grad_norm": 278.955078125, + "learning_rate": 9.73477511944938e-06, + "loss": 15.3042, + "step": 115 + }, + { + "epoch": 0.06650780284163188, + "grad_norm": 279.9234924316406, + "learning_rate": 9.728768437037882e-06, + "loss": 15.2099, + "step": 116 + }, + { + "epoch": 0.06708114596957698, + "grad_norm": 279.9479064941406, + "learning_rate": 9.722696391340762e-06, + "loss": 15.3344, + "step": 117 + }, + { + "epoch": 0.06765448909752209, + "grad_norm": 286.9149169921875, + "learning_rate": 9.716559066288716e-06, + "loss": 15.2665, + "step": 118 + }, + { + "epoch": 0.06822783222546719, + "grad_norm": 306.3753356933594, + "learning_rate": 9.710356546714774e-06, + "loss": 15.2843, + "step": 119 + }, + { + "epoch": 0.06880117535341229, + "grad_norm": 276.3395690917969, + "learning_rate": 9.704088918353108e-06, + "loss": 15.2029, + "step": 120 + }, + { + "epoch": 0.0693745184813574, + "grad_norm": 283.53411865234375, + "learning_rate": 9.697756267837856e-06, + "loss": 15.2337, + "step": 121 + }, + { + "epoch": 0.0699478616093025, + "grad_norm": 288.1679382324219, + "learning_rate": 9.691358682701927e-06, + "loss": 15.1838, + "step": 122 + }, + { + "epoch": 0.0705212047372476, + "grad_norm": 275.3481750488281, + "learning_rate": 9.684896251375784e-06, + "loss": 15.214, + "step": 123 + }, + { + "epoch": 0.0710945478651927, + "grad_norm": 271.927490234375, + "learning_rate": 9.678369063186224e-06, + "loss": 15.2439, + "step": 124 + }, + { + "epoch": 0.0716678909931378, + "grad_norm": 280.0527648925781, + "learning_rate": 9.671777208355146e-06, + "loss": 15.2833, + "step": 125 + }, + { + "epoch": 0.0722412341210829, + "grad_norm": 286.959228515625, + "learning_rate": 9.665120777998303e-06, + "loss": 15.3076, + "step": 126 + }, + { + "epoch": 0.072814577249028, + "grad_norm": 268.98553466796875, + "learning_rate": 9.658399864124037e-06, + "loss": 15.3274, + "step": 127 + }, + { + "epoch": 0.0733879203769731, + "grad_norm": 261.5675964355469, + "learning_rate": 9.65161455963202e-06, + "loss": 15.2216, + "step": 128 + }, + { + "epoch": 0.0739612635049182, + "grad_norm": 272.29425048828125, + "learning_rate": 9.64476495831195e-06, + "loss": 15.2385, + "step": 129 + }, + { + "epoch": 0.0745346066328633, + "grad_norm": 282.3617248535156, + "learning_rate": 9.637851154842279e-06, + "loss": 15.2864, + "step": 130 + }, + { + "epoch": 0.07510794976080841, + "grad_norm": 260.4862976074219, + "learning_rate": 9.630873244788884e-06, + "loss": 15.3039, + "step": 131 + }, + { + "epoch": 0.07568129288875351, + "grad_norm": 268.15582275390625, + "learning_rate": 9.623831324603755e-06, + "loss": 15.2402, + "step": 132 + }, + { + "epoch": 0.07625463601669862, + "grad_norm": 275.54180908203125, + "learning_rate": 9.61672549162366e-06, + "loss": 15.2216, + "step": 133 + }, + { + "epoch": 0.07682797914464372, + "grad_norm": 274.50299072265625, + "learning_rate": 9.6095558440688e-06, + "loss": 15.2265, + "step": 134 + }, + { + "epoch": 0.07740132227258882, + "grad_norm": 274.8090515136719, + "learning_rate": 9.602322481041457e-06, + "loss": 15.2518, + "step": 135 + }, + { + "epoch": 0.07797466540053392, + "grad_norm": 264.6287841796875, + "learning_rate": 9.595025502524609e-06, + "loss": 15.2621, + "step": 136 + }, + { + "epoch": 0.07854800852847903, + "grad_norm": 261.9557189941406, + "learning_rate": 9.587665009380565e-06, + "loss": 15.2255, + "step": 137 + }, + { + "epoch": 0.07912135165642413, + "grad_norm": 264.4668273925781, + "learning_rate": 9.580241103349562e-06, + "loss": 15.1974, + "step": 138 + }, + { + "epoch": 0.07969469478436923, + "grad_norm": 268.053955078125, + "learning_rate": 9.572753887048353e-06, + "loss": 15.1732, + "step": 139 + }, + { + "epoch": 0.08026803791231434, + "grad_norm": 275.5241394042969, + "learning_rate": 9.565203463968808e-06, + "loss": 15.2277, + "step": 140 + }, + { + "epoch": 0.08084138104025944, + "grad_norm": 270.20001220703125, + "learning_rate": 9.557589938476462e-06, + "loss": 15.2393, + "step": 141 + }, + { + "epoch": 0.08141472416820454, + "grad_norm": 274.71453857421875, + "learning_rate": 9.549913415809084e-06, + "loss": 15.1832, + "step": 142 + }, + { + "epoch": 0.08198806729614964, + "grad_norm": 266.2647399902344, + "learning_rate": 9.542174002075221e-06, + "loss": 15.1934, + "step": 143 + }, + { + "epoch": 0.08256141042409475, + "grad_norm": 270.1286315917969, + "learning_rate": 9.534371804252727e-06, + "loss": 15.1652, + "step": 144 + }, + { + "epoch": 0.08313475355203985, + "grad_norm": 253.12673950195312, + "learning_rate": 9.526506930187294e-06, + "loss": 15.2471, + "step": 145 + }, + { + "epoch": 0.08370809667998495, + "grad_norm": 266.5976867675781, + "learning_rate": 9.518579488590947e-06, + "loss": 15.26, + "step": 146 + }, + { + "epoch": 0.08428143980793006, + "grad_norm": 264.99481201171875, + "learning_rate": 9.510589589040554e-06, + "loss": 15.1794, + "step": 147 + }, + { + "epoch": 0.08485478293587516, + "grad_norm": 255.4492950439453, + "learning_rate": 9.502537341976305e-06, + "loss": 15.2214, + "step": 148 + }, + { + "epoch": 0.08542812606382026, + "grad_norm": 264.4046325683594, + "learning_rate": 9.494422858700188e-06, + "loss": 15.1397, + "step": 149 + }, + { + "epoch": 0.08600146919176536, + "grad_norm": 276.0195007324219, + "learning_rate": 9.48624625137445e-06, + "loss": 15.2514, + "step": 150 + }, + { + "epoch": 0.08657481231971047, + "grad_norm": 261.25848388671875, + "learning_rate": 9.478007633020043e-06, + "loss": 15.1633, + "step": 151 + }, + { + "epoch": 0.08714815544765557, + "grad_norm": 273.81439208984375, + "learning_rate": 9.469707117515068e-06, + "loss": 15.3146, + "step": 152 + }, + { + "epoch": 0.08772149857560067, + "grad_norm": 278.4958801269531, + "learning_rate": 9.461344819593194e-06, + "loss": 15.2173, + "step": 153 + }, + { + "epoch": 0.08829484170354576, + "grad_norm": 270.7554931640625, + "learning_rate": 9.452920854842085e-06, + "loss": 15.2049, + "step": 154 + }, + { + "epoch": 0.08886818483149087, + "grad_norm": 277.895751953125, + "learning_rate": 9.44443533970178e-06, + "loss": 15.2012, + "step": 155 + }, + { + "epoch": 0.08944152795943597, + "grad_norm": 260.6186828613281, + "learning_rate": 9.435888391463108e-06, + "loss": 15.1519, + "step": 156 + }, + { + "epoch": 0.09001487108738107, + "grad_norm": 266.2400817871094, + "learning_rate": 9.427280128266049e-06, + "loss": 15.1982, + "step": 157 + }, + { + "epoch": 0.09058821421532617, + "grad_norm": 258.3689270019531, + "learning_rate": 9.418610669098114e-06, + "loss": 15.2358, + "step": 158 + }, + { + "epoch": 0.09116155734327128, + "grad_norm": 255.73751831054688, + "learning_rate": 9.409880133792684e-06, + "loss": 15.2167, + "step": 159 + }, + { + "epoch": 0.09173490047121638, + "grad_norm": 257.6156311035156, + "learning_rate": 9.40108864302737e-06, + "loss": 15.1499, + "step": 160 + }, + { + "epoch": 0.09230824359916148, + "grad_norm": 259.1768493652344, + "learning_rate": 9.392236318322339e-06, + "loss": 15.1413, + "step": 161 + }, + { + "epoch": 0.09288158672710659, + "grad_norm": 259.98583984375, + "learning_rate": 9.383323282038632e-06, + "loss": 15.2688, + "step": 162 + }, + { + "epoch": 0.09345492985505169, + "grad_norm": 270.8675537109375, + "learning_rate": 9.374349657376473e-06, + "loss": 15.19, + "step": 163 + }, + { + "epoch": 0.09402827298299679, + "grad_norm": 252.3112030029297, + "learning_rate": 9.365315568373569e-06, + "loss": 15.1946, + "step": 164 + }, + { + "epoch": 0.0946016161109419, + "grad_norm": 271.8454284667969, + "learning_rate": 9.356221139903395e-06, + "loss": 15.1801, + "step": 165 + }, + { + "epoch": 0.095174959238887, + "grad_norm": 252.07545471191406, + "learning_rate": 9.347066497673462e-06, + "loss": 15.169, + "step": 166 + }, + { + "epoch": 0.0957483023668321, + "grad_norm": 274.141357421875, + "learning_rate": 9.337851768223589e-06, + "loss": 15.2279, + "step": 167 + }, + { + "epoch": 0.0963216454947772, + "grad_norm": 257.8874206542969, + "learning_rate": 9.328577078924151e-06, + "loss": 15.2368, + "step": 168 + }, + { + "epoch": 0.0968949886227223, + "grad_norm": 259.5989990234375, + "learning_rate": 9.319242557974306e-06, + "loss": 15.1261, + "step": 169 + }, + { + "epoch": 0.09746833175066741, + "grad_norm": 268.7466735839844, + "learning_rate": 9.309848334400247e-06, + "loss": 15.1956, + "step": 170 + }, + { + "epoch": 0.09804167487861251, + "grad_norm": 250.24107360839844, + "learning_rate": 9.300394538053395e-06, + "loss": 15.2186, + "step": 171 + }, + { + "epoch": 0.09861501800655761, + "grad_norm": 242.27389526367188, + "learning_rate": 9.29088129960862e-06, + "loss": 15.208, + "step": 172 + }, + { + "epoch": 0.09918836113450272, + "grad_norm": 257.0928649902344, + "learning_rate": 9.281308750562426e-06, + "loss": 15.2165, + "step": 173 + }, + { + "epoch": 0.09976170426244782, + "grad_norm": 252.54974365234375, + "learning_rate": 9.271677023231137e-06, + "loss": 15.2131, + "step": 174 + }, + { + "epoch": 0.10033504739039292, + "grad_norm": 257.41192626953125, + "learning_rate": 9.261986250749068e-06, + "loss": 15.1474, + "step": 175 + }, + { + "epoch": 0.10090839051833803, + "grad_norm": 260.325439453125, + "learning_rate": 9.252236567066686e-06, + "loss": 15.1335, + "step": 176 + }, + { + "epoch": 0.10148173364628313, + "grad_norm": 265.9437561035156, + "learning_rate": 9.242428106948748e-06, + "loss": 15.2201, + "step": 177 + }, + { + "epoch": 0.10205507677422823, + "grad_norm": 255.51026916503906, + "learning_rate": 9.23256100597246e-06, + "loss": 15.167, + "step": 178 + }, + { + "epoch": 0.10262841990217333, + "grad_norm": 254.357666015625, + "learning_rate": 9.22263540052558e-06, + "loss": 15.2428, + "step": 179 + }, + { + "epoch": 0.10320176303011844, + "grad_norm": 253.48025512695312, + "learning_rate": 9.212651427804544e-06, + "loss": 15.0791, + "step": 180 + }, + { + "epoch": 0.10377510615806354, + "grad_norm": 258.47149658203125, + "learning_rate": 9.202609225812572e-06, + "loss": 15.2475, + "step": 181 + }, + { + "epoch": 0.10434844928600863, + "grad_norm": 257.2544860839844, + "learning_rate": 9.192508933357753e-06, + "loss": 15.1288, + "step": 182 + }, + { + "epoch": 0.10492179241395373, + "grad_norm": 250.79588317871094, + "learning_rate": 9.182350690051134e-06, + "loss": 15.1739, + "step": 183 + }, + { + "epoch": 0.10549513554189884, + "grad_norm": 275.7869873046875, + "learning_rate": 9.172134636304783e-06, + "loss": 15.1487, + "step": 184 + }, + { + "epoch": 0.10606847866984394, + "grad_norm": 256.7626647949219, + "learning_rate": 9.16186091332985e-06, + "loss": 15.1919, + "step": 185 + }, + { + "epoch": 0.10664182179778904, + "grad_norm": 255.94090270996094, + "learning_rate": 9.15152966313462e-06, + "loss": 15.1635, + "step": 186 + }, + { + "epoch": 0.10721516492573414, + "grad_norm": 267.16448974609375, + "learning_rate": 9.141141028522544e-06, + "loss": 15.1597, + "step": 187 + }, + { + "epoch": 0.10778850805367925, + "grad_norm": 258.8427734375, + "learning_rate": 9.130695153090272e-06, + "loss": 15.1459, + "step": 188 + }, + { + "epoch": 0.10836185118162435, + "grad_norm": 253.86849975585938, + "learning_rate": 9.120192181225658e-06, + "loss": 15.1216, + "step": 189 + }, + { + "epoch": 0.10893519430956945, + "grad_norm": 265.7057189941406, + "learning_rate": 9.109632258105771e-06, + "loss": 15.1723, + "step": 190 + }, + { + "epoch": 0.10950853743751456, + "grad_norm": 250.55398559570312, + "learning_rate": 9.099015529694894e-06, + "loss": 15.026, + "step": 191 + }, + { + "epoch": 0.11008188056545966, + "grad_norm": 255.69390869140625, + "learning_rate": 9.088342142742493e-06, + "loss": 15.1254, + "step": 192 + }, + { + "epoch": 0.11065522369340476, + "grad_norm": 254.11236572265625, + "learning_rate": 9.077612244781196e-06, + "loss": 15.079, + "step": 193 + }, + { + "epoch": 0.11122856682134986, + "grad_norm": 247.76478576660156, + "learning_rate": 9.066825984124751e-06, + "loss": 15.1122, + "step": 194 + }, + { + "epoch": 0.11180190994929497, + "grad_norm": 265.3432922363281, + "learning_rate": 9.055983509865988e-06, + "loss": 15.305, + "step": 195 + }, + { + "epoch": 0.11237525307724007, + "grad_norm": 244.3975067138672, + "learning_rate": 9.045084971874738e-06, + "loss": 15.1207, + "step": 196 + }, + { + "epoch": 0.11294859620518517, + "grad_norm": 245.3219757080078, + "learning_rate": 9.034130520795774e-06, + "loss": 15.2254, + "step": 197 + }, + { + "epoch": 0.11352193933313028, + "grad_norm": 248.05052185058594, + "learning_rate": 9.023120308046726e-06, + "loss": 15.0549, + "step": 198 + }, + { + "epoch": 0.11409528246107538, + "grad_norm": 249.66659545898438, + "learning_rate": 9.012054485815995e-06, + "loss": 15.0402, + "step": 199 + }, + { + "epoch": 0.11466862558902048, + "grad_norm": 247.83876037597656, + "learning_rate": 9.00093320706063e-06, + "loss": 15.1167, + "step": 200 + }, + { + "epoch": 0.11524196871696558, + "grad_norm": 241.92027282714844, + "learning_rate": 8.989756625504237e-06, + "loss": 15.0883, + "step": 201 + }, + { + "epoch": 0.11581531184491069, + "grad_norm": 247.57127380371094, + "learning_rate": 8.978524895634842e-06, + "loss": 15.0762, + "step": 202 + }, + { + "epoch": 0.11638865497285579, + "grad_norm": 260.8078918457031, + "learning_rate": 8.967238172702754e-06, + "loss": 15.1708, + "step": 203 + }, + { + "epoch": 0.11696199810080089, + "grad_norm": 234.99139404296875, + "learning_rate": 8.95589661271842e-06, + "loss": 15.0437, + "step": 204 + }, + { + "epoch": 0.117535341228746, + "grad_norm": 252.7474822998047, + "learning_rate": 8.94450037245028e-06, + "loss": 15.1181, + "step": 205 + }, + { + "epoch": 0.1181086843566911, + "grad_norm": 254.7908477783203, + "learning_rate": 8.933049609422582e-06, + "loss": 15.053, + "step": 206 + }, + { + "epoch": 0.1186820274846362, + "grad_norm": 249.38302612304688, + "learning_rate": 8.921544481913218e-06, + "loss": 15.1128, + "step": 207 + }, + { + "epoch": 0.1192553706125813, + "grad_norm": 244.7653350830078, + "learning_rate": 8.909985148951528e-06, + "loss": 15.0565, + "step": 208 + }, + { + "epoch": 0.11982871374052641, + "grad_norm": 246.28976440429688, + "learning_rate": 8.898371770316113e-06, + "loss": 14.9964, + "step": 209 + }, + { + "epoch": 0.1204020568684715, + "grad_norm": 260.9649658203125, + "learning_rate": 8.886704506532611e-06, + "loss": 15.0536, + "step": 210 + }, + { + "epoch": 0.1209753999964166, + "grad_norm": 254.28854370117188, + "learning_rate": 8.874983518871488e-06, + "loss": 15.1222, + "step": 211 + }, + { + "epoch": 0.1215487431243617, + "grad_norm": 233.388427734375, + "learning_rate": 8.86320896934581e-06, + "loss": 15.1175, + "step": 212 + }, + { + "epoch": 0.1221220862523068, + "grad_norm": 264.84063720703125, + "learning_rate": 8.851381020709e-06, + "loss": 15.0966, + "step": 213 + }, + { + "epoch": 0.12269542938025191, + "grad_norm": 238.38485717773438, + "learning_rate": 8.839499836452584e-06, + "loss": 15.0013, + "step": 214 + }, + { + "epoch": 0.12326877250819701, + "grad_norm": 251.5662384033203, + "learning_rate": 8.827565580803944e-06, + "loss": 15.1437, + "step": 215 + }, + { + "epoch": 0.12384211563614211, + "grad_norm": 250.67286682128906, + "learning_rate": 8.815578418724031e-06, + "loss": 15.0635, + "step": 216 + }, + { + "epoch": 0.12441545876408722, + "grad_norm": 262.172607421875, + "learning_rate": 8.803538515905102e-06, + "loss": 15.1516, + "step": 217 + }, + { + "epoch": 0.12498880189203232, + "grad_norm": 241.4354705810547, + "learning_rate": 8.791446038768416e-06, + "loss": 15.1259, + "step": 218 + }, + { + "epoch": 0.12556214501997742, + "grad_norm": 247.70347595214844, + "learning_rate": 8.779301154461945e-06, + "loss": 15.1325, + "step": 219 + }, + { + "epoch": 0.12613548814792253, + "grad_norm": 234.08982849121094, + "learning_rate": 8.76710403085805e-06, + "loss": 15.01, + "step": 220 + }, + { + "epoch": 0.12670883127586763, + "grad_norm": 245.60804748535156, + "learning_rate": 8.754854836551174e-06, + "loss": 15.0905, + "step": 221 + }, + { + "epoch": 0.12728217440381273, + "grad_norm": 254.1485137939453, + "learning_rate": 8.742553740855507e-06, + "loss": 15.1127, + "step": 222 + }, + { + "epoch": 0.12785551753175783, + "grad_norm": 238.38563537597656, + "learning_rate": 8.730200913802638e-06, + "loss": 15.0614, + "step": 223 + }, + { + "epoch": 0.12842886065970294, + "grad_norm": 248.24403381347656, + "learning_rate": 8.717796526139218e-06, + "loss": 15.0618, + "step": 224 + }, + { + "epoch": 0.12900220378764804, + "grad_norm": 246.24209594726562, + "learning_rate": 8.70534074932459e-06, + "loss": 15.0455, + "step": 225 + }, + { + "epoch": 0.12957554691559314, + "grad_norm": 237.25454711914062, + "learning_rate": 8.692833755528426e-06, + "loss": 15.0558, + "step": 226 + }, + { + "epoch": 0.13014889004353825, + "grad_norm": 246.07095336914062, + "learning_rate": 8.680275717628336e-06, + "loss": 15.0205, + "step": 227 + }, + { + "epoch": 0.13072223317148335, + "grad_norm": 242.9619903564453, + "learning_rate": 8.667666809207495e-06, + "loss": 15.142, + "step": 228 + }, + { + "epoch": 0.13129557629942845, + "grad_norm": 242.89532470703125, + "learning_rate": 8.655007204552228e-06, + "loss": 15.0199, + "step": 229 + }, + { + "epoch": 0.13186891942737355, + "grad_norm": 254.67239379882812, + "learning_rate": 8.64229707864961e-06, + "loss": 15.088, + "step": 230 + }, + { + "epoch": 0.13244226255531866, + "grad_norm": 240.30972290039062, + "learning_rate": 8.629536607185042e-06, + "loss": 15.1037, + "step": 231 + }, + { + "epoch": 0.13301560568326376, + "grad_norm": 250.13949584960938, + "learning_rate": 8.616725966539831e-06, + "loss": 15.0717, + "step": 232 + }, + { + "epoch": 0.13358894881120886, + "grad_norm": 237.8465576171875, + "learning_rate": 8.60386533378874e-06, + "loss": 15.05, + "step": 233 + }, + { + "epoch": 0.13416229193915397, + "grad_norm": 244.82315063476562, + "learning_rate": 8.590954886697554e-06, + "loss": 15.101, + "step": 234 + }, + { + "epoch": 0.13473563506709907, + "grad_norm": 237.0764923095703, + "learning_rate": 8.577994803720605e-06, + "loss": 15.0211, + "step": 235 + }, + { + "epoch": 0.13530897819504417, + "grad_norm": 241.53424072265625, + "learning_rate": 8.564985263998327e-06, + "loss": 15.0495, + "step": 236 + }, + { + "epoch": 0.13588232132298927, + "grad_norm": 232.84251403808594, + "learning_rate": 8.551926447354759e-06, + "loss": 14.9438, + "step": 237 + }, + { + "epoch": 0.13645566445093438, + "grad_norm": 242.9515838623047, + "learning_rate": 8.538818534295076e-06, + "loss": 15.028, + "step": 238 + }, + { + "epoch": 0.13702900757887948, + "grad_norm": 248.1451416015625, + "learning_rate": 8.525661706003083e-06, + "loss": 15.0705, + "step": 239 + }, + { + "epoch": 0.13760235070682458, + "grad_norm": 253.95338439941406, + "learning_rate": 8.512456144338717e-06, + "loss": 15.097, + "step": 240 + }, + { + "epoch": 0.1381756938347697, + "grad_norm": 243.39439392089844, + "learning_rate": 8.499202031835532e-06, + "loss": 15.0549, + "step": 241 + }, + { + "epoch": 0.1387490369627148, + "grad_norm": 247.52191162109375, + "learning_rate": 8.485899551698166e-06, + "loss": 15.1328, + "step": 242 + }, + { + "epoch": 0.1393223800906599, + "grad_norm": 236.9805908203125, + "learning_rate": 8.472548887799833e-06, + "loss": 15.0222, + "step": 243 + }, + { + "epoch": 0.139895723218605, + "grad_norm": 239.95289611816406, + "learning_rate": 8.45915022467975e-06, + "loss": 15.0937, + "step": 244 + }, + { + "epoch": 0.1404690663465501, + "grad_norm": 254.6737060546875, + "learning_rate": 8.445703747540614e-06, + "loss": 15.06, + "step": 245 + }, + { + "epoch": 0.1410424094744952, + "grad_norm": 247.96080017089844, + "learning_rate": 8.43220964224602e-06, + "loss": 15.0793, + "step": 246 + }, + { + "epoch": 0.1416157526024403, + "grad_norm": 241.89292907714844, + "learning_rate": 8.418668095317912e-06, + "loss": 15.0339, + "step": 247 + }, + { + "epoch": 0.1421890957303854, + "grad_norm": 245.7707061767578, + "learning_rate": 8.405079293933986e-06, + "loss": 15.0187, + "step": 248 + }, + { + "epoch": 0.1427624388583305, + "grad_norm": 244.69918823242188, + "learning_rate": 8.391443425925118e-06, + "loss": 14.9716, + "step": 249 + }, + { + "epoch": 0.1433357819862756, + "grad_norm": 247.9059295654297, + "learning_rate": 8.37776067977276e-06, + "loss": 15.0733, + "step": 250 + }, + { + "epoch": 0.14390912511422072, + "grad_norm": 238.36126708984375, + "learning_rate": 8.36403124460633e-06, + "loss": 14.9511, + "step": 251 + }, + { + "epoch": 0.1444824682421658, + "grad_norm": 239.73057556152344, + "learning_rate": 8.350255310200611e-06, + "loss": 15.0428, + "step": 252 + }, + { + "epoch": 0.1450558113701109, + "grad_norm": 230.3163299560547, + "learning_rate": 8.336433066973122e-06, + "loss": 14.997, + "step": 253 + }, + { + "epoch": 0.145629154498056, + "grad_norm": 237.23446655273438, + "learning_rate": 8.322564705981476e-06, + "loss": 14.973, + "step": 254 + }, + { + "epoch": 0.1462024976260011, + "grad_norm": 230.16468811035156, + "learning_rate": 8.308650418920751e-06, + "loss": 15.0256, + "step": 255 + }, + { + "epoch": 0.1467758407539462, + "grad_norm": 233.07260131835938, + "learning_rate": 8.294690398120843e-06, + "loss": 14.945, + "step": 256 + }, + { + "epoch": 0.1473491838818913, + "grad_norm": 240.12940979003906, + "learning_rate": 8.280684836543794e-06, + "loss": 14.9974, + "step": 257 + }, + { + "epoch": 0.1479225270098364, + "grad_norm": 243.80523681640625, + "learning_rate": 8.266633927781135e-06, + "loss": 15.0705, + "step": 258 + }, + { + "epoch": 0.1484958701377815, + "grad_norm": 244.0867462158203, + "learning_rate": 8.25253786605121e-06, + "loss": 15.0141, + "step": 259 + }, + { + "epoch": 0.1490692132657266, + "grad_norm": 247.33151245117188, + "learning_rate": 8.238396846196483e-06, + "loss": 15.0344, + "step": 260 + }, + { + "epoch": 0.14964255639367172, + "grad_norm": 250.08273315429688, + "learning_rate": 8.224211063680854e-06, + "loss": 14.9305, + "step": 261 + }, + { + "epoch": 0.15021589952161682, + "grad_norm": 257.2216491699219, + "learning_rate": 8.209980714586955e-06, + "loss": 14.9938, + "step": 262 + }, + { + "epoch": 0.15078924264956192, + "grad_norm": 238.5064239501953, + "learning_rate": 8.195705995613436e-06, + "loss": 15.0064, + "step": 263 + }, + { + "epoch": 0.15136258577750702, + "grad_norm": 232.31155395507812, + "learning_rate": 8.181387104072252e-06, + "loss": 14.9449, + "step": 264 + }, + { + "epoch": 0.15193592890545213, + "grad_norm": 227.94029235839844, + "learning_rate": 8.167024237885927e-06, + "loss": 14.8337, + "step": 265 + }, + { + "epoch": 0.15250927203339723, + "grad_norm": 240.96424865722656, + "learning_rate": 8.152617595584827e-06, + "loss": 15.0939, + "step": 266 + }, + { + "epoch": 0.15308261516134233, + "grad_norm": 237.70541381835938, + "learning_rate": 8.138167376304411e-06, + "loss": 14.909, + "step": 267 + }, + { + "epoch": 0.15365595828928744, + "grad_norm": 233.10304260253906, + "learning_rate": 8.123673779782481e-06, + "loss": 14.9505, + "step": 268 + }, + { + "epoch": 0.15422930141723254, + "grad_norm": 240.28123474121094, + "learning_rate": 8.10913700635642e-06, + "loss": 14.9045, + "step": 269 + }, + { + "epoch": 0.15480264454517764, + "grad_norm": 233.11627197265625, + "learning_rate": 8.094557256960419e-06, + "loss": 14.9225, + "step": 270 + }, + { + "epoch": 0.15537598767312275, + "grad_norm": 244.76693725585938, + "learning_rate": 8.079934733122708e-06, + "loss": 14.9717, + "step": 271 + }, + { + "epoch": 0.15594933080106785, + "grad_norm": 240.1745147705078, + "learning_rate": 8.065269636962765e-06, + "loss": 15.0261, + "step": 272 + }, + { + "epoch": 0.15652267392901295, + "grad_norm": 246.17298889160156, + "learning_rate": 8.05056217118852e-06, + "loss": 14.9933, + "step": 273 + }, + { + "epoch": 0.15709601705695805, + "grad_norm": 244.8893585205078, + "learning_rate": 8.035812539093557e-06, + "loss": 15.0351, + "step": 274 + }, + { + "epoch": 0.15766936018490316, + "grad_norm": 244.82302856445312, + "learning_rate": 8.021020944554305e-06, + "loss": 14.9442, + "step": 275 + }, + { + "epoch": 0.15824270331284826, + "grad_norm": 243.9514923095703, + "learning_rate": 8.006187592027215e-06, + "loss": 14.9621, + "step": 276 + }, + { + "epoch": 0.15881604644079336, + "grad_norm": 230.46597290039062, + "learning_rate": 7.991312686545939e-06, + "loss": 14.8903, + "step": 277 + }, + { + "epoch": 0.15938938956873847, + "grad_norm": 249.49838256835938, + "learning_rate": 7.976396433718492e-06, + "loss": 14.9777, + "step": 278 + }, + { + "epoch": 0.15996273269668357, + "grad_norm": 243.70870971679688, + "learning_rate": 7.961439039724413e-06, + "loss": 15.0312, + "step": 279 + }, + { + "epoch": 0.16053607582462867, + "grad_norm": 230.47183227539062, + "learning_rate": 7.946440711311913e-06, + "loss": 14.9198, + "step": 280 + }, + { + "epoch": 0.16110941895257377, + "grad_norm": 236.70082092285156, + "learning_rate": 7.931401655795021e-06, + "loss": 14.9223, + "step": 281 + }, + { + "epoch": 0.16168276208051888, + "grad_norm": 234.71527099609375, + "learning_rate": 7.916322081050708e-06, + "loss": 14.9188, + "step": 282 + }, + { + "epoch": 0.16225610520846398, + "grad_norm": 235.15675354003906, + "learning_rate": 7.90120219551603e-06, + "loss": 14.9309, + "step": 283 + }, + { + "epoch": 0.16282944833640908, + "grad_norm": 229.10137939453125, + "learning_rate": 7.88604220818523e-06, + "loss": 14.8877, + "step": 284 + }, + { + "epoch": 0.16340279146435419, + "grad_norm": 237.02072143554688, + "learning_rate": 7.870842328606863e-06, + "loss": 15.0099, + "step": 285 + }, + { + "epoch": 0.1639761345922993, + "grad_norm": 236.75343322753906, + "learning_rate": 7.85560276688089e-06, + "loss": 14.8486, + "step": 286 + }, + { + "epoch": 0.1645494777202444, + "grad_norm": 233.91934204101562, + "learning_rate": 7.84032373365578e-06, + "loss": 14.897, + "step": 287 + }, + { + "epoch": 0.1651228208481895, + "grad_norm": 230.60330200195312, + "learning_rate": 7.825005440125595e-06, + "loss": 14.9105, + "step": 288 + }, + { + "epoch": 0.1656961639761346, + "grad_norm": 235.03897094726562, + "learning_rate": 7.809648098027067e-06, + "loss": 14.994, + "step": 289 + }, + { + "epoch": 0.1662695071040797, + "grad_norm": 233.12936401367188, + "learning_rate": 7.794251919636687e-06, + "loss": 14.9753, + "step": 290 + }, + { + "epoch": 0.1668428502320248, + "grad_norm": 231.44244384765625, + "learning_rate": 7.778817117767748e-06, + "loss": 14.994, + "step": 291 + }, + { + "epoch": 0.1674161933599699, + "grad_norm": 228.026611328125, + "learning_rate": 7.76334390576742e-06, + "loss": 14.9458, + "step": 292 + }, + { + "epoch": 0.167989536487915, + "grad_norm": 231.06951904296875, + "learning_rate": 7.747832497513797e-06, + "loss": 14.9729, + "step": 293 + }, + { + "epoch": 0.1685628796158601, + "grad_norm": 239.63568115234375, + "learning_rate": 7.732283107412938e-06, + "loss": 14.9274, + "step": 294 + }, + { + "epoch": 0.16913622274380521, + "grad_norm": 220.87551879882812, + "learning_rate": 7.71669595039591e-06, + "loss": 14.9327, + "step": 295 + }, + { + "epoch": 0.16970956587175032, + "grad_norm": 214.35519409179688, + "learning_rate": 7.701071241915804e-06, + "loss": 14.8955, + "step": 296 + }, + { + "epoch": 0.17028290899969542, + "grad_norm": 229.36508178710938, + "learning_rate": 7.685409197944768e-06, + "loss": 14.903, + "step": 297 + }, + { + "epoch": 0.17085625212764052, + "grad_norm": 224.3822021484375, + "learning_rate": 7.669710034971025e-06, + "loss": 14.9543, + "step": 298 + }, + { + "epoch": 0.17142959525558563, + "grad_norm": 228.7742462158203, + "learning_rate": 7.653973969995866e-06, + "loss": 14.9022, + "step": 299 + }, + { + "epoch": 0.17200293838353073, + "grad_norm": 228.00148010253906, + "learning_rate": 7.638201220530664e-06, + "loss": 14.8216, + "step": 300 + }, + { + "epoch": 0.17257628151147583, + "grad_norm": 216.36854553222656, + "learning_rate": 7.622392004593862e-06, + "loss": 14.8582, + "step": 301 + }, + { + "epoch": 0.17314962463942093, + "grad_norm": 221.77157592773438, + "learning_rate": 7.60654654070796e-06, + "loss": 14.9161, + "step": 302 + }, + { + "epoch": 0.17372296776736604, + "grad_norm": 223.14935302734375, + "learning_rate": 7.59066504789649e-06, + "loss": 14.9057, + "step": 303 + }, + { + "epoch": 0.17429631089531114, + "grad_norm": 219.07955932617188, + "learning_rate": 7.574747745681e-06, + "loss": 14.8669, + "step": 304 + }, + { + "epoch": 0.17486965402325624, + "grad_norm": 226.2716827392578, + "learning_rate": 7.558794854078006e-06, + "loss": 14.8365, + "step": 305 + }, + { + "epoch": 0.17544299715120135, + "grad_norm": 243.78469848632812, + "learning_rate": 7.542806593595963e-06, + "loss": 14.9013, + "step": 306 + }, + { + "epoch": 0.17601634027914642, + "grad_norm": 214.9324188232422, + "learning_rate": 7.526783185232208e-06, + "loss": 14.7971, + "step": 307 + }, + { + "epoch": 0.17658968340709152, + "grad_norm": 217.00315856933594, + "learning_rate": 7.51072485046991e-06, + "loss": 14.8198, + "step": 308 + }, + { + "epoch": 0.17716302653503663, + "grad_norm": 230.4095001220703, + "learning_rate": 7.494631811275008e-06, + "loss": 14.8371, + "step": 309 + }, + { + "epoch": 0.17773636966298173, + "grad_norm": 236.96478271484375, + "learning_rate": 7.478504290093138e-06, + "loss": 14.8929, + "step": 310 + }, + { + "epoch": 0.17830971279092683, + "grad_norm": 222.3997344970703, + "learning_rate": 7.462342509846571e-06, + "loss": 14.9166, + "step": 311 + }, + { + "epoch": 0.17888305591887194, + "grad_norm": 230.09429931640625, + "learning_rate": 7.446146693931111e-06, + "loss": 14.8528, + "step": 312 + }, + { + "epoch": 0.17945639904681704, + "grad_norm": 222.97035217285156, + "learning_rate": 7.42991706621303e-06, + "loss": 14.8732, + "step": 313 + }, + { + "epoch": 0.18002974217476214, + "grad_norm": 226.1836700439453, + "learning_rate": 7.413653851025959e-06, + "loss": 14.7586, + "step": 314 + }, + { + "epoch": 0.18060308530270724, + "grad_norm": 222.79554748535156, + "learning_rate": 7.397357273167789e-06, + "loss": 14.8905, + "step": 315 + }, + { + "epoch": 0.18117642843065235, + "grad_norm": 230.41497802734375, + "learning_rate": 7.381027557897568e-06, + "loss": 14.7686, + "step": 316 + }, + { + "epoch": 0.18174977155859745, + "grad_norm": 207.27145385742188, + "learning_rate": 7.364664930932385e-06, + "loss": 14.8313, + "step": 317 + }, + { + "epoch": 0.18232311468654255, + "grad_norm": 224.7344207763672, + "learning_rate": 7.348269618444248e-06, + "loss": 14.7949, + "step": 318 + }, + { + "epoch": 0.18289645781448766, + "grad_norm": 227.63766479492188, + "learning_rate": 7.331841847056962e-06, + "loss": 14.7235, + "step": 319 + }, + { + "epoch": 0.18346980094243276, + "grad_norm": 214.8011932373047, + "learning_rate": 7.315381843842995e-06, + "loss": 14.7835, + "step": 320 + }, + { + "epoch": 0.18404314407037786, + "grad_norm": 217.45916748046875, + "learning_rate": 7.298889836320334e-06, + "loss": 14.8223, + "step": 321 + }, + { + "epoch": 0.18461648719832296, + "grad_norm": 221.9704132080078, + "learning_rate": 7.282366052449351e-06, + "loss": 14.871, + "step": 322 + }, + { + "epoch": 0.18518983032626807, + "grad_norm": 222.32537841796875, + "learning_rate": 7.265810720629643e-06, + "loss": 14.8007, + "step": 323 + }, + { + "epoch": 0.18576317345421317, + "grad_norm": 227.74884033203125, + "learning_rate": 7.249224069696876e-06, + "loss": 14.8103, + "step": 324 + }, + { + "epoch": 0.18633651658215827, + "grad_norm": 219.51748657226562, + "learning_rate": 7.232606328919627e-06, + "loss": 14.7732, + "step": 325 + }, + { + "epoch": 0.18690985971010338, + "grad_norm": 217.20773315429688, + "learning_rate": 7.215957727996208e-06, + "loss": 14.7552, + "step": 326 + }, + { + "epoch": 0.18748320283804848, + "grad_norm": 209.55203247070312, + "learning_rate": 7.199278497051498e-06, + "loss": 14.7018, + "step": 327 + }, + { + "epoch": 0.18805654596599358, + "grad_norm": 214.1074676513672, + "learning_rate": 7.182568866633757e-06, + "loss": 14.7702, + "step": 328 + }, + { + "epoch": 0.18862988909393869, + "grad_norm": 229.8917236328125, + "learning_rate": 7.16582906771144e-06, + "loss": 14.7891, + "step": 329 + }, + { + "epoch": 0.1892032322218838, + "grad_norm": 217.26866149902344, + "learning_rate": 7.149059331670009e-06, + "loss": 14.7741, + "step": 330 + }, + { + "epoch": 0.1897765753498289, + "grad_norm": 210.88253784179688, + "learning_rate": 7.132259890308726e-06, + "loss": 14.715, + "step": 331 + }, + { + "epoch": 0.190349918477774, + "grad_norm": 231.31787109375, + "learning_rate": 7.115430975837457e-06, + "loss": 14.7906, + "step": 332 + }, + { + "epoch": 0.1909232616057191, + "grad_norm": 224.2241973876953, + "learning_rate": 7.098572820873461e-06, + "loss": 14.7868, + "step": 333 + }, + { + "epoch": 0.1914966047336642, + "grad_norm": 220.03028869628906, + "learning_rate": 7.081685658438173e-06, + "loss": 14.7613, + "step": 334 + }, + { + "epoch": 0.1920699478616093, + "grad_norm": 213.73609924316406, + "learning_rate": 7.064769721953975e-06, + "loss": 14.7319, + "step": 335 + }, + { + "epoch": 0.1926432909895544, + "grad_norm": 223.67706298828125, + "learning_rate": 7.047825245240989e-06, + "loss": 14.8181, + "step": 336 + }, + { + "epoch": 0.1932166341174995, + "grad_norm": 207.2647705078125, + "learning_rate": 7.030852462513827e-06, + "loss": 14.7896, + "step": 337 + }, + { + "epoch": 0.1937899772454446, + "grad_norm": 213.09942626953125, + "learning_rate": 7.013851608378359e-06, + "loss": 14.727, + "step": 338 + }, + { + "epoch": 0.19436332037338971, + "grad_norm": 229.02037048339844, + "learning_rate": 6.9968229178284775e-06, + "loss": 14.7458, + "step": 339 + }, + { + "epoch": 0.19493666350133482, + "grad_norm": 222.83213806152344, + "learning_rate": 6.979766626242839e-06, + "loss": 14.7459, + "step": 340 + }, + { + "epoch": 0.19551000662927992, + "grad_norm": 220.72726440429688, + "learning_rate": 6.9626829693816135e-06, + "loss": 14.7011, + "step": 341 + }, + { + "epoch": 0.19608334975722502, + "grad_norm": 214.8241424560547, + "learning_rate": 6.945572183383229e-06, + "loss": 14.7731, + "step": 342 + }, + { + "epoch": 0.19665669288517013, + "grad_norm": 222.2461700439453, + "learning_rate": 6.928434504761106e-06, + "loss": 14.681, + "step": 343 + }, + { + "epoch": 0.19723003601311523, + "grad_norm": 223.89845275878906, + "learning_rate": 6.911270170400385e-06, + "loss": 14.7092, + "step": 344 + }, + { + "epoch": 0.19780337914106033, + "grad_norm": 219.92869567871094, + "learning_rate": 6.894079417554657e-06, + "loss": 14.8403, + "step": 345 + }, + { + "epoch": 0.19837672226900543, + "grad_norm": 219.98406982421875, + "learning_rate": 6.8768624838426815e-06, + "loss": 14.7576, + "step": 346 + }, + { + "epoch": 0.19895006539695054, + "grad_norm": 207.61367797851562, + "learning_rate": 6.859619607245102e-06, + "loss": 14.7059, + "step": 347 + }, + { + "epoch": 0.19952340852489564, + "grad_norm": 206.98719787597656, + "learning_rate": 6.842351026101155e-06, + "loss": 14.6511, + "step": 348 + }, + { + "epoch": 0.20009675165284074, + "grad_norm": 210.80372619628906, + "learning_rate": 6.825056979105382e-06, + "loss": 14.7222, + "step": 349 + }, + { + "epoch": 0.20067009478078585, + "grad_norm": 213.69117736816406, + "learning_rate": 6.807737705304324e-06, + "loss": 14.7251, + "step": 350 + }, + { + "epoch": 0.20124343790873095, + "grad_norm": 219.47328186035156, + "learning_rate": 6.790393444093214e-06, + "loss": 14.7487, + "step": 351 + }, + { + "epoch": 0.20181678103667605, + "grad_norm": 214.07040405273438, + "learning_rate": 6.773024435212678e-06, + "loss": 14.6365, + "step": 352 + }, + { + "epoch": 0.20239012416462115, + "grad_norm": 214.93496704101562, + "learning_rate": 6.7556309187454185e-06, + "loss": 14.6673, + "step": 353 + }, + { + "epoch": 0.20296346729256626, + "grad_norm": 206.5713348388672, + "learning_rate": 6.738213135112884e-06, + "loss": 14.7522, + "step": 354 + }, + { + "epoch": 0.20353681042051136, + "grad_norm": 210.60606384277344, + "learning_rate": 6.720771325071965e-06, + "loss": 14.6979, + "step": 355 + }, + { + "epoch": 0.20411015354845646, + "grad_norm": 212.65887451171875, + "learning_rate": 6.703305729711653e-06, + "loss": 14.7409, + "step": 356 + }, + { + "epoch": 0.20468349667640157, + "grad_norm": 216.2197723388672, + "learning_rate": 6.685816590449708e-06, + "loss": 14.7433, + "step": 357 + }, + { + "epoch": 0.20525683980434667, + "grad_norm": 210.51260375976562, + "learning_rate": 6.668304149029331e-06, + "loss": 14.7338, + "step": 358 + }, + { + "epoch": 0.20583018293229177, + "grad_norm": 210.6771697998047, + "learning_rate": 6.650768647515813e-06, + "loss": 14.7397, + "step": 359 + }, + { + "epoch": 0.20640352606023687, + "grad_norm": 216.00897216796875, + "learning_rate": 6.63321032829319e-06, + "loss": 14.8058, + "step": 360 + }, + { + "epoch": 0.20697686918818198, + "grad_norm": 206.54159545898438, + "learning_rate": 6.615629434060903e-06, + "loss": 14.6842, + "step": 361 + }, + { + "epoch": 0.20755021231612708, + "grad_norm": 213.61300659179688, + "learning_rate": 6.598026207830428e-06, + "loss": 14.6042, + "step": 362 + }, + { + "epoch": 0.20812355544407216, + "grad_norm": 217.9312744140625, + "learning_rate": 6.5804008929219284e-06, + "loss": 14.7647, + "step": 363 + }, + { + "epoch": 0.20869689857201726, + "grad_norm": 220.873291015625, + "learning_rate": 6.562753732960887e-06, + "loss": 14.7314, + "step": 364 + }, + { + "epoch": 0.20927024169996236, + "grad_norm": 223.9777069091797, + "learning_rate": 6.545084971874738e-06, + "loss": 14.7555, + "step": 365 + }, + { + "epoch": 0.20984358482790746, + "grad_norm": 217.828125, + "learning_rate": 6.527394853889499e-06, + "loss": 14.7245, + "step": 366 + }, + { + "epoch": 0.21041692795585257, + "grad_norm": 224.16778564453125, + "learning_rate": 6.5096836235263904e-06, + "loss": 14.7414, + "step": 367 + }, + { + "epoch": 0.21099027108379767, + "grad_norm": 216.91224670410156, + "learning_rate": 6.491951525598461e-06, + "loss": 14.6045, + "step": 368 + }, + { + "epoch": 0.21156361421174277, + "grad_norm": 209.5393829345703, + "learning_rate": 6.4741988052071965e-06, + "loss": 14.6805, + "step": 369 + }, + { + "epoch": 0.21213695733968788, + "grad_norm": 222.77627563476562, + "learning_rate": 6.45642570773914e-06, + "loss": 14.746, + "step": 370 + }, + { + "epoch": 0.21271030046763298, + "grad_norm": 216.05712890625, + "learning_rate": 6.438632478862495e-06, + "loss": 14.6645, + "step": 371 + }, + { + "epoch": 0.21328364359557808, + "grad_norm": 206.27911376953125, + "learning_rate": 6.4208193645237314e-06, + "loss": 14.6834, + "step": 372 + }, + { + "epoch": 0.21385698672352318, + "grad_norm": 215.7952880859375, + "learning_rate": 6.402986610944183e-06, + "loss": 14.7863, + "step": 373 + }, + { + "epoch": 0.2144303298514683, + "grad_norm": 212.9938201904297, + "learning_rate": 6.385134464616649e-06, + "loss": 14.7525, + "step": 374 + }, + { + "epoch": 0.2150036729794134, + "grad_norm": 200.97154235839844, + "learning_rate": 6.367263172301985e-06, + "loss": 14.649, + "step": 375 + }, + { + "epoch": 0.2155770161073585, + "grad_norm": 222.55943298339844, + "learning_rate": 6.3493729810256895e-06, + "loss": 14.7005, + "step": 376 + }, + { + "epoch": 0.2161503592353036, + "grad_norm": 220.4983367919922, + "learning_rate": 6.331464138074493e-06, + "loss": 14.7608, + "step": 377 + }, + { + "epoch": 0.2167237023632487, + "grad_norm": 213.09095764160156, + "learning_rate": 6.313536890992935e-06, + "loss": 14.5953, + "step": 378 + }, + { + "epoch": 0.2172970454911938, + "grad_norm": 211.12828063964844, + "learning_rate": 6.29559148757995e-06, + "loss": 14.6474, + "step": 379 + }, + { + "epoch": 0.2178703886191389, + "grad_norm": 222.33969116210938, + "learning_rate": 6.277628175885437e-06, + "loss": 14.7324, + "step": 380 + }, + { + "epoch": 0.218443731747084, + "grad_norm": 209.89747619628906, + "learning_rate": 6.2596472042068275e-06, + "loss": 14.622, + "step": 381 + }, + { + "epoch": 0.2190170748750291, + "grad_norm": 219.60342407226562, + "learning_rate": 6.241648821085666e-06, + "loss": 14.6497, + "step": 382 + }, + { + "epoch": 0.2195904180029742, + "grad_norm": 221.1376953125, + "learning_rate": 6.223633275304157e-06, + "loss": 14.7248, + "step": 383 + }, + { + "epoch": 0.22016376113091932, + "grad_norm": 217.87611389160156, + "learning_rate": 6.205600815881741e-06, + "loss": 14.7175, + "step": 384 + }, + { + "epoch": 0.22073710425886442, + "grad_norm": 210.81985473632812, + "learning_rate": 6.187551692071648e-06, + "loss": 14.7288, + "step": 385 + }, + { + "epoch": 0.22131044738680952, + "grad_norm": 218.46176147460938, + "learning_rate": 6.1694861533574445e-06, + "loss": 14.6473, + "step": 386 + }, + { + "epoch": 0.22188379051475463, + "grad_norm": 211.04080200195312, + "learning_rate": 6.1514044494496e-06, + "loss": 14.728, + "step": 387 + }, + { + "epoch": 0.22245713364269973, + "grad_norm": 214.88522338867188, + "learning_rate": 6.133306830282021e-06, + "loss": 14.5944, + "step": 388 + }, + { + "epoch": 0.22303047677064483, + "grad_norm": 214.91293334960938, + "learning_rate": 6.115193546008602e-06, + "loss": 14.6812, + "step": 389 + }, + { + "epoch": 0.22360381989858993, + "grad_norm": 218.2246856689453, + "learning_rate": 6.097064846999774e-06, + "loss": 14.6757, + "step": 390 + }, + { + "epoch": 0.22417716302653504, + "grad_norm": 209.82518005371094, + "learning_rate": 6.078920983839032e-06, + "loss": 14.6697, + "step": 391 + }, + { + "epoch": 0.22475050615448014, + "grad_norm": 219.08514404296875, + "learning_rate": 6.060762207319479e-06, + "loss": 14.663, + "step": 392 + }, + { + "epoch": 0.22532384928242524, + "grad_norm": 224.61856079101562, + "learning_rate": 6.042588768440358e-06, + "loss": 14.6559, + "step": 393 + }, + { + "epoch": 0.22589719241037035, + "grad_norm": 216.43028259277344, + "learning_rate": 6.024400918403581e-06, + "loss": 14.6848, + "step": 394 + }, + { + "epoch": 0.22647053553831545, + "grad_norm": 217.51576232910156, + "learning_rate": 6.006198908610261e-06, + "loss": 14.6885, + "step": 395 + }, + { + "epoch": 0.22704387866626055, + "grad_norm": 194.5399627685547, + "learning_rate": 5.987982990657229e-06, + "loss": 14.589, + "step": 396 + }, + { + "epoch": 0.22761722179420565, + "grad_norm": 214.05809020996094, + "learning_rate": 5.9697534163335645e-06, + "loss": 14.6364, + "step": 397 + }, + { + "epoch": 0.22819056492215076, + "grad_norm": 212.87832641601562, + "learning_rate": 5.95151043761711e-06, + "loss": 14.7834, + "step": 398 + }, + { + "epoch": 0.22876390805009586, + "grad_norm": 203.37142944335938, + "learning_rate": 5.933254306670995e-06, + "loss": 14.5586, + "step": 399 + }, + { + "epoch": 0.22933725117804096, + "grad_norm": 217.5912322998047, + "learning_rate": 5.914985275840135e-06, + "loss": 14.7334, + "step": 400 + }, + { + "epoch": 0.22991059430598607, + "grad_norm": 201.1334991455078, + "learning_rate": 5.896703597647765e-06, + "loss": 14.6263, + "step": 401 + }, + { + "epoch": 0.23048393743393117, + "grad_norm": 206.36265563964844, + "learning_rate": 5.878409524791931e-06, + "loss": 14.6252, + "step": 402 + }, + { + "epoch": 0.23105728056187627, + "grad_norm": 213.31422424316406, + "learning_rate": 5.8601033101420055e-06, + "loss": 14.718, + "step": 403 + }, + { + "epoch": 0.23163062368982137, + "grad_norm": 213.38626098632812, + "learning_rate": 5.841785206735192e-06, + "loss": 14.5727, + "step": 404 + }, + { + "epoch": 0.23220396681776648, + "grad_norm": 189.9121551513672, + "learning_rate": 5.823455467773027e-06, + "loss": 14.5197, + "step": 405 + }, + { + "epoch": 0.23277730994571158, + "grad_norm": 198.7380828857422, + "learning_rate": 5.805114346617874e-06, + "loss": 14.5848, + "step": 406 + }, + { + "epoch": 0.23335065307365668, + "grad_norm": 212.24783325195312, + "learning_rate": 5.786762096789431e-06, + "loss": 14.6107, + "step": 407 + }, + { + "epoch": 0.23392399620160179, + "grad_norm": 219.87643432617188, + "learning_rate": 5.768398971961221e-06, + "loss": 14.7092, + "step": 408 + }, + { + "epoch": 0.2344973393295469, + "grad_norm": 206.90530395507812, + "learning_rate": 5.750025225957086e-06, + "loss": 14.5481, + "step": 409 + }, + { + "epoch": 0.235070682457492, + "grad_norm": 202.2758331298828, + "learning_rate": 5.731641112747679e-06, + "loss": 14.6385, + "step": 410 + }, + { + "epoch": 0.2356440255854371, + "grad_norm": 215.7546844482422, + "learning_rate": 5.713246886446954e-06, + "loss": 14.5969, + "step": 411 + }, + { + "epoch": 0.2362173687133822, + "grad_norm": 208.98550415039062, + "learning_rate": 5.694842801308651e-06, + "loss": 14.6304, + "step": 412 + }, + { + "epoch": 0.2367907118413273, + "grad_norm": 207.6781005859375, + "learning_rate": 5.676429111722786e-06, + "loss": 14.6177, + "step": 413 + }, + { + "epoch": 0.2373640549692724, + "grad_norm": 201.2788543701172, + "learning_rate": 5.6580060722121325e-06, + "loss": 14.5918, + "step": 414 + }, + { + "epoch": 0.2379373980972175, + "grad_norm": 213.871826171875, + "learning_rate": 5.639573937428699e-06, + "loss": 14.5532, + "step": 415 + }, + { + "epoch": 0.2385107412251626, + "grad_norm": 196.2823486328125, + "learning_rate": 5.621132962150216e-06, + "loss": 14.5558, + "step": 416 + }, + { + "epoch": 0.2390840843531077, + "grad_norm": 199.7825927734375, + "learning_rate": 5.6026834012766155e-06, + "loss": 14.5658, + "step": 417 + }, + { + "epoch": 0.23965742748105281, + "grad_norm": 192.31263732910156, + "learning_rate": 5.584225509826497e-06, + "loss": 14.5083, + "step": 418 + }, + { + "epoch": 0.2402307706089979, + "grad_norm": 201.0004119873047, + "learning_rate": 5.565759542933612e-06, + "loss": 14.6235, + "step": 419 + }, + { + "epoch": 0.240804113736943, + "grad_norm": 197.17825317382812, + "learning_rate": 5.547285755843334e-06, + "loss": 14.5237, + "step": 420 + }, + { + "epoch": 0.2413774568648881, + "grad_norm": 209.01620483398438, + "learning_rate": 5.5288044039091335e-06, + "loss": 14.596, + "step": 421 + }, + { + "epoch": 0.2419507999928332, + "grad_norm": 204.07884216308594, + "learning_rate": 5.510315742589042e-06, + "loss": 14.617, + "step": 422 + }, + { + "epoch": 0.2425241431207783, + "grad_norm": 208.53651428222656, + "learning_rate": 5.491820027442126e-06, + "loss": 14.6785, + "step": 423 + }, + { + "epoch": 0.2430974862487234, + "grad_norm": 199.32315063476562, + "learning_rate": 5.473317514124958e-06, + "loss": 14.512, + "step": 424 + }, + { + "epoch": 0.2436708293766685, + "grad_norm": 206.72837829589844, + "learning_rate": 5.454808458388069e-06, + "loss": 14.6038, + "step": 425 + }, + { + "epoch": 0.2442441725046136, + "grad_norm": 196.9921112060547, + "learning_rate": 5.436293116072431e-06, + "loss": 14.5451, + "step": 426 + }, + { + "epoch": 0.2448175156325587, + "grad_norm": 207.21530151367188, + "learning_rate": 5.417771743105908e-06, + "loss": 14.551, + "step": 427 + }, + { + "epoch": 0.24539085876050382, + "grad_norm": 201.5275115966797, + "learning_rate": 5.399244595499721e-06, + "loss": 14.5262, + "step": 428 + }, + { + "epoch": 0.24596420188844892, + "grad_norm": 204.6480712890625, + "learning_rate": 5.380711929344915e-06, + "loss": 14.4846, + "step": 429 + }, + { + "epoch": 0.24653754501639402, + "grad_norm": 194.9892120361328, + "learning_rate": 5.362174000808813e-06, + "loss": 14.5942, + "step": 430 + }, + { + "epoch": 0.24711088814433912, + "grad_norm": 199.96047973632812, + "learning_rate": 5.343631066131476e-06, + "loss": 14.6091, + "step": 431 + }, + { + "epoch": 0.24768423127228423, + "grad_norm": 212.93307495117188, + "learning_rate": 5.325083381622165e-06, + "loss": 14.5455, + "step": 432 + }, + { + "epoch": 0.24825757440022933, + "grad_norm": 194.9511260986328, + "learning_rate": 5.30653120365579e-06, + "loss": 14.5044, + "step": 433 + }, + { + "epoch": 0.24883091752817443, + "grad_norm": 200.14315795898438, + "learning_rate": 5.28797478866938e-06, + "loss": 14.6439, + "step": 434 + }, + { + "epoch": 0.24940426065611954, + "grad_norm": 197.60902404785156, + "learning_rate": 5.269414393158523e-06, + "loss": 14.5721, + "step": 435 + }, + { + "epoch": 0.24997760378406464, + "grad_norm": 192.06671142578125, + "learning_rate": 5.250850273673831e-06, + "loss": 14.5812, + "step": 436 + }, + { + "epoch": 0.25055094691200974, + "grad_norm": 189.84034729003906, + "learning_rate": 5.232282686817392e-06, + "loss": 14.6002, + "step": 437 + }, + { + "epoch": 0.25112429003995484, + "grad_norm": 195.87533569335938, + "learning_rate": 5.213711889239214e-06, + "loss": 14.4797, + "step": 438 + }, + { + "epoch": 0.25169763316789995, + "grad_norm": 186.12464904785156, + "learning_rate": 5.195138137633695e-06, + "loss": 14.5298, + "step": 439 + }, + { + "epoch": 0.25227097629584505, + "grad_norm": 189.66380310058594, + "learning_rate": 5.17656168873606e-06, + "loss": 14.4488, + "step": 440 + }, + { + "epoch": 0.25284431942379015, + "grad_norm": 196.0492401123047, + "learning_rate": 5.157982799318817e-06, + "loss": 14.5268, + "step": 441 + }, + { + "epoch": 0.25341766255173526, + "grad_norm": 192.8926239013672, + "learning_rate": 5.139401726188208e-06, + "loss": 14.555, + "step": 442 + }, + { + "epoch": 0.25399100567968036, + "grad_norm": 201.20632934570312, + "learning_rate": 5.120818726180662e-06, + "loss": 14.4914, + "step": 443 + }, + { + "epoch": 0.25456434880762546, + "grad_norm": 200.86207580566406, + "learning_rate": 5.1022340561592396e-06, + "loss": 14.5471, + "step": 444 + }, + { + "epoch": 0.25513769193557057, + "grad_norm": 203.37557983398438, + "learning_rate": 5.083647973010085e-06, + "loss": 14.5438, + "step": 445 + }, + { + "epoch": 0.25571103506351567, + "grad_norm": 193.55697631835938, + "learning_rate": 5.065060733638878e-06, + "loss": 14.4965, + "step": 446 + }, + { + "epoch": 0.25628437819146077, + "grad_norm": 195.2728271484375, + "learning_rate": 5.046472594967279e-06, + "loss": 14.5723, + "step": 447 + }, + { + "epoch": 0.2568577213194059, + "grad_norm": 197.77818298339844, + "learning_rate": 5.027883813929374e-06, + "loss": 14.4772, + "step": 448 + }, + { + "epoch": 0.257431064447351, + "grad_norm": 196.05238342285156, + "learning_rate": 5.009294647468137e-06, + "loss": 14.5655, + "step": 449 + }, + { + "epoch": 0.2580044075752961, + "grad_norm": 194.8416290283203, + "learning_rate": 4.990705352531864e-06, + "loss": 14.5701, + "step": 450 + }, + { + "epoch": 0.2585777507032412, + "grad_norm": 193.21575927734375, + "learning_rate": 4.972116186070626e-06, + "loss": 14.5292, + "step": 451 + }, + { + "epoch": 0.2591510938311863, + "grad_norm": 189.819580078125, + "learning_rate": 4.953527405032723e-06, + "loss": 14.4925, + "step": 452 + }, + { + "epoch": 0.2597244369591314, + "grad_norm": 194.4360809326172, + "learning_rate": 4.934939266361123e-06, + "loss": 14.4965, + "step": 453 + }, + { + "epoch": 0.2602977800870765, + "grad_norm": 198.99061584472656, + "learning_rate": 4.916352026989914e-06, + "loss": 14.484, + "step": 454 + }, + { + "epoch": 0.2608711232150216, + "grad_norm": 193.81446838378906, + "learning_rate": 4.897765943840761e-06, + "loss": 14.527, + "step": 455 + }, + { + "epoch": 0.2614444663429667, + "grad_norm": 189.20484924316406, + "learning_rate": 4.87918127381934e-06, + "loss": 14.4895, + "step": 456 + }, + { + "epoch": 0.2620178094709118, + "grad_norm": 190.6830291748047, + "learning_rate": 4.860598273811793e-06, + "loss": 14.4308, + "step": 457 + }, + { + "epoch": 0.2625911525988569, + "grad_norm": 189.31912231445312, + "learning_rate": 4.842017200681185e-06, + "loss": 14.5519, + "step": 458 + }, + { + "epoch": 0.263164495726802, + "grad_norm": 188.8474578857422, + "learning_rate": 4.823438311263943e-06, + "loss": 14.4147, + "step": 459 + }, + { + "epoch": 0.2637378388547471, + "grad_norm": 192.68406677246094, + "learning_rate": 4.804861862366306e-06, + "loss": 14.471, + "step": 460 + }, + { + "epoch": 0.2643111819826922, + "grad_norm": 188.2942657470703, + "learning_rate": 4.786288110760787e-06, + "loss": 14.5164, + "step": 461 + }, + { + "epoch": 0.2648845251106373, + "grad_norm": 191.98313903808594, + "learning_rate": 4.767717313182611e-06, + "loss": 14.3865, + "step": 462 + }, + { + "epoch": 0.2654578682385824, + "grad_norm": 197.7642364501953, + "learning_rate": 4.74914972632617e-06, + "loss": 14.6162, + "step": 463 + }, + { + "epoch": 0.2660312113665275, + "grad_norm": 199.40097045898438, + "learning_rate": 4.730585606841479e-06, + "loss": 14.4812, + "step": 464 + }, + { + "epoch": 0.2666045544944726, + "grad_norm": 191.48199462890625, + "learning_rate": 4.7120252113306216e-06, + "loss": 14.445, + "step": 465 + }, + { + "epoch": 0.2671778976224177, + "grad_norm": 195.9621124267578, + "learning_rate": 4.693468796344211e-06, + "loss": 14.4466, + "step": 466 + }, + { + "epoch": 0.26775124075036283, + "grad_norm": 193.89913940429688, + "learning_rate": 4.6749166183778375e-06, + "loss": 14.4653, + "step": 467 + }, + { + "epoch": 0.26832458387830793, + "grad_norm": 185.12448120117188, + "learning_rate": 4.656368933868525e-06, + "loss": 14.4962, + "step": 468 + }, + { + "epoch": 0.26889792700625303, + "grad_norm": 188.17173767089844, + "learning_rate": 4.637825999191189e-06, + "loss": 14.4282, + "step": 469 + }, + { + "epoch": 0.26947127013419814, + "grad_norm": 179.78378295898438, + "learning_rate": 4.619288070655086e-06, + "loss": 14.4112, + "step": 470 + }, + { + "epoch": 0.27004461326214324, + "grad_norm": 184.57598876953125, + "learning_rate": 4.600755404500281e-06, + "loss": 14.4972, + "step": 471 + }, + { + "epoch": 0.27061795639008834, + "grad_norm": 190.61500549316406, + "learning_rate": 4.582228256894093e-06, + "loss": 14.4585, + "step": 472 + }, + { + "epoch": 0.27119129951803345, + "grad_norm": 191.43365478515625, + "learning_rate": 4.56370688392757e-06, + "loss": 14.3984, + "step": 473 + }, + { + "epoch": 0.27176464264597855, + "grad_norm": 189.6448211669922, + "learning_rate": 4.545191541611933e-06, + "loss": 14.4596, + "step": 474 + }, + { + "epoch": 0.27233798577392365, + "grad_norm": 199.958740234375, + "learning_rate": 4.526682485875044e-06, + "loss": 14.5124, + "step": 475 + }, + { + "epoch": 0.27291132890186875, + "grad_norm": 187.1591033935547, + "learning_rate": 4.508179972557875e-06, + "loss": 14.4502, + "step": 476 + }, + { + "epoch": 0.27348467202981386, + "grad_norm": 212.7876739501953, + "learning_rate": 4.489684257410959e-06, + "loss": 14.4952, + "step": 477 + }, + { + "epoch": 0.27405801515775896, + "grad_norm": 197.2154541015625, + "learning_rate": 4.471195596090867e-06, + "loss": 14.5392, + "step": 478 + }, + { + "epoch": 0.27463135828570406, + "grad_norm": 193.218505859375, + "learning_rate": 4.452714244156667e-06, + "loss": 14.5221, + "step": 479 + }, + { + "epoch": 0.27520470141364917, + "grad_norm": 195.3530731201172, + "learning_rate": 4.434240457066388e-06, + "loss": 14.4045, + "step": 480 + }, + { + "epoch": 0.27577804454159427, + "grad_norm": 191.08155822753906, + "learning_rate": 4.415774490173504e-06, + "loss": 14.363, + "step": 481 + }, + { + "epoch": 0.2763513876695394, + "grad_norm": 205.4665985107422, + "learning_rate": 4.397316598723385e-06, + "loss": 14.5536, + "step": 482 + }, + { + "epoch": 0.2769247307974845, + "grad_norm": 202.93714904785156, + "learning_rate": 4.3788670378497836e-06, + "loss": 14.4253, + "step": 483 + }, + { + "epoch": 0.2774980739254296, + "grad_norm": 199.6490020751953, + "learning_rate": 4.360426062571303e-06, + "loss": 14.5529, + "step": 484 + }, + { + "epoch": 0.2780714170533747, + "grad_norm": 198.09494018554688, + "learning_rate": 4.341993927787871e-06, + "loss": 14.4701, + "step": 485 + }, + { + "epoch": 0.2786447601813198, + "grad_norm": 194.7907257080078, + "learning_rate": 4.323570888277215e-06, + "loss": 14.4267, + "step": 486 + }, + { + "epoch": 0.2792181033092649, + "grad_norm": 204.8142852783203, + "learning_rate": 4.305157198691351e-06, + "loss": 14.4313, + "step": 487 + }, + { + "epoch": 0.27979144643721, + "grad_norm": 199.0611572265625, + "learning_rate": 4.286753113553049e-06, + "loss": 14.4615, + "step": 488 + }, + { + "epoch": 0.2803647895651551, + "grad_norm": 188.00750732421875, + "learning_rate": 4.268358887252322e-06, + "loss": 14.3631, + "step": 489 + }, + { + "epoch": 0.2809381326931002, + "grad_norm": 191.73825073242188, + "learning_rate": 4.249974774042915e-06, + "loss": 14.4741, + "step": 490 + }, + { + "epoch": 0.2815114758210453, + "grad_norm": 188.29759216308594, + "learning_rate": 4.231601028038781e-06, + "loss": 14.446, + "step": 491 + }, + { + "epoch": 0.2820848189489904, + "grad_norm": 197.5531768798828, + "learning_rate": 4.2132379032105695e-06, + "loss": 14.4405, + "step": 492 + }, + { + "epoch": 0.2826581620769355, + "grad_norm": 190.16937255859375, + "learning_rate": 4.194885653382128e-06, + "loss": 14.3906, + "step": 493 + }, + { + "epoch": 0.2832315052048806, + "grad_norm": 188.8497772216797, + "learning_rate": 4.176544532226974e-06, + "loss": 14.4415, + "step": 494 + }, + { + "epoch": 0.2838048483328257, + "grad_norm": 186.59799194335938, + "learning_rate": 4.158214793264808e-06, + "loss": 14.4197, + "step": 495 + }, + { + "epoch": 0.2843781914607708, + "grad_norm": 184.35581970214844, + "learning_rate": 4.139896689857995e-06, + "loss": 14.3536, + "step": 496 + }, + { + "epoch": 0.2849515345887159, + "grad_norm": 199.46311950683594, + "learning_rate": 4.121590475208071e-06, + "loss": 14.4356, + "step": 497 + }, + { + "epoch": 0.285524877716661, + "grad_norm": 200.33966064453125, + "learning_rate": 4.1032964023522366e-06, + "loss": 14.4552, + "step": 498 + }, + { + "epoch": 0.2860982208446061, + "grad_norm": 189.87977600097656, + "learning_rate": 4.085014724159866e-06, + "loss": 14.3919, + "step": 499 + }, + { + "epoch": 0.2866715639725512, + "grad_norm": 196.80152893066406, + "learning_rate": 4.066745693329008e-06, + "loss": 14.5031, + "step": 500 + }, + { + "epoch": 0.2872449071004963, + "grad_norm": 193.42140197753906, + "learning_rate": 4.0484895623828906e-06, + "loss": 14.4403, + "step": 501 + }, + { + "epoch": 0.28781825022844143, + "grad_norm": 194.4940948486328, + "learning_rate": 4.030246583666437e-06, + "loss": 14.4734, + "step": 502 + }, + { + "epoch": 0.2883915933563865, + "grad_norm": 192.37107849121094, + "learning_rate": 4.012017009342773e-06, + "loss": 14.4512, + "step": 503 + }, + { + "epoch": 0.2889649364843316, + "grad_norm": 181.2819366455078, + "learning_rate": 3.99380109138974e-06, + "loss": 14.4906, + "step": 504 + }, + { + "epoch": 0.2895382796122767, + "grad_norm": 199.6365509033203, + "learning_rate": 3.97559908159642e-06, + "loss": 14.4517, + "step": 505 + }, + { + "epoch": 0.2901116227402218, + "grad_norm": 182.9588165283203, + "learning_rate": 3.9574112315596425e-06, + "loss": 14.4496, + "step": 506 + }, + { + "epoch": 0.2906849658681669, + "grad_norm": 183.8024139404297, + "learning_rate": 3.9392377926805226e-06, + "loss": 14.403, + "step": 507 + }, + { + "epoch": 0.291258308996112, + "grad_norm": 195.86257934570312, + "learning_rate": 3.92107901616097e-06, + "loss": 14.3586, + "step": 508 + }, + { + "epoch": 0.2918316521240571, + "grad_norm": 193.3267822265625, + "learning_rate": 3.9029351530002264e-06, + "loss": 14.4352, + "step": 509 + }, + { + "epoch": 0.2924049952520022, + "grad_norm": 189.76773071289062, + "learning_rate": 3.884806453991399e-06, + "loss": 14.3374, + "step": 510 + }, + { + "epoch": 0.2929783383799473, + "grad_norm": 190.036865234375, + "learning_rate": 3.866693169717982e-06, + "loss": 14.3719, + "step": 511 + }, + { + "epoch": 0.2935516815078924, + "grad_norm": 187.96229553222656, + "learning_rate": 3.848595550550401e-06, + "loss": 14.4594, + "step": 512 + }, + { + "epoch": 0.2941250246358375, + "grad_norm": 189.76959228515625, + "learning_rate": 3.830513846642556e-06, + "loss": 14.3997, + "step": 513 + }, + { + "epoch": 0.2946983677637826, + "grad_norm": 188.51016235351562, + "learning_rate": 3.8124483079283546e-06, + "loss": 14.3977, + "step": 514 + }, + { + "epoch": 0.2952717108917277, + "grad_norm": 182.27618408203125, + "learning_rate": 3.7943991841182586e-06, + "loss": 14.3342, + "step": 515 + }, + { + "epoch": 0.2958450540196728, + "grad_norm": 194.53384399414062, + "learning_rate": 3.7763667246958447e-06, + "loss": 14.3353, + "step": 516 + }, + { + "epoch": 0.2964183971476179, + "grad_norm": 186.60391235351562, + "learning_rate": 3.758351178914336e-06, + "loss": 14.3462, + "step": 517 + }, + { + "epoch": 0.296991740275563, + "grad_norm": 208.77110290527344, + "learning_rate": 3.7403527957931716e-06, + "loss": 14.4527, + "step": 518 + }, + { + "epoch": 0.2975650834035081, + "grad_norm": 192.8214111328125, + "learning_rate": 3.7223718241145646e-06, + "loss": 14.3971, + "step": 519 + }, + { + "epoch": 0.2981384265314532, + "grad_norm": 185.70005798339844, + "learning_rate": 3.7044085124200517e-06, + "loss": 14.3432, + "step": 520 + }, + { + "epoch": 0.29871176965939833, + "grad_norm": 196.39981079101562, + "learning_rate": 3.6864631090070656e-06, + "loss": 14.5102, + "step": 521 + }, + { + "epoch": 0.29928511278734343, + "grad_norm": 187.2920684814453, + "learning_rate": 3.668535861925509e-06, + "loss": 14.4782, + "step": 522 + }, + { + "epoch": 0.29985845591528854, + "grad_norm": 186.00146484375, + "learning_rate": 3.650627018974312e-06, + "loss": 14.4494, + "step": 523 + }, + { + "epoch": 0.30043179904323364, + "grad_norm": 189.43801879882812, + "learning_rate": 3.632736827698015e-06, + "loss": 14.3908, + "step": 524 + }, + { + "epoch": 0.30100514217117874, + "grad_norm": 201.06126403808594, + "learning_rate": 3.6148655353833518e-06, + "loss": 14.458, + "step": 525 + }, + { + "epoch": 0.30157848529912384, + "grad_norm": 190.3157501220703, + "learning_rate": 3.5970133890558184e-06, + "loss": 14.3939, + "step": 526 + }, + { + "epoch": 0.30215182842706895, + "grad_norm": 203.18019104003906, + "learning_rate": 3.5791806354762702e-06, + "loss": 14.4642, + "step": 527 + }, + { + "epoch": 0.30272517155501405, + "grad_norm": 186.1299285888672, + "learning_rate": 3.5613675211375066e-06, + "loss": 14.3403, + "step": 528 + }, + { + "epoch": 0.30329851468295915, + "grad_norm": 188.37765502929688, + "learning_rate": 3.5435742922608618e-06, + "loss": 14.3578, + "step": 529 + }, + { + "epoch": 0.30387185781090426, + "grad_norm": 184.9286346435547, + "learning_rate": 3.525801194792805e-06, + "loss": 14.3543, + "step": 530 + }, + { + "epoch": 0.30444520093884936, + "grad_norm": 193.71884155273438, + "learning_rate": 3.508048474401541e-06, + "loss": 14.3639, + "step": 531 + }, + { + "epoch": 0.30501854406679446, + "grad_norm": 187.72390747070312, + "learning_rate": 3.4903163764736104e-06, + "loss": 14.2493, + "step": 532 + }, + { + "epoch": 0.30559188719473956, + "grad_norm": 195.72886657714844, + "learning_rate": 3.4726051461105016e-06, + "loss": 14.4045, + "step": 533 + }, + { + "epoch": 0.30616523032268467, + "grad_norm": 185.08929443359375, + "learning_rate": 3.4549150281252635e-06, + "loss": 14.4521, + "step": 534 + }, + { + "epoch": 0.30673857345062977, + "grad_norm": 182.60292053222656, + "learning_rate": 3.437246267039115e-06, + "loss": 14.3866, + "step": 535 + }, + { + "epoch": 0.3073119165785749, + "grad_norm": 181.70509338378906, + "learning_rate": 3.419599107078073e-06, + "loss": 14.4036, + "step": 536 + }, + { + "epoch": 0.30788525970652, + "grad_norm": 187.29672241210938, + "learning_rate": 3.401973792169574e-06, + "loss": 14.3734, + "step": 537 + }, + { + "epoch": 0.3084586028344651, + "grad_norm": 187.84115600585938, + "learning_rate": 3.384370565939098e-06, + "loss": 14.4167, + "step": 538 + }, + { + "epoch": 0.3090319459624102, + "grad_norm": 200.47061157226562, + "learning_rate": 3.3667896717068105e-06, + "loss": 14.4517, + "step": 539 + }, + { + "epoch": 0.3096052890903553, + "grad_norm": 192.6443634033203, + "learning_rate": 3.34923135248419e-06, + "loss": 14.4143, + "step": 540 + }, + { + "epoch": 0.3101786322183004, + "grad_norm": 189.818115234375, + "learning_rate": 3.33169585097067e-06, + "loss": 14.3478, + "step": 541 + }, + { + "epoch": 0.3107519753462455, + "grad_norm": 185.73080444335938, + "learning_rate": 3.314183409550293e-06, + "loss": 14.3765, + "step": 542 + }, + { + "epoch": 0.3113253184741906, + "grad_norm": 183.9041290283203, + "learning_rate": 3.2966942702883494e-06, + "loss": 14.3506, + "step": 543 + }, + { + "epoch": 0.3118986616021357, + "grad_norm": 188.9761505126953, + "learning_rate": 3.279228674928035e-06, + "loss": 14.4349, + "step": 544 + }, + { + "epoch": 0.3124720047300808, + "grad_norm": 190.45909118652344, + "learning_rate": 3.261786864887117e-06, + "loss": 14.3562, + "step": 545 + }, + { + "epoch": 0.3130453478580259, + "grad_norm": 191.3506317138672, + "learning_rate": 3.244369081254585e-06, + "loss": 14.2781, + "step": 546 + }, + { + "epoch": 0.313618690985971, + "grad_norm": 181.74490356445312, + "learning_rate": 3.226975564787322e-06, + "loss": 14.3264, + "step": 547 + }, + { + "epoch": 0.3141920341139161, + "grad_norm": 186.11990356445312, + "learning_rate": 3.209606555906788e-06, + "loss": 14.3599, + "step": 548 + }, + { + "epoch": 0.3147653772418612, + "grad_norm": 192.1141357421875, + "learning_rate": 3.192262294695679e-06, + "loss": 14.3444, + "step": 549 + }, + { + "epoch": 0.3153387203698063, + "grad_norm": 193.52890014648438, + "learning_rate": 3.174943020894618e-06, + "loss": 14.4323, + "step": 550 + }, + { + "epoch": 0.3159120634977514, + "grad_norm": 183.9879150390625, + "learning_rate": 3.1576489738988457e-06, + "loss": 14.2539, + "step": 551 + }, + { + "epoch": 0.3164854066256965, + "grad_norm": 186.39529418945312, + "learning_rate": 3.140380392754901e-06, + "loss": 14.3633, + "step": 552 + }, + { + "epoch": 0.3170587497536416, + "grad_norm": 193.56439208984375, + "learning_rate": 3.12313751615732e-06, + "loss": 14.3256, + "step": 553 + }, + { + "epoch": 0.3176320928815867, + "grad_norm": 187.15281677246094, + "learning_rate": 3.1059205824453446e-06, + "loss": 14.3763, + "step": 554 + }, + { + "epoch": 0.31820543600953183, + "grad_norm": 188.94200134277344, + "learning_rate": 3.0887298295996183e-06, + "loss": 14.3864, + "step": 555 + }, + { + "epoch": 0.31877877913747693, + "grad_norm": 186.75950622558594, + "learning_rate": 3.0715654952388957e-06, + "loss": 14.3803, + "step": 556 + }, + { + "epoch": 0.31935212226542203, + "grad_norm": 193.4385223388672, + "learning_rate": 3.054427816616773e-06, + "loss": 14.2965, + "step": 557 + }, + { + "epoch": 0.31992546539336714, + "grad_norm": 188.6703338623047, + "learning_rate": 3.0373170306183885e-06, + "loss": 14.4114, + "step": 558 + }, + { + "epoch": 0.32049880852131224, + "grad_norm": 194.2964630126953, + "learning_rate": 3.020233373757162e-06, + "loss": 14.2351, + "step": 559 + }, + { + "epoch": 0.32107215164925734, + "grad_norm": 204.58041381835938, + "learning_rate": 3.0031770821715233e-06, + "loss": 14.3925, + "step": 560 + }, + { + "epoch": 0.32164549477720245, + "grad_norm": 182.63665771484375, + "learning_rate": 2.9861483916216404e-06, + "loss": 14.371, + "step": 561 + }, + { + "epoch": 0.32221883790514755, + "grad_norm": 201.17764282226562, + "learning_rate": 2.969147537486175e-06, + "loss": 14.301, + "step": 562 + }, + { + "epoch": 0.32279218103309265, + "grad_norm": 187.64376831054688, + "learning_rate": 2.952174754759012e-06, + "loss": 14.3037, + "step": 563 + }, + { + "epoch": 0.32336552416103775, + "grad_norm": 182.01651000976562, + "learning_rate": 2.935230278046025e-06, + "loss": 14.2326, + "step": 564 + }, + { + "epoch": 0.32393886728898286, + "grad_norm": 184.65011596679688, + "learning_rate": 2.9183143415618297e-06, + "loss": 14.3121, + "step": 565 + }, + { + "epoch": 0.32451221041692796, + "grad_norm": 172.1057891845703, + "learning_rate": 2.9014271791265403e-06, + "loss": 14.203, + "step": 566 + }, + { + "epoch": 0.32508555354487306, + "grad_norm": 178.24777221679688, + "learning_rate": 2.8845690241625437e-06, + "loss": 14.3961, + "step": 567 + }, + { + "epoch": 0.32565889667281817, + "grad_norm": 198.43179321289062, + "learning_rate": 2.867740109691277e-06, + "loss": 14.3644, + "step": 568 + }, + { + "epoch": 0.32623223980076327, + "grad_norm": 184.53721618652344, + "learning_rate": 2.850940668329996e-06, + "loss": 14.3736, + "step": 569 + }, + { + "epoch": 0.32680558292870837, + "grad_norm": 186.57337951660156, + "learning_rate": 2.8341709322885624e-06, + "loss": 14.2914, + "step": 570 + }, + { + "epoch": 0.3273789260566535, + "grad_norm": 194.31634521484375, + "learning_rate": 2.817431133366246e-06, + "loss": 14.3647, + "step": 571 + }, + { + "epoch": 0.3279522691845986, + "grad_norm": 189.49636840820312, + "learning_rate": 2.800721502948506e-06, + "loss": 14.4111, + "step": 572 + }, + { + "epoch": 0.3285256123125437, + "grad_norm": 194.70204162597656, + "learning_rate": 2.7840422720037943e-06, + "loss": 14.4538, + "step": 573 + }, + { + "epoch": 0.3290989554404888, + "grad_norm": 191.64688110351562, + "learning_rate": 2.767393671080376e-06, + "loss": 14.2899, + "step": 574 + }, + { + "epoch": 0.3296722985684339, + "grad_norm": 193.7047576904297, + "learning_rate": 2.7507759303031257e-06, + "loss": 14.3198, + "step": 575 + }, + { + "epoch": 0.330245641696379, + "grad_norm": 189.0587158203125, + "learning_rate": 2.7341892793703594e-06, + "loss": 14.3457, + "step": 576 + }, + { + "epoch": 0.3308189848243241, + "grad_norm": 188.7035675048828, + "learning_rate": 2.7176339475506515e-06, + "loss": 14.2817, + "step": 577 + }, + { + "epoch": 0.3313923279522692, + "grad_norm": 184.22344970703125, + "learning_rate": 2.7011101636796677e-06, + "loss": 14.3146, + "step": 578 + }, + { + "epoch": 0.3319656710802143, + "grad_norm": 180.2777557373047, + "learning_rate": 2.6846181561570085e-06, + "loss": 14.3799, + "step": 579 + }, + { + "epoch": 0.3325390142081594, + "grad_norm": 185.93838500976562, + "learning_rate": 2.668158152943039e-06, + "loss": 14.3632, + "step": 580 + }, + { + "epoch": 0.3331123573361045, + "grad_norm": 183.86941528320312, + "learning_rate": 2.651730381555754e-06, + "loss": 14.3327, + "step": 581 + }, + { + "epoch": 0.3336857004640496, + "grad_norm": 184.0933074951172, + "learning_rate": 2.635335069067617e-06, + "loss": 14.3807, + "step": 582 + }, + { + "epoch": 0.3342590435919947, + "grad_norm": 183.67532348632812, + "learning_rate": 2.618972442102432e-06, + "loss": 14.4402, + "step": 583 + }, + { + "epoch": 0.3348323867199398, + "grad_norm": 185.25009155273438, + "learning_rate": 2.602642726832212e-06, + "loss": 14.3258, + "step": 584 + }, + { + "epoch": 0.3354057298478849, + "grad_norm": 186.76087951660156, + "learning_rate": 2.5863461489740403e-06, + "loss": 14.2503, + "step": 585 + }, + { + "epoch": 0.33597907297583, + "grad_norm": 183.74209594726562, + "learning_rate": 2.57008293378697e-06, + "loss": 14.282, + "step": 586 + }, + { + "epoch": 0.3365524161037751, + "grad_norm": 185.21743774414062, + "learning_rate": 2.553853306068888e-06, + "loss": 14.3058, + "step": 587 + }, + { + "epoch": 0.3371257592317202, + "grad_norm": 180.64405822753906, + "learning_rate": 2.5376574901534303e-06, + "loss": 14.2191, + "step": 588 + }, + { + "epoch": 0.3376991023596653, + "grad_norm": 197.49221801757812, + "learning_rate": 2.5214957099068613e-06, + "loss": 14.2684, + "step": 589 + }, + { + "epoch": 0.33827244548761043, + "grad_norm": 178.35708618164062, + "learning_rate": 2.5053681887249916e-06, + "loss": 14.2358, + "step": 590 + }, + { + "epoch": 0.33884578861555553, + "grad_norm": 181.4188995361328, + "learning_rate": 2.4892751495300893e-06, + "loss": 14.3204, + "step": 591 + }, + { + "epoch": 0.33941913174350063, + "grad_norm": 178.8732452392578, + "learning_rate": 2.4732168147677927e-06, + "loss": 14.2609, + "step": 592 + }, + { + "epoch": 0.33999247487144574, + "grad_norm": 191.7628631591797, + "learning_rate": 2.4571934064040364e-06, + "loss": 14.2528, + "step": 593 + }, + { + "epoch": 0.34056581799939084, + "grad_norm": 193.52305603027344, + "learning_rate": 2.4412051459219945e-06, + "loss": 14.3341, + "step": 594 + }, + { + "epoch": 0.34113916112733594, + "grad_norm": 198.21897888183594, + "learning_rate": 2.425252254319002e-06, + "loss": 14.3828, + "step": 595 + }, + { + "epoch": 0.34171250425528105, + "grad_norm": 191.85609436035156, + "learning_rate": 2.4093349521035105e-06, + "loss": 14.3309, + "step": 596 + }, + { + "epoch": 0.34228584738322615, + "grad_norm": 185.22528076171875, + "learning_rate": 2.3934534592920416e-06, + "loss": 14.2623, + "step": 597 + }, + { + "epoch": 0.34285919051117125, + "grad_norm": 188.74754333496094, + "learning_rate": 2.3776079954061385e-06, + "loss": 14.4269, + "step": 598 + }, + { + "epoch": 0.34343253363911636, + "grad_norm": 178.31825256347656, + "learning_rate": 2.3617987794693358e-06, + "loss": 14.2489, + "step": 599 + }, + { + "epoch": 0.34400587676706146, + "grad_norm": 188.00209045410156, + "learning_rate": 2.3460260300041355e-06, + "loss": 14.3401, + "step": 600 + }, + { + "epoch": 0.34457921989500656, + "grad_norm": 191.75465393066406, + "learning_rate": 2.3302899650289773e-06, + "loss": 14.3273, + "step": 601 + }, + { + "epoch": 0.34515256302295166, + "grad_norm": 185.55166625976562, + "learning_rate": 2.314590802055232e-06, + "loss": 14.3695, + "step": 602 + }, + { + "epoch": 0.34572590615089677, + "grad_norm": 177.90130615234375, + "learning_rate": 2.2989287580841985e-06, + "loss": 14.3113, + "step": 603 + }, + { + "epoch": 0.34629924927884187, + "grad_norm": 189.20179748535156, + "learning_rate": 2.2833040496040925e-06, + "loss": 14.2244, + "step": 604 + }, + { + "epoch": 0.346872592406787, + "grad_norm": 192.3074493408203, + "learning_rate": 2.267716892587062e-06, + "loss": 14.28, + "step": 605 + }, + { + "epoch": 0.3474459355347321, + "grad_norm": 192.26055908203125, + "learning_rate": 2.252167502486205e-06, + "loss": 14.2554, + "step": 606 + }, + { + "epoch": 0.3480192786626772, + "grad_norm": 184.66305541992188, + "learning_rate": 2.2366560942325833e-06, + "loss": 14.3175, + "step": 607 + }, + { + "epoch": 0.3485926217906223, + "grad_norm": 186.08566284179688, + "learning_rate": 2.2211828822322547e-06, + "loss": 14.2586, + "step": 608 + }, + { + "epoch": 0.3491659649185674, + "grad_norm": 183.10336303710938, + "learning_rate": 2.205748080363316e-06, + "loss": 14.3051, + "step": 609 + }, + { + "epoch": 0.3497393080465125, + "grad_norm": 188.01463317871094, + "learning_rate": 2.190351901972935e-06, + "loss": 14.2597, + "step": 610 + }, + { + "epoch": 0.3503126511744576, + "grad_norm": 186.796630859375, + "learning_rate": 2.1749945598744076e-06, + "loss": 14.3121, + "step": 611 + }, + { + "epoch": 0.3508859943024027, + "grad_norm": 197.26966857910156, + "learning_rate": 2.159676266344222e-06, + "loss": 14.3272, + "step": 612 + }, + { + "epoch": 0.3514593374303478, + "grad_norm": 188.6767578125, + "learning_rate": 2.144397233119112e-06, + "loss": 14.2799, + "step": 613 + }, + { + "epoch": 0.35203268055829284, + "grad_norm": 185.7920684814453, + "learning_rate": 2.1291576713931382e-06, + "loss": 14.3654, + "step": 614 + }, + { + "epoch": 0.35260602368623795, + "grad_norm": 183.85186767578125, + "learning_rate": 2.1139577918147715e-06, + "loss": 14.2435, + "step": 615 + }, + { + "epoch": 0.35317936681418305, + "grad_norm": 188.81492614746094, + "learning_rate": 2.0987978044839707e-06, + "loss": 14.3787, + "step": 616 + }, + { + "epoch": 0.35375270994212815, + "grad_norm": 181.99166870117188, + "learning_rate": 2.0836779189492925e-06, + "loss": 14.3489, + "step": 617 + }, + { + "epoch": 0.35432605307007325, + "grad_norm": 182.6253204345703, + "learning_rate": 2.068598344204981e-06, + "loss": 14.2816, + "step": 618 + }, + { + "epoch": 0.35489939619801836, + "grad_norm": 178.6793975830078, + "learning_rate": 2.053559288688086e-06, + "loss": 14.2392, + "step": 619 + }, + { + "epoch": 0.35547273932596346, + "grad_norm": 190.26219177246094, + "learning_rate": 2.0385609602755878e-06, + "loss": 14.2875, + "step": 620 + }, + { + "epoch": 0.35604608245390856, + "grad_norm": 199.85971069335938, + "learning_rate": 2.02360356628151e-06, + "loss": 14.3167, + "step": 621 + }, + { + "epoch": 0.35661942558185367, + "grad_norm": 199.51605224609375, + "learning_rate": 2.0086873134540626e-06, + "loss": 14.336, + "step": 622 + }, + { + "epoch": 0.35719276870979877, + "grad_norm": 183.92247009277344, + "learning_rate": 1.9938124079727874e-06, + "loss": 14.2201, + "step": 623 + }, + { + "epoch": 0.35776611183774387, + "grad_norm": 193.48175048828125, + "learning_rate": 1.9789790554456977e-06, + "loss": 14.2868, + "step": 624 + }, + { + "epoch": 0.358339454965689, + "grad_norm": 189.4330291748047, + "learning_rate": 1.9641874609064443e-06, + "loss": 14.2538, + "step": 625 + }, + { + "epoch": 0.3589127980936341, + "grad_norm": 182.5979461669922, + "learning_rate": 1.9494378288114816e-06, + "loss": 14.2463, + "step": 626 + }, + { + "epoch": 0.3594861412215792, + "grad_norm": 177.77850341796875, + "learning_rate": 1.9347303630372373e-06, + "loss": 14.1946, + "step": 627 + }, + { + "epoch": 0.3600594843495243, + "grad_norm": 182.85313415527344, + "learning_rate": 1.9200652668772924e-06, + "loss": 14.2852, + "step": 628 + }, + { + "epoch": 0.3606328274774694, + "grad_norm": 189.149169921875, + "learning_rate": 1.9054427430395828e-06, + "loss": 14.2522, + "step": 629 + }, + { + "epoch": 0.3612061706054145, + "grad_norm": 186.2698211669922, + "learning_rate": 1.890862993643583e-06, + "loss": 14.2526, + "step": 630 + }, + { + "epoch": 0.3617795137333596, + "grad_norm": 188.8157196044922, + "learning_rate": 1.8763262202175204e-06, + "loss": 14.2772, + "step": 631 + }, + { + "epoch": 0.3623528568613047, + "grad_norm": 184.87147521972656, + "learning_rate": 1.8618326236955908e-06, + "loss": 14.3395, + "step": 632 + }, + { + "epoch": 0.3629261999892498, + "grad_norm": 185.856201171875, + "learning_rate": 1.8473824044151762e-06, + "loss": 14.2998, + "step": 633 + }, + { + "epoch": 0.3634995431171949, + "grad_norm": 184.26248168945312, + "learning_rate": 1.8329757621140748e-06, + "loss": 14.2654, + "step": 634 + }, + { + "epoch": 0.36407288624514, + "grad_norm": 186.35105895996094, + "learning_rate": 1.81861289592775e-06, + "loss": 14.2294, + "step": 635 + }, + { + "epoch": 0.3646462293730851, + "grad_norm": 187.1624298095703, + "learning_rate": 1.8042940043865658e-06, + "loss": 14.3037, + "step": 636 + }, + { + "epoch": 0.3652195725010302, + "grad_norm": 176.15463256835938, + "learning_rate": 1.7900192854130465e-06, + "loss": 14.2271, + "step": 637 + }, + { + "epoch": 0.3657929156289753, + "grad_norm": 188.59449768066406, + "learning_rate": 1.7757889363191484e-06, + "loss": 14.3419, + "step": 638 + }, + { + "epoch": 0.3663662587569204, + "grad_norm": 180.50051879882812, + "learning_rate": 1.7616031538035189e-06, + "loss": 14.2815, + "step": 639 + }, + { + "epoch": 0.3669396018848655, + "grad_norm": 185.34474182128906, + "learning_rate": 1.7474621339487925e-06, + "loss": 14.2534, + "step": 640 + }, + { + "epoch": 0.3675129450128106, + "grad_norm": 184.1910858154297, + "learning_rate": 1.7333660722188667e-06, + "loss": 14.2397, + "step": 641 + }, + { + "epoch": 0.3680862881407557, + "grad_norm": 185.2908477783203, + "learning_rate": 1.7193151634562071e-06, + "loss": 14.2306, + "step": 642 + }, + { + "epoch": 0.3686596312687008, + "grad_norm": 183.8131103515625, + "learning_rate": 1.7053096018791588e-06, + "loss": 14.2843, + "step": 643 + }, + { + "epoch": 0.36923297439664593, + "grad_norm": 189.00628662109375, + "learning_rate": 1.691349581079249e-06, + "loss": 14.1944, + "step": 644 + }, + { + "epoch": 0.36980631752459103, + "grad_norm": 189.68801879882812, + "learning_rate": 1.6774352940185269e-06, + "loss": 14.2894, + "step": 645 + }, + { + "epoch": 0.37037966065253614, + "grad_norm": 193.29290771484375, + "learning_rate": 1.663566933026879e-06, + "loss": 14.3125, + "step": 646 + }, + { + "epoch": 0.37095300378048124, + "grad_norm": 189.4978790283203, + "learning_rate": 1.6497446897993885e-06, + "loss": 14.1912, + "step": 647 + }, + { + "epoch": 0.37152634690842634, + "grad_norm": 187.17823791503906, + "learning_rate": 1.6359687553936714e-06, + "loss": 14.2728, + "step": 648 + }, + { + "epoch": 0.37209969003637144, + "grad_norm": 180.1759033203125, + "learning_rate": 1.6222393202272414e-06, + "loss": 14.2409, + "step": 649 + }, + { + "epoch": 0.37267303316431655, + "grad_norm": 175.7593536376953, + "learning_rate": 1.6085565740748825e-06, + "loss": 14.1765, + "step": 650 + }, + { + "epoch": 0.37324637629226165, + "grad_norm": 183.71810913085938, + "learning_rate": 1.5949207060660138e-06, + "loss": 14.2563, + "step": 651 + }, + { + "epoch": 0.37381971942020675, + "grad_norm": 185.6693572998047, + "learning_rate": 1.581331904682089e-06, + "loss": 14.3579, + "step": 652 + }, + { + "epoch": 0.37439306254815186, + "grad_norm": 189.27444458007812, + "learning_rate": 1.5677903577539806e-06, + "loss": 14.2853, + "step": 653 + }, + { + "epoch": 0.37496640567609696, + "grad_norm": 190.42837524414062, + "learning_rate": 1.5542962524593869e-06, + "loss": 14.2187, + "step": 654 + }, + { + "epoch": 0.37553974880404206, + "grad_norm": 177.54698181152344, + "learning_rate": 1.54084977532025e-06, + "loss": 14.1745, + "step": 655 + }, + { + "epoch": 0.37611309193198716, + "grad_norm": 183.06019592285156, + "learning_rate": 1.5274511122001684e-06, + "loss": 14.2742, + "step": 656 + }, + { + "epoch": 0.37668643505993227, + "grad_norm": 190.93809509277344, + "learning_rate": 1.5141004483018323e-06, + "loss": 14.3287, + "step": 657 + }, + { + "epoch": 0.37725977818787737, + "grad_norm": 195.81625366210938, + "learning_rate": 1.5007979681644696e-06, + "loss": 14.2384, + "step": 658 + }, + { + "epoch": 0.3778331213158225, + "grad_norm": 187.17530822753906, + "learning_rate": 1.4875438556612836e-06, + "loss": 14.25, + "step": 659 + }, + { + "epoch": 0.3784064644437676, + "grad_norm": 183.16397094726562, + "learning_rate": 1.474338293996917e-06, + "loss": 14.3265, + "step": 660 + }, + { + "epoch": 0.3789798075717127, + "grad_norm": 177.78402709960938, + "learning_rate": 1.4611814657049257e-06, + "loss": 14.1526, + "step": 661 + }, + { + "epoch": 0.3795531506996578, + "grad_norm": 187.61419677734375, + "learning_rate": 1.4480735526452427e-06, + "loss": 14.2041, + "step": 662 + }, + { + "epoch": 0.3801264938276029, + "grad_norm": 181.4232635498047, + "learning_rate": 1.4350147360016743e-06, + "loss": 14.2766, + "step": 663 + }, + { + "epoch": 0.380699836955548, + "grad_norm": 185.21261596679688, + "learning_rate": 1.4220051962793952e-06, + "loss": 14.216, + "step": 664 + }, + { + "epoch": 0.3812731800834931, + "grad_norm": 187.9059295654297, + "learning_rate": 1.4090451133024473e-06, + "loss": 14.2696, + "step": 665 + }, + { + "epoch": 0.3818465232114382, + "grad_norm": 184.80746459960938, + "learning_rate": 1.3961346662112585e-06, + "loss": 14.2777, + "step": 666 + }, + { + "epoch": 0.3824198663393833, + "grad_norm": 178.53359985351562, + "learning_rate": 1.3832740334601692e-06, + "loss": 14.2119, + "step": 667 + }, + { + "epoch": 0.3829932094673284, + "grad_norm": 186.9265594482422, + "learning_rate": 1.3704633928149575e-06, + "loss": 14.278, + "step": 668 + }, + { + "epoch": 0.3835665525952735, + "grad_norm": 181.26290893554688, + "learning_rate": 1.3577029213503911e-06, + "loss": 14.2922, + "step": 669 + }, + { + "epoch": 0.3841398957232186, + "grad_norm": 182.86557006835938, + "learning_rate": 1.3449927954477732e-06, + "loss": 14.2855, + "step": 670 + }, + { + "epoch": 0.3847132388511637, + "grad_norm": 182.475830078125, + "learning_rate": 1.3323331907925046e-06, + "loss": 14.2958, + "step": 671 + }, + { + "epoch": 0.3852865819791088, + "grad_norm": 189.7706756591797, + "learning_rate": 1.319724282371664e-06, + "loss": 14.2176, + "step": 672 + }, + { + "epoch": 0.3858599251070539, + "grad_norm": 193.93069458007812, + "learning_rate": 1.307166244471576e-06, + "loss": 14.2117, + "step": 673 + }, + { + "epoch": 0.386433268234999, + "grad_norm": 179.2334442138672, + "learning_rate": 1.2946592506754097e-06, + "loss": 14.3632, + "step": 674 + }, + { + "epoch": 0.3870066113629441, + "grad_norm": 189.32432556152344, + "learning_rate": 1.282203473860783e-06, + "loss": 14.1928, + "step": 675 + }, + { + "epoch": 0.3875799544908892, + "grad_norm": 182.27935791015625, + "learning_rate": 1.2697990861973635e-06, + "loss": 14.2161, + "step": 676 + }, + { + "epoch": 0.3881532976188343, + "grad_norm": 181.55154418945312, + "learning_rate": 1.257446259144494e-06, + "loss": 14.2658, + "step": 677 + }, + { + "epoch": 0.38872664074677943, + "grad_norm": 183.76902770996094, + "learning_rate": 1.2451451634488264e-06, + "loss": 14.3169, + "step": 678 + }, + { + "epoch": 0.38929998387472453, + "grad_norm": 179.52069091796875, + "learning_rate": 1.2328959691419517e-06, + "loss": 14.261, + "step": 679 + }, + { + "epoch": 0.38987332700266963, + "grad_norm": 187.97842407226562, + "learning_rate": 1.2206988455380558e-06, + "loss": 14.1935, + "step": 680 + }, + { + "epoch": 0.39044667013061474, + "grad_norm": 177.58485412597656, + "learning_rate": 1.2085539612315844e-06, + "loss": 14.0745, + "step": 681 + }, + { + "epoch": 0.39102001325855984, + "grad_norm": 178.7311248779297, + "learning_rate": 1.1964614840949002e-06, + "loss": 14.223, + "step": 682 + }, + { + "epoch": 0.39159335638650494, + "grad_norm": 181.48497009277344, + "learning_rate": 1.1844215812759708e-06, + "loss": 14.1863, + "step": 683 + }, + { + "epoch": 0.39216669951445005, + "grad_norm": 183.38412475585938, + "learning_rate": 1.1724344191960591e-06, + "loss": 14.2664, + "step": 684 + }, + { + "epoch": 0.39274004264239515, + "grad_norm": 190.3087921142578, + "learning_rate": 1.1605001635474183e-06, + "loss": 14.3032, + "step": 685 + }, + { + "epoch": 0.39331338577034025, + "grad_norm": 179.9006805419922, + "learning_rate": 1.1486189792910024e-06, + "loss": 14.2501, + "step": 686 + }, + { + "epoch": 0.39388672889828535, + "grad_norm": 186.22154235839844, + "learning_rate": 1.1367910306541918e-06, + "loss": 14.1971, + "step": 687 + }, + { + "epoch": 0.39446007202623046, + "grad_norm": 180.23377990722656, + "learning_rate": 1.1250164811285148e-06, + "loss": 14.2892, + "step": 688 + }, + { + "epoch": 0.39503341515417556, + "grad_norm": 177.89480590820312, + "learning_rate": 1.1132954934673911e-06, + "loss": 14.1728, + "step": 689 + }, + { + "epoch": 0.39560675828212066, + "grad_norm": 187.4567108154297, + "learning_rate": 1.1016282296838887e-06, + "loss": 14.2579, + "step": 690 + }, + { + "epoch": 0.39618010141006577, + "grad_norm": 176.47003173828125, + "learning_rate": 1.090014851048473e-06, + "loss": 14.2398, + "step": 691 + }, + { + "epoch": 0.39675344453801087, + "grad_norm": 182.03118896484375, + "learning_rate": 1.078455518086784e-06, + "loss": 14.2395, + "step": 692 + }, + { + "epoch": 0.39732678766595597, + "grad_norm": 181.1314697265625, + "learning_rate": 1.0669503905774198e-06, + "loss": 14.1643, + "step": 693 + }, + { + "epoch": 0.3979001307939011, + "grad_norm": 189.62818908691406, + "learning_rate": 1.055499627549722e-06, + "loss": 14.1924, + "step": 694 + }, + { + "epoch": 0.3984734739218462, + "grad_norm": 180.246337890625, + "learning_rate": 1.0441033872815804e-06, + "loss": 14.2148, + "step": 695 + }, + { + "epoch": 0.3990468170497913, + "grad_norm": 180.3937530517578, + "learning_rate": 1.0327618272972484e-06, + "loss": 14.2263, + "step": 696 + }, + { + "epoch": 0.3996201601777364, + "grad_norm": 189.4615478515625, + "learning_rate": 1.0214751043651582e-06, + "loss": 14.2253, + "step": 697 + }, + { + "epoch": 0.4001935033056815, + "grad_norm": 177.67706298828125, + "learning_rate": 1.010243374495763e-06, + "loss": 14.1809, + "step": 698 + }, + { + "epoch": 0.4007668464336266, + "grad_norm": 176.24996948242188, + "learning_rate": 9.990667929393715e-07, + "loss": 14.0939, + "step": 699 + }, + { + "epoch": 0.4013401895615717, + "grad_norm": 184.9351806640625, + "learning_rate": 9.879455141840067e-07, + "loss": 14.3538, + "step": 700 + }, + { + "epoch": 0.4019135326895168, + "grad_norm": 189.4423370361328, + "learning_rate": 9.768796919532742e-07, + "loss": 14.2778, + "step": 701 + }, + { + "epoch": 0.4024868758174619, + "grad_norm": 190.33895874023438, + "learning_rate": 9.658694792042284e-07, + "loss": 14.3299, + "step": 702 + }, + { + "epoch": 0.403060218945407, + "grad_norm": 183.4825897216797, + "learning_rate": 9.549150281252633e-07, + "loss": 14.1587, + "step": 703 + }, + { + "epoch": 0.4036335620733521, + "grad_norm": 184.19715881347656, + "learning_rate": 9.440164901340127e-07, + "loss": 14.1235, + "step": 704 + }, + { + "epoch": 0.4042069052012972, + "grad_norm": 191.84231567382812, + "learning_rate": 9.331740158752495e-07, + "loss": 14.1645, + "step": 705 + }, + { + "epoch": 0.4047802483292423, + "grad_norm": 181.87342834472656, + "learning_rate": 9.223877552188065e-07, + "loss": 14.2719, + "step": 706 + }, + { + "epoch": 0.4053535914571874, + "grad_norm": 183.34930419921875, + "learning_rate": 9.116578572575091e-07, + "loss": 14.2534, + "step": 707 + }, + { + "epoch": 0.4059269345851325, + "grad_norm": 174.75514221191406, + "learning_rate": 9.009844703051063e-07, + "loss": 14.3114, + "step": 708 + }, + { + "epoch": 0.4065002777130776, + "grad_norm": 176.34121704101562, + "learning_rate": 8.903677418942292e-07, + "loss": 14.2201, + "step": 709 + }, + { + "epoch": 0.4070736208410227, + "grad_norm": 183.08766174316406, + "learning_rate": 8.79807818774343e-07, + "loss": 14.1528, + "step": 710 + }, + { + "epoch": 0.4076469639689678, + "grad_norm": 189.90757751464844, + "learning_rate": 8.693048469097293e-07, + "loss": 14.2383, + "step": 711 + }, + { + "epoch": 0.4082203070969129, + "grad_norm": 181.50448608398438, + "learning_rate": 8.58858971477457e-07, + "loss": 14.262, + "step": 712 + }, + { + "epoch": 0.40879365022485803, + "grad_norm": 178.92880249023438, + "learning_rate": 8.484703368653812e-07, + "loss": 14.1923, + "step": 713 + }, + { + "epoch": 0.40936699335280313, + "grad_norm": 186.92608642578125, + "learning_rate": 8.381390866701517e-07, + "loss": 14.1751, + "step": 714 + }, + { + "epoch": 0.40994033648074824, + "grad_norm": 183.1122589111328, + "learning_rate": 8.278653636952177e-07, + "loss": 14.2072, + "step": 715 + }, + { + "epoch": 0.41051367960869334, + "grad_norm": 172.70138549804688, + "learning_rate": 8.176493099488664e-07, + "loss": 14.209, + "step": 716 + }, + { + "epoch": 0.41108702273663844, + "grad_norm": 193.0767822265625, + "learning_rate": 8.074910666422475e-07, + "loss": 14.2055, + "step": 717 + }, + { + "epoch": 0.41166036586458354, + "grad_norm": 181.7238006591797, + "learning_rate": 7.973907741874287e-07, + "loss": 14.2313, + "step": 718 + }, + { + "epoch": 0.41223370899252865, + "grad_norm": 196.82655334472656, + "learning_rate": 7.873485721954572e-07, + "loss": 14.3521, + "step": 719 + }, + { + "epoch": 0.41280705212047375, + "grad_norm": 184.25498962402344, + "learning_rate": 7.773645994744222e-07, + "loss": 14.2955, + "step": 720 + }, + { + "epoch": 0.41338039524841885, + "grad_norm": 179.338623046875, + "learning_rate": 7.674389940275406e-07, + "loss": 14.1519, + "step": 721 + }, + { + "epoch": 0.41395373837636396, + "grad_norm": 179.32083129882812, + "learning_rate": 7.575718930512516e-07, + "loss": 14.2179, + "step": 722 + }, + { + "epoch": 0.41452708150430906, + "grad_norm": 178.83621215820312, + "learning_rate": 7.47763432933315e-07, + "loss": 14.2179, + "step": 723 + }, + { + "epoch": 0.41510042463225416, + "grad_norm": 184.3859100341797, + "learning_rate": 7.380137492509309e-07, + "loss": 14.2816, + "step": 724 + }, + { + "epoch": 0.41567376776019926, + "grad_norm": 178.84129333496094, + "learning_rate": 7.283229767688627e-07, + "loss": 14.2278, + "step": 725 + }, + { + "epoch": 0.4162471108881443, + "grad_norm": 171.81666564941406, + "learning_rate": 7.186912494375736e-07, + "loss": 14.1466, + "step": 726 + }, + { + "epoch": 0.4168204540160894, + "grad_norm": 194.59820556640625, + "learning_rate": 7.091187003913802e-07, + "loss": 14.2792, + "step": 727 + }, + { + "epoch": 0.4173937971440345, + "grad_norm": 180.1846160888672, + "learning_rate": 6.996054619466053e-07, + "loss": 14.1733, + "step": 728 + }, + { + "epoch": 0.4179671402719796, + "grad_norm": 180.338134765625, + "learning_rate": 6.901516655997536e-07, + "loss": 14.1878, + "step": 729 + }, + { + "epoch": 0.4185404833999247, + "grad_norm": 182.3441162109375, + "learning_rate": 6.80757442025694e-07, + "loss": 14.2232, + "step": 730 + }, + { + "epoch": 0.4191138265278698, + "grad_norm": 180.3588104248047, + "learning_rate": 6.714229210758516e-07, + "loss": 14.2163, + "step": 731 + }, + { + "epoch": 0.41968716965581493, + "grad_norm": 181.55784606933594, + "learning_rate": 6.621482317764105e-07, + "loss": 14.1579, + "step": 732 + }, + { + "epoch": 0.42026051278376003, + "grad_norm": 184.411376953125, + "learning_rate": 6.529335023265387e-07, + "loss": 14.2631, + "step": 733 + }, + { + "epoch": 0.42083385591170513, + "grad_norm": 182.96253967285156, + "learning_rate": 6.437788600966066e-07, + "loss": 14.285, + "step": 734 + }, + { + "epoch": 0.42140719903965024, + "grad_norm": 192.8575897216797, + "learning_rate": 6.346844316264312e-07, + "loss": 14.1554, + "step": 735 + }, + { + "epoch": 0.42198054216759534, + "grad_norm": 176.40582275390625, + "learning_rate": 6.256503426235277e-07, + "loss": 14.2083, + "step": 736 + }, + { + "epoch": 0.42255388529554044, + "grad_norm": 183.86581420898438, + "learning_rate": 6.166767179613691e-07, + "loss": 14.2304, + "step": 737 + }, + { + "epoch": 0.42312722842348555, + "grad_norm": 190.2710723876953, + "learning_rate": 6.077636816776611e-07, + "loss": 14.2459, + "step": 738 + }, + { + "epoch": 0.42370057155143065, + "grad_norm": 183.04217529296875, + "learning_rate": 5.989113569726312e-07, + "loss": 14.1955, + "step": 739 + }, + { + "epoch": 0.42427391467937575, + "grad_norm": 176.5095672607422, + "learning_rate": 5.901198662073188e-07, + "loss": 14.2403, + "step": 740 + }, + { + "epoch": 0.42484725780732085, + "grad_norm": 175.92588806152344, + "learning_rate": 5.813893309018881e-07, + "loss": 14.2281, + "step": 741 + }, + { + "epoch": 0.42542060093526596, + "grad_norm": 190.41502380371094, + "learning_rate": 5.727198717339511e-07, + "loss": 14.239, + "step": 742 + }, + { + "epoch": 0.42599394406321106, + "grad_norm": 179.48741149902344, + "learning_rate": 5.641116085368931e-07, + "loss": 14.2565, + "step": 743 + }, + { + "epoch": 0.42656728719115616, + "grad_norm": 195.33184814453125, + "learning_rate": 5.555646602982207e-07, + "loss": 14.3216, + "step": 744 + }, + { + "epoch": 0.42714063031910127, + "grad_norm": 185.87525939941406, + "learning_rate": 5.470791451579172e-07, + "loss": 14.242, + "step": 745 + }, + { + "epoch": 0.42771397344704637, + "grad_norm": 188.23599243164062, + "learning_rate": 5.386551804068063e-07, + "loss": 14.2882, + "step": 746 + }, + { + "epoch": 0.4282873165749915, + "grad_norm": 177.58998107910156, + "learning_rate": 5.302928824849335e-07, + "loss": 14.2356, + "step": 747 + }, + { + "epoch": 0.4288606597029366, + "grad_norm": 186.3286895751953, + "learning_rate": 5.219923669799587e-07, + "loss": 14.2915, + "step": 748 + }, + { + "epoch": 0.4294340028308817, + "grad_norm": 180.6791229248047, + "learning_rate": 5.137537486255517e-07, + "loss": 14.2342, + "step": 749 + }, + { + "epoch": 0.4300073459588268, + "grad_norm": 194.50714111328125, + "learning_rate": 5.055771412998122e-07, + "loss": 14.2382, + "step": 750 + }, + { + "epoch": 0.4305806890867719, + "grad_norm": 181.63011169433594, + "learning_rate": 4.974626580236957e-07, + "loss": 14.1548, + "step": 751 + }, + { + "epoch": 0.431154032214717, + "grad_norm": 185.96437072753906, + "learning_rate": 4.894104109594466e-07, + "loss": 14.2133, + "step": 752 + }, + { + "epoch": 0.4317273753426621, + "grad_norm": 177.23391723632812, + "learning_rate": 4.814205114090543e-07, + "loss": 14.213, + "step": 753 + }, + { + "epoch": 0.4323007184706072, + "grad_norm": 178.10658264160156, + "learning_rate": 4.734930698127077e-07, + "loss": 14.216, + "step": 754 + }, + { + "epoch": 0.4328740615985523, + "grad_norm": 178.41822814941406, + "learning_rate": 4.6562819574727304e-07, + "loss": 14.0747, + "step": 755 + }, + { + "epoch": 0.4334474047264974, + "grad_norm": 192.12301635742188, + "learning_rate": 4.578259979247801e-07, + "loss": 14.2543, + "step": 756 + }, + { + "epoch": 0.4340207478544425, + "grad_norm": 182.95399475097656, + "learning_rate": 4.500865841909169e-07, + "loss": 14.1967, + "step": 757 + }, + { + "epoch": 0.4345940909823876, + "grad_norm": 182.12098693847656, + "learning_rate": 4.4241006152353885e-07, + "loss": 14.233, + "step": 758 + }, + { + "epoch": 0.4351674341103327, + "grad_norm": 185.19178771972656, + "learning_rate": 4.3479653603119287e-07, + "loss": 14.1932, + "step": 759 + }, + { + "epoch": 0.4357407772382778, + "grad_norm": 175.16232299804688, + "learning_rate": 4.2724611295164755e-07, + "loss": 14.2061, + "step": 760 + }, + { + "epoch": 0.4363141203662229, + "grad_norm": 171.42161560058594, + "learning_rate": 4.197588966504401e-07, + "loss": 14.1964, + "step": 761 + }, + { + "epoch": 0.436887463494168, + "grad_norm": 179.8773193359375, + "learning_rate": 4.123349906194357e-07, + "loss": 14.1541, + "step": 762 + }, + { + "epoch": 0.4374608066221131, + "grad_norm": 179.10585021972656, + "learning_rate": 4.0497449747539217e-07, + "loss": 14.1968, + "step": 763 + }, + { + "epoch": 0.4380341497500582, + "grad_norm": 191.01058959960938, + "learning_rate": 3.9767751895854467e-07, + "loss": 14.2196, + "step": 764 + }, + { + "epoch": 0.4386074928780033, + "grad_norm": 183.64254760742188, + "learning_rate": 3.904441559312006e-07, + "loss": 14.2129, + "step": 765 + }, + { + "epoch": 0.4391808360059484, + "grad_norm": 186.27633666992188, + "learning_rate": 3.8327450837634284e-07, + "loss": 14.1771, + "step": 766 + }, + { + "epoch": 0.43975417913389353, + "grad_norm": 189.0173797607422, + "learning_rate": 3.7616867539624733e-07, + "loss": 14.275, + "step": 767 + }, + { + "epoch": 0.44032752226183863, + "grad_norm": 187.9246368408203, + "learning_rate": 3.691267552111183e-07, + "loss": 14.2115, + "step": 768 + }, + { + "epoch": 0.44090086538978374, + "grad_norm": 185.96083068847656, + "learning_rate": 3.621488451577221e-07, + "loss": 14.1871, + "step": 769 + }, + { + "epoch": 0.44147420851772884, + "grad_norm": 180.14927673339844, + "learning_rate": 3.552350416880507e-07, + "loss": 14.1769, + "step": 770 + }, + { + "epoch": 0.44204755164567394, + "grad_norm": 190.77037048339844, + "learning_rate": 3.483854403679832e-07, + "loss": 14.159, + "step": 771 + }, + { + "epoch": 0.44262089477361904, + "grad_norm": 179.29052734375, + "learning_rate": 3.416001358759635e-07, + "loss": 14.2194, + "step": 772 + }, + { + "epoch": 0.44319423790156415, + "grad_norm": 187.48687744140625, + "learning_rate": 3.3487922200169944e-07, + "loss": 14.2782, + "step": 773 + }, + { + "epoch": 0.44376758102950925, + "grad_norm": 175.16188049316406, + "learning_rate": 3.2822279164485494e-07, + "loss": 14.1779, + "step": 774 + }, + { + "epoch": 0.44434092415745435, + "grad_norm": 182.10446166992188, + "learning_rate": 3.2163093681377765e-07, + "loss": 14.1585, + "step": 775 + }, + { + "epoch": 0.44491426728539946, + "grad_norm": 179.84536743164062, + "learning_rate": 3.151037486242181e-07, + "loss": 14.1605, + "step": 776 + }, + { + "epoch": 0.44548761041334456, + "grad_norm": 179.8004608154297, + "learning_rate": 3.08641317298074e-07, + "loss": 14.231, + "step": 777 + }, + { + "epoch": 0.44606095354128966, + "grad_norm": 190.25631713867188, + "learning_rate": 3.022437321621452e-07, + "loss": 14.2661, + "step": 778 + }, + { + "epoch": 0.44663429666923476, + "grad_norm": 177.27598571777344, + "learning_rate": 2.959110816468935e-07, + "loss": 14.3369, + "step": 779 + }, + { + "epoch": 0.44720763979717987, + "grad_norm": 180.63668823242188, + "learning_rate": 2.896434532852277e-07, + "loss": 14.1925, + "step": 780 + }, + { + "epoch": 0.44778098292512497, + "grad_norm": 172.8029022216797, + "learning_rate": 2.834409337112842e-07, + "loss": 14.2616, + "step": 781 + }, + { + "epoch": 0.4483543260530701, + "grad_norm": 182.10931396484375, + "learning_rate": 2.7730360865923954e-07, + "loss": 14.2489, + "step": 782 + }, + { + "epoch": 0.4489276691810152, + "grad_norm": 182.58995056152344, + "learning_rate": 2.712315629621176e-07, + "loss": 14.2247, + "step": 783 + }, + { + "epoch": 0.4495010123089603, + "grad_norm": 182.17227172851562, + "learning_rate": 2.6522488055062076e-07, + "loss": 14.251, + "step": 784 + }, + { + "epoch": 0.4500743554369054, + "grad_norm": 179.82858276367188, + "learning_rate": 2.5928364445196975e-07, + "loss": 14.2028, + "step": 785 + }, + { + "epoch": 0.4506476985648505, + "grad_norm": 177.07699584960938, + "learning_rate": 2.534079367887549e-07, + "loss": 14.1402, + "step": 786 + }, + { + "epoch": 0.4512210416927956, + "grad_norm": 174.88539123535156, + "learning_rate": 2.475978387778e-07, + "loss": 14.2159, + "step": 787 + }, + { + "epoch": 0.4517943848207407, + "grad_norm": 182.9810028076172, + "learning_rate": 2.4185343072904376e-07, + "loss": 14.2624, + "step": 788 + }, + { + "epoch": 0.4523677279486858, + "grad_norm": 180.19107055664062, + "learning_rate": 2.3617479204442462e-07, + "loss": 14.2149, + "step": 789 + }, + { + "epoch": 0.4529410710766309, + "grad_norm": 181.24143981933594, + "learning_rate": 2.305620012167853e-07, + "loss": 14.1732, + "step": 790 + }, + { + "epoch": 0.453514414204576, + "grad_norm": 174.54727172851562, + "learning_rate": 2.2501513582879108e-07, + "loss": 14.1911, + "step": 791 + }, + { + "epoch": 0.4540877573325211, + "grad_norm": 181.31564331054688, + "learning_rate": 2.1953427255185122e-07, + "loss": 14.2618, + "step": 792 + }, + { + "epoch": 0.4546611004604662, + "grad_norm": 179.88681030273438, + "learning_rate": 2.1411948714506414e-07, + "loss": 14.2918, + "step": 793 + }, + { + "epoch": 0.4552344435884113, + "grad_norm": 175.06451416015625, + "learning_rate": 2.0877085445416889e-07, + "loss": 14.1995, + "step": 794 + }, + { + "epoch": 0.4558077867163564, + "grad_norm": 181.00540161132812, + "learning_rate": 2.034884484105093e-07, + "loss": 14.1838, + "step": 795 + }, + { + "epoch": 0.4563811298443015, + "grad_norm": 182.90286254882812, + "learning_rate": 1.98272342030012e-07, + "loss": 14.2421, + "step": 796 + }, + { + "epoch": 0.4569544729722466, + "grad_norm": 188.80038452148438, + "learning_rate": 1.9312260741218114e-07, + "loss": 14.2287, + "step": 797 + }, + { + "epoch": 0.4575278161001917, + "grad_norm": 189.58168029785156, + "learning_rate": 1.8803931573909584e-07, + "loss": 14.1547, + "step": 798 + }, + { + "epoch": 0.4581011592281368, + "grad_norm": 182.40635681152344, + "learning_rate": 1.8302253727443041e-07, + "loss": 14.1816, + "step": 799 + }, + { + "epoch": 0.4586745023560819, + "grad_norm": 177.64756774902344, + "learning_rate": 1.7807234136248296e-07, + "loss": 14.0972, + "step": 800 + }, + { + "epoch": 0.45924784548402703, + "grad_norm": 179.09646606445312, + "learning_rate": 1.731887964272144e-07, + "loss": 14.2329, + "step": 801 + }, + { + "epoch": 0.45982118861197213, + "grad_norm": 182.92236328125, + "learning_rate": 1.6837196997130434e-07, + "loss": 14.129, + "step": 802 + }, + { + "epoch": 0.46039453173991723, + "grad_norm": 180.4651641845703, + "learning_rate": 1.6362192857521942e-07, + "loss": 14.2359, + "step": 803 + }, + { + "epoch": 0.46096787486786234, + "grad_norm": 188.5583038330078, + "learning_rate": 1.5893873789628812e-07, + "loss": 14.3177, + "step": 804 + }, + { + "epoch": 0.46154121799580744, + "grad_norm": 179.5811767578125, + "learning_rate": 1.5432246266780083e-07, + "loss": 14.1691, + "step": 805 + }, + { + "epoch": 0.46211456112375254, + "grad_norm": 174.0120391845703, + "learning_rate": 1.4977316669810782e-07, + "loss": 14.1824, + "step": 806 + }, + { + "epoch": 0.46268790425169765, + "grad_norm": 191.203369140625, + "learning_rate": 1.4529091286973994e-07, + "loss": 14.1955, + "step": 807 + }, + { + "epoch": 0.46326124737964275, + "grad_norm": 183.5585479736328, + "learning_rate": 1.4087576313854212e-07, + "loss": 14.2568, + "step": 808 + }, + { + "epoch": 0.46383459050758785, + "grad_norm": 183.38294982910156, + "learning_rate": 1.365277785328123e-07, + "loss": 14.1888, + "step": 809 + }, + { + "epoch": 0.46440793363553295, + "grad_norm": 178.88182067871094, + "learning_rate": 1.3224701915246053e-07, + "loss": 14.1905, + "step": 810 + }, + { + "epoch": 0.46498127676347806, + "grad_norm": 176.96397399902344, + "learning_rate": 1.280335441681796e-07, + "loss": 14.2524, + "step": 811 + }, + { + "epoch": 0.46555461989142316, + "grad_norm": 182.11790466308594, + "learning_rate": 1.2388741182062348e-07, + "loss": 14.161, + "step": 812 + }, + { + "epoch": 0.46612796301936826, + "grad_norm": 178.43495178222656, + "learning_rate": 1.198086794196035e-07, + "loss": 14.2621, + "step": 813 + }, + { + "epoch": 0.46670130614731337, + "grad_norm": 172.70196533203125, + "learning_rate": 1.1579740334330014e-07, + "loss": 14.1181, + "step": 814 + }, + { + "epoch": 0.46727464927525847, + "grad_norm": 187.98484802246094, + "learning_rate": 1.1185363903747748e-07, + "loss": 14.269, + "step": 815 + }, + { + "epoch": 0.46784799240320357, + "grad_norm": 172.6577911376953, + "learning_rate": 1.0797744101472052e-07, + "loss": 14.1737, + "step": 816 + }, + { + "epoch": 0.4684213355311487, + "grad_norm": 181.676025390625, + "learning_rate": 1.0416886285368188e-07, + "loss": 14.2495, + "step": 817 + }, + { + "epoch": 0.4689946786590938, + "grad_norm": 176.9022216796875, + "learning_rate": 1.0042795719833964e-07, + "loss": 14.1739, + "step": 818 + }, + { + "epoch": 0.4695680217870389, + "grad_norm": 174.6312255859375, + "learning_rate": 9.675477575726954e-08, + "loss": 14.2219, + "step": 819 + }, + { + "epoch": 0.470141364914984, + "grad_norm": 176.77125549316406, + "learning_rate": 9.314936930293283e-08, + "loss": 14.1415, + "step": 820 + }, + { + "epoch": 0.4707147080429291, + "grad_norm": 180.0824432373047, + "learning_rate": 8.961178767097178e-08, + "loss": 14.2163, + "step": 821 + }, + { + "epoch": 0.4712880511708742, + "grad_norm": 179.52847290039062, + "learning_rate": 8.614207975952083e-08, + "loss": 14.2163, + "step": 822 + }, + { + "epoch": 0.4718613942988193, + "grad_norm": 175.8388671875, + "learning_rate": 8.274029352853264e-08, + "loss": 14.1408, + "step": 823 + }, + { + "epoch": 0.4724347374267644, + "grad_norm": 179.85772705078125, + "learning_rate": 7.940647599911477e-08, + "loss": 14.2558, + "step": 824 + }, + { + "epoch": 0.4730080805547095, + "grad_norm": 176.0479278564453, + "learning_rate": 7.614067325287632e-08, + "loss": 14.1834, + "step": 825 + }, + { + "epoch": 0.4735814236826546, + "grad_norm": 179.95497131347656, + "learning_rate": 7.294293043129785e-08, + "loss": 14.2747, + "step": 826 + }, + { + "epoch": 0.4741547668105997, + "grad_norm": 187.21307373046875, + "learning_rate": 6.981329173509909e-08, + "loss": 14.235, + "step": 827 + }, + { + "epoch": 0.4747281099385448, + "grad_norm": 183.1041717529297, + "learning_rate": 6.675180042363505e-08, + "loss": 14.2802, + "step": 828 + }, + { + "epoch": 0.4753014530664899, + "grad_norm": 177.82183837890625, + "learning_rate": 6.375849881429418e-08, + "loss": 14.2127, + "step": 829 + }, + { + "epoch": 0.475874796194435, + "grad_norm": 185.0269775390625, + "learning_rate": 6.083342828191453e-08, + "loss": 14.1445, + "step": 830 + }, + { + "epoch": 0.4764481393223801, + "grad_norm": 184.57952880859375, + "learning_rate": 5.797662925821068e-08, + "loss": 14.2531, + "step": 831 + }, + { + "epoch": 0.4770214824503252, + "grad_norm": 184.90017700195312, + "learning_rate": 5.518814123121885e-08, + "loss": 14.1998, + "step": 832 + }, + { + "epoch": 0.4775948255782703, + "grad_norm": 178.5499267578125, + "learning_rate": 5.246800274474439e-08, + "loss": 14.1822, + "step": 833 + }, + { + "epoch": 0.4781681687062154, + "grad_norm": 176.92861938476562, + "learning_rate": 4.981625139783619e-08, + "loss": 14.1861, + "step": 834 + }, + { + "epoch": 0.4787415118341605, + "grad_norm": 176.53111267089844, + "learning_rate": 4.723292384426203e-08, + "loss": 14.1773, + "step": 835 + }, + { + "epoch": 0.47931485496210563, + "grad_norm": 198.08750915527344, + "learning_rate": 4.471805579200239e-08, + "loss": 14.3216, + "step": 836 + }, + { + "epoch": 0.47988819809005073, + "grad_norm": 180.246826171875, + "learning_rate": 4.227168200276077e-08, + "loss": 14.0681, + "step": 837 + }, + { + "epoch": 0.4804615412179958, + "grad_norm": 181.2344970703125, + "learning_rate": 3.989383629147747e-08, + "loss": 14.239, + "step": 838 + }, + { + "epoch": 0.4810348843459409, + "grad_norm": 182.63856506347656, + "learning_rate": 3.758455152586715e-08, + "loss": 14.1785, + "step": 839 + }, + { + "epoch": 0.481608227473886, + "grad_norm": 176.11099243164062, + "learning_rate": 3.534385962596143e-08, + "loss": 14.1423, + "step": 840 + }, + { + "epoch": 0.4821815706018311, + "grad_norm": 175.12725830078125, + "learning_rate": 3.3171791563669785e-08, + "loss": 14.2053, + "step": 841 + }, + { + "epoch": 0.4827549137297762, + "grad_norm": 185.15928649902344, + "learning_rate": 3.10683773623488e-08, + "loss": 14.146, + "step": 842 + }, + { + "epoch": 0.4833282568577213, + "grad_norm": 188.5362548828125, + "learning_rate": 2.9033646096390255e-08, + "loss": 14.2097, + "step": 843 + }, + { + "epoch": 0.4839015999856664, + "grad_norm": 187.72796630859375, + "learning_rate": 2.706762589081646e-08, + "loss": 14.1802, + "step": 844 + }, + { + "epoch": 0.4844749431136115, + "grad_norm": 177.17909240722656, + "learning_rate": 2.517034392089446e-08, + "loss": 14.1847, + "step": 845 + }, + { + "epoch": 0.4850482862415566, + "grad_norm": 174.41868591308594, + "learning_rate": 2.3341826411756863e-08, + "loss": 14.1541, + "step": 846 + }, + { + "epoch": 0.4856216293695017, + "grad_norm": 177.609130859375, + "learning_rate": 2.158209863804217e-08, + "loss": 14.2386, + "step": 847 + }, + { + "epoch": 0.4861949724974468, + "grad_norm": 182.3568115234375, + "learning_rate": 1.9891184923544472e-08, + "loss": 14.1531, + "step": 848 + }, + { + "epoch": 0.4867683156253919, + "grad_norm": 174.87728881835938, + "learning_rate": 1.826910864087761e-08, + "loss": 14.1399, + "step": 849 + }, + { + "epoch": 0.487341658753337, + "grad_norm": 183.8682403564453, + "learning_rate": 1.6715892211150442e-08, + "loss": 14.1189, + "step": 850 + }, + { + "epoch": 0.4879150018812821, + "grad_norm": 176.34315490722656, + "learning_rate": 1.5231557103658755e-08, + "loss": 14.2468, + "step": 851 + }, + { + "epoch": 0.4884883450092272, + "grad_norm": 177.79586791992188, + "learning_rate": 1.3816123835588835e-08, + "loss": 14.2414, + "step": 852 + }, + { + "epoch": 0.4890616881371723, + "grad_norm": 173.83486938476562, + "learning_rate": 1.2469611971731576e-08, + "loss": 14.1864, + "step": 853 + }, + { + "epoch": 0.4896350312651174, + "grad_norm": 181.15512084960938, + "learning_rate": 1.1192040124214931e-08, + "loss": 14.1471, + "step": 854 + }, + { + "epoch": 0.49020837439306253, + "grad_norm": 185.8532257080078, + "learning_rate": 9.983425952243552e-09, + "loss": 14.2145, + "step": 855 + }, + { + "epoch": 0.49078171752100763, + "grad_norm": 172.852783203125, + "learning_rate": 8.84378616185788e-09, + "loss": 14.1675, + "step": 856 + }, + { + "epoch": 0.49135506064895274, + "grad_norm": 183.1013946533203, + "learning_rate": 7.773136505700995e-09, + "loss": 14.1541, + "step": 857 + }, + { + "epoch": 0.49192840377689784, + "grad_norm": 173.04444885253906, + "learning_rate": 6.7714917828004545e-09, + "loss": 14.1, + "step": 858 + }, + { + "epoch": 0.49250174690484294, + "grad_norm": 173.59991455078125, + "learning_rate": 5.838865838366792e-09, + "loss": 14.1511, + "step": 859 + }, + { + "epoch": 0.49307509003278804, + "grad_norm": 197.82601928710938, + "learning_rate": 4.975271563599227e-09, + "loss": 14.2182, + "step": 860 + }, + { + "epoch": 0.49364843316073315, + "grad_norm": 183.84568786621094, + "learning_rate": 4.180720895508028e-09, + "loss": 14.1797, + "step": 861 + }, + { + "epoch": 0.49422177628867825, + "grad_norm": 194.36610412597656, + "learning_rate": 3.4552248167507576e-09, + "loss": 14.237, + "step": 862 + }, + { + "epoch": 0.49479511941662335, + "grad_norm": 172.23765563964844, + "learning_rate": 2.798793355478502e-09, + "loss": 14.094, + "step": 863 + }, + { + "epoch": 0.49536846254456846, + "grad_norm": 185.76551818847656, + "learning_rate": 2.2114355851993175e-09, + "loss": 14.1855, + "step": 864 + }, + { + "epoch": 0.49594180567251356, + "grad_norm": 175.8227081298828, + "learning_rate": 1.6931596246516636e-09, + "loss": 14.2448, + "step": 865 + }, + { + "epoch": 0.49651514880045866, + "grad_norm": 174.75340270996094, + "learning_rate": 1.24397263769227e-09, + "loss": 14.103, + "step": 866 + }, + { + "epoch": 0.49708849192840376, + "grad_norm": 175.39210510253906, + "learning_rate": 8.638808331973281e-10, + "loss": 14.1831, + "step": 867 + }, + { + "epoch": 0.49766183505634887, + "grad_norm": 180.33180236816406, + "learning_rate": 5.528894649758921e-10, + "loss": 14.1561, + "step": 868 + }, + { + "epoch": 0.49823517818429397, + "grad_norm": 168.7253875732422, + "learning_rate": 3.1100283169938074e-10, + "loss": 14.2352, + "step": 869 + }, + { + "epoch": 0.4988085213122391, + "grad_norm": 179.21212768554688, + "learning_rate": 1.3822427683884975e-10, + "loss": 14.2388, + "step": 870 + }, + { + "epoch": 0.4993818644401842, + "grad_norm": 181.83961486816406, + "learning_rate": 3.4556188622802964e-11, + "loss": 14.1703, + "step": 871 + }, + { + "epoch": 0.4999552075681293, + "grad_norm": 178.8787078857422, + "learning_rate": 0.0, + "loss": 14.2526, + "step": 872 + }, + { + "epoch": 0.4999552075681293, + "step": 872, + "total_flos": 7.585435033523978e+18, + "train_loss": 14.689219380737445, + "train_runtime": 70676.4546, + "train_samples_per_second": 3.948, + "train_steps_per_second": 0.012 + } + ], + "logging_steps": 1.0, + "max_steps": 872, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 7.585435033523978e+18, + "train_batch_size": 10, + "trial_name": null, + "trial_params": null +}