{ "best_metric": 1.322394609451294, "best_model_checkpoint": "output/output__lora/checkpoint-400", "epoch": 0.139640425903299, "eval_steps": 100, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00034910106475824753, "grad_norm": 2.6783504486083984, "learning_rate": 0.0, "loss": 1.5271, "step": 1 }, { "epoch": 0.0006982021295164951, "grad_norm": 1.3333820104599, "learning_rate": 8.859191006777897e-06, "loss": 1.3963, "step": 2 }, { "epoch": 0.0010473031942747426, "grad_norm": 1.2807133197784424, "learning_rate": 1.4041485532469073e-05, "loss": 1.4192, "step": 3 }, { "epoch": 0.0013964042590329901, "grad_norm": 1.1956514120101929, "learning_rate": 1.7718382013555794e-05, "loss": 1.5083, "step": 4 }, { "epoch": 0.0017455053237912376, "grad_norm": 1.2733005285263062, "learning_rate": 2.0570404496611053e-05, "loss": 1.4963, "step": 5 }, { "epoch": 0.0020946063885494853, "grad_norm": 0.8666600584983826, "learning_rate": 2.2900676539246968e-05, "loss": 1.5552, "step": 6 }, { "epoch": 0.0024437074533077328, "grad_norm": 0.7445533275604248, "learning_rate": 2.4870893478326387e-05, "loss": 1.2858, "step": 7 }, { "epoch": 0.0027928085180659802, "grad_norm": 0.8400186896324158, "learning_rate": 2.6577573020333684e-05, "loss": 1.3413, "step": 8 }, { "epoch": 0.0031419095828242277, "grad_norm": 0.8454774618148804, "learning_rate": 2.8082971064938146e-05, "loss": 1.467, "step": 9 }, { "epoch": 0.003491010647582475, "grad_norm": 0.8853550553321838, "learning_rate": 2.9429595503388953e-05, "loss": 1.4477, "step": 10 }, { "epoch": 0.0038401117123407227, "grad_norm": 1.4953877925872803, "learning_rate": 3.064776548439465e-05, "loss": 1.4012, "step": 11 }, { "epoch": 0.0041892127770989706, "grad_norm": 0.8356307148933411, "learning_rate": 3.1759867546024865e-05, "loss": 1.3855, "step": 12 }, { "epoch": 0.004538313841857218, "grad_norm": 0.7591987252235413, "learning_rate": 3.2782902272079295e-05, "loss": 1.3561, "step": 13 }, { "epoch": 0.0048874149066154655, "grad_norm": 0.9811077117919922, "learning_rate": 3.373008448510428e-05, "loss": 1.3175, "step": 14 }, { "epoch": 0.005236515971373713, "grad_norm": 0.8403587341308594, "learning_rate": 3.4611890029080124e-05, "loss": 1.341, "step": 15 }, { "epoch": 0.0055856170361319605, "grad_norm": 0.750234067440033, "learning_rate": 3.543676402711159e-05, "loss": 1.4247, "step": 16 }, { "epoch": 0.005934718100890208, "grad_norm": 0.7567417621612549, "learning_rate": 3.621161404374383e-05, "loss": 1.416, "step": 17 }, { "epoch": 0.006283819165648455, "grad_norm": 0.7126427292823792, "learning_rate": 3.694216207171603e-05, "loss": 1.4426, "step": 18 }, { "epoch": 0.006632920230406703, "grad_norm": 0.7808831930160522, "learning_rate": 3.76332012245438e-05, "loss": 1.4287, "step": 19 }, { "epoch": 0.00698202129516495, "grad_norm": 0.6165328025817871, "learning_rate": 3.8288786510166846e-05, "loss": 1.3391, "step": 20 }, { "epoch": 0.007331122359923198, "grad_norm": 0.7212307453155518, "learning_rate": 3.8912379010795455e-05, "loss": 1.3375, "step": 21 }, { "epoch": 0.007680223424681445, "grad_norm": 0.6797880530357361, "learning_rate": 3.9506956491172545e-05, "loss": 1.2713, "step": 22 }, { "epoch": 0.008029324489439693, "grad_norm": 0.7757507562637329, "learning_rate": 4.007509939970292e-05, "loss": 1.3599, "step": 23 }, { "epoch": 0.008378425554197941, "grad_norm": 0.539090096950531, "learning_rate": 4.061905855280276e-05, "loss": 1.5154, "step": 24 }, { "epoch": 0.008727526618956188, "grad_norm": 0.652180552482605, "learning_rate": 4.1140808993222106e-05, "loss": 1.3438, "step": 25 }, { "epoch": 0.009076627683714436, "grad_norm": 0.7319611310958862, "learning_rate": 4.164209327885719e-05, "loss": 1.5033, "step": 26 }, { "epoch": 0.009425728748472683, "grad_norm": 0.702570378780365, "learning_rate": 4.2124456597407214e-05, "loss": 1.2238, "step": 27 }, { "epoch": 0.009774829813230931, "grad_norm": 0.6835883855819702, "learning_rate": 4.258927549188218e-05, "loss": 1.3648, "step": 28 }, { "epoch": 0.010123930877989178, "grad_norm": 0.6773353219032288, "learning_rate": 4.303778154313212e-05, "loss": 1.3074, "step": 29 }, { "epoch": 0.010473031942747426, "grad_norm": 0.6387542486190796, "learning_rate": 4.347108103585803e-05, "loss": 1.2265, "step": 30 }, { "epoch": 0.010822133007505673, "grad_norm": 0.6249099969863892, "learning_rate": 4.389017139879164e-05, "loss": 1.3321, "step": 31 }, { "epoch": 0.011171234072263921, "grad_norm": 0.7121676802635193, "learning_rate": 4.429595503388948e-05, "loss": 1.3729, "step": 32 }, { "epoch": 0.011520335137022168, "grad_norm": 0.7367205619812012, "learning_rate": 4.468925101686371e-05, "loss": 1.3937, "step": 33 }, { "epoch": 0.011869436201780416, "grad_norm": 0.6183043718338013, "learning_rate": 4.507080505052173e-05, "loss": 1.4321, "step": 34 }, { "epoch": 0.012218537266538662, "grad_norm": 1.1439142227172852, "learning_rate": 4.544129797493744e-05, "loss": 1.3515, "step": 35 }, { "epoch": 0.01256763833129691, "grad_norm": 0.7980801463127136, "learning_rate": 4.5801353078493936e-05, "loss": 1.3929, "step": 36 }, { "epoch": 0.012916739396055157, "grad_norm": 0.8890343904495239, "learning_rate": 4.615154240700883e-05, "loss": 1.2895, "step": 37 }, { "epoch": 0.013265840460813406, "grad_norm": 0.7107703685760498, "learning_rate": 4.6492392231321696e-05, "loss": 1.3054, "step": 38 }, { "epoch": 0.013614941525571652, "grad_norm": 0.605403482913971, "learning_rate": 4.682438780454837e-05, "loss": 1.3817, "step": 39 }, { "epoch": 0.0139640425903299, "grad_norm": 0.6489142775535583, "learning_rate": 4.714797751694474e-05, "loss": 1.4109, "step": 40 }, { "epoch": 0.014313143655088147, "grad_norm": 0.5896831750869751, "learning_rate": 4.7463576537657414e-05, "loss": 1.3383, "step": 41 }, { "epoch": 0.014662244719846396, "grad_norm": 0.8319935202598572, "learning_rate": 4.777157001757336e-05, "loss": 1.4239, "step": 42 }, { "epoch": 0.015011345784604642, "grad_norm": 0.6128418445587158, "learning_rate": 4.8072315915252694e-05, "loss": 1.3541, "step": 43 }, { "epoch": 0.01536044684936289, "grad_norm": 0.6820589900016785, "learning_rate": 4.8366147497950435e-05, "loss": 1.2663, "step": 44 }, { "epoch": 0.015709547914121137, "grad_norm": 0.8375743627548218, "learning_rate": 4.8653375561549195e-05, "loss": 1.3803, "step": 45 }, { "epoch": 0.016058648978879386, "grad_norm": 0.6585806608200073, "learning_rate": 4.8934290406480814e-05, "loss": 1.3143, "step": 46 }, { "epoch": 0.016407750043637634, "grad_norm": 0.7528412342071533, "learning_rate": 4.920916360113129e-05, "loss": 1.293, "step": 47 }, { "epoch": 0.016756851108395882, "grad_norm": 0.6918306946754456, "learning_rate": 4.947824955958066e-05, "loss": 1.4991, "step": 48 }, { "epoch": 0.017105952173154127, "grad_norm": 0.6764557361602783, "learning_rate": 4.9741786956652774e-05, "loss": 1.2755, "step": 49 }, { "epoch": 0.017455053237912375, "grad_norm": 0.6525936722755432, "learning_rate": 5e-05, "loss": 1.3897, "step": 50 }, { "epoch": 0.017804154302670624, "grad_norm": 0.627804160118103, "learning_rate": 5e-05, "loss": 1.3027, "step": 51 }, { "epoch": 0.018153255367428872, "grad_norm": 0.8060218095779419, "learning_rate": 5e-05, "loss": 1.3477, "step": 52 }, { "epoch": 0.018502356432187117, "grad_norm": 0.6655098795890808, "learning_rate": 5e-05, "loss": 1.3631, "step": 53 }, { "epoch": 0.018851457496945365, "grad_norm": 0.7165637016296387, "learning_rate": 5e-05, "loss": 1.347, "step": 54 }, { "epoch": 0.019200558561703614, "grad_norm": 0.6562020778656006, "learning_rate": 5e-05, "loss": 1.3535, "step": 55 }, { "epoch": 0.019549659626461862, "grad_norm": 0.7588657736778259, "learning_rate": 5e-05, "loss": 1.3291, "step": 56 }, { "epoch": 0.019898760691220107, "grad_norm": 0.6295105814933777, "learning_rate": 5e-05, "loss": 1.3542, "step": 57 }, { "epoch": 0.020247861755978355, "grad_norm": 1.339097023010254, "learning_rate": 5e-05, "loss": 1.3649, "step": 58 }, { "epoch": 0.020596962820736604, "grad_norm": 0.6976660490036011, "learning_rate": 5e-05, "loss": 1.2852, "step": 59 }, { "epoch": 0.020946063885494852, "grad_norm": 0.7590420246124268, "learning_rate": 5e-05, "loss": 1.354, "step": 60 }, { "epoch": 0.021295164950253097, "grad_norm": 0.6279817819595337, "learning_rate": 5e-05, "loss": 1.2537, "step": 61 }, { "epoch": 0.021644266015011345, "grad_norm": 0.6099221110343933, "learning_rate": 5e-05, "loss": 1.2423, "step": 62 }, { "epoch": 0.021993367079769593, "grad_norm": 0.6252647638320923, "learning_rate": 5e-05, "loss": 1.3667, "step": 63 }, { "epoch": 0.022342468144527842, "grad_norm": 0.8939846158027649, "learning_rate": 5e-05, "loss": 1.2889, "step": 64 }, { "epoch": 0.022691569209286087, "grad_norm": 0.85840904712677, "learning_rate": 5e-05, "loss": 1.3747, "step": 65 }, { "epoch": 0.023040670274044335, "grad_norm": 0.8478113412857056, "learning_rate": 5e-05, "loss": 1.3417, "step": 66 }, { "epoch": 0.023389771338802583, "grad_norm": 0.6869573593139648, "learning_rate": 5e-05, "loss": 1.4033, "step": 67 }, { "epoch": 0.02373887240356083, "grad_norm": 0.6566379070281982, "learning_rate": 5e-05, "loss": 1.3617, "step": 68 }, { "epoch": 0.02408797346831908, "grad_norm": 0.6871697306632996, "learning_rate": 5e-05, "loss": 1.2932, "step": 69 }, { "epoch": 0.024437074533077325, "grad_norm": 0.7102701663970947, "learning_rate": 5e-05, "loss": 1.4062, "step": 70 }, { "epoch": 0.024786175597835573, "grad_norm": 0.8392966985702515, "learning_rate": 5e-05, "loss": 1.1992, "step": 71 }, { "epoch": 0.02513527666259382, "grad_norm": 0.670971155166626, "learning_rate": 5e-05, "loss": 1.4131, "step": 72 }, { "epoch": 0.02548437772735207, "grad_norm": 0.7271628975868225, "learning_rate": 5e-05, "loss": 1.2928, "step": 73 }, { "epoch": 0.025833478792110315, "grad_norm": 0.7184221744537354, "learning_rate": 5e-05, "loss": 1.2239, "step": 74 }, { "epoch": 0.026182579856868563, "grad_norm": 0.5685485005378723, "learning_rate": 5e-05, "loss": 1.2692, "step": 75 }, { "epoch": 0.02653168092162681, "grad_norm": 0.5677881836891174, "learning_rate": 5e-05, "loss": 1.2951, "step": 76 }, { "epoch": 0.02688078198638506, "grad_norm": 0.6896436810493469, "learning_rate": 5e-05, "loss": 1.3297, "step": 77 }, { "epoch": 0.027229883051143305, "grad_norm": 0.6284964084625244, "learning_rate": 5e-05, "loss": 1.2402, "step": 78 }, { "epoch": 0.027578984115901553, "grad_norm": 0.618015468120575, "learning_rate": 5e-05, "loss": 1.2999, "step": 79 }, { "epoch": 0.0279280851806598, "grad_norm": 0.7585094571113586, "learning_rate": 5e-05, "loss": 1.3378, "step": 80 }, { "epoch": 0.02827718624541805, "grad_norm": 0.6674929857254028, "learning_rate": 5e-05, "loss": 1.3585, "step": 81 }, { "epoch": 0.028626287310176295, "grad_norm": 0.583121120929718, "learning_rate": 5e-05, "loss": 1.3236, "step": 82 }, { "epoch": 0.028975388374934543, "grad_norm": 0.661668062210083, "learning_rate": 5e-05, "loss": 1.3264, "step": 83 }, { "epoch": 0.02932448943969279, "grad_norm": 0.8168457746505737, "learning_rate": 5e-05, "loss": 1.3132, "step": 84 }, { "epoch": 0.02967359050445104, "grad_norm": 0.6123843193054199, "learning_rate": 5e-05, "loss": 1.3224, "step": 85 }, { "epoch": 0.030022691569209285, "grad_norm": 0.7081793546676636, "learning_rate": 5e-05, "loss": 1.3641, "step": 86 }, { "epoch": 0.030371792633967533, "grad_norm": 0.7772612571716309, "learning_rate": 5e-05, "loss": 1.3634, "step": 87 }, { "epoch": 0.03072089369872578, "grad_norm": 0.603370726108551, "learning_rate": 5e-05, "loss": 1.4486, "step": 88 }, { "epoch": 0.03106999476348403, "grad_norm": 0.6567598581314087, "learning_rate": 5e-05, "loss": 1.4228, "step": 89 }, { "epoch": 0.031419095828242274, "grad_norm": 0.6245101690292358, "learning_rate": 5e-05, "loss": 1.2928, "step": 90 }, { "epoch": 0.031768196893000526, "grad_norm": 0.7198782563209534, "learning_rate": 5e-05, "loss": 1.3304, "step": 91 }, { "epoch": 0.03211729795775877, "grad_norm": 0.526452898979187, "learning_rate": 5e-05, "loss": 1.3418, "step": 92 }, { "epoch": 0.032466399022517016, "grad_norm": 0.7534317374229431, "learning_rate": 5e-05, "loss": 1.333, "step": 93 }, { "epoch": 0.03281550008727527, "grad_norm": 0.5721869468688965, "learning_rate": 5e-05, "loss": 1.1849, "step": 94 }, { "epoch": 0.03316460115203351, "grad_norm": 0.6943261027336121, "learning_rate": 5e-05, "loss": 1.3263, "step": 95 }, { "epoch": 0.033513702216791764, "grad_norm": 0.5904171466827393, "learning_rate": 5e-05, "loss": 1.3103, "step": 96 }, { "epoch": 0.03386280328155001, "grad_norm": 0.7743117809295654, "learning_rate": 5e-05, "loss": 1.3633, "step": 97 }, { "epoch": 0.034211904346308254, "grad_norm": 1.298839807510376, "learning_rate": 5e-05, "loss": 1.335, "step": 98 }, { "epoch": 0.034561005411066506, "grad_norm": 0.7134571671485901, "learning_rate": 5e-05, "loss": 1.4154, "step": 99 }, { "epoch": 0.03491010647582475, "grad_norm": 0.6801385879516602, "learning_rate": 5e-05, "loss": 1.3412, "step": 100 }, { "epoch": 0.03491010647582475, "eval_loss": 1.337953805923462, "eval_runtime": 3305.6905, "eval_samples_per_second": 6.932, "eval_steps_per_second": 0.867, "step": 100 }, { "epoch": 0.035259207540582996, "grad_norm": 1.0192288160324097, "learning_rate": 5e-05, "loss": 1.2821, "step": 101 }, { "epoch": 0.03560830860534125, "grad_norm": 0.6322550773620605, "learning_rate": 5e-05, "loss": 1.3561, "step": 102 }, { "epoch": 0.03595740967009949, "grad_norm": 0.6499407291412354, "learning_rate": 5e-05, "loss": 1.3164, "step": 103 }, { "epoch": 0.036306510734857744, "grad_norm": 0.7576645612716675, "learning_rate": 5e-05, "loss": 1.2924, "step": 104 }, { "epoch": 0.03665561179961599, "grad_norm": 0.6215568780899048, "learning_rate": 5e-05, "loss": 1.2551, "step": 105 }, { "epoch": 0.037004712864374234, "grad_norm": 0.6197790503501892, "learning_rate": 5e-05, "loss": 1.317, "step": 106 }, { "epoch": 0.037353813929132486, "grad_norm": 0.677772045135498, "learning_rate": 5e-05, "loss": 1.428, "step": 107 }, { "epoch": 0.03770291499389073, "grad_norm": 0.6386198401451111, "learning_rate": 5e-05, "loss": 1.4206, "step": 108 }, { "epoch": 0.038052016058648976, "grad_norm": 1.113053798675537, "learning_rate": 5e-05, "loss": 1.3992, "step": 109 }, { "epoch": 0.03840111712340723, "grad_norm": 0.668409526348114, "learning_rate": 5e-05, "loss": 1.3358, "step": 110 }, { "epoch": 0.03875021818816547, "grad_norm": 0.6381022930145264, "learning_rate": 5e-05, "loss": 1.245, "step": 111 }, { "epoch": 0.039099319252923724, "grad_norm": 0.7082274556159973, "learning_rate": 5e-05, "loss": 1.3107, "step": 112 }, { "epoch": 0.03944842031768197, "grad_norm": 0.6497403979301453, "learning_rate": 5e-05, "loss": 1.3174, "step": 113 }, { "epoch": 0.039797521382440214, "grad_norm": 0.7390655279159546, "learning_rate": 5e-05, "loss": 1.2791, "step": 114 }, { "epoch": 0.040146622447198466, "grad_norm": 0.6828505992889404, "learning_rate": 5e-05, "loss": 1.3903, "step": 115 }, { "epoch": 0.04049572351195671, "grad_norm": 0.6913119554519653, "learning_rate": 5e-05, "loss": 1.3147, "step": 116 }, { "epoch": 0.04084482457671496, "grad_norm": 0.6394439339637756, "learning_rate": 5e-05, "loss": 1.3308, "step": 117 }, { "epoch": 0.04119392564147321, "grad_norm": 0.6368663907051086, "learning_rate": 5e-05, "loss": 1.3021, "step": 118 }, { "epoch": 0.04154302670623145, "grad_norm": 0.625417947769165, "learning_rate": 5e-05, "loss": 1.4122, "step": 119 }, { "epoch": 0.041892127770989704, "grad_norm": 0.5640509724617004, "learning_rate": 5e-05, "loss": 1.3216, "step": 120 }, { "epoch": 0.04224122883574795, "grad_norm": 0.6355682611465454, "learning_rate": 5e-05, "loss": 1.2522, "step": 121 }, { "epoch": 0.042590329900506194, "grad_norm": 2.130183696746826, "learning_rate": 5e-05, "loss": 1.398, "step": 122 }, { "epoch": 0.042939430965264445, "grad_norm": 0.7858290672302246, "learning_rate": 5e-05, "loss": 1.3543, "step": 123 }, { "epoch": 0.04328853203002269, "grad_norm": 0.6912608742713928, "learning_rate": 5e-05, "loss": 1.3338, "step": 124 }, { "epoch": 0.04363763309478094, "grad_norm": 0.6326834559440613, "learning_rate": 5e-05, "loss": 1.2968, "step": 125 }, { "epoch": 0.04398673415953919, "grad_norm": 0.6076151728630066, "learning_rate": 5e-05, "loss": 1.2705, "step": 126 }, { "epoch": 0.04433583522429743, "grad_norm": 0.767652153968811, "learning_rate": 5e-05, "loss": 1.3601, "step": 127 }, { "epoch": 0.044684936289055684, "grad_norm": 0.621769905090332, "learning_rate": 5e-05, "loss": 1.2834, "step": 128 }, { "epoch": 0.04503403735381393, "grad_norm": 0.6216384768486023, "learning_rate": 5e-05, "loss": 1.3322, "step": 129 }, { "epoch": 0.04538313841857217, "grad_norm": 0.626325249671936, "learning_rate": 5e-05, "loss": 1.4601, "step": 130 }, { "epoch": 0.045732239483330425, "grad_norm": 0.8063498735427856, "learning_rate": 5e-05, "loss": 1.293, "step": 131 }, { "epoch": 0.04608134054808867, "grad_norm": 1.117038369178772, "learning_rate": 5e-05, "loss": 1.3635, "step": 132 }, { "epoch": 0.04643044161284692, "grad_norm": 1.4540647268295288, "learning_rate": 5e-05, "loss": 1.3346, "step": 133 }, { "epoch": 0.04677954267760517, "grad_norm": 0.6695774793624878, "learning_rate": 5e-05, "loss": 1.4109, "step": 134 }, { "epoch": 0.04712864374236341, "grad_norm": 0.8146533370018005, "learning_rate": 5e-05, "loss": 1.3515, "step": 135 }, { "epoch": 0.04747774480712166, "grad_norm": 0.6705998778343201, "learning_rate": 5e-05, "loss": 1.2752, "step": 136 }, { "epoch": 0.04782684587187991, "grad_norm": 0.7589219808578491, "learning_rate": 5e-05, "loss": 1.4393, "step": 137 }, { "epoch": 0.04817594693663816, "grad_norm": 0.9603825807571411, "learning_rate": 5e-05, "loss": 1.4609, "step": 138 }, { "epoch": 0.048525048001396405, "grad_norm": 0.6351510286331177, "learning_rate": 5e-05, "loss": 1.371, "step": 139 }, { "epoch": 0.04887414906615465, "grad_norm": 0.5652881860733032, "learning_rate": 5e-05, "loss": 1.2845, "step": 140 }, { "epoch": 0.0492232501309129, "grad_norm": 0.7579118609428406, "learning_rate": 5e-05, "loss": 1.2526, "step": 141 }, { "epoch": 0.04957235119567115, "grad_norm": 0.7851598262786865, "learning_rate": 5e-05, "loss": 1.3379, "step": 142 }, { "epoch": 0.04992145226042939, "grad_norm": 0.5865357518196106, "learning_rate": 5e-05, "loss": 1.4802, "step": 143 }, { "epoch": 0.05027055332518764, "grad_norm": 1.3862611055374146, "learning_rate": 5e-05, "loss": 1.357, "step": 144 }, { "epoch": 0.05061965438994589, "grad_norm": 0.6249399185180664, "learning_rate": 5e-05, "loss": 1.2587, "step": 145 }, { "epoch": 0.05096875545470414, "grad_norm": 0.5966644883155823, "learning_rate": 5e-05, "loss": 1.3534, "step": 146 }, { "epoch": 0.051317856519462385, "grad_norm": 0.6312971711158752, "learning_rate": 5e-05, "loss": 1.1815, "step": 147 }, { "epoch": 0.05166695758422063, "grad_norm": 0.6539703011512756, "learning_rate": 5e-05, "loss": 1.3946, "step": 148 }, { "epoch": 0.05201605864897888, "grad_norm": 0.8756076097488403, "learning_rate": 5e-05, "loss": 1.2384, "step": 149 }, { "epoch": 0.052365159713737126, "grad_norm": 0.7149311304092407, "learning_rate": 5e-05, "loss": 1.2998, "step": 150 }, { "epoch": 0.05271426077849537, "grad_norm": 0.79525226354599, "learning_rate": 5e-05, "loss": 1.3376, "step": 151 }, { "epoch": 0.05306336184325362, "grad_norm": 0.6921191811561584, "learning_rate": 5e-05, "loss": 1.3461, "step": 152 }, { "epoch": 0.05341246290801187, "grad_norm": 0.7444896697998047, "learning_rate": 5e-05, "loss": 1.4089, "step": 153 }, { "epoch": 0.05376156397277012, "grad_norm": 0.6216670274734497, "learning_rate": 5e-05, "loss": 1.3402, "step": 154 }, { "epoch": 0.054110665037528365, "grad_norm": 0.5917710661888123, "learning_rate": 5e-05, "loss": 1.3253, "step": 155 }, { "epoch": 0.05445976610228661, "grad_norm": 0.8648408055305481, "learning_rate": 5e-05, "loss": 1.4447, "step": 156 }, { "epoch": 0.05480886716704486, "grad_norm": 0.6752570271492004, "learning_rate": 5e-05, "loss": 1.3097, "step": 157 }, { "epoch": 0.055157968231803106, "grad_norm": 0.5603750944137573, "learning_rate": 5e-05, "loss": 1.4177, "step": 158 }, { "epoch": 0.05550706929656136, "grad_norm": 0.6317929029464722, "learning_rate": 5e-05, "loss": 1.3509, "step": 159 }, { "epoch": 0.0558561703613196, "grad_norm": 0.6017687320709229, "learning_rate": 5e-05, "loss": 1.3471, "step": 160 }, { "epoch": 0.05620527142607785, "grad_norm": 0.6761009693145752, "learning_rate": 5e-05, "loss": 1.4473, "step": 161 }, { "epoch": 0.0565543724908361, "grad_norm": 0.7266319990158081, "learning_rate": 5e-05, "loss": 1.2896, "step": 162 }, { "epoch": 0.056903473555594344, "grad_norm": 0.6436321139335632, "learning_rate": 5e-05, "loss": 1.2812, "step": 163 }, { "epoch": 0.05725257462035259, "grad_norm": 0.9664864540100098, "learning_rate": 5e-05, "loss": 1.294, "step": 164 }, { "epoch": 0.05760167568511084, "grad_norm": 0.6690096855163574, "learning_rate": 5e-05, "loss": 1.2801, "step": 165 }, { "epoch": 0.057950776749869086, "grad_norm": 0.6227753162384033, "learning_rate": 5e-05, "loss": 1.3384, "step": 166 }, { "epoch": 0.05829987781462734, "grad_norm": 0.7900117039680481, "learning_rate": 5e-05, "loss": 1.3424, "step": 167 }, { "epoch": 0.05864897887938558, "grad_norm": 0.6928064823150635, "learning_rate": 5e-05, "loss": 1.296, "step": 168 }, { "epoch": 0.05899807994414383, "grad_norm": 0.8754634261131287, "learning_rate": 5e-05, "loss": 1.4471, "step": 169 }, { "epoch": 0.05934718100890208, "grad_norm": 0.5537067651748657, "learning_rate": 5e-05, "loss": 1.2825, "step": 170 }, { "epoch": 0.059696282073660324, "grad_norm": 0.6705783009529114, "learning_rate": 5e-05, "loss": 1.3768, "step": 171 }, { "epoch": 0.06004538313841857, "grad_norm": 0.5732744932174683, "learning_rate": 5e-05, "loss": 1.3309, "step": 172 }, { "epoch": 0.06039448420317682, "grad_norm": 1.120721459388733, "learning_rate": 5e-05, "loss": 1.3702, "step": 173 }, { "epoch": 0.060743585267935066, "grad_norm": 0.7755718231201172, "learning_rate": 5e-05, "loss": 1.3425, "step": 174 }, { "epoch": 0.06109268633269332, "grad_norm": 0.5984740257263184, "learning_rate": 5e-05, "loss": 1.4886, "step": 175 }, { "epoch": 0.06144178739745156, "grad_norm": 0.7374542951583862, "learning_rate": 5e-05, "loss": 1.3667, "step": 176 }, { "epoch": 0.06179088846220981, "grad_norm": 0.5558515787124634, "learning_rate": 5e-05, "loss": 1.3737, "step": 177 }, { "epoch": 0.06213998952696806, "grad_norm": 0.700268566608429, "learning_rate": 5e-05, "loss": 1.364, "step": 178 }, { "epoch": 0.062489090591726304, "grad_norm": 0.5781232118606567, "learning_rate": 5e-05, "loss": 1.3443, "step": 179 }, { "epoch": 0.06283819165648455, "grad_norm": 0.7157448530197144, "learning_rate": 5e-05, "loss": 1.3702, "step": 180 }, { "epoch": 0.0631872927212428, "grad_norm": 0.5329631567001343, "learning_rate": 5e-05, "loss": 1.1786, "step": 181 }, { "epoch": 0.06353639378600105, "grad_norm": 0.5949011445045471, "learning_rate": 5e-05, "loss": 1.3809, "step": 182 }, { "epoch": 0.0638854948507593, "grad_norm": 0.6756107807159424, "learning_rate": 5e-05, "loss": 1.2792, "step": 183 }, { "epoch": 0.06423459591551754, "grad_norm": 0.7747790813446045, "learning_rate": 5e-05, "loss": 1.3714, "step": 184 }, { "epoch": 0.06458369698027579, "grad_norm": 1.1907461881637573, "learning_rate": 5e-05, "loss": 1.3055, "step": 185 }, { "epoch": 0.06493279804503403, "grad_norm": 0.5747818946838379, "learning_rate": 5e-05, "loss": 1.2003, "step": 186 }, { "epoch": 0.06528189910979229, "grad_norm": 0.614464521408081, "learning_rate": 5e-05, "loss": 1.3108, "step": 187 }, { "epoch": 0.06563100017455054, "grad_norm": 0.6040724515914917, "learning_rate": 5e-05, "loss": 1.2371, "step": 188 }, { "epoch": 0.06598010123930878, "grad_norm": 0.6369174122810364, "learning_rate": 5e-05, "loss": 1.1662, "step": 189 }, { "epoch": 0.06632920230406703, "grad_norm": 0.6132228374481201, "learning_rate": 5e-05, "loss": 1.3257, "step": 190 }, { "epoch": 0.06667830336882527, "grad_norm": 0.6686124801635742, "learning_rate": 5e-05, "loss": 1.3757, "step": 191 }, { "epoch": 0.06702740443358353, "grad_norm": 0.6709855794906616, "learning_rate": 5e-05, "loss": 1.3341, "step": 192 }, { "epoch": 0.06737650549834177, "grad_norm": 0.5295905470848083, "learning_rate": 5e-05, "loss": 1.2587, "step": 193 }, { "epoch": 0.06772560656310002, "grad_norm": 0.6111523509025574, "learning_rate": 5e-05, "loss": 1.3365, "step": 194 }, { "epoch": 0.06807470762785826, "grad_norm": 0.5655878782272339, "learning_rate": 5e-05, "loss": 1.3265, "step": 195 }, { "epoch": 0.06842380869261651, "grad_norm": 0.6125257015228271, "learning_rate": 5e-05, "loss": 1.3475, "step": 196 }, { "epoch": 0.06877290975737475, "grad_norm": 0.6268573999404907, "learning_rate": 5e-05, "loss": 1.3002, "step": 197 }, { "epoch": 0.06912201082213301, "grad_norm": 0.7267619967460632, "learning_rate": 5e-05, "loss": 1.4104, "step": 198 }, { "epoch": 0.06947111188689126, "grad_norm": 0.5741710066795349, "learning_rate": 5e-05, "loss": 1.318, "step": 199 }, { "epoch": 0.0698202129516495, "grad_norm": 0.6447280049324036, "learning_rate": 5e-05, "loss": 1.3477, "step": 200 }, { "epoch": 0.0698202129516495, "eval_loss": 1.3300124406814575, "eval_runtime": 3301.7334, "eval_samples_per_second": 6.941, "eval_steps_per_second": 0.868, "step": 200 }, { "epoch": 0.07016931401640775, "grad_norm": 1.4164685010910034, "learning_rate": 5e-05, "loss": 1.4048, "step": 201 }, { "epoch": 0.07051841508116599, "grad_norm": 0.5867809057235718, "learning_rate": 5e-05, "loss": 1.4018, "step": 202 }, { "epoch": 0.07086751614592425, "grad_norm": 0.6882596611976624, "learning_rate": 5e-05, "loss": 1.2737, "step": 203 }, { "epoch": 0.0712166172106825, "grad_norm": 0.6038634181022644, "learning_rate": 5e-05, "loss": 1.2399, "step": 204 }, { "epoch": 0.07156571827544074, "grad_norm": 0.6428863406181335, "learning_rate": 5e-05, "loss": 1.3729, "step": 205 }, { "epoch": 0.07191481934019898, "grad_norm": 0.7008076906204224, "learning_rate": 5e-05, "loss": 1.3353, "step": 206 }, { "epoch": 0.07226392040495723, "grad_norm": 0.6662419438362122, "learning_rate": 5e-05, "loss": 1.3442, "step": 207 }, { "epoch": 0.07261302146971549, "grad_norm": 0.7249788045883179, "learning_rate": 5e-05, "loss": 1.2526, "step": 208 }, { "epoch": 0.07296212253447373, "grad_norm": 0.6323925852775574, "learning_rate": 5e-05, "loss": 1.2929, "step": 209 }, { "epoch": 0.07331122359923198, "grad_norm": 0.8273724317550659, "learning_rate": 5e-05, "loss": 1.5291, "step": 210 }, { "epoch": 0.07366032466399022, "grad_norm": 0.8445104956626892, "learning_rate": 5e-05, "loss": 1.2417, "step": 211 }, { "epoch": 0.07400942572874847, "grad_norm": 0.6157236695289612, "learning_rate": 5e-05, "loss": 1.3739, "step": 212 }, { "epoch": 0.07435852679350673, "grad_norm": 0.6917769312858582, "learning_rate": 5e-05, "loss": 1.3078, "step": 213 }, { "epoch": 0.07470762785826497, "grad_norm": 0.7838917970657349, "learning_rate": 5e-05, "loss": 1.3086, "step": 214 }, { "epoch": 0.07505672892302322, "grad_norm": 0.6962039470672607, "learning_rate": 5e-05, "loss": 1.3907, "step": 215 }, { "epoch": 0.07540582998778146, "grad_norm": 0.6962039470672607, "learning_rate": 5e-05, "loss": 1.3615, "step": 216 }, { "epoch": 0.0757549310525397, "grad_norm": 0.6687365770339966, "learning_rate": 5e-05, "loss": 1.3408, "step": 217 }, { "epoch": 0.07610403211729795, "grad_norm": 0.5566404461860657, "learning_rate": 5e-05, "loss": 1.2872, "step": 218 }, { "epoch": 0.07645313318205621, "grad_norm": 0.6419705748558044, "learning_rate": 5e-05, "loss": 1.2883, "step": 219 }, { "epoch": 0.07680223424681445, "grad_norm": 0.7758398652076721, "learning_rate": 5e-05, "loss": 1.3832, "step": 220 }, { "epoch": 0.0771513353115727, "grad_norm": 0.9763804078102112, "learning_rate": 5e-05, "loss": 1.3414, "step": 221 }, { "epoch": 0.07750043637633094, "grad_norm": 0.8815904259681702, "learning_rate": 5e-05, "loss": 1.3297, "step": 222 }, { "epoch": 0.07784953744108919, "grad_norm": 0.590263307094574, "learning_rate": 5e-05, "loss": 1.3401, "step": 223 }, { "epoch": 0.07819863850584745, "grad_norm": 0.677057147026062, "learning_rate": 5e-05, "loss": 1.2449, "step": 224 }, { "epoch": 0.07854773957060569, "grad_norm": 1.5185271501541138, "learning_rate": 5e-05, "loss": 1.3127, "step": 225 }, { "epoch": 0.07889684063536394, "grad_norm": 0.5751495957374573, "learning_rate": 5e-05, "loss": 1.1587, "step": 226 }, { "epoch": 0.07924594170012218, "grad_norm": 0.8122138977050781, "learning_rate": 5e-05, "loss": 1.2316, "step": 227 }, { "epoch": 0.07959504276488043, "grad_norm": 0.6675130724906921, "learning_rate": 5e-05, "loss": 1.3539, "step": 228 }, { "epoch": 0.07994414382963869, "grad_norm": 0.8163532614707947, "learning_rate": 5e-05, "loss": 1.328, "step": 229 }, { "epoch": 0.08029324489439693, "grad_norm": 0.8377723693847656, "learning_rate": 5e-05, "loss": 1.353, "step": 230 }, { "epoch": 0.08064234595915518, "grad_norm": 0.7325611710548401, "learning_rate": 5e-05, "loss": 1.3396, "step": 231 }, { "epoch": 0.08099144702391342, "grad_norm": 0.8941824436187744, "learning_rate": 5e-05, "loss": 1.2906, "step": 232 }, { "epoch": 0.08134054808867167, "grad_norm": 0.6284440159797668, "learning_rate": 5e-05, "loss": 1.4264, "step": 233 }, { "epoch": 0.08168964915342992, "grad_norm": 0.689984917640686, "learning_rate": 5e-05, "loss": 1.3696, "step": 234 }, { "epoch": 0.08203875021818817, "grad_norm": 0.5813177227973938, "learning_rate": 5e-05, "loss": 1.2931, "step": 235 }, { "epoch": 0.08238785128294641, "grad_norm": 0.5287997126579285, "learning_rate": 5e-05, "loss": 1.3264, "step": 236 }, { "epoch": 0.08273695234770466, "grad_norm": 0.7944268584251404, "learning_rate": 5e-05, "loss": 1.2708, "step": 237 }, { "epoch": 0.0830860534124629, "grad_norm": 0.534864068031311, "learning_rate": 5e-05, "loss": 1.2535, "step": 238 }, { "epoch": 0.08343515447722115, "grad_norm": 0.6260988712310791, "learning_rate": 5e-05, "loss": 1.2757, "step": 239 }, { "epoch": 0.08378425554197941, "grad_norm": 0.579078197479248, "learning_rate": 5e-05, "loss": 1.2906, "step": 240 }, { "epoch": 0.08413335660673765, "grad_norm": 0.5578561425209045, "learning_rate": 5e-05, "loss": 1.289, "step": 241 }, { "epoch": 0.0844824576714959, "grad_norm": 0.626961350440979, "learning_rate": 5e-05, "loss": 1.2807, "step": 242 }, { "epoch": 0.08483155873625414, "grad_norm": 0.782669186592102, "learning_rate": 5e-05, "loss": 1.3933, "step": 243 }, { "epoch": 0.08518065980101239, "grad_norm": 0.6670363545417786, "learning_rate": 5e-05, "loss": 1.2732, "step": 244 }, { "epoch": 0.08552976086577065, "grad_norm": 0.7201350331306458, "learning_rate": 5e-05, "loss": 1.2962, "step": 245 }, { "epoch": 0.08587886193052889, "grad_norm": 0.6021212339401245, "learning_rate": 5e-05, "loss": 1.35, "step": 246 }, { "epoch": 0.08622796299528714, "grad_norm": 0.8081540465354919, "learning_rate": 5e-05, "loss": 1.3568, "step": 247 }, { "epoch": 0.08657706406004538, "grad_norm": 0.5358250737190247, "learning_rate": 5e-05, "loss": 1.4603, "step": 248 }, { "epoch": 0.08692616512480363, "grad_norm": 0.6927733421325684, "learning_rate": 5e-05, "loss": 1.2506, "step": 249 }, { "epoch": 0.08727526618956188, "grad_norm": 0.6187159419059753, "learning_rate": 5e-05, "loss": 1.3497, "step": 250 }, { "epoch": 0.08762436725432013, "grad_norm": 0.6304159760475159, "learning_rate": 5e-05, "loss": 1.3087, "step": 251 }, { "epoch": 0.08797346831907837, "grad_norm": 0.6446660161018372, "learning_rate": 5e-05, "loss": 1.3424, "step": 252 }, { "epoch": 0.08832256938383662, "grad_norm": 0.6535473465919495, "learning_rate": 5e-05, "loss": 1.3471, "step": 253 }, { "epoch": 0.08867167044859486, "grad_norm": 0.601290225982666, "learning_rate": 5e-05, "loss": 1.3557, "step": 254 }, { "epoch": 0.08902077151335312, "grad_norm": 0.641854465007782, "learning_rate": 5e-05, "loss": 1.3138, "step": 255 }, { "epoch": 0.08936987257811137, "grad_norm": 0.5452507138252258, "learning_rate": 5e-05, "loss": 1.2898, "step": 256 }, { "epoch": 0.08971897364286961, "grad_norm": 0.5870373249053955, "learning_rate": 5e-05, "loss": 1.2953, "step": 257 }, { "epoch": 0.09006807470762786, "grad_norm": 0.5798627734184265, "learning_rate": 5e-05, "loss": 1.2973, "step": 258 }, { "epoch": 0.0904171757723861, "grad_norm": 0.5798627734184265, "learning_rate": 5e-05, "loss": 1.3628, "step": 259 }, { "epoch": 0.09076627683714435, "grad_norm": 0.7382280230522156, "learning_rate": 5e-05, "loss": 1.3111, "step": 260 }, { "epoch": 0.0911153779019026, "grad_norm": 0.6882988810539246, "learning_rate": 5e-05, "loss": 1.329, "step": 261 }, { "epoch": 0.09146447896666085, "grad_norm": 0.6590788960456848, "learning_rate": 5e-05, "loss": 1.3089, "step": 262 }, { "epoch": 0.0918135800314191, "grad_norm": 0.682006299495697, "learning_rate": 5e-05, "loss": 1.344, "step": 263 }, { "epoch": 0.09216268109617734, "grad_norm": 0.6040222644805908, "learning_rate": 5e-05, "loss": 1.3919, "step": 264 }, { "epoch": 0.09251178216093559, "grad_norm": 0.5964936017990112, "learning_rate": 5e-05, "loss": 1.3397, "step": 265 }, { "epoch": 0.09286088322569384, "grad_norm": 0.5645217299461365, "learning_rate": 5e-05, "loss": 1.3488, "step": 266 }, { "epoch": 0.09320998429045209, "grad_norm": 0.7771989703178406, "learning_rate": 5e-05, "loss": 1.3485, "step": 267 }, { "epoch": 0.09355908535521033, "grad_norm": 0.6003885865211487, "learning_rate": 5e-05, "loss": 1.3109, "step": 268 }, { "epoch": 0.09390818641996858, "grad_norm": 0.5627903938293457, "learning_rate": 5e-05, "loss": 1.2906, "step": 269 }, { "epoch": 0.09425728748472682, "grad_norm": 0.6381875276565552, "learning_rate": 5e-05, "loss": 1.3063, "step": 270 }, { "epoch": 0.09460638854948508, "grad_norm": 1.2558772563934326, "learning_rate": 5e-05, "loss": 1.2985, "step": 271 }, { "epoch": 0.09495548961424333, "grad_norm": 0.6977007389068604, "learning_rate": 5e-05, "loss": 1.4955, "step": 272 }, { "epoch": 0.09530459067900157, "grad_norm": 0.7846536040306091, "learning_rate": 5e-05, "loss": 1.4439, "step": 273 }, { "epoch": 0.09565369174375982, "grad_norm": 0.7036994695663452, "learning_rate": 5e-05, "loss": 1.1942, "step": 274 }, { "epoch": 0.09600279280851806, "grad_norm": 0.6119917631149292, "learning_rate": 5e-05, "loss": 1.3607, "step": 275 }, { "epoch": 0.09635189387327632, "grad_norm": 0.6243535280227661, "learning_rate": 5e-05, "loss": 1.3029, "step": 276 }, { "epoch": 0.09670099493803457, "grad_norm": 0.5424296855926514, "learning_rate": 5e-05, "loss": 1.2995, "step": 277 }, { "epoch": 0.09705009600279281, "grad_norm": 0.7677564024925232, "learning_rate": 5e-05, "loss": 1.2686, "step": 278 }, { "epoch": 0.09739919706755105, "grad_norm": 0.625275194644928, "learning_rate": 5e-05, "loss": 1.2897, "step": 279 }, { "epoch": 0.0977482981323093, "grad_norm": 0.5734910368919373, "learning_rate": 5e-05, "loss": 1.3298, "step": 280 }, { "epoch": 0.09809739919706754, "grad_norm": 0.660658061504364, "learning_rate": 5e-05, "loss": 1.2643, "step": 281 }, { "epoch": 0.0984465002618258, "grad_norm": 0.679891049861908, "learning_rate": 5e-05, "loss": 1.3189, "step": 282 }, { "epoch": 0.09879560132658405, "grad_norm": 0.6248694658279419, "learning_rate": 5e-05, "loss": 1.1688, "step": 283 }, { "epoch": 0.0991447023913423, "grad_norm": 0.6428897380828857, "learning_rate": 5e-05, "loss": 1.3274, "step": 284 }, { "epoch": 0.09949380345610054, "grad_norm": 0.586065411567688, "learning_rate": 5e-05, "loss": 1.3852, "step": 285 }, { "epoch": 0.09984290452085878, "grad_norm": 0.5755594372749329, "learning_rate": 5e-05, "loss": 1.3665, "step": 286 }, { "epoch": 0.10019200558561704, "grad_norm": 0.7748963236808777, "learning_rate": 5e-05, "loss": 1.4551, "step": 287 }, { "epoch": 0.10054110665037529, "grad_norm": 0.6308531165122986, "learning_rate": 5e-05, "loss": 1.2793, "step": 288 }, { "epoch": 0.10089020771513353, "grad_norm": 0.6195006966590881, "learning_rate": 5e-05, "loss": 1.3649, "step": 289 }, { "epoch": 0.10123930877989178, "grad_norm": 0.6098636984825134, "learning_rate": 5e-05, "loss": 1.2956, "step": 290 }, { "epoch": 0.10158840984465002, "grad_norm": 0.8072320818901062, "learning_rate": 5e-05, "loss": 1.3469, "step": 291 }, { "epoch": 0.10193751090940828, "grad_norm": 0.6090126633644104, "learning_rate": 5e-05, "loss": 1.2958, "step": 292 }, { "epoch": 0.10228661197416652, "grad_norm": 0.5718780159950256, "learning_rate": 5e-05, "loss": 1.363, "step": 293 }, { "epoch": 0.10263571303892477, "grad_norm": 0.7197532653808594, "learning_rate": 5e-05, "loss": 1.3868, "step": 294 }, { "epoch": 0.10298481410368301, "grad_norm": 0.5578592419624329, "learning_rate": 5e-05, "loss": 1.2627, "step": 295 }, { "epoch": 0.10333391516844126, "grad_norm": 0.730226457118988, "learning_rate": 5e-05, "loss": 1.3182, "step": 296 }, { "epoch": 0.10368301623319952, "grad_norm": 0.6234796047210693, "learning_rate": 5e-05, "loss": 1.1777, "step": 297 }, { "epoch": 0.10403211729795776, "grad_norm": 0.5563578009605408, "learning_rate": 5e-05, "loss": 1.3275, "step": 298 }, { "epoch": 0.10438121836271601, "grad_norm": 0.6864249110221863, "learning_rate": 5e-05, "loss": 1.2813, "step": 299 }, { "epoch": 0.10473031942747425, "grad_norm": 0.8850319385528564, "learning_rate": 5e-05, "loss": 1.3057, "step": 300 }, { "epoch": 0.10473031942747425, "eval_loss": 1.3255380392074585, "eval_runtime": 3311.4237, "eval_samples_per_second": 6.92, "eval_steps_per_second": 0.865, "step": 300 }, { "epoch": 0.1050794204922325, "grad_norm": 0.9439303278923035, "learning_rate": 5e-05, "loss": 1.281, "step": 301 }, { "epoch": 0.10542852155699074, "grad_norm": 0.6651242971420288, "learning_rate": 5e-05, "loss": 1.3492, "step": 302 }, { "epoch": 0.105777622621749, "grad_norm": 0.9047183394432068, "learning_rate": 5e-05, "loss": 1.4246, "step": 303 }, { "epoch": 0.10612672368650725, "grad_norm": 0.6983138918876648, "learning_rate": 5e-05, "loss": 1.324, "step": 304 }, { "epoch": 0.10647582475126549, "grad_norm": 0.6347063779830933, "learning_rate": 5e-05, "loss": 1.3389, "step": 305 }, { "epoch": 0.10682492581602374, "grad_norm": 0.6051842570304871, "learning_rate": 5e-05, "loss": 1.3278, "step": 306 }, { "epoch": 0.10717402688078198, "grad_norm": 0.9355935454368591, "learning_rate": 5e-05, "loss": 1.2663, "step": 307 }, { "epoch": 0.10752312794554024, "grad_norm": 1.0706268548965454, "learning_rate": 5e-05, "loss": 1.3142, "step": 308 }, { "epoch": 0.10787222901029848, "grad_norm": 0.8131638765335083, "learning_rate": 5e-05, "loss": 1.3445, "step": 309 }, { "epoch": 0.10822133007505673, "grad_norm": 0.5791985392570496, "learning_rate": 5e-05, "loss": 1.2746, "step": 310 }, { "epoch": 0.10857043113981497, "grad_norm": 0.5536484718322754, "learning_rate": 5e-05, "loss": 1.2613, "step": 311 }, { "epoch": 0.10891953220457322, "grad_norm": 0.7847089767456055, "learning_rate": 5e-05, "loss": 1.4607, "step": 312 }, { "epoch": 0.10926863326933148, "grad_norm": 0.7828165888786316, "learning_rate": 5e-05, "loss": 1.4399, "step": 313 }, { "epoch": 0.10961773433408972, "grad_norm": 0.5692522525787354, "learning_rate": 5e-05, "loss": 1.3044, "step": 314 }, { "epoch": 0.10996683539884797, "grad_norm": 0.5592648386955261, "learning_rate": 5e-05, "loss": 1.3211, "step": 315 }, { "epoch": 0.11031593646360621, "grad_norm": 0.7055444717407227, "learning_rate": 5e-05, "loss": 1.2944, "step": 316 }, { "epoch": 0.11066503752836446, "grad_norm": 0.5370152592658997, "learning_rate": 5e-05, "loss": 1.2776, "step": 317 }, { "epoch": 0.11101413859312272, "grad_norm": 0.6320214867591858, "learning_rate": 5e-05, "loss": 1.347, "step": 318 }, { "epoch": 0.11136323965788096, "grad_norm": 0.6425771713256836, "learning_rate": 5e-05, "loss": 1.5038, "step": 319 }, { "epoch": 0.1117123407226392, "grad_norm": 0.585542619228363, "learning_rate": 5e-05, "loss": 1.3573, "step": 320 }, { "epoch": 0.11206144178739745, "grad_norm": 0.5627699494361877, "learning_rate": 5e-05, "loss": 1.2693, "step": 321 }, { "epoch": 0.1124105428521557, "grad_norm": 0.6050506830215454, "learning_rate": 5e-05, "loss": 1.2787, "step": 322 }, { "epoch": 0.11275964391691394, "grad_norm": 0.6247337460517883, "learning_rate": 5e-05, "loss": 1.4146, "step": 323 }, { "epoch": 0.1131087449816722, "grad_norm": 0.7732966542243958, "learning_rate": 5e-05, "loss": 1.2626, "step": 324 }, { "epoch": 0.11345784604643044, "grad_norm": 0.5666255354881287, "learning_rate": 5e-05, "loss": 1.4219, "step": 325 }, { "epoch": 0.11380694711118869, "grad_norm": 0.5973132848739624, "learning_rate": 5e-05, "loss": 1.3522, "step": 326 }, { "epoch": 0.11415604817594693, "grad_norm": 0.8540626764297485, "learning_rate": 5e-05, "loss": 1.304, "step": 327 }, { "epoch": 0.11450514924070518, "grad_norm": 0.574573278427124, "learning_rate": 5e-05, "loss": 1.3487, "step": 328 }, { "epoch": 0.11485425030546344, "grad_norm": 0.5949917435646057, "learning_rate": 5e-05, "loss": 1.254, "step": 329 }, { "epoch": 0.11520335137022168, "grad_norm": 0.6005589365959167, "learning_rate": 5e-05, "loss": 1.3073, "step": 330 }, { "epoch": 0.11555245243497993, "grad_norm": 0.5026714205741882, "learning_rate": 5e-05, "loss": 1.2418, "step": 331 }, { "epoch": 0.11590155349973817, "grad_norm": 0.7160278558731079, "learning_rate": 5e-05, "loss": 1.3437, "step": 332 }, { "epoch": 0.11625065456449642, "grad_norm": 0.6049554347991943, "learning_rate": 5e-05, "loss": 1.4858, "step": 333 }, { "epoch": 0.11659975562925468, "grad_norm": 0.7706385254859924, "learning_rate": 5e-05, "loss": 1.3971, "step": 334 }, { "epoch": 0.11694885669401292, "grad_norm": 0.6254088282585144, "learning_rate": 5e-05, "loss": 1.3359, "step": 335 }, { "epoch": 0.11729795775877117, "grad_norm": 0.5904930830001831, "learning_rate": 5e-05, "loss": 1.3262, "step": 336 }, { "epoch": 0.11764705882352941, "grad_norm": 1.9982556104660034, "learning_rate": 5e-05, "loss": 1.3656, "step": 337 }, { "epoch": 0.11799615988828766, "grad_norm": 0.5776758790016174, "learning_rate": 5e-05, "loss": 1.2654, "step": 338 }, { "epoch": 0.1183452609530459, "grad_norm": 0.6094497442245483, "learning_rate": 5e-05, "loss": 1.3505, "step": 339 }, { "epoch": 0.11869436201780416, "grad_norm": 0.9940481185913086, "learning_rate": 5e-05, "loss": 1.2853, "step": 340 }, { "epoch": 0.1190434630825624, "grad_norm": 1.1043668985366821, "learning_rate": 5e-05, "loss": 1.2813, "step": 341 }, { "epoch": 0.11939256414732065, "grad_norm": 0.5494128465652466, "learning_rate": 5e-05, "loss": 1.202, "step": 342 }, { "epoch": 0.1197416652120789, "grad_norm": 0.6436132192611694, "learning_rate": 5e-05, "loss": 1.2898, "step": 343 }, { "epoch": 0.12009076627683714, "grad_norm": 0.6878450512886047, "learning_rate": 5e-05, "loss": 1.3392, "step": 344 }, { "epoch": 0.1204398673415954, "grad_norm": 0.5806905627250671, "learning_rate": 5e-05, "loss": 1.2221, "step": 345 }, { "epoch": 0.12078896840635364, "grad_norm": 0.5916112065315247, "learning_rate": 5e-05, "loss": 1.2761, "step": 346 }, { "epoch": 0.12113806947111189, "grad_norm": 0.5216647386550903, "learning_rate": 5e-05, "loss": 1.223, "step": 347 }, { "epoch": 0.12148717053587013, "grad_norm": 0.707747220993042, "learning_rate": 5e-05, "loss": 1.2933, "step": 348 }, { "epoch": 0.12183627160062838, "grad_norm": 0.6644443273544312, "learning_rate": 5e-05, "loss": 1.3367, "step": 349 }, { "epoch": 0.12218537266538664, "grad_norm": 0.7112720012664795, "learning_rate": 5e-05, "loss": 1.2368, "step": 350 }, { "epoch": 0.12253447373014488, "grad_norm": 0.6551552414894104, "learning_rate": 5e-05, "loss": 1.3348, "step": 351 }, { "epoch": 0.12288357479490312, "grad_norm": 0.5377748012542725, "learning_rate": 5e-05, "loss": 1.2859, "step": 352 }, { "epoch": 0.12323267585966137, "grad_norm": 0.580769956111908, "learning_rate": 5e-05, "loss": 1.2442, "step": 353 }, { "epoch": 0.12358177692441961, "grad_norm": 0.6772916316986084, "learning_rate": 5e-05, "loss": 1.2994, "step": 354 }, { "epoch": 0.12393087798917787, "grad_norm": 0.6245989799499512, "learning_rate": 5e-05, "loss": 1.2093, "step": 355 }, { "epoch": 0.12427997905393612, "grad_norm": 0.6136452555656433, "learning_rate": 5e-05, "loss": 1.2258, "step": 356 }, { "epoch": 0.12462908011869436, "grad_norm": 0.5786277055740356, "learning_rate": 5e-05, "loss": 1.2856, "step": 357 }, { "epoch": 0.12497818118345261, "grad_norm": 0.5986611247062683, "learning_rate": 5e-05, "loss": 1.4524, "step": 358 }, { "epoch": 0.12532728224821085, "grad_norm": 0.6240454316139221, "learning_rate": 5e-05, "loss": 1.3325, "step": 359 }, { "epoch": 0.1256763833129691, "grad_norm": 0.6426084041595459, "learning_rate": 5e-05, "loss": 1.219, "step": 360 }, { "epoch": 0.12602548437772734, "grad_norm": 0.6227401494979858, "learning_rate": 5e-05, "loss": 1.3342, "step": 361 }, { "epoch": 0.1263745854424856, "grad_norm": 0.7462456226348877, "learning_rate": 5e-05, "loss": 1.3747, "step": 362 }, { "epoch": 0.12672368650724386, "grad_norm": 0.7022641897201538, "learning_rate": 5e-05, "loss": 1.2957, "step": 363 }, { "epoch": 0.1270727875720021, "grad_norm": 0.657645046710968, "learning_rate": 5e-05, "loss": 1.3125, "step": 364 }, { "epoch": 0.12742188863676035, "grad_norm": 0.662497878074646, "learning_rate": 5e-05, "loss": 1.321, "step": 365 }, { "epoch": 0.1277709897015186, "grad_norm": 0.6295817494392395, "learning_rate": 5e-05, "loss": 1.3814, "step": 366 }, { "epoch": 0.12812009076627684, "grad_norm": 0.7357390522956848, "learning_rate": 5e-05, "loss": 1.374, "step": 367 }, { "epoch": 0.12846919183103508, "grad_norm": 0.6728739142417908, "learning_rate": 5e-05, "loss": 1.1957, "step": 368 }, { "epoch": 0.12881829289579333, "grad_norm": 0.6290231943130493, "learning_rate": 5e-05, "loss": 1.2948, "step": 369 }, { "epoch": 0.12916739396055157, "grad_norm": 1.0889554023742676, "learning_rate": 5e-05, "loss": 1.3465, "step": 370 }, { "epoch": 0.12951649502530982, "grad_norm": 0.6978388428688049, "learning_rate": 5e-05, "loss": 1.2898, "step": 371 }, { "epoch": 0.12986559609006806, "grad_norm": 1.0806949138641357, "learning_rate": 5e-05, "loss": 1.2656, "step": 372 }, { "epoch": 0.1302146971548263, "grad_norm": 0.5989696979522705, "learning_rate": 5e-05, "loss": 1.354, "step": 373 }, { "epoch": 0.13056379821958458, "grad_norm": 0.5808868408203125, "learning_rate": 5e-05, "loss": 1.2911, "step": 374 }, { "epoch": 0.13091289928434283, "grad_norm": 0.6175510883331299, "learning_rate": 5e-05, "loss": 1.3392, "step": 375 }, { "epoch": 0.13126200034910107, "grad_norm": 0.7896063923835754, "learning_rate": 5e-05, "loss": 1.3598, "step": 376 }, { "epoch": 0.13161110141385932, "grad_norm": 0.6890353560447693, "learning_rate": 5e-05, "loss": 1.2259, "step": 377 }, { "epoch": 0.13196020247861756, "grad_norm": 0.7264868021011353, "learning_rate": 5e-05, "loss": 1.3747, "step": 378 }, { "epoch": 0.1323093035433758, "grad_norm": 0.5779114365577698, "learning_rate": 5e-05, "loss": 1.2566, "step": 379 }, { "epoch": 0.13265840460813405, "grad_norm": 0.6164990067481995, "learning_rate": 5e-05, "loss": 1.3123, "step": 380 }, { "epoch": 0.1330075056728923, "grad_norm": 0.5990901589393616, "learning_rate": 5e-05, "loss": 1.399, "step": 381 }, { "epoch": 0.13335660673765054, "grad_norm": 0.5799390077590942, "learning_rate": 5e-05, "loss": 1.2697, "step": 382 }, { "epoch": 0.13370570780240879, "grad_norm": 0.6446252465248108, "learning_rate": 5e-05, "loss": 1.3321, "step": 383 }, { "epoch": 0.13405480886716706, "grad_norm": 0.5626406669616699, "learning_rate": 5e-05, "loss": 1.2867, "step": 384 }, { "epoch": 0.1344039099319253, "grad_norm": 0.5967420935630798, "learning_rate": 5e-05, "loss": 1.3514, "step": 385 }, { "epoch": 0.13475301099668355, "grad_norm": 0.622344434261322, "learning_rate": 5e-05, "loss": 1.2814, "step": 386 }, { "epoch": 0.1351021120614418, "grad_norm": 0.5952975749969482, "learning_rate": 5e-05, "loss": 1.3616, "step": 387 }, { "epoch": 0.13545121312620004, "grad_norm": 1.6270025968551636, "learning_rate": 5e-05, "loss": 1.3057, "step": 388 }, { "epoch": 0.13580031419095828, "grad_norm": 0.6453176736831665, "learning_rate": 5e-05, "loss": 1.2203, "step": 389 }, { "epoch": 0.13614941525571653, "grad_norm": 0.6074663400650024, "learning_rate": 5e-05, "loss": 1.2705, "step": 390 }, { "epoch": 0.13649851632047477, "grad_norm": 0.5617640018463135, "learning_rate": 5e-05, "loss": 1.2692, "step": 391 }, { "epoch": 0.13684761738523302, "grad_norm": 0.5138052701950073, "learning_rate": 5e-05, "loss": 1.2914, "step": 392 }, { "epoch": 0.13719671844999126, "grad_norm": 0.6522411108016968, "learning_rate": 5e-05, "loss": 1.3055, "step": 393 }, { "epoch": 0.1375458195147495, "grad_norm": 0.6821246147155762, "learning_rate": 5e-05, "loss": 1.2674, "step": 394 }, { "epoch": 0.13789492057950778, "grad_norm": 0.6284828186035156, "learning_rate": 5e-05, "loss": 1.2842, "step": 395 }, { "epoch": 0.13824402164426602, "grad_norm": 0.6461937427520752, "learning_rate": 5e-05, "loss": 1.305, "step": 396 }, { "epoch": 0.13859312270902427, "grad_norm": 0.8084800243377686, "learning_rate": 5e-05, "loss": 1.3539, "step": 397 }, { "epoch": 0.1389422237737825, "grad_norm": 0.5511135458946228, "learning_rate": 5e-05, "loss": 1.2364, "step": 398 }, { "epoch": 0.13929132483854076, "grad_norm": 0.6121107339859009, "learning_rate": 5e-05, "loss": 1.3212, "step": 399 }, { "epoch": 0.139640425903299, "grad_norm": 0.5705773234367371, "learning_rate": 5e-05, "loss": 1.3116, "step": 400 }, { "epoch": 0.139640425903299, "eval_loss": 1.322394609451294, "eval_runtime": 3311.45, "eval_samples_per_second": 6.92, "eval_steps_per_second": 0.865, "step": 400 }, { "epoch": 0.139640425903299, "step": 400, "total_flos": 8.590417732871127e+17, "train_loss": 1.3312159395217895, "train_runtime": 17991.8527, "train_samples_per_second": 1.779, "train_steps_per_second": 0.056 } ], "logging_steps": 1.0, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 8.590417732871127e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }