diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,6 +1,6 @@ { - "best_metric": 0.2850923240184784, - "best_model_checkpoint": "./cifar100_outputs/checkpoint-26565", + "best_metric": 0.2550007104873657, + "best_model_checkpoint": "./cifar100_lora_outputs/checkpoint-26565", "epoch": 5.0, "eval_steps": 500, "global_step": 26565, @@ -10,18649 +10,18649 @@ "log_history": [ { "epoch": 0.0018821757952192735, - "grad_norm": 3.259775161743164, - "learning_rate": 2.9988706945228687e-05, - "loss": 4.6305, + "grad_norm": 1.771733283996582, + "learning_rate": 0.0004998117824204781, + "loss": 4.6194, "step": 10 }, { "epoch": 0.003764351590438547, - "grad_norm": 2.9150309562683105, - "learning_rate": 2.997741389045737e-05, - "loss": 4.5916, + "grad_norm": 1.8498225212097168, + "learning_rate": 0.0004996235648409562, + "loss": 4.5768, "step": 20 }, { "epoch": 0.00564652738565782, - "grad_norm": 2.639889717102051, - "learning_rate": 2.9966120835686056e-05, - "loss": 4.5977, + "grad_norm": 1.8128389120101929, + "learning_rate": 0.0004994353472614342, + "loss": 4.5582, "step": 30 }, { "epoch": 0.007528703180877094, - "grad_norm": 2.729921340942383, - "learning_rate": 2.9954827780914738e-05, - "loss": 4.5964, + "grad_norm": 1.824784755706787, + "learning_rate": 0.0004992471296819123, + "loss": 4.5086, "step": 40 }, { "epoch": 0.009410878976096368, - "grad_norm": 2.3783774375915527, - "learning_rate": 2.994353472614342e-05, - "loss": 4.6022, + "grad_norm": 1.7717084884643555, + "learning_rate": 0.0004990589121023904, + "loss": 4.5419, "step": 50 }, { "epoch": 0.01129305477131564, - "grad_norm": 2.656496524810791, - "learning_rate": 2.9932241671372107e-05, - "loss": 4.5908, + "grad_norm": 1.8626877069473267, + "learning_rate": 0.0004988706945228685, + "loss": 4.475, "step": 60 }, { "epoch": 0.013175230566534914, - "grad_norm": 2.692168712615967, - "learning_rate": 2.992094861660079e-05, - "loss": 4.5829, + "grad_norm": 1.8645660877227783, + "learning_rate": 0.0004986824769433465, + "loss": 4.4153, "step": 70 }, { "epoch": 0.015057406361754188, - "grad_norm": 2.8096225261688232, - "learning_rate": 2.9909655561829476e-05, - "loss": 4.5645, + "grad_norm": 1.8023477792739868, + "learning_rate": 0.0004984942593638246, + "loss": 4.4258, "step": 80 }, { "epoch": 0.01693958215697346, - "grad_norm": 2.7067952156066895, - "learning_rate": 2.989836250705816e-05, - "loss": 4.5448, + "grad_norm": 1.8429126739501953, + "learning_rate": 0.0004983060417843027, + "loss": 4.3373, "step": 90 }, { "epoch": 0.018821757952192736, - "grad_norm": 2.5358433723449707, - "learning_rate": 2.9887069452286845e-05, - "loss": 4.5471, + "grad_norm": 1.9091802835464478, + "learning_rate": 0.0004981178242047808, + "loss": 4.2982, "step": 100 }, { "epoch": 0.020703933747412008, - "grad_norm": 2.4757063388824463, - "learning_rate": 2.987577639751553e-05, - "loss": 4.5546, + "grad_norm": 1.7966872453689575, + "learning_rate": 0.0004979296066252589, + "loss": 4.2458, "step": 110 }, { "epoch": 0.02258610954263128, - "grad_norm": 2.8501641750335693, - "learning_rate": 2.9864483342744213e-05, - "loss": 4.5471, + "grad_norm": 1.9354859590530396, + "learning_rate": 0.0004977413890457369, + "loss": 4.1789, "step": 120 }, { "epoch": 0.024468285337850556, - "grad_norm": 2.506326198577881, - "learning_rate": 2.98531902879729e-05, - "loss": 4.5289, + "grad_norm": 2.0137393474578857, + "learning_rate": 0.000497553171466215, + "loss": 4.105, "step": 130 }, { "epoch": 0.026350461133069828, - "grad_norm": 2.7096266746520996, - "learning_rate": 2.9841897233201582e-05, - "loss": 4.5213, + "grad_norm": 1.9952950477600098, + "learning_rate": 0.000497364953886693, + "loss": 4.0637, "step": 140 }, { "epoch": 0.028232636928289104, - "grad_norm": 2.9105842113494873, - "learning_rate": 2.9830604178430265e-05, - "loss": 4.5017, + "grad_norm": 2.067469835281372, + "learning_rate": 0.000497176736307171, + "loss": 3.9707, "step": 150 }, { "epoch": 0.030114812723508376, - "grad_norm": 2.713897466659546, - "learning_rate": 2.981931112365895e-05, - "loss": 4.5014, + "grad_norm": 2.226876974105835, + "learning_rate": 0.0004969885187276491, + "loss": 3.958, "step": 160 }, { "epoch": 0.03199698851872765, - "grad_norm": 3.520843744277954, - "learning_rate": 2.9808018068887634e-05, - "loss": 4.482, + "grad_norm": 2.162407159805298, + "learning_rate": 0.0004968003011481272, + "loss": 3.9696, "step": 170 }, { "epoch": 0.03387916431394692, - "grad_norm": 3.0594747066497803, - "learning_rate": 2.979672501411632e-05, - "loss": 4.4706, + "grad_norm": 1.9613149166107178, + "learning_rate": 0.0004966120835686053, + "loss": 3.7365, "step": 180 }, { "epoch": 0.0357613401091662, - "grad_norm": 2.884718418121338, - "learning_rate": 2.9785431959345006e-05, - "loss": 4.4648, + "grad_norm": 2.009887933731079, + "learning_rate": 0.0004964238659890834, + "loss": 3.6623, "step": 190 }, { "epoch": 0.03764351590438547, - "grad_norm": 2.8616080284118652, - "learning_rate": 2.977413890457369e-05, - "loss": 4.4629, + "grad_norm": 2.3235840797424316, + "learning_rate": 0.0004962356484095614, + "loss": 3.6737, "step": 200 }, { "epoch": 0.039525691699604744, - "grad_norm": 2.6512672901153564, - "learning_rate": 2.9762845849802374e-05, - "loss": 4.4555, + "grad_norm": 1.996651291847229, + "learning_rate": 0.0004960474308300395, + "loss": 3.6248, "step": 210 }, { "epoch": 0.041407867494824016, - "grad_norm": 3.5559914112091064, - "learning_rate": 2.9751552795031054e-05, - "loss": 4.4335, + "grad_norm": 2.1277413368225098, + "learning_rate": 0.0004958592132505176, + "loss": 3.5943, "step": 220 }, { "epoch": 0.04329004329004329, - "grad_norm": 2.8921027183532715, - "learning_rate": 2.974025974025974e-05, - "loss": 4.3462, + "grad_norm": 2.05141019821167, + "learning_rate": 0.0004956709956709957, + "loss": 3.2136, "step": 230 }, { "epoch": 0.04517221908526256, - "grad_norm": 2.7680187225341797, - "learning_rate": 2.9728966685488426e-05, - "loss": 4.4017, + "grad_norm": 2.045433521270752, + "learning_rate": 0.0004954827780914737, + "loss": 3.376, "step": 240 }, { "epoch": 0.04705439488048184, - "grad_norm": 2.8839056491851807, - "learning_rate": 2.971767363071711e-05, - "loss": 4.4098, + "grad_norm": 2.1396985054016113, + "learning_rate": 0.0004952945605119518, + "loss": 3.3767, "step": 250 }, { "epoch": 0.04893657067570111, - "grad_norm": 2.8372817039489746, - "learning_rate": 2.9706380575945795e-05, - "loss": 4.3837, + "grad_norm": 2.0575475692749023, + "learning_rate": 0.0004951063429324299, + "loss": 3.2651, "step": 260 }, { "epoch": 0.050818746470920384, - "grad_norm": 2.884737968444824, - "learning_rate": 2.9695087521174477e-05, - "loss": 4.3769, + "grad_norm": 2.0891125202178955, + "learning_rate": 0.000494918125352908, + "loss": 3.2083, "step": 270 }, { "epoch": 0.052700922266139656, - "grad_norm": 2.899841070175171, - "learning_rate": 2.9683794466403163e-05, - "loss": 4.3447, + "grad_norm": 2.27366304397583, + "learning_rate": 0.000494729907773386, + "loss": 2.9784, "step": 280 }, { "epoch": 0.05458309806135893, - "grad_norm": 3.1840929985046387, - "learning_rate": 2.967250141163185e-05, - "loss": 4.3174, + "grad_norm": 2.2188055515289307, + "learning_rate": 0.0004945416901938641, + "loss": 2.981, "step": 290 }, { "epoch": 0.05646527385657821, - "grad_norm": 2.9409921169281006, - "learning_rate": 2.9661208356860532e-05, - "loss": 4.3072, + "grad_norm": 2.5624613761901855, + "learning_rate": 0.0004943534726143422, + "loss": 2.9, "step": 300 }, { "epoch": 0.05834744965179748, - "grad_norm": 3.342649459838867, - "learning_rate": 2.9649915302089215e-05, - "loss": 4.3291, + "grad_norm": 2.1516923904418945, + "learning_rate": 0.0004941652550348203, + "loss": 3.0586, "step": 310 }, { "epoch": 0.06022962544701675, - "grad_norm": 2.746373176574707, - "learning_rate": 2.9638622247317898e-05, - "loss": 4.3288, + "grad_norm": 2.2759182453155518, + "learning_rate": 0.0004939770374552983, + "loss": 3.0083, "step": 320 }, { "epoch": 0.062111801242236024, - "grad_norm": 3.7221550941467285, - "learning_rate": 2.9627329192546584e-05, - "loss": 4.2612, + "grad_norm": 2.1301019191741943, + "learning_rate": 0.0004937888198757764, + "loss": 2.653, "step": 330 }, { "epoch": 0.0639939770374553, - "grad_norm": 2.8289361000061035, - "learning_rate": 2.961603613777527e-05, - "loss": 4.2885, + "grad_norm": 2.1771981716156006, + "learning_rate": 0.0004936006022962545, + "loss": 2.8033, "step": 340 }, { "epoch": 0.06587615283267458, - "grad_norm": 3.2310261726379395, - "learning_rate": 2.9604743083003952e-05, - "loss": 4.2502, + "grad_norm": 2.0707926750183105, + "learning_rate": 0.0004934123847167326, + "loss": 2.6861, "step": 350 }, { "epoch": 0.06775832862789384, - "grad_norm": 2.873511791229248, - "learning_rate": 2.959345002823264e-05, - "loss": 4.2713, + "grad_norm": 2.312077283859253, + "learning_rate": 0.0004932241671372107, + "loss": 2.7215, "step": 360 }, { "epoch": 0.06964050442311312, - "grad_norm": 2.8159632682800293, - "learning_rate": 2.958215697346132e-05, - "loss": 4.237, + "grad_norm": 2.0129685401916504, + "learning_rate": 0.0004930359495576887, + "loss": 2.5832, "step": 370 }, { "epoch": 0.0715226802183324, - "grad_norm": 4.2501220703125, - "learning_rate": 2.9570863918690007e-05, - "loss": 4.2377, + "grad_norm": 2.031329393386841, + "learning_rate": 0.0004928477319781668, + "loss": 2.6758, "step": 380 }, { "epoch": 0.07340485601355166, - "grad_norm": 3.2059030532836914, - "learning_rate": 2.9559570863918693e-05, - "loss": 4.245, + "grad_norm": 2.230658769607544, + "learning_rate": 0.0004926595143986449, + "loss": 2.696, "step": 390 }, { "epoch": 0.07528703180877094, - "grad_norm": 3.3538947105407715, - "learning_rate": 2.9548277809147373e-05, - "loss": 4.2061, + "grad_norm": 2.8371505737304688, + "learning_rate": 0.0004924712968191229, + "loss": 2.5156, "step": 400 }, { "epoch": 0.07716920760399021, - "grad_norm": 3.506152391433716, - "learning_rate": 2.953698475437606e-05, - "loss": 4.1917, + "grad_norm": 2.619385004043579, + "learning_rate": 0.0004922830792396009, + "loss": 2.4818, "step": 410 }, { "epoch": 0.07905138339920949, - "grad_norm": 3.823164701461792, - "learning_rate": 2.9525691699604745e-05, - "loss": 4.1214, + "grad_norm": 2.360231637954712, + "learning_rate": 0.000492094861660079, + "loss": 2.2714, "step": 420 }, { "epoch": 0.08093355919442875, - "grad_norm": 3.458685874938965, - "learning_rate": 2.9514398644833427e-05, - "loss": 4.1866, + "grad_norm": 2.3182997703552246, + "learning_rate": 0.0004919066440805571, + "loss": 2.3886, "step": 430 }, { "epoch": 0.08281573498964803, - "grad_norm": 3.0740559101104736, - "learning_rate": 2.9503105590062114e-05, - "loss": 4.1349, + "grad_norm": 2.0524964332580566, + "learning_rate": 0.0004917184265010352, + "loss": 2.2321, "step": 440 }, { "epoch": 0.08469791078486731, - "grad_norm": 4.038434982299805, - "learning_rate": 2.9491812535290796e-05, - "loss": 4.1513, + "grad_norm": 1.905816912651062, + "learning_rate": 0.0004915302089215133, + "loss": 2.2257, "step": 450 }, { "epoch": 0.08658008658008658, - "grad_norm": 3.2392990589141846, - "learning_rate": 2.9480519480519482e-05, - "loss": 4.0751, + "grad_norm": 2.24204421043396, + "learning_rate": 0.0004913419913419914, + "loss": 2.1486, "step": 460 }, { "epoch": 0.08846226237530586, - "grad_norm": 2.7104597091674805, - "learning_rate": 2.946922642574817e-05, - "loss": 4.1389, + "grad_norm": 1.8304845094680786, + "learning_rate": 0.0004911537737624695, + "loss": 2.3883, "step": 470 }, { "epoch": 0.09034443817052512, - "grad_norm": 3.526334762573242, - "learning_rate": 2.9457933370976848e-05, - "loss": 4.0984, + "grad_norm": 2.062603235244751, + "learning_rate": 0.0004909655561829475, + "loss": 2.2133, "step": 480 }, { "epoch": 0.0922266139657444, - "grad_norm": 4.827049732208252, - "learning_rate": 2.9446640316205534e-05, - "loss": 4.065, + "grad_norm": 2.481670618057251, + "learning_rate": 0.0004907773386034255, + "loss": 1.9732, "step": 490 }, { "epoch": 0.09410878976096368, - "grad_norm": 3.1481475830078125, - "learning_rate": 2.9435347261434216e-05, - "loss": 4.0429, + "grad_norm": 1.9979970455169678, + "learning_rate": 0.0004905891210239036, + "loss": 2.0477, "step": 500 }, { "epoch": 0.09599096555618294, - "grad_norm": 3.47776198387146, - "learning_rate": 2.9424054206662903e-05, - "loss": 4.0432, + "grad_norm": 2.0687055587768555, + "learning_rate": 0.0004904009034443817, + "loss": 1.9344, "step": 510 }, { "epoch": 0.09787314135140222, - "grad_norm": 2.9574878215789795, - "learning_rate": 2.941276115189159e-05, - "loss": 4.0486, + "grad_norm": 1.8960387706756592, + "learning_rate": 0.0004902126858648598, + "loss": 2.1094, "step": 520 }, { "epoch": 0.09975531714662149, - "grad_norm": 3.398369073867798, - "learning_rate": 2.940146809712027e-05, - "loss": 3.9975, + "grad_norm": 2.011258125305176, + "learning_rate": 0.0004900244682853378, + "loss": 2.0836, "step": 530 }, { "epoch": 0.10163749294184077, - "grad_norm": 3.2423532009124756, - "learning_rate": 2.9390175042348957e-05, - "loss": 4.0189, + "grad_norm": 2.1397879123687744, + "learning_rate": 0.0004898362507058159, + "loss": 2.1036, "step": 540 }, { "epoch": 0.10351966873706005, - "grad_norm": 4.2174153327941895, - "learning_rate": 2.937888198757764e-05, - "loss": 3.9679, + "grad_norm": 2.2543036937713623, + "learning_rate": 0.000489648033126294, + "loss": 1.727, "step": 550 }, { "epoch": 0.10540184453227931, - "grad_norm": 4.3004045486450195, - "learning_rate": 2.9367588932806326e-05, - "loss": 4.0162, + "grad_norm": 2.0110862255096436, + "learning_rate": 0.0004894598155467721, + "loss": 2.0504, "step": 560 }, { "epoch": 0.10728402032749859, - "grad_norm": 3.0636956691741943, - "learning_rate": 2.935629587803501e-05, - "loss": 3.9957, + "grad_norm": 2.556957483291626, + "learning_rate": 0.0004892715979672501, + "loss": 2.0245, "step": 570 }, { "epoch": 0.10916619612271786, - "grad_norm": 3.1375842094421387, - "learning_rate": 2.934500282326369e-05, - "loss": 4.0078, + "grad_norm": 3.036496639251709, + "learning_rate": 0.0004890833803877282, + "loss": 1.9936, "step": 580 }, { "epoch": 0.11104837191793714, - "grad_norm": 3.439136505126953, - "learning_rate": 2.9333709768492378e-05, - "loss": 3.9473, + "grad_norm": 2.3555924892425537, + "learning_rate": 0.0004888951628082063, + "loss": 1.8314, "step": 590 }, { "epoch": 0.11293054771315642, - "grad_norm": 3.252802610397339, - "learning_rate": 2.932241671372106e-05, - "loss": 3.9155, + "grad_norm": 1.9588710069656372, + "learning_rate": 0.0004887069452286844, + "loss": 1.8489, "step": 600 }, { "epoch": 0.11481272350837568, - "grad_norm": 3.3427743911743164, - "learning_rate": 2.9311123658949746e-05, - "loss": 3.9003, + "grad_norm": 2.1294167041778564, + "learning_rate": 0.0004885187276491625, + "loss": 2.0076, "step": 610 }, { "epoch": 0.11669489930359496, - "grad_norm": 3.1991214752197266, - "learning_rate": 2.9299830604178432e-05, - "loss": 3.8892, + "grad_norm": 2.881962299346924, + "learning_rate": 0.0004883305100696405, + "loss": 1.6918, "step": 620 }, { "epoch": 0.11857707509881422, - "grad_norm": 3.2154698371887207, - "learning_rate": 2.9288537549407115e-05, - "loss": 3.8717, + "grad_norm": 2.8056647777557373, + "learning_rate": 0.0004881422924901186, + "loss": 1.7188, "step": 630 }, { "epoch": 0.1204592508940335, - "grad_norm": 2.7783098220825195, - "learning_rate": 2.92772444946358e-05, - "loss": 3.8236, + "grad_norm": 1.6415379047393799, + "learning_rate": 0.0004879540749105967, + "loss": 1.5487, "step": 640 }, { "epoch": 0.12234142668925278, - "grad_norm": 4.598132610321045, - "learning_rate": 2.9265951439864487e-05, - "loss": 3.9406, + "grad_norm": 4.757891654968262, + "learning_rate": 0.00048776585733107476, + "loss": 1.8836, "step": 650 }, { "epoch": 0.12422360248447205, - "grad_norm": 3.633305549621582, - "learning_rate": 2.9254658385093167e-05, - "loss": 3.7923, + "grad_norm": 3.0680055618286133, + "learning_rate": 0.0004875776397515528, + "loss": 1.5568, "step": 660 }, { "epoch": 0.12610577827969133, - "grad_norm": 3.794126272201538, - "learning_rate": 2.9243365330321853e-05, - "loss": 3.7865, + "grad_norm": 2.2202789783477783, + "learning_rate": 0.00048738942217203086, + "loss": 1.5855, "step": 670 }, { "epoch": 0.1279879540749106, - "grad_norm": 3.1769659519195557, - "learning_rate": 2.9232072275550535e-05, - "loss": 3.7679, + "grad_norm": 4.045197010040283, + "learning_rate": 0.00048720120459250894, + "loss": 1.6895, "step": 680 }, { "epoch": 0.12987012987012986, - "grad_norm": 3.30849289894104, - "learning_rate": 2.922077922077922e-05, - "loss": 3.8356, + "grad_norm": 2.5233097076416016, + "learning_rate": 0.000487012987012987, + "loss": 1.5461, "step": 690 }, { "epoch": 0.13175230566534915, - "grad_norm": 4.426156997680664, - "learning_rate": 2.9209486166007908e-05, - "loss": 3.7757, + "grad_norm": 2.6602494716644287, + "learning_rate": 0.0004868247694334651, + "loss": 1.6852, "step": 700 }, { "epoch": 0.13363448146056842, - "grad_norm": 3.004460096359253, - "learning_rate": 2.919819311123659e-05, - "loss": 3.6859, + "grad_norm": 1.7626466751098633, + "learning_rate": 0.00048663655185394317, + "loss": 1.4548, "step": 710 }, { "epoch": 0.13551665725578768, - "grad_norm": 3.6174192428588867, - "learning_rate": 2.9186900056465276e-05, - "loss": 3.745, + "grad_norm": 2.280395030975342, + "learning_rate": 0.00048644833427442124, + "loss": 1.5311, "step": 720 }, { "epoch": 0.13739883305100697, - "grad_norm": 3.1848292350769043, - "learning_rate": 2.917560700169396e-05, - "loss": 3.6902, + "grad_norm": 1.9135655164718628, + "learning_rate": 0.0004862601166948993, + "loss": 1.5031, "step": 730 }, { "epoch": 0.13928100884622624, - "grad_norm": 3.1406095027923584, - "learning_rate": 2.9164313946922645e-05, - "loss": 3.7553, + "grad_norm": 1.908615231513977, + "learning_rate": 0.0004860718991153774, + "loss": 1.6354, "step": 740 }, { "epoch": 0.1411631846414455, - "grad_norm": 3.086644411087036, - "learning_rate": 2.9153020892151328e-05, - "loss": 3.736, + "grad_norm": 1.6709424257278442, + "learning_rate": 0.0004858836815358554, + "loss": 1.4328, "step": 750 }, { "epoch": 0.1430453604366648, - "grad_norm": 5.149982452392578, - "learning_rate": 2.914172783738001e-05, - "loss": 3.6852, + "grad_norm": 3.3939626216888428, + "learning_rate": 0.0004856954639563335, + "loss": 1.3476, "step": 760 }, { "epoch": 0.14492753623188406, - "grad_norm": 4.200068950653076, - "learning_rate": 2.9130434782608696e-05, - "loss": 3.6916, + "grad_norm": 2.578439712524414, + "learning_rate": 0.00048550724637681157, + "loss": 1.4497, "step": 770 }, { "epoch": 0.14680971202710333, - "grad_norm": 3.5384104251861572, - "learning_rate": 2.911914172783738e-05, - "loss": 3.6724, + "grad_norm": 2.4868099689483643, + "learning_rate": 0.0004853190287972897, + "loss": 1.5841, "step": 780 }, { "epoch": 0.1486918878223226, - "grad_norm": 4.555196285247803, - "learning_rate": 2.9107848673066065e-05, - "loss": 3.6608, + "grad_norm": 2.2631711959838867, + "learning_rate": 0.0004851308112177678, + "loss": 1.2718, "step": 790 }, { "epoch": 0.1505740636175419, - "grad_norm": 4.264336585998535, - "learning_rate": 2.909655561829475e-05, - "loss": 3.6538, + "grad_norm": 1.6401269435882568, + "learning_rate": 0.00048494259363824585, + "loss": 1.3654, "step": 800 }, { "epoch": 0.15245623941276115, - "grad_norm": 4.192948818206787, - "learning_rate": 2.9085262563523434e-05, - "loss": 3.5788, + "grad_norm": 3.559016466140747, + "learning_rate": 0.00048475437605872393, + "loss": 1.4383, "step": 810 }, { "epoch": 0.15433841520798042, - "grad_norm": 4.293764114379883, - "learning_rate": 2.907396950875212e-05, - "loss": 3.6034, + "grad_norm": 2.363433361053467, + "learning_rate": 0.000484566158479202, + "loss": 1.3603, "step": 820 }, { "epoch": 0.1562205910031997, - "grad_norm": 3.505387544631958, - "learning_rate": 2.90626764539808e-05, - "loss": 3.6135, + "grad_norm": 2.9046192169189453, + "learning_rate": 0.00048437794089968003, + "loss": 1.3902, "step": 830 }, { "epoch": 0.15810276679841898, - "grad_norm": 3.4899723529815674, - "learning_rate": 2.9051383399209485e-05, - "loss": 3.7187, + "grad_norm": 3.304086923599243, + "learning_rate": 0.0004841897233201581, + "loss": 1.6489, "step": 840 }, { "epoch": 0.15998494259363824, - "grad_norm": 5.375092029571533, - "learning_rate": 2.904009034443817e-05, - "loss": 3.6486, + "grad_norm": 2.0543618202209473, + "learning_rate": 0.0004840015057406362, + "loss": 1.4324, "step": 850 }, { "epoch": 0.1618671183888575, - "grad_norm": 4.234030246734619, - "learning_rate": 2.9028797289666854e-05, - "loss": 3.7097, + "grad_norm": 3.817758560180664, + "learning_rate": 0.00048381328816111426, + "loss": 1.497, "step": 860 }, { "epoch": 0.1637492941840768, - "grad_norm": 3.9143476486206055, - "learning_rate": 2.901750423489554e-05, - "loss": 3.4356, + "grad_norm": 3.9704749584198, + "learning_rate": 0.00048362507058159233, + "loss": 1.192, "step": 870 }, { "epoch": 0.16563146997929606, - "grad_norm": 3.578145742416382, - "learning_rate": 2.9006211180124223e-05, - "loss": 3.4969, + "grad_norm": 2.5795369148254395, + "learning_rate": 0.0004834368530020704, + "loss": 1.2547, "step": 880 }, { "epoch": 0.16751364577451533, - "grad_norm": 4.751120567321777, - "learning_rate": 2.899491812535291e-05, - "loss": 3.5853, + "grad_norm": 3.441765069961548, + "learning_rate": 0.0004832486354225485, + "loss": 1.396, "step": 890 }, { "epoch": 0.16939582156973462, - "grad_norm": 4.476664066314697, - "learning_rate": 2.8983625070581595e-05, - "loss": 3.5424, + "grad_norm": 2.1846425533294678, + "learning_rate": 0.00048306041784302656, + "loss": 1.4624, "step": 900 }, { "epoch": 0.1712779973649539, - "grad_norm": 3.4918501377105713, - "learning_rate": 2.8972332015810278e-05, - "loss": 3.5004, + "grad_norm": 2.7610526084899902, + "learning_rate": 0.00048287220026350464, + "loss": 1.249, "step": 910 }, { "epoch": 0.17316017316017315, - "grad_norm": 9.847784996032715, - "learning_rate": 2.896103896103896e-05, - "loss": 3.5159, + "grad_norm": 3.498777151107788, + "learning_rate": 0.00048268398268398266, + "loss": 1.2124, "step": 920 }, { "epoch": 0.17504234895539245, - "grad_norm": 3.551976442337036, - "learning_rate": 2.8949745906267647e-05, - "loss": 3.475, + "grad_norm": 2.5937039852142334, + "learning_rate": 0.00048249576510446074, + "loss": 1.2934, "step": 930 }, { "epoch": 0.1769245247506117, - "grad_norm": 4.220102787017822, - "learning_rate": 2.893845285149633e-05, - "loss": 3.4396, + "grad_norm": 3.5550036430358887, + "learning_rate": 0.0004823075475249388, + "loss": 1.2937, "step": 940 }, { "epoch": 0.17880670054583098, - "grad_norm": 6.077229976654053, - "learning_rate": 2.8927159796725015e-05, - "loss": 3.448, + "grad_norm": 2.0531303882598877, + "learning_rate": 0.0004821193299454169, + "loss": 1.3274, "step": 950 }, { "epoch": 0.18068887634105024, - "grad_norm": 4.203821659088135, - "learning_rate": 2.8915866741953698e-05, - "loss": 3.4436, + "grad_norm": 2.3458123207092285, + "learning_rate": 0.00048193111236589497, + "loss": 1.3146, "step": 960 }, { "epoch": 0.18257105213626953, - "grad_norm": 3.7120420932769775, - "learning_rate": 2.8904573687182384e-05, - "loss": 3.416, + "grad_norm": 4.257774829864502, + "learning_rate": 0.00048174289478637304, + "loss": 1.2663, "step": 970 }, { "epoch": 0.1844532279314888, - "grad_norm": 4.179491996765137, - "learning_rate": 2.889328063241107e-05, - "loss": 3.3503, + "grad_norm": 4.102976322174072, + "learning_rate": 0.0004815546772068511, + "loss": 1.0864, "step": 980 }, { "epoch": 0.18633540372670807, - "grad_norm": 3.9055488109588623, - "learning_rate": 2.8881987577639753e-05, - "loss": 3.4424, + "grad_norm": 2.92472505569458, + "learning_rate": 0.0004813664596273292, + "loss": 1.5308, "step": 990 }, { "epoch": 0.18821757952192736, - "grad_norm": 3.6015703678131104, - "learning_rate": 2.887069452286844e-05, - "loss": 3.3619, + "grad_norm": 4.026129722595215, + "learning_rate": 0.00048117824204780733, + "loss": 1.2942, "step": 1000 }, { "epoch": 0.19009975531714662, - "grad_norm": 5.409115314483643, - "learning_rate": 2.8859401468097118e-05, - "loss": 3.4609, + "grad_norm": 3.1226966381073, + "learning_rate": 0.00048099002446828535, + "loss": 1.3299, "step": 1010 }, { "epoch": 0.1919819311123659, - "grad_norm": 3.458120584487915, - "learning_rate": 2.8848108413325804e-05, - "loss": 3.4654, + "grad_norm": 1.2881345748901367, + "learning_rate": 0.0004808018068887634, + "loss": 1.3436, "step": 1020 }, { "epoch": 0.19386410690758518, - "grad_norm": 3.861100673675537, - "learning_rate": 2.883681535855449e-05, - "loss": 3.3997, + "grad_norm": 4.3795366287231445, + "learning_rate": 0.0004806135893092415, + "loss": 1.3336, "step": 1030 }, { "epoch": 0.19574628270280445, - "grad_norm": 8.975924491882324, - "learning_rate": 2.8825522303783173e-05, - "loss": 3.297, + "grad_norm": 6.313399791717529, + "learning_rate": 0.0004804253717297196, + "loss": 0.9824, "step": 1040 }, { "epoch": 0.1976284584980237, - "grad_norm": 9.686234474182129, - "learning_rate": 2.881422924901186e-05, - "loss": 3.3386, + "grad_norm": 6.534666061401367, + "learning_rate": 0.00048023715415019766, + "loss": 1.3169, "step": 1050 }, { "epoch": 0.19951063429324298, - "grad_norm": 6.7616658210754395, - "learning_rate": 2.8802936194240542e-05, - "loss": 3.3721, + "grad_norm": 4.672704219818115, + "learning_rate": 0.00048004893657067573, + "loss": 1.1771, "step": 1060 }, { "epoch": 0.20139281008846227, - "grad_norm": 3.126573324203491, - "learning_rate": 2.8791643139469228e-05, - "loss": 3.4284, + "grad_norm": 1.6210001707077026, + "learning_rate": 0.0004798607189911538, + "loss": 1.2003, "step": 1070 }, { "epoch": 0.20327498588368154, - "grad_norm": 4.771273136138916, - "learning_rate": 2.8780350084697914e-05, - "loss": 3.2652, + "grad_norm": 2.4551827907562256, + "learning_rate": 0.0004796725014116319, + "loss": 1.3198, "step": 1080 }, { "epoch": 0.2051571616789008, - "grad_norm": 3.69577956199646, - "learning_rate": 2.8769057029926593e-05, - "loss": 3.3577, + "grad_norm": 4.158587455749512, + "learning_rate": 0.0004794842838321099, + "loss": 1.2115, "step": 1090 }, { "epoch": 0.2070393374741201, - "grad_norm": 4.15922212600708, - "learning_rate": 2.875776397515528e-05, - "loss": 3.4093, + "grad_norm": 2.0976548194885254, + "learning_rate": 0.000479296066252588, + "loss": 1.4164, "step": 1100 }, { "epoch": 0.20892151326933936, - "grad_norm": 15.523391723632812, - "learning_rate": 2.8746470920383962e-05, - "loss": 3.2856, + "grad_norm": 6.073217391967773, + "learning_rate": 0.00047910784867306606, + "loss": 1.0083, "step": 1110 }, { "epoch": 0.21080368906455862, - "grad_norm": 3.744535207748413, - "learning_rate": 2.8735177865612648e-05, - "loss": 3.344, + "grad_norm": 1.7033714056015015, + "learning_rate": 0.00047891963109354414, + "loss": 1.1871, "step": 1120 }, { "epoch": 0.2126858648597779, - "grad_norm": 4.708155632019043, - "learning_rate": 2.8723884810841334e-05, - "loss": 3.2039, + "grad_norm": 5.1013875007629395, + "learning_rate": 0.0004787314135140222, + "loss": 0.8595, "step": 1130 }, { "epoch": 0.21456804065499718, - "grad_norm": 3.991323232650757, - "learning_rate": 2.8712591756070017e-05, - "loss": 3.276, + "grad_norm": 5.065658092498779, + "learning_rate": 0.0004785431959345003, + "loss": 1.1723, "step": 1140 }, { "epoch": 0.21645021645021645, - "grad_norm": 6.3713154792785645, - "learning_rate": 2.8701298701298703e-05, - "loss": 3.3719, + "grad_norm": 2.3889753818511963, + "learning_rate": 0.00047835497835497837, + "loss": 1.3787, "step": 1150 }, { "epoch": 0.2183323922454357, - "grad_norm": 3.9468255043029785, - "learning_rate": 2.869000564652739e-05, - "loss": 3.2762, + "grad_norm": 2.41831374168396, + "learning_rate": 0.00047816676077545644, + "loss": 1.1492, "step": 1160 }, { "epoch": 0.220214568040655, - "grad_norm": 3.5731687545776367, - "learning_rate": 2.8678712591756072e-05, - "loss": 3.1112, + "grad_norm": 0.9150934815406799, + "learning_rate": 0.0004779785431959345, + "loss": 0.9076, "step": 1170 }, { "epoch": 0.22209674383587427, - "grad_norm": 5.240946292877197, - "learning_rate": 2.8667419536984754e-05, - "loss": 3.1546, + "grad_norm": 2.3671467304229736, + "learning_rate": 0.00047779032561641254, + "loss": 0.8078, "step": 1180 }, { "epoch": 0.22397891963109354, - "grad_norm": 3.877687931060791, - "learning_rate": 2.8656126482213437e-05, - "loss": 3.2343, + "grad_norm": 2.145510673522949, + "learning_rate": 0.0004776021080368906, + "loss": 1.1798, "step": 1190 }, { "epoch": 0.22586109542631283, - "grad_norm": 4.6856584548950195, - "learning_rate": 2.8644833427442123e-05, - "loss": 3.2184, + "grad_norm": 2.422679901123047, + "learning_rate": 0.0004774138904573687, + "loss": 1.2867, "step": 1200 }, { "epoch": 0.2277432712215321, - "grad_norm": 2.8222122192382812, - "learning_rate": 2.863354037267081e-05, - "loss": 3.1481, + "grad_norm": 1.5619584321975708, + "learning_rate": 0.00047722567287784677, + "loss": 0.9626, "step": 1210 }, { "epoch": 0.22962544701675136, - "grad_norm": 2.7809884548187256, - "learning_rate": 2.8622247317899492e-05, - "loss": 3.21, + "grad_norm": 1.2849690914154053, + "learning_rate": 0.0004770374552983249, + "loss": 1.196, "step": 1220 }, { "epoch": 0.23150762281197063, - "grad_norm": 4.449065208435059, - "learning_rate": 2.8610954263128178e-05, - "loss": 3.1756, + "grad_norm": 4.115751266479492, + "learning_rate": 0.000476849237718803, + "loss": 1.1092, "step": 1230 }, { "epoch": 0.23338979860718992, - "grad_norm": 5.27800989151001, - "learning_rate": 2.859966120835686e-05, - "loss": 3.1003, + "grad_norm": 3.9603044986724854, + "learning_rate": 0.00047666102013928105, + "loss": 1.1435, "step": 1240 }, { "epoch": 0.23527197440240918, - "grad_norm": 6.26723051071167, - "learning_rate": 2.8588368153585547e-05, - "loss": 3.3139, + "grad_norm": 3.748976230621338, + "learning_rate": 0.00047647280255975913, + "loss": 1.3499, "step": 1250 }, { "epoch": 0.23715415019762845, - "grad_norm": 10.930907249450684, - "learning_rate": 2.8577075098814233e-05, - "loss": 2.9757, + "grad_norm": 3.044076919555664, + "learning_rate": 0.0004762845849802372, + "loss": 0.8821, "step": 1260 }, { "epoch": 0.23903632599284774, - "grad_norm": 8.510289192199707, - "learning_rate": 2.8565782044042912e-05, - "loss": 3.2473, + "grad_norm": 3.245182514190674, + "learning_rate": 0.00047609636740071523, + "loss": 1.3718, "step": 1270 }, { "epoch": 0.240918501788067, - "grad_norm": 9.550591468811035, - "learning_rate": 2.8554488989271598e-05, - "loss": 3.0974, + "grad_norm": 2.004108428955078, + "learning_rate": 0.0004759081498211933, + "loss": 1.0094, "step": 1280 }, { "epoch": 0.24280067758328627, - "grad_norm": 3.702929735183716, - "learning_rate": 2.854319593450028e-05, - "loss": 3.0416, + "grad_norm": 3.3926773071289062, + "learning_rate": 0.0004757199322416714, + "loss": 1.1662, "step": 1290 }, { "epoch": 0.24468285337850557, - "grad_norm": 3.1774191856384277, - "learning_rate": 2.8531902879728967e-05, - "loss": 3.2329, + "grad_norm": 1.7959283590316772, + "learning_rate": 0.00047553171466214946, + "loss": 1.2912, "step": 1300 }, { "epoch": 0.24656502917372483, - "grad_norm": 3.9982802867889404, - "learning_rate": 2.8520609824957653e-05, - "loss": 3.145, + "grad_norm": 3.153946876525879, + "learning_rate": 0.00047534349708262753, + "loss": 1.0199, "step": 1310 }, { "epoch": 0.2484472049689441, - "grad_norm": 10.488585472106934, - "learning_rate": 2.8509316770186336e-05, - "loss": 3.1751, + "grad_norm": 3.7788596153259277, + "learning_rate": 0.0004751552795031056, + "loss": 1.4888, "step": 1320 }, { "epoch": 0.2503293807641634, - "grad_norm": 4.030777454376221, - "learning_rate": 2.8498023715415022e-05, - "loss": 3.0312, + "grad_norm": 2.140678644180298, + "learning_rate": 0.0004749670619235837, + "loss": 1.2057, "step": 1330 }, { "epoch": 0.25221155655938265, - "grad_norm": 5.116481304168701, - "learning_rate": 2.8486730660643705e-05, - "loss": 3.0881, + "grad_norm": 2.290731430053711, + "learning_rate": 0.00047477884434406176, + "loss": 1.2266, "step": 1340 }, { "epoch": 0.2540937323546019, - "grad_norm": 6.651054382324219, - "learning_rate": 2.847543760587239e-05, - "loss": 2.9569, + "grad_norm": 7.3565545082092285, + "learning_rate": 0.00047459062676453984, + "loss": 0.9182, "step": 1350 }, { "epoch": 0.2559759081498212, - "grad_norm": 7.519700050354004, - "learning_rate": 2.8464144551101073e-05, - "loss": 3.1106, + "grad_norm": 2.279402732849121, + "learning_rate": 0.00047440240918501786, + "loss": 1.1294, "step": 1360 }, { "epoch": 0.25785808394504045, - "grad_norm": 5.690998554229736, - "learning_rate": 2.8452851496329756e-05, - "loss": 3.176, + "grad_norm": 2.3557279109954834, + "learning_rate": 0.00047421419160549594, + "loss": 1.3733, "step": 1370 }, { "epoch": 0.2597402597402597, - "grad_norm": 7.928246021270752, - "learning_rate": 2.8441558441558442e-05, - "loss": 3.0882, + "grad_norm": 2.3519694805145264, + "learning_rate": 0.000474025974025974, + "loss": 0.7605, "step": 1380 }, { "epoch": 0.26162243553547904, - "grad_norm": 4.824883460998535, - "learning_rate": 2.8430265386787128e-05, - "loss": 3.0282, + "grad_norm": 5.51458215713501, + "learning_rate": 0.0004738377564464521, + "loss": 1.2157, "step": 1390 }, { "epoch": 0.2635046113306983, - "grad_norm": 5.6374640464782715, - "learning_rate": 2.841897233201581e-05, - "loss": 2.8817, + "grad_norm": 3.193882942199707, + "learning_rate": 0.00047364953886693017, + "loss": 0.8343, "step": 1400 }, { "epoch": 0.26538678712591757, - "grad_norm": 5.54218053817749, - "learning_rate": 2.8407679277244497e-05, - "loss": 3.0196, + "grad_norm": 2.5685150623321533, + "learning_rate": 0.00047346132128740824, + "loss": 0.8767, "step": 1410 }, { "epoch": 0.26726896292113683, - "grad_norm": 3.4347853660583496, - "learning_rate": 2.839638622247318e-05, - "loss": 2.9811, + "grad_norm": 2.659619092941284, + "learning_rate": 0.0004732731037078863, + "loss": 1.1711, "step": 1420 }, { "epoch": 0.2691511387163561, - "grad_norm": 3.8664019107818604, - "learning_rate": 2.8385093167701866e-05, - "loss": 2.9376, + "grad_norm": 2.564542055130005, + "learning_rate": 0.0004730848861283644, + "loss": 1.0881, "step": 1430 }, { "epoch": 0.27103331451157536, - "grad_norm": 5.806457996368408, - "learning_rate": 2.837380011293055e-05, - "loss": 3.1184, + "grad_norm": 2.108336925506592, + "learning_rate": 0.0004728966685488424, + "loss": 1.2762, "step": 1440 }, { "epoch": 0.27291549030679463, - "grad_norm": 4.753952980041504, - "learning_rate": 2.836250705815923e-05, - "loss": 2.9645, + "grad_norm": 2.0329091548919678, + "learning_rate": 0.00047270845096932055, + "loss": 0.9341, "step": 1450 }, { "epoch": 0.27479766610201395, - "grad_norm": 4.03913688659668, - "learning_rate": 2.8351214003387917e-05, - "loss": 2.8864, + "grad_norm": 1.2084460258483887, + "learning_rate": 0.0004725202333897986, + "loss": 0.7881, "step": 1460 }, { "epoch": 0.2766798418972332, - "grad_norm": 8.83630657196045, - "learning_rate": 2.83399209486166e-05, - "loss": 3.0768, + "grad_norm": 1.7741458415985107, + "learning_rate": 0.0004723320158102767, + "loss": 1.1409, "step": 1470 }, { "epoch": 0.2785620176924525, - "grad_norm": 7.6859540939331055, - "learning_rate": 2.8328627893845286e-05, - "loss": 2.915, + "grad_norm": 3.576479911804199, + "learning_rate": 0.0004721437982307548, + "loss": 1.0313, "step": 1480 }, { "epoch": 0.28044419348767174, - "grad_norm": 4.7729811668396, - "learning_rate": 2.8317334839073972e-05, - "loss": 2.9354, + "grad_norm": 4.221381664276123, + "learning_rate": 0.00047195558065123286, + "loss": 1.138, "step": 1490 }, { "epoch": 0.282326369282891, - "grad_norm": 6.917285442352295, - "learning_rate": 2.8306041784302655e-05, - "loss": 2.8416, + "grad_norm": 3.0888664722442627, + "learning_rate": 0.00047176736307171093, + "loss": 1.0203, "step": 1500 }, { "epoch": 0.2842085450781103, - "grad_norm": 4.168449878692627, - "learning_rate": 2.829474872953134e-05, - "loss": 3.0573, + "grad_norm": 3.0183537006378174, + "learning_rate": 0.000471579145492189, + "loss": 1.2671, "step": 1510 }, { "epoch": 0.2860907208733296, - "grad_norm": 3.04876708984375, - "learning_rate": 2.8283455674760023e-05, - "loss": 2.9579, + "grad_norm": 0.9898282289505005, + "learning_rate": 0.0004713909279126671, + "loss": 1.2029, "step": 1520 }, { "epoch": 0.28797289666854886, - "grad_norm": 6.365512847900391, - "learning_rate": 2.8272162619988706e-05, - "loss": 2.9621, + "grad_norm": 2.6439521312713623, + "learning_rate": 0.0004712027103331451, + "loss": 1.047, "step": 1530 }, { "epoch": 0.2898550724637681, - "grad_norm": 4.556066989898682, - "learning_rate": 2.8260869565217392e-05, - "loss": 2.9296, + "grad_norm": 2.931302070617676, + "learning_rate": 0.0004710144927536232, + "loss": 1.4159, "step": 1540 }, { "epoch": 0.2917372482589874, - "grad_norm": 2.86582088470459, - "learning_rate": 2.8249576510446075e-05, - "loss": 2.7293, + "grad_norm": 0.5969520807266235, + "learning_rate": 0.00047082627517410126, + "loss": 0.7143, "step": 1550 }, { "epoch": 0.29361942405420666, - "grad_norm": 4.107886791229248, - "learning_rate": 2.823828345567476e-05, - "loss": 3.0, + "grad_norm": 2.982952117919922, + "learning_rate": 0.00047063805759457934, + "loss": 1.2682, "step": 1560 }, { "epoch": 0.2955015998494259, - "grad_norm": 5.17814826965332, - "learning_rate": 2.8226990400903444e-05, - "loss": 2.9396, + "grad_norm": 4.021246433258057, + "learning_rate": 0.0004704498400150574, + "loss": 1.1403, "step": 1570 }, { "epoch": 0.2973837756446452, - "grad_norm": 3.8069369792938232, - "learning_rate": 2.821569734613213e-05, - "loss": 2.8673, + "grad_norm": 1.7243213653564453, + "learning_rate": 0.0004702616224355355, + "loss": 0.875, "step": 1580 }, { "epoch": 0.2992659514398645, - "grad_norm": 4.464517593383789, - "learning_rate": 2.8204404291360816e-05, - "loss": 2.8787, + "grad_norm": 1.9460840225219727, + "learning_rate": 0.00047007340485601357, + "loss": 1.1448, "step": 1590 }, { "epoch": 0.3011481272350838, - "grad_norm": 6.4331536293029785, - "learning_rate": 2.81931112365895e-05, - "loss": 2.7073, + "grad_norm": 3.757749080657959, + "learning_rate": 0.00046988518727649164, + "loss": 1.0793, "step": 1600 }, { "epoch": 0.30303030303030304, - "grad_norm": 5.987295627593994, - "learning_rate": 2.8181818181818185e-05, - "loss": 2.7962, + "grad_norm": 1.5117601156234741, + "learning_rate": 0.0004696969696969697, + "loss": 1.0534, "step": 1610 }, { "epoch": 0.3049124788255223, - "grad_norm": 5.483144760131836, - "learning_rate": 2.8170525127046864e-05, - "loss": 2.9668, + "grad_norm": 5.415975093841553, + "learning_rate": 0.00046950875211744774, + "loss": 1.2198, "step": 1620 }, { "epoch": 0.30679465462074157, - "grad_norm": 3.3265535831451416, - "learning_rate": 2.815923207227555e-05, - "loss": 2.8663, + "grad_norm": 3.129091739654541, + "learning_rate": 0.0004693205345379258, + "loss": 1.1713, "step": 1630 }, { "epoch": 0.30867683041596083, - "grad_norm": 6.472630500793457, - "learning_rate": 2.8147939017504236e-05, - "loss": 2.8595, + "grad_norm": 2.059892177581787, + "learning_rate": 0.0004691323169584039, + "loss": 1.0626, "step": 1640 }, { "epoch": 0.3105590062111801, - "grad_norm": 4.674870491027832, - "learning_rate": 2.813664596273292e-05, - "loss": 2.9178, + "grad_norm": 3.697364091873169, + "learning_rate": 0.00046894409937888197, + "loss": 1.1652, "step": 1650 }, { "epoch": 0.3124411820063994, - "grad_norm": 4.7341628074646, - "learning_rate": 2.8125352907961605e-05, - "loss": 2.7904, + "grad_norm": 3.6531107425689697, + "learning_rate": 0.0004687558817993601, + "loss": 1.1815, "step": 1660 }, { "epoch": 0.3143233578016187, - "grad_norm": 3.6063005924224854, - "learning_rate": 2.811405985319029e-05, - "loss": 2.8127, + "grad_norm": 2.3599324226379395, + "learning_rate": 0.0004685676642198382, + "loss": 1.1458, "step": 1670 }, { "epoch": 0.31620553359683795, - "grad_norm": 10.121143341064453, - "learning_rate": 2.8102766798418974e-05, - "loss": 2.8588, + "grad_norm": 1.7624262571334839, + "learning_rate": 0.00046837944664031625, + "loss": 1.018, "step": 1680 }, { "epoch": 0.3180877093920572, - "grad_norm": 5.036760330200195, - "learning_rate": 2.809147374364766e-05, - "loss": 2.7812, + "grad_norm": 0.9360486268997192, + "learning_rate": 0.00046819122906079433, + "loss": 1.07, "step": 1690 }, { "epoch": 0.3199698851872765, - "grad_norm": 8.621694564819336, - "learning_rate": 2.808018068887634e-05, - "loss": 2.8555, + "grad_norm": 4.804688453674316, + "learning_rate": 0.00046800301148127235, + "loss": 1.2024, "step": 1700 }, { "epoch": 0.32185206098249575, - "grad_norm": 6.541132926940918, - "learning_rate": 2.8068887634105025e-05, - "loss": 2.794, + "grad_norm": 2.2155728340148926, + "learning_rate": 0.00046781479390175043, + "loss": 1.0527, "step": 1710 }, { "epoch": 0.323734236777715, - "grad_norm": 5.8154425621032715, - "learning_rate": 2.805759457933371e-05, - "loss": 2.6459, + "grad_norm": 2.3131532669067383, + "learning_rate": 0.0004676265763222285, + "loss": 0.8434, "step": 1720 }, { "epoch": 0.32561641257293433, - "grad_norm": 3.9110445976257324, - "learning_rate": 2.8046301524562394e-05, - "loss": 2.7375, + "grad_norm": 6.129045486450195, + "learning_rate": 0.0004674383587427066, + "loss": 1.0509, "step": 1730 }, { "epoch": 0.3274985883681536, - "grad_norm": 3.47908878326416, - "learning_rate": 2.803500846979108e-05, - "loss": 2.8092, + "grad_norm": 2.4154810905456543, + "learning_rate": 0.00046725014116318466, + "loss": 1.0117, "step": 1740 }, { "epoch": 0.32938076416337286, - "grad_norm": 4.073140621185303, - "learning_rate": 2.8023715415019763e-05, - "loss": 2.7205, + "grad_norm": 1.0004245042800903, + "learning_rate": 0.00046706192358366273, + "loss": 1.0944, "step": 1750 }, { "epoch": 0.33126293995859213, - "grad_norm": 9.20858097076416, - "learning_rate": 2.801242236024845e-05, - "loss": 2.6537, + "grad_norm": 2.5676519870758057, + "learning_rate": 0.0004668737060041408, + "loss": 0.9929, "step": 1760 }, { "epoch": 0.3331451157538114, - "grad_norm": 3.811323881149292, - "learning_rate": 2.8001129305477135e-05, - "loss": 2.7925, + "grad_norm": 3.247866153717041, + "learning_rate": 0.0004666854884246189, + "loss": 1.089, "step": 1770 }, { "epoch": 0.33502729154903066, - "grad_norm": 3.6044814586639404, - "learning_rate": 2.7989836250705817e-05, - "loss": 2.7287, + "grad_norm": 2.009756088256836, + "learning_rate": 0.00046649727084509696, + "loss": 1.0313, "step": 1780 }, { "epoch": 0.33690946734425, - "grad_norm": 5.667727470397949, - "learning_rate": 2.79785431959345e-05, - "loss": 2.5265, + "grad_norm": 1.9623408317565918, + "learning_rate": 0.000466309053265575, + "loss": 0.7833, "step": 1790 }, { "epoch": 0.33879164313946925, - "grad_norm": 4.497561931610107, - "learning_rate": 2.7967250141163183e-05, - "loss": 2.5919, + "grad_norm": 1.7636711597442627, + "learning_rate": 0.00046612083568605306, + "loss": 0.7889, "step": 1800 }, { "epoch": 0.3406738189346885, - "grad_norm": 6.864144802093506, - "learning_rate": 2.795595708639187e-05, - "loss": 2.7344, + "grad_norm": 3.7700984477996826, + "learning_rate": 0.00046593261810653114, + "loss": 1.3347, "step": 1810 }, { "epoch": 0.3425559947299078, - "grad_norm": 5.193475723266602, - "learning_rate": 2.7944664031620555e-05, - "loss": 2.7341, + "grad_norm": 1.4358450174331665, + "learning_rate": 0.0004657444005270092, + "loss": 0.9798, "step": 1820 }, { "epoch": 0.34443817052512704, - "grad_norm": 10.060043334960938, - "learning_rate": 2.7933370976849238e-05, - "loss": 2.6031, + "grad_norm": 4.207058906555176, + "learning_rate": 0.0004655561829474873, + "loss": 1.0304, "step": 1830 }, { "epoch": 0.3463203463203463, - "grad_norm": 7.921087265014648, - "learning_rate": 2.7922077922077924e-05, - "loss": 2.6902, + "grad_norm": 2.066248655319214, + "learning_rate": 0.00046536796536796537, + "loss": 0.9523, "step": 1840 }, { "epoch": 0.34820252211556557, - "grad_norm": 5.25139856338501, - "learning_rate": 2.7910784867306606e-05, - "loss": 2.7383, + "grad_norm": 2.937966823577881, + "learning_rate": 0.00046517974778844344, + "loss": 1.2679, "step": 1850 }, { "epoch": 0.3500846979107849, - "grad_norm": 9.80843734741211, - "learning_rate": 2.7899491812535292e-05, - "loss": 2.5868, + "grad_norm": 4.462133407592773, + "learning_rate": 0.0004649915302089215, + "loss": 0.9922, "step": 1860 }, { "epoch": 0.35196687370600416, - "grad_norm": 8.038129806518555, - "learning_rate": 2.788819875776398e-05, - "loss": 2.6574, + "grad_norm": 2.355633020401001, + "learning_rate": 0.0004648033126293996, + "loss": 1.0605, "step": 1870 }, { "epoch": 0.3538490495012234, - "grad_norm": 5.235549449920654, - "learning_rate": 2.7876905702992658e-05, - "loss": 2.623, + "grad_norm": 3.410402297973633, + "learning_rate": 0.0004646150950498776, + "loss": 1.1485, "step": 1880 }, { "epoch": 0.3557312252964427, - "grad_norm": 3.0741400718688965, - "learning_rate": 2.7865612648221344e-05, - "loss": 2.6216, + "grad_norm": 4.6109113693237305, + "learning_rate": 0.00046442687747035575, + "loss": 1.2647, "step": 1890 }, { "epoch": 0.35761340109166195, - "grad_norm": 3.884474515914917, - "learning_rate": 2.785431959345003e-05, - "loss": 2.5854, + "grad_norm": 3.604688882827759, + "learning_rate": 0.0004642386598908338, + "loss": 0.8707, "step": 1900 }, { "epoch": 0.3594955768868812, - "grad_norm": 3.7831873893737793, - "learning_rate": 2.7843026538678713e-05, - "loss": 2.3631, + "grad_norm": 5.367961406707764, + "learning_rate": 0.0004640504423113119, + "loss": 0.9146, "step": 1910 }, { "epoch": 0.3613777526821005, - "grad_norm": 6.738157272338867, - "learning_rate": 2.78317334839074e-05, - "loss": 2.6551, + "grad_norm": 2.2205069065093994, + "learning_rate": 0.00046386222473179, + "loss": 1.1418, "step": 1920 }, { "epoch": 0.3632599284773198, - "grad_norm": 5.1181793212890625, - "learning_rate": 2.782044042913608e-05, - "loss": 2.6403, + "grad_norm": 3.098167896270752, + "learning_rate": 0.00046367400715226806, + "loss": 1.1865, "step": 1930 }, { "epoch": 0.36514210427253907, - "grad_norm": 8.148448944091797, - "learning_rate": 2.7809147374364768e-05, - "loss": 2.4368, + "grad_norm": 1.8569923639297485, + "learning_rate": 0.00046348578957274613, + "loss": 0.7607, "step": 1940 }, { "epoch": 0.36702428006775834, - "grad_norm": 14.908823013305664, - "learning_rate": 2.7797854319593454e-05, - "loss": 2.7317, + "grad_norm": 4.62168025970459, + "learning_rate": 0.0004632975719932242, + "loss": 1.2438, "step": 1950 }, { "epoch": 0.3689064558629776, - "grad_norm": 4.497227668762207, - "learning_rate": 2.7786561264822133e-05, - "loss": 2.5193, + "grad_norm": 4.406530380249023, + "learning_rate": 0.00046310935441370223, + "loss": 0.9185, "step": 1960 }, { "epoch": 0.37078863165819687, - "grad_norm": 7.139688491821289, - "learning_rate": 2.777526821005082e-05, - "loss": 2.6937, + "grad_norm": 2.9073426723480225, + "learning_rate": 0.0004629211368341803, + "loss": 0.9801, "step": 1970 }, { "epoch": 0.37267080745341613, - "grad_norm": 11.658743858337402, - "learning_rate": 2.77639751552795e-05, - "loss": 2.3396, + "grad_norm": 7.176252365112305, + "learning_rate": 0.0004627329192546584, + "loss": 0.7723, "step": 1980 }, { "epoch": 0.3745529832486354, - "grad_norm": 11.21719741821289, - "learning_rate": 2.7752682100508188e-05, - "loss": 2.6433, + "grad_norm": 2.731356382369995, + "learning_rate": 0.00046254470167513646, + "loss": 1.2079, "step": 1990 }, { "epoch": 0.3764351590438547, - "grad_norm": 3.650562047958374, - "learning_rate": 2.7741389045736874e-05, - "loss": 2.5423, + "grad_norm": 1.2026039361953735, + "learning_rate": 0.00046235648409561454, + "loss": 1.0573, "step": 2000 }, { "epoch": 0.378317334839074, - "grad_norm": 6.978822231292725, - "learning_rate": 2.7730095990965556e-05, - "loss": 2.4149, + "grad_norm": 3.050309419631958, + "learning_rate": 0.0004621682665160926, + "loss": 0.9802, "step": 2010 }, { "epoch": 0.38019951063429325, - "grad_norm": 12.115926742553711, - "learning_rate": 2.7718802936194243e-05, - "loss": 2.6864, + "grad_norm": 3.7857017517089844, + "learning_rate": 0.0004619800489365707, + "loss": 1.2937, "step": 2020 }, { "epoch": 0.3820816864295125, - "grad_norm": 12.849691390991211, - "learning_rate": 2.7707509881422925e-05, - "loss": 2.4976, + "grad_norm": 3.345871925354004, + "learning_rate": 0.00046179183135704877, + "loss": 1.0949, "step": 2030 }, { "epoch": 0.3839638622247318, - "grad_norm": 7.091000556945801, - "learning_rate": 2.769621682665161e-05, - "loss": 2.3657, + "grad_norm": 3.366966485977173, + "learning_rate": 0.00046160361377752684, + "loss": 0.9634, "step": 2040 }, { "epoch": 0.38584603801995104, - "grad_norm": 7.3910746574401855, - "learning_rate": 2.7684923771880294e-05, - "loss": 2.4003, + "grad_norm": 2.2242603302001953, + "learning_rate": 0.00046141539619800486, + "loss": 0.8493, "step": 2050 }, { "epoch": 0.38772821381517036, - "grad_norm": 4.848266124725342, - "learning_rate": 2.7673630717108977e-05, - "loss": 2.6143, + "grad_norm": 2.916806221008301, + "learning_rate": 0.00046122717861848294, + "loss": 1.1871, "step": 2060 }, { "epoch": 0.38961038961038963, - "grad_norm": 5.94016170501709, - "learning_rate": 2.7662337662337663e-05, - "loss": 2.4341, + "grad_norm": 4.017305850982666, + "learning_rate": 0.000461038961038961, + "loss": 0.925, "step": 2070 }, { "epoch": 0.3914925654056089, - "grad_norm": 8.351221084594727, - "learning_rate": 2.7651044607566345e-05, - "loss": 2.4935, + "grad_norm": 3.971277952194214, + "learning_rate": 0.0004608507434594391, + "loss": 0.962, "step": 2080 }, { "epoch": 0.39337474120082816, - "grad_norm": 8.837783813476562, - "learning_rate": 2.763975155279503e-05, - "loss": 2.3477, + "grad_norm": 4.612211227416992, + "learning_rate": 0.00046066252587991717, + "loss": 0.8731, "step": 2090 }, { "epoch": 0.3952569169960474, - "grad_norm": 9.10621166229248, - "learning_rate": 2.7628458498023718e-05, - "loss": 2.4406, + "grad_norm": 2.1569032669067383, + "learning_rate": 0.00046047430830039525, + "loss": 0.839, "step": 2100 }, { "epoch": 0.3971390927912667, - "grad_norm": 9.467292785644531, - "learning_rate": 2.76171654432524e-05, - "loss": 2.6356, + "grad_norm": 2.8226122856140137, + "learning_rate": 0.0004602860907208734, + "loss": 1.1247, "step": 2110 }, { "epoch": 0.39902126858648596, - "grad_norm": 5.230591297149658, - "learning_rate": 2.7605872388481086e-05, - "loss": 2.4345, + "grad_norm": 5.517162322998047, + "learning_rate": 0.00046009787314135145, + "loss": 1.1889, "step": 2120 }, { "epoch": 0.4009034443817053, - "grad_norm": 5.1924591064453125, - "learning_rate": 2.759457933370977e-05, - "loss": 2.5366, + "grad_norm": 1.8455901145935059, + "learning_rate": 0.00045990965556182953, + "loss": 0.9728, "step": 2130 }, { "epoch": 0.40278562017692454, - "grad_norm": 6.529165744781494, - "learning_rate": 2.7583286278938452e-05, - "loss": 2.3564, + "grad_norm": 3.769131660461426, + "learning_rate": 0.00045972143798230755, + "loss": 0.7964, "step": 2140 }, { "epoch": 0.4046677959721438, - "grad_norm": 7.249635219573975, - "learning_rate": 2.7571993224167138e-05, - "loss": 2.2548, + "grad_norm": 2.248361825942993, + "learning_rate": 0.00045953322040278563, + "loss": 0.6952, "step": 2150 }, { "epoch": 0.40654997176736307, - "grad_norm": 9.388668060302734, - "learning_rate": 2.756070016939582e-05, - "loss": 2.6006, + "grad_norm": 3.3281447887420654, + "learning_rate": 0.0004593450028232637, + "loss": 1.1356, "step": 2160 }, { "epoch": 0.40843214756258234, - "grad_norm": 3.7543675899505615, - "learning_rate": 2.7549407114624507e-05, - "loss": 2.3548, + "grad_norm": 1.2701328992843628, + "learning_rate": 0.0004591567852437418, + "loss": 0.9366, "step": 2170 }, { "epoch": 0.4103143233578016, - "grad_norm": 7.2959442138671875, - "learning_rate": 2.7538114059853193e-05, - "loss": 2.4884, + "grad_norm": 2.7324378490448, + "learning_rate": 0.00045896856766421986, + "loss": 0.9716, "step": 2180 }, { "epoch": 0.41219649915302087, - "grad_norm": 4.733274936676025, - "learning_rate": 2.7526821005081875e-05, - "loss": 2.5703, + "grad_norm": 1.8330858945846558, + "learning_rate": 0.00045878035008469793, + "loss": 1.2041, "step": 2190 }, { "epoch": 0.4140786749482402, - "grad_norm": 14.75526237487793, - "learning_rate": 2.751552795031056e-05, - "loss": 2.4666, + "grad_norm": 4.173506736755371, + "learning_rate": 0.000458592132505176, + "loss": 0.8957, "step": 2200 }, { "epoch": 0.41596085074345945, - "grad_norm": 5.791693210601807, - "learning_rate": 2.7504234895539244e-05, - "loss": 2.4131, + "grad_norm": 2.277362108230591, + "learning_rate": 0.0004584039149256541, + "loss": 0.9527, "step": 2210 }, { "epoch": 0.4178430265386787, - "grad_norm": 5.518586158752441, - "learning_rate": 2.749294184076793e-05, - "loss": 2.2924, + "grad_norm": 2.5186853408813477, + "learning_rate": 0.00045821569734613216, + "loss": 0.94, "step": 2220 }, { "epoch": 0.419725202333898, - "grad_norm": 8.407965660095215, - "learning_rate": 2.7481648785996613e-05, - "loss": 2.2358, + "grad_norm": 3.54046893119812, + "learning_rate": 0.0004580274797666102, + "loss": 0.722, "step": 2230 }, { "epoch": 0.42160737812911725, - "grad_norm": 6.686824798583984, - "learning_rate": 2.7470355731225296e-05, - "loss": 2.4671, + "grad_norm": 4.817910671234131, + "learning_rate": 0.00045783926218708826, + "loss": 1.0113, "step": 2240 }, { "epoch": 0.4234895539243365, - "grad_norm": 5.854467391967773, - "learning_rate": 2.745906267645398e-05, - "loss": 2.3917, + "grad_norm": 2.397691011428833, + "learning_rate": 0.00045765104460756634, + "loss": 0.9322, "step": 2250 }, { "epoch": 0.4253717297195558, - "grad_norm": 9.136099815368652, - "learning_rate": 2.7447769621682664e-05, - "loss": 2.2994, + "grad_norm": 2.3579189777374268, + "learning_rate": 0.0004574628270280444, + "loss": 1.1121, "step": 2260 }, { "epoch": 0.4272539055147751, - "grad_norm": 4.834329128265381, - "learning_rate": 2.743647656691135e-05, - "loss": 2.1033, + "grad_norm": 6.9814453125, + "learning_rate": 0.0004572746094485225, + "loss": 0.6762, "step": 2270 }, { "epoch": 0.42913608130999437, - "grad_norm": 5.209042072296143, - "learning_rate": 2.7425183512140036e-05, - "loss": 2.4016, + "grad_norm": 4.888842582702637, + "learning_rate": 0.00045708639186900057, + "loss": 0.8394, "step": 2280 }, { "epoch": 0.43101825710521363, - "grad_norm": 11.149210929870605, - "learning_rate": 2.741389045736872e-05, - "loss": 2.3709, + "grad_norm": 4.279430389404297, + "learning_rate": 0.00045689817428947864, + "loss": 0.9319, "step": 2290 }, { "epoch": 0.4329004329004329, - "grad_norm": 12.44275188446045, - "learning_rate": 2.7402597402597405e-05, - "loss": 2.3464, + "grad_norm": 7.005417823791504, + "learning_rate": 0.0004567099567099567, + "loss": 0.8899, "step": 2300 }, { "epoch": 0.43478260869565216, - "grad_norm": 12.738330841064453, - "learning_rate": 2.7391304347826085e-05, - "loss": 2.1641, + "grad_norm": 3.8382112979888916, + "learning_rate": 0.00045652173913043474, + "loss": 0.8132, "step": 2310 }, { "epoch": 0.4366647844908714, - "grad_norm": 8.074936866760254, - "learning_rate": 2.738001129305477e-05, - "loss": 2.1561, + "grad_norm": 4.163649559020996, + "learning_rate": 0.0004563335215509128, + "loss": 0.7774, "step": 2320 }, { "epoch": 0.43854696028609075, - "grad_norm": 5.971907615661621, - "learning_rate": 2.7368718238283457e-05, - "loss": 2.2271, + "grad_norm": 2.5022313594818115, + "learning_rate": 0.00045614530397139095, + "loss": 0.9465, "step": 2330 }, { "epoch": 0.44042913608131, - "grad_norm": 3.5811924934387207, - "learning_rate": 2.735742518351214e-05, - "loss": 2.2939, + "grad_norm": 2.3390891551971436, + "learning_rate": 0.000455957086391869, + "loss": 1.1819, "step": 2340 }, { "epoch": 0.4423113118765293, - "grad_norm": 7.316919326782227, - "learning_rate": 2.7346132128740825e-05, - "loss": 2.2452, + "grad_norm": 4.921488285064697, + "learning_rate": 0.0004557688688123471, + "loss": 1.0202, "step": 2350 }, { "epoch": 0.44419348767174854, - "grad_norm": 12.880569458007812, - "learning_rate": 2.7334839073969508e-05, - "loss": 2.2113, + "grad_norm": 3.4981935024261475, + "learning_rate": 0.0004555806512328252, + "loss": 0.8734, "step": 2360 }, { "epoch": 0.4460756634669678, - "grad_norm": 7.582205295562744, - "learning_rate": 2.7323546019198194e-05, - "loss": 2.3516, + "grad_norm": 2.6764838695526123, + "learning_rate": 0.00045539243365330326, + "loss": 0.8693, "step": 2370 }, { "epoch": 0.4479578392621871, - "grad_norm": 5.996733665466309, - "learning_rate": 2.731225296442688e-05, - "loss": 2.17, + "grad_norm": 2.6041977405548096, + "learning_rate": 0.00045520421607378133, + "loss": 0.773, "step": 2380 }, { "epoch": 0.44984001505740634, - "grad_norm": 4.608865737915039, - "learning_rate": 2.7300959909655563e-05, - "loss": 2.3382, + "grad_norm": 2.146116018295288, + "learning_rate": 0.0004550159984942594, + "loss": 0.981, "step": 2390 }, { "epoch": 0.45172219085262566, - "grad_norm": 8.942387580871582, - "learning_rate": 2.7289666854884246e-05, - "loss": 2.4042, + "grad_norm": 1.5882443189620972, + "learning_rate": 0.00045482778091473743, + "loss": 1.2898, "step": 2400 }, { "epoch": 0.4536043666478449, - "grad_norm": 5.910847187042236, - "learning_rate": 2.7278373800112932e-05, - "loss": 2.2939, + "grad_norm": 2.2083537578582764, + "learning_rate": 0.0004546395633352155, + "loss": 1.0654, "step": 2410 }, { "epoch": 0.4554865424430642, - "grad_norm": 7.828692436218262, - "learning_rate": 2.7267080745341614e-05, - "loss": 2.244, + "grad_norm": 3.9478726387023926, + "learning_rate": 0.0004544513457556936, + "loss": 0.8267, "step": 2420 }, { "epoch": 0.45736871823828346, - "grad_norm": 6.50481653213501, - "learning_rate": 2.72557876905703e-05, - "loss": 2.2278, + "grad_norm": 3.5692625045776367, + "learning_rate": 0.00045426312817617166, + "loss": 0.8909, "step": 2430 }, { "epoch": 0.4592508940335027, - "grad_norm": 6.783820629119873, - "learning_rate": 2.7244494635798983e-05, - "loss": 2.2561, + "grad_norm": 4.099732398986816, + "learning_rate": 0.00045407491059664974, + "loss": 0.9475, "step": 2440 }, { "epoch": 0.461133069828722, - "grad_norm": 5.222927570343018, - "learning_rate": 2.723320158102767e-05, - "loss": 2.1948, + "grad_norm": 2.3055837154388428, + "learning_rate": 0.0004538866930171278, + "loss": 0.9427, "step": 2450 }, { "epoch": 0.46301524562394125, - "grad_norm": 8.41578197479248, - "learning_rate": 2.7221908526256355e-05, - "loss": 2.1365, + "grad_norm": 1.5885486602783203, + "learning_rate": 0.0004536984754376059, + "loss": 0.8121, "step": 2460 }, { "epoch": 0.4648974214191606, - "grad_norm": 7.914490222930908, - "learning_rate": 2.7210615471485038e-05, - "loss": 2.0912, + "grad_norm": 2.3060200214385986, + "learning_rate": 0.00045351025785808397, + "loss": 0.8262, "step": 2470 }, { "epoch": 0.46677959721437984, - "grad_norm": 10.265892028808594, - "learning_rate": 2.7199322416713724e-05, - "loss": 2.2027, + "grad_norm": 2.7296106815338135, + "learning_rate": 0.00045332204027856204, + "loss": 1.0472, "step": 2480 }, { "epoch": 0.4686617730095991, - "grad_norm": 5.603802680969238, - "learning_rate": 2.7188029361942403e-05, - "loss": 2.1609, + "grad_norm": 1.5217969417572021, + "learning_rate": 0.00045313382269904006, + "loss": 0.8783, "step": 2490 }, { "epoch": 0.47054394880481837, - "grad_norm": 11.390222549438477, - "learning_rate": 2.717673630717109e-05, - "loss": 2.1755, + "grad_norm": 2.4178285598754883, + "learning_rate": 0.00045294560511951814, + "loss": 1.0481, "step": 2500 }, { "epoch": 0.47242612460003763, - "grad_norm": 4.981803894042969, - "learning_rate": 2.7165443252399776e-05, - "loss": 1.873, + "grad_norm": 2.3695430755615234, + "learning_rate": 0.0004527573875399962, + "loss": 0.4676, "step": 2510 }, { "epoch": 0.4743083003952569, - "grad_norm": 6.110774040222168, - "learning_rate": 2.7154150197628458e-05, - "loss": 2.0585, + "grad_norm": 1.467848777770996, + "learning_rate": 0.0004525691699604743, + "loss": 0.7778, "step": 2520 }, { "epoch": 0.47619047619047616, - "grad_norm": 6.286103248596191, - "learning_rate": 2.7142857142857144e-05, - "loss": 1.9113, + "grad_norm": 2.409031629562378, + "learning_rate": 0.00045238095238095237, + "loss": 0.7023, "step": 2530 }, { "epoch": 0.4780726519856955, - "grad_norm": 5.199717044830322, - "learning_rate": 2.7131564088085827e-05, - "loss": 2.0643, + "grad_norm": 1.9384913444519043, + "learning_rate": 0.00045219273480143045, + "loss": 0.8401, "step": 2540 }, { "epoch": 0.47995482778091475, - "grad_norm": 10.684744834899902, - "learning_rate": 2.7120271033314513e-05, - "loss": 2.1255, + "grad_norm": 1.967634916305542, + "learning_rate": 0.0004520045172219086, + "loss": 0.7964, "step": 2550 }, { "epoch": 0.481837003576134, - "grad_norm": 6.335547924041748, - "learning_rate": 2.71089779785432e-05, - "loss": 2.0588, + "grad_norm": 2.215298652648926, + "learning_rate": 0.00045181629964238665, + "loss": 0.8712, "step": 2560 }, { "epoch": 0.4837191793713533, - "grad_norm": 15.644701957702637, - "learning_rate": 2.709768492377188e-05, - "loss": 2.2138, + "grad_norm": 5.0799078941345215, + "learning_rate": 0.0004516280820628647, + "loss": 1.0858, "step": 2570 }, { "epoch": 0.48560135516657255, - "grad_norm": 8.629146575927734, - "learning_rate": 2.7086391869000565e-05, - "loss": 1.8363, + "grad_norm": 4.027543067932129, + "learning_rate": 0.00045143986448334275, + "loss": 0.5627, "step": 2580 }, { "epoch": 0.4874835309617918, - "grad_norm": 6.585087299346924, - "learning_rate": 2.7075098814229247e-05, - "loss": 2.096, + "grad_norm": 2.23659348487854, + "learning_rate": 0.00045125164690382083, + "loss": 0.9524, "step": 2590 }, { "epoch": 0.48936570675701113, - "grad_norm": 4.812081336975098, - "learning_rate": 2.7063805759457933e-05, - "loss": 2.0322, + "grad_norm": 2.323295831680298, + "learning_rate": 0.0004510634293242989, + "loss": 0.915, "step": 2600 }, { "epoch": 0.4912478825522304, - "grad_norm": 3.4937989711761475, - "learning_rate": 2.705251270468662e-05, - "loss": 2.2819, + "grad_norm": 1.380283236503601, + "learning_rate": 0.000450875211744777, + "loss": 0.9743, "step": 2610 }, { "epoch": 0.49313005834744966, - "grad_norm": 13.015551567077637, - "learning_rate": 2.7041219649915302e-05, - "loss": 2.0931, + "grad_norm": 2.988337516784668, + "learning_rate": 0.00045068699416525506, + "loss": 0.8426, "step": 2620 }, { "epoch": 0.4950122341426689, - "grad_norm": 26.068567276000977, - "learning_rate": 2.7029926595143988e-05, - "loss": 2.1267, + "grad_norm": 4.08306884765625, + "learning_rate": 0.00045049877658573313, + "loss": 0.8518, "step": 2630 }, { "epoch": 0.4968944099378882, - "grad_norm": 6.577176094055176, - "learning_rate": 2.701863354037267e-05, - "loss": 1.9937, + "grad_norm": 2.828056812286377, + "learning_rate": 0.0004503105590062112, + "loss": 0.8582, "step": 2640 }, { "epoch": 0.49877658573310746, - "grad_norm": 8.388608932495117, - "learning_rate": 2.7007340485601357e-05, - "loss": 2.1152, + "grad_norm": 1.375273585319519, + "learning_rate": 0.0004501223414266893, + "loss": 0.8976, "step": 2650 }, { "epoch": 0.5006587615283268, - "grad_norm": 5.307988166809082, - "learning_rate": 2.699604743083004e-05, - "loss": 2.0221, + "grad_norm": 4.221978187561035, + "learning_rate": 0.0004499341238471673, + "loss": 0.9568, "step": 2660 }, { "epoch": 0.502540937323546, - "grad_norm": 7.120198726654053, - "learning_rate": 2.6984754376058722e-05, - "loss": 2.0487, + "grad_norm": 2.1360394954681396, + "learning_rate": 0.0004497459062676454, + "loss": 0.8552, "step": 2670 }, { "epoch": 0.5044231131187653, - "grad_norm": 5.206243991851807, - "learning_rate": 2.697346132128741e-05, - "loss": 2.112, + "grad_norm": 3.500844955444336, + "learning_rate": 0.00044955768868812346, + "loss": 0.9702, "step": 2680 }, { "epoch": 0.5063052889139845, - "grad_norm": 12.726581573486328, - "learning_rate": 2.6962168266516094e-05, - "loss": 2.1619, + "grad_norm": 3.776733636856079, + "learning_rate": 0.00044936947110860154, + "loss": 1.0864, "step": 2690 }, { "epoch": 0.5081874647092038, - "grad_norm": 5.604727745056152, - "learning_rate": 2.6950875211744777e-05, - "loss": 1.8609, + "grad_norm": 3.072545289993286, + "learning_rate": 0.0004491812535290796, + "loss": 0.8421, "step": 2700 }, { "epoch": 0.5100696405044232, - "grad_norm": 4.6965012550354, - "learning_rate": 2.6939582156973463e-05, - "loss": 2.095, + "grad_norm": 0.5951893925666809, + "learning_rate": 0.0004489930359495577, + "loss": 0.9968, "step": 2710 }, { "epoch": 0.5119518162996424, - "grad_norm": 6.7473602294921875, - "learning_rate": 2.6928289102202146e-05, - "loss": 2.0439, + "grad_norm": 4.747662544250488, + "learning_rate": 0.00044880481837003577, + "loss": 0.9355, "step": 2720 }, { "epoch": 0.5138339920948617, - "grad_norm": 9.199115753173828, - "learning_rate": 2.6916996047430832e-05, - "loss": 2.1664, + "grad_norm": 3.932286024093628, + "learning_rate": 0.00044861660079051384, + "loss": 1.321, "step": 2730 }, { "epoch": 0.5157161678900809, - "grad_norm": 8.569664001464844, - "learning_rate": 2.6905702992659518e-05, - "loss": 1.9945, + "grad_norm": 2.5157997608184814, + "learning_rate": 0.0004484283832109919, + "loss": 0.8607, "step": 2740 }, { "epoch": 0.5175983436853002, - "grad_norm": 13.89288330078125, - "learning_rate": 2.6894409937888197e-05, - "loss": 1.9975, + "grad_norm": 1.6060855388641357, + "learning_rate": 0.00044824016563146994, + "loss": 0.9437, "step": 2750 }, { "epoch": 0.5194805194805194, - "grad_norm": 3.2873876094818115, - "learning_rate": 2.6883116883116883e-05, - "loss": 2.0202, + "grad_norm": 0.8732848763465881, + "learning_rate": 0.000448051948051948, + "loss": 0.9885, "step": 2760 }, { "epoch": 0.5213626952757388, - "grad_norm": 3.6722116470336914, - "learning_rate": 2.6871823828345566e-05, - "loss": 2.1103, + "grad_norm": 1.563890814781189, + "learning_rate": 0.00044786373047242615, + "loss": 1.1645, "step": 2770 }, { "epoch": 0.5232448710709581, - "grad_norm": 6.789108753204346, - "learning_rate": 2.6860530773574252e-05, - "loss": 2.0607, + "grad_norm": 2.6220126152038574, + "learning_rate": 0.0004476755128929042, + "loss": 0.9488, "step": 2780 }, { "epoch": 0.5251270468661773, - "grad_norm": 9.947601318359375, - "learning_rate": 2.6849237718802938e-05, - "loss": 1.8943, + "grad_norm": 2.038132429122925, + "learning_rate": 0.0004474872953133823, + "loss": 0.7027, "step": 2790 }, { "epoch": 0.5270092226613966, - "grad_norm": 29.515361785888672, - "learning_rate": 2.683794466403162e-05, - "loss": 1.9724, + "grad_norm": 4.242816925048828, + "learning_rate": 0.0004472990777338604, + "loss": 0.927, "step": 2800 }, { "epoch": 0.5288913984566158, - "grad_norm": 14.963418960571289, - "learning_rate": 2.6826651609260307e-05, - "loss": 2.0618, + "grad_norm": 3.240506410598755, + "learning_rate": 0.00044711086015433846, + "loss": 1.091, "step": 2810 }, { "epoch": 0.5307735742518351, - "grad_norm": 16.72760772705078, - "learning_rate": 2.681535855448899e-05, - "loss": 2.1478, + "grad_norm": 5.185576438903809, + "learning_rate": 0.00044692264257481653, + "loss": 0.9626, "step": 2820 }, { "epoch": 0.5326557500470543, - "grad_norm": 11.509770393371582, - "learning_rate": 2.6804065499717676e-05, - "loss": 2.0953, + "grad_norm": 3.4086499214172363, + "learning_rate": 0.0004467344249952946, + "loss": 1.359, "step": 2830 }, { "epoch": 0.5345379258422737, - "grad_norm": 13.363036155700684, - "learning_rate": 2.679277244494636e-05, - "loss": 2.0589, + "grad_norm": 1.9781001806259155, + "learning_rate": 0.00044654620741577263, + "loss": 0.959, "step": 2840 }, { "epoch": 0.536420101637493, - "grad_norm": 4.475217342376709, - "learning_rate": 2.678147939017504e-05, - "loss": 1.8262, + "grad_norm": 1.5130640268325806, + "learning_rate": 0.0004463579898362507, + "loss": 0.7976, "step": 2850 }, { "epoch": 0.5383022774327122, - "grad_norm": 3.634226083755493, - "learning_rate": 2.6770186335403727e-05, - "loss": 1.7898, + "grad_norm": 1.7196760177612305, + "learning_rate": 0.0004461697722567288, + "loss": 0.7976, "step": 2860 }, { "epoch": 0.5401844532279315, - "grad_norm": 22.259286880493164, - "learning_rate": 2.675889328063241e-05, - "loss": 1.8528, + "grad_norm": 4.232097625732422, + "learning_rate": 0.00044598155467720686, + "loss": 0.87, "step": 2870 }, { "epoch": 0.5420666290231507, - "grad_norm": 11.183906555175781, - "learning_rate": 2.6747600225861096e-05, - "loss": 1.7605, + "grad_norm": 3.785689115524292, + "learning_rate": 0.00044579333709768494, + "loss": 0.8258, "step": 2880 }, { "epoch": 0.54394880481837, - "grad_norm": 6.606888294219971, - "learning_rate": 2.6736307171089782e-05, - "loss": 1.8818, + "grad_norm": 1.9887821674346924, + "learning_rate": 0.000445605119518163, + "loss": 0.7209, "step": 2890 }, { "epoch": 0.5458309806135893, - "grad_norm": 8.689189910888672, - "learning_rate": 2.6725014116318465e-05, - "loss": 1.9096, + "grad_norm": 1.6758228540420532, + "learning_rate": 0.0004454169019386411, + "loss": 0.8262, "step": 2900 }, { "epoch": 0.5477131564088086, - "grad_norm": 6.653372287750244, - "learning_rate": 2.671372106154715e-05, - "loss": 1.7785, + "grad_norm": 2.393141031265259, + "learning_rate": 0.00044522868435911917, + "loss": 0.8338, "step": 2910 }, { "epoch": 0.5495953322040279, - "grad_norm": 11.017134666442871, - "learning_rate": 2.6702428006775834e-05, - "loss": 1.8423, + "grad_norm": 6.172318458557129, + "learning_rate": 0.0004450404667795972, + "loss": 0.8609, "step": 2920 }, { "epoch": 0.5514775079992471, - "grad_norm": 6.855719089508057, - "learning_rate": 2.6691134952004516e-05, - "loss": 1.8001, + "grad_norm": 0.41666731238365173, + "learning_rate": 0.00044485224920007526, + "loss": 0.86, "step": 2930 }, { "epoch": 0.5533596837944664, - "grad_norm": 5.900759220123291, - "learning_rate": 2.6679841897233202e-05, - "loss": 1.987, + "grad_norm": 3.7566018104553223, + "learning_rate": 0.00044466403162055334, + "loss": 0.9788, "step": 2940 }, { "epoch": 0.5552418595896856, - "grad_norm": 5.474452495574951, - "learning_rate": 2.6668548842461885e-05, - "loss": 1.9144, + "grad_norm": 0.9079445004463196, + "learning_rate": 0.0004444758140410314, + "loss": 0.8222, "step": 2950 }, { "epoch": 0.557124035384905, - "grad_norm": 13.115139961242676, - "learning_rate": 2.665725578769057e-05, - "loss": 2.0743, + "grad_norm": 2.0461573600769043, + "learning_rate": 0.0004442875964615095, + "loss": 1.1583, "step": 2960 }, { "epoch": 0.5590062111801242, - "grad_norm": 8.339238166809082, - "learning_rate": 2.6645962732919257e-05, - "loss": 1.9056, + "grad_norm": 3.3327903747558594, + "learning_rate": 0.00044409937888198757, + "loss": 0.9933, "step": 2970 }, { "epoch": 0.5608883869753435, - "grad_norm": 12.62999153137207, - "learning_rate": 2.663466967814794e-05, - "loss": 1.8391, + "grad_norm": 2.7679216861724854, + "learning_rate": 0.00044391116130246565, + "loss": 0.8464, "step": 2980 }, { "epoch": 0.5627705627705628, - "grad_norm": 5.597290515899658, - "learning_rate": 2.6623376623376626e-05, - "loss": 1.9349, + "grad_norm": 1.8985081911087036, + "learning_rate": 0.0004437229437229438, + "loss": 0.9887, "step": 2990 }, { "epoch": 0.564652738565782, - "grad_norm": 2.5785648822784424, - "learning_rate": 2.661208356860531e-05, - "loss": 1.9463, + "grad_norm": 1.1193677186965942, + "learning_rate": 0.00044353472614342185, + "loss": 0.9821, "step": 3000 }, { "epoch": 0.5665349143610013, - "grad_norm": 13.423247337341309, - "learning_rate": 2.660079051383399e-05, - "loss": 1.7291, + "grad_norm": 2.58882999420166, + "learning_rate": 0.0004433465085638999, + "loss": 0.7463, "step": 3010 }, { "epoch": 0.5684170901562206, - "grad_norm": 7.054490089416504, - "learning_rate": 2.6589497459062677e-05, - "loss": 1.8435, + "grad_norm": 4.277840614318848, + "learning_rate": 0.00044315829098437795, + "loss": 0.7602, "step": 3020 }, { "epoch": 0.5702992659514399, - "grad_norm": 7.4367523193359375, - "learning_rate": 2.657820440429136e-05, - "loss": 1.7105, + "grad_norm": 0.9204025268554688, + "learning_rate": 0.00044297007340485603, + "loss": 0.6596, "step": 3030 }, { "epoch": 0.5721814417466592, - "grad_norm": 9.216171264648438, - "learning_rate": 2.6566911349520046e-05, - "loss": 1.9434, + "grad_norm": 3.1999411582946777, + "learning_rate": 0.0004427818558253341, + "loss": 0.8291, "step": 3040 }, { "epoch": 0.5740636175418784, - "grad_norm": 7.6077165603637695, - "learning_rate": 2.655561829474873e-05, - "loss": 1.8798, + "grad_norm": 1.4632461071014404, + "learning_rate": 0.0004425936382458122, + "loss": 1.1211, "step": 3050 }, { "epoch": 0.5759457933370977, - "grad_norm": 11.647812843322754, - "learning_rate": 2.6544325239977415e-05, - "loss": 1.7522, + "grad_norm": 2.777308225631714, + "learning_rate": 0.00044240542066629026, + "loss": 0.7434, "step": 3060 }, { "epoch": 0.5778279691323169, - "grad_norm": 5.945573806762695, - "learning_rate": 2.65330321852061e-05, - "loss": 2.0416, + "grad_norm": 2.7749745845794678, + "learning_rate": 0.00044221720308676833, + "loss": 1.0088, "step": 3070 }, { "epoch": 0.5797101449275363, - "grad_norm": 24.724990844726562, - "learning_rate": 2.6521739130434784e-05, - "loss": 1.8017, + "grad_norm": 1.9405794143676758, + "learning_rate": 0.0004420289855072464, + "loss": 0.9694, "step": 3080 }, { "epoch": 0.5815923207227555, - "grad_norm": 4.811154842376709, - "learning_rate": 2.651044607566347e-05, - "loss": 1.6813, + "grad_norm": 1.6995877027511597, + "learning_rate": 0.0004418407679277245, + "loss": 0.5558, "step": 3090 }, { "epoch": 0.5834744965179748, - "grad_norm": 11.191183090209961, - "learning_rate": 2.649915302089215e-05, - "loss": 1.7622, + "grad_norm": 2.6965272426605225, + "learning_rate": 0.0004416525503482025, + "loss": 0.8034, "step": 3100 }, { "epoch": 0.5853566723131941, - "grad_norm": 12.16427230834961, - "learning_rate": 2.6487859966120835e-05, - "loss": 1.9663, + "grad_norm": 1.4357980489730835, + "learning_rate": 0.0004414643327686806, + "loss": 1.1173, "step": 3110 }, { "epoch": 0.5872388481084133, - "grad_norm": 8.902604103088379, - "learning_rate": 2.647656691134952e-05, - "loss": 1.7131, + "grad_norm": 4.319276332855225, + "learning_rate": 0.00044127611518915866, + "loss": 0.9903, "step": 3120 }, { "epoch": 0.5891210239036326, - "grad_norm": 7.6382904052734375, - "learning_rate": 2.6465273856578204e-05, - "loss": 1.7472, + "grad_norm": 2.23298978805542, + "learning_rate": 0.00044108789760963674, + "loss": 0.7759, "step": 3130 }, { "epoch": 0.5910031996988518, - "grad_norm": 23.864648818969727, - "learning_rate": 2.645398080180689e-05, - "loss": 1.679, + "grad_norm": 3.0950074195861816, + "learning_rate": 0.0004408996800301148, + "loss": 0.7602, "step": 3140 }, { "epoch": 0.5928853754940712, - "grad_norm": 18.402881622314453, - "learning_rate": 2.6442687747035573e-05, - "loss": 2.1442, + "grad_norm": 5.252658843994141, + "learning_rate": 0.0004407114624505929, + "loss": 1.4432, "step": 3150 }, { "epoch": 0.5947675512892904, - "grad_norm": 6.911153793334961, - "learning_rate": 2.643139469226426e-05, - "loss": 1.6756, + "grad_norm": 3.7704455852508545, + "learning_rate": 0.00044052324487107097, + "loss": 0.7093, "step": 3160 }, { "epoch": 0.5966497270845097, - "grad_norm": 12.472163200378418, - "learning_rate": 2.6420101637492945e-05, - "loss": 1.9381, + "grad_norm": 3.0606801509857178, + "learning_rate": 0.00044033502729154904, + "loss": 1.3438, "step": 3170 }, { "epoch": 0.598531902879729, - "grad_norm": 4.999609470367432, - "learning_rate": 2.6408808582721624e-05, - "loss": 1.5681, + "grad_norm": 1.9222668409347534, + "learning_rate": 0.00044014680971202707, + "loss": 0.8256, "step": 3180 }, { "epoch": 0.6004140786749482, - "grad_norm": 5.08629846572876, - "learning_rate": 2.639751552795031e-05, - "loss": 1.8375, + "grad_norm": 1.1805700063705444, + "learning_rate": 0.00043995859213250514, + "loss": 1.0023, "step": 3190 }, { "epoch": 0.6022962544701675, - "grad_norm": 5.805210590362549, - "learning_rate": 2.6386222473178996e-05, - "loss": 2.0463, + "grad_norm": 4.068279266357422, + "learning_rate": 0.0004397703745529832, + "loss": 1.2515, "step": 3200 }, { "epoch": 0.6041784302653868, - "grad_norm": 7.146366119384766, - "learning_rate": 2.637492941840768e-05, - "loss": 1.5877, + "grad_norm": 1.760879397392273, + "learning_rate": 0.0004395821569734613, + "loss": 0.6779, "step": 3210 }, { "epoch": 0.6060606060606061, - "grad_norm": 4.034036159515381, - "learning_rate": 2.6363636363636365e-05, - "loss": 1.835, + "grad_norm": 0.343962162733078, + "learning_rate": 0.0004393939393939394, + "loss": 0.8352, "step": 3220 }, { "epoch": 0.6079427818558253, - "grad_norm": 22.046175003051758, - "learning_rate": 2.6352343308865048e-05, - "loss": 1.702, + "grad_norm": 4.934606075286865, + "learning_rate": 0.0004392057218144175, + "loss": 0.7756, "step": 3230 }, { "epoch": 0.6098249576510446, - "grad_norm": 20.46267318725586, - "learning_rate": 2.6341050254093734e-05, - "loss": 1.8594, + "grad_norm": 5.34637975692749, + "learning_rate": 0.0004390175042348956, + "loss": 0.9066, "step": 3240 }, { "epoch": 0.6117071334462639, - "grad_norm": 6.352889537811279, - "learning_rate": 2.632975719932242e-05, - "loss": 1.78, + "grad_norm": 1.3073912858963013, + "learning_rate": 0.00043882928665537366, + "loss": 0.8333, "step": 3250 }, { "epoch": 0.6135893092414831, - "grad_norm": 5.597209453582764, - "learning_rate": 2.6318464144551103e-05, - "loss": 1.9062, + "grad_norm": 1.1687594652175903, + "learning_rate": 0.00043864106907585173, + "loss": 0.9611, "step": 3260 }, { "epoch": 0.6154714850367025, - "grad_norm": 8.404434204101562, - "learning_rate": 2.6307171089779785e-05, - "loss": 1.7729, + "grad_norm": 1.1420823335647583, + "learning_rate": 0.00043845285149632975, + "loss": 0.9158, "step": 3270 }, { "epoch": 0.6173536608319217, - "grad_norm": 9.512775421142578, - "learning_rate": 2.6295878035008468e-05, - "loss": 1.6567, + "grad_norm": 2.647937536239624, + "learning_rate": 0.00043826463391680783, + "loss": 0.7782, "step": 3280 }, { "epoch": 0.619235836627141, - "grad_norm": 11.096758842468262, - "learning_rate": 2.6284584980237154e-05, - "loss": 1.5892, + "grad_norm": 4.739561557769775, + "learning_rate": 0.0004380764163372859, + "loss": 0.709, "step": 3290 }, { "epoch": 0.6211180124223602, - "grad_norm": 4.303831100463867, - "learning_rate": 2.627329192546584e-05, - "loss": 1.7157, + "grad_norm": 1.5627340078353882, + "learning_rate": 0.000437888198757764, + "loss": 0.8886, "step": 3300 }, { "epoch": 0.6230001882175795, - "grad_norm": 3.8192107677459717, - "learning_rate": 2.6261998870694523e-05, - "loss": 1.7188, + "grad_norm": 3.9366836547851562, + "learning_rate": 0.00043769998117824206, + "loss": 0.9539, "step": 3310 }, { "epoch": 0.6248823640127988, - "grad_norm": 10.297846794128418, - "learning_rate": 2.625070581592321e-05, - "loss": 1.7473, + "grad_norm": 3.5125503540039062, + "learning_rate": 0.00043751176359872014, + "loss": 0.9994, "step": 3320 }, { "epoch": 0.626764539808018, - "grad_norm": 8.126511573791504, - "learning_rate": 2.623941276115189e-05, - "loss": 1.5694, + "grad_norm": 3.3123700618743896, + "learning_rate": 0.0004373235460191982, + "loss": 0.7918, "step": 3330 }, { "epoch": 0.6286467156032374, - "grad_norm": 4.767712116241455, - "learning_rate": 2.6228119706380578e-05, - "loss": 1.7124, + "grad_norm": 2.980029821395874, + "learning_rate": 0.0004371353284396763, + "loss": 0.9837, "step": 3340 }, { "epoch": 0.6305288913984566, - "grad_norm": 4.750823020935059, - "learning_rate": 2.6216826651609264e-05, - "loss": 1.8946, + "grad_norm": 1.5912530422210693, + "learning_rate": 0.00043694711086015437, + "loss": 1.2457, "step": 3350 }, { "epoch": 0.6324110671936759, - "grad_norm": 9.973583221435547, - "learning_rate": 2.6205533596837943e-05, - "loss": 1.6632, + "grad_norm": 1.0829591751098633, + "learning_rate": 0.0004367588932806324, + "loss": 0.8409, "step": 3360 }, { "epoch": 0.6342932429888951, - "grad_norm": 10.300981521606445, - "learning_rate": 2.619424054206663e-05, - "loss": 1.8623, + "grad_norm": 1.4647136926651, + "learning_rate": 0.00043657067570111046, + "loss": 1.1027, "step": 3370 }, { "epoch": 0.6361754187841144, - "grad_norm": 17.23389434814453, - "learning_rate": 2.6182947487295312e-05, - "loss": 1.8112, + "grad_norm": 4.678377151489258, + "learning_rate": 0.00043638245812158854, + "loss": 0.9029, "step": 3380 }, { "epoch": 0.6380575945793338, - "grad_norm": 8.108388900756836, - "learning_rate": 2.6171654432523998e-05, - "loss": 1.6344, + "grad_norm": 2.8882977962493896, + "learning_rate": 0.0004361942405420666, + "loss": 1.0712, "step": 3390 }, { "epoch": 0.639939770374553, - "grad_norm": 3.373798131942749, - "learning_rate": 2.6160361377752684e-05, - "loss": 1.5514, + "grad_norm": 3.299114227294922, + "learning_rate": 0.0004360060229625447, + "loss": 0.846, "step": 3400 }, { "epoch": 0.6418219461697723, - "grad_norm": 8.216525077819824, - "learning_rate": 2.6149068322981367e-05, - "loss": 1.727, + "grad_norm": 2.3301033973693848, + "learning_rate": 0.00043581780538302277, + "loss": 0.8764, "step": 3410 }, { "epoch": 0.6437041219649915, - "grad_norm": 9.52584457397461, - "learning_rate": 2.6137775268210053e-05, - "loss": 1.8575, + "grad_norm": 3.6711699962615967, + "learning_rate": 0.00043562958780350085, + "loss": 1.0153, "step": 3420 }, { "epoch": 0.6455862977602108, - "grad_norm": 7.7391581535339355, - "learning_rate": 2.612648221343874e-05, - "loss": 1.7685, + "grad_norm": 3.8490591049194336, + "learning_rate": 0.000435441370223979, + "loss": 0.9266, "step": 3430 }, { "epoch": 0.64746847355543, - "grad_norm": 8.019952774047852, - "learning_rate": 2.611518915866742e-05, - "loss": 1.6028, + "grad_norm": 2.865903377532959, + "learning_rate": 0.00043525315264445705, + "loss": 0.8813, "step": 3440 }, { "epoch": 0.6493506493506493, - "grad_norm": 11.445884704589844, - "learning_rate": 2.6103896103896104e-05, - "loss": 1.9301, + "grad_norm": 3.837740659713745, + "learning_rate": 0.0004350649350649351, + "loss": 1.2178, "step": 3450 }, { "epoch": 0.6512328251458687, - "grad_norm": 8.298888206481934, - "learning_rate": 2.6092603049124787e-05, - "loss": 1.5639, + "grad_norm": 1.263821005821228, + "learning_rate": 0.00043487671748541315, + "loss": 0.7914, "step": 3460 }, { "epoch": 0.6531150009410879, - "grad_norm": 6.707033157348633, - "learning_rate": 2.6081309994353473e-05, - "loss": 1.9865, + "grad_norm": 2.0916056632995605, + "learning_rate": 0.00043468849990589123, + "loss": 1.3344, "step": 3470 }, { "epoch": 0.6549971767363072, - "grad_norm": 13.604408264160156, - "learning_rate": 2.607001693958216e-05, - "loss": 1.8029, + "grad_norm": 2.437861919403076, + "learning_rate": 0.0004345002823263693, + "loss": 1.138, "step": 3480 }, { "epoch": 0.6568793525315264, - "grad_norm": 13.318224906921387, - "learning_rate": 2.605872388481084e-05, - "loss": 1.6995, + "grad_norm": 0.3909652829170227, + "learning_rate": 0.0004343120647468474, + "loss": 0.8935, "step": 3490 }, { "epoch": 0.6587615283267457, - "grad_norm": 10.210710525512695, - "learning_rate": 2.6047430830039528e-05, - "loss": 1.4445, + "grad_norm": 3.115875244140625, + "learning_rate": 0.00043412384716732546, + "loss": 0.684, "step": 3500 }, { "epoch": 0.6606437041219649, - "grad_norm": 23.614696502685547, - "learning_rate": 2.603613777526821e-05, - "loss": 1.7893, + "grad_norm": 6.081343173980713, + "learning_rate": 0.00043393562958780353, + "loss": 0.9985, "step": 3510 }, { "epoch": 0.6625258799171843, - "grad_norm": 6.86085844039917, - "learning_rate": 2.6024844720496896e-05, - "loss": 1.5184, + "grad_norm": 1.3088817596435547, + "learning_rate": 0.0004337474120082816, + "loss": 0.8571, "step": 3520 }, { "epoch": 0.6644080557124036, - "grad_norm": 7.119161128997803, - "learning_rate": 2.601355166572558e-05, - "loss": 1.6189, + "grad_norm": 1.4548195600509644, + "learning_rate": 0.00043355919442875963, + "loss": 0.9322, "step": 3530 }, { "epoch": 0.6662902315076228, - "grad_norm": 7.726982593536377, - "learning_rate": 2.6002258610954262e-05, - "loss": 1.7796, + "grad_norm": 2.6708908081054688, + "learning_rate": 0.0004333709768492377, + "loss": 1.035, "step": 3540 }, { "epoch": 0.6681724073028421, - "grad_norm": 7.957070350646973, - "learning_rate": 2.5990965556182948e-05, - "loss": 1.5165, + "grad_norm": 2.1637444496154785, + "learning_rate": 0.0004331827592697158, + "loss": 0.5862, "step": 3550 }, { "epoch": 0.6700545830980613, - "grad_norm": 11.317330360412598, - "learning_rate": 2.597967250141163e-05, - "loss": 1.835, + "grad_norm": 4.600494861602783, + "learning_rate": 0.00043299454169019386, + "loss": 0.9426, "step": 3560 }, { "epoch": 0.6719367588932806, - "grad_norm": 7.407431602478027, - "learning_rate": 2.5968379446640317e-05, - "loss": 1.3834, + "grad_norm": 2.8519279956817627, + "learning_rate": 0.00043280632411067194, + "loss": 0.5583, "step": 3570 }, { "epoch": 0.6738189346885, - "grad_norm": 9.898704528808594, - "learning_rate": 2.5957086391869003e-05, - "loss": 1.432, + "grad_norm": 1.3281288146972656, + "learning_rate": 0.00043261810653115, + "loss": 0.7769, "step": 3580 }, { "epoch": 0.6757011104837192, - "grad_norm": 5.612309455871582, - "learning_rate": 2.5945793337097685e-05, - "loss": 1.7256, + "grad_norm": 2.5639514923095703, + "learning_rate": 0.0004324298889516281, + "loss": 1.0062, "step": 3590 }, { "epoch": 0.6775832862789385, - "grad_norm": 5.5622663497924805, - "learning_rate": 2.593450028232637e-05, - "loss": 1.5599, + "grad_norm": 2.5893678665161133, + "learning_rate": 0.00043224167137210617, + "loss": 0.9067, "step": 3600 }, { "epoch": 0.6794654620741577, - "grad_norm": 5.885371685028076, - "learning_rate": 2.5923207227555054e-05, - "loss": 1.7386, + "grad_norm": 2.5090417861938477, + "learning_rate": 0.00043205345379258424, + "loss": 0.905, "step": 3610 }, { "epoch": 0.681347637869377, - "grad_norm": 3.2068424224853516, - "learning_rate": 2.5911914172783737e-05, - "loss": 1.5411, + "grad_norm": 3.526054859161377, + "learning_rate": 0.00043186523621306227, + "loss": 0.8226, "step": 3620 }, { "epoch": 0.6832298136645962, - "grad_norm": 3.0026233196258545, - "learning_rate": 2.5900621118012423e-05, - "loss": 1.7239, + "grad_norm": 13.659317016601562, + "learning_rate": 0.00043167701863354034, + "loss": 0.9691, "step": 3630 }, { "epoch": 0.6851119894598156, - "grad_norm": 23.438610076904297, - "learning_rate": 2.5889328063241106e-05, - "loss": 1.7582, + "grad_norm": 3.2574679851531982, + "learning_rate": 0.0004314888010540184, + "loss": 1.0528, "step": 3640 }, { "epoch": 0.6869941652550349, - "grad_norm": 3.962045907974243, - "learning_rate": 2.5878035008469792e-05, - "loss": 1.4579, + "grad_norm": 1.4542733430862427, + "learning_rate": 0.0004313005834744965, + "loss": 0.8987, "step": 3650 }, { "epoch": 0.6888763410502541, - "grad_norm": 3.3566129207611084, - "learning_rate": 2.5866741953698478e-05, - "loss": 1.4641, + "grad_norm": 1.2248775959014893, + "learning_rate": 0.0004311123658949746, + "loss": 0.6569, "step": 3660 }, { "epoch": 0.6907585168454734, - "grad_norm": 8.942924499511719, - "learning_rate": 2.585544889892716e-05, - "loss": 1.5699, + "grad_norm": 6.2517008781433105, + "learning_rate": 0.0004309241483154527, + "loss": 0.8825, "step": 3670 }, { "epoch": 0.6926406926406926, - "grad_norm": 12.78019905090332, - "learning_rate": 2.5844155844155847e-05, - "loss": 1.6055, + "grad_norm": 2.5166094303131104, + "learning_rate": 0.0004307359307359308, + "loss": 0.7376, "step": 3680 }, { "epoch": 0.6945228684359119, - "grad_norm": 14.28182315826416, - "learning_rate": 2.583286278938453e-05, - "loss": 1.6354, + "grad_norm": 5.175745964050293, + "learning_rate": 0.00043054771315640886, + "loss": 0.8224, "step": 3690 }, { "epoch": 0.6964050442311311, - "grad_norm": 10.26236629486084, - "learning_rate": 2.5821569734613215e-05, - "loss": 1.5518, + "grad_norm": 0.9270199537277222, + "learning_rate": 0.00043035949557688693, + "loss": 0.8223, "step": 3700 }, { "epoch": 0.6982872200263505, - "grad_norm": 6.233335494995117, - "learning_rate": 2.5810276679841898e-05, - "loss": 1.4589, + "grad_norm": 1.036481261253357, + "learning_rate": 0.00043017127799736495, + "loss": 0.6669, "step": 3710 }, { "epoch": 0.7001693958215698, - "grad_norm": 17.028034210205078, - "learning_rate": 2.579898362507058e-05, - "loss": 1.6329, + "grad_norm": 4.925989627838135, + "learning_rate": 0.00042998306041784303, + "loss": 0.6371, "step": 3720 }, { "epoch": 0.702051571616789, - "grad_norm": 9.508376121520996, - "learning_rate": 2.5787690570299267e-05, - "loss": 1.4574, + "grad_norm": 4.520897388458252, + "learning_rate": 0.0004297948428383211, + "loss": 0.7395, "step": 3730 }, { "epoch": 0.7039337474120083, - "grad_norm": 10.823552131652832, - "learning_rate": 2.577639751552795e-05, - "loss": 1.7267, + "grad_norm": 3.2131059169769287, + "learning_rate": 0.0004296066252587992, + "loss": 1.3384, "step": 3740 }, { "epoch": 0.7058159232072275, - "grad_norm": 19.552865982055664, - "learning_rate": 2.5765104460756636e-05, - "loss": 1.7066, + "grad_norm": 2.445014476776123, + "learning_rate": 0.00042941840767927726, + "loss": 0.7885, "step": 3750 }, { "epoch": 0.7076980990024468, - "grad_norm": 13.044384002685547, - "learning_rate": 2.575381140598532e-05, - "loss": 1.5771, + "grad_norm": 3.6331963539123535, + "learning_rate": 0.00042923019009975534, + "loss": 0.9128, "step": 3760 }, { "epoch": 0.709580274797666, - "grad_norm": 7.333944797515869, - "learning_rate": 2.5742518351214004e-05, - "loss": 1.4535, + "grad_norm": 3.285827398300171, + "learning_rate": 0.0004290419725202334, + "loss": 0.8912, "step": 3770 }, { "epoch": 0.7114624505928854, - "grad_norm": 12.022722244262695, - "learning_rate": 2.573122529644269e-05, - "loss": 1.6377, + "grad_norm": 9.035295486450195, + "learning_rate": 0.0004288537549407115, + "loss": 1.109, "step": 3780 }, { "epoch": 0.7133446263881047, - "grad_norm": 6.42018461227417, - "learning_rate": 2.571993224167137e-05, - "loss": 1.4887, + "grad_norm": 3.6709072589874268, + "learning_rate": 0.0004286655373611895, + "loss": 0.6925, "step": 3790 }, { "epoch": 0.7152268021833239, - "grad_norm": 2.690678596496582, - "learning_rate": 2.5708639186900056e-05, - "loss": 1.7047, + "grad_norm": 0.12020440399646759, + "learning_rate": 0.0004284773197816676, + "loss": 1.1243, "step": 3800 }, { "epoch": 0.7171089779785432, - "grad_norm": 25.072540283203125, - "learning_rate": 2.5697346132128742e-05, - "loss": 1.6334, + "grad_norm": 4.302594184875488, + "learning_rate": 0.00042828910220214566, + "loss": 0.9626, "step": 3810 }, { "epoch": 0.7189911537737624, - "grad_norm": 19.594491958618164, - "learning_rate": 2.5686053077357425e-05, - "loss": 1.3771, + "grad_norm": 4.240476608276367, + "learning_rate": 0.00042810088462262374, + "loss": 0.8101, "step": 3820 }, { "epoch": 0.7208733295689818, - "grad_norm": 32.11001205444336, - "learning_rate": 2.567476002258611e-05, - "loss": 1.5945, + "grad_norm": 2.9120934009552, + "learning_rate": 0.0004279126670431018, + "loss": 0.9986, "step": 3830 }, { "epoch": 0.722755505364201, - "grad_norm": 5.285523891448975, - "learning_rate": 2.5663466967814793e-05, - "loss": 1.4196, + "grad_norm": 3.199678659439087, + "learning_rate": 0.0004277244494635799, + "loss": 1.0096, "step": 3840 }, { "epoch": 0.7246376811594203, - "grad_norm": 9.592608451843262, - "learning_rate": 2.565217391304348e-05, - "loss": 1.6145, + "grad_norm": 2.3253014087677, + "learning_rate": 0.00042753623188405797, + "loss": 0.8287, "step": 3850 }, { "epoch": 0.7265198569546396, - "grad_norm": 11.879024505615234, - "learning_rate": 2.5640880858272165e-05, - "loss": 1.6065, + "grad_norm": 2.6479291915893555, + "learning_rate": 0.00042734801430453605, + "loss": 0.9506, "step": 3860 }, { "epoch": 0.7284020327498588, - "grad_norm": 15.967653274536133, - "learning_rate": 2.5629587803500848e-05, - "loss": 1.716, + "grad_norm": 4.7788214683532715, + "learning_rate": 0.0004271597967250141, + "loss": 0.8718, "step": 3870 }, { "epoch": 0.7302842085450781, - "grad_norm": 20.394914627075195, - "learning_rate": 2.561829474872953e-05, - "loss": 1.4524, + "grad_norm": 1.1644917726516724, + "learning_rate": 0.0004269715791454922, + "loss": 0.774, "step": 3880 }, { "epoch": 0.7321663843402973, - "grad_norm": 8.076727867126465, - "learning_rate": 2.5607001693958214e-05, - "loss": 1.6307, + "grad_norm": 5.440783977508545, + "learning_rate": 0.0004267833615659703, + "loss": 1.0616, "step": 3890 }, { "epoch": 0.7340485601355167, - "grad_norm": 15.703822135925293, - "learning_rate": 2.55957086391869e-05, - "loss": 1.2349, + "grad_norm": 2.430136203765869, + "learning_rate": 0.00042659514398644835, + "loss": 0.543, "step": 3900 }, { "epoch": 0.7359307359307359, - "grad_norm": 13.136933326721191, - "learning_rate": 2.5584415584415586e-05, - "loss": 1.6267, + "grad_norm": 3.5641326904296875, + "learning_rate": 0.00042640692640692643, + "loss": 0.9902, "step": 3910 }, { "epoch": 0.7378129117259552, - "grad_norm": 10.711684226989746, - "learning_rate": 2.557312252964427e-05, - "loss": 1.5033, + "grad_norm": 2.292509078979492, + "learning_rate": 0.0004262187088274045, + "loss": 1.04, "step": 3920 }, { "epoch": 0.7396950875211745, - "grad_norm": 12.624408721923828, - "learning_rate": 2.5561829474872954e-05, - "loss": 1.516, + "grad_norm": 2.2531626224517822, + "learning_rate": 0.0004260304912478826, + "loss": 0.7291, "step": 3930 }, { "epoch": 0.7415772633163937, - "grad_norm": 5.2435479164123535, - "learning_rate": 2.555053642010164e-05, - "loss": 1.3178, + "grad_norm": 2.3294925689697266, + "learning_rate": 0.00042584227366836066, + "loss": 0.8943, "step": 3940 }, { "epoch": 0.743459439111613, - "grad_norm": 12.167356491088867, - "learning_rate": 2.5539243365330323e-05, - "loss": 1.7724, + "grad_norm": 2.969782590866089, + "learning_rate": 0.00042565405608883873, + "loss": 0.8468, "step": 3950 }, { "epoch": 0.7453416149068323, - "grad_norm": 11.038766860961914, - "learning_rate": 2.552795031055901e-05, - "loss": 1.397, + "grad_norm": 2.1566853523254395, + "learning_rate": 0.0004254658385093168, + "loss": 0.6394, "step": 3960 }, { "epoch": 0.7472237907020516, - "grad_norm": 11.564054489135742, - "learning_rate": 2.551665725578769e-05, - "loss": 1.5357, + "grad_norm": 5.316396713256836, + "learning_rate": 0.00042527762092979483, + "loss": 1.0201, "step": 3970 }, { "epoch": 0.7491059664972708, - "grad_norm": 19.608352661132812, - "learning_rate": 2.5505364201016375e-05, - "loss": 1.5665, + "grad_norm": 4.417022705078125, + "learning_rate": 0.0004250894033502729, + "loss": 0.8763, "step": 3980 }, { "epoch": 0.7509881422924901, - "grad_norm": 13.263577461242676, - "learning_rate": 2.549407114624506e-05, - "loss": 1.4692, + "grad_norm": 3.9584805965423584, + "learning_rate": 0.000424901185770751, + "loss": 0.8368, "step": 3990 }, { "epoch": 0.7528703180877094, - "grad_norm": 21.618656158447266, - "learning_rate": 2.5482778091473743e-05, - "loss": 1.5511, + "grad_norm": 4.402260780334473, + "learning_rate": 0.00042471296819122906, + "loss": 0.9425, "step": 4000 }, { "epoch": 0.7547524938829286, - "grad_norm": 4.379132270812988, - "learning_rate": 2.547148503670243e-05, - "loss": 1.6006, + "grad_norm": 1.8441309928894043, + "learning_rate": 0.00042452475061170714, + "loss": 0.9747, "step": 4010 }, { "epoch": 0.756634669678148, - "grad_norm": 5.841297626495361, - "learning_rate": 2.5460191981931112e-05, - "loss": 1.3704, + "grad_norm": 2.9513907432556152, + "learning_rate": 0.0004243365330321852, + "loss": 0.6434, "step": 4020 }, { "epoch": 0.7585168454733672, - "grad_norm": 9.923529624938965, - "learning_rate": 2.5448898927159798e-05, - "loss": 1.6851, + "grad_norm": 3.533721923828125, + "learning_rate": 0.0004241483154526633, + "loss": 1.06, "step": 4030 }, { "epoch": 0.7603990212685865, - "grad_norm": 6.657973766326904, - "learning_rate": 2.5437605872388484e-05, - "loss": 1.3996, + "grad_norm": 3.201486110687256, + "learning_rate": 0.00042396009787314137, + "loss": 0.7152, "step": 4040 }, { "epoch": 0.7622811970638057, - "grad_norm": 7.652090072631836, - "learning_rate": 2.5426312817617167e-05, - "loss": 1.3747, + "grad_norm": 3.1424193382263184, + "learning_rate": 0.00042377188029361944, + "loss": 0.8416, "step": 4050 }, { "epoch": 0.764163372859025, - "grad_norm": 6.833724498748779, - "learning_rate": 2.541501976284585e-05, - "loss": 1.3425, + "grad_norm": 3.6960856914520264, + "learning_rate": 0.00042358366271409747, + "loss": 0.7052, "step": 4060 }, { "epoch": 0.7660455486542443, - "grad_norm": 3.7029807567596436, - "learning_rate": 2.5403726708074532e-05, - "loss": 1.5997, + "grad_norm": 1.366542935371399, + "learning_rate": 0.00042339544513457554, + "loss": 1.1092, "step": 4070 }, { "epoch": 0.7679277244494636, - "grad_norm": 9.09876537322998, - "learning_rate": 2.539243365330322e-05, - "loss": 1.3796, + "grad_norm": 4.202461242675781, + "learning_rate": 0.0004232072275550536, + "loss": 0.735, "step": 4080 }, { "epoch": 0.7698099002446829, - "grad_norm": 6.666367053985596, - "learning_rate": 2.5381140598531905e-05, - "loss": 1.2452, + "grad_norm": 0.5352344512939453, + "learning_rate": 0.0004230190099755317, + "loss": 0.6184, "step": 4090 }, { "epoch": 0.7716920760399021, - "grad_norm": 53.3235969543457, - "learning_rate": 2.5369847543760587e-05, - "loss": 1.4592, + "grad_norm": 3.9562442302703857, + "learning_rate": 0.0004228307923960098, + "loss": 0.8374, "step": 4100 }, { "epoch": 0.7735742518351214, - "grad_norm": 14.514080047607422, - "learning_rate": 2.5358554488989273e-05, - "loss": 1.4565, + "grad_norm": 4.5890936851501465, + "learning_rate": 0.0004226425748164879, + "loss": 0.7613, "step": 4110 }, { "epoch": 0.7754564276303407, - "grad_norm": 4.159520626068115, - "learning_rate": 2.5347261434217956e-05, - "loss": 1.2396, + "grad_norm": 2.076005458831787, + "learning_rate": 0.000422454357236966, + "loss": 0.697, "step": 4120 }, { "epoch": 0.7773386034255599, - "grad_norm": 6.2468438148498535, - "learning_rate": 2.5335968379446642e-05, - "loss": 1.2646, + "grad_norm": 2.3517327308654785, + "learning_rate": 0.00042226613965744406, + "loss": 0.8244, "step": 4130 }, { "epoch": 0.7792207792207793, - "grad_norm": 14.043899536132812, - "learning_rate": 2.5324675324675325e-05, - "loss": 1.5409, + "grad_norm": 2.801920175552368, + "learning_rate": 0.0004220779220779221, + "loss": 0.9129, "step": 4140 }, { "epoch": 0.7811029550159985, - "grad_norm": 10.741174697875977, - "learning_rate": 2.5313382269904007e-05, - "loss": 1.2822, + "grad_norm": 3.0341615676879883, + "learning_rate": 0.00042188970449840015, + "loss": 0.6649, "step": 4150 }, { "epoch": 0.7829851308112178, - "grad_norm": 5.443540573120117, - "learning_rate": 2.5302089215132694e-05, - "loss": 1.561, + "grad_norm": 1.9421992301940918, + "learning_rate": 0.00042170148691887823, + "loss": 1.1939, "step": 4160 }, { "epoch": 0.784867306606437, - "grad_norm": 4.099931716918945, - "learning_rate": 2.529079616036138e-05, - "loss": 1.6496, + "grad_norm": 2.504599094390869, + "learning_rate": 0.0004215132693393563, + "loss": 1.0883, "step": 4170 }, { "epoch": 0.7867494824016563, - "grad_norm": 4.217696666717529, - "learning_rate": 2.5279503105590062e-05, - "loss": 1.3227, + "grad_norm": 2.8098371028900146, + "learning_rate": 0.0004213250517598344, + "loss": 0.7397, "step": 4180 }, { "epoch": 0.7886316581968756, - "grad_norm": 18.865585327148438, - "learning_rate": 2.526821005081875e-05, - "loss": 1.5359, + "grad_norm": 3.403374433517456, + "learning_rate": 0.00042113683418031246, + "loss": 0.9343, "step": 4190 }, { "epoch": 0.7905138339920948, - "grad_norm": 7.36782693862915, - "learning_rate": 2.525691699604743e-05, - "loss": 1.3648, + "grad_norm": 3.9628524780273438, + "learning_rate": 0.00042094861660079054, + "loss": 0.9282, "step": 4200 }, { "epoch": 0.7923960097873142, - "grad_norm": 11.05923843383789, - "learning_rate": 2.5245623941276117e-05, - "loss": 1.556, + "grad_norm": 4.7820820808410645, + "learning_rate": 0.0004207603990212686, + "loss": 1.0349, "step": 4210 }, { "epoch": 0.7942781855825334, - "grad_norm": 9.203524589538574, - "learning_rate": 2.5234330886504803e-05, - "loss": 1.4646, + "grad_norm": 3.7244391441345215, + "learning_rate": 0.0004205721814417467, + "loss": 0.9997, "step": 4220 }, { "epoch": 0.7961603613777527, - "grad_norm": 9.965381622314453, - "learning_rate": 2.5223037831733483e-05, - "loss": 1.4233, + "grad_norm": 3.3195786476135254, + "learning_rate": 0.0004203839638622247, + "loss": 0.8765, "step": 4230 }, { "epoch": 0.7980425371729719, - "grad_norm": 17.369232177734375, - "learning_rate": 2.521174477696217e-05, - "loss": 1.4067, + "grad_norm": 2.580580234527588, + "learning_rate": 0.0004201957462827028, + "loss": 0.797, "step": 4240 }, { "epoch": 0.7999247129681912, - "grad_norm": 4.989372253417969, - "learning_rate": 2.520045172219085e-05, - "loss": 1.773, + "grad_norm": 2.195201873779297, + "learning_rate": 0.00042000752870318086, + "loss": 1.167, "step": 4250 }, { "epoch": 0.8018068887634106, - "grad_norm": 10.048711776733398, - "learning_rate": 2.5189158667419537e-05, - "loss": 1.8644, + "grad_norm": 2.2976858615875244, + "learning_rate": 0.00041981931112365894, + "loss": 1.1836, "step": 4260 }, { "epoch": 0.8036890645586298, - "grad_norm": 4.219175338745117, - "learning_rate": 2.5177865612648223e-05, - "loss": 1.133, + "grad_norm": 2.622638702392578, + "learning_rate": 0.000419631093544137, + "loss": 0.7872, "step": 4270 }, { "epoch": 0.8055712403538491, - "grad_norm": 20.187997817993164, - "learning_rate": 2.5166572557876906e-05, - "loss": 1.5066, + "grad_norm": 3.801210403442383, + "learning_rate": 0.0004194428759646151, + "loss": 0.8956, "step": 4280 }, { "epoch": 0.8074534161490683, - "grad_norm": 7.595444202423096, - "learning_rate": 2.5155279503105592e-05, - "loss": 1.6849, + "grad_norm": 2.6912474632263184, + "learning_rate": 0.00041925465838509317, + "loss": 1.2465, "step": 4290 }, { "epoch": 0.8093355919442876, - "grad_norm": 14.513413429260254, - "learning_rate": 2.5143986448334275e-05, - "loss": 1.3276, + "grad_norm": 2.3820366859436035, + "learning_rate": 0.00041906644080557125, + "loss": 0.7562, "step": 4300 }, { "epoch": 0.8112177677395068, - "grad_norm": 15.310640335083008, - "learning_rate": 2.513269339356296e-05, - "loss": 1.3818, + "grad_norm": 3.9497461318969727, + "learning_rate": 0.0004188782232260493, + "loss": 0.8316, "step": 4310 }, { "epoch": 0.8130999435347261, - "grad_norm": 10.264412879943848, - "learning_rate": 2.5121400338791644e-05, - "loss": 1.2301, + "grad_norm": 1.9162503480911255, + "learning_rate": 0.0004186900056465274, + "loss": 0.5897, "step": 4320 }, { "epoch": 0.8149821193299455, - "grad_norm": 6.609447002410889, - "learning_rate": 2.5110107284020326e-05, - "loss": 1.646, + "grad_norm": 3.3370280265808105, + "learning_rate": 0.0004185017880670055, + "loss": 1.5245, "step": 4330 }, { "epoch": 0.8168642951251647, - "grad_norm": 5.502641677856445, - "learning_rate": 2.5098814229249012e-05, - "loss": 1.2485, + "grad_norm": 2.484536647796631, + "learning_rate": 0.00041831357048748355, + "loss": 0.7318, "step": 4340 }, { "epoch": 0.818746470920384, - "grad_norm": 10.554330825805664, - "learning_rate": 2.5087521174477695e-05, - "loss": 1.3115, + "grad_norm": 3.806140184402466, + "learning_rate": 0.00041812535290796163, + "loss": 0.7705, "step": 4350 }, { "epoch": 0.8206286467156032, - "grad_norm": 4.31608247756958, - "learning_rate": 2.507622811970638e-05, - "loss": 1.1539, + "grad_norm": 1.3592685461044312, + "learning_rate": 0.0004179371353284397, + "loss": 0.5629, "step": 4360 }, { "epoch": 0.8225108225108225, - "grad_norm": 8.371481895446777, - "learning_rate": 2.5064935064935067e-05, - "loss": 1.4917, + "grad_norm": 4.968286991119385, + "learning_rate": 0.0004177489177489178, + "loss": 1.035, "step": 4370 }, { "epoch": 0.8243929983060417, - "grad_norm": 19.33193016052246, - "learning_rate": 2.505364201016375e-05, - "loss": 1.2167, + "grad_norm": 2.162809133529663, + "learning_rate": 0.00041756070016939586, + "loss": 0.6883, "step": 4380 }, { "epoch": 0.8262751741012611, - "grad_norm": 14.059998512268066, - "learning_rate": 2.5042348955392436e-05, - "loss": 1.4155, + "grad_norm": 3.7793686389923096, + "learning_rate": 0.00041737248258987393, + "loss": 0.8916, "step": 4390 }, { "epoch": 0.8281573498964804, - "grad_norm": 19.17221450805664, - "learning_rate": 2.503105590062112e-05, - "loss": 1.6316, + "grad_norm": 5.029797077178955, + "learning_rate": 0.00041718426501035196, + "loss": 1.0611, "step": 4400 }, { "epoch": 0.8300395256916996, - "grad_norm": 3.8574047088623047, - "learning_rate": 2.50197628458498e-05, - "loss": 1.3991, + "grad_norm": 1.8391931056976318, + "learning_rate": 0.00041699604743083003, + "loss": 1.018, "step": 4410 }, { "epoch": 0.8319217014869189, - "grad_norm": 11.87222671508789, - "learning_rate": 2.5008469791078487e-05, - "loss": 1.5969, + "grad_norm": 2.113842725753784, + "learning_rate": 0.0004168078298513081, + "loss": 0.9259, "step": 4420 }, { "epoch": 0.8338038772821381, - "grad_norm": 5.910922050476074, - "learning_rate": 2.499717673630717e-05, - "loss": 1.4359, + "grad_norm": 1.8083542585372925, + "learning_rate": 0.0004166196122717862, + "loss": 0.8516, "step": 4430 }, { "epoch": 0.8356860530773574, - "grad_norm": 13.17109203338623, - "learning_rate": 2.4985883681535856e-05, - "loss": 1.3405, + "grad_norm": 4.179044723510742, + "learning_rate": 0.00041643139469226426, + "loss": 0.9547, "step": 4440 }, { "epoch": 0.8375682288725766, - "grad_norm": 10.746623039245605, - "learning_rate": 2.4974590626764542e-05, - "loss": 1.4437, + "grad_norm": 4.096711158752441, + "learning_rate": 0.00041624317711274234, + "loss": 1.0307, "step": 4450 }, { "epoch": 0.839450404667796, - "grad_norm": 18.121070861816406, - "learning_rate": 2.4963297571993225e-05, - "loss": 1.4979, + "grad_norm": 2.8632447719573975, + "learning_rate": 0.0004160549595332204, + "loss": 1.0436, "step": 4460 }, { "epoch": 0.8413325804630153, - "grad_norm": 8.530396461486816, - "learning_rate": 2.495200451722191e-05, - "loss": 1.7246, + "grad_norm": 3.738539695739746, + "learning_rate": 0.0004158667419536985, + "loss": 1.0449, "step": 4470 }, { "epoch": 0.8432147562582345, - "grad_norm": 16.504993438720703, - "learning_rate": 2.4940711462450594e-05, - "loss": 1.4092, + "grad_norm": 4.9758148193359375, + "learning_rate": 0.00041567852437417657, + "loss": 0.7716, "step": 4480 }, { "epoch": 0.8450969320534538, - "grad_norm": 7.536314010620117, - "learning_rate": 2.4929418407679276e-05, - "loss": 0.9995, + "grad_norm": 1.671683669090271, + "learning_rate": 0.0004154903067946546, + "loss": 0.5224, "step": 4490 }, { "epoch": 0.846979107848673, - "grad_norm": 5.592616081237793, - "learning_rate": 2.4918125352907963e-05, - "loss": 1.3312, + "grad_norm": 1.9305776357650757, + "learning_rate": 0.00041530208921513267, + "loss": 0.8533, "step": 4500 }, { "epoch": 0.8488612836438924, - "grad_norm": 18.722604751586914, - "learning_rate": 2.4906832298136645e-05, - "loss": 1.4819, + "grad_norm": 4.232862949371338, + "learning_rate": 0.00041511387163561074, + "loss": 0.9005, "step": 4510 }, { "epoch": 0.8507434594391116, - "grad_norm": 16.095701217651367, - "learning_rate": 2.489553924336533e-05, - "loss": 1.4257, + "grad_norm": 2.9629571437835693, + "learning_rate": 0.0004149256540560888, + "loss": 0.8665, "step": 4520 }, { "epoch": 0.8526256352343309, - "grad_norm": 11.100549697875977, - "learning_rate": 2.4884246188594014e-05, - "loss": 1.4673, + "grad_norm": 3.31140398979187, + "learning_rate": 0.0004147374364765669, + "loss": 0.9225, "step": 4530 }, { "epoch": 0.8545078110295502, - "grad_norm": 22.106538772583008, - "learning_rate": 2.48729531338227e-05, - "loss": 1.3014, + "grad_norm": 4.2920823097229, + "learning_rate": 0.000414549218897045, + "loss": 0.5833, "step": 4540 }, { "epoch": 0.8563899868247694, - "grad_norm": 9.377236366271973, - "learning_rate": 2.4861660079051386e-05, - "loss": 1.5026, + "grad_norm": 3.0021259784698486, + "learning_rate": 0.0004143610013175231, + "loss": 1.0226, "step": 4550 }, { "epoch": 0.8582721626199887, - "grad_norm": 7.03391695022583, - "learning_rate": 2.485036702428007e-05, - "loss": 1.3629, + "grad_norm": 2.458348035812378, + "learning_rate": 0.0004141727837380012, + "loss": 0.8703, "step": 4560 }, { "epoch": 0.8601543384152079, - "grad_norm": 19.828781127929688, - "learning_rate": 2.4839073969508755e-05, - "loss": 1.6024, + "grad_norm": 4.885277271270752, + "learning_rate": 0.00041398456615847926, + "loss": 1.175, "step": 4570 }, { "epoch": 0.8620365142104273, - "grad_norm": 14.324966430664062, - "learning_rate": 2.4827780914737434e-05, - "loss": 1.3317, + "grad_norm": 5.482606410980225, + "learning_rate": 0.0004137963485789573, + "loss": 0.8186, "step": 4580 }, { "epoch": 0.8639186900056465, - "grad_norm": 17.969161987304688, - "learning_rate": 2.481648785996612e-05, - "loss": 1.5452, + "grad_norm": 4.906975746154785, + "learning_rate": 0.00041360813099943535, + "loss": 0.9061, "step": 4590 }, { "epoch": 0.8658008658008658, - "grad_norm": 28.252727508544922, - "learning_rate": 2.4805194805194806e-05, - "loss": 1.3952, + "grad_norm": 3.1498374938964844, + "learning_rate": 0.00041341991341991343, + "loss": 0.9264, "step": 4600 }, { "epoch": 0.8676830415960851, - "grad_norm": 6.551124095916748, - "learning_rate": 2.479390175042349e-05, - "loss": 1.4017, + "grad_norm": 3.5111498832702637, + "learning_rate": 0.0004132316958403915, + "loss": 1.0421, "step": 4610 }, { "epoch": 0.8695652173913043, - "grad_norm": 9.968870162963867, - "learning_rate": 2.4782608695652175e-05, - "loss": 1.4794, + "grad_norm": 2.239441156387329, + "learning_rate": 0.0004130434782608696, + "loss": 0.922, "step": 4620 }, { "epoch": 0.8714473931865236, - "grad_norm": 10.416319847106934, - "learning_rate": 2.4771315640880858e-05, - "loss": 1.4633, + "grad_norm": 2.3396077156066895, + "learning_rate": 0.00041285526068134766, + "loss": 0.9517, "step": 4630 }, { "epoch": 0.8733295689817429, - "grad_norm": 12.675484657287598, - "learning_rate": 2.4760022586109544e-05, - "loss": 1.1787, + "grad_norm": 3.4688830375671387, + "learning_rate": 0.00041266704310182574, + "loss": 0.6071, "step": 4640 }, { "epoch": 0.8752117447769622, - "grad_norm": 4.599917411804199, - "learning_rate": 2.474872953133823e-05, - "loss": 1.4834, + "grad_norm": 3.2922751903533936, + "learning_rate": 0.0004124788255223038, + "loss": 0.892, "step": 4650 }, { "epoch": 0.8770939205721815, - "grad_norm": 23.191003799438477, - "learning_rate": 2.473743647656691e-05, - "loss": 1.2879, + "grad_norm": 2.0869195461273193, + "learning_rate": 0.00041229060794278183, + "loss": 0.8705, "step": 4660 }, { "epoch": 0.8789760963674007, - "grad_norm": 10.024175643920898, - "learning_rate": 2.4726143421795595e-05, - "loss": 1.1128, + "grad_norm": 0.24055927991867065, + "learning_rate": 0.0004121023903632599, + "loss": 0.722, "step": 4670 }, { "epoch": 0.88085827216262, - "grad_norm": 3.1574692726135254, - "learning_rate": 2.471485036702428e-05, - "loss": 1.2905, + "grad_norm": 1.8887193202972412, + "learning_rate": 0.000411914172783738, + "loss": 0.9261, "step": 4680 }, { "epoch": 0.8827404479578392, - "grad_norm": 5.446253776550293, - "learning_rate": 2.4703557312252964e-05, - "loss": 1.5343, + "grad_norm": 9.118612289428711, + "learning_rate": 0.00041172595520421606, + "loss": 1.1263, "step": 4690 }, { "epoch": 0.8846226237530586, - "grad_norm": 13.522638320922852, - "learning_rate": 2.469226425748165e-05, - "loss": 1.3689, + "grad_norm": 4.854977607727051, + "learning_rate": 0.00041153773762469414, + "loss": 0.7742, "step": 4700 }, { "epoch": 0.8865047995482778, - "grad_norm": 20.1780948638916, - "learning_rate": 2.4680971202710333e-05, - "loss": 1.4354, + "grad_norm": 2.078373908996582, + "learning_rate": 0.0004113495200451722, + "loss": 0.8702, "step": 4710 }, { "epoch": 0.8883869753434971, - "grad_norm": 10.403576850891113, - "learning_rate": 2.466967814793902e-05, - "loss": 1.5017, + "grad_norm": 3.6736721992492676, + "learning_rate": 0.0004111613024656503, + "loss": 0.9657, "step": 4720 }, { "epoch": 0.8902691511387164, - "grad_norm": 9.197463989257812, - "learning_rate": 2.4658385093167705e-05, - "loss": 1.2154, + "grad_norm": 3.352625846862793, + "learning_rate": 0.00041097308488612837, + "loss": 0.6156, "step": 4730 }, { "epoch": 0.8921513269339356, - "grad_norm": 11.027040481567383, - "learning_rate": 2.4647092038396388e-05, - "loss": 1.1206, + "grad_norm": 5.416534900665283, + "learning_rate": 0.00041078486730660645, + "loss": 0.8606, "step": 4740 }, { "epoch": 0.8940335027291549, - "grad_norm": 10.964394569396973, - "learning_rate": 2.463579898362507e-05, - "loss": 1.3562, + "grad_norm": 1.8365694284439087, + "learning_rate": 0.00041059664972708447, + "loss": 0.9732, "step": 4750 }, { "epoch": 0.8959156785243741, - "grad_norm": 8.044187545776367, - "learning_rate": 2.4624505928853753e-05, - "loss": 1.1975, + "grad_norm": 1.2886053323745728, + "learning_rate": 0.00041040843214756254, + "loss": 0.5367, "step": 4760 }, { "epoch": 0.8977978543195935, - "grad_norm": 12.479483604431152, - "learning_rate": 2.461321287408244e-05, - "loss": 1.3726, + "grad_norm": 1.6298965215682983, + "learning_rate": 0.0004102202145680407, + "loss": 0.8823, "step": 4770 }, { "epoch": 0.8996800301148127, - "grad_norm": 10.56410026550293, - "learning_rate": 2.4601919819311125e-05, - "loss": 1.3986, + "grad_norm": 3.4384331703186035, + "learning_rate": 0.00041003199698851875, + "loss": 0.9704, "step": 4780 }, { "epoch": 0.901562205910032, - "grad_norm": 13.470977783203125, - "learning_rate": 2.4590626764539808e-05, - "loss": 1.2694, + "grad_norm": 2.444072961807251, + "learning_rate": 0.00040984377940899683, + "loss": 0.7552, "step": 4790 }, { "epoch": 0.9034443817052513, - "grad_norm": 4.0369744300842285, - "learning_rate": 2.4579333709768494e-05, - "loss": 1.0665, + "grad_norm": 1.4015096426010132, + "learning_rate": 0.0004096555618294749, + "loss": 0.8217, "step": 4800 }, { "epoch": 0.9053265575004705, - "grad_norm": 15.835465431213379, - "learning_rate": 2.4568040654997177e-05, - "loss": 1.2974, + "grad_norm": 3.5089902877807617, + "learning_rate": 0.000409467344249953, + "loss": 0.8258, "step": 4810 }, { "epoch": 0.9072087332956899, - "grad_norm": 13.87853717803955, - "learning_rate": 2.4556747600225863e-05, - "loss": 1.3459, + "grad_norm": 3.8666586875915527, + "learning_rate": 0.00040927912667043106, + "loss": 0.8321, "step": 4820 }, { "epoch": 0.9090909090909091, - "grad_norm": 27.265249252319336, - "learning_rate": 2.454545454545455e-05, - "loss": 1.3109, + "grad_norm": 2.164626359939575, + "learning_rate": 0.00040909090909090913, + "loss": 0.8906, "step": 4830 }, { "epoch": 0.9109730848861284, - "grad_norm": 5.671602725982666, - "learning_rate": 2.4534161490683228e-05, - "loss": 1.6534, + "grad_norm": 1.8831568956375122, + "learning_rate": 0.00040890269151138716, + "loss": 1.1285, "step": 4840 }, { "epoch": 0.9128552606813476, - "grad_norm": 4.075274467468262, - "learning_rate": 2.4522868435911914e-05, - "loss": 1.3106, + "grad_norm": 2.536550998687744, + "learning_rate": 0.00040871447393186523, + "loss": 0.8967, "step": 4850 }, { "epoch": 0.9147374364765669, - "grad_norm": 17.051898956298828, - "learning_rate": 2.4511575381140597e-05, - "loss": 1.2559, + "grad_norm": 2.850433826446533, + "learning_rate": 0.0004085262563523433, + "loss": 0.7869, "step": 4860 }, { "epoch": 0.9166196122717862, - "grad_norm": 14.164190292358398, - "learning_rate": 2.4500282326369283e-05, - "loss": 1.4232, + "grad_norm": 3.7627031803131104, + "learning_rate": 0.0004083380387728214, + "loss": 0.8574, "step": 4870 }, { "epoch": 0.9185017880670054, - "grad_norm": 4.818867206573486, - "learning_rate": 2.448898927159797e-05, - "loss": 1.1879, + "grad_norm": 3.227433919906616, + "learning_rate": 0.00040814982119329946, + "loss": 0.6237, "step": 4880 }, { "epoch": 0.9203839638622248, - "grad_norm": 6.399837493896484, - "learning_rate": 2.4477696216826652e-05, - "loss": 1.3137, + "grad_norm": 2.9040145874023438, + "learning_rate": 0.00040796160361377754, + "loss": 0.8478, "step": 4890 }, { "epoch": 0.922266139657444, - "grad_norm": 26.00821304321289, - "learning_rate": 2.4466403162055338e-05, - "loss": 1.4008, + "grad_norm": 4.651259899139404, + "learning_rate": 0.0004077733860342556, + "loss": 0.9602, "step": 4900 }, { "epoch": 0.9241483154526633, - "grad_norm": 19.372074127197266, - "learning_rate": 2.445511010728402e-05, - "loss": 1.4288, + "grad_norm": 2.856947183609009, + "learning_rate": 0.0004075851684547337, + "loss": 1.1861, "step": 4910 }, { "epoch": 0.9260304912478825, - "grad_norm": 21.262939453125, - "learning_rate": 2.4443817052512707e-05, - "loss": 1.3958, + "grad_norm": 3.3068671226501465, + "learning_rate": 0.00040739695087521177, + "loss": 0.9059, "step": 4920 }, { "epoch": 0.9279126670431018, - "grad_norm": 8.744524955749512, - "learning_rate": 2.443252399774139e-05, - "loss": 1.2075, + "grad_norm": 2.0299556255340576, + "learning_rate": 0.0004072087332956898, + "loss": 0.9159, "step": 4930 }, { "epoch": 0.9297948428383211, - "grad_norm": 19.05074691772461, - "learning_rate": 2.4421230942970072e-05, - "loss": 1.4352, + "grad_norm": 4.704328536987305, + "learning_rate": 0.00040702051571616787, + "loss": 1.2919, "step": 4940 }, { "epoch": 0.9316770186335404, - "grad_norm": 4.271505355834961, - "learning_rate": 2.4409937888198758e-05, - "loss": 0.8662, + "grad_norm": 4.627618312835693, + "learning_rate": 0.00040683229813664594, + "loss": 0.5245, "step": 4950 }, { "epoch": 0.9335591944287597, - "grad_norm": 11.079838752746582, - "learning_rate": 2.4398644833427444e-05, - "loss": 0.9989, + "grad_norm": 4.851251125335693, + "learning_rate": 0.000406644080557124, + "loss": 0.6766, "step": 4960 }, { "epoch": 0.9354413702239789, - "grad_norm": 12.747160911560059, - "learning_rate": 2.4387351778656127e-05, - "loss": 1.3546, + "grad_norm": 3.3305420875549316, + "learning_rate": 0.0004064558629776021, + "loss": 0.966, "step": 4970 }, { "epoch": 0.9373235460191982, - "grad_norm": 9.06821060180664, - "learning_rate": 2.4376058723884813e-05, - "loss": 1.237, + "grad_norm": 3.7064177989959717, + "learning_rate": 0.0004062676453980802, + "loss": 0.8435, "step": 4980 }, { "epoch": 0.9392057218144174, - "grad_norm": 18.579593658447266, - "learning_rate": 2.4364765669113496e-05, - "loss": 1.0582, + "grad_norm": 3.640709161758423, + "learning_rate": 0.0004060794278185583, + "loss": 0.6872, "step": 4990 }, { "epoch": 0.9410878976096367, - "grad_norm": 25.581363677978516, - "learning_rate": 2.435347261434218e-05, - "loss": 1.1142, + "grad_norm": 4.062694549560547, + "learning_rate": 0.0004058912102390364, + "loss": 0.8231, "step": 5000 }, { "epoch": 0.9429700734048561, - "grad_norm": 3.7508864402770996, - "learning_rate": 2.4342179559570864e-05, - "loss": 1.1593, + "grad_norm": 5.525752544403076, + "learning_rate": 0.0004057029926595144, + "loss": 0.7747, "step": 5010 }, { "epoch": 0.9448522492000753, - "grad_norm": 12.199710845947266, - "learning_rate": 2.4330886504799547e-05, - "loss": 1.2923, + "grad_norm": 3.2349648475646973, + "learning_rate": 0.0004055147750799925, + "loss": 0.9788, "step": 5020 }, { "epoch": 0.9467344249952946, - "grad_norm": 15.573914527893066, - "learning_rate": 2.4319593450028233e-05, - "loss": 1.2529, + "grad_norm": 3.297736406326294, + "learning_rate": 0.00040532655750047055, + "loss": 0.9376, "step": 5030 }, { "epoch": 0.9486166007905138, - "grad_norm": 10.434758186340332, - "learning_rate": 2.4308300395256916e-05, - "loss": 1.2285, + "grad_norm": 4.1556549072265625, + "learning_rate": 0.00040513833992094863, + "loss": 0.8717, "step": 5040 }, { "epoch": 0.9504987765857331, - "grad_norm": 14.420498847961426, - "learning_rate": 2.4297007340485602e-05, - "loss": 1.3006, + "grad_norm": 4.240426063537598, + "learning_rate": 0.0004049501223414267, + "loss": 0.8023, "step": 5050 }, { "epoch": 0.9523809523809523, - "grad_norm": 32.99720764160156, - "learning_rate": 2.4285714285714288e-05, - "loss": 1.4461, + "grad_norm": 3.5286829471588135, + "learning_rate": 0.0004047619047619048, + "loss": 0.8408, "step": 5060 }, { "epoch": 0.9542631281761716, - "grad_norm": 4.45308780670166, - "learning_rate": 2.427442123094297e-05, - "loss": 1.1251, + "grad_norm": 3.1347432136535645, + "learning_rate": 0.00040457368718238286, + "loss": 0.6977, "step": 5070 }, { "epoch": 0.956145303971391, - "grad_norm": 23.680532455444336, - "learning_rate": 2.4263128176171657e-05, - "loss": 1.0521, + "grad_norm": 2.5033462047576904, + "learning_rate": 0.00040438546960286094, + "loss": 0.8087, "step": 5080 }, { "epoch": 0.9580274797666102, - "grad_norm": 7.266300201416016, - "learning_rate": 2.425183512140034e-05, - "loss": 1.2313, + "grad_norm": 1.7074840068817139, + "learning_rate": 0.000404197252023339, + "loss": 0.8927, "step": 5090 }, { "epoch": 0.9599096555618295, - "grad_norm": 9.216297149658203, - "learning_rate": 2.4240542066629022e-05, - "loss": 1.1339, + "grad_norm": 5.167369365692139, + "learning_rate": 0.00040400903444381703, + "loss": 0.9689, "step": 5100 }, { "epoch": 0.9617918313570487, - "grad_norm": 11.177430152893066, - "learning_rate": 2.4229249011857708e-05, - "loss": 1.1926, + "grad_norm": 2.3252077102661133, + "learning_rate": 0.0004038208168642951, + "loss": 0.8471, "step": 5110 }, { "epoch": 0.963674007152268, - "grad_norm": 21.9661808013916, - "learning_rate": 2.421795595708639e-05, - "loss": 1.4161, + "grad_norm": 3.6321616172790527, + "learning_rate": 0.0004036325992847732, + "loss": 0.9404, "step": 5120 }, { "epoch": 0.9655561829474872, - "grad_norm": 12.636842727661133, - "learning_rate": 2.4206662902315077e-05, - "loss": 1.1809, + "grad_norm": 3.3031723499298096, + "learning_rate": 0.00040344438170525126, + "loss": 0.7534, "step": 5130 }, { "epoch": 0.9674383587427066, - "grad_norm": 12.683283805847168, - "learning_rate": 2.419536984754376e-05, - "loss": 1.2115, + "grad_norm": 3.7766780853271484, + "learning_rate": 0.00040325616412572934, + "loss": 0.7902, "step": 5140 }, { "epoch": 0.9693205345379259, - "grad_norm": 14.036438941955566, - "learning_rate": 2.4184076792772446e-05, - "loss": 1.4084, + "grad_norm": 4.320493221282959, + "learning_rate": 0.0004030679465462074, + "loss": 0.8989, "step": 5150 }, { "epoch": 0.9712027103331451, - "grad_norm": 8.703644752502441, - "learning_rate": 2.4172783738001132e-05, - "loss": 1.3985, + "grad_norm": 3.0502378940582275, + "learning_rate": 0.0004028797289666855, + "loss": 1.1857, "step": 5160 }, { "epoch": 0.9730848861283644, - "grad_norm": 15.209477424621582, - "learning_rate": 2.4161490683229814e-05, - "loss": 1.3594, + "grad_norm": 6.050513744354248, + "learning_rate": 0.00040269151138716357, + "loss": 1.2846, "step": 5170 }, { "epoch": 0.9749670619235836, - "grad_norm": 8.397993087768555, - "learning_rate": 2.41501976284585e-05, - "loss": 1.2813, + "grad_norm": 2.5459749698638916, + "learning_rate": 0.00040250329380764165, + "loss": 0.7691, "step": 5180 }, { "epoch": 0.9768492377188029, - "grad_norm": 11.615766525268555, - "learning_rate": 2.4138904573687183e-05, - "loss": 1.1447, + "grad_norm": 3.656423568725586, + "learning_rate": 0.00040231507622811967, + "loss": 0.7498, "step": 5190 }, { "epoch": 0.9787314135140223, - "grad_norm": 3.9449048042297363, - "learning_rate": 2.4127611518915866e-05, - "loss": 1.0322, + "grad_norm": 3.0081403255462646, + "learning_rate": 0.00040212685864859774, + "loss": 0.7818, "step": 5200 }, { "epoch": 0.9806135893092415, - "grad_norm": 23.256526947021484, - "learning_rate": 2.4116318464144552e-05, - "loss": 1.0706, + "grad_norm": 6.50275182723999, + "learning_rate": 0.0004019386410690759, + "loss": 0.8074, "step": 5210 }, { "epoch": 0.9824957651044608, - "grad_norm": 6.652702808380127, - "learning_rate": 2.4105025409373235e-05, - "loss": 1.2482, + "grad_norm": 8.647149085998535, + "learning_rate": 0.00040175042348955395, + "loss": 0.9535, "step": 5220 }, { "epoch": 0.98437794089968, - "grad_norm": 27.787336349487305, - "learning_rate": 2.409373235460192e-05, - "loss": 1.2317, + "grad_norm": 1.342649221420288, + "learning_rate": 0.00040156220591003203, + "loss": 0.7369, "step": 5230 }, { "epoch": 0.9862601166948993, - "grad_norm": 4.631372451782227, - "learning_rate": 2.4082439299830607e-05, - "loss": 0.9322, + "grad_norm": 3.2202084064483643, + "learning_rate": 0.0004013739883305101, + "loss": 0.5109, "step": 5240 }, { "epoch": 0.9881422924901185, - "grad_norm": 7.675413131713867, - "learning_rate": 2.407114624505929e-05, - "loss": 1.1535, + "grad_norm": 3.334763765335083, + "learning_rate": 0.0004011857707509882, + "loss": 0.9238, "step": 5250 }, { "epoch": 0.9900244682853379, - "grad_norm": 11.056670188903809, - "learning_rate": 2.4059853190287976e-05, - "loss": 1.066, + "grad_norm": 1.7026392221450806, + "learning_rate": 0.00040099755317146626, + "loss": 0.841, "step": 5260 }, { "epoch": 0.9919066440805572, - "grad_norm": 2.178229808807373, - "learning_rate": 2.4048560135516655e-05, - "loss": 1.0266, + "grad_norm": 1.0753185749053955, + "learning_rate": 0.0004008093355919443, + "loss": 0.7236, "step": 5270 }, { "epoch": 0.9937888198757764, - "grad_norm": 5.711111068725586, - "learning_rate": 2.403726708074534e-05, - "loss": 1.2493, + "grad_norm": 2.5947108268737793, + "learning_rate": 0.00040062111801242236, + "loss": 1.0401, "step": 5280 }, { "epoch": 0.9956709956709957, - "grad_norm": 27.90021514892578, - "learning_rate": 2.4025974025974027e-05, - "loss": 1.1559, + "grad_norm": 2.397989273071289, + "learning_rate": 0.00040043290043290043, + "loss": 0.5921, "step": 5290 }, { "epoch": 0.9975531714662149, - "grad_norm": 8.907947540283203, - "learning_rate": 2.401468097120271e-05, - "loss": 1.0816, + "grad_norm": 2.8067519664764404, + "learning_rate": 0.0004002446828533785, + "loss": 0.9171, "step": 5300 }, { "epoch": 0.9994353472614342, - "grad_norm": 21.335691452026367, - "learning_rate": 2.4003387916431396e-05, - "loss": 1.0634, + "grad_norm": 3.1910457611083984, + "learning_rate": 0.0004000564652738566, + "loss": 0.5206, "step": 5310 }, { "epoch": 1.0, - "eval_accuracy": 0.8777333333333334, - "eval_loss": 0.7235307097434998, - "eval_runtime": 244.378, - "eval_samples_per_second": 30.69, - "eval_steps_per_second": 3.838, + "eval_accuracy": 0.9050666666666667, + "eval_loss": 0.33233147859573364, + "eval_runtime": 307.7593, + "eval_samples_per_second": 24.37, + "eval_steps_per_second": 3.048, "step": 5313 }, { "epoch": 1.0013175230566536, - "grad_norm": 5.803168773651123, - "learning_rate": 2.399209486166008e-05, - "loss": 1.4338, + "grad_norm": 2.1240122318267822, + "learning_rate": 0.00039986824769433466, + "loss": 1.0903, "step": 5320 }, { "epoch": 1.0031996988518728, - "grad_norm": 16.894943237304688, - "learning_rate": 2.3980801806888765e-05, - "loss": 1.3967, + "grad_norm": 2.710055351257324, + "learning_rate": 0.00039968003011481274, + "loss": 0.9562, "step": 5330 }, { "epoch": 1.005081874647092, - "grad_norm": 9.325886726379395, - "learning_rate": 2.396950875211745e-05, - "loss": 1.1418, + "grad_norm": 3.0997753143310547, + "learning_rate": 0.0003994918125352908, + "loss": 0.7154, "step": 5340 }, { "epoch": 1.0069640504423114, - "grad_norm": 15.385432243347168, - "learning_rate": 2.3958215697346133e-05, - "loss": 1.1674, + "grad_norm": 2.385953903198242, + "learning_rate": 0.0003993035949557689, + "loss": 0.933, "step": 5350 }, { "epoch": 1.0088462262375306, - "grad_norm": 16.328964233398438, - "learning_rate": 2.3946922642574816e-05, - "loss": 1.2397, + "grad_norm": 2.483708620071411, + "learning_rate": 0.0003991153773762469, + "loss": 0.7209, "step": 5360 }, { "epoch": 1.0107284020327498, - "grad_norm": 16.835899353027344, - "learning_rate": 2.39356295878035e-05, - "loss": 1.1855, + "grad_norm": 2.988840341567993, + "learning_rate": 0.000398927159796725, + "loss": 0.9888, "step": 5370 }, { "epoch": 1.012610577827969, - "grad_norm": 1.2168173789978027, - "learning_rate": 2.3924336533032185e-05, - "loss": 1.185, + "grad_norm": 0.3290826380252838, + "learning_rate": 0.00039873894221720307, + "loss": 0.9194, "step": 5380 }, { "epoch": 1.0144927536231885, - "grad_norm": 17.501239776611328, - "learning_rate": 2.391304347826087e-05, - "loss": 1.0554, + "grad_norm": 3.4336321353912354, + "learning_rate": 0.00039855072463768114, + "loss": 0.6798, "step": 5390 }, { "epoch": 1.0163749294184077, - "grad_norm": 7.978825569152832, - "learning_rate": 2.3901750423489554e-05, - "loss": 1.0179, + "grad_norm": 2.0959396362304688, + "learning_rate": 0.0003983625070581592, + "loss": 0.7914, "step": 5400 }, { "epoch": 1.018257105213627, - "grad_norm": 21.958223342895508, - "learning_rate": 2.389045736871824e-05, - "loss": 1.2771, + "grad_norm": 3.9184632301330566, + "learning_rate": 0.0003981742894786373, + "loss": 0.798, "step": 5410 }, { "epoch": 1.0201392810088463, - "grad_norm": 10.229572296142578, - "learning_rate": 2.3879164313946926e-05, - "loss": 0.9884, + "grad_norm": 5.386630535125732, + "learning_rate": 0.00039798607189911537, + "loss": 0.7589, "step": 5420 }, { "epoch": 1.0220214568040655, - "grad_norm": 10.10629653930664, - "learning_rate": 2.386787125917561e-05, - "loss": 1.2098, + "grad_norm": 3.6790056228637695, + "learning_rate": 0.0003977978543195935, + "loss": 1.0433, "step": 5430 }, { "epoch": 1.0239036325992847, - "grad_norm": 8.695528030395508, - "learning_rate": 2.3856578204404294e-05, - "loss": 1.0261, + "grad_norm": 3.881471872329712, + "learning_rate": 0.0003976096367400716, + "loss": 0.8501, "step": 5440 }, { "epoch": 1.025785808394504, - "grad_norm": 3.444990873336792, - "learning_rate": 2.3845285149632974e-05, - "loss": 1.0464, + "grad_norm": 2.0723538398742676, + "learning_rate": 0.0003974214191605496, + "loss": 0.8193, "step": 5450 }, { "epoch": 1.0276679841897234, - "grad_norm": 6.334211826324463, - "learning_rate": 2.383399209486166e-05, - "loss": 1.1021, + "grad_norm": 2.0003159046173096, + "learning_rate": 0.0003972332015810277, + "loss": 0.8174, "step": 5460 }, { "epoch": 1.0295501599849426, - "grad_norm": 7.3270978927612305, - "learning_rate": 2.3822699040090346e-05, - "loss": 1.0433, + "grad_norm": 1.533605933189392, + "learning_rate": 0.00039704498400150575, + "loss": 0.6906, "step": 5470 }, { "epoch": 1.0314323357801618, - "grad_norm": 19.58551788330078, - "learning_rate": 2.381140598531903e-05, - "loss": 0.9511, + "grad_norm": 3.0430614948272705, + "learning_rate": 0.00039685676642198383, + "loss": 0.6862, "step": 5480 }, { "epoch": 1.0333145115753812, - "grad_norm": 11.17794132232666, - "learning_rate": 2.3800112930547715e-05, - "loss": 1.3029, + "grad_norm": 4.184202671051025, + "learning_rate": 0.0003966685488424619, + "loss": 1.1837, "step": 5490 }, { "epoch": 1.0351966873706004, - "grad_norm": 20.934467315673828, - "learning_rate": 2.3788819875776397e-05, - "loss": 1.2481, + "grad_norm": 7.424351692199707, + "learning_rate": 0.00039648033126294, + "loss": 1.0442, "step": 5500 }, { "epoch": 1.0370788631658197, - "grad_norm": 5.4833083152771, - "learning_rate": 2.3777526821005083e-05, - "loss": 0.9677, + "grad_norm": 2.174436569213867, + "learning_rate": 0.00039629211368341806, + "loss": 0.7614, "step": 5510 }, { "epoch": 1.0389610389610389, - "grad_norm": 17.41068458557129, - "learning_rate": 2.376623376623377e-05, - "loss": 1.0862, + "grad_norm": 2.585970401763916, + "learning_rate": 0.00039610389610389614, + "loss": 0.7771, "step": 5520 }, { "epoch": 1.0408432147562583, - "grad_norm": 10.463713645935059, - "learning_rate": 2.3754940711462452e-05, - "loss": 1.1165, + "grad_norm": 2.9651038646698, + "learning_rate": 0.0003959156785243742, + "loss": 0.6379, "step": 5530 }, { "epoch": 1.0427253905514775, - "grad_norm": 4.4030232429504395, - "learning_rate": 2.3743647656691135e-05, - "loss": 0.8207, + "grad_norm": 2.588423013687134, + "learning_rate": 0.00039572746094485223, + "loss": 0.5153, "step": 5540 }, { "epoch": 1.0446075663466967, - "grad_norm": 8.17664909362793, - "learning_rate": 2.3732354601919818e-05, - "loss": 0.8548, + "grad_norm": 2.387333393096924, + "learning_rate": 0.0003955392433653303, + "loss": 0.5602, "step": 5550 }, { "epoch": 1.0464897421419161, - "grad_norm": 18.136306762695312, - "learning_rate": 2.3721061547148504e-05, - "loss": 1.0368, + "grad_norm": 2.9230353832244873, + "learning_rate": 0.0003953510257858084, + "loss": 0.7409, "step": 5560 }, { "epoch": 1.0483719179371354, - "grad_norm": 8.143134117126465, - "learning_rate": 2.370976849237719e-05, - "loss": 1.1101, + "grad_norm": 1.8143259286880493, + "learning_rate": 0.00039516280820628646, + "loss": 0.8831, "step": 5570 }, { "epoch": 1.0502540937323546, - "grad_norm": 17.662364959716797, - "learning_rate": 2.3698475437605872e-05, - "loss": 1.3467, + "grad_norm": 5.663400650024414, + "learning_rate": 0.00039497459062676454, + "loss": 1.0302, "step": 5580 }, { "epoch": 1.0521362695275738, - "grad_norm": 18.27084732055664, - "learning_rate": 2.368718238283456e-05, - "loss": 0.7784, + "grad_norm": 2.289546489715576, + "learning_rate": 0.0003947863730472426, + "loss": 0.4768, "step": 5590 }, { "epoch": 1.0540184453227932, - "grad_norm": 20.256084442138672, - "learning_rate": 2.367588932806324e-05, - "loss": 0.9596, + "grad_norm": 3.498897075653076, + "learning_rate": 0.0003945981554677207, + "loss": 0.8049, "step": 5600 }, { "epoch": 1.0559006211180124, - "grad_norm": 5.476850509643555, - "learning_rate": 2.3664596273291927e-05, - "loss": 1.2, + "grad_norm": 1.5211207866668701, + "learning_rate": 0.00039440993788819877, + "loss": 0.8888, "step": 5610 }, { "epoch": 1.0577827969132316, - "grad_norm": 13.74804973602295, - "learning_rate": 2.365330321852061e-05, - "loss": 1.3438, + "grad_norm": 2.7362804412841797, + "learning_rate": 0.0003942217203086768, + "loss": 1.1457, "step": 5620 }, { "epoch": 1.059664972708451, - "grad_norm": 7.789999008178711, - "learning_rate": 2.3642010163749293e-05, - "loss": 0.8268, + "grad_norm": 3.4366722106933594, + "learning_rate": 0.00039403350272915487, + "loss": 0.6405, "step": 5630 }, { "epoch": 1.0615471485036703, - "grad_norm": 10.119312286376953, - "learning_rate": 2.363071710897798e-05, - "loss": 1.03, + "grad_norm": 2.035243034362793, + "learning_rate": 0.00039384528514963294, + "loss": 0.724, "step": 5640 }, { "epoch": 1.0634293242988895, - "grad_norm": 17.798921585083008, - "learning_rate": 2.361942405420666e-05, - "loss": 0.9418, + "grad_norm": 3.6405818462371826, + "learning_rate": 0.0003936570675701111, + "loss": 0.7233, "step": 5650 }, { "epoch": 1.0653115000941087, - "grad_norm": 16.53011703491211, - "learning_rate": 2.3608130999435347e-05, - "loss": 1.0546, + "grad_norm": 1.665967583656311, + "learning_rate": 0.00039346884999058915, + "loss": 0.7456, "step": 5660 }, { "epoch": 1.0671936758893281, - "grad_norm": 8.095433235168457, - "learning_rate": 2.3596837944664034e-05, - "loss": 0.9267, + "grad_norm": 2.409528970718384, + "learning_rate": 0.00039328063241106723, + "loss": 0.6585, "step": 5670 }, { "epoch": 1.0690758516845473, - "grad_norm": 29.615936279296875, - "learning_rate": 2.3585544889892716e-05, - "loss": 1.2167, + "grad_norm": 4.774532794952393, + "learning_rate": 0.0003930924148315453, + "loss": 1.0393, "step": 5680 }, { "epoch": 1.0709580274797665, - "grad_norm": 2.900878667831421, - "learning_rate": 2.3574251835121402e-05, - "loss": 0.9583, + "grad_norm": 6.189159393310547, + "learning_rate": 0.0003929041972520234, + "loss": 0.7568, "step": 5690 }, { "epoch": 1.072840203274986, - "grad_norm": 2.4044950008392334, - "learning_rate": 2.356295878035009e-05, - "loss": 0.8682, + "grad_norm": 0.9074216485023499, + "learning_rate": 0.00039271597967250146, + "loss": 0.7797, "step": 5700 }, { "epoch": 1.0747223790702052, - "grad_norm": 3.0829226970672607, - "learning_rate": 2.3551665725578768e-05, - "loss": 0.9567, + "grad_norm": 1.9530513286590576, + "learning_rate": 0.0003925277620929795, + "loss": 0.7868, "step": 5710 }, { "epoch": 1.0766045548654244, - "grad_norm": 8.497003555297852, - "learning_rate": 2.3540372670807454e-05, - "loss": 0.889, + "grad_norm": 4.772610187530518, + "learning_rate": 0.00039233954451345756, + "loss": 0.5664, "step": 5720 }, { "epoch": 1.0784867306606438, - "grad_norm": 5.439596652984619, - "learning_rate": 2.3529079616036136e-05, - "loss": 1.1877, + "grad_norm": 1.7776116132736206, + "learning_rate": 0.00039215132693393563, + "loss": 1.1096, "step": 5730 }, { "epoch": 1.080368906455863, - "grad_norm": 13.089673042297363, - "learning_rate": 2.3517786561264823e-05, - "loss": 0.9833, + "grad_norm": 4.608347415924072, + "learning_rate": 0.0003919631093544137, + "loss": 0.779, "step": 5740 }, { "epoch": 1.0822510822510822, - "grad_norm": 8.794281959533691, - "learning_rate": 2.350649350649351e-05, - "loss": 1.4184, + "grad_norm": 3.2807509899139404, + "learning_rate": 0.0003917748917748918, + "loss": 1.134, "step": 5750 }, { "epoch": 1.0841332580463015, - "grad_norm": 11.838709831237793, - "learning_rate": 2.349520045172219e-05, - "loss": 1.2191, + "grad_norm": 2.146608591079712, + "learning_rate": 0.00039158667419536986, + "loss": 0.9571, "step": 5760 }, { "epoch": 1.0860154338415209, - "grad_norm": 12.768487930297852, - "learning_rate": 2.3483907396950877e-05, - "loss": 1.0469, + "grad_norm": 4.23948335647583, + "learning_rate": 0.00039139845661584794, + "loss": 0.8992, "step": 5770 }, { "epoch": 1.08789760963674, - "grad_norm": 16.489809036254883, - "learning_rate": 2.347261434217956e-05, - "loss": 1.2401, + "grad_norm": 4.417486667633057, + "learning_rate": 0.000391210239036326, + "loss": 0.9175, "step": 5780 }, { "epoch": 1.0897797854319593, - "grad_norm": 22.284759521484375, - "learning_rate": 2.3461321287408246e-05, - "loss": 1.1759, + "grad_norm": 3.7243125438690186, + "learning_rate": 0.0003910220214568041, + "loss": 0.8366, "step": 5790 }, { "epoch": 1.0916619612271785, - "grad_norm": 22.32616424560547, - "learning_rate": 2.345002823263693e-05, - "loss": 1.0558, + "grad_norm": 1.57295823097229, + "learning_rate": 0.0003908338038772821, + "loss": 0.8425, "step": 5800 }, { "epoch": 1.093544137022398, - "grad_norm": 6.583852767944336, - "learning_rate": 2.343873517786561e-05, - "loss": 1.1139, + "grad_norm": 2.6780879497528076, + "learning_rate": 0.0003906455862977602, + "loss": 0.8754, "step": 5810 }, { "epoch": 1.0954263128176172, - "grad_norm": 10.06447982788086, - "learning_rate": 2.3427442123094298e-05, - "loss": 1.0589, + "grad_norm": 0.9460309147834778, + "learning_rate": 0.00039045736871823827, + "loss": 0.5829, "step": 5820 }, { "epoch": 1.0973084886128364, - "grad_norm": 9.685362815856934, - "learning_rate": 2.341614906832298e-05, - "loss": 1.0719, + "grad_norm": 3.544323205947876, + "learning_rate": 0.00039026915113871634, + "loss": 0.7961, "step": 5830 }, { "epoch": 1.0991906644080558, - "grad_norm": 10.641950607299805, - "learning_rate": 2.3404856013551666e-05, - "loss": 0.8788, + "grad_norm": 4.158580303192139, + "learning_rate": 0.0003900809335591944, + "loss": 0.718, "step": 5840 }, { "epoch": 1.101072840203275, - "grad_norm": 9.749815940856934, - "learning_rate": 2.3393562958780352e-05, - "loss": 0.9172, + "grad_norm": 2.3165791034698486, + "learning_rate": 0.0003898927159796725, + "loss": 0.5707, "step": 5850 }, { "epoch": 1.1029550159984942, - "grad_norm": 15.738201141357422, - "learning_rate": 2.3382269904009035e-05, - "loss": 1.1344, + "grad_norm": 2.589895248413086, + "learning_rate": 0.00038970449840015057, + "loss": 0.9552, "step": 5860 }, { "epoch": 1.1048371917937136, - "grad_norm": 5.296823024749756, - "learning_rate": 2.337097684923772e-05, - "loss": 0.9969, + "grad_norm": 2.370169162750244, + "learning_rate": 0.0003895162808206287, + "loss": 0.7267, "step": 5870 }, { "epoch": 1.1067193675889329, - "grad_norm": 9.7347993850708, - "learning_rate": 2.33596837944664e-05, - "loss": 1.2592, + "grad_norm": 0.8681706786155701, + "learning_rate": 0.0003893280632411067, + "loss": 1.0151, "step": 5880 }, { "epoch": 1.108601543384152, - "grad_norm": 15.322550773620605, - "learning_rate": 2.3348390739695087e-05, - "loss": 0.8988, + "grad_norm": 0.5660446286201477, + "learning_rate": 0.0003891398456615848, + "loss": 0.6168, "step": 5890 }, { "epoch": 1.1104837191793713, - "grad_norm": 8.814277648925781, - "learning_rate": 2.3337097684923773e-05, - "loss": 0.9639, + "grad_norm": 2.0442614555358887, + "learning_rate": 0.0003889516280820629, + "loss": 0.603, "step": 5900 }, { "epoch": 1.1123658949745907, - "grad_norm": 9.318452835083008, - "learning_rate": 2.3325804630152455e-05, - "loss": 0.9157, + "grad_norm": 2.1244430541992188, + "learning_rate": 0.00038876341050254095, + "loss": 0.5466, "step": 5910 }, { "epoch": 1.11424807076981, - "grad_norm": 9.616998672485352, - "learning_rate": 2.331451157538114e-05, - "loss": 0.9965, + "grad_norm": 4.894069671630859, + "learning_rate": 0.00038857519292301903, + "loss": 0.7559, "step": 5920 }, { "epoch": 1.1161302465650291, - "grad_norm": 20.399810791015625, - "learning_rate": 2.3303218520609827e-05, - "loss": 0.9574, + "grad_norm": 4.432802200317383, + "learning_rate": 0.0003883869753434971, + "loss": 0.7947, "step": 5930 }, { "epoch": 1.1180124223602483, - "grad_norm": 25.457443237304688, - "learning_rate": 2.329192546583851e-05, - "loss": 0.948, + "grad_norm": 3.1599254608154297, + "learning_rate": 0.0003881987577639752, + "loss": 0.6203, "step": 5940 }, { "epoch": 1.1198945981554678, - "grad_norm": 10.880556106567383, - "learning_rate": 2.3280632411067196e-05, - "loss": 1.1425, + "grad_norm": 1.5855923891067505, + "learning_rate": 0.00038801054018445326, + "loss": 0.9732, "step": 5950 }, { "epoch": 1.121776773950687, - "grad_norm": 26.9040584564209, - "learning_rate": 2.326933935629588e-05, - "loss": 1.2454, + "grad_norm": 4.721362590789795, + "learning_rate": 0.00038782232260493134, + "loss": 1.12, "step": 5960 }, { "epoch": 1.1236589497459062, - "grad_norm": 12.784603118896484, - "learning_rate": 2.325804630152456e-05, - "loss": 1.0512, + "grad_norm": 2.280820846557617, + "learning_rate": 0.00038763410502540936, + "loss": 0.7846, "step": 5970 }, { "epoch": 1.1255411255411256, - "grad_norm": 6.434301853179932, - "learning_rate": 2.3246753246753248e-05, - "loss": 0.8505, + "grad_norm": 1.847952961921692, + "learning_rate": 0.00038744588744588743, + "loss": 0.6701, "step": 5980 }, { "epoch": 1.1274233013363448, - "grad_norm": 12.109904289245605, - "learning_rate": 2.323546019198193e-05, - "loss": 1.0467, + "grad_norm": 6.386806964874268, + "learning_rate": 0.0003872576698663655, + "loss": 0.8316, "step": 5990 }, { "epoch": 1.129305477131564, - "grad_norm": 11.11652660369873, - "learning_rate": 2.3224167137210616e-05, - "loss": 0.7246, + "grad_norm": 3.615067481994629, + "learning_rate": 0.0003870694522868436, + "loss": 0.6289, "step": 6000 }, { "epoch": 1.1311876529267835, - "grad_norm": 15.59671401977539, - "learning_rate": 2.32128740824393e-05, - "loss": 0.9292, + "grad_norm": 2.8482844829559326, + "learning_rate": 0.00038688123470732166, + "loss": 0.7003, "step": 6010 }, { "epoch": 1.1330698287220027, - "grad_norm": 6.132266521453857, - "learning_rate": 2.3201581027667985e-05, - "loss": 0.721, + "grad_norm": 1.1809179782867432, + "learning_rate": 0.00038669301712779974, + "loss": 0.5173, "step": 6020 }, { "epoch": 1.134952004517222, - "grad_norm": 3.5677640438079834, - "learning_rate": 2.319028797289667e-05, - "loss": 0.9246, + "grad_norm": 2.176791191101074, + "learning_rate": 0.0003865047995482778, + "loss": 0.7291, "step": 6030 }, { "epoch": 1.136834180312441, - "grad_norm": 15.319580078125, - "learning_rate": 2.3178994918125354e-05, - "loss": 0.9668, + "grad_norm": 2.8241522312164307, + "learning_rate": 0.0003863165819687559, + "loss": 0.6527, "step": 6040 }, { "epoch": 1.1387163561076605, - "grad_norm": 15.318434715270996, - "learning_rate": 2.316770186335404e-05, - "loss": 1.0329, + "grad_norm": 3.1613717079162598, + "learning_rate": 0.00038612836438923397, + "loss": 0.639, "step": 6050 }, { "epoch": 1.1405985319028797, - "grad_norm": 8.09421157836914, - "learning_rate": 2.315640880858272e-05, - "loss": 1.0839, + "grad_norm": 2.8154726028442383, + "learning_rate": 0.000385940146809712, + "loss": 0.7908, "step": 6060 }, { "epoch": 1.142480707698099, - "grad_norm": 33.64767837524414, - "learning_rate": 2.3145115753811405e-05, - "loss": 1.1178, + "grad_norm": 3.6952784061431885, + "learning_rate": 0.00038575192923019007, + "loss": 0.826, "step": 6070 }, { "epoch": 1.1443628834933182, - "grad_norm": 23.52187728881836, - "learning_rate": 2.313382269904009e-05, - "loss": 1.1034, + "grad_norm": 3.8638036251068115, + "learning_rate": 0.00038556371165066814, + "loss": 1.0331, "step": 6080 }, { "epoch": 1.1462450592885376, - "grad_norm": 13.548325538635254, - "learning_rate": 2.3122529644268774e-05, - "loss": 0.801, + "grad_norm": 3.39660906791687, + "learning_rate": 0.0003853754940711463, + "loss": 0.4696, "step": 6090 }, { "epoch": 1.1481272350837568, - "grad_norm": 19.462448120117188, - "learning_rate": 2.311123658949746e-05, - "loss": 0.8789, + "grad_norm": 3.148796319961548, + "learning_rate": 0.00038518727649162435, + "loss": 0.5888, "step": 6100 }, { "epoch": 1.150009410878976, - "grad_norm": 32.65730285644531, - "learning_rate": 2.3099943534726143e-05, - "loss": 1.2276, + "grad_norm": 1.6677438020706177, + "learning_rate": 0.00038499905891210243, + "loss": 1.0059, "step": 6110 }, { "epoch": 1.1518915866741954, - "grad_norm": 1.7889130115509033, - "learning_rate": 2.308865047995483e-05, - "loss": 0.9995, + "grad_norm": 0.408934623003006, + "learning_rate": 0.0003848108413325805, + "loss": 0.7653, "step": 6120 }, { "epoch": 1.1537737624694147, - "grad_norm": 2.802178382873535, - "learning_rate": 2.3077357425183515e-05, - "loss": 0.9639, + "grad_norm": 0.9431570172309875, + "learning_rate": 0.0003846226237530586, + "loss": 0.8325, "step": 6130 }, { "epoch": 1.1556559382646339, - "grad_norm": 30.473918914794922, - "learning_rate": 2.3066064370412198e-05, - "loss": 0.9189, + "grad_norm": 3.6121902465820312, + "learning_rate": 0.00038443440617353666, + "loss": 0.6998, "step": 6140 }, { "epoch": 1.1575381140598533, - "grad_norm": 12.530424118041992, - "learning_rate": 2.305477131564088e-05, - "loss": 1.1765, + "grad_norm": 4.1678948402404785, + "learning_rate": 0.0003842461885940147, + "loss": 0.9246, "step": 6150 }, { "epoch": 1.1594202898550725, - "grad_norm": 20.285593032836914, - "learning_rate": 2.3043478260869563e-05, - "loss": 1.0729, + "grad_norm": 2.139291763305664, + "learning_rate": 0.00038405797101449276, + "loss": 0.7167, "step": 6160 }, { "epoch": 1.1613024656502917, - "grad_norm": 17.06644058227539, - "learning_rate": 2.303218520609825e-05, - "loss": 1.1532, + "grad_norm": 3.6605629920959473, + "learning_rate": 0.00038386975343497083, + "loss": 0.9195, "step": 6170 }, { "epoch": 1.163184641445511, - "grad_norm": 13.454843521118164, - "learning_rate": 2.3020892151326935e-05, - "loss": 1.0656, + "grad_norm": 0.30113252997398376, + "learning_rate": 0.0003836815358554489, + "loss": 0.7755, "step": 6180 }, { "epoch": 1.1650668172407304, - "grad_norm": 14.98215103149414, - "learning_rate": 2.3009599096555618e-05, - "loss": 0.7957, + "grad_norm": 3.287532091140747, + "learning_rate": 0.000383493318275927, + "loss": 0.6074, "step": 6190 }, { "epoch": 1.1669489930359496, - "grad_norm": 12.102943420410156, - "learning_rate": 2.2998306041784304e-05, - "loss": 1.3556, + "grad_norm": 5.199028491973877, + "learning_rate": 0.00038330510069640506, + "loss": 1.1058, "step": 6200 }, { "epoch": 1.1688311688311688, - "grad_norm": 10.94884204864502, - "learning_rate": 2.298701298701299e-05, - "loss": 1.0466, + "grad_norm": 3.132143259048462, + "learning_rate": 0.00038311688311688314, + "loss": 0.8156, "step": 6210 }, { "epoch": 1.170713344626388, - "grad_norm": 5.2002177238464355, - "learning_rate": 2.2975719932241673e-05, - "loss": 1.1543, + "grad_norm": 4.496564865112305, + "learning_rate": 0.0003829286655373612, + "loss": 0.78, "step": 6220 }, { "epoch": 1.1725955204216074, - "grad_norm": 8.19458293914795, - "learning_rate": 2.2964426877470356e-05, - "loss": 0.7794, + "grad_norm": 2.972040891647339, + "learning_rate": 0.00038274044795783924, + "loss": 0.6054, "step": 6230 }, { "epoch": 1.1744776962168266, - "grad_norm": 7.3538970947265625, - "learning_rate": 2.2953133822699038e-05, - "loss": 0.7533, + "grad_norm": 3.018740177154541, + "learning_rate": 0.0003825522303783173, + "loss": 0.596, "step": 6240 }, { "epoch": 1.1763598720120458, - "grad_norm": 13.170063018798828, - "learning_rate": 2.2941840767927724e-05, - "loss": 1.1663, + "grad_norm": 4.458706378936768, + "learning_rate": 0.0003823640127987954, + "loss": 1.0045, "step": 6250 }, { "epoch": 1.1782420478072653, - "grad_norm": 22.807252883911133, - "learning_rate": 2.293054771315641e-05, - "loss": 0.9954, + "grad_norm": 2.131452798843384, + "learning_rate": 0.00038217579521927347, + "loss": 0.7716, "step": 6260 }, { "epoch": 1.1801242236024845, - "grad_norm": 4.545765399932861, - "learning_rate": 2.2919254658385093e-05, - "loss": 0.9198, + "grad_norm": 1.9051343202590942, + "learning_rate": 0.00038198757763975154, + "loss": 0.6796, "step": 6270 }, { "epoch": 1.1820063993977037, - "grad_norm": 8.624770164489746, - "learning_rate": 2.290796160361378e-05, - "loss": 0.9429, + "grad_norm": 2.997279644012451, + "learning_rate": 0.0003817993600602296, + "loss": 0.6617, "step": 6280 }, { "epoch": 1.1838885751929231, - "grad_norm": 12.10987377166748, - "learning_rate": 2.2896668548842462e-05, - "loss": 1.2487, + "grad_norm": 2.8390822410583496, + "learning_rate": 0.0003816111424807077, + "loss": 1.022, "step": 6290 }, { "epoch": 1.1857707509881423, - "grad_norm": 1.273734211921692, - "learning_rate": 2.2885375494071148e-05, - "loss": 0.9025, + "grad_norm": 0.6079533696174622, + "learning_rate": 0.00038142292490118577, + "loss": 0.6297, "step": 6300 }, { "epoch": 1.1876529267833615, - "grad_norm": 6.115864276885986, - "learning_rate": 2.2874082439299834e-05, - "loss": 0.9436, + "grad_norm": 2.57958984375, + "learning_rate": 0.0003812347073216639, + "loss": 0.701, "step": 6310 }, { "epoch": 1.1895351025785807, - "grad_norm": 6.128392696380615, - "learning_rate": 2.2862789384528513e-05, - "loss": 0.9464, + "grad_norm": 3.771247625350952, + "learning_rate": 0.0003810464897421419, + "loss": 0.7474, "step": 6320 }, { "epoch": 1.1914172783738002, - "grad_norm": 13.86414623260498, - "learning_rate": 2.28514963297572e-05, - "loss": 0.8451, + "grad_norm": 2.2047035694122314, + "learning_rate": 0.00038085827216262, + "loss": 0.5503, "step": 6330 }, { "epoch": 1.1932994541690194, - "grad_norm": 19.781795501708984, - "learning_rate": 2.2840203274985882e-05, - "loss": 0.7738, + "grad_norm": 4.219132900238037, + "learning_rate": 0.0003806700545830981, + "loss": 0.6213, "step": 6340 }, { "epoch": 1.1951816299642386, - "grad_norm": 30.369861602783203, - "learning_rate": 2.2828910220214568e-05, - "loss": 1.2183, + "grad_norm": 3.2710931301116943, + "learning_rate": 0.00038048183700357615, + "loss": 0.6869, "step": 6350 }, { "epoch": 1.1970638057594578, - "grad_norm": 12.43057918548584, - "learning_rate": 2.2817617165443254e-05, - "loss": 1.1899, + "grad_norm": 2.1299989223480225, + "learning_rate": 0.00038029361942405423, + "loss": 1.0877, "step": 6360 }, { "epoch": 1.1989459815546772, - "grad_norm": 9.697637557983398, - "learning_rate": 2.2806324110671937e-05, - "loss": 1.2109, + "grad_norm": 2.7965474128723145, + "learning_rate": 0.0003801054018445323, + "loss": 0.9136, "step": 6370 }, { "epoch": 1.2008281573498965, - "grad_norm": 33.04664993286133, - "learning_rate": 2.2795031055900623e-05, - "loss": 1.2716, + "grad_norm": 4.0959696769714355, + "learning_rate": 0.0003799171842650104, + "loss": 1.0894, "step": 6380 }, { "epoch": 1.2027103331451157, - "grad_norm": 16.485483169555664, - "learning_rate": 2.2783738001129306e-05, - "loss": 1.1587, + "grad_norm": 3.3608808517456055, + "learning_rate": 0.00037972896668548846, + "loss": 0.7197, "step": 6390 }, { "epoch": 1.204592508940335, - "grad_norm": 8.360411643981934, - "learning_rate": 2.2772444946357992e-05, - "loss": 1.0419, + "grad_norm": 2.8144304752349854, + "learning_rate": 0.00037954074910596654, + "loss": 0.7231, "step": 6400 }, { "epoch": 1.2064746847355543, - "grad_norm": 3.943709373474121, - "learning_rate": 2.2761151891586674e-05, - "loss": 1.0027, + "grad_norm": 0.6063675880432129, + "learning_rate": 0.00037935253152644456, + "loss": 0.6725, "step": 6410 }, { "epoch": 1.2083568605307735, - "grad_norm": 17.139930725097656, - "learning_rate": 2.2749858836815357e-05, - "loss": 1.1939, + "grad_norm": 1.4623092412948608, + "learning_rate": 0.00037916431394692263, + "loss": 1.0756, "step": 6420 }, { "epoch": 1.210239036325993, - "grad_norm": 14.557121276855469, - "learning_rate": 2.2738565782044043e-05, - "loss": 0.8377, + "grad_norm": 1.7580760717391968, + "learning_rate": 0.0003789760963674007, + "loss": 0.6686, "step": 6430 }, { "epoch": 1.2121212121212122, - "grad_norm": 17.73250961303711, - "learning_rate": 2.272727272727273e-05, - "loss": 0.9496, + "grad_norm": 8.147522926330566, + "learning_rate": 0.0003787878787878788, + "loss": 0.7119, "step": 6440 }, { "epoch": 1.2140033879164314, - "grad_norm": 15.301102638244629, - "learning_rate": 2.2715979672501412e-05, - "loss": 1.0668, + "grad_norm": 3.453514337539673, + "learning_rate": 0.00037859966120835686, + "loss": 0.8779, "step": 6450 }, { "epoch": 1.2158855637116506, - "grad_norm": 15.799238204956055, - "learning_rate": 2.2704686617730098e-05, - "loss": 1.0151, + "grad_norm": 2.9244697093963623, + "learning_rate": 0.00037841144362883494, + "loss": 0.7888, "step": 6460 }, { "epoch": 1.21776773950687, - "grad_norm": 20.844924926757812, - "learning_rate": 2.269339356295878e-05, - "loss": 0.9301, + "grad_norm": 4.729470729827881, + "learning_rate": 0.000378223226049313, + "loss": 0.6608, "step": 6470 }, { "epoch": 1.2196499153020892, - "grad_norm": 7.686192512512207, - "learning_rate": 2.2682100508187467e-05, - "loss": 0.797, + "grad_norm": 2.5494601726531982, + "learning_rate": 0.0003780350084697911, + "loss": 0.6861, "step": 6480 }, { "epoch": 1.2215320910973084, - "grad_norm": 15.735981941223145, - "learning_rate": 2.267080745341615e-05, - "loss": 1.0817, + "grad_norm": 4.567807674407959, + "learning_rate": 0.0003778467908902691, + "loss": 0.8115, "step": 6490 }, { "epoch": 1.2234142668925279, - "grad_norm": 10.269287109375, - "learning_rate": 2.2659514398644832e-05, - "loss": 1.1608, + "grad_norm": 2.6255970001220703, + "learning_rate": 0.0003776585733107472, + "loss": 0.976, "step": 6500 }, { "epoch": 1.225296442687747, - "grad_norm": 28.582277297973633, - "learning_rate": 2.2648221343873518e-05, - "loss": 0.8283, + "grad_norm": 2.7389473915100098, + "learning_rate": 0.00037747035573122527, + "loss": 0.6704, "step": 6510 }, { "epoch": 1.2271786184829663, - "grad_norm": 8.514233589172363, - "learning_rate": 2.26369282891022e-05, - "loss": 1.2402, + "grad_norm": 2.68717360496521, + "learning_rate": 0.00037728213815170334, + "loss": 0.9755, "step": 6520 }, { "epoch": 1.2290607942781855, - "grad_norm": 9.531684875488281, - "learning_rate": 2.2625635234330887e-05, - "loss": 1.1151, + "grad_norm": 1.1812397241592407, + "learning_rate": 0.0003770939205721814, + "loss": 0.9663, "step": 6530 }, { "epoch": 1.230942970073405, - "grad_norm": 10.337912559509277, - "learning_rate": 2.2614342179559573e-05, - "loss": 0.8757, + "grad_norm": 2.752870798110962, + "learning_rate": 0.00037690570299265955, + "loss": 0.6739, "step": 6540 }, { "epoch": 1.2328251458686241, - "grad_norm": 8.955284118652344, - "learning_rate": 2.2603049124788256e-05, - "loss": 0.76, + "grad_norm": 3.182645797729492, + "learning_rate": 0.00037671748541313763, + "loss": 0.5563, "step": 6550 }, { "epoch": 1.2347073216638433, - "grad_norm": 14.72435474395752, - "learning_rate": 2.2591756070016942e-05, - "loss": 0.9185, + "grad_norm": 3.5560567378997803, + "learning_rate": 0.0003765292678336157, + "loss": 0.9168, "step": 6560 }, { "epoch": 1.2365894974590628, - "grad_norm": 16.09954071044922, - "learning_rate": 2.2580463015245625e-05, - "loss": 1.313, + "grad_norm": 3.1835429668426514, + "learning_rate": 0.0003763410502540938, + "loss": 1.2578, "step": 6570 }, { "epoch": 1.238471673254282, - "grad_norm": 8.912398338317871, - "learning_rate": 2.2569169960474307e-05, - "loss": 0.8678, + "grad_norm": 3.730724334716797, + "learning_rate": 0.0003761528326745718, + "loss": 0.5987, "step": 6580 }, { "epoch": 1.2403538490495012, - "grad_norm": 8.538116455078125, - "learning_rate": 2.2557876905702993e-05, - "loss": 1.0072, + "grad_norm": 1.5414713621139526, + "learning_rate": 0.0003759646150950499, + "loss": 0.7506, "step": 6590 }, { "epoch": 1.2422360248447206, - "grad_norm": 14.951505661010742, - "learning_rate": 2.2546583850931676e-05, - "loss": 0.6922, + "grad_norm": 1.6156970262527466, + "learning_rate": 0.00037577639751552796, + "loss": 0.4276, "step": 6600 }, { "epoch": 1.2441182006399398, - "grad_norm": 8.427967071533203, - "learning_rate": 2.2535290796160362e-05, - "loss": 0.6708, + "grad_norm": 1.9981142282485962, + "learning_rate": 0.00037558817993600603, + "loss": 0.6881, "step": 6610 }, { "epoch": 1.246000376435159, - "grad_norm": 9.497509002685547, - "learning_rate": 2.2523997741389045e-05, - "loss": 1.0899, + "grad_norm": 1.740738034248352, + "learning_rate": 0.0003753999623564841, + "loss": 1.0043, "step": 6620 }, { "epoch": 1.2478825522303783, - "grad_norm": 30.798381805419922, - "learning_rate": 2.251270468661773e-05, - "loss": 0.9245, + "grad_norm": 1.780943512916565, + "learning_rate": 0.0003752117447769622, + "loss": 0.5519, "step": 6630 }, { "epoch": 1.2497647280255977, - "grad_norm": 12.764777183532715, - "learning_rate": 2.2501411631846417e-05, - "loss": 0.8884, + "grad_norm": 5.119179725646973, + "learning_rate": 0.00037502352719744026, + "loss": 0.9018, "step": 6640 }, { "epoch": 1.251646903820817, - "grad_norm": 6.89249324798584, - "learning_rate": 2.24901185770751e-05, - "loss": 0.9125, + "grad_norm": 2.3510608673095703, + "learning_rate": 0.00037483530961791834, + "loss": 0.6956, "step": 6650 }, { "epoch": 1.253529079616036, - "grad_norm": 9.787991523742676, - "learning_rate": 2.2478825522303786e-05, - "loss": 0.8764, + "grad_norm": 4.710165500640869, + "learning_rate": 0.0003746470920383964, + "loss": 0.5775, "step": 6660 }, { "epoch": 1.2554112554112553, - "grad_norm": 20.8765811920166, - "learning_rate": 2.246753246753247e-05, - "loss": 1.1334, + "grad_norm": 4.0059051513671875, + "learning_rate": 0.00037445887445887444, + "loss": 0.7891, "step": 6670 }, { "epoch": 1.2572934312064747, - "grad_norm": 19.523963928222656, - "learning_rate": 2.245623941276115e-05, - "loss": 1.2794, + "grad_norm": 3.0449984073638916, + "learning_rate": 0.0003742706568793525, + "loss": 1.1969, "step": 6680 }, { "epoch": 1.259175607001694, - "grad_norm": 17.773313522338867, - "learning_rate": 2.2444946357989837e-05, - "loss": 0.8465, + "grad_norm": 2.8127171993255615, + "learning_rate": 0.0003740824392998306, + "loss": 0.6317, "step": 6690 }, { "epoch": 1.2610577827969132, - "grad_norm": 11.96570110321045, - "learning_rate": 2.243365330321852e-05, - "loss": 1.1552, + "grad_norm": 3.5369529724121094, + "learning_rate": 0.00037389422172030867, + "loss": 1.0789, "step": 6700 }, { "epoch": 1.2629399585921326, - "grad_norm": 8.867982864379883, - "learning_rate": 2.2422360248447206e-05, - "loss": 1.011, + "grad_norm": 0.9900074005126953, + "learning_rate": 0.00037370600414078674, + "loss": 0.7734, "step": 6710 }, { "epoch": 1.2648221343873518, - "grad_norm": 29.349536895751953, - "learning_rate": 2.2411067193675892e-05, - "loss": 0.8635, + "grad_norm": 5.281899452209473, + "learning_rate": 0.0003735177865612648, + "loss": 0.6816, "step": 6720 }, { "epoch": 1.266704310182571, - "grad_norm": 12.906034469604492, - "learning_rate": 2.2399774138904575e-05, - "loss": 0.8193, + "grad_norm": 3.5994303226470947, + "learning_rate": 0.0003733295689817429, + "loss": 0.7046, "step": 6730 }, { "epoch": 1.2685864859777904, - "grad_norm": 12.88680648803711, - "learning_rate": 2.238848108413326e-05, - "loss": 1.264, + "grad_norm": 3.552111864089966, + "learning_rate": 0.00037314135140222097, + "loss": 0.9432, "step": 6740 }, { "epoch": 1.2704686617730097, - "grad_norm": 5.127486228942871, - "learning_rate": 2.2377188029361943e-05, - "loss": 0.839, + "grad_norm": 1.8412734270095825, + "learning_rate": 0.0003729531338226991, + "loss": 0.7193, "step": 6750 }, { "epoch": 1.2723508375682289, - "grad_norm": 5.232704162597656, - "learning_rate": 2.2365894974590626e-05, - "loss": 0.9379, + "grad_norm": 2.4162864685058594, + "learning_rate": 0.0003727649162431771, + "loss": 0.7516, "step": 6760 }, { "epoch": 1.274233013363448, - "grad_norm": 38.552825927734375, - "learning_rate": 2.2354601919819312e-05, - "loss": 0.9256, + "grad_norm": 3.4946014881134033, + "learning_rate": 0.0003725766986636552, + "loss": 0.8546, "step": 6770 }, { "epoch": 1.2761151891586673, - "grad_norm": 27.74937629699707, - "learning_rate": 2.2343308865047995e-05, - "loss": 0.9923, + "grad_norm": 2.2517666816711426, + "learning_rate": 0.0003723884810841333, + "loss": 0.8215, "step": 6780 }, { "epoch": 1.2779973649538867, - "grad_norm": 26.892642974853516, - "learning_rate": 2.233201581027668e-05, - "loss": 1.0955, + "grad_norm": 4.775296688079834, + "learning_rate": 0.00037220026350461135, + "loss": 0.8108, "step": 6790 }, { "epoch": 1.279879540749106, - "grad_norm": 11.914067268371582, - "learning_rate": 2.2320722755505364e-05, - "loss": 0.9401, + "grad_norm": 2.924154043197632, + "learning_rate": 0.00037201204592508943, + "loss": 0.6881, "step": 6800 }, { "epoch": 1.2817617165443251, - "grad_norm": 16.04292869567871, - "learning_rate": 2.230942970073405e-05, - "loss": 0.9384, + "grad_norm": 2.4805564880371094, + "learning_rate": 0.0003718238283455675, + "loss": 0.7581, "step": 6810 }, { "epoch": 1.2836438923395446, - "grad_norm": 7.591270923614502, - "learning_rate": 2.2298136645962736e-05, - "loss": 0.8969, + "grad_norm": 1.8149421215057373, + "learning_rate": 0.0003716356107660456, + "loss": 0.7755, "step": 6820 }, { "epoch": 1.2855260681347638, - "grad_norm": 23.051708221435547, - "learning_rate": 2.228684359119142e-05, - "loss": 0.7347, + "grad_norm": 2.719374418258667, + "learning_rate": 0.00037144739318652366, + "loss": 0.5672, "step": 6830 }, { "epoch": 1.287408243929983, - "grad_norm": 19.278038024902344, - "learning_rate": 2.22755505364201e-05, - "loss": 0.873, + "grad_norm": 1.9230657815933228, + "learning_rate": 0.0003712591756070017, + "loss": 0.7672, "step": 6840 }, { "epoch": 1.2892904197252024, - "grad_norm": 6.090950012207031, - "learning_rate": 2.2264257481648784e-05, - "loss": 0.8029, + "grad_norm": 1.078181266784668, + "learning_rate": 0.00037107095802747976, + "loss": 0.4504, "step": 6850 }, { "epoch": 1.2911725955204216, - "grad_norm": 7.98006010055542, - "learning_rate": 2.225296442687747e-05, - "loss": 1.2243, + "grad_norm": 2.379368543624878, + "learning_rate": 0.00037088274044795783, + "loss": 0.9793, "step": 6860 }, { "epoch": 1.2930547713156408, - "grad_norm": 10.448835372924805, - "learning_rate": 2.2241671372106156e-05, - "loss": 1.0374, + "grad_norm": 3.081979990005493, + "learning_rate": 0.0003706945228684359, + "loss": 0.9406, "step": 6870 }, { "epoch": 1.2949369471108603, - "grad_norm": 8.826786041259766, - "learning_rate": 2.223037831733484e-05, - "loss": 0.9484, + "grad_norm": 1.935721516609192, + "learning_rate": 0.000370506305288914, + "loss": 0.6767, "step": 6880 }, { "epoch": 1.2968191229060795, - "grad_norm": 7.370842933654785, - "learning_rate": 2.2219085262563525e-05, - "loss": 1.0126, + "grad_norm": 4.712551116943359, + "learning_rate": 0.00037031808770939206, + "loss": 0.8502, "step": 6890 }, { "epoch": 1.2987012987012987, - "grad_norm": 18.180156707763672, - "learning_rate": 2.2207792207792207e-05, - "loss": 1.2361, + "grad_norm": 4.596050262451172, + "learning_rate": 0.00037012987012987014, + "loss": 0.9557, "step": 6900 }, { "epoch": 1.300583474496518, - "grad_norm": 11.186609268188477, - "learning_rate": 2.2196499153020894e-05, - "loss": 1.0604, + "grad_norm": 2.515564203262329, + "learning_rate": 0.0003699416525503482, + "loss": 0.8905, "step": 6910 }, { "epoch": 1.3024656502917373, - "grad_norm": 20.87862777709961, - "learning_rate": 2.218520609824958e-05, - "loss": 0.8857, + "grad_norm": 1.882664442062378, + "learning_rate": 0.0003697534349708263, + "loss": 0.5869, "step": 6920 }, { "epoch": 1.3043478260869565, - "grad_norm": 12.313833236694336, - "learning_rate": 2.217391304347826e-05, - "loss": 0.9329, + "grad_norm": 2.301548719406128, + "learning_rate": 0.0003695652173913043, + "loss": 0.8646, "step": 6930 }, { "epoch": 1.3062300018821758, - "grad_norm": 9.278630256652832, - "learning_rate": 2.2162619988706945e-05, - "loss": 0.9219, + "grad_norm": 1.3468194007873535, + "learning_rate": 0.0003693769998117824, + "loss": 0.8281, "step": 6940 }, { "epoch": 1.308112177677395, - "grad_norm": 14.707159996032715, - "learning_rate": 2.215132693393563e-05, - "loss": 1.227, + "grad_norm": 2.1500511169433594, + "learning_rate": 0.00036918878223226047, + "loss": 0.9431, "step": 6950 }, { "epoch": 1.3099943534726144, - "grad_norm": 31.15422248840332, - "learning_rate": 2.2140033879164314e-05, - "loss": 0.7853, + "grad_norm": 3.4444191455841064, + "learning_rate": 0.00036900056465273854, + "loss": 0.6617, "step": 6960 }, { "epoch": 1.3118765292678336, - "grad_norm": 8.061625480651855, - "learning_rate": 2.2128740824393e-05, - "loss": 0.9793, + "grad_norm": 2.3794643878936768, + "learning_rate": 0.0003688123470732166, + "loss": 0.7803, "step": 6970 }, { "epoch": 1.3137587050630528, - "grad_norm": 9.95386791229248, - "learning_rate": 2.2117447769621683e-05, - "loss": 1.2524, + "grad_norm": 2.2236216068267822, + "learning_rate": 0.00036862412949369475, + "loss": 1.1628, "step": 6980 }, { "epoch": 1.3156408808582722, - "grad_norm": 6.791532516479492, - "learning_rate": 2.210615471485037e-05, - "loss": 0.8785, + "grad_norm": 1.8513524532318115, + "learning_rate": 0.00036843591191417283, + "loss": 0.7694, "step": 6990 }, { "epoch": 1.3175230566534915, - "grad_norm": 7.024257659912109, - "learning_rate": 2.2094861660079055e-05, - "loss": 0.9782, + "grad_norm": 5.277342796325684, + "learning_rate": 0.0003682476943346509, + "loss": 0.7636, "step": 7000 }, { "epoch": 1.3194052324487107, - "grad_norm": 7.559739589691162, - "learning_rate": 2.2083568605307737e-05, - "loss": 0.9149, + "grad_norm": 2.040113925933838, + "learning_rate": 0.000368059476755129, + "loss": 0.6748, "step": 7010 }, { "epoch": 1.32128740824393, - "grad_norm": 8.2528076171875, - "learning_rate": 2.207227555053642e-05, - "loss": 0.7117, + "grad_norm": 4.988844871520996, + "learning_rate": 0.000367871259175607, + "loss": 0.5714, "step": 7020 }, { "epoch": 1.3231695840391493, - "grad_norm": 21.831268310546875, - "learning_rate": 2.2060982495765103e-05, - "loss": 0.9922, + "grad_norm": 4.413284778594971, + "learning_rate": 0.0003676830415960851, + "loss": 0.7948, "step": 7030 }, { "epoch": 1.3250517598343685, - "grad_norm": 16.495553970336914, - "learning_rate": 2.204968944099379e-05, - "loss": 1.0543, + "grad_norm": 3.844486713409424, + "learning_rate": 0.00036749482401656316, + "loss": 0.8911, "step": 7040 }, { "epoch": 1.3269339356295877, - "grad_norm": 10.989031791687012, - "learning_rate": 2.2038396386222475e-05, - "loss": 0.9902, + "grad_norm": 3.5554070472717285, + "learning_rate": 0.00036730660643704123, + "loss": 0.7859, "step": 7050 }, { "epoch": 1.3288161114248072, - "grad_norm": 10.473587036132812, - "learning_rate": 2.2027103331451158e-05, - "loss": 0.8303, + "grad_norm": 2.1481847763061523, + "learning_rate": 0.0003671183888575193, + "loss": 0.7252, "step": 7060 }, { "epoch": 1.3306982872200264, - "grad_norm": 11.674604415893555, - "learning_rate": 2.2015810276679844e-05, - "loss": 0.79, + "grad_norm": 2.8761439323425293, + "learning_rate": 0.0003669301712779974, + "loss": 0.6704, "step": 7070 }, { "epoch": 1.3325804630152456, - "grad_norm": 8.529352188110352, - "learning_rate": 2.2004517221908526e-05, - "loss": 1.1502, + "grad_norm": 3.5740177631378174, + "learning_rate": 0.00036674195369847546, + "loss": 0.9431, "step": 7080 }, { "epoch": 1.3344626388104648, - "grad_norm": 14.45956802368164, - "learning_rate": 2.1993224167137212e-05, - "loss": 1.0809, + "grad_norm": 6.081122875213623, + "learning_rate": 0.00036655373611895354, + "loss": 0.8061, "step": 7090 }, { "epoch": 1.3363448146056842, - "grad_norm": 7.735544681549072, - "learning_rate": 2.1981931112365895e-05, - "loss": 1.4952, + "grad_norm": 3.7727537155151367, + "learning_rate": 0.00036636551853943156, + "loss": 1.3814, "step": 7100 }, { "epoch": 1.3382269904009034, - "grad_norm": 4.9393134117126465, - "learning_rate": 2.1970638057594578e-05, - "loss": 0.8674, + "grad_norm": 4.154539585113525, + "learning_rate": 0.00036617730095990964, + "loss": 0.6843, "step": 7110 }, { "epoch": 1.3401091661961226, - "grad_norm": 15.033855438232422, - "learning_rate": 2.1959345002823264e-05, - "loss": 0.8618, + "grad_norm": 5.381721019744873, + "learning_rate": 0.0003659890833803877, + "loss": 0.7377, "step": 7120 }, { "epoch": 1.341991341991342, - "grad_norm": 6.540989398956299, - "learning_rate": 2.1948051948051947e-05, - "loss": 0.8127, + "grad_norm": 2.434192657470703, + "learning_rate": 0.0003658008658008658, + "loss": 0.6333, "step": 7130 }, { "epoch": 1.3438735177865613, - "grad_norm": 5.148128032684326, - "learning_rate": 2.1936758893280633e-05, - "loss": 0.8892, + "grad_norm": 1.2883131504058838, + "learning_rate": 0.00036561264822134387, + "loss": 0.8143, "step": 7140 }, { "epoch": 1.3457556935817805, - "grad_norm": 54.40917205810547, - "learning_rate": 2.192546583850932e-05, - "loss": 1.0773, + "grad_norm": 6.206827640533447, + "learning_rate": 0.00036542443064182194, + "loss": 0.7948, "step": 7150 }, { "epoch": 1.347637869377, - "grad_norm": 20.81314468383789, - "learning_rate": 2.1914172783738e-05, - "loss": 1.0652, + "grad_norm": 2.471853017807007, + "learning_rate": 0.0003652362130623, + "loss": 0.807, "step": 7160 }, { "epoch": 1.3495200451722191, - "grad_norm": 7.903687000274658, - "learning_rate": 2.1902879728966687e-05, - "loss": 0.8812, + "grad_norm": 2.1814422607421875, + "learning_rate": 0.0003650479954827781, + "loss": 0.778, "step": 7170 }, { "epoch": 1.3514022209674383, - "grad_norm": 13.962177276611328, - "learning_rate": 2.189158667419537e-05, - "loss": 1.0603, + "grad_norm": 3.2641725540161133, + "learning_rate": 0.00036485977790325617, + "loss": 0.8233, "step": 7180 }, { "epoch": 1.3532843967626578, - "grad_norm": 4.5833892822265625, - "learning_rate": 2.1880293619424053e-05, - "loss": 1.2734, + "grad_norm": 0.6247275471687317, + "learning_rate": 0.0003646715603237342, + "loss": 1.2489, "step": 7190 }, { "epoch": 1.355166572557877, - "grad_norm": 13.545757293701172, - "learning_rate": 2.186900056465274e-05, - "loss": 0.9052, + "grad_norm": 2.727606773376465, + "learning_rate": 0.0003644833427442123, + "loss": 0.8297, "step": 7200 }, { "epoch": 1.3570487483530962, - "grad_norm": 31.69056510925293, - "learning_rate": 2.185770750988142e-05, - "loss": 0.8867, + "grad_norm": 3.449462413787842, + "learning_rate": 0.0003642951251646904, + "loss": 0.8319, "step": 7210 }, { "epoch": 1.3589309241483154, - "grad_norm": 13.554387092590332, - "learning_rate": 2.1846414455110108e-05, - "loss": 0.8744, + "grad_norm": 3.3109967708587646, + "learning_rate": 0.0003641069075851685, + "loss": 0.8439, "step": 7220 }, { "epoch": 1.3608130999435346, - "grad_norm": 4.856804370880127, - "learning_rate": 2.1835121400338794e-05, - "loss": 0.8372, + "grad_norm": 3.461745023727417, + "learning_rate": 0.00036391869000564655, + "loss": 0.807, "step": 7230 }, { "epoch": 1.362695275738754, - "grad_norm": 12.263862609863281, - "learning_rate": 2.1823828345567476e-05, - "loss": 1.0886, + "grad_norm": 3.968158721923828, + "learning_rate": 0.00036373047242612463, + "loss": 0.784, "step": 7240 }, { "epoch": 1.3645774515339733, - "grad_norm": 18.752655029296875, - "learning_rate": 2.1812535290796163e-05, - "loss": 0.7373, + "grad_norm": 2.976491928100586, + "learning_rate": 0.0003635422548466027, + "loss": 0.6565, "step": 7250 }, { "epoch": 1.3664596273291925, - "grad_norm": 22.475677490234375, - "learning_rate": 2.1801242236024845e-05, - "loss": 1.0194, + "grad_norm": 5.492811679840088, + "learning_rate": 0.0003633540372670808, + "loss": 0.889, "step": 7260 }, { "epoch": 1.368341803124412, - "grad_norm": 3.7865233421325684, - "learning_rate": 2.178994918125353e-05, - "loss": 0.9387, + "grad_norm": 2.031484365463257, + "learning_rate": 0.00036316581968755886, + "loss": 0.8284, "step": 7270 }, { "epoch": 1.370223978919631, - "grad_norm": 23.85765266418457, - "learning_rate": 2.1778656126482214e-05, - "loss": 1.1156, + "grad_norm": 2.7551069259643555, + "learning_rate": 0.0003629776021080369, + "loss": 0.9452, "step": 7280 }, { "epoch": 1.3721061547148503, - "grad_norm": 13.753911972045898, - "learning_rate": 2.1767363071710897e-05, - "loss": 0.89, + "grad_norm": 2.5312998294830322, + "learning_rate": 0.00036278938452851496, + "loss": 0.8268, "step": 7290 }, { "epoch": 1.3739883305100697, - "grad_norm": 13.338386535644531, - "learning_rate": 2.1756070016939583e-05, - "loss": 1.0544, + "grad_norm": 2.5570919513702393, + "learning_rate": 0.00036260116694899303, + "loss": 0.7484, "step": 7300 }, { "epoch": 1.375870506305289, - "grad_norm": 7.914046287536621, - "learning_rate": 2.1744776962168265e-05, - "loss": 0.9427, + "grad_norm": 2.838815450668335, + "learning_rate": 0.0003624129493694711, + "loss": 0.9233, "step": 7310 }, { "epoch": 1.3777526821005082, - "grad_norm": 33.587493896484375, - "learning_rate": 2.173348390739695e-05, - "loss": 1.066, + "grad_norm": 4.210334777832031, + "learning_rate": 0.0003622247317899492, + "loss": 0.9044, "step": 7320 }, { "epoch": 1.3796348578957276, - "grad_norm": 5.1027326583862305, - "learning_rate": 2.1722190852625638e-05, - "loss": 0.6101, + "grad_norm": 1.67719566822052, + "learning_rate": 0.00036203651421042726, + "loss": 0.4253, "step": 7330 }, { "epoch": 1.3815170336909468, - "grad_norm": 0.7987704277038574, - "learning_rate": 2.171089779785432e-05, - "loss": 1.1217, + "grad_norm": 0.12403179705142975, + "learning_rate": 0.00036184829663090534, + "loss": 0.6734, "step": 7340 }, { "epoch": 1.383399209486166, - "grad_norm": 9.396214485168457, - "learning_rate": 2.1699604743083006e-05, - "loss": 1.0526, + "grad_norm": 6.574319362640381, + "learning_rate": 0.0003616600790513834, + "loss": 0.8685, "step": 7350 }, { "epoch": 1.3852813852813852, - "grad_norm": 21.794408798217773, - "learning_rate": 2.1688311688311686e-05, - "loss": 0.8303, + "grad_norm": 1.8935904502868652, + "learning_rate": 0.00036147186147186144, + "loss": 0.7353, "step": 7360 }, { "epoch": 1.3871635610766044, - "grad_norm": 22.232450485229492, - "learning_rate": 2.1677018633540372e-05, - "loss": 1.1084, + "grad_norm": 5.131047248840332, + "learning_rate": 0.0003612836438923395, + "loss": 0.6971, "step": 7370 }, { "epoch": 1.3890457368718239, - "grad_norm": 7.189990520477295, - "learning_rate": 2.1665725578769058e-05, - "loss": 1.1183, + "grad_norm": 5.154910564422607, + "learning_rate": 0.0003610954263128176, + "loss": 0.9539, "step": 7380 }, { "epoch": 1.390927912667043, - "grad_norm": 4.046782970428467, - "learning_rate": 2.165443252399774e-05, - "loss": 0.6394, + "grad_norm": 1.3230202198028564, + "learning_rate": 0.00036090720873329567, + "loss": 0.514, "step": 7390 }, { "epoch": 1.3928100884622623, - "grad_norm": 6.194625377655029, - "learning_rate": 2.1643139469226427e-05, - "loss": 0.8828, + "grad_norm": 1.5253217220306396, + "learning_rate": 0.00036071899115377374, + "loss": 0.8314, "step": 7400 }, { "epoch": 1.3946922642574817, - "grad_norm": 15.320564270019531, - "learning_rate": 2.163184641445511e-05, - "loss": 0.8554, + "grad_norm": 3.205615520477295, + "learning_rate": 0.0003605307735742518, + "loss": 0.7711, "step": 7410 }, { "epoch": 1.396574440052701, - "grad_norm": 9.800172805786133, - "learning_rate": 2.1620553359683795e-05, - "loss": 0.9175, + "grad_norm": 2.362624168395996, + "learning_rate": 0.00036034255599472995, + "loss": 0.8926, "step": 7420 }, { "epoch": 1.3984566158479201, - "grad_norm": 4.258259296417236, - "learning_rate": 2.160926030491248e-05, - "loss": 0.872, + "grad_norm": 0.3860720992088318, + "learning_rate": 0.00036015433841520803, + "loss": 0.7266, "step": 7430 }, { "epoch": 1.4003387916431396, - "grad_norm": 16.88814353942871, - "learning_rate": 2.1597967250141164e-05, - "loss": 0.9596, + "grad_norm": 5.132907390594482, + "learning_rate": 0.0003599661208356861, + "loss": 0.8531, "step": 7440 }, { "epoch": 1.4022209674383588, - "grad_norm": 11.031537055969238, - "learning_rate": 2.1586674195369847e-05, - "loss": 0.8293, + "grad_norm": 2.3389828205108643, + "learning_rate": 0.0003597779032561641, + "loss": 0.5889, "step": 7450 }, { "epoch": 1.404103143233578, - "grad_norm": 20.199750900268555, - "learning_rate": 2.1575381140598533e-05, - "loss": 0.9311, + "grad_norm": 6.251373291015625, + "learning_rate": 0.0003595896856766422, + "loss": 0.8839, "step": 7460 }, { "epoch": 1.4059853190287974, - "grad_norm": 10.844975471496582, - "learning_rate": 2.1564088085827216e-05, - "loss": 0.8357, + "grad_norm": 3.035172939300537, + "learning_rate": 0.0003594014680971203, + "loss": 0.5464, "step": 7470 }, { "epoch": 1.4078674948240166, - "grad_norm": 26.47841453552246, - "learning_rate": 2.15527950310559e-05, - "loss": 0.9514, + "grad_norm": 5.207313060760498, + "learning_rate": 0.00035921325051759836, + "loss": 0.8323, "step": 7480 }, { "epoch": 1.4097496706192358, - "grad_norm": 27.28598976135254, - "learning_rate": 2.1541501976284584e-05, - "loss": 1.2359, + "grad_norm": 2.937911033630371, + "learning_rate": 0.00035902503293807643, + "loss": 1.0031, "step": 7490 }, { "epoch": 1.411631846414455, - "grad_norm": 22.267330169677734, - "learning_rate": 2.153020892151327e-05, - "loss": 0.7749, + "grad_norm": 3.0856587886810303, + "learning_rate": 0.0003588368153585545, + "loss": 0.6141, "step": 7500 }, { "epoch": 1.4135140222096743, - "grad_norm": 10.783427238464355, - "learning_rate": 2.1518915866741956e-05, - "loss": 0.7608, + "grad_norm": 3.810689926147461, + "learning_rate": 0.0003586485977790326, + "loss": 0.6492, "step": 7510 }, { "epoch": 1.4153961980048937, - "grad_norm": 12.085504531860352, - "learning_rate": 2.150762281197064e-05, - "loss": 0.8435, + "grad_norm": 3.4080021381378174, + "learning_rate": 0.00035846038019951066, + "loss": 0.4189, "step": 7520 }, { "epoch": 1.417278373800113, - "grad_norm": 4.772538185119629, - "learning_rate": 2.1496329757199325e-05, - "loss": 1.1515, + "grad_norm": 1.4870398044586182, + "learning_rate": 0.00035827216261998874, + "loss": 0.9096, "step": 7530 }, { "epoch": 1.419160549595332, - "grad_norm": 16.843612670898438, - "learning_rate": 2.1485036702428005e-05, - "loss": 0.6556, + "grad_norm": 4.46442174911499, + "learning_rate": 0.00035808394504046676, + "loss": 0.6143, "step": 7540 }, { "epoch": 1.4210427253905515, - "grad_norm": 16.49220085144043, - "learning_rate": 2.147374364765669e-05, - "loss": 0.9657, + "grad_norm": 4.368817329406738, + "learning_rate": 0.00035789572746094484, + "loss": 0.9284, "step": 7550 }, { "epoch": 1.4229249011857708, - "grad_norm": 9.660463333129883, - "learning_rate": 2.1462450592885377e-05, - "loss": 1.0375, + "grad_norm": 1.6569066047668457, + "learning_rate": 0.0003577075098814229, + "loss": 0.9065, "step": 7560 }, { "epoch": 1.42480707698099, - "grad_norm": 14.508869171142578, - "learning_rate": 2.145115753811406e-05, - "loss": 1.191, + "grad_norm": 7.855297565460205, + "learning_rate": 0.000357519292301901, + "loss": 1.0547, "step": 7570 }, { "epoch": 1.4266892527762094, - "grad_norm": 9.467730522155762, - "learning_rate": 2.1439864483342745e-05, - "loss": 0.911, + "grad_norm": 3.0609915256500244, + "learning_rate": 0.00035733107472237907, + "loss": 0.8127, "step": 7580 }, { "epoch": 1.4285714285714286, - "grad_norm": 9.366085052490234, - "learning_rate": 2.1428571428571428e-05, - "loss": 0.6016, + "grad_norm": 1.1458548307418823, + "learning_rate": 0.00035714285714285714, + "loss": 0.6241, "step": 7590 }, { "epoch": 1.4304536043666478, - "grad_norm": 6.722076892852783, - "learning_rate": 2.1417278373800114e-05, - "loss": 1.1092, + "grad_norm": 2.866499662399292, + "learning_rate": 0.0003569546395633352, + "loss": 0.7273, "step": 7600 }, { "epoch": 1.4323357801618672, - "grad_norm": 7.0276336669921875, - "learning_rate": 2.14059853190288e-05, - "loss": 0.8131, + "grad_norm": 4.291321754455566, + "learning_rate": 0.0003567664219838133, + "loss": 0.6635, "step": 7610 }, { "epoch": 1.4342179559570865, - "grad_norm": 9.224479675292969, - "learning_rate": 2.1394692264257483e-05, - "loss": 0.8081, + "grad_norm": 2.3771772384643555, + "learning_rate": 0.00035657820440429137, + "loss": 0.8284, "step": 7620 }, { "epoch": 1.4361001317523057, - "grad_norm": 18.966449737548828, - "learning_rate": 2.1383399209486166e-05, - "loss": 0.8604, + "grad_norm": 6.619555950164795, + "learning_rate": 0.0003563899868247694, + "loss": 0.7846, "step": 7630 }, { "epoch": 1.4379823075475249, - "grad_norm": 6.464701175689697, - "learning_rate": 2.137210615471485e-05, - "loss": 0.7825, + "grad_norm": 2.625793695449829, + "learning_rate": 0.0003562017692452475, + "loss": 0.6474, "step": 7640 }, { "epoch": 1.439864483342744, - "grad_norm": 31.871877670288086, - "learning_rate": 2.1360813099943534e-05, - "loss": 0.7266, + "grad_norm": 3.1928863525390625, + "learning_rate": 0.0003560135516657256, + "loss": 0.4706, "step": 7650 }, { "epoch": 1.4417466591379635, - "grad_norm": 0.4453635513782501, - "learning_rate": 2.134952004517222e-05, - "loss": 0.9575, + "grad_norm": 0.3685864210128784, + "learning_rate": 0.0003558253340862037, + "loss": 0.7511, "step": 7660 }, { "epoch": 1.4436288349331827, - "grad_norm": 20.737749099731445, - "learning_rate": 2.1338226990400903e-05, - "loss": 0.8743, + "grad_norm": 6.002312660217285, + "learning_rate": 0.00035563711650668175, + "loss": 0.652, "step": 7670 }, { "epoch": 1.445511010728402, - "grad_norm": 7.961155414581299, - "learning_rate": 2.132693393562959e-05, - "loss": 0.8414, + "grad_norm": 2.449984312057495, + "learning_rate": 0.00035544889892715983, + "loss": 0.4978, "step": 7680 }, { "epoch": 1.4473931865236214, - "grad_norm": 14.688976287841797, - "learning_rate": 2.1315640880858275e-05, - "loss": 0.8901, + "grad_norm": 5.680086612701416, + "learning_rate": 0.0003552606813476379, + "loss": 0.9043, "step": 7690 }, { "epoch": 1.4492753623188406, - "grad_norm": 1.112459421157837, - "learning_rate": 2.1304347826086958e-05, - "loss": 0.7949, + "grad_norm": 1.8493766784667969, + "learning_rate": 0.000355072463768116, + "loss": 0.7197, "step": 7700 }, { "epoch": 1.4511575381140598, - "grad_norm": 1.5727367401123047, - "learning_rate": 2.129305477131564e-05, - "loss": 0.834, + "grad_norm": 0.3454371392726898, + "learning_rate": 0.000354884246188594, + "loss": 0.6712, "step": 7710 }, { "epoch": 1.4530397139092792, - "grad_norm": 15.447263717651367, - "learning_rate": 2.1281761716544323e-05, - "loss": 0.8197, + "grad_norm": 3.6451921463012695, + "learning_rate": 0.0003546960286090721, + "loss": 0.8027, "step": 7720 }, { "epoch": 1.4549218897044984, - "grad_norm": 16.137331008911133, - "learning_rate": 2.127046866177301e-05, - "loss": 0.7958, + "grad_norm": 3.223358154296875, + "learning_rate": 0.00035450781102955016, + "loss": 0.7259, "step": 7730 }, { "epoch": 1.4568040654997176, - "grad_norm": 10.978745460510254, - "learning_rate": 2.1259175607001696e-05, - "loss": 1.0626, + "grad_norm": 3.4901061058044434, + "learning_rate": 0.00035431959345002823, + "loss": 0.8298, "step": 7740 }, { "epoch": 1.458686241294937, - "grad_norm": 13.138747215270996, - "learning_rate": 2.1247882552230378e-05, - "loss": 0.8053, + "grad_norm": 1.583277940750122, + "learning_rate": 0.0003541313758705063, + "loss": 0.6178, "step": 7750 }, { "epoch": 1.4605684170901563, - "grad_norm": 15.29321575164795, - "learning_rate": 2.1236589497459064e-05, - "loss": 1.0308, + "grad_norm": 1.1956707239151, + "learning_rate": 0.0003539431582909844, + "loss": 0.8466, "step": 7760 }, { "epoch": 1.4624505928853755, - "grad_norm": 7.9823527336120605, - "learning_rate": 2.1225296442687747e-05, - "loss": 1.0307, + "grad_norm": 2.058783769607544, + "learning_rate": 0.00035375494071146246, + "loss": 0.905, "step": 7770 }, { "epoch": 1.4643327686805947, - "grad_norm": 28.051477432250977, - "learning_rate": 2.1214003387916433e-05, - "loss": 0.9783, + "grad_norm": 8.526945114135742, + "learning_rate": 0.00035356672313194054, + "loss": 0.9288, "step": 7780 }, { "epoch": 1.466214944475814, - "grad_norm": 2.81699800491333, - "learning_rate": 2.120271033314512e-05, - "loss": 0.6847, + "grad_norm": 3.497265577316284, + "learning_rate": 0.0003533785055524186, + "loss": 0.6056, "step": 7790 }, { "epoch": 1.4680971202710333, - "grad_norm": 20.243371963500977, - "learning_rate": 2.11914172783738e-05, - "loss": 0.8305, + "grad_norm": 2.1482982635498047, + "learning_rate": 0.00035319028797289664, + "loss": 0.6909, "step": 7800 }, { "epoch": 1.4699792960662525, - "grad_norm": 5.7048115730285645, - "learning_rate": 2.1180124223602485e-05, - "loss": 0.6723, + "grad_norm": 2.2635204792022705, + "learning_rate": 0.0003530020703933747, + "loss": 0.6319, "step": 7810 }, { "epoch": 1.4718614718614718, - "grad_norm": 14.926733016967773, - "learning_rate": 2.1168831168831167e-05, - "loss": 0.9217, + "grad_norm": 2.105267286300659, + "learning_rate": 0.0003528138528138528, + "loss": 0.7792, "step": 7820 }, { "epoch": 1.4737436476566912, - "grad_norm": 22.269062042236328, - "learning_rate": 2.1157538114059853e-05, - "loss": 1.0226, + "grad_norm": 2.60502552986145, + "learning_rate": 0.00035262563523433087, + "loss": 0.8495, "step": 7830 }, { "epoch": 1.4756258234519104, - "grad_norm": 5.699794292449951, - "learning_rate": 2.114624505928854e-05, - "loss": 0.749, + "grad_norm": 1.7702150344848633, + "learning_rate": 0.00035243741765480894, + "loss": 0.6141, "step": 7840 }, { "epoch": 1.4775079992471296, - "grad_norm": 5.858451843261719, - "learning_rate": 2.1134952004517222e-05, - "loss": 0.7048, + "grad_norm": 4.5006184577941895, + "learning_rate": 0.000352249200075287, + "loss": 0.7274, "step": 7850 }, { "epoch": 1.479390175042349, - "grad_norm": 8.59745979309082, - "learning_rate": 2.1123658949745908e-05, - "loss": 0.9316, + "grad_norm": 3.0699360370635986, + "learning_rate": 0.00035206098249576515, + "loss": 0.8281, "step": 7860 }, { "epoch": 1.4812723508375683, - "grad_norm": 12.093297958374023, - "learning_rate": 2.111236589497459e-05, - "loss": 1.0178, + "grad_norm": 8.668073654174805, + "learning_rate": 0.00035187276491624323, + "loss": 0.8215, "step": 7870 }, { "epoch": 1.4831545266327875, - "grad_norm": 12.393393516540527, - "learning_rate": 2.1101072840203277e-05, - "loss": 0.7238, + "grad_norm": 7.539294242858887, + "learning_rate": 0.0003516845473367213, + "loss": 0.8282, "step": 7880 }, { "epoch": 1.485036702428007, - "grad_norm": 46.78508758544922, - "learning_rate": 2.108977978543196e-05, - "loss": 0.8131, + "grad_norm": 3.0696523189544678, + "learning_rate": 0.0003514963297571993, + "loss": 0.6651, "step": 7890 }, { "epoch": 1.486918878223226, - "grad_norm": 1.7145179510116577, - "learning_rate": 2.1078486730660642e-05, - "loss": 0.7799, + "grad_norm": 2.3803205490112305, + "learning_rate": 0.0003513081121776774, + "loss": 0.7852, "step": 7900 }, { "epoch": 1.4888010540184453, - "grad_norm": 24.29248809814453, - "learning_rate": 2.106719367588933e-05, - "loss": 1.083, + "grad_norm": 1.9712696075439453, + "learning_rate": 0.0003511198945981555, + "loss": 0.8236, "step": 7910 }, { "epoch": 1.4906832298136645, - "grad_norm": 10.044913291931152, - "learning_rate": 2.105590062111801e-05, - "loss": 0.7995, + "grad_norm": 3.2915306091308594, + "learning_rate": 0.00035093167701863356, + "loss": 0.7561, "step": 7920 }, { "epoch": 1.4925654056088837, - "grad_norm": 17.686159133911133, - "learning_rate": 2.1044607566346697e-05, - "loss": 1.23, + "grad_norm": 2.5508902072906494, + "learning_rate": 0.00035074345943911163, + "loss": 1.0034, "step": 7930 }, { "epoch": 1.4944475814041032, - "grad_norm": 12.747879981994629, - "learning_rate": 2.1033314511575383e-05, - "loss": 1.0773, + "grad_norm": 4.021931171417236, + "learning_rate": 0.0003505552418595897, + "loss": 0.9428, "step": 7940 }, { "epoch": 1.4963297571993224, - "grad_norm": 14.406421661376953, - "learning_rate": 2.1022021456804066e-05, - "loss": 0.8235, + "grad_norm": 3.7076730728149414, + "learning_rate": 0.0003503670242800678, + "loss": 0.7309, "step": 7950 }, { "epoch": 1.4982119329945416, - "grad_norm": 9.100472450256348, - "learning_rate": 2.1010728402032752e-05, - "loss": 0.5385, + "grad_norm": 2.309861421585083, + "learning_rate": 0.00035017880670054586, + "loss": 0.4758, "step": 7960 }, { "epoch": 1.500094108789761, - "grad_norm": 1.3265353441238403, - "learning_rate": 2.0999435347261435e-05, - "loss": 0.7218, + "grad_norm": 4.0389862060546875, + "learning_rate": 0.0003499905891210239, + "loss": 0.7024, "step": 7970 }, { "epoch": 1.5019762845849802, - "grad_norm": 22.250818252563477, - "learning_rate": 2.0988142292490117e-05, - "loss": 0.9737, + "grad_norm": 3.7609620094299316, + "learning_rate": 0.00034980237154150196, + "loss": 0.8486, "step": 7980 }, { "epoch": 1.5038584603801994, - "grad_norm": 6.612005710601807, - "learning_rate": 2.0976849237718803e-05, - "loss": 0.7305, + "grad_norm": 2.8926572799682617, + "learning_rate": 0.00034961415396198004, + "loss": 0.5895, "step": 7990 }, { "epoch": 1.5057406361754189, - "grad_norm": 26.889253616333008, - "learning_rate": 2.0965556182947486e-05, - "loss": 0.82, + "grad_norm": 4.276566982269287, + "learning_rate": 0.0003494259363824581, + "loss": 0.6477, "step": 8000 }, { "epoch": 1.507622811970638, - "grad_norm": 15.2909574508667, - "learning_rate": 2.0954263128176172e-05, - "loss": 0.8488, + "grad_norm": 3.2659835815429688, + "learning_rate": 0.0003492377188029362, + "loss": 0.7834, "step": 8010 }, { "epoch": 1.5095049877658573, - "grad_norm": 14.072726249694824, - "learning_rate": 2.0942970073404858e-05, - "loss": 0.7917, + "grad_norm": 2.6856377124786377, + "learning_rate": 0.00034904950122341427, + "loss": 0.6446, "step": 8020 }, { "epoch": 1.5113871635610767, - "grad_norm": 19.601593017578125, - "learning_rate": 2.093167701863354e-05, - "loss": 0.8319, + "grad_norm": 2.0098695755004883, + "learning_rate": 0.00034886128364389234, + "loss": 0.7457, "step": 8030 }, { "epoch": 1.513269339356296, - "grad_norm": 11.248756408691406, - "learning_rate": 2.0920383963862227e-05, - "loss": 0.8455, + "grad_norm": 2.1096489429473877, + "learning_rate": 0.0003486730660643704, + "loss": 0.7569, "step": 8040 }, { "epoch": 1.5151515151515151, - "grad_norm": 14.29757022857666, - "learning_rate": 2.090909090909091e-05, - "loss": 0.836, + "grad_norm": 2.1530377864837646, + "learning_rate": 0.0003484848484848485, + "loss": 0.6903, "step": 8050 }, { "epoch": 1.5170336909467346, - "grad_norm": 18.773252487182617, - "learning_rate": 2.0897797854319592e-05, - "loss": 0.7697, + "grad_norm": 3.4614245891571045, + "learning_rate": 0.0003482966309053265, + "loss": 0.7235, "step": 8060 }, { "epoch": 1.5189158667419536, - "grad_norm": 13.430651664733887, - "learning_rate": 2.088650479954828e-05, - "loss": 0.7545, + "grad_norm": 2.9231855869293213, + "learning_rate": 0.0003481084133258046, + "loss": 0.6586, "step": 8070 }, { "epoch": 1.520798042537173, - "grad_norm": 18.43965721130371, - "learning_rate": 2.087521174477696e-05, - "loss": 0.9061, + "grad_norm": 3.0614635944366455, + "learning_rate": 0.00034792019574628267, + "loss": 0.827, "step": 8080 }, { "epoch": 1.5226802183323922, - "grad_norm": 4.812375068664551, - "learning_rate": 2.0863918690005647e-05, - "loss": 0.7921, + "grad_norm": 1.5056312084197998, + "learning_rate": 0.0003477319781667608, + "loss": 0.7082, "step": 8090 }, { "epoch": 1.5245623941276114, - "grad_norm": 13.704950332641602, - "learning_rate": 2.085262563523433e-05, - "loss": 1.0409, + "grad_norm": 1.0233908891677856, + "learning_rate": 0.0003475437605872389, + "loss": 0.7483, "step": 8100 }, { "epoch": 1.5264445699228308, - "grad_norm": 7.7468414306640625, - "learning_rate": 2.0841332580463016e-05, - "loss": 0.8707, + "grad_norm": 2.48866868019104, + "learning_rate": 0.00034735554300771695, + "loss": 0.8398, "step": 8110 }, { "epoch": 1.52832674571805, - "grad_norm": 8.92183780670166, - "learning_rate": 2.0830039525691702e-05, - "loss": 1.0549, + "grad_norm": 2.297361373901367, + "learning_rate": 0.00034716732542819503, + "loss": 0.8928, "step": 8120 }, { "epoch": 1.5302089215132693, - "grad_norm": 12.238114356994629, - "learning_rate": 2.0818746470920385e-05, - "loss": 1.0467, + "grad_norm": 1.6928592920303345, + "learning_rate": 0.0003469791078486731, + "loss": 0.7641, "step": 8130 }, { "epoch": 1.5320910973084887, - "grad_norm": 15.757767677307129, - "learning_rate": 2.080745341614907e-05, - "loss": 0.8356, + "grad_norm": 3.405965805053711, + "learning_rate": 0.0003467908902691512, + "loss": 0.8176, "step": 8140 }, { "epoch": 1.533973273103708, - "grad_norm": 15.579939842224121, - "learning_rate": 2.079616036137775e-05, - "loss": 0.7802, + "grad_norm": 2.9637680053710938, + "learning_rate": 0.0003466026726896292, + "loss": 0.7238, "step": 8150 }, { "epoch": 1.5358554488989271, - "grad_norm": 7.932995796203613, - "learning_rate": 2.0784867306606436e-05, - "loss": 0.782, + "grad_norm": 1.8810489177703857, + "learning_rate": 0.0003464144551101073, + "loss": 0.788, "step": 8160 }, { "epoch": 1.5377376246941465, - "grad_norm": 34.61065673828125, - "learning_rate": 2.0773574251835122e-05, - "loss": 1.0706, + "grad_norm": 8.370185852050781, + "learning_rate": 0.00034622623753058536, + "loss": 1.0583, "step": 8170 }, { "epoch": 1.5396198004893658, - "grad_norm": 6.347330093383789, - "learning_rate": 2.0762281197063805e-05, - "loss": 0.862, + "grad_norm": 1.2462303638458252, + "learning_rate": 0.00034603801995106343, + "loss": 0.7773, "step": 8180 }, { "epoch": 1.541501976284585, - "grad_norm": 12.102984428405762, - "learning_rate": 2.075098814229249e-05, - "loss": 0.976, + "grad_norm": 2.5813164710998535, + "learning_rate": 0.0003458498023715415, + "loss": 0.7235, "step": 8190 }, { "epoch": 1.5433841520798044, - "grad_norm": 34.155494689941406, - "learning_rate": 2.0739695087521177e-05, - "loss": 0.9479, + "grad_norm": 3.9357566833496094, + "learning_rate": 0.0003456615847920196, + "loss": 0.7332, "step": 8200 }, { "epoch": 1.5452663278750234, - "grad_norm": 4.237630844116211, - "learning_rate": 2.072840203274986e-05, - "loss": 0.6754, + "grad_norm": 1.743631362915039, + "learning_rate": 0.00034547336721249766, + "loss": 0.6358, "step": 8210 }, { "epoch": 1.5471485036702428, - "grad_norm": 9.868550300598145, - "learning_rate": 2.0717108977978546e-05, - "loss": 0.6415, + "grad_norm": 1.980130672454834, + "learning_rate": 0.00034528514963297574, + "loss": 0.6059, "step": 8220 }, { "epoch": 1.549030679465462, - "grad_norm": 9.84210205078125, - "learning_rate": 2.070581592320723e-05, - "loss": 1.0436, + "grad_norm": 3.3148014545440674, + "learning_rate": 0.0003450969320534538, + "loss": 0.9266, "step": 8230 }, { "epoch": 1.5509128552606812, - "grad_norm": 6.024266242980957, - "learning_rate": 2.069452286843591e-05, - "loss": 0.749, + "grad_norm": 4.528003692626953, + "learning_rate": 0.00034490871447393184, + "loss": 0.7274, "step": 8240 }, { "epoch": 1.5527950310559007, - "grad_norm": 24.58380889892578, - "learning_rate": 2.0683229813664597e-05, - "loss": 1.102, + "grad_norm": 2.7898247241973877, + "learning_rate": 0.0003447204968944099, + "loss": 0.9216, "step": 8250 }, { "epoch": 1.5546772068511199, - "grad_norm": 15.549361228942871, - "learning_rate": 2.067193675889328e-05, - "loss": 0.619, + "grad_norm": 5.00869083404541, + "learning_rate": 0.000344532279314888, + "loss": 0.5532, "step": 8260 }, { "epoch": 1.556559382646339, - "grad_norm": 11.536001205444336, - "learning_rate": 2.0660643704121966e-05, - "loss": 0.7282, + "grad_norm": 1.2055312395095825, + "learning_rate": 0.00034434406173536607, + "loss": 0.5286, "step": 8270 }, { "epoch": 1.5584415584415585, - "grad_norm": 1.458553671836853, - "learning_rate": 2.064935064935065e-05, - "loss": 0.8282, + "grad_norm": 0.34826549887657166, + "learning_rate": 0.00034415584415584414, + "loss": 0.6315, "step": 8280 }, { "epoch": 1.5603237342367777, - "grad_norm": 22.097503662109375, - "learning_rate": 2.0638057594579335e-05, - "loss": 1.32, + "grad_norm": 1.7927368879318237, + "learning_rate": 0.0003439676265763222, + "loss": 1.0556, "step": 8290 }, { "epoch": 1.562205910031997, - "grad_norm": 1.0465900897979736, - "learning_rate": 2.062676453980802e-05, - "loss": 0.6825, + "grad_norm": 3.9631786346435547, + "learning_rate": 0.00034377940899680035, + "loss": 0.5423, "step": 8300 }, { "epoch": 1.5640880858272164, - "grad_norm": 4.5169782638549805, - "learning_rate": 2.0615471485036704e-05, - "loss": 1.0293, + "grad_norm": 1.3428804874420166, + "learning_rate": 0.00034359119141727843, + "loss": 0.9069, "step": 8310 }, { "epoch": 1.5659702616224356, - "grad_norm": 24.006298065185547, - "learning_rate": 2.0604178430265386e-05, - "loss": 0.4358, + "grad_norm": 2.6552329063415527, + "learning_rate": 0.00034340297383775645, + "loss": 0.3473, "step": 8320 }, { "epoch": 1.5678524374176548, - "grad_norm": 34.115413665771484, - "learning_rate": 2.059288537549407e-05, - "loss": 0.7429, + "grad_norm": 3.216970920562744, + "learning_rate": 0.0003432147562582345, + "loss": 0.6878, "step": 8330 }, { "epoch": 1.5697346132128742, - "grad_norm": 0.8864557147026062, - "learning_rate": 2.0581592320722755e-05, - "loss": 0.6736, + "grad_norm": 1.658892273902893, + "learning_rate": 0.0003430265386787126, + "loss": 0.6124, "step": 8340 }, { "epoch": 1.5716167890080932, - "grad_norm": 3.0202507972717285, - "learning_rate": 2.057029926595144e-05, - "loss": 0.6662, + "grad_norm": 1.2303948402404785, + "learning_rate": 0.0003428383210991907, + "loss": 0.456, "step": 8350 }, { "epoch": 1.5734989648033126, - "grad_norm": 16.685216903686523, - "learning_rate": 2.0559006211180124e-05, - "loss": 1.1072, + "grad_norm": 3.7826428413391113, + "learning_rate": 0.00034265010351966876, + "loss": 0.8232, "step": 8360 }, { "epoch": 1.5753811405985318, - "grad_norm": 14.748647689819336, - "learning_rate": 2.054771315640881e-05, - "loss": 0.6912, + "grad_norm": 2.520322799682617, + "learning_rate": 0.00034246188594014683, + "loss": 0.4863, "step": 8370 }, { "epoch": 1.577263316393751, - "grad_norm": 19.778024673461914, - "learning_rate": 2.0536420101637493e-05, - "loss": 1.0492, + "grad_norm": 3.2636899948120117, + "learning_rate": 0.0003422736683606249, + "loss": 0.9228, "step": 8380 }, { "epoch": 1.5791454921889705, - "grad_norm": 1.4614105224609375, - "learning_rate": 2.052512704686618e-05, - "loss": 1.0389, + "grad_norm": 0.5615966320037842, + "learning_rate": 0.000342085450781103, + "loss": 0.8344, "step": 8390 }, { "epoch": 1.5810276679841897, - "grad_norm": 2.2111265659332275, - "learning_rate": 2.0513833992094865e-05, - "loss": 0.7137, + "grad_norm": 1.5312647819519043, + "learning_rate": 0.00034189723320158106, + "loss": 0.5623, "step": 8400 }, { "epoch": 1.582909843779409, - "grad_norm": 25.663915634155273, - "learning_rate": 2.0502540937323544e-05, - "loss": 0.8827, + "grad_norm": 2.7488324642181396, + "learning_rate": 0.0003417090156220591, + "loss": 0.7472, "step": 8410 }, { "epoch": 1.5847920195746283, - "grad_norm": 32.34187316894531, - "learning_rate": 2.049124788255223e-05, - "loss": 1.359, + "grad_norm": 2.1798367500305176, + "learning_rate": 0.00034152079804253716, + "loss": 1.2064, "step": 8420 }, { "epoch": 1.5866741953698476, - "grad_norm": 9.49844741821289, - "learning_rate": 2.0479954827780913e-05, - "loss": 0.7594, + "grad_norm": 1.7178131341934204, + "learning_rate": 0.00034133258046301524, + "loss": 0.7014, "step": 8430 }, { "epoch": 1.5885563711650668, - "grad_norm": 8.139872550964355, - "learning_rate": 2.04686617730096e-05, - "loss": 0.9236, + "grad_norm": 1.6949135065078735, + "learning_rate": 0.0003411443628834933, + "loss": 0.7091, "step": 8440 }, { "epoch": 1.5904385469602862, - "grad_norm": 4.609650135040283, - "learning_rate": 2.0457368718238285e-05, - "loss": 1.0342, + "grad_norm": 2.0318641662597656, + "learning_rate": 0.0003409561453039714, + "loss": 0.8617, "step": 8450 }, { "epoch": 1.5923207227555054, - "grad_norm": 13.851377487182617, - "learning_rate": 2.0446075663466968e-05, - "loss": 0.985, + "grad_norm": 2.8767454624176025, + "learning_rate": 0.00034076792772444947, + "loss": 0.7656, "step": 8460 }, { "epoch": 1.5942028985507246, - "grad_norm": 17.376338958740234, - "learning_rate": 2.0434782608695654e-05, - "loss": 0.8613, + "grad_norm": 5.067991256713867, + "learning_rate": 0.00034057971014492754, + "loss": 0.7261, "step": 8470 }, { "epoch": 1.596085074345944, - "grad_norm": 16.559776306152344, - "learning_rate": 2.042348955392434e-05, - "loss": 0.6948, + "grad_norm": 2.4623968601226807, + "learning_rate": 0.0003403914925654056, + "loss": 0.6039, "step": 8480 }, { "epoch": 1.597967250141163, - "grad_norm": 28.88027000427246, - "learning_rate": 2.0412196499153023e-05, - "loss": 0.7705, + "grad_norm": 2.616107940673828, + "learning_rate": 0.0003402032749858837, + "loss": 0.5545, "step": 8490 }, { "epoch": 1.5998494259363825, - "grad_norm": 25.906251907348633, - "learning_rate": 2.0400903444381705e-05, - "loss": 0.8089, + "grad_norm": 2.3766703605651855, + "learning_rate": 0.0003400150574063617, + "loss": 0.569, "step": 8500 }, { "epoch": 1.601731601731602, - "grad_norm": 3.415189266204834, - "learning_rate": 2.0389610389610388e-05, - "loss": 0.4868, + "grad_norm": 1.4852138757705688, + "learning_rate": 0.0003398268398268398, + "loss": 0.4487, "step": 8510 }, { "epoch": 1.6036137775268209, - "grad_norm": 7.368528366088867, - "learning_rate": 2.0378317334839074e-05, - "loss": 0.6249, + "grad_norm": 10.471375465393066, + "learning_rate": 0.00033963862224731787, + "loss": 0.4925, "step": 8520 }, { "epoch": 1.6054959533220403, - "grad_norm": 12.94930362701416, - "learning_rate": 2.036702428006776e-05, - "loss": 0.8234, + "grad_norm": 3.325183629989624, + "learning_rate": 0.000339450404667796, + "loss": 0.5884, "step": 8530 }, { "epoch": 1.6073781291172595, - "grad_norm": 1.4030075073242188, - "learning_rate": 2.0355731225296443e-05, - "loss": 0.7259, + "grad_norm": 0.5877097249031067, + "learning_rate": 0.0003392621870882741, + "loss": 0.669, "step": 8540 }, { "epoch": 1.6092603049124787, - "grad_norm": 21.617116928100586, - "learning_rate": 2.034443817052513e-05, - "loss": 1.0274, + "grad_norm": 1.0777863264083862, + "learning_rate": 0.00033907396950875215, + "loss": 0.9738, "step": 8550 }, { "epoch": 1.6111424807076982, - "grad_norm": 14.721446990966797, - "learning_rate": 2.033314511575381e-05, - "loss": 0.7038, + "grad_norm": 4.0034685134887695, + "learning_rate": 0.00033888575192923023, + "loss": 0.5929, "step": 8560 }, { "epoch": 1.6130246565029174, - "grad_norm": 37.175865173339844, - "learning_rate": 2.0321852060982498e-05, - "loss": 0.8129, + "grad_norm": 0.5476213693618774, + "learning_rate": 0.0003386975343497083, + "loss": 0.7154, "step": 8570 }, { "epoch": 1.6149068322981366, - "grad_norm": 18.872203826904297, - "learning_rate": 2.031055900621118e-05, - "loss": 0.7054, + "grad_norm": 2.511484384536743, + "learning_rate": 0.00033850931677018633, + "loss": 0.5125, "step": 8580 }, { "epoch": 1.616789008093356, - "grad_norm": 18.754467010498047, - "learning_rate": 2.0299265951439863e-05, - "loss": 0.7435, + "grad_norm": 5.705085277557373, + "learning_rate": 0.0003383210991906644, + "loss": 0.6806, "step": 8590 }, { "epoch": 1.6186711838885752, - "grad_norm": 17.79302406311035, - "learning_rate": 2.028797289666855e-05, - "loss": 0.8313, + "grad_norm": 5.182740211486816, + "learning_rate": 0.0003381328816111425, + "loss": 0.7014, "step": 8600 }, { "epoch": 1.6205533596837944, - "grad_norm": 17.659814834594727, - "learning_rate": 2.0276679841897232e-05, - "loss": 0.7874, + "grad_norm": 5.4863080978393555, + "learning_rate": 0.00033794466403162056, + "loss": 0.664, "step": 8610 }, { "epoch": 1.6224355354790139, - "grad_norm": 14.71423625946045, - "learning_rate": 2.0265386787125918e-05, - "loss": 0.83, + "grad_norm": 2.908485174179077, + "learning_rate": 0.00033775644645209863, + "loss": 0.6387, "step": 8620 }, { "epoch": 1.6243177112742329, - "grad_norm": 17.527603149414062, - "learning_rate": 2.0254093732354604e-05, - "loss": 1.0773, + "grad_norm": 4.561892032623291, + "learning_rate": 0.0003375682288725767, + "loss": 0.9372, "step": 8630 }, { "epoch": 1.6261998870694523, - "grad_norm": 12.642634391784668, - "learning_rate": 2.0242800677583287e-05, - "loss": 0.9208, + "grad_norm": 1.4518635272979736, + "learning_rate": 0.0003373800112930548, + "loss": 0.9095, "step": 8640 }, { "epoch": 1.6280820628646717, - "grad_norm": 1.2495397329330444, - "learning_rate": 2.0231507622811973e-05, - "loss": 0.9029, + "grad_norm": 0.5780901908874512, + "learning_rate": 0.00033719179371353286, + "loss": 0.8878, "step": 8650 }, { "epoch": 1.6299642386598907, - "grad_norm": 0.4178260564804077, - "learning_rate": 2.0220214568040655e-05, - "loss": 0.6525, + "grad_norm": 0.13929180800914764, + "learning_rate": 0.00033700357613401094, + "loss": 0.6141, "step": 8660 }, { "epoch": 1.6318464144551101, - "grad_norm": 12.84122085571289, - "learning_rate": 2.0208921513269338e-05, - "loss": 0.6563, + "grad_norm": 0.9517858624458313, + "learning_rate": 0.00033681535855448896, + "loss": 0.4543, "step": 8670 }, { "epoch": 1.6337285902503293, - "grad_norm": 14.375418663024902, - "learning_rate": 2.0197628458498024e-05, - "loss": 0.9404, + "grad_norm": 7.420551300048828, + "learning_rate": 0.00033662714097496704, + "loss": 0.8612, "step": 8680 }, { "epoch": 1.6356107660455486, - "grad_norm": 5.309718132019043, - "learning_rate": 2.0186335403726707e-05, - "loss": 0.8255, + "grad_norm": 3.4701571464538574, + "learning_rate": 0.0003364389233954451, + "loss": 0.8368, "step": 8690 }, { "epoch": 1.637492941840768, - "grad_norm": 19.447336196899414, - "learning_rate": 2.0175042348955393e-05, - "loss": 0.9448, + "grad_norm": 4.836475372314453, + "learning_rate": 0.0003362507058159232, + "loss": 0.8814, "step": 8700 }, { "epoch": 1.6393751176359872, - "grad_norm": 5.150761127471924, - "learning_rate": 2.016374929418408e-05, - "loss": 0.7773, + "grad_norm": 2.2584071159362793, + "learning_rate": 0.00033606248823640127, + "loss": 0.6878, "step": 8710 }, { "epoch": 1.6412572934312064, - "grad_norm": 10.67483139038086, - "learning_rate": 2.015245623941276e-05, - "loss": 0.6024, + "grad_norm": 2.989353656768799, + "learning_rate": 0.00033587427065687934, + "loss": 0.5362, "step": 8720 }, { "epoch": 1.6431394692264258, - "grad_norm": 18.900775909423828, - "learning_rate": 2.0141163184641448e-05, - "loss": 0.6435, + "grad_norm": 4.017577171325684, + "learning_rate": 0.0003356860530773574, + "loss": 0.5628, "step": 8730 }, { "epoch": 1.645021645021645, - "grad_norm": 7.011523246765137, - "learning_rate": 2.012987012987013e-05, - "loss": 0.9581, + "grad_norm": 2.840949773788452, + "learning_rate": 0.0003354978354978355, + "loss": 0.6433, "step": 8740 }, { "epoch": 1.6469038208168643, - "grad_norm": 20.141672134399414, - "learning_rate": 2.0118577075098816e-05, - "loss": 0.8604, + "grad_norm": 2.3696165084838867, + "learning_rate": 0.00033530961791831363, + "loss": 0.8615, "step": 8750 }, { "epoch": 1.6487859966120837, - "grad_norm": 19.226642608642578, - "learning_rate": 2.01072840203275e-05, - "loss": 0.8389, + "grad_norm": 2.571803092956543, + "learning_rate": 0.00033512140033879165, + "loss": 0.7667, "step": 8760 }, { "epoch": 1.6506681724073027, - "grad_norm": 12.022441864013672, - "learning_rate": 2.0095990965556182e-05, - "loss": 0.8879, + "grad_norm": 4.093145847320557, + "learning_rate": 0.0003349331827592697, + "loss": 0.7379, "step": 8770 }, { "epoch": 1.6525503482025221, - "grad_norm": 16.575210571289062, - "learning_rate": 2.0084697910784868e-05, - "loss": 0.9649, + "grad_norm": 4.968115329742432, + "learning_rate": 0.0003347449651797478, + "loss": 0.9293, "step": 8780 }, { "epoch": 1.6544325239977415, - "grad_norm": 13.532410621643066, - "learning_rate": 2.007340485601355e-05, - "loss": 1.1161, + "grad_norm": 4.6447248458862305, + "learning_rate": 0.0003345567476002259, + "loss": 1.1339, "step": 8790 }, { "epoch": 1.6563146997929605, - "grad_norm": 13.928875923156738, - "learning_rate": 2.0062111801242237e-05, - "loss": 0.57, + "grad_norm": 2.701174736022949, + "learning_rate": 0.00033436853002070396, + "loss": 0.6028, "step": 8800 }, { "epoch": 1.65819687558818, - "grad_norm": 9.75232982635498, - "learning_rate": 2.0050818746470923e-05, - "loss": 0.5475, + "grad_norm": 10.108606338500977, + "learning_rate": 0.00033418031244118203, + "loss": 0.593, "step": 8810 }, { "epoch": 1.6600790513833992, - "grad_norm": 19.574495315551758, - "learning_rate": 2.0039525691699605e-05, - "loss": 0.8012, + "grad_norm": 2.6389102935791016, + "learning_rate": 0.0003339920948616601, + "loss": 0.7667, "step": 8820 }, { "epoch": 1.6619612271786184, - "grad_norm": 8.311776161193848, - "learning_rate": 2.002823263692829e-05, - "loss": 0.8989, + "grad_norm": 8.076936721801758, + "learning_rate": 0.0003338038772821382, + "loss": 0.7453, "step": 8830 }, { "epoch": 1.6638434029738378, - "grad_norm": 3.7794203758239746, - "learning_rate": 2.0016939582156974e-05, - "loss": 0.7828, + "grad_norm": 0.3804866075515747, + "learning_rate": 0.00033361565970261626, + "loss": 0.822, "step": 8840 }, { "epoch": 1.665725578769057, - "grad_norm": 22.881847381591797, - "learning_rate": 2.0005646527385657e-05, - "loss": 1.0597, + "grad_norm": 4.877434253692627, + "learning_rate": 0.0003334274421230943, + "loss": 0.8363, "step": 8850 }, { "epoch": 1.6676077545642762, - "grad_norm": 23.629314422607422, - "learning_rate": 1.9994353472614343e-05, - "loss": 0.905, + "grad_norm": 2.456123113632202, + "learning_rate": 0.00033323922454357236, + "loss": 0.796, "step": 8860 }, { "epoch": 1.6694899303594957, - "grad_norm": 19.81004524230957, - "learning_rate": 1.9983060417843026e-05, - "loss": 1.0952, + "grad_norm": 4.350971698760986, + "learning_rate": 0.00033305100696405044, + "loss": 0.998, "step": 8870 }, { "epoch": 1.6713721061547149, - "grad_norm": 8.309386253356934, - "learning_rate": 1.9971767363071712e-05, - "loss": 0.7427, + "grad_norm": 2.929431200027466, + "learning_rate": 0.0003328627893845285, + "loss": 0.5895, "step": 8880 }, { "epoch": 1.673254281949934, - "grad_norm": 6.19067907333374, - "learning_rate": 1.9960474308300394e-05, - "loss": 0.7889, + "grad_norm": 0.23275358974933624, + "learning_rate": 0.0003326745718050066, + "loss": 0.6795, "step": 8890 }, { "epoch": 1.6751364577451535, - "grad_norm": 2.2768592834472656, - "learning_rate": 1.994918125352908e-05, - "loss": 0.8985, + "grad_norm": 2.660062074661255, + "learning_rate": 0.00033248635422548467, + "loss": 0.7639, "step": 8900 }, { "epoch": 1.6770186335403725, - "grad_norm": 7.979760646820068, - "learning_rate": 1.9937888198757767e-05, - "loss": 0.6913, + "grad_norm": 3.0600240230560303, + "learning_rate": 0.00033229813664596274, + "loss": 0.7491, "step": 8910 }, { "epoch": 1.678900809335592, - "grad_norm": 10.671245574951172, - "learning_rate": 1.992659514398645e-05, - "loss": 0.995, + "grad_norm": 2.6997058391571045, + "learning_rate": 0.0003321099190664408, + "loss": 0.9388, "step": 8920 }, { "epoch": 1.6807829851308114, - "grad_norm": 8.021512031555176, - "learning_rate": 1.9915302089215132e-05, - "loss": 1.1053, + "grad_norm": 3.960489511489868, + "learning_rate": 0.00033192170148691884, + "loss": 0.8138, "step": 8930 }, { "epoch": 1.6826651609260304, - "grad_norm": 10.151590347290039, - "learning_rate": 1.9904009034443818e-05, - "loss": 0.7479, + "grad_norm": 1.3196567296981812, + "learning_rate": 0.0003317334839073969, + "loss": 0.6019, "step": 8940 }, { "epoch": 1.6845473367212498, - "grad_norm": 15.058424949645996, - "learning_rate": 1.98927159796725e-05, - "loss": 0.9105, + "grad_norm": 3.450125217437744, + "learning_rate": 0.000331545266327875, + "loss": 0.7857, "step": 8950 }, { "epoch": 1.686429512516469, - "grad_norm": 0.7414102554321289, - "learning_rate": 1.9881422924901187e-05, - "loss": 0.7906, + "grad_norm": 3.65901517868042, + "learning_rate": 0.00033135704874835307, + "loss": 0.6392, "step": 8960 }, { "epoch": 1.6883116883116882, - "grad_norm": 16.738800048828125, - "learning_rate": 1.987012987012987e-05, - "loss": 0.891, + "grad_norm": 2.7813315391540527, + "learning_rate": 0.0003311688311688312, + "loss": 0.842, "step": 8970 }, { "epoch": 1.6901938641069076, - "grad_norm": 11.300554275512695, - "learning_rate": 1.9858836815358556e-05, - "loss": 0.849, + "grad_norm": 1.5750610828399658, + "learning_rate": 0.0003309806135893093, + "loss": 0.7625, "step": 8980 }, { "epoch": 1.6920760399021268, - "grad_norm": 23.922866821289062, - "learning_rate": 1.984754376058724e-05, - "loss": 1.182, + "grad_norm": 5.054027080535889, + "learning_rate": 0.00033079239600978735, + "loss": 0.9992, "step": 8990 }, { "epoch": 1.693958215697346, - "grad_norm": 15.910576820373535, - "learning_rate": 1.9836250705815924e-05, - "loss": 0.692, + "grad_norm": 1.384577751159668, + "learning_rate": 0.00033060417843026543, + "loss": 0.5368, "step": 9000 }, { "epoch": 1.6958403914925655, - "grad_norm": 12.029934883117676, - "learning_rate": 1.982495765104461e-05, - "loss": 0.8861, + "grad_norm": 2.0076184272766113, + "learning_rate": 0.0003304159608507435, + "loss": 0.8174, "step": 9010 }, { "epoch": 1.6977225672877847, - "grad_norm": 11.433530807495117, - "learning_rate": 1.981366459627329e-05, - "loss": 0.8564, + "grad_norm": 2.57498836517334, + "learning_rate": 0.00033022774327122153, + "loss": 0.7774, "step": 9020 }, { "epoch": 1.699604743083004, - "grad_norm": 16.016300201416016, - "learning_rate": 1.9802371541501976e-05, - "loss": 0.7998, + "grad_norm": 4.454178333282471, + "learning_rate": 0.0003300395256916996, + "loss": 0.6064, "step": 9030 }, { "epoch": 1.7014869188782233, - "grad_norm": 13.163691520690918, - "learning_rate": 1.9791078486730662e-05, - "loss": 0.8364, + "grad_norm": 3.815958023071289, + "learning_rate": 0.0003298513081121777, + "loss": 0.7997, "step": 9040 }, { "epoch": 1.7033690946734426, - "grad_norm": 15.584578514099121, - "learning_rate": 1.9779785431959345e-05, - "loss": 0.8937, + "grad_norm": 3.1478142738342285, + "learning_rate": 0.00032966309053265576, + "loss": 0.7746, "step": 9050 }, { "epoch": 1.7052512704686618, - "grad_norm": 34.573814392089844, - "learning_rate": 1.976849237718803e-05, - "loss": 0.4642, + "grad_norm": 2.721916675567627, + "learning_rate": 0.00032947487295313383, + "loss": 0.4329, "step": 9060 }, { "epoch": 1.7071334462638812, - "grad_norm": 3.945563316345215, - "learning_rate": 1.9757199322416713e-05, - "loss": 0.9052, + "grad_norm": 1.7750840187072754, + "learning_rate": 0.0003292866553736119, + "loss": 0.8113, "step": 9070 }, { "epoch": 1.7090156220591002, - "grad_norm": 14.851612091064453, - "learning_rate": 1.97459062676454e-05, - "loss": 0.766, + "grad_norm": 4.334325313568115, + "learning_rate": 0.00032909843779409, + "loss": 0.6708, "step": 9080 }, { "epoch": 1.7108977978543196, - "grad_norm": 20.84731101989746, - "learning_rate": 1.9734613212874085e-05, - "loss": 0.7886, + "grad_norm": 4.547921657562256, + "learning_rate": 0.00032891022021456806, + "loss": 0.6698, "step": 9090 }, { "epoch": 1.7127799736495388, - "grad_norm": 14.792206764221191, - "learning_rate": 1.9723320158102768e-05, - "loss": 0.8115, + "grad_norm": 5.556723117828369, + "learning_rate": 0.00032872200263504614, + "loss": 0.6937, "step": 9100 }, { "epoch": 1.714662149444758, - "grad_norm": 8.901041030883789, - "learning_rate": 1.971202710333145e-05, - "loss": 0.9397, + "grad_norm": 3.2558112144470215, + "learning_rate": 0.00032853378505552416, + "loss": 0.7803, "step": 9110 }, { "epoch": 1.7165443252399775, - "grad_norm": 8.225708961486816, - "learning_rate": 1.9700734048560134e-05, - "loss": 1.1384, + "grad_norm": 3.6377789974212646, + "learning_rate": 0.00032834556747600224, + "loss": 0.8596, "step": 9120 }, { "epoch": 1.7184265010351967, - "grad_norm": 5.585742473602295, - "learning_rate": 1.968944099378882e-05, - "loss": 0.794, + "grad_norm": 2.1207480430603027, + "learning_rate": 0.0003281573498964803, + "loss": 0.6312, "step": 9130 }, { "epoch": 1.7203086768304159, - "grad_norm": 4.999485969543457, - "learning_rate": 1.9678147939017506e-05, - "loss": 0.6178, + "grad_norm": 2.050334930419922, + "learning_rate": 0.0003279691323169584, + "loss": 0.6679, "step": 9140 }, { "epoch": 1.7221908526256353, - "grad_norm": 15.160903930664062, - "learning_rate": 1.966685488424619e-05, - "loss": 1.1031, + "grad_norm": 2.546337604522705, + "learning_rate": 0.00032778091473743647, + "loss": 0.9513, "step": 9150 }, { "epoch": 1.7240730284208545, - "grad_norm": 27.700048446655273, - "learning_rate": 1.9655561829474874e-05, - "loss": 0.9744, + "grad_norm": 3.854846715927124, + "learning_rate": 0.00032759269715791454, + "loss": 0.7858, "step": 9160 }, { "epoch": 1.7259552042160737, - "grad_norm": 15.794424057006836, - "learning_rate": 1.9644268774703557e-05, - "loss": 0.9782, + "grad_norm": 5.680496692657471, + "learning_rate": 0.0003274044795783926, + "loss": 1.0537, "step": 9170 }, { "epoch": 1.7278373800112932, - "grad_norm": 13.252302169799805, - "learning_rate": 1.9632975719932243e-05, - "loss": 0.9227, + "grad_norm": 2.7812232971191406, + "learning_rate": 0.0003272162619988707, + "loss": 0.8572, "step": 9180 }, { "epoch": 1.7297195558065124, - "grad_norm": 19.900714874267578, - "learning_rate": 1.9621682665160926e-05, - "loss": 0.7203, + "grad_norm": 2.869856357574463, + "learning_rate": 0.0003270280444193488, + "loss": 0.7442, "step": 9190 }, { "epoch": 1.7316017316017316, - "grad_norm": 8.167475700378418, - "learning_rate": 1.961038961038961e-05, - "loss": 0.9278, + "grad_norm": 1.521945595741272, + "learning_rate": 0.00032683982683982685, + "loss": 0.8358, "step": 9200 }, { "epoch": 1.733483907396951, - "grad_norm": 18.376384735107422, - "learning_rate": 1.9599096555618295e-05, - "loss": 1.005, + "grad_norm": 2.468168258666992, + "learning_rate": 0.0003266516092603049, + "loss": 0.9282, "step": 9210 }, { "epoch": 1.73536608319217, - "grad_norm": 12.970340728759766, - "learning_rate": 1.958780350084698e-05, - "loss": 0.7519, + "grad_norm": 1.156663417816162, + "learning_rate": 0.000326463391680783, + "loss": 0.6668, "step": 9220 }, { "epoch": 1.7372482589873894, - "grad_norm": 10.988688468933105, - "learning_rate": 1.9576510446075663e-05, - "loss": 0.8544, + "grad_norm": 2.1691527366638184, + "learning_rate": 0.0003262751741012611, + "loss": 0.8898, "step": 9230 }, { "epoch": 1.7391304347826086, - "grad_norm": 7.907069206237793, - "learning_rate": 1.956521739130435e-05, - "loss": 1.2205, + "grad_norm": 2.910538911819458, + "learning_rate": 0.00032608695652173916, + "loss": 0.8819, "step": 9240 }, { "epoch": 1.7410126105778279, - "grad_norm": 24.321718215942383, - "learning_rate": 1.9553924336533032e-05, - "loss": 0.7836, + "grad_norm": 5.341407299041748, + "learning_rate": 0.00032589873894221723, + "loss": 0.7117, "step": 9250 }, { "epoch": 1.7428947863730473, - "grad_norm": 11.95626449584961, - "learning_rate": 1.9542631281761718e-05, - "loss": 0.7509, + "grad_norm": 2.8497936725616455, + "learning_rate": 0.0003257105213626953, + "loss": 0.5795, "step": 9260 }, { "epoch": 1.7447769621682665, - "grad_norm": 1.1062142848968506, - "learning_rate": 1.9531338226990404e-05, - "loss": 0.5899, + "grad_norm": 0.632068932056427, + "learning_rate": 0.0003255223037831734, + "loss": 0.5179, "step": 9270 }, { "epoch": 1.7466591379634857, - "grad_norm": 12.256535530090332, - "learning_rate": 1.9520045172219084e-05, - "loss": 0.7338, + "grad_norm": 8.0675630569458, + "learning_rate": 0.0003253340862036514, + "loss": 0.5875, "step": 9280 }, { "epoch": 1.7485413137587051, - "grad_norm": 28.20414924621582, - "learning_rate": 1.950875211744777e-05, - "loss": 0.848, + "grad_norm": 7.597952365875244, + "learning_rate": 0.0003251458686241295, + "loss": 0.7736, "step": 9290 }, { "epoch": 1.7504234895539243, - "grad_norm": 10.83683967590332, - "learning_rate": 1.9497459062676452e-05, - "loss": 1.0118, + "grad_norm": 4.242430210113525, + "learning_rate": 0.00032495765104460756, + "loss": 0.9301, "step": 9300 }, { "epoch": 1.7523056653491436, - "grad_norm": 12.281524658203125, - "learning_rate": 1.948616600790514e-05, - "loss": 0.7269, + "grad_norm": 0.9840834736824036, + "learning_rate": 0.00032476943346508564, + "loss": 0.5589, "step": 9310 }, { "epoch": 1.754187841144363, - "grad_norm": 3.447277545928955, - "learning_rate": 1.9474872953133825e-05, - "loss": 0.6222, + "grad_norm": 2.141883134841919, + "learning_rate": 0.0003245812158855637, + "loss": 0.7073, "step": 9320 }, { "epoch": 1.7560700169395822, - "grad_norm": 21.582672119140625, - "learning_rate": 1.9463579898362507e-05, - "loss": 0.9156, + "grad_norm": 3.3031342029571533, + "learning_rate": 0.0003243929983060418, + "loss": 0.7915, "step": 9330 }, { "epoch": 1.7579521927348014, - "grad_norm": 8.327972412109375, - "learning_rate": 1.9452286843591193e-05, - "loss": 0.7264, + "grad_norm": 3.9373908042907715, + "learning_rate": 0.00032420478072651987, + "loss": 0.5134, "step": 9340 }, { "epoch": 1.7598343685300208, - "grad_norm": 9.982987403869629, - "learning_rate": 1.9440993788819876e-05, - "loss": 0.5234, + "grad_norm": 3.2525720596313477, + "learning_rate": 0.00032401656314699794, + "loss": 0.4895, "step": 9350 }, { "epoch": 1.7617165443252398, - "grad_norm": 18.895675659179688, - "learning_rate": 1.9429700734048562e-05, - "loss": 0.6, + "grad_norm": 3.599588394165039, + "learning_rate": 0.000323828345567476, + "loss": 0.5655, "step": 9360 }, { "epoch": 1.7635987201204593, - "grad_norm": 3.999253749847412, - "learning_rate": 1.9418407679277245e-05, - "loss": 0.8148, + "grad_norm": 1.8969835042953491, + "learning_rate": 0.00032364012798795404, + "loss": 0.6763, "step": 9370 }, { "epoch": 1.7654808959156785, - "grad_norm": 7.097409725189209, - "learning_rate": 1.9407114624505927e-05, - "loss": 0.6838, + "grad_norm": 4.947686195373535, + "learning_rate": 0.0003234519104084321, + "loss": 0.6181, "step": 9380 }, { "epoch": 1.7673630717108977, - "grad_norm": 10.425472259521484, - "learning_rate": 1.9395821569734614e-05, - "loss": 0.7291, + "grad_norm": 1.9779622554779053, + "learning_rate": 0.0003232636928289102, + "loss": 0.7254, "step": 9390 }, { "epoch": 1.7692452475061171, - "grad_norm": 15.1722993850708, - "learning_rate": 1.9384528514963296e-05, - "loss": 0.7761, + "grad_norm": 2.9042108058929443, + "learning_rate": 0.00032307547524938827, + "loss": 0.6338, "step": 9400 }, { "epoch": 1.7711274233013363, - "grad_norm": 11.412665367126465, - "learning_rate": 1.9373235460191982e-05, - "loss": 0.904, + "grad_norm": 3.2115519046783447, + "learning_rate": 0.0003228872576698664, + "loss": 0.8777, "step": 9410 }, { "epoch": 1.7730095990965555, - "grad_norm": 19.17182159423828, - "learning_rate": 1.936194240542067e-05, - "loss": 0.9217, + "grad_norm": 3.146235466003418, + "learning_rate": 0.0003226990400903445, + "loss": 0.7832, "step": 9420 }, { "epoch": 1.774891774891775, - "grad_norm": 4.5345845222473145, - "learning_rate": 1.935064935064935e-05, - "loss": 0.6485, + "grad_norm": 2.5843141078948975, + "learning_rate": 0.00032251082251082255, + "loss": 0.4189, "step": 9430 }, { "epoch": 1.7767739506869942, - "grad_norm": 24.256860733032227, - "learning_rate": 1.9339356295878037e-05, - "loss": 1.0271, + "grad_norm": 5.2682271003723145, + "learning_rate": 0.00032232260493130063, + "loss": 0.9093, "step": 9440 }, { "epoch": 1.7786561264822134, - "grad_norm": 13.506421089172363, - "learning_rate": 1.932806324110672e-05, - "loss": 0.9675, + "grad_norm": 2.954876184463501, + "learning_rate": 0.0003221343873517787, + "loss": 0.7608, "step": 9450 }, { "epoch": 1.7805383022774328, - "grad_norm": 20.032943725585938, - "learning_rate": 1.9316770186335403e-05, - "loss": 0.8068, + "grad_norm": 1.4374929666519165, + "learning_rate": 0.00032194616977225673, + "loss": 0.629, "step": 9460 }, { "epoch": 1.782420478072652, - "grad_norm": 30.704687118530273, - "learning_rate": 1.930547713156409e-05, - "loss": 0.4575, + "grad_norm": 0.3903244137763977, + "learning_rate": 0.0003217579521927348, + "loss": 0.4012, "step": 9470 }, { "epoch": 1.7843026538678712, - "grad_norm": 29.728713989257812, - "learning_rate": 1.929418407679277e-05, - "loss": 0.9168, + "grad_norm": 2.7502241134643555, + "learning_rate": 0.0003215697346132129, + "loss": 0.7284, "step": 9480 }, { "epoch": 1.7861848296630907, - "grad_norm": 21.02536964416504, - "learning_rate": 1.9282891022021457e-05, - "loss": 1.0197, + "grad_norm": 4.083826065063477, + "learning_rate": 0.00032138151703369096, + "loss": 0.9896, "step": 9490 }, { "epoch": 1.7880670054583097, - "grad_norm": 0.558627724647522, - "learning_rate": 1.9271597967250143e-05, - "loss": 0.7217, + "grad_norm": 0.2609062194824219, + "learning_rate": 0.00032119329945416903, + "loss": 0.6251, "step": 9500 }, { "epoch": 1.789949181253529, - "grad_norm": 18.84478759765625, - "learning_rate": 1.9260304912478826e-05, - "loss": 0.7922, + "grad_norm": 3.0656192302703857, + "learning_rate": 0.0003210050818746471, + "loss": 0.5306, "step": 9510 }, { "epoch": 1.7918313570487485, - "grad_norm": 17.696800231933594, - "learning_rate": 1.9249011857707512e-05, - "loss": 1.0852, + "grad_norm": 5.871590614318848, + "learning_rate": 0.0003208168642951252, + "loss": 0.9829, "step": 9520 }, { "epoch": 1.7937135328439675, - "grad_norm": 20.650711059570312, - "learning_rate": 1.9237718802936195e-05, - "loss": 0.7487, + "grad_norm": 2.4569530487060547, + "learning_rate": 0.00032062864671560326, + "loss": 0.7821, "step": 9530 }, { "epoch": 1.795595708639187, - "grad_norm": 11.20978832244873, - "learning_rate": 1.9226425748164878e-05, - "loss": 1.1077, + "grad_norm": 2.2387983798980713, + "learning_rate": 0.0003204404291360813, + "loss": 0.8974, "step": 9540 }, { "epoch": 1.7974778844344061, - "grad_norm": 9.016942977905273, - "learning_rate": 1.9215132693393564e-05, - "loss": 0.7859, + "grad_norm": 1.3203648328781128, + "learning_rate": 0.00032025221155655936, + "loss": 0.7528, "step": 9550 }, { "epoch": 1.7993600602296254, - "grad_norm": 18.516727447509766, - "learning_rate": 1.9203839638622246e-05, - "loss": 0.6019, + "grad_norm": 3.0121521949768066, + "learning_rate": 0.00032006399397703744, + "loss": 0.6661, "step": 9560 }, { "epoch": 1.8012422360248448, - "grad_norm": 14.529791831970215, - "learning_rate": 1.9192546583850932e-05, - "loss": 0.8048, + "grad_norm": 3.89560866355896, + "learning_rate": 0.0003198757763975155, + "loss": 0.6831, "step": 9570 }, { "epoch": 1.803124411820064, - "grad_norm": 13.547689437866211, - "learning_rate": 1.9181253529079615e-05, - "loss": 1.0294, + "grad_norm": 2.6359190940856934, + "learning_rate": 0.0003196875588179936, + "loss": 0.9127, "step": 9580 }, { "epoch": 1.8050065876152832, - "grad_norm": 11.232539176940918, - "learning_rate": 1.91699604743083e-05, - "loss": 0.798, + "grad_norm": 3.913572072982788, + "learning_rate": 0.00031949934123847167, + "loss": 0.7298, "step": 9590 }, { "epoch": 1.8068887634105026, - "grad_norm": 13.629487037658691, - "learning_rate": 1.9158667419536987e-05, - "loss": 0.6611, + "grad_norm": 2.637514591217041, + "learning_rate": 0.00031931112365894974, + "loss": 0.6781, "step": 9600 }, { "epoch": 1.8087709392057219, - "grad_norm": 9.937800407409668, - "learning_rate": 1.914737436476567e-05, - "loss": 0.5109, + "grad_norm": 3.258054494857788, + "learning_rate": 0.0003191229060794278, + "loss": 0.4907, "step": 9610 }, { "epoch": 1.810653115000941, - "grad_norm": 14.390426635742188, - "learning_rate": 1.9136081309994356e-05, - "loss": 0.8687, + "grad_norm": 2.188828468322754, + "learning_rate": 0.0003189346884999059, + "loss": 0.9083, "step": 9620 }, { "epoch": 1.8125352907961605, - "grad_norm": 21.730375289916992, - "learning_rate": 1.9124788255223035e-05, - "loss": 1.0062, + "grad_norm": 6.632510185241699, + "learning_rate": 0.0003187464709203839, + "loss": 0.834, "step": 9630 }, { "epoch": 1.8144174665913795, - "grad_norm": 17.650148391723633, - "learning_rate": 1.911349520045172e-05, - "loss": 0.7727, + "grad_norm": 1.7166807651519775, + "learning_rate": 0.00031855825334086205, + "loss": 0.5749, "step": 9640 }, { "epoch": 1.816299642386599, - "grad_norm": 13.232230186462402, - "learning_rate": 1.9102202145680407e-05, - "loss": 0.987, + "grad_norm": 3.0382962226867676, + "learning_rate": 0.0003183700357613401, + "loss": 0.8518, "step": 9650 }, { "epoch": 1.8181818181818183, - "grad_norm": 21.150760650634766, - "learning_rate": 1.909090909090909e-05, - "loss": 1.2057, + "grad_norm": 2.7788681983947754, + "learning_rate": 0.0003181818181818182, + "loss": 0.9275, "step": 9660 }, { "epoch": 1.8200639939770373, - "grad_norm": 15.0765962600708, - "learning_rate": 1.9079616036137776e-05, - "loss": 0.9163, + "grad_norm": 2.9813215732574463, + "learning_rate": 0.0003179936006022963, + "loss": 0.9127, "step": 9670 }, { "epoch": 1.8219461697722568, - "grad_norm": 16.338342666625977, - "learning_rate": 1.906832298136646e-05, - "loss": 0.6045, + "grad_norm": 4.174839496612549, + "learning_rate": 0.00031780538302277436, + "loss": 0.5552, "step": 9680 }, { "epoch": 1.823828345567476, - "grad_norm": 28.87519073486328, - "learning_rate": 1.9057029926595145e-05, - "loss": 1.0076, + "grad_norm": 4.95925235748291, + "learning_rate": 0.00031761716544325243, + "loss": 0.8057, "step": 9690 }, { "epoch": 1.8257105213626952, - "grad_norm": 20.42050552368164, - "learning_rate": 1.904573687182383e-05, - "loss": 1.0798, + "grad_norm": 5.213214874267578, + "learning_rate": 0.0003174289478637305, + "loss": 0.8951, "step": 9700 }, { "epoch": 1.8275926971579146, - "grad_norm": 14.828946113586426, - "learning_rate": 1.9034443817052514e-05, - "loss": 1.1065, + "grad_norm": 4.759930610656738, + "learning_rate": 0.0003172407302842086, + "loss": 0.9093, "step": 9710 }, { "epoch": 1.8294748729531338, - "grad_norm": 17.2311954498291, - "learning_rate": 1.9023150762281196e-05, - "loss": 0.8212, + "grad_norm": 2.821655035018921, + "learning_rate": 0.0003170525127046866, + "loss": 0.8012, "step": 9720 }, { "epoch": 1.831357048748353, - "grad_norm": 17.343181610107422, - "learning_rate": 1.9011857707509883e-05, - "loss": 1.0861, + "grad_norm": 0.5296029448509216, + "learning_rate": 0.0003168642951251647, + "loss": 0.9112, "step": 9730 }, { "epoch": 1.8332392245435725, - "grad_norm": 15.251105308532715, - "learning_rate": 1.9000564652738565e-05, - "loss": 0.8787, + "grad_norm": 4.0418701171875, + "learning_rate": 0.00031667607754564276, + "loss": 1.006, "step": 9740 }, { "epoch": 1.8351214003387917, - "grad_norm": 7.921949863433838, - "learning_rate": 1.898927159796725e-05, - "loss": 1.5269, + "grad_norm": 2.679586410522461, + "learning_rate": 0.00031648785996612084, + "loss": 1.1059, "step": 9750 }, { "epoch": 1.8370035761340109, - "grad_norm": 11.668200492858887, - "learning_rate": 1.8977978543195934e-05, - "loss": 0.7354, + "grad_norm": 1.4031389951705933, + "learning_rate": 0.0003162996423865989, + "loss": 0.5594, "step": 9760 }, { "epoch": 1.8388857519292303, - "grad_norm": 27.10658073425293, - "learning_rate": 1.896668548842462e-05, - "loss": 0.8172, + "grad_norm": 2.9446029663085938, + "learning_rate": 0.000316111424807077, + "loss": 0.7634, "step": 9770 }, { "epoch": 1.8407679277244493, - "grad_norm": 16.760330200195312, - "learning_rate": 1.8955392433653306e-05, - "loss": 0.8198, + "grad_norm": 2.208606243133545, + "learning_rate": 0.00031592320722755507, + "loss": 0.7794, "step": 9780 }, { "epoch": 1.8426501035196687, - "grad_norm": 16.54085350036621, - "learning_rate": 1.894409937888199e-05, - "loss": 0.8366, + "grad_norm": 4.726104259490967, + "learning_rate": 0.00031573498964803314, + "loss": 0.7705, "step": 9790 }, { "epoch": 1.8445322793148882, - "grad_norm": 12.325793266296387, - "learning_rate": 1.893280632411067e-05, - "loss": 0.7522, + "grad_norm": 4.122304916381836, + "learning_rate": 0.00031554677206851116, + "loss": 0.7017, "step": 9800 }, { "epoch": 1.8464144551101072, - "grad_norm": 21.898651123046875, - "learning_rate": 1.8921513269339354e-05, - "loss": 0.7455, + "grad_norm": 5.453103542327881, + "learning_rate": 0.00031535855448898924, + "loss": 0.7014, "step": 9810 }, { "epoch": 1.8482966309053266, - "grad_norm": 8.566146850585938, - "learning_rate": 1.891022021456804e-05, - "loss": 0.7789, + "grad_norm": 2.251783609390259, + "learning_rate": 0.0003151703369094673, + "loss": 0.7531, "step": 9820 }, { "epoch": 1.8501788067005458, - "grad_norm": 13.785645484924316, - "learning_rate": 1.8898927159796726e-05, - "loss": 0.7233, + "grad_norm": 3.082606792449951, + "learning_rate": 0.0003149821193299454, + "loss": 0.6417, "step": 9830 }, { "epoch": 1.852060982495765, - "grad_norm": 15.743664741516113, - "learning_rate": 1.888763410502541e-05, - "loss": 0.9657, + "grad_norm": 3.380202293395996, + "learning_rate": 0.00031479390175042347, + "loss": 0.9505, "step": 9840 }, { "epoch": 1.8539431582909844, - "grad_norm": 32.00886535644531, - "learning_rate": 1.8876341050254095e-05, - "loss": 0.7379, + "grad_norm": 5.193691253662109, + "learning_rate": 0.00031460568417090155, + "loss": 0.763, "step": 9850 }, { "epoch": 1.8558253340862036, - "grad_norm": 7.174389839172363, - "learning_rate": 1.8865047995482778e-05, - "loss": 0.8011, + "grad_norm": 2.495882511138916, + "learning_rate": 0.0003144174665913797, + "loss": 0.7655, "step": 9860 }, { "epoch": 1.8577075098814229, - "grad_norm": 20.806581497192383, - "learning_rate": 1.8853754940711464e-05, - "loss": 0.8431, + "grad_norm": 2.836336374282837, + "learning_rate": 0.00031422924901185775, + "loss": 0.8337, "step": 9870 }, { "epoch": 1.8595896856766423, - "grad_norm": 21.621334075927734, - "learning_rate": 1.884246188594015e-05, - "loss": 0.9357, + "grad_norm": 1.8456288576126099, + "learning_rate": 0.00031404103143233583, + "loss": 0.8305, "step": 9880 }, { "epoch": 1.8614718614718615, - "grad_norm": 4.900033473968506, - "learning_rate": 1.883116883116883e-05, - "loss": 1.0361, + "grad_norm": 1.920587420463562, + "learning_rate": 0.00031385281385281385, + "loss": 0.8039, "step": 9890 }, { "epoch": 1.8633540372670807, - "grad_norm": 19.732091903686523, - "learning_rate": 1.8819875776397515e-05, - "loss": 0.6999, + "grad_norm": 5.574342727661133, + "learning_rate": 0.00031366459627329193, + "loss": 0.6394, "step": 9900 }, { "epoch": 1.8652362130623001, - "grad_norm": 41.449031829833984, - "learning_rate": 1.8808582721626198e-05, - "loss": 0.9272, + "grad_norm": 1.785643458366394, + "learning_rate": 0.00031347637869377, + "loss": 1.0156, "step": 9910 }, { "epoch": 1.8671183888575191, - "grad_norm": 21.400299072265625, - "learning_rate": 1.8797289666854884e-05, - "loss": 0.9431, + "grad_norm": 3.011502742767334, + "learning_rate": 0.0003132881611142481, + "loss": 0.9452, "step": 9920 }, { "epoch": 1.8690005646527386, - "grad_norm": 17.52521514892578, - "learning_rate": 1.878599661208357e-05, - "loss": 0.6384, + "grad_norm": 3.422855854034424, + "learning_rate": 0.00031309994353472616, + "loss": 0.4303, "step": 9930 }, { "epoch": 1.870882740447958, - "grad_norm": 12.966445922851562, - "learning_rate": 1.8774703557312253e-05, - "loss": 0.899, + "grad_norm": 4.045591354370117, + "learning_rate": 0.00031291172595520423, + "loss": 0.9059, "step": 9940 }, { "epoch": 1.872764916243177, - "grad_norm": 23.314476013183594, - "learning_rate": 1.876341050254094e-05, - "loss": 0.7705, + "grad_norm": 4.1779937744140625, + "learning_rate": 0.0003127235083756823, + "loss": 0.7193, "step": 9950 }, { "epoch": 1.8746470920383964, - "grad_norm": 14.737226486206055, - "learning_rate": 1.8752117447769625e-05, - "loss": 0.7806, + "grad_norm": 5.253387451171875, + "learning_rate": 0.0003125352907961604, + "loss": 0.7338, "step": 9960 }, { "epoch": 1.8765292678336156, - "grad_norm": 9.335084915161133, - "learning_rate": 1.8740824392998308e-05, - "loss": 0.8484, + "grad_norm": 1.7977385520935059, + "learning_rate": 0.00031234707321663846, + "loss": 0.7472, "step": 9970 }, { "epoch": 1.8784114436288348, - "grad_norm": 5.914954662322998, - "learning_rate": 1.872953133822699e-05, - "loss": 0.9675, + "grad_norm": 0.8867546319961548, + "learning_rate": 0.0003121588556371165, + "loss": 0.8371, "step": 9980 }, { "epoch": 1.8802936194240543, - "grad_norm": 13.312799453735352, - "learning_rate": 1.8718238283455673e-05, - "loss": 0.7423, + "grad_norm": 1.7153702974319458, + "learning_rate": 0.00031197063805759456, + "loss": 0.6443, "step": 9990 }, { "epoch": 1.8821757952192735, - "grad_norm": 10.06735610961914, - "learning_rate": 1.870694522868436e-05, - "loss": 1.0082, + "grad_norm": 0.5608319044113159, + "learning_rate": 0.00031178242047807264, + "loss": 0.7286, "step": 10000 }, { "epoch": 1.8840579710144927, - "grad_norm": 13.502887725830078, - "learning_rate": 1.8695652173913045e-05, - "loss": 0.7395, + "grad_norm": 1.7437026500701904, + "learning_rate": 0.0003115942028985507, + "loss": 0.6849, "step": 10010 }, { "epoch": 1.8859401468097121, - "grad_norm": 19.229324340820312, - "learning_rate": 1.8684359119141728e-05, - "loss": 1.0014, + "grad_norm": 3.591338872909546, + "learning_rate": 0.0003114059853190288, + "loss": 0.8433, "step": 10020 }, { "epoch": 1.8878223226049313, - "grad_norm": 15.201828002929688, - "learning_rate": 1.8673066064370414e-05, - "loss": 0.8707, + "grad_norm": 3.8551323413848877, + "learning_rate": 0.00031121776773950687, + "loss": 0.8481, "step": 10030 }, { "epoch": 1.8897044984001505, - "grad_norm": 28.766010284423828, - "learning_rate": 1.8661773009599097e-05, - "loss": 0.7835, + "grad_norm": 2.6424741744995117, + "learning_rate": 0.00031102955015998494, + "loss": 0.6314, "step": 10040 }, { "epoch": 1.89158667419537, - "grad_norm": 2.099621295928955, - "learning_rate": 1.8650479954827783e-05, - "loss": 0.8257, + "grad_norm": 2.6313867568969727, + "learning_rate": 0.000310841332580463, + "loss": 0.7825, "step": 10050 }, { "epoch": 1.8934688499905892, - "grad_norm": 11.689505577087402, - "learning_rate": 1.8639186900056465e-05, - "loss": 0.8311, + "grad_norm": 1.8866409063339233, + "learning_rate": 0.00031065311500094104, + "loss": 0.6601, "step": 10060 }, { "epoch": 1.8953510257858084, - "grad_norm": 1.5185855627059937, - "learning_rate": 1.8627893845285148e-05, - "loss": 0.7473, + "grad_norm": 0.6496089100837708, + "learning_rate": 0.0003104648974214191, + "loss": 0.7797, "step": 10070 }, { "epoch": 1.8972332015810278, - "grad_norm": 16.93794822692871, - "learning_rate": 1.8616600790513834e-05, - "loss": 0.8576, + "grad_norm": 3.582031726837158, + "learning_rate": 0.00031027667984189725, + "loss": 0.8081, "step": 10080 }, { "epoch": 1.8991153773762468, - "grad_norm": 12.918668746948242, - "learning_rate": 1.8605307735742517e-05, - "loss": 0.6406, + "grad_norm": 3.5374810695648193, + "learning_rate": 0.0003100884622623753, + "loss": 0.6228, "step": 10090 }, { "epoch": 1.9009975531714662, - "grad_norm": 38.772605895996094, - "learning_rate": 1.8594014680971203e-05, - "loss": 1.0629, + "grad_norm": 2.6551787853240967, + "learning_rate": 0.0003099002446828534, + "loss": 0.9884, "step": 10100 }, { "epoch": 1.9028797289666854, - "grad_norm": 8.743882179260254, - "learning_rate": 1.858272162619989e-05, - "loss": 0.8832, + "grad_norm": 2.751276969909668, + "learning_rate": 0.0003097120271033315, + "loss": 0.6363, "step": 10110 }, { "epoch": 1.9047619047619047, - "grad_norm": 18.133054733276367, - "learning_rate": 1.8571428571428572e-05, - "loss": 0.5021, + "grad_norm": 0.6503388285636902, + "learning_rate": 0.00030952380952380956, + "loss": 0.5765, "step": 10120 }, { "epoch": 1.906644080557124, - "grad_norm": 18.592235565185547, - "learning_rate": 1.8560135516657258e-05, - "loss": 0.6759, + "grad_norm": 1.9065133333206177, + "learning_rate": 0.00030933559194428763, + "loss": 0.6963, "step": 10130 }, { "epoch": 1.9085262563523433, - "grad_norm": 18.375886917114258, - "learning_rate": 1.854884246188594e-05, - "loss": 0.5934, + "grad_norm": 2.041531801223755, + "learning_rate": 0.0003091473743647657, + "loss": 0.4221, "step": 10140 }, { "epoch": 1.9104084321475625, - "grad_norm": 9.453605651855469, - "learning_rate": 1.8537549407114623e-05, - "loss": 0.8925, + "grad_norm": 1.8870247602462769, + "learning_rate": 0.00030895915678524373, + "loss": 0.7617, "step": 10150 }, { "epoch": 1.912290607942782, - "grad_norm": 20.740285873413086, - "learning_rate": 1.852625635234331e-05, - "loss": 0.931, + "grad_norm": 3.5730133056640625, + "learning_rate": 0.0003087709392057218, + "loss": 0.9216, "step": 10160 }, { "epoch": 1.9141727837380011, - "grad_norm": 21.86232566833496, - "learning_rate": 1.8514963297571992e-05, - "loss": 1.0436, + "grad_norm": 3.7722654342651367, + "learning_rate": 0.0003085827216261999, + "loss": 0.9224, "step": 10170 }, { "epoch": 1.9160549595332204, - "grad_norm": 31.541776657104492, - "learning_rate": 1.8503670242800678e-05, - "loss": 0.8187, + "grad_norm": 3.3979384899139404, + "learning_rate": 0.00030839450404667796, + "loss": 0.894, "step": 10180 }, { "epoch": 1.9179371353284398, - "grad_norm": 3.2206337451934814, - "learning_rate": 1.849237718802936e-05, - "loss": 0.6882, + "grad_norm": 1.5552890300750732, + "learning_rate": 0.00030820628646715604, + "loss": 0.6415, "step": 10190 }, { "epoch": 1.919819311123659, - "grad_norm": 1.3856621980667114, - "learning_rate": 1.8481084133258047e-05, - "loss": 0.6564, + "grad_norm": 0.8526018857955933, + "learning_rate": 0.0003080180688876341, + "loss": 0.5772, "step": 10200 }, { "epoch": 1.9217014869188782, - "grad_norm": 45.16568374633789, - "learning_rate": 1.8469791078486733e-05, - "loss": 0.7004, + "grad_norm": 2.919462203979492, + "learning_rate": 0.0003078298513081122, + "loss": 0.5405, "step": 10210 }, { "epoch": 1.9235836627140976, - "grad_norm": 12.040491104125977, - "learning_rate": 1.8458498023715416e-05, - "loss": 0.8935, + "grad_norm": 2.8013722896575928, + "learning_rate": 0.00030764163372859027, + "loss": 1.0859, "step": 10220 }, { "epoch": 1.9254658385093166, - "grad_norm": 1.0651041269302368, - "learning_rate": 1.84472049689441e-05, - "loss": 0.9316, + "grad_norm": 0.8399747610092163, + "learning_rate": 0.00030745341614906834, + "loss": 0.9671, "step": 10230 }, { "epoch": 1.927348014304536, - "grad_norm": 24.65813636779785, - "learning_rate": 1.8435911914172784e-05, - "loss": 1.1372, + "grad_norm": 3.0836198329925537, + "learning_rate": 0.00030726519856954636, + "loss": 1.083, "step": 10240 }, { "epoch": 1.9292301900997553, - "grad_norm": 11.479782104492188, - "learning_rate": 1.8424618859401467e-05, - "loss": 1.0353, + "grad_norm": 5.9964776039123535, + "learning_rate": 0.00030707698099002444, + "loss": 1.0115, "step": 10250 }, { "epoch": 1.9311123658949745, - "grad_norm": 12.16601848602295, - "learning_rate": 1.8413325804630153e-05, - "loss": 0.7897, + "grad_norm": 0.20760832726955414, + "learning_rate": 0.0003068887634105025, + "loss": 0.6393, "step": 10260 }, { "epoch": 1.932994541690194, - "grad_norm": 17.336091995239258, - "learning_rate": 1.8402032749858836e-05, - "loss": 0.8251, + "grad_norm": 4.026695251464844, + "learning_rate": 0.0003067005458309806, + "loss": 0.673, "step": 10270 }, { "epoch": 1.9348767174854131, - "grad_norm": 9.198282241821289, - "learning_rate": 1.8390739695087522e-05, - "loss": 0.7357, + "grad_norm": 2.453230142593384, + "learning_rate": 0.00030651232825145867, + "loss": 0.8183, "step": 10280 }, { "epoch": 1.9367588932806323, - "grad_norm": 12.248185157775879, - "learning_rate": 1.8379446640316208e-05, - "loss": 0.7766, + "grad_norm": 0.2803904116153717, + "learning_rate": 0.00030632411067193675, + "loss": 0.716, "step": 10290 }, { "epoch": 1.9386410690758518, - "grad_norm": 0.7270705103874207, - "learning_rate": 1.836815358554489e-05, - "loss": 0.5565, + "grad_norm": 0.20611296594142914, + "learning_rate": 0.0003061358930924149, + "loss": 0.5156, "step": 10300 }, { "epoch": 1.940523244871071, - "grad_norm": 18.369525909423828, - "learning_rate": 1.8356860530773577e-05, - "loss": 1.054, + "grad_norm": 7.233029365539551, + "learning_rate": 0.00030594767551289295, + "loss": 0.9044, "step": 10310 }, { "epoch": 1.9424054206662902, - "grad_norm": 3.5889933109283447, - "learning_rate": 1.834556747600226e-05, - "loss": 1.0666, + "grad_norm": 0.5617844462394714, + "learning_rate": 0.00030575945793337103, + "loss": 1.0455, "step": 10320 }, { "epoch": 1.9442875964615096, - "grad_norm": 16.404603958129883, - "learning_rate": 1.8334274421230942e-05, - "loss": 0.5469, + "grad_norm": 2.7063069343566895, + "learning_rate": 0.00030557124035384905, + "loss": 0.4773, "step": 10330 }, { "epoch": 1.9461697722567288, - "grad_norm": 5.033247470855713, - "learning_rate": 1.8322981366459628e-05, - "loss": 0.7387, + "grad_norm": 4.051327228546143, + "learning_rate": 0.00030538302277432713, + "loss": 0.665, "step": 10340 }, { "epoch": 1.948051948051948, - "grad_norm": 20.548690795898438, - "learning_rate": 1.831168831168831e-05, - "loss": 0.6346, + "grad_norm": 3.9234256744384766, + "learning_rate": 0.0003051948051948052, + "loss": 0.5562, "step": 10350 }, { "epoch": 1.9499341238471675, - "grad_norm": 19.923683166503906, - "learning_rate": 1.8300395256916997e-05, - "loss": 0.6418, + "grad_norm": 2.521334648132324, + "learning_rate": 0.0003050065876152833, + "loss": 0.6868, "step": 10360 }, { "epoch": 1.9518162996423865, - "grad_norm": 14.779675483703613, - "learning_rate": 1.828910220214568e-05, - "loss": 0.6338, + "grad_norm": 2.2171504497528076, + "learning_rate": 0.00030481837003576136, + "loss": 0.4326, "step": 10370 }, { "epoch": 1.9536984754376059, - "grad_norm": 24.59194564819336, - "learning_rate": 1.8277809147374366e-05, - "loss": 0.6815, + "grad_norm": 0.8069236278533936, + "learning_rate": 0.00030463015245623943, + "loss": 0.637, "step": 10380 }, { "epoch": 1.955580651232825, - "grad_norm": 6.548442840576172, - "learning_rate": 1.8266516092603052e-05, - "loss": 0.9073, + "grad_norm": 2.4707930088043213, + "learning_rate": 0.0003044419348767175, + "loss": 0.6729, "step": 10390 }, { "epoch": 1.9574628270280443, - "grad_norm": 25.888690948486328, - "learning_rate": 1.8255223037831734e-05, - "loss": 0.8705, + "grad_norm": 2.727604389190674, + "learning_rate": 0.0003042537172971956, + "loss": 0.7498, "step": 10400 }, { "epoch": 1.9593450028232637, - "grad_norm": 14.560491561889648, - "learning_rate": 1.8243929983060417e-05, - "loss": 1.0138, + "grad_norm": 3.5283005237579346, + "learning_rate": 0.0003040654997176736, + "loss": 0.9411, "step": 10410 }, { "epoch": 1.961227178618483, - "grad_norm": 18.18019676208496, - "learning_rate": 1.82326369282891e-05, - "loss": 0.7988, + "grad_norm": 3.2777674198150635, + "learning_rate": 0.0003038772821381517, + "loss": 0.6871, "step": 10420 }, { "epoch": 1.9631093544137022, - "grad_norm": 6.463456153869629, - "learning_rate": 1.8221343873517786e-05, - "loss": 0.6973, + "grad_norm": 2.1917569637298584, + "learning_rate": 0.00030368906455862976, + "loss": 0.603, "step": 10430 }, { "epoch": 1.9649915302089216, - "grad_norm": 6.2463226318359375, - "learning_rate": 1.8210050818746472e-05, - "loss": 0.6555, + "grad_norm": 0.9027181267738342, + "learning_rate": 0.00030350084697910784, + "loss": 0.4535, "step": 10440 }, { "epoch": 1.9668737060041408, - "grad_norm": 18.08149528503418, - "learning_rate": 1.8198757763975155e-05, - "loss": 0.6245, + "grad_norm": 3.275017261505127, + "learning_rate": 0.0003033126293995859, + "loss": 0.6109, "step": 10450 }, { "epoch": 1.96875588179936, - "grad_norm": 17.84767723083496, - "learning_rate": 1.818746470920384e-05, - "loss": 1.1495, + "grad_norm": 2.2948038578033447, + "learning_rate": 0.000303124411820064, + "loss": 0.9541, "step": 10460 }, { "epoch": 1.9706380575945794, - "grad_norm": 6.831356048583984, - "learning_rate": 1.8176171654432527e-05, - "loss": 0.8015, + "grad_norm": 2.1932127475738525, + "learning_rate": 0.00030293619424054207, + "loss": 0.7815, "step": 10470 }, { "epoch": 1.9725202333897986, - "grad_norm": 14.89473819732666, - "learning_rate": 1.816487859966121e-05, - "loss": 0.8459, + "grad_norm": 2.2697620391845703, + "learning_rate": 0.00030274797666102014, + "loss": 0.7469, "step": 10480 }, { "epoch": 1.9744024091850179, - "grad_norm": 18.0239315032959, - "learning_rate": 1.8153585544889896e-05, - "loss": 0.891, + "grad_norm": 3.1242027282714844, + "learning_rate": 0.0003025597590814982, + "loss": 0.8609, "step": 10490 }, { "epoch": 1.9762845849802373, - "grad_norm": 15.742325782775879, - "learning_rate": 1.8142292490118575e-05, - "loss": 1.2079, + "grad_norm": 2.455536365509033, + "learning_rate": 0.00030237154150197624, + "loss": 0.9871, "step": 10500 }, { "epoch": 1.9781667607754563, - "grad_norm": 6.855963230133057, - "learning_rate": 1.813099943534726e-05, - "loss": 0.7506, + "grad_norm": 1.7314091920852661, + "learning_rate": 0.0003021833239224543, + "loss": 0.7157, "step": 10510 }, { "epoch": 1.9800489365706757, - "grad_norm": 23.201900482177734, - "learning_rate": 1.8119706380575947e-05, - "loss": 0.854, + "grad_norm": 2.10880708694458, + "learning_rate": 0.00030199510634293245, + "loss": 0.7884, "step": 10520 }, { "epoch": 1.981931112365895, - "grad_norm": 11.044225692749023, - "learning_rate": 1.810841332580463e-05, - "loss": 0.9139, + "grad_norm": 3.019819974899292, + "learning_rate": 0.0003018068887634105, + "loss": 0.7561, "step": 10530 }, { "epoch": 1.9838132881611141, - "grad_norm": 10.52823257446289, - "learning_rate": 1.8097120271033316e-05, - "loss": 0.8894, + "grad_norm": 1.5337930917739868, + "learning_rate": 0.0003016186711838886, + "loss": 0.7473, "step": 10540 }, { "epoch": 1.9856954639563336, - "grad_norm": 11.858381271362305, - "learning_rate": 1.8085827216262e-05, - "loss": 0.5292, + "grad_norm": 4.329298973083496, + "learning_rate": 0.0003014304536043667, + "loss": 0.4503, "step": 10550 }, { "epoch": 1.9875776397515528, - "grad_norm": 20.706117630004883, - "learning_rate": 1.8074534161490685e-05, - "loss": 0.7082, + "grad_norm": 4.061004638671875, + "learning_rate": 0.00030124223602484476, + "loss": 0.5419, "step": 10560 }, { "epoch": 1.989459815546772, - "grad_norm": 9.068899154663086, - "learning_rate": 1.806324110671937e-05, - "loss": 0.9249, + "grad_norm": 2.1341538429260254, + "learning_rate": 0.00030105401844532283, + "loss": 0.873, "step": 10570 }, { "epoch": 1.9913419913419914, - "grad_norm": 12.875887870788574, - "learning_rate": 1.8051948051948053e-05, - "loss": 0.7443, + "grad_norm": 3.725860834121704, + "learning_rate": 0.0003008658008658009, + "loss": 0.7146, "step": 10580 }, { "epoch": 1.9932241671372106, - "grad_norm": 15.794068336486816, - "learning_rate": 1.8040654997176736e-05, - "loss": 0.7504, + "grad_norm": 4.153800010681152, + "learning_rate": 0.00030067758328627893, + "loss": 0.6242, "step": 10590 }, { "epoch": 1.9951063429324298, - "grad_norm": 9.319598197937012, - "learning_rate": 1.802936194240542e-05, - "loss": 0.7693, + "grad_norm": 2.7031161785125732, + "learning_rate": 0.000300489365706757, + "loss": 0.7066, "step": 10600 }, { "epoch": 1.9969885187276493, - "grad_norm": 7.584019660949707, - "learning_rate": 1.8018068887634105e-05, - "loss": 0.8374, + "grad_norm": 2.1315081119537354, + "learning_rate": 0.0003003011481272351, + "loss": 0.7769, "step": 10610 }, { "epoch": 1.9988706945228685, - "grad_norm": 7.562906742095947, - "learning_rate": 1.800677583286279e-05, - "loss": 0.7839, + "grad_norm": 2.260913610458374, + "learning_rate": 0.00030011293054771316, + "loss": 0.7608, "step": 10620 }, { "epoch": 2.0, - "eval_accuracy": 0.9056, - "eval_loss": 0.3731466829776764, - "eval_runtime": 275.5436, - "eval_samples_per_second": 27.219, - "eval_steps_per_second": 3.404, + "eval_accuracy": 0.9138666666666667, + "eval_loss": 0.2929131090641022, + "eval_runtime": 173.9091, + "eval_samples_per_second": 43.126, + "eval_steps_per_second": 5.394, "step": 10626 }, { "epoch": 2.0007528703180877, - "grad_norm": 26.831989288330078, - "learning_rate": 1.7995482778091474e-05, - "loss": 0.8433, + "grad_norm": 4.4684343338012695, + "learning_rate": 0.00029992471296819124, + "loss": 0.7618, "step": 10630 }, { "epoch": 2.002635046113307, - "grad_norm": 14.433839797973633, - "learning_rate": 1.798418972332016e-05, - "loss": 0.874, + "grad_norm": 2.1581475734710693, + "learning_rate": 0.0002997364953886693, + "loss": 0.7593, "step": 10640 }, { "epoch": 2.004517221908526, - "grad_norm": 6.329076290130615, - "learning_rate": 1.7972896668548842e-05, - "loss": 0.953, + "grad_norm": 3.0635669231414795, + "learning_rate": 0.0002995482778091474, + "loss": 1.0145, "step": 10650 }, { "epoch": 2.0063993977037455, - "grad_norm": 28.34707260131836, - "learning_rate": 1.796160361377753e-05, - "loss": 0.7231, + "grad_norm": 1.1936360597610474, + "learning_rate": 0.00029936006022962547, + "loss": 0.5021, "step": 10660 }, { "epoch": 2.008281573498965, - "grad_norm": 6.1005778312683105, - "learning_rate": 1.795031055900621e-05, - "loss": 1.0041, + "grad_norm": 2.743591547012329, + "learning_rate": 0.0002991718426501035, + "loss": 1.0748, "step": 10670 }, { "epoch": 2.010163749294184, - "grad_norm": 5.388144493103027, - "learning_rate": 1.7939017504234894e-05, - "loss": 0.71, + "grad_norm": 3.357374668121338, + "learning_rate": 0.00029898362507058156, + "loss": 0.7152, "step": 10680 }, { "epoch": 2.0120459250894034, - "grad_norm": 22.38304901123047, - "learning_rate": 1.792772444946358e-05, - "loss": 0.5202, + "grad_norm": 3.9789493083953857, + "learning_rate": 0.00029879540749105964, + "loss": 0.3214, "step": 10690 }, { "epoch": 2.013928100884623, - "grad_norm": 6.131063938140869, - "learning_rate": 1.7916431394692266e-05, - "loss": 0.8542, + "grad_norm": 9.24729061126709, + "learning_rate": 0.0002986071899115377, + "loss": 0.8628, "step": 10700 }, { "epoch": 2.015810276679842, - "grad_norm": 18.350811004638672, - "learning_rate": 1.790513833992095e-05, - "loss": 0.8689, + "grad_norm": 4.8567280769348145, + "learning_rate": 0.0002984189723320158, + "loss": 0.8648, "step": 10710 }, { "epoch": 2.0176924524750612, - "grad_norm": 1.005018949508667, - "learning_rate": 1.7893845285149635e-05, - "loss": 0.8021, + "grad_norm": 0.8375732898712158, + "learning_rate": 0.00029823075475249387, + "loss": 0.792, "step": 10720 }, { "epoch": 2.0195746282702802, - "grad_norm": 11.301158905029297, - "learning_rate": 1.7882552230378317e-05, - "loss": 0.7946, + "grad_norm": 2.4992613792419434, + "learning_rate": 0.00029804253717297195, + "loss": 0.7085, "step": 10730 }, { "epoch": 2.0214568040654997, - "grad_norm": 22.24584197998047, - "learning_rate": 1.7871259175607003e-05, - "loss": 0.8225, + "grad_norm": 4.913736820220947, + "learning_rate": 0.0002978543195934501, + "loss": 0.6829, "step": 10740 }, { "epoch": 2.023338979860719, - "grad_norm": 4.584529399871826, - "learning_rate": 1.785996612083569e-05, - "loss": 0.9226, + "grad_norm": 3.4823250770568848, + "learning_rate": 0.00029766610201392815, + "loss": 0.7979, "step": 10750 }, { "epoch": 2.025221155655938, - "grad_norm": 9.03012466430664, - "learning_rate": 1.784867306606437e-05, - "loss": 0.7807, + "grad_norm": 3.6224143505096436, + "learning_rate": 0.0002974778844344062, + "loss": 0.781, "step": 10760 }, { "epoch": 2.0271033314511575, - "grad_norm": 14.892461776733398, - "learning_rate": 1.7837380011293055e-05, - "loss": 0.5403, + "grad_norm": 2.89218807220459, + "learning_rate": 0.00029728966685488425, + "loss": 0.5409, "step": 10770 }, { "epoch": 2.028985507246377, - "grad_norm": 18.233808517456055, - "learning_rate": 1.7826086956521738e-05, - "loss": 0.7445, + "grad_norm": 2.972883462905884, + "learning_rate": 0.00029710144927536233, + "loss": 0.5425, "step": 10780 }, { "epoch": 2.030867683041596, - "grad_norm": 7.108969211578369, - "learning_rate": 1.7814793901750424e-05, - "loss": 0.7014, + "grad_norm": 2.568448781967163, + "learning_rate": 0.0002969132316958404, + "loss": 0.7254, "step": 10790 }, { "epoch": 2.0327498588368154, - "grad_norm": 3.6049118041992188, - "learning_rate": 1.780350084697911e-05, - "loss": 0.6114, + "grad_norm": 1.1434894800186157, + "learning_rate": 0.0002967250141163185, + "loss": 0.6191, "step": 10800 }, { "epoch": 2.034632034632035, - "grad_norm": 8.403626441955566, - "learning_rate": 1.7792207792207792e-05, - "loss": 0.856, + "grad_norm": 2.585139751434326, + "learning_rate": 0.00029653679653679656, + "loss": 0.8831, "step": 10810 }, { "epoch": 2.036514210427254, - "grad_norm": 3.540233850479126, - "learning_rate": 1.778091473743648e-05, - "loss": 0.5674, + "grad_norm": 2.129727363586426, + "learning_rate": 0.00029634857895727463, + "loss": 0.5341, "step": 10820 }, { "epoch": 2.038396386222473, - "grad_norm": 13.208251953125, - "learning_rate": 1.776962168266516e-05, - "loss": 0.829, + "grad_norm": 3.551943063735962, + "learning_rate": 0.0002961603613777527, + "loss": 0.8215, "step": 10830 }, { "epoch": 2.0402785620176926, - "grad_norm": 3.715716600418091, - "learning_rate": 1.7758328627893847e-05, - "loss": 0.7736, + "grad_norm": 0.25661563873291016, + "learning_rate": 0.0002959721437982308, + "loss": 0.6864, "step": 10840 }, { "epoch": 2.0421607378129116, - "grad_norm": 14.193865776062012, - "learning_rate": 1.774703557312253e-05, - "loss": 0.4945, + "grad_norm": 2.998699903488159, + "learning_rate": 0.0002957839262187088, + "loss": 0.512, "step": 10850 }, { "epoch": 2.044042913608131, - "grad_norm": 0.2950342893600464, - "learning_rate": 1.7735742518351213e-05, - "loss": 0.7292, + "grad_norm": 0.43398717045783997, + "learning_rate": 0.0002955957086391869, + "loss": 0.5836, "step": 10860 }, { "epoch": 2.04592508940335, - "grad_norm": 6.488024711608887, - "learning_rate": 1.77244494635799e-05, - "loss": 0.7146, + "grad_norm": 1.4403588771820068, + "learning_rate": 0.00029540749105966496, + "loss": 0.6644, "step": 10870 }, { "epoch": 2.0478072651985695, - "grad_norm": 19.793678283691406, - "learning_rate": 1.771315640880858e-05, - "loss": 0.6736, + "grad_norm": 2.820168972015381, + "learning_rate": 0.00029521927348014304, + "loss": 0.6328, "step": 10880 }, { "epoch": 2.049689440993789, - "grad_norm": 12.970588684082031, - "learning_rate": 1.7701863354037267e-05, - "loss": 0.8303, + "grad_norm": 3.101900339126587, + "learning_rate": 0.0002950310559006211, + "loss": 0.6874, "step": 10890 }, { "epoch": 2.051571616789008, - "grad_norm": 8.980803489685059, - "learning_rate": 1.7690570299265954e-05, - "loss": 0.9072, + "grad_norm": 3.710576295852661, + "learning_rate": 0.0002948428383210992, + "loss": 0.9401, "step": 10900 }, { "epoch": 2.0534537925842273, - "grad_norm": 3.427503824234009, - "learning_rate": 1.7679277244494636e-05, - "loss": 0.6361, + "grad_norm": 2.558265447616577, + "learning_rate": 0.00029465462074157727, + "loss": 0.6474, "step": 10910 }, { "epoch": 2.0553359683794468, - "grad_norm": 9.372367858886719, - "learning_rate": 1.7667984189723322e-05, - "loss": 0.601, + "grad_norm": 1.6897790431976318, + "learning_rate": 0.00029446640316205534, + "loss": 0.5211, "step": 10920 }, { "epoch": 2.0572181441746658, - "grad_norm": 31.908103942871094, - "learning_rate": 1.7656691134952005e-05, - "loss": 0.4396, + "grad_norm": 3.507096529006958, + "learning_rate": 0.0002942781855825334, + "loss": 0.5326, "step": 10930 }, { "epoch": 2.059100319969885, - "grad_norm": 10.805863380432129, - "learning_rate": 1.7645398080180688e-05, - "loss": 0.6055, + "grad_norm": 3.556894540786743, + "learning_rate": 0.00029408996800301144, + "loss": 0.6389, "step": 10940 }, { "epoch": 2.0609824957651046, - "grad_norm": 9.006129264831543, - "learning_rate": 1.7634105025409374e-05, - "loss": 0.7486, + "grad_norm": 2.366298198699951, + "learning_rate": 0.0002939017504234895, + "loss": 0.5868, "step": 10950 }, { "epoch": 2.0628646715603236, - "grad_norm": 12.054622650146484, - "learning_rate": 1.7622811970638056e-05, - "loss": 0.5213, + "grad_norm": 2.686265707015991, + "learning_rate": 0.00029371353284396765, + "loss": 0.5434, "step": 10960 }, { "epoch": 2.064746847355543, - "grad_norm": 4.848597526550293, - "learning_rate": 1.7611518915866743e-05, - "loss": 0.4883, + "grad_norm": 1.7855459451675415, + "learning_rate": 0.0002935253152644457, + "loss": 0.5321, "step": 10970 }, { "epoch": 2.0666290231507625, - "grad_norm": 0.7347248792648315, - "learning_rate": 1.760022586109543e-05, - "loss": 0.6849, + "grad_norm": 2.971705436706543, + "learning_rate": 0.0002933370976849238, + "loss": 0.6643, "step": 10980 }, { "epoch": 2.0685111989459815, - "grad_norm": 3.6684467792510986, - "learning_rate": 1.758893280632411e-05, - "loss": 0.7872, + "grad_norm": 0.7849436402320862, + "learning_rate": 0.0002931488801054019, + "loss": 0.6436, "step": 10990 }, { "epoch": 2.070393374741201, - "grad_norm": 1.874008059501648, - "learning_rate": 1.7577639751552797e-05, - "loss": 0.8974, + "grad_norm": 0.2096141129732132, + "learning_rate": 0.00029296066252587996, + "loss": 0.8931, "step": 11000 }, { "epoch": 2.0722755505364203, - "grad_norm": 8.359776496887207, - "learning_rate": 1.756634669678148e-05, - "loss": 0.9378, + "grad_norm": 2.518460273742676, + "learning_rate": 0.00029277244494635803, + "loss": 0.9559, "step": 11010 }, { "epoch": 2.0741577263316393, - "grad_norm": 8.968391418457031, - "learning_rate": 1.7555053642010163e-05, - "loss": 0.404, + "grad_norm": 4.189047813415527, + "learning_rate": 0.00029258422736683605, + "loss": 0.4586, "step": 11020 }, { "epoch": 2.0760399021268587, - "grad_norm": 40.07794952392578, - "learning_rate": 1.754376058723885e-05, - "loss": 0.5055, + "grad_norm": 7.893125057220459, + "learning_rate": 0.00029239600978731413, + "loss": 0.4236, "step": 11030 }, { "epoch": 2.0779220779220777, - "grad_norm": 45.67797088623047, - "learning_rate": 1.753246753246753e-05, - "loss": 0.5616, + "grad_norm": 19.647254943847656, + "learning_rate": 0.0002922077922077922, + "loss": 0.5715, "step": 11040 }, { "epoch": 2.079804253717297, - "grad_norm": 11.17763900756836, - "learning_rate": 1.7521174477696218e-05, - "loss": 0.6244, + "grad_norm": 2.632347822189331, + "learning_rate": 0.0002920195746282703, + "loss": 0.5478, "step": 11050 }, { "epoch": 2.0816864295125166, - "grad_norm": 11.508498191833496, - "learning_rate": 1.75098814229249e-05, - "loss": 0.6413, + "grad_norm": 5.197866439819336, + "learning_rate": 0.00029183135704874836, + "loss": 0.7797, "step": 11060 }, { "epoch": 2.0835686053077356, - "grad_norm": 7.054535865783691, - "learning_rate": 1.7498588368153586e-05, - "loss": 1.0319, + "grad_norm": 2.2426016330718994, + "learning_rate": 0.00029164313946922644, + "loss": 0.9397, "step": 11070 }, { "epoch": 2.085450781102955, - "grad_norm": 1.5207102298736572, - "learning_rate": 1.7487295313382272e-05, - "loss": 0.6158, + "grad_norm": 1.2567468881607056, + "learning_rate": 0.0002914549218897045, + "loss": 0.6406, "step": 11080 }, { "epoch": 2.0873329568981744, - "grad_norm": 6.750110149383545, - "learning_rate": 1.7476002258610955e-05, - "loss": 0.5906, + "grad_norm": 3.520487070083618, + "learning_rate": 0.0002912667043101826, + "loss": 0.5526, "step": 11090 }, { "epoch": 2.0892151326933934, - "grad_norm": 5.071254730224609, - "learning_rate": 1.746470920383964e-05, - "loss": 0.7611, + "grad_norm": 3.5984339714050293, + "learning_rate": 0.00029107848673066067, + "loss": 0.6277, "step": 11100 }, { "epoch": 2.091097308488613, - "grad_norm": 8.039938926696777, - "learning_rate": 1.745341614906832e-05, - "loss": 0.7571, + "grad_norm": 3.593083381652832, + "learning_rate": 0.0002908902691511387, + "loss": 0.8077, "step": 11110 }, { "epoch": 2.0929794842838323, - "grad_norm": 9.071053504943848, - "learning_rate": 1.7442123094297007e-05, - "loss": 0.6185, + "grad_norm": 4.32984733581543, + "learning_rate": 0.00029070205157161676, + "loss": 0.5985, "step": 11120 }, { "epoch": 2.0948616600790513, - "grad_norm": 7.90115213394165, - "learning_rate": 1.7430830039525693e-05, - "loss": 0.8708, + "grad_norm": 4.792181968688965, + "learning_rate": 0.00029051383399209484, + "loss": 0.91, "step": 11130 }, { "epoch": 2.0967438358742707, - "grad_norm": 1.540299415588379, - "learning_rate": 1.7419536984754375e-05, - "loss": 0.6717, + "grad_norm": 0.8018267154693604, + "learning_rate": 0.0002903256164125729, + "loss": 0.8573, "step": 11140 }, { "epoch": 2.09862601166949, - "grad_norm": 5.848174095153809, - "learning_rate": 1.740824392998306e-05, - "loss": 0.785, + "grad_norm": 1.1662284135818481, + "learning_rate": 0.000290137398833051, + "loss": 0.8669, "step": 11150 }, { "epoch": 2.100508187464709, - "grad_norm": 12.827478408813477, - "learning_rate": 1.7396950875211744e-05, - "loss": 0.6168, + "grad_norm": 3.500774621963501, + "learning_rate": 0.00028994918125352907, + "loss": 0.6141, "step": 11160 }, { "epoch": 2.1023903632599286, - "grad_norm": 2.8471341133117676, - "learning_rate": 1.738565782044043e-05, - "loss": 0.6963, + "grad_norm": 0.8624693751335144, + "learning_rate": 0.00028976096367400715, + "loss": 0.6528, "step": 11170 }, { "epoch": 2.1042725390551476, - "grad_norm": 12.310620307922363, - "learning_rate": 1.7374364765669116e-05, - "loss": 0.6679, + "grad_norm": 4.650200843811035, + "learning_rate": 0.0002895727460944853, + "loss": 0.5764, "step": 11180 }, { "epoch": 2.106154714850367, - "grad_norm": 2.6820571422576904, - "learning_rate": 1.73630717108978e-05, - "loss": 0.7162, + "grad_norm": 0.9694834351539612, + "learning_rate": 0.00028938452851496335, + "loss": 0.7189, "step": 11190 }, { "epoch": 2.1080368906455864, - "grad_norm": 28.728363037109375, - "learning_rate": 1.735177865612648e-05, - "loss": 1.1478, + "grad_norm": 3.7187447547912598, + "learning_rate": 0.0002891963109354414, + "loss": 1.0254, "step": 11200 }, { "epoch": 2.1099190664408054, - "grad_norm": 13.076791763305664, - "learning_rate": 1.7340485601355168e-05, - "loss": 0.9537, + "grad_norm": 0.849228024482727, + "learning_rate": 0.00028900809335591945, + "loss": 0.8398, "step": 11210 }, { "epoch": 2.111801242236025, - "grad_norm": 22.759828567504883, - "learning_rate": 1.732919254658385e-05, - "loss": 0.8617, + "grad_norm": 3.1720340251922607, + "learning_rate": 0.00028881987577639753, + "loss": 0.7663, "step": 11220 }, { "epoch": 2.1136834180312443, - "grad_norm": 9.096440315246582, - "learning_rate": 1.7317899491812536e-05, - "loss": 0.5249, + "grad_norm": 0.4519988000392914, + "learning_rate": 0.0002886316581968756, + "loss": 0.3329, "step": 11230 }, { "epoch": 2.1155655938264633, - "grad_norm": 14.975255012512207, - "learning_rate": 1.730660643704122e-05, - "loss": 0.6067, + "grad_norm": 3.3207151889801025, + "learning_rate": 0.0002884434406173537, + "loss": 0.5899, "step": 11240 }, { "epoch": 2.1174477696216827, - "grad_norm": 16.731473922729492, - "learning_rate": 1.7295313382269905e-05, - "loss": 0.588, + "grad_norm": 2.0980584621429443, + "learning_rate": 0.00028825522303783176, + "loss": 0.4889, "step": 11250 }, { "epoch": 2.119329945416902, - "grad_norm": 37.46254348754883, - "learning_rate": 1.728402032749859e-05, - "loss": 0.6994, + "grad_norm": 6.887931823730469, + "learning_rate": 0.00028806700545830983, + "loss": 0.7055, "step": 11260 }, { "epoch": 2.121212121212121, - "grad_norm": 4.683788299560547, - "learning_rate": 1.7272727272727274e-05, - "loss": 0.5429, + "grad_norm": 0.9553453326225281, + "learning_rate": 0.0002878787878787879, + "loss": 0.5491, "step": 11270 }, { "epoch": 2.1230942970073405, - "grad_norm": 1.7957135438919067, - "learning_rate": 1.7261434217955957e-05, - "loss": 0.4845, + "grad_norm": 0.46205490827560425, + "learning_rate": 0.00028769057029926593, + "loss": 0.4709, "step": 11280 }, { "epoch": 2.12497647280256, - "grad_norm": 13.624578475952148, - "learning_rate": 1.725014116318464e-05, - "loss": 0.892, + "grad_norm": 2.981776475906372, + "learning_rate": 0.000287502352719744, + "loss": 0.7086, "step": 11290 }, { "epoch": 2.126858648597779, - "grad_norm": 40.41400909423828, - "learning_rate": 1.7238848108413325e-05, - "loss": 0.8509, + "grad_norm": 6.404483318328857, + "learning_rate": 0.0002873141351402221, + "loss": 0.6254, "step": 11300 }, { "epoch": 2.1287408243929984, - "grad_norm": 16.918149948120117, - "learning_rate": 1.722755505364201e-05, - "loss": 0.9205, + "grad_norm": 2.772292375564575, + "learning_rate": 0.00028712591756070016, + "loss": 0.7199, "step": 11310 }, { "epoch": 2.1306230001882174, - "grad_norm": 35.62562561035156, - "learning_rate": 1.7216261998870694e-05, - "loss": 1.0122, + "grad_norm": 3.2327826023101807, + "learning_rate": 0.00028693769998117824, + "loss": 0.8967, "step": 11320 }, { "epoch": 2.132505175983437, - "grad_norm": 10.24116325378418, - "learning_rate": 1.720496894409938e-05, - "loss": 0.7243, + "grad_norm": 2.0082974433898926, + "learning_rate": 0.0002867494824016563, + "loss": 0.7676, "step": 11330 }, { "epoch": 2.1343873517786562, - "grad_norm": 20.230663299560547, - "learning_rate": 1.7193675889328063e-05, - "loss": 0.8808, + "grad_norm": 3.831946611404419, + "learning_rate": 0.0002865612648221344, + "loss": 0.8358, "step": 11340 }, { "epoch": 2.1362695275738752, - "grad_norm": 11.159217834472656, - "learning_rate": 1.718238283455675e-05, - "loss": 0.9238, + "grad_norm": 1.170330286026001, + "learning_rate": 0.00028637304724261247, + "loss": 0.9855, "step": 11350 }, { "epoch": 2.1381517033690947, - "grad_norm": 15.367325782775879, - "learning_rate": 1.7171089779785435e-05, - "loss": 0.837, + "grad_norm": 2.1947906017303467, + "learning_rate": 0.00028618482966309054, + "loss": 0.7599, "step": 11360 }, { "epoch": 2.140033879164314, - "grad_norm": 6.264622688293457, - "learning_rate": 1.7159796725014114e-05, - "loss": 0.7584, + "grad_norm": 0.594887375831604, + "learning_rate": 0.00028599661208356857, + "loss": 0.6236, "step": 11370 }, { "epoch": 2.141916054959533, - "grad_norm": 7.990375518798828, - "learning_rate": 1.71485036702428e-05, - "loss": 0.5522, + "grad_norm": 2.23850417137146, + "learning_rate": 0.00028580839450404664, + "loss": 0.5686, "step": 11380 }, { "epoch": 2.1437982307547525, - "grad_norm": 13.449748039245605, - "learning_rate": 1.7137210615471483e-05, - "loss": 0.9335, + "grad_norm": 2.4512314796447754, + "learning_rate": 0.0002856201769245247, + "loss": 0.8437, "step": 11390 }, { "epoch": 2.145680406549972, - "grad_norm": 16.00578498840332, - "learning_rate": 1.712591756070017e-05, - "loss": 0.6726, + "grad_norm": 2.53229022026062, + "learning_rate": 0.0002854319593450028, + "loss": 0.7272, "step": 11400 }, { "epoch": 2.147562582345191, - "grad_norm": 6.167961120605469, - "learning_rate": 1.7114624505928855e-05, - "loss": 0.9023, + "grad_norm": 2.3535914421081543, + "learning_rate": 0.0002852437417654809, + "loss": 0.9296, "step": 11410 }, { "epoch": 2.1494447581404104, - "grad_norm": 16.115100860595703, - "learning_rate": 1.7103331451157538e-05, - "loss": 0.9606, + "grad_norm": 3.0220978260040283, + "learning_rate": 0.000285055524185959, + "loss": 0.7514, "step": 11420 }, { "epoch": 2.15132693393563, - "grad_norm": 30.645904541015625, - "learning_rate": 1.7092038396386224e-05, - "loss": 0.6028, + "grad_norm": 4.237234592437744, + "learning_rate": 0.0002848673066064371, + "loss": 0.5762, "step": 11430 }, { "epoch": 2.153209109730849, - "grad_norm": 4.448369026184082, - "learning_rate": 1.7080745341614907e-05, - "loss": 0.9826, + "grad_norm": 1.3424307107925415, + "learning_rate": 0.00028467908902691516, + "loss": 0.9281, "step": 11440 }, { "epoch": 2.155091285526068, - "grad_norm": 13.802371978759766, - "learning_rate": 1.7069452286843593e-05, - "loss": 1.0515, + "grad_norm": 3.5801596641540527, + "learning_rate": 0.00028449087144739323, + "loss": 1.062, "step": 11450 }, { "epoch": 2.1569734613212876, - "grad_norm": 18.508493423461914, - "learning_rate": 1.7058159232072276e-05, - "loss": 0.5478, + "grad_norm": 3.8475425243377686, + "learning_rate": 0.00028430265386787125, + "loss": 0.5018, "step": 11460 }, { "epoch": 2.1588556371165066, - "grad_norm": 18.791305541992188, - "learning_rate": 1.7046866177300958e-05, - "loss": 0.6688, + "grad_norm": 5.038808822631836, + "learning_rate": 0.00028411443628834933, + "loss": 0.4989, "step": 11470 }, { "epoch": 2.160737812911726, - "grad_norm": 13.487887382507324, - "learning_rate": 1.7035573122529644e-05, - "loss": 0.7173, + "grad_norm": 4.923741817474365, + "learning_rate": 0.0002839262187088274, + "loss": 0.6093, "step": 11480 }, { "epoch": 2.162619988706945, - "grad_norm": 7.567160129547119, - "learning_rate": 1.702428006775833e-05, - "loss": 0.5889, + "grad_norm": 1.457595705986023, + "learning_rate": 0.0002837380011293055, + "loss": 0.6103, "step": 11490 }, { "epoch": 2.1645021645021645, - "grad_norm": 14.993462562561035, - "learning_rate": 1.7012987012987013e-05, - "loss": 0.6631, + "grad_norm": 3.0185911655426025, + "learning_rate": 0.00028354978354978356, + "loss": 0.7228, "step": 11500 }, { "epoch": 2.166384340297384, - "grad_norm": 2.703808069229126, - "learning_rate": 1.70016939582157e-05, - "loss": 0.8147, + "grad_norm": 1.2738662958145142, + "learning_rate": 0.00028336156597026164, + "loss": 0.6904, "step": 11510 }, { "epoch": 2.168266516092603, - "grad_norm": 13.842448234558105, - "learning_rate": 1.6990400903444382e-05, - "loss": 0.6228, + "grad_norm": 1.419976830482483, + "learning_rate": 0.0002831733483907397, + "loss": 0.7167, "step": 11520 }, { "epoch": 2.1701486918878223, - "grad_norm": 14.048020362854004, - "learning_rate": 1.6979107848673068e-05, - "loss": 0.6811, + "grad_norm": 3.3569860458374023, + "learning_rate": 0.0002829851308112178, + "loss": 0.5419, "step": 11530 }, { "epoch": 2.1720308676830418, - "grad_norm": 9.474082946777344, - "learning_rate": 1.6967814793901754e-05, - "loss": 1.0404, + "grad_norm": 1.6389613151550293, + "learning_rate": 0.00028279691323169587, + "loss": 0.8041, "step": 11540 }, { "epoch": 2.1739130434782608, - "grad_norm": 7.542745113372803, - "learning_rate": 1.6956521739130433e-05, - "loss": 0.7817, + "grad_norm": 1.0149704217910767, + "learning_rate": 0.0002826086956521739, + "loss": 0.6128, "step": 11550 }, { "epoch": 2.17579521927348, - "grad_norm": 14.078716278076172, - "learning_rate": 1.694522868435912e-05, - "loss": 0.9704, + "grad_norm": 3.377378225326538, + "learning_rate": 0.00028242047807265196, + "loss": 0.8804, "step": 11560 }, { "epoch": 2.1776773950686996, - "grad_norm": 19.203222274780273, - "learning_rate": 1.6933935629587802e-05, - "loss": 0.6797, + "grad_norm": 1.8844712972640991, + "learning_rate": 0.00028223226049313004, + "loss": 0.8288, "step": 11570 }, { "epoch": 2.1795595708639186, - "grad_norm": 10.718687057495117, - "learning_rate": 1.6922642574816488e-05, - "loss": 0.7273, + "grad_norm": 2.894857168197632, + "learning_rate": 0.0002820440429136081, + "loss": 0.7565, "step": 11580 }, { "epoch": 2.181441746659138, - "grad_norm": 7.656870365142822, - "learning_rate": 1.6911349520045174e-05, - "loss": 0.4612, + "grad_norm": 4.635049343109131, + "learning_rate": 0.0002818558253340862, + "loss": 0.6153, "step": 11590 }, { "epoch": 2.183323922454357, - "grad_norm": 9.046762466430664, - "learning_rate": 1.6900056465273857e-05, - "loss": 0.815, + "grad_norm": 2.9711811542510986, + "learning_rate": 0.00028166760775456427, + "loss": 0.6096, "step": 11600 }, { "epoch": 2.1852060982495765, - "grad_norm": 16.06904411315918, - "learning_rate": 1.6888763410502543e-05, - "loss": 0.7, + "grad_norm": 2.8086323738098145, + "learning_rate": 0.00028147939017504235, + "loss": 0.6508, "step": 11610 }, { "epoch": 2.187088274044796, - "grad_norm": 17.846696853637695, - "learning_rate": 1.6877470355731226e-05, - "loss": 0.8311, + "grad_norm": 2.190927743911743, + "learning_rate": 0.0002812911725955205, + "loss": 0.9356, "step": 11620 }, { "epoch": 2.188970449840015, - "grad_norm": 0.43426594138145447, - "learning_rate": 1.686617730095991e-05, - "loss": 0.901, + "grad_norm": 0.3786638379096985, + "learning_rate": 0.0002811029550159985, + "loss": 1.0633, "step": 11630 }, { "epoch": 2.1908526256352343, - "grad_norm": 21.463172912597656, - "learning_rate": 1.6854884246188594e-05, - "loss": 1.1355, + "grad_norm": 4.207404136657715, + "learning_rate": 0.0002809147374364766, + "loss": 0.9465, "step": 11640 }, { "epoch": 2.1927348014304537, - "grad_norm": 4.620266914367676, - "learning_rate": 1.6843591191417277e-05, - "loss": 0.6449, + "grad_norm": 1.1538981199264526, + "learning_rate": 0.00028072651985695465, + "loss": 0.6115, "step": 11650 }, { "epoch": 2.1946169772256727, - "grad_norm": 1.4968249797821045, - "learning_rate": 1.6832298136645963e-05, - "loss": 0.6416, + "grad_norm": 0.5456836223602295, + "learning_rate": 0.00028053830227743273, + "loss": 0.5041, "step": 11660 }, { "epoch": 2.196499153020892, - "grad_norm": 5.055369853973389, - "learning_rate": 1.6821005081874646e-05, - "loss": 0.6069, + "grad_norm": 2.0955114364624023, + "learning_rate": 0.0002803500846979108, + "loss": 0.6258, "step": 11670 }, { "epoch": 2.1983813288161116, - "grad_norm": 17.229318618774414, - "learning_rate": 1.6809712027103332e-05, - "loss": 0.7881, + "grad_norm": 2.7322144508361816, + "learning_rate": 0.0002801618671183889, + "loss": 0.8496, "step": 11680 }, { "epoch": 2.2002635046113306, - "grad_norm": 26.358625411987305, - "learning_rate": 1.6798418972332018e-05, - "loss": 0.9306, + "grad_norm": 6.970944404602051, + "learning_rate": 0.00027997364953886696, + "loss": 0.8897, "step": 11690 }, { "epoch": 2.20214568040655, - "grad_norm": 14.272749900817871, - "learning_rate": 1.67871259175607e-05, - "loss": 0.7933, + "grad_norm": 0.6811609864234924, + "learning_rate": 0.00027978543195934503, + "loss": 0.8129, "step": 11700 }, { "epoch": 2.2040278562017694, - "grad_norm": 17.075315475463867, - "learning_rate": 1.6775832862789387e-05, - "loss": 0.9922, + "grad_norm": 5.177496433258057, + "learning_rate": 0.0002795972143798231, + "loss": 0.8563, "step": 11710 }, { "epoch": 2.2059100319969884, - "grad_norm": 18.468908309936523, - "learning_rate": 1.676453980801807e-05, - "loss": 0.549, + "grad_norm": 3.5530927181243896, + "learning_rate": 0.00027940899680030113, + "loss": 0.5076, "step": 11720 }, { "epoch": 2.207792207792208, - "grad_norm": 7.144861698150635, - "learning_rate": 1.6753246753246752e-05, - "loss": 0.7335, + "grad_norm": 2.238463878631592, + "learning_rate": 0.0002792207792207792, + "loss": 0.7625, "step": 11730 }, { "epoch": 2.2096743835874273, - "grad_norm": 20.43572235107422, - "learning_rate": 1.6741953698475438e-05, - "loss": 0.4813, + "grad_norm": 3.0056402683258057, + "learning_rate": 0.0002790325616412573, + "loss": 0.5276, "step": 11740 }, { "epoch": 2.2115565593826463, - "grad_norm": 8.59123420715332, - "learning_rate": 1.673066064370412e-05, - "loss": 0.8275, + "grad_norm": 2.8394482135772705, + "learning_rate": 0.00027884434406173536, + "loss": 0.8038, "step": 11750 }, { "epoch": 2.2134387351778657, - "grad_norm": 17.360458374023438, - "learning_rate": 1.6719367588932807e-05, - "loss": 0.7514, + "grad_norm": 1.930206298828125, + "learning_rate": 0.00027865612648221344, + "loss": 0.7018, "step": 11760 }, { "epoch": 2.2153209109730847, - "grad_norm": 10.042230606079102, - "learning_rate": 1.6708074534161493e-05, - "loss": 1.0016, + "grad_norm": 2.354780912399292, + "learning_rate": 0.0002784679089026915, + "loss": 0.987, "step": 11770 }, { "epoch": 2.217203086768304, - "grad_norm": 19.570775985717773, - "learning_rate": 1.6696781479390176e-05, - "loss": 0.8327, + "grad_norm": 5.227718830108643, + "learning_rate": 0.0002782796913231696, + "loss": 0.7865, "step": 11780 }, { "epoch": 2.2190852625635236, - "grad_norm": 14.94032096862793, - "learning_rate": 1.6685488424618862e-05, - "loss": 0.6058, + "grad_norm": 4.314568042755127, + "learning_rate": 0.00027809147374364767, + "loss": 0.6195, "step": 11790 }, { "epoch": 2.2209674383587426, - "grad_norm": 15.301405906677246, - "learning_rate": 1.6674195369847545e-05, - "loss": 0.4892, + "grad_norm": 2.7393124103546143, + "learning_rate": 0.00027790325616412574, + "loss": 0.4084, "step": 11800 }, { "epoch": 2.222849614153962, - "grad_norm": 18.80755615234375, - "learning_rate": 1.6662902315076227e-05, - "loss": 0.7236, + "grad_norm": 2.8917291164398193, + "learning_rate": 0.00027771503858460377, + "loss": 0.6465, "step": 11810 }, { "epoch": 2.2247317899491814, - "grad_norm": 15.558100700378418, - "learning_rate": 1.6651609260304913e-05, - "loss": 0.7063, + "grad_norm": 1.8676915168762207, + "learning_rate": 0.00027752682100508184, + "loss": 0.776, "step": 11820 }, { "epoch": 2.2266139657444004, - "grad_norm": 13.449424743652344, - "learning_rate": 1.6640316205533596e-05, - "loss": 0.5049, + "grad_norm": 4.098349094390869, + "learning_rate": 0.0002773386034255599, + "loss": 0.4904, "step": 11830 }, { "epoch": 2.22849614153962, - "grad_norm": 10.576664924621582, - "learning_rate": 1.6629023150762282e-05, - "loss": 1.2078, + "grad_norm": 3.7217068672180176, + "learning_rate": 0.000277150385846038, + "loss": 0.9678, "step": 11840 }, { "epoch": 2.2303783173348393, - "grad_norm": 4.189072608947754, - "learning_rate": 1.6617730095990965e-05, - "loss": 0.7146, + "grad_norm": 3.343392848968506, + "learning_rate": 0.0002769621682665161, + "loss": 0.7159, "step": 11850 }, { "epoch": 2.2322604931300583, - "grad_norm": 10.161325454711914, - "learning_rate": 1.660643704121965e-05, - "loss": 0.6925, + "grad_norm": 3.8852624893188477, + "learning_rate": 0.0002767739506869942, + "loss": 0.6171, "step": 11860 }, { "epoch": 2.2341426689252777, - "grad_norm": 29.488740921020508, - "learning_rate": 1.6595143986448337e-05, - "loss": 0.6859, + "grad_norm": 3.444432258605957, + "learning_rate": 0.0002765857331074723, + "loss": 0.4322, "step": 11870 }, { "epoch": 2.2360248447204967, - "grad_norm": 6.638284206390381, - "learning_rate": 1.658385093167702e-05, - "loss": 0.7455, + "grad_norm": 4.453119277954102, + "learning_rate": 0.00027639751552795036, + "loss": 0.7776, "step": 11880 }, { "epoch": 2.237907020515716, - "grad_norm": 14.819140434265137, - "learning_rate": 1.6572557876905702e-05, - "loss": 0.9682, + "grad_norm": 1.7033886909484863, + "learning_rate": 0.0002762092979484284, + "loss": 0.7152, "step": 11890 }, { "epoch": 2.2397891963109355, - "grad_norm": 16.32223129272461, - "learning_rate": 1.6561264822134385e-05, - "loss": 0.8409, + "grad_norm": 3.5321342945098877, + "learning_rate": 0.00027602108036890645, + "loss": 0.7527, "step": 11900 }, { "epoch": 2.2416713721061545, - "grad_norm": 33.57245635986328, - "learning_rate": 1.654997176736307e-05, - "loss": 0.8191, + "grad_norm": 4.50179386138916, + "learning_rate": 0.00027583286278938453, + "loss": 0.7161, "step": 11910 }, { "epoch": 2.243553547901374, - "grad_norm": 11.583626747131348, - "learning_rate": 1.6538678712591757e-05, - "loss": 0.7672, + "grad_norm": 4.627577304840088, + "learning_rate": 0.0002756446452098626, + "loss": 0.7929, "step": 11920 }, { "epoch": 2.2454357236965934, - "grad_norm": 19.42552947998047, - "learning_rate": 1.652738565782044e-05, - "loss": 0.6141, + "grad_norm": 1.926927089691162, + "learning_rate": 0.0002754564276303407, + "loss": 0.5007, "step": 11930 }, { "epoch": 2.2473178994918124, - "grad_norm": 8.740943908691406, - "learning_rate": 1.6516092603049126e-05, - "loss": 0.5423, + "grad_norm": 7.196998119354248, + "learning_rate": 0.00027526821005081876, + "loss": 0.4507, "step": 11940 }, { "epoch": 2.249200075287032, - "grad_norm": 9.475835800170898, - "learning_rate": 1.650479954827781e-05, - "loss": 0.9297, + "grad_norm": 1.706291675567627, + "learning_rate": 0.00027507999247129684, + "loss": 1.0266, "step": 11950 }, { "epoch": 2.2510822510822512, - "grad_norm": 10.173590660095215, - "learning_rate": 1.6493506493506495e-05, - "loss": 0.6418, + "grad_norm": 2.996903657913208, + "learning_rate": 0.0002748917748917749, + "loss": 0.5698, "step": 11960 }, { "epoch": 2.2529644268774702, - "grad_norm": 5.159122467041016, - "learning_rate": 1.648221343873518e-05, - "loss": 0.4525, + "grad_norm": 1.0445246696472168, + "learning_rate": 0.000274703557312253, + "loss": 0.382, "step": 11970 }, { "epoch": 2.2548466026726897, - "grad_norm": 20.905466079711914, - "learning_rate": 1.647092038396386e-05, - "loss": 0.4453, + "grad_norm": 3.8929078578948975, + "learning_rate": 0.000274515339732731, + "loss": 0.3706, "step": 11980 }, { "epoch": 2.256728778467909, - "grad_norm": 14.282583236694336, - "learning_rate": 1.6459627329192546e-05, - "loss": 0.6531, + "grad_norm": 3.6355106830596924, + "learning_rate": 0.0002743271221532091, + "loss": 0.6808, "step": 11990 }, { "epoch": 2.258610954263128, - "grad_norm": 4.331676006317139, - "learning_rate": 1.6448334274421232e-05, - "loss": 0.6144, + "grad_norm": 0.9284248352050781, + "learning_rate": 0.00027413890457368716, + "loss": 0.5733, "step": 12000 }, { "epoch": 2.2604931300583475, - "grad_norm": 18.63337516784668, - "learning_rate": 1.6437041219649915e-05, - "loss": 0.9553, + "grad_norm": 4.699124336242676, + "learning_rate": 0.00027395068699416524, + "loss": 0.8723, "step": 12010 }, { "epoch": 2.262375305853567, - "grad_norm": 8.563776016235352, - "learning_rate": 1.64257481648786e-05, - "loss": 0.5918, + "grad_norm": 2.852851152420044, + "learning_rate": 0.0002737624694146433, + "loss": 0.6141, "step": 12020 }, { "epoch": 2.264257481648786, - "grad_norm": 26.9168758392334, - "learning_rate": 1.6414455110107284e-05, - "loss": 0.9051, + "grad_norm": 3.81124210357666, + "learning_rate": 0.0002735742518351214, + "loss": 0.8832, "step": 12030 }, { "epoch": 2.2661396574440054, - "grad_norm": 20.256752014160156, - "learning_rate": 1.640316205533597e-05, - "loss": 0.6102, + "grad_norm": 2.0278055667877197, + "learning_rate": 0.00027338603425559947, + "loss": 0.49, "step": 12040 }, { "epoch": 2.2680218332392243, - "grad_norm": 0.3493950366973877, - "learning_rate": 1.6391869000564656e-05, - "loss": 0.5125, + "grad_norm": 0.07482793182134628, + "learning_rate": 0.00027319781667607755, + "loss": 0.5725, "step": 12050 }, { "epoch": 2.269904009034444, - "grad_norm": 2.2542970180511475, - "learning_rate": 1.638057594579334e-05, - "loss": 0.7501, + "grad_norm": 1.8518965244293213, + "learning_rate": 0.0002730095990965556, + "loss": 0.6832, "step": 12060 }, { "epoch": 2.271786184829663, - "grad_norm": 8.313566207885742, - "learning_rate": 1.636928289102202e-05, - "loss": 0.6996, + "grad_norm": 1.8055180311203003, + "learning_rate": 0.0002728213815170337, + "loss": 0.6431, "step": 12070 }, { "epoch": 2.273668360624882, - "grad_norm": 7.590768337249756, - "learning_rate": 1.6357989836250704e-05, - "loss": 0.6913, + "grad_norm": 1.2969673871994019, + "learning_rate": 0.0002726331639375118, + "loss": 0.5825, "step": 12080 }, { "epoch": 2.2755505364201016, - "grad_norm": 17.979164123535156, - "learning_rate": 1.634669678147939e-05, - "loss": 0.5275, + "grad_norm": 4.328277587890625, + "learning_rate": 0.00027244494635798985, + "loss": 0.4851, "step": 12090 }, { "epoch": 2.277432712215321, - "grad_norm": 30.397066116333008, - "learning_rate": 1.6335403726708076e-05, - "loss": 0.8555, + "grad_norm": 4.203436851501465, + "learning_rate": 0.00027225672877846793, + "loss": 0.7723, "step": 12100 }, { "epoch": 2.27931488801054, - "grad_norm": 8.03955364227295, - "learning_rate": 1.632411067193676e-05, - "loss": 0.6765, + "grad_norm": 2.0930275917053223, + "learning_rate": 0.000272068511198946, + "loss": 0.6907, "step": 12110 }, { "epoch": 2.2811970638057595, - "grad_norm": 8.310486793518066, - "learning_rate": 1.6312817617165445e-05, - "loss": 0.8106, + "grad_norm": 1.692587971687317, + "learning_rate": 0.0002718802936194241, + "loss": 0.6536, "step": 12120 }, { "epoch": 2.283079239600979, - "grad_norm": 5.86186408996582, - "learning_rate": 1.6301524562394127e-05, - "loss": 0.4987, + "grad_norm": 1.9245163202285767, + "learning_rate": 0.00027169207603990216, + "loss": 0.4055, "step": 12130 }, { "epoch": 2.284961415396198, - "grad_norm": 19.821392059326172, - "learning_rate": 1.6290231507622814e-05, - "loss": 0.5728, + "grad_norm": 4.446379661560059, + "learning_rate": 0.00027150385846038023, + "loss": 0.4683, "step": 12140 }, { "epoch": 2.2868435911914173, - "grad_norm": 12.544803619384766, - "learning_rate": 1.62789384528515e-05, - "loss": 0.7435, + "grad_norm": 3.3666815757751465, + "learning_rate": 0.0002713156408808583, + "loss": 0.81, "step": 12150 }, { "epoch": 2.2887257669866363, - "grad_norm": 13.145238876342773, - "learning_rate": 1.626764539808018e-05, - "loss": 0.4866, + "grad_norm": 6.301428318023682, + "learning_rate": 0.00027112742330133633, + "loss": 0.5008, "step": 12160 }, { "epoch": 2.2906079427818558, - "grad_norm": 15.785924911499023, - "learning_rate": 1.6256352343308865e-05, - "loss": 0.8673, + "grad_norm": 4.091559410095215, + "learning_rate": 0.0002709392057218144, + "loss": 0.8201, "step": 12170 }, { "epoch": 2.292490118577075, - "grad_norm": 9.898990631103516, - "learning_rate": 1.6245059288537548e-05, - "loss": 0.7766, + "grad_norm": 2.8951735496520996, + "learning_rate": 0.0002707509881422925, + "loss": 0.6976, "step": 12180 }, { "epoch": 2.2943722943722946, - "grad_norm": 14.689043998718262, - "learning_rate": 1.6233766233766234e-05, - "loss": 0.9227, + "grad_norm": 2.0975356101989746, + "learning_rate": 0.00027056277056277056, + "loss": 0.7667, "step": 12190 }, { "epoch": 2.2962544701675136, - "grad_norm": 6.211693286895752, - "learning_rate": 1.622247317899492e-05, - "loss": 0.5012, + "grad_norm": 2.5957348346710205, + "learning_rate": 0.00027037455298324864, + "loss": 0.4919, "step": 12200 }, { "epoch": 2.298136645962733, - "grad_norm": 6.014758586883545, - "learning_rate": 1.6211180124223603e-05, - "loss": 0.8023, + "grad_norm": 1.4557297229766846, + "learning_rate": 0.0002701863354037267, + "loss": 0.6835, "step": 12210 }, { "epoch": 2.300018821757952, - "grad_norm": 32.03548812866211, - "learning_rate": 1.619988706945229e-05, - "loss": 0.86, + "grad_norm": 2.45815110206604, + "learning_rate": 0.0002699981178242048, + "loss": 0.8599, "step": 12220 }, { "epoch": 2.3019009975531715, - "grad_norm": 25.236791610717773, - "learning_rate": 1.6188594014680975e-05, - "loss": 0.7572, + "grad_norm": 3.9854960441589355, + "learning_rate": 0.00026980990024468287, + "loss": 0.6801, "step": 12230 }, { "epoch": 2.303783173348391, - "grad_norm": 7.06341552734375, - "learning_rate": 1.6177300959909654e-05, - "loss": 0.6443, + "grad_norm": 1.293655514717102, + "learning_rate": 0.0002696216826651609, + "loss": 0.6667, "step": 12240 }, { "epoch": 2.30566534914361, - "grad_norm": 3.819842576980591, - "learning_rate": 1.616600790513834e-05, - "loss": 0.6632, + "grad_norm": 2.157909870147705, + "learning_rate": 0.00026943346508563897, + "loss": 0.7323, "step": 12250 }, { "epoch": 2.3075475249388293, - "grad_norm": 11.308211326599121, - "learning_rate": 1.6154714850367023e-05, - "loss": 0.7541, + "grad_norm": 2.9319350719451904, + "learning_rate": 0.00026924524750611704, + "loss": 0.7194, "step": 12260 }, { "epoch": 2.3094297007340487, - "grad_norm": 9.683237075805664, - "learning_rate": 1.614342179559571e-05, - "loss": 0.9261, + "grad_norm": 3.461108922958374, + "learning_rate": 0.0002690570299265951, + "loss": 0.8909, "step": 12270 }, { "epoch": 2.3113118765292677, - "grad_norm": 21.996740341186523, - "learning_rate": 1.6132128740824395e-05, - "loss": 0.6662, + "grad_norm": 2.8008790016174316, + "learning_rate": 0.0002688688123470732, + "loss": 0.6946, "step": 12280 }, { "epoch": 2.313194052324487, - "grad_norm": 8.742904663085938, - "learning_rate": 1.6120835686053078e-05, - "loss": 0.7921, + "grad_norm": 2.3989953994750977, + "learning_rate": 0.0002686805947675513, + "loss": 0.9312, "step": 12290 }, { "epoch": 2.3150762281197066, - "grad_norm": 1.3880571126937866, - "learning_rate": 1.6109542631281764e-05, - "loss": 0.8751, + "grad_norm": 0.9723449945449829, + "learning_rate": 0.0002684923771880294, + "loss": 0.8079, "step": 12300 }, { "epoch": 2.3169584039149256, - "grad_norm": 26.320383071899414, - "learning_rate": 1.6098249576510446e-05, - "loss": 0.6595, + "grad_norm": 2.027454376220703, + "learning_rate": 0.0002683041596085075, + "loss": 0.5137, "step": 12310 }, { "epoch": 2.318840579710145, - "grad_norm": 34.07208251953125, - "learning_rate": 1.6086956521739132e-05, - "loss": 0.7587, + "grad_norm": 3.510030746459961, + "learning_rate": 0.00026811594202898556, + "loss": 0.6373, "step": 12320 }, { "epoch": 2.320722755505364, - "grad_norm": 7.523997783660889, - "learning_rate": 1.6075663466967815e-05, - "loss": 0.6733, + "grad_norm": 2.559351682662964, + "learning_rate": 0.0002679277244494636, + "loss": 0.6808, "step": 12330 }, { "epoch": 2.3226049313005834, - "grad_norm": 18.36374855041504, - "learning_rate": 1.6064370412196498e-05, - "loss": 0.8033, + "grad_norm": 3.51894474029541, + "learning_rate": 0.00026773950686994165, + "loss": 0.7996, "step": 12340 }, { "epoch": 2.324487107095803, - "grad_norm": 13.632719039916992, - "learning_rate": 1.6053077357425184e-05, - "loss": 0.9909, + "grad_norm": 3.97428822517395, + "learning_rate": 0.00026755128929041973, + "loss": 1.1201, "step": 12350 }, { "epoch": 2.326369282891022, - "grad_norm": 13.183788299560547, - "learning_rate": 1.6041784302653867e-05, - "loss": 0.8184, + "grad_norm": 2.7532434463500977, + "learning_rate": 0.0002673630717108978, + "loss": 0.8154, "step": 12360 }, { "epoch": 2.3282514586862413, - "grad_norm": 2.855896472930908, - "learning_rate": 1.6030491247882553e-05, - "loss": 0.5908, + "grad_norm": 4.514427185058594, + "learning_rate": 0.0002671748541313759, + "loss": 0.6268, "step": 12370 }, { "epoch": 2.3301336344814607, - "grad_norm": 2.077853202819824, - "learning_rate": 1.601919819311124e-05, - "loss": 0.5688, + "grad_norm": 1.4542497396469116, + "learning_rate": 0.00026698663655185396, + "loss": 0.6466, "step": 12380 }, { "epoch": 2.3320158102766797, - "grad_norm": 10.118651390075684, - "learning_rate": 1.600790513833992e-05, - "loss": 0.7142, + "grad_norm": 1.8610076904296875, + "learning_rate": 0.00026679841897233204, + "loss": 0.6654, "step": 12390 }, { "epoch": 2.333897986071899, - "grad_norm": 11.305803298950195, - "learning_rate": 1.5996612083568607e-05, - "loss": 1.1076, + "grad_norm": 2.503138303756714, + "learning_rate": 0.0002666102013928101, + "loss": 1.1184, "step": 12400 }, { "epoch": 2.3357801618671186, - "grad_norm": 10.361531257629395, - "learning_rate": 1.598531902879729e-05, - "loss": 0.7602, + "grad_norm": 4.176797389984131, + "learning_rate": 0.0002664219838132882, + "loss": 0.7134, "step": 12410 }, { "epoch": 2.3376623376623376, - "grad_norm": 20.7995662689209, - "learning_rate": 1.5974025974025973e-05, - "loss": 0.6195, + "grad_norm": 3.3237433433532715, + "learning_rate": 0.0002662337662337662, + "loss": 0.6421, "step": 12420 }, { "epoch": 2.339544513457557, - "grad_norm": 8.287638664245605, - "learning_rate": 1.596273291925466e-05, - "loss": 0.8501, + "grad_norm": 1.97113037109375, + "learning_rate": 0.0002660455486542443, + "loss": 0.8516, "step": 12430 }, { "epoch": 2.341426689252776, - "grad_norm": 20.09409523010254, - "learning_rate": 1.595143986448334e-05, - "loss": 0.8378, + "grad_norm": 3.358541488647461, + "learning_rate": 0.00026585733107472236, + "loss": 0.8102, "step": 12440 }, { "epoch": 2.3433088650479954, - "grad_norm": 26.83869171142578, - "learning_rate": 1.5940146809712028e-05, - "loss": 0.9183, + "grad_norm": 7.149372100830078, + "learning_rate": 0.00026566911349520044, + "loss": 0.9773, "step": 12450 }, { "epoch": 2.345191040843215, - "grad_norm": 0.30550795793533325, - "learning_rate": 1.592885375494071e-05, - "loss": 1.0637, + "grad_norm": 0.12019229680299759, + "learning_rate": 0.0002654808959156785, + "loss": 0.9357, "step": 12460 }, { "epoch": 2.3470732166384343, - "grad_norm": 2.1787874698638916, - "learning_rate": 1.5917560700169396e-05, - "loss": 0.7166, + "grad_norm": 2.3379759788513184, + "learning_rate": 0.0002652926783361566, + "loss": 0.5609, "step": 12470 }, { "epoch": 2.3489553924336533, - "grad_norm": 9.78334903717041, - "learning_rate": 1.5906267645398083e-05, - "loss": 1.08, + "grad_norm": 2.5689191818237305, + "learning_rate": 0.00026510446075663467, + "loss": 1.118, "step": 12480 }, { "epoch": 2.3508375682288727, - "grad_norm": 26.472990036010742, - "learning_rate": 1.5894974590626765e-05, - "loss": 0.5742, + "grad_norm": 4.990429401397705, + "learning_rate": 0.00026491624317711275, + "loss": 0.4498, "step": 12490 }, { "epoch": 2.3527197440240917, - "grad_norm": 7.039992332458496, - "learning_rate": 1.5883681535855448e-05, - "loss": 0.4755, + "grad_norm": 5.412718772888184, + "learning_rate": 0.00026472802559759077, + "loss": 0.458, "step": 12500 }, { "epoch": 2.354601919819311, - "grad_norm": 12.98085880279541, - "learning_rate": 1.5872388481084134e-05, - "loss": 0.9198, + "grad_norm": 4.076390743255615, + "learning_rate": 0.0002645398080180689, + "loss": 0.9594, "step": 12510 }, { "epoch": 2.3564840956145305, - "grad_norm": 19.38686752319336, - "learning_rate": 1.5861095426312817e-05, - "loss": 0.512, + "grad_norm": 5.14693546295166, + "learning_rate": 0.000264351590438547, + "loss": 0.6583, "step": 12520 }, { "epoch": 2.3583662714097495, - "grad_norm": 16.51802635192871, - "learning_rate": 1.5849802371541503e-05, - "loss": 0.7799, + "grad_norm": 2.6487724781036377, + "learning_rate": 0.00026416337285902505, + "loss": 0.8533, "step": 12530 }, { "epoch": 2.360248447204969, - "grad_norm": 5.628615379333496, - "learning_rate": 1.5838509316770185e-05, - "loss": 0.8057, + "grad_norm": 1.8451364040374756, + "learning_rate": 0.00026397515527950313, + "loss": 0.7168, "step": 12540 }, { "epoch": 2.3621306230001884, - "grad_norm": 25.433387756347656, - "learning_rate": 1.582721626199887e-05, - "loss": 0.6807, + "grad_norm": 7.2082977294921875, + "learning_rate": 0.0002637869376999812, + "loss": 0.7608, "step": 12550 }, { "epoch": 2.3640127987954074, - "grad_norm": 11.819149017333984, - "learning_rate": 1.5815923207227558e-05, - "loss": 0.6426, + "grad_norm": 3.3436646461486816, + "learning_rate": 0.0002635987201204593, + "loss": 0.8196, "step": 12560 }, { "epoch": 2.365894974590627, - "grad_norm": 18.320758819580078, - "learning_rate": 1.580463015245624e-05, - "loss": 0.5381, + "grad_norm": 6.613883018493652, + "learning_rate": 0.00026341050254093736, + "loss": 0.5313, "step": 12570 }, { "epoch": 2.3677771503858462, - "grad_norm": 21.337982177734375, - "learning_rate": 1.5793337097684926e-05, - "loss": 0.6222, + "grad_norm": 5.031750679016113, + "learning_rate": 0.00026322228496141543, + "loss": 0.505, "step": 12580 }, { "epoch": 2.3696593261810652, - "grad_norm": 30.114587783813477, - "learning_rate": 1.5782044042913606e-05, - "loss": 0.8209, + "grad_norm": 3.989000082015991, + "learning_rate": 0.00026303406738189346, + "loss": 0.82, "step": 12590 }, { "epoch": 2.3715415019762847, - "grad_norm": 13.773634910583496, - "learning_rate": 1.5770750988142292e-05, - "loss": 0.7724, + "grad_norm": 3.471708297729492, + "learning_rate": 0.00026284584980237153, + "loss": 0.7288, "step": 12600 }, { "epoch": 2.3734236777715036, - "grad_norm": 7.200165748596191, - "learning_rate": 1.5759457933370978e-05, - "loss": 0.9503, + "grad_norm": 2.664614200592041, + "learning_rate": 0.0002626576322228496, + "loss": 0.8995, "step": 12610 }, { "epoch": 2.375305853566723, - "grad_norm": 16.634714126586914, - "learning_rate": 1.574816487859966e-05, - "loss": 0.5145, + "grad_norm": 2.755415439605713, + "learning_rate": 0.0002624694146433277, + "loss": 0.6285, "step": 12620 }, { "epoch": 2.3771880293619425, - "grad_norm": 13.062280654907227, - "learning_rate": 1.5736871823828347e-05, - "loss": 0.584, + "grad_norm": 4.312176704406738, + "learning_rate": 0.00026228119706380576, + "loss": 0.6126, "step": 12630 }, { "epoch": 2.3790702051571615, - "grad_norm": 15.320236206054688, - "learning_rate": 1.572557876905703e-05, - "loss": 1.109, + "grad_norm": 4.8685736656188965, + "learning_rate": 0.00026209297948428384, + "loss": 1.2171, "step": 12640 }, { "epoch": 2.380952380952381, - "grad_norm": 18.621957778930664, - "learning_rate": 1.5714285714285715e-05, - "loss": 0.5475, + "grad_norm": 1.3688360452651978, + "learning_rate": 0.0002619047619047619, + "loss": 0.6181, "step": 12650 }, { "epoch": 2.3828345567476004, - "grad_norm": 21.138986587524414, - "learning_rate": 1.57029926595144e-05, - "loss": 0.6148, + "grad_norm": 5.439986705780029, + "learning_rate": 0.00026171654432524, + "loss": 0.4511, "step": 12660 }, { "epoch": 2.3847167325428194, - "grad_norm": 7.1945929527282715, - "learning_rate": 1.5691699604743084e-05, - "loss": 0.6836, + "grad_norm": 2.772137403488159, + "learning_rate": 0.00026152832674571807, + "loss": 0.6025, "step": 12670 }, { "epoch": 2.386598908338039, - "grad_norm": 7.824986457824707, - "learning_rate": 1.5680406549971767e-05, - "loss": 1.0433, + "grad_norm": 3.3040308952331543, + "learning_rate": 0.0002613401091661961, + "loss": 1.046, "step": 12680 }, { "epoch": 2.388481084133258, - "grad_norm": 26.856182098388672, - "learning_rate": 1.566911349520045e-05, - "loss": 0.9668, + "grad_norm": 1.7851004600524902, + "learning_rate": 0.00026115189158667417, + "loss": 0.6861, "step": 12690 }, { "epoch": 2.390363259928477, - "grad_norm": 1.3294514417648315, - "learning_rate": 1.5657820440429136e-05, - "loss": 0.6016, + "grad_norm": 1.1720726490020752, + "learning_rate": 0.00026096367400715224, + "loss": 0.5638, "step": 12700 }, { "epoch": 2.3922454357236966, - "grad_norm": 25.570005416870117, - "learning_rate": 1.564652738565782e-05, - "loss": 0.6145, + "grad_norm": 2.6076951026916504, + "learning_rate": 0.0002607754564276303, + "loss": 0.8071, "step": 12710 }, { "epoch": 2.3941276115189156, - "grad_norm": 22.4234676361084, - "learning_rate": 1.5635234330886504e-05, - "loss": 0.7482, + "grad_norm": 2.853635549545288, + "learning_rate": 0.0002605872388481084, + "loss": 0.7192, "step": 12720 }, { "epoch": 2.396009787314135, - "grad_norm": 14.264887809753418, - "learning_rate": 1.562394127611519e-05, - "loss": 0.9606, + "grad_norm": 3.179858922958374, + "learning_rate": 0.0002603990212685865, + "loss": 0.8151, "step": 12730 }, { "epoch": 2.3978919631093545, - "grad_norm": 23.162593841552734, - "learning_rate": 1.5612648221343876e-05, - "loss": 0.6843, + "grad_norm": 3.948190689086914, + "learning_rate": 0.0002602108036890646, + "loss": 0.5294, "step": 12740 }, { "epoch": 2.399774138904574, - "grad_norm": 30.500329971313477, - "learning_rate": 1.560135516657256e-05, - "loss": 0.8691, + "grad_norm": 3.1576452255249023, + "learning_rate": 0.0002600225861095427, + "loss": 0.7788, "step": 12750 }, { "epoch": 2.401656314699793, - "grad_norm": 10.762563705444336, - "learning_rate": 1.5590062111801242e-05, - "loss": 0.7518, + "grad_norm": 4.19622278213501, + "learning_rate": 0.0002598343685300207, + "loss": 0.7703, "step": 12760 }, { "epoch": 2.4035384904950123, - "grad_norm": 38.12343978881836, - "learning_rate": 1.5578769057029925e-05, - "loss": 0.8324, + "grad_norm": 5.953280925750732, + "learning_rate": 0.0002596461509504988, + "loss": 0.8527, "step": 12770 }, { "epoch": 2.4054206662902313, - "grad_norm": 26.032785415649414, - "learning_rate": 1.556747600225861e-05, - "loss": 0.9027, + "grad_norm": 5.18197774887085, + "learning_rate": 0.00025945793337097685, + "loss": 0.9497, "step": 12780 }, { "epoch": 2.4073028420854508, - "grad_norm": 12.26636028289795, - "learning_rate": 1.5556182947487297e-05, - "loss": 0.6673, + "grad_norm": 2.9982385635375977, + "learning_rate": 0.00025926971579145493, + "loss": 0.8016, "step": 12790 }, { "epoch": 2.40918501788067, - "grad_norm": 22.655187606811523, - "learning_rate": 1.554488989271598e-05, - "loss": 0.9456, + "grad_norm": 3.964134931564331, + "learning_rate": 0.000259081498211933, + "loss": 1.0195, "step": 12800 }, { "epoch": 2.411067193675889, - "grad_norm": 8.51685619354248, - "learning_rate": 1.5533596837944665e-05, - "loss": 0.6657, + "grad_norm": 1.3622925281524658, + "learning_rate": 0.0002588932806324111, + "loss": 0.7713, "step": 12810 }, { "epoch": 2.4129493694711086, - "grad_norm": 8.790712356567383, - "learning_rate": 1.5522303783173348e-05, - "loss": 0.6555, + "grad_norm": 0.5836087465286255, + "learning_rate": 0.00025870506305288916, + "loss": 0.5852, "step": 12820 }, { "epoch": 2.414831545266328, - "grad_norm": 24.235885620117188, - "learning_rate": 1.5511010728402034e-05, - "loss": 0.5839, + "grad_norm": 0.42106154561042786, + "learning_rate": 0.00025851684547336724, + "loss": 0.744, "step": 12830 }, { "epoch": 2.416713721061547, - "grad_norm": 11.751230239868164, - "learning_rate": 1.549971767363072e-05, - "loss": 0.53, + "grad_norm": 4.040215492248535, + "learning_rate": 0.0002583286278938453, + "loss": 0.6945, "step": 12840 }, { "epoch": 2.4185958968567665, - "grad_norm": 11.033337593078613, - "learning_rate": 1.54884246188594e-05, - "loss": 0.7369, + "grad_norm": 1.9589953422546387, + "learning_rate": 0.00025814041031432334, + "loss": 0.7177, "step": 12850 }, { "epoch": 2.420478072651986, - "grad_norm": 11.11131763458252, - "learning_rate": 1.5477131564088086e-05, - "loss": 0.5856, + "grad_norm": 13.924626350402832, + "learning_rate": 0.0002579521927348014, + "loss": 0.5441, "step": 12860 }, { "epoch": 2.422360248447205, - "grad_norm": 31.728572845458984, - "learning_rate": 1.546583850931677e-05, - "loss": 0.8794, + "grad_norm": 4.234344959259033, + "learning_rate": 0.0002577639751552795, + "loss": 0.7613, "step": 12870 }, { "epoch": 2.4242424242424243, - "grad_norm": 15.746371269226074, - "learning_rate": 1.5454545454545454e-05, - "loss": 0.6741, + "grad_norm": 4.341315746307373, + "learning_rate": 0.00025757575757575756, + "loss": 0.6245, "step": 12880 }, { "epoch": 2.4261246000376433, - "grad_norm": 7.750115394592285, - "learning_rate": 1.544325239977414e-05, - "loss": 0.9826, + "grad_norm": 4.634925365447998, + "learning_rate": 0.00025738753999623564, + "loss": 0.9347, "step": 12890 }, { "epoch": 2.4280067758328627, - "grad_norm": 12.74660873413086, - "learning_rate": 1.5431959345002823e-05, - "loss": 0.7174, + "grad_norm": 4.451136589050293, + "learning_rate": 0.0002571993224167137, + "loss": 0.7098, "step": 12900 }, { "epoch": 2.429888951628082, - "grad_norm": 10.540325164794922, - "learning_rate": 1.542066629023151e-05, - "loss": 0.6789, + "grad_norm": 2.306314468383789, + "learning_rate": 0.0002570111048371918, + "loss": 0.8713, "step": 12910 }, { "epoch": 2.431771127423301, - "grad_norm": 16.440242767333984, - "learning_rate": 1.5409373235460192e-05, - "loss": 0.5718, + "grad_norm": 2.34293270111084, + "learning_rate": 0.00025682288725766987, + "loss": 0.5413, "step": 12920 }, { "epoch": 2.4336533032185206, - "grad_norm": 6.608902931213379, - "learning_rate": 1.5398080180688878e-05, - "loss": 0.7068, + "grad_norm": 2.1198337078094482, + "learning_rate": 0.00025663466967814795, + "loss": 0.7821, "step": 12930 }, { "epoch": 2.43553547901374, - "grad_norm": 9.328181266784668, - "learning_rate": 1.538678712591756e-05, - "loss": 0.5848, + "grad_norm": 2.277205228805542, + "learning_rate": 0.00025644645209862597, + "loss": 0.6455, "step": 12940 }, { "epoch": 2.437417654808959, - "grad_norm": 3.718935966491699, - "learning_rate": 1.5375494071146243e-05, - "loss": 0.7668, + "grad_norm": 1.2600597143173218, + "learning_rate": 0.00025625823451910405, + "loss": 0.8161, "step": 12950 }, { "epoch": 2.4392998306041784, - "grad_norm": 12.87177848815918, - "learning_rate": 1.536420101637493e-05, - "loss": 0.5857, + "grad_norm": 2.1574079990386963, + "learning_rate": 0.0002560700169395822, + "loss": 0.4708, "step": 12960 }, { "epoch": 2.441182006399398, - "grad_norm": 2.911306142807007, - "learning_rate": 1.5352907961603616e-05, - "loss": 0.8994, + "grad_norm": 1.071172833442688, + "learning_rate": 0.00025588179936006025, + "loss": 0.7987, "step": 12970 }, { "epoch": 2.443064182194617, - "grad_norm": 10.852980613708496, - "learning_rate": 1.5341614906832298e-05, - "loss": 0.8624, + "grad_norm": 3.36719012260437, + "learning_rate": 0.00025569358178053833, + "loss": 0.79, "step": 12980 }, { "epoch": 2.4449463579898363, - "grad_norm": 5.891432285308838, - "learning_rate": 1.5330321852060984e-05, - "loss": 0.7049, + "grad_norm": 4.198141098022461, + "learning_rate": 0.0002555053642010164, + "loss": 0.764, "step": 12990 }, { "epoch": 2.4468285337850557, - "grad_norm": 7.124139785766602, - "learning_rate": 1.5319028797289667e-05, - "loss": 0.7183, + "grad_norm": 2.2598376274108887, + "learning_rate": 0.0002553171466214945, + "loss": 0.6441, "step": 13000 }, { "epoch": 2.4487107095802747, - "grad_norm": 11.209503173828125, - "learning_rate": 1.5307735742518353e-05, - "loss": 0.5121, + "grad_norm": 4.295562267303467, + "learning_rate": 0.00025512892904197256, + "loss": 0.5728, "step": 13010 }, { "epoch": 2.450592885375494, - "grad_norm": 12.793279647827148, - "learning_rate": 1.529644268774704e-05, - "loss": 0.6083, + "grad_norm": 5.111675262451172, + "learning_rate": 0.00025494071146245063, + "loss": 0.6935, "step": 13020 }, { "epoch": 2.4524750611707136, - "grad_norm": 16.64006996154785, - "learning_rate": 1.528514963297572e-05, - "loss": 0.3401, + "grad_norm": 4.394181251525879, + "learning_rate": 0.00025475249388292866, + "loss": 0.4368, "step": 13030 }, { "epoch": 2.4543572369659326, - "grad_norm": 3.3853352069854736, - "learning_rate": 1.5273856578204405e-05, - "loss": 0.5073, + "grad_norm": 1.7228174209594727, + "learning_rate": 0.00025456427630340673, + "loss": 0.5746, "step": 13040 }, { "epoch": 2.456239412761152, - "grad_norm": 11.800168991088867, - "learning_rate": 1.5262563523433087e-05, - "loss": 0.4107, + "grad_norm": 1.8490511178970337, + "learning_rate": 0.0002543760587238848, + "loss": 0.432, "step": 13050 }, { "epoch": 2.458121588556371, - "grad_norm": 17.713973999023438, - "learning_rate": 1.5251270468661773e-05, - "loss": 0.4237, + "grad_norm": 2.733003854751587, + "learning_rate": 0.0002541878411443629, + "loss": 0.3822, "step": 13060 }, { "epoch": 2.4600037643515904, - "grad_norm": 14.999812126159668, - "learning_rate": 1.5239977413890458e-05, - "loss": 0.8685, + "grad_norm": 2.048168420791626, + "learning_rate": 0.00025399962356484096, + "loss": 0.758, "step": 13070 }, { "epoch": 2.46188594014681, - "grad_norm": 10.52674674987793, - "learning_rate": 1.5228684359119142e-05, - "loss": 0.8917, + "grad_norm": 1.9236838817596436, + "learning_rate": 0.00025381140598531904, + "loss": 0.8686, "step": 13080 }, { "epoch": 2.463768115942029, - "grad_norm": 21.042428970336914, - "learning_rate": 1.5217391304347828e-05, - "loss": 0.7128, + "grad_norm": 3.5483946800231934, + "learning_rate": 0.0002536231884057971, + "loss": 0.7579, "step": 13090 }, { "epoch": 2.4656502917372483, - "grad_norm": 0.1580163836479187, - "learning_rate": 1.5206098249576513e-05, - "loss": 0.9413, + "grad_norm": 0.027952812612056732, + "learning_rate": 0.0002534349708262752, + "loss": 0.8328, "step": 13100 }, { "epoch": 2.4675324675324677, - "grad_norm": 7.958024501800537, - "learning_rate": 1.5194805194805194e-05, - "loss": 0.4504, + "grad_norm": 2.3775055408477783, + "learning_rate": 0.0002532467532467532, + "loss": 0.4102, "step": 13110 }, { "epoch": 2.4694146433276867, - "grad_norm": 21.001428604125977, - "learning_rate": 1.5183512140033878e-05, - "loss": 0.9076, + "grad_norm": 3.9588053226470947, + "learning_rate": 0.0002530585356672313, + "loss": 0.9381, "step": 13120 }, { "epoch": 2.471296819122906, - "grad_norm": 14.668991088867188, - "learning_rate": 1.5172219085262562e-05, - "loss": 0.7698, + "grad_norm": 5.909026145935059, + "learning_rate": 0.00025287031808770937, + "loss": 0.7946, "step": 13130 }, { "epoch": 2.4731789949181255, - "grad_norm": 9.113917350769043, - "learning_rate": 1.5160926030491248e-05, - "loss": 0.8785, + "grad_norm": 2.5847971439361572, + "learning_rate": 0.00025268210050818744, + "loss": 0.7612, "step": 13140 }, { "epoch": 2.4750611707133445, - "grad_norm": 16.353412628173828, - "learning_rate": 1.5149632975719933e-05, - "loss": 0.7005, + "grad_norm": 3.247206211090088, + "learning_rate": 0.0002524938829286655, + "loss": 0.6497, "step": 13150 }, { "epoch": 2.476943346508564, - "grad_norm": 7.100589275360107, - "learning_rate": 1.5138339920948617e-05, - "loss": 0.5536, + "grad_norm": 2.7777774333953857, + "learning_rate": 0.0002523056653491436, + "loss": 0.7623, "step": 13160 }, { "epoch": 2.478825522303783, - "grad_norm": 10.338568687438965, - "learning_rate": 1.5127046866177302e-05, - "loss": 0.794, + "grad_norm": 3.336306095123291, + "learning_rate": 0.00025211744776962167, + "loss": 0.7382, "step": 13170 }, { "epoch": 2.4807076980990024, - "grad_norm": 19.2764949798584, - "learning_rate": 1.5115753811405988e-05, - "loss": 0.7741, + "grad_norm": 2.404705286026001, + "learning_rate": 0.0002519292301900998, + "loss": 0.7361, "step": 13180 }, { "epoch": 2.482589873894222, - "grad_norm": 41.238563537597656, - "learning_rate": 1.5104460756634672e-05, - "loss": 0.5174, + "grad_norm": 4.079376220703125, + "learning_rate": 0.0002517410126105779, + "loss": 0.5151, "step": 13190 }, { "epoch": 2.4844720496894412, - "grad_norm": 14.756248474121094, - "learning_rate": 1.5093167701863353e-05, - "loss": 0.6969, + "grad_norm": 2.1398980617523193, + "learning_rate": 0.0002515527950310559, + "loss": 0.6292, "step": 13200 }, { "epoch": 2.4863542254846602, - "grad_norm": 25.224035263061523, - "learning_rate": 1.5081874647092037e-05, - "loss": 0.7373, + "grad_norm": 4.041257858276367, + "learning_rate": 0.000251364577451534, + "loss": 0.6575, "step": 13210 }, { "epoch": 2.4882364012798797, - "grad_norm": 27.549787521362305, - "learning_rate": 1.5070581592320722e-05, - "loss": 0.7657, + "grad_norm": 3.372771739959717, + "learning_rate": 0.00025117635987201205, + "loss": 0.591, "step": 13220 }, { "epoch": 2.4901185770750986, - "grad_norm": 13.31575870513916, - "learning_rate": 1.5059288537549408e-05, - "loss": 0.8013, + "grad_norm": 3.360886573791504, + "learning_rate": 0.00025098814229249013, + "loss": 0.728, "step": 13230 }, { "epoch": 2.492000752870318, - "grad_norm": 15.506152153015137, - "learning_rate": 1.5047995482778092e-05, - "loss": 0.8362, + "grad_norm": 3.187817335128784, + "learning_rate": 0.0002507999247129682, + "loss": 0.7468, "step": 13240 }, { "epoch": 2.4938829286655375, - "grad_norm": 12.64037036895752, - "learning_rate": 1.5036702428006777e-05, - "loss": 0.7907, + "grad_norm": 1.4808297157287598, + "learning_rate": 0.0002506117071334463, + "loss": 0.6668, "step": 13250 }, { "epoch": 2.4957651044607565, - "grad_norm": 7.721663475036621, - "learning_rate": 1.5025409373235461e-05, - "loss": 0.6027, + "grad_norm": 1.4633982181549072, + "learning_rate": 0.00025042348955392436, + "loss": 0.6514, "step": 13260 }, { "epoch": 2.497647280255976, - "grad_norm": 20.326704025268555, - "learning_rate": 1.5014116318464145e-05, - "loss": 0.7932, + "grad_norm": 4.78045129776001, + "learning_rate": 0.00025023527197440244, + "loss": 0.7101, "step": 13270 }, { "epoch": 2.4995294560511954, - "grad_norm": 3.6941041946411133, - "learning_rate": 1.5002823263692831e-05, - "loss": 0.6615, + "grad_norm": 2.0362279415130615, + "learning_rate": 0.0002500470543948805, + "loss": 0.7201, "step": 13280 }, { "epoch": 2.5014116318464144, - "grad_norm": 23.14988899230957, - "learning_rate": 1.4991530208921514e-05, - "loss": 0.6778, + "grad_norm": 2.6918833255767822, + "learning_rate": 0.0002498588368153586, + "loss": 0.6822, "step": 13290 }, { "epoch": 2.503293807641634, - "grad_norm": 8.961627960205078, - "learning_rate": 1.4980237154150198e-05, - "loss": 0.5089, + "grad_norm": 5.8409504890441895, + "learning_rate": 0.00024967061923583667, + "loss": 0.4258, "step": 13300 }, { "epoch": 2.505175983436853, - "grad_norm": 17.82870864868164, - "learning_rate": 1.4968944099378881e-05, - "loss": 0.8967, + "grad_norm": 3.082871437072754, + "learning_rate": 0.0002494824016563147, + "loss": 0.9643, "step": 13310 }, { "epoch": 2.507058159232072, - "grad_norm": 3.7372190952301025, - "learning_rate": 1.4957651044607567e-05, - "loss": 0.6563, + "grad_norm": 0.6219594478607178, + "learning_rate": 0.00024929418407679276, + "loss": 0.5603, "step": 13320 }, { "epoch": 2.5089403350272916, - "grad_norm": 17.692962646484375, - "learning_rate": 1.4946357989836252e-05, - "loss": 0.9301, + "grad_norm": 3.4000182151794434, + "learning_rate": 0.00024910596649727084, + "loss": 0.7337, "step": 13330 }, { "epoch": 2.5108225108225106, - "grad_norm": 11.346606254577637, - "learning_rate": 1.4935064935064936e-05, - "loss": 0.689, + "grad_norm": 2.250074625015259, + "learning_rate": 0.0002489177489177489, + "loss": 0.6619, "step": 13340 }, { "epoch": 2.51270468661773, - "grad_norm": 25.42557144165039, - "learning_rate": 1.4923771880293619e-05, - "loss": 0.6755, + "grad_norm": 8.501276969909668, + "learning_rate": 0.000248729531338227, + "loss": 0.5692, "step": 13350 }, { "epoch": 2.5145868624129495, - "grad_norm": 16.371252059936523, - "learning_rate": 1.4912478825522303e-05, - "loss": 0.6872, + "grad_norm": 2.2752902507781982, + "learning_rate": 0.00024854131375870507, + "loss": 0.6721, "step": 13360 }, { "epoch": 2.516469038208169, - "grad_norm": 10.551392555236816, - "learning_rate": 1.4901185770750989e-05, - "loss": 0.3674, + "grad_norm": 3.7881362438201904, + "learning_rate": 0.00024835309617918315, + "loss": 0.5059, "step": 13370 }, { "epoch": 2.518351214003388, - "grad_norm": 7.954850673675537, - "learning_rate": 1.4889892715979674e-05, - "loss": 0.445, + "grad_norm": 5.592350006103516, + "learning_rate": 0.0002481648785996612, + "loss": 0.4467, "step": 13380 }, { "epoch": 2.5202333897986073, - "grad_norm": 6.890119552612305, - "learning_rate": 1.4878599661208358e-05, - "loss": 0.5636, + "grad_norm": 2.011734962463379, + "learning_rate": 0.0002479766610201393, + "loss": 0.6093, "step": 13390 }, { "epoch": 2.5221155655938263, - "grad_norm": 11.65666389465332, - "learning_rate": 1.486730660643704e-05, - "loss": 0.6063, + "grad_norm": 2.0985467433929443, + "learning_rate": 0.0002477884434406174, + "loss": 0.8551, "step": 13400 }, { "epoch": 2.5239977413890458, - "grad_norm": 17.123275756835938, - "learning_rate": 1.4856013551665727e-05, - "loss": 0.615, + "grad_norm": 3.176607608795166, + "learning_rate": 0.00024760022586109545, + "loss": 0.7844, "step": 13410 }, { "epoch": 2.525879917184265, - "grad_norm": 24.541301727294922, - "learning_rate": 1.4844720496894411e-05, - "loss": 0.9433, + "grad_norm": 5.991983890533447, + "learning_rate": 0.00024741200828157353, + "loss": 1.1296, "step": 13420 }, { "epoch": 2.527762092979484, - "grad_norm": 11.410900115966797, - "learning_rate": 1.4833427442123095e-05, - "loss": 0.5644, + "grad_norm": 2.146676540374756, + "learning_rate": 0.0002472237907020516, + "loss": 0.5867, "step": 13430 }, { "epoch": 2.5296442687747036, - "grad_norm": 7.17426872253418, - "learning_rate": 1.4822134387351778e-05, - "loss": 0.5348, + "grad_norm": 1.7108885049819946, + "learning_rate": 0.00024703557312252963, + "loss": 0.5653, "step": 13440 }, { "epoch": 2.5315264445699226, - "grad_norm": 19.93268585205078, - "learning_rate": 1.4810841332580463e-05, - "loss": 0.9548, + "grad_norm": 4.368906497955322, + "learning_rate": 0.0002468473555430077, + "loss": 0.9203, "step": 13450 }, { "epoch": 2.533408620365142, - "grad_norm": 12.935050964355469, - "learning_rate": 1.4799548277809149e-05, - "loss": 0.6989, + "grad_norm": 1.6300593614578247, + "learning_rate": 0.0002466591379634858, + "loss": 0.5578, "step": 13460 }, { "epoch": 2.5352907961603615, - "grad_norm": 19.485132217407227, - "learning_rate": 1.4788255223037833e-05, - "loss": 0.7934, + "grad_norm": 6.861873149871826, + "learning_rate": 0.00024647092038396386, + "loss": 0.7893, "step": 13470 }, { "epoch": 2.537172971955581, - "grad_norm": 28.300479888916016, - "learning_rate": 1.4776962168266516e-05, - "loss": 0.6659, + "grad_norm": 3.013028860092163, + "learning_rate": 0.00024628270280444193, + "loss": 0.4753, "step": 13480 }, { "epoch": 2.5390551477508, - "grad_norm": 27.766523361206055, - "learning_rate": 1.47656691134952e-05, - "loss": 1.0873, + "grad_norm": 3.5351712703704834, + "learning_rate": 0.00024609448522492, + "loss": 1.182, "step": 13490 }, { "epoch": 2.5409373235460193, - "grad_norm": 23.600576400756836, - "learning_rate": 1.4754376058723884e-05, - "loss": 0.5181, + "grad_norm": 2.7321786880493164, + "learning_rate": 0.0002459062676453981, + "loss": 0.6641, "step": 13500 }, { "epoch": 2.5428194993412383, - "grad_norm": 12.327131271362305, - "learning_rate": 1.474308300395257e-05, - "loss": 0.7655, + "grad_norm": 2.764721393585205, + "learning_rate": 0.00024571805006587616, + "loss": 0.6495, "step": 13510 }, { "epoch": 2.5447016751364577, - "grad_norm": 19.353673934936523, - "learning_rate": 1.4731789949181255e-05, - "loss": 0.8238, + "grad_norm": 4.903502941131592, + "learning_rate": 0.00024552983248635424, + "loss": 0.6116, "step": 13520 }, { "epoch": 2.546583850931677, - "grad_norm": 23.07147979736328, - "learning_rate": 1.4720496894409938e-05, - "loss": 0.8695, + "grad_norm": 5.111001014709473, + "learning_rate": 0.0002453416149068323, + "loss": 0.9473, "step": 13530 }, { "epoch": 2.548466026726896, - "grad_norm": 19.188295364379883, - "learning_rate": 1.4709203839638622e-05, - "loss": 0.7139, + "grad_norm": 1.7780566215515137, + "learning_rate": 0.0002451533973273104, + "loss": 0.7633, "step": 13540 }, { "epoch": 2.5503482025221156, - "grad_norm": 8.434711456298828, - "learning_rate": 1.4697910784867308e-05, - "loss": 0.8626, + "grad_norm": 5.7975850105285645, + "learning_rate": 0.00024496517974778847, + "loss": 0.8065, "step": 13550 }, { "epoch": 2.5522303783173346, - "grad_norm": 4.494293212890625, - "learning_rate": 1.4686617730095992e-05, - "loss": 0.7623, + "grad_norm": 2.282766819000244, + "learning_rate": 0.00024477696216826654, + "loss": 0.7373, "step": 13560 }, { "epoch": 2.554112554112554, - "grad_norm": 17.76078224182129, - "learning_rate": 1.4675324675324675e-05, - "loss": 0.7771, + "grad_norm": 3.5103821754455566, + "learning_rate": 0.00024458874458874457, + "loss": 0.7865, "step": 13570 }, { "epoch": 2.5559947299077734, - "grad_norm": 18.283334732055664, - "learning_rate": 1.466403162055336e-05, - "loss": 0.615, + "grad_norm": 4.03853178024292, + "learning_rate": 0.00024440052700922264, + "loss": 0.8197, "step": 13580 }, { "epoch": 2.557876905702993, - "grad_norm": 15.662129402160645, - "learning_rate": 1.4652738565782044e-05, - "loss": 1.0202, + "grad_norm": 2.961564540863037, + "learning_rate": 0.0002442123094297007, + "loss": 1.087, "step": 13590 }, { "epoch": 2.559759081498212, - "grad_norm": 11.736997604370117, - "learning_rate": 1.464144551101073e-05, - "loss": 0.6311, + "grad_norm": 4.064123153686523, + "learning_rate": 0.00024402409185017882, + "loss": 0.5747, "step": 13600 }, { "epoch": 2.5616412572934313, - "grad_norm": 26.242910385131836, - "learning_rate": 1.4630152456239413e-05, - "loss": 0.5408, + "grad_norm": 2.7881264686584473, + "learning_rate": 0.00024383587427065687, + "loss": 0.6466, "step": 13610 }, { "epoch": 2.5635234330886503, - "grad_norm": 34.35280227661133, - "learning_rate": 1.4618859401468097e-05, - "loss": 0.5723, + "grad_norm": 3.5973377227783203, + "learning_rate": 0.00024364765669113495, + "loss": 0.5321, "step": 13620 }, { "epoch": 2.5654056088838697, - "grad_norm": 12.662686347961426, - "learning_rate": 1.4607566346696781e-05, - "loss": 0.9724, + "grad_norm": 1.8289278745651245, + "learning_rate": 0.00024345943911161303, + "loss": 1.026, "step": 13630 }, { "epoch": 2.567287784679089, - "grad_norm": 7.174187660217285, - "learning_rate": 1.4596273291925466e-05, - "loss": 0.7687, + "grad_norm": 3.8798322677612305, + "learning_rate": 0.0002432712215320911, + "loss": 0.8, "step": 13640 }, { "epoch": 2.5691699604743086, - "grad_norm": 13.642120361328125, - "learning_rate": 1.4584980237154152e-05, - "loss": 0.5166, + "grad_norm": 6.2854461669921875, + "learning_rate": 0.00024308300395256918, + "loss": 0.5575, "step": 13650 }, { "epoch": 2.5710521362695276, - "grad_norm": 18.49991798400879, - "learning_rate": 1.4573687182382835e-05, - "loss": 0.889, + "grad_norm": 2.248408794403076, + "learning_rate": 0.00024289478637304723, + "loss": 0.8251, "step": 13660 }, { "epoch": 2.572934312064747, - "grad_norm": 15.515763282775879, - "learning_rate": 1.4562394127611519e-05, - "loss": 0.7712, + "grad_norm": 3.2871415615081787, + "learning_rate": 0.0002427065687935253, + "loss": 0.8645, "step": 13670 }, { "epoch": 2.574816487859966, - "grad_norm": 9.801226615905762, - "learning_rate": 1.4551101072840203e-05, - "loss": 1.0405, + "grad_norm": 2.0668184757232666, + "learning_rate": 0.0002425183512140034, + "loss": 1.0611, "step": 13680 }, { "epoch": 2.5766986636551854, - "grad_norm": 8.876042366027832, - "learning_rate": 1.453980801806889e-05, - "loss": 0.3575, + "grad_norm": 2.037912368774414, + "learning_rate": 0.00024233013363448148, + "loss": 0.3269, "step": 13690 }, { "epoch": 2.578580839450405, - "grad_norm": 23.405811309814453, - "learning_rate": 1.4528514963297572e-05, - "loss": 1.0442, + "grad_norm": 4.442802906036377, + "learning_rate": 0.00024214191605495953, + "loss": 0.9673, "step": 13700 }, { "epoch": 2.580463015245624, - "grad_norm": 19.530874252319336, - "learning_rate": 1.4517221908526256e-05, - "loss": 0.7402, + "grad_norm": 5.721139430999756, + "learning_rate": 0.0002419536984754376, + "loss": 0.627, "step": 13710 }, { "epoch": 2.5823451910408433, - "grad_norm": 14.73330020904541, - "learning_rate": 1.450592885375494e-05, - "loss": 0.6519, + "grad_norm": 12.355080604553223, + "learning_rate": 0.00024176548089591569, + "loss": 0.6522, "step": 13720 }, { "epoch": 2.5842273668360622, - "grad_norm": 31.803998947143555, - "learning_rate": 1.4494635798983625e-05, - "loss": 1.0237, + "grad_norm": 4.2488813400268555, + "learning_rate": 0.00024157726331639376, + "loss": 0.8524, "step": 13730 }, { "epoch": 2.5861095426312817, - "grad_norm": 25.37823486328125, - "learning_rate": 1.448334274421231e-05, - "loss": 0.9164, + "grad_norm": 5.772805690765381, + "learning_rate": 0.0002413890457368718, + "loss": 0.8406, "step": 13740 }, { "epoch": 2.587991718426501, - "grad_norm": 25.509199142456055, - "learning_rate": 1.4472049689440994e-05, - "loss": 0.76, + "grad_norm": 4.153681755065918, + "learning_rate": 0.0002412008281573499, + "loss": 0.6873, "step": 13750 }, { "epoch": 2.5898738942217205, - "grad_norm": 4.799928665161133, - "learning_rate": 1.4460756634669678e-05, - "loss": 0.2943, + "grad_norm": 0.7333030104637146, + "learning_rate": 0.00024101261057782796, + "loss": 0.2909, "step": 13760 }, { "epoch": 2.5917560700169395, - "grad_norm": 12.444110870361328, - "learning_rate": 1.4449463579898363e-05, - "loss": 0.8041, + "grad_norm": 0.3361065983772278, + "learning_rate": 0.00024082439299830604, + "loss": 0.8215, "step": 13770 }, { "epoch": 2.593638245812159, - "grad_norm": 16.842971801757812, - "learning_rate": 1.4438170525127047e-05, - "loss": 0.6848, + "grad_norm": 3.8252694606781006, + "learning_rate": 0.00024063617541878414, + "loss": 0.521, "step": 13780 }, { "epoch": 2.595520421607378, - "grad_norm": 21.784448623657227, - "learning_rate": 1.4426877470355732e-05, - "loss": 0.7807, + "grad_norm": 4.091780185699463, + "learning_rate": 0.0002404479578392622, + "loss": 0.8096, "step": 13790 }, { "epoch": 2.5974025974025974, - "grad_norm": 1.9101155996322632, - "learning_rate": 1.4415584415584416e-05, - "loss": 0.4079, + "grad_norm": 1.5982844829559326, + "learning_rate": 0.00024025974025974027, + "loss": 0.4528, "step": 13800 }, { "epoch": 2.599284773197817, - "grad_norm": 10.34659194946289, - "learning_rate": 1.44042913608131e-05, - "loss": 0.9427, + "grad_norm": 3.969874620437622, + "learning_rate": 0.00024007152268021835, + "loss": 0.9265, "step": 13810 }, { "epoch": 2.601166948993036, - "grad_norm": 32.64616394042969, - "learning_rate": 1.4392998306041785e-05, - "loss": 0.968, + "grad_norm": 3.207587242126465, + "learning_rate": 0.00023988330510069642, + "loss": 1.0132, "step": 13820 }, { "epoch": 2.6030491247882552, - "grad_norm": 23.128334045410156, - "learning_rate": 1.4381705251270469e-05, - "loss": 0.8594, + "grad_norm": 4.083827495574951, + "learning_rate": 0.00023969508752117447, + "loss": 0.7304, "step": 13830 }, { "epoch": 2.6049313005834747, - "grad_norm": 13.279285430908203, - "learning_rate": 1.4370412196499153e-05, - "loss": 0.7133, + "grad_norm": 3.4876480102539062, + "learning_rate": 0.00023950686994165255, + "loss": 0.618, "step": 13840 }, { "epoch": 2.6068134763786937, - "grad_norm": 3.632760763168335, - "learning_rate": 1.4359119141727838e-05, - "loss": 0.4146, + "grad_norm": 0.7380732297897339, + "learning_rate": 0.00023931865236213063, + "loss": 0.4644, "step": 13850 }, { "epoch": 2.608695652173913, - "grad_norm": 5.402322769165039, - "learning_rate": 1.4347826086956522e-05, - "loss": 0.4458, + "grad_norm": 1.9918617010116577, + "learning_rate": 0.0002391304347826087, + "loss": 0.3434, "step": 13860 }, { "epoch": 2.6105778279691325, - "grad_norm": 18.91129493713379, - "learning_rate": 1.4336533032185207e-05, - "loss": 0.7429, + "grad_norm": 0.4870288074016571, + "learning_rate": 0.00023894221720308678, + "loss": 0.7492, "step": 13870 }, { "epoch": 2.6124600037643515, - "grad_norm": 29.642852783203125, - "learning_rate": 1.4325239977413891e-05, - "loss": 0.6789, + "grad_norm": 3.0908610820770264, + "learning_rate": 0.00023875399962356483, + "loss": 0.7168, "step": 13880 }, { "epoch": 2.614342179559571, - "grad_norm": 26.60822868347168, - "learning_rate": 1.4313946922642575e-05, - "loss": 0.708, + "grad_norm": 2.242732524871826, + "learning_rate": 0.0002385657820440429, + "loss": 0.7139, "step": 13890 }, { "epoch": 2.61622435535479, - "grad_norm": 8.151140213012695, - "learning_rate": 1.430265386787126e-05, - "loss": 0.9136, + "grad_norm": 4.407373905181885, + "learning_rate": 0.000238377564464521, + "loss": 1.0584, "step": 13900 }, { "epoch": 2.6181065311500094, - "grad_norm": 9.463515281677246, - "learning_rate": 1.4291360813099944e-05, - "loss": 0.7923, + "grad_norm": 4.035942077636719, + "learning_rate": 0.00023818934688499908, + "loss": 0.7392, "step": 13910 }, { "epoch": 2.619988706945229, - "grad_norm": 21.169139862060547, - "learning_rate": 1.4280067758328628e-05, - "loss": 0.7719, + "grad_norm": 5.146146774291992, + "learning_rate": 0.00023800112930547713, + "loss": 0.7454, "step": 13920 }, { "epoch": 2.621870882740448, - "grad_norm": 2.78265643119812, - "learning_rate": 1.4268774703557313e-05, - "loss": 0.564, + "grad_norm": 2.152848482131958, + "learning_rate": 0.0002378129117259552, + "loss": 0.5348, "step": 13930 }, { "epoch": 2.623753058535667, - "grad_norm": 10.70055103302002, - "learning_rate": 1.4257481648785997e-05, - "loss": 0.7906, + "grad_norm": 4.042505741119385, + "learning_rate": 0.00023762469414643329, + "loss": 0.673, "step": 13940 }, { "epoch": 2.6256352343308866, - "grad_norm": 6.613348007202148, - "learning_rate": 1.4246188594014682e-05, - "loss": 0.4984, + "grad_norm": 4.227035999298096, + "learning_rate": 0.00023743647656691136, + "loss": 0.4952, "step": 13950 }, { "epoch": 2.6275174101261056, - "grad_norm": 13.339488983154297, - "learning_rate": 1.4234895539243364e-05, - "loss": 0.9332, + "grad_norm": 3.608710289001465, + "learning_rate": 0.0002372482589873894, + "loss": 0.9144, "step": 13960 }, { "epoch": 2.629399585921325, - "grad_norm": 11.467644691467285, - "learning_rate": 1.422360248447205e-05, - "loss": 0.5156, + "grad_norm": 3.100811004638672, + "learning_rate": 0.0002370600414078675, + "loss": 0.6548, "step": 13970 }, { "epoch": 2.6312817617165445, - "grad_norm": 29.80975914001465, - "learning_rate": 1.4212309429700735e-05, - "loss": 0.5243, + "grad_norm": 3.055030584335327, + "learning_rate": 0.00023687182382834556, + "loss": 0.4532, "step": 13980 }, { "epoch": 2.6331639375117635, - "grad_norm": 18.506032943725586, - "learning_rate": 1.4201016374929419e-05, - "loss": 0.9703, + "grad_norm": 3.4377622604370117, + "learning_rate": 0.00023668360624882364, + "loss": 0.9787, "step": 13990 }, { "epoch": 2.635046113306983, - "grad_norm": 12.98060131072998, - "learning_rate": 1.4189723320158104e-05, - "loss": 0.9162, + "grad_norm": 5.330870628356934, + "learning_rate": 0.00023649538866930172, + "loss": 0.8944, "step": 14000 }, { "epoch": 2.636928289102202, - "grad_norm": 3.217790365219116, - "learning_rate": 1.4178430265386786e-05, - "loss": 0.6168, + "grad_norm": 1.1406224966049194, + "learning_rate": 0.0002363071710897798, + "loss": 0.5997, "step": 14010 }, { "epoch": 2.6388104648974213, - "grad_norm": 12.240312576293945, - "learning_rate": 1.4167137210615472e-05, - "loss": 0.587, + "grad_norm": 2.4622602462768555, + "learning_rate": 0.00023611895351025787, + "loss": 0.6162, "step": 14020 }, { "epoch": 2.6406926406926408, - "grad_norm": 17.046951293945312, - "learning_rate": 1.4155844155844157e-05, - "loss": 0.4854, + "grad_norm": 0.7391192317008972, + "learning_rate": 0.00023593073593073595, + "loss": 0.4323, "step": 14030 }, { "epoch": 2.64257481648786, - "grad_norm": 20.683822631835938, - "learning_rate": 1.4144551101072841e-05, - "loss": 0.5354, + "grad_norm": 4.043608665466309, + "learning_rate": 0.00023574251835121402, + "loss": 0.6287, "step": 14040 }, { "epoch": 2.644456992283079, - "grad_norm": 19.47835350036621, - "learning_rate": 1.4133258046301524e-05, - "loss": 0.5272, + "grad_norm": 4.61051082611084, + "learning_rate": 0.00023555430077169207, + "loss": 0.691, "step": 14050 }, { "epoch": 2.6463391680782986, - "grad_norm": 5.546383380889893, - "learning_rate": 1.412196499153021e-05, - "loss": 0.7365, + "grad_norm": 0.22076202929019928, + "learning_rate": 0.00023536608319217015, + "loss": 0.8374, "step": 14060 }, { "epoch": 2.6482213438735176, - "grad_norm": 19.25607681274414, - "learning_rate": 1.4110671936758894e-05, - "loss": 0.4252, + "grad_norm": 5.464056968688965, + "learning_rate": 0.00023517786561264823, + "loss": 0.3803, "step": 14070 }, { "epoch": 2.650103519668737, - "grad_norm": 5.1752448081970215, - "learning_rate": 1.4099378881987579e-05, - "loss": 0.7042, + "grad_norm": 0.7410657405853271, + "learning_rate": 0.0002349896480331263, + "loss": 0.6079, "step": 14080 }, { "epoch": 2.6519856954639565, - "grad_norm": 1.2071647644042969, - "learning_rate": 1.4088085827216261e-05, - "loss": 0.5272, + "grad_norm": 3.5357539653778076, + "learning_rate": 0.00023480143045360435, + "loss": 0.5994, "step": 14090 }, { "epoch": 2.6538678712591754, - "grad_norm": 33.30137252807617, - "learning_rate": 1.4076792772444946e-05, - "loss": 0.7419, + "grad_norm": 1.7675532102584839, + "learning_rate": 0.00023461321287408243, + "loss": 0.7059, "step": 14100 }, { "epoch": 2.655750047054395, - "grad_norm": 2.014209508895874, - "learning_rate": 1.4065499717673632e-05, - "loss": 0.7304, + "grad_norm": 2.598212480545044, + "learning_rate": 0.0002344249952945605, + "loss": 0.6269, "step": 14110 }, { "epoch": 2.6576322228496143, - "grad_norm": 24.12761116027832, - "learning_rate": 1.4054206662902316e-05, - "loss": 0.859, + "grad_norm": 5.439962387084961, + "learning_rate": 0.0002342367777150386, + "loss": 0.9013, "step": 14120 }, { "epoch": 2.6595143986448333, - "grad_norm": 13.404014587402344, - "learning_rate": 1.4042913608131e-05, - "loss": 0.7206, + "grad_norm": 2.2769856452941895, + "learning_rate": 0.00023404856013551668, + "loss": 0.7557, "step": 14130 }, { "epoch": 2.6613965744400527, - "grad_norm": 28.344799041748047, - "learning_rate": 1.4031620553359683e-05, - "loss": 0.7208, + "grad_norm": 4.408846855163574, + "learning_rate": 0.00023386034255599473, + "loss": 0.6591, "step": 14140 }, { "epoch": 2.663278750235272, - "grad_norm": 23.86800765991211, - "learning_rate": 1.4020327498588368e-05, - "loss": 0.8126, + "grad_norm": 5.050843715667725, + "learning_rate": 0.0002336721249764728, + "loss": 0.8656, "step": 14150 }, { "epoch": 2.665160926030491, - "grad_norm": 14.190437316894531, - "learning_rate": 1.4009034443817054e-05, - "loss": 0.6967, + "grad_norm": 3.493145704269409, + "learning_rate": 0.00023348390739695089, + "loss": 0.5883, "step": 14160 }, { "epoch": 2.6670431018257106, - "grad_norm": 0.3645530045032501, - "learning_rate": 1.3997741389045738e-05, - "loss": 0.4742, + "grad_norm": 0.21144308149814606, + "learning_rate": 0.00023329568981742896, + "loss": 0.4608, "step": 14170 }, { "epoch": 2.6689252776209296, - "grad_norm": 15.038154602050781, - "learning_rate": 1.398644833427442e-05, - "loss": 1.0225, + "grad_norm": 5.041484832763672, + "learning_rate": 0.000233107472237907, + "loss": 0.9009, "step": 14180 }, { "epoch": 2.670807453416149, - "grad_norm": 12.963705062866211, - "learning_rate": 1.3975155279503105e-05, - "loss": 0.6174, + "grad_norm": 2.556727409362793, + "learning_rate": 0.0002329192546583851, + "loss": 0.6786, "step": 14190 }, { "epoch": 2.6726896292113684, - "grad_norm": 15.526269912719727, - "learning_rate": 1.3963862224731791e-05, - "loss": 0.6071, + "grad_norm": 3.09854793548584, + "learning_rate": 0.00023273103707886316, + "loss": 0.6226, "step": 14200 }, { "epoch": 2.674571805006588, - "grad_norm": 12.310412406921387, - "learning_rate": 1.3952569169960476e-05, - "loss": 1.0341, + "grad_norm": 2.4376440048217773, + "learning_rate": 0.00023254281949934124, + "loss": 0.9534, "step": 14210 }, { "epoch": 2.676453980801807, - "grad_norm": 17.386167526245117, - "learning_rate": 1.3941276115189158e-05, - "loss": 0.9449, + "grad_norm": 1.3566420078277588, + "learning_rate": 0.0002323546019198193, + "loss": 0.9804, "step": 14220 }, { "epoch": 2.6783361565970263, - "grad_norm": 3.0282480716705322, - "learning_rate": 1.3929983060417843e-05, - "loss": 0.4089, + "grad_norm": 0.6485238671302795, + "learning_rate": 0.0002321663843402974, + "loss": 0.4259, "step": 14230 }, { "epoch": 2.6802183323922453, - "grad_norm": 1.6062228679656982, - "learning_rate": 1.3918690005646527e-05, - "loss": 0.4825, + "grad_norm": 0.4050973653793335, + "learning_rate": 0.00023197816676077547, + "loss": 0.5302, "step": 14240 }, { "epoch": 2.6821005081874647, - "grad_norm": 1.4682273864746094, - "learning_rate": 1.3907396950875213e-05, - "loss": 0.597, + "grad_norm": 2.0743534564971924, + "learning_rate": 0.00023178994918125355, + "loss": 0.6506, "step": 14250 }, { "epoch": 2.683982683982684, - "grad_norm": 19.706525802612305, - "learning_rate": 1.3896103896103897e-05, - "loss": 0.9402, + "grad_norm": 3.749516487121582, + "learning_rate": 0.00023160173160173162, + "loss": 0.9767, "step": 14260 }, { "epoch": 2.685864859777903, - "grad_norm": 13.49492073059082, - "learning_rate": 1.388481084133258e-05, - "loss": 1.1029, + "grad_norm": 2.801708936691284, + "learning_rate": 0.00023141351402220967, + "loss": 1.0014, "step": 14270 }, { "epoch": 2.6877470355731226, - "grad_norm": 27.556337356567383, - "learning_rate": 1.3873517786561265e-05, - "loss": 0.789, + "grad_norm": 6.906372547149658, + "learning_rate": 0.00023122529644268775, + "loss": 0.7968, "step": 14280 }, { "epoch": 2.6896292113683415, - "grad_norm": 16.97081756591797, - "learning_rate": 1.3862224731789949e-05, - "loss": 0.5639, + "grad_norm": 3.7884345054626465, + "learning_rate": 0.00023103707886316583, + "loss": 0.7028, "step": 14290 }, { "epoch": 2.691511387163561, - "grad_norm": 24.679975509643555, - "learning_rate": 1.3850931677018635e-05, - "loss": 0.7003, + "grad_norm": 1.527142882347107, + "learning_rate": 0.0002308488612836439, + "loss": 0.6148, "step": 14300 }, { "epoch": 2.6933935629587804, - "grad_norm": 24.730314254760742, - "learning_rate": 1.3839638622247318e-05, - "loss": 0.6968, + "grad_norm": 2.611942768096924, + "learning_rate": 0.00023066064370412195, + "loss": 0.6652, "step": 14310 }, { "epoch": 2.695275738754, - "grad_norm": 13.802581787109375, - "learning_rate": 1.3828345567476002e-05, - "loss": 0.7957, + "grad_norm": 3.685335636138916, + "learning_rate": 0.00023047242612460003, + "loss": 0.7482, "step": 14320 }, { "epoch": 2.697157914549219, - "grad_norm": 21.049495697021484, - "learning_rate": 1.3817052512704686e-05, - "loss": 0.3892, + "grad_norm": 1.5061359405517578, + "learning_rate": 0.0002302842085450781, + "loss": 0.4571, "step": 14330 }, { "epoch": 2.6990400903444383, - "grad_norm": 4.196599006652832, - "learning_rate": 1.3805759457933373e-05, - "loss": 0.6734, + "grad_norm": 1.478553056716919, + "learning_rate": 0.0002300959909655562, + "loss": 0.6591, "step": 14340 }, { "epoch": 2.7009222661396572, - "grad_norm": 37.20542526245117, - "learning_rate": 1.3794466403162055e-05, - "loss": 0.9126, + "grad_norm": 5.025265693664551, + "learning_rate": 0.00022990777338603426, + "loss": 0.8795, "step": 14350 }, { "epoch": 2.7028044419348767, - "grad_norm": 11.342184066772461, - "learning_rate": 1.378317334839074e-05, - "loss": 0.5825, + "grad_norm": 4.221109390258789, + "learning_rate": 0.00022971955580651233, + "loss": 0.7459, "step": 14360 }, { "epoch": 2.704686617730096, - "grad_norm": 9.5272798538208, - "learning_rate": 1.3771880293619424e-05, - "loss": 0.6676, + "grad_norm": 4.284846782684326, + "learning_rate": 0.0002295313382269904, + "loss": 0.7888, "step": 14370 }, { "epoch": 2.7065687935253155, - "grad_norm": 14.55465030670166, - "learning_rate": 1.3760587238848108e-05, - "loss": 0.899, + "grad_norm": 3.834550619125366, + "learning_rate": 0.00022934312064746849, + "loss": 0.7785, "step": 14380 }, { "epoch": 2.7084509693205345, - "grad_norm": 16.193349838256836, - "learning_rate": 1.3749294184076794e-05, - "loss": 0.5359, + "grad_norm": 5.762689590454102, + "learning_rate": 0.00022915490306794656, + "loss": 0.5134, "step": 14390 }, { "epoch": 2.710333145115754, - "grad_norm": 0.4011596441268921, - "learning_rate": 1.3738001129305477e-05, - "loss": 0.6422, + "grad_norm": 0.5311426520347595, + "learning_rate": 0.0002289666854884246, + "loss": 0.6027, "step": 14400 }, { "epoch": 2.712215320910973, - "grad_norm": 18.894222259521484, - "learning_rate": 1.3726708074534161e-05, - "loss": 0.6957, + "grad_norm": 3.2206649780273438, + "learning_rate": 0.0002287784679089027, + "loss": 0.7232, "step": 14410 }, { "epoch": 2.7140974967061924, - "grad_norm": 23.98822593688965, - "learning_rate": 1.3715415019762846e-05, - "loss": 0.9018, + "grad_norm": 3.6478111743927, + "learning_rate": 0.00022859025032938076, + "loss": 0.893, "step": 14420 }, { "epoch": 2.715979672501412, - "grad_norm": 24.73630142211914, - "learning_rate": 1.3704121964991532e-05, - "loss": 0.6684, + "grad_norm": 5.927709102630615, + "learning_rate": 0.00022840203274985884, + "loss": 0.7657, "step": 14430 }, { "epoch": 2.717861848296631, - "grad_norm": 0.7143511772155762, - "learning_rate": 1.3692828910220215e-05, - "loss": 0.7074, + "grad_norm": 0.11305610835552216, + "learning_rate": 0.0002282138151703369, + "loss": 0.5883, "step": 14440 }, { "epoch": 2.7197440240918502, - "grad_norm": 13.2362699508667, - "learning_rate": 1.3681535855448899e-05, - "loss": 0.6953, + "grad_norm": 2.982926845550537, + "learning_rate": 0.000228025597590815, + "loss": 0.5712, "step": 14450 }, { "epoch": 2.721626199887069, - "grad_norm": 0.7118558287620544, - "learning_rate": 1.3670242800677583e-05, - "loss": 0.6667, + "grad_norm": 3.5719408988952637, + "learning_rate": 0.00022783738001129307, + "loss": 0.6554, "step": 14460 }, { "epoch": 2.7235083756822887, - "grad_norm": 10.795101165771484, - "learning_rate": 1.3658949745906268e-05, - "loss": 0.8088, + "grad_norm": 2.8321266174316406, + "learning_rate": 0.00022764916243177115, + "loss": 0.7021, "step": 14470 }, { "epoch": 2.725390551477508, - "grad_norm": 21.050437927246094, - "learning_rate": 1.3647656691134954e-05, - "loss": 0.6499, + "grad_norm": 1.245736837387085, + "learning_rate": 0.00022746094485224922, + "loss": 0.5736, "step": 14480 }, { "epoch": 2.7272727272727275, - "grad_norm": 15.647856712341309, - "learning_rate": 1.3636363636363637e-05, - "loss": 0.769, + "grad_norm": 3.6251332759857178, + "learning_rate": 0.00022727272727272727, + "loss": 0.7772, "step": 14490 }, { "epoch": 2.7291549030679465, - "grad_norm": 18.59046173095703, - "learning_rate": 1.3625070581592321e-05, - "loss": 0.8155, + "grad_norm": 3.757814407348633, + "learning_rate": 0.00022708450969320535, + "loss": 0.8901, "step": 14500 }, { "epoch": 2.731037078863166, - "grad_norm": 0.6544022560119629, - "learning_rate": 1.3613777526821005e-05, - "loss": 0.5492, + "grad_norm": 0.9213415384292603, + "learning_rate": 0.00022689629211368343, + "loss": 0.7134, "step": 14510 }, { "epoch": 2.732919254658385, - "grad_norm": 5.2929182052612305, - "learning_rate": 1.360248447204969e-05, - "loss": 0.6304, + "grad_norm": 3.748073101043701, + "learning_rate": 0.0002267080745341615, + "loss": 0.6412, "step": 14520 }, { "epoch": 2.7348014304536044, - "grad_norm": 18.719219207763672, - "learning_rate": 1.3591191417278374e-05, - "loss": 0.7405, + "grad_norm": 6.530447006225586, + "learning_rate": 0.00022651985695463955, + "loss": 0.7077, "step": 14530 }, { "epoch": 2.736683606248824, - "grad_norm": 14.632789611816406, - "learning_rate": 1.3579898362507058e-05, - "loss": 0.6674, + "grad_norm": 2.432340621948242, + "learning_rate": 0.00022633163937511763, + "loss": 0.5575, "step": 14540 }, { "epoch": 2.7385657820440428, - "grad_norm": 1.8742183446884155, - "learning_rate": 1.3568605307735743e-05, - "loss": 0.7499, + "grad_norm": 0.6200935244560242, + "learning_rate": 0.0002261434217955957, + "loss": 0.6548, "step": 14550 }, { "epoch": 2.740447957839262, - "grad_norm": 14.247894287109375, - "learning_rate": 1.3557312252964427e-05, - "loss": 0.8707, + "grad_norm": 4.548830986022949, + "learning_rate": 0.0002259552042160738, + "loss": 0.9457, "step": 14560 }, { "epoch": 2.742330133634481, - "grad_norm": 12.40461254119873, - "learning_rate": 1.3546019198193112e-05, - "loss": 0.6975, + "grad_norm": 3.477888822555542, + "learning_rate": 0.00022576698663655186, + "loss": 0.7322, "step": 14570 }, { "epoch": 2.7442123094297006, - "grad_norm": 3.423053503036499, - "learning_rate": 1.3534726143421796e-05, - "loss": 0.9175, + "grad_norm": 2.17950701713562, + "learning_rate": 0.00022557876905702993, + "loss": 0.9557, "step": 14580 }, { "epoch": 2.74609448522492, - "grad_norm": 6.041767597198486, - "learning_rate": 1.352343308865048e-05, - "loss": 0.6782, + "grad_norm": 0.9166530966758728, + "learning_rate": 0.000225390551477508, + "loss": 0.7076, "step": 14590 }, { "epoch": 2.7479766610201395, - "grad_norm": 29.75321388244629, - "learning_rate": 1.3512140033879165e-05, - "loss": 0.6639, + "grad_norm": 3.2019553184509277, + "learning_rate": 0.00022520233389798609, + "loss": 0.6551, "step": 14600 }, { "epoch": 2.7498588368153585, - "grad_norm": 18.750484466552734, - "learning_rate": 1.3500846979107849e-05, - "loss": 0.6613, + "grad_norm": 3.706820249557495, + "learning_rate": 0.00022501411631846416, + "loss": 0.6233, "step": 14610 }, { "epoch": 2.751741012610578, - "grad_norm": 15.438997268676758, - "learning_rate": 1.3489553924336534e-05, - "loss": 0.551, + "grad_norm": 3.3140599727630615, + "learning_rate": 0.0002248258987389422, + "loss": 0.6466, "step": 14620 }, { "epoch": 2.753623188405797, - "grad_norm": 14.487974166870117, - "learning_rate": 1.3478260869565218e-05, - "loss": 0.7749, + "grad_norm": 3.88219952583313, + "learning_rate": 0.0002246376811594203, + "loss": 0.7267, "step": 14630 }, { "epoch": 2.7555053642010163, - "grad_norm": 3.726011276245117, - "learning_rate": 1.3466967814793902e-05, - "loss": 0.4723, + "grad_norm": 1.0075336694717407, + "learning_rate": 0.00022444946357989836, + "loss": 0.4335, "step": 14640 }, { "epoch": 2.7573875399962358, - "grad_norm": 12.075268745422363, - "learning_rate": 1.3455674760022587e-05, - "loss": 0.7522, + "grad_norm": 2.2053027153015137, + "learning_rate": 0.00022426124600037644, + "loss": 0.7073, "step": 14650 }, { "epoch": 2.759269715791455, - "grad_norm": 6.138002872467041, - "learning_rate": 1.344438170525127e-05, - "loss": 0.7348, + "grad_norm": 6.247775554656982, + "learning_rate": 0.0002240730284208545, + "loss": 0.6382, "step": 14660 }, { "epoch": 2.761151891586674, - "grad_norm": 1.5578501224517822, - "learning_rate": 1.3433088650479955e-05, - "loss": 0.7188, + "grad_norm": 2.4420855045318604, + "learning_rate": 0.0002238848108413326, + "loss": 0.6792, "step": 14670 }, { "epoch": 2.7630340673818936, - "grad_norm": 24.87708282470703, - "learning_rate": 1.342179559570864e-05, - "loss": 0.8655, + "grad_norm": 5.8336687088012695, + "learning_rate": 0.00022369659326181067, + "loss": 0.8575, "step": 14680 }, { "epoch": 2.7649162431771126, - "grad_norm": 13.054447174072266, - "learning_rate": 1.3410502540937324e-05, - "loss": 0.6821, + "grad_norm": 10.038944244384766, + "learning_rate": 0.00022350837568228875, + "loss": 0.6917, "step": 14690 }, { "epoch": 2.766798418972332, - "grad_norm": 0.5486339926719666, - "learning_rate": 1.3399209486166007e-05, - "loss": 0.5678, + "grad_norm": 2.5733683109283447, + "learning_rate": 0.0002233201581027668, + "loss": 0.7355, "step": 14700 }, { "epoch": 2.7686805947675515, - "grad_norm": 16.782821655273438, - "learning_rate": 1.3387916431394693e-05, - "loss": 0.8178, + "grad_norm": 2.4171252250671387, + "learning_rate": 0.00022313194052324487, + "loss": 0.6444, "step": 14710 }, { "epoch": 2.7705627705627704, - "grad_norm": 5.81451416015625, - "learning_rate": 1.3376623376623377e-05, - "loss": 0.5588, + "grad_norm": 1.818503975868225, + "learning_rate": 0.00022294372294372295, + "loss": 0.4167, "step": 14720 }, { "epoch": 2.77244494635799, - "grad_norm": 3.6618595123291016, - "learning_rate": 1.3365330321852062e-05, - "loss": 0.7615, + "grad_norm": 1.2341984510421753, + "learning_rate": 0.00022275550536420103, + "loss": 0.8849, "step": 14730 }, { "epoch": 2.774327122153209, - "grad_norm": 8.5778169631958, - "learning_rate": 1.3354037267080746e-05, - "loss": 0.576, + "grad_norm": 1.8926407098770142, + "learning_rate": 0.0002225672877846791, + "loss": 0.7797, "step": 14740 }, { "epoch": 2.7762092979484283, - "grad_norm": 13.566119194030762, - "learning_rate": 1.3342744212309429e-05, - "loss": 0.5681, + "grad_norm": 3.853085517883301, + "learning_rate": 0.00022237907020515715, + "loss": 0.7213, "step": 14750 }, { "epoch": 2.7780914737436477, - "grad_norm": 15.843841552734375, - "learning_rate": 1.3331451157538115e-05, - "loss": 0.6232, + "grad_norm": 3.408320188522339, + "learning_rate": 0.00022219085262563523, + "loss": 0.7639, "step": 14760 }, { "epoch": 2.779973649538867, - "grad_norm": 8.985896110534668, - "learning_rate": 1.33201581027668e-05, - "loss": 0.538, + "grad_norm": 2.5825889110565186, + "learning_rate": 0.0002220026350461133, + "loss": 0.3973, "step": 14770 }, { "epoch": 2.781855825334086, - "grad_norm": 17.511743545532227, - "learning_rate": 1.3308865047995484e-05, - "loss": 0.8905, + "grad_norm": 3.6277236938476562, + "learning_rate": 0.0002218144174665914, + "loss": 0.7533, "step": 14780 }, { "epoch": 2.7837380011293056, - "grad_norm": 23.286775588989258, - "learning_rate": 1.3297571993224166e-05, - "loss": 0.8913, + "grad_norm": 2.047395706176758, + "learning_rate": 0.00022162619988706946, + "loss": 0.9952, "step": 14790 }, { "epoch": 2.7856201769245246, - "grad_norm": 23.496522903442383, - "learning_rate": 1.3286278938452852e-05, - "loss": 0.6562, + "grad_norm": 4.63572883605957, + "learning_rate": 0.00022143798230754753, + "loss": 0.8083, "step": 14800 }, { "epoch": 2.787502352719744, - "grad_norm": 11.973505973815918, - "learning_rate": 1.3274985883681537e-05, - "loss": 0.6814, + "grad_norm": 2.965135097503662, + "learning_rate": 0.0002212497647280256, + "loss": 0.7247, "step": 14810 }, { "epoch": 2.7893845285149634, - "grad_norm": 19.595155715942383, - "learning_rate": 1.3263692828910221e-05, - "loss": 0.4778, + "grad_norm": 6.954098701477051, + "learning_rate": 0.00022106154714850369, + "loss": 0.5122, "step": 14820 }, { "epoch": 2.7912667043101824, - "grad_norm": 10.94100284576416, - "learning_rate": 1.3252399774138904e-05, - "loss": 0.8547, + "grad_norm": 1.5478140115737915, + "learning_rate": 0.00022087332956898174, + "loss": 0.9916, "step": 14830 }, { "epoch": 2.793148880105402, - "grad_norm": 9.172542572021484, - "learning_rate": 1.3241106719367588e-05, - "loss": 0.8835, + "grad_norm": 7.099700450897217, + "learning_rate": 0.0002206851119894598, + "loss": 0.883, "step": 14840 }, { "epoch": 2.795031055900621, - "grad_norm": 49.06556701660156, - "learning_rate": 1.3229813664596274e-05, - "loss": 0.7209, + "grad_norm": 1.6341519355773926, + "learning_rate": 0.0002204968944099379, + "loss": 0.8133, "step": 14850 }, { "epoch": 2.7969132316958403, - "grad_norm": 2.8272016048431396, - "learning_rate": 1.3218520609824959e-05, - "loss": 0.6661, + "grad_norm": 1.9076987504959106, + "learning_rate": 0.00022030867683041596, + "loss": 0.7103, "step": 14860 }, { "epoch": 2.7987954074910597, - "grad_norm": 0.3546334505081177, - "learning_rate": 1.3207227555053643e-05, - "loss": 0.4056, + "grad_norm": 2.6881508827209473, + "learning_rate": 0.00022012045925089404, + "loss": 0.5018, "step": 14870 }, { "epoch": 2.800677583286279, - "grad_norm": 3.786240816116333, - "learning_rate": 1.3195934500282326e-05, - "loss": 0.964, + "grad_norm": 1.636492133140564, + "learning_rate": 0.0002199322416713721, + "loss": 0.8002, "step": 14880 }, { "epoch": 2.802559759081498, - "grad_norm": 5.78407096862793, - "learning_rate": 1.318464144551101e-05, - "loss": 0.6051, + "grad_norm": 3.2086358070373535, + "learning_rate": 0.0002197440240918502, + "loss": 0.7168, "step": 14890 }, { "epoch": 2.8044419348767176, - "grad_norm": 6.689875602722168, - "learning_rate": 1.3173348390739696e-05, - "loss": 0.6458, + "grad_norm": 1.55039381980896, + "learning_rate": 0.00021955580651232827, + "loss": 0.5735, "step": 14900 }, { "epoch": 2.8063241106719365, - "grad_norm": 12.659087181091309, - "learning_rate": 1.316205533596838e-05, - "loss": 0.8754, + "grad_norm": 4.564043045043945, + "learning_rate": 0.00021936758893280635, + "loss": 0.7911, "step": 14910 }, { "epoch": 2.808206286467156, - "grad_norm": 14.06718921661377, - "learning_rate": 1.3150762281197063e-05, - "loss": 0.9916, + "grad_norm": 5.466314792633057, + "learning_rate": 0.0002191793713532844, + "loss": 1.0529, "step": 14920 }, { "epoch": 2.8100884622623754, - "grad_norm": 15.806565284729004, - "learning_rate": 1.3139469226425748e-05, - "loss": 0.7995, + "grad_norm": 3.9521117210388184, + "learning_rate": 0.00021899115377376247, + "loss": 0.7721, "step": 14930 }, { "epoch": 2.811970638057595, - "grad_norm": 1.6029272079467773, - "learning_rate": 1.3128176171654434e-05, - "loss": 0.4056, + "grad_norm": 2.2904415130615234, + "learning_rate": 0.00021880293619424055, + "loss": 0.3554, "step": 14940 }, { "epoch": 2.813852813852814, - "grad_norm": 11.386082649230957, - "learning_rate": 1.3116883116883118e-05, - "loss": 0.6312, + "grad_norm": 2.367241859436035, + "learning_rate": 0.00021861471861471863, + "loss": 0.5897, "step": 14950 }, { "epoch": 2.8157349896480333, - "grad_norm": 12.017684936523438, - "learning_rate": 1.31055900621118e-05, - "loss": 0.8528, + "grad_norm": 6.305996894836426, + "learning_rate": 0.00021842650103519667, + "loss": 0.7687, "step": 14960 }, { "epoch": 2.8176171654432522, - "grad_norm": 10.35236644744873, - "learning_rate": 1.3094297007340485e-05, - "loss": 0.63, + "grad_norm": 4.461951732635498, + "learning_rate": 0.00021823828345567475, + "loss": 0.4529, "step": 14970 }, { "epoch": 2.8194993412384717, - "grad_norm": 18.851110458374023, - "learning_rate": 1.308300395256917e-05, - "loss": 0.7506, + "grad_norm": 1.6320971250534058, + "learning_rate": 0.00021805006587615283, + "loss": 0.7452, "step": 14980 }, { "epoch": 2.821381517033691, - "grad_norm": 7.350698947906494, - "learning_rate": 1.3071710897797856e-05, - "loss": 0.5077, + "grad_norm": 1.834617018699646, + "learning_rate": 0.0002178618482966309, + "loss": 0.6039, "step": 14990 }, { "epoch": 2.82326369282891, - "grad_norm": 9.5648775100708, - "learning_rate": 1.306041784302654e-05, - "loss": 0.6446, + "grad_norm": 1.4062933921813965, + "learning_rate": 0.000217673630717109, + "loss": 0.6868, "step": 15000 }, { "epoch": 2.8251458686241295, - "grad_norm": 46.55702209472656, - "learning_rate": 1.3049124788255223e-05, - "loss": 0.6478, + "grad_norm": 3.7862212657928467, + "learning_rate": 0.00021748541313758706, + "loss": 0.5454, "step": 15010 }, { "epoch": 2.8270280444193485, - "grad_norm": 22.763042449951172, - "learning_rate": 1.3037831733483907e-05, - "loss": 0.5402, + "grad_norm": 6.517659664154053, + "learning_rate": 0.00021729719555806513, + "loss": 0.5488, "step": 15020 }, { "epoch": 2.828910220214568, - "grad_norm": 23.328462600708008, - "learning_rate": 1.3026538678712591e-05, - "loss": 0.7589, + "grad_norm": 2.6208972930908203, + "learning_rate": 0.0002171089779785432, + "loss": 0.6687, "step": 15030 }, { "epoch": 2.8307923960097874, - "grad_norm": 9.546448707580566, - "learning_rate": 1.3015245623941278e-05, - "loss": 0.4769, + "grad_norm": 2.6357429027557373, + "learning_rate": 0.00021692076039902129, + "loss": 0.4496, "step": 15040 }, { "epoch": 2.832674571805007, - "grad_norm": 9.505398750305176, - "learning_rate": 1.300395256916996e-05, - "loss": 0.6579, + "grad_norm": 0.8929344415664673, + "learning_rate": 0.00021673254281949934, + "loss": 0.6089, "step": 15050 }, { "epoch": 2.834556747600226, - "grad_norm": 10.608928680419922, - "learning_rate": 1.2992659514398645e-05, - "loss": 0.6198, + "grad_norm": 1.9503272771835327, + "learning_rate": 0.0002165443252399774, + "loss": 0.6058, "step": 15060 }, { "epoch": 2.8364389233954452, - "grad_norm": 13.843592643737793, - "learning_rate": 1.2981366459627329e-05, - "loss": 0.74, + "grad_norm": 4.708406448364258, + "learning_rate": 0.0002163561076604555, + "loss": 0.9228, "step": 15070 }, { "epoch": 2.838321099190664, - "grad_norm": 18.67850112915039, - "learning_rate": 1.2970073404856015e-05, - "loss": 0.4672, + "grad_norm": 3.6658589839935303, + "learning_rate": 0.00021616789008093356, + "loss": 0.4709, "step": 15080 }, { "epoch": 2.8402032749858837, - "grad_norm": 6.967514991760254, - "learning_rate": 1.2958780350084698e-05, - "loss": 0.4409, + "grad_norm": 2.2713499069213867, + "learning_rate": 0.00021597967250141161, + "loss": 0.5153, "step": 15090 }, { "epoch": 2.842085450781103, - "grad_norm": 6.820723533630371, - "learning_rate": 1.2947487295313382e-05, - "loss": 0.7573, + "grad_norm": 1.2780481576919556, + "learning_rate": 0.0002157914549218897, + "loss": 0.7618, "step": 15100 }, { "epoch": 2.843967626576322, - "grad_norm": 18.13920783996582, - "learning_rate": 1.2936194240542067e-05, - "loss": 0.7914, + "grad_norm": 3.080092430114746, + "learning_rate": 0.0002156032373423678, + "loss": 0.791, "step": 15110 }, { "epoch": 2.8458498023715415, - "grad_norm": 7.37209939956665, - "learning_rate": 1.2924901185770751e-05, - "loss": 0.6351, + "grad_norm": 1.203285813331604, + "learning_rate": 0.00021541501976284587, + "loss": 0.9003, "step": 15120 }, { "epoch": 2.847731978166761, - "grad_norm": 0.3693366050720215, - "learning_rate": 1.2913608130999437e-05, - "loss": 0.5822, + "grad_norm": 0.1718599647283554, + "learning_rate": 0.00021522680218332395, + "loss": 0.5085, "step": 15130 }, { "epoch": 2.84961415396198, - "grad_norm": 21.349327087402344, - "learning_rate": 1.290231507622812e-05, - "loss": 0.6001, + "grad_norm": 1.1150635480880737, + "learning_rate": 0.000215038584603802, + "loss": 0.5646, "step": 15140 }, { "epoch": 2.8514963297571994, - "grad_norm": 8.462204933166504, - "learning_rate": 1.2891022021456804e-05, - "loss": 0.4354, + "grad_norm": 3.6562788486480713, + "learning_rate": 0.00021485036702428007, + "loss": 0.4479, "step": 15150 }, { "epoch": 2.853378505552419, - "grad_norm": 9.716917991638184, - "learning_rate": 1.2879728966685488e-05, - "loss": 0.7532, + "grad_norm": 3.168921947479248, + "learning_rate": 0.00021466214944475815, + "loss": 0.7417, "step": 15160 }, { "epoch": 2.8552606813476378, - "grad_norm": 4.279250144958496, - "learning_rate": 1.2868435911914173e-05, - "loss": 0.7497, + "grad_norm": 1.4422719478607178, + "learning_rate": 0.00021447393186523623, + "loss": 0.5953, "step": 15170 }, { "epoch": 2.857142857142857, - "grad_norm": 27.534521102905273, - "learning_rate": 1.2857142857142857e-05, - "loss": 0.5156, + "grad_norm": 6.2547221183776855, + "learning_rate": 0.00021428571428571427, + "loss": 0.4218, "step": 15180 }, { "epoch": 2.859025032938076, - "grad_norm": 10.771012306213379, - "learning_rate": 1.2845849802371542e-05, - "loss": 0.7265, + "grad_norm": 2.6894936561584473, + "learning_rate": 0.00021409749670619235, + "loss": 0.7421, "step": 15190 }, { "epoch": 2.8609072087332956, - "grad_norm": 18.646312713623047, - "learning_rate": 1.2834556747600226e-05, - "loss": 0.6741, + "grad_norm": 3.028836488723755, + "learning_rate": 0.00021390927912667043, + "loss": 0.6819, "step": 15200 }, { "epoch": 2.862789384528515, - "grad_norm": 19.257171630859375, - "learning_rate": 1.282326369282891e-05, - "loss": 1.1365, + "grad_norm": 8.584593772888184, + "learning_rate": 0.0002137210615471485, + "loss": 1.0062, "step": 15210 }, { "epoch": 2.8646715603237345, - "grad_norm": 5.380772590637207, - "learning_rate": 1.2811970638057596e-05, - "loss": 0.8546, + "grad_norm": 1.5320723056793213, + "learning_rate": 0.0002135328439676266, + "loss": 0.7337, "step": 15220 }, { "epoch": 2.8665537361189535, - "grad_norm": 23.16881561279297, - "learning_rate": 1.2800677583286279e-05, - "loss": 0.7943, + "grad_norm": 4.4340033531188965, + "learning_rate": 0.00021334462638810466, + "loss": 0.7246, "step": 15230 }, { "epoch": 2.868435911914173, - "grad_norm": 3.4778006076812744, - "learning_rate": 1.2789384528514964e-05, - "loss": 0.6461, + "grad_norm": 2.8282470703125, + "learning_rate": 0.00021315640880858273, + "loss": 0.6766, "step": 15240 }, { "epoch": 2.870318087709392, - "grad_norm": 6.4198994636535645, - "learning_rate": 1.2778091473743648e-05, - "loss": 0.8293, + "grad_norm": 3.918661117553711, + "learning_rate": 0.0002129681912290608, + "loss": 0.8459, "step": 15250 }, { "epoch": 2.8722002635046113, - "grad_norm": 9.544682502746582, - "learning_rate": 1.2766798418972332e-05, - "loss": 0.7434, + "grad_norm": 2.269125461578369, + "learning_rate": 0.00021277997364953889, + "loss": 0.7308, "step": 15260 }, { "epoch": 2.8740824392998308, - "grad_norm": 20.545665740966797, - "learning_rate": 1.2755505364201017e-05, - "loss": 0.6885, + "grad_norm": 5.307600021362305, + "learning_rate": 0.00021259175607001694, + "loss": 0.7939, "step": 15270 }, { "epoch": 2.8759646150950497, - "grad_norm": 21.490854263305664, - "learning_rate": 1.2744212309429701e-05, - "loss": 0.9683, + "grad_norm": 5.625484466552734, + "learning_rate": 0.000212403538490495, + "loss": 0.747, "step": 15280 }, { "epoch": 2.877846790890269, - "grad_norm": 3.391572952270508, - "learning_rate": 1.2732919254658385e-05, - "loss": 0.421, + "grad_norm": 0.8601845502853394, + "learning_rate": 0.0002122153209109731, + "loss": 0.4009, "step": 15290 }, { "epoch": 2.879728966685488, - "grad_norm": 6.413089275360107, - "learning_rate": 1.272162619988707e-05, - "loss": 0.6439, + "grad_norm": 4.669708251953125, + "learning_rate": 0.00021202710333145116, + "loss": 0.6273, "step": 15300 }, { "epoch": 2.8816111424807076, - "grad_norm": 9.638285636901855, - "learning_rate": 1.2710333145115754e-05, - "loss": 0.6487, + "grad_norm": 2.922671318054199, + "learning_rate": 0.00021183888575192921, + "loss": 0.846, "step": 15310 }, { "epoch": 2.883493318275927, - "grad_norm": 19.76523208618164, - "learning_rate": 1.2699040090344439e-05, - "loss": 0.7171, + "grad_norm": 3.249382734298706, + "learning_rate": 0.0002116506681724073, + "loss": 0.8076, "step": 15320 }, { "epoch": 2.8853754940711465, - "grad_norm": 18.670621871948242, - "learning_rate": 1.2687747035573123e-05, - "loss": 0.4666, + "grad_norm": 2.7542879581451416, + "learning_rate": 0.00021146245059288537, + "loss": 0.5032, "step": 15330 }, { "epoch": 2.8872576698663655, - "grad_norm": 11.9641695022583, - "learning_rate": 1.2676453980801807e-05, - "loss": 0.5461, + "grad_norm": 4.564001083374023, + "learning_rate": 0.00021127423301336347, + "loss": 0.6309, "step": 15340 }, { "epoch": 2.889139845661585, - "grad_norm": 1.4184321165084839, - "learning_rate": 1.2665160926030492e-05, - "loss": 0.407, + "grad_norm": 0.7549136877059937, + "learning_rate": 0.00021108601543384155, + "loss": 0.2515, "step": 15350 }, { "epoch": 2.891022021456804, - "grad_norm": 19.386415481567383, - "learning_rate": 1.2653867871259176e-05, - "loss": 0.4194, + "grad_norm": 2.5547895431518555, + "learning_rate": 0.0002108977978543196, + "loss": 0.5358, "step": 15360 }, { "epoch": 2.8929041972520233, - "grad_norm": 25.078582763671875, - "learning_rate": 1.264257481648786e-05, - "loss": 0.8741, + "grad_norm": 5.23292875289917, + "learning_rate": 0.00021070958027479767, + "loss": 0.8811, "step": 15370 }, { "epoch": 2.8947863730472427, - "grad_norm": 18.054174423217773, - "learning_rate": 1.2631281761716545e-05, - "loss": 0.8567, + "grad_norm": 5.233101844787598, + "learning_rate": 0.00021052136269527575, + "loss": 0.8276, "step": 15380 }, { "epoch": 2.8966685488424617, - "grad_norm": 9.56291675567627, - "learning_rate": 1.261998870694523e-05, - "loss": 1.0115, + "grad_norm": 4.63691520690918, + "learning_rate": 0.00021033314511575383, + "loss": 0.8278, "step": 15390 }, { "epoch": 2.898550724637681, - "grad_norm": 10.484170913696289, - "learning_rate": 1.2608695652173912e-05, - "loss": 0.5493, + "grad_norm": 5.9823479652404785, + "learning_rate": 0.00021014492753623187, + "loss": 0.5929, "step": 15400 }, { "epoch": 2.9004329004329006, - "grad_norm": 17.50212287902832, - "learning_rate": 1.2597402597402598e-05, - "loss": 0.8972, + "grad_norm": 5.181570529937744, + "learning_rate": 0.00020995670995670995, + "loss": 0.9555, "step": 15410 }, { "epoch": 2.9023150762281196, - "grad_norm": 11.050121307373047, - "learning_rate": 1.2586109542631282e-05, - "loss": 0.572, + "grad_norm": 1.9936301708221436, + "learning_rate": 0.00020976849237718803, + "loss": 0.4135, "step": 15420 }, { "epoch": 2.904197252023339, - "grad_norm": 19.2404727935791, - "learning_rate": 1.2574816487859967e-05, - "loss": 0.6457, + "grad_norm": 5.654709815979004, + "learning_rate": 0.0002095802747976661, + "loss": 0.7488, "step": 15430 }, { "epoch": 2.9060794278185584, - "grad_norm": 7.877418518066406, - "learning_rate": 1.256352343308865e-05, - "loss": 0.5262, + "grad_norm": 3.6621265411376953, + "learning_rate": 0.00020939205721814415, + "loss": 0.4353, "step": 15440 }, { "epoch": 2.9079616036137774, - "grad_norm": 25.962966918945312, - "learning_rate": 1.2552230378317336e-05, - "loss": 1.0387, + "grad_norm": 3.0909690856933594, + "learning_rate": 0.00020920383963862226, + "loss": 0.8345, "step": 15450 }, { "epoch": 2.909843779408997, - "grad_norm": 18.10149383544922, - "learning_rate": 1.254093732354602e-05, - "loss": 0.3264, + "grad_norm": 1.9756587743759155, + "learning_rate": 0.00020901562205910033, + "loss": 0.3801, "step": 15460 }, { "epoch": 2.911725955204216, - "grad_norm": 12.390528678894043, - "learning_rate": 1.2529644268774704e-05, - "loss": 0.5733, + "grad_norm": 1.5738675594329834, + "learning_rate": 0.0002088274044795784, + "loss": 0.4169, "step": 15470 }, { "epoch": 2.9136081309994353, - "grad_norm": 6.335510730743408, - "learning_rate": 1.2518351214003389e-05, - "loss": 0.9647, + "grad_norm": 1.9008299112319946, + "learning_rate": 0.00020863918690005649, + "loss": 1.0148, "step": 15480 }, { "epoch": 2.9154903067946547, - "grad_norm": 8.489152908325195, - "learning_rate": 1.2507058159232071e-05, - "loss": 0.5086, + "grad_norm": 1.7302570343017578, + "learning_rate": 0.00020845096932053454, + "loss": 0.554, "step": 15490 }, { "epoch": 2.917372482589874, - "grad_norm": 5.623629570007324, - "learning_rate": 1.2495765104460757e-05, - "loss": 0.8003, + "grad_norm": 1.9822744131088257, + "learning_rate": 0.0002082627517410126, + "loss": 0.7925, "step": 15500 }, { "epoch": 2.919254658385093, - "grad_norm": 7.3673787117004395, - "learning_rate": 1.2484472049689442e-05, - "loss": 0.5053, + "grad_norm": 1.7959871292114258, + "learning_rate": 0.0002080745341614907, + "loss": 0.5921, "step": 15510 }, { "epoch": 2.9211368341803126, - "grad_norm": 25.399425506591797, - "learning_rate": 1.2473178994918126e-05, - "loss": 0.6455, + "grad_norm": 2.998852014541626, + "learning_rate": 0.00020788631658196876, + "loss": 0.6003, "step": 15520 }, { "epoch": 2.9230190099755315, - "grad_norm": 15.276825904846191, - "learning_rate": 1.2461885940146809e-05, - "loss": 0.5071, + "grad_norm": 1.354353427886963, + "learning_rate": 0.00020769809900244681, + "loss": 0.52, "step": 15530 }, { "epoch": 2.924901185770751, - "grad_norm": 11.166011810302734, - "learning_rate": 1.2450592885375493e-05, - "loss": 0.8913, + "grad_norm": 2.2639453411102295, + "learning_rate": 0.0002075098814229249, + "loss": 0.7204, "step": 15540 }, { "epoch": 2.9267833615659704, - "grad_norm": 16.122472763061523, - "learning_rate": 1.243929983060418e-05, - "loss": 1.0398, + "grad_norm": 1.4416176080703735, + "learning_rate": 0.00020732166384340297, + "loss": 1.0761, "step": 15550 }, { "epoch": 2.9286655373611894, - "grad_norm": 21.32600212097168, - "learning_rate": 1.2428006775832864e-05, - "loss": 0.6833, + "grad_norm": 3.021568775177002, + "learning_rate": 0.00020713344626388107, + "loss": 0.6917, "step": 15560 }, { "epoch": 2.930547713156409, - "grad_norm": 3.505770206451416, - "learning_rate": 1.2416713721061546e-05, - "loss": 0.7454, + "grad_norm": 1.8868095874786377, + "learning_rate": 0.00020694522868435912, + "loss": 0.7583, "step": 15570 }, { "epoch": 2.932429888951628, - "grad_norm": 18.41620445251465, - "learning_rate": 1.240542066629023e-05, - "loss": 0.5258, + "grad_norm": 2.691807508468628, + "learning_rate": 0.0002067570111048372, + "loss": 0.5029, "step": 15580 }, { "epoch": 2.9343120647468472, - "grad_norm": 25.811620712280273, - "learning_rate": 1.2394127611518917e-05, - "loss": 0.607, + "grad_norm": 2.2595269680023193, + "learning_rate": 0.00020656879352531527, + "loss": 0.5884, "step": 15590 }, { "epoch": 2.9361942405420667, - "grad_norm": 40.27906036376953, - "learning_rate": 1.2382834556747601e-05, - "loss": 0.8219, + "grad_norm": 3.909071683883667, + "learning_rate": 0.00020638057594579335, + "loss": 0.8451, "step": 15600 }, { "epoch": 2.938076416337286, - "grad_norm": 24.22795867919922, - "learning_rate": 1.2371541501976286e-05, - "loss": 0.943, + "grad_norm": 6.179553985595703, + "learning_rate": 0.00020619235836627143, + "loss": 0.801, "step": 15610 }, { "epoch": 2.939958592132505, - "grad_norm": 25.92055892944336, - "learning_rate": 1.2360248447204968e-05, - "loss": 0.8276, + "grad_norm": 2.8360793590545654, + "learning_rate": 0.00020600414078674947, + "loss": 0.7074, "step": 15620 }, { "epoch": 2.9418407679277245, - "grad_norm": 8.324178695678711, - "learning_rate": 1.2348955392433653e-05, - "loss": 0.6895, + "grad_norm": 5.890757083892822, + "learning_rate": 0.00020581592320722755, + "loss": 0.686, "step": 15630 }, { "epoch": 2.9437229437229435, - "grad_norm": 22.274948120117188, - "learning_rate": 1.2337662337662339e-05, - "loss": 1.0351, + "grad_norm": 3.1145524978637695, + "learning_rate": 0.00020562770562770563, + "loss": 0.9267, "step": 15640 }, { "epoch": 2.945605119518163, - "grad_norm": 40.22727584838867, - "learning_rate": 1.2326369282891023e-05, - "loss": 0.5756, + "grad_norm": 3.2289531230926514, + "learning_rate": 0.0002054394880481837, + "loss": 0.5452, "step": 15650 }, { "epoch": 2.9474872953133824, - "grad_norm": 2.7665634155273438, - "learning_rate": 1.2315076228119706e-05, - "loss": 0.5928, + "grad_norm": 1.3664658069610596, + "learning_rate": 0.00020525127046866175, + "loss": 0.6293, "step": 15660 }, { "epoch": 2.949369471108602, - "grad_norm": 20.086702346801758, - "learning_rate": 1.230378317334839e-05, - "loss": 0.7117, + "grad_norm": 2.922199010848999, + "learning_rate": 0.00020506305288913986, + "loss": 0.8441, "step": 15670 }, { "epoch": 2.951251646903821, - "grad_norm": 13.280003547668457, - "learning_rate": 1.2292490118577076e-05, - "loss": 0.7403, + "grad_norm": 3.505491256713867, + "learning_rate": 0.00020487483530961793, + "loss": 0.6791, "step": 15680 }, { "epoch": 2.9531338226990402, - "grad_norm": 37.245201110839844, - "learning_rate": 1.228119706380576e-05, - "loss": 0.8726, + "grad_norm": 7.177020072937012, + "learning_rate": 0.000204686617730096, + "loss": 0.8528, "step": 15690 }, { "epoch": 2.955015998494259, - "grad_norm": 6.580207824707031, - "learning_rate": 1.2269904009034443e-05, - "loss": 0.6027, + "grad_norm": 2.837167739868164, + "learning_rate": 0.00020449840015057406, + "loss": 0.5434, "step": 15700 }, { "epoch": 2.9568981742894787, - "grad_norm": 1.7421516180038452, - "learning_rate": 1.2258610954263128e-05, - "loss": 0.6724, + "grad_norm": 6.894123077392578, + "learning_rate": 0.00020431018257105214, + "loss": 0.7725, "step": 15710 }, { "epoch": 2.958780350084698, - "grad_norm": 29.77842140197754, - "learning_rate": 1.2247317899491812e-05, - "loss": 0.5805, + "grad_norm": 3.7151124477386475, + "learning_rate": 0.0002041219649915302, + "loss": 0.5149, "step": 15720 }, { "epoch": 2.960662525879917, - "grad_norm": 26.471179962158203, - "learning_rate": 1.2236024844720498e-05, - "loss": 0.7421, + "grad_norm": 3.3849830627441406, + "learning_rate": 0.0002039337474120083, + "loss": 0.6201, "step": 15730 }, { "epoch": 2.9625447016751365, - "grad_norm": 8.780817031860352, - "learning_rate": 1.2224731789949183e-05, - "loss": 0.7289, + "grad_norm": 2.0344691276550293, + "learning_rate": 0.00020374552983248636, + "loss": 0.7283, "step": 15740 }, { "epoch": 2.9644268774703555, - "grad_norm": 14.556037902832031, - "learning_rate": 1.2213438735177865e-05, - "loss": 0.4937, + "grad_norm": 1.9597744941711426, + "learning_rate": 0.00020355731225296441, + "loss": 0.4933, "step": 15750 }, { "epoch": 2.966309053265575, - "grad_norm": 25.941247940063477, - "learning_rate": 1.220214568040655e-05, - "loss": 0.8835, + "grad_norm": 4.379768371582031, + "learning_rate": 0.0002033690946734425, + "loss": 0.8107, "step": 15760 }, { "epoch": 2.9681912290607944, - "grad_norm": 6.276753902435303, - "learning_rate": 1.2190852625635234e-05, - "loss": 0.6071, + "grad_norm": 2.1065194606781006, + "learning_rate": 0.00020318087709392057, + "loss": 0.5086, "step": 15770 }, { "epoch": 2.970073404856014, - "grad_norm": 24.19154930114746, - "learning_rate": 1.217955957086392e-05, - "loss": 0.907, + "grad_norm": 3.758819341659546, + "learning_rate": 0.00020299265951439867, + "loss": 0.9853, "step": 15780 }, { "epoch": 2.9719555806512328, - "grad_norm": 5.083404064178467, - "learning_rate": 1.2168266516092603e-05, - "loss": 0.5493, + "grad_norm": 0.23007777333259583, + "learning_rate": 0.00020280444193487672, + "loss": 0.5216, "step": 15790 }, { "epoch": 2.973837756446452, - "grad_norm": 9.346044540405273, - "learning_rate": 1.2156973461321287e-05, - "loss": 0.6961, + "grad_norm": 5.689883232116699, + "learning_rate": 0.0002026162243553548, + "loss": 0.7901, "step": 15800 }, { "epoch": 2.975719932241671, - "grad_norm": 1.4552770853042603, - "learning_rate": 1.2145680406549972e-05, - "loss": 0.6451, + "grad_norm": 0.3998923599720001, + "learning_rate": 0.00020242800677583287, + "loss": 0.7314, "step": 15810 }, { "epoch": 2.9776021080368906, - "grad_norm": 17.02495765686035, - "learning_rate": 1.2134387351778658e-05, - "loss": 0.5733, + "grad_norm": 1.291471004486084, + "learning_rate": 0.00020223978919631095, + "loss": 0.6298, "step": 15820 }, { "epoch": 2.97948428383211, - "grad_norm": 2.2290263175964355, - "learning_rate": 1.2123094297007342e-05, - "loss": 0.7652, + "grad_norm": 1.6297998428344727, + "learning_rate": 0.00020205157161678903, + "loss": 0.721, "step": 15830 }, { "epoch": 2.981366459627329, - "grad_norm": 20.987024307250977, - "learning_rate": 1.2111801242236025e-05, - "loss": 0.7364, + "grad_norm": 0.40168535709381104, + "learning_rate": 0.00020186335403726707, + "loss": 0.7562, "step": 15840 }, { "epoch": 2.9832486354225485, - "grad_norm": 18.569358825683594, - "learning_rate": 1.2100508187464709e-05, - "loss": 1.0693, + "grad_norm": 3.6822330951690674, + "learning_rate": 0.00020167513645774515, + "loss": 0.9433, "step": 15850 }, { "epoch": 2.9851308112177675, - "grad_norm": 22.19905662536621, - "learning_rate": 1.2089215132693394e-05, - "loss": 0.8939, + "grad_norm": 6.4109787940979, + "learning_rate": 0.00020148691887822323, + "loss": 0.7135, "step": 15860 }, { "epoch": 2.987012987012987, - "grad_norm": 20.058185577392578, - "learning_rate": 1.207792207792208e-05, - "loss": 0.5969, + "grad_norm": 1.9936270713806152, + "learning_rate": 0.0002012987012987013, + "loss": 0.7199, "step": 15870 }, { "epoch": 2.9888951628082063, - "grad_norm": 14.325201034545898, - "learning_rate": 1.2066629023150762e-05, - "loss": 0.4321, + "grad_norm": 2.7041404247283936, + "learning_rate": 0.00020111048371917935, + "loss": 0.2948, "step": 15880 }, { "epoch": 2.9907773386034258, - "grad_norm": 3.7948050498962402, - "learning_rate": 1.2055335968379447e-05, - "loss": 0.6696, + "grad_norm": 3.060731887817383, + "learning_rate": 0.00020092226613965746, + "loss": 0.7353, "step": 15890 }, { "epoch": 2.9926595143986447, - "grad_norm": 10.829690933227539, - "learning_rate": 1.2044042913608131e-05, - "loss": 0.6765, + "grad_norm": 2.4952120780944824, + "learning_rate": 0.00020073404856013553, + "loss": 0.6693, "step": 15900 }, { "epoch": 2.994541690193864, - "grad_norm": 16.70258903503418, - "learning_rate": 1.2032749858836815e-05, - "loss": 0.5554, + "grad_norm": 3.944856643676758, + "learning_rate": 0.0002005458309806136, + "loss": 0.5033, "step": 15910 }, { "epoch": 2.996423865989083, - "grad_norm": 8.688410758972168, - "learning_rate": 1.20214568040655e-05, - "loss": 0.542, + "grad_norm": 2.602952480316162, + "learning_rate": 0.00020035761340109166, + "loss": 0.6788, "step": 15920 }, { "epoch": 2.9983060417843026, - "grad_norm": 7.034015655517578, - "learning_rate": 1.2010163749294184e-05, - "loss": 0.5749, + "grad_norm": 3.6875159740448, + "learning_rate": 0.00020016939582156974, + "loss": 0.8691, "step": 15930 }, { "epoch": 3.0, - "eval_accuracy": 0.9153333333333333, - "eval_loss": 0.32141387462615967, - "eval_runtime": 319.8636, - "eval_samples_per_second": 23.447, - "eval_steps_per_second": 2.932, + "eval_accuracy": 0.9173333333333333, + "eval_loss": 0.27245163917541504, + "eval_runtime": 467.1837, + "eval_samples_per_second": 16.054, + "eval_steps_per_second": 2.008, "step": 15939 }, { "epoch": 3.000188217579522, - "grad_norm": 13.433454513549805, - "learning_rate": 1.1998870694522869e-05, - "loss": 0.4825, + "grad_norm": 5.088705062866211, + "learning_rate": 0.0001999811782420478, + "loss": 0.5685, "step": 15940 }, { "epoch": 3.002070393374741, - "grad_norm": 18.549455642700195, - "learning_rate": 1.1987577639751553e-05, - "loss": 0.7692, + "grad_norm": 4.702372074127197, + "learning_rate": 0.0001997929606625259, + "loss": 0.8557, "step": 15950 }, { "epoch": 3.0039525691699605, - "grad_norm": 22.799373626708984, - "learning_rate": 1.1976284584980239e-05, - "loss": 0.6418, + "grad_norm": 2.6750690937042236, + "learning_rate": 0.00019960474308300396, + "loss": 0.6402, "step": 15960 }, { "epoch": 3.00583474496518, - "grad_norm": 21.21796989440918, - "learning_rate": 1.1964991530208922e-05, - "loss": 0.5274, + "grad_norm": 3.8041257858276367, + "learning_rate": 0.00019941652550348201, + "loss": 0.5554, "step": 15970 }, { "epoch": 3.007716920760399, - "grad_norm": 25.831186294555664, - "learning_rate": 1.1953698475437606e-05, - "loss": 0.5339, + "grad_norm": 2.85681414604187, + "learning_rate": 0.0001992283079239601, + "loss": 0.617, "step": 15980 }, { "epoch": 3.0095990965556183, - "grad_norm": 9.926560401916504, - "learning_rate": 1.194240542066629e-05, - "loss": 0.8981, + "grad_norm": 2.0742359161376953, + "learning_rate": 0.00019904009034443817, + "loss": 0.8394, "step": 15990 }, { "epoch": 3.0114812723508377, - "grad_norm": 22.619985580444336, - "learning_rate": 1.1931112365894975e-05, - "loss": 0.7825, + "grad_norm": 4.4268035888671875, + "learning_rate": 0.00019885187276491627, + "loss": 0.7344, "step": 16000 }, { "epoch": 3.0133634481460567, - "grad_norm": 12.743892669677734, - "learning_rate": 1.191981931112366e-05, - "loss": 0.4295, + "grad_norm": 3.6508936882019043, + "learning_rate": 0.00019866365518539432, + "loss": 0.4556, "step": 16010 }, { "epoch": 3.015245623941276, - "grad_norm": 13.726835250854492, - "learning_rate": 1.1908526256352344e-05, - "loss": 0.8777, + "grad_norm": 2.8904614448547363, + "learning_rate": 0.0001984754376058724, + "loss": 0.8619, "step": 16020 }, { "epoch": 3.0171277997364956, - "grad_norm": 14.803107261657715, - "learning_rate": 1.1897233201581028e-05, - "loss": 0.3934, + "grad_norm": 1.6528865098953247, + "learning_rate": 0.00019828722002635047, + "loss": 0.3956, "step": 16030 }, { "epoch": 3.0190099755317146, - "grad_norm": 41.224464416503906, - "learning_rate": 1.1885940146809712e-05, - "loss": 0.459, + "grad_norm": 6.028669834136963, + "learning_rate": 0.00019809900244682855, + "loss": 0.7254, "step": 16040 }, { "epoch": 3.020892151326934, - "grad_norm": 14.260553359985352, - "learning_rate": 1.1874647092038397e-05, - "loss": 0.3349, + "grad_norm": 0.40972083806991577, + "learning_rate": 0.0001979107848673066, + "loss": 0.2728, "step": 16050 }, { "epoch": 3.022774327122153, - "grad_norm": 2.4585812091827393, - "learning_rate": 1.1863354037267081e-05, - "loss": 0.8261, + "grad_norm": 1.0837814807891846, + "learning_rate": 0.00019772256728778467, + "loss": 0.8039, "step": 16060 }, { "epoch": 3.0246565029173724, - "grad_norm": 15.067172050476074, - "learning_rate": 1.1852060982495766e-05, - "loss": 0.4319, + "grad_norm": 1.60385000705719, + "learning_rate": 0.00019753434970826275, + "loss": 0.4548, "step": 16070 }, { "epoch": 3.026538678712592, - "grad_norm": 16.329282760620117, - "learning_rate": 1.184076792772445e-05, - "loss": 0.7257, + "grad_norm": 2.654540538787842, + "learning_rate": 0.00019734613212874083, + "loss": 0.6307, "step": 16080 }, { "epoch": 3.028420854507811, - "grad_norm": 12.787527084350586, - "learning_rate": 1.1829474872953134e-05, - "loss": 0.3546, + "grad_norm": 2.1555159091949463, + "learning_rate": 0.0001971579145492189, + "loss": 0.4333, "step": 16090 }, { "epoch": 3.0303030303030303, - "grad_norm": 3.467725992202759, - "learning_rate": 1.1818181818181819e-05, - "loss": 0.6743, + "grad_norm": 1.0611692667007446, + "learning_rate": 0.00019696969696969695, + "loss": 0.5241, "step": 16100 }, { "epoch": 3.0321852060982497, - "grad_norm": 26.06414794921875, - "learning_rate": 1.1806888763410503e-05, - "loss": 0.3207, + "grad_norm": 1.8001810312271118, + "learning_rate": 0.00019678147939017506, + "loss": 0.3581, "step": 16110 }, { "epoch": 3.0340673818934687, - "grad_norm": 18.53022003173828, - "learning_rate": 1.1795595708639187e-05, - "loss": 0.6154, + "grad_norm": 2.1027791500091553, + "learning_rate": 0.00019659326181065313, + "loss": 0.5984, "step": 16120 }, { "epoch": 3.035949557688688, - "grad_norm": 7.747043132781982, - "learning_rate": 1.1784302653867872e-05, - "loss": 0.2841, + "grad_norm": 1.0899429321289062, + "learning_rate": 0.0001964050442311312, + "loss": 0.373, "step": 16130 }, { "epoch": 3.0378317334839076, - "grad_norm": 8.015532493591309, - "learning_rate": 1.1773009599096555e-05, - "loss": 0.5264, + "grad_norm": 2.035482883453369, + "learning_rate": 0.00019621682665160926, + "loss": 0.5412, "step": 16140 }, { "epoch": 3.0397139092791265, - "grad_norm": 7.965919494628906, - "learning_rate": 1.176171654432524e-05, - "loss": 0.566, + "grad_norm": 3.0822765827178955, + "learning_rate": 0.00019602860907208734, + "loss": 0.4806, "step": 16150 }, { "epoch": 3.041596085074346, - "grad_norm": 0.4477425813674927, - "learning_rate": 1.1750423489553925e-05, - "loss": 0.5456, + "grad_norm": 0.3393348157405853, + "learning_rate": 0.0001958403914925654, + "loss": 0.556, "step": 16160 }, { "epoch": 3.0434782608695654, - "grad_norm": 6.554590702056885, - "learning_rate": 1.173913043478261e-05, - "loss": 1.0002, + "grad_norm": 4.573878288269043, + "learning_rate": 0.0001956521739130435, + "loss": 0.7014, "step": 16170 }, { "epoch": 3.0453604366647844, - "grad_norm": 12.476119041442871, - "learning_rate": 1.1727837380011292e-05, - "loss": 0.8468, + "grad_norm": 4.302673816680908, + "learning_rate": 0.00019546395633352154, + "loss": 0.668, "step": 16180 }, { "epoch": 3.047242612460004, - "grad_norm": 13.832260131835938, - "learning_rate": 1.1716544325239978e-05, - "loss": 0.7469, + "grad_norm": 5.19127893447876, + "learning_rate": 0.00019527573875399961, + "loss": 0.7206, "step": 16190 }, { "epoch": 3.049124788255223, - "grad_norm": 1.2430438995361328, - "learning_rate": 1.1705251270468663e-05, - "loss": 0.7722, + "grad_norm": 0.9039764404296875, + "learning_rate": 0.0001950875211744777, + "loss": 0.7466, "step": 16200 }, { "epoch": 3.0510069640504422, - "grad_norm": 14.875078201293945, - "learning_rate": 1.1693958215697347e-05, - "loss": 0.7945, + "grad_norm": 3.421377182006836, + "learning_rate": 0.00019489930359495577, + "loss": 0.9628, "step": 16210 }, { "epoch": 3.0528891398456617, - "grad_norm": 5.486425399780273, - "learning_rate": 1.1682665160926031e-05, - "loss": 0.696, + "grad_norm": 3.118133068084717, + "learning_rate": 0.00019471108601543387, + "loss": 0.86, "step": 16220 }, { "epoch": 3.0547713156408807, - "grad_norm": 12.678677558898926, - "learning_rate": 1.1671372106154714e-05, - "loss": 0.339, + "grad_norm": 3.2065300941467285, + "learning_rate": 0.00019452286843591192, + "loss": 0.3279, "step": 16230 }, { "epoch": 3.0566534914361, - "grad_norm": 21.625308990478516, - "learning_rate": 1.16600790513834e-05, - "loss": 0.6191, + "grad_norm": 2.9220197200775146, + "learning_rate": 0.00019433465085639, + "loss": 0.7055, "step": 16240 }, { "epoch": 3.0585356672313195, - "grad_norm": 41.28392791748047, - "learning_rate": 1.1648785996612084e-05, - "loss": 0.6735, + "grad_norm": 7.088646411895752, + "learning_rate": 0.00019414643327686807, + "loss": 0.7472, "step": 16250 }, { "epoch": 3.0604178430265385, - "grad_norm": 2.554927349090576, - "learning_rate": 1.1637492941840769e-05, - "loss": 0.7547, + "grad_norm": 1.323919653892517, + "learning_rate": 0.00019395821569734615, + "loss": 0.6519, "step": 16260 }, { "epoch": 3.062300018821758, - "grad_norm": 0.5858107805252075, - "learning_rate": 1.1626199887069451e-05, - "loss": 0.6705, + "grad_norm": 0.42989590764045715, + "learning_rate": 0.0001937699981178242, + "loss": 0.7908, "step": 16270 }, { "epoch": 3.0641821946169774, - "grad_norm": 35.934181213378906, - "learning_rate": 1.1614906832298136e-05, - "loss": 0.5326, + "grad_norm": 5.152632713317871, + "learning_rate": 0.00019358178053830227, + "loss": 0.5153, "step": 16280 }, { "epoch": 3.0660643704121964, - "grad_norm": 11.104010581970215, - "learning_rate": 1.1603613777526822e-05, - "loss": 0.8268, + "grad_norm": 4.753070831298828, + "learning_rate": 0.00019339356295878035, + "loss": 0.8615, "step": 16290 }, { "epoch": 3.067946546207416, - "grad_norm": 6.697833061218262, - "learning_rate": 1.1592320722755506e-05, - "loss": 0.6316, + "grad_norm": 2.295179605484009, + "learning_rate": 0.00019320534537925843, + "loss": 0.6854, "step": 16300 }, { "epoch": 3.0698287220026352, - "grad_norm": 22.01197052001953, - "learning_rate": 1.1581027667984189e-05, - "loss": 0.5534, + "grad_norm": 3.444908618927002, + "learning_rate": 0.00019301712779973648, + "loss": 0.5909, "step": 16310 }, { "epoch": 3.0717108977978542, - "grad_norm": 5.453395843505859, - "learning_rate": 1.1569734613212873e-05, - "loss": 0.7422, + "grad_norm": 1.3395254611968994, + "learning_rate": 0.00019282891022021455, + "loss": 0.7558, "step": 16320 }, { "epoch": 3.0735930735930737, - "grad_norm": 11.562372207641602, - "learning_rate": 1.155844155844156e-05, - "loss": 0.9676, + "grad_norm": 2.442232131958008, + "learning_rate": 0.00019264069264069266, + "loss": 0.9028, "step": 16330 }, { "epoch": 3.075475249388293, - "grad_norm": 12.74392032623291, - "learning_rate": 1.1547148503670244e-05, - "loss": 0.6064, + "grad_norm": 5.411843776702881, + "learning_rate": 0.00019245247506117073, + "loss": 0.6706, "step": 16340 }, { "epoch": 3.077357425183512, - "grad_norm": 20.787565231323242, - "learning_rate": 1.1535855448898928e-05, - "loss": 0.4245, + "grad_norm": 4.872277736663818, + "learning_rate": 0.0001922642574816488, + "loss": 0.5234, "step": 16350 }, { "epoch": 3.0792396009787315, - "grad_norm": 27.309429168701172, - "learning_rate": 1.1524562394127611e-05, - "loss": 0.7836, + "grad_norm": 4.058775424957275, + "learning_rate": 0.00019207603990212686, + "loss": 0.7612, "step": 16360 }, { "epoch": 3.0811217767739505, - "grad_norm": 4.994350433349609, - "learning_rate": 1.1513269339356295e-05, - "loss": 0.7593, + "grad_norm": 1.6236318349838257, + "learning_rate": 0.00019188782232260494, + "loss": 0.7788, "step": 16370 }, { "epoch": 3.08300395256917, - "grad_norm": 21.30853271484375, - "learning_rate": 1.1501976284584981e-05, - "loss": 0.6651, + "grad_norm": 2.5879671573638916, + "learning_rate": 0.000191699604743083, + "loss": 0.7316, "step": 16380 }, { "epoch": 3.0848861283643894, - "grad_norm": 36.19512939453125, - "learning_rate": 1.1490683229813666e-05, - "loss": 0.6865, + "grad_norm": 3.530181407928467, + "learning_rate": 0.0001915113871635611, + "loss": 0.5907, "step": 16390 }, { "epoch": 3.0867683041596083, - "grad_norm": 0.0781698077917099, - "learning_rate": 1.1479390175042348e-05, - "loss": 0.7204, + "grad_norm": 0.017271628603339195, + "learning_rate": 0.00019132316958403914, + "loss": 0.7044, "step": 16400 }, { "epoch": 3.0886504799548278, - "grad_norm": 6.799276351928711, - "learning_rate": 1.1468097120271033e-05, - "loss": 0.606, + "grad_norm": 1.9577358961105347, + "learning_rate": 0.00019113495200451721, + "loss": 0.6627, "step": 16410 }, { "epoch": 3.090532655750047, - "grad_norm": 3.7640774250030518, - "learning_rate": 1.1456804065499717e-05, - "loss": 0.6198, + "grad_norm": 1.9960639476776123, + "learning_rate": 0.0001909467344249953, + "loss": 0.6884, "step": 16420 }, { "epoch": 3.092414831545266, - "grad_norm": 16.729280471801758, - "learning_rate": 1.1445511010728403e-05, - "loss": 0.6511, + "grad_norm": 4.5003886222839355, + "learning_rate": 0.00019075851684547337, + "loss": 0.5766, "step": 16430 }, { "epoch": 3.0942970073404856, - "grad_norm": 24.438823699951172, - "learning_rate": 1.1434217955957086e-05, - "loss": 0.4929, + "grad_norm": 3.9035720825195312, + "learning_rate": 0.00019057029926595144, + "loss": 0.515, "step": 16440 }, { "epoch": 3.096179183135705, - "grad_norm": 5.958948612213135, - "learning_rate": 1.142292490118577e-05, - "loss": 0.5966, + "grad_norm": 1.8054392337799072, + "learning_rate": 0.00019038208168642952, + "loss": 0.6085, "step": 16450 }, { "epoch": 3.098061358930924, - "grad_norm": 1.2739803791046143, - "learning_rate": 1.1411631846414455e-05, - "loss": 0.9081, + "grad_norm": 3.321089267730713, + "learning_rate": 0.0001901938641069076, + "loss": 0.8556, "step": 16460 }, { "epoch": 3.0999435347261435, - "grad_norm": 0.8364112377166748, - "learning_rate": 1.140033879164314e-05, - "loss": 0.4441, + "grad_norm": 1.0997257232666016, + "learning_rate": 0.00019000564652738567, + "loss": 0.5064, "step": 16470 }, { "epoch": 3.1018257105213625, - "grad_norm": 11.922718048095703, - "learning_rate": 1.1389045736871825e-05, - "loss": 0.669, + "grad_norm": 1.8963079452514648, + "learning_rate": 0.00018981742894786375, + "loss": 0.7537, "step": 16480 }, { "epoch": 3.103707886316582, - "grad_norm": 12.48450756072998, - "learning_rate": 1.1377752682100508e-05, - "loss": 0.4577, + "grad_norm": 5.995891571044922, + "learning_rate": 0.0001896292113683418, + "loss": 0.571, "step": 16490 }, { "epoch": 3.1055900621118013, - "grad_norm": 22.12200164794922, - "learning_rate": 1.1366459627329192e-05, - "loss": 0.4632, + "grad_norm": 1.4215861558914185, + "learning_rate": 0.00018944099378881987, + "loss": 0.4311, "step": 16500 }, { "epoch": 3.1074722379070203, - "grad_norm": 21.450420379638672, - "learning_rate": 1.1355166572557877e-05, - "loss": 0.798, + "grad_norm": 2.2452361583709717, + "learning_rate": 0.00018925277620929795, + "loss": 0.6251, "step": 16510 }, { "epoch": 3.1093544137022397, - "grad_norm": 0.42957913875579834, - "learning_rate": 1.1343873517786563e-05, - "loss": 0.5381, + "grad_norm": 0.5021165609359741, + "learning_rate": 0.00018906455862977603, + "loss": 0.5959, "step": 16520 }, { "epoch": 3.111236589497459, - "grad_norm": 5.7693281173706055, - "learning_rate": 1.1332580463015245e-05, - "loss": 0.5663, + "grad_norm": 3.0534772872924805, + "learning_rate": 0.00018887634105025408, + "loss": 0.6973, "step": 16530 }, { "epoch": 3.113118765292678, - "grad_norm": 25.847362518310547, - "learning_rate": 1.132128740824393e-05, - "loss": 0.5048, + "grad_norm": 3.5288970470428467, + "learning_rate": 0.00018868812347073215, + "loss": 0.5277, "step": 16540 }, { "epoch": 3.1150009410878976, - "grad_norm": 3.086296319961548, - "learning_rate": 1.1309994353472614e-05, - "loss": 0.5231, + "grad_norm": 0.23889213800430298, + "learning_rate": 0.00018849990589121026, + "loss": 0.5217, "step": 16550 }, { "epoch": 3.116883116883117, - "grad_norm": 6.833352088928223, - "learning_rate": 1.12987012987013e-05, - "loss": 0.6046, + "grad_norm": 3.701779365539551, + "learning_rate": 0.00018831168831168833, + "loss": 0.652, "step": 16560 }, { "epoch": 3.118765292678336, - "grad_norm": 11.189995765686035, - "learning_rate": 1.1287408243929985e-05, - "loss": 0.4659, + "grad_norm": 2.4766476154327393, + "learning_rate": 0.0001881234707321664, + "loss": 0.4487, "step": 16570 }, { "epoch": 3.1206474684735555, - "grad_norm": 17.51889419555664, - "learning_rate": 1.1276115189158667e-05, - "loss": 0.7338, + "grad_norm": 4.216089725494385, + "learning_rate": 0.00018793525315264446, + "loss": 0.6731, "step": 16580 }, { "epoch": 3.122529644268775, - "grad_norm": 18.669206619262695, - "learning_rate": 1.1264822134387352e-05, - "loss": 0.3663, + "grad_norm": 1.8510828018188477, + "learning_rate": 0.00018774703557312254, + "loss": 0.4693, "step": 16590 }, { "epoch": 3.124411820063994, - "grad_norm": 4.161768436431885, - "learning_rate": 1.1253529079616036e-05, - "loss": 0.7614, + "grad_norm": 3.4371228218078613, + "learning_rate": 0.0001875588179936006, + "loss": 0.7364, "step": 16600 }, { "epoch": 3.1262939958592133, - "grad_norm": 17.522029876708984, - "learning_rate": 1.1242236024844722e-05, - "loss": 0.7639, + "grad_norm": 4.299537181854248, + "learning_rate": 0.0001873706004140787, + "loss": 0.6797, "step": 16610 }, { "epoch": 3.1281761716544327, - "grad_norm": 20.829774856567383, - "learning_rate": 1.1230942970073405e-05, - "loss": 0.5254, + "grad_norm": 2.3644449710845947, + "learning_rate": 0.00018718238283455674, + "loss": 0.4436, "step": 16620 }, { "epoch": 3.1300583474496517, - "grad_norm": 12.240787506103516, - "learning_rate": 1.121964991530209e-05, - "loss": 0.6138, + "grad_norm": 1.7090197801589966, + "learning_rate": 0.00018699416525503481, + "loss": 0.6329, "step": 16630 }, { "epoch": 3.131940523244871, - "grad_norm": 18.72231101989746, - "learning_rate": 1.1208356860530774e-05, - "loss": 0.8026, + "grad_norm": 5.4652791023254395, + "learning_rate": 0.0001868059476755129, + "loss": 0.7495, "step": 16640 }, { "epoch": 3.13382269904009, - "grad_norm": 8.982686042785645, - "learning_rate": 1.1197063805759458e-05, - "loss": 0.5638, + "grad_norm": 2.184948444366455, + "learning_rate": 0.00018661773009599097, + "loss": 0.6987, "step": 16650 }, { "epoch": 3.1357048748353096, - "grad_norm": 0.6803452968597412, - "learning_rate": 1.1185770750988142e-05, - "loss": 0.6837, + "grad_norm": 0.10448634624481201, + "learning_rate": 0.00018642951251646904, + "loss": 0.7188, "step": 16660 }, { "epoch": 3.137587050630529, - "grad_norm": 2.5247535705566406, - "learning_rate": 1.1174477696216827e-05, - "loss": 0.6615, + "grad_norm": 1.1686161756515503, + "learning_rate": 0.00018624129493694712, + "loss": 0.5243, "step": 16670 }, { "epoch": 3.139469226425748, - "grad_norm": 7.049471378326416, - "learning_rate": 1.1163184641445511e-05, - "loss": 0.5665, + "grad_norm": 1.7935549020767212, + "learning_rate": 0.0001860530773574252, + "loss": 0.5573, "step": 16680 }, { "epoch": 3.1413514022209674, - "grad_norm": 22.32665252685547, - "learning_rate": 1.1151891586674196e-05, - "loss": 0.878, + "grad_norm": 2.2225093841552734, + "learning_rate": 0.00018586485977790327, + "loss": 0.788, "step": 16690 }, { "epoch": 3.143233578016187, - "grad_norm": 18.906843185424805, - "learning_rate": 1.1140598531902882e-05, - "loss": 0.6834, + "grad_norm": 4.060264587402344, + "learning_rate": 0.00018567664219838135, + "loss": 0.5902, "step": 16700 }, { "epoch": 3.145115753811406, - "grad_norm": 0.7685168385505676, - "learning_rate": 1.1129305477131564e-05, - "loss": 0.6544, + "grad_norm": 0.9784095287322998, + "learning_rate": 0.0001854884246188594, + "loss": 0.7869, "step": 16710 }, { "epoch": 3.1469979296066253, - "grad_norm": 0.6841329336166382, - "learning_rate": 1.1118012422360249e-05, - "loss": 0.5769, + "grad_norm": 0.18420638144016266, + "learning_rate": 0.00018530020703933747, + "loss": 0.638, "step": 16720 }, { "epoch": 3.1488801054018447, - "grad_norm": 9.697307586669922, - "learning_rate": 1.1106719367588933e-05, - "loss": 0.5116, + "grad_norm": 2.6315994262695312, + "learning_rate": 0.00018511198945981555, + "loss": 0.5666, "step": 16730 }, { "epoch": 3.1507622811970637, - "grad_norm": 16.02259635925293, - "learning_rate": 1.1095426312817617e-05, - "loss": 0.5188, + "grad_norm": 3.2660489082336426, + "learning_rate": 0.00018492377188029363, + "loss": 0.5412, "step": 16740 }, { "epoch": 3.152644456992283, - "grad_norm": 5.338298797607422, - "learning_rate": 1.1084133258046302e-05, - "loss": 0.8259, + "grad_norm": 1.4657108783721924, + "learning_rate": 0.00018473555430077168, + "loss": 0.8718, "step": 16750 }, { "epoch": 3.1545266327875026, - "grad_norm": 18.724058151245117, - "learning_rate": 1.1072840203274986e-05, - "loss": 0.6036, + "grad_norm": 3.5344760417938232, + "learning_rate": 0.00018454733672124975, + "loss": 0.4465, "step": 16760 }, { "epoch": 3.1564088085827215, - "grad_norm": 5.406846523284912, - "learning_rate": 1.106154714850367e-05, - "loss": 0.8246, + "grad_norm": 1.9240225553512573, + "learning_rate": 0.00018435911914172786, + "loss": 0.8639, "step": 16770 }, { "epoch": 3.158290984377941, - "grad_norm": 8.084793090820312, - "learning_rate": 1.1050254093732355e-05, - "loss": 0.8828, + "grad_norm": 3.1279828548431396, + "learning_rate": 0.00018417090156220593, + "loss": 0.8417, "step": 16780 }, { "epoch": 3.16017316017316, - "grad_norm": 24.73105812072754, - "learning_rate": 1.1038961038961038e-05, - "loss": 0.8427, + "grad_norm": 4.149167060852051, + "learning_rate": 0.00018398268398268398, + "loss": 0.8718, "step": 16790 }, { "epoch": 3.1620553359683794, - "grad_norm": 21.116586685180664, - "learning_rate": 1.1027667984189724e-05, - "loss": 0.6331, + "grad_norm": 5.092727184295654, + "learning_rate": 0.00018379446640316206, + "loss": 0.5612, "step": 16800 }, { "epoch": 3.163937511763599, - "grad_norm": 15.968790054321289, - "learning_rate": 1.1016374929418408e-05, - "loss": 0.562, + "grad_norm": 3.1172432899475098, + "learning_rate": 0.00018360624882364014, + "loss": 0.5667, "step": 16810 }, { "epoch": 3.165819687558818, - "grad_norm": 13.15872859954834, - "learning_rate": 1.1005081874647093e-05, - "loss": 0.4653, + "grad_norm": 2.793973922729492, + "learning_rate": 0.0001834180312441182, + "loss": 0.5911, "step": 16820 }, { "epoch": 3.1677018633540373, - "grad_norm": 13.242486953735352, - "learning_rate": 1.0993788819875777e-05, - "loss": 0.3991, + "grad_norm": 1.6922173500061035, + "learning_rate": 0.0001832298136645963, + "loss": 0.4133, "step": 16830 }, { "epoch": 3.1695840391492567, - "grad_norm": 12.819069862365723, - "learning_rate": 1.0982495765104461e-05, - "loss": 0.7846, + "grad_norm": 4.027005672454834, + "learning_rate": 0.00018304159608507434, + "loss": 0.7502, "step": 16840 }, { "epoch": 3.1714662149444757, - "grad_norm": 13.536721229553223, - "learning_rate": 1.0971202710333146e-05, - "loss": 0.9832, + "grad_norm": 2.4809811115264893, + "learning_rate": 0.00018285337850555241, + "loss": 0.8741, "step": 16850 }, { "epoch": 3.173348390739695, - "grad_norm": 2.7697112560272217, - "learning_rate": 1.095990965556183e-05, - "loss": 0.4785, + "grad_norm": 2.1828391551971436, + "learning_rate": 0.0001826651609260305, + "loss": 0.3988, "step": 16860 }, { "epoch": 3.1752305665349145, - "grad_norm": 5.944636821746826, - "learning_rate": 1.0948616600790514e-05, - "loss": 0.6094, + "grad_norm": 5.239040374755859, + "learning_rate": 0.00018247694334650857, + "loss": 0.7494, "step": 16870 }, { "epoch": 3.1771127423301335, - "grad_norm": 2.6697866916656494, - "learning_rate": 1.0937323546019197e-05, - "loss": 0.3562, + "grad_norm": 0.8617979288101196, + "learning_rate": 0.00018228872576698662, + "loss": 0.4216, "step": 16880 }, { "epoch": 3.178994918125353, - "grad_norm": 7.476838111877441, - "learning_rate": 1.0926030491247883e-05, - "loss": 0.8976, + "grad_norm": 2.9077141284942627, + "learning_rate": 0.00018210050818746472, + "loss": 1.0169, "step": 16890 }, { "epoch": 3.1808770939205724, - "grad_norm": 6.028730869293213, - "learning_rate": 1.0914737436476568e-05, - "loss": 0.4407, + "grad_norm": 2.365379810333252, + "learning_rate": 0.0001819122906079428, + "loss": 0.5274, "step": 16900 }, { "epoch": 3.1827592697157914, - "grad_norm": 1.0638821125030518, - "learning_rate": 1.0903444381705252e-05, - "loss": 0.6061, + "grad_norm": 1.3251471519470215, + "learning_rate": 0.00018172407302842087, + "loss": 0.6267, "step": 16910 }, { "epoch": 3.184641445511011, - "grad_norm": 39.25739669799805, - "learning_rate": 1.0892151326933935e-05, - "loss": 0.6129, + "grad_norm": 4.387365818023682, + "learning_rate": 0.00018153585544889892, + "loss": 0.612, "step": 16920 }, { "epoch": 3.18652362130623, - "grad_norm": 26.864404678344727, - "learning_rate": 1.0880858272162619e-05, - "loss": 0.6046, + "grad_norm": 5.2866387367248535, + "learning_rate": 0.000181347637869377, + "loss": 0.5251, "step": 16930 }, { "epoch": 3.1884057971014492, - "grad_norm": 1.348874568939209, - "learning_rate": 1.0869565217391305e-05, - "loss": 0.6278, + "grad_norm": 2.2605485916137695, + "learning_rate": 0.00018115942028985507, + "loss": 0.5894, "step": 16940 }, { "epoch": 3.1902879728966687, - "grad_norm": 13.918986320495605, - "learning_rate": 1.085827216261999e-05, - "loss": 0.8359, + "grad_norm": 2.464541435241699, + "learning_rate": 0.00018097120271033315, + "loss": 0.6772, "step": 16950 }, { "epoch": 3.1921701486918876, - "grad_norm": 12.762542724609375, - "learning_rate": 1.0846979107848674e-05, - "loss": 0.4523, + "grad_norm": 0.5530261993408203, + "learning_rate": 0.00018078298513081123, + "loss": 0.4976, "step": 16960 }, { "epoch": 3.194052324487107, - "grad_norm": 0.14461807906627655, - "learning_rate": 1.0835686053077357e-05, - "loss": 0.3795, + "grad_norm": 0.07630673050880432, + "learning_rate": 0.00018059476755128928, + "loss": 0.3104, "step": 16970 }, { "epoch": 3.1959345002823265, - "grad_norm": 12.9683256149292, - "learning_rate": 1.0824392998306043e-05, - "loss": 0.6801, + "grad_norm": 1.6199790239334106, + "learning_rate": 0.00018040654997176735, + "loss": 0.719, "step": 16980 }, { "epoch": 3.1978166760775455, - "grad_norm": 18.373062133789062, - "learning_rate": 1.0813099943534727e-05, - "loss": 0.5675, + "grad_norm": 1.9143341779708862, + "learning_rate": 0.00018021833239224543, + "loss": 0.7699, "step": 16990 }, { "epoch": 3.199698851872765, - "grad_norm": 6.4098310470581055, - "learning_rate": 1.0801806888763411e-05, - "loss": 0.7209, + "grad_norm": 2.9342286586761475, + "learning_rate": 0.00018003011481272353, + "loss": 0.8464, "step": 17000 }, { "epoch": 3.2015810276679844, - "grad_norm": 1.8334890604019165, - "learning_rate": 1.0790513833992094e-05, - "loss": 0.7512, + "grad_norm": 2.5452444553375244, + "learning_rate": 0.00017984189723320158, + "loss": 0.7062, "step": 17010 }, { "epoch": 3.2034632034632033, - "grad_norm": 12.161665916442871, - "learning_rate": 1.0779220779220778e-05, - "loss": 0.8162, + "grad_norm": 2.8599061965942383, + "learning_rate": 0.00017965367965367966, + "loss": 0.7402, "step": 17020 }, { "epoch": 3.2053453792584228, - "grad_norm": 14.931073188781738, - "learning_rate": 1.0767927724449465e-05, - "loss": 0.3628, + "grad_norm": 3.1525862216949463, + "learning_rate": 0.00017946546207415774, + "loss": 0.4111, "step": 17030 }, { "epoch": 3.207227555053642, - "grad_norm": 31.593656539916992, - "learning_rate": 1.0756634669678149e-05, - "loss": 0.8832, + "grad_norm": 5.667859077453613, + "learning_rate": 0.0001792772444946358, + "loss": 0.8955, "step": 17040 }, { "epoch": 3.209109730848861, - "grad_norm": 15.769742965698242, - "learning_rate": 1.0745341614906832e-05, - "loss": 0.5008, + "grad_norm": 4.671546936035156, + "learning_rate": 0.00017908902691511386, + "loss": 0.5498, "step": 17050 }, { "epoch": 3.2109919066440806, - "grad_norm": 14.6060152053833, - "learning_rate": 1.0734048560135516e-05, - "loss": 0.6849, + "grad_norm": 10.911493301391602, + "learning_rate": 0.00017890080933559194, + "loss": 0.7891, "step": 17060 }, { "epoch": 3.2128740824393, - "grad_norm": 9.041996002197266, - "learning_rate": 1.0722755505364202e-05, - "loss": 0.8141, + "grad_norm": 2.427689790725708, + "learning_rate": 0.00017871259175607001, + "loss": 0.9593, "step": 17070 }, { "epoch": 3.214756258234519, - "grad_norm": 17.594465255737305, - "learning_rate": 1.0711462450592886e-05, - "loss": 0.5805, + "grad_norm": 3.4500486850738525, + "learning_rate": 0.0001785243741765481, + "loss": 0.5289, "step": 17080 }, { "epoch": 3.2166384340297385, - "grad_norm": 26.354915618896484, - "learning_rate": 1.070016939582157e-05, - "loss": 0.6139, + "grad_norm": 5.073642730712891, + "learning_rate": 0.00017833615659702617, + "loss": 0.7019, "step": 17090 }, { "epoch": 3.2185206098249575, - "grad_norm": 20.24001693725586, - "learning_rate": 1.0688876341050254e-05, - "loss": 0.726, + "grad_norm": 4.536356449127197, + "learning_rate": 0.00017814793901750422, + "loss": 0.6364, "step": 17100 }, { "epoch": 3.220402785620177, - "grad_norm": 9.388453483581543, - "learning_rate": 1.0677583286278938e-05, - "loss": 0.837, + "grad_norm": 3.2975924015045166, + "learning_rate": 0.00017795972143798232, + "loss": 0.8828, "step": 17110 }, { "epoch": 3.2222849614153963, - "grad_norm": 33.33871841430664, - "learning_rate": 1.0666290231507624e-05, - "loss": 0.6729, + "grad_norm": 4.286025047302246, + "learning_rate": 0.0001777715038584604, + "loss": 0.5538, "step": 17120 }, { "epoch": 3.2241671372106153, - "grad_norm": 14.027726173400879, - "learning_rate": 1.0654997176736308e-05, - "loss": 0.6477, + "grad_norm": 2.76507830619812, + "learning_rate": 0.00017758328627893847, + "loss": 0.5498, "step": 17130 }, { "epoch": 3.2260493130058348, - "grad_norm": 10.19504451751709, - "learning_rate": 1.0643704121964991e-05, - "loss": 0.3972, + "grad_norm": 2.945713996887207, + "learning_rate": 0.00017739506869941652, + "loss": 0.5465, "step": 17140 }, { "epoch": 3.227931488801054, - "grad_norm": 20.70015525817871, - "learning_rate": 1.0632411067193675e-05, - "loss": 0.6082, + "grad_norm": 6.803636074066162, + "learning_rate": 0.0001772068511198946, + "loss": 0.6545, "step": 17150 }, { "epoch": 3.229813664596273, - "grad_norm": 6.828171730041504, - "learning_rate": 1.062111801242236e-05, - "loss": 0.7273, + "grad_norm": 4.420771598815918, + "learning_rate": 0.00017701863354037267, + "loss": 0.7304, "step": 17160 }, { "epoch": 3.2316958403914926, - "grad_norm": 13.635649681091309, - "learning_rate": 1.0609824957651046e-05, - "loss": 0.6924, + "grad_norm": 2.6977665424346924, + "learning_rate": 0.00017683041596085075, + "loss": 0.7973, "step": 17170 }, { "epoch": 3.233578016186712, - "grad_norm": 5.508321285247803, - "learning_rate": 1.059853190287973e-05, - "loss": 0.5075, + "grad_norm": 1.1652570962905884, + "learning_rate": 0.00017664219838132883, + "loss": 0.4759, "step": 17180 }, { "epoch": 3.235460191981931, - "grad_norm": 20.685861587524414, - "learning_rate": 1.0587238848108413e-05, - "loss": 0.5851, + "grad_norm": 1.8155357837677002, + "learning_rate": 0.00017645398080180688, + "loss": 0.7366, "step": 17190 }, { "epoch": 3.2373423677771505, - "grad_norm": 14.502724647521973, - "learning_rate": 1.0575945793337097e-05, - "loss": 0.4921, + "grad_norm": 2.0348408222198486, + "learning_rate": 0.00017626576322228495, + "loss": 0.5368, "step": 17200 }, { "epoch": 3.2392245435723694, - "grad_norm": 8.47658634185791, - "learning_rate": 1.0564652738565783e-05, - "loss": 0.6247, + "grad_norm": 4.619930744171143, + "learning_rate": 0.00017607754564276303, + "loss": 0.6692, "step": 17210 }, { "epoch": 3.241106719367589, - "grad_norm": 10.943485260009766, - "learning_rate": 1.0553359683794468e-05, - "loss": 0.8117, + "grad_norm": 2.546478271484375, + "learning_rate": 0.00017588932806324113, + "loss": 0.7912, "step": 17220 }, { "epoch": 3.2429888951628083, - "grad_norm": 10.894013404846191, - "learning_rate": 1.054206662902315e-05, - "loss": 0.494, + "grad_norm": 3.549199342727661, + "learning_rate": 0.00017570111048371918, + "loss": 0.5598, "step": 17230 }, { "epoch": 3.2448710709580273, - "grad_norm": 12.969403266906738, - "learning_rate": 1.0530773574251835e-05, - "loss": 0.4713, + "grad_norm": 3.683444023132324, + "learning_rate": 0.00017551289290419726, + "loss": 0.6625, "step": 17240 }, { "epoch": 3.2467532467532467, - "grad_norm": 13.456480979919434, - "learning_rate": 1.051948051948052e-05, - "loss": 0.6888, + "grad_norm": 2.0694761276245117, + "learning_rate": 0.00017532467532467534, + "loss": 0.7337, "step": 17250 }, { "epoch": 3.248635422548466, - "grad_norm": 12.490468978881836, - "learning_rate": 1.0508187464709205e-05, - "loss": 0.6876, + "grad_norm": 5.7573394775390625, + "learning_rate": 0.0001751364577451534, + "loss": 0.641, "step": 17260 }, { "epoch": 3.250517598343685, - "grad_norm": 13.233989715576172, - "learning_rate": 1.0496894409937888e-05, - "loss": 0.5344, + "grad_norm": 2.7505602836608887, + "learning_rate": 0.00017494824016563146, + "loss": 0.5127, "step": 17270 }, { "epoch": 3.2523997741389046, - "grad_norm": 26.581886291503906, - "learning_rate": 1.0485601355166572e-05, - "loss": 0.6548, + "grad_norm": 10.71851634979248, + "learning_rate": 0.00017476002258610954, + "loss": 0.6569, "step": 17280 }, { "epoch": 3.254281949934124, - "grad_norm": 5.6854777336120605, - "learning_rate": 1.0474308300395257e-05, - "loss": 0.3306, + "grad_norm": 2.259294033050537, + "learning_rate": 0.00017457180500658761, + "loss": 0.3766, "step": 17290 }, { "epoch": 3.256164125729343, - "grad_norm": 15.864883422851562, - "learning_rate": 1.0463015245623941e-05, - "loss": 0.3117, + "grad_norm": 5.869109630584717, + "learning_rate": 0.0001743835874270657, + "loss": 0.5871, "step": 17300 }, { "epoch": 3.2580463015245624, - "grad_norm": 1.5253409147262573, - "learning_rate": 1.0451722190852627e-05, - "loss": 0.6778, + "grad_norm": 4.830927848815918, + "learning_rate": 0.00017419536984754377, + "loss": 0.6989, "step": 17310 }, { "epoch": 3.259928477319782, - "grad_norm": 13.313026428222656, - "learning_rate": 1.044042913608131e-05, - "loss": 0.4523, + "grad_norm": 2.818700075149536, + "learning_rate": 0.00017400715226802182, + "loss": 0.4231, "step": 17320 }, { "epoch": 3.261810653115001, - "grad_norm": 6.4470343589782715, - "learning_rate": 1.0429136081309994e-05, - "loss": 0.4609, + "grad_norm": 3.7204909324645996, + "learning_rate": 0.00017381893468849992, + "loss": 0.5142, "step": 17330 }, { "epoch": 3.2636928289102203, - "grad_norm": 6.336509704589844, - "learning_rate": 1.0417843026538679e-05, - "loss": 0.398, + "grad_norm": 1.9712015390396118, + "learning_rate": 0.000173630717108978, + "loss": 0.6661, "step": 17340 }, { "epoch": 3.2655750047054397, - "grad_norm": 17.98333740234375, - "learning_rate": 1.0406549971767365e-05, - "loss": 0.8517, + "grad_norm": 4.433485984802246, + "learning_rate": 0.00017344249952945607, + "loss": 0.808, "step": 17350 }, { "epoch": 3.2674571805006587, - "grad_norm": 22.246747970581055, - "learning_rate": 1.0395256916996047e-05, - "loss": 0.5689, + "grad_norm": 3.4073405265808105, + "learning_rate": 0.00017325428194993412, + "loss": 0.5962, "step": 17360 }, { "epoch": 3.269339356295878, - "grad_norm": 16.68685531616211, - "learning_rate": 1.0383963862224732e-05, - "loss": 0.8071, + "grad_norm": 2.7334978580474854, + "learning_rate": 0.0001730660643704122, + "loss": 0.8815, "step": 17370 }, { "epoch": 3.271221532091097, - "grad_norm": 25.194631576538086, - "learning_rate": 1.0372670807453416e-05, - "loss": 0.6331, + "grad_norm": 2.573606014251709, + "learning_rate": 0.00017287784679089027, + "loss": 0.6079, "step": 17380 }, { "epoch": 3.2731037078863165, - "grad_norm": 10.572707176208496, - "learning_rate": 1.03613777526821e-05, + "grad_norm": 2.335155487060547, + "learning_rate": 0.00017268962921136835, "loss": 0.4869, "step": 17390 }, { "epoch": 3.274985883681536, - "grad_norm": 7.161264896392822, - "learning_rate": 1.0350084697910785e-05, - "loss": 0.4585, + "grad_norm": 4.357646942138672, + "learning_rate": 0.0001725014116318464, + "loss": 0.6921, "step": 17400 }, { "epoch": 3.276868059476755, - "grad_norm": 13.183558464050293, - "learning_rate": 1.033879164313947e-05, - "loss": 0.8198, + "grad_norm": 5.32853364944458, + "learning_rate": 0.00017231319405232448, + "loss": 0.8342, "step": 17410 }, { "epoch": 3.2787502352719744, - "grad_norm": 21.05937385559082, - "learning_rate": 1.0327498588368154e-05, - "loss": 0.4036, + "grad_norm": 2.327008008956909, + "learning_rate": 0.00017212497647280255, + "loss": 0.5999, "step": 17420 }, { "epoch": 3.280632411067194, - "grad_norm": 5.843810558319092, - "learning_rate": 1.0316205533596838e-05, - "loss": 0.5591, + "grad_norm": 2.867143154144287, + "learning_rate": 0.00017193675889328063, + "loss": 0.6467, "step": 17430 }, { "epoch": 3.282514586862413, - "grad_norm": 3.102691888809204, - "learning_rate": 1.0304912478825523e-05, - "loss": 0.511, + "grad_norm": 0.24477972090244293, + "learning_rate": 0.00017174854131375873, + "loss": 0.5012, "step": 17440 }, { "epoch": 3.2843967626576323, - "grad_norm": 22.08820915222168, - "learning_rate": 1.0293619424054207e-05, - "loss": 0.5755, + "grad_norm": 4.458093643188477, + "learning_rate": 0.00017156032373423678, + "loss": 0.5334, "step": 17450 }, { "epoch": 3.2862789384528517, - "grad_norm": 6.78153657913208, - "learning_rate": 1.0282326369282891e-05, - "loss": 0.3316, + "grad_norm": 2.012943744659424, + "learning_rate": 0.00017137210615471486, + "loss": 0.4542, "step": 17460 }, { "epoch": 3.2881611142480707, - "grad_norm": 24.446062088012695, - "learning_rate": 1.0271033314511576e-05, - "loss": 0.4758, + "grad_norm": 2.415400266647339, + "learning_rate": 0.00017118388857519294, + "loss": 0.572, "step": 17470 }, { "epoch": 3.29004329004329, - "grad_norm": 23.952836990356445, - "learning_rate": 1.025974025974026e-05, - "loss": 0.622, + "grad_norm": 1.1272308826446533, + "learning_rate": 0.000170995670995671, + "loss": 0.5705, "step": 17480 }, { "epoch": 3.291925465838509, - "grad_norm": 39.4639892578125, - "learning_rate": 1.0248447204968944e-05, - "loss": 0.5434, + "grad_norm": 5.979650020599365, + "learning_rate": 0.00017080745341614906, + "loss": 0.5985, "step": 17490 }, { "epoch": 3.2938076416337285, - "grad_norm": 17.225452423095703, - "learning_rate": 1.0237154150197629e-05, - "loss": 0.9825, + "grad_norm": 3.5249295234680176, + "learning_rate": 0.00017061923583662714, + "loss": 0.9201, "step": 17500 }, { "epoch": 3.295689817428948, - "grad_norm": 9.208147048950195, - "learning_rate": 1.0225861095426313e-05, - "loss": 0.6339, + "grad_norm": 4.946028709411621, + "learning_rate": 0.00017043101825710521, + "loss": 0.6594, "step": 17510 }, { "epoch": 3.2975719932241674, - "grad_norm": 30.452302932739258, - "learning_rate": 1.0214568040654998e-05, - "loss": 0.6128, + "grad_norm": 5.57255744934082, + "learning_rate": 0.0001702428006775833, + "loss": 0.6134, "step": 17520 }, { "epoch": 3.2994541690193864, - "grad_norm": 32.60814666748047, - "learning_rate": 1.020327498588368e-05, - "loss": 0.5782, + "grad_norm": 3.510176658630371, + "learning_rate": 0.00017005458309806134, + "loss": 0.6441, "step": 17530 }, { "epoch": 3.301336344814606, - "grad_norm": 16.545825958251953, - "learning_rate": 1.0191981931112366e-05, - "loss": 0.5999, + "grad_norm": 4.090064525604248, + "learning_rate": 0.00016986636551853942, + "loss": 0.5774, "step": 17540 }, { "epoch": 3.303218520609825, - "grad_norm": 15.124717712402344, - "learning_rate": 1.018068887634105e-05, - "loss": 0.4035, + "grad_norm": 1.2690385580062866, + "learning_rate": 0.00016967814793901752, + "loss": 0.4282, "step": 17550 }, { "epoch": 3.3051006964050442, - "grad_norm": 6.470951080322266, - "learning_rate": 1.0169395821569735e-05, - "loss": 0.7927, + "grad_norm": 1.3262484073638916, + "learning_rate": 0.0001694899303594956, + "loss": 0.8544, "step": 17560 }, { "epoch": 3.3069828722002637, - "grad_norm": 16.971155166625977, - "learning_rate": 1.015810276679842e-05, - "loss": 0.6804, + "grad_norm": 3.6288297176361084, + "learning_rate": 0.00016930171277997367, + "loss": 0.759, "step": 17570 }, { "epoch": 3.3088650479954826, - "grad_norm": 29.655359268188477, - "learning_rate": 1.0146809712027104e-05, - "loss": 0.7321, + "grad_norm": 5.26113748550415, + "learning_rate": 0.00016911349520045172, + "loss": 0.6652, "step": 17580 }, { "epoch": 3.310747223790702, - "grad_norm": 28.08059310913086, - "learning_rate": 1.0135516657255788e-05, - "loss": 0.7635, + "grad_norm": 4.848659992218018, + "learning_rate": 0.0001689252776209298, + "loss": 0.7512, "step": 17590 }, { "epoch": 3.3126293995859215, - "grad_norm": 0.4805198013782501, - "learning_rate": 1.0124223602484473e-05, - "loss": 0.4791, + "grad_norm": 0.0900026485323906, + "learning_rate": 0.00016873706004140787, + "loss": 0.3456, "step": 17600 }, { "epoch": 3.3145115753811405, - "grad_norm": 30.67386245727539, - "learning_rate": 1.0112930547713157e-05, - "loss": 0.6166, + "grad_norm": 4.362743377685547, + "learning_rate": 0.00016854884246188595, + "loss": 0.7094, "step": 17610 }, { "epoch": 3.31639375117636, - "grad_norm": 30.00206756591797, - "learning_rate": 1.010163749294184e-05, - "loss": 0.9969, + "grad_norm": 2.96457576751709, + "learning_rate": 0.000168360624882364, + "loss": 0.987, "step": 17620 }, { "epoch": 3.3182759269715794, - "grad_norm": 11.875704765319824, - "learning_rate": 1.0090344438170526e-05, - "loss": 0.4861, + "grad_norm": 3.4136481285095215, + "learning_rate": 0.00016817240730284208, + "loss": 0.5137, "step": 17630 }, { "epoch": 3.3201581027667983, - "grad_norm": 59.30400848388672, - "learning_rate": 1.007905138339921e-05, - "loss": 0.6634, + "grad_norm": 5.220181941986084, + "learning_rate": 0.00016798418972332015, + "loss": 0.8406, "step": 17640 }, { "epoch": 3.322040278562018, - "grad_norm": 7.319840908050537, - "learning_rate": 1.0067758328627895e-05, - "loss": 0.1843, + "grad_norm": 3.106947898864746, + "learning_rate": 0.00016779597214379823, + "loss": 0.2564, "step": 17650 }, { "epoch": 3.3239224543572368, - "grad_norm": 1.659640908241272, - "learning_rate": 1.0056465273856577e-05, - "loss": 0.8501, + "grad_norm": 0.1169840395450592, + "learning_rate": 0.0001676077545642763, + "loss": 0.7908, "step": 17660 }, { "epoch": 3.325804630152456, - "grad_norm": 15.587262153625488, - "learning_rate": 1.0045172219085262e-05, - "loss": 0.5065, + "grad_norm": 3.5396854877471924, + "learning_rate": 0.00016741953698475438, + "loss": 0.5651, "step": 17670 }, { "epoch": 3.3276868059476756, - "grad_norm": 25.570293426513672, - "learning_rate": 1.0033879164313948e-05, - "loss": 0.8362, + "grad_norm": 4.229373931884766, + "learning_rate": 0.00016723131940523246, + "loss": 0.6979, "step": 17680 }, { "epoch": 3.3295689817428946, - "grad_norm": 5.94968318939209, - "learning_rate": 1.0022586109542632e-05, - "loss": 0.7938, + "grad_norm": 2.629528045654297, + "learning_rate": 0.00016704310182571054, + "loss": 0.6957, "step": 17690 }, { "epoch": 3.331451157538114, - "grad_norm": 5.780099868774414, - "learning_rate": 1.0011293054771316e-05, - "loss": 0.6672, + "grad_norm": 0.5072752237319946, + "learning_rate": 0.0001668548842461886, + "loss": 0.6849, "step": 17700 }, { "epoch": 3.3333333333333335, - "grad_norm": 32.476585388183594, - "learning_rate": 9.999999999999999e-06, - "loss": 0.5259, + "grad_norm": 8.92314338684082, + "learning_rate": 0.00016666666666666666, + "loss": 0.6571, "step": 17710 }, { "epoch": 3.3352155091285525, - "grad_norm": 6.785778999328613, - "learning_rate": 9.988706945228685e-06, - "loss": 0.719, + "grad_norm": 1.982568621635437, + "learning_rate": 0.00016647844908714474, + "loss": 0.7167, "step": 17720 }, { "epoch": 3.337097684923772, - "grad_norm": 16.405250549316406, - "learning_rate": 9.97741389045737e-06, - "loss": 0.8035, + "grad_norm": 5.065396308898926, + "learning_rate": 0.00016629023150762281, + "loss": 0.9074, "step": 17730 }, { "epoch": 3.3389798607189913, - "grad_norm": 8.930052757263184, - "learning_rate": 9.966120835686054e-06, - "loss": 0.7513, + "grad_norm": 2.799494504928589, + "learning_rate": 0.0001661020139281009, + "loss": 0.9093, "step": 17740 }, { "epoch": 3.3408620365142103, - "grad_norm": 0.5434146523475647, - "learning_rate": 9.954827780914737e-06, - "loss": 0.5323, + "grad_norm": 0.08490809053182602, + "learning_rate": 0.00016591379634857894, + "loss": 0.5279, "step": 17750 }, { "epoch": 3.3427442123094298, - "grad_norm": 12.213115692138672, - "learning_rate": 9.943534726143421e-06, - "loss": 0.5251, + "grad_norm": 5.632839679718018, + "learning_rate": 0.00016572557876905702, + "loss": 0.729, "step": 17760 }, { "epoch": 3.3446263881046487, - "grad_norm": 18.9610595703125, - "learning_rate": 9.932241671372107e-06, - "loss": 0.833, + "grad_norm": 3.7609736919403076, + "learning_rate": 0.00016553736118953512, + "loss": 0.8124, "step": 17770 }, { "epoch": 3.346508563899868, - "grad_norm": 8.55979061126709, - "learning_rate": 9.920948616600791e-06, - "loss": 0.9362, + "grad_norm": 2.617806911468506, + "learning_rate": 0.0001653491436100132, + "loss": 1.064, "step": 17780 }, { "epoch": 3.3483907396950876, - "grad_norm": 1.9761569499969482, - "learning_rate": 9.909655561829474e-06, - "loss": 0.5021, + "grad_norm": 1.1742140054702759, + "learning_rate": 0.00016516092603049125, + "loss": 0.4695, "step": 17790 }, { "epoch": 3.350272915490307, - "grad_norm": 16.7060546875, - "learning_rate": 9.898362507058159e-06, - "loss": 0.5859, + "grad_norm": 2.0559253692626953, + "learning_rate": 0.00016497270845096932, + "loss": 0.4638, "step": 17800 }, { "epoch": 3.352155091285526, - "grad_norm": 16.749752044677734, - "learning_rate": 9.887069452286843e-06, - "loss": 0.4982, + "grad_norm": 2.232964277267456, + "learning_rate": 0.0001647844908714474, + "loss": 0.4741, "step": 17810 }, { "epoch": 3.3540372670807455, - "grad_norm": 3.789137601852417, - "learning_rate": 9.875776397515529e-06, - "loss": 0.4616, + "grad_norm": 2.2919833660125732, + "learning_rate": 0.00016459627329192547, + "loss": 0.5339, "step": 17820 }, { "epoch": 3.3559194428759644, - "grad_norm": 13.295493125915527, - "learning_rate": 9.864483342744213e-06, - "loss": 0.6548, + "grad_norm": 2.7423524856567383, + "learning_rate": 0.00016440805571240355, + "loss": 0.4588, "step": 17830 }, { "epoch": 3.357801618671184, - "grad_norm": 25.58221435546875, - "learning_rate": 9.853190287972896e-06, - "loss": 0.7388, + "grad_norm": 3.7243103981018066, + "learning_rate": 0.0001642198381328816, + "loss": 0.7305, "step": 17840 }, { "epoch": 3.3596837944664033, - "grad_norm": 29.051992416381836, - "learning_rate": 9.84189723320158e-06, - "loss": 0.5679, + "grad_norm": 6.474857807159424, + "learning_rate": 0.00016403162055335968, + "loss": 0.5081, "step": 17850 }, { "epoch": 3.3615659702616223, - "grad_norm": 11.573369026184082, - "learning_rate": 9.830604178430267e-06, - "loss": 0.7047, + "grad_norm": 3.899993419647217, + "learning_rate": 0.00016384340297383775, + "loss": 0.7415, "step": 17860 }, { "epoch": 3.3634481460568417, - "grad_norm": 16.204544067382812, - "learning_rate": 9.819311123658951e-06, - "loss": 0.5335, + "grad_norm": 4.013248920440674, + "learning_rate": 0.00016365518539431583, + "loss": 0.5383, "step": 17870 }, { "epoch": 3.365330321852061, - "grad_norm": 18.992883682250977, - "learning_rate": 9.808018068887634e-06, - "loss": 0.5, + "grad_norm": 3.6923396587371826, + "learning_rate": 0.0001634669678147939, + "loss": 0.598, "step": 17880 }, { "epoch": 3.36721249764728, - "grad_norm": 17.08180809020996, - "learning_rate": 9.796725014116318e-06, - "loss": 0.6355, + "grad_norm": 1.4304816722869873, + "learning_rate": 0.00016327875023527198, + "loss": 0.6347, "step": 17890 }, { "epoch": 3.3690946734424996, - "grad_norm": 12.415989875793457, - "learning_rate": 9.785431959345002e-06, - "loss": 0.6227, + "grad_norm": 5.5415425300598145, + "learning_rate": 0.00016309053265575006, + "loss": 0.6624, "step": 17900 }, { "epoch": 3.370976849237719, - "grad_norm": 23.010984420776367, - "learning_rate": 9.774138904573688e-06, - "loss": 0.6095, + "grad_norm": 1.9383195638656616, + "learning_rate": 0.00016290231507622814, + "loss": 0.5608, "step": 17910 }, { "epoch": 3.372859025032938, - "grad_norm": 20.306901931762695, - "learning_rate": 9.762845849802373e-06, - "loss": 0.7475, + "grad_norm": 5.054727077484131, + "learning_rate": 0.0001627140974967062, + "loss": 0.8852, "step": 17920 }, { "epoch": 3.3747412008281574, - "grad_norm": 7.907451629638672, - "learning_rate": 9.751552795031056e-06, - "loss": 0.7657, + "grad_norm": 1.5502482652664185, + "learning_rate": 0.00016252587991718426, + "loss": 0.7796, "step": 17930 }, { "epoch": 3.3766233766233764, - "grad_norm": 10.358199119567871, - "learning_rate": 9.74025974025974e-06, - "loss": 0.8862, + "grad_norm": 4.015044689178467, + "learning_rate": 0.00016233766233766234, + "loss": 0.9499, "step": 17940 }, { "epoch": 3.378505552418596, - "grad_norm": 7.511529445648193, - "learning_rate": 9.728966685488426e-06, - "loss": 0.4816, + "grad_norm": 2.692499876022339, + "learning_rate": 0.00016214944475814041, + "loss": 0.5061, "step": 17950 }, { "epoch": 3.3803877282138153, - "grad_norm": 11.36762523651123, - "learning_rate": 9.71767363071711e-06, - "loss": 0.6414, + "grad_norm": 3.55739164352417, + "learning_rate": 0.0001619612271786185, + "loss": 0.7057, "step": 17960 }, { "epoch": 3.3822699040090343, - "grad_norm": 4.441726207733154, - "learning_rate": 9.706380575945793e-06, - "loss": 0.3488, + "grad_norm": 0.6521520614624023, + "learning_rate": 0.00016177300959909654, + "loss": 0.4082, "step": 17970 }, { "epoch": 3.3841520798042537, - "grad_norm": 19.867027282714844, - "learning_rate": 9.695087521174477e-06, - "loss": 0.7145, + "grad_norm": 3.7358646392822266, + "learning_rate": 0.00016158479201957462, + "loss": 0.741, "step": 17980 }, { "epoch": 3.386034255599473, - "grad_norm": 10.9057035446167, - "learning_rate": 9.683794466403162e-06, - "loss": 0.6342, + "grad_norm": 1.7314327955245972, + "learning_rate": 0.00016139657444005272, + "loss": 0.6929, "step": 17990 }, { "epoch": 3.387916431394692, - "grad_norm": 8.741947174072266, - "learning_rate": 9.672501411631848e-06, - "loss": 0.5844, + "grad_norm": 3.96738862991333, + "learning_rate": 0.0001612083568605308, + "loss": 0.5361, "step": 18000 }, { "epoch": 3.3897986071899116, - "grad_norm": 25.616764068603516, - "learning_rate": 9.66120835686053e-06, - "loss": 0.6007, + "grad_norm": 4.310543060302734, + "learning_rate": 0.00016102013928100885, + "loss": 0.5798, "step": 18010 }, { "epoch": 3.391680782985131, - "grad_norm": 8.889225959777832, - "learning_rate": 9.649915302089215e-06, - "loss": 0.4933, + "grad_norm": 1.0986257791519165, + "learning_rate": 0.00016083192170148692, + "loss": 0.4943, "step": 18020 }, { "epoch": 3.39356295878035, - "grad_norm": 9.683721542358398, - "learning_rate": 9.6386222473179e-06, - "loss": 0.5452, + "grad_norm": 2.5314037799835205, + "learning_rate": 0.000160643704121965, + "loss": 0.576, "step": 18030 }, { "epoch": 3.3954451345755694, - "grad_norm": 5.871509075164795, - "learning_rate": 9.627329192546584e-06, - "loss": 0.5289, + "grad_norm": 0.23214668035507202, + "learning_rate": 0.00016045548654244307, + "loss": 0.6748, "step": 18040 }, { "epoch": 3.3973273103707884, - "grad_norm": 12.041633605957031, - "learning_rate": 9.61603613777527e-06, - "loss": 0.6335, + "grad_norm": 3.713710308074951, + "learning_rate": 0.00016026726896292115, + "loss": 0.6695, "step": 18050 }, { "epoch": 3.399209486166008, - "grad_norm": 17.63007926940918, - "learning_rate": 9.604743083003952e-06, - "loss": 0.8369, + "grad_norm": 3.0103020668029785, + "learning_rate": 0.0001600790513833992, + "loss": 0.882, "step": 18060 }, { "epoch": 3.4010916619612273, - "grad_norm": 17.726680755615234, - "learning_rate": 9.593450028232637e-06, - "loss": 0.5378, + "grad_norm": 4.872165679931641, + "learning_rate": 0.00015989083380387728, + "loss": 0.7086, "step": 18070 }, { "epoch": 3.4029738377564467, - "grad_norm": 7.5816330909729, - "learning_rate": 9.582156973461321e-06, - "loss": 0.5959, + "grad_norm": 3.5198495388031006, + "learning_rate": 0.00015970261622435535, + "loss": 0.5763, "step": 18080 }, { "epoch": 3.4048560135516657, - "grad_norm": 12.254039764404297, - "learning_rate": 9.570863918690007e-06, - "loss": 0.5402, + "grad_norm": 3.0030627250671387, + "learning_rate": 0.00015951439864483343, + "loss": 0.6998, "step": 18090 }, { "epoch": 3.406738189346885, - "grad_norm": 20.039281845092773, - "learning_rate": 9.55957086391869e-06, - "loss": 0.4543, + "grad_norm": 5.010890960693359, + "learning_rate": 0.0001593261810653115, + "loss": 0.5333, "step": 18100 }, { "epoch": 3.408620365142104, - "grad_norm": 31.039440155029297, - "learning_rate": 9.548277809147374e-06, - "loss": 0.7242, + "grad_norm": 2.5112898349761963, + "learning_rate": 0.00015913796348578958, + "loss": 0.6082, "step": 18110 }, { "epoch": 3.4105025409373235, - "grad_norm": 21.17696189880371, - "learning_rate": 9.536984754376059e-06, - "loss": 0.5722, + "grad_norm": 2.1638071537017822, + "learning_rate": 0.00015894974590626766, + "loss": 0.6394, "step": 18120 }, { "epoch": 3.412384716732543, - "grad_norm": 37.80180740356445, - "learning_rate": 9.525691699604743e-06, - "loss": 0.6266, + "grad_norm": 4.111144065856934, + "learning_rate": 0.00015876152832674574, + "loss": 0.6279, "step": 18130 }, { "epoch": 3.414266892527762, - "grad_norm": 0.5937038064002991, - "learning_rate": 9.514398644833428e-06, - "loss": 0.6332, + "grad_norm": 0.4466768503189087, + "learning_rate": 0.00015857331074722378, + "loss": 0.5873, "step": 18140 }, { "epoch": 3.4161490683229814, - "grad_norm": 18.84284782409668, - "learning_rate": 9.503105590062112e-06, - "loss": 0.5928, + "grad_norm": 1.232139229774475, + "learning_rate": 0.00015838509316770186, + "loss": 0.6969, "step": 18150 }, { "epoch": 3.418031244118201, - "grad_norm": 3.241466522216797, - "learning_rate": 9.491812535290796e-06, - "loss": 0.5802, + "grad_norm": 4.0589985847473145, + "learning_rate": 0.00015819687558817994, + "loss": 0.7058, "step": 18160 }, { "epoch": 3.41991341991342, - "grad_norm": 4.864845275878906, - "learning_rate": 9.48051948051948e-06, - "loss": 0.4499, + "grad_norm": 1.1862062215805054, + "learning_rate": 0.00015800865800865801, + "loss": 0.5094, "step": 18170 }, { "epoch": 3.4217955957086392, - "grad_norm": 10.21737289428711, - "learning_rate": 9.469226425748165e-06, - "loss": 0.466, + "grad_norm": 1.4233732223510742, + "learning_rate": 0.0001578204404291361, + "loss": 0.4951, "step": 18180 }, { "epoch": 3.4236777715038587, - "grad_norm": 21.10309410095215, - "learning_rate": 9.45793337097685e-06, - "loss": 0.4943, + "grad_norm": 3.4455318450927734, + "learning_rate": 0.00015763222284961414, + "loss": 0.5394, "step": 18190 }, { "epoch": 3.4255599472990776, - "grad_norm": 20.39488983154297, - "learning_rate": 9.446640316205534e-06, - "loss": 0.4178, + "grad_norm": 2.995675802230835, + "learning_rate": 0.00015744400527009222, + "loss": 0.4398, "step": 18200 }, { "epoch": 3.427442123094297, - "grad_norm": 10.568327903747559, - "learning_rate": 9.435347261434218e-06, - "loss": 0.5787, + "grad_norm": 1.3443666696548462, + "learning_rate": 0.00015725578769057032, + "loss": 0.4524, "step": 18210 }, { "epoch": 3.429324298889516, - "grad_norm": 15.859696388244629, - "learning_rate": 9.424054206662903e-06, - "loss": 0.5257, + "grad_norm": 4.236741542816162, + "learning_rate": 0.0001570675701110484, + "loss": 0.5184, "step": 18220 }, { "epoch": 3.4312064746847355, - "grad_norm": 3.5136094093322754, - "learning_rate": 9.412761151891587e-06, - "loss": 0.4556, + "grad_norm": 0.17351055145263672, + "learning_rate": 0.00015687935253152645, + "loss": 0.3975, "step": 18230 }, { "epoch": 3.433088650479955, - "grad_norm": 43.259742736816406, - "learning_rate": 9.401468097120271e-06, - "loss": 0.2424, + "grad_norm": 1.4714381694793701, + "learning_rate": 0.00015669113495200452, + "loss": 0.3543, "step": 18240 }, { "epoch": 3.434970826275174, - "grad_norm": 17.20220947265625, - "learning_rate": 9.390175042348956e-06, - "loss": 0.8755, + "grad_norm": 3.5774731636047363, + "learning_rate": 0.0001565029173724826, + "loss": 0.8003, "step": 18250 }, { "epoch": 3.4368530020703933, - "grad_norm": 35.17365646362305, - "learning_rate": 9.37888198757764e-06, - "loss": 0.5231, + "grad_norm": 5.654221534729004, + "learning_rate": 0.00015631469979296067, + "loss": 0.5738, "step": 18260 }, { "epoch": 3.438735177865613, - "grad_norm": 11.22391128540039, - "learning_rate": 9.367588932806323e-06, - "loss": 0.8492, + "grad_norm": 3.335806369781494, + "learning_rate": 0.00015612648221343872, + "loss": 0.7048, "step": 18270 }, { "epoch": 3.4406173536608318, - "grad_norm": 15.989961624145508, - "learning_rate": 9.356295878035009e-06, - "loss": 0.6255, + "grad_norm": 5.399444580078125, + "learning_rate": 0.0001559382646339168, + "loss": 0.5377, "step": 18280 }, { "epoch": 3.442499529456051, - "grad_norm": 10.886322975158691, - "learning_rate": 9.345002823263693e-06, - "loss": 0.5604, + "grad_norm": 2.897859573364258, + "learning_rate": 0.00015575004705439488, + "loss": 0.5503, "step": 18290 }, { "epoch": 3.4443817052512706, - "grad_norm": 0.8173845410346985, - "learning_rate": 9.333709768492378e-06, - "loss": 0.5142, + "grad_norm": 1.5217775106430054, + "learning_rate": 0.00015556182947487295, + "loss": 0.639, "step": 18300 }, { "epoch": 3.4462638810464896, - "grad_norm": 12.035292625427246, - "learning_rate": 9.322416713721062e-06, - "loss": 0.7188, + "grad_norm": 1.9020847082138062, + "learning_rate": 0.00015537361189535103, + "loss": 0.7124, "step": 18310 }, { "epoch": 3.448146056841709, - "grad_norm": 24.942886352539062, - "learning_rate": 9.311123658949746e-06, - "loss": 0.5322, + "grad_norm": 0.4594549238681793, + "learning_rate": 0.0001551853943158291, + "loss": 0.4842, "step": 18320 }, { "epoch": 3.4500282326369285, - "grad_norm": 0.916724681854248, - "learning_rate": 9.29983060417843e-06, - "loss": 0.6067, + "grad_norm": 0.26805880665779114, + "learning_rate": 0.00015499717673630718, + "loss": 0.5915, "step": 18330 }, { "epoch": 3.4519104084321475, - "grad_norm": 1.716382384300232, - "learning_rate": 9.288537549407115e-06, - "loss": 0.5839, + "grad_norm": 0.42122676968574524, + "learning_rate": 0.00015480895915678526, + "loss": 0.6011, "step": 18340 }, { "epoch": 3.453792584227367, - "grad_norm": 5.684045314788818, - "learning_rate": 9.2772444946358e-06, - "loss": 0.6052, + "grad_norm": 2.4079372882843018, + "learning_rate": 0.00015462074157726334, + "loss": 0.5244, "step": 18350 }, { "epoch": 3.4556747600225863, - "grad_norm": 3.6360864639282227, - "learning_rate": 9.265951439864482e-06, - "loss": 0.6305, + "grad_norm": 3.0662310123443604, + "learning_rate": 0.00015443252399774138, + "loss": 0.6324, "step": 18360 }, { "epoch": 3.4575569358178053, - "grad_norm": 7.340415954589844, - "learning_rate": 9.254658385093168e-06, - "loss": 0.4902, + "grad_norm": 2.075991153717041, + "learning_rate": 0.00015424430641821946, + "loss": 0.5104, "step": 18370 }, { "epoch": 3.4594391116130248, - "grad_norm": 12.40035343170166, - "learning_rate": 9.243365330321853e-06, - "loss": 0.6022, + "grad_norm": 3.1180338859558105, + "learning_rate": 0.00015405608883869754, + "loss": 0.548, "step": 18380 }, { "epoch": 3.4613212874082437, - "grad_norm": 6.647785186767578, - "learning_rate": 9.232072275550537e-06, - "loss": 0.6062, + "grad_norm": 2.0586273670196533, + "learning_rate": 0.00015386787125917561, + "loss": 0.6929, "step": 18390 }, { "epoch": 3.463203463203463, - "grad_norm": 16.84423828125, - "learning_rate": 9.22077922077922e-06, - "loss": 0.5901, + "grad_norm": 5.819500923156738, + "learning_rate": 0.00015367965367965366, + "loss": 0.6192, "step": 18400 }, { "epoch": 3.4650856389986826, - "grad_norm": 4.819279193878174, - "learning_rate": 9.209486166007904e-06, - "loss": 0.6798, + "grad_norm": 1.630347490310669, + "learning_rate": 0.00015349143610013174, + "loss": 0.635, "step": 18410 }, { "epoch": 3.4669678147939016, - "grad_norm": 20.76621437072754, - "learning_rate": 9.19819311123659e-06, - "loss": 0.9342, + "grad_norm": 4.307313919067383, + "learning_rate": 0.00015330321852060982, + "loss": 0.9541, "step": 18420 }, { "epoch": 3.468849990589121, - "grad_norm": 18.379783630371094, - "learning_rate": 9.186900056465275e-06, - "loss": 0.7618, + "grad_norm": 3.076420783996582, + "learning_rate": 0.00015311500094108792, + "loss": 0.713, "step": 18430 }, { "epoch": 3.4707321663843405, - "grad_norm": 22.449026107788086, - "learning_rate": 9.175607001693959e-06, - "loss": 0.7875, + "grad_norm": 5.041100978851318, + "learning_rate": 0.000152926783361566, + "loss": 0.7066, "step": 18440 }, { "epoch": 3.4726143421795594, - "grad_norm": 3.216062545776367, - "learning_rate": 9.164313946922642e-06, - "loss": 0.7226, + "grad_norm": 2.18129301071167, + "learning_rate": 0.00015273856578204405, + "loss": 0.7679, "step": 18450 }, { "epoch": 3.474496517974779, - "grad_norm": 18.014450073242188, - "learning_rate": 9.153020892151328e-06, - "loss": 0.6888, + "grad_norm": 3.314056396484375, + "learning_rate": 0.00015255034820252212, + "loss": 0.7436, "step": 18460 }, { "epoch": 3.4763786937699983, - "grad_norm": 33.41580581665039, - "learning_rate": 9.141727837380012e-06, - "loss": 0.4367, + "grad_norm": 1.7835749387741089, + "learning_rate": 0.0001523621306230002, + "loss": 0.4808, "step": 18470 }, { "epoch": 3.4782608695652173, - "grad_norm": 0.49927645921707153, - "learning_rate": 9.130434782608697e-06, - "loss": 0.683, + "grad_norm": 2.1161506175994873, + "learning_rate": 0.00015217391304347827, + "loss": 0.6418, "step": 18480 }, { "epoch": 3.4801430453604367, - "grad_norm": 16.403583526611328, - "learning_rate": 9.11914172783738e-06, - "loss": 0.5037, + "grad_norm": 2.818833351135254, + "learning_rate": 0.00015198569546395632, + "loss": 0.633, "step": 18490 }, { "epoch": 3.4820252211556557, - "grad_norm": 19.10352325439453, - "learning_rate": 9.107848673066064e-06, - "loss": 0.8453, + "grad_norm": 4.812803268432617, + "learning_rate": 0.0001517974778844344, + "loss": 0.791, "step": 18500 }, { "epoch": 3.483907396950875, - "grad_norm": 42.00209426879883, - "learning_rate": 9.09655561829475e-06, - "loss": 0.4188, + "grad_norm": 1.8344483375549316, + "learning_rate": 0.00015160926030491248, + "loss": 0.4743, "step": 18510 }, { "epoch": 3.4857895727460946, - "grad_norm": 22.176076889038086, - "learning_rate": 9.085262563523434e-06, - "loss": 0.8926, + "grad_norm": 1.7361068725585938, + "learning_rate": 0.00015142104272539055, + "loss": 0.8563, "step": 18520 }, { "epoch": 3.4876717485413136, - "grad_norm": 13.631558418273926, - "learning_rate": 9.073969508752118e-06, - "loss": 0.7174, + "grad_norm": 2.331780195236206, + "learning_rate": 0.00015123282514586863, + "loss": 0.7446, "step": 18530 }, { "epoch": 3.489553924336533, - "grad_norm": 16.817617416381836, - "learning_rate": 9.062676453980801e-06, - "loss": 0.3922, + "grad_norm": 5.2538652420043945, + "learning_rate": 0.00015104460756634668, + "loss": 0.4836, "step": 18540 }, { "epoch": 3.4914361001317524, - "grad_norm": 7.2339768409729, - "learning_rate": 9.051383399209486e-06, - "loss": 0.5843, + "grad_norm": 1.902012825012207, + "learning_rate": 0.00015085638998682478, + "loss": 0.5584, "step": 18550 }, { "epoch": 3.4933182759269714, - "grad_norm": 7.782161712646484, - "learning_rate": 9.040090344438172e-06, - "loss": 0.6768, + "grad_norm": 2.5823628902435303, + "learning_rate": 0.00015066817240730286, + "loss": 0.6969, "step": 18560 }, { "epoch": 3.495200451722191, - "grad_norm": 14.207245826721191, - "learning_rate": 9.028797289666856e-06, - "loss": 0.5713, + "grad_norm": 1.9991360902786255, + "learning_rate": 0.00015047995482778094, + "loss": 0.5022, "step": 18570 }, { "epoch": 3.4970826275174103, - "grad_norm": 13.005362510681152, - "learning_rate": 9.017504234895539e-06, - "loss": 0.7295, + "grad_norm": 3.4400622844696045, + "learning_rate": 0.00015029173724825898, + "loss": 0.6502, "step": 18580 }, { "epoch": 3.4989648033126293, - "grad_norm": 5.221713066101074, - "learning_rate": 9.006211180124223e-06, - "loss": 0.5095, + "grad_norm": 1.1559652090072632, + "learning_rate": 0.00015010351966873706, + "loss": 0.5335, "step": 18590 }, { "epoch": 3.5008469791078487, - "grad_norm": 6.634870529174805, - "learning_rate": 8.994918125352909e-06, - "loss": 0.6905, + "grad_norm": 1.5509639978408813, + "learning_rate": 0.00014991530208921514, + "loss": 0.6104, "step": 18600 }, { "epoch": 3.5027291549030677, - "grad_norm": 22.765445709228516, - "learning_rate": 8.983625070581594e-06, - "loss": 0.4576, + "grad_norm": 4.370759010314941, + "learning_rate": 0.00014972708450969321, + "loss": 0.4614, "step": 18610 }, { "epoch": 3.504611330698287, - "grad_norm": 12.745418548583984, - "learning_rate": 8.972332015810276e-06, - "loss": 0.7584, + "grad_norm": 5.9260993003845215, + "learning_rate": 0.00014953886693017126, + "loss": 0.8786, "step": 18620 }, { "epoch": 3.5064935064935066, - "grad_norm": 7.202927112579346, - "learning_rate": 8.96103896103896e-06, - "loss": 0.6597, + "grad_norm": 1.7598670721054077, + "learning_rate": 0.00014935064935064934, + "loss": 0.6033, "step": 18630 }, { "epoch": 3.508375682288726, - "grad_norm": 19.406003952026367, - "learning_rate": 8.949745906267645e-06, - "loss": 0.6717, + "grad_norm": 3.866478681564331, + "learning_rate": 0.00014916243177112742, + "loss": 0.4937, "step": 18640 }, { "epoch": 3.510257858083945, - "grad_norm": 9.290009498596191, - "learning_rate": 8.938452851496331e-06, - "loss": 0.4197, + "grad_norm": 1.5161197185516357, + "learning_rate": 0.0001489742141916055, + "loss": 0.6126, "step": 18650 }, { "epoch": 3.5121400338791644, - "grad_norm": 29.786117553710938, - "learning_rate": 8.927159796725015e-06, - "loss": 1.0078, + "grad_norm": 4.176273345947266, + "learning_rate": 0.0001487859966120836, + "loss": 0.9906, "step": 18660 }, { "epoch": 3.5140222096743834, - "grad_norm": 27.318464279174805, - "learning_rate": 8.915866741953698e-06, - "loss": 0.6775, + "grad_norm": 8.102840423583984, + "learning_rate": 0.00014859777903256165, + "loss": 0.7627, "step": 18670 }, { "epoch": 3.515904385469603, - "grad_norm": 3.237138509750366, - "learning_rate": 8.904573687182382e-06, - "loss": 0.429, + "grad_norm": 1.2033134698867798, + "learning_rate": 0.00014840956145303972, + "loss": 0.3544, "step": 18680 }, { "epoch": 3.5177865612648223, - "grad_norm": 5.157870292663574, - "learning_rate": 8.893280632411067e-06, - "loss": 0.5416, + "grad_norm": 2.3178234100341797, + "learning_rate": 0.0001482213438735178, + "loss": 0.6326, "step": 18690 }, { "epoch": 3.5196687370600412, - "grad_norm": 16.835599899291992, - "learning_rate": 8.881987577639753e-06, - "loss": 0.6814, + "grad_norm": 4.31680965423584, + "learning_rate": 0.00014803312629399587, + "loss": 0.6204, "step": 18700 }, { "epoch": 3.5215509128552607, - "grad_norm": 15.013919830322266, - "learning_rate": 8.870694522868436e-06, - "loss": 0.6731, + "grad_norm": 3.589350700378418, + "learning_rate": 0.00014784490871447392, + "loss": 0.4283, "step": 18710 }, { "epoch": 3.52343308865048, - "grad_norm": 20.667890548706055, - "learning_rate": 8.85940146809712e-06, - "loss": 0.5596, + "grad_norm": 3.9112446308135986, + "learning_rate": 0.000147656691134952, + "loss": 0.647, "step": 18720 }, { "epoch": 3.525315264445699, - "grad_norm": 14.67215347290039, - "learning_rate": 8.848108413325804e-06, - "loss": 0.6222, + "grad_norm": 1.732999563217163, + "learning_rate": 0.00014746847355543008, + "loss": 0.7145, "step": 18730 }, { "epoch": 3.5271974402409185, - "grad_norm": 11.272614479064941, - "learning_rate": 8.83681535855449e-06, - "loss": 0.8376, + "grad_norm": 1.423341155052185, + "learning_rate": 0.00014728025597590815, + "loss": 0.8925, "step": 18740 }, { "epoch": 3.529079616036138, - "grad_norm": 10.547713279724121, - "learning_rate": 8.825522303783173e-06, - "loss": 0.5504, + "grad_norm": 2.423460006713867, + "learning_rate": 0.0001470920383963862, + "loss": 0.6344, "step": 18750 }, { "epoch": 3.530961791831357, - "grad_norm": 12.09305191040039, - "learning_rate": 8.814229249011858e-06, - "loss": 0.5685, + "grad_norm": 4.275598049163818, + "learning_rate": 0.00014690382081686428, + "loss": 0.6458, "step": 18760 }, { "epoch": 3.5328439676265764, - "grad_norm": 13.567402839660645, - "learning_rate": 8.802936194240542e-06, - "loss": 0.4545, + "grad_norm": 3.5608088970184326, + "learning_rate": 0.00014671560323734238, + "loss": 0.6465, "step": 18770 }, { "epoch": 3.5347261434217954, - "grad_norm": 8.988545417785645, - "learning_rate": 8.791643139469226e-06, - "loss": 0.7346, + "grad_norm": 8.03825855255127, + "learning_rate": 0.00014652738565782046, + "loss": 0.5933, "step": 18780 }, { "epoch": 3.536608319217015, - "grad_norm": 13.106191635131836, - "learning_rate": 8.780350084697912e-06, - "loss": 0.6023, + "grad_norm": 0.4490392506122589, + "learning_rate": 0.00014633916807829854, + "loss": 0.6048, "step": 18790 }, { "epoch": 3.5384904950122342, - "grad_norm": 28.7813777923584, - "learning_rate": 8.769057029926595e-06, - "loss": 0.6156, + "grad_norm": 5.7437214851379395, + "learning_rate": 0.00014615095049877658, + "loss": 0.7403, "step": 18800 }, { "epoch": 3.5403726708074537, - "grad_norm": 22.4103946685791, - "learning_rate": 8.75776397515528e-06, - "loss": 0.4925, + "grad_norm": 2.6373136043548584, + "learning_rate": 0.00014596273291925466, + "loss": 0.4953, "step": 18810 }, { "epoch": 3.5422548466026726, - "grad_norm": 20.627689361572266, - "learning_rate": 8.746470920383964e-06, - "loss": 0.6625, + "grad_norm": 3.8211348056793213, + "learning_rate": 0.00014577451533973274, + "loss": 0.6762, "step": 18820 }, { "epoch": 3.544137022397892, - "grad_norm": 8.956947326660156, - "learning_rate": 8.73517786561265e-06, - "loss": 0.5561, + "grad_norm": 2.7318568229675293, + "learning_rate": 0.00014558629776021081, + "loss": 0.6779, "step": 18830 }, { "epoch": 3.546019198193111, - "grad_norm": 7.5617594718933105, - "learning_rate": 8.723884810841333e-06, - "loss": 0.604, + "grad_norm": 1.7246413230895996, + "learning_rate": 0.00014539808018068886, + "loss": 0.5515, "step": 18840 }, { "epoch": 3.5479013739883305, - "grad_norm": 17.85495948791504, - "learning_rate": 8.712591756070017e-06, - "loss": 0.6587, + "grad_norm": 3.1513025760650635, + "learning_rate": 0.00014520986260116694, + "loss": 0.7151, "step": 18850 }, { "epoch": 3.54978354978355, - "grad_norm": 8.020273208618164, - "learning_rate": 8.701298701298701e-06, - "loss": 0.8588, + "grad_norm": 5.167235851287842, + "learning_rate": 0.00014502164502164502, + "loss": 0.8498, "step": 18860 }, { "epoch": 3.551665725578769, - "grad_norm": 25.53630256652832, - "learning_rate": 8.690005646527386e-06, - "loss": 0.6081, + "grad_norm": 3.688398838043213, + "learning_rate": 0.0001448334274421231, + "loss": 0.6606, "step": 18870 }, { "epoch": 3.5535479013739883, - "grad_norm": 5.950412273406982, - "learning_rate": 8.67871259175607e-06, - "loss": 0.3223, + "grad_norm": 2.404766082763672, + "learning_rate": 0.00014464520986260117, + "loss": 0.3633, "step": 18880 }, { "epoch": 3.5554300771692073, - "grad_norm": 7.792263031005859, - "learning_rate": 8.667419536984755e-06, - "loss": 0.6444, + "grad_norm": 1.7548309564590454, + "learning_rate": 0.00014445699228307925, + "loss": 0.7681, "step": 18890 }, { "epoch": 3.5573122529644268, - "grad_norm": 8.66256332397461, - "learning_rate": 8.656126482213439e-06, - "loss": 0.3844, + "grad_norm": 2.3239974975585938, + "learning_rate": 0.00014426877470355732, + "loss": 0.5125, "step": 18900 }, { "epoch": 3.559194428759646, - "grad_norm": 22.50640296936035, - "learning_rate": 8.644833427442123e-06, - "loss": 0.6602, + "grad_norm": 3.089728355407715, + "learning_rate": 0.0001440805571240354, + "loss": 0.7069, "step": 18910 }, { "epoch": 3.5610766045548656, - "grad_norm": 2.1984527111053467, - "learning_rate": 8.633540372670808e-06, - "loss": 0.288, + "grad_norm": 2.1467270851135254, + "learning_rate": 0.00014389233954451347, + "loss": 0.4031, "step": 18920 }, { "epoch": 3.5629587803500846, - "grad_norm": 8.957244873046875, - "learning_rate": 8.622247317899492e-06, - "loss": 0.7115, + "grad_norm": 1.7423038482666016, + "learning_rate": 0.00014370412196499152, + "loss": 0.8048, "step": 18930 }, { "epoch": 3.564840956145304, - "grad_norm": 12.717720985412598, - "learning_rate": 8.610954263128176e-06, - "loss": 0.5397, + "grad_norm": 2.2575857639312744, + "learning_rate": 0.0001435159043854696, + "loss": 0.5316, "step": 18940 }, { "epoch": 3.566723131940523, - "grad_norm": 17.717315673828125, - "learning_rate": 8.59966120835686e-06, - "loss": 0.8032, + "grad_norm": 5.246999740600586, + "learning_rate": 0.00014332768680594768, + "loss": 0.834, "step": 18950 }, { "epoch": 3.5686053077357425, - "grad_norm": 35.10055160522461, - "learning_rate": 8.588368153585545e-06, - "loss": 0.6323, + "grad_norm": 2.156754493713379, + "learning_rate": 0.00014313946922642575, + "loss": 0.6575, "step": 18960 }, { "epoch": 3.570487483530962, - "grad_norm": 16.153400421142578, - "learning_rate": 8.57707509881423e-06, - "loss": 0.9937, + "grad_norm": 2.3258817195892334, + "learning_rate": 0.0001429512516469038, + "loss": 1.1311, "step": 18970 }, { "epoch": 3.5723696593261813, - "grad_norm": 7.813040733337402, - "learning_rate": 8.565782044042914e-06, - "loss": 0.5768, + "grad_norm": 4.504016876220703, + "learning_rate": 0.00014276303406738188, + "loss": 0.5333, "step": 18980 }, { "epoch": 3.5742518351214003, - "grad_norm": 38.98237228393555, - "learning_rate": 8.554488989271598e-06, - "loss": 0.4722, + "grad_norm": 2.8779964447021484, + "learning_rate": 0.00014257481648785998, + "loss": 0.3841, "step": 18990 }, { "epoch": 3.5761340109166198, - "grad_norm": 8.980436325073242, - "learning_rate": 8.543195934500283e-06, - "loss": 0.6281, + "grad_norm": 2.204181432723999, + "learning_rate": 0.00014238659890833806, + "loss": 0.4968, "step": 19000 }, { "epoch": 3.5780161867118387, - "grad_norm": 4.206551551818848, - "learning_rate": 8.531902879728965e-06, - "loss": 0.7266, + "grad_norm": 1.2488787174224854, + "learning_rate": 0.0001421983813288161, + "loss": 0.7569, "step": 19010 }, { "epoch": 3.579898362507058, - "grad_norm": 7.171810150146484, - "learning_rate": 8.520609824957651e-06, - "loss": 0.5348, + "grad_norm": 3.739560842514038, + "learning_rate": 0.00014201016374929418, + "loss": 0.6607, "step": 19020 }, { "epoch": 3.5817805383022776, - "grad_norm": 8.998128890991211, - "learning_rate": 8.509316770186336e-06, - "loss": 0.3459, + "grad_norm": 5.1085381507873535, + "learning_rate": 0.00014182194616977226, + "loss": 0.4285, "step": 19030 }, { "epoch": 3.5836627140974966, - "grad_norm": 11.471923828125, - "learning_rate": 8.49802371541502e-06, - "loss": 0.5549, + "grad_norm": 2.485764980316162, + "learning_rate": 0.00014163372859025034, + "loss": 0.4453, "step": 19040 }, { "epoch": 3.585544889892716, - "grad_norm": 1.2988466024398804, - "learning_rate": 8.486730660643705e-06, - "loss": 0.4202, + "grad_norm": 0.32622551918029785, + "learning_rate": 0.00014144551101072841, + "loss": 0.4728, "step": 19050 }, { "epoch": 3.587427065687935, - "grad_norm": 12.022832870483398, - "learning_rate": 8.475437605872387e-06, - "loss": 0.882, + "grad_norm": 1.6413334608078003, + "learning_rate": 0.00014125729343120646, + "loss": 0.8364, "step": 19060 }, { "epoch": 3.5893092414831544, - "grad_norm": 18.806732177734375, - "learning_rate": 8.464144551101073e-06, - "loss": 0.4618, + "grad_norm": 2.5387015342712402, + "learning_rate": 0.00014106907585168454, + "loss": 0.4835, "step": 19070 }, { "epoch": 3.591191417278374, - "grad_norm": 7.265424728393555, - "learning_rate": 8.452851496329758e-06, - "loss": 0.5997, + "grad_norm": 2.042985677719116, + "learning_rate": 0.00014088085827216262, + "loss": 0.6753, "step": 19080 }, { "epoch": 3.5930735930735933, - "grad_norm": 11.584159851074219, - "learning_rate": 8.441558441558442e-06, - "loss": 0.6913, + "grad_norm": 6.939350128173828, + "learning_rate": 0.0001406926406926407, + "loss": 0.7062, "step": 19090 }, { "epoch": 3.5949557688688123, - "grad_norm": 39.18511199951172, - "learning_rate": 8.430265386787125e-06, - "loss": 0.6348, + "grad_norm": 5.651983737945557, + "learning_rate": 0.00014050442311311877, + "loss": 0.5953, "step": 19100 }, { "epoch": 3.5968379446640317, - "grad_norm": 29.593963623046875, - "learning_rate": 8.418972332015811e-06, - "loss": 0.7804, + "grad_norm": 5.715653896331787, + "learning_rate": 0.00014031620553359685, + "loss": 0.7144, "step": 19110 }, { "epoch": 3.5987201204592507, - "grad_norm": 12.343049049377441, - "learning_rate": 8.407679277244495e-06, - "loss": 0.5887, + "grad_norm": 3.6187081336975098, + "learning_rate": 0.00014012798795407492, + "loss": 0.6545, "step": 19120 }, { "epoch": 3.60060229625447, - "grad_norm": 27.028095245361328, - "learning_rate": 8.39638622247318e-06, - "loss": 0.5748, + "grad_norm": 2.5229785442352295, + "learning_rate": 0.000139939770374553, + "loss": 0.584, "step": 19130 }, { "epoch": 3.6024844720496896, - "grad_norm": 10.208629608154297, - "learning_rate": 8.385093167701862e-06, - "loss": 0.5336, + "grad_norm": 3.843477487564087, + "learning_rate": 0.00013975155279503105, + "loss": 0.5401, "step": 19140 }, { "epoch": 3.6043666478449086, - "grad_norm": 6.941380023956299, - "learning_rate": 8.373800112930547e-06, - "loss": 0.5231, + "grad_norm": 1.425986886024475, + "learning_rate": 0.00013956333521550912, + "loss": 0.416, "step": 19150 }, { "epoch": 3.606248823640128, - "grad_norm": 24.04265785217285, - "learning_rate": 8.362507058159233e-06, - "loss": 0.9771, + "grad_norm": 5.949957847595215, + "learning_rate": 0.0001393751176359872, + "loss": 0.885, "step": 19160 }, { "epoch": 3.608130999435347, - "grad_norm": 7.969100475311279, - "learning_rate": 8.351214003387917e-06, - "loss": 0.6908, + "grad_norm": 2.58278489112854, + "learning_rate": 0.00013918690005646528, + "loss": 0.5372, "step": 19170 }, { "epoch": 3.6100131752305664, - "grad_norm": 5.254550457000732, - "learning_rate": 8.339920948616602e-06, - "loss": 0.4686, + "grad_norm": 2.7384581565856934, + "learning_rate": 0.00013899868247694335, + "loss": 0.5182, "step": 19180 }, { "epoch": 3.611895351025786, - "grad_norm": 11.416996955871582, - "learning_rate": 8.328627893845284e-06, - "loss": 0.9016, + "grad_norm": 2.234143018722534, + "learning_rate": 0.0001388104648974214, + "loss": 0.8945, "step": 19190 }, { "epoch": 3.6137775268210053, - "grad_norm": 22.956119537353516, - "learning_rate": 8.31733483907397e-06, - "loss": 0.718, + "grad_norm": 2.7675840854644775, + "learning_rate": 0.00013862224731789948, + "loss": 0.737, "step": 19200 }, { "epoch": 3.6156597026162243, - "grad_norm": 20.53351402282715, - "learning_rate": 8.306041784302655e-06, - "loss": 0.6639, + "grad_norm": 5.574830532073975, + "learning_rate": 0.00013843402973837758, + "loss": 0.6644, "step": 19210 }, { "epoch": 3.6175418784114437, - "grad_norm": 15.131000518798828, - "learning_rate": 8.294748729531339e-06, - "loss": 0.3244, + "grad_norm": 4.628572463989258, + "learning_rate": 0.00013824581215885566, + "loss": 0.5055, "step": 19220 }, { "epoch": 3.6194240542066627, - "grad_norm": 24.35951805114746, - "learning_rate": 8.283455674760022e-06, - "loss": 0.467, + "grad_norm": 5.0951032638549805, + "learning_rate": 0.0001380575945793337, + "loss": 0.3973, "step": 19230 }, { "epoch": 3.621306230001882, - "grad_norm": 10.79614543914795, - "learning_rate": 8.272162619988706e-06, - "loss": 0.6985, + "grad_norm": 3.8626248836517334, + "learning_rate": 0.00013786937699981178, + "loss": 0.7009, "step": 19240 }, { "epoch": 3.6231884057971016, - "grad_norm": 8.274187088012695, - "learning_rate": 8.260869565217392e-06, - "loss": 0.7224, + "grad_norm": 1.549597144126892, + "learning_rate": 0.00013768115942028986, + "loss": 0.7579, "step": 19250 }, { "epoch": 3.625070581592321, - "grad_norm": 0.10666459053754807, - "learning_rate": 8.249576510446077e-06, - "loss": 0.6622, + "grad_norm": 0.22615107893943787, + "learning_rate": 0.00013749294184076794, + "loss": 0.7572, "step": 19260 }, { "epoch": 3.62695275738754, - "grad_norm": 24.340177536010742, - "learning_rate": 8.238283455674761e-06, - "loss": 0.7348, + "grad_norm": 1.8409181833267212, + "learning_rate": 0.00013730472426124601, + "loss": 0.6716, "step": 19270 }, { "epoch": 3.6288349331827594, - "grad_norm": 20.038578033447266, - "learning_rate": 8.226990400903444e-06, - "loss": 0.8442, + "grad_norm": 4.8296027183532715, + "learning_rate": 0.00013711650668172406, + "loss": 0.8643, "step": 19280 }, { "epoch": 3.6307171089779784, - "grad_norm": 11.465856552124023, - "learning_rate": 8.215697346132128e-06, - "loss": 0.7398, + "grad_norm": 1.5565869808197021, + "learning_rate": 0.00013692828910220214, + "loss": 0.6602, "step": 19290 }, { "epoch": 3.632599284773198, - "grad_norm": 13.356294631958008, - "learning_rate": 8.204404291360814e-06, - "loss": 0.4302, + "grad_norm": 3.0663068294525146, + "learning_rate": 0.00013674007152268022, + "loss": 0.5523, "step": 19300 }, { "epoch": 3.6344814605684173, - "grad_norm": 17.840681076049805, - "learning_rate": 8.193111236589499e-06, - "loss": 0.562, + "grad_norm": 4.205318450927734, + "learning_rate": 0.0001365518539431583, + "loss": 0.6649, "step": 19310 }, { "epoch": 3.6363636363636362, - "grad_norm": 0.5151638984680176, - "learning_rate": 8.181818181818181e-06, - "loss": 0.5005, + "grad_norm": 3.561537742614746, + "learning_rate": 0.00013636363636363637, + "loss": 0.6846, "step": 19320 }, { "epoch": 3.6382458121588557, - "grad_norm": 8.19641399383545, - "learning_rate": 8.170525127046866e-06, - "loss": 0.6157, + "grad_norm": 1.4255841970443726, + "learning_rate": 0.00013617541878411445, + "loss": 0.6412, "step": 19330 }, { "epoch": 3.6401279879540747, - "grad_norm": 19.37883949279785, - "learning_rate": 8.159232072275552e-06, - "loss": 0.5518, + "grad_norm": 4.167680263519287, + "learning_rate": 0.00013598720120459252, + "loss": 0.5654, "step": 19340 }, { "epoch": 3.642010163749294, - "grad_norm": 2.3880703449249268, - "learning_rate": 8.147939017504236e-06, - "loss": 0.5452, + "grad_norm": 2.158613443374634, + "learning_rate": 0.0001357989836250706, + "loss": 0.5479, "step": 19350 }, { "epoch": 3.6438923395445135, - "grad_norm": 12.739452362060547, - "learning_rate": 8.136645962732919e-06, - "loss": 0.945, + "grad_norm": 2.723226308822632, + "learning_rate": 0.00013561076604554865, + "loss": 1.0151, "step": 19360 }, { "epoch": 3.645774515339733, - "grad_norm": 14.899114608764648, - "learning_rate": 8.125352907961603e-06, - "loss": 0.9551, + "grad_norm": 2.7262959480285645, + "learning_rate": 0.00013542254846602672, + "loss": 0.8649, "step": 19370 }, { "epoch": 3.647656691134952, - "grad_norm": 24.746118545532227, - "learning_rate": 8.114059853190288e-06, - "loss": 0.7507, + "grad_norm": 3.184572458267212, + "learning_rate": 0.0001352343308865048, + "loss": 0.8191, "step": 19380 }, { "epoch": 3.6495388669301714, - "grad_norm": 10.281793594360352, - "learning_rate": 8.102766798418974e-06, - "loss": 0.5547, + "grad_norm": 2.500969409942627, + "learning_rate": 0.00013504611330698288, + "loss": 0.6022, "step": 19390 }, { "epoch": 3.6514210427253904, - "grad_norm": 0.5525919198989868, - "learning_rate": 8.091473743647658e-06, - "loss": 0.5547, + "grad_norm": 0.1688629537820816, + "learning_rate": 0.00013485789572746095, + "loss": 0.7283, "step": 19400 }, { "epoch": 3.65330321852061, - "grad_norm": 51.09165573120117, - "learning_rate": 8.08018068887634e-06, - "loss": 0.6717, + "grad_norm": 4.442159175872803, + "learning_rate": 0.000134669678147939, + "loss": 0.7114, "step": 19410 }, { "epoch": 3.6551853943158292, - "grad_norm": 18.030263900756836, - "learning_rate": 8.068887634105025e-06, - "loss": 0.6471, + "grad_norm": 3.3339366912841797, + "learning_rate": 0.00013448146056841708, + "loss": 0.8124, "step": 19420 }, { "epoch": 3.657067570111048, - "grad_norm": 10.15441608428955, - "learning_rate": 8.05759457933371e-06, - "loss": 0.5754, + "grad_norm": 3.360987663269043, + "learning_rate": 0.00013429324298889518, + "loss": 0.5944, "step": 19430 }, { "epoch": 3.6589497459062676, - "grad_norm": 8.153667449951172, - "learning_rate": 8.046301524562396e-06, - "loss": 0.4898, + "grad_norm": 1.367207407951355, + "learning_rate": 0.00013410502540937326, + "loss": 0.5212, "step": 19440 }, { "epoch": 3.660831921701487, - "grad_norm": 22.26079750061035, - "learning_rate": 8.035008469791078e-06, - "loss": 0.6954, + "grad_norm": 2.3143234252929688, + "learning_rate": 0.0001339168078298513, + "loss": 0.7531, "step": 19450 }, { "epoch": 3.662714097496706, - "grad_norm": 8.062853813171387, - "learning_rate": 8.023715415019763e-06, - "loss": 0.5817, + "grad_norm": 2.4220802783966064, + "learning_rate": 0.00013372859025032938, + "loss": 0.573, "step": 19460 }, { "epoch": 3.6645962732919255, - "grad_norm": 20.452653884887695, - "learning_rate": 8.012422360248447e-06, - "loss": 0.6119, + "grad_norm": 3.9013524055480957, + "learning_rate": 0.00013354037267080746, + "loss": 0.6934, "step": 19470 }, { "epoch": 3.666478449087145, - "grad_norm": 16.677806854248047, - "learning_rate": 8.001129305477133e-06, - "loss": 0.5955, + "grad_norm": 2.481264114379883, + "learning_rate": 0.00013335215509128554, + "loss": 0.6393, "step": 19480 }, { "epoch": 3.668360624882364, - "grad_norm": 13.450298309326172, - "learning_rate": 7.989836250705816e-06, - "loss": 0.4935, + "grad_norm": 1.6518964767456055, + "learning_rate": 0.0001331639375117636, + "loss": 0.4722, "step": 19490 }, { "epoch": 3.6702428006775834, - "grad_norm": 12.314119338989258, - "learning_rate": 7.9785431959345e-06, - "loss": 0.7169, + "grad_norm": 3.1066956520080566, + "learning_rate": 0.00013297571993224166, + "loss": 0.722, "step": 19500 }, { "epoch": 3.6721249764728023, - "grad_norm": 13.286541938781738, - "learning_rate": 7.967250141163185e-06, - "loss": 0.7006, + "grad_norm": 3.954586982727051, + "learning_rate": 0.00013278750235271974, + "loss": 0.5826, "step": 19510 }, { "epoch": 3.6740071522680218, - "grad_norm": 8.442977905273438, - "learning_rate": 7.955957086391869e-06, - "loss": 0.4225, + "grad_norm": 1.8204858303070068, + "learning_rate": 0.00013259928477319782, + "loss": 0.6208, "step": 19520 }, { "epoch": 3.675889328063241, - "grad_norm": 18.344675064086914, - "learning_rate": 7.944664031620555e-06, - "loss": 0.5021, + "grad_norm": 2.552551746368408, + "learning_rate": 0.0001324110671936759, + "loss": 0.4284, "step": 19530 }, { "epoch": 3.6777715038584606, - "grad_norm": 22.64541244506836, - "learning_rate": 7.933370976849238e-06, - "loss": 0.596, + "grad_norm": 4.546378135681152, + "learning_rate": 0.00013222284961415397, + "loss": 0.6156, "step": 19540 }, { "epoch": 3.6796536796536796, - "grad_norm": 20.384031295776367, - "learning_rate": 7.922077922077922e-06, - "loss": 0.6877, + "grad_norm": 3.2740485668182373, + "learning_rate": 0.00013203463203463205, + "loss": 0.6867, "step": 19550 }, { "epoch": 3.681535855448899, - "grad_norm": 8.203783988952637, - "learning_rate": 7.910784867306606e-06, - "loss": 0.6984, + "grad_norm": 5.2918877601623535, + "learning_rate": 0.00013184641445511012, + "loss": 0.6754, "step": 19560 }, { "epoch": 3.683418031244118, - "grad_norm": 8.949334144592285, - "learning_rate": 7.89949181253529e-06, - "loss": 0.642, + "grad_norm": 2.1436519622802734, + "learning_rate": 0.0001316581968755882, + "loss": 0.674, "step": 19570 }, { "epoch": 3.6853002070393375, - "grad_norm": 9.0608491897583, - "learning_rate": 7.888198757763975e-06, - "loss": 0.6827, + "grad_norm": 1.6020269393920898, + "learning_rate": 0.00013146997929606625, + "loss": 0.6801, "step": 19580 }, { "epoch": 3.687182382834557, - "grad_norm": 11.608131408691406, - "learning_rate": 7.87690570299266e-06, - "loss": 0.5154, + "grad_norm": 3.0209949016571045, + "learning_rate": 0.00013128176171654432, + "loss": 0.5285, "step": 19590 }, { "epoch": 3.689064558629776, - "grad_norm": 42.18769836425781, - "learning_rate": 7.865612648221344e-06, - "loss": 0.6003, + "grad_norm": 6.093017101287842, + "learning_rate": 0.0001310935441370224, + "loss": 0.8178, "step": 19600 }, { "epoch": 3.6909467344249953, - "grad_norm": 10.675950050354004, - "learning_rate": 7.854319593450028e-06, - "loss": 0.4056, + "grad_norm": 2.7886905670166016, + "learning_rate": 0.00013090532655750048, + "loss": 0.5863, "step": 19610 }, { "epoch": 3.6928289102202143, - "grad_norm": 21.224416732788086, - "learning_rate": 7.843026538678713e-06, - "loss": 0.5917, + "grad_norm": 3.8390111923217773, + "learning_rate": 0.00013071710897797853, + "loss": 0.5651, "step": 19620 }, { "epoch": 3.6947110860154337, - "grad_norm": 13.734648704528809, - "learning_rate": 7.831733483907397e-06, - "loss": 0.6431, + "grad_norm": 4.392716407775879, + "learning_rate": 0.0001305288913984566, + "loss": 0.7295, "step": 19630 }, { "epoch": 3.696593261810653, - "grad_norm": 5.891987323760986, - "learning_rate": 7.820440429136081e-06, - "loss": 0.4731, + "grad_norm": 2.144402503967285, + "learning_rate": 0.00013034067381893468, + "loss": 0.6048, "step": 19640 }, { "epoch": 3.6984754376058726, - "grad_norm": 6.475367546081543, - "learning_rate": 7.809147374364766e-06, - "loss": 0.5094, + "grad_norm": 0.8373141288757324, + "learning_rate": 0.00013015245623941278, + "loss": 0.6461, "step": 19650 }, { "epoch": 3.7003576134010916, - "grad_norm": 5.192591667175293, - "learning_rate": 7.79785431959345e-06, - "loss": 0.508, + "grad_norm": 1.1086032390594482, + "learning_rate": 0.00012996423865989086, + "loss": 0.5463, "step": 19660 }, { "epoch": 3.702239789196311, - "grad_norm": 5.926048755645752, - "learning_rate": 7.786561264822135e-06, - "loss": 0.5057, + "grad_norm": 4.724222183227539, + "learning_rate": 0.0001297760210803689, + "loss": 0.7127, "step": 19670 }, { "epoch": 3.70412196499153, - "grad_norm": 41.563819885253906, - "learning_rate": 7.775268210050819e-06, - "loss": 0.7474, + "grad_norm": 5.788373947143555, + "learning_rate": 0.00012958780350084698, + "loss": 0.7291, "step": 19680 }, { "epoch": 3.7060041407867494, - "grad_norm": 22.899145126342773, - "learning_rate": 7.763975155279503e-06, - "loss": 0.4772, + "grad_norm": 2.612990617752075, + "learning_rate": 0.00012939958592132506, + "loss": 0.5288, "step": 19690 }, { "epoch": 3.707886316581969, - "grad_norm": 1.0630342960357666, - "learning_rate": 7.752682100508188e-06, - "loss": 0.3682, + "grad_norm": 0.08691232651472092, + "learning_rate": 0.00012921136834180314, + "loss": 0.3476, "step": 19700 }, { "epoch": 3.709768492377188, - "grad_norm": 10.798871994018555, - "learning_rate": 7.741389045736872e-06, - "loss": 0.7488, + "grad_norm": 4.636226654052734, + "learning_rate": 0.0001290231507622812, + "loss": 0.786, "step": 19710 }, { "epoch": 3.7116506681724073, - "grad_norm": 9.957785606384277, - "learning_rate": 7.730095990965557e-06, - "loss": 0.4364, + "grad_norm": 3.9436421394348145, + "learning_rate": 0.00012883493318275926, + "loss": 0.4371, "step": 19720 }, { "epoch": 3.7135328439676267, - "grad_norm": 3.4311161041259766, - "learning_rate": 7.718802936194241e-06, - "loss": 0.4192, + "grad_norm": 1.0998643636703491, + "learning_rate": 0.00012864671560323734, + "loss": 0.5665, "step": 19730 }, { "epoch": 3.7154150197628457, - "grad_norm": 10.924822807312012, - "learning_rate": 7.707509881422925e-06, - "loss": 0.5174, + "grad_norm": 5.126893520355225, + "learning_rate": 0.00012845849802371542, + "loss": 0.6374, "step": 19740 }, { "epoch": 3.717297195558065, - "grad_norm": 11.127129554748535, - "learning_rate": 7.696216826651608e-06, - "loss": 0.4632, + "grad_norm": 1.9523310661315918, + "learning_rate": 0.00012827028044419347, + "loss": 0.6891, "step": 19750 }, { "epoch": 3.7191793713532846, - "grad_norm": 2.7011196613311768, - "learning_rate": 7.684923771880294e-06, - "loss": 0.499, + "grad_norm": 1.3989789485931396, + "learning_rate": 0.00012808206286467157, + "loss": 0.5985, "step": 19760 }, { "epoch": 3.7210615471485036, - "grad_norm": 3.9069223403930664, - "learning_rate": 7.673630717108978e-06, - "loss": 0.6029, + "grad_norm": 0.2790946960449219, + "learning_rate": 0.00012789384528514965, + "loss": 0.7122, "step": 19770 }, { "epoch": 3.722943722943723, - "grad_norm": 4.461427211761475, - "learning_rate": 7.662337662337663e-06, - "loss": 0.7491, + "grad_norm": 2.048234224319458, + "learning_rate": 0.00012770562770562772, + "loss": 0.7603, "step": 19780 }, { "epoch": 3.724825898738942, - "grad_norm": 19.533382415771484, - "learning_rate": 7.651044607566347e-06, - "loss": 0.6521, + "grad_norm": 3.0206234455108643, + "learning_rate": 0.0001275174101261058, + "loss": 0.7907, "step": 19790 }, { "epoch": 3.7267080745341614, - "grad_norm": 2.6040291786193848, - "learning_rate": 7.63975155279503e-06, - "loss": 0.4989, + "grad_norm": 2.8944666385650635, + "learning_rate": 0.00012732919254658385, + "loss": 0.6928, "step": 19800 }, { "epoch": 3.728590250329381, - "grad_norm": 11.266732215881348, - "learning_rate": 7.628458498023715e-06, - "loss": 0.5427, + "grad_norm": 2.925626754760742, + "learning_rate": 0.00012714097496706192, + "loss": 0.6453, "step": 19810 }, { "epoch": 3.7304724261246003, - "grad_norm": 0.7041659951210022, - "learning_rate": 7.6171654432524e-06, - "loss": 0.482, + "grad_norm": 2.9843673706054688, + "learning_rate": 0.00012695275738754, + "loss": 0.5517, "step": 19820 }, { "epoch": 3.7323546019198193, - "grad_norm": 7.669864654541016, - "learning_rate": 7.605872388481085e-06, - "loss": 0.4862, + "grad_norm": 2.6966941356658936, + "learning_rate": 0.00012676453980801808, + "loss": 0.6548, "step": 19830 }, { "epoch": 3.7342367777150387, - "grad_norm": 10.8816499710083, - "learning_rate": 7.594579333709768e-06, - "loss": 0.7602, + "grad_norm": 2.4660897254943848, + "learning_rate": 0.00012657632222849613, + "loss": 0.6221, "step": 19840 }, { "epoch": 3.7361189535102577, - "grad_norm": 12.468655586242676, - "learning_rate": 7.583286278938453e-06, - "loss": 0.4088, + "grad_norm": 2.4245846271514893, + "learning_rate": 0.0001263881046489742, + "loss": 0.4908, "step": 19850 }, { "epoch": 3.738001129305477, - "grad_norm": 0.618268609046936, - "learning_rate": 7.571993224167137e-06, - "loss": 0.2964, + "grad_norm": 2.279542922973633, + "learning_rate": 0.00012619988706945228, + "loss": 0.3842, "step": 19860 }, { "epoch": 3.7398833051006966, - "grad_norm": 19.64562225341797, - "learning_rate": 7.560700169395822e-06, - "loss": 0.7775, + "grad_norm": 4.801576614379883, + "learning_rate": 0.00012601166948993038, + "loss": 0.7761, "step": 19870 }, { "epoch": 3.7417654808959155, - "grad_norm": 18.206167221069336, - "learning_rate": 7.549407114624507e-06, - "loss": 0.7153, + "grad_norm": 7.5804901123046875, + "learning_rate": 0.00012582345191040846, + "loss": 0.8545, "step": 19880 }, { "epoch": 3.743647656691135, - "grad_norm": 7.192927837371826, - "learning_rate": 7.53811405985319e-06, - "loss": 0.4181, + "grad_norm": 2.122183322906494, + "learning_rate": 0.0001256352343308865, + "loss": 0.566, "step": 19890 }, { "epoch": 3.745529832486354, - "grad_norm": 29.97498893737793, - "learning_rate": 7.5268210050818746e-06, - "loss": 0.3835, + "grad_norm": 0.906879186630249, + "learning_rate": 0.00012544701675136458, + "loss": 0.3437, "step": 19900 }, { "epoch": 3.7474120082815734, - "grad_norm": 5.730542182922363, - "learning_rate": 7.51552795031056e-06, - "loss": 0.5346, + "grad_norm": 1.77112877368927, + "learning_rate": 0.00012525879917184266, + "loss": 0.5665, "step": 19910 }, { "epoch": 3.749294184076793, - "grad_norm": 15.303305625915527, - "learning_rate": 7.504234895539244e-06, - "loss": 0.5668, + "grad_norm": 2.0256805419921875, + "learning_rate": 0.00012507058159232074, + "loss": 0.5925, "step": 19920 }, { "epoch": 3.7511763598720123, - "grad_norm": 7.135068893432617, - "learning_rate": 7.4929418407679286e-06, - "loss": 0.5419, + "grad_norm": 3.9407191276550293, + "learning_rate": 0.00012488236401279881, + "loss": 0.523, "step": 19930 }, { "epoch": 3.7530585356672312, - "grad_norm": 18.504470825195312, - "learning_rate": 7.481648785996612e-06, - "loss": 0.6651, + "grad_norm": 7.084681034088135, + "learning_rate": 0.00012469414643327686, + "loss": 0.7849, "step": 19940 }, { "epoch": 3.7549407114624507, - "grad_norm": 4.049873352050781, - "learning_rate": 7.4703557312252965e-06, - "loss": 0.6697, + "grad_norm": 3.3468875885009766, + "learning_rate": 0.00012450592885375494, + "loss": 0.7904, "step": 19950 }, { "epoch": 3.7568228872576697, - "grad_norm": 0.3904266953468323, - "learning_rate": 7.459062676453981e-06, - "loss": 0.5437, + "grad_norm": 0.175517737865448, + "learning_rate": 0.00012431771127423302, + "loss": 0.4774, "step": 19960 }, { "epoch": 3.758705063052889, - "grad_norm": 2.0129761695861816, - "learning_rate": 7.447769621682665e-06, - "loss": 0.4477, + "grad_norm": 4.669493198394775, + "learning_rate": 0.0001241294936947111, + "loss": 0.6702, "step": 19970 }, { "epoch": 3.7605872388481085, - "grad_norm": 25.443471908569336, - "learning_rate": 7.43647656691135e-06, - "loss": 0.6205, + "grad_norm": 4.180354118347168, + "learning_rate": 0.00012394127611518917, + "loss": 0.7476, "step": 19980 }, { "epoch": 3.762469414643328, - "grad_norm": 27.52094841003418, - "learning_rate": 7.425183512140034e-06, - "loss": 0.4978, + "grad_norm": 4.086312294006348, + "learning_rate": 0.00012375305853566725, + "loss": 0.5892, "step": 19990 }, { "epoch": 3.764351590438547, - "grad_norm": 0.09147056192159653, - "learning_rate": 7.413890457368718e-06, - "loss": 0.398, + "grad_norm": 0.048435747623443604, + "learning_rate": 0.0001235648409561453, + "loss": 0.415, "step": 20000 }, { "epoch": 3.7662337662337664, - "grad_norm": 0.8058493733406067, - "learning_rate": 7.402597402597403e-06, - "loss": 0.4703, + "grad_norm": 0.6088755130767822, + "learning_rate": 0.00012337662337662337, + "loss": 0.5425, "step": 20010 }, { "epoch": 3.7681159420289854, - "grad_norm": 8.946433067321777, - "learning_rate": 7.391304347826087e-06, - "loss": 0.6018, + "grad_norm": 3.2463440895080566, + "learning_rate": 0.00012318840579710145, + "loss": 0.6901, "step": 20020 }, { "epoch": 3.769998117824205, - "grad_norm": 12.180794715881348, - "learning_rate": 7.3800112930547715e-06, - "loss": 0.5685, + "grad_norm": 2.989475965499878, + "learning_rate": 0.00012300018821757952, + "loss": 0.5331, "step": 20030 }, { "epoch": 3.7718802936194242, - "grad_norm": 11.476252555847168, - "learning_rate": 7.368718238283456e-06, - "loss": 0.5504, + "grad_norm": 6.133965492248535, + "learning_rate": 0.0001228119706380576, + "loss": 0.5936, "step": 20040 }, { "epoch": 3.773762469414643, - "grad_norm": 14.593816757202148, - "learning_rate": 7.35742518351214e-06, - "loss": 0.5439, + "grad_norm": 2.2449915409088135, + "learning_rate": 0.00012262375305853568, + "loss": 0.573, "step": 20050 }, { "epoch": 3.7756446452098626, - "grad_norm": 42.94921112060547, - "learning_rate": 7.346132128740825e-06, - "loss": 0.7765, + "grad_norm": 6.024438858032227, + "learning_rate": 0.00012243553547901375, + "loss": 0.7494, "step": 20060 }, { "epoch": 3.7775268210050816, - "grad_norm": 8.45206069946289, - "learning_rate": 7.334839073969509e-06, - "loss": 0.4416, + "grad_norm": 0.22473078966140747, + "learning_rate": 0.0001222473178994918, + "loss": 0.5063, "step": 20070 }, { "epoch": 3.779408996800301, - "grad_norm": 11.394601821899414, - "learning_rate": 7.3235460191981934e-06, - "loss": 0.5376, + "grad_norm": 4.407290458679199, + "learning_rate": 0.00012205910031996989, + "loss": 0.6881, "step": 20080 }, { "epoch": 3.7812911725955205, - "grad_norm": 0.5047250986099243, - "learning_rate": 7.312252964426877e-06, - "loss": 0.5465, + "grad_norm": 3.2443549633026123, + "learning_rate": 0.00012187088274044796, + "loss": 0.659, "step": 20090 }, { "epoch": 3.78317334839074, - "grad_norm": 8.610776901245117, - "learning_rate": 7.300959909655562e-06, - "loss": 0.4937, + "grad_norm": 2.6818675994873047, + "learning_rate": 0.00012168266516092603, + "loss": 0.6821, "step": 20100 }, { "epoch": 3.785055524185959, - "grad_norm": 16.135894775390625, - "learning_rate": 7.289666854884246e-06, - "loss": 0.3992, + "grad_norm": 3.2473978996276855, + "learning_rate": 0.0001214944475814041, + "loss": 0.2873, "step": 20110 }, { "epoch": 3.7869376999811784, - "grad_norm": 10.323850631713867, - "learning_rate": 7.278373800112931e-06, - "loss": 0.6229, + "grad_norm": 2.7352027893066406, + "learning_rate": 0.00012130623000188218, + "loss": 0.8633, "step": 20120 }, { "epoch": 3.7888198757763973, - "grad_norm": 24.070560455322266, - "learning_rate": 7.267080745341615e-06, - "loss": 0.6724, + "grad_norm": 3.0541322231292725, + "learning_rate": 0.00012111801242236026, + "loss": 0.7022, "step": 20130 }, { "epoch": 3.7907020515716168, - "grad_norm": 15.11103343963623, - "learning_rate": 7.2557876905703e-06, - "loss": 0.4975, + "grad_norm": 3.460502862930298, + "learning_rate": 0.00012092979484283832, + "loss": 0.4832, "step": 20140 }, { "epoch": 3.792584227366836, - "grad_norm": 14.725980758666992, - "learning_rate": 7.244494635798984e-06, - "loss": 0.65, + "grad_norm": 3.218946933746338, + "learning_rate": 0.0001207415772633164, + "loss": 0.8061, "step": 20150 }, { "epoch": 3.794466403162055, - "grad_norm": 13.276191711425781, - "learning_rate": 7.233201581027668e-06, - "loss": 0.2699, + "grad_norm": 2.4092910289764404, + "learning_rate": 0.00012055335968379446, + "loss": 0.3311, "step": 20160 }, { "epoch": 3.7963485789572746, - "grad_norm": 16.452234268188477, - "learning_rate": 7.221908526256353e-06, - "loss": 0.7108, + "grad_norm": 1.346550703048706, + "learning_rate": 0.00012036514210427254, + "loss": 0.609, "step": 20170 }, { "epoch": 3.7982307547524936, - "grad_norm": 18.935544967651367, - "learning_rate": 7.210615471485036e-06, - "loss": 0.4207, + "grad_norm": 3.535579204559326, + "learning_rate": 0.00012017692452475062, + "loss": 0.4008, "step": 20180 }, { "epoch": 3.800112930547713, - "grad_norm": 14.351372718811035, - "learning_rate": 7.199322416713722e-06, - "loss": 0.554, + "grad_norm": 3.253265857696533, + "learning_rate": 0.00011998870694522869, + "loss": 0.5573, "step": 20190 }, { "epoch": 3.8019951063429325, - "grad_norm": 2.5707204341888428, - "learning_rate": 7.188029361942405e-06, - "loss": 0.392, + "grad_norm": 0.9760240316390991, + "learning_rate": 0.00011980048936570676, + "loss": 0.5194, "step": 20200 }, { "epoch": 3.803877282138152, - "grad_norm": 1.579572081565857, - "learning_rate": 7.17673630717109e-06, - "loss": 0.3267, + "grad_norm": 4.288007736206055, + "learning_rate": 0.00011961227178618483, + "loss": 0.514, "step": 20210 }, { "epoch": 3.805759457933371, - "grad_norm": 4.0918660163879395, - "learning_rate": 7.165443252399774e-06, - "loss": 0.6002, + "grad_norm": 7.366967678070068, + "learning_rate": 0.0001194240542066629, + "loss": 0.5861, "step": 20220 }, { "epoch": 3.8076416337285903, - "grad_norm": 16.16851043701172, - "learning_rate": 7.154150197628458e-06, - "loss": 0.3712, + "grad_norm": 1.3266446590423584, + "learning_rate": 0.00011923583662714098, + "loss": 0.3292, "step": 20230 }, { "epoch": 3.8095238095238093, - "grad_norm": 11.626598358154297, - "learning_rate": 7.142857142857143e-06, - "loss": 0.8826, + "grad_norm": 2.6739280223846436, + "learning_rate": 0.00011904761904761905, + "loss": 0.934, "step": 20240 }, { "epoch": 3.8114059853190287, - "grad_norm": 17.99329948425293, - "learning_rate": 7.131564088085827e-06, - "loss": 0.8901, + "grad_norm": 4.105169773101807, + "learning_rate": 0.00011885940146809712, + "loss": 0.9709, "step": 20250 }, { "epoch": 3.813288161114248, - "grad_norm": 0.3546718955039978, - "learning_rate": 7.120271033314512e-06, - "loss": 0.7524, + "grad_norm": 2.8270390033721924, + "learning_rate": 0.0001186711838885752, + "loss": 0.7253, "step": 20260 }, { "epoch": 3.8151703369094676, - "grad_norm": 6.967835426330566, - "learning_rate": 7.108977978543196e-06, - "loss": 0.3611, + "grad_norm": 3.085862398147583, + "learning_rate": 0.00011848296630905326, + "loss": 0.3944, "step": 20270 }, { "epoch": 3.8170525127046866, - "grad_norm": 0.7506970763206482, - "learning_rate": 7.097684923771881e-06, - "loss": 0.6239, + "grad_norm": 2.2109007835388184, + "learning_rate": 0.00011829474872953134, + "loss": 0.6252, "step": 20280 }, { "epoch": 3.818934688499906, - "grad_norm": 7.021599769592285, - "learning_rate": 7.086391869000565e-06, - "loss": 0.8331, + "grad_norm": 1.0854592323303223, + "learning_rate": 0.00011810653115000942, + "loss": 0.8545, "step": 20290 }, { "epoch": 3.820816864295125, - "grad_norm": 0.8781418800354004, - "learning_rate": 7.075098814229249e-06, - "loss": 0.5364, + "grad_norm": 2.4989964962005615, + "learning_rate": 0.00011791831357048749, + "loss": 0.6113, "step": 20300 }, { "epoch": 3.8226990400903444, - "grad_norm": 10.943838119506836, - "learning_rate": 7.063805759457933e-06, - "loss": 0.3288, + "grad_norm": 2.8781096935272217, + "learning_rate": 0.00011773009599096556, + "loss": 0.3864, "step": 20310 }, { "epoch": 3.824581215885564, - "grad_norm": 6.258754730224609, - "learning_rate": 7.052512704686618e-06, - "loss": 0.2297, + "grad_norm": 5.14607572555542, + "learning_rate": 0.00011754187841144363, + "loss": 0.4674, "step": 20320 }, { "epoch": 3.826463391680783, - "grad_norm": 6.189356803894043, - "learning_rate": 7.041219649915302e-06, - "loss": 0.4229, + "grad_norm": 2.13116455078125, + "learning_rate": 0.0001173536608319217, + "loss": 0.5366, "step": 20330 }, { "epoch": 3.8283455674760023, - "grad_norm": 0.3370305299758911, - "learning_rate": 7.0299265951439865e-06, - "loss": 0.8312, + "grad_norm": 0.10455270856618881, + "learning_rate": 0.00011716544325239978, + "loss": 0.9137, "step": 20340 }, { "epoch": 3.8302277432712213, - "grad_norm": 0.39705947041511536, - "learning_rate": 7.018633540372671e-06, - "loss": 0.6544, + "grad_norm": 0.04126379266381264, + "learning_rate": 0.00011697722567287785, + "loss": 0.6901, "step": 20350 }, { "epoch": 3.8321099190664407, - "grad_norm": 5.138480186462402, - "learning_rate": 7.007340485601355e-06, - "loss": 0.5875, + "grad_norm": 2.3497095108032227, + "learning_rate": 0.00011678900809335592, + "loss": 0.5338, "step": 20360 }, { "epoch": 3.83399209486166, - "grad_norm": 16.162595748901367, - "learning_rate": 6.9960474308300405e-06, - "loss": 0.694, + "grad_norm": 4.9876179695129395, + "learning_rate": 0.000116600790513834, + "loss": 0.8286, "step": 20370 }, { "epoch": 3.8358742706568796, - "grad_norm": 10.226083755493164, - "learning_rate": 6.984754376058724e-06, - "loss": 0.6075, + "grad_norm": 7.40828800201416, + "learning_rate": 0.00011641257293431206, + "loss": 0.6535, "step": 20380 }, { "epoch": 3.8377564464520986, - "grad_norm": 11.182906150817871, - "learning_rate": 6.9734613212874084e-06, - "loss": 0.5489, + "grad_norm": 2.155172109603882, + "learning_rate": 0.00011622435535479014, + "loss": 0.5168, "step": 20390 }, { "epoch": 3.839638622247318, - "grad_norm": 16.98402214050293, - "learning_rate": 6.962168266516093e-06, - "loss": 0.5389, + "grad_norm": 4.699321269989014, + "learning_rate": 0.00011603613777526822, + "loss": 0.4554, "step": 20400 }, { "epoch": 3.841520798042537, - "grad_norm": 7.632681846618652, - "learning_rate": 6.950875211744777e-06, - "loss": 0.4979, + "grad_norm": 2.950913429260254, + "learning_rate": 0.00011584792019574629, + "loss": 0.5303, "step": 20410 }, { "epoch": 3.8434029738377564, - "grad_norm": 11.018415451049805, - "learning_rate": 6.939582156973462e-06, - "loss": 0.792, + "grad_norm": 1.8240998983383179, + "learning_rate": 0.00011565970261622436, + "loss": 0.7742, "step": 20420 }, { "epoch": 3.845285149632976, - "grad_norm": 18.204879760742188, - "learning_rate": 6.928289102202146e-06, - "loss": 0.4916, + "grad_norm": 5.689794063568115, + "learning_rate": 0.00011547148503670243, + "loss": 0.5243, "step": 20430 }, { "epoch": 3.847167325428195, - "grad_norm": 17.605222702026367, - "learning_rate": 6.91699604743083e-06, - "loss": 0.2598, + "grad_norm": 3.2978150844573975, + "learning_rate": 0.0001152832674571805, + "loss": 0.354, "step": 20440 }, { "epoch": 3.8490495012234143, - "grad_norm": 31.94579315185547, - "learning_rate": 6.905702992659515e-06, - "loss": 0.843, + "grad_norm": 4.142574787139893, + "learning_rate": 0.00011509504987765858, + "loss": 0.6876, "step": 20450 }, { "epoch": 3.8509316770186337, - "grad_norm": 16.11570167541504, - "learning_rate": 6.894409937888198e-06, - "loss": 0.5118, + "grad_norm": 3.3246145248413086, + "learning_rate": 0.00011490683229813665, + "loss": 0.7418, "step": 20460 }, { "epoch": 3.8528138528138527, - "grad_norm": 9.326797485351562, - "learning_rate": 6.8831168831168835e-06, - "loss": 0.5509, + "grad_norm": 1.9157615900039673, + "learning_rate": 0.00011471861471861472, + "loss": 0.6626, "step": 20470 }, { "epoch": 3.854696028609072, - "grad_norm": 12.51727294921875, - "learning_rate": 6.871823828345567e-06, - "loss": 0.5935, + "grad_norm": 2.236476182937622, + "learning_rate": 0.00011453039713909279, + "loss": 0.6926, "step": 20480 }, { "epoch": 3.8565782044042916, - "grad_norm": 18.62541389465332, - "learning_rate": 6.860530773574252e-06, - "loss": 0.7294, + "grad_norm": 2.5907342433929443, + "learning_rate": 0.00011434217955957086, + "loss": 0.8657, "step": 20490 }, { "epoch": 3.8584603801995105, - "grad_norm": 18.53264045715332, - "learning_rate": 6.849237718802937e-06, - "loss": 0.4357, + "grad_norm": 3.643303871154785, + "learning_rate": 0.00011415396198004894, + "loss": 0.6398, "step": 20500 }, { "epoch": 3.86034255599473, - "grad_norm": 22.299819946289062, - "learning_rate": 6.837944664031621e-06, - "loss": 0.5943, + "grad_norm": 4.363615036010742, + "learning_rate": 0.00011396574440052702, + "loss": 0.7104, "step": 20510 }, { "epoch": 3.862224731789949, - "grad_norm": 23.825881958007812, - "learning_rate": 6.826651609260305e-06, - "loss": 0.5521, + "grad_norm": 3.6445014476776123, + "learning_rate": 0.00011377752682100509, + "loss": 0.7257, "step": 20520 }, { "epoch": 3.8641069075851684, - "grad_norm": 20.267616271972656, - "learning_rate": 6.815358554488989e-06, - "loss": 0.8127, + "grad_norm": 1.6341899633407593, + "learning_rate": 0.00011358930924148316, + "loss": 0.7366, "step": 20530 }, { "epoch": 3.865989083380388, - "grad_norm": 29.187458038330078, - "learning_rate": 6.804065499717674e-06, - "loss": 0.5979, + "grad_norm": 3.796801805496216, + "learning_rate": 0.00011340109166196123, + "loss": 0.6457, "step": 20540 }, { "epoch": 3.8678712591756073, - "grad_norm": 15.423782348632812, - "learning_rate": 6.792772444946358e-06, - "loss": 0.6075, + "grad_norm": 2.6889047622680664, + "learning_rate": 0.0001132128740824393, + "loss": 0.5895, "step": 20550 }, { "epoch": 3.8697534349708262, - "grad_norm": 8.876800537109375, - "learning_rate": 6.781479390175043e-06, - "loss": 0.8203, + "grad_norm": 1.7333699464797974, + "learning_rate": 0.00011302465650291738, + "loss": 1.041, "step": 20560 }, { "epoch": 3.8716356107660457, - "grad_norm": 0.373295396566391, - "learning_rate": 6.7701863354037265e-06, - "loss": 0.7442, + "grad_norm": 0.7206604480743408, + "learning_rate": 0.00011283643892339545, + "loss": 0.7296, "step": 20570 }, { "epoch": 3.8735177865612647, - "grad_norm": 3.6619198322296143, - "learning_rate": 6.758893280632412e-06, - "loss": 0.2888, + "grad_norm": 0.6784987449645996, + "learning_rate": 0.00011264822134387352, + "loss": 0.3055, "step": 20580 }, { "epoch": 3.875399962356484, - "grad_norm": 7.333616256713867, - "learning_rate": 6.747600225861095e-06, - "loss": 0.5505, + "grad_norm": 3.1988539695739746, + "learning_rate": 0.00011246000376435159, + "loss": 0.7054, "step": 20590 }, { "epoch": 3.8772821381517035, - "grad_norm": 11.97646427154541, - "learning_rate": 6.73630717108978e-06, - "loss": 0.5715, + "grad_norm": 2.1100261211395264, + "learning_rate": 0.00011227178618482966, + "loss": 0.6783, "step": 20600 }, { "epoch": 3.8791643139469225, - "grad_norm": 17.42437744140625, - "learning_rate": 6.725014116318464e-06, - "loss": 0.7322, + "grad_norm": 3.418389320373535, + "learning_rate": 0.00011208356860530773, + "loss": 0.8839, "step": 20610 }, { "epoch": 3.881046489742142, - "grad_norm": 4.155119895935059, - "learning_rate": 6.713721061547148e-06, - "loss": 0.5882, + "grad_norm": 1.9344919919967651, + "learning_rate": 0.00011189535102578582, + "loss": 0.603, "step": 20620 }, { "epoch": 3.882928665537361, - "grad_norm": 20.934206008911133, - "learning_rate": 6.702428006775834e-06, - "loss": 0.7397, + "grad_norm": 4.261674880981445, + "learning_rate": 0.00011170713344626389, + "loss": 0.794, "step": 20630 }, { "epoch": 3.8848108413325804, - "grad_norm": 12.857183456420898, - "learning_rate": 6.691134952004517e-06, - "loss": 0.4704, + "grad_norm": 2.8918564319610596, + "learning_rate": 0.00011151891586674196, + "loss": 0.4593, "step": 20640 }, { "epoch": 3.8866930171278, - "grad_norm": 8.88806438446045, - "learning_rate": 6.679841897233202e-06, - "loss": 0.3095, + "grad_norm": 1.8943965435028076, + "learning_rate": 0.00011133069828722003, + "loss": 0.4184, "step": 20650 }, { "epoch": 3.8885751929230192, - "grad_norm": 11.002625465393066, - "learning_rate": 6.668548842461886e-06, - "loss": 0.5045, + "grad_norm": 3.679342269897461, + "learning_rate": 0.0001111424807076981, + "loss": 0.5149, "step": 20660 }, { "epoch": 3.890457368718238, - "grad_norm": 7.77680778503418, - "learning_rate": 6.65725578769057e-06, - "loss": 0.8244, + "grad_norm": 3.0474326610565186, + "learning_rate": 0.00011095426312817617, + "loss": 0.8624, "step": 20670 }, { "epoch": 3.8923395445134576, - "grad_norm": 0.3390113115310669, - "learning_rate": 6.645962732919255e-06, - "loss": 0.5562, + "grad_norm": 0.4835365116596222, + "learning_rate": 0.00011076604554865425, + "loss": 0.5949, "step": 20680 }, { "epoch": 3.8942217203086766, - "grad_norm": 19.375041961669922, - "learning_rate": 6.634669678147939e-06, - "loss": 0.4, + "grad_norm": 4.019068241119385, + "learning_rate": 0.00011057782796913232, + "loss": 0.4482, "step": 20690 }, { "epoch": 3.896103896103896, - "grad_norm": 47.01906967163086, - "learning_rate": 6.6233766233766234e-06, - "loss": 0.4626, + "grad_norm": 3.6996610164642334, + "learning_rate": 0.00011038961038961039, + "loss": 0.5172, "step": 20700 }, { "epoch": 3.8979860718991155, - "grad_norm": 12.001082420349121, - "learning_rate": 6.612083568605308e-06, - "loss": 0.7142, + "grad_norm": 2.3082823753356934, + "learning_rate": 0.00011020139281008846, + "loss": 0.6819, "step": 20710 }, { "epoch": 3.8998682476943345, - "grad_norm": 12.084139823913574, - "learning_rate": 6.600790513833992e-06, - "loss": 0.4786, + "grad_norm": 3.6274797916412354, + "learning_rate": 0.00011001317523056653, + "loss": 0.6246, "step": 20720 }, { "epoch": 3.901750423489554, - "grad_norm": 10.616601943969727, - "learning_rate": 6.589497459062677e-06, - "loss": 0.4461, + "grad_norm": 2.167787790298462, + "learning_rate": 0.00010982495765104462, + "loss": 0.5745, "step": 20730 }, { "epoch": 3.9036325992847734, - "grad_norm": 15.530953407287598, - "learning_rate": 6.578204404291361e-06, - "loss": 0.6108, + "grad_norm": 3.232084274291992, + "learning_rate": 0.00010963674007152269, + "loss": 0.6898, "step": 20740 }, { "epoch": 3.9055147750799923, - "grad_norm": 0.3299574851989746, - "learning_rate": 6.566911349520045e-06, - "loss": 0.5048, + "grad_norm": 1.4799513816833496, + "learning_rate": 0.00010944852249200076, + "loss": 0.5645, "step": 20750 }, { "epoch": 3.9073969508752118, - "grad_norm": 0.755125880241394, - "learning_rate": 6.55561829474873e-06, - "loss": 0.3229, + "grad_norm": 0.08950027078390121, + "learning_rate": 0.00010926030491247883, + "loss": 0.3951, "step": 20760 }, { "epoch": 3.909279126670431, - "grad_norm": 0.48004603385925293, - "learning_rate": 6.544325239977414e-06, - "loss": 0.4146, + "grad_norm": 0.2402801364660263, + "learning_rate": 0.0001090720873329569, + "loss": 0.3863, "step": 20770 }, { "epoch": 3.91116130246565, - "grad_norm": 33.05055236816406, - "learning_rate": 6.5330321852060985e-06, - "loss": 0.4969, + "grad_norm": 5.756802558898926, + "learning_rate": 0.00010888386975343497, + "loss": 0.5367, "step": 20780 }, { "epoch": 3.9130434782608696, - "grad_norm": 2.9246139526367188, - "learning_rate": 6.521739130434783e-06, - "loss": 1.0341, + "grad_norm": 2.1738858222961426, + "learning_rate": 0.00010869565217391305, + "loss": 0.9819, "step": 20790 }, { "epoch": 3.9149256540560886, - "grad_norm": 8.560576438903809, - "learning_rate": 6.510446075663467e-06, - "loss": 0.4784, + "grad_norm": 1.9287985563278198, + "learning_rate": 0.00010850743459439112, + "loss": 0.5903, "step": 20800 }, { "epoch": 3.916807829851308, - "grad_norm": 14.475516319274902, - "learning_rate": 6.499153020892152e-06, - "loss": 0.422, + "grad_norm": 2.640705108642578, + "learning_rate": 0.00010831921701486919, + "loss": 0.4065, "step": 20810 }, { "epoch": 3.9186900056465275, - "grad_norm": 23.599973678588867, - "learning_rate": 6.487859966120836e-06, - "loss": 0.7579, + "grad_norm": 5.2311296463012695, + "learning_rate": 0.00010813099943534726, + "loss": 0.8136, "step": 20820 }, { "epoch": 3.920572181441747, - "grad_norm": 21.075658798217773, - "learning_rate": 6.4765669113495196e-06, - "loss": 0.5062, + "grad_norm": 1.2336487770080566, + "learning_rate": 0.00010794278185582533, + "loss": 0.5395, "step": 20830 }, { "epoch": 3.922454357236966, - "grad_norm": 11.33735466003418, - "learning_rate": 6.465273856578205e-06, - "loss": 0.7134, + "grad_norm": 2.3239247798919678, + "learning_rate": 0.00010775456427630342, + "loss": 0.7615, "step": 20840 }, { "epoch": 3.9243365330321853, - "grad_norm": 13.300880432128906, - "learning_rate": 6.453980801806888e-06, - "loss": 0.4887, + "grad_norm": 2.2713286876678467, + "learning_rate": 0.00010756634669678148, + "loss": 0.6509, "step": 20850 }, { "epoch": 3.9262187088274043, - "grad_norm": 16.807415008544922, - "learning_rate": 6.4426877470355736e-06, - "loss": 0.3613, + "grad_norm": 2.7799088954925537, + "learning_rate": 0.00010737812911725956, + "loss": 0.3079, "step": 20860 }, { "epoch": 3.9281008846226237, - "grad_norm": 10.84011459350586, - "learning_rate": 6.431394692264258e-06, - "loss": 0.5069, + "grad_norm": 4.758141994476318, + "learning_rate": 0.00010718991153773763, + "loss": 0.5902, "step": 20870 }, { "epoch": 3.929983060417843, - "grad_norm": 21.17901611328125, - "learning_rate": 6.420101637492942e-06, - "loss": 0.6276, + "grad_norm": 5.282011985778809, + "learning_rate": 0.0001070016939582157, + "loss": 0.5354, "step": 20880 }, { "epoch": 3.931865236213062, - "grad_norm": 12.811206817626953, - "learning_rate": 6.408808582721627e-06, - "loss": 0.5739, + "grad_norm": 2.415411949157715, + "learning_rate": 0.00010681347637869377, + "loss": 0.6325, "step": 20890 }, { "epoch": 3.9337474120082816, - "grad_norm": 2.8694605827331543, - "learning_rate": 6.39751552795031e-06, - "loss": 0.3009, + "grad_norm": 0.12160457670688629, + "learning_rate": 0.00010662525879917185, + "loss": 0.3388, "step": 20900 }, { "epoch": 3.9356295878035006, - "grad_norm": 10.879847526550293, - "learning_rate": 6.3862224731789955e-06, - "loss": 0.5047, + "grad_norm": 4.200422286987305, + "learning_rate": 0.00010643704121964992, + "loss": 0.6201, "step": 20910 }, { "epoch": 3.93751176359872, - "grad_norm": 12.21065616607666, - "learning_rate": 6.374929418407679e-06, - "loss": 0.6257, + "grad_norm": 4.672911167144775, + "learning_rate": 0.00010624882364012799, + "loss": 0.5476, "step": 20920 }, { "epoch": 3.9393939393939394, - "grad_norm": 12.078612327575684, - "learning_rate": 6.363636363636364e-06, - "loss": 0.7487, + "grad_norm": 3.7594289779663086, + "learning_rate": 0.00010606060606060606, + "loss": 0.6746, "step": 20930 }, { "epoch": 3.941276115189159, - "grad_norm": 7.435342311859131, - "learning_rate": 6.352343308865048e-06, - "loss": 0.4637, + "grad_norm": 1.4969550371170044, + "learning_rate": 0.00010587238848108413, + "loss": 0.4174, "step": 20940 }, { "epoch": 3.943158290984378, - "grad_norm": 16.511627197265625, - "learning_rate": 6.341050254093733e-06, - "loss": 0.6262, + "grad_norm": 0.9225571155548096, + "learning_rate": 0.00010568417090156222, + "loss": 0.5711, "step": 20950 }, { "epoch": 3.9450404667795973, - "grad_norm": 0.3107624053955078, - "learning_rate": 6.3297571993224165e-06, - "loss": 0.5074, + "grad_norm": 4.92745304107666, + "learning_rate": 0.00010549595332204028, + "loss": 0.6358, "step": 20960 }, { "epoch": 3.9469226425748163, - "grad_norm": 0.1293262541294098, - "learning_rate": 6.318464144551101e-06, - "loss": 0.7009, + "grad_norm": 0.09020210057497025, + "learning_rate": 0.00010530773574251836, + "loss": 0.7014, "step": 20970 }, { "epoch": 3.9488048183700357, - "grad_norm": 16.236751556396484, - "learning_rate": 6.307171089779785e-06, - "loss": 0.9354, + "grad_norm": 4.352484226226807, + "learning_rate": 0.00010511951816299642, + "loss": 1.125, "step": 20980 }, { "epoch": 3.950686994165255, - "grad_norm": 4.352895259857178, - "learning_rate": 6.29587803500847e-06, - "loss": 0.7928, + "grad_norm": 1.2882801294326782, + "learning_rate": 0.0001049313005834745, + "loss": 0.7509, "step": 20990 }, { "epoch": 3.9525691699604746, - "grad_norm": 7.811060428619385, - "learning_rate": 6.284584980237155e-06, - "loss": 0.384, + "grad_norm": 2.5389184951782227, + "learning_rate": 0.00010474308300395257, + "loss": 0.5879, "step": 21000 }, { "epoch": 3.9544513457556936, - "grad_norm": 30.867467880249023, - "learning_rate": 6.2732919254658384e-06, - "loss": 0.4535, + "grad_norm": 4.555139541625977, + "learning_rate": 0.00010455486542443065, + "loss": 0.6353, "step": 21010 }, { "epoch": 3.956333521550913, - "grad_norm": 9.880651473999023, - "learning_rate": 6.261998870694524e-06, - "loss": 0.4897, + "grad_norm": 2.414466381072998, + "learning_rate": 0.00010436664784490872, + "loss": 0.5994, "step": 21020 }, { "epoch": 3.958215697346132, - "grad_norm": 26.8701114654541, - "learning_rate": 6.250705815923207e-06, - "loss": 0.5893, + "grad_norm": 2.828115701675415, + "learning_rate": 0.00010417843026538679, + "loss": 0.6584, "step": 21030 }, { "epoch": 3.9600978731413514, - "grad_norm": 21.494901657104492, - "learning_rate": 6.239412761151892e-06, - "loss": 0.5116, + "grad_norm": 4.323361873626709, + "learning_rate": 0.00010399021268586486, + "loss": 0.568, "step": 21040 }, { "epoch": 3.961980048936571, - "grad_norm": 13.416314125061035, - "learning_rate": 6.228119706380576e-06, - "loss": 0.6689, + "grad_norm": 1.821200966835022, + "learning_rate": 0.00010380199510634293, + "loss": 0.6949, "step": 21050 }, { "epoch": 3.96386222473179, - "grad_norm": 7.51889705657959, - "learning_rate": 6.21682665160926e-06, - "loss": 0.4839, + "grad_norm": 7.879891395568848, + "learning_rate": 0.00010361377752682102, + "loss": 0.5554, "step": 21060 }, { "epoch": 3.9657444005270093, - "grad_norm": 12.946698188781738, - "learning_rate": 6.205533596837945e-06, - "loss": 0.6613, + "grad_norm": 2.9675745964050293, + "learning_rate": 0.00010342555994729908, + "loss": 0.5815, "step": 21070 }, { "epoch": 3.9676265763222283, - "grad_norm": 18.94002342224121, - "learning_rate": 6.194240542066629e-06, - "loss": 0.607, + "grad_norm": 3.8450276851654053, + "learning_rate": 0.00010323734236777716, + "loss": 0.734, "step": 21080 }, { "epoch": 3.9695087521174477, - "grad_norm": 13.655399322509766, - "learning_rate": 6.1829474872953135e-06, - "loss": 0.6801, + "grad_norm": 3.7981014251708984, + "learning_rate": 0.00010304912478825522, + "loss": 0.7174, "step": 21090 }, { "epoch": 3.971390927912667, - "grad_norm": 12.325647354125977, - "learning_rate": 6.171654432523998e-06, - "loss": 0.6112, + "grad_norm": 3.7332234382629395, + "learning_rate": 0.0001028609072087333, + "loss": 0.5115, "step": 21100 }, { "epoch": 3.9732731037078866, - "grad_norm": 6.132079601287842, - "learning_rate": 6.160361377752682e-06, - "loss": 0.6071, + "grad_norm": 1.9932327270507812, + "learning_rate": 0.00010267268962921137, + "loss": 0.7141, "step": 21110 }, { "epoch": 3.9751552795031055, - "grad_norm": 0.7968364357948303, - "learning_rate": 6.149068322981367e-06, - "loss": 0.5114, + "grad_norm": 1.1441056728363037, + "learning_rate": 0.00010248447204968945, + "loss": 0.5968, "step": 21120 }, { "epoch": 3.977037455298325, - "grad_norm": 5.779326438903809, - "learning_rate": 6.137775268210051e-06, - "loss": 0.6626, + "grad_norm": 1.9460235834121704, + "learning_rate": 0.00010229625447016752, + "loss": 0.6997, "step": 21130 }, { "epoch": 3.978919631093544, - "grad_norm": 15.42678451538086, - "learning_rate": 6.126482213438735e-06, - "loss": 0.6293, + "grad_norm": 2.354952096939087, + "learning_rate": 0.00010210803689064559, + "loss": 0.5117, "step": 21140 }, { "epoch": 3.9808018068887634, - "grad_norm": 23.818042755126953, - "learning_rate": 6.11518915866742e-06, - "loss": 0.7765, + "grad_norm": 3.873466968536377, + "learning_rate": 0.00010191981931112366, + "loss": 0.7877, "step": 21150 }, { "epoch": 3.982683982683983, - "grad_norm": 15.091184616088867, - "learning_rate": 6.103896103896104e-06, - "loss": 0.6724, + "grad_norm": 5.34775972366333, + "learning_rate": 0.00010173160173160173, + "loss": 0.6326, "step": 21160 }, { "epoch": 3.984566158479202, - "grad_norm": 13.259024620056152, - "learning_rate": 6.0926030491247886e-06, - "loss": 0.6242, + "grad_norm": 2.9042680263519287, + "learning_rate": 0.00010154338415207982, + "loss": 0.762, "step": 21170 }, { "epoch": 3.9864483342744212, - "grad_norm": 10.13891887664795, - "learning_rate": 6.081309994353472e-06, - "loss": 0.807, + "grad_norm": 4.9012675285339355, + "learning_rate": 0.00010135516657255788, + "loss": 0.9534, "step": 21180 }, { "epoch": 3.9883305100696402, - "grad_norm": 10.566964149475098, - "learning_rate": 6.070016939582157e-06, - "loss": 0.4579, + "grad_norm": 3.0978500843048096, + "learning_rate": 0.00010116694899303596, + "loss": 0.5921, "step": 21190 }, { "epoch": 3.9902126858648597, - "grad_norm": 9.14899730682373, - "learning_rate": 6.058723884810841e-06, - "loss": 0.5235, + "grad_norm": 3.7232377529144287, + "learning_rate": 0.00010097873141351402, + "loss": 0.4836, "step": 21200 }, { "epoch": 3.992094861660079, - "grad_norm": 2.9169421195983887, - "learning_rate": 6.047430830039526e-06, - "loss": 0.3648, + "grad_norm": 2.5855820178985596, + "learning_rate": 0.0001007905138339921, + "loss": 0.478, "step": 21210 }, { "epoch": 3.9939770374552985, - "grad_norm": 14.589804649353027, - "learning_rate": 6.03613777526821e-06, - "loss": 0.4533, + "grad_norm": 2.2673332691192627, + "learning_rate": 0.00010060229625447016, + "loss": 0.52, "step": 21220 }, { "epoch": 3.9958592132505175, - "grad_norm": 24.296232223510742, - "learning_rate": 6.024844720496895e-06, - "loss": 0.5101, + "grad_norm": 3.1919641494750977, + "learning_rate": 0.00010041407867494825, + "loss": 0.4867, "step": 21230 }, { "epoch": 3.997741389045737, - "grad_norm": 33.149715423583984, - "learning_rate": 6.013551665725579e-06, - "loss": 0.6524, + "grad_norm": 5.216162204742432, + "learning_rate": 0.00010022586109542632, + "loss": 0.7683, "step": 21240 }, { "epoch": 3.999623564840956, - "grad_norm": 15.509395599365234, - "learning_rate": 6.002258610954264e-06, - "loss": 0.3432, + "grad_norm": 3.5101685523986816, + "learning_rate": 0.00010003764351590439, + "loss": 0.3582, "step": 21250 }, { "epoch": 4.0, - "eval_accuracy": 0.9209333333333334, - "eval_loss": 0.2990027368068695, - "eval_runtime": 253.833, - "eval_samples_per_second": 29.547, - "eval_steps_per_second": 3.695, + "eval_accuracy": 0.9232, + "eval_loss": 0.25806137919425964, + "eval_runtime": 293.9532, + "eval_samples_per_second": 25.514, + "eval_steps_per_second": 3.191, "step": 21252 }, { "epoch": 4.001505740636175, - "grad_norm": 10.212028503417969, - "learning_rate": 5.990965556182948e-06, - "loss": 0.751, + "grad_norm": 2.5859482288360596, + "learning_rate": 9.984942593638246e-05, + "loss": 0.6965, "step": 21260 }, { "epoch": 4.003387916431395, - "grad_norm": 12.337179183959961, - "learning_rate": 5.9796725014116315e-06, - "loss": 0.4736, + "grad_norm": 2.421527862548828, + "learning_rate": 9.966120835686053e-05, + "loss": 0.5074, "step": 21270 }, { "epoch": 4.005270092226614, - "grad_norm": 1.4147861003875732, - "learning_rate": 5.968379446640317e-06, - "loss": 0.3634, + "grad_norm": 0.755755603313446, + "learning_rate": 9.947299077733862e-05, + "loss": 0.3855, "step": 21280 }, { "epoch": 4.007152268021834, - "grad_norm": 21.909902572631836, - "learning_rate": 5.957086391869e-06, - "loss": 0.5764, + "grad_norm": 1.626813292503357, + "learning_rate": 9.928477319781668e-05, + "loss": 0.6369, "step": 21290 }, { "epoch": 4.009034443817052, - "grad_norm": 8.32170295715332, - "learning_rate": 5.9457933370976855e-06, - "loss": 0.5818, + "grad_norm": 1.9735829830169678, + "learning_rate": 9.909655561829476e-05, + "loss": 0.6928, "step": 21300 }, { "epoch": 4.010916619612272, - "grad_norm": 4.794271945953369, - "learning_rate": 5.934500282326369e-06, - "loss": 0.7417, + "grad_norm": 2.7982254028320312, + "learning_rate": 9.890833803877282e-05, + "loss": 0.7617, "step": 21310 }, { "epoch": 4.012798795407491, - "grad_norm": 25.876705169677734, - "learning_rate": 5.923207227555054e-06, - "loss": 0.4546, + "grad_norm": 3.808712959289551, + "learning_rate": 9.87201204592509e-05, + "loss": 0.5099, "step": 21320 }, { "epoch": 4.0146809712027105, - "grad_norm": 26.065282821655273, - "learning_rate": 5.911914172783738e-06, - "loss": 0.7006, + "grad_norm": 4.210937976837158, + "learning_rate": 9.853190287972896e-05, + "loss": 0.7595, "step": 21330 }, { "epoch": 4.01656314699793, - "grad_norm": 34.19929122924805, - "learning_rate": 5.900621118012422e-06, - "loss": 0.6222, + "grad_norm": 8.570923805236816, + "learning_rate": 9.834368530020705e-05, + "loss": 0.6814, "step": 21340 }, { "epoch": 4.0184453227931485, - "grad_norm": 1.1643229722976685, - "learning_rate": 5.889328063241107e-06, - "loss": 0.4165, + "grad_norm": 1.556383728981018, + "learning_rate": 9.815546772068511e-05, + "loss": 0.4653, "step": 21350 }, { "epoch": 4.020327498588368, - "grad_norm": 10.392925262451172, - "learning_rate": 5.878035008469791e-06, - "loss": 0.5281, + "grad_norm": 2.515901803970337, + "learning_rate": 9.796725014116319e-05, + "loss": 0.5543, "step": 21360 }, { "epoch": 4.022209674383587, - "grad_norm": 25.98088264465332, - "learning_rate": 5.866741953698476e-06, - "loss": 0.4653, + "grad_norm": 1.1342847347259521, + "learning_rate": 9.777903256164126e-05, + "loss": 0.577, "step": 21370 }, { "epoch": 4.024091850178807, - "grad_norm": 19.15206527709961, - "learning_rate": 5.85544889892716e-06, - "loss": 0.6925, + "grad_norm": 2.5764143466949463, + "learning_rate": 9.759081498211933e-05, + "loss": 0.7136, "step": 21380 }, { "epoch": 4.025974025974026, - "grad_norm": 8.375141143798828, - "learning_rate": 5.844155844155845e-06, - "loss": 0.975, + "grad_norm": 3.2266030311584473, + "learning_rate": 9.740259740259742e-05, + "loss": 0.9059, "step": 21390 }, { "epoch": 4.027856201769246, - "grad_norm": 11.092159271240234, - "learning_rate": 5.8328627893845285e-06, - "loss": 0.4277, + "grad_norm": 2.965477466583252, + "learning_rate": 9.721437982307548e-05, + "loss": 0.537, "step": 21400 }, { "epoch": 4.029738377564464, - "grad_norm": 17.455846786499023, - "learning_rate": 5.821569734613213e-06, - "loss": 0.7711, + "grad_norm": 2.400604248046875, + "learning_rate": 9.702616224355356e-05, + "loss": 0.7048, "step": 21410 }, { "epoch": 4.031620553359684, - "grad_norm": 5.300505638122559, - "learning_rate": 5.810276679841897e-06, - "loss": 0.6583, + "grad_norm": 2.111332893371582, + "learning_rate": 9.683794466403162e-05, + "loss": 0.7618, "step": 21420 }, { "epoch": 4.033502729154903, - "grad_norm": 0.19853296875953674, - "learning_rate": 5.798983625070582e-06, - "loss": 0.5738, + "grad_norm": 0.10220424830913544, + "learning_rate": 9.66497270845097e-05, + "loss": 0.6055, "step": 21430 }, { "epoch": 4.0353849049501225, - "grad_norm": 19.96097183227539, - "learning_rate": 5.787690570299266e-06, - "loss": 0.3168, + "grad_norm": 6.471917152404785, + "learning_rate": 9.646150950498776e-05, + "loss": 0.4042, "step": 21440 }, { "epoch": 4.037267080745342, - "grad_norm": 10.578010559082031, - "learning_rate": 5.77639751552795e-06, - "loss": 0.3206, + "grad_norm": 2.8826186656951904, + "learning_rate": 9.627329192546585e-05, + "loss": 0.442, "step": 21450 }, { "epoch": 4.0391492565405605, - "grad_norm": 7.319991111755371, - "learning_rate": 5.765104460756635e-06, - "loss": 0.6817, + "grad_norm": 1.8660509586334229, + "learning_rate": 9.608507434594391e-05, + "loss": 0.5408, "step": 21460 }, { "epoch": 4.04103143233578, - "grad_norm": 1.0113133192062378, - "learning_rate": 5.753811405985319e-06, - "loss": 0.4977, + "grad_norm": 0.3903960585594177, + "learning_rate": 9.589685676642199e-05, + "loss": 0.5129, "step": 21470 }, { "epoch": 4.042913608130999, - "grad_norm": 0.2078145146369934, - "learning_rate": 5.7425183512140036e-06, - "loss": 0.3547, + "grad_norm": 4.441036701202393, + "learning_rate": 9.570863918690006e-05, + "loss": 0.3228, "step": 21480 }, { "epoch": 4.044795783926219, - "grad_norm": 19.170316696166992, - "learning_rate": 5.731225296442688e-06, - "loss": 0.3471, + "grad_norm": 3.8729546070098877, + "learning_rate": 9.552042160737813e-05, + "loss": 0.45, "step": 21490 }, { "epoch": 4.046677959721438, - "grad_norm": 20.73012924194336, - "learning_rate": 5.719932241671372e-06, - "loss": 0.8461, + "grad_norm": 3.6702287197113037, + "learning_rate": 9.53322040278562e-05, + "loss": 1.0155, "step": 21500 }, { "epoch": 4.048560135516658, - "grad_norm": 33.84339141845703, - "learning_rate": 5.708639186900057e-06, - "loss": 0.406, + "grad_norm": 3.694718360900879, + "learning_rate": 9.514398644833428e-05, + "loss": 0.4059, "step": 21510 }, { "epoch": 4.050442311311876, - "grad_norm": 0.295187771320343, - "learning_rate": 5.697346132128741e-06, - "loss": 0.2405, + "grad_norm": 1.9021106958389282, + "learning_rate": 9.495576886881236e-05, + "loss": 0.3598, "step": 21520 }, { "epoch": 4.052324487107096, - "grad_norm": 8.998740196228027, - "learning_rate": 5.6860530773574255e-06, - "loss": 0.4896, + "grad_norm": 3.583451986312866, + "learning_rate": 9.476755128929042e-05, + "loss": 0.3496, "step": 21530 }, { "epoch": 4.054206662902315, - "grad_norm": 2.618791103363037, - "learning_rate": 5.67476002258611e-06, - "loss": 0.3719, + "grad_norm": 2.1199567317962646, + "learning_rate": 9.45793337097685e-05, + "loss": 0.4274, "step": 21540 }, { "epoch": 4.0560888386975344, - "grad_norm": 12.432368278503418, - "learning_rate": 5.663466967814793e-06, - "loss": 0.9172, + "grad_norm": 1.4874187707901, + "learning_rate": 9.439111613024656e-05, + "loss": 0.9868, "step": 21550 }, { "epoch": 4.057971014492754, - "grad_norm": 22.641036987304688, - "learning_rate": 5.652173913043479e-06, - "loss": 0.498, + "grad_norm": 2.7171809673309326, + "learning_rate": 9.420289855072465e-05, + "loss": 0.4751, "step": 21560 }, { "epoch": 4.059853190287973, - "grad_norm": 1.2676461935043335, - "learning_rate": 5.640880858272162e-06, - "loss": 0.4709, + "grad_norm": 0.14315591752529144, + "learning_rate": 9.401468097120271e-05, + "loss": 0.3579, "step": 21570 }, { "epoch": 4.061735366083192, - "grad_norm": 8.317973136901855, - "learning_rate": 5.629587803500847e-06, - "loss": 0.463, + "grad_norm": 1.0544301271438599, + "learning_rate": 9.382646339168079e-05, + "loss": 0.4521, "step": 21580 }, { "epoch": 4.063617541878411, - "grad_norm": 12.644018173217773, - "learning_rate": 5.618294748729531e-06, - "loss": 0.4324, + "grad_norm": 3.1227588653564453, + "learning_rate": 9.363824581215885e-05, + "loss": 0.6073, "step": 21590 }, { "epoch": 4.065499717673631, - "grad_norm": 6.407628536224365, - "learning_rate": 5.607001693958216e-06, - "loss": 0.256, + "grad_norm": 2.726243019104004, + "learning_rate": 9.345002823263693e-05, + "loss": 0.3532, "step": 21600 }, { "epoch": 4.06738189346885, - "grad_norm": 0.3914599120616913, - "learning_rate": 5.5957086391869005e-06, - "loss": 0.6861, + "grad_norm": 2.325624942779541, + "learning_rate": 9.3261810653115e-05, + "loss": 0.7147, "step": 21610 }, { "epoch": 4.06926406926407, - "grad_norm": 9.922120094299316, - "learning_rate": 5.584415584415584e-06, - "loss": 0.4863, + "grad_norm": 2.536409854888916, + "learning_rate": 9.307359307359308e-05, + "loss": 0.5313, "step": 21620 }, { "epoch": 4.071146245059288, - "grad_norm": 0.7256487011909485, - "learning_rate": 5.573122529644269e-06, - "loss": 0.585, + "grad_norm": 0.09779352694749832, + "learning_rate": 9.288537549407116e-05, + "loss": 0.7419, "step": 21630 }, { "epoch": 4.073028420854508, - "grad_norm": 10.62320613861084, - "learning_rate": 5.561829474872953e-06, - "loss": 0.43, + "grad_norm": 1.094053864479065, + "learning_rate": 9.269715791454922e-05, + "loss": 0.4456, "step": 21640 }, { "epoch": 4.074910596649727, - "grad_norm": 2.3755974769592285, - "learning_rate": 5.550536420101638e-06, - "loss": 0.4797, + "grad_norm": 2.9910216331481934, + "learning_rate": 9.25089403350273e-05, + "loss": 0.4874, "step": 21650 }, { "epoch": 4.076792772444946, - "grad_norm": 0.20227846503257751, - "learning_rate": 5.539243365330322e-06, - "loss": 0.4626, + "grad_norm": 0.17736339569091797, + "learning_rate": 9.232072275550536e-05, + "loss": 0.3904, "step": 21660 }, { "epoch": 4.078674948240166, - "grad_norm": 12.040499687194824, - "learning_rate": 5.527950310559007e-06, - "loss": 0.3619, + "grad_norm": 3.1419992446899414, + "learning_rate": 9.213250517598345e-05, + "loss": 0.3972, "step": 21670 }, { "epoch": 4.080557124035385, - "grad_norm": 1.3578740358352661, - "learning_rate": 5.51665725578769e-06, - "loss": 0.4352, + "grad_norm": 2.997563600540161, + "learning_rate": 9.194428759646151e-05, + "loss": 0.468, "step": 21680 }, { "epoch": 4.082439299830604, - "grad_norm": 7.650432109832764, - "learning_rate": 5.505364201016376e-06, - "loss": 0.673, + "grad_norm": 2.441227912902832, + "learning_rate": 9.175607001693959e-05, + "loss": 0.7764, "step": 21690 }, { "epoch": 4.084321475625823, - "grad_norm": 7.1012983322143555, - "learning_rate": 5.494071146245059e-06, - "loss": 0.5103, + "grad_norm": 3.6427226066589355, + "learning_rate": 9.156785243741765e-05, + "loss": 0.5896, "step": 21700 }, { "epoch": 4.086203651421043, - "grad_norm": 28.039716720581055, - "learning_rate": 5.4827780914737435e-06, - "loss": 0.4092, + "grad_norm": 4.410248756408691, + "learning_rate": 9.137963485789573e-05, + "loss": 0.4323, "step": 21710 }, { "epoch": 4.088085827216262, - "grad_norm": 7.539644718170166, - "learning_rate": 5.471485036702429e-06, - "loss": 0.6337, + "grad_norm": 7.132479667663574, + "learning_rate": 9.11914172783738e-05, + "loss": 0.7631, "step": 21720 }, { "epoch": 4.089968003011482, - "grad_norm": 28.933773040771484, - "learning_rate": 5.460191981931112e-06, - "loss": 0.4827, + "grad_norm": 2.9803075790405273, + "learning_rate": 9.100319969885188e-05, + "loss": 0.567, "step": 21730 }, { "epoch": 4.0918501788067, - "grad_norm": 25.65105628967285, - "learning_rate": 5.4488989271597975e-06, - "loss": 0.5597, + "grad_norm": 2.718230724334717, + "learning_rate": 9.081498211932996e-05, + "loss": 0.606, "step": 21740 }, { "epoch": 4.0937323546019195, - "grad_norm": 27.194011688232422, - "learning_rate": 5.437605872388481e-06, - "loss": 0.5449, + "grad_norm": 8.262181282043457, + "learning_rate": 9.062676453980802e-05, + "loss": 0.6464, "step": 21750 }, { "epoch": 4.095614530397139, - "grad_norm": 11.716024398803711, - "learning_rate": 5.426312817617166e-06, - "loss": 0.6295, + "grad_norm": 3.146902084350586, + "learning_rate": 9.04385469602861e-05, + "loss": 0.6862, "step": 21760 }, { "epoch": 4.097496706192358, - "grad_norm": 17.159563064575195, - "learning_rate": 5.41501976284585e-06, - "loss": 0.4375, + "grad_norm": 2.2113897800445557, + "learning_rate": 9.025032938076416e-05, + "loss": 0.6772, "step": 21770 }, { "epoch": 4.099378881987578, - "grad_norm": 6.972395420074463, - "learning_rate": 5.403726708074534e-06, - "loss": 0.6678, + "grad_norm": 2.7351415157318115, + "learning_rate": 9.006211180124225e-05, + "loss": 0.6737, "step": 21780 }, { "epoch": 4.101261057782797, - "grad_norm": 2.577281951904297, - "learning_rate": 5.3924336533032186e-06, - "loss": 0.3085, + "grad_norm": 2.420198678970337, + "learning_rate": 8.987389422172031e-05, + "loss": 0.3986, "step": 21790 }, { "epoch": 4.103143233578016, - "grad_norm": 8.173493385314941, - "learning_rate": 5.381140598531903e-06, - "loss": 0.4501, + "grad_norm": 5.054831027984619, + "learning_rate": 8.968567664219839e-05, + "loss": 0.6978, "step": 21800 }, { "epoch": 4.105025409373235, - "grad_norm": 0.4988132417201996, - "learning_rate": 5.369847543760587e-06, - "loss": 0.4765, + "grad_norm": 0.5988843441009521, + "learning_rate": 8.949745906267645e-05, + "loss": 0.5788, "step": 21810 }, { "epoch": 4.106907585168455, - "grad_norm": 0.59678715467453, - "learning_rate": 5.358554488989272e-06, - "loss": 0.3523, + "grad_norm": 5.237829685211182, + "learning_rate": 8.930924148315453e-05, + "loss": 0.5006, "step": 21820 }, { "epoch": 4.108789760963674, - "grad_norm": 22.480344772338867, - "learning_rate": 5.347261434217956e-06, - "loss": 0.5372, + "grad_norm": 6.762528419494629, + "learning_rate": 8.912102390363259e-05, + "loss": 0.5774, "step": 21830 }, { "epoch": 4.1106719367588935, - "grad_norm": 6.5898284912109375, - "learning_rate": 5.3359683794466405e-06, - "loss": 0.6108, + "grad_norm": 2.084718704223633, + "learning_rate": 8.893280632411068e-05, + "loss": 0.6407, "step": 21840 }, { "epoch": 4.112554112554113, - "grad_norm": 11.716509819030762, - "learning_rate": 5.324675324675325e-06, - "loss": 0.3293, + "grad_norm": 1.660806655883789, + "learning_rate": 8.874458874458876e-05, + "loss": 0.4582, "step": 21850 }, { "epoch": 4.1144362883493315, - "grad_norm": 9.849950790405273, - "learning_rate": 5.313382269904009e-06, - "loss": 0.5861, + "grad_norm": 3.171679973602295, + "learning_rate": 8.855637116506682e-05, + "loss": 0.5192, "step": 21860 }, { "epoch": 4.116318464144551, - "grad_norm": 16.31063461303711, - "learning_rate": 5.302089215132694e-06, - "loss": 0.3767, + "grad_norm": 2.277590751647949, + "learning_rate": 8.83681535855449e-05, + "loss": 0.4837, "step": 21870 }, { "epoch": 4.11820063993977, - "grad_norm": 18.616395950317383, - "learning_rate": 5.290796160361378e-06, - "loss": 0.5778, + "grad_norm": 2.3669557571411133, + "learning_rate": 8.817993600602296e-05, + "loss": 0.5819, "step": 21880 }, { "epoch": 4.12008281573499, - "grad_norm": 12.037795066833496, - "learning_rate": 5.279503105590062e-06, - "loss": 0.5588, + "grad_norm": 2.413088798522949, + "learning_rate": 8.799171842650105e-05, + "loss": 0.5921, "step": 21890 }, { "epoch": 4.121964991530209, - "grad_norm": 5.515803337097168, - "learning_rate": 5.268210050818747e-06, - "loss": 0.2835, + "grad_norm": 5.965948104858398, + "learning_rate": 8.780350084697911e-05, + "loss": 0.3192, "step": 21900 }, { "epoch": 4.123847167325428, - "grad_norm": 8.707112312316895, - "learning_rate": 5.256916996047431e-06, - "loss": 0.3438, + "grad_norm": 1.5568970441818237, + "learning_rate": 8.761528326745719e-05, + "loss": 0.4662, "step": 21910 }, { "epoch": 4.125729343120647, - "grad_norm": 0.553016185760498, - "learning_rate": 5.245623941276115e-06, - "loss": 0.3274, + "grad_norm": 0.12782663106918335, + "learning_rate": 8.742706568793525e-05, + "loss": 0.4925, "step": 21920 }, { "epoch": 4.127611518915867, - "grad_norm": 14.9821195602417, - "learning_rate": 5.2343308865048e-06, - "loss": 0.5166, + "grad_norm": 1.7786810398101807, + "learning_rate": 8.723884810841333e-05, + "loss": 0.5588, "step": 21930 }, { "epoch": 4.129493694711086, - "grad_norm": 34.491355895996094, - "learning_rate": 5.2230378317334834e-06, - "loss": 0.4849, + "grad_norm": 6.479440212249756, + "learning_rate": 8.705063052889139e-05, + "loss": 0.5754, "step": 21940 }, { "epoch": 4.1313758705063055, - "grad_norm": 14.449700355529785, - "learning_rate": 5.211744776962169e-06, - "loss": 0.768, + "grad_norm": 2.8325908184051514, + "learning_rate": 8.686241294936948e-05, + "loss": 0.9421, "step": 21950 }, { "epoch": 4.133258046301525, - "grad_norm": 17.19442367553711, - "learning_rate": 5.200451722190852e-06, - "loss": 0.3727, + "grad_norm": 4.928275108337402, + "learning_rate": 8.667419536984754e-05, + "loss": 0.469, "step": 21960 }, { "epoch": 4.1351402220967435, - "grad_norm": 2.9677646160125732, - "learning_rate": 5.1891586674195374e-06, - "loss": 0.6443, + "grad_norm": 0.39288848638534546, + "learning_rate": 8.648597779032562e-05, + "loss": 0.5645, "step": 21970 }, { "epoch": 4.137022397891963, - "grad_norm": 8.971100807189941, - "learning_rate": 5.177865612648222e-06, - "loss": 0.744, + "grad_norm": 1.3426021337509155, + "learning_rate": 8.62977602108037e-05, + "loss": 0.822, "step": 21980 }, { "epoch": 4.138904573687182, - "grad_norm": 14.347002029418945, - "learning_rate": 5.166572557876905e-06, - "loss": 0.4241, + "grad_norm": 2.0728249549865723, + "learning_rate": 8.610954263128176e-05, + "loss": 0.5174, "step": 21990 }, { "epoch": 4.140786749482402, - "grad_norm": 3.654014825820923, - "learning_rate": 5.155279503105591e-06, - "loss": 0.6624, + "grad_norm": 2.072270631790161, + "learning_rate": 8.592132505175985e-05, + "loss": 0.5649, "step": 22000 }, { "epoch": 4.142668925277621, - "grad_norm": 18.717609405517578, - "learning_rate": 5.143986448334274e-06, - "loss": 0.6139, + "grad_norm": 3.1153640747070312, + "learning_rate": 8.573310747223791e-05, + "loss": 0.5106, "step": 22010 }, { "epoch": 4.144551101072841, - "grad_norm": 25.252634048461914, - "learning_rate": 5.132693393562959e-06, - "loss": 0.5933, + "grad_norm": 4.662202835083008, + "learning_rate": 8.554488989271599e-05, + "loss": 0.6303, "step": 22020 }, { "epoch": 4.146433276868059, - "grad_norm": 8.735464096069336, - "learning_rate": 5.121400338791643e-06, - "loss": 0.782, + "grad_norm": 3.654289484024048, + "learning_rate": 8.535667231319405e-05, + "loss": 0.8146, "step": 22030 }, { "epoch": 4.148315452663279, - "grad_norm": 2.7719712257385254, - "learning_rate": 5.110107284020328e-06, - "loss": 0.4075, + "grad_norm": 1.2205557823181152, + "learning_rate": 8.516845473367213e-05, + "loss": 0.4592, "step": 22040 }, { "epoch": 4.150197628458498, - "grad_norm": 14.126623153686523, - "learning_rate": 5.098814229249012e-06, - "loss": 0.4613, + "grad_norm": 2.4779789447784424, + "learning_rate": 8.498023715415019e-05, + "loss": 0.467, "step": 22050 }, { "epoch": 4.1520798042537175, - "grad_norm": 1.2759391069412231, - "learning_rate": 5.087521174477696e-06, - "loss": 0.7308, + "grad_norm": 0.1437557190656662, + "learning_rate": 8.479201957462828e-05, + "loss": 0.7689, "step": 22060 }, { "epoch": 4.153961980048937, - "grad_norm": 2.6802024841308594, - "learning_rate": 5.07622811970638e-06, - "loss": 0.3868, + "grad_norm": 1.2576069831848145, + "learning_rate": 8.460380199510634e-05, + "loss": 0.5398, "step": 22070 }, { "epoch": 4.1558441558441555, - "grad_norm": 20.169902801513672, - "learning_rate": 5.064935064935065e-06, - "loss": 0.6009, + "grad_norm": 6.528464317321777, + "learning_rate": 8.441558441558442e-05, + "loss": 0.6382, "step": 22080 }, { "epoch": 4.157726331639375, - "grad_norm": 8.015284538269043, - "learning_rate": 5.05364201016375e-06, - "loss": 0.5751, + "grad_norm": 2.254312038421631, + "learning_rate": 8.42273668360625e-05, + "loss": 0.659, "step": 22090 }, { "epoch": 4.159608507434594, - "grad_norm": 9.399229049682617, - "learning_rate": 5.0423489553924336e-06, - "loss": 0.472, + "grad_norm": 2.7255895137786865, + "learning_rate": 8.403914925654056e-05, + "loss": 0.527, "step": 22100 }, { "epoch": 4.161490683229814, - "grad_norm": 16.64990997314453, - "learning_rate": 5.031055900621119e-06, - "loss": 0.2494, + "grad_norm": 2.356421947479248, + "learning_rate": 8.385093167701865e-05, + "loss": 0.2846, "step": 22110 }, { "epoch": 4.163372859025033, - "grad_norm": 34.64622116088867, - "learning_rate": 5.019762845849802e-06, - "loss": 0.5575, + "grad_norm": 5.929986000061035, + "learning_rate": 8.366271409749671e-05, + "loss": 0.625, "step": 22120 }, { "epoch": 4.165255034820253, - "grad_norm": 43.623146057128906, - "learning_rate": 5.0084697910784875e-06, - "loss": 0.4865, + "grad_norm": 3.677624225616455, + "learning_rate": 8.347449651797479e-05, + "loss": 0.4381, "step": 22130 }, { "epoch": 4.167137210615471, - "grad_norm": 11.624351501464844, - "learning_rate": 4.997176736307171e-06, - "loss": 0.6536, + "grad_norm": 4.380472660064697, + "learning_rate": 8.328627893845285e-05, + "loss": 0.6449, "step": 22140 }, { "epoch": 4.169019386410691, - "grad_norm": 6.294399261474609, - "learning_rate": 4.9858836815358555e-06, - "loss": 0.6794, + "grad_norm": 3.776270866394043, + "learning_rate": 8.309806135893093e-05, + "loss": 0.7702, "step": 22150 }, { "epoch": 4.17090156220591, - "grad_norm": 16.353845596313477, - "learning_rate": 4.97459062676454e-06, - "loss": 0.2492, + "grad_norm": 3.1007485389709473, + "learning_rate": 8.290984377940899e-05, + "loss": 0.286, "step": 22160 }, { "epoch": 4.1727837380011294, - "grad_norm": 6.7267608642578125, - "learning_rate": 4.963297571993224e-06, - "loss": 0.5912, + "grad_norm": 4.839145660400391, + "learning_rate": 8.272162619988708e-05, + "loss": 0.7047, "step": 22170 }, { "epoch": 4.174665913796349, - "grad_norm": 0.19940780103206635, - "learning_rate": 4.952004517221909e-06, - "loss": 0.3615, + "grad_norm": 0.20469647645950317, + "learning_rate": 8.253340862036514e-05, + "loss": 0.5551, "step": 22180 }, { "epoch": 4.176548089591567, - "grad_norm": 13.068621635437012, - "learning_rate": 4.940711462450593e-06, - "loss": 0.7576, + "grad_norm": 2.0428857803344727, + "learning_rate": 8.234519104084322e-05, + "loss": 0.7462, "step": 22190 }, { "epoch": 4.178430265386787, - "grad_norm": 4.3102240562438965, - "learning_rate": 4.929418407679277e-06, - "loss": 0.5176, + "grad_norm": 1.7860795259475708, + "learning_rate": 8.215697346132128e-05, + "loss": 0.6495, "step": 22200 }, { "epoch": 4.180312441182006, - "grad_norm": 17.297819137573242, - "learning_rate": 4.918125352907962e-06, - "loss": 0.7968, + "grad_norm": 3.704061508178711, + "learning_rate": 8.196875588179936e-05, + "loss": 0.8011, "step": 22210 }, { "epoch": 4.182194616977226, - "grad_norm": 29.88180923461914, - "learning_rate": 4.906832298136646e-06, - "loss": 0.5499, + "grad_norm": 5.398728370666504, + "learning_rate": 8.178053830227745e-05, + "loss": 0.6543, "step": 22220 }, { "epoch": 4.184076792772445, - "grad_norm": 35.76202392578125, - "learning_rate": 4.8955392433653305e-06, - "loss": 0.2527, + "grad_norm": 4.630385875701904, + "learning_rate": 8.159232072275551e-05, + "loss": 0.3238, "step": 22230 }, { "epoch": 4.185958968567665, - "grad_norm": 23.927745819091797, - "learning_rate": 4.884246188594015e-06, - "loss": 0.4063, + "grad_norm": 3.8577587604522705, + "learning_rate": 8.140410314323359e-05, + "loss": 0.4467, "step": 22240 }, { "epoch": 4.187841144362883, - "grad_norm": 7.700712203979492, - "learning_rate": 4.872953133822699e-06, - "loss": 0.4893, + "grad_norm": 2.0660860538482666, + "learning_rate": 8.121588556371165e-05, + "loss": 0.4921, "step": 22250 }, { "epoch": 4.189723320158103, - "grad_norm": 14.746334075927734, - "learning_rate": 4.861660079051384e-06, - "loss": 0.7473, + "grad_norm": 2.0969619750976562, + "learning_rate": 8.102766798418973e-05, + "loss": 0.8799, "step": 22260 }, { "epoch": 4.191605495953322, - "grad_norm": 0.5177063941955566, - "learning_rate": 4.850367024280068e-06, - "loss": 0.5172, + "grad_norm": 1.2200628519058228, + "learning_rate": 8.083945040466779e-05, + "loss": 0.5306, "step": 22270 }, { "epoch": 4.193487671748541, - "grad_norm": 5.81657075881958, - "learning_rate": 4.8390739695087524e-06, - "loss": 0.5498, + "grad_norm": 1.9948633909225464, + "learning_rate": 8.065123282514588e-05, + "loss": 0.6008, "step": 22280 }, { "epoch": 4.195369847543761, - "grad_norm": 0.22678916156291962, - "learning_rate": 4.827780914737436e-06, - "loss": 0.4705, + "grad_norm": 0.05267168954014778, + "learning_rate": 8.046301524562394e-05, + "loss": 0.6233, "step": 22290 }, { "epoch": 4.19725202333898, - "grad_norm": 18.123903274536133, - "learning_rate": 4.816487859966121e-06, - "loss": 0.4834, + "grad_norm": 4.359118461608887, + "learning_rate": 8.027479766610202e-05, + "loss": 0.5167, "step": 22300 }, { "epoch": 4.199134199134199, - "grad_norm": 22.923141479492188, - "learning_rate": 4.805194805194805e-06, - "loss": 0.6182, + "grad_norm": 2.881960391998291, + "learning_rate": 8.008658008658008e-05, + "loss": 0.6885, "step": 22310 }, { "epoch": 4.201016374929418, - "grad_norm": 30.303974151611328, - "learning_rate": 4.79390175042349e-06, - "loss": 0.844, + "grad_norm": 5.036227703094482, + "learning_rate": 7.989836250705816e-05, + "loss": 0.9313, "step": 22320 }, { "epoch": 4.202898550724638, - "grad_norm": 0.405670702457428, - "learning_rate": 4.7826086956521735e-06, - "loss": 0.5439, + "grad_norm": 0.4130703806877136, + "learning_rate": 7.971014492753622e-05, + "loss": 0.6305, "step": 22330 }, { "epoch": 4.204780726519857, - "grad_norm": 19.21993064880371, - "learning_rate": 4.771315640880859e-06, - "loss": 0.4976, + "grad_norm": 5.823428630828857, + "learning_rate": 7.952192734801431e-05, + "loss": 0.5402, "step": 22340 }, { "epoch": 4.206662902315077, - "grad_norm": 18.2979679107666, - "learning_rate": 4.760022586109543e-06, - "loss": 0.7726, + "grad_norm": 4.27875280380249, + "learning_rate": 7.933370976849239e-05, + "loss": 0.8322, "step": 22350 }, { "epoch": 4.208545078110295, - "grad_norm": 4.73086404800415, - "learning_rate": 4.748729531338227e-06, - "loss": 0.6009, + "grad_norm": 0.5691286325454712, + "learning_rate": 7.914549218897045e-05, + "loss": 0.5322, "step": 22360 }, { "epoch": 4.2104272539055145, - "grad_norm": 0.11267448216676712, - "learning_rate": 4.737436476566912e-06, - "loss": 0.4873, + "grad_norm": 0.19783131778240204, + "learning_rate": 7.895727460944853e-05, + "loss": 0.5621, "step": 22370 }, { "epoch": 4.212309429700734, - "grad_norm": 13.262358665466309, - "learning_rate": 4.726143421795595e-06, - "loss": 0.588, + "grad_norm": 4.439708709716797, + "learning_rate": 7.876905702992659e-05, + "loss": 0.7453, "step": 22380 }, { "epoch": 4.214191605495953, - "grad_norm": 5.213744163513184, - "learning_rate": 4.714850367024281e-06, - "loss": 0.4623, + "grad_norm": 1.6272809505462646, + "learning_rate": 7.858083945040468e-05, + "loss": 0.5086, "step": 22390 }, { "epoch": 4.216073781291173, - "grad_norm": 16.470924377441406, - "learning_rate": 4.703557312252964e-06, - "loss": 0.6373, + "grad_norm": 4.118422031402588, + "learning_rate": 7.839262187088274e-05, + "loss": 0.6641, "step": 22400 }, { "epoch": 4.217955957086392, - "grad_norm": 10.145905494689941, - "learning_rate": 4.692264257481649e-06, - "loss": 0.6058, + "grad_norm": 2.0501649379730225, + "learning_rate": 7.820440429136082e-05, + "loss": 0.5865, "step": 22410 }, { "epoch": 4.219838132881611, - "grad_norm": 1.2421616315841675, - "learning_rate": 4.680971202710333e-06, - "loss": 0.4736, + "grad_norm": 0.5741236209869385, + "learning_rate": 7.801618671183888e-05, + "loss": 0.5987, "step": 22420 }, { "epoch": 4.22172030867683, - "grad_norm": 19.461740493774414, - "learning_rate": 4.669678147939017e-06, - "loss": 0.4471, + "grad_norm": 1.6564340591430664, + "learning_rate": 7.782796913231696e-05, + "loss": 0.4888, "step": 22430 }, { "epoch": 4.22360248447205, - "grad_norm": 11.891735076904297, - "learning_rate": 4.658385093167702e-06, - "loss": 0.5111, + "grad_norm": 3.18048357963562, + "learning_rate": 7.763975155279502e-05, + "loss": 0.7065, "step": 22440 }, { "epoch": 4.225484660267269, - "grad_norm": 42.17231750488281, - "learning_rate": 4.647092038396386e-06, - "loss": 1.0541, + "grad_norm": 6.997757911682129, + "learning_rate": 7.745153397327311e-05, + "loss": 1.157, "step": 22450 }, { "epoch": 4.2273668360624885, - "grad_norm": 8.65245246887207, - "learning_rate": 4.635798983625071e-06, - "loss": 0.6871, + "grad_norm": 2.3890907764434814, + "learning_rate": 7.726331639375119e-05, + "loss": 0.8599, "step": 22460 }, { "epoch": 4.229249011857707, - "grad_norm": 0.8841702938079834, - "learning_rate": 4.624505928853755e-06, - "loss": 0.478, + "grad_norm": 1.6332005262374878, + "learning_rate": 7.707509881422925e-05, + "loss": 0.487, "step": 22470 }, { "epoch": 4.2311311876529265, - "grad_norm": 7.112485408782959, - "learning_rate": 4.61321287408244e-06, - "loss": 0.7968, + "grad_norm": 2.324662208557129, + "learning_rate": 7.688688123470733e-05, + "loss": 0.8454, "step": 22480 }, { "epoch": 4.233013363448146, - "grad_norm": 11.136316299438477, - "learning_rate": 4.601919819311124e-06, - "loss": 0.3291, + "grad_norm": 3.145379066467285, + "learning_rate": 7.669866365518539e-05, + "loss": 0.4188, "step": 22490 }, { "epoch": 4.234895539243365, - "grad_norm": 11.090184211730957, - "learning_rate": 4.590626764539808e-06, - "loss": 0.5061, + "grad_norm": 1.542700171470642, + "learning_rate": 7.651044607566348e-05, + "loss": 0.5681, "step": 22500 }, { "epoch": 4.236777715038585, - "grad_norm": 3.2487759590148926, - "learning_rate": 4.579333709768492e-06, - "loss": 0.4647, + "grad_norm": 4.345068454742432, + "learning_rate": 7.632222849614154e-05, + "loss": 0.5611, "step": 22510 }, { "epoch": 4.238659890833804, - "grad_norm": 30.191455841064453, - "learning_rate": 4.568040654997177e-06, - "loss": 0.3747, + "grad_norm": 3.63413667678833, + "learning_rate": 7.613401091661962e-05, + "loss": 0.5058, "step": 22520 }, { "epoch": 4.240542066629023, - "grad_norm": 12.943588256835938, - "learning_rate": 4.556747600225861e-06, - "loss": 0.5714, + "grad_norm": 2.347446918487549, + "learning_rate": 7.594579333709768e-05, + "loss": 0.72, "step": 22530 }, { "epoch": 4.242424242424242, - "grad_norm": 6.223907947540283, - "learning_rate": 4.5454545454545455e-06, - "loss": 0.37, + "grad_norm": 4.7115092277526855, + "learning_rate": 7.575757575757576e-05, + "loss": 0.4005, "step": 22540 }, { "epoch": 4.244306418219462, - "grad_norm": 23.05088996887207, - "learning_rate": 4.53416149068323e-06, - "loss": 0.456, + "grad_norm": 2.4576327800750732, + "learning_rate": 7.556935817805382e-05, + "loss": 0.5354, "step": 22550 }, { "epoch": 4.246188594014681, - "grad_norm": 21.883832931518555, - "learning_rate": 4.522868435911914e-06, - "loss": 0.3142, + "grad_norm": 4.690468788146973, + "learning_rate": 7.538114059853191e-05, + "loss": 0.406, "step": 22560 }, { "epoch": 4.2480707698099005, - "grad_norm": 10.83736515045166, - "learning_rate": 4.511575381140599e-06, - "loss": 0.5158, + "grad_norm": 2.983757495880127, + "learning_rate": 7.519292301900997e-05, + "loss": 0.6362, "step": 22570 }, { "epoch": 4.24995294560512, - "grad_norm": 2.516878366470337, - "learning_rate": 4.500282326369283e-06, - "loss": 0.623, + "grad_norm": 2.384744644165039, + "learning_rate": 7.500470543948805e-05, + "loss": 0.8287, "step": 22580 }, { "epoch": 4.2518351214003385, - "grad_norm": 0.16205434501171112, - "learning_rate": 4.4889892715979674e-06, - "loss": 0.5343, + "grad_norm": 3.082204580307007, + "learning_rate": 7.481648785996613e-05, + "loss": 0.4311, "step": 22590 }, { "epoch": 4.253717297195558, - "grad_norm": 8.219768524169922, - "learning_rate": 4.477696216826652e-06, - "loss": 0.612, + "grad_norm": 2.5956528186798096, + "learning_rate": 7.462827028044419e-05, + "loss": 0.6546, "step": 22600 }, { "epoch": 4.255599472990777, - "grad_norm": 13.677404403686523, - "learning_rate": 4.466403162055336e-06, - "loss": 0.6012, + "grad_norm": 5.551482200622559, + "learning_rate": 7.444005270092228e-05, + "loss": 0.7867, "step": 22610 }, { "epoch": 4.257481648785997, - "grad_norm": 10.702381134033203, - "learning_rate": 4.4551101072840206e-06, - "loss": 0.3911, + "grad_norm": 4.27431583404541, + "learning_rate": 7.425183512140034e-05, + "loss": 0.657, "step": 22620 }, { "epoch": 4.259363824581216, - "grad_norm": 0.8724668622016907, - "learning_rate": 4.443817052512705e-06, - "loss": 0.7228, + "grad_norm": 1.6535813808441162, + "learning_rate": 7.406361754187842e-05, + "loss": 0.6216, "step": 22630 }, { "epoch": 4.261246000376435, - "grad_norm": 40.3377799987793, - "learning_rate": 4.432523997741389e-06, - "loss": 0.7456, + "grad_norm": 3.505030870437622, + "learning_rate": 7.387539996235648e-05, + "loss": 0.709, "step": 22640 }, { "epoch": 4.263128176171654, - "grad_norm": 15.633112907409668, - "learning_rate": 4.421230942970074e-06, - "loss": 0.5719, + "grad_norm": 2.203139543533325, + "learning_rate": 7.368718238283456e-05, + "loss": 0.6866, "step": 22650 }, { "epoch": 4.265010351966874, - "grad_norm": 12.120109558105469, - "learning_rate": 4.409937888198757e-06, - "loss": 0.6745, + "grad_norm": 2.7182722091674805, + "learning_rate": 7.349896480331262e-05, + "loss": 0.8602, "step": 22660 }, { "epoch": 4.266892527762093, - "grad_norm": 17.709228515625, - "learning_rate": 4.3986448334274425e-06, - "loss": 0.5516, + "grad_norm": 1.7698827981948853, + "learning_rate": 7.331074722379071e-05, + "loss": 0.707, "step": 22670 }, { "epoch": 4.2687747035573125, - "grad_norm": 8.81289005279541, - "learning_rate": 4.387351778656126e-06, - "loss": 0.4194, + "grad_norm": 2.303340196609497, + "learning_rate": 7.312252964426877e-05, + "loss": 0.4766, "step": 22680 }, { "epoch": 4.270656879352532, - "grad_norm": 14.428284645080566, - "learning_rate": 4.376058723884811e-06, - "loss": 0.4524, + "grad_norm": 2.723996639251709, + "learning_rate": 7.293431206474685e-05, + "loss": 0.5091, "step": 22690 }, { "epoch": 4.2725390551477505, - "grad_norm": 15.037586212158203, - "learning_rate": 4.364765669113496e-06, - "loss": 0.6796, + "grad_norm": 2.801182746887207, + "learning_rate": 7.274609448522493e-05, + "loss": 0.7886, "step": 22700 }, { "epoch": 4.27442123094297, - "grad_norm": 7.700161933898926, - "learning_rate": 4.35347261434218e-06, - "loss": 0.4788, + "grad_norm": 1.358647346496582, + "learning_rate": 7.255787690570299e-05, + "loss": 0.5111, "step": 22710 }, { "epoch": 4.276303406738189, - "grad_norm": 3.1909353733062744, - "learning_rate": 4.342179559570864e-06, - "loss": 0.4486, + "grad_norm": 1.516494870185852, + "learning_rate": 7.236965932618108e-05, + "loss": 0.6179, "step": 22720 }, { "epoch": 4.278185582533409, - "grad_norm": 17.010690689086914, - "learning_rate": 4.330886504799548e-06, - "loss": 0.6104, + "grad_norm": 2.8411619663238525, + "learning_rate": 7.218144174665914e-05, + "loss": 0.5986, "step": 22730 }, { "epoch": 4.280067758328628, - "grad_norm": 8.390015602111816, - "learning_rate": 4.319593450028233e-06, - "loss": 0.5245, + "grad_norm": 1.708974838256836, + "learning_rate": 7.199322416713722e-05, + "loss": 0.6238, "step": 22740 }, { "epoch": 4.281949934123848, - "grad_norm": 14.018758773803711, - "learning_rate": 4.308300395256917e-06, - "loss": 0.4835, + "grad_norm": 3.111238718032837, + "learning_rate": 7.180500658761528e-05, + "loss": 0.4499, "step": 22750 }, { "epoch": 4.283832109919066, - "grad_norm": 0.4540165662765503, - "learning_rate": 4.297007340485602e-06, - "loss": 0.5203, + "grad_norm": 0.690464198589325, + "learning_rate": 7.161678900809336e-05, + "loss": 0.4686, "step": 22760 }, { "epoch": 4.285714285714286, - "grad_norm": 2.0470516681671143, - "learning_rate": 4.2857142857142855e-06, - "loss": 0.4774, + "grad_norm": 0.1224987581372261, + "learning_rate": 7.142857142857142e-05, + "loss": 0.3938, "step": 22770 }, { "epoch": 4.287596461509505, - "grad_norm": 9.436823844909668, - "learning_rate": 4.274421230942971e-06, - "loss": 0.3557, + "grad_norm": 3.601773738861084, + "learning_rate": 7.124035384904951e-05, + "loss": 0.4035, "step": 22780 }, { "epoch": 4.2894786373047245, - "grad_norm": 14.441892623901367, - "learning_rate": 4.263128176171654e-06, - "loss": 0.4613, + "grad_norm": 2.2314205169677734, + "learning_rate": 7.105213626952757e-05, + "loss": 0.5302, "step": 22790 }, { "epoch": 4.291360813099944, - "grad_norm": 21.169797897338867, - "learning_rate": 4.251835121400339e-06, - "loss": 0.4245, + "grad_norm": 1.4461051225662231, + "learning_rate": 7.086391869000565e-05, + "loss": 0.3793, "step": 22800 }, { "epoch": 4.293242988895162, - "grad_norm": 5.513450622558594, - "learning_rate": 4.240542066629023e-06, - "loss": 0.5082, + "grad_norm": 1.465973138809204, + "learning_rate": 7.067570111048371e-05, + "loss": 0.6275, "step": 22810 }, { "epoch": 4.295125164690382, - "grad_norm": 9.460198402404785, - "learning_rate": 4.229249011857707e-06, - "loss": 0.6396, + "grad_norm": 4.16806173324585, + "learning_rate": 7.048748353096179e-05, + "loss": 0.8245, "step": 22820 }, { "epoch": 4.297007340485601, - "grad_norm": 16.516008377075195, - "learning_rate": 4.217955957086393e-06, - "loss": 0.4007, + "grad_norm": 1.0273966789245605, + "learning_rate": 7.029926595143988e-05, + "loss": 0.4617, "step": 22830 }, { "epoch": 4.298889516280821, - "grad_norm": 12.221315383911133, - "learning_rate": 4.206662902315076e-06, - "loss": 0.541, + "grad_norm": 1.136577844619751, + "learning_rate": 7.011104837191794e-05, + "loss": 0.4239, "step": 22840 }, { "epoch": 4.30077169207604, - "grad_norm": 24.764720916748047, - "learning_rate": 4.195369847543761e-06, - "loss": 0.6108, + "grad_norm": 3.0240557193756104, + "learning_rate": 6.992283079239602e-05, + "loss": 0.5731, "step": 22850 }, { "epoch": 4.30265386787126, - "grad_norm": 16.752914428710938, - "learning_rate": 4.184076792772445e-06, - "loss": 0.6381, + "grad_norm": 2.209796190261841, + "learning_rate": 6.973461321287408e-05, + "loss": 0.7176, "step": 22860 }, { "epoch": 4.304536043666478, - "grad_norm": 30.31788444519043, - "learning_rate": 4.172783738001129e-06, - "loss": 0.5743, + "grad_norm": 4.721385478973389, + "learning_rate": 6.954639563335216e-05, + "loss": 0.7321, "step": 22870 }, { "epoch": 4.306418219461698, - "grad_norm": 14.529326438903809, - "learning_rate": 4.161490683229814e-06, - "loss": 0.4165, + "grad_norm": 5.0673747062683105, + "learning_rate": 6.935817805383022e-05, + "loss": 0.4734, "step": 22880 }, { "epoch": 4.308300395256917, - "grad_norm": 5.0346808433532715, - "learning_rate": 4.150197628458498e-06, - "loss": 0.6804, + "grad_norm": 2.6815907955169678, + "learning_rate": 6.916996047430831e-05, + "loss": 0.7312, "step": 22890 }, { "epoch": 4.310182571052136, - "grad_norm": 12.99887466430664, - "learning_rate": 4.1389045736871824e-06, - "loss": 0.3671, + "grad_norm": 3.661799669265747, + "learning_rate": 6.898174289478637e-05, + "loss": 0.4466, "step": 22900 }, { "epoch": 4.312064746847356, - "grad_norm": 26.515596389770508, - "learning_rate": 4.127611518915867e-06, - "loss": 0.479, + "grad_norm": 4.6033935546875, + "learning_rate": 6.879352531526445e-05, + "loss": 0.5665, "step": 22910 }, { "epoch": 4.313946922642575, - "grad_norm": 21.167415618896484, - "learning_rate": 4.116318464144551e-06, - "loss": 0.5802, + "grad_norm": 1.0957890748977661, + "learning_rate": 6.860530773574251e-05, + "loss": 0.5804, "step": 22920 }, { "epoch": 4.315829098437794, - "grad_norm": 16.790386199951172, - "learning_rate": 4.1050254093732356e-06, - "loss": 0.6888, + "grad_norm": 2.1576459407806396, + "learning_rate": 6.841709015622059e-05, + "loss": 0.6856, "step": 22930 }, { "epoch": 4.317711274233013, - "grad_norm": 10.245500564575195, - "learning_rate": 4.093732354601919e-06, - "loss": 0.7874, + "grad_norm": 1.8269578218460083, + "learning_rate": 6.822887257669865e-05, + "loss": 0.6718, "step": 22940 }, { "epoch": 4.319593450028233, - "grad_norm": 21.216289520263672, - "learning_rate": 4.082439299830604e-06, - "loss": 0.3578, + "grad_norm": 1.6212677955627441, + "learning_rate": 6.804065499717674e-05, + "loss": 0.5032, "step": 22950 }, { "epoch": 4.321475625823452, - "grad_norm": 18.367511749267578, - "learning_rate": 4.071146245059289e-06, - "loss": 0.6691, + "grad_norm": 3.279130697250366, + "learning_rate": 6.785243741765482e-05, + "loss": 0.743, "step": 22960 }, { "epoch": 4.323357801618672, - "grad_norm": 24.98716163635254, - "learning_rate": 4.059853190287973e-06, - "loss": 0.5337, + "grad_norm": 0.7260986566543579, + "learning_rate": 6.766421983813288e-05, + "loss": 0.5386, "step": 22970 }, { "epoch": 4.32523997741389, - "grad_norm": 0.125027135014534, - "learning_rate": 4.0485601355166575e-06, - "loss": 0.6233, + "grad_norm": 0.01941692642867565, + "learning_rate": 6.747600225861096e-05, + "loss": 0.4597, "step": 22980 }, { "epoch": 4.3271221532091095, - "grad_norm": 19.9777889251709, - "learning_rate": 4.037267080745342e-06, - "loss": 0.5926, + "grad_norm": 4.9999847412109375, + "learning_rate": 6.728778467908902e-05, + "loss": 0.6838, "step": 22990 }, { "epoch": 4.329004329004329, - "grad_norm": 20.854936599731445, - "learning_rate": 4.025974025974026e-06, - "loss": 0.5644, + "grad_norm": 2.1625707149505615, + "learning_rate": 6.709956709956711e-05, + "loss": 0.6977, "step": 23000 }, { "epoch": 4.330886504799548, - "grad_norm": 7.7878546714782715, - "learning_rate": 4.014680971202711e-06, - "loss": 0.5073, + "grad_norm": 1.335459589958191, + "learning_rate": 6.691134952004517e-05, + "loss": 0.5518, "step": 23010 }, { "epoch": 4.332768680594768, - "grad_norm": 7.36667013168335, - "learning_rate": 4.003387916431395e-06, - "loss": 0.1648, + "grad_norm": 2.090069055557251, + "learning_rate": 6.672313194052325e-05, + "loss": 0.2411, "step": 23020 }, { "epoch": 4.334650856389986, - "grad_norm": 9.753989219665527, - "learning_rate": 3.9920948616600785e-06, - "loss": 0.4856, + "grad_norm": 3.832279682159424, + "learning_rate": 6.653491436100131e-05, + "loss": 0.5272, "step": 23030 }, { "epoch": 4.336533032185206, - "grad_norm": 14.510024070739746, - "learning_rate": 3.980801806888764e-06, - "loss": 0.4652, + "grad_norm": 5.671814918518066, + "learning_rate": 6.634669678147939e-05, + "loss": 0.469, "step": 23040 }, { "epoch": 4.338415207980425, - "grad_norm": 15.549747467041016, - "learning_rate": 3.969508752117447e-06, - "loss": 0.4127, + "grad_norm": 2.7710330486297607, + "learning_rate": 6.615847920195745e-05, + "loss": 0.4091, "step": 23050 }, { "epoch": 4.340297383775645, - "grad_norm": 8.210662841796875, - "learning_rate": 3.9582156973461325e-06, - "loss": 0.5347, + "grad_norm": 2.5829217433929443, + "learning_rate": 6.597026162243554e-05, + "loss": 0.4833, "step": 23060 }, { "epoch": 4.342179559570864, - "grad_norm": 7.637692451477051, - "learning_rate": 3.946922642574817e-06, - "loss": 0.5743, + "grad_norm": 1.8346978425979614, + "learning_rate": 6.578204404291362e-05, + "loss": 0.5997, "step": 23070 }, { "epoch": 4.3440617353660835, - "grad_norm": 0.12690700590610504, - "learning_rate": 3.935629587803501e-06, - "loss": 0.6929, + "grad_norm": 0.10548819601535797, + "learning_rate": 6.559382646339168e-05, + "loss": 0.6695, "step": 23080 }, { "epoch": 4.345943911161302, - "grad_norm": 4.598480701446533, - "learning_rate": 3.924336533032186e-06, - "loss": 0.3635, + "grad_norm": 0.4382118582725525, + "learning_rate": 6.540560888386976e-05, + "loss": 0.3085, "step": 23090 }, { "epoch": 4.3478260869565215, - "grad_norm": 2.793907403945923, - "learning_rate": 3.913043478260869e-06, - "loss": 0.5651, + "grad_norm": 0.8366402387619019, + "learning_rate": 6.521739130434782e-05, + "loss": 0.55, "step": 23100 }, { "epoch": 4.349708262751741, - "grad_norm": 32.544918060302734, - "learning_rate": 3.9017504234895545e-06, - "loss": 0.5491, + "grad_norm": 6.871497631072998, + "learning_rate": 6.502917372482591e-05, + "loss": 0.5395, "step": 23110 }, { "epoch": 4.35159043854696, - "grad_norm": 15.509510040283203, - "learning_rate": 3.890457368718238e-06, - "loss": 1.0166, + "grad_norm": 5.250075340270996, + "learning_rate": 6.484095614530397e-05, + "loss": 0.9494, "step": 23120 }, { "epoch": 4.35347261434218, - "grad_norm": 3.5904457569122314, - "learning_rate": 3.879164313946923e-06, - "loss": 0.6177, + "grad_norm": 0.5241207480430603, + "learning_rate": 6.465273856578205e-05, + "loss": 0.6133, "step": 23130 }, { "epoch": 4.355354790137399, - "grad_norm": 7.414836406707764, - "learning_rate": 3.867871259175607e-06, - "loss": 0.4744, + "grad_norm": 2.0515754222869873, + "learning_rate": 6.446452098626011e-05, + "loss": 0.4578, "step": 23140 }, { "epoch": 4.357236965932618, - "grad_norm": 6.72654390335083, - "learning_rate": 3.856578204404292e-06, - "loss": 0.4433, + "grad_norm": 2.292283535003662, + "learning_rate": 6.427630340673819e-05, + "loss": 0.5271, "step": 23150 }, { "epoch": 4.359119141727837, - "grad_norm": 1.880739450454712, - "learning_rate": 3.8452851496329755e-06, - "loss": 0.4546, + "grad_norm": 1.606432557106018, + "learning_rate": 6.408808582721625e-05, + "loss": 0.5568, "step": 23160 }, { "epoch": 4.361001317523057, - "grad_norm": 15.60661792755127, - "learning_rate": 3.83399209486166e-06, - "loss": 0.6959, + "grad_norm": 5.097689151763916, + "learning_rate": 6.389986824769434e-05, + "loss": 0.6891, "step": 23170 }, { "epoch": 4.362883493318276, - "grad_norm": 0.70307856798172, - "learning_rate": 3.822699040090344e-06, - "loss": 0.5917, + "grad_norm": 0.15392223000526428, + "learning_rate": 6.37116506681724e-05, + "loss": 0.6539, "step": 23180 }, { "epoch": 4.3647656691134955, - "grad_norm": 16.895275115966797, - "learning_rate": 3.811405985319029e-06, - "loss": 0.6483, + "grad_norm": 7.145848274230957, + "learning_rate": 6.352343308865048e-05, + "loss": 0.612, "step": 23190 }, { "epoch": 4.366647844908714, - "grad_norm": 14.281275749206543, - "learning_rate": 3.8001129305477135e-06, - "loss": 0.4527, + "grad_norm": 2.0338876247406006, + "learning_rate": 6.333521550912856e-05, + "loss": 0.4728, "step": 23200 }, { "epoch": 4.3685300207039335, - "grad_norm": 16.391109466552734, - "learning_rate": 3.7888198757763974e-06, - "loss": 0.8592, + "grad_norm": 4.034317493438721, + "learning_rate": 6.314699792960662e-05, + "loss": 0.855, "step": 23210 }, { "epoch": 4.370412196499153, - "grad_norm": 10.493399620056152, - "learning_rate": 3.7775268210050822e-06, - "loss": 0.8362, + "grad_norm": 2.8626930713653564, + "learning_rate": 6.295878035008471e-05, + "loss": 0.8229, "step": 23220 }, { "epoch": 4.372294372294372, - "grad_norm": 4.833158493041992, - "learning_rate": 3.766233766233766e-06, - "loss": 0.4836, + "grad_norm": 1.998652696609497, + "learning_rate": 6.277056277056277e-05, + "loss": 0.521, "step": 23230 }, { "epoch": 4.374176548089592, - "grad_norm": 19.595304489135742, - "learning_rate": 3.754940711462451e-06, - "loss": 0.3935, + "grad_norm": 1.864508867263794, + "learning_rate": 6.258234519104085e-05, + "loss": 0.4708, "step": 23240 }, { "epoch": 4.376058723884811, - "grad_norm": 9.765375137329102, - "learning_rate": 3.743647656691135e-06, - "loss": 0.642, + "grad_norm": 2.864326000213623, + "learning_rate": 6.239412761151891e-05, + "loss": 0.6477, "step": 23250 }, { "epoch": 4.37794089968003, - "grad_norm": 33.9497184753418, - "learning_rate": 3.7323546019198198e-06, - "loss": 0.5137, + "grad_norm": 4.261322498321533, + "learning_rate": 6.220591003199699e-05, + "loss": 0.5401, "step": 23260 }, { "epoch": 4.379823075475249, - "grad_norm": 13.267662048339844, - "learning_rate": 3.7210615471485037e-06, - "loss": 0.3194, + "grad_norm": 4.250820636749268, + "learning_rate": 6.201769245247507e-05, + "loss": 0.3505, "step": 23270 }, { "epoch": 4.381705251270469, - "grad_norm": 25.924053192138672, - "learning_rate": 3.709768492377188e-06, - "loss": 0.3042, + "grad_norm": 3.9857656955718994, + "learning_rate": 6.182947487295314e-05, + "loss": 0.3791, "step": 23280 }, { "epoch": 4.383587427065688, - "grad_norm": 0.388808935880661, - "learning_rate": 3.6984754376058725e-06, - "loss": 0.5223, + "grad_norm": 0.5790356397628784, + "learning_rate": 6.16412572934312e-05, + "loss": 0.523, "step": 23290 }, { "epoch": 4.3854696028609075, - "grad_norm": 28.93108558654785, - "learning_rate": 3.687182382834557e-06, - "loss": 0.6084, + "grad_norm": 4.269297122955322, + "learning_rate": 6.145303971390928e-05, + "loss": 0.6019, "step": 23300 }, { "epoch": 4.387351778656127, - "grad_norm": 14.811662673950195, - "learning_rate": 3.6758893280632412e-06, - "loss": 0.522, + "grad_norm": 4.494009971618652, + "learning_rate": 6.126482213438736e-05, + "loss": 0.7931, "step": 23310 }, { "epoch": 4.3892339544513455, - "grad_norm": 11.806975364685059, - "learning_rate": 3.6645962732919256e-06, - "loss": 0.4666, + "grad_norm": 2.184983015060425, + "learning_rate": 6.107660455486542e-05, + "loss": 0.6101, "step": 23320 }, { "epoch": 4.391116130246565, - "grad_norm": 76.58401489257812, - "learning_rate": 3.65330321852061e-06, - "loss": 0.4884, + "grad_norm": 2.364179849624634, + "learning_rate": 6.08883869753435e-05, + "loss": 0.4942, "step": 23330 }, { "epoch": 4.392998306041784, - "grad_norm": 0.8018186688423157, - "learning_rate": 3.6420101637492944e-06, - "loss": 0.5054, + "grad_norm": 0.16114617884159088, + "learning_rate": 6.0700169395821566e-05, + "loss": 0.526, "step": 23340 }, { "epoch": 4.394880481837004, - "grad_norm": 6.801021099090576, - "learning_rate": 3.6307171089779784e-06, - "loss": 0.339, + "grad_norm": 2.0968990325927734, + "learning_rate": 6.051195181629964e-05, + "loss": 0.2826, "step": 23350 }, { "epoch": 4.396762657632223, - "grad_norm": 12.988116264343262, - "learning_rate": 3.6194240542066627e-06, - "loss": 0.2921, + "grad_norm": 2.517059087753296, + "learning_rate": 6.032373423677771e-05, + "loss": 0.5407, "step": 23360 }, { "epoch": 4.398644833427442, - "grad_norm": 19.021095275878906, - "learning_rate": 3.608130999435347e-06, - "loss": 0.4502, + "grad_norm": 2.5663349628448486, + "learning_rate": 6.013551665725579e-05, + "loss": 0.4843, "step": 23370 }, { "epoch": 4.400527009222661, - "grad_norm": 8.15141487121582, - "learning_rate": 3.596837944664032e-06, - "loss": 0.6093, + "grad_norm": 1.0666170120239258, + "learning_rate": 5.9947299077733865e-05, + "loss": 0.6544, "step": 23380 }, { "epoch": 4.402409185017881, - "grad_norm": 0.6214210987091064, - "learning_rate": 3.5855448898927163e-06, - "loss": 0.6574, + "grad_norm": 0.42081281542778015, + "learning_rate": 5.9759081498211935e-05, + "loss": 0.7458, "step": 23390 }, { "epoch": 4.4042913608131, - "grad_norm": 14.339752197265625, - "learning_rate": 3.5742518351214007e-06, - "loss": 0.4536, + "grad_norm": 2.0457112789154053, + "learning_rate": 5.957086391869001e-05, + "loss": 0.4973, "step": 23400 }, { "epoch": 4.4061735366083195, - "grad_norm": 16.84188461303711, - "learning_rate": 3.562958780350085e-06, - "loss": 0.2564, + "grad_norm": 5.91927433013916, + "learning_rate": 5.938264633916808e-05, + "loss": 0.3556, "step": 23410 }, { "epoch": 4.408055712403539, - "grad_norm": 18.416526794433594, - "learning_rate": 3.551665725578769e-06, - "loss": 0.4888, + "grad_norm": 2.4589664936065674, + "learning_rate": 5.919442875964615e-05, + "loss": 0.5741, "step": 23420 }, { "epoch": 4.409937888198757, - "grad_norm": 0.2957004904747009, - "learning_rate": 3.5403726708074534e-06, - "loss": 0.4994, + "grad_norm": 0.18644341826438904, + "learning_rate": 5.900621118012423e-05, + "loss": 0.5359, "step": 23430 }, { "epoch": 4.411820063993977, - "grad_norm": 14.493985176086426, - "learning_rate": 3.529079616036138e-06, - "loss": 0.5243, + "grad_norm": 3.3280177116394043, + "learning_rate": 5.88179936006023e-05, + "loss": 0.5731, "step": 23440 }, { "epoch": 4.413702239789196, - "grad_norm": 22.472604751586914, - "learning_rate": 3.517786561264822e-06, - "loss": 0.8668, + "grad_norm": 6.775310039520264, + "learning_rate": 5.8629776021080366e-05, + "loss": 0.8888, "step": 23450 }, { "epoch": 4.415584415584416, - "grad_norm": 1.9218357801437378, - "learning_rate": 3.5064935064935066e-06, - "loss": 0.55, + "grad_norm": 1.3114447593688965, + "learning_rate": 5.844155844155844e-05, + "loss": 0.6345, "step": 23460 }, { "epoch": 4.417466591379635, - "grad_norm": 15.064146995544434, - "learning_rate": 3.495200451722191e-06, - "loss": 0.6122, + "grad_norm": 2.811979055404663, + "learning_rate": 5.825334086203651e-05, + "loss": 0.5995, "step": 23470 }, { "epoch": 4.419348767174855, - "grad_norm": 29.05467987060547, - "learning_rate": 3.4839073969508753e-06, - "loss": 0.2962, + "grad_norm": 0.5569702982902527, + "learning_rate": 5.806512328251459e-05, + "loss": 0.394, "step": 23480 }, { "epoch": 4.421230942970073, - "grad_norm": 10.816146850585938, - "learning_rate": 3.4726143421795593e-06, - "loss": 0.4214, + "grad_norm": 2.1078720092773438, + "learning_rate": 5.787690570299266e-05, + "loss": 0.5194, "step": 23490 }, { "epoch": 4.423113118765293, - "grad_norm": 8.355408668518066, - "learning_rate": 3.461321287408244e-06, - "loss": 0.5314, + "grad_norm": 1.9005637168884277, + "learning_rate": 5.7688688123470735e-05, + "loss": 0.5483, "step": 23500 }, { "epoch": 4.424995294560512, - "grad_norm": 13.588001251220703, - "learning_rate": 3.4500282326369285e-06, - "loss": 0.8294, + "grad_norm": 3.41694712638855, + "learning_rate": 5.750047054394881e-05, + "loss": 0.7021, "step": 23510 }, { "epoch": 4.426877470355731, - "grad_norm": 20.003461837768555, - "learning_rate": 3.438735177865613e-06, - "loss": 0.7186, + "grad_norm": 4.113175868988037, + "learning_rate": 5.731225296442688e-05, + "loss": 0.7608, "step": 23520 }, { "epoch": 4.428759646150951, - "grad_norm": 5.769153594970703, - "learning_rate": 3.4274421230942972e-06, - "loss": 0.3721, + "grad_norm": 1.1039425134658813, + "learning_rate": 5.712403538490495e-05, + "loss": 0.4509, "step": 23530 }, { "epoch": 4.430641821946169, - "grad_norm": 5.754838466644287, - "learning_rate": 3.4161490683229816e-06, - "loss": 0.5478, + "grad_norm": 1.165903091430664, + "learning_rate": 5.693581780538303e-05, + "loss": 0.5474, "step": 23540 }, { "epoch": 4.432523997741389, - "grad_norm": 0.23657679557800293, - "learning_rate": 3.404856013551666e-06, - "loss": 0.3118, + "grad_norm": 5.634236812591553, + "learning_rate": 5.67476002258611e-05, + "loss": 0.3828, "step": 23550 }, { "epoch": 4.434406173536608, - "grad_norm": 15.234368324279785, - "learning_rate": 3.3935629587803504e-06, - "loss": 0.4746, + "grad_norm": 2.44069242477417, + "learning_rate": 5.6559382646339166e-05, + "loss": 0.5159, "step": 23560 }, { "epoch": 4.436288349331828, - "grad_norm": 2.7576918601989746, - "learning_rate": 3.3822699040090343e-06, - "loss": 0.6348, + "grad_norm": 1.1310418844223022, + "learning_rate": 5.637116506681724e-05, + "loss": 0.7407, "step": 23570 }, { "epoch": 4.438170525127047, - "grad_norm": 20.79529571533203, - "learning_rate": 3.3709768492377187e-06, - "loss": 0.4821, + "grad_norm": 3.2341320514678955, + "learning_rate": 5.618294748729531e-05, + "loss": 0.5031, "step": 23580 }, { "epoch": 4.440052700922266, - "grad_norm": 15.99216365814209, - "learning_rate": 3.359683794466403e-06, - "loss": 0.4527, + "grad_norm": 2.2330312728881836, + "learning_rate": 5.599472990777338e-05, + "loss": 0.6094, "step": 23590 }, { "epoch": 4.441934876717485, - "grad_norm": 11.632766723632812, - "learning_rate": 3.3483907396950875e-06, - "loss": 0.4984, + "grad_norm": 4.402441024780273, + "learning_rate": 5.580651232825146e-05, + "loss": 0.6228, "step": 23600 }, { "epoch": 4.4438170525127045, - "grad_norm": 13.765467643737793, - "learning_rate": 3.337097684923772e-06, - "loss": 0.6588, + "grad_norm": 3.7783493995666504, + "learning_rate": 5.561829474872953e-05, + "loss": 0.6626, "step": 23610 }, { "epoch": 4.445699228307924, - "grad_norm": 2.9188790321350098, - "learning_rate": 3.3258046301524562e-06, - "loss": 0.5091, + "grad_norm": 3.7687785625457764, + "learning_rate": 5.5430077169207605e-05, + "loss": 0.542, "step": 23620 }, { "epoch": 4.447581404103143, - "grad_norm": 33.282928466796875, - "learning_rate": 3.314511575381141e-06, - "loss": 0.665, + "grad_norm": 5.35054349899292, + "learning_rate": 5.524185958968568e-05, + "loss": 0.749, "step": 23630 }, { "epoch": 4.449463579898363, - "grad_norm": 0.6511355042457581, - "learning_rate": 3.303218520609825e-06, - "loss": 0.5527, + "grad_norm": 2.302736520767212, + "learning_rate": 5.505364201016375e-05, + "loss": 0.6489, "step": 23640 }, { "epoch": 4.451345755693581, - "grad_norm": 12.938034057617188, - "learning_rate": 3.2919254658385094e-06, - "loss": 0.5798, + "grad_norm": 2.1349406242370605, + "learning_rate": 5.486542443064183e-05, + "loss": 0.8253, "step": 23650 }, { "epoch": 4.453227931488801, - "grad_norm": 19.567174911499023, - "learning_rate": 3.2806324110671938e-06, - "loss": 0.624, + "grad_norm": 4.4704132080078125, + "learning_rate": 5.46772068511199e-05, + "loss": 0.5917, "step": 23660 }, { "epoch": 4.45511010728402, - "grad_norm": 0.04238367825746536, - "learning_rate": 3.269339356295878e-06, - "loss": 0.2131, + "grad_norm": 0.0549953430891037, + "learning_rate": 5.4488989271597966e-05, + "loss": 0.3432, "step": 23670 }, { "epoch": 4.45699228307924, - "grad_norm": 14.076987266540527, - "learning_rate": 3.2580463015245625e-06, - "loss": 0.6618, + "grad_norm": 2.6983518600463867, + "learning_rate": 5.430077169207604e-05, + "loss": 0.6768, "step": 23680 }, { "epoch": 4.458874458874459, - "grad_norm": 7.118105888366699, - "learning_rate": 3.246753246753247e-06, - "loss": 0.4138, + "grad_norm": 2.198878049850464, + "learning_rate": 5.411255411255411e-05, + "loss": 0.3771, "step": 23690 }, { "epoch": 4.4607566346696785, - "grad_norm": 13.331232070922852, - "learning_rate": 3.2354601919819313e-06, - "loss": 0.3546, + "grad_norm": 3.5589592456817627, + "learning_rate": 5.392433653303218e-05, + "loss": 0.3683, "step": 23700 }, { "epoch": 4.462638810464897, - "grad_norm": 19.263700485229492, - "learning_rate": 3.2241671372106153e-06, - "loss": 0.4596, + "grad_norm": 4.189352512359619, + "learning_rate": 5.373611895351026e-05, + "loss": 0.4458, "step": 23710 }, { "epoch": 4.4645209862601165, - "grad_norm": 23.13578987121582, - "learning_rate": 3.2128740824392996e-06, - "loss": 0.6003, + "grad_norm": 6.152927875518799, + "learning_rate": 5.354790137398833e-05, + "loss": 0.6845, "step": 23720 }, { "epoch": 4.466403162055336, - "grad_norm": 2.132723331451416, - "learning_rate": 3.201581027667984e-06, - "loss": 0.6584, + "grad_norm": 0.19540274143218994, + "learning_rate": 5.3359683794466405e-05, + "loss": 0.6013, "step": 23730 }, { "epoch": 4.468285337850555, - "grad_norm": 10.117582321166992, - "learning_rate": 3.1902879728966684e-06, - "loss": 0.3707, + "grad_norm": 1.5613901615142822, + "learning_rate": 5.3171466214944474e-05, + "loss": 0.4262, "step": 23740 }, { "epoch": 4.470167513645775, - "grad_norm": 1.6187089681625366, - "learning_rate": 3.1789949181253532e-06, - "loss": 0.3961, + "grad_norm": 0.4342918395996094, + "learning_rate": 5.298324863542255e-05, + "loss": 0.4913, "step": 23750 }, { "epoch": 4.472049689440993, - "grad_norm": 8.55710506439209, - "learning_rate": 3.1677018633540376e-06, - "loss": 0.3721, + "grad_norm": 1.4780311584472656, + "learning_rate": 5.279503105590063e-05, + "loss": 0.2758, "step": 23760 }, { "epoch": 4.473931865236213, - "grad_norm": 11.327536582946777, - "learning_rate": 3.156408808582722e-06, - "loss": 0.6747, + "grad_norm": 1.1511306762695312, + "learning_rate": 5.26068134763787e-05, + "loss": 0.5309, "step": 23770 }, { "epoch": 4.475814041031432, - "grad_norm": 7.744390487670898, - "learning_rate": 3.1451157538114064e-06, - "loss": 0.6423, + "grad_norm": 3.953789710998535, + "learning_rate": 5.2418595896856766e-05, + "loss": 0.6147, "step": 23780 }, { "epoch": 4.477696216826652, - "grad_norm": 1.3057587146759033, - "learning_rate": 3.1338226990400903e-06, - "loss": 0.2579, + "grad_norm": 0.7184831500053406, + "learning_rate": 5.223037831733484e-05, + "loss": 0.34, "step": 23790 }, { "epoch": 4.479578392621871, - "grad_norm": 10.926691055297852, - "learning_rate": 3.1225296442687747e-06, - "loss": 0.5395, + "grad_norm": 0.6724153757095337, + "learning_rate": 5.204216073781291e-05, + "loss": 0.5609, "step": 23800 }, { "epoch": 4.4814605684170905, - "grad_norm": 13.025545120239258, - "learning_rate": 3.111236589497459e-06, - "loss": 0.5213, + "grad_norm": 2.403193950653076, + "learning_rate": 5.185394315829098e-05, + "loss": 0.6468, "step": 23810 }, { "epoch": 4.483342744212309, - "grad_norm": 8.658554077148438, - "learning_rate": 3.0999435347261435e-06, - "loss": 0.522, + "grad_norm": 2.3661391735076904, + "learning_rate": 5.166572557876906e-05, + "loss": 0.5268, "step": 23820 }, { "epoch": 4.4852249200075285, - "grad_norm": 0.7279936671257019, - "learning_rate": 3.088650479954828e-06, - "loss": 0.3601, + "grad_norm": 3.755697250366211, + "learning_rate": 5.147750799924713e-05, + "loss": 0.5165, "step": 23830 }, { "epoch": 4.487107095802748, - "grad_norm": 11.042409896850586, - "learning_rate": 3.0773574251835122e-06, - "loss": 0.5312, + "grad_norm": 2.821866512298584, + "learning_rate": 5.1289290419725205e-05, + "loss": 0.6264, "step": 23840 }, { "epoch": 4.488989271597967, - "grad_norm": 2.8644516468048096, - "learning_rate": 3.0660643704121966e-06, - "loss": 0.5416, + "grad_norm": 1.1792480945587158, + "learning_rate": 5.1101072840203274e-05, + "loss": 0.5714, "step": 23850 }, { "epoch": 4.490871447393187, - "grad_norm": 16.289228439331055, - "learning_rate": 3.0547713156408806e-06, - "loss": 0.37, + "grad_norm": 3.8462843894958496, + "learning_rate": 5.0912855260681344e-05, + "loss": 0.3337, "step": 23860 }, { "epoch": 4.492753623188406, - "grad_norm": 25.66763687133789, - "learning_rate": 3.0434782608695654e-06, - "loss": 0.9307, + "grad_norm": 4.147275924682617, + "learning_rate": 5.072463768115943e-05, + "loss": 1.012, "step": 23870 }, { "epoch": 4.494635798983625, - "grad_norm": 3.9312744140625, - "learning_rate": 3.0321852060982498e-06, - "loss": 0.3846, + "grad_norm": 4.648754596710205, + "learning_rate": 5.05364201016375e-05, + "loss": 0.6537, "step": 23880 }, { "epoch": 4.496517974778844, - "grad_norm": 2.878774404525757, - "learning_rate": 3.020892151326934e-06, - "loss": 0.6847, + "grad_norm": 0.7671937346458435, + "learning_rate": 5.0348202522115566e-05, + "loss": 0.8298, "step": 23890 }, { "epoch": 4.498400150574064, - "grad_norm": 8.510382652282715, - "learning_rate": 3.0095990965556185e-06, - "loss": 0.4522, + "grad_norm": 1.893687129020691, + "learning_rate": 5.015998494259364e-05, + "loss": 0.4802, "step": 23900 }, { "epoch": 4.500282326369283, - "grad_norm": 9.131433486938477, - "learning_rate": 2.998306041784303e-06, - "loss": 0.3852, + "grad_norm": 2.864920139312744, + "learning_rate": 4.997176736307171e-05, + "loss": 0.4142, "step": 23910 }, { "epoch": 4.5021645021645025, - "grad_norm": 14.799365043640137, - "learning_rate": 2.9870129870129873e-06, - "loss": 0.3988, + "grad_norm": 2.016402244567871, + "learning_rate": 4.978354978354978e-05, + "loss": 0.5811, "step": 23920 }, { "epoch": 4.504046677959721, - "grad_norm": 25.332460403442383, - "learning_rate": 2.9757199322416712e-06, - "loss": 0.5088, + "grad_norm": 3.925957441329956, + "learning_rate": 4.959533220402786e-05, + "loss": 0.6436, "step": 23930 }, { "epoch": 4.5059288537549405, - "grad_norm": 15.210441589355469, - "learning_rate": 2.9644268774703556e-06, - "loss": 0.5733, + "grad_norm": 6.499224662780762, + "learning_rate": 4.940711462450593e-05, + "loss": 0.6918, "step": 23940 }, { "epoch": 4.50781102955016, - "grad_norm": 17.5853271484375, - "learning_rate": 2.95313382269904e-06, - "loss": 0.8435, + "grad_norm": 2.7809550762176514, + "learning_rate": 4.9218897044984e-05, + "loss": 0.8341, "step": 23950 }, { "epoch": 4.509693205345379, - "grad_norm": 5.9127020835876465, - "learning_rate": 2.9418407679277244e-06, - "loss": 0.405, + "grad_norm": 2.4689996242523193, + "learning_rate": 4.9030679465462074e-05, + "loss": 0.5343, "step": 23960 }, { "epoch": 4.511575381140599, - "grad_norm": 15.873945236206055, - "learning_rate": 2.9305477131564088e-06, - "loss": 0.3017, + "grad_norm": 2.489816188812256, + "learning_rate": 4.8842461885940144e-05, + "loss": 0.3929, "step": 23970 }, { "epoch": 4.513457556935818, - "grad_norm": 6.168259620666504, - "learning_rate": 2.919254658385093e-06, - "loss": 0.8213, + "grad_norm": 2.4629080295562744, + "learning_rate": 4.865424430641822e-05, + "loss": 0.8677, "step": 23980 }, { "epoch": 4.515339732731037, - "grad_norm": 14.95897102355957, - "learning_rate": 2.907961603613778e-06, - "loss": 0.7194, + "grad_norm": 3.8149843215942383, + "learning_rate": 4.84660267268963e-05, + "loss": 0.6299, "step": 23990 }, { "epoch": 4.517221908526256, - "grad_norm": 9.933831214904785, - "learning_rate": 2.896668548842462e-06, - "loss": 0.4408, + "grad_norm": 1.8469231128692627, + "learning_rate": 4.8277809147374366e-05, + "loss": 0.5836, "step": 24000 }, { "epoch": 4.519104084321476, - "grad_norm": 0.1559751182794571, - "learning_rate": 2.8853754940711463e-06, - "loss": 0.5496, + "grad_norm": 0.25325506925582886, + "learning_rate": 4.808959156785244e-05, + "loss": 0.5975, "step": 24010 }, { "epoch": 4.520986260116695, - "grad_norm": 16.345056533813477, - "learning_rate": 2.8740824392998307e-06, - "loss": 0.5864, + "grad_norm": 2.1251931190490723, + "learning_rate": 4.790137398833051e-05, + "loss": 0.5914, "step": 24020 }, { "epoch": 4.5228684359119145, - "grad_norm": 28.054203033447266, - "learning_rate": 2.862789384528515e-06, - "loss": 0.7297, + "grad_norm": 4.482444763183594, + "learning_rate": 4.771315640880858e-05, + "loss": 0.6149, "step": 24030 }, { "epoch": 4.524750611707134, - "grad_norm": 14.966489791870117, - "learning_rate": 2.8514963297571994e-06, - "loss": 0.3968, + "grad_norm": 0.7547290921211243, + "learning_rate": 4.752493882928666e-05, + "loss": 0.4447, "step": 24040 }, { "epoch": 4.526632787502352, - "grad_norm": 20.097759246826172, - "learning_rate": 2.840203274985884e-06, - "loss": 0.7483, + "grad_norm": 3.609314441680908, + "learning_rate": 4.733672124976473e-05, + "loss": 0.8325, "step": 24050 }, { "epoch": 4.528514963297572, - "grad_norm": 19.95496940612793, - "learning_rate": 2.828910220214568e-06, - "loss": 0.6654, + "grad_norm": 2.2269513607025146, + "learning_rate": 4.71485036702428e-05, + "loss": 0.6928, "step": 24060 }, { "epoch": 4.530397139092791, - "grad_norm": 8.588127136230469, - "learning_rate": 2.8176171654432526e-06, - "loss": 0.2225, + "grad_norm": 2.025987148284912, + "learning_rate": 4.6960286090720874e-05, + "loss": 0.3431, "step": 24070 }, { "epoch": 4.532279314888011, - "grad_norm": 10.798761367797852, - "learning_rate": 2.8063241106719366e-06, - "loss": 0.5675, + "grad_norm": 1.6247036457061768, + "learning_rate": 4.6772068511198944e-05, + "loss": 0.6616, "step": 24080 }, { "epoch": 4.53416149068323, - "grad_norm": 0.21323099732398987, - "learning_rate": 2.795031055900621e-06, - "loss": 0.4654, + "grad_norm": 3.7297098636627197, + "learning_rate": 4.658385093167702e-05, + "loss": 0.5445, "step": 24090 }, { "epoch": 4.536043666478449, - "grad_norm": 12.417448997497559, - "learning_rate": 2.7837380011293053e-06, - "loss": 0.7423, + "grad_norm": 2.0358946323394775, + "learning_rate": 4.639563335215509e-05, + "loss": 0.8047, "step": 24100 }, { "epoch": 4.537925842273668, - "grad_norm": 8.83974838256836, - "learning_rate": 2.7724449463579897e-06, - "loss": 0.3925, + "grad_norm": 2.806361675262451, + "learning_rate": 4.620741577263316e-05, + "loss": 0.5435, "step": 24110 }, { "epoch": 4.539808018068888, - "grad_norm": 28.08986473083496, - "learning_rate": 2.7611518915866745e-06, - "loss": 0.5835, + "grad_norm": 4.876021385192871, + "learning_rate": 4.601919819311124e-05, + "loss": 0.6177, "step": 24120 }, { "epoch": 4.541690193864107, - "grad_norm": 2.058382034301758, - "learning_rate": 2.749858836815359e-06, - "loss": 0.5558, + "grad_norm": 0.8642858862876892, + "learning_rate": 4.583098061358931e-05, + "loss": 0.5793, "step": 24130 }, { "epoch": 4.543572369659326, - "grad_norm": 11.856590270996094, - "learning_rate": 2.7385657820440433e-06, - "loss": 0.6624, + "grad_norm": 3.2075188159942627, + "learning_rate": 4.564276303406738e-05, + "loss": 0.6995, "step": 24140 }, { "epoch": 4.545454545454545, - "grad_norm": 28.79555320739746, - "learning_rate": 2.7272727272727272e-06, - "loss": 0.4069, + "grad_norm": 2.1683106422424316, + "learning_rate": 4.545454545454546e-05, + "loss": 0.4836, "step": 24150 }, { "epoch": 4.547336721249764, - "grad_norm": 0.156916081905365, - "learning_rate": 2.7159796725014116e-06, - "loss": 0.3982, + "grad_norm": 8.566926002502441, + "learning_rate": 4.526632787502353e-05, + "loss": 0.5407, "step": 24160 }, { "epoch": 4.549218897044984, - "grad_norm": 22.804506301879883, - "learning_rate": 2.704686617730096e-06, - "loss": 0.5942, + "grad_norm": 3.3746559619903564, + "learning_rate": 4.50781102955016e-05, + "loss": 0.524, "step": 24170 }, { "epoch": 4.551101072840203, - "grad_norm": 12.148880004882812, - "learning_rate": 2.6933935629587804e-06, - "loss": 0.6395, + "grad_norm": 2.927900791168213, + "learning_rate": 4.4889892715979674e-05, + "loss": 0.7662, "step": 24180 }, { "epoch": 4.552983248635423, - "grad_norm": 17.041955947875977, - "learning_rate": 2.6821005081874648e-06, - "loss": 0.349, + "grad_norm": 1.0152127742767334, + "learning_rate": 4.4701675136457744e-05, + "loss": 0.3934, "step": 24190 }, { "epoch": 4.554865424430642, - "grad_norm": 15.154557228088379, - "learning_rate": 2.670807453416149e-06, - "loss": 0.5323, + "grad_norm": 5.355255603790283, + "learning_rate": 4.451345755693582e-05, + "loss": 0.5864, "step": 24200 }, { "epoch": 4.556747600225862, - "grad_norm": 5.193390369415283, - "learning_rate": 2.6595143986448335e-06, - "loss": 0.306, + "grad_norm": 2.3401451110839844, + "learning_rate": 4.432523997741389e-05, + "loss": 0.3616, "step": 24210 }, { "epoch": 4.55862977602108, - "grad_norm": 14.101144790649414, - "learning_rate": 2.648221343873518e-06, - "loss": 0.5806, + "grad_norm": 2.2497141361236572, + "learning_rate": 4.413702239789196e-05, + "loss": 0.6165, "step": 24220 }, { "epoch": 4.5605119518162995, - "grad_norm": 4.696810722351074, - "learning_rate": 2.636928289102202e-06, - "loss": 0.4525, + "grad_norm": 1.6242727041244507, + "learning_rate": 4.3948804818370036e-05, + "loss": 0.5554, "step": 24230 }, { "epoch": 4.562394127611519, - "grad_norm": 2.615445613861084, - "learning_rate": 2.6256352343308867e-06, - "loss": 0.4967, + "grad_norm": 2.4595236778259277, + "learning_rate": 4.376058723884811e-05, + "loss": 0.5662, "step": 24240 }, { "epoch": 4.564276303406738, - "grad_norm": 35.27506637573242, - "learning_rate": 2.614342179559571e-06, - "loss": 0.4971, + "grad_norm": 0.9235624074935913, + "learning_rate": 4.357236965932618e-05, + "loss": 0.4612, "step": 24250 }, { "epoch": 4.566158479201958, - "grad_norm": 8.616056442260742, - "learning_rate": 2.6030491247882554e-06, - "loss": 0.3323, + "grad_norm": 1.4337669610977173, + "learning_rate": 4.338415207980426e-05, + "loss": 0.428, "step": 24260 }, { "epoch": 4.568040654997176, - "grad_norm": 1.696595311164856, - "learning_rate": 2.59175607001694e-06, - "loss": 0.343, + "grad_norm": 2.5631136894226074, + "learning_rate": 4.319593450028233e-05, + "loss": 0.3682, "step": 24270 }, { "epoch": 4.569922830792396, - "grad_norm": 8.711554527282715, - "learning_rate": 2.580463015245624e-06, - "loss": 0.5896, + "grad_norm": 2.581136703491211, + "learning_rate": 4.30077169207604e-05, + "loss": 0.7191, "step": 24280 }, { "epoch": 4.571805006587615, - "grad_norm": 15.512261390686035, - "learning_rate": 2.5691699604743086e-06, - "loss": 0.4962, + "grad_norm": 1.4882745742797852, + "learning_rate": 4.2819499341238474e-05, + "loss": 0.5948, "step": 24290 }, { "epoch": 4.573687182382835, - "grad_norm": 10.31464672088623, - "learning_rate": 2.5578769057029925e-06, - "loss": 0.5446, + "grad_norm": 2.5179505348205566, + "learning_rate": 4.2631281761716544e-05, + "loss": 0.639, "step": 24300 }, { "epoch": 4.575569358178054, - "grad_norm": 68.28598022460938, - "learning_rate": 2.546583850931677e-06, - "loss": 0.5234, + "grad_norm": 6.4887919425964355, + "learning_rate": 4.244306418219462e-05, + "loss": 0.6493, "step": 24310 }, { "epoch": 4.577451533973273, - "grad_norm": 23.362018585205078, - "learning_rate": 2.5352907961603613e-06, - "loss": 0.4662, + "grad_norm": 2.8530259132385254, + "learning_rate": 4.225484660267269e-05, + "loss": 0.4943, "step": 24320 }, { "epoch": 4.579333709768492, - "grad_norm": 0.38361719250679016, - "learning_rate": 2.5239977413890457e-06, - "loss": 0.3885, + "grad_norm": 0.3246712386608124, + "learning_rate": 4.206662902315076e-05, + "loss": 0.3812, "step": 24330 }, { "epoch": 4.5812158855637115, - "grad_norm": 13.636258125305176, - "learning_rate": 2.51270468661773e-06, - "loss": 0.1951, + "grad_norm": 2.248939037322998, + "learning_rate": 4.1878411443628836e-05, + "loss": 0.3863, "step": 24340 }, { "epoch": 4.583098061358931, - "grad_norm": 13.915003776550293, - "learning_rate": 2.5014116318464144e-06, - "loss": 0.2864, + "grad_norm": 3.7241125106811523, + "learning_rate": 4.1690193864106906e-05, + "loss": 0.4399, "step": 24350 }, { "epoch": 4.58498023715415, - "grad_norm": 28.40807342529297, - "learning_rate": 2.4901185770750993e-06, - "loss": 0.4974, + "grad_norm": 7.332564353942871, + "learning_rate": 4.150197628458498e-05, + "loss": 0.5854, "step": 24360 }, { "epoch": 4.58686241294937, - "grad_norm": 21.473955154418945, - "learning_rate": 2.478825522303783e-06, - "loss": 0.6872, + "grad_norm": 4.252923965454102, + "learning_rate": 4.131375870506306e-05, + "loss": 0.6474, "step": 24370 }, { "epoch": 4.588744588744589, - "grad_norm": 39.9091911315918, - "learning_rate": 2.4675324675324676e-06, - "loss": 0.4134, + "grad_norm": 5.477579593658447, + "learning_rate": 4.112554112554113e-05, + "loss": 0.5945, "step": 24380 }, { "epoch": 4.590626764539808, - "grad_norm": 18.155563354492188, - "learning_rate": 2.456239412761152e-06, - "loss": 0.6036, + "grad_norm": 4.131031513214111, + "learning_rate": 4.09373235460192e-05, + "loss": 0.6526, "step": 24390 }, { "epoch": 4.592508940335027, - "grad_norm": 9.114380836486816, - "learning_rate": 2.4449463579898364e-06, - "loss": 0.6799, + "grad_norm": 1.5530714988708496, + "learning_rate": 4.0749105966497274e-05, + "loss": 0.6654, "step": 24400 }, { "epoch": 4.594391116130247, - "grad_norm": 6.3043904304504395, - "learning_rate": 2.4336533032185207e-06, - "loss": 0.626, + "grad_norm": 2.8524997234344482, + "learning_rate": 4.0560888386975344e-05, + "loss": 0.5783, "step": 24410 }, { "epoch": 4.596273291925466, - "grad_norm": 16.44568634033203, - "learning_rate": 2.422360248447205e-06, - "loss": 0.3922, + "grad_norm": 2.7015812397003174, + "learning_rate": 4.0372670807453414e-05, + "loss": 0.4753, "step": 24420 }, { "epoch": 4.5981554677206855, - "grad_norm": 9.025197982788086, - "learning_rate": 2.4110671936758895e-06, - "loss": 0.7261, + "grad_norm": 1.2602338790893555, + "learning_rate": 4.018445322793149e-05, + "loss": 0.7536, "step": 24430 }, { "epoch": 4.600037643515904, - "grad_norm": 19.083044052124023, - "learning_rate": 2.399774138904574e-06, - "loss": 0.4588, + "grad_norm": 3.109779119491577, + "learning_rate": 3.999623564840956e-05, + "loss": 0.5292, "step": 24440 }, { "epoch": 4.6019198193111235, - "grad_norm": 16.03182029724121, - "learning_rate": 2.388481084133258e-06, - "loss": 0.4318, + "grad_norm": 3.305142402648926, + "learning_rate": 3.9808018068887636e-05, + "loss": 0.44, "step": 24450 }, { "epoch": 4.603801995106343, - "grad_norm": 1.4416394233703613, - "learning_rate": 2.3771880293619422e-06, - "loss": 0.658, + "grad_norm": 0.8609833717346191, + "learning_rate": 3.9619800489365706e-05, + "loss": 0.592, "step": 24460 }, { "epoch": 4.605684170901562, - "grad_norm": 7.5059075355529785, - "learning_rate": 2.3658949745906266e-06, - "loss": 0.4453, + "grad_norm": 1.3746150732040405, + "learning_rate": 3.9431582909843775e-05, + "loss": 0.5808, "step": 24470 }, { "epoch": 4.607566346696782, - "grad_norm": 4.021427631378174, - "learning_rate": 2.3546019198193114e-06, - "loss": 0.5636, + "grad_norm": 1.0084211826324463, + "learning_rate": 3.924336533032186e-05, + "loss": 0.7987, "step": 24480 }, { "epoch": 4.609448522492, - "grad_norm": 5.8382744789123535, - "learning_rate": 2.343308865047996e-06, - "loss": 0.2773, + "grad_norm": 2.227627992630005, + "learning_rate": 3.905514775079993e-05, + "loss": 0.4238, "step": 24490 }, { "epoch": 4.61133069828722, - "grad_norm": 10.830999374389648, - "learning_rate": 2.33201581027668e-06, - "loss": 0.5919, + "grad_norm": 3.0300769805908203, + "learning_rate": 3.8866930171278e-05, + "loss": 0.4748, "step": 24500 }, { "epoch": 4.613212874082439, - "grad_norm": 30.35820198059082, - "learning_rate": 2.3207227555053646e-06, - "loss": 0.3376, + "grad_norm": 3.6097495555877686, + "learning_rate": 3.8678712591756074e-05, + "loss": 0.3743, "step": 24510 }, { "epoch": 4.615095049877659, - "grad_norm": 17.739852905273438, - "learning_rate": 2.3094297007340485e-06, - "loss": 0.9373, + "grad_norm": 5.164699077606201, + "learning_rate": 3.8490495012234144e-05, + "loss": 0.8332, "step": 24520 }, { "epoch": 4.616977225672878, - "grad_norm": 9.197490692138672, - "learning_rate": 2.298136645962733e-06, - "loss": 0.3972, + "grad_norm": 1.3957386016845703, + "learning_rate": 3.8302277432712214e-05, + "loss": 0.532, "step": 24530 }, { "epoch": 4.6188594014680975, - "grad_norm": 16.6794490814209, - "learning_rate": 2.2868435911914173e-06, - "loss": 0.3707, + "grad_norm": 6.435964107513428, + "learning_rate": 3.811405985319029e-05, + "loss": 0.5485, "step": 24540 }, { "epoch": 4.620741577263316, - "grad_norm": 0.1481352001428604, - "learning_rate": 2.2755505364201017e-06, - "loss": 0.4161, + "grad_norm": 0.23536163568496704, + "learning_rate": 3.792584227366836e-05, + "loss": 0.5219, "step": 24550 }, { "epoch": 4.6226237530585355, - "grad_norm": 0.12366284430027008, - "learning_rate": 2.264257481648786e-06, - "loss": 0.4102, + "grad_norm": 0.12256874889135361, + "learning_rate": 3.7737624694146436e-05, + "loss": 0.3599, "step": 24560 }, { "epoch": 4.624505928853755, - "grad_norm": 3.985370397567749, - "learning_rate": 2.2529644268774704e-06, - "loss": 0.6824, + "grad_norm": 0.9305835366249084, + "learning_rate": 3.7549407114624506e-05, + "loss": 0.6705, "step": 24570 }, { "epoch": 4.626388104648974, - "grad_norm": 4.121525764465332, - "learning_rate": 2.241671372106155e-06, - "loss": 0.4749, + "grad_norm": 1.9154576063156128, + "learning_rate": 3.7361189535102575e-05, + "loss": 0.5016, "step": 24580 }, { "epoch": 4.628270280444194, - "grad_norm": 3.955786943435669, - "learning_rate": 2.2303783173348388e-06, - "loss": 0.4139, + "grad_norm": 2.5845377445220947, + "learning_rate": 3.717297195558065e-05, + "loss": 0.4827, "step": 24590 }, { "epoch": 4.630152456239413, - "grad_norm": 17.2208251953125, - "learning_rate": 2.219085262563523e-06, - "loss": 0.7047, + "grad_norm": 2.6821815967559814, + "learning_rate": 3.698475437605872e-05, + "loss": 0.7094, "step": 24600 }, { "epoch": 4.632034632034632, - "grad_norm": 7.985387325286865, - "learning_rate": 2.207792207792208e-06, - "loss": 0.4245, + "grad_norm": 1.9128459692001343, + "learning_rate": 3.67965367965368e-05, + "loss": 0.4997, "step": 24610 }, { "epoch": 4.633916807829851, - "grad_norm": 10.840561866760254, - "learning_rate": 2.1964991530208923e-06, - "loss": 0.451, + "grad_norm": 4.187811851501465, + "learning_rate": 3.6608319217014874e-05, + "loss": 0.6142, "step": 24620 }, { "epoch": 4.635798983625071, - "grad_norm": 1.2460243701934814, - "learning_rate": 2.1852060982495767e-06, - "loss": 0.5374, + "grad_norm": 0.5653007626533508, + "learning_rate": 3.6420101637492944e-05, + "loss": 0.5527, "step": 24630 }, { "epoch": 4.63768115942029, - "grad_norm": 7.391758918762207, - "learning_rate": 2.173913043478261e-06, - "loss": 0.5306, + "grad_norm": 4.011232852935791, + "learning_rate": 3.6231884057971014e-05, + "loss": 0.5676, "step": 24640 }, { "epoch": 4.6395633352155095, - "grad_norm": 17.164552688598633, - "learning_rate": 2.1626199887069455e-06, - "loss": 0.4736, + "grad_norm": 1.5307304859161377, + "learning_rate": 3.604366647844909e-05, + "loss": 0.4625, "step": 24650 }, { "epoch": 4.641445511010728, - "grad_norm": 15.955860137939453, - "learning_rate": 2.1513269339356294e-06, - "loss": 0.7017, + "grad_norm": 3.410989761352539, + "learning_rate": 3.585544889892716e-05, + "loss": 0.7372, "step": 24660 }, { "epoch": 4.643327686805947, - "grad_norm": 6.390231609344482, - "learning_rate": 2.140033879164314e-06, - "loss": 0.6922, + "grad_norm": 4.534970283508301, + "learning_rate": 3.5667231319405236e-05, + "loss": 0.668, "step": 24670 }, { "epoch": 4.645209862601167, - "grad_norm": 35.61813735961914, - "learning_rate": 2.128740824392998e-06, - "loss": 0.7417, + "grad_norm": 5.051069259643555, + "learning_rate": 3.5479013739883306e-05, + "loss": 0.6868, "step": 24680 }, { "epoch": 4.647092038396386, - "grad_norm": 23.050209045410156, - "learning_rate": 2.1174477696216826e-06, - "loss": 0.6244, + "grad_norm": 2.785597085952759, + "learning_rate": 3.5290796160361375e-05, + "loss": 0.6126, "step": 24690 }, { "epoch": 4.648974214191606, - "grad_norm": 8.97560977935791, - "learning_rate": 2.106154714850367e-06, - "loss": 0.6843, + "grad_norm": 2.1288540363311768, + "learning_rate": 3.510257858083945e-05, + "loss": 0.7734, "step": 24700 }, { "epoch": 4.650856389986825, - "grad_norm": 5.241304874420166, - "learning_rate": 2.0948616600790514e-06, - "loss": 0.4725, + "grad_norm": 1.6902096271514893, + "learning_rate": 3.491436100131752e-05, + "loss": 0.5791, "step": 24710 }, { "epoch": 4.652738565782044, - "grad_norm": 15.338078498840332, - "learning_rate": 2.0835686053077357e-06, - "loss": 0.6333, + "grad_norm": 4.554157257080078, + "learning_rate": 3.472614342179559e-05, + "loss": 0.7908, "step": 24720 }, { "epoch": 4.654620741577263, - "grad_norm": 31.047271728515625, - "learning_rate": 2.0722755505364205e-06, - "loss": 0.2574, + "grad_norm": 2.3936281204223633, + "learning_rate": 3.4537925842273674e-05, + "loss": 0.3783, "step": 24730 }, { "epoch": 4.656502917372483, - "grad_norm": 39.24232864379883, - "learning_rate": 2.0609824957651045e-06, - "loss": 0.3953, + "grad_norm": 3.950031280517578, + "learning_rate": 3.4349708262751744e-05, + "loss": 0.4945, "step": 24740 }, { "epoch": 4.658385093167702, - "grad_norm": 6.512187957763672, - "learning_rate": 2.049689440993789e-06, - "loss": 0.398, + "grad_norm": 2.500994920730591, + "learning_rate": 3.4161490683229814e-05, + "loss": 0.637, "step": 24750 }, { "epoch": 4.660267268962921, - "grad_norm": 10.04133415222168, - "learning_rate": 2.0383963862224733e-06, - "loss": 0.6215, + "grad_norm": 2.361637592315674, + "learning_rate": 3.397327310370789e-05, + "loss": 0.6903, "step": 24760 }, { "epoch": 4.662149444758141, - "grad_norm": 20.212570190429688, - "learning_rate": 2.0271033314511576e-06, - "loss": 0.8138, + "grad_norm": 2.957148551940918, + "learning_rate": 3.378505552418596e-05, + "loss": 0.8531, "step": 24770 }, { "epoch": 4.664031620553359, - "grad_norm": 8.476882934570312, - "learning_rate": 2.015810276679842e-06, - "loss": 0.7214, + "grad_norm": 2.1123647689819336, + "learning_rate": 3.359683794466403e-05, + "loss": 0.7265, "step": 24780 }, { "epoch": 4.665913796348579, - "grad_norm": 11.082260131835938, - "learning_rate": 2.0045172219085264e-06, - "loss": 0.4978, + "grad_norm": 2.273115396499634, + "learning_rate": 3.3408620365142106e-05, + "loss": 0.651, "step": 24790 }, { "epoch": 4.667795972143798, - "grad_norm": 0.5434314012527466, - "learning_rate": 1.993224167137211e-06, - "loss": 0.72, + "grad_norm": 0.9176938533782959, + "learning_rate": 3.3220402785620175e-05, + "loss": 0.8676, "step": 24800 }, { "epoch": 4.669678147939018, - "grad_norm": 28.41150665283203, - "learning_rate": 1.9819311123658948e-06, - "loss": 0.6336, + "grad_norm": 5.138895511627197, + "learning_rate": 3.303218520609825e-05, + "loss": 0.6918, "step": 24810 }, { "epoch": 4.671560323734237, - "grad_norm": 1.0246213674545288, - "learning_rate": 1.970638057594579e-06, - "loss": 0.888, + "grad_norm": 1.913735270500183, + "learning_rate": 3.284396762657632e-05, + "loss": 0.8698, "step": 24820 }, { "epoch": 4.673442499529456, - "grad_norm": 15.647318840026855, - "learning_rate": 1.9593450028232635e-06, - "loss": 0.6784, + "grad_norm": 3.0344340801239014, + "learning_rate": 3.265575004705439e-05, + "loss": 0.7489, "step": 24830 }, { "epoch": 4.675324675324675, - "grad_norm": 16.091299057006836, - "learning_rate": 1.948051948051948e-06, - "loss": 0.2994, + "grad_norm": 3.871687889099121, + "learning_rate": 3.246753246753247e-05, + "loss": 0.3914, "step": 24840 }, { "epoch": 4.6772068511198945, - "grad_norm": 20.402158737182617, - "learning_rate": 1.9367588932806327e-06, - "loss": 0.4222, + "grad_norm": 7.366399765014648, + "learning_rate": 3.2279314888010544e-05, + "loss": 0.6409, "step": 24850 }, { "epoch": 4.679089026915114, - "grad_norm": 29.887250900268555, - "learning_rate": 1.925465838509317e-06, - "loss": 0.5632, + "grad_norm": 6.943544864654541, + "learning_rate": 3.2091097308488614e-05, + "loss": 0.6329, "step": 24860 }, { "epoch": 4.680971202710333, - "grad_norm": 0.39280015230178833, - "learning_rate": 1.9141727837380015e-06, - "loss": 0.3391, + "grad_norm": 0.23021714389324188, + "learning_rate": 3.190287972896669e-05, + "loss": 0.3462, "step": 24870 }, { "epoch": 4.682853378505552, - "grad_norm": 1.1423434019088745, - "learning_rate": 1.9028797289666856e-06, - "loss": 0.4052, + "grad_norm": 3.1440484523773193, + "learning_rate": 3.171466214944476e-05, + "loss": 0.5147, "step": 24880 }, { "epoch": 4.684735554300771, - "grad_norm": 0.18918712437152863, - "learning_rate": 1.89158667419537e-06, - "loss": 0.6111, + "grad_norm": 0.49845820665359497, + "learning_rate": 3.152644456992283e-05, + "loss": 0.8418, "step": 24890 }, { "epoch": 4.686617730095991, - "grad_norm": 6.821197986602783, - "learning_rate": 1.8802936194240542e-06, - "loss": 0.3858, + "grad_norm": 2.562613010406494, + "learning_rate": 3.1338226990400906e-05, + "loss": 0.4522, "step": 24900 }, { "epoch": 4.68849990589121, - "grad_norm": 18.688823699951172, - "learning_rate": 1.8690005646527386e-06, - "loss": 0.5372, + "grad_norm": 1.2562291622161865, + "learning_rate": 3.1150009410878975e-05, + "loss": 0.5723, "step": 24910 }, { "epoch": 4.69038208168643, - "grad_norm": 22.61110496520996, - "learning_rate": 1.857707509881423e-06, - "loss": 0.5846, + "grad_norm": 3.6861045360565186, + "learning_rate": 3.096179183135705e-05, + "loss": 0.6128, "step": 24920 }, { "epoch": 4.692264257481649, - "grad_norm": 14.270989418029785, - "learning_rate": 1.8464144551101073e-06, - "loss": 0.3677, + "grad_norm": 2.452535629272461, + "learning_rate": 3.077357425183512e-05, + "loss": 0.5187, "step": 24930 }, { "epoch": 4.6941464332768685, - "grad_norm": 20.81678581237793, - "learning_rate": 1.8351214003387917e-06, - "loss": 0.3929, + "grad_norm": 5.4972944259643555, + "learning_rate": 3.05853566723132e-05, + "loss": 0.4926, "step": 24940 }, { "epoch": 4.696028609072087, - "grad_norm": 0.7753129601478577, - "learning_rate": 1.823828345567476e-06, - "loss": 0.512, + "grad_norm": 0.3770606815814972, + "learning_rate": 3.0397139092791268e-05, + "loss": 0.5036, "step": 24950 }, { "epoch": 4.6979107848673065, - "grad_norm": 7.8035664558410645, - "learning_rate": 1.8125352907961605e-06, - "loss": 0.8553, + "grad_norm": 1.6760812997817993, + "learning_rate": 3.020892151326934e-05, + "loss": 0.7949, "step": 24960 }, { "epoch": 4.699792960662526, - "grad_norm": 9.686589241027832, - "learning_rate": 1.8012422360248449e-06, - "loss": 0.7138, + "grad_norm": 1.8957998752593994, + "learning_rate": 3.0020703933747414e-05, + "loss": 0.891, "step": 24970 }, { "epoch": 4.701675136457745, - "grad_norm": 0.7475610375404358, - "learning_rate": 1.789949181253529e-06, - "loss": 0.6173, + "grad_norm": 0.5944697260856628, + "learning_rate": 2.9832486354225483e-05, + "loss": 0.5906, "step": 24980 }, { "epoch": 4.703557312252965, - "grad_norm": 11.084076881408691, - "learning_rate": 1.7786561264822134e-06, - "loss": 0.353, + "grad_norm": 4.481252670288086, + "learning_rate": 2.9644268774703556e-05, + "loss": 0.4426, "step": 24990 }, { "epoch": 4.705439488048183, - "grad_norm": 33.30714416503906, - "learning_rate": 1.7673630717108978e-06, - "loss": 0.578, + "grad_norm": 4.642335891723633, + "learning_rate": 2.945605119518163e-05, + "loss": 0.5466, "step": 25000 }, { "epoch": 4.707321663843403, - "grad_norm": 35.408416748046875, - "learning_rate": 1.7560700169395822e-06, - "loss": 0.4082, + "grad_norm": 3.7608211040496826, + "learning_rate": 2.9267833615659706e-05, + "loss": 0.5645, "step": 25010 }, { "epoch": 4.709203839638622, - "grad_norm": 37.653106689453125, - "learning_rate": 1.7447769621682666e-06, - "loss": 0.5041, + "grad_norm": 4.015994071960449, + "learning_rate": 2.9079616036137775e-05, + "loss": 0.5362, "step": 25020 }, { "epoch": 4.711086015433842, - "grad_norm": 0.38172101974487305, - "learning_rate": 1.733483907396951e-06, - "loss": 0.6022, + "grad_norm": 0.06955814361572266, + "learning_rate": 2.889139845661585e-05, + "loss": 0.7009, "step": 25030 }, { "epoch": 4.712968191229061, - "grad_norm": 0.1132197305560112, - "learning_rate": 1.7221908526256353e-06, - "loss": 0.4891, + "grad_norm": 0.20994648337364197, + "learning_rate": 2.870318087709392e-05, + "loss": 0.4559, "step": 25040 }, { "epoch": 4.71485036702428, - "grad_norm": 24.087400436401367, - "learning_rate": 1.7108977978543195e-06, - "loss": 0.4906, + "grad_norm": 0.5177777409553528, + "learning_rate": 2.851496329757199e-05, + "loss": 0.5917, "step": 25050 }, { "epoch": 4.716732542819499, - "grad_norm": 9.231995582580566, - "learning_rate": 1.6996047430830039e-06, - "loss": 0.6238, + "grad_norm": 1.8041309118270874, + "learning_rate": 2.8326745718050064e-05, + "loss": 0.4951, "step": 25060 }, { "epoch": 4.7186147186147185, - "grad_norm": 22.86023712158203, - "learning_rate": 1.6883116883116885e-06, - "loss": 0.9478, + "grad_norm": 3.804542303085327, + "learning_rate": 2.813852813852814e-05, + "loss": 1.0139, "step": 25070 }, { "epoch": 4.720496894409938, - "grad_norm": 7.785012245178223, - "learning_rate": 1.6770186335403729e-06, - "loss": 0.5008, + "grad_norm": 3.876880168914795, + "learning_rate": 2.7950310559006214e-05, + "loss": 0.5698, "step": 25080 }, { "epoch": 4.722379070205157, - "grad_norm": 2.3250386714935303, - "learning_rate": 1.665725578769057e-06, - "loss": 0.7925, + "grad_norm": 3.0488855838775635, + "learning_rate": 2.7762092979484283e-05, + "loss": 0.9231, "step": 25090 }, { "epoch": 4.724261246000377, - "grad_norm": 12.125810623168945, - "learning_rate": 1.6544325239977414e-06, - "loss": 0.3744, + "grad_norm": 2.9211177825927734, + "learning_rate": 2.7573875399962356e-05, + "loss": 0.4906, "step": 25100 }, { "epoch": 4.726143421795595, - "grad_norm": 14.144658088684082, - "learning_rate": 1.6431394692264258e-06, - "loss": 0.4073, + "grad_norm": 2.8851497173309326, + "learning_rate": 2.738565782044043e-05, + "loss": 0.4443, "step": 25110 }, { "epoch": 4.728025597590815, - "grad_norm": 4.601728916168213, - "learning_rate": 1.63184641445511e-06, - "loss": 0.3743, + "grad_norm": 2.138963460922241, + "learning_rate": 2.7197440240918502e-05, + "loss": 0.3601, "step": 25120 }, { "epoch": 4.729907773386034, - "grad_norm": 9.717355728149414, - "learning_rate": 1.6205533596837946e-06, - "loss": 0.5983, + "grad_norm": 2.7832868099212646, + "learning_rate": 2.7009222661396575e-05, + "loss": 0.669, "step": 25130 }, { "epoch": 4.731789949181254, - "grad_norm": 13.010869026184082, - "learning_rate": 1.609260304912479e-06, - "loss": 0.4777, + "grad_norm": 3.739305019378662, + "learning_rate": 2.682100508187465e-05, + "loss": 0.5055, "step": 25140 }, { "epoch": 4.733672124976473, - "grad_norm": 10.841914176940918, - "learning_rate": 1.5979672501411633e-06, - "loss": 0.7914, + "grad_norm": 2.2495994567871094, + "learning_rate": 2.663278750235272e-05, + "loss": 0.8668, "step": 25150 }, { "epoch": 4.7355543007716925, - "grad_norm": 11.253789901733398, - "learning_rate": 1.5866741953698475e-06, - "loss": 0.5007, + "grad_norm": 2.521467924118042, + "learning_rate": 2.644456992283079e-05, + "loss": 0.5573, "step": 25160 }, { "epoch": 4.737436476566911, - "grad_norm": 20.718843460083008, - "learning_rate": 1.5753811405985319e-06, - "loss": 0.4821, + "grad_norm": 2.1538898944854736, + "learning_rate": 2.6256352343308864e-05, + "loss": 0.5449, "step": 25170 }, { "epoch": 4.7393186523621305, - "grad_norm": 15.470338821411133, - "learning_rate": 1.5640880858272163e-06, - "loss": 0.3737, + "grad_norm": 5.599278926849365, + "learning_rate": 2.6068134763786937e-05, + "loss": 0.4944, "step": 25180 }, { "epoch": 4.74120082815735, - "grad_norm": 0.15901845693588257, - "learning_rate": 1.5527950310559008e-06, - "loss": 0.1535, + "grad_norm": 1.8122804164886475, + "learning_rate": 2.5879917184265014e-05, + "loss": 0.3517, "step": 25190 }, { "epoch": 4.743083003952569, - "grad_norm": 17.537002563476562, - "learning_rate": 1.541501976284585e-06, - "loss": 0.3773, + "grad_norm": 3.613555669784546, + "learning_rate": 2.5691699604743083e-05, + "loss": 0.5506, "step": 25200 }, { "epoch": 4.744965179747789, - "grad_norm": 6.956587314605713, - "learning_rate": 1.5302089215132694e-06, - "loss": 0.3261, + "grad_norm": 3.1564066410064697, + "learning_rate": 2.5503482025221156e-05, + "loss": 0.3377, "step": 25210 }, { "epoch": 4.746847355543007, - "grad_norm": 5.067175388336182, - "learning_rate": 1.5189158667419538e-06, - "loss": 0.6587, + "grad_norm": 1.0180994272232056, + "learning_rate": 2.531526444569923e-05, + "loss": 0.7821, "step": 25220 }, { "epoch": 4.748729531338227, - "grad_norm": 25.866880416870117, - "learning_rate": 1.507622811970638e-06, - "loss": 0.6093, + "grad_norm": 1.7408175468444824, + "learning_rate": 2.5127046866177302e-05, + "loss": 0.6947, "step": 25230 }, { "epoch": 4.750611707133446, - "grad_norm": 2.593472957611084, - "learning_rate": 1.4963297571993223e-06, - "loss": 0.2919, + "grad_norm": 0.4839925467967987, + "learning_rate": 2.4938829286655372e-05, + "loss": 0.3432, "step": 25240 }, { "epoch": 4.752493882928666, - "grad_norm": 5.722588062286377, - "learning_rate": 1.485036702428007e-06, - "loss": 0.464, + "grad_norm": 3.0300328731536865, + "learning_rate": 2.475061170713345e-05, + "loss": 0.5659, "step": 25250 }, { "epoch": 4.754376058723885, - "grad_norm": 12.117194175720215, - "learning_rate": 1.4737436476566913e-06, - "loss": 0.2929, + "grad_norm": 1.7946454286575317, + "learning_rate": 2.456239412761152e-05, + "loss": 0.3518, "step": 25260 }, { "epoch": 4.7562582345191045, - "grad_norm": 18.254680633544922, - "learning_rate": 1.4624505928853755e-06, - "loss": 0.757, + "grad_norm": 2.8185391426086426, + "learning_rate": 2.437417654808959e-05, + "loss": 0.7663, "step": 25270 }, { "epoch": 4.758140410314323, - "grad_norm": 0.17077714204788208, - "learning_rate": 1.4511575381140599e-06, - "loss": 0.4409, + "grad_norm": 0.3781205713748932, + "learning_rate": 2.4185958968567664e-05, + "loss": 0.65, "step": 25280 }, { "epoch": 4.760022586109542, - "grad_norm": 11.023554801940918, - "learning_rate": 1.4398644833427442e-06, - "loss": 0.6109, + "grad_norm": 3.49873423576355, + "learning_rate": 2.3997741389045737e-05, + "loss": 0.6592, "step": 25290 }, { "epoch": 4.761904761904762, - "grad_norm": 11.318467140197754, - "learning_rate": 1.4285714285714286e-06, - "loss": 0.41, + "grad_norm": 2.9731030464172363, + "learning_rate": 2.380952380952381e-05, + "loss": 0.4748, "step": 25300 }, { "epoch": 4.763786937699981, - "grad_norm": 0.2806694507598877, - "learning_rate": 1.417278373800113e-06, - "loss": 0.3642, + "grad_norm": 1.2424784898757935, + "learning_rate": 2.3621306230001883e-05, + "loss": 0.3967, "step": 25310 }, { "epoch": 4.765669113495201, - "grad_norm": 3.844435930252075, - "learning_rate": 1.4059853190287974e-06, - "loss": 0.5849, + "grad_norm": 2.284749746322632, + "learning_rate": 2.3433088650479956e-05, + "loss": 0.6169, "step": 25320 }, { "epoch": 4.76755128929042, - "grad_norm": 6.41024923324585, - "learning_rate": 1.3946922642574818e-06, - "loss": 0.4466, + "grad_norm": 4.062412738800049, + "learning_rate": 2.324487107095803e-05, + "loss": 0.4828, "step": 25330 }, { "epoch": 4.769433465085639, - "grad_norm": 12.630584716796875, - "learning_rate": 1.383399209486166e-06, - "loss": 0.4274, + "grad_norm": 1.2760084867477417, + "learning_rate": 2.30566534914361e-05, + "loss": 0.4661, "step": 25340 }, { "epoch": 4.771315640880858, - "grad_norm": 14.851526260375977, - "learning_rate": 1.3721061547148503e-06, - "loss": 0.5182, + "grad_norm": 2.2040622234344482, + "learning_rate": 2.2868435911914172e-05, + "loss": 0.5224, "step": 25350 }, { "epoch": 4.773197816676078, - "grad_norm": 10.023591995239258, - "learning_rate": 1.3608130999435347e-06, - "loss": 0.7274, + "grad_norm": 2.973165273666382, + "learning_rate": 2.2680218332392245e-05, + "loss": 0.7409, "step": 25360 }, { "epoch": 4.775079992471297, - "grad_norm": 32.864810943603516, - "learning_rate": 1.3495200451722193e-06, - "loss": 0.464, + "grad_norm": 4.954020023345947, + "learning_rate": 2.249200075287032e-05, + "loss": 0.3633, "step": 25370 }, { "epoch": 4.776962168266516, - "grad_norm": 17.206785202026367, - "learning_rate": 1.3382269904009035e-06, - "loss": 0.4297, + "grad_norm": 3.838167905807495, + "learning_rate": 2.230378317334839e-05, + "loss": 0.5015, "step": 25380 }, { "epoch": 4.778844344061735, - "grad_norm": 11.58726978302002, - "learning_rate": 1.3269339356295879e-06, - "loss": 0.4302, + "grad_norm": 2.710631847381592, + "learning_rate": 2.2115565593826464e-05, + "loss": 0.6205, "step": 25390 }, { "epoch": 4.780726519856954, - "grad_norm": 5.545811176300049, - "learning_rate": 1.3156408808582722e-06, - "loss": 0.4928, + "grad_norm": 1.263107419013977, + "learning_rate": 2.1927348014304537e-05, + "loss": 0.5492, "step": 25400 }, { "epoch": 4.782608695652174, - "grad_norm": 14.361126899719238, - "learning_rate": 1.3043478260869566e-06, - "loss": 0.6301, + "grad_norm": 2.989341974258423, + "learning_rate": 2.173913043478261e-05, + "loss": 0.5812, "step": 25410 }, { "epoch": 4.784490871447393, - "grad_norm": 16.078994750976562, - "learning_rate": 1.2930547713156408e-06, - "loss": 0.8214, + "grad_norm": 1.7873269319534302, + "learning_rate": 2.155091285526068e-05, + "loss": 0.7841, "step": 25420 }, { "epoch": 4.786373047242613, - "grad_norm": 18.931100845336914, - "learning_rate": 1.2817617165443252e-06, - "loss": 0.8551, + "grad_norm": 28.162975311279297, + "learning_rate": 2.1362695275738753e-05, + "loss": 0.8946, "step": 25430 }, { "epoch": 4.788255223037831, - "grad_norm": 12.793909072875977, - "learning_rate": 1.2704686617730098e-06, - "loss": 0.8418, + "grad_norm": 2.6471946239471436, + "learning_rate": 2.117447769621683e-05, + "loss": 0.8679, "step": 25440 }, { "epoch": 4.790137398833051, - "grad_norm": 9.514110565185547, - "learning_rate": 1.259175607001694e-06, - "loss": 0.4481, + "grad_norm": 5.760558128356934, + "learning_rate": 2.09862601166949e-05, + "loss": 0.6001, "step": 25450 }, { "epoch": 4.79201957462827, - "grad_norm": 2.824892997741699, - "learning_rate": 1.2478825522303783e-06, - "loss": 0.7443, + "grad_norm": 2.5529234409332275, + "learning_rate": 2.0798042537172972e-05, + "loss": 0.8864, "step": 25460 }, { "epoch": 4.7939017504234895, - "grad_norm": 0.1960848569869995, - "learning_rate": 1.2365894974590627e-06, - "loss": 0.5792, + "grad_norm": 0.7746294140815735, + "learning_rate": 2.0609824957651045e-05, + "loss": 0.6143, "step": 25470 }, { "epoch": 4.795783926218709, - "grad_norm": 0.339618057012558, - "learning_rate": 1.225296442687747e-06, - "loss": 0.5427, + "grad_norm": 0.07354772090911865, + "learning_rate": 2.0421607378129118e-05, + "loss": 0.4895, "step": 25480 }, { "epoch": 4.797666102013928, - "grad_norm": 6.723775863647461, - "learning_rate": 1.2140033879164313e-06, - "loss": 0.3267, + "grad_norm": 1.4285778999328613, + "learning_rate": 2.0233389798607188e-05, + "loss": 0.3116, "step": 25490 }, { "epoch": 4.799548277809148, - "grad_norm": 1.1799036264419556, - "learning_rate": 1.2027103331451158e-06, - "loss": 0.3805, + "grad_norm": 1.7135233879089355, + "learning_rate": 2.0045172219085264e-05, + "loss": 0.4581, "step": 25500 }, { "epoch": 4.801430453604366, - "grad_norm": 3.111079216003418, - "learning_rate": 1.1914172783738002e-06, - "loss": 0.3117, + "grad_norm": 1.4032816886901855, + "learning_rate": 1.9856954639563337e-05, + "loss": 0.3102, "step": 25510 }, { "epoch": 4.803312629399586, - "grad_norm": 0.1729825735092163, - "learning_rate": 1.1801242236024846e-06, - "loss": 0.4701, + "grad_norm": 0.2566004693508148, + "learning_rate": 1.9668737060041407e-05, + "loss": 0.4983, "step": 25520 }, { "epoch": 4.805194805194805, - "grad_norm": 8.52800464630127, - "learning_rate": 1.1688311688311688e-06, - "loss": 0.4286, + "grad_norm": 1.9948326349258423, + "learning_rate": 1.948051948051948e-05, + "loss": 0.5406, "step": 25530 }, { "epoch": 4.807076980990025, - "grad_norm": 10.847475051879883, - "learning_rate": 1.1575381140598532e-06, - "loss": 0.3005, + "grad_norm": 1.6091513633728027, + "learning_rate": 1.9292301900997553e-05, + "loss": 0.3729, "step": 25540 }, { "epoch": 4.808959156785244, - "grad_norm": 11.9356689453125, - "learning_rate": 1.1462450592885375e-06, - "loss": 0.4045, + "grad_norm": 2.1368279457092285, + "learning_rate": 1.9104084321475626e-05, + "loss": 0.4786, "step": 25550 }, { "epoch": 4.810841332580463, - "grad_norm": 7.127353668212891, - "learning_rate": 1.134952004517222e-06, - "loss": 0.5741, + "grad_norm": 2.6382944583892822, + "learning_rate": 1.89158667419537e-05, + "loss": 0.5737, "step": 25560 }, { "epoch": 4.812723508375682, - "grad_norm": 16.197805404663086, - "learning_rate": 1.1236589497459063e-06, - "loss": 0.5069, + "grad_norm": 4.222015857696533, + "learning_rate": 1.8727649162431772e-05, + "loss": 0.5957, "step": 25570 }, { "epoch": 4.8146056841709015, - "grad_norm": 0.5227009057998657, - "learning_rate": 1.1123658949745907e-06, - "loss": 0.4081, + "grad_norm": 4.877696514129639, + "learning_rate": 1.8539431582909845e-05, + "loss": 0.5378, "step": 25580 }, { "epoch": 4.816487859966121, - "grad_norm": 16.148591995239258, - "learning_rate": 1.101072840203275e-06, - "loss": 0.5699, + "grad_norm": 2.3263192176818848, + "learning_rate": 1.8351214003387918e-05, + "loss": 0.6127, "step": 25590 }, { "epoch": 4.81837003576134, - "grad_norm": 1.3249871730804443, - "learning_rate": 1.0897797854319592e-06, - "loss": 0.4742, + "grad_norm": 5.47594690322876, + "learning_rate": 1.8162996423865988e-05, + "loss": 0.5668, "step": 25600 }, { "epoch": 4.820252211556559, - "grad_norm": 11.252976417541504, - "learning_rate": 1.0784867306606436e-06, - "loss": 0.5348, + "grad_norm": 0.9088131785392761, + "learning_rate": 1.797477884434406e-05, + "loss": 0.5909, "step": 25610 }, { "epoch": 4.822134387351778, - "grad_norm": 9.052772521972656, - "learning_rate": 1.0671936758893282e-06, - "loss": 0.5361, + "grad_norm": 3.0957508087158203, + "learning_rate": 1.7786561264822137e-05, + "loss": 0.6015, "step": 25620 }, { "epoch": 4.824016563146998, - "grad_norm": 34.760986328125, - "learning_rate": 1.0559006211180126e-06, - "loss": 0.4909, + "grad_norm": 6.333992004394531, + "learning_rate": 1.7598343685300207e-05, + "loss": 0.6555, "step": 25630 }, { "epoch": 4.825898738942217, - "grad_norm": 18.605308532714844, - "learning_rate": 1.0446075663466968e-06, - "loss": 0.3664, + "grad_norm": 2.639153480529785, + "learning_rate": 1.741012610577828e-05, + "loss": 0.3782, "step": 25640 }, { "epoch": 4.827780914737437, - "grad_norm": 1.8343195915222168, - "learning_rate": 1.0333145115753812e-06, - "loss": 0.2793, + "grad_norm": 0.8514479398727417, + "learning_rate": 1.7221908526256353e-05, + "loss": 0.2632, "step": 25650 }, { "epoch": 4.829663090532656, - "grad_norm": 6.831780433654785, - "learning_rate": 1.0220214568040655e-06, - "loss": 0.4601, + "grad_norm": 2.4760193824768066, + "learning_rate": 1.7033690946734426e-05, + "loss": 0.506, "step": 25660 }, { "epoch": 4.8315452663278755, - "grad_norm": 9.190847396850586, - "learning_rate": 1.0107284020327497e-06, - "loss": 0.3852, + "grad_norm": 1.6171988248825073, + "learning_rate": 1.6845473367212496e-05, + "loss": 0.4352, "step": 25670 }, { "epoch": 4.833427442123094, - "grad_norm": 10.883723258972168, - "learning_rate": 9.994353472614343e-07, - "loss": 0.3296, + "grad_norm": 3.2398006916046143, + "learning_rate": 1.6657255787690572e-05, + "loss": 0.4091, "step": 25680 }, { "epoch": 4.8353096179183135, - "grad_norm": 11.370898246765137, - "learning_rate": 9.881422924901187e-07, - "loss": 0.3896, + "grad_norm": 3.9767098426818848, + "learning_rate": 1.6469038208168645e-05, + "loss": 0.4436, "step": 25690 }, { "epoch": 4.837191793713533, - "grad_norm": 0.8723617792129517, - "learning_rate": 9.76849237718803e-07, - "loss": 0.4617, + "grad_norm": 0.5642136931419373, + "learning_rate": 1.6280820628646715e-05, + "loss": 0.4776, "step": 25700 }, { "epoch": 4.839073969508752, - "grad_norm": 2.9830756187438965, - "learning_rate": 9.655561829474872e-07, - "loss": 0.3264, + "grad_norm": 1.381508469581604, + "learning_rate": 1.6092603049124788e-05, + "loss": 0.412, "step": 25710 }, { "epoch": 4.840956145303972, - "grad_norm": 9.031828880310059, - "learning_rate": 9.542631281761716e-07, - "loss": 0.5838, + "grad_norm": 4.1336750984191895, + "learning_rate": 1.590438546960286e-05, + "loss": 0.6967, "step": 25720 }, { "epoch": 4.84283832109919, - "grad_norm": 37.80723571777344, - "learning_rate": 9.42970073404856e-07, - "loss": 0.2824, + "grad_norm": 4.109206199645996, + "learning_rate": 1.5716167890080934e-05, + "loss": 0.3631, "step": 25730 }, { "epoch": 4.84472049689441, - "grad_norm": 9.098519325256348, - "learning_rate": 9.316770186335404e-07, - "loss": 0.4279, + "grad_norm": 1.8715598583221436, + "learning_rate": 1.5527950310559007e-05, + "loss": 0.5251, "step": 25740 }, { "epoch": 4.846602672689629, - "grad_norm": 12.211234092712402, - "learning_rate": 9.203839638622247e-07, - "loss": 0.5721, + "grad_norm": 1.6625672578811646, + "learning_rate": 1.533973273103708e-05, + "loss": 0.4531, "step": 25750 }, { "epoch": 4.848484848484849, - "grad_norm": 1.9672722816467285, - "learning_rate": 9.090909090909091e-07, - "loss": 0.432, + "grad_norm": 1.0363481044769287, + "learning_rate": 1.5151515151515153e-05, + "loss": 0.5578, "step": 25760 }, { "epoch": 4.850367024280068, - "grad_norm": 18.3715763092041, - "learning_rate": 8.977978543195934e-07, - "loss": 0.5275, + "grad_norm": 12.811112403869629, + "learning_rate": 1.4963297571993224e-05, + "loss": 0.6469, "step": 25770 }, { "epoch": 4.852249200075287, - "grad_norm": 6.322066307067871, - "learning_rate": 8.865047995482778e-07, - "loss": 0.4312, + "grad_norm": 2.2655227184295654, + "learning_rate": 1.4775079992471297e-05, + "loss": 0.4671, "step": 25780 }, { "epoch": 4.854131375870506, - "grad_norm": 13.303924560546875, - "learning_rate": 8.752117447769622e-07, - "loss": 0.5374, + "grad_norm": 1.3414403200149536, + "learning_rate": 1.458686241294937e-05, + "loss": 0.5239, "step": 25790 }, { "epoch": 4.8560135516657255, - "grad_norm": 18.152463912963867, - "learning_rate": 8.639186900056466e-07, - "loss": 0.4755, + "grad_norm": 3.1365103721618652, + "learning_rate": 1.4398644833427442e-05, + "loss": 0.4356, "step": 25800 }, { "epoch": 4.857895727460945, - "grad_norm": 1.1153311729431152, - "learning_rate": 8.526256352343308e-07, - "loss": 0.5785, + "grad_norm": 0.4507659673690796, + "learning_rate": 1.4210427253905515e-05, + "loss": 0.5114, "step": 25810 }, { "epoch": 4.859777903256164, - "grad_norm": 17.98719024658203, - "learning_rate": 8.413325804630153e-07, - "loss": 0.4314, + "grad_norm": 2.2164368629455566, + "learning_rate": 1.4022209674383588e-05, + "loss": 0.4448, "step": 25820 }, { "epoch": 4.861660079051384, - "grad_norm": 5.713428020477295, - "learning_rate": 8.300395256916996e-07, - "loss": 0.5436, + "grad_norm": 1.65787672996521, + "learning_rate": 1.383399209486166e-05, + "loss": 0.5972, "step": 25830 }, { "epoch": 4.863542254846602, - "grad_norm": 17.330768585205078, - "learning_rate": 8.18746470920384e-07, - "loss": 0.3292, + "grad_norm": 2.5747125148773193, + "learning_rate": 1.3645774515339732e-05, + "loss": 0.3413, "step": 25840 }, { "epoch": 4.865424430641822, - "grad_norm": 0.1503809541463852, - "learning_rate": 8.074534161490684e-07, - "loss": 0.575, + "grad_norm": 0.23792870342731476, + "learning_rate": 1.3457556935817807e-05, + "loss": 0.496, "step": 25850 }, { "epoch": 4.867306606437041, - "grad_norm": 29.405603408813477, - "learning_rate": 7.961603613777526e-07, - "loss": 0.7215, + "grad_norm": 8.1236572265625, + "learning_rate": 1.3269339356295878e-05, + "loss": 0.9485, "step": 25860 }, { "epoch": 4.869188782232261, - "grad_norm": 5.5133209228515625, - "learning_rate": 7.84867306606437e-07, - "loss": 0.2398, + "grad_norm": 2.197131633758545, + "learning_rate": 1.3081121776773951e-05, + "loss": 0.4668, "step": 25870 }, { "epoch": 4.87107095802748, - "grad_norm": 10.060201644897461, - "learning_rate": 7.735742518351214e-07, - "loss": 0.7002, + "grad_norm": 3.55192494392395, + "learning_rate": 1.2892904197252024e-05, + "loss": 0.7436, "step": 25880 }, { "epoch": 4.8729531338226995, - "grad_norm": 31.25115966796875, - "learning_rate": 7.622811970638058e-07, - "loss": 0.5119, + "grad_norm": 2.4934020042419434, + "learning_rate": 1.2704686617730096e-05, + "loss": 0.5262, "step": 25890 }, { "epoch": 4.874835309617918, - "grad_norm": 11.812898635864258, - "learning_rate": 7.509881422924901e-07, - "loss": 0.622, + "grad_norm": 3.092714786529541, + "learning_rate": 1.2516469038208169e-05, + "loss": 0.6179, "step": 25900 }, { "epoch": 4.876717485413137, - "grad_norm": 6.675577163696289, - "learning_rate": 7.396950875211746e-07, - "loss": 0.3038, + "grad_norm": 2.5720813274383545, + "learning_rate": 1.2328251458686242e-05, + "loss": 0.4667, "step": 25910 }, { "epoch": 4.878599661208357, - "grad_norm": 9.983612060546875, - "learning_rate": 7.284020327498588e-07, - "loss": 0.7158, + "grad_norm": 5.265648365020752, + "learning_rate": 1.2140033879164315e-05, + "loss": 0.7364, "step": 25920 }, { "epoch": 4.880481837003576, - "grad_norm": 8.77939510345459, - "learning_rate": 7.171089779785432e-07, - "loss": 0.7233, + "grad_norm": 3.7148709297180176, + "learning_rate": 1.1951816299642386e-05, + "loss": 0.7846, "step": 25930 }, { "epoch": 4.882364012798796, - "grad_norm": 11.423147201538086, - "learning_rate": 7.058159232072276e-07, - "loss": 0.8275, + "grad_norm": 3.3122756481170654, + "learning_rate": 1.176359872012046e-05, + "loss": 0.8727, "step": 25940 }, { "epoch": 4.884246188594014, - "grad_norm": 11.282443046569824, - "learning_rate": 6.94522868435912e-07, - "loss": 0.6037, + "grad_norm": 3.0102932453155518, + "learning_rate": 1.1575381140598532e-05, + "loss": 0.6746, "step": 25950 }, { "epoch": 4.886128364389234, - "grad_norm": 34.89209747314453, - "learning_rate": 6.832298136645963e-07, - "loss": 0.6057, + "grad_norm": 3.3670566082000732, + "learning_rate": 1.1387163561076605e-05, + "loss": 0.5846, "step": 25960 }, { "epoch": 4.888010540184453, - "grad_norm": 13.334453582763672, - "learning_rate": 6.719367588932806e-07, - "loss": 0.5336, + "grad_norm": 3.1409409046173096, + "learning_rate": 1.1198945981554678e-05, + "loss": 0.5595, "step": 25970 }, { "epoch": 4.889892715979673, - "grad_norm": 43.71876525878906, - "learning_rate": 6.60643704121965e-07, - "loss": 0.4498, + "grad_norm": 5.394914627075195, + "learning_rate": 1.101072840203275e-05, + "loss": 0.4697, "step": 25980 }, { "epoch": 4.891774891774892, - "grad_norm": 4.845729351043701, - "learning_rate": 6.493506493506493e-07, - "loss": 0.4448, + "grad_norm": 1.916937232017517, + "learning_rate": 1.0822510822510823e-05, + "loss": 0.5682, "step": 25990 }, { "epoch": 4.893657067570111, - "grad_norm": 10.815642356872559, - "learning_rate": 6.380575945793338e-07, - "loss": 0.5838, + "grad_norm": 2.7022321224212646, + "learning_rate": 1.0634293242988896e-05, + "loss": 0.453, "step": 26000 }, { "epoch": 4.89553924336533, - "grad_norm": 17.238981246948242, - "learning_rate": 6.267645398080181e-07, - "loss": 0.444, + "grad_norm": 6.567044734954834, + "learning_rate": 1.0446075663466969e-05, + "loss": 0.5277, "step": 26010 }, { "epoch": 4.897421419160549, - "grad_norm": 10.10824966430664, - "learning_rate": 6.154714850367024e-07, - "loss": 0.6663, + "grad_norm": 2.0348784923553467, + "learning_rate": 1.025785808394504e-05, + "loss": 0.7916, "step": 26020 }, { "epoch": 4.899303594955769, - "grad_norm": 33.35478973388672, - "learning_rate": 6.041784302653868e-07, - "loss": 0.4297, + "grad_norm": 6.761394023895264, + "learning_rate": 1.0069640504423115e-05, + "loss": 0.4096, "step": 26030 }, { "epoch": 4.901185770750988, - "grad_norm": 0.8835696578025818, - "learning_rate": 5.928853754940712e-07, - "loss": 0.6774, + "grad_norm": 1.9423185586929321, + "learning_rate": 9.881422924901186e-06, + "loss": 0.77, "step": 26040 }, { "epoch": 4.903067946546208, - "grad_norm": 5.606777191162109, - "learning_rate": 5.815923207227555e-07, - "loss": 0.3105, + "grad_norm": 1.9975135326385498, + "learning_rate": 9.693205345379259e-06, + "loss": 0.2964, "step": 26050 }, { "epoch": 4.904950122341427, - "grad_norm": 13.368224143981934, - "learning_rate": 5.7029926595144e-07, - "loss": 0.6274, + "grad_norm": 2.800941228866577, + "learning_rate": 9.504987765857332e-06, + "loss": 0.7167, "step": 26060 }, { "epoch": 4.906832298136646, - "grad_norm": 17.68413543701172, - "learning_rate": 5.590062111801243e-07, - "loss": 0.5218, + "grad_norm": 3.057074546813965, + "learning_rate": 9.316770186335403e-06, + "loss": 0.6159, "step": 26070 }, { "epoch": 4.908714473931865, - "grad_norm": 31.081344604492188, - "learning_rate": 5.477131564088085e-07, - "loss": 0.5692, + "grad_norm": 7.213024616241455, + "learning_rate": 9.128552606813476e-06, + "loss": 0.4206, "step": 26080 }, { "epoch": 4.9105966497270845, - "grad_norm": 11.856854438781738, - "learning_rate": 5.36420101637493e-07, - "loss": 0.6386, + "grad_norm": 1.9982532262802124, + "learning_rate": 8.94033502729155e-06, + "loss": 0.8574, "step": 26090 }, { "epoch": 4.912478825522304, - "grad_norm": 6.689940929412842, - "learning_rate": 5.251270468661773e-07, - "loss": 0.3447, + "grad_norm": 2.2955069541931152, + "learning_rate": 8.752117447769623e-06, + "loss": 0.4433, "step": 26100 }, { "epoch": 4.914361001317523, - "grad_norm": 36.05409622192383, - "learning_rate": 5.138339920948617e-07, - "loss": 0.2413, + "grad_norm": 5.474578857421875, + "learning_rate": 8.563899868247694e-06, + "loss": 0.3454, "step": 26110 }, { "epoch": 4.916243177112742, - "grad_norm": 21.475088119506836, - "learning_rate": 5.02540937323546e-07, - "loss": 0.6389, + "grad_norm": 3.1911139488220215, + "learning_rate": 8.375682288725767e-06, + "loss": 0.5058, "step": 26120 }, { "epoch": 4.918125352907961, - "grad_norm": 1.5936756134033203, - "learning_rate": 4.912478825522304e-07, - "loss": 0.4807, + "grad_norm": 2.5135414600372314, + "learning_rate": 8.18746470920384e-06, + "loss": 0.6098, "step": 26130 }, { "epoch": 4.920007528703181, - "grad_norm": 28.940305709838867, - "learning_rate": 4.799548277809147e-07, - "loss": 0.4051, + "grad_norm": 4.524852275848389, + "learning_rate": 7.999247129681913e-06, + "loss": 0.4982, "step": 26140 }, { "epoch": 4.9218897044984, - "grad_norm": 5.989145755767822, - "learning_rate": 4.686617730095991e-07, - "loss": 0.442, + "grad_norm": 1.1532628536224365, + "learning_rate": 7.811029550159986e-06, + "loss": 0.4741, "step": 26150 }, { "epoch": 4.92377188029362, - "grad_norm": 11.877305030822754, - "learning_rate": 4.573687182382835e-07, - "loss": 0.6841, + "grad_norm": 2.0357584953308105, + "learning_rate": 7.622811970638058e-06, + "loss": 0.7255, "step": 26160 }, { "epoch": 4.925654056088838, - "grad_norm": 10.223360061645508, - "learning_rate": 4.460756634669678e-07, - "loss": 0.5501, + "grad_norm": 4.0735344886779785, + "learning_rate": 7.43459439111613e-06, + "loss": 0.706, "step": 26170 }, { "epoch": 4.927536231884058, - "grad_norm": 6.764021396636963, - "learning_rate": 4.347826086956522e-07, - "loss": 0.711, + "grad_norm": 2.2796082496643066, + "learning_rate": 7.246376811594203e-06, + "loss": 0.7243, "step": 26180 }, { "epoch": 4.929418407679277, - "grad_norm": 9.022865295410156, - "learning_rate": 4.2348955392433657e-07, - "loss": 0.5528, + "grad_norm": 1.2028344869613647, + "learning_rate": 7.0581592320722764e-06, + "loss": 0.628, "step": 26190 }, { "epoch": 4.9313005834744965, - "grad_norm": 0.19420573115348816, - "learning_rate": 4.121964991530209e-07, - "loss": 0.8545, + "grad_norm": 0.6326656937599182, + "learning_rate": 6.869941652550348e-06, + "loss": 0.8906, "step": 26200 }, { "epoch": 4.933182759269716, - "grad_norm": 32.37028884887695, - "learning_rate": 4.009034443817053e-07, - "loss": 0.6968, + "grad_norm": 14.477922439575195, + "learning_rate": 6.681724073028421e-06, + "loss": 0.8574, "step": 26210 }, { "epoch": 4.935064935064935, - "grad_norm": 5.03609561920166, - "learning_rate": 3.8961038961038966e-07, - "loss": 0.5307, + "grad_norm": 2.4440884590148926, + "learning_rate": 6.493506493506494e-06, + "loss": 0.4625, "step": 26220 }, { "epoch": 4.936947110860155, - "grad_norm": 7.052425861358643, - "learning_rate": 3.78317334839074e-07, - "loss": 0.8121, + "grad_norm": 1.841376543045044, + "learning_rate": 6.305288913984566e-06, + "loss": 0.7268, "step": 26230 }, { "epoch": 4.938829286655373, - "grad_norm": 0.8062214255332947, - "learning_rate": 3.670242800677583e-07, - "loss": 0.4944, + "grad_norm": 0.1650257259607315, + "learning_rate": 6.117071334462639e-06, + "loss": 0.4164, "step": 26240 }, { "epoch": 4.940711462450593, - "grad_norm": 10.001143455505371, - "learning_rate": 3.5573122529644265e-07, - "loss": 0.3683, + "grad_norm": 2.4682869911193848, + "learning_rate": 5.928853754940711e-06, + "loss": 0.4473, "step": 26250 }, { "epoch": 4.942593638245812, - "grad_norm": 1.526486873626709, - "learning_rate": 3.4443817052512703e-07, - "loss": 0.3605, + "grad_norm": 0.20925642549991608, + "learning_rate": 5.740636175418784e-06, + "loss": 0.4475, "step": 26260 }, { "epoch": 4.944475814041032, - "grad_norm": 11.505463600158691, - "learning_rate": 3.331451157538114e-07, - "loss": 0.498, + "grad_norm": 0.88543301820755, + "learning_rate": 5.552418595896857e-06, + "loss": 0.5341, "step": 26270 }, { "epoch": 4.946357989836251, - "grad_norm": 18.725603103637695, - "learning_rate": 3.2185206098249575e-07, - "loss": 0.3436, + "grad_norm": 3.832179307937622, + "learning_rate": 5.3642010163749295e-06, + "loss": 0.4383, "step": 26280 }, { "epoch": 4.94824016563147, - "grad_norm": 8.331537246704102, - "learning_rate": 3.1055900621118013e-07, - "loss": 0.5627, + "grad_norm": 1.4329594373703003, + "learning_rate": 5.175983436853002e-06, + "loss": 0.5517, "step": 26290 }, { "epoch": 4.950122341426689, - "grad_norm": 10.584824562072754, - "learning_rate": 2.992659514398645e-07, - "loss": 0.4182, + "grad_norm": 4.162771701812744, + "learning_rate": 4.987765857331075e-06, + "loss": 0.4524, "step": 26300 }, { "epoch": 4.9520045172219085, - "grad_norm": 0.15740695595741272, - "learning_rate": 2.8797289666854884e-07, - "loss": 0.5521, + "grad_norm": 0.10146909952163696, + "learning_rate": 4.799548277809147e-06, + "loss": 0.5414, "step": 26310 }, { "epoch": 4.953886693017128, - "grad_norm": 11.263009071350098, - "learning_rate": 2.766798418972332e-07, - "loss": 0.4944, + "grad_norm": 2.9978926181793213, + "learning_rate": 4.61133069828722e-06, + "loss": 0.5602, "step": 26320 }, { "epoch": 4.955768868812347, - "grad_norm": 7.119065284729004, - "learning_rate": 2.653867871259176e-07, - "loss": 0.7362, + "grad_norm": 3.2251737117767334, + "learning_rate": 4.423113118765293e-06, + "loss": 0.7076, "step": 26330 }, { "epoch": 4.957651044607566, - "grad_norm": 1.8257107734680176, - "learning_rate": 2.5409373235460193e-07, - "loss": 0.1585, + "grad_norm": 1.7196509838104248, + "learning_rate": 4.234895539243365e-06, + "loss": 0.2048, "step": 26340 }, { "epoch": 4.959533220402785, - "grad_norm": 33.10144805908203, - "learning_rate": 2.4280067758328626e-07, - "loss": 0.4476, + "grad_norm": 1.6203038692474365, + "learning_rate": 4.046677959721438e-06, + "loss": 0.4421, "step": 26350 }, { "epoch": 4.961415396198005, - "grad_norm": 2.7676703929901123, - "learning_rate": 2.3150762281197064e-07, - "loss": 0.366, + "grad_norm": 2.4939804077148438, + "learning_rate": 3.85846038019951e-06, + "loss": 0.5145, "step": 26360 }, { "epoch": 4.963297571993224, - "grad_norm": 0.12121053785085678, - "learning_rate": 2.2021456804065503e-07, - "loss": 0.4664, + "grad_norm": 0.217250794172287, + "learning_rate": 3.6702428006775834e-06, + "loss": 0.5779, "step": 26370 }, { "epoch": 4.965179747788444, - "grad_norm": 0.2563054859638214, - "learning_rate": 2.0892151326933935e-07, - "loss": 0.3163, + "grad_norm": 1.894600510597229, + "learning_rate": 3.482025221155656e-06, + "loss": 0.4681, "step": 26380 }, { "epoch": 4.967061923583663, - "grad_norm": 15.612191200256348, - "learning_rate": 1.976284584980237e-07, - "loss": 0.4692, + "grad_norm": 4.688291549682617, + "learning_rate": 3.2938076416337287e-06, + "loss": 0.6112, "step": 26390 }, { "epoch": 4.9689440993788825, - "grad_norm": 12.518272399902344, - "learning_rate": 1.8633540372670807e-07, - "loss": 0.5011, + "grad_norm": 3.2292239665985107, + "learning_rate": 3.1055900621118013e-06, + "loss": 0.5831, "step": 26400 }, { "epoch": 4.970826275174101, - "grad_norm": 8.736549377441406, - "learning_rate": 1.7504234895539245e-07, - "loss": 0.7325, + "grad_norm": 2.3493993282318115, + "learning_rate": 2.917372482589874e-06, + "loss": 0.5896, "step": 26410 }, { "epoch": 4.9727084509693205, - "grad_norm": 45.901878356933594, - "learning_rate": 1.637492941840768e-07, - "loss": 0.5119, + "grad_norm": 6.07112455368042, + "learning_rate": 2.7291549030679465e-06, + "loss": 0.6324, "step": 26420 }, { "epoch": 4.97459062676454, - "grad_norm": 15.459980010986328, - "learning_rate": 1.5245623941276113e-07, - "loss": 0.4057, + "grad_norm": 1.9999442100524902, + "learning_rate": 2.540937323546019e-06, + "loss": 0.3659, "step": 26430 }, { "epoch": 4.976472802559759, - "grad_norm": 12.372354507446289, - "learning_rate": 1.4116318464144551e-07, - "loss": 0.9019, + "grad_norm": 1.9524005651474, + "learning_rate": 2.352719744024092e-06, + "loss": 0.7785, "step": 26440 }, { "epoch": 4.978354978354979, - "grad_norm": 8.161259651184082, - "learning_rate": 1.2987012987012987e-07, - "loss": 0.3162, + "grad_norm": 1.6963441371917725, + "learning_rate": 2.1645021645021648e-06, + "loss": 0.3846, "step": 26450 }, { "epoch": 4.980237154150197, - "grad_norm": 2.586444854736328, - "learning_rate": 1.1857707509881423e-07, - "loss": 0.6493, + "grad_norm": 2.9382524490356445, + "learning_rate": 1.976284584980237e-06, + "loss": 0.6171, "step": 26460 }, { "epoch": 4.982119329945417, - "grad_norm": 0.4139041602611542, - "learning_rate": 1.0728402032749858e-07, - "loss": 0.3598, + "grad_norm": 0.4651086628437042, + "learning_rate": 1.7880670054583098e-06, + "loss": 0.3558, "step": 26470 }, { "epoch": 4.984001505740636, - "grad_norm": 0.5446306467056274, - "learning_rate": 9.599096555618295e-08, - "loss": 0.4985, + "grad_norm": 4.864592552185059, + "learning_rate": 1.5998494259363826e-06, + "loss": 0.5754, "step": 26480 }, { "epoch": 4.985883681535856, - "grad_norm": 14.608466148376465, - "learning_rate": 8.46979107848673e-08, - "loss": 0.3913, + "grad_norm": 2.7198994159698486, + "learning_rate": 1.411631846414455e-06, + "loss": 0.4226, "step": 26490 }, { "epoch": 4.987765857331075, - "grad_norm": 27.869840621948242, - "learning_rate": 7.340485601355166e-08, - "loss": 0.5136, + "grad_norm": 3.3242056369781494, + "learning_rate": 1.2234142668925278e-06, + "loss": 0.6177, "step": 26500 }, { "epoch": 4.989648033126294, - "grad_norm": 7.235048294067383, - "learning_rate": 6.211180124223603e-08, - "loss": 0.5271, + "grad_norm": 3.1476473808288574, + "learning_rate": 1.0351966873706004e-06, + "loss": 0.5798, "step": 26510 }, { "epoch": 4.991530208921513, - "grad_norm": 16.698986053466797, - "learning_rate": 5.0818746470920386e-08, - "loss": 0.6373, + "grad_norm": 2.154055118560791, + "learning_rate": 8.469791078486731e-07, + "loss": 0.7016, "step": 26520 }, { "epoch": 4.993412384716732, - "grad_norm": 7.239467620849609, - "learning_rate": 3.952569169960475e-08, - "loss": 0.4937, + "grad_norm": 9.276118278503418, + "learning_rate": 6.587615283267458e-07, + "loss": 0.4034, "step": 26530 }, { "epoch": 4.995294560511952, - "grad_norm": 33.553871154785156, - "learning_rate": 2.8232636928289104e-08, - "loss": 0.6561, + "grad_norm": 5.92098331451416, + "learning_rate": 4.7054394880481837e-07, + "loss": 0.7285, "step": 26540 }, { "epoch": 4.997176736307171, - "grad_norm": 11.409107208251953, - "learning_rate": 1.693958215697346e-08, - "loss": 0.5426, + "grad_norm": 1.5935169458389282, + "learning_rate": 2.8232636928289103e-07, + "loss": 0.6105, "step": 26550 }, { "epoch": 4.999058912102391, - "grad_norm": 26.004404067993164, - "learning_rate": 5.6465273856578205e-09, - "loss": 0.4763, + "grad_norm": 2.964055061340332, + "learning_rate": 9.410878976096367e-08, + "loss": 0.4711, "step": 26560 }, { "epoch": 5.0, - "eval_accuracy": 0.9252, - "eval_loss": 0.2850923240184784, - "eval_runtime": 277.3592, - "eval_samples_per_second": 27.041, - "eval_steps_per_second": 3.382, + "eval_accuracy": 0.9236, + "eval_loss": 0.2550007104873657, + "eval_runtime": 89.7951, + "eval_samples_per_second": 83.523, + "eval_steps_per_second": 10.446, "step": 26565 }, { "epoch": 5.0, "step": 26565, - "total_flos": 1.64815115092992e+19, - "train_loss": 1.0248974845253678, - "train_runtime": 12727.7584, - "train_samples_per_second": 16.696, - "train_steps_per_second": 2.087 + "total_flos": 1.66660717676544e+19, + "train_loss": 0.7802943237124471, + "train_runtime": 11340.9032, + "train_samples_per_second": 18.737, + "train_steps_per_second": 2.342 } ], "logging_steps": 10, @@ -18672,7 +18672,7 @@ "attributes": {} } }, - "total_flos": 1.64815115092992e+19, + "total_flos": 1.66660717676544e+19, "train_batch_size": 8, "trial_name": null, "trial_params": null