{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997584346565747, "eval_steps": 500, "global_step": 3104, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.1600731611251831, "learning_rate": 2.9999999999999997e-06, "loss": 1.2092, "step": 1 }, { "epoch": 0.0, "grad_norm": 0.18172824382781982, "learning_rate": 5.999999999999999e-06, "loss": 1.3776, "step": 2 }, { "epoch": 0.0, "grad_norm": 0.16943985223770142, "learning_rate": 8.999999999999999e-06, "loss": 1.3126, "step": 3 }, { "epoch": 0.0, "grad_norm": 0.13296286761760712, "learning_rate": 1.1999999999999999e-05, "loss": 1.3224, "step": 4 }, { "epoch": 0.0, "grad_norm": 0.14123758673667908, "learning_rate": 1.4999999999999999e-05, "loss": 1.4569, "step": 5 }, { "epoch": 0.0, "grad_norm": 0.1710270494222641, "learning_rate": 1.7999999999999997e-05, "loss": 1.2856, "step": 6 }, { "epoch": 0.0, "grad_norm": 0.18251967430114746, "learning_rate": 2.1e-05, "loss": 1.3047, "step": 7 }, { "epoch": 0.0, "grad_norm": 0.13840997219085693, "learning_rate": 2.3999999999999997e-05, "loss": 1.4455, "step": 8 }, { "epoch": 0.0, "grad_norm": 0.1863042563199997, "learning_rate": 2.6999999999999996e-05, "loss": 1.4185, "step": 9 }, { "epoch": 0.0, "grad_norm": 0.22244535386562347, "learning_rate": 2.9999999999999997e-05, "loss": 1.2023, "step": 10 }, { "epoch": 0.0, "grad_norm": 0.18667283654212952, "learning_rate": 3.2999999999999996e-05, "loss": 1.276, "step": 11 }, { "epoch": 0.0, "grad_norm": 0.20198574662208557, "learning_rate": 3.5999999999999994e-05, "loss": 1.2276, "step": 12 }, { "epoch": 0.0, "grad_norm": 0.16169218719005585, "learning_rate": 3.9e-05, "loss": 1.3412, "step": 13 }, { "epoch": 0.0, "grad_norm": 0.15591345727443695, "learning_rate": 4.2e-05, "loss": 1.3133, "step": 14 }, { "epoch": 0.0, "grad_norm": 0.17219580709934235, "learning_rate": 4.4999999999999996e-05, "loss": 1.2028, "step": 15 }, { "epoch": 0.01, "grad_norm": 0.2136317938566208, "learning_rate": 4.7999999999999994e-05, "loss": 1.3106, "step": 16 }, { "epoch": 0.01, "grad_norm": 0.24422112107276917, "learning_rate": 5.1e-05, "loss": 1.341, "step": 17 }, { "epoch": 0.01, "grad_norm": 0.1567581444978714, "learning_rate": 5.399999999999999e-05, "loss": 1.2432, "step": 18 }, { "epoch": 0.01, "grad_norm": 0.1834290623664856, "learning_rate": 5.6999999999999996e-05, "loss": 1.2465, "step": 19 }, { "epoch": 0.01, "grad_norm": 0.18374434113502502, "learning_rate": 5.9999999999999995e-05, "loss": 1.364, "step": 20 }, { "epoch": 0.01, "grad_norm": 0.2012396603822708, "learning_rate": 6.299999999999999e-05, "loss": 1.1013, "step": 21 }, { "epoch": 0.01, "grad_norm": 0.2053655982017517, "learning_rate": 6.599999999999999e-05, "loss": 1.4885, "step": 22 }, { "epoch": 0.01, "grad_norm": 0.23425662517547607, "learning_rate": 6.9e-05, "loss": 1.4354, "step": 23 }, { "epoch": 0.01, "grad_norm": 0.21764937043190002, "learning_rate": 7.199999999999999e-05, "loss": 1.3118, "step": 24 }, { "epoch": 0.01, "grad_norm": 0.28006067872047424, "learning_rate": 7.5e-05, "loss": 1.3169, "step": 25 }, { "epoch": 0.01, "grad_norm": 0.2246096432209015, "learning_rate": 7.8e-05, "loss": 1.0421, "step": 26 }, { "epoch": 0.01, "grad_norm": 0.18318617343902588, "learning_rate": 8.1e-05, "loss": 1.3792, "step": 27 }, { "epoch": 0.01, "grad_norm": 0.1860298067331314, "learning_rate": 8.4e-05, "loss": 1.1395, "step": 28 }, { "epoch": 0.01, "grad_norm": 0.19276364147663116, "learning_rate": 8.699999999999999e-05, "loss": 1.152, "step": 29 }, { "epoch": 0.01, "grad_norm": 0.1895924210548401, "learning_rate": 8.999999999999999e-05, "loss": 1.184, "step": 30 }, { "epoch": 0.01, "grad_norm": 0.20944935083389282, "learning_rate": 9.3e-05, "loss": 1.2572, "step": 31 }, { "epoch": 0.01, "grad_norm": 0.22710011899471283, "learning_rate": 9.599999999999999e-05, "loss": 1.5693, "step": 32 }, { "epoch": 0.01, "grad_norm": 0.18556331098079681, "learning_rate": 9.9e-05, "loss": 1.1317, "step": 33 }, { "epoch": 0.01, "grad_norm": 0.2806234359741211, "learning_rate": 0.000102, "loss": 1.3453, "step": 34 }, { "epoch": 0.01, "grad_norm": 0.18875961005687714, "learning_rate": 0.00010499999999999999, "loss": 1.4719, "step": 35 }, { "epoch": 0.01, "grad_norm": 0.22078992426395416, "learning_rate": 0.00010799999999999998, "loss": 1.3228, "step": 36 }, { "epoch": 0.01, "grad_norm": 0.16293686628341675, "learning_rate": 0.00011099999999999999, "loss": 1.2622, "step": 37 }, { "epoch": 0.01, "grad_norm": 0.2033080905675888, "learning_rate": 0.00011399999999999999, "loss": 1.2024, "step": 38 }, { "epoch": 0.01, "grad_norm": 0.16291578114032745, "learning_rate": 0.000117, "loss": 1.2014, "step": 39 }, { "epoch": 0.01, "grad_norm": 0.17717377841472626, "learning_rate": 0.00011999999999999999, "loss": 1.2232, "step": 40 }, { "epoch": 0.01, "grad_norm": 0.16549211740493774, "learning_rate": 0.00012299999999999998, "loss": 1.2905, "step": 41 }, { "epoch": 0.01, "grad_norm": 0.18767574429512024, "learning_rate": 0.00012599999999999997, "loss": 1.038, "step": 42 }, { "epoch": 0.01, "grad_norm": 0.19393223524093628, "learning_rate": 0.000129, "loss": 1.3551, "step": 43 }, { "epoch": 0.01, "grad_norm": 0.16287484765052795, "learning_rate": 0.00013199999999999998, "loss": 1.1724, "step": 44 }, { "epoch": 0.01, "grad_norm": 0.15733399987220764, "learning_rate": 0.000135, "loss": 1.2932, "step": 45 }, { "epoch": 0.01, "grad_norm": 0.1465727686882019, "learning_rate": 0.000138, "loss": 1.4175, "step": 46 }, { "epoch": 0.02, "grad_norm": 0.17167991399765015, "learning_rate": 0.00014099999999999998, "loss": 1.3516, "step": 47 }, { "epoch": 0.02, "grad_norm": 0.1732979714870453, "learning_rate": 0.00014399999999999998, "loss": 1.3156, "step": 48 }, { "epoch": 0.02, "grad_norm": 0.1421053111553192, "learning_rate": 0.000147, "loss": 1.2487, "step": 49 }, { "epoch": 0.02, "grad_norm": 0.15713661909103394, "learning_rate": 0.00015, "loss": 1.2375, "step": 50 }, { "epoch": 0.02, "grad_norm": 0.17428313195705414, "learning_rate": 0.00015299999999999998, "loss": 1.1914, "step": 51 }, { "epoch": 0.02, "grad_norm": 0.16010093688964844, "learning_rate": 0.000156, "loss": 1.0435, "step": 52 }, { "epoch": 0.02, "grad_norm": 0.1384117305278778, "learning_rate": 0.000159, "loss": 1.1366, "step": 53 }, { "epoch": 0.02, "grad_norm": 0.17006906867027283, "learning_rate": 0.000162, "loss": 1.1331, "step": 54 }, { "epoch": 0.02, "grad_norm": 0.15873034298419952, "learning_rate": 0.000165, "loss": 1.1048, "step": 55 }, { "epoch": 0.02, "grad_norm": 0.16062800586223602, "learning_rate": 0.000168, "loss": 1.183, "step": 56 }, { "epoch": 0.02, "grad_norm": 0.1499948501586914, "learning_rate": 0.00017099999999999998, "loss": 1.4005, "step": 57 }, { "epoch": 0.02, "grad_norm": 0.1598653346300125, "learning_rate": 0.00017399999999999997, "loss": 1.1654, "step": 58 }, { "epoch": 0.02, "grad_norm": 0.17563337087631226, "learning_rate": 0.00017699999999999997, "loss": 1.5385, "step": 59 }, { "epoch": 0.02, "grad_norm": 0.1687101423740387, "learning_rate": 0.00017999999999999998, "loss": 1.3827, "step": 60 }, { "epoch": 0.02, "grad_norm": 0.13692913949489594, "learning_rate": 0.00018299999999999998, "loss": 1.2653, "step": 61 }, { "epoch": 0.02, "grad_norm": 0.13749787211418152, "learning_rate": 0.000186, "loss": 1.3527, "step": 62 }, { "epoch": 0.02, "grad_norm": 0.1487198770046234, "learning_rate": 0.00018899999999999999, "loss": 1.3718, "step": 63 }, { "epoch": 0.02, "grad_norm": 0.3944415748119354, "learning_rate": 0.00019199999999999998, "loss": 0.8982, "step": 64 }, { "epoch": 0.02, "grad_norm": 0.14923959970474243, "learning_rate": 0.000195, "loss": 1.4096, "step": 65 }, { "epoch": 0.02, "grad_norm": 0.17287668585777283, "learning_rate": 0.000198, "loss": 1.2961, "step": 66 }, { "epoch": 0.02, "grad_norm": 0.13826805353164673, "learning_rate": 0.000201, "loss": 1.3034, "step": 67 }, { "epoch": 0.02, "grad_norm": 0.1515992134809494, "learning_rate": 0.000204, "loss": 1.2896, "step": 68 }, { "epoch": 0.02, "grad_norm": 0.15631988644599915, "learning_rate": 0.00020699999999999996, "loss": 1.3542, "step": 69 }, { "epoch": 0.02, "grad_norm": 0.1843297928571701, "learning_rate": 0.00020999999999999998, "loss": 1.3337, "step": 70 }, { "epoch": 0.02, "grad_norm": 0.18046703934669495, "learning_rate": 0.00021299999999999997, "loss": 1.3133, "step": 71 }, { "epoch": 0.02, "grad_norm": 0.1389905959367752, "learning_rate": 0.00021599999999999996, "loss": 1.2541, "step": 72 }, { "epoch": 0.02, "grad_norm": 0.1717814803123474, "learning_rate": 0.00021899999999999998, "loss": 1.2473, "step": 73 }, { "epoch": 0.02, "grad_norm": 0.21069203317165375, "learning_rate": 0.00022199999999999998, "loss": 0.9712, "step": 74 }, { "epoch": 0.02, "grad_norm": 0.15687905251979828, "learning_rate": 0.000225, "loss": 1.4761, "step": 75 }, { "epoch": 0.02, "grad_norm": 0.15336477756500244, "learning_rate": 0.00022799999999999999, "loss": 1.501, "step": 76 }, { "epoch": 0.02, "grad_norm": 0.20116883516311646, "learning_rate": 0.00023099999999999998, "loss": 1.3941, "step": 77 }, { "epoch": 0.03, "grad_norm": 0.255719393491745, "learning_rate": 0.000234, "loss": 1.2055, "step": 78 }, { "epoch": 0.03, "grad_norm": 0.17634828388690948, "learning_rate": 0.000237, "loss": 1.3754, "step": 79 }, { "epoch": 0.03, "grad_norm": 0.1286117434501648, "learning_rate": 0.00023999999999999998, "loss": 1.2388, "step": 80 }, { "epoch": 0.03, "grad_norm": 0.1580284982919693, "learning_rate": 0.000243, "loss": 1.4178, "step": 81 }, { "epoch": 0.03, "grad_norm": 0.1607348769903183, "learning_rate": 0.00024599999999999996, "loss": 1.3483, "step": 82 }, { "epoch": 0.03, "grad_norm": 0.20425604283809662, "learning_rate": 0.000249, "loss": 1.3633, "step": 83 }, { "epoch": 0.03, "grad_norm": 0.15520161390304565, "learning_rate": 0.00025199999999999995, "loss": 1.3877, "step": 84 }, { "epoch": 0.03, "grad_norm": 0.15836600959300995, "learning_rate": 0.00025499999999999996, "loss": 1.3523, "step": 85 }, { "epoch": 0.03, "grad_norm": 0.13606102764606476, "learning_rate": 0.000258, "loss": 1.3434, "step": 86 }, { "epoch": 0.03, "grad_norm": 0.16367009282112122, "learning_rate": 0.000261, "loss": 1.2937, "step": 87 }, { "epoch": 0.03, "grad_norm": 0.15520524978637695, "learning_rate": 0.00026399999999999997, "loss": 1.2968, "step": 88 }, { "epoch": 0.03, "grad_norm": 0.15542207658290863, "learning_rate": 0.000267, "loss": 1.3183, "step": 89 }, { "epoch": 0.03, "grad_norm": 0.15033961832523346, "learning_rate": 0.00027, "loss": 1.3185, "step": 90 }, { "epoch": 0.03, "grad_norm": 0.25355687737464905, "learning_rate": 0.00027299999999999997, "loss": 1.2811, "step": 91 }, { "epoch": 0.03, "grad_norm": 0.13350434601306915, "learning_rate": 0.000276, "loss": 1.2975, "step": 92 }, { "epoch": 0.03, "grad_norm": 0.16181416809558868, "learning_rate": 0.000279, "loss": 1.1305, "step": 93 }, { "epoch": 0.03, "grad_norm": 0.1898832470178604, "learning_rate": 0.00028199999999999997, "loss": 1.3409, "step": 94 }, { "epoch": 0.03, "grad_norm": 0.18393121659755707, "learning_rate": 0.000285, "loss": 1.297, "step": 95 }, { "epoch": 0.03, "grad_norm": 0.15223050117492676, "learning_rate": 0.00028799999999999995, "loss": 1.2074, "step": 96 }, { "epoch": 0.03, "grad_norm": 0.1481645107269287, "learning_rate": 0.00029099999999999997, "loss": 1.3209, "step": 97 }, { "epoch": 0.03, "grad_norm": 0.14738555252552032, "learning_rate": 0.000294, "loss": 1.2134, "step": 98 }, { "epoch": 0.03, "grad_norm": 0.1728326678276062, "learning_rate": 0.00029699999999999996, "loss": 1.4452, "step": 99 }, { "epoch": 0.03, "grad_norm": 0.16517844796180725, "learning_rate": 0.0003, "loss": 1.2751, "step": 100 }, { "epoch": 0.03, "grad_norm": 0.16002611815929413, "learning_rate": 0.0002999999179721908, "loss": 1.3291, "step": 101 }, { "epoch": 0.03, "grad_norm": 0.16053710877895355, "learning_rate": 0.00029999967188885294, "loss": 1.4194, "step": 102 }, { "epoch": 0.03, "grad_norm": 0.14420418441295624, "learning_rate": 0.0002999992617502555, "loss": 1.1549, "step": 103 }, { "epoch": 0.03, "grad_norm": 0.16145503520965576, "learning_rate": 0.0002999986875568472, "loss": 1.2118, "step": 104 }, { "epoch": 0.03, "grad_norm": 0.16058529913425446, "learning_rate": 0.0002999979493092558, "loss": 1.1406, "step": 105 }, { "epoch": 0.03, "grad_norm": 0.16317486763000488, "learning_rate": 0.00029999704700828895, "loss": 1.1145, "step": 106 }, { "epoch": 0.03, "grad_norm": 0.15758004784584045, "learning_rate": 0.0002999959806549334, "loss": 1.2719, "step": 107 }, { "epoch": 0.03, "grad_norm": 0.14353294670581818, "learning_rate": 0.0002999947502503554, "loss": 1.2814, "step": 108 }, { "epoch": 0.04, "grad_norm": 0.28295430541038513, "learning_rate": 0.00029999335579590074, "loss": 1.4914, "step": 109 }, { "epoch": 0.04, "grad_norm": 0.19108439981937408, "learning_rate": 0.00029999179729309444, "loss": 1.1197, "step": 110 }, { "epoch": 0.04, "grad_norm": 0.1572168618440628, "learning_rate": 0.0002999900747436411, "loss": 1.4034, "step": 111 }, { "epoch": 0.04, "grad_norm": 0.1664152294397354, "learning_rate": 0.00029998818814942464, "loss": 1.3144, "step": 112 }, { "epoch": 0.04, "grad_norm": 0.17340485751628876, "learning_rate": 0.0002999861375125085, "loss": 0.9677, "step": 113 }, { "epoch": 0.04, "grad_norm": 0.14122898876667023, "learning_rate": 0.00029998392283513543, "loss": 1.2546, "step": 114 }, { "epoch": 0.04, "grad_norm": 0.19369304180145264, "learning_rate": 0.0002999815441197276, "loss": 1.4409, "step": 115 }, { "epoch": 0.04, "grad_norm": 0.14894457161426544, "learning_rate": 0.0002999790013688867, "loss": 1.1764, "step": 116 }, { "epoch": 0.04, "grad_norm": 0.14941565692424774, "learning_rate": 0.00029997629458539365, "loss": 1.2857, "step": 117 }, { "epoch": 0.04, "grad_norm": 0.1801568865776062, "learning_rate": 0.00029997342377220893, "loss": 1.3687, "step": 118 }, { "epoch": 0.04, "grad_norm": 0.16208602488040924, "learning_rate": 0.00029997038893247237, "loss": 1.4687, "step": 119 }, { "epoch": 0.04, "grad_norm": 0.1463913917541504, "learning_rate": 0.00029996719006950316, "loss": 1.4743, "step": 120 }, { "epoch": 0.04, "grad_norm": 0.15486371517181396, "learning_rate": 0.00029996382718679995, "loss": 1.3301, "step": 121 }, { "epoch": 0.04, "grad_norm": 0.17152847349643707, "learning_rate": 0.00029996030028804063, "loss": 1.1968, "step": 122 }, { "epoch": 0.04, "grad_norm": 0.19680410623550415, "learning_rate": 0.0002999566093770827, "loss": 1.4573, "step": 123 }, { "epoch": 0.04, "grad_norm": 0.17065325379371643, "learning_rate": 0.0002999527544579629, "loss": 1.1437, "step": 124 }, { "epoch": 0.04, "grad_norm": 0.1630697399377823, "learning_rate": 0.00029994873553489733, "loss": 1.195, "step": 125 }, { "epoch": 0.04, "grad_norm": 0.16416873037815094, "learning_rate": 0.0002999445526122815, "loss": 1.2113, "step": 126 }, { "epoch": 0.04, "grad_norm": 0.15932227671146393, "learning_rate": 0.0002999402056946904, "loss": 1.3886, "step": 127 }, { "epoch": 0.04, "grad_norm": 0.19321565330028534, "learning_rate": 0.0002999356947868782, "loss": 1.3711, "step": 128 }, { "epoch": 0.04, "grad_norm": 0.1563544124364853, "learning_rate": 0.00029993101989377844, "loss": 1.0875, "step": 129 }, { "epoch": 0.04, "grad_norm": 0.1827336996793747, "learning_rate": 0.0002999261810205041, "loss": 1.3273, "step": 130 }, { "epoch": 0.04, "grad_norm": 0.18887676298618317, "learning_rate": 0.00029992117817234746, "loss": 1.2346, "step": 131 }, { "epoch": 0.04, "grad_norm": 0.17841386795043945, "learning_rate": 0.00029991601135478024, "loss": 1.323, "step": 132 }, { "epoch": 0.04, "grad_norm": 0.19825519621372223, "learning_rate": 0.00029991068057345335, "loss": 1.3562, "step": 133 }, { "epoch": 0.04, "grad_norm": 0.1822575032711029, "learning_rate": 0.0002999051858341971, "loss": 1.3654, "step": 134 }, { "epoch": 0.04, "grad_norm": 0.15277878940105438, "learning_rate": 0.00029989952714302114, "loss": 1.266, "step": 135 }, { "epoch": 0.04, "grad_norm": 0.17956991493701935, "learning_rate": 0.00029989370450611433, "loss": 1.2592, "step": 136 }, { "epoch": 0.04, "grad_norm": 0.19778341054916382, "learning_rate": 0.00029988771792984495, "loss": 1.26, "step": 137 }, { "epoch": 0.04, "grad_norm": 0.14252574741840363, "learning_rate": 0.0002998815674207605, "loss": 1.0392, "step": 138 }, { "epoch": 0.04, "grad_norm": 0.21389810740947723, "learning_rate": 0.0002998752529855879, "loss": 1.1152, "step": 139 }, { "epoch": 0.05, "grad_norm": 0.15959124267101288, "learning_rate": 0.00029986877463123325, "loss": 1.4143, "step": 140 }, { "epoch": 0.05, "grad_norm": 0.1598285734653473, "learning_rate": 0.0002998621323647819, "loss": 1.304, "step": 141 }, { "epoch": 0.05, "grad_norm": 0.20414327085018158, "learning_rate": 0.0002998553261934986, "loss": 1.3947, "step": 142 }, { "epoch": 0.05, "grad_norm": 0.17282746732234955, "learning_rate": 0.0002998483561248273, "loss": 1.4117, "step": 143 }, { "epoch": 0.05, "grad_norm": 0.1783457100391388, "learning_rate": 0.0002998412221663911, "loss": 1.2918, "step": 144 }, { "epoch": 0.05, "grad_norm": 0.18180879950523376, "learning_rate": 0.00029983392432599243, "loss": 1.2846, "step": 145 }, { "epoch": 0.05, "grad_norm": 0.1581745147705078, "learning_rate": 0.000299826462611613, "loss": 1.3002, "step": 146 }, { "epoch": 0.05, "grad_norm": 0.3048487603664398, "learning_rate": 0.00029981883703141383, "loss": 1.2501, "step": 147 }, { "epoch": 0.05, "grad_norm": 0.18119028210639954, "learning_rate": 0.000299811047593735, "loss": 1.3982, "step": 148 }, { "epoch": 0.05, "grad_norm": 0.1822800636291504, "learning_rate": 0.0002998030943070957, "loss": 1.1494, "step": 149 }, { "epoch": 0.05, "grad_norm": 0.1572967916727066, "learning_rate": 0.0002997949771801946, "loss": 1.0647, "step": 150 }, { "epoch": 0.05, "grad_norm": 0.2083917260169983, "learning_rate": 0.0002997866962219094, "loss": 1.359, "step": 151 }, { "epoch": 0.05, "grad_norm": 0.18043898046016693, "learning_rate": 0.00029977825144129704, "loss": 1.2676, "step": 152 }, { "epoch": 0.05, "grad_norm": 0.20631720125675201, "learning_rate": 0.0002997696428475936, "loss": 1.4485, "step": 153 }, { "epoch": 0.05, "grad_norm": 0.20143349468708038, "learning_rate": 0.0002997608704502144, "loss": 1.2401, "step": 154 }, { "epoch": 0.05, "grad_norm": 0.16213050484657288, "learning_rate": 0.0002997519342587537, "loss": 1.184, "step": 155 }, { "epoch": 0.05, "grad_norm": 0.19849686324596405, "learning_rate": 0.0002997428342829852, "loss": 1.2835, "step": 156 }, { "epoch": 0.05, "grad_norm": 0.16852635145187378, "learning_rate": 0.00029973357053286146, "loss": 1.2369, "step": 157 }, { "epoch": 0.05, "grad_norm": 0.1694997251033783, "learning_rate": 0.0002997241430185144, "loss": 1.211, "step": 158 }, { "epoch": 0.05, "grad_norm": 0.23641088604927063, "learning_rate": 0.00029971455175025483, "loss": 1.4528, "step": 159 }, { "epoch": 0.05, "grad_norm": 0.17931026220321655, "learning_rate": 0.0002997047967385728, "loss": 1.3314, "step": 160 }, { "epoch": 0.05, "grad_norm": 0.1892920434474945, "learning_rate": 0.00029969487799413744, "loss": 1.4512, "step": 161 }, { "epoch": 0.05, "grad_norm": 0.18620984256267548, "learning_rate": 0.0002996847955277968, "loss": 1.285, "step": 162 }, { "epoch": 0.05, "grad_norm": 0.16308173537254333, "learning_rate": 0.00029967454935057826, "loss": 1.2942, "step": 163 }, { "epoch": 0.05, "grad_norm": 0.18099895119667053, "learning_rate": 0.0002996641394736881, "loss": 1.2007, "step": 164 }, { "epoch": 0.05, "grad_norm": 0.16969528794288635, "learning_rate": 0.0002996535659085115, "loss": 1.3844, "step": 165 }, { "epoch": 0.05, "grad_norm": 0.16654370725154877, "learning_rate": 0.00029964282866661295, "loss": 1.3488, "step": 166 }, { "epoch": 0.05, "grad_norm": 0.8953688144683838, "learning_rate": 0.00029963192775973575, "loss": 1.3111, "step": 167 }, { "epoch": 0.05, "grad_norm": 0.1819576621055603, "learning_rate": 0.00029962086319980233, "loss": 1.1723, "step": 168 }, { "epoch": 0.05, "grad_norm": 0.20332075655460358, "learning_rate": 0.00029960963499891396, "loss": 1.1099, "step": 169 }, { "epoch": 0.05, "grad_norm": 0.26618075370788574, "learning_rate": 0.000299598243169351, "loss": 1.42, "step": 170 }, { "epoch": 0.06, "grad_norm": 0.22319144010543823, "learning_rate": 0.0002995866877235728, "loss": 1.3224, "step": 171 }, { "epoch": 0.06, "grad_norm": 0.18376323580741882, "learning_rate": 0.00029957496867421754, "loss": 1.1404, "step": 172 }, { "epoch": 0.06, "grad_norm": 0.15550604462623596, "learning_rate": 0.0002995630860341024, "loss": 1.4051, "step": 173 }, { "epoch": 0.06, "grad_norm": 0.22546988725662231, "learning_rate": 0.00029955103981622346, "loss": 1.1568, "step": 174 }, { "epoch": 0.06, "grad_norm": 0.7630566954612732, "learning_rate": 0.0002995388300337557, "loss": 1.2733, "step": 175 }, { "epoch": 0.06, "grad_norm": 0.22691138088703156, "learning_rate": 0.00029952645670005315, "loss": 1.4, "step": 176 }, { "epoch": 0.06, "grad_norm": 0.24916407465934753, "learning_rate": 0.0002995139198286484, "loss": 1.3277, "step": 177 }, { "epoch": 0.06, "grad_norm": 0.33264005184173584, "learning_rate": 0.0002995012194332531, "loss": 1.2367, "step": 178 }, { "epoch": 0.06, "grad_norm": 0.2790135145187378, "learning_rate": 0.00029948835552775794, "loss": 1.187, "step": 179 }, { "epoch": 0.06, "grad_norm": 0.32110825181007385, "learning_rate": 0.0002994753281262319, "loss": 1.0816, "step": 180 }, { "epoch": 0.06, "grad_norm": 0.23527830839157104, "learning_rate": 0.00029946213724292337, "loss": 1.4324, "step": 181 }, { "epoch": 0.06, "grad_norm": 0.27017852663993835, "learning_rate": 0.00029944878289225906, "loss": 1.2916, "step": 182 }, { "epoch": 0.06, "grad_norm": 0.19658833742141724, "learning_rate": 0.00029943526508884483, "loss": 1.1479, "step": 183 }, { "epoch": 0.06, "grad_norm": 0.2085239291191101, "learning_rate": 0.00029942158384746513, "loss": 1.3354, "step": 184 }, { "epoch": 0.06, "grad_norm": 0.23562414944171906, "learning_rate": 0.00029940773918308314, "loss": 1.0246, "step": 185 }, { "epoch": 0.06, "grad_norm": 0.18551231920719147, "learning_rate": 0.0002993937311108409, "loss": 1.1615, "step": 186 }, { "epoch": 0.06, "grad_norm": 0.2176637202501297, "learning_rate": 0.000299379559646059, "loss": 1.3824, "step": 187 }, { "epoch": 0.06, "grad_norm": 0.2104397416114807, "learning_rate": 0.0002993652248042369, "loss": 1.3782, "step": 188 }, { "epoch": 0.06, "grad_norm": 0.21271292865276337, "learning_rate": 0.0002993507266010527, "loss": 1.203, "step": 189 }, { "epoch": 0.06, "grad_norm": 0.29035332798957825, "learning_rate": 0.00029933606505236305, "loss": 1.3559, "step": 190 }, { "epoch": 0.06, "grad_norm": 0.22577838599681854, "learning_rate": 0.00029932124017420346, "loss": 1.4067, "step": 191 }, { "epoch": 0.06, "grad_norm": 0.2052624374628067, "learning_rate": 0.0002993062519827879, "loss": 1.3022, "step": 192 }, { "epoch": 0.06, "grad_norm": 0.3349408507347107, "learning_rate": 0.00029929110049450896, "loss": 1.3752, "step": 193 }, { "epoch": 0.06, "grad_norm": 0.2688114643096924, "learning_rate": 0.0002992757857259379, "loss": 1.3986, "step": 194 }, { "epoch": 0.06, "grad_norm": 0.2042258381843567, "learning_rate": 0.0002992603076938247, "loss": 1.3701, "step": 195 }, { "epoch": 0.06, "grad_norm": 0.2382868379354477, "learning_rate": 0.0002992446664150976, "loss": 1.1451, "step": 196 }, { "epoch": 0.06, "grad_norm": 0.2188626378774643, "learning_rate": 0.00029922886190686355, "loss": 1.2386, "step": 197 }, { "epoch": 0.06, "grad_norm": 0.2801951467990875, "learning_rate": 0.000299212894186408, "loss": 1.2924, "step": 198 }, { "epoch": 0.06, "grad_norm": 0.2162003368139267, "learning_rate": 0.000299196763271195, "loss": 1.3419, "step": 199 }, { "epoch": 0.06, "grad_norm": 0.21345525979995728, "learning_rate": 0.0002991804691788669, "loss": 1.0983, "step": 200 }, { "epoch": 0.06, "grad_norm": 0.19928047060966492, "learning_rate": 0.0002991640119272446, "loss": 1.1863, "step": 201 }, { "epoch": 0.07, "grad_norm": 0.2397114783525467, "learning_rate": 0.0002991473915343276, "loss": 1.2753, "step": 202 }, { "epoch": 0.07, "grad_norm": 0.23138543963432312, "learning_rate": 0.0002991306080182936, "loss": 1.0768, "step": 203 }, { "epoch": 0.07, "grad_norm": 0.21599417924880981, "learning_rate": 0.0002991136613974987, "loss": 1.1293, "step": 204 }, { "epoch": 0.07, "grad_norm": 0.18751734495162964, "learning_rate": 0.0002990965516904777, "loss": 1.1463, "step": 205 }, { "epoch": 0.07, "grad_norm": 0.21556243300437927, "learning_rate": 0.0002990792789159434, "loss": 1.3618, "step": 206 }, { "epoch": 0.07, "grad_norm": 0.23060421645641327, "learning_rate": 0.00029906184309278714, "loss": 1.3474, "step": 207 }, { "epoch": 0.07, "grad_norm": 0.22939856350421906, "learning_rate": 0.0002990442442400786, "loss": 1.5008, "step": 208 }, { "epoch": 0.07, "grad_norm": 0.1748415231704712, "learning_rate": 0.00029902648237706573, "loss": 1.1812, "step": 209 }, { "epoch": 0.07, "grad_norm": 0.1896459460258484, "learning_rate": 0.0002990085575231746, "loss": 1.3053, "step": 210 }, { "epoch": 0.07, "grad_norm": 0.2102210968732834, "learning_rate": 0.0002989904696980099, "loss": 1.2098, "step": 211 }, { "epoch": 0.07, "grad_norm": 0.47533586621284485, "learning_rate": 0.00029897221892135424, "loss": 1.2873, "step": 212 }, { "epoch": 0.07, "grad_norm": 0.2753942310810089, "learning_rate": 0.00029895380521316857, "loss": 1.2636, "step": 213 }, { "epoch": 0.07, "grad_norm": 0.18430787324905396, "learning_rate": 0.0002989352285935921, "loss": 1.2533, "step": 214 }, { "epoch": 0.07, "grad_norm": 0.18881487846374512, "learning_rate": 0.0002989164890829421, "loss": 1.4258, "step": 215 }, { "epoch": 0.07, "grad_norm": 0.23149730265140533, "learning_rate": 0.0002988975867017141, "loss": 1.2361, "step": 216 }, { "epoch": 0.07, "grad_norm": 0.26199719309806824, "learning_rate": 0.0002988785214705816, "loss": 1.3745, "step": 217 }, { "epoch": 0.07, "grad_norm": 0.17126867175102234, "learning_rate": 0.00029885929341039646, "loss": 0.999, "step": 218 }, { "epoch": 0.07, "grad_norm": 0.22533132135868073, "learning_rate": 0.0002988399025421884, "loss": 1.4177, "step": 219 }, { "epoch": 0.07, "grad_norm": 0.2754909098148346, "learning_rate": 0.00029882034888716535, "loss": 1.1933, "step": 220 }, { "epoch": 0.07, "grad_norm": 0.19594623148441315, "learning_rate": 0.00029880063246671317, "loss": 1.1805, "step": 221 }, { "epoch": 0.07, "grad_norm": 0.1724214106798172, "learning_rate": 0.00029878075330239584, "loss": 1.2261, "step": 222 }, { "epoch": 0.07, "grad_norm": 0.2020808309316635, "learning_rate": 0.00029876071141595525, "loss": 1.1821, "step": 223 }, { "epoch": 0.07, "grad_norm": 0.22039833664894104, "learning_rate": 0.0002987405068293113, "loss": 1.253, "step": 224 }, { "epoch": 0.07, "grad_norm": 0.1710633784532547, "learning_rate": 0.0002987201395645619, "loss": 1.346, "step": 225 }, { "epoch": 0.07, "grad_norm": 0.18389591574668884, "learning_rate": 0.00029869960964398263, "loss": 1.3029, "step": 226 }, { "epoch": 0.07, "grad_norm": 0.17335979640483856, "learning_rate": 0.0002986789170900274, "loss": 1.1376, "step": 227 }, { "epoch": 0.07, "grad_norm": 0.200583815574646, "learning_rate": 0.00029865806192532743, "loss": 1.5255, "step": 228 }, { "epoch": 0.07, "grad_norm": 0.23017296195030212, "learning_rate": 0.0002986370441726924, "loss": 1.3896, "step": 229 }, { "epoch": 0.07, "grad_norm": 0.2027190923690796, "learning_rate": 0.0002986158638551093, "loss": 1.2535, "step": 230 }, { "epoch": 0.07, "grad_norm": 0.21630370616912842, "learning_rate": 0.00029859452099574335, "loss": 1.4002, "step": 231 }, { "epoch": 0.07, "grad_norm": 0.1866302341222763, "learning_rate": 0.00029857301561793707, "loss": 1.3952, "step": 232 }, { "epoch": 0.08, "grad_norm": 0.22952046990394592, "learning_rate": 0.0002985513477452112, "loss": 1.3583, "step": 233 }, { "epoch": 0.08, "grad_norm": 0.21824747323989868, "learning_rate": 0.00029852951740126383, "loss": 1.2856, "step": 234 }, { "epoch": 0.08, "grad_norm": 0.22822396457195282, "learning_rate": 0.00029850752460997096, "loss": 1.2915, "step": 235 }, { "epoch": 0.08, "grad_norm": 0.24144254624843597, "learning_rate": 0.0002984853693953862, "loss": 1.2635, "step": 236 }, { "epoch": 0.08, "grad_norm": 0.23091521859169006, "learning_rate": 0.00029846305178174074, "loss": 1.341, "step": 237 }, { "epoch": 0.08, "grad_norm": 0.1603642851114273, "learning_rate": 0.00029844057179344356, "loss": 0.9598, "step": 238 }, { "epoch": 0.08, "grad_norm": 0.22247399389743805, "learning_rate": 0.00029841792945508103, "loss": 1.2969, "step": 239 }, { "epoch": 0.08, "grad_norm": 0.2136106938123703, "learning_rate": 0.00029839512479141713, "loss": 1.3173, "step": 240 }, { "epoch": 0.08, "grad_norm": 0.2817842662334442, "learning_rate": 0.0002983721578273935, "loss": 1.3639, "step": 241 }, { "epoch": 0.08, "grad_norm": 0.21935509145259857, "learning_rate": 0.00029834902858812926, "loss": 1.341, "step": 242 }, { "epoch": 0.08, "grad_norm": 0.1989377737045288, "learning_rate": 0.00029832573709892075, "loss": 1.1453, "step": 243 }, { "epoch": 0.08, "grad_norm": 0.1999141126871109, "learning_rate": 0.00029830228338524216, "loss": 1.2626, "step": 244 }, { "epoch": 0.08, "grad_norm": 0.24541524052619934, "learning_rate": 0.00029827866747274485, "loss": 1.3393, "step": 245 }, { "epoch": 0.08, "grad_norm": 0.20727522671222687, "learning_rate": 0.0002982548893872576, "loss": 1.2052, "step": 246 }, { "epoch": 0.08, "grad_norm": 0.16816599667072296, "learning_rate": 0.00029823094915478664, "loss": 1.0878, "step": 247 }, { "epoch": 0.08, "grad_norm": 0.23523330688476562, "learning_rate": 0.0002982068468015155, "loss": 1.0521, "step": 248 }, { "epoch": 0.08, "grad_norm": 0.22416900098323822, "learning_rate": 0.000298182582353805, "loss": 1.2625, "step": 249 }, { "epoch": 0.08, "grad_norm": 0.2372831255197525, "learning_rate": 0.0002981581558381933, "loss": 1.3528, "step": 250 }, { "epoch": 0.08, "grad_norm": 0.28233033418655396, "learning_rate": 0.0002981335672813957, "loss": 1.2469, "step": 251 }, { "epoch": 0.08, "grad_norm": 0.17429836094379425, "learning_rate": 0.00029810881671030496, "loss": 0.981, "step": 252 }, { "epoch": 0.08, "grad_norm": 0.18835853040218353, "learning_rate": 0.00029808390415199073, "loss": 0.9436, "step": 253 }, { "epoch": 0.08, "grad_norm": 0.22751984000205994, "learning_rate": 0.0002980588296337001, "loss": 1.0579, "step": 254 }, { "epoch": 0.08, "grad_norm": 0.216526597738266, "learning_rate": 0.00029803359318285707, "loss": 1.275, "step": 255 }, { "epoch": 0.08, "grad_norm": 0.18656998872756958, "learning_rate": 0.00029800819482706295, "loss": 1.377, "step": 256 }, { "epoch": 0.08, "grad_norm": 0.2101496458053589, "learning_rate": 0.0002979826345940959, "loss": 1.4238, "step": 257 }, { "epoch": 0.08, "grad_norm": 0.19177651405334473, "learning_rate": 0.00029795691251191135, "loss": 1.2619, "step": 258 }, { "epoch": 0.08, "grad_norm": 0.19432249665260315, "learning_rate": 0.0002979310286086416, "loss": 1.1638, "step": 259 }, { "epoch": 0.08, "grad_norm": 0.24972280859947205, "learning_rate": 0.000297904982912596, "loss": 1.2677, "step": 260 }, { "epoch": 0.08, "grad_norm": 0.1777634173631668, "learning_rate": 0.00029787877545226084, "loss": 1.3363, "step": 261 }, { "epoch": 0.08, "grad_norm": 0.17763619124889374, "learning_rate": 0.00029785240625629936, "loss": 1.2401, "step": 262 }, { "epoch": 0.08, "grad_norm": 0.17736496031284332, "learning_rate": 0.00029782587535355157, "loss": 1.2916, "step": 263 }, { "epoch": 0.09, "grad_norm": 0.18855898082256317, "learning_rate": 0.0002977991827730345, "loss": 1.2716, "step": 264 }, { "epoch": 0.09, "grad_norm": 0.19698740541934967, "learning_rate": 0.00029777232854394197, "loss": 1.34, "step": 265 }, { "epoch": 0.09, "grad_norm": 0.1721935123205185, "learning_rate": 0.0002977453126956444, "loss": 1.2858, "step": 266 }, { "epoch": 0.09, "grad_norm": 0.1703348159790039, "learning_rate": 0.0002977181352576893, "loss": 1.206, "step": 267 }, { "epoch": 0.09, "grad_norm": 0.18318620324134827, "learning_rate": 0.00029769079625980075, "loss": 1.0507, "step": 268 }, { "epoch": 0.09, "grad_norm": 0.2022814303636551, "learning_rate": 0.00029766329573187935, "loss": 1.1773, "step": 269 }, { "epoch": 0.09, "grad_norm": 0.2273266613483429, "learning_rate": 0.0002976356337040027, "loss": 1.2178, "step": 270 }, { "epoch": 0.09, "grad_norm": 0.17452342808246613, "learning_rate": 0.0002976078102064248, "loss": 1.3512, "step": 271 }, { "epoch": 0.09, "grad_norm": 0.28071877360343933, "learning_rate": 0.00029757982526957644, "loss": 1.2078, "step": 272 }, { "epoch": 0.09, "grad_norm": 0.2291635274887085, "learning_rate": 0.0002975516789240647, "loss": 1.1594, "step": 273 }, { "epoch": 0.09, "grad_norm": 0.18158882856369019, "learning_rate": 0.00029752337120067345, "loss": 1.4135, "step": 274 }, { "epoch": 0.09, "grad_norm": 0.17074541747570038, "learning_rate": 0.0002974949021303629, "loss": 1.3612, "step": 275 }, { "epoch": 0.09, "grad_norm": 0.20534443855285645, "learning_rate": 0.00029746627174426986, "loss": 1.4957, "step": 276 }, { "epoch": 0.09, "grad_norm": 0.19830013811588287, "learning_rate": 0.0002974374800737074, "loss": 1.2804, "step": 277 }, { "epoch": 0.09, "grad_norm": 0.1790958046913147, "learning_rate": 0.00029740852715016524, "loss": 1.2036, "step": 278 }, { "epoch": 0.09, "grad_norm": 0.19341765344142914, "learning_rate": 0.0002973794130053092, "loss": 1.1753, "step": 279 }, { "epoch": 0.09, "grad_norm": 0.193817600607872, "learning_rate": 0.0002973501376709816, "loss": 1.288, "step": 280 }, { "epoch": 0.09, "grad_norm": 0.18417881429195404, "learning_rate": 0.00029732070117920097, "loss": 1.2393, "step": 281 }, { "epoch": 0.09, "grad_norm": 0.2064652144908905, "learning_rate": 0.0002972911035621621, "loss": 1.2536, "step": 282 }, { "epoch": 0.09, "grad_norm": 0.19507160782814026, "learning_rate": 0.00029726134485223596, "loss": 1.0287, "step": 283 }, { "epoch": 0.09, "grad_norm": 0.1934509128332138, "learning_rate": 0.00029723142508196996, "loss": 1.236, "step": 284 }, { "epoch": 0.09, "grad_norm": 0.3497771918773651, "learning_rate": 0.0002972013442840874, "loss": 1.4314, "step": 285 }, { "epoch": 0.09, "grad_norm": 0.3455434739589691, "learning_rate": 0.0002971711024914877, "loss": 1.2034, "step": 286 }, { "epoch": 0.09, "grad_norm": 0.27084752917289734, "learning_rate": 0.0002971406997372464, "loss": 1.3144, "step": 287 }, { "epoch": 0.09, "grad_norm": 0.2655249238014221, "learning_rate": 0.0002971101360546153, "loss": 1.2209, "step": 288 }, { "epoch": 0.09, "grad_norm": 0.18072941899299622, "learning_rate": 0.00029707941147702184, "loss": 1.2084, "step": 289 }, { "epoch": 0.09, "grad_norm": 0.1846039593219757, "learning_rate": 0.00029704852603806977, "loss": 1.3297, "step": 290 }, { "epoch": 0.09, "grad_norm": 0.2048686146736145, "learning_rate": 0.00029701747977153853, "loss": 1.2968, "step": 291 }, { "epoch": 0.09, "grad_norm": 0.19128942489624023, "learning_rate": 0.00029698627271138354, "loss": 1.2349, "step": 292 }, { "epoch": 0.09, "grad_norm": 0.21982653439044952, "learning_rate": 0.00029695490489173613, "loss": 1.3629, "step": 293 }, { "epoch": 0.09, "grad_norm": 0.21954968571662903, "learning_rate": 0.00029692337634690345, "loss": 1.232, "step": 294 }, { "epoch": 0.1, "grad_norm": 0.22075019776821136, "learning_rate": 0.0002968916871113683, "loss": 1.4036, "step": 295 }, { "epoch": 0.1, "grad_norm": 0.3606947064399719, "learning_rate": 0.0002968598372197894, "loss": 1.2837, "step": 296 }, { "epoch": 0.1, "grad_norm": 0.2510881721973419, "learning_rate": 0.0002968278267070011, "loss": 1.4161, "step": 297 }, { "epoch": 0.1, "grad_norm": 0.2106585055589676, "learning_rate": 0.0002967956556080134, "loss": 1.2574, "step": 298 }, { "epoch": 0.1, "grad_norm": 0.18030983209609985, "learning_rate": 0.000296763323958012, "loss": 1.275, "step": 299 }, { "epoch": 0.1, "grad_norm": 0.22040830552577972, "learning_rate": 0.0002967308317923581, "loss": 1.1752, "step": 300 }, { "epoch": 0.1, "grad_norm": 0.1864234060049057, "learning_rate": 0.00029669817914658865, "loss": 1.2616, "step": 301 }, { "epoch": 0.1, "grad_norm": 0.22592277824878693, "learning_rate": 0.0002966653660564158, "loss": 1.325, "step": 302 }, { "epoch": 0.1, "grad_norm": 0.2607811987400055, "learning_rate": 0.0002966323925577275, "loss": 1.6383, "step": 303 }, { "epoch": 0.1, "grad_norm": 0.18044641613960266, "learning_rate": 0.00029659925868658695, "loss": 1.1666, "step": 304 }, { "epoch": 0.1, "grad_norm": 0.2061993032693863, "learning_rate": 0.0002965659644792328, "loss": 1.4557, "step": 305 }, { "epoch": 0.1, "grad_norm": 0.22277647256851196, "learning_rate": 0.00029653250997207906, "loss": 1.2704, "step": 306 }, { "epoch": 0.1, "grad_norm": 0.19953471422195435, "learning_rate": 0.0002964988952017151, "loss": 1.0763, "step": 307 }, { "epoch": 0.1, "grad_norm": 0.1835670918226242, "learning_rate": 0.0002964651202049054, "loss": 1.2727, "step": 308 }, { "epoch": 0.1, "grad_norm": 0.19624808430671692, "learning_rate": 0.00029643118501859, "loss": 1.4021, "step": 309 }, { "epoch": 0.1, "grad_norm": 0.16536067426204681, "learning_rate": 0.00029639708967988385, "loss": 1.1824, "step": 310 }, { "epoch": 0.1, "grad_norm": 0.1886124610900879, "learning_rate": 0.0002963628342260772, "loss": 1.1126, "step": 311 }, { "epoch": 0.1, "grad_norm": 0.1860448718070984, "learning_rate": 0.0002963284186946354, "loss": 1.4845, "step": 312 }, { "epoch": 0.1, "grad_norm": 0.20875199139118195, "learning_rate": 0.00029629384312319874, "loss": 1.3209, "step": 313 }, { "epoch": 0.1, "grad_norm": 0.19897021353244781, "learning_rate": 0.0002962591075495828, "loss": 1.3768, "step": 314 }, { "epoch": 0.1, "grad_norm": 0.23331812024116516, "learning_rate": 0.00029622421201177797, "loss": 1.5416, "step": 315 }, { "epoch": 0.1, "grad_norm": 0.19847387075424194, "learning_rate": 0.0002961891565479496, "loss": 1.2471, "step": 316 }, { "epoch": 0.1, "grad_norm": 0.23611626029014587, "learning_rate": 0.000296153941196438, "loss": 1.0618, "step": 317 }, { "epoch": 0.1, "grad_norm": 0.20590175688266754, "learning_rate": 0.00029611856599575845, "loss": 0.9892, "step": 318 }, { "epoch": 0.1, "grad_norm": 0.18343736231327057, "learning_rate": 0.0002960830309846009, "loss": 1.2702, "step": 319 }, { "epoch": 0.1, "grad_norm": 0.2210874706506729, "learning_rate": 0.00029604733620183007, "loss": 0.9381, "step": 320 }, { "epoch": 0.1, "grad_norm": 0.21434368193149567, "learning_rate": 0.00029601148168648556, "loss": 1.381, "step": 321 }, { "epoch": 0.1, "grad_norm": 0.20672471821308136, "learning_rate": 0.00029597546747778155, "loss": 0.9146, "step": 322 }, { "epoch": 0.1, "grad_norm": 0.18069346249103546, "learning_rate": 0.000295939293615107, "loss": 1.2729, "step": 323 }, { "epoch": 0.1, "grad_norm": 0.24016359448432922, "learning_rate": 0.0002959029601380254, "loss": 1.4269, "step": 324 }, { "epoch": 0.1, "grad_norm": 0.22291342914104462, "learning_rate": 0.0002958664670862747, "loss": 1.4188, "step": 325 }, { "epoch": 0.11, "grad_norm": 0.22529089450836182, "learning_rate": 0.0002958298144997677, "loss": 1.3582, "step": 326 }, { "epoch": 0.11, "grad_norm": 0.1949230134487152, "learning_rate": 0.00029579300241859135, "loss": 1.554, "step": 327 }, { "epoch": 0.11, "grad_norm": 0.2182324081659317, "learning_rate": 0.00029575603088300724, "loss": 1.195, "step": 328 }, { "epoch": 0.11, "grad_norm": 0.20953096449375153, "learning_rate": 0.0002957188999334512, "loss": 1.2006, "step": 329 }, { "epoch": 0.11, "grad_norm": 0.2154708355665207, "learning_rate": 0.00029568160961053355, "loss": 1.1669, "step": 330 }, { "epoch": 0.11, "grad_norm": 0.20091965794563293, "learning_rate": 0.000295644159955039, "loss": 1.4942, "step": 331 }, { "epoch": 0.11, "grad_norm": 0.20214878022670746, "learning_rate": 0.0002956065510079262, "loss": 1.1926, "step": 332 }, { "epoch": 0.11, "grad_norm": 0.15653903782367706, "learning_rate": 0.0002955687828103283, "loss": 1.205, "step": 333 }, { "epoch": 0.11, "grad_norm": 0.2034248262643814, "learning_rate": 0.0002955308554035525, "loss": 1.1898, "step": 334 }, { "epoch": 0.11, "grad_norm": 0.19061972200870514, "learning_rate": 0.00029549276882908016, "loss": 1.3366, "step": 335 }, { "epoch": 0.11, "grad_norm": 0.18442215025424957, "learning_rate": 0.0002954545231285668, "loss": 1.205, "step": 336 }, { "epoch": 0.11, "grad_norm": 0.19930151104927063, "learning_rate": 0.0002954161183438418, "loss": 1.338, "step": 337 }, { "epoch": 0.11, "grad_norm": 0.20483240485191345, "learning_rate": 0.00029537755451690867, "loss": 1.4109, "step": 338 }, { "epoch": 0.11, "grad_norm": 0.2031257450580597, "learning_rate": 0.00029533883168994486, "loss": 1.0451, "step": 339 }, { "epoch": 0.11, "grad_norm": 0.18557147681713104, "learning_rate": 0.0002952999499053016, "loss": 1.2217, "step": 340 }, { "epoch": 0.11, "grad_norm": 0.21159876883029938, "learning_rate": 0.00029526090920550417, "loss": 1.3914, "step": 341 }, { "epoch": 0.11, "grad_norm": 0.2037302404642105, "learning_rate": 0.00029522170963325155, "loss": 1.1385, "step": 342 }, { "epoch": 0.11, "grad_norm": 0.23729722201824188, "learning_rate": 0.00029518235123141635, "loss": 1.2836, "step": 343 }, { "epoch": 0.11, "grad_norm": 0.18577586114406586, "learning_rate": 0.00029514283404304505, "loss": 1.2644, "step": 344 }, { "epoch": 0.11, "grad_norm": 0.20099975168704987, "learning_rate": 0.0002951031581113578, "loss": 1.2947, "step": 345 }, { "epoch": 0.11, "grad_norm": 0.23851241171360016, "learning_rate": 0.00029506332347974834, "loss": 1.2804, "step": 346 }, { "epoch": 0.11, "grad_norm": 0.18896931409835815, "learning_rate": 0.000295023330191784, "loss": 1.26, "step": 347 }, { "epoch": 0.11, "grad_norm": 0.21737080812454224, "learning_rate": 0.0002949831782912055, "loss": 1.348, "step": 348 }, { "epoch": 0.11, "grad_norm": 0.23378679156303406, "learning_rate": 0.0002949428678219272, "loss": 1.3666, "step": 349 }, { "epoch": 0.11, "grad_norm": 0.19387869536876678, "learning_rate": 0.00029490239882803683, "loss": 1.2878, "step": 350 }, { "epoch": 0.11, "grad_norm": 0.20650902390480042, "learning_rate": 0.0002948617713537955, "loss": 1.2745, "step": 351 }, { "epoch": 0.11, "grad_norm": 0.1916474550962448, "learning_rate": 0.00029482098544363766, "loss": 1.1412, "step": 352 }, { "epoch": 0.11, "grad_norm": 0.20413325726985931, "learning_rate": 0.00029478004114217096, "loss": 1.5487, "step": 353 }, { "epoch": 0.11, "grad_norm": 0.19700175523757935, "learning_rate": 0.00029473893849417637, "loss": 1.4713, "step": 354 }, { "epoch": 0.11, "grad_norm": 0.1786103993654251, "learning_rate": 0.0002946976775446081, "loss": 1.4226, "step": 355 }, { "epoch": 0.11, "grad_norm": 0.2108960598707199, "learning_rate": 0.0002946562583385933, "loss": 1.4211, "step": 356 }, { "epoch": 0.11, "grad_norm": 0.16197030246257782, "learning_rate": 0.0002946146809214325, "loss": 1.3371, "step": 357 }, { "epoch": 0.12, "grad_norm": 0.20396749675273895, "learning_rate": 0.0002945729453385989, "loss": 1.3902, "step": 358 }, { "epoch": 0.12, "grad_norm": 0.1861545443534851, "learning_rate": 0.000294531051635739, "loss": 1.1527, "step": 359 }, { "epoch": 0.12, "grad_norm": 0.17796246707439423, "learning_rate": 0.00029448899985867207, "loss": 1.206, "step": 360 }, { "epoch": 0.12, "grad_norm": 0.16761477291584015, "learning_rate": 0.00029444679005339035, "loss": 1.3195, "step": 361 }, { "epoch": 0.12, "grad_norm": 0.19139355421066284, "learning_rate": 0.0002944044222660589, "loss": 1.0939, "step": 362 }, { "epoch": 0.12, "grad_norm": 0.16853193938732147, "learning_rate": 0.00029436189654301543, "loss": 1.177, "step": 363 }, { "epoch": 0.12, "grad_norm": 0.3845210373401642, "learning_rate": 0.00029431921293077054, "loss": 1.3707, "step": 364 }, { "epoch": 0.12, "grad_norm": 0.17332561314105988, "learning_rate": 0.0002942763714760075, "loss": 1.0964, "step": 365 }, { "epoch": 0.12, "grad_norm": 0.19516496360301971, "learning_rate": 0.00029423337222558225, "loss": 1.341, "step": 366 }, { "epoch": 0.12, "grad_norm": 0.22161240875720978, "learning_rate": 0.00029419021522652313, "loss": 1.3291, "step": 367 }, { "epoch": 0.12, "grad_norm": 0.1965475082397461, "learning_rate": 0.00029414690052603125, "loss": 1.2866, "step": 368 }, { "epoch": 0.12, "grad_norm": 0.19041012227535248, "learning_rate": 0.0002941034281714799, "loss": 1.0236, "step": 369 }, { "epoch": 0.12, "grad_norm": 0.22056685388088226, "learning_rate": 0.0002940597982104152, "loss": 1.2903, "step": 370 }, { "epoch": 0.12, "grad_norm": 0.24413897097110748, "learning_rate": 0.0002940160106905552, "loss": 1.3922, "step": 371 }, { "epoch": 0.12, "grad_norm": 0.19511739909648895, "learning_rate": 0.0002939720656597906, "loss": 1.1648, "step": 372 }, { "epoch": 0.12, "grad_norm": 0.18013593554496765, "learning_rate": 0.0002939279631661843, "loss": 1.3984, "step": 373 }, { "epoch": 0.12, "grad_norm": 0.1889912635087967, "learning_rate": 0.00029388370325797127, "loss": 1.3719, "step": 374 }, { "epoch": 0.12, "grad_norm": 0.2447318583726883, "learning_rate": 0.00029383928598355883, "loss": 1.4006, "step": 375 }, { "epoch": 0.12, "grad_norm": 0.2693634629249573, "learning_rate": 0.00029379471139152636, "loss": 1.33, "step": 376 }, { "epoch": 0.12, "grad_norm": 0.24705712497234344, "learning_rate": 0.0002937499795306252, "loss": 1.437, "step": 377 }, { "epoch": 0.12, "grad_norm": 0.24105963110923767, "learning_rate": 0.00029370509044977875, "loss": 1.1729, "step": 378 }, { "epoch": 0.12, "grad_norm": 0.2169589400291443, "learning_rate": 0.00029366004419808254, "loss": 1.3984, "step": 379 }, { "epoch": 0.12, "grad_norm": 0.18109892308712006, "learning_rate": 0.0002936148408248037, "loss": 1.4229, "step": 380 }, { "epoch": 0.12, "grad_norm": 0.32904404401779175, "learning_rate": 0.00029356948037938133, "loss": 1.2781, "step": 381 }, { "epoch": 0.12, "grad_norm": 0.1983826756477356, "learning_rate": 0.00029352396291142644, "loss": 1.2805, "step": 382 }, { "epoch": 0.12, "grad_norm": 0.15984392166137695, "learning_rate": 0.00029347828847072166, "loss": 1.1598, "step": 383 }, { "epoch": 0.12, "grad_norm": 0.21843133866786957, "learning_rate": 0.0002934324571072212, "loss": 1.4128, "step": 384 }, { "epoch": 0.12, "grad_norm": 0.19231949746608734, "learning_rate": 0.0002933864688710511, "loss": 1.3059, "step": 385 }, { "epoch": 0.12, "grad_norm": 0.2009160965681076, "learning_rate": 0.00029334032381250893, "loss": 1.3189, "step": 386 }, { "epoch": 0.12, "grad_norm": 0.22765067219734192, "learning_rate": 0.0002932940219820636, "loss": 1.4236, "step": 387 }, { "epoch": 0.12, "grad_norm": 0.19401536881923676, "learning_rate": 0.00029324756343035576, "loss": 1.3306, "step": 388 }, { "epoch": 0.13, "grad_norm": 0.19268982112407684, "learning_rate": 0.0002932009482081972, "loss": 1.3107, "step": 389 }, { "epoch": 0.13, "grad_norm": 0.18206031620502472, "learning_rate": 0.00029315417636657126, "loss": 1.0462, "step": 390 }, { "epoch": 0.13, "grad_norm": 0.29555708169937134, "learning_rate": 0.0002931072479566324, "loss": 1.2304, "step": 391 }, { "epoch": 0.13, "grad_norm": 0.20137804746627808, "learning_rate": 0.00029306016302970653, "loss": 1.3141, "step": 392 }, { "epoch": 0.13, "grad_norm": 0.22294843196868896, "learning_rate": 0.00029301292163729057, "loss": 1.3591, "step": 393 }, { "epoch": 0.13, "grad_norm": 0.17836473882198334, "learning_rate": 0.0002929655238310526, "loss": 1.3304, "step": 394 }, { "epoch": 0.13, "grad_norm": 0.27757528424263, "learning_rate": 0.00029291796966283187, "loss": 1.2129, "step": 395 }, { "epoch": 0.13, "grad_norm": 0.1803009808063507, "learning_rate": 0.0002928702591846385, "loss": 1.0968, "step": 396 }, { "epoch": 0.13, "grad_norm": 0.1806107461452484, "learning_rate": 0.00029282239244865363, "loss": 1.3711, "step": 397 }, { "epoch": 0.13, "grad_norm": 0.22698688507080078, "learning_rate": 0.00029277436950722943, "loss": 1.3863, "step": 398 }, { "epoch": 0.13, "grad_norm": 0.20161105692386627, "learning_rate": 0.0002927261904128886, "loss": 1.2908, "step": 399 }, { "epoch": 0.13, "grad_norm": 0.19971446692943573, "learning_rate": 0.000292677855218325, "loss": 1.0556, "step": 400 }, { "epoch": 0.13, "grad_norm": 0.16600541770458221, "learning_rate": 0.00029262936397640287, "loss": 1.2031, "step": 401 }, { "epoch": 0.13, "grad_norm": 0.22752271592617035, "learning_rate": 0.0002925807167401574, "loss": 1.3892, "step": 402 }, { "epoch": 0.13, "grad_norm": 0.2094002664089203, "learning_rate": 0.0002925319135627942, "loss": 1.3501, "step": 403 }, { "epoch": 0.13, "grad_norm": 0.19968034327030182, "learning_rate": 0.0002924829544976896, "loss": 1.1546, "step": 404 }, { "epoch": 0.13, "grad_norm": 0.19852077960968018, "learning_rate": 0.0002924338395983902, "loss": 1.309, "step": 405 }, { "epoch": 0.13, "grad_norm": 0.217856764793396, "learning_rate": 0.00029238456891861325, "loss": 1.2695, "step": 406 }, { "epoch": 0.13, "grad_norm": 0.2269602119922638, "learning_rate": 0.00029233514251224627, "loss": 1.2678, "step": 407 }, { "epoch": 0.13, "grad_norm": 0.18717491626739502, "learning_rate": 0.0002922855604333472, "loss": 1.0883, "step": 408 }, { "epoch": 0.13, "grad_norm": 0.18462176620960236, "learning_rate": 0.00029223582273614403, "loss": 1.2625, "step": 409 }, { "epoch": 0.13, "grad_norm": 0.25430941581726074, "learning_rate": 0.0002921859294750351, "loss": 1.0225, "step": 410 }, { "epoch": 0.13, "grad_norm": 0.20792731642723083, "learning_rate": 0.00029213588070458907, "loss": 1.1048, "step": 411 }, { "epoch": 0.13, "grad_norm": 0.20754867792129517, "learning_rate": 0.00029208567647954424, "loss": 1.3209, "step": 412 }, { "epoch": 0.13, "grad_norm": 0.22838467359542847, "learning_rate": 0.0002920353168548093, "loss": 1.3632, "step": 413 }, { "epoch": 0.13, "grad_norm": 0.17146356403827667, "learning_rate": 0.0002919848018854628, "loss": 1.1823, "step": 414 }, { "epoch": 0.13, "grad_norm": 0.2207202911376953, "learning_rate": 0.0002919341316267531, "loss": 1.1628, "step": 415 }, { "epoch": 0.13, "grad_norm": 0.2747710347175598, "learning_rate": 0.00029188330613409854, "loss": 1.3305, "step": 416 }, { "epoch": 0.13, "grad_norm": 0.2124461978673935, "learning_rate": 0.0002918323254630871, "loss": 1.3418, "step": 417 }, { "epoch": 0.13, "grad_norm": 0.2285691350698471, "learning_rate": 0.0002917811896694766, "loss": 1.2898, "step": 418 }, { "epoch": 0.13, "grad_norm": 0.1864543855190277, "learning_rate": 0.0002917298988091944, "loss": 1.2198, "step": 419 }, { "epoch": 0.14, "grad_norm": 0.21441522240638733, "learning_rate": 0.00029167845293833763, "loss": 1.2526, "step": 420 }, { "epoch": 0.14, "grad_norm": 0.21441811323165894, "learning_rate": 0.0002916268521131728, "loss": 1.2588, "step": 421 }, { "epoch": 0.14, "grad_norm": 0.24133192002773285, "learning_rate": 0.0002915750963901359, "loss": 1.2711, "step": 422 }, { "epoch": 0.14, "grad_norm": 0.2278161197900772, "learning_rate": 0.0002915231858258325, "loss": 1.2744, "step": 423 }, { "epoch": 0.14, "grad_norm": 0.2705667316913605, "learning_rate": 0.00029147112047703727, "loss": 1.2319, "step": 424 }, { "epoch": 0.14, "grad_norm": 0.2582227885723114, "learning_rate": 0.0002914189004006943, "loss": 1.1931, "step": 425 }, { "epoch": 0.14, "grad_norm": 0.1880197674036026, "learning_rate": 0.00029136652565391703, "loss": 1.1554, "step": 426 }, { "epoch": 0.14, "grad_norm": 0.18736205995082855, "learning_rate": 0.0002913139962939878, "loss": 1.0816, "step": 427 }, { "epoch": 0.14, "grad_norm": 0.20044726133346558, "learning_rate": 0.00029126131237835833, "loss": 1.5263, "step": 428 }, { "epoch": 0.14, "grad_norm": 0.1887778341770172, "learning_rate": 0.00029120847396464904, "loss": 1.2259, "step": 429 }, { "epoch": 0.14, "grad_norm": 0.19141988456249237, "learning_rate": 0.0002911554811106497, "loss": 1.24, "step": 430 }, { "epoch": 0.14, "grad_norm": 0.21844489872455597, "learning_rate": 0.0002911023338743187, "loss": 1.3682, "step": 431 }, { "epoch": 0.14, "grad_norm": 0.2657244801521301, "learning_rate": 0.00029104903231378347, "loss": 1.2196, "step": 432 }, { "epoch": 0.14, "grad_norm": 0.2134760618209839, "learning_rate": 0.0002909955764873401, "loss": 1.367, "step": 433 }, { "epoch": 0.14, "grad_norm": 0.19301612675189972, "learning_rate": 0.00029094196645345347, "loss": 1.1259, "step": 434 }, { "epoch": 0.14, "grad_norm": 0.19872386753559113, "learning_rate": 0.00029088820227075705, "loss": 1.4887, "step": 435 }, { "epoch": 0.14, "grad_norm": 0.1925172358751297, "learning_rate": 0.000290834283998053, "loss": 1.5551, "step": 436 }, { "epoch": 0.14, "grad_norm": 0.2053399384021759, "learning_rate": 0.00029078021169431196, "loss": 1.342, "step": 437 }, { "epoch": 0.14, "grad_norm": 0.2159048616886139, "learning_rate": 0.00029072598541867296, "loss": 1.489, "step": 438 }, { "epoch": 0.14, "grad_norm": 0.22334644198417664, "learning_rate": 0.00029067160523044356, "loss": 1.3091, "step": 439 }, { "epoch": 0.14, "grad_norm": 0.204558327794075, "learning_rate": 0.0002906170711890996, "loss": 1.3116, "step": 440 }, { "epoch": 0.14, "grad_norm": 0.23514696955680847, "learning_rate": 0.00029056238335428523, "loss": 1.5508, "step": 441 }, { "epoch": 0.14, "grad_norm": 0.17452383041381836, "learning_rate": 0.0002905075417858126, "loss": 1.1897, "step": 442 }, { "epoch": 0.14, "grad_norm": 0.1876997947692871, "learning_rate": 0.0002904525465436623, "loss": 1.2794, "step": 443 }, { "epoch": 0.14, "grad_norm": 0.24446715414524078, "learning_rate": 0.0002903973976879829, "loss": 1.2544, "step": 444 }, { "epoch": 0.14, "grad_norm": 0.20094996690750122, "learning_rate": 0.00029034209527909074, "loss": 1.0349, "step": 445 }, { "epoch": 0.14, "grad_norm": 0.19834104180335999, "learning_rate": 0.00029028663937747046, "loss": 1.2237, "step": 446 }, { "epoch": 0.14, "grad_norm": 0.2367020547389984, "learning_rate": 0.00029023103004377433, "loss": 1.4011, "step": 447 }, { "epoch": 0.14, "grad_norm": 0.17769268155097961, "learning_rate": 0.0002901752673388225, "loss": 0.9901, "step": 448 }, { "epoch": 0.14, "grad_norm": 0.1881648600101471, "learning_rate": 0.00029011935132360295, "loss": 0.9871, "step": 449 }, { "epoch": 0.14, "grad_norm": 0.23311956226825714, "learning_rate": 0.00029006328205927114, "loss": 1.4147, "step": 450 }, { "epoch": 0.15, "grad_norm": 0.21570636332035065, "learning_rate": 0.00029000705960715033, "loss": 1.2354, "step": 451 }, { "epoch": 0.15, "grad_norm": 0.5622038841247559, "learning_rate": 0.00028995068402873125, "loss": 1.1444, "step": 452 }, { "epoch": 0.15, "grad_norm": 0.24820460379123688, "learning_rate": 0.0002898941553856721, "loss": 1.3194, "step": 453 }, { "epoch": 0.15, "grad_norm": 0.24505335092544556, "learning_rate": 0.00028983747373979846, "loss": 1.1764, "step": 454 }, { "epoch": 0.15, "grad_norm": 0.2576861083507538, "learning_rate": 0.00028978063915310333, "loss": 1.1559, "step": 455 }, { "epoch": 0.15, "grad_norm": 0.20876803994178772, "learning_rate": 0.0002897236516877469, "loss": 1.1617, "step": 456 }, { "epoch": 0.15, "grad_norm": 0.2768891453742981, "learning_rate": 0.0002896665114060565, "loss": 1.1088, "step": 457 }, { "epoch": 0.15, "grad_norm": 0.20095758140087128, "learning_rate": 0.00028960921837052693, "loss": 1.1522, "step": 458 }, { "epoch": 0.15, "grad_norm": 0.18675941228866577, "learning_rate": 0.0002895517726438196, "loss": 1.0348, "step": 459 }, { "epoch": 0.15, "grad_norm": 0.19333766400814056, "learning_rate": 0.00028949417428876317, "loss": 1.2352, "step": 460 }, { "epoch": 0.15, "grad_norm": 0.19238299131393433, "learning_rate": 0.0002894364233683533, "loss": 1.2291, "step": 461 }, { "epoch": 0.15, "grad_norm": 0.22816987335681915, "learning_rate": 0.0002893785199457523, "loss": 1.3524, "step": 462 }, { "epoch": 0.15, "grad_norm": 0.2200428694486618, "learning_rate": 0.0002893204640842895, "loss": 1.1358, "step": 463 }, { "epoch": 0.15, "grad_norm": 0.19916164875030518, "learning_rate": 0.00028926225584746077, "loss": 1.1506, "step": 464 }, { "epoch": 0.15, "grad_norm": 0.18463507294654846, "learning_rate": 0.0002892038952989287, "loss": 1.0044, "step": 465 }, { "epoch": 0.15, "grad_norm": 0.4064522683620453, "learning_rate": 0.0002891453825025224, "loss": 1.3685, "step": 466 }, { "epoch": 0.15, "grad_norm": 0.19359849393367767, "learning_rate": 0.0002890867175222377, "loss": 1.3725, "step": 467 }, { "epoch": 0.15, "grad_norm": 0.26550599932670593, "learning_rate": 0.0002890279004222366, "loss": 1.2143, "step": 468 }, { "epoch": 0.15, "grad_norm": 0.23289050161838531, "learning_rate": 0.0002889689312668476, "loss": 1.1566, "step": 469 }, { "epoch": 0.15, "grad_norm": 0.19105130434036255, "learning_rate": 0.00028890981012056564, "loss": 1.091, "step": 470 }, { "epoch": 0.15, "grad_norm": 0.20877230167388916, "learning_rate": 0.00028885053704805165, "loss": 1.1959, "step": 471 }, { "epoch": 0.15, "grad_norm": 0.20549297332763672, "learning_rate": 0.00028879111211413286, "loss": 1.1099, "step": 472 }, { "epoch": 0.15, "grad_norm": 0.20245303213596344, "learning_rate": 0.0002887315353838026, "loss": 1.2561, "step": 473 }, { "epoch": 0.15, "grad_norm": 0.21010564267635345, "learning_rate": 0.0002886718069222201, "loss": 1.1789, "step": 474 }, { "epoch": 0.15, "grad_norm": 3.6655099391937256, "learning_rate": 0.0002886119267947107, "loss": 1.3839, "step": 475 }, { "epoch": 0.15, "grad_norm": 0.47599852085113525, "learning_rate": 0.0002885518950667656, "loss": 1.3919, "step": 476 }, { "epoch": 0.15, "grad_norm": 0.3773764669895172, "learning_rate": 0.00028849171180404164, "loss": 1.3646, "step": 477 }, { "epoch": 0.15, "grad_norm": 0.30505913496017456, "learning_rate": 0.0002884313770723615, "loss": 1.1402, "step": 478 }, { "epoch": 0.15, "grad_norm": 0.2129291146993637, "learning_rate": 0.0002883708909377136, "loss": 1.1944, "step": 479 }, { "epoch": 0.15, "grad_norm": 0.28025561571121216, "learning_rate": 0.0002883102534662518, "loss": 1.3743, "step": 480 }, { "epoch": 0.15, "grad_norm": 0.2301461100578308, "learning_rate": 0.0002882494647242956, "loss": 1.053, "step": 481 }, { "epoch": 0.16, "grad_norm": 0.22085820138454437, "learning_rate": 0.0002881885247783299, "loss": 1.4372, "step": 482 }, { "epoch": 0.16, "grad_norm": 0.22393077611923218, "learning_rate": 0.00028812743369500493, "loss": 1.0942, "step": 483 }, { "epoch": 0.16, "grad_norm": 0.23788024485111237, "learning_rate": 0.00028806619154113627, "loss": 1.1427, "step": 484 }, { "epoch": 0.16, "grad_norm": 0.22877907752990723, "learning_rate": 0.00028800479838370476, "loss": 1.2523, "step": 485 }, { "epoch": 0.16, "grad_norm": 0.2150229513645172, "learning_rate": 0.0002879432542898563, "loss": 0.958, "step": 486 }, { "epoch": 0.16, "grad_norm": 0.22865931689739227, "learning_rate": 0.00028788155932690194, "loss": 1.2415, "step": 487 }, { "epoch": 0.16, "grad_norm": 0.2687344551086426, "learning_rate": 0.0002878197135623177, "loss": 1.2761, "step": 488 }, { "epoch": 0.16, "grad_norm": 0.23295961320400238, "learning_rate": 0.0002877577170637446, "loss": 1.2926, "step": 489 }, { "epoch": 0.16, "grad_norm": 0.24820323288440704, "learning_rate": 0.00028769556989898844, "loss": 1.2386, "step": 490 }, { "epoch": 0.16, "grad_norm": 0.30020925402641296, "learning_rate": 0.00028763327213601975, "loss": 1.4408, "step": 491 }, { "epoch": 0.16, "grad_norm": 0.18680036067962646, "learning_rate": 0.000287570823842974, "loss": 1.1552, "step": 492 }, { "epoch": 0.16, "grad_norm": 0.29228851199150085, "learning_rate": 0.000287508225088151, "loss": 1.2051, "step": 493 }, { "epoch": 0.16, "grad_norm": 0.22396889328956604, "learning_rate": 0.00028744547594001537, "loss": 1.3138, "step": 494 }, { "epoch": 0.16, "grad_norm": 0.20433083176612854, "learning_rate": 0.000287382576467196, "loss": 1.2362, "step": 495 }, { "epoch": 0.16, "grad_norm": 0.2335730344057083, "learning_rate": 0.00028731952673848645, "loss": 1.4049, "step": 496 }, { "epoch": 0.16, "grad_norm": 0.22156302630901337, "learning_rate": 0.0002872563268228444, "loss": 1.2176, "step": 497 }, { "epoch": 0.16, "grad_norm": 0.23028746247291565, "learning_rate": 0.00028719297678939185, "loss": 1.1029, "step": 498 }, { "epoch": 0.16, "grad_norm": 0.19861739873886108, "learning_rate": 0.0002871294767074149, "loss": 1.1561, "step": 499 }, { "epoch": 0.16, "grad_norm": 0.23160377144813538, "learning_rate": 0.0002870658266463641, "loss": 1.3063, "step": 500 }, { "epoch": 0.16, "grad_norm": 0.20824435353279114, "learning_rate": 0.0002870020266758535, "loss": 1.1921, "step": 501 }, { "epoch": 0.16, "grad_norm": 0.21031267940998077, "learning_rate": 0.00028693807686566155, "loss": 1.1372, "step": 502 }, { "epoch": 0.16, "grad_norm": 0.2732871472835541, "learning_rate": 0.00028687397728573047, "loss": 1.2479, "step": 503 }, { "epoch": 0.16, "grad_norm": 0.27661120891571045, "learning_rate": 0.00028680972800616605, "loss": 1.0956, "step": 504 }, { "epoch": 0.16, "grad_norm": 0.29317259788513184, "learning_rate": 0.0002867453290972382, "loss": 1.3523, "step": 505 }, { "epoch": 0.16, "grad_norm": 0.2287442982196808, "learning_rate": 0.0002866807806293801, "loss": 1.3562, "step": 506 }, { "epoch": 0.16, "grad_norm": 0.2164156436920166, "learning_rate": 0.0002866160826731888, "loss": 1.2273, "step": 507 }, { "epoch": 0.16, "grad_norm": 0.2253824770450592, "learning_rate": 0.0002865512352994246, "loss": 1.213, "step": 508 }, { "epoch": 0.16, "grad_norm": 0.23474432528018951, "learning_rate": 0.00028648623857901144, "loss": 1.2714, "step": 509 }, { "epoch": 0.16, "grad_norm": 0.2302689403295517, "learning_rate": 0.0002864210925830364, "loss": 1.3074, "step": 510 }, { "epoch": 0.16, "grad_norm": 0.23234087228775024, "learning_rate": 0.0002863557973827501, "loss": 1.1666, "step": 511 }, { "epoch": 0.16, "grad_norm": 0.25001204013824463, "learning_rate": 0.00028629035304956606, "loss": 1.154, "step": 512 }, { "epoch": 0.17, "grad_norm": 0.2037460207939148, "learning_rate": 0.000286224759655061, "loss": 1.1915, "step": 513 }, { "epoch": 0.17, "grad_norm": 0.18487860262393951, "learning_rate": 0.00028615901727097466, "loss": 0.9837, "step": 514 }, { "epoch": 0.17, "grad_norm": 0.24634243547916412, "learning_rate": 0.00028609312596920985, "loss": 0.9706, "step": 515 }, { "epoch": 0.17, "grad_norm": 0.24692267179489136, "learning_rate": 0.00028602708582183207, "loss": 1.1653, "step": 516 }, { "epoch": 0.17, "grad_norm": 0.2324896901845932, "learning_rate": 0.0002859608969010698, "loss": 1.0269, "step": 517 }, { "epoch": 0.17, "grad_norm": 0.22957377135753632, "learning_rate": 0.000285894559279314, "loss": 1.083, "step": 518 }, { "epoch": 0.17, "grad_norm": 0.24322019517421722, "learning_rate": 0.0002858280730291185, "loss": 1.0955, "step": 519 }, { "epoch": 0.17, "grad_norm": 0.1990022212266922, "learning_rate": 0.00028576143822319963, "loss": 1.4198, "step": 520 }, { "epoch": 0.17, "grad_norm": 0.22350846230983734, "learning_rate": 0.000285694654934436, "loss": 1.165, "step": 521 }, { "epoch": 0.17, "grad_norm": 0.2614850401878357, "learning_rate": 0.0002856277232358689, "loss": 1.3033, "step": 522 }, { "epoch": 0.17, "grad_norm": 0.2266923487186432, "learning_rate": 0.00028556064320070176, "loss": 1.3283, "step": 523 }, { "epoch": 0.17, "grad_norm": 0.21300069987773895, "learning_rate": 0.0002854934149023003, "loss": 1.11, "step": 524 }, { "epoch": 0.17, "grad_norm": 0.1873844563961029, "learning_rate": 0.0002854260384141924, "loss": 1.1823, "step": 525 }, { "epoch": 0.17, "grad_norm": 0.22557958960533142, "learning_rate": 0.0002853585138100679, "loss": 1.1951, "step": 526 }, { "epoch": 0.17, "grad_norm": 0.21490180492401123, "learning_rate": 0.0002852908411637789, "loss": 1.2679, "step": 527 }, { "epoch": 0.17, "grad_norm": 0.24320073425769806, "learning_rate": 0.0002852230205493391, "loss": 1.2406, "step": 528 }, { "epoch": 0.17, "grad_norm": 0.23181137442588806, "learning_rate": 0.0002851550520409243, "loss": 1.1723, "step": 529 }, { "epoch": 0.17, "grad_norm": 0.19753852486610413, "learning_rate": 0.00028508693571287186, "loss": 1.2953, "step": 530 }, { "epoch": 0.17, "grad_norm": 0.2239779531955719, "learning_rate": 0.00028501867163968094, "loss": 1.3735, "step": 531 }, { "epoch": 0.17, "grad_norm": 0.19497892260551453, "learning_rate": 0.0002849502598960122, "loss": 1.2069, "step": 532 }, { "epoch": 0.17, "grad_norm": 0.23042428493499756, "learning_rate": 0.0002848817005566879, "loss": 1.3765, "step": 533 }, { "epoch": 0.17, "grad_norm": 0.18199646472930908, "learning_rate": 0.0002848129936966916, "loss": 1.0587, "step": 534 }, { "epoch": 0.17, "grad_norm": 0.26894697546958923, "learning_rate": 0.00028474413939116834, "loss": 1.215, "step": 535 }, { "epoch": 0.17, "grad_norm": 0.2242213785648346, "learning_rate": 0.00028467513771542437, "loss": 1.3603, "step": 536 }, { "epoch": 0.17, "grad_norm": 0.24185891449451447, "learning_rate": 0.00028460598874492703, "loss": 1.3761, "step": 537 }, { "epoch": 0.17, "grad_norm": 0.20893679559230804, "learning_rate": 0.0002845366925553049, "loss": 1.4145, "step": 538 }, { "epoch": 0.17, "grad_norm": 0.23005111515522003, "learning_rate": 0.0002844672492223474, "loss": 1.1952, "step": 539 }, { "epoch": 0.17, "grad_norm": 0.20279347896575928, "learning_rate": 0.0002843976588220052, "loss": 1.4025, "step": 540 }, { "epoch": 0.17, "grad_norm": 0.25496912002563477, "learning_rate": 0.00028432792143038936, "loss": 1.4541, "step": 541 }, { "epoch": 0.17, "grad_norm": 0.23914286494255066, "learning_rate": 0.00028425803712377217, "loss": 1.1568, "step": 542 }, { "epoch": 0.17, "grad_norm": 0.22152502834796906, "learning_rate": 0.0002841880059785862, "loss": 1.1349, "step": 543 }, { "epoch": 0.18, "grad_norm": 0.2096378356218338, "learning_rate": 0.0002841178280714249, "loss": 1.5316, "step": 544 }, { "epoch": 0.18, "grad_norm": 0.2166428118944168, "learning_rate": 0.00028404750347904215, "loss": 0.9993, "step": 545 }, { "epoch": 0.18, "grad_norm": 0.2531513273715973, "learning_rate": 0.00028397703227835223, "loss": 1.1697, "step": 546 }, { "epoch": 0.18, "grad_norm": 0.1891101747751236, "learning_rate": 0.0002839064145464298, "loss": 1.0851, "step": 547 }, { "epoch": 0.18, "grad_norm": 0.41961485147476196, "learning_rate": 0.0002838356503605097, "loss": 1.2569, "step": 548 }, { "epoch": 0.18, "grad_norm": 0.228632852435112, "learning_rate": 0.0002837647397979871, "loss": 1.2693, "step": 549 }, { "epoch": 0.18, "grad_norm": 0.22972503304481506, "learning_rate": 0.00028369368293641706, "loss": 1.353, "step": 550 }, { "epoch": 0.18, "grad_norm": 0.23591460287570953, "learning_rate": 0.00028362247985351486, "loss": 1.2205, "step": 551 }, { "epoch": 0.18, "grad_norm": 0.263882577419281, "learning_rate": 0.00028355113062715557, "loss": 1.1425, "step": 552 }, { "epoch": 0.18, "grad_norm": 0.2365957796573639, "learning_rate": 0.0002834796353353741, "loss": 1.3896, "step": 553 }, { "epoch": 0.18, "grad_norm": 0.22514571249485016, "learning_rate": 0.0002834079940563652, "loss": 1.2593, "step": 554 }, { "epoch": 0.18, "grad_norm": 0.2720796763896942, "learning_rate": 0.00028333620686848325, "loss": 1.4877, "step": 555 }, { "epoch": 0.18, "grad_norm": 0.21377746760845184, "learning_rate": 0.00028326427385024216, "loss": 1.244, "step": 556 }, { "epoch": 0.18, "grad_norm": 0.23723536729812622, "learning_rate": 0.0002831921950803153, "loss": 1.2696, "step": 557 }, { "epoch": 0.18, "grad_norm": 0.21046771109104156, "learning_rate": 0.0002831199706375356, "loss": 1.0809, "step": 558 }, { "epoch": 0.18, "grad_norm": 0.2552836835384369, "learning_rate": 0.0002830476006008952, "loss": 1.331, "step": 559 }, { "epoch": 0.18, "grad_norm": 0.2433222234249115, "learning_rate": 0.00028297508504954557, "loss": 1.3104, "step": 560 }, { "epoch": 0.18, "grad_norm": 0.20975928008556366, "learning_rate": 0.0002829024240627972, "loss": 1.2201, "step": 561 }, { "epoch": 0.18, "grad_norm": 0.23376978933811188, "learning_rate": 0.0002828296177201198, "loss": 1.297, "step": 562 }, { "epoch": 0.18, "grad_norm": 0.37278518080711365, "learning_rate": 0.0002827566661011418, "loss": 1.3191, "step": 563 }, { "epoch": 0.18, "grad_norm": 0.2397356629371643, "learning_rate": 0.00028268356928565084, "loss": 1.2221, "step": 564 }, { "epoch": 0.18, "grad_norm": 0.24155579507350922, "learning_rate": 0.0002826103273535931, "loss": 1.1461, "step": 565 }, { "epoch": 0.18, "grad_norm": 0.2508285641670227, "learning_rate": 0.0002825369403850737, "loss": 1.2571, "step": 566 }, { "epoch": 0.18, "grad_norm": 0.23085041344165802, "learning_rate": 0.00028246340846035614, "loss": 1.4272, "step": 567 }, { "epoch": 0.18, "grad_norm": 0.27458062767982483, "learning_rate": 0.00028238973165986266, "loss": 1.283, "step": 568 }, { "epoch": 0.18, "grad_norm": 0.19682402908802032, "learning_rate": 0.0002823159100641739, "loss": 1.1658, "step": 569 }, { "epoch": 0.18, "grad_norm": 0.24419572949409485, "learning_rate": 0.0002822419437540288, "loss": 1.1962, "step": 570 }, { "epoch": 0.18, "grad_norm": 0.2702164947986603, "learning_rate": 0.00028216783281032466, "loss": 1.0576, "step": 571 }, { "epoch": 0.18, "grad_norm": 0.196401908993721, "learning_rate": 0.0002820935773141169, "loss": 1.2563, "step": 572 }, { "epoch": 0.18, "grad_norm": 0.22208736836910248, "learning_rate": 0.00028201917734661905, "loss": 1.4583, "step": 573 }, { "epoch": 0.18, "grad_norm": 0.18070030212402344, "learning_rate": 0.0002819446329892027, "loss": 1.2858, "step": 574 }, { "epoch": 0.19, "grad_norm": 0.20115630328655243, "learning_rate": 0.0002818699443233972, "loss": 1.2491, "step": 575 }, { "epoch": 0.19, "grad_norm": 0.2135741114616394, "learning_rate": 0.00028179511143089006, "loss": 0.9851, "step": 576 }, { "epoch": 0.19, "grad_norm": 0.2074771225452423, "learning_rate": 0.00028172013439352615, "loss": 1.3028, "step": 577 }, { "epoch": 0.19, "grad_norm": 0.211680069565773, "learning_rate": 0.0002816450132933083, "loss": 1.13, "step": 578 }, { "epoch": 0.19, "grad_norm": 0.4100401699542999, "learning_rate": 0.0002815697482123966, "loss": 1.5572, "step": 579 }, { "epoch": 0.19, "grad_norm": 0.19802726805210114, "learning_rate": 0.0002814943392331089, "loss": 1.183, "step": 580 }, { "epoch": 0.19, "grad_norm": 0.1951359212398529, "learning_rate": 0.00028141878643792034, "loss": 1.0568, "step": 581 }, { "epoch": 0.19, "grad_norm": 0.2577122151851654, "learning_rate": 0.0002813430899094632, "loss": 1.2056, "step": 582 }, { "epoch": 0.19, "grad_norm": 0.22231917083263397, "learning_rate": 0.0002812672497305272, "loss": 1.4132, "step": 583 }, { "epoch": 0.19, "grad_norm": 0.2275608628988266, "learning_rate": 0.00028119126598405897, "loss": 1.4328, "step": 584 }, { "epoch": 0.19, "grad_norm": 0.20136159658432007, "learning_rate": 0.0002811151387531623, "loss": 1.1699, "step": 585 }, { "epoch": 0.19, "grad_norm": 0.2209021896123886, "learning_rate": 0.0002810388681210978, "loss": 1.3727, "step": 586 }, { "epoch": 0.19, "grad_norm": 0.20952655375003815, "learning_rate": 0.00028096245417128304, "loss": 1.154, "step": 587 }, { "epoch": 0.19, "grad_norm": 0.2137332707643509, "learning_rate": 0.0002808858969872923, "loss": 1.2577, "step": 588 }, { "epoch": 0.19, "grad_norm": 0.23967716097831726, "learning_rate": 0.0002808091966528564, "loss": 1.3433, "step": 589 }, { "epoch": 0.19, "grad_norm": 0.23651421070098877, "learning_rate": 0.00028073235325186283, "loss": 1.2943, "step": 590 }, { "epoch": 0.19, "grad_norm": 0.21847447752952576, "learning_rate": 0.00028065536686835556, "loss": 1.0489, "step": 591 }, { "epoch": 0.19, "grad_norm": 0.18677181005477905, "learning_rate": 0.0002805782375865349, "loss": 1.2383, "step": 592 }, { "epoch": 0.19, "grad_norm": 0.209944948554039, "learning_rate": 0.0002805009654907575, "loss": 1.1979, "step": 593 }, { "epoch": 0.19, "grad_norm": 0.2282859981060028, "learning_rate": 0.00028042355066553614, "loss": 1.2325, "step": 594 }, { "epoch": 0.19, "grad_norm": 0.2366640418767929, "learning_rate": 0.0002803459931955397, "loss": 1.3469, "step": 595 }, { "epoch": 0.19, "grad_norm": 0.2041768729686737, "learning_rate": 0.0002802682931655931, "loss": 1.256, "step": 596 }, { "epoch": 0.19, "grad_norm": 0.2036171555519104, "learning_rate": 0.0002801904506606773, "loss": 1.2949, "step": 597 }, { "epoch": 0.19, "grad_norm": 0.30637267231941223, "learning_rate": 0.00028011246576592887, "loss": 1.3685, "step": 598 }, { "epoch": 0.19, "grad_norm": 0.1900072693824768, "learning_rate": 0.00028003433856664024, "loss": 1.1495, "step": 599 }, { "epoch": 0.19, "grad_norm": 0.2228790521621704, "learning_rate": 0.0002799560691482594, "loss": 1.2601, "step": 600 }, { "epoch": 0.19, "grad_norm": 0.22529004514217377, "learning_rate": 0.00027987765759639004, "loss": 1.1352, "step": 601 }, { "epoch": 0.19, "grad_norm": 0.21611414849758148, "learning_rate": 0.0002797991039967911, "loss": 1.2765, "step": 602 }, { "epoch": 0.19, "grad_norm": 0.24556797742843628, "learning_rate": 0.000279720408435377, "loss": 0.9106, "step": 603 }, { "epoch": 0.19, "grad_norm": 0.5431991219520569, "learning_rate": 0.0002796415709982174, "loss": 1.1226, "step": 604 }, { "epoch": 0.19, "grad_norm": 0.22893905639648438, "learning_rate": 0.00027956259177153717, "loss": 1.3838, "step": 605 }, { "epoch": 0.2, "grad_norm": 0.22664658725261688, "learning_rate": 0.00027948347084171617, "loss": 1.2929, "step": 606 }, { "epoch": 0.2, "grad_norm": 0.25395622849464417, "learning_rate": 0.0002794042082952893, "loss": 1.4142, "step": 607 }, { "epoch": 0.2, "grad_norm": 0.20377038419246674, "learning_rate": 0.00027932480421894633, "loss": 1.0624, "step": 608 }, { "epoch": 0.2, "grad_norm": 0.24217286705970764, "learning_rate": 0.0002792452586995319, "loss": 1.3196, "step": 609 }, { "epoch": 0.2, "grad_norm": 0.18047085404396057, "learning_rate": 0.0002791655718240451, "loss": 1.245, "step": 610 }, { "epoch": 0.2, "grad_norm": 0.25788384675979614, "learning_rate": 0.00027908574367964, "loss": 1.2231, "step": 611 }, { "epoch": 0.2, "grad_norm": 0.24533262848854065, "learning_rate": 0.0002790057743536248, "loss": 1.1873, "step": 612 }, { "epoch": 0.2, "grad_norm": 0.22649244964122772, "learning_rate": 0.0002789256639334623, "loss": 1.3529, "step": 613 }, { "epoch": 0.2, "grad_norm": 0.19514693319797516, "learning_rate": 0.00027884541250676974, "loss": 1.0977, "step": 614 }, { "epoch": 0.2, "grad_norm": 0.19251416623592377, "learning_rate": 0.00027876502016131835, "loss": 1.0891, "step": 615 }, { "epoch": 0.2, "grad_norm": 0.20709547400474548, "learning_rate": 0.00027868448698503353, "loss": 1.1931, "step": 616 }, { "epoch": 0.2, "grad_norm": 0.21667586266994476, "learning_rate": 0.00027860381306599484, "loss": 1.4682, "step": 617 }, { "epoch": 0.2, "grad_norm": 0.20012988150119781, "learning_rate": 0.0002785229984924355, "loss": 1.3497, "step": 618 }, { "epoch": 0.2, "grad_norm": 0.21917125582695007, "learning_rate": 0.00027844204335274295, "loss": 1.2314, "step": 619 }, { "epoch": 0.2, "grad_norm": 0.2225695103406906, "learning_rate": 0.000278360947735458, "loss": 1.2754, "step": 620 }, { "epoch": 0.2, "grad_norm": 0.19350537657737732, "learning_rate": 0.00027827971172927543, "loss": 1.3414, "step": 621 }, { "epoch": 0.2, "grad_norm": 0.20780587196350098, "learning_rate": 0.00027819833542304324, "loss": 1.2774, "step": 622 }, { "epoch": 0.2, "grad_norm": 0.2697571814060211, "learning_rate": 0.0002781168189057631, "loss": 1.3928, "step": 623 }, { "epoch": 0.2, "grad_norm": 0.23526206612586975, "learning_rate": 0.0002780351622665899, "loss": 1.3114, "step": 624 }, { "epoch": 0.2, "grad_norm": 0.2090139091014862, "learning_rate": 0.00027795336559483197, "loss": 1.1558, "step": 625 }, { "epoch": 0.2, "grad_norm": 0.549271285533905, "learning_rate": 0.00027787142897995053, "loss": 1.4257, "step": 626 }, { "epoch": 0.2, "grad_norm": 0.2010613977909088, "learning_rate": 0.0002777893525115601, "loss": 1.2282, "step": 627 }, { "epoch": 0.2, "grad_norm": 0.19325284659862518, "learning_rate": 0.00027770713627942793, "loss": 1.0565, "step": 628 }, { "epoch": 0.2, "grad_norm": 0.24261559545993805, "learning_rate": 0.00027762478037347436, "loss": 1.1929, "step": 629 }, { "epoch": 0.2, "grad_norm": 0.3264019191265106, "learning_rate": 0.00027754228488377237, "loss": 0.8698, "step": 630 }, { "epoch": 0.2, "grad_norm": 0.20387952029705048, "learning_rate": 0.00027745964990054763, "loss": 1.0313, "step": 631 }, { "epoch": 0.2, "grad_norm": 0.24064399302005768, "learning_rate": 0.00027737687551417827, "loss": 1.3957, "step": 632 }, { "epoch": 0.2, "grad_norm": 0.31109580397605896, "learning_rate": 0.00027729396181519505, "loss": 1.346, "step": 633 }, { "epoch": 0.2, "grad_norm": 0.19490142166614532, "learning_rate": 0.0002772109088942811, "loss": 1.0407, "step": 634 }, { "epoch": 0.2, "grad_norm": 0.2684940993785858, "learning_rate": 0.0002771277168422715, "loss": 1.1964, "step": 635 }, { "epoch": 0.2, "grad_norm": 0.253006249666214, "learning_rate": 0.00027704438575015393, "loss": 1.2935, "step": 636 }, { "epoch": 0.21, "grad_norm": 0.2332218438386917, "learning_rate": 0.000276960915709068, "loss": 1.1798, "step": 637 }, { "epoch": 0.21, "grad_norm": 0.21353299915790558, "learning_rate": 0.0002768773068103051, "loss": 1.1086, "step": 638 }, { "epoch": 0.21, "grad_norm": 0.2249504029750824, "learning_rate": 0.00027679355914530867, "loss": 1.419, "step": 639 }, { "epoch": 0.21, "grad_norm": 0.36277222633361816, "learning_rate": 0.0002767096728056739, "loss": 1.2521, "step": 640 }, { "epoch": 0.21, "grad_norm": 0.2038441151380539, "learning_rate": 0.0002766256478831476, "loss": 1.2068, "step": 641 }, { "epoch": 0.21, "grad_norm": 0.264575719833374, "learning_rate": 0.0002765414844696282, "loss": 1.1429, "step": 642 }, { "epoch": 0.21, "grad_norm": 0.21586742997169495, "learning_rate": 0.00027645718265716557, "loss": 1.1446, "step": 643 }, { "epoch": 0.21, "grad_norm": 0.24729059636592865, "learning_rate": 0.0002763727425379609, "loss": 1.153, "step": 644 }, { "epoch": 0.21, "grad_norm": 0.25709933042526245, "learning_rate": 0.0002762881642043667, "loss": 1.3528, "step": 645 }, { "epoch": 0.21, "grad_norm": 0.19542774558067322, "learning_rate": 0.0002762034477488867, "loss": 1.0193, "step": 646 }, { "epoch": 0.21, "grad_norm": 0.2279549241065979, "learning_rate": 0.00027611859326417563, "loss": 1.2969, "step": 647 }, { "epoch": 0.21, "grad_norm": 0.2092573642730713, "learning_rate": 0.0002760336008430392, "loss": 1.0363, "step": 648 }, { "epoch": 0.21, "grad_norm": 0.28551411628723145, "learning_rate": 0.0002759484705784339, "loss": 1.5038, "step": 649 }, { "epoch": 0.21, "grad_norm": 0.2185361236333847, "learning_rate": 0.00027586320256346714, "loss": 1.2201, "step": 650 }, { "epoch": 0.21, "grad_norm": 0.22983217239379883, "learning_rate": 0.0002757777968913968, "loss": 1.2232, "step": 651 }, { "epoch": 0.21, "grad_norm": 0.37281855940818787, "learning_rate": 0.0002756922536556315, "loss": 1.342, "step": 652 }, { "epoch": 0.21, "grad_norm": 0.21394875645637512, "learning_rate": 0.00027560657294973016, "loss": 1.3993, "step": 653 }, { "epoch": 0.21, "grad_norm": 0.21211597323417664, "learning_rate": 0.0002755207548674022, "loss": 1.0538, "step": 654 }, { "epoch": 0.21, "grad_norm": 0.2468247264623642, "learning_rate": 0.0002754347995025072, "loss": 1.3049, "step": 655 }, { "epoch": 0.21, "grad_norm": 0.216883584856987, "learning_rate": 0.00027534870694905476, "loss": 1.1871, "step": 656 }, { "epoch": 0.21, "grad_norm": 0.24510933458805084, "learning_rate": 0.0002752624773012049, "loss": 1.1312, "step": 657 }, { "epoch": 0.21, "grad_norm": 0.23576533794403076, "learning_rate": 0.0002751761106532671, "loss": 1.468, "step": 658 }, { "epoch": 0.21, "grad_norm": 0.26149120926856995, "learning_rate": 0.0002750896070997011, "loss": 1.2832, "step": 659 }, { "epoch": 0.21, "grad_norm": 0.23776665329933167, "learning_rate": 0.00027500296673511605, "loss": 1.3336, "step": 660 }, { "epoch": 0.21, "grad_norm": 0.2501676678657532, "learning_rate": 0.000274916189654271, "loss": 1.2047, "step": 661 }, { "epoch": 0.21, "grad_norm": 0.24051010608673096, "learning_rate": 0.0002748292759520743, "loss": 1.2989, "step": 662 }, { "epoch": 0.21, "grad_norm": 0.2298332005739212, "learning_rate": 0.00027474222572358394, "loss": 1.1623, "step": 663 }, { "epoch": 0.21, "grad_norm": 0.18832460045814514, "learning_rate": 0.000274655039064007, "loss": 1.0561, "step": 664 }, { "epoch": 0.21, "grad_norm": 0.26167237758636475, "learning_rate": 0.00027456771606869997, "loss": 1.4276, "step": 665 }, { "epoch": 0.21, "grad_norm": 0.22198782861232758, "learning_rate": 0.0002744802568331683, "loss": 1.1954, "step": 666 }, { "epoch": 0.21, "grad_norm": 0.2386026233434677, "learning_rate": 0.0002743926614530665, "loss": 1.1591, "step": 667 }, { "epoch": 0.22, "grad_norm": 0.22878511250019073, "learning_rate": 0.0002743049300241982, "loss": 1.4905, "step": 668 }, { "epoch": 0.22, "grad_norm": 0.24387897551059723, "learning_rate": 0.00027421706264251535, "loss": 1.0752, "step": 669 }, { "epoch": 0.22, "grad_norm": 0.23252733051776886, "learning_rate": 0.000274129059404119, "loss": 1.1386, "step": 670 }, { "epoch": 0.22, "grad_norm": 0.26237496733665466, "learning_rate": 0.00027404092040525865, "loss": 1.2962, "step": 671 }, { "epoch": 0.22, "grad_norm": 0.1965530663728714, "learning_rate": 0.0002739526457423323, "loss": 1.253, "step": 672 }, { "epoch": 0.22, "grad_norm": 0.2224564254283905, "learning_rate": 0.0002738642355118863, "loss": 1.3266, "step": 673 }, { "epoch": 0.22, "grad_norm": 0.22237393260002136, "learning_rate": 0.0002737756898106153, "loss": 1.1112, "step": 674 }, { "epoch": 0.22, "grad_norm": 0.24572043120861053, "learning_rate": 0.00027368700873536204, "loss": 1.2372, "step": 675 }, { "epoch": 0.22, "grad_norm": 0.22195427119731903, "learning_rate": 0.0002735981923831174, "loss": 1.3602, "step": 676 }, { "epoch": 0.22, "grad_norm": 0.24270844459533691, "learning_rate": 0.00027350924085102024, "loss": 1.3074, "step": 677 }, { "epoch": 0.22, "grad_norm": 0.2195555865764618, "learning_rate": 0.0002734201542363572, "loss": 1.1744, "step": 678 }, { "epoch": 0.22, "grad_norm": 0.21969103813171387, "learning_rate": 0.0002733309326365626, "loss": 1.1372, "step": 679 }, { "epoch": 0.22, "grad_norm": 0.41213974356651306, "learning_rate": 0.0002732415761492186, "loss": 1.1307, "step": 680 }, { "epoch": 0.22, "grad_norm": 0.24672943353652954, "learning_rate": 0.0002731520848720546, "loss": 1.1619, "step": 681 }, { "epoch": 0.22, "grad_norm": 0.23280732333660126, "learning_rate": 0.0002730624589029478, "loss": 1.0978, "step": 682 }, { "epoch": 0.22, "grad_norm": 0.2617351710796356, "learning_rate": 0.00027297269833992225, "loss": 1.3163, "step": 683 }, { "epoch": 0.22, "grad_norm": 0.27428269386291504, "learning_rate": 0.00027288280328114956, "loss": 1.2799, "step": 684 }, { "epoch": 0.22, "grad_norm": 0.3015119135379791, "learning_rate": 0.0002727927738249484, "loss": 1.2495, "step": 685 }, { "epoch": 0.22, "grad_norm": 0.1831410676240921, "learning_rate": 0.00027270261006978423, "loss": 1.0932, "step": 686 }, { "epoch": 0.22, "grad_norm": 0.27450791001319885, "learning_rate": 0.0002726123121142696, "loss": 1.2597, "step": 687 }, { "epoch": 0.22, "grad_norm": 0.2232239842414856, "learning_rate": 0.0002725218800571637, "loss": 1.2054, "step": 688 }, { "epoch": 0.22, "grad_norm": 0.22805006802082062, "learning_rate": 0.00027243131399737254, "loss": 1.2826, "step": 689 }, { "epoch": 0.22, "grad_norm": 0.2222464680671692, "learning_rate": 0.00027234061403394845, "loss": 1.0484, "step": 690 }, { "epoch": 0.22, "grad_norm": 0.22322127223014832, "learning_rate": 0.0002722497802660904, "loss": 1.2485, "step": 691 }, { "epoch": 0.22, "grad_norm": 0.24204668402671814, "learning_rate": 0.0002721588127931438, "loss": 1.163, "step": 692 }, { "epoch": 0.22, "grad_norm": 0.23898741602897644, "learning_rate": 0.0002720677117146, "loss": 1.469, "step": 693 }, { "epoch": 0.22, "grad_norm": 0.3404325544834137, "learning_rate": 0.0002719764771300966, "loss": 1.3656, "step": 694 }, { "epoch": 0.22, "grad_norm": 0.24955818057060242, "learning_rate": 0.00027188510913941737, "loss": 1.4558, "step": 695 }, { "epoch": 0.22, "grad_norm": 0.25114861130714417, "learning_rate": 0.00027179360784249177, "loss": 1.3945, "step": 696 }, { "epoch": 0.22, "grad_norm": 0.2791580855846405, "learning_rate": 0.0002717019733393952, "loss": 1.4187, "step": 697 }, { "epoch": 0.22, "grad_norm": 0.22418774664402008, "learning_rate": 0.0002716102057303486, "loss": 1.2726, "step": 698 }, { "epoch": 0.23, "grad_norm": 0.19569823145866394, "learning_rate": 0.00027151830511571866, "loss": 1.1979, "step": 699 }, { "epoch": 0.23, "grad_norm": 0.23457087576389313, "learning_rate": 0.00027142627159601744, "loss": 1.414, "step": 700 }, { "epoch": 0.23, "grad_norm": 0.21526119112968445, "learning_rate": 0.00027133410527190244, "loss": 1.2784, "step": 701 }, { "epoch": 0.23, "grad_norm": 0.20960667729377747, "learning_rate": 0.00027124180624417627, "loss": 1.1152, "step": 702 }, { "epoch": 0.23, "grad_norm": 0.28942322731018066, "learning_rate": 0.00027114937461378676, "loss": 1.2852, "step": 703 }, { "epoch": 0.23, "grad_norm": 0.22690434753894806, "learning_rate": 0.00027105681048182677, "loss": 1.1896, "step": 704 }, { "epoch": 0.23, "grad_norm": 0.21796655654907227, "learning_rate": 0.00027096411394953406, "loss": 1.1894, "step": 705 }, { "epoch": 0.23, "grad_norm": 0.21171554923057556, "learning_rate": 0.0002708712851182913, "loss": 1.2469, "step": 706 }, { "epoch": 0.23, "grad_norm": 0.25982657074928284, "learning_rate": 0.0002707783240896257, "loss": 1.0578, "step": 707 }, { "epoch": 0.23, "grad_norm": 0.20410950481891632, "learning_rate": 0.0002706852309652091, "loss": 1.0638, "step": 708 }, { "epoch": 0.23, "grad_norm": 0.3087175190448761, "learning_rate": 0.0002705920058468579, "loss": 1.3769, "step": 709 }, { "epoch": 0.23, "grad_norm": 0.22283148765563965, "learning_rate": 0.00027049864883653266, "loss": 1.0876, "step": 710 }, { "epoch": 0.23, "grad_norm": 0.25557127594947815, "learning_rate": 0.0002704051600363385, "loss": 1.2994, "step": 711 }, { "epoch": 0.23, "grad_norm": 0.189194917678833, "learning_rate": 0.00027031153954852437, "loss": 1.218, "step": 712 }, { "epoch": 0.23, "grad_norm": 0.6964691877365112, "learning_rate": 0.00027021778747548354, "loss": 1.2773, "step": 713 }, { "epoch": 0.23, "grad_norm": 0.20203107595443726, "learning_rate": 0.00027012390391975284, "loss": 1.2887, "step": 714 }, { "epoch": 0.23, "grad_norm": 0.26938050985336304, "learning_rate": 0.0002700298889840132, "loss": 1.0133, "step": 715 }, { "epoch": 0.23, "grad_norm": 0.43010213971138, "learning_rate": 0.0002699357427710892, "loss": 1.4529, "step": 716 }, { "epoch": 0.23, "grad_norm": 0.4410056173801422, "learning_rate": 0.0002698414653839489, "loss": 1.2619, "step": 717 }, { "epoch": 0.23, "grad_norm": 0.2188965082168579, "learning_rate": 0.00026974705692570377, "loss": 1.1752, "step": 718 }, { "epoch": 0.23, "grad_norm": 0.21618852019309998, "learning_rate": 0.0002696525174996089, "loss": 1.3304, "step": 719 }, { "epoch": 0.23, "grad_norm": 0.25104033946990967, "learning_rate": 0.00026955784720906226, "loss": 1.2013, "step": 720 }, { "epoch": 0.23, "grad_norm": 0.23166468739509583, "learning_rate": 0.00026946304615760536, "loss": 1.1764, "step": 721 }, { "epoch": 0.23, "grad_norm": 0.26862502098083496, "learning_rate": 0.00026936811444892226, "loss": 1.1918, "step": 722 }, { "epoch": 0.23, "grad_norm": 0.20016691088676453, "learning_rate": 0.00026927305218684035, "loss": 1.0887, "step": 723 }, { "epoch": 0.23, "grad_norm": 0.21966229379177094, "learning_rate": 0.0002691778594753296, "loss": 1.4075, "step": 724 }, { "epoch": 0.23, "grad_norm": 0.2962917685508728, "learning_rate": 0.0002690825364185025, "loss": 1.321, "step": 725 }, { "epoch": 0.23, "grad_norm": 0.21434324979782104, "learning_rate": 0.0002689870831206145, "loss": 1.2813, "step": 726 }, { "epoch": 0.23, "grad_norm": 0.2470979392528534, "learning_rate": 0.0002688914996860631, "loss": 1.0972, "step": 727 }, { "epoch": 0.23, "grad_norm": 0.22922155261039734, "learning_rate": 0.0002687957862193883, "loss": 1.0853, "step": 728 }, { "epoch": 0.23, "grad_norm": 0.32598957419395447, "learning_rate": 0.00026869994282527245, "loss": 1.316, "step": 729 }, { "epoch": 0.24, "grad_norm": 0.23238208889961243, "learning_rate": 0.00026860396960853974, "loss": 1.1284, "step": 730 }, { "epoch": 0.24, "grad_norm": 0.2337213009595871, "learning_rate": 0.0002685078666741565, "loss": 1.2809, "step": 731 }, { "epoch": 0.24, "grad_norm": 0.2668529748916626, "learning_rate": 0.000268411634127231, "loss": 1.0953, "step": 732 }, { "epoch": 0.24, "grad_norm": 0.28542205691337585, "learning_rate": 0.000268315272073013, "loss": 1.0755, "step": 733 }, { "epoch": 0.24, "grad_norm": 0.20577193796634674, "learning_rate": 0.0002682187806168943, "loss": 1.0169, "step": 734 }, { "epoch": 0.24, "grad_norm": 0.2726905047893524, "learning_rate": 0.00026812215986440766, "loss": 1.2008, "step": 735 }, { "epoch": 0.24, "grad_norm": 0.33090198040008545, "learning_rate": 0.00026802540992122795, "loss": 1.2519, "step": 736 }, { "epoch": 0.24, "grad_norm": 0.28124040365219116, "learning_rate": 0.0002679285308931708, "loss": 1.5281, "step": 737 }, { "epoch": 0.24, "grad_norm": 0.22662505507469177, "learning_rate": 0.0002678315228861933, "loss": 1.4573, "step": 738 }, { "epoch": 0.24, "grad_norm": 0.2400646209716797, "learning_rate": 0.0002677343860063934, "loss": 1.0586, "step": 739 }, { "epoch": 0.24, "grad_norm": 0.2816977798938751, "learning_rate": 0.00026763712036001014, "loss": 1.1921, "step": 740 }, { "epoch": 0.24, "grad_norm": 0.22784672677516937, "learning_rate": 0.00026753972605342334, "loss": 1.4177, "step": 741 }, { "epoch": 0.24, "grad_norm": 0.2288491129875183, "learning_rate": 0.00026744220319315366, "loss": 1.3377, "step": 742 }, { "epoch": 0.24, "grad_norm": 0.31299129128456116, "learning_rate": 0.00026734455188586213, "loss": 1.2219, "step": 743 }, { "epoch": 0.24, "grad_norm": 0.2318633645772934, "learning_rate": 0.0002672467722383505, "loss": 1.3746, "step": 744 }, { "epoch": 0.24, "grad_norm": 0.21518418192863464, "learning_rate": 0.00026714886435756066, "loss": 1.0086, "step": 745 }, { "epoch": 0.24, "grad_norm": 0.22531038522720337, "learning_rate": 0.00026705082835057494, "loss": 1.3523, "step": 746 }, { "epoch": 0.24, "grad_norm": 0.22481463849544525, "learning_rate": 0.0002669526643246157, "loss": 1.2324, "step": 747 }, { "epoch": 0.24, "grad_norm": 0.23594602942466736, "learning_rate": 0.0002668543723870453, "loss": 1.1087, "step": 748 }, { "epoch": 0.24, "grad_norm": 0.26375705003738403, "learning_rate": 0.0002667559526453661, "loss": 1.4959, "step": 749 }, { "epoch": 0.24, "grad_norm": 0.2138039767742157, "learning_rate": 0.00026665740520722016, "loss": 1.2695, "step": 750 }, { "epoch": 0.24, "grad_norm": 0.20246140658855438, "learning_rate": 0.0002665587301803892, "loss": 1.2362, "step": 751 }, { "epoch": 0.24, "grad_norm": 0.22007615864276886, "learning_rate": 0.00026645992767279457, "loss": 1.287, "step": 752 }, { "epoch": 0.24, "grad_norm": 0.24630385637283325, "learning_rate": 0.00026636099779249684, "loss": 1.1899, "step": 753 }, { "epoch": 0.24, "grad_norm": 0.20941542088985443, "learning_rate": 0.0002662619406476962, "loss": 1.0778, "step": 754 }, { "epoch": 0.24, "grad_norm": 0.32671457529067993, "learning_rate": 0.0002661627563467317, "loss": 1.0053, "step": 755 }, { "epoch": 0.24, "grad_norm": 0.24226491153240204, "learning_rate": 0.0002660634449980817, "loss": 1.0671, "step": 756 }, { "epoch": 0.24, "grad_norm": 0.20549605786800385, "learning_rate": 0.0002659640067103635, "loss": 1.1184, "step": 757 }, { "epoch": 0.24, "grad_norm": 0.2574581205844879, "learning_rate": 0.00026586444159233305, "loss": 1.184, "step": 758 }, { "epoch": 0.24, "grad_norm": 0.2219834178686142, "learning_rate": 0.00026576474975288514, "loss": 1.2018, "step": 759 }, { "epoch": 0.24, "grad_norm": 0.24195437133312225, "learning_rate": 0.0002656649313010532, "loss": 1.5525, "step": 760 }, { "epoch": 0.25, "grad_norm": 0.22968795895576477, "learning_rate": 0.000265564986346009, "loss": 1.1708, "step": 761 }, { "epoch": 0.25, "grad_norm": 0.24150444567203522, "learning_rate": 0.0002654649149970629, "loss": 1.4033, "step": 762 }, { "epoch": 0.25, "grad_norm": 0.22888855636119843, "learning_rate": 0.0002653647173636632, "loss": 1.1344, "step": 763 }, { "epoch": 0.25, "grad_norm": 0.2143932580947876, "learning_rate": 0.0002652643935553965, "loss": 1.244, "step": 764 }, { "epoch": 0.25, "grad_norm": 0.21116125583648682, "learning_rate": 0.00026516394368198743, "loss": 1.1295, "step": 765 }, { "epoch": 0.25, "grad_norm": 0.22374720871448517, "learning_rate": 0.00026506336785329834, "loss": 1.2607, "step": 766 }, { "epoch": 0.25, "grad_norm": 0.22909872233867645, "learning_rate": 0.00026496266617932946, "loss": 1.3434, "step": 767 }, { "epoch": 0.25, "grad_norm": 0.23101361095905304, "learning_rate": 0.0002648618387702187, "loss": 1.3714, "step": 768 }, { "epoch": 0.25, "grad_norm": 0.23311373591423035, "learning_rate": 0.0002647608857362413, "loss": 1.3971, "step": 769 }, { "epoch": 0.25, "grad_norm": 0.2103973627090454, "learning_rate": 0.00026465980718781007, "loss": 1.208, "step": 770 }, { "epoch": 0.25, "grad_norm": 0.21689274907112122, "learning_rate": 0.000264558603235475, "loss": 1.2227, "step": 771 }, { "epoch": 0.25, "grad_norm": 0.19489361345767975, "learning_rate": 0.0002644572739899233, "loss": 1.2649, "step": 772 }, { "epoch": 0.25, "grad_norm": 0.22305546700954437, "learning_rate": 0.00026435581956197923, "loss": 1.3211, "step": 773 }, { "epoch": 0.25, "grad_norm": 0.222812220454216, "learning_rate": 0.0002642542400626038, "loss": 1.1703, "step": 774 }, { "epoch": 0.25, "grad_norm": 0.19476795196533203, "learning_rate": 0.000264152535602895, "loss": 1.3518, "step": 775 }, { "epoch": 0.25, "grad_norm": 0.20013649761676788, "learning_rate": 0.00026405070629408745, "loss": 1.4447, "step": 776 }, { "epoch": 0.25, "grad_norm": 0.34230348467826843, "learning_rate": 0.0002639487522475522, "loss": 1.3548, "step": 777 }, { "epoch": 0.25, "grad_norm": 0.22145825624465942, "learning_rate": 0.00026384667357479683, "loss": 1.1983, "step": 778 }, { "epoch": 0.25, "grad_norm": 0.22445926070213318, "learning_rate": 0.00026374447038746525, "loss": 1.2312, "step": 779 }, { "epoch": 0.25, "grad_norm": 0.2127223163843155, "learning_rate": 0.0002636421427973375, "loss": 1.159, "step": 780 }, { "epoch": 0.25, "grad_norm": 0.2383660078048706, "learning_rate": 0.00026353969091632965, "loss": 1.2289, "step": 781 }, { "epoch": 0.25, "grad_norm": 0.22692888975143433, "learning_rate": 0.00026343711485649383, "loss": 1.3239, "step": 782 }, { "epoch": 0.25, "grad_norm": 0.20642992854118347, "learning_rate": 0.0002633344147300178, "loss": 1.2478, "step": 783 }, { "epoch": 0.25, "grad_norm": 0.1938752681016922, "learning_rate": 0.0002632315906492251, "loss": 1.1085, "step": 784 }, { "epoch": 0.25, "grad_norm": 0.20911762118339539, "learning_rate": 0.00026312864272657493, "loss": 1.0872, "step": 785 }, { "epoch": 0.25, "grad_norm": 0.22548238933086395, "learning_rate": 0.00026302557107466185, "loss": 0.9969, "step": 786 }, { "epoch": 0.25, "grad_norm": 0.20537856221199036, "learning_rate": 0.0002629223758062157, "loss": 1.2302, "step": 787 }, { "epoch": 0.25, "grad_norm": 0.24405203759670258, "learning_rate": 0.00026281905703410164, "loss": 1.3444, "step": 788 }, { "epoch": 0.25, "grad_norm": 0.24314309656620026, "learning_rate": 0.0002627156148713197, "loss": 1.2449, "step": 789 }, { "epoch": 0.25, "grad_norm": 0.23985081911087036, "learning_rate": 0.00026261204943100515, "loss": 1.3212, "step": 790 }, { "epoch": 0.25, "grad_norm": 0.23066315054893494, "learning_rate": 0.0002625083608264279, "loss": 1.3365, "step": 791 }, { "epoch": 0.26, "grad_norm": 0.27888551354408264, "learning_rate": 0.0002624045491709926, "loss": 1.2213, "step": 792 }, { "epoch": 0.26, "grad_norm": 0.23509328067302704, "learning_rate": 0.00026230061457823843, "loss": 1.3845, "step": 793 }, { "epoch": 0.26, "grad_norm": 0.19994336366653442, "learning_rate": 0.00026219655716183915, "loss": 1.2635, "step": 794 }, { "epoch": 0.26, "grad_norm": 2.3000566959381104, "learning_rate": 0.0002620923770356028, "loss": 1.244, "step": 795 }, { "epoch": 0.26, "grad_norm": 0.23555368185043335, "learning_rate": 0.0002619880743134715, "loss": 1.3082, "step": 796 }, { "epoch": 0.26, "grad_norm": 0.1970159262418747, "learning_rate": 0.00026188364910952164, "loss": 1.1063, "step": 797 }, { "epoch": 0.26, "grad_norm": 0.3258186876773834, "learning_rate": 0.00026177910153796355, "loss": 1.3107, "step": 798 }, { "epoch": 0.26, "grad_norm": 0.2443915605545044, "learning_rate": 0.00026167443171314124, "loss": 1.3063, "step": 799 }, { "epoch": 0.26, "grad_norm": 0.2885858714580536, "learning_rate": 0.00026156963974953266, "loss": 1.2695, "step": 800 }, { "epoch": 0.26, "grad_norm": 0.2606044411659241, "learning_rate": 0.0002614647257617491, "loss": 1.0579, "step": 801 }, { "epoch": 0.26, "grad_norm": 0.22612668573856354, "learning_rate": 0.0002613596898645354, "loss": 1.207, "step": 802 }, { "epoch": 0.26, "grad_norm": 0.22919657826423645, "learning_rate": 0.0002612545321727698, "loss": 1.286, "step": 803 }, { "epoch": 0.26, "grad_norm": 0.2953253984451294, "learning_rate": 0.00026114925280146375, "loss": 1.3112, "step": 804 }, { "epoch": 0.26, "grad_norm": 0.24736064672470093, "learning_rate": 0.0002610438518657617, "loss": 1.2157, "step": 805 }, { "epoch": 0.26, "grad_norm": 0.31541958451271057, "learning_rate": 0.000260938329480941, "loss": 1.2363, "step": 806 }, { "epoch": 0.26, "grad_norm": 0.23319390416145325, "learning_rate": 0.00026083268576241204, "loss": 1.2861, "step": 807 }, { "epoch": 0.26, "grad_norm": 0.206162229180336, "learning_rate": 0.0002607269208257178, "loss": 1.2959, "step": 808 }, { "epoch": 0.26, "grad_norm": 0.2405722439289093, "learning_rate": 0.00026062103478653373, "loss": 1.0172, "step": 809 }, { "epoch": 0.26, "grad_norm": 0.2326429933309555, "learning_rate": 0.00026051502776066785, "loss": 1.2198, "step": 810 }, { "epoch": 0.26, "grad_norm": 0.24582675099372864, "learning_rate": 0.0002604088998640605, "loss": 1.2547, "step": 811 }, { "epoch": 0.26, "grad_norm": 0.2657950222492218, "learning_rate": 0.0002603026512127842, "loss": 1.367, "step": 812 }, { "epoch": 0.26, "grad_norm": 0.25160637497901917, "learning_rate": 0.00026019628192304353, "loss": 1.1131, "step": 813 }, { "epoch": 0.26, "grad_norm": 0.24217893183231354, "learning_rate": 0.000260089792111175, "loss": 1.1083, "step": 814 }, { "epoch": 0.26, "grad_norm": 0.21020649373531342, "learning_rate": 0.00025998318189364704, "loss": 1.3535, "step": 815 }, { "epoch": 0.26, "grad_norm": 0.27169790863990784, "learning_rate": 0.00025987645138705965, "loss": 1.0812, "step": 816 }, { "epoch": 0.26, "grad_norm": 0.25895482301712036, "learning_rate": 0.0002597696007081443, "loss": 1.4663, "step": 817 }, { "epoch": 0.26, "grad_norm": 0.2218000888824463, "learning_rate": 0.0002596626299737642, "loss": 1.29, "step": 818 }, { "epoch": 0.26, "grad_norm": 0.260469526052475, "learning_rate": 0.00025955553930091357, "loss": 0.9306, "step": 819 }, { "epoch": 0.26, "grad_norm": 0.26513901352882385, "learning_rate": 0.000259448328806718, "loss": 1.1092, "step": 820 }, { "epoch": 0.26, "grad_norm": 0.2185366153717041, "learning_rate": 0.00025934099860843395, "loss": 0.9951, "step": 821 }, { "epoch": 0.26, "grad_norm": 0.24202194809913635, "learning_rate": 0.00025923354882344897, "loss": 1.2922, "step": 822 }, { "epoch": 0.27, "grad_norm": 0.21701733767986298, "learning_rate": 0.0002591259795692813, "loss": 1.3088, "step": 823 }, { "epoch": 0.27, "grad_norm": 0.25753530859947205, "learning_rate": 0.0002590182909635799, "loss": 1.2953, "step": 824 }, { "epoch": 0.27, "grad_norm": 0.21381810307502747, "learning_rate": 0.00025891048312412426, "loss": 1.1005, "step": 825 }, { "epoch": 0.27, "grad_norm": 0.2262168526649475, "learning_rate": 0.0002588025561688242, "loss": 1.2344, "step": 826 }, { "epoch": 0.27, "grad_norm": 0.24448952078819275, "learning_rate": 0.00025869451021571995, "loss": 1.3415, "step": 827 }, { "epoch": 0.27, "grad_norm": 0.2586999535560608, "learning_rate": 0.00025858634538298176, "loss": 1.3589, "step": 828 }, { "epoch": 0.27, "grad_norm": 0.2136625051498413, "learning_rate": 0.00025847806178890995, "loss": 1.2133, "step": 829 }, { "epoch": 0.27, "grad_norm": 0.22123007476329803, "learning_rate": 0.00025836965955193476, "loss": 1.2324, "step": 830 }, { "epoch": 0.27, "grad_norm": 0.23126721382141113, "learning_rate": 0.0002582611387906161, "loss": 1.5193, "step": 831 }, { "epoch": 0.27, "grad_norm": 0.23664012551307678, "learning_rate": 0.0002581524996236437, "loss": 1.3047, "step": 832 }, { "epoch": 0.27, "grad_norm": 0.22825421392917633, "learning_rate": 0.00025804374216983656, "loss": 1.3225, "step": 833 }, { "epoch": 0.27, "grad_norm": 0.22088941931724548, "learning_rate": 0.0002579348665481432, "loss": 1.2653, "step": 834 }, { "epoch": 0.27, "grad_norm": 0.2544088363647461, "learning_rate": 0.00025782587287764136, "loss": 1.4064, "step": 835 }, { "epoch": 0.27, "grad_norm": 0.2667694687843323, "learning_rate": 0.0002577167612775378, "loss": 0.984, "step": 836 }, { "epoch": 0.27, "grad_norm": 0.2887456715106964, "learning_rate": 0.00025760753186716834, "loss": 1.3498, "step": 837 }, { "epoch": 0.27, "grad_norm": 0.34887316823005676, "learning_rate": 0.0002574981847659977, "loss": 1.137, "step": 838 }, { "epoch": 0.27, "grad_norm": 0.2630285322666168, "learning_rate": 0.00025738872009361917, "loss": 1.072, "step": 839 }, { "epoch": 0.27, "grad_norm": 0.22439756989479065, "learning_rate": 0.0002572791379697548, "loss": 1.2323, "step": 840 }, { "epoch": 0.27, "grad_norm": 0.23943135142326355, "learning_rate": 0.0002571694385142549, "loss": 1.0429, "step": 841 }, { "epoch": 0.27, "grad_norm": 0.21306543052196503, "learning_rate": 0.0002570596218470983, "loss": 1.3153, "step": 842 }, { "epoch": 0.27, "grad_norm": 0.2701868712902069, "learning_rate": 0.0002569496880883919, "loss": 1.0751, "step": 843 }, { "epoch": 0.27, "grad_norm": 0.21261711418628693, "learning_rate": 0.00025683963735837075, "loss": 1.4404, "step": 844 }, { "epoch": 0.27, "grad_norm": 0.19577303528785706, "learning_rate": 0.0002567294697773978, "loss": 1.089, "step": 845 }, { "epoch": 0.27, "grad_norm": 0.21214625239372253, "learning_rate": 0.00025661918546596374, "loss": 1.3025, "step": 846 }, { "epoch": 0.27, "grad_norm": 0.2344605028629303, "learning_rate": 0.000256508784544687, "loss": 1.3171, "step": 847 }, { "epoch": 0.27, "grad_norm": 0.2725510001182556, "learning_rate": 0.00025639826713431344, "loss": 1.3396, "step": 848 }, { "epoch": 0.27, "grad_norm": 0.23080793023109436, "learning_rate": 0.0002562876333557165, "loss": 1.4199, "step": 849 }, { "epoch": 0.27, "grad_norm": 0.23614276945590973, "learning_rate": 0.0002561768833298968, "loss": 1.3943, "step": 850 }, { "epoch": 0.27, "grad_norm": 0.21194449067115784, "learning_rate": 0.00025606601717798207, "loss": 1.1492, "step": 851 }, { "epoch": 0.27, "grad_norm": 0.26848122477531433, "learning_rate": 0.0002559550350212271, "loss": 1.5026, "step": 852 }, { "epoch": 0.27, "grad_norm": 0.21137623488903046, "learning_rate": 0.00025584393698101357, "loss": 1.2804, "step": 853 }, { "epoch": 0.28, "grad_norm": 0.2300230711698532, "learning_rate": 0.00025573272317884975, "loss": 1.2451, "step": 854 }, { "epoch": 0.28, "grad_norm": 0.29950615763664246, "learning_rate": 0.0002556213937363707, "loss": 1.2894, "step": 855 }, { "epoch": 0.28, "grad_norm": 0.21852363646030426, "learning_rate": 0.00025550994877533787, "loss": 1.1656, "step": 856 }, { "epoch": 0.28, "grad_norm": 0.20930595695972443, "learning_rate": 0.0002553983884176391, "loss": 1.1967, "step": 857 }, { "epoch": 0.28, "grad_norm": 0.2647859752178192, "learning_rate": 0.0002552867127852884, "loss": 1.2552, "step": 858 }, { "epoch": 0.28, "grad_norm": 0.2683856785297394, "learning_rate": 0.00025517492200042587, "loss": 1.2585, "step": 859 }, { "epoch": 0.28, "grad_norm": 0.20761440694332123, "learning_rate": 0.0002550630161853176, "loss": 1.2586, "step": 860 }, { "epoch": 0.28, "grad_norm": 0.24035774171352386, "learning_rate": 0.00025495099546235533, "loss": 1.1908, "step": 861 }, { "epoch": 0.28, "grad_norm": 0.23338118195533752, "learning_rate": 0.0002548388599540567, "loss": 1.223, "step": 862 }, { "epoch": 0.28, "grad_norm": 0.2043347954750061, "learning_rate": 0.00025472660978306474, "loss": 1.3782, "step": 863 }, { "epoch": 0.28, "grad_norm": 0.28423306345939636, "learning_rate": 0.00025461424507214786, "loss": 1.1485, "step": 864 }, { "epoch": 0.28, "grad_norm": 0.3138136565685272, "learning_rate": 0.0002545017659441999, "loss": 1.3737, "step": 865 }, { "epoch": 0.28, "grad_norm": 0.22493910789489746, "learning_rate": 0.0002543891725222397, "loss": 1.1989, "step": 866 }, { "epoch": 0.28, "grad_norm": 0.26088377833366394, "learning_rate": 0.0002542764649294112, "loss": 1.3365, "step": 867 }, { "epoch": 0.28, "grad_norm": 0.31550633907318115, "learning_rate": 0.0002541636432889831, "loss": 1.2695, "step": 868 }, { "epoch": 0.28, "grad_norm": 0.27935466170310974, "learning_rate": 0.0002540507077243488, "loss": 1.2246, "step": 869 }, { "epoch": 0.28, "grad_norm": 0.23790161311626434, "learning_rate": 0.00025393765835902667, "loss": 1.3268, "step": 870 }, { "epoch": 0.28, "grad_norm": 0.22995251417160034, "learning_rate": 0.000253824495316659, "loss": 1.2764, "step": 871 }, { "epoch": 0.28, "grad_norm": 0.2869279980659485, "learning_rate": 0.00025371121872101276, "loss": 1.3404, "step": 872 }, { "epoch": 0.28, "grad_norm": 0.21731115877628326, "learning_rate": 0.0002535978286959791, "loss": 1.3221, "step": 873 }, { "epoch": 0.28, "grad_norm": 0.2382771372795105, "learning_rate": 0.0002534843253655731, "loss": 1.3325, "step": 874 }, { "epoch": 0.28, "grad_norm": 0.22409158945083618, "learning_rate": 0.0002533707088539338, "loss": 1.2242, "step": 875 }, { "epoch": 0.28, "grad_norm": 0.23111607134342194, "learning_rate": 0.00025325697928532404, "loss": 1.1826, "step": 876 }, { "epoch": 0.28, "grad_norm": 0.21633534133434296, "learning_rate": 0.00025314313678413046, "loss": 1.2929, "step": 877 }, { "epoch": 0.28, "grad_norm": 0.23429009318351746, "learning_rate": 0.00025302918147486285, "loss": 1.1007, "step": 878 }, { "epoch": 0.28, "grad_norm": 0.20751525461673737, "learning_rate": 0.00025291511348215476, "loss": 1.2304, "step": 879 }, { "epoch": 0.28, "grad_norm": 0.41742047667503357, "learning_rate": 0.00025280093293076277, "loss": 1.2458, "step": 880 }, { "epoch": 0.28, "grad_norm": 0.34537363052368164, "learning_rate": 0.00025268663994556666, "loss": 1.2081, "step": 881 }, { "epoch": 0.28, "grad_norm": 0.2006489783525467, "learning_rate": 0.00025257223465156907, "loss": 0.9716, "step": 882 }, { "epoch": 0.28, "grad_norm": 0.21495725214481354, "learning_rate": 0.00025245771717389556, "loss": 1.3532, "step": 883 }, { "epoch": 0.28, "grad_norm": 0.24058909714221954, "learning_rate": 0.0002523430876377944, "loss": 1.2779, "step": 884 }, { "epoch": 0.29, "grad_norm": 0.2430250495672226, "learning_rate": 0.0002522283461686364, "loss": 1.3751, "step": 885 }, { "epoch": 0.29, "grad_norm": 0.2253216654062271, "learning_rate": 0.0002521134928919147, "loss": 1.069, "step": 886 }, { "epoch": 0.29, "grad_norm": 0.24315324425697327, "learning_rate": 0.0002519985279332449, "loss": 1.414, "step": 887 }, { "epoch": 0.29, "grad_norm": 0.2164440006017685, "learning_rate": 0.0002518834514183646, "loss": 1.2539, "step": 888 }, { "epoch": 0.29, "grad_norm": 0.2390885204076767, "learning_rate": 0.0002517682634731334, "loss": 1.343, "step": 889 }, { "epoch": 0.29, "grad_norm": 0.2525710463523865, "learning_rate": 0.00025165296422353294, "loss": 1.1198, "step": 890 }, { "epoch": 0.29, "grad_norm": 0.21281762421131134, "learning_rate": 0.00025153755379566647, "loss": 0.9703, "step": 891 }, { "epoch": 0.29, "grad_norm": 0.20334158837795258, "learning_rate": 0.00025142203231575874, "loss": 1.1099, "step": 892 }, { "epoch": 0.29, "grad_norm": 0.2375725954771042, "learning_rate": 0.0002513063999101562, "loss": 1.3965, "step": 893 }, { "epoch": 0.29, "grad_norm": 0.23868125677108765, "learning_rate": 0.0002511906567053264, "loss": 1.3147, "step": 894 }, { "epoch": 0.29, "grad_norm": 0.7390242218971252, "learning_rate": 0.0002510748028278582, "loss": 1.0827, "step": 895 }, { "epoch": 0.29, "grad_norm": 0.23130983114242554, "learning_rate": 0.00025095883840446143, "loss": 1.3738, "step": 896 }, { "epoch": 0.29, "grad_norm": 0.24143138527870178, "learning_rate": 0.00025084276356196695, "loss": 1.3229, "step": 897 }, { "epoch": 0.29, "grad_norm": 0.2621254026889801, "learning_rate": 0.00025072657842732616, "loss": 1.0701, "step": 898 }, { "epoch": 0.29, "grad_norm": 0.23603560030460358, "learning_rate": 0.0002506102831276113, "loss": 1.1094, "step": 899 }, { "epoch": 0.29, "grad_norm": 0.28034037351608276, "learning_rate": 0.00025049387779001506, "loss": 1.4621, "step": 900 }, { "epoch": 0.29, "grad_norm": 0.26521971821784973, "learning_rate": 0.00025037736254185036, "loss": 1.3367, "step": 901 }, { "epoch": 0.29, "grad_norm": 0.2200937420129776, "learning_rate": 0.00025026073751055044, "loss": 1.3875, "step": 902 }, { "epoch": 0.29, "grad_norm": 0.24607901275157928, "learning_rate": 0.00025014400282366854, "loss": 1.1009, "step": 903 }, { "epoch": 0.29, "grad_norm": 0.2978629767894745, "learning_rate": 0.0002500271586088779, "loss": 0.9707, "step": 904 }, { "epoch": 0.29, "grad_norm": 0.2542332410812378, "learning_rate": 0.00024991020499397156, "loss": 1.4489, "step": 905 }, { "epoch": 0.29, "grad_norm": 0.23108340799808502, "learning_rate": 0.00024979314210686214, "loss": 1.6332, "step": 906 }, { "epoch": 0.29, "grad_norm": 0.1889384388923645, "learning_rate": 0.00024967597007558175, "loss": 1.2712, "step": 907 }, { "epoch": 0.29, "grad_norm": 0.18098032474517822, "learning_rate": 0.00024955868902828195, "loss": 1.0415, "step": 908 }, { "epoch": 0.29, "grad_norm": 0.32039955258369446, "learning_rate": 0.00024944129909323356, "loss": 1.2807, "step": 909 }, { "epoch": 0.29, "grad_norm": 0.22193314135074615, "learning_rate": 0.0002493238003988264, "loss": 0.9529, "step": 910 }, { "epoch": 0.29, "grad_norm": 0.22407302260398865, "learning_rate": 0.0002492061930735692, "loss": 1.1141, "step": 911 }, { "epoch": 0.29, "grad_norm": 0.2517699599266052, "learning_rate": 0.00024908847724608966, "loss": 0.946, "step": 912 }, { "epoch": 0.29, "grad_norm": 0.2606646716594696, "learning_rate": 0.0002489706530451342, "loss": 1.2357, "step": 913 }, { "epoch": 0.29, "grad_norm": 0.2925983667373657, "learning_rate": 0.0002488527205995673, "loss": 1.3119, "step": 914 }, { "epoch": 0.29, "grad_norm": 0.22658801078796387, "learning_rate": 0.0002487346800383725, "loss": 1.2592, "step": 915 }, { "epoch": 0.3, "grad_norm": 0.28261563181877136, "learning_rate": 0.000248616531490651, "loss": 1.1192, "step": 916 }, { "epoch": 0.3, "grad_norm": 0.22098995745182037, "learning_rate": 0.00024849827508562246, "loss": 1.2857, "step": 917 }, { "epoch": 0.3, "grad_norm": 0.23068101704120636, "learning_rate": 0.00024837991095262444, "loss": 1.3349, "step": 918 }, { "epoch": 0.3, "grad_norm": 0.20983710885047913, "learning_rate": 0.00024826143922111217, "loss": 1.2935, "step": 919 }, { "epoch": 0.3, "grad_norm": 0.2006354033946991, "learning_rate": 0.00024814286002065877, "loss": 1.0463, "step": 920 }, { "epoch": 0.3, "grad_norm": 0.240416020154953, "learning_rate": 0.00024802417348095476, "loss": 1.1852, "step": 921 }, { "epoch": 0.3, "grad_norm": 0.30370470881462097, "learning_rate": 0.0002479053797318081, "loss": 1.5388, "step": 922 }, { "epoch": 0.3, "grad_norm": 0.21763229370117188, "learning_rate": 0.00024778647890314403, "loss": 1.3089, "step": 923 }, { "epoch": 0.3, "grad_norm": 0.23802721500396729, "learning_rate": 0.0002476674711250048, "loss": 1.3205, "step": 924 }, { "epoch": 0.3, "grad_norm": 0.25143080949783325, "learning_rate": 0.0002475483565275498, "loss": 1.4152, "step": 925 }, { "epoch": 0.3, "grad_norm": 0.21853703260421753, "learning_rate": 0.00024742913524105503, "loss": 1.177, "step": 926 }, { "epoch": 0.3, "grad_norm": 5.0222930908203125, "learning_rate": 0.0002473098073959135, "loss": 1.2266, "step": 927 }, { "epoch": 0.3, "grad_norm": 0.2560270130634308, "learning_rate": 0.0002471903731226344, "loss": 1.1993, "step": 928 }, { "epoch": 0.3, "grad_norm": 0.42218005657196045, "learning_rate": 0.00024707083255184355, "loss": 1.3243, "step": 929 }, { "epoch": 0.3, "grad_norm": 0.2299363613128662, "learning_rate": 0.00024695118581428293, "loss": 1.1662, "step": 930 }, { "epoch": 0.3, "grad_norm": 0.2860582768917084, "learning_rate": 0.0002468314330408107, "loss": 1.2033, "step": 931 }, { "epoch": 0.3, "grad_norm": 0.2828530967235565, "learning_rate": 0.00024671157436240095, "loss": 1.2348, "step": 932 }, { "epoch": 0.3, "grad_norm": 0.20473450422286987, "learning_rate": 0.00024659160991014365, "loss": 1.152, "step": 933 }, { "epoch": 0.3, "grad_norm": 0.24812953174114227, "learning_rate": 0.0002464715398152444, "loss": 1.2413, "step": 934 }, { "epoch": 0.3, "grad_norm": 0.2362663745880127, "learning_rate": 0.0002463513642090243, "loss": 1.2793, "step": 935 }, { "epoch": 0.3, "grad_norm": 0.2987329065799713, "learning_rate": 0.00024623108322291995, "loss": 1.4219, "step": 936 }, { "epoch": 0.3, "grad_norm": 0.2253057360649109, "learning_rate": 0.0002461106969884831, "loss": 1.1688, "step": 937 }, { "epoch": 0.3, "grad_norm": 0.2308044731616974, "learning_rate": 0.00024599020563738084, "loss": 1.3336, "step": 938 }, { "epoch": 0.3, "grad_norm": 0.23591387271881104, "learning_rate": 0.0002458696093013949, "loss": 1.2061, "step": 939 }, { "epoch": 0.3, "grad_norm": 0.21461857855319977, "learning_rate": 0.000245748908112422, "loss": 1.2593, "step": 940 }, { "epoch": 0.3, "grad_norm": 0.2860175669193268, "learning_rate": 0.00024562810220247355, "loss": 1.0944, "step": 941 }, { "epoch": 0.3, "grad_norm": 0.22761978209018707, "learning_rate": 0.00024550719170367556, "loss": 1.1492, "step": 942 }, { "epoch": 0.3, "grad_norm": 0.23205538094043732, "learning_rate": 0.00024538617674826825, "loss": 1.1017, "step": 943 }, { "epoch": 0.3, "grad_norm": 0.221958190202713, "learning_rate": 0.0002452650574686062, "loss": 1.1604, "step": 944 }, { "epoch": 0.3, "grad_norm": 0.2892865836620331, "learning_rate": 0.00024514383399715803, "loss": 1.2346, "step": 945 }, { "epoch": 0.3, "grad_norm": 0.24759164452552795, "learning_rate": 0.0002450225064665064, "loss": 1.2579, "step": 946 }, { "epoch": 0.31, "grad_norm": 0.24830806255340576, "learning_rate": 0.00024490107500934764, "loss": 1.3702, "step": 947 }, { "epoch": 0.31, "grad_norm": 0.8362816572189331, "learning_rate": 0.00024477953975849195, "loss": 1.3379, "step": 948 }, { "epoch": 0.31, "grad_norm": 0.286589115858078, "learning_rate": 0.00024465790084686294, "loss": 1.3163, "step": 949 }, { "epoch": 0.31, "grad_norm": 0.3104639947414398, "learning_rate": 0.00024453615840749746, "loss": 1.4245, "step": 950 }, { "epoch": 0.31, "grad_norm": 0.2724129855632782, "learning_rate": 0.00024441431257354586, "loss": 1.3017, "step": 951 }, { "epoch": 0.31, "grad_norm": 0.26041650772094727, "learning_rate": 0.0002442923634782713, "loss": 1.2416, "step": 952 }, { "epoch": 0.31, "grad_norm": 0.2634851932525635, "learning_rate": 0.00024417031125505015, "loss": 1.44, "step": 953 }, { "epoch": 0.31, "grad_norm": 0.23814193904399872, "learning_rate": 0.0002440481560373713, "loss": 1.2156, "step": 954 }, { "epoch": 0.31, "grad_norm": 0.2707337737083435, "learning_rate": 0.00024392589795883647, "loss": 1.2715, "step": 955 }, { "epoch": 0.31, "grad_norm": 0.25759026408195496, "learning_rate": 0.00024380353715315985, "loss": 1.4143, "step": 956 }, { "epoch": 0.31, "grad_norm": 0.22148004174232483, "learning_rate": 0.0002436810737541679, "loss": 1.2119, "step": 957 }, { "epoch": 0.31, "grad_norm": 0.2464108169078827, "learning_rate": 0.0002435585078957994, "loss": 1.2258, "step": 958 }, { "epoch": 0.31, "grad_norm": 0.27217644453048706, "learning_rate": 0.0002434358397121051, "loss": 1.1886, "step": 959 }, { "epoch": 0.31, "grad_norm": 0.2336444854736328, "learning_rate": 0.00024331306933724771, "loss": 1.4379, "step": 960 }, { "epoch": 0.31, "grad_norm": 0.17853498458862305, "learning_rate": 0.0002431901969055017, "loss": 0.98, "step": 961 }, { "epoch": 0.31, "grad_norm": 0.26151198148727417, "learning_rate": 0.00024306722255125316, "loss": 1.6655, "step": 962 }, { "epoch": 0.31, "grad_norm": 0.41195064783096313, "learning_rate": 0.00024294414640899963, "loss": 1.2138, "step": 963 }, { "epoch": 0.31, "grad_norm": 0.24623966217041016, "learning_rate": 0.00024282096861335002, "loss": 1.4359, "step": 964 }, { "epoch": 0.31, "grad_norm": 0.25928232073783875, "learning_rate": 0.00024269768929902432, "loss": 1.2306, "step": 965 }, { "epoch": 0.31, "grad_norm": 0.26378339529037476, "learning_rate": 0.0002425743086008537, "loss": 1.2983, "step": 966 }, { "epoch": 0.31, "grad_norm": 0.24026435613632202, "learning_rate": 0.0002424508266537801, "loss": 1.331, "step": 967 }, { "epoch": 0.31, "grad_norm": 0.21786873042583466, "learning_rate": 0.0002423272435928563, "loss": 1.1239, "step": 968 }, { "epoch": 0.31, "grad_norm": 0.22867447137832642, "learning_rate": 0.00024220355955324553, "loss": 1.1905, "step": 969 }, { "epoch": 0.31, "grad_norm": 0.21581189334392548, "learning_rate": 0.00024207977467022155, "loss": 1.1329, "step": 970 }, { "epoch": 0.31, "grad_norm": 0.3643098473548889, "learning_rate": 0.00024195588907916844, "loss": 1.029, "step": 971 }, { "epoch": 0.31, "grad_norm": 0.2443319410085678, "learning_rate": 0.0002418319029155803, "loss": 1.226, "step": 972 }, { "epoch": 0.31, "grad_norm": 0.2356473207473755, "learning_rate": 0.00024170781631506139, "loss": 1.1675, "step": 973 }, { "epoch": 0.31, "grad_norm": 0.23318816721439362, "learning_rate": 0.0002415836294133257, "loss": 1.0198, "step": 974 }, { "epoch": 0.31, "grad_norm": 0.2520805299282074, "learning_rate": 0.00024145934234619696, "loss": 1.2341, "step": 975 }, { "epoch": 0.31, "grad_norm": 0.23710520565509796, "learning_rate": 0.00024133495524960842, "loss": 1.2756, "step": 976 }, { "epoch": 0.31, "grad_norm": 0.28691402077674866, "learning_rate": 0.0002412104682596028, "loss": 1.3239, "step": 977 }, { "epoch": 0.32, "grad_norm": 0.2831035554409027, "learning_rate": 0.00024108588151233203, "loss": 1.1775, "step": 978 }, { "epoch": 0.32, "grad_norm": 0.2497682422399521, "learning_rate": 0.00024096119514405715, "loss": 1.2681, "step": 979 }, { "epoch": 0.32, "grad_norm": 0.24265536665916443, "learning_rate": 0.00024083640929114812, "loss": 1.2701, "step": 980 }, { "epoch": 0.32, "grad_norm": 0.3308228552341461, "learning_rate": 0.00024071152409008376, "loss": 1.1895, "step": 981 }, { "epoch": 0.32, "grad_norm": 0.2346339374780655, "learning_rate": 0.00024058653967745156, "loss": 1.1195, "step": 982 }, { "epoch": 0.32, "grad_norm": 0.2702274024486542, "learning_rate": 0.00024046145618994744, "loss": 1.1405, "step": 983 }, { "epoch": 0.32, "grad_norm": 0.27345848083496094, "learning_rate": 0.00024033627376437576, "loss": 1.3745, "step": 984 }, { "epoch": 0.32, "grad_norm": 0.32658037543296814, "learning_rate": 0.00024021099253764903, "loss": 1.1522, "step": 985 }, { "epoch": 0.32, "grad_norm": 0.2531717121601105, "learning_rate": 0.00024008561264678792, "loss": 1.3755, "step": 986 }, { "epoch": 0.32, "grad_norm": 0.21644409000873566, "learning_rate": 0.00023996013422892084, "loss": 1.0277, "step": 987 }, { "epoch": 0.32, "grad_norm": 0.2833665609359741, "learning_rate": 0.00023983455742128404, "loss": 0.9538, "step": 988 }, { "epoch": 0.32, "grad_norm": 0.24716176092624664, "learning_rate": 0.00023970888236122146, "loss": 1.1491, "step": 989 }, { "epoch": 0.32, "grad_norm": 0.22365549206733704, "learning_rate": 0.00023958310918618443, "loss": 1.3351, "step": 990 }, { "epoch": 0.32, "grad_norm": 0.34866926074028015, "learning_rate": 0.00023945723803373155, "loss": 1.0638, "step": 991 }, { "epoch": 0.32, "grad_norm": 0.19000953435897827, "learning_rate": 0.00023933126904152866, "loss": 1.1976, "step": 992 }, { "epoch": 0.32, "grad_norm": 0.2231382578611374, "learning_rate": 0.00023920520234734852, "loss": 1.1022, "step": 993 }, { "epoch": 0.32, "grad_norm": 0.3001597821712494, "learning_rate": 0.00023907903808907078, "loss": 1.228, "step": 994 }, { "epoch": 0.32, "grad_norm": 0.28958210349082947, "learning_rate": 0.00023895277640468193, "loss": 1.1426, "step": 995 }, { "epoch": 0.32, "grad_norm": 0.261559396982193, "learning_rate": 0.00023882641743227473, "loss": 1.257, "step": 996 }, { "epoch": 0.32, "grad_norm": 0.29150450229644775, "learning_rate": 0.00023869996131004867, "loss": 1.1405, "step": 997 }, { "epoch": 0.32, "grad_norm": 0.2568801939487457, "learning_rate": 0.0002385734081763092, "loss": 1.3729, "step": 998 }, { "epoch": 0.32, "grad_norm": 0.24785907566547394, "learning_rate": 0.000238446758169468, "loss": 1.3329, "step": 999 }, { "epoch": 0.32, "grad_norm": 0.22915875911712646, "learning_rate": 0.0002383200114280429, "loss": 1.3824, "step": 1000 }, { "epoch": 0.32, "grad_norm": 0.2207372486591339, "learning_rate": 0.00023819316809065708, "loss": 1.3311, "step": 1001 }, { "epoch": 0.32, "grad_norm": 0.19947972893714905, "learning_rate": 0.00023806622829603978, "loss": 1.152, "step": 1002 }, { "epoch": 0.32, "grad_norm": 0.24530254304409027, "learning_rate": 0.00023793919218302552, "loss": 1.308, "step": 1003 }, { "epoch": 0.32, "grad_norm": 0.28042072057724, "learning_rate": 0.00023781205989055422, "loss": 1.3758, "step": 1004 }, { "epoch": 0.32, "grad_norm": 0.26824137568473816, "learning_rate": 0.00023768483155767103, "loss": 1.42, "step": 1005 }, { "epoch": 0.32, "grad_norm": 0.23175200819969177, "learning_rate": 0.00023755750732352602, "loss": 1.2744, "step": 1006 }, { "epoch": 0.32, "grad_norm": 0.24839933216571808, "learning_rate": 0.00023743008732737437, "loss": 1.1694, "step": 1007 }, { "epoch": 0.32, "grad_norm": 0.4314773678779602, "learning_rate": 0.00023730257170857572, "loss": 1.2606, "step": 1008 }, { "epoch": 0.32, "grad_norm": 0.2149488478899002, "learning_rate": 0.0002371749606065945, "loss": 1.3017, "step": 1009 }, { "epoch": 0.33, "grad_norm": 0.23948344588279724, "learning_rate": 0.00023704725416099948, "loss": 1.3018, "step": 1010 }, { "epoch": 0.33, "grad_norm": 0.3104545772075653, "learning_rate": 0.0002369194525114637, "loss": 1.1092, "step": 1011 }, { "epoch": 0.33, "grad_norm": 0.2715686559677124, "learning_rate": 0.00023679155579776437, "loss": 1.1878, "step": 1012 }, { "epoch": 0.33, "grad_norm": 0.2340475618839264, "learning_rate": 0.00023666356415978266, "loss": 1.1362, "step": 1013 }, { "epoch": 0.33, "grad_norm": 0.2539551556110382, "learning_rate": 0.0002365354777375036, "loss": 1.0444, "step": 1014 }, { "epoch": 0.33, "grad_norm": 0.2603508234024048, "learning_rate": 0.00023640729667101572, "loss": 1.2983, "step": 1015 }, { "epoch": 0.33, "grad_norm": 0.2267984002828598, "learning_rate": 0.00023627902110051133, "loss": 1.2938, "step": 1016 }, { "epoch": 0.33, "grad_norm": 0.2391950786113739, "learning_rate": 0.00023615065116628576, "loss": 1.232, "step": 1017 }, { "epoch": 0.33, "grad_norm": 0.26265597343444824, "learning_rate": 0.00023602218700873793, "loss": 1.2653, "step": 1018 }, { "epoch": 0.33, "grad_norm": 0.3995567858219147, "learning_rate": 0.0002358936287683695, "loss": 1.4445, "step": 1019 }, { "epoch": 0.33, "grad_norm": 0.2440849393606186, "learning_rate": 0.00023576497658578518, "loss": 1.3048, "step": 1020 }, { "epoch": 0.33, "grad_norm": 0.2372875213623047, "learning_rate": 0.00023563623060169245, "loss": 1.525, "step": 1021 }, { "epoch": 0.33, "grad_norm": 0.2205083668231964, "learning_rate": 0.0002355073909569012, "loss": 0.958, "step": 1022 }, { "epoch": 0.33, "grad_norm": 0.20565034449100494, "learning_rate": 0.00023537845779232397, "loss": 1.4153, "step": 1023 }, { "epoch": 0.33, "grad_norm": 0.19819019734859467, "learning_rate": 0.00023524943124897548, "loss": 1.239, "step": 1024 }, { "epoch": 0.33, "grad_norm": 0.22586505115032196, "learning_rate": 0.00023512031146797255, "loss": 1.3606, "step": 1025 }, { "epoch": 0.33, "grad_norm": 0.2513049840927124, "learning_rate": 0.00023499109859053408, "loss": 1.3901, "step": 1026 }, { "epoch": 0.33, "grad_norm": 0.21966643631458282, "learning_rate": 0.00023486179275798068, "loss": 1.1698, "step": 1027 }, { "epoch": 0.33, "grad_norm": 0.23724479973316193, "learning_rate": 0.00023473239411173476, "loss": 1.523, "step": 1028 }, { "epoch": 0.33, "grad_norm": 0.2456350326538086, "learning_rate": 0.00023460290279332002, "loss": 1.4692, "step": 1029 }, { "epoch": 0.33, "grad_norm": 0.21821612119674683, "learning_rate": 0.0002344733189443617, "loss": 1.1855, "step": 1030 }, { "epoch": 0.33, "grad_norm": 0.2670833468437195, "learning_rate": 0.00023434364270658622, "loss": 1.1299, "step": 1031 }, { "epoch": 0.33, "grad_norm": 0.28050893545150757, "learning_rate": 0.00023421387422182098, "loss": 1.2993, "step": 1032 }, { "epoch": 0.33, "grad_norm": 0.20133428275585175, "learning_rate": 0.00023408401363199437, "loss": 1.1166, "step": 1033 }, { "epoch": 0.33, "grad_norm": 0.251229852437973, "learning_rate": 0.00023395406107913538, "loss": 1.219, "step": 1034 }, { "epoch": 0.33, "grad_norm": 0.2751162648200989, "learning_rate": 0.00023382401670537365, "loss": 1.3185, "step": 1035 }, { "epoch": 0.33, "grad_norm": 0.231204554438591, "learning_rate": 0.00023369388065293934, "loss": 1.1795, "step": 1036 }, { "epoch": 0.33, "grad_norm": 0.26815468072891235, "learning_rate": 0.00023356365306416267, "loss": 1.3169, "step": 1037 }, { "epoch": 0.33, "grad_norm": 0.21483924984931946, "learning_rate": 0.00023343333408147417, "loss": 1.2548, "step": 1038 }, { "epoch": 0.33, "grad_norm": 0.38068825006484985, "learning_rate": 0.0002333029238474042, "loss": 1.3254, "step": 1039 }, { "epoch": 0.33, "grad_norm": 0.3087809681892395, "learning_rate": 0.00023317242250458302, "loss": 1.4325, "step": 1040 }, { "epoch": 0.34, "grad_norm": 0.26474475860595703, "learning_rate": 0.00023304183019574046, "loss": 1.2574, "step": 1041 }, { "epoch": 0.34, "grad_norm": 0.23597531020641327, "learning_rate": 0.00023291114706370584, "loss": 1.2103, "step": 1042 }, { "epoch": 0.34, "grad_norm": 0.19980569183826447, "learning_rate": 0.0002327803732514079, "loss": 0.9423, "step": 1043 }, { "epoch": 0.34, "grad_norm": 0.28744134306907654, "learning_rate": 0.00023264950890187445, "loss": 1.3432, "step": 1044 }, { "epoch": 0.34, "grad_norm": 0.20204778015613556, "learning_rate": 0.00023251855415823238, "loss": 1.309, "step": 1045 }, { "epoch": 0.34, "grad_norm": 0.22051692008972168, "learning_rate": 0.00023238750916370745, "loss": 1.117, "step": 1046 }, { "epoch": 0.34, "grad_norm": 0.22198964655399323, "learning_rate": 0.00023225637406162406, "loss": 1.1978, "step": 1047 }, { "epoch": 0.34, "grad_norm": 0.2946959435939789, "learning_rate": 0.0002321251489954053, "loss": 1.2572, "step": 1048 }, { "epoch": 0.34, "grad_norm": 0.22174333035945892, "learning_rate": 0.00023199383410857247, "loss": 1.28, "step": 1049 }, { "epoch": 0.34, "grad_norm": 0.225773885846138, "learning_rate": 0.00023186242954474528, "loss": 1.3896, "step": 1050 }, { "epoch": 0.34, "grad_norm": 0.257705956697464, "learning_rate": 0.0002317309354476414, "loss": 1.1819, "step": 1051 }, { "epoch": 0.34, "grad_norm": 0.29032233357429504, "learning_rate": 0.0002315993519610765, "loss": 1.209, "step": 1052 }, { "epoch": 0.34, "grad_norm": 0.28631994128227234, "learning_rate": 0.0002314676792289639, "loss": 1.3049, "step": 1053 }, { "epoch": 0.34, "grad_norm": 0.21806158125400543, "learning_rate": 0.00023133591739531474, "loss": 1.215, "step": 1054 }, { "epoch": 0.34, "grad_norm": 0.2461784929037094, "learning_rate": 0.0002312040666042374, "loss": 1.0202, "step": 1055 }, { "epoch": 0.34, "grad_norm": 0.30435606837272644, "learning_rate": 0.00023107212699993758, "loss": 1.4081, "step": 1056 }, { "epoch": 0.34, "grad_norm": 0.22192128002643585, "learning_rate": 0.0002309400987267183, "loss": 1.2511, "step": 1057 }, { "epoch": 0.34, "grad_norm": 0.2474905252456665, "learning_rate": 0.00023080798192897932, "loss": 1.2674, "step": 1058 }, { "epoch": 0.34, "grad_norm": 0.23558448255062103, "learning_rate": 0.00023067577675121734, "loss": 1.1608, "step": 1059 }, { "epoch": 0.34, "grad_norm": 0.23850491642951965, "learning_rate": 0.0002305434833380258, "loss": 1.2223, "step": 1060 }, { "epoch": 0.34, "grad_norm": 0.2023753970861435, "learning_rate": 0.00023041110183409443, "loss": 1.0586, "step": 1061 }, { "epoch": 0.34, "grad_norm": 0.22840489447116852, "learning_rate": 0.00023027863238420955, "loss": 1.2671, "step": 1062 }, { "epoch": 0.34, "grad_norm": 0.25523367524147034, "learning_rate": 0.00023014607513325343, "loss": 1.0151, "step": 1063 }, { "epoch": 0.34, "grad_norm": 0.2632114887237549, "learning_rate": 0.0002300134302262045, "loss": 1.0919, "step": 1064 }, { "epoch": 0.34, "grad_norm": 0.2425667643547058, "learning_rate": 0.0002298806978081371, "loss": 1.3162, "step": 1065 }, { "epoch": 0.34, "grad_norm": 0.26016178727149963, "learning_rate": 0.00022974787802422124, "loss": 1.2801, "step": 1066 }, { "epoch": 0.34, "grad_norm": 0.2596909701824188, "learning_rate": 0.00022961497101972237, "loss": 1.2087, "step": 1067 }, { "epoch": 0.34, "grad_norm": 0.2092851996421814, "learning_rate": 0.00022948197694000146, "loss": 1.2886, "step": 1068 }, { "epoch": 0.34, "grad_norm": 0.2113674134016037, "learning_rate": 0.0002293488959305147, "loss": 1.1233, "step": 1069 }, { "epoch": 0.34, "grad_norm": 0.2489120066165924, "learning_rate": 0.00022921572813681333, "loss": 1.3795, "step": 1070 }, { "epoch": 0.34, "grad_norm": 0.2105655074119568, "learning_rate": 0.00022908247370454353, "loss": 1.3016, "step": 1071 }, { "epoch": 0.35, "grad_norm": 0.2958005964756012, "learning_rate": 0.00022894913277944628, "loss": 1.2156, "step": 1072 }, { "epoch": 0.35, "grad_norm": 0.2695716619491577, "learning_rate": 0.00022881570550735696, "loss": 1.3019, "step": 1073 }, { "epoch": 0.35, "grad_norm": 0.2667190730571747, "learning_rate": 0.0002286821920342056, "loss": 1.3079, "step": 1074 }, { "epoch": 0.35, "grad_norm": 0.25533461570739746, "learning_rate": 0.0002285485925060165, "loss": 1.2613, "step": 1075 }, { "epoch": 0.35, "grad_norm": 0.2108837515115738, "learning_rate": 0.00022841490706890789, "loss": 1.1623, "step": 1076 }, { "epoch": 0.35, "grad_norm": 0.20042939484119415, "learning_rate": 0.0002282811358690922, "loss": 1.1518, "step": 1077 }, { "epoch": 0.35, "grad_norm": 0.2896502614021301, "learning_rate": 0.00022814727905287542, "loss": 1.3289, "step": 1078 }, { "epoch": 0.35, "grad_norm": 0.24182327091693878, "learning_rate": 0.00022801333676665742, "loss": 1.144, "step": 1079 }, { "epoch": 0.35, "grad_norm": 0.27088847756385803, "learning_rate": 0.00022787930915693137, "loss": 1.3289, "step": 1080 }, { "epoch": 0.35, "grad_norm": 0.21726277470588684, "learning_rate": 0.00022774519637028384, "loss": 1.2291, "step": 1081 }, { "epoch": 0.35, "grad_norm": 0.26406314969062805, "learning_rate": 0.00022761099855339454, "loss": 1.2546, "step": 1082 }, { "epoch": 0.35, "grad_norm": 0.24489174783229828, "learning_rate": 0.00022747671585303614, "loss": 1.2161, "step": 1083 }, { "epoch": 0.35, "grad_norm": 0.23895005881786346, "learning_rate": 0.00022734234841607416, "loss": 1.0268, "step": 1084 }, { "epoch": 0.35, "grad_norm": 0.25352969765663147, "learning_rate": 0.0002272078963894669, "loss": 1.2504, "step": 1085 }, { "epoch": 0.35, "grad_norm": 0.19931866228580475, "learning_rate": 0.00022707335992026509, "loss": 1.2043, "step": 1086 }, { "epoch": 0.35, "grad_norm": 0.26669052243232727, "learning_rate": 0.00022693873915561181, "loss": 1.0613, "step": 1087 }, { "epoch": 0.35, "grad_norm": 0.21188481152057648, "learning_rate": 0.00022680403424274226, "loss": 1.0666, "step": 1088 }, { "epoch": 0.35, "grad_norm": 0.18295982480049133, "learning_rate": 0.00022666924532898386, "loss": 1.1632, "step": 1089 }, { "epoch": 0.35, "grad_norm": 0.19004176557064056, "learning_rate": 0.00022653437256175575, "loss": 1.0094, "step": 1090 }, { "epoch": 0.35, "grad_norm": 0.3691116273403168, "learning_rate": 0.0002263994160885689, "loss": 1.3905, "step": 1091 }, { "epoch": 0.35, "grad_norm": 0.24704019725322723, "learning_rate": 0.0002262643760570257, "loss": 1.1197, "step": 1092 }, { "epoch": 0.35, "grad_norm": 0.30205240845680237, "learning_rate": 0.00022612925261481997, "loss": 1.0839, "step": 1093 }, { "epoch": 0.35, "grad_norm": 0.20657497644424438, "learning_rate": 0.00022599404590973676, "loss": 1.1619, "step": 1094 }, { "epoch": 0.35, "grad_norm": 0.25482451915740967, "learning_rate": 0.00022585875608965232, "loss": 1.147, "step": 1095 }, { "epoch": 0.35, "grad_norm": 0.2777192294597626, "learning_rate": 0.00022572338330253364, "loss": 1.3937, "step": 1096 }, { "epoch": 0.35, "grad_norm": 0.267014741897583, "learning_rate": 0.0002255879276964384, "loss": 1.0994, "step": 1097 }, { "epoch": 0.35, "grad_norm": 0.2486651986837387, "learning_rate": 0.00022545238941951504, "loss": 1.2643, "step": 1098 }, { "epoch": 0.35, "grad_norm": 0.23484791815280914, "learning_rate": 0.0002253167686200023, "loss": 1.2397, "step": 1099 }, { "epoch": 0.35, "grad_norm": 0.30598944425582886, "learning_rate": 0.0002251810654462292, "loss": 1.2416, "step": 1100 }, { "epoch": 0.35, "grad_norm": 0.24617154896259308, "learning_rate": 0.00022504528004661494, "loss": 1.218, "step": 1101 }, { "epoch": 0.35, "grad_norm": 0.23339052498340607, "learning_rate": 0.00022490941256966848, "loss": 1.3727, "step": 1102 }, { "epoch": 0.36, "grad_norm": 0.2580726444721222, "learning_rate": 0.00022477346316398869, "loss": 1.3036, "step": 1103 }, { "epoch": 0.36, "grad_norm": 0.24429742991924286, "learning_rate": 0.00022463743197826395, "loss": 1.0016, "step": 1104 }, { "epoch": 0.36, "grad_norm": 0.2356441169977188, "learning_rate": 0.00022450131916127214, "loss": 1.4147, "step": 1105 }, { "epoch": 0.36, "grad_norm": 0.27938273549079895, "learning_rate": 0.00022436512486188046, "loss": 1.2465, "step": 1106 }, { "epoch": 0.36, "grad_norm": 0.23710989952087402, "learning_rate": 0.00022422884922904514, "loss": 1.3296, "step": 1107 }, { "epoch": 0.36, "grad_norm": 0.23140433430671692, "learning_rate": 0.0002240924924118114, "loss": 1.0326, "step": 1108 }, { "epoch": 0.36, "grad_norm": 0.3381577134132385, "learning_rate": 0.00022395605455931324, "loss": 1.4995, "step": 1109 }, { "epoch": 0.36, "grad_norm": 0.22035624086856842, "learning_rate": 0.00022381953582077332, "loss": 1.1133, "step": 1110 }, { "epoch": 0.36, "grad_norm": 0.23101073503494263, "learning_rate": 0.00022368293634550274, "loss": 1.4145, "step": 1111 }, { "epoch": 0.36, "grad_norm": 0.2863337993621826, "learning_rate": 0.0002235462562829009, "loss": 1.2627, "step": 1112 }, { "epoch": 0.36, "grad_norm": 0.21907080709934235, "learning_rate": 0.00022340949578245544, "loss": 1.2731, "step": 1113 }, { "epoch": 0.36, "grad_norm": 0.2069971114397049, "learning_rate": 0.00022327265499374173, "loss": 1.2143, "step": 1114 }, { "epoch": 0.36, "grad_norm": 0.2408277541399002, "learning_rate": 0.0002231357340664232, "loss": 1.3168, "step": 1115 }, { "epoch": 0.36, "grad_norm": 0.23668554425239563, "learning_rate": 0.0002229987331502508, "loss": 1.0819, "step": 1116 }, { "epoch": 0.36, "grad_norm": 0.226751446723938, "learning_rate": 0.00022286165239506305, "loss": 1.4183, "step": 1117 }, { "epoch": 0.36, "grad_norm": 0.24679474532604218, "learning_rate": 0.00022272449195078565, "loss": 1.1659, "step": 1118 }, { "epoch": 0.36, "grad_norm": 0.19619359076023102, "learning_rate": 0.00022258725196743162, "loss": 1.0621, "step": 1119 }, { "epoch": 0.36, "grad_norm": 0.25470787286758423, "learning_rate": 0.00022244993259510086, "loss": 1.1347, "step": 1120 }, { "epoch": 0.36, "grad_norm": 0.2693122923374176, "learning_rate": 0.00022231253398398012, "loss": 1.4055, "step": 1121 }, { "epoch": 0.36, "grad_norm": 0.2335427850484848, "learning_rate": 0.00022217505628434287, "loss": 1.2561, "step": 1122 }, { "epoch": 0.36, "grad_norm": 0.23105846345424652, "learning_rate": 0.00022203749964654903, "loss": 1.1731, "step": 1123 }, { "epoch": 0.36, "grad_norm": 0.22029852867126465, "learning_rate": 0.0002218998642210448, "loss": 1.2507, "step": 1124 }, { "epoch": 0.36, "grad_norm": 0.22752423584461212, "learning_rate": 0.00022176215015836265, "loss": 1.2708, "step": 1125 }, { "epoch": 0.36, "grad_norm": 0.20707625150680542, "learning_rate": 0.00022162435760912107, "loss": 1.2919, "step": 1126 }, { "epoch": 0.36, "grad_norm": 0.2509956955909729, "learning_rate": 0.0002214864867240243, "loss": 1.2409, "step": 1127 }, { "epoch": 0.36, "grad_norm": 0.31117919087409973, "learning_rate": 0.00022134853765386228, "loss": 1.4001, "step": 1128 }, { "epoch": 0.36, "grad_norm": 0.22909560799598694, "learning_rate": 0.00022121051054951047, "loss": 1.2889, "step": 1129 }, { "epoch": 0.36, "grad_norm": 0.24778012931346893, "learning_rate": 0.00022107240556192966, "loss": 1.1039, "step": 1130 }, { "epoch": 0.36, "grad_norm": 0.2813015878200531, "learning_rate": 0.00022093422284216594, "loss": 1.1711, "step": 1131 }, { "epoch": 0.36, "grad_norm": 0.3038373291492462, "learning_rate": 0.00022079596254135026, "loss": 1.2086, "step": 1132 }, { "epoch": 0.36, "grad_norm": 0.23540645837783813, "learning_rate": 0.00022065762481069849, "loss": 1.2104, "step": 1133 }, { "epoch": 0.37, "grad_norm": 0.19529864192008972, "learning_rate": 0.0002205192098015112, "loss": 1.0127, "step": 1134 }, { "epoch": 0.37, "grad_norm": 0.23978424072265625, "learning_rate": 0.00022038071766517336, "loss": 1.2168, "step": 1135 }, { "epoch": 0.37, "grad_norm": 0.2529107332229614, "learning_rate": 0.00022024214855315447, "loss": 1.2702, "step": 1136 }, { "epoch": 0.37, "grad_norm": 0.2570422291755676, "learning_rate": 0.00022010350261700816, "loss": 1.2315, "step": 1137 }, { "epoch": 0.37, "grad_norm": 0.21645359694957733, "learning_rate": 0.00021996478000837203, "loss": 1.1734, "step": 1138 }, { "epoch": 0.37, "grad_norm": 0.21818828582763672, "learning_rate": 0.00021982598087896756, "loss": 1.3249, "step": 1139 }, { "epoch": 0.37, "grad_norm": 0.20920565724372864, "learning_rate": 0.0002196871053805999, "loss": 1.1503, "step": 1140 }, { "epoch": 0.37, "grad_norm": 0.23288384079933167, "learning_rate": 0.0002195481536651578, "loss": 1.2563, "step": 1141 }, { "epoch": 0.37, "grad_norm": 0.25354886054992676, "learning_rate": 0.0002194091258846134, "loss": 1.1162, "step": 1142 }, { "epoch": 0.37, "grad_norm": 0.25000399351119995, "learning_rate": 0.00021927002219102179, "loss": 0.999, "step": 1143 }, { "epoch": 0.37, "grad_norm": 0.24784931540489197, "learning_rate": 0.0002191308427365214, "loss": 1.1722, "step": 1144 }, { "epoch": 0.37, "grad_norm": 0.29084479808807373, "learning_rate": 0.00021899158767333326, "loss": 1.293, "step": 1145 }, { "epoch": 0.37, "grad_norm": 0.2662232220172882, "learning_rate": 0.00021885225715376127, "loss": 1.3416, "step": 1146 }, { "epoch": 0.37, "grad_norm": 0.22885380685329437, "learning_rate": 0.00021871285133019176, "loss": 1.3112, "step": 1147 }, { "epoch": 0.37, "grad_norm": 0.20972925424575806, "learning_rate": 0.0002185733703550935, "loss": 1.1001, "step": 1148 }, { "epoch": 0.37, "grad_norm": 0.19356688857078552, "learning_rate": 0.0002184338143810174, "loss": 1.2809, "step": 1149 }, { "epoch": 0.37, "grad_norm": 0.24730347096920013, "learning_rate": 0.00021829418356059636, "loss": 1.3522, "step": 1150 }, { "epoch": 0.37, "grad_norm": 0.20958828926086426, "learning_rate": 0.00021815447804654522, "loss": 1.1556, "step": 1151 }, { "epoch": 0.37, "grad_norm": 0.22640813887119293, "learning_rate": 0.00021801469799166045, "loss": 1.2607, "step": 1152 }, { "epoch": 0.37, "grad_norm": 0.22032903134822845, "learning_rate": 0.00021787484354882013, "loss": 1.1487, "step": 1153 }, { "epoch": 0.37, "grad_norm": 0.2520946264266968, "learning_rate": 0.00021773491487098358, "loss": 1.2792, "step": 1154 }, { "epoch": 0.37, "grad_norm": 0.31829676032066345, "learning_rate": 0.0002175949121111914, "loss": 1.2222, "step": 1155 }, { "epoch": 0.37, "grad_norm": 0.19280177354812622, "learning_rate": 0.00021745483542256512, "loss": 1.292, "step": 1156 }, { "epoch": 0.37, "grad_norm": 0.21910007297992706, "learning_rate": 0.00021731468495830733, "loss": 1.3368, "step": 1157 }, { "epoch": 0.37, "grad_norm": 0.24554970860481262, "learning_rate": 0.00021717446087170106, "loss": 1.1794, "step": 1158 }, { "epoch": 0.37, "grad_norm": 0.23703397810459137, "learning_rate": 0.00021703416331611, "loss": 1.3545, "step": 1159 }, { "epoch": 0.37, "grad_norm": 0.2932862341403961, "learning_rate": 0.00021689379244497814, "loss": 1.1327, "step": 1160 }, { "epoch": 0.37, "grad_norm": 0.19714978337287903, "learning_rate": 0.00021675334841182972, "loss": 1.3132, "step": 1161 }, { "epoch": 0.37, "grad_norm": 0.2341964691877365, "learning_rate": 0.00021661283137026894, "loss": 1.0276, "step": 1162 }, { "epoch": 0.37, "grad_norm": 0.2098851352930069, "learning_rate": 0.00021647224147397994, "loss": 1.0184, "step": 1163 }, { "epoch": 0.37, "grad_norm": 0.5450015068054199, "learning_rate": 0.00021633157887672634, "loss": 1.2317, "step": 1164 }, { "epoch": 0.38, "grad_norm": 0.46259719133377075, "learning_rate": 0.00021619084373235148, "loss": 1.0911, "step": 1165 }, { "epoch": 0.38, "grad_norm": 0.23908789455890656, "learning_rate": 0.0002160500361947779, "loss": 1.1669, "step": 1166 }, { "epoch": 0.38, "grad_norm": 0.20369701087474823, "learning_rate": 0.0002159091564180075, "loss": 0.9572, "step": 1167 }, { "epoch": 0.38, "grad_norm": 0.2798102796077728, "learning_rate": 0.00021576820455612098, "loss": 1.3512, "step": 1168 }, { "epoch": 0.38, "grad_norm": 0.265949547290802, "learning_rate": 0.00021562718076327804, "loss": 1.3135, "step": 1169 }, { "epoch": 0.38, "grad_norm": 0.19142241775989532, "learning_rate": 0.00021548608519371688, "loss": 1.1539, "step": 1170 }, { "epoch": 0.38, "grad_norm": 0.23777072131633759, "learning_rate": 0.0002153449180017544, "loss": 1.2384, "step": 1171 }, { "epoch": 0.38, "grad_norm": 0.2292533665895462, "learning_rate": 0.00021520367934178575, "loss": 1.1277, "step": 1172 }, { "epoch": 0.38, "grad_norm": 0.22202859818935394, "learning_rate": 0.0002150623693682842, "loss": 1.1218, "step": 1173 }, { "epoch": 0.38, "grad_norm": 0.1951579749584198, "learning_rate": 0.00021492098823580098, "loss": 1.2922, "step": 1174 }, { "epoch": 0.38, "grad_norm": 0.21843190491199493, "learning_rate": 0.0002147795360989653, "loss": 1.3422, "step": 1175 }, { "epoch": 0.38, "grad_norm": 3.5558485984802246, "learning_rate": 0.00021463801311248393, "loss": 1.3106, "step": 1176 }, { "epoch": 0.38, "grad_norm": 0.22525081038475037, "learning_rate": 0.00021449641943114116, "loss": 1.1468, "step": 1177 }, { "epoch": 0.38, "grad_norm": 0.23903535306453705, "learning_rate": 0.0002143547552097986, "loss": 1.0903, "step": 1178 }, { "epoch": 0.38, "grad_norm": 0.43220099806785583, "learning_rate": 0.0002142130206033949, "loss": 1.4302, "step": 1179 }, { "epoch": 0.38, "grad_norm": 0.4071425795555115, "learning_rate": 0.0002140712157669459, "loss": 1.5925, "step": 1180 }, { "epoch": 0.38, "grad_norm": 0.5719355940818787, "learning_rate": 0.00021392934085554398, "loss": 1.3241, "step": 1181 }, { "epoch": 0.38, "grad_norm": 0.24162732064723969, "learning_rate": 0.0002137873960243585, "loss": 1.1522, "step": 1182 }, { "epoch": 0.38, "grad_norm": 0.2779856324195862, "learning_rate": 0.00021364538142863498, "loss": 1.2422, "step": 1183 }, { "epoch": 0.38, "grad_norm": 2.2244160175323486, "learning_rate": 0.00021350329722369542, "loss": 1.348, "step": 1184 }, { "epoch": 0.38, "grad_norm": 0.3796921372413635, "learning_rate": 0.0002133611435649379, "loss": 1.3133, "step": 1185 }, { "epoch": 0.38, "grad_norm": 0.2678883969783783, "learning_rate": 0.0002132189206078364, "loss": 1.214, "step": 1186 }, { "epoch": 0.38, "grad_norm": 0.27173006534576416, "learning_rate": 0.00021307662850794087, "loss": 1.2519, "step": 1187 }, { "epoch": 0.38, "grad_norm": 0.25615301728248596, "learning_rate": 0.00021293426742087664, "loss": 1.3371, "step": 1188 }, { "epoch": 0.38, "grad_norm": 0.21323223412036896, "learning_rate": 0.00021279183750234475, "loss": 1.2887, "step": 1189 }, { "epoch": 0.38, "grad_norm": 0.24542051553726196, "learning_rate": 0.00021264933890812127, "loss": 1.403, "step": 1190 }, { "epoch": 0.38, "grad_norm": 0.2285470813512802, "learning_rate": 0.00021250677179405757, "loss": 1.2742, "step": 1191 }, { "epoch": 0.38, "grad_norm": 0.405487060546875, "learning_rate": 0.00021236413631607983, "loss": 1.5066, "step": 1192 }, { "epoch": 0.38, "grad_norm": 0.2824304699897766, "learning_rate": 0.00021222143263018915, "loss": 1.2472, "step": 1193 }, { "epoch": 0.38, "grad_norm": 0.29078444838523865, "learning_rate": 0.00021207866089246105, "loss": 1.2308, "step": 1194 }, { "epoch": 0.38, "grad_norm": 0.30253639817237854, "learning_rate": 0.00021193582125904561, "loss": 1.3997, "step": 1195 }, { "epoch": 0.39, "grad_norm": 0.21117225289344788, "learning_rate": 0.00021179291388616716, "loss": 0.975, "step": 1196 }, { "epoch": 0.39, "grad_norm": 0.27029138803482056, "learning_rate": 0.000211649938930124, "loss": 1.1294, "step": 1197 }, { "epoch": 0.39, "grad_norm": 0.2949385941028595, "learning_rate": 0.00021150689654728848, "loss": 1.1066, "step": 1198 }, { "epoch": 0.39, "grad_norm": 0.24892771244049072, "learning_rate": 0.00021136378689410668, "loss": 1.2059, "step": 1199 }, { "epoch": 0.39, "grad_norm": 0.27645790576934814, "learning_rate": 0.00021122061012709815, "loss": 1.2354, "step": 1200 }, { "epoch": 0.39, "grad_norm": 0.23932577669620514, "learning_rate": 0.00021107736640285592, "loss": 1.3201, "step": 1201 }, { "epoch": 0.39, "grad_norm": 0.26966550946235657, "learning_rate": 0.00021093405587804627, "loss": 1.4628, "step": 1202 }, { "epoch": 0.39, "grad_norm": 0.23287123441696167, "learning_rate": 0.00021079067870940848, "loss": 1.5065, "step": 1203 }, { "epoch": 0.39, "grad_norm": 0.3138710856437683, "learning_rate": 0.00021064723505375484, "loss": 1.3879, "step": 1204 }, { "epoch": 0.39, "grad_norm": 0.2674337923526764, "learning_rate": 0.00021050372506797014, "loss": 1.2007, "step": 1205 }, { "epoch": 0.39, "grad_norm": 0.3244151473045349, "learning_rate": 0.00021036014890901192, "loss": 1.402, "step": 1206 }, { "epoch": 0.39, "grad_norm": 0.24924007058143616, "learning_rate": 0.00021021650673390995, "loss": 1.2812, "step": 1207 }, { "epoch": 0.39, "grad_norm": 1.4263643026351929, "learning_rate": 0.00021007279869976638, "loss": 1.2768, "step": 1208 }, { "epoch": 0.39, "grad_norm": 0.5953125953674316, "learning_rate": 0.00020992902496375519, "loss": 1.3451, "step": 1209 }, { "epoch": 0.39, "grad_norm": 0.3636452853679657, "learning_rate": 0.00020978518568312235, "loss": 1.2465, "step": 1210 }, { "epoch": 0.39, "grad_norm": 1.7867380380630493, "learning_rate": 0.00020964128101518543, "loss": 1.2395, "step": 1211 }, { "epoch": 0.39, "grad_norm": 1.534584879875183, "learning_rate": 0.0002094973111173336, "loss": 1.2743, "step": 1212 }, { "epoch": 0.39, "grad_norm": 12.487269401550293, "learning_rate": 0.0002093532761470273, "loss": 1.5784, "step": 1213 }, { "epoch": 0.39, "grad_norm": 0.6728346943855286, "learning_rate": 0.00020920917626179823, "loss": 1.4599, "step": 1214 }, { "epoch": 0.39, "grad_norm": 0.32532382011413574, "learning_rate": 0.00020906501161924897, "loss": 1.2604, "step": 1215 }, { "epoch": 0.39, "grad_norm": 0.3304091691970825, "learning_rate": 0.000208920782377053, "loss": 1.2863, "step": 1216 }, { "epoch": 0.39, "grad_norm": 0.23182323575019836, "learning_rate": 0.0002087764886929544, "loss": 1.1673, "step": 1217 }, { "epoch": 0.39, "grad_norm": 0.4589391350746155, "learning_rate": 0.00020863213072476783, "loss": 1.3729, "step": 1218 }, { "epoch": 0.39, "grad_norm": 0.6436921954154968, "learning_rate": 0.00020848770863037824, "loss": 1.3301, "step": 1219 }, { "epoch": 0.39, "grad_norm": 0.25582990050315857, "learning_rate": 0.00020834322256774052, "loss": 1.0793, "step": 1220 }, { "epoch": 0.39, "grad_norm": 0.3699045181274414, "learning_rate": 0.0002081986726948798, "loss": 1.3453, "step": 1221 }, { "epoch": 0.39, "grad_norm": 0.2602224051952362, "learning_rate": 0.00020805405916989072, "loss": 0.9567, "step": 1222 }, { "epoch": 0.39, "grad_norm": 0.2792646884918213, "learning_rate": 0.00020790938215093792, "loss": 1.1649, "step": 1223 }, { "epoch": 0.39, "grad_norm": 0.3908153474330902, "learning_rate": 0.00020776464179625504, "loss": 1.2516, "step": 1224 }, { "epoch": 0.39, "grad_norm": 0.4792688190937042, "learning_rate": 0.00020761983826414533, "loss": 1.3099, "step": 1225 }, { "epoch": 0.39, "grad_norm": 0.3886181712150574, "learning_rate": 0.000207474971712981, "loss": 1.1806, "step": 1226 }, { "epoch": 0.4, "grad_norm": 0.5824118256568909, "learning_rate": 0.00020733004230120308, "loss": 1.026, "step": 1227 }, { "epoch": 0.4, "grad_norm": 0.3338281512260437, "learning_rate": 0.0002071850501873216, "loss": 1.3484, "step": 1228 }, { "epoch": 0.4, "grad_norm": 0.48280686140060425, "learning_rate": 0.00020703999552991497, "loss": 1.4277, "step": 1229 }, { "epoch": 0.4, "grad_norm": 0.25054460763931274, "learning_rate": 0.00020689487848763005, "loss": 1.0735, "step": 1230 }, { "epoch": 0.4, "grad_norm": 0.26955321431159973, "learning_rate": 0.000206749699219182, "loss": 1.3249, "step": 1231 }, { "epoch": 0.4, "grad_norm": 0.4094875454902649, "learning_rate": 0.00020660445788335394, "loss": 1.2325, "step": 1232 }, { "epoch": 0.4, "grad_norm": 0.4510117173194885, "learning_rate": 0.00020645915463899696, "loss": 1.4591, "step": 1233 }, { "epoch": 0.4, "grad_norm": 0.3244355022907257, "learning_rate": 0.00020631378964502977, "loss": 1.2371, "step": 1234 }, { "epoch": 0.4, "grad_norm": 0.2393747717142105, "learning_rate": 0.00020616836306043874, "loss": 1.3152, "step": 1235 }, { "epoch": 0.4, "grad_norm": 0.3207148015499115, "learning_rate": 0.0002060228750442774, "loss": 1.289, "step": 1236 }, { "epoch": 0.4, "grad_norm": 0.24100615084171295, "learning_rate": 0.0002058773257556667, "loss": 1.2886, "step": 1237 }, { "epoch": 0.4, "grad_norm": 0.2825073003768921, "learning_rate": 0.00020573171535379444, "loss": 1.1907, "step": 1238 }, { "epoch": 0.4, "grad_norm": 0.26788756251335144, "learning_rate": 0.00020558604399791534, "loss": 1.2454, "step": 1239 }, { "epoch": 0.4, "grad_norm": 0.29157260060310364, "learning_rate": 0.00020544031184735082, "loss": 1.3593, "step": 1240 }, { "epoch": 0.4, "grad_norm": 0.2663780450820923, "learning_rate": 0.0002052945190614886, "loss": 1.2803, "step": 1241 }, { "epoch": 0.4, "grad_norm": 0.25722479820251465, "learning_rate": 0.00020514866579978295, "loss": 1.2249, "step": 1242 }, { "epoch": 0.4, "grad_norm": 0.27564549446105957, "learning_rate": 0.00020500275222175415, "loss": 1.1808, "step": 1243 }, { "epoch": 0.4, "grad_norm": 0.2709313929080963, "learning_rate": 0.00020485677848698853, "loss": 1.1497, "step": 1244 }, { "epoch": 0.4, "grad_norm": 0.21792276203632355, "learning_rate": 0.0002047107447551381, "loss": 1.3215, "step": 1245 }, { "epoch": 0.4, "grad_norm": 0.36292344331741333, "learning_rate": 0.00020456465118592058, "loss": 1.2662, "step": 1246 }, { "epoch": 0.4, "grad_norm": 0.28960904479026794, "learning_rate": 0.0002044184979391191, "loss": 1.3634, "step": 1247 }, { "epoch": 0.4, "grad_norm": 0.2862597405910492, "learning_rate": 0.0002042722851745821, "loss": 1.4534, "step": 1248 }, { "epoch": 0.4, "grad_norm": 0.2560533285140991, "learning_rate": 0.00020412601305222302, "loss": 1.187, "step": 1249 }, { "epoch": 0.4, "grad_norm": 0.3362022936344147, "learning_rate": 0.00020397968173202038, "loss": 1.1598, "step": 1250 }, { "epoch": 0.4, "grad_norm": 0.32160186767578125, "learning_rate": 0.00020383329137401728, "loss": 1.3409, "step": 1251 }, { "epoch": 0.4, "grad_norm": 0.48343226313591003, "learning_rate": 0.00020368684213832144, "loss": 1.4584, "step": 1252 }, { "epoch": 0.4, "grad_norm": 0.2727360129356384, "learning_rate": 0.00020354033418510506, "loss": 1.3483, "step": 1253 }, { "epoch": 0.4, "grad_norm": 0.21804024279117584, "learning_rate": 0.00020339376767460442, "loss": 0.9952, "step": 1254 }, { "epoch": 0.4, "grad_norm": 0.22215305268764496, "learning_rate": 0.00020324714276712003, "loss": 1.2851, "step": 1255 }, { "epoch": 0.4, "grad_norm": 0.24394068121910095, "learning_rate": 0.000203100459623016, "loss": 1.2156, "step": 1256 }, { "epoch": 0.4, "grad_norm": 0.2051287591457367, "learning_rate": 0.00020295371840272042, "loss": 1.0852, "step": 1257 }, { "epoch": 0.41, "grad_norm": 0.2536994516849518, "learning_rate": 0.00020280691926672468, "loss": 1.1494, "step": 1258 }, { "epoch": 0.41, "grad_norm": 0.24850700795650482, "learning_rate": 0.00020266006237558364, "loss": 1.1644, "step": 1259 }, { "epoch": 0.41, "grad_norm": 0.28062012791633606, "learning_rate": 0.0002025131478899153, "loss": 1.1152, "step": 1260 }, { "epoch": 0.41, "grad_norm": 0.37917661666870117, "learning_rate": 0.0002023661759704006, "loss": 1.2173, "step": 1261 }, { "epoch": 0.41, "grad_norm": 0.2571663558483124, "learning_rate": 0.00020221914677778335, "loss": 1.1862, "step": 1262 }, { "epoch": 0.41, "grad_norm": 0.2680908739566803, "learning_rate": 0.00020207206047287004, "loss": 1.328, "step": 1263 }, { "epoch": 0.41, "grad_norm": 0.310056209564209, "learning_rate": 0.00020192491721652948, "loss": 1.4154, "step": 1264 }, { "epoch": 0.41, "grad_norm": 0.33653393387794495, "learning_rate": 0.00020177771716969292, "loss": 1.4665, "step": 1265 }, { "epoch": 0.41, "grad_norm": 0.2048826664686203, "learning_rate": 0.0002016304604933536, "loss": 1.2281, "step": 1266 }, { "epoch": 0.41, "grad_norm": 0.2364034503698349, "learning_rate": 0.00020148314734856686, "loss": 1.0593, "step": 1267 }, { "epoch": 0.41, "grad_norm": 0.26425111293792725, "learning_rate": 0.0002013357778964495, "loss": 1.0821, "step": 1268 }, { "epoch": 0.41, "grad_norm": 0.3039323687553406, "learning_rate": 0.00020118835229818033, "loss": 1.3323, "step": 1269 }, { "epoch": 0.41, "grad_norm": 0.23416057229042053, "learning_rate": 0.00020104087071499917, "loss": 1.1192, "step": 1270 }, { "epoch": 0.41, "grad_norm": 0.2914274036884308, "learning_rate": 0.00020089333330820726, "loss": 1.4408, "step": 1271 }, { "epoch": 0.41, "grad_norm": 0.2506740391254425, "learning_rate": 0.00020074574023916698, "loss": 1.1668, "step": 1272 }, { "epoch": 0.41, "grad_norm": 0.24619008600711823, "learning_rate": 0.00020059809166930132, "loss": 1.3363, "step": 1273 }, { "epoch": 0.41, "grad_norm": 0.25422435998916626, "learning_rate": 0.00020045038776009426, "loss": 1.1576, "step": 1274 }, { "epoch": 0.41, "grad_norm": 0.30053389072418213, "learning_rate": 0.00020030262867309006, "loss": 1.2125, "step": 1275 }, { "epoch": 0.41, "grad_norm": 0.31676578521728516, "learning_rate": 0.00020015481456989357, "loss": 1.3692, "step": 1276 }, { "epoch": 0.41, "grad_norm": 0.24905432760715485, "learning_rate": 0.0002000069456121696, "loss": 1.4655, "step": 1277 }, { "epoch": 0.41, "grad_norm": 0.276501327753067, "learning_rate": 0.00019985902196164302, "loss": 1.276, "step": 1278 }, { "epoch": 0.41, "grad_norm": 0.2406342774629593, "learning_rate": 0.0001997110437800986, "loss": 1.1992, "step": 1279 }, { "epoch": 0.41, "grad_norm": 0.24459819495677948, "learning_rate": 0.00019956301122938064, "loss": 1.2005, "step": 1280 }, { "epoch": 0.41, "grad_norm": 0.29144859313964844, "learning_rate": 0.000199414924471393, "loss": 1.3061, "step": 1281 }, { "epoch": 0.41, "grad_norm": 0.263268381357193, "learning_rate": 0.00019926678366809873, "loss": 1.1995, "step": 1282 }, { "epoch": 0.41, "grad_norm": 0.22886592149734497, "learning_rate": 0.00019911858898152005, "loss": 1.166, "step": 1283 }, { "epoch": 0.41, "grad_norm": 0.23738765716552734, "learning_rate": 0.0001989703405737381, "loss": 1.0203, "step": 1284 }, { "epoch": 0.41, "grad_norm": 0.29597777128219604, "learning_rate": 0.00019882203860689278, "loss": 1.2305, "step": 1285 }, { "epoch": 0.41, "grad_norm": 0.2685182988643646, "learning_rate": 0.0001986736832431826, "loss": 1.3109, "step": 1286 }, { "epoch": 0.41, "grad_norm": 0.23401445150375366, "learning_rate": 0.00019852527464486435, "loss": 1.1297, "step": 1287 }, { "epoch": 0.41, "grad_norm": 0.2430437058210373, "learning_rate": 0.0001983768129742532, "loss": 1.3147, "step": 1288 }, { "epoch": 0.42, "grad_norm": 0.255560964345932, "learning_rate": 0.00019822829839372225, "loss": 1.2413, "step": 1289 }, { "epoch": 0.42, "grad_norm": 0.24332919716835022, "learning_rate": 0.0001980797310657025, "loss": 1.0773, "step": 1290 }, { "epoch": 0.42, "grad_norm": 0.23022247850894928, "learning_rate": 0.0001979311111526827, "loss": 1.0797, "step": 1291 }, { "epoch": 0.42, "grad_norm": 0.26815053820610046, "learning_rate": 0.00019778243881720902, "loss": 1.1434, "step": 1292 }, { "epoch": 0.42, "grad_norm": 0.25689926743507385, "learning_rate": 0.000197633714221885, "loss": 1.0312, "step": 1293 }, { "epoch": 0.42, "grad_norm": 0.2148447185754776, "learning_rate": 0.0001974849375293714, "loss": 1.2509, "step": 1294 }, { "epoch": 0.42, "grad_norm": 0.24174413084983826, "learning_rate": 0.00019733610890238586, "loss": 1.1323, "step": 1295 }, { "epoch": 0.42, "grad_norm": 0.24388526380062103, "learning_rate": 0.00019718722850370286, "loss": 1.2062, "step": 1296 }, { "epoch": 0.42, "grad_norm": 0.2509593665599823, "learning_rate": 0.00019703829649615347, "loss": 1.1415, "step": 1297 }, { "epoch": 0.42, "grad_norm": 0.30560100078582764, "learning_rate": 0.00019688931304262533, "loss": 1.2197, "step": 1298 }, { "epoch": 0.42, "grad_norm": 0.20017120242118835, "learning_rate": 0.00019674027830606219, "loss": 1.3501, "step": 1299 }, { "epoch": 0.42, "grad_norm": 0.24840351939201355, "learning_rate": 0.00019659119244946403, "loss": 1.2677, "step": 1300 }, { "epoch": 0.42, "grad_norm": 0.20912209153175354, "learning_rate": 0.00019644205563588654, "loss": 1.3203, "step": 1301 }, { "epoch": 0.42, "grad_norm": 0.30538704991340637, "learning_rate": 0.00019629286802844135, "loss": 1.4279, "step": 1302 }, { "epoch": 0.42, "grad_norm": 0.25008153915405273, "learning_rate": 0.00019614362979029557, "loss": 1.1111, "step": 1303 }, { "epoch": 0.42, "grad_norm": 0.23786742985248566, "learning_rate": 0.00019599434108467164, "loss": 1.3182, "step": 1304 }, { "epoch": 0.42, "grad_norm": 0.22090737521648407, "learning_rate": 0.00019584500207484729, "loss": 1.1997, "step": 1305 }, { "epoch": 0.42, "grad_norm": 0.2523505389690399, "learning_rate": 0.00019569561292415513, "loss": 1.0545, "step": 1306 }, { "epoch": 0.42, "grad_norm": 0.2162105292081833, "learning_rate": 0.00019554617379598273, "loss": 1.3081, "step": 1307 }, { "epoch": 0.42, "grad_norm": 0.2973666489124298, "learning_rate": 0.00019539668485377232, "loss": 1.2669, "step": 1308 }, { "epoch": 0.42, "grad_norm": 0.25674208998680115, "learning_rate": 0.00019524714626102048, "loss": 1.3691, "step": 1309 }, { "epoch": 0.42, "grad_norm": 0.25870808959007263, "learning_rate": 0.00019509755818127825, "loss": 1.3212, "step": 1310 }, { "epoch": 0.42, "grad_norm": 0.24933266639709473, "learning_rate": 0.00019494792077815072, "loss": 1.3742, "step": 1311 }, { "epoch": 0.42, "grad_norm": 0.24605928361415863, "learning_rate": 0.000194798234215297, "loss": 1.2792, "step": 1312 }, { "epoch": 0.42, "grad_norm": 0.22337552905082703, "learning_rate": 0.00019464849865642977, "loss": 0.9058, "step": 1313 }, { "epoch": 0.42, "grad_norm": 0.23972637951374054, "learning_rate": 0.00019449871426531547, "loss": 1.3173, "step": 1314 }, { "epoch": 0.42, "grad_norm": 0.2690894603729248, "learning_rate": 0.00019434888120577396, "loss": 1.3572, "step": 1315 }, { "epoch": 0.42, "grad_norm": 0.24285298585891724, "learning_rate": 0.00019419899964167824, "loss": 1.2276, "step": 1316 }, { "epoch": 0.42, "grad_norm": 0.2160593867301941, "learning_rate": 0.00019404906973695442, "loss": 1.3218, "step": 1317 }, { "epoch": 0.42, "grad_norm": 0.2015027105808258, "learning_rate": 0.0001938990916555814, "loss": 1.2633, "step": 1318 }, { "epoch": 0.42, "grad_norm": 0.2585669457912445, "learning_rate": 0.00019374906556159086, "loss": 1.4395, "step": 1319 }, { "epoch": 0.43, "grad_norm": 0.2580777704715729, "learning_rate": 0.00019359899161906695, "loss": 1.1474, "step": 1320 }, { "epoch": 0.43, "grad_norm": 0.26732850074768066, "learning_rate": 0.00019344886999214618, "loss": 1.3202, "step": 1321 }, { "epoch": 0.43, "grad_norm": 0.26117655634880066, "learning_rate": 0.00019329870084501723, "loss": 1.35, "step": 1322 }, { "epoch": 0.43, "grad_norm": 0.24906988441944122, "learning_rate": 0.00019314848434192058, "loss": 1.2851, "step": 1323 }, { "epoch": 0.43, "grad_norm": 0.2738703191280365, "learning_rate": 0.00019299822064714874, "loss": 1.2568, "step": 1324 }, { "epoch": 0.43, "grad_norm": 0.42745593190193176, "learning_rate": 0.00019284790992504572, "loss": 1.3272, "step": 1325 }, { "epoch": 0.43, "grad_norm": 0.2993389666080475, "learning_rate": 0.00019269755234000699, "loss": 1.313, "step": 1326 }, { "epoch": 0.43, "grad_norm": 0.24389056861400604, "learning_rate": 0.0001925471480564792, "loss": 1.3684, "step": 1327 }, { "epoch": 0.43, "grad_norm": 0.2115030437707901, "learning_rate": 0.00019239669723896026, "loss": 1.069, "step": 1328 }, { "epoch": 0.43, "grad_norm": 0.20956125855445862, "learning_rate": 0.00019224620005199868, "loss": 1.3509, "step": 1329 }, { "epoch": 0.43, "grad_norm": 0.21496626734733582, "learning_rate": 0.00019209565666019398, "loss": 1.3061, "step": 1330 }, { "epoch": 0.43, "grad_norm": 0.21823544800281525, "learning_rate": 0.000191945067228196, "loss": 1.2128, "step": 1331 }, { "epoch": 0.43, "grad_norm": 0.25674375891685486, "learning_rate": 0.00019179443192070513, "loss": 1.0671, "step": 1332 }, { "epoch": 0.43, "grad_norm": 0.26295408606529236, "learning_rate": 0.00019164375090247176, "loss": 1.264, "step": 1333 }, { "epoch": 0.43, "grad_norm": 0.2957514226436615, "learning_rate": 0.00019149302433829632, "loss": 1.3006, "step": 1334 }, { "epoch": 0.43, "grad_norm": 0.2609538733959198, "learning_rate": 0.00019134225239302908, "loss": 1.2758, "step": 1335 }, { "epoch": 0.43, "grad_norm": 0.3169461786746979, "learning_rate": 0.00019119143523157, "loss": 1.3484, "step": 1336 }, { "epoch": 0.43, "grad_norm": 0.2681329846382141, "learning_rate": 0.00019104057301886843, "loss": 1.5341, "step": 1337 }, { "epoch": 0.43, "grad_norm": 0.24301958084106445, "learning_rate": 0.00019088966591992288, "loss": 1.3044, "step": 1338 }, { "epoch": 0.43, "grad_norm": 0.2784822881221771, "learning_rate": 0.00019073871409978115, "loss": 1.1796, "step": 1339 }, { "epoch": 0.43, "grad_norm": 0.26077017188072205, "learning_rate": 0.0001905877177235399, "loss": 1.2951, "step": 1340 }, { "epoch": 0.43, "grad_norm": 0.19372043013572693, "learning_rate": 0.00019043667695634446, "loss": 0.9712, "step": 1341 }, { "epoch": 0.43, "grad_norm": 0.23456844687461853, "learning_rate": 0.00019028559196338868, "loss": 1.3807, "step": 1342 }, { "epoch": 0.43, "grad_norm": 0.2496589720249176, "learning_rate": 0.0001901344629099149, "loss": 1.3283, "step": 1343 }, { "epoch": 0.43, "grad_norm": 0.23619501292705536, "learning_rate": 0.00018998328996121358, "loss": 1.3489, "step": 1344 }, { "epoch": 0.43, "grad_norm": 0.270975261926651, "learning_rate": 0.00018983207328262318, "loss": 1.2126, "step": 1345 }, { "epoch": 0.43, "grad_norm": 0.30686482787132263, "learning_rate": 0.00018968081303953006, "loss": 1.1829, "step": 1346 }, { "epoch": 0.43, "grad_norm": 0.22022150456905365, "learning_rate": 0.0001895295093973681, "loss": 1.1734, "step": 1347 }, { "epoch": 0.43, "grad_norm": 1.1094398498535156, "learning_rate": 0.00018937816252161875, "loss": 1.4484, "step": 1348 }, { "epoch": 0.43, "grad_norm": 0.23081688582897186, "learning_rate": 0.00018922677257781072, "loss": 1.2288, "step": 1349 }, { "epoch": 0.43, "grad_norm": 0.28901249170303345, "learning_rate": 0.0001890753397315198, "loss": 1.3983, "step": 1350 }, { "epoch": 0.44, "grad_norm": 0.2851351201534271, "learning_rate": 0.00018892386414836874, "loss": 1.2642, "step": 1351 }, { "epoch": 0.44, "grad_norm": 0.285856693983078, "learning_rate": 0.000188772345994027, "loss": 1.2954, "step": 1352 }, { "epoch": 0.44, "grad_norm": 0.2640651762485504, "learning_rate": 0.0001886207854342106, "loss": 1.3729, "step": 1353 }, { "epoch": 0.44, "grad_norm": 0.2769777774810791, "learning_rate": 0.0001884691826346819, "loss": 1.2615, "step": 1354 }, { "epoch": 0.44, "grad_norm": 0.536563515663147, "learning_rate": 0.00018831753776124956, "loss": 1.3672, "step": 1355 }, { "epoch": 0.44, "grad_norm": 0.2593139111995697, "learning_rate": 0.00018816585097976827, "loss": 1.2041, "step": 1356 }, { "epoch": 0.44, "grad_norm": 0.21815559267997742, "learning_rate": 0.00018801412245613838, "loss": 1.1569, "step": 1357 }, { "epoch": 0.44, "grad_norm": 0.7389503717422485, "learning_rate": 0.0001878623523563061, "loss": 1.0088, "step": 1358 }, { "epoch": 0.44, "grad_norm": 0.32005685567855835, "learning_rate": 0.0001877105408462629, "loss": 1.1232, "step": 1359 }, { "epoch": 0.44, "grad_norm": 0.2618655860424042, "learning_rate": 0.00018755868809204569, "loss": 1.1203, "step": 1360 }, { "epoch": 0.44, "grad_norm": 0.3280000388622284, "learning_rate": 0.00018740679425973653, "loss": 1.2815, "step": 1361 }, { "epoch": 0.44, "grad_norm": 0.25135889649391174, "learning_rate": 0.00018725485951546223, "loss": 1.171, "step": 1362 }, { "epoch": 0.44, "grad_norm": 0.2526176869869232, "learning_rate": 0.00018710288402539453, "loss": 1.2498, "step": 1363 }, { "epoch": 0.44, "grad_norm": 0.2644837498664856, "learning_rate": 0.0001869508679557496, "loss": 1.3939, "step": 1364 }, { "epoch": 0.44, "grad_norm": 0.37182486057281494, "learning_rate": 0.00018679881147278804, "loss": 1.0589, "step": 1365 }, { "epoch": 0.44, "grad_norm": 0.2478458136320114, "learning_rate": 0.00018664671474281466, "loss": 1.0522, "step": 1366 }, { "epoch": 0.44, "grad_norm": 0.2045702487230301, "learning_rate": 0.0001864945779321783, "loss": 0.9671, "step": 1367 }, { "epoch": 0.44, "grad_norm": 0.25786322355270386, "learning_rate": 0.00018634240120727163, "loss": 1.2126, "step": 1368 }, { "epoch": 0.44, "grad_norm": 0.2558210790157318, "learning_rate": 0.0001861901847345309, "loss": 1.2481, "step": 1369 }, { "epoch": 0.44, "grad_norm": 0.22150172293186188, "learning_rate": 0.00018603792868043595, "loss": 1.1868, "step": 1370 }, { "epoch": 0.44, "grad_norm": 0.35407787561416626, "learning_rate": 0.00018588563321150978, "loss": 1.3735, "step": 1371 }, { "epoch": 0.44, "grad_norm": 0.2171659916639328, "learning_rate": 0.00018573329849431866, "loss": 1.2289, "step": 1372 }, { "epoch": 0.44, "grad_norm": 0.2658819258213043, "learning_rate": 0.0001855809246954717, "loss": 1.1468, "step": 1373 }, { "epoch": 0.44, "grad_norm": 0.22894729673862457, "learning_rate": 0.00018542851198162066, "loss": 1.2728, "step": 1374 }, { "epoch": 0.44, "grad_norm": 0.24462756514549255, "learning_rate": 0.00018527606051946006, "loss": 1.4897, "step": 1375 }, { "epoch": 0.44, "grad_norm": 0.3109615743160248, "learning_rate": 0.0001851235704757266, "loss": 1.417, "step": 1376 }, { "epoch": 0.44, "grad_norm": 0.23328709602355957, "learning_rate": 0.00018497104201719932, "loss": 1.2826, "step": 1377 }, { "epoch": 0.44, "grad_norm": 0.20979438722133636, "learning_rate": 0.0001848184753106992, "loss": 0.9592, "step": 1378 }, { "epoch": 0.44, "grad_norm": 0.2272309809923172, "learning_rate": 0.00018466587052308911, "loss": 1.2967, "step": 1379 }, { "epoch": 0.44, "grad_norm": 0.2982543706893921, "learning_rate": 0.0001845132278212735, "loss": 1.1735, "step": 1380 }, { "epoch": 0.44, "grad_norm": 0.27587994933128357, "learning_rate": 0.00018436054737219832, "loss": 1.1957, "step": 1381 }, { "epoch": 0.45, "grad_norm": 0.277906596660614, "learning_rate": 0.00018420782934285088, "loss": 1.243, "step": 1382 }, { "epoch": 0.45, "grad_norm": 0.6120585799217224, "learning_rate": 0.00018405507390025945, "loss": 1.5157, "step": 1383 }, { "epoch": 0.45, "grad_norm": 0.2808517515659332, "learning_rate": 0.00018390228121149328, "loss": 1.1561, "step": 1384 }, { "epoch": 0.45, "grad_norm": 0.2600584328174591, "learning_rate": 0.00018374945144366238, "loss": 1.0842, "step": 1385 }, { "epoch": 0.45, "grad_norm": 0.21619965136051178, "learning_rate": 0.0001835965847639173, "loss": 1.2054, "step": 1386 }, { "epoch": 0.45, "grad_norm": 0.23842580616474152, "learning_rate": 0.00018344368133944898, "loss": 1.1654, "step": 1387 }, { "epoch": 0.45, "grad_norm": 0.23953334987163544, "learning_rate": 0.0001832907413374885, "loss": 1.2496, "step": 1388 }, { "epoch": 0.45, "grad_norm": 0.2894213795661926, "learning_rate": 0.000183137764925307, "loss": 1.5301, "step": 1389 }, { "epoch": 0.45, "grad_norm": 0.2559410333633423, "learning_rate": 0.0001829847522702153, "loss": 1.327, "step": 1390 }, { "epoch": 0.45, "grad_norm": 0.26667001843452454, "learning_rate": 0.00018283170353956412, "loss": 1.1302, "step": 1391 }, { "epoch": 0.45, "grad_norm": 0.2339579463005066, "learning_rate": 0.00018267861890074345, "loss": 1.0505, "step": 1392 }, { "epoch": 0.45, "grad_norm": 0.27073046565055847, "learning_rate": 0.00018252549852118254, "loss": 1.1058, "step": 1393 }, { "epoch": 0.45, "grad_norm": 0.28436413407325745, "learning_rate": 0.0001823723425683498, "loss": 1.1779, "step": 1394 }, { "epoch": 0.45, "grad_norm": 0.2777477204799652, "learning_rate": 0.00018221915120975255, "loss": 1.1632, "step": 1395 }, { "epoch": 0.45, "grad_norm": 0.2608356773853302, "learning_rate": 0.00018206592461293675, "loss": 1.3781, "step": 1396 }, { "epoch": 0.45, "grad_norm": 0.3091738522052765, "learning_rate": 0.00018191266294548708, "loss": 1.3795, "step": 1397 }, { "epoch": 0.45, "grad_norm": 0.405531108379364, "learning_rate": 0.00018175936637502632, "loss": 1.3409, "step": 1398 }, { "epoch": 0.45, "grad_norm": 0.2255665510892868, "learning_rate": 0.00018160603506921572, "loss": 0.9987, "step": 1399 }, { "epoch": 0.45, "grad_norm": 0.21383413672447205, "learning_rate": 0.00018145266919575417, "loss": 1.3916, "step": 1400 }, { "epoch": 0.45, "grad_norm": 0.2389056384563446, "learning_rate": 0.00018129926892237861, "loss": 1.2178, "step": 1401 }, { "epoch": 0.45, "grad_norm": 0.2516612410545349, "learning_rate": 0.00018114583441686372, "loss": 1.4721, "step": 1402 }, { "epoch": 0.45, "grad_norm": 0.25566530227661133, "learning_rate": 0.00018099236584702126, "loss": 1.3398, "step": 1403 }, { "epoch": 0.45, "grad_norm": 0.20767934620380402, "learning_rate": 0.00018083886338070054, "loss": 1.1419, "step": 1404 }, { "epoch": 0.45, "grad_norm": 0.2441464066505432, "learning_rate": 0.0001806853271857878, "loss": 1.3558, "step": 1405 }, { "epoch": 0.45, "grad_norm": 0.22742381691932678, "learning_rate": 0.00018053175743020617, "loss": 1.3675, "step": 1406 }, { "epoch": 0.45, "grad_norm": 0.27875831723213196, "learning_rate": 0.0001803781542819156, "loss": 1.3517, "step": 1407 }, { "epoch": 0.45, "grad_norm": 0.28443703055381775, "learning_rate": 0.00018022451790891244, "loss": 1.3404, "step": 1408 }, { "epoch": 0.45, "grad_norm": 0.21591919660568237, "learning_rate": 0.00018007084847922948, "loss": 1.0503, "step": 1409 }, { "epoch": 0.45, "grad_norm": 0.3335200250148773, "learning_rate": 0.00017991714616093552, "loss": 1.3138, "step": 1410 }, { "epoch": 0.45, "grad_norm": 0.2156064659357071, "learning_rate": 0.0001797634111221355, "loss": 1.363, "step": 1411 }, { "epoch": 0.45, "grad_norm": 0.20503109693527222, "learning_rate": 0.00017960964353097, "loss": 1.1755, "step": 1412 }, { "epoch": 0.46, "grad_norm": 0.2487822026014328, "learning_rate": 0.0001794558435556153, "loss": 1.1136, "step": 1413 }, { "epoch": 0.46, "grad_norm": 0.21257705986499786, "learning_rate": 0.0001793020113642831, "loss": 1.1641, "step": 1414 }, { "epoch": 0.46, "grad_norm": 0.2611944079399109, "learning_rate": 0.00017914814712522028, "loss": 1.1375, "step": 1415 }, { "epoch": 0.46, "grad_norm": 0.24480777978897095, "learning_rate": 0.00017899425100670877, "loss": 1.241, "step": 1416 }, { "epoch": 0.46, "grad_norm": 0.30976399779319763, "learning_rate": 0.0001788403231770654, "loss": 1.249, "step": 1417 }, { "epoch": 0.46, "grad_norm": 0.2510818839073181, "learning_rate": 0.00017868636380464166, "loss": 1.3378, "step": 1418 }, { "epoch": 0.46, "grad_norm": 0.23657752573490143, "learning_rate": 0.00017853237305782358, "loss": 1.1717, "step": 1419 }, { "epoch": 0.46, "grad_norm": 0.23904891312122345, "learning_rate": 0.00017837835110503142, "loss": 1.4122, "step": 1420 }, { "epoch": 0.46, "grad_norm": 0.20305310189723969, "learning_rate": 0.0001782242981147197, "loss": 1.1025, "step": 1421 }, { "epoch": 0.46, "grad_norm": 0.24740561842918396, "learning_rate": 0.00017807021425537677, "loss": 1.3244, "step": 1422 }, { "epoch": 0.46, "grad_norm": 0.5409308671951294, "learning_rate": 0.0001779160996955248, "loss": 1.0038, "step": 1423 }, { "epoch": 0.46, "grad_norm": 0.2471180558204651, "learning_rate": 0.0001777619546037195, "loss": 1.2922, "step": 1424 }, { "epoch": 0.46, "grad_norm": 0.25401636958122253, "learning_rate": 0.00017760777914855, "loss": 1.241, "step": 1425 }, { "epoch": 0.46, "grad_norm": 0.28496992588043213, "learning_rate": 0.00017745357349863863, "loss": 0.9695, "step": 1426 }, { "epoch": 0.46, "grad_norm": 0.2682209610939026, "learning_rate": 0.0001772993378226407, "loss": 1.5614, "step": 1427 }, { "epoch": 0.46, "grad_norm": 0.20310848951339722, "learning_rate": 0.00017714507228924452, "loss": 1.0688, "step": 1428 }, { "epoch": 0.46, "grad_norm": 0.21409355103969574, "learning_rate": 0.0001769907770671708, "loss": 1.3331, "step": 1429 }, { "epoch": 0.46, "grad_norm": 0.2100203037261963, "learning_rate": 0.00017683645232517302, "loss": 1.1126, "step": 1430 }, { "epoch": 0.46, "grad_norm": 0.2546529173851013, "learning_rate": 0.0001766820982320366, "loss": 1.3756, "step": 1431 }, { "epoch": 0.46, "grad_norm": 0.2026955634355545, "learning_rate": 0.00017652771495657937, "loss": 1.2528, "step": 1432 }, { "epoch": 0.46, "grad_norm": 0.3088555335998535, "learning_rate": 0.00017637330266765097, "loss": 1.1712, "step": 1433 }, { "epoch": 0.46, "grad_norm": 0.2645629942417145, "learning_rate": 0.0001762188615341327, "loss": 1.3002, "step": 1434 }, { "epoch": 0.46, "grad_norm": 0.27968308329582214, "learning_rate": 0.00017606439172493748, "loss": 1.2503, "step": 1435 }, { "epoch": 0.46, "grad_norm": 0.24719227850437164, "learning_rate": 0.00017590989340900959, "loss": 1.2583, "step": 1436 }, { "epoch": 0.46, "grad_norm": 0.229497030377388, "learning_rate": 0.0001757553667553244, "loss": 1.1235, "step": 1437 }, { "epoch": 0.46, "grad_norm": 0.22048208117485046, "learning_rate": 0.00017560081193288848, "loss": 1.3804, "step": 1438 }, { "epoch": 0.46, "grad_norm": 0.22758013010025024, "learning_rate": 0.00017544622911073898, "loss": 0.8674, "step": 1439 }, { "epoch": 0.46, "grad_norm": 0.25167861580848694, "learning_rate": 0.0001752916184579438, "loss": 1.3606, "step": 1440 }, { "epoch": 0.46, "grad_norm": 0.250252902507782, "learning_rate": 0.00017513698014360124, "loss": 1.2121, "step": 1441 }, { "epoch": 0.46, "grad_norm": 0.20466458797454834, "learning_rate": 0.00017498231433683988, "loss": 1.0989, "step": 1442 }, { "epoch": 0.46, "grad_norm": 0.2575603425502777, "learning_rate": 0.0001748276212068183, "loss": 1.0429, "step": 1443 }, { "epoch": 0.47, "grad_norm": 0.22591513395309448, "learning_rate": 0.0001746729009227251, "loss": 0.9964, "step": 1444 }, { "epoch": 0.47, "grad_norm": 0.28525248169898987, "learning_rate": 0.00017451815365377845, "loss": 1.2448, "step": 1445 }, { "epoch": 0.47, "grad_norm": 0.23043383657932281, "learning_rate": 0.00017436337956922604, "loss": 1.3895, "step": 1446 }, { "epoch": 0.47, "grad_norm": 0.25271075963974, "learning_rate": 0.00017420857883834493, "loss": 0.9865, "step": 1447 }, { "epoch": 0.47, "grad_norm": 0.2077396959066391, "learning_rate": 0.00017405375163044137, "loss": 1.1557, "step": 1448 }, { "epoch": 0.47, "grad_norm": 0.2676021456718445, "learning_rate": 0.0001738988981148505, "loss": 1.4366, "step": 1449 }, { "epoch": 0.47, "grad_norm": 0.2379731833934784, "learning_rate": 0.00017374401846093624, "loss": 1.0559, "step": 1450 }, { "epoch": 0.47, "grad_norm": 0.254433274269104, "learning_rate": 0.0001735891128380911, "loss": 1.2726, "step": 1451 }, { "epoch": 0.47, "grad_norm": 0.7841512560844421, "learning_rate": 0.00017343418141573596, "loss": 1.297, "step": 1452 }, { "epoch": 0.47, "grad_norm": 0.21455001831054688, "learning_rate": 0.00017327922436332, "loss": 1.2935, "step": 1453 }, { "epoch": 0.47, "grad_norm": 0.214283287525177, "learning_rate": 0.00017312424185032043, "loss": 0.9537, "step": 1454 }, { "epoch": 0.47, "grad_norm": 0.2814158797264099, "learning_rate": 0.0001729692340462422, "loss": 1.3703, "step": 1455 }, { "epoch": 0.47, "grad_norm": 0.21305161714553833, "learning_rate": 0.000172814201120618, "loss": 1.1536, "step": 1456 }, { "epoch": 0.47, "grad_norm": 0.26291581988334656, "learning_rate": 0.00017265914324300798, "loss": 1.1388, "step": 1457 }, { "epoch": 0.47, "grad_norm": 0.2189137488603592, "learning_rate": 0.00017250406058299955, "loss": 1.0634, "step": 1458 }, { "epoch": 0.47, "grad_norm": 0.24101874232292175, "learning_rate": 0.00017234895331020734, "loss": 0.8994, "step": 1459 }, { "epoch": 0.47, "grad_norm": 0.23657524585723877, "learning_rate": 0.00017219382159427271, "loss": 1.214, "step": 1460 }, { "epoch": 0.47, "grad_norm": 0.30180686712265015, "learning_rate": 0.00017203866560486393, "loss": 1.5045, "step": 1461 }, { "epoch": 0.47, "grad_norm": 0.2578027844429016, "learning_rate": 0.0001718834855116757, "loss": 1.4631, "step": 1462 }, { "epoch": 0.47, "grad_norm": 0.25256744027137756, "learning_rate": 0.00017172828148442917, "loss": 1.423, "step": 1463 }, { "epoch": 0.47, "grad_norm": 0.24915343523025513, "learning_rate": 0.00017157305369287163, "loss": 1.3429, "step": 1464 }, { "epoch": 0.47, "grad_norm": 0.24765363335609436, "learning_rate": 0.0001714178023067763, "loss": 1.2237, "step": 1465 }, { "epoch": 0.47, "grad_norm": 0.26353564858436584, "learning_rate": 0.00017126252749594229, "loss": 1.3731, "step": 1466 }, { "epoch": 0.47, "grad_norm": 0.286467581987381, "learning_rate": 0.00017110722943019428, "loss": 1.3894, "step": 1467 }, { "epoch": 0.47, "grad_norm": 0.24320590496063232, "learning_rate": 0.00017095190827938245, "loss": 1.1634, "step": 1468 }, { "epoch": 0.47, "grad_norm": 0.2334159016609192, "learning_rate": 0.00017079656421338217, "loss": 1.2717, "step": 1469 }, { "epoch": 0.47, "grad_norm": 0.23037934303283691, "learning_rate": 0.00017064119740209385, "loss": 1.4041, "step": 1470 }, { "epoch": 0.47, "grad_norm": 0.22255410254001617, "learning_rate": 0.0001704858080154429, "loss": 1.3183, "step": 1471 }, { "epoch": 0.47, "grad_norm": 0.21216066181659698, "learning_rate": 0.00017033039622337914, "loss": 1.2911, "step": 1472 }, { "epoch": 0.47, "grad_norm": 0.2603231370449066, "learning_rate": 0.0001701749621958773, "loss": 1.3778, "step": 1473 }, { "epoch": 0.47, "grad_norm": 0.2557697594165802, "learning_rate": 0.00017001950610293616, "loss": 1.3548, "step": 1474 }, { "epoch": 0.48, "grad_norm": 0.3435133397579193, "learning_rate": 0.00016986402811457863, "loss": 1.378, "step": 1475 }, { "epoch": 0.48, "grad_norm": 0.20626889169216156, "learning_rate": 0.00016970852840085166, "loss": 1.0778, "step": 1476 }, { "epoch": 0.48, "grad_norm": 0.2134951502084732, "learning_rate": 0.00016955300713182589, "loss": 1.2601, "step": 1477 }, { "epoch": 0.48, "grad_norm": 0.204842209815979, "learning_rate": 0.00016939746447759566, "loss": 1.0933, "step": 1478 }, { "epoch": 0.48, "grad_norm": 0.2518988251686096, "learning_rate": 0.00016924190060827856, "loss": 1.1646, "step": 1479 }, { "epoch": 0.48, "grad_norm": 0.2374560832977295, "learning_rate": 0.0001690863156940154, "loss": 1.3568, "step": 1480 }, { "epoch": 0.48, "grad_norm": 0.24957898259162903, "learning_rate": 0.00016893070990497015, "loss": 1.1031, "step": 1481 }, { "epoch": 0.48, "grad_norm": 0.20891770720481873, "learning_rate": 0.00016877508341132938, "loss": 1.2222, "step": 1482 }, { "epoch": 0.48, "grad_norm": 0.24333617091178894, "learning_rate": 0.0001686194363833025, "loss": 1.4099, "step": 1483 }, { "epoch": 0.48, "grad_norm": 0.3059709072113037, "learning_rate": 0.00016846376899112126, "loss": 1.4515, "step": 1484 }, { "epoch": 0.48, "grad_norm": 0.23012784123420715, "learning_rate": 0.00016830808140503976, "loss": 1.3418, "step": 1485 }, { "epoch": 0.48, "grad_norm": 0.20992106199264526, "learning_rate": 0.00016815237379533423, "loss": 1.2318, "step": 1486 }, { "epoch": 0.48, "grad_norm": 0.2109375298023224, "learning_rate": 0.00016799664633230255, "loss": 1.2648, "step": 1487 }, { "epoch": 0.48, "grad_norm": 0.24101604521274567, "learning_rate": 0.0001678408991862646, "loss": 1.1905, "step": 1488 }, { "epoch": 0.48, "grad_norm": 0.23678269982337952, "learning_rate": 0.00016768513252756167, "loss": 1.202, "step": 1489 }, { "epoch": 0.48, "grad_norm": 0.8147335648536682, "learning_rate": 0.00016752934652655637, "loss": 1.2483, "step": 1490 }, { "epoch": 0.48, "grad_norm": 0.21765907108783722, "learning_rate": 0.00016737354135363253, "loss": 1.3171, "step": 1491 }, { "epoch": 0.48, "grad_norm": 0.2073349505662918, "learning_rate": 0.0001672177171791949, "loss": 1.117, "step": 1492 }, { "epoch": 0.48, "grad_norm": 0.2179872989654541, "learning_rate": 0.000167061874173669, "loss": 1.3385, "step": 1493 }, { "epoch": 0.48, "grad_norm": 0.26294490694999695, "learning_rate": 0.00016690601250750095, "loss": 1.1413, "step": 1494 }, { "epoch": 0.48, "grad_norm": 0.23928041756153107, "learning_rate": 0.00016675013235115736, "loss": 1.2653, "step": 1495 }, { "epoch": 0.48, "grad_norm": 0.21966585516929626, "learning_rate": 0.00016659423387512494, "loss": 1.0253, "step": 1496 }, { "epoch": 0.48, "grad_norm": 0.21739692986011505, "learning_rate": 0.00016643831724991054, "loss": 1.0595, "step": 1497 }, { "epoch": 0.48, "grad_norm": 0.2546144425868988, "learning_rate": 0.0001662823826460408, "loss": 1.2261, "step": 1498 }, { "epoch": 0.48, "grad_norm": 0.2508935332298279, "learning_rate": 0.000166126430234062, "loss": 1.2881, "step": 1499 }, { "epoch": 0.48, "grad_norm": 0.2539462745189667, "learning_rate": 0.00016597046018454, "loss": 1.2648, "step": 1500 }, { "epoch": 0.48, "grad_norm": 0.21485939621925354, "learning_rate": 0.00016581447266805982, "loss": 1.2879, "step": 1501 }, { "epoch": 0.48, "grad_norm": 0.21827048063278198, "learning_rate": 0.0001656584678552257, "loss": 1.2051, "step": 1502 }, { "epoch": 0.48, "grad_norm": 0.8284204602241516, "learning_rate": 0.00016550244591666076, "loss": 1.3674, "step": 1503 }, { "epoch": 0.48, "grad_norm": 0.29936954379081726, "learning_rate": 0.00016534640702300677, "loss": 1.0292, "step": 1504 }, { "epoch": 0.48, "grad_norm": 0.25196605920791626, "learning_rate": 0.0001651903513449242, "loss": 1.0705, "step": 1505 }, { "epoch": 0.49, "grad_norm": 0.2159924954175949, "learning_rate": 0.00016503427905309172, "loss": 1.1852, "step": 1506 }, { "epoch": 0.49, "grad_norm": 0.25989067554473877, "learning_rate": 0.00016487819031820627, "loss": 1.2115, "step": 1507 }, { "epoch": 0.49, "grad_norm": 0.26670318841934204, "learning_rate": 0.00016472208531098268, "loss": 1.3172, "step": 1508 }, { "epoch": 0.49, "grad_norm": 0.22528024017810822, "learning_rate": 0.00016456596420215373, "loss": 0.9948, "step": 1509 }, { "epoch": 0.49, "grad_norm": 0.251113623380661, "learning_rate": 0.00016440982716246972, "loss": 1.3811, "step": 1510 }, { "epoch": 0.49, "grad_norm": 0.27358734607696533, "learning_rate": 0.00016425367436269828, "loss": 0.98, "step": 1511 }, { "epoch": 0.49, "grad_norm": 0.26939284801483154, "learning_rate": 0.00016409750597362447, "loss": 1.1206, "step": 1512 }, { "epoch": 0.49, "grad_norm": 0.2426464855670929, "learning_rate": 0.00016394132216605018, "loss": 1.2834, "step": 1513 }, { "epoch": 0.49, "grad_norm": 0.22493954002857208, "learning_rate": 0.00016378512311079443, "loss": 1.0831, "step": 1514 }, { "epoch": 0.49, "grad_norm": 0.3227583169937134, "learning_rate": 0.00016362890897869272, "loss": 1.2375, "step": 1515 }, { "epoch": 0.49, "grad_norm": 0.31195446848869324, "learning_rate": 0.00016347267994059703, "loss": 1.4659, "step": 1516 }, { "epoch": 0.49, "grad_norm": 0.2591206729412079, "learning_rate": 0.0001633164361673758, "loss": 1.3171, "step": 1517 }, { "epoch": 0.49, "grad_norm": 0.24098674952983856, "learning_rate": 0.00016316017782991337, "loss": 1.1248, "step": 1518 }, { "epoch": 0.49, "grad_norm": 0.3440319001674652, "learning_rate": 0.00016300390509911024, "loss": 1.1657, "step": 1519 }, { "epoch": 0.49, "grad_norm": 0.22756685316562653, "learning_rate": 0.00016284761814588247, "loss": 1.1603, "step": 1520 }, { "epoch": 0.49, "grad_norm": 0.24171866476535797, "learning_rate": 0.00016269131714116177, "loss": 1.2797, "step": 1521 }, { "epoch": 0.49, "grad_norm": 0.27722859382629395, "learning_rate": 0.0001625350022558952, "loss": 1.2482, "step": 1522 }, { "epoch": 0.49, "grad_norm": 0.22590476274490356, "learning_rate": 0.00016237867366104497, "loss": 1.228, "step": 1523 }, { "epoch": 0.49, "grad_norm": 0.24651046097278595, "learning_rate": 0.00016222233152758835, "loss": 1.3233, "step": 1524 }, { "epoch": 0.49, "grad_norm": 0.21102569997310638, "learning_rate": 0.00016206597602651729, "loss": 1.14, "step": 1525 }, { "epoch": 0.49, "grad_norm": 0.34059643745422363, "learning_rate": 0.00016190960732883854, "loss": 1.2275, "step": 1526 }, { "epoch": 0.49, "grad_norm": 0.20935557782649994, "learning_rate": 0.0001617532256055732, "loss": 1.1146, "step": 1527 }, { "epoch": 0.49, "grad_norm": 0.24451646208763123, "learning_rate": 0.00016159683102775653, "loss": 1.1026, "step": 1528 }, { "epoch": 0.49, "grad_norm": 0.22406873106956482, "learning_rate": 0.00016144042376643796, "loss": 1.2665, "step": 1529 }, { "epoch": 0.49, "grad_norm": 0.2598498463630676, "learning_rate": 0.00016128400399268073, "loss": 1.3574, "step": 1530 }, { "epoch": 0.49, "grad_norm": 0.22919444739818573, "learning_rate": 0.00016112757187756186, "loss": 1.3355, "step": 1531 }, { "epoch": 0.49, "grad_norm": 0.22339090704917908, "learning_rate": 0.0001609711275921717, "loss": 1.1446, "step": 1532 }, { "epoch": 0.49, "grad_norm": 0.21919262409210205, "learning_rate": 0.00016081467130761408, "loss": 1.1632, "step": 1533 }, { "epoch": 0.49, "grad_norm": 0.2630949318408966, "learning_rate": 0.0001606582031950059, "loss": 1.3923, "step": 1534 }, { "epoch": 0.49, "grad_norm": 0.31600168347358704, "learning_rate": 0.0001605017234254769, "loss": 1.2651, "step": 1535 }, { "epoch": 0.49, "grad_norm": 0.26233118772506714, "learning_rate": 0.00016034523217016972, "loss": 1.3185, "step": 1536 }, { "epoch": 0.5, "grad_norm": 0.25318318605422974, "learning_rate": 0.00016018872960023948, "loss": 1.2105, "step": 1537 }, { "epoch": 0.5, "grad_norm": 0.2502976357936859, "learning_rate": 0.00016003221588685362, "loss": 1.2468, "step": 1538 }, { "epoch": 0.5, "grad_norm": 0.22900889813899994, "learning_rate": 0.0001598756912011919, "loss": 1.1783, "step": 1539 }, { "epoch": 0.5, "grad_norm": 0.23169326782226562, "learning_rate": 0.00015971915571444604, "loss": 1.3507, "step": 1540 }, { "epoch": 0.5, "grad_norm": 0.23655344545841217, "learning_rate": 0.0001595626095978195, "loss": 1.3462, "step": 1541 }, { "epoch": 0.5, "grad_norm": 0.2406223863363266, "learning_rate": 0.00015940605302252738, "loss": 1.2657, "step": 1542 }, { "epoch": 0.5, "grad_norm": 0.23621821403503418, "learning_rate": 0.00015924948615979635, "loss": 1.228, "step": 1543 }, { "epoch": 0.5, "grad_norm": 0.22758765518665314, "learning_rate": 0.0001590929091808641, "loss": 1.2687, "step": 1544 }, { "epoch": 0.5, "grad_norm": 0.26736927032470703, "learning_rate": 0.00015893632225697963, "loss": 1.3929, "step": 1545 }, { "epoch": 0.5, "grad_norm": 0.23600561916828156, "learning_rate": 0.0001587797255594027, "loss": 1.2151, "step": 1546 }, { "epoch": 0.5, "grad_norm": 0.2248542755842209, "learning_rate": 0.00015862311925940372, "loss": 1.3484, "step": 1547 }, { "epoch": 0.5, "grad_norm": 0.21238739788532257, "learning_rate": 0.0001584665035282636, "loss": 1.2715, "step": 1548 }, { "epoch": 0.5, "grad_norm": 0.2529599368572235, "learning_rate": 0.00015830987853727372, "loss": 1.256, "step": 1549 }, { "epoch": 0.5, "grad_norm": 0.25334468483924866, "learning_rate": 0.0001581532444577354, "loss": 1.4364, "step": 1550 }, { "epoch": 0.5, "grad_norm": 0.2232888787984848, "learning_rate": 0.00015799660146096004, "loss": 1.2773, "step": 1551 }, { "epoch": 0.5, "grad_norm": 0.34795913100242615, "learning_rate": 0.00015783994971826869, "loss": 1.2711, "step": 1552 }, { "epoch": 0.5, "grad_norm": 0.24499773979187012, "learning_rate": 0.000157683289400992, "loss": 0.897, "step": 1553 }, { "epoch": 0.5, "grad_norm": 0.24803298711776733, "learning_rate": 0.00015752662068046997, "loss": 0.9971, "step": 1554 }, { "epoch": 0.5, "grad_norm": 0.25855711102485657, "learning_rate": 0.0001573699437280519, "loss": 1.3811, "step": 1555 }, { "epoch": 0.5, "grad_norm": 0.24379390478134155, "learning_rate": 0.00015721325871509602, "loss": 1.0263, "step": 1556 }, { "epoch": 0.5, "grad_norm": 0.21267180144786835, "learning_rate": 0.00015705656581296932, "loss": 1.1091, "step": 1557 }, { "epoch": 0.5, "grad_norm": 0.26950404047966003, "learning_rate": 0.00015689986519304755, "loss": 1.3979, "step": 1558 }, { "epoch": 0.5, "grad_norm": 0.2216482311487198, "learning_rate": 0.0001567431570267147, "loss": 1.4078, "step": 1559 }, { "epoch": 0.5, "grad_norm": 0.22988948225975037, "learning_rate": 0.00015658644148536328, "loss": 1.2564, "step": 1560 }, { "epoch": 0.5, "grad_norm": 0.21240773797035217, "learning_rate": 0.0001564297187403936, "loss": 1.2279, "step": 1561 }, { "epoch": 0.5, "grad_norm": 0.2505764067173004, "learning_rate": 0.00015627298896321403, "loss": 1.3517, "step": 1562 }, { "epoch": 0.5, "grad_norm": 0.19865256547927856, "learning_rate": 0.00015611625232524062, "loss": 1.0169, "step": 1563 }, { "epoch": 0.5, "grad_norm": 0.3551575839519501, "learning_rate": 0.00015595950899789676, "loss": 1.154, "step": 1564 }, { "epoch": 0.5, "grad_norm": 0.25363636016845703, "learning_rate": 0.00015580275915261335, "loss": 1.3528, "step": 1565 }, { "epoch": 0.5, "grad_norm": 0.35021254420280457, "learning_rate": 0.0001556460029608284, "loss": 1.2488, "step": 1566 }, { "epoch": 0.5, "grad_norm": 0.20811891555786133, "learning_rate": 0.00015548924059398663, "loss": 1.2893, "step": 1567 }, { "epoch": 0.51, "grad_norm": 0.23298859596252441, "learning_rate": 0.00015533247222353992, "loss": 1.4122, "step": 1568 }, { "epoch": 0.51, "grad_norm": 0.29284557700157166, "learning_rate": 0.00015517569802094626, "loss": 1.3038, "step": 1569 }, { "epoch": 0.51, "grad_norm": 0.2354835867881775, "learning_rate": 0.00015501891815767035, "loss": 1.2306, "step": 1570 }, { "epoch": 0.51, "grad_norm": 0.21321597695350647, "learning_rate": 0.00015486213280518297, "loss": 0.7853, "step": 1571 }, { "epoch": 0.51, "grad_norm": 0.2908453047275543, "learning_rate": 0.00015470534213496092, "loss": 1.1391, "step": 1572 }, { "epoch": 0.51, "grad_norm": 0.2898194491863251, "learning_rate": 0.00015454854631848674, "loss": 1.4296, "step": 1573 }, { "epoch": 0.51, "grad_norm": 0.23081327974796295, "learning_rate": 0.00015439174552724875, "loss": 1.243, "step": 1574 }, { "epoch": 0.51, "grad_norm": 0.24647469818592072, "learning_rate": 0.00015423493993274053, "loss": 1.3486, "step": 1575 }, { "epoch": 0.51, "grad_norm": 0.23457692563533783, "learning_rate": 0.00015407812970646102, "loss": 1.488, "step": 1576 }, { "epoch": 0.51, "grad_norm": 0.2446373552083969, "learning_rate": 0.00015392131501991432, "loss": 1.2355, "step": 1577 }, { "epoch": 0.51, "grad_norm": 0.5035969614982605, "learning_rate": 0.00015376449604460914, "loss": 1.3593, "step": 1578 }, { "epoch": 0.51, "grad_norm": 0.22341127693653107, "learning_rate": 0.00015360767295205914, "loss": 1.2736, "step": 1579 }, { "epoch": 0.51, "grad_norm": 0.2362758368253708, "learning_rate": 0.0001534508459137824, "loss": 1.3737, "step": 1580 }, { "epoch": 0.51, "grad_norm": 0.2362406998872757, "learning_rate": 0.0001532940151013012, "loss": 1.41, "step": 1581 }, { "epoch": 0.51, "grad_norm": 0.2151055932044983, "learning_rate": 0.00015313718068614215, "loss": 1.2575, "step": 1582 }, { "epoch": 0.51, "grad_norm": 0.20800676941871643, "learning_rate": 0.0001529803428398356, "loss": 1.1801, "step": 1583 }, { "epoch": 0.51, "grad_norm": 0.22609196603298187, "learning_rate": 0.00015282350173391585, "loss": 1.2595, "step": 1584 }, { "epoch": 0.51, "grad_norm": 0.21742387115955353, "learning_rate": 0.00015266665753992057, "loss": 1.2135, "step": 1585 }, { "epoch": 0.51, "grad_norm": 0.23828984797000885, "learning_rate": 0.00015250981042939097, "loss": 1.2485, "step": 1586 }, { "epoch": 0.51, "grad_norm": 0.2674875259399414, "learning_rate": 0.0001523529605738714, "loss": 1.3524, "step": 1587 }, { "epoch": 0.51, "grad_norm": 0.22233335673809052, "learning_rate": 0.00015219610814490908, "loss": 0.9728, "step": 1588 }, { "epoch": 0.51, "grad_norm": 0.23566585779190063, "learning_rate": 0.00015203925331405422, "loss": 1.1415, "step": 1589 }, { "epoch": 0.51, "grad_norm": 0.26862964034080505, "learning_rate": 0.00015188239625285965, "loss": 1.4408, "step": 1590 }, { "epoch": 0.51, "grad_norm": 0.2085997462272644, "learning_rate": 0.0001517255371328805, "loss": 1.2891, "step": 1591 }, { "epoch": 0.51, "grad_norm": 0.3142246603965759, "learning_rate": 0.0001515686761256743, "loss": 1.3807, "step": 1592 }, { "epoch": 0.51, "grad_norm": 0.25033247470855713, "learning_rate": 0.00015141181340280058, "loss": 0.9247, "step": 1593 }, { "epoch": 0.51, "grad_norm": 0.23265966773033142, "learning_rate": 0.0001512549491358207, "loss": 1.2045, "step": 1594 }, { "epoch": 0.51, "grad_norm": 0.2903764247894287, "learning_rate": 0.00015109808349629772, "loss": 1.4293, "step": 1595 }, { "epoch": 0.51, "grad_norm": 0.24264168739318848, "learning_rate": 0.0001509412166557964, "loss": 1.2675, "step": 1596 }, { "epoch": 0.51, "grad_norm": 0.3101588785648346, "learning_rate": 0.00015078434878588247, "loss": 1.4482, "step": 1597 }, { "epoch": 0.51, "grad_norm": 0.2266916185617447, "learning_rate": 0.00015062748005812305, "loss": 0.9479, "step": 1598 }, { "epoch": 0.52, "grad_norm": 0.2518002688884735, "learning_rate": 0.00015047061064408612, "loss": 1.3155, "step": 1599 }, { "epoch": 0.52, "grad_norm": 0.3581833839416504, "learning_rate": 0.00015031374071534034, "loss": 1.2784, "step": 1600 }, { "epoch": 0.52, "grad_norm": 0.23671278357505798, "learning_rate": 0.0001501568704434551, "loss": 1.2288, "step": 1601 }, { "epoch": 0.52, "grad_norm": 0.2307826578617096, "learning_rate": 0.00015, "loss": 1.0576, "step": 1602 }, { "epoch": 0.52, "grad_norm": 0.24422898888587952, "learning_rate": 0.0001498431295565449, "loss": 1.1751, "step": 1603 }, { "epoch": 0.52, "grad_norm": 0.21578340232372284, "learning_rate": 0.0001496862592846596, "loss": 1.1042, "step": 1604 }, { "epoch": 0.52, "grad_norm": 0.26470884680747986, "learning_rate": 0.00014952938935591388, "loss": 1.3052, "step": 1605 }, { "epoch": 0.52, "grad_norm": 0.21441107988357544, "learning_rate": 0.00014937251994187695, "loss": 1.337, "step": 1606 }, { "epoch": 0.52, "grad_norm": 0.23484186828136444, "learning_rate": 0.00014921565121411753, "loss": 1.5276, "step": 1607 }, { "epoch": 0.52, "grad_norm": 0.24122141301631927, "learning_rate": 0.00014905878334420358, "loss": 1.0699, "step": 1608 }, { "epoch": 0.52, "grad_norm": 0.2634845972061157, "learning_rate": 0.00014890191650370223, "loss": 1.2858, "step": 1609 }, { "epoch": 0.52, "grad_norm": 0.220708429813385, "learning_rate": 0.0001487450508641793, "loss": 1.1062, "step": 1610 }, { "epoch": 0.52, "grad_norm": 0.21657653152942657, "learning_rate": 0.00014858818659719942, "loss": 1.0564, "step": 1611 }, { "epoch": 0.52, "grad_norm": 0.2031516581773758, "learning_rate": 0.00014843132387432566, "loss": 1.2523, "step": 1612 }, { "epoch": 0.52, "grad_norm": 0.229112908244133, "learning_rate": 0.00014827446286711944, "loss": 1.122, "step": 1613 }, { "epoch": 0.52, "grad_norm": 0.22326606512069702, "learning_rate": 0.00014811760374714033, "loss": 1.2725, "step": 1614 }, { "epoch": 0.52, "grad_norm": 0.2707209885120392, "learning_rate": 0.00014796074668594575, "loss": 1.2709, "step": 1615 }, { "epoch": 0.52, "grad_norm": 0.25618892908096313, "learning_rate": 0.00014780389185509093, "loss": 1.0952, "step": 1616 }, { "epoch": 0.52, "grad_norm": 0.23042134940624237, "learning_rate": 0.0001476470394261286, "loss": 1.1576, "step": 1617 }, { "epoch": 0.52, "grad_norm": 0.2500133216381073, "learning_rate": 0.00014749018957060898, "loss": 1.2692, "step": 1618 }, { "epoch": 0.52, "grad_norm": 0.24643288552761078, "learning_rate": 0.0001473333424600794, "loss": 1.2701, "step": 1619 }, { "epoch": 0.52, "grad_norm": 0.23762288689613342, "learning_rate": 0.00014717649826608416, "loss": 1.2737, "step": 1620 }, { "epoch": 0.52, "grad_norm": 0.23032434284687042, "learning_rate": 0.00014701965716016436, "loss": 1.194, "step": 1621 }, { "epoch": 0.52, "grad_norm": 0.24965626001358032, "learning_rate": 0.00014686281931385782, "loss": 1.3091, "step": 1622 }, { "epoch": 0.52, "grad_norm": 0.2108844518661499, "learning_rate": 0.00014670598489869877, "loss": 1.3138, "step": 1623 }, { "epoch": 0.52, "grad_norm": 0.21282514929771423, "learning_rate": 0.00014654915408621766, "loss": 1.1461, "step": 1624 }, { "epoch": 0.52, "grad_norm": 0.26730260252952576, "learning_rate": 0.00014639232704794083, "loss": 1.1659, "step": 1625 }, { "epoch": 0.52, "grad_norm": 0.2273942083120346, "learning_rate": 0.00014623550395539083, "loss": 1.2562, "step": 1626 }, { "epoch": 0.52, "grad_norm": 0.6414650082588196, "learning_rate": 0.00014607868498008568, "loss": 1.0068, "step": 1627 }, { "epoch": 0.52, "grad_norm": 0.2086189091205597, "learning_rate": 0.00014592187029353892, "loss": 1.2711, "step": 1628 }, { "epoch": 0.52, "grad_norm": 0.27094727754592896, "learning_rate": 0.0001457650600672595, "loss": 1.2043, "step": 1629 }, { "epoch": 0.53, "grad_norm": 0.20932742953300476, "learning_rate": 0.00014560825447275125, "loss": 1.215, "step": 1630 }, { "epoch": 0.53, "grad_norm": 0.23949886858463287, "learning_rate": 0.00014545145368151323, "loss": 1.314, "step": 1631 }, { "epoch": 0.53, "grad_norm": 0.21047930419445038, "learning_rate": 0.00014529465786503905, "loss": 1.0034, "step": 1632 }, { "epoch": 0.53, "grad_norm": 0.2940872311592102, "learning_rate": 0.000145137867194817, "loss": 1.3534, "step": 1633 }, { "epoch": 0.53, "grad_norm": 0.25560569763183594, "learning_rate": 0.00014498108184232965, "loss": 1.213, "step": 1634 }, { "epoch": 0.53, "grad_norm": 0.29463455080986023, "learning_rate": 0.00014482430197905374, "loss": 1.3477, "step": 1635 }, { "epoch": 0.53, "grad_norm": 0.2248086929321289, "learning_rate": 0.00014466752777646008, "loss": 1.2285, "step": 1636 }, { "epoch": 0.53, "grad_norm": 0.19034427404403687, "learning_rate": 0.0001445107594060133, "loss": 1.1274, "step": 1637 }, { "epoch": 0.53, "grad_norm": 0.23198747634887695, "learning_rate": 0.00014435399703917158, "loss": 0.9628, "step": 1638 }, { "epoch": 0.53, "grad_norm": 0.27859044075012207, "learning_rate": 0.00014419724084738665, "loss": 1.1166, "step": 1639 }, { "epoch": 0.53, "grad_norm": 0.21656042337417603, "learning_rate": 0.00014404049100210324, "loss": 1.1335, "step": 1640 }, { "epoch": 0.53, "grad_norm": 0.24224089086055756, "learning_rate": 0.00014388374767475938, "loss": 1.1167, "step": 1641 }, { "epoch": 0.53, "grad_norm": 0.241190567612648, "learning_rate": 0.00014372701103678592, "loss": 1.1057, "step": 1642 }, { "epoch": 0.53, "grad_norm": 0.24533629417419434, "learning_rate": 0.00014357028125960643, "loss": 1.1973, "step": 1643 }, { "epoch": 0.53, "grad_norm": 0.20219525694847107, "learning_rate": 0.00014341355851463675, "loss": 1.16, "step": 1644 }, { "epoch": 0.53, "grad_norm": 0.23816728591918945, "learning_rate": 0.0001432568429732853, "loss": 1.2762, "step": 1645 }, { "epoch": 0.53, "grad_norm": 0.25517550110816956, "learning_rate": 0.00014310013480695242, "loss": 1.1964, "step": 1646 }, { "epoch": 0.53, "grad_norm": 0.24166275560855865, "learning_rate": 0.00014294343418703063, "loss": 1.3106, "step": 1647 }, { "epoch": 0.53, "grad_norm": 0.2412921041250229, "learning_rate": 0.00014278674128490398, "loss": 1.3653, "step": 1648 }, { "epoch": 0.53, "grad_norm": 0.23236589133739471, "learning_rate": 0.00014263005627194807, "loss": 1.0778, "step": 1649 }, { "epoch": 0.53, "grad_norm": 0.19976171851158142, "learning_rate": 0.00014247337931953003, "loss": 1.0926, "step": 1650 }, { "epoch": 0.53, "grad_norm": 0.2190147340297699, "learning_rate": 0.000142316710599008, "loss": 0.9636, "step": 1651 }, { "epoch": 0.53, "grad_norm": 0.20461887121200562, "learning_rate": 0.0001421600502817313, "loss": 1.135, "step": 1652 }, { "epoch": 0.53, "grad_norm": 0.24712473154067993, "learning_rate": 0.00014200339853904, "loss": 1.2034, "step": 1653 }, { "epoch": 0.53, "grad_norm": 0.2302912473678589, "learning_rate": 0.00014184675554226457, "loss": 1.2554, "step": 1654 }, { "epoch": 0.53, "grad_norm": 0.3030833601951599, "learning_rate": 0.00014169012146272628, "loss": 1.3606, "step": 1655 }, { "epoch": 0.53, "grad_norm": 0.23465321958065033, "learning_rate": 0.00014153349647173636, "loss": 1.3024, "step": 1656 }, { "epoch": 0.53, "grad_norm": 0.22444435954093933, "learning_rate": 0.00014137688074059625, "loss": 1.2704, "step": 1657 }, { "epoch": 0.53, "grad_norm": 0.21859301626682281, "learning_rate": 0.00014122027444059732, "loss": 1.3748, "step": 1658 }, { "epoch": 0.53, "grad_norm": 0.2137126326560974, "learning_rate": 0.00014106367774302034, "loss": 1.1443, "step": 1659 }, { "epoch": 0.53, "grad_norm": 0.23214185237884521, "learning_rate": 0.00014090709081913586, "loss": 1.3256, "step": 1660 }, { "epoch": 0.53, "grad_norm": 0.2891673445701599, "learning_rate": 0.00014075051384020366, "loss": 1.1028, "step": 1661 }, { "epoch": 0.54, "grad_norm": 0.2229388803243637, "learning_rate": 0.00014059394697747254, "loss": 1.3893, "step": 1662 }, { "epoch": 0.54, "grad_norm": 0.19649145007133484, "learning_rate": 0.0001404373904021805, "loss": 1.2502, "step": 1663 }, { "epoch": 0.54, "grad_norm": 0.23898644745349884, "learning_rate": 0.00014028084428555396, "loss": 1.3535, "step": 1664 }, { "epoch": 0.54, "grad_norm": 0.24631142616271973, "learning_rate": 0.00014012430879880807, "loss": 1.1548, "step": 1665 }, { "epoch": 0.54, "grad_norm": 0.2399219423532486, "learning_rate": 0.00013996778411314636, "loss": 1.3171, "step": 1666 }, { "epoch": 0.54, "grad_norm": 0.27015888690948486, "learning_rate": 0.00013981127039976057, "loss": 1.0438, "step": 1667 }, { "epoch": 0.54, "grad_norm": 0.2660621702671051, "learning_rate": 0.00013965476782983028, "loss": 1.3493, "step": 1668 }, { "epoch": 0.54, "grad_norm": 0.30424585938453674, "learning_rate": 0.0001394982765745231, "loss": 1.1947, "step": 1669 }, { "epoch": 0.54, "grad_norm": 0.3218717873096466, "learning_rate": 0.00013934179680499413, "loss": 1.3296, "step": 1670 }, { "epoch": 0.54, "grad_norm": 0.2257612943649292, "learning_rate": 0.0001391853286923859, "loss": 1.166, "step": 1671 }, { "epoch": 0.54, "grad_norm": 0.2217746526002884, "learning_rate": 0.00013902887240782832, "loss": 1.1306, "step": 1672 }, { "epoch": 0.54, "grad_norm": 0.44517597556114197, "learning_rate": 0.00013887242812243817, "loss": 1.2912, "step": 1673 }, { "epoch": 0.54, "grad_norm": 0.17705735564231873, "learning_rate": 0.00013871599600731924, "loss": 1.1194, "step": 1674 }, { "epoch": 0.54, "grad_norm": 0.2050253450870514, "learning_rate": 0.00013855957623356205, "loss": 0.9663, "step": 1675 }, { "epoch": 0.54, "grad_norm": 0.26876187324523926, "learning_rate": 0.00013840316897224345, "loss": 1.2683, "step": 1676 }, { "epoch": 0.54, "grad_norm": 0.23476672172546387, "learning_rate": 0.0001382467743944268, "loss": 1.3261, "step": 1677 }, { "epoch": 0.54, "grad_norm": 0.21701236069202423, "learning_rate": 0.00013809039267116143, "loss": 1.1567, "step": 1678 }, { "epoch": 0.54, "grad_norm": 0.222859725356102, "learning_rate": 0.0001379340239734827, "loss": 1.0252, "step": 1679 }, { "epoch": 0.54, "grad_norm": 0.22152623534202576, "learning_rate": 0.00013777766847241168, "loss": 1.2364, "step": 1680 }, { "epoch": 0.54, "grad_norm": 0.23724286258220673, "learning_rate": 0.000137621326338955, "loss": 0.9603, "step": 1681 }, { "epoch": 0.54, "grad_norm": 0.23205243051052094, "learning_rate": 0.00013746499774410482, "loss": 0.9519, "step": 1682 }, { "epoch": 0.54, "grad_norm": 0.2581680417060852, "learning_rate": 0.00013730868285883823, "loss": 1.4469, "step": 1683 }, { "epoch": 0.54, "grad_norm": 0.26698341965675354, "learning_rate": 0.00013715238185411753, "loss": 1.5174, "step": 1684 }, { "epoch": 0.54, "grad_norm": 0.3794737458229065, "learning_rate": 0.00013699609490088976, "loss": 1.2785, "step": 1685 }, { "epoch": 0.54, "grad_norm": 0.2057531625032425, "learning_rate": 0.0001368398221700866, "loss": 0.9671, "step": 1686 }, { "epoch": 0.54, "grad_norm": 0.5364126563072205, "learning_rate": 0.00013668356383262425, "loss": 1.2572, "step": 1687 }, { "epoch": 0.54, "grad_norm": 0.2504572570323944, "learning_rate": 0.00013652732005940295, "loss": 1.1886, "step": 1688 }, { "epoch": 0.54, "grad_norm": 0.23494388163089752, "learning_rate": 0.00013637109102130728, "loss": 1.3303, "step": 1689 }, { "epoch": 0.54, "grad_norm": 0.23413904011249542, "learning_rate": 0.00013621487688920552, "loss": 1.2055, "step": 1690 }, { "epoch": 0.54, "grad_norm": 0.24939773976802826, "learning_rate": 0.00013605867783394974, "loss": 1.1723, "step": 1691 }, { "epoch": 0.54, "grad_norm": 0.38203689455986023, "learning_rate": 0.00013590249402637555, "loss": 1.1396, "step": 1692 }, { "epoch": 0.55, "grad_norm": 0.23382645845413208, "learning_rate": 0.0001357463256373017, "loss": 1.42, "step": 1693 }, { "epoch": 0.55, "grad_norm": 0.3202337324619293, "learning_rate": 0.00013559017283753028, "loss": 1.0866, "step": 1694 }, { "epoch": 0.55, "grad_norm": 0.2544618248939514, "learning_rate": 0.00013543403579784622, "loss": 1.1311, "step": 1695 }, { "epoch": 0.55, "grad_norm": 0.22361619770526886, "learning_rate": 0.0001352779146890173, "loss": 1.2246, "step": 1696 }, { "epoch": 0.55, "grad_norm": 0.2234838902950287, "learning_rate": 0.00013512180968179374, "loss": 1.1079, "step": 1697 }, { "epoch": 0.55, "grad_norm": 0.20235858857631683, "learning_rate": 0.00013496572094690826, "loss": 1.3106, "step": 1698 }, { "epoch": 0.55, "grad_norm": 0.22292087972164154, "learning_rate": 0.00013480964865507578, "loss": 1.2049, "step": 1699 }, { "epoch": 0.55, "grad_norm": 0.2936997413635254, "learning_rate": 0.00013465359297699318, "loss": 1.3757, "step": 1700 }, { "epoch": 0.55, "grad_norm": 0.21486248075962067, "learning_rate": 0.00013449755408333924, "loss": 1.3829, "step": 1701 }, { "epoch": 0.55, "grad_norm": 0.28266215324401855, "learning_rate": 0.00013434153214477427, "loss": 1.2073, "step": 1702 }, { "epoch": 0.55, "grad_norm": 0.22703468799591064, "learning_rate": 0.00013418552733194018, "loss": 1.3674, "step": 1703 }, { "epoch": 0.55, "grad_norm": 0.20287437736988068, "learning_rate": 0.00013402953981546, "loss": 1.3195, "step": 1704 }, { "epoch": 0.55, "grad_norm": 0.20729462802410126, "learning_rate": 0.00013387356976593797, "loss": 1.1685, "step": 1705 }, { "epoch": 0.55, "grad_norm": 0.20638199150562286, "learning_rate": 0.0001337176173539592, "loss": 1.1365, "step": 1706 }, { "epoch": 0.55, "grad_norm": 0.25338494777679443, "learning_rate": 0.00013356168275008946, "loss": 0.9568, "step": 1707 }, { "epoch": 0.55, "grad_norm": 0.22763125598430634, "learning_rate": 0.00013340576612487503, "loss": 1.0572, "step": 1708 }, { "epoch": 0.55, "grad_norm": 0.22499041259288788, "learning_rate": 0.0001332498676488426, "loss": 1.245, "step": 1709 }, { "epoch": 0.55, "grad_norm": 0.22621946036815643, "learning_rate": 0.00013309398749249902, "loss": 0.949, "step": 1710 }, { "epoch": 0.55, "grad_norm": 0.4430404305458069, "learning_rate": 0.00013293812582633102, "loss": 1.1371, "step": 1711 }, { "epoch": 0.55, "grad_norm": 0.20326457917690277, "learning_rate": 0.0001327822828208051, "loss": 1.3171, "step": 1712 }, { "epoch": 0.55, "grad_norm": 0.20981870591640472, "learning_rate": 0.00013262645864636744, "loss": 1.3614, "step": 1713 }, { "epoch": 0.55, "grad_norm": 0.23312869668006897, "learning_rate": 0.00013247065347344358, "loss": 1.2791, "step": 1714 }, { "epoch": 0.55, "grad_norm": 0.23025205731391907, "learning_rate": 0.0001323148674724383, "loss": 1.169, "step": 1715 }, { "epoch": 0.55, "grad_norm": 0.2391413450241089, "learning_rate": 0.00013215910081373542, "loss": 1.2416, "step": 1716 }, { "epoch": 0.55, "grad_norm": 0.24452711641788483, "learning_rate": 0.00013200335366769745, "loss": 1.5071, "step": 1717 }, { "epoch": 0.55, "grad_norm": 0.21766050159931183, "learning_rate": 0.0001318476262046658, "loss": 1.3096, "step": 1718 }, { "epoch": 0.55, "grad_norm": 0.24307939410209656, "learning_rate": 0.00013169191859496019, "loss": 1.3571, "step": 1719 }, { "epoch": 0.55, "grad_norm": 0.21929921209812164, "learning_rate": 0.00013153623100887877, "loss": 1.3, "step": 1720 }, { "epoch": 0.55, "grad_norm": 0.22726953029632568, "learning_rate": 0.00013138056361669754, "loss": 1.1649, "step": 1721 }, { "epoch": 0.55, "grad_norm": 0.2119276374578476, "learning_rate": 0.00013122491658867062, "loss": 1.1427, "step": 1722 }, { "epoch": 0.55, "grad_norm": 0.2069201022386551, "learning_rate": 0.00013106929009502983, "loss": 1.4009, "step": 1723 }, { "epoch": 0.56, "grad_norm": 0.19603683054447174, "learning_rate": 0.00013091368430598454, "loss": 1.1742, "step": 1724 }, { "epoch": 0.56, "grad_norm": 0.23030413687229156, "learning_rate": 0.00013075809939172147, "loss": 0.9885, "step": 1725 }, { "epoch": 0.56, "grad_norm": 0.201499804854393, "learning_rate": 0.00013060253552240434, "loss": 1.1544, "step": 1726 }, { "epoch": 0.56, "grad_norm": 0.22075827419757843, "learning_rate": 0.0001304469928681741, "loss": 1.4398, "step": 1727 }, { "epoch": 0.56, "grad_norm": 0.229030579328537, "learning_rate": 0.0001302914715991483, "loss": 1.3727, "step": 1728 }, { "epoch": 0.56, "grad_norm": 0.38695430755615234, "learning_rate": 0.00013013597188542134, "loss": 1.2257, "step": 1729 }, { "epoch": 0.56, "grad_norm": 0.44165509939193726, "learning_rate": 0.00012998049389706387, "loss": 0.9639, "step": 1730 }, { "epoch": 0.56, "grad_norm": 0.2776918113231659, "learning_rate": 0.00012982503780412268, "loss": 1.1116, "step": 1731 }, { "epoch": 0.56, "grad_norm": 0.21218807995319366, "learning_rate": 0.00012966960377662083, "loss": 1.379, "step": 1732 }, { "epoch": 0.56, "grad_norm": 0.18688738346099854, "learning_rate": 0.0001295141919845571, "loss": 1.188, "step": 1733 }, { "epoch": 0.56, "grad_norm": 0.2611802816390991, "learning_rate": 0.0001293588025979061, "loss": 1.3064, "step": 1734 }, { "epoch": 0.56, "grad_norm": 0.21465393900871277, "learning_rate": 0.00012920343578661785, "loss": 1.0721, "step": 1735 }, { "epoch": 0.56, "grad_norm": 0.21907645463943481, "learning_rate": 0.00012904809172061755, "loss": 1.3437, "step": 1736 }, { "epoch": 0.56, "grad_norm": 0.25810706615448, "learning_rate": 0.00012889277056980572, "loss": 1.1781, "step": 1737 }, { "epoch": 0.56, "grad_norm": 0.22493663430213928, "learning_rate": 0.0001287374725040577, "loss": 1.2993, "step": 1738 }, { "epoch": 0.56, "grad_norm": 0.2009238749742508, "learning_rate": 0.00012858219769322366, "loss": 1.2012, "step": 1739 }, { "epoch": 0.56, "grad_norm": 0.22021310031414032, "learning_rate": 0.0001284269463071284, "loss": 1.3962, "step": 1740 }, { "epoch": 0.56, "grad_norm": 0.2546767294406891, "learning_rate": 0.0001282717185155708, "loss": 1.0887, "step": 1741 }, { "epoch": 0.56, "grad_norm": 0.28939446806907654, "learning_rate": 0.0001281165144883243, "loss": 1.4368, "step": 1742 }, { "epoch": 0.56, "grad_norm": 0.21344709396362305, "learning_rate": 0.00012796133439513607, "loss": 1.2574, "step": 1743 }, { "epoch": 0.56, "grad_norm": 0.2045958787202835, "learning_rate": 0.00012780617840572723, "loss": 1.2755, "step": 1744 }, { "epoch": 0.56, "grad_norm": 0.21394963562488556, "learning_rate": 0.00012765104668979269, "loss": 1.3204, "step": 1745 }, { "epoch": 0.56, "grad_norm": 0.21434995532035828, "learning_rate": 0.00012749593941700045, "loss": 1.2884, "step": 1746 }, { "epoch": 0.56, "grad_norm": 0.267566055059433, "learning_rate": 0.00012734085675699205, "loss": 1.4186, "step": 1747 }, { "epoch": 0.56, "grad_norm": 0.18060138821601868, "learning_rate": 0.000127185798879382, "loss": 0.9666, "step": 1748 }, { "epoch": 0.56, "grad_norm": 0.3071848452091217, "learning_rate": 0.0001270307659537578, "loss": 1.2533, "step": 1749 }, { "epoch": 0.56, "grad_norm": 0.22055885195732117, "learning_rate": 0.00012687575814967957, "loss": 1.0328, "step": 1750 }, { "epoch": 0.56, "grad_norm": 0.19806601107120514, "learning_rate": 0.00012672077563667997, "loss": 1.2654, "step": 1751 }, { "epoch": 0.56, "grad_norm": 0.22993607819080353, "learning_rate": 0.00012656581858426404, "loss": 1.3918, "step": 1752 }, { "epoch": 0.56, "grad_norm": 0.2341468334197998, "learning_rate": 0.00012641088716190894, "loss": 1.3506, "step": 1753 }, { "epoch": 0.56, "grad_norm": 0.32777896523475647, "learning_rate": 0.0001262559815390638, "loss": 1.1757, "step": 1754 }, { "epoch": 0.57, "grad_norm": 0.26919037103652954, "learning_rate": 0.0001261011018851495, "loss": 1.2344, "step": 1755 }, { "epoch": 0.57, "grad_norm": 0.21598467230796814, "learning_rate": 0.00012594624836955863, "loss": 1.3741, "step": 1756 }, { "epoch": 0.57, "grad_norm": 0.22658003866672516, "learning_rate": 0.00012579142116165505, "loss": 1.2902, "step": 1757 }, { "epoch": 0.57, "grad_norm": 0.23922444880008698, "learning_rate": 0.00012563662043077396, "loss": 1.35, "step": 1758 }, { "epoch": 0.57, "grad_norm": 0.2366904467344284, "learning_rate": 0.00012548184634622158, "loss": 1.0372, "step": 1759 }, { "epoch": 0.57, "grad_norm": 0.18250064551830292, "learning_rate": 0.0001253270990772749, "loss": 1.2982, "step": 1760 }, { "epoch": 0.57, "grad_norm": 0.20805002748966217, "learning_rate": 0.0001251723787931817, "loss": 1.0807, "step": 1761 }, { "epoch": 0.57, "grad_norm": 0.2013995349407196, "learning_rate": 0.00012501768566316012, "loss": 1.265, "step": 1762 }, { "epoch": 0.57, "grad_norm": 0.2193235158920288, "learning_rate": 0.00012486301985639874, "loss": 1.3376, "step": 1763 }, { "epoch": 0.57, "grad_norm": 0.2166881114244461, "learning_rate": 0.0001247083815420562, "loss": 1.037, "step": 1764 }, { "epoch": 0.57, "grad_norm": 0.22316882014274597, "learning_rate": 0.00012455377088926102, "loss": 1.4331, "step": 1765 }, { "epoch": 0.57, "grad_norm": 0.20598086714744568, "learning_rate": 0.00012439918806711153, "loss": 1.3929, "step": 1766 }, { "epoch": 0.57, "grad_norm": 0.23496967554092407, "learning_rate": 0.00012424463324467558, "loss": 1.4753, "step": 1767 }, { "epoch": 0.57, "grad_norm": 0.22421838343143463, "learning_rate": 0.00012409010659099041, "loss": 1.0245, "step": 1768 }, { "epoch": 0.57, "grad_norm": 0.22015327215194702, "learning_rate": 0.00012393560827506252, "loss": 1.3246, "step": 1769 }, { "epoch": 0.57, "grad_norm": 0.21748648583889008, "learning_rate": 0.0001237811384658673, "loss": 1.2644, "step": 1770 }, { "epoch": 0.57, "grad_norm": 0.22206948697566986, "learning_rate": 0.000123626697332349, "loss": 1.2515, "step": 1771 }, { "epoch": 0.57, "grad_norm": 0.21728329360485077, "learning_rate": 0.00012347228504342055, "loss": 1.4214, "step": 1772 }, { "epoch": 0.57, "grad_norm": 0.21426033973693848, "learning_rate": 0.00012331790176796337, "loss": 1.3781, "step": 1773 }, { "epoch": 0.57, "grad_norm": 0.2122887670993805, "learning_rate": 0.00012316354767482698, "loss": 1.073, "step": 1774 }, { "epoch": 0.57, "grad_norm": 0.22110514342784882, "learning_rate": 0.00012300922293282917, "loss": 1.4026, "step": 1775 }, { "epoch": 0.57, "grad_norm": 0.22227083146572113, "learning_rate": 0.00012285492771075545, "loss": 1.3836, "step": 1776 }, { "epoch": 0.57, "grad_norm": 0.23720119893550873, "learning_rate": 0.00012270066217735925, "loss": 1.413, "step": 1777 }, { "epoch": 0.57, "grad_norm": 0.1991962492465973, "learning_rate": 0.00012254642650136137, "loss": 1.2751, "step": 1778 }, { "epoch": 0.57, "grad_norm": 0.23723700642585754, "learning_rate": 0.00012239222085145, "loss": 0.9677, "step": 1779 }, { "epoch": 0.57, "grad_norm": 0.23922596871852875, "learning_rate": 0.00012223804539628048, "loss": 1.2508, "step": 1780 }, { "epoch": 0.57, "grad_norm": 0.20311285555362701, "learning_rate": 0.00012208390030447517, "loss": 1.2748, "step": 1781 }, { "epoch": 0.57, "grad_norm": 0.22811037302017212, "learning_rate": 0.00012192978574462318, "loss": 1.0314, "step": 1782 }, { "epoch": 0.57, "grad_norm": 0.20153997838497162, "learning_rate": 0.00012177570188528028, "loss": 1.3408, "step": 1783 }, { "epoch": 0.57, "grad_norm": 0.2226332575082779, "learning_rate": 0.00012162164889496855, "loss": 1.3224, "step": 1784 }, { "epoch": 0.57, "grad_norm": 0.23445908725261688, "learning_rate": 0.00012146762694217642, "loss": 1.4494, "step": 1785 }, { "epoch": 0.58, "grad_norm": 0.21297618746757507, "learning_rate": 0.00012131363619535834, "loss": 1.1832, "step": 1786 }, { "epoch": 0.58, "grad_norm": 0.19945524632930756, "learning_rate": 0.0001211596768229346, "loss": 1.2869, "step": 1787 }, { "epoch": 0.58, "grad_norm": 0.23487547039985657, "learning_rate": 0.00012100574899329124, "loss": 1.4144, "step": 1788 }, { "epoch": 0.58, "grad_norm": 0.2405741959810257, "learning_rate": 0.00012085185287477971, "loss": 1.0906, "step": 1789 }, { "epoch": 0.58, "grad_norm": 0.2534414529800415, "learning_rate": 0.00012069798863571687, "loss": 1.1246, "step": 1790 }, { "epoch": 0.58, "grad_norm": 0.20556822419166565, "learning_rate": 0.00012054415644438464, "loss": 1.3959, "step": 1791 }, { "epoch": 0.58, "grad_norm": 0.22653372585773468, "learning_rate": 0.00012039035646902996, "loss": 1.2019, "step": 1792 }, { "epoch": 0.58, "grad_norm": 0.24736931920051575, "learning_rate": 0.00012023658887786451, "loss": 1.1281, "step": 1793 }, { "epoch": 0.58, "grad_norm": 0.20384804904460907, "learning_rate": 0.00012008285383906447, "loss": 1.0493, "step": 1794 }, { "epoch": 0.58, "grad_norm": 0.21456141769886017, "learning_rate": 0.00011992915152077052, "loss": 1.463, "step": 1795 }, { "epoch": 0.58, "grad_norm": 0.2617669105529785, "learning_rate": 0.00011977548209108752, "loss": 0.9729, "step": 1796 }, { "epoch": 0.58, "grad_norm": 0.24370965361595154, "learning_rate": 0.00011962184571808443, "loss": 1.2921, "step": 1797 }, { "epoch": 0.58, "grad_norm": 0.22423836588859558, "learning_rate": 0.00011946824256979384, "loss": 1.1939, "step": 1798 }, { "epoch": 0.58, "grad_norm": 0.2696157693862915, "learning_rate": 0.00011931467281421221, "loss": 1.1849, "step": 1799 }, { "epoch": 0.58, "grad_norm": 0.26216816902160645, "learning_rate": 0.00011916113661929943, "loss": 1.2368, "step": 1800 }, { "epoch": 0.58, "grad_norm": 0.27551862597465515, "learning_rate": 0.0001190076341529787, "loss": 1.114, "step": 1801 }, { "epoch": 0.58, "grad_norm": 0.23337715864181519, "learning_rate": 0.0001188541655831363, "loss": 1.1294, "step": 1802 }, { "epoch": 0.58, "grad_norm": 0.19856880605220795, "learning_rate": 0.00011870073107762135, "loss": 1.0991, "step": 1803 }, { "epoch": 0.58, "grad_norm": 0.25236210227012634, "learning_rate": 0.00011854733080424583, "loss": 1.3738, "step": 1804 }, { "epoch": 0.58, "grad_norm": 0.49598488211631775, "learning_rate": 0.0001183939649307843, "loss": 1.15, "step": 1805 }, { "epoch": 0.58, "grad_norm": 0.2416837066411972, "learning_rate": 0.00011824063362497364, "loss": 1.0956, "step": 1806 }, { "epoch": 0.58, "grad_norm": 0.24185633659362793, "learning_rate": 0.00011808733705451296, "loss": 1.1182, "step": 1807 }, { "epoch": 0.58, "grad_norm": 0.21964658796787262, "learning_rate": 0.00011793407538706324, "loss": 1.3242, "step": 1808 }, { "epoch": 0.58, "grad_norm": 0.3176639974117279, "learning_rate": 0.00011778084879024743, "loss": 1.2038, "step": 1809 }, { "epoch": 0.58, "grad_norm": 0.21960467100143433, "learning_rate": 0.00011762765743165018, "loss": 1.2613, "step": 1810 }, { "epoch": 0.58, "grad_norm": 0.2215711623430252, "learning_rate": 0.00011747450147881744, "loss": 1.2924, "step": 1811 }, { "epoch": 0.58, "grad_norm": 0.23576363921165466, "learning_rate": 0.00011732138109925659, "loss": 1.2397, "step": 1812 }, { "epoch": 0.58, "grad_norm": 0.25986191630363464, "learning_rate": 0.00011716829646043589, "loss": 1.2499, "step": 1813 }, { "epoch": 0.58, "grad_norm": 0.2579520344734192, "learning_rate": 0.00011701524772978467, "loss": 1.4488, "step": 1814 }, { "epoch": 0.58, "grad_norm": 0.21521271765232086, "learning_rate": 0.000116862235074693, "loss": 1.5131, "step": 1815 }, { "epoch": 0.58, "grad_norm": 0.2142353504896164, "learning_rate": 0.00011670925866251146, "loss": 1.1715, "step": 1816 }, { "epoch": 0.59, "grad_norm": 0.21959179639816284, "learning_rate": 0.00011655631866055103, "loss": 1.2725, "step": 1817 }, { "epoch": 0.59, "grad_norm": 0.2995471954345703, "learning_rate": 0.0001164034152360827, "loss": 1.2608, "step": 1818 }, { "epoch": 0.59, "grad_norm": 0.2581825256347656, "learning_rate": 0.00011625054855633761, "loss": 1.2283, "step": 1819 }, { "epoch": 0.59, "grad_norm": 0.20214980840682983, "learning_rate": 0.00011609771878850668, "loss": 1.1517, "step": 1820 }, { "epoch": 0.59, "grad_norm": 0.20980459451675415, "learning_rate": 0.00011594492609974051, "loss": 1.1786, "step": 1821 }, { "epoch": 0.59, "grad_norm": 0.22752881050109863, "learning_rate": 0.00011579217065714912, "loss": 1.1326, "step": 1822 }, { "epoch": 0.59, "grad_norm": 0.2721670866012573, "learning_rate": 0.00011563945262780165, "loss": 1.4103, "step": 1823 }, { "epoch": 0.59, "grad_norm": 0.21241535246372223, "learning_rate": 0.00011548677217872649, "loss": 1.1984, "step": 1824 }, { "epoch": 0.59, "grad_norm": 0.24626873433589935, "learning_rate": 0.00011533412947691085, "loss": 1.2851, "step": 1825 }, { "epoch": 0.59, "grad_norm": 0.23120957612991333, "learning_rate": 0.0001151815246893008, "loss": 1.3263, "step": 1826 }, { "epoch": 0.59, "grad_norm": 0.23121647536754608, "learning_rate": 0.0001150289579828007, "loss": 1.2497, "step": 1827 }, { "epoch": 0.59, "grad_norm": 0.2209133803844452, "learning_rate": 0.00011487642952427341, "loss": 1.4821, "step": 1828 }, { "epoch": 0.59, "grad_norm": 0.19246645271778107, "learning_rate": 0.00011472393948053996, "loss": 1.1959, "step": 1829 }, { "epoch": 0.59, "grad_norm": 0.24106065928936005, "learning_rate": 0.00011457148801837933, "loss": 1.4708, "step": 1830 }, { "epoch": 0.59, "grad_norm": 0.20953597128391266, "learning_rate": 0.00011441907530452832, "loss": 1.0556, "step": 1831 }, { "epoch": 0.59, "grad_norm": 0.21197937428951263, "learning_rate": 0.00011426670150568132, "loss": 1.3906, "step": 1832 }, { "epoch": 0.59, "grad_norm": 0.20220711827278137, "learning_rate": 0.0001141143667884902, "loss": 1.2598, "step": 1833 }, { "epoch": 0.59, "grad_norm": 0.21943514049053192, "learning_rate": 0.00011396207131956407, "loss": 1.239, "step": 1834 }, { "epoch": 0.59, "grad_norm": 0.22879134118556976, "learning_rate": 0.00011380981526546909, "loss": 1.2377, "step": 1835 }, { "epoch": 0.59, "grad_norm": 0.21351803839206696, "learning_rate": 0.0001136575987927284, "loss": 1.23, "step": 1836 }, { "epoch": 0.59, "grad_norm": 0.22202856838703156, "learning_rate": 0.00011350542206782168, "loss": 1.1324, "step": 1837 }, { "epoch": 0.59, "grad_norm": 0.22175559401512146, "learning_rate": 0.00011335328525718533, "loss": 1.2625, "step": 1838 }, { "epoch": 0.59, "grad_norm": 0.2328222095966339, "learning_rate": 0.00011320118852721194, "loss": 1.089, "step": 1839 }, { "epoch": 0.59, "grad_norm": 0.2044118344783783, "learning_rate": 0.00011304913204425039, "loss": 1.3283, "step": 1840 }, { "epoch": 0.59, "grad_norm": 0.2801876962184906, "learning_rate": 0.00011289711597460548, "loss": 1.2036, "step": 1841 }, { "epoch": 0.59, "grad_norm": 0.21582306921482086, "learning_rate": 0.00011274514048453775, "loss": 1.2389, "step": 1842 }, { "epoch": 0.59, "grad_norm": 0.22457802295684814, "learning_rate": 0.00011259320574026346, "loss": 1.2925, "step": 1843 }, { "epoch": 0.59, "grad_norm": 0.2044416069984436, "learning_rate": 0.00011244131190795427, "loss": 1.0156, "step": 1844 }, { "epoch": 0.59, "grad_norm": 0.23998640477657318, "learning_rate": 0.0001122894591537371, "loss": 1.3934, "step": 1845 }, { "epoch": 0.59, "grad_norm": 0.20840789377689362, "learning_rate": 0.00011213764764369392, "loss": 1.2729, "step": 1846 }, { "epoch": 0.59, "grad_norm": 0.2587125897407532, "learning_rate": 0.00011198587754386158, "loss": 1.1647, "step": 1847 }, { "epoch": 0.6, "grad_norm": 0.203425794839859, "learning_rate": 0.00011183414902023172, "loss": 1.0706, "step": 1848 }, { "epoch": 0.6, "grad_norm": 0.22708185017108917, "learning_rate": 0.00011168246223875037, "loss": 1.1797, "step": 1849 }, { "epoch": 0.6, "grad_norm": 0.24745896458625793, "learning_rate": 0.00011153081736531809, "loss": 1.4414, "step": 1850 }, { "epoch": 0.6, "grad_norm": 0.2021932750940323, "learning_rate": 0.00011137921456578944, "loss": 1.3088, "step": 1851 }, { "epoch": 0.6, "grad_norm": 0.2612607181072235, "learning_rate": 0.00011122765400597302, "loss": 1.2349, "step": 1852 }, { "epoch": 0.6, "grad_norm": 0.24339541792869568, "learning_rate": 0.00011107613585163125, "loss": 1.0177, "step": 1853 }, { "epoch": 0.6, "grad_norm": 0.21897396445274353, "learning_rate": 0.00011092466026848016, "loss": 1.126, "step": 1854 }, { "epoch": 0.6, "grad_norm": 0.3887563645839691, "learning_rate": 0.00011077322742218928, "loss": 1.4, "step": 1855 }, { "epoch": 0.6, "grad_norm": 0.21683749556541443, "learning_rate": 0.00011062183747838124, "loss": 1.339, "step": 1856 }, { "epoch": 0.6, "grad_norm": 0.2909065783023834, "learning_rate": 0.00011047049060263189, "loss": 0.9296, "step": 1857 }, { "epoch": 0.6, "grad_norm": 0.2707578241825104, "learning_rate": 0.00011031918696046993, "loss": 1.1381, "step": 1858 }, { "epoch": 0.6, "grad_norm": 0.284667044878006, "learning_rate": 0.00011016792671737678, "loss": 1.382, "step": 1859 }, { "epoch": 0.6, "grad_norm": 0.21222124993801117, "learning_rate": 0.00011001671003878643, "loss": 1.2128, "step": 1860 }, { "epoch": 0.6, "grad_norm": 0.20484209060668945, "learning_rate": 0.0001098655370900851, "loss": 1.0611, "step": 1861 }, { "epoch": 0.6, "grad_norm": 0.21300098299980164, "learning_rate": 0.00010971440803661132, "loss": 1.1812, "step": 1862 }, { "epoch": 0.6, "grad_norm": 0.23233044147491455, "learning_rate": 0.00010956332304365555, "loss": 1.1084, "step": 1863 }, { "epoch": 0.6, "grad_norm": 0.20488545298576355, "learning_rate": 0.00010941228227646009, "loss": 1.0721, "step": 1864 }, { "epoch": 0.6, "grad_norm": 0.21013228595256805, "learning_rate": 0.00010926128590021883, "loss": 1.05, "step": 1865 }, { "epoch": 0.6, "grad_norm": 0.21118295192718506, "learning_rate": 0.00010911033408007714, "loss": 1.2836, "step": 1866 }, { "epoch": 0.6, "grad_norm": 0.21325752139091492, "learning_rate": 0.00010895942698113159, "loss": 1.2775, "step": 1867 }, { "epoch": 0.6, "grad_norm": 0.251006156206131, "learning_rate": 0.00010880856476842997, "loss": 0.9895, "step": 1868 }, { "epoch": 0.6, "grad_norm": 0.22415049374103546, "learning_rate": 0.00010865774760697088, "loss": 1.3555, "step": 1869 }, { "epoch": 0.6, "grad_norm": 0.2364586591720581, "learning_rate": 0.0001085069756617037, "loss": 1.4023, "step": 1870 }, { "epoch": 0.6, "grad_norm": 0.19929179549217224, "learning_rate": 0.00010835624909752825, "loss": 1.1431, "step": 1871 }, { "epoch": 0.6, "grad_norm": 0.22072359919548035, "learning_rate": 0.00010820556807929487, "loss": 1.2197, "step": 1872 }, { "epoch": 0.6, "grad_norm": 0.22425276041030884, "learning_rate": 0.00010805493277180398, "loss": 1.2691, "step": 1873 }, { "epoch": 0.6, "grad_norm": 0.3481486439704895, "learning_rate": 0.00010790434333980608, "loss": 1.172, "step": 1874 }, { "epoch": 0.6, "grad_norm": 0.22306928038597107, "learning_rate": 0.00010775379994800131, "loss": 1.2418, "step": 1875 }, { "epoch": 0.6, "grad_norm": 0.2636454105377197, "learning_rate": 0.00010760330276103976, "loss": 1.1361, "step": 1876 }, { "epoch": 0.6, "grad_norm": 0.20511306822299957, "learning_rate": 0.00010745285194352076, "loss": 1.3113, "step": 1877 }, { "epoch": 0.6, "grad_norm": 0.22308415174484253, "learning_rate": 0.00010730244765999299, "loss": 1.1879, "step": 1878 }, { "epoch": 0.61, "grad_norm": 0.20552557706832886, "learning_rate": 0.00010715209007495428, "loss": 1.1463, "step": 1879 }, { "epoch": 0.61, "grad_norm": 0.19496938586235046, "learning_rate": 0.00010700177935285126, "loss": 1.1817, "step": 1880 }, { "epoch": 0.61, "grad_norm": 0.20593203604221344, "learning_rate": 0.00010685151565807941, "loss": 1.2131, "step": 1881 }, { "epoch": 0.61, "grad_norm": 0.21656179428100586, "learning_rate": 0.00010670129915498279, "loss": 1.1767, "step": 1882 }, { "epoch": 0.61, "grad_norm": 0.2205333560705185, "learning_rate": 0.00010655113000785377, "loss": 1.2963, "step": 1883 }, { "epoch": 0.61, "grad_norm": 0.19941720366477966, "learning_rate": 0.00010640100838093305, "loss": 0.9849, "step": 1884 }, { "epoch": 0.61, "grad_norm": 0.18570660054683685, "learning_rate": 0.00010625093443840916, "loss": 1.1643, "step": 1885 }, { "epoch": 0.61, "grad_norm": 0.2152935415506363, "learning_rate": 0.00010610090834441858, "loss": 1.2753, "step": 1886 }, { "epoch": 0.61, "grad_norm": 0.26462268829345703, "learning_rate": 0.00010595093026304558, "loss": 1.5593, "step": 1887 }, { "epoch": 0.61, "grad_norm": 0.22305627167224884, "learning_rate": 0.00010580100035832173, "loss": 1.2472, "step": 1888 }, { "epoch": 0.61, "grad_norm": 0.20606304705142975, "learning_rate": 0.00010565111879422605, "loss": 1.1204, "step": 1889 }, { "epoch": 0.61, "grad_norm": 0.23786011338233948, "learning_rate": 0.00010550128573468455, "loss": 1.1481, "step": 1890 }, { "epoch": 0.61, "grad_norm": 0.21782884001731873, "learning_rate": 0.00010535150134357023, "loss": 1.0289, "step": 1891 }, { "epoch": 0.61, "grad_norm": 0.26819780468940735, "learning_rate": 0.000105201765784703, "loss": 1.4091, "step": 1892 }, { "epoch": 0.61, "grad_norm": 0.21830952167510986, "learning_rate": 0.00010505207922184922, "loss": 1.303, "step": 1893 }, { "epoch": 0.61, "grad_norm": 0.24495765566825867, "learning_rate": 0.00010490244181872175, "loss": 1.0259, "step": 1894 }, { "epoch": 0.61, "grad_norm": 0.19578564167022705, "learning_rate": 0.00010475285373897954, "loss": 1.2931, "step": 1895 }, { "epoch": 0.61, "grad_norm": 0.21793358027935028, "learning_rate": 0.00010460331514622768, "loss": 1.2331, "step": 1896 }, { "epoch": 0.61, "grad_norm": 0.25122058391571045, "learning_rate": 0.00010445382620401723, "loss": 1.1788, "step": 1897 }, { "epoch": 0.61, "grad_norm": 0.350884348154068, "learning_rate": 0.00010430438707584483, "loss": 1.1495, "step": 1898 }, { "epoch": 0.61, "grad_norm": 0.22064292430877686, "learning_rate": 0.00010415499792515276, "loss": 1.389, "step": 1899 }, { "epoch": 0.61, "grad_norm": 0.2121577262878418, "learning_rate": 0.00010400565891532836, "loss": 1.307, "step": 1900 }, { "epoch": 0.61, "grad_norm": 0.2289271503686905, "learning_rate": 0.00010385637020970442, "loss": 1.1834, "step": 1901 }, { "epoch": 0.61, "grad_norm": 0.2517905831336975, "learning_rate": 0.0001037071319715586, "loss": 0.9681, "step": 1902 }, { "epoch": 0.61, "grad_norm": 0.20954447984695435, "learning_rate": 0.00010355794436411348, "loss": 1.449, "step": 1903 }, { "epoch": 0.61, "grad_norm": 0.24039679765701294, "learning_rate": 0.000103408807550536, "loss": 1.3456, "step": 1904 }, { "epoch": 0.61, "grad_norm": 0.27330678701400757, "learning_rate": 0.00010325972169393777, "loss": 1.1987, "step": 1905 }, { "epoch": 0.61, "grad_norm": 0.22395426034927368, "learning_rate": 0.00010311068695737465, "loss": 1.3119, "step": 1906 }, { "epoch": 0.61, "grad_norm": 0.24717484414577484, "learning_rate": 0.00010296170350384646, "loss": 1.5399, "step": 1907 }, { "epoch": 0.61, "grad_norm": 0.22264094650745392, "learning_rate": 0.00010281277149629715, "loss": 1.2535, "step": 1908 }, { "epoch": 0.61, "grad_norm": 0.23011121153831482, "learning_rate": 0.00010266389109761414, "loss": 1.2149, "step": 1909 }, { "epoch": 0.62, "grad_norm": 0.21383850276470184, "learning_rate": 0.0001025150624706286, "loss": 1.2845, "step": 1910 }, { "epoch": 0.62, "grad_norm": 0.22065086662769318, "learning_rate": 0.00010236628577811496, "loss": 1.3267, "step": 1911 }, { "epoch": 0.62, "grad_norm": 0.3555450737476349, "learning_rate": 0.00010221756118279092, "loss": 1.0963, "step": 1912 }, { "epoch": 0.62, "grad_norm": 0.25223326683044434, "learning_rate": 0.00010206888884731731, "loss": 1.1424, "step": 1913 }, { "epoch": 0.62, "grad_norm": 0.2400698959827423, "learning_rate": 0.00010192026893429749, "loss": 1.1868, "step": 1914 }, { "epoch": 0.62, "grad_norm": 0.31586912274360657, "learning_rate": 0.00010177170160627775, "loss": 1.0083, "step": 1915 }, { "epoch": 0.62, "grad_norm": 0.21913914382457733, "learning_rate": 0.0001016231870257468, "loss": 1.0697, "step": 1916 }, { "epoch": 0.62, "grad_norm": 0.24297192692756653, "learning_rate": 0.00010147472535513563, "loss": 1.0932, "step": 1917 }, { "epoch": 0.62, "grad_norm": 0.22146426141262054, "learning_rate": 0.00010132631675681742, "loss": 1.2403, "step": 1918 }, { "epoch": 0.62, "grad_norm": 0.23472250998020172, "learning_rate": 0.00010117796139310721, "loss": 1.3704, "step": 1919 }, { "epoch": 0.62, "grad_norm": 0.224572092294693, "learning_rate": 0.0001010296594262619, "loss": 1.2928, "step": 1920 }, { "epoch": 0.62, "grad_norm": 0.25819674134254456, "learning_rate": 0.00010088141101847995, "loss": 1.2134, "step": 1921 }, { "epoch": 0.62, "grad_norm": 0.2260867804288864, "learning_rate": 0.00010073321633190127, "loss": 1.1778, "step": 1922 }, { "epoch": 0.62, "grad_norm": 0.24793016910552979, "learning_rate": 0.00010058507552860702, "loss": 1.1423, "step": 1923 }, { "epoch": 0.62, "grad_norm": 0.20945219695568085, "learning_rate": 0.00010043698877061936, "loss": 1.3041, "step": 1924 }, { "epoch": 0.62, "grad_norm": 0.18299436569213867, "learning_rate": 0.00010028895621990142, "loss": 1.2202, "step": 1925 }, { "epoch": 0.62, "grad_norm": 0.25692957639694214, "learning_rate": 0.00010014097803835697, "loss": 1.3388, "step": 1926 }, { "epoch": 0.62, "grad_norm": 0.21527087688446045, "learning_rate": 9.999305438783042e-05, "loss": 1.1144, "step": 1927 }, { "epoch": 0.62, "grad_norm": 0.24602557718753815, "learning_rate": 9.984518543010645e-05, "loss": 1.4404, "step": 1928 }, { "epoch": 0.62, "grad_norm": 0.30776163935661316, "learning_rate": 9.969737132690993e-05, "loss": 1.2381, "step": 1929 }, { "epoch": 0.62, "grad_norm": 0.24421177804470062, "learning_rate": 9.954961223990575e-05, "loss": 1.3164, "step": 1930 }, { "epoch": 0.62, "grad_norm": 0.2572387456893921, "learning_rate": 9.940190833069867e-05, "loss": 1.43, "step": 1931 }, { "epoch": 0.62, "grad_norm": 0.2048458456993103, "learning_rate": 9.925425976083305e-05, "loss": 1.1708, "step": 1932 }, { "epoch": 0.62, "grad_norm": 0.23342037200927734, "learning_rate": 9.910666669179271e-05, "loss": 1.2068, "step": 1933 }, { "epoch": 0.62, "grad_norm": 0.3681604266166687, "learning_rate": 9.895912928500085e-05, "loss": 1.165, "step": 1934 }, { "epoch": 0.62, "grad_norm": 0.1980866938829422, "learning_rate": 9.881164770181968e-05, "loss": 1.2631, "step": 1935 }, { "epoch": 0.62, "grad_norm": 0.25956645607948303, "learning_rate": 9.866422210355047e-05, "loss": 1.3839, "step": 1936 }, { "epoch": 0.62, "grad_norm": 0.22958047688007355, "learning_rate": 9.851685265143319e-05, "loss": 1.1106, "step": 1937 }, { "epoch": 0.62, "grad_norm": 0.2529613673686981, "learning_rate": 9.836953950664638e-05, "loss": 1.1626, "step": 1938 }, { "epoch": 0.62, "grad_norm": 0.20266762375831604, "learning_rate": 9.822228283030709e-05, "loss": 1.2303, "step": 1939 }, { "epoch": 0.62, "grad_norm": 0.22945378720760345, "learning_rate": 9.80750827834705e-05, "loss": 1.3449, "step": 1940 }, { "epoch": 0.63, "grad_norm": 0.2189895063638687, "learning_rate": 9.792793952712994e-05, "loss": 1.399, "step": 1941 }, { "epoch": 0.63, "grad_norm": 0.25502172112464905, "learning_rate": 9.778085322221662e-05, "loss": 1.4674, "step": 1942 }, { "epoch": 0.63, "grad_norm": 0.2518256902694702, "learning_rate": 9.763382402959939e-05, "loss": 1.2113, "step": 1943 }, { "epoch": 0.63, "grad_norm": 0.19882406294345856, "learning_rate": 9.748685211008469e-05, "loss": 0.9867, "step": 1944 }, { "epoch": 0.63, "grad_norm": 0.22933201491832733, "learning_rate": 9.733993762441632e-05, "loss": 1.4593, "step": 1945 }, { "epoch": 0.63, "grad_norm": 0.21869313716888428, "learning_rate": 9.719308073327529e-05, "loss": 1.2329, "step": 1946 }, { "epoch": 0.63, "grad_norm": 0.19844625890254974, "learning_rate": 9.70462815972796e-05, "loss": 1.1469, "step": 1947 }, { "epoch": 0.63, "grad_norm": 0.2335352897644043, "learning_rate": 9.689954037698399e-05, "loss": 1.107, "step": 1948 }, { "epoch": 0.63, "grad_norm": 0.21913117170333862, "learning_rate": 9.675285723287997e-05, "loss": 1.1987, "step": 1949 }, { "epoch": 0.63, "grad_norm": 0.3224123418331146, "learning_rate": 9.660623232539552e-05, "loss": 1.0892, "step": 1950 }, { "epoch": 0.63, "grad_norm": 0.23191231489181519, "learning_rate": 9.645966581489491e-05, "loss": 1.3477, "step": 1951 }, { "epoch": 0.63, "grad_norm": 0.22934098541736603, "learning_rate": 9.631315786167852e-05, "loss": 1.5689, "step": 1952 }, { "epoch": 0.63, "grad_norm": 0.23654374480247498, "learning_rate": 9.616670862598271e-05, "loss": 1.2871, "step": 1953 }, { "epoch": 0.63, "grad_norm": 0.19966895878314972, "learning_rate": 9.602031826797959e-05, "loss": 1.2639, "step": 1954 }, { "epoch": 0.63, "grad_norm": 0.20698106288909912, "learning_rate": 9.587398694777692e-05, "loss": 1.1492, "step": 1955 }, { "epoch": 0.63, "grad_norm": 0.305268794298172, "learning_rate": 9.572771482541795e-05, "loss": 1.1549, "step": 1956 }, { "epoch": 0.63, "grad_norm": 0.2239961326122284, "learning_rate": 9.558150206088088e-05, "loss": 1.1732, "step": 1957 }, { "epoch": 0.63, "grad_norm": 0.21797560155391693, "learning_rate": 9.543534881407941e-05, "loss": 0.975, "step": 1958 }, { "epoch": 0.63, "grad_norm": 0.24047617614269257, "learning_rate": 9.52892552448619e-05, "loss": 1.304, "step": 1959 }, { "epoch": 0.63, "grad_norm": 0.2007191926240921, "learning_rate": 9.514322151301146e-05, "loss": 1.2226, "step": 1960 }, { "epoch": 0.63, "grad_norm": 0.2196929156780243, "learning_rate": 9.499724777824585e-05, "loss": 1.2487, "step": 1961 }, { "epoch": 0.63, "grad_norm": 0.27890273928642273, "learning_rate": 9.485133420021704e-05, "loss": 1.4039, "step": 1962 }, { "epoch": 0.63, "grad_norm": 0.2295176088809967, "learning_rate": 9.470548093851138e-05, "loss": 1.2571, "step": 1963 }, { "epoch": 0.63, "grad_norm": 0.20063872635364532, "learning_rate": 9.455968815264919e-05, "loss": 1.103, "step": 1964 }, { "epoch": 0.63, "grad_norm": 0.22612449526786804, "learning_rate": 9.44139560020846e-05, "loss": 1.4099, "step": 1965 }, { "epoch": 0.63, "grad_norm": 0.18530815839767456, "learning_rate": 9.426828464620557e-05, "loss": 1.2479, "step": 1966 }, { "epoch": 0.63, "grad_norm": 0.22532792389392853, "learning_rate": 9.41226742443333e-05, "loss": 0.8874, "step": 1967 }, { "epoch": 0.63, "grad_norm": 0.20175321400165558, "learning_rate": 9.397712495572258e-05, "loss": 1.1751, "step": 1968 }, { "epoch": 0.63, "grad_norm": 0.2157297283411026, "learning_rate": 9.383163693956126e-05, "loss": 1.2366, "step": 1969 }, { "epoch": 0.63, "grad_norm": 0.20636597275733948, "learning_rate": 9.368621035497017e-05, "loss": 1.0186, "step": 1970 }, { "epoch": 0.63, "grad_norm": 0.23067057132720947, "learning_rate": 9.354084536100306e-05, "loss": 1.3954, "step": 1971 }, { "epoch": 0.64, "grad_norm": 0.2348373681306839, "learning_rate": 9.339554211664607e-05, "loss": 1.1939, "step": 1972 }, { "epoch": 0.64, "grad_norm": 0.22375744581222534, "learning_rate": 9.325030078081797e-05, "loss": 1.4146, "step": 1973 }, { "epoch": 0.64, "grad_norm": 0.2274620532989502, "learning_rate": 9.31051215123699e-05, "loss": 1.1567, "step": 1974 }, { "epoch": 0.64, "grad_norm": 0.2911345958709717, "learning_rate": 9.296000447008502e-05, "loss": 1.3973, "step": 1975 }, { "epoch": 0.64, "grad_norm": 0.24222539365291595, "learning_rate": 9.281494981267843e-05, "loss": 1.24, "step": 1976 }, { "epoch": 0.64, "grad_norm": 0.24429704248905182, "learning_rate": 9.266995769879694e-05, "loss": 1.0422, "step": 1977 }, { "epoch": 0.64, "grad_norm": 0.2325567752122879, "learning_rate": 9.252502828701901e-05, "loss": 1.259, "step": 1978 }, { "epoch": 0.64, "grad_norm": 0.20549564063549042, "learning_rate": 9.238016173585461e-05, "loss": 1.3806, "step": 1979 }, { "epoch": 0.64, "grad_norm": 0.26381024718284607, "learning_rate": 9.223535820374496e-05, "loss": 1.3889, "step": 1980 }, { "epoch": 0.64, "grad_norm": 0.22102083265781403, "learning_rate": 9.209061784906211e-05, "loss": 1.2081, "step": 1981 }, { "epoch": 0.64, "grad_norm": 0.23138472437858582, "learning_rate": 9.194594083010924e-05, "loss": 1.3579, "step": 1982 }, { "epoch": 0.64, "grad_norm": 0.23616616427898407, "learning_rate": 9.18013273051202e-05, "loss": 1.2485, "step": 1983 }, { "epoch": 0.64, "grad_norm": 0.21881447732448578, "learning_rate": 9.165677743225944e-05, "loss": 1.0163, "step": 1984 }, { "epoch": 0.64, "grad_norm": 0.27872389554977417, "learning_rate": 9.15122913696218e-05, "loss": 1.3777, "step": 1985 }, { "epoch": 0.64, "grad_norm": 0.24436554312705994, "learning_rate": 9.136786927523213e-05, "loss": 1.2731, "step": 1986 }, { "epoch": 0.64, "grad_norm": 0.3758157789707184, "learning_rate": 9.122351130704558e-05, "loss": 1.0469, "step": 1987 }, { "epoch": 0.64, "grad_norm": 0.2289917916059494, "learning_rate": 9.1079217622947e-05, "loss": 1.3187, "step": 1988 }, { "epoch": 0.64, "grad_norm": 0.2095954865217209, "learning_rate": 9.093498838075099e-05, "loss": 1.1808, "step": 1989 }, { "epoch": 0.64, "grad_norm": 0.21162250638008118, "learning_rate": 9.079082373820177e-05, "loss": 1.3325, "step": 1990 }, { "epoch": 0.64, "grad_norm": 0.2411748766899109, "learning_rate": 9.064672385297266e-05, "loss": 1.1563, "step": 1991 }, { "epoch": 0.64, "grad_norm": 0.22125375270843506, "learning_rate": 9.050268888266639e-05, "loss": 1.2401, "step": 1992 }, { "epoch": 0.64, "grad_norm": 0.20558294653892517, "learning_rate": 9.035871898481457e-05, "loss": 1.1569, "step": 1993 }, { "epoch": 0.64, "grad_norm": 0.24073773622512817, "learning_rate": 9.021481431687761e-05, "loss": 1.3168, "step": 1994 }, { "epoch": 0.64, "grad_norm": 0.23686756193637848, "learning_rate": 9.007097503624481e-05, "loss": 1.1133, "step": 1995 }, { "epoch": 0.64, "grad_norm": 0.21778719127178192, "learning_rate": 8.992720130023362e-05, "loss": 1.1682, "step": 1996 }, { "epoch": 0.64, "grad_norm": 0.2066560685634613, "learning_rate": 8.978349326609002e-05, "loss": 1.143, "step": 1997 }, { "epoch": 0.64, "grad_norm": 0.2338377833366394, "learning_rate": 8.963985109098809e-05, "loss": 1.2581, "step": 1998 }, { "epoch": 0.64, "grad_norm": 0.24010683596134186, "learning_rate": 8.94962749320298e-05, "loss": 1.2774, "step": 1999 }, { "epoch": 0.64, "grad_norm": 0.21142487227916718, "learning_rate": 8.935276494624519e-05, "loss": 1.321, "step": 2000 }, { "epoch": 0.64, "grad_norm": 0.22684046626091003, "learning_rate": 8.920932129059148e-05, "loss": 1.3202, "step": 2001 }, { "epoch": 0.64, "grad_norm": 0.20757192373275757, "learning_rate": 8.906594412195373e-05, "loss": 1.0998, "step": 2002 }, { "epoch": 0.65, "grad_norm": 0.21215011179447174, "learning_rate": 8.892263359714406e-05, "loss": 1.2113, "step": 2003 }, { "epoch": 0.65, "grad_norm": 0.2848752737045288, "learning_rate": 8.877938987290186e-05, "loss": 1.205, "step": 2004 }, { "epoch": 0.65, "grad_norm": 0.1946825534105301, "learning_rate": 8.863621310589333e-05, "loss": 1.1998, "step": 2005 }, { "epoch": 0.65, "grad_norm": 0.21399159729480743, "learning_rate": 8.84931034527115e-05, "loss": 1.3243, "step": 2006 }, { "epoch": 0.65, "grad_norm": 0.20812033116817474, "learning_rate": 8.835006106987598e-05, "loss": 1.2817, "step": 2007 }, { "epoch": 0.65, "grad_norm": 0.23887750506401062, "learning_rate": 8.820708611383285e-05, "loss": 1.0653, "step": 2008 }, { "epoch": 0.65, "grad_norm": 0.1965831220149994, "learning_rate": 8.806417874095439e-05, "loss": 1.1025, "step": 2009 }, { "epoch": 0.65, "grad_norm": 0.2256527543067932, "learning_rate": 8.792133910753895e-05, "loss": 1.2895, "step": 2010 }, { "epoch": 0.65, "grad_norm": 0.20449650287628174, "learning_rate": 8.777856736981087e-05, "loss": 1.1611, "step": 2011 }, { "epoch": 0.65, "grad_norm": 0.2411455363035202, "learning_rate": 8.763586368392014e-05, "loss": 1.2028, "step": 2012 }, { "epoch": 0.65, "grad_norm": 0.2809162139892578, "learning_rate": 8.74932282059424e-05, "loss": 1.2476, "step": 2013 }, { "epoch": 0.65, "grad_norm": 0.23943482339382172, "learning_rate": 8.735066109187877e-05, "loss": 1.2903, "step": 2014 }, { "epoch": 0.65, "grad_norm": 0.2923581600189209, "learning_rate": 8.720816249765526e-05, "loss": 1.0699, "step": 2015 }, { "epoch": 0.65, "grad_norm": 0.21726946532726288, "learning_rate": 8.70657325791233e-05, "loss": 1.4313, "step": 2016 }, { "epoch": 0.65, "grad_norm": 0.21433621644973755, "learning_rate": 8.692337149205911e-05, "loss": 1.2683, "step": 2017 }, { "epoch": 0.65, "grad_norm": 0.24635010957717896, "learning_rate": 8.678107939216353e-05, "loss": 1.3554, "step": 2018 }, { "epoch": 0.65, "grad_norm": 0.22019051015377045, "learning_rate": 8.663885643506214e-05, "loss": 1.3116, "step": 2019 }, { "epoch": 0.65, "grad_norm": 0.20503367483615875, "learning_rate": 8.649670277630458e-05, "loss": 1.1222, "step": 2020 }, { "epoch": 0.65, "grad_norm": 0.22920569777488708, "learning_rate": 8.635461857136504e-05, "loss": 1.011, "step": 2021 }, { "epoch": 0.65, "grad_norm": 0.20049595832824707, "learning_rate": 8.621260397564148e-05, "loss": 1.1005, "step": 2022 }, { "epoch": 0.65, "grad_norm": 0.21705208718776703, "learning_rate": 8.607065914445594e-05, "loss": 1.369, "step": 2023 }, { "epoch": 0.65, "grad_norm": 0.23551462590694427, "learning_rate": 8.592878423305417e-05, "loss": 1.2692, "step": 2024 }, { "epoch": 0.65, "grad_norm": 0.19666720926761627, "learning_rate": 8.57869793966051e-05, "loss": 1.0895, "step": 2025 }, { "epoch": 0.65, "grad_norm": 0.18406784534454346, "learning_rate": 8.564524479020143e-05, "loss": 1.2968, "step": 2026 }, { "epoch": 0.65, "grad_norm": 0.21811358630657196, "learning_rate": 8.550358056885879e-05, "loss": 1.1884, "step": 2027 }, { "epoch": 0.65, "grad_norm": 0.19620393216609955, "learning_rate": 8.536198688751599e-05, "loss": 1.0622, "step": 2028 }, { "epoch": 0.65, "grad_norm": 0.2169800102710724, "learning_rate": 8.522046390103472e-05, "loss": 1.406, "step": 2029 }, { "epoch": 0.65, "grad_norm": 0.2159881442785263, "learning_rate": 8.5079011764199e-05, "loss": 1.3427, "step": 2030 }, { "epoch": 0.65, "grad_norm": 0.21826326847076416, "learning_rate": 8.493763063171584e-05, "loss": 1.2343, "step": 2031 }, { "epoch": 0.65, "grad_norm": 0.22435736656188965, "learning_rate": 8.479632065821423e-05, "loss": 1.289, "step": 2032 }, { "epoch": 0.65, "grad_norm": 0.21648605167865753, "learning_rate": 8.465508199824555e-05, "loss": 1.0693, "step": 2033 }, { "epoch": 0.66, "grad_norm": 0.22372418642044067, "learning_rate": 8.451391480628312e-05, "loss": 1.2366, "step": 2034 }, { "epoch": 0.66, "grad_norm": 0.23840102553367615, "learning_rate": 8.437281923672197e-05, "loss": 1.4474, "step": 2035 }, { "epoch": 0.66, "grad_norm": 0.3252316415309906, "learning_rate": 8.423179544387902e-05, "loss": 1.2487, "step": 2036 }, { "epoch": 0.66, "grad_norm": 0.23795999586582184, "learning_rate": 8.409084358199247e-05, "loss": 1.2433, "step": 2037 }, { "epoch": 0.66, "grad_norm": 0.23820997774600983, "learning_rate": 8.394996380522207e-05, "loss": 1.338, "step": 2038 }, { "epoch": 0.66, "grad_norm": 0.20267938077449799, "learning_rate": 8.380915626764857e-05, "loss": 1.1344, "step": 2039 }, { "epoch": 0.66, "grad_norm": 0.2077198475599289, "learning_rate": 8.366842112327366e-05, "loss": 1.2202, "step": 2040 }, { "epoch": 0.66, "grad_norm": 0.2409699261188507, "learning_rate": 8.35277585260201e-05, "loss": 1.2214, "step": 2041 }, { "epoch": 0.66, "grad_norm": 0.20482519268989563, "learning_rate": 8.3387168629731e-05, "loss": 1.0234, "step": 2042 }, { "epoch": 0.66, "grad_norm": 0.19511370360851288, "learning_rate": 8.324665158817028e-05, "loss": 1.2534, "step": 2043 }, { "epoch": 0.66, "grad_norm": 0.2048705816268921, "learning_rate": 8.310620755502188e-05, "loss": 1.1353, "step": 2044 }, { "epoch": 0.66, "grad_norm": 0.2178376466035843, "learning_rate": 8.296583668389e-05, "loss": 1.252, "step": 2045 }, { "epoch": 0.66, "grad_norm": 0.2069086879491806, "learning_rate": 8.282553912829896e-05, "loss": 1.1726, "step": 2046 }, { "epoch": 0.66, "grad_norm": 0.247243732213974, "learning_rate": 8.268531504169264e-05, "loss": 1.36, "step": 2047 }, { "epoch": 0.66, "grad_norm": 0.22932399809360504, "learning_rate": 8.254516457743484e-05, "loss": 1.2114, "step": 2048 }, { "epoch": 0.66, "grad_norm": 0.1920744776725769, "learning_rate": 8.240508788880864e-05, "loss": 1.1057, "step": 2049 }, { "epoch": 0.66, "grad_norm": 0.20696032047271729, "learning_rate": 8.226508512901641e-05, "loss": 1.2395, "step": 2050 }, { "epoch": 0.66, "grad_norm": 0.2279173582792282, "learning_rate": 8.212515645117988e-05, "loss": 1.1563, "step": 2051 }, { "epoch": 0.66, "grad_norm": 0.2230670154094696, "learning_rate": 8.198530200833949e-05, "loss": 1.2481, "step": 2052 }, { "epoch": 0.66, "grad_norm": 0.25494638085365295, "learning_rate": 8.184552195345477e-05, "loss": 1.3873, "step": 2053 }, { "epoch": 0.66, "grad_norm": 0.24089281260967255, "learning_rate": 8.170581643940365e-05, "loss": 1.0003, "step": 2054 }, { "epoch": 0.66, "grad_norm": 0.2301253229379654, "learning_rate": 8.15661856189826e-05, "loss": 1.219, "step": 2055 }, { "epoch": 0.66, "grad_norm": 0.34010156989097595, "learning_rate": 8.142662964490649e-05, "loss": 1.2343, "step": 2056 }, { "epoch": 0.66, "grad_norm": 0.250393271446228, "learning_rate": 8.128714866980824e-05, "loss": 1.2891, "step": 2057 }, { "epoch": 0.66, "grad_norm": 0.2116616666316986, "learning_rate": 8.114774284623872e-05, "loss": 1.0901, "step": 2058 }, { "epoch": 0.66, "grad_norm": 0.2716885209083557, "learning_rate": 8.100841232666675e-05, "loss": 1.322, "step": 2059 }, { "epoch": 0.66, "grad_norm": 0.20988288521766663, "learning_rate": 8.086915726347861e-05, "loss": 1.1568, "step": 2060 }, { "epoch": 0.66, "grad_norm": 0.21560055017471313, "learning_rate": 8.07299778089782e-05, "loss": 1.233, "step": 2061 }, { "epoch": 0.66, "grad_norm": 0.1897503137588501, "learning_rate": 8.059087411538665e-05, "loss": 1.2766, "step": 2062 }, { "epoch": 0.66, "grad_norm": 0.23447217047214508, "learning_rate": 8.045184633484214e-05, "loss": 1.2572, "step": 2063 }, { "epoch": 0.66, "grad_norm": 0.23582716286182404, "learning_rate": 8.03128946194001e-05, "loss": 1.3946, "step": 2064 }, { "epoch": 0.67, "grad_norm": 0.1963294893503189, "learning_rate": 8.017401912103244e-05, "loss": 1.2426, "step": 2065 }, { "epoch": 0.67, "grad_norm": 0.21835148334503174, "learning_rate": 8.003521999162798e-05, "loss": 1.1914, "step": 2066 }, { "epoch": 0.67, "grad_norm": 0.21767401695251465, "learning_rate": 7.989649738299186e-05, "loss": 1.4345, "step": 2067 }, { "epoch": 0.67, "grad_norm": 0.21325565874576569, "learning_rate": 7.975785144684551e-05, "loss": 1.3424, "step": 2068 }, { "epoch": 0.67, "grad_norm": 0.1895415335893631, "learning_rate": 7.961928233482666e-05, "loss": 1.2902, "step": 2069 }, { "epoch": 0.67, "grad_norm": 0.21198135614395142, "learning_rate": 7.948079019848881e-05, "loss": 1.2886, "step": 2070 }, { "epoch": 0.67, "grad_norm": 0.21947389841079712, "learning_rate": 7.934237518930149e-05, "loss": 1.1969, "step": 2071 }, { "epoch": 0.67, "grad_norm": 0.22453470528125763, "learning_rate": 7.920403745864974e-05, "loss": 1.2877, "step": 2072 }, { "epoch": 0.67, "grad_norm": 0.23647601902484894, "learning_rate": 7.906577715783402e-05, "loss": 1.283, "step": 2073 }, { "epoch": 0.67, "grad_norm": 0.21207551658153534, "learning_rate": 7.892759443807031e-05, "loss": 1.1676, "step": 2074 }, { "epoch": 0.67, "grad_norm": 0.25062137842178345, "learning_rate": 7.87894894504895e-05, "loss": 1.4689, "step": 2075 }, { "epoch": 0.67, "grad_norm": 0.1753040999174118, "learning_rate": 7.865146234613774e-05, "loss": 1.1221, "step": 2076 }, { "epoch": 0.67, "grad_norm": 0.20802544057369232, "learning_rate": 7.851351327597572e-05, "loss": 1.3427, "step": 2077 }, { "epoch": 0.67, "grad_norm": 0.2348269671201706, "learning_rate": 7.83756423908789e-05, "loss": 1.189, "step": 2078 }, { "epoch": 0.67, "grad_norm": 0.23785416781902313, "learning_rate": 7.82378498416373e-05, "loss": 1.0392, "step": 2079 }, { "epoch": 0.67, "grad_norm": 0.213900625705719, "learning_rate": 7.810013577895516e-05, "loss": 0.9618, "step": 2080 }, { "epoch": 0.67, "grad_norm": 0.21432040631771088, "learning_rate": 7.796250035345098e-05, "loss": 1.2538, "step": 2081 }, { "epoch": 0.67, "grad_norm": 0.22437874972820282, "learning_rate": 7.782494371565713e-05, "loss": 1.1354, "step": 2082 }, { "epoch": 0.67, "grad_norm": 0.23798759281635284, "learning_rate": 7.768746601601984e-05, "loss": 1.3249, "step": 2083 }, { "epoch": 0.67, "grad_norm": 0.22009612619876862, "learning_rate": 7.755006740489914e-05, "loss": 0.9539, "step": 2084 }, { "epoch": 0.67, "grad_norm": 0.211623415350914, "learning_rate": 7.741274803256832e-05, "loss": 1.3124, "step": 2085 }, { "epoch": 0.67, "grad_norm": 0.2262391448020935, "learning_rate": 7.727550804921437e-05, "loss": 0.9569, "step": 2086 }, { "epoch": 0.67, "grad_norm": 0.192141592502594, "learning_rate": 7.713834760493696e-05, "loss": 1.1825, "step": 2087 }, { "epoch": 0.67, "grad_norm": 0.21659709513187408, "learning_rate": 7.700126684974914e-05, "loss": 1.0916, "step": 2088 }, { "epoch": 0.67, "grad_norm": 0.24990904331207275, "learning_rate": 7.68642659335768e-05, "loss": 1.1326, "step": 2089 }, { "epoch": 0.67, "grad_norm": 0.2712962031364441, "learning_rate": 7.672734500625823e-05, "loss": 1.2803, "step": 2090 }, { "epoch": 0.67, "grad_norm": 0.20040152966976166, "learning_rate": 7.659050421754461e-05, "loss": 1.256, "step": 2091 }, { "epoch": 0.67, "grad_norm": 0.234332874417305, "learning_rate": 7.645374371709908e-05, "loss": 1.0011, "step": 2092 }, { "epoch": 0.67, "grad_norm": 0.24715901911258698, "learning_rate": 7.631706365449722e-05, "loss": 0.9587, "step": 2093 }, { "epoch": 0.67, "grad_norm": 0.30443090200424194, "learning_rate": 7.618046417922669e-05, "loss": 1.0578, "step": 2094 }, { "epoch": 0.67, "grad_norm": 0.25814035534858704, "learning_rate": 7.604394544068673e-05, "loss": 1.3069, "step": 2095 }, { "epoch": 0.68, "grad_norm": 0.22119086980819702, "learning_rate": 7.590750758818865e-05, "loss": 1.287, "step": 2096 }, { "epoch": 0.68, "grad_norm": 0.2524826228618622, "learning_rate": 7.577115077095487e-05, "loss": 1.2928, "step": 2097 }, { "epoch": 0.68, "grad_norm": 0.22999361157417297, "learning_rate": 7.563487513811951e-05, "loss": 1.3141, "step": 2098 }, { "epoch": 0.68, "grad_norm": 0.2393343150615692, "learning_rate": 7.549868083872783e-05, "loss": 1.2353, "step": 2099 }, { "epoch": 0.68, "grad_norm": 0.21352727711200714, "learning_rate": 7.536256802173601e-05, "loss": 1.3767, "step": 2100 }, { "epoch": 0.68, "grad_norm": 0.2191859930753708, "learning_rate": 7.522653683601136e-05, "loss": 1.169, "step": 2101 }, { "epoch": 0.68, "grad_norm": 0.2023894190788269, "learning_rate": 7.509058743033152e-05, "loss": 1.0971, "step": 2102 }, { "epoch": 0.68, "grad_norm": 0.21936270594596863, "learning_rate": 7.495471995338504e-05, "loss": 1.2654, "step": 2103 }, { "epoch": 0.68, "grad_norm": 0.18186677992343903, "learning_rate": 7.481893455377076e-05, "loss": 1.2854, "step": 2104 }, { "epoch": 0.68, "grad_norm": 0.23681816458702087, "learning_rate": 7.468323137999764e-05, "loss": 1.2286, "step": 2105 }, { "epoch": 0.68, "grad_norm": 0.24785993993282318, "learning_rate": 7.454761058048499e-05, "loss": 1.2051, "step": 2106 }, { "epoch": 0.68, "grad_norm": 0.23144258558750153, "learning_rate": 7.44120723035616e-05, "loss": 1.2043, "step": 2107 }, { "epoch": 0.68, "grad_norm": 0.23059044778347015, "learning_rate": 7.427661669746639e-05, "loss": 1.251, "step": 2108 }, { "epoch": 0.68, "grad_norm": 0.24321940541267395, "learning_rate": 7.414124391034763e-05, "loss": 1.3773, "step": 2109 }, { "epoch": 0.68, "grad_norm": 0.23281802237033844, "learning_rate": 7.40059540902632e-05, "loss": 1.2072, "step": 2110 }, { "epoch": 0.68, "grad_norm": 0.23939506709575653, "learning_rate": 7.387074738518007e-05, "loss": 1.2969, "step": 2111 }, { "epoch": 0.68, "grad_norm": 0.2104898989200592, "learning_rate": 7.37356239429743e-05, "loss": 1.2653, "step": 2112 }, { "epoch": 0.68, "grad_norm": 0.20797181129455566, "learning_rate": 7.36005839114311e-05, "loss": 0.9898, "step": 2113 }, { "epoch": 0.68, "grad_norm": 0.24435727298259735, "learning_rate": 7.346562743824419e-05, "loss": 1.159, "step": 2114 }, { "epoch": 0.68, "grad_norm": 0.20350463688373566, "learning_rate": 7.333075467101613e-05, "loss": 1.2012, "step": 2115 }, { "epoch": 0.68, "grad_norm": 0.24540941417217255, "learning_rate": 7.319596575725774e-05, "loss": 1.4329, "step": 2116 }, { "epoch": 0.68, "grad_norm": 0.2652702033519745, "learning_rate": 7.30612608443882e-05, "loss": 1.3183, "step": 2117 }, { "epoch": 0.68, "grad_norm": 0.24594725668430328, "learning_rate": 7.292664007973491e-05, "loss": 1.1611, "step": 2118 }, { "epoch": 0.68, "grad_norm": 0.2050599455833435, "learning_rate": 7.279210361053305e-05, "loss": 0.9837, "step": 2119 }, { "epoch": 0.68, "grad_norm": 0.19493547081947327, "learning_rate": 7.265765158392583e-05, "loss": 1.1453, "step": 2120 }, { "epoch": 0.68, "grad_norm": 0.19850972294807434, "learning_rate": 7.252328414696392e-05, "loss": 1.2008, "step": 2121 }, { "epoch": 0.68, "grad_norm": 0.20128007233142853, "learning_rate": 7.238900144660548e-05, "loss": 1.1401, "step": 2122 }, { "epoch": 0.68, "grad_norm": 0.24815785884857178, "learning_rate": 7.225480362971617e-05, "loss": 1.4211, "step": 2123 }, { "epoch": 0.68, "grad_norm": 0.2082618772983551, "learning_rate": 7.212069084306859e-05, "loss": 1.0278, "step": 2124 }, { "epoch": 0.68, "grad_norm": 0.1967877894639969, "learning_rate": 7.198666323334258e-05, "loss": 1.2673, "step": 2125 }, { "epoch": 0.68, "grad_norm": 0.21694181859493256, "learning_rate": 7.185272094712459e-05, "loss": 1.2649, "step": 2126 }, { "epoch": 0.69, "grad_norm": 0.21920688450336456, "learning_rate": 7.17188641309078e-05, "loss": 1.2296, "step": 2127 }, { "epoch": 0.69, "grad_norm": 0.22853052616119385, "learning_rate": 7.15850929310921e-05, "loss": 0.9255, "step": 2128 }, { "epoch": 0.69, "grad_norm": 0.21062695980072021, "learning_rate": 7.145140749398348e-05, "loss": 1.3635, "step": 2129 }, { "epoch": 0.69, "grad_norm": 0.22043827176094055, "learning_rate": 7.131780796579436e-05, "loss": 1.2383, "step": 2130 }, { "epoch": 0.69, "grad_norm": 0.192093625664711, "learning_rate": 7.118429449264304e-05, "loss": 1.2517, "step": 2131 }, { "epoch": 0.69, "grad_norm": 0.2408820539712906, "learning_rate": 7.105086722055372e-05, "loss": 0.9445, "step": 2132 }, { "epoch": 0.69, "grad_norm": 0.21114985644817352, "learning_rate": 7.091752629545643e-05, "loss": 1.1958, "step": 2133 }, { "epoch": 0.69, "grad_norm": 0.20771026611328125, "learning_rate": 7.078427186318666e-05, "loss": 1.2602, "step": 2134 }, { "epoch": 0.69, "grad_norm": 0.26136893033981323, "learning_rate": 7.065110406948529e-05, "loss": 1.2508, "step": 2135 }, { "epoch": 0.69, "grad_norm": 0.21568098664283752, "learning_rate": 7.051802305999855e-05, "loss": 1.3509, "step": 2136 }, { "epoch": 0.69, "grad_norm": 0.2012990415096283, "learning_rate": 7.038502898027763e-05, "loss": 1.1483, "step": 2137 }, { "epoch": 0.69, "grad_norm": 0.24891623854637146, "learning_rate": 7.025212197577878e-05, "loss": 1.1031, "step": 2138 }, { "epoch": 0.69, "grad_norm": 0.7770939469337463, "learning_rate": 7.01193021918629e-05, "loss": 0.9363, "step": 2139 }, { "epoch": 0.69, "grad_norm": 0.21969616413116455, "learning_rate": 6.998656977379546e-05, "loss": 1.5128, "step": 2140 }, { "epoch": 0.69, "grad_norm": 0.189207524061203, "learning_rate": 6.985392486674658e-05, "loss": 1.2517, "step": 2141 }, { "epoch": 0.69, "grad_norm": 0.20651468634605408, "learning_rate": 6.972136761579043e-05, "loss": 1.2838, "step": 2142 }, { "epoch": 0.69, "grad_norm": 0.21794593334197998, "learning_rate": 6.958889816590553e-05, "loss": 1.3575, "step": 2143 }, { "epoch": 0.69, "grad_norm": 0.2124781608581543, "learning_rate": 6.945651666197421e-05, "loss": 1.3845, "step": 2144 }, { "epoch": 0.69, "grad_norm": 0.23195292055606842, "learning_rate": 6.932422324878261e-05, "loss": 1.4305, "step": 2145 }, { "epoch": 0.69, "grad_norm": 0.23518086969852448, "learning_rate": 6.91920180710207e-05, "loss": 1.0177, "step": 2146 }, { "epoch": 0.69, "grad_norm": 0.2591630816459656, "learning_rate": 6.90599012732817e-05, "loss": 1.0422, "step": 2147 }, { "epoch": 0.69, "grad_norm": 0.2082175314426422, "learning_rate": 6.89278730000624e-05, "loss": 1.2043, "step": 2148 }, { "epoch": 0.69, "grad_norm": 0.21911568939685822, "learning_rate": 6.879593339576264e-05, "loss": 0.9675, "step": 2149 }, { "epoch": 0.69, "grad_norm": 0.2385088950395584, "learning_rate": 6.866408260468525e-05, "loss": 1.2578, "step": 2150 }, { "epoch": 0.69, "grad_norm": 0.21753735840320587, "learning_rate": 6.853232077103609e-05, "loss": 1.18, "step": 2151 }, { "epoch": 0.69, "grad_norm": 0.24696414172649384, "learning_rate": 6.840064803892348e-05, "loss": 1.2942, "step": 2152 }, { "epoch": 0.69, "grad_norm": 0.3201030194759369, "learning_rate": 6.826906455235858e-05, "loss": 1.1245, "step": 2153 }, { "epoch": 0.69, "grad_norm": 0.24701577425003052, "learning_rate": 6.813757045525473e-05, "loss": 1.4204, "step": 2154 }, { "epoch": 0.69, "grad_norm": 0.20736686885356903, "learning_rate": 6.80061658914275e-05, "loss": 1.1447, "step": 2155 }, { "epoch": 0.69, "grad_norm": 0.2964246869087219, "learning_rate": 6.787485100459472e-05, "loss": 1.3364, "step": 2156 }, { "epoch": 0.69, "grad_norm": 0.22118380665779114, "learning_rate": 6.77436259383759e-05, "loss": 1.3668, "step": 2157 }, { "epoch": 0.7, "grad_norm": 0.23021437227725983, "learning_rate": 6.761249083629255e-05, "loss": 1.3613, "step": 2158 }, { "epoch": 0.7, "grad_norm": 0.21247412264347076, "learning_rate": 6.748144584176764e-05, "loss": 1.3302, "step": 2159 }, { "epoch": 0.7, "grad_norm": 0.23494952917099, "learning_rate": 6.735049109812554e-05, "loss": 1.2324, "step": 2160 }, { "epoch": 0.7, "grad_norm": 0.28689083456993103, "learning_rate": 6.72196267485921e-05, "loss": 1.0808, "step": 2161 }, { "epoch": 0.7, "grad_norm": 0.23431643843650818, "learning_rate": 6.708885293629411e-05, "loss": 1.3918, "step": 2162 }, { "epoch": 0.7, "grad_norm": 0.23590588569641113, "learning_rate": 6.695816980425954e-05, "loss": 1.2609, "step": 2163 }, { "epoch": 0.7, "grad_norm": 0.2869270443916321, "learning_rate": 6.6827577495417e-05, "loss": 1.2351, "step": 2164 }, { "epoch": 0.7, "grad_norm": 0.25939908623695374, "learning_rate": 6.669707615259577e-05, "loss": 1.0491, "step": 2165 }, { "epoch": 0.7, "grad_norm": 0.2543277144432068, "learning_rate": 6.656666591852583e-05, "loss": 1.2965, "step": 2166 }, { "epoch": 0.7, "grad_norm": 0.20502924919128418, "learning_rate": 6.643634693583729e-05, "loss": 1.2365, "step": 2167 }, { "epoch": 0.7, "grad_norm": 0.22834144532680511, "learning_rate": 6.630611934706072e-05, "loss": 1.3825, "step": 2168 }, { "epoch": 0.7, "grad_norm": 0.2505572438240051, "learning_rate": 6.617598329462634e-05, "loss": 0.9741, "step": 2169 }, { "epoch": 0.7, "grad_norm": 0.24858832359313965, "learning_rate": 6.604593892086461e-05, "loss": 1.2862, "step": 2170 }, { "epoch": 0.7, "grad_norm": 0.2111663520336151, "learning_rate": 6.591598636800563e-05, "loss": 0.9436, "step": 2171 }, { "epoch": 0.7, "grad_norm": 0.21366052329540253, "learning_rate": 6.578612577817895e-05, "loss": 1.2928, "step": 2172 }, { "epoch": 0.7, "grad_norm": 0.23009291291236877, "learning_rate": 6.56563572934138e-05, "loss": 1.1801, "step": 2173 }, { "epoch": 0.7, "grad_norm": 0.27441802620887756, "learning_rate": 6.55266810556383e-05, "loss": 1.2605, "step": 2174 }, { "epoch": 0.7, "grad_norm": 0.20026183128356934, "learning_rate": 6.539709720667996e-05, "loss": 1.1158, "step": 2175 }, { "epoch": 0.7, "grad_norm": 0.21234755218029022, "learning_rate": 6.526760588826526e-05, "loss": 1.2844, "step": 2176 }, { "epoch": 0.7, "grad_norm": 0.2521378695964813, "learning_rate": 6.513820724201925e-05, "loss": 1.2511, "step": 2177 }, { "epoch": 0.7, "grad_norm": 0.2580187916755676, "learning_rate": 6.500890140946593e-05, "loss": 1.1545, "step": 2178 }, { "epoch": 0.7, "grad_norm": 0.20864969491958618, "learning_rate": 6.487968853202743e-05, "loss": 1.2957, "step": 2179 }, { "epoch": 0.7, "grad_norm": 0.2509130835533142, "learning_rate": 6.475056875102448e-05, "loss": 1.1761, "step": 2180 }, { "epoch": 0.7, "grad_norm": 0.2818349301815033, "learning_rate": 6.4621542207676e-05, "loss": 0.9293, "step": 2181 }, { "epoch": 0.7, "grad_norm": 0.2032376527786255, "learning_rate": 6.449260904309876e-05, "loss": 1.3705, "step": 2182 }, { "epoch": 0.7, "grad_norm": 0.22746068239212036, "learning_rate": 6.43637693983076e-05, "loss": 1.0104, "step": 2183 }, { "epoch": 0.7, "grad_norm": 0.22112274169921875, "learning_rate": 6.423502341421478e-05, "loss": 1.0536, "step": 2184 }, { "epoch": 0.7, "grad_norm": 0.22156673669815063, "learning_rate": 6.410637123163044e-05, "loss": 1.2978, "step": 2185 }, { "epoch": 0.7, "grad_norm": 0.21167348325252533, "learning_rate": 6.397781299126204e-05, "loss": 1.2437, "step": 2186 }, { "epoch": 0.7, "grad_norm": 0.24293354153633118, "learning_rate": 6.384934883371421e-05, "loss": 1.2057, "step": 2187 }, { "epoch": 0.7, "grad_norm": 0.21188586950302124, "learning_rate": 6.372097889948872e-05, "loss": 1.1117, "step": 2188 }, { "epoch": 0.71, "grad_norm": 0.22132626175880432, "learning_rate": 6.359270332898426e-05, "loss": 1.2054, "step": 2189 }, { "epoch": 0.71, "grad_norm": 0.19134750962257385, "learning_rate": 6.346452226249635e-05, "loss": 1.2721, "step": 2190 }, { "epoch": 0.71, "grad_norm": 0.23727542161941528, "learning_rate": 6.333643584021729e-05, "loss": 1.2934, "step": 2191 }, { "epoch": 0.71, "grad_norm": 0.2331557720899582, "learning_rate": 6.32084442022356e-05, "loss": 1.4589, "step": 2192 }, { "epoch": 0.71, "grad_norm": 0.23714928328990936, "learning_rate": 6.308054748853632e-05, "loss": 1.2279, "step": 2193 }, { "epoch": 0.71, "grad_norm": 0.21280521154403687, "learning_rate": 6.295274583900051e-05, "loss": 1.3566, "step": 2194 }, { "epoch": 0.71, "grad_norm": 0.22394101321697235, "learning_rate": 6.282503939340551e-05, "loss": 1.2505, "step": 2195 }, { "epoch": 0.71, "grad_norm": 0.1881391704082489, "learning_rate": 6.269742829142424e-05, "loss": 1.3322, "step": 2196 }, { "epoch": 0.71, "grad_norm": 0.2821137607097626, "learning_rate": 6.256991267262563e-05, "loss": 1.3294, "step": 2197 }, { "epoch": 0.71, "grad_norm": 0.21807533502578735, "learning_rate": 6.244249267647395e-05, "loss": 1.0312, "step": 2198 }, { "epoch": 0.71, "grad_norm": 0.21811676025390625, "learning_rate": 6.231516844232896e-05, "loss": 1.3159, "step": 2199 }, { "epoch": 0.71, "grad_norm": 0.22103741765022278, "learning_rate": 6.218794010944578e-05, "loss": 1.1905, "step": 2200 }, { "epoch": 0.71, "grad_norm": 0.23731067776679993, "learning_rate": 6.206080781697447e-05, "loss": 1.3087, "step": 2201 }, { "epoch": 0.71, "grad_norm": 0.236775740981102, "learning_rate": 6.193377170396022e-05, "loss": 1.2486, "step": 2202 }, { "epoch": 0.71, "grad_norm": 0.18403004109859467, "learning_rate": 6.180683190934294e-05, "loss": 1.2299, "step": 2203 }, { "epoch": 0.71, "grad_norm": 0.25756746530532837, "learning_rate": 6.167998857195711e-05, "loss": 1.1907, "step": 2204 }, { "epoch": 0.71, "grad_norm": 0.23585070669651031, "learning_rate": 6.155324183053196e-05, "loss": 1.2972, "step": 2205 }, { "epoch": 0.71, "grad_norm": 2.2281575202941895, "learning_rate": 6.142659182369078e-05, "loss": 1.2596, "step": 2206 }, { "epoch": 0.71, "grad_norm": 0.22870998084545135, "learning_rate": 6.130003868995135e-05, "loss": 1.4846, "step": 2207 }, { "epoch": 0.71, "grad_norm": 0.2569896876811981, "learning_rate": 6.117358256772527e-05, "loss": 1.2842, "step": 2208 }, { "epoch": 0.71, "grad_norm": 0.1746165156364441, "learning_rate": 6.104722359531806e-05, "loss": 1.1491, "step": 2209 }, { "epoch": 0.71, "grad_norm": 0.20735208690166473, "learning_rate": 6.092096191092919e-05, "loss": 1.0063, "step": 2210 }, { "epoch": 0.71, "grad_norm": 0.26124507188796997, "learning_rate": 6.079479765265152e-05, "loss": 1.1016, "step": 2211 }, { "epoch": 0.71, "grad_norm": 0.21848316490650177, "learning_rate": 6.0668730958471346e-05, "loss": 1.2246, "step": 2212 }, { "epoch": 0.71, "grad_norm": 0.21096019446849823, "learning_rate": 6.0542761966268446e-05, "loss": 1.1725, "step": 2213 }, { "epoch": 0.71, "grad_norm": 0.32728666067123413, "learning_rate": 6.0416890813815554e-05, "loss": 1.4812, "step": 2214 }, { "epoch": 0.71, "grad_norm": 0.2370179444551468, "learning_rate": 6.029111763877852e-05, "loss": 1.1911, "step": 2215 }, { "epoch": 0.71, "grad_norm": 0.2474047839641571, "learning_rate": 6.016544257871597e-05, "loss": 1.2234, "step": 2216 }, { "epoch": 0.71, "grad_norm": 0.2255847007036209, "learning_rate": 6.003986577107918e-05, "loss": 1.2533, "step": 2217 }, { "epoch": 0.71, "grad_norm": 0.226525217294693, "learning_rate": 5.99143873532121e-05, "loss": 1.2979, "step": 2218 }, { "epoch": 0.71, "grad_norm": 0.3025433421134949, "learning_rate": 5.978900746235092e-05, "loss": 1.2072, "step": 2219 }, { "epoch": 0.72, "grad_norm": 0.21538271009922028, "learning_rate": 5.9663726235624224e-05, "loss": 0.9267, "step": 2220 }, { "epoch": 0.72, "grad_norm": 0.21958796679973602, "learning_rate": 5.9538543810052577e-05, "loss": 1.1552, "step": 2221 }, { "epoch": 0.72, "grad_norm": 0.2941027581691742, "learning_rate": 5.9413460322548425e-05, "loss": 1.2898, "step": 2222 }, { "epoch": 0.72, "grad_norm": 0.23357254266738892, "learning_rate": 5.928847590991624e-05, "loss": 1.2547, "step": 2223 }, { "epoch": 0.72, "grad_norm": 0.24381878972053528, "learning_rate": 5.916359070885186e-05, "loss": 1.0089, "step": 2224 }, { "epoch": 0.72, "grad_norm": 0.3168037235736847, "learning_rate": 5.9038804855942865e-05, "loss": 1.2698, "step": 2225 }, { "epoch": 0.72, "grad_norm": 0.1952332854270935, "learning_rate": 5.8914118487667986e-05, "loss": 1.0256, "step": 2226 }, { "epoch": 0.72, "grad_norm": 0.21431083977222443, "learning_rate": 5.878953174039717e-05, "loss": 1.2773, "step": 2227 }, { "epoch": 0.72, "grad_norm": 0.2037636637687683, "learning_rate": 5.8665044750391586e-05, "loss": 1.3317, "step": 2228 }, { "epoch": 0.72, "grad_norm": 0.22199414670467377, "learning_rate": 5.854065765380302e-05, "loss": 1.1225, "step": 2229 }, { "epoch": 0.72, "grad_norm": 0.24343295395374298, "learning_rate": 5.8416370586674294e-05, "loss": 1.1449, "step": 2230 }, { "epoch": 0.72, "grad_norm": 0.2512519359588623, "learning_rate": 5.829218368493861e-05, "loss": 1.3894, "step": 2231 }, { "epoch": 0.72, "grad_norm": 0.21513843536376953, "learning_rate": 5.8168097084419655e-05, "loss": 1.2884, "step": 2232 }, { "epoch": 0.72, "grad_norm": 0.2376331239938736, "learning_rate": 5.804411092083156e-05, "loss": 1.2795, "step": 2233 }, { "epoch": 0.72, "grad_norm": 0.26838359236717224, "learning_rate": 5.79202253297784e-05, "loss": 1.2509, "step": 2234 }, { "epoch": 0.72, "grad_norm": 0.24553947150707245, "learning_rate": 5.779644044675445e-05, "loss": 1.1761, "step": 2235 }, { "epoch": 0.72, "grad_norm": 0.20432928204536438, "learning_rate": 5.767275640714371e-05, "loss": 1.3218, "step": 2236 }, { "epoch": 0.72, "grad_norm": 0.24067507684230804, "learning_rate": 5.754917334621986e-05, "loss": 1.4372, "step": 2237 }, { "epoch": 0.72, "grad_norm": 0.3351072669029236, "learning_rate": 5.74256913991463e-05, "loss": 1.1836, "step": 2238 }, { "epoch": 0.72, "grad_norm": 0.22834013402462006, "learning_rate": 5.730231070097566e-05, "loss": 1.2521, "step": 2239 }, { "epoch": 0.72, "grad_norm": 0.20946410298347473, "learning_rate": 5.7179031386649996e-05, "loss": 1.0578, "step": 2240 }, { "epoch": 0.72, "grad_norm": 0.19541223347187042, "learning_rate": 5.705585359100039e-05, "loss": 1.2685, "step": 2241 }, { "epoch": 0.72, "grad_norm": 0.2095188945531845, "learning_rate": 5.6932777448746816e-05, "loss": 1.2775, "step": 2242 }, { "epoch": 0.72, "grad_norm": 0.20821121335029602, "learning_rate": 5.6809803094498276e-05, "loss": 1.3161, "step": 2243 }, { "epoch": 0.72, "grad_norm": 0.19233067333698273, "learning_rate": 5.668693066275225e-05, "loss": 1.0491, "step": 2244 }, { "epoch": 0.72, "grad_norm": 0.2045384794473648, "learning_rate": 5.656416028789488e-05, "loss": 1.247, "step": 2245 }, { "epoch": 0.72, "grad_norm": 0.21797935664653778, "learning_rate": 5.64414921042006e-05, "loss": 1.1551, "step": 2246 }, { "epoch": 0.72, "grad_norm": 0.2299063801765442, "learning_rate": 5.631892624583206e-05, "loss": 1.1501, "step": 2247 }, { "epoch": 0.72, "grad_norm": 0.23679415881633759, "learning_rate": 5.619646284684015e-05, "loss": 1.3719, "step": 2248 }, { "epoch": 0.72, "grad_norm": 0.5014662742614746, "learning_rate": 5.6074102041163495e-05, "loss": 1.3077, "step": 2249 }, { "epoch": 0.72, "grad_norm": 0.22179429233074188, "learning_rate": 5.5951843962628685e-05, "loss": 1.2199, "step": 2250 }, { "epoch": 0.73, "grad_norm": 0.20018130540847778, "learning_rate": 5.582968874494988e-05, "loss": 1.2382, "step": 2251 }, { "epoch": 0.73, "grad_norm": 0.22466012835502625, "learning_rate": 5.570763652172864e-05, "loss": 1.2542, "step": 2252 }, { "epoch": 0.73, "grad_norm": 0.1851232647895813, "learning_rate": 5.558568742645414e-05, "loss": 0.9919, "step": 2253 }, { "epoch": 0.73, "grad_norm": 0.19465391337871552, "learning_rate": 5.5463841592502475e-05, "loss": 1.19, "step": 2254 }, { "epoch": 0.73, "grad_norm": 0.21929214894771576, "learning_rate": 5.534209915313711e-05, "loss": 0.986, "step": 2255 }, { "epoch": 0.73, "grad_norm": 0.26153960824012756, "learning_rate": 5.5220460241508014e-05, "loss": 1.3899, "step": 2256 }, { "epoch": 0.73, "grad_norm": 0.25255057215690613, "learning_rate": 5.50989249906523e-05, "loss": 1.2628, "step": 2257 }, { "epoch": 0.73, "grad_norm": 0.2175099402666092, "learning_rate": 5.4977493533493606e-05, "loss": 1.2382, "step": 2258 }, { "epoch": 0.73, "grad_norm": 0.23655438423156738, "learning_rate": 5.485616600284193e-05, "loss": 1.1134, "step": 2259 }, { "epoch": 0.73, "grad_norm": 0.21165654063224792, "learning_rate": 5.4734942531393836e-05, "loss": 1.2353, "step": 2260 }, { "epoch": 0.73, "grad_norm": 0.23662541806697845, "learning_rate": 5.461382325173173e-05, "loss": 1.1241, "step": 2261 }, { "epoch": 0.73, "grad_norm": 0.24661824107170105, "learning_rate": 5.4492808296324376e-05, "loss": 1.4047, "step": 2262 }, { "epoch": 0.73, "grad_norm": 0.41220831871032715, "learning_rate": 5.4371897797526384e-05, "loss": 1.2586, "step": 2263 }, { "epoch": 0.73, "grad_norm": 0.18713320791721344, "learning_rate": 5.4251091887577984e-05, "loss": 1.2847, "step": 2264 }, { "epoch": 0.73, "grad_norm": 0.18440790474414825, "learning_rate": 5.413039069860515e-05, "loss": 1.1988, "step": 2265 }, { "epoch": 0.73, "grad_norm": 0.2038755863904953, "learning_rate": 5.400979436261917e-05, "loss": 1.2486, "step": 2266 }, { "epoch": 0.73, "grad_norm": 0.21428117156028748, "learning_rate": 5.388930301151683e-05, "loss": 1.2955, "step": 2267 }, { "epoch": 0.73, "grad_norm": 0.2533896863460541, "learning_rate": 5.376891677708006e-05, "loss": 1.4266, "step": 2268 }, { "epoch": 0.73, "grad_norm": 0.31164658069610596, "learning_rate": 5.364863579097573e-05, "loss": 1.0827, "step": 2269 }, { "epoch": 0.73, "grad_norm": 0.20499534904956818, "learning_rate": 5.352846018475565e-05, "loss": 1.2728, "step": 2270 }, { "epoch": 0.73, "grad_norm": 0.17841042578220367, "learning_rate": 5.3408390089856346e-05, "loss": 0.9878, "step": 2271 }, { "epoch": 0.73, "grad_norm": 0.20634087920188904, "learning_rate": 5.3288425637598994e-05, "loss": 1.3127, "step": 2272 }, { "epoch": 0.73, "grad_norm": 0.2369149774312973, "learning_rate": 5.316856695918928e-05, "loss": 1.3482, "step": 2273 }, { "epoch": 0.73, "grad_norm": 0.34048789739608765, "learning_rate": 5.304881418571708e-05, "loss": 1.471, "step": 2274 }, { "epoch": 0.73, "grad_norm": 0.18917810916900635, "learning_rate": 5.29291674481565e-05, "loss": 1.2982, "step": 2275 }, { "epoch": 0.73, "grad_norm": 0.2298641800880432, "learning_rate": 5.2809626877365605e-05, "loss": 1.031, "step": 2276 }, { "epoch": 0.73, "grad_norm": 0.2993468940258026, "learning_rate": 5.269019260408647e-05, "loss": 1.4191, "step": 2277 }, { "epoch": 0.73, "grad_norm": 0.21320775151252747, "learning_rate": 5.2570864758944905e-05, "loss": 0.8791, "step": 2278 }, { "epoch": 0.73, "grad_norm": 0.20666863024234772, "learning_rate": 5.245164347245021e-05, "loss": 1.2299, "step": 2279 }, { "epoch": 0.73, "grad_norm": 0.22562263906002045, "learning_rate": 5.23325288749952e-05, "loss": 1.2902, "step": 2280 }, { "epoch": 0.73, "grad_norm": 0.23781682550907135, "learning_rate": 5.2213521096855976e-05, "loss": 1.31, "step": 2281 }, { "epoch": 0.74, "grad_norm": 0.24541722238063812, "learning_rate": 5.209462026819189e-05, "loss": 1.2592, "step": 2282 }, { "epoch": 0.74, "grad_norm": 0.24569053947925568, "learning_rate": 5.1975826519045204e-05, "loss": 1.3254, "step": 2283 }, { "epoch": 0.74, "grad_norm": 0.2197752594947815, "learning_rate": 5.185713997934121e-05, "loss": 1.2617, "step": 2284 }, { "epoch": 0.74, "grad_norm": 0.22758296132087708, "learning_rate": 5.173856077888782e-05, "loss": 1.215, "step": 2285 }, { "epoch": 0.74, "grad_norm": 0.20878055691719055, "learning_rate": 5.162008904737556e-05, "loss": 1.1761, "step": 2286 }, { "epoch": 0.74, "grad_norm": 0.28720173239707947, "learning_rate": 5.1501724914377536e-05, "loss": 1.2537, "step": 2287 }, { "epoch": 0.74, "grad_norm": 0.22562849521636963, "learning_rate": 5.1383468509348985e-05, "loss": 1.3169, "step": 2288 }, { "epoch": 0.74, "grad_norm": 0.21605585515499115, "learning_rate": 5.126531996162754e-05, "loss": 1.1182, "step": 2289 }, { "epoch": 0.74, "grad_norm": 0.2085842490196228, "learning_rate": 5.114727940043268e-05, "loss": 1.3689, "step": 2290 }, { "epoch": 0.74, "grad_norm": 0.22275252640247345, "learning_rate": 5.1029346954865834e-05, "loss": 1.2295, "step": 2291 }, { "epoch": 0.74, "grad_norm": 0.247910276055336, "learning_rate": 5.091152275391028e-05, "loss": 1.2635, "step": 2292 }, { "epoch": 0.74, "grad_norm": 0.23480704426765442, "learning_rate": 5.079380692643079e-05, "loss": 1.1203, "step": 2293 }, { "epoch": 0.74, "grad_norm": 0.2220291644334793, "learning_rate": 5.067619960117362e-05, "loss": 1.3663, "step": 2294 }, { "epoch": 0.74, "grad_norm": 0.23014244437217712, "learning_rate": 5.055870090676645e-05, "loss": 1.2658, "step": 2295 }, { "epoch": 0.74, "grad_norm": 0.2805074453353882, "learning_rate": 5.044131097171802e-05, "loss": 1.1951, "step": 2296 }, { "epoch": 0.74, "grad_norm": 0.21681630611419678, "learning_rate": 5.032402992441825e-05, "loss": 1.1008, "step": 2297 }, { "epoch": 0.74, "grad_norm": 0.2018338441848755, "learning_rate": 5.02068578931379e-05, "loss": 1.2777, "step": 2298 }, { "epoch": 0.74, "grad_norm": 0.21056321263313293, "learning_rate": 5.008979500602841e-05, "loss": 1.4109, "step": 2299 }, { "epoch": 0.74, "grad_norm": 0.20418652892112732, "learning_rate": 4.997284139112206e-05, "loss": 1.2862, "step": 2300 }, { "epoch": 0.74, "grad_norm": 0.2665635049343109, "learning_rate": 4.9855997176331406e-05, "loss": 1.2654, "step": 2301 }, { "epoch": 0.74, "grad_norm": 0.2729850709438324, "learning_rate": 4.973926248944955e-05, "loss": 1.3143, "step": 2302 }, { "epoch": 0.74, "grad_norm": 0.19264191389083862, "learning_rate": 4.962263745814964e-05, "loss": 1.0812, "step": 2303 }, { "epoch": 0.74, "grad_norm": 0.18730293214321136, "learning_rate": 4.950612220998489e-05, "loss": 1.3439, "step": 2304 }, { "epoch": 0.74, "grad_norm": 0.2342204451560974, "learning_rate": 4.9389716872388644e-05, "loss": 1.3138, "step": 2305 }, { "epoch": 0.74, "grad_norm": 0.2228112667798996, "learning_rate": 4.927342157267379e-05, "loss": 1.0991, "step": 2306 }, { "epoch": 0.74, "grad_norm": 0.24099193513393402, "learning_rate": 4.915723643803304e-05, "loss": 1.018, "step": 2307 }, { "epoch": 0.74, "grad_norm": 0.27335187792778015, "learning_rate": 4.9041161595538546e-05, "loss": 1.2028, "step": 2308 }, { "epoch": 0.74, "grad_norm": 0.18766868114471436, "learning_rate": 4.892519717214178e-05, "loss": 0.8332, "step": 2309 }, { "epoch": 0.74, "grad_norm": 0.20240958034992218, "learning_rate": 4.88093432946736e-05, "loss": 1.1725, "step": 2310 }, { "epoch": 0.74, "grad_norm": 0.2199336141347885, "learning_rate": 4.869360008984378e-05, "loss": 1.2245, "step": 2311 }, { "epoch": 0.74, "grad_norm": 0.2617591917514801, "learning_rate": 4.857796768424124e-05, "loss": 1.2989, "step": 2312 }, { "epoch": 0.74, "grad_norm": 0.21024799346923828, "learning_rate": 4.8462446204333564e-05, "loss": 1.1573, "step": 2313 }, { "epoch": 0.75, "grad_norm": 0.20323072373867035, "learning_rate": 4.834703577646702e-05, "loss": 1.222, "step": 2314 }, { "epoch": 0.75, "grad_norm": 0.22930362820625305, "learning_rate": 4.823173652686657e-05, "loss": 1.1688, "step": 2315 }, { "epoch": 0.75, "grad_norm": 0.21471363306045532, "learning_rate": 4.8116548581635384e-05, "loss": 1.144, "step": 2316 }, { "epoch": 0.75, "grad_norm": 0.20837542414665222, "learning_rate": 4.800147206675508e-05, "loss": 1.1339, "step": 2317 }, { "epoch": 0.75, "grad_norm": 0.21714484691619873, "learning_rate": 4.788650710808528e-05, "loss": 1.4826, "step": 2318 }, { "epoch": 0.75, "grad_norm": 0.22740092873573303, "learning_rate": 4.777165383136359e-05, "loss": 1.1908, "step": 2319 }, { "epoch": 0.75, "grad_norm": 0.22971788048744202, "learning_rate": 4.765691236220559e-05, "loss": 1.4364, "step": 2320 }, { "epoch": 0.75, "grad_norm": 0.19853341579437256, "learning_rate": 4.754228282610441e-05, "loss": 1.0794, "step": 2321 }, { "epoch": 0.75, "grad_norm": 0.21666325628757477, "learning_rate": 4.742776534843093e-05, "loss": 1.3153, "step": 2322 }, { "epoch": 0.75, "grad_norm": 0.24543672800064087, "learning_rate": 4.731336005443337e-05, "loss": 1.3389, "step": 2323 }, { "epoch": 0.75, "grad_norm": 0.3177873492240906, "learning_rate": 4.71990670692372e-05, "loss": 1.1959, "step": 2324 }, { "epoch": 0.75, "grad_norm": 0.18456320464611053, "learning_rate": 4.7084886517845236e-05, "loss": 1.2588, "step": 2325 }, { "epoch": 0.75, "grad_norm": 0.246647447347641, "learning_rate": 4.697081852513711e-05, "loss": 1.4287, "step": 2326 }, { "epoch": 0.75, "grad_norm": 0.4055507481098175, "learning_rate": 4.685686321586955e-05, "loss": 1.2445, "step": 2327 }, { "epoch": 0.75, "grad_norm": 0.22217895090579987, "learning_rate": 4.674302071467592e-05, "loss": 1.3508, "step": 2328 }, { "epoch": 0.75, "grad_norm": 0.22864960134029388, "learning_rate": 4.662929114606617e-05, "loss": 1.0841, "step": 2329 }, { "epoch": 0.75, "grad_norm": 0.22361540794372559, "learning_rate": 4.6515674634426906e-05, "loss": 1.1453, "step": 2330 }, { "epoch": 0.75, "grad_norm": 0.23902644217014313, "learning_rate": 4.640217130402086e-05, "loss": 1.2111, "step": 2331 }, { "epoch": 0.75, "grad_norm": 0.18916961550712585, "learning_rate": 4.6288781278987206e-05, "loss": 1.3164, "step": 2332 }, { "epoch": 0.75, "grad_norm": 0.21487203240394592, "learning_rate": 4.617550468334101e-05, "loss": 1.3867, "step": 2333 }, { "epoch": 0.75, "grad_norm": 0.2080216407775879, "learning_rate": 4.6062341640973324e-05, "loss": 1.3458, "step": 2334 }, { "epoch": 0.75, "grad_norm": 0.21560664474964142, "learning_rate": 4.594929227565114e-05, "loss": 1.2645, "step": 2335 }, { "epoch": 0.75, "grad_norm": 0.25917938351631165, "learning_rate": 4.583635671101691e-05, "loss": 1.3683, "step": 2336 }, { "epoch": 0.75, "grad_norm": 0.21215267479419708, "learning_rate": 4.57235350705888e-05, "loss": 1.0408, "step": 2337 }, { "epoch": 0.75, "grad_norm": 0.21238918602466583, "learning_rate": 4.561082747776028e-05, "loss": 1.3333, "step": 2338 }, { "epoch": 0.75, "grad_norm": 0.22586382925510406, "learning_rate": 4.549823405580004e-05, "loss": 1.0041, "step": 2339 }, { "epoch": 0.75, "grad_norm": 0.22151890397071838, "learning_rate": 4.538575492785211e-05, "loss": 1.358, "step": 2340 }, { "epoch": 0.75, "grad_norm": 0.18518319725990295, "learning_rate": 4.5273390216935274e-05, "loss": 1.2415, "step": 2341 }, { "epoch": 0.75, "grad_norm": 0.22389277815818787, "learning_rate": 4.51611400459433e-05, "loss": 1.3047, "step": 2342 }, { "epoch": 0.75, "grad_norm": 0.20083646476268768, "learning_rate": 4.5049004537644635e-05, "loss": 1.1721, "step": 2343 }, { "epoch": 0.75, "grad_norm": 0.2227754443883896, "learning_rate": 4.493698381468237e-05, "loss": 1.3452, "step": 2344 }, { "epoch": 0.76, "grad_norm": 0.24828249216079712, "learning_rate": 4.482507799957409e-05, "loss": 0.9559, "step": 2345 }, { "epoch": 0.76, "grad_norm": 0.2133481353521347, "learning_rate": 4.4713287214711585e-05, "loss": 1.2031, "step": 2346 }, { "epoch": 0.76, "grad_norm": 0.33339381217956543, "learning_rate": 4.460161158236091e-05, "loss": 1.1846, "step": 2347 }, { "epoch": 0.76, "grad_norm": 0.2663831114768982, "learning_rate": 4.449005122466212e-05, "loss": 1.2344, "step": 2348 }, { "epoch": 0.76, "grad_norm": 0.22585856914520264, "learning_rate": 4.437860626362927e-05, "loss": 1.2827, "step": 2349 }, { "epoch": 0.76, "grad_norm": 0.1946796029806137, "learning_rate": 4.4267276821150246e-05, "loss": 1.1031, "step": 2350 }, { "epoch": 0.76, "grad_norm": 0.22370725870132446, "learning_rate": 4.415606301898645e-05, "loss": 1.1734, "step": 2351 }, { "epoch": 0.76, "grad_norm": 0.2194320261478424, "learning_rate": 4.4044964978772873e-05, "loss": 1.2697, "step": 2352 }, { "epoch": 0.76, "grad_norm": 0.1999955028295517, "learning_rate": 4.3933982822017876e-05, "loss": 1.0619, "step": 2353 }, { "epoch": 0.76, "grad_norm": 0.24653524160385132, "learning_rate": 4.3823116670103134e-05, "loss": 1.1792, "step": 2354 }, { "epoch": 0.76, "grad_norm": 0.2132009118795395, "learning_rate": 4.371236664428344e-05, "loss": 1.1513, "step": 2355 }, { "epoch": 0.76, "grad_norm": 0.20659105479717255, "learning_rate": 4.360173286568655e-05, "loss": 1.1428, "step": 2356 }, { "epoch": 0.76, "grad_norm": 0.2040523886680603, "learning_rate": 4.349121545531305e-05, "loss": 1.1487, "step": 2357 }, { "epoch": 0.76, "grad_norm": 0.22183774411678314, "learning_rate": 4.338081453403625e-05, "loss": 1.0943, "step": 2358 }, { "epoch": 0.76, "grad_norm": 0.27433106303215027, "learning_rate": 4.327053022260215e-05, "loss": 1.2136, "step": 2359 }, { "epoch": 0.76, "grad_norm": 0.21540331840515137, "learning_rate": 4.3160362641629196e-05, "loss": 1.2362, "step": 2360 }, { "epoch": 0.76, "grad_norm": 0.23283080756664276, "learning_rate": 4.3050311911608076e-05, "loss": 1.2323, "step": 2361 }, { "epoch": 0.76, "grad_norm": 0.5120077729225159, "learning_rate": 4.2940378152901725e-05, "loss": 1.3269, "step": 2362 }, { "epoch": 0.76, "grad_norm": 0.31086596846580505, "learning_rate": 4.283056148574511e-05, "loss": 1.3755, "step": 2363 }, { "epoch": 0.76, "grad_norm": 0.27182891964912415, "learning_rate": 4.2720862030245204e-05, "loss": 1.4046, "step": 2364 }, { "epoch": 0.76, "grad_norm": 0.21352630853652954, "learning_rate": 4.261127990638081e-05, "loss": 1.3841, "step": 2365 }, { "epoch": 0.76, "grad_norm": 0.224158376455307, "learning_rate": 4.2501815234002326e-05, "loss": 1.1311, "step": 2366 }, { "epoch": 0.76, "grad_norm": 0.2029469758272171, "learning_rate": 4.239246813283167e-05, "loss": 1.3216, "step": 2367 }, { "epoch": 0.76, "grad_norm": 0.26960307359695435, "learning_rate": 4.228323872246219e-05, "loss": 1.3646, "step": 2368 }, { "epoch": 0.76, "grad_norm": 0.635570764541626, "learning_rate": 4.2174127122358646e-05, "loss": 1.3226, "step": 2369 }, { "epoch": 0.76, "grad_norm": 0.22573620080947876, "learning_rate": 4.206513345185679e-05, "loss": 1.1269, "step": 2370 }, { "epoch": 0.76, "grad_norm": 0.21843558549880981, "learning_rate": 4.19562578301634e-05, "loss": 1.3758, "step": 2371 }, { "epoch": 0.76, "grad_norm": 0.19524838030338287, "learning_rate": 4.18475003763563e-05, "loss": 1.0311, "step": 2372 }, { "epoch": 0.76, "grad_norm": 0.2102925330400467, "learning_rate": 4.173886120938384e-05, "loss": 0.9954, "step": 2373 }, { "epoch": 0.76, "grad_norm": 0.200016051530838, "learning_rate": 4.163034044806527e-05, "loss": 1.1694, "step": 2374 }, { "epoch": 0.76, "grad_norm": 0.2688789665699005, "learning_rate": 4.152193821109008e-05, "loss": 1.2757, "step": 2375 }, { "epoch": 0.77, "grad_norm": 0.21371038258075714, "learning_rate": 4.1413654617018244e-05, "loss": 1.4424, "step": 2376 }, { "epoch": 0.77, "grad_norm": 0.2660287916660309, "learning_rate": 4.1305489784280074e-05, "loss": 1.2075, "step": 2377 }, { "epoch": 0.77, "grad_norm": 0.22412258386611938, "learning_rate": 4.119744383117576e-05, "loss": 1.4646, "step": 2378 }, { "epoch": 0.77, "grad_norm": 0.22850491106510162, "learning_rate": 4.1089516875875745e-05, "loss": 1.372, "step": 2379 }, { "epoch": 0.77, "grad_norm": 0.22145313024520874, "learning_rate": 4.098170903642011e-05, "loss": 1.3475, "step": 2380 }, { "epoch": 0.77, "grad_norm": 0.20773600041866302, "learning_rate": 4.087402043071868e-05, "loss": 1.1015, "step": 2381 }, { "epoch": 0.77, "grad_norm": 0.21755066514015198, "learning_rate": 4.0766451176551044e-05, "loss": 1.1296, "step": 2382 }, { "epoch": 0.77, "grad_norm": 0.2050647735595703, "learning_rate": 4.065900139156604e-05, "loss": 1.1597, "step": 2383 }, { "epoch": 0.77, "grad_norm": 0.20881879329681396, "learning_rate": 4.055167119328202e-05, "loss": 1.3754, "step": 2384 }, { "epoch": 0.77, "grad_norm": 0.19198527932167053, "learning_rate": 4.044446069908643e-05, "loss": 1.211, "step": 2385 }, { "epoch": 0.77, "grad_norm": 0.21665316820144653, "learning_rate": 4.0337370026235784e-05, "loss": 1.2233, "step": 2386 }, { "epoch": 0.77, "grad_norm": 0.22755710780620575, "learning_rate": 4.0230399291855676e-05, "loss": 1.1378, "step": 2387 }, { "epoch": 0.77, "grad_norm": 0.21291953325271606, "learning_rate": 4.012354861294036e-05, "loss": 1.1464, "step": 2388 }, { "epoch": 0.77, "grad_norm": 0.23778006434440613, "learning_rate": 4.001681810635294e-05, "loss": 1.6265, "step": 2389 }, { "epoch": 0.77, "grad_norm": 0.3385898768901825, "learning_rate": 3.9910207888824986e-05, "loss": 1.2865, "step": 2390 }, { "epoch": 0.77, "grad_norm": 0.23066991567611694, "learning_rate": 3.9803718076956456e-05, "loss": 1.3313, "step": 2391 }, { "epoch": 0.77, "grad_norm": 0.19122926890850067, "learning_rate": 3.969734878721581e-05, "loss": 1.3203, "step": 2392 }, { "epoch": 0.77, "grad_norm": 0.1987881362438202, "learning_rate": 3.959110013593948e-05, "loss": 1.2884, "step": 2393 }, { "epoch": 0.77, "grad_norm": 0.19724521040916443, "learning_rate": 3.9484972239332163e-05, "loss": 1.1261, "step": 2394 }, { "epoch": 0.77, "grad_norm": 0.18060439825057983, "learning_rate": 3.93789652134663e-05, "loss": 1.1655, "step": 2395 }, { "epoch": 0.77, "grad_norm": 0.21575675904750824, "learning_rate": 3.9273079174282204e-05, "loss": 1.0538, "step": 2396 }, { "epoch": 0.77, "grad_norm": 0.20283041894435883, "learning_rate": 3.916731423758791e-05, "loss": 1.3236, "step": 2397 }, { "epoch": 0.77, "grad_norm": 0.24221737682819366, "learning_rate": 3.906167051905894e-05, "loss": 1.3486, "step": 2398 }, { "epoch": 0.77, "grad_norm": 0.20369650423526764, "learning_rate": 3.89561481342383e-05, "loss": 1.2148, "step": 2399 }, { "epoch": 0.77, "grad_norm": 0.24150840938091278, "learning_rate": 3.885074719853624e-05, "loss": 1.3347, "step": 2400 }, { "epoch": 0.77, "grad_norm": 0.19720442593097687, "learning_rate": 3.874546782723015e-05, "loss": 1.2501, "step": 2401 }, { "epoch": 0.77, "grad_norm": 0.287729412317276, "learning_rate": 3.8640310135464594e-05, "loss": 1.1181, "step": 2402 }, { "epoch": 0.77, "grad_norm": 0.23758263885974884, "learning_rate": 3.8535274238250906e-05, "loss": 1.3084, "step": 2403 }, { "epoch": 0.77, "grad_norm": 0.24509981274604797, "learning_rate": 3.843036025046733e-05, "loss": 1.3768, "step": 2404 }, { "epoch": 0.77, "grad_norm": 0.2398320883512497, "learning_rate": 3.832556828685873e-05, "loss": 1.1834, "step": 2405 }, { "epoch": 0.77, "grad_norm": 0.26266220211982727, "learning_rate": 3.822089846203642e-05, "loss": 1.2686, "step": 2406 }, { "epoch": 0.78, "grad_norm": 0.2453865110874176, "learning_rate": 3.811635089047833e-05, "loss": 1.2088, "step": 2407 }, { "epoch": 0.78, "grad_norm": 0.1843041628599167, "learning_rate": 3.801192568652848e-05, "loss": 1.1827, "step": 2408 }, { "epoch": 0.78, "grad_norm": 0.2275291234254837, "learning_rate": 3.790762296439722e-05, "loss": 1.2321, "step": 2409 }, { "epoch": 0.78, "grad_norm": 0.21033547818660736, "learning_rate": 3.780344283816084e-05, "loss": 1.0856, "step": 2410 }, { "epoch": 0.78, "grad_norm": 0.39550963044166565, "learning_rate": 3.769938542176152e-05, "loss": 1.2315, "step": 2411 }, { "epoch": 0.78, "grad_norm": 0.19324235618114471, "learning_rate": 3.7595450829007394e-05, "loss": 1.2457, "step": 2412 }, { "epoch": 0.78, "grad_norm": 0.21405427157878876, "learning_rate": 3.749163917357204e-05, "loss": 1.0642, "step": 2413 }, { "epoch": 0.78, "grad_norm": 0.2989640235900879, "learning_rate": 3.738795056899481e-05, "loss": 1.1741, "step": 2414 }, { "epoch": 0.78, "grad_norm": 0.25388088822364807, "learning_rate": 3.728438512868028e-05, "loss": 1.2472, "step": 2415 }, { "epoch": 0.78, "grad_norm": 0.21443837881088257, "learning_rate": 3.7180942965898365e-05, "loss": 1.3116, "step": 2416 }, { "epoch": 0.78, "grad_norm": 0.3156970143318176, "learning_rate": 3.70776241937843e-05, "loss": 1.3822, "step": 2417 }, { "epoch": 0.78, "grad_norm": 0.196189746260643, "learning_rate": 3.697442892533816e-05, "loss": 1.0679, "step": 2418 }, { "epoch": 0.78, "grad_norm": 0.27412328124046326, "learning_rate": 3.687135727342504e-05, "loss": 1.424, "step": 2419 }, { "epoch": 0.78, "grad_norm": 0.23566727340221405, "learning_rate": 3.676840935077489e-05, "loss": 1.2325, "step": 2420 }, { "epoch": 0.78, "grad_norm": 0.20569080114364624, "learning_rate": 3.66655852699822e-05, "loss": 1.2771, "step": 2421 }, { "epoch": 0.78, "grad_norm": 0.2277299463748932, "learning_rate": 3.656288514350617e-05, "loss": 1.1911, "step": 2422 }, { "epoch": 0.78, "grad_norm": 0.23339278995990753, "learning_rate": 3.646030908367032e-05, "loss": 1.3509, "step": 2423 }, { "epoch": 0.78, "grad_norm": 0.24868783354759216, "learning_rate": 3.6357857202662453e-05, "loss": 1.3095, "step": 2424 }, { "epoch": 0.78, "grad_norm": 0.24752821028232574, "learning_rate": 3.625552961253473e-05, "loss": 1.4324, "step": 2425 }, { "epoch": 0.78, "grad_norm": 0.22618141770362854, "learning_rate": 3.6153326425203116e-05, "loss": 1.288, "step": 2426 }, { "epoch": 0.78, "grad_norm": 0.20129132270812988, "learning_rate": 3.605124775244781e-05, "loss": 1.1299, "step": 2427 }, { "epoch": 0.78, "grad_norm": 0.2118718922138214, "learning_rate": 3.594929370591257e-05, "loss": 1.1039, "step": 2428 }, { "epoch": 0.78, "grad_norm": 0.350980669260025, "learning_rate": 3.584746439710501e-05, "loss": 1.181, "step": 2429 }, { "epoch": 0.78, "grad_norm": 0.20620863139629364, "learning_rate": 3.5745759937396184e-05, "loss": 1.3246, "step": 2430 }, { "epoch": 0.78, "grad_norm": 0.2305171936750412, "learning_rate": 3.564418043802075e-05, "loss": 1.118, "step": 2431 }, { "epoch": 0.78, "grad_norm": 0.2044614553451538, "learning_rate": 3.554272601007665e-05, "loss": 1.2299, "step": 2432 }, { "epoch": 0.78, "grad_norm": 0.2271304428577423, "learning_rate": 3.5441396764524984e-05, "loss": 1.2613, "step": 2433 }, { "epoch": 0.78, "grad_norm": 0.210028275847435, "learning_rate": 3.534019281218995e-05, "loss": 1.3357, "step": 2434 }, { "epoch": 0.78, "grad_norm": 0.20889489352703094, "learning_rate": 3.523911426375869e-05, "loss": 1.3216, "step": 2435 }, { "epoch": 0.78, "grad_norm": 0.19610823690891266, "learning_rate": 3.513816122978127e-05, "loss": 1.1316, "step": 2436 }, { "epoch": 0.78, "grad_norm": 0.2037346214056015, "learning_rate": 3.503733382067049e-05, "loss": 1.3857, "step": 2437 }, { "epoch": 0.79, "grad_norm": 0.2666509449481964, "learning_rate": 3.4936632146701635e-05, "loss": 1.3892, "step": 2438 }, { "epoch": 0.79, "grad_norm": 0.20371995866298676, "learning_rate": 3.483605631801259e-05, "loss": 1.0481, "step": 2439 }, { "epoch": 0.79, "grad_norm": 0.2127661406993866, "learning_rate": 3.473560644460347e-05, "loss": 1.2128, "step": 2440 }, { "epoch": 0.79, "grad_norm": 0.21213097870349884, "learning_rate": 3.463528263633676e-05, "loss": 0.9826, "step": 2441 }, { "epoch": 0.79, "grad_norm": 0.21129171550273895, "learning_rate": 3.4535085002937076e-05, "loss": 1.2809, "step": 2442 }, { "epoch": 0.79, "grad_norm": 0.210170179605484, "learning_rate": 3.4435013653990954e-05, "loss": 1.2875, "step": 2443 }, { "epoch": 0.79, "grad_norm": 0.23691099882125854, "learning_rate": 3.433506869894681e-05, "loss": 1.1098, "step": 2444 }, { "epoch": 0.79, "grad_norm": 0.23637908697128296, "learning_rate": 3.423525024711484e-05, "loss": 1.1998, "step": 2445 }, { "epoch": 0.79, "grad_norm": 0.21661993861198425, "learning_rate": 3.4135558407666926e-05, "loss": 1.128, "step": 2446 }, { "epoch": 0.79, "grad_norm": 0.2377147227525711, "learning_rate": 3.403599328963652e-05, "loss": 1.107, "step": 2447 }, { "epoch": 0.79, "grad_norm": 0.24793460965156555, "learning_rate": 3.393655500191825e-05, "loss": 1.2793, "step": 2448 }, { "epoch": 0.79, "grad_norm": 0.24005292356014252, "learning_rate": 3.383724365326829e-05, "loss": 1.4756, "step": 2449 }, { "epoch": 0.79, "grad_norm": 0.21006405353546143, "learning_rate": 3.37380593523038e-05, "loss": 1.4066, "step": 2450 }, { "epoch": 0.79, "grad_norm": 0.31929394602775574, "learning_rate": 3.3639002207503095e-05, "loss": 1.1919, "step": 2451 }, { "epoch": 0.79, "grad_norm": 0.3132856488227844, "learning_rate": 3.354007232720548e-05, "loss": 1.188, "step": 2452 }, { "epoch": 0.79, "grad_norm": 0.22735187411308289, "learning_rate": 3.3441269819610774e-05, "loss": 1.2051, "step": 2453 }, { "epoch": 0.79, "grad_norm": 0.22491632401943207, "learning_rate": 3.334259479277985e-05, "loss": 1.4106, "step": 2454 }, { "epoch": 0.79, "grad_norm": 0.23871050775051117, "learning_rate": 3.3244047354633875e-05, "loss": 1.1021, "step": 2455 }, { "epoch": 0.79, "grad_norm": 0.34298884868621826, "learning_rate": 3.31456276129547e-05, "loss": 1.2357, "step": 2456 }, { "epoch": 0.79, "grad_norm": 0.20800204575061798, "learning_rate": 3.304733567538434e-05, "loss": 1.1059, "step": 2457 }, { "epoch": 0.79, "grad_norm": 0.21702982485294342, "learning_rate": 3.294917164942507e-05, "loss": 0.9711, "step": 2458 }, { "epoch": 0.79, "grad_norm": 0.20532673597335815, "learning_rate": 3.2851135642439346e-05, "loss": 1.3331, "step": 2459 }, { "epoch": 0.79, "grad_norm": 0.20005390048027039, "learning_rate": 3.2753227761649496e-05, "loss": 1.2255, "step": 2460 }, { "epoch": 0.79, "grad_norm": 0.33468469977378845, "learning_rate": 3.265544811413784e-05, "loss": 1.1606, "step": 2461 }, { "epoch": 0.79, "grad_norm": 0.18394415080547333, "learning_rate": 3.255779680684634e-05, "loss": 1.062, "step": 2462 }, { "epoch": 0.79, "grad_norm": 0.24787510931491852, "learning_rate": 3.246027394657661e-05, "loss": 1.13, "step": 2463 }, { "epoch": 0.79, "grad_norm": 0.2108093798160553, "learning_rate": 3.2362879639989876e-05, "loss": 1.2408, "step": 2464 }, { "epoch": 0.79, "grad_norm": 0.2239259034395218, "learning_rate": 3.22656139936066e-05, "loss": 1.2249, "step": 2465 }, { "epoch": 0.79, "grad_norm": 0.22281751036643982, "learning_rate": 3.2168477113806716e-05, "loss": 1.3052, "step": 2466 }, { "epoch": 0.79, "grad_norm": 0.2006983608007431, "learning_rate": 3.2071469106829186e-05, "loss": 1.2646, "step": 2467 }, { "epoch": 0.79, "grad_norm": 0.21356245875358582, "learning_rate": 3.1974590078772014e-05, "loss": 1.2613, "step": 2468 }, { "epoch": 0.8, "grad_norm": 0.19806163012981415, "learning_rate": 3.187784013559229e-05, "loss": 1.137, "step": 2469 }, { "epoch": 0.8, "grad_norm": 0.24663065373897552, "learning_rate": 3.178121938310573e-05, "loss": 1.0474, "step": 2470 }, { "epoch": 0.8, "grad_norm": 0.2348141223192215, "learning_rate": 3.1684727926986954e-05, "loss": 1.3446, "step": 2471 }, { "epoch": 0.8, "grad_norm": 0.2364383041858673, "learning_rate": 3.1588365872769015e-05, "loss": 1.3311, "step": 2472 }, { "epoch": 0.8, "grad_norm": 0.38134604692459106, "learning_rate": 3.1492133325843424e-05, "loss": 1.3247, "step": 2473 }, { "epoch": 0.8, "grad_norm": 0.22544440627098083, "learning_rate": 3.1396030391460225e-05, "loss": 1.166, "step": 2474 }, { "epoch": 0.8, "grad_norm": 0.22395560145378113, "learning_rate": 3.13000571747275e-05, "loss": 1.324, "step": 2475 }, { "epoch": 0.8, "grad_norm": 0.20989082753658295, "learning_rate": 3.120421378061165e-05, "loss": 0.8664, "step": 2476 }, { "epoch": 0.8, "grad_norm": 0.23963458836078644, "learning_rate": 3.110850031393692e-05, "loss": 1.2911, "step": 2477 }, { "epoch": 0.8, "grad_norm": 0.20695413649082184, "learning_rate": 3.101291687938549e-05, "loss": 1.1918, "step": 2478 }, { "epoch": 0.8, "grad_norm": 0.22519661486148834, "learning_rate": 3.091746358149746e-05, "loss": 1.1968, "step": 2479 }, { "epoch": 0.8, "grad_norm": 0.198447585105896, "learning_rate": 3.0822140524670395e-05, "loss": 1.4094, "step": 2480 }, { "epoch": 0.8, "grad_norm": 0.18947833776474, "learning_rate": 3.072694781315959e-05, "loss": 1.203, "step": 2481 }, { "epoch": 0.8, "grad_norm": 0.2958656847476959, "learning_rate": 3.06318855510777e-05, "loss": 1.0724, "step": 2482 }, { "epoch": 0.8, "grad_norm": 0.22111733257770538, "learning_rate": 3.053695384239462e-05, "loss": 1.337, "step": 2483 }, { "epoch": 0.8, "grad_norm": 0.19868934154510498, "learning_rate": 3.044215279093769e-05, "loss": 0.9382, "step": 2484 }, { "epoch": 0.8, "grad_norm": 0.2371215969324112, "learning_rate": 3.0347482500391074e-05, "loss": 1.4108, "step": 2485 }, { "epoch": 0.8, "grad_norm": 0.20389240980148315, "learning_rate": 3.025294307429618e-05, "loss": 1.259, "step": 2486 }, { "epoch": 0.8, "grad_norm": 0.20793108642101288, "learning_rate": 3.0158534616051107e-05, "loss": 1.0685, "step": 2487 }, { "epoch": 0.8, "grad_norm": 0.22384586930274963, "learning_rate": 3.0064257228910747e-05, "loss": 1.1628, "step": 2488 }, { "epoch": 0.8, "grad_norm": 0.20597702264785767, "learning_rate": 2.9970111015986747e-05, "loss": 1.2652, "step": 2489 }, { "epoch": 0.8, "grad_norm": 0.21245965361595154, "learning_rate": 2.9876096080247118e-05, "loss": 1.2907, "step": 2490 }, { "epoch": 0.8, "grad_norm": 0.21153417229652405, "learning_rate": 2.9782212524516462e-05, "loss": 1.1427, "step": 2491 }, { "epoch": 0.8, "grad_norm": 0.19052162766456604, "learning_rate": 2.9688460451475594e-05, "loss": 1.0911, "step": 2492 }, { "epoch": 0.8, "grad_norm": 0.2168237268924713, "learning_rate": 2.959483996366147e-05, "loss": 1.3144, "step": 2493 }, { "epoch": 0.8, "grad_norm": 0.24548564851284027, "learning_rate": 2.9501351163467307e-05, "loss": 1.2923, "step": 2494 }, { "epoch": 0.8, "grad_norm": 0.21204926073551178, "learning_rate": 2.9407994153142094e-05, "loss": 1.3452, "step": 2495 }, { "epoch": 0.8, "grad_norm": 0.2632032036781311, "learning_rate": 2.9314769034790873e-05, "loss": 1.2473, "step": 2496 }, { "epoch": 0.8, "grad_norm": 0.21274328231811523, "learning_rate": 2.9221675910374305e-05, "loss": 1.0749, "step": 2497 }, { "epoch": 0.8, "grad_norm": 0.20457890629768372, "learning_rate": 2.912871488170867e-05, "loss": 1.3203, "step": 2498 }, { "epoch": 0.8, "grad_norm": 0.21552684903144836, "learning_rate": 2.903588605046591e-05, "loss": 1.2973, "step": 2499 }, { "epoch": 0.81, "grad_norm": 0.26007795333862305, "learning_rate": 2.894318951817325e-05, "loss": 1.3835, "step": 2500 }, { "epoch": 0.81, "grad_norm": 0.2154267132282257, "learning_rate": 2.885062538621325e-05, "loss": 1.1062, "step": 2501 }, { "epoch": 0.81, "grad_norm": 0.2170044183731079, "learning_rate": 2.8758193755823765e-05, "loss": 1.1485, "step": 2502 }, { "epoch": 0.81, "grad_norm": 0.2677794396877289, "learning_rate": 2.866589472809755e-05, "loss": 1.2855, "step": 2503 }, { "epoch": 0.81, "grad_norm": 0.20414406061172485, "learning_rate": 2.8573728403982532e-05, "loss": 1.1017, "step": 2504 }, { "epoch": 0.81, "grad_norm": 0.20020079612731934, "learning_rate": 2.8481694884281332e-05, "loss": 1.3114, "step": 2505 }, { "epoch": 0.81, "grad_norm": 0.20823369920253754, "learning_rate": 2.8389794269651378e-05, "loss": 1.2549, "step": 2506 }, { "epoch": 0.81, "grad_norm": 0.21405576169490814, "learning_rate": 2.829802666060482e-05, "loss": 1.3788, "step": 2507 }, { "epoch": 0.81, "grad_norm": 0.20739121735095978, "learning_rate": 2.8206392157508195e-05, "loss": 1.0523, "step": 2508 }, { "epoch": 0.81, "grad_norm": 0.22200918197631836, "learning_rate": 2.811489086058261e-05, "loss": 1.054, "step": 2509 }, { "epoch": 0.81, "grad_norm": 0.21838872134685516, "learning_rate": 2.802352286990337e-05, "loss": 1.4429, "step": 2510 }, { "epoch": 0.81, "grad_norm": 0.18278439342975616, "learning_rate": 2.7932288285399983e-05, "loss": 1.2052, "step": 2511 }, { "epoch": 0.81, "grad_norm": 0.19671063125133514, "learning_rate": 2.7841187206856184e-05, "loss": 1.417, "step": 2512 }, { "epoch": 0.81, "grad_norm": 0.2500137388706207, "learning_rate": 2.7750219733909508e-05, "loss": 1.159, "step": 2513 }, { "epoch": 0.81, "grad_norm": 0.2250618040561676, "learning_rate": 2.765938596605153e-05, "loss": 1.0511, "step": 2514 }, { "epoch": 0.81, "grad_norm": 0.2352951467037201, "learning_rate": 2.7568686002627484e-05, "loss": 1.205, "step": 2515 }, { "epoch": 0.81, "grad_norm": 0.1804763674736023, "learning_rate": 2.7478119942836286e-05, "loss": 1.2203, "step": 2516 }, { "epoch": 0.81, "grad_norm": 0.20908230543136597, "learning_rate": 2.7387687885730386e-05, "loss": 1.1817, "step": 2517 }, { "epoch": 0.81, "grad_norm": 0.20371294021606445, "learning_rate": 2.729738993021572e-05, "loss": 1.3029, "step": 2518 }, { "epoch": 0.81, "grad_norm": 0.20894916355609894, "learning_rate": 2.7207226175051583e-05, "loss": 1.4606, "step": 2519 }, { "epoch": 0.81, "grad_norm": 0.23562423884868622, "learning_rate": 2.71171967188504e-05, "loss": 1.3362, "step": 2520 }, { "epoch": 0.81, "grad_norm": 0.26470446586608887, "learning_rate": 2.702730166007775e-05, "loss": 1.2516, "step": 2521 }, { "epoch": 0.81, "grad_norm": 0.25547870993614197, "learning_rate": 2.693754109705222e-05, "loss": 1.1291, "step": 2522 }, { "epoch": 0.81, "grad_norm": 0.18754644691944122, "learning_rate": 2.6847915127945325e-05, "loss": 0.9233, "step": 2523 }, { "epoch": 0.81, "grad_norm": 0.21381473541259766, "learning_rate": 2.6758423850781425e-05, "loss": 1.1401, "step": 2524 }, { "epoch": 0.81, "grad_norm": 0.22489571571350098, "learning_rate": 2.666906736343738e-05, "loss": 1.2579, "step": 2525 }, { "epoch": 0.81, "grad_norm": 0.1810484379529953, "learning_rate": 2.6579845763642817e-05, "loss": 1.2756, "step": 2526 }, { "epoch": 0.81, "grad_norm": 0.22160746157169342, "learning_rate": 2.6490759148979724e-05, "loss": 1.4737, "step": 2527 }, { "epoch": 0.81, "grad_norm": 0.23212210834026337, "learning_rate": 2.640180761688253e-05, "loss": 1.4928, "step": 2528 }, { "epoch": 0.81, "grad_norm": 0.2161523401737213, "learning_rate": 2.6312991264637967e-05, "loss": 1.376, "step": 2529 }, { "epoch": 0.81, "grad_norm": 0.20982767641544342, "learning_rate": 2.6224310189384695e-05, "loss": 1.3216, "step": 2530 }, { "epoch": 0.82, "grad_norm": 0.2121298909187317, "learning_rate": 2.613576448811368e-05, "loss": 1.2927, "step": 2531 }, { "epoch": 0.82, "grad_norm": 0.21585653722286224, "learning_rate": 2.6047354257667658e-05, "loss": 1.2692, "step": 2532 }, { "epoch": 0.82, "grad_norm": 0.3144233524799347, "learning_rate": 2.5959079594741276e-05, "loss": 1.4917, "step": 2533 }, { "epoch": 0.82, "grad_norm": 0.19031038880348206, "learning_rate": 2.5870940595880996e-05, "loss": 1.3139, "step": 2534 }, { "epoch": 0.82, "grad_norm": 0.25078848004341125, "learning_rate": 2.578293735748465e-05, "loss": 1.5709, "step": 2535 }, { "epoch": 0.82, "grad_norm": 0.2186817228794098, "learning_rate": 2.5695069975801853e-05, "loss": 1.0687, "step": 2536 }, { "epoch": 0.82, "grad_norm": 0.23858857154846191, "learning_rate": 2.5607338546933425e-05, "loss": 1.3644, "step": 2537 }, { "epoch": 0.82, "grad_norm": 0.21974505484104156, "learning_rate": 2.5519743166831653e-05, "loss": 1.1776, "step": 2538 }, { "epoch": 0.82, "grad_norm": 0.20883122086524963, "learning_rate": 2.5432283931300044e-05, "loss": 1.2405, "step": 2539 }, { "epoch": 0.82, "grad_norm": 0.20382799208164215, "learning_rate": 2.5344960935992975e-05, "loss": 1.1703, "step": 2540 }, { "epoch": 0.82, "grad_norm": 0.20374338328838348, "learning_rate": 2.525777427641606e-05, "loss": 1.3938, "step": 2541 }, { "epoch": 0.82, "grad_norm": 0.20455406606197357, "learning_rate": 2.5170724047925638e-05, "loss": 1.1289, "step": 2542 }, { "epoch": 0.82, "grad_norm": 0.20010049641132355, "learning_rate": 2.508381034572899e-05, "loss": 1.0929, "step": 2543 }, { "epoch": 0.82, "grad_norm": 0.2060403823852539, "learning_rate": 2.4997033264883938e-05, "loss": 1.3615, "step": 2544 }, { "epoch": 0.82, "grad_norm": 0.23262515664100647, "learning_rate": 2.4910392900298915e-05, "loss": 1.1833, "step": 2545 }, { "epoch": 0.82, "grad_norm": 1.1468894481658936, "learning_rate": 2.482388934673289e-05, "loss": 1.2003, "step": 2546 }, { "epoch": 0.82, "grad_norm": 0.21295364201068878, "learning_rate": 2.4737522698795116e-05, "loss": 1.1367, "step": 2547 }, { "epoch": 0.82, "grad_norm": 0.20371927320957184, "learning_rate": 2.4651293050945196e-05, "loss": 1.1715, "step": 2548 }, { "epoch": 0.82, "grad_norm": 0.21592770516872406, "learning_rate": 2.456520049749283e-05, "loss": 1.2073, "step": 2549 }, { "epoch": 0.82, "grad_norm": 0.2039128690958023, "learning_rate": 2.447924513259778e-05, "loss": 1.1398, "step": 2550 }, { "epoch": 0.82, "grad_norm": 0.21094568073749542, "learning_rate": 2.439342705026981e-05, "loss": 1.2381, "step": 2551 }, { "epoch": 0.82, "grad_norm": 0.22174948453903198, "learning_rate": 2.430774634436849e-05, "loss": 1.4054, "step": 2552 }, { "epoch": 0.82, "grad_norm": 0.21741759777069092, "learning_rate": 2.4222203108603195e-05, "loss": 1.1437, "step": 2553 }, { "epoch": 0.82, "grad_norm": 0.7423094511032104, "learning_rate": 2.4136797436532884e-05, "loss": 1.2135, "step": 2554 }, { "epoch": 0.82, "grad_norm": 0.2253720611333847, "learning_rate": 2.4051529421566056e-05, "loss": 1.2639, "step": 2555 }, { "epoch": 0.82, "grad_norm": 0.22494401037693024, "learning_rate": 2.3966399156960798e-05, "loss": 1.092, "step": 2556 }, { "epoch": 0.82, "grad_norm": 0.20783503353595734, "learning_rate": 2.388140673582431e-05, "loss": 1.2838, "step": 2557 }, { "epoch": 0.82, "grad_norm": 0.20407342910766602, "learning_rate": 2.3796552251113255e-05, "loss": 1.0007, "step": 2558 }, { "epoch": 0.82, "grad_norm": 0.24243716895580292, "learning_rate": 2.371183579563328e-05, "loss": 1.0674, "step": 2559 }, { "epoch": 0.82, "grad_norm": 0.23078884184360504, "learning_rate": 2.3627257462039094e-05, "loss": 1.2737, "step": 2560 }, { "epoch": 0.82, "grad_norm": 0.25055992603302, "learning_rate": 2.3542817342834462e-05, "loss": 1.2391, "step": 2561 }, { "epoch": 0.83, "grad_norm": 0.3825729489326477, "learning_rate": 2.3458515530371774e-05, "loss": 1.1916, "step": 2562 }, { "epoch": 0.83, "grad_norm": 0.21512942016124725, "learning_rate": 2.3374352116852372e-05, "loss": 1.1402, "step": 2563 }, { "epoch": 0.83, "grad_norm": 0.23286530375480652, "learning_rate": 2.3290327194326096e-05, "loss": 1.3249, "step": 2564 }, { "epoch": 0.83, "grad_norm": 0.21273455023765564, "learning_rate": 2.32064408546913e-05, "loss": 1.2594, "step": 2565 }, { "epoch": 0.83, "grad_norm": 0.22298014163970947, "learning_rate": 2.312269318969489e-05, "loss": 1.1231, "step": 2566 }, { "epoch": 0.83, "grad_norm": 0.2723322808742523, "learning_rate": 2.303908429093197e-05, "loss": 1.3096, "step": 2567 }, { "epoch": 0.83, "grad_norm": 0.23351716995239258, "learning_rate": 2.2955614249846004e-05, "loss": 1.4166, "step": 2568 }, { "epoch": 0.83, "grad_norm": 0.20691895484924316, "learning_rate": 2.2872283157728487e-05, "loss": 1.1681, "step": 2569 }, { "epoch": 0.83, "grad_norm": 0.20736005902290344, "learning_rate": 2.278909110571893e-05, "loss": 1.1639, "step": 2570 }, { "epoch": 0.83, "grad_norm": 0.21406172215938568, "learning_rate": 2.270603818480492e-05, "loss": 1.3668, "step": 2571 }, { "epoch": 0.83, "grad_norm": 0.23492351174354553, "learning_rate": 2.2623124485821692e-05, "loss": 1.474, "step": 2572 }, { "epoch": 0.83, "grad_norm": 0.22889940440654755, "learning_rate": 2.254035009945236e-05, "loss": 1.3855, "step": 2573 }, { "epoch": 0.83, "grad_norm": 0.2279137820005417, "learning_rate": 2.2457715116227602e-05, "loss": 1.1441, "step": 2574 }, { "epoch": 0.83, "grad_norm": 0.25462624430656433, "learning_rate": 2.2375219626525583e-05, "loss": 1.3248, "step": 2575 }, { "epoch": 0.83, "grad_norm": 0.23538081347942352, "learning_rate": 2.2292863720572036e-05, "loss": 0.9822, "step": 2576 }, { "epoch": 0.83, "grad_norm": 0.19967021048069, "learning_rate": 2.2210647488439932e-05, "loss": 1.2378, "step": 2577 }, { "epoch": 0.83, "grad_norm": 0.21117240190505981, "learning_rate": 2.2128571020049457e-05, "loss": 1.3326, "step": 2578 }, { "epoch": 0.83, "grad_norm": 0.22723238170146942, "learning_rate": 2.2046634405168056e-05, "loss": 1.0716, "step": 2579 }, { "epoch": 0.83, "grad_norm": 0.23386487364768982, "learning_rate": 2.1964837733410062e-05, "loss": 1.1747, "step": 2580 }, { "epoch": 0.83, "grad_norm": 0.20886430144309998, "learning_rate": 2.1883181094236914e-05, "loss": 1.3171, "step": 2581 }, { "epoch": 0.83, "grad_norm": 0.3096659183502197, "learning_rate": 2.1801664576956783e-05, "loss": 1.1957, "step": 2582 }, { "epoch": 0.83, "grad_norm": 0.23158787190914154, "learning_rate": 2.172028827072456e-05, "loss": 1.2041, "step": 2583 }, { "epoch": 0.83, "grad_norm": 0.2822292149066925, "learning_rate": 2.1639052264541952e-05, "loss": 0.9835, "step": 2584 }, { "epoch": 0.83, "grad_norm": 0.24923059344291687, "learning_rate": 2.155795664725703e-05, "loss": 1.2257, "step": 2585 }, { "epoch": 0.83, "grad_norm": 0.21424978971481323, "learning_rate": 2.1477001507564467e-05, "loss": 1.1995, "step": 2586 }, { "epoch": 0.83, "grad_norm": 0.22909684479236603, "learning_rate": 2.1396186934005205e-05, "loss": 1.3467, "step": 2587 }, { "epoch": 0.83, "grad_norm": 0.2019147425889969, "learning_rate": 2.131551301496644e-05, "loss": 1.0408, "step": 2588 }, { "epoch": 0.83, "grad_norm": 0.19620494544506073, "learning_rate": 2.1234979838681628e-05, "loss": 1.299, "step": 2589 }, { "epoch": 0.83, "grad_norm": 0.23002435266971588, "learning_rate": 2.115458749323019e-05, "loss": 1.3114, "step": 2590 }, { "epoch": 0.83, "grad_norm": 0.20682118833065033, "learning_rate": 2.1074336066537617e-05, "loss": 1.0677, "step": 2591 }, { "epoch": 0.83, "grad_norm": 0.287730872631073, "learning_rate": 2.0994225646375197e-05, "loss": 1.2005, "step": 2592 }, { "epoch": 0.84, "grad_norm": 0.22069893777370453, "learning_rate": 2.0914256320360007e-05, "loss": 1.0104, "step": 2593 }, { "epoch": 0.84, "grad_norm": 0.21730034053325653, "learning_rate": 2.0834428175954854e-05, "loss": 1.1928, "step": 2594 }, { "epoch": 0.84, "grad_norm": 0.23702941834926605, "learning_rate": 2.0754741300468098e-05, "loss": 1.1477, "step": 2595 }, { "epoch": 0.84, "grad_norm": 0.23137353360652924, "learning_rate": 2.0675195781053622e-05, "loss": 1.3375, "step": 2596 }, { "epoch": 0.84, "grad_norm": 0.1918213814496994, "learning_rate": 2.059579170471067e-05, "loss": 1.3993, "step": 2597 }, { "epoch": 0.84, "grad_norm": 0.2209881991147995, "learning_rate": 2.0516529158283785e-05, "loss": 1.1346, "step": 2598 }, { "epoch": 0.84, "grad_norm": 0.22647030651569366, "learning_rate": 2.0437408228462808e-05, "loss": 1.262, "step": 2599 }, { "epoch": 0.84, "grad_norm": 0.20189087092876434, "learning_rate": 2.0358429001782538e-05, "loss": 1.3406, "step": 2600 }, { "epoch": 0.84, "grad_norm": 0.20793788135051727, "learning_rate": 2.027959156462301e-05, "loss": 1.2345, "step": 2601 }, { "epoch": 0.84, "grad_norm": 0.21970579028129578, "learning_rate": 2.02008960032089e-05, "loss": 1.3531, "step": 2602 }, { "epoch": 0.84, "grad_norm": 0.21924223005771637, "learning_rate": 2.012234240360996e-05, "loss": 0.9896, "step": 2603 }, { "epoch": 0.84, "grad_norm": 0.20955730974674225, "learning_rate": 2.0043930851740548e-05, "loss": 1.2606, "step": 2604 }, { "epoch": 0.84, "grad_norm": 0.22648537158966064, "learning_rate": 1.9965661433359716e-05, "loss": 1.189, "step": 2605 }, { "epoch": 0.84, "grad_norm": 0.2713281512260437, "learning_rate": 1.9887534234071116e-05, "loss": 1.252, "step": 2606 }, { "epoch": 0.84, "grad_norm": 0.22715206444263458, "learning_rate": 1.9809549339322674e-05, "loss": 1.3225, "step": 2607 }, { "epoch": 0.84, "grad_norm": 0.2335875928401947, "learning_rate": 1.9731706834406853e-05, "loss": 1.4297, "step": 2608 }, { "epoch": 0.84, "grad_norm": 0.21881718933582306, "learning_rate": 1.9654006804460287e-05, "loss": 1.43, "step": 2609 }, { "epoch": 0.84, "grad_norm": 0.23299317061901093, "learning_rate": 1.957644933446385e-05, "loss": 1.1395, "step": 2610 }, { "epoch": 0.84, "grad_norm": 0.22964709997177124, "learning_rate": 1.9499034509242507e-05, "loss": 1.0305, "step": 2611 }, { "epoch": 0.84, "grad_norm": 0.2163316160440445, "learning_rate": 1.9421762413465076e-05, "loss": 1.2735, "step": 2612 }, { "epoch": 0.84, "grad_norm": 0.22118829190731049, "learning_rate": 1.934463313164444e-05, "loss": 1.2564, "step": 2613 }, { "epoch": 0.84, "grad_norm": 0.22809793055057526, "learning_rate": 1.926764674813715e-05, "loss": 1.0224, "step": 2614 }, { "epoch": 0.84, "grad_norm": 0.2405829131603241, "learning_rate": 1.9190803347143573e-05, "loss": 1.3131, "step": 2615 }, { "epoch": 0.84, "grad_norm": 0.2206529676914215, "learning_rate": 1.9114103012707716e-05, "loss": 1.1677, "step": 2616 }, { "epoch": 0.84, "grad_norm": 0.21017585694789886, "learning_rate": 1.9037545828716922e-05, "loss": 1.23, "step": 2617 }, { "epoch": 0.84, "grad_norm": 0.21250700950622559, "learning_rate": 1.896113187890218e-05, "loss": 0.9827, "step": 2618 }, { "epoch": 0.84, "grad_norm": 0.22042091190814972, "learning_rate": 1.888486124683769e-05, "loss": 0.9908, "step": 2619 }, { "epoch": 0.84, "grad_norm": 0.22527514398097992, "learning_rate": 1.8808734015941006e-05, "loss": 1.337, "step": 2620 }, { "epoch": 0.84, "grad_norm": 0.24068433046340942, "learning_rate": 1.873275026947283e-05, "loss": 1.3501, "step": 2621 }, { "epoch": 0.84, "grad_norm": 0.2076960951089859, "learning_rate": 1.8656910090536782e-05, "loss": 1.0245, "step": 2622 }, { "epoch": 0.84, "grad_norm": 0.22592374682426453, "learning_rate": 1.8581213562079676e-05, "loss": 1.2386, "step": 2623 }, { "epoch": 0.85, "grad_norm": 0.2394360452890396, "learning_rate": 1.8505660766891046e-05, "loss": 1.1014, "step": 2624 }, { "epoch": 0.85, "grad_norm": 0.2008574903011322, "learning_rate": 1.843025178760337e-05, "loss": 1.2446, "step": 2625 }, { "epoch": 0.85, "grad_norm": 0.3371758460998535, "learning_rate": 1.8354986706691722e-05, "loss": 1.2649, "step": 2626 }, { "epoch": 0.85, "grad_norm": 0.24438707530498505, "learning_rate": 1.8279865606473793e-05, "loss": 1.2188, "step": 2627 }, { "epoch": 0.85, "grad_norm": 0.2076987773180008, "learning_rate": 1.8204888569109927e-05, "loss": 1.4246, "step": 2628 }, { "epoch": 0.85, "grad_norm": 0.21835608780384064, "learning_rate": 1.8130055676602727e-05, "loss": 1.085, "step": 2629 }, { "epoch": 0.85, "grad_norm": 0.2575867772102356, "learning_rate": 1.805536701079731e-05, "loss": 1.3648, "step": 2630 }, { "epoch": 0.85, "grad_norm": 0.2614276111125946, "learning_rate": 1.798082265338095e-05, "loss": 1.3301, "step": 2631 }, { "epoch": 0.85, "grad_norm": 0.21322058141231537, "learning_rate": 1.7906422685883078e-05, "loss": 1.2334, "step": 2632 }, { "epoch": 0.85, "grad_norm": 0.20337682962417603, "learning_rate": 1.7832167189675317e-05, "loss": 1.1544, "step": 2633 }, { "epoch": 0.85, "grad_norm": 0.20988339185714722, "learning_rate": 1.775805624597116e-05, "loss": 1.2798, "step": 2634 }, { "epoch": 0.85, "grad_norm": 0.23993812501430511, "learning_rate": 1.768408993582608e-05, "loss": 1.2867, "step": 2635 }, { "epoch": 0.85, "grad_norm": 0.22211414575576782, "learning_rate": 1.7610268340137334e-05, "loss": 1.3782, "step": 2636 }, { "epoch": 0.85, "grad_norm": 0.2883293032646179, "learning_rate": 1.753659153964384e-05, "loss": 0.9724, "step": 2637 }, { "epoch": 0.85, "grad_norm": 0.2177015095949173, "learning_rate": 1.7463059614926323e-05, "loss": 1.2769, "step": 2638 }, { "epoch": 0.85, "grad_norm": 0.21047255396842957, "learning_rate": 1.7389672646406865e-05, "loss": 1.0098, "step": 2639 }, { "epoch": 0.85, "grad_norm": 0.24475805461406708, "learning_rate": 1.7316430714349167e-05, "loss": 0.7642, "step": 2640 }, { "epoch": 0.85, "grad_norm": 0.19378653168678284, "learning_rate": 1.7243333898858196e-05, "loss": 1.1476, "step": 2641 }, { "epoch": 0.85, "grad_norm": 0.23169466853141785, "learning_rate": 1.7170382279880206e-05, "loss": 1.3121, "step": 2642 }, { "epoch": 0.85, "grad_norm": 0.19682246446609497, "learning_rate": 1.7097575937202758e-05, "loss": 1.3695, "step": 2643 }, { "epoch": 0.85, "grad_norm": 0.23044447600841522, "learning_rate": 1.7024914950454364e-05, "loss": 1.0878, "step": 2644 }, { "epoch": 0.85, "grad_norm": 0.2057802379131317, "learning_rate": 1.695239939910473e-05, "loss": 1.3091, "step": 2645 }, { "epoch": 0.85, "grad_norm": 0.20374082028865814, "learning_rate": 1.6880029362464382e-05, "loss": 1.1218, "step": 2646 }, { "epoch": 0.85, "grad_norm": 0.23885183036327362, "learning_rate": 1.680780491968468e-05, "loss": 1.2714, "step": 2647 }, { "epoch": 0.85, "grad_norm": 0.21870915591716766, "learning_rate": 1.673572614975788e-05, "loss": 1.1077, "step": 2648 }, { "epoch": 0.85, "grad_norm": 0.22429321706295013, "learning_rate": 1.6663793131516728e-05, "loss": 1.0818, "step": 2649 }, { "epoch": 0.85, "grad_norm": 0.237508624792099, "learning_rate": 1.6592005943634768e-05, "loss": 1.0843, "step": 2650 }, { "epoch": 0.85, "grad_norm": 0.266805499792099, "learning_rate": 1.652036466462589e-05, "loss": 1.3308, "step": 2651 }, { "epoch": 0.85, "grad_norm": 0.23939108848571777, "learning_rate": 1.6448869372844438e-05, "loss": 0.956, "step": 2652 }, { "epoch": 0.85, "grad_norm": 0.23057600855827332, "learning_rate": 1.6377520146485158e-05, "loss": 1.3838, "step": 2653 }, { "epoch": 0.85, "grad_norm": 0.21300604939460754, "learning_rate": 1.6306317063582948e-05, "loss": 1.2835, "step": 2654 }, { "epoch": 0.86, "grad_norm": 0.21200552582740784, "learning_rate": 1.6235260202012914e-05, "loss": 1.2811, "step": 2655 }, { "epoch": 0.86, "grad_norm": 0.20785057544708252, "learning_rate": 1.6164349639490286e-05, "loss": 1.0561, "step": 2656 }, { "epoch": 0.86, "grad_norm": 0.19967113435268402, "learning_rate": 1.609358545357017e-05, "loss": 1.0534, "step": 2657 }, { "epoch": 0.86, "grad_norm": 0.18869948387145996, "learning_rate": 1.602296772164773e-05, "loss": 1.154, "step": 2658 }, { "epoch": 0.86, "grad_norm": 0.18837584555149078, "learning_rate": 1.59524965209578e-05, "loss": 1.0327, "step": 2659 }, { "epoch": 0.86, "grad_norm": 0.2433336079120636, "learning_rate": 1.588217192857504e-05, "loss": 1.4523, "step": 2660 }, { "epoch": 0.86, "grad_norm": 0.18971115350723267, "learning_rate": 1.5811994021413776e-05, "loss": 1.3038, "step": 2661 }, { "epoch": 0.86, "grad_norm": 0.3401230573654175, "learning_rate": 1.5741962876227837e-05, "loss": 1.3604, "step": 2662 }, { "epoch": 0.86, "grad_norm": 0.2067655771970749, "learning_rate": 1.56720785696106e-05, "loss": 1.3937, "step": 2663 }, { "epoch": 0.86, "grad_norm": 0.2836259603500366, "learning_rate": 1.5602341177994833e-05, "loss": 1.2459, "step": 2664 }, { "epoch": 0.86, "grad_norm": 0.2410612404346466, "learning_rate": 1.5532750777652548e-05, "loss": 1.1216, "step": 2665 }, { "epoch": 0.86, "grad_norm": 0.23079510033130646, "learning_rate": 1.5463307444695117e-05, "loss": 1.4412, "step": 2666 }, { "epoch": 0.86, "grad_norm": 0.19119030237197876, "learning_rate": 1.5394011255072936e-05, "loss": 1.4869, "step": 2667 }, { "epoch": 0.86, "grad_norm": 0.19545243680477142, "learning_rate": 1.5324862284575612e-05, "loss": 1.1616, "step": 2668 }, { "epoch": 0.86, "grad_norm": 0.21275627613067627, "learning_rate": 1.5255860608831615e-05, "loss": 1.5569, "step": 2669 }, { "epoch": 0.86, "grad_norm": 0.1942959427833557, "learning_rate": 1.5187006303308346e-05, "loss": 1.2446, "step": 2670 }, { "epoch": 0.86, "grad_norm": 0.25128865242004395, "learning_rate": 1.5118299443312082e-05, "loss": 1.345, "step": 2671 }, { "epoch": 0.86, "grad_norm": 0.19144459068775177, "learning_rate": 1.5049740103987745e-05, "loss": 1.292, "step": 2672 }, { "epoch": 0.86, "grad_norm": 0.2145407497882843, "learning_rate": 1.4981328360319039e-05, "loss": 0.9583, "step": 2673 }, { "epoch": 0.86, "grad_norm": 0.21100027859210968, "learning_rate": 1.4913064287128129e-05, "loss": 1.2679, "step": 2674 }, { "epoch": 0.86, "grad_norm": 0.2513868808746338, "learning_rate": 1.4844947959075693e-05, "loss": 1.0622, "step": 2675 }, { "epoch": 0.86, "grad_norm": 0.2283564805984497, "learning_rate": 1.4776979450660886e-05, "loss": 1.1535, "step": 2676 }, { "epoch": 0.86, "grad_norm": 0.19953398406505585, "learning_rate": 1.4709158836221097e-05, "loss": 1.1751, "step": 2677 }, { "epoch": 0.86, "grad_norm": 0.22994588315486908, "learning_rate": 1.4641486189932106e-05, "loss": 1.4579, "step": 2678 }, { "epoch": 0.86, "grad_norm": 0.19281511008739471, "learning_rate": 1.4573961585807625e-05, "loss": 1.4912, "step": 2679 }, { "epoch": 0.86, "grad_norm": 0.20727375149726868, "learning_rate": 1.4506585097699663e-05, "loss": 1.1315, "step": 2680 }, { "epoch": 0.86, "grad_norm": 0.20005817711353302, "learning_rate": 1.4439356799298208e-05, "loss": 1.1842, "step": 2681 }, { "epoch": 0.86, "grad_norm": 0.23680996894836426, "learning_rate": 1.4372276764131042e-05, "loss": 1.4061, "step": 2682 }, { "epoch": 0.86, "grad_norm": 0.20222634077072144, "learning_rate": 1.4305345065563979e-05, "loss": 1.1652, "step": 2683 }, { "epoch": 0.86, "grad_norm": 0.20027969777584076, "learning_rate": 1.4238561776800378e-05, "loss": 1.2322, "step": 2684 }, { "epoch": 0.86, "grad_norm": 0.2254108190536499, "learning_rate": 1.4171926970881431e-05, "loss": 1.0566, "step": 2685 }, { "epoch": 0.87, "grad_norm": 0.24286209046840668, "learning_rate": 1.4105440720685974e-05, "loss": 1.352, "step": 2686 }, { "epoch": 0.87, "grad_norm": 0.242966890335083, "learning_rate": 1.4039103098930205e-05, "loss": 1.3683, "step": 2687 }, { "epoch": 0.87, "grad_norm": 0.21039436757564545, "learning_rate": 1.3972914178167937e-05, "loss": 1.0376, "step": 2688 }, { "epoch": 0.87, "grad_norm": 0.258365660905838, "learning_rate": 1.3906874030790166e-05, "loss": 1.0218, "step": 2689 }, { "epoch": 0.87, "grad_norm": 0.1842976063489914, "learning_rate": 1.3840982729025346e-05, "loss": 1.2469, "step": 2690 }, { "epoch": 0.87, "grad_norm": 0.2696279287338257, "learning_rate": 1.3775240344939015e-05, "loss": 1.1082, "step": 2691 }, { "epoch": 0.87, "grad_norm": 0.22328437864780426, "learning_rate": 1.3709646950433923e-05, "loss": 1.378, "step": 2692 }, { "epoch": 0.87, "grad_norm": 0.22148098051548004, "learning_rate": 1.3644202617249883e-05, "loss": 1.3324, "step": 2693 }, { "epoch": 0.87, "grad_norm": 0.21664345264434814, "learning_rate": 1.3578907416963524e-05, "loss": 1.2161, "step": 2694 }, { "epoch": 0.87, "grad_norm": 0.21598991751670837, "learning_rate": 1.3513761420988572e-05, "loss": 1.0558, "step": 2695 }, { "epoch": 0.87, "grad_norm": 0.18632641434669495, "learning_rate": 1.3448764700575398e-05, "loss": 1.0664, "step": 2696 }, { "epoch": 0.87, "grad_norm": 0.2132555991411209, "learning_rate": 1.3383917326811205e-05, "loss": 1.1554, "step": 2697 }, { "epoch": 0.87, "grad_norm": 0.21434958279132843, "learning_rate": 1.331921937061991e-05, "loss": 1.3142, "step": 2698 }, { "epoch": 0.87, "grad_norm": 0.1908016800880432, "learning_rate": 1.325467090276181e-05, "loss": 1.1487, "step": 2699 }, { "epoch": 0.87, "grad_norm": 0.2176845818758011, "learning_rate": 1.3190271993833917e-05, "loss": 1.2763, "step": 2700 }, { "epoch": 0.87, "grad_norm": 0.2382155954837799, "learning_rate": 1.3126022714269524e-05, "loss": 1.2514, "step": 2701 }, { "epoch": 0.87, "grad_norm": 0.2081136852502823, "learning_rate": 1.306192313433837e-05, "loss": 1.2695, "step": 2702 }, { "epoch": 0.87, "grad_norm": 0.1944865882396698, "learning_rate": 1.2997973324146478e-05, "loss": 1.0969, "step": 2703 }, { "epoch": 0.87, "grad_norm": 0.22359035909175873, "learning_rate": 1.2934173353635913e-05, "loss": 1.1728, "step": 2704 }, { "epoch": 0.87, "grad_norm": 0.19107495248317719, "learning_rate": 1.2870523292585044e-05, "loss": 1.0674, "step": 2705 }, { "epoch": 0.87, "grad_norm": 0.22313840687274933, "learning_rate": 1.2807023210608153e-05, "loss": 1.2405, "step": 2706 }, { "epoch": 0.87, "grad_norm": 0.26205310225486755, "learning_rate": 1.2743673177155583e-05, "loss": 1.2616, "step": 2707 }, { "epoch": 0.87, "grad_norm": 0.22373488545417786, "learning_rate": 1.268047326151353e-05, "loss": 1.122, "step": 2708 }, { "epoch": 0.87, "grad_norm": 0.19649678468704224, "learning_rate": 1.2617423532803932e-05, "loss": 0.9809, "step": 2709 }, { "epoch": 0.87, "grad_norm": 0.2066134363412857, "learning_rate": 1.2554524059984633e-05, "loss": 1.4492, "step": 2710 }, { "epoch": 0.87, "grad_norm": 0.19933819770812988, "learning_rate": 1.2491774911848984e-05, "loss": 1.0899, "step": 2711 }, { "epoch": 0.87, "grad_norm": 0.1837177872657776, "learning_rate": 1.2429176157026006e-05, "loss": 1.0896, "step": 2712 }, { "epoch": 0.87, "grad_norm": 0.22087064385414124, "learning_rate": 1.236672786398023e-05, "loss": 1.0539, "step": 2713 }, { "epoch": 0.87, "grad_norm": 0.19860056042671204, "learning_rate": 1.2304430101011575e-05, "loss": 1.2179, "step": 2714 }, { "epoch": 0.87, "grad_norm": 0.2627449631690979, "learning_rate": 1.2242282936255387e-05, "loss": 1.143, "step": 2715 }, { "epoch": 0.87, "grad_norm": 0.20935390889644623, "learning_rate": 1.2180286437682262e-05, "loss": 1.1459, "step": 2716 }, { "epoch": 0.88, "grad_norm": 0.18618261814117432, "learning_rate": 1.2118440673098045e-05, "loss": 1.4866, "step": 2717 }, { "epoch": 0.88, "grad_norm": 0.2161167562007904, "learning_rate": 1.2056745710143711e-05, "loss": 1.3362, "step": 2718 }, { "epoch": 0.88, "grad_norm": 0.18389247357845306, "learning_rate": 1.1995201616295236e-05, "loss": 1.1306, "step": 2719 }, { "epoch": 0.88, "grad_norm": 0.2515436112880707, "learning_rate": 1.1933808458863725e-05, "loss": 1.2665, "step": 2720 }, { "epoch": 0.88, "grad_norm": 0.2080898880958557, "learning_rate": 1.1872566304995067e-05, "loss": 1.0103, "step": 2721 }, { "epoch": 0.88, "grad_norm": 0.20481188595294952, "learning_rate": 1.1811475221670118e-05, "loss": 1.093, "step": 2722 }, { "epoch": 0.88, "grad_norm": 0.22969433665275574, "learning_rate": 1.1750535275704414e-05, "loss": 1.3783, "step": 2723 }, { "epoch": 0.88, "grad_norm": 0.187138170003891, "learning_rate": 1.168974653374819e-05, "loss": 1.2506, "step": 2724 }, { "epoch": 0.88, "grad_norm": 0.21252480149269104, "learning_rate": 1.1629109062286412e-05, "loss": 1.125, "step": 2725 }, { "epoch": 0.88, "grad_norm": 0.48198428750038147, "learning_rate": 1.1568622927638466e-05, "loss": 1.3012, "step": 2726 }, { "epoch": 0.88, "grad_norm": 0.21271160244941711, "learning_rate": 1.1508288195958348e-05, "loss": 1.1821, "step": 2727 }, { "epoch": 0.88, "grad_norm": 0.2019740492105484, "learning_rate": 1.1448104933234376e-05, "loss": 1.1653, "step": 2728 }, { "epoch": 0.88, "grad_norm": 0.21588487923145294, "learning_rate": 1.138807320528921e-05, "loss": 1.487, "step": 2729 }, { "epoch": 0.88, "grad_norm": 0.2085544914007187, "learning_rate": 1.1328193077779834e-05, "loss": 1.1073, "step": 2730 }, { "epoch": 0.88, "grad_norm": 0.2434205859899521, "learning_rate": 1.1268464616197393e-05, "loss": 1.096, "step": 2731 }, { "epoch": 0.88, "grad_norm": 0.20718039572238922, "learning_rate": 1.1208887885867095e-05, "loss": 1.156, "step": 2732 }, { "epoch": 0.88, "grad_norm": 0.26030561327934265, "learning_rate": 1.1149462951948323e-05, "loss": 1.451, "step": 2733 }, { "epoch": 0.88, "grad_norm": 0.23293662071228027, "learning_rate": 1.1090189879434335e-05, "loss": 1.0, "step": 2734 }, { "epoch": 0.88, "grad_norm": 0.20650392770767212, "learning_rate": 1.103106873315236e-05, "loss": 1.0962, "step": 2735 }, { "epoch": 0.88, "grad_norm": 0.24615128338336945, "learning_rate": 1.0972099577763421e-05, "loss": 1.4942, "step": 2736 }, { "epoch": 0.88, "grad_norm": 0.2348039150238037, "learning_rate": 1.0913282477762297e-05, "loss": 1.2651, "step": 2737 }, { "epoch": 0.88, "grad_norm": 0.1918676793575287, "learning_rate": 1.0854617497477564e-05, "loss": 1.2644, "step": 2738 }, { "epoch": 0.88, "grad_norm": 0.21500752866268158, "learning_rate": 1.0796104701071279e-05, "loss": 0.9762, "step": 2739 }, { "epoch": 0.88, "grad_norm": 0.20647293329238892, "learning_rate": 1.0737744152539201e-05, "loss": 1.0837, "step": 2740 }, { "epoch": 0.88, "grad_norm": 0.1950092315673828, "learning_rate": 1.0679535915710457e-05, "loss": 1.135, "step": 2741 }, { "epoch": 0.88, "grad_norm": 0.22994616627693176, "learning_rate": 1.0621480054247622e-05, "loss": 1.2346, "step": 2742 }, { "epoch": 0.88, "grad_norm": 0.2290288209915161, "learning_rate": 1.056357663164667e-05, "loss": 1.3519, "step": 2743 }, { "epoch": 0.88, "grad_norm": 0.2177969217300415, "learning_rate": 1.0505825711236776e-05, "loss": 1.2283, "step": 2744 }, { "epoch": 0.88, "grad_norm": 0.22610656917095184, "learning_rate": 1.04482273561804e-05, "loss": 1.0566, "step": 2745 }, { "epoch": 0.88, "grad_norm": 0.22660550475120544, "learning_rate": 1.0390781629473082e-05, "loss": 1.1981, "step": 2746 }, { "epoch": 0.88, "grad_norm": 0.2223191261291504, "learning_rate": 1.0333488593943434e-05, "loss": 1.2649, "step": 2747 }, { "epoch": 0.89, "grad_norm": 0.19843046367168427, "learning_rate": 1.0276348312253114e-05, "loss": 1.0707, "step": 2748 }, { "epoch": 0.89, "grad_norm": 0.32322561740875244, "learning_rate": 1.0219360846896646e-05, "loss": 1.2153, "step": 2749 }, { "epoch": 0.89, "grad_norm": 0.19400252401828766, "learning_rate": 1.0162526260201509e-05, "loss": 1.0712, "step": 2750 }, { "epoch": 0.89, "grad_norm": 0.21708038449287415, "learning_rate": 1.010584461432788e-05, "loss": 1.2663, "step": 2751 }, { "epoch": 0.89, "grad_norm": 0.22748051583766937, "learning_rate": 1.0049315971268701e-05, "loss": 1.2465, "step": 2752 }, { "epoch": 0.89, "grad_norm": 0.21346887946128845, "learning_rate": 9.992940392849636e-06, "loss": 1.2882, "step": 2753 }, { "epoch": 0.89, "grad_norm": 0.20852018892765045, "learning_rate": 9.936717940728827e-06, "loss": 1.1466, "step": 2754 }, { "epoch": 0.89, "grad_norm": 0.221393421292305, "learning_rate": 9.88064867639709e-06, "loss": 1.1418, "step": 2755 }, { "epoch": 0.89, "grad_norm": 0.23445017635822296, "learning_rate": 9.824732661177486e-06, "loss": 1.2282, "step": 2756 }, { "epoch": 0.89, "grad_norm": 0.1923072785139084, "learning_rate": 9.768969956225665e-06, "loss": 1.2681, "step": 2757 }, { "epoch": 0.89, "grad_norm": 0.19985762238502502, "learning_rate": 9.713360622529538e-06, "loss": 1.3521, "step": 2758 }, { "epoch": 0.89, "grad_norm": 0.2572389543056488, "learning_rate": 9.65790472090922e-06, "loss": 1.3802, "step": 2759 }, { "epoch": 0.89, "grad_norm": 0.21306990087032318, "learning_rate": 9.602602312017133e-06, "loss": 1.3689, "step": 2760 }, { "epoch": 0.89, "grad_norm": 0.26831889152526855, "learning_rate": 9.547453456337656e-06, "loss": 1.0052, "step": 2761 }, { "epoch": 0.89, "grad_norm": 0.20502912998199463, "learning_rate": 9.492458214187359e-06, "loss": 1.2007, "step": 2762 }, { "epoch": 0.89, "grad_norm": 0.19259442389011383, "learning_rate": 9.437616645714796e-06, "loss": 1.2461, "step": 2763 }, { "epoch": 0.89, "grad_norm": 0.23113146424293518, "learning_rate": 9.382928810900352e-06, "loss": 1.2764, "step": 2764 }, { "epoch": 0.89, "grad_norm": 0.25105658173561096, "learning_rate": 9.328394769556435e-06, "loss": 1.1539, "step": 2765 }, { "epoch": 0.89, "grad_norm": 0.208001509308815, "learning_rate": 9.274014581327033e-06, "loss": 1.2825, "step": 2766 }, { "epoch": 0.89, "grad_norm": 0.23090344667434692, "learning_rate": 9.219788305688053e-06, "loss": 1.1737, "step": 2767 }, { "epoch": 0.89, "grad_norm": 0.24020306766033173, "learning_rate": 9.165716001946988e-06, "loss": 1.3004, "step": 2768 }, { "epoch": 0.89, "grad_norm": 0.22680556774139404, "learning_rate": 9.111797729242931e-06, "loss": 1.3269, "step": 2769 }, { "epoch": 0.89, "grad_norm": 0.23302654922008514, "learning_rate": 9.058033546546544e-06, "loss": 1.2783, "step": 2770 }, { "epoch": 0.89, "grad_norm": 0.18683023750782013, "learning_rate": 9.004423512659892e-06, "loss": 1.2761, "step": 2771 }, { "epoch": 0.89, "grad_norm": 0.2854492962360382, "learning_rate": 8.95096768621652e-06, "loss": 1.1404, "step": 2772 }, { "epoch": 0.89, "grad_norm": 0.22639605402946472, "learning_rate": 8.897666125681263e-06, "loss": 1.3158, "step": 2773 }, { "epoch": 0.89, "grad_norm": 0.23072847723960876, "learning_rate": 8.844518889350272e-06, "loss": 1.282, "step": 2774 }, { "epoch": 0.89, "grad_norm": 0.19964586198329926, "learning_rate": 8.791526035350932e-06, "loss": 1.1537, "step": 2775 }, { "epoch": 0.89, "grad_norm": 0.21436162292957306, "learning_rate": 8.738687621641682e-06, "loss": 1.3435, "step": 2776 }, { "epoch": 0.89, "grad_norm": 0.19969449937343597, "learning_rate": 8.686003706012146e-06, "loss": 1.2174, "step": 2777 }, { "epoch": 0.89, "grad_norm": 0.33268243074417114, "learning_rate": 8.63347434608293e-06, "loss": 1.0857, "step": 2778 }, { "epoch": 0.9, "grad_norm": 0.23116347193717957, "learning_rate": 8.581099599305613e-06, "loss": 1.1184, "step": 2779 }, { "epoch": 0.9, "grad_norm": 0.242537260055542, "learning_rate": 8.528879522962722e-06, "loss": 1.498, "step": 2780 }, { "epoch": 0.9, "grad_norm": 0.19596107304096222, "learning_rate": 8.47681417416749e-06, "loss": 1.2715, "step": 2781 }, { "epoch": 0.9, "grad_norm": 0.24605482816696167, "learning_rate": 8.424903609864048e-06, "loss": 1.3727, "step": 2782 }, { "epoch": 0.9, "grad_norm": 0.33082059025764465, "learning_rate": 8.37314788682718e-06, "loss": 1.2916, "step": 2783 }, { "epoch": 0.9, "grad_norm": 0.25856995582580566, "learning_rate": 8.321547061662342e-06, "loss": 1.381, "step": 2784 }, { "epoch": 0.9, "grad_norm": 0.22183988988399506, "learning_rate": 8.270101190805572e-06, "loss": 1.1219, "step": 2785 }, { "epoch": 0.9, "grad_norm": 0.20748820900917053, "learning_rate": 8.218810330523395e-06, "loss": 1.2272, "step": 2786 }, { "epoch": 0.9, "grad_norm": 0.2358664870262146, "learning_rate": 8.167674536912905e-06, "loss": 1.1206, "step": 2787 }, { "epoch": 0.9, "grad_norm": 0.18910685181617737, "learning_rate": 8.116693865901447e-06, "loss": 1.2951, "step": 2788 }, { "epoch": 0.9, "grad_norm": 0.19448606669902802, "learning_rate": 8.06586837324687e-06, "loss": 1.3914, "step": 2789 }, { "epoch": 0.9, "grad_norm": 0.23550482094287872, "learning_rate": 8.015198114537191e-06, "loss": 1.307, "step": 2790 }, { "epoch": 0.9, "grad_norm": 0.20314140617847443, "learning_rate": 7.964683145190665e-06, "loss": 1.3731, "step": 2791 }, { "epoch": 0.9, "grad_norm": 0.21990586817264557, "learning_rate": 7.914323520455745e-06, "loss": 1.4815, "step": 2792 }, { "epoch": 0.9, "grad_norm": 0.22295819222927094, "learning_rate": 7.864119295410926e-06, "loss": 1.1356, "step": 2793 }, { "epoch": 0.9, "grad_norm": 0.21799375116825104, "learning_rate": 7.814070524964832e-06, "loss": 1.1773, "step": 2794 }, { "epoch": 0.9, "grad_norm": 0.2649502456188202, "learning_rate": 7.76417726385598e-06, "loss": 1.2482, "step": 2795 }, { "epoch": 0.9, "grad_norm": 0.21837103366851807, "learning_rate": 7.71443956665282e-06, "loss": 1.3696, "step": 2796 }, { "epoch": 0.9, "grad_norm": 0.2279781550168991, "learning_rate": 7.6648574877537e-06, "loss": 1.3378, "step": 2797 }, { "epoch": 0.9, "grad_norm": 0.19774462282657623, "learning_rate": 7.61543108138673e-06, "loss": 1.3298, "step": 2798 }, { "epoch": 0.9, "grad_norm": 0.26735246181488037, "learning_rate": 7.566160401609784e-06, "loss": 1.3606, "step": 2799 }, { "epoch": 0.9, "grad_norm": 0.22719453275203705, "learning_rate": 7.51704550231042e-06, "loss": 1.1874, "step": 2800 }, { "epoch": 0.9, "grad_norm": 0.25194790959358215, "learning_rate": 7.468086437205756e-06, "loss": 1.2332, "step": 2801 }, { "epoch": 0.9, "grad_norm": 0.1815948337316513, "learning_rate": 7.41928325984259e-06, "loss": 1.3101, "step": 2802 }, { "epoch": 0.9, "grad_norm": 0.1946275681257248, "learning_rate": 7.370636023597093e-06, "loss": 1.1177, "step": 2803 }, { "epoch": 0.9, "grad_norm": 0.18576684594154358, "learning_rate": 7.322144781675004e-06, "loss": 1.3472, "step": 2804 }, { "epoch": 0.9, "grad_norm": 0.21525172889232635, "learning_rate": 7.273809587111368e-06, "loss": 1.1118, "step": 2805 }, { "epoch": 0.9, "grad_norm": 0.20123501121997833, "learning_rate": 7.225630492770579e-06, "loss": 1.1998, "step": 2806 }, { "epoch": 0.9, "grad_norm": 0.18338635563850403, "learning_rate": 7.17760755134632e-06, "loss": 1.1382, "step": 2807 }, { "epoch": 0.9, "grad_norm": 0.1959913820028305, "learning_rate": 7.129740815361495e-06, "loss": 1.2795, "step": 2808 }, { "epoch": 0.9, "grad_norm": 0.22533652186393738, "learning_rate": 7.08203033716811e-06, "loss": 1.1693, "step": 2809 }, { "epoch": 0.91, "grad_norm": 0.2524736821651459, "learning_rate": 7.0344761689473726e-06, "loss": 1.0641, "step": 2810 }, { "epoch": 0.91, "grad_norm": 0.24914728105068207, "learning_rate": 6.987078362709397e-06, "loss": 1.1096, "step": 2811 }, { "epoch": 0.91, "grad_norm": 0.18972717225551605, "learning_rate": 6.9398369702934335e-06, "loss": 1.1962, "step": 2812 }, { "epoch": 0.91, "grad_norm": 0.2130640745162964, "learning_rate": 6.892752043367567e-06, "loss": 1.0679, "step": 2813 }, { "epoch": 0.91, "grad_norm": 0.206035777926445, "learning_rate": 6.845823633428721e-06, "loss": 1.2877, "step": 2814 }, { "epoch": 0.91, "grad_norm": 0.2699582278728485, "learning_rate": 6.799051791802757e-06, "loss": 1.2395, "step": 2815 }, { "epoch": 0.91, "grad_norm": 0.21336403489112854, "learning_rate": 6.752436569644204e-06, "loss": 1.2799, "step": 2816 }, { "epoch": 0.91, "grad_norm": 0.27105534076690674, "learning_rate": 6.7059780179363306e-06, "loss": 1.3122, "step": 2817 }, { "epoch": 0.91, "grad_norm": 0.19145183265209198, "learning_rate": 6.659676187491042e-06, "loss": 1.1008, "step": 2818 }, { "epoch": 0.91, "grad_norm": 0.21421635150909424, "learning_rate": 6.613531128948829e-06, "loss": 1.1602, "step": 2819 }, { "epoch": 0.91, "grad_norm": 0.20954810082912445, "learning_rate": 6.567542892778754e-06, "loss": 1.2956, "step": 2820 }, { "epoch": 0.91, "grad_norm": 0.32044994831085205, "learning_rate": 6.521711529278318e-06, "loss": 1.2038, "step": 2821 }, { "epoch": 0.91, "grad_norm": 0.2040777951478958, "learning_rate": 6.476037088573505e-06, "loss": 1.1832, "step": 2822 }, { "epoch": 0.91, "grad_norm": 0.2742263972759247, "learning_rate": 6.430519620618624e-06, "loss": 1.1256, "step": 2823 }, { "epoch": 0.91, "grad_norm": 0.1820758432149887, "learning_rate": 6.3851591751962864e-06, "loss": 1.2549, "step": 2824 }, { "epoch": 0.91, "grad_norm": 0.21634554862976074, "learning_rate": 6.33995580191744e-06, "loss": 1.2159, "step": 2825 }, { "epoch": 0.91, "grad_norm": 0.24630890786647797, "learning_rate": 6.294909550221172e-06, "loss": 1.0824, "step": 2826 }, { "epoch": 0.91, "grad_norm": 0.21551227569580078, "learning_rate": 6.250020469374789e-06, "loss": 1.2519, "step": 2827 }, { "epoch": 0.91, "grad_norm": 0.22985535860061646, "learning_rate": 6.205288608473635e-06, "loss": 1.2398, "step": 2828 }, { "epoch": 0.91, "grad_norm": 0.24032415449619293, "learning_rate": 6.1607140164411265e-06, "loss": 1.4222, "step": 2829 }, { "epoch": 0.91, "grad_norm": 0.21333862841129303, "learning_rate": 6.116296742028698e-06, "loss": 1.1811, "step": 2830 }, { "epoch": 0.91, "grad_norm": 0.2779097855091095, "learning_rate": 6.072036833815691e-06, "loss": 1.2498, "step": 2831 }, { "epoch": 0.91, "grad_norm": 0.22501486539840698, "learning_rate": 6.027934340209367e-06, "loss": 1.4666, "step": 2832 }, { "epoch": 0.91, "grad_norm": 0.18370682001113892, "learning_rate": 5.983989309444792e-06, "loss": 1.3594, "step": 2833 }, { "epoch": 0.91, "grad_norm": 0.19408045709133148, "learning_rate": 5.940201789584803e-06, "loss": 1.2566, "step": 2834 }, { "epoch": 0.91, "grad_norm": 0.2721293568611145, "learning_rate": 5.896571828520041e-06, "loss": 1.2839, "step": 2835 }, { "epoch": 0.91, "grad_norm": 0.2097996175289154, "learning_rate": 5.853099473968736e-06, "loss": 1.3355, "step": 2836 }, { "epoch": 0.91, "grad_norm": 0.30015403032302856, "learning_rate": 5.8097847734768384e-06, "loss": 1.1374, "step": 2837 }, { "epoch": 0.91, "grad_norm": 0.2036822885274887, "learning_rate": 5.76662777441772e-06, "loss": 1.1616, "step": 2838 }, { "epoch": 0.91, "grad_norm": 0.2668648362159729, "learning_rate": 5.723628523992424e-06, "loss": 1.2398, "step": 2839 }, { "epoch": 0.91, "grad_norm": 0.1957867443561554, "learning_rate": 5.680787069229431e-06, "loss": 1.2335, "step": 2840 }, { "epoch": 0.92, "grad_norm": 0.17413325607776642, "learning_rate": 5.638103456984561e-06, "loss": 0.8118, "step": 2841 }, { "epoch": 0.92, "grad_norm": 0.21067920327186584, "learning_rate": 5.59557773394112e-06, "loss": 1.3365, "step": 2842 }, { "epoch": 0.92, "grad_norm": 0.20367437601089478, "learning_rate": 5.5532099466096044e-06, "loss": 1.2764, "step": 2843 }, { "epoch": 0.92, "grad_norm": 0.19703145325183868, "learning_rate": 5.511000141327865e-06, "loss": 1.2644, "step": 2844 }, { "epoch": 0.92, "grad_norm": 0.2500531077384949, "learning_rate": 5.4689483642609556e-06, "loss": 1.1126, "step": 2845 }, { "epoch": 0.92, "grad_norm": 0.21397368609905243, "learning_rate": 5.427054661401054e-06, "loss": 1.3207, "step": 2846 }, { "epoch": 0.92, "grad_norm": 0.2722577154636383, "learning_rate": 5.385319078567507e-06, "loss": 1.3621, "step": 2847 }, { "epoch": 0.92, "grad_norm": 0.2006002962589264, "learning_rate": 5.343741661406653e-06, "loss": 1.2765, "step": 2848 }, { "epoch": 0.92, "grad_norm": 0.2030160278081894, "learning_rate": 5.302322455391883e-06, "loss": 1.2668, "step": 2849 }, { "epoch": 0.92, "grad_norm": 0.20585373044013977, "learning_rate": 5.261061505823611e-06, "loss": 1.0832, "step": 2850 }, { "epoch": 0.92, "grad_norm": 0.25153854489326477, "learning_rate": 5.219958857829037e-06, "loss": 1.1831, "step": 2851 }, { "epoch": 0.92, "grad_norm": 0.26484403014183044, "learning_rate": 5.179014556362371e-06, "loss": 1.0791, "step": 2852 }, { "epoch": 0.92, "grad_norm": 0.23459693789482117, "learning_rate": 5.138228646204473e-06, "loss": 1.205, "step": 2853 }, { "epoch": 0.92, "grad_norm": 0.22847022116184235, "learning_rate": 5.097601171963128e-06, "loss": 1.1346, "step": 2854 }, { "epoch": 0.92, "grad_norm": 0.2004699558019638, "learning_rate": 5.057132178072776e-06, "loss": 1.2862, "step": 2855 }, { "epoch": 0.92, "grad_norm": 0.19763325154781342, "learning_rate": 5.0168217087944775e-06, "loss": 0.913, "step": 2856 }, { "epoch": 0.92, "grad_norm": 0.220400869846344, "learning_rate": 4.976669808216016e-06, "loss": 1.1707, "step": 2857 }, { "epoch": 0.92, "grad_norm": 0.20859555900096893, "learning_rate": 4.936676520251615e-06, "loss": 1.3084, "step": 2858 }, { "epoch": 0.92, "grad_norm": 0.19233278930187225, "learning_rate": 4.896841888642167e-06, "loss": 1.2007, "step": 2859 }, { "epoch": 0.92, "grad_norm": 0.3014526069164276, "learning_rate": 4.8571659569549225e-06, "loss": 1.1598, "step": 2860 }, { "epoch": 0.92, "grad_norm": 0.23483283817768097, "learning_rate": 4.817648768583671e-06, "loss": 1.2773, "step": 2861 }, { "epoch": 0.92, "grad_norm": 0.19074618816375732, "learning_rate": 4.778290366748472e-06, "loss": 0.8507, "step": 2862 }, { "epoch": 0.92, "grad_norm": 0.2189537137746811, "learning_rate": 4.7390907944957755e-06, "loss": 1.4602, "step": 2863 }, { "epoch": 0.92, "grad_norm": 0.21362616121768951, "learning_rate": 4.700050094698355e-06, "loss": 1.0931, "step": 2864 }, { "epoch": 0.92, "grad_norm": 0.21839377284049988, "learning_rate": 4.661168310055136e-06, "loss": 1.1501, "step": 2865 }, { "epoch": 0.92, "grad_norm": 0.21247981488704681, "learning_rate": 4.622445483091319e-06, "loss": 1.1347, "step": 2866 }, { "epoch": 0.92, "grad_norm": 0.21114350855350494, "learning_rate": 4.583881656158211e-06, "loss": 1.2438, "step": 2867 }, { "epoch": 0.92, "grad_norm": 0.21347831189632416, "learning_rate": 4.545476871433224e-06, "loss": 1.3581, "step": 2868 }, { "epoch": 0.92, "grad_norm": 0.17886561155319214, "learning_rate": 4.50723117091984e-06, "loss": 1.272, "step": 2869 }, { "epoch": 0.92, "grad_norm": 0.22050735354423523, "learning_rate": 4.469144596447516e-06, "loss": 1.1653, "step": 2870 }, { "epoch": 0.92, "grad_norm": 0.25917670130729675, "learning_rate": 4.431217189671732e-06, "loss": 1.4139, "step": 2871 }, { "epoch": 0.93, "grad_norm": 0.20795965194702148, "learning_rate": 4.393448992073822e-06, "loss": 1.2432, "step": 2872 }, { "epoch": 0.93, "grad_norm": 0.21689553558826447, "learning_rate": 4.355840044961012e-06, "loss": 1.3214, "step": 2873 }, { "epoch": 0.93, "grad_norm": 0.2014576941728592, "learning_rate": 4.31839038946638e-06, "loss": 1.1514, "step": 2874 }, { "epoch": 0.93, "grad_norm": 0.2719906270503998, "learning_rate": 4.281100066548765e-06, "loss": 1.1086, "step": 2875 }, { "epoch": 0.93, "grad_norm": 0.25137367844581604, "learning_rate": 4.243969116992757e-06, "loss": 1.3525, "step": 2876 }, { "epoch": 0.93, "grad_norm": 0.20500199496746063, "learning_rate": 4.206997581408622e-06, "loss": 1.0549, "step": 2877 }, { "epoch": 0.93, "grad_norm": 0.22599537670612335, "learning_rate": 4.170185500232265e-06, "loss": 1.1522, "step": 2878 }, { "epoch": 0.93, "grad_norm": 0.24747546017169952, "learning_rate": 4.133532913725246e-06, "loss": 1.2489, "step": 2879 }, { "epoch": 0.93, "grad_norm": 0.23822152614593506, "learning_rate": 4.097039861974599e-06, "loss": 1.2072, "step": 2880 }, { "epoch": 0.93, "grad_norm": 0.21630865335464478, "learning_rate": 4.060706384892964e-06, "loss": 1.0987, "step": 2881 }, { "epoch": 0.93, "grad_norm": 0.23279927670955658, "learning_rate": 4.024532522218421e-06, "loss": 1.0705, "step": 2882 }, { "epoch": 0.93, "grad_norm": 0.19793373346328735, "learning_rate": 3.988518313514422e-06, "loss": 1.0684, "step": 2883 }, { "epoch": 0.93, "grad_norm": 0.20356062054634094, "learning_rate": 3.952663798169925e-06, "loss": 1.0458, "step": 2884 }, { "epoch": 0.93, "grad_norm": 0.2220221608877182, "learning_rate": 3.9169690153991125e-06, "loss": 1.4821, "step": 2885 }, { "epoch": 0.93, "grad_norm": 0.18154558539390564, "learning_rate": 3.881434004241507e-06, "loss": 1.1501, "step": 2886 }, { "epoch": 0.93, "grad_norm": 0.1997040957212448, "learning_rate": 3.846058803561952e-06, "loss": 1.2967, "step": 2887 }, { "epoch": 0.93, "grad_norm": 0.22973819077014923, "learning_rate": 3.810843452050383e-06, "loss": 1.2307, "step": 2888 }, { "epoch": 0.93, "grad_norm": 0.20220449566841125, "learning_rate": 3.77578798822204e-06, "loss": 1.1534, "step": 2889 }, { "epoch": 0.93, "grad_norm": 0.21676543354988098, "learning_rate": 3.740892450417188e-06, "loss": 1.434, "step": 2890 }, { "epoch": 0.93, "grad_norm": 0.19895415008068085, "learning_rate": 3.7061568768012305e-06, "loss": 1.1469, "step": 2891 }, { "epoch": 0.93, "grad_norm": 0.24648542702198029, "learning_rate": 3.671581305364596e-06, "loss": 1.1098, "step": 2892 }, { "epoch": 0.93, "grad_norm": 0.25188910961151123, "learning_rate": 3.6371657739227513e-06, "loss": 1.3992, "step": 2893 }, { "epoch": 0.93, "grad_norm": 0.2857767343521118, "learning_rate": 3.602910320116087e-06, "loss": 1.2508, "step": 2894 }, { "epoch": 0.93, "grad_norm": 0.21849793195724487, "learning_rate": 3.568814981409951e-06, "loss": 1.2637, "step": 2895 }, { "epoch": 0.93, "grad_norm": 0.18847282230854034, "learning_rate": 3.5348797950945305e-06, "loss": 1.2668, "step": 2896 }, { "epoch": 0.93, "grad_norm": 0.18869826197624207, "learning_rate": 3.5011047982849204e-06, "loss": 0.9272, "step": 2897 }, { "epoch": 0.93, "grad_norm": 0.2328697144985199, "learning_rate": 3.4674900279209215e-06, "loss": 1.2039, "step": 2898 }, { "epoch": 0.93, "grad_norm": 0.2697151005268097, "learning_rate": 3.4340355207671922e-06, "loss": 1.3786, "step": 2899 }, { "epoch": 0.93, "grad_norm": 0.19302067160606384, "learning_rate": 3.40074131341303e-06, "loss": 1.1636, "step": 2900 }, { "epoch": 0.93, "grad_norm": 0.20468702912330627, "learning_rate": 3.3676074422724742e-06, "loss": 1.3101, "step": 2901 }, { "epoch": 0.93, "grad_norm": 0.21813908219337463, "learning_rate": 3.334633943584153e-06, "loss": 1.2393, "step": 2902 }, { "epoch": 0.94, "grad_norm": 0.19346292316913605, "learning_rate": 3.3018208534113353e-06, "loss": 1.2184, "step": 2903 }, { "epoch": 0.94, "grad_norm": 0.21471066772937775, "learning_rate": 3.269168207641831e-06, "loss": 1.1191, "step": 2904 }, { "epoch": 0.94, "grad_norm": 0.19301147758960724, "learning_rate": 3.2366760419879734e-06, "loss": 1.251, "step": 2905 }, { "epoch": 0.94, "grad_norm": 0.23063261806964874, "learning_rate": 3.2043443919865687e-06, "loss": 1.282, "step": 2906 }, { "epoch": 0.94, "grad_norm": 0.2663281559944153, "learning_rate": 3.172173292998881e-06, "loss": 1.245, "step": 2907 }, { "epoch": 0.94, "grad_norm": 0.17692798376083374, "learning_rate": 3.1401627802105645e-06, "loss": 1.2658, "step": 2908 }, { "epoch": 0.94, "grad_norm": 0.2187288999557495, "learning_rate": 3.108312888631681e-06, "loss": 1.3193, "step": 2909 }, { "epoch": 0.94, "grad_norm": 0.23482878506183624, "learning_rate": 3.0766236530965495e-06, "loss": 1.2535, "step": 2910 }, { "epoch": 0.94, "grad_norm": 0.2476070523262024, "learning_rate": 3.0450951082638286e-06, "loss": 0.9355, "step": 2911 }, { "epoch": 0.94, "grad_norm": 0.32750555872917175, "learning_rate": 3.0137272886164354e-06, "loss": 1.0615, "step": 2912 }, { "epoch": 0.94, "grad_norm": 0.204639732837677, "learning_rate": 2.98252022846146e-06, "loss": 1.2315, "step": 2913 }, { "epoch": 0.94, "grad_norm": 0.2057492434978485, "learning_rate": 2.9514739619302174e-06, "loss": 1.2896, "step": 2914 }, { "epoch": 0.94, "grad_norm": 0.19291555881500244, "learning_rate": 2.9205885229781288e-06, "loss": 1.1625, "step": 2915 }, { "epoch": 0.94, "grad_norm": 0.20283856987953186, "learning_rate": 2.889863945384707e-06, "loss": 1.4208, "step": 2916 }, { "epoch": 0.94, "grad_norm": 0.2994186580181122, "learning_rate": 2.859300262753572e-06, "loss": 1.1465, "step": 2917 }, { "epoch": 0.94, "grad_norm": 0.3460998237133026, "learning_rate": 2.8288975085123342e-06, "loss": 1.1877, "step": 2918 }, { "epoch": 0.94, "grad_norm": 0.24207432568073273, "learning_rate": 2.798655715912629e-06, "loss": 1.3108, "step": 2919 }, { "epoch": 0.94, "grad_norm": 0.1953257918357849, "learning_rate": 2.7685749180299976e-06, "loss": 1.2858, "step": 2920 }, { "epoch": 0.94, "grad_norm": 0.18343736231327057, "learning_rate": 2.738655147763974e-06, "loss": 1.0716, "step": 2921 }, { "epoch": 0.94, "grad_norm": 0.25337380170822144, "learning_rate": 2.7088964378378987e-06, "loss": 1.2065, "step": 2922 }, { "epoch": 0.94, "grad_norm": 0.22930052876472473, "learning_rate": 2.6792988207990196e-06, "loss": 1.1797, "step": 2923 }, { "epoch": 0.94, "grad_norm": 0.2169332057237625, "learning_rate": 2.6498623290183762e-06, "loss": 1.2414, "step": 2924 }, { "epoch": 0.94, "grad_norm": 0.21031774580478668, "learning_rate": 2.6205869946907486e-06, "loss": 1.4454, "step": 2925 }, { "epoch": 0.94, "grad_norm": 0.1933813840150833, "learning_rate": 2.591472849834708e-06, "loss": 1.2321, "step": 2926 }, { "epoch": 0.94, "grad_norm": 0.1937023550271988, "learning_rate": 2.5625199262925335e-06, "loss": 1.2487, "step": 2927 }, { "epoch": 0.94, "grad_norm": 0.20714187622070312, "learning_rate": 2.5337282557301277e-06, "loss": 1.2633, "step": 2928 }, { "epoch": 0.94, "grad_norm": 0.20406267046928406, "learning_rate": 2.505097869637085e-06, "loss": 1.2236, "step": 2929 }, { "epoch": 0.94, "grad_norm": 0.23937442898750305, "learning_rate": 2.4766287993265576e-06, "loss": 1.2946, "step": 2930 }, { "epoch": 0.94, "grad_norm": 0.19479309022426605, "learning_rate": 2.4483210759352724e-06, "loss": 1.168, "step": 2931 }, { "epoch": 0.94, "grad_norm": 0.24713723361492157, "learning_rate": 2.4201747304235463e-06, "loss": 1.371, "step": 2932 }, { "epoch": 0.94, "grad_norm": 0.22998321056365967, "learning_rate": 2.392189793575122e-06, "loss": 1.0117, "step": 2933 }, { "epoch": 0.95, "grad_norm": 0.21425311267375946, "learning_rate": 2.3643662959972665e-06, "loss": 1.3539, "step": 2934 }, { "epoch": 0.95, "grad_norm": 0.1997741311788559, "learning_rate": 2.3367042681206038e-06, "loss": 1.2349, "step": 2935 }, { "epoch": 0.95, "grad_norm": 0.19950374960899353, "learning_rate": 2.3092037401992504e-06, "loss": 1.1431, "step": 2936 }, { "epoch": 0.95, "grad_norm": 0.20382539927959442, "learning_rate": 2.2818647423106293e-06, "loss": 1.3737, "step": 2937 }, { "epoch": 0.95, "grad_norm": 0.2596120536327362, "learning_rate": 2.2546873043555403e-06, "loss": 1.4167, "step": 2938 }, { "epoch": 0.95, "grad_norm": 0.21846041083335876, "learning_rate": 2.2276714560580388e-06, "loss": 1.0783, "step": 2939 }, { "epoch": 0.95, "grad_norm": 0.21899844706058502, "learning_rate": 2.20081722696544e-06, "loss": 1.209, "step": 2940 }, { "epoch": 0.95, "grad_norm": 0.29577356576919556, "learning_rate": 2.174124646448383e-06, "loss": 1.223, "step": 2941 }, { "epoch": 0.95, "grad_norm": 0.3142232596874237, "learning_rate": 2.147593743700615e-06, "loss": 1.3397, "step": 2942 }, { "epoch": 0.95, "grad_norm": 0.2154240608215332, "learning_rate": 2.1212245477391255e-06, "loss": 1.1953, "step": 2943 }, { "epoch": 0.95, "grad_norm": 0.21242626011371613, "learning_rate": 2.0950170874039772e-06, "loss": 1.3286, "step": 2944 }, { "epoch": 0.95, "grad_norm": 0.32984066009521484, "learning_rate": 2.068971391358376e-06, "loss": 1.2562, "step": 2945 }, { "epoch": 0.95, "grad_norm": 0.2120627611875534, "learning_rate": 2.0430874880886516e-06, "loss": 1.3352, "step": 2946 }, { "epoch": 0.95, "grad_norm": 0.36935850977897644, "learning_rate": 2.017365405904092e-06, "loss": 1.3411, "step": 2947 }, { "epoch": 0.95, "grad_norm": 0.2157108187675476, "learning_rate": 1.991805172937061e-06, "loss": 1.0799, "step": 2948 }, { "epoch": 0.95, "grad_norm": 0.23252037167549133, "learning_rate": 1.966406817142896e-06, "loss": 1.1601, "step": 2949 }, { "epoch": 0.95, "grad_norm": 0.21538683772087097, "learning_rate": 1.9411703662998768e-06, "loss": 0.9962, "step": 2950 }, { "epoch": 0.95, "grad_norm": 0.20532137155532837, "learning_rate": 1.916095848009208e-06, "loss": 1.3044, "step": 2951 }, { "epoch": 0.95, "grad_norm": 0.21251997351646423, "learning_rate": 1.891183289695003e-06, "loss": 1.2521, "step": 2952 }, { "epoch": 0.95, "grad_norm": 0.2230801284313202, "learning_rate": 1.8664327186042328e-06, "loss": 1.1007, "step": 2953 }, { "epoch": 0.95, "grad_norm": 0.2749767601490021, "learning_rate": 1.8418441618066937e-06, "loss": 1.212, "step": 2954 }, { "epoch": 0.95, "grad_norm": 0.22611796855926514, "learning_rate": 1.817417646194974e-06, "loss": 1.3546, "step": 2955 }, { "epoch": 0.95, "grad_norm": 0.2187192738056183, "learning_rate": 1.793153198484487e-06, "loss": 1.0389, "step": 2956 }, { "epoch": 0.95, "grad_norm": 0.29812750220298767, "learning_rate": 1.7690508452133378e-06, "loss": 1.3802, "step": 2957 }, { "epoch": 0.95, "grad_norm": 0.21622705459594727, "learning_rate": 1.7451106127423896e-06, "loss": 1.1911, "step": 2958 }, { "epoch": 0.95, "grad_norm": 0.21071727573871613, "learning_rate": 1.7213325272551482e-06, "loss": 1.309, "step": 2959 }, { "epoch": 0.95, "grad_norm": 0.20969344675540924, "learning_rate": 1.6977166147577936e-06, "loss": 1.2132, "step": 2960 }, { "epoch": 0.95, "grad_norm": 0.25961506366729736, "learning_rate": 1.6742629010791986e-06, "loss": 1.216, "step": 2961 }, { "epoch": 0.95, "grad_norm": 0.20921078324317932, "learning_rate": 1.650971411870744e-06, "loss": 1.3436, "step": 2962 }, { "epoch": 0.95, "grad_norm": 0.22874806821346283, "learning_rate": 1.6278421726064362e-06, "loss": 1.1904, "step": 2963 }, { "epoch": 0.95, "grad_norm": 0.20848116278648376, "learning_rate": 1.6048752085828232e-06, "loss": 1.2953, "step": 2964 }, { "epoch": 0.95, "grad_norm": 0.20504917204380035, "learning_rate": 1.5820705449189619e-06, "loss": 1.1419, "step": 2965 }, { "epoch": 0.96, "grad_norm": 0.2500324845314026, "learning_rate": 1.559428206556418e-06, "loss": 1.4543, "step": 2966 }, { "epoch": 0.96, "grad_norm": 0.20792636275291443, "learning_rate": 1.5369482182592152e-06, "loss": 1.3494, "step": 2967 }, { "epoch": 0.96, "grad_norm": 0.2669319212436676, "learning_rate": 1.5146306046137869e-06, "loss": 1.0543, "step": 2968 }, { "epoch": 0.96, "grad_norm": 0.2273537814617157, "learning_rate": 1.4924753900290077e-06, "loss": 0.9534, "step": 2969 }, { "epoch": 0.96, "grad_norm": 0.194903165102005, "learning_rate": 1.4704825987361446e-06, "loss": 1.2697, "step": 2970 }, { "epoch": 0.96, "grad_norm": 0.19797708094120026, "learning_rate": 1.4486522547887736e-06, "loss": 1.4315, "step": 2971 }, { "epoch": 0.96, "grad_norm": 0.20695436000823975, "learning_rate": 1.4269843820628623e-06, "loss": 1.26, "step": 2972 }, { "epoch": 0.96, "grad_norm": 0.1990443766117096, "learning_rate": 1.4054790042566376e-06, "loss": 1.3469, "step": 2973 }, { "epoch": 0.96, "grad_norm": 0.21150019764900208, "learning_rate": 1.3841361448906186e-06, "loss": 1.1229, "step": 2974 }, { "epoch": 0.96, "grad_norm": 0.1990777850151062, "learning_rate": 1.3629558273075658e-06, "loss": 1.089, "step": 2975 }, { "epoch": 0.96, "grad_norm": 0.19422724843025208, "learning_rate": 1.341938074672516e-06, "loss": 0.9525, "step": 2976 }, { "epoch": 0.96, "grad_norm": 0.2207486629486084, "learning_rate": 1.3210829099726482e-06, "loss": 1.3554, "step": 2977 }, { "epoch": 0.96, "grad_norm": 0.2041798084974289, "learning_rate": 1.3003903560173335e-06, "loss": 1.2337, "step": 2978 }, { "epoch": 0.96, "grad_norm": 0.21051788330078125, "learning_rate": 1.279860435438118e-06, "loss": 1.3796, "step": 2979 }, { "epoch": 0.96, "grad_norm": 0.20771023631095886, "learning_rate": 1.2594931706886579e-06, "loss": 1.1427, "step": 2980 }, { "epoch": 0.96, "grad_norm": 0.21032074093818665, "learning_rate": 1.2392885840447174e-06, "loss": 1.1962, "step": 2981 }, { "epoch": 0.96, "grad_norm": 0.20551161468029022, "learning_rate": 1.2192466976041371e-06, "loss": 1.1626, "step": 2982 }, { "epoch": 0.96, "grad_norm": 0.21574048697948456, "learning_rate": 1.1993675332867825e-06, "loss": 1.1774, "step": 2983 }, { "epoch": 0.96, "grad_norm": 0.20330537855625153, "learning_rate": 1.1796511128346286e-06, "loss": 1.1013, "step": 2984 }, { "epoch": 0.96, "grad_norm": 0.2227029651403427, "learning_rate": 1.1600974578115595e-06, "loss": 1.1061, "step": 2985 }, { "epoch": 0.96, "grad_norm": 0.21662212908267975, "learning_rate": 1.1407065896035184e-06, "loss": 1.3045, "step": 2986 }, { "epoch": 0.96, "grad_norm": 0.20878757536411285, "learning_rate": 1.1214785294183736e-06, "loss": 1.162, "step": 2987 }, { "epoch": 0.96, "grad_norm": 0.23158998787403107, "learning_rate": 1.1024132982859035e-06, "loss": 1.4806, "step": 2988 }, { "epoch": 0.96, "grad_norm": 0.19625507295131683, "learning_rate": 1.0835109170578782e-06, "loss": 1.03, "step": 2989 }, { "epoch": 0.96, "grad_norm": 0.23137478530406952, "learning_rate": 1.0647714064078605e-06, "loss": 1.1962, "step": 2990 }, { "epoch": 0.96, "grad_norm": 0.22324861586093903, "learning_rate": 1.0461947868313725e-06, "loss": 1.1992, "step": 2991 }, { "epoch": 0.96, "grad_norm": 0.24606914818286896, "learning_rate": 1.0277810786457285e-06, "loss": 1.1877, "step": 2992 }, { "epoch": 0.96, "grad_norm": 0.2578897178173065, "learning_rate": 1.0095303019900525e-06, "loss": 1.2905, "step": 2993 }, { "epoch": 0.96, "grad_norm": 0.20602759718894958, "learning_rate": 9.914424768253272e-07, "loss": 1.2595, "step": 2994 }, { "epoch": 0.96, "grad_norm": 0.18805988132953644, "learning_rate": 9.73517622934261e-07, "loss": 1.2649, "step": 2995 }, { "epoch": 0.96, "grad_norm": 0.20274625718593597, "learning_rate": 9.557557599213395e-07, "loss": 1.2206, "step": 2996 }, { "epoch": 0.97, "grad_norm": 0.22508542239665985, "learning_rate": 9.381569072128059e-07, "loss": 1.0212, "step": 2997 }, { "epoch": 0.97, "grad_norm": 0.20444688200950623, "learning_rate": 9.207210840565638e-07, "loss": 1.3358, "step": 2998 }, { "epoch": 0.97, "grad_norm": 0.20904405415058136, "learning_rate": 9.034483095222922e-07, "loss": 1.3483, "step": 2999 }, { "epoch": 0.97, "grad_norm": 0.2833104729652405, "learning_rate": 8.863386025012464e-07, "loss": 1.3531, "step": 3000 }, { "epoch": 0.97, "grad_norm": 0.22315222024917603, "learning_rate": 8.693919817064077e-07, "loss": 1.1517, "step": 3001 }, { "epoch": 0.97, "grad_norm": 0.22138257324695587, "learning_rate": 8.52608465672383e-07, "loss": 1.0855, "step": 3002 }, { "epoch": 0.97, "grad_norm": 0.22780625522136688, "learning_rate": 8.359880727553391e-07, "loss": 1.3235, "step": 3003 }, { "epoch": 0.97, "grad_norm": 0.18408280611038208, "learning_rate": 8.195308211330853e-07, "loss": 1.1421, "step": 3004 }, { "epoch": 0.97, "grad_norm": 0.2709517776966095, "learning_rate": 8.032367288049734e-07, "loss": 1.3299, "step": 3005 }, { "epoch": 0.97, "grad_norm": 0.23830865323543549, "learning_rate": 7.871058135919483e-07, "loss": 1.2218, "step": 3006 }, { "epoch": 0.97, "grad_norm": 0.21717938780784607, "learning_rate": 7.711380931364308e-07, "loss": 1.2198, "step": 3007 }, { "epoch": 0.97, "grad_norm": 0.1960144191980362, "learning_rate": 7.553335849023845e-07, "loss": 1.0983, "step": 3008 }, { "epoch": 0.97, "grad_norm": 0.18898122012615204, "learning_rate": 7.396923061752658e-07, "loss": 1.1263, "step": 3009 }, { "epoch": 0.97, "grad_norm": 0.18626967072486877, "learning_rate": 7.242142740620238e-07, "loss": 1.3829, "step": 3010 }, { "epoch": 0.97, "grad_norm": 0.21338540315628052, "learning_rate": 7.08899505491034e-07, "loss": 1.2803, "step": 3011 }, { "epoch": 0.97, "grad_norm": 0.3038027584552765, "learning_rate": 6.937480172121146e-07, "loss": 1.0852, "step": 3012 }, { "epoch": 0.97, "grad_norm": 0.2645401954650879, "learning_rate": 6.787598257965265e-07, "loss": 1.0977, "step": 3013 }, { "epoch": 0.97, "grad_norm": 0.2623496651649475, "learning_rate": 6.639349476369071e-07, "loss": 1.2292, "step": 3014 }, { "epoch": 0.97, "grad_norm": 0.21809649467468262, "learning_rate": 6.492733989472864e-07, "loss": 1.1677, "step": 3015 }, { "epoch": 0.97, "grad_norm": 0.19584286212921143, "learning_rate": 6.347751957630542e-07, "loss": 0.9689, "step": 3016 }, { "epoch": 0.97, "grad_norm": 0.19132545590400696, "learning_rate": 6.204403539409597e-07, "loss": 1.0981, "step": 3017 }, { "epoch": 0.97, "grad_norm": 0.24526198208332062, "learning_rate": 6.06268889159095e-07, "loss": 1.0811, "step": 3018 }, { "epoch": 0.97, "grad_norm": 0.19747696816921234, "learning_rate": 5.922608169168285e-07, "loss": 1.0905, "step": 3019 }, { "epoch": 0.97, "grad_norm": 0.2044946700334549, "learning_rate": 5.784161525348552e-07, "loss": 1.4308, "step": 3020 }, { "epoch": 0.97, "grad_norm": 0.2034980058670044, "learning_rate": 5.647349111551458e-07, "loss": 1.2787, "step": 3021 }, { "epoch": 0.97, "grad_norm": 0.18381592631340027, "learning_rate": 5.512171077409145e-07, "loss": 0.9743, "step": 3022 }, { "epoch": 0.97, "grad_norm": 0.18453752994537354, "learning_rate": 5.378627570766514e-07, "loss": 1.1959, "step": 3023 }, { "epoch": 0.97, "grad_norm": 0.20288006961345673, "learning_rate": 5.246718737680732e-07, "loss": 0.8786, "step": 3024 }, { "epoch": 0.97, "grad_norm": 0.25132229924201965, "learning_rate": 5.116444722420898e-07, "loss": 1.3068, "step": 3025 }, { "epoch": 0.97, "grad_norm": 0.25415992736816406, "learning_rate": 4.987805667468203e-07, "loss": 1.0893, "step": 3026 }, { "epoch": 0.97, "grad_norm": 0.25896620750427246, "learning_rate": 4.860801713515772e-07, "loss": 1.1291, "step": 3027 }, { "epoch": 0.98, "grad_norm": 0.22166511416435242, "learning_rate": 4.735432999468325e-07, "loss": 1.3411, "step": 3028 }, { "epoch": 0.98, "grad_norm": 0.2268417924642563, "learning_rate": 4.611699662442181e-07, "loss": 1.3041, "step": 3029 }, { "epoch": 0.98, "grad_norm": 0.36666619777679443, "learning_rate": 4.4896018377650887e-07, "loss": 1.0772, "step": 3030 }, { "epoch": 0.98, "grad_norm": 0.31620877981185913, "learning_rate": 4.3691396589758953e-07, "loss": 1.2846, "step": 3031 }, { "epoch": 0.98, "grad_norm": 0.23643505573272705, "learning_rate": 4.250313257824378e-07, "loss": 1.2675, "step": 3032 }, { "epoch": 0.98, "grad_norm": 0.22231833636760712, "learning_rate": 4.133122764271746e-07, "loss": 1.0722, "step": 3033 }, { "epoch": 0.98, "grad_norm": 0.18659643828868866, "learning_rate": 4.0175683064894715e-07, "loss": 1.21, "step": 3034 }, { "epoch": 0.98, "grad_norm": 0.2432122379541397, "learning_rate": 3.9036500108601264e-07, "loss": 1.308, "step": 3035 }, { "epoch": 0.98, "grad_norm": 0.24083353579044342, "learning_rate": 3.791368001976547e-07, "loss": 1.1654, "step": 3036 }, { "epoch": 0.98, "grad_norm": 0.190092995762825, "learning_rate": 3.6807224026421664e-07, "loss": 1.3229, "step": 3037 }, { "epoch": 0.98, "grad_norm": 0.19974614679813385, "learning_rate": 3.5717133338703517e-07, "loss": 1.2828, "step": 3038 }, { "epoch": 0.98, "grad_norm": 0.19391106069087982, "learning_rate": 3.4643409148847334e-07, "loss": 1.0998, "step": 3039 }, { "epoch": 0.98, "grad_norm": 0.1979617178440094, "learning_rate": 3.358605263119041e-07, "loss": 1.233, "step": 3040 }, { "epoch": 0.98, "grad_norm": 0.23216165602207184, "learning_rate": 3.254506494216935e-07, "loss": 1.3534, "step": 3041 }, { "epoch": 0.98, "grad_norm": 0.21311341226100922, "learning_rate": 3.1520447220315104e-07, "loss": 1.1242, "step": 3042 }, { "epoch": 0.98, "grad_norm": 0.26359251141548157, "learning_rate": 3.0512200586257917e-07, "loss": 1.1582, "step": 3043 }, { "epoch": 0.98, "grad_norm": 0.22696076333522797, "learning_rate": 2.952032614271904e-07, "loss": 1.1132, "step": 3044 }, { "epoch": 0.98, "grad_norm": 0.22511805593967438, "learning_rate": 2.854482497451571e-07, "loss": 1.324, "step": 3045 }, { "epoch": 0.98, "grad_norm": 0.26600709557533264, "learning_rate": 2.7585698148559487e-07, "loss": 1.1387, "step": 3046 }, { "epoch": 0.98, "grad_norm": 0.20513179898262024, "learning_rate": 2.664294671384959e-07, "loss": 1.2238, "step": 3047 }, { "epoch": 0.98, "grad_norm": 0.19514255225658417, "learning_rate": 2.571657170147956e-07, "loss": 1.29, "step": 3048 }, { "epoch": 0.98, "grad_norm": 0.20239292085170746, "learning_rate": 2.4806574124627274e-07, "loss": 1.1726, "step": 3049 }, { "epoch": 0.98, "grad_norm": 0.2402336299419403, "learning_rate": 2.391295497855994e-07, "loss": 1.2342, "step": 3050 }, { "epoch": 0.98, "grad_norm": 0.18020671606063843, "learning_rate": 2.3035715240635744e-07, "loss": 1.1531, "step": 3051 }, { "epoch": 0.98, "grad_norm": 0.19556179642677307, "learning_rate": 2.2174855870293885e-07, "loss": 1.2562, "step": 3052 }, { "epoch": 0.98, "grad_norm": 0.20109932124614716, "learning_rate": 2.1330377809059552e-07, "loss": 1.3968, "step": 3053 }, { "epoch": 0.98, "grad_norm": 0.21938496828079224, "learning_rate": 2.0502281980540602e-07, "loss": 1.35, "step": 3054 }, { "epoch": 0.98, "grad_norm": 0.20988674461841583, "learning_rate": 1.9690569290430892e-07, "loss": 0.9997, "step": 3055 }, { "epoch": 0.98, "grad_norm": 0.22504641115665436, "learning_rate": 1.8895240626503605e-07, "loss": 1.2903, "step": 3056 }, { "epoch": 0.98, "grad_norm": 0.29814696311950684, "learning_rate": 1.8116296858612934e-07, "loss": 1.2613, "step": 3057 }, { "epoch": 0.98, "grad_norm": 0.2208028882741928, "learning_rate": 1.73537388386924e-07, "loss": 1.2698, "step": 3058 }, { "epoch": 0.99, "grad_norm": 0.2123088389635086, "learning_rate": 1.6607567400754862e-07, "loss": 1.0734, "step": 3059 }, { "epoch": 0.99, "grad_norm": 0.16880770027637482, "learning_rate": 1.587778336089085e-07, "loss": 1.3347, "step": 3060 }, { "epoch": 0.99, "grad_norm": 0.24325545132160187, "learning_rate": 1.5164387517270226e-07, "loss": 1.0918, "step": 3061 }, { "epoch": 0.99, "grad_norm": 0.2142578512430191, "learning_rate": 1.4467380650133863e-07, "loss": 1.105, "step": 3062 }, { "epoch": 0.99, "grad_norm": 0.20524798333644867, "learning_rate": 1.3786763521803634e-07, "loss": 1.1003, "step": 3063 }, { "epoch": 0.99, "grad_norm": 0.20316936075687408, "learning_rate": 1.3122536876670754e-07, "loss": 1.3401, "step": 3064 }, { "epoch": 0.99, "grad_norm": 0.28031620383262634, "learning_rate": 1.247470144120577e-07, "loss": 1.1635, "step": 3065 }, { "epoch": 0.99, "grad_norm": 0.21004506945610046, "learning_rate": 1.1843257923945249e-07, "loss": 0.9996, "step": 3066 }, { "epoch": 0.99, "grad_norm": 0.1962009221315384, "learning_rate": 1.1228207015505086e-07, "loss": 1.1764, "step": 3067 }, { "epoch": 0.99, "grad_norm": 0.19936078786849976, "learning_rate": 1.0629549388567193e-07, "loss": 1.1319, "step": 3068 }, { "epoch": 0.99, "grad_norm": 0.22708888351917267, "learning_rate": 1.0047285697887819e-07, "loss": 1.3896, "step": 3069 }, { "epoch": 0.99, "grad_norm": 0.18494471907615662, "learning_rate": 9.481416580287559e-08, "loss": 1.2548, "step": 3070 }, { "epoch": 0.99, "grad_norm": 0.18663693964481354, "learning_rate": 8.93194265466135e-08, "loss": 1.2083, "step": 3071 }, { "epoch": 0.99, "grad_norm": 0.24993234872817993, "learning_rate": 8.39886452197347e-08, "loss": 1.105, "step": 3072 }, { "epoch": 0.99, "grad_norm": 0.1814039647579193, "learning_rate": 7.882182765250877e-08, "loss": 1.1958, "step": 3073 }, { "epoch": 0.99, "grad_norm": 0.2735080420970917, "learning_rate": 7.381897949591542e-08, "loss": 0.8613, "step": 3074 }, { "epoch": 0.99, "grad_norm": 0.21579164266586304, "learning_rate": 6.89801062215778e-08, "loss": 1.1875, "step": 3075 }, { "epoch": 0.99, "grad_norm": 0.24002602696418762, "learning_rate": 6.430521312181247e-08, "loss": 1.3696, "step": 3076 }, { "epoch": 0.99, "grad_norm": 0.1967988908290863, "learning_rate": 5.979430530956286e-08, "loss": 1.0893, "step": 3077 }, { "epoch": 0.99, "grad_norm": 0.2620498538017273, "learning_rate": 5.544738771841584e-08, "loss": 1.1432, "step": 3078 }, { "epoch": 0.99, "grad_norm": 0.2146095335483551, "learning_rate": 5.126446510263504e-08, "loss": 1.3043, "step": 3079 }, { "epoch": 0.99, "grad_norm": 0.19152632355690002, "learning_rate": 4.724554203706099e-08, "loss": 1.0175, "step": 3080 }, { "epoch": 0.99, "grad_norm": 0.22604241967201233, "learning_rate": 4.339062291726092e-08, "loss": 1.1442, "step": 3081 }, { "epoch": 0.99, "grad_norm": 0.23163717985153198, "learning_rate": 3.969971195932897e-08, "loss": 1.3788, "step": 3082 }, { "epoch": 0.99, "grad_norm": 0.21221813559532166, "learning_rate": 3.617281320005272e-08, "loss": 1.3317, "step": 3083 }, { "epoch": 0.99, "grad_norm": 0.43326812982559204, "learning_rate": 3.2809930496813244e-08, "loss": 1.1744, "step": 3084 }, { "epoch": 0.99, "grad_norm": 0.22502721846103668, "learning_rate": 2.9611067527601782e-08, "loss": 1.1797, "step": 3085 }, { "epoch": 0.99, "grad_norm": 0.19872401654720306, "learning_rate": 2.657622779103641e-08, "loss": 1.2019, "step": 3086 }, { "epoch": 0.99, "grad_norm": 0.2525040805339813, "learning_rate": 2.370541460632869e-08, "loss": 1.3437, "step": 3087 }, { "epoch": 0.99, "grad_norm": 0.20215272903442383, "learning_rate": 2.0998631113300358e-08, "loss": 1.3073, "step": 3088 }, { "epoch": 0.99, "grad_norm": 0.23610876500606537, "learning_rate": 1.8455880272366662e-08, "loss": 1.1734, "step": 3089 }, { "epoch": 1.0, "grad_norm": 0.19176597893238068, "learning_rate": 1.6077164864553015e-08, "loss": 1.197, "step": 3090 }, { "epoch": 1.0, "grad_norm": 0.19546709954738617, "learning_rate": 1.3862487491461683e-08, "loss": 1.181, "step": 3091 }, { "epoch": 1.0, "grad_norm": 0.20127837359905243, "learning_rate": 1.1811850575321747e-08, "loss": 1.468, "step": 3092 }, { "epoch": 1.0, "grad_norm": 0.19144563376903534, "learning_rate": 9.925256358872535e-09, "loss": 1.2531, "step": 3093 }, { "epoch": 1.0, "grad_norm": 0.19337943196296692, "learning_rate": 8.202706905530154e-09, "loss": 0.9921, "step": 3094 }, { "epoch": 1.0, "grad_norm": 0.2259693294763565, "learning_rate": 6.644204099254258e-09, "loss": 1.1109, "step": 3095 }, { "epoch": 1.0, "grad_norm": 0.21101686358451843, "learning_rate": 5.249749644564705e-09, "loss": 1.2347, "step": 3096 }, { "epoch": 1.0, "grad_norm": 0.2066415697336197, "learning_rate": 4.019345066591517e-09, "loss": 1.1942, "step": 3097 }, { "epoch": 1.0, "grad_norm": 0.19787804782390594, "learning_rate": 2.952991711024921e-09, "loss": 1.2001, "step": 3098 }, { "epoch": 1.0, "grad_norm": 0.21120736002922058, "learning_rate": 2.0506907441653065e-09, "loss": 0.9996, "step": 3099 }, { "epoch": 1.0, "grad_norm": 0.2854808270931244, "learning_rate": 1.312443152823306e-09, "loss": 1.5483, "step": 3100 }, { "epoch": 1.0, "grad_norm": 0.20577532052993774, "learning_rate": 7.382497444696766e-10, "loss": 1.3072, "step": 3101 }, { "epoch": 1.0, "grad_norm": 0.24084864556789398, "learning_rate": 3.2811114706876584e-10, "loss": 1.2882, "step": 3102 }, { "epoch": 1.0, "grad_norm": 0.21301700174808502, "learning_rate": 8.202780919508434e-11, "loss": 1.3753, "step": 3103 }, { "epoch": 1.0, "grad_norm": 0.2754041254520416, "learning_rate": 0.0, "loss": 1.2179, "step": 3104 }, { "epoch": 1.0, "step": 3104, "total_flos": 2.1007237300220854e+19, "train_loss": 1.238065081082054, "train_runtime": 202729.2626, "train_samples_per_second": 0.061, "train_steps_per_second": 0.015 } ], "logging_steps": 1.0, "max_steps": 3104, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 2.1007237300220854e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }