{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9979836168872085, "eval_steps": 199, "global_step": 3968, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0005040322580645161, "grad_norm": 16.202816979237873, "learning_rate": 1e-08, "loss": 0.4122, "step": 1 }, { "epoch": 0.0005040322580645161, "eval_loss": 0.443972110748291, "eval_runtime": 17.2668, "eval_samples_per_second": 49.517, "eval_steps_per_second": 1.042, "step": 1 }, { "epoch": 0.0010080645161290322, "grad_norm": 16.205315003883026, "learning_rate": 2e-08, "loss": 0.4183, "step": 2 }, { "epoch": 0.0015120967741935483, "grad_norm": 15.468979543825476, "learning_rate": 3e-08, "loss": 0.4182, "step": 3 }, { "epoch": 0.0020161290322580645, "grad_norm": 15.468273739112808, "learning_rate": 4e-08, "loss": 0.402, "step": 4 }, { "epoch": 0.0025201612903225806, "grad_norm": 15.057464396772035, "learning_rate": 5e-08, "loss": 0.4211, "step": 5 }, { "epoch": 0.0030241935483870967, "grad_norm": 15.204132593848971, "learning_rate": 6e-08, "loss": 0.4134, "step": 6 }, { "epoch": 0.003528225806451613, "grad_norm": 17.615532773933918, "learning_rate": 7e-08, "loss": 0.3878, "step": 7 }, { "epoch": 0.004032258064516129, "grad_norm": 16.340341169724105, "learning_rate": 8e-08, "loss": 0.3982, "step": 8 }, { "epoch": 0.0045362903225806455, "grad_norm": 15.059609779441438, "learning_rate": 9e-08, "loss": 0.4178, "step": 9 }, { "epoch": 0.005040322580645161, "grad_norm": 17.296721168903797, "learning_rate": 1e-07, "loss": 0.4115, "step": 10 }, { "epoch": 0.005544354838709678, "grad_norm": 17.3187697691886, "learning_rate": 1.0999999999999999e-07, "loss": 0.4104, "step": 11 }, { "epoch": 0.006048387096774193, "grad_norm": 18.030550149312244, "learning_rate": 1.2e-07, "loss": 0.4104, "step": 12 }, { "epoch": 0.00655241935483871, "grad_norm": 14.002199418274493, "learning_rate": 1.3e-07, "loss": 0.4047, "step": 13 }, { "epoch": 0.007056451612903226, "grad_norm": 153.88554829319122, "learning_rate": 1.4e-07, "loss": 0.3935, "step": 14 }, { "epoch": 0.007560483870967742, "grad_norm": 6.155317751584709, "learning_rate": 1.5e-07, "loss": 0.3934, "step": 15 }, { "epoch": 0.008064516129032258, "grad_norm": 8.516052807530963, "learning_rate": 1.6e-07, "loss": 0.3992, "step": 16 }, { "epoch": 0.008568548387096774, "grad_norm": 12.572451947505744, "learning_rate": 1.7000000000000001e-07, "loss": 0.4091, "step": 17 }, { "epoch": 0.009072580645161291, "grad_norm": 26.666526989945357, "learning_rate": 1.8e-07, "loss": 0.4194, "step": 18 }, { "epoch": 0.009576612903225807, "grad_norm": 43.96981776742769, "learning_rate": 1.8999999999999998e-07, "loss": 0.4073, "step": 19 }, { "epoch": 0.010080645161290322, "grad_norm": 58.38481172991244, "learning_rate": 2e-07, "loss": 0.4135, "step": 20 }, { "epoch": 0.010584677419354838, "grad_norm": 8.92743387072961, "learning_rate": 2.0999999999999997e-07, "loss": 0.4233, "step": 21 }, { "epoch": 0.011088709677419355, "grad_norm": 7.114049513845685, "learning_rate": 2.1999999999999998e-07, "loss": 0.3915, "step": 22 }, { "epoch": 0.011592741935483871, "grad_norm": 7.5930944951978825, "learning_rate": 2.3e-07, "loss": 0.4119, "step": 23 }, { "epoch": 0.012096774193548387, "grad_norm": 17.847773187914957, "learning_rate": 2.4e-07, "loss": 0.4035, "step": 24 }, { "epoch": 0.012600806451612902, "grad_norm": 14.342646251100152, "learning_rate": 2.5e-07, "loss": 0.4023, "step": 25 }, { "epoch": 0.01310483870967742, "grad_norm": 5.653916048149905, "learning_rate": 2.6e-07, "loss": 0.3999, "step": 26 }, { "epoch": 0.013608870967741936, "grad_norm": 4.978130519093393, "learning_rate": 2.7e-07, "loss": 0.4012, "step": 27 }, { "epoch": 0.014112903225806451, "grad_norm": 5.1928104536034345, "learning_rate": 2.8e-07, "loss": 0.3835, "step": 28 }, { "epoch": 0.014616935483870967, "grad_norm": 5.713989015081132, "learning_rate": 2.9e-07, "loss": 0.4207, "step": 29 }, { "epoch": 0.015120967741935484, "grad_norm": 10.819751935660008, "learning_rate": 3e-07, "loss": 0.3895, "step": 30 }, { "epoch": 0.015625, "grad_norm": 9.415309461004313, "learning_rate": 3.1e-07, "loss": 0.4104, "step": 31 }, { "epoch": 0.016129032258064516, "grad_norm": 3.283165538142218, "learning_rate": 3.2e-07, "loss": 0.4182, "step": 32 }, { "epoch": 0.01663306451612903, "grad_norm": 6.31824176396841, "learning_rate": 3.3e-07, "loss": 0.3982, "step": 33 }, { "epoch": 0.017137096774193547, "grad_norm": 2.471098407656219, "learning_rate": 3.4000000000000003e-07, "loss": 0.4051, "step": 34 }, { "epoch": 0.017641129032258066, "grad_norm": 3.7875313205613943, "learning_rate": 3.5e-07, "loss": 0.3897, "step": 35 }, { "epoch": 0.018145161290322582, "grad_norm": 21.98400784558835, "learning_rate": 3.6e-07, "loss": 0.398, "step": 36 }, { "epoch": 0.018649193548387098, "grad_norm": 2.5064868793159714, "learning_rate": 3.7e-07, "loss": 0.3951, "step": 37 }, { "epoch": 0.019153225806451613, "grad_norm": 24.059819417771145, "learning_rate": 3.7999999999999996e-07, "loss": 0.3989, "step": 38 }, { "epoch": 0.01965725806451613, "grad_norm": 6.95393911781356, "learning_rate": 3.8999999999999997e-07, "loss": 0.4014, "step": 39 }, { "epoch": 0.020161290322580645, "grad_norm": 2.8868850874278182, "learning_rate": 4e-07, "loss": 0.3864, "step": 40 }, { "epoch": 0.02066532258064516, "grad_norm": 2.1387916921557624, "learning_rate": 4.0999999999999994e-07, "loss": 0.3765, "step": 41 }, { "epoch": 0.021169354838709676, "grad_norm": 2.627363698866213, "learning_rate": 4.1999999999999995e-07, "loss": 0.3819, "step": 42 }, { "epoch": 0.021673387096774195, "grad_norm": 4.399523991039246, "learning_rate": 4.2999999999999996e-07, "loss": 0.396, "step": 43 }, { "epoch": 0.02217741935483871, "grad_norm": 6.2743195319461265, "learning_rate": 4.3999999999999997e-07, "loss": 0.3719, "step": 44 }, { "epoch": 0.022681451612903226, "grad_norm": 3.5237571635789706, "learning_rate": 4.5e-07, "loss": 0.3787, "step": 45 }, { "epoch": 0.023185483870967742, "grad_norm": 4.8083225470670845, "learning_rate": 4.6e-07, "loss": 0.3842, "step": 46 }, { "epoch": 0.023689516129032258, "grad_norm": 4.2342259419992985, "learning_rate": 4.6999999999999995e-07, "loss": 0.3921, "step": 47 }, { "epoch": 0.024193548387096774, "grad_norm": 5.233597040220851, "learning_rate": 4.8e-07, "loss": 0.3803, "step": 48 }, { "epoch": 0.02469758064516129, "grad_norm": 3.3462375668312427, "learning_rate": 4.9e-07, "loss": 0.3762, "step": 49 }, { "epoch": 0.025201612903225805, "grad_norm": 5.738883941273613, "learning_rate": 5e-07, "loss": 0.3691, "step": 50 }, { "epoch": 0.025705645161290324, "grad_norm": 3.4906691092581608, "learning_rate": 5.1e-07, "loss": 0.3927, "step": 51 }, { "epoch": 0.02620967741935484, "grad_norm": 3.1684109647850445, "learning_rate": 5.2e-07, "loss": 0.3649, "step": 52 }, { "epoch": 0.026713709677419355, "grad_norm": 2.1354179898716037, "learning_rate": 5.3e-07, "loss": 0.3982, "step": 53 }, { "epoch": 0.02721774193548387, "grad_norm": 9.732639588208007, "learning_rate": 5.4e-07, "loss": 0.3897, "step": 54 }, { "epoch": 0.027721774193548387, "grad_norm": 1.114588852331627, "learning_rate": 5.5e-07, "loss": 0.3668, "step": 55 }, { "epoch": 0.028225806451612902, "grad_norm": 6.114641493595002, "learning_rate": 5.6e-07, "loss": 0.377, "step": 56 }, { "epoch": 0.028729838709677418, "grad_norm": 1.3354617010610528, "learning_rate": 5.699999999999999e-07, "loss": 0.38, "step": 57 }, { "epoch": 0.029233870967741934, "grad_norm": 1.2850750187972262, "learning_rate": 5.8e-07, "loss": 0.3847, "step": 58 }, { "epoch": 0.029737903225806453, "grad_norm": 1.3750785124101343, "learning_rate": 5.9e-07, "loss": 0.3592, "step": 59 }, { "epoch": 0.03024193548387097, "grad_norm": 2.9929928043047975, "learning_rate": 6e-07, "loss": 0.3633, "step": 60 }, { "epoch": 0.030745967741935484, "grad_norm": 6.301747495454331, "learning_rate": 6.1e-07, "loss": 0.3766, "step": 61 }, { "epoch": 0.03125, "grad_norm": 2.1168564747094076, "learning_rate": 6.2e-07, "loss": 0.3713, "step": 62 }, { "epoch": 0.031754032258064516, "grad_norm": 0.7578830680461082, "learning_rate": 6.3e-07, "loss": 0.369, "step": 63 }, { "epoch": 0.03225806451612903, "grad_norm": 1.6734191696925034, "learning_rate": 6.4e-07, "loss": 0.366, "step": 64 }, { "epoch": 0.03276209677419355, "grad_norm": 1.2789631524237748, "learning_rate": 6.5e-07, "loss": 0.3617, "step": 65 }, { "epoch": 0.03326612903225806, "grad_norm": 0.9523361493720094, "learning_rate": 6.6e-07, "loss": 0.3383, "step": 66 }, { "epoch": 0.03377016129032258, "grad_norm": 1.166610477006102, "learning_rate": 6.7e-07, "loss": 0.3626, "step": 67 }, { "epoch": 0.034274193548387094, "grad_norm": 1.2998523358147898, "learning_rate": 6.800000000000001e-07, "loss": 0.3521, "step": 68 }, { "epoch": 0.03477822580645161, "grad_norm": 1.4526130491588676, "learning_rate": 6.9e-07, "loss": 0.3464, "step": 69 }, { "epoch": 0.03528225806451613, "grad_norm": 0.57853677880428, "learning_rate": 7e-07, "loss": 0.3459, "step": 70 }, { "epoch": 0.03578629032258065, "grad_norm": 1.268886862963294, "learning_rate": 7.1e-07, "loss": 0.3658, "step": 71 }, { "epoch": 0.036290322580645164, "grad_norm": 1.3395158890738286, "learning_rate": 7.2e-07, "loss": 0.3382, "step": 72 }, { "epoch": 0.03679435483870968, "grad_norm": 0.7066691364777967, "learning_rate": 7.3e-07, "loss": 0.3484, "step": 73 }, { "epoch": 0.037298387096774195, "grad_norm": 0.8562245745506359, "learning_rate": 7.4e-07, "loss": 0.3696, "step": 74 }, { "epoch": 0.03780241935483871, "grad_norm": 1.583149325798458, "learning_rate": 7.5e-07, "loss": 0.352, "step": 75 }, { "epoch": 0.038306451612903226, "grad_norm": 0.4718209553213981, "learning_rate": 7.599999999999999e-07, "loss": 0.3474, "step": 76 }, { "epoch": 0.03881048387096774, "grad_norm": 16.741995431233445, "learning_rate": 7.699999999999999e-07, "loss": 0.3514, "step": 77 }, { "epoch": 0.03931451612903226, "grad_norm": 0.5609500021734007, "learning_rate": 7.799999999999999e-07, "loss": 0.3562, "step": 78 }, { "epoch": 0.039818548387096774, "grad_norm": 0.5482085713967506, "learning_rate": 7.9e-07, "loss": 0.3571, "step": 79 }, { "epoch": 0.04032258064516129, "grad_norm": 0.5862216706734081, "learning_rate": 8e-07, "loss": 0.3775, "step": 80 }, { "epoch": 0.040826612903225805, "grad_norm": 0.9433271815134637, "learning_rate": 8.1e-07, "loss": 0.3622, "step": 81 }, { "epoch": 0.04133064516129032, "grad_norm": 2.312304687609339, "learning_rate": 8.199999999999999e-07, "loss": 0.3354, "step": 82 }, { "epoch": 0.041834677419354836, "grad_norm": 0.7657086859357075, "learning_rate": 8.299999999999999e-07, "loss": 0.3386, "step": 83 }, { "epoch": 0.04233870967741935, "grad_norm": 0.5693450749645984, "learning_rate": 8.399999999999999e-07, "loss": 0.336, "step": 84 }, { "epoch": 0.04284274193548387, "grad_norm": 0.7237524467723284, "learning_rate": 8.499999999999999e-07, "loss": 0.3574, "step": 85 }, { "epoch": 0.04334677419354839, "grad_norm": 1.6741553791841324, "learning_rate": 8.599999999999999e-07, "loss": 0.3678, "step": 86 }, { "epoch": 0.043850806451612906, "grad_norm": 1.066623540734941, "learning_rate": 8.699999999999999e-07, "loss": 0.3528, "step": 87 }, { "epoch": 0.04435483870967742, "grad_norm": 0.5236790478758251, "learning_rate": 8.799999999999999e-07, "loss": 0.3485, "step": 88 }, { "epoch": 0.04485887096774194, "grad_norm": 0.37322566471667024, "learning_rate": 8.9e-07, "loss": 0.3403, "step": 89 }, { "epoch": 0.04536290322580645, "grad_norm": 0.37334552504540236, "learning_rate": 9e-07, "loss": 0.334, "step": 90 }, { "epoch": 0.04586693548387097, "grad_norm": 0.6233477368274808, "learning_rate": 9.1e-07, "loss": 0.3577, "step": 91 }, { "epoch": 0.046370967741935484, "grad_norm": 0.380512676596753, "learning_rate": 9.2e-07, "loss": 0.3437, "step": 92 }, { "epoch": 0.046875, "grad_norm": 0.7218988883748737, "learning_rate": 9.3e-07, "loss": 0.3622, "step": 93 }, { "epoch": 0.047379032258064516, "grad_norm": 1.0217493525557952, "learning_rate": 9.399999999999999e-07, "loss": 0.3249, "step": 94 }, { "epoch": 0.04788306451612903, "grad_norm": 0.8160482348861993, "learning_rate": 9.499999999999999e-07, "loss": 0.3569, "step": 95 }, { "epoch": 0.04838709677419355, "grad_norm": 1.6620382396716078, "learning_rate": 9.6e-07, "loss": 0.3325, "step": 96 }, { "epoch": 0.04889112903225806, "grad_norm": 0.9899117085666442, "learning_rate": 9.7e-07, "loss": 0.3565, "step": 97 }, { "epoch": 0.04939516129032258, "grad_norm": 0.6672181715686732, "learning_rate": 9.8e-07, "loss": 0.3383, "step": 98 }, { "epoch": 0.049899193548387094, "grad_norm": 0.8490767132728996, "learning_rate": 9.9e-07, "loss": 0.3628, "step": 99 }, { "epoch": 0.05040322580645161, "grad_norm": 3.230091455346306, "learning_rate": 1e-06, "loss": 0.3525, "step": 100 }, { "epoch": 0.05090725806451613, "grad_norm": 1.0048400815002736, "learning_rate": 9.999999278765487e-07, "loss": 0.3465, "step": 101 }, { "epoch": 0.05141129032258065, "grad_norm": 0.46323136632477013, "learning_rate": 9.999997115062153e-07, "loss": 0.3613, "step": 102 }, { "epoch": 0.051915322580645164, "grad_norm": 1.1303903715708508, "learning_rate": 9.999993508890626e-07, "loss": 0.3439, "step": 103 }, { "epoch": 0.05241935483870968, "grad_norm": 0.8278300274368537, "learning_rate": 9.999988460251948e-07, "loss": 0.3364, "step": 104 }, { "epoch": 0.052923387096774195, "grad_norm": 0.8385630360604959, "learning_rate": 9.99998196914757e-07, "loss": 0.3341, "step": 105 }, { "epoch": 0.05342741935483871, "grad_norm": 0.5181207146190654, "learning_rate": 9.999974035579367e-07, "loss": 0.3504, "step": 106 }, { "epoch": 0.053931451612903226, "grad_norm": 0.867225679439543, "learning_rate": 9.999964659549629e-07, "loss": 0.3487, "step": 107 }, { "epoch": 0.05443548387096774, "grad_norm": 0.4359954195764285, "learning_rate": 9.999953841061059e-07, "loss": 0.3511, "step": 108 }, { "epoch": 0.05493951612903226, "grad_norm": 0.49236276321466904, "learning_rate": 9.99994158011678e-07, "loss": 0.3301, "step": 109 }, { "epoch": 0.055443548387096774, "grad_norm": 0.42334304009907897, "learning_rate": 9.999927876720327e-07, "loss": 0.3354, "step": 110 }, { "epoch": 0.05594758064516129, "grad_norm": 0.3717776940463779, "learning_rate": 9.999912730875654e-07, "loss": 0.3536, "step": 111 }, { "epoch": 0.056451612903225805, "grad_norm": 0.7134857290167043, "learning_rate": 9.999896142587133e-07, "loss": 0.3423, "step": 112 }, { "epoch": 0.05695564516129032, "grad_norm": 0.6589031582715488, "learning_rate": 9.999878111859545e-07, "loss": 0.3528, "step": 113 }, { "epoch": 0.057459677419354836, "grad_norm": 0.4715779115727926, "learning_rate": 9.999858638698095e-07, "loss": 0.331, "step": 114 }, { "epoch": 0.05796370967741935, "grad_norm": 1.8324714803517166, "learning_rate": 9.999837723108403e-07, "loss": 0.3361, "step": 115 }, { "epoch": 0.05846774193548387, "grad_norm": 3.4504051941803455, "learning_rate": 9.999815365096497e-07, "loss": 0.3381, "step": 116 }, { "epoch": 0.05897177419354839, "grad_norm": 0.7082367647106664, "learning_rate": 9.999791564668832e-07, "loss": 0.3357, "step": 117 }, { "epoch": 0.059475806451612906, "grad_norm": 0.8943478209896717, "learning_rate": 9.99976632183227e-07, "loss": 0.334, "step": 118 }, { "epoch": 0.05997983870967742, "grad_norm": 2.639935863362195, "learning_rate": 9.9997396365941e-07, "loss": 0.3515, "step": 119 }, { "epoch": 0.06048387096774194, "grad_norm": 0.5612525598880919, "learning_rate": 9.999711508962014e-07, "loss": 0.3421, "step": 120 }, { "epoch": 0.06098790322580645, "grad_norm": 1.0108383298218186, "learning_rate": 9.99968193894413e-07, "loss": 0.3485, "step": 121 }, { "epoch": 0.06149193548387097, "grad_norm": 1.5821951581009004, "learning_rate": 9.999650926548979e-07, "loss": 0.3293, "step": 122 }, { "epoch": 0.061995967741935484, "grad_norm": 1.1703013492954635, "learning_rate": 9.999618471785505e-07, "loss": 0.3725, "step": 123 }, { "epoch": 0.0625, "grad_norm": 0.3496060502874034, "learning_rate": 9.999584574663074e-07, "loss": 0.3401, "step": 124 }, { "epoch": 0.06300403225806452, "grad_norm": 0.6132017993310452, "learning_rate": 9.999549235191465e-07, "loss": 0.3455, "step": 125 }, { "epoch": 0.06350806451612903, "grad_norm": 1.5073872786250602, "learning_rate": 9.999512453380869e-07, "loss": 0.3389, "step": 126 }, { "epoch": 0.06401209677419355, "grad_norm": 1.539376597083665, "learning_rate": 9.999474229241903e-07, "loss": 0.3332, "step": 127 }, { "epoch": 0.06451612903225806, "grad_norm": 1.2824746138412246, "learning_rate": 9.99943456278559e-07, "loss": 0.3456, "step": 128 }, { "epoch": 0.06502016129032258, "grad_norm": 1.4717370768722402, "learning_rate": 9.99939345402338e-07, "loss": 0.3361, "step": 129 }, { "epoch": 0.0655241935483871, "grad_norm": 0.3607286264558088, "learning_rate": 9.999350902967124e-07, "loss": 0.3309, "step": 130 }, { "epoch": 0.06602822580645161, "grad_norm": 0.3812650155511185, "learning_rate": 9.999306909629103e-07, "loss": 0.3336, "step": 131 }, { "epoch": 0.06653225806451613, "grad_norm": 2.886726675491379, "learning_rate": 9.999261474022007e-07, "loss": 0.35, "step": 132 }, { "epoch": 0.06703629032258064, "grad_norm": 2.482458443393656, "learning_rate": 9.999214596158946e-07, "loss": 0.333, "step": 133 }, { "epoch": 0.06754032258064516, "grad_norm": 0.3364379109508271, "learning_rate": 9.999166276053442e-07, "loss": 0.3371, "step": 134 }, { "epoch": 0.06804435483870967, "grad_norm": 0.7506818894597985, "learning_rate": 9.999116513719434e-07, "loss": 0.3339, "step": 135 }, { "epoch": 0.06854838709677419, "grad_norm": 1.4202719086270377, "learning_rate": 9.999065309171282e-07, "loss": 0.3248, "step": 136 }, { "epoch": 0.0690524193548387, "grad_norm": 0.7484158287932788, "learning_rate": 9.999012662423754e-07, "loss": 0.3322, "step": 137 }, { "epoch": 0.06955645161290322, "grad_norm": 0.374995905774015, "learning_rate": 9.998958573492042e-07, "loss": 0.3388, "step": 138 }, { "epoch": 0.07006048387096774, "grad_norm": 3.1451635192774416, "learning_rate": 9.998903042391747e-07, "loss": 0.3308, "step": 139 }, { "epoch": 0.07056451612903226, "grad_norm": 4.100760115701697, "learning_rate": 9.99884606913889e-07, "loss": 0.3393, "step": 140 }, { "epoch": 0.07106854838709678, "grad_norm": 1.0116256210926982, "learning_rate": 9.99878765374991e-07, "loss": 0.3365, "step": 141 }, { "epoch": 0.0715725806451613, "grad_norm": 0.4898776317688679, "learning_rate": 9.998727796241657e-07, "loss": 0.3388, "step": 142 }, { "epoch": 0.07207661290322581, "grad_norm": 1.2664512508752508, "learning_rate": 9.9986664966314e-07, "loss": 0.3245, "step": 143 }, { "epoch": 0.07258064516129033, "grad_norm": 1.6834640389600903, "learning_rate": 9.998603754936825e-07, "loss": 0.3407, "step": 144 }, { "epoch": 0.07308467741935484, "grad_norm": 1.3464610251528206, "learning_rate": 9.99853957117603e-07, "loss": 0.3225, "step": 145 }, { "epoch": 0.07358870967741936, "grad_norm": 1.0984235521861139, "learning_rate": 9.998473945367535e-07, "loss": 0.3231, "step": 146 }, { "epoch": 0.07409274193548387, "grad_norm": 2.8158986365995156, "learning_rate": 9.998406877530267e-07, "loss": 0.3331, "step": 147 }, { "epoch": 0.07459677419354839, "grad_norm": 0.3788437430061732, "learning_rate": 9.998338367683583e-07, "loss": 0.3392, "step": 148 }, { "epoch": 0.0751008064516129, "grad_norm": 0.6183083703029865, "learning_rate": 9.99826841584724e-07, "loss": 0.3305, "step": 149 }, { "epoch": 0.07560483870967742, "grad_norm": 0.4095762625785864, "learning_rate": 9.998197022041422e-07, "loss": 0.3263, "step": 150 }, { "epoch": 0.07610887096774194, "grad_norm": 1.2756291414954701, "learning_rate": 9.998124186286724e-07, "loss": 0.3394, "step": 151 }, { "epoch": 0.07661290322580645, "grad_norm": 0.3597641387730504, "learning_rate": 9.998049908604163e-07, "loss": 0.3462, "step": 152 }, { "epoch": 0.07711693548387097, "grad_norm": 0.3430879414583386, "learning_rate": 9.997974189015163e-07, "loss": 0.337, "step": 153 }, { "epoch": 0.07762096774193548, "grad_norm": 1.1595831180201008, "learning_rate": 9.997897027541571e-07, "loss": 0.3372, "step": 154 }, { "epoch": 0.078125, "grad_norm": 0.48618010029983355, "learning_rate": 9.997818424205647e-07, "loss": 0.3371, "step": 155 }, { "epoch": 0.07862903225806452, "grad_norm": 0.39786757641752357, "learning_rate": 9.997738379030068e-07, "loss": 0.3314, "step": 156 }, { "epoch": 0.07913306451612903, "grad_norm": 1.7056412415922666, "learning_rate": 9.997656892037924e-07, "loss": 0.319, "step": 157 }, { "epoch": 0.07963709677419355, "grad_norm": 0.3653360435949616, "learning_rate": 9.997573963252725e-07, "loss": 0.3354, "step": 158 }, { "epoch": 0.08014112903225806, "grad_norm": 2.829079281128038, "learning_rate": 9.997489592698399e-07, "loss": 0.3318, "step": 159 }, { "epoch": 0.08064516129032258, "grad_norm": 1.2295812347072044, "learning_rate": 9.997403780399282e-07, "loss": 0.3494, "step": 160 }, { "epoch": 0.0811491935483871, "grad_norm": 0.8812794930898853, "learning_rate": 9.997316526380131e-07, "loss": 0.3344, "step": 161 }, { "epoch": 0.08165322580645161, "grad_norm": 0.8353193931880987, "learning_rate": 9.997227830666118e-07, "loss": 0.3305, "step": 162 }, { "epoch": 0.08215725806451613, "grad_norm": 0.9923756258213962, "learning_rate": 9.997137693282833e-07, "loss": 0.3421, "step": 163 }, { "epoch": 0.08266129032258064, "grad_norm": 2.476846910057538, "learning_rate": 9.99704611425628e-07, "loss": 0.3303, "step": 164 }, { "epoch": 0.08316532258064516, "grad_norm": 3.0540963644253294, "learning_rate": 9.996953093612877e-07, "loss": 0.3244, "step": 165 }, { "epoch": 0.08366935483870967, "grad_norm": 1.9232590922797959, "learning_rate": 9.99685863137946e-07, "loss": 0.3566, "step": 166 }, { "epoch": 0.08417338709677419, "grad_norm": 0.8404131782552129, "learning_rate": 9.996762727583285e-07, "loss": 0.3248, "step": 167 }, { "epoch": 0.0846774193548387, "grad_norm": 0.7858824684266251, "learning_rate": 9.996665382252014e-07, "loss": 0.3445, "step": 168 }, { "epoch": 0.08518145161290322, "grad_norm": 0.7353820488132891, "learning_rate": 9.996566595413734e-07, "loss": 0.3284, "step": 169 }, { "epoch": 0.08568548387096774, "grad_norm": 0.4062481131054341, "learning_rate": 9.996466367096943e-07, "loss": 0.3204, "step": 170 }, { "epoch": 0.08618951612903226, "grad_norm": 0.7805121538010222, "learning_rate": 9.996364697330555e-07, "loss": 0.3222, "step": 171 }, { "epoch": 0.08669354838709678, "grad_norm": 0.45723066112420185, "learning_rate": 9.996261586143904e-07, "loss": 0.3359, "step": 172 }, { "epoch": 0.0871975806451613, "grad_norm": 0.9178689167005624, "learning_rate": 9.996157033566737e-07, "loss": 0.3446, "step": 173 }, { "epoch": 0.08770161290322581, "grad_norm": 1.1988895413065521, "learning_rate": 9.996051039629214e-07, "loss": 0.3196, "step": 174 }, { "epoch": 0.08820564516129033, "grad_norm": 1.5252246353478032, "learning_rate": 9.995943604361915e-07, "loss": 0.3296, "step": 175 }, { "epoch": 0.08870967741935484, "grad_norm": 2.9977266959256217, "learning_rate": 9.995834727795834e-07, "loss": 0.332, "step": 176 }, { "epoch": 0.08921370967741936, "grad_norm": 0.4475301770853643, "learning_rate": 9.995724409962381e-07, "loss": 0.3229, "step": 177 }, { "epoch": 0.08971774193548387, "grad_norm": 0.5571776297010896, "learning_rate": 9.995612650893384e-07, "loss": 0.339, "step": 178 }, { "epoch": 0.09022177419354839, "grad_norm": 0.3379507080955966, "learning_rate": 9.995499450621084e-07, "loss": 0.3363, "step": 179 }, { "epoch": 0.0907258064516129, "grad_norm": 0.5115984237062103, "learning_rate": 9.995384809178135e-07, "loss": 0.3318, "step": 180 }, { "epoch": 0.09122983870967742, "grad_norm": 0.597842695756601, "learning_rate": 9.995268726597616e-07, "loss": 0.3355, "step": 181 }, { "epoch": 0.09173387096774194, "grad_norm": 1.6790645385073486, "learning_rate": 9.995151202913013e-07, "loss": 0.3379, "step": 182 }, { "epoch": 0.09223790322580645, "grad_norm": 1.4108139487058173, "learning_rate": 9.99503223815823e-07, "loss": 0.3435, "step": 183 }, { "epoch": 0.09274193548387097, "grad_norm": 1.1761800110707394, "learning_rate": 9.99491183236759e-07, "loss": 0.3304, "step": 184 }, { "epoch": 0.09324596774193548, "grad_norm": 0.5429451949780051, "learning_rate": 9.99478998557583e-07, "loss": 0.3275, "step": 185 }, { "epoch": 0.09375, "grad_norm": 1.0221880040659963, "learning_rate": 9.994666697818097e-07, "loss": 0.3248, "step": 186 }, { "epoch": 0.09425403225806452, "grad_norm": 1.7971515949261194, "learning_rate": 9.994541969129963e-07, "loss": 0.3252, "step": 187 }, { "epoch": 0.09475806451612903, "grad_norm": 0.9715025504301362, "learning_rate": 9.99441579954741e-07, "loss": 0.3275, "step": 188 }, { "epoch": 0.09526209677419355, "grad_norm": 0.3672192941528604, "learning_rate": 9.99428818910684e-07, "loss": 0.3269, "step": 189 }, { "epoch": 0.09576612903225806, "grad_norm": 0.520936669282708, "learning_rate": 9.994159137845062e-07, "loss": 0.3195, "step": 190 }, { "epoch": 0.09627016129032258, "grad_norm": 0.7477175783403504, "learning_rate": 9.994028645799312e-07, "loss": 0.3136, "step": 191 }, { "epoch": 0.0967741935483871, "grad_norm": 0.3729287311518564, "learning_rate": 9.993896713007234e-07, "loss": 0.3272, "step": 192 }, { "epoch": 0.09727822580645161, "grad_norm": 0.34440602307523366, "learning_rate": 9.99376333950689e-07, "loss": 0.3233, "step": 193 }, { "epoch": 0.09778225806451613, "grad_norm": 2.072604244189497, "learning_rate": 9.993628525336757e-07, "loss": 0.3436, "step": 194 }, { "epoch": 0.09828629032258064, "grad_norm": 1.019190865070132, "learning_rate": 9.993492270535728e-07, "loss": 0.3313, "step": 195 }, { "epoch": 0.09879032258064516, "grad_norm": 0.7904036059961682, "learning_rate": 9.99335457514311e-07, "loss": 0.3274, "step": 196 }, { "epoch": 0.09929435483870967, "grad_norm": 0.6959515322641198, "learning_rate": 9.993215439198632e-07, "loss": 0.3208, "step": 197 }, { "epoch": 0.09979838709677419, "grad_norm": 0.7837606339000123, "learning_rate": 9.993074862742432e-07, "loss": 0.3277, "step": 198 }, { "epoch": 0.1003024193548387, "grad_norm": 0.3983563349530772, "learning_rate": 9.992932845815062e-07, "loss": 0.3259, "step": 199 }, { "epoch": 0.1003024193548387, "eval_loss": 0.3568817675113678, "eval_runtime": 17.0912, "eval_samples_per_second": 50.026, "eval_steps_per_second": 1.053, "step": 199 }, { "epoch": 0.10080645161290322, "grad_norm": 2.1662289172188123, "learning_rate": 9.992789388457496e-07, "loss": 0.3374, "step": 200 }, { "epoch": 0.10131048387096774, "grad_norm": 0.3792807469503325, "learning_rate": 9.992644490711122e-07, "loss": 0.3322, "step": 201 }, { "epoch": 0.10181451612903226, "grad_norm": 0.8809934605286607, "learning_rate": 9.99249815261774e-07, "loss": 0.3234, "step": 202 }, { "epoch": 0.10231854838709678, "grad_norm": 0.8130891805575898, "learning_rate": 9.992350374219565e-07, "loss": 0.3205, "step": 203 }, { "epoch": 0.1028225806451613, "grad_norm": 0.6892675097386061, "learning_rate": 9.992201155559235e-07, "loss": 0.3265, "step": 204 }, { "epoch": 0.10332661290322581, "grad_norm": 0.3620028450107508, "learning_rate": 9.992050496679796e-07, "loss": 0.3308, "step": 205 }, { "epoch": 0.10383064516129033, "grad_norm": 1.1011333579285603, "learning_rate": 9.991898397624713e-07, "loss": 0.3297, "step": 206 }, { "epoch": 0.10433467741935484, "grad_norm": 1.066929614971622, "learning_rate": 9.991744858437867e-07, "loss": 0.3343, "step": 207 }, { "epoch": 0.10483870967741936, "grad_norm": 0.5043517533284387, "learning_rate": 9.99158987916355e-07, "loss": 0.3357, "step": 208 }, { "epoch": 0.10534274193548387, "grad_norm": 2.028553721867032, "learning_rate": 9.991433459846475e-07, "loss": 0.3381, "step": 209 }, { "epoch": 0.10584677419354839, "grad_norm": 1.0136025757003981, "learning_rate": 9.991275600531766e-07, "loss": 0.3256, "step": 210 }, { "epoch": 0.1063508064516129, "grad_norm": 0.4260801721814953, "learning_rate": 9.991116301264965e-07, "loss": 0.3183, "step": 211 }, { "epoch": 0.10685483870967742, "grad_norm": 0.8776821224341731, "learning_rate": 9.990955562092032e-07, "loss": 0.3086, "step": 212 }, { "epoch": 0.10735887096774194, "grad_norm": 0.9670921019854246, "learning_rate": 9.990793383059336e-07, "loss": 0.3163, "step": 213 }, { "epoch": 0.10786290322580645, "grad_norm": 0.4414053256258077, "learning_rate": 9.990629764213663e-07, "loss": 0.3274, "step": 214 }, { "epoch": 0.10836693548387097, "grad_norm": 0.8635272396829062, "learning_rate": 9.99046470560222e-07, "loss": 0.3328, "step": 215 }, { "epoch": 0.10887096774193548, "grad_norm": 1.2397064297417275, "learning_rate": 9.990298207272625e-07, "loss": 0.323, "step": 216 }, { "epoch": 0.109375, "grad_norm": 0.4800605247066369, "learning_rate": 9.99013026927291e-07, "loss": 0.3248, "step": 217 }, { "epoch": 0.10987903225806452, "grad_norm": 0.655524859523369, "learning_rate": 9.989960891651521e-07, "loss": 0.3237, "step": 218 }, { "epoch": 0.11038306451612903, "grad_norm": 1.666081221879668, "learning_rate": 9.98979007445733e-07, "loss": 0.307, "step": 219 }, { "epoch": 0.11088709677419355, "grad_norm": 0.6823811172697462, "learning_rate": 9.989617817739612e-07, "loss": 0.325, "step": 220 }, { "epoch": 0.11139112903225806, "grad_norm": 0.38222379218675673, "learning_rate": 9.989444121548061e-07, "loss": 0.3284, "step": 221 }, { "epoch": 0.11189516129032258, "grad_norm": 3.066211544417165, "learning_rate": 9.989268985932789e-07, "loss": 0.3192, "step": 222 }, { "epoch": 0.1123991935483871, "grad_norm": 2.642378852763342, "learning_rate": 9.989092410944321e-07, "loss": 0.3381, "step": 223 }, { "epoch": 0.11290322580645161, "grad_norm": 0.49898455707374983, "learning_rate": 9.9889143966336e-07, "loss": 0.3126, "step": 224 }, { "epoch": 0.11340725806451613, "grad_norm": 0.8540861654265093, "learning_rate": 9.988734943051981e-07, "loss": 0.3272, "step": 225 }, { "epoch": 0.11391129032258064, "grad_norm": 1.2446923755324546, "learning_rate": 9.988554050251232e-07, "loss": 0.329, "step": 226 }, { "epoch": 0.11441532258064516, "grad_norm": 1.1002977014857587, "learning_rate": 9.988371718283543e-07, "loss": 0.3196, "step": 227 }, { "epoch": 0.11491935483870967, "grad_norm": 0.42649460084589286, "learning_rate": 9.988187947201517e-07, "loss": 0.3226, "step": 228 }, { "epoch": 0.11542338709677419, "grad_norm": 2.4011621649576824, "learning_rate": 9.988002737058166e-07, "loss": 0.3157, "step": 229 }, { "epoch": 0.1159274193548387, "grad_norm": 1.9831170581698965, "learning_rate": 9.987816087906924e-07, "loss": 0.3128, "step": 230 }, { "epoch": 0.11643145161290322, "grad_norm": 0.5007493489029178, "learning_rate": 9.987627999801638e-07, "loss": 0.3098, "step": 231 }, { "epoch": 0.11693548387096774, "grad_norm": 1.3882086918200365, "learning_rate": 9.987438472796572e-07, "loss": 0.3205, "step": 232 }, { "epoch": 0.11743951612903226, "grad_norm": 0.8267276195807572, "learning_rate": 9.987247506946401e-07, "loss": 0.3017, "step": 233 }, { "epoch": 0.11794354838709678, "grad_norm": 0.5834541685362084, "learning_rate": 9.98705510230622e-07, "loss": 0.3211, "step": 234 }, { "epoch": 0.1184475806451613, "grad_norm": 0.4068048722742785, "learning_rate": 9.986861258931535e-07, "loss": 0.317, "step": 235 }, { "epoch": 0.11895161290322581, "grad_norm": 0.6447372842200539, "learning_rate": 9.986665976878269e-07, "loss": 0.3162, "step": 236 }, { "epoch": 0.11945564516129033, "grad_norm": 0.3789031033771853, "learning_rate": 9.986469256202758e-07, "loss": 0.3434, "step": 237 }, { "epoch": 0.11995967741935484, "grad_norm": 0.7092549539496891, "learning_rate": 9.986271096961758e-07, "loss": 0.3277, "step": 238 }, { "epoch": 0.12046370967741936, "grad_norm": 0.5202838451750067, "learning_rate": 9.986071499212435e-07, "loss": 0.3138, "step": 239 }, { "epoch": 0.12096774193548387, "grad_norm": 0.3561759374531645, "learning_rate": 9.98587046301237e-07, "loss": 0.3335, "step": 240 }, { "epoch": 0.12147177419354839, "grad_norm": 0.3707646084919827, "learning_rate": 9.985667988419562e-07, "loss": 0.3162, "step": 241 }, { "epoch": 0.1219758064516129, "grad_norm": 0.6547926315231538, "learning_rate": 9.985464075492424e-07, "loss": 0.3279, "step": 242 }, { "epoch": 0.12247983870967742, "grad_norm": 0.6547474733966526, "learning_rate": 9.985258724289784e-07, "loss": 0.329, "step": 243 }, { "epoch": 0.12298387096774194, "grad_norm": 1.4287976852804898, "learning_rate": 9.985051934870886e-07, "loss": 0.3171, "step": 244 }, { "epoch": 0.12348790322580645, "grad_norm": 0.33812675544730314, "learning_rate": 9.984843707295384e-07, "loss": 0.3171, "step": 245 }, { "epoch": 0.12399193548387097, "grad_norm": 0.5972872507124707, "learning_rate": 9.98463404162335e-07, "loss": 0.3195, "step": 246 }, { "epoch": 0.12449596774193548, "grad_norm": 0.5523944045513064, "learning_rate": 9.984422937915276e-07, "loss": 0.3137, "step": 247 }, { "epoch": 0.125, "grad_norm": 0.3520728667966682, "learning_rate": 9.98421039623206e-07, "loss": 0.3181, "step": 248 }, { "epoch": 0.12550403225806453, "grad_norm": 1.0978808931136894, "learning_rate": 9.98399641663502e-07, "loss": 0.3212, "step": 249 }, { "epoch": 0.12600806451612903, "grad_norm": 0.4007468615552269, "learning_rate": 9.98378099918589e-07, "loss": 0.3235, "step": 250 }, { "epoch": 0.12651209677419356, "grad_norm": 0.38517912392455744, "learning_rate": 9.983564143946813e-07, "loss": 0.3221, "step": 251 }, { "epoch": 0.12701612903225806, "grad_norm": 0.8562512071253154, "learning_rate": 9.98334585098035e-07, "loss": 0.3168, "step": 252 }, { "epoch": 0.1275201612903226, "grad_norm": 0.37641066199373835, "learning_rate": 9.98312612034948e-07, "loss": 0.3338, "step": 253 }, { "epoch": 0.1280241935483871, "grad_norm": 0.3364345875156596, "learning_rate": 9.982904952117597e-07, "loss": 0.3149, "step": 254 }, { "epoch": 0.12852822580645162, "grad_norm": 0.4087566783250614, "learning_rate": 9.9826823463485e-07, "loss": 0.3256, "step": 255 }, { "epoch": 0.12903225806451613, "grad_norm": 0.7179205392340822, "learning_rate": 9.982458303106411e-07, "loss": 0.3375, "step": 256 }, { "epoch": 0.12953629032258066, "grad_norm": 0.8800188682694119, "learning_rate": 9.982232822455968e-07, "loss": 0.3378, "step": 257 }, { "epoch": 0.13004032258064516, "grad_norm": 0.4425193312322176, "learning_rate": 9.982005904462219e-07, "loss": 0.3303, "step": 258 }, { "epoch": 0.1305443548387097, "grad_norm": 0.9161891570130994, "learning_rate": 9.981777549190627e-07, "loss": 0.3036, "step": 259 }, { "epoch": 0.1310483870967742, "grad_norm": 0.3551274779066522, "learning_rate": 9.981547756707074e-07, "loss": 0.3195, "step": 260 }, { "epoch": 0.13155241935483872, "grad_norm": 0.4319317585133586, "learning_rate": 9.981316527077852e-07, "loss": 0.3321, "step": 261 }, { "epoch": 0.13205645161290322, "grad_norm": 1.1093458242995622, "learning_rate": 9.981083860369668e-07, "loss": 0.3341, "step": 262 }, { "epoch": 0.13256048387096775, "grad_norm": 1.1354326631084213, "learning_rate": 9.98084975664965e-07, "loss": 0.3184, "step": 263 }, { "epoch": 0.13306451612903225, "grad_norm": 0.3445717345214689, "learning_rate": 9.980614215985327e-07, "loss": 0.326, "step": 264 }, { "epoch": 0.13356854838709678, "grad_norm": 1.795842384240905, "learning_rate": 9.98037723844466e-07, "loss": 0.3176, "step": 265 }, { "epoch": 0.13407258064516128, "grad_norm": 1.1674146767927045, "learning_rate": 9.98013882409601e-07, "loss": 0.3153, "step": 266 }, { "epoch": 0.1345766129032258, "grad_norm": 0.3532656598688029, "learning_rate": 9.97989897300816e-07, "loss": 0.3237, "step": 267 }, { "epoch": 0.1350806451612903, "grad_norm": 1.0598457327232365, "learning_rate": 9.979657685250305e-07, "loss": 0.3316, "step": 268 }, { "epoch": 0.13558467741935484, "grad_norm": 0.4426506579911205, "learning_rate": 9.979414960892055e-07, "loss": 0.3031, "step": 269 }, { "epoch": 0.13608870967741934, "grad_norm": 0.4547436037084278, "learning_rate": 9.979170800003436e-07, "loss": 0.3122, "step": 270 }, { "epoch": 0.13659274193548387, "grad_norm": 0.4462058135399398, "learning_rate": 9.978925202654883e-07, "loss": 0.3227, "step": 271 }, { "epoch": 0.13709677419354838, "grad_norm": 1.6285597918474548, "learning_rate": 9.978678168917253e-07, "loss": 0.3103, "step": 272 }, { "epoch": 0.1376008064516129, "grad_norm": 0.8755040490518289, "learning_rate": 9.978429698861812e-07, "loss": 0.3048, "step": 273 }, { "epoch": 0.1381048387096774, "grad_norm": 0.90287853250279, "learning_rate": 9.978179792560245e-07, "loss": 0.3265, "step": 274 }, { "epoch": 0.13860887096774194, "grad_norm": 0.3787373854311499, "learning_rate": 9.977928450084642e-07, "loss": 0.3241, "step": 275 }, { "epoch": 0.13911290322580644, "grad_norm": 0.5847992322700967, "learning_rate": 9.977675671507522e-07, "loss": 0.3176, "step": 276 }, { "epoch": 0.13961693548387097, "grad_norm": 0.3486681848522623, "learning_rate": 9.977421456901803e-07, "loss": 0.3158, "step": 277 }, { "epoch": 0.14012096774193547, "grad_norm": 0.38818889942185175, "learning_rate": 9.977165806340827e-07, "loss": 0.3135, "step": 278 }, { "epoch": 0.140625, "grad_norm": 0.4543982688708197, "learning_rate": 9.97690871989835e-07, "loss": 0.3191, "step": 279 }, { "epoch": 0.14112903225806453, "grad_norm": 1.4162173420118023, "learning_rate": 9.976650197648536e-07, "loss": 0.314, "step": 280 }, { "epoch": 0.14163306451612903, "grad_norm": 0.6252978926999877, "learning_rate": 9.976390239665971e-07, "loss": 0.3241, "step": 281 }, { "epoch": 0.14213709677419356, "grad_norm": 0.3461689499768043, "learning_rate": 9.976128846025646e-07, "loss": 0.3219, "step": 282 }, { "epoch": 0.14264112903225806, "grad_norm": 0.3699553094334184, "learning_rate": 9.975866016802977e-07, "loss": 0.3037, "step": 283 }, { "epoch": 0.1431451612903226, "grad_norm": 0.39061647423480156, "learning_rate": 9.975601752073783e-07, "loss": 0.3063, "step": 284 }, { "epoch": 0.1436491935483871, "grad_norm": 0.5258130756540553, "learning_rate": 9.975336051914307e-07, "loss": 0.3108, "step": 285 }, { "epoch": 0.14415322580645162, "grad_norm": 0.8216411787070486, "learning_rate": 9.9750689164012e-07, "loss": 0.3056, "step": 286 }, { "epoch": 0.14465725806451613, "grad_norm": 0.4881519867613522, "learning_rate": 9.974800345611532e-07, "loss": 0.3164, "step": 287 }, { "epoch": 0.14516129032258066, "grad_norm": 0.43534512320301666, "learning_rate": 9.974530339622779e-07, "loss": 0.3211, "step": 288 }, { "epoch": 0.14566532258064516, "grad_norm": 0.6993861205113443, "learning_rate": 9.97425889851284e-07, "loss": 0.3267, "step": 289 }, { "epoch": 0.1461693548387097, "grad_norm": 1.0418090024102467, "learning_rate": 9.973986022360022e-07, "loss": 0.3239, "step": 290 }, { "epoch": 0.1466733870967742, "grad_norm": 0.3319740811109165, "learning_rate": 9.97371171124305e-07, "loss": 0.3153, "step": 291 }, { "epoch": 0.14717741935483872, "grad_norm": 0.7719912354248915, "learning_rate": 9.973435965241058e-07, "loss": 0.3101, "step": 292 }, { "epoch": 0.14768145161290322, "grad_norm": 0.8608701948441182, "learning_rate": 9.973158784433599e-07, "loss": 0.3329, "step": 293 }, { "epoch": 0.14818548387096775, "grad_norm": 0.5399943385601701, "learning_rate": 9.972880168900638e-07, "loss": 0.3136, "step": 294 }, { "epoch": 0.14868951612903225, "grad_norm": 1.21808306827626, "learning_rate": 9.972600118722555e-07, "loss": 0.312, "step": 295 }, { "epoch": 0.14919354838709678, "grad_norm": 2.232388742950908, "learning_rate": 9.97231863398014e-07, "loss": 0.3077, "step": 296 }, { "epoch": 0.14969758064516128, "grad_norm": 0.40750824930135454, "learning_rate": 9.972035714754602e-07, "loss": 0.3423, "step": 297 }, { "epoch": 0.1502016129032258, "grad_norm": 0.6444065733453856, "learning_rate": 9.971751361127562e-07, "loss": 0.3108, "step": 298 }, { "epoch": 0.1507056451612903, "grad_norm": 0.7680402422746748, "learning_rate": 9.971465573181049e-07, "loss": 0.3089, "step": 299 }, { "epoch": 0.15120967741935484, "grad_norm": 0.38712847346257356, "learning_rate": 9.971178350997516e-07, "loss": 0.2963, "step": 300 }, { "epoch": 0.15171370967741934, "grad_norm": 0.921107522036033, "learning_rate": 9.970889694659823e-07, "loss": 0.3149, "step": 301 }, { "epoch": 0.15221774193548387, "grad_norm": 1.009394455117804, "learning_rate": 9.970599604251247e-07, "loss": 0.3017, "step": 302 }, { "epoch": 0.15272177419354838, "grad_norm": 0.4398399428836015, "learning_rate": 9.970308079855476e-07, "loss": 0.3036, "step": 303 }, { "epoch": 0.1532258064516129, "grad_norm": 0.8373555944492586, "learning_rate": 9.970015121556615e-07, "loss": 0.3204, "step": 304 }, { "epoch": 0.1537298387096774, "grad_norm": 1.226841071199288, "learning_rate": 9.969720729439177e-07, "loss": 0.31, "step": 305 }, { "epoch": 0.15423387096774194, "grad_norm": 0.4736294159912266, "learning_rate": 9.969424903588094e-07, "loss": 0.3273, "step": 306 }, { "epoch": 0.15473790322580644, "grad_norm": 1.3438422876161837, "learning_rate": 9.969127644088713e-07, "loss": 0.3229, "step": 307 }, { "epoch": 0.15524193548387097, "grad_norm": 2.420491209892375, "learning_rate": 9.968828951026786e-07, "loss": 0.3016, "step": 308 }, { "epoch": 0.15574596774193547, "grad_norm": 0.9363754119347487, "learning_rate": 9.96852882448849e-07, "loss": 0.3096, "step": 309 }, { "epoch": 0.15625, "grad_norm": 0.7464871725596692, "learning_rate": 9.968227264560404e-07, "loss": 0.3114, "step": 310 }, { "epoch": 0.15675403225806453, "grad_norm": 0.8045539981080978, "learning_rate": 9.96792427132953e-07, "loss": 0.3074, "step": 311 }, { "epoch": 0.15725806451612903, "grad_norm": 0.7968575088428339, "learning_rate": 9.967619844883277e-07, "loss": 0.3202, "step": 312 }, { "epoch": 0.15776209677419356, "grad_norm": 0.3462338506463475, "learning_rate": 9.967313985309472e-07, "loss": 0.3104, "step": 313 }, { "epoch": 0.15826612903225806, "grad_norm": 0.3411937335850433, "learning_rate": 9.967006692696353e-07, "loss": 0.325, "step": 314 }, { "epoch": 0.1587701612903226, "grad_norm": 1.6309339803104832, "learning_rate": 9.966697967132573e-07, "loss": 0.3048, "step": 315 }, { "epoch": 0.1592741935483871, "grad_norm": 0.8520422369690264, "learning_rate": 9.966387808707196e-07, "loss": 0.3031, "step": 316 }, { "epoch": 0.15977822580645162, "grad_norm": 0.550306745006942, "learning_rate": 9.966076217509702e-07, "loss": 0.3174, "step": 317 }, { "epoch": 0.16028225806451613, "grad_norm": 0.3244041619511754, "learning_rate": 9.965763193629982e-07, "loss": 0.3051, "step": 318 }, { "epoch": 0.16078629032258066, "grad_norm": 0.3289354707634873, "learning_rate": 9.965448737158343e-07, "loss": 0.3024, "step": 319 }, { "epoch": 0.16129032258064516, "grad_norm": 0.5698828377828442, "learning_rate": 9.9651328481855e-07, "loss": 0.3197, "step": 320 }, { "epoch": 0.1617943548387097, "grad_norm": 0.3603627753702205, "learning_rate": 9.964815526802588e-07, "loss": 0.3267, "step": 321 }, { "epoch": 0.1622983870967742, "grad_norm": 0.41260756113885855, "learning_rate": 9.964496773101155e-07, "loss": 0.3128, "step": 322 }, { "epoch": 0.16280241935483872, "grad_norm": 0.3123646419774433, "learning_rate": 9.964176587173154e-07, "loss": 0.2909, "step": 323 }, { "epoch": 0.16330645161290322, "grad_norm": 0.32862366743648225, "learning_rate": 9.963854969110958e-07, "loss": 0.3249, "step": 324 }, { "epoch": 0.16381048387096775, "grad_norm": 0.32317653597651286, "learning_rate": 9.963531919007355e-07, "loss": 0.3179, "step": 325 }, { "epoch": 0.16431451612903225, "grad_norm": 0.3636694182909097, "learning_rate": 9.963207436955539e-07, "loss": 0.3094, "step": 326 }, { "epoch": 0.16481854838709678, "grad_norm": 0.5601385575192833, "learning_rate": 9.962881523049125e-07, "loss": 0.313, "step": 327 }, { "epoch": 0.16532258064516128, "grad_norm": 0.32805972924192683, "learning_rate": 9.962554177382134e-07, "loss": 0.304, "step": 328 }, { "epoch": 0.1658266129032258, "grad_norm": 0.4699969421619995, "learning_rate": 9.962225400049004e-07, "loss": 0.309, "step": 329 }, { "epoch": 0.1663306451612903, "grad_norm": 0.3767389273779974, "learning_rate": 9.961895191144586e-07, "loss": 0.2983, "step": 330 }, { "epoch": 0.16683467741935484, "grad_norm": 0.503719665226859, "learning_rate": 9.961563550764143e-07, "loss": 0.3112, "step": 331 }, { "epoch": 0.16733870967741934, "grad_norm": 0.3438328709283268, "learning_rate": 9.961230479003348e-07, "loss": 0.2985, "step": 332 }, { "epoch": 0.16784274193548387, "grad_norm": 0.3083813504969578, "learning_rate": 9.960895975958296e-07, "loss": 0.3134, "step": 333 }, { "epoch": 0.16834677419354838, "grad_norm": 0.4787512237902366, "learning_rate": 9.960560041725486e-07, "loss": 0.3213, "step": 334 }, { "epoch": 0.1688508064516129, "grad_norm": 1.4378814525493548, "learning_rate": 9.960222676401833e-07, "loss": 0.3056, "step": 335 }, { "epoch": 0.1693548387096774, "grad_norm": 0.3506129924302826, "learning_rate": 9.959883880084664e-07, "loss": 0.3174, "step": 336 }, { "epoch": 0.16985887096774194, "grad_norm": 0.35990282931470485, "learning_rate": 9.95954365287172e-07, "loss": 0.3224, "step": 337 }, { "epoch": 0.17036290322580644, "grad_norm": 1.093999056396477, "learning_rate": 9.959201994861156e-07, "loss": 0.2938, "step": 338 }, { "epoch": 0.17086693548387097, "grad_norm": 0.7575441587383402, "learning_rate": 9.958858906151536e-07, "loss": 0.3092, "step": 339 }, { "epoch": 0.17137096774193547, "grad_norm": 0.7138793464492631, "learning_rate": 9.958514386841842e-07, "loss": 0.3126, "step": 340 }, { "epoch": 0.171875, "grad_norm": 0.6820850764538464, "learning_rate": 9.95816843703146e-07, "loss": 0.3051, "step": 341 }, { "epoch": 0.17237903225806453, "grad_norm": 0.438185665282885, "learning_rate": 9.957821056820202e-07, "loss": 0.3107, "step": 342 }, { "epoch": 0.17288306451612903, "grad_norm": 0.2993355268251931, "learning_rate": 9.957472246308278e-07, "loss": 0.3177, "step": 343 }, { "epoch": 0.17338709677419356, "grad_norm": 0.3143562128856459, "learning_rate": 9.957122005596324e-07, "loss": 0.3028, "step": 344 }, { "epoch": 0.17389112903225806, "grad_norm": 0.29963517583943, "learning_rate": 9.956770334785377e-07, "loss": 0.303, "step": 345 }, { "epoch": 0.1743951612903226, "grad_norm": 0.41635137878897605, "learning_rate": 9.956417233976895e-07, "loss": 0.3007, "step": 346 }, { "epoch": 0.1748991935483871, "grad_norm": 0.3795788244623765, "learning_rate": 9.956062703272744e-07, "loss": 0.3183, "step": 347 }, { "epoch": 0.17540322580645162, "grad_norm": 0.5158539655619079, "learning_rate": 9.955706742775204e-07, "loss": 0.3034, "step": 348 }, { "epoch": 0.17590725806451613, "grad_norm": 0.41143417528165055, "learning_rate": 9.955349352586968e-07, "loss": 0.3064, "step": 349 }, { "epoch": 0.17641129032258066, "grad_norm": 0.3499570984862177, "learning_rate": 9.95499053281114e-07, "loss": 0.3165, "step": 350 }, { "epoch": 0.17691532258064516, "grad_norm": 0.6117708541856488, "learning_rate": 9.95463028355124e-07, "loss": 0.3135, "step": 351 }, { "epoch": 0.1774193548387097, "grad_norm": 0.3512173058497519, "learning_rate": 9.954268604911193e-07, "loss": 0.3086, "step": 352 }, { "epoch": 0.1779233870967742, "grad_norm": 0.5459910083191652, "learning_rate": 9.953905496995346e-07, "loss": 0.3212, "step": 353 }, { "epoch": 0.17842741935483872, "grad_norm": 0.48953620036372436, "learning_rate": 9.953540959908448e-07, "loss": 0.3082, "step": 354 }, { "epoch": 0.17893145161290322, "grad_norm": 0.4806785690019039, "learning_rate": 9.953174993755669e-07, "loss": 0.3114, "step": 355 }, { "epoch": 0.17943548387096775, "grad_norm": 0.27574681950143637, "learning_rate": 9.95280759864259e-07, "loss": 0.3088, "step": 356 }, { "epoch": 0.17993951612903225, "grad_norm": 0.4046536057043104, "learning_rate": 9.952438774675199e-07, "loss": 0.3014, "step": 357 }, { "epoch": 0.18044354838709678, "grad_norm": 0.526200938156327, "learning_rate": 9.952068521959898e-07, "loss": 0.3042, "step": 358 }, { "epoch": 0.18094758064516128, "grad_norm": 0.5729896337994929, "learning_rate": 9.951696840603508e-07, "loss": 0.3096, "step": 359 }, { "epoch": 0.1814516129032258, "grad_norm": 0.5261901087089867, "learning_rate": 9.95132373071325e-07, "loss": 0.3077, "step": 360 }, { "epoch": 0.1819556451612903, "grad_norm": 0.6668832949382653, "learning_rate": 9.950949192396772e-07, "loss": 0.3025, "step": 361 }, { "epoch": 0.18245967741935484, "grad_norm": 0.862055275986415, "learning_rate": 9.950573225762117e-07, "loss": 0.3015, "step": 362 }, { "epoch": 0.18296370967741934, "grad_norm": 0.6293046065894792, "learning_rate": 9.950195830917756e-07, "loss": 0.302, "step": 363 }, { "epoch": 0.18346774193548387, "grad_norm": 0.6877385211906398, "learning_rate": 9.949817007972563e-07, "loss": 0.3176, "step": 364 }, { "epoch": 0.18397177419354838, "grad_norm": 0.29549386712677456, "learning_rate": 9.949436757035825e-07, "loss": 0.3089, "step": 365 }, { "epoch": 0.1844758064516129, "grad_norm": 0.8557932164329561, "learning_rate": 9.949055078217244e-07, "loss": 0.3296, "step": 366 }, { "epoch": 0.1849798387096774, "grad_norm": 0.5402766960899985, "learning_rate": 9.948671971626927e-07, "loss": 0.3175, "step": 367 }, { "epoch": 0.18548387096774194, "grad_norm": 0.49998887489839794, "learning_rate": 9.948287437375403e-07, "loss": 0.314, "step": 368 }, { "epoch": 0.18598790322580644, "grad_norm": 0.33956036638788806, "learning_rate": 9.94790147557361e-07, "loss": 0.3072, "step": 369 }, { "epoch": 0.18649193548387097, "grad_norm": 0.4691366516694783, "learning_rate": 9.94751408633289e-07, "loss": 0.3007, "step": 370 }, { "epoch": 0.18699596774193547, "grad_norm": 0.4294056345752334, "learning_rate": 9.947125269765001e-07, "loss": 0.3244, "step": 371 }, { "epoch": 0.1875, "grad_norm": 0.5198726803714843, "learning_rate": 9.946735025982121e-07, "loss": 0.3309, "step": 372 }, { "epoch": 0.18800403225806453, "grad_norm": 0.5083174614120537, "learning_rate": 9.94634335509683e-07, "loss": 0.3064, "step": 373 }, { "epoch": 0.18850806451612903, "grad_norm": 0.4820500182649492, "learning_rate": 9.94595025722212e-07, "loss": 0.3042, "step": 374 }, { "epoch": 0.18901209677419356, "grad_norm": 0.5290432017994986, "learning_rate": 9.9455557324714e-07, "loss": 0.2969, "step": 375 }, { "epoch": 0.18951612903225806, "grad_norm": 0.9665951840791808, "learning_rate": 9.945159780958487e-07, "loss": 0.3053, "step": 376 }, { "epoch": 0.1900201612903226, "grad_norm": 0.27376501652334334, "learning_rate": 9.94476240279761e-07, "loss": 0.2975, "step": 377 }, { "epoch": 0.1905241935483871, "grad_norm": 0.6907529950403531, "learning_rate": 9.944363598103412e-07, "loss": 0.3028, "step": 378 }, { "epoch": 0.19102822580645162, "grad_norm": 0.7206171183198804, "learning_rate": 9.943963366990944e-07, "loss": 0.3244, "step": 379 }, { "epoch": 0.19153225806451613, "grad_norm": 0.4351702422546798, "learning_rate": 9.943561709575671e-07, "loss": 0.3019, "step": 380 }, { "epoch": 0.19203629032258066, "grad_norm": 0.4993062351913905, "learning_rate": 9.94315862597347e-07, "loss": 0.3147, "step": 381 }, { "epoch": 0.19254032258064516, "grad_norm": 1.900968501331666, "learning_rate": 9.942754116300627e-07, "loss": 0.2954, "step": 382 }, { "epoch": 0.1930443548387097, "grad_norm": 1.6477482220874253, "learning_rate": 9.942348180673838e-07, "loss": 0.3121, "step": 383 }, { "epoch": 0.1935483870967742, "grad_norm": 0.742153580574187, "learning_rate": 9.941940819210215e-07, "loss": 0.2992, "step": 384 }, { "epoch": 0.19405241935483872, "grad_norm": 0.47705653322280234, "learning_rate": 9.941532032027281e-07, "loss": 0.3244, "step": 385 }, { "epoch": 0.19455645161290322, "grad_norm": 0.911598765383663, "learning_rate": 9.941121819242965e-07, "loss": 0.319, "step": 386 }, { "epoch": 0.19506048387096775, "grad_norm": 0.8928463372091003, "learning_rate": 9.940710180975615e-07, "loss": 0.3176, "step": 387 }, { "epoch": 0.19556451612903225, "grad_norm": 0.5466103602367745, "learning_rate": 9.940297117343983e-07, "loss": 0.3117, "step": 388 }, { "epoch": 0.19606854838709678, "grad_norm": 0.28786769682640595, "learning_rate": 9.939882628467235e-07, "loss": 0.319, "step": 389 }, { "epoch": 0.19657258064516128, "grad_norm": 1.750270794203163, "learning_rate": 9.939466714464953e-07, "loss": 0.3097, "step": 390 }, { "epoch": 0.1970766129032258, "grad_norm": 1.3109389597034193, "learning_rate": 9.939049375457117e-07, "loss": 0.3095, "step": 391 }, { "epoch": 0.1975806451612903, "grad_norm": 0.3810245348808811, "learning_rate": 9.938630611564136e-07, "loss": 0.3015, "step": 392 }, { "epoch": 0.19808467741935484, "grad_norm": 0.2822642693051148, "learning_rate": 9.938210422906816e-07, "loss": 0.3065, "step": 393 }, { "epoch": 0.19858870967741934, "grad_norm": 0.4548013874567676, "learning_rate": 9.93778880960638e-07, "loss": 0.3059, "step": 394 }, { "epoch": 0.19909274193548387, "grad_norm": 0.7403977526231464, "learning_rate": 9.937365771784458e-07, "loss": 0.3001, "step": 395 }, { "epoch": 0.19959677419354838, "grad_norm": 0.40584105744807936, "learning_rate": 9.936941309563097e-07, "loss": 0.3018, "step": 396 }, { "epoch": 0.2001008064516129, "grad_norm": 0.4027563712329421, "learning_rate": 9.936515423064752e-07, "loss": 0.2821, "step": 397 }, { "epoch": 0.2006048387096774, "grad_norm": 0.3215902734384433, "learning_rate": 9.936088112412288e-07, "loss": 0.3093, "step": 398 }, { "epoch": 0.2006048387096774, "eval_loss": 0.33719342947006226, "eval_runtime": 17.0014, "eval_samples_per_second": 50.29, "eval_steps_per_second": 1.059, "step": 398 }, { "epoch": 0.20110887096774194, "grad_norm": 0.9421611516647215, "learning_rate": 9.935659377728982e-07, "loss": 0.2979, "step": 399 }, { "epoch": 0.20161290322580644, "grad_norm": 0.9245468490926179, "learning_rate": 9.935229219138517e-07, "loss": 0.2942, "step": 400 }, { "epoch": 0.20211693548387097, "grad_norm": 0.6824205781396179, "learning_rate": 9.934797636764999e-07, "loss": 0.2971, "step": 401 }, { "epoch": 0.20262096774193547, "grad_norm": 0.27185149783417195, "learning_rate": 9.934364630732928e-07, "loss": 0.3107, "step": 402 }, { "epoch": 0.203125, "grad_norm": 0.6827808287541225, "learning_rate": 9.933930201167228e-07, "loss": 0.3184, "step": 403 }, { "epoch": 0.20362903225806453, "grad_norm": 0.6173711604161257, "learning_rate": 9.93349434819323e-07, "loss": 0.3137, "step": 404 }, { "epoch": 0.20413306451612903, "grad_norm": 0.8730361088623172, "learning_rate": 9.933057071936674e-07, "loss": 0.3045, "step": 405 }, { "epoch": 0.20463709677419356, "grad_norm": 0.2347631790989028, "learning_rate": 9.932618372523712e-07, "loss": 0.3174, "step": 406 }, { "epoch": 0.20514112903225806, "grad_norm": 0.2618825790896803, "learning_rate": 9.932178250080905e-07, "loss": 0.3053, "step": 407 }, { "epoch": 0.2056451612903226, "grad_norm": 0.8173532124618281, "learning_rate": 9.931736704735226e-07, "loss": 0.2987, "step": 408 }, { "epoch": 0.2061491935483871, "grad_norm": 0.6079169109005206, "learning_rate": 9.931293736614059e-07, "loss": 0.3131, "step": 409 }, { "epoch": 0.20665322580645162, "grad_norm": 0.4779413914330351, "learning_rate": 9.930849345845195e-07, "loss": 0.3095, "step": 410 }, { "epoch": 0.20715725806451613, "grad_norm": 0.5403444992322752, "learning_rate": 9.930403532556841e-07, "loss": 0.2985, "step": 411 }, { "epoch": 0.20766129032258066, "grad_norm": 0.33724492734825506, "learning_rate": 9.929956296877609e-07, "loss": 0.3029, "step": 412 }, { "epoch": 0.20816532258064516, "grad_norm": 0.803588317511041, "learning_rate": 9.929507638936527e-07, "loss": 0.304, "step": 413 }, { "epoch": 0.2086693548387097, "grad_norm": 0.24653864233446288, "learning_rate": 9.929057558863025e-07, "loss": 0.2991, "step": 414 }, { "epoch": 0.2091733870967742, "grad_norm": 0.3080563000251818, "learning_rate": 9.928606056786953e-07, "loss": 0.3045, "step": 415 }, { "epoch": 0.20967741935483872, "grad_norm": 0.23276679829744804, "learning_rate": 9.928153132838564e-07, "loss": 0.2998, "step": 416 }, { "epoch": 0.21018145161290322, "grad_norm": 0.3031284409615895, "learning_rate": 9.927698787148524e-07, "loss": 0.3129, "step": 417 }, { "epoch": 0.21068548387096775, "grad_norm": 0.7190277552056444, "learning_rate": 9.92724301984791e-07, "loss": 0.318, "step": 418 }, { "epoch": 0.21118951612903225, "grad_norm": 0.671405235151991, "learning_rate": 9.92678583106821e-07, "loss": 0.3152, "step": 419 }, { "epoch": 0.21169354838709678, "grad_norm": 0.26427462576363475, "learning_rate": 9.926327220941313e-07, "loss": 0.3131, "step": 420 }, { "epoch": 0.21219758064516128, "grad_norm": 0.21989852886239047, "learning_rate": 9.925867189599534e-07, "loss": 0.2913, "step": 421 }, { "epoch": 0.2127016129032258, "grad_norm": 0.5902102443498939, "learning_rate": 9.925405737175582e-07, "loss": 0.3047, "step": 422 }, { "epoch": 0.2132056451612903, "grad_norm": 0.6036703980264593, "learning_rate": 9.924942863802586e-07, "loss": 0.3023, "step": 423 }, { "epoch": 0.21370967741935484, "grad_norm": 0.6785617848990195, "learning_rate": 9.924478569614084e-07, "loss": 0.3059, "step": 424 }, { "epoch": 0.21421370967741934, "grad_norm": 0.41465435154148006, "learning_rate": 9.924012854744019e-07, "loss": 0.2937, "step": 425 }, { "epoch": 0.21471774193548387, "grad_norm": 0.5756370196897739, "learning_rate": 9.923545719326748e-07, "loss": 0.3003, "step": 426 }, { "epoch": 0.21522177419354838, "grad_norm": 1.1087103802236575, "learning_rate": 9.923077163497037e-07, "loss": 0.3115, "step": 427 }, { "epoch": 0.2157258064516129, "grad_norm": 0.5075052144087366, "learning_rate": 9.922607187390062e-07, "loss": 0.2861, "step": 428 }, { "epoch": 0.2162298387096774, "grad_norm": 0.21875262709516646, "learning_rate": 9.922135791141407e-07, "loss": 0.3111, "step": 429 }, { "epoch": 0.21673387096774194, "grad_norm": 0.5081145594897964, "learning_rate": 9.921662974887067e-07, "loss": 0.3012, "step": 430 }, { "epoch": 0.21723790322580644, "grad_norm": 0.3760477536962576, "learning_rate": 9.921188738763447e-07, "loss": 0.307, "step": 431 }, { "epoch": 0.21774193548387097, "grad_norm": 0.3082054028318284, "learning_rate": 9.92071308290736e-07, "loss": 0.3171, "step": 432 }, { "epoch": 0.21824596774193547, "grad_norm": 0.22029848103815816, "learning_rate": 9.920236007456031e-07, "loss": 0.2931, "step": 433 }, { "epoch": 0.21875, "grad_norm": 0.41253335918523026, "learning_rate": 9.919757512547094e-07, "loss": 0.3011, "step": 434 }, { "epoch": 0.21925403225806453, "grad_norm": 0.5610820660085883, "learning_rate": 9.91927759831859e-07, "loss": 0.3088, "step": 435 }, { "epoch": 0.21975806451612903, "grad_norm": 0.35401767629764336, "learning_rate": 9.918796264908973e-07, "loss": 0.3003, "step": 436 }, { "epoch": 0.22026209677419356, "grad_norm": 0.28203957531336254, "learning_rate": 9.918313512457104e-07, "loss": 0.3081, "step": 437 }, { "epoch": 0.22076612903225806, "grad_norm": 0.2554173995348508, "learning_rate": 9.917829341102254e-07, "loss": 0.2942, "step": 438 }, { "epoch": 0.2212701612903226, "grad_norm": 0.23639618149360714, "learning_rate": 9.917343750984102e-07, "loss": 0.3021, "step": 439 }, { "epoch": 0.2217741935483871, "grad_norm": 0.4305124117168449, "learning_rate": 9.91685674224274e-07, "loss": 0.3085, "step": 440 }, { "epoch": 0.22227822580645162, "grad_norm": 0.2220338714231415, "learning_rate": 9.916368315018666e-07, "loss": 0.3028, "step": 441 }, { "epoch": 0.22278225806451613, "grad_norm": 0.2927983739524596, "learning_rate": 9.91587846945279e-07, "loss": 0.3017, "step": 442 }, { "epoch": 0.22328629032258066, "grad_norm": 0.3420210184854715, "learning_rate": 9.915387205686427e-07, "loss": 0.3081, "step": 443 }, { "epoch": 0.22379032258064516, "grad_norm": 0.39311494277912007, "learning_rate": 9.914894523861304e-07, "loss": 0.3028, "step": 444 }, { "epoch": 0.2242943548387097, "grad_norm": 0.4151663339605208, "learning_rate": 9.914400424119555e-07, "loss": 0.2956, "step": 445 }, { "epoch": 0.2247983870967742, "grad_norm": 0.2123585820142778, "learning_rate": 9.91390490660373e-07, "loss": 0.3075, "step": 446 }, { "epoch": 0.22530241935483872, "grad_norm": 0.571261555860595, "learning_rate": 9.913407971456778e-07, "loss": 0.3013, "step": 447 }, { "epoch": 0.22580645161290322, "grad_norm": 0.23955795641291014, "learning_rate": 9.912909618822063e-07, "loss": 0.3134, "step": 448 }, { "epoch": 0.22631048387096775, "grad_norm": 0.570859883971083, "learning_rate": 9.912409848843358e-07, "loss": 0.3056, "step": 449 }, { "epoch": 0.22681451612903225, "grad_norm": 0.48199321601840345, "learning_rate": 9.911908661664842e-07, "loss": 0.3154, "step": 450 }, { "epoch": 0.22731854838709678, "grad_norm": 0.5422943126443341, "learning_rate": 9.911406057431104e-07, "loss": 0.2987, "step": 451 }, { "epoch": 0.22782258064516128, "grad_norm": 0.3574993769315431, "learning_rate": 9.910902036287143e-07, "loss": 0.2941, "step": 452 }, { "epoch": 0.2283266129032258, "grad_norm": 1.1833977307135548, "learning_rate": 9.910396598378366e-07, "loss": 0.3017, "step": 453 }, { "epoch": 0.2288306451612903, "grad_norm": 0.25337165989867305, "learning_rate": 9.90988974385059e-07, "loss": 0.2925, "step": 454 }, { "epoch": 0.22933467741935484, "grad_norm": 0.2480951752658535, "learning_rate": 9.909381472850036e-07, "loss": 0.3021, "step": 455 }, { "epoch": 0.22983870967741934, "grad_norm": 0.409014941345192, "learning_rate": 9.90887178552334e-07, "loss": 0.3272, "step": 456 }, { "epoch": 0.23034274193548387, "grad_norm": 0.7457987975927259, "learning_rate": 9.908360682017544e-07, "loss": 0.3048, "step": 457 }, { "epoch": 0.23084677419354838, "grad_norm": 0.45956267234032566, "learning_rate": 9.907848162480094e-07, "loss": 0.3066, "step": 458 }, { "epoch": 0.2313508064516129, "grad_norm": 0.3779765403542269, "learning_rate": 9.907334227058855e-07, "loss": 0.3009, "step": 459 }, { "epoch": 0.2318548387096774, "grad_norm": 0.2666151159273384, "learning_rate": 9.90681887590209e-07, "loss": 0.3124, "step": 460 }, { "epoch": 0.23235887096774194, "grad_norm": 0.2839428445664857, "learning_rate": 9.906302109158474e-07, "loss": 0.3034, "step": 461 }, { "epoch": 0.23286290322580644, "grad_norm": 0.4380368120654733, "learning_rate": 9.905783926977094e-07, "loss": 0.3087, "step": 462 }, { "epoch": 0.23336693548387097, "grad_norm": 0.46446987722443317, "learning_rate": 9.90526432950744e-07, "loss": 0.3066, "step": 463 }, { "epoch": 0.23387096774193547, "grad_norm": 0.2400929003281859, "learning_rate": 9.904743316899412e-07, "loss": 0.2971, "step": 464 }, { "epoch": 0.234375, "grad_norm": 0.22533888473808292, "learning_rate": 9.904220889303322e-07, "loss": 0.3034, "step": 465 }, { "epoch": 0.23487903225806453, "grad_norm": 0.5169333077194566, "learning_rate": 9.903697046869885e-07, "loss": 0.3058, "step": 466 }, { "epoch": 0.23538306451612903, "grad_norm": 0.37976744599257856, "learning_rate": 9.903171789750227e-07, "loss": 0.2943, "step": 467 }, { "epoch": 0.23588709677419356, "grad_norm": 0.303653990179142, "learning_rate": 9.90264511809588e-07, "loss": 0.3037, "step": 468 }, { "epoch": 0.23639112903225806, "grad_norm": 0.21581398194659623, "learning_rate": 9.902117032058788e-07, "loss": 0.3142, "step": 469 }, { "epoch": 0.2368951612903226, "grad_norm": 0.4025038895307978, "learning_rate": 9.9015875317913e-07, "loss": 0.2886, "step": 470 }, { "epoch": 0.2373991935483871, "grad_norm": 0.7490982140928762, "learning_rate": 9.90105661744617e-07, "loss": 0.3048, "step": 471 }, { "epoch": 0.23790322580645162, "grad_norm": 0.2765613593877836, "learning_rate": 9.900524289176571e-07, "loss": 0.3113, "step": 472 }, { "epoch": 0.23840725806451613, "grad_norm": 0.25206617170451434, "learning_rate": 9.899990547136068e-07, "loss": 0.3031, "step": 473 }, { "epoch": 0.23891129032258066, "grad_norm": 0.5866005018512471, "learning_rate": 9.899455391478646e-07, "loss": 0.3011, "step": 474 }, { "epoch": 0.23941532258064516, "grad_norm": 0.39806300430021535, "learning_rate": 9.898918822358695e-07, "loss": 0.2955, "step": 475 }, { "epoch": 0.2399193548387097, "grad_norm": 0.2584605383681581, "learning_rate": 9.898380839931012e-07, "loss": 0.3044, "step": 476 }, { "epoch": 0.2404233870967742, "grad_norm": 0.19700196645461712, "learning_rate": 9.897841444350799e-07, "loss": 0.2983, "step": 477 }, { "epoch": 0.24092741935483872, "grad_norm": 0.306687320143844, "learning_rate": 9.89730063577367e-07, "loss": 0.2963, "step": 478 }, { "epoch": 0.24143145161290322, "grad_norm": 0.19514269842432405, "learning_rate": 9.896758414355646e-07, "loss": 0.2852, "step": 479 }, { "epoch": 0.24193548387096775, "grad_norm": 0.2903995442375573, "learning_rate": 9.89621478025315e-07, "loss": 0.3039, "step": 480 }, { "epoch": 0.24243951612903225, "grad_norm": 0.24936460961507168, "learning_rate": 9.895669733623024e-07, "loss": 0.3025, "step": 481 }, { "epoch": 0.24294354838709678, "grad_norm": 0.20112659720170964, "learning_rate": 9.895123274622506e-07, "loss": 0.308, "step": 482 }, { "epoch": 0.24344758064516128, "grad_norm": 0.3732280770776686, "learning_rate": 9.894575403409246e-07, "loss": 0.3048, "step": 483 }, { "epoch": 0.2439516129032258, "grad_norm": 0.3084274056795106, "learning_rate": 9.894026120141304e-07, "loss": 0.2999, "step": 484 }, { "epoch": 0.2444556451612903, "grad_norm": 0.23552597211145404, "learning_rate": 9.893475424977143e-07, "loss": 0.3073, "step": 485 }, { "epoch": 0.24495967741935484, "grad_norm": 1.0356894202770972, "learning_rate": 9.892923318075634e-07, "loss": 0.2958, "step": 486 }, { "epoch": 0.24546370967741934, "grad_norm": 0.22009519521768645, "learning_rate": 9.892369799596057e-07, "loss": 0.3059, "step": 487 }, { "epoch": 0.24596774193548387, "grad_norm": 0.2573429172383206, "learning_rate": 9.891814869698101e-07, "loss": 0.2992, "step": 488 }, { "epoch": 0.24647177419354838, "grad_norm": 0.2350845142780554, "learning_rate": 9.891258528541859e-07, "loss": 0.3033, "step": 489 }, { "epoch": 0.2469758064516129, "grad_norm": 0.2879929160489008, "learning_rate": 9.89070077628783e-07, "loss": 0.3117, "step": 490 }, { "epoch": 0.2474798387096774, "grad_norm": 0.25961881540855053, "learning_rate": 9.890141613096924e-07, "loss": 0.3168, "step": 491 }, { "epoch": 0.24798387096774194, "grad_norm": 0.19180476078696093, "learning_rate": 9.889581039130455e-07, "loss": 0.2868, "step": 492 }, { "epoch": 0.24848790322580644, "grad_norm": 0.45241056266402024, "learning_rate": 9.889019054550144e-07, "loss": 0.304, "step": 493 }, { "epoch": 0.24899193548387097, "grad_norm": 0.24790970349975638, "learning_rate": 9.888455659518124e-07, "loss": 0.3188, "step": 494 }, { "epoch": 0.24949596774193547, "grad_norm": 0.22736377748906864, "learning_rate": 9.887890854196928e-07, "loss": 0.3086, "step": 495 }, { "epoch": 0.25, "grad_norm": 0.2157688569695134, "learning_rate": 9.8873246387495e-07, "loss": 0.3055, "step": 496 }, { "epoch": 0.2505040322580645, "grad_norm": 0.2401397569367212, "learning_rate": 9.886757013339188e-07, "loss": 0.2903, "step": 497 }, { "epoch": 0.25100806451612906, "grad_norm": 0.27724457476206976, "learning_rate": 9.88618797812975e-07, "loss": 0.2809, "step": 498 }, { "epoch": 0.25151209677419356, "grad_norm": 0.22112121556855863, "learning_rate": 9.885617533285349e-07, "loss": 0.3083, "step": 499 }, { "epoch": 0.25201612903225806, "grad_norm": 0.26659993041208113, "learning_rate": 9.885045678970554e-07, "loss": 0.2945, "step": 500 }, { "epoch": 0.25252016129032256, "grad_norm": 0.474336083459808, "learning_rate": 9.884472415350342e-07, "loss": 0.3066, "step": 501 }, { "epoch": 0.2530241935483871, "grad_norm": 0.3246752836417456, "learning_rate": 9.883897742590094e-07, "loss": 0.315, "step": 502 }, { "epoch": 0.2535282258064516, "grad_norm": 0.19279895004344805, "learning_rate": 9.883321660855604e-07, "loss": 0.2902, "step": 503 }, { "epoch": 0.2540322580645161, "grad_norm": 0.26362617716913594, "learning_rate": 9.882744170313065e-07, "loss": 0.3309, "step": 504 }, { "epoch": 0.2545362903225806, "grad_norm": 0.5083521686121811, "learning_rate": 9.882165271129078e-07, "loss": 0.3039, "step": 505 }, { "epoch": 0.2550403225806452, "grad_norm": 0.20136858022759457, "learning_rate": 9.881584963470657e-07, "loss": 0.3135, "step": 506 }, { "epoch": 0.2555443548387097, "grad_norm": 0.24947755269293234, "learning_rate": 9.88100324750521e-07, "loss": 0.3, "step": 507 }, { "epoch": 0.2560483870967742, "grad_norm": 0.1934113012969022, "learning_rate": 9.880420123400567e-07, "loss": 0.2998, "step": 508 }, { "epoch": 0.2565524193548387, "grad_norm": 0.19024377903728643, "learning_rate": 9.879835591324947e-07, "loss": 0.3191, "step": 509 }, { "epoch": 0.25705645161290325, "grad_norm": 0.19125874820918395, "learning_rate": 9.87924965144699e-07, "loss": 0.3075, "step": 510 }, { "epoch": 0.25756048387096775, "grad_norm": 0.19244259184808277, "learning_rate": 9.878662303935732e-07, "loss": 0.2956, "step": 511 }, { "epoch": 0.25806451612903225, "grad_norm": 0.22509070180606666, "learning_rate": 9.878073548960623e-07, "loss": 0.3218, "step": 512 }, { "epoch": 0.25856854838709675, "grad_norm": 0.2717698227036736, "learning_rate": 9.877483386691513e-07, "loss": 0.3027, "step": 513 }, { "epoch": 0.2590725806451613, "grad_norm": 0.1867442771638362, "learning_rate": 9.876891817298658e-07, "loss": 0.2846, "step": 514 }, { "epoch": 0.2595766129032258, "grad_norm": 0.2377584026295176, "learning_rate": 9.876298840952726e-07, "loss": 0.2849, "step": 515 }, { "epoch": 0.2600806451612903, "grad_norm": 0.2936917090291921, "learning_rate": 9.875704457824786e-07, "loss": 0.3013, "step": 516 }, { "epoch": 0.2605846774193548, "grad_norm": 0.4263228296671513, "learning_rate": 9.875108668086313e-07, "loss": 0.2846, "step": 517 }, { "epoch": 0.2610887096774194, "grad_norm": 0.2718679782241284, "learning_rate": 9.874511471909189e-07, "loss": 0.2842, "step": 518 }, { "epoch": 0.2615927419354839, "grad_norm": 0.18725434376492076, "learning_rate": 9.873912869465701e-07, "loss": 0.3015, "step": 519 }, { "epoch": 0.2620967741935484, "grad_norm": 0.19020214607186645, "learning_rate": 9.873312860928541e-07, "loss": 0.3149, "step": 520 }, { "epoch": 0.2626008064516129, "grad_norm": 0.244741870819488, "learning_rate": 9.87271144647081e-07, "loss": 0.305, "step": 521 }, { "epoch": 0.26310483870967744, "grad_norm": 0.22885289257455954, "learning_rate": 9.872108626266014e-07, "loss": 0.2933, "step": 522 }, { "epoch": 0.26360887096774194, "grad_norm": 0.29727559200743137, "learning_rate": 9.871504400488059e-07, "loss": 0.2989, "step": 523 }, { "epoch": 0.26411290322580644, "grad_norm": 0.3347319040673116, "learning_rate": 9.870898769311261e-07, "loss": 0.3003, "step": 524 }, { "epoch": 0.26461693548387094, "grad_norm": 0.21285823806164653, "learning_rate": 9.870291732910343e-07, "loss": 0.2909, "step": 525 }, { "epoch": 0.2651209677419355, "grad_norm": 0.19714692459161542, "learning_rate": 9.86968329146043e-07, "loss": 0.2863, "step": 526 }, { "epoch": 0.265625, "grad_norm": 0.37762602100369747, "learning_rate": 9.869073445137054e-07, "loss": 0.3072, "step": 527 }, { "epoch": 0.2661290322580645, "grad_norm": 0.2322570227181064, "learning_rate": 9.868462194116149e-07, "loss": 0.2952, "step": 528 }, { "epoch": 0.26663306451612906, "grad_norm": 0.25028173693381045, "learning_rate": 9.86784953857406e-07, "loss": 0.2979, "step": 529 }, { "epoch": 0.26713709677419356, "grad_norm": 0.23342075160977557, "learning_rate": 9.867235478687534e-07, "loss": 0.3029, "step": 530 }, { "epoch": 0.26764112903225806, "grad_norm": 0.2202223368280641, "learning_rate": 9.866620014633725e-07, "loss": 0.2995, "step": 531 }, { "epoch": 0.26814516129032256, "grad_norm": 0.40688579823270754, "learning_rate": 9.866003146590186e-07, "loss": 0.2998, "step": 532 }, { "epoch": 0.2686491935483871, "grad_norm": 0.23558843514280095, "learning_rate": 9.865384874734886e-07, "loss": 0.2919, "step": 533 }, { "epoch": 0.2691532258064516, "grad_norm": 0.350876653755429, "learning_rate": 9.864765199246187e-07, "loss": 0.2979, "step": 534 }, { "epoch": 0.2696572580645161, "grad_norm": 0.5484769536987208, "learning_rate": 9.864144120302865e-07, "loss": 0.3098, "step": 535 }, { "epoch": 0.2701612903225806, "grad_norm": 0.20073285824672496, "learning_rate": 9.863521638084093e-07, "loss": 0.299, "step": 536 }, { "epoch": 0.2706653225806452, "grad_norm": 0.18356884936864395, "learning_rate": 9.86289775276946e-07, "loss": 0.3042, "step": 537 }, { "epoch": 0.2711693548387097, "grad_norm": 0.2886255584956994, "learning_rate": 9.862272464538946e-07, "loss": 0.3132, "step": 538 }, { "epoch": 0.2716733870967742, "grad_norm": 0.2658005187689356, "learning_rate": 9.86164577357295e-07, "loss": 0.308, "step": 539 }, { "epoch": 0.2721774193548387, "grad_norm": 0.36659782067418106, "learning_rate": 9.861017680052262e-07, "loss": 0.2983, "step": 540 }, { "epoch": 0.27268145161290325, "grad_norm": 0.26876552668494674, "learning_rate": 9.860388184158086e-07, "loss": 0.3026, "step": 541 }, { "epoch": 0.27318548387096775, "grad_norm": 0.18123995001034987, "learning_rate": 9.859757286072028e-07, "loss": 0.2984, "step": 542 }, { "epoch": 0.27368951612903225, "grad_norm": 0.3490662102838747, "learning_rate": 9.859124985976097e-07, "loss": 0.3006, "step": 543 }, { "epoch": 0.27419354838709675, "grad_norm": 0.5077591494356637, "learning_rate": 9.858491284052708e-07, "loss": 0.2942, "step": 544 }, { "epoch": 0.2746975806451613, "grad_norm": 0.1899610370247458, "learning_rate": 9.857856180484682e-07, "loss": 0.2952, "step": 545 }, { "epoch": 0.2752016129032258, "grad_norm": 0.33906573112018934, "learning_rate": 9.857219675455236e-07, "loss": 0.2825, "step": 546 }, { "epoch": 0.2757056451612903, "grad_norm": 0.24179860042616932, "learning_rate": 9.856581769148007e-07, "loss": 0.2963, "step": 547 }, { "epoch": 0.2762096774193548, "grad_norm": 0.24173305167642747, "learning_rate": 9.855942461747023e-07, "loss": 0.3157, "step": 548 }, { "epoch": 0.2767137096774194, "grad_norm": 0.28928103551655465, "learning_rate": 9.855301753436718e-07, "loss": 0.3087, "step": 549 }, { "epoch": 0.2772177419354839, "grad_norm": 0.18922268926852628, "learning_rate": 9.854659644401934e-07, "loss": 0.3134, "step": 550 }, { "epoch": 0.2777217741935484, "grad_norm": 0.20784139183584166, "learning_rate": 9.854016134827916e-07, "loss": 0.3051, "step": 551 }, { "epoch": 0.2782258064516129, "grad_norm": 0.5667720250099785, "learning_rate": 9.853371224900313e-07, "loss": 0.2847, "step": 552 }, { "epoch": 0.27872983870967744, "grad_norm": 0.34781718527992644, "learning_rate": 9.852724914805175e-07, "loss": 0.2873, "step": 553 }, { "epoch": 0.27923387096774194, "grad_norm": 0.19716253139941464, "learning_rate": 9.852077204728961e-07, "loss": 0.3027, "step": 554 }, { "epoch": 0.27973790322580644, "grad_norm": 0.19344190301481812, "learning_rate": 9.85142809485853e-07, "loss": 0.2897, "step": 555 }, { "epoch": 0.28024193548387094, "grad_norm": 0.7127285715175351, "learning_rate": 9.850777585381146e-07, "loss": 0.2866, "step": 556 }, { "epoch": 0.2807459677419355, "grad_norm": 0.40797443381529386, "learning_rate": 9.85012567648448e-07, "loss": 0.3056, "step": 557 }, { "epoch": 0.28125, "grad_norm": 0.2136697521115876, "learning_rate": 9.8494723683566e-07, "loss": 0.3064, "step": 558 }, { "epoch": 0.2817540322580645, "grad_norm": 0.26165293739971646, "learning_rate": 9.848817661185984e-07, "loss": 0.2921, "step": 559 }, { "epoch": 0.28225806451612906, "grad_norm": 0.1846995068150634, "learning_rate": 9.848161555161507e-07, "loss": 0.3131, "step": 560 }, { "epoch": 0.28276209677419356, "grad_norm": 1.0083499880979159, "learning_rate": 9.847504050472454e-07, "loss": 0.3053, "step": 561 }, { "epoch": 0.28326612903225806, "grad_norm": 0.6105046373752532, "learning_rate": 9.846845147308514e-07, "loss": 0.3082, "step": 562 }, { "epoch": 0.28377016129032256, "grad_norm": 0.32182959498303215, "learning_rate": 9.846184845859772e-07, "loss": 0.2886, "step": 563 }, { "epoch": 0.2842741935483871, "grad_norm": 0.5616737715729443, "learning_rate": 9.845523146316722e-07, "loss": 0.3138, "step": 564 }, { "epoch": 0.2847782258064516, "grad_norm": 0.34317343338743084, "learning_rate": 9.844860048870261e-07, "loss": 0.3058, "step": 565 }, { "epoch": 0.2852822580645161, "grad_norm": 0.38672910543834893, "learning_rate": 9.84419555371169e-07, "loss": 0.2991, "step": 566 }, { "epoch": 0.2857862903225806, "grad_norm": 0.27114300647024586, "learning_rate": 9.843529661032706e-07, "loss": 0.3083, "step": 567 }, { "epoch": 0.2862903225806452, "grad_norm": 0.1896232339668482, "learning_rate": 9.842862371025422e-07, "loss": 0.3025, "step": 568 }, { "epoch": 0.2867943548387097, "grad_norm": 0.3930098230344227, "learning_rate": 9.842193683882344e-07, "loss": 0.3079, "step": 569 }, { "epoch": 0.2872983870967742, "grad_norm": 0.43124046850579134, "learning_rate": 9.841523599796382e-07, "loss": 0.2777, "step": 570 }, { "epoch": 0.2878024193548387, "grad_norm": 0.7587117457845125, "learning_rate": 9.840852118960853e-07, "loss": 0.3064, "step": 571 }, { "epoch": 0.28830645161290325, "grad_norm": 0.3117074250999561, "learning_rate": 9.840179241569478e-07, "loss": 0.299, "step": 572 }, { "epoch": 0.28881048387096775, "grad_norm": 0.18731266500576274, "learning_rate": 9.839504967816374e-07, "loss": 0.3035, "step": 573 }, { "epoch": 0.28931451612903225, "grad_norm": 0.43778747829645503, "learning_rate": 9.838829297896065e-07, "loss": 0.319, "step": 574 }, { "epoch": 0.28981854838709675, "grad_norm": 0.40776414993659865, "learning_rate": 9.83815223200348e-07, "loss": 0.2955, "step": 575 }, { "epoch": 0.2903225806451613, "grad_norm": 0.25990172322006877, "learning_rate": 9.837473770333945e-07, "loss": 0.3132, "step": 576 }, { "epoch": 0.2908266129032258, "grad_norm": 0.2296880045553122, "learning_rate": 9.836793913083195e-07, "loss": 0.2936, "step": 577 }, { "epoch": 0.2913306451612903, "grad_norm": 0.1953127561261216, "learning_rate": 9.836112660447362e-07, "loss": 0.3138, "step": 578 }, { "epoch": 0.2918346774193548, "grad_norm": 0.27208394851949347, "learning_rate": 9.835430012622988e-07, "loss": 0.313, "step": 579 }, { "epoch": 0.2923387096774194, "grad_norm": 0.2877633978499272, "learning_rate": 9.834745969807006e-07, "loss": 0.3013, "step": 580 }, { "epoch": 0.2928427419354839, "grad_norm": 0.20402433175664483, "learning_rate": 9.834060532196761e-07, "loss": 0.2948, "step": 581 }, { "epoch": 0.2933467741935484, "grad_norm": 0.3541481547851912, "learning_rate": 9.833373699989999e-07, "loss": 0.3057, "step": 582 }, { "epoch": 0.2938508064516129, "grad_norm": 0.25513070455002546, "learning_rate": 9.832685473384868e-07, "loss": 0.2972, "step": 583 }, { "epoch": 0.29435483870967744, "grad_norm": 0.6078105436530532, "learning_rate": 9.83199585257991e-07, "loss": 0.2974, "step": 584 }, { "epoch": 0.29485887096774194, "grad_norm": 0.2701844685693586, "learning_rate": 9.831304837774086e-07, "loss": 0.314, "step": 585 }, { "epoch": 0.29536290322580644, "grad_norm": 0.19399840828869006, "learning_rate": 9.830612429166743e-07, "loss": 0.3031, "step": 586 }, { "epoch": 0.29586693548387094, "grad_norm": 0.184738711676414, "learning_rate": 9.829918626957635e-07, "loss": 0.3031, "step": 587 }, { "epoch": 0.2963709677419355, "grad_norm": 0.3269160031771926, "learning_rate": 9.829223431346926e-07, "loss": 0.2911, "step": 588 }, { "epoch": 0.296875, "grad_norm": 0.38332552633652756, "learning_rate": 9.828526842535174e-07, "loss": 0.2864, "step": 589 }, { "epoch": 0.2973790322580645, "grad_norm": 0.21468223114928167, "learning_rate": 9.827828860723338e-07, "loss": 0.3109, "step": 590 }, { "epoch": 0.29788306451612906, "grad_norm": 0.18987287925768598, "learning_rate": 9.827129486112782e-07, "loss": 0.3011, "step": 591 }, { "epoch": 0.29838709677419356, "grad_norm": 0.3156090049134527, "learning_rate": 9.82642871890527e-07, "loss": 0.2906, "step": 592 }, { "epoch": 0.29889112903225806, "grad_norm": 0.40698452999719154, "learning_rate": 9.82572655930297e-07, "loss": 0.306, "step": 593 }, { "epoch": 0.29939516129032256, "grad_norm": 0.24367422580598258, "learning_rate": 9.825023007508456e-07, "loss": 0.3006, "step": 594 }, { "epoch": 0.2998991935483871, "grad_norm": 0.27250412398540247, "learning_rate": 9.82431806372469e-07, "loss": 0.2989, "step": 595 }, { "epoch": 0.3004032258064516, "grad_norm": 0.19246483798552555, "learning_rate": 9.82361172815505e-07, "loss": 0.2973, "step": 596 }, { "epoch": 0.3009072580645161, "grad_norm": 0.18545484272367835, "learning_rate": 9.822904001003306e-07, "loss": 0.2996, "step": 597 }, { "epoch": 0.3009072580645161, "eval_loss": 0.32989731431007385, "eval_runtime": 18.7201, "eval_samples_per_second": 45.673, "eval_steps_per_second": 0.962, "step": 597 }, { "epoch": 0.3014112903225806, "grad_norm": 0.5063679318757863, "learning_rate": 9.822194882473635e-07, "loss": 0.2864, "step": 598 }, { "epoch": 0.3019153225806452, "grad_norm": 0.2855392122632754, "learning_rate": 9.821484372770612e-07, "loss": 0.295, "step": 599 }, { "epoch": 0.3024193548387097, "grad_norm": 0.6735980580137383, "learning_rate": 9.820772472099215e-07, "loss": 0.2951, "step": 600 }, { "epoch": 0.3029233870967742, "grad_norm": 0.35135382164679746, "learning_rate": 9.82005918066482e-07, "loss": 0.31, "step": 601 }, { "epoch": 0.3034274193548387, "grad_norm": 0.19875119958210918, "learning_rate": 9.819344498673215e-07, "loss": 0.2956, "step": 602 }, { "epoch": 0.30393145161290325, "grad_norm": 0.9315288935617498, "learning_rate": 9.818628426330574e-07, "loss": 0.3217, "step": 603 }, { "epoch": 0.30443548387096775, "grad_norm": 1.0733717026504053, "learning_rate": 9.817910963843481e-07, "loss": 0.302, "step": 604 }, { "epoch": 0.30493951612903225, "grad_norm": 0.1821439303858277, "learning_rate": 9.81719211141892e-07, "loss": 0.2951, "step": 605 }, { "epoch": 0.30544354838709675, "grad_norm": 0.2227625770904509, "learning_rate": 9.816471869264278e-07, "loss": 0.2962, "step": 606 }, { "epoch": 0.3059475806451613, "grad_norm": 0.6196036153090734, "learning_rate": 9.815750237587338e-07, "loss": 0.3029, "step": 607 }, { "epoch": 0.3064516129032258, "grad_norm": 0.6010581227628021, "learning_rate": 9.815027216596282e-07, "loss": 0.3046, "step": 608 }, { "epoch": 0.3069556451612903, "grad_norm": 0.3528611360569492, "learning_rate": 9.814302806499707e-07, "loss": 0.2932, "step": 609 }, { "epoch": 0.3074596774193548, "grad_norm": 0.2979313249836626, "learning_rate": 9.813577007506594e-07, "loss": 0.2899, "step": 610 }, { "epoch": 0.3079637096774194, "grad_norm": 0.489841762858414, "learning_rate": 9.81284981982633e-07, "loss": 0.2887, "step": 611 }, { "epoch": 0.3084677419354839, "grad_norm": 0.32286398990319887, "learning_rate": 9.81212124366871e-07, "loss": 0.2972, "step": 612 }, { "epoch": 0.3089717741935484, "grad_norm": 0.3654449282383879, "learning_rate": 9.81139127924392e-07, "loss": 0.2817, "step": 613 }, { "epoch": 0.3094758064516129, "grad_norm": 0.23506329847966434, "learning_rate": 9.810659926762551e-07, "loss": 0.3041, "step": 614 }, { "epoch": 0.30997983870967744, "grad_norm": 0.18739033718362563, "learning_rate": 9.809927186435594e-07, "loss": 0.3039, "step": 615 }, { "epoch": 0.31048387096774194, "grad_norm": 0.4424195854136909, "learning_rate": 9.809193058474438e-07, "loss": 0.3027, "step": 616 }, { "epoch": 0.31098790322580644, "grad_norm": 0.34612288423111537, "learning_rate": 9.808457543090878e-07, "loss": 0.2893, "step": 617 }, { "epoch": 0.31149193548387094, "grad_norm": 0.6207470476564412, "learning_rate": 9.807720640497103e-07, "loss": 0.3089, "step": 618 }, { "epoch": 0.3119959677419355, "grad_norm": 0.5606872023789801, "learning_rate": 9.806982350905703e-07, "loss": 0.3051, "step": 619 }, { "epoch": 0.3125, "grad_norm": 0.6658134114343093, "learning_rate": 9.806242674529676e-07, "loss": 0.2879, "step": 620 }, { "epoch": 0.3130040322580645, "grad_norm": 0.650400551133684, "learning_rate": 9.80550161158241e-07, "loss": 0.2998, "step": 621 }, { "epoch": 0.31350806451612906, "grad_norm": 0.17947781258266207, "learning_rate": 9.804759162277696e-07, "loss": 0.3134, "step": 622 }, { "epoch": 0.31401209677419356, "grad_norm": 0.3273834000987335, "learning_rate": 9.804015326829728e-07, "loss": 0.3108, "step": 623 }, { "epoch": 0.31451612903225806, "grad_norm": 0.27388099966321255, "learning_rate": 9.803270105453098e-07, "loss": 0.2995, "step": 624 }, { "epoch": 0.31502016129032256, "grad_norm": 0.4361569565465292, "learning_rate": 9.802523498362797e-07, "loss": 0.3037, "step": 625 }, { "epoch": 0.3155241935483871, "grad_norm": 0.43147162718536736, "learning_rate": 9.801775505774218e-07, "loss": 0.3142, "step": 626 }, { "epoch": 0.3160282258064516, "grad_norm": 0.23130882783317155, "learning_rate": 9.801026127903149e-07, "loss": 0.2981, "step": 627 }, { "epoch": 0.3165322580645161, "grad_norm": 0.42494886746355115, "learning_rate": 9.800275364965782e-07, "loss": 0.3083, "step": 628 }, { "epoch": 0.3170362903225806, "grad_norm": 0.23979029210918654, "learning_rate": 9.79952321717871e-07, "loss": 0.2927, "step": 629 }, { "epoch": 0.3175403225806452, "grad_norm": 0.2649450830135195, "learning_rate": 9.798769684758924e-07, "loss": 0.3055, "step": 630 }, { "epoch": 0.3180443548387097, "grad_norm": 0.20691288403035124, "learning_rate": 9.798014767923807e-07, "loss": 0.2939, "step": 631 }, { "epoch": 0.3185483870967742, "grad_norm": 0.2818136169636899, "learning_rate": 9.797258466891152e-07, "loss": 0.3034, "step": 632 }, { "epoch": 0.3190524193548387, "grad_norm": 0.1855253307389335, "learning_rate": 9.796500781879148e-07, "loss": 0.2845, "step": 633 }, { "epoch": 0.31955645161290325, "grad_norm": 0.2383083001412052, "learning_rate": 9.79574171310638e-07, "loss": 0.2999, "step": 634 }, { "epoch": 0.32006048387096775, "grad_norm": 0.2155300521952254, "learning_rate": 9.794981260791837e-07, "loss": 0.3115, "step": 635 }, { "epoch": 0.32056451612903225, "grad_norm": 0.59372804366046, "learning_rate": 9.794219425154904e-07, "loss": 0.3057, "step": 636 }, { "epoch": 0.32106854838709675, "grad_norm": 0.17584328105654679, "learning_rate": 9.793456206415362e-07, "loss": 0.3007, "step": 637 }, { "epoch": 0.3215725806451613, "grad_norm": 0.2442097318205203, "learning_rate": 9.792691604793402e-07, "loss": 0.2897, "step": 638 }, { "epoch": 0.3220766129032258, "grad_norm": 0.17991513417504787, "learning_rate": 9.791925620509603e-07, "loss": 0.289, "step": 639 }, { "epoch": 0.3225806451612903, "grad_norm": 0.2022189687655052, "learning_rate": 9.791158253784945e-07, "loss": 0.3072, "step": 640 }, { "epoch": 0.3230846774193548, "grad_norm": 0.3259212247774595, "learning_rate": 9.79038950484081e-07, "loss": 0.2989, "step": 641 }, { "epoch": 0.3235887096774194, "grad_norm": 0.33045805468173267, "learning_rate": 9.789619373898981e-07, "loss": 0.3038, "step": 642 }, { "epoch": 0.3240927419354839, "grad_norm": 0.24681922232019723, "learning_rate": 9.788847861181631e-07, "loss": 0.3138, "step": 643 }, { "epoch": 0.3245967741935484, "grad_norm": 0.2518300810532734, "learning_rate": 9.788074966911337e-07, "loss": 0.2899, "step": 644 }, { "epoch": 0.3251008064516129, "grad_norm": 0.29278633614058125, "learning_rate": 9.787300691311077e-07, "loss": 0.3139, "step": 645 }, { "epoch": 0.32560483870967744, "grad_norm": 0.21645406918937576, "learning_rate": 9.786525034604224e-07, "loss": 0.309, "step": 646 }, { "epoch": 0.32610887096774194, "grad_norm": 0.3666140583384514, "learning_rate": 9.785747997014547e-07, "loss": 0.2949, "step": 647 }, { "epoch": 0.32661290322580644, "grad_norm": 0.1947942753801369, "learning_rate": 9.78496957876622e-07, "loss": 0.3075, "step": 648 }, { "epoch": 0.32711693548387094, "grad_norm": 0.18162530785958844, "learning_rate": 9.784189780083812e-07, "loss": 0.294, "step": 649 }, { "epoch": 0.3276209677419355, "grad_norm": 0.28107058253792394, "learning_rate": 9.78340860119229e-07, "loss": 0.2927, "step": 650 }, { "epoch": 0.328125, "grad_norm": 0.1863145084715224, "learning_rate": 9.782626042317015e-07, "loss": 0.2977, "step": 651 }, { "epoch": 0.3286290322580645, "grad_norm": 0.19255877376409836, "learning_rate": 9.781842103683756e-07, "loss": 0.3242, "step": 652 }, { "epoch": 0.32913306451612906, "grad_norm": 0.22119159460683763, "learning_rate": 9.78105678551867e-07, "loss": 0.2949, "step": 653 }, { "epoch": 0.32963709677419356, "grad_norm": 0.3370309906377243, "learning_rate": 9.78027008804832e-07, "loss": 0.3281, "step": 654 }, { "epoch": 0.33014112903225806, "grad_norm": 0.21007597885166748, "learning_rate": 9.779482011499662e-07, "loss": 0.3011, "step": 655 }, { "epoch": 0.33064516129032256, "grad_norm": 0.2114458853983953, "learning_rate": 9.77869255610005e-07, "loss": 0.3144, "step": 656 }, { "epoch": 0.3311491935483871, "grad_norm": 0.4562451777517263, "learning_rate": 9.77790172207724e-07, "loss": 0.2804, "step": 657 }, { "epoch": 0.3316532258064516, "grad_norm": 0.4033116791658588, "learning_rate": 9.777109509659378e-07, "loss": 0.2896, "step": 658 }, { "epoch": 0.3321572580645161, "grad_norm": 0.19637371460146463, "learning_rate": 9.776315919075015e-07, "loss": 0.2851, "step": 659 }, { "epoch": 0.3326612903225806, "grad_norm": 0.23427740658090485, "learning_rate": 9.7755209505531e-07, "loss": 0.2888, "step": 660 }, { "epoch": 0.3331653225806452, "grad_norm": 0.2742494578051774, "learning_rate": 9.77472460432297e-07, "loss": 0.2811, "step": 661 }, { "epoch": 0.3336693548387097, "grad_norm": 0.4976330279305446, "learning_rate": 9.77392688061437e-07, "loss": 0.2981, "step": 662 }, { "epoch": 0.3341733870967742, "grad_norm": 0.20762430916034352, "learning_rate": 9.773127779657442e-07, "loss": 0.2928, "step": 663 }, { "epoch": 0.3346774193548387, "grad_norm": 0.1974247496986688, "learning_rate": 9.772327301682714e-07, "loss": 0.2983, "step": 664 }, { "epoch": 0.33518145161290325, "grad_norm": 0.2693856221779068, "learning_rate": 9.771525446921123e-07, "loss": 0.2966, "step": 665 }, { "epoch": 0.33568548387096775, "grad_norm": 0.18606159238485928, "learning_rate": 9.770722215604e-07, "loss": 0.2989, "step": 666 }, { "epoch": 0.33618951612903225, "grad_norm": 0.375528189461835, "learning_rate": 9.769917607963068e-07, "loss": 0.2908, "step": 667 }, { "epoch": 0.33669354838709675, "grad_norm": 0.2859681728471663, "learning_rate": 9.769111624230457e-07, "loss": 0.2918, "step": 668 }, { "epoch": 0.3371975806451613, "grad_norm": 0.20521406800135222, "learning_rate": 9.768304264638684e-07, "loss": 0.2987, "step": 669 }, { "epoch": 0.3377016129032258, "grad_norm": 0.23397409956060417, "learning_rate": 9.76749552942067e-07, "loss": 0.3029, "step": 670 }, { "epoch": 0.3382056451612903, "grad_norm": 0.4520922291898478, "learning_rate": 9.766685418809727e-07, "loss": 0.2963, "step": 671 }, { "epoch": 0.3387096774193548, "grad_norm": 0.18452310561487084, "learning_rate": 9.76587393303957e-07, "loss": 0.2931, "step": 672 }, { "epoch": 0.3392137096774194, "grad_norm": 0.22095628174169515, "learning_rate": 9.765061072344305e-07, "loss": 0.293, "step": 673 }, { "epoch": 0.3397177419354839, "grad_norm": 0.45292152421659004, "learning_rate": 9.764246836958439e-07, "loss": 0.3098, "step": 674 }, { "epoch": 0.3402217741935484, "grad_norm": 0.22491373287467453, "learning_rate": 9.763431227116875e-07, "loss": 0.297, "step": 675 }, { "epoch": 0.3407258064516129, "grad_norm": 0.36919058274186534, "learning_rate": 9.76261424305491e-07, "loss": 0.3075, "step": 676 }, { "epoch": 0.34122983870967744, "grad_norm": 0.3042293571669918, "learning_rate": 9.761795885008236e-07, "loss": 0.3018, "step": 677 }, { "epoch": 0.34173387096774194, "grad_norm": 0.30087168043463913, "learning_rate": 9.76097615321295e-07, "loss": 0.2985, "step": 678 }, { "epoch": 0.34223790322580644, "grad_norm": 0.24352689031383537, "learning_rate": 9.760155047905534e-07, "loss": 0.2938, "step": 679 }, { "epoch": 0.34274193548387094, "grad_norm": 0.25871019913875265, "learning_rate": 9.759332569322876e-07, "loss": 0.2766, "step": 680 }, { "epoch": 0.3432459677419355, "grad_norm": 0.3732275698199399, "learning_rate": 9.758508717702253e-07, "loss": 0.3068, "step": 681 }, { "epoch": 0.34375, "grad_norm": 0.18175157181935991, "learning_rate": 9.757683493281343e-07, "loss": 0.3096, "step": 682 }, { "epoch": 0.3442540322580645, "grad_norm": 0.3020686172952058, "learning_rate": 9.756856896298218e-07, "loss": 0.2874, "step": 683 }, { "epoch": 0.34475806451612906, "grad_norm": 0.4287852398031048, "learning_rate": 9.756028926991344e-07, "loss": 0.3064, "step": 684 }, { "epoch": 0.34526209677419356, "grad_norm": 0.3877586447077066, "learning_rate": 9.755199585599587e-07, "loss": 0.3034, "step": 685 }, { "epoch": 0.34576612903225806, "grad_norm": 0.42768152833859835, "learning_rate": 9.754368872362209e-07, "loss": 0.3066, "step": 686 }, { "epoch": 0.34627016129032256, "grad_norm": 0.2249355344351282, "learning_rate": 9.75353678751886e-07, "loss": 0.2882, "step": 687 }, { "epoch": 0.3467741935483871, "grad_norm": 0.27912853498203316, "learning_rate": 9.752703331309596e-07, "loss": 0.3041, "step": 688 }, { "epoch": 0.3472782258064516, "grad_norm": 0.43506449721846435, "learning_rate": 9.751868503974862e-07, "loss": 0.2846, "step": 689 }, { "epoch": 0.3477822580645161, "grad_norm": 0.26206165096276, "learning_rate": 9.7510323057555e-07, "loss": 0.3068, "step": 690 }, { "epoch": 0.3482862903225806, "grad_norm": 0.20904315219220723, "learning_rate": 9.75019473689275e-07, "loss": 0.2997, "step": 691 }, { "epoch": 0.3487903225806452, "grad_norm": 0.19583961501100697, "learning_rate": 9.749355797628243e-07, "loss": 0.2819, "step": 692 }, { "epoch": 0.3492943548387097, "grad_norm": 0.4477203330124689, "learning_rate": 9.74851548820401e-07, "loss": 0.2889, "step": 693 }, { "epoch": 0.3497983870967742, "grad_norm": 0.465876564157471, "learning_rate": 9.747673808862476e-07, "loss": 0.2927, "step": 694 }, { "epoch": 0.3503024193548387, "grad_norm": 0.19047436893148592, "learning_rate": 9.746830759846456e-07, "loss": 0.2972, "step": 695 }, { "epoch": 0.35080645161290325, "grad_norm": 0.20298210286927948, "learning_rate": 9.745986341399166e-07, "loss": 0.2865, "step": 696 }, { "epoch": 0.35131048387096775, "grad_norm": 0.21381675798513802, "learning_rate": 9.745140553764219e-07, "loss": 0.3057, "step": 697 }, { "epoch": 0.35181451612903225, "grad_norm": 0.29038109612051655, "learning_rate": 9.744293397185615e-07, "loss": 0.2929, "step": 698 }, { "epoch": 0.35231854838709675, "grad_norm": 0.3321256067582296, "learning_rate": 9.743444871907756e-07, "loss": 0.2896, "step": 699 }, { "epoch": 0.3528225806451613, "grad_norm": 0.3056406476467989, "learning_rate": 9.742594978175436e-07, "loss": 0.2996, "step": 700 }, { "epoch": 0.3533266129032258, "grad_norm": 0.22979049368338594, "learning_rate": 9.741743716233843e-07, "loss": 0.2879, "step": 701 }, { "epoch": 0.3538306451612903, "grad_norm": 0.2167981736696398, "learning_rate": 9.74089108632856e-07, "loss": 0.3008, "step": 702 }, { "epoch": 0.3543346774193548, "grad_norm": 0.6466745464300053, "learning_rate": 9.74003708870557e-07, "loss": 0.2883, "step": 703 }, { "epoch": 0.3548387096774194, "grad_norm": 0.5205046458590996, "learning_rate": 9.73918172361124e-07, "loss": 0.2941, "step": 704 }, { "epoch": 0.3553427419354839, "grad_norm": 0.4750444899997186, "learning_rate": 9.73832499129234e-07, "loss": 0.2978, "step": 705 }, { "epoch": 0.3558467741935484, "grad_norm": 0.2755306914200271, "learning_rate": 9.737466891996035e-07, "loss": 0.2761, "step": 706 }, { "epoch": 0.3563508064516129, "grad_norm": 0.23830584140125033, "learning_rate": 9.736607425969878e-07, "loss": 0.2817, "step": 707 }, { "epoch": 0.35685483870967744, "grad_norm": 0.5272610922884131, "learning_rate": 9.73574659346182e-07, "loss": 0.3099, "step": 708 }, { "epoch": 0.35735887096774194, "grad_norm": 0.4888851365749825, "learning_rate": 9.734884394720207e-07, "loss": 0.3035, "step": 709 }, { "epoch": 0.35786290322580644, "grad_norm": 0.19529005617043382, "learning_rate": 9.734020829993778e-07, "loss": 0.288, "step": 710 }, { "epoch": 0.35836693548387094, "grad_norm": 0.3168784896287719, "learning_rate": 9.733155899531661e-07, "loss": 0.2747, "step": 711 }, { "epoch": 0.3588709677419355, "grad_norm": 0.20038440381297584, "learning_rate": 9.73228960358339e-07, "loss": 0.2855, "step": 712 }, { "epoch": 0.359375, "grad_norm": 0.2637581920266508, "learning_rate": 9.731421942398882e-07, "loss": 0.3007, "step": 713 }, { "epoch": 0.3598790322580645, "grad_norm": 0.677620290574485, "learning_rate": 9.730552916228455e-07, "loss": 0.31, "step": 714 }, { "epoch": 0.36038306451612906, "grad_norm": 0.22952816515508798, "learning_rate": 9.729682525322817e-07, "loss": 0.3066, "step": 715 }, { "epoch": 0.36088709677419356, "grad_norm": 0.1917187197394172, "learning_rate": 9.728810769933066e-07, "loss": 0.3017, "step": 716 }, { "epoch": 0.36139112903225806, "grad_norm": 0.20951491712451265, "learning_rate": 9.727937650310704e-07, "loss": 0.2973, "step": 717 }, { "epoch": 0.36189516129032256, "grad_norm": 0.3141631511750949, "learning_rate": 9.727063166707619e-07, "loss": 0.308, "step": 718 }, { "epoch": 0.3623991935483871, "grad_norm": 0.18721524079592824, "learning_rate": 9.726187319376088e-07, "loss": 0.2927, "step": 719 }, { "epoch": 0.3629032258064516, "grad_norm": 0.28391314645074966, "learning_rate": 9.725310108568795e-07, "loss": 0.2949, "step": 720 }, { "epoch": 0.3634072580645161, "grad_norm": 0.17932969621501238, "learning_rate": 9.724431534538809e-07, "loss": 0.294, "step": 721 }, { "epoch": 0.3639112903225806, "grad_norm": 0.21062063293701141, "learning_rate": 9.723551597539591e-07, "loss": 0.2836, "step": 722 }, { "epoch": 0.3644153225806452, "grad_norm": 0.18043765044502827, "learning_rate": 9.722670297824998e-07, "loss": 0.3093, "step": 723 }, { "epoch": 0.3649193548387097, "grad_norm": 0.29050763149878694, "learning_rate": 9.72178763564928e-07, "loss": 0.2907, "step": 724 }, { "epoch": 0.3654233870967742, "grad_norm": 0.18845371146424408, "learning_rate": 9.720903611267077e-07, "loss": 0.2986, "step": 725 }, { "epoch": 0.3659274193548387, "grad_norm": 0.18828740365731594, "learning_rate": 9.720018224933427e-07, "loss": 0.2872, "step": 726 }, { "epoch": 0.36643145161290325, "grad_norm": 0.19190513878279883, "learning_rate": 9.71913147690376e-07, "loss": 0.3062, "step": 727 }, { "epoch": 0.36693548387096775, "grad_norm": 0.19814485234368132, "learning_rate": 9.718243367433893e-07, "loss": 0.2929, "step": 728 }, { "epoch": 0.36743951612903225, "grad_norm": 0.43417746968071463, "learning_rate": 9.717353896780042e-07, "loss": 0.2861, "step": 729 }, { "epoch": 0.36794354838709675, "grad_norm": 0.2278557334184222, "learning_rate": 9.716463065198817e-07, "loss": 0.3085, "step": 730 }, { "epoch": 0.3684475806451613, "grad_norm": 0.17793722781989707, "learning_rate": 9.715570872947213e-07, "loss": 0.2919, "step": 731 }, { "epoch": 0.3689516129032258, "grad_norm": 0.20613998379871223, "learning_rate": 9.714677320282623e-07, "loss": 0.3027, "step": 732 }, { "epoch": 0.3694556451612903, "grad_norm": 0.1966436661511563, "learning_rate": 9.713782407462834e-07, "loss": 0.2741, "step": 733 }, { "epoch": 0.3699596774193548, "grad_norm": 0.28922865753235927, "learning_rate": 9.712886134746019e-07, "loss": 0.2858, "step": 734 }, { "epoch": 0.3704637096774194, "grad_norm": 0.2702305708992764, "learning_rate": 9.71198850239075e-07, "loss": 0.297, "step": 735 }, { "epoch": 0.3709677419354839, "grad_norm": 0.194378574505089, "learning_rate": 9.711089510655985e-07, "loss": 0.3142, "step": 736 }, { "epoch": 0.3714717741935484, "grad_norm": 0.18465691225415032, "learning_rate": 9.710189159801084e-07, "loss": 0.2972, "step": 737 }, { "epoch": 0.3719758064516129, "grad_norm": 0.19730530115872685, "learning_rate": 9.709287450085786e-07, "loss": 0.2954, "step": 738 }, { "epoch": 0.37247983870967744, "grad_norm": 0.2189962450750751, "learning_rate": 9.70838438177023e-07, "loss": 0.2922, "step": 739 }, { "epoch": 0.37298387096774194, "grad_norm": 0.4433211186238402, "learning_rate": 9.707479955114948e-07, "loss": 0.3027, "step": 740 }, { "epoch": 0.37348790322580644, "grad_norm": 0.24228833774809522, "learning_rate": 9.70657417038086e-07, "loss": 0.2983, "step": 741 }, { "epoch": 0.37399193548387094, "grad_norm": 0.18660389324918686, "learning_rate": 9.70566702782928e-07, "loss": 0.3081, "step": 742 }, { "epoch": 0.3744959677419355, "grad_norm": 0.29156052417574047, "learning_rate": 9.704758527721912e-07, "loss": 0.2827, "step": 743 }, { "epoch": 0.375, "grad_norm": 0.30556626157560013, "learning_rate": 9.703848670320855e-07, "loss": 0.3043, "step": 744 }, { "epoch": 0.3755040322580645, "grad_norm": 0.28242712763277444, "learning_rate": 9.702937455888593e-07, "loss": 0.2971, "step": 745 }, { "epoch": 0.37600806451612906, "grad_norm": 0.18221657954549308, "learning_rate": 9.70202488468801e-07, "loss": 0.2947, "step": 746 }, { "epoch": 0.37651209677419356, "grad_norm": 0.17989299690744823, "learning_rate": 9.701110956982374e-07, "loss": 0.2924, "step": 747 }, { "epoch": 0.37701612903225806, "grad_norm": 0.18220343446730708, "learning_rate": 9.700195673035349e-07, "loss": 0.2885, "step": 748 }, { "epoch": 0.37752016129032256, "grad_norm": 0.22346642161187105, "learning_rate": 9.699279033110988e-07, "loss": 0.3244, "step": 749 }, { "epoch": 0.3780241935483871, "grad_norm": 0.19169547017843008, "learning_rate": 9.698361037473738e-07, "loss": 0.2961, "step": 750 }, { "epoch": 0.3785282258064516, "grad_norm": 0.22065738901487775, "learning_rate": 9.697441686388432e-07, "loss": 0.3068, "step": 751 }, { "epoch": 0.3790322580645161, "grad_norm": 0.19267369377139024, "learning_rate": 9.696520980120299e-07, "loss": 0.2898, "step": 752 }, { "epoch": 0.3795362903225806, "grad_norm": 0.19447034655516351, "learning_rate": 9.695598918934958e-07, "loss": 0.2938, "step": 753 }, { "epoch": 0.3800403225806452, "grad_norm": 0.17575932802696953, "learning_rate": 9.694675503098415e-07, "loss": 0.299, "step": 754 }, { "epoch": 0.3805443548387097, "grad_norm": 0.27717817200263767, "learning_rate": 9.693750732877071e-07, "loss": 0.2902, "step": 755 }, { "epoch": 0.3810483870967742, "grad_norm": 0.18017521043674514, "learning_rate": 9.692824608537718e-07, "loss": 0.2934, "step": 756 }, { "epoch": 0.3815524193548387, "grad_norm": 0.18161011916708894, "learning_rate": 9.691897130347536e-07, "loss": 0.2958, "step": 757 }, { "epoch": 0.38205645161290325, "grad_norm": 0.20730764541104563, "learning_rate": 9.690968298574095e-07, "loss": 0.2908, "step": 758 }, { "epoch": 0.38256048387096775, "grad_norm": 0.2200352742107318, "learning_rate": 9.69003811348536e-07, "loss": 0.3019, "step": 759 }, { "epoch": 0.38306451612903225, "grad_norm": 0.19253930185176787, "learning_rate": 9.689106575349682e-07, "loss": 0.2864, "step": 760 }, { "epoch": 0.38356854838709675, "grad_norm": 0.21888290789299836, "learning_rate": 9.688173684435806e-07, "loss": 0.313, "step": 761 }, { "epoch": 0.3840725806451613, "grad_norm": 0.2275047774015037, "learning_rate": 9.687239441012863e-07, "loss": 0.2927, "step": 762 }, { "epoch": 0.3845766129032258, "grad_norm": 0.17433382207800896, "learning_rate": 9.686303845350377e-07, "loss": 0.2934, "step": 763 }, { "epoch": 0.3850806451612903, "grad_norm": 0.1910468291040477, "learning_rate": 9.68536689771826e-07, "loss": 0.2892, "step": 764 }, { "epoch": 0.3855846774193548, "grad_norm": 0.2073819671598411, "learning_rate": 9.68442859838682e-07, "loss": 0.2874, "step": 765 }, { "epoch": 0.3860887096774194, "grad_norm": 0.18337565075052092, "learning_rate": 9.683488947626746e-07, "loss": 0.2986, "step": 766 }, { "epoch": 0.3865927419354839, "grad_norm": 0.19262381834997364, "learning_rate": 9.682547945709125e-07, "loss": 0.2954, "step": 767 }, { "epoch": 0.3870967741935484, "grad_norm": 0.19206670831950323, "learning_rate": 9.681605592905425e-07, "loss": 0.2796, "step": 768 }, { "epoch": 0.3876008064516129, "grad_norm": 0.18021565531041417, "learning_rate": 9.680661889487517e-07, "loss": 0.2974, "step": 769 }, { "epoch": 0.38810483870967744, "grad_norm": 0.20648881640932878, "learning_rate": 9.679716835727647e-07, "loss": 0.304, "step": 770 }, { "epoch": 0.38860887096774194, "grad_norm": 0.21633577449914515, "learning_rate": 9.67877043189846e-07, "loss": 0.2815, "step": 771 }, { "epoch": 0.38911290322580644, "grad_norm": 0.19027954862930285, "learning_rate": 9.677822678272986e-07, "loss": 0.2775, "step": 772 }, { "epoch": 0.38961693548387094, "grad_norm": 0.1853233674997291, "learning_rate": 9.676873575124647e-07, "loss": 0.2736, "step": 773 }, { "epoch": 0.3901209677419355, "grad_norm": 0.28029424195764874, "learning_rate": 9.675923122727253e-07, "loss": 0.2919, "step": 774 }, { "epoch": 0.390625, "grad_norm": 0.2271137389969601, "learning_rate": 9.674971321355003e-07, "loss": 0.2884, "step": 775 }, { "epoch": 0.3911290322580645, "grad_norm": 0.2482514592600029, "learning_rate": 9.67401817128249e-07, "loss": 0.2947, "step": 776 }, { "epoch": 0.39163306451612906, "grad_norm": 0.29951392676665967, "learning_rate": 9.673063672784684e-07, "loss": 0.289, "step": 777 }, { "epoch": 0.39213709677419356, "grad_norm": 0.17456103242486867, "learning_rate": 9.67210782613696e-07, "loss": 0.2995, "step": 778 }, { "epoch": 0.39264112903225806, "grad_norm": 0.6967952430229244, "learning_rate": 9.67115063161507e-07, "loss": 0.3022, "step": 779 }, { "epoch": 0.39314516129032256, "grad_norm": 0.29680448703729057, "learning_rate": 9.67019208949516e-07, "loss": 0.2862, "step": 780 }, { "epoch": 0.3936491935483871, "grad_norm": 0.177999847832676, "learning_rate": 9.669232200053759e-07, "loss": 0.2962, "step": 781 }, { "epoch": 0.3941532258064516, "grad_norm": 0.27026664375469966, "learning_rate": 9.668270963567794e-07, "loss": 0.2986, "step": 782 }, { "epoch": 0.3946572580645161, "grad_norm": 0.19723594083135135, "learning_rate": 9.667308380314576e-07, "loss": 0.3063, "step": 783 }, { "epoch": 0.3951612903225806, "grad_norm": 0.21862308310307982, "learning_rate": 9.666344450571801e-07, "loss": 0.2964, "step": 784 }, { "epoch": 0.3956653225806452, "grad_norm": 0.1758663671801379, "learning_rate": 9.665379174617558e-07, "loss": 0.2914, "step": 785 }, { "epoch": 0.3961693548387097, "grad_norm": 0.32096497636941274, "learning_rate": 9.664412552730326e-07, "loss": 0.2897, "step": 786 }, { "epoch": 0.3966733870967742, "grad_norm": 0.17374107415020215, "learning_rate": 9.663444585188965e-07, "loss": 0.2867, "step": 787 }, { "epoch": 0.3971774193548387, "grad_norm": 0.18444275375398292, "learning_rate": 9.66247527227273e-07, "loss": 0.2961, "step": 788 }, { "epoch": 0.39768145161290325, "grad_norm": 0.19643289496174723, "learning_rate": 9.661504614261261e-07, "loss": 0.3125, "step": 789 }, { "epoch": 0.39818548387096775, "grad_norm": 0.2167645010446501, "learning_rate": 9.660532611434591e-07, "loss": 0.2921, "step": 790 }, { "epoch": 0.39868951612903225, "grad_norm": 0.1806328023049352, "learning_rate": 9.659559264073129e-07, "loss": 0.2846, "step": 791 }, { "epoch": 0.39919354838709675, "grad_norm": 0.2612388559577726, "learning_rate": 9.658584572457686e-07, "loss": 0.2945, "step": 792 }, { "epoch": 0.3996975806451613, "grad_norm": 0.2793761291173004, "learning_rate": 9.657608536869451e-07, "loss": 0.2837, "step": 793 }, { "epoch": 0.4002016129032258, "grad_norm": 0.18310300206951588, "learning_rate": 9.656631157590004e-07, "loss": 0.2978, "step": 794 }, { "epoch": 0.4007056451612903, "grad_norm": 0.18408805840331044, "learning_rate": 9.655652434901317e-07, "loss": 0.3079, "step": 795 }, { "epoch": 0.4012096774193548, "grad_norm": 0.2342952244459477, "learning_rate": 9.654672369085742e-07, "loss": 0.2834, "step": 796 }, { "epoch": 0.4012096774193548, "eval_loss": 0.3250449001789093, "eval_runtime": 18.239, "eval_samples_per_second": 46.878, "eval_steps_per_second": 0.987, "step": 796 }, { "epoch": 0.4017137096774194, "grad_norm": 0.6951664973155232, "learning_rate": 9.653690960426024e-07, "loss": 0.2949, "step": 797 }, { "epoch": 0.4022177419354839, "grad_norm": 0.3007352567348458, "learning_rate": 9.652708209205289e-07, "loss": 0.308, "step": 798 }, { "epoch": 0.4027217741935484, "grad_norm": 0.2111912313466037, "learning_rate": 9.651724115707059e-07, "loss": 0.2868, "step": 799 }, { "epoch": 0.4032258064516129, "grad_norm": 0.25518959182252676, "learning_rate": 9.650738680215237e-07, "loss": 0.2926, "step": 800 }, { "epoch": 0.40372983870967744, "grad_norm": 0.18835961947528512, "learning_rate": 9.649751903014117e-07, "loss": 0.2894, "step": 801 }, { "epoch": 0.40423387096774194, "grad_norm": 0.17984205410768345, "learning_rate": 9.648763784388375e-07, "loss": 0.2857, "step": 802 }, { "epoch": 0.40473790322580644, "grad_norm": 0.19376522082181055, "learning_rate": 9.647774324623082e-07, "loss": 0.2956, "step": 803 }, { "epoch": 0.40524193548387094, "grad_norm": 0.2473519213828851, "learning_rate": 9.646783524003684e-07, "loss": 0.2816, "step": 804 }, { "epoch": 0.4057459677419355, "grad_norm": 0.4135144683121957, "learning_rate": 9.645791382816026e-07, "loss": 0.2794, "step": 805 }, { "epoch": 0.40625, "grad_norm": 0.22178842307704266, "learning_rate": 9.644797901346333e-07, "loss": 0.2859, "step": 806 }, { "epoch": 0.4067540322580645, "grad_norm": 0.1865025598672438, "learning_rate": 9.64380307988122e-07, "loss": 0.2826, "step": 807 }, { "epoch": 0.40725806451612906, "grad_norm": 0.21780748939537628, "learning_rate": 9.642806918707685e-07, "loss": 0.3003, "step": 808 }, { "epoch": 0.40776209677419356, "grad_norm": 0.23524333096509437, "learning_rate": 9.641809418113113e-07, "loss": 0.2731, "step": 809 }, { "epoch": 0.40826612903225806, "grad_norm": 0.3522546815652214, "learning_rate": 9.64081057838528e-07, "loss": 0.3006, "step": 810 }, { "epoch": 0.40877016129032256, "grad_norm": 0.2060231832092746, "learning_rate": 9.63981039981234e-07, "loss": 0.2889, "step": 811 }, { "epoch": 0.4092741935483871, "grad_norm": 0.17805134176779017, "learning_rate": 9.638808882682845e-07, "loss": 0.28, "step": 812 }, { "epoch": 0.4097782258064516, "grad_norm": 0.21338586685342792, "learning_rate": 9.637806027285721e-07, "loss": 0.3266, "step": 813 }, { "epoch": 0.4102822580645161, "grad_norm": 0.23761874762782823, "learning_rate": 9.636801833910291e-07, "loss": 0.2928, "step": 814 }, { "epoch": 0.4107862903225806, "grad_norm": 0.349479056634183, "learning_rate": 9.635796302846253e-07, "loss": 0.2893, "step": 815 }, { "epoch": 0.4112903225806452, "grad_norm": 0.1793918088824337, "learning_rate": 9.6347894343837e-07, "loss": 0.3022, "step": 816 }, { "epoch": 0.4117943548387097, "grad_norm": 0.20651527805989847, "learning_rate": 9.633781228813107e-07, "loss": 0.289, "step": 817 }, { "epoch": 0.4122983870967742, "grad_norm": 0.2120940285959409, "learning_rate": 9.63277168642533e-07, "loss": 0.2796, "step": 818 }, { "epoch": 0.4128024193548387, "grad_norm": 0.18644947273341353, "learning_rate": 9.631760807511624e-07, "loss": 0.2823, "step": 819 }, { "epoch": 0.41330645161290325, "grad_norm": 0.2559146778083527, "learning_rate": 9.630748592363617e-07, "loss": 0.3102, "step": 820 }, { "epoch": 0.41381048387096775, "grad_norm": 0.20144278166717078, "learning_rate": 9.629735041273325e-07, "loss": 0.2949, "step": 821 }, { "epoch": 0.41431451612903225, "grad_norm": 0.22358896888220894, "learning_rate": 9.628720154533157e-07, "loss": 0.2809, "step": 822 }, { "epoch": 0.41481854838709675, "grad_norm": 0.2056875307546662, "learning_rate": 9.627703932435895e-07, "loss": 0.2838, "step": 823 }, { "epoch": 0.4153225806451613, "grad_norm": 0.199611316920199, "learning_rate": 9.626686375274715e-07, "loss": 0.2731, "step": 824 }, { "epoch": 0.4158266129032258, "grad_norm": 0.17872361611126808, "learning_rate": 9.625667483343177e-07, "loss": 0.2885, "step": 825 }, { "epoch": 0.4163306451612903, "grad_norm": 0.1715080401060158, "learning_rate": 9.624647256935226e-07, "loss": 0.2918, "step": 826 }, { "epoch": 0.4168346774193548, "grad_norm": 0.28942913404328185, "learning_rate": 9.623625696345187e-07, "loss": 0.2909, "step": 827 }, { "epoch": 0.4173387096774194, "grad_norm": 0.5219653514726549, "learning_rate": 9.62260280186778e-07, "loss": 0.2917, "step": 828 }, { "epoch": 0.4178427419354839, "grad_norm": 0.1803172885571653, "learning_rate": 9.621578573798098e-07, "loss": 0.29, "step": 829 }, { "epoch": 0.4183467741935484, "grad_norm": 0.26437521269828795, "learning_rate": 9.620553012431626e-07, "loss": 0.2962, "step": 830 }, { "epoch": 0.4188508064516129, "grad_norm": 0.22357063643965017, "learning_rate": 9.619526118064234e-07, "loss": 0.2841, "step": 831 }, { "epoch": 0.41935483870967744, "grad_norm": 0.27227090245290375, "learning_rate": 9.618497890992171e-07, "loss": 0.2791, "step": 832 }, { "epoch": 0.41985887096774194, "grad_norm": 0.18400884656891223, "learning_rate": 9.61746833151208e-07, "loss": 0.2922, "step": 833 }, { "epoch": 0.42036290322580644, "grad_norm": 0.17737272960072972, "learning_rate": 9.616437439920977e-07, "loss": 0.2941, "step": 834 }, { "epoch": 0.42086693548387094, "grad_norm": 0.1845994861654205, "learning_rate": 9.61540521651627e-07, "loss": 0.2993, "step": 835 }, { "epoch": 0.4213709677419355, "grad_norm": 0.184243652976245, "learning_rate": 9.61437166159575e-07, "loss": 0.2954, "step": 836 }, { "epoch": 0.421875, "grad_norm": 0.1962394295882692, "learning_rate": 9.61333677545759e-07, "loss": 0.2797, "step": 837 }, { "epoch": 0.4223790322580645, "grad_norm": 0.19907302059194398, "learning_rate": 9.612300558400348e-07, "loss": 0.2913, "step": 838 }, { "epoch": 0.42288306451612906, "grad_norm": 0.17791364001664575, "learning_rate": 9.611263010722968e-07, "loss": 0.308, "step": 839 }, { "epoch": 0.42338709677419356, "grad_norm": 0.20244623183338803, "learning_rate": 9.610224132724772e-07, "loss": 0.2984, "step": 840 }, { "epoch": 0.42389112903225806, "grad_norm": 0.1821553749592407, "learning_rate": 9.609183924705473e-07, "loss": 0.276, "step": 841 }, { "epoch": 0.42439516129032256, "grad_norm": 0.18302758905749872, "learning_rate": 9.608142386965166e-07, "loss": 0.2886, "step": 842 }, { "epoch": 0.4248991935483871, "grad_norm": 0.2814755691064198, "learning_rate": 9.607099519804325e-07, "loss": 0.2996, "step": 843 }, { "epoch": 0.4254032258064516, "grad_norm": 0.2188163570039596, "learning_rate": 9.60605532352381e-07, "loss": 0.2849, "step": 844 }, { "epoch": 0.4259072580645161, "grad_norm": 0.19063373332278225, "learning_rate": 9.605009798424871e-07, "loss": 0.2826, "step": 845 }, { "epoch": 0.4264112903225806, "grad_norm": 0.22803495479430694, "learning_rate": 9.60396294480913e-07, "loss": 0.2881, "step": 846 }, { "epoch": 0.4269153225806452, "grad_norm": 0.2532580997944776, "learning_rate": 9.6029147629786e-07, "loss": 0.2931, "step": 847 }, { "epoch": 0.4274193548387097, "grad_norm": 0.29895228840340143, "learning_rate": 9.601865253235673e-07, "loss": 0.2851, "step": 848 }, { "epoch": 0.4279233870967742, "grad_norm": 0.19966385962690142, "learning_rate": 9.60081441588313e-07, "loss": 0.2977, "step": 849 }, { "epoch": 0.4284274193548387, "grad_norm": 0.3898444466529591, "learning_rate": 9.599762251224125e-07, "loss": 0.3104, "step": 850 }, { "epoch": 0.42893145161290325, "grad_norm": 0.24259677387671844, "learning_rate": 9.598708759562208e-07, "loss": 0.2987, "step": 851 }, { "epoch": 0.42943548387096775, "grad_norm": 0.20006735807640816, "learning_rate": 9.5976539412013e-07, "loss": 0.2864, "step": 852 }, { "epoch": 0.42993951612903225, "grad_norm": 0.2882245398429841, "learning_rate": 9.59659779644571e-07, "loss": 0.2858, "step": 853 }, { "epoch": 0.43044354838709675, "grad_norm": 0.22513281216087977, "learning_rate": 9.59554032560013e-07, "loss": 0.2824, "step": 854 }, { "epoch": 0.4309475806451613, "grad_norm": 0.26571888341504696, "learning_rate": 9.594481528969635e-07, "loss": 0.3086, "step": 855 }, { "epoch": 0.4314516129032258, "grad_norm": 0.19249797160463983, "learning_rate": 9.59342140685968e-07, "loss": 0.2889, "step": 856 }, { "epoch": 0.4319556451612903, "grad_norm": 0.17883290237014954, "learning_rate": 9.592359959576104e-07, "loss": 0.2913, "step": 857 }, { "epoch": 0.4324596774193548, "grad_norm": 0.38963242443484597, "learning_rate": 9.591297187425128e-07, "loss": 0.2905, "step": 858 }, { "epoch": 0.4329637096774194, "grad_norm": 0.4499390873499669, "learning_rate": 9.590233090713354e-07, "loss": 0.3016, "step": 859 }, { "epoch": 0.4334677419354839, "grad_norm": 0.23147026645054392, "learning_rate": 9.58916766974777e-07, "loss": 0.2753, "step": 860 }, { "epoch": 0.4339717741935484, "grad_norm": 0.23004776559869194, "learning_rate": 9.58810092483574e-07, "loss": 0.2857, "step": 861 }, { "epoch": 0.4344758064516129, "grad_norm": 0.38524648235237546, "learning_rate": 9.587032856285016e-07, "loss": 0.295, "step": 862 }, { "epoch": 0.43497983870967744, "grad_norm": 0.26978296400638296, "learning_rate": 9.585963464403727e-07, "loss": 0.2924, "step": 863 }, { "epoch": 0.43548387096774194, "grad_norm": 0.2304216233656417, "learning_rate": 9.584892749500388e-07, "loss": 0.2782, "step": 864 }, { "epoch": 0.43598790322580644, "grad_norm": 0.20628041373927403, "learning_rate": 9.58382071188389e-07, "loss": 0.2938, "step": 865 }, { "epoch": 0.43649193548387094, "grad_norm": 0.24806721102476476, "learning_rate": 9.582747351863518e-07, "loss": 0.3051, "step": 866 }, { "epoch": 0.4369959677419355, "grad_norm": 0.18289677229578133, "learning_rate": 9.58167266974892e-07, "loss": 0.2928, "step": 867 }, { "epoch": 0.4375, "grad_norm": 0.20539381909701737, "learning_rate": 9.580596665850139e-07, "loss": 0.2873, "step": 868 }, { "epoch": 0.4380040322580645, "grad_norm": 0.18821302140885868, "learning_rate": 9.579519340477592e-07, "loss": 0.2903, "step": 869 }, { "epoch": 0.43850806451612906, "grad_norm": 0.37407049401588166, "learning_rate": 9.57844069394209e-07, "loss": 0.2931, "step": 870 }, { "epoch": 0.43901209677419356, "grad_norm": 0.2339579704921313, "learning_rate": 9.577360726554804e-07, "loss": 0.2847, "step": 871 }, { "epoch": 0.43951612903225806, "grad_norm": 0.23524427674559492, "learning_rate": 9.576279438627308e-07, "loss": 0.2754, "step": 872 }, { "epoch": 0.44002016129032256, "grad_norm": 0.18413711797969656, "learning_rate": 9.57519683047154e-07, "loss": 0.2893, "step": 873 }, { "epoch": 0.4405241935483871, "grad_norm": 0.22031589128909412, "learning_rate": 9.574112902399829e-07, "loss": 0.2884, "step": 874 }, { "epoch": 0.4410282258064516, "grad_norm": 0.22755059592326796, "learning_rate": 9.573027654724882e-07, "loss": 0.29, "step": 875 }, { "epoch": 0.4415322580645161, "grad_norm": 0.18373793370970248, "learning_rate": 9.571941087759782e-07, "loss": 0.3064, "step": 876 }, { "epoch": 0.4420362903225806, "grad_norm": 0.19409387364473213, "learning_rate": 9.570853201818002e-07, "loss": 0.2922, "step": 877 }, { "epoch": 0.4425403225806452, "grad_norm": 0.21129036924339822, "learning_rate": 9.569763997213387e-07, "loss": 0.2714, "step": 878 }, { "epoch": 0.4430443548387097, "grad_norm": 0.17440531697354897, "learning_rate": 9.568673474260168e-07, "loss": 0.2847, "step": 879 }, { "epoch": 0.4435483870967742, "grad_norm": 0.18668182286896606, "learning_rate": 9.567581633272955e-07, "loss": 0.3118, "step": 880 }, { "epoch": 0.4440524193548387, "grad_norm": 0.1797795667764904, "learning_rate": 9.56648847456673e-07, "loss": 0.3019, "step": 881 }, { "epoch": 0.44455645161290325, "grad_norm": 0.3707822100666679, "learning_rate": 9.565393998456874e-07, "loss": 0.2912, "step": 882 }, { "epoch": 0.44506048387096775, "grad_norm": 0.289665390914343, "learning_rate": 9.564298205259126e-07, "loss": 0.2895, "step": 883 }, { "epoch": 0.44556451612903225, "grad_norm": 0.2056754789971548, "learning_rate": 9.563201095289624e-07, "loss": 0.298, "step": 884 }, { "epoch": 0.44606854838709675, "grad_norm": 0.1814615855914537, "learning_rate": 9.562102668864871e-07, "loss": 0.2934, "step": 885 }, { "epoch": 0.4465725806451613, "grad_norm": 0.37487748973707236, "learning_rate": 9.56100292630176e-07, "loss": 0.2973, "step": 886 }, { "epoch": 0.4470766129032258, "grad_norm": 0.1759000528091719, "learning_rate": 9.559901867917556e-07, "loss": 0.3039, "step": 887 }, { "epoch": 0.4475806451612903, "grad_norm": 0.32191288919848915, "learning_rate": 9.558799494029914e-07, "loss": 0.2893, "step": 888 }, { "epoch": 0.4480846774193548, "grad_norm": 0.19810987185465256, "learning_rate": 9.557695804956856e-07, "loss": 0.3127, "step": 889 }, { "epoch": 0.4485887096774194, "grad_norm": 0.24720381123354968, "learning_rate": 9.556590801016793e-07, "loss": 0.2918, "step": 890 }, { "epoch": 0.4490927419354839, "grad_norm": 0.18168928920802488, "learning_rate": 9.555484482528508e-07, "loss": 0.2881, "step": 891 }, { "epoch": 0.4495967741935484, "grad_norm": 0.18687548855169936, "learning_rate": 9.554376849811173e-07, "loss": 0.2905, "step": 892 }, { "epoch": 0.4501008064516129, "grad_norm": 0.20251482097557522, "learning_rate": 9.553267903184327e-07, "loss": 0.2939, "step": 893 }, { "epoch": 0.45060483870967744, "grad_norm": 0.2887402542876595, "learning_rate": 9.552157642967897e-07, "loss": 0.2878, "step": 894 }, { "epoch": 0.45110887096774194, "grad_norm": 0.19945034327270725, "learning_rate": 9.551046069482186e-07, "loss": 0.2845, "step": 895 }, { "epoch": 0.45161290322580644, "grad_norm": 0.23431424692187636, "learning_rate": 9.549933183047877e-07, "loss": 0.2917, "step": 896 }, { "epoch": 0.45211693548387094, "grad_norm": 0.2190333169890677, "learning_rate": 9.54881898398603e-07, "loss": 0.2952, "step": 897 }, { "epoch": 0.4526209677419355, "grad_norm": 0.17822949705219454, "learning_rate": 9.547703472618086e-07, "loss": 0.299, "step": 898 }, { "epoch": 0.453125, "grad_norm": 0.2102549610365594, "learning_rate": 9.54658664926586e-07, "loss": 0.2935, "step": 899 }, { "epoch": 0.4536290322580645, "grad_norm": 0.19289696781722984, "learning_rate": 9.545468514251552e-07, "loss": 0.2998, "step": 900 }, { "epoch": 0.45413306451612906, "grad_norm": 0.3042982461642612, "learning_rate": 9.544349067897734e-07, "loss": 0.2937, "step": 901 }, { "epoch": 0.45463709677419356, "grad_norm": 0.19107058828557277, "learning_rate": 9.54322831052736e-07, "loss": 0.2765, "step": 902 }, { "epoch": 0.45514112903225806, "grad_norm": 0.28747582105179686, "learning_rate": 9.542106242463764e-07, "loss": 0.2833, "step": 903 }, { "epoch": 0.45564516129032256, "grad_norm": 0.18458129956622982, "learning_rate": 9.540982864030653e-07, "loss": 0.2831, "step": 904 }, { "epoch": 0.4561491935483871, "grad_norm": 0.2130309759800133, "learning_rate": 9.539858175552115e-07, "loss": 0.2836, "step": 905 }, { "epoch": 0.4566532258064516, "grad_norm": 0.3057883689836788, "learning_rate": 9.538732177352617e-07, "loss": 0.3007, "step": 906 }, { "epoch": 0.4571572580645161, "grad_norm": 0.31124910837676767, "learning_rate": 9.537604869757001e-07, "loss": 0.2764, "step": 907 }, { "epoch": 0.4576612903225806, "grad_norm": 0.2441004343752962, "learning_rate": 9.53647625309049e-07, "loss": 0.2868, "step": 908 }, { "epoch": 0.4581653225806452, "grad_norm": 0.2357500626232433, "learning_rate": 9.535346327678682e-07, "loss": 0.3016, "step": 909 }, { "epoch": 0.4586693548387097, "grad_norm": 0.20066160584984583, "learning_rate": 9.534215093847552e-07, "loss": 0.3001, "step": 910 }, { "epoch": 0.4591733870967742, "grad_norm": 0.22773495544084632, "learning_rate": 9.533082551923458e-07, "loss": 0.2756, "step": 911 }, { "epoch": 0.4596774193548387, "grad_norm": 0.1906199840165581, "learning_rate": 9.531948702233126e-07, "loss": 0.2869, "step": 912 }, { "epoch": 0.46018145161290325, "grad_norm": 0.3245639131414002, "learning_rate": 9.530813545103667e-07, "loss": 0.2708, "step": 913 }, { "epoch": 0.46068548387096775, "grad_norm": 0.21271580471854526, "learning_rate": 9.52967708086257e-07, "loss": 0.2892, "step": 914 }, { "epoch": 0.46118951612903225, "grad_norm": 0.2104148343959223, "learning_rate": 9.528539309837693e-07, "loss": 0.2877, "step": 915 }, { "epoch": 0.46169354838709675, "grad_norm": 0.19824185563673147, "learning_rate": 9.527400232357279e-07, "loss": 0.276, "step": 916 }, { "epoch": 0.4621975806451613, "grad_norm": 0.21951663687873915, "learning_rate": 9.526259848749943e-07, "loss": 0.2859, "step": 917 }, { "epoch": 0.4627016129032258, "grad_norm": 0.37545971476160844, "learning_rate": 9.52511815934468e-07, "loss": 0.2774, "step": 918 }, { "epoch": 0.4632056451612903, "grad_norm": 0.22438052196032507, "learning_rate": 9.523975164470859e-07, "loss": 0.2978, "step": 919 }, { "epoch": 0.4637096774193548, "grad_norm": 0.1757823732376565, "learning_rate": 9.522830864458227e-07, "loss": 0.3022, "step": 920 }, { "epoch": 0.4642137096774194, "grad_norm": 0.27657899716696727, "learning_rate": 9.521685259636909e-07, "loss": 0.2944, "step": 921 }, { "epoch": 0.4647177419354839, "grad_norm": 0.18756079463012706, "learning_rate": 9.520538350337404e-07, "loss": 0.2884, "step": 922 }, { "epoch": 0.4652217741935484, "grad_norm": 0.2062606714488354, "learning_rate": 9.519390136890589e-07, "loss": 0.2778, "step": 923 }, { "epoch": 0.4657258064516129, "grad_norm": 0.366011125828711, "learning_rate": 9.518240619627713e-07, "loss": 0.3001, "step": 924 }, { "epoch": 0.46622983870967744, "grad_norm": 0.2541955304250703, "learning_rate": 9.51708979888041e-07, "loss": 0.2905, "step": 925 }, { "epoch": 0.46673387096774194, "grad_norm": 0.36412234331970916, "learning_rate": 9.51593767498068e-07, "loss": 0.2822, "step": 926 }, { "epoch": 0.46723790322580644, "grad_norm": 0.21139676616548536, "learning_rate": 9.514784248260908e-07, "loss": 0.3045, "step": 927 }, { "epoch": 0.46774193548387094, "grad_norm": 0.19079104145467554, "learning_rate": 9.513629519053845e-07, "loss": 0.2737, "step": 928 }, { "epoch": 0.4682459677419355, "grad_norm": 0.23607892317649437, "learning_rate": 9.512473487692628e-07, "loss": 0.2999, "step": 929 }, { "epoch": 0.46875, "grad_norm": 0.1803840363558066, "learning_rate": 9.511316154510763e-07, "loss": 0.287, "step": 930 }, { "epoch": 0.4692540322580645, "grad_norm": 0.17453949054667584, "learning_rate": 9.510157519842133e-07, "loss": 0.2851, "step": 931 }, { "epoch": 0.46975806451612906, "grad_norm": 0.24449967092054903, "learning_rate": 9.508997584020997e-07, "loss": 0.2872, "step": 932 }, { "epoch": 0.47026209677419356, "grad_norm": 0.1745644546122727, "learning_rate": 9.507836347381992e-07, "loss": 0.3005, "step": 933 }, { "epoch": 0.47076612903225806, "grad_norm": 0.18053174508073816, "learning_rate": 9.506673810260123e-07, "loss": 0.2929, "step": 934 }, { "epoch": 0.47127016129032256, "grad_norm": 0.31029290882211796, "learning_rate": 9.505509972990778e-07, "loss": 0.2926, "step": 935 }, { "epoch": 0.4717741935483871, "grad_norm": 0.22938531150844882, "learning_rate": 9.504344835909716e-07, "loss": 0.2885, "step": 936 }, { "epoch": 0.4722782258064516, "grad_norm": 0.17882862009471923, "learning_rate": 9.503178399353072e-07, "loss": 0.2743, "step": 937 }, { "epoch": 0.4727822580645161, "grad_norm": 0.1854810156795896, "learning_rate": 9.502010663657354e-07, "loss": 0.2955, "step": 938 }, { "epoch": 0.4732862903225806, "grad_norm": 0.21043848944429472, "learning_rate": 9.50084162915945e-07, "loss": 0.2964, "step": 939 }, { "epoch": 0.4737903225806452, "grad_norm": 0.2847988534984117, "learning_rate": 9.499671296196617e-07, "loss": 0.285, "step": 940 }, { "epoch": 0.4742943548387097, "grad_norm": 0.25216723077802655, "learning_rate": 9.498499665106487e-07, "loss": 0.2836, "step": 941 }, { "epoch": 0.4747983870967742, "grad_norm": 0.18397402571778532, "learning_rate": 9.497326736227071e-07, "loss": 0.2937, "step": 942 }, { "epoch": 0.4753024193548387, "grad_norm": 0.28708542295201006, "learning_rate": 9.496152509896753e-07, "loss": 0.2851, "step": 943 }, { "epoch": 0.47580645161290325, "grad_norm": 0.2460818429957608, "learning_rate": 9.494976986454286e-07, "loss": 0.29, "step": 944 }, { "epoch": 0.47631048387096775, "grad_norm": 0.19878718222512176, "learning_rate": 9.493800166238805e-07, "loss": 0.2917, "step": 945 }, { "epoch": 0.47681451612903225, "grad_norm": 0.22398506177876373, "learning_rate": 9.492622049589812e-07, "loss": 0.3093, "step": 946 }, { "epoch": 0.47731854838709675, "grad_norm": 0.19821540893857262, "learning_rate": 9.491442636847189e-07, "loss": 0.2946, "step": 947 }, { "epoch": 0.4778225806451613, "grad_norm": 0.19307685380255402, "learning_rate": 9.490261928351189e-07, "loss": 0.2969, "step": 948 }, { "epoch": 0.4783266129032258, "grad_norm": 0.216166144258633, "learning_rate": 9.489079924442438e-07, "loss": 0.2981, "step": 949 }, { "epoch": 0.4788306451612903, "grad_norm": 0.17612196294528637, "learning_rate": 9.487896625461935e-07, "loss": 0.2907, "step": 950 }, { "epoch": 0.4793346774193548, "grad_norm": 0.2589705775054588, "learning_rate": 9.486712031751058e-07, "loss": 0.3049, "step": 951 }, { "epoch": 0.4798387096774194, "grad_norm": 0.24483548456140278, "learning_rate": 9.485526143651555e-07, "loss": 0.3105, "step": 952 }, { "epoch": 0.4803427419354839, "grad_norm": 0.20974832200044807, "learning_rate": 9.484338961505544e-07, "loss": 0.2935, "step": 953 }, { "epoch": 0.4808467741935484, "grad_norm": 0.17639810431043276, "learning_rate": 9.483150485655523e-07, "loss": 0.2818, "step": 954 }, { "epoch": 0.4813508064516129, "grad_norm": 0.17457844069908712, "learning_rate": 9.481960716444358e-07, "loss": 0.2954, "step": 955 }, { "epoch": 0.48185483870967744, "grad_norm": 0.17714100105123767, "learning_rate": 9.480769654215291e-07, "loss": 0.2811, "step": 956 }, { "epoch": 0.48235887096774194, "grad_norm": 0.1958920141207276, "learning_rate": 9.479577299311934e-07, "loss": 0.2898, "step": 957 }, { "epoch": 0.48286290322580644, "grad_norm": 0.2882352875771043, "learning_rate": 9.478383652078277e-07, "loss": 0.296, "step": 958 }, { "epoch": 0.48336693548387094, "grad_norm": 0.21959333859610755, "learning_rate": 9.477188712858679e-07, "loss": 0.2816, "step": 959 }, { "epoch": 0.4838709677419355, "grad_norm": 0.33566206173811586, "learning_rate": 9.475992481997872e-07, "loss": 0.2957, "step": 960 }, { "epoch": 0.484375, "grad_norm": 0.24126700688554154, "learning_rate": 9.474794959840959e-07, "loss": 0.2913, "step": 961 }, { "epoch": 0.4848790322580645, "grad_norm": 0.25752819972368163, "learning_rate": 9.473596146733422e-07, "loss": 0.2815, "step": 962 }, { "epoch": 0.48538306451612906, "grad_norm": 0.19963355802829852, "learning_rate": 9.472396043021109e-07, "loss": 0.2859, "step": 963 }, { "epoch": 0.48588709677419356, "grad_norm": 0.1726400189184517, "learning_rate": 9.471194649050243e-07, "loss": 0.2932, "step": 964 }, { "epoch": 0.48639112903225806, "grad_norm": 0.1769030343965752, "learning_rate": 9.469991965167416e-07, "loss": 0.3046, "step": 965 }, { "epoch": 0.48689516129032256, "grad_norm": 0.19108815400810478, "learning_rate": 9.4687879917196e-07, "loss": 0.2912, "step": 966 }, { "epoch": 0.4873991935483871, "grad_norm": 0.20802165343766263, "learning_rate": 9.46758272905413e-07, "loss": 0.2895, "step": 967 }, { "epoch": 0.4879032258064516, "grad_norm": 0.21787519659072058, "learning_rate": 9.466376177518718e-07, "loss": 0.287, "step": 968 }, { "epoch": 0.4884072580645161, "grad_norm": 0.1761978641368217, "learning_rate": 9.465168337461447e-07, "loss": 0.2778, "step": 969 }, { "epoch": 0.4889112903225806, "grad_norm": 0.20346855806980782, "learning_rate": 9.46395920923077e-07, "loss": 0.2868, "step": 970 }, { "epoch": 0.4894153225806452, "grad_norm": 0.1746303170409007, "learning_rate": 9.462748793175514e-07, "loss": 0.2853, "step": 971 }, { "epoch": 0.4899193548387097, "grad_norm": 0.27860715367389927, "learning_rate": 9.461537089644876e-07, "loss": 0.2984, "step": 972 }, { "epoch": 0.4904233870967742, "grad_norm": 0.17619075334859385, "learning_rate": 9.460324098988426e-07, "loss": 0.2902, "step": 973 }, { "epoch": 0.4909274193548387, "grad_norm": 0.2092460695723593, "learning_rate": 9.459109821556104e-07, "loss": 0.2893, "step": 974 }, { "epoch": 0.49143145161290325, "grad_norm": 0.19009628066171583, "learning_rate": 9.457894257698221e-07, "loss": 0.289, "step": 975 }, { "epoch": 0.49193548387096775, "grad_norm": 0.19214552662936102, "learning_rate": 9.456677407765459e-07, "loss": 0.2914, "step": 976 }, { "epoch": 0.49243951612903225, "grad_norm": 0.21669299426430066, "learning_rate": 9.455459272108873e-07, "loss": 0.2829, "step": 977 }, { "epoch": 0.49294354838709675, "grad_norm": 0.22291135020577527, "learning_rate": 9.454239851079888e-07, "loss": 0.2927, "step": 978 }, { "epoch": 0.4934475806451613, "grad_norm": 0.1874075382197213, "learning_rate": 9.453019145030296e-07, "loss": 0.2821, "step": 979 }, { "epoch": 0.4939516129032258, "grad_norm": 0.17233245804235334, "learning_rate": 9.451797154312269e-07, "loss": 0.2764, "step": 980 }, { "epoch": 0.4944556451612903, "grad_norm": 0.1834747384575909, "learning_rate": 9.450573879278338e-07, "loss": 0.2735, "step": 981 }, { "epoch": 0.4949596774193548, "grad_norm": 0.17012166060463613, "learning_rate": 9.449349320281414e-07, "loss": 0.2676, "step": 982 }, { "epoch": 0.4954637096774194, "grad_norm": 0.24518957923359821, "learning_rate": 9.448123477674773e-07, "loss": 0.2964, "step": 983 }, { "epoch": 0.4959677419354839, "grad_norm": 0.2037237351416288, "learning_rate": 9.446896351812064e-07, "loss": 0.2989, "step": 984 }, { "epoch": 0.4964717741935484, "grad_norm": 0.17254543605119085, "learning_rate": 9.445667943047303e-07, "loss": 0.2816, "step": 985 }, { "epoch": 0.4969758064516129, "grad_norm": 0.17270938767860752, "learning_rate": 9.444438251734881e-07, "loss": 0.29, "step": 986 }, { "epoch": 0.49747983870967744, "grad_norm": 0.18549793090433508, "learning_rate": 9.443207278229554e-07, "loss": 0.2938, "step": 987 }, { "epoch": 0.49798387096774194, "grad_norm": 0.17378591850071334, "learning_rate": 9.441975022886453e-07, "loss": 0.2863, "step": 988 }, { "epoch": 0.49848790322580644, "grad_norm": 0.2036952349756055, "learning_rate": 9.440741486061075e-07, "loss": 0.2979, "step": 989 }, { "epoch": 0.49899193548387094, "grad_norm": 0.24105084066474844, "learning_rate": 9.439506668109284e-07, "loss": 0.2938, "step": 990 }, { "epoch": 0.4994959677419355, "grad_norm": 0.18506171017034273, "learning_rate": 9.438270569387323e-07, "loss": 0.2985, "step": 991 }, { "epoch": 0.5, "grad_norm": 0.22764502062218736, "learning_rate": 9.437033190251797e-07, "loss": 0.2936, "step": 992 }, { "epoch": 0.5005040322580645, "grad_norm": 0.1987156755550509, "learning_rate": 9.43579453105968e-07, "loss": 0.2941, "step": 993 }, { "epoch": 0.501008064516129, "grad_norm": 0.18473840920584347, "learning_rate": 9.43455459216832e-07, "loss": 0.2951, "step": 994 }, { "epoch": 0.5015120967741935, "grad_norm": 0.18615931801205335, "learning_rate": 9.433313373935429e-07, "loss": 0.2864, "step": 995 }, { "epoch": 0.5015120967741935, "eval_loss": 0.3215525150299072, "eval_runtime": 17.0311, "eval_samples_per_second": 50.202, "eval_steps_per_second": 1.057, "step": 995 }, { "epoch": 0.5020161290322581, "grad_norm": 0.18646346465182403, "learning_rate": 9.432070876719095e-07, "loss": 0.2873, "step": 996 }, { "epoch": 0.5025201612903226, "grad_norm": 0.3058678234050536, "learning_rate": 9.430827100877767e-07, "loss": 0.2898, "step": 997 }, { "epoch": 0.5030241935483871, "grad_norm": 0.20468910289808356, "learning_rate": 9.429582046770268e-07, "loss": 0.2961, "step": 998 }, { "epoch": 0.5035282258064516, "grad_norm": 0.18693796681842775, "learning_rate": 9.428335714755788e-07, "loss": 0.2917, "step": 999 }, { "epoch": 0.5040322580645161, "grad_norm": 0.21831422621342852, "learning_rate": 9.427088105193888e-07, "loss": 0.2921, "step": 1000 }, { "epoch": 0.5045362903225806, "grad_norm": 0.2733584938342229, "learning_rate": 9.425839218444492e-07, "loss": 0.2848, "step": 1001 }, { "epoch": 0.5050403225806451, "grad_norm": 0.22194832047757593, "learning_rate": 9.424589054867899e-07, "loss": 0.2816, "step": 1002 }, { "epoch": 0.5055443548387096, "grad_norm": 0.2631541311468527, "learning_rate": 9.423337614824772e-07, "loss": 0.2985, "step": 1003 }, { "epoch": 0.5060483870967742, "grad_norm": 0.23862558481556265, "learning_rate": 9.422084898676146e-07, "loss": 0.2925, "step": 1004 }, { "epoch": 0.5065524193548387, "grad_norm": 0.18127086379393875, "learning_rate": 9.420830906783418e-07, "loss": 0.2893, "step": 1005 }, { "epoch": 0.5070564516129032, "grad_norm": 0.23817820439070797, "learning_rate": 9.419575639508359e-07, "loss": 0.2912, "step": 1006 }, { "epoch": 0.5075604838709677, "grad_norm": 0.19504523429748843, "learning_rate": 9.418319097213108e-07, "loss": 0.287, "step": 1007 }, { "epoch": 0.5080645161290323, "grad_norm": 0.20304831048415597, "learning_rate": 9.417061280260165e-07, "loss": 0.2804, "step": 1008 }, { "epoch": 0.5085685483870968, "grad_norm": 0.22706657240299058, "learning_rate": 9.415802189012407e-07, "loss": 0.3014, "step": 1009 }, { "epoch": 0.5090725806451613, "grad_norm": 0.1859609106588216, "learning_rate": 9.414541823833071e-07, "loss": 0.2984, "step": 1010 }, { "epoch": 0.5095766129032258, "grad_norm": 0.1734707367992668, "learning_rate": 9.413280185085766e-07, "loss": 0.2717, "step": 1011 }, { "epoch": 0.5100806451612904, "grad_norm": 0.1795148808426381, "learning_rate": 9.412017273134464e-07, "loss": 0.2963, "step": 1012 }, { "epoch": 0.5105846774193549, "grad_norm": 0.17831883735108947, "learning_rate": 9.410753088343513e-07, "loss": 0.2844, "step": 1013 }, { "epoch": 0.5110887096774194, "grad_norm": 0.17970109519050018, "learning_rate": 9.409487631077617e-07, "loss": 0.2961, "step": 1014 }, { "epoch": 0.5115927419354839, "grad_norm": 0.18545886299498057, "learning_rate": 9.408220901701856e-07, "loss": 0.2944, "step": 1015 }, { "epoch": 0.5120967741935484, "grad_norm": 0.2176487874487353, "learning_rate": 9.40695290058167e-07, "loss": 0.2805, "step": 1016 }, { "epoch": 0.5126008064516129, "grad_norm": 0.1848666221951678, "learning_rate": 9.405683628082875e-07, "loss": 0.2923, "step": 1017 }, { "epoch": 0.5131048387096774, "grad_norm": 0.179797631951874, "learning_rate": 9.404413084571643e-07, "loss": 0.2985, "step": 1018 }, { "epoch": 0.5136088709677419, "grad_norm": 0.18358769665622782, "learning_rate": 9.403141270414521e-07, "loss": 0.2935, "step": 1019 }, { "epoch": 0.5141129032258065, "grad_norm": 0.2395742446188053, "learning_rate": 9.401868185978418e-07, "loss": 0.2985, "step": 1020 }, { "epoch": 0.514616935483871, "grad_norm": 0.21187575778017587, "learning_rate": 9.400593831630612e-07, "loss": 0.2934, "step": 1021 }, { "epoch": 0.5151209677419355, "grad_norm": 0.1735450576887722, "learning_rate": 9.399318207738744e-07, "loss": 0.2753, "step": 1022 }, { "epoch": 0.515625, "grad_norm": 0.18638678577349388, "learning_rate": 9.398041314670828e-07, "loss": 0.2787, "step": 1023 }, { "epoch": 0.5161290322580645, "grad_norm": 0.24434453479231824, "learning_rate": 9.396763152795237e-07, "loss": 0.2855, "step": 1024 }, { "epoch": 0.516633064516129, "grad_norm": 0.18013654047589361, "learning_rate": 9.395483722480711e-07, "loss": 0.2946, "step": 1025 }, { "epoch": 0.5171370967741935, "grad_norm": 0.3202201260573074, "learning_rate": 9.394203024096361e-07, "loss": 0.2769, "step": 1026 }, { "epoch": 0.5176411290322581, "grad_norm": 0.17815074975274992, "learning_rate": 9.392921058011658e-07, "loss": 0.2979, "step": 1027 }, { "epoch": 0.5181451612903226, "grad_norm": 0.19221302362826442, "learning_rate": 9.391637824596443e-07, "loss": 0.2955, "step": 1028 }, { "epoch": 0.5186491935483871, "grad_norm": 0.2866346023755273, "learning_rate": 9.390353324220921e-07, "loss": 0.292, "step": 1029 }, { "epoch": 0.5191532258064516, "grad_norm": 0.17531514828702485, "learning_rate": 9.389067557255661e-07, "loss": 0.2925, "step": 1030 }, { "epoch": 0.5196572580645161, "grad_norm": 0.186495533039337, "learning_rate": 9.3877805240716e-07, "loss": 0.2951, "step": 1031 }, { "epoch": 0.5201612903225806, "grad_norm": 0.20577228144105744, "learning_rate": 9.386492225040039e-07, "loss": 0.2962, "step": 1032 }, { "epoch": 0.5206653225806451, "grad_norm": 0.17419768193385268, "learning_rate": 9.385202660532643e-07, "loss": 0.2855, "step": 1033 }, { "epoch": 0.5211693548387096, "grad_norm": 0.21933160804478288, "learning_rate": 9.383911830921445e-07, "loss": 0.3005, "step": 1034 }, { "epoch": 0.5216733870967742, "grad_norm": 0.18215272532714025, "learning_rate": 9.38261973657884e-07, "loss": 0.2836, "step": 1035 }, { "epoch": 0.5221774193548387, "grad_norm": 0.17976375005098702, "learning_rate": 9.38132637787759e-07, "loss": 0.287, "step": 1036 }, { "epoch": 0.5226814516129032, "grad_norm": 0.28128296670889563, "learning_rate": 9.380031755190823e-07, "loss": 0.2775, "step": 1037 }, { "epoch": 0.5231854838709677, "grad_norm": 0.23134564761372306, "learning_rate": 9.378735868892024e-07, "loss": 0.2968, "step": 1038 }, { "epoch": 0.5236895161290323, "grad_norm": 0.18342110673735257, "learning_rate": 9.377438719355054e-07, "loss": 0.2907, "step": 1039 }, { "epoch": 0.5241935483870968, "grad_norm": 0.18862222844193371, "learning_rate": 9.37614030695413e-07, "loss": 0.2911, "step": 1040 }, { "epoch": 0.5246975806451613, "grad_norm": 0.30077945487794844, "learning_rate": 9.374840632063836e-07, "loss": 0.2783, "step": 1041 }, { "epoch": 0.5252016129032258, "grad_norm": 0.18403419856623815, "learning_rate": 9.37353969505912e-07, "loss": 0.2873, "step": 1042 }, { "epoch": 0.5257056451612904, "grad_norm": 0.19055837973677883, "learning_rate": 9.372237496315295e-07, "loss": 0.3061, "step": 1043 }, { "epoch": 0.5262096774193549, "grad_norm": 0.1832527411516626, "learning_rate": 9.370934036208037e-07, "loss": 0.3085, "step": 1044 }, { "epoch": 0.5267137096774194, "grad_norm": 0.20346555869158944, "learning_rate": 9.369629315113385e-07, "loss": 0.3114, "step": 1045 }, { "epoch": 0.5272177419354839, "grad_norm": 0.1881315222709536, "learning_rate": 9.368323333407746e-07, "loss": 0.3027, "step": 1046 }, { "epoch": 0.5277217741935484, "grad_norm": 0.20889874047965842, "learning_rate": 9.367016091467885e-07, "loss": 0.2935, "step": 1047 }, { "epoch": 0.5282258064516129, "grad_norm": 0.22770873904210787, "learning_rate": 9.365707589670933e-07, "loss": 0.2808, "step": 1048 }, { "epoch": 0.5287298387096774, "grad_norm": 0.1992019569628758, "learning_rate": 9.364397828394386e-07, "loss": 0.2804, "step": 1049 }, { "epoch": 0.5292338709677419, "grad_norm": 0.1753127135289721, "learning_rate": 9.3630868080161e-07, "loss": 0.2811, "step": 1050 }, { "epoch": 0.5297379032258065, "grad_norm": 0.17661722427381488, "learning_rate": 9.361774528914299e-07, "loss": 0.2806, "step": 1051 }, { "epoch": 0.530241935483871, "grad_norm": 0.1889290840413473, "learning_rate": 9.360460991467567e-07, "loss": 0.2913, "step": 1052 }, { "epoch": 0.5307459677419355, "grad_norm": 0.1829226305533146, "learning_rate": 9.35914619605485e-07, "loss": 0.2979, "step": 1053 }, { "epoch": 0.53125, "grad_norm": 0.19271715211398896, "learning_rate": 9.357830143055459e-07, "loss": 0.282, "step": 1054 }, { "epoch": 0.5317540322580645, "grad_norm": 0.18190812734541403, "learning_rate": 9.356512832849066e-07, "loss": 0.2941, "step": 1055 }, { "epoch": 0.532258064516129, "grad_norm": 0.18587084451978964, "learning_rate": 9.35519426581571e-07, "loss": 0.3082, "step": 1056 }, { "epoch": 0.5327620967741935, "grad_norm": 0.2198278002815469, "learning_rate": 9.353874442335785e-07, "loss": 0.302, "step": 1057 }, { "epoch": 0.5332661290322581, "grad_norm": 0.17574611035139306, "learning_rate": 9.352553362790054e-07, "loss": 0.2882, "step": 1058 }, { "epoch": 0.5337701612903226, "grad_norm": 0.1747327396768012, "learning_rate": 9.351231027559643e-07, "loss": 0.2867, "step": 1059 }, { "epoch": 0.5342741935483871, "grad_norm": 0.22698038959811936, "learning_rate": 9.349907437026034e-07, "loss": 0.2811, "step": 1060 }, { "epoch": 0.5347782258064516, "grad_norm": 0.18501161743729416, "learning_rate": 9.348582591571075e-07, "loss": 0.2917, "step": 1061 }, { "epoch": 0.5352822580645161, "grad_norm": 0.19282052250090315, "learning_rate": 9.347256491576976e-07, "loss": 0.2725, "step": 1062 }, { "epoch": 0.5357862903225806, "grad_norm": 0.18478580670133535, "learning_rate": 9.345929137426311e-07, "loss": 0.2774, "step": 1063 }, { "epoch": 0.5362903225806451, "grad_norm": 0.17685008655382356, "learning_rate": 9.344600529502009e-07, "loss": 0.2945, "step": 1064 }, { "epoch": 0.5367943548387096, "grad_norm": 0.20983177271923298, "learning_rate": 9.34327066818737e-07, "loss": 0.281, "step": 1065 }, { "epoch": 0.5372983870967742, "grad_norm": 0.2077770172618288, "learning_rate": 9.341939553866048e-07, "loss": 0.2922, "step": 1066 }, { "epoch": 0.5378024193548387, "grad_norm": 0.22247565527885046, "learning_rate": 9.340607186922059e-07, "loss": 0.2788, "step": 1067 }, { "epoch": 0.5383064516129032, "grad_norm": 0.2072454495569315, "learning_rate": 9.339273567739787e-07, "loss": 0.2919, "step": 1068 }, { "epoch": 0.5388104838709677, "grad_norm": 0.1745117991986638, "learning_rate": 9.337938696703971e-07, "loss": 0.2847, "step": 1069 }, { "epoch": 0.5393145161290323, "grad_norm": 0.2100754257810942, "learning_rate": 9.336602574199713e-07, "loss": 0.3018, "step": 1070 }, { "epoch": 0.5398185483870968, "grad_norm": 0.20378489641904865, "learning_rate": 9.335265200612477e-07, "loss": 0.2911, "step": 1071 }, { "epoch": 0.5403225806451613, "grad_norm": 0.20915414310743302, "learning_rate": 9.333926576328087e-07, "loss": 0.2934, "step": 1072 }, { "epoch": 0.5408266129032258, "grad_norm": 0.29515599957599536, "learning_rate": 9.332586701732725e-07, "loss": 0.2899, "step": 1073 }, { "epoch": 0.5413306451612904, "grad_norm": 0.1983091401677111, "learning_rate": 9.331245577212938e-07, "loss": 0.2741, "step": 1074 }, { "epoch": 0.5418346774193549, "grad_norm": 0.2538289487328285, "learning_rate": 9.329903203155633e-07, "loss": 0.2927, "step": 1075 }, { "epoch": 0.5423387096774194, "grad_norm": 0.17986202552902045, "learning_rate": 9.328559579948078e-07, "loss": 0.2922, "step": 1076 }, { "epoch": 0.5428427419354839, "grad_norm": 0.1825249327016375, "learning_rate": 9.327214707977898e-07, "loss": 0.3001, "step": 1077 }, { "epoch": 0.5433467741935484, "grad_norm": 0.2444225537955273, "learning_rate": 9.325868587633079e-07, "loss": 0.2947, "step": 1078 }, { "epoch": 0.5438508064516129, "grad_norm": 0.41133577581929226, "learning_rate": 9.32452121930197e-07, "loss": 0.2912, "step": 1079 }, { "epoch": 0.5443548387096774, "grad_norm": 0.26649460477657994, "learning_rate": 9.323172603373278e-07, "loss": 0.2992, "step": 1080 }, { "epoch": 0.5448588709677419, "grad_norm": 0.18547372683540594, "learning_rate": 9.321822740236071e-07, "loss": 0.3008, "step": 1081 }, { "epoch": 0.5453629032258065, "grad_norm": 0.2046461782645962, "learning_rate": 9.320471630279776e-07, "loss": 0.2998, "step": 1082 }, { "epoch": 0.545866935483871, "grad_norm": 0.18110940549710675, "learning_rate": 9.319119273894179e-07, "loss": 0.2761, "step": 1083 }, { "epoch": 0.5463709677419355, "grad_norm": 0.20900035767008068, "learning_rate": 9.317765671469428e-07, "loss": 0.2808, "step": 1084 }, { "epoch": 0.546875, "grad_norm": 0.36426577176365416, "learning_rate": 9.316410823396026e-07, "loss": 0.2904, "step": 1085 }, { "epoch": 0.5473790322580645, "grad_norm": 0.2316323693848249, "learning_rate": 9.315054730064842e-07, "loss": 0.284, "step": 1086 }, { "epoch": 0.547883064516129, "grad_norm": 0.18001191011621256, "learning_rate": 9.313697391867099e-07, "loss": 0.2781, "step": 1087 }, { "epoch": 0.5483870967741935, "grad_norm": 0.18570361057953308, "learning_rate": 9.31233880919438e-07, "loss": 0.3016, "step": 1088 }, { "epoch": 0.5488911290322581, "grad_norm": 0.1965322122969676, "learning_rate": 9.310978982438627e-07, "loss": 0.2717, "step": 1089 }, { "epoch": 0.5493951612903226, "grad_norm": 0.18899479748188522, "learning_rate": 9.309617911992143e-07, "loss": 0.3011, "step": 1090 }, { "epoch": 0.5498991935483871, "grad_norm": 0.19454709669812478, "learning_rate": 9.308255598247589e-07, "loss": 0.2959, "step": 1091 }, { "epoch": 0.5504032258064516, "grad_norm": 0.25948798557199915, "learning_rate": 9.306892041597982e-07, "loss": 0.2894, "step": 1092 }, { "epoch": 0.5509072580645161, "grad_norm": 0.18772311499552344, "learning_rate": 9.305527242436702e-07, "loss": 0.2753, "step": 1093 }, { "epoch": 0.5514112903225806, "grad_norm": 0.1933597255454464, "learning_rate": 9.304161201157484e-07, "loss": 0.288, "step": 1094 }, { "epoch": 0.5519153225806451, "grad_norm": 0.1769738727867225, "learning_rate": 9.30279391815442e-07, "loss": 0.2872, "step": 1095 }, { "epoch": 0.5524193548387096, "grad_norm": 0.18100112677940808, "learning_rate": 9.301425393821967e-07, "loss": 0.2818, "step": 1096 }, { "epoch": 0.5529233870967742, "grad_norm": 0.23375376145862312, "learning_rate": 9.300055628554933e-07, "loss": 0.2949, "step": 1097 }, { "epoch": 0.5534274193548387, "grad_norm": 0.19417823391326344, "learning_rate": 9.298684622748487e-07, "loss": 0.2877, "step": 1098 }, { "epoch": 0.5539314516129032, "grad_norm": 0.1744889878512034, "learning_rate": 9.297312376798158e-07, "loss": 0.2842, "step": 1099 }, { "epoch": 0.5544354838709677, "grad_norm": 0.16932825168461108, "learning_rate": 9.295938891099828e-07, "loss": 0.2951, "step": 1100 }, { "epoch": 0.5549395161290323, "grad_norm": 0.17870713463323182, "learning_rate": 9.294564166049739e-07, "loss": 0.2962, "step": 1101 }, { "epoch": 0.5554435483870968, "grad_norm": 0.2132310949282931, "learning_rate": 9.293188202044493e-07, "loss": 0.289, "step": 1102 }, { "epoch": 0.5559475806451613, "grad_norm": 0.19629585998507623, "learning_rate": 9.291810999481045e-07, "loss": 0.2867, "step": 1103 }, { "epoch": 0.5564516129032258, "grad_norm": 0.173069340951293, "learning_rate": 9.29043255875671e-07, "loss": 0.2885, "step": 1104 }, { "epoch": 0.5569556451612904, "grad_norm": 0.18743625034354597, "learning_rate": 9.289052880269159e-07, "loss": 0.2889, "step": 1105 }, { "epoch": 0.5574596774193549, "grad_norm": 0.1747610249774233, "learning_rate": 9.287671964416423e-07, "loss": 0.2981, "step": 1106 }, { "epoch": 0.5579637096774194, "grad_norm": 0.1803315112726341, "learning_rate": 9.286289811596883e-07, "loss": 0.2958, "step": 1107 }, { "epoch": 0.5584677419354839, "grad_norm": 0.18669991895521104, "learning_rate": 9.284906422209288e-07, "loss": 0.2868, "step": 1108 }, { "epoch": 0.5589717741935484, "grad_norm": 0.1836448818997824, "learning_rate": 9.283521796652732e-07, "loss": 0.2896, "step": 1109 }, { "epoch": 0.5594758064516129, "grad_norm": 0.17629063140324913, "learning_rate": 9.282135935326672e-07, "loss": 0.2853, "step": 1110 }, { "epoch": 0.5599798387096774, "grad_norm": 0.17145118521395394, "learning_rate": 9.280748838630923e-07, "loss": 0.2845, "step": 1111 }, { "epoch": 0.5604838709677419, "grad_norm": 0.2113289196484219, "learning_rate": 9.27936050696565e-07, "loss": 0.2881, "step": 1112 }, { "epoch": 0.5609879032258065, "grad_norm": 0.21214281262489457, "learning_rate": 9.277970940731381e-07, "loss": 0.278, "step": 1113 }, { "epoch": 0.561491935483871, "grad_norm": 0.27553648004742354, "learning_rate": 9.276580140328996e-07, "loss": 0.3008, "step": 1114 }, { "epoch": 0.5619959677419355, "grad_norm": 0.22325828203211032, "learning_rate": 9.275188106159732e-07, "loss": 0.2786, "step": 1115 }, { "epoch": 0.5625, "grad_norm": 0.1847934093487764, "learning_rate": 9.273794838625184e-07, "loss": 0.2748, "step": 1116 }, { "epoch": 0.5630040322580645, "grad_norm": 0.1909450760618991, "learning_rate": 9.272400338127299e-07, "loss": 0.2772, "step": 1117 }, { "epoch": 0.563508064516129, "grad_norm": 0.1793116127902729, "learning_rate": 9.271004605068382e-07, "loss": 0.3071, "step": 1118 }, { "epoch": 0.5640120967741935, "grad_norm": 0.1813871461319359, "learning_rate": 9.269607639851095e-07, "loss": 0.2968, "step": 1119 }, { "epoch": 0.5645161290322581, "grad_norm": 0.2411100780601475, "learning_rate": 9.268209442878452e-07, "loss": 0.287, "step": 1120 }, { "epoch": 0.5650201612903226, "grad_norm": 0.22883000547852828, "learning_rate": 9.266810014553826e-07, "loss": 0.2943, "step": 1121 }, { "epoch": 0.5655241935483871, "grad_norm": 0.2008124860780277, "learning_rate": 9.265409355280941e-07, "loss": 0.2911, "step": 1122 }, { "epoch": 0.5660282258064516, "grad_norm": 0.3472215822956789, "learning_rate": 9.26400746546388e-07, "loss": 0.2988, "step": 1123 }, { "epoch": 0.5665322580645161, "grad_norm": 0.26593221261781297, "learning_rate": 9.262604345507079e-07, "loss": 0.2965, "step": 1124 }, { "epoch": 0.5670362903225806, "grad_norm": 0.1797781151682231, "learning_rate": 9.26119999581533e-07, "loss": 0.2818, "step": 1125 }, { "epoch": 0.5675403225806451, "grad_norm": 0.18192713954001474, "learning_rate": 9.25979441679378e-07, "loss": 0.2847, "step": 1126 }, { "epoch": 0.5680443548387096, "grad_norm": 0.2591562379865001, "learning_rate": 9.258387608847928e-07, "loss": 0.2906, "step": 1127 }, { "epoch": 0.5685483870967742, "grad_norm": 0.23679205899885067, "learning_rate": 9.256979572383631e-07, "loss": 0.2826, "step": 1128 }, { "epoch": 0.5690524193548387, "grad_norm": 0.31203954672135736, "learning_rate": 9.255570307807097e-07, "loss": 0.2811, "step": 1129 }, { "epoch": 0.5695564516129032, "grad_norm": 0.24288407570831536, "learning_rate": 9.254159815524891e-07, "loss": 0.2914, "step": 1130 }, { "epoch": 0.5700604838709677, "grad_norm": 0.22988163476909568, "learning_rate": 9.252748095943931e-07, "loss": 0.2903, "step": 1131 }, { "epoch": 0.5705645161290323, "grad_norm": 0.2003178753492969, "learning_rate": 9.251335149471491e-07, "loss": 0.2816, "step": 1132 }, { "epoch": 0.5710685483870968, "grad_norm": 0.17031682685543056, "learning_rate": 9.249920976515195e-07, "loss": 0.275, "step": 1133 }, { "epoch": 0.5715725806451613, "grad_norm": 0.1827276740823613, "learning_rate": 9.248505577483026e-07, "loss": 0.2869, "step": 1134 }, { "epoch": 0.5720766129032258, "grad_norm": 0.2619379606957767, "learning_rate": 9.247088952783313e-07, "loss": 0.2812, "step": 1135 }, { "epoch": 0.5725806451612904, "grad_norm": 0.1802141216250462, "learning_rate": 9.245671102824748e-07, "loss": 0.2921, "step": 1136 }, { "epoch": 0.5730846774193549, "grad_norm": 0.19106913986785806, "learning_rate": 9.244252028016371e-07, "loss": 0.284, "step": 1137 }, { "epoch": 0.5735887096774194, "grad_norm": 0.1812826559618984, "learning_rate": 9.242831728767575e-07, "loss": 0.2803, "step": 1138 }, { "epoch": 0.5740927419354839, "grad_norm": 0.1734515314933056, "learning_rate": 9.241410205488108e-07, "loss": 0.2869, "step": 1139 }, { "epoch": 0.5745967741935484, "grad_norm": 0.19432579296048832, "learning_rate": 9.23998745858807e-07, "loss": 0.2856, "step": 1140 }, { "epoch": 0.5751008064516129, "grad_norm": 0.18377955104622667, "learning_rate": 9.238563488477918e-07, "loss": 0.2926, "step": 1141 }, { "epoch": 0.5756048387096774, "grad_norm": 0.18200221102162675, "learning_rate": 9.237138295568454e-07, "loss": 0.2881, "step": 1142 }, { "epoch": 0.5761088709677419, "grad_norm": 0.18619952457281, "learning_rate": 9.23571188027084e-07, "loss": 0.2842, "step": 1143 }, { "epoch": 0.5766129032258065, "grad_norm": 0.1962070569411852, "learning_rate": 9.234284242996588e-07, "loss": 0.2831, "step": 1144 }, { "epoch": 0.577116935483871, "grad_norm": 0.17904916055767378, "learning_rate": 9.23285538415756e-07, "loss": 0.2927, "step": 1145 }, { "epoch": 0.5776209677419355, "grad_norm": 0.20485807453286825, "learning_rate": 9.231425304165976e-07, "loss": 0.283, "step": 1146 }, { "epoch": 0.578125, "grad_norm": 0.17697640569451753, "learning_rate": 9.229994003434405e-07, "loss": 0.2987, "step": 1147 }, { "epoch": 0.5786290322580645, "grad_norm": 0.17424623251522328, "learning_rate": 9.228561482375766e-07, "loss": 0.2746, "step": 1148 }, { "epoch": 0.579133064516129, "grad_norm": 0.19037141787800266, "learning_rate": 9.227127741403336e-07, "loss": 0.2873, "step": 1149 }, { "epoch": 0.5796370967741935, "grad_norm": 0.18343536870106394, "learning_rate": 9.225692780930736e-07, "loss": 0.2876, "step": 1150 }, { "epoch": 0.5801411290322581, "grad_norm": 0.1774843130008874, "learning_rate": 9.224256601371947e-07, "loss": 0.2934, "step": 1151 }, { "epoch": 0.5806451612903226, "grad_norm": 0.1890382236967552, "learning_rate": 9.222819203141295e-07, "loss": 0.2716, "step": 1152 }, { "epoch": 0.5811491935483871, "grad_norm": 0.1813676084499092, "learning_rate": 9.221380586653462e-07, "loss": 0.2794, "step": 1153 }, { "epoch": 0.5816532258064516, "grad_norm": 0.170202685121712, "learning_rate": 9.21994075232348e-07, "loss": 0.2872, "step": 1154 }, { "epoch": 0.5821572580645161, "grad_norm": 0.1896602238301285, "learning_rate": 9.218499700566733e-07, "loss": 0.2904, "step": 1155 }, { "epoch": 0.5826612903225806, "grad_norm": 0.1816413232762944, "learning_rate": 9.217057431798954e-07, "loss": 0.2873, "step": 1156 }, { "epoch": 0.5831653225806451, "grad_norm": 0.19216255744528832, "learning_rate": 9.215613946436229e-07, "loss": 0.297, "step": 1157 }, { "epoch": 0.5836693548387096, "grad_norm": 0.17974789944825, "learning_rate": 9.214169244894996e-07, "loss": 0.293, "step": 1158 }, { "epoch": 0.5841733870967742, "grad_norm": 0.17992839360699955, "learning_rate": 9.21272332759204e-07, "loss": 0.287, "step": 1159 }, { "epoch": 0.5846774193548387, "grad_norm": 0.18513535414685894, "learning_rate": 9.211276194944501e-07, "loss": 0.2794, "step": 1160 }, { "epoch": 0.5851814516129032, "grad_norm": 0.1819833313378893, "learning_rate": 9.209827847369866e-07, "loss": 0.2744, "step": 1161 }, { "epoch": 0.5856854838709677, "grad_norm": 0.17281034086736458, "learning_rate": 9.208378285285977e-07, "loss": 0.2945, "step": 1162 }, { "epoch": 0.5861895161290323, "grad_norm": 0.23268827282788637, "learning_rate": 9.206927509111022e-07, "loss": 0.2931, "step": 1163 }, { "epoch": 0.5866935483870968, "grad_norm": 0.18019045409013396, "learning_rate": 9.205475519263541e-07, "loss": 0.2892, "step": 1164 }, { "epoch": 0.5871975806451613, "grad_norm": 0.19222136198482753, "learning_rate": 9.204022316162424e-07, "loss": 0.2977, "step": 1165 }, { "epoch": 0.5877016129032258, "grad_norm": 0.17442507458285844, "learning_rate": 9.202567900226912e-07, "loss": 0.286, "step": 1166 }, { "epoch": 0.5882056451612904, "grad_norm": 0.17138337080066757, "learning_rate": 9.201112271876593e-07, "loss": 0.284, "step": 1167 }, { "epoch": 0.5887096774193549, "grad_norm": 0.2425654731892946, "learning_rate": 9.199655431531409e-07, "loss": 0.2814, "step": 1168 }, { "epoch": 0.5892137096774194, "grad_norm": 0.1846246469710363, "learning_rate": 9.198197379611647e-07, "loss": 0.2854, "step": 1169 }, { "epoch": 0.5897177419354839, "grad_norm": 0.19156185603852782, "learning_rate": 9.19673811653795e-07, "loss": 0.2999, "step": 1170 }, { "epoch": 0.5902217741935484, "grad_norm": 0.17846888518866763, "learning_rate": 9.195277642731303e-07, "loss": 0.2944, "step": 1171 }, { "epoch": 0.5907258064516129, "grad_norm": 0.18016209804691175, "learning_rate": 9.193815958613043e-07, "loss": 0.2909, "step": 1172 }, { "epoch": 0.5912298387096774, "grad_norm": 0.1818568667655358, "learning_rate": 9.192353064604858e-07, "loss": 0.2977, "step": 1173 }, { "epoch": 0.5917338709677419, "grad_norm": 0.17227833790308136, "learning_rate": 9.190888961128787e-07, "loss": 0.2799, "step": 1174 }, { "epoch": 0.5922379032258065, "grad_norm": 0.1738364554972949, "learning_rate": 9.18942364860721e-07, "loss": 0.2764, "step": 1175 }, { "epoch": 0.592741935483871, "grad_norm": 0.19302662529321246, "learning_rate": 9.187957127462863e-07, "loss": 0.2769, "step": 1176 }, { "epoch": 0.5932459677419355, "grad_norm": 0.17788471334305958, "learning_rate": 9.186489398118827e-07, "loss": 0.2824, "step": 1177 }, { "epoch": 0.59375, "grad_norm": 0.17681503923586112, "learning_rate": 9.185020460998534e-07, "loss": 0.287, "step": 1178 }, { "epoch": 0.5942540322580645, "grad_norm": 0.176047828469218, "learning_rate": 9.183550316525763e-07, "loss": 0.2854, "step": 1179 }, { "epoch": 0.594758064516129, "grad_norm": 0.17351698958327808, "learning_rate": 9.182078965124643e-07, "loss": 0.3057, "step": 1180 }, { "epoch": 0.5952620967741935, "grad_norm": 0.1821800882504011, "learning_rate": 9.180606407219644e-07, "loss": 0.2759, "step": 1181 }, { "epoch": 0.5957661290322581, "grad_norm": 0.17241468751173047, "learning_rate": 9.179132643235598e-07, "loss": 0.2909, "step": 1182 }, { "epoch": 0.5962701612903226, "grad_norm": 0.18984463320176223, "learning_rate": 9.177657673597671e-07, "loss": 0.2772, "step": 1183 }, { "epoch": 0.5967741935483871, "grad_norm": 0.17848369146299375, "learning_rate": 9.176181498731385e-07, "loss": 0.293, "step": 1184 }, { "epoch": 0.5972782258064516, "grad_norm": 0.17602029953890927, "learning_rate": 9.174704119062606e-07, "loss": 0.2843, "step": 1185 }, { "epoch": 0.5977822580645161, "grad_norm": 0.19474495339674502, "learning_rate": 9.173225535017551e-07, "loss": 0.2928, "step": 1186 }, { "epoch": 0.5982862903225806, "grad_norm": 0.17273250226175776, "learning_rate": 9.171745747022778e-07, "loss": 0.2914, "step": 1187 }, { "epoch": 0.5987903225806451, "grad_norm": 0.21145261381905367, "learning_rate": 9.170264755505201e-07, "loss": 0.2928, "step": 1188 }, { "epoch": 0.5992943548387096, "grad_norm": 0.2039002377170998, "learning_rate": 9.168782560892077e-07, "loss": 0.2701, "step": 1189 }, { "epoch": 0.5997983870967742, "grad_norm": 0.17988346160874077, "learning_rate": 9.167299163611007e-07, "loss": 0.2751, "step": 1190 }, { "epoch": 0.6003024193548387, "grad_norm": 0.2493981489367964, "learning_rate": 9.165814564089944e-07, "loss": 0.2657, "step": 1191 }, { "epoch": 0.6008064516129032, "grad_norm": 0.1797786958096351, "learning_rate": 9.164328762757184e-07, "loss": 0.2833, "step": 1192 }, { "epoch": 0.6013104838709677, "grad_norm": 0.19193093734013503, "learning_rate": 9.162841760041373e-07, "loss": 0.2858, "step": 1193 }, { "epoch": 0.6018145161290323, "grad_norm": 0.2769241015425666, "learning_rate": 9.161353556371503e-07, "loss": 0.3026, "step": 1194 }, { "epoch": 0.6018145161290323, "eval_loss": 0.31878647208213806, "eval_runtime": 16.9743, "eval_samples_per_second": 50.37, "eval_steps_per_second": 1.06, "step": 1194 }, { "epoch": 0.6023185483870968, "grad_norm": 0.181263001899909, "learning_rate": 9.159864152176908e-07, "loss": 0.304, "step": 1195 }, { "epoch": 0.6028225806451613, "grad_norm": 0.17978051150161645, "learning_rate": 9.158373547887275e-07, "loss": 0.2873, "step": 1196 }, { "epoch": 0.6033266129032258, "grad_norm": 0.20272556727914262, "learning_rate": 9.156881743932634e-07, "loss": 0.2815, "step": 1197 }, { "epoch": 0.6038306451612904, "grad_norm": 0.21068658372368834, "learning_rate": 9.155388740743361e-07, "loss": 0.2853, "step": 1198 }, { "epoch": 0.6043346774193549, "grad_norm": 0.22437088927660387, "learning_rate": 9.153894538750176e-07, "loss": 0.283, "step": 1199 }, { "epoch": 0.6048387096774194, "grad_norm": 0.1931533084578361, "learning_rate": 9.152399138384148e-07, "loss": 0.282, "step": 1200 }, { "epoch": 0.6053427419354839, "grad_norm": 0.18479701434227452, "learning_rate": 9.150902540076693e-07, "loss": 0.2965, "step": 1201 }, { "epoch": 0.6058467741935484, "grad_norm": 0.2210412349427078, "learning_rate": 9.149404744259568e-07, "loss": 0.2888, "step": 1202 }, { "epoch": 0.6063508064516129, "grad_norm": 0.18226679427751902, "learning_rate": 9.147905751364878e-07, "loss": 0.2887, "step": 1203 }, { "epoch": 0.6068548387096774, "grad_norm": 0.1832755725258158, "learning_rate": 9.146405561825075e-07, "loss": 0.2948, "step": 1204 }, { "epoch": 0.6073588709677419, "grad_norm": 0.1720010372993489, "learning_rate": 9.144904176072952e-07, "loss": 0.2829, "step": 1205 }, { "epoch": 0.6078629032258065, "grad_norm": 0.17565878536855542, "learning_rate": 9.14340159454165e-07, "loss": 0.2815, "step": 1206 }, { "epoch": 0.608366935483871, "grad_norm": 0.18227852681153148, "learning_rate": 9.141897817664657e-07, "loss": 0.2878, "step": 1207 }, { "epoch": 0.6088709677419355, "grad_norm": 0.1965323622835235, "learning_rate": 9.140392845875799e-07, "loss": 0.2845, "step": 1208 }, { "epoch": 0.609375, "grad_norm": 0.17206440789039248, "learning_rate": 9.138886679609254e-07, "loss": 0.2852, "step": 1209 }, { "epoch": 0.6098790322580645, "grad_norm": 0.17264736183064056, "learning_rate": 9.137379319299542e-07, "loss": 0.274, "step": 1210 }, { "epoch": 0.610383064516129, "grad_norm": 0.18598352903511223, "learning_rate": 9.135870765381525e-07, "loss": 0.2896, "step": 1211 }, { "epoch": 0.6108870967741935, "grad_norm": 0.19699736881940794, "learning_rate": 9.134361018290413e-07, "loss": 0.2763, "step": 1212 }, { "epoch": 0.6113911290322581, "grad_norm": 0.22313038636092158, "learning_rate": 9.132850078461758e-07, "loss": 0.2846, "step": 1213 }, { "epoch": 0.6118951612903226, "grad_norm": 0.30031307009566305, "learning_rate": 9.131337946331458e-07, "loss": 0.2989, "step": 1214 }, { "epoch": 0.6123991935483871, "grad_norm": 0.2329858913080578, "learning_rate": 9.129824622335752e-07, "loss": 0.3048, "step": 1215 }, { "epoch": 0.6129032258064516, "grad_norm": 0.17993489418040118, "learning_rate": 9.128310106911226e-07, "loss": 0.2932, "step": 1216 }, { "epoch": 0.6134072580645161, "grad_norm": 0.18148081451402603, "learning_rate": 9.126794400494806e-07, "loss": 0.3034, "step": 1217 }, { "epoch": 0.6139112903225806, "grad_norm": 0.20177903959056726, "learning_rate": 9.125277503523766e-07, "loss": 0.2919, "step": 1218 }, { "epoch": 0.6144153225806451, "grad_norm": 0.27324162227606025, "learning_rate": 9.123759416435722e-07, "loss": 0.2922, "step": 1219 }, { "epoch": 0.6149193548387096, "grad_norm": 0.34121798755615584, "learning_rate": 9.122240139668631e-07, "loss": 0.2832, "step": 1220 }, { "epoch": 0.6154233870967742, "grad_norm": 0.21697155231985218, "learning_rate": 9.120719673660796e-07, "loss": 0.282, "step": 1221 }, { "epoch": 0.6159274193548387, "grad_norm": 0.17674277405784744, "learning_rate": 9.119198018850862e-07, "loss": 0.2816, "step": 1222 }, { "epoch": 0.6164314516129032, "grad_norm": 0.16871547242020288, "learning_rate": 9.117675175677815e-07, "loss": 0.2756, "step": 1223 }, { "epoch": 0.6169354838709677, "grad_norm": 0.20057684589974267, "learning_rate": 9.11615114458099e-07, "loss": 0.275, "step": 1224 }, { "epoch": 0.6174395161290323, "grad_norm": 0.18893088728346058, "learning_rate": 9.114625926000057e-07, "loss": 0.2932, "step": 1225 }, { "epoch": 0.6179435483870968, "grad_norm": 0.24323661515306044, "learning_rate": 9.113099520375032e-07, "loss": 0.2884, "step": 1226 }, { "epoch": 0.6184475806451613, "grad_norm": 0.17657599055834897, "learning_rate": 9.111571928146276e-07, "loss": 0.2847, "step": 1227 }, { "epoch": 0.6189516129032258, "grad_norm": 0.1839549106035099, "learning_rate": 9.110043149754487e-07, "loss": 0.2976, "step": 1228 }, { "epoch": 0.6194556451612904, "grad_norm": 0.2056215870054611, "learning_rate": 9.108513185640712e-07, "loss": 0.2812, "step": 1229 }, { "epoch": 0.6199596774193549, "grad_norm": 0.17065753231349062, "learning_rate": 9.106982036246331e-07, "loss": 0.2857, "step": 1230 }, { "epoch": 0.6204637096774194, "grad_norm": 0.18476180000705647, "learning_rate": 9.105449702013076e-07, "loss": 0.2965, "step": 1231 }, { "epoch": 0.6209677419354839, "grad_norm": 0.2000432350203024, "learning_rate": 9.103916183383013e-07, "loss": 0.289, "step": 1232 }, { "epoch": 0.6214717741935484, "grad_norm": 0.17671157012984728, "learning_rate": 9.102381480798553e-07, "loss": 0.2887, "step": 1233 }, { "epoch": 0.6219758064516129, "grad_norm": 0.19186276950774964, "learning_rate": 9.100845594702451e-07, "loss": 0.2855, "step": 1234 }, { "epoch": 0.6224798387096774, "grad_norm": 0.17315544977260117, "learning_rate": 9.099308525537796e-07, "loss": 0.2787, "step": 1235 }, { "epoch": 0.6229838709677419, "grad_norm": 0.21400229908517301, "learning_rate": 9.097770273748027e-07, "loss": 0.2923, "step": 1236 }, { "epoch": 0.6234879032258065, "grad_norm": 0.19574643374456288, "learning_rate": 9.096230839776917e-07, "loss": 0.2849, "step": 1237 }, { "epoch": 0.623991935483871, "grad_norm": 0.18182176245833126, "learning_rate": 9.094690224068585e-07, "loss": 0.282, "step": 1238 }, { "epoch": 0.6244959677419355, "grad_norm": 0.23124623736324354, "learning_rate": 9.09314842706749e-07, "loss": 0.2964, "step": 1239 }, { "epoch": 0.625, "grad_norm": 0.17853145875824106, "learning_rate": 9.091605449218427e-07, "loss": 0.3069, "step": 1240 }, { "epoch": 0.6255040322580645, "grad_norm": 0.2052782675999351, "learning_rate": 9.09006129096654e-07, "loss": 0.3013, "step": 1241 }, { "epoch": 0.626008064516129, "grad_norm": 0.1823088396970403, "learning_rate": 9.088515952757306e-07, "loss": 0.2922, "step": 1242 }, { "epoch": 0.6265120967741935, "grad_norm": 0.20577771722463917, "learning_rate": 9.086969435036547e-07, "loss": 0.2748, "step": 1243 }, { "epoch": 0.6270161290322581, "grad_norm": 0.17813055873331182, "learning_rate": 9.085421738250422e-07, "loss": 0.2939, "step": 1244 }, { "epoch": 0.6275201612903226, "grad_norm": 0.1814603197270135, "learning_rate": 9.083872862845436e-07, "loss": 0.2884, "step": 1245 }, { "epoch": 0.6280241935483871, "grad_norm": 0.19148692203351, "learning_rate": 9.082322809268425e-07, "loss": 0.2726, "step": 1246 }, { "epoch": 0.6285282258064516, "grad_norm": 0.1933452685613287, "learning_rate": 9.080771577966574e-07, "loss": 0.2894, "step": 1247 }, { "epoch": 0.6290322580645161, "grad_norm": 0.18291707054663686, "learning_rate": 9.0792191693874e-07, "loss": 0.2999, "step": 1248 }, { "epoch": 0.6295362903225806, "grad_norm": 0.17702736035496525, "learning_rate": 9.077665583978766e-07, "loss": 0.3027, "step": 1249 }, { "epoch": 0.6300403225806451, "grad_norm": 0.1694200412404369, "learning_rate": 9.076110822188872e-07, "loss": 0.28, "step": 1250 }, { "epoch": 0.6305443548387096, "grad_norm": 0.18581080838795144, "learning_rate": 9.074554884466254e-07, "loss": 0.2938, "step": 1251 }, { "epoch": 0.6310483870967742, "grad_norm": 0.1997263136466941, "learning_rate": 9.072997771259793e-07, "loss": 0.2802, "step": 1252 }, { "epoch": 0.6315524193548387, "grad_norm": 0.21481822334565182, "learning_rate": 9.071439483018708e-07, "loss": 0.2932, "step": 1253 }, { "epoch": 0.6320564516129032, "grad_norm": 0.1791325184424046, "learning_rate": 9.06988002019255e-07, "loss": 0.2834, "step": 1254 }, { "epoch": 0.6325604838709677, "grad_norm": 0.1836618100696337, "learning_rate": 9.06831938323122e-07, "loss": 0.2938, "step": 1255 }, { "epoch": 0.6330645161290323, "grad_norm": 0.19292497762523103, "learning_rate": 9.066757572584948e-07, "loss": 0.2896, "step": 1256 }, { "epoch": 0.6335685483870968, "grad_norm": 0.18240313649056145, "learning_rate": 9.065194588704311e-07, "loss": 0.2814, "step": 1257 }, { "epoch": 0.6340725806451613, "grad_norm": 0.22052975724510962, "learning_rate": 9.063630432040216e-07, "loss": 0.2886, "step": 1258 }, { "epoch": 0.6345766129032258, "grad_norm": 0.20672385722341882, "learning_rate": 9.062065103043915e-07, "loss": 0.2954, "step": 1259 }, { "epoch": 0.6350806451612904, "grad_norm": 0.17995424113662914, "learning_rate": 9.060498602166995e-07, "loss": 0.2866, "step": 1260 }, { "epoch": 0.6355846774193549, "grad_norm": 0.17630973219376414, "learning_rate": 9.058930929861381e-07, "loss": 0.2803, "step": 1261 }, { "epoch": 0.6360887096774194, "grad_norm": 0.2239699462767389, "learning_rate": 9.057362086579336e-07, "loss": 0.2944, "step": 1262 }, { "epoch": 0.6365927419354839, "grad_norm": 0.30137928217223403, "learning_rate": 9.055792072773466e-07, "loss": 0.297, "step": 1263 }, { "epoch": 0.6370967741935484, "grad_norm": 0.22695762562998079, "learning_rate": 9.054220888896706e-07, "loss": 0.2815, "step": 1264 }, { "epoch": 0.6376008064516129, "grad_norm": 0.1751438430718199, "learning_rate": 9.052648535402334e-07, "loss": 0.2862, "step": 1265 }, { "epoch": 0.6381048387096774, "grad_norm": 0.17639322627147327, "learning_rate": 9.051075012743965e-07, "loss": 0.2648, "step": 1266 }, { "epoch": 0.6386088709677419, "grad_norm": 0.18539526527243927, "learning_rate": 9.049500321375549e-07, "loss": 0.2719, "step": 1267 }, { "epoch": 0.6391129032258065, "grad_norm": 0.2689595278098081, "learning_rate": 9.047924461751376e-07, "loss": 0.2875, "step": 1268 }, { "epoch": 0.639616935483871, "grad_norm": 0.178579851083188, "learning_rate": 9.046347434326072e-07, "loss": 0.2915, "step": 1269 }, { "epoch": 0.6401209677419355, "grad_norm": 0.17904389890697903, "learning_rate": 9.044769239554599e-07, "loss": 0.2745, "step": 1270 }, { "epoch": 0.640625, "grad_norm": 0.1896562559936013, "learning_rate": 9.043189877892254e-07, "loss": 0.2883, "step": 1271 }, { "epoch": 0.6411290322580645, "grad_norm": 0.1908189648372141, "learning_rate": 9.041609349794678e-07, "loss": 0.283, "step": 1272 }, { "epoch": 0.641633064516129, "grad_norm": 0.21147318798440223, "learning_rate": 9.040027655717841e-07, "loss": 0.301, "step": 1273 }, { "epoch": 0.6421370967741935, "grad_norm": 0.18345975087456926, "learning_rate": 9.03844479611805e-07, "loss": 0.3139, "step": 1274 }, { "epoch": 0.6426411290322581, "grad_norm": 0.19734085403857743, "learning_rate": 9.036860771451954e-07, "loss": 0.2733, "step": 1275 }, { "epoch": 0.6431451612903226, "grad_norm": 0.17121744147827178, "learning_rate": 9.035275582176533e-07, "loss": 0.2846, "step": 1276 }, { "epoch": 0.6436491935483871, "grad_norm": 0.17622953005984843, "learning_rate": 9.033689228749102e-07, "loss": 0.2965, "step": 1277 }, { "epoch": 0.6441532258064516, "grad_norm": 0.18180991170706243, "learning_rate": 9.032101711627316e-07, "loss": 0.2863, "step": 1278 }, { "epoch": 0.6446572580645161, "grad_norm": 0.18383068016833068, "learning_rate": 9.030513031269165e-07, "loss": 0.2916, "step": 1279 }, { "epoch": 0.6451612903225806, "grad_norm": 0.23069065361517252, "learning_rate": 9.02892318813297e-07, "loss": 0.285, "step": 1280 }, { "epoch": 0.6456653225806451, "grad_norm": 0.27998314668853796, "learning_rate": 9.027332182677397e-07, "loss": 0.2854, "step": 1281 }, { "epoch": 0.6461693548387096, "grad_norm": 0.17547605295017601, "learning_rate": 9.025740015361433e-07, "loss": 0.2839, "step": 1282 }, { "epoch": 0.6466733870967742, "grad_norm": 0.17884356887542877, "learning_rate": 9.024146686644415e-07, "loss": 0.2922, "step": 1283 }, { "epoch": 0.6471774193548387, "grad_norm": 0.18212333265861616, "learning_rate": 9.022552196986006e-07, "loss": 0.2735, "step": 1284 }, { "epoch": 0.6476814516129032, "grad_norm": 0.17605258249805855, "learning_rate": 9.020956546846205e-07, "loss": 0.2907, "step": 1285 }, { "epoch": 0.6481854838709677, "grad_norm": 0.23268702781185105, "learning_rate": 9.01935973668535e-07, "loss": 0.2899, "step": 1286 }, { "epoch": 0.6486895161290323, "grad_norm": 0.17715332276237264, "learning_rate": 9.017761766964111e-07, "loss": 0.2966, "step": 1287 }, { "epoch": 0.6491935483870968, "grad_norm": 0.18537544001467623, "learning_rate": 9.01616263814349e-07, "loss": 0.2851, "step": 1288 }, { "epoch": 0.6496975806451613, "grad_norm": 0.19742055543365888, "learning_rate": 9.014562350684824e-07, "loss": 0.2904, "step": 1289 }, { "epoch": 0.6502016129032258, "grad_norm": 0.17753937663898317, "learning_rate": 9.012960905049791e-07, "loss": 0.2909, "step": 1290 }, { "epoch": 0.6507056451612904, "grad_norm": 0.18127362373513947, "learning_rate": 9.011358301700397e-07, "loss": 0.2991, "step": 1291 }, { "epoch": 0.6512096774193549, "grad_norm": 0.1897788399464913, "learning_rate": 9.009754541098982e-07, "loss": 0.2995, "step": 1292 }, { "epoch": 0.6517137096774194, "grad_norm": 0.17813293972191205, "learning_rate": 9.008149623708219e-07, "loss": 0.2765, "step": 1293 }, { "epoch": 0.6522177419354839, "grad_norm": 0.18329773158412632, "learning_rate": 9.006543549991119e-07, "loss": 0.2936, "step": 1294 }, { "epoch": 0.6527217741935484, "grad_norm": 0.20529257571242965, "learning_rate": 9.004936320411025e-07, "loss": 0.2949, "step": 1295 }, { "epoch": 0.6532258064516129, "grad_norm": 0.21151253255173527, "learning_rate": 9.003327935431612e-07, "loss": 0.2771, "step": 1296 }, { "epoch": 0.6537298387096774, "grad_norm": 0.1796301906371901, "learning_rate": 9.001718395516889e-07, "loss": 0.2869, "step": 1297 }, { "epoch": 0.6542338709677419, "grad_norm": 0.1717234788709778, "learning_rate": 9.000107701131196e-07, "loss": 0.2823, "step": 1298 }, { "epoch": 0.6547379032258065, "grad_norm": 0.17845055491572026, "learning_rate": 8.998495852739213e-07, "loss": 0.3044, "step": 1299 }, { "epoch": 0.655241935483871, "grad_norm": 0.17442341108486234, "learning_rate": 8.996882850805944e-07, "loss": 0.28, "step": 1300 }, { "epoch": 0.6557459677419355, "grad_norm": 0.20073841491430633, "learning_rate": 8.995268695796734e-07, "loss": 0.2811, "step": 1301 }, { "epoch": 0.65625, "grad_norm": 0.21128884092445868, "learning_rate": 8.993653388177254e-07, "loss": 0.2929, "step": 1302 }, { "epoch": 0.6567540322580645, "grad_norm": 0.1850833947673501, "learning_rate": 8.99203692841351e-07, "loss": 0.2914, "step": 1303 }, { "epoch": 0.657258064516129, "grad_norm": 0.19939616483650877, "learning_rate": 8.990419316971842e-07, "loss": 0.2859, "step": 1304 }, { "epoch": 0.6577620967741935, "grad_norm": 0.1750117264820056, "learning_rate": 8.988800554318921e-07, "loss": 0.2982, "step": 1305 }, { "epoch": 0.6582661290322581, "grad_norm": 0.2075447584954836, "learning_rate": 8.987180640921747e-07, "loss": 0.299, "step": 1306 }, { "epoch": 0.6587701612903226, "grad_norm": 0.1790266951422517, "learning_rate": 8.985559577247661e-07, "loss": 0.2776, "step": 1307 }, { "epoch": 0.6592741935483871, "grad_norm": 0.178773579825902, "learning_rate": 8.983937363764324e-07, "loss": 0.2936, "step": 1308 }, { "epoch": 0.6597782258064516, "grad_norm": 0.1738135103379992, "learning_rate": 8.982314000939737e-07, "loss": 0.2912, "step": 1309 }, { "epoch": 0.6602822580645161, "grad_norm": 0.18476555106324266, "learning_rate": 8.98068948924223e-07, "loss": 0.3009, "step": 1310 }, { "epoch": 0.6607862903225806, "grad_norm": 0.20244847551602765, "learning_rate": 8.979063829140465e-07, "loss": 0.2945, "step": 1311 }, { "epoch": 0.6612903225806451, "grad_norm": 0.19339358525691477, "learning_rate": 8.977437021103433e-07, "loss": 0.2788, "step": 1312 }, { "epoch": 0.6617943548387096, "grad_norm": 0.18712592000883932, "learning_rate": 8.975809065600459e-07, "loss": 0.3064, "step": 1313 }, { "epoch": 0.6622983870967742, "grad_norm": 0.22452691139273012, "learning_rate": 8.974179963101201e-07, "loss": 0.2898, "step": 1314 }, { "epoch": 0.6628024193548387, "grad_norm": 0.1796052556590874, "learning_rate": 8.97254971407564e-07, "loss": 0.2813, "step": 1315 }, { "epoch": 0.6633064516129032, "grad_norm": 0.17710526677908528, "learning_rate": 8.970918318994096e-07, "loss": 0.2894, "step": 1316 }, { "epoch": 0.6638104838709677, "grad_norm": 0.18446794030145566, "learning_rate": 8.969285778327215e-07, "loss": 0.2796, "step": 1317 }, { "epoch": 0.6643145161290323, "grad_norm": 0.22546463300214195, "learning_rate": 8.967652092545976e-07, "loss": 0.292, "step": 1318 }, { "epoch": 0.6648185483870968, "grad_norm": 0.24026811231174375, "learning_rate": 8.966017262121687e-07, "loss": 0.2846, "step": 1319 }, { "epoch": 0.6653225806451613, "grad_norm": 0.18394372591314476, "learning_rate": 8.964381287525986e-07, "loss": 0.2994, "step": 1320 }, { "epoch": 0.6658266129032258, "grad_norm": 0.1803179306900601, "learning_rate": 8.962744169230841e-07, "loss": 0.2797, "step": 1321 }, { "epoch": 0.6663306451612904, "grad_norm": 0.16839049819871527, "learning_rate": 8.96110590770855e-07, "loss": 0.2631, "step": 1322 }, { "epoch": 0.6668346774193549, "grad_norm": 0.22155662980251253, "learning_rate": 8.959466503431744e-07, "loss": 0.2816, "step": 1323 }, { "epoch": 0.6673387096774194, "grad_norm": 0.19139441389189604, "learning_rate": 8.957825956873379e-07, "loss": 0.2855, "step": 1324 }, { "epoch": 0.6678427419354839, "grad_norm": 0.19210293112712495, "learning_rate": 8.956184268506742e-07, "loss": 0.2817, "step": 1325 }, { "epoch": 0.6683467741935484, "grad_norm": 0.1857304337358492, "learning_rate": 8.954541438805452e-07, "loss": 0.2873, "step": 1326 }, { "epoch": 0.6688508064516129, "grad_norm": 0.17268224825580095, "learning_rate": 8.952897468243454e-07, "loss": 0.2713, "step": 1327 }, { "epoch": 0.6693548387096774, "grad_norm": 0.1957606146053459, "learning_rate": 8.951252357295022e-07, "loss": 0.2767, "step": 1328 }, { "epoch": 0.6698588709677419, "grad_norm": 0.1824912944669447, "learning_rate": 8.949606106434763e-07, "loss": 0.287, "step": 1329 }, { "epoch": 0.6703629032258065, "grad_norm": 0.1798044187166269, "learning_rate": 8.947958716137608e-07, "loss": 0.3125, "step": 1330 }, { "epoch": 0.670866935483871, "grad_norm": 0.17749986390962377, "learning_rate": 8.946310186878821e-07, "loss": 0.2675, "step": 1331 }, { "epoch": 0.6713709677419355, "grad_norm": 0.17752734169122558, "learning_rate": 8.94466051913399e-07, "loss": 0.2772, "step": 1332 }, { "epoch": 0.671875, "grad_norm": 0.1987933223640386, "learning_rate": 8.943009713379034e-07, "loss": 0.2838, "step": 1333 }, { "epoch": 0.6723790322580645, "grad_norm": 0.1792807492758479, "learning_rate": 8.941357770090203e-07, "loss": 0.2872, "step": 1334 }, { "epoch": 0.672883064516129, "grad_norm": 0.17948534651864595, "learning_rate": 8.939704689744071e-07, "loss": 0.3002, "step": 1335 }, { "epoch": 0.6733870967741935, "grad_norm": 0.1812113705229493, "learning_rate": 8.93805047281754e-07, "loss": 0.2954, "step": 1336 }, { "epoch": 0.6738911290322581, "grad_norm": 0.18844957646714744, "learning_rate": 8.936395119787842e-07, "loss": 0.2697, "step": 1337 }, { "epoch": 0.6743951612903226, "grad_norm": 0.20929818984501958, "learning_rate": 8.934738631132539e-07, "loss": 0.2871, "step": 1338 }, { "epoch": 0.6748991935483871, "grad_norm": 0.188001165636237, "learning_rate": 8.933081007329515e-07, "loss": 0.2973, "step": 1339 }, { "epoch": 0.6754032258064516, "grad_norm": 0.16918518262975, "learning_rate": 8.931422248856982e-07, "loss": 0.2769, "step": 1340 }, { "epoch": 0.6759072580645161, "grad_norm": 0.17795218348215974, "learning_rate": 8.929762356193486e-07, "loss": 0.2957, "step": 1341 }, { "epoch": 0.6764112903225806, "grad_norm": 0.23569227174562507, "learning_rate": 8.928101329817894e-07, "loss": 0.296, "step": 1342 }, { "epoch": 0.6769153225806451, "grad_norm": 0.1810084239182294, "learning_rate": 8.926439170209401e-07, "loss": 0.3002, "step": 1343 }, { "epoch": 0.6774193548387096, "grad_norm": 0.17662865898790905, "learning_rate": 8.92477587784753e-07, "loss": 0.2877, "step": 1344 }, { "epoch": 0.6779233870967742, "grad_norm": 0.20068556141454189, "learning_rate": 8.923111453212131e-07, "loss": 0.2987, "step": 1345 }, { "epoch": 0.6784274193548387, "grad_norm": 0.1856885926434189, "learning_rate": 8.921445896783381e-07, "loss": 0.2762, "step": 1346 }, { "epoch": 0.6789314516129032, "grad_norm": 0.17738716689811637, "learning_rate": 8.919779209041782e-07, "loss": 0.2917, "step": 1347 }, { "epoch": 0.6794354838709677, "grad_norm": 0.17222917661740367, "learning_rate": 8.918111390468162e-07, "loss": 0.2801, "step": 1348 }, { "epoch": 0.6799395161290323, "grad_norm": 0.19151284132256455, "learning_rate": 8.916442441543678e-07, "loss": 0.2706, "step": 1349 }, { "epoch": 0.6804435483870968, "grad_norm": 0.17262129740464702, "learning_rate": 8.91477236274981e-07, "loss": 0.2769, "step": 1350 }, { "epoch": 0.6809475806451613, "grad_norm": 0.1738945976891568, "learning_rate": 8.913101154568366e-07, "loss": 0.2849, "step": 1351 }, { "epoch": 0.6814516129032258, "grad_norm": 0.18441311480041897, "learning_rate": 8.91142881748148e-07, "loss": 0.2684, "step": 1352 }, { "epoch": 0.6819556451612904, "grad_norm": 0.18119499884546747, "learning_rate": 8.90975535197161e-07, "loss": 0.2886, "step": 1353 }, { "epoch": 0.6824596774193549, "grad_norm": 0.1829298283270949, "learning_rate": 8.90808075852154e-07, "loss": 0.2929, "step": 1354 }, { "epoch": 0.6829637096774194, "grad_norm": 0.18164824873198268, "learning_rate": 8.906405037614382e-07, "loss": 0.2935, "step": 1355 }, { "epoch": 0.6834677419354839, "grad_norm": 0.20468371157832624, "learning_rate": 8.904728189733568e-07, "loss": 0.2748, "step": 1356 }, { "epoch": 0.6839717741935484, "grad_norm": 0.1825106724862103, "learning_rate": 8.90305021536286e-07, "loss": 0.3001, "step": 1357 }, { "epoch": 0.6844758064516129, "grad_norm": 0.17622106582753136, "learning_rate": 8.901371114986343e-07, "loss": 0.2819, "step": 1358 }, { "epoch": 0.6849798387096774, "grad_norm": 0.18643087038948788, "learning_rate": 8.899690889088427e-07, "loss": 0.2949, "step": 1359 }, { "epoch": 0.6854838709677419, "grad_norm": 0.1786561974876954, "learning_rate": 8.898009538153847e-07, "loss": 0.2798, "step": 1360 }, { "epoch": 0.6859879032258065, "grad_norm": 0.17854373951330638, "learning_rate": 8.896327062667663e-07, "loss": 0.2989, "step": 1361 }, { "epoch": 0.686491935483871, "grad_norm": 0.1945546348671395, "learning_rate": 8.894643463115257e-07, "loss": 0.2961, "step": 1362 }, { "epoch": 0.6869959677419355, "grad_norm": 0.22049274658233767, "learning_rate": 8.892958739982338e-07, "loss": 0.275, "step": 1363 }, { "epoch": 0.6875, "grad_norm": 0.2020492398451764, "learning_rate": 8.891272893754937e-07, "loss": 0.2949, "step": 1364 }, { "epoch": 0.6880040322580645, "grad_norm": 0.20796221258156572, "learning_rate": 8.889585924919414e-07, "loss": 0.2738, "step": 1365 }, { "epoch": 0.688508064516129, "grad_norm": 0.17715818724452845, "learning_rate": 8.887897833962444e-07, "loss": 0.2779, "step": 1366 }, { "epoch": 0.6890120967741935, "grad_norm": 0.18291220238667136, "learning_rate": 8.886208621371034e-07, "loss": 0.2867, "step": 1367 }, { "epoch": 0.6895161290322581, "grad_norm": 0.1726979302547636, "learning_rate": 8.88451828763251e-07, "loss": 0.2777, "step": 1368 }, { "epoch": 0.6900201612903226, "grad_norm": 0.23127714111621359, "learning_rate": 8.882826833234525e-07, "loss": 0.3048, "step": 1369 }, { "epoch": 0.6905241935483871, "grad_norm": 0.20073601560327714, "learning_rate": 8.881134258665051e-07, "loss": 0.3165, "step": 1370 }, { "epoch": 0.6910282258064516, "grad_norm": 0.18006036389690872, "learning_rate": 8.879440564412384e-07, "loss": 0.2749, "step": 1371 }, { "epoch": 0.6915322580645161, "grad_norm": 0.2323105469609835, "learning_rate": 8.877745750965146e-07, "loss": 0.2909, "step": 1372 }, { "epoch": 0.6920362903225806, "grad_norm": 0.178854463220733, "learning_rate": 8.876049818812281e-07, "loss": 0.2951, "step": 1373 }, { "epoch": 0.6925403225806451, "grad_norm": 0.2431317742511991, "learning_rate": 8.874352768443055e-07, "loss": 0.2938, "step": 1374 }, { "epoch": 0.6930443548387096, "grad_norm": 0.17355381981942172, "learning_rate": 8.872654600347055e-07, "loss": 0.2904, "step": 1375 }, { "epoch": 0.6935483870967742, "grad_norm": 0.19800049499752437, "learning_rate": 8.870955315014193e-07, "loss": 0.284, "step": 1376 }, { "epoch": 0.6940524193548387, "grad_norm": 0.18380625874212533, "learning_rate": 8.869254912934701e-07, "loss": 0.2777, "step": 1377 }, { "epoch": 0.6945564516129032, "grad_norm": 0.2003918541066735, "learning_rate": 8.867553394599137e-07, "loss": 0.2759, "step": 1378 }, { "epoch": 0.6950604838709677, "grad_norm": 0.18695292816627587, "learning_rate": 8.865850760498375e-07, "loss": 0.2978, "step": 1379 }, { "epoch": 0.6955645161290323, "grad_norm": 0.18229058385029212, "learning_rate": 8.864147011123617e-07, "loss": 0.3109, "step": 1380 }, { "epoch": 0.6960685483870968, "grad_norm": 0.2363688988979662, "learning_rate": 8.862442146966385e-07, "loss": 0.3016, "step": 1381 }, { "epoch": 0.6965725806451613, "grad_norm": 0.1795567842038728, "learning_rate": 8.860736168518517e-07, "loss": 0.2957, "step": 1382 }, { "epoch": 0.6970766129032258, "grad_norm": 0.18655389810189696, "learning_rate": 8.859029076272182e-07, "loss": 0.2809, "step": 1383 }, { "epoch": 0.6975806451612904, "grad_norm": 0.17971498800102143, "learning_rate": 8.857320870719864e-07, "loss": 0.2838, "step": 1384 }, { "epoch": 0.6980846774193549, "grad_norm": 0.1915148111115413, "learning_rate": 8.85561155235437e-07, "loss": 0.2922, "step": 1385 }, { "epoch": 0.6985887096774194, "grad_norm": 0.1858146547352289, "learning_rate": 8.853901121668828e-07, "loss": 0.287, "step": 1386 }, { "epoch": 0.6990927419354839, "grad_norm": 0.19284974835593893, "learning_rate": 8.852189579156684e-07, "loss": 0.2994, "step": 1387 }, { "epoch": 0.6995967741935484, "grad_norm": 0.17664023987078498, "learning_rate": 8.850476925311711e-07, "loss": 0.2753, "step": 1388 }, { "epoch": 0.7001008064516129, "grad_norm": 0.17603041348701104, "learning_rate": 8.848763160627997e-07, "loss": 0.2862, "step": 1389 }, { "epoch": 0.7006048387096774, "grad_norm": 0.17114594668139757, "learning_rate": 8.847048285599952e-07, "loss": 0.2832, "step": 1390 }, { "epoch": 0.7011088709677419, "grad_norm": 0.18467144014813242, "learning_rate": 8.84533230072231e-07, "loss": 0.2761, "step": 1391 }, { "epoch": 0.7016129032258065, "grad_norm": 0.19349322968299676, "learning_rate": 8.843615206490118e-07, "loss": 0.2953, "step": 1392 }, { "epoch": 0.702116935483871, "grad_norm": 0.18516765422030596, "learning_rate": 8.841897003398749e-07, "loss": 0.2892, "step": 1393 }, { "epoch": 0.702116935483871, "eval_loss": 0.31658247113227844, "eval_runtime": 16.8213, "eval_samples_per_second": 50.829, "eval_steps_per_second": 1.07, "step": 1393 }, { "epoch": 0.7026209677419355, "grad_norm": 0.17985038724483002, "learning_rate": 8.840177691943895e-07, "loss": 0.2785, "step": 1394 }, { "epoch": 0.703125, "grad_norm": 0.19728174011424768, "learning_rate": 8.838457272621565e-07, "loss": 0.2942, "step": 1395 }, { "epoch": 0.7036290322580645, "grad_norm": 0.22391820163520273, "learning_rate": 8.836735745928089e-07, "loss": 0.285, "step": 1396 }, { "epoch": 0.704133064516129, "grad_norm": 0.17792486538644323, "learning_rate": 8.835013112360118e-07, "loss": 0.2889, "step": 1397 }, { "epoch": 0.7046370967741935, "grad_norm": 0.17801651435359686, "learning_rate": 8.833289372414621e-07, "loss": 0.2652, "step": 1398 }, { "epoch": 0.7051411290322581, "grad_norm": 0.18753019276564675, "learning_rate": 8.831564526588886e-07, "loss": 0.28, "step": 1399 }, { "epoch": 0.7056451612903226, "grad_norm": 0.2040287956468218, "learning_rate": 8.82983857538052e-07, "loss": 0.2754, "step": 1400 }, { "epoch": 0.7061491935483871, "grad_norm": 0.2003540418757947, "learning_rate": 8.828111519287451e-07, "loss": 0.289, "step": 1401 }, { "epoch": 0.7066532258064516, "grad_norm": 0.20667446057464192, "learning_rate": 8.82638335880792e-07, "loss": 0.2983, "step": 1402 }, { "epoch": 0.7071572580645161, "grad_norm": 0.18640904253903684, "learning_rate": 8.824654094440496e-07, "loss": 0.28, "step": 1403 }, { "epoch": 0.7076612903225806, "grad_norm": 0.1841514649749984, "learning_rate": 8.822923726684057e-07, "loss": 0.2883, "step": 1404 }, { "epoch": 0.7081653225806451, "grad_norm": 0.18229858409313876, "learning_rate": 8.821192256037804e-07, "loss": 0.2909, "step": 1405 }, { "epoch": 0.7086693548387096, "grad_norm": 0.21389681862115453, "learning_rate": 8.819459683001257e-07, "loss": 0.2807, "step": 1406 }, { "epoch": 0.7091733870967742, "grad_norm": 0.17509601564433855, "learning_rate": 8.817726008074252e-07, "loss": 0.2866, "step": 1407 }, { "epoch": 0.7096774193548387, "grad_norm": 0.1766608417202575, "learning_rate": 8.815991231756942e-07, "loss": 0.2832, "step": 1408 }, { "epoch": 0.7101814516129032, "grad_norm": 0.17771797879063636, "learning_rate": 8.814255354549801e-07, "loss": 0.2859, "step": 1409 }, { "epoch": 0.7106854838709677, "grad_norm": 0.18708822232800043, "learning_rate": 8.81251837695362e-07, "loss": 0.2857, "step": 1410 }, { "epoch": 0.7111895161290323, "grad_norm": 0.17909240174834304, "learning_rate": 8.810780299469502e-07, "loss": 0.2839, "step": 1411 }, { "epoch": 0.7116935483870968, "grad_norm": 0.18475558009415183, "learning_rate": 8.809041122598875e-07, "loss": 0.2886, "step": 1412 }, { "epoch": 0.7121975806451613, "grad_norm": 0.18675745708053215, "learning_rate": 8.80730084684348e-07, "loss": 0.2709, "step": 1413 }, { "epoch": 0.7127016129032258, "grad_norm": 0.17219148431342304, "learning_rate": 8.805559472705375e-07, "loss": 0.2867, "step": 1414 }, { "epoch": 0.7132056451612904, "grad_norm": 0.19026935921175026, "learning_rate": 8.803817000686937e-07, "loss": 0.286, "step": 1415 }, { "epoch": 0.7137096774193549, "grad_norm": 0.19083134675660773, "learning_rate": 8.802073431290857e-07, "loss": 0.2782, "step": 1416 }, { "epoch": 0.7142137096774194, "grad_norm": 0.17182972306227576, "learning_rate": 8.800328765020146e-07, "loss": 0.2764, "step": 1417 }, { "epoch": 0.7147177419354839, "grad_norm": 0.1685745381686828, "learning_rate": 8.798583002378128e-07, "loss": 0.2793, "step": 1418 }, { "epoch": 0.7152217741935484, "grad_norm": 0.17135712002602105, "learning_rate": 8.796836143868445e-07, "loss": 0.283, "step": 1419 }, { "epoch": 0.7157258064516129, "grad_norm": 0.18005621021675017, "learning_rate": 8.795088189995052e-07, "loss": 0.2774, "step": 1420 }, { "epoch": 0.7162298387096774, "grad_norm": 0.1757591514793466, "learning_rate": 8.793339141262228e-07, "loss": 0.2938, "step": 1421 }, { "epoch": 0.7167338709677419, "grad_norm": 0.17230964184772374, "learning_rate": 8.791588998174559e-07, "loss": 0.2853, "step": 1422 }, { "epoch": 0.7172379032258065, "grad_norm": 0.19439553949354596, "learning_rate": 8.789837761236954e-07, "loss": 0.2706, "step": 1423 }, { "epoch": 0.717741935483871, "grad_norm": 0.18372365279361863, "learning_rate": 8.788085430954629e-07, "loss": 0.2756, "step": 1424 }, { "epoch": 0.7182459677419355, "grad_norm": 0.17528943529169524, "learning_rate": 8.786332007833123e-07, "loss": 0.2999, "step": 1425 }, { "epoch": 0.71875, "grad_norm": 0.1718666710733766, "learning_rate": 8.78457749237829e-07, "loss": 0.2828, "step": 1426 }, { "epoch": 0.7192540322580645, "grad_norm": 0.19363050866395, "learning_rate": 8.782821885096294e-07, "loss": 0.3002, "step": 1427 }, { "epoch": 0.719758064516129, "grad_norm": 0.24520324213496356, "learning_rate": 8.781065186493617e-07, "loss": 0.2983, "step": 1428 }, { "epoch": 0.7202620967741935, "grad_norm": 0.17701465130657107, "learning_rate": 8.779307397077056e-07, "loss": 0.2941, "step": 1429 }, { "epoch": 0.7207661290322581, "grad_norm": 0.17040112419340506, "learning_rate": 8.777548517353722e-07, "loss": 0.2758, "step": 1430 }, { "epoch": 0.7212701612903226, "grad_norm": 0.1741219490089326, "learning_rate": 8.775788547831042e-07, "loss": 0.2726, "step": 1431 }, { "epoch": 0.7217741935483871, "grad_norm": 0.17863386512186544, "learning_rate": 8.774027489016756e-07, "loss": 0.2979, "step": 1432 }, { "epoch": 0.7222782258064516, "grad_norm": 0.1894540149341605, "learning_rate": 8.772265341418918e-07, "loss": 0.2775, "step": 1433 }, { "epoch": 0.7227822580645161, "grad_norm": 0.17905751081552182, "learning_rate": 8.770502105545898e-07, "loss": 0.2759, "step": 1434 }, { "epoch": 0.7232862903225806, "grad_norm": 0.2250191897070994, "learning_rate": 8.768737781906377e-07, "loss": 0.2792, "step": 1435 }, { "epoch": 0.7237903225806451, "grad_norm": 0.1873361676914799, "learning_rate": 8.766972371009351e-07, "loss": 0.2849, "step": 1436 }, { "epoch": 0.7242943548387096, "grad_norm": 0.18214326291659777, "learning_rate": 8.765205873364132e-07, "loss": 0.3074, "step": 1437 }, { "epoch": 0.7247983870967742, "grad_norm": 0.18444546366186285, "learning_rate": 8.763438289480343e-07, "loss": 0.2974, "step": 1438 }, { "epoch": 0.7253024193548387, "grad_norm": 0.19982363016080767, "learning_rate": 8.76166961986792e-07, "loss": 0.2661, "step": 1439 }, { "epoch": 0.7258064516129032, "grad_norm": 0.1781094109669711, "learning_rate": 8.759899865037115e-07, "loss": 0.2823, "step": 1440 }, { "epoch": 0.7263104838709677, "grad_norm": 0.1810221506084548, "learning_rate": 8.758129025498488e-07, "loss": 0.2994, "step": 1441 }, { "epoch": 0.7268145161290323, "grad_norm": 0.17107486611373632, "learning_rate": 8.75635710176292e-07, "loss": 0.2733, "step": 1442 }, { "epoch": 0.7273185483870968, "grad_norm": 0.1920323851595878, "learning_rate": 8.754584094341597e-07, "loss": 0.2949, "step": 1443 }, { "epoch": 0.7278225806451613, "grad_norm": 0.18240009322545153, "learning_rate": 8.75281000374602e-07, "loss": 0.2936, "step": 1444 }, { "epoch": 0.7283266129032258, "grad_norm": 0.17921579254401834, "learning_rate": 8.751034830488006e-07, "loss": 0.2885, "step": 1445 }, { "epoch": 0.7288306451612904, "grad_norm": 0.17928589014845386, "learning_rate": 8.749258575079678e-07, "loss": 0.303, "step": 1446 }, { "epoch": 0.7293346774193549, "grad_norm": 0.183832758370968, "learning_rate": 8.747481238033478e-07, "loss": 0.2843, "step": 1447 }, { "epoch": 0.7298387096774194, "grad_norm": 0.1725772769965713, "learning_rate": 8.745702819862155e-07, "loss": 0.2702, "step": 1448 }, { "epoch": 0.7303427419354839, "grad_norm": 0.18098089374754683, "learning_rate": 8.743923321078772e-07, "loss": 0.2931, "step": 1449 }, { "epoch": 0.7308467741935484, "grad_norm": 0.19180274311843135, "learning_rate": 8.742142742196703e-07, "loss": 0.2856, "step": 1450 }, { "epoch": 0.7313508064516129, "grad_norm": 0.17825200131791993, "learning_rate": 8.740361083729634e-07, "loss": 0.2902, "step": 1451 }, { "epoch": 0.7318548387096774, "grad_norm": 0.17705711042600694, "learning_rate": 8.738578346191563e-07, "loss": 0.2989, "step": 1452 }, { "epoch": 0.7323588709677419, "grad_norm": 0.18722947981092045, "learning_rate": 8.736794530096797e-07, "loss": 0.2837, "step": 1453 }, { "epoch": 0.7328629032258065, "grad_norm": 0.17077286964873561, "learning_rate": 8.735009635959958e-07, "loss": 0.2837, "step": 1454 }, { "epoch": 0.733366935483871, "grad_norm": 0.17523945290796009, "learning_rate": 8.733223664295976e-07, "loss": 0.3038, "step": 1455 }, { "epoch": 0.7338709677419355, "grad_norm": 0.21527334289626718, "learning_rate": 8.731436615620095e-07, "loss": 0.2785, "step": 1456 }, { "epoch": 0.734375, "grad_norm": 0.17398355824973538, "learning_rate": 8.729648490447864e-07, "loss": 0.2849, "step": 1457 }, { "epoch": 0.7348790322580645, "grad_norm": 0.19117546742540467, "learning_rate": 8.727859289295147e-07, "loss": 0.2922, "step": 1458 }, { "epoch": 0.735383064516129, "grad_norm": 0.17261671346075716, "learning_rate": 8.726069012678119e-07, "loss": 0.2916, "step": 1459 }, { "epoch": 0.7358870967741935, "grad_norm": 0.17887054876224195, "learning_rate": 8.724277661113262e-07, "loss": 0.2873, "step": 1460 }, { "epoch": 0.7363911290322581, "grad_norm": 0.18810141505407652, "learning_rate": 8.722485235117369e-07, "loss": 0.2754, "step": 1461 }, { "epoch": 0.7368951612903226, "grad_norm": 0.1711988777610929, "learning_rate": 8.720691735207549e-07, "loss": 0.2795, "step": 1462 }, { "epoch": 0.7373991935483871, "grad_norm": 0.181403489411845, "learning_rate": 8.718897161901208e-07, "loss": 0.2812, "step": 1463 }, { "epoch": 0.7379032258064516, "grad_norm": 0.1741359968081314, "learning_rate": 8.717101515716074e-07, "loss": 0.2897, "step": 1464 }, { "epoch": 0.7384072580645161, "grad_norm": 0.18618138691906877, "learning_rate": 8.71530479717018e-07, "loss": 0.2862, "step": 1465 }, { "epoch": 0.7389112903225806, "grad_norm": 0.18489427612306297, "learning_rate": 8.713507006781867e-07, "loss": 0.2798, "step": 1466 }, { "epoch": 0.7394153225806451, "grad_norm": 0.2032269124376584, "learning_rate": 8.711708145069787e-07, "loss": 0.2826, "step": 1467 }, { "epoch": 0.7399193548387096, "grad_norm": 0.17163938575826002, "learning_rate": 8.709908212552899e-07, "loss": 0.284, "step": 1468 }, { "epoch": 0.7404233870967742, "grad_norm": 0.17551816826492267, "learning_rate": 8.708107209750473e-07, "loss": 0.2986, "step": 1469 }, { "epoch": 0.7409274193548387, "grad_norm": 0.1707997048312571, "learning_rate": 8.706305137182089e-07, "loss": 0.2795, "step": 1470 }, { "epoch": 0.7414314516129032, "grad_norm": 0.18060170508978393, "learning_rate": 8.70450199536763e-07, "loss": 0.2766, "step": 1471 }, { "epoch": 0.7419354838709677, "grad_norm": 0.17735325459231624, "learning_rate": 8.702697784827295e-07, "loss": 0.2748, "step": 1472 }, { "epoch": 0.7424395161290323, "grad_norm": 0.18812112690658891, "learning_rate": 8.700892506081587e-07, "loss": 0.2933, "step": 1473 }, { "epoch": 0.7429435483870968, "grad_norm": 0.17809974738272327, "learning_rate": 8.699086159651314e-07, "loss": 0.2793, "step": 1474 }, { "epoch": 0.7434475806451613, "grad_norm": 0.18817435016585066, "learning_rate": 8.697278746057602e-07, "loss": 0.29, "step": 1475 }, { "epoch": 0.7439516129032258, "grad_norm": 0.18251009568650395, "learning_rate": 8.695470265821871e-07, "loss": 0.2832, "step": 1476 }, { "epoch": 0.7444556451612904, "grad_norm": 0.17400386528720746, "learning_rate": 8.693660719465865e-07, "loss": 0.284, "step": 1477 }, { "epoch": 0.7449596774193549, "grad_norm": 0.22195293952140055, "learning_rate": 8.69185010751162e-07, "loss": 0.2793, "step": 1478 }, { "epoch": 0.7454637096774194, "grad_norm": 0.16879955516508652, "learning_rate": 8.690038430481489e-07, "loss": 0.2851, "step": 1479 }, { "epoch": 0.7459677419354839, "grad_norm": 0.17379174977002937, "learning_rate": 8.688225688898129e-07, "loss": 0.2997, "step": 1480 }, { "epoch": 0.7464717741935484, "grad_norm": 0.17053843236094848, "learning_rate": 8.686411883284505e-07, "loss": 0.2772, "step": 1481 }, { "epoch": 0.7469758064516129, "grad_norm": 0.1966482045417518, "learning_rate": 8.684597014163891e-07, "loss": 0.2921, "step": 1482 }, { "epoch": 0.7474798387096774, "grad_norm": 0.1842097594300451, "learning_rate": 8.682781082059861e-07, "loss": 0.2902, "step": 1483 }, { "epoch": 0.7479838709677419, "grad_norm": 0.17503229863085937, "learning_rate": 8.680964087496303e-07, "loss": 0.2974, "step": 1484 }, { "epoch": 0.7484879032258065, "grad_norm": 0.1848664095238554, "learning_rate": 8.679146030997409e-07, "loss": 0.2856, "step": 1485 }, { "epoch": 0.748991935483871, "grad_norm": 0.1762096766799662, "learning_rate": 8.677326913087675e-07, "loss": 0.2715, "step": 1486 }, { "epoch": 0.7494959677419355, "grad_norm": 0.20658420010486583, "learning_rate": 8.675506734291906e-07, "loss": 0.2974, "step": 1487 }, { "epoch": 0.75, "grad_norm": 0.17593053114884585, "learning_rate": 8.673685495135214e-07, "loss": 0.2883, "step": 1488 }, { "epoch": 0.7505040322580645, "grad_norm": 0.17964689526353583, "learning_rate": 8.671863196143014e-07, "loss": 0.2931, "step": 1489 }, { "epoch": 0.751008064516129, "grad_norm": 0.1794940966748331, "learning_rate": 8.670039837841028e-07, "loss": 0.2693, "step": 1490 }, { "epoch": 0.7515120967741935, "grad_norm": 0.17496758427902515, "learning_rate": 8.668215420755282e-07, "loss": 0.2708, "step": 1491 }, { "epoch": 0.7520161290322581, "grad_norm": 0.173160052063297, "learning_rate": 8.666389945412112e-07, "loss": 0.2903, "step": 1492 }, { "epoch": 0.7525201612903226, "grad_norm": 0.19227463269140316, "learning_rate": 8.664563412338154e-07, "loss": 0.2928, "step": 1493 }, { "epoch": 0.7530241935483871, "grad_norm": 0.20236023713329304, "learning_rate": 8.662735822060352e-07, "loss": 0.2776, "step": 1494 }, { "epoch": 0.7535282258064516, "grad_norm": 0.20307254860433052, "learning_rate": 8.660907175105956e-07, "loss": 0.2789, "step": 1495 }, { "epoch": 0.7540322580645161, "grad_norm": 0.18535559574459684, "learning_rate": 8.659077472002518e-07, "loss": 0.2977, "step": 1496 }, { "epoch": 0.7545362903225806, "grad_norm": 0.19262270536952245, "learning_rate": 8.657246713277895e-07, "loss": 0.2884, "step": 1497 }, { "epoch": 0.7550403225806451, "grad_norm": 0.17225775461167198, "learning_rate": 8.655414899460251e-07, "loss": 0.2798, "step": 1498 }, { "epoch": 0.7555443548387096, "grad_norm": 0.18058865422335652, "learning_rate": 8.653582031078053e-07, "loss": 0.2803, "step": 1499 }, { "epoch": 0.7560483870967742, "grad_norm": 0.193197708837322, "learning_rate": 8.651748108660072e-07, "loss": 0.285, "step": 1500 }, { "epoch": 0.7565524193548387, "grad_norm": 0.19695362977929617, "learning_rate": 8.649913132735383e-07, "loss": 0.2871, "step": 1501 }, { "epoch": 0.7570564516129032, "grad_norm": 0.17883642405940453, "learning_rate": 8.648077103833365e-07, "loss": 0.2799, "step": 1502 }, { "epoch": 0.7575604838709677, "grad_norm": 0.17172477648515241, "learning_rate": 8.646240022483699e-07, "loss": 0.292, "step": 1503 }, { "epoch": 0.7580645161290323, "grad_norm": 0.17575732800042274, "learning_rate": 8.644401889216377e-07, "loss": 0.2777, "step": 1504 }, { "epoch": 0.7585685483870968, "grad_norm": 0.17678434388091105, "learning_rate": 8.642562704561684e-07, "loss": 0.2956, "step": 1505 }, { "epoch": 0.7590725806451613, "grad_norm": 0.18058078973223446, "learning_rate": 8.640722469050217e-07, "loss": 0.2887, "step": 1506 }, { "epoch": 0.7595766129032258, "grad_norm": 0.1742859796225662, "learning_rate": 8.638881183212869e-07, "loss": 0.2787, "step": 1507 }, { "epoch": 0.7600806451612904, "grad_norm": 0.17057933697554573, "learning_rate": 8.637038847580842e-07, "loss": 0.2822, "step": 1508 }, { "epoch": 0.7605846774193549, "grad_norm": 0.17396796248874147, "learning_rate": 8.635195462685637e-07, "loss": 0.2898, "step": 1509 }, { "epoch": 0.7610887096774194, "grad_norm": 0.1754476685521095, "learning_rate": 8.633351029059061e-07, "loss": 0.2816, "step": 1510 }, { "epoch": 0.7615927419354839, "grad_norm": 0.1996562682521391, "learning_rate": 8.63150554723322e-07, "loss": 0.2802, "step": 1511 }, { "epoch": 0.7620967741935484, "grad_norm": 0.18654386311810392, "learning_rate": 8.629659017740525e-07, "loss": 0.2843, "step": 1512 }, { "epoch": 0.7626008064516129, "grad_norm": 0.1795688988462513, "learning_rate": 8.627811441113688e-07, "loss": 0.2972, "step": 1513 }, { "epoch": 0.7631048387096774, "grad_norm": 0.1700302651183351, "learning_rate": 8.625962817885723e-07, "loss": 0.276, "step": 1514 }, { "epoch": 0.7636088709677419, "grad_norm": 0.17276816293597855, "learning_rate": 8.624113148589947e-07, "loss": 0.2857, "step": 1515 }, { "epoch": 0.7641129032258065, "grad_norm": 0.17205240872531083, "learning_rate": 8.622262433759976e-07, "loss": 0.2898, "step": 1516 }, { "epoch": 0.764616935483871, "grad_norm": 0.19308777537983587, "learning_rate": 8.620410673929732e-07, "loss": 0.2911, "step": 1517 }, { "epoch": 0.7651209677419355, "grad_norm": 0.18066352556552442, "learning_rate": 8.618557869633438e-07, "loss": 0.2851, "step": 1518 }, { "epoch": 0.765625, "grad_norm": 0.1740551426558877, "learning_rate": 8.616704021405613e-07, "loss": 0.2754, "step": 1519 }, { "epoch": 0.7661290322580645, "grad_norm": 0.20220657101308243, "learning_rate": 8.614849129781084e-07, "loss": 0.2675, "step": 1520 }, { "epoch": 0.766633064516129, "grad_norm": 0.17142601346124745, "learning_rate": 8.612993195294971e-07, "loss": 0.2697, "step": 1521 }, { "epoch": 0.7671370967741935, "grad_norm": 0.20383494070933988, "learning_rate": 8.611136218482704e-07, "loss": 0.2917, "step": 1522 }, { "epoch": 0.7676411290322581, "grad_norm": 0.17942668361212222, "learning_rate": 8.609278199880007e-07, "loss": 0.2996, "step": 1523 }, { "epoch": 0.7681451612903226, "grad_norm": 0.21582357596764432, "learning_rate": 8.607419140022908e-07, "loss": 0.2905, "step": 1524 }, { "epoch": 0.7686491935483871, "grad_norm": 0.19537946016922536, "learning_rate": 8.605559039447734e-07, "loss": 0.2897, "step": 1525 }, { "epoch": 0.7691532258064516, "grad_norm": 0.17446732973784299, "learning_rate": 8.603697898691112e-07, "loss": 0.2922, "step": 1526 }, { "epoch": 0.7696572580645161, "grad_norm": 0.17940436217661643, "learning_rate": 8.601835718289971e-07, "loss": 0.2897, "step": 1527 }, { "epoch": 0.7701612903225806, "grad_norm": 0.16967508800400594, "learning_rate": 8.599972498781536e-07, "loss": 0.2838, "step": 1528 }, { "epoch": 0.7706653225806451, "grad_norm": 0.17730457999766902, "learning_rate": 8.598108240703337e-07, "loss": 0.2863, "step": 1529 }, { "epoch": 0.7711693548387096, "grad_norm": 0.17432021927654012, "learning_rate": 8.5962429445932e-07, "loss": 0.2838, "step": 1530 }, { "epoch": 0.7716733870967742, "grad_norm": 0.18292364816715403, "learning_rate": 8.594376610989249e-07, "loss": 0.2889, "step": 1531 }, { "epoch": 0.7721774193548387, "grad_norm": 0.1692258045511517, "learning_rate": 8.592509240429913e-07, "loss": 0.2859, "step": 1532 }, { "epoch": 0.7726814516129032, "grad_norm": 0.17793048496272398, "learning_rate": 8.590640833453916e-07, "loss": 0.2962, "step": 1533 }, { "epoch": 0.7731854838709677, "grad_norm": 0.170590500112818, "learning_rate": 8.58877139060028e-07, "loss": 0.273, "step": 1534 }, { "epoch": 0.7736895161290323, "grad_norm": 0.1748601955404486, "learning_rate": 8.58690091240833e-07, "loss": 0.2844, "step": 1535 }, { "epoch": 0.7741935483870968, "grad_norm": 0.18730945527731682, "learning_rate": 8.585029399417687e-07, "loss": 0.3091, "step": 1536 }, { "epoch": 0.7746975806451613, "grad_norm": 0.18332986307488439, "learning_rate": 8.583156852168269e-07, "loss": 0.2793, "step": 1537 }, { "epoch": 0.7752016129032258, "grad_norm": 0.18440491915084045, "learning_rate": 8.581283271200297e-07, "loss": 0.2862, "step": 1538 }, { "epoch": 0.7757056451612904, "grad_norm": 0.173582809480461, "learning_rate": 8.579408657054286e-07, "loss": 0.2792, "step": 1539 }, { "epoch": 0.7762096774193549, "grad_norm": 0.18014379318891854, "learning_rate": 8.577533010271049e-07, "loss": 0.2919, "step": 1540 }, { "epoch": 0.7767137096774194, "grad_norm": 0.1722503671402626, "learning_rate": 8.575656331391702e-07, "loss": 0.2697, "step": 1541 }, { "epoch": 0.7772177419354839, "grad_norm": 0.1727617532348802, "learning_rate": 8.573778620957652e-07, "loss": 0.3044, "step": 1542 }, { "epoch": 0.7777217741935484, "grad_norm": 0.17458552774869926, "learning_rate": 8.571899879510609e-07, "loss": 0.3022, "step": 1543 }, { "epoch": 0.7782258064516129, "grad_norm": 0.19575211026743364, "learning_rate": 8.570020107592579e-07, "loss": 0.2729, "step": 1544 }, { "epoch": 0.7787298387096774, "grad_norm": 0.18571171303798414, "learning_rate": 8.568139305745861e-07, "loss": 0.2797, "step": 1545 }, { "epoch": 0.7792338709677419, "grad_norm": 0.19155677402308682, "learning_rate": 8.566257474513057e-07, "loss": 0.2999, "step": 1546 }, { "epoch": 0.7797379032258065, "grad_norm": 0.19852085649960613, "learning_rate": 8.564374614437065e-07, "loss": 0.2948, "step": 1547 }, { "epoch": 0.780241935483871, "grad_norm": 0.18180259804141408, "learning_rate": 8.562490726061074e-07, "loss": 0.272, "step": 1548 }, { "epoch": 0.7807459677419355, "grad_norm": 0.1757617028962345, "learning_rate": 8.560605809928578e-07, "loss": 0.2899, "step": 1549 }, { "epoch": 0.78125, "grad_norm": 0.18002181651881002, "learning_rate": 8.558719866583364e-07, "loss": 0.2801, "step": 1550 }, { "epoch": 0.7817540322580645, "grad_norm": 0.18391600786228307, "learning_rate": 8.556832896569512e-07, "loss": 0.2953, "step": 1551 }, { "epoch": 0.782258064516129, "grad_norm": 0.1696821891773392, "learning_rate": 8.554944900431405e-07, "loss": 0.2765, "step": 1552 }, { "epoch": 0.7827620967741935, "grad_norm": 0.17250755946675386, "learning_rate": 8.553055878713714e-07, "loss": 0.2742, "step": 1553 }, { "epoch": 0.7832661290322581, "grad_norm": 0.17460773929441345, "learning_rate": 8.551165831961414e-07, "loss": 0.2941, "step": 1554 }, { "epoch": 0.7837701612903226, "grad_norm": 0.17494634704080653, "learning_rate": 8.549274760719767e-07, "loss": 0.2838, "step": 1555 }, { "epoch": 0.7842741935483871, "grad_norm": 0.18400774964808703, "learning_rate": 8.547382665534339e-07, "loss": 0.2913, "step": 1556 }, { "epoch": 0.7847782258064516, "grad_norm": 0.17748115232473677, "learning_rate": 8.545489546950988e-07, "loss": 0.2859, "step": 1557 }, { "epoch": 0.7852822580645161, "grad_norm": 0.18218986931380948, "learning_rate": 8.543595405515864e-07, "loss": 0.2812, "step": 1558 }, { "epoch": 0.7857862903225806, "grad_norm": 0.1969916491736838, "learning_rate": 8.541700241775419e-07, "loss": 0.2888, "step": 1559 }, { "epoch": 0.7862903225806451, "grad_norm": 0.20960556550108175, "learning_rate": 8.539804056276393e-07, "loss": 0.2896, "step": 1560 }, { "epoch": 0.7867943548387096, "grad_norm": 0.18784903454787713, "learning_rate": 8.537906849565824e-07, "loss": 0.292, "step": 1561 }, { "epoch": 0.7872983870967742, "grad_norm": 0.17668988962371776, "learning_rate": 8.536008622191047e-07, "loss": 0.2832, "step": 1562 }, { "epoch": 0.7878024193548387, "grad_norm": 0.1938218281223095, "learning_rate": 8.534109374699685e-07, "loss": 0.2806, "step": 1563 }, { "epoch": 0.7883064516129032, "grad_norm": 0.1724503265255416, "learning_rate": 8.532209107639661e-07, "loss": 0.2836, "step": 1564 }, { "epoch": 0.7888104838709677, "grad_norm": 0.216684297634416, "learning_rate": 8.530307821559192e-07, "loss": 0.2834, "step": 1565 }, { "epoch": 0.7893145161290323, "grad_norm": 0.1952086001178248, "learning_rate": 8.528405517006785e-07, "loss": 0.2703, "step": 1566 }, { "epoch": 0.7898185483870968, "grad_norm": 0.17917540283085726, "learning_rate": 8.526502194531242e-07, "loss": 0.2795, "step": 1567 }, { "epoch": 0.7903225806451613, "grad_norm": 0.16957021131556896, "learning_rate": 8.524597854681663e-07, "loss": 0.2849, "step": 1568 }, { "epoch": 0.7908266129032258, "grad_norm": 0.17315335375074736, "learning_rate": 8.522692498007436e-07, "loss": 0.2706, "step": 1569 }, { "epoch": 0.7913306451612904, "grad_norm": 0.17415225412091026, "learning_rate": 8.520786125058246e-07, "loss": 0.2784, "step": 1570 }, { "epoch": 0.7918346774193549, "grad_norm": 0.19715463128787936, "learning_rate": 8.518878736384067e-07, "loss": 0.2791, "step": 1571 }, { "epoch": 0.7923387096774194, "grad_norm": 0.18485659414062103, "learning_rate": 8.516970332535174e-07, "loss": 0.2883, "step": 1572 }, { "epoch": 0.7928427419354839, "grad_norm": 0.17412481517409986, "learning_rate": 8.515060914062124e-07, "loss": 0.2891, "step": 1573 }, { "epoch": 0.7933467741935484, "grad_norm": 0.19186859393133207, "learning_rate": 8.513150481515777e-07, "loss": 0.2861, "step": 1574 }, { "epoch": 0.7938508064516129, "grad_norm": 0.17924622608650354, "learning_rate": 8.511239035447277e-07, "loss": 0.2769, "step": 1575 }, { "epoch": 0.7943548387096774, "grad_norm": 0.21636343101039252, "learning_rate": 8.509326576408066e-07, "loss": 0.2809, "step": 1576 }, { "epoch": 0.7948588709677419, "grad_norm": 0.1988798854713325, "learning_rate": 8.507413104949878e-07, "loss": 0.2817, "step": 1577 }, { "epoch": 0.7953629032258065, "grad_norm": 0.20133302174026105, "learning_rate": 8.505498621624738e-07, "loss": 0.273, "step": 1578 }, { "epoch": 0.795866935483871, "grad_norm": 0.18247301896520096, "learning_rate": 8.503583126984959e-07, "loss": 0.2851, "step": 1579 }, { "epoch": 0.7963709677419355, "grad_norm": 0.179121678370444, "learning_rate": 8.501666621583152e-07, "loss": 0.2817, "step": 1580 }, { "epoch": 0.796875, "grad_norm": 0.1794015027687013, "learning_rate": 8.499749105972216e-07, "loss": 0.277, "step": 1581 }, { "epoch": 0.7973790322580645, "grad_norm": 0.18799369028010127, "learning_rate": 8.497830580705343e-07, "loss": 0.2811, "step": 1582 }, { "epoch": 0.797883064516129, "grad_norm": 0.19305091528117443, "learning_rate": 8.495911046336015e-07, "loss": 0.2995, "step": 1583 }, { "epoch": 0.7983870967741935, "grad_norm": 0.2066980461918033, "learning_rate": 8.493990503418007e-07, "loss": 0.2982, "step": 1584 }, { "epoch": 0.7988911290322581, "grad_norm": 0.17520616254907886, "learning_rate": 8.492068952505382e-07, "loss": 0.271, "step": 1585 }, { "epoch": 0.7993951612903226, "grad_norm": 0.17626401945753917, "learning_rate": 8.490146394152497e-07, "loss": 0.2858, "step": 1586 }, { "epoch": 0.7998991935483871, "grad_norm": 0.1752484381904052, "learning_rate": 8.488222828913998e-07, "loss": 0.2862, "step": 1587 }, { "epoch": 0.8004032258064516, "grad_norm": 0.18080248190940781, "learning_rate": 8.486298257344821e-07, "loss": 0.2815, "step": 1588 }, { "epoch": 0.8009072580645161, "grad_norm": 0.18782657218788468, "learning_rate": 8.484372680000193e-07, "loss": 0.2907, "step": 1589 }, { "epoch": 0.8014112903225806, "grad_norm": 0.17866164980608917, "learning_rate": 8.482446097435631e-07, "loss": 0.2863, "step": 1590 }, { "epoch": 0.8019153225806451, "grad_norm": 0.16998904102569148, "learning_rate": 8.480518510206942e-07, "loss": 0.2866, "step": 1591 }, { "epoch": 0.8024193548387096, "grad_norm": 0.17671260570065095, "learning_rate": 8.478589918870225e-07, "loss": 0.2965, "step": 1592 }, { "epoch": 0.8024193548387096, "eval_loss": 0.31474894285202026, "eval_runtime": 17.2161, "eval_samples_per_second": 49.663, "eval_steps_per_second": 1.046, "step": 1592 }, { "epoch": 0.8029233870967742, "grad_norm": 0.17663891905474427, "learning_rate": 8.476660323981863e-07, "loss": 0.2894, "step": 1593 }, { "epoch": 0.8034274193548387, "grad_norm": 0.2011698318082641, "learning_rate": 8.474729726098537e-07, "loss": 0.2922, "step": 1594 }, { "epoch": 0.8039314516129032, "grad_norm": 0.20386721072025002, "learning_rate": 8.472798125777208e-07, "loss": 0.2797, "step": 1595 }, { "epoch": 0.8044354838709677, "grad_norm": 0.21744706928084864, "learning_rate": 8.470865523575133e-07, "loss": 0.2773, "step": 1596 }, { "epoch": 0.8049395161290323, "grad_norm": 0.1749022648400484, "learning_rate": 8.468931920049855e-07, "loss": 0.2893, "step": 1597 }, { "epoch": 0.8054435483870968, "grad_norm": 0.17378425941567904, "learning_rate": 8.466997315759207e-07, "loss": 0.2885, "step": 1598 }, { "epoch": 0.8059475806451613, "grad_norm": 0.2151441210122262, "learning_rate": 8.465061711261312e-07, "loss": 0.2812, "step": 1599 }, { "epoch": 0.8064516129032258, "grad_norm": 0.20189295912194238, "learning_rate": 8.463125107114576e-07, "loss": 0.2938, "step": 1600 }, { "epoch": 0.8069556451612904, "grad_norm": 0.1737385784520349, "learning_rate": 8.461187503877701e-07, "loss": 0.2913, "step": 1601 }, { "epoch": 0.8074596774193549, "grad_norm": 0.17511209441842182, "learning_rate": 8.459248902109671e-07, "loss": 0.2972, "step": 1602 }, { "epoch": 0.8079637096774194, "grad_norm": 0.17927894922934345, "learning_rate": 8.457309302369762e-07, "loss": 0.2751, "step": 1603 }, { "epoch": 0.8084677419354839, "grad_norm": 0.1711157379063894, "learning_rate": 8.455368705217536e-07, "loss": 0.293, "step": 1604 }, { "epoch": 0.8089717741935484, "grad_norm": 0.20561980849817957, "learning_rate": 8.453427111212844e-07, "loss": 0.2804, "step": 1605 }, { "epoch": 0.8094758064516129, "grad_norm": 0.18016754167062388, "learning_rate": 8.451484520915823e-07, "loss": 0.2931, "step": 1606 }, { "epoch": 0.8099798387096774, "grad_norm": 0.19807530942971208, "learning_rate": 8.449540934886898e-07, "loss": 0.2947, "step": 1607 }, { "epoch": 0.8104838709677419, "grad_norm": 0.18074849101785265, "learning_rate": 8.447596353686783e-07, "loss": 0.3026, "step": 1608 }, { "epoch": 0.8109879032258065, "grad_norm": 0.18227592034924972, "learning_rate": 8.445650777876477e-07, "loss": 0.2942, "step": 1609 }, { "epoch": 0.811491935483871, "grad_norm": 0.1807387057488744, "learning_rate": 8.443704208017265e-07, "loss": 0.2905, "step": 1610 }, { "epoch": 0.8119959677419355, "grad_norm": 0.17752462791309626, "learning_rate": 8.441756644670721e-07, "loss": 0.3053, "step": 1611 }, { "epoch": 0.8125, "grad_norm": 0.17960458304361665, "learning_rate": 8.439808088398708e-07, "loss": 0.2807, "step": 1612 }, { "epoch": 0.8130040322580645, "grad_norm": 0.17933572002728534, "learning_rate": 8.437858539763368e-07, "loss": 0.2874, "step": 1613 }, { "epoch": 0.813508064516129, "grad_norm": 0.17388277996545867, "learning_rate": 8.435907999327137e-07, "loss": 0.2872, "step": 1614 }, { "epoch": 0.8140120967741935, "grad_norm": 0.20696380286831806, "learning_rate": 8.433956467652731e-07, "loss": 0.296, "step": 1615 }, { "epoch": 0.8145161290322581, "grad_norm": 0.19036323172621478, "learning_rate": 8.432003945303157e-07, "loss": 0.2919, "step": 1616 }, { "epoch": 0.8150201612903226, "grad_norm": 0.17417826300098163, "learning_rate": 8.430050432841705e-07, "loss": 0.2839, "step": 1617 }, { "epoch": 0.8155241935483871, "grad_norm": 0.1764029810149293, "learning_rate": 8.428095930831951e-07, "loss": 0.2801, "step": 1618 }, { "epoch": 0.8160282258064516, "grad_norm": 0.18297026792189108, "learning_rate": 8.426140439837758e-07, "loss": 0.2907, "step": 1619 }, { "epoch": 0.8165322580645161, "grad_norm": 0.1821471162865555, "learning_rate": 8.42418396042327e-07, "loss": 0.2871, "step": 1620 }, { "epoch": 0.8170362903225806, "grad_norm": 0.18365629188801313, "learning_rate": 8.422226493152923e-07, "loss": 0.2729, "step": 1621 }, { "epoch": 0.8175403225806451, "grad_norm": 0.17997053499497517, "learning_rate": 8.420268038591432e-07, "loss": 0.2928, "step": 1622 }, { "epoch": 0.8180443548387096, "grad_norm": 0.18199210579188108, "learning_rate": 8.418308597303798e-07, "loss": 0.2833, "step": 1623 }, { "epoch": 0.8185483870967742, "grad_norm": 0.17684377413059332, "learning_rate": 8.41634816985531e-07, "loss": 0.2821, "step": 1624 }, { "epoch": 0.8190524193548387, "grad_norm": 0.1729650092760548, "learning_rate": 8.414386756811538e-07, "loss": 0.2915, "step": 1625 }, { "epoch": 0.8195564516129032, "grad_norm": 0.16853170774887175, "learning_rate": 8.412424358738337e-07, "loss": 0.296, "step": 1626 }, { "epoch": 0.8200604838709677, "grad_norm": 0.17189289211370995, "learning_rate": 8.410460976201847e-07, "loss": 0.3022, "step": 1627 }, { "epoch": 0.8205645161290323, "grad_norm": 0.18303909561880122, "learning_rate": 8.408496609768494e-07, "loss": 0.27, "step": 1628 }, { "epoch": 0.8210685483870968, "grad_norm": 0.1689191180195712, "learning_rate": 8.406531260004983e-07, "loss": 0.283, "step": 1629 }, { "epoch": 0.8215725806451613, "grad_norm": 0.19059066020698312, "learning_rate": 8.404564927478304e-07, "loss": 0.2733, "step": 1630 }, { "epoch": 0.8220766129032258, "grad_norm": 0.18412586501703707, "learning_rate": 8.402597612755736e-07, "loss": 0.2958, "step": 1631 }, { "epoch": 0.8225806451612904, "grad_norm": 0.1865008483611006, "learning_rate": 8.400629316404833e-07, "loss": 0.28, "step": 1632 }, { "epoch": 0.8230846774193549, "grad_norm": 0.1848533702786811, "learning_rate": 8.398660038993439e-07, "loss": 0.2806, "step": 1633 }, { "epoch": 0.8235887096774194, "grad_norm": 0.1737421957224456, "learning_rate": 8.396689781089676e-07, "loss": 0.285, "step": 1634 }, { "epoch": 0.8240927419354839, "grad_norm": 0.16653152620902592, "learning_rate": 8.394718543261954e-07, "loss": 0.2723, "step": 1635 }, { "epoch": 0.8245967741935484, "grad_norm": 0.1758054798983086, "learning_rate": 8.392746326078961e-07, "loss": 0.2889, "step": 1636 }, { "epoch": 0.8251008064516129, "grad_norm": 0.17075324603300618, "learning_rate": 8.39077313010967e-07, "loss": 0.2858, "step": 1637 }, { "epoch": 0.8256048387096774, "grad_norm": 0.18600083079394014, "learning_rate": 8.388798955923335e-07, "loss": 0.2735, "step": 1638 }, { "epoch": 0.8261088709677419, "grad_norm": 0.1961620789751493, "learning_rate": 8.386823804089496e-07, "loss": 0.2992, "step": 1639 }, { "epoch": 0.8266129032258065, "grad_norm": 0.2056030735817931, "learning_rate": 8.384847675177968e-07, "loss": 0.2966, "step": 1640 }, { "epoch": 0.827116935483871, "grad_norm": 0.178659647296446, "learning_rate": 8.382870569758853e-07, "loss": 0.2943, "step": 1641 }, { "epoch": 0.8276209677419355, "grad_norm": 0.17144740006998577, "learning_rate": 8.380892488402535e-07, "loss": 0.2725, "step": 1642 }, { "epoch": 0.828125, "grad_norm": 0.2048573646210055, "learning_rate": 8.378913431679677e-07, "loss": 0.2654, "step": 1643 }, { "epoch": 0.8286290322580645, "grad_norm": 0.19608901393965525, "learning_rate": 8.376933400161226e-07, "loss": 0.2657, "step": 1644 }, { "epoch": 0.829133064516129, "grad_norm": 0.16996202799605425, "learning_rate": 8.374952394418409e-07, "loss": 0.2857, "step": 1645 }, { "epoch": 0.8296370967741935, "grad_norm": 0.1804403768350341, "learning_rate": 8.37297041502273e-07, "loss": 0.2889, "step": 1646 }, { "epoch": 0.8301411290322581, "grad_norm": 0.18021938104304921, "learning_rate": 8.370987462545984e-07, "loss": 0.2836, "step": 1647 }, { "epoch": 0.8306451612903226, "grad_norm": 0.18418845425920574, "learning_rate": 8.369003537560237e-07, "loss": 0.2812, "step": 1648 }, { "epoch": 0.8311491935483871, "grad_norm": 0.2151893686661586, "learning_rate": 8.367018640637838e-07, "loss": 0.2856, "step": 1649 }, { "epoch": 0.8316532258064516, "grad_norm": 0.25324081072788207, "learning_rate": 8.365032772351419e-07, "loss": 0.2745, "step": 1650 }, { "epoch": 0.8321572580645161, "grad_norm": 0.17621766684272705, "learning_rate": 8.363045933273889e-07, "loss": 0.2813, "step": 1651 }, { "epoch": 0.8326612903225806, "grad_norm": 0.1810792109703597, "learning_rate": 8.361058123978442e-07, "loss": 0.2958, "step": 1652 }, { "epoch": 0.8331653225806451, "grad_norm": 0.17290815775300664, "learning_rate": 8.359069345038548e-07, "loss": 0.2821, "step": 1653 }, { "epoch": 0.8336693548387096, "grad_norm": 0.17231901628427526, "learning_rate": 8.357079597027954e-07, "loss": 0.2793, "step": 1654 }, { "epoch": 0.8341733870967742, "grad_norm": 0.19031643192982242, "learning_rate": 8.355088880520693e-07, "loss": 0.2931, "step": 1655 }, { "epoch": 0.8346774193548387, "grad_norm": 0.1773771991850869, "learning_rate": 8.353097196091074e-07, "loss": 0.2897, "step": 1656 }, { "epoch": 0.8351814516129032, "grad_norm": 0.18048345032642082, "learning_rate": 8.351104544313685e-07, "loss": 0.2836, "step": 1657 }, { "epoch": 0.8356854838709677, "grad_norm": 0.18443696405092166, "learning_rate": 8.349110925763393e-07, "loss": 0.2846, "step": 1658 }, { "epoch": 0.8361895161290323, "grad_norm": 0.1772897016179952, "learning_rate": 8.347116341015347e-07, "loss": 0.2932, "step": 1659 }, { "epoch": 0.8366935483870968, "grad_norm": 0.1790343811345245, "learning_rate": 8.34512079064497e-07, "loss": 0.2915, "step": 1660 }, { "epoch": 0.8371975806451613, "grad_norm": 0.18840621984787398, "learning_rate": 8.343124275227968e-07, "loss": 0.2816, "step": 1661 }, { "epoch": 0.8377016129032258, "grad_norm": 0.17628267263776604, "learning_rate": 8.341126795340321e-07, "loss": 0.2805, "step": 1662 }, { "epoch": 0.8382056451612904, "grad_norm": 0.17600045722386132, "learning_rate": 8.339128351558291e-07, "loss": 0.29, "step": 1663 }, { "epoch": 0.8387096774193549, "grad_norm": 0.182325857327577, "learning_rate": 8.337128944458415e-07, "loss": 0.2903, "step": 1664 }, { "epoch": 0.8392137096774194, "grad_norm": 0.19535349793236487, "learning_rate": 8.335128574617513e-07, "loss": 0.2858, "step": 1665 }, { "epoch": 0.8397177419354839, "grad_norm": 0.27705115942369163, "learning_rate": 8.333127242612677e-07, "loss": 0.2857, "step": 1666 }, { "epoch": 0.8402217741935484, "grad_norm": 0.185724216553934, "learning_rate": 8.331124949021279e-07, "loss": 0.2583, "step": 1667 }, { "epoch": 0.8407258064516129, "grad_norm": 0.18254445448952297, "learning_rate": 8.329121694420969e-07, "loss": 0.2915, "step": 1668 }, { "epoch": 0.8412298387096774, "grad_norm": 0.18038012693100844, "learning_rate": 8.327117479389672e-07, "loss": 0.2792, "step": 1669 }, { "epoch": 0.8417338709677419, "grad_norm": 0.19274233034208005, "learning_rate": 8.325112304505592e-07, "loss": 0.2805, "step": 1670 }, { "epoch": 0.8422379032258065, "grad_norm": 0.17681044961531925, "learning_rate": 8.323106170347212e-07, "loss": 0.2723, "step": 1671 }, { "epoch": 0.842741935483871, "grad_norm": 0.17251043391808404, "learning_rate": 8.321099077493285e-07, "loss": 0.2696, "step": 1672 }, { "epoch": 0.8432459677419355, "grad_norm": 0.1752390712831652, "learning_rate": 8.319091026522848e-07, "loss": 0.2738, "step": 1673 }, { "epoch": 0.84375, "grad_norm": 0.17935779324663959, "learning_rate": 8.317082018015211e-07, "loss": 0.2862, "step": 1674 }, { "epoch": 0.8442540322580645, "grad_norm": 0.19723380525297224, "learning_rate": 8.315072052549961e-07, "loss": 0.2961, "step": 1675 }, { "epoch": 0.844758064516129, "grad_norm": 0.17594438243367191, "learning_rate": 8.313061130706959e-07, "loss": 0.2838, "step": 1676 }, { "epoch": 0.8452620967741935, "grad_norm": 0.18502761220597608, "learning_rate": 8.311049253066344e-07, "loss": 0.2904, "step": 1677 }, { "epoch": 0.8457661290322581, "grad_norm": 0.1735632363360221, "learning_rate": 8.30903642020853e-07, "loss": 0.286, "step": 1678 }, { "epoch": 0.8462701612903226, "grad_norm": 0.1685121845779261, "learning_rate": 8.307022632714208e-07, "loss": 0.266, "step": 1679 }, { "epoch": 0.8467741935483871, "grad_norm": 0.17378234473772378, "learning_rate": 8.305007891164341e-07, "loss": 0.2904, "step": 1680 }, { "epoch": 0.8472782258064516, "grad_norm": 0.2032632255195518, "learning_rate": 8.302992196140173e-07, "loss": 0.2895, "step": 1681 }, { "epoch": 0.8477822580645161, "grad_norm": 0.17748247042521456, "learning_rate": 8.300975548223214e-07, "loss": 0.3003, "step": 1682 }, { "epoch": 0.8482862903225806, "grad_norm": 0.195328665507149, "learning_rate": 8.298957947995261e-07, "loss": 0.2832, "step": 1683 }, { "epoch": 0.8487903225806451, "grad_norm": 0.17552608040636258, "learning_rate": 8.296939396038375e-07, "loss": 0.2866, "step": 1684 }, { "epoch": 0.8492943548387096, "grad_norm": 0.17824578366133345, "learning_rate": 8.294919892934896e-07, "loss": 0.2779, "step": 1685 }, { "epoch": 0.8497983870967742, "grad_norm": 0.17486942680363315, "learning_rate": 8.29289943926744e-07, "loss": 0.2617, "step": 1686 }, { "epoch": 0.8503024193548387, "grad_norm": 0.18914845225754362, "learning_rate": 8.290878035618893e-07, "loss": 0.2812, "step": 1687 }, { "epoch": 0.8508064516129032, "grad_norm": 0.18115901123985698, "learning_rate": 8.28885568257242e-07, "loss": 0.3004, "step": 1688 }, { "epoch": 0.8513104838709677, "grad_norm": 0.17449979315444422, "learning_rate": 8.286832380711454e-07, "loss": 0.273, "step": 1689 }, { "epoch": 0.8518145161290323, "grad_norm": 0.1782814890164981, "learning_rate": 8.284808130619708e-07, "loss": 0.2997, "step": 1690 }, { "epoch": 0.8523185483870968, "grad_norm": 0.1710342855678077, "learning_rate": 8.282782932881165e-07, "loss": 0.2864, "step": 1691 }, { "epoch": 0.8528225806451613, "grad_norm": 0.18551314368513988, "learning_rate": 8.280756788080081e-07, "loss": 0.2847, "step": 1692 }, { "epoch": 0.8533266129032258, "grad_norm": 0.17793439535374442, "learning_rate": 8.278729696800988e-07, "loss": 0.3006, "step": 1693 }, { "epoch": 0.8538306451612904, "grad_norm": 0.17754655120950547, "learning_rate": 8.276701659628686e-07, "loss": 0.2889, "step": 1694 }, { "epoch": 0.8543346774193549, "grad_norm": 0.18489194070233667, "learning_rate": 8.274672677148256e-07, "loss": 0.2725, "step": 1695 }, { "epoch": 0.8548387096774194, "grad_norm": 0.1967447154512761, "learning_rate": 8.272642749945042e-07, "loss": 0.2857, "step": 1696 }, { "epoch": 0.8553427419354839, "grad_norm": 0.17696581032696806, "learning_rate": 8.270611878604669e-07, "loss": 0.2793, "step": 1697 }, { "epoch": 0.8558467741935484, "grad_norm": 0.17041142623174294, "learning_rate": 8.268580063713028e-07, "loss": 0.2708, "step": 1698 }, { "epoch": 0.8563508064516129, "grad_norm": 0.17554540593528697, "learning_rate": 8.266547305856288e-07, "loss": 0.2708, "step": 1699 }, { "epoch": 0.8568548387096774, "grad_norm": 0.17308164296880016, "learning_rate": 8.264513605620884e-07, "loss": 0.2738, "step": 1700 }, { "epoch": 0.8573588709677419, "grad_norm": 0.2285975263899735, "learning_rate": 8.262478963593529e-07, "loss": 0.2732, "step": 1701 }, { "epoch": 0.8578629032258065, "grad_norm": 0.1765909866610657, "learning_rate": 8.260443380361201e-07, "loss": 0.2822, "step": 1702 }, { "epoch": 0.858366935483871, "grad_norm": 0.18612762071005345, "learning_rate": 8.258406856511157e-07, "loss": 0.2897, "step": 1703 }, { "epoch": 0.8588709677419355, "grad_norm": 0.18274802390819692, "learning_rate": 8.256369392630918e-07, "loss": 0.2826, "step": 1704 }, { "epoch": 0.859375, "grad_norm": 0.1702610425035792, "learning_rate": 8.254330989308283e-07, "loss": 0.2825, "step": 1705 }, { "epoch": 0.8598790322580645, "grad_norm": 0.18746425872965825, "learning_rate": 8.252291647131315e-07, "loss": 0.2831, "step": 1706 }, { "epoch": 0.860383064516129, "grad_norm": 0.1729312918615105, "learning_rate": 8.250251366688357e-07, "loss": 0.2788, "step": 1707 }, { "epoch": 0.8608870967741935, "grad_norm": 0.17310685779871965, "learning_rate": 8.248210148568011e-07, "loss": 0.2807, "step": 1708 }, { "epoch": 0.8613911290322581, "grad_norm": 0.17388995344225858, "learning_rate": 8.246167993359159e-07, "loss": 0.278, "step": 1709 }, { "epoch": 0.8618951612903226, "grad_norm": 0.17561470000841797, "learning_rate": 8.244124901650951e-07, "loss": 0.2852, "step": 1710 }, { "epoch": 0.8623991935483871, "grad_norm": 0.19059405969768753, "learning_rate": 8.242080874032804e-07, "loss": 0.2811, "step": 1711 }, { "epoch": 0.8629032258064516, "grad_norm": 0.17640465354683116, "learning_rate": 8.24003591109441e-07, "loss": 0.2837, "step": 1712 }, { "epoch": 0.8634072580645161, "grad_norm": 0.1946696258918474, "learning_rate": 8.237990013425725e-07, "loss": 0.2943, "step": 1713 }, { "epoch": 0.8639112903225806, "grad_norm": 0.17143548711069195, "learning_rate": 8.23594318161698e-07, "loss": 0.2844, "step": 1714 }, { "epoch": 0.8644153225806451, "grad_norm": 0.170912525533573, "learning_rate": 8.233895416258673e-07, "loss": 0.2828, "step": 1715 }, { "epoch": 0.8649193548387096, "grad_norm": 0.17030957212565428, "learning_rate": 8.231846717941572e-07, "loss": 0.2841, "step": 1716 }, { "epoch": 0.8654233870967742, "grad_norm": 0.18030711489803564, "learning_rate": 8.229797087256711e-07, "loss": 0.2892, "step": 1717 }, { "epoch": 0.8659274193548387, "grad_norm": 0.1811276763324713, "learning_rate": 8.2277465247954e-07, "loss": 0.2833, "step": 1718 }, { "epoch": 0.8664314516129032, "grad_norm": 0.18181075140547173, "learning_rate": 8.22569503114921e-07, "loss": 0.2936, "step": 1719 }, { "epoch": 0.8669354838709677, "grad_norm": 0.17610807113952007, "learning_rate": 8.223642606909986e-07, "loss": 0.2918, "step": 1720 }, { "epoch": 0.8674395161290323, "grad_norm": 0.17498207358667517, "learning_rate": 8.221589252669841e-07, "loss": 0.2706, "step": 1721 }, { "epoch": 0.8679435483870968, "grad_norm": 0.18462559185235636, "learning_rate": 8.219534969021151e-07, "loss": 0.2862, "step": 1722 }, { "epoch": 0.8684475806451613, "grad_norm": 0.17613631479568562, "learning_rate": 8.217479756556567e-07, "loss": 0.2901, "step": 1723 }, { "epoch": 0.8689516129032258, "grad_norm": 0.17518479858622174, "learning_rate": 8.215423615869005e-07, "loss": 0.2611, "step": 1724 }, { "epoch": 0.8694556451612904, "grad_norm": 0.17109997839484933, "learning_rate": 8.213366547551648e-07, "loss": 0.2817, "step": 1725 }, { "epoch": 0.8699596774193549, "grad_norm": 0.1966008607228567, "learning_rate": 8.211308552197948e-07, "loss": 0.2792, "step": 1726 }, { "epoch": 0.8704637096774194, "grad_norm": 0.17565135968705492, "learning_rate": 8.209249630401623e-07, "loss": 0.2854, "step": 1727 }, { "epoch": 0.8709677419354839, "grad_norm": 0.17137189614479656, "learning_rate": 8.207189782756661e-07, "loss": 0.2662, "step": 1728 }, { "epoch": 0.8714717741935484, "grad_norm": 0.17297331900482513, "learning_rate": 8.205129009857312e-07, "loss": 0.2799, "step": 1729 }, { "epoch": 0.8719758064516129, "grad_norm": 0.1724815696106094, "learning_rate": 8.203067312298101e-07, "loss": 0.2815, "step": 1730 }, { "epoch": 0.8724798387096774, "grad_norm": 0.18096897924623875, "learning_rate": 8.20100469067381e-07, "loss": 0.2837, "step": 1731 }, { "epoch": 0.8729838709677419, "grad_norm": 0.19781239041691703, "learning_rate": 8.198941145579496e-07, "loss": 0.2939, "step": 1732 }, { "epoch": 0.8734879032258065, "grad_norm": 0.17163633709375878, "learning_rate": 8.196876677610479e-07, "loss": 0.266, "step": 1733 }, { "epoch": 0.873991935483871, "grad_norm": 0.18825663758858197, "learning_rate": 8.194811287362344e-07, "loss": 0.291, "step": 1734 }, { "epoch": 0.8744959677419355, "grad_norm": 0.19151890851189823, "learning_rate": 8.192744975430941e-07, "loss": 0.2887, "step": 1735 }, { "epoch": 0.875, "grad_norm": 0.17804405341187254, "learning_rate": 8.190677742412393e-07, "loss": 0.2848, "step": 1736 }, { "epoch": 0.8755040322580645, "grad_norm": 0.17441120813931166, "learning_rate": 8.188609588903081e-07, "loss": 0.297, "step": 1737 }, { "epoch": 0.876008064516129, "grad_norm": 0.17132241278155927, "learning_rate": 8.186540515499653e-07, "loss": 0.2698, "step": 1738 }, { "epoch": 0.8765120967741935, "grad_norm": 0.17642561108901925, "learning_rate": 8.184470522799029e-07, "loss": 0.2908, "step": 1739 }, { "epoch": 0.8770161290322581, "grad_norm": 0.231128422621628, "learning_rate": 8.182399611398385e-07, "loss": 0.275, "step": 1740 }, { "epoch": 0.8775201612903226, "grad_norm": 0.17983640539436302, "learning_rate": 8.180327781895166e-07, "loss": 0.2746, "step": 1741 }, { "epoch": 0.8780241935483871, "grad_norm": 0.18298601240785914, "learning_rate": 8.178255034887083e-07, "loss": 0.2933, "step": 1742 }, { "epoch": 0.8785282258064516, "grad_norm": 0.17727125106526648, "learning_rate": 8.176181370972112e-07, "loss": 0.3026, "step": 1743 }, { "epoch": 0.8790322580645161, "grad_norm": 0.17900225615415002, "learning_rate": 8.174106790748489e-07, "loss": 0.2732, "step": 1744 }, { "epoch": 0.8795362903225806, "grad_norm": 0.16825501654368868, "learning_rate": 8.172031294814721e-07, "loss": 0.2648, "step": 1745 }, { "epoch": 0.8800403225806451, "grad_norm": 0.17838543584835861, "learning_rate": 8.169954883769573e-07, "loss": 0.2826, "step": 1746 }, { "epoch": 0.8805443548387096, "grad_norm": 0.20624261385412906, "learning_rate": 8.16787755821208e-07, "loss": 0.2931, "step": 1747 }, { "epoch": 0.8810483870967742, "grad_norm": 0.19622027217406832, "learning_rate": 8.165799318741533e-07, "loss": 0.2855, "step": 1748 }, { "epoch": 0.8815524193548387, "grad_norm": 0.1795090027517633, "learning_rate": 8.163720165957494e-07, "loss": 0.2855, "step": 1749 }, { "epoch": 0.8820564516129032, "grad_norm": 0.1727005576579451, "learning_rate": 8.161640100459785e-07, "loss": 0.2945, "step": 1750 }, { "epoch": 0.8825604838709677, "grad_norm": 0.17361486955355637, "learning_rate": 8.159559122848494e-07, "loss": 0.2845, "step": 1751 }, { "epoch": 0.8830645161290323, "grad_norm": 0.1844805830827498, "learning_rate": 8.157477233723969e-07, "loss": 0.298, "step": 1752 }, { "epoch": 0.8835685483870968, "grad_norm": 0.17263438233102688, "learning_rate": 8.15539443368682e-07, "loss": 0.2929, "step": 1753 }, { "epoch": 0.8840725806451613, "grad_norm": 0.18475595233458791, "learning_rate": 8.153310723337923e-07, "loss": 0.2844, "step": 1754 }, { "epoch": 0.8845766129032258, "grad_norm": 0.1730273189248541, "learning_rate": 8.151226103278417e-07, "loss": 0.284, "step": 1755 }, { "epoch": 0.8850806451612904, "grad_norm": 0.18104372418355474, "learning_rate": 8.149140574109701e-07, "loss": 0.2897, "step": 1756 }, { "epoch": 0.8855846774193549, "grad_norm": 0.17839123584579997, "learning_rate": 8.147054136433437e-07, "loss": 0.2908, "step": 1757 }, { "epoch": 0.8860887096774194, "grad_norm": 0.178362220369913, "learning_rate": 8.144966790851551e-07, "loss": 0.2724, "step": 1758 }, { "epoch": 0.8865927419354839, "grad_norm": 0.18349648424878212, "learning_rate": 8.142878537966225e-07, "loss": 0.2813, "step": 1759 }, { "epoch": 0.8870967741935484, "grad_norm": 0.1852606758136101, "learning_rate": 8.140789378379911e-07, "loss": 0.2705, "step": 1760 }, { "epoch": 0.8876008064516129, "grad_norm": 0.19163597573225657, "learning_rate": 8.138699312695318e-07, "loss": 0.2763, "step": 1761 }, { "epoch": 0.8881048387096774, "grad_norm": 0.16967793891115396, "learning_rate": 8.136608341515417e-07, "loss": 0.2878, "step": 1762 }, { "epoch": 0.8886088709677419, "grad_norm": 0.17409231137199876, "learning_rate": 8.13451646544344e-07, "loss": 0.2824, "step": 1763 }, { "epoch": 0.8891129032258065, "grad_norm": 0.1718892011569422, "learning_rate": 8.132423685082879e-07, "loss": 0.2819, "step": 1764 }, { "epoch": 0.889616935483871, "grad_norm": 0.1768902921291496, "learning_rate": 8.130330001037489e-07, "loss": 0.2866, "step": 1765 }, { "epoch": 0.8901209677419355, "grad_norm": 0.17727162868649207, "learning_rate": 8.128235413911286e-07, "loss": 0.2904, "step": 1766 }, { "epoch": 0.890625, "grad_norm": 0.18000850290405332, "learning_rate": 8.126139924308544e-07, "loss": 0.2821, "step": 1767 }, { "epoch": 0.8911290322580645, "grad_norm": 0.1950805957693296, "learning_rate": 8.124043532833799e-07, "loss": 0.2724, "step": 1768 }, { "epoch": 0.891633064516129, "grad_norm": 0.16953136358361712, "learning_rate": 8.121946240091847e-07, "loss": 0.2897, "step": 1769 }, { "epoch": 0.8921370967741935, "grad_norm": 0.17382780411451854, "learning_rate": 8.119848046687745e-07, "loss": 0.2743, "step": 1770 }, { "epoch": 0.8926411290322581, "grad_norm": 0.17618015127985004, "learning_rate": 8.117748953226807e-07, "loss": 0.293, "step": 1771 }, { "epoch": 0.8931451612903226, "grad_norm": 0.19101986869600812, "learning_rate": 8.115648960314609e-07, "loss": 0.282, "step": 1772 }, { "epoch": 0.8936491935483871, "grad_norm": 0.18335641858591029, "learning_rate": 8.113548068556989e-07, "loss": 0.2782, "step": 1773 }, { "epoch": 0.8941532258064516, "grad_norm": 0.17305922736379498, "learning_rate": 8.111446278560037e-07, "loss": 0.2819, "step": 1774 }, { "epoch": 0.8946572580645161, "grad_norm": 0.17289190092892376, "learning_rate": 8.109343590930107e-07, "loss": 0.2722, "step": 1775 }, { "epoch": 0.8951612903225806, "grad_norm": 0.20419317136274495, "learning_rate": 8.107240006273815e-07, "loss": 0.2984, "step": 1776 }, { "epoch": 0.8956653225806451, "grad_norm": 0.17574416151651231, "learning_rate": 8.105135525198026e-07, "loss": 0.2785, "step": 1777 }, { "epoch": 0.8961693548387096, "grad_norm": 0.18197635365780418, "learning_rate": 8.103030148309876e-07, "loss": 0.2671, "step": 1778 }, { "epoch": 0.8966733870967742, "grad_norm": 0.182868231288254, "learning_rate": 8.10092387621675e-07, "loss": 0.2825, "step": 1779 }, { "epoch": 0.8971774193548387, "grad_norm": 0.1753018528864952, "learning_rate": 8.098816709526293e-07, "loss": 0.2742, "step": 1780 }, { "epoch": 0.8976814516129032, "grad_norm": 0.17996946202759295, "learning_rate": 8.096708648846416e-07, "loss": 0.2721, "step": 1781 }, { "epoch": 0.8981854838709677, "grad_norm": 0.17250723947592084, "learning_rate": 8.094599694785272e-07, "loss": 0.2857, "step": 1782 }, { "epoch": 0.8986895161290323, "grad_norm": 0.17857618093982208, "learning_rate": 8.092489847951288e-07, "loss": 0.289, "step": 1783 }, { "epoch": 0.8991935483870968, "grad_norm": 0.180980362554131, "learning_rate": 8.09037910895314e-07, "loss": 0.2815, "step": 1784 }, { "epoch": 0.8996975806451613, "grad_norm": 0.19044748994727145, "learning_rate": 8.088267478399761e-07, "loss": 0.2763, "step": 1785 }, { "epoch": 0.9002016129032258, "grad_norm": 0.18502618756219494, "learning_rate": 8.086154956900348e-07, "loss": 0.2777, "step": 1786 }, { "epoch": 0.9007056451612904, "grad_norm": 0.17795283297213882, "learning_rate": 8.084041545064347e-07, "loss": 0.277, "step": 1787 }, { "epoch": 0.9012096774193549, "grad_norm": 0.18095952785719716, "learning_rate": 8.081927243501465e-07, "loss": 0.2849, "step": 1788 }, { "epoch": 0.9017137096774194, "grad_norm": 0.20352287553838194, "learning_rate": 8.079812052821665e-07, "loss": 0.268, "step": 1789 }, { "epoch": 0.9022177419354839, "grad_norm": 0.1817677931102643, "learning_rate": 8.077695973635165e-07, "loss": 0.2821, "step": 1790 }, { "epoch": 0.9027217741935484, "grad_norm": 0.20806765911305017, "learning_rate": 8.075579006552442e-07, "loss": 0.2802, "step": 1791 }, { "epoch": 0.9027217741935484, "eval_loss": 0.31333088874816895, "eval_runtime": 17.8076, "eval_samples_per_second": 48.013, "eval_steps_per_second": 1.011, "step": 1791 }, { "epoch": 0.9032258064516129, "grad_norm": 0.18964086276352513, "learning_rate": 8.073461152184229e-07, "loss": 0.2955, "step": 1792 }, { "epoch": 0.9037298387096774, "grad_norm": 0.20100101509553, "learning_rate": 8.071342411141511e-07, "loss": 0.2833, "step": 1793 }, { "epoch": 0.9042338709677419, "grad_norm": 0.16556895445429864, "learning_rate": 8.069222784035536e-07, "loss": 0.2572, "step": 1794 }, { "epoch": 0.9047379032258065, "grad_norm": 0.18585984965180008, "learning_rate": 8.067102271477798e-07, "loss": 0.292, "step": 1795 }, { "epoch": 0.905241935483871, "grad_norm": 0.1714046882678454, "learning_rate": 8.064980874080056e-07, "loss": 0.2783, "step": 1796 }, { "epoch": 0.9057459677419355, "grad_norm": 0.1845849829174842, "learning_rate": 8.062858592454318e-07, "loss": 0.2719, "step": 1797 }, { "epoch": 0.90625, "grad_norm": 0.18224225892643142, "learning_rate": 8.060735427212848e-07, "loss": 0.2812, "step": 1798 }, { "epoch": 0.9067540322580645, "grad_norm": 0.1694582941399971, "learning_rate": 8.05861137896817e-07, "loss": 0.269, "step": 1799 }, { "epoch": 0.907258064516129, "grad_norm": 0.1866565288452891, "learning_rate": 8.056486448333053e-07, "loss": 0.2867, "step": 1800 }, { "epoch": 0.9077620967741935, "grad_norm": 0.18228705266437853, "learning_rate": 8.054360635920532e-07, "loss": 0.2919, "step": 1801 }, { "epoch": 0.9082661290322581, "grad_norm": 0.17847533993145076, "learning_rate": 8.052233942343889e-07, "loss": 0.2864, "step": 1802 }, { "epoch": 0.9087701612903226, "grad_norm": 0.16890284755735133, "learning_rate": 8.050106368216661e-07, "loss": 0.2748, "step": 1803 }, { "epoch": 0.9092741935483871, "grad_norm": 0.17414946004467158, "learning_rate": 8.047977914152639e-07, "loss": 0.2886, "step": 1804 }, { "epoch": 0.9097782258064516, "grad_norm": 0.17442792515844072, "learning_rate": 8.045848580765869e-07, "loss": 0.2926, "step": 1805 }, { "epoch": 0.9102822580645161, "grad_norm": 0.17085485937836853, "learning_rate": 8.043718368670654e-07, "loss": 0.2847, "step": 1806 }, { "epoch": 0.9107862903225806, "grad_norm": 0.17339005862232762, "learning_rate": 8.041587278481541e-07, "loss": 0.2729, "step": 1807 }, { "epoch": 0.9112903225806451, "grad_norm": 0.17518880582664062, "learning_rate": 8.039455310813343e-07, "loss": 0.3047, "step": 1808 }, { "epoch": 0.9117943548387096, "grad_norm": 0.17160540781556277, "learning_rate": 8.037322466281116e-07, "loss": 0.277, "step": 1809 }, { "epoch": 0.9122983870967742, "grad_norm": 0.18984801005727484, "learning_rate": 8.035188745500171e-07, "loss": 0.2975, "step": 1810 }, { "epoch": 0.9128024193548387, "grad_norm": 0.18036724606771928, "learning_rate": 8.033054149086076e-07, "loss": 0.2738, "step": 1811 }, { "epoch": 0.9133064516129032, "grad_norm": 0.1694811997928256, "learning_rate": 8.030918677654648e-07, "loss": 0.2802, "step": 1812 }, { "epoch": 0.9138104838709677, "grad_norm": 1.122599384666042, "learning_rate": 8.028782331821956e-07, "loss": 0.2729, "step": 1813 }, { "epoch": 0.9143145161290323, "grad_norm": 0.19587893697493347, "learning_rate": 8.026645112204325e-07, "loss": 0.2844, "step": 1814 }, { "epoch": 0.9148185483870968, "grad_norm": 0.18843188442999503, "learning_rate": 8.024507019418327e-07, "loss": 0.2622, "step": 1815 }, { "epoch": 0.9153225806451613, "grad_norm": 0.2412193144484893, "learning_rate": 8.022368054080789e-07, "loss": 0.2864, "step": 1816 }, { "epoch": 0.9158266129032258, "grad_norm": 0.2982039518983422, "learning_rate": 8.020228216808792e-07, "loss": 0.2827, "step": 1817 }, { "epoch": 0.9163306451612904, "grad_norm": 0.24012806458676572, "learning_rate": 8.018087508219664e-07, "loss": 0.2749, "step": 1818 }, { "epoch": 0.9168346774193549, "grad_norm": 0.20105639329407363, "learning_rate": 8.015945928930985e-07, "loss": 0.2751, "step": 1819 }, { "epoch": 0.9173387096774194, "grad_norm": 0.17599257831809287, "learning_rate": 8.013803479560588e-07, "loss": 0.2726, "step": 1820 }, { "epoch": 0.9178427419354839, "grad_norm": 0.17545331694711577, "learning_rate": 8.011660160726556e-07, "loss": 0.2783, "step": 1821 }, { "epoch": 0.9183467741935484, "grad_norm": 0.1782601795128222, "learning_rate": 8.009515973047225e-07, "loss": 0.2721, "step": 1822 }, { "epoch": 0.9188508064516129, "grad_norm": 0.1927011349013866, "learning_rate": 8.007370917141177e-07, "loss": 0.2762, "step": 1823 }, { "epoch": 0.9193548387096774, "grad_norm": 0.1920937971165849, "learning_rate": 8.005224993627251e-07, "loss": 0.2704, "step": 1824 }, { "epoch": 0.9198588709677419, "grad_norm": 0.1793904274021918, "learning_rate": 8.003078203124532e-07, "loss": 0.2906, "step": 1825 }, { "epoch": 0.9203629032258065, "grad_norm": 0.1816291362076379, "learning_rate": 8.000930546252351e-07, "loss": 0.2691, "step": 1826 }, { "epoch": 0.920866935483871, "grad_norm": 0.1783523495764457, "learning_rate": 7.998782023630299e-07, "loss": 0.2823, "step": 1827 }, { "epoch": 0.9213709677419355, "grad_norm": 0.20855882950783844, "learning_rate": 7.996632635878209e-07, "loss": 0.2804, "step": 1828 }, { "epoch": 0.921875, "grad_norm": 0.17492678074423923, "learning_rate": 7.994482383616168e-07, "loss": 0.2832, "step": 1829 }, { "epoch": 0.9223790322580645, "grad_norm": 0.17234593489453404, "learning_rate": 7.992331267464509e-07, "loss": 0.2768, "step": 1830 }, { "epoch": 0.922883064516129, "grad_norm": 0.17200804609809464, "learning_rate": 7.990179288043815e-07, "loss": 0.2677, "step": 1831 }, { "epoch": 0.9233870967741935, "grad_norm": 0.17572805845486925, "learning_rate": 7.98802644597492e-07, "loss": 0.2821, "step": 1832 }, { "epoch": 0.9238911290322581, "grad_norm": 0.20644587156717698, "learning_rate": 7.985872741878905e-07, "loss": 0.2732, "step": 1833 }, { "epoch": 0.9243951612903226, "grad_norm": 0.17874642664132656, "learning_rate": 7.983718176377101e-07, "loss": 0.269, "step": 1834 }, { "epoch": 0.9248991935483871, "grad_norm": 0.17456990653485246, "learning_rate": 7.981562750091085e-07, "loss": 0.2745, "step": 1835 }, { "epoch": 0.9254032258064516, "grad_norm": 0.16939889686969853, "learning_rate": 7.979406463642686e-07, "loss": 0.2916, "step": 1836 }, { "epoch": 0.9259072580645161, "grad_norm": 0.17330331884417968, "learning_rate": 7.977249317653979e-07, "loss": 0.2897, "step": 1837 }, { "epoch": 0.9264112903225806, "grad_norm": 0.17625952471758344, "learning_rate": 7.975091312747286e-07, "loss": 0.28, "step": 1838 }, { "epoch": 0.9269153225806451, "grad_norm": 0.18477999685889357, "learning_rate": 7.97293244954518e-07, "loss": 0.3018, "step": 1839 }, { "epoch": 0.9274193548387096, "grad_norm": 0.17443677971673763, "learning_rate": 7.970772728670479e-07, "loss": 0.2701, "step": 1840 }, { "epoch": 0.9279233870967742, "grad_norm": 0.17476215415812357, "learning_rate": 7.968612150746247e-07, "loss": 0.2917, "step": 1841 }, { "epoch": 0.9284274193548387, "grad_norm": 0.172486146139852, "learning_rate": 7.966450716395801e-07, "loss": 0.2802, "step": 1842 }, { "epoch": 0.9289314516129032, "grad_norm": 0.17784609076169455, "learning_rate": 7.9642884262427e-07, "loss": 0.2795, "step": 1843 }, { "epoch": 0.9294354838709677, "grad_norm": 0.1837707194984302, "learning_rate": 7.96212528091075e-07, "loss": 0.2899, "step": 1844 }, { "epoch": 0.9299395161290323, "grad_norm": 0.18094587392424352, "learning_rate": 7.959961281024004e-07, "loss": 0.2939, "step": 1845 }, { "epoch": 0.9304435483870968, "grad_norm": 0.18248542798816517, "learning_rate": 7.957796427206766e-07, "loss": 0.2926, "step": 1846 }, { "epoch": 0.9309475806451613, "grad_norm": 0.17614486195538318, "learning_rate": 7.955630720083581e-07, "loss": 0.2912, "step": 1847 }, { "epoch": 0.9314516129032258, "grad_norm": 0.1726471918494145, "learning_rate": 7.953464160279244e-07, "loss": 0.2891, "step": 1848 }, { "epoch": 0.9319556451612904, "grad_norm": 0.1774055068005992, "learning_rate": 7.951296748418789e-07, "loss": 0.3031, "step": 1849 }, { "epoch": 0.9324596774193549, "grad_norm": 0.18664971286954127, "learning_rate": 7.949128485127508e-07, "loss": 0.2798, "step": 1850 }, { "epoch": 0.9329637096774194, "grad_norm": 0.21309616857185437, "learning_rate": 7.946959371030926e-07, "loss": 0.2852, "step": 1851 }, { "epoch": 0.9334677419354839, "grad_norm": 0.17453472429034075, "learning_rate": 7.944789406754821e-07, "loss": 0.2796, "step": 1852 }, { "epoch": 0.9339717741935484, "grad_norm": 0.17686343311528496, "learning_rate": 7.942618592925214e-07, "loss": 0.2821, "step": 1853 }, { "epoch": 0.9344758064516129, "grad_norm": 0.2050361536049733, "learning_rate": 7.940446930168372e-07, "loss": 0.2855, "step": 1854 }, { "epoch": 0.9349798387096774, "grad_norm": 0.19293005978824818, "learning_rate": 7.938274419110806e-07, "loss": 0.2853, "step": 1855 }, { "epoch": 0.9354838709677419, "grad_norm": 0.18447151377144042, "learning_rate": 7.936101060379272e-07, "loss": 0.2909, "step": 1856 }, { "epoch": 0.9359879032258065, "grad_norm": 0.17295084942405378, "learning_rate": 7.93392685460077e-07, "loss": 0.2828, "step": 1857 }, { "epoch": 0.936491935483871, "grad_norm": 0.1695374241026228, "learning_rate": 7.931751802402544e-07, "loss": 0.2799, "step": 1858 }, { "epoch": 0.9369959677419355, "grad_norm": 0.17220089424019935, "learning_rate": 7.929575904412086e-07, "loss": 0.2783, "step": 1859 }, { "epoch": 0.9375, "grad_norm": 0.17583666760835148, "learning_rate": 7.927399161257127e-07, "loss": 0.268, "step": 1860 }, { "epoch": 0.9380040322580645, "grad_norm": 0.181882166877518, "learning_rate": 7.925221573565644e-07, "loss": 0.2931, "step": 1861 }, { "epoch": 0.938508064516129, "grad_norm": 0.17175675035279075, "learning_rate": 7.923043141965857e-07, "loss": 0.2763, "step": 1862 }, { "epoch": 0.9390120967741935, "grad_norm": 0.16867999230536798, "learning_rate": 7.920863867086232e-07, "loss": 0.2764, "step": 1863 }, { "epoch": 0.9395161290322581, "grad_norm": 0.18179217734847186, "learning_rate": 7.918683749555473e-07, "loss": 0.2865, "step": 1864 }, { "epoch": 0.9400201612903226, "grad_norm": 0.17793466089999202, "learning_rate": 7.916502790002535e-07, "loss": 0.2933, "step": 1865 }, { "epoch": 0.9405241935483871, "grad_norm": 0.1687244667253048, "learning_rate": 7.914320989056608e-07, "loss": 0.2646, "step": 1866 }, { "epoch": 0.9410282258064516, "grad_norm": 0.16924880345437254, "learning_rate": 7.912138347347128e-07, "loss": 0.2747, "step": 1867 }, { "epoch": 0.9415322580645161, "grad_norm": 0.1998764610080855, "learning_rate": 7.909954865503776e-07, "loss": 0.2848, "step": 1868 }, { "epoch": 0.9420362903225806, "grad_norm": 0.17645688052721015, "learning_rate": 7.907770544156471e-07, "loss": 0.2744, "step": 1869 }, { "epoch": 0.9425403225806451, "grad_norm": 0.17444186260064237, "learning_rate": 7.905585383935377e-07, "loss": 0.2861, "step": 1870 }, { "epoch": 0.9430443548387096, "grad_norm": 0.18655327778687014, "learning_rate": 7.903399385470898e-07, "loss": 0.2723, "step": 1871 }, { "epoch": 0.9435483870967742, "grad_norm": 0.17432236510965454, "learning_rate": 7.901212549393682e-07, "loss": 0.2782, "step": 1872 }, { "epoch": 0.9440524193548387, "grad_norm": 0.1866368609884871, "learning_rate": 7.899024876334619e-07, "loss": 0.27, "step": 1873 }, { "epoch": 0.9445564516129032, "grad_norm": 0.1867440259653203, "learning_rate": 7.896836366924836e-07, "loss": 0.2861, "step": 1874 }, { "epoch": 0.9450604838709677, "grad_norm": 0.17504198342034763, "learning_rate": 7.894647021795707e-07, "loss": 0.288, "step": 1875 }, { "epoch": 0.9455645161290323, "grad_norm": 0.17474732068264098, "learning_rate": 7.892456841578843e-07, "loss": 0.2839, "step": 1876 }, { "epoch": 0.9460685483870968, "grad_norm": 0.17433782582765533, "learning_rate": 7.890265826906097e-07, "loss": 0.2695, "step": 1877 }, { "epoch": 0.9465725806451613, "grad_norm": 0.1732874776440476, "learning_rate": 7.888073978409568e-07, "loss": 0.2894, "step": 1878 }, { "epoch": 0.9470766129032258, "grad_norm": 0.1829899116719353, "learning_rate": 7.885881296721584e-07, "loss": 0.2943, "step": 1879 }, { "epoch": 0.9475806451612904, "grad_norm": 0.17001971535664087, "learning_rate": 7.883687782474723e-07, "loss": 0.2856, "step": 1880 }, { "epoch": 0.9480846774193549, "grad_norm": 0.1780345466436045, "learning_rate": 7.8814934363018e-07, "loss": 0.2746, "step": 1881 }, { "epoch": 0.9485887096774194, "grad_norm": 0.1811602910506968, "learning_rate": 7.879298258835872e-07, "loss": 0.2776, "step": 1882 }, { "epoch": 0.9490927419354839, "grad_norm": 0.18771497800701312, "learning_rate": 7.877102250710231e-07, "loss": 0.2829, "step": 1883 }, { "epoch": 0.9495967741935484, "grad_norm": 0.18472560622513845, "learning_rate": 7.874905412558415e-07, "loss": 0.2707, "step": 1884 }, { "epoch": 0.9501008064516129, "grad_norm": 0.1795065532871576, "learning_rate": 7.872707745014195e-07, "loss": 0.2812, "step": 1885 }, { "epoch": 0.9506048387096774, "grad_norm": 0.19450850100162909, "learning_rate": 7.870509248711588e-07, "loss": 0.262, "step": 1886 }, { "epoch": 0.9511088709677419, "grad_norm": 0.17189805933616106, "learning_rate": 7.868309924284842e-07, "loss": 0.2681, "step": 1887 }, { "epoch": 0.9516129032258065, "grad_norm": 0.17073814905474735, "learning_rate": 7.866109772368453e-07, "loss": 0.2774, "step": 1888 }, { "epoch": 0.952116935483871, "grad_norm": 0.17066105523996877, "learning_rate": 7.863908793597149e-07, "loss": 0.2883, "step": 1889 }, { "epoch": 0.9526209677419355, "grad_norm": 0.16966954884907165, "learning_rate": 7.861706988605898e-07, "loss": 0.2832, "step": 1890 }, { "epoch": 0.953125, "grad_norm": 0.17731653359539812, "learning_rate": 7.859504358029909e-07, "loss": 0.2856, "step": 1891 }, { "epoch": 0.9536290322580645, "grad_norm": 0.18810146382045853, "learning_rate": 7.857300902504628e-07, "loss": 0.2853, "step": 1892 }, { "epoch": 0.954133064516129, "grad_norm": 0.18445022791672494, "learning_rate": 7.855096622665735e-07, "loss": 0.2867, "step": 1893 }, { "epoch": 0.9546370967741935, "grad_norm": 0.1728990132935549, "learning_rate": 7.852891519149152e-07, "loss": 0.2874, "step": 1894 }, { "epoch": 0.9551411290322581, "grad_norm": 0.1761006592616753, "learning_rate": 7.85068559259104e-07, "loss": 0.2844, "step": 1895 }, { "epoch": 0.9556451612903226, "grad_norm": 0.183998556068685, "learning_rate": 7.848478843627792e-07, "loss": 0.2705, "step": 1896 }, { "epoch": 0.9561491935483871, "grad_norm": 0.17318304016376618, "learning_rate": 7.846271272896044e-07, "loss": 0.2778, "step": 1897 }, { "epoch": 0.9566532258064516, "grad_norm": 0.1824137043313419, "learning_rate": 7.844062881032664e-07, "loss": 0.2872, "step": 1898 }, { "epoch": 0.9571572580645161, "grad_norm": 0.17344844058434108, "learning_rate": 7.841853668674763e-07, "loss": 0.2787, "step": 1899 }, { "epoch": 0.9576612903225806, "grad_norm": 0.1758545032107461, "learning_rate": 7.839643636459683e-07, "loss": 0.2857, "step": 1900 }, { "epoch": 0.9581653225806451, "grad_norm": 0.1729782764700269, "learning_rate": 7.837432785025004e-07, "loss": 0.2765, "step": 1901 }, { "epoch": 0.9586693548387096, "grad_norm": 0.17898116556336083, "learning_rate": 7.835221115008542e-07, "loss": 0.2877, "step": 1902 }, { "epoch": 0.9591733870967742, "grad_norm": 0.1895613129053808, "learning_rate": 7.833008627048352e-07, "loss": 0.2754, "step": 1903 }, { "epoch": 0.9596774193548387, "grad_norm": 0.20979437224680517, "learning_rate": 7.830795321782724e-07, "loss": 0.2773, "step": 1904 }, { "epoch": 0.9601814516129032, "grad_norm": 0.17134279973430921, "learning_rate": 7.828581199850182e-07, "loss": 0.2785, "step": 1905 }, { "epoch": 0.9606854838709677, "grad_norm": 0.1703859949701216, "learning_rate": 7.826366261889483e-07, "loss": 0.2755, "step": 1906 }, { "epoch": 0.9611895161290323, "grad_norm": 0.1708391359023031, "learning_rate": 7.824150508539628e-07, "loss": 0.2784, "step": 1907 }, { "epoch": 0.9616935483870968, "grad_norm": 0.17468338877763465, "learning_rate": 7.821933940439847e-07, "loss": 0.2584, "step": 1908 }, { "epoch": 0.9621975806451613, "grad_norm": 0.17669530382160473, "learning_rate": 7.819716558229604e-07, "loss": 0.2594, "step": 1909 }, { "epoch": 0.9627016129032258, "grad_norm": 0.18182770484400235, "learning_rate": 7.8174983625486e-07, "loss": 0.2929, "step": 1910 }, { "epoch": 0.9632056451612904, "grad_norm": 0.1718420918323365, "learning_rate": 7.815279354036772e-07, "loss": 0.2741, "step": 1911 }, { "epoch": 0.9637096774193549, "grad_norm": 0.18047954008297915, "learning_rate": 7.813059533334292e-07, "loss": 0.282, "step": 1912 }, { "epoch": 0.9642137096774194, "grad_norm": 0.1708373327019462, "learning_rate": 7.810838901081561e-07, "loss": 0.2671, "step": 1913 }, { "epoch": 0.9647177419354839, "grad_norm": 0.17530947113755785, "learning_rate": 7.80861745791922e-07, "loss": 0.2863, "step": 1914 }, { "epoch": 0.9652217741935484, "grad_norm": 0.1856980391062388, "learning_rate": 7.80639520448814e-07, "loss": 0.2953, "step": 1915 }, { "epoch": 0.9657258064516129, "grad_norm": 0.17104956670285965, "learning_rate": 7.80417214142943e-07, "loss": 0.2929, "step": 1916 }, { "epoch": 0.9662298387096774, "grad_norm": 0.17968017354027993, "learning_rate": 7.801948269384427e-07, "loss": 0.2805, "step": 1917 }, { "epoch": 0.9667338709677419, "grad_norm": 0.16885090448355405, "learning_rate": 7.799723588994706e-07, "loss": 0.275, "step": 1918 }, { "epoch": 0.9672379032258065, "grad_norm": 0.16977668143159869, "learning_rate": 7.797498100902071e-07, "loss": 0.2955, "step": 1919 }, { "epoch": 0.967741935483871, "grad_norm": 0.17543252330569692, "learning_rate": 7.795271805748565e-07, "loss": 0.285, "step": 1920 }, { "epoch": 0.9682459677419355, "grad_norm": 0.1779214329877026, "learning_rate": 7.793044704176459e-07, "loss": 0.2751, "step": 1921 }, { "epoch": 0.96875, "grad_norm": 0.1727984897805244, "learning_rate": 7.790816796828259e-07, "loss": 0.2917, "step": 1922 }, { "epoch": 0.9692540322580645, "grad_norm": 0.19093500891641543, "learning_rate": 7.788588084346699e-07, "loss": 0.2912, "step": 1923 }, { "epoch": 0.969758064516129, "grad_norm": 0.1877331833500236, "learning_rate": 7.786358567374752e-07, "loss": 0.2828, "step": 1924 }, { "epoch": 0.9702620967741935, "grad_norm": 0.1964520978166042, "learning_rate": 7.784128246555619e-07, "loss": 0.2878, "step": 1925 }, { "epoch": 0.9707661290322581, "grad_norm": 0.1793857792176289, "learning_rate": 7.781897122532732e-07, "loss": 0.3009, "step": 1926 }, { "epoch": 0.9712701612903226, "grad_norm": 0.1778515329690811, "learning_rate": 7.779665195949761e-07, "loss": 0.2686, "step": 1927 }, { "epoch": 0.9717741935483871, "grad_norm": 0.18899510212928142, "learning_rate": 7.777432467450598e-07, "loss": 0.2944, "step": 1928 }, { "epoch": 0.9722782258064516, "grad_norm": 0.18103789111426294, "learning_rate": 7.775198937679375e-07, "loss": 0.2853, "step": 1929 }, { "epoch": 0.9727822580645161, "grad_norm": 0.17187773087886613, "learning_rate": 7.772964607280448e-07, "loss": 0.2853, "step": 1930 }, { "epoch": 0.9732862903225806, "grad_norm": 0.17830119933021793, "learning_rate": 7.77072947689841e-07, "loss": 0.2883, "step": 1931 }, { "epoch": 0.9737903225806451, "grad_norm": 0.18468034488609253, "learning_rate": 7.768493547178083e-07, "loss": 0.2783, "step": 1932 }, { "epoch": 0.9742943548387096, "grad_norm": 0.1649149359989447, "learning_rate": 7.766256818764517e-07, "loss": 0.2712, "step": 1933 }, { "epoch": 0.9747983870967742, "grad_norm": 0.18006035200393466, "learning_rate": 7.764019292302994e-07, "loss": 0.2873, "step": 1934 }, { "epoch": 0.9753024193548387, "grad_norm": 0.18387324362971386, "learning_rate": 7.761780968439027e-07, "loss": 0.2825, "step": 1935 }, { "epoch": 0.9758064516129032, "grad_norm": 0.1756603042845968, "learning_rate": 7.759541847818361e-07, "loss": 0.2633, "step": 1936 }, { "epoch": 0.9763104838709677, "grad_norm": 0.17483974536625063, "learning_rate": 7.757301931086963e-07, "loss": 0.2982, "step": 1937 }, { "epoch": 0.9768145161290323, "grad_norm": 0.18502806181928816, "learning_rate": 7.755061218891041e-07, "loss": 0.2836, "step": 1938 }, { "epoch": 0.9773185483870968, "grad_norm": 0.17424427520526933, "learning_rate": 7.752819711877024e-07, "loss": 0.2909, "step": 1939 }, { "epoch": 0.9778225806451613, "grad_norm": 0.17479328408301795, "learning_rate": 7.750577410691572e-07, "loss": 0.2892, "step": 1940 }, { "epoch": 0.9783266129032258, "grad_norm": 0.18236213050678932, "learning_rate": 7.748334315981577e-07, "loss": 0.2904, "step": 1941 }, { "epoch": 0.9788306451612904, "grad_norm": 0.17005102876538494, "learning_rate": 7.746090428394156e-07, "loss": 0.2784, "step": 1942 }, { "epoch": 0.9793346774193549, "grad_norm": 0.17329730021771092, "learning_rate": 7.743845748576659e-07, "loss": 0.2836, "step": 1943 }, { "epoch": 0.9798387096774194, "grad_norm": 0.1714850364043798, "learning_rate": 7.741600277176659e-07, "loss": 0.2857, "step": 1944 }, { "epoch": 0.9803427419354839, "grad_norm": 0.19242374904544585, "learning_rate": 7.739354014841963e-07, "loss": 0.286, "step": 1945 }, { "epoch": 0.9808467741935484, "grad_norm": 0.17505647013555173, "learning_rate": 7.737106962220603e-07, "loss": 0.2961, "step": 1946 }, { "epoch": 0.9813508064516129, "grad_norm": 0.17558706687337872, "learning_rate": 7.734859119960841e-07, "loss": 0.2735, "step": 1947 }, { "epoch": 0.9818548387096774, "grad_norm": 0.17903519468817597, "learning_rate": 7.732610488711162e-07, "loss": 0.2901, "step": 1948 }, { "epoch": 0.9823588709677419, "grad_norm": 0.18371964890476014, "learning_rate": 7.730361069120286e-07, "loss": 0.2985, "step": 1949 }, { "epoch": 0.9828629032258065, "grad_norm": 0.1723762496803238, "learning_rate": 7.728110861837156e-07, "loss": 0.2837, "step": 1950 }, { "epoch": 0.983366935483871, "grad_norm": 0.1790011706688029, "learning_rate": 7.725859867510942e-07, "loss": 0.2696, "step": 1951 }, { "epoch": 0.9838709677419355, "grad_norm": 0.18850089072217402, "learning_rate": 7.72360808679104e-07, "loss": 0.2811, "step": 1952 }, { "epoch": 0.984375, "grad_norm": 0.17379961506239888, "learning_rate": 7.72135552032708e-07, "loss": 0.2743, "step": 1953 }, { "epoch": 0.9848790322580645, "grad_norm": 0.18355475523994572, "learning_rate": 7.719102168768907e-07, "loss": 0.2918, "step": 1954 }, { "epoch": 0.985383064516129, "grad_norm": 0.18238534494389397, "learning_rate": 7.716848032766605e-07, "loss": 0.2972, "step": 1955 }, { "epoch": 0.9858870967741935, "grad_norm": 0.1729414375179342, "learning_rate": 7.714593112970473e-07, "loss": 0.2761, "step": 1956 }, { "epoch": 0.9863911290322581, "grad_norm": 0.1768407372110884, "learning_rate": 7.712337410031046e-07, "loss": 0.2663, "step": 1957 }, { "epoch": 0.9868951612903226, "grad_norm": 0.1803839421907579, "learning_rate": 7.710080924599077e-07, "loss": 0.2913, "step": 1958 }, { "epoch": 0.9873991935483871, "grad_norm": 0.1722830670020056, "learning_rate": 7.707823657325549e-07, "loss": 0.2899, "step": 1959 }, { "epoch": 0.9879032258064516, "grad_norm": 0.17157007007009292, "learning_rate": 7.705565608861673e-07, "loss": 0.2832, "step": 1960 }, { "epoch": 0.9884072580645161, "grad_norm": 0.17179339985585704, "learning_rate": 7.703306779858875e-07, "loss": 0.275, "step": 1961 }, { "epoch": 0.9889112903225806, "grad_norm": 0.1750481355495183, "learning_rate": 7.701047170968819e-07, "loss": 0.2935, "step": 1962 }, { "epoch": 0.9894153225806451, "grad_norm": 0.1703164887136099, "learning_rate": 7.698786782843386e-07, "loss": 0.2578, "step": 1963 }, { "epoch": 0.9899193548387096, "grad_norm": 0.17963472103153666, "learning_rate": 7.696525616134686e-07, "loss": 0.2885, "step": 1964 }, { "epoch": 0.9904233870967742, "grad_norm": 0.17443507400988176, "learning_rate": 7.694263671495047e-07, "loss": 0.2992, "step": 1965 }, { "epoch": 0.9909274193548387, "grad_norm": 0.18131986198353908, "learning_rate": 7.692000949577031e-07, "loss": 0.2897, "step": 1966 }, { "epoch": 0.9914314516129032, "grad_norm": 0.19050009588750086, "learning_rate": 7.689737451033415e-07, "loss": 0.2975, "step": 1967 }, { "epoch": 0.9919354838709677, "grad_norm": 0.21986786250372692, "learning_rate": 7.687473176517209e-07, "loss": 0.2775, "step": 1968 }, { "epoch": 0.9924395161290323, "grad_norm": 0.1712446456152378, "learning_rate": 7.685208126681637e-07, "loss": 0.2653, "step": 1969 }, { "epoch": 0.9929435483870968, "grad_norm": 0.17416424744387057, "learning_rate": 7.682942302180155e-07, "loss": 0.3078, "step": 1970 }, { "epoch": 0.9934475806451613, "grad_norm": 0.17288742218707365, "learning_rate": 7.680675703666439e-07, "loss": 0.278, "step": 1971 }, { "epoch": 0.9939516129032258, "grad_norm": 0.1795604518756118, "learning_rate": 7.67840833179439e-07, "loss": 0.2849, "step": 1972 }, { "epoch": 0.9944556451612904, "grad_norm": 0.1715422047295615, "learning_rate": 7.676140187218128e-07, "loss": 0.2665, "step": 1973 }, { "epoch": 0.9949596774193549, "grad_norm": 0.17763829951279275, "learning_rate": 7.673871270592e-07, "loss": 0.2907, "step": 1974 }, { "epoch": 0.9954637096774194, "grad_norm": 0.1748484464046483, "learning_rate": 7.671601582570573e-07, "loss": 0.2722, "step": 1975 }, { "epoch": 0.9959677419354839, "grad_norm": 0.18807863525366347, "learning_rate": 7.66933112380864e-07, "loss": 0.2966, "step": 1976 }, { "epoch": 0.9964717741935484, "grad_norm": 0.19625503483008158, "learning_rate": 7.667059894961214e-07, "loss": 0.2688, "step": 1977 }, { "epoch": 0.9969758064516129, "grad_norm": 0.1714858168146041, "learning_rate": 7.664787896683528e-07, "loss": 0.2857, "step": 1978 }, { "epoch": 0.9974798387096774, "grad_norm": 0.17049746359373963, "learning_rate": 7.662515129631045e-07, "loss": 0.2732, "step": 1979 }, { "epoch": 0.9979838709677419, "grad_norm": 0.17253458678680952, "learning_rate": 7.660241594459437e-07, "loss": 0.2802, "step": 1980 }, { "epoch": 0.9984879032258065, "grad_norm": 0.17152385183060628, "learning_rate": 7.65796729182461e-07, "loss": 0.2775, "step": 1981 }, { "epoch": 0.998991935483871, "grad_norm": 0.17962880378962143, "learning_rate": 7.655692222382683e-07, "loss": 0.282, "step": 1982 }, { "epoch": 0.9994959677419355, "grad_norm": 0.1815884076274797, "learning_rate": 7.653416386790003e-07, "loss": 0.2807, "step": 1983 }, { "epoch": 1.0, "grad_norm": 0.17036014949141842, "learning_rate": 7.651139785703131e-07, "loss": 0.2756, "step": 1984 }, { "epoch": 1.0005040322580645, "grad_norm": 0.1858739335123262, "learning_rate": 7.648862419778854e-07, "loss": 0.2754, "step": 1985 }, { "epoch": 1.001008064516129, "grad_norm": 0.17713241635811486, "learning_rate": 7.646584289674178e-07, "loss": 0.2913, "step": 1986 }, { "epoch": 1.0015120967741935, "grad_norm": 0.17224329918547132, "learning_rate": 7.644305396046328e-07, "loss": 0.2838, "step": 1987 }, { "epoch": 1.002016129032258, "grad_norm": 0.16815918206812863, "learning_rate": 7.642025739552753e-07, "loss": 0.2738, "step": 1988 }, { "epoch": 1.0003780718336484, "grad_norm": 0.17810881094257833, "learning_rate": 7.639745320851118e-07, "loss": 0.2829, "step": 1989 }, { "epoch": 1.0008821676118462, "grad_norm": 0.17225692284996372, "learning_rate": 7.637464140599312e-07, "loss": 0.2648, "step": 1990 }, { "epoch": 1.0008821676118462, "eval_loss": 0.31192973256111145, "eval_runtime": 16.7125, "eval_samples_per_second": 51.159, "eval_steps_per_second": 1.077, "step": 1990 }, { "epoch": 1.001386263390044, "grad_norm": 0.18131745361437934, "learning_rate": 7.635182199455437e-07, "loss": 0.2904, "step": 1991 }, { "epoch": 1.001890359168242, "grad_norm": 0.1674300111635892, "learning_rate": 7.632899498077824e-07, "loss": 0.2781, "step": 1992 }, { "epoch": 1.0023944549464399, "grad_norm": 0.16930495343661658, "learning_rate": 7.630616037125015e-07, "loss": 0.2716, "step": 1993 }, { "epoch": 1.0028985507246377, "grad_norm": 0.17149835744787098, "learning_rate": 7.628331817255775e-07, "loss": 0.3012, "step": 1994 }, { "epoch": 1.0034026465028356, "grad_norm": 0.17694237469391477, "learning_rate": 7.626046839129087e-07, "loss": 0.2826, "step": 1995 }, { "epoch": 1.0039067422810335, "grad_norm": 0.18584553067561851, "learning_rate": 7.623761103404154e-07, "loss": 0.2834, "step": 1996 }, { "epoch": 1.0044108380592311, "grad_norm": 0.17723779816231616, "learning_rate": 7.621474610740396e-07, "loss": 0.2794, "step": 1997 }, { "epoch": 1.004914933837429, "grad_norm": 0.18721280669165485, "learning_rate": 7.619187361797451e-07, "loss": 0.2813, "step": 1998 }, { "epoch": 1.005419029615627, "grad_norm": 0.1752600172196804, "learning_rate": 7.616899357235178e-07, "loss": 0.2636, "step": 1999 }, { "epoch": 1.0059231253938248, "grad_norm": 0.1855969151006619, "learning_rate": 7.614610597713651e-07, "loss": 0.2923, "step": 2000 }, { "epoch": 1.0064272211720227, "grad_norm": 0.16729057622113636, "learning_rate": 7.612321083893163e-07, "loss": 0.2892, "step": 2001 }, { "epoch": 1.0069313169502205, "grad_norm": 0.1693767296113182, "learning_rate": 7.610030816434224e-07, "loss": 0.2801, "step": 2002 }, { "epoch": 1.0074354127284184, "grad_norm": 0.17521037625410535, "learning_rate": 7.607739795997563e-07, "loss": 0.2725, "step": 2003 }, { "epoch": 1.0079395085066163, "grad_norm": 0.17979436463239115, "learning_rate": 7.605448023244127e-07, "loss": 0.2771, "step": 2004 }, { "epoch": 1.0084436042848142, "grad_norm": 0.17931260879453892, "learning_rate": 7.603155498835075e-07, "loss": 0.2668, "step": 2005 }, { "epoch": 1.008947700063012, "grad_norm": 0.17071051877009574, "learning_rate": 7.600862223431787e-07, "loss": 0.2828, "step": 2006 }, { "epoch": 1.00945179584121, "grad_norm": 0.17196704916567862, "learning_rate": 7.598568197695858e-07, "loss": 0.2794, "step": 2007 }, { "epoch": 1.0099558916194078, "grad_norm": 0.17562030132329703, "learning_rate": 7.596273422289103e-07, "loss": 0.2833, "step": 2008 }, { "epoch": 1.0104599873976055, "grad_norm": 0.18089872134544313, "learning_rate": 7.593977897873548e-07, "loss": 0.2833, "step": 2009 }, { "epoch": 1.0109640831758033, "grad_norm": 0.18222407962137802, "learning_rate": 7.591681625111439e-07, "loss": 0.2816, "step": 2010 }, { "epoch": 1.0114681789540012, "grad_norm": 0.18109669021912, "learning_rate": 7.589384604665235e-07, "loss": 0.288, "step": 2011 }, { "epoch": 1.011972274732199, "grad_norm": 0.17814912652297168, "learning_rate": 7.587086837197614e-07, "loss": 0.2753, "step": 2012 }, { "epoch": 1.012476370510397, "grad_norm": 0.16991715478737968, "learning_rate": 7.584788323371466e-07, "loss": 0.2855, "step": 2013 }, { "epoch": 1.0129804662885948, "grad_norm": 0.17035192594812326, "learning_rate": 7.582489063849899e-07, "loss": 0.2629, "step": 2014 }, { "epoch": 1.0134845620667927, "grad_norm": 0.17513209286858666, "learning_rate": 7.580189059296234e-07, "loss": 0.2772, "step": 2015 }, { "epoch": 1.0139886578449906, "grad_norm": 0.19229502375720714, "learning_rate": 7.577888310374007e-07, "loss": 0.2764, "step": 2016 }, { "epoch": 1.0144927536231885, "grad_norm": 0.16703432210037336, "learning_rate": 7.575586817746975e-07, "loss": 0.2749, "step": 2017 }, { "epoch": 1.0149968494013863, "grad_norm": 0.17186499930225924, "learning_rate": 7.573284582079098e-07, "loss": 0.2696, "step": 2018 }, { "epoch": 1.0155009451795842, "grad_norm": 0.20803141423115293, "learning_rate": 7.570981604034563e-07, "loss": 0.274, "step": 2019 }, { "epoch": 1.0160050409577819, "grad_norm": 0.17601420291337044, "learning_rate": 7.56867788427776e-07, "loss": 0.2908, "step": 2020 }, { "epoch": 1.0165091367359798, "grad_norm": 0.18319232781478847, "learning_rate": 7.566373423473299e-07, "loss": 0.2923, "step": 2021 }, { "epoch": 1.0170132325141776, "grad_norm": 0.17400976480499947, "learning_rate": 7.564068222286004e-07, "loss": 0.2808, "step": 2022 }, { "epoch": 1.0175173282923755, "grad_norm": 0.17912905548016714, "learning_rate": 7.56176228138091e-07, "loss": 0.2865, "step": 2023 }, { "epoch": 1.0180214240705734, "grad_norm": 0.1722722944490204, "learning_rate": 7.559455601423266e-07, "loss": 0.2741, "step": 2024 }, { "epoch": 1.0185255198487713, "grad_norm": 0.16659736898121513, "learning_rate": 7.557148183078539e-07, "loss": 0.2821, "step": 2025 }, { "epoch": 1.0190296156269691, "grad_norm": 0.17461443095442888, "learning_rate": 7.554840027012399e-07, "loss": 0.2764, "step": 2026 }, { "epoch": 1.019533711405167, "grad_norm": 0.18391922179509013, "learning_rate": 7.552531133890738e-07, "loss": 0.295, "step": 2027 }, { "epoch": 1.020037807183365, "grad_norm": 0.20322970192602707, "learning_rate": 7.550221504379659e-07, "loss": 0.2865, "step": 2028 }, { "epoch": 1.0205419029615628, "grad_norm": 0.177400814926691, "learning_rate": 7.547911139145472e-07, "loss": 0.2911, "step": 2029 }, { "epoch": 1.0210459987397607, "grad_norm": 0.1834139875378068, "learning_rate": 7.545600038854705e-07, "loss": 0.2795, "step": 2030 }, { "epoch": 1.0215500945179583, "grad_norm": 0.17234090006004013, "learning_rate": 7.543288204174096e-07, "loss": 0.2812, "step": 2031 }, { "epoch": 1.0220541902961562, "grad_norm": 0.19523445589007987, "learning_rate": 7.540975635770595e-07, "loss": 0.2634, "step": 2032 }, { "epoch": 1.022558286074354, "grad_norm": 0.17425774393185609, "learning_rate": 7.538662334311363e-07, "loss": 0.2923, "step": 2033 }, { "epoch": 1.023062381852552, "grad_norm": 0.18190325865887647, "learning_rate": 7.536348300463775e-07, "loss": 0.2763, "step": 2034 }, { "epoch": 1.0235664776307498, "grad_norm": 0.17592758218306667, "learning_rate": 7.534033534895415e-07, "loss": 0.2869, "step": 2035 }, { "epoch": 1.0240705734089477, "grad_norm": 0.17477622082253388, "learning_rate": 7.531718038274076e-07, "loss": 0.291, "step": 2036 }, { "epoch": 1.0245746691871456, "grad_norm": 0.22007149638124626, "learning_rate": 7.529401811267765e-07, "loss": 0.2676, "step": 2037 }, { "epoch": 1.0250787649653434, "grad_norm": 0.16688921868859855, "learning_rate": 7.527084854544701e-07, "loss": 0.267, "step": 2038 }, { "epoch": 1.0255828607435413, "grad_norm": 0.1867065847855188, "learning_rate": 7.524767168773311e-07, "loss": 0.2694, "step": 2039 }, { "epoch": 1.0260869565217392, "grad_norm": 0.205670590026773, "learning_rate": 7.522448754622234e-07, "loss": 0.2889, "step": 2040 }, { "epoch": 1.026591052299937, "grad_norm": 0.17475031087338616, "learning_rate": 7.520129612760318e-07, "loss": 0.2697, "step": 2041 }, { "epoch": 1.0270951480781347, "grad_norm": 0.18541787593421327, "learning_rate": 7.517809743856618e-07, "loss": 0.2774, "step": 2042 }, { "epoch": 1.0275992438563326, "grad_norm": 0.17371102962725932, "learning_rate": 7.515489148580405e-07, "loss": 0.2594, "step": 2043 }, { "epoch": 1.0281033396345305, "grad_norm": 0.17385943346589472, "learning_rate": 7.513167827601154e-07, "loss": 0.2675, "step": 2044 }, { "epoch": 1.0286074354127284, "grad_norm": 0.17595128300347995, "learning_rate": 7.510845781588554e-07, "loss": 0.2793, "step": 2045 }, { "epoch": 1.0291115311909262, "grad_norm": 0.179237071829832, "learning_rate": 7.5085230112125e-07, "loss": 0.2792, "step": 2046 }, { "epoch": 1.0296156269691241, "grad_norm": 0.17451242012769375, "learning_rate": 7.506199517143095e-07, "loss": 0.276, "step": 2047 }, { "epoch": 1.030119722747322, "grad_norm": 0.18158012272605775, "learning_rate": 7.503875300050656e-07, "loss": 0.2788, "step": 2048 }, { "epoch": 1.0306238185255199, "grad_norm": 0.18636629727957202, "learning_rate": 7.501550360605704e-07, "loss": 0.293, "step": 2049 }, { "epoch": 1.0311279143037178, "grad_norm": 0.17720408232848278, "learning_rate": 7.499224699478969e-07, "loss": 0.2813, "step": 2050 }, { "epoch": 1.0316320100819156, "grad_norm": 0.1916262449849373, "learning_rate": 7.496898317341389e-07, "loss": 0.2677, "step": 2051 }, { "epoch": 1.0321361058601135, "grad_norm": 0.17348752838152534, "learning_rate": 7.494571214864113e-07, "loss": 0.2751, "step": 2052 }, { "epoch": 1.0326402016383114, "grad_norm": 0.19058244263360036, "learning_rate": 7.492243392718493e-07, "loss": 0.2722, "step": 2053 }, { "epoch": 1.033144297416509, "grad_norm": 0.1890009158650643, "learning_rate": 7.489914851576095e-07, "loss": 0.2655, "step": 2054 }, { "epoch": 1.033648393194707, "grad_norm": 0.172808831512677, "learning_rate": 7.487585592108685e-07, "loss": 0.2847, "step": 2055 }, { "epoch": 1.0341524889729048, "grad_norm": 0.17546626349825928, "learning_rate": 7.485255614988241e-07, "loss": 0.2817, "step": 2056 }, { "epoch": 1.0346565847511027, "grad_norm": 0.1775777311831324, "learning_rate": 7.482924920886949e-07, "loss": 0.2754, "step": 2057 }, { "epoch": 1.0351606805293005, "grad_norm": 0.1749949082772431, "learning_rate": 7.480593510477197e-07, "loss": 0.2831, "step": 2058 }, { "epoch": 1.0356647763074984, "grad_norm": 0.18632122267782072, "learning_rate": 7.478261384431585e-07, "loss": 0.2725, "step": 2059 }, { "epoch": 1.0361688720856963, "grad_norm": 0.17774286945133272, "learning_rate": 7.475928543422916e-07, "loss": 0.2815, "step": 2060 }, { "epoch": 1.0366729678638942, "grad_norm": 0.18490810909000924, "learning_rate": 7.473594988124199e-07, "loss": 0.2846, "step": 2061 }, { "epoch": 1.037177063642092, "grad_norm": 0.1728261249927515, "learning_rate": 7.471260719208649e-07, "loss": 0.2689, "step": 2062 }, { "epoch": 1.03768115942029, "grad_norm": 0.18211593138151266, "learning_rate": 7.468925737349693e-07, "loss": 0.2923, "step": 2063 }, { "epoch": 1.0381852551984878, "grad_norm": 0.17343204712457802, "learning_rate": 7.466590043220955e-07, "loss": 0.2809, "step": 2064 }, { "epoch": 1.0386893509766855, "grad_norm": 0.17100328567163497, "learning_rate": 7.46425363749627e-07, "loss": 0.2754, "step": 2065 }, { "epoch": 1.0391934467548833, "grad_norm": 0.1818697379651732, "learning_rate": 7.461916520849674e-07, "loss": 0.2724, "step": 2066 }, { "epoch": 1.0396975425330812, "grad_norm": 0.18227441948785947, "learning_rate": 7.459578693955413e-07, "loss": 0.2709, "step": 2067 }, { "epoch": 1.040201638311279, "grad_norm": 0.1760139361145817, "learning_rate": 7.457240157487935e-07, "loss": 0.2806, "step": 2068 }, { "epoch": 1.040705734089477, "grad_norm": 0.1795019228044322, "learning_rate": 7.454900912121894e-07, "loss": 0.2631, "step": 2069 }, { "epoch": 1.0412098298676749, "grad_norm": 0.18888478288221203, "learning_rate": 7.452560958532147e-07, "loss": 0.2889, "step": 2070 }, { "epoch": 1.0417139256458727, "grad_norm": 0.21616086911752216, "learning_rate": 7.450220297393756e-07, "loss": 0.2683, "step": 2071 }, { "epoch": 1.0422180214240706, "grad_norm": 0.17939705286161006, "learning_rate": 7.447878929381989e-07, "loss": 0.293, "step": 2072 }, { "epoch": 1.0427221172022685, "grad_norm": 0.1798424204037018, "learning_rate": 7.445536855172313e-07, "loss": 0.2706, "step": 2073 }, { "epoch": 1.0432262129804664, "grad_norm": 0.17088142114120458, "learning_rate": 7.443194075440405e-07, "loss": 0.2819, "step": 2074 }, { "epoch": 1.0437303087586642, "grad_norm": 0.17886842519012602, "learning_rate": 7.44085059086214e-07, "loss": 0.2893, "step": 2075 }, { "epoch": 1.0442344045368621, "grad_norm": 0.1803195665228042, "learning_rate": 7.4385064021136e-07, "loss": 0.2581, "step": 2076 }, { "epoch": 1.0447385003150598, "grad_norm": 0.21531931727822987, "learning_rate": 7.436161509871069e-07, "loss": 0.2852, "step": 2077 }, { "epoch": 1.0452425960932576, "grad_norm": 0.16822216389718422, "learning_rate": 7.433815914811033e-07, "loss": 0.265, "step": 2078 }, { "epoch": 1.0457466918714555, "grad_norm": 0.17669997866886272, "learning_rate": 7.431469617610183e-07, "loss": 0.272, "step": 2079 }, { "epoch": 1.0462507876496534, "grad_norm": 0.17526090755685278, "learning_rate": 7.429122618945409e-07, "loss": 0.2741, "step": 2080 }, { "epoch": 1.0467548834278513, "grad_norm": 0.17590646009524652, "learning_rate": 7.426774919493808e-07, "loss": 0.2825, "step": 2081 }, { "epoch": 1.0472589792060492, "grad_norm": 0.172169218025432, "learning_rate": 7.424426519932676e-07, "loss": 0.2588, "step": 2082 }, { "epoch": 1.047763074984247, "grad_norm": 0.1805765775734755, "learning_rate": 7.422077420939511e-07, "loss": 0.2686, "step": 2083 }, { "epoch": 1.048267170762445, "grad_norm": 0.18419409997013503, "learning_rate": 7.419727623192013e-07, "loss": 0.273, "step": 2084 }, { "epoch": 1.0487712665406428, "grad_norm": 0.18365937796388698, "learning_rate": 7.417377127368087e-07, "loss": 0.2884, "step": 2085 }, { "epoch": 1.0492753623188407, "grad_norm": 0.1690690352871276, "learning_rate": 7.415025934145836e-07, "loss": 0.2711, "step": 2086 }, { "epoch": 1.0497794580970385, "grad_norm": 0.17486780574416283, "learning_rate": 7.412674044203561e-07, "loss": 0.2664, "step": 2087 }, { "epoch": 1.0502835538752362, "grad_norm": 0.1809124255952242, "learning_rate": 7.410321458219771e-07, "loss": 0.291, "step": 2088 }, { "epoch": 1.050787649653434, "grad_norm": 0.17461075528139122, "learning_rate": 7.407968176873169e-07, "loss": 0.2829, "step": 2089 }, { "epoch": 1.051291745431632, "grad_norm": 0.17457876173639864, "learning_rate": 7.405614200842668e-07, "loss": 0.2733, "step": 2090 }, { "epoch": 1.0517958412098298, "grad_norm": 0.1729548254201044, "learning_rate": 7.40325953080737e-07, "loss": 0.2617, "step": 2091 }, { "epoch": 1.0522999369880277, "grad_norm": 0.1731484045045451, "learning_rate": 7.400904167446585e-07, "loss": 0.2717, "step": 2092 }, { "epoch": 1.0528040327662256, "grad_norm": 0.18214823178441017, "learning_rate": 7.39854811143982e-07, "loss": 0.2715, "step": 2093 }, { "epoch": 1.0533081285444235, "grad_norm": 0.21466975211674327, "learning_rate": 7.396191363466785e-07, "loss": 0.2567, "step": 2094 }, { "epoch": 1.0538122243226213, "grad_norm": 0.1830837150368865, "learning_rate": 7.393833924207385e-07, "loss": 0.2669, "step": 2095 }, { "epoch": 1.0543163201008192, "grad_norm": 0.1772829383212441, "learning_rate": 7.391475794341725e-07, "loss": 0.2798, "step": 2096 }, { "epoch": 1.054820415879017, "grad_norm": 0.17883144461072345, "learning_rate": 7.389116974550114e-07, "loss": 0.3041, "step": 2097 }, { "epoch": 1.055324511657215, "grad_norm": 0.16977328934679697, "learning_rate": 7.386757465513055e-07, "loss": 0.2896, "step": 2098 }, { "epoch": 1.0558286074354126, "grad_norm": 0.17447815375668077, "learning_rate": 7.384397267911252e-07, "loss": 0.2813, "step": 2099 }, { "epoch": 1.0563327032136105, "grad_norm": 0.17549836356043094, "learning_rate": 7.382036382425608e-07, "loss": 0.2604, "step": 2100 }, { "epoch": 1.0568367989918084, "grad_norm": 0.168631281997645, "learning_rate": 7.379674809737226e-07, "loss": 0.2595, "step": 2101 }, { "epoch": 1.0573408947700063, "grad_norm": 0.1779153181273789, "learning_rate": 7.377312550527399e-07, "loss": 0.2769, "step": 2102 }, { "epoch": 1.0578449905482041, "grad_norm": 0.17489911047054466, "learning_rate": 7.37494960547763e-07, "loss": 0.2638, "step": 2103 }, { "epoch": 1.058349086326402, "grad_norm": 0.18710665444256072, "learning_rate": 7.372585975269612e-07, "loss": 0.2733, "step": 2104 }, { "epoch": 1.0588531821045999, "grad_norm": 0.16631564764201226, "learning_rate": 7.370221660585238e-07, "loss": 0.2549, "step": 2105 }, { "epoch": 1.0593572778827978, "grad_norm": 0.1763591739795769, "learning_rate": 7.367856662106595e-07, "loss": 0.2724, "step": 2106 }, { "epoch": 1.0598613736609956, "grad_norm": 0.17005232527255398, "learning_rate": 7.365490980515976e-07, "loss": 0.294, "step": 2107 }, { "epoch": 1.0603654694391935, "grad_norm": 0.18645746328153773, "learning_rate": 7.36312461649586e-07, "loss": 0.28, "step": 2108 }, { "epoch": 1.0608695652173914, "grad_norm": 0.17883161921298946, "learning_rate": 7.360757570728934e-07, "loss": 0.2848, "step": 2109 }, { "epoch": 1.061373660995589, "grad_norm": 0.16761850246628487, "learning_rate": 7.358389843898071e-07, "loss": 0.267, "step": 2110 }, { "epoch": 1.061877756773787, "grad_norm": 0.16661816127263798, "learning_rate": 7.356021436686347e-07, "loss": 0.2669, "step": 2111 }, { "epoch": 1.0623818525519848, "grad_norm": 0.17946012130007954, "learning_rate": 7.353652349777033e-07, "loss": 0.2973, "step": 2112 }, { "epoch": 1.0628859483301827, "grad_norm": 0.18375956514983224, "learning_rate": 7.351282583853597e-07, "loss": 0.2679, "step": 2113 }, { "epoch": 1.0633900441083806, "grad_norm": 0.17321969243454696, "learning_rate": 7.348912139599701e-07, "loss": 0.2699, "step": 2114 }, { "epoch": 1.0638941398865784, "grad_norm": 0.17030523274157758, "learning_rate": 7.346541017699204e-07, "loss": 0.2752, "step": 2115 }, { "epoch": 1.0643982356647763, "grad_norm": 0.17855378803648458, "learning_rate": 7.344169218836161e-07, "loss": 0.2634, "step": 2116 }, { "epoch": 1.0649023314429742, "grad_norm": 0.17747877929878342, "learning_rate": 7.341796743694817e-07, "loss": 0.2796, "step": 2117 }, { "epoch": 1.065406427221172, "grad_norm": 0.17074631177901406, "learning_rate": 7.339423592959619e-07, "loss": 0.2741, "step": 2118 }, { "epoch": 1.06591052299937, "grad_norm": 0.16930821017014017, "learning_rate": 7.337049767315207e-07, "loss": 0.2669, "step": 2119 }, { "epoch": 1.0664146187775678, "grad_norm": 0.17690264524984564, "learning_rate": 7.334675267446415e-07, "loss": 0.2725, "step": 2120 }, { "epoch": 1.0669187145557655, "grad_norm": 0.17283246521488307, "learning_rate": 7.33230009403827e-07, "loss": 0.2883, "step": 2121 }, { "epoch": 1.0674228103339634, "grad_norm": 0.1829853318315655, "learning_rate": 7.329924247775997e-07, "loss": 0.266, "step": 2122 }, { "epoch": 1.0679269061121612, "grad_norm": 0.1685321993543116, "learning_rate": 7.327547729345012e-07, "loss": 0.2811, "step": 2123 }, { "epoch": 1.0684310018903591, "grad_norm": 0.1849302770037306, "learning_rate": 7.325170539430924e-07, "loss": 0.2819, "step": 2124 }, { "epoch": 1.068935097668557, "grad_norm": 0.1722656015767197, "learning_rate": 7.32279267871954e-07, "loss": 0.2689, "step": 2125 }, { "epoch": 1.0694391934467549, "grad_norm": 0.1724271053593593, "learning_rate": 7.320414147896857e-07, "loss": 0.2667, "step": 2126 }, { "epoch": 1.0699432892249527, "grad_norm": 0.17269435432280894, "learning_rate": 7.318034947649064e-07, "loss": 0.2799, "step": 2127 }, { "epoch": 1.0704473850031506, "grad_norm": 0.17637659253320054, "learning_rate": 7.31565507866255e-07, "loss": 0.2916, "step": 2128 }, { "epoch": 1.0709514807813485, "grad_norm": 0.1773899855628958, "learning_rate": 7.313274541623891e-07, "loss": 0.2817, "step": 2129 }, { "epoch": 1.0714555765595464, "grad_norm": 0.18431910974801158, "learning_rate": 7.310893337219857e-07, "loss": 0.2909, "step": 2130 }, { "epoch": 1.0719596723377443, "grad_norm": 0.16833157774674223, "learning_rate": 7.30851146613741e-07, "loss": 0.2765, "step": 2131 }, { "epoch": 1.0724637681159421, "grad_norm": 0.17919301749031188, "learning_rate": 7.306128929063705e-07, "loss": 0.2882, "step": 2132 }, { "epoch": 1.0729678638941398, "grad_norm": 0.17270532104058647, "learning_rate": 7.303745726686091e-07, "loss": 0.2781, "step": 2133 }, { "epoch": 1.0734719596723377, "grad_norm": 0.1783614757197418, "learning_rate": 7.301361859692103e-07, "loss": 0.2754, "step": 2134 }, { "epoch": 1.0739760554505355, "grad_norm": 0.17544697010356183, "learning_rate": 7.298977328769476e-07, "loss": 0.2854, "step": 2135 }, { "epoch": 1.0744801512287334, "grad_norm": 0.18870456706038555, "learning_rate": 7.296592134606133e-07, "loss": 0.2766, "step": 2136 }, { "epoch": 1.0749842470069313, "grad_norm": 0.18171100036742838, "learning_rate": 7.294206277890185e-07, "loss": 0.2744, "step": 2137 }, { "epoch": 1.0754883427851292, "grad_norm": 0.1677025505094429, "learning_rate": 7.291819759309936e-07, "loss": 0.2647, "step": 2138 }, { "epoch": 1.075992438563327, "grad_norm": 0.17288124268881677, "learning_rate": 7.289432579553885e-07, "loss": 0.2731, "step": 2139 }, { "epoch": 1.076496534341525, "grad_norm": 0.1703425006119981, "learning_rate": 7.287044739310717e-07, "loss": 0.2795, "step": 2140 }, { "epoch": 1.0770006301197228, "grad_norm": 0.20025547919726422, "learning_rate": 7.284656239269308e-07, "loss": 0.2877, "step": 2141 }, { "epoch": 1.0775047258979207, "grad_norm": 0.17516136688479655, "learning_rate": 7.282267080118727e-07, "loss": 0.2929, "step": 2142 }, { "epoch": 1.0780088216761186, "grad_norm": 0.17474524965964358, "learning_rate": 7.279877262548232e-07, "loss": 0.2768, "step": 2143 }, { "epoch": 1.0785129174543164, "grad_norm": 0.17466518332420242, "learning_rate": 7.277486787247269e-07, "loss": 0.285, "step": 2144 }, { "epoch": 1.079017013232514, "grad_norm": 0.1819043058750424, "learning_rate": 7.275095654905477e-07, "loss": 0.2952, "step": 2145 }, { "epoch": 1.079521109010712, "grad_norm": 0.1737875425062245, "learning_rate": 7.272703866212682e-07, "loss": 0.2827, "step": 2146 }, { "epoch": 1.0800252047889098, "grad_norm": 0.16958237624506534, "learning_rate": 7.2703114218589e-07, "loss": 0.268, "step": 2147 }, { "epoch": 1.0805293005671077, "grad_norm": 0.17257636404154555, "learning_rate": 7.267918322534336e-07, "loss": 0.2971, "step": 2148 }, { "epoch": 1.0810333963453056, "grad_norm": 0.16928675310322286, "learning_rate": 7.265524568929386e-07, "loss": 0.2736, "step": 2149 }, { "epoch": 1.0815374921235035, "grad_norm": 0.1760838964224455, "learning_rate": 7.263130161734632e-07, "loss": 0.2849, "step": 2150 }, { "epoch": 1.0820415879017014, "grad_norm": 0.18328166077952487, "learning_rate": 7.260735101640845e-07, "loss": 0.258, "step": 2151 }, { "epoch": 1.0825456836798992, "grad_norm": 0.1739117009909244, "learning_rate": 7.258339389338987e-07, "loss": 0.2924, "step": 2152 }, { "epoch": 1.083049779458097, "grad_norm": 0.1781115926870173, "learning_rate": 7.255943025520203e-07, "loss": 0.2596, "step": 2153 }, { "epoch": 1.083553875236295, "grad_norm": 0.16844391904359102, "learning_rate": 7.253546010875832e-07, "loss": 0.2703, "step": 2154 }, { "epoch": 1.0840579710144929, "grad_norm": 0.3278887015033608, "learning_rate": 7.251148346097398e-07, "loss": 0.2779, "step": 2155 }, { "epoch": 1.0845620667926905, "grad_norm": 0.18990927841452848, "learning_rate": 7.248750031876609e-07, "loss": 0.2946, "step": 2156 }, { "epoch": 1.0850661625708884, "grad_norm": 0.17850530711366283, "learning_rate": 7.246351068905368e-07, "loss": 0.2875, "step": 2157 }, { "epoch": 1.0855702583490863, "grad_norm": 0.17728631017755883, "learning_rate": 7.243951457875758e-07, "loss": 0.2818, "step": 2158 }, { "epoch": 1.0860743541272841, "grad_norm": 0.1774048928579683, "learning_rate": 7.241551199480051e-07, "loss": 0.3007, "step": 2159 }, { "epoch": 1.086578449905482, "grad_norm": 0.17200567169289427, "learning_rate": 7.239150294410712e-07, "loss": 0.2717, "step": 2160 }, { "epoch": 1.08708254568368, "grad_norm": 0.17241132084381322, "learning_rate": 7.236748743360384e-07, "loss": 0.2668, "step": 2161 }, { "epoch": 1.0875866414618778, "grad_norm": 0.17166550486003698, "learning_rate": 7.234346547021896e-07, "loss": 0.2995, "step": 2162 }, { "epoch": 1.0880907372400757, "grad_norm": 0.17146325451469707, "learning_rate": 7.231943706088273e-07, "loss": 0.2833, "step": 2163 }, { "epoch": 1.0885948330182735, "grad_norm": 0.17437078445725077, "learning_rate": 7.229540221252716e-07, "loss": 0.2667, "step": 2164 }, { "epoch": 1.0890989287964714, "grad_norm": 0.17975096501612886, "learning_rate": 7.227136093208617e-07, "loss": 0.2789, "step": 2165 }, { "epoch": 1.0896030245746693, "grad_norm": 0.17177377044930098, "learning_rate": 7.22473132264955e-07, "loss": 0.2712, "step": 2166 }, { "epoch": 1.090107120352867, "grad_norm": 0.16926098260953162, "learning_rate": 7.22232591026928e-07, "loss": 0.2655, "step": 2167 }, { "epoch": 1.0906112161310648, "grad_norm": 0.1813817491695504, "learning_rate": 7.21991985676175e-07, "loss": 0.2782, "step": 2168 }, { "epoch": 1.0911153119092627, "grad_norm": 0.1740637250383412, "learning_rate": 7.217513162821094e-07, "loss": 0.282, "step": 2169 }, { "epoch": 1.0916194076874606, "grad_norm": 0.16909096484224836, "learning_rate": 7.215105829141627e-07, "loss": 0.2777, "step": 2170 }, { "epoch": 1.0921235034656585, "grad_norm": 0.19169602165222183, "learning_rate": 7.21269785641785e-07, "loss": 0.2746, "step": 2171 }, { "epoch": 1.0926275992438563, "grad_norm": 0.17606098979705256, "learning_rate": 7.210289245344447e-07, "loss": 0.2796, "step": 2172 }, { "epoch": 1.0931316950220542, "grad_norm": 0.17081810769761613, "learning_rate": 7.207879996616291e-07, "loss": 0.272, "step": 2173 }, { "epoch": 1.093635790800252, "grad_norm": 0.18390464264943393, "learning_rate": 7.205470110928431e-07, "loss": 0.2598, "step": 2174 }, { "epoch": 1.09413988657845, "grad_norm": 0.1752617305053687, "learning_rate": 7.203059588976107e-07, "loss": 0.2702, "step": 2175 }, { "epoch": 1.0946439823566478, "grad_norm": 0.17985065994012292, "learning_rate": 7.20064843145474e-07, "loss": 0.279, "step": 2176 }, { "epoch": 1.0951480781348457, "grad_norm": 0.19788977536594046, "learning_rate": 7.198236639059932e-07, "loss": 0.2763, "step": 2177 }, { "epoch": 1.0956521739130434, "grad_norm": 0.17078468603904942, "learning_rate": 7.19582421248747e-07, "loss": 0.2696, "step": 2178 }, { "epoch": 1.0961562696912412, "grad_norm": 0.17485440247279033, "learning_rate": 7.193411152433327e-07, "loss": 0.2703, "step": 2179 }, { "epoch": 1.0966603654694391, "grad_norm": 0.17440915689799694, "learning_rate": 7.190997459593651e-07, "loss": 0.272, "step": 2180 }, { "epoch": 1.097164461247637, "grad_norm": 0.1798376956782012, "learning_rate": 7.188583134664783e-07, "loss": 0.2857, "step": 2181 }, { "epoch": 1.0976685570258349, "grad_norm": 0.17843642267892243, "learning_rate": 7.186168178343239e-07, "loss": 0.3012, "step": 2182 }, { "epoch": 1.0981726528040328, "grad_norm": 0.18526971863081118, "learning_rate": 7.183752591325716e-07, "loss": 0.2691, "step": 2183 }, { "epoch": 1.0986767485822306, "grad_norm": 0.19144116070312203, "learning_rate": 7.181336374309098e-07, "loss": 0.2925, "step": 2184 }, { "epoch": 1.0991808443604285, "grad_norm": 0.1727996053307209, "learning_rate": 7.17891952799045e-07, "loss": 0.2745, "step": 2185 }, { "epoch": 1.0996849401386264, "grad_norm": 0.1706944820187512, "learning_rate": 7.176502053067016e-07, "loss": 0.2773, "step": 2186 }, { "epoch": 1.1001890359168243, "grad_norm": 0.17180090645464358, "learning_rate": 7.17408395023622e-07, "loss": 0.2812, "step": 2187 }, { "epoch": 1.1006931316950221, "grad_norm": 0.18042375948484118, "learning_rate": 7.171665220195675e-07, "loss": 0.2832, "step": 2188 }, { "epoch": 1.1011972274732198, "grad_norm": 0.17833078926842466, "learning_rate": 7.169245863643165e-07, "loss": 0.2781, "step": 2189 }, { "epoch": 1.1011972274732198, "eval_loss": 0.3109322786331177, "eval_runtime": 18.3883, "eval_samples_per_second": 46.497, "eval_steps_per_second": 0.979, "step": 2189 }, { "epoch": 1.1017013232514177, "grad_norm": 0.17668710642540988, "learning_rate": 7.166825881276663e-07, "loss": 0.2814, "step": 2190 }, { "epoch": 1.1022054190296156, "grad_norm": 0.17892757631840098, "learning_rate": 7.164405273794315e-07, "loss": 0.287, "step": 2191 }, { "epoch": 1.1027095148078134, "grad_norm": 0.19272913994143642, "learning_rate": 7.161984041894453e-07, "loss": 0.2671, "step": 2192 }, { "epoch": 1.1032136105860113, "grad_norm": 0.1686523316395002, "learning_rate": 7.159562186275589e-07, "loss": 0.281, "step": 2193 }, { "epoch": 1.1037177063642092, "grad_norm": 0.1802469776190519, "learning_rate": 7.157139707636411e-07, "loss": 0.2778, "step": 2194 }, { "epoch": 1.104221802142407, "grad_norm": 0.17372788767408012, "learning_rate": 7.15471660667579e-07, "loss": 0.2766, "step": 2195 }, { "epoch": 1.104725897920605, "grad_norm": 0.17515408548411832, "learning_rate": 7.152292884092776e-07, "loss": 0.2781, "step": 2196 }, { "epoch": 1.1052299936988028, "grad_norm": 0.17120886057523532, "learning_rate": 7.149868540586599e-07, "loss": 0.2804, "step": 2197 }, { "epoch": 1.1057340894770007, "grad_norm": 0.1967834646928095, "learning_rate": 7.147443576856667e-07, "loss": 0.2701, "step": 2198 }, { "epoch": 1.1062381852551986, "grad_norm": 0.17730817500369223, "learning_rate": 7.145017993602562e-07, "loss": 0.2743, "step": 2199 }, { "epoch": 1.1067422810333964, "grad_norm": 0.1963166032968741, "learning_rate": 7.142591791524056e-07, "loss": 0.2837, "step": 2200 }, { "epoch": 1.107246376811594, "grad_norm": 0.17742060612270208, "learning_rate": 7.14016497132109e-07, "loss": 0.2878, "step": 2201 }, { "epoch": 1.107750472589792, "grad_norm": 0.1746362136216461, "learning_rate": 7.137737533693787e-07, "loss": 0.2639, "step": 2202 }, { "epoch": 1.1082545683679899, "grad_norm": 0.17804698672223912, "learning_rate": 7.135309479342449e-07, "loss": 0.2787, "step": 2203 }, { "epoch": 1.1087586641461877, "grad_norm": 0.17579637022355174, "learning_rate": 7.132880808967553e-07, "loss": 0.2842, "step": 2204 }, { "epoch": 1.1092627599243856, "grad_norm": 0.16989385004038, "learning_rate": 7.130451523269757e-07, "loss": 0.2677, "step": 2205 }, { "epoch": 1.1097668557025835, "grad_norm": 0.17551046931385217, "learning_rate": 7.128021622949894e-07, "loss": 0.2797, "step": 2206 }, { "epoch": 1.1102709514807814, "grad_norm": 0.17789389712157294, "learning_rate": 7.125591108708973e-07, "loss": 0.2638, "step": 2207 }, { "epoch": 1.1107750472589792, "grad_norm": 0.1676683446730687, "learning_rate": 7.123159981248187e-07, "loss": 0.2743, "step": 2208 }, { "epoch": 1.1112791430371771, "grad_norm": 0.17269955486897387, "learning_rate": 7.120728241268897e-07, "loss": 0.2753, "step": 2209 }, { "epoch": 1.111783238815375, "grad_norm": 0.1772494652206068, "learning_rate": 7.118295889472648e-07, "loss": 0.2836, "step": 2210 }, { "epoch": 1.1122873345935729, "grad_norm": 0.17063122243632267, "learning_rate": 7.115862926561156e-07, "loss": 0.2727, "step": 2211 }, { "epoch": 1.1127914303717708, "grad_norm": 0.18443201636788845, "learning_rate": 7.113429353236317e-07, "loss": 0.2724, "step": 2212 }, { "epoch": 1.1132955261499684, "grad_norm": 0.17187215404760892, "learning_rate": 7.110995170200203e-07, "loss": 0.2791, "step": 2213 }, { "epoch": 1.1137996219281663, "grad_norm": 0.21059430107866162, "learning_rate": 7.108560378155058e-07, "loss": 0.2753, "step": 2214 }, { "epoch": 1.1143037177063642, "grad_norm": 0.17008648209502905, "learning_rate": 7.106124977803305e-07, "loss": 0.2679, "step": 2215 }, { "epoch": 1.114807813484562, "grad_norm": 0.17428666864442163, "learning_rate": 7.103688969847544e-07, "loss": 0.2788, "step": 2216 }, { "epoch": 1.11531190926276, "grad_norm": 0.16978332708330493, "learning_rate": 7.101252354990547e-07, "loss": 0.2812, "step": 2217 }, { "epoch": 1.1158160050409578, "grad_norm": 0.16763401695592228, "learning_rate": 7.09881513393526e-07, "loss": 0.2718, "step": 2218 }, { "epoch": 1.1163201008191557, "grad_norm": 0.17695900270150158, "learning_rate": 7.09637730738481e-07, "loss": 0.2775, "step": 2219 }, { "epoch": 1.1168241965973535, "grad_norm": 0.17577147289069614, "learning_rate": 7.093938876042495e-07, "loss": 0.2817, "step": 2220 }, { "epoch": 1.1173282923755514, "grad_norm": 0.17875894680716742, "learning_rate": 7.091499840611782e-07, "loss": 0.2822, "step": 2221 }, { "epoch": 1.1178323881537493, "grad_norm": 0.17471488732657428, "learning_rate": 7.089060201796323e-07, "loss": 0.2804, "step": 2222 }, { "epoch": 1.1183364839319472, "grad_norm": 0.17146413279465827, "learning_rate": 7.086619960299936e-07, "loss": 0.2662, "step": 2223 }, { "epoch": 1.1188405797101448, "grad_norm": 0.17497006413256116, "learning_rate": 7.084179116826616e-07, "loss": 0.273, "step": 2224 }, { "epoch": 1.1193446754883427, "grad_norm": 0.17948306341455134, "learning_rate": 7.081737672080533e-07, "loss": 0.2772, "step": 2225 }, { "epoch": 1.1198487712665406, "grad_norm": 0.18647108183544914, "learning_rate": 7.079295626766026e-07, "loss": 0.2788, "step": 2226 }, { "epoch": 1.1203528670447385, "grad_norm": 0.1974823122970526, "learning_rate": 7.076852981587613e-07, "loss": 0.2993, "step": 2227 }, { "epoch": 1.1208569628229363, "grad_norm": 0.17288627565030815, "learning_rate": 7.07440973724998e-07, "loss": 0.2775, "step": 2228 }, { "epoch": 1.1213610586011342, "grad_norm": 0.18565848907871105, "learning_rate": 7.071965894457987e-07, "loss": 0.2763, "step": 2229 }, { "epoch": 1.121865154379332, "grad_norm": 0.18033187048793917, "learning_rate": 7.069521453916669e-07, "loss": 0.2814, "step": 2230 }, { "epoch": 1.12236925015753, "grad_norm": 0.17992401726222884, "learning_rate": 7.067076416331233e-07, "loss": 0.2729, "step": 2231 }, { "epoch": 1.1228733459357279, "grad_norm": 0.17515567963879827, "learning_rate": 7.064630782407053e-07, "loss": 0.2971, "step": 2232 }, { "epoch": 1.1233774417139257, "grad_norm": 0.16996972984196193, "learning_rate": 7.062184552849683e-07, "loss": 0.2752, "step": 2233 }, { "epoch": 1.1238815374921236, "grad_norm": 0.1681151195763795, "learning_rate": 7.059737728364844e-07, "loss": 0.2764, "step": 2234 }, { "epoch": 1.1243856332703213, "grad_norm": 0.17197455617986673, "learning_rate": 7.05729030965843e-07, "loss": 0.2732, "step": 2235 }, { "epoch": 1.1248897290485191, "grad_norm": 0.17380812930247844, "learning_rate": 7.054842297436506e-07, "loss": 0.2713, "step": 2236 }, { "epoch": 1.125393824826717, "grad_norm": 0.17146791708239711, "learning_rate": 7.052393692405308e-07, "loss": 0.274, "step": 2237 }, { "epoch": 1.125897920604915, "grad_norm": 0.17522062714630593, "learning_rate": 7.049944495271244e-07, "loss": 0.291, "step": 2238 }, { "epoch": 1.1264020163831128, "grad_norm": 0.18020842179105392, "learning_rate": 7.047494706740891e-07, "loss": 0.2776, "step": 2239 }, { "epoch": 1.1269061121613106, "grad_norm": 0.17094614470132977, "learning_rate": 7.045044327521e-07, "loss": 0.2774, "step": 2240 }, { "epoch": 1.1274102079395085, "grad_norm": 0.2013051302911893, "learning_rate": 7.042593358318488e-07, "loss": 0.274, "step": 2241 }, { "epoch": 1.1279143037177064, "grad_norm": 0.17297868860952875, "learning_rate": 7.040141799840446e-07, "loss": 0.2651, "step": 2242 }, { "epoch": 1.1284183994959043, "grad_norm": 0.17357556224692539, "learning_rate": 7.037689652794132e-07, "loss": 0.2742, "step": 2243 }, { "epoch": 1.1289224952741022, "grad_norm": 0.1726295931526859, "learning_rate": 7.035236917886977e-07, "loss": 0.2749, "step": 2244 }, { "epoch": 1.1294265910523, "grad_norm": 0.18408306057697696, "learning_rate": 7.032783595826577e-07, "loss": 0.2749, "step": 2245 }, { "epoch": 1.1299306868304977, "grad_norm": 0.1760499835143201, "learning_rate": 7.030329687320704e-07, "loss": 0.2757, "step": 2246 }, { "epoch": 1.1304347826086956, "grad_norm": 0.17625183802064207, "learning_rate": 7.027875193077293e-07, "loss": 0.2794, "step": 2247 }, { "epoch": 1.1309388783868934, "grad_norm": 0.17473737339780746, "learning_rate": 7.02542011380445e-07, "loss": 0.2814, "step": 2248 }, { "epoch": 1.1314429741650913, "grad_norm": 0.17699959608452862, "learning_rate": 7.022964450210451e-07, "loss": 0.2793, "step": 2249 }, { "epoch": 1.1319470699432892, "grad_norm": 0.17541995578274833, "learning_rate": 7.020508203003741e-07, "loss": 0.2669, "step": 2250 }, { "epoch": 1.132451165721487, "grad_norm": 0.1835401757832211, "learning_rate": 7.01805137289293e-07, "loss": 0.2743, "step": 2251 }, { "epoch": 1.132955261499685, "grad_norm": 0.19020483202650845, "learning_rate": 7.015593960586799e-07, "loss": 0.2774, "step": 2252 }, { "epoch": 1.1334593572778828, "grad_norm": 0.17000536309275166, "learning_rate": 7.013135966794295e-07, "loss": 0.2811, "step": 2253 }, { "epoch": 1.1339634530560807, "grad_norm": 0.17801380513763565, "learning_rate": 7.010677392224537e-07, "loss": 0.2935, "step": 2254 }, { "epoch": 1.1344675488342786, "grad_norm": 0.1787152664038223, "learning_rate": 7.008218237586807e-07, "loss": 0.2664, "step": 2255 }, { "epoch": 1.1349716446124765, "grad_norm": 0.17254574236169345, "learning_rate": 7.005758503590555e-07, "loss": 0.2844, "step": 2256 }, { "epoch": 1.1354757403906741, "grad_norm": 0.18663732747974232, "learning_rate": 7.003298190945399e-07, "loss": 0.2615, "step": 2257 }, { "epoch": 1.135979836168872, "grad_norm": 0.18225915688322497, "learning_rate": 7.000837300361127e-07, "loss": 0.273, "step": 2258 }, { "epoch": 1.1364839319470699, "grad_norm": 0.17525558203612415, "learning_rate": 6.998375832547687e-07, "loss": 0.2709, "step": 2259 }, { "epoch": 1.1369880277252677, "grad_norm": 0.17721206051763022, "learning_rate": 6.995913788215198e-07, "loss": 0.2638, "step": 2260 }, { "epoch": 1.1374921235034656, "grad_norm": 0.17478241116943685, "learning_rate": 6.993451168073945e-07, "loss": 0.2773, "step": 2261 }, { "epoch": 1.1379962192816635, "grad_norm": 0.1774428076107305, "learning_rate": 6.990987972834382e-07, "loss": 0.2631, "step": 2262 }, { "epoch": 1.1385003150598614, "grad_norm": 0.17410794951513323, "learning_rate": 6.988524203207117e-07, "loss": 0.2707, "step": 2263 }, { "epoch": 1.1390044108380593, "grad_norm": 0.17441217480295762, "learning_rate": 6.986059859902941e-07, "loss": 0.2868, "step": 2264 }, { "epoch": 1.1395085066162571, "grad_norm": 0.1747291161914758, "learning_rate": 6.983594943632799e-07, "loss": 0.2649, "step": 2265 }, { "epoch": 1.140012602394455, "grad_norm": 0.17307228646736642, "learning_rate": 6.981129455107802e-07, "loss": 0.2861, "step": 2266 }, { "epoch": 1.1405166981726529, "grad_norm": 0.18952720395194383, "learning_rate": 6.978663395039231e-07, "loss": 0.2865, "step": 2267 }, { "epoch": 1.1410207939508505, "grad_norm": 0.17922645708493065, "learning_rate": 6.976196764138526e-07, "loss": 0.2808, "step": 2268 }, { "epoch": 1.1415248897290486, "grad_norm": 0.17827854018956354, "learning_rate": 6.973729563117297e-07, "loss": 0.2667, "step": 2269 }, { "epoch": 1.1420289855072463, "grad_norm": 0.18226122048319768, "learning_rate": 6.971261792687315e-07, "loss": 0.2784, "step": 2270 }, { "epoch": 1.1425330812854442, "grad_norm": 0.1720926208778343, "learning_rate": 6.968793453560518e-07, "loss": 0.2696, "step": 2271 }, { "epoch": 1.143037177063642, "grad_norm": 0.18305708913870564, "learning_rate": 6.966324546449006e-07, "loss": 0.281, "step": 2272 }, { "epoch": 1.14354127284184, "grad_norm": 0.1731336224949858, "learning_rate": 6.963855072065043e-07, "loss": 0.2779, "step": 2273 }, { "epoch": 1.1440453686200378, "grad_norm": 0.1748927423648997, "learning_rate": 6.961385031121057e-07, "loss": 0.2753, "step": 2274 }, { "epoch": 1.1445494643982357, "grad_norm": 0.17333262216089212, "learning_rate": 6.958914424329638e-07, "loss": 0.2885, "step": 2275 }, { "epoch": 1.1450535601764336, "grad_norm": 0.18183632440429925, "learning_rate": 6.956443252403544e-07, "loss": 0.2724, "step": 2276 }, { "epoch": 1.1455576559546314, "grad_norm": 0.17479355568350904, "learning_rate": 6.953971516055691e-07, "loss": 0.2785, "step": 2277 }, { "epoch": 1.1460617517328293, "grad_norm": 0.1785764248499925, "learning_rate": 6.951499215999161e-07, "loss": 0.2818, "step": 2278 }, { "epoch": 1.1465658475110272, "grad_norm": 0.1707075256134137, "learning_rate": 6.949026352947194e-07, "loss": 0.2776, "step": 2279 }, { "epoch": 1.147069943289225, "grad_norm": 0.17454520681763225, "learning_rate": 6.946552927613201e-07, "loss": 0.279, "step": 2280 }, { "epoch": 1.1475740390674227, "grad_norm": 0.16725292243096776, "learning_rate": 6.944078940710743e-07, "loss": 0.2706, "step": 2281 }, { "epoch": 1.1480781348456206, "grad_norm": 0.18158312079337943, "learning_rate": 6.941604392953555e-07, "loss": 0.2857, "step": 2282 }, { "epoch": 1.1485822306238185, "grad_norm": 0.17686176298428924, "learning_rate": 6.939129285055527e-07, "loss": 0.2982, "step": 2283 }, { "epoch": 1.1490863264020164, "grad_norm": 0.16814975626978187, "learning_rate": 6.936653617730712e-07, "loss": 0.2719, "step": 2284 }, { "epoch": 1.1495904221802142, "grad_norm": 0.1759711441539806, "learning_rate": 6.934177391693327e-07, "loss": 0.2744, "step": 2285 }, { "epoch": 1.150094517958412, "grad_norm": 0.17511838680958677, "learning_rate": 6.931700607657744e-07, "loss": 0.2739, "step": 2286 }, { "epoch": 1.15059861373661, "grad_norm": 0.17740624835254679, "learning_rate": 6.929223266338504e-07, "loss": 0.269, "step": 2287 }, { "epoch": 1.1511027095148079, "grad_norm": 0.1681514431068215, "learning_rate": 6.926745368450301e-07, "loss": 0.2749, "step": 2288 }, { "epoch": 1.1516068052930057, "grad_norm": 0.1691426996495545, "learning_rate": 6.924266914707995e-07, "loss": 0.2644, "step": 2289 }, { "epoch": 1.1521109010712036, "grad_norm": 0.1790330541665502, "learning_rate": 6.921787905826605e-07, "loss": 0.2823, "step": 2290 }, { "epoch": 1.1526149968494015, "grad_norm": 0.1734370061342181, "learning_rate": 6.919308342521308e-07, "loss": 0.2764, "step": 2291 }, { "epoch": 1.1531190926275992, "grad_norm": 0.18865399840145108, "learning_rate": 6.916828225507443e-07, "loss": 0.2811, "step": 2292 }, { "epoch": 1.153623188405797, "grad_norm": 0.17616068936635845, "learning_rate": 6.914347555500513e-07, "loss": 0.2879, "step": 2293 }, { "epoch": 1.154127284183995, "grad_norm": 0.17879090160590586, "learning_rate": 6.911866333216169e-07, "loss": 0.2717, "step": 2294 }, { "epoch": 1.1546313799621928, "grad_norm": 0.18207500480662475, "learning_rate": 6.909384559370233e-07, "loss": 0.275, "step": 2295 }, { "epoch": 1.1551354757403907, "grad_norm": 0.17522149059459446, "learning_rate": 6.906902234678678e-07, "loss": 0.2688, "step": 2296 }, { "epoch": 1.1556395715185885, "grad_norm": 0.1938343793990561, "learning_rate": 6.904419359857641e-07, "loss": 0.2925, "step": 2297 }, { "epoch": 1.1561436672967864, "grad_norm": 0.20218185185326207, "learning_rate": 6.901935935623415e-07, "loss": 0.2804, "step": 2298 }, { "epoch": 1.1566477630749843, "grad_norm": 0.17255949907153817, "learning_rate": 6.899451962692454e-07, "loss": 0.2717, "step": 2299 }, { "epoch": 1.1571518588531822, "grad_norm": 0.17432970101357595, "learning_rate": 6.896967441781368e-07, "loss": 0.2609, "step": 2300 }, { "epoch": 1.15765595463138, "grad_norm": 0.17725867905667875, "learning_rate": 6.894482373606927e-07, "loss": 0.2877, "step": 2301 }, { "epoch": 1.158160050409578, "grad_norm": 0.1817120717141377, "learning_rate": 6.891996758886058e-07, "loss": 0.2776, "step": 2302 }, { "epoch": 1.1586641461877756, "grad_norm": 0.17327028373633524, "learning_rate": 6.889510598335843e-07, "loss": 0.2491, "step": 2303 }, { "epoch": 1.1591682419659735, "grad_norm": 0.17396621817828842, "learning_rate": 6.887023892673525e-07, "loss": 0.2872, "step": 2304 }, { "epoch": 1.1596723377441713, "grad_norm": 0.17083195902984658, "learning_rate": 6.884536642616504e-07, "loss": 0.2714, "step": 2305 }, { "epoch": 1.1601764335223692, "grad_norm": 0.17991670145657968, "learning_rate": 6.882048848882335e-07, "loss": 0.2973, "step": 2306 }, { "epoch": 1.160680529300567, "grad_norm": 0.17544125470825095, "learning_rate": 6.879560512188733e-07, "loss": 0.2767, "step": 2307 }, { "epoch": 1.161184625078765, "grad_norm": 0.18423859716455426, "learning_rate": 6.877071633253566e-07, "loss": 0.2763, "step": 2308 }, { "epoch": 1.1616887208569628, "grad_norm": 0.18681981813114995, "learning_rate": 6.874582212794861e-07, "loss": 0.2789, "step": 2309 }, { "epoch": 1.1621928166351607, "grad_norm": 0.17640590031489775, "learning_rate": 6.872092251530799e-07, "loss": 0.288, "step": 2310 }, { "epoch": 1.1626969124133586, "grad_norm": 0.17113861686922094, "learning_rate": 6.869601750179721e-07, "loss": 0.2721, "step": 2311 }, { "epoch": 1.1632010081915565, "grad_norm": 0.1713692791805304, "learning_rate": 6.867110709460118e-07, "loss": 0.2767, "step": 2312 }, { "epoch": 1.1637051039697544, "grad_norm": 0.1729727527955606, "learning_rate": 6.864619130090642e-07, "loss": 0.298, "step": 2313 }, { "epoch": 1.164209199747952, "grad_norm": 0.18317780598435884, "learning_rate": 6.862127012790098e-07, "loss": 0.271, "step": 2314 }, { "epoch": 1.1647132955261499, "grad_norm": 0.16947854182681354, "learning_rate": 6.859634358277445e-07, "loss": 0.2731, "step": 2315 }, { "epoch": 1.1652173913043478, "grad_norm": 0.1764313134129964, "learning_rate": 6.8571411672718e-07, "loss": 0.2898, "step": 2316 }, { "epoch": 1.1657214870825456, "grad_norm": 0.18128405805703027, "learning_rate": 6.854647440492434e-07, "loss": 0.2713, "step": 2317 }, { "epoch": 1.1662255828607435, "grad_norm": 0.17302921586300837, "learning_rate": 6.852153178658768e-07, "loss": 0.2791, "step": 2318 }, { "epoch": 1.1667296786389414, "grad_norm": 0.18238569158548426, "learning_rate": 6.849658382490386e-07, "loss": 0.2637, "step": 2319 }, { "epoch": 1.1672337744171393, "grad_norm": 0.20560524437351874, "learning_rate": 6.847163052707017e-07, "loss": 0.2886, "step": 2320 }, { "epoch": 1.1677378701953371, "grad_norm": 0.17279730398402016, "learning_rate": 6.84466719002855e-07, "loss": 0.276, "step": 2321 }, { "epoch": 1.168241965973535, "grad_norm": 0.1748627881177702, "learning_rate": 6.842170795175025e-07, "loss": 0.2777, "step": 2322 }, { "epoch": 1.168746061751733, "grad_norm": 0.20772510184287282, "learning_rate": 6.839673868866639e-07, "loss": 0.2745, "step": 2323 }, { "epoch": 1.1692501575299308, "grad_norm": 0.1703532830956251, "learning_rate": 6.837176411823738e-07, "loss": 0.2631, "step": 2324 }, { "epoch": 1.1697542533081284, "grad_norm": 0.1793111095271819, "learning_rate": 6.834678424766822e-07, "loss": 0.2818, "step": 2325 }, { "epoch": 1.1702583490863263, "grad_norm": 0.17008601463744455, "learning_rate": 6.832179908416546e-07, "loss": 0.2637, "step": 2326 }, { "epoch": 1.1707624448645242, "grad_norm": 0.16973624705065182, "learning_rate": 6.829680863493717e-07, "loss": 0.3012, "step": 2327 }, { "epoch": 1.171266540642722, "grad_norm": 0.18216274445798897, "learning_rate": 6.82718129071929e-07, "loss": 0.2887, "step": 2328 }, { "epoch": 1.17177063642092, "grad_norm": 0.18182861892057042, "learning_rate": 6.824681190814383e-07, "loss": 0.2825, "step": 2329 }, { "epoch": 1.1722747321991178, "grad_norm": 0.18263373935726934, "learning_rate": 6.822180564500254e-07, "loss": 0.294, "step": 2330 }, { "epoch": 1.1727788279773157, "grad_norm": 0.17351672150821956, "learning_rate": 6.81967941249832e-07, "loss": 0.2799, "step": 2331 }, { "epoch": 1.1732829237555136, "grad_norm": 0.17364729038219348, "learning_rate": 6.817177735530149e-07, "loss": 0.2664, "step": 2332 }, { "epoch": 1.1737870195337115, "grad_norm": 0.1808124777048507, "learning_rate": 6.814675534317457e-07, "loss": 0.2714, "step": 2333 }, { "epoch": 1.1742911153119093, "grad_norm": 0.180877239519155, "learning_rate": 6.812172809582114e-07, "loss": 0.2828, "step": 2334 }, { "epoch": 1.1747952110901072, "grad_norm": 0.17380368919234485, "learning_rate": 6.809669562046142e-07, "loss": 0.2737, "step": 2335 }, { "epoch": 1.1752993068683049, "grad_norm": 0.19015792157284472, "learning_rate": 6.807165792431712e-07, "loss": 0.2909, "step": 2336 }, { "epoch": 1.175803402646503, "grad_norm": 0.1739820937338746, "learning_rate": 6.804661501461146e-07, "loss": 0.2797, "step": 2337 }, { "epoch": 1.1763074984247006, "grad_norm": 0.17829150310208366, "learning_rate": 6.802156689856914e-07, "loss": 0.272, "step": 2338 }, { "epoch": 1.1768115942028985, "grad_norm": 0.17618058591368718, "learning_rate": 6.799651358341644e-07, "loss": 0.259, "step": 2339 }, { "epoch": 1.1773156899810964, "grad_norm": 0.16972502373995604, "learning_rate": 6.797145507638103e-07, "loss": 0.2758, "step": 2340 }, { "epoch": 1.1778197857592942, "grad_norm": 0.17376894673770923, "learning_rate": 6.794639138469215e-07, "loss": 0.2598, "step": 2341 }, { "epoch": 1.1783238815374921, "grad_norm": 0.16890243175150166, "learning_rate": 6.792132251558057e-07, "loss": 0.2679, "step": 2342 }, { "epoch": 1.17882797731569, "grad_norm": 0.16892397355410096, "learning_rate": 6.789624847627842e-07, "loss": 0.2713, "step": 2343 }, { "epoch": 1.1793320730938879, "grad_norm": 0.18081668386093178, "learning_rate": 6.787116927401947e-07, "loss": 0.2839, "step": 2344 }, { "epoch": 1.1798361688720858, "grad_norm": 0.1785462609641959, "learning_rate": 6.784608491603887e-07, "loss": 0.2767, "step": 2345 }, { "epoch": 1.1803402646502836, "grad_norm": 0.17328874161653496, "learning_rate": 6.782099540957334e-07, "loss": 0.2792, "step": 2346 }, { "epoch": 1.1808443604284813, "grad_norm": 0.17812431021686165, "learning_rate": 6.779590076186103e-07, "loss": 0.2756, "step": 2347 }, { "epoch": 1.1813484562066794, "grad_norm": 0.18026369211177573, "learning_rate": 6.777080098014157e-07, "loss": 0.2829, "step": 2348 }, { "epoch": 1.181852551984877, "grad_norm": 0.17224760523581167, "learning_rate": 6.774569607165612e-07, "loss": 0.2715, "step": 2349 }, { "epoch": 1.182356647763075, "grad_norm": 0.17446195179033158, "learning_rate": 6.772058604364728e-07, "loss": 0.2816, "step": 2350 }, { "epoch": 1.1828607435412728, "grad_norm": 0.18401389185741523, "learning_rate": 6.769547090335915e-07, "loss": 0.2756, "step": 2351 }, { "epoch": 1.1833648393194707, "grad_norm": 0.17710337017025912, "learning_rate": 6.767035065803728e-07, "loss": 0.2662, "step": 2352 }, { "epoch": 1.1838689350976686, "grad_norm": 0.17209284528655844, "learning_rate": 6.76452253149287e-07, "loss": 0.2643, "step": 2353 }, { "epoch": 1.1843730308758664, "grad_norm": 0.19313801740333889, "learning_rate": 6.762009488128193e-07, "loss": 0.2859, "step": 2354 }, { "epoch": 1.1848771266540643, "grad_norm": 0.185715764727419, "learning_rate": 6.759495936434694e-07, "loss": 0.2778, "step": 2355 }, { "epoch": 1.1853812224322622, "grad_norm": 0.18586478407699625, "learning_rate": 6.756981877137515e-07, "loss": 0.2781, "step": 2356 }, { "epoch": 1.18588531821046, "grad_norm": 0.18286265156559076, "learning_rate": 6.754467310961951e-07, "loss": 0.2941, "step": 2357 }, { "epoch": 1.186389413988658, "grad_norm": 0.1827838570028727, "learning_rate": 6.751952238633435e-07, "loss": 0.2799, "step": 2358 }, { "epoch": 1.1868935097668558, "grad_norm": 0.17564052156297236, "learning_rate": 6.74943666087755e-07, "loss": 0.2859, "step": 2359 }, { "epoch": 1.1873976055450535, "grad_norm": 0.17276098802425446, "learning_rate": 6.746920578420027e-07, "loss": 0.273, "step": 2360 }, { "epoch": 1.1879017013232513, "grad_norm": 0.16583786340074003, "learning_rate": 6.744403991986737e-07, "loss": 0.2751, "step": 2361 }, { "epoch": 1.1884057971014492, "grad_norm": 0.1758298830227382, "learning_rate": 6.741886902303703e-07, "loss": 0.2817, "step": 2362 }, { "epoch": 1.188909892879647, "grad_norm": 0.1786122710003466, "learning_rate": 6.739369310097087e-07, "loss": 0.2746, "step": 2363 }, { "epoch": 1.189413988657845, "grad_norm": 0.1725746663086315, "learning_rate": 6.7368512160932e-07, "loss": 0.2697, "step": 2364 }, { "epoch": 1.1899180844360429, "grad_norm": 0.17766797524511566, "learning_rate": 6.734332621018497e-07, "loss": 0.2731, "step": 2365 }, { "epoch": 1.1904221802142407, "grad_norm": 0.1773919783826711, "learning_rate": 6.731813525599576e-07, "loss": 0.2811, "step": 2366 }, { "epoch": 1.1909262759924386, "grad_norm": 0.18244881573424246, "learning_rate": 6.72929393056318e-07, "loss": 0.2733, "step": 2367 }, { "epoch": 1.1914303717706365, "grad_norm": 0.17285562925317308, "learning_rate": 6.7267738366362e-07, "loss": 0.2691, "step": 2368 }, { "epoch": 1.1919344675488344, "grad_norm": 0.18991372620629085, "learning_rate": 6.724253244545663e-07, "loss": 0.2787, "step": 2369 }, { "epoch": 1.1924385633270322, "grad_norm": 0.17368277958423853, "learning_rate": 6.721732155018747e-07, "loss": 0.2746, "step": 2370 }, { "epoch": 1.19294265910523, "grad_norm": 0.17143055764086812, "learning_rate": 6.719210568782768e-07, "loss": 0.2774, "step": 2371 }, { "epoch": 1.1934467548834278, "grad_norm": 0.1723446011688051, "learning_rate": 6.716688486565192e-07, "loss": 0.2615, "step": 2372 }, { "epoch": 1.1939508506616257, "grad_norm": 0.17415883235396237, "learning_rate": 6.714165909093621e-07, "loss": 0.2904, "step": 2373 }, { "epoch": 1.1944549464398235, "grad_norm": 0.17281066848566462, "learning_rate": 6.711642837095804e-07, "loss": 0.2629, "step": 2374 }, { "epoch": 1.1949590422180214, "grad_norm": 0.17706321838125264, "learning_rate": 6.709119271299631e-07, "loss": 0.2791, "step": 2375 }, { "epoch": 1.1954631379962193, "grad_norm": 0.17409373969496145, "learning_rate": 6.706595212433137e-07, "loss": 0.286, "step": 2376 }, { "epoch": 1.1959672337744172, "grad_norm": 0.17363748305455576, "learning_rate": 6.704070661224496e-07, "loss": 0.2809, "step": 2377 }, { "epoch": 1.196471329552615, "grad_norm": 0.17224099285294484, "learning_rate": 6.701545618402025e-07, "loss": 0.2696, "step": 2378 }, { "epoch": 1.196975425330813, "grad_norm": 0.17472439778273308, "learning_rate": 6.699020084694183e-07, "loss": 0.2719, "step": 2379 }, { "epoch": 1.1974795211090108, "grad_norm": 0.17609087999551598, "learning_rate": 6.69649406082957e-07, "loss": 0.2687, "step": 2380 }, { "epoch": 1.1979836168872087, "grad_norm": 0.17688131759879658, "learning_rate": 6.693967547536932e-07, "loss": 0.2839, "step": 2381 }, { "epoch": 1.1984877126654063, "grad_norm": 0.1801110500802134, "learning_rate": 6.69144054554515e-07, "loss": 0.2898, "step": 2382 }, { "epoch": 1.1989918084436042, "grad_norm": 0.17434074905342395, "learning_rate": 6.688913055583247e-07, "loss": 0.2673, "step": 2383 }, { "epoch": 1.199495904221802, "grad_norm": 0.18206024523654654, "learning_rate": 6.686385078380392e-07, "loss": 0.2817, "step": 2384 }, { "epoch": 1.2, "grad_norm": 0.1875715751646641, "learning_rate": 6.683856614665887e-07, "loss": 0.2694, "step": 2385 }, { "epoch": 1.2005040957781978, "grad_norm": 0.17720280812976338, "learning_rate": 6.68132766516918e-07, "loss": 0.293, "step": 2386 }, { "epoch": 1.2010081915563957, "grad_norm": 0.1827446975631408, "learning_rate": 6.678798230619856e-07, "loss": 0.2607, "step": 2387 }, { "epoch": 1.2015122873345936, "grad_norm": 0.18309536687811928, "learning_rate": 6.676268311747644e-07, "loss": 0.2776, "step": 2388 }, { "epoch": 1.2015122873345936, "eval_loss": 0.30996033549308777, "eval_runtime": 18.406, "eval_samples_per_second": 46.452, "eval_steps_per_second": 0.978, "step": 2388 }, { "epoch": 1.2020163831127915, "grad_norm": 0.17691733319401884, "learning_rate": 6.673737909282406e-07, "loss": 0.2905, "step": 2389 }, { "epoch": 1.2025204788909893, "grad_norm": 0.17048139269250298, "learning_rate": 6.671207023954151e-07, "loss": 0.2683, "step": 2390 }, { "epoch": 1.2030245746691872, "grad_norm": 0.18100250219396277, "learning_rate": 6.66867565649302e-07, "loss": 0.2676, "step": 2391 }, { "epoch": 1.203528670447385, "grad_norm": 0.1723576811925902, "learning_rate": 6.666143807629302e-07, "loss": 0.2808, "step": 2392 }, { "epoch": 1.2040327662255828, "grad_norm": 0.17272371476581297, "learning_rate": 6.663611478093415e-07, "loss": 0.2668, "step": 2393 }, { "epoch": 1.2045368620037806, "grad_norm": 0.1852834091042434, "learning_rate": 6.661078668615922e-07, "loss": 0.276, "step": 2394 }, { "epoch": 1.2050409577819785, "grad_norm": 0.17253248810106345, "learning_rate": 6.658545379927523e-07, "loss": 0.2737, "step": 2395 }, { "epoch": 1.2055450535601764, "grad_norm": 0.17189335242974707, "learning_rate": 6.656011612759056e-07, "loss": 0.2671, "step": 2396 }, { "epoch": 1.2060491493383743, "grad_norm": 0.17237484748444734, "learning_rate": 6.653477367841497e-07, "loss": 0.2702, "step": 2397 }, { "epoch": 1.2065532451165721, "grad_norm": 0.17030567087724324, "learning_rate": 6.65094264590596e-07, "loss": 0.2688, "step": 2398 }, { "epoch": 1.20705734089477, "grad_norm": 0.17311851732438963, "learning_rate": 6.648407447683698e-07, "loss": 0.2945, "step": 2399 }, { "epoch": 1.207561436672968, "grad_norm": 0.1868284113694255, "learning_rate": 6.645871773906098e-07, "loss": 0.2685, "step": 2400 }, { "epoch": 1.2080655324511658, "grad_norm": 0.1797570381187558, "learning_rate": 6.643335625304687e-07, "loss": 0.2778, "step": 2401 }, { "epoch": 1.2085696282293636, "grad_norm": 0.1736947065045101, "learning_rate": 6.640799002611127e-07, "loss": 0.2701, "step": 2402 }, { "epoch": 1.2090737240075615, "grad_norm": 0.16988523916524562, "learning_rate": 6.638261906557219e-07, "loss": 0.2716, "step": 2403 }, { "epoch": 1.2095778197857592, "grad_norm": 0.17637575120619103, "learning_rate": 6.635724337874902e-07, "loss": 0.2792, "step": 2404 }, { "epoch": 1.210081915563957, "grad_norm": 0.18457061836282274, "learning_rate": 6.633186297296244e-07, "loss": 0.2726, "step": 2405 }, { "epoch": 1.210586011342155, "grad_norm": 0.1835538319807229, "learning_rate": 6.630647785553456e-07, "loss": 0.2778, "step": 2406 }, { "epoch": 1.2110901071203528, "grad_norm": 0.17108814635241743, "learning_rate": 6.628108803378884e-07, "loss": 0.2596, "step": 2407 }, { "epoch": 1.2115942028985507, "grad_norm": 0.17109777831078982, "learning_rate": 6.625569351505008e-07, "loss": 0.2799, "step": 2408 }, { "epoch": 1.2120982986767486, "grad_norm": 0.17949433198481227, "learning_rate": 6.623029430664444e-07, "loss": 0.2717, "step": 2409 }, { "epoch": 1.2126023944549464, "grad_norm": 0.18703173955181984, "learning_rate": 6.620489041589942e-07, "loss": 0.2734, "step": 2410 }, { "epoch": 1.2131064902331443, "grad_norm": 0.16778996136487268, "learning_rate": 6.617948185014392e-07, "loss": 0.2673, "step": 2411 }, { "epoch": 1.2136105860113422, "grad_norm": 0.17151013082627964, "learning_rate": 6.615406861670811e-07, "loss": 0.2848, "step": 2412 }, { "epoch": 1.21411468178954, "grad_norm": 0.17205458561571332, "learning_rate": 6.612865072292359e-07, "loss": 0.2788, "step": 2413 }, { "epoch": 1.214618777567738, "grad_norm": 0.1697154843584016, "learning_rate": 6.610322817612326e-07, "loss": 0.2782, "step": 2414 }, { "epoch": 1.2151228733459356, "grad_norm": 0.17155395420115768, "learning_rate": 6.607780098364133e-07, "loss": 0.2757, "step": 2415 }, { "epoch": 1.2156269691241337, "grad_norm": 0.17914770936359822, "learning_rate": 6.605236915281343e-07, "loss": 0.2625, "step": 2416 }, { "epoch": 1.2161310649023314, "grad_norm": 0.17886990627397134, "learning_rate": 6.602693269097646e-07, "loss": 0.2826, "step": 2417 }, { "epoch": 1.2166351606805292, "grad_norm": 0.17137302898724377, "learning_rate": 6.600149160546868e-07, "loss": 0.2654, "step": 2418 }, { "epoch": 1.2171392564587271, "grad_norm": 0.16812006244582586, "learning_rate": 6.597604590362972e-07, "loss": 0.2693, "step": 2419 }, { "epoch": 1.217643352236925, "grad_norm": 0.1967968966854081, "learning_rate": 6.595059559280047e-07, "loss": 0.2851, "step": 2420 }, { "epoch": 1.2181474480151229, "grad_norm": 0.18024230166349509, "learning_rate": 6.592514068032321e-07, "loss": 0.2802, "step": 2421 }, { "epoch": 1.2186515437933207, "grad_norm": 0.17680540350469492, "learning_rate": 6.58996811735415e-07, "loss": 0.267, "step": 2422 }, { "epoch": 1.2191556395715186, "grad_norm": 0.17123047011311054, "learning_rate": 6.587421707980027e-07, "loss": 0.2691, "step": 2423 }, { "epoch": 1.2196597353497165, "grad_norm": 0.1817858622939275, "learning_rate": 6.584874840644575e-07, "loss": 0.2695, "step": 2424 }, { "epoch": 1.2201638311279144, "grad_norm": 0.17367383253762286, "learning_rate": 6.582327516082549e-07, "loss": 0.2836, "step": 2425 }, { "epoch": 1.2206679269061123, "grad_norm": 0.17880252652432166, "learning_rate": 6.579779735028836e-07, "loss": 0.2733, "step": 2426 }, { "epoch": 1.2211720226843101, "grad_norm": 0.18216013842602857, "learning_rate": 6.577231498218457e-07, "loss": 0.2563, "step": 2427 }, { "epoch": 1.2216761184625078, "grad_norm": 0.17923344984805104, "learning_rate": 6.574682806386559e-07, "loss": 0.2727, "step": 2428 }, { "epoch": 1.2221802142407057, "grad_norm": 0.17305807088265074, "learning_rate": 6.572133660268428e-07, "loss": 0.2786, "step": 2429 }, { "epoch": 1.2226843100189035, "grad_norm": 0.1826130969086575, "learning_rate": 6.569584060599475e-07, "loss": 0.2639, "step": 2430 }, { "epoch": 1.2231884057971014, "grad_norm": 0.17180169078282023, "learning_rate": 6.567034008115242e-07, "loss": 0.2567, "step": 2431 }, { "epoch": 1.2236925015752993, "grad_norm": 0.17999665867139894, "learning_rate": 6.564483503551406e-07, "loss": 0.2868, "step": 2432 }, { "epoch": 1.2241965973534972, "grad_norm": 0.1734485518145704, "learning_rate": 6.56193254764377e-07, "loss": 0.2842, "step": 2433 }, { "epoch": 1.224700693131695, "grad_norm": 0.19524933134868758, "learning_rate": 6.55938114112827e-07, "loss": 0.2805, "step": 2434 }, { "epoch": 1.225204788909893, "grad_norm": 0.3859143312648577, "learning_rate": 6.556829284740972e-07, "loss": 0.2793, "step": 2435 }, { "epoch": 1.2257088846880908, "grad_norm": 0.16949921412202032, "learning_rate": 6.554276979218069e-07, "loss": 0.2633, "step": 2436 }, { "epoch": 1.2262129804662887, "grad_norm": 0.17509147431622993, "learning_rate": 6.551724225295885e-07, "loss": 0.2585, "step": 2437 }, { "epoch": 1.2267170762444866, "grad_norm": 0.18102430967796426, "learning_rate": 6.549171023710874e-07, "loss": 0.2568, "step": 2438 }, { "epoch": 1.2272211720226842, "grad_norm": 0.17595819751957628, "learning_rate": 6.54661737519962e-07, "loss": 0.2678, "step": 2439 }, { "epoch": 1.227725267800882, "grad_norm": 0.18034785174616086, "learning_rate": 6.544063280498834e-07, "loss": 0.286, "step": 2440 }, { "epoch": 1.22822936357908, "grad_norm": 0.1751845129218413, "learning_rate": 6.541508740345357e-07, "loss": 0.2704, "step": 2441 }, { "epoch": 1.2287334593572778, "grad_norm": 0.1707508154159352, "learning_rate": 6.538953755476155e-07, "loss": 0.2657, "step": 2442 }, { "epoch": 1.2292375551354757, "grad_norm": 0.17571528214389306, "learning_rate": 6.53639832662833e-07, "loss": 0.2737, "step": 2443 }, { "epoch": 1.2297416509136736, "grad_norm": 0.1767996956782402, "learning_rate": 6.533842454539105e-07, "loss": 0.2692, "step": 2444 }, { "epoch": 1.2302457466918715, "grad_norm": 0.17677555995052904, "learning_rate": 6.531286139945834e-07, "loss": 0.2747, "step": 2445 }, { "epoch": 1.2307498424700694, "grad_norm": 0.17418823992379287, "learning_rate": 6.528729383585997e-07, "loss": 0.2687, "step": 2446 }, { "epoch": 1.2312539382482672, "grad_norm": 0.1981211293612195, "learning_rate": 6.526172186197203e-07, "loss": 0.2699, "step": 2447 }, { "epoch": 1.231758034026465, "grad_norm": 0.1770440172204685, "learning_rate": 6.523614548517187e-07, "loss": 0.2758, "step": 2448 }, { "epoch": 1.232262129804663, "grad_norm": 0.1733214708512733, "learning_rate": 6.521056471283811e-07, "loss": 0.2761, "step": 2449 }, { "epoch": 1.2327662255828606, "grad_norm": 0.2036211297547966, "learning_rate": 6.518497955235068e-07, "loss": 0.268, "step": 2450 }, { "epoch": 1.2332703213610585, "grad_norm": 0.17325758117743048, "learning_rate": 6.515939001109069e-07, "loss": 0.2748, "step": 2451 }, { "epoch": 1.2337744171392564, "grad_norm": 0.17462788382443756, "learning_rate": 6.513379609644062e-07, "loss": 0.2782, "step": 2452 }, { "epoch": 1.2342785129174543, "grad_norm": 0.1711968410986884, "learning_rate": 6.51081978157841e-07, "loss": 0.2707, "step": 2453 }, { "epoch": 1.2347826086956522, "grad_norm": 0.1683105761384845, "learning_rate": 6.50825951765061e-07, "loss": 0.2773, "step": 2454 }, { "epoch": 1.23528670447385, "grad_norm": 0.17554589173116267, "learning_rate": 6.505698818599284e-07, "loss": 0.2676, "step": 2455 }, { "epoch": 1.235790800252048, "grad_norm": 0.1759694214712948, "learning_rate": 6.503137685163173e-07, "loss": 0.2749, "step": 2456 }, { "epoch": 1.2362948960302458, "grad_norm": 0.17957710933654297, "learning_rate": 6.500576118081155e-07, "loss": 0.2762, "step": 2457 }, { "epoch": 1.2367989918084437, "grad_norm": 0.19053173662723633, "learning_rate": 6.49801411809222e-07, "loss": 0.2803, "step": 2458 }, { "epoch": 1.2373030875866415, "grad_norm": 0.17718428947532663, "learning_rate": 6.495451685935494e-07, "loss": 0.2729, "step": 2459 }, { "epoch": 1.2378071833648394, "grad_norm": 0.17785327429842798, "learning_rate": 6.492888822350219e-07, "loss": 0.2661, "step": 2460 }, { "epoch": 1.238311279143037, "grad_norm": 0.17300591946217833, "learning_rate": 6.490325528075766e-07, "loss": 0.2867, "step": 2461 }, { "epoch": 1.238815374921235, "grad_norm": 0.16921354432191585, "learning_rate": 6.487761803851631e-07, "loss": 0.2711, "step": 2462 }, { "epoch": 1.2393194706994328, "grad_norm": 0.17810885372654386, "learning_rate": 6.485197650417431e-07, "loss": 0.258, "step": 2463 }, { "epoch": 1.2398235664776307, "grad_norm": 0.18103805033195008, "learning_rate": 6.482633068512911e-07, "loss": 0.2856, "step": 2464 }, { "epoch": 1.2403276622558286, "grad_norm": 0.18360526541296773, "learning_rate": 6.480068058877934e-07, "loss": 0.2743, "step": 2465 }, { "epoch": 1.2408317580340265, "grad_norm": 0.18884668384982423, "learning_rate": 6.47750262225249e-07, "loss": 0.2769, "step": 2466 }, { "epoch": 1.2413358538122243, "grad_norm": 0.2002321224410455, "learning_rate": 6.474936759376693e-07, "loss": 0.2707, "step": 2467 }, { "epoch": 1.2418399495904222, "grad_norm": 0.18325425135295045, "learning_rate": 6.472370470990778e-07, "loss": 0.2694, "step": 2468 }, { "epoch": 1.24234404536862, "grad_norm": 0.1696869877874564, "learning_rate": 6.469803757835102e-07, "loss": 0.2747, "step": 2469 }, { "epoch": 1.242848141146818, "grad_norm": 0.3486889386014526, "learning_rate": 6.467236620650147e-07, "loss": 0.2826, "step": 2470 }, { "epoch": 1.2433522369250158, "grad_norm": 0.16956469304659022, "learning_rate": 6.464669060176516e-07, "loss": 0.2687, "step": 2471 }, { "epoch": 1.2438563327032135, "grad_norm": 0.20746929532665012, "learning_rate": 6.462101077154935e-07, "loss": 0.2721, "step": 2472 }, { "epoch": 1.2443604284814114, "grad_norm": 0.18257378343955985, "learning_rate": 6.459532672326249e-07, "loss": 0.2725, "step": 2473 }, { "epoch": 1.2448645242596093, "grad_norm": 0.17681868108144572, "learning_rate": 6.45696384643143e-07, "loss": 0.2798, "step": 2474 }, { "epoch": 1.2453686200378071, "grad_norm": 0.19498569967155788, "learning_rate": 6.454394600211565e-07, "loss": 0.276, "step": 2475 }, { "epoch": 1.245872715816005, "grad_norm": 0.19909897682075145, "learning_rate": 6.45182493440787e-07, "loss": 0.2883, "step": 2476 }, { "epoch": 1.2463768115942029, "grad_norm": 0.17632241811925964, "learning_rate": 6.449254849761672e-07, "loss": 0.2688, "step": 2477 }, { "epoch": 1.2468809073724008, "grad_norm": 0.1762000666442149, "learning_rate": 6.446684347014428e-07, "loss": 0.2741, "step": 2478 }, { "epoch": 1.2473850031505986, "grad_norm": 0.1773686838716517, "learning_rate": 6.444113426907713e-07, "loss": 0.2899, "step": 2479 }, { "epoch": 1.2478890989287965, "grad_norm": 0.23756861929225664, "learning_rate": 6.44154209018322e-07, "loss": 0.2687, "step": 2480 }, { "epoch": 1.2483931947069944, "grad_norm": 0.16938250561272578, "learning_rate": 6.438970337582764e-07, "loss": 0.2817, "step": 2481 }, { "epoch": 1.2488972904851923, "grad_norm": 0.18157388578326444, "learning_rate": 6.436398169848278e-07, "loss": 0.2718, "step": 2482 }, { "epoch": 1.24940138626339, "grad_norm": 0.19502196013884435, "learning_rate": 6.43382558772182e-07, "loss": 0.2768, "step": 2483 }, { "epoch": 1.249905482041588, "grad_norm": 0.1734733146967059, "learning_rate": 6.431252591945561e-07, "loss": 0.2769, "step": 2484 }, { "epoch": 1.2504095778197857, "grad_norm": 0.188235466172801, "learning_rate": 6.428679183261796e-07, "loss": 0.2705, "step": 2485 }, { "epoch": 1.2509136735979836, "grad_norm": 0.19152028195774132, "learning_rate": 6.426105362412935e-07, "loss": 0.2732, "step": 2486 }, { "epoch": 1.2514177693761814, "grad_norm": 0.1695103765753016, "learning_rate": 6.423531130141513e-07, "loss": 0.2753, "step": 2487 }, { "epoch": 1.2519218651543793, "grad_norm": 0.17465424303832264, "learning_rate": 6.420956487190177e-07, "loss": 0.2744, "step": 2488 }, { "epoch": 1.2524259609325772, "grad_norm": 0.17601676662066001, "learning_rate": 6.418381434301698e-07, "loss": 0.2639, "step": 2489 }, { "epoch": 1.252930056710775, "grad_norm": 0.17576563315273216, "learning_rate": 6.415805972218962e-07, "loss": 0.2731, "step": 2490 }, { "epoch": 1.253434152488973, "grad_norm": 0.1817582073444604, "learning_rate": 6.413230101684972e-07, "loss": 0.2915, "step": 2491 }, { "epoch": 1.2539382482671708, "grad_norm": 0.1871857208126174, "learning_rate": 6.410653823442853e-07, "loss": 0.2669, "step": 2492 }, { "epoch": 1.2544423440453687, "grad_norm": 0.17014233122771064, "learning_rate": 6.408077138235843e-07, "loss": 0.274, "step": 2493 }, { "epoch": 1.2549464398235664, "grad_norm": 0.1714640417770811, "learning_rate": 6.405500046807303e-07, "loss": 0.2686, "step": 2494 }, { "epoch": 1.2554505356017645, "grad_norm": 0.19243821454194698, "learning_rate": 6.402922549900705e-07, "loss": 0.2891, "step": 2495 }, { "epoch": 1.255954631379962, "grad_norm": 0.17463123533718689, "learning_rate": 6.400344648259644e-07, "loss": 0.2699, "step": 2496 }, { "epoch": 1.25645872715816, "grad_norm": 0.1910291136178679, "learning_rate": 6.397766342627825e-07, "loss": 0.2635, "step": 2497 }, { "epoch": 1.2569628229363579, "grad_norm": 0.17138494155286757, "learning_rate": 6.395187633749075e-07, "loss": 0.2833, "step": 2498 }, { "epoch": 1.2574669187145557, "grad_norm": 0.17443608504712982, "learning_rate": 6.392608522367336e-07, "loss": 0.2723, "step": 2499 }, { "epoch": 1.2579710144927536, "grad_norm": 0.1699382749604669, "learning_rate": 6.390029009226664e-07, "loss": 0.2657, "step": 2500 }, { "epoch": 1.2584751102709515, "grad_norm": 0.17760734553417154, "learning_rate": 6.387449095071234e-07, "loss": 0.2738, "step": 2501 }, { "epoch": 1.2589792060491494, "grad_norm": 0.17525575952040232, "learning_rate": 6.384868780645335e-07, "loss": 0.2703, "step": 2502 }, { "epoch": 1.2594833018273472, "grad_norm": 0.18894681759150508, "learning_rate": 6.382288066693372e-07, "loss": 0.2816, "step": 2503 }, { "epoch": 1.2599873976055451, "grad_norm": 0.18849104352320012, "learning_rate": 6.379706953959865e-07, "loss": 0.2766, "step": 2504 }, { "epoch": 1.2604914933837428, "grad_norm": 0.17320664007502315, "learning_rate": 6.377125443189446e-07, "loss": 0.2558, "step": 2505 }, { "epoch": 1.2609955891619409, "grad_norm": 0.1981593202287322, "learning_rate": 6.37454353512687e-07, "loss": 0.2854, "step": 2506 }, { "epoch": 1.2614996849401385, "grad_norm": 0.1674126260844481, "learning_rate": 6.371961230516997e-07, "loss": 0.2687, "step": 2507 }, { "epoch": 1.2620037807183364, "grad_norm": 0.17667010185551682, "learning_rate": 6.36937853010481e-07, "loss": 0.2699, "step": 2508 }, { "epoch": 1.2625078764965343, "grad_norm": 0.17818670333223607, "learning_rate": 6.366795434635398e-07, "loss": 0.2687, "step": 2509 }, { "epoch": 1.2630119722747322, "grad_norm": 0.17694309827192736, "learning_rate": 6.364211944853971e-07, "loss": 0.2635, "step": 2510 }, { "epoch": 1.26351606805293, "grad_norm": 0.17785156941257393, "learning_rate": 6.361628061505849e-07, "loss": 0.2831, "step": 2511 }, { "epoch": 1.264020163831128, "grad_norm": 0.17826472857233217, "learning_rate": 6.359043785336467e-07, "loss": 0.2731, "step": 2512 }, { "epoch": 1.2645242596093258, "grad_norm": 0.18356140095576604, "learning_rate": 6.356459117091369e-07, "loss": 0.2601, "step": 2513 }, { "epoch": 1.2650283553875237, "grad_norm": 0.2119595973428602, "learning_rate": 6.353874057516222e-07, "loss": 0.2809, "step": 2514 }, { "epoch": 1.2655324511657216, "grad_norm": 0.16952440646698724, "learning_rate": 6.351288607356793e-07, "loss": 0.2731, "step": 2515 }, { "epoch": 1.2660365469439194, "grad_norm": 0.17460234803900188, "learning_rate": 6.348702767358974e-07, "loss": 0.2797, "step": 2516 }, { "epoch": 1.2665406427221173, "grad_norm": 0.17160497843474512, "learning_rate": 6.34611653826876e-07, "loss": 0.2698, "step": 2517 }, { "epoch": 1.267044738500315, "grad_norm": 0.18778464548766757, "learning_rate": 6.343529920832263e-07, "loss": 0.2573, "step": 2518 }, { "epoch": 1.2675488342785128, "grad_norm": 0.172443493887068, "learning_rate": 6.340942915795708e-07, "loss": 0.2748, "step": 2519 }, { "epoch": 1.2680529300567107, "grad_norm": 0.17168130332053846, "learning_rate": 6.338355523905427e-07, "loss": 0.2625, "step": 2520 }, { "epoch": 1.2685570258349086, "grad_norm": 0.17567263307465228, "learning_rate": 6.335767745907869e-07, "loss": 0.2859, "step": 2521 }, { "epoch": 1.2690611216131065, "grad_norm": 0.20501744023706306, "learning_rate": 6.33317958254959e-07, "loss": 0.2773, "step": 2522 }, { "epoch": 1.2695652173913043, "grad_norm": 0.1693368131106986, "learning_rate": 6.33059103457726e-07, "loss": 0.2653, "step": 2523 }, { "epoch": 1.2700693131695022, "grad_norm": 0.18006664625555702, "learning_rate": 6.32800210273766e-07, "loss": 0.2645, "step": 2524 }, { "epoch": 1.2705734089477, "grad_norm": 0.17456080347662142, "learning_rate": 6.32541278777768e-07, "loss": 0.2699, "step": 2525 }, { "epoch": 1.271077504725898, "grad_norm": 0.16574198775311022, "learning_rate": 6.322823090444322e-07, "loss": 0.2662, "step": 2526 }, { "epoch": 1.2715816005040959, "grad_norm": 0.18598160934075628, "learning_rate": 6.320233011484696e-07, "loss": 0.2828, "step": 2527 }, { "epoch": 1.2720856962822937, "grad_norm": 0.1711774666753222, "learning_rate": 6.317642551646024e-07, "loss": 0.2803, "step": 2528 }, { "epoch": 1.2725897920604914, "grad_norm": 0.17441523118583016, "learning_rate": 6.315051711675639e-07, "loss": 0.2864, "step": 2529 }, { "epoch": 1.2730938878386895, "grad_norm": 0.18168000027383474, "learning_rate": 6.312460492320981e-07, "loss": 0.2786, "step": 2530 }, { "epoch": 1.2735979836168871, "grad_norm": 0.18114157608848858, "learning_rate": 6.309868894329602e-07, "loss": 0.2805, "step": 2531 }, { "epoch": 1.274102079395085, "grad_norm": 0.18530369106217595, "learning_rate": 6.30727691844916e-07, "loss": 0.2804, "step": 2532 }, { "epoch": 1.274606175173283, "grad_norm": 0.1927697807229042, "learning_rate": 6.304684565427427e-07, "loss": 0.2667, "step": 2533 }, { "epoch": 1.2751102709514808, "grad_norm": 0.17177026835602924, "learning_rate": 6.302091836012278e-07, "loss": 0.2633, "step": 2534 }, { "epoch": 1.2756143667296787, "grad_norm": 0.20642706373970884, "learning_rate": 6.299498730951699e-07, "loss": 0.2878, "step": 2535 }, { "epoch": 1.2761184625078765, "grad_norm": 0.17594445837282194, "learning_rate": 6.296905250993787e-07, "loss": 0.2946, "step": 2536 }, { "epoch": 1.2766225582860744, "grad_norm": 0.1762144552346574, "learning_rate": 6.294311396886745e-07, "loss": 0.2915, "step": 2537 }, { "epoch": 1.2771266540642723, "grad_norm": 0.17045816209341122, "learning_rate": 6.291717169378881e-07, "loss": 0.2782, "step": 2538 }, { "epoch": 1.2776307498424702, "grad_norm": 0.18383362547518797, "learning_rate": 6.289122569218615e-07, "loss": 0.2748, "step": 2539 }, { "epoch": 1.2781348456206678, "grad_norm": 0.1807772669969172, "learning_rate": 6.286527597154475e-07, "loss": 0.2885, "step": 2540 }, { "epoch": 1.278638941398866, "grad_norm": 0.19302775133426964, "learning_rate": 6.283932253935094e-07, "loss": 0.2886, "step": 2541 }, { "epoch": 1.2791430371770636, "grad_norm": 0.1888729384221993, "learning_rate": 6.28133654030921e-07, "loss": 0.2807, "step": 2542 }, { "epoch": 1.2796471329552614, "grad_norm": 0.17228549927667094, "learning_rate": 6.278740457025671e-07, "loss": 0.2754, "step": 2543 }, { "epoch": 1.2801512287334593, "grad_norm": 0.1729966859664312, "learning_rate": 6.276144004833432e-07, "loss": 0.2828, "step": 2544 }, { "epoch": 1.2806553245116572, "grad_norm": 0.17500523408983926, "learning_rate": 6.273547184481554e-07, "loss": 0.2679, "step": 2545 }, { "epoch": 1.281159420289855, "grad_norm": 0.17974221115337133, "learning_rate": 6.270949996719202e-07, "loss": 0.28, "step": 2546 }, { "epoch": 1.281663516068053, "grad_norm": 0.18105222606013233, "learning_rate": 6.268352442295648e-07, "loss": 0.2794, "step": 2547 }, { "epoch": 1.2821676118462508, "grad_norm": 0.18180911542908265, "learning_rate": 6.265754521960272e-07, "loss": 0.2815, "step": 2548 }, { "epoch": 1.2826717076244487, "grad_norm": 0.19054982521679337, "learning_rate": 6.263156236462557e-07, "loss": 0.2603, "step": 2549 }, { "epoch": 1.2831758034026466, "grad_norm": 0.17388626076521252, "learning_rate": 6.260557586552094e-07, "loss": 0.2787, "step": 2550 }, { "epoch": 1.2836798991808442, "grad_norm": 0.18270654576376585, "learning_rate": 6.257958572978573e-07, "loss": 0.2664, "step": 2551 }, { "epoch": 1.2841839949590423, "grad_norm": 0.1727969110053315, "learning_rate": 6.255359196491799e-07, "loss": 0.2669, "step": 2552 }, { "epoch": 1.28468809073724, "grad_norm": 0.17719931615423382, "learning_rate": 6.252759457841672e-07, "loss": 0.2806, "step": 2553 }, { "epoch": 1.2851921865154379, "grad_norm": 0.16515633828315085, "learning_rate": 6.250159357778202e-07, "loss": 0.2804, "step": 2554 }, { "epoch": 1.2856962822936358, "grad_norm": 0.18820483320106177, "learning_rate": 6.247558897051503e-07, "loss": 0.2965, "step": 2555 }, { "epoch": 1.2862003780718336, "grad_norm": 0.17604374414885252, "learning_rate": 6.244958076411789e-07, "loss": 0.2622, "step": 2556 }, { "epoch": 1.2867044738500315, "grad_norm": 0.1696182132757248, "learning_rate": 6.242356896609383e-07, "loss": 0.2742, "step": 2557 }, { "epoch": 1.2872085696282294, "grad_norm": 0.1737771138671047, "learning_rate": 6.239755358394707e-07, "loss": 0.2667, "step": 2558 }, { "epoch": 1.2877126654064273, "grad_norm": 0.1801598415185936, "learning_rate": 6.237153462518291e-07, "loss": 0.2654, "step": 2559 }, { "epoch": 1.2882167611846251, "grad_norm": 0.1756137146667004, "learning_rate": 6.234551209730765e-07, "loss": 0.2947, "step": 2560 }, { "epoch": 1.288720856962823, "grad_norm": 0.17233304314756842, "learning_rate": 6.231948600782863e-07, "loss": 0.2776, "step": 2561 }, { "epoch": 1.2892249527410207, "grad_norm": 0.18074095930111286, "learning_rate": 6.229345636425421e-07, "loss": 0.2778, "step": 2562 }, { "epoch": 1.2897290485192188, "grad_norm": 0.17475122322002845, "learning_rate": 6.226742317409378e-07, "loss": 0.2907, "step": 2563 }, { "epoch": 1.2902331442974164, "grad_norm": 0.17347744444476335, "learning_rate": 6.224138644485775e-07, "loss": 0.2737, "step": 2564 }, { "epoch": 1.2907372400756143, "grad_norm": 0.17186378287100884, "learning_rate": 6.221534618405757e-07, "loss": 0.277, "step": 2565 }, { "epoch": 1.2912413358538122, "grad_norm": 0.1705178171148712, "learning_rate": 6.218930239920568e-07, "loss": 0.2685, "step": 2566 }, { "epoch": 1.29174543163201, "grad_norm": 0.1786888372127578, "learning_rate": 6.216325509781556e-07, "loss": 0.2785, "step": 2567 }, { "epoch": 1.292249527410208, "grad_norm": 0.17449611257529518, "learning_rate": 6.213720428740168e-07, "loss": 0.2653, "step": 2568 }, { "epoch": 1.2927536231884058, "grad_norm": 0.168813740260223, "learning_rate": 6.211114997547956e-07, "loss": 0.2629, "step": 2569 }, { "epoch": 1.2932577189666037, "grad_norm": 0.16790027679595843, "learning_rate": 6.208509216956572e-07, "loss": 0.2706, "step": 2570 }, { "epoch": 1.2937618147448016, "grad_norm": 0.17099748648486032, "learning_rate": 6.205903087717761e-07, "loss": 0.2687, "step": 2571 }, { "epoch": 1.2942659105229994, "grad_norm": 0.18162517160448968, "learning_rate": 6.203296610583382e-07, "loss": 0.2747, "step": 2572 }, { "epoch": 1.294770006301197, "grad_norm": 0.17063028798101706, "learning_rate": 6.200689786305383e-07, "loss": 0.2761, "step": 2573 }, { "epoch": 1.2952741020793952, "grad_norm": 0.17398933181018333, "learning_rate": 6.19808261563582e-07, "loss": 0.283, "step": 2574 }, { "epoch": 1.2957781978575929, "grad_norm": 0.16776550305812016, "learning_rate": 6.195475099326843e-07, "loss": 0.2617, "step": 2575 }, { "epoch": 1.2962822936357907, "grad_norm": 0.1747053664007132, "learning_rate": 6.192867238130708e-07, "loss": 0.2774, "step": 2576 }, { "epoch": 1.2967863894139886, "grad_norm": 0.18115021944031587, "learning_rate": 6.190259032799761e-07, "loss": 0.2751, "step": 2577 }, { "epoch": 1.2972904851921865, "grad_norm": 0.1790395768010079, "learning_rate": 6.187650484086459e-07, "loss": 0.29, "step": 2578 }, { "epoch": 1.2977945809703844, "grad_norm": 0.1719106072500581, "learning_rate": 6.185041592743348e-07, "loss": 0.2729, "step": 2579 }, { "epoch": 1.2982986767485822, "grad_norm": 0.17444897223126585, "learning_rate": 6.182432359523079e-07, "loss": 0.2789, "step": 2580 }, { "epoch": 1.2988027725267801, "grad_norm": 0.1766242163102816, "learning_rate": 6.179822785178398e-07, "loss": 0.2583, "step": 2581 }, { "epoch": 1.299306868304978, "grad_norm": 0.18162411025003278, "learning_rate": 6.177212870462152e-07, "loss": 0.2686, "step": 2582 }, { "epoch": 1.2998109640831759, "grad_norm": 0.16955702326217229, "learning_rate": 6.174602616127287e-07, "loss": 0.2736, "step": 2583 }, { "epoch": 1.3003150598613735, "grad_norm": 0.173479162516625, "learning_rate": 6.171992022926841e-07, "loss": 0.2738, "step": 2584 }, { "epoch": 1.3008191556395716, "grad_norm": 0.17139404620083853, "learning_rate": 6.16938109161396e-07, "loss": 0.2757, "step": 2585 }, { "epoch": 1.3013232514177693, "grad_norm": 0.17095006127373458, "learning_rate": 6.166769822941877e-07, "loss": 0.2748, "step": 2586 }, { "epoch": 1.3018273471959672, "grad_norm": 0.17350708973766807, "learning_rate": 6.164158217663926e-07, "loss": 0.2729, "step": 2587 }, { "epoch": 1.3018273471959672, "eval_loss": 0.3091413378715515, "eval_runtime": 17.1268, "eval_samples_per_second": 49.922, "eval_steps_per_second": 1.051, "step": 2587 }, { "epoch": 1.302331442974165, "grad_norm": 0.19869258969365142, "learning_rate": 6.161546276533542e-07, "loss": 0.2881, "step": 2588 }, { "epoch": 1.302835538752363, "grad_norm": 0.1703478811098866, "learning_rate": 6.158934000304251e-07, "loss": 0.2701, "step": 2589 }, { "epoch": 1.3033396345305608, "grad_norm": 0.1696150699647383, "learning_rate": 6.156321389729682e-07, "loss": 0.2839, "step": 2590 }, { "epoch": 1.3038437303087587, "grad_norm": 0.1738118040743564, "learning_rate": 6.153708445563555e-07, "loss": 0.2852, "step": 2591 }, { "epoch": 1.3043478260869565, "grad_norm": 0.1767377470005941, "learning_rate": 6.151095168559688e-07, "loss": 0.2769, "step": 2592 }, { "epoch": 1.3048519218651544, "grad_norm": 0.17056286239729945, "learning_rate": 6.148481559471995e-07, "loss": 0.2634, "step": 2593 }, { "epoch": 1.3053560176433523, "grad_norm": 0.1964952722351761, "learning_rate": 6.145867619054487e-07, "loss": 0.2821, "step": 2594 }, { "epoch": 1.3058601134215502, "grad_norm": 0.17002975410785548, "learning_rate": 6.143253348061271e-07, "loss": 0.2751, "step": 2595 }, { "epoch": 1.306364209199748, "grad_norm": 0.17294236211348307, "learning_rate": 6.140638747246543e-07, "loss": 0.2777, "step": 2596 }, { "epoch": 1.3068683049779457, "grad_norm": 0.18702273486347906, "learning_rate": 6.138023817364603e-07, "loss": 0.2817, "step": 2597 }, { "epoch": 1.3073724007561438, "grad_norm": 0.18257740617721263, "learning_rate": 6.135408559169842e-07, "loss": 0.2869, "step": 2598 }, { "epoch": 1.3078764965343415, "grad_norm": 0.1679551881506779, "learning_rate": 6.132792973416744e-07, "loss": 0.2671, "step": 2599 }, { "epoch": 1.3083805923125393, "grad_norm": 0.1754134210625823, "learning_rate": 6.130177060859894e-07, "loss": 0.302, "step": 2600 }, { "epoch": 1.3088846880907372, "grad_norm": 0.1772876906689951, "learning_rate": 6.12756082225396e-07, "loss": 0.2783, "step": 2601 }, { "epoch": 1.309388783868935, "grad_norm": 0.20064464760978862, "learning_rate": 6.124944258353714e-07, "loss": 0.2839, "step": 2602 }, { "epoch": 1.309892879647133, "grad_norm": 0.18138531276139028, "learning_rate": 6.122327369914018e-07, "loss": 0.2928, "step": 2603 }, { "epoch": 1.3103969754253308, "grad_norm": 0.17693406432408068, "learning_rate": 6.119710157689828e-07, "loss": 0.264, "step": 2604 }, { "epoch": 1.3109010712035287, "grad_norm": 0.17240536698687978, "learning_rate": 6.117092622436194e-07, "loss": 0.2803, "step": 2605 }, { "epoch": 1.3114051669817266, "grad_norm": 0.17830285202833615, "learning_rate": 6.11447476490826e-07, "loss": 0.271, "step": 2606 }, { "epoch": 1.3119092627599245, "grad_norm": 0.18109922157734024, "learning_rate": 6.111856585861259e-07, "loss": 0.2673, "step": 2607 }, { "epoch": 1.3124133585381221, "grad_norm": 0.17133961085931276, "learning_rate": 6.10923808605052e-07, "loss": 0.2726, "step": 2608 }, { "epoch": 1.3129174543163202, "grad_norm": 0.17649723679339357, "learning_rate": 6.106619266231467e-07, "loss": 0.2674, "step": 2609 }, { "epoch": 1.3134215500945179, "grad_norm": 0.17184213041429233, "learning_rate": 6.104000127159608e-07, "loss": 0.279, "step": 2610 }, { "epoch": 1.3139256458727158, "grad_norm": 0.17468087267169044, "learning_rate": 6.101380669590551e-07, "loss": 0.279, "step": 2611 }, { "epoch": 1.3144297416509136, "grad_norm": 0.17337912043291634, "learning_rate": 6.098760894279995e-07, "loss": 0.2685, "step": 2612 }, { "epoch": 1.3149338374291115, "grad_norm": 0.18366133473396995, "learning_rate": 6.096140801983727e-07, "loss": 0.2711, "step": 2613 }, { "epoch": 1.3154379332073094, "grad_norm": 0.17222644312128002, "learning_rate": 6.093520393457627e-07, "loss": 0.2631, "step": 2614 }, { "epoch": 1.3159420289855073, "grad_norm": 0.17738930991704355, "learning_rate": 6.09089966945767e-07, "loss": 0.2743, "step": 2615 }, { "epoch": 1.3164461247637051, "grad_norm": 0.17185117857715593, "learning_rate": 6.088278630739915e-07, "loss": 0.2772, "step": 2616 }, { "epoch": 1.316950220541903, "grad_norm": 0.16984493121172142, "learning_rate": 6.085657278060515e-07, "loss": 0.2656, "step": 2617 }, { "epoch": 1.317454316320101, "grad_norm": 0.17192888657461453, "learning_rate": 6.083035612175716e-07, "loss": 0.277, "step": 2618 }, { "epoch": 1.3179584120982986, "grad_norm": 0.18354467561304685, "learning_rate": 6.080413633841853e-07, "loss": 0.2701, "step": 2619 }, { "epoch": 1.3184625078764967, "grad_norm": 0.17503373460612875, "learning_rate": 6.077791343815349e-07, "loss": 0.2825, "step": 2620 }, { "epoch": 1.3189666036546943, "grad_norm": 0.1706123234531778, "learning_rate": 6.075168742852718e-07, "loss": 0.2662, "step": 2621 }, { "epoch": 1.3194706994328922, "grad_norm": 0.17412767194593945, "learning_rate": 6.072545831710567e-07, "loss": 0.2722, "step": 2622 }, { "epoch": 1.31997479521109, "grad_norm": 0.17240484877752552, "learning_rate": 6.069922611145587e-07, "loss": 0.2686, "step": 2623 }, { "epoch": 1.320478890989288, "grad_norm": 0.16827994239810876, "learning_rate": 6.06729908191456e-07, "loss": 0.2641, "step": 2624 }, { "epoch": 1.3209829867674858, "grad_norm": 0.17930969669376187, "learning_rate": 6.064675244774362e-07, "loss": 0.2852, "step": 2625 }, { "epoch": 1.3214870825456837, "grad_norm": 0.18006157470320514, "learning_rate": 6.062051100481949e-07, "loss": 0.2758, "step": 2626 }, { "epoch": 1.3219911783238816, "grad_norm": 0.1727448964101327, "learning_rate": 6.059426649794374e-07, "loss": 0.2714, "step": 2627 }, { "epoch": 1.3224952741020795, "grad_norm": 0.17424563952604652, "learning_rate": 6.056801893468773e-07, "loss": 0.2787, "step": 2628 }, { "epoch": 1.3229993698802773, "grad_norm": 0.1731354768377102, "learning_rate": 6.054176832262371e-07, "loss": 0.2776, "step": 2629 }, { "epoch": 1.323503465658475, "grad_norm": 0.1804386379598718, "learning_rate": 6.051551466932485e-07, "loss": 0.2651, "step": 2630 }, { "epoch": 1.324007561436673, "grad_norm": 0.17399836046155562, "learning_rate": 6.048925798236512e-07, "loss": 0.2702, "step": 2631 }, { "epoch": 1.3245116572148707, "grad_norm": 0.43353712827340424, "learning_rate": 6.046299826931946e-07, "loss": 0.2753, "step": 2632 }, { "epoch": 1.3250157529930686, "grad_norm": 0.17183489960437298, "learning_rate": 6.043673553776361e-07, "loss": 0.2857, "step": 2633 }, { "epoch": 1.3255198487712665, "grad_norm": 0.17519525464739322, "learning_rate": 6.041046979527422e-07, "loss": 0.2724, "step": 2634 }, { "epoch": 1.3260239445494644, "grad_norm": 0.18357488474151853, "learning_rate": 6.038420104942877e-07, "loss": 0.2662, "step": 2635 }, { "epoch": 1.3265280403276623, "grad_norm": 0.1859694085856373, "learning_rate": 6.035792930780565e-07, "loss": 0.2615, "step": 2636 }, { "epoch": 1.3270321361058601, "grad_norm": 0.17450754260587856, "learning_rate": 6.033165457798408e-07, "loss": 0.2754, "step": 2637 }, { "epoch": 1.327536231884058, "grad_norm": 0.1668193821161752, "learning_rate": 6.030537686754419e-07, "loss": 0.2698, "step": 2638 }, { "epoch": 1.3280403276622559, "grad_norm": 0.1728884240425731, "learning_rate": 6.027909618406689e-07, "loss": 0.275, "step": 2639 }, { "epoch": 1.3285444234404538, "grad_norm": 0.17398093656070354, "learning_rate": 6.025281253513404e-07, "loss": 0.2627, "step": 2640 }, { "epoch": 1.3290485192186514, "grad_norm": 0.203938933296385, "learning_rate": 6.022652592832827e-07, "loss": 0.2782, "step": 2641 }, { "epoch": 1.3295526149968495, "grad_norm": 0.18386883362802814, "learning_rate": 6.02002363712331e-07, "loss": 0.2748, "step": 2642 }, { "epoch": 1.3300567107750472, "grad_norm": 0.17575912101228022, "learning_rate": 6.017394387143294e-07, "loss": 0.2752, "step": 2643 }, { "epoch": 1.330560806553245, "grad_norm": 0.18067748484917678, "learning_rate": 6.0147648436513e-07, "loss": 0.2815, "step": 2644 }, { "epoch": 1.331064902331443, "grad_norm": 0.16917344843892523, "learning_rate": 6.012135007405936e-07, "loss": 0.2648, "step": 2645 }, { "epoch": 1.3315689981096408, "grad_norm": 0.17412949415096388, "learning_rate": 6.009504879165891e-07, "loss": 0.2754, "step": 2646 }, { "epoch": 1.3320730938878387, "grad_norm": 0.18047299736884104, "learning_rate": 6.006874459689942e-07, "loss": 0.2694, "step": 2647 }, { "epoch": 1.3325771896660366, "grad_norm": 0.19127600843343956, "learning_rate": 6.004243749736947e-07, "loss": 0.2705, "step": 2648 }, { "epoch": 1.3330812854442344, "grad_norm": 0.17608841608371475, "learning_rate": 6.001612750065853e-07, "loss": 0.2797, "step": 2649 }, { "epoch": 1.3335853812224323, "grad_norm": 0.17931850245102035, "learning_rate": 5.998981461435685e-07, "loss": 0.2937, "step": 2650 }, { "epoch": 1.3340894770006302, "grad_norm": 0.18450380693233284, "learning_rate": 5.996349884605552e-07, "loss": 0.2702, "step": 2651 }, { "epoch": 1.3345935727788278, "grad_norm": 0.17087115966825997, "learning_rate": 5.993718020334652e-07, "loss": 0.2762, "step": 2652 }, { "epoch": 1.335097668557026, "grad_norm": 0.1718079561805658, "learning_rate": 5.991085869382258e-07, "loss": 0.2715, "step": 2653 }, { "epoch": 1.3356017643352236, "grad_norm": 0.18602810395230512, "learning_rate": 5.988453432507729e-07, "loss": 0.2744, "step": 2654 }, { "epoch": 1.3361058601134215, "grad_norm": 0.18389684807636075, "learning_rate": 5.985820710470509e-07, "loss": 0.2808, "step": 2655 }, { "epoch": 1.3366099558916194, "grad_norm": 0.19592914957135987, "learning_rate": 5.983187704030123e-07, "loss": 0.2676, "step": 2656 }, { "epoch": 1.3371140516698172, "grad_norm": 0.18190367471809632, "learning_rate": 5.980554413946172e-07, "loss": 0.2938, "step": 2657 }, { "epoch": 1.337618147448015, "grad_norm": 0.1717978516962743, "learning_rate": 5.977920840978346e-07, "loss": 0.2865, "step": 2658 }, { "epoch": 1.338122243226213, "grad_norm": 0.1860456269002095, "learning_rate": 5.975286985886418e-07, "loss": 0.2763, "step": 2659 }, { "epoch": 1.3386263390044109, "grad_norm": 0.17913391357600741, "learning_rate": 5.972652849430235e-07, "loss": 0.2702, "step": 2660 }, { "epoch": 1.3391304347826087, "grad_norm": 0.17254123661223592, "learning_rate": 5.97001843236973e-07, "loss": 0.2711, "step": 2661 }, { "epoch": 1.3396345305608066, "grad_norm": 0.17083369763214978, "learning_rate": 5.967383735464916e-07, "loss": 0.2582, "step": 2662 }, { "epoch": 1.3401386263390045, "grad_norm": 0.17895529015768472, "learning_rate": 5.964748759475887e-07, "loss": 0.2818, "step": 2663 }, { "epoch": 1.3406427221172024, "grad_norm": 0.17751494439480603, "learning_rate": 5.962113505162818e-07, "loss": 0.2741, "step": 2664 }, { "epoch": 1.3411468178954, "grad_norm": 0.1703756144111887, "learning_rate": 5.959477973285961e-07, "loss": 0.2643, "step": 2665 }, { "epoch": 1.3416509136735981, "grad_norm": 0.16962499032704564, "learning_rate": 5.956842164605654e-07, "loss": 0.2632, "step": 2666 }, { "epoch": 1.3421550094517958, "grad_norm": 0.16795341629826513, "learning_rate": 5.954206079882311e-07, "loss": 0.274, "step": 2667 }, { "epoch": 1.3426591052299937, "grad_norm": 0.17092016847119307, "learning_rate": 5.951569719876421e-07, "loss": 0.2778, "step": 2668 }, { "epoch": 1.3431632010081915, "grad_norm": 0.1790697008564231, "learning_rate": 5.948933085348564e-07, "loss": 0.2677, "step": 2669 }, { "epoch": 1.3436672967863894, "grad_norm": 0.16930642394721995, "learning_rate": 5.94629617705939e-07, "loss": 0.2744, "step": 2670 }, { "epoch": 1.3441713925645873, "grad_norm": 0.17294047568014345, "learning_rate": 5.943658995769631e-07, "loss": 0.2542, "step": 2671 }, { "epoch": 1.3446754883427852, "grad_norm": 0.1767866393054919, "learning_rate": 5.941021542240098e-07, "loss": 0.259, "step": 2672 }, { "epoch": 1.345179584120983, "grad_norm": 0.1684270679379651, "learning_rate": 5.938383817231678e-07, "loss": 0.2668, "step": 2673 }, { "epoch": 1.345683679899181, "grad_norm": 0.17591755928235953, "learning_rate": 5.93574582150534e-07, "loss": 0.271, "step": 2674 }, { "epoch": 1.3461877756773788, "grad_norm": 0.17125486033048526, "learning_rate": 5.933107555822131e-07, "loss": 0.2793, "step": 2675 }, { "epoch": 1.3466918714555765, "grad_norm": 0.17748188736278603, "learning_rate": 5.930469020943171e-07, "loss": 0.2777, "step": 2676 }, { "epoch": 1.3471959672337745, "grad_norm": 0.16672523194129693, "learning_rate": 5.927830217629661e-07, "loss": 0.2536, "step": 2677 }, { "epoch": 1.3477000630119722, "grad_norm": 0.1713361784166428, "learning_rate": 5.925191146642883e-07, "loss": 0.2622, "step": 2678 }, { "epoch": 1.34820415879017, "grad_norm": 0.17829104370453977, "learning_rate": 5.92255180874419e-07, "loss": 0.2817, "step": 2679 }, { "epoch": 1.348708254568368, "grad_norm": 0.18860577855505326, "learning_rate": 5.919912204695014e-07, "loss": 0.2799, "step": 2680 }, { "epoch": 1.3492123503465658, "grad_norm": 0.1833287597973856, "learning_rate": 5.917272335256865e-07, "loss": 0.2858, "step": 2681 }, { "epoch": 1.3497164461247637, "grad_norm": 0.19807321067808822, "learning_rate": 5.914632201191332e-07, "loss": 0.2787, "step": 2682 }, { "epoch": 1.3502205419029616, "grad_norm": 0.16859634450252498, "learning_rate": 5.911991803260074e-07, "loss": 0.2764, "step": 2683 }, { "epoch": 1.3507246376811595, "grad_norm": 0.190261438179237, "learning_rate": 5.909351142224829e-07, "loss": 0.272, "step": 2684 }, { "epoch": 1.3512287334593573, "grad_norm": 0.17519831750598353, "learning_rate": 5.906710218847413e-07, "loss": 0.2603, "step": 2685 }, { "epoch": 1.3517328292375552, "grad_norm": 0.18778966952607162, "learning_rate": 5.904069033889716e-07, "loss": 0.261, "step": 2686 }, { "epoch": 1.3522369250157529, "grad_norm": 0.17347841958582969, "learning_rate": 5.901427588113703e-07, "loss": 0.2885, "step": 2687 }, { "epoch": 1.352741020793951, "grad_norm": 0.17645693666916326, "learning_rate": 5.898785882281415e-07, "loss": 0.2831, "step": 2688 }, { "epoch": 1.3532451165721486, "grad_norm": 0.17182233696335003, "learning_rate": 5.896143917154967e-07, "loss": 0.2712, "step": 2689 }, { "epoch": 1.3537492123503465, "grad_norm": 0.17603565137881136, "learning_rate": 5.893501693496553e-07, "loss": 0.2634, "step": 2690 }, { "epoch": 1.3542533081285444, "grad_norm": 0.18263857001668504, "learning_rate": 5.890859212068433e-07, "loss": 0.2738, "step": 2691 }, { "epoch": 1.3547574039067423, "grad_norm": 0.1730156353984436, "learning_rate": 5.888216473632949e-07, "loss": 0.2716, "step": 2692 }, { "epoch": 1.3552614996849401, "grad_norm": 0.17823231943674622, "learning_rate": 5.885573478952515e-07, "loss": 0.275, "step": 2693 }, { "epoch": 1.355765595463138, "grad_norm": 0.17236615330455812, "learning_rate": 5.882930228789617e-07, "loss": 0.2674, "step": 2694 }, { "epoch": 1.356269691241336, "grad_norm": 0.16824543267551886, "learning_rate": 5.880286723906817e-07, "loss": 0.2711, "step": 2695 }, { "epoch": 1.3567737870195338, "grad_norm": 0.17367807155908455, "learning_rate": 5.877642965066753e-07, "loss": 0.2839, "step": 2696 }, { "epoch": 1.3572778827977316, "grad_norm": 0.17538723204136947, "learning_rate": 5.874998953032128e-07, "loss": 0.2686, "step": 2697 }, { "epoch": 1.3577819785759293, "grad_norm": 0.17766393535437983, "learning_rate": 5.872354688565727e-07, "loss": 0.2816, "step": 2698 }, { "epoch": 1.3582860743541274, "grad_norm": 0.1699641810797796, "learning_rate": 5.869710172430401e-07, "loss": 0.2605, "step": 2699 }, { "epoch": 1.358790170132325, "grad_norm": 0.17422760346977517, "learning_rate": 5.867065405389078e-07, "loss": 0.2846, "step": 2700 }, { "epoch": 1.359294265910523, "grad_norm": 0.18149647107592987, "learning_rate": 5.864420388204757e-07, "loss": 0.2749, "step": 2701 }, { "epoch": 1.3597983616887208, "grad_norm": 0.1847991842938757, "learning_rate": 5.86177512164051e-07, "loss": 0.2695, "step": 2702 }, { "epoch": 1.3603024574669187, "grad_norm": 0.16847607424615257, "learning_rate": 5.859129606459477e-07, "loss": 0.2702, "step": 2703 }, { "epoch": 1.3608065532451166, "grad_norm": 0.17245367094502653, "learning_rate": 5.856483843424875e-07, "loss": 0.2667, "step": 2704 }, { "epoch": 1.3613106490233144, "grad_norm": 0.16958701012632785, "learning_rate": 5.853837833299991e-07, "loss": 0.2748, "step": 2705 }, { "epoch": 1.3618147448015123, "grad_norm": 0.17294794292884255, "learning_rate": 5.85119157684818e-07, "loss": 0.2742, "step": 2706 }, { "epoch": 1.3623188405797102, "grad_norm": 0.16738628688725057, "learning_rate": 5.848545074832873e-07, "loss": 0.2549, "step": 2707 }, { "epoch": 1.362822936357908, "grad_norm": 0.17652623817244345, "learning_rate": 5.845898328017566e-07, "loss": 0.2758, "step": 2708 }, { "epoch": 1.3633270321361057, "grad_norm": 0.16927838446248678, "learning_rate": 5.843251337165834e-07, "loss": 0.2714, "step": 2709 }, { "epoch": 1.3638311279143038, "grad_norm": 0.17423646348688704, "learning_rate": 5.840604103041313e-07, "loss": 0.2652, "step": 2710 }, { "epoch": 1.3643352236925015, "grad_norm": 0.17007075110833333, "learning_rate": 5.837956626407717e-07, "loss": 0.2815, "step": 2711 }, { "epoch": 1.3648393194706994, "grad_norm": 0.17563065467297476, "learning_rate": 5.835308908028826e-07, "loss": 0.2837, "step": 2712 }, { "epoch": 1.3653434152488972, "grad_norm": 0.17681431025674674, "learning_rate": 5.832660948668488e-07, "loss": 0.2872, "step": 2713 }, { "epoch": 1.3658475110270951, "grad_norm": 0.17713189828891057, "learning_rate": 5.830012749090624e-07, "loss": 0.2704, "step": 2714 }, { "epoch": 1.366351606805293, "grad_norm": 0.17125824414564597, "learning_rate": 5.827364310059224e-07, "loss": 0.2698, "step": 2715 }, { "epoch": 1.3668557025834909, "grad_norm": 0.1721925153952408, "learning_rate": 5.824715632338345e-07, "loss": 0.2614, "step": 2716 }, { "epoch": 1.3673597983616887, "grad_norm": 0.1738669650774558, "learning_rate": 5.822066716692116e-07, "loss": 0.2891, "step": 2717 }, { "epoch": 1.3678638941398866, "grad_norm": 0.17612032698684543, "learning_rate": 5.819417563884732e-07, "loss": 0.2888, "step": 2718 }, { "epoch": 1.3683679899180845, "grad_norm": 0.1710729077993677, "learning_rate": 5.816768174680457e-07, "loss": 0.2763, "step": 2719 }, { "epoch": 1.3688720856962822, "grad_norm": 0.17794493706059122, "learning_rate": 5.814118549843623e-07, "loss": 0.2623, "step": 2720 }, { "epoch": 1.3693761814744803, "grad_norm": 0.17143323856959825, "learning_rate": 5.81146869013863e-07, "loss": 0.2647, "step": 2721 }, { "epoch": 1.369880277252678, "grad_norm": 0.19765533176326622, "learning_rate": 5.808818596329947e-07, "loss": 0.2896, "step": 2722 }, { "epoch": 1.3703843730308758, "grad_norm": 0.17134806523272622, "learning_rate": 5.806168269182108e-07, "loss": 0.2844, "step": 2723 }, { "epoch": 1.3708884688090737, "grad_norm": 0.17764670109993702, "learning_rate": 5.803517709459718e-07, "loss": 0.2857, "step": 2724 }, { "epoch": 1.3713925645872715, "grad_norm": 0.17405509742325434, "learning_rate": 5.800866917927448e-07, "loss": 0.2808, "step": 2725 }, { "epoch": 1.3718966603654694, "grad_norm": 0.17704791599385142, "learning_rate": 5.798215895350032e-07, "loss": 0.2569, "step": 2726 }, { "epoch": 1.3724007561436673, "grad_norm": 0.19139286392663546, "learning_rate": 5.795564642492274e-07, "loss": 0.2782, "step": 2727 }, { "epoch": 1.3729048519218652, "grad_norm": 0.18973954678563565, "learning_rate": 5.792913160119046e-07, "loss": 0.2646, "step": 2728 }, { "epoch": 1.373408947700063, "grad_norm": 0.1717813204204622, "learning_rate": 5.790261448995283e-07, "loss": 0.2752, "step": 2729 }, { "epoch": 1.373913043478261, "grad_norm": 0.1707475223327584, "learning_rate": 5.787609509885987e-07, "loss": 0.2551, "step": 2730 }, { "epoch": 1.3744171392564588, "grad_norm": 0.1715006155883366, "learning_rate": 5.784957343556227e-07, "loss": 0.2837, "step": 2731 }, { "epoch": 1.3749212350346567, "grad_norm": 0.16563216070091388, "learning_rate": 5.782304950771136e-07, "loss": 0.2482, "step": 2732 }, { "epoch": 1.3754253308128543, "grad_norm": 0.1701892292799077, "learning_rate": 5.779652332295912e-07, "loss": 0.2713, "step": 2733 }, { "epoch": 1.3759294265910522, "grad_norm": 0.16506563913482972, "learning_rate": 5.776999488895821e-07, "loss": 0.2603, "step": 2734 }, { "epoch": 1.37643352236925, "grad_norm": 0.17768766370328332, "learning_rate": 5.774346421336191e-07, "loss": 0.2691, "step": 2735 }, { "epoch": 1.376937618147448, "grad_norm": 0.1821534720498173, "learning_rate": 5.771693130382413e-07, "loss": 0.2755, "step": 2736 }, { "epoch": 1.3774417139256458, "grad_norm": 0.1719459414964014, "learning_rate": 5.769039616799949e-07, "loss": 0.2834, "step": 2737 }, { "epoch": 1.3779458097038437, "grad_norm": 0.17228563061652777, "learning_rate": 5.766385881354319e-07, "loss": 0.2977, "step": 2738 }, { "epoch": 1.3784499054820416, "grad_norm": 0.17256562211361462, "learning_rate": 5.76373192481111e-07, "loss": 0.2736, "step": 2739 }, { "epoch": 1.3789540012602395, "grad_norm": 0.17577017025513997, "learning_rate": 5.76107774793597e-07, "loss": 0.2693, "step": 2740 }, { "epoch": 1.3794580970384374, "grad_norm": 0.17615100356954397, "learning_rate": 5.758423351494617e-07, "loss": 0.2755, "step": 2741 }, { "epoch": 1.3799621928166352, "grad_norm": 0.1769331316897802, "learning_rate": 5.755768736252824e-07, "loss": 0.2651, "step": 2742 }, { "epoch": 1.3804662885948331, "grad_norm": 0.18475639772963626, "learning_rate": 5.753113902976433e-07, "loss": 0.2818, "step": 2743 }, { "epoch": 1.3809703843730308, "grad_norm": 0.17618416233072068, "learning_rate": 5.750458852431346e-07, "loss": 0.2993, "step": 2744 }, { "epoch": 1.3814744801512289, "grad_norm": 0.1855289675606494, "learning_rate": 5.747803585383529e-07, "loss": 0.2645, "step": 2745 }, { "epoch": 1.3819785759294265, "grad_norm": 0.17112395237932768, "learning_rate": 5.74514810259901e-07, "loss": 0.2796, "step": 2746 }, { "epoch": 1.3824826717076244, "grad_norm": 0.17069954299670645, "learning_rate": 5.742492404843877e-07, "loss": 0.2725, "step": 2747 }, { "epoch": 1.3829867674858223, "grad_norm": 0.17473478086312352, "learning_rate": 5.739836492884287e-07, "loss": 0.2567, "step": 2748 }, { "epoch": 1.3834908632640202, "grad_norm": 0.17717349784959838, "learning_rate": 5.737180367486453e-07, "loss": 0.2764, "step": 2749 }, { "epoch": 1.383994959042218, "grad_norm": 0.18599321000957772, "learning_rate": 5.734524029416648e-07, "loss": 0.2776, "step": 2750 }, { "epoch": 1.384499054820416, "grad_norm": 0.17518820340484337, "learning_rate": 5.731867479441211e-07, "loss": 0.2585, "step": 2751 }, { "epoch": 1.3850031505986138, "grad_norm": 0.1792914417737057, "learning_rate": 5.72921071832654e-07, "loss": 0.282, "step": 2752 }, { "epoch": 1.3855072463768117, "grad_norm": 0.17099619039747274, "learning_rate": 5.726553746839094e-07, "loss": 0.2632, "step": 2753 }, { "epoch": 1.3860113421550095, "grad_norm": 0.18225019166152914, "learning_rate": 5.72389656574539e-07, "loss": 0.2695, "step": 2754 }, { "epoch": 1.3865154379332072, "grad_norm": 0.1707269538625628, "learning_rate": 5.721239175812014e-07, "loss": 0.272, "step": 2755 }, { "epoch": 1.3870195337114053, "grad_norm": 0.1796542283862304, "learning_rate": 5.718581577805604e-07, "loss": 0.2721, "step": 2756 }, { "epoch": 1.387523629489603, "grad_norm": 0.1770367282404336, "learning_rate": 5.71592377249286e-07, "loss": 0.2735, "step": 2757 }, { "epoch": 1.3880277252678008, "grad_norm": 0.16921478099539097, "learning_rate": 5.713265760640541e-07, "loss": 0.2778, "step": 2758 }, { "epoch": 1.3885318210459987, "grad_norm": 0.17268757073355442, "learning_rate": 5.71060754301547e-07, "loss": 0.2748, "step": 2759 }, { "epoch": 1.3890359168241966, "grad_norm": 0.17598942075530893, "learning_rate": 5.707949120384523e-07, "loss": 0.2724, "step": 2760 }, { "epoch": 1.3895400126023945, "grad_norm": 0.18011915470443154, "learning_rate": 5.705290493514642e-07, "loss": 0.2891, "step": 2761 }, { "epoch": 1.3900441083805923, "grad_norm": 0.17876735405789806, "learning_rate": 5.702631663172822e-07, "loss": 0.2772, "step": 2762 }, { "epoch": 1.3905482041587902, "grad_norm": 0.17718778143728184, "learning_rate": 5.69997263012612e-07, "loss": 0.2719, "step": 2763 }, { "epoch": 1.391052299936988, "grad_norm": 0.17114075203463552, "learning_rate": 5.697313395141651e-07, "loss": 0.2689, "step": 2764 }, { "epoch": 1.391556395715186, "grad_norm": 0.1880075108323104, "learning_rate": 5.694653958986586e-07, "loss": 0.2893, "step": 2765 }, { "epoch": 1.3920604914933836, "grad_norm": 0.177316108146064, "learning_rate": 5.691994322428159e-07, "loss": 0.2774, "step": 2766 }, { "epoch": 1.3925645872715817, "grad_norm": 0.16915555475740024, "learning_rate": 5.689334486233655e-07, "loss": 0.2605, "step": 2767 }, { "epoch": 1.3930686830497794, "grad_norm": 0.1706580418948374, "learning_rate": 5.686674451170421e-07, "loss": 0.2669, "step": 2768 }, { "epoch": 1.3935727788279773, "grad_norm": 0.18166657991393095, "learning_rate": 5.684014218005861e-07, "loss": 0.271, "step": 2769 }, { "epoch": 1.3940768746061751, "grad_norm": 0.18074355543323026, "learning_rate": 5.68135378750744e-07, "loss": 0.2787, "step": 2770 }, { "epoch": 1.394580970384373, "grad_norm": 0.18191764036464922, "learning_rate": 5.678693160442668e-07, "loss": 0.2694, "step": 2771 }, { "epoch": 1.3950850661625709, "grad_norm": 0.1742981275140808, "learning_rate": 5.676032337579126e-07, "loss": 0.2668, "step": 2772 }, { "epoch": 1.3955891619407688, "grad_norm": 0.182525466069335, "learning_rate": 5.673371319684443e-07, "loss": 0.2704, "step": 2773 }, { "epoch": 1.3960932577189666, "grad_norm": 0.18422795498525135, "learning_rate": 5.670710107526303e-07, "loss": 0.2554, "step": 2774 }, { "epoch": 1.3965973534971645, "grad_norm": 0.16935758864299294, "learning_rate": 5.668048701872453e-07, "loss": 0.2787, "step": 2775 }, { "epoch": 1.3971014492753624, "grad_norm": 0.17250150189867655, "learning_rate": 5.665387103490691e-07, "loss": 0.269, "step": 2776 }, { "epoch": 1.39760554505356, "grad_norm": 0.16845458026081864, "learning_rate": 5.662725313148872e-07, "loss": 0.2591, "step": 2777 }, { "epoch": 1.3981096408317581, "grad_norm": 0.19087750959177846, "learning_rate": 5.660063331614905e-07, "loss": 0.2684, "step": 2778 }, { "epoch": 1.3986137366099558, "grad_norm": 0.17287405774573983, "learning_rate": 5.657401159656757e-07, "loss": 0.2707, "step": 2779 }, { "epoch": 1.3991178323881537, "grad_norm": 0.17266422585724345, "learning_rate": 5.654738798042445e-07, "loss": 0.2669, "step": 2780 }, { "epoch": 1.3996219281663516, "grad_norm": 0.1687413663962204, "learning_rate": 5.652076247540045e-07, "loss": 0.269, "step": 2781 }, { "epoch": 1.4001260239445494, "grad_norm": 0.17337825335332843, "learning_rate": 5.649413508917689e-07, "loss": 0.2687, "step": 2782 }, { "epoch": 1.4006301197227473, "grad_norm": 0.17254045056559464, "learning_rate": 5.646750582943558e-07, "loss": 0.2806, "step": 2783 }, { "epoch": 1.4011342155009452, "grad_norm": 0.17604086047481585, "learning_rate": 5.644087470385889e-07, "loss": 0.2794, "step": 2784 }, { "epoch": 1.401638311279143, "grad_norm": 0.17593248783979848, "learning_rate": 5.641424172012976e-07, "loss": 0.2687, "step": 2785 }, { "epoch": 1.402142407057341, "grad_norm": 0.17677274912632954, "learning_rate": 5.638760688593162e-07, "loss": 0.2667, "step": 2786 }, { "epoch": 1.402142407057341, "eval_loss": 0.3082324266433716, "eval_runtime": 17.3029, "eval_samples_per_second": 49.414, "eval_steps_per_second": 1.04, "step": 2786 }, { "epoch": 1.4026465028355388, "grad_norm": 0.17615085346486575, "learning_rate": 5.636097020894849e-07, "loss": 0.2652, "step": 2787 }, { "epoch": 1.4031505986137365, "grad_norm": 0.18280305120298956, "learning_rate": 5.633433169686483e-07, "loss": 0.2622, "step": 2788 }, { "epoch": 1.4036546943919346, "grad_norm": 0.1766526757543098, "learning_rate": 5.630769135736573e-07, "loss": 0.2855, "step": 2789 }, { "epoch": 1.4041587901701322, "grad_norm": 0.18013844905463533, "learning_rate": 5.628104919813673e-07, "loss": 0.2864, "step": 2790 }, { "epoch": 1.40466288594833, "grad_norm": 0.17308521362796225, "learning_rate": 5.625440522686395e-07, "loss": 0.2822, "step": 2791 }, { "epoch": 1.405166981726528, "grad_norm": 0.17384117426113055, "learning_rate": 5.622775945123401e-07, "loss": 0.2726, "step": 2792 }, { "epoch": 1.4056710775047259, "grad_norm": 0.18309399117577832, "learning_rate": 5.620111187893404e-07, "loss": 0.2632, "step": 2793 }, { "epoch": 1.4061751732829237, "grad_norm": 0.17415826316818256, "learning_rate": 5.617446251765173e-07, "loss": 0.2731, "step": 2794 }, { "epoch": 1.4066792690611216, "grad_norm": 0.1739100478730036, "learning_rate": 5.614781137507521e-07, "loss": 0.2762, "step": 2795 }, { "epoch": 1.4071833648393195, "grad_norm": 0.1764423898652386, "learning_rate": 5.612115845889321e-07, "loss": 0.2849, "step": 2796 }, { "epoch": 1.4076874606175174, "grad_norm": 0.1760604570799819, "learning_rate": 5.60945037767949e-07, "loss": 0.2719, "step": 2797 }, { "epoch": 1.4081915563957152, "grad_norm": 0.17431566775862142, "learning_rate": 5.606784733647e-07, "loss": 0.253, "step": 2798 }, { "epoch": 1.4086956521739131, "grad_norm": 0.17776800316760477, "learning_rate": 5.604118914560873e-07, "loss": 0.2699, "step": 2799 }, { "epoch": 1.409199747952111, "grad_norm": 0.18918069069185012, "learning_rate": 5.601452921190183e-07, "loss": 0.2717, "step": 2800 }, { "epoch": 1.4097038437303087, "grad_norm": 0.1732601715342729, "learning_rate": 5.598786754304051e-07, "loss": 0.2714, "step": 2801 }, { "epoch": 1.4102079395085065, "grad_norm": 0.1791884506349457, "learning_rate": 5.596120414671649e-07, "loss": 0.2693, "step": 2802 }, { "epoch": 1.4107120352867044, "grad_norm": 0.17551762309525726, "learning_rate": 5.5934539030622e-07, "loss": 0.2931, "step": 2803 }, { "epoch": 1.4112161310649023, "grad_norm": 0.17653447291530958, "learning_rate": 5.590787220244975e-07, "loss": 0.279, "step": 2804 }, { "epoch": 1.4117202268431002, "grad_norm": 0.1715844081659434, "learning_rate": 5.588120366989299e-07, "loss": 0.2692, "step": 2805 }, { "epoch": 1.412224322621298, "grad_norm": 0.17195024969559972, "learning_rate": 5.585453344064538e-07, "loss": 0.2755, "step": 2806 }, { "epoch": 1.412728418399496, "grad_norm": 0.17614434935253434, "learning_rate": 5.582786152240116e-07, "loss": 0.2868, "step": 2807 }, { "epoch": 1.4132325141776938, "grad_norm": 0.17488803322082694, "learning_rate": 5.580118792285497e-07, "loss": 0.2785, "step": 2808 }, { "epoch": 1.4137366099558917, "grad_norm": 0.17403718634941737, "learning_rate": 5.577451264970203e-07, "loss": 0.2779, "step": 2809 }, { "epoch": 1.4142407057340896, "grad_norm": 0.180098783645297, "learning_rate": 5.574783571063795e-07, "loss": 0.2805, "step": 2810 }, { "epoch": 1.4147448015122874, "grad_norm": 0.1735655015170712, "learning_rate": 5.572115711335886e-07, "loss": 0.2695, "step": 2811 }, { "epoch": 1.415248897290485, "grad_norm": 0.1741615072776797, "learning_rate": 5.56944768655614e-07, "loss": 0.2585, "step": 2812 }, { "epoch": 1.4157529930686832, "grad_norm": 0.1743862849176332, "learning_rate": 5.566779497494264e-07, "loss": 0.2979, "step": 2813 }, { "epoch": 1.4162570888468808, "grad_norm": 0.17758390356185494, "learning_rate": 5.564111144920013e-07, "loss": 0.2745, "step": 2814 }, { "epoch": 1.4167611846250787, "grad_norm": 0.1834026948019301, "learning_rate": 5.561442629603192e-07, "loss": 0.2768, "step": 2815 }, { "epoch": 1.4172652804032766, "grad_norm": 0.17218386664662416, "learning_rate": 5.55877395231365e-07, "loss": 0.278, "step": 2816 }, { "epoch": 1.4177693761814745, "grad_norm": 0.17269835730848343, "learning_rate": 5.556105113821285e-07, "loss": 0.272, "step": 2817 }, { "epoch": 1.4182734719596723, "grad_norm": 0.19670398507490758, "learning_rate": 5.553436114896037e-07, "loss": 0.2511, "step": 2818 }, { "epoch": 1.4187775677378702, "grad_norm": 0.1715178941839397, "learning_rate": 5.5507669563079e-07, "loss": 0.2735, "step": 2819 }, { "epoch": 1.419281663516068, "grad_norm": 0.1729826705555197, "learning_rate": 5.548097638826907e-07, "loss": 0.2707, "step": 2820 }, { "epoch": 1.419785759294266, "grad_norm": 0.1725819935420221, "learning_rate": 5.545428163223142e-07, "loss": 0.2671, "step": 2821 }, { "epoch": 1.4202898550724639, "grad_norm": 0.17665111931235344, "learning_rate": 5.542758530266729e-07, "loss": 0.2538, "step": 2822 }, { "epoch": 1.4207939508506615, "grad_norm": 0.17897097101640105, "learning_rate": 5.540088740727843e-07, "loss": 0.2697, "step": 2823 }, { "epoch": 1.4212980466288596, "grad_norm": 0.1687081289149051, "learning_rate": 5.537418795376702e-07, "loss": 0.2828, "step": 2824 }, { "epoch": 1.4218021424070573, "grad_norm": 0.20222752341146646, "learning_rate": 5.534748694983567e-07, "loss": 0.2713, "step": 2825 }, { "epoch": 1.4223062381852551, "grad_norm": 0.17189172904171757, "learning_rate": 5.532078440318746e-07, "loss": 0.2698, "step": 2826 }, { "epoch": 1.422810333963453, "grad_norm": 0.17345223486754652, "learning_rate": 5.52940803215259e-07, "loss": 0.276, "step": 2827 }, { "epoch": 1.423314429741651, "grad_norm": 0.2080580963062512, "learning_rate": 5.526737471255498e-07, "loss": 0.2878, "step": 2828 }, { "epoch": 1.4238185255198488, "grad_norm": 0.1993251803507769, "learning_rate": 5.524066758397907e-07, "loss": 0.2741, "step": 2829 }, { "epoch": 1.4243226212980467, "grad_norm": 0.19657833333195346, "learning_rate": 5.521395894350303e-07, "loss": 0.2743, "step": 2830 }, { "epoch": 1.4248267170762445, "grad_norm": 0.18203360278973735, "learning_rate": 5.518724879883215e-07, "loss": 0.2769, "step": 2831 }, { "epoch": 1.4253308128544424, "grad_norm": 0.18264070770705262, "learning_rate": 5.516053715767211e-07, "loss": 0.2853, "step": 2832 }, { "epoch": 1.4258349086326403, "grad_norm": 0.18164978911251264, "learning_rate": 5.513382402772906e-07, "loss": 0.2755, "step": 2833 }, { "epoch": 1.426339004410838, "grad_norm": 0.1662574674843768, "learning_rate": 5.510710941670959e-07, "loss": 0.2689, "step": 2834 }, { "epoch": 1.426843100189036, "grad_norm": 0.17319135764139246, "learning_rate": 5.508039333232069e-07, "loss": 0.2856, "step": 2835 }, { "epoch": 1.4273471959672337, "grad_norm": 0.1827176229024098, "learning_rate": 5.505367578226978e-07, "loss": 0.2842, "step": 2836 }, { "epoch": 1.4278512917454316, "grad_norm": 0.1908930430427692, "learning_rate": 5.502695677426471e-07, "loss": 0.2634, "step": 2837 }, { "epoch": 1.4283553875236294, "grad_norm": 0.18109239755144405, "learning_rate": 5.500023631601376e-07, "loss": 0.2786, "step": 2838 }, { "epoch": 1.4288594833018273, "grad_norm": 0.18601457798351573, "learning_rate": 5.497351441522561e-07, "loss": 0.2773, "step": 2839 }, { "epoch": 1.4293635790800252, "grad_norm": 0.1851254103241904, "learning_rate": 5.494679107960936e-07, "loss": 0.2918, "step": 2840 }, { "epoch": 1.429867674858223, "grad_norm": 0.1740395548507666, "learning_rate": 5.492006631687451e-07, "loss": 0.2681, "step": 2841 }, { "epoch": 1.430371770636421, "grad_norm": 0.17944562058468594, "learning_rate": 5.489334013473103e-07, "loss": 0.2643, "step": 2842 }, { "epoch": 1.4308758664146188, "grad_norm": 0.1721575725843985, "learning_rate": 5.486661254088921e-07, "loss": 0.2678, "step": 2843 }, { "epoch": 1.4313799621928167, "grad_norm": 0.18124114212980594, "learning_rate": 5.483988354305984e-07, "loss": 0.286, "step": 2844 }, { "epoch": 1.4318840579710144, "grad_norm": 0.17177819687786705, "learning_rate": 5.481315314895404e-07, "loss": 0.2883, "step": 2845 }, { "epoch": 1.4323881537492125, "grad_norm": 0.17999663813819824, "learning_rate": 5.478642136628338e-07, "loss": 0.2506, "step": 2846 }, { "epoch": 1.4328922495274101, "grad_norm": 0.1759171770648982, "learning_rate": 5.47596882027598e-07, "loss": 0.2735, "step": 2847 }, { "epoch": 1.433396345305608, "grad_norm": 0.1816944331335307, "learning_rate": 5.473295366609566e-07, "loss": 0.2849, "step": 2848 }, { "epoch": 1.4339004410838059, "grad_norm": 0.17100685390259318, "learning_rate": 5.470621776400371e-07, "loss": 0.2612, "step": 2849 }, { "epoch": 1.4344045368620038, "grad_norm": 0.1819142852011383, "learning_rate": 5.467948050419707e-07, "loss": 0.2783, "step": 2850 }, { "epoch": 1.4349086326402016, "grad_norm": 0.18361474818908685, "learning_rate": 5.465274189438931e-07, "loss": 0.284, "step": 2851 }, { "epoch": 1.4354127284183995, "grad_norm": 0.1752427494123583, "learning_rate": 5.462600194229432e-07, "loss": 0.2823, "step": 2852 }, { "epoch": 1.4359168241965974, "grad_norm": 0.1773020482286541, "learning_rate": 5.459926065562643e-07, "loss": 0.2686, "step": 2853 }, { "epoch": 1.4364209199747953, "grad_norm": 0.17566660311428434, "learning_rate": 5.457251804210035e-07, "loss": 0.2668, "step": 2854 }, { "epoch": 1.4369250157529931, "grad_norm": 0.17963004958574516, "learning_rate": 5.454577410943113e-07, "loss": 0.2713, "step": 2855 }, { "epoch": 1.4374291115311908, "grad_norm": 0.17390970931818708, "learning_rate": 5.451902886533424e-07, "loss": 0.2681, "step": 2856 }, { "epoch": 1.437933207309389, "grad_norm": 0.17729799370957, "learning_rate": 5.449228231752551e-07, "loss": 0.272, "step": 2857 }, { "epoch": 1.4384373030875865, "grad_norm": 0.17507123221756268, "learning_rate": 5.446553447372117e-07, "loss": 0.2775, "step": 2858 }, { "epoch": 1.4389413988657844, "grad_norm": 0.17681228025938894, "learning_rate": 5.443878534163779e-07, "loss": 0.2831, "step": 2859 }, { "epoch": 1.4394454946439823, "grad_norm": 0.1821736618960098, "learning_rate": 5.441203492899233e-07, "loss": 0.2841, "step": 2860 }, { "epoch": 1.4399495904221802, "grad_norm": 0.17903890868161446, "learning_rate": 5.438528324350214e-07, "loss": 0.278, "step": 2861 }, { "epoch": 1.440453686200378, "grad_norm": 0.180429015598911, "learning_rate": 5.435853029288489e-07, "loss": 0.2746, "step": 2862 }, { "epoch": 1.440957781978576, "grad_norm": 0.17355901123245668, "learning_rate": 5.433177608485865e-07, "loss": 0.2679, "step": 2863 }, { "epoch": 1.4414618777567738, "grad_norm": 0.180173248420686, "learning_rate": 5.430502062714184e-07, "loss": 0.2698, "step": 2864 }, { "epoch": 1.4419659735349717, "grad_norm": 0.17214222893696038, "learning_rate": 5.427826392745325e-07, "loss": 0.2769, "step": 2865 }, { "epoch": 1.4424700693131696, "grad_norm": 0.1726707684780785, "learning_rate": 5.425150599351201e-07, "loss": 0.2819, "step": 2866 }, { "epoch": 1.4429741650913672, "grad_norm": 0.17661401865470128, "learning_rate": 5.422474683303765e-07, "loss": 0.2633, "step": 2867 }, { "epoch": 1.4434782608695653, "grad_norm": 0.16966464132316653, "learning_rate": 5.419798645374998e-07, "loss": 0.2749, "step": 2868 }, { "epoch": 1.443982356647763, "grad_norm": 0.17637511222129923, "learning_rate": 5.417122486336923e-07, "loss": 0.2747, "step": 2869 }, { "epoch": 1.4444864524259609, "grad_norm": 0.1747096047216983, "learning_rate": 5.414446206961596e-07, "loss": 0.2701, "step": 2870 }, { "epoch": 1.4449905482041587, "grad_norm": 0.18391353933023288, "learning_rate": 5.411769808021104e-07, "loss": 0.2786, "step": 2871 }, { "epoch": 1.4454946439823566, "grad_norm": 0.1731778758855043, "learning_rate": 5.409093290287573e-07, "loss": 0.2568, "step": 2872 }, { "epoch": 1.4459987397605545, "grad_norm": 0.16898295126793264, "learning_rate": 5.406416654533163e-07, "loss": 0.2657, "step": 2873 }, { "epoch": 1.4465028355387524, "grad_norm": 0.18058265233021242, "learning_rate": 5.403739901530065e-07, "loss": 0.2765, "step": 2874 }, { "epoch": 1.4470069313169502, "grad_norm": 0.17654601160313765, "learning_rate": 5.401063032050507e-07, "loss": 0.2789, "step": 2875 }, { "epoch": 1.4475110270951481, "grad_norm": 0.17294349882056775, "learning_rate": 5.398386046866747e-07, "loss": 0.268, "step": 2876 }, { "epoch": 1.448015122873346, "grad_norm": 0.17407010490339622, "learning_rate": 5.395708946751083e-07, "loss": 0.2752, "step": 2877 }, { "epoch": 1.4485192186515439, "grad_norm": 0.1869029531049797, "learning_rate": 5.393031732475837e-07, "loss": 0.2839, "step": 2878 }, { "epoch": 1.4490233144297417, "grad_norm": 0.17047646712682293, "learning_rate": 5.390354404813373e-07, "loss": 0.2783, "step": 2879 }, { "epoch": 1.4495274102079394, "grad_norm": 0.1740578915583476, "learning_rate": 5.38767696453608e-07, "loss": 0.2903, "step": 2880 }, { "epoch": 1.4500315059861375, "grad_norm": 0.18937601572967064, "learning_rate": 5.384999412416383e-07, "loss": 0.2649, "step": 2881 }, { "epoch": 1.4505356017643352, "grad_norm": 0.17078602897509845, "learning_rate": 5.382321749226743e-07, "loss": 0.2753, "step": 2882 }, { "epoch": 1.451039697542533, "grad_norm": 0.18237399698730128, "learning_rate": 5.379643975739647e-07, "loss": 0.2991, "step": 2883 }, { "epoch": 1.451543793320731, "grad_norm": 0.1741266975993158, "learning_rate": 5.376966092727613e-07, "loss": 0.271, "step": 2884 }, { "epoch": 1.4520478890989288, "grad_norm": 0.17610657049121992, "learning_rate": 5.374288100963198e-07, "loss": 0.2889, "step": 2885 }, { "epoch": 1.4525519848771267, "grad_norm": 0.17138137935782394, "learning_rate": 5.371610001218983e-07, "loss": 0.2649, "step": 2886 }, { "epoch": 1.4530560806553245, "grad_norm": 0.17838838912402527, "learning_rate": 5.368931794267586e-07, "loss": 0.2753, "step": 2887 }, { "epoch": 1.4535601764335224, "grad_norm": 0.1746687660444875, "learning_rate": 5.366253480881651e-07, "loss": 0.2763, "step": 2888 }, { "epoch": 1.4540642722117203, "grad_norm": 0.17490010732782957, "learning_rate": 5.363575061833856e-07, "loss": 0.273, "step": 2889 }, { "epoch": 1.4545683679899182, "grad_norm": 0.1753044578433687, "learning_rate": 5.360896537896909e-07, "loss": 0.2818, "step": 2890 }, { "epoch": 1.4550724637681158, "grad_norm": 0.17281297026171832, "learning_rate": 5.358217909843545e-07, "loss": 0.2616, "step": 2891 }, { "epoch": 1.455576559546314, "grad_norm": 0.17311583025271762, "learning_rate": 5.355539178446535e-07, "loss": 0.2705, "step": 2892 }, { "epoch": 1.4560806553245116, "grad_norm": 0.17027692820248555, "learning_rate": 5.352860344478673e-07, "loss": 0.2736, "step": 2893 }, { "epoch": 1.4565847511027095, "grad_norm": 0.17311501562837145, "learning_rate": 5.350181408712787e-07, "loss": 0.2563, "step": 2894 }, { "epoch": 1.4570888468809073, "grad_norm": 0.17543624536918084, "learning_rate": 5.347502371921735e-07, "loss": 0.2736, "step": 2895 }, { "epoch": 1.4575929426591052, "grad_norm": 0.19910330074338614, "learning_rate": 5.3448232348784e-07, "loss": 0.273, "step": 2896 }, { "epoch": 1.458097038437303, "grad_norm": 0.1738967237730725, "learning_rate": 5.342143998355698e-07, "loss": 0.274, "step": 2897 }, { "epoch": 1.458601134215501, "grad_norm": 0.17570081461431064, "learning_rate": 5.339464663126574e-07, "loss": 0.2759, "step": 2898 }, { "epoch": 1.4591052299936988, "grad_norm": 0.17791619225009586, "learning_rate": 5.336785229963996e-07, "loss": 0.2623, "step": 2899 }, { "epoch": 1.4596093257718967, "grad_norm": 0.17002260231876098, "learning_rate": 5.334105699640965e-07, "loss": 0.266, "step": 2900 }, { "epoch": 1.4601134215500946, "grad_norm": 0.18429176939879613, "learning_rate": 5.33142607293051e-07, "loss": 0.2819, "step": 2901 }, { "epoch": 1.4606175173282923, "grad_norm": 0.17868955547716125, "learning_rate": 5.328746350605685e-07, "loss": 0.2682, "step": 2902 }, { "epoch": 1.4611216131064904, "grad_norm": 0.16918396641956726, "learning_rate": 5.326066533439575e-07, "loss": 0.2755, "step": 2903 }, { "epoch": 1.461625708884688, "grad_norm": 0.17069887650764975, "learning_rate": 5.323386622205291e-07, "loss": 0.2801, "step": 2904 }, { "epoch": 1.462129804662886, "grad_norm": 0.1701037068935557, "learning_rate": 5.320706617675968e-07, "loss": 0.2765, "step": 2905 }, { "epoch": 1.4626339004410838, "grad_norm": 0.17789523122182876, "learning_rate": 5.318026520624774e-07, "loss": 0.2912, "step": 2906 }, { "epoch": 1.4631379962192816, "grad_norm": 0.19749764941108588, "learning_rate": 5.315346331824898e-07, "loss": 0.2853, "step": 2907 }, { "epoch": 1.4636420919974795, "grad_norm": 0.1756855169152155, "learning_rate": 5.31266605204956e-07, "loss": 0.2832, "step": 2908 }, { "epoch": 1.4641461877756774, "grad_norm": 0.1786872687703457, "learning_rate": 5.309985682072001e-07, "loss": 0.2665, "step": 2909 }, { "epoch": 1.4646502835538753, "grad_norm": 0.1877786342320192, "learning_rate": 5.307305222665494e-07, "loss": 0.2774, "step": 2910 }, { "epoch": 1.4651543793320732, "grad_norm": 0.17127707594522085, "learning_rate": 5.304624674603335e-07, "loss": 0.2646, "step": 2911 }, { "epoch": 1.465658475110271, "grad_norm": 0.2560823194379178, "learning_rate": 5.301944038658842e-07, "loss": 0.2669, "step": 2912 }, { "epoch": 1.4661625708884687, "grad_norm": 0.1766829426286369, "learning_rate": 5.299263315605367e-07, "loss": 0.2781, "step": 2913 }, { "epoch": 1.4666666666666668, "grad_norm": 0.16776123278488433, "learning_rate": 5.296582506216277e-07, "loss": 0.2616, "step": 2914 }, { "epoch": 1.4671707624448644, "grad_norm": 0.17648227246454712, "learning_rate": 5.293901611264971e-07, "loss": 0.2649, "step": 2915 }, { "epoch": 1.4676748582230623, "grad_norm": 0.17432483953665048, "learning_rate": 5.291220631524872e-07, "loss": 0.2687, "step": 2916 }, { "epoch": 1.4681789540012602, "grad_norm": 0.20520459105179642, "learning_rate": 5.288539567769424e-07, "loss": 0.2862, "step": 2917 }, { "epoch": 1.468683049779458, "grad_norm": 0.18543376500369232, "learning_rate": 5.285858420772099e-07, "loss": 0.2541, "step": 2918 }, { "epoch": 1.469187145557656, "grad_norm": 0.17322589885199907, "learning_rate": 5.283177191306389e-07, "loss": 0.2748, "step": 2919 }, { "epoch": 1.4696912413358538, "grad_norm": 0.18616715044944854, "learning_rate": 5.280495880145814e-07, "loss": 0.2791, "step": 2920 }, { "epoch": 1.4701953371140517, "grad_norm": 0.17972067487672042, "learning_rate": 5.277814488063918e-07, "loss": 0.283, "step": 2921 }, { "epoch": 1.4706994328922496, "grad_norm": 0.19354601757670176, "learning_rate": 5.27513301583426e-07, "loss": 0.2733, "step": 2922 }, { "epoch": 1.4712035286704475, "grad_norm": 0.17386234930511313, "learning_rate": 5.272451464230433e-07, "loss": 0.2667, "step": 2923 }, { "epoch": 1.4717076244486451, "grad_norm": 0.1747812393098067, "learning_rate": 5.269769834026045e-07, "loss": 0.2538, "step": 2924 }, { "epoch": 1.4722117202268432, "grad_norm": 0.18216928171454855, "learning_rate": 5.267088125994732e-07, "loss": 0.2689, "step": 2925 }, { "epoch": 1.4727158160050409, "grad_norm": 0.18671286603946396, "learning_rate": 5.264406340910148e-07, "loss": 0.2682, "step": 2926 }, { "epoch": 1.4732199117832387, "grad_norm": 0.17551742566377754, "learning_rate": 5.261724479545974e-07, "loss": 0.2913, "step": 2927 }, { "epoch": 1.4737240075614366, "grad_norm": 0.24593696823308445, "learning_rate": 5.259042542675907e-07, "loss": 0.2717, "step": 2928 }, { "epoch": 1.4742281033396345, "grad_norm": 0.1716232359748941, "learning_rate": 5.256360531073674e-07, "loss": 0.2791, "step": 2929 }, { "epoch": 1.4747321991178324, "grad_norm": 0.17889186640749882, "learning_rate": 5.253678445513014e-07, "loss": 0.2559, "step": 2930 }, { "epoch": 1.4752362948960303, "grad_norm": 0.16660287168913102, "learning_rate": 5.250996286767693e-07, "loss": 0.2796, "step": 2931 }, { "epoch": 1.4757403906742281, "grad_norm": 0.17208791704520798, "learning_rate": 5.248314055611499e-07, "loss": 0.2789, "step": 2932 }, { "epoch": 1.476244486452426, "grad_norm": 0.16785449767453983, "learning_rate": 5.245631752818238e-07, "loss": 0.2517, "step": 2933 }, { "epoch": 1.4767485822306239, "grad_norm": 0.17104687297149335, "learning_rate": 5.242949379161739e-07, "loss": 0.2669, "step": 2934 }, { "epoch": 1.4772526780088215, "grad_norm": 0.17816748061415208, "learning_rate": 5.240266935415847e-07, "loss": 0.2761, "step": 2935 }, { "epoch": 1.4777567737870196, "grad_norm": 0.17497683850506593, "learning_rate": 5.237584422354435e-07, "loss": 0.2691, "step": 2936 }, { "epoch": 1.4782608695652173, "grad_norm": 0.17896943432738507, "learning_rate": 5.234901840751388e-07, "loss": 0.2691, "step": 2937 }, { "epoch": 1.4787649653434152, "grad_norm": 0.18082033748198661, "learning_rate": 5.232219191380614e-07, "loss": 0.2726, "step": 2938 }, { "epoch": 1.479269061121613, "grad_norm": 0.17731545802828333, "learning_rate": 5.229536475016044e-07, "loss": 0.2817, "step": 2939 }, { "epoch": 1.479773156899811, "grad_norm": 0.17098014110692908, "learning_rate": 5.22685369243162e-07, "loss": 0.2714, "step": 2940 }, { "epoch": 1.4802772526780088, "grad_norm": 0.1813522016663038, "learning_rate": 5.224170844401313e-07, "loss": 0.2846, "step": 2941 }, { "epoch": 1.4807813484562067, "grad_norm": 0.17197034879530912, "learning_rate": 5.221487931699107e-07, "loss": 0.2667, "step": 2942 }, { "epoch": 1.4812854442344046, "grad_norm": 0.16697715148870823, "learning_rate": 5.218804955099003e-07, "loss": 0.2627, "step": 2943 }, { "epoch": 1.4817895400126024, "grad_norm": 0.17040842764616276, "learning_rate": 5.216121915375026e-07, "loss": 0.2577, "step": 2944 }, { "epoch": 1.4822936357908003, "grad_norm": 0.17579731593772854, "learning_rate": 5.213438813301214e-07, "loss": 0.2811, "step": 2945 }, { "epoch": 1.4827977315689982, "grad_norm": 0.17045185683058092, "learning_rate": 5.210755649651627e-07, "loss": 0.255, "step": 2946 }, { "epoch": 1.483301827347196, "grad_norm": 0.17068201983517348, "learning_rate": 5.20807242520034e-07, "loss": 0.2743, "step": 2947 }, { "epoch": 1.4838059231253937, "grad_norm": 0.18181611693726946, "learning_rate": 5.205389140721448e-07, "loss": 0.2732, "step": 2948 }, { "epoch": 1.4843100189035916, "grad_norm": 0.1852929761291196, "learning_rate": 5.202705796989061e-07, "loss": 0.2801, "step": 2949 }, { "epoch": 1.4848141146817895, "grad_norm": 0.17600122731072515, "learning_rate": 5.200022394777308e-07, "loss": 0.2716, "step": 2950 }, { "epoch": 1.4853182104599874, "grad_norm": 0.1718236542287459, "learning_rate": 5.197338934860332e-07, "loss": 0.2674, "step": 2951 }, { "epoch": 1.4858223062381852, "grad_norm": 0.1775790487723521, "learning_rate": 5.194655418012295e-07, "loss": 0.2825, "step": 2952 }, { "epoch": 1.486326402016383, "grad_norm": 0.1807315629873165, "learning_rate": 5.191971845007378e-07, "loss": 0.3003, "step": 2953 }, { "epoch": 1.486830497794581, "grad_norm": 0.17385001875969464, "learning_rate": 5.189288216619773e-07, "loss": 0.2586, "step": 2954 }, { "epoch": 1.4873345935727789, "grad_norm": 0.17077436408789423, "learning_rate": 5.186604533623689e-07, "loss": 0.2774, "step": 2955 }, { "epoch": 1.4878386893509767, "grad_norm": 0.17354882163404403, "learning_rate": 5.183920796793353e-07, "loss": 0.2695, "step": 2956 }, { "epoch": 1.4883427851291746, "grad_norm": 0.18056872611734137, "learning_rate": 5.181237006903007e-07, "loss": 0.2575, "step": 2957 }, { "epoch": 1.4888468809073725, "grad_norm": 0.1747297098238214, "learning_rate": 5.178553164726906e-07, "loss": 0.2832, "step": 2958 }, { "epoch": 1.4893509766855701, "grad_norm": 0.17268688557025336, "learning_rate": 5.175869271039325e-07, "loss": 0.2745, "step": 2959 }, { "epoch": 1.4898550724637682, "grad_norm": 0.1723723042058958, "learning_rate": 5.173185326614546e-07, "loss": 0.2835, "step": 2960 }, { "epoch": 1.490359168241966, "grad_norm": 0.16999642870279735, "learning_rate": 5.170501332226875e-07, "loss": 0.2734, "step": 2961 }, { "epoch": 1.4908632640201638, "grad_norm": 0.16872935699465505, "learning_rate": 5.167817288650625e-07, "loss": 0.2756, "step": 2962 }, { "epoch": 1.4913673597983617, "grad_norm": 0.18195730992832942, "learning_rate": 5.165133196660128e-07, "loss": 0.275, "step": 2963 }, { "epoch": 1.4918714555765595, "grad_norm": 0.1694600503693703, "learning_rate": 5.162449057029725e-07, "loss": 0.2677, "step": 2964 }, { "epoch": 1.4923755513547574, "grad_norm": 0.17420508681132724, "learning_rate": 5.159764870533777e-07, "loss": 0.2804, "step": 2965 }, { "epoch": 1.4928796471329553, "grad_norm": 0.16859261160349226, "learning_rate": 5.157080637946654e-07, "loss": 0.2705, "step": 2966 }, { "epoch": 1.4933837429111532, "grad_norm": 0.17551194260816222, "learning_rate": 5.154396360042738e-07, "loss": 0.2592, "step": 2967 }, { "epoch": 1.493887838689351, "grad_norm": 0.17227527863330538, "learning_rate": 5.15171203759643e-07, "loss": 0.2674, "step": 2968 }, { "epoch": 1.494391934467549, "grad_norm": 0.1728755992481674, "learning_rate": 5.149027671382138e-07, "loss": 0.2705, "step": 2969 }, { "epoch": 1.4948960302457466, "grad_norm": 0.17382519055950477, "learning_rate": 5.146343262174286e-07, "loss": 0.2652, "step": 2970 }, { "epoch": 1.4954001260239447, "grad_norm": 0.1769092204357377, "learning_rate": 5.14365881074731e-07, "loss": 0.2767, "step": 2971 }, { "epoch": 1.4959042218021423, "grad_norm": 0.17476416087544372, "learning_rate": 5.140974317875657e-07, "loss": 0.2753, "step": 2972 }, { "epoch": 1.4964083175803402, "grad_norm": 0.169687985399867, "learning_rate": 5.138289784333787e-07, "loss": 0.2765, "step": 2973 }, { "epoch": 1.496912413358538, "grad_norm": 0.17261770629037, "learning_rate": 5.13560521089617e-07, "loss": 0.2732, "step": 2974 }, { "epoch": 1.497416509136736, "grad_norm": 0.17477899451723963, "learning_rate": 5.13292059833729e-07, "loss": 0.2599, "step": 2975 }, { "epoch": 1.4979206049149338, "grad_norm": 0.18102926267539213, "learning_rate": 5.13023594743164e-07, "loss": 0.2674, "step": 2976 }, { "epoch": 1.4984247006931317, "grad_norm": 0.17738927553973277, "learning_rate": 5.127551258953727e-07, "loss": 0.2672, "step": 2977 }, { "epoch": 1.4989287964713296, "grad_norm": 0.18069762077371987, "learning_rate": 5.124866533678066e-07, "loss": 0.2665, "step": 2978 }, { "epoch": 1.4994328922495275, "grad_norm": 0.17624639732046102, "learning_rate": 5.122181772379182e-07, "loss": 0.2847, "step": 2979 }, { "epoch": 1.4999369880277253, "grad_norm": 0.21149596045503335, "learning_rate": 5.119496975831616e-07, "loss": 0.2755, "step": 2980 }, { "epoch": 1.500441083805923, "grad_norm": 0.17200655792505817, "learning_rate": 5.116812144809911e-07, "loss": 0.2776, "step": 2981 }, { "epoch": 1.500945179584121, "grad_norm": 0.199818514038348, "learning_rate": 5.114127280088627e-07, "loss": 0.2721, "step": 2982 }, { "epoch": 1.5014492753623188, "grad_norm": 0.1767000934409416, "learning_rate": 5.111442382442328e-07, "loss": 0.2897, "step": 2983 }, { "epoch": 1.5019533711405169, "grad_norm": 0.17657617178530624, "learning_rate": 5.108757452645594e-07, "loss": 0.2585, "step": 2984 }, { "epoch": 1.5024574669187145, "grad_norm": 0.17907925642962816, "learning_rate": 5.106072491473008e-07, "loss": 0.2753, "step": 2985 }, { "epoch": 1.5024574669187145, "eval_loss": 0.30766692757606506, "eval_runtime": 17.4903, "eval_samples_per_second": 48.884, "eval_steps_per_second": 1.029, "step": 2985 }, { "epoch": 1.5029615626969124, "grad_norm": 0.17603805841778422, "learning_rate": 5.103387499699164e-07, "loss": 0.2658, "step": 2986 }, { "epoch": 1.5034656584751103, "grad_norm": 0.1772161820090902, "learning_rate": 5.100702478098667e-07, "loss": 0.2757, "step": 2987 }, { "epoch": 1.5039697542533081, "grad_norm": 0.17226447203172757, "learning_rate": 5.098017427446132e-07, "loss": 0.2765, "step": 2988 }, { "epoch": 1.504473850031506, "grad_norm": 0.17276877170127075, "learning_rate": 5.095332348516172e-07, "loss": 0.2752, "step": 2989 }, { "epoch": 1.504977945809704, "grad_norm": 0.1847638673736269, "learning_rate": 5.092647242083423e-07, "loss": 0.2789, "step": 2990 }, { "epoch": 1.5054820415879018, "grad_norm": 0.17901616149206137, "learning_rate": 5.089962108922517e-07, "loss": 0.2911, "step": 2991 }, { "epoch": 1.5059861373660994, "grad_norm": 0.18718528180473848, "learning_rate": 5.0872769498081e-07, "loss": 0.2618, "step": 2992 }, { "epoch": 1.5064902331442975, "grad_norm": 0.17561340990666174, "learning_rate": 5.084591765514824e-07, "loss": 0.2796, "step": 2993 }, { "epoch": 1.5069943289224952, "grad_norm": 0.17573027328641988, "learning_rate": 5.081906556817348e-07, "loss": 0.2746, "step": 2994 }, { "epoch": 1.5074984247006933, "grad_norm": 0.1946924830736341, "learning_rate": 5.079221324490338e-07, "loss": 0.2596, "step": 2995 }, { "epoch": 1.508002520478891, "grad_norm": 0.19483610334363802, "learning_rate": 5.076536069308466e-07, "loss": 0.2661, "step": 2996 }, { "epoch": 1.5085066162570888, "grad_norm": 0.17050635285106475, "learning_rate": 5.073850792046411e-07, "loss": 0.2845, "step": 2997 }, { "epoch": 1.5090107120352867, "grad_norm": 0.17434918537651012, "learning_rate": 5.071165493478862e-07, "loss": 0.2763, "step": 2998 }, { "epoch": 1.5095148078134846, "grad_norm": 0.17421166183775727, "learning_rate": 5.068480174380507e-07, "loss": 0.2849, "step": 2999 }, { "epoch": 1.5100189035916824, "grad_norm": 0.17727506977234753, "learning_rate": 5.065794835526047e-07, "loss": 0.266, "step": 3000 }, { "epoch": 1.5105229993698803, "grad_norm": 0.17988130416225281, "learning_rate": 5.063109477690186e-07, "loss": 0.2621, "step": 3001 }, { "epoch": 1.5110270951480782, "grad_norm": 0.17698102799975615, "learning_rate": 5.060424101647631e-07, "loss": 0.2805, "step": 3002 }, { "epoch": 1.5115311909262759, "grad_norm": 0.1800412666094284, "learning_rate": 5.057738708173096e-07, "loss": 0.2749, "step": 3003 }, { "epoch": 1.512035286704474, "grad_norm": 0.17319416297445328, "learning_rate": 5.055053298041302e-07, "loss": 0.2782, "step": 3004 }, { "epoch": 1.5125393824826716, "grad_norm": 0.1683959181018551, "learning_rate": 5.052367872026971e-07, "loss": 0.2595, "step": 3005 }, { "epoch": 1.5130434782608697, "grad_norm": 0.19739859925036402, "learning_rate": 5.049682430904835e-07, "loss": 0.2671, "step": 3006 }, { "epoch": 1.5135475740390674, "grad_norm": 0.17458057209940056, "learning_rate": 5.046996975449624e-07, "loss": 0.2812, "step": 3007 }, { "epoch": 1.5140516698172652, "grad_norm": 0.17642668853441784, "learning_rate": 5.044311506436077e-07, "loss": 0.254, "step": 3008 }, { "epoch": 1.5145557655954631, "grad_norm": 0.18216491050641825, "learning_rate": 5.041626024638935e-07, "loss": 0.2928, "step": 3009 }, { "epoch": 1.515059861373661, "grad_norm": 0.16982395667266062, "learning_rate": 5.038940530832944e-07, "loss": 0.2792, "step": 3010 }, { "epoch": 1.5155639571518589, "grad_norm": 0.17736924822640682, "learning_rate": 5.03625502579285e-07, "loss": 0.27, "step": 3011 }, { "epoch": 1.5160680529300568, "grad_norm": 0.17990063325148373, "learning_rate": 5.033569510293406e-07, "loss": 0.2742, "step": 3012 }, { "epoch": 1.5165721487082546, "grad_norm": 0.17269683446652181, "learning_rate": 5.030883985109367e-07, "loss": 0.2718, "step": 3013 }, { "epoch": 1.5170762444864523, "grad_norm": 0.1713114478517227, "learning_rate": 5.028198451015488e-07, "loss": 0.292, "step": 3014 }, { "epoch": 1.5175803402646504, "grad_norm": 0.17237373122427047, "learning_rate": 5.025512908786531e-07, "loss": 0.2708, "step": 3015 }, { "epoch": 1.518084436042848, "grad_norm": 0.17979894721140827, "learning_rate": 5.022827359197259e-07, "loss": 0.2749, "step": 3016 }, { "epoch": 1.5185885318210461, "grad_norm": 0.18029323092096203, "learning_rate": 5.020141803022435e-07, "loss": 0.2718, "step": 3017 }, { "epoch": 1.5190926275992438, "grad_norm": 0.1709949058851969, "learning_rate": 5.017456241036826e-07, "loss": 0.2842, "step": 3018 }, { "epoch": 1.5195967233774417, "grad_norm": 0.17615800092590866, "learning_rate": 5.014770674015199e-07, "loss": 0.2556, "step": 3019 }, { "epoch": 1.5201008191556395, "grad_norm": 0.17565842258793046, "learning_rate": 5.012085102732323e-07, "loss": 0.2704, "step": 3020 }, { "epoch": 1.5206049149338374, "grad_norm": 0.1706652936469358, "learning_rate": 5.00939952796297e-07, "loss": 0.2751, "step": 3021 }, { "epoch": 1.5211090107120353, "grad_norm": 0.17904301457889313, "learning_rate": 5.006713950481911e-07, "loss": 0.2708, "step": 3022 }, { "epoch": 1.5216131064902332, "grad_norm": 0.19348962154742488, "learning_rate": 5.00402837106392e-07, "loss": 0.2691, "step": 3023 }, { "epoch": 1.522117202268431, "grad_norm": 0.17760169438436854, "learning_rate": 5.001342790483769e-07, "loss": 0.2812, "step": 3024 }, { "epoch": 1.5226212980466287, "grad_norm": 0.1874622214559223, "learning_rate": 4.998657209516231e-07, "loss": 0.2703, "step": 3025 }, { "epoch": 1.5231253938248268, "grad_norm": 0.19774473741326665, "learning_rate": 4.995971628936078e-07, "loss": 0.2642, "step": 3026 }, { "epoch": 1.5236294896030245, "grad_norm": 0.19986370176179066, "learning_rate": 4.993286049518088e-07, "loss": 0.2949, "step": 3027 }, { "epoch": 1.5241335853812226, "grad_norm": 0.16961474491016765, "learning_rate": 4.990600472037029e-07, "loss": 0.2767, "step": 3028 }, { "epoch": 1.5246376811594202, "grad_norm": 0.18853446766118223, "learning_rate": 4.987914897267678e-07, "loss": 0.2781, "step": 3029 }, { "epoch": 1.525141776937618, "grad_norm": 0.16987515007270199, "learning_rate": 4.985229325984803e-07, "loss": 0.2882, "step": 3030 }, { "epoch": 1.525645872715816, "grad_norm": 0.1714486129335063, "learning_rate": 4.982543758963174e-07, "loss": 0.2704, "step": 3031 }, { "epoch": 1.5261499684940139, "grad_norm": 0.16809069324008713, "learning_rate": 4.979858196977566e-07, "loss": 0.2591, "step": 3032 }, { "epoch": 1.5266540642722117, "grad_norm": 0.1738515169060616, "learning_rate": 4.977172640802741e-07, "loss": 0.2582, "step": 3033 }, { "epoch": 1.5271581600504096, "grad_norm": 0.17664113554519453, "learning_rate": 4.974487091213469e-07, "loss": 0.267, "step": 3034 }, { "epoch": 1.5276622558286075, "grad_norm": 0.1916027084703846, "learning_rate": 4.971801548984511e-07, "loss": 0.2731, "step": 3035 }, { "epoch": 1.5281663516068051, "grad_norm": 0.17313830459160134, "learning_rate": 4.969116014890634e-07, "loss": 0.2806, "step": 3036 }, { "epoch": 1.5286704473850032, "grad_norm": 0.20413150350420245, "learning_rate": 4.966430489706594e-07, "loss": 0.2745, "step": 3037 }, { "epoch": 1.529174543163201, "grad_norm": 0.1747943524307337, "learning_rate": 4.96374497420715e-07, "loss": 0.2634, "step": 3038 }, { "epoch": 1.529678638941399, "grad_norm": 0.1728937587667476, "learning_rate": 4.961059469167056e-07, "loss": 0.2814, "step": 3039 }, { "epoch": 1.5301827347195966, "grad_norm": 0.1743204044600501, "learning_rate": 4.958373975361063e-07, "loss": 0.2551, "step": 3040 }, { "epoch": 1.5306868304977945, "grad_norm": 0.17502392690878435, "learning_rate": 4.955688493563922e-07, "loss": 0.267, "step": 3041 }, { "epoch": 1.5311909262759924, "grad_norm": 0.19500065101179812, "learning_rate": 4.953003024550375e-07, "loss": 0.2819, "step": 3042 }, { "epoch": 1.5316950220541903, "grad_norm": 0.17254980716871499, "learning_rate": 4.950317569095166e-07, "loss": 0.2649, "step": 3043 }, { "epoch": 1.5321991178323882, "grad_norm": 0.1713124294788534, "learning_rate": 4.94763212797303e-07, "loss": 0.2563, "step": 3044 }, { "epoch": 1.532703213610586, "grad_norm": 0.17577986512817073, "learning_rate": 4.944946701958698e-07, "loss": 0.273, "step": 3045 }, { "epoch": 1.533207309388784, "grad_norm": 0.1701862456963333, "learning_rate": 4.942261291826905e-07, "loss": 0.2814, "step": 3046 }, { "epoch": 1.5337114051669816, "grad_norm": 0.18127900976396943, "learning_rate": 4.93957589835237e-07, "loss": 0.2713, "step": 3047 }, { "epoch": 1.5342155009451797, "grad_norm": 0.17254874589059224, "learning_rate": 4.936890522309815e-07, "loss": 0.2734, "step": 3048 }, { "epoch": 1.5347195967233773, "grad_norm": 0.1717770510577077, "learning_rate": 4.934205164473952e-07, "loss": 0.2773, "step": 3049 }, { "epoch": 1.5352236925015754, "grad_norm": 0.1908978528330254, "learning_rate": 4.931519825619493e-07, "loss": 0.2788, "step": 3050 }, { "epoch": 1.535727788279773, "grad_norm": 0.171288059642186, "learning_rate": 4.928834506521138e-07, "loss": 0.2703, "step": 3051 }, { "epoch": 1.5362318840579712, "grad_norm": 0.1675001073197896, "learning_rate": 4.926149207953588e-07, "loss": 0.2642, "step": 3052 }, { "epoch": 1.5367359798361688, "grad_norm": 0.17021689322972322, "learning_rate": 4.923463930691535e-07, "loss": 0.2702, "step": 3053 }, { "epoch": 1.5372400756143667, "grad_norm": 0.17371445641850253, "learning_rate": 4.920778675509662e-07, "loss": 0.2589, "step": 3054 }, { "epoch": 1.5377441713925646, "grad_norm": 0.17478774431954305, "learning_rate": 4.918093443182652e-07, "loss": 0.2803, "step": 3055 }, { "epoch": 1.5382482671707625, "grad_norm": 0.17591722779410152, "learning_rate": 4.915408234485175e-07, "loss": 0.2897, "step": 3056 }, { "epoch": 1.5387523629489603, "grad_norm": 0.17280009330380483, "learning_rate": 4.912723050191899e-07, "loss": 0.2717, "step": 3057 }, { "epoch": 1.539256458727158, "grad_norm": 0.16969841106908495, "learning_rate": 4.910037891077482e-07, "loss": 0.2602, "step": 3058 }, { "epoch": 1.539760554505356, "grad_norm": 0.16865488414462979, "learning_rate": 4.907352757916577e-07, "loss": 0.2724, "step": 3059 }, { "epoch": 1.5402646502835537, "grad_norm": 0.17584292304841348, "learning_rate": 4.904667651483828e-07, "loss": 0.2742, "step": 3060 }, { "epoch": 1.5407687460617518, "grad_norm": 0.16673800323953514, "learning_rate": 4.901982572553869e-07, "loss": 0.2695, "step": 3061 }, { "epoch": 1.5412728418399495, "grad_norm": 0.18326131122744185, "learning_rate": 4.899297521901333e-07, "loss": 0.2789, "step": 3062 }, { "epoch": 1.5417769376181476, "grad_norm": 0.18805933934917177, "learning_rate": 4.896612500300835e-07, "loss": 0.272, "step": 3063 }, { "epoch": 1.5422810333963453, "grad_norm": 0.1712806706085473, "learning_rate": 4.893927508526993e-07, "loss": 0.2669, "step": 3064 }, { "epoch": 1.5427851291745431, "grad_norm": 0.1731870501531892, "learning_rate": 4.891242547354406e-07, "loss": 0.2812, "step": 3065 }, { "epoch": 1.543289224952741, "grad_norm": 0.17506473097896597, "learning_rate": 4.888557617557672e-07, "loss": 0.2813, "step": 3066 }, { "epoch": 1.543793320730939, "grad_norm": 0.17307157635246334, "learning_rate": 4.885872719911375e-07, "loss": 0.2695, "step": 3067 }, { "epoch": 1.5442974165091368, "grad_norm": 0.17106830633462392, "learning_rate": 4.883187855190089e-07, "loss": 0.264, "step": 3068 }, { "epoch": 1.5448015122873346, "grad_norm": 0.17296030737339352, "learning_rate": 4.880503024168384e-07, "loss": 0.2776, "step": 3069 }, { "epoch": 1.5453056080655325, "grad_norm": 0.17398895118716073, "learning_rate": 4.877818227620816e-07, "loss": 0.2641, "step": 3070 }, { "epoch": 1.5458097038437302, "grad_norm": 0.16983527604252388, "learning_rate": 4.875133466321934e-07, "loss": 0.2791, "step": 3071 }, { "epoch": 1.5463137996219283, "grad_norm": 0.17446130630330534, "learning_rate": 4.872448741046272e-07, "loss": 0.2709, "step": 3072 }, { "epoch": 1.546817895400126, "grad_norm": 0.17284082991627042, "learning_rate": 4.86976405256836e-07, "loss": 0.2671, "step": 3073 }, { "epoch": 1.547321991178324, "grad_norm": 0.182632054641719, "learning_rate": 4.867079401662711e-07, "loss": 0.2776, "step": 3074 }, { "epoch": 1.5478260869565217, "grad_norm": 0.17300501368762167, "learning_rate": 4.864394789103829e-07, "loss": 0.2794, "step": 3075 }, { "epoch": 1.5483301827347196, "grad_norm": 0.17667332011696144, "learning_rate": 4.861710215666213e-07, "loss": 0.2853, "step": 3076 }, { "epoch": 1.5488342785129174, "grad_norm": 0.16996586627278085, "learning_rate": 4.859025682124341e-07, "loss": 0.2786, "step": 3077 }, { "epoch": 1.5493383742911153, "grad_norm": 0.17057715081373995, "learning_rate": 4.85634118925269e-07, "loss": 0.2691, "step": 3078 }, { "epoch": 1.5498424700693132, "grad_norm": 0.17161445251025925, "learning_rate": 4.853656737825713e-07, "loss": 0.283, "step": 3079 }, { "epoch": 1.550346565847511, "grad_norm": 0.17675037956109935, "learning_rate": 4.850972328617863e-07, "loss": 0.2721, "step": 3080 }, { "epoch": 1.550850661625709, "grad_norm": 0.17208347939492113, "learning_rate": 4.848287962403571e-07, "loss": 0.2653, "step": 3081 }, { "epoch": 1.5513547574039066, "grad_norm": 0.17004686976042202, "learning_rate": 4.845603639957263e-07, "loss": 0.2709, "step": 3082 }, { "epoch": 1.5518588531821047, "grad_norm": 0.17995963542830856, "learning_rate": 4.842919362053348e-07, "loss": 0.265, "step": 3083 }, { "epoch": 1.5523629489603024, "grad_norm": 0.18099808248082644, "learning_rate": 4.840235129466222e-07, "loss": 0.2651, "step": 3084 }, { "epoch": 1.5528670447385005, "grad_norm": 0.17473760103531463, "learning_rate": 4.837550942970275e-07, "loss": 0.2841, "step": 3085 }, { "epoch": 1.5533711405166981, "grad_norm": 0.19493429052326036, "learning_rate": 4.834866803339872e-07, "loss": 0.2789, "step": 3086 }, { "epoch": 1.553875236294896, "grad_norm": 0.174886487259404, "learning_rate": 4.832182711349374e-07, "loss": 0.2897, "step": 3087 }, { "epoch": 1.5543793320730939, "grad_norm": 0.1873744343078876, "learning_rate": 4.829498667773126e-07, "loss": 0.2596, "step": 3088 }, { "epoch": 1.5548834278512917, "grad_norm": 0.1776819518952955, "learning_rate": 4.826814673385454e-07, "loss": 0.2592, "step": 3089 }, { "epoch": 1.5553875236294896, "grad_norm": 0.17480879943963698, "learning_rate": 4.824130728960677e-07, "loss": 0.2854, "step": 3090 }, { "epoch": 1.5558916194076875, "grad_norm": 0.17863519439054704, "learning_rate": 4.821446835273093e-07, "loss": 0.2848, "step": 3091 }, { "epoch": 1.5563957151858854, "grad_norm": 0.20186594037979866, "learning_rate": 4.818762993096994e-07, "loss": 0.2742, "step": 3092 }, { "epoch": 1.556899810964083, "grad_norm": 0.17253352705225278, "learning_rate": 4.816079203206648e-07, "loss": 0.2735, "step": 3093 }, { "epoch": 1.5574039067422811, "grad_norm": 0.17021429772016514, "learning_rate": 4.813395466376311e-07, "loss": 0.2757, "step": 3094 }, { "epoch": 1.5579080025204788, "grad_norm": 0.17831771845073935, "learning_rate": 4.810711783380227e-07, "loss": 0.2899, "step": 3095 }, { "epoch": 1.5584120982986769, "grad_norm": 0.17539283313588858, "learning_rate": 4.808028154992622e-07, "loss": 0.2803, "step": 3096 }, { "epoch": 1.5589161940768745, "grad_norm": 0.17570461231863696, "learning_rate": 4.805344581987704e-07, "loss": 0.2702, "step": 3097 }, { "epoch": 1.5594202898550724, "grad_norm": 0.18256077525124403, "learning_rate": 4.802661065139667e-07, "loss": 0.279, "step": 3098 }, { "epoch": 1.5599243856332703, "grad_norm": 0.1773908250652285, "learning_rate": 4.799977605222693e-07, "loss": 0.283, "step": 3099 }, { "epoch": 1.5604284814114682, "grad_norm": 0.17701100191661834, "learning_rate": 4.797294203010939e-07, "loss": 0.2796, "step": 3100 }, { "epoch": 1.560932577189666, "grad_norm": 0.17106773731405026, "learning_rate": 4.794610859278552e-07, "loss": 0.2669, "step": 3101 }, { "epoch": 1.561436672967864, "grad_norm": 0.17235342667180142, "learning_rate": 4.791927574799659e-07, "loss": 0.2642, "step": 3102 }, { "epoch": 1.5619407687460618, "grad_norm": 0.1791600844603549, "learning_rate": 4.789244350348374e-07, "loss": 0.2687, "step": 3103 }, { "epoch": 1.5624448645242595, "grad_norm": 0.17661856365985878, "learning_rate": 4.786561186698788e-07, "loss": 0.2794, "step": 3104 }, { "epoch": 1.5629489603024576, "grad_norm": 0.1755295092888704, "learning_rate": 4.783878084624975e-07, "loss": 0.2747, "step": 3105 }, { "epoch": 1.5634530560806552, "grad_norm": 0.19305165095057605, "learning_rate": 4.781195044900998e-07, "loss": 0.2787, "step": 3106 }, { "epoch": 1.5639571518588533, "grad_norm": 0.18063804583238724, "learning_rate": 4.778512068300893e-07, "loss": 0.2754, "step": 3107 }, { "epoch": 1.564461247637051, "grad_norm": 0.18403206850653575, "learning_rate": 4.775829155598686e-07, "loss": 0.2778, "step": 3108 }, { "epoch": 1.5649653434152488, "grad_norm": 0.1700956199275683, "learning_rate": 4.773146307568379e-07, "loss": 0.2727, "step": 3109 }, { "epoch": 1.5654694391934467, "grad_norm": 0.18086574577750156, "learning_rate": 4.770463524983956e-07, "loss": 0.2912, "step": 3110 }, { "epoch": 1.5659735349716446, "grad_norm": 0.17147115049908643, "learning_rate": 4.7677808086193854e-07, "loss": 0.27, "step": 3111 }, { "epoch": 1.5664776307498425, "grad_norm": 0.1738895601221236, "learning_rate": 4.7650981592486123e-07, "loss": 0.2673, "step": 3112 }, { "epoch": 1.5669817265280404, "grad_norm": 0.180723254015497, "learning_rate": 4.7624155776455647e-07, "loss": 0.2633, "step": 3113 }, { "epoch": 1.5674858223062382, "grad_norm": 0.175819284778621, "learning_rate": 4.7597330645841515e-07, "loss": 0.2582, "step": 3114 }, { "epoch": 1.5679899180844359, "grad_norm": 0.1753712910797449, "learning_rate": 4.757050620838262e-07, "loss": 0.2632, "step": 3115 }, { "epoch": 1.568494013862634, "grad_norm": 0.17369096100282744, "learning_rate": 4.754368247181761e-07, "loss": 0.2798, "step": 3116 }, { "epoch": 1.5689981096408316, "grad_norm": 0.17849770678240312, "learning_rate": 4.751685944388501e-07, "loss": 0.2842, "step": 3117 }, { "epoch": 1.5695022054190297, "grad_norm": 0.19881328567873902, "learning_rate": 4.749003713232308e-07, "loss": 0.2733, "step": 3118 }, { "epoch": 1.5700063011972274, "grad_norm": 0.1774218478893288, "learning_rate": 4.7463215544869865e-07, "loss": 0.2863, "step": 3119 }, { "epoch": 1.5705103969754255, "grad_norm": 0.1733835174988626, "learning_rate": 4.743639468926328e-07, "loss": 0.2681, "step": 3120 }, { "epoch": 1.5710144927536231, "grad_norm": 0.1736480466599747, "learning_rate": 4.740957457324092e-07, "loss": 0.2584, "step": 3121 }, { "epoch": 1.571518588531821, "grad_norm": 0.17950796056364546, "learning_rate": 4.738275520454027e-07, "loss": 0.2508, "step": 3122 }, { "epoch": 1.572022684310019, "grad_norm": 0.1778356015662689, "learning_rate": 4.735593659089851e-07, "loss": 0.2576, "step": 3123 }, { "epoch": 1.5725267800882168, "grad_norm": 0.1687063104049516, "learning_rate": 4.732911874005269e-07, "loss": 0.2559, "step": 3124 }, { "epoch": 1.5730308758664147, "grad_norm": 0.17493624040875075, "learning_rate": 4.7302301659739547e-07, "loss": 0.2648, "step": 3125 }, { "epoch": 1.5735349716446123, "grad_norm": 0.17507468008329716, "learning_rate": 4.7275485357695673e-07, "loss": 0.2568, "step": 3126 }, { "epoch": 1.5740390674228104, "grad_norm": 0.17975235409806142, "learning_rate": 4.7248669841657404e-07, "loss": 0.2776, "step": 3127 }, { "epoch": 1.574543163201008, "grad_norm": 0.17454934150306223, "learning_rate": 4.7221855119360824e-07, "loss": 0.2681, "step": 3128 }, { "epoch": 1.5750472589792062, "grad_norm": 0.18520223540546543, "learning_rate": 4.7195041198541854e-07, "loss": 0.2727, "step": 3129 }, { "epoch": 1.5755513547574038, "grad_norm": 0.17055487749470938, "learning_rate": 4.7168228086936096e-07, "loss": 0.2692, "step": 3130 }, { "epoch": 1.576055450535602, "grad_norm": 0.1751981137839451, "learning_rate": 4.7141415792279015e-07, "loss": 0.2754, "step": 3131 }, { "epoch": 1.5765595463137996, "grad_norm": 0.180062722006383, "learning_rate": 4.7114604322305747e-07, "loss": 0.264, "step": 3132 }, { "epoch": 1.5770636420919975, "grad_norm": 0.17530448742608076, "learning_rate": 4.708779368475128e-07, "loss": 0.2875, "step": 3133 }, { "epoch": 1.5775677378701953, "grad_norm": 0.1716762643727378, "learning_rate": 4.70609838873503e-07, "loss": 0.27, "step": 3134 }, { "epoch": 1.5780718336483932, "grad_norm": 0.17341353040640353, "learning_rate": 4.703417493783723e-07, "loss": 0.2623, "step": 3135 }, { "epoch": 1.578575929426591, "grad_norm": 0.2080724128483008, "learning_rate": 4.700736684394635e-07, "loss": 0.2823, "step": 3136 }, { "epoch": 1.579080025204789, "grad_norm": 0.17039241780253916, "learning_rate": 4.6980559613411576e-07, "loss": 0.2722, "step": 3137 }, { "epoch": 1.5795841209829868, "grad_norm": 0.1742720965174898, "learning_rate": 4.695375325396666e-07, "loss": 0.2745, "step": 3138 }, { "epoch": 1.5800882167611845, "grad_norm": 0.17895066474047672, "learning_rate": 4.692694777334505e-07, "loss": 0.2664, "step": 3139 }, { "epoch": 1.5805923125393826, "grad_norm": 0.1779553651912266, "learning_rate": 4.6900143179279984e-07, "loss": 0.2729, "step": 3140 }, { "epoch": 1.5810964083175802, "grad_norm": 0.1757573994664284, "learning_rate": 4.687333947950441e-07, "loss": 0.2628, "step": 3141 }, { "epoch": 1.5816005040957783, "grad_norm": 0.17335221224324718, "learning_rate": 4.684653668175102e-07, "loss": 0.2564, "step": 3142 }, { "epoch": 1.582104599873976, "grad_norm": 0.17526400475703355, "learning_rate": 4.6819734793752257e-07, "loss": 0.2724, "step": 3143 }, { "epoch": 1.5826086956521739, "grad_norm": 0.17495007637044283, "learning_rate": 4.679293382324031e-07, "loss": 0.2844, "step": 3144 }, { "epoch": 1.5831127914303718, "grad_norm": 0.17017524004684992, "learning_rate": 4.67661337779471e-07, "loss": 0.2772, "step": 3145 }, { "epoch": 1.5836168872085696, "grad_norm": 0.17206013880701826, "learning_rate": 4.6739334665604234e-07, "loss": 0.2839, "step": 3146 }, { "epoch": 1.5841209829867675, "grad_norm": 0.16939135952437226, "learning_rate": 4.671253649394315e-07, "loss": 0.2684, "step": 3147 }, { "epoch": 1.5846250787649654, "grad_norm": 0.18799473978425724, "learning_rate": 4.668573927069491e-07, "loss": 0.2622, "step": 3148 }, { "epoch": 1.5851291745431633, "grad_norm": 0.17747869099450092, "learning_rate": 4.665894300359035e-07, "loss": 0.2616, "step": 3149 }, { "epoch": 1.585633270321361, "grad_norm": 0.17806445921975966, "learning_rate": 4.6632147700360055e-07, "loss": 0.2607, "step": 3150 }, { "epoch": 1.586137366099559, "grad_norm": 0.17372413070642495, "learning_rate": 4.6605353368734265e-07, "loss": 0.2709, "step": 3151 }, { "epoch": 1.5866414618777567, "grad_norm": 0.16458604740808605, "learning_rate": 4.6578560016443013e-07, "loss": 0.26, "step": 3152 }, { "epoch": 1.5871455576559548, "grad_norm": 0.17882994895541682, "learning_rate": 4.655176765121599e-07, "loss": 0.27, "step": 3153 }, { "epoch": 1.5876496534341524, "grad_norm": 0.17487944942600261, "learning_rate": 4.652497628078266e-07, "loss": 0.2579, "step": 3154 }, { "epoch": 1.5881537492123503, "grad_norm": 0.17194013515573972, "learning_rate": 4.6498185912872137e-07, "loss": 0.2749, "step": 3155 }, { "epoch": 1.5886578449905482, "grad_norm": 0.17344590975757027, "learning_rate": 4.6471396555213273e-07, "loss": 0.2625, "step": 3156 }, { "epoch": 1.589161940768746, "grad_norm": 0.17496373280977637, "learning_rate": 4.6444608215534657e-07, "loss": 0.2715, "step": 3157 }, { "epoch": 1.589666036546944, "grad_norm": 0.17877746835344072, "learning_rate": 4.641782090156454e-07, "loss": 0.2544, "step": 3158 }, { "epoch": 1.5901701323251418, "grad_norm": 0.18160946077780746, "learning_rate": 4.6391034621030903e-07, "loss": 0.271, "step": 3159 }, { "epoch": 1.5906742281033397, "grad_norm": 0.17284059925719977, "learning_rate": 4.636424938166142e-07, "loss": 0.2754, "step": 3160 }, { "epoch": 1.5911783238815373, "grad_norm": 0.17028743922696857, "learning_rate": 4.633746519118348e-07, "loss": 0.263, "step": 3161 }, { "epoch": 1.5916824196597354, "grad_norm": 0.1747381029040225, "learning_rate": 4.631068205732413e-07, "loss": 0.2684, "step": 3162 }, { "epoch": 1.592186515437933, "grad_norm": 0.18722584514120708, "learning_rate": 4.6283899987810164e-07, "loss": 0.2854, "step": 3163 }, { "epoch": 1.5926906112161312, "grad_norm": 0.16942535047104845, "learning_rate": 4.6257118990368036e-07, "loss": 0.2793, "step": 3164 }, { "epoch": 1.5931947069943289, "grad_norm": 0.16883072945164623, "learning_rate": 4.6230339072723874e-07, "loss": 0.2707, "step": 3165 }, { "epoch": 1.5936988027725267, "grad_norm": 0.17706329109752933, "learning_rate": 4.6203560242603556e-07, "loss": 0.2726, "step": 3166 }, { "epoch": 1.5942028985507246, "grad_norm": 0.1686584353657218, "learning_rate": 4.617678250773256e-07, "loss": 0.2666, "step": 3167 }, { "epoch": 1.5947069943289225, "grad_norm": 0.17166944845187698, "learning_rate": 4.615000587583616e-07, "loss": 0.2692, "step": 3168 }, { "epoch": 1.5952110901071204, "grad_norm": 0.17764393811133042, "learning_rate": 4.6123230354639194e-07, "loss": 0.2578, "step": 3169 }, { "epoch": 1.5957151858853182, "grad_norm": 0.18326712297594602, "learning_rate": 4.6096455951866277e-07, "loss": 0.2783, "step": 3170 }, { "epoch": 1.5962192816635161, "grad_norm": 0.16700467021445017, "learning_rate": 4.6069682675241626e-07, "loss": 0.2729, "step": 3171 }, { "epoch": 1.5967233774417138, "grad_norm": 0.17742240847400476, "learning_rate": 4.6042910532489165e-07, "loss": 0.2702, "step": 3172 }, { "epoch": 1.5972274732199119, "grad_norm": 0.18272675092406107, "learning_rate": 4.601613953133252e-07, "loss": 0.2776, "step": 3173 }, { "epoch": 1.5977315689981095, "grad_norm": 0.18849423602774695, "learning_rate": 4.5989369679494935e-07, "loss": 0.2831, "step": 3174 }, { "epoch": 1.5982356647763076, "grad_norm": 0.19311234812321373, "learning_rate": 4.5962600984699364e-07, "loss": 0.271, "step": 3175 }, { "epoch": 1.5987397605545053, "grad_norm": 0.17325148338292742, "learning_rate": 4.593583345466837e-07, "loss": 0.269, "step": 3176 }, { "epoch": 1.5992438563327032, "grad_norm": 0.17501523218289874, "learning_rate": 4.590906709712427e-07, "loss": 0.2683, "step": 3177 }, { "epoch": 1.599747952110901, "grad_norm": 0.18641330474821174, "learning_rate": 4.588230191978898e-07, "loss": 0.2681, "step": 3178 }, { "epoch": 1.600252047889099, "grad_norm": 0.17265182176031832, "learning_rate": 4.585553793038405e-07, "loss": 0.2799, "step": 3179 }, { "epoch": 1.6007561436672968, "grad_norm": 0.17101509452047053, "learning_rate": 4.582877513663077e-07, "loss": 0.2653, "step": 3180 }, { "epoch": 1.6012602394454947, "grad_norm": 0.172204707085013, "learning_rate": 4.5802013546250014e-07, "loss": 0.2797, "step": 3181 }, { "epoch": 1.6017643352236925, "grad_norm": 0.17967573980986776, "learning_rate": 4.577525316696236e-07, "loss": 0.262, "step": 3182 }, { "epoch": 1.6022684310018902, "grad_norm": 0.17469624599365358, "learning_rate": 4.574849400648797e-07, "loss": 0.2636, "step": 3183 }, { "epoch": 1.6027725267800883, "grad_norm": 0.1743708880339133, "learning_rate": 4.5721736072546754e-07, "loss": 0.2649, "step": 3184 }, { "epoch": 1.6027725267800883, "eval_loss": 0.307079017162323, "eval_runtime": 18.6674, "eval_samples_per_second": 45.802, "eval_steps_per_second": 0.964, "step": 3184 }, { "epoch": 1.603276622558286, "grad_norm": 0.18603703776161118, "learning_rate": 4.569497937285817e-07, "loss": 0.2849, "step": 3185 }, { "epoch": 1.603780718336484, "grad_norm": 0.1746753818130289, "learning_rate": 4.566822391514135e-07, "loss": 0.2644, "step": 3186 }, { "epoch": 1.6042848141146817, "grad_norm": 0.19466418253252832, "learning_rate": 4.5641469707115123e-07, "loss": 0.2719, "step": 3187 }, { "epoch": 1.6047889098928798, "grad_norm": 0.19722375083757057, "learning_rate": 4.5614716756497856e-07, "loss": 0.2765, "step": 3188 }, { "epoch": 1.6052930056710775, "grad_norm": 0.17956583472244503, "learning_rate": 4.5587965071007664e-07, "loss": 0.2806, "step": 3189 }, { "epoch": 1.6057971014492753, "grad_norm": 0.196228627957389, "learning_rate": 4.556121465836221e-07, "loss": 0.2753, "step": 3190 }, { "epoch": 1.6063011972274732, "grad_norm": 0.1689632051276557, "learning_rate": 4.553446552627884e-07, "loss": 0.2714, "step": 3191 }, { "epoch": 1.606805293005671, "grad_norm": 0.17456704042056104, "learning_rate": 4.5507717682474475e-07, "loss": 0.2732, "step": 3192 }, { "epoch": 1.607309388783869, "grad_norm": 0.18246575730580003, "learning_rate": 4.5480971134665765e-07, "loss": 0.2684, "step": 3193 }, { "epoch": 1.6078134845620666, "grad_norm": 0.16994370889715202, "learning_rate": 4.545422589056888e-07, "loss": 0.2581, "step": 3194 }, { "epoch": 1.6083175803402647, "grad_norm": 0.1681073205628179, "learning_rate": 4.5427481957899643e-07, "loss": 0.2746, "step": 3195 }, { "epoch": 1.6088216761184624, "grad_norm": 0.1935072839716343, "learning_rate": 4.540073934437356e-07, "loss": 0.2728, "step": 3196 }, { "epoch": 1.6093257718966605, "grad_norm": 0.16743041342533738, "learning_rate": 4.5373998057705667e-07, "loss": 0.2709, "step": 3197 }, { "epoch": 1.6098298676748581, "grad_norm": 0.18397921239376722, "learning_rate": 4.53472581056107e-07, "loss": 0.2801, "step": 3198 }, { "epoch": 1.6103339634530562, "grad_norm": 0.17331537650260462, "learning_rate": 4.5320519495802915e-07, "loss": 0.2695, "step": 3199 }, { "epoch": 1.610838059231254, "grad_norm": 0.17081500880990227, "learning_rate": 4.5293782235996303e-07, "loss": 0.271, "step": 3200 }, { "epoch": 1.6113421550094518, "grad_norm": 0.17923914190076334, "learning_rate": 4.526704633390435e-07, "loss": 0.265, "step": 3201 }, { "epoch": 1.6118462507876496, "grad_norm": 0.17772987495851134, "learning_rate": 4.52403117972402e-07, "loss": 0.2783, "step": 3202 }, { "epoch": 1.6123503465658475, "grad_norm": 0.18238344872837062, "learning_rate": 4.5213578633716627e-07, "loss": 0.2756, "step": 3203 }, { "epoch": 1.6128544423440454, "grad_norm": 0.17223510416546425, "learning_rate": 4.5186846851045957e-07, "loss": 0.2561, "step": 3204 }, { "epoch": 1.6133585381222433, "grad_norm": 0.17827644377200683, "learning_rate": 4.516011645694016e-07, "loss": 0.2741, "step": 3205 }, { "epoch": 1.6138626339004412, "grad_norm": 0.16814074106631602, "learning_rate": 4.513338745911078e-07, "loss": 0.2675, "step": 3206 }, { "epoch": 1.6143667296786388, "grad_norm": 0.1774956008916992, "learning_rate": 4.5106659865268973e-07, "loss": 0.2697, "step": 3207 }, { "epoch": 1.614870825456837, "grad_norm": 0.18296109118162968, "learning_rate": 4.507993368312548e-07, "loss": 0.2689, "step": 3208 }, { "epoch": 1.6153749212350346, "grad_norm": 0.1858855435073671, "learning_rate": 4.5053208920390646e-07, "loss": 0.2669, "step": 3209 }, { "epoch": 1.6158790170132327, "grad_norm": 0.17175203740808448, "learning_rate": 4.5026485584774397e-07, "loss": 0.2781, "step": 3210 }, { "epoch": 1.6163831127914303, "grad_norm": 0.1792404257748635, "learning_rate": 4.499976368398623e-07, "loss": 0.2651, "step": 3211 }, { "epoch": 1.6168872085696282, "grad_norm": 0.17147755827754907, "learning_rate": 4.497304322573529e-07, "loss": 0.2768, "step": 3212 }, { "epoch": 1.617391304347826, "grad_norm": 0.1689049644593327, "learning_rate": 4.494632421773021e-07, "loss": 0.2644, "step": 3213 }, { "epoch": 1.617895400126024, "grad_norm": 0.17249930136505048, "learning_rate": 4.4919606667679314e-07, "loss": 0.2695, "step": 3214 }, { "epoch": 1.6183994959042218, "grad_norm": 0.17187403315174393, "learning_rate": 4.489289058329042e-07, "loss": 0.2753, "step": 3215 }, { "epoch": 1.6189035916824197, "grad_norm": 0.17003988333355505, "learning_rate": 4.4866175972270934e-07, "loss": 0.2773, "step": 3216 }, { "epoch": 1.6194076874606176, "grad_norm": 0.174941393420484, "learning_rate": 4.4839462842327905e-07, "loss": 0.2624, "step": 3217 }, { "epoch": 1.6199117832388152, "grad_norm": 0.16731226660363765, "learning_rate": 4.481275120116785e-07, "loss": 0.2787, "step": 3218 }, { "epoch": 1.6204158790170133, "grad_norm": 0.18079153758648603, "learning_rate": 4.478604105649697e-07, "loss": 0.2649, "step": 3219 }, { "epoch": 1.620919974795211, "grad_norm": 0.1717217807780553, "learning_rate": 4.475933241602093e-07, "loss": 0.278, "step": 3220 }, { "epoch": 1.621424070573409, "grad_norm": 0.1679242542537086, "learning_rate": 4.473262528744502e-07, "loss": 0.2614, "step": 3221 }, { "epoch": 1.6219281663516067, "grad_norm": 0.17508722076028932, "learning_rate": 4.47059196784741e-07, "loss": 0.2681, "step": 3222 }, { "epoch": 1.6224322621298046, "grad_norm": 0.17649494235855326, "learning_rate": 4.467921559681255e-07, "loss": 0.2636, "step": 3223 }, { "epoch": 1.6229363579080025, "grad_norm": 0.171248906662282, "learning_rate": 4.4652513050164344e-07, "loss": 0.2675, "step": 3224 }, { "epoch": 1.6234404536862004, "grad_norm": 0.1772203147323702, "learning_rate": 4.462581204623298e-07, "loss": 0.2676, "step": 3225 }, { "epoch": 1.6239445494643983, "grad_norm": 0.17277840226247196, "learning_rate": 4.4599112592721567e-07, "loss": 0.2818, "step": 3226 }, { "epoch": 1.6244486452425961, "grad_norm": 0.1888456571031224, "learning_rate": 4.4572414697332694e-07, "loss": 0.2582, "step": 3227 }, { "epoch": 1.624952741020794, "grad_norm": 0.17481572073876417, "learning_rate": 4.454571836776859e-07, "loss": 0.2845, "step": 3228 }, { "epoch": 1.6254568367989917, "grad_norm": 0.17113867643332575, "learning_rate": 4.4519023611730913e-07, "loss": 0.2628, "step": 3229 }, { "epoch": 1.6259609325771898, "grad_norm": 0.1700358617278328, "learning_rate": 4.4492330436920997e-07, "loss": 0.272, "step": 3230 }, { "epoch": 1.6264650283553874, "grad_norm": 0.17282908698853652, "learning_rate": 4.4465638851039636e-07, "loss": 0.2695, "step": 3231 }, { "epoch": 1.6269691241335855, "grad_norm": 0.17626881997268046, "learning_rate": 4.4438948861787164e-07, "loss": 0.2744, "step": 3232 }, { "epoch": 1.6274732199117832, "grad_norm": 0.1752662023406442, "learning_rate": 4.4412260476863513e-07, "loss": 0.2699, "step": 3233 }, { "epoch": 1.627977315689981, "grad_norm": 0.18770480212311322, "learning_rate": 4.4385573703968074e-07, "loss": 0.2798, "step": 3234 }, { "epoch": 1.628481411468179, "grad_norm": 0.17091794174949165, "learning_rate": 4.435888855079987e-07, "loss": 0.2554, "step": 3235 }, { "epoch": 1.6289855072463768, "grad_norm": 0.18183277964556385, "learning_rate": 4.4332205025057363e-07, "loss": 0.2645, "step": 3236 }, { "epoch": 1.6294896030245747, "grad_norm": 0.1721678222436179, "learning_rate": 4.4305523134438603e-07, "loss": 0.2714, "step": 3237 }, { "epoch": 1.6299936988027726, "grad_norm": 0.17185890801401496, "learning_rate": 4.427884288664114e-07, "loss": 0.274, "step": 3238 }, { "epoch": 1.6304977945809704, "grad_norm": 0.1705781529081766, "learning_rate": 4.4252164289362055e-07, "loss": 0.2689, "step": 3239 }, { "epoch": 1.631001890359168, "grad_norm": 0.17045943428176855, "learning_rate": 4.422548735029798e-07, "loss": 0.2612, "step": 3240 }, { "epoch": 1.6315059861373662, "grad_norm": 0.17057437607989734, "learning_rate": 4.4198812077145014e-07, "loss": 0.275, "step": 3241 }, { "epoch": 1.6320100819155638, "grad_norm": 0.17813857592829013, "learning_rate": 4.417213847759885e-07, "loss": 0.2851, "step": 3242 }, { "epoch": 1.632514177693762, "grad_norm": 0.1712522316623488, "learning_rate": 4.4145466559354606e-07, "loss": 0.2792, "step": 3243 }, { "epoch": 1.6330182734719596, "grad_norm": 0.177089666182129, "learning_rate": 4.4118796330107015e-07, "loss": 0.2812, "step": 3244 }, { "epoch": 1.6335223692501575, "grad_norm": 0.1804936269724506, "learning_rate": 4.409212779755026e-07, "loss": 0.2923, "step": 3245 }, { "epoch": 1.6340264650283554, "grad_norm": 0.20349951530764857, "learning_rate": 4.4065460969378e-07, "loss": 0.2755, "step": 3246 }, { "epoch": 1.6345305608065532, "grad_norm": 0.18026027565564015, "learning_rate": 4.403879585328353e-07, "loss": 0.27, "step": 3247 }, { "epoch": 1.6350346565847511, "grad_norm": 0.17425845355042768, "learning_rate": 4.4012132456959497e-07, "loss": 0.2744, "step": 3248 }, { "epoch": 1.635538752362949, "grad_norm": 0.17129421952442173, "learning_rate": 4.3985470788098175e-07, "loss": 0.2645, "step": 3249 }, { "epoch": 1.6360428481411469, "grad_norm": 0.17611725539203973, "learning_rate": 4.395881085439126e-07, "loss": 0.2703, "step": 3250 }, { "epoch": 1.6365469439193445, "grad_norm": 0.17504232119969135, "learning_rate": 4.393215266353e-07, "loss": 0.2543, "step": 3251 }, { "epoch": 1.6370510396975426, "grad_norm": 0.17656439945011204, "learning_rate": 4.3905496223205114e-07, "loss": 0.2704, "step": 3252 }, { "epoch": 1.6375551354757403, "grad_norm": 0.17063701784531435, "learning_rate": 4.3878841541106805e-07, "loss": 0.2889, "step": 3253 }, { "epoch": 1.6380592312539384, "grad_norm": 0.16853391156671269, "learning_rate": 4.385218862492479e-07, "loss": 0.2796, "step": 3254 }, { "epoch": 1.638563327032136, "grad_norm": 0.17227356978627587, "learning_rate": 4.3825537482348274e-07, "loss": 0.2848, "step": 3255 }, { "epoch": 1.6390674228103341, "grad_norm": 0.17455061108226785, "learning_rate": 4.3798888121065957e-07, "loss": 0.2816, "step": 3256 }, { "epoch": 1.6395715185885318, "grad_norm": 0.17637327346626572, "learning_rate": 4.377224054876598e-07, "loss": 0.2725, "step": 3257 }, { "epoch": 1.6400756143667297, "grad_norm": 0.17175945902648324, "learning_rate": 4.374559477313605e-07, "loss": 0.2716, "step": 3258 }, { "epoch": 1.6405797101449275, "grad_norm": 0.17464832215924994, "learning_rate": 4.3718950801863263e-07, "loss": 0.2816, "step": 3259 }, { "epoch": 1.6410838059231254, "grad_norm": 0.17327368547696761, "learning_rate": 4.369230864263428e-07, "loss": 0.2708, "step": 3260 }, { "epoch": 1.6415879017013233, "grad_norm": 0.17593115327303463, "learning_rate": 4.3665668303135184e-07, "loss": 0.271, "step": 3261 }, { "epoch": 1.642091997479521, "grad_norm": 0.17355291392861003, "learning_rate": 4.363902979105151e-07, "loss": 0.2744, "step": 3262 }, { "epoch": 1.642596093257719, "grad_norm": 0.17670933469610142, "learning_rate": 4.361239311406837e-07, "loss": 0.2767, "step": 3263 }, { "epoch": 1.6431001890359167, "grad_norm": 0.1743548246208639, "learning_rate": 4.358575827987022e-07, "loss": 0.2877, "step": 3264 }, { "epoch": 1.6436042848141148, "grad_norm": 0.1780920002901691, "learning_rate": 4.3559125296141097e-07, "loss": 0.2728, "step": 3265 }, { "epoch": 1.6441083805923125, "grad_norm": 0.16978778461761998, "learning_rate": 4.3532494170564413e-07, "loss": 0.2638, "step": 3266 }, { "epoch": 1.6446124763705106, "grad_norm": 0.19486305777071525, "learning_rate": 4.35058649108231e-07, "loss": 0.2745, "step": 3267 }, { "epoch": 1.6451165721487082, "grad_norm": 0.18084492964621957, "learning_rate": 4.3479237524599544e-07, "loss": 0.2939, "step": 3268 }, { "epoch": 1.645620667926906, "grad_norm": 0.17132944207036718, "learning_rate": 4.345261201957556e-07, "loss": 0.2664, "step": 3269 }, { "epoch": 1.646124763705104, "grad_norm": 0.17437387694568648, "learning_rate": 4.342598840343244e-07, "loss": 0.2859, "step": 3270 }, { "epoch": 1.6466288594833018, "grad_norm": 0.17301815118977484, "learning_rate": 4.3399366683850946e-07, "loss": 0.275, "step": 3271 }, { "epoch": 1.6471329552614997, "grad_norm": 0.18122055882591992, "learning_rate": 4.3372746868511284e-07, "loss": 0.286, "step": 3272 }, { "epoch": 1.6476370510396976, "grad_norm": 0.17723397432269952, "learning_rate": 4.3346128965093077e-07, "loss": 0.272, "step": 3273 }, { "epoch": 1.6481411468178955, "grad_norm": 0.1801415778809958, "learning_rate": 4.331951298127547e-07, "loss": 0.2686, "step": 3274 }, { "epoch": 1.6486452425960931, "grad_norm": 0.17591886578148874, "learning_rate": 4.3292898924736976e-07, "loss": 0.2549, "step": 3275 }, { "epoch": 1.6491493383742912, "grad_norm": 0.1759321403355535, "learning_rate": 4.326628680315558e-07, "loss": 0.2836, "step": 3276 }, { "epoch": 1.6496534341524889, "grad_norm": 0.17426684713872134, "learning_rate": 4.323967662420874e-07, "loss": 0.266, "step": 3277 }, { "epoch": 1.650157529930687, "grad_norm": 0.1741569992164329, "learning_rate": 4.3213068395573304e-07, "loss": 0.2647, "step": 3278 }, { "epoch": 1.6506616257088846, "grad_norm": 0.17222858934473642, "learning_rate": 4.3186462124925616e-07, "loss": 0.2735, "step": 3279 }, { "epoch": 1.6511657214870825, "grad_norm": 0.1823409553601484, "learning_rate": 4.315985781994137e-07, "loss": 0.2756, "step": 3280 }, { "epoch": 1.6516698172652804, "grad_norm": 0.18057706073850457, "learning_rate": 4.3133255488295793e-07, "loss": 0.2938, "step": 3281 }, { "epoch": 1.6521739130434783, "grad_norm": 0.16850573295711344, "learning_rate": 4.310665513766347e-07, "loss": 0.258, "step": 3282 }, { "epoch": 1.6526780088216761, "grad_norm": 0.17055663371284882, "learning_rate": 4.308005677571842e-07, "loss": 0.2773, "step": 3283 }, { "epoch": 1.653182104599874, "grad_norm": 0.18266654108280814, "learning_rate": 4.305346041013414e-07, "loss": 0.2641, "step": 3284 }, { "epoch": 1.653686200378072, "grad_norm": 0.18075413873305338, "learning_rate": 4.302686604858349e-07, "loss": 0.2729, "step": 3285 }, { "epoch": 1.6541902961562696, "grad_norm": 0.16673471863682812, "learning_rate": 4.3000273698738804e-07, "loss": 0.2734, "step": 3286 }, { "epoch": 1.6546943919344677, "grad_norm": 0.17371867860201198, "learning_rate": 4.2973683368271775e-07, "loss": 0.2755, "step": 3287 }, { "epoch": 1.6551984877126653, "grad_norm": 0.17118499221022526, "learning_rate": 4.2947095064853586e-07, "loss": 0.2644, "step": 3288 }, { "epoch": 1.6557025834908634, "grad_norm": 0.17084068049014356, "learning_rate": 4.2920508796154755e-07, "loss": 0.2749, "step": 3289 }, { "epoch": 1.656206679269061, "grad_norm": 0.17288119445171216, "learning_rate": 4.289392456984531e-07, "loss": 0.2776, "step": 3290 }, { "epoch": 1.656710775047259, "grad_norm": 0.17586923741312185, "learning_rate": 4.2867342393594596e-07, "loss": 0.2771, "step": 3291 }, { "epoch": 1.6572148708254568, "grad_norm": 0.17101471939495422, "learning_rate": 4.284076227507141e-07, "loss": 0.263, "step": 3292 }, { "epoch": 1.6577189666036547, "grad_norm": 0.16831960805801494, "learning_rate": 4.2814184221943964e-07, "loss": 0.2837, "step": 3293 }, { "epoch": 1.6582230623818526, "grad_norm": 0.17622469939088511, "learning_rate": 4.2787608241879847e-07, "loss": 0.2639, "step": 3294 }, { "epoch": 1.6587271581600505, "grad_norm": 0.16999028518865347, "learning_rate": 4.2761034342546087e-07, "loss": 0.2775, "step": 3295 }, { "epoch": 1.6592312539382483, "grad_norm": 0.17456572812362198, "learning_rate": 4.2734462531609063e-07, "loss": 0.2736, "step": 3296 }, { "epoch": 1.659735349716446, "grad_norm": 0.18460824603728268, "learning_rate": 4.270789281673461e-07, "loss": 0.2742, "step": 3297 }, { "epoch": 1.660239445494644, "grad_norm": 0.17060321969936063, "learning_rate": 4.26813252055879e-07, "loss": 0.283, "step": 3298 }, { "epoch": 1.6607435412728417, "grad_norm": 0.17365085031389696, "learning_rate": 4.265475970583353e-07, "loss": 0.2633, "step": 3299 }, { "epoch": 1.6612476370510398, "grad_norm": 0.17714863665239125, "learning_rate": 4.262819632513548e-07, "loss": 0.268, "step": 3300 }, { "epoch": 1.6617517328292375, "grad_norm": 0.1801000943195406, "learning_rate": 4.260163507115712e-07, "loss": 0.285, "step": 3301 }, { "epoch": 1.6622558286074354, "grad_norm": 0.17866572538561554, "learning_rate": 4.257507595156123e-07, "loss": 0.2608, "step": 3302 }, { "epoch": 1.6627599243856332, "grad_norm": 0.17544127376337787, "learning_rate": 4.2548518974009906e-07, "loss": 0.2845, "step": 3303 }, { "epoch": 1.6632640201638311, "grad_norm": 0.17044959098671644, "learning_rate": 4.2521964146164726e-07, "loss": 0.267, "step": 3304 }, { "epoch": 1.663768115942029, "grad_norm": 0.18277292620452268, "learning_rate": 4.249541147568656e-07, "loss": 0.2654, "step": 3305 }, { "epoch": 1.6642722117202269, "grad_norm": 0.1700015852346814, "learning_rate": 4.2468860970235676e-07, "loss": 0.278, "step": 3306 }, { "epoch": 1.6647763074984248, "grad_norm": 0.19816383195198753, "learning_rate": 4.244231263747177e-07, "loss": 0.2819, "step": 3307 }, { "epoch": 1.6652804032766224, "grad_norm": 0.17293706700556136, "learning_rate": 4.241576648505383e-07, "loss": 0.2721, "step": 3308 }, { "epoch": 1.6657844990548205, "grad_norm": 0.16833769270413493, "learning_rate": 4.2389222520640297e-07, "loss": 0.2629, "step": 3309 }, { "epoch": 1.6662885948330182, "grad_norm": 0.1796549386579674, "learning_rate": 4.2362680751888894e-07, "loss": 0.281, "step": 3310 }, { "epoch": 1.6667926906112163, "grad_norm": 0.17213164153650173, "learning_rate": 4.2336141186456815e-07, "loss": 0.278, "step": 3311 }, { "epoch": 1.667296786389414, "grad_norm": 0.16665647034381148, "learning_rate": 4.2309603832000523e-07, "loss": 0.2677, "step": 3312 }, { "epoch": 1.6678008821676118, "grad_norm": 0.19006288706354632, "learning_rate": 4.2283068696175867e-07, "loss": 0.2752, "step": 3313 }, { "epoch": 1.6683049779458097, "grad_norm": 0.17508641308271217, "learning_rate": 4.225653578663811e-07, "loss": 0.2767, "step": 3314 }, { "epoch": 1.6688090737240076, "grad_norm": 0.17313752219693374, "learning_rate": 4.2230005111041793e-07, "loss": 0.2752, "step": 3315 }, { "epoch": 1.6693131695022054, "grad_norm": 0.17393250261360155, "learning_rate": 4.2203476677040876e-07, "loss": 0.2722, "step": 3316 }, { "epoch": 1.6698172652804033, "grad_norm": 0.18422917201705832, "learning_rate": 4.217695049228864e-07, "loss": 0.271, "step": 3317 }, { "epoch": 1.6703213610586012, "grad_norm": 0.1711040855484809, "learning_rate": 4.2150426564437737e-07, "loss": 0.2628, "step": 3318 }, { "epoch": 1.6708254568367988, "grad_norm": 0.17294159684320054, "learning_rate": 4.212390490114014e-07, "loss": 0.2892, "step": 3319 }, { "epoch": 1.671329552614997, "grad_norm": 0.18793079339537197, "learning_rate": 4.2097385510047166e-07, "loss": 0.2637, "step": 3320 }, { "epoch": 1.6718336483931946, "grad_norm": 0.1773209876917145, "learning_rate": 4.207086839880955e-07, "loss": 0.27, "step": 3321 }, { "epoch": 1.6723377441713927, "grad_norm": 0.17369452560686208, "learning_rate": 4.204435357507725e-07, "loss": 0.2695, "step": 3322 }, { "epoch": 1.6728418399495903, "grad_norm": 0.17729418122505228, "learning_rate": 4.201784104649969e-07, "loss": 0.2631, "step": 3323 }, { "epoch": 1.6733459357277882, "grad_norm": 0.17757333703014838, "learning_rate": 4.199133082072552e-07, "loss": 0.2761, "step": 3324 }, { "epoch": 1.673850031505986, "grad_norm": 0.17481749922840548, "learning_rate": 4.1964822905402817e-07, "loss": 0.2948, "step": 3325 }, { "epoch": 1.674354127284184, "grad_norm": 0.17607568877612143, "learning_rate": 4.1938317308178903e-07, "loss": 0.2789, "step": 3326 }, { "epoch": 1.6748582230623819, "grad_norm": 0.17514273867736174, "learning_rate": 4.191181403670054e-07, "loss": 0.2693, "step": 3327 }, { "epoch": 1.6753623188405797, "grad_norm": 0.16986181408491555, "learning_rate": 4.1885313098613714e-07, "loss": 0.2719, "step": 3328 }, { "epoch": 1.6758664146187776, "grad_norm": 0.17536949245885125, "learning_rate": 4.185881450156377e-07, "loss": 0.294, "step": 3329 }, { "epoch": 1.6763705103969753, "grad_norm": 0.1729391810469066, "learning_rate": 4.183231825319544e-07, "loss": 0.274, "step": 3330 }, { "epoch": 1.6768746061751734, "grad_norm": 0.181209965516427, "learning_rate": 4.1805824361152677e-07, "loss": 0.2828, "step": 3331 }, { "epoch": 1.677378701953371, "grad_norm": 0.17437527629077368, "learning_rate": 4.177933283307884e-07, "loss": 0.264, "step": 3332 }, { "epoch": 1.6778827977315691, "grad_norm": 0.17749950810916038, "learning_rate": 4.1752843676616533e-07, "loss": 0.2774, "step": 3333 }, { "epoch": 1.6783868935097668, "grad_norm": 0.17035105633216754, "learning_rate": 4.1726356899407765e-07, "loss": 0.2691, "step": 3334 }, { "epoch": 1.6788909892879649, "grad_norm": 0.18282073284261757, "learning_rate": 4.1699872509093774e-07, "loss": 0.2774, "step": 3335 }, { "epoch": 1.6793950850661625, "grad_norm": 0.1739086220354733, "learning_rate": 4.167339051331513e-07, "loss": 0.2761, "step": 3336 }, { "epoch": 1.6798991808443604, "grad_norm": 0.178642966353365, "learning_rate": 4.164691091971176e-07, "loss": 0.2807, "step": 3337 }, { "epoch": 1.6804032766225583, "grad_norm": 0.17158028839895453, "learning_rate": 4.162043373592282e-07, "loss": 0.2658, "step": 3338 }, { "epoch": 1.6809073724007562, "grad_norm": 0.1761545331753215, "learning_rate": 4.1593958969586864e-07, "loss": 0.2683, "step": 3339 }, { "epoch": 1.681411468178954, "grad_norm": 0.1790803594461759, "learning_rate": 4.156748662834165e-07, "loss": 0.2744, "step": 3340 }, { "epoch": 1.6819155639571517, "grad_norm": 0.17622383520305496, "learning_rate": 4.154101671982433e-07, "loss": 0.2654, "step": 3341 }, { "epoch": 1.6824196597353498, "grad_norm": 0.1749463588948772, "learning_rate": 4.151454925167129e-07, "loss": 0.292, "step": 3342 }, { "epoch": 1.6829237555135474, "grad_norm": 0.17771616434870652, "learning_rate": 4.14880842315182e-07, "loss": 0.2689, "step": 3343 }, { "epoch": 1.6834278512917455, "grad_norm": 0.171167947600746, "learning_rate": 4.14616216670001e-07, "loss": 0.2675, "step": 3344 }, { "epoch": 1.6839319470699432, "grad_norm": 0.16943013435788834, "learning_rate": 4.143516156575124e-07, "loss": 0.2602, "step": 3345 }, { "epoch": 1.6844360428481413, "grad_norm": 0.18285712491263306, "learning_rate": 4.1408703935405234e-07, "loss": 0.278, "step": 3346 }, { "epoch": 1.684940138626339, "grad_norm": 0.17919325702734495, "learning_rate": 4.1382248783594905e-07, "loss": 0.2892, "step": 3347 }, { "epoch": 1.6854442344045368, "grad_norm": 0.17248307230193116, "learning_rate": 4.135579611795243e-07, "loss": 0.2802, "step": 3348 }, { "epoch": 1.6859483301827347, "grad_norm": 0.17017032309653435, "learning_rate": 4.132934594610922e-07, "loss": 0.2613, "step": 3349 }, { "epoch": 1.6864524259609326, "grad_norm": 0.17232227547146478, "learning_rate": 4.130289827569599e-07, "loss": 0.28, "step": 3350 }, { "epoch": 1.6869565217391305, "grad_norm": 0.19143726314461165, "learning_rate": 4.127645311434275e-07, "loss": 0.2882, "step": 3351 }, { "epoch": 1.6874606175173283, "grad_norm": 0.17907586158542865, "learning_rate": 4.125001046967871e-07, "loss": 0.2795, "step": 3352 }, { "epoch": 1.6879647132955262, "grad_norm": 0.1745434199070893, "learning_rate": 4.122357034933248e-07, "loss": 0.2827, "step": 3353 }, { "epoch": 1.6884688090737239, "grad_norm": 0.18581346560412773, "learning_rate": 4.119713276093181e-07, "loss": 0.2744, "step": 3354 }, { "epoch": 1.688972904851922, "grad_norm": 0.17050406426787684, "learning_rate": 4.117069771210384e-07, "loss": 0.2638, "step": 3355 }, { "epoch": 1.6894770006301196, "grad_norm": 0.1678464885882017, "learning_rate": 4.114426521047485e-07, "loss": 0.274, "step": 3356 }, { "epoch": 1.6899810964083177, "grad_norm": 0.18966378316531626, "learning_rate": 4.1117835263670515e-07, "loss": 0.2813, "step": 3357 }, { "epoch": 1.6904851921865154, "grad_norm": 0.1700410651953894, "learning_rate": 4.109140787931569e-07, "loss": 0.261, "step": 3358 }, { "epoch": 1.6909892879647133, "grad_norm": 0.165637909932199, "learning_rate": 4.1064983065034476e-07, "loss": 0.2654, "step": 3359 }, { "epoch": 1.6914933837429111, "grad_norm": 0.1701023187284413, "learning_rate": 4.103856082845032e-07, "loss": 0.2779, "step": 3360 }, { "epoch": 1.691997479521109, "grad_norm": 0.16764886438759627, "learning_rate": 4.1012141177185846e-07, "loss": 0.2495, "step": 3361 }, { "epoch": 1.692501575299307, "grad_norm": 0.17964220409792864, "learning_rate": 4.098572411886296e-07, "loss": 0.2759, "step": 3362 }, { "epoch": 1.6930056710775048, "grad_norm": 0.17356212055438042, "learning_rate": 4.095930966110283e-07, "loss": 0.2607, "step": 3363 }, { "epoch": 1.6935097668557026, "grad_norm": 0.1748654301154168, "learning_rate": 4.0932897811525866e-07, "loss": 0.2647, "step": 3364 }, { "epoch": 1.6940138626339003, "grad_norm": 0.17194284386041553, "learning_rate": 4.0906488577751714e-07, "loss": 0.2757, "step": 3365 }, { "epoch": 1.6945179584120984, "grad_norm": 0.17475464121096163, "learning_rate": 4.0880081967399265e-07, "loss": 0.2766, "step": 3366 }, { "epoch": 1.695022054190296, "grad_norm": 0.17506845809429164, "learning_rate": 4.085367798808669e-07, "loss": 0.2813, "step": 3367 }, { "epoch": 1.6955261499684942, "grad_norm": 0.16997507190026173, "learning_rate": 4.082727664743133e-07, "loss": 0.2748, "step": 3368 }, { "epoch": 1.6960302457466918, "grad_norm": 0.1823347794889387, "learning_rate": 4.080087795304986e-07, "loss": 0.2795, "step": 3369 }, { "epoch": 1.6965343415248897, "grad_norm": 0.1714581235316788, "learning_rate": 4.0774481912558103e-07, "loss": 0.267, "step": 3370 }, { "epoch": 1.6970384373030876, "grad_norm": 0.16918073676897602, "learning_rate": 4.0748088533571174e-07, "loss": 0.2659, "step": 3371 }, { "epoch": 1.6975425330812854, "grad_norm": 0.1683780550275527, "learning_rate": 4.072169782370339e-07, "loss": 0.2792, "step": 3372 }, { "epoch": 1.6980466288594833, "grad_norm": 0.1772465529025339, "learning_rate": 4.069530979056829e-07, "loss": 0.2675, "step": 3373 }, { "epoch": 1.6985507246376812, "grad_norm": 0.20723872319464334, "learning_rate": 4.06689244417787e-07, "loss": 0.2761, "step": 3374 }, { "epoch": 1.699054820415879, "grad_norm": 0.17653778927841088, "learning_rate": 4.064254178494658e-07, "loss": 0.278, "step": 3375 }, { "epoch": 1.6995589161940767, "grad_norm": 0.1733117296454345, "learning_rate": 4.0616161827683215e-07, "loss": 0.2633, "step": 3376 }, { "epoch": 1.7000630119722748, "grad_norm": 0.17322471461748393, "learning_rate": 4.058978457759902e-07, "loss": 0.2777, "step": 3377 }, { "epoch": 1.7005671077504725, "grad_norm": 0.16784867847711188, "learning_rate": 4.056341004230368e-07, "loss": 0.2646, "step": 3378 }, { "epoch": 1.7010712035286706, "grad_norm": 0.18157842039507768, "learning_rate": 4.0537038229406093e-07, "loss": 0.286, "step": 3379 }, { "epoch": 1.7015752993068682, "grad_norm": 0.17284511881224912, "learning_rate": 4.0510669146514356e-07, "loss": 0.2667, "step": 3380 }, { "epoch": 1.7020793950850661, "grad_norm": 0.1750808365681563, "learning_rate": 4.0484302801235794e-07, "loss": 0.2888, "step": 3381 }, { "epoch": 1.702583490863264, "grad_norm": 0.17993571684627316, "learning_rate": 4.0457939201176905e-07, "loss": 0.2657, "step": 3382 }, { "epoch": 1.7030875866414619, "grad_norm": 0.1860139337219952, "learning_rate": 4.0431578353943464e-07, "loss": 0.2712, "step": 3383 }, { "epoch": 1.7030875866414619, "eval_loss": 0.3066566586494446, "eval_runtime": 17.4494, "eval_samples_per_second": 48.999, "eval_steps_per_second": 1.032, "step": 3383 }, { "epoch": 1.7035916824196597, "grad_norm": 0.1901404388554431, "learning_rate": 4.0405220267140375e-07, "loss": 0.2657, "step": 3384 }, { "epoch": 1.7040957781978576, "grad_norm": 0.17602879795743603, "learning_rate": 4.0378864948371824e-07, "loss": 0.2814, "step": 3385 }, { "epoch": 1.7045998739760555, "grad_norm": 0.17623442887630172, "learning_rate": 4.035251240524114e-07, "loss": 0.2811, "step": 3386 }, { "epoch": 1.7051039697542532, "grad_norm": 0.18006049009972366, "learning_rate": 4.0326162645350837e-07, "loss": 0.2789, "step": 3387 }, { "epoch": 1.7056080655324513, "grad_norm": 0.19068107643868426, "learning_rate": 4.029981567630271e-07, "loss": 0.2684, "step": 3388 }, { "epoch": 1.706112161310649, "grad_norm": 0.17351592149290326, "learning_rate": 4.027347150569765e-07, "loss": 0.2635, "step": 3389 }, { "epoch": 1.706616257088847, "grad_norm": 0.18358987651652428, "learning_rate": 4.0247130141135825e-07, "loss": 0.2971, "step": 3390 }, { "epoch": 1.7071203528670447, "grad_norm": 0.1726651344266121, "learning_rate": 4.0220791590216516e-07, "loss": 0.2766, "step": 3391 }, { "epoch": 1.7076244486452425, "grad_norm": 0.17121736158478892, "learning_rate": 4.0194455860538284e-07, "loss": 0.264, "step": 3392 }, { "epoch": 1.7081285444234404, "grad_norm": 0.1723867701481724, "learning_rate": 4.016812295969878e-07, "loss": 0.2808, "step": 3393 }, { "epoch": 1.7086326402016383, "grad_norm": 0.17772561012794783, "learning_rate": 4.01417928952949e-07, "loss": 0.2841, "step": 3394 }, { "epoch": 1.7091367359798362, "grad_norm": 0.17268289420348543, "learning_rate": 4.0115465674922697e-07, "loss": 0.2715, "step": 3395 }, { "epoch": 1.709640831758034, "grad_norm": 0.17538041239709554, "learning_rate": 4.008914130617742e-07, "loss": 0.2944, "step": 3396 }, { "epoch": 1.710144927536232, "grad_norm": 0.1893702660671161, "learning_rate": 4.006281979665349e-07, "loss": 0.2851, "step": 3397 }, { "epoch": 1.7106490233144296, "grad_norm": 0.17669490425000803, "learning_rate": 4.003650115394446e-07, "loss": 0.2619, "step": 3398 }, { "epoch": 1.7111531190926277, "grad_norm": 0.17081475118990422, "learning_rate": 4.0010185385643156e-07, "loss": 0.2664, "step": 3399 }, { "epoch": 1.7116572148708253, "grad_norm": 0.18139383971070436, "learning_rate": 3.9983872499341463e-07, "loss": 0.2722, "step": 3400 }, { "epoch": 1.7121613106490234, "grad_norm": 0.16819965099097467, "learning_rate": 3.9957562502630524e-07, "loss": 0.2527, "step": 3401 }, { "epoch": 1.712665406427221, "grad_norm": 0.17426197590337408, "learning_rate": 3.99312554031006e-07, "loss": 0.2675, "step": 3402 }, { "epoch": 1.7131695022054192, "grad_norm": 0.1781326625588422, "learning_rate": 3.99049512083411e-07, "loss": 0.2769, "step": 3403 }, { "epoch": 1.7136735979836168, "grad_norm": 0.18770153745691615, "learning_rate": 3.9878649925940653e-07, "loss": 0.266, "step": 3404 }, { "epoch": 1.7141776937618147, "grad_norm": 0.17464611340669486, "learning_rate": 3.9852351563486987e-07, "loss": 0.2761, "step": 3405 }, { "epoch": 1.7146817895400126, "grad_norm": 0.171635429988461, "learning_rate": 3.9826056128567053e-07, "loss": 0.2696, "step": 3406 }, { "epoch": 1.7151858853182105, "grad_norm": 0.18495245117417844, "learning_rate": 3.9799763628766895e-07, "loss": 0.2975, "step": 3407 }, { "epoch": 1.7156899810964084, "grad_norm": 0.169836479191478, "learning_rate": 3.977347407167174e-07, "loss": 0.2755, "step": 3408 }, { "epoch": 1.716194076874606, "grad_norm": 0.17255997130578216, "learning_rate": 3.9747187464865984e-07, "loss": 0.2772, "step": 3409 }, { "epoch": 1.716698172652804, "grad_norm": 0.16749153560856467, "learning_rate": 3.972090381593311e-07, "loss": 0.2736, "step": 3410 }, { "epoch": 1.7172022684310018, "grad_norm": 0.17079123437345936, "learning_rate": 3.9694623132455815e-07, "loss": 0.2602, "step": 3411 }, { "epoch": 1.7177063642091999, "grad_norm": 0.17321152114395624, "learning_rate": 3.96683454220159e-07, "loss": 0.275, "step": 3412 }, { "epoch": 1.7182104599873975, "grad_norm": 0.16962109663638752, "learning_rate": 3.964207069219435e-07, "loss": 0.2732, "step": 3413 }, { "epoch": 1.7187145557655956, "grad_norm": 0.1746549136635196, "learning_rate": 3.9615798950571216e-07, "loss": 0.2726, "step": 3414 }, { "epoch": 1.7192186515437933, "grad_norm": 0.16750819214015739, "learning_rate": 3.9589530204725787e-07, "loss": 0.2659, "step": 3415 }, { "epoch": 1.7197227473219912, "grad_norm": 0.1808474492424595, "learning_rate": 3.956326446223639e-07, "loss": 0.2684, "step": 3416 }, { "epoch": 1.720226843100189, "grad_norm": 0.17539397294716638, "learning_rate": 3.953700173068053e-07, "loss": 0.2714, "step": 3417 }, { "epoch": 1.720730938878387, "grad_norm": 0.17517500086130025, "learning_rate": 3.9510742017634877e-07, "loss": 0.2629, "step": 3418 }, { "epoch": 1.7212350346565848, "grad_norm": 0.17325681094241782, "learning_rate": 3.9484485330675153e-07, "loss": 0.2714, "step": 3419 }, { "epoch": 1.7217391304347827, "grad_norm": 0.17437360960471293, "learning_rate": 3.9458231677376297e-07, "loss": 0.2784, "step": 3420 }, { "epoch": 1.7222432262129805, "grad_norm": 0.17254819902144547, "learning_rate": 3.943198106531227e-07, "loss": 0.2596, "step": 3421 }, { "epoch": 1.7227473219911782, "grad_norm": 0.16965358248820828, "learning_rate": 3.9405733502056267e-07, "loss": 0.2705, "step": 3422 }, { "epoch": 1.7232514177693763, "grad_norm": 0.17540405751695837, "learning_rate": 3.9379488995180503e-07, "loss": 0.275, "step": 3423 }, { "epoch": 1.723755513547574, "grad_norm": 0.17044320687255016, "learning_rate": 3.935324755225638e-07, "loss": 0.261, "step": 3424 }, { "epoch": 1.724259609325772, "grad_norm": 0.17079300009403095, "learning_rate": 3.932700918085439e-07, "loss": 0.2795, "step": 3425 }, { "epoch": 1.7247637051039697, "grad_norm": 0.17492261380235571, "learning_rate": 3.930077388854413e-07, "loss": 0.2708, "step": 3426 }, { "epoch": 1.7252678008821676, "grad_norm": 0.1697136489985747, "learning_rate": 3.9274541682894325e-07, "loss": 0.2646, "step": 3427 }, { "epoch": 1.7257718966603655, "grad_norm": 0.17356605819677773, "learning_rate": 3.9248312571472803e-07, "loss": 0.2725, "step": 3428 }, { "epoch": 1.7262759924385633, "grad_norm": 0.1798724083697512, "learning_rate": 3.9222086561846515e-07, "loss": 0.2689, "step": 3429 }, { "epoch": 1.7267800882167612, "grad_norm": 0.172033557393297, "learning_rate": 3.9195863661581466e-07, "loss": 0.2675, "step": 3430 }, { "epoch": 1.727284183994959, "grad_norm": 0.17813642880671784, "learning_rate": 3.9169643878242836e-07, "loss": 0.2538, "step": 3431 }, { "epoch": 1.727788279773157, "grad_norm": 0.1744243470379689, "learning_rate": 3.9143427219394857e-07, "loss": 0.2719, "step": 3432 }, { "epoch": 1.7282923755513546, "grad_norm": 0.19169988100579302, "learning_rate": 3.911721369260086e-07, "loss": 0.2729, "step": 3433 }, { "epoch": 1.7287964713295527, "grad_norm": 0.1669226246028055, "learning_rate": 3.9091003305423315e-07, "loss": 0.2644, "step": 3434 }, { "epoch": 1.7293005671077504, "grad_norm": 0.17848442108252346, "learning_rate": 3.9064796065423716e-07, "loss": 0.2765, "step": 3435 }, { "epoch": 1.7298046628859485, "grad_norm": 0.17288599385442902, "learning_rate": 3.9038591980162733e-07, "loss": 0.2624, "step": 3436 }, { "epoch": 1.7303087586641461, "grad_norm": 0.17418345316818656, "learning_rate": 3.901239105720004e-07, "loss": 0.2695, "step": 3437 }, { "epoch": 1.730812854442344, "grad_norm": 0.16887161070123388, "learning_rate": 3.8986193304094483e-07, "loss": 0.269, "step": 3438 }, { "epoch": 1.7313169502205419, "grad_norm": 0.1715448109776923, "learning_rate": 3.8959998728403937e-07, "loss": 0.2749, "step": 3439 }, { "epoch": 1.7318210459987398, "grad_norm": 0.17162576830034787, "learning_rate": 3.8933807337685344e-07, "loss": 0.2866, "step": 3440 }, { "epoch": 1.7323251417769376, "grad_norm": 0.17140705013497554, "learning_rate": 3.89076191394948e-07, "loss": 0.2515, "step": 3441 }, { "epoch": 1.7328292375551355, "grad_norm": 0.17186684179031347, "learning_rate": 3.8881434141387414e-07, "loss": 0.2633, "step": 3442 }, { "epoch": 1.7333333333333334, "grad_norm": 0.17247735874366607, "learning_rate": 3.885525235091741e-07, "loss": 0.2699, "step": 3443 }, { "epoch": 1.733837429111531, "grad_norm": 0.19035869520485968, "learning_rate": 3.8829073775638043e-07, "loss": 0.2869, "step": 3444 }, { "epoch": 1.7343415248897291, "grad_norm": 0.17126200809840406, "learning_rate": 3.8802898423101715e-07, "loss": 0.2659, "step": 3445 }, { "epoch": 1.7348456206679268, "grad_norm": 0.17097316798308923, "learning_rate": 3.877672630085983e-07, "loss": 0.2747, "step": 3446 }, { "epoch": 1.735349716446125, "grad_norm": 0.16663798950741054, "learning_rate": 3.8750557416462863e-07, "loss": 0.2666, "step": 3447 }, { "epoch": 1.7358538122243226, "grad_norm": 0.17374809497432028, "learning_rate": 3.8724391777460415e-07, "loss": 0.2676, "step": 3448 }, { "epoch": 1.7363579080025204, "grad_norm": 0.1705608920788751, "learning_rate": 3.869822939140107e-07, "loss": 0.273, "step": 3449 }, { "epoch": 1.7368620037807183, "grad_norm": 0.1853493481955229, "learning_rate": 3.867207026583255e-07, "loss": 0.2858, "step": 3450 }, { "epoch": 1.7373660995589162, "grad_norm": 0.1768536035338538, "learning_rate": 3.8645914408301574e-07, "loss": 0.2832, "step": 3451 }, { "epoch": 1.737870195337114, "grad_norm": 0.16841766882214215, "learning_rate": 3.861976182635397e-07, "loss": 0.2623, "step": 3452 }, { "epoch": 1.738374291115312, "grad_norm": 0.21467318021749093, "learning_rate": 3.8593612527534585e-07, "loss": 0.2769, "step": 3453 }, { "epoch": 1.7388783868935098, "grad_norm": 0.17835594443596708, "learning_rate": 3.8567466519387305e-07, "loss": 0.2818, "step": 3454 }, { "epoch": 1.7393824826717075, "grad_norm": 0.17944917802544677, "learning_rate": 3.854132380945513e-07, "loss": 0.285, "step": 3455 }, { "epoch": 1.7398865784499056, "grad_norm": 0.1756271448121031, "learning_rate": 3.8515184405280054e-07, "loss": 0.2837, "step": 3456 }, { "epoch": 1.7403906742281032, "grad_norm": 0.16873921221680957, "learning_rate": 3.8489048314403124e-07, "loss": 0.2668, "step": 3457 }, { "epoch": 1.7408947700063013, "grad_norm": 0.1837468940499396, "learning_rate": 3.846291554436445e-07, "loss": 0.2771, "step": 3458 }, { "epoch": 1.741398865784499, "grad_norm": 0.17119908182449262, "learning_rate": 3.8436786102703186e-07, "loss": 0.2642, "step": 3459 }, { "epoch": 1.7419029615626969, "grad_norm": 0.17158536564497223, "learning_rate": 3.841065999695748e-07, "loss": 0.2716, "step": 3460 }, { "epoch": 1.7424070573408947, "grad_norm": 0.18136191876998276, "learning_rate": 3.838453723466459e-07, "loss": 0.2701, "step": 3461 }, { "epoch": 1.7429111531190926, "grad_norm": 0.17402251262948262, "learning_rate": 3.8358417823360757e-07, "loss": 0.2705, "step": 3462 }, { "epoch": 1.7434152488972905, "grad_norm": 0.16714838859967632, "learning_rate": 3.8332301770581244e-07, "loss": 0.2705, "step": 3463 }, { "epoch": 1.7439193446754884, "grad_norm": 0.16798057047478643, "learning_rate": 3.8306189083860414e-07, "loss": 0.265, "step": 3464 }, { "epoch": 1.7444234404536862, "grad_norm": 0.1675965482058386, "learning_rate": 3.8280079770731565e-07, "loss": 0.2642, "step": 3465 }, { "epoch": 1.744927536231884, "grad_norm": 0.1722718830074659, "learning_rate": 3.8253973838727134e-07, "loss": 0.2783, "step": 3466 }, { "epoch": 1.745431632010082, "grad_norm": 0.17511740118469324, "learning_rate": 3.822787129537846e-07, "loss": 0.2548, "step": 3467 }, { "epoch": 1.7459357277882797, "grad_norm": 0.1692592884492893, "learning_rate": 3.820177214821602e-07, "loss": 0.2756, "step": 3468 }, { "epoch": 1.7464398235664778, "grad_norm": 0.1838015667126571, "learning_rate": 3.8175676404769226e-07, "loss": 0.2835, "step": 3469 }, { "epoch": 1.7469439193446754, "grad_norm": 0.17221059457481896, "learning_rate": 3.8149584072566516e-07, "loss": 0.2679, "step": 3470 }, { "epoch": 1.7474480151228735, "grad_norm": 0.17240520098361659, "learning_rate": 3.8123495159135414e-07, "loss": 0.2602, "step": 3471 }, { "epoch": 1.7479521109010712, "grad_norm": 0.17645638344602915, "learning_rate": 3.8097409672002384e-07, "loss": 0.2664, "step": 3472 }, { "epoch": 1.748456206679269, "grad_norm": 0.17119884887181155, "learning_rate": 3.8071327618692926e-07, "loss": 0.2771, "step": 3473 }, { "epoch": 1.748960302457467, "grad_norm": 0.16737717354943357, "learning_rate": 3.8045249006731554e-07, "loss": 0.257, "step": 3474 }, { "epoch": 1.7494643982356648, "grad_norm": 0.17011856009678725, "learning_rate": 3.80191738436418e-07, "loss": 0.2622, "step": 3475 }, { "epoch": 1.7499684940138627, "grad_norm": 0.171796792732586, "learning_rate": 3.799310213694618e-07, "loss": 0.2818, "step": 3476 }, { "epoch": 1.7504725897920603, "grad_norm": 0.1928931218051459, "learning_rate": 3.796703389416619e-07, "loss": 0.2763, "step": 3477 }, { "epoch": 1.7509766855702584, "grad_norm": 0.20147336279423894, "learning_rate": 3.7940969122822395e-07, "loss": 0.2824, "step": 3478 }, { "epoch": 1.751480781348456, "grad_norm": 0.175565231620901, "learning_rate": 3.7914907830434296e-07, "loss": 0.2864, "step": 3479 }, { "epoch": 1.7519848771266542, "grad_norm": 0.17087917079117643, "learning_rate": 3.7888850024520443e-07, "loss": 0.2664, "step": 3480 }, { "epoch": 1.7524889729048518, "grad_norm": 0.17664931365772008, "learning_rate": 3.7862795712598307e-07, "loss": 0.2758, "step": 3481 }, { "epoch": 1.75299306868305, "grad_norm": 0.17456510661593722, "learning_rate": 3.783674490218445e-07, "loss": 0.2734, "step": 3482 }, { "epoch": 1.7534971644612476, "grad_norm": 0.18609071641176797, "learning_rate": 3.7810697600794335e-07, "loss": 0.267, "step": 3483 }, { "epoch": 1.7540012602394455, "grad_norm": 0.17283492786683485, "learning_rate": 3.7784653815942436e-07, "loss": 0.2837, "step": 3484 }, { "epoch": 1.7545053560176433, "grad_norm": 0.17490689341326401, "learning_rate": 3.775861355514226e-07, "loss": 0.2799, "step": 3485 }, { "epoch": 1.7550094517958412, "grad_norm": 0.17672370129936685, "learning_rate": 3.7732576825906223e-07, "loss": 0.2539, "step": 3486 }, { "epoch": 1.755513547574039, "grad_norm": 0.17912383987865965, "learning_rate": 3.7706543635745795e-07, "loss": 0.2949, "step": 3487 }, { "epoch": 1.756017643352237, "grad_norm": 0.1744911022180228, "learning_rate": 3.768051399217137e-07, "loss": 0.2803, "step": 3488 }, { "epoch": 1.7565217391304349, "grad_norm": 0.17364431498626157, "learning_rate": 3.7654487902692355e-07, "loss": 0.2761, "step": 3489 }, { "epoch": 1.7570258349086325, "grad_norm": 0.1746170171542789, "learning_rate": 3.762846537481708e-07, "loss": 0.2696, "step": 3490 }, { "epoch": 1.7575299306868306, "grad_norm": 0.17817739668123456, "learning_rate": 3.7602446416052926e-07, "loss": 0.2801, "step": 3491 }, { "epoch": 1.7580340264650283, "grad_norm": 0.17133707420290878, "learning_rate": 3.757643103390618e-07, "loss": 0.278, "step": 3492 }, { "epoch": 1.7585381222432264, "grad_norm": 0.17227766863166188, "learning_rate": 3.75504192358821e-07, "loss": 0.2655, "step": 3493 }, { "epoch": 1.759042218021424, "grad_norm": 0.1841860670831539, "learning_rate": 3.7524411029484984e-07, "loss": 0.284, "step": 3494 }, { "epoch": 1.759546313799622, "grad_norm": 0.18128560254995008, "learning_rate": 3.7498406422217967e-07, "loss": 0.2688, "step": 3495 }, { "epoch": 1.7600504095778198, "grad_norm": 0.1777006939689815, "learning_rate": 3.7472405421583283e-07, "loss": 0.2665, "step": 3496 }, { "epoch": 1.7605545053560177, "grad_norm": 0.17651881222532234, "learning_rate": 3.7446408035082e-07, "loss": 0.2659, "step": 3497 }, { "epoch": 1.7610586011342155, "grad_norm": 0.17121563039114812, "learning_rate": 3.742041427021426e-07, "loss": 0.2661, "step": 3498 }, { "epoch": 1.7615626969124134, "grad_norm": 0.17086616322817438, "learning_rate": 3.7394424134479085e-07, "loss": 0.2748, "step": 3499 }, { "epoch": 1.7620667926906113, "grad_norm": 0.1845306405915281, "learning_rate": 3.736843763537443e-07, "loss": 0.2666, "step": 3500 }, { "epoch": 1.762570888468809, "grad_norm": 0.18385553877215485, "learning_rate": 3.734245478039729e-07, "loss": 0.2876, "step": 3501 }, { "epoch": 1.763074984247007, "grad_norm": 0.1766369636170071, "learning_rate": 3.7316475577043515e-07, "loss": 0.2662, "step": 3502 }, { "epoch": 1.7635790800252047, "grad_norm": 0.17153706729666343, "learning_rate": 3.7290500032807994e-07, "loss": 0.2629, "step": 3503 }, { "epoch": 1.7640831758034028, "grad_norm": 0.17299192192087665, "learning_rate": 3.726452815518446e-07, "loss": 0.2619, "step": 3504 }, { "epoch": 1.7645872715816004, "grad_norm": 0.19384856527161845, "learning_rate": 3.7238559951665684e-07, "loss": 0.264, "step": 3505 }, { "epoch": 1.7650913673597983, "grad_norm": 0.17125569407787852, "learning_rate": 3.721259542974329e-07, "loss": 0.2736, "step": 3506 }, { "epoch": 1.7655954631379962, "grad_norm": 0.17240058028350044, "learning_rate": 3.7186634596907905e-07, "loss": 0.2747, "step": 3507 }, { "epoch": 1.766099558916194, "grad_norm": 0.1788859679300591, "learning_rate": 3.716067746064907e-07, "loss": 0.275, "step": 3508 }, { "epoch": 1.766603654694392, "grad_norm": 0.17068346062555592, "learning_rate": 3.7134724028455235e-07, "loss": 0.2596, "step": 3509 }, { "epoch": 1.7671077504725898, "grad_norm": 0.17273509001374454, "learning_rate": 3.710877430781384e-07, "loss": 0.2734, "step": 3510 }, { "epoch": 1.7676118462507877, "grad_norm": 0.19441499623673344, "learning_rate": 3.708282830621118e-07, "loss": 0.2865, "step": 3511 }, { "epoch": 1.7681159420289854, "grad_norm": 0.18157876524943378, "learning_rate": 3.705688603113256e-07, "loss": 0.2659, "step": 3512 }, { "epoch": 1.7686200378071835, "grad_norm": 0.1755434232731158, "learning_rate": 3.703094749006214e-07, "loss": 0.2657, "step": 3513 }, { "epoch": 1.7691241335853811, "grad_norm": 0.17616250911630144, "learning_rate": 3.700501269048301e-07, "loss": 0.2696, "step": 3514 }, { "epoch": 1.7696282293635792, "grad_norm": 0.1717712747533756, "learning_rate": 3.697908163987724e-07, "loss": 0.263, "step": 3515 }, { "epoch": 1.7701323251417769, "grad_norm": 0.1746899392847966, "learning_rate": 3.6953154345725733e-07, "loss": 0.2833, "step": 3516 }, { "epoch": 1.7706364209199748, "grad_norm": 0.1705652748493287, "learning_rate": 3.6927230815508403e-07, "loss": 0.2637, "step": 3517 }, { "epoch": 1.7711405166981726, "grad_norm": 0.18596072083429435, "learning_rate": 3.6901311056703985e-07, "loss": 0.2953, "step": 3518 }, { "epoch": 1.7716446124763705, "grad_norm": 0.17652711756186662, "learning_rate": 3.6875395076790186e-07, "loss": 0.2861, "step": 3519 }, { "epoch": 1.7721487082545684, "grad_norm": 0.17706846127443582, "learning_rate": 3.684948288324362e-07, "loss": 0.2646, "step": 3520 }, { "epoch": 1.7726528040327663, "grad_norm": 0.17967101443910952, "learning_rate": 3.6823574483539767e-07, "loss": 0.2862, "step": 3521 }, { "epoch": 1.7731568998109641, "grad_norm": 0.1747251079117009, "learning_rate": 3.6797669885153046e-07, "loss": 0.2683, "step": 3522 }, { "epoch": 1.7736609955891618, "grad_norm": 0.1687110695892719, "learning_rate": 3.6771769095556783e-07, "loss": 0.2653, "step": 3523 }, { "epoch": 1.77416509136736, "grad_norm": 0.17110856789881085, "learning_rate": 3.6745872122223196e-07, "loss": 0.2758, "step": 3524 }, { "epoch": 1.7746691871455575, "grad_norm": 0.17460922566080275, "learning_rate": 3.6719978972623385e-07, "loss": 0.2753, "step": 3525 }, { "epoch": 1.7751732829237556, "grad_norm": 0.1847111261299794, "learning_rate": 3.669408965422739e-07, "loss": 0.2731, "step": 3526 }, { "epoch": 1.7756773787019533, "grad_norm": 0.17449754787956734, "learning_rate": 3.6668204174504085e-07, "loss": 0.2726, "step": 3527 }, { "epoch": 1.7761814744801512, "grad_norm": 0.17338254071796924, "learning_rate": 3.664232254092131e-07, "loss": 0.2793, "step": 3528 }, { "epoch": 1.776685570258349, "grad_norm": 0.17628576320824763, "learning_rate": 3.6616444760945744e-07, "loss": 0.2793, "step": 3529 }, { "epoch": 1.777189666036547, "grad_norm": 0.1761083816514799, "learning_rate": 3.659057084204292e-07, "loss": 0.2762, "step": 3530 }, { "epoch": 1.7776937618147448, "grad_norm": 0.17618815238730148, "learning_rate": 3.656470079167737e-07, "loss": 0.2723, "step": 3531 }, { "epoch": 1.7781978575929427, "grad_norm": 0.2082525337835813, "learning_rate": 3.6538834617312395e-07, "loss": 0.2656, "step": 3532 }, { "epoch": 1.7787019533711406, "grad_norm": 0.1740382361506214, "learning_rate": 3.6512972326410274e-07, "loss": 0.2874, "step": 3533 }, { "epoch": 1.7792060491493382, "grad_norm": 0.18497511420153842, "learning_rate": 3.6487113926432067e-07, "loss": 0.2831, "step": 3534 }, { "epoch": 1.7797101449275363, "grad_norm": 0.1868818820255656, "learning_rate": 3.6461259424837787e-07, "loss": 0.2704, "step": 3535 }, { "epoch": 1.780214240705734, "grad_norm": 0.1747555047792214, "learning_rate": 3.643540882908631e-07, "loss": 0.2742, "step": 3536 }, { "epoch": 1.780718336483932, "grad_norm": 0.16955294868925422, "learning_rate": 3.640956214663534e-07, "loss": 0.2805, "step": 3537 }, { "epoch": 1.7812224322621297, "grad_norm": 0.17195016907202454, "learning_rate": 3.638371938494151e-07, "loss": 0.2726, "step": 3538 }, { "epoch": 1.7817265280403278, "grad_norm": 0.17077392343569042, "learning_rate": 3.635788055146028e-07, "loss": 0.2695, "step": 3539 }, { "epoch": 1.7822306238185255, "grad_norm": 0.16922021837013979, "learning_rate": 3.633204565364602e-07, "loss": 0.2607, "step": 3540 }, { "epoch": 1.7827347195967234, "grad_norm": 0.17020929306602342, "learning_rate": 3.63062146989519e-07, "loss": 0.2774, "step": 3541 }, { "epoch": 1.7832388153749212, "grad_norm": 0.18083426533544908, "learning_rate": 3.628038769483002e-07, "loss": 0.2841, "step": 3542 }, { "epoch": 1.7837429111531191, "grad_norm": 0.1790034539370043, "learning_rate": 3.625456464873131e-07, "loss": 0.2713, "step": 3543 }, { "epoch": 1.784247006931317, "grad_norm": 0.1729253600603901, "learning_rate": 3.6228745568105534e-07, "loss": 0.2714, "step": 3544 }, { "epoch": 1.7847511027095146, "grad_norm": 0.20077594866662438, "learning_rate": 3.620293046040137e-07, "loss": 0.2694, "step": 3545 }, { "epoch": 1.7852551984877127, "grad_norm": 0.17484350438222895, "learning_rate": 3.6177119333066274e-07, "loss": 0.2577, "step": 3546 }, { "epoch": 1.7857592942659104, "grad_norm": 0.17015014074186688, "learning_rate": 3.6151312193546653e-07, "loss": 0.2756, "step": 3547 }, { "epoch": 1.7862633900441085, "grad_norm": 0.18310485144032457, "learning_rate": 3.612550904928765e-07, "loss": 0.2605, "step": 3548 }, { "epoch": 1.7867674858223062, "grad_norm": 0.1697602759063394, "learning_rate": 3.6099709907733355e-07, "loss": 0.277, "step": 3549 }, { "epoch": 1.7872715816005043, "grad_norm": 0.17942080037574837, "learning_rate": 3.6073914776326654e-07, "loss": 0.2766, "step": 3550 }, { "epoch": 1.787775677378702, "grad_norm": 0.1749196729191675, "learning_rate": 3.6048123662509257e-07, "loss": 0.2702, "step": 3551 }, { "epoch": 1.7882797731568998, "grad_norm": 0.17142762886817356, "learning_rate": 3.6022336573721757e-07, "loss": 0.2646, "step": 3552 }, { "epoch": 1.7887838689350977, "grad_norm": 0.16935198240355007, "learning_rate": 3.599655351740356e-07, "loss": 0.2799, "step": 3553 }, { "epoch": 1.7892879647132955, "grad_norm": 0.17626517942064382, "learning_rate": 3.597077450099295e-07, "loss": 0.2608, "step": 3554 }, { "epoch": 1.7897920604914934, "grad_norm": 0.1756679530760868, "learning_rate": 3.5944999531926963e-07, "loss": 0.2762, "step": 3555 }, { "epoch": 1.7902961562696913, "grad_norm": 0.17715691284816204, "learning_rate": 3.5919228617641564e-07, "loss": 0.2753, "step": 3556 }, { "epoch": 1.7908002520478892, "grad_norm": 0.1778053393171107, "learning_rate": 3.5893461765571464e-07, "loss": 0.2705, "step": 3557 }, { "epoch": 1.7913043478260868, "grad_norm": 0.17959544599325605, "learning_rate": 3.5867698983150277e-07, "loss": 0.2629, "step": 3558 }, { "epoch": 1.791808443604285, "grad_norm": 0.1662083229493581, "learning_rate": 3.5841940277810395e-07, "loss": 0.2628, "step": 3559 }, { "epoch": 1.7923125393824826, "grad_norm": 0.16874268935126943, "learning_rate": 3.581618565698301e-07, "loss": 0.27, "step": 3560 }, { "epoch": 1.7928166351606807, "grad_norm": 0.17322137849836952, "learning_rate": 3.5790435128098227e-07, "loss": 0.2669, "step": 3561 }, { "epoch": 1.7933207309388783, "grad_norm": 0.1742075041952224, "learning_rate": 3.576468869858486e-07, "loss": 0.2726, "step": 3562 }, { "epoch": 1.7938248267170762, "grad_norm": 0.168819659068297, "learning_rate": 3.573894637587064e-07, "loss": 0.2707, "step": 3563 }, { "epoch": 1.794328922495274, "grad_norm": 0.19374260839838295, "learning_rate": 3.571320816738205e-07, "loss": 0.2779, "step": 3564 }, { "epoch": 1.794833018273472, "grad_norm": 0.16947665412300375, "learning_rate": 3.568747408054439e-07, "loss": 0.2748, "step": 3565 }, { "epoch": 1.7953371140516698, "grad_norm": 0.17469189198050658, "learning_rate": 3.5661744122781804e-07, "loss": 0.2618, "step": 3566 }, { "epoch": 1.7958412098298677, "grad_norm": 0.1702481414901748, "learning_rate": 3.5636018301517213e-07, "loss": 0.2597, "step": 3567 }, { "epoch": 1.7963453056080656, "grad_norm": 0.17222409493077534, "learning_rate": 3.5610296624172365e-07, "loss": 0.2649, "step": 3568 }, { "epoch": 1.7968494013862633, "grad_norm": 0.16931592244630336, "learning_rate": 3.55845790981678e-07, "loss": 0.2731, "step": 3569 }, { "epoch": 1.7973534971644614, "grad_norm": 0.17620807131039093, "learning_rate": 3.555886573092287e-07, "loss": 0.2749, "step": 3570 }, { "epoch": 1.797857592942659, "grad_norm": 0.18052515979122832, "learning_rate": 3.55331565298557e-07, "loss": 0.2878, "step": 3571 }, { "epoch": 1.798361688720857, "grad_norm": 0.17191185207152138, "learning_rate": 3.550745150238328e-07, "loss": 0.2862, "step": 3572 }, { "epoch": 1.7988657844990548, "grad_norm": 0.1757024787598414, "learning_rate": 3.548175065592132e-07, "loss": 0.2649, "step": 3573 }, { "epoch": 1.7993698802772526, "grad_norm": 0.1729303037797562, "learning_rate": 3.545605399788434e-07, "loss": 0.2562, "step": 3574 }, { "epoch": 1.7998739760554505, "grad_norm": 0.17438697638244505, "learning_rate": 3.543036153568571e-07, "loss": 0.2601, "step": 3575 }, { "epoch": 1.8003780718336484, "grad_norm": 0.1785781990776129, "learning_rate": 3.5404673276737503e-07, "loss": 0.2732, "step": 3576 }, { "epoch": 1.8008821676118463, "grad_norm": 0.17317098668224498, "learning_rate": 3.5378989228450664e-07, "loss": 0.2797, "step": 3577 }, { "epoch": 1.8013862633900442, "grad_norm": 0.1690137234088919, "learning_rate": 3.5353309398234833e-07, "loss": 0.2721, "step": 3578 }, { "epoch": 1.801890359168242, "grad_norm": 0.17537848542244042, "learning_rate": 3.5327633793498535e-07, "loss": 0.2714, "step": 3579 }, { "epoch": 1.8023944549464397, "grad_norm": 0.16986985399350213, "learning_rate": 3.530196242164899e-07, "loss": 0.2637, "step": 3580 }, { "epoch": 1.8028985507246378, "grad_norm": 0.16970500175613376, "learning_rate": 3.5276295290092225e-07, "loss": 0.2919, "step": 3581 }, { "epoch": 1.8034026465028354, "grad_norm": 0.1798669561885536, "learning_rate": 3.525063240623307e-07, "loss": 0.2743, "step": 3582 }, { "epoch": 1.8034026465028354, "eval_loss": 0.3062366843223572, "eval_runtime": 17.926, "eval_samples_per_second": 47.696, "eval_steps_per_second": 1.004, "step": 3582 }, { "epoch": 1.8039067422810335, "grad_norm": 0.17610026348303112, "learning_rate": 3.5224973777475096e-07, "loss": 0.2622, "step": 3583 }, { "epoch": 1.8044108380592312, "grad_norm": 0.1819103774919584, "learning_rate": 3.519931941122066e-07, "loss": 0.2749, "step": 3584 }, { "epoch": 1.804914933837429, "grad_norm": 0.1717127969736599, "learning_rate": 3.517366931487089e-07, "loss": 0.2703, "step": 3585 }, { "epoch": 1.805419029615627, "grad_norm": 0.17885330880659442, "learning_rate": 3.514802349582569e-07, "loss": 0.2907, "step": 3586 }, { "epoch": 1.8059231253938248, "grad_norm": 0.18707259558766182, "learning_rate": 3.5122381961483684e-07, "loss": 0.276, "step": 3587 }, { "epoch": 1.8064272211720227, "grad_norm": 0.1782099276673349, "learning_rate": 3.509674471924234e-07, "loss": 0.2716, "step": 3588 }, { "epoch": 1.8069313169502206, "grad_norm": 0.17525854196465432, "learning_rate": 3.507111177649783e-07, "loss": 0.2575, "step": 3589 }, { "epoch": 1.8074354127284185, "grad_norm": 0.18183285195462048, "learning_rate": 3.5045483140645063e-07, "loss": 0.2871, "step": 3590 }, { "epoch": 1.807939508506616, "grad_norm": 0.16829292657965922, "learning_rate": 3.50198588190778e-07, "loss": 0.277, "step": 3591 }, { "epoch": 1.8084436042848142, "grad_norm": 0.16750119184558296, "learning_rate": 3.4994238819188446e-07, "loss": 0.2688, "step": 3592 }, { "epoch": 1.8089477000630119, "grad_norm": 0.19435552393670724, "learning_rate": 3.496862314836826e-07, "loss": 0.2752, "step": 3593 }, { "epoch": 1.80945179584121, "grad_norm": 0.17151739988556397, "learning_rate": 3.4943011814007154e-07, "loss": 0.2752, "step": 3594 }, { "epoch": 1.8099558916194076, "grad_norm": 0.17044681265940018, "learning_rate": 3.4917404823493895e-07, "loss": 0.2764, "step": 3595 }, { "epoch": 1.8104599873976055, "grad_norm": 0.1706463322105864, "learning_rate": 3.489180218421591e-07, "loss": 0.2795, "step": 3596 }, { "epoch": 1.8109640831758034, "grad_norm": 0.1743724675299324, "learning_rate": 3.486620390355939e-07, "loss": 0.2778, "step": 3597 }, { "epoch": 1.8114681789540013, "grad_norm": 0.1672255541484769, "learning_rate": 3.48406099889093e-07, "loss": 0.2602, "step": 3598 }, { "epoch": 1.8119722747321991, "grad_norm": 0.17213091184076965, "learning_rate": 3.4815020447649325e-07, "loss": 0.2662, "step": 3599 }, { "epoch": 1.812476370510397, "grad_norm": 0.17382490282350782, "learning_rate": 3.478943528716189e-07, "loss": 0.2702, "step": 3600 }, { "epoch": 1.8129804662885949, "grad_norm": 0.1707010922925502, "learning_rate": 3.476385451482813e-07, "loss": 0.2694, "step": 3601 }, { "epoch": 1.8134845620667925, "grad_norm": 0.16861692088845995, "learning_rate": 3.473827813802798e-07, "loss": 0.2682, "step": 3602 }, { "epoch": 1.8139886578449906, "grad_norm": 0.17226530366785384, "learning_rate": 3.4712706164140044e-07, "loss": 0.2707, "step": 3603 }, { "epoch": 1.8144927536231883, "grad_norm": 0.17374229482438772, "learning_rate": 3.468713860054166e-07, "loss": 0.2665, "step": 3604 }, { "epoch": 1.8149968494013864, "grad_norm": 0.17166711837546353, "learning_rate": 3.466157545460895e-07, "loss": 0.2614, "step": 3605 }, { "epoch": 1.815500945179584, "grad_norm": 0.18055980554179216, "learning_rate": 3.463601673371669e-07, "loss": 0.286, "step": 3606 }, { "epoch": 1.816005040957782, "grad_norm": 0.16875286919833732, "learning_rate": 3.4610462445238447e-07, "loss": 0.2754, "step": 3607 }, { "epoch": 1.8165091367359798, "grad_norm": 0.17716433583796581, "learning_rate": 3.4584912596546435e-07, "loss": 0.2741, "step": 3608 }, { "epoch": 1.8170132325141777, "grad_norm": 0.17475847439692482, "learning_rate": 3.4559367195011663e-07, "loss": 0.2709, "step": 3609 }, { "epoch": 1.8175173282923756, "grad_norm": 0.17444775099348855, "learning_rate": 3.4533826248003807e-07, "loss": 0.2818, "step": 3610 }, { "epoch": 1.8180214240705734, "grad_norm": 0.16496716406158368, "learning_rate": 3.4508289762891253e-07, "loss": 0.2597, "step": 3611 }, { "epoch": 1.8185255198487713, "grad_norm": 0.17214388173996828, "learning_rate": 3.448275774704116e-07, "loss": 0.2858, "step": 3612 }, { "epoch": 1.819029615626969, "grad_norm": 0.17019231591147854, "learning_rate": 3.4457230207819317e-07, "loss": 0.2678, "step": 3613 }, { "epoch": 1.819533711405167, "grad_norm": 0.17699657912936148, "learning_rate": 3.443170715259027e-07, "loss": 0.2577, "step": 3614 }, { "epoch": 1.8200378071833647, "grad_norm": 0.1752044669776778, "learning_rate": 3.440618858871728e-07, "loss": 0.2641, "step": 3615 }, { "epoch": 1.8205419029615628, "grad_norm": 0.1796499070557965, "learning_rate": 3.438067452356229e-07, "loss": 0.2818, "step": 3616 }, { "epoch": 1.8210459987397605, "grad_norm": 0.17743300718849356, "learning_rate": 3.435516496448594e-07, "loss": 0.2632, "step": 3617 }, { "epoch": 1.8215500945179586, "grad_norm": 0.17048364525336646, "learning_rate": 3.4329659918847574e-07, "loss": 0.2797, "step": 3618 }, { "epoch": 1.8220541902961562, "grad_norm": 0.17700606194697294, "learning_rate": 3.430415939400526e-07, "loss": 0.2738, "step": 3619 }, { "epoch": 1.822558286074354, "grad_norm": 0.1732841514749814, "learning_rate": 3.4278663397315714e-07, "loss": 0.2767, "step": 3620 }, { "epoch": 1.823062381852552, "grad_norm": 0.1746313075508639, "learning_rate": 3.4253171936134407e-07, "loss": 0.2737, "step": 3621 }, { "epoch": 1.8235664776307499, "grad_norm": 0.16856675923392617, "learning_rate": 3.4227685017815435e-07, "loss": 0.2704, "step": 3622 }, { "epoch": 1.8240705734089477, "grad_norm": 0.17817947743263224, "learning_rate": 3.420220264971164e-07, "loss": 0.2633, "step": 3623 }, { "epoch": 1.8245746691871454, "grad_norm": 0.17390789109467564, "learning_rate": 3.417672483917451e-07, "loss": 0.2631, "step": 3624 }, { "epoch": 1.8250787649653435, "grad_norm": 0.170222280271219, "learning_rate": 3.4151251593554255e-07, "loss": 0.2666, "step": 3625 }, { "epoch": 1.8255828607435411, "grad_norm": 0.18503069700309455, "learning_rate": 3.4125782920199744e-07, "loss": 0.2727, "step": 3626 }, { "epoch": 1.8260869565217392, "grad_norm": 0.17199894183725506, "learning_rate": 3.41003188264585e-07, "loss": 0.268, "step": 3627 }, { "epoch": 1.826591052299937, "grad_norm": 0.17933515312179926, "learning_rate": 3.407485931967681e-07, "loss": 0.2793, "step": 3628 }, { "epoch": 1.827095148078135, "grad_norm": 0.17059652406791323, "learning_rate": 3.404940440719953e-07, "loss": 0.2635, "step": 3629 }, { "epoch": 1.8275992438563327, "grad_norm": 0.17365046482826368, "learning_rate": 3.4023954096370285e-07, "loss": 0.2762, "step": 3630 }, { "epoch": 1.8281033396345305, "grad_norm": 0.17048013344794072, "learning_rate": 3.399850839453131e-07, "loss": 0.2672, "step": 3631 }, { "epoch": 1.8286074354127284, "grad_norm": 0.18254834974101053, "learning_rate": 3.397306730902355e-07, "loss": 0.2855, "step": 3632 }, { "epoch": 1.8291115311909263, "grad_norm": 0.1782057659837957, "learning_rate": 3.394763084718659e-07, "loss": 0.2723, "step": 3633 }, { "epoch": 1.8296156269691242, "grad_norm": 0.1869729195366187, "learning_rate": 3.3922199016358663e-07, "loss": 0.2815, "step": 3634 }, { "epoch": 1.830119722747322, "grad_norm": 0.17300425848362272, "learning_rate": 3.389677182387676e-07, "loss": 0.2766, "step": 3635 }, { "epoch": 1.83062381852552, "grad_norm": 0.17409263469313568, "learning_rate": 3.38713492770764e-07, "loss": 0.2725, "step": 3636 }, { "epoch": 1.8311279143037176, "grad_norm": 0.17168330174108185, "learning_rate": 3.384593138329188e-07, "loss": 0.2636, "step": 3637 }, { "epoch": 1.8316320100819157, "grad_norm": 0.17321147308428417, "learning_rate": 3.382051814985607e-07, "loss": 0.2643, "step": 3638 }, { "epoch": 1.8321361058601133, "grad_norm": 0.17382144255134466, "learning_rate": 3.3795109584100565e-07, "loss": 0.2774, "step": 3639 }, { "epoch": 1.8326402016383114, "grad_norm": 0.17237782646607822, "learning_rate": 3.376970569335557e-07, "loss": 0.2912, "step": 3640 }, { "epoch": 1.833144297416509, "grad_norm": 0.18252171874104944, "learning_rate": 3.374430648494991e-07, "loss": 0.2874, "step": 3641 }, { "epoch": 1.833648393194707, "grad_norm": 0.17047186028552003, "learning_rate": 3.371891196621116e-07, "loss": 0.2752, "step": 3642 }, { "epoch": 1.8341524889729048, "grad_norm": 0.1670392087043219, "learning_rate": 3.3693522144465426e-07, "loss": 0.2659, "step": 3643 }, { "epoch": 1.8346565847511027, "grad_norm": 0.173528591535757, "learning_rate": 3.366813702703757e-07, "loss": 0.2817, "step": 3644 }, { "epoch": 1.8351606805293006, "grad_norm": 0.17390840294297605, "learning_rate": 3.364275662125099e-07, "loss": 0.2736, "step": 3645 }, { "epoch": 1.8356647763074985, "grad_norm": 0.16690990450267892, "learning_rate": 3.361738093442781e-07, "loss": 0.2603, "step": 3646 }, { "epoch": 1.8361688720856963, "grad_norm": 0.17814929644569416, "learning_rate": 3.359200997388874e-07, "loss": 0.2645, "step": 3647 }, { "epoch": 1.836672967863894, "grad_norm": 0.18228489937252948, "learning_rate": 3.3566643746953136e-07, "loss": 0.2847, "step": 3648 }, { "epoch": 1.837177063642092, "grad_norm": 0.17393875067169687, "learning_rate": 3.354128226093903e-07, "loss": 0.2589, "step": 3649 }, { "epoch": 1.8376811594202898, "grad_norm": 0.17147582807039788, "learning_rate": 3.351592552316302e-07, "loss": 0.2617, "step": 3650 }, { "epoch": 1.8381852551984879, "grad_norm": 0.17359008786164207, "learning_rate": 3.34905735409404e-07, "loss": 0.2672, "step": 3651 }, { "epoch": 1.8386893509766855, "grad_norm": 0.17091069458322225, "learning_rate": 3.346522632158502e-07, "loss": 0.2737, "step": 3652 }, { "epoch": 1.8391934467548834, "grad_norm": 0.17036599489519516, "learning_rate": 3.343988387240945e-07, "loss": 0.2655, "step": 3653 }, { "epoch": 1.8396975425330813, "grad_norm": 0.1751804328426936, "learning_rate": 3.3414546200724765e-07, "loss": 0.2778, "step": 3654 }, { "epoch": 1.8402016383112791, "grad_norm": 0.1705462977459914, "learning_rate": 3.338921331384078e-07, "loss": 0.2775, "step": 3655 }, { "epoch": 1.840705734089477, "grad_norm": 0.17625220305705913, "learning_rate": 3.336388521906587e-07, "loss": 0.271, "step": 3656 }, { "epoch": 1.841209829867675, "grad_norm": 0.1711600986441844, "learning_rate": 3.333856192370699e-07, "loss": 0.2773, "step": 3657 }, { "epoch": 1.8417139256458728, "grad_norm": 0.1861777517647817, "learning_rate": 3.3313243435069796e-07, "loss": 0.278, "step": 3658 }, { "epoch": 1.8422180214240704, "grad_norm": 0.17302149508680212, "learning_rate": 3.328792976045849e-07, "loss": 0.2668, "step": 3659 }, { "epoch": 1.8427221172022685, "grad_norm": 0.17199119227595985, "learning_rate": 3.3262620907175935e-07, "loss": 0.2591, "step": 3660 }, { "epoch": 1.8432262129804662, "grad_norm": 0.17344070020135455, "learning_rate": 3.323731688252356e-07, "loss": 0.2716, "step": 3661 }, { "epoch": 1.8437303087586643, "grad_norm": 0.1724615950455862, "learning_rate": 3.3212017693801444e-07, "loss": 0.2701, "step": 3662 }, { "epoch": 1.844234404536862, "grad_norm": 0.1751562140997247, "learning_rate": 3.3186723348308204e-07, "loss": 0.2808, "step": 3663 }, { "epoch": 1.8447385003150598, "grad_norm": 0.18510250956865856, "learning_rate": 3.316143385334113e-07, "loss": 0.2698, "step": 3664 }, { "epoch": 1.8452425960932577, "grad_norm": 0.19779933795030186, "learning_rate": 3.3136149216196094e-07, "loss": 0.2812, "step": 3665 }, { "epoch": 1.8457466918714556, "grad_norm": 0.17539119190444977, "learning_rate": 3.311086944416752e-07, "loss": 0.267, "step": 3666 }, { "epoch": 1.8462507876496534, "grad_norm": 0.1688698268367981, "learning_rate": 3.3085594544548506e-07, "loss": 0.2706, "step": 3667 }, { "epoch": 1.8467548834278513, "grad_norm": 0.16370546456235416, "learning_rate": 3.306032452463067e-07, "loss": 0.2593, "step": 3668 }, { "epoch": 1.8472589792060492, "grad_norm": 0.16664793193380006, "learning_rate": 3.303505939170429e-07, "loss": 0.2747, "step": 3669 }, { "epoch": 1.8477630749842469, "grad_norm": 0.1920173379347818, "learning_rate": 3.3009799153058185e-07, "loss": 0.2808, "step": 3670 }, { "epoch": 1.848267170762445, "grad_norm": 0.17169232742511578, "learning_rate": 3.298454381597976e-07, "loss": 0.2737, "step": 3671 }, { "epoch": 1.8487712665406426, "grad_norm": 0.1768506079726659, "learning_rate": 3.295929338775505e-07, "loss": 0.2821, "step": 3672 }, { "epoch": 1.8492753623188407, "grad_norm": 0.17791535437923783, "learning_rate": 3.2934047875668624e-07, "loss": 0.3001, "step": 3673 }, { "epoch": 1.8497794580970384, "grad_norm": 0.1739360428422386, "learning_rate": 3.290880728700368e-07, "loss": 0.2636, "step": 3674 }, { "epoch": 1.8502835538752362, "grad_norm": 0.17787282666122797, "learning_rate": 3.2883571629041964e-07, "loss": 0.2658, "step": 3675 }, { "epoch": 1.8507876496534341, "grad_norm": 0.17402992624760774, "learning_rate": 3.2858340909063785e-07, "loss": 0.2821, "step": 3676 }, { "epoch": 1.851291745431632, "grad_norm": 0.16863049632313487, "learning_rate": 3.283311513434809e-07, "loss": 0.2762, "step": 3677 }, { "epoch": 1.8517958412098299, "grad_norm": 0.17175916416523554, "learning_rate": 3.280789431217231e-07, "loss": 0.2646, "step": 3678 }, { "epoch": 1.8522999369880278, "grad_norm": 0.1816439994226865, "learning_rate": 3.278267844981254e-07, "loss": 0.2751, "step": 3679 }, { "epoch": 1.8528040327662256, "grad_norm": 0.17768054704900385, "learning_rate": 3.275746755454337e-07, "loss": 0.2934, "step": 3680 }, { "epoch": 1.8533081285444233, "grad_norm": 0.17526569011593024, "learning_rate": 3.2732261633638014e-07, "loss": 0.2783, "step": 3681 }, { "epoch": 1.8538122243226214, "grad_norm": 0.1683831095084952, "learning_rate": 3.2707060694368185e-07, "loss": 0.271, "step": 3682 }, { "epoch": 1.854316320100819, "grad_norm": 0.184280262405193, "learning_rate": 3.268186474400424e-07, "loss": 0.2726, "step": 3683 }, { "epoch": 1.8548204158790171, "grad_norm": 0.17522567840778597, "learning_rate": 3.2656673789815045e-07, "loss": 0.2847, "step": 3684 }, { "epoch": 1.8553245116572148, "grad_norm": 0.17558489027707613, "learning_rate": 3.2631487839067995e-07, "loss": 0.2865, "step": 3685 }, { "epoch": 1.855828607435413, "grad_norm": 0.17002382304061303, "learning_rate": 3.260630689902913e-07, "loss": 0.2491, "step": 3686 }, { "epoch": 1.8563327032136105, "grad_norm": 0.17988503644984635, "learning_rate": 3.2581130976962966e-07, "loss": 0.2584, "step": 3687 }, { "epoch": 1.8568367989918084, "grad_norm": 0.17012660875417476, "learning_rate": 3.255596008013263e-07, "loss": 0.2755, "step": 3688 }, { "epoch": 1.8573408947700063, "grad_norm": 0.17339868792540347, "learning_rate": 3.2530794215799726e-07, "loss": 0.2957, "step": 3689 }, { "epoch": 1.8578449905482042, "grad_norm": 0.17081604800735575, "learning_rate": 3.2505633391224497e-07, "loss": 0.2619, "step": 3690 }, { "epoch": 1.858349086326402, "grad_norm": 0.17873255506104102, "learning_rate": 3.248047761366566e-07, "loss": 0.2494, "step": 3691 }, { "epoch": 1.8588531821045997, "grad_norm": 0.18817722225475367, "learning_rate": 3.2455326890380493e-07, "loss": 0.2767, "step": 3692 }, { "epoch": 1.8593572778827978, "grad_norm": 0.17152115532488543, "learning_rate": 3.243018122862484e-07, "loss": 0.2821, "step": 3693 }, { "epoch": 1.8598613736609955, "grad_norm": 0.16839244253657323, "learning_rate": 3.240504063565307e-07, "loss": 0.2529, "step": 3694 }, { "epoch": 1.8603654694391936, "grad_norm": 0.1730644160338181, "learning_rate": 3.2379905118718075e-07, "loss": 0.2669, "step": 3695 }, { "epoch": 1.8608695652173912, "grad_norm": 0.1760485174615298, "learning_rate": 3.2354774685071297e-07, "loss": 0.2525, "step": 3696 }, { "epoch": 1.8613736609955893, "grad_norm": 0.1724016942572103, "learning_rate": 3.232964934196273e-07, "loss": 0.2646, "step": 3697 }, { "epoch": 1.861877756773787, "grad_norm": 0.1695018921199434, "learning_rate": 3.230452909664084e-07, "loss": 0.2597, "step": 3698 }, { "epoch": 1.8623818525519849, "grad_norm": 0.17585413997700072, "learning_rate": 3.2279413956352713e-07, "loss": 0.2672, "step": 3699 }, { "epoch": 1.8628859483301827, "grad_norm": 0.17301030936638456, "learning_rate": 3.2254303928343886e-07, "loss": 0.2656, "step": 3700 }, { "epoch": 1.8633900441083806, "grad_norm": 0.17993787406188083, "learning_rate": 3.2229199019858426e-07, "loss": 0.2731, "step": 3701 }, { "epoch": 1.8638941398865785, "grad_norm": 0.17406377463523626, "learning_rate": 3.2204099238138986e-07, "loss": 0.2663, "step": 3702 }, { "epoch": 1.8643982356647764, "grad_norm": 0.17543670483670815, "learning_rate": 3.217900459042666e-07, "loss": 0.2744, "step": 3703 }, { "epoch": 1.8649023314429742, "grad_norm": 0.18680717781511091, "learning_rate": 3.2153915083961124e-07, "loss": 0.2768, "step": 3704 }, { "epoch": 1.865406427221172, "grad_norm": 0.1799452415248967, "learning_rate": 3.2128830725980527e-07, "loss": 0.2829, "step": 3705 }, { "epoch": 1.86591052299937, "grad_norm": 0.17633674870398874, "learning_rate": 3.210375152372157e-07, "loss": 0.2732, "step": 3706 }, { "epoch": 1.8664146187775676, "grad_norm": 0.17843417862899802, "learning_rate": 3.207867748441945e-07, "loss": 0.2672, "step": 3707 }, { "epoch": 1.8669187145557657, "grad_norm": 0.1716848365869712, "learning_rate": 3.2053608615307836e-07, "loss": 0.2626, "step": 3708 }, { "epoch": 1.8674228103339634, "grad_norm": 0.17365023761079248, "learning_rate": 3.202854492361897e-07, "loss": 0.2682, "step": 3709 }, { "epoch": 1.8679269061121613, "grad_norm": 0.17180044980161524, "learning_rate": 3.2003486416583566e-07, "loss": 0.2616, "step": 3710 }, { "epoch": 1.8684310018903592, "grad_norm": 0.1722207017048821, "learning_rate": 3.1978433101430857e-07, "loss": 0.2749, "step": 3711 }, { "epoch": 1.868935097668557, "grad_norm": 0.16714750604744708, "learning_rate": 3.1953384985388543e-07, "loss": 0.2548, "step": 3712 }, { "epoch": 1.869439193446755, "grad_norm": 0.17097699024786078, "learning_rate": 3.192834207568288e-07, "loss": 0.2771, "step": 3713 }, { "epoch": 1.8699432892249528, "grad_norm": 0.18094945147962893, "learning_rate": 3.1903304379538585e-07, "loss": 0.2816, "step": 3714 }, { "epoch": 1.8704473850031507, "grad_norm": 0.188619079337791, "learning_rate": 3.1878271904178855e-07, "loss": 0.2558, "step": 3715 }, { "epoch": 1.8709514807813483, "grad_norm": 0.17733938393348464, "learning_rate": 3.1853244656825446e-07, "loss": 0.2689, "step": 3716 }, { "epoch": 1.8714555765595464, "grad_norm": 0.17881276774598476, "learning_rate": 3.1828222644698515e-07, "loss": 0.2676, "step": 3717 }, { "epoch": 1.871959672337744, "grad_norm": 0.18577840959577496, "learning_rate": 3.1803205875016806e-07, "loss": 0.2862, "step": 3718 }, { "epoch": 1.8724637681159422, "grad_norm": 0.1708409232652426, "learning_rate": 3.1778194354997456e-07, "loss": 0.2637, "step": 3719 }, { "epoch": 1.8729678638941398, "grad_norm": 0.17019804041538505, "learning_rate": 3.1753188091856176e-07, "loss": 0.2607, "step": 3720 }, { "epoch": 1.8734719596723377, "grad_norm": 0.17419845830941813, "learning_rate": 3.172818709280709e-07, "loss": 0.2902, "step": 3721 }, { "epoch": 1.8739760554505356, "grad_norm": 0.16739241329355717, "learning_rate": 3.1703191365062843e-07, "loss": 0.2526, "step": 3722 }, { "epoch": 1.8744801512287335, "grad_norm": 0.1736290579522879, "learning_rate": 3.167820091583455e-07, "loss": 0.2879, "step": 3723 }, { "epoch": 1.8749842470069313, "grad_norm": 0.1829860060599394, "learning_rate": 3.1653215752331784e-07, "loss": 0.2577, "step": 3724 }, { "epoch": 1.8754883427851292, "grad_norm": 0.17208808352446284, "learning_rate": 3.1628235881762624e-07, "loss": 0.2624, "step": 3725 }, { "epoch": 1.875992438563327, "grad_norm": 0.17226579687443258, "learning_rate": 3.160326131133361e-07, "loss": 0.2724, "step": 3726 }, { "epoch": 1.8764965343415247, "grad_norm": 0.1758528503018195, "learning_rate": 3.1578292048249743e-07, "loss": 0.2635, "step": 3727 }, { "epoch": 1.8770006301197228, "grad_norm": 0.17602460233894715, "learning_rate": 3.1553328099714493e-07, "loss": 0.2698, "step": 3728 }, { "epoch": 1.8775047258979205, "grad_norm": 0.17916344278800375, "learning_rate": 3.152836947292984e-07, "loss": 0.2717, "step": 3729 }, { "epoch": 1.8780088216761186, "grad_norm": 0.17570265601183882, "learning_rate": 3.1503416175096156e-07, "loss": 0.2794, "step": 3730 }, { "epoch": 1.8785129174543163, "grad_norm": 0.1730642589307263, "learning_rate": 3.147846821341231e-07, "loss": 0.2521, "step": 3731 }, { "epoch": 1.8790170132325141, "grad_norm": 0.1773966697187365, "learning_rate": 3.145352559507567e-07, "loss": 0.2722, "step": 3732 }, { "epoch": 1.879521109010712, "grad_norm": 0.17094044431659802, "learning_rate": 3.1428588327281993e-07, "loss": 0.2758, "step": 3733 }, { "epoch": 1.8800252047889099, "grad_norm": 0.1851002728085005, "learning_rate": 3.140365641722555e-07, "loss": 0.2734, "step": 3734 }, { "epoch": 1.8805293005671078, "grad_norm": 0.18808599356233432, "learning_rate": 3.137872987209902e-07, "loss": 0.2826, "step": 3735 }, { "epoch": 1.8810333963453056, "grad_norm": 0.16938800737251228, "learning_rate": 3.1353808699093583e-07, "loss": 0.2684, "step": 3736 }, { "epoch": 1.8815374921235035, "grad_norm": 0.17470028516437633, "learning_rate": 3.132889290539883e-07, "loss": 0.2722, "step": 3737 }, { "epoch": 1.8820415879017012, "grad_norm": 0.17323545442596444, "learning_rate": 3.13039824982028e-07, "loss": 0.2674, "step": 3738 }, { "epoch": 1.8825456836798993, "grad_norm": 0.168222975337303, "learning_rate": 3.127907748469201e-07, "loss": 0.2596, "step": 3739 }, { "epoch": 1.883049779458097, "grad_norm": 0.18129268141555105, "learning_rate": 3.12541778720514e-07, "loss": 0.27, "step": 3740 }, { "epoch": 1.883553875236295, "grad_norm": 0.1747582775376678, "learning_rate": 3.122928366746434e-07, "loss": 0.263, "step": 3741 }, { "epoch": 1.8840579710144927, "grad_norm": 0.17513777293951543, "learning_rate": 3.1204394878112665e-07, "loss": 0.2868, "step": 3742 }, { "epoch": 1.8845620667926906, "grad_norm": 0.16905872732868277, "learning_rate": 3.1179511511176646e-07, "loss": 0.2661, "step": 3743 }, { "epoch": 1.8850661625708884, "grad_norm": 0.1711316990404876, "learning_rate": 3.1154633573834973e-07, "loss": 0.2722, "step": 3744 }, { "epoch": 1.8855702583490863, "grad_norm": 0.1739717737065482, "learning_rate": 3.112976107326475e-07, "loss": 0.268, "step": 3745 }, { "epoch": 1.8860743541272842, "grad_norm": 0.17944928774305824, "learning_rate": 3.110489401664158e-07, "loss": 0.2812, "step": 3746 }, { "epoch": 1.886578449905482, "grad_norm": 0.1762596235261297, "learning_rate": 3.108003241113942e-07, "loss": 0.2604, "step": 3747 }, { "epoch": 1.88708254568368, "grad_norm": 0.17582649188250596, "learning_rate": 3.1055176263930725e-07, "loss": 0.27, "step": 3748 }, { "epoch": 1.8875866414618776, "grad_norm": 0.18349586560030343, "learning_rate": 3.10303255821863e-07, "loss": 0.2731, "step": 3749 }, { "epoch": 1.8880907372400757, "grad_norm": 0.18096878339273567, "learning_rate": 3.100548037307546e-07, "loss": 0.2817, "step": 3750 }, { "epoch": 1.8885948330182734, "grad_norm": 0.17078057550904882, "learning_rate": 3.0980640643765867e-07, "loss": 0.2677, "step": 3751 }, { "epoch": 1.8890989287964715, "grad_norm": 0.16893384397010494, "learning_rate": 3.09558064014236e-07, "loss": 0.2664, "step": 3752 }, { "epoch": 1.889603024574669, "grad_norm": 0.18217985296544154, "learning_rate": 3.093097765321324e-07, "loss": 0.2688, "step": 3753 }, { "epoch": 1.8901071203528672, "grad_norm": 0.17031724602389556, "learning_rate": 3.0906154406297677e-07, "loss": 0.2675, "step": 3754 }, { "epoch": 1.8906112161310649, "grad_norm": 0.17612084822756136, "learning_rate": 3.0881336667838313e-07, "loss": 0.259, "step": 3755 }, { "epoch": 1.8911153119092627, "grad_norm": 0.1661742309383176, "learning_rate": 3.085652444499488e-07, "loss": 0.2555, "step": 3756 }, { "epoch": 1.8916194076874606, "grad_norm": 0.17479975552258276, "learning_rate": 3.0831717744925556e-07, "loss": 0.2719, "step": 3757 }, { "epoch": 1.8921235034656585, "grad_norm": 0.1723634999690176, "learning_rate": 3.080691657478691e-07, "loss": 0.2728, "step": 3758 }, { "epoch": 1.8926275992438564, "grad_norm": 0.17468461669614443, "learning_rate": 3.0782120941733954e-07, "loss": 0.2765, "step": 3759 }, { "epoch": 1.893131695022054, "grad_norm": 0.1710752190489546, "learning_rate": 3.075733085292006e-07, "loss": 0.2687, "step": 3760 }, { "epoch": 1.8936357908002521, "grad_norm": 0.16637775831353452, "learning_rate": 3.0732546315496986e-07, "loss": 0.2613, "step": 3761 }, { "epoch": 1.8941398865784498, "grad_norm": 0.18290536724761589, "learning_rate": 3.070776733661497e-07, "loss": 0.2847, "step": 3762 }, { "epoch": 1.8946439823566479, "grad_norm": 0.17059102780201477, "learning_rate": 3.068299392342255e-07, "loss": 0.2651, "step": 3763 }, { "epoch": 1.8951480781348455, "grad_norm": 0.17464195983436098, "learning_rate": 3.065822608306674e-07, "loss": 0.2901, "step": 3764 }, { "epoch": 1.8956521739130436, "grad_norm": 0.16589700319021694, "learning_rate": 3.063346382269286e-07, "loss": 0.2665, "step": 3765 }, { "epoch": 1.8961562696912413, "grad_norm": 0.17656288621526112, "learning_rate": 3.060870714944473e-07, "loss": 0.2642, "step": 3766 }, { "epoch": 1.8966603654694392, "grad_norm": 0.18513022931135426, "learning_rate": 3.058395607046446e-07, "loss": 0.2714, "step": 3767 }, { "epoch": 1.897164461247637, "grad_norm": 0.17250542864882773, "learning_rate": 3.0559210592892567e-07, "loss": 0.2758, "step": 3768 }, { "epoch": 1.897668557025835, "grad_norm": 0.17243208247681027, "learning_rate": 3.053447072386801e-07, "loss": 0.2769, "step": 3769 }, { "epoch": 1.8981726528040328, "grad_norm": 0.17667298757324432, "learning_rate": 3.050973647052805e-07, "loss": 0.2703, "step": 3770 }, { "epoch": 1.8986767485822307, "grad_norm": 0.16775068473994517, "learning_rate": 3.0485007840008394e-07, "loss": 0.2497, "step": 3771 }, { "epoch": 1.8991808443604286, "grad_norm": 0.17502025512014624, "learning_rate": 3.046028483944308e-07, "loss": 0.2672, "step": 3772 }, { "epoch": 1.8996849401386262, "grad_norm": 0.17430596589676, "learning_rate": 3.043556747596456e-07, "loss": 0.2753, "step": 3773 }, { "epoch": 1.9001890359168243, "grad_norm": 0.17212088868648046, "learning_rate": 3.0410855756703614e-07, "loss": 0.2712, "step": 3774 }, { "epoch": 1.900693131695022, "grad_norm": 0.17745212599865196, "learning_rate": 3.0386149688789434e-07, "loss": 0.2847, "step": 3775 }, { "epoch": 1.90119722747322, "grad_norm": 0.17153573115828954, "learning_rate": 3.036144927934958e-07, "loss": 0.276, "step": 3776 }, { "epoch": 1.9017013232514177, "grad_norm": 0.1780316189262351, "learning_rate": 3.033675453550994e-07, "loss": 0.2849, "step": 3777 }, { "epoch": 1.9022054190296156, "grad_norm": 0.1832700985266359, "learning_rate": 3.031206546439482e-07, "loss": 0.2594, "step": 3778 }, { "epoch": 1.9027095148078135, "grad_norm": 0.1699394230622764, "learning_rate": 3.0287382073126837e-07, "loss": 0.2766, "step": 3779 }, { "epoch": 1.9032136105860114, "grad_norm": 0.17706015901322072, "learning_rate": 3.0262704368827036e-07, "loss": 0.2752, "step": 3780 }, { "epoch": 1.9037177063642092, "grad_norm": 0.1775906710352642, "learning_rate": 3.0238032358614753e-07, "loss": 0.2721, "step": 3781 }, { "epoch": 1.9037177063642092, "eval_loss": 0.3058184087276459, "eval_runtime": 17.9595, "eval_samples_per_second": 47.607, "eval_steps_per_second": 1.002, "step": 3781 }, { "epoch": 1.904221802142407, "grad_norm": 0.18004522794254016, "learning_rate": 3.0213366049607703e-07, "loss": 0.2564, "step": 3782 }, { "epoch": 1.904725897920605, "grad_norm": 0.17630616450078446, "learning_rate": 3.0188705448921994e-07, "loss": 0.2788, "step": 3783 }, { "epoch": 1.9052299936988026, "grad_norm": 0.17486538442100466, "learning_rate": 3.0164050563672004e-07, "loss": 0.2699, "step": 3784 }, { "epoch": 1.9057340894770007, "grad_norm": 0.17611096571524854, "learning_rate": 3.0139401400970586e-07, "loss": 0.2754, "step": 3785 }, { "epoch": 1.9062381852551984, "grad_norm": 0.1737782779159714, "learning_rate": 3.0114757967928816e-07, "loss": 0.2657, "step": 3786 }, { "epoch": 1.9067422810333965, "grad_norm": 0.1739385161722817, "learning_rate": 3.0090120271656194e-07, "loss": 0.2653, "step": 3787 }, { "epoch": 1.9072463768115941, "grad_norm": 0.17747427811726257, "learning_rate": 3.0065488319260535e-07, "loss": 0.2728, "step": 3788 }, { "epoch": 1.907750472589792, "grad_norm": 0.17131014859330937, "learning_rate": 3.004086211784802e-07, "loss": 0.2675, "step": 3789 }, { "epoch": 1.90825456836799, "grad_norm": 0.17148523515796457, "learning_rate": 3.001624167452315e-07, "loss": 0.2672, "step": 3790 }, { "epoch": 1.9087586641461878, "grad_norm": 0.16882516072988113, "learning_rate": 2.999162699638873e-07, "loss": 0.2637, "step": 3791 }, { "epoch": 1.9092627599243857, "grad_norm": 0.1786046049841935, "learning_rate": 2.996701809054601e-07, "loss": 0.2848, "step": 3792 }, { "epoch": 1.9097668557025835, "grad_norm": 0.17145262177042117, "learning_rate": 2.994241496409444e-07, "loss": 0.2712, "step": 3793 }, { "epoch": 1.9102709514807814, "grad_norm": 0.171516448246734, "learning_rate": 2.991781762413194e-07, "loss": 0.2847, "step": 3794 }, { "epoch": 1.910775047258979, "grad_norm": 0.17763020432437332, "learning_rate": 2.989322607775462e-07, "loss": 0.2837, "step": 3795 }, { "epoch": 1.9112791430371772, "grad_norm": 0.1746370724053624, "learning_rate": 2.986864033205704e-07, "loss": 0.2727, "step": 3796 }, { "epoch": 1.9117832388153748, "grad_norm": 0.185899284676683, "learning_rate": 2.984406039413202e-07, "loss": 0.2887, "step": 3797 }, { "epoch": 1.912287334593573, "grad_norm": 0.16862868846992204, "learning_rate": 2.98194862710707e-07, "loss": 0.2645, "step": 3798 }, { "epoch": 1.9127914303717706, "grad_norm": 0.17734208433370674, "learning_rate": 2.9794917969962595e-07, "loss": 0.2769, "step": 3799 }, { "epoch": 1.9132955261499685, "grad_norm": 0.1646715304601581, "learning_rate": 2.977035549789548e-07, "loss": 0.2596, "step": 3800 }, { "epoch": 1.9137996219281663, "grad_norm": 0.1718225932856344, "learning_rate": 2.9745798861955497e-07, "loss": 0.281, "step": 3801 }, { "epoch": 1.9143037177063642, "grad_norm": 0.17618806208047236, "learning_rate": 2.972124806922707e-07, "loss": 0.2662, "step": 3802 }, { "epoch": 1.914807813484562, "grad_norm": 0.1719512615295124, "learning_rate": 2.9696703126792967e-07, "loss": 0.2669, "step": 3803 }, { "epoch": 1.91531190926276, "grad_norm": 0.18136678946666576, "learning_rate": 2.967216404173423e-07, "loss": 0.2602, "step": 3804 }, { "epoch": 1.9158160050409578, "grad_norm": 0.1746419414840373, "learning_rate": 2.9647630821130234e-07, "loss": 0.2723, "step": 3805 }, { "epoch": 1.9163201008191555, "grad_norm": 0.17198386237681593, "learning_rate": 2.9623103472058685e-07, "loss": 0.276, "step": 3806 }, { "epoch": 1.9168241965973536, "grad_norm": 0.17409294867594544, "learning_rate": 2.959858200159554e-07, "loss": 0.2668, "step": 3807 }, { "epoch": 1.9173282923755512, "grad_norm": 0.16517572018109172, "learning_rate": 2.9574066416815123e-07, "loss": 0.2668, "step": 3808 }, { "epoch": 1.9178323881537493, "grad_norm": 0.17327174886971222, "learning_rate": 2.9549556724789995e-07, "loss": 0.2654, "step": 3809 }, { "epoch": 1.918336483931947, "grad_norm": 0.17123477242224328, "learning_rate": 2.952505293259108e-07, "loss": 0.2666, "step": 3810 }, { "epoch": 1.9188405797101449, "grad_norm": 0.1711720330959814, "learning_rate": 2.950055504728757e-07, "loss": 0.275, "step": 3811 }, { "epoch": 1.9193446754883428, "grad_norm": 0.17427388711209385, "learning_rate": 2.9476063075946915e-07, "loss": 0.2762, "step": 3812 }, { "epoch": 1.9198487712665406, "grad_norm": 0.17268175591003512, "learning_rate": 2.945157702563494e-07, "loss": 0.2744, "step": 3813 }, { "epoch": 1.9203528670447385, "grad_norm": 0.16809166747283374, "learning_rate": 2.9427096903415694e-07, "loss": 0.2778, "step": 3814 }, { "epoch": 1.9208569628229364, "grad_norm": 0.17330157587108844, "learning_rate": 2.940262271635156e-07, "loss": 0.2768, "step": 3815 }, { "epoch": 1.9213610586011343, "grad_norm": 0.1738154543130853, "learning_rate": 2.9378154471503156e-07, "loss": 0.2878, "step": 3816 }, { "epoch": 1.921865154379332, "grad_norm": 0.18288271259832445, "learning_rate": 2.9353692175929475e-07, "loss": 0.2671, "step": 3817 }, { "epoch": 1.92236925015753, "grad_norm": 0.18141220460600882, "learning_rate": 2.9329235836687684e-07, "loss": 0.2806, "step": 3818 }, { "epoch": 1.9228733459357277, "grad_norm": 0.18790688920159956, "learning_rate": 2.930478546083331e-07, "loss": 0.2681, "step": 3819 }, { "epoch": 1.9233774417139258, "grad_norm": 0.16964152403213711, "learning_rate": 2.9280341055420133e-07, "loss": 0.2664, "step": 3820 }, { "epoch": 1.9238815374921234, "grad_norm": 0.1671671212800262, "learning_rate": 2.9255902627500204e-07, "loss": 0.2641, "step": 3821 }, { "epoch": 1.9243856332703215, "grad_norm": 0.17197335612344677, "learning_rate": 2.923147018412387e-07, "loss": 0.2696, "step": 3822 }, { "epoch": 1.9248897290485192, "grad_norm": 0.17243819294137702, "learning_rate": 2.920704373233972e-07, "loss": 0.2754, "step": 3823 }, { "epoch": 1.925393824826717, "grad_norm": 0.17043519865324416, "learning_rate": 2.918262327919466e-07, "loss": 0.2715, "step": 3824 }, { "epoch": 1.925897920604915, "grad_norm": 0.1722409742574077, "learning_rate": 2.915820883173383e-07, "loss": 0.2578, "step": 3825 }, { "epoch": 1.9264020163831128, "grad_norm": 0.1740557480804691, "learning_rate": 2.9133800397000627e-07, "loss": 0.2765, "step": 3826 }, { "epoch": 1.9269061121613107, "grad_norm": 0.1680406260361292, "learning_rate": 2.910939798203677e-07, "loss": 0.2795, "step": 3827 }, { "epoch": 1.9274102079395083, "grad_norm": 0.17149022418835733, "learning_rate": 2.9085001593882187e-07, "loss": 0.2604, "step": 3828 }, { "epoch": 1.9279143037177064, "grad_norm": 0.17010503345411235, "learning_rate": 2.9060611239575085e-07, "loss": 0.2827, "step": 3829 }, { "epoch": 1.928418399495904, "grad_norm": 0.1749493958223586, "learning_rate": 2.9036226926151897e-07, "loss": 0.2758, "step": 3830 }, { "epoch": 1.9289224952741022, "grad_norm": 0.17008445038688108, "learning_rate": 2.90118486606474e-07, "loss": 0.2645, "step": 3831 }, { "epoch": 1.9294265910522999, "grad_norm": 0.17652489240339891, "learning_rate": 2.898747645009454e-07, "loss": 0.2746, "step": 3832 }, { "epoch": 1.929930686830498, "grad_norm": 0.22700479625148187, "learning_rate": 2.896311030152457e-07, "loss": 0.2753, "step": 3833 }, { "epoch": 1.9304347826086956, "grad_norm": 0.17427450545863746, "learning_rate": 2.8938750221966965e-07, "loss": 0.2703, "step": 3834 }, { "epoch": 1.9309388783868935, "grad_norm": 0.17151048823577203, "learning_rate": 2.891439621844943e-07, "loss": 0.2595, "step": 3835 }, { "epoch": 1.9314429741650914, "grad_norm": 0.17149156103858243, "learning_rate": 2.8890048297997985e-07, "loss": 0.252, "step": 3836 }, { "epoch": 1.9319470699432892, "grad_norm": 0.16817818345200922, "learning_rate": 2.886570646763682e-07, "loss": 0.2643, "step": 3837 }, { "epoch": 1.9324511657214871, "grad_norm": 0.1694609499296146, "learning_rate": 2.8841370734388444e-07, "loss": 0.2774, "step": 3838 }, { "epoch": 1.9329552614996848, "grad_norm": 0.18663748392100837, "learning_rate": 2.8817041105273513e-07, "loss": 0.2794, "step": 3839 }, { "epoch": 1.9334593572778829, "grad_norm": 0.17290683655437977, "learning_rate": 2.8792717587311027e-07, "loss": 0.2751, "step": 3840 }, { "epoch": 1.9339634530560805, "grad_norm": 0.17362487357823306, "learning_rate": 2.876840018751814e-07, "loss": 0.263, "step": 3841 }, { "epoch": 1.9344675488342786, "grad_norm": 0.18573004563502468, "learning_rate": 2.8744088912910257e-07, "loss": 0.2749, "step": 3842 }, { "epoch": 1.9349716446124763, "grad_norm": 0.16915731492502192, "learning_rate": 2.8719783770501074e-07, "loss": 0.2589, "step": 3843 }, { "epoch": 1.9354757403906744, "grad_norm": 0.17885497820565566, "learning_rate": 2.8695484767302423e-07, "loss": 0.2829, "step": 3844 }, { "epoch": 1.935979836168872, "grad_norm": 0.16662065368985032, "learning_rate": 2.8671191910324466e-07, "loss": 0.2675, "step": 3845 }, { "epoch": 1.93648393194707, "grad_norm": 0.18153383756126074, "learning_rate": 2.86469052065755e-07, "loss": 0.2636, "step": 3846 }, { "epoch": 1.9369880277252678, "grad_norm": 0.17318438090821503, "learning_rate": 2.8622624663062125e-07, "loss": 0.2942, "step": 3847 }, { "epoch": 1.9374921235034657, "grad_norm": 0.1733848907134762, "learning_rate": 2.859835028678911e-07, "loss": 0.2782, "step": 3848 }, { "epoch": 1.9379962192816635, "grad_norm": 0.16974026130832987, "learning_rate": 2.8574082084759434e-07, "loss": 0.2549, "step": 3849 }, { "epoch": 1.9385003150598614, "grad_norm": 0.17091600219366612, "learning_rate": 2.854982006397438e-07, "loss": 0.2838, "step": 3850 }, { "epoch": 1.9390044108380593, "grad_norm": 0.17588856131135513, "learning_rate": 2.852556423143333e-07, "loss": 0.2667, "step": 3851 }, { "epoch": 1.939508506616257, "grad_norm": 0.16743774165838593, "learning_rate": 2.8501314594133996e-07, "loss": 0.2672, "step": 3852 }, { "epoch": 1.940012602394455, "grad_norm": 0.17244186575284368, "learning_rate": 2.8477071159072206e-07, "loss": 0.2764, "step": 3853 }, { "epoch": 1.9405166981726527, "grad_norm": 0.17790819827202037, "learning_rate": 2.845283393324208e-07, "loss": 0.2717, "step": 3854 }, { "epoch": 1.9410207939508508, "grad_norm": 0.17254105972174483, "learning_rate": 2.8428602923635894e-07, "loss": 0.28, "step": 3855 }, { "epoch": 1.9415248897290485, "grad_norm": 0.16993890956105945, "learning_rate": 2.84043781372441e-07, "loss": 0.2677, "step": 3856 }, { "epoch": 1.9420289855072463, "grad_norm": 0.17692085437275623, "learning_rate": 2.838015958105547e-07, "loss": 0.2878, "step": 3857 }, { "epoch": 1.9425330812854442, "grad_norm": 0.175016522473398, "learning_rate": 2.8355947262056865e-07, "loss": 0.2736, "step": 3858 }, { "epoch": 1.943037177063642, "grad_norm": 0.1845066084934637, "learning_rate": 2.833174118723338e-07, "loss": 0.2712, "step": 3859 }, { "epoch": 1.94354127284184, "grad_norm": 0.17292158533010546, "learning_rate": 2.8307541363568356e-07, "loss": 0.2648, "step": 3860 }, { "epoch": 1.9440453686200379, "grad_norm": 0.17403302200728327, "learning_rate": 2.8283347798043265e-07, "loss": 0.2601, "step": 3861 }, { "epoch": 1.9445494643982357, "grad_norm": 0.17209410998540525, "learning_rate": 2.825916049763779e-07, "loss": 0.2885, "step": 3862 }, { "epoch": 1.9450535601764334, "grad_norm": 0.1730295963728701, "learning_rate": 2.8234979469329856e-07, "loss": 0.2773, "step": 3863 }, { "epoch": 1.9455576559546315, "grad_norm": 0.17409432528563223, "learning_rate": 2.8210804720095516e-07, "loss": 0.2762, "step": 3864 }, { "epoch": 1.9460617517328291, "grad_norm": 0.18150338712543956, "learning_rate": 2.818663625690902e-07, "loss": 0.2729, "step": 3865 }, { "epoch": 1.9465658475110272, "grad_norm": 0.17569994960621013, "learning_rate": 2.8162474086742854e-07, "loss": 0.2756, "step": 3866 }, { "epoch": 1.947069943289225, "grad_norm": 0.17879985777229393, "learning_rate": 2.813831821656762e-07, "loss": 0.2919, "step": 3867 }, { "epoch": 1.9475740390674228, "grad_norm": 0.17230743106972518, "learning_rate": 2.811416865335217e-07, "loss": 0.276, "step": 3868 }, { "epoch": 1.9480781348456206, "grad_norm": 0.17984069998116, "learning_rate": 2.8090025404063477e-07, "loss": 0.2785, "step": 3869 }, { "epoch": 1.9485822306238185, "grad_norm": 0.1744539834333857, "learning_rate": 2.8065888475666745e-07, "loss": 0.2884, "step": 3870 }, { "epoch": 1.9490863264020164, "grad_norm": 0.18027869943306707, "learning_rate": 2.80417578751253e-07, "loss": 0.278, "step": 3871 }, { "epoch": 1.9495904221802143, "grad_norm": 0.17865984324678172, "learning_rate": 2.801763360940068e-07, "loss": 0.2602, "step": 3872 }, { "epoch": 1.9500945179584122, "grad_norm": 0.17671158591958067, "learning_rate": 2.7993515685452613e-07, "loss": 0.2779, "step": 3873 }, { "epoch": 1.9505986137366098, "grad_norm": 0.1762952439624626, "learning_rate": 2.796940411023892e-07, "loss": 0.2593, "step": 3874 }, { "epoch": 1.951102709514808, "grad_norm": 0.1823791079352628, "learning_rate": 2.794529889071569e-07, "loss": 0.2758, "step": 3875 }, { "epoch": 1.9516068052930056, "grad_norm": 0.178230741517296, "learning_rate": 2.792120003383709e-07, "loss": 0.2628, "step": 3876 }, { "epoch": 1.9521109010712037, "grad_norm": 0.17483875950866737, "learning_rate": 2.7897107546555525e-07, "loss": 0.2742, "step": 3877 }, { "epoch": 1.9526149968494013, "grad_norm": 0.17465758090677505, "learning_rate": 2.787302143582152e-07, "loss": 0.276, "step": 3878 }, { "epoch": 1.9531190926275992, "grad_norm": 0.17129898524450557, "learning_rate": 2.784894170858373e-07, "loss": 0.2652, "step": 3879 }, { "epoch": 1.953623188405797, "grad_norm": 0.1658101810588016, "learning_rate": 2.782486837178907e-07, "loss": 0.263, "step": 3880 }, { "epoch": 1.954127284183995, "grad_norm": 0.1755462609743805, "learning_rate": 2.780080143238249e-07, "loss": 0.2587, "step": 3881 }, { "epoch": 1.9546313799621928, "grad_norm": 0.17509315570998626, "learning_rate": 2.7776740897307203e-07, "loss": 0.2666, "step": 3882 }, { "epoch": 1.9551354757403907, "grad_norm": 0.16889327157699743, "learning_rate": 2.7752686773504486e-07, "loss": 0.2704, "step": 3883 }, { "epoch": 1.9556395715185886, "grad_norm": 0.17329963412035843, "learning_rate": 2.7728639067913826e-07, "loss": 0.2765, "step": 3884 }, { "epoch": 1.9561436672967862, "grad_norm": 0.17191012346840492, "learning_rate": 2.7704597787472825e-07, "loss": 0.2689, "step": 3885 }, { "epoch": 1.9566477630749843, "grad_norm": 0.16839979801264043, "learning_rate": 2.7680562939117265e-07, "loss": 0.2665, "step": 3886 }, { "epoch": 1.957151858853182, "grad_norm": 0.17446755444392845, "learning_rate": 2.765653452978103e-07, "loss": 0.2837, "step": 3887 }, { "epoch": 1.95765595463138, "grad_norm": 0.1726644996987495, "learning_rate": 2.7632512566396185e-07, "loss": 0.2825, "step": 3888 }, { "epoch": 1.9581600504095777, "grad_norm": 0.18240867556012677, "learning_rate": 2.7608497055892877e-07, "loss": 0.2799, "step": 3889 }, { "epoch": 1.9586641461877756, "grad_norm": 0.1852368217976518, "learning_rate": 2.758448800519948e-07, "loss": 0.2774, "step": 3890 }, { "epoch": 1.9591682419659735, "grad_norm": 0.17519851532629838, "learning_rate": 2.756048542124244e-07, "loss": 0.2664, "step": 3891 }, { "epoch": 1.9596723377441714, "grad_norm": 0.17550478596249003, "learning_rate": 2.7536489310946325e-07, "loss": 0.2765, "step": 3892 }, { "epoch": 1.9601764335223693, "grad_norm": 0.17986773014699392, "learning_rate": 2.751249968123391e-07, "loss": 0.2741, "step": 3893 }, { "epoch": 1.9606805293005671, "grad_norm": 0.18286596209791772, "learning_rate": 2.748851653902604e-07, "loss": 0.274, "step": 3894 }, { "epoch": 1.961184625078765, "grad_norm": 0.16845294749892417, "learning_rate": 2.7464539891241677e-07, "loss": 0.2706, "step": 3895 }, { "epoch": 1.9616887208569627, "grad_norm": 0.1667625234228712, "learning_rate": 2.744056974479798e-07, "loss": 0.2689, "step": 3896 }, { "epoch": 1.9621928166351608, "grad_norm": 0.17330081052295093, "learning_rate": 2.741660610661013e-07, "loss": 0.271, "step": 3897 }, { "epoch": 1.9626969124133584, "grad_norm": 0.16872657786163298, "learning_rate": 2.7392648983591547e-07, "loss": 0.2622, "step": 3898 }, { "epoch": 1.9632010081915565, "grad_norm": 0.17255810013786174, "learning_rate": 2.736869838265368e-07, "loss": 0.2772, "step": 3899 }, { "epoch": 1.9637051039697542, "grad_norm": 0.18979660370274565, "learning_rate": 2.7344754310706135e-07, "loss": 0.2774, "step": 3900 }, { "epoch": 1.9642091997479523, "grad_norm": 0.2005994736740754, "learning_rate": 2.732081677465664e-07, "loss": 0.2558, "step": 3901 }, { "epoch": 1.96471329552615, "grad_norm": 0.16769942763481963, "learning_rate": 2.7296885781410997e-07, "loss": 0.2729, "step": 3902 }, { "epoch": 1.9652173913043478, "grad_norm": 0.17196075848567396, "learning_rate": 2.7272961337873184e-07, "loss": 0.2597, "step": 3903 }, { "epoch": 1.9657214870825457, "grad_norm": 0.17574737438377164, "learning_rate": 2.724904345094522e-07, "loss": 0.2787, "step": 3904 }, { "epoch": 1.9662255828607436, "grad_norm": 0.1801545153423894, "learning_rate": 2.7225132127527305e-07, "loss": 0.2665, "step": 3905 }, { "epoch": 1.9667296786389414, "grad_norm": 0.18767570216174043, "learning_rate": 2.720122737451767e-07, "loss": 0.2963, "step": 3906 }, { "epoch": 1.967233774417139, "grad_norm": 0.17392592966729542, "learning_rate": 2.717732919881273e-07, "loss": 0.2664, "step": 3907 }, { "epoch": 1.9677378701953372, "grad_norm": 0.18336201230753177, "learning_rate": 2.715343760730693e-07, "loss": 0.2722, "step": 3908 }, { "epoch": 1.9682419659735348, "grad_norm": 0.17088548349889449, "learning_rate": 2.7129552606892834e-07, "loss": 0.2686, "step": 3909 }, { "epoch": 1.968746061751733, "grad_norm": 0.17989017966945003, "learning_rate": 2.710567420446116e-07, "loss": 0.2661, "step": 3910 }, { "epoch": 1.9692501575299306, "grad_norm": 0.17148613832259865, "learning_rate": 2.708180240690063e-07, "loss": 0.2682, "step": 3911 }, { "epoch": 1.9697542533081287, "grad_norm": 0.16861185373135207, "learning_rate": 2.705793722109816e-07, "loss": 0.2704, "step": 3912 }, { "epoch": 1.9702583490863264, "grad_norm": 0.19696241693330502, "learning_rate": 2.7034078653938663e-07, "loss": 0.2718, "step": 3913 }, { "epoch": 1.9707624448645242, "grad_norm": 0.18050920428069256, "learning_rate": 2.7010226712305227e-07, "loss": 0.2849, "step": 3914 }, { "epoch": 1.971266540642722, "grad_norm": 0.17461718966022174, "learning_rate": 2.698638140307897e-07, "loss": 0.2709, "step": 3915 }, { "epoch": 1.97177063642092, "grad_norm": 0.16987029931279596, "learning_rate": 2.6962542733139094e-07, "loss": 0.2749, "step": 3916 }, { "epoch": 1.9722747321991179, "grad_norm": 0.1780473567269595, "learning_rate": 2.6938710709362953e-07, "loss": 0.2801, "step": 3917 }, { "epoch": 1.9727788279773157, "grad_norm": 0.18378204778035598, "learning_rate": 2.691488533862589e-07, "loss": 0.2737, "step": 3918 }, { "epoch": 1.9732829237555136, "grad_norm": 0.17887945423278645, "learning_rate": 2.689106662780143e-07, "loss": 0.2733, "step": 3919 }, { "epoch": 1.9737870195337113, "grad_norm": 0.16923234306379617, "learning_rate": 2.686725458376109e-07, "loss": 0.2718, "step": 3920 }, { "epoch": 1.9742911153119094, "grad_norm": 0.17182101067457767, "learning_rate": 2.684344921337449e-07, "loss": 0.272, "step": 3921 }, { "epoch": 1.974795211090107, "grad_norm": 0.1811448482903096, "learning_rate": 2.681965052350935e-07, "loss": 0.2757, "step": 3922 }, { "epoch": 1.9752993068683051, "grad_norm": 0.17076536993145938, "learning_rate": 2.6795858521031455e-07, "loss": 0.272, "step": 3923 }, { "epoch": 1.9758034026465028, "grad_norm": 0.17273807428844049, "learning_rate": 2.6772073212804626e-07, "loss": 0.2623, "step": 3924 }, { "epoch": 1.9763074984247007, "grad_norm": 0.170028734836891, "learning_rate": 2.674829460569077e-07, "loss": 0.2783, "step": 3925 }, { "epoch": 1.9768115942028985, "grad_norm": 0.16806227375301971, "learning_rate": 2.67245227065499e-07, "loss": 0.2556, "step": 3926 }, { "epoch": 1.9773156899810964, "grad_norm": 0.17164028145315433, "learning_rate": 2.6700757522240025e-07, "loss": 0.2745, "step": 3927 }, { "epoch": 1.9778197857592943, "grad_norm": 0.16995123735916823, "learning_rate": 2.66769990596173e-07, "loss": 0.2746, "step": 3928 }, { "epoch": 1.9783238815374922, "grad_norm": 0.17598543808774827, "learning_rate": 2.6653247325535843e-07, "loss": 0.2622, "step": 3929 }, { "epoch": 1.97882797731569, "grad_norm": 0.17359313560540118, "learning_rate": 2.662950232684793e-07, "loss": 0.2776, "step": 3930 }, { "epoch": 1.9793320730938877, "grad_norm": 0.167656438622552, "learning_rate": 2.6605764070403817e-07, "loss": 0.262, "step": 3931 }, { "epoch": 1.9798361688720858, "grad_norm": 0.17411531156307122, "learning_rate": 2.6582032563051835e-07, "loss": 0.2746, "step": 3932 }, { "epoch": 1.9803402646502835, "grad_norm": 0.17074273937666753, "learning_rate": 2.6558307811638415e-07, "loss": 0.2745, "step": 3933 }, { "epoch": 1.9808443604284816, "grad_norm": 0.17097062535567104, "learning_rate": 2.653458982300795e-07, "loss": 0.2762, "step": 3934 }, { "epoch": 1.9813484562066792, "grad_norm": 0.1683002481107667, "learning_rate": 2.6510878604002984e-07, "loss": 0.28, "step": 3935 }, { "epoch": 1.981852551984877, "grad_norm": 0.17106853535957925, "learning_rate": 2.648717416146401e-07, "loss": 0.2673, "step": 3936 }, { "epoch": 1.982356647763075, "grad_norm": 0.18084388076626143, "learning_rate": 2.6463476502229664e-07, "loss": 0.2863, "step": 3937 }, { "epoch": 1.9828607435412728, "grad_norm": 0.16736153224440417, "learning_rate": 2.643978563313654e-07, "loss": 0.2522, "step": 3938 }, { "epoch": 1.9833648393194707, "grad_norm": 0.1708328607685619, "learning_rate": 2.64161015610193e-07, "loss": 0.2713, "step": 3939 }, { "epoch": 1.9838689350976686, "grad_norm": 0.16701316011846237, "learning_rate": 2.639242429271068e-07, "loss": 0.2684, "step": 3940 }, { "epoch": 1.9843730308758665, "grad_norm": 0.19205395390104565, "learning_rate": 2.6368753835041384e-07, "loss": 0.2652, "step": 3941 }, { "epoch": 1.9848771266540641, "grad_norm": 0.1717816839539935, "learning_rate": 2.634509019484025e-07, "loss": 0.277, "step": 3942 }, { "epoch": 1.9853812224322622, "grad_norm": 0.17022274259442283, "learning_rate": 2.6321433378934035e-07, "loss": 0.2609, "step": 3943 }, { "epoch": 1.9858853182104599, "grad_norm": 0.17756035746010013, "learning_rate": 2.629778339414763e-07, "loss": 0.2712, "step": 3944 }, { "epoch": 1.986389413988658, "grad_norm": 0.17227310821132044, "learning_rate": 2.627414024730389e-07, "loss": 0.2636, "step": 3945 }, { "epoch": 1.9868935097668556, "grad_norm": 0.1759199181751579, "learning_rate": 2.625050394522369e-07, "loss": 0.2692, "step": 3946 }, { "epoch": 1.9873976055450535, "grad_norm": 0.17477365439151, "learning_rate": 2.6226874494726005e-07, "loss": 0.2719, "step": 3947 }, { "epoch": 1.9879017013232514, "grad_norm": 0.17546888078854525, "learning_rate": 2.620325190262774e-07, "loss": 0.2693, "step": 3948 }, { "epoch": 1.9884057971014493, "grad_norm": 0.17339448401055704, "learning_rate": 2.6179636175743905e-07, "loss": 0.2735, "step": 3949 }, { "epoch": 1.9889098928796471, "grad_norm": 0.1692734111192285, "learning_rate": 2.615602732088748e-07, "loss": 0.2712, "step": 3950 }, { "epoch": 1.989413988657845, "grad_norm": 0.1779563761961173, "learning_rate": 2.6132425344869446e-07, "loss": 0.2596, "step": 3951 }, { "epoch": 1.989918084436043, "grad_norm": 0.17689388931184377, "learning_rate": 2.610883025449887e-07, "loss": 0.2756, "step": 3952 }, { "epoch": 1.9904221802142406, "grad_norm": 0.18739670056285473, "learning_rate": 2.6085242056582764e-07, "loss": 0.2713, "step": 3953 }, { "epoch": 1.9909262759924387, "grad_norm": 0.17988497242704043, "learning_rate": 2.6061660757926163e-07, "loss": 0.272, "step": 3954 }, { "epoch": 1.9914303717706363, "grad_norm": 0.1864336481558806, "learning_rate": 2.6038086365332155e-07, "loss": 0.2731, "step": 3955 }, { "epoch": 1.9919344675488344, "grad_norm": 0.17348195377594694, "learning_rate": 2.6014518885601803e-07, "loss": 0.2747, "step": 3956 }, { "epoch": 1.992438563327032, "grad_norm": 0.17263796785386343, "learning_rate": 2.599095832553415e-07, "loss": 0.2837, "step": 3957 }, { "epoch": 1.99294265910523, "grad_norm": 0.17274927381019026, "learning_rate": 2.596740469192631e-07, "loss": 0.2724, "step": 3958 }, { "epoch": 1.9934467548834278, "grad_norm": 0.18942508993924725, "learning_rate": 2.594385799157333e-07, "loss": 0.2704, "step": 3959 }, { "epoch": 1.9939508506616257, "grad_norm": 0.16909085704359703, "learning_rate": 2.59203182312683e-07, "loss": 0.2658, "step": 3960 }, { "epoch": 1.9944549464398236, "grad_norm": 0.17982713814067064, "learning_rate": 2.5896785417802313e-07, "loss": 0.3008, "step": 3961 }, { "epoch": 1.9949590422180214, "grad_norm": 0.17137213119070738, "learning_rate": 2.5873259557964395e-07, "loss": 0.2798, "step": 3962 }, { "epoch": 1.9954631379962193, "grad_norm": 0.1794592545298311, "learning_rate": 2.584974065854165e-07, "loss": 0.2686, "step": 3963 }, { "epoch": 1.995967233774417, "grad_norm": 0.17623111078697679, "learning_rate": 2.5826228726319116e-07, "loss": 0.2792, "step": 3964 }, { "epoch": 1.996471329552615, "grad_norm": 0.1704272750332749, "learning_rate": 2.5802723768079857e-07, "loss": 0.2706, "step": 3965 }, { "epoch": 1.9969754253308127, "grad_norm": 0.17123275269483276, "learning_rate": 2.5779225790604887e-07, "loss": 0.28, "step": 3966 }, { "epoch": 1.9974795211090108, "grad_norm": 0.170434187962785, "learning_rate": 2.5755734800673243e-07, "loss": 0.2765, "step": 3967 }, { "epoch": 1.9979836168872085, "grad_norm": 0.1680976681438808, "learning_rate": 2.573225080506193e-07, "loss": 0.2617, "step": 3968 } ], "logging_steps": 1, "max_steps": 5949, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 992, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6240331729207296.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }