diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,27969 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.9979836168872085, + "eval_steps": 199, + "global_step": 3968, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0005040322580645161, + "grad_norm": 16.202816979237873, + "learning_rate": 1e-08, + "loss": 0.4122, + "step": 1 + }, + { + "epoch": 0.0005040322580645161, + "eval_loss": 0.443972110748291, + "eval_runtime": 17.2668, + "eval_samples_per_second": 49.517, + "eval_steps_per_second": 1.042, + "step": 1 + }, + { + "epoch": 0.0010080645161290322, + "grad_norm": 16.205315003883026, + "learning_rate": 2e-08, + "loss": 0.4183, + "step": 2 + }, + { + "epoch": 0.0015120967741935483, + "grad_norm": 15.468979543825476, + "learning_rate": 3e-08, + "loss": 0.4182, + "step": 3 + }, + { + "epoch": 0.0020161290322580645, + "grad_norm": 15.468273739112808, + "learning_rate": 4e-08, + "loss": 0.402, + "step": 4 + }, + { + "epoch": 0.0025201612903225806, + "grad_norm": 15.057464396772035, + "learning_rate": 5e-08, + "loss": 0.4211, + "step": 5 + }, + { + "epoch": 0.0030241935483870967, + "grad_norm": 15.204132593848971, + "learning_rate": 6e-08, + "loss": 0.4134, + "step": 6 + }, + { + "epoch": 0.003528225806451613, + "grad_norm": 17.615532773933918, + "learning_rate": 7e-08, + "loss": 0.3878, + "step": 7 + }, + { + "epoch": 0.004032258064516129, + "grad_norm": 16.340341169724105, + "learning_rate": 8e-08, + "loss": 0.3982, + "step": 8 + }, + { + "epoch": 0.0045362903225806455, + "grad_norm": 15.059609779441438, + "learning_rate": 9e-08, + "loss": 0.4178, + "step": 9 + }, + { + "epoch": 0.005040322580645161, + "grad_norm": 17.296721168903797, + "learning_rate": 1e-07, + "loss": 0.4115, + "step": 10 + }, + { + "epoch": 0.005544354838709678, + "grad_norm": 17.3187697691886, + "learning_rate": 1.0999999999999999e-07, + "loss": 0.4104, + "step": 11 + }, + { + "epoch": 0.006048387096774193, + "grad_norm": 18.030550149312244, + "learning_rate": 1.2e-07, + "loss": 0.4104, + "step": 12 + }, + { + "epoch": 0.00655241935483871, + "grad_norm": 14.002199418274493, + "learning_rate": 1.3e-07, + "loss": 0.4047, + "step": 13 + }, + { + "epoch": 0.007056451612903226, + "grad_norm": 153.88554829319122, + "learning_rate": 1.4e-07, + "loss": 0.3935, + "step": 14 + }, + { + "epoch": 0.007560483870967742, + "grad_norm": 6.155317751584709, + "learning_rate": 1.5e-07, + "loss": 0.3934, + "step": 15 + }, + { + "epoch": 0.008064516129032258, + "grad_norm": 8.516052807530963, + "learning_rate": 1.6e-07, + "loss": 0.3992, + "step": 16 + }, + { + "epoch": 0.008568548387096774, + "grad_norm": 12.572451947505744, + "learning_rate": 1.7000000000000001e-07, + "loss": 0.4091, + "step": 17 + }, + { + "epoch": 0.009072580645161291, + "grad_norm": 26.666526989945357, + "learning_rate": 1.8e-07, + "loss": 0.4194, + "step": 18 + }, + { + "epoch": 0.009576612903225807, + "grad_norm": 43.96981776742769, + "learning_rate": 1.8999999999999998e-07, + "loss": 0.4073, + "step": 19 + }, + { + "epoch": 0.010080645161290322, + "grad_norm": 58.38481172991244, + "learning_rate": 2e-07, + "loss": 0.4135, + "step": 20 + }, + { + "epoch": 0.010584677419354838, + "grad_norm": 8.92743387072961, + "learning_rate": 2.0999999999999997e-07, + "loss": 0.4233, + "step": 21 + }, + { + "epoch": 0.011088709677419355, + "grad_norm": 7.114049513845685, + "learning_rate": 2.1999999999999998e-07, + "loss": 0.3915, + "step": 22 + }, + { + "epoch": 0.011592741935483871, + "grad_norm": 7.5930944951978825, + "learning_rate": 2.3e-07, + "loss": 0.4119, + "step": 23 + }, + { + "epoch": 0.012096774193548387, + "grad_norm": 17.847773187914957, + "learning_rate": 2.4e-07, + "loss": 0.4035, + "step": 24 + }, + { + "epoch": 0.012600806451612902, + "grad_norm": 14.342646251100152, + "learning_rate": 2.5e-07, + "loss": 0.4023, + "step": 25 + }, + { + "epoch": 0.01310483870967742, + "grad_norm": 5.653916048149905, + "learning_rate": 2.6e-07, + "loss": 0.3999, + "step": 26 + }, + { + "epoch": 0.013608870967741936, + "grad_norm": 4.978130519093393, + "learning_rate": 2.7e-07, + "loss": 0.4012, + "step": 27 + }, + { + "epoch": 0.014112903225806451, + "grad_norm": 5.1928104536034345, + "learning_rate": 2.8e-07, + "loss": 0.3835, + "step": 28 + }, + { + "epoch": 0.014616935483870967, + "grad_norm": 5.713989015081132, + "learning_rate": 2.9e-07, + "loss": 0.4207, + "step": 29 + }, + { + "epoch": 0.015120967741935484, + "grad_norm": 10.819751935660008, + "learning_rate": 3e-07, + "loss": 0.3895, + "step": 30 + }, + { + "epoch": 0.015625, + "grad_norm": 9.415309461004313, + "learning_rate": 3.1e-07, + "loss": 0.4104, + "step": 31 + }, + { + "epoch": 0.016129032258064516, + "grad_norm": 3.283165538142218, + "learning_rate": 3.2e-07, + "loss": 0.4182, + "step": 32 + }, + { + "epoch": 0.01663306451612903, + "grad_norm": 6.31824176396841, + "learning_rate": 3.3e-07, + "loss": 0.3982, + "step": 33 + }, + { + "epoch": 0.017137096774193547, + "grad_norm": 2.471098407656219, + "learning_rate": 3.4000000000000003e-07, + "loss": 0.4051, + "step": 34 + }, + { + "epoch": 0.017641129032258066, + "grad_norm": 3.7875313205613943, + "learning_rate": 3.5e-07, + "loss": 0.3897, + "step": 35 + }, + { + "epoch": 0.018145161290322582, + "grad_norm": 21.98400784558835, + "learning_rate": 3.6e-07, + "loss": 0.398, + "step": 36 + }, + { + "epoch": 0.018649193548387098, + "grad_norm": 2.5064868793159714, + "learning_rate": 3.7e-07, + "loss": 0.3951, + "step": 37 + }, + { + "epoch": 0.019153225806451613, + "grad_norm": 24.059819417771145, + "learning_rate": 3.7999999999999996e-07, + "loss": 0.3989, + "step": 38 + }, + { + "epoch": 0.01965725806451613, + "grad_norm": 6.95393911781356, + "learning_rate": 3.8999999999999997e-07, + "loss": 0.4014, + "step": 39 + }, + { + "epoch": 0.020161290322580645, + "grad_norm": 2.8868850874278182, + "learning_rate": 4e-07, + "loss": 0.3864, + "step": 40 + }, + { + "epoch": 0.02066532258064516, + "grad_norm": 2.1387916921557624, + "learning_rate": 4.0999999999999994e-07, + "loss": 0.3765, + "step": 41 + }, + { + "epoch": 0.021169354838709676, + "grad_norm": 2.627363698866213, + "learning_rate": 4.1999999999999995e-07, + "loss": 0.3819, + "step": 42 + }, + { + "epoch": 0.021673387096774195, + "grad_norm": 4.399523991039246, + "learning_rate": 4.2999999999999996e-07, + "loss": 0.396, + "step": 43 + }, + { + "epoch": 0.02217741935483871, + "grad_norm": 6.2743195319461265, + "learning_rate": 4.3999999999999997e-07, + "loss": 0.3719, + "step": 44 + }, + { + "epoch": 0.022681451612903226, + "grad_norm": 3.5237571635789706, + "learning_rate": 4.5e-07, + "loss": 0.3787, + "step": 45 + }, + { + "epoch": 0.023185483870967742, + "grad_norm": 4.8083225470670845, + "learning_rate": 4.6e-07, + "loss": 0.3842, + "step": 46 + }, + { + "epoch": 0.023689516129032258, + "grad_norm": 4.2342259419992985, + "learning_rate": 4.6999999999999995e-07, + "loss": 0.3921, + "step": 47 + }, + { + "epoch": 0.024193548387096774, + "grad_norm": 5.233597040220851, + "learning_rate": 4.8e-07, + "loss": 0.3803, + "step": 48 + }, + { + "epoch": 0.02469758064516129, + "grad_norm": 3.3462375668312427, + "learning_rate": 4.9e-07, + "loss": 0.3762, + "step": 49 + }, + { + "epoch": 0.025201612903225805, + "grad_norm": 5.738883941273613, + "learning_rate": 5e-07, + "loss": 0.3691, + "step": 50 + }, + { + "epoch": 0.025705645161290324, + "grad_norm": 3.4906691092581608, + "learning_rate": 5.1e-07, + "loss": 0.3927, + "step": 51 + }, + { + "epoch": 0.02620967741935484, + "grad_norm": 3.1684109647850445, + "learning_rate": 5.2e-07, + "loss": 0.3649, + "step": 52 + }, + { + "epoch": 0.026713709677419355, + "grad_norm": 2.1354179898716037, + "learning_rate": 5.3e-07, + "loss": 0.3982, + "step": 53 + }, + { + "epoch": 0.02721774193548387, + "grad_norm": 9.732639588208007, + "learning_rate": 5.4e-07, + "loss": 0.3897, + "step": 54 + }, + { + "epoch": 0.027721774193548387, + "grad_norm": 1.114588852331627, + "learning_rate": 5.5e-07, + "loss": 0.3668, + "step": 55 + }, + { + "epoch": 0.028225806451612902, + "grad_norm": 6.114641493595002, + "learning_rate": 5.6e-07, + "loss": 0.377, + "step": 56 + }, + { + "epoch": 0.028729838709677418, + "grad_norm": 1.3354617010610528, + "learning_rate": 5.699999999999999e-07, + "loss": 0.38, + "step": 57 + }, + { + "epoch": 0.029233870967741934, + "grad_norm": 1.2850750187972262, + "learning_rate": 5.8e-07, + "loss": 0.3847, + "step": 58 + }, + { + "epoch": 0.029737903225806453, + "grad_norm": 1.3750785124101343, + "learning_rate": 5.9e-07, + "loss": 0.3592, + "step": 59 + }, + { + "epoch": 0.03024193548387097, + "grad_norm": 2.9929928043047975, + "learning_rate": 6e-07, + "loss": 0.3633, + "step": 60 + }, + { + "epoch": 0.030745967741935484, + "grad_norm": 6.301747495454331, + "learning_rate": 6.1e-07, + "loss": 0.3766, + "step": 61 + }, + { + "epoch": 0.03125, + "grad_norm": 2.1168564747094076, + "learning_rate": 6.2e-07, + "loss": 0.3713, + "step": 62 + }, + { + "epoch": 0.031754032258064516, + "grad_norm": 0.7578830680461082, + "learning_rate": 6.3e-07, + "loss": 0.369, + "step": 63 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 1.6734191696925034, + "learning_rate": 6.4e-07, + "loss": 0.366, + "step": 64 + }, + { + "epoch": 0.03276209677419355, + "grad_norm": 1.2789631524237748, + "learning_rate": 6.5e-07, + "loss": 0.3617, + "step": 65 + }, + { + "epoch": 0.03326612903225806, + "grad_norm": 0.9523361493720094, + "learning_rate": 6.6e-07, + "loss": 0.3383, + "step": 66 + }, + { + "epoch": 0.03377016129032258, + "grad_norm": 1.166610477006102, + "learning_rate": 6.7e-07, + "loss": 0.3626, + "step": 67 + }, + { + "epoch": 0.034274193548387094, + "grad_norm": 1.2998523358147898, + "learning_rate": 6.800000000000001e-07, + "loss": 0.3521, + "step": 68 + }, + { + "epoch": 0.03477822580645161, + "grad_norm": 1.4526130491588676, + "learning_rate": 6.9e-07, + "loss": 0.3464, + "step": 69 + }, + { + "epoch": 0.03528225806451613, + "grad_norm": 0.57853677880428, + "learning_rate": 7e-07, + "loss": 0.3459, + "step": 70 + }, + { + "epoch": 0.03578629032258065, + "grad_norm": 1.268886862963294, + "learning_rate": 7.1e-07, + "loss": 0.3658, + "step": 71 + }, + { + "epoch": 0.036290322580645164, + "grad_norm": 1.3395158890738286, + "learning_rate": 7.2e-07, + "loss": 0.3382, + "step": 72 + }, + { + "epoch": 0.03679435483870968, + "grad_norm": 0.7066691364777967, + "learning_rate": 7.3e-07, + "loss": 0.3484, + "step": 73 + }, + { + "epoch": 0.037298387096774195, + "grad_norm": 0.8562245745506359, + "learning_rate": 7.4e-07, + "loss": 0.3696, + "step": 74 + }, + { + "epoch": 0.03780241935483871, + "grad_norm": 1.583149325798458, + "learning_rate": 7.5e-07, + "loss": 0.352, + "step": 75 + }, + { + "epoch": 0.038306451612903226, + "grad_norm": 0.4718209553213981, + "learning_rate": 7.599999999999999e-07, + "loss": 0.3474, + "step": 76 + }, + { + "epoch": 0.03881048387096774, + "grad_norm": 16.741995431233445, + "learning_rate": 7.699999999999999e-07, + "loss": 0.3514, + "step": 77 + }, + { + "epoch": 0.03931451612903226, + "grad_norm": 0.5609500021734007, + "learning_rate": 7.799999999999999e-07, + "loss": 0.3562, + "step": 78 + }, + { + "epoch": 0.039818548387096774, + "grad_norm": 0.5482085713967506, + "learning_rate": 7.9e-07, + "loss": 0.3571, + "step": 79 + }, + { + "epoch": 0.04032258064516129, + "grad_norm": 0.5862216706734081, + "learning_rate": 8e-07, + "loss": 0.3775, + "step": 80 + }, + { + "epoch": 0.040826612903225805, + "grad_norm": 0.9433271815134637, + "learning_rate": 8.1e-07, + "loss": 0.3622, + "step": 81 + }, + { + "epoch": 0.04133064516129032, + "grad_norm": 2.312304687609339, + "learning_rate": 8.199999999999999e-07, + "loss": 0.3354, + "step": 82 + }, + { + "epoch": 0.041834677419354836, + "grad_norm": 0.7657086859357075, + "learning_rate": 8.299999999999999e-07, + "loss": 0.3386, + "step": 83 + }, + { + "epoch": 0.04233870967741935, + "grad_norm": 0.5693450749645984, + "learning_rate": 8.399999999999999e-07, + "loss": 0.336, + "step": 84 + }, + { + "epoch": 0.04284274193548387, + "grad_norm": 0.7237524467723284, + "learning_rate": 8.499999999999999e-07, + "loss": 0.3574, + "step": 85 + }, + { + "epoch": 0.04334677419354839, + "grad_norm": 1.6741553791841324, + "learning_rate": 8.599999999999999e-07, + "loss": 0.3678, + "step": 86 + }, + { + "epoch": 0.043850806451612906, + "grad_norm": 1.066623540734941, + "learning_rate": 8.699999999999999e-07, + "loss": 0.3528, + "step": 87 + }, + { + "epoch": 0.04435483870967742, + "grad_norm": 0.5236790478758251, + "learning_rate": 8.799999999999999e-07, + "loss": 0.3485, + "step": 88 + }, + { + "epoch": 0.04485887096774194, + "grad_norm": 0.37322566471667024, + "learning_rate": 8.9e-07, + "loss": 0.3403, + "step": 89 + }, + { + "epoch": 0.04536290322580645, + "grad_norm": 0.37334552504540236, + "learning_rate": 9e-07, + "loss": 0.334, + "step": 90 + }, + { + "epoch": 0.04586693548387097, + "grad_norm": 0.6233477368274808, + "learning_rate": 9.1e-07, + "loss": 0.3577, + "step": 91 + }, + { + "epoch": 0.046370967741935484, + "grad_norm": 0.380512676596753, + "learning_rate": 9.2e-07, + "loss": 0.3437, + "step": 92 + }, + { + "epoch": 0.046875, + "grad_norm": 0.7218988883748737, + "learning_rate": 9.3e-07, + "loss": 0.3622, + "step": 93 + }, + { + "epoch": 0.047379032258064516, + "grad_norm": 1.0217493525557952, + "learning_rate": 9.399999999999999e-07, + "loss": 0.3249, + "step": 94 + }, + { + "epoch": 0.04788306451612903, + "grad_norm": 0.8160482348861993, + "learning_rate": 9.499999999999999e-07, + "loss": 0.3569, + "step": 95 + }, + { + "epoch": 0.04838709677419355, + "grad_norm": 1.6620382396716078, + "learning_rate": 9.6e-07, + "loss": 0.3325, + "step": 96 + }, + { + "epoch": 0.04889112903225806, + "grad_norm": 0.9899117085666442, + "learning_rate": 9.7e-07, + "loss": 0.3565, + "step": 97 + }, + { + "epoch": 0.04939516129032258, + "grad_norm": 0.6672181715686732, + "learning_rate": 9.8e-07, + "loss": 0.3383, + "step": 98 + }, + { + "epoch": 0.049899193548387094, + "grad_norm": 0.8490767132728996, + "learning_rate": 9.9e-07, + "loss": 0.3628, + "step": 99 + }, + { + "epoch": 0.05040322580645161, + "grad_norm": 3.230091455346306, + "learning_rate": 1e-06, + "loss": 0.3525, + "step": 100 + }, + { + "epoch": 0.05090725806451613, + "grad_norm": 1.0048400815002736, + "learning_rate": 9.999999278765487e-07, + "loss": 0.3465, + "step": 101 + }, + { + "epoch": 0.05141129032258065, + "grad_norm": 0.46323136632477013, + "learning_rate": 9.999997115062153e-07, + "loss": 0.3613, + "step": 102 + }, + { + "epoch": 0.051915322580645164, + "grad_norm": 1.1303903715708508, + "learning_rate": 9.999993508890626e-07, + "loss": 0.3439, + "step": 103 + }, + { + "epoch": 0.05241935483870968, + "grad_norm": 0.8278300274368537, + "learning_rate": 9.999988460251948e-07, + "loss": 0.3364, + "step": 104 + }, + { + "epoch": 0.052923387096774195, + "grad_norm": 0.8385630360604959, + "learning_rate": 9.99998196914757e-07, + "loss": 0.3341, + "step": 105 + }, + { + "epoch": 0.05342741935483871, + "grad_norm": 0.5181207146190654, + "learning_rate": 9.999974035579367e-07, + "loss": 0.3504, + "step": 106 + }, + { + "epoch": 0.053931451612903226, + "grad_norm": 0.867225679439543, + "learning_rate": 9.999964659549629e-07, + "loss": 0.3487, + "step": 107 + }, + { + "epoch": 0.05443548387096774, + "grad_norm": 0.4359954195764285, + "learning_rate": 9.999953841061059e-07, + "loss": 0.3511, + "step": 108 + }, + { + "epoch": 0.05493951612903226, + "grad_norm": 0.49236276321466904, + "learning_rate": 9.99994158011678e-07, + "loss": 0.3301, + "step": 109 + }, + { + "epoch": 0.055443548387096774, + "grad_norm": 0.42334304009907897, + "learning_rate": 9.999927876720327e-07, + "loss": 0.3354, + "step": 110 + }, + { + "epoch": 0.05594758064516129, + "grad_norm": 0.3717776940463779, + "learning_rate": 9.999912730875654e-07, + "loss": 0.3536, + "step": 111 + }, + { + "epoch": 0.056451612903225805, + "grad_norm": 0.7134857290167043, + "learning_rate": 9.999896142587133e-07, + "loss": 0.3423, + "step": 112 + }, + { + "epoch": 0.05695564516129032, + "grad_norm": 0.6589031582715488, + "learning_rate": 9.999878111859545e-07, + "loss": 0.3528, + "step": 113 + }, + { + "epoch": 0.057459677419354836, + "grad_norm": 0.4715779115727926, + "learning_rate": 9.999858638698095e-07, + "loss": 0.331, + "step": 114 + }, + { + "epoch": 0.05796370967741935, + "grad_norm": 1.8324714803517166, + "learning_rate": 9.999837723108403e-07, + "loss": 0.3361, + "step": 115 + }, + { + "epoch": 0.05846774193548387, + "grad_norm": 3.4504051941803455, + "learning_rate": 9.999815365096497e-07, + "loss": 0.3381, + "step": 116 + }, + { + "epoch": 0.05897177419354839, + "grad_norm": 0.7082367647106664, + "learning_rate": 9.999791564668832e-07, + "loss": 0.3357, + "step": 117 + }, + { + "epoch": 0.059475806451612906, + "grad_norm": 0.8943478209896717, + "learning_rate": 9.99976632183227e-07, + "loss": 0.334, + "step": 118 + }, + { + "epoch": 0.05997983870967742, + "grad_norm": 2.639935863362195, + "learning_rate": 9.9997396365941e-07, + "loss": 0.3515, + "step": 119 + }, + { + "epoch": 0.06048387096774194, + "grad_norm": 0.5612525598880919, + "learning_rate": 9.999711508962014e-07, + "loss": 0.3421, + "step": 120 + }, + { + "epoch": 0.06098790322580645, + "grad_norm": 1.0108383298218186, + "learning_rate": 9.99968193894413e-07, + "loss": 0.3485, + "step": 121 + }, + { + "epoch": 0.06149193548387097, + "grad_norm": 1.5821951581009004, + "learning_rate": 9.999650926548979e-07, + "loss": 0.3293, + "step": 122 + }, + { + "epoch": 0.061995967741935484, + "grad_norm": 1.1703013492954635, + "learning_rate": 9.999618471785505e-07, + "loss": 0.3725, + "step": 123 + }, + { + "epoch": 0.0625, + "grad_norm": 0.3496060502874034, + "learning_rate": 9.999584574663074e-07, + "loss": 0.3401, + "step": 124 + }, + { + "epoch": 0.06300403225806452, + "grad_norm": 0.6132017993310452, + "learning_rate": 9.999549235191465e-07, + "loss": 0.3455, + "step": 125 + }, + { + "epoch": 0.06350806451612903, + "grad_norm": 1.5073872786250602, + "learning_rate": 9.999512453380869e-07, + "loss": 0.3389, + "step": 126 + }, + { + "epoch": 0.06401209677419355, + "grad_norm": 1.539376597083665, + "learning_rate": 9.999474229241903e-07, + "loss": 0.3332, + "step": 127 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 1.2824746138412246, + "learning_rate": 9.99943456278559e-07, + "loss": 0.3456, + "step": 128 + }, + { + "epoch": 0.06502016129032258, + "grad_norm": 1.4717370768722402, + "learning_rate": 9.99939345402338e-07, + "loss": 0.3361, + "step": 129 + }, + { + "epoch": 0.0655241935483871, + "grad_norm": 0.3607286264558088, + "learning_rate": 9.999350902967124e-07, + "loss": 0.3309, + "step": 130 + }, + { + "epoch": 0.06602822580645161, + "grad_norm": 0.3812650155511185, + "learning_rate": 9.999306909629103e-07, + "loss": 0.3336, + "step": 131 + }, + { + "epoch": 0.06653225806451613, + "grad_norm": 2.886726675491379, + "learning_rate": 9.999261474022007e-07, + "loss": 0.35, + "step": 132 + }, + { + "epoch": 0.06703629032258064, + "grad_norm": 2.482458443393656, + "learning_rate": 9.999214596158946e-07, + "loss": 0.333, + "step": 133 + }, + { + "epoch": 0.06754032258064516, + "grad_norm": 0.3364379109508271, + "learning_rate": 9.999166276053442e-07, + "loss": 0.3371, + "step": 134 + }, + { + "epoch": 0.06804435483870967, + "grad_norm": 0.7506818894597985, + "learning_rate": 9.999116513719434e-07, + "loss": 0.3339, + "step": 135 + }, + { + "epoch": 0.06854838709677419, + "grad_norm": 1.4202719086270377, + "learning_rate": 9.999065309171282e-07, + "loss": 0.3248, + "step": 136 + }, + { + "epoch": 0.0690524193548387, + "grad_norm": 0.7484158287932788, + "learning_rate": 9.999012662423754e-07, + "loss": 0.3322, + "step": 137 + }, + { + "epoch": 0.06955645161290322, + "grad_norm": 0.374995905774015, + "learning_rate": 9.998958573492042e-07, + "loss": 0.3388, + "step": 138 + }, + { + "epoch": 0.07006048387096774, + "grad_norm": 3.1451635192774416, + "learning_rate": 9.998903042391747e-07, + "loss": 0.3308, + "step": 139 + }, + { + "epoch": 0.07056451612903226, + "grad_norm": 4.100760115701697, + "learning_rate": 9.99884606913889e-07, + "loss": 0.3393, + "step": 140 + }, + { + "epoch": 0.07106854838709678, + "grad_norm": 1.0116256210926982, + "learning_rate": 9.99878765374991e-07, + "loss": 0.3365, + "step": 141 + }, + { + "epoch": 0.0715725806451613, + "grad_norm": 0.4898776317688679, + "learning_rate": 9.998727796241657e-07, + "loss": 0.3388, + "step": 142 + }, + { + "epoch": 0.07207661290322581, + "grad_norm": 1.2664512508752508, + "learning_rate": 9.9986664966314e-07, + "loss": 0.3245, + "step": 143 + }, + { + "epoch": 0.07258064516129033, + "grad_norm": 1.6834640389600903, + "learning_rate": 9.998603754936825e-07, + "loss": 0.3407, + "step": 144 + }, + { + "epoch": 0.07308467741935484, + "grad_norm": 1.3464610251528206, + "learning_rate": 9.99853957117603e-07, + "loss": 0.3225, + "step": 145 + }, + { + "epoch": 0.07358870967741936, + "grad_norm": 1.0984235521861139, + "learning_rate": 9.998473945367535e-07, + "loss": 0.3231, + "step": 146 + }, + { + "epoch": 0.07409274193548387, + "grad_norm": 2.8158986365995156, + "learning_rate": 9.998406877530267e-07, + "loss": 0.3331, + "step": 147 + }, + { + "epoch": 0.07459677419354839, + "grad_norm": 0.3788437430061732, + "learning_rate": 9.998338367683583e-07, + "loss": 0.3392, + "step": 148 + }, + { + "epoch": 0.0751008064516129, + "grad_norm": 0.6183083703029865, + "learning_rate": 9.99826841584724e-07, + "loss": 0.3305, + "step": 149 + }, + { + "epoch": 0.07560483870967742, + "grad_norm": 0.4095762625785864, + "learning_rate": 9.998197022041422e-07, + "loss": 0.3263, + "step": 150 + }, + { + "epoch": 0.07610887096774194, + "grad_norm": 1.2756291414954701, + "learning_rate": 9.998124186286724e-07, + "loss": 0.3394, + "step": 151 + }, + { + "epoch": 0.07661290322580645, + "grad_norm": 0.3597641387730504, + "learning_rate": 9.998049908604163e-07, + "loss": 0.3462, + "step": 152 + }, + { + "epoch": 0.07711693548387097, + "grad_norm": 0.3430879414583386, + "learning_rate": 9.997974189015163e-07, + "loss": 0.337, + "step": 153 + }, + { + "epoch": 0.07762096774193548, + "grad_norm": 1.1595831180201008, + "learning_rate": 9.997897027541571e-07, + "loss": 0.3372, + "step": 154 + }, + { + "epoch": 0.078125, + "grad_norm": 0.48618010029983355, + "learning_rate": 9.997818424205647e-07, + "loss": 0.3371, + "step": 155 + }, + { + "epoch": 0.07862903225806452, + "grad_norm": 0.39786757641752357, + "learning_rate": 9.997738379030068e-07, + "loss": 0.3314, + "step": 156 + }, + { + "epoch": 0.07913306451612903, + "grad_norm": 1.7056412415922666, + "learning_rate": 9.997656892037924e-07, + "loss": 0.319, + "step": 157 + }, + { + "epoch": 0.07963709677419355, + "grad_norm": 0.3653360435949616, + "learning_rate": 9.997573963252725e-07, + "loss": 0.3354, + "step": 158 + }, + { + "epoch": 0.08014112903225806, + "grad_norm": 2.829079281128038, + "learning_rate": 9.997489592698399e-07, + "loss": 0.3318, + "step": 159 + }, + { + "epoch": 0.08064516129032258, + "grad_norm": 1.2295812347072044, + "learning_rate": 9.997403780399282e-07, + "loss": 0.3494, + "step": 160 + }, + { + "epoch": 0.0811491935483871, + "grad_norm": 0.8812794930898853, + "learning_rate": 9.997316526380131e-07, + "loss": 0.3344, + "step": 161 + }, + { + "epoch": 0.08165322580645161, + "grad_norm": 0.8353193931880987, + "learning_rate": 9.997227830666118e-07, + "loss": 0.3305, + "step": 162 + }, + { + "epoch": 0.08215725806451613, + "grad_norm": 0.9923756258213962, + "learning_rate": 9.997137693282833e-07, + "loss": 0.3421, + "step": 163 + }, + { + "epoch": 0.08266129032258064, + "grad_norm": 2.476846910057538, + "learning_rate": 9.99704611425628e-07, + "loss": 0.3303, + "step": 164 + }, + { + "epoch": 0.08316532258064516, + "grad_norm": 3.0540963644253294, + "learning_rate": 9.996953093612877e-07, + "loss": 0.3244, + "step": 165 + }, + { + "epoch": 0.08366935483870967, + "grad_norm": 1.9232590922797959, + "learning_rate": 9.99685863137946e-07, + "loss": 0.3566, + "step": 166 + }, + { + "epoch": 0.08417338709677419, + "grad_norm": 0.8404131782552129, + "learning_rate": 9.996762727583285e-07, + "loss": 0.3248, + "step": 167 + }, + { + "epoch": 0.0846774193548387, + "grad_norm": 0.7858824684266251, + "learning_rate": 9.996665382252014e-07, + "loss": 0.3445, + "step": 168 + }, + { + "epoch": 0.08518145161290322, + "grad_norm": 0.7353820488132891, + "learning_rate": 9.996566595413734e-07, + "loss": 0.3284, + "step": 169 + }, + { + "epoch": 0.08568548387096774, + "grad_norm": 0.4062481131054341, + "learning_rate": 9.996466367096943e-07, + "loss": 0.3204, + "step": 170 + }, + { + "epoch": 0.08618951612903226, + "grad_norm": 0.7805121538010222, + "learning_rate": 9.996364697330555e-07, + "loss": 0.3222, + "step": 171 + }, + { + "epoch": 0.08669354838709678, + "grad_norm": 0.45723066112420185, + "learning_rate": 9.996261586143904e-07, + "loss": 0.3359, + "step": 172 + }, + { + "epoch": 0.0871975806451613, + "grad_norm": 0.9178689167005624, + "learning_rate": 9.996157033566737e-07, + "loss": 0.3446, + "step": 173 + }, + { + "epoch": 0.08770161290322581, + "grad_norm": 1.1988895413065521, + "learning_rate": 9.996051039629214e-07, + "loss": 0.3196, + "step": 174 + }, + { + "epoch": 0.08820564516129033, + "grad_norm": 1.5252246353478032, + "learning_rate": 9.995943604361915e-07, + "loss": 0.3296, + "step": 175 + }, + { + "epoch": 0.08870967741935484, + "grad_norm": 2.9977266959256217, + "learning_rate": 9.995834727795834e-07, + "loss": 0.332, + "step": 176 + }, + { + "epoch": 0.08921370967741936, + "grad_norm": 0.4475301770853643, + "learning_rate": 9.995724409962381e-07, + "loss": 0.3229, + "step": 177 + }, + { + "epoch": 0.08971774193548387, + "grad_norm": 0.5571776297010896, + "learning_rate": 9.995612650893384e-07, + "loss": 0.339, + "step": 178 + }, + { + "epoch": 0.09022177419354839, + "grad_norm": 0.3379507080955966, + "learning_rate": 9.995499450621084e-07, + "loss": 0.3363, + "step": 179 + }, + { + "epoch": 0.0907258064516129, + "grad_norm": 0.5115984237062103, + "learning_rate": 9.995384809178135e-07, + "loss": 0.3318, + "step": 180 + }, + { + "epoch": 0.09122983870967742, + "grad_norm": 0.597842695756601, + "learning_rate": 9.995268726597616e-07, + "loss": 0.3355, + "step": 181 + }, + { + "epoch": 0.09173387096774194, + "grad_norm": 1.6790645385073486, + "learning_rate": 9.995151202913013e-07, + "loss": 0.3379, + "step": 182 + }, + { + "epoch": 0.09223790322580645, + "grad_norm": 1.4108139487058173, + "learning_rate": 9.99503223815823e-07, + "loss": 0.3435, + "step": 183 + }, + { + "epoch": 0.09274193548387097, + "grad_norm": 1.1761800110707394, + "learning_rate": 9.99491183236759e-07, + "loss": 0.3304, + "step": 184 + }, + { + "epoch": 0.09324596774193548, + "grad_norm": 0.5429451949780051, + "learning_rate": 9.99478998557583e-07, + "loss": 0.3275, + "step": 185 + }, + { + "epoch": 0.09375, + "grad_norm": 1.0221880040659963, + "learning_rate": 9.994666697818097e-07, + "loss": 0.3248, + "step": 186 + }, + { + "epoch": 0.09425403225806452, + "grad_norm": 1.7971515949261194, + "learning_rate": 9.994541969129963e-07, + "loss": 0.3252, + "step": 187 + }, + { + "epoch": 0.09475806451612903, + "grad_norm": 0.9715025504301362, + "learning_rate": 9.99441579954741e-07, + "loss": 0.3275, + "step": 188 + }, + { + "epoch": 0.09526209677419355, + "grad_norm": 0.3672192941528604, + "learning_rate": 9.99428818910684e-07, + "loss": 0.3269, + "step": 189 + }, + { + "epoch": 0.09576612903225806, + "grad_norm": 0.520936669282708, + "learning_rate": 9.994159137845062e-07, + "loss": 0.3195, + "step": 190 + }, + { + "epoch": 0.09627016129032258, + "grad_norm": 0.7477175783403504, + "learning_rate": 9.994028645799312e-07, + "loss": 0.3136, + "step": 191 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 0.3729287311518564, + "learning_rate": 9.993896713007234e-07, + "loss": 0.3272, + "step": 192 + }, + { + "epoch": 0.09727822580645161, + "grad_norm": 0.34440602307523366, + "learning_rate": 9.99376333950689e-07, + "loss": 0.3233, + "step": 193 + }, + { + "epoch": 0.09778225806451613, + "grad_norm": 2.072604244189497, + "learning_rate": 9.993628525336757e-07, + "loss": 0.3436, + "step": 194 + }, + { + "epoch": 0.09828629032258064, + "grad_norm": 1.019190865070132, + "learning_rate": 9.993492270535728e-07, + "loss": 0.3313, + "step": 195 + }, + { + "epoch": 0.09879032258064516, + "grad_norm": 0.7904036059961682, + "learning_rate": 9.99335457514311e-07, + "loss": 0.3274, + "step": 196 + }, + { + "epoch": 0.09929435483870967, + "grad_norm": 0.6959515322641198, + "learning_rate": 9.993215439198632e-07, + "loss": 0.3208, + "step": 197 + }, + { + "epoch": 0.09979838709677419, + "grad_norm": 0.7837606339000123, + "learning_rate": 9.993074862742432e-07, + "loss": 0.3277, + "step": 198 + }, + { + "epoch": 0.1003024193548387, + "grad_norm": 0.3983563349530772, + "learning_rate": 9.992932845815062e-07, + "loss": 0.3259, + "step": 199 + }, + { + "epoch": 0.1003024193548387, + "eval_loss": 0.3568817675113678, + "eval_runtime": 17.0912, + "eval_samples_per_second": 50.026, + "eval_steps_per_second": 1.053, + "step": 199 + }, + { + "epoch": 0.10080645161290322, + "grad_norm": 2.1662289172188123, + "learning_rate": 9.992789388457496e-07, + "loss": 0.3374, + "step": 200 + }, + { + "epoch": 0.10131048387096774, + "grad_norm": 0.3792807469503325, + "learning_rate": 9.992644490711122e-07, + "loss": 0.3322, + "step": 201 + }, + { + "epoch": 0.10181451612903226, + "grad_norm": 0.8809934605286607, + "learning_rate": 9.99249815261774e-07, + "loss": 0.3234, + "step": 202 + }, + { + "epoch": 0.10231854838709678, + "grad_norm": 0.8130891805575898, + "learning_rate": 9.992350374219565e-07, + "loss": 0.3205, + "step": 203 + }, + { + "epoch": 0.1028225806451613, + "grad_norm": 0.6892675097386061, + "learning_rate": 9.992201155559235e-07, + "loss": 0.3265, + "step": 204 + }, + { + "epoch": 0.10332661290322581, + "grad_norm": 0.3620028450107508, + "learning_rate": 9.992050496679796e-07, + "loss": 0.3308, + "step": 205 + }, + { + "epoch": 0.10383064516129033, + "grad_norm": 1.1011333579285603, + "learning_rate": 9.991898397624713e-07, + "loss": 0.3297, + "step": 206 + }, + { + "epoch": 0.10433467741935484, + "grad_norm": 1.066929614971622, + "learning_rate": 9.991744858437867e-07, + "loss": 0.3343, + "step": 207 + }, + { + "epoch": 0.10483870967741936, + "grad_norm": 0.5043517533284387, + "learning_rate": 9.99158987916355e-07, + "loss": 0.3357, + "step": 208 + }, + { + "epoch": 0.10534274193548387, + "grad_norm": 2.028553721867032, + "learning_rate": 9.991433459846475e-07, + "loss": 0.3381, + "step": 209 + }, + { + "epoch": 0.10584677419354839, + "grad_norm": 1.0136025757003981, + "learning_rate": 9.991275600531766e-07, + "loss": 0.3256, + "step": 210 + }, + { + "epoch": 0.1063508064516129, + "grad_norm": 0.4260801721814953, + "learning_rate": 9.991116301264965e-07, + "loss": 0.3183, + "step": 211 + }, + { + "epoch": 0.10685483870967742, + "grad_norm": 0.8776821224341731, + "learning_rate": 9.990955562092032e-07, + "loss": 0.3086, + "step": 212 + }, + { + "epoch": 0.10735887096774194, + "grad_norm": 0.9670921019854246, + "learning_rate": 9.990793383059336e-07, + "loss": 0.3163, + "step": 213 + }, + { + "epoch": 0.10786290322580645, + "grad_norm": 0.4414053256258077, + "learning_rate": 9.990629764213663e-07, + "loss": 0.3274, + "step": 214 + }, + { + "epoch": 0.10836693548387097, + "grad_norm": 0.8635272396829062, + "learning_rate": 9.99046470560222e-07, + "loss": 0.3328, + "step": 215 + }, + { + "epoch": 0.10887096774193548, + "grad_norm": 1.2397064297417275, + "learning_rate": 9.990298207272625e-07, + "loss": 0.323, + "step": 216 + }, + { + "epoch": 0.109375, + "grad_norm": 0.4800605247066369, + "learning_rate": 9.99013026927291e-07, + "loss": 0.3248, + "step": 217 + }, + { + "epoch": 0.10987903225806452, + "grad_norm": 0.655524859523369, + "learning_rate": 9.989960891651521e-07, + "loss": 0.3237, + "step": 218 + }, + { + "epoch": 0.11038306451612903, + "grad_norm": 1.666081221879668, + "learning_rate": 9.98979007445733e-07, + "loss": 0.307, + "step": 219 + }, + { + "epoch": 0.11088709677419355, + "grad_norm": 0.6823811172697462, + "learning_rate": 9.989617817739612e-07, + "loss": 0.325, + "step": 220 + }, + { + "epoch": 0.11139112903225806, + "grad_norm": 0.38222379218675673, + "learning_rate": 9.989444121548061e-07, + "loss": 0.3284, + "step": 221 + }, + { + "epoch": 0.11189516129032258, + "grad_norm": 3.066211544417165, + "learning_rate": 9.989268985932789e-07, + "loss": 0.3192, + "step": 222 + }, + { + "epoch": 0.1123991935483871, + "grad_norm": 2.642378852763342, + "learning_rate": 9.989092410944321e-07, + "loss": 0.3381, + "step": 223 + }, + { + "epoch": 0.11290322580645161, + "grad_norm": 0.49898455707374983, + "learning_rate": 9.9889143966336e-07, + "loss": 0.3126, + "step": 224 + }, + { + "epoch": 0.11340725806451613, + "grad_norm": 0.8540861654265093, + "learning_rate": 9.988734943051981e-07, + "loss": 0.3272, + "step": 225 + }, + { + "epoch": 0.11391129032258064, + "grad_norm": 1.2446923755324546, + "learning_rate": 9.988554050251232e-07, + "loss": 0.329, + "step": 226 + }, + { + "epoch": 0.11441532258064516, + "grad_norm": 1.1002977014857587, + "learning_rate": 9.988371718283543e-07, + "loss": 0.3196, + "step": 227 + }, + { + "epoch": 0.11491935483870967, + "grad_norm": 0.42649460084589286, + "learning_rate": 9.988187947201517e-07, + "loss": 0.3226, + "step": 228 + }, + { + "epoch": 0.11542338709677419, + "grad_norm": 2.4011621649576824, + "learning_rate": 9.988002737058166e-07, + "loss": 0.3157, + "step": 229 + }, + { + "epoch": 0.1159274193548387, + "grad_norm": 1.9831170581698965, + "learning_rate": 9.987816087906924e-07, + "loss": 0.3128, + "step": 230 + }, + { + "epoch": 0.11643145161290322, + "grad_norm": 0.5007493489029178, + "learning_rate": 9.987627999801638e-07, + "loss": 0.3098, + "step": 231 + }, + { + "epoch": 0.11693548387096774, + "grad_norm": 1.3882086918200365, + "learning_rate": 9.987438472796572e-07, + "loss": 0.3205, + "step": 232 + }, + { + "epoch": 0.11743951612903226, + "grad_norm": 0.8267276195807572, + "learning_rate": 9.987247506946401e-07, + "loss": 0.3017, + "step": 233 + }, + { + "epoch": 0.11794354838709678, + "grad_norm": 0.5834541685362084, + "learning_rate": 9.98705510230622e-07, + "loss": 0.3211, + "step": 234 + }, + { + "epoch": 0.1184475806451613, + "grad_norm": 0.4068048722742785, + "learning_rate": 9.986861258931535e-07, + "loss": 0.317, + "step": 235 + }, + { + "epoch": 0.11895161290322581, + "grad_norm": 0.6447372842200539, + "learning_rate": 9.986665976878269e-07, + "loss": 0.3162, + "step": 236 + }, + { + "epoch": 0.11945564516129033, + "grad_norm": 0.3789031033771853, + "learning_rate": 9.986469256202758e-07, + "loss": 0.3434, + "step": 237 + }, + { + "epoch": 0.11995967741935484, + "grad_norm": 0.7092549539496891, + "learning_rate": 9.986271096961758e-07, + "loss": 0.3277, + "step": 238 + }, + { + "epoch": 0.12046370967741936, + "grad_norm": 0.5202838451750067, + "learning_rate": 9.986071499212435e-07, + "loss": 0.3138, + "step": 239 + }, + { + "epoch": 0.12096774193548387, + "grad_norm": 0.3561759374531645, + "learning_rate": 9.98587046301237e-07, + "loss": 0.3335, + "step": 240 + }, + { + "epoch": 0.12147177419354839, + "grad_norm": 0.3707646084919827, + "learning_rate": 9.985667988419562e-07, + "loss": 0.3162, + "step": 241 + }, + { + "epoch": 0.1219758064516129, + "grad_norm": 0.6547926315231538, + "learning_rate": 9.985464075492424e-07, + "loss": 0.3279, + "step": 242 + }, + { + "epoch": 0.12247983870967742, + "grad_norm": 0.6547474733966526, + "learning_rate": 9.985258724289784e-07, + "loss": 0.329, + "step": 243 + }, + { + "epoch": 0.12298387096774194, + "grad_norm": 1.4287976852804898, + "learning_rate": 9.985051934870886e-07, + "loss": 0.3171, + "step": 244 + }, + { + "epoch": 0.12348790322580645, + "grad_norm": 0.33812675544730314, + "learning_rate": 9.984843707295384e-07, + "loss": 0.3171, + "step": 245 + }, + { + "epoch": 0.12399193548387097, + "grad_norm": 0.5972872507124707, + "learning_rate": 9.98463404162335e-07, + "loss": 0.3195, + "step": 246 + }, + { + "epoch": 0.12449596774193548, + "grad_norm": 0.5523944045513064, + "learning_rate": 9.984422937915276e-07, + "loss": 0.3137, + "step": 247 + }, + { + "epoch": 0.125, + "grad_norm": 0.3520728667966682, + "learning_rate": 9.98421039623206e-07, + "loss": 0.3181, + "step": 248 + }, + { + "epoch": 0.12550403225806453, + "grad_norm": 1.0978808931136894, + "learning_rate": 9.98399641663502e-07, + "loss": 0.3212, + "step": 249 + }, + { + "epoch": 0.12600806451612903, + "grad_norm": 0.4007468615552269, + "learning_rate": 9.98378099918589e-07, + "loss": 0.3235, + "step": 250 + }, + { + "epoch": 0.12651209677419356, + "grad_norm": 0.38517912392455744, + "learning_rate": 9.983564143946813e-07, + "loss": 0.3221, + "step": 251 + }, + { + "epoch": 0.12701612903225806, + "grad_norm": 0.8562512071253154, + "learning_rate": 9.98334585098035e-07, + "loss": 0.3168, + "step": 252 + }, + { + "epoch": 0.1275201612903226, + "grad_norm": 0.37641066199373835, + "learning_rate": 9.98312612034948e-07, + "loss": 0.3338, + "step": 253 + }, + { + "epoch": 0.1280241935483871, + "grad_norm": 0.3364345875156596, + "learning_rate": 9.982904952117597e-07, + "loss": 0.3149, + "step": 254 + }, + { + "epoch": 0.12852822580645162, + "grad_norm": 0.4087566783250614, + "learning_rate": 9.9826823463485e-07, + "loss": 0.3256, + "step": 255 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 0.7179205392340822, + "learning_rate": 9.982458303106411e-07, + "loss": 0.3375, + "step": 256 + }, + { + "epoch": 0.12953629032258066, + "grad_norm": 0.8800188682694119, + "learning_rate": 9.982232822455968e-07, + "loss": 0.3378, + "step": 257 + }, + { + "epoch": 0.13004032258064516, + "grad_norm": 0.4425193312322176, + "learning_rate": 9.982005904462219e-07, + "loss": 0.3303, + "step": 258 + }, + { + "epoch": 0.1305443548387097, + "grad_norm": 0.9161891570130994, + "learning_rate": 9.981777549190627e-07, + "loss": 0.3036, + "step": 259 + }, + { + "epoch": 0.1310483870967742, + "grad_norm": 0.3551274779066522, + "learning_rate": 9.981547756707074e-07, + "loss": 0.3195, + "step": 260 + }, + { + "epoch": 0.13155241935483872, + "grad_norm": 0.4319317585133586, + "learning_rate": 9.981316527077852e-07, + "loss": 0.3321, + "step": 261 + }, + { + "epoch": 0.13205645161290322, + "grad_norm": 1.1093458242995622, + "learning_rate": 9.981083860369668e-07, + "loss": 0.3341, + "step": 262 + }, + { + "epoch": 0.13256048387096775, + "grad_norm": 1.1354326631084213, + "learning_rate": 9.98084975664965e-07, + "loss": 0.3184, + "step": 263 + }, + { + "epoch": 0.13306451612903225, + "grad_norm": 0.3445717345214689, + "learning_rate": 9.980614215985327e-07, + "loss": 0.326, + "step": 264 + }, + { + "epoch": 0.13356854838709678, + "grad_norm": 1.795842384240905, + "learning_rate": 9.98037723844466e-07, + "loss": 0.3176, + "step": 265 + }, + { + "epoch": 0.13407258064516128, + "grad_norm": 1.1674146767927045, + "learning_rate": 9.98013882409601e-07, + "loss": 0.3153, + "step": 266 + }, + { + "epoch": 0.1345766129032258, + "grad_norm": 0.3532656598688029, + "learning_rate": 9.97989897300816e-07, + "loss": 0.3237, + "step": 267 + }, + { + "epoch": 0.1350806451612903, + "grad_norm": 1.0598457327232365, + "learning_rate": 9.979657685250305e-07, + "loss": 0.3316, + "step": 268 + }, + { + "epoch": 0.13558467741935484, + "grad_norm": 0.4426506579911205, + "learning_rate": 9.979414960892055e-07, + "loss": 0.3031, + "step": 269 + }, + { + "epoch": 0.13608870967741934, + "grad_norm": 0.4547436037084278, + "learning_rate": 9.979170800003436e-07, + "loss": 0.3122, + "step": 270 + }, + { + "epoch": 0.13659274193548387, + "grad_norm": 0.4462058135399398, + "learning_rate": 9.978925202654883e-07, + "loss": 0.3227, + "step": 271 + }, + { + "epoch": 0.13709677419354838, + "grad_norm": 1.6285597918474548, + "learning_rate": 9.978678168917253e-07, + "loss": 0.3103, + "step": 272 + }, + { + "epoch": 0.1376008064516129, + "grad_norm": 0.8755040490518289, + "learning_rate": 9.978429698861812e-07, + "loss": 0.3048, + "step": 273 + }, + { + "epoch": 0.1381048387096774, + "grad_norm": 0.90287853250279, + "learning_rate": 9.978179792560245e-07, + "loss": 0.3265, + "step": 274 + }, + { + "epoch": 0.13860887096774194, + "grad_norm": 0.3787373854311499, + "learning_rate": 9.977928450084642e-07, + "loss": 0.3241, + "step": 275 + }, + { + "epoch": 0.13911290322580644, + "grad_norm": 0.5847992322700967, + "learning_rate": 9.977675671507522e-07, + "loss": 0.3176, + "step": 276 + }, + { + "epoch": 0.13961693548387097, + "grad_norm": 0.3486681848522623, + "learning_rate": 9.977421456901803e-07, + "loss": 0.3158, + "step": 277 + }, + { + "epoch": 0.14012096774193547, + "grad_norm": 0.38818889942185175, + "learning_rate": 9.977165806340827e-07, + "loss": 0.3135, + "step": 278 + }, + { + "epoch": 0.140625, + "grad_norm": 0.4543982688708197, + "learning_rate": 9.97690871989835e-07, + "loss": 0.3191, + "step": 279 + }, + { + "epoch": 0.14112903225806453, + "grad_norm": 1.4162173420118023, + "learning_rate": 9.976650197648536e-07, + "loss": 0.314, + "step": 280 + }, + { + "epoch": 0.14163306451612903, + "grad_norm": 0.6252978926999877, + "learning_rate": 9.976390239665971e-07, + "loss": 0.3241, + "step": 281 + }, + { + "epoch": 0.14213709677419356, + "grad_norm": 0.3461689499768043, + "learning_rate": 9.976128846025646e-07, + "loss": 0.3219, + "step": 282 + }, + { + "epoch": 0.14264112903225806, + "grad_norm": 0.3699553094334184, + "learning_rate": 9.975866016802977e-07, + "loss": 0.3037, + "step": 283 + }, + { + "epoch": 0.1431451612903226, + "grad_norm": 0.39061647423480156, + "learning_rate": 9.975601752073783e-07, + "loss": 0.3063, + "step": 284 + }, + { + "epoch": 0.1436491935483871, + "grad_norm": 0.5258130756540553, + "learning_rate": 9.975336051914307e-07, + "loss": 0.3108, + "step": 285 + }, + { + "epoch": 0.14415322580645162, + "grad_norm": 0.8216411787070486, + "learning_rate": 9.9750689164012e-07, + "loss": 0.3056, + "step": 286 + }, + { + "epoch": 0.14465725806451613, + "grad_norm": 0.4881519867613522, + "learning_rate": 9.974800345611532e-07, + "loss": 0.3164, + "step": 287 + }, + { + "epoch": 0.14516129032258066, + "grad_norm": 0.43534512320301666, + "learning_rate": 9.974530339622779e-07, + "loss": 0.3211, + "step": 288 + }, + { + "epoch": 0.14566532258064516, + "grad_norm": 0.6993861205113443, + "learning_rate": 9.97425889851284e-07, + "loss": 0.3267, + "step": 289 + }, + { + "epoch": 0.1461693548387097, + "grad_norm": 1.0418090024102467, + "learning_rate": 9.973986022360022e-07, + "loss": 0.3239, + "step": 290 + }, + { + "epoch": 0.1466733870967742, + "grad_norm": 0.3319740811109165, + "learning_rate": 9.97371171124305e-07, + "loss": 0.3153, + "step": 291 + }, + { + "epoch": 0.14717741935483872, + "grad_norm": 0.7719912354248915, + "learning_rate": 9.973435965241058e-07, + "loss": 0.3101, + "step": 292 + }, + { + "epoch": 0.14768145161290322, + "grad_norm": 0.8608701948441182, + "learning_rate": 9.973158784433599e-07, + "loss": 0.3329, + "step": 293 + }, + { + "epoch": 0.14818548387096775, + "grad_norm": 0.5399943385601701, + "learning_rate": 9.972880168900638e-07, + "loss": 0.3136, + "step": 294 + }, + { + "epoch": 0.14868951612903225, + "grad_norm": 1.21808306827626, + "learning_rate": 9.972600118722555e-07, + "loss": 0.312, + "step": 295 + }, + { + "epoch": 0.14919354838709678, + "grad_norm": 2.232388742950908, + "learning_rate": 9.97231863398014e-07, + "loss": 0.3077, + "step": 296 + }, + { + "epoch": 0.14969758064516128, + "grad_norm": 0.40750824930135454, + "learning_rate": 9.972035714754602e-07, + "loss": 0.3423, + "step": 297 + }, + { + "epoch": 0.1502016129032258, + "grad_norm": 0.6444065733453856, + "learning_rate": 9.971751361127562e-07, + "loss": 0.3108, + "step": 298 + }, + { + "epoch": 0.1507056451612903, + "grad_norm": 0.7680402422746748, + "learning_rate": 9.971465573181049e-07, + "loss": 0.3089, + "step": 299 + }, + { + "epoch": 0.15120967741935484, + "grad_norm": 0.38712847346257356, + "learning_rate": 9.971178350997516e-07, + "loss": 0.2963, + "step": 300 + }, + { + "epoch": 0.15171370967741934, + "grad_norm": 0.921107522036033, + "learning_rate": 9.970889694659823e-07, + "loss": 0.3149, + "step": 301 + }, + { + "epoch": 0.15221774193548387, + "grad_norm": 1.009394455117804, + "learning_rate": 9.970599604251247e-07, + "loss": 0.3017, + "step": 302 + }, + { + "epoch": 0.15272177419354838, + "grad_norm": 0.4398399428836015, + "learning_rate": 9.970308079855476e-07, + "loss": 0.3036, + "step": 303 + }, + { + "epoch": 0.1532258064516129, + "grad_norm": 0.8373555944492586, + "learning_rate": 9.970015121556615e-07, + "loss": 0.3204, + "step": 304 + }, + { + "epoch": 0.1537298387096774, + "grad_norm": 1.226841071199288, + "learning_rate": 9.969720729439177e-07, + "loss": 0.31, + "step": 305 + }, + { + "epoch": 0.15423387096774194, + "grad_norm": 0.4736294159912266, + "learning_rate": 9.969424903588094e-07, + "loss": 0.3273, + "step": 306 + }, + { + "epoch": 0.15473790322580644, + "grad_norm": 1.3438422876161837, + "learning_rate": 9.969127644088713e-07, + "loss": 0.3229, + "step": 307 + }, + { + "epoch": 0.15524193548387097, + "grad_norm": 2.420491209892375, + "learning_rate": 9.968828951026786e-07, + "loss": 0.3016, + "step": 308 + }, + { + "epoch": 0.15574596774193547, + "grad_norm": 0.9363754119347487, + "learning_rate": 9.96852882448849e-07, + "loss": 0.3096, + "step": 309 + }, + { + "epoch": 0.15625, + "grad_norm": 0.7464871725596692, + "learning_rate": 9.968227264560404e-07, + "loss": 0.3114, + "step": 310 + }, + { + "epoch": 0.15675403225806453, + "grad_norm": 0.8045539981080978, + "learning_rate": 9.96792427132953e-07, + "loss": 0.3074, + "step": 311 + }, + { + "epoch": 0.15725806451612903, + "grad_norm": 0.7968575088428339, + "learning_rate": 9.967619844883277e-07, + "loss": 0.3202, + "step": 312 + }, + { + "epoch": 0.15776209677419356, + "grad_norm": 0.3462338506463475, + "learning_rate": 9.967313985309472e-07, + "loss": 0.3104, + "step": 313 + }, + { + "epoch": 0.15826612903225806, + "grad_norm": 0.3411937335850433, + "learning_rate": 9.967006692696353e-07, + "loss": 0.325, + "step": 314 + }, + { + "epoch": 0.1587701612903226, + "grad_norm": 1.6309339803104832, + "learning_rate": 9.966697967132573e-07, + "loss": 0.3048, + "step": 315 + }, + { + "epoch": 0.1592741935483871, + "grad_norm": 0.8520422369690264, + "learning_rate": 9.966387808707196e-07, + "loss": 0.3031, + "step": 316 + }, + { + "epoch": 0.15977822580645162, + "grad_norm": 0.550306745006942, + "learning_rate": 9.966076217509702e-07, + "loss": 0.3174, + "step": 317 + }, + { + "epoch": 0.16028225806451613, + "grad_norm": 0.3244041619511754, + "learning_rate": 9.965763193629982e-07, + "loss": 0.3051, + "step": 318 + }, + { + "epoch": 0.16078629032258066, + "grad_norm": 0.3289354707634873, + "learning_rate": 9.965448737158343e-07, + "loss": 0.3024, + "step": 319 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 0.5698828377828442, + "learning_rate": 9.9651328481855e-07, + "loss": 0.3197, + "step": 320 + }, + { + "epoch": 0.1617943548387097, + "grad_norm": 0.3603627753702205, + "learning_rate": 9.964815526802588e-07, + "loss": 0.3267, + "step": 321 + }, + { + "epoch": 0.1622983870967742, + "grad_norm": 0.41260756113885855, + "learning_rate": 9.964496773101155e-07, + "loss": 0.3128, + "step": 322 + }, + { + "epoch": 0.16280241935483872, + "grad_norm": 0.3123646419774433, + "learning_rate": 9.964176587173154e-07, + "loss": 0.2909, + "step": 323 + }, + { + "epoch": 0.16330645161290322, + "grad_norm": 0.32862366743648225, + "learning_rate": 9.963854969110958e-07, + "loss": 0.3249, + "step": 324 + }, + { + "epoch": 0.16381048387096775, + "grad_norm": 0.32317653597651286, + "learning_rate": 9.963531919007355e-07, + "loss": 0.3179, + "step": 325 + }, + { + "epoch": 0.16431451612903225, + "grad_norm": 0.3636694182909097, + "learning_rate": 9.963207436955539e-07, + "loss": 0.3094, + "step": 326 + }, + { + "epoch": 0.16481854838709678, + "grad_norm": 0.5601385575192833, + "learning_rate": 9.962881523049125e-07, + "loss": 0.313, + "step": 327 + }, + { + "epoch": 0.16532258064516128, + "grad_norm": 0.32805972924192683, + "learning_rate": 9.962554177382134e-07, + "loss": 0.304, + "step": 328 + }, + { + "epoch": 0.1658266129032258, + "grad_norm": 0.4699969421619995, + "learning_rate": 9.962225400049004e-07, + "loss": 0.309, + "step": 329 + }, + { + "epoch": 0.1663306451612903, + "grad_norm": 0.3767389273779974, + "learning_rate": 9.961895191144586e-07, + "loss": 0.2983, + "step": 330 + }, + { + "epoch": 0.16683467741935484, + "grad_norm": 0.503719665226859, + "learning_rate": 9.961563550764143e-07, + "loss": 0.3112, + "step": 331 + }, + { + "epoch": 0.16733870967741934, + "grad_norm": 0.3438328709283268, + "learning_rate": 9.961230479003348e-07, + "loss": 0.2985, + "step": 332 + }, + { + "epoch": 0.16784274193548387, + "grad_norm": 0.3083813504969578, + "learning_rate": 9.960895975958296e-07, + "loss": 0.3134, + "step": 333 + }, + { + "epoch": 0.16834677419354838, + "grad_norm": 0.4787512237902366, + "learning_rate": 9.960560041725486e-07, + "loss": 0.3213, + "step": 334 + }, + { + "epoch": 0.1688508064516129, + "grad_norm": 1.4378814525493548, + "learning_rate": 9.960222676401833e-07, + "loss": 0.3056, + "step": 335 + }, + { + "epoch": 0.1693548387096774, + "grad_norm": 0.3506129924302826, + "learning_rate": 9.959883880084664e-07, + "loss": 0.3174, + "step": 336 + }, + { + "epoch": 0.16985887096774194, + "grad_norm": 0.35990282931470485, + "learning_rate": 9.95954365287172e-07, + "loss": 0.3224, + "step": 337 + }, + { + "epoch": 0.17036290322580644, + "grad_norm": 1.093999056396477, + "learning_rate": 9.959201994861156e-07, + "loss": 0.2938, + "step": 338 + }, + { + "epoch": 0.17086693548387097, + "grad_norm": 0.7575441587383402, + "learning_rate": 9.958858906151536e-07, + "loss": 0.3092, + "step": 339 + }, + { + "epoch": 0.17137096774193547, + "grad_norm": 0.7138793464492631, + "learning_rate": 9.958514386841842e-07, + "loss": 0.3126, + "step": 340 + }, + { + "epoch": 0.171875, + "grad_norm": 0.6820850764538464, + "learning_rate": 9.95816843703146e-07, + "loss": 0.3051, + "step": 341 + }, + { + "epoch": 0.17237903225806453, + "grad_norm": 0.438185665282885, + "learning_rate": 9.957821056820202e-07, + "loss": 0.3107, + "step": 342 + }, + { + "epoch": 0.17288306451612903, + "grad_norm": 0.2993355268251931, + "learning_rate": 9.957472246308278e-07, + "loss": 0.3177, + "step": 343 + }, + { + "epoch": 0.17338709677419356, + "grad_norm": 0.3143562128856459, + "learning_rate": 9.957122005596324e-07, + "loss": 0.3028, + "step": 344 + }, + { + "epoch": 0.17389112903225806, + "grad_norm": 0.29963517583943, + "learning_rate": 9.956770334785377e-07, + "loss": 0.303, + "step": 345 + }, + { + "epoch": 0.1743951612903226, + "grad_norm": 0.41635137878897605, + "learning_rate": 9.956417233976895e-07, + "loss": 0.3007, + "step": 346 + }, + { + "epoch": 0.1748991935483871, + "grad_norm": 0.3795788244623765, + "learning_rate": 9.956062703272744e-07, + "loss": 0.3183, + "step": 347 + }, + { + "epoch": 0.17540322580645162, + "grad_norm": 0.5158539655619079, + "learning_rate": 9.955706742775204e-07, + "loss": 0.3034, + "step": 348 + }, + { + "epoch": 0.17590725806451613, + "grad_norm": 0.41143417528165055, + "learning_rate": 9.955349352586968e-07, + "loss": 0.3064, + "step": 349 + }, + { + "epoch": 0.17641129032258066, + "grad_norm": 0.3499570984862177, + "learning_rate": 9.95499053281114e-07, + "loss": 0.3165, + "step": 350 + }, + { + "epoch": 0.17691532258064516, + "grad_norm": 0.6117708541856488, + "learning_rate": 9.95463028355124e-07, + "loss": 0.3135, + "step": 351 + }, + { + "epoch": 0.1774193548387097, + "grad_norm": 0.3512173058497519, + "learning_rate": 9.954268604911193e-07, + "loss": 0.3086, + "step": 352 + }, + { + "epoch": 0.1779233870967742, + "grad_norm": 0.5459910083191652, + "learning_rate": 9.953905496995346e-07, + "loss": 0.3212, + "step": 353 + }, + { + "epoch": 0.17842741935483872, + "grad_norm": 0.48953620036372436, + "learning_rate": 9.953540959908448e-07, + "loss": 0.3082, + "step": 354 + }, + { + "epoch": 0.17893145161290322, + "grad_norm": 0.4806785690019039, + "learning_rate": 9.953174993755669e-07, + "loss": 0.3114, + "step": 355 + }, + { + "epoch": 0.17943548387096775, + "grad_norm": 0.27574681950143637, + "learning_rate": 9.95280759864259e-07, + "loss": 0.3088, + "step": 356 + }, + { + "epoch": 0.17993951612903225, + "grad_norm": 0.4046536057043104, + "learning_rate": 9.952438774675199e-07, + "loss": 0.3014, + "step": 357 + }, + { + "epoch": 0.18044354838709678, + "grad_norm": 0.526200938156327, + "learning_rate": 9.952068521959898e-07, + "loss": 0.3042, + "step": 358 + }, + { + "epoch": 0.18094758064516128, + "grad_norm": 0.5729896337994929, + "learning_rate": 9.951696840603508e-07, + "loss": 0.3096, + "step": 359 + }, + { + "epoch": 0.1814516129032258, + "grad_norm": 0.5261901087089867, + "learning_rate": 9.95132373071325e-07, + "loss": 0.3077, + "step": 360 + }, + { + "epoch": 0.1819556451612903, + "grad_norm": 0.6668832949382653, + "learning_rate": 9.950949192396772e-07, + "loss": 0.3025, + "step": 361 + }, + { + "epoch": 0.18245967741935484, + "grad_norm": 0.862055275986415, + "learning_rate": 9.950573225762117e-07, + "loss": 0.3015, + "step": 362 + }, + { + "epoch": 0.18296370967741934, + "grad_norm": 0.6293046065894792, + "learning_rate": 9.950195830917756e-07, + "loss": 0.302, + "step": 363 + }, + { + "epoch": 0.18346774193548387, + "grad_norm": 0.6877385211906398, + "learning_rate": 9.949817007972563e-07, + "loss": 0.3176, + "step": 364 + }, + { + "epoch": 0.18397177419354838, + "grad_norm": 0.29549386712677456, + "learning_rate": 9.949436757035825e-07, + "loss": 0.3089, + "step": 365 + }, + { + "epoch": 0.1844758064516129, + "grad_norm": 0.8557932164329561, + "learning_rate": 9.949055078217244e-07, + "loss": 0.3296, + "step": 366 + }, + { + "epoch": 0.1849798387096774, + "grad_norm": 0.5402766960899985, + "learning_rate": 9.948671971626927e-07, + "loss": 0.3175, + "step": 367 + }, + { + "epoch": 0.18548387096774194, + "grad_norm": 0.49998887489839794, + "learning_rate": 9.948287437375403e-07, + "loss": 0.314, + "step": 368 + }, + { + "epoch": 0.18598790322580644, + "grad_norm": 0.33956036638788806, + "learning_rate": 9.94790147557361e-07, + "loss": 0.3072, + "step": 369 + }, + { + "epoch": 0.18649193548387097, + "grad_norm": 0.4691366516694783, + "learning_rate": 9.94751408633289e-07, + "loss": 0.3007, + "step": 370 + }, + { + "epoch": 0.18699596774193547, + "grad_norm": 0.4294056345752334, + "learning_rate": 9.947125269765001e-07, + "loss": 0.3244, + "step": 371 + }, + { + "epoch": 0.1875, + "grad_norm": 0.5198726803714843, + "learning_rate": 9.946735025982121e-07, + "loss": 0.3309, + "step": 372 + }, + { + "epoch": 0.18800403225806453, + "grad_norm": 0.5083174614120537, + "learning_rate": 9.94634335509683e-07, + "loss": 0.3064, + "step": 373 + }, + { + "epoch": 0.18850806451612903, + "grad_norm": 0.4820500182649492, + "learning_rate": 9.94595025722212e-07, + "loss": 0.3042, + "step": 374 + }, + { + "epoch": 0.18901209677419356, + "grad_norm": 0.5290432017994986, + "learning_rate": 9.9455557324714e-07, + "loss": 0.2969, + "step": 375 + }, + { + "epoch": 0.18951612903225806, + "grad_norm": 0.9665951840791808, + "learning_rate": 9.945159780958487e-07, + "loss": 0.3053, + "step": 376 + }, + { + "epoch": 0.1900201612903226, + "grad_norm": 0.27376501652334334, + "learning_rate": 9.94476240279761e-07, + "loss": 0.2975, + "step": 377 + }, + { + "epoch": 0.1905241935483871, + "grad_norm": 0.6907529950403531, + "learning_rate": 9.944363598103412e-07, + "loss": 0.3028, + "step": 378 + }, + { + "epoch": 0.19102822580645162, + "grad_norm": 0.7206171183198804, + "learning_rate": 9.943963366990944e-07, + "loss": 0.3244, + "step": 379 + }, + { + "epoch": 0.19153225806451613, + "grad_norm": 0.4351702422546798, + "learning_rate": 9.943561709575671e-07, + "loss": 0.3019, + "step": 380 + }, + { + "epoch": 0.19203629032258066, + "grad_norm": 0.4993062351913905, + "learning_rate": 9.94315862597347e-07, + "loss": 0.3147, + "step": 381 + }, + { + "epoch": 0.19254032258064516, + "grad_norm": 1.900968501331666, + "learning_rate": 9.942754116300627e-07, + "loss": 0.2954, + "step": 382 + }, + { + "epoch": 0.1930443548387097, + "grad_norm": 1.6477482220874253, + "learning_rate": 9.942348180673838e-07, + "loss": 0.3121, + "step": 383 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 0.742153580574187, + "learning_rate": 9.941940819210215e-07, + "loss": 0.2992, + "step": 384 + }, + { + "epoch": 0.19405241935483872, + "grad_norm": 0.47705653322280234, + "learning_rate": 9.941532032027281e-07, + "loss": 0.3244, + "step": 385 + }, + { + "epoch": 0.19455645161290322, + "grad_norm": 0.911598765383663, + "learning_rate": 9.941121819242965e-07, + "loss": 0.319, + "step": 386 + }, + { + "epoch": 0.19506048387096775, + "grad_norm": 0.8928463372091003, + "learning_rate": 9.940710180975615e-07, + "loss": 0.3176, + "step": 387 + }, + { + "epoch": 0.19556451612903225, + "grad_norm": 0.5466103602367745, + "learning_rate": 9.940297117343983e-07, + "loss": 0.3117, + "step": 388 + }, + { + "epoch": 0.19606854838709678, + "grad_norm": 0.28786769682640595, + "learning_rate": 9.939882628467235e-07, + "loss": 0.319, + "step": 389 + }, + { + "epoch": 0.19657258064516128, + "grad_norm": 1.750270794203163, + "learning_rate": 9.939466714464953e-07, + "loss": 0.3097, + "step": 390 + }, + { + "epoch": 0.1970766129032258, + "grad_norm": 1.3109389597034193, + "learning_rate": 9.939049375457117e-07, + "loss": 0.3095, + "step": 391 + }, + { + "epoch": 0.1975806451612903, + "grad_norm": 0.3810245348808811, + "learning_rate": 9.938630611564136e-07, + "loss": 0.3015, + "step": 392 + }, + { + "epoch": 0.19808467741935484, + "grad_norm": 0.2822642693051148, + "learning_rate": 9.938210422906816e-07, + "loss": 0.3065, + "step": 393 + }, + { + "epoch": 0.19858870967741934, + "grad_norm": 0.4548013874567676, + "learning_rate": 9.93778880960638e-07, + "loss": 0.3059, + "step": 394 + }, + { + "epoch": 0.19909274193548387, + "grad_norm": 0.7403977526231464, + "learning_rate": 9.937365771784458e-07, + "loss": 0.3001, + "step": 395 + }, + { + "epoch": 0.19959677419354838, + "grad_norm": 0.40584105744807936, + "learning_rate": 9.936941309563097e-07, + "loss": 0.3018, + "step": 396 + }, + { + "epoch": 0.2001008064516129, + "grad_norm": 0.4027563712329421, + "learning_rate": 9.936515423064752e-07, + "loss": 0.2821, + "step": 397 + }, + { + "epoch": 0.2006048387096774, + "grad_norm": 0.3215902734384433, + "learning_rate": 9.936088112412288e-07, + "loss": 0.3093, + "step": 398 + }, + { + "epoch": 0.2006048387096774, + "eval_loss": 0.33719342947006226, + "eval_runtime": 17.0014, + "eval_samples_per_second": 50.29, + "eval_steps_per_second": 1.059, + "step": 398 + }, + { + "epoch": 0.20110887096774194, + "grad_norm": 0.9421611516647215, + "learning_rate": 9.935659377728982e-07, + "loss": 0.2979, + "step": 399 + }, + { + "epoch": 0.20161290322580644, + "grad_norm": 0.9245468490926179, + "learning_rate": 9.935229219138517e-07, + "loss": 0.2942, + "step": 400 + }, + { + "epoch": 0.20211693548387097, + "grad_norm": 0.6824205781396179, + "learning_rate": 9.934797636764999e-07, + "loss": 0.2971, + "step": 401 + }, + { + "epoch": 0.20262096774193547, + "grad_norm": 0.27185149783417195, + "learning_rate": 9.934364630732928e-07, + "loss": 0.3107, + "step": 402 + }, + { + "epoch": 0.203125, + "grad_norm": 0.6827808287541225, + "learning_rate": 9.933930201167228e-07, + "loss": 0.3184, + "step": 403 + }, + { + "epoch": 0.20362903225806453, + "grad_norm": 0.6173711604161257, + "learning_rate": 9.93349434819323e-07, + "loss": 0.3137, + "step": 404 + }, + { + "epoch": 0.20413306451612903, + "grad_norm": 0.8730361088623172, + "learning_rate": 9.933057071936674e-07, + "loss": 0.3045, + "step": 405 + }, + { + "epoch": 0.20463709677419356, + "grad_norm": 0.2347631790989028, + "learning_rate": 9.932618372523712e-07, + "loss": 0.3174, + "step": 406 + }, + { + "epoch": 0.20514112903225806, + "grad_norm": 0.2618825790896803, + "learning_rate": 9.932178250080905e-07, + "loss": 0.3053, + "step": 407 + }, + { + "epoch": 0.2056451612903226, + "grad_norm": 0.8173532124618281, + "learning_rate": 9.931736704735226e-07, + "loss": 0.2987, + "step": 408 + }, + { + "epoch": 0.2061491935483871, + "grad_norm": 0.6079169109005206, + "learning_rate": 9.931293736614059e-07, + "loss": 0.3131, + "step": 409 + }, + { + "epoch": 0.20665322580645162, + "grad_norm": 0.4779413914330351, + "learning_rate": 9.930849345845195e-07, + "loss": 0.3095, + "step": 410 + }, + { + "epoch": 0.20715725806451613, + "grad_norm": 0.5403444992322752, + "learning_rate": 9.930403532556841e-07, + "loss": 0.2985, + "step": 411 + }, + { + "epoch": 0.20766129032258066, + "grad_norm": 0.33724492734825506, + "learning_rate": 9.929956296877609e-07, + "loss": 0.3029, + "step": 412 + }, + { + "epoch": 0.20816532258064516, + "grad_norm": 0.803588317511041, + "learning_rate": 9.929507638936527e-07, + "loss": 0.304, + "step": 413 + }, + { + "epoch": 0.2086693548387097, + "grad_norm": 0.24653864233446288, + "learning_rate": 9.929057558863025e-07, + "loss": 0.2991, + "step": 414 + }, + { + "epoch": 0.2091733870967742, + "grad_norm": 0.3080563000251818, + "learning_rate": 9.928606056786953e-07, + "loss": 0.3045, + "step": 415 + }, + { + "epoch": 0.20967741935483872, + "grad_norm": 0.23276679829744804, + "learning_rate": 9.928153132838564e-07, + "loss": 0.2998, + "step": 416 + }, + { + "epoch": 0.21018145161290322, + "grad_norm": 0.3031284409615895, + "learning_rate": 9.927698787148524e-07, + "loss": 0.3129, + "step": 417 + }, + { + "epoch": 0.21068548387096775, + "grad_norm": 0.7190277552056444, + "learning_rate": 9.92724301984791e-07, + "loss": 0.318, + "step": 418 + }, + { + "epoch": 0.21118951612903225, + "grad_norm": 0.671405235151991, + "learning_rate": 9.92678583106821e-07, + "loss": 0.3152, + "step": 419 + }, + { + "epoch": 0.21169354838709678, + "grad_norm": 0.26427462576363475, + "learning_rate": 9.926327220941313e-07, + "loss": 0.3131, + "step": 420 + }, + { + "epoch": 0.21219758064516128, + "grad_norm": 0.21989852886239047, + "learning_rate": 9.925867189599534e-07, + "loss": 0.2913, + "step": 421 + }, + { + "epoch": 0.2127016129032258, + "grad_norm": 0.5902102443498939, + "learning_rate": 9.925405737175582e-07, + "loss": 0.3047, + "step": 422 + }, + { + "epoch": 0.2132056451612903, + "grad_norm": 0.6036703980264593, + "learning_rate": 9.924942863802586e-07, + "loss": 0.3023, + "step": 423 + }, + { + "epoch": 0.21370967741935484, + "grad_norm": 0.6785617848990195, + "learning_rate": 9.924478569614084e-07, + "loss": 0.3059, + "step": 424 + }, + { + "epoch": 0.21421370967741934, + "grad_norm": 0.41465435154148006, + "learning_rate": 9.924012854744019e-07, + "loss": 0.2937, + "step": 425 + }, + { + "epoch": 0.21471774193548387, + "grad_norm": 0.5756370196897739, + "learning_rate": 9.923545719326748e-07, + "loss": 0.3003, + "step": 426 + }, + { + "epoch": 0.21522177419354838, + "grad_norm": 1.1087103802236575, + "learning_rate": 9.923077163497037e-07, + "loss": 0.3115, + "step": 427 + }, + { + "epoch": 0.2157258064516129, + "grad_norm": 0.5075052144087366, + "learning_rate": 9.922607187390062e-07, + "loss": 0.2861, + "step": 428 + }, + { + "epoch": 0.2162298387096774, + "grad_norm": 0.21875262709516646, + "learning_rate": 9.922135791141407e-07, + "loss": 0.3111, + "step": 429 + }, + { + "epoch": 0.21673387096774194, + "grad_norm": 0.5081145594897964, + "learning_rate": 9.921662974887067e-07, + "loss": 0.3012, + "step": 430 + }, + { + "epoch": 0.21723790322580644, + "grad_norm": 0.3760477536962576, + "learning_rate": 9.921188738763447e-07, + "loss": 0.307, + "step": 431 + }, + { + "epoch": 0.21774193548387097, + "grad_norm": 0.3082054028318284, + "learning_rate": 9.92071308290736e-07, + "loss": 0.3171, + "step": 432 + }, + { + "epoch": 0.21824596774193547, + "grad_norm": 0.22029848103815816, + "learning_rate": 9.920236007456031e-07, + "loss": 0.2931, + "step": 433 + }, + { + "epoch": 0.21875, + "grad_norm": 0.41253335918523026, + "learning_rate": 9.919757512547094e-07, + "loss": 0.3011, + "step": 434 + }, + { + "epoch": 0.21925403225806453, + "grad_norm": 0.5610820660085883, + "learning_rate": 9.91927759831859e-07, + "loss": 0.3088, + "step": 435 + }, + { + "epoch": 0.21975806451612903, + "grad_norm": 0.35401767629764336, + "learning_rate": 9.918796264908973e-07, + "loss": 0.3003, + "step": 436 + }, + { + "epoch": 0.22026209677419356, + "grad_norm": 0.28203957531336254, + "learning_rate": 9.918313512457104e-07, + "loss": 0.3081, + "step": 437 + }, + { + "epoch": 0.22076612903225806, + "grad_norm": 0.2554173995348508, + "learning_rate": 9.917829341102254e-07, + "loss": 0.2942, + "step": 438 + }, + { + "epoch": 0.2212701612903226, + "grad_norm": 0.23639618149360714, + "learning_rate": 9.917343750984102e-07, + "loss": 0.3021, + "step": 439 + }, + { + "epoch": 0.2217741935483871, + "grad_norm": 0.4305124117168449, + "learning_rate": 9.91685674224274e-07, + "loss": 0.3085, + "step": 440 + }, + { + "epoch": 0.22227822580645162, + "grad_norm": 0.2220338714231415, + "learning_rate": 9.916368315018666e-07, + "loss": 0.3028, + "step": 441 + }, + { + "epoch": 0.22278225806451613, + "grad_norm": 0.2927983739524596, + "learning_rate": 9.91587846945279e-07, + "loss": 0.3017, + "step": 442 + }, + { + "epoch": 0.22328629032258066, + "grad_norm": 0.3420210184854715, + "learning_rate": 9.915387205686427e-07, + "loss": 0.3081, + "step": 443 + }, + { + "epoch": 0.22379032258064516, + "grad_norm": 0.39311494277912007, + "learning_rate": 9.914894523861304e-07, + "loss": 0.3028, + "step": 444 + }, + { + "epoch": 0.2242943548387097, + "grad_norm": 0.4151663339605208, + "learning_rate": 9.914400424119555e-07, + "loss": 0.2956, + "step": 445 + }, + { + "epoch": 0.2247983870967742, + "grad_norm": 0.2123585820142778, + "learning_rate": 9.91390490660373e-07, + "loss": 0.3075, + "step": 446 + }, + { + "epoch": 0.22530241935483872, + "grad_norm": 0.571261555860595, + "learning_rate": 9.913407971456778e-07, + "loss": 0.3013, + "step": 447 + }, + { + "epoch": 0.22580645161290322, + "grad_norm": 0.23955795641291014, + "learning_rate": 9.912909618822063e-07, + "loss": 0.3134, + "step": 448 + }, + { + "epoch": 0.22631048387096775, + "grad_norm": 0.570859883971083, + "learning_rate": 9.912409848843358e-07, + "loss": 0.3056, + "step": 449 + }, + { + "epoch": 0.22681451612903225, + "grad_norm": 0.48199321601840345, + "learning_rate": 9.911908661664842e-07, + "loss": 0.3154, + "step": 450 + }, + { + "epoch": 0.22731854838709678, + "grad_norm": 0.5422943126443341, + "learning_rate": 9.911406057431104e-07, + "loss": 0.2987, + "step": 451 + }, + { + "epoch": 0.22782258064516128, + "grad_norm": 0.3574993769315431, + "learning_rate": 9.910902036287143e-07, + "loss": 0.2941, + "step": 452 + }, + { + "epoch": 0.2283266129032258, + "grad_norm": 1.1833977307135548, + "learning_rate": 9.910396598378366e-07, + "loss": 0.3017, + "step": 453 + }, + { + "epoch": 0.2288306451612903, + "grad_norm": 0.25337165989867305, + "learning_rate": 9.90988974385059e-07, + "loss": 0.2925, + "step": 454 + }, + { + "epoch": 0.22933467741935484, + "grad_norm": 0.2480951752658535, + "learning_rate": 9.909381472850036e-07, + "loss": 0.3021, + "step": 455 + }, + { + "epoch": 0.22983870967741934, + "grad_norm": 0.409014941345192, + "learning_rate": 9.90887178552334e-07, + "loss": 0.3272, + "step": 456 + }, + { + "epoch": 0.23034274193548387, + "grad_norm": 0.7457987975927259, + "learning_rate": 9.908360682017544e-07, + "loss": 0.3048, + "step": 457 + }, + { + "epoch": 0.23084677419354838, + "grad_norm": 0.45956267234032566, + "learning_rate": 9.907848162480094e-07, + "loss": 0.3066, + "step": 458 + }, + { + "epoch": 0.2313508064516129, + "grad_norm": 0.3779765403542269, + "learning_rate": 9.907334227058855e-07, + "loss": 0.3009, + "step": 459 + }, + { + "epoch": 0.2318548387096774, + "grad_norm": 0.2666151159273384, + "learning_rate": 9.90681887590209e-07, + "loss": 0.3124, + "step": 460 + }, + { + "epoch": 0.23235887096774194, + "grad_norm": 0.2839428445664857, + "learning_rate": 9.906302109158474e-07, + "loss": 0.3034, + "step": 461 + }, + { + "epoch": 0.23286290322580644, + "grad_norm": 0.4380368120654733, + "learning_rate": 9.905783926977094e-07, + "loss": 0.3087, + "step": 462 + }, + { + "epoch": 0.23336693548387097, + "grad_norm": 0.46446987722443317, + "learning_rate": 9.90526432950744e-07, + "loss": 0.3066, + "step": 463 + }, + { + "epoch": 0.23387096774193547, + "grad_norm": 0.2400929003281859, + "learning_rate": 9.904743316899412e-07, + "loss": 0.2971, + "step": 464 + }, + { + "epoch": 0.234375, + "grad_norm": 0.22533888473808292, + "learning_rate": 9.904220889303322e-07, + "loss": 0.3034, + "step": 465 + }, + { + "epoch": 0.23487903225806453, + "grad_norm": 0.5169333077194566, + "learning_rate": 9.903697046869885e-07, + "loss": 0.3058, + "step": 466 + }, + { + "epoch": 0.23538306451612903, + "grad_norm": 0.37976744599257856, + "learning_rate": 9.903171789750227e-07, + "loss": 0.2943, + "step": 467 + }, + { + "epoch": 0.23588709677419356, + "grad_norm": 0.303653990179142, + "learning_rate": 9.90264511809588e-07, + "loss": 0.3037, + "step": 468 + }, + { + "epoch": 0.23639112903225806, + "grad_norm": 0.21581398194659623, + "learning_rate": 9.902117032058788e-07, + "loss": 0.3142, + "step": 469 + }, + { + "epoch": 0.2368951612903226, + "grad_norm": 0.4025038895307978, + "learning_rate": 9.9015875317913e-07, + "loss": 0.2886, + "step": 470 + }, + { + "epoch": 0.2373991935483871, + "grad_norm": 0.7490982140928762, + "learning_rate": 9.90105661744617e-07, + "loss": 0.3048, + "step": 471 + }, + { + "epoch": 0.23790322580645162, + "grad_norm": 0.2765613593877836, + "learning_rate": 9.900524289176571e-07, + "loss": 0.3113, + "step": 472 + }, + { + "epoch": 0.23840725806451613, + "grad_norm": 0.25206617170451434, + "learning_rate": 9.899990547136068e-07, + "loss": 0.3031, + "step": 473 + }, + { + "epoch": 0.23891129032258066, + "grad_norm": 0.5866005018512471, + "learning_rate": 9.899455391478646e-07, + "loss": 0.3011, + "step": 474 + }, + { + "epoch": 0.23941532258064516, + "grad_norm": 0.39806300430021535, + "learning_rate": 9.898918822358695e-07, + "loss": 0.2955, + "step": 475 + }, + { + "epoch": 0.2399193548387097, + "grad_norm": 0.2584605383681581, + "learning_rate": 9.898380839931012e-07, + "loss": 0.3044, + "step": 476 + }, + { + "epoch": 0.2404233870967742, + "grad_norm": 0.19700196645461712, + "learning_rate": 9.897841444350799e-07, + "loss": 0.2983, + "step": 477 + }, + { + "epoch": 0.24092741935483872, + "grad_norm": 0.306687320143844, + "learning_rate": 9.89730063577367e-07, + "loss": 0.2963, + "step": 478 + }, + { + "epoch": 0.24143145161290322, + "grad_norm": 0.19514269842432405, + "learning_rate": 9.896758414355646e-07, + "loss": 0.2852, + "step": 479 + }, + { + "epoch": 0.24193548387096775, + "grad_norm": 0.2903995442375573, + "learning_rate": 9.89621478025315e-07, + "loss": 0.3039, + "step": 480 + }, + { + "epoch": 0.24243951612903225, + "grad_norm": 0.24936460961507168, + "learning_rate": 9.895669733623024e-07, + "loss": 0.3025, + "step": 481 + }, + { + "epoch": 0.24294354838709678, + "grad_norm": 0.20112659720170964, + "learning_rate": 9.895123274622506e-07, + "loss": 0.308, + "step": 482 + }, + { + "epoch": 0.24344758064516128, + "grad_norm": 0.3732280770776686, + "learning_rate": 9.894575403409246e-07, + "loss": 0.3048, + "step": 483 + }, + { + "epoch": 0.2439516129032258, + "grad_norm": 0.3084274056795106, + "learning_rate": 9.894026120141304e-07, + "loss": 0.2999, + "step": 484 + }, + { + "epoch": 0.2444556451612903, + "grad_norm": 0.23552597211145404, + "learning_rate": 9.893475424977143e-07, + "loss": 0.3073, + "step": 485 + }, + { + "epoch": 0.24495967741935484, + "grad_norm": 1.0356894202770972, + "learning_rate": 9.892923318075634e-07, + "loss": 0.2958, + "step": 486 + }, + { + "epoch": 0.24546370967741934, + "grad_norm": 0.22009519521768645, + "learning_rate": 9.892369799596057e-07, + "loss": 0.3059, + "step": 487 + }, + { + "epoch": 0.24596774193548387, + "grad_norm": 0.2573429172383206, + "learning_rate": 9.891814869698101e-07, + "loss": 0.2992, + "step": 488 + }, + { + "epoch": 0.24647177419354838, + "grad_norm": 0.2350845142780554, + "learning_rate": 9.891258528541859e-07, + "loss": 0.3033, + "step": 489 + }, + { + "epoch": 0.2469758064516129, + "grad_norm": 0.2879929160489008, + "learning_rate": 9.89070077628783e-07, + "loss": 0.3117, + "step": 490 + }, + { + "epoch": 0.2474798387096774, + "grad_norm": 0.25961881540855053, + "learning_rate": 9.890141613096924e-07, + "loss": 0.3168, + "step": 491 + }, + { + "epoch": 0.24798387096774194, + "grad_norm": 0.19180476078696093, + "learning_rate": 9.889581039130455e-07, + "loss": 0.2868, + "step": 492 + }, + { + "epoch": 0.24848790322580644, + "grad_norm": 0.45241056266402024, + "learning_rate": 9.889019054550144e-07, + "loss": 0.304, + "step": 493 + }, + { + "epoch": 0.24899193548387097, + "grad_norm": 0.24790970349975638, + "learning_rate": 9.888455659518124e-07, + "loss": 0.3188, + "step": 494 + }, + { + "epoch": 0.24949596774193547, + "grad_norm": 0.22736377748906864, + "learning_rate": 9.887890854196928e-07, + "loss": 0.3086, + "step": 495 + }, + { + "epoch": 0.25, + "grad_norm": 0.2157688569695134, + "learning_rate": 9.8873246387495e-07, + "loss": 0.3055, + "step": 496 + }, + { + "epoch": 0.2505040322580645, + "grad_norm": 0.2401397569367212, + "learning_rate": 9.886757013339188e-07, + "loss": 0.2903, + "step": 497 + }, + { + "epoch": 0.25100806451612906, + "grad_norm": 0.27724457476206976, + "learning_rate": 9.88618797812975e-07, + "loss": 0.2809, + "step": 498 + }, + { + "epoch": 0.25151209677419356, + "grad_norm": 0.22112121556855863, + "learning_rate": 9.885617533285349e-07, + "loss": 0.3083, + "step": 499 + }, + { + "epoch": 0.25201612903225806, + "grad_norm": 0.26659993041208113, + "learning_rate": 9.885045678970554e-07, + "loss": 0.2945, + "step": 500 + }, + { + "epoch": 0.25252016129032256, + "grad_norm": 0.474336083459808, + "learning_rate": 9.884472415350342e-07, + "loss": 0.3066, + "step": 501 + }, + { + "epoch": 0.2530241935483871, + "grad_norm": 0.3246752836417456, + "learning_rate": 9.883897742590094e-07, + "loss": 0.315, + "step": 502 + }, + { + "epoch": 0.2535282258064516, + "grad_norm": 0.19279895004344805, + "learning_rate": 9.883321660855604e-07, + "loss": 0.2902, + "step": 503 + }, + { + "epoch": 0.2540322580645161, + "grad_norm": 0.26362617716913594, + "learning_rate": 9.882744170313065e-07, + "loss": 0.3309, + "step": 504 + }, + { + "epoch": 0.2545362903225806, + "grad_norm": 0.5083521686121811, + "learning_rate": 9.882165271129078e-07, + "loss": 0.3039, + "step": 505 + }, + { + "epoch": 0.2550403225806452, + "grad_norm": 0.20136858022759457, + "learning_rate": 9.881584963470657e-07, + "loss": 0.3135, + "step": 506 + }, + { + "epoch": 0.2555443548387097, + "grad_norm": 0.24947755269293234, + "learning_rate": 9.88100324750521e-07, + "loss": 0.3, + "step": 507 + }, + { + "epoch": 0.2560483870967742, + "grad_norm": 0.1934113012969022, + "learning_rate": 9.880420123400567e-07, + "loss": 0.2998, + "step": 508 + }, + { + "epoch": 0.2565524193548387, + "grad_norm": 0.19024377903728643, + "learning_rate": 9.879835591324947e-07, + "loss": 0.3191, + "step": 509 + }, + { + "epoch": 0.25705645161290325, + "grad_norm": 0.19125874820918395, + "learning_rate": 9.87924965144699e-07, + "loss": 0.3075, + "step": 510 + }, + { + "epoch": 0.25756048387096775, + "grad_norm": 0.19244259184808277, + "learning_rate": 9.878662303935732e-07, + "loss": 0.2956, + "step": 511 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 0.22509070180606666, + "learning_rate": 9.878073548960623e-07, + "loss": 0.3218, + "step": 512 + }, + { + "epoch": 0.25856854838709675, + "grad_norm": 0.2717698227036736, + "learning_rate": 9.877483386691513e-07, + "loss": 0.3027, + "step": 513 + }, + { + "epoch": 0.2590725806451613, + "grad_norm": 0.1867442771638362, + "learning_rate": 9.876891817298658e-07, + "loss": 0.2846, + "step": 514 + }, + { + "epoch": 0.2595766129032258, + "grad_norm": 0.2377584026295176, + "learning_rate": 9.876298840952726e-07, + "loss": 0.2849, + "step": 515 + }, + { + "epoch": 0.2600806451612903, + "grad_norm": 0.2936917090291921, + "learning_rate": 9.875704457824786e-07, + "loss": 0.3013, + "step": 516 + }, + { + "epoch": 0.2605846774193548, + "grad_norm": 0.4263228296671513, + "learning_rate": 9.875108668086313e-07, + "loss": 0.2846, + "step": 517 + }, + { + "epoch": 0.2610887096774194, + "grad_norm": 0.2718679782241284, + "learning_rate": 9.874511471909189e-07, + "loss": 0.2842, + "step": 518 + }, + { + "epoch": 0.2615927419354839, + "grad_norm": 0.18725434376492076, + "learning_rate": 9.873912869465701e-07, + "loss": 0.3015, + "step": 519 + }, + { + "epoch": 0.2620967741935484, + "grad_norm": 0.19020214607186645, + "learning_rate": 9.873312860928541e-07, + "loss": 0.3149, + "step": 520 + }, + { + "epoch": 0.2626008064516129, + "grad_norm": 0.244741870819488, + "learning_rate": 9.87271144647081e-07, + "loss": 0.305, + "step": 521 + }, + { + "epoch": 0.26310483870967744, + "grad_norm": 0.22885289257455954, + "learning_rate": 9.872108626266014e-07, + "loss": 0.2933, + "step": 522 + }, + { + "epoch": 0.26360887096774194, + "grad_norm": 0.29727559200743137, + "learning_rate": 9.871504400488059e-07, + "loss": 0.2989, + "step": 523 + }, + { + "epoch": 0.26411290322580644, + "grad_norm": 0.3347319040673116, + "learning_rate": 9.870898769311261e-07, + "loss": 0.3003, + "step": 524 + }, + { + "epoch": 0.26461693548387094, + "grad_norm": 0.21285823806164653, + "learning_rate": 9.870291732910343e-07, + "loss": 0.2909, + "step": 525 + }, + { + "epoch": 0.2651209677419355, + "grad_norm": 0.19714692459161542, + "learning_rate": 9.86968329146043e-07, + "loss": 0.2863, + "step": 526 + }, + { + "epoch": 0.265625, + "grad_norm": 0.37762602100369747, + "learning_rate": 9.869073445137054e-07, + "loss": 0.3072, + "step": 527 + }, + { + "epoch": 0.2661290322580645, + "grad_norm": 0.2322570227181064, + "learning_rate": 9.868462194116149e-07, + "loss": 0.2952, + "step": 528 + }, + { + "epoch": 0.26663306451612906, + "grad_norm": 0.25028173693381045, + "learning_rate": 9.86784953857406e-07, + "loss": 0.2979, + "step": 529 + }, + { + "epoch": 0.26713709677419356, + "grad_norm": 0.23342075160977557, + "learning_rate": 9.867235478687534e-07, + "loss": 0.3029, + "step": 530 + }, + { + "epoch": 0.26764112903225806, + "grad_norm": 0.2202223368280641, + "learning_rate": 9.866620014633725e-07, + "loss": 0.2995, + "step": 531 + }, + { + "epoch": 0.26814516129032256, + "grad_norm": 0.40688579823270754, + "learning_rate": 9.866003146590186e-07, + "loss": 0.2998, + "step": 532 + }, + { + "epoch": 0.2686491935483871, + "grad_norm": 0.23558843514280095, + "learning_rate": 9.865384874734886e-07, + "loss": 0.2919, + "step": 533 + }, + { + "epoch": 0.2691532258064516, + "grad_norm": 0.350876653755429, + "learning_rate": 9.864765199246187e-07, + "loss": 0.2979, + "step": 534 + }, + { + "epoch": 0.2696572580645161, + "grad_norm": 0.5484769536987208, + "learning_rate": 9.864144120302865e-07, + "loss": 0.3098, + "step": 535 + }, + { + "epoch": 0.2701612903225806, + "grad_norm": 0.20073285824672496, + "learning_rate": 9.863521638084093e-07, + "loss": 0.299, + "step": 536 + }, + { + "epoch": 0.2706653225806452, + "grad_norm": 0.18356884936864395, + "learning_rate": 9.86289775276946e-07, + "loss": 0.3042, + "step": 537 + }, + { + "epoch": 0.2711693548387097, + "grad_norm": 0.2886255584956994, + "learning_rate": 9.862272464538946e-07, + "loss": 0.3132, + "step": 538 + }, + { + "epoch": 0.2716733870967742, + "grad_norm": 0.2658005187689356, + "learning_rate": 9.86164577357295e-07, + "loss": 0.308, + "step": 539 + }, + { + "epoch": 0.2721774193548387, + "grad_norm": 0.36659782067418106, + "learning_rate": 9.861017680052262e-07, + "loss": 0.2983, + "step": 540 + }, + { + "epoch": 0.27268145161290325, + "grad_norm": 0.26876552668494674, + "learning_rate": 9.860388184158086e-07, + "loss": 0.3026, + "step": 541 + }, + { + "epoch": 0.27318548387096775, + "grad_norm": 0.18123995001034987, + "learning_rate": 9.859757286072028e-07, + "loss": 0.2984, + "step": 542 + }, + { + "epoch": 0.27368951612903225, + "grad_norm": 0.3490662102838747, + "learning_rate": 9.859124985976097e-07, + "loss": 0.3006, + "step": 543 + }, + { + "epoch": 0.27419354838709675, + "grad_norm": 0.5077591494356637, + "learning_rate": 9.858491284052708e-07, + "loss": 0.2942, + "step": 544 + }, + { + "epoch": 0.2746975806451613, + "grad_norm": 0.1899610370247458, + "learning_rate": 9.857856180484682e-07, + "loss": 0.2952, + "step": 545 + }, + { + "epoch": 0.2752016129032258, + "grad_norm": 0.33906573112018934, + "learning_rate": 9.857219675455236e-07, + "loss": 0.2825, + "step": 546 + }, + { + "epoch": 0.2757056451612903, + "grad_norm": 0.24179860042616932, + "learning_rate": 9.856581769148007e-07, + "loss": 0.2963, + "step": 547 + }, + { + "epoch": 0.2762096774193548, + "grad_norm": 0.24173305167642747, + "learning_rate": 9.855942461747023e-07, + "loss": 0.3157, + "step": 548 + }, + { + "epoch": 0.2767137096774194, + "grad_norm": 0.28928103551655465, + "learning_rate": 9.855301753436718e-07, + "loss": 0.3087, + "step": 549 + }, + { + "epoch": 0.2772177419354839, + "grad_norm": 0.18922268926852628, + "learning_rate": 9.854659644401934e-07, + "loss": 0.3134, + "step": 550 + }, + { + "epoch": 0.2777217741935484, + "grad_norm": 0.20784139183584166, + "learning_rate": 9.854016134827916e-07, + "loss": 0.3051, + "step": 551 + }, + { + "epoch": 0.2782258064516129, + "grad_norm": 0.5667720250099785, + "learning_rate": 9.853371224900313e-07, + "loss": 0.2847, + "step": 552 + }, + { + "epoch": 0.27872983870967744, + "grad_norm": 0.34781718527992644, + "learning_rate": 9.852724914805175e-07, + "loss": 0.2873, + "step": 553 + }, + { + "epoch": 0.27923387096774194, + "grad_norm": 0.19716253139941464, + "learning_rate": 9.852077204728961e-07, + "loss": 0.3027, + "step": 554 + }, + { + "epoch": 0.27973790322580644, + "grad_norm": 0.19344190301481812, + "learning_rate": 9.85142809485853e-07, + "loss": 0.2897, + "step": 555 + }, + { + "epoch": 0.28024193548387094, + "grad_norm": 0.7127285715175351, + "learning_rate": 9.850777585381146e-07, + "loss": 0.2866, + "step": 556 + }, + { + "epoch": 0.2807459677419355, + "grad_norm": 0.40797443381529386, + "learning_rate": 9.85012567648448e-07, + "loss": 0.3056, + "step": 557 + }, + { + "epoch": 0.28125, + "grad_norm": 0.2136697521115876, + "learning_rate": 9.8494723683566e-07, + "loss": 0.3064, + "step": 558 + }, + { + "epoch": 0.2817540322580645, + "grad_norm": 0.26165293739971646, + "learning_rate": 9.848817661185984e-07, + "loss": 0.2921, + "step": 559 + }, + { + "epoch": 0.28225806451612906, + "grad_norm": 0.1846995068150634, + "learning_rate": 9.848161555161507e-07, + "loss": 0.3131, + "step": 560 + }, + { + "epoch": 0.28276209677419356, + "grad_norm": 1.0083499880979159, + "learning_rate": 9.847504050472454e-07, + "loss": 0.3053, + "step": 561 + }, + { + "epoch": 0.28326612903225806, + "grad_norm": 0.6105046373752532, + "learning_rate": 9.846845147308514e-07, + "loss": 0.3082, + "step": 562 + }, + { + "epoch": 0.28377016129032256, + "grad_norm": 0.32182959498303215, + "learning_rate": 9.846184845859772e-07, + "loss": 0.2886, + "step": 563 + }, + { + "epoch": 0.2842741935483871, + "grad_norm": 0.5616737715729443, + "learning_rate": 9.845523146316722e-07, + "loss": 0.3138, + "step": 564 + }, + { + "epoch": 0.2847782258064516, + "grad_norm": 0.34317343338743084, + "learning_rate": 9.844860048870261e-07, + "loss": 0.3058, + "step": 565 + }, + { + "epoch": 0.2852822580645161, + "grad_norm": 0.38672910543834893, + "learning_rate": 9.84419555371169e-07, + "loss": 0.2991, + "step": 566 + }, + { + "epoch": 0.2857862903225806, + "grad_norm": 0.27114300647024586, + "learning_rate": 9.843529661032706e-07, + "loss": 0.3083, + "step": 567 + }, + { + "epoch": 0.2862903225806452, + "grad_norm": 0.1896232339668482, + "learning_rate": 9.842862371025422e-07, + "loss": 0.3025, + "step": 568 + }, + { + "epoch": 0.2867943548387097, + "grad_norm": 0.3930098230344227, + "learning_rate": 9.842193683882344e-07, + "loss": 0.3079, + "step": 569 + }, + { + "epoch": 0.2872983870967742, + "grad_norm": 0.43124046850579134, + "learning_rate": 9.841523599796382e-07, + "loss": 0.2777, + "step": 570 + }, + { + "epoch": 0.2878024193548387, + "grad_norm": 0.7587117457845125, + "learning_rate": 9.840852118960853e-07, + "loss": 0.3064, + "step": 571 + }, + { + "epoch": 0.28830645161290325, + "grad_norm": 0.3117074250999561, + "learning_rate": 9.840179241569478e-07, + "loss": 0.299, + "step": 572 + }, + { + "epoch": 0.28881048387096775, + "grad_norm": 0.18731266500576274, + "learning_rate": 9.839504967816374e-07, + "loss": 0.3035, + "step": 573 + }, + { + "epoch": 0.28931451612903225, + "grad_norm": 0.43778747829645503, + "learning_rate": 9.838829297896065e-07, + "loss": 0.319, + "step": 574 + }, + { + "epoch": 0.28981854838709675, + "grad_norm": 0.40776414993659865, + "learning_rate": 9.83815223200348e-07, + "loss": 0.2955, + "step": 575 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 0.25990172322006877, + "learning_rate": 9.837473770333945e-07, + "loss": 0.3132, + "step": 576 + }, + { + "epoch": 0.2908266129032258, + "grad_norm": 0.2296880045553122, + "learning_rate": 9.836793913083195e-07, + "loss": 0.2936, + "step": 577 + }, + { + "epoch": 0.2913306451612903, + "grad_norm": 0.1953127561261216, + "learning_rate": 9.836112660447362e-07, + "loss": 0.3138, + "step": 578 + }, + { + "epoch": 0.2918346774193548, + "grad_norm": 0.27208394851949347, + "learning_rate": 9.835430012622988e-07, + "loss": 0.313, + "step": 579 + }, + { + "epoch": 0.2923387096774194, + "grad_norm": 0.2877633978499272, + "learning_rate": 9.834745969807006e-07, + "loss": 0.3013, + "step": 580 + }, + { + "epoch": 0.2928427419354839, + "grad_norm": 0.20402433175664483, + "learning_rate": 9.834060532196761e-07, + "loss": 0.2948, + "step": 581 + }, + { + "epoch": 0.2933467741935484, + "grad_norm": 0.3541481547851912, + "learning_rate": 9.833373699989999e-07, + "loss": 0.3057, + "step": 582 + }, + { + "epoch": 0.2938508064516129, + "grad_norm": 0.25513070455002546, + "learning_rate": 9.832685473384868e-07, + "loss": 0.2972, + "step": 583 + }, + { + "epoch": 0.29435483870967744, + "grad_norm": 0.6078105436530532, + "learning_rate": 9.83199585257991e-07, + "loss": 0.2974, + "step": 584 + }, + { + "epoch": 0.29485887096774194, + "grad_norm": 0.2701844685693586, + "learning_rate": 9.831304837774086e-07, + "loss": 0.314, + "step": 585 + }, + { + "epoch": 0.29536290322580644, + "grad_norm": 0.19399840828869006, + "learning_rate": 9.830612429166743e-07, + "loss": 0.3031, + "step": 586 + }, + { + "epoch": 0.29586693548387094, + "grad_norm": 0.184738711676414, + "learning_rate": 9.829918626957635e-07, + "loss": 0.3031, + "step": 587 + }, + { + "epoch": 0.2963709677419355, + "grad_norm": 0.3269160031771926, + "learning_rate": 9.829223431346926e-07, + "loss": 0.2911, + "step": 588 + }, + { + "epoch": 0.296875, + "grad_norm": 0.38332552633652756, + "learning_rate": 9.828526842535174e-07, + "loss": 0.2864, + "step": 589 + }, + { + "epoch": 0.2973790322580645, + "grad_norm": 0.21468223114928167, + "learning_rate": 9.827828860723338e-07, + "loss": 0.3109, + "step": 590 + }, + { + "epoch": 0.29788306451612906, + "grad_norm": 0.18987287925768598, + "learning_rate": 9.827129486112782e-07, + "loss": 0.3011, + "step": 591 + }, + { + "epoch": 0.29838709677419356, + "grad_norm": 0.3156090049134527, + "learning_rate": 9.82642871890527e-07, + "loss": 0.2906, + "step": 592 + }, + { + "epoch": 0.29889112903225806, + "grad_norm": 0.40698452999719154, + "learning_rate": 9.82572655930297e-07, + "loss": 0.306, + "step": 593 + }, + { + "epoch": 0.29939516129032256, + "grad_norm": 0.24367422580598258, + "learning_rate": 9.825023007508456e-07, + "loss": 0.3006, + "step": 594 + }, + { + "epoch": 0.2998991935483871, + "grad_norm": 0.27250412398540247, + "learning_rate": 9.82431806372469e-07, + "loss": 0.2989, + "step": 595 + }, + { + "epoch": 0.3004032258064516, + "grad_norm": 0.19246483798552555, + "learning_rate": 9.82361172815505e-07, + "loss": 0.2973, + "step": 596 + }, + { + "epoch": 0.3009072580645161, + "grad_norm": 0.18545484272367835, + "learning_rate": 9.822904001003306e-07, + "loss": 0.2996, + "step": 597 + }, + { + "epoch": 0.3009072580645161, + "eval_loss": 0.32989731431007385, + "eval_runtime": 18.7201, + "eval_samples_per_second": 45.673, + "eval_steps_per_second": 0.962, + "step": 597 + }, + { + "epoch": 0.3014112903225806, + "grad_norm": 0.5063679318757863, + "learning_rate": 9.822194882473635e-07, + "loss": 0.2864, + "step": 598 + }, + { + "epoch": 0.3019153225806452, + "grad_norm": 0.2855392122632754, + "learning_rate": 9.821484372770612e-07, + "loss": 0.295, + "step": 599 + }, + { + "epoch": 0.3024193548387097, + "grad_norm": 0.6735980580137383, + "learning_rate": 9.820772472099215e-07, + "loss": 0.2951, + "step": 600 + }, + { + "epoch": 0.3029233870967742, + "grad_norm": 0.35135382164679746, + "learning_rate": 9.82005918066482e-07, + "loss": 0.31, + "step": 601 + }, + { + "epoch": 0.3034274193548387, + "grad_norm": 0.19875119958210918, + "learning_rate": 9.819344498673215e-07, + "loss": 0.2956, + "step": 602 + }, + { + "epoch": 0.30393145161290325, + "grad_norm": 0.9315288935617498, + "learning_rate": 9.818628426330574e-07, + "loss": 0.3217, + "step": 603 + }, + { + "epoch": 0.30443548387096775, + "grad_norm": 1.0733717026504053, + "learning_rate": 9.817910963843481e-07, + "loss": 0.302, + "step": 604 + }, + { + "epoch": 0.30493951612903225, + "grad_norm": 0.1821439303858277, + "learning_rate": 9.81719211141892e-07, + "loss": 0.2951, + "step": 605 + }, + { + "epoch": 0.30544354838709675, + "grad_norm": 0.2227625770904509, + "learning_rate": 9.816471869264278e-07, + "loss": 0.2962, + "step": 606 + }, + { + "epoch": 0.3059475806451613, + "grad_norm": 0.6196036153090734, + "learning_rate": 9.815750237587338e-07, + "loss": 0.3029, + "step": 607 + }, + { + "epoch": 0.3064516129032258, + "grad_norm": 0.6010581227628021, + "learning_rate": 9.815027216596282e-07, + "loss": 0.3046, + "step": 608 + }, + { + "epoch": 0.3069556451612903, + "grad_norm": 0.3528611360569492, + "learning_rate": 9.814302806499707e-07, + "loss": 0.2932, + "step": 609 + }, + { + "epoch": 0.3074596774193548, + "grad_norm": 0.2979313249836626, + "learning_rate": 9.813577007506594e-07, + "loss": 0.2899, + "step": 610 + }, + { + "epoch": 0.3079637096774194, + "grad_norm": 0.489841762858414, + "learning_rate": 9.81284981982633e-07, + "loss": 0.2887, + "step": 611 + }, + { + "epoch": 0.3084677419354839, + "grad_norm": 0.32286398990319887, + "learning_rate": 9.81212124366871e-07, + "loss": 0.2972, + "step": 612 + }, + { + "epoch": 0.3089717741935484, + "grad_norm": 0.3654449282383879, + "learning_rate": 9.81139127924392e-07, + "loss": 0.2817, + "step": 613 + }, + { + "epoch": 0.3094758064516129, + "grad_norm": 0.23506329847966434, + "learning_rate": 9.810659926762551e-07, + "loss": 0.3041, + "step": 614 + }, + { + "epoch": 0.30997983870967744, + "grad_norm": 0.18739033718362563, + "learning_rate": 9.809927186435594e-07, + "loss": 0.3039, + "step": 615 + }, + { + "epoch": 0.31048387096774194, + "grad_norm": 0.4424195854136909, + "learning_rate": 9.809193058474438e-07, + "loss": 0.3027, + "step": 616 + }, + { + "epoch": 0.31098790322580644, + "grad_norm": 0.34612288423111537, + "learning_rate": 9.808457543090878e-07, + "loss": 0.2893, + "step": 617 + }, + { + "epoch": 0.31149193548387094, + "grad_norm": 0.6207470476564412, + "learning_rate": 9.807720640497103e-07, + "loss": 0.3089, + "step": 618 + }, + { + "epoch": 0.3119959677419355, + "grad_norm": 0.5606872023789801, + "learning_rate": 9.806982350905703e-07, + "loss": 0.3051, + "step": 619 + }, + { + "epoch": 0.3125, + "grad_norm": 0.6658134114343093, + "learning_rate": 9.806242674529676e-07, + "loss": 0.2879, + "step": 620 + }, + { + "epoch": 0.3130040322580645, + "grad_norm": 0.650400551133684, + "learning_rate": 9.80550161158241e-07, + "loss": 0.2998, + "step": 621 + }, + { + "epoch": 0.31350806451612906, + "grad_norm": 0.17947781258266207, + "learning_rate": 9.804759162277696e-07, + "loss": 0.3134, + "step": 622 + }, + { + "epoch": 0.31401209677419356, + "grad_norm": 0.3273834000987335, + "learning_rate": 9.804015326829728e-07, + "loss": 0.3108, + "step": 623 + }, + { + "epoch": 0.31451612903225806, + "grad_norm": 0.27388099966321255, + "learning_rate": 9.803270105453098e-07, + "loss": 0.2995, + "step": 624 + }, + { + "epoch": 0.31502016129032256, + "grad_norm": 0.4361569565465292, + "learning_rate": 9.802523498362797e-07, + "loss": 0.3037, + "step": 625 + }, + { + "epoch": 0.3155241935483871, + "grad_norm": 0.43147162718536736, + "learning_rate": 9.801775505774218e-07, + "loss": 0.3142, + "step": 626 + }, + { + "epoch": 0.3160282258064516, + "grad_norm": 0.23130882783317155, + "learning_rate": 9.801026127903149e-07, + "loss": 0.2981, + "step": 627 + }, + { + "epoch": 0.3165322580645161, + "grad_norm": 0.42494886746355115, + "learning_rate": 9.800275364965782e-07, + "loss": 0.3083, + "step": 628 + }, + { + "epoch": 0.3170362903225806, + "grad_norm": 0.23979029210918654, + "learning_rate": 9.79952321717871e-07, + "loss": 0.2927, + "step": 629 + }, + { + "epoch": 0.3175403225806452, + "grad_norm": 0.2649450830135195, + "learning_rate": 9.798769684758924e-07, + "loss": 0.3055, + "step": 630 + }, + { + "epoch": 0.3180443548387097, + "grad_norm": 0.20691288403035124, + "learning_rate": 9.798014767923807e-07, + "loss": 0.2939, + "step": 631 + }, + { + "epoch": 0.3185483870967742, + "grad_norm": 0.2818136169636899, + "learning_rate": 9.797258466891152e-07, + "loss": 0.3034, + "step": 632 + }, + { + "epoch": 0.3190524193548387, + "grad_norm": 0.1855253307389335, + "learning_rate": 9.796500781879148e-07, + "loss": 0.2845, + "step": 633 + }, + { + "epoch": 0.31955645161290325, + "grad_norm": 0.2383083001412052, + "learning_rate": 9.79574171310638e-07, + "loss": 0.2999, + "step": 634 + }, + { + "epoch": 0.32006048387096775, + "grad_norm": 0.2155300521952254, + "learning_rate": 9.794981260791837e-07, + "loss": 0.3115, + "step": 635 + }, + { + "epoch": 0.32056451612903225, + "grad_norm": 0.59372804366046, + "learning_rate": 9.794219425154904e-07, + "loss": 0.3057, + "step": 636 + }, + { + "epoch": 0.32106854838709675, + "grad_norm": 0.17584328105654679, + "learning_rate": 9.793456206415362e-07, + "loss": 0.3007, + "step": 637 + }, + { + "epoch": 0.3215725806451613, + "grad_norm": 0.2442097318205203, + "learning_rate": 9.792691604793402e-07, + "loss": 0.2897, + "step": 638 + }, + { + "epoch": 0.3220766129032258, + "grad_norm": 0.17991513417504787, + "learning_rate": 9.791925620509603e-07, + "loss": 0.289, + "step": 639 + }, + { + "epoch": 0.3225806451612903, + "grad_norm": 0.2022189687655052, + "learning_rate": 9.791158253784945e-07, + "loss": 0.3072, + "step": 640 + }, + { + "epoch": 0.3230846774193548, + "grad_norm": 0.3259212247774595, + "learning_rate": 9.79038950484081e-07, + "loss": 0.2989, + "step": 641 + }, + { + "epoch": 0.3235887096774194, + "grad_norm": 0.33045805468173267, + "learning_rate": 9.789619373898981e-07, + "loss": 0.3038, + "step": 642 + }, + { + "epoch": 0.3240927419354839, + "grad_norm": 0.24681922232019723, + "learning_rate": 9.788847861181631e-07, + "loss": 0.3138, + "step": 643 + }, + { + "epoch": 0.3245967741935484, + "grad_norm": 0.2518300810532734, + "learning_rate": 9.788074966911337e-07, + "loss": 0.2899, + "step": 644 + }, + { + "epoch": 0.3251008064516129, + "grad_norm": 0.29278633614058125, + "learning_rate": 9.787300691311077e-07, + "loss": 0.3139, + "step": 645 + }, + { + "epoch": 0.32560483870967744, + "grad_norm": 0.21645406918937576, + "learning_rate": 9.786525034604224e-07, + "loss": 0.309, + "step": 646 + }, + { + "epoch": 0.32610887096774194, + "grad_norm": 0.3666140583384514, + "learning_rate": 9.785747997014547e-07, + "loss": 0.2949, + "step": 647 + }, + { + "epoch": 0.32661290322580644, + "grad_norm": 0.1947942753801369, + "learning_rate": 9.78496957876622e-07, + "loss": 0.3075, + "step": 648 + }, + { + "epoch": 0.32711693548387094, + "grad_norm": 0.18162530785958844, + "learning_rate": 9.784189780083812e-07, + "loss": 0.294, + "step": 649 + }, + { + "epoch": 0.3276209677419355, + "grad_norm": 0.28107058253792394, + "learning_rate": 9.78340860119229e-07, + "loss": 0.2927, + "step": 650 + }, + { + "epoch": 0.328125, + "grad_norm": 0.1863145084715224, + "learning_rate": 9.782626042317015e-07, + "loss": 0.2977, + "step": 651 + }, + { + "epoch": 0.3286290322580645, + "grad_norm": 0.19255877376409836, + "learning_rate": 9.781842103683756e-07, + "loss": 0.3242, + "step": 652 + }, + { + "epoch": 0.32913306451612906, + "grad_norm": 0.22119159460683763, + "learning_rate": 9.78105678551867e-07, + "loss": 0.2949, + "step": 653 + }, + { + "epoch": 0.32963709677419356, + "grad_norm": 0.3370309906377243, + "learning_rate": 9.78027008804832e-07, + "loss": 0.3281, + "step": 654 + }, + { + "epoch": 0.33014112903225806, + "grad_norm": 0.21007597885166748, + "learning_rate": 9.779482011499662e-07, + "loss": 0.3011, + "step": 655 + }, + { + "epoch": 0.33064516129032256, + "grad_norm": 0.2114458853983953, + "learning_rate": 9.77869255610005e-07, + "loss": 0.3144, + "step": 656 + }, + { + "epoch": 0.3311491935483871, + "grad_norm": 0.4562451777517263, + "learning_rate": 9.77790172207724e-07, + "loss": 0.2804, + "step": 657 + }, + { + "epoch": 0.3316532258064516, + "grad_norm": 0.4033116791658588, + "learning_rate": 9.777109509659378e-07, + "loss": 0.2896, + "step": 658 + }, + { + "epoch": 0.3321572580645161, + "grad_norm": 0.19637371460146463, + "learning_rate": 9.776315919075015e-07, + "loss": 0.2851, + "step": 659 + }, + { + "epoch": 0.3326612903225806, + "grad_norm": 0.23427740658090485, + "learning_rate": 9.7755209505531e-07, + "loss": 0.2888, + "step": 660 + }, + { + "epoch": 0.3331653225806452, + "grad_norm": 0.2742494578051774, + "learning_rate": 9.77472460432297e-07, + "loss": 0.2811, + "step": 661 + }, + { + "epoch": 0.3336693548387097, + "grad_norm": 0.4976330279305446, + "learning_rate": 9.77392688061437e-07, + "loss": 0.2981, + "step": 662 + }, + { + "epoch": 0.3341733870967742, + "grad_norm": 0.20762430916034352, + "learning_rate": 9.773127779657442e-07, + "loss": 0.2928, + "step": 663 + }, + { + "epoch": 0.3346774193548387, + "grad_norm": 0.1974247496986688, + "learning_rate": 9.772327301682714e-07, + "loss": 0.2983, + "step": 664 + }, + { + "epoch": 0.33518145161290325, + "grad_norm": 0.2693856221779068, + "learning_rate": 9.771525446921123e-07, + "loss": 0.2966, + "step": 665 + }, + { + "epoch": 0.33568548387096775, + "grad_norm": 0.18606159238485928, + "learning_rate": 9.770722215604e-07, + "loss": 0.2989, + "step": 666 + }, + { + "epoch": 0.33618951612903225, + "grad_norm": 0.375528189461835, + "learning_rate": 9.769917607963068e-07, + "loss": 0.2908, + "step": 667 + }, + { + "epoch": 0.33669354838709675, + "grad_norm": 0.2859681728471663, + "learning_rate": 9.769111624230457e-07, + "loss": 0.2918, + "step": 668 + }, + { + "epoch": 0.3371975806451613, + "grad_norm": 0.20521406800135222, + "learning_rate": 9.768304264638684e-07, + "loss": 0.2987, + "step": 669 + }, + { + "epoch": 0.3377016129032258, + "grad_norm": 0.23397409956060417, + "learning_rate": 9.76749552942067e-07, + "loss": 0.3029, + "step": 670 + }, + { + "epoch": 0.3382056451612903, + "grad_norm": 0.4520922291898478, + "learning_rate": 9.766685418809727e-07, + "loss": 0.2963, + "step": 671 + }, + { + "epoch": 0.3387096774193548, + "grad_norm": 0.18452310561487084, + "learning_rate": 9.76587393303957e-07, + "loss": 0.2931, + "step": 672 + }, + { + "epoch": 0.3392137096774194, + "grad_norm": 0.22095628174169515, + "learning_rate": 9.765061072344305e-07, + "loss": 0.293, + "step": 673 + }, + { + "epoch": 0.3397177419354839, + "grad_norm": 0.45292152421659004, + "learning_rate": 9.764246836958439e-07, + "loss": 0.3098, + "step": 674 + }, + { + "epoch": 0.3402217741935484, + "grad_norm": 0.22491373287467453, + "learning_rate": 9.763431227116875e-07, + "loss": 0.297, + "step": 675 + }, + { + "epoch": 0.3407258064516129, + "grad_norm": 0.36919058274186534, + "learning_rate": 9.76261424305491e-07, + "loss": 0.3075, + "step": 676 + }, + { + "epoch": 0.34122983870967744, + "grad_norm": 0.3042293571669918, + "learning_rate": 9.761795885008236e-07, + "loss": 0.3018, + "step": 677 + }, + { + "epoch": 0.34173387096774194, + "grad_norm": 0.30087168043463913, + "learning_rate": 9.76097615321295e-07, + "loss": 0.2985, + "step": 678 + }, + { + "epoch": 0.34223790322580644, + "grad_norm": 0.24352689031383537, + "learning_rate": 9.760155047905534e-07, + "loss": 0.2938, + "step": 679 + }, + { + "epoch": 0.34274193548387094, + "grad_norm": 0.25871019913875265, + "learning_rate": 9.759332569322876e-07, + "loss": 0.2766, + "step": 680 + }, + { + "epoch": 0.3432459677419355, + "grad_norm": 0.3732275698199399, + "learning_rate": 9.758508717702253e-07, + "loss": 0.3068, + "step": 681 + }, + { + "epoch": 0.34375, + "grad_norm": 0.18175157181935991, + "learning_rate": 9.757683493281343e-07, + "loss": 0.3096, + "step": 682 + }, + { + "epoch": 0.3442540322580645, + "grad_norm": 0.3020686172952058, + "learning_rate": 9.756856896298218e-07, + "loss": 0.2874, + "step": 683 + }, + { + "epoch": 0.34475806451612906, + "grad_norm": 0.4287852398031048, + "learning_rate": 9.756028926991344e-07, + "loss": 0.3064, + "step": 684 + }, + { + "epoch": 0.34526209677419356, + "grad_norm": 0.3877586447077066, + "learning_rate": 9.755199585599587e-07, + "loss": 0.3034, + "step": 685 + }, + { + "epoch": 0.34576612903225806, + "grad_norm": 0.42768152833859835, + "learning_rate": 9.754368872362209e-07, + "loss": 0.3066, + "step": 686 + }, + { + "epoch": 0.34627016129032256, + "grad_norm": 0.2249355344351282, + "learning_rate": 9.75353678751886e-07, + "loss": 0.2882, + "step": 687 + }, + { + "epoch": 0.3467741935483871, + "grad_norm": 0.27912853498203316, + "learning_rate": 9.752703331309596e-07, + "loss": 0.3041, + "step": 688 + }, + { + "epoch": 0.3472782258064516, + "grad_norm": 0.43506449721846435, + "learning_rate": 9.751868503974862e-07, + "loss": 0.2846, + "step": 689 + }, + { + "epoch": 0.3477822580645161, + "grad_norm": 0.26206165096276, + "learning_rate": 9.7510323057555e-07, + "loss": 0.3068, + "step": 690 + }, + { + "epoch": 0.3482862903225806, + "grad_norm": 0.20904315219220723, + "learning_rate": 9.75019473689275e-07, + "loss": 0.2997, + "step": 691 + }, + { + "epoch": 0.3487903225806452, + "grad_norm": 0.19583961501100697, + "learning_rate": 9.749355797628243e-07, + "loss": 0.2819, + "step": 692 + }, + { + "epoch": 0.3492943548387097, + "grad_norm": 0.4477203330124689, + "learning_rate": 9.74851548820401e-07, + "loss": 0.2889, + "step": 693 + }, + { + "epoch": 0.3497983870967742, + "grad_norm": 0.465876564157471, + "learning_rate": 9.747673808862476e-07, + "loss": 0.2927, + "step": 694 + }, + { + "epoch": 0.3503024193548387, + "grad_norm": 0.19047436893148592, + "learning_rate": 9.746830759846456e-07, + "loss": 0.2972, + "step": 695 + }, + { + "epoch": 0.35080645161290325, + "grad_norm": 0.20298210286927948, + "learning_rate": 9.745986341399166e-07, + "loss": 0.2865, + "step": 696 + }, + { + "epoch": 0.35131048387096775, + "grad_norm": 0.21381675798513802, + "learning_rate": 9.745140553764219e-07, + "loss": 0.3057, + "step": 697 + }, + { + "epoch": 0.35181451612903225, + "grad_norm": 0.29038109612051655, + "learning_rate": 9.744293397185615e-07, + "loss": 0.2929, + "step": 698 + }, + { + "epoch": 0.35231854838709675, + "grad_norm": 0.3321256067582296, + "learning_rate": 9.743444871907756e-07, + "loss": 0.2896, + "step": 699 + }, + { + "epoch": 0.3528225806451613, + "grad_norm": 0.3056406476467989, + "learning_rate": 9.742594978175436e-07, + "loss": 0.2996, + "step": 700 + }, + { + "epoch": 0.3533266129032258, + "grad_norm": 0.22979049368338594, + "learning_rate": 9.741743716233843e-07, + "loss": 0.2879, + "step": 701 + }, + { + "epoch": 0.3538306451612903, + "grad_norm": 0.2167981736696398, + "learning_rate": 9.74089108632856e-07, + "loss": 0.3008, + "step": 702 + }, + { + "epoch": 0.3543346774193548, + "grad_norm": 0.6466745464300053, + "learning_rate": 9.74003708870557e-07, + "loss": 0.2883, + "step": 703 + }, + { + "epoch": 0.3548387096774194, + "grad_norm": 0.5205046458590996, + "learning_rate": 9.73918172361124e-07, + "loss": 0.2941, + "step": 704 + }, + { + "epoch": 0.3553427419354839, + "grad_norm": 0.4750444899997186, + "learning_rate": 9.73832499129234e-07, + "loss": 0.2978, + "step": 705 + }, + { + "epoch": 0.3558467741935484, + "grad_norm": 0.2755306914200271, + "learning_rate": 9.737466891996035e-07, + "loss": 0.2761, + "step": 706 + }, + { + "epoch": 0.3563508064516129, + "grad_norm": 0.23830584140125033, + "learning_rate": 9.736607425969878e-07, + "loss": 0.2817, + "step": 707 + }, + { + "epoch": 0.35685483870967744, + "grad_norm": 0.5272610922884131, + "learning_rate": 9.73574659346182e-07, + "loss": 0.3099, + "step": 708 + }, + { + "epoch": 0.35735887096774194, + "grad_norm": 0.4888851365749825, + "learning_rate": 9.734884394720207e-07, + "loss": 0.3035, + "step": 709 + }, + { + "epoch": 0.35786290322580644, + "grad_norm": 0.19529005617043382, + "learning_rate": 9.734020829993778e-07, + "loss": 0.288, + "step": 710 + }, + { + "epoch": 0.35836693548387094, + "grad_norm": 0.3168784896287719, + "learning_rate": 9.733155899531661e-07, + "loss": 0.2747, + "step": 711 + }, + { + "epoch": 0.3588709677419355, + "grad_norm": 0.20038440381297584, + "learning_rate": 9.73228960358339e-07, + "loss": 0.2855, + "step": 712 + }, + { + "epoch": 0.359375, + "grad_norm": 0.2637581920266508, + "learning_rate": 9.731421942398882e-07, + "loss": 0.3007, + "step": 713 + }, + { + "epoch": 0.3598790322580645, + "grad_norm": 0.677620290574485, + "learning_rate": 9.730552916228455e-07, + "loss": 0.31, + "step": 714 + }, + { + "epoch": 0.36038306451612906, + "grad_norm": 0.22952816515508798, + "learning_rate": 9.729682525322817e-07, + "loss": 0.3066, + "step": 715 + }, + { + "epoch": 0.36088709677419356, + "grad_norm": 0.1917187197394172, + "learning_rate": 9.728810769933066e-07, + "loss": 0.3017, + "step": 716 + }, + { + "epoch": 0.36139112903225806, + "grad_norm": 0.20951491712451265, + "learning_rate": 9.727937650310704e-07, + "loss": 0.2973, + "step": 717 + }, + { + "epoch": 0.36189516129032256, + "grad_norm": 0.3141631511750949, + "learning_rate": 9.727063166707619e-07, + "loss": 0.308, + "step": 718 + }, + { + "epoch": 0.3623991935483871, + "grad_norm": 0.18721524079592824, + "learning_rate": 9.726187319376088e-07, + "loss": 0.2927, + "step": 719 + }, + { + "epoch": 0.3629032258064516, + "grad_norm": 0.28391314645074966, + "learning_rate": 9.725310108568795e-07, + "loss": 0.2949, + "step": 720 + }, + { + "epoch": 0.3634072580645161, + "grad_norm": 0.17932969621501238, + "learning_rate": 9.724431534538809e-07, + "loss": 0.294, + "step": 721 + }, + { + "epoch": 0.3639112903225806, + "grad_norm": 0.21062063293701141, + "learning_rate": 9.723551597539591e-07, + "loss": 0.2836, + "step": 722 + }, + { + "epoch": 0.3644153225806452, + "grad_norm": 0.18043765044502827, + "learning_rate": 9.722670297824998e-07, + "loss": 0.3093, + "step": 723 + }, + { + "epoch": 0.3649193548387097, + "grad_norm": 0.29050763149878694, + "learning_rate": 9.72178763564928e-07, + "loss": 0.2907, + "step": 724 + }, + { + "epoch": 0.3654233870967742, + "grad_norm": 0.18845371146424408, + "learning_rate": 9.720903611267077e-07, + "loss": 0.2986, + "step": 725 + }, + { + "epoch": 0.3659274193548387, + "grad_norm": 0.18828740365731594, + "learning_rate": 9.720018224933427e-07, + "loss": 0.2872, + "step": 726 + }, + { + "epoch": 0.36643145161290325, + "grad_norm": 0.19190513878279883, + "learning_rate": 9.71913147690376e-07, + "loss": 0.3062, + "step": 727 + }, + { + "epoch": 0.36693548387096775, + "grad_norm": 0.19814485234368132, + "learning_rate": 9.718243367433893e-07, + "loss": 0.2929, + "step": 728 + }, + { + "epoch": 0.36743951612903225, + "grad_norm": 0.43417746968071463, + "learning_rate": 9.717353896780042e-07, + "loss": 0.2861, + "step": 729 + }, + { + "epoch": 0.36794354838709675, + "grad_norm": 0.2278557334184222, + "learning_rate": 9.716463065198817e-07, + "loss": 0.3085, + "step": 730 + }, + { + "epoch": 0.3684475806451613, + "grad_norm": 0.17793722781989707, + "learning_rate": 9.715570872947213e-07, + "loss": 0.2919, + "step": 731 + }, + { + "epoch": 0.3689516129032258, + "grad_norm": 0.20613998379871223, + "learning_rate": 9.714677320282623e-07, + "loss": 0.3027, + "step": 732 + }, + { + "epoch": 0.3694556451612903, + "grad_norm": 0.1966436661511563, + "learning_rate": 9.713782407462834e-07, + "loss": 0.2741, + "step": 733 + }, + { + "epoch": 0.3699596774193548, + "grad_norm": 0.28922865753235927, + "learning_rate": 9.712886134746019e-07, + "loss": 0.2858, + "step": 734 + }, + { + "epoch": 0.3704637096774194, + "grad_norm": 0.2702305708992764, + "learning_rate": 9.71198850239075e-07, + "loss": 0.297, + "step": 735 + }, + { + "epoch": 0.3709677419354839, + "grad_norm": 0.194378574505089, + "learning_rate": 9.711089510655985e-07, + "loss": 0.3142, + "step": 736 + }, + { + "epoch": 0.3714717741935484, + "grad_norm": 0.18465691225415032, + "learning_rate": 9.710189159801084e-07, + "loss": 0.2972, + "step": 737 + }, + { + "epoch": 0.3719758064516129, + "grad_norm": 0.19730530115872685, + "learning_rate": 9.709287450085786e-07, + "loss": 0.2954, + "step": 738 + }, + { + "epoch": 0.37247983870967744, + "grad_norm": 0.2189962450750751, + "learning_rate": 9.70838438177023e-07, + "loss": 0.2922, + "step": 739 + }, + { + "epoch": 0.37298387096774194, + "grad_norm": 0.4433211186238402, + "learning_rate": 9.707479955114948e-07, + "loss": 0.3027, + "step": 740 + }, + { + "epoch": 0.37348790322580644, + "grad_norm": 0.24228833774809522, + "learning_rate": 9.70657417038086e-07, + "loss": 0.2983, + "step": 741 + }, + { + "epoch": 0.37399193548387094, + "grad_norm": 0.18660389324918686, + "learning_rate": 9.70566702782928e-07, + "loss": 0.3081, + "step": 742 + }, + { + "epoch": 0.3744959677419355, + "grad_norm": 0.29156052417574047, + "learning_rate": 9.704758527721912e-07, + "loss": 0.2827, + "step": 743 + }, + { + "epoch": 0.375, + "grad_norm": 0.30556626157560013, + "learning_rate": 9.703848670320855e-07, + "loss": 0.3043, + "step": 744 + }, + { + "epoch": 0.3755040322580645, + "grad_norm": 0.28242712763277444, + "learning_rate": 9.702937455888593e-07, + "loss": 0.2971, + "step": 745 + }, + { + "epoch": 0.37600806451612906, + "grad_norm": 0.18221657954549308, + "learning_rate": 9.70202488468801e-07, + "loss": 0.2947, + "step": 746 + }, + { + "epoch": 0.37651209677419356, + "grad_norm": 0.17989299690744823, + "learning_rate": 9.701110956982374e-07, + "loss": 0.2924, + "step": 747 + }, + { + "epoch": 0.37701612903225806, + "grad_norm": 0.18220343446730708, + "learning_rate": 9.700195673035349e-07, + "loss": 0.2885, + "step": 748 + }, + { + "epoch": 0.37752016129032256, + "grad_norm": 0.22346642161187105, + "learning_rate": 9.699279033110988e-07, + "loss": 0.3244, + "step": 749 + }, + { + "epoch": 0.3780241935483871, + "grad_norm": 0.19169547017843008, + "learning_rate": 9.698361037473738e-07, + "loss": 0.2961, + "step": 750 + }, + { + "epoch": 0.3785282258064516, + "grad_norm": 0.22065738901487775, + "learning_rate": 9.697441686388432e-07, + "loss": 0.3068, + "step": 751 + }, + { + "epoch": 0.3790322580645161, + "grad_norm": 0.19267369377139024, + "learning_rate": 9.696520980120299e-07, + "loss": 0.2898, + "step": 752 + }, + { + "epoch": 0.3795362903225806, + "grad_norm": 0.19447034655516351, + "learning_rate": 9.695598918934958e-07, + "loss": 0.2938, + "step": 753 + }, + { + "epoch": 0.3800403225806452, + "grad_norm": 0.17575932802696953, + "learning_rate": 9.694675503098415e-07, + "loss": 0.299, + "step": 754 + }, + { + "epoch": 0.3805443548387097, + "grad_norm": 0.27717817200263767, + "learning_rate": 9.693750732877071e-07, + "loss": 0.2902, + "step": 755 + }, + { + "epoch": 0.3810483870967742, + "grad_norm": 0.18017521043674514, + "learning_rate": 9.692824608537718e-07, + "loss": 0.2934, + "step": 756 + }, + { + "epoch": 0.3815524193548387, + "grad_norm": 0.18161011916708894, + "learning_rate": 9.691897130347536e-07, + "loss": 0.2958, + "step": 757 + }, + { + "epoch": 0.38205645161290325, + "grad_norm": 0.20730764541104563, + "learning_rate": 9.690968298574095e-07, + "loss": 0.2908, + "step": 758 + }, + { + "epoch": 0.38256048387096775, + "grad_norm": 0.2200352742107318, + "learning_rate": 9.69003811348536e-07, + "loss": 0.3019, + "step": 759 + }, + { + "epoch": 0.38306451612903225, + "grad_norm": 0.19253930185176787, + "learning_rate": 9.689106575349682e-07, + "loss": 0.2864, + "step": 760 + }, + { + "epoch": 0.38356854838709675, + "grad_norm": 0.21888290789299836, + "learning_rate": 9.688173684435806e-07, + "loss": 0.313, + "step": 761 + }, + { + "epoch": 0.3840725806451613, + "grad_norm": 0.2275047774015037, + "learning_rate": 9.687239441012863e-07, + "loss": 0.2927, + "step": 762 + }, + { + "epoch": 0.3845766129032258, + "grad_norm": 0.17433382207800896, + "learning_rate": 9.686303845350377e-07, + "loss": 0.2934, + "step": 763 + }, + { + "epoch": 0.3850806451612903, + "grad_norm": 0.1910468291040477, + "learning_rate": 9.68536689771826e-07, + "loss": 0.2892, + "step": 764 + }, + { + "epoch": 0.3855846774193548, + "grad_norm": 0.2073819671598411, + "learning_rate": 9.68442859838682e-07, + "loss": 0.2874, + "step": 765 + }, + { + "epoch": 0.3860887096774194, + "grad_norm": 0.18337565075052092, + "learning_rate": 9.683488947626746e-07, + "loss": 0.2986, + "step": 766 + }, + { + "epoch": 0.3865927419354839, + "grad_norm": 0.19262381834997364, + "learning_rate": 9.682547945709125e-07, + "loss": 0.2954, + "step": 767 + }, + { + "epoch": 0.3870967741935484, + "grad_norm": 0.19206670831950323, + "learning_rate": 9.681605592905425e-07, + "loss": 0.2796, + "step": 768 + }, + { + "epoch": 0.3876008064516129, + "grad_norm": 0.18021565531041417, + "learning_rate": 9.680661889487517e-07, + "loss": 0.2974, + "step": 769 + }, + { + "epoch": 0.38810483870967744, + "grad_norm": 0.20648881640932878, + "learning_rate": 9.679716835727647e-07, + "loss": 0.304, + "step": 770 + }, + { + "epoch": 0.38860887096774194, + "grad_norm": 0.21633577449914515, + "learning_rate": 9.67877043189846e-07, + "loss": 0.2815, + "step": 771 + }, + { + "epoch": 0.38911290322580644, + "grad_norm": 0.19027954862930285, + "learning_rate": 9.677822678272986e-07, + "loss": 0.2775, + "step": 772 + }, + { + "epoch": 0.38961693548387094, + "grad_norm": 0.1853233674997291, + "learning_rate": 9.676873575124647e-07, + "loss": 0.2736, + "step": 773 + }, + { + "epoch": 0.3901209677419355, + "grad_norm": 0.28029424195764874, + "learning_rate": 9.675923122727253e-07, + "loss": 0.2919, + "step": 774 + }, + { + "epoch": 0.390625, + "grad_norm": 0.2271137389969601, + "learning_rate": 9.674971321355003e-07, + "loss": 0.2884, + "step": 775 + }, + { + "epoch": 0.3911290322580645, + "grad_norm": 0.2482514592600029, + "learning_rate": 9.67401817128249e-07, + "loss": 0.2947, + "step": 776 + }, + { + "epoch": 0.39163306451612906, + "grad_norm": 0.29951392676665967, + "learning_rate": 9.673063672784684e-07, + "loss": 0.289, + "step": 777 + }, + { + "epoch": 0.39213709677419356, + "grad_norm": 0.17456103242486867, + "learning_rate": 9.67210782613696e-07, + "loss": 0.2995, + "step": 778 + }, + { + "epoch": 0.39264112903225806, + "grad_norm": 0.6967952430229244, + "learning_rate": 9.67115063161507e-07, + "loss": 0.3022, + "step": 779 + }, + { + "epoch": 0.39314516129032256, + "grad_norm": 0.29680448703729057, + "learning_rate": 9.67019208949516e-07, + "loss": 0.2862, + "step": 780 + }, + { + "epoch": 0.3936491935483871, + "grad_norm": 0.177999847832676, + "learning_rate": 9.669232200053759e-07, + "loss": 0.2962, + "step": 781 + }, + { + "epoch": 0.3941532258064516, + "grad_norm": 0.27026664375469966, + "learning_rate": 9.668270963567794e-07, + "loss": 0.2986, + "step": 782 + }, + { + "epoch": 0.3946572580645161, + "grad_norm": 0.19723594083135135, + "learning_rate": 9.667308380314576e-07, + "loss": 0.3063, + "step": 783 + }, + { + "epoch": 0.3951612903225806, + "grad_norm": 0.21862308310307982, + "learning_rate": 9.666344450571801e-07, + "loss": 0.2964, + "step": 784 + }, + { + "epoch": 0.3956653225806452, + "grad_norm": 0.1758663671801379, + "learning_rate": 9.665379174617558e-07, + "loss": 0.2914, + "step": 785 + }, + { + "epoch": 0.3961693548387097, + "grad_norm": 0.32096497636941274, + "learning_rate": 9.664412552730326e-07, + "loss": 0.2897, + "step": 786 + }, + { + "epoch": 0.3966733870967742, + "grad_norm": 0.17374107415020215, + "learning_rate": 9.663444585188965e-07, + "loss": 0.2867, + "step": 787 + }, + { + "epoch": 0.3971774193548387, + "grad_norm": 0.18444275375398292, + "learning_rate": 9.66247527227273e-07, + "loss": 0.2961, + "step": 788 + }, + { + "epoch": 0.39768145161290325, + "grad_norm": 0.19643289496174723, + "learning_rate": 9.661504614261261e-07, + "loss": 0.3125, + "step": 789 + }, + { + "epoch": 0.39818548387096775, + "grad_norm": 0.2167645010446501, + "learning_rate": 9.660532611434591e-07, + "loss": 0.2921, + "step": 790 + }, + { + "epoch": 0.39868951612903225, + "grad_norm": 0.1806328023049352, + "learning_rate": 9.659559264073129e-07, + "loss": 0.2846, + "step": 791 + }, + { + "epoch": 0.39919354838709675, + "grad_norm": 0.2612388559577726, + "learning_rate": 9.658584572457686e-07, + "loss": 0.2945, + "step": 792 + }, + { + "epoch": 0.3996975806451613, + "grad_norm": 0.2793761291173004, + "learning_rate": 9.657608536869451e-07, + "loss": 0.2837, + "step": 793 + }, + { + "epoch": 0.4002016129032258, + "grad_norm": 0.18310300206951588, + "learning_rate": 9.656631157590004e-07, + "loss": 0.2978, + "step": 794 + }, + { + "epoch": 0.4007056451612903, + "grad_norm": 0.18408805840331044, + "learning_rate": 9.655652434901317e-07, + "loss": 0.3079, + "step": 795 + }, + { + "epoch": 0.4012096774193548, + "grad_norm": 0.2342952244459477, + "learning_rate": 9.654672369085742e-07, + "loss": 0.2834, + "step": 796 + }, + { + "epoch": 0.4012096774193548, + "eval_loss": 0.3250449001789093, + "eval_runtime": 18.239, + "eval_samples_per_second": 46.878, + "eval_steps_per_second": 0.987, + "step": 796 + }, + { + "epoch": 0.4017137096774194, + "grad_norm": 0.6951664973155232, + "learning_rate": 9.653690960426024e-07, + "loss": 0.2949, + "step": 797 + }, + { + "epoch": 0.4022177419354839, + "grad_norm": 0.3007352567348458, + "learning_rate": 9.652708209205289e-07, + "loss": 0.308, + "step": 798 + }, + { + "epoch": 0.4027217741935484, + "grad_norm": 0.2111912313466037, + "learning_rate": 9.651724115707059e-07, + "loss": 0.2868, + "step": 799 + }, + { + "epoch": 0.4032258064516129, + "grad_norm": 0.25518959182252676, + "learning_rate": 9.650738680215237e-07, + "loss": 0.2926, + "step": 800 + }, + { + "epoch": 0.40372983870967744, + "grad_norm": 0.18835961947528512, + "learning_rate": 9.649751903014117e-07, + "loss": 0.2894, + "step": 801 + }, + { + "epoch": 0.40423387096774194, + "grad_norm": 0.17984205410768345, + "learning_rate": 9.648763784388375e-07, + "loss": 0.2857, + "step": 802 + }, + { + "epoch": 0.40473790322580644, + "grad_norm": 0.19376522082181055, + "learning_rate": 9.647774324623082e-07, + "loss": 0.2956, + "step": 803 + }, + { + "epoch": 0.40524193548387094, + "grad_norm": 0.2473519213828851, + "learning_rate": 9.646783524003684e-07, + "loss": 0.2816, + "step": 804 + }, + { + "epoch": 0.4057459677419355, + "grad_norm": 0.4135144683121957, + "learning_rate": 9.645791382816026e-07, + "loss": 0.2794, + "step": 805 + }, + { + "epoch": 0.40625, + "grad_norm": 0.22178842307704266, + "learning_rate": 9.644797901346333e-07, + "loss": 0.2859, + "step": 806 + }, + { + "epoch": 0.4067540322580645, + "grad_norm": 0.1865025598672438, + "learning_rate": 9.64380307988122e-07, + "loss": 0.2826, + "step": 807 + }, + { + "epoch": 0.40725806451612906, + "grad_norm": 0.21780748939537628, + "learning_rate": 9.642806918707685e-07, + "loss": 0.3003, + "step": 808 + }, + { + "epoch": 0.40776209677419356, + "grad_norm": 0.23524333096509437, + "learning_rate": 9.641809418113113e-07, + "loss": 0.2731, + "step": 809 + }, + { + "epoch": 0.40826612903225806, + "grad_norm": 0.3522546815652214, + "learning_rate": 9.64081057838528e-07, + "loss": 0.3006, + "step": 810 + }, + { + "epoch": 0.40877016129032256, + "grad_norm": 0.2060231832092746, + "learning_rate": 9.63981039981234e-07, + "loss": 0.2889, + "step": 811 + }, + { + "epoch": 0.4092741935483871, + "grad_norm": 0.17805134176779017, + "learning_rate": 9.638808882682845e-07, + "loss": 0.28, + "step": 812 + }, + { + "epoch": 0.4097782258064516, + "grad_norm": 0.21338586685342792, + "learning_rate": 9.637806027285721e-07, + "loss": 0.3266, + "step": 813 + }, + { + "epoch": 0.4102822580645161, + "grad_norm": 0.23761874762782823, + "learning_rate": 9.636801833910291e-07, + "loss": 0.2928, + "step": 814 + }, + { + "epoch": 0.4107862903225806, + "grad_norm": 0.349479056634183, + "learning_rate": 9.635796302846253e-07, + "loss": 0.2893, + "step": 815 + }, + { + "epoch": 0.4112903225806452, + "grad_norm": 0.1793918088824337, + "learning_rate": 9.6347894343837e-07, + "loss": 0.3022, + "step": 816 + }, + { + "epoch": 0.4117943548387097, + "grad_norm": 0.20651527805989847, + "learning_rate": 9.633781228813107e-07, + "loss": 0.289, + "step": 817 + }, + { + "epoch": 0.4122983870967742, + "grad_norm": 0.2120940285959409, + "learning_rate": 9.63277168642533e-07, + "loss": 0.2796, + "step": 818 + }, + { + "epoch": 0.4128024193548387, + "grad_norm": 0.18644947273341353, + "learning_rate": 9.631760807511624e-07, + "loss": 0.2823, + "step": 819 + }, + { + "epoch": 0.41330645161290325, + "grad_norm": 0.2559146778083527, + "learning_rate": 9.630748592363617e-07, + "loss": 0.3102, + "step": 820 + }, + { + "epoch": 0.41381048387096775, + "grad_norm": 0.20144278166717078, + "learning_rate": 9.629735041273325e-07, + "loss": 0.2949, + "step": 821 + }, + { + "epoch": 0.41431451612903225, + "grad_norm": 0.22358896888220894, + "learning_rate": 9.628720154533157e-07, + "loss": 0.2809, + "step": 822 + }, + { + "epoch": 0.41481854838709675, + "grad_norm": 0.2056875307546662, + "learning_rate": 9.627703932435895e-07, + "loss": 0.2838, + "step": 823 + }, + { + "epoch": 0.4153225806451613, + "grad_norm": 0.199611316920199, + "learning_rate": 9.626686375274715e-07, + "loss": 0.2731, + "step": 824 + }, + { + "epoch": 0.4158266129032258, + "grad_norm": 0.17872361611126808, + "learning_rate": 9.625667483343177e-07, + "loss": 0.2885, + "step": 825 + }, + { + "epoch": 0.4163306451612903, + "grad_norm": 0.1715080401060158, + "learning_rate": 9.624647256935226e-07, + "loss": 0.2918, + "step": 826 + }, + { + "epoch": 0.4168346774193548, + "grad_norm": 0.28942913404328185, + "learning_rate": 9.623625696345187e-07, + "loss": 0.2909, + "step": 827 + }, + { + "epoch": 0.4173387096774194, + "grad_norm": 0.5219653514726549, + "learning_rate": 9.62260280186778e-07, + "loss": 0.2917, + "step": 828 + }, + { + "epoch": 0.4178427419354839, + "grad_norm": 0.1803172885571653, + "learning_rate": 9.621578573798098e-07, + "loss": 0.29, + "step": 829 + }, + { + "epoch": 0.4183467741935484, + "grad_norm": 0.26437521269828795, + "learning_rate": 9.620553012431626e-07, + "loss": 0.2962, + "step": 830 + }, + { + "epoch": 0.4188508064516129, + "grad_norm": 0.22357063643965017, + "learning_rate": 9.619526118064234e-07, + "loss": 0.2841, + "step": 831 + }, + { + "epoch": 0.41935483870967744, + "grad_norm": 0.27227090245290375, + "learning_rate": 9.618497890992171e-07, + "loss": 0.2791, + "step": 832 + }, + { + "epoch": 0.41985887096774194, + "grad_norm": 0.18400884656891223, + "learning_rate": 9.61746833151208e-07, + "loss": 0.2922, + "step": 833 + }, + { + "epoch": 0.42036290322580644, + "grad_norm": 0.17737272960072972, + "learning_rate": 9.616437439920977e-07, + "loss": 0.2941, + "step": 834 + }, + { + "epoch": 0.42086693548387094, + "grad_norm": 0.1845994861654205, + "learning_rate": 9.61540521651627e-07, + "loss": 0.2993, + "step": 835 + }, + { + "epoch": 0.4213709677419355, + "grad_norm": 0.184243652976245, + "learning_rate": 9.61437166159575e-07, + "loss": 0.2954, + "step": 836 + }, + { + "epoch": 0.421875, + "grad_norm": 0.1962394295882692, + "learning_rate": 9.61333677545759e-07, + "loss": 0.2797, + "step": 837 + }, + { + "epoch": 0.4223790322580645, + "grad_norm": 0.19907302059194398, + "learning_rate": 9.612300558400348e-07, + "loss": 0.2913, + "step": 838 + }, + { + "epoch": 0.42288306451612906, + "grad_norm": 0.17791364001664575, + "learning_rate": 9.611263010722968e-07, + "loss": 0.308, + "step": 839 + }, + { + "epoch": 0.42338709677419356, + "grad_norm": 0.20244623183338803, + "learning_rate": 9.610224132724772e-07, + "loss": 0.2984, + "step": 840 + }, + { + "epoch": 0.42389112903225806, + "grad_norm": 0.1821553749592407, + "learning_rate": 9.609183924705473e-07, + "loss": 0.276, + "step": 841 + }, + { + "epoch": 0.42439516129032256, + "grad_norm": 0.18302758905749872, + "learning_rate": 9.608142386965166e-07, + "loss": 0.2886, + "step": 842 + }, + { + "epoch": 0.4248991935483871, + "grad_norm": 0.2814755691064198, + "learning_rate": 9.607099519804325e-07, + "loss": 0.2996, + "step": 843 + }, + { + "epoch": 0.4254032258064516, + "grad_norm": 0.2188163570039596, + "learning_rate": 9.60605532352381e-07, + "loss": 0.2849, + "step": 844 + }, + { + "epoch": 0.4259072580645161, + "grad_norm": 0.19063373332278225, + "learning_rate": 9.605009798424871e-07, + "loss": 0.2826, + "step": 845 + }, + { + "epoch": 0.4264112903225806, + "grad_norm": 0.22803495479430694, + "learning_rate": 9.60396294480913e-07, + "loss": 0.2881, + "step": 846 + }, + { + "epoch": 0.4269153225806452, + "grad_norm": 0.2532580997944776, + "learning_rate": 9.6029147629786e-07, + "loss": 0.2931, + "step": 847 + }, + { + "epoch": 0.4274193548387097, + "grad_norm": 0.29895228840340143, + "learning_rate": 9.601865253235673e-07, + "loss": 0.2851, + "step": 848 + }, + { + "epoch": 0.4279233870967742, + "grad_norm": 0.19966385962690142, + "learning_rate": 9.60081441588313e-07, + "loss": 0.2977, + "step": 849 + }, + { + "epoch": 0.4284274193548387, + "grad_norm": 0.3898444466529591, + "learning_rate": 9.599762251224125e-07, + "loss": 0.3104, + "step": 850 + }, + { + "epoch": 0.42893145161290325, + "grad_norm": 0.24259677387671844, + "learning_rate": 9.598708759562208e-07, + "loss": 0.2987, + "step": 851 + }, + { + "epoch": 0.42943548387096775, + "grad_norm": 0.20006735807640816, + "learning_rate": 9.5976539412013e-07, + "loss": 0.2864, + "step": 852 + }, + { + "epoch": 0.42993951612903225, + "grad_norm": 0.2882245398429841, + "learning_rate": 9.59659779644571e-07, + "loss": 0.2858, + "step": 853 + }, + { + "epoch": 0.43044354838709675, + "grad_norm": 0.22513281216087977, + "learning_rate": 9.59554032560013e-07, + "loss": 0.2824, + "step": 854 + }, + { + "epoch": 0.4309475806451613, + "grad_norm": 0.26571888341504696, + "learning_rate": 9.594481528969635e-07, + "loss": 0.3086, + "step": 855 + }, + { + "epoch": 0.4314516129032258, + "grad_norm": 0.19249797160463983, + "learning_rate": 9.59342140685968e-07, + "loss": 0.2889, + "step": 856 + }, + { + "epoch": 0.4319556451612903, + "grad_norm": 0.17883290237014954, + "learning_rate": 9.592359959576104e-07, + "loss": 0.2913, + "step": 857 + }, + { + "epoch": 0.4324596774193548, + "grad_norm": 0.38963242443484597, + "learning_rate": 9.591297187425128e-07, + "loss": 0.2905, + "step": 858 + }, + { + "epoch": 0.4329637096774194, + "grad_norm": 0.4499390873499669, + "learning_rate": 9.590233090713354e-07, + "loss": 0.3016, + "step": 859 + }, + { + "epoch": 0.4334677419354839, + "grad_norm": 0.23147026645054392, + "learning_rate": 9.58916766974777e-07, + "loss": 0.2753, + "step": 860 + }, + { + "epoch": 0.4339717741935484, + "grad_norm": 0.23004776559869194, + "learning_rate": 9.58810092483574e-07, + "loss": 0.2857, + "step": 861 + }, + { + "epoch": 0.4344758064516129, + "grad_norm": 0.38524648235237546, + "learning_rate": 9.587032856285016e-07, + "loss": 0.295, + "step": 862 + }, + { + "epoch": 0.43497983870967744, + "grad_norm": 0.26978296400638296, + "learning_rate": 9.585963464403727e-07, + "loss": 0.2924, + "step": 863 + }, + { + "epoch": 0.43548387096774194, + "grad_norm": 0.2304216233656417, + "learning_rate": 9.584892749500388e-07, + "loss": 0.2782, + "step": 864 + }, + { + "epoch": 0.43598790322580644, + "grad_norm": 0.20628041373927403, + "learning_rate": 9.58382071188389e-07, + "loss": 0.2938, + "step": 865 + }, + { + "epoch": 0.43649193548387094, + "grad_norm": 0.24806721102476476, + "learning_rate": 9.582747351863518e-07, + "loss": 0.3051, + "step": 866 + }, + { + "epoch": 0.4369959677419355, + "grad_norm": 0.18289677229578133, + "learning_rate": 9.58167266974892e-07, + "loss": 0.2928, + "step": 867 + }, + { + "epoch": 0.4375, + "grad_norm": 0.20539381909701737, + "learning_rate": 9.580596665850139e-07, + "loss": 0.2873, + "step": 868 + }, + { + "epoch": 0.4380040322580645, + "grad_norm": 0.18821302140885868, + "learning_rate": 9.579519340477592e-07, + "loss": 0.2903, + "step": 869 + }, + { + "epoch": 0.43850806451612906, + "grad_norm": 0.37407049401588166, + "learning_rate": 9.57844069394209e-07, + "loss": 0.2931, + "step": 870 + }, + { + "epoch": 0.43901209677419356, + "grad_norm": 0.2339579704921313, + "learning_rate": 9.577360726554804e-07, + "loss": 0.2847, + "step": 871 + }, + { + "epoch": 0.43951612903225806, + "grad_norm": 0.23524427674559492, + "learning_rate": 9.576279438627308e-07, + "loss": 0.2754, + "step": 872 + }, + { + "epoch": 0.44002016129032256, + "grad_norm": 0.18413711797969656, + "learning_rate": 9.57519683047154e-07, + "loss": 0.2893, + "step": 873 + }, + { + "epoch": 0.4405241935483871, + "grad_norm": 0.22031589128909412, + "learning_rate": 9.574112902399829e-07, + "loss": 0.2884, + "step": 874 + }, + { + "epoch": 0.4410282258064516, + "grad_norm": 0.22755059592326796, + "learning_rate": 9.573027654724882e-07, + "loss": 0.29, + "step": 875 + }, + { + "epoch": 0.4415322580645161, + "grad_norm": 0.18373793370970248, + "learning_rate": 9.571941087759782e-07, + "loss": 0.3064, + "step": 876 + }, + { + "epoch": 0.4420362903225806, + "grad_norm": 0.19409387364473213, + "learning_rate": 9.570853201818002e-07, + "loss": 0.2922, + "step": 877 + }, + { + "epoch": 0.4425403225806452, + "grad_norm": 0.21129036924339822, + "learning_rate": 9.569763997213387e-07, + "loss": 0.2714, + "step": 878 + }, + { + "epoch": 0.4430443548387097, + "grad_norm": 0.17440531697354897, + "learning_rate": 9.568673474260168e-07, + "loss": 0.2847, + "step": 879 + }, + { + "epoch": 0.4435483870967742, + "grad_norm": 0.18668182286896606, + "learning_rate": 9.567581633272955e-07, + "loss": 0.3118, + "step": 880 + }, + { + "epoch": 0.4440524193548387, + "grad_norm": 0.1797795667764904, + "learning_rate": 9.56648847456673e-07, + "loss": 0.3019, + "step": 881 + }, + { + "epoch": 0.44455645161290325, + "grad_norm": 0.3707822100666679, + "learning_rate": 9.565393998456874e-07, + "loss": 0.2912, + "step": 882 + }, + { + "epoch": 0.44506048387096775, + "grad_norm": 0.289665390914343, + "learning_rate": 9.564298205259126e-07, + "loss": 0.2895, + "step": 883 + }, + { + "epoch": 0.44556451612903225, + "grad_norm": 0.2056754789971548, + "learning_rate": 9.563201095289624e-07, + "loss": 0.298, + "step": 884 + }, + { + "epoch": 0.44606854838709675, + "grad_norm": 0.1814615855914537, + "learning_rate": 9.562102668864871e-07, + "loss": 0.2934, + "step": 885 + }, + { + "epoch": 0.4465725806451613, + "grad_norm": 0.37487748973707236, + "learning_rate": 9.56100292630176e-07, + "loss": 0.2973, + "step": 886 + }, + { + "epoch": 0.4470766129032258, + "grad_norm": 0.1759000528091719, + "learning_rate": 9.559901867917556e-07, + "loss": 0.3039, + "step": 887 + }, + { + "epoch": 0.4475806451612903, + "grad_norm": 0.32191288919848915, + "learning_rate": 9.558799494029914e-07, + "loss": 0.2893, + "step": 888 + }, + { + "epoch": 0.4480846774193548, + "grad_norm": 0.19810987185465256, + "learning_rate": 9.557695804956856e-07, + "loss": 0.3127, + "step": 889 + }, + { + "epoch": 0.4485887096774194, + "grad_norm": 0.24720381123354968, + "learning_rate": 9.556590801016793e-07, + "loss": 0.2918, + "step": 890 + }, + { + "epoch": 0.4490927419354839, + "grad_norm": 0.18168928920802488, + "learning_rate": 9.555484482528508e-07, + "loss": 0.2881, + "step": 891 + }, + { + "epoch": 0.4495967741935484, + "grad_norm": 0.18687548855169936, + "learning_rate": 9.554376849811173e-07, + "loss": 0.2905, + "step": 892 + }, + { + "epoch": 0.4501008064516129, + "grad_norm": 0.20251482097557522, + "learning_rate": 9.553267903184327e-07, + "loss": 0.2939, + "step": 893 + }, + { + "epoch": 0.45060483870967744, + "grad_norm": 0.2887402542876595, + "learning_rate": 9.552157642967897e-07, + "loss": 0.2878, + "step": 894 + }, + { + "epoch": 0.45110887096774194, + "grad_norm": 0.19945034327270725, + "learning_rate": 9.551046069482186e-07, + "loss": 0.2845, + "step": 895 + }, + { + "epoch": 0.45161290322580644, + "grad_norm": 0.23431424692187636, + "learning_rate": 9.549933183047877e-07, + "loss": 0.2917, + "step": 896 + }, + { + "epoch": 0.45211693548387094, + "grad_norm": 0.2190333169890677, + "learning_rate": 9.54881898398603e-07, + "loss": 0.2952, + "step": 897 + }, + { + "epoch": 0.4526209677419355, + "grad_norm": 0.17822949705219454, + "learning_rate": 9.547703472618086e-07, + "loss": 0.299, + "step": 898 + }, + { + "epoch": 0.453125, + "grad_norm": 0.2102549610365594, + "learning_rate": 9.54658664926586e-07, + "loss": 0.2935, + "step": 899 + }, + { + "epoch": 0.4536290322580645, + "grad_norm": 0.19289696781722984, + "learning_rate": 9.545468514251552e-07, + "loss": 0.2998, + "step": 900 + }, + { + "epoch": 0.45413306451612906, + "grad_norm": 0.3042982461642612, + "learning_rate": 9.544349067897734e-07, + "loss": 0.2937, + "step": 901 + }, + { + "epoch": 0.45463709677419356, + "grad_norm": 0.19107058828557277, + "learning_rate": 9.54322831052736e-07, + "loss": 0.2765, + "step": 902 + }, + { + "epoch": 0.45514112903225806, + "grad_norm": 0.28747582105179686, + "learning_rate": 9.542106242463764e-07, + "loss": 0.2833, + "step": 903 + }, + { + "epoch": 0.45564516129032256, + "grad_norm": 0.18458129956622982, + "learning_rate": 9.540982864030653e-07, + "loss": 0.2831, + "step": 904 + }, + { + "epoch": 0.4561491935483871, + "grad_norm": 0.2130309759800133, + "learning_rate": 9.539858175552115e-07, + "loss": 0.2836, + "step": 905 + }, + { + "epoch": 0.4566532258064516, + "grad_norm": 0.3057883689836788, + "learning_rate": 9.538732177352617e-07, + "loss": 0.3007, + "step": 906 + }, + { + "epoch": 0.4571572580645161, + "grad_norm": 0.31124910837676767, + "learning_rate": 9.537604869757001e-07, + "loss": 0.2764, + "step": 907 + }, + { + "epoch": 0.4576612903225806, + "grad_norm": 0.2441004343752962, + "learning_rate": 9.53647625309049e-07, + "loss": 0.2868, + "step": 908 + }, + { + "epoch": 0.4581653225806452, + "grad_norm": 0.2357500626232433, + "learning_rate": 9.535346327678682e-07, + "loss": 0.3016, + "step": 909 + }, + { + "epoch": 0.4586693548387097, + "grad_norm": 0.20066160584984583, + "learning_rate": 9.534215093847552e-07, + "loss": 0.3001, + "step": 910 + }, + { + "epoch": 0.4591733870967742, + "grad_norm": 0.22773495544084632, + "learning_rate": 9.533082551923458e-07, + "loss": 0.2756, + "step": 911 + }, + { + "epoch": 0.4596774193548387, + "grad_norm": 0.1906199840165581, + "learning_rate": 9.531948702233126e-07, + "loss": 0.2869, + "step": 912 + }, + { + "epoch": 0.46018145161290325, + "grad_norm": 0.3245639131414002, + "learning_rate": 9.530813545103667e-07, + "loss": 0.2708, + "step": 913 + }, + { + "epoch": 0.46068548387096775, + "grad_norm": 0.21271580471854526, + "learning_rate": 9.52967708086257e-07, + "loss": 0.2892, + "step": 914 + }, + { + "epoch": 0.46118951612903225, + "grad_norm": 0.2104148343959223, + "learning_rate": 9.528539309837693e-07, + "loss": 0.2877, + "step": 915 + }, + { + "epoch": 0.46169354838709675, + "grad_norm": 0.19824185563673147, + "learning_rate": 9.527400232357279e-07, + "loss": 0.276, + "step": 916 + }, + { + "epoch": 0.4621975806451613, + "grad_norm": 0.21951663687873915, + "learning_rate": 9.526259848749943e-07, + "loss": 0.2859, + "step": 917 + }, + { + "epoch": 0.4627016129032258, + "grad_norm": 0.37545971476160844, + "learning_rate": 9.52511815934468e-07, + "loss": 0.2774, + "step": 918 + }, + { + "epoch": 0.4632056451612903, + "grad_norm": 0.22438052196032507, + "learning_rate": 9.523975164470859e-07, + "loss": 0.2978, + "step": 919 + }, + { + "epoch": 0.4637096774193548, + "grad_norm": 0.1757823732376565, + "learning_rate": 9.522830864458227e-07, + "loss": 0.3022, + "step": 920 + }, + { + "epoch": 0.4642137096774194, + "grad_norm": 0.27657899716696727, + "learning_rate": 9.521685259636909e-07, + "loss": 0.2944, + "step": 921 + }, + { + "epoch": 0.4647177419354839, + "grad_norm": 0.18756079463012706, + "learning_rate": 9.520538350337404e-07, + "loss": 0.2884, + "step": 922 + }, + { + "epoch": 0.4652217741935484, + "grad_norm": 0.2062606714488354, + "learning_rate": 9.519390136890589e-07, + "loss": 0.2778, + "step": 923 + }, + { + "epoch": 0.4657258064516129, + "grad_norm": 0.366011125828711, + "learning_rate": 9.518240619627713e-07, + "loss": 0.3001, + "step": 924 + }, + { + "epoch": 0.46622983870967744, + "grad_norm": 0.2541955304250703, + "learning_rate": 9.51708979888041e-07, + "loss": 0.2905, + "step": 925 + }, + { + "epoch": 0.46673387096774194, + "grad_norm": 0.36412234331970916, + "learning_rate": 9.51593767498068e-07, + "loss": 0.2822, + "step": 926 + }, + { + "epoch": 0.46723790322580644, + "grad_norm": 0.21139676616548536, + "learning_rate": 9.514784248260908e-07, + "loss": 0.3045, + "step": 927 + }, + { + "epoch": 0.46774193548387094, + "grad_norm": 0.19079104145467554, + "learning_rate": 9.513629519053845e-07, + "loss": 0.2737, + "step": 928 + }, + { + "epoch": 0.4682459677419355, + "grad_norm": 0.23607892317649437, + "learning_rate": 9.512473487692628e-07, + "loss": 0.2999, + "step": 929 + }, + { + "epoch": 0.46875, + "grad_norm": 0.1803840363558066, + "learning_rate": 9.511316154510763e-07, + "loss": 0.287, + "step": 930 + }, + { + "epoch": 0.4692540322580645, + "grad_norm": 0.17453949054667584, + "learning_rate": 9.510157519842133e-07, + "loss": 0.2851, + "step": 931 + }, + { + "epoch": 0.46975806451612906, + "grad_norm": 0.24449967092054903, + "learning_rate": 9.508997584020997e-07, + "loss": 0.2872, + "step": 932 + }, + { + "epoch": 0.47026209677419356, + "grad_norm": 0.1745644546122727, + "learning_rate": 9.507836347381992e-07, + "loss": 0.3005, + "step": 933 + }, + { + "epoch": 0.47076612903225806, + "grad_norm": 0.18053174508073816, + "learning_rate": 9.506673810260123e-07, + "loss": 0.2929, + "step": 934 + }, + { + "epoch": 0.47127016129032256, + "grad_norm": 0.31029290882211796, + "learning_rate": 9.505509972990778e-07, + "loss": 0.2926, + "step": 935 + }, + { + "epoch": 0.4717741935483871, + "grad_norm": 0.22938531150844882, + "learning_rate": 9.504344835909716e-07, + "loss": 0.2885, + "step": 936 + }, + { + "epoch": 0.4722782258064516, + "grad_norm": 0.17882862009471923, + "learning_rate": 9.503178399353072e-07, + "loss": 0.2743, + "step": 937 + }, + { + "epoch": 0.4727822580645161, + "grad_norm": 0.1854810156795896, + "learning_rate": 9.502010663657354e-07, + "loss": 0.2955, + "step": 938 + }, + { + "epoch": 0.4732862903225806, + "grad_norm": 0.21043848944429472, + "learning_rate": 9.50084162915945e-07, + "loss": 0.2964, + "step": 939 + }, + { + "epoch": 0.4737903225806452, + "grad_norm": 0.2847988534984117, + "learning_rate": 9.499671296196617e-07, + "loss": 0.285, + "step": 940 + }, + { + "epoch": 0.4742943548387097, + "grad_norm": 0.25216723077802655, + "learning_rate": 9.498499665106487e-07, + "loss": 0.2836, + "step": 941 + }, + { + "epoch": 0.4747983870967742, + "grad_norm": 0.18397402571778532, + "learning_rate": 9.497326736227071e-07, + "loss": 0.2937, + "step": 942 + }, + { + "epoch": 0.4753024193548387, + "grad_norm": 0.28708542295201006, + "learning_rate": 9.496152509896753e-07, + "loss": 0.2851, + "step": 943 + }, + { + "epoch": 0.47580645161290325, + "grad_norm": 0.2460818429957608, + "learning_rate": 9.494976986454286e-07, + "loss": 0.29, + "step": 944 + }, + { + "epoch": 0.47631048387096775, + "grad_norm": 0.19878718222512176, + "learning_rate": 9.493800166238805e-07, + "loss": 0.2917, + "step": 945 + }, + { + "epoch": 0.47681451612903225, + "grad_norm": 0.22398506177876373, + "learning_rate": 9.492622049589812e-07, + "loss": 0.3093, + "step": 946 + }, + { + "epoch": 0.47731854838709675, + "grad_norm": 0.19821540893857262, + "learning_rate": 9.491442636847189e-07, + "loss": 0.2946, + "step": 947 + }, + { + "epoch": 0.4778225806451613, + "grad_norm": 0.19307685380255402, + "learning_rate": 9.490261928351189e-07, + "loss": 0.2969, + "step": 948 + }, + { + "epoch": 0.4783266129032258, + "grad_norm": 0.216166144258633, + "learning_rate": 9.489079924442438e-07, + "loss": 0.2981, + "step": 949 + }, + { + "epoch": 0.4788306451612903, + "grad_norm": 0.17612196294528637, + "learning_rate": 9.487896625461935e-07, + "loss": 0.2907, + "step": 950 + }, + { + "epoch": 0.4793346774193548, + "grad_norm": 0.2589705775054588, + "learning_rate": 9.486712031751058e-07, + "loss": 0.3049, + "step": 951 + }, + { + "epoch": 0.4798387096774194, + "grad_norm": 0.24483548456140278, + "learning_rate": 9.485526143651555e-07, + "loss": 0.3105, + "step": 952 + }, + { + "epoch": 0.4803427419354839, + "grad_norm": 0.20974832200044807, + "learning_rate": 9.484338961505544e-07, + "loss": 0.2935, + "step": 953 + }, + { + "epoch": 0.4808467741935484, + "grad_norm": 0.17639810431043276, + "learning_rate": 9.483150485655523e-07, + "loss": 0.2818, + "step": 954 + }, + { + "epoch": 0.4813508064516129, + "grad_norm": 0.17457844069908712, + "learning_rate": 9.481960716444358e-07, + "loss": 0.2954, + "step": 955 + }, + { + "epoch": 0.48185483870967744, + "grad_norm": 0.17714100105123767, + "learning_rate": 9.480769654215291e-07, + "loss": 0.2811, + "step": 956 + }, + { + "epoch": 0.48235887096774194, + "grad_norm": 0.1958920141207276, + "learning_rate": 9.479577299311934e-07, + "loss": 0.2898, + "step": 957 + }, + { + "epoch": 0.48286290322580644, + "grad_norm": 0.2882352875771043, + "learning_rate": 9.478383652078277e-07, + "loss": 0.296, + "step": 958 + }, + { + "epoch": 0.48336693548387094, + "grad_norm": 0.21959333859610755, + "learning_rate": 9.477188712858679e-07, + "loss": 0.2816, + "step": 959 + }, + { + "epoch": 0.4838709677419355, + "grad_norm": 0.33566206173811586, + "learning_rate": 9.475992481997872e-07, + "loss": 0.2957, + "step": 960 + }, + { + "epoch": 0.484375, + "grad_norm": 0.24126700688554154, + "learning_rate": 9.474794959840959e-07, + "loss": 0.2913, + "step": 961 + }, + { + "epoch": 0.4848790322580645, + "grad_norm": 0.25752819972368163, + "learning_rate": 9.473596146733422e-07, + "loss": 0.2815, + "step": 962 + }, + { + "epoch": 0.48538306451612906, + "grad_norm": 0.19963355802829852, + "learning_rate": 9.472396043021109e-07, + "loss": 0.2859, + "step": 963 + }, + { + "epoch": 0.48588709677419356, + "grad_norm": 0.1726400189184517, + "learning_rate": 9.471194649050243e-07, + "loss": 0.2932, + "step": 964 + }, + { + "epoch": 0.48639112903225806, + "grad_norm": 0.1769030343965752, + "learning_rate": 9.469991965167416e-07, + "loss": 0.3046, + "step": 965 + }, + { + "epoch": 0.48689516129032256, + "grad_norm": 0.19108815400810478, + "learning_rate": 9.4687879917196e-07, + "loss": 0.2912, + "step": 966 + }, + { + "epoch": 0.4873991935483871, + "grad_norm": 0.20802165343766263, + "learning_rate": 9.46758272905413e-07, + "loss": 0.2895, + "step": 967 + }, + { + "epoch": 0.4879032258064516, + "grad_norm": 0.21787519659072058, + "learning_rate": 9.466376177518718e-07, + "loss": 0.287, + "step": 968 + }, + { + "epoch": 0.4884072580645161, + "grad_norm": 0.1761978641368217, + "learning_rate": 9.465168337461447e-07, + "loss": 0.2778, + "step": 969 + }, + { + "epoch": 0.4889112903225806, + "grad_norm": 0.20346855806980782, + "learning_rate": 9.46395920923077e-07, + "loss": 0.2868, + "step": 970 + }, + { + "epoch": 0.4894153225806452, + "grad_norm": 0.1746303170409007, + "learning_rate": 9.462748793175514e-07, + "loss": 0.2853, + "step": 971 + }, + { + "epoch": 0.4899193548387097, + "grad_norm": 0.27860715367389927, + "learning_rate": 9.461537089644876e-07, + "loss": 0.2984, + "step": 972 + }, + { + "epoch": 0.4904233870967742, + "grad_norm": 0.17619075334859385, + "learning_rate": 9.460324098988426e-07, + "loss": 0.2902, + "step": 973 + }, + { + "epoch": 0.4909274193548387, + "grad_norm": 0.2092460695723593, + "learning_rate": 9.459109821556104e-07, + "loss": 0.2893, + "step": 974 + }, + { + "epoch": 0.49143145161290325, + "grad_norm": 0.19009628066171583, + "learning_rate": 9.457894257698221e-07, + "loss": 0.289, + "step": 975 + }, + { + "epoch": 0.49193548387096775, + "grad_norm": 0.19214552662936102, + "learning_rate": 9.456677407765459e-07, + "loss": 0.2914, + "step": 976 + }, + { + "epoch": 0.49243951612903225, + "grad_norm": 0.21669299426430066, + "learning_rate": 9.455459272108873e-07, + "loss": 0.2829, + "step": 977 + }, + { + "epoch": 0.49294354838709675, + "grad_norm": 0.22291135020577527, + "learning_rate": 9.454239851079888e-07, + "loss": 0.2927, + "step": 978 + }, + { + "epoch": 0.4934475806451613, + "grad_norm": 0.1874075382197213, + "learning_rate": 9.453019145030296e-07, + "loss": 0.2821, + "step": 979 + }, + { + "epoch": 0.4939516129032258, + "grad_norm": 0.17233245804235334, + "learning_rate": 9.451797154312269e-07, + "loss": 0.2764, + "step": 980 + }, + { + "epoch": 0.4944556451612903, + "grad_norm": 0.1834747384575909, + "learning_rate": 9.450573879278338e-07, + "loss": 0.2735, + "step": 981 + }, + { + "epoch": 0.4949596774193548, + "grad_norm": 0.17012166060463613, + "learning_rate": 9.449349320281414e-07, + "loss": 0.2676, + "step": 982 + }, + { + "epoch": 0.4954637096774194, + "grad_norm": 0.24518957923359821, + "learning_rate": 9.448123477674773e-07, + "loss": 0.2964, + "step": 983 + }, + { + "epoch": 0.4959677419354839, + "grad_norm": 0.2037237351416288, + "learning_rate": 9.446896351812064e-07, + "loss": 0.2989, + "step": 984 + }, + { + "epoch": 0.4964717741935484, + "grad_norm": 0.17254543605119085, + "learning_rate": 9.445667943047303e-07, + "loss": 0.2816, + "step": 985 + }, + { + "epoch": 0.4969758064516129, + "grad_norm": 0.17270938767860752, + "learning_rate": 9.444438251734881e-07, + "loss": 0.29, + "step": 986 + }, + { + "epoch": 0.49747983870967744, + "grad_norm": 0.18549793090433508, + "learning_rate": 9.443207278229554e-07, + "loss": 0.2938, + "step": 987 + }, + { + "epoch": 0.49798387096774194, + "grad_norm": 0.17378591850071334, + "learning_rate": 9.441975022886453e-07, + "loss": 0.2863, + "step": 988 + }, + { + "epoch": 0.49848790322580644, + "grad_norm": 0.2036952349756055, + "learning_rate": 9.440741486061075e-07, + "loss": 0.2979, + "step": 989 + }, + { + "epoch": 0.49899193548387094, + "grad_norm": 0.24105084066474844, + "learning_rate": 9.439506668109284e-07, + "loss": 0.2938, + "step": 990 + }, + { + "epoch": 0.4994959677419355, + "grad_norm": 0.18506171017034273, + "learning_rate": 9.438270569387323e-07, + "loss": 0.2985, + "step": 991 + }, + { + "epoch": 0.5, + "grad_norm": 0.22764502062218736, + "learning_rate": 9.437033190251797e-07, + "loss": 0.2936, + "step": 992 + }, + { + "epoch": 0.5005040322580645, + "grad_norm": 0.1987156755550509, + "learning_rate": 9.43579453105968e-07, + "loss": 0.2941, + "step": 993 + }, + { + "epoch": 0.501008064516129, + "grad_norm": 0.18473840920584347, + "learning_rate": 9.43455459216832e-07, + "loss": 0.2951, + "step": 994 + }, + { + "epoch": 0.5015120967741935, + "grad_norm": 0.18615931801205335, + "learning_rate": 9.433313373935429e-07, + "loss": 0.2864, + "step": 995 + }, + { + "epoch": 0.5015120967741935, + "eval_loss": 0.3215525150299072, + "eval_runtime": 17.0311, + "eval_samples_per_second": 50.202, + "eval_steps_per_second": 1.057, + "step": 995 + }, + { + "epoch": 0.5020161290322581, + "grad_norm": 0.18646346465182403, + "learning_rate": 9.432070876719095e-07, + "loss": 0.2873, + "step": 996 + }, + { + "epoch": 0.5025201612903226, + "grad_norm": 0.3058678234050536, + "learning_rate": 9.430827100877767e-07, + "loss": 0.2898, + "step": 997 + }, + { + "epoch": 0.5030241935483871, + "grad_norm": 0.20468910289808356, + "learning_rate": 9.429582046770268e-07, + "loss": 0.2961, + "step": 998 + }, + { + "epoch": 0.5035282258064516, + "grad_norm": 0.18693796681842775, + "learning_rate": 9.428335714755788e-07, + "loss": 0.2917, + "step": 999 + }, + { + "epoch": 0.5040322580645161, + "grad_norm": 0.21831422621342852, + "learning_rate": 9.427088105193888e-07, + "loss": 0.2921, + "step": 1000 + }, + { + "epoch": 0.5045362903225806, + "grad_norm": 0.2733584938342229, + "learning_rate": 9.425839218444492e-07, + "loss": 0.2848, + "step": 1001 + }, + { + "epoch": 0.5050403225806451, + "grad_norm": 0.22194832047757593, + "learning_rate": 9.424589054867899e-07, + "loss": 0.2816, + "step": 1002 + }, + { + "epoch": 0.5055443548387096, + "grad_norm": 0.2631541311468527, + "learning_rate": 9.423337614824772e-07, + "loss": 0.2985, + "step": 1003 + }, + { + "epoch": 0.5060483870967742, + "grad_norm": 0.23862558481556265, + "learning_rate": 9.422084898676146e-07, + "loss": 0.2925, + "step": 1004 + }, + { + "epoch": 0.5065524193548387, + "grad_norm": 0.18127086379393875, + "learning_rate": 9.420830906783418e-07, + "loss": 0.2893, + "step": 1005 + }, + { + "epoch": 0.5070564516129032, + "grad_norm": 0.23817820439070797, + "learning_rate": 9.419575639508359e-07, + "loss": 0.2912, + "step": 1006 + }, + { + "epoch": 0.5075604838709677, + "grad_norm": 0.19504523429748843, + "learning_rate": 9.418319097213108e-07, + "loss": 0.287, + "step": 1007 + }, + { + "epoch": 0.5080645161290323, + "grad_norm": 0.20304831048415597, + "learning_rate": 9.417061280260165e-07, + "loss": 0.2804, + "step": 1008 + }, + { + "epoch": 0.5085685483870968, + "grad_norm": 0.22706657240299058, + "learning_rate": 9.415802189012407e-07, + "loss": 0.3014, + "step": 1009 + }, + { + "epoch": 0.5090725806451613, + "grad_norm": 0.1859609106588216, + "learning_rate": 9.414541823833071e-07, + "loss": 0.2984, + "step": 1010 + }, + { + "epoch": 0.5095766129032258, + "grad_norm": 0.1734707367992668, + "learning_rate": 9.413280185085766e-07, + "loss": 0.2717, + "step": 1011 + }, + { + "epoch": 0.5100806451612904, + "grad_norm": 0.1795148808426381, + "learning_rate": 9.412017273134464e-07, + "loss": 0.2963, + "step": 1012 + }, + { + "epoch": 0.5105846774193549, + "grad_norm": 0.17831883735108947, + "learning_rate": 9.410753088343513e-07, + "loss": 0.2844, + "step": 1013 + }, + { + "epoch": 0.5110887096774194, + "grad_norm": 0.17970109519050018, + "learning_rate": 9.409487631077617e-07, + "loss": 0.2961, + "step": 1014 + }, + { + "epoch": 0.5115927419354839, + "grad_norm": 0.18545886299498057, + "learning_rate": 9.408220901701856e-07, + "loss": 0.2944, + "step": 1015 + }, + { + "epoch": 0.5120967741935484, + "grad_norm": 0.2176487874487353, + "learning_rate": 9.40695290058167e-07, + "loss": 0.2805, + "step": 1016 + }, + { + "epoch": 0.5126008064516129, + "grad_norm": 0.1848666221951678, + "learning_rate": 9.405683628082875e-07, + "loss": 0.2923, + "step": 1017 + }, + { + "epoch": 0.5131048387096774, + "grad_norm": 0.179797631951874, + "learning_rate": 9.404413084571643e-07, + "loss": 0.2985, + "step": 1018 + }, + { + "epoch": 0.5136088709677419, + "grad_norm": 0.18358769665622782, + "learning_rate": 9.403141270414521e-07, + "loss": 0.2935, + "step": 1019 + }, + { + "epoch": 0.5141129032258065, + "grad_norm": 0.2395742446188053, + "learning_rate": 9.401868185978418e-07, + "loss": 0.2985, + "step": 1020 + }, + { + "epoch": 0.514616935483871, + "grad_norm": 0.21187575778017587, + "learning_rate": 9.400593831630612e-07, + "loss": 0.2934, + "step": 1021 + }, + { + "epoch": 0.5151209677419355, + "grad_norm": 0.1735450576887722, + "learning_rate": 9.399318207738744e-07, + "loss": 0.2753, + "step": 1022 + }, + { + "epoch": 0.515625, + "grad_norm": 0.18638678577349388, + "learning_rate": 9.398041314670828e-07, + "loss": 0.2787, + "step": 1023 + }, + { + "epoch": 0.5161290322580645, + "grad_norm": 0.24434453479231824, + "learning_rate": 9.396763152795237e-07, + "loss": 0.2855, + "step": 1024 + }, + { + "epoch": 0.516633064516129, + "grad_norm": 0.18013654047589361, + "learning_rate": 9.395483722480711e-07, + "loss": 0.2946, + "step": 1025 + }, + { + "epoch": 0.5171370967741935, + "grad_norm": 0.3202201260573074, + "learning_rate": 9.394203024096361e-07, + "loss": 0.2769, + "step": 1026 + }, + { + "epoch": 0.5176411290322581, + "grad_norm": 0.17815074975274992, + "learning_rate": 9.392921058011658e-07, + "loss": 0.2979, + "step": 1027 + }, + { + "epoch": 0.5181451612903226, + "grad_norm": 0.19221302362826442, + "learning_rate": 9.391637824596443e-07, + "loss": 0.2955, + "step": 1028 + }, + { + "epoch": 0.5186491935483871, + "grad_norm": 0.2866346023755273, + "learning_rate": 9.390353324220921e-07, + "loss": 0.292, + "step": 1029 + }, + { + "epoch": 0.5191532258064516, + "grad_norm": 0.17531514828702485, + "learning_rate": 9.389067557255661e-07, + "loss": 0.2925, + "step": 1030 + }, + { + "epoch": 0.5196572580645161, + "grad_norm": 0.186495533039337, + "learning_rate": 9.3877805240716e-07, + "loss": 0.2951, + "step": 1031 + }, + { + "epoch": 0.5201612903225806, + "grad_norm": 0.20577228144105744, + "learning_rate": 9.386492225040039e-07, + "loss": 0.2962, + "step": 1032 + }, + { + "epoch": 0.5206653225806451, + "grad_norm": 0.17419768193385268, + "learning_rate": 9.385202660532643e-07, + "loss": 0.2855, + "step": 1033 + }, + { + "epoch": 0.5211693548387096, + "grad_norm": 0.21933160804478288, + "learning_rate": 9.383911830921445e-07, + "loss": 0.3005, + "step": 1034 + }, + { + "epoch": 0.5216733870967742, + "grad_norm": 0.18215272532714025, + "learning_rate": 9.38261973657884e-07, + "loss": 0.2836, + "step": 1035 + }, + { + "epoch": 0.5221774193548387, + "grad_norm": 0.17976375005098702, + "learning_rate": 9.38132637787759e-07, + "loss": 0.287, + "step": 1036 + }, + { + "epoch": 0.5226814516129032, + "grad_norm": 0.28128296670889563, + "learning_rate": 9.380031755190823e-07, + "loss": 0.2775, + "step": 1037 + }, + { + "epoch": 0.5231854838709677, + "grad_norm": 0.23134564761372306, + "learning_rate": 9.378735868892024e-07, + "loss": 0.2968, + "step": 1038 + }, + { + "epoch": 0.5236895161290323, + "grad_norm": 0.18342110673735257, + "learning_rate": 9.377438719355054e-07, + "loss": 0.2907, + "step": 1039 + }, + { + "epoch": 0.5241935483870968, + "grad_norm": 0.18862222844193371, + "learning_rate": 9.37614030695413e-07, + "loss": 0.2911, + "step": 1040 + }, + { + "epoch": 0.5246975806451613, + "grad_norm": 0.30077945487794844, + "learning_rate": 9.374840632063836e-07, + "loss": 0.2783, + "step": 1041 + }, + { + "epoch": 0.5252016129032258, + "grad_norm": 0.18403419856623815, + "learning_rate": 9.37353969505912e-07, + "loss": 0.2873, + "step": 1042 + }, + { + "epoch": 0.5257056451612904, + "grad_norm": 0.19055837973677883, + "learning_rate": 9.372237496315295e-07, + "loss": 0.3061, + "step": 1043 + }, + { + "epoch": 0.5262096774193549, + "grad_norm": 0.1832527411516626, + "learning_rate": 9.370934036208037e-07, + "loss": 0.3085, + "step": 1044 + }, + { + "epoch": 0.5267137096774194, + "grad_norm": 0.20346555869158944, + "learning_rate": 9.369629315113385e-07, + "loss": 0.3114, + "step": 1045 + }, + { + "epoch": 0.5272177419354839, + "grad_norm": 0.1881315222709536, + "learning_rate": 9.368323333407746e-07, + "loss": 0.3027, + "step": 1046 + }, + { + "epoch": 0.5277217741935484, + "grad_norm": 0.20889874047965842, + "learning_rate": 9.367016091467885e-07, + "loss": 0.2935, + "step": 1047 + }, + { + "epoch": 0.5282258064516129, + "grad_norm": 0.22770873904210787, + "learning_rate": 9.365707589670933e-07, + "loss": 0.2808, + "step": 1048 + }, + { + "epoch": 0.5287298387096774, + "grad_norm": 0.1992019569628758, + "learning_rate": 9.364397828394386e-07, + "loss": 0.2804, + "step": 1049 + }, + { + "epoch": 0.5292338709677419, + "grad_norm": 0.1753127135289721, + "learning_rate": 9.3630868080161e-07, + "loss": 0.2811, + "step": 1050 + }, + { + "epoch": 0.5297379032258065, + "grad_norm": 0.17661722427381488, + "learning_rate": 9.361774528914299e-07, + "loss": 0.2806, + "step": 1051 + }, + { + "epoch": 0.530241935483871, + "grad_norm": 0.1889290840413473, + "learning_rate": 9.360460991467567e-07, + "loss": 0.2913, + "step": 1052 + }, + { + "epoch": 0.5307459677419355, + "grad_norm": 0.1829226305533146, + "learning_rate": 9.35914619605485e-07, + "loss": 0.2979, + "step": 1053 + }, + { + "epoch": 0.53125, + "grad_norm": 0.19271715211398896, + "learning_rate": 9.357830143055459e-07, + "loss": 0.282, + "step": 1054 + }, + { + "epoch": 0.5317540322580645, + "grad_norm": 0.18190812734541403, + "learning_rate": 9.356512832849066e-07, + "loss": 0.2941, + "step": 1055 + }, + { + "epoch": 0.532258064516129, + "grad_norm": 0.18587084451978964, + "learning_rate": 9.35519426581571e-07, + "loss": 0.3082, + "step": 1056 + }, + { + "epoch": 0.5327620967741935, + "grad_norm": 0.2198278002815469, + "learning_rate": 9.353874442335785e-07, + "loss": 0.302, + "step": 1057 + }, + { + "epoch": 0.5332661290322581, + "grad_norm": 0.17574611035139306, + "learning_rate": 9.352553362790054e-07, + "loss": 0.2882, + "step": 1058 + }, + { + "epoch": 0.5337701612903226, + "grad_norm": 0.1747327396768012, + "learning_rate": 9.351231027559643e-07, + "loss": 0.2867, + "step": 1059 + }, + { + "epoch": 0.5342741935483871, + "grad_norm": 0.22698038959811936, + "learning_rate": 9.349907437026034e-07, + "loss": 0.2811, + "step": 1060 + }, + { + "epoch": 0.5347782258064516, + "grad_norm": 0.18501161743729416, + "learning_rate": 9.348582591571075e-07, + "loss": 0.2917, + "step": 1061 + }, + { + "epoch": 0.5352822580645161, + "grad_norm": 0.19282052250090315, + "learning_rate": 9.347256491576976e-07, + "loss": 0.2725, + "step": 1062 + }, + { + "epoch": 0.5357862903225806, + "grad_norm": 0.18478580670133535, + "learning_rate": 9.345929137426311e-07, + "loss": 0.2774, + "step": 1063 + }, + { + "epoch": 0.5362903225806451, + "grad_norm": 0.17685008655382356, + "learning_rate": 9.344600529502009e-07, + "loss": 0.2945, + "step": 1064 + }, + { + "epoch": 0.5367943548387096, + "grad_norm": 0.20983177271923298, + "learning_rate": 9.34327066818737e-07, + "loss": 0.281, + "step": 1065 + }, + { + "epoch": 0.5372983870967742, + "grad_norm": 0.2077770172618288, + "learning_rate": 9.341939553866048e-07, + "loss": 0.2922, + "step": 1066 + }, + { + "epoch": 0.5378024193548387, + "grad_norm": 0.22247565527885046, + "learning_rate": 9.340607186922059e-07, + "loss": 0.2788, + "step": 1067 + }, + { + "epoch": 0.5383064516129032, + "grad_norm": 0.2072454495569315, + "learning_rate": 9.339273567739787e-07, + "loss": 0.2919, + "step": 1068 + }, + { + "epoch": 0.5388104838709677, + "grad_norm": 0.1745117991986638, + "learning_rate": 9.337938696703971e-07, + "loss": 0.2847, + "step": 1069 + }, + { + "epoch": 0.5393145161290323, + "grad_norm": 0.2100754257810942, + "learning_rate": 9.336602574199713e-07, + "loss": 0.3018, + "step": 1070 + }, + { + "epoch": 0.5398185483870968, + "grad_norm": 0.20378489641904865, + "learning_rate": 9.335265200612477e-07, + "loss": 0.2911, + "step": 1071 + }, + { + "epoch": 0.5403225806451613, + "grad_norm": 0.20915414310743302, + "learning_rate": 9.333926576328087e-07, + "loss": 0.2934, + "step": 1072 + }, + { + "epoch": 0.5408266129032258, + "grad_norm": 0.29515599957599536, + "learning_rate": 9.332586701732725e-07, + "loss": 0.2899, + "step": 1073 + }, + { + "epoch": 0.5413306451612904, + "grad_norm": 0.1983091401677111, + "learning_rate": 9.331245577212938e-07, + "loss": 0.2741, + "step": 1074 + }, + { + "epoch": 0.5418346774193549, + "grad_norm": 0.2538289487328285, + "learning_rate": 9.329903203155633e-07, + "loss": 0.2927, + "step": 1075 + }, + { + "epoch": 0.5423387096774194, + "grad_norm": 0.17986202552902045, + "learning_rate": 9.328559579948078e-07, + "loss": 0.2922, + "step": 1076 + }, + { + "epoch": 0.5428427419354839, + "grad_norm": 0.1825249327016375, + "learning_rate": 9.327214707977898e-07, + "loss": 0.3001, + "step": 1077 + }, + { + "epoch": 0.5433467741935484, + "grad_norm": 0.2444225537955273, + "learning_rate": 9.325868587633079e-07, + "loss": 0.2947, + "step": 1078 + }, + { + "epoch": 0.5438508064516129, + "grad_norm": 0.41133577581929226, + "learning_rate": 9.32452121930197e-07, + "loss": 0.2912, + "step": 1079 + }, + { + "epoch": 0.5443548387096774, + "grad_norm": 0.26649460477657994, + "learning_rate": 9.323172603373278e-07, + "loss": 0.2992, + "step": 1080 + }, + { + "epoch": 0.5448588709677419, + "grad_norm": 0.18547372683540594, + "learning_rate": 9.321822740236071e-07, + "loss": 0.3008, + "step": 1081 + }, + { + "epoch": 0.5453629032258065, + "grad_norm": 0.2046461782645962, + "learning_rate": 9.320471630279776e-07, + "loss": 0.2998, + "step": 1082 + }, + { + "epoch": 0.545866935483871, + "grad_norm": 0.18110940549710675, + "learning_rate": 9.319119273894179e-07, + "loss": 0.2761, + "step": 1083 + }, + { + "epoch": 0.5463709677419355, + "grad_norm": 0.20900035767008068, + "learning_rate": 9.317765671469428e-07, + "loss": 0.2808, + "step": 1084 + }, + { + "epoch": 0.546875, + "grad_norm": 0.36426577176365416, + "learning_rate": 9.316410823396026e-07, + "loss": 0.2904, + "step": 1085 + }, + { + "epoch": 0.5473790322580645, + "grad_norm": 0.2316323693848249, + "learning_rate": 9.315054730064842e-07, + "loss": 0.284, + "step": 1086 + }, + { + "epoch": 0.547883064516129, + "grad_norm": 0.18001191011621256, + "learning_rate": 9.313697391867099e-07, + "loss": 0.2781, + "step": 1087 + }, + { + "epoch": 0.5483870967741935, + "grad_norm": 0.18570361057953308, + "learning_rate": 9.31233880919438e-07, + "loss": 0.3016, + "step": 1088 + }, + { + "epoch": 0.5488911290322581, + "grad_norm": 0.1965322122969676, + "learning_rate": 9.310978982438627e-07, + "loss": 0.2717, + "step": 1089 + }, + { + "epoch": 0.5493951612903226, + "grad_norm": 0.18899479748188522, + "learning_rate": 9.309617911992143e-07, + "loss": 0.3011, + "step": 1090 + }, + { + "epoch": 0.5498991935483871, + "grad_norm": 0.19454709669812478, + "learning_rate": 9.308255598247589e-07, + "loss": 0.2959, + "step": 1091 + }, + { + "epoch": 0.5504032258064516, + "grad_norm": 0.25948798557199915, + "learning_rate": 9.306892041597982e-07, + "loss": 0.2894, + "step": 1092 + }, + { + "epoch": 0.5509072580645161, + "grad_norm": 0.18772311499552344, + "learning_rate": 9.305527242436702e-07, + "loss": 0.2753, + "step": 1093 + }, + { + "epoch": 0.5514112903225806, + "grad_norm": 0.1933597255454464, + "learning_rate": 9.304161201157484e-07, + "loss": 0.288, + "step": 1094 + }, + { + "epoch": 0.5519153225806451, + "grad_norm": 0.1769738727867225, + "learning_rate": 9.30279391815442e-07, + "loss": 0.2872, + "step": 1095 + }, + { + "epoch": 0.5524193548387096, + "grad_norm": 0.18100112677940808, + "learning_rate": 9.301425393821967e-07, + "loss": 0.2818, + "step": 1096 + }, + { + "epoch": 0.5529233870967742, + "grad_norm": 0.23375376145862312, + "learning_rate": 9.300055628554933e-07, + "loss": 0.2949, + "step": 1097 + }, + { + "epoch": 0.5534274193548387, + "grad_norm": 0.19417823391326344, + "learning_rate": 9.298684622748487e-07, + "loss": 0.2877, + "step": 1098 + }, + { + "epoch": 0.5539314516129032, + "grad_norm": 0.1744889878512034, + "learning_rate": 9.297312376798158e-07, + "loss": 0.2842, + "step": 1099 + }, + { + "epoch": 0.5544354838709677, + "grad_norm": 0.16932825168461108, + "learning_rate": 9.295938891099828e-07, + "loss": 0.2951, + "step": 1100 + }, + { + "epoch": 0.5549395161290323, + "grad_norm": 0.17870713463323182, + "learning_rate": 9.294564166049739e-07, + "loss": 0.2962, + "step": 1101 + }, + { + "epoch": 0.5554435483870968, + "grad_norm": 0.2132310949282931, + "learning_rate": 9.293188202044493e-07, + "loss": 0.289, + "step": 1102 + }, + { + "epoch": 0.5559475806451613, + "grad_norm": 0.19629585998507623, + "learning_rate": 9.291810999481045e-07, + "loss": 0.2867, + "step": 1103 + }, + { + "epoch": 0.5564516129032258, + "grad_norm": 0.173069340951293, + "learning_rate": 9.29043255875671e-07, + "loss": 0.2885, + "step": 1104 + }, + { + "epoch": 0.5569556451612904, + "grad_norm": 0.18743625034354597, + "learning_rate": 9.289052880269159e-07, + "loss": 0.2889, + "step": 1105 + }, + { + "epoch": 0.5574596774193549, + "grad_norm": 0.1747610249774233, + "learning_rate": 9.287671964416423e-07, + "loss": 0.2981, + "step": 1106 + }, + { + "epoch": 0.5579637096774194, + "grad_norm": 0.1803315112726341, + "learning_rate": 9.286289811596883e-07, + "loss": 0.2958, + "step": 1107 + }, + { + "epoch": 0.5584677419354839, + "grad_norm": 0.18669991895521104, + "learning_rate": 9.284906422209288e-07, + "loss": 0.2868, + "step": 1108 + }, + { + "epoch": 0.5589717741935484, + "grad_norm": 0.1836448818997824, + "learning_rate": 9.283521796652732e-07, + "loss": 0.2896, + "step": 1109 + }, + { + "epoch": 0.5594758064516129, + "grad_norm": 0.17629063140324913, + "learning_rate": 9.282135935326672e-07, + "loss": 0.2853, + "step": 1110 + }, + { + "epoch": 0.5599798387096774, + "grad_norm": 0.17145118521395394, + "learning_rate": 9.280748838630923e-07, + "loss": 0.2845, + "step": 1111 + }, + { + "epoch": 0.5604838709677419, + "grad_norm": 0.2113289196484219, + "learning_rate": 9.27936050696565e-07, + "loss": 0.2881, + "step": 1112 + }, + { + "epoch": 0.5609879032258065, + "grad_norm": 0.21214281262489457, + "learning_rate": 9.277970940731381e-07, + "loss": 0.278, + "step": 1113 + }, + { + "epoch": 0.561491935483871, + "grad_norm": 0.27553648004742354, + "learning_rate": 9.276580140328996e-07, + "loss": 0.3008, + "step": 1114 + }, + { + "epoch": 0.5619959677419355, + "grad_norm": 0.22325828203211032, + "learning_rate": 9.275188106159732e-07, + "loss": 0.2786, + "step": 1115 + }, + { + "epoch": 0.5625, + "grad_norm": 0.1847934093487764, + "learning_rate": 9.273794838625184e-07, + "loss": 0.2748, + "step": 1116 + }, + { + "epoch": 0.5630040322580645, + "grad_norm": 0.1909450760618991, + "learning_rate": 9.272400338127299e-07, + "loss": 0.2772, + "step": 1117 + }, + { + "epoch": 0.563508064516129, + "grad_norm": 0.1793116127902729, + "learning_rate": 9.271004605068382e-07, + "loss": 0.3071, + "step": 1118 + }, + { + "epoch": 0.5640120967741935, + "grad_norm": 0.1813871461319359, + "learning_rate": 9.269607639851095e-07, + "loss": 0.2968, + "step": 1119 + }, + { + "epoch": 0.5645161290322581, + "grad_norm": 0.2411100780601475, + "learning_rate": 9.268209442878452e-07, + "loss": 0.287, + "step": 1120 + }, + { + "epoch": 0.5650201612903226, + "grad_norm": 0.22883000547852828, + "learning_rate": 9.266810014553826e-07, + "loss": 0.2943, + "step": 1121 + }, + { + "epoch": 0.5655241935483871, + "grad_norm": 0.2008124860780277, + "learning_rate": 9.265409355280941e-07, + "loss": 0.2911, + "step": 1122 + }, + { + "epoch": 0.5660282258064516, + "grad_norm": 0.3472215822956789, + "learning_rate": 9.26400746546388e-07, + "loss": 0.2988, + "step": 1123 + }, + { + "epoch": 0.5665322580645161, + "grad_norm": 0.26593221261781297, + "learning_rate": 9.262604345507079e-07, + "loss": 0.2965, + "step": 1124 + }, + { + "epoch": 0.5670362903225806, + "grad_norm": 0.1797781151682231, + "learning_rate": 9.26119999581533e-07, + "loss": 0.2818, + "step": 1125 + }, + { + "epoch": 0.5675403225806451, + "grad_norm": 0.18192713954001474, + "learning_rate": 9.25979441679378e-07, + "loss": 0.2847, + "step": 1126 + }, + { + "epoch": 0.5680443548387096, + "grad_norm": 0.2591562379865001, + "learning_rate": 9.258387608847928e-07, + "loss": 0.2906, + "step": 1127 + }, + { + "epoch": 0.5685483870967742, + "grad_norm": 0.23679205899885067, + "learning_rate": 9.256979572383631e-07, + "loss": 0.2826, + "step": 1128 + }, + { + "epoch": 0.5690524193548387, + "grad_norm": 0.31203954672135736, + "learning_rate": 9.255570307807097e-07, + "loss": 0.2811, + "step": 1129 + }, + { + "epoch": 0.5695564516129032, + "grad_norm": 0.24288407570831536, + "learning_rate": 9.254159815524891e-07, + "loss": 0.2914, + "step": 1130 + }, + { + "epoch": 0.5700604838709677, + "grad_norm": 0.22988163476909568, + "learning_rate": 9.252748095943931e-07, + "loss": 0.2903, + "step": 1131 + }, + { + "epoch": 0.5705645161290323, + "grad_norm": 0.2003178753492969, + "learning_rate": 9.251335149471491e-07, + "loss": 0.2816, + "step": 1132 + }, + { + "epoch": 0.5710685483870968, + "grad_norm": 0.17031682685543056, + "learning_rate": 9.249920976515195e-07, + "loss": 0.275, + "step": 1133 + }, + { + "epoch": 0.5715725806451613, + "grad_norm": 0.1827276740823613, + "learning_rate": 9.248505577483026e-07, + "loss": 0.2869, + "step": 1134 + }, + { + "epoch": 0.5720766129032258, + "grad_norm": 0.2619379606957767, + "learning_rate": 9.247088952783313e-07, + "loss": 0.2812, + "step": 1135 + }, + { + "epoch": 0.5725806451612904, + "grad_norm": 0.1802141216250462, + "learning_rate": 9.245671102824748e-07, + "loss": 0.2921, + "step": 1136 + }, + { + "epoch": 0.5730846774193549, + "grad_norm": 0.19106913986785806, + "learning_rate": 9.244252028016371e-07, + "loss": 0.284, + "step": 1137 + }, + { + "epoch": 0.5735887096774194, + "grad_norm": 0.1812826559618984, + "learning_rate": 9.242831728767575e-07, + "loss": 0.2803, + "step": 1138 + }, + { + "epoch": 0.5740927419354839, + "grad_norm": 0.1734515314933056, + "learning_rate": 9.241410205488108e-07, + "loss": 0.2869, + "step": 1139 + }, + { + "epoch": 0.5745967741935484, + "grad_norm": 0.19432579296048832, + "learning_rate": 9.23998745858807e-07, + "loss": 0.2856, + "step": 1140 + }, + { + "epoch": 0.5751008064516129, + "grad_norm": 0.18377955104622667, + "learning_rate": 9.238563488477918e-07, + "loss": 0.2926, + "step": 1141 + }, + { + "epoch": 0.5756048387096774, + "grad_norm": 0.18200221102162675, + "learning_rate": 9.237138295568454e-07, + "loss": 0.2881, + "step": 1142 + }, + { + "epoch": 0.5761088709677419, + "grad_norm": 0.18619952457281, + "learning_rate": 9.23571188027084e-07, + "loss": 0.2842, + "step": 1143 + }, + { + "epoch": 0.5766129032258065, + "grad_norm": 0.1962070569411852, + "learning_rate": 9.234284242996588e-07, + "loss": 0.2831, + "step": 1144 + }, + { + "epoch": 0.577116935483871, + "grad_norm": 0.17904916055767378, + "learning_rate": 9.23285538415756e-07, + "loss": 0.2927, + "step": 1145 + }, + { + "epoch": 0.5776209677419355, + "grad_norm": 0.20485807453286825, + "learning_rate": 9.231425304165976e-07, + "loss": 0.283, + "step": 1146 + }, + { + "epoch": 0.578125, + "grad_norm": 0.17697640569451753, + "learning_rate": 9.229994003434405e-07, + "loss": 0.2987, + "step": 1147 + }, + { + "epoch": 0.5786290322580645, + "grad_norm": 0.17424623251522328, + "learning_rate": 9.228561482375766e-07, + "loss": 0.2746, + "step": 1148 + }, + { + "epoch": 0.579133064516129, + "grad_norm": 0.19037141787800266, + "learning_rate": 9.227127741403336e-07, + "loss": 0.2873, + "step": 1149 + }, + { + "epoch": 0.5796370967741935, + "grad_norm": 0.18343536870106394, + "learning_rate": 9.225692780930736e-07, + "loss": 0.2876, + "step": 1150 + }, + { + "epoch": 0.5801411290322581, + "grad_norm": 0.1774843130008874, + "learning_rate": 9.224256601371947e-07, + "loss": 0.2934, + "step": 1151 + }, + { + "epoch": 0.5806451612903226, + "grad_norm": 0.1890382236967552, + "learning_rate": 9.222819203141295e-07, + "loss": 0.2716, + "step": 1152 + }, + { + "epoch": 0.5811491935483871, + "grad_norm": 0.1813676084499092, + "learning_rate": 9.221380586653462e-07, + "loss": 0.2794, + "step": 1153 + }, + { + "epoch": 0.5816532258064516, + "grad_norm": 0.170202685121712, + "learning_rate": 9.21994075232348e-07, + "loss": 0.2872, + "step": 1154 + }, + { + "epoch": 0.5821572580645161, + "grad_norm": 0.1896602238301285, + "learning_rate": 9.218499700566733e-07, + "loss": 0.2904, + "step": 1155 + }, + { + "epoch": 0.5826612903225806, + "grad_norm": 0.1816413232762944, + "learning_rate": 9.217057431798954e-07, + "loss": 0.2873, + "step": 1156 + }, + { + "epoch": 0.5831653225806451, + "grad_norm": 0.19216255744528832, + "learning_rate": 9.215613946436229e-07, + "loss": 0.297, + "step": 1157 + }, + { + "epoch": 0.5836693548387096, + "grad_norm": 0.17974789944825, + "learning_rate": 9.214169244894996e-07, + "loss": 0.293, + "step": 1158 + }, + { + "epoch": 0.5841733870967742, + "grad_norm": 0.17992839360699955, + "learning_rate": 9.21272332759204e-07, + "loss": 0.287, + "step": 1159 + }, + { + "epoch": 0.5846774193548387, + "grad_norm": 0.18513535414685894, + "learning_rate": 9.211276194944501e-07, + "loss": 0.2794, + "step": 1160 + }, + { + "epoch": 0.5851814516129032, + "grad_norm": 0.1819833313378893, + "learning_rate": 9.209827847369866e-07, + "loss": 0.2744, + "step": 1161 + }, + { + "epoch": 0.5856854838709677, + "grad_norm": 0.17281034086736458, + "learning_rate": 9.208378285285977e-07, + "loss": 0.2945, + "step": 1162 + }, + { + "epoch": 0.5861895161290323, + "grad_norm": 0.23268827282788637, + "learning_rate": 9.206927509111022e-07, + "loss": 0.2931, + "step": 1163 + }, + { + "epoch": 0.5866935483870968, + "grad_norm": 0.18019045409013396, + "learning_rate": 9.205475519263541e-07, + "loss": 0.2892, + "step": 1164 + }, + { + "epoch": 0.5871975806451613, + "grad_norm": 0.19222136198482753, + "learning_rate": 9.204022316162424e-07, + "loss": 0.2977, + "step": 1165 + }, + { + "epoch": 0.5877016129032258, + "grad_norm": 0.17442507458285844, + "learning_rate": 9.202567900226912e-07, + "loss": 0.286, + "step": 1166 + }, + { + "epoch": 0.5882056451612904, + "grad_norm": 0.17138337080066757, + "learning_rate": 9.201112271876593e-07, + "loss": 0.284, + "step": 1167 + }, + { + "epoch": 0.5887096774193549, + "grad_norm": 0.2425654731892946, + "learning_rate": 9.199655431531409e-07, + "loss": 0.2814, + "step": 1168 + }, + { + "epoch": 0.5892137096774194, + "grad_norm": 0.1846246469710363, + "learning_rate": 9.198197379611647e-07, + "loss": 0.2854, + "step": 1169 + }, + { + "epoch": 0.5897177419354839, + "grad_norm": 0.19156185603852782, + "learning_rate": 9.19673811653795e-07, + "loss": 0.2999, + "step": 1170 + }, + { + "epoch": 0.5902217741935484, + "grad_norm": 0.17846888518866763, + "learning_rate": 9.195277642731303e-07, + "loss": 0.2944, + "step": 1171 + }, + { + "epoch": 0.5907258064516129, + "grad_norm": 0.18016209804691175, + "learning_rate": 9.193815958613043e-07, + "loss": 0.2909, + "step": 1172 + }, + { + "epoch": 0.5912298387096774, + "grad_norm": 0.1818568667655358, + "learning_rate": 9.192353064604858e-07, + "loss": 0.2977, + "step": 1173 + }, + { + "epoch": 0.5917338709677419, + "grad_norm": 0.17227833790308136, + "learning_rate": 9.190888961128787e-07, + "loss": 0.2799, + "step": 1174 + }, + { + "epoch": 0.5922379032258065, + "grad_norm": 0.1738364554972949, + "learning_rate": 9.18942364860721e-07, + "loss": 0.2764, + "step": 1175 + }, + { + "epoch": 0.592741935483871, + "grad_norm": 0.19302662529321246, + "learning_rate": 9.187957127462863e-07, + "loss": 0.2769, + "step": 1176 + }, + { + "epoch": 0.5932459677419355, + "grad_norm": 0.17788471334305958, + "learning_rate": 9.186489398118827e-07, + "loss": 0.2824, + "step": 1177 + }, + { + "epoch": 0.59375, + "grad_norm": 0.17681503923586112, + "learning_rate": 9.185020460998534e-07, + "loss": 0.287, + "step": 1178 + }, + { + "epoch": 0.5942540322580645, + "grad_norm": 0.176047828469218, + "learning_rate": 9.183550316525763e-07, + "loss": 0.2854, + "step": 1179 + }, + { + "epoch": 0.594758064516129, + "grad_norm": 0.17351698958327808, + "learning_rate": 9.182078965124643e-07, + "loss": 0.3057, + "step": 1180 + }, + { + "epoch": 0.5952620967741935, + "grad_norm": 0.1821800882504011, + "learning_rate": 9.180606407219644e-07, + "loss": 0.2759, + "step": 1181 + }, + { + "epoch": 0.5957661290322581, + "grad_norm": 0.17241468751173047, + "learning_rate": 9.179132643235598e-07, + "loss": 0.2909, + "step": 1182 + }, + { + "epoch": 0.5962701612903226, + "grad_norm": 0.18984463320176223, + "learning_rate": 9.177657673597671e-07, + "loss": 0.2772, + "step": 1183 + }, + { + "epoch": 0.5967741935483871, + "grad_norm": 0.17848369146299375, + "learning_rate": 9.176181498731385e-07, + "loss": 0.293, + "step": 1184 + }, + { + "epoch": 0.5972782258064516, + "grad_norm": 0.17602029953890927, + "learning_rate": 9.174704119062606e-07, + "loss": 0.2843, + "step": 1185 + }, + { + "epoch": 0.5977822580645161, + "grad_norm": 0.19474495339674502, + "learning_rate": 9.173225535017551e-07, + "loss": 0.2928, + "step": 1186 + }, + { + "epoch": 0.5982862903225806, + "grad_norm": 0.17273250226175776, + "learning_rate": 9.171745747022778e-07, + "loss": 0.2914, + "step": 1187 + }, + { + "epoch": 0.5987903225806451, + "grad_norm": 0.21145261381905367, + "learning_rate": 9.170264755505201e-07, + "loss": 0.2928, + "step": 1188 + }, + { + "epoch": 0.5992943548387096, + "grad_norm": 0.2039002377170998, + "learning_rate": 9.168782560892077e-07, + "loss": 0.2701, + "step": 1189 + }, + { + "epoch": 0.5997983870967742, + "grad_norm": 0.17988346160874077, + "learning_rate": 9.167299163611007e-07, + "loss": 0.2751, + "step": 1190 + }, + { + "epoch": 0.6003024193548387, + "grad_norm": 0.2493981489367964, + "learning_rate": 9.165814564089944e-07, + "loss": 0.2657, + "step": 1191 + }, + { + "epoch": 0.6008064516129032, + "grad_norm": 0.1797786958096351, + "learning_rate": 9.164328762757184e-07, + "loss": 0.2833, + "step": 1192 + }, + { + "epoch": 0.6013104838709677, + "grad_norm": 0.19193093734013503, + "learning_rate": 9.162841760041373e-07, + "loss": 0.2858, + "step": 1193 + }, + { + "epoch": 0.6018145161290323, + "grad_norm": 0.2769241015425666, + "learning_rate": 9.161353556371503e-07, + "loss": 0.3026, + "step": 1194 + }, + { + "epoch": 0.6018145161290323, + "eval_loss": 0.31878647208213806, + "eval_runtime": 16.9743, + "eval_samples_per_second": 50.37, + "eval_steps_per_second": 1.06, + "step": 1194 + }, + { + "epoch": 0.6023185483870968, + "grad_norm": 0.181263001899909, + "learning_rate": 9.159864152176908e-07, + "loss": 0.304, + "step": 1195 + }, + { + "epoch": 0.6028225806451613, + "grad_norm": 0.17978051150161645, + "learning_rate": 9.158373547887275e-07, + "loss": 0.2873, + "step": 1196 + }, + { + "epoch": 0.6033266129032258, + "grad_norm": 0.20272556727914262, + "learning_rate": 9.156881743932634e-07, + "loss": 0.2815, + "step": 1197 + }, + { + "epoch": 0.6038306451612904, + "grad_norm": 0.21068658372368834, + "learning_rate": 9.155388740743361e-07, + "loss": 0.2853, + "step": 1198 + }, + { + "epoch": 0.6043346774193549, + "grad_norm": 0.22437088927660387, + "learning_rate": 9.153894538750176e-07, + "loss": 0.283, + "step": 1199 + }, + { + "epoch": 0.6048387096774194, + "grad_norm": 0.1931533084578361, + "learning_rate": 9.152399138384148e-07, + "loss": 0.282, + "step": 1200 + }, + { + "epoch": 0.6053427419354839, + "grad_norm": 0.18479701434227452, + "learning_rate": 9.150902540076693e-07, + "loss": 0.2965, + "step": 1201 + }, + { + "epoch": 0.6058467741935484, + "grad_norm": 0.2210412349427078, + "learning_rate": 9.149404744259568e-07, + "loss": 0.2888, + "step": 1202 + }, + { + "epoch": 0.6063508064516129, + "grad_norm": 0.18226679427751902, + "learning_rate": 9.147905751364878e-07, + "loss": 0.2887, + "step": 1203 + }, + { + "epoch": 0.6068548387096774, + "grad_norm": 0.1832755725258158, + "learning_rate": 9.146405561825075e-07, + "loss": 0.2948, + "step": 1204 + }, + { + "epoch": 0.6073588709677419, + "grad_norm": 0.1720010372993489, + "learning_rate": 9.144904176072952e-07, + "loss": 0.2829, + "step": 1205 + }, + { + "epoch": 0.6078629032258065, + "grad_norm": 0.17565878536855542, + "learning_rate": 9.14340159454165e-07, + "loss": 0.2815, + "step": 1206 + }, + { + "epoch": 0.608366935483871, + "grad_norm": 0.18227852681153148, + "learning_rate": 9.141897817664657e-07, + "loss": 0.2878, + "step": 1207 + }, + { + "epoch": 0.6088709677419355, + "grad_norm": 0.1965323622835235, + "learning_rate": 9.140392845875799e-07, + "loss": 0.2845, + "step": 1208 + }, + { + "epoch": 0.609375, + "grad_norm": 0.17206440789039248, + "learning_rate": 9.138886679609254e-07, + "loss": 0.2852, + "step": 1209 + }, + { + "epoch": 0.6098790322580645, + "grad_norm": 0.17264736183064056, + "learning_rate": 9.137379319299542e-07, + "loss": 0.274, + "step": 1210 + }, + { + "epoch": 0.610383064516129, + "grad_norm": 0.18598352903511223, + "learning_rate": 9.135870765381525e-07, + "loss": 0.2896, + "step": 1211 + }, + { + "epoch": 0.6108870967741935, + "grad_norm": 0.19699736881940794, + "learning_rate": 9.134361018290413e-07, + "loss": 0.2763, + "step": 1212 + }, + { + "epoch": 0.6113911290322581, + "grad_norm": 0.22313038636092158, + "learning_rate": 9.132850078461758e-07, + "loss": 0.2846, + "step": 1213 + }, + { + "epoch": 0.6118951612903226, + "grad_norm": 0.30031307009566305, + "learning_rate": 9.131337946331458e-07, + "loss": 0.2989, + "step": 1214 + }, + { + "epoch": 0.6123991935483871, + "grad_norm": 0.2329858913080578, + "learning_rate": 9.129824622335752e-07, + "loss": 0.3048, + "step": 1215 + }, + { + "epoch": 0.6129032258064516, + "grad_norm": 0.17993489418040118, + "learning_rate": 9.128310106911226e-07, + "loss": 0.2932, + "step": 1216 + }, + { + "epoch": 0.6134072580645161, + "grad_norm": 0.18148081451402603, + "learning_rate": 9.126794400494806e-07, + "loss": 0.3034, + "step": 1217 + }, + { + "epoch": 0.6139112903225806, + "grad_norm": 0.20177903959056726, + "learning_rate": 9.125277503523766e-07, + "loss": 0.2919, + "step": 1218 + }, + { + "epoch": 0.6144153225806451, + "grad_norm": 0.27324162227606025, + "learning_rate": 9.123759416435722e-07, + "loss": 0.2922, + "step": 1219 + }, + { + "epoch": 0.6149193548387096, + "grad_norm": 0.34121798755615584, + "learning_rate": 9.122240139668631e-07, + "loss": 0.2832, + "step": 1220 + }, + { + "epoch": 0.6154233870967742, + "grad_norm": 0.21697155231985218, + "learning_rate": 9.120719673660796e-07, + "loss": 0.282, + "step": 1221 + }, + { + "epoch": 0.6159274193548387, + "grad_norm": 0.17674277405784744, + "learning_rate": 9.119198018850862e-07, + "loss": 0.2816, + "step": 1222 + }, + { + "epoch": 0.6164314516129032, + "grad_norm": 0.16871547242020288, + "learning_rate": 9.117675175677815e-07, + "loss": 0.2756, + "step": 1223 + }, + { + "epoch": 0.6169354838709677, + "grad_norm": 0.20057684589974267, + "learning_rate": 9.11615114458099e-07, + "loss": 0.275, + "step": 1224 + }, + { + "epoch": 0.6174395161290323, + "grad_norm": 0.18893088728346058, + "learning_rate": 9.114625926000057e-07, + "loss": 0.2932, + "step": 1225 + }, + { + "epoch": 0.6179435483870968, + "grad_norm": 0.24323661515306044, + "learning_rate": 9.113099520375032e-07, + "loss": 0.2884, + "step": 1226 + }, + { + "epoch": 0.6184475806451613, + "grad_norm": 0.17657599055834897, + "learning_rate": 9.111571928146276e-07, + "loss": 0.2847, + "step": 1227 + }, + { + "epoch": 0.6189516129032258, + "grad_norm": 0.1839549106035099, + "learning_rate": 9.110043149754487e-07, + "loss": 0.2976, + "step": 1228 + }, + { + "epoch": 0.6194556451612904, + "grad_norm": 0.2056215870054611, + "learning_rate": 9.108513185640712e-07, + "loss": 0.2812, + "step": 1229 + }, + { + "epoch": 0.6199596774193549, + "grad_norm": 0.17065753231349062, + "learning_rate": 9.106982036246331e-07, + "loss": 0.2857, + "step": 1230 + }, + { + "epoch": 0.6204637096774194, + "grad_norm": 0.18476180000705647, + "learning_rate": 9.105449702013076e-07, + "loss": 0.2965, + "step": 1231 + }, + { + "epoch": 0.6209677419354839, + "grad_norm": 0.2000432350203024, + "learning_rate": 9.103916183383013e-07, + "loss": 0.289, + "step": 1232 + }, + { + "epoch": 0.6214717741935484, + "grad_norm": 0.17671157012984728, + "learning_rate": 9.102381480798553e-07, + "loss": 0.2887, + "step": 1233 + }, + { + "epoch": 0.6219758064516129, + "grad_norm": 0.19186276950774964, + "learning_rate": 9.100845594702451e-07, + "loss": 0.2855, + "step": 1234 + }, + { + "epoch": 0.6224798387096774, + "grad_norm": 0.17315544977260117, + "learning_rate": 9.099308525537796e-07, + "loss": 0.2787, + "step": 1235 + }, + { + "epoch": 0.6229838709677419, + "grad_norm": 0.21400229908517301, + "learning_rate": 9.097770273748027e-07, + "loss": 0.2923, + "step": 1236 + }, + { + "epoch": 0.6234879032258065, + "grad_norm": 0.19574643374456288, + "learning_rate": 9.096230839776917e-07, + "loss": 0.2849, + "step": 1237 + }, + { + "epoch": 0.623991935483871, + "grad_norm": 0.18182176245833126, + "learning_rate": 9.094690224068585e-07, + "loss": 0.282, + "step": 1238 + }, + { + "epoch": 0.6244959677419355, + "grad_norm": 0.23124623736324354, + "learning_rate": 9.09314842706749e-07, + "loss": 0.2964, + "step": 1239 + }, + { + "epoch": 0.625, + "grad_norm": 0.17853145875824106, + "learning_rate": 9.091605449218427e-07, + "loss": 0.3069, + "step": 1240 + }, + { + "epoch": 0.6255040322580645, + "grad_norm": 0.2052782675999351, + "learning_rate": 9.09006129096654e-07, + "loss": 0.3013, + "step": 1241 + }, + { + "epoch": 0.626008064516129, + "grad_norm": 0.1823088396970403, + "learning_rate": 9.088515952757306e-07, + "loss": 0.2922, + "step": 1242 + }, + { + "epoch": 0.6265120967741935, + "grad_norm": 0.20577771722463917, + "learning_rate": 9.086969435036547e-07, + "loss": 0.2748, + "step": 1243 + }, + { + "epoch": 0.6270161290322581, + "grad_norm": 0.17813055873331182, + "learning_rate": 9.085421738250422e-07, + "loss": 0.2939, + "step": 1244 + }, + { + "epoch": 0.6275201612903226, + "grad_norm": 0.1814603197270135, + "learning_rate": 9.083872862845436e-07, + "loss": 0.2884, + "step": 1245 + }, + { + "epoch": 0.6280241935483871, + "grad_norm": 0.19148692203351, + "learning_rate": 9.082322809268425e-07, + "loss": 0.2726, + "step": 1246 + }, + { + "epoch": 0.6285282258064516, + "grad_norm": 0.1933452685613287, + "learning_rate": 9.080771577966574e-07, + "loss": 0.2894, + "step": 1247 + }, + { + "epoch": 0.6290322580645161, + "grad_norm": 0.18291707054663686, + "learning_rate": 9.0792191693874e-07, + "loss": 0.2999, + "step": 1248 + }, + { + "epoch": 0.6295362903225806, + "grad_norm": 0.17702736035496525, + "learning_rate": 9.077665583978766e-07, + "loss": 0.3027, + "step": 1249 + }, + { + "epoch": 0.6300403225806451, + "grad_norm": 0.1694200412404369, + "learning_rate": 9.076110822188872e-07, + "loss": 0.28, + "step": 1250 + }, + { + "epoch": 0.6305443548387096, + "grad_norm": 0.18581080838795144, + "learning_rate": 9.074554884466254e-07, + "loss": 0.2938, + "step": 1251 + }, + { + "epoch": 0.6310483870967742, + "grad_norm": 0.1997263136466941, + "learning_rate": 9.072997771259793e-07, + "loss": 0.2802, + "step": 1252 + }, + { + "epoch": 0.6315524193548387, + "grad_norm": 0.21481822334565182, + "learning_rate": 9.071439483018708e-07, + "loss": 0.2932, + "step": 1253 + }, + { + "epoch": 0.6320564516129032, + "grad_norm": 0.1791325184424046, + "learning_rate": 9.06988002019255e-07, + "loss": 0.2834, + "step": 1254 + }, + { + "epoch": 0.6325604838709677, + "grad_norm": 0.1836618100696337, + "learning_rate": 9.06831938323122e-07, + "loss": 0.2938, + "step": 1255 + }, + { + "epoch": 0.6330645161290323, + "grad_norm": 0.19292497762523103, + "learning_rate": 9.066757572584948e-07, + "loss": 0.2896, + "step": 1256 + }, + { + "epoch": 0.6335685483870968, + "grad_norm": 0.18240313649056145, + "learning_rate": 9.065194588704311e-07, + "loss": 0.2814, + "step": 1257 + }, + { + "epoch": 0.6340725806451613, + "grad_norm": 0.22052975724510962, + "learning_rate": 9.063630432040216e-07, + "loss": 0.2886, + "step": 1258 + }, + { + "epoch": 0.6345766129032258, + "grad_norm": 0.20672385722341882, + "learning_rate": 9.062065103043915e-07, + "loss": 0.2954, + "step": 1259 + }, + { + "epoch": 0.6350806451612904, + "grad_norm": 0.17995424113662914, + "learning_rate": 9.060498602166995e-07, + "loss": 0.2866, + "step": 1260 + }, + { + "epoch": 0.6355846774193549, + "grad_norm": 0.17630973219376414, + "learning_rate": 9.058930929861381e-07, + "loss": 0.2803, + "step": 1261 + }, + { + "epoch": 0.6360887096774194, + "grad_norm": 0.2239699462767389, + "learning_rate": 9.057362086579336e-07, + "loss": 0.2944, + "step": 1262 + }, + { + "epoch": 0.6365927419354839, + "grad_norm": 0.30137928217223403, + "learning_rate": 9.055792072773466e-07, + "loss": 0.297, + "step": 1263 + }, + { + "epoch": 0.6370967741935484, + "grad_norm": 0.22695762562998079, + "learning_rate": 9.054220888896706e-07, + "loss": 0.2815, + "step": 1264 + }, + { + "epoch": 0.6376008064516129, + "grad_norm": 0.1751438430718199, + "learning_rate": 9.052648535402334e-07, + "loss": 0.2862, + "step": 1265 + }, + { + "epoch": 0.6381048387096774, + "grad_norm": 0.17639322627147327, + "learning_rate": 9.051075012743965e-07, + "loss": 0.2648, + "step": 1266 + }, + { + "epoch": 0.6386088709677419, + "grad_norm": 0.18539526527243927, + "learning_rate": 9.049500321375549e-07, + "loss": 0.2719, + "step": 1267 + }, + { + "epoch": 0.6391129032258065, + "grad_norm": 0.2689595278098081, + "learning_rate": 9.047924461751376e-07, + "loss": 0.2875, + "step": 1268 + }, + { + "epoch": 0.639616935483871, + "grad_norm": 0.178579851083188, + "learning_rate": 9.046347434326072e-07, + "loss": 0.2915, + "step": 1269 + }, + { + "epoch": 0.6401209677419355, + "grad_norm": 0.17904389890697903, + "learning_rate": 9.044769239554599e-07, + "loss": 0.2745, + "step": 1270 + }, + { + "epoch": 0.640625, + "grad_norm": 0.1896562559936013, + "learning_rate": 9.043189877892254e-07, + "loss": 0.2883, + "step": 1271 + }, + { + "epoch": 0.6411290322580645, + "grad_norm": 0.1908189648372141, + "learning_rate": 9.041609349794678e-07, + "loss": 0.283, + "step": 1272 + }, + { + "epoch": 0.641633064516129, + "grad_norm": 0.21147318798440223, + "learning_rate": 9.040027655717841e-07, + "loss": 0.301, + "step": 1273 + }, + { + "epoch": 0.6421370967741935, + "grad_norm": 0.18345975087456926, + "learning_rate": 9.03844479611805e-07, + "loss": 0.3139, + "step": 1274 + }, + { + "epoch": 0.6426411290322581, + "grad_norm": 0.19734085403857743, + "learning_rate": 9.036860771451954e-07, + "loss": 0.2733, + "step": 1275 + }, + { + "epoch": 0.6431451612903226, + "grad_norm": 0.17121744147827178, + "learning_rate": 9.035275582176533e-07, + "loss": 0.2846, + "step": 1276 + }, + { + "epoch": 0.6436491935483871, + "grad_norm": 0.17622953005984843, + "learning_rate": 9.033689228749102e-07, + "loss": 0.2965, + "step": 1277 + }, + { + "epoch": 0.6441532258064516, + "grad_norm": 0.18180991170706243, + "learning_rate": 9.032101711627316e-07, + "loss": 0.2863, + "step": 1278 + }, + { + "epoch": 0.6446572580645161, + "grad_norm": 0.18383068016833068, + "learning_rate": 9.030513031269165e-07, + "loss": 0.2916, + "step": 1279 + }, + { + "epoch": 0.6451612903225806, + "grad_norm": 0.23069065361517252, + "learning_rate": 9.02892318813297e-07, + "loss": 0.285, + "step": 1280 + }, + { + "epoch": 0.6456653225806451, + "grad_norm": 0.27998314668853796, + "learning_rate": 9.027332182677397e-07, + "loss": 0.2854, + "step": 1281 + }, + { + "epoch": 0.6461693548387096, + "grad_norm": 0.17547605295017601, + "learning_rate": 9.025740015361433e-07, + "loss": 0.2839, + "step": 1282 + }, + { + "epoch": 0.6466733870967742, + "grad_norm": 0.17884356887542877, + "learning_rate": 9.024146686644415e-07, + "loss": 0.2922, + "step": 1283 + }, + { + "epoch": 0.6471774193548387, + "grad_norm": 0.18212333265861616, + "learning_rate": 9.022552196986006e-07, + "loss": 0.2735, + "step": 1284 + }, + { + "epoch": 0.6476814516129032, + "grad_norm": 0.17605258249805855, + "learning_rate": 9.020956546846205e-07, + "loss": 0.2907, + "step": 1285 + }, + { + "epoch": 0.6481854838709677, + "grad_norm": 0.23268702781185105, + "learning_rate": 9.01935973668535e-07, + "loss": 0.2899, + "step": 1286 + }, + { + "epoch": 0.6486895161290323, + "grad_norm": 0.17715332276237264, + "learning_rate": 9.017761766964111e-07, + "loss": 0.2966, + "step": 1287 + }, + { + "epoch": 0.6491935483870968, + "grad_norm": 0.18537544001467623, + "learning_rate": 9.01616263814349e-07, + "loss": 0.2851, + "step": 1288 + }, + { + "epoch": 0.6496975806451613, + "grad_norm": 0.19742055543365888, + "learning_rate": 9.014562350684824e-07, + "loss": 0.2904, + "step": 1289 + }, + { + "epoch": 0.6502016129032258, + "grad_norm": 0.17753937663898317, + "learning_rate": 9.012960905049791e-07, + "loss": 0.2909, + "step": 1290 + }, + { + "epoch": 0.6507056451612904, + "grad_norm": 0.18127362373513947, + "learning_rate": 9.011358301700397e-07, + "loss": 0.2991, + "step": 1291 + }, + { + "epoch": 0.6512096774193549, + "grad_norm": 0.1897788399464913, + "learning_rate": 9.009754541098982e-07, + "loss": 0.2995, + "step": 1292 + }, + { + "epoch": 0.6517137096774194, + "grad_norm": 0.17813293972191205, + "learning_rate": 9.008149623708219e-07, + "loss": 0.2765, + "step": 1293 + }, + { + "epoch": 0.6522177419354839, + "grad_norm": 0.18329773158412632, + "learning_rate": 9.006543549991119e-07, + "loss": 0.2936, + "step": 1294 + }, + { + "epoch": 0.6527217741935484, + "grad_norm": 0.20529257571242965, + "learning_rate": 9.004936320411025e-07, + "loss": 0.2949, + "step": 1295 + }, + { + "epoch": 0.6532258064516129, + "grad_norm": 0.21151253255173527, + "learning_rate": 9.003327935431612e-07, + "loss": 0.2771, + "step": 1296 + }, + { + "epoch": 0.6537298387096774, + "grad_norm": 0.1796301906371901, + "learning_rate": 9.001718395516889e-07, + "loss": 0.2869, + "step": 1297 + }, + { + "epoch": 0.6542338709677419, + "grad_norm": 0.1717234788709778, + "learning_rate": 9.000107701131196e-07, + "loss": 0.2823, + "step": 1298 + }, + { + "epoch": 0.6547379032258065, + "grad_norm": 0.17845055491572026, + "learning_rate": 8.998495852739213e-07, + "loss": 0.3044, + "step": 1299 + }, + { + "epoch": 0.655241935483871, + "grad_norm": 0.17442341108486234, + "learning_rate": 8.996882850805944e-07, + "loss": 0.28, + "step": 1300 + }, + { + "epoch": 0.6557459677419355, + "grad_norm": 0.20073841491430633, + "learning_rate": 8.995268695796734e-07, + "loss": 0.2811, + "step": 1301 + }, + { + "epoch": 0.65625, + "grad_norm": 0.21128884092445868, + "learning_rate": 8.993653388177254e-07, + "loss": 0.2929, + "step": 1302 + }, + { + "epoch": 0.6567540322580645, + "grad_norm": 0.1850833947673501, + "learning_rate": 8.99203692841351e-07, + "loss": 0.2914, + "step": 1303 + }, + { + "epoch": 0.657258064516129, + "grad_norm": 0.19939616483650877, + "learning_rate": 8.990419316971842e-07, + "loss": 0.2859, + "step": 1304 + }, + { + "epoch": 0.6577620967741935, + "grad_norm": 0.1750117264820056, + "learning_rate": 8.988800554318921e-07, + "loss": 0.2982, + "step": 1305 + }, + { + "epoch": 0.6582661290322581, + "grad_norm": 0.2075447584954836, + "learning_rate": 8.987180640921747e-07, + "loss": 0.299, + "step": 1306 + }, + { + "epoch": 0.6587701612903226, + "grad_norm": 0.1790266951422517, + "learning_rate": 8.985559577247661e-07, + "loss": 0.2776, + "step": 1307 + }, + { + "epoch": 0.6592741935483871, + "grad_norm": 0.178773579825902, + "learning_rate": 8.983937363764324e-07, + "loss": 0.2936, + "step": 1308 + }, + { + "epoch": 0.6597782258064516, + "grad_norm": 0.1738135103379992, + "learning_rate": 8.982314000939737e-07, + "loss": 0.2912, + "step": 1309 + }, + { + "epoch": 0.6602822580645161, + "grad_norm": 0.18476555106324266, + "learning_rate": 8.98068948924223e-07, + "loss": 0.3009, + "step": 1310 + }, + { + "epoch": 0.6607862903225806, + "grad_norm": 0.20244847551602765, + "learning_rate": 8.979063829140465e-07, + "loss": 0.2945, + "step": 1311 + }, + { + "epoch": 0.6612903225806451, + "grad_norm": 0.19339358525691477, + "learning_rate": 8.977437021103433e-07, + "loss": 0.2788, + "step": 1312 + }, + { + "epoch": 0.6617943548387096, + "grad_norm": 0.18712592000883932, + "learning_rate": 8.975809065600459e-07, + "loss": 0.3064, + "step": 1313 + }, + { + "epoch": 0.6622983870967742, + "grad_norm": 0.22452691139273012, + "learning_rate": 8.974179963101201e-07, + "loss": 0.2898, + "step": 1314 + }, + { + "epoch": 0.6628024193548387, + "grad_norm": 0.1796052556590874, + "learning_rate": 8.97254971407564e-07, + "loss": 0.2813, + "step": 1315 + }, + { + "epoch": 0.6633064516129032, + "grad_norm": 0.17710526677908528, + "learning_rate": 8.970918318994096e-07, + "loss": 0.2894, + "step": 1316 + }, + { + "epoch": 0.6638104838709677, + "grad_norm": 0.18446794030145566, + "learning_rate": 8.969285778327215e-07, + "loss": 0.2796, + "step": 1317 + }, + { + "epoch": 0.6643145161290323, + "grad_norm": 0.22546463300214195, + "learning_rate": 8.967652092545976e-07, + "loss": 0.292, + "step": 1318 + }, + { + "epoch": 0.6648185483870968, + "grad_norm": 0.24026811231174375, + "learning_rate": 8.966017262121687e-07, + "loss": 0.2846, + "step": 1319 + }, + { + "epoch": 0.6653225806451613, + "grad_norm": 0.18394372591314476, + "learning_rate": 8.964381287525986e-07, + "loss": 0.2994, + "step": 1320 + }, + { + "epoch": 0.6658266129032258, + "grad_norm": 0.1803179306900601, + "learning_rate": 8.962744169230841e-07, + "loss": 0.2797, + "step": 1321 + }, + { + "epoch": 0.6663306451612904, + "grad_norm": 0.16839049819871527, + "learning_rate": 8.96110590770855e-07, + "loss": 0.2631, + "step": 1322 + }, + { + "epoch": 0.6668346774193549, + "grad_norm": 0.22155662980251253, + "learning_rate": 8.959466503431744e-07, + "loss": 0.2816, + "step": 1323 + }, + { + "epoch": 0.6673387096774194, + "grad_norm": 0.19139441389189604, + "learning_rate": 8.957825956873379e-07, + "loss": 0.2855, + "step": 1324 + }, + { + "epoch": 0.6678427419354839, + "grad_norm": 0.19210293112712495, + "learning_rate": 8.956184268506742e-07, + "loss": 0.2817, + "step": 1325 + }, + { + "epoch": 0.6683467741935484, + "grad_norm": 0.1857304337358492, + "learning_rate": 8.954541438805452e-07, + "loss": 0.2873, + "step": 1326 + }, + { + "epoch": 0.6688508064516129, + "grad_norm": 0.17268224825580095, + "learning_rate": 8.952897468243454e-07, + "loss": 0.2713, + "step": 1327 + }, + { + "epoch": 0.6693548387096774, + "grad_norm": 0.1957606146053459, + "learning_rate": 8.951252357295022e-07, + "loss": 0.2767, + "step": 1328 + }, + { + "epoch": 0.6698588709677419, + "grad_norm": 0.1824912944669447, + "learning_rate": 8.949606106434763e-07, + "loss": 0.287, + "step": 1329 + }, + { + "epoch": 0.6703629032258065, + "grad_norm": 0.1798044187166269, + "learning_rate": 8.947958716137608e-07, + "loss": 0.3125, + "step": 1330 + }, + { + "epoch": 0.670866935483871, + "grad_norm": 0.17749986390962377, + "learning_rate": 8.946310186878821e-07, + "loss": 0.2675, + "step": 1331 + }, + { + "epoch": 0.6713709677419355, + "grad_norm": 0.17752734169122558, + "learning_rate": 8.94466051913399e-07, + "loss": 0.2772, + "step": 1332 + }, + { + "epoch": 0.671875, + "grad_norm": 0.1987933223640386, + "learning_rate": 8.943009713379034e-07, + "loss": 0.2838, + "step": 1333 + }, + { + "epoch": 0.6723790322580645, + "grad_norm": 0.1792807492758479, + "learning_rate": 8.941357770090203e-07, + "loss": 0.2872, + "step": 1334 + }, + { + "epoch": 0.672883064516129, + "grad_norm": 0.17948534651864595, + "learning_rate": 8.939704689744071e-07, + "loss": 0.3002, + "step": 1335 + }, + { + "epoch": 0.6733870967741935, + "grad_norm": 0.1812113705229493, + "learning_rate": 8.93805047281754e-07, + "loss": 0.2954, + "step": 1336 + }, + { + "epoch": 0.6738911290322581, + "grad_norm": 0.18844957646714744, + "learning_rate": 8.936395119787842e-07, + "loss": 0.2697, + "step": 1337 + }, + { + "epoch": 0.6743951612903226, + "grad_norm": 0.20929818984501958, + "learning_rate": 8.934738631132539e-07, + "loss": 0.2871, + "step": 1338 + }, + { + "epoch": 0.6748991935483871, + "grad_norm": 0.188001165636237, + "learning_rate": 8.933081007329515e-07, + "loss": 0.2973, + "step": 1339 + }, + { + "epoch": 0.6754032258064516, + "grad_norm": 0.16918518262975, + "learning_rate": 8.931422248856982e-07, + "loss": 0.2769, + "step": 1340 + }, + { + "epoch": 0.6759072580645161, + "grad_norm": 0.17795218348215974, + "learning_rate": 8.929762356193486e-07, + "loss": 0.2957, + "step": 1341 + }, + { + "epoch": 0.6764112903225806, + "grad_norm": 0.23569227174562507, + "learning_rate": 8.928101329817894e-07, + "loss": 0.296, + "step": 1342 + }, + { + "epoch": 0.6769153225806451, + "grad_norm": 0.1810084239182294, + "learning_rate": 8.926439170209401e-07, + "loss": 0.3002, + "step": 1343 + }, + { + "epoch": 0.6774193548387096, + "grad_norm": 0.17662865898790905, + "learning_rate": 8.92477587784753e-07, + "loss": 0.2877, + "step": 1344 + }, + { + "epoch": 0.6779233870967742, + "grad_norm": 0.20068556141454189, + "learning_rate": 8.923111453212131e-07, + "loss": 0.2987, + "step": 1345 + }, + { + "epoch": 0.6784274193548387, + "grad_norm": 0.1856885926434189, + "learning_rate": 8.921445896783381e-07, + "loss": 0.2762, + "step": 1346 + }, + { + "epoch": 0.6789314516129032, + "grad_norm": 0.17738716689811637, + "learning_rate": 8.919779209041782e-07, + "loss": 0.2917, + "step": 1347 + }, + { + "epoch": 0.6794354838709677, + "grad_norm": 0.17222917661740367, + "learning_rate": 8.918111390468162e-07, + "loss": 0.2801, + "step": 1348 + }, + { + "epoch": 0.6799395161290323, + "grad_norm": 0.19151284132256455, + "learning_rate": 8.916442441543678e-07, + "loss": 0.2706, + "step": 1349 + }, + { + "epoch": 0.6804435483870968, + "grad_norm": 0.17262129740464702, + "learning_rate": 8.91477236274981e-07, + "loss": 0.2769, + "step": 1350 + }, + { + "epoch": 0.6809475806451613, + "grad_norm": 0.1738945976891568, + "learning_rate": 8.913101154568366e-07, + "loss": 0.2849, + "step": 1351 + }, + { + "epoch": 0.6814516129032258, + "grad_norm": 0.18441311480041897, + "learning_rate": 8.91142881748148e-07, + "loss": 0.2684, + "step": 1352 + }, + { + "epoch": 0.6819556451612904, + "grad_norm": 0.18119499884546747, + "learning_rate": 8.90975535197161e-07, + "loss": 0.2886, + "step": 1353 + }, + { + "epoch": 0.6824596774193549, + "grad_norm": 0.1829298283270949, + "learning_rate": 8.90808075852154e-07, + "loss": 0.2929, + "step": 1354 + }, + { + "epoch": 0.6829637096774194, + "grad_norm": 0.18164824873198268, + "learning_rate": 8.906405037614382e-07, + "loss": 0.2935, + "step": 1355 + }, + { + "epoch": 0.6834677419354839, + "grad_norm": 0.20468371157832624, + "learning_rate": 8.904728189733568e-07, + "loss": 0.2748, + "step": 1356 + }, + { + "epoch": 0.6839717741935484, + "grad_norm": 0.1825106724862103, + "learning_rate": 8.90305021536286e-07, + "loss": 0.3001, + "step": 1357 + }, + { + "epoch": 0.6844758064516129, + "grad_norm": 0.17622106582753136, + "learning_rate": 8.901371114986343e-07, + "loss": 0.2819, + "step": 1358 + }, + { + "epoch": 0.6849798387096774, + "grad_norm": 0.18643087038948788, + "learning_rate": 8.899690889088427e-07, + "loss": 0.2949, + "step": 1359 + }, + { + "epoch": 0.6854838709677419, + "grad_norm": 0.1786561974876954, + "learning_rate": 8.898009538153847e-07, + "loss": 0.2798, + "step": 1360 + }, + { + "epoch": 0.6859879032258065, + "grad_norm": 0.17854373951330638, + "learning_rate": 8.896327062667663e-07, + "loss": 0.2989, + "step": 1361 + }, + { + "epoch": 0.686491935483871, + "grad_norm": 0.1945546348671395, + "learning_rate": 8.894643463115257e-07, + "loss": 0.2961, + "step": 1362 + }, + { + "epoch": 0.6869959677419355, + "grad_norm": 0.22049274658233767, + "learning_rate": 8.892958739982338e-07, + "loss": 0.275, + "step": 1363 + }, + { + "epoch": 0.6875, + "grad_norm": 0.2020492398451764, + "learning_rate": 8.891272893754937e-07, + "loss": 0.2949, + "step": 1364 + }, + { + "epoch": 0.6880040322580645, + "grad_norm": 0.20796221258156572, + "learning_rate": 8.889585924919414e-07, + "loss": 0.2738, + "step": 1365 + }, + { + "epoch": 0.688508064516129, + "grad_norm": 0.17715818724452845, + "learning_rate": 8.887897833962444e-07, + "loss": 0.2779, + "step": 1366 + }, + { + "epoch": 0.6890120967741935, + "grad_norm": 0.18291220238667136, + "learning_rate": 8.886208621371034e-07, + "loss": 0.2867, + "step": 1367 + }, + { + "epoch": 0.6895161290322581, + "grad_norm": 0.1726979302547636, + "learning_rate": 8.88451828763251e-07, + "loss": 0.2777, + "step": 1368 + }, + { + "epoch": 0.6900201612903226, + "grad_norm": 0.23127714111621359, + "learning_rate": 8.882826833234525e-07, + "loss": 0.3048, + "step": 1369 + }, + { + "epoch": 0.6905241935483871, + "grad_norm": 0.20073601560327714, + "learning_rate": 8.881134258665051e-07, + "loss": 0.3165, + "step": 1370 + }, + { + "epoch": 0.6910282258064516, + "grad_norm": 0.18006036389690872, + "learning_rate": 8.879440564412384e-07, + "loss": 0.2749, + "step": 1371 + }, + { + "epoch": 0.6915322580645161, + "grad_norm": 0.2323105469609835, + "learning_rate": 8.877745750965146e-07, + "loss": 0.2909, + "step": 1372 + }, + { + "epoch": 0.6920362903225806, + "grad_norm": 0.178854463220733, + "learning_rate": 8.876049818812281e-07, + "loss": 0.2951, + "step": 1373 + }, + { + "epoch": 0.6925403225806451, + "grad_norm": 0.2431317742511991, + "learning_rate": 8.874352768443055e-07, + "loss": 0.2938, + "step": 1374 + }, + { + "epoch": 0.6930443548387096, + "grad_norm": 0.17355381981942172, + "learning_rate": 8.872654600347055e-07, + "loss": 0.2904, + "step": 1375 + }, + { + "epoch": 0.6935483870967742, + "grad_norm": 0.19800049499752437, + "learning_rate": 8.870955315014193e-07, + "loss": 0.284, + "step": 1376 + }, + { + "epoch": 0.6940524193548387, + "grad_norm": 0.18380625874212533, + "learning_rate": 8.869254912934701e-07, + "loss": 0.2777, + "step": 1377 + }, + { + "epoch": 0.6945564516129032, + "grad_norm": 0.2003918541066735, + "learning_rate": 8.867553394599137e-07, + "loss": 0.2759, + "step": 1378 + }, + { + "epoch": 0.6950604838709677, + "grad_norm": 0.18695292816627587, + "learning_rate": 8.865850760498375e-07, + "loss": 0.2978, + "step": 1379 + }, + { + "epoch": 0.6955645161290323, + "grad_norm": 0.18229058385029212, + "learning_rate": 8.864147011123617e-07, + "loss": 0.3109, + "step": 1380 + }, + { + "epoch": 0.6960685483870968, + "grad_norm": 0.2363688988979662, + "learning_rate": 8.862442146966385e-07, + "loss": 0.3016, + "step": 1381 + }, + { + "epoch": 0.6965725806451613, + "grad_norm": 0.1795567842038728, + "learning_rate": 8.860736168518517e-07, + "loss": 0.2957, + "step": 1382 + }, + { + "epoch": 0.6970766129032258, + "grad_norm": 0.18655389810189696, + "learning_rate": 8.859029076272182e-07, + "loss": 0.2809, + "step": 1383 + }, + { + "epoch": 0.6975806451612904, + "grad_norm": 0.17971498800102143, + "learning_rate": 8.857320870719864e-07, + "loss": 0.2838, + "step": 1384 + }, + { + "epoch": 0.6980846774193549, + "grad_norm": 0.1915148111115413, + "learning_rate": 8.85561155235437e-07, + "loss": 0.2922, + "step": 1385 + }, + { + "epoch": 0.6985887096774194, + "grad_norm": 0.1858146547352289, + "learning_rate": 8.853901121668828e-07, + "loss": 0.287, + "step": 1386 + }, + { + "epoch": 0.6990927419354839, + "grad_norm": 0.19284974835593893, + "learning_rate": 8.852189579156684e-07, + "loss": 0.2994, + "step": 1387 + }, + { + "epoch": 0.6995967741935484, + "grad_norm": 0.17664023987078498, + "learning_rate": 8.850476925311711e-07, + "loss": 0.2753, + "step": 1388 + }, + { + "epoch": 0.7001008064516129, + "grad_norm": 0.17603041348701104, + "learning_rate": 8.848763160627997e-07, + "loss": 0.2862, + "step": 1389 + }, + { + "epoch": 0.7006048387096774, + "grad_norm": 0.17114594668139757, + "learning_rate": 8.847048285599952e-07, + "loss": 0.2832, + "step": 1390 + }, + { + "epoch": 0.7011088709677419, + "grad_norm": 0.18467144014813242, + "learning_rate": 8.84533230072231e-07, + "loss": 0.2761, + "step": 1391 + }, + { + "epoch": 0.7016129032258065, + "grad_norm": 0.19349322968299676, + "learning_rate": 8.843615206490118e-07, + "loss": 0.2953, + "step": 1392 + }, + { + "epoch": 0.702116935483871, + "grad_norm": 0.18516765422030596, + "learning_rate": 8.841897003398749e-07, + "loss": 0.2892, + "step": 1393 + }, + { + "epoch": 0.702116935483871, + "eval_loss": 0.31658247113227844, + "eval_runtime": 16.8213, + "eval_samples_per_second": 50.829, + "eval_steps_per_second": 1.07, + "step": 1393 + }, + { + "epoch": 0.7026209677419355, + "grad_norm": 0.17985038724483002, + "learning_rate": 8.840177691943895e-07, + "loss": 0.2785, + "step": 1394 + }, + { + "epoch": 0.703125, + "grad_norm": 0.19728174011424768, + "learning_rate": 8.838457272621565e-07, + "loss": 0.2942, + "step": 1395 + }, + { + "epoch": 0.7036290322580645, + "grad_norm": 0.22391820163520273, + "learning_rate": 8.836735745928089e-07, + "loss": 0.285, + "step": 1396 + }, + { + "epoch": 0.704133064516129, + "grad_norm": 0.17792486538644323, + "learning_rate": 8.835013112360118e-07, + "loss": 0.2889, + "step": 1397 + }, + { + "epoch": 0.7046370967741935, + "grad_norm": 0.17801651435359686, + "learning_rate": 8.833289372414621e-07, + "loss": 0.2652, + "step": 1398 + }, + { + "epoch": 0.7051411290322581, + "grad_norm": 0.18753019276564675, + "learning_rate": 8.831564526588886e-07, + "loss": 0.28, + "step": 1399 + }, + { + "epoch": 0.7056451612903226, + "grad_norm": 0.2040287956468218, + "learning_rate": 8.82983857538052e-07, + "loss": 0.2754, + "step": 1400 + }, + { + "epoch": 0.7061491935483871, + "grad_norm": 0.2003540418757947, + "learning_rate": 8.828111519287451e-07, + "loss": 0.289, + "step": 1401 + }, + { + "epoch": 0.7066532258064516, + "grad_norm": 0.20667446057464192, + "learning_rate": 8.82638335880792e-07, + "loss": 0.2983, + "step": 1402 + }, + { + "epoch": 0.7071572580645161, + "grad_norm": 0.18640904253903684, + "learning_rate": 8.824654094440496e-07, + "loss": 0.28, + "step": 1403 + }, + { + "epoch": 0.7076612903225806, + "grad_norm": 0.1841514649749984, + "learning_rate": 8.822923726684057e-07, + "loss": 0.2883, + "step": 1404 + }, + { + "epoch": 0.7081653225806451, + "grad_norm": 0.18229858409313876, + "learning_rate": 8.821192256037804e-07, + "loss": 0.2909, + "step": 1405 + }, + { + "epoch": 0.7086693548387096, + "grad_norm": 0.21389681862115453, + "learning_rate": 8.819459683001257e-07, + "loss": 0.2807, + "step": 1406 + }, + { + "epoch": 0.7091733870967742, + "grad_norm": 0.17509601564433855, + "learning_rate": 8.817726008074252e-07, + "loss": 0.2866, + "step": 1407 + }, + { + "epoch": 0.7096774193548387, + "grad_norm": 0.1766608417202575, + "learning_rate": 8.815991231756942e-07, + "loss": 0.2832, + "step": 1408 + }, + { + "epoch": 0.7101814516129032, + "grad_norm": 0.17771797879063636, + "learning_rate": 8.814255354549801e-07, + "loss": 0.2859, + "step": 1409 + }, + { + "epoch": 0.7106854838709677, + "grad_norm": 0.18708822232800043, + "learning_rate": 8.81251837695362e-07, + "loss": 0.2857, + "step": 1410 + }, + { + "epoch": 0.7111895161290323, + "grad_norm": 0.17909240174834304, + "learning_rate": 8.810780299469502e-07, + "loss": 0.2839, + "step": 1411 + }, + { + "epoch": 0.7116935483870968, + "grad_norm": 0.18475558009415183, + "learning_rate": 8.809041122598875e-07, + "loss": 0.2886, + "step": 1412 + }, + { + "epoch": 0.7121975806451613, + "grad_norm": 0.18675745708053215, + "learning_rate": 8.80730084684348e-07, + "loss": 0.2709, + "step": 1413 + }, + { + "epoch": 0.7127016129032258, + "grad_norm": 0.17219148431342304, + "learning_rate": 8.805559472705375e-07, + "loss": 0.2867, + "step": 1414 + }, + { + "epoch": 0.7132056451612904, + "grad_norm": 0.19026935921175026, + "learning_rate": 8.803817000686937e-07, + "loss": 0.286, + "step": 1415 + }, + { + "epoch": 0.7137096774193549, + "grad_norm": 0.19083134675660773, + "learning_rate": 8.802073431290857e-07, + "loss": 0.2782, + "step": 1416 + }, + { + "epoch": 0.7142137096774194, + "grad_norm": 0.17182972306227576, + "learning_rate": 8.800328765020146e-07, + "loss": 0.2764, + "step": 1417 + }, + { + "epoch": 0.7147177419354839, + "grad_norm": 0.1685745381686828, + "learning_rate": 8.798583002378128e-07, + "loss": 0.2793, + "step": 1418 + }, + { + "epoch": 0.7152217741935484, + "grad_norm": 0.17135712002602105, + "learning_rate": 8.796836143868445e-07, + "loss": 0.283, + "step": 1419 + }, + { + "epoch": 0.7157258064516129, + "grad_norm": 0.18005621021675017, + "learning_rate": 8.795088189995052e-07, + "loss": 0.2774, + "step": 1420 + }, + { + "epoch": 0.7162298387096774, + "grad_norm": 0.1757591514793466, + "learning_rate": 8.793339141262228e-07, + "loss": 0.2938, + "step": 1421 + }, + { + "epoch": 0.7167338709677419, + "grad_norm": 0.17230964184772374, + "learning_rate": 8.791588998174559e-07, + "loss": 0.2853, + "step": 1422 + }, + { + "epoch": 0.7172379032258065, + "grad_norm": 0.19439553949354596, + "learning_rate": 8.789837761236954e-07, + "loss": 0.2706, + "step": 1423 + }, + { + "epoch": 0.717741935483871, + "grad_norm": 0.18372365279361863, + "learning_rate": 8.788085430954629e-07, + "loss": 0.2756, + "step": 1424 + }, + { + "epoch": 0.7182459677419355, + "grad_norm": 0.17528943529169524, + "learning_rate": 8.786332007833123e-07, + "loss": 0.2999, + "step": 1425 + }, + { + "epoch": 0.71875, + "grad_norm": 0.1718666710733766, + "learning_rate": 8.78457749237829e-07, + "loss": 0.2828, + "step": 1426 + }, + { + "epoch": 0.7192540322580645, + "grad_norm": 0.19363050866395, + "learning_rate": 8.782821885096294e-07, + "loss": 0.3002, + "step": 1427 + }, + { + "epoch": 0.719758064516129, + "grad_norm": 0.24520324213496356, + "learning_rate": 8.781065186493617e-07, + "loss": 0.2983, + "step": 1428 + }, + { + "epoch": 0.7202620967741935, + "grad_norm": 0.17701465130657107, + "learning_rate": 8.779307397077056e-07, + "loss": 0.2941, + "step": 1429 + }, + { + "epoch": 0.7207661290322581, + "grad_norm": 0.17040112419340506, + "learning_rate": 8.777548517353722e-07, + "loss": 0.2758, + "step": 1430 + }, + { + "epoch": 0.7212701612903226, + "grad_norm": 0.1741219490089326, + "learning_rate": 8.775788547831042e-07, + "loss": 0.2726, + "step": 1431 + }, + { + "epoch": 0.7217741935483871, + "grad_norm": 0.17863386512186544, + "learning_rate": 8.774027489016756e-07, + "loss": 0.2979, + "step": 1432 + }, + { + "epoch": 0.7222782258064516, + "grad_norm": 0.1894540149341605, + "learning_rate": 8.772265341418918e-07, + "loss": 0.2775, + "step": 1433 + }, + { + "epoch": 0.7227822580645161, + "grad_norm": 0.17905751081552182, + "learning_rate": 8.770502105545898e-07, + "loss": 0.2759, + "step": 1434 + }, + { + "epoch": 0.7232862903225806, + "grad_norm": 0.2250191897070994, + "learning_rate": 8.768737781906377e-07, + "loss": 0.2792, + "step": 1435 + }, + { + "epoch": 0.7237903225806451, + "grad_norm": 0.1873361676914799, + "learning_rate": 8.766972371009351e-07, + "loss": 0.2849, + "step": 1436 + }, + { + "epoch": 0.7242943548387096, + "grad_norm": 0.18214326291659777, + "learning_rate": 8.765205873364132e-07, + "loss": 0.3074, + "step": 1437 + }, + { + "epoch": 0.7247983870967742, + "grad_norm": 0.18444546366186285, + "learning_rate": 8.763438289480343e-07, + "loss": 0.2974, + "step": 1438 + }, + { + "epoch": 0.7253024193548387, + "grad_norm": 0.19982363016080767, + "learning_rate": 8.76166961986792e-07, + "loss": 0.2661, + "step": 1439 + }, + { + "epoch": 0.7258064516129032, + "grad_norm": 0.1781094109669711, + "learning_rate": 8.759899865037115e-07, + "loss": 0.2823, + "step": 1440 + }, + { + "epoch": 0.7263104838709677, + "grad_norm": 0.1810221506084548, + "learning_rate": 8.758129025498488e-07, + "loss": 0.2994, + "step": 1441 + }, + { + "epoch": 0.7268145161290323, + "grad_norm": 0.17107486611373632, + "learning_rate": 8.75635710176292e-07, + "loss": 0.2733, + "step": 1442 + }, + { + "epoch": 0.7273185483870968, + "grad_norm": 0.1920323851595878, + "learning_rate": 8.754584094341597e-07, + "loss": 0.2949, + "step": 1443 + }, + { + "epoch": 0.7278225806451613, + "grad_norm": 0.18240009322545153, + "learning_rate": 8.75281000374602e-07, + "loss": 0.2936, + "step": 1444 + }, + { + "epoch": 0.7283266129032258, + "grad_norm": 0.17921579254401834, + "learning_rate": 8.751034830488006e-07, + "loss": 0.2885, + "step": 1445 + }, + { + "epoch": 0.7288306451612904, + "grad_norm": 0.17928589014845386, + "learning_rate": 8.749258575079678e-07, + "loss": 0.303, + "step": 1446 + }, + { + "epoch": 0.7293346774193549, + "grad_norm": 0.183832758370968, + "learning_rate": 8.747481238033478e-07, + "loss": 0.2843, + "step": 1447 + }, + { + "epoch": 0.7298387096774194, + "grad_norm": 0.1725772769965713, + "learning_rate": 8.745702819862155e-07, + "loss": 0.2702, + "step": 1448 + }, + { + "epoch": 0.7303427419354839, + "grad_norm": 0.18098089374754683, + "learning_rate": 8.743923321078772e-07, + "loss": 0.2931, + "step": 1449 + }, + { + "epoch": 0.7308467741935484, + "grad_norm": 0.19180274311843135, + "learning_rate": 8.742142742196703e-07, + "loss": 0.2856, + "step": 1450 + }, + { + "epoch": 0.7313508064516129, + "grad_norm": 0.17825200131791993, + "learning_rate": 8.740361083729634e-07, + "loss": 0.2902, + "step": 1451 + }, + { + "epoch": 0.7318548387096774, + "grad_norm": 0.17705711042600694, + "learning_rate": 8.738578346191563e-07, + "loss": 0.2989, + "step": 1452 + }, + { + "epoch": 0.7323588709677419, + "grad_norm": 0.18722947981092045, + "learning_rate": 8.736794530096797e-07, + "loss": 0.2837, + "step": 1453 + }, + { + "epoch": 0.7328629032258065, + "grad_norm": 0.17077286964873561, + "learning_rate": 8.735009635959958e-07, + "loss": 0.2837, + "step": 1454 + }, + { + "epoch": 0.733366935483871, + "grad_norm": 0.17523945290796009, + "learning_rate": 8.733223664295976e-07, + "loss": 0.3038, + "step": 1455 + }, + { + "epoch": 0.7338709677419355, + "grad_norm": 0.21527334289626718, + "learning_rate": 8.731436615620095e-07, + "loss": 0.2785, + "step": 1456 + }, + { + "epoch": 0.734375, + "grad_norm": 0.17398355824973538, + "learning_rate": 8.729648490447864e-07, + "loss": 0.2849, + "step": 1457 + }, + { + "epoch": 0.7348790322580645, + "grad_norm": 0.19117546742540467, + "learning_rate": 8.727859289295147e-07, + "loss": 0.2922, + "step": 1458 + }, + { + "epoch": 0.735383064516129, + "grad_norm": 0.17261671346075716, + "learning_rate": 8.726069012678119e-07, + "loss": 0.2916, + "step": 1459 + }, + { + "epoch": 0.7358870967741935, + "grad_norm": 0.17887054876224195, + "learning_rate": 8.724277661113262e-07, + "loss": 0.2873, + "step": 1460 + }, + { + "epoch": 0.7363911290322581, + "grad_norm": 0.18810141505407652, + "learning_rate": 8.722485235117369e-07, + "loss": 0.2754, + "step": 1461 + }, + { + "epoch": 0.7368951612903226, + "grad_norm": 0.1711988777610929, + "learning_rate": 8.720691735207549e-07, + "loss": 0.2795, + "step": 1462 + }, + { + "epoch": 0.7373991935483871, + "grad_norm": 0.181403489411845, + "learning_rate": 8.718897161901208e-07, + "loss": 0.2812, + "step": 1463 + }, + { + "epoch": 0.7379032258064516, + "grad_norm": 0.1741359968081314, + "learning_rate": 8.717101515716074e-07, + "loss": 0.2897, + "step": 1464 + }, + { + "epoch": 0.7384072580645161, + "grad_norm": 0.18618138691906877, + "learning_rate": 8.71530479717018e-07, + "loss": 0.2862, + "step": 1465 + }, + { + "epoch": 0.7389112903225806, + "grad_norm": 0.18489427612306297, + "learning_rate": 8.713507006781867e-07, + "loss": 0.2798, + "step": 1466 + }, + { + "epoch": 0.7394153225806451, + "grad_norm": 0.2032269124376584, + "learning_rate": 8.711708145069787e-07, + "loss": 0.2826, + "step": 1467 + }, + { + "epoch": 0.7399193548387096, + "grad_norm": 0.17163938575826002, + "learning_rate": 8.709908212552899e-07, + "loss": 0.284, + "step": 1468 + }, + { + "epoch": 0.7404233870967742, + "grad_norm": 0.17551816826492267, + "learning_rate": 8.708107209750473e-07, + "loss": 0.2986, + "step": 1469 + }, + { + "epoch": 0.7409274193548387, + "grad_norm": 0.1707997048312571, + "learning_rate": 8.706305137182089e-07, + "loss": 0.2795, + "step": 1470 + }, + { + "epoch": 0.7414314516129032, + "grad_norm": 0.18060170508978393, + "learning_rate": 8.70450199536763e-07, + "loss": 0.2766, + "step": 1471 + }, + { + "epoch": 0.7419354838709677, + "grad_norm": 0.17735325459231624, + "learning_rate": 8.702697784827295e-07, + "loss": 0.2748, + "step": 1472 + }, + { + "epoch": 0.7424395161290323, + "grad_norm": 0.18812112690658891, + "learning_rate": 8.700892506081587e-07, + "loss": 0.2933, + "step": 1473 + }, + { + "epoch": 0.7429435483870968, + "grad_norm": 0.17809974738272327, + "learning_rate": 8.699086159651314e-07, + "loss": 0.2793, + "step": 1474 + }, + { + "epoch": 0.7434475806451613, + "grad_norm": 0.18817435016585066, + "learning_rate": 8.697278746057602e-07, + "loss": 0.29, + "step": 1475 + }, + { + "epoch": 0.7439516129032258, + "grad_norm": 0.18251009568650395, + "learning_rate": 8.695470265821871e-07, + "loss": 0.2832, + "step": 1476 + }, + { + "epoch": 0.7444556451612904, + "grad_norm": 0.17400386528720746, + "learning_rate": 8.693660719465865e-07, + "loss": 0.284, + "step": 1477 + }, + { + "epoch": 0.7449596774193549, + "grad_norm": 0.22195293952140055, + "learning_rate": 8.69185010751162e-07, + "loss": 0.2793, + "step": 1478 + }, + { + "epoch": 0.7454637096774194, + "grad_norm": 0.16879955516508652, + "learning_rate": 8.690038430481489e-07, + "loss": 0.2851, + "step": 1479 + }, + { + "epoch": 0.7459677419354839, + "grad_norm": 0.17379174977002937, + "learning_rate": 8.688225688898129e-07, + "loss": 0.2997, + "step": 1480 + }, + { + "epoch": 0.7464717741935484, + "grad_norm": 0.17053843236094848, + "learning_rate": 8.686411883284505e-07, + "loss": 0.2772, + "step": 1481 + }, + { + "epoch": 0.7469758064516129, + "grad_norm": 0.1966482045417518, + "learning_rate": 8.684597014163891e-07, + "loss": 0.2921, + "step": 1482 + }, + { + "epoch": 0.7474798387096774, + "grad_norm": 0.1842097594300451, + "learning_rate": 8.682781082059861e-07, + "loss": 0.2902, + "step": 1483 + }, + { + "epoch": 0.7479838709677419, + "grad_norm": 0.17503229863085937, + "learning_rate": 8.680964087496303e-07, + "loss": 0.2974, + "step": 1484 + }, + { + "epoch": 0.7484879032258065, + "grad_norm": 0.1848664095238554, + "learning_rate": 8.679146030997409e-07, + "loss": 0.2856, + "step": 1485 + }, + { + "epoch": 0.748991935483871, + "grad_norm": 0.1762096766799662, + "learning_rate": 8.677326913087675e-07, + "loss": 0.2715, + "step": 1486 + }, + { + "epoch": 0.7494959677419355, + "grad_norm": 0.20658420010486583, + "learning_rate": 8.675506734291906e-07, + "loss": 0.2974, + "step": 1487 + }, + { + "epoch": 0.75, + "grad_norm": 0.17593053114884585, + "learning_rate": 8.673685495135214e-07, + "loss": 0.2883, + "step": 1488 + }, + { + "epoch": 0.7505040322580645, + "grad_norm": 0.17964689526353583, + "learning_rate": 8.671863196143014e-07, + "loss": 0.2931, + "step": 1489 + }, + { + "epoch": 0.751008064516129, + "grad_norm": 0.1794940966748331, + "learning_rate": 8.670039837841028e-07, + "loss": 0.2693, + "step": 1490 + }, + { + "epoch": 0.7515120967741935, + "grad_norm": 0.17496758427902515, + "learning_rate": 8.668215420755282e-07, + "loss": 0.2708, + "step": 1491 + }, + { + "epoch": 0.7520161290322581, + "grad_norm": 0.173160052063297, + "learning_rate": 8.666389945412112e-07, + "loss": 0.2903, + "step": 1492 + }, + { + "epoch": 0.7525201612903226, + "grad_norm": 0.19227463269140316, + "learning_rate": 8.664563412338154e-07, + "loss": 0.2928, + "step": 1493 + }, + { + "epoch": 0.7530241935483871, + "grad_norm": 0.20236023713329304, + "learning_rate": 8.662735822060352e-07, + "loss": 0.2776, + "step": 1494 + }, + { + "epoch": 0.7535282258064516, + "grad_norm": 0.20307254860433052, + "learning_rate": 8.660907175105956e-07, + "loss": 0.2789, + "step": 1495 + }, + { + "epoch": 0.7540322580645161, + "grad_norm": 0.18535559574459684, + "learning_rate": 8.659077472002518e-07, + "loss": 0.2977, + "step": 1496 + }, + { + "epoch": 0.7545362903225806, + "grad_norm": 0.19262270536952245, + "learning_rate": 8.657246713277895e-07, + "loss": 0.2884, + "step": 1497 + }, + { + "epoch": 0.7550403225806451, + "grad_norm": 0.17225775461167198, + "learning_rate": 8.655414899460251e-07, + "loss": 0.2798, + "step": 1498 + }, + { + "epoch": 0.7555443548387096, + "grad_norm": 0.18058865422335652, + "learning_rate": 8.653582031078053e-07, + "loss": 0.2803, + "step": 1499 + }, + { + "epoch": 0.7560483870967742, + "grad_norm": 0.193197708837322, + "learning_rate": 8.651748108660072e-07, + "loss": 0.285, + "step": 1500 + }, + { + "epoch": 0.7565524193548387, + "grad_norm": 0.19695362977929617, + "learning_rate": 8.649913132735383e-07, + "loss": 0.2871, + "step": 1501 + }, + { + "epoch": 0.7570564516129032, + "grad_norm": 0.17883642405940453, + "learning_rate": 8.648077103833365e-07, + "loss": 0.2799, + "step": 1502 + }, + { + "epoch": 0.7575604838709677, + "grad_norm": 0.17172477648515241, + "learning_rate": 8.646240022483699e-07, + "loss": 0.292, + "step": 1503 + }, + { + "epoch": 0.7580645161290323, + "grad_norm": 0.17575732800042274, + "learning_rate": 8.644401889216377e-07, + "loss": 0.2777, + "step": 1504 + }, + { + "epoch": 0.7585685483870968, + "grad_norm": 0.17678434388091105, + "learning_rate": 8.642562704561684e-07, + "loss": 0.2956, + "step": 1505 + }, + { + "epoch": 0.7590725806451613, + "grad_norm": 0.18058078973223446, + "learning_rate": 8.640722469050217e-07, + "loss": 0.2887, + "step": 1506 + }, + { + "epoch": 0.7595766129032258, + "grad_norm": 0.1742859796225662, + "learning_rate": 8.638881183212869e-07, + "loss": 0.2787, + "step": 1507 + }, + { + "epoch": 0.7600806451612904, + "grad_norm": 0.17057933697554573, + "learning_rate": 8.637038847580842e-07, + "loss": 0.2822, + "step": 1508 + }, + { + "epoch": 0.7605846774193549, + "grad_norm": 0.17396796248874147, + "learning_rate": 8.635195462685637e-07, + "loss": 0.2898, + "step": 1509 + }, + { + "epoch": 0.7610887096774194, + "grad_norm": 0.1754476685521095, + "learning_rate": 8.633351029059061e-07, + "loss": 0.2816, + "step": 1510 + }, + { + "epoch": 0.7615927419354839, + "grad_norm": 0.1996562682521391, + "learning_rate": 8.63150554723322e-07, + "loss": 0.2802, + "step": 1511 + }, + { + "epoch": 0.7620967741935484, + "grad_norm": 0.18654386311810392, + "learning_rate": 8.629659017740525e-07, + "loss": 0.2843, + "step": 1512 + }, + { + "epoch": 0.7626008064516129, + "grad_norm": 0.1795688988462513, + "learning_rate": 8.627811441113688e-07, + "loss": 0.2972, + "step": 1513 + }, + { + "epoch": 0.7631048387096774, + "grad_norm": 0.1700302651183351, + "learning_rate": 8.625962817885723e-07, + "loss": 0.276, + "step": 1514 + }, + { + "epoch": 0.7636088709677419, + "grad_norm": 0.17276816293597855, + "learning_rate": 8.624113148589947e-07, + "loss": 0.2857, + "step": 1515 + }, + { + "epoch": 0.7641129032258065, + "grad_norm": 0.17205240872531083, + "learning_rate": 8.622262433759976e-07, + "loss": 0.2898, + "step": 1516 + }, + { + "epoch": 0.764616935483871, + "grad_norm": 0.19308777537983587, + "learning_rate": 8.620410673929732e-07, + "loss": 0.2911, + "step": 1517 + }, + { + "epoch": 0.7651209677419355, + "grad_norm": 0.18066352556552442, + "learning_rate": 8.618557869633438e-07, + "loss": 0.2851, + "step": 1518 + }, + { + "epoch": 0.765625, + "grad_norm": 0.1740551426558877, + "learning_rate": 8.616704021405613e-07, + "loss": 0.2754, + "step": 1519 + }, + { + "epoch": 0.7661290322580645, + "grad_norm": 0.20220657101308243, + "learning_rate": 8.614849129781084e-07, + "loss": 0.2675, + "step": 1520 + }, + { + "epoch": 0.766633064516129, + "grad_norm": 0.17142601346124745, + "learning_rate": 8.612993195294971e-07, + "loss": 0.2697, + "step": 1521 + }, + { + "epoch": 0.7671370967741935, + "grad_norm": 0.20383494070933988, + "learning_rate": 8.611136218482704e-07, + "loss": 0.2917, + "step": 1522 + }, + { + "epoch": 0.7676411290322581, + "grad_norm": 0.17942668361212222, + "learning_rate": 8.609278199880007e-07, + "loss": 0.2996, + "step": 1523 + }, + { + "epoch": 0.7681451612903226, + "grad_norm": 0.21582357596764432, + "learning_rate": 8.607419140022908e-07, + "loss": 0.2905, + "step": 1524 + }, + { + "epoch": 0.7686491935483871, + "grad_norm": 0.19537946016922536, + "learning_rate": 8.605559039447734e-07, + "loss": 0.2897, + "step": 1525 + }, + { + "epoch": 0.7691532258064516, + "grad_norm": 0.17446732973784299, + "learning_rate": 8.603697898691112e-07, + "loss": 0.2922, + "step": 1526 + }, + { + "epoch": 0.7696572580645161, + "grad_norm": 0.17940436217661643, + "learning_rate": 8.601835718289971e-07, + "loss": 0.2897, + "step": 1527 + }, + { + "epoch": 0.7701612903225806, + "grad_norm": 0.16967508800400594, + "learning_rate": 8.599972498781536e-07, + "loss": 0.2838, + "step": 1528 + }, + { + "epoch": 0.7706653225806451, + "grad_norm": 0.17730457999766902, + "learning_rate": 8.598108240703337e-07, + "loss": 0.2863, + "step": 1529 + }, + { + "epoch": 0.7711693548387096, + "grad_norm": 0.17432021927654012, + "learning_rate": 8.5962429445932e-07, + "loss": 0.2838, + "step": 1530 + }, + { + "epoch": 0.7716733870967742, + "grad_norm": 0.18292364816715403, + "learning_rate": 8.594376610989249e-07, + "loss": 0.2889, + "step": 1531 + }, + { + "epoch": 0.7721774193548387, + "grad_norm": 0.1692258045511517, + "learning_rate": 8.592509240429913e-07, + "loss": 0.2859, + "step": 1532 + }, + { + "epoch": 0.7726814516129032, + "grad_norm": 0.17793048496272398, + "learning_rate": 8.590640833453916e-07, + "loss": 0.2962, + "step": 1533 + }, + { + "epoch": 0.7731854838709677, + "grad_norm": 0.170590500112818, + "learning_rate": 8.58877139060028e-07, + "loss": 0.273, + "step": 1534 + }, + { + "epoch": 0.7736895161290323, + "grad_norm": 0.1748601955404486, + "learning_rate": 8.58690091240833e-07, + "loss": 0.2844, + "step": 1535 + }, + { + "epoch": 0.7741935483870968, + "grad_norm": 0.18730945527731682, + "learning_rate": 8.585029399417687e-07, + "loss": 0.3091, + "step": 1536 + }, + { + "epoch": 0.7746975806451613, + "grad_norm": 0.18332986307488439, + "learning_rate": 8.583156852168269e-07, + "loss": 0.2793, + "step": 1537 + }, + { + "epoch": 0.7752016129032258, + "grad_norm": 0.18440491915084045, + "learning_rate": 8.581283271200297e-07, + "loss": 0.2862, + "step": 1538 + }, + { + "epoch": 0.7757056451612904, + "grad_norm": 0.173582809480461, + "learning_rate": 8.579408657054286e-07, + "loss": 0.2792, + "step": 1539 + }, + { + "epoch": 0.7762096774193549, + "grad_norm": 0.18014379318891854, + "learning_rate": 8.577533010271049e-07, + "loss": 0.2919, + "step": 1540 + }, + { + "epoch": 0.7767137096774194, + "grad_norm": 0.1722503671402626, + "learning_rate": 8.575656331391702e-07, + "loss": 0.2697, + "step": 1541 + }, + { + "epoch": 0.7772177419354839, + "grad_norm": 0.1727617532348802, + "learning_rate": 8.573778620957652e-07, + "loss": 0.3044, + "step": 1542 + }, + { + "epoch": 0.7777217741935484, + "grad_norm": 0.17458552774869926, + "learning_rate": 8.571899879510609e-07, + "loss": 0.3022, + "step": 1543 + }, + { + "epoch": 0.7782258064516129, + "grad_norm": 0.19575211026743364, + "learning_rate": 8.570020107592579e-07, + "loss": 0.2729, + "step": 1544 + }, + { + "epoch": 0.7787298387096774, + "grad_norm": 0.18571171303798414, + "learning_rate": 8.568139305745861e-07, + "loss": 0.2797, + "step": 1545 + }, + { + "epoch": 0.7792338709677419, + "grad_norm": 0.19155677402308682, + "learning_rate": 8.566257474513057e-07, + "loss": 0.2999, + "step": 1546 + }, + { + "epoch": 0.7797379032258065, + "grad_norm": 0.19852085649960613, + "learning_rate": 8.564374614437065e-07, + "loss": 0.2948, + "step": 1547 + }, + { + "epoch": 0.780241935483871, + "grad_norm": 0.18180259804141408, + "learning_rate": 8.562490726061074e-07, + "loss": 0.272, + "step": 1548 + }, + { + "epoch": 0.7807459677419355, + "grad_norm": 0.1757617028962345, + "learning_rate": 8.560605809928578e-07, + "loss": 0.2899, + "step": 1549 + }, + { + "epoch": 0.78125, + "grad_norm": 0.18002181651881002, + "learning_rate": 8.558719866583364e-07, + "loss": 0.2801, + "step": 1550 + }, + { + "epoch": 0.7817540322580645, + "grad_norm": 0.18391600786228307, + "learning_rate": 8.556832896569512e-07, + "loss": 0.2953, + "step": 1551 + }, + { + "epoch": 0.782258064516129, + "grad_norm": 0.1696821891773392, + "learning_rate": 8.554944900431405e-07, + "loss": 0.2765, + "step": 1552 + }, + { + "epoch": 0.7827620967741935, + "grad_norm": 0.17250755946675386, + "learning_rate": 8.553055878713714e-07, + "loss": 0.2742, + "step": 1553 + }, + { + "epoch": 0.7832661290322581, + "grad_norm": 0.17460773929441345, + "learning_rate": 8.551165831961414e-07, + "loss": 0.2941, + "step": 1554 + }, + { + "epoch": 0.7837701612903226, + "grad_norm": 0.17494634704080653, + "learning_rate": 8.549274760719767e-07, + "loss": 0.2838, + "step": 1555 + }, + { + "epoch": 0.7842741935483871, + "grad_norm": 0.18400774964808703, + "learning_rate": 8.547382665534339e-07, + "loss": 0.2913, + "step": 1556 + }, + { + "epoch": 0.7847782258064516, + "grad_norm": 0.17748115232473677, + "learning_rate": 8.545489546950988e-07, + "loss": 0.2859, + "step": 1557 + }, + { + "epoch": 0.7852822580645161, + "grad_norm": 0.18218986931380948, + "learning_rate": 8.543595405515864e-07, + "loss": 0.2812, + "step": 1558 + }, + { + "epoch": 0.7857862903225806, + "grad_norm": 0.1969916491736838, + "learning_rate": 8.541700241775419e-07, + "loss": 0.2888, + "step": 1559 + }, + { + "epoch": 0.7862903225806451, + "grad_norm": 0.20960556550108175, + "learning_rate": 8.539804056276393e-07, + "loss": 0.2896, + "step": 1560 + }, + { + "epoch": 0.7867943548387096, + "grad_norm": 0.18784903454787713, + "learning_rate": 8.537906849565824e-07, + "loss": 0.292, + "step": 1561 + }, + { + "epoch": 0.7872983870967742, + "grad_norm": 0.17668988962371776, + "learning_rate": 8.536008622191047e-07, + "loss": 0.2832, + "step": 1562 + }, + { + "epoch": 0.7878024193548387, + "grad_norm": 0.1938218281223095, + "learning_rate": 8.534109374699685e-07, + "loss": 0.2806, + "step": 1563 + }, + { + "epoch": 0.7883064516129032, + "grad_norm": 0.1724503265255416, + "learning_rate": 8.532209107639661e-07, + "loss": 0.2836, + "step": 1564 + }, + { + "epoch": 0.7888104838709677, + "grad_norm": 0.216684297634416, + "learning_rate": 8.530307821559192e-07, + "loss": 0.2834, + "step": 1565 + }, + { + "epoch": 0.7893145161290323, + "grad_norm": 0.1952086001178248, + "learning_rate": 8.528405517006785e-07, + "loss": 0.2703, + "step": 1566 + }, + { + "epoch": 0.7898185483870968, + "grad_norm": 0.17917540283085726, + "learning_rate": 8.526502194531242e-07, + "loss": 0.2795, + "step": 1567 + }, + { + "epoch": 0.7903225806451613, + "grad_norm": 0.16957021131556896, + "learning_rate": 8.524597854681663e-07, + "loss": 0.2849, + "step": 1568 + }, + { + "epoch": 0.7908266129032258, + "grad_norm": 0.17315335375074736, + "learning_rate": 8.522692498007436e-07, + "loss": 0.2706, + "step": 1569 + }, + { + "epoch": 0.7913306451612904, + "grad_norm": 0.17415225412091026, + "learning_rate": 8.520786125058246e-07, + "loss": 0.2784, + "step": 1570 + }, + { + "epoch": 0.7918346774193549, + "grad_norm": 0.19715463128787936, + "learning_rate": 8.518878736384067e-07, + "loss": 0.2791, + "step": 1571 + }, + { + "epoch": 0.7923387096774194, + "grad_norm": 0.18485659414062103, + "learning_rate": 8.516970332535174e-07, + "loss": 0.2883, + "step": 1572 + }, + { + "epoch": 0.7928427419354839, + "grad_norm": 0.17412481517409986, + "learning_rate": 8.515060914062124e-07, + "loss": 0.2891, + "step": 1573 + }, + { + "epoch": 0.7933467741935484, + "grad_norm": 0.19186859393133207, + "learning_rate": 8.513150481515777e-07, + "loss": 0.2861, + "step": 1574 + }, + { + "epoch": 0.7938508064516129, + "grad_norm": 0.17924622608650354, + "learning_rate": 8.511239035447277e-07, + "loss": 0.2769, + "step": 1575 + }, + { + "epoch": 0.7943548387096774, + "grad_norm": 0.21636343101039252, + "learning_rate": 8.509326576408066e-07, + "loss": 0.2809, + "step": 1576 + }, + { + "epoch": 0.7948588709677419, + "grad_norm": 0.1988798854713325, + "learning_rate": 8.507413104949878e-07, + "loss": 0.2817, + "step": 1577 + }, + { + "epoch": 0.7953629032258065, + "grad_norm": 0.20133302174026105, + "learning_rate": 8.505498621624738e-07, + "loss": 0.273, + "step": 1578 + }, + { + "epoch": 0.795866935483871, + "grad_norm": 0.18247301896520096, + "learning_rate": 8.503583126984959e-07, + "loss": 0.2851, + "step": 1579 + }, + { + "epoch": 0.7963709677419355, + "grad_norm": 0.179121678370444, + "learning_rate": 8.501666621583152e-07, + "loss": 0.2817, + "step": 1580 + }, + { + "epoch": 0.796875, + "grad_norm": 0.1794015027687013, + "learning_rate": 8.499749105972216e-07, + "loss": 0.277, + "step": 1581 + }, + { + "epoch": 0.7973790322580645, + "grad_norm": 0.18799369028010127, + "learning_rate": 8.497830580705343e-07, + "loss": 0.2811, + "step": 1582 + }, + { + "epoch": 0.797883064516129, + "grad_norm": 0.19305091528117443, + "learning_rate": 8.495911046336015e-07, + "loss": 0.2995, + "step": 1583 + }, + { + "epoch": 0.7983870967741935, + "grad_norm": 0.2066980461918033, + "learning_rate": 8.493990503418007e-07, + "loss": 0.2982, + "step": 1584 + }, + { + "epoch": 0.7988911290322581, + "grad_norm": 0.17520616254907886, + "learning_rate": 8.492068952505382e-07, + "loss": 0.271, + "step": 1585 + }, + { + "epoch": 0.7993951612903226, + "grad_norm": 0.17626401945753917, + "learning_rate": 8.490146394152497e-07, + "loss": 0.2858, + "step": 1586 + }, + { + "epoch": 0.7998991935483871, + "grad_norm": 0.1752484381904052, + "learning_rate": 8.488222828913998e-07, + "loss": 0.2862, + "step": 1587 + }, + { + "epoch": 0.8004032258064516, + "grad_norm": 0.18080248190940781, + "learning_rate": 8.486298257344821e-07, + "loss": 0.2815, + "step": 1588 + }, + { + "epoch": 0.8009072580645161, + "grad_norm": 0.18782657218788468, + "learning_rate": 8.484372680000193e-07, + "loss": 0.2907, + "step": 1589 + }, + { + "epoch": 0.8014112903225806, + "grad_norm": 0.17866164980608917, + "learning_rate": 8.482446097435631e-07, + "loss": 0.2863, + "step": 1590 + }, + { + "epoch": 0.8019153225806451, + "grad_norm": 0.16998904102569148, + "learning_rate": 8.480518510206942e-07, + "loss": 0.2866, + "step": 1591 + }, + { + "epoch": 0.8024193548387096, + "grad_norm": 0.17671260570065095, + "learning_rate": 8.478589918870225e-07, + "loss": 0.2965, + "step": 1592 + }, + { + "epoch": 0.8024193548387096, + "eval_loss": 0.31474894285202026, + "eval_runtime": 17.2161, + "eval_samples_per_second": 49.663, + "eval_steps_per_second": 1.046, + "step": 1592 + }, + { + "epoch": 0.8029233870967742, + "grad_norm": 0.17663891905474427, + "learning_rate": 8.476660323981863e-07, + "loss": 0.2894, + "step": 1593 + }, + { + "epoch": 0.8034274193548387, + "grad_norm": 0.2011698318082641, + "learning_rate": 8.474729726098537e-07, + "loss": 0.2922, + "step": 1594 + }, + { + "epoch": 0.8039314516129032, + "grad_norm": 0.20386721072025002, + "learning_rate": 8.472798125777208e-07, + "loss": 0.2797, + "step": 1595 + }, + { + "epoch": 0.8044354838709677, + "grad_norm": 0.21744706928084864, + "learning_rate": 8.470865523575133e-07, + "loss": 0.2773, + "step": 1596 + }, + { + "epoch": 0.8049395161290323, + "grad_norm": 0.1749022648400484, + "learning_rate": 8.468931920049855e-07, + "loss": 0.2893, + "step": 1597 + }, + { + "epoch": 0.8054435483870968, + "grad_norm": 0.17378425941567904, + "learning_rate": 8.466997315759207e-07, + "loss": 0.2885, + "step": 1598 + }, + { + "epoch": 0.8059475806451613, + "grad_norm": 0.2151441210122262, + "learning_rate": 8.465061711261312e-07, + "loss": 0.2812, + "step": 1599 + }, + { + "epoch": 0.8064516129032258, + "grad_norm": 0.20189295912194238, + "learning_rate": 8.463125107114576e-07, + "loss": 0.2938, + "step": 1600 + }, + { + "epoch": 0.8069556451612904, + "grad_norm": 0.1737385784520349, + "learning_rate": 8.461187503877701e-07, + "loss": 0.2913, + "step": 1601 + }, + { + "epoch": 0.8074596774193549, + "grad_norm": 0.17511209441842182, + "learning_rate": 8.459248902109671e-07, + "loss": 0.2972, + "step": 1602 + }, + { + "epoch": 0.8079637096774194, + "grad_norm": 0.17927894922934345, + "learning_rate": 8.457309302369762e-07, + "loss": 0.2751, + "step": 1603 + }, + { + "epoch": 0.8084677419354839, + "grad_norm": 0.1711157379063894, + "learning_rate": 8.455368705217536e-07, + "loss": 0.293, + "step": 1604 + }, + { + "epoch": 0.8089717741935484, + "grad_norm": 0.20561980849817957, + "learning_rate": 8.453427111212844e-07, + "loss": 0.2804, + "step": 1605 + }, + { + "epoch": 0.8094758064516129, + "grad_norm": 0.18016754167062388, + "learning_rate": 8.451484520915823e-07, + "loss": 0.2931, + "step": 1606 + }, + { + "epoch": 0.8099798387096774, + "grad_norm": 0.19807530942971208, + "learning_rate": 8.449540934886898e-07, + "loss": 0.2947, + "step": 1607 + }, + { + "epoch": 0.8104838709677419, + "grad_norm": 0.18074849101785265, + "learning_rate": 8.447596353686783e-07, + "loss": 0.3026, + "step": 1608 + }, + { + "epoch": 0.8109879032258065, + "grad_norm": 0.18227592034924972, + "learning_rate": 8.445650777876477e-07, + "loss": 0.2942, + "step": 1609 + }, + { + "epoch": 0.811491935483871, + "grad_norm": 0.1807387057488744, + "learning_rate": 8.443704208017265e-07, + "loss": 0.2905, + "step": 1610 + }, + { + "epoch": 0.8119959677419355, + "grad_norm": 0.17752462791309626, + "learning_rate": 8.441756644670721e-07, + "loss": 0.3053, + "step": 1611 + }, + { + "epoch": 0.8125, + "grad_norm": 0.17960458304361665, + "learning_rate": 8.439808088398708e-07, + "loss": 0.2807, + "step": 1612 + }, + { + "epoch": 0.8130040322580645, + "grad_norm": 0.17933572002728534, + "learning_rate": 8.437858539763368e-07, + "loss": 0.2874, + "step": 1613 + }, + { + "epoch": 0.813508064516129, + "grad_norm": 0.17388277996545867, + "learning_rate": 8.435907999327137e-07, + "loss": 0.2872, + "step": 1614 + }, + { + "epoch": 0.8140120967741935, + "grad_norm": 0.20696380286831806, + "learning_rate": 8.433956467652731e-07, + "loss": 0.296, + "step": 1615 + }, + { + "epoch": 0.8145161290322581, + "grad_norm": 0.19036323172621478, + "learning_rate": 8.432003945303157e-07, + "loss": 0.2919, + "step": 1616 + }, + { + "epoch": 0.8150201612903226, + "grad_norm": 0.17417826300098163, + "learning_rate": 8.430050432841705e-07, + "loss": 0.2839, + "step": 1617 + }, + { + "epoch": 0.8155241935483871, + "grad_norm": 0.1764029810149293, + "learning_rate": 8.428095930831951e-07, + "loss": 0.2801, + "step": 1618 + }, + { + "epoch": 0.8160282258064516, + "grad_norm": 0.18297026792189108, + "learning_rate": 8.426140439837758e-07, + "loss": 0.2907, + "step": 1619 + }, + { + "epoch": 0.8165322580645161, + "grad_norm": 0.1821471162865555, + "learning_rate": 8.42418396042327e-07, + "loss": 0.2871, + "step": 1620 + }, + { + "epoch": 0.8170362903225806, + "grad_norm": 0.18365629188801313, + "learning_rate": 8.422226493152923e-07, + "loss": 0.2729, + "step": 1621 + }, + { + "epoch": 0.8175403225806451, + "grad_norm": 0.17997053499497517, + "learning_rate": 8.420268038591432e-07, + "loss": 0.2928, + "step": 1622 + }, + { + "epoch": 0.8180443548387096, + "grad_norm": 0.18199210579188108, + "learning_rate": 8.418308597303798e-07, + "loss": 0.2833, + "step": 1623 + }, + { + "epoch": 0.8185483870967742, + "grad_norm": 0.17684377413059332, + "learning_rate": 8.41634816985531e-07, + "loss": 0.2821, + "step": 1624 + }, + { + "epoch": 0.8190524193548387, + "grad_norm": 0.1729650092760548, + "learning_rate": 8.414386756811538e-07, + "loss": 0.2915, + "step": 1625 + }, + { + "epoch": 0.8195564516129032, + "grad_norm": 0.16853170774887175, + "learning_rate": 8.412424358738337e-07, + "loss": 0.296, + "step": 1626 + }, + { + "epoch": 0.8200604838709677, + "grad_norm": 0.17189289211370995, + "learning_rate": 8.410460976201847e-07, + "loss": 0.3022, + "step": 1627 + }, + { + "epoch": 0.8205645161290323, + "grad_norm": 0.18303909561880122, + "learning_rate": 8.408496609768494e-07, + "loss": 0.27, + "step": 1628 + }, + { + "epoch": 0.8210685483870968, + "grad_norm": 0.1689191180195712, + "learning_rate": 8.406531260004983e-07, + "loss": 0.283, + "step": 1629 + }, + { + "epoch": 0.8215725806451613, + "grad_norm": 0.19059066020698312, + "learning_rate": 8.404564927478304e-07, + "loss": 0.2733, + "step": 1630 + }, + { + "epoch": 0.8220766129032258, + "grad_norm": 0.18412586501703707, + "learning_rate": 8.402597612755736e-07, + "loss": 0.2958, + "step": 1631 + }, + { + "epoch": 0.8225806451612904, + "grad_norm": 0.1865008483611006, + "learning_rate": 8.400629316404833e-07, + "loss": 0.28, + "step": 1632 + }, + { + "epoch": 0.8230846774193549, + "grad_norm": 0.1848533702786811, + "learning_rate": 8.398660038993439e-07, + "loss": 0.2806, + "step": 1633 + }, + { + "epoch": 0.8235887096774194, + "grad_norm": 0.1737421957224456, + "learning_rate": 8.396689781089676e-07, + "loss": 0.285, + "step": 1634 + }, + { + "epoch": 0.8240927419354839, + "grad_norm": 0.16653152620902592, + "learning_rate": 8.394718543261954e-07, + "loss": 0.2723, + "step": 1635 + }, + { + "epoch": 0.8245967741935484, + "grad_norm": 0.1758054798983086, + "learning_rate": 8.392746326078961e-07, + "loss": 0.2889, + "step": 1636 + }, + { + "epoch": 0.8251008064516129, + "grad_norm": 0.17075324603300618, + "learning_rate": 8.39077313010967e-07, + "loss": 0.2858, + "step": 1637 + }, + { + "epoch": 0.8256048387096774, + "grad_norm": 0.18600083079394014, + "learning_rate": 8.388798955923335e-07, + "loss": 0.2735, + "step": 1638 + }, + { + "epoch": 0.8261088709677419, + "grad_norm": 0.1961620789751493, + "learning_rate": 8.386823804089496e-07, + "loss": 0.2992, + "step": 1639 + }, + { + "epoch": 0.8266129032258065, + "grad_norm": 0.2056030735817931, + "learning_rate": 8.384847675177968e-07, + "loss": 0.2966, + "step": 1640 + }, + { + "epoch": 0.827116935483871, + "grad_norm": 0.178659647296446, + "learning_rate": 8.382870569758853e-07, + "loss": 0.2943, + "step": 1641 + }, + { + "epoch": 0.8276209677419355, + "grad_norm": 0.17144740006998577, + "learning_rate": 8.380892488402535e-07, + "loss": 0.2725, + "step": 1642 + }, + { + "epoch": 0.828125, + "grad_norm": 0.2048573646210055, + "learning_rate": 8.378913431679677e-07, + "loss": 0.2654, + "step": 1643 + }, + { + "epoch": 0.8286290322580645, + "grad_norm": 0.19608901393965525, + "learning_rate": 8.376933400161226e-07, + "loss": 0.2657, + "step": 1644 + }, + { + "epoch": 0.829133064516129, + "grad_norm": 0.16996202799605425, + "learning_rate": 8.374952394418409e-07, + "loss": 0.2857, + "step": 1645 + }, + { + "epoch": 0.8296370967741935, + "grad_norm": 0.1804403768350341, + "learning_rate": 8.37297041502273e-07, + "loss": 0.2889, + "step": 1646 + }, + { + "epoch": 0.8301411290322581, + "grad_norm": 0.18021938104304921, + "learning_rate": 8.370987462545984e-07, + "loss": 0.2836, + "step": 1647 + }, + { + "epoch": 0.8306451612903226, + "grad_norm": 0.18418845425920574, + "learning_rate": 8.369003537560237e-07, + "loss": 0.2812, + "step": 1648 + }, + { + "epoch": 0.8311491935483871, + "grad_norm": 0.2151893686661586, + "learning_rate": 8.367018640637838e-07, + "loss": 0.2856, + "step": 1649 + }, + { + "epoch": 0.8316532258064516, + "grad_norm": 0.25324081072788207, + "learning_rate": 8.365032772351419e-07, + "loss": 0.2745, + "step": 1650 + }, + { + "epoch": 0.8321572580645161, + "grad_norm": 0.17621766684272705, + "learning_rate": 8.363045933273889e-07, + "loss": 0.2813, + "step": 1651 + }, + { + "epoch": 0.8326612903225806, + "grad_norm": 0.1810792109703597, + "learning_rate": 8.361058123978442e-07, + "loss": 0.2958, + "step": 1652 + }, + { + "epoch": 0.8331653225806451, + "grad_norm": 0.17290815775300664, + "learning_rate": 8.359069345038548e-07, + "loss": 0.2821, + "step": 1653 + }, + { + "epoch": 0.8336693548387096, + "grad_norm": 0.17231901628427526, + "learning_rate": 8.357079597027954e-07, + "loss": 0.2793, + "step": 1654 + }, + { + "epoch": 0.8341733870967742, + "grad_norm": 0.19031643192982242, + "learning_rate": 8.355088880520693e-07, + "loss": 0.2931, + "step": 1655 + }, + { + "epoch": 0.8346774193548387, + "grad_norm": 0.1773771991850869, + "learning_rate": 8.353097196091074e-07, + "loss": 0.2897, + "step": 1656 + }, + { + "epoch": 0.8351814516129032, + "grad_norm": 0.18048345032642082, + "learning_rate": 8.351104544313685e-07, + "loss": 0.2836, + "step": 1657 + }, + { + "epoch": 0.8356854838709677, + "grad_norm": 0.18443696405092166, + "learning_rate": 8.349110925763393e-07, + "loss": 0.2846, + "step": 1658 + }, + { + "epoch": 0.8361895161290323, + "grad_norm": 0.1772897016179952, + "learning_rate": 8.347116341015347e-07, + "loss": 0.2932, + "step": 1659 + }, + { + "epoch": 0.8366935483870968, + "grad_norm": 0.1790343811345245, + "learning_rate": 8.34512079064497e-07, + "loss": 0.2915, + "step": 1660 + }, + { + "epoch": 0.8371975806451613, + "grad_norm": 0.18840621984787398, + "learning_rate": 8.343124275227968e-07, + "loss": 0.2816, + "step": 1661 + }, + { + "epoch": 0.8377016129032258, + "grad_norm": 0.17628267263776604, + "learning_rate": 8.341126795340321e-07, + "loss": 0.2805, + "step": 1662 + }, + { + "epoch": 0.8382056451612904, + "grad_norm": 0.17600045722386132, + "learning_rate": 8.339128351558291e-07, + "loss": 0.29, + "step": 1663 + }, + { + "epoch": 0.8387096774193549, + "grad_norm": 0.182325857327577, + "learning_rate": 8.337128944458415e-07, + "loss": 0.2903, + "step": 1664 + }, + { + "epoch": 0.8392137096774194, + "grad_norm": 0.19535349793236487, + "learning_rate": 8.335128574617513e-07, + "loss": 0.2858, + "step": 1665 + }, + { + "epoch": 0.8397177419354839, + "grad_norm": 0.27705115942369163, + "learning_rate": 8.333127242612677e-07, + "loss": 0.2857, + "step": 1666 + }, + { + "epoch": 0.8402217741935484, + "grad_norm": 0.185724216553934, + "learning_rate": 8.331124949021279e-07, + "loss": 0.2583, + "step": 1667 + }, + { + "epoch": 0.8407258064516129, + "grad_norm": 0.18254445448952297, + "learning_rate": 8.329121694420969e-07, + "loss": 0.2915, + "step": 1668 + }, + { + "epoch": 0.8412298387096774, + "grad_norm": 0.18038012693100844, + "learning_rate": 8.327117479389672e-07, + "loss": 0.2792, + "step": 1669 + }, + { + "epoch": 0.8417338709677419, + "grad_norm": 0.19274233034208005, + "learning_rate": 8.325112304505592e-07, + "loss": 0.2805, + "step": 1670 + }, + { + "epoch": 0.8422379032258065, + "grad_norm": 0.17681044961531925, + "learning_rate": 8.323106170347212e-07, + "loss": 0.2723, + "step": 1671 + }, + { + "epoch": 0.842741935483871, + "grad_norm": 0.17251043391808404, + "learning_rate": 8.321099077493285e-07, + "loss": 0.2696, + "step": 1672 + }, + { + "epoch": 0.8432459677419355, + "grad_norm": 0.1752390712831652, + "learning_rate": 8.319091026522848e-07, + "loss": 0.2738, + "step": 1673 + }, + { + "epoch": 0.84375, + "grad_norm": 0.17935779324663959, + "learning_rate": 8.317082018015211e-07, + "loss": 0.2862, + "step": 1674 + }, + { + "epoch": 0.8442540322580645, + "grad_norm": 0.19723380525297224, + "learning_rate": 8.315072052549961e-07, + "loss": 0.2961, + "step": 1675 + }, + { + "epoch": 0.844758064516129, + "grad_norm": 0.17594438243367191, + "learning_rate": 8.313061130706959e-07, + "loss": 0.2838, + "step": 1676 + }, + { + "epoch": 0.8452620967741935, + "grad_norm": 0.18502761220597608, + "learning_rate": 8.311049253066344e-07, + "loss": 0.2904, + "step": 1677 + }, + { + "epoch": 0.8457661290322581, + "grad_norm": 0.1735632363360221, + "learning_rate": 8.30903642020853e-07, + "loss": 0.286, + "step": 1678 + }, + { + "epoch": 0.8462701612903226, + "grad_norm": 0.1685121845779261, + "learning_rate": 8.307022632714208e-07, + "loss": 0.266, + "step": 1679 + }, + { + "epoch": 0.8467741935483871, + "grad_norm": 0.17378234473772378, + "learning_rate": 8.305007891164341e-07, + "loss": 0.2904, + "step": 1680 + }, + { + "epoch": 0.8472782258064516, + "grad_norm": 0.2032632255195518, + "learning_rate": 8.302992196140173e-07, + "loss": 0.2895, + "step": 1681 + }, + { + "epoch": 0.8477822580645161, + "grad_norm": 0.17748247042521456, + "learning_rate": 8.300975548223214e-07, + "loss": 0.3003, + "step": 1682 + }, + { + "epoch": 0.8482862903225806, + "grad_norm": 0.195328665507149, + "learning_rate": 8.298957947995261e-07, + "loss": 0.2832, + "step": 1683 + }, + { + "epoch": 0.8487903225806451, + "grad_norm": 0.17552608040636258, + "learning_rate": 8.296939396038375e-07, + "loss": 0.2866, + "step": 1684 + }, + { + "epoch": 0.8492943548387096, + "grad_norm": 0.17824578366133345, + "learning_rate": 8.294919892934896e-07, + "loss": 0.2779, + "step": 1685 + }, + { + "epoch": 0.8497983870967742, + "grad_norm": 0.17486942680363315, + "learning_rate": 8.29289943926744e-07, + "loss": 0.2617, + "step": 1686 + }, + { + "epoch": 0.8503024193548387, + "grad_norm": 0.18914845225754362, + "learning_rate": 8.290878035618893e-07, + "loss": 0.2812, + "step": 1687 + }, + { + "epoch": 0.8508064516129032, + "grad_norm": 0.18115901123985698, + "learning_rate": 8.28885568257242e-07, + "loss": 0.3004, + "step": 1688 + }, + { + "epoch": 0.8513104838709677, + "grad_norm": 0.17449979315444422, + "learning_rate": 8.286832380711454e-07, + "loss": 0.273, + "step": 1689 + }, + { + "epoch": 0.8518145161290323, + "grad_norm": 0.1782814890164981, + "learning_rate": 8.284808130619708e-07, + "loss": 0.2997, + "step": 1690 + }, + { + "epoch": 0.8523185483870968, + "grad_norm": 0.1710342855678077, + "learning_rate": 8.282782932881165e-07, + "loss": 0.2864, + "step": 1691 + }, + { + "epoch": 0.8528225806451613, + "grad_norm": 0.18551314368513988, + "learning_rate": 8.280756788080081e-07, + "loss": 0.2847, + "step": 1692 + }, + { + "epoch": 0.8533266129032258, + "grad_norm": 0.17793439535374442, + "learning_rate": 8.278729696800988e-07, + "loss": 0.3006, + "step": 1693 + }, + { + "epoch": 0.8538306451612904, + "grad_norm": 0.17754655120950547, + "learning_rate": 8.276701659628686e-07, + "loss": 0.2889, + "step": 1694 + }, + { + "epoch": 0.8543346774193549, + "grad_norm": 0.18489194070233667, + "learning_rate": 8.274672677148256e-07, + "loss": 0.2725, + "step": 1695 + }, + { + "epoch": 0.8548387096774194, + "grad_norm": 0.1967447154512761, + "learning_rate": 8.272642749945042e-07, + "loss": 0.2857, + "step": 1696 + }, + { + "epoch": 0.8553427419354839, + "grad_norm": 0.17696581032696806, + "learning_rate": 8.270611878604669e-07, + "loss": 0.2793, + "step": 1697 + }, + { + "epoch": 0.8558467741935484, + "grad_norm": 0.17041142623174294, + "learning_rate": 8.268580063713028e-07, + "loss": 0.2708, + "step": 1698 + }, + { + "epoch": 0.8563508064516129, + "grad_norm": 0.17554540593528697, + "learning_rate": 8.266547305856288e-07, + "loss": 0.2708, + "step": 1699 + }, + { + "epoch": 0.8568548387096774, + "grad_norm": 0.17308164296880016, + "learning_rate": 8.264513605620884e-07, + "loss": 0.2738, + "step": 1700 + }, + { + "epoch": 0.8573588709677419, + "grad_norm": 0.2285975263899735, + "learning_rate": 8.262478963593529e-07, + "loss": 0.2732, + "step": 1701 + }, + { + "epoch": 0.8578629032258065, + "grad_norm": 0.1765909866610657, + "learning_rate": 8.260443380361201e-07, + "loss": 0.2822, + "step": 1702 + }, + { + "epoch": 0.858366935483871, + "grad_norm": 0.18612762071005345, + "learning_rate": 8.258406856511157e-07, + "loss": 0.2897, + "step": 1703 + }, + { + "epoch": 0.8588709677419355, + "grad_norm": 0.18274802390819692, + "learning_rate": 8.256369392630918e-07, + "loss": 0.2826, + "step": 1704 + }, + { + "epoch": 0.859375, + "grad_norm": 0.1702610425035792, + "learning_rate": 8.254330989308283e-07, + "loss": 0.2825, + "step": 1705 + }, + { + "epoch": 0.8598790322580645, + "grad_norm": 0.18746425872965825, + "learning_rate": 8.252291647131315e-07, + "loss": 0.2831, + "step": 1706 + }, + { + "epoch": 0.860383064516129, + "grad_norm": 0.1729312918615105, + "learning_rate": 8.250251366688357e-07, + "loss": 0.2788, + "step": 1707 + }, + { + "epoch": 0.8608870967741935, + "grad_norm": 0.17310685779871965, + "learning_rate": 8.248210148568011e-07, + "loss": 0.2807, + "step": 1708 + }, + { + "epoch": 0.8613911290322581, + "grad_norm": 0.17388995344225858, + "learning_rate": 8.246167993359159e-07, + "loss": 0.278, + "step": 1709 + }, + { + "epoch": 0.8618951612903226, + "grad_norm": 0.17561470000841797, + "learning_rate": 8.244124901650951e-07, + "loss": 0.2852, + "step": 1710 + }, + { + "epoch": 0.8623991935483871, + "grad_norm": 0.19059405969768753, + "learning_rate": 8.242080874032804e-07, + "loss": 0.2811, + "step": 1711 + }, + { + "epoch": 0.8629032258064516, + "grad_norm": 0.17640465354683116, + "learning_rate": 8.24003591109441e-07, + "loss": 0.2837, + "step": 1712 + }, + { + "epoch": 0.8634072580645161, + "grad_norm": 0.1946696258918474, + "learning_rate": 8.237990013425725e-07, + "loss": 0.2943, + "step": 1713 + }, + { + "epoch": 0.8639112903225806, + "grad_norm": 0.17143548711069195, + "learning_rate": 8.23594318161698e-07, + "loss": 0.2844, + "step": 1714 + }, + { + "epoch": 0.8644153225806451, + "grad_norm": 0.170912525533573, + "learning_rate": 8.233895416258673e-07, + "loss": 0.2828, + "step": 1715 + }, + { + "epoch": 0.8649193548387096, + "grad_norm": 0.17030957212565428, + "learning_rate": 8.231846717941572e-07, + "loss": 0.2841, + "step": 1716 + }, + { + "epoch": 0.8654233870967742, + "grad_norm": 0.18030711489803564, + "learning_rate": 8.229797087256711e-07, + "loss": 0.2892, + "step": 1717 + }, + { + "epoch": 0.8659274193548387, + "grad_norm": 0.1811276763324713, + "learning_rate": 8.2277465247954e-07, + "loss": 0.2833, + "step": 1718 + }, + { + "epoch": 0.8664314516129032, + "grad_norm": 0.18181075140547173, + "learning_rate": 8.22569503114921e-07, + "loss": 0.2936, + "step": 1719 + }, + { + "epoch": 0.8669354838709677, + "grad_norm": 0.17610807113952007, + "learning_rate": 8.223642606909986e-07, + "loss": 0.2918, + "step": 1720 + }, + { + "epoch": 0.8674395161290323, + "grad_norm": 0.17498207358667517, + "learning_rate": 8.221589252669841e-07, + "loss": 0.2706, + "step": 1721 + }, + { + "epoch": 0.8679435483870968, + "grad_norm": 0.18462559185235636, + "learning_rate": 8.219534969021151e-07, + "loss": 0.2862, + "step": 1722 + }, + { + "epoch": 0.8684475806451613, + "grad_norm": 0.17613631479568562, + "learning_rate": 8.217479756556567e-07, + "loss": 0.2901, + "step": 1723 + }, + { + "epoch": 0.8689516129032258, + "grad_norm": 0.17518479858622174, + "learning_rate": 8.215423615869005e-07, + "loss": 0.2611, + "step": 1724 + }, + { + "epoch": 0.8694556451612904, + "grad_norm": 0.17109997839484933, + "learning_rate": 8.213366547551648e-07, + "loss": 0.2817, + "step": 1725 + }, + { + "epoch": 0.8699596774193549, + "grad_norm": 0.1966008607228567, + "learning_rate": 8.211308552197948e-07, + "loss": 0.2792, + "step": 1726 + }, + { + "epoch": 0.8704637096774194, + "grad_norm": 0.17565135968705492, + "learning_rate": 8.209249630401623e-07, + "loss": 0.2854, + "step": 1727 + }, + { + "epoch": 0.8709677419354839, + "grad_norm": 0.17137189614479656, + "learning_rate": 8.207189782756661e-07, + "loss": 0.2662, + "step": 1728 + }, + { + "epoch": 0.8714717741935484, + "grad_norm": 0.17297331900482513, + "learning_rate": 8.205129009857312e-07, + "loss": 0.2799, + "step": 1729 + }, + { + "epoch": 0.8719758064516129, + "grad_norm": 0.1724815696106094, + "learning_rate": 8.203067312298101e-07, + "loss": 0.2815, + "step": 1730 + }, + { + "epoch": 0.8724798387096774, + "grad_norm": 0.18096897924623875, + "learning_rate": 8.20100469067381e-07, + "loss": 0.2837, + "step": 1731 + }, + { + "epoch": 0.8729838709677419, + "grad_norm": 0.19781239041691703, + "learning_rate": 8.198941145579496e-07, + "loss": 0.2939, + "step": 1732 + }, + { + "epoch": 0.8734879032258065, + "grad_norm": 0.17163633709375878, + "learning_rate": 8.196876677610479e-07, + "loss": 0.266, + "step": 1733 + }, + { + "epoch": 0.873991935483871, + "grad_norm": 0.18825663758858197, + "learning_rate": 8.194811287362344e-07, + "loss": 0.291, + "step": 1734 + }, + { + "epoch": 0.8744959677419355, + "grad_norm": 0.19151890851189823, + "learning_rate": 8.192744975430941e-07, + "loss": 0.2887, + "step": 1735 + }, + { + "epoch": 0.875, + "grad_norm": 0.17804405341187254, + "learning_rate": 8.190677742412393e-07, + "loss": 0.2848, + "step": 1736 + }, + { + "epoch": 0.8755040322580645, + "grad_norm": 0.17441120813931166, + "learning_rate": 8.188609588903081e-07, + "loss": 0.297, + "step": 1737 + }, + { + "epoch": 0.876008064516129, + "grad_norm": 0.17132241278155927, + "learning_rate": 8.186540515499653e-07, + "loss": 0.2698, + "step": 1738 + }, + { + "epoch": 0.8765120967741935, + "grad_norm": 0.17642561108901925, + "learning_rate": 8.184470522799029e-07, + "loss": 0.2908, + "step": 1739 + }, + { + "epoch": 0.8770161290322581, + "grad_norm": 0.231128422621628, + "learning_rate": 8.182399611398385e-07, + "loss": 0.275, + "step": 1740 + }, + { + "epoch": 0.8775201612903226, + "grad_norm": 0.17983640539436302, + "learning_rate": 8.180327781895166e-07, + "loss": 0.2746, + "step": 1741 + }, + { + "epoch": 0.8780241935483871, + "grad_norm": 0.18298601240785914, + "learning_rate": 8.178255034887083e-07, + "loss": 0.2933, + "step": 1742 + }, + { + "epoch": 0.8785282258064516, + "grad_norm": 0.17727125106526648, + "learning_rate": 8.176181370972112e-07, + "loss": 0.3026, + "step": 1743 + }, + { + "epoch": 0.8790322580645161, + "grad_norm": 0.17900225615415002, + "learning_rate": 8.174106790748489e-07, + "loss": 0.2732, + "step": 1744 + }, + { + "epoch": 0.8795362903225806, + "grad_norm": 0.16825501654368868, + "learning_rate": 8.172031294814721e-07, + "loss": 0.2648, + "step": 1745 + }, + { + "epoch": 0.8800403225806451, + "grad_norm": 0.17838543584835861, + "learning_rate": 8.169954883769573e-07, + "loss": 0.2826, + "step": 1746 + }, + { + "epoch": 0.8805443548387096, + "grad_norm": 0.20624261385412906, + "learning_rate": 8.16787755821208e-07, + "loss": 0.2931, + "step": 1747 + }, + { + "epoch": 0.8810483870967742, + "grad_norm": 0.19622027217406832, + "learning_rate": 8.165799318741533e-07, + "loss": 0.2855, + "step": 1748 + }, + { + "epoch": 0.8815524193548387, + "grad_norm": 0.1795090027517633, + "learning_rate": 8.163720165957494e-07, + "loss": 0.2855, + "step": 1749 + }, + { + "epoch": 0.8820564516129032, + "grad_norm": 0.1727005576579451, + "learning_rate": 8.161640100459785e-07, + "loss": 0.2945, + "step": 1750 + }, + { + "epoch": 0.8825604838709677, + "grad_norm": 0.17361486955355637, + "learning_rate": 8.159559122848494e-07, + "loss": 0.2845, + "step": 1751 + }, + { + "epoch": 0.8830645161290323, + "grad_norm": 0.1844805830827498, + "learning_rate": 8.157477233723969e-07, + "loss": 0.298, + "step": 1752 + }, + { + "epoch": 0.8835685483870968, + "grad_norm": 0.17263438233102688, + "learning_rate": 8.15539443368682e-07, + "loss": 0.2929, + "step": 1753 + }, + { + "epoch": 0.8840725806451613, + "grad_norm": 0.18475595233458791, + "learning_rate": 8.153310723337923e-07, + "loss": 0.2844, + "step": 1754 + }, + { + "epoch": 0.8845766129032258, + "grad_norm": 0.1730273189248541, + "learning_rate": 8.151226103278417e-07, + "loss": 0.284, + "step": 1755 + }, + { + "epoch": 0.8850806451612904, + "grad_norm": 0.18104372418355474, + "learning_rate": 8.149140574109701e-07, + "loss": 0.2897, + "step": 1756 + }, + { + "epoch": 0.8855846774193549, + "grad_norm": 0.17839123584579997, + "learning_rate": 8.147054136433437e-07, + "loss": 0.2908, + "step": 1757 + }, + { + "epoch": 0.8860887096774194, + "grad_norm": 0.178362220369913, + "learning_rate": 8.144966790851551e-07, + "loss": 0.2724, + "step": 1758 + }, + { + "epoch": 0.8865927419354839, + "grad_norm": 0.18349648424878212, + "learning_rate": 8.142878537966225e-07, + "loss": 0.2813, + "step": 1759 + }, + { + "epoch": 0.8870967741935484, + "grad_norm": 0.1852606758136101, + "learning_rate": 8.140789378379911e-07, + "loss": 0.2705, + "step": 1760 + }, + { + "epoch": 0.8876008064516129, + "grad_norm": 0.19163597573225657, + "learning_rate": 8.138699312695318e-07, + "loss": 0.2763, + "step": 1761 + }, + { + "epoch": 0.8881048387096774, + "grad_norm": 0.16967793891115396, + "learning_rate": 8.136608341515417e-07, + "loss": 0.2878, + "step": 1762 + }, + { + "epoch": 0.8886088709677419, + "grad_norm": 0.17409231137199876, + "learning_rate": 8.13451646544344e-07, + "loss": 0.2824, + "step": 1763 + }, + { + "epoch": 0.8891129032258065, + "grad_norm": 0.1718892011569422, + "learning_rate": 8.132423685082879e-07, + "loss": 0.2819, + "step": 1764 + }, + { + "epoch": 0.889616935483871, + "grad_norm": 0.1768902921291496, + "learning_rate": 8.130330001037489e-07, + "loss": 0.2866, + "step": 1765 + }, + { + "epoch": 0.8901209677419355, + "grad_norm": 0.17727162868649207, + "learning_rate": 8.128235413911286e-07, + "loss": 0.2904, + "step": 1766 + }, + { + "epoch": 0.890625, + "grad_norm": 0.18000850290405332, + "learning_rate": 8.126139924308544e-07, + "loss": 0.2821, + "step": 1767 + }, + { + "epoch": 0.8911290322580645, + "grad_norm": 0.1950805957693296, + "learning_rate": 8.124043532833799e-07, + "loss": 0.2724, + "step": 1768 + }, + { + "epoch": 0.891633064516129, + "grad_norm": 0.16953136358361712, + "learning_rate": 8.121946240091847e-07, + "loss": 0.2897, + "step": 1769 + }, + { + "epoch": 0.8921370967741935, + "grad_norm": 0.17382780411451854, + "learning_rate": 8.119848046687745e-07, + "loss": 0.2743, + "step": 1770 + }, + { + "epoch": 0.8926411290322581, + "grad_norm": 0.17618015127985004, + "learning_rate": 8.117748953226807e-07, + "loss": 0.293, + "step": 1771 + }, + { + "epoch": 0.8931451612903226, + "grad_norm": 0.19101986869600812, + "learning_rate": 8.115648960314609e-07, + "loss": 0.282, + "step": 1772 + }, + { + "epoch": 0.8936491935483871, + "grad_norm": 0.18335641858591029, + "learning_rate": 8.113548068556989e-07, + "loss": 0.2782, + "step": 1773 + }, + { + "epoch": 0.8941532258064516, + "grad_norm": 0.17305922736379498, + "learning_rate": 8.111446278560037e-07, + "loss": 0.2819, + "step": 1774 + }, + { + "epoch": 0.8946572580645161, + "grad_norm": 0.17289190092892376, + "learning_rate": 8.109343590930107e-07, + "loss": 0.2722, + "step": 1775 + }, + { + "epoch": 0.8951612903225806, + "grad_norm": 0.20419317136274495, + "learning_rate": 8.107240006273815e-07, + "loss": 0.2984, + "step": 1776 + }, + { + "epoch": 0.8956653225806451, + "grad_norm": 0.17574416151651231, + "learning_rate": 8.105135525198026e-07, + "loss": 0.2785, + "step": 1777 + }, + { + "epoch": 0.8961693548387096, + "grad_norm": 0.18197635365780418, + "learning_rate": 8.103030148309876e-07, + "loss": 0.2671, + "step": 1778 + }, + { + "epoch": 0.8966733870967742, + "grad_norm": 0.182868231288254, + "learning_rate": 8.10092387621675e-07, + "loss": 0.2825, + "step": 1779 + }, + { + "epoch": 0.8971774193548387, + "grad_norm": 0.1753018528864952, + "learning_rate": 8.098816709526293e-07, + "loss": 0.2742, + "step": 1780 + }, + { + "epoch": 0.8976814516129032, + "grad_norm": 0.17996946202759295, + "learning_rate": 8.096708648846416e-07, + "loss": 0.2721, + "step": 1781 + }, + { + "epoch": 0.8981854838709677, + "grad_norm": 0.17250723947592084, + "learning_rate": 8.094599694785272e-07, + "loss": 0.2857, + "step": 1782 + }, + { + "epoch": 0.8986895161290323, + "grad_norm": 0.17857618093982208, + "learning_rate": 8.092489847951288e-07, + "loss": 0.289, + "step": 1783 + }, + { + "epoch": 0.8991935483870968, + "grad_norm": 0.180980362554131, + "learning_rate": 8.09037910895314e-07, + "loss": 0.2815, + "step": 1784 + }, + { + "epoch": 0.8996975806451613, + "grad_norm": 0.19044748994727145, + "learning_rate": 8.088267478399761e-07, + "loss": 0.2763, + "step": 1785 + }, + { + "epoch": 0.9002016129032258, + "grad_norm": 0.18502618756219494, + "learning_rate": 8.086154956900348e-07, + "loss": 0.2777, + "step": 1786 + }, + { + "epoch": 0.9007056451612904, + "grad_norm": 0.17795283297213882, + "learning_rate": 8.084041545064347e-07, + "loss": 0.277, + "step": 1787 + }, + { + "epoch": 0.9012096774193549, + "grad_norm": 0.18095952785719716, + "learning_rate": 8.081927243501465e-07, + "loss": 0.2849, + "step": 1788 + }, + { + "epoch": 0.9017137096774194, + "grad_norm": 0.20352287553838194, + "learning_rate": 8.079812052821665e-07, + "loss": 0.268, + "step": 1789 + }, + { + "epoch": 0.9022177419354839, + "grad_norm": 0.1817677931102643, + "learning_rate": 8.077695973635165e-07, + "loss": 0.2821, + "step": 1790 + }, + { + "epoch": 0.9027217741935484, + "grad_norm": 0.20806765911305017, + "learning_rate": 8.075579006552442e-07, + "loss": 0.2802, + "step": 1791 + }, + { + "epoch": 0.9027217741935484, + "eval_loss": 0.31333088874816895, + "eval_runtime": 17.8076, + "eval_samples_per_second": 48.013, + "eval_steps_per_second": 1.011, + "step": 1791 + }, + { + "epoch": 0.9032258064516129, + "grad_norm": 0.18964086276352513, + "learning_rate": 8.073461152184229e-07, + "loss": 0.2955, + "step": 1792 + }, + { + "epoch": 0.9037298387096774, + "grad_norm": 0.20100101509553, + "learning_rate": 8.071342411141511e-07, + "loss": 0.2833, + "step": 1793 + }, + { + "epoch": 0.9042338709677419, + "grad_norm": 0.16556895445429864, + "learning_rate": 8.069222784035536e-07, + "loss": 0.2572, + "step": 1794 + }, + { + "epoch": 0.9047379032258065, + "grad_norm": 0.18585984965180008, + "learning_rate": 8.067102271477798e-07, + "loss": 0.292, + "step": 1795 + }, + { + "epoch": 0.905241935483871, + "grad_norm": 0.1714046882678454, + "learning_rate": 8.064980874080056e-07, + "loss": 0.2783, + "step": 1796 + }, + { + "epoch": 0.9057459677419355, + "grad_norm": 0.1845849829174842, + "learning_rate": 8.062858592454318e-07, + "loss": 0.2719, + "step": 1797 + }, + { + "epoch": 0.90625, + "grad_norm": 0.18224225892643142, + "learning_rate": 8.060735427212848e-07, + "loss": 0.2812, + "step": 1798 + }, + { + "epoch": 0.9067540322580645, + "grad_norm": 0.1694582941399971, + "learning_rate": 8.05861137896817e-07, + "loss": 0.269, + "step": 1799 + }, + { + "epoch": 0.907258064516129, + "grad_norm": 0.1866565288452891, + "learning_rate": 8.056486448333053e-07, + "loss": 0.2867, + "step": 1800 + }, + { + "epoch": 0.9077620967741935, + "grad_norm": 0.18228705266437853, + "learning_rate": 8.054360635920532e-07, + "loss": 0.2919, + "step": 1801 + }, + { + "epoch": 0.9082661290322581, + "grad_norm": 0.17847533993145076, + "learning_rate": 8.052233942343889e-07, + "loss": 0.2864, + "step": 1802 + }, + { + "epoch": 0.9087701612903226, + "grad_norm": 0.16890284755735133, + "learning_rate": 8.050106368216661e-07, + "loss": 0.2748, + "step": 1803 + }, + { + "epoch": 0.9092741935483871, + "grad_norm": 0.17414946004467158, + "learning_rate": 8.047977914152639e-07, + "loss": 0.2886, + "step": 1804 + }, + { + "epoch": 0.9097782258064516, + "grad_norm": 0.17442792515844072, + "learning_rate": 8.045848580765869e-07, + "loss": 0.2926, + "step": 1805 + }, + { + "epoch": 0.9102822580645161, + "grad_norm": 0.17085485937836853, + "learning_rate": 8.043718368670654e-07, + "loss": 0.2847, + "step": 1806 + }, + { + "epoch": 0.9107862903225806, + "grad_norm": 0.17339005862232762, + "learning_rate": 8.041587278481541e-07, + "loss": 0.2729, + "step": 1807 + }, + { + "epoch": 0.9112903225806451, + "grad_norm": 0.17518880582664062, + "learning_rate": 8.039455310813343e-07, + "loss": 0.3047, + "step": 1808 + }, + { + "epoch": 0.9117943548387096, + "grad_norm": 0.17160540781556277, + "learning_rate": 8.037322466281116e-07, + "loss": 0.277, + "step": 1809 + }, + { + "epoch": 0.9122983870967742, + "grad_norm": 0.18984801005727484, + "learning_rate": 8.035188745500171e-07, + "loss": 0.2975, + "step": 1810 + }, + { + "epoch": 0.9128024193548387, + "grad_norm": 0.18036724606771928, + "learning_rate": 8.033054149086076e-07, + "loss": 0.2738, + "step": 1811 + }, + { + "epoch": 0.9133064516129032, + "grad_norm": 0.1694811997928256, + "learning_rate": 8.030918677654648e-07, + "loss": 0.2802, + "step": 1812 + }, + { + "epoch": 0.9138104838709677, + "grad_norm": 1.122599384666042, + "learning_rate": 8.028782331821956e-07, + "loss": 0.2729, + "step": 1813 + }, + { + "epoch": 0.9143145161290323, + "grad_norm": 0.19587893697493347, + "learning_rate": 8.026645112204325e-07, + "loss": 0.2844, + "step": 1814 + }, + { + "epoch": 0.9148185483870968, + "grad_norm": 0.18843188442999503, + "learning_rate": 8.024507019418327e-07, + "loss": 0.2622, + "step": 1815 + }, + { + "epoch": 0.9153225806451613, + "grad_norm": 0.2412193144484893, + "learning_rate": 8.022368054080789e-07, + "loss": 0.2864, + "step": 1816 + }, + { + "epoch": 0.9158266129032258, + "grad_norm": 0.2982039518983422, + "learning_rate": 8.020228216808792e-07, + "loss": 0.2827, + "step": 1817 + }, + { + "epoch": 0.9163306451612904, + "grad_norm": 0.24012806458676572, + "learning_rate": 8.018087508219664e-07, + "loss": 0.2749, + "step": 1818 + }, + { + "epoch": 0.9168346774193549, + "grad_norm": 0.20105639329407363, + "learning_rate": 8.015945928930985e-07, + "loss": 0.2751, + "step": 1819 + }, + { + "epoch": 0.9173387096774194, + "grad_norm": 0.17599257831809287, + "learning_rate": 8.013803479560588e-07, + "loss": 0.2726, + "step": 1820 + }, + { + "epoch": 0.9178427419354839, + "grad_norm": 0.17545331694711577, + "learning_rate": 8.011660160726556e-07, + "loss": 0.2783, + "step": 1821 + }, + { + "epoch": 0.9183467741935484, + "grad_norm": 0.1782601795128222, + "learning_rate": 8.009515973047225e-07, + "loss": 0.2721, + "step": 1822 + }, + { + "epoch": 0.9188508064516129, + "grad_norm": 0.1927011349013866, + "learning_rate": 8.007370917141177e-07, + "loss": 0.2762, + "step": 1823 + }, + { + "epoch": 0.9193548387096774, + "grad_norm": 0.1920937971165849, + "learning_rate": 8.005224993627251e-07, + "loss": 0.2704, + "step": 1824 + }, + { + "epoch": 0.9198588709677419, + "grad_norm": 0.1793904274021918, + "learning_rate": 8.003078203124532e-07, + "loss": 0.2906, + "step": 1825 + }, + { + "epoch": 0.9203629032258065, + "grad_norm": 0.1816291362076379, + "learning_rate": 8.000930546252351e-07, + "loss": 0.2691, + "step": 1826 + }, + { + "epoch": 0.920866935483871, + "grad_norm": 0.1783523495764457, + "learning_rate": 7.998782023630299e-07, + "loss": 0.2823, + "step": 1827 + }, + { + "epoch": 0.9213709677419355, + "grad_norm": 0.20855882950783844, + "learning_rate": 7.996632635878209e-07, + "loss": 0.2804, + "step": 1828 + }, + { + "epoch": 0.921875, + "grad_norm": 0.17492678074423923, + "learning_rate": 7.994482383616168e-07, + "loss": 0.2832, + "step": 1829 + }, + { + "epoch": 0.9223790322580645, + "grad_norm": 0.17234593489453404, + "learning_rate": 7.992331267464509e-07, + "loss": 0.2768, + "step": 1830 + }, + { + "epoch": 0.922883064516129, + "grad_norm": 0.17200804609809464, + "learning_rate": 7.990179288043815e-07, + "loss": 0.2677, + "step": 1831 + }, + { + "epoch": 0.9233870967741935, + "grad_norm": 0.17572805845486925, + "learning_rate": 7.98802644597492e-07, + "loss": 0.2821, + "step": 1832 + }, + { + "epoch": 0.9238911290322581, + "grad_norm": 0.20644587156717698, + "learning_rate": 7.985872741878905e-07, + "loss": 0.2732, + "step": 1833 + }, + { + "epoch": 0.9243951612903226, + "grad_norm": 0.17874642664132656, + "learning_rate": 7.983718176377101e-07, + "loss": 0.269, + "step": 1834 + }, + { + "epoch": 0.9248991935483871, + "grad_norm": 0.17456990653485246, + "learning_rate": 7.981562750091085e-07, + "loss": 0.2745, + "step": 1835 + }, + { + "epoch": 0.9254032258064516, + "grad_norm": 0.16939889686969853, + "learning_rate": 7.979406463642686e-07, + "loss": 0.2916, + "step": 1836 + }, + { + "epoch": 0.9259072580645161, + "grad_norm": 0.17330331884417968, + "learning_rate": 7.977249317653979e-07, + "loss": 0.2897, + "step": 1837 + }, + { + "epoch": 0.9264112903225806, + "grad_norm": 0.17625952471758344, + "learning_rate": 7.975091312747286e-07, + "loss": 0.28, + "step": 1838 + }, + { + "epoch": 0.9269153225806451, + "grad_norm": 0.18477999685889357, + "learning_rate": 7.97293244954518e-07, + "loss": 0.3018, + "step": 1839 + }, + { + "epoch": 0.9274193548387096, + "grad_norm": 0.17443677971673763, + "learning_rate": 7.970772728670479e-07, + "loss": 0.2701, + "step": 1840 + }, + { + "epoch": 0.9279233870967742, + "grad_norm": 0.17476215415812357, + "learning_rate": 7.968612150746247e-07, + "loss": 0.2917, + "step": 1841 + }, + { + "epoch": 0.9284274193548387, + "grad_norm": 0.172486146139852, + "learning_rate": 7.966450716395801e-07, + "loss": 0.2802, + "step": 1842 + }, + { + "epoch": 0.9289314516129032, + "grad_norm": 0.17784609076169455, + "learning_rate": 7.9642884262427e-07, + "loss": 0.2795, + "step": 1843 + }, + { + "epoch": 0.9294354838709677, + "grad_norm": 0.1837707194984302, + "learning_rate": 7.96212528091075e-07, + "loss": 0.2899, + "step": 1844 + }, + { + "epoch": 0.9299395161290323, + "grad_norm": 0.18094587392424352, + "learning_rate": 7.959961281024004e-07, + "loss": 0.2939, + "step": 1845 + }, + { + "epoch": 0.9304435483870968, + "grad_norm": 0.18248542798816517, + "learning_rate": 7.957796427206766e-07, + "loss": 0.2926, + "step": 1846 + }, + { + "epoch": 0.9309475806451613, + "grad_norm": 0.17614486195538318, + "learning_rate": 7.955630720083581e-07, + "loss": 0.2912, + "step": 1847 + }, + { + "epoch": 0.9314516129032258, + "grad_norm": 0.1726471918494145, + "learning_rate": 7.953464160279244e-07, + "loss": 0.2891, + "step": 1848 + }, + { + "epoch": 0.9319556451612904, + "grad_norm": 0.1774055068005992, + "learning_rate": 7.951296748418789e-07, + "loss": 0.3031, + "step": 1849 + }, + { + "epoch": 0.9324596774193549, + "grad_norm": 0.18664971286954127, + "learning_rate": 7.949128485127508e-07, + "loss": 0.2798, + "step": 1850 + }, + { + "epoch": 0.9329637096774194, + "grad_norm": 0.21309616857185437, + "learning_rate": 7.946959371030926e-07, + "loss": 0.2852, + "step": 1851 + }, + { + "epoch": 0.9334677419354839, + "grad_norm": 0.17453472429034075, + "learning_rate": 7.944789406754821e-07, + "loss": 0.2796, + "step": 1852 + }, + { + "epoch": 0.9339717741935484, + "grad_norm": 0.17686343311528496, + "learning_rate": 7.942618592925214e-07, + "loss": 0.2821, + "step": 1853 + }, + { + "epoch": 0.9344758064516129, + "grad_norm": 0.2050361536049733, + "learning_rate": 7.940446930168372e-07, + "loss": 0.2855, + "step": 1854 + }, + { + "epoch": 0.9349798387096774, + "grad_norm": 0.19293005978824818, + "learning_rate": 7.938274419110806e-07, + "loss": 0.2853, + "step": 1855 + }, + { + "epoch": 0.9354838709677419, + "grad_norm": 0.18447151377144042, + "learning_rate": 7.936101060379272e-07, + "loss": 0.2909, + "step": 1856 + }, + { + "epoch": 0.9359879032258065, + "grad_norm": 0.17295084942405378, + "learning_rate": 7.93392685460077e-07, + "loss": 0.2828, + "step": 1857 + }, + { + "epoch": 0.936491935483871, + "grad_norm": 0.1695374241026228, + "learning_rate": 7.931751802402544e-07, + "loss": 0.2799, + "step": 1858 + }, + { + "epoch": 0.9369959677419355, + "grad_norm": 0.17220089424019935, + "learning_rate": 7.929575904412086e-07, + "loss": 0.2783, + "step": 1859 + }, + { + "epoch": 0.9375, + "grad_norm": 0.17583666760835148, + "learning_rate": 7.927399161257127e-07, + "loss": 0.268, + "step": 1860 + }, + { + "epoch": 0.9380040322580645, + "grad_norm": 0.181882166877518, + "learning_rate": 7.925221573565644e-07, + "loss": 0.2931, + "step": 1861 + }, + { + "epoch": 0.938508064516129, + "grad_norm": 0.17175675035279075, + "learning_rate": 7.923043141965857e-07, + "loss": 0.2763, + "step": 1862 + }, + { + "epoch": 0.9390120967741935, + "grad_norm": 0.16867999230536798, + "learning_rate": 7.920863867086232e-07, + "loss": 0.2764, + "step": 1863 + }, + { + "epoch": 0.9395161290322581, + "grad_norm": 0.18179217734847186, + "learning_rate": 7.918683749555473e-07, + "loss": 0.2865, + "step": 1864 + }, + { + "epoch": 0.9400201612903226, + "grad_norm": 0.17793466089999202, + "learning_rate": 7.916502790002535e-07, + "loss": 0.2933, + "step": 1865 + }, + { + "epoch": 0.9405241935483871, + "grad_norm": 0.1687244667253048, + "learning_rate": 7.914320989056608e-07, + "loss": 0.2646, + "step": 1866 + }, + { + "epoch": 0.9410282258064516, + "grad_norm": 0.16924880345437254, + "learning_rate": 7.912138347347128e-07, + "loss": 0.2747, + "step": 1867 + }, + { + "epoch": 0.9415322580645161, + "grad_norm": 0.1998764610080855, + "learning_rate": 7.909954865503776e-07, + "loss": 0.2848, + "step": 1868 + }, + { + "epoch": 0.9420362903225806, + "grad_norm": 0.17645688052721015, + "learning_rate": 7.907770544156471e-07, + "loss": 0.2744, + "step": 1869 + }, + { + "epoch": 0.9425403225806451, + "grad_norm": 0.17444186260064237, + "learning_rate": 7.905585383935377e-07, + "loss": 0.2861, + "step": 1870 + }, + { + "epoch": 0.9430443548387096, + "grad_norm": 0.18655327778687014, + "learning_rate": 7.903399385470898e-07, + "loss": 0.2723, + "step": 1871 + }, + { + "epoch": 0.9435483870967742, + "grad_norm": 0.17432236510965454, + "learning_rate": 7.901212549393682e-07, + "loss": 0.2782, + "step": 1872 + }, + { + "epoch": 0.9440524193548387, + "grad_norm": 0.1866368609884871, + "learning_rate": 7.899024876334619e-07, + "loss": 0.27, + "step": 1873 + }, + { + "epoch": 0.9445564516129032, + "grad_norm": 0.1867440259653203, + "learning_rate": 7.896836366924836e-07, + "loss": 0.2861, + "step": 1874 + }, + { + "epoch": 0.9450604838709677, + "grad_norm": 0.17504198342034763, + "learning_rate": 7.894647021795707e-07, + "loss": 0.288, + "step": 1875 + }, + { + "epoch": 0.9455645161290323, + "grad_norm": 0.17474732068264098, + "learning_rate": 7.892456841578843e-07, + "loss": 0.2839, + "step": 1876 + }, + { + "epoch": 0.9460685483870968, + "grad_norm": 0.17433782582765533, + "learning_rate": 7.890265826906097e-07, + "loss": 0.2695, + "step": 1877 + }, + { + "epoch": 0.9465725806451613, + "grad_norm": 0.1732874776440476, + "learning_rate": 7.888073978409568e-07, + "loss": 0.2894, + "step": 1878 + }, + { + "epoch": 0.9470766129032258, + "grad_norm": 0.1829899116719353, + "learning_rate": 7.885881296721584e-07, + "loss": 0.2943, + "step": 1879 + }, + { + "epoch": 0.9475806451612904, + "grad_norm": 0.17001971535664087, + "learning_rate": 7.883687782474723e-07, + "loss": 0.2856, + "step": 1880 + }, + { + "epoch": 0.9480846774193549, + "grad_norm": 0.1780345466436045, + "learning_rate": 7.8814934363018e-07, + "loss": 0.2746, + "step": 1881 + }, + { + "epoch": 0.9485887096774194, + "grad_norm": 0.1811602910506968, + "learning_rate": 7.879298258835872e-07, + "loss": 0.2776, + "step": 1882 + }, + { + "epoch": 0.9490927419354839, + "grad_norm": 0.18771497800701312, + "learning_rate": 7.877102250710231e-07, + "loss": 0.2829, + "step": 1883 + }, + { + "epoch": 0.9495967741935484, + "grad_norm": 0.18472560622513845, + "learning_rate": 7.874905412558415e-07, + "loss": 0.2707, + "step": 1884 + }, + { + "epoch": 0.9501008064516129, + "grad_norm": 0.1795065532871576, + "learning_rate": 7.872707745014195e-07, + "loss": 0.2812, + "step": 1885 + }, + { + "epoch": 0.9506048387096774, + "grad_norm": 0.19450850100162909, + "learning_rate": 7.870509248711588e-07, + "loss": 0.262, + "step": 1886 + }, + { + "epoch": 0.9511088709677419, + "grad_norm": 0.17189805933616106, + "learning_rate": 7.868309924284842e-07, + "loss": 0.2681, + "step": 1887 + }, + { + "epoch": 0.9516129032258065, + "grad_norm": 0.17073814905474735, + "learning_rate": 7.866109772368453e-07, + "loss": 0.2774, + "step": 1888 + }, + { + "epoch": 0.952116935483871, + "grad_norm": 0.17066105523996877, + "learning_rate": 7.863908793597149e-07, + "loss": 0.2883, + "step": 1889 + }, + { + "epoch": 0.9526209677419355, + "grad_norm": 0.16966954884907165, + "learning_rate": 7.861706988605898e-07, + "loss": 0.2832, + "step": 1890 + }, + { + "epoch": 0.953125, + "grad_norm": 0.17731653359539812, + "learning_rate": 7.859504358029909e-07, + "loss": 0.2856, + "step": 1891 + }, + { + "epoch": 0.9536290322580645, + "grad_norm": 0.18810146382045853, + "learning_rate": 7.857300902504628e-07, + "loss": 0.2853, + "step": 1892 + }, + { + "epoch": 0.954133064516129, + "grad_norm": 0.18445022791672494, + "learning_rate": 7.855096622665735e-07, + "loss": 0.2867, + "step": 1893 + }, + { + "epoch": 0.9546370967741935, + "grad_norm": 0.1728990132935549, + "learning_rate": 7.852891519149152e-07, + "loss": 0.2874, + "step": 1894 + }, + { + "epoch": 0.9551411290322581, + "grad_norm": 0.1761006592616753, + "learning_rate": 7.85068559259104e-07, + "loss": 0.2844, + "step": 1895 + }, + { + "epoch": 0.9556451612903226, + "grad_norm": 0.183998556068685, + "learning_rate": 7.848478843627792e-07, + "loss": 0.2705, + "step": 1896 + }, + { + "epoch": 0.9561491935483871, + "grad_norm": 0.17318304016376618, + "learning_rate": 7.846271272896044e-07, + "loss": 0.2778, + "step": 1897 + }, + { + "epoch": 0.9566532258064516, + "grad_norm": 0.1824137043313419, + "learning_rate": 7.844062881032664e-07, + "loss": 0.2872, + "step": 1898 + }, + { + "epoch": 0.9571572580645161, + "grad_norm": 0.17344844058434108, + "learning_rate": 7.841853668674763e-07, + "loss": 0.2787, + "step": 1899 + }, + { + "epoch": 0.9576612903225806, + "grad_norm": 0.1758545032107461, + "learning_rate": 7.839643636459683e-07, + "loss": 0.2857, + "step": 1900 + }, + { + "epoch": 0.9581653225806451, + "grad_norm": 0.1729782764700269, + "learning_rate": 7.837432785025004e-07, + "loss": 0.2765, + "step": 1901 + }, + { + "epoch": 0.9586693548387096, + "grad_norm": 0.17898116556336083, + "learning_rate": 7.835221115008542e-07, + "loss": 0.2877, + "step": 1902 + }, + { + "epoch": 0.9591733870967742, + "grad_norm": 0.1895613129053808, + "learning_rate": 7.833008627048352e-07, + "loss": 0.2754, + "step": 1903 + }, + { + "epoch": 0.9596774193548387, + "grad_norm": 0.20979437224680517, + "learning_rate": 7.830795321782724e-07, + "loss": 0.2773, + "step": 1904 + }, + { + "epoch": 0.9601814516129032, + "grad_norm": 0.17134279973430921, + "learning_rate": 7.828581199850182e-07, + "loss": 0.2785, + "step": 1905 + }, + { + "epoch": 0.9606854838709677, + "grad_norm": 0.1703859949701216, + "learning_rate": 7.826366261889483e-07, + "loss": 0.2755, + "step": 1906 + }, + { + "epoch": 0.9611895161290323, + "grad_norm": 0.1708391359023031, + "learning_rate": 7.824150508539628e-07, + "loss": 0.2784, + "step": 1907 + }, + { + "epoch": 0.9616935483870968, + "grad_norm": 0.17468338877763465, + "learning_rate": 7.821933940439847e-07, + "loss": 0.2584, + "step": 1908 + }, + { + "epoch": 0.9621975806451613, + "grad_norm": 0.17669530382160473, + "learning_rate": 7.819716558229604e-07, + "loss": 0.2594, + "step": 1909 + }, + { + "epoch": 0.9627016129032258, + "grad_norm": 0.18182770484400235, + "learning_rate": 7.8174983625486e-07, + "loss": 0.2929, + "step": 1910 + }, + { + "epoch": 0.9632056451612904, + "grad_norm": 0.1718420918323365, + "learning_rate": 7.815279354036772e-07, + "loss": 0.2741, + "step": 1911 + }, + { + "epoch": 0.9637096774193549, + "grad_norm": 0.18047954008297915, + "learning_rate": 7.813059533334292e-07, + "loss": 0.282, + "step": 1912 + }, + { + "epoch": 0.9642137096774194, + "grad_norm": 0.1708373327019462, + "learning_rate": 7.810838901081561e-07, + "loss": 0.2671, + "step": 1913 + }, + { + "epoch": 0.9647177419354839, + "grad_norm": 0.17530947113755785, + "learning_rate": 7.80861745791922e-07, + "loss": 0.2863, + "step": 1914 + }, + { + "epoch": 0.9652217741935484, + "grad_norm": 0.1856980391062388, + "learning_rate": 7.80639520448814e-07, + "loss": 0.2953, + "step": 1915 + }, + { + "epoch": 0.9657258064516129, + "grad_norm": 0.17104956670285965, + "learning_rate": 7.80417214142943e-07, + "loss": 0.2929, + "step": 1916 + }, + { + "epoch": 0.9662298387096774, + "grad_norm": 0.17968017354027993, + "learning_rate": 7.801948269384427e-07, + "loss": 0.2805, + "step": 1917 + }, + { + "epoch": 0.9667338709677419, + "grad_norm": 0.16885090448355405, + "learning_rate": 7.799723588994706e-07, + "loss": 0.275, + "step": 1918 + }, + { + "epoch": 0.9672379032258065, + "grad_norm": 0.16977668143159869, + "learning_rate": 7.797498100902071e-07, + "loss": 0.2955, + "step": 1919 + }, + { + "epoch": 0.967741935483871, + "grad_norm": 0.17543252330569692, + "learning_rate": 7.795271805748565e-07, + "loss": 0.285, + "step": 1920 + }, + { + "epoch": 0.9682459677419355, + "grad_norm": 0.1779214329877026, + "learning_rate": 7.793044704176459e-07, + "loss": 0.2751, + "step": 1921 + }, + { + "epoch": 0.96875, + "grad_norm": 0.1727984897805244, + "learning_rate": 7.790816796828259e-07, + "loss": 0.2917, + "step": 1922 + }, + { + "epoch": 0.9692540322580645, + "grad_norm": 0.19093500891641543, + "learning_rate": 7.788588084346699e-07, + "loss": 0.2912, + "step": 1923 + }, + { + "epoch": 0.969758064516129, + "grad_norm": 0.1877331833500236, + "learning_rate": 7.786358567374752e-07, + "loss": 0.2828, + "step": 1924 + }, + { + "epoch": 0.9702620967741935, + "grad_norm": 0.1964520978166042, + "learning_rate": 7.784128246555619e-07, + "loss": 0.2878, + "step": 1925 + }, + { + "epoch": 0.9707661290322581, + "grad_norm": 0.1793857792176289, + "learning_rate": 7.781897122532732e-07, + "loss": 0.3009, + "step": 1926 + }, + { + "epoch": 0.9712701612903226, + "grad_norm": 0.1778515329690811, + "learning_rate": 7.779665195949761e-07, + "loss": 0.2686, + "step": 1927 + }, + { + "epoch": 0.9717741935483871, + "grad_norm": 0.18899510212928142, + "learning_rate": 7.777432467450598e-07, + "loss": 0.2944, + "step": 1928 + }, + { + "epoch": 0.9722782258064516, + "grad_norm": 0.18103789111426294, + "learning_rate": 7.775198937679375e-07, + "loss": 0.2853, + "step": 1929 + }, + { + "epoch": 0.9727822580645161, + "grad_norm": 0.17187773087886613, + "learning_rate": 7.772964607280448e-07, + "loss": 0.2853, + "step": 1930 + }, + { + "epoch": 0.9732862903225806, + "grad_norm": 0.17830119933021793, + "learning_rate": 7.77072947689841e-07, + "loss": 0.2883, + "step": 1931 + }, + { + "epoch": 0.9737903225806451, + "grad_norm": 0.18468034488609253, + "learning_rate": 7.768493547178083e-07, + "loss": 0.2783, + "step": 1932 + }, + { + "epoch": 0.9742943548387096, + "grad_norm": 0.1649149359989447, + "learning_rate": 7.766256818764517e-07, + "loss": 0.2712, + "step": 1933 + }, + { + "epoch": 0.9747983870967742, + "grad_norm": 0.18006035200393466, + "learning_rate": 7.764019292302994e-07, + "loss": 0.2873, + "step": 1934 + }, + { + "epoch": 0.9753024193548387, + "grad_norm": 0.18387324362971386, + "learning_rate": 7.761780968439027e-07, + "loss": 0.2825, + "step": 1935 + }, + { + "epoch": 0.9758064516129032, + "grad_norm": 0.1756603042845968, + "learning_rate": 7.759541847818361e-07, + "loss": 0.2633, + "step": 1936 + }, + { + "epoch": 0.9763104838709677, + "grad_norm": 0.17483974536625063, + "learning_rate": 7.757301931086963e-07, + "loss": 0.2982, + "step": 1937 + }, + { + "epoch": 0.9768145161290323, + "grad_norm": 0.18502806181928816, + "learning_rate": 7.755061218891041e-07, + "loss": 0.2836, + "step": 1938 + }, + { + "epoch": 0.9773185483870968, + "grad_norm": 0.17424427520526933, + "learning_rate": 7.752819711877024e-07, + "loss": 0.2909, + "step": 1939 + }, + { + "epoch": 0.9778225806451613, + "grad_norm": 0.17479328408301795, + "learning_rate": 7.750577410691572e-07, + "loss": 0.2892, + "step": 1940 + }, + { + "epoch": 0.9783266129032258, + "grad_norm": 0.18236213050678932, + "learning_rate": 7.748334315981577e-07, + "loss": 0.2904, + "step": 1941 + }, + { + "epoch": 0.9788306451612904, + "grad_norm": 0.17005102876538494, + "learning_rate": 7.746090428394156e-07, + "loss": 0.2784, + "step": 1942 + }, + { + "epoch": 0.9793346774193549, + "grad_norm": 0.17329730021771092, + "learning_rate": 7.743845748576659e-07, + "loss": 0.2836, + "step": 1943 + }, + { + "epoch": 0.9798387096774194, + "grad_norm": 0.1714850364043798, + "learning_rate": 7.741600277176659e-07, + "loss": 0.2857, + "step": 1944 + }, + { + "epoch": 0.9803427419354839, + "grad_norm": 0.19242374904544585, + "learning_rate": 7.739354014841963e-07, + "loss": 0.286, + "step": 1945 + }, + { + "epoch": 0.9808467741935484, + "grad_norm": 0.17505647013555173, + "learning_rate": 7.737106962220603e-07, + "loss": 0.2961, + "step": 1946 + }, + { + "epoch": 0.9813508064516129, + "grad_norm": 0.17558706687337872, + "learning_rate": 7.734859119960841e-07, + "loss": 0.2735, + "step": 1947 + }, + { + "epoch": 0.9818548387096774, + "grad_norm": 0.17903519468817597, + "learning_rate": 7.732610488711162e-07, + "loss": 0.2901, + "step": 1948 + }, + { + "epoch": 0.9823588709677419, + "grad_norm": 0.18371964890476014, + "learning_rate": 7.730361069120286e-07, + "loss": 0.2985, + "step": 1949 + }, + { + "epoch": 0.9828629032258065, + "grad_norm": 0.1723762496803238, + "learning_rate": 7.728110861837156e-07, + "loss": 0.2837, + "step": 1950 + }, + { + "epoch": 0.983366935483871, + "grad_norm": 0.1790011706688029, + "learning_rate": 7.725859867510942e-07, + "loss": 0.2696, + "step": 1951 + }, + { + "epoch": 0.9838709677419355, + "grad_norm": 0.18850089072217402, + "learning_rate": 7.72360808679104e-07, + "loss": 0.2811, + "step": 1952 + }, + { + "epoch": 0.984375, + "grad_norm": 0.17379961506239888, + "learning_rate": 7.72135552032708e-07, + "loss": 0.2743, + "step": 1953 + }, + { + "epoch": 0.9848790322580645, + "grad_norm": 0.18355475523994572, + "learning_rate": 7.719102168768907e-07, + "loss": 0.2918, + "step": 1954 + }, + { + "epoch": 0.985383064516129, + "grad_norm": 0.18238534494389397, + "learning_rate": 7.716848032766605e-07, + "loss": 0.2972, + "step": 1955 + }, + { + "epoch": 0.9858870967741935, + "grad_norm": 0.1729414375179342, + "learning_rate": 7.714593112970473e-07, + "loss": 0.2761, + "step": 1956 + }, + { + "epoch": 0.9863911290322581, + "grad_norm": 0.1768407372110884, + "learning_rate": 7.712337410031046e-07, + "loss": 0.2663, + "step": 1957 + }, + { + "epoch": 0.9868951612903226, + "grad_norm": 0.1803839421907579, + "learning_rate": 7.710080924599077e-07, + "loss": 0.2913, + "step": 1958 + }, + { + "epoch": 0.9873991935483871, + "grad_norm": 0.1722830670020056, + "learning_rate": 7.707823657325549e-07, + "loss": 0.2899, + "step": 1959 + }, + { + "epoch": 0.9879032258064516, + "grad_norm": 0.17157007007009292, + "learning_rate": 7.705565608861673e-07, + "loss": 0.2832, + "step": 1960 + }, + { + "epoch": 0.9884072580645161, + "grad_norm": 0.17179339985585704, + "learning_rate": 7.703306779858875e-07, + "loss": 0.275, + "step": 1961 + }, + { + "epoch": 0.9889112903225806, + "grad_norm": 0.1750481355495183, + "learning_rate": 7.701047170968819e-07, + "loss": 0.2935, + "step": 1962 + }, + { + "epoch": 0.9894153225806451, + "grad_norm": 0.1703164887136099, + "learning_rate": 7.698786782843386e-07, + "loss": 0.2578, + "step": 1963 + }, + { + "epoch": 0.9899193548387096, + "grad_norm": 0.17963472103153666, + "learning_rate": 7.696525616134686e-07, + "loss": 0.2885, + "step": 1964 + }, + { + "epoch": 0.9904233870967742, + "grad_norm": 0.17443507400988176, + "learning_rate": 7.694263671495047e-07, + "loss": 0.2992, + "step": 1965 + }, + { + "epoch": 0.9909274193548387, + "grad_norm": 0.18131986198353908, + "learning_rate": 7.692000949577031e-07, + "loss": 0.2897, + "step": 1966 + }, + { + "epoch": 0.9914314516129032, + "grad_norm": 0.19050009588750086, + "learning_rate": 7.689737451033415e-07, + "loss": 0.2975, + "step": 1967 + }, + { + "epoch": 0.9919354838709677, + "grad_norm": 0.21986786250372692, + "learning_rate": 7.687473176517209e-07, + "loss": 0.2775, + "step": 1968 + }, + { + "epoch": 0.9924395161290323, + "grad_norm": 0.1712446456152378, + "learning_rate": 7.685208126681637e-07, + "loss": 0.2653, + "step": 1969 + }, + { + "epoch": 0.9929435483870968, + "grad_norm": 0.17416424744387057, + "learning_rate": 7.682942302180155e-07, + "loss": 0.3078, + "step": 1970 + }, + { + "epoch": 0.9934475806451613, + "grad_norm": 0.17288742218707365, + "learning_rate": 7.680675703666439e-07, + "loss": 0.278, + "step": 1971 + }, + { + "epoch": 0.9939516129032258, + "grad_norm": 0.1795604518756118, + "learning_rate": 7.67840833179439e-07, + "loss": 0.2849, + "step": 1972 + }, + { + "epoch": 0.9944556451612904, + "grad_norm": 0.1715422047295615, + "learning_rate": 7.676140187218128e-07, + "loss": 0.2665, + "step": 1973 + }, + { + "epoch": 0.9949596774193549, + "grad_norm": 0.17763829951279275, + "learning_rate": 7.673871270592e-07, + "loss": 0.2907, + "step": 1974 + }, + { + "epoch": 0.9954637096774194, + "grad_norm": 0.1748484464046483, + "learning_rate": 7.671601582570573e-07, + "loss": 0.2722, + "step": 1975 + }, + { + "epoch": 0.9959677419354839, + "grad_norm": 0.18807863525366347, + "learning_rate": 7.66933112380864e-07, + "loss": 0.2966, + "step": 1976 + }, + { + "epoch": 0.9964717741935484, + "grad_norm": 0.19625503483008158, + "learning_rate": 7.667059894961214e-07, + "loss": 0.2688, + "step": 1977 + }, + { + "epoch": 0.9969758064516129, + "grad_norm": 0.1714858168146041, + "learning_rate": 7.664787896683528e-07, + "loss": 0.2857, + "step": 1978 + }, + { + "epoch": 0.9974798387096774, + "grad_norm": 0.17049746359373963, + "learning_rate": 7.662515129631045e-07, + "loss": 0.2732, + "step": 1979 + }, + { + "epoch": 0.9979838709677419, + "grad_norm": 0.17253458678680952, + "learning_rate": 7.660241594459437e-07, + "loss": 0.2802, + "step": 1980 + }, + { + "epoch": 0.9984879032258065, + "grad_norm": 0.17152385183060628, + "learning_rate": 7.65796729182461e-07, + "loss": 0.2775, + "step": 1981 + }, + { + "epoch": 0.998991935483871, + "grad_norm": 0.17962880378962143, + "learning_rate": 7.655692222382683e-07, + "loss": 0.282, + "step": 1982 + }, + { + "epoch": 0.9994959677419355, + "grad_norm": 0.1815884076274797, + "learning_rate": 7.653416386790003e-07, + "loss": 0.2807, + "step": 1983 + }, + { + "epoch": 1.0, + "grad_norm": 0.17036014949141842, + "learning_rate": 7.651139785703131e-07, + "loss": 0.2756, + "step": 1984 + }, + { + "epoch": 1.0005040322580645, + "grad_norm": 0.1858739335123262, + "learning_rate": 7.648862419778854e-07, + "loss": 0.2754, + "step": 1985 + }, + { + "epoch": 1.001008064516129, + "grad_norm": 0.17713241635811486, + "learning_rate": 7.646584289674178e-07, + "loss": 0.2913, + "step": 1986 + }, + { + "epoch": 1.0015120967741935, + "grad_norm": 0.17224329918547132, + "learning_rate": 7.644305396046328e-07, + "loss": 0.2838, + "step": 1987 + }, + { + "epoch": 1.002016129032258, + "grad_norm": 0.16815918206812863, + "learning_rate": 7.642025739552753e-07, + "loss": 0.2738, + "step": 1988 + }, + { + "epoch": 1.0003780718336484, + "grad_norm": 0.17810881094257833, + "learning_rate": 7.639745320851118e-07, + "loss": 0.2829, + "step": 1989 + }, + { + "epoch": 1.0008821676118462, + "grad_norm": 0.17225692284996372, + "learning_rate": 7.637464140599312e-07, + "loss": 0.2648, + "step": 1990 + }, + { + "epoch": 1.0008821676118462, + "eval_loss": 0.31192973256111145, + "eval_runtime": 16.7125, + "eval_samples_per_second": 51.159, + "eval_steps_per_second": 1.077, + "step": 1990 + }, + { + "epoch": 1.001386263390044, + "grad_norm": 0.18131745361437934, + "learning_rate": 7.635182199455437e-07, + "loss": 0.2904, + "step": 1991 + }, + { + "epoch": 1.001890359168242, + "grad_norm": 0.1674300111635892, + "learning_rate": 7.632899498077824e-07, + "loss": 0.2781, + "step": 1992 + }, + { + "epoch": 1.0023944549464399, + "grad_norm": 0.16930495343661658, + "learning_rate": 7.630616037125015e-07, + "loss": 0.2716, + "step": 1993 + }, + { + "epoch": 1.0028985507246377, + "grad_norm": 0.17149835744787098, + "learning_rate": 7.628331817255775e-07, + "loss": 0.3012, + "step": 1994 + }, + { + "epoch": 1.0034026465028356, + "grad_norm": 0.17694237469391477, + "learning_rate": 7.626046839129087e-07, + "loss": 0.2826, + "step": 1995 + }, + { + "epoch": 1.0039067422810335, + "grad_norm": 0.18584553067561851, + "learning_rate": 7.623761103404154e-07, + "loss": 0.2834, + "step": 1996 + }, + { + "epoch": 1.0044108380592311, + "grad_norm": 0.17723779816231616, + "learning_rate": 7.621474610740396e-07, + "loss": 0.2794, + "step": 1997 + }, + { + "epoch": 1.004914933837429, + "grad_norm": 0.18721280669165485, + "learning_rate": 7.619187361797451e-07, + "loss": 0.2813, + "step": 1998 + }, + { + "epoch": 1.005419029615627, + "grad_norm": 0.1752600172196804, + "learning_rate": 7.616899357235178e-07, + "loss": 0.2636, + "step": 1999 + }, + { + "epoch": 1.0059231253938248, + "grad_norm": 0.1855969151006619, + "learning_rate": 7.614610597713651e-07, + "loss": 0.2923, + "step": 2000 + }, + { + "epoch": 1.0064272211720227, + "grad_norm": 0.16729057622113636, + "learning_rate": 7.612321083893163e-07, + "loss": 0.2892, + "step": 2001 + }, + { + "epoch": 1.0069313169502205, + "grad_norm": 0.1693767296113182, + "learning_rate": 7.610030816434224e-07, + "loss": 0.2801, + "step": 2002 + }, + { + "epoch": 1.0074354127284184, + "grad_norm": 0.17521037625410535, + "learning_rate": 7.607739795997563e-07, + "loss": 0.2725, + "step": 2003 + }, + { + "epoch": 1.0079395085066163, + "grad_norm": 0.17979436463239115, + "learning_rate": 7.605448023244127e-07, + "loss": 0.2771, + "step": 2004 + }, + { + "epoch": 1.0084436042848142, + "grad_norm": 0.17931260879453892, + "learning_rate": 7.603155498835075e-07, + "loss": 0.2668, + "step": 2005 + }, + { + "epoch": 1.008947700063012, + "grad_norm": 0.17071051877009574, + "learning_rate": 7.600862223431787e-07, + "loss": 0.2828, + "step": 2006 + }, + { + "epoch": 1.00945179584121, + "grad_norm": 0.17196704916567862, + "learning_rate": 7.598568197695858e-07, + "loss": 0.2794, + "step": 2007 + }, + { + "epoch": 1.0099558916194078, + "grad_norm": 0.17562030132329703, + "learning_rate": 7.596273422289103e-07, + "loss": 0.2833, + "step": 2008 + }, + { + "epoch": 1.0104599873976055, + "grad_norm": 0.18089872134544313, + "learning_rate": 7.593977897873548e-07, + "loss": 0.2833, + "step": 2009 + }, + { + "epoch": 1.0109640831758033, + "grad_norm": 0.18222407962137802, + "learning_rate": 7.591681625111439e-07, + "loss": 0.2816, + "step": 2010 + }, + { + "epoch": 1.0114681789540012, + "grad_norm": 0.18109669021912, + "learning_rate": 7.589384604665235e-07, + "loss": 0.288, + "step": 2011 + }, + { + "epoch": 1.011972274732199, + "grad_norm": 0.17814912652297168, + "learning_rate": 7.587086837197614e-07, + "loss": 0.2753, + "step": 2012 + }, + { + "epoch": 1.012476370510397, + "grad_norm": 0.16991715478737968, + "learning_rate": 7.584788323371466e-07, + "loss": 0.2855, + "step": 2013 + }, + { + "epoch": 1.0129804662885948, + "grad_norm": 0.17035192594812326, + "learning_rate": 7.582489063849899e-07, + "loss": 0.2629, + "step": 2014 + }, + { + "epoch": 1.0134845620667927, + "grad_norm": 0.17513209286858666, + "learning_rate": 7.580189059296234e-07, + "loss": 0.2772, + "step": 2015 + }, + { + "epoch": 1.0139886578449906, + "grad_norm": 0.19229502375720714, + "learning_rate": 7.577888310374007e-07, + "loss": 0.2764, + "step": 2016 + }, + { + "epoch": 1.0144927536231885, + "grad_norm": 0.16703432210037336, + "learning_rate": 7.575586817746975e-07, + "loss": 0.2749, + "step": 2017 + }, + { + "epoch": 1.0149968494013863, + "grad_norm": 0.17186499930225924, + "learning_rate": 7.573284582079098e-07, + "loss": 0.2696, + "step": 2018 + }, + { + "epoch": 1.0155009451795842, + "grad_norm": 0.20803141423115293, + "learning_rate": 7.570981604034563e-07, + "loss": 0.274, + "step": 2019 + }, + { + "epoch": 1.0160050409577819, + "grad_norm": 0.17601420291337044, + "learning_rate": 7.56867788427776e-07, + "loss": 0.2908, + "step": 2020 + }, + { + "epoch": 1.0165091367359798, + "grad_norm": 0.18319232781478847, + "learning_rate": 7.566373423473299e-07, + "loss": 0.2923, + "step": 2021 + }, + { + "epoch": 1.0170132325141776, + "grad_norm": 0.17400976480499947, + "learning_rate": 7.564068222286004e-07, + "loss": 0.2808, + "step": 2022 + }, + { + "epoch": 1.0175173282923755, + "grad_norm": 0.17912905548016714, + "learning_rate": 7.56176228138091e-07, + "loss": 0.2865, + "step": 2023 + }, + { + "epoch": 1.0180214240705734, + "grad_norm": 0.1722722944490204, + "learning_rate": 7.559455601423266e-07, + "loss": 0.2741, + "step": 2024 + }, + { + "epoch": 1.0185255198487713, + "grad_norm": 0.16659736898121513, + "learning_rate": 7.557148183078539e-07, + "loss": 0.2821, + "step": 2025 + }, + { + "epoch": 1.0190296156269691, + "grad_norm": 0.17461443095442888, + "learning_rate": 7.554840027012399e-07, + "loss": 0.2764, + "step": 2026 + }, + { + "epoch": 1.019533711405167, + "grad_norm": 0.18391922179509013, + "learning_rate": 7.552531133890738e-07, + "loss": 0.295, + "step": 2027 + }, + { + "epoch": 1.020037807183365, + "grad_norm": 0.20322970192602707, + "learning_rate": 7.550221504379659e-07, + "loss": 0.2865, + "step": 2028 + }, + { + "epoch": 1.0205419029615628, + "grad_norm": 0.177400814926691, + "learning_rate": 7.547911139145472e-07, + "loss": 0.2911, + "step": 2029 + }, + { + "epoch": 1.0210459987397607, + "grad_norm": 0.1834139875378068, + "learning_rate": 7.545600038854705e-07, + "loss": 0.2795, + "step": 2030 + }, + { + "epoch": 1.0215500945179583, + "grad_norm": 0.17234090006004013, + "learning_rate": 7.543288204174096e-07, + "loss": 0.2812, + "step": 2031 + }, + { + "epoch": 1.0220541902961562, + "grad_norm": 0.19523445589007987, + "learning_rate": 7.540975635770595e-07, + "loss": 0.2634, + "step": 2032 + }, + { + "epoch": 1.022558286074354, + "grad_norm": 0.17425774393185609, + "learning_rate": 7.538662334311363e-07, + "loss": 0.2923, + "step": 2033 + }, + { + "epoch": 1.023062381852552, + "grad_norm": 0.18190325865887647, + "learning_rate": 7.536348300463775e-07, + "loss": 0.2763, + "step": 2034 + }, + { + "epoch": 1.0235664776307498, + "grad_norm": 0.17592758218306667, + "learning_rate": 7.534033534895415e-07, + "loss": 0.2869, + "step": 2035 + }, + { + "epoch": 1.0240705734089477, + "grad_norm": 0.17477622082253388, + "learning_rate": 7.531718038274076e-07, + "loss": 0.291, + "step": 2036 + }, + { + "epoch": 1.0245746691871456, + "grad_norm": 0.22007149638124626, + "learning_rate": 7.529401811267765e-07, + "loss": 0.2676, + "step": 2037 + }, + { + "epoch": 1.0250787649653434, + "grad_norm": 0.16688921868859855, + "learning_rate": 7.527084854544701e-07, + "loss": 0.267, + "step": 2038 + }, + { + "epoch": 1.0255828607435413, + "grad_norm": 0.1867065847855188, + "learning_rate": 7.524767168773311e-07, + "loss": 0.2694, + "step": 2039 + }, + { + "epoch": 1.0260869565217392, + "grad_norm": 0.205670590026773, + "learning_rate": 7.522448754622234e-07, + "loss": 0.2889, + "step": 2040 + }, + { + "epoch": 1.026591052299937, + "grad_norm": 0.17475031087338616, + "learning_rate": 7.520129612760318e-07, + "loss": 0.2697, + "step": 2041 + }, + { + "epoch": 1.0270951480781347, + "grad_norm": 0.18541787593421327, + "learning_rate": 7.517809743856618e-07, + "loss": 0.2774, + "step": 2042 + }, + { + "epoch": 1.0275992438563326, + "grad_norm": 0.17371102962725932, + "learning_rate": 7.515489148580405e-07, + "loss": 0.2594, + "step": 2043 + }, + { + "epoch": 1.0281033396345305, + "grad_norm": 0.17385943346589472, + "learning_rate": 7.513167827601154e-07, + "loss": 0.2675, + "step": 2044 + }, + { + "epoch": 1.0286074354127284, + "grad_norm": 0.17595128300347995, + "learning_rate": 7.510845781588554e-07, + "loss": 0.2793, + "step": 2045 + }, + { + "epoch": 1.0291115311909262, + "grad_norm": 0.179237071829832, + "learning_rate": 7.5085230112125e-07, + "loss": 0.2792, + "step": 2046 + }, + { + "epoch": 1.0296156269691241, + "grad_norm": 0.17451242012769375, + "learning_rate": 7.506199517143095e-07, + "loss": 0.276, + "step": 2047 + }, + { + "epoch": 1.030119722747322, + "grad_norm": 0.18158012272605775, + "learning_rate": 7.503875300050656e-07, + "loss": 0.2788, + "step": 2048 + }, + { + "epoch": 1.0306238185255199, + "grad_norm": 0.18636629727957202, + "learning_rate": 7.501550360605704e-07, + "loss": 0.293, + "step": 2049 + }, + { + "epoch": 1.0311279143037178, + "grad_norm": 0.17720408232848278, + "learning_rate": 7.499224699478969e-07, + "loss": 0.2813, + "step": 2050 + }, + { + "epoch": 1.0316320100819156, + "grad_norm": 0.1916262449849373, + "learning_rate": 7.496898317341389e-07, + "loss": 0.2677, + "step": 2051 + }, + { + "epoch": 1.0321361058601135, + "grad_norm": 0.17348752838152534, + "learning_rate": 7.494571214864113e-07, + "loss": 0.2751, + "step": 2052 + }, + { + "epoch": 1.0326402016383114, + "grad_norm": 0.19058244263360036, + "learning_rate": 7.492243392718493e-07, + "loss": 0.2722, + "step": 2053 + }, + { + "epoch": 1.033144297416509, + "grad_norm": 0.1890009158650643, + "learning_rate": 7.489914851576095e-07, + "loss": 0.2655, + "step": 2054 + }, + { + "epoch": 1.033648393194707, + "grad_norm": 0.172808831512677, + "learning_rate": 7.487585592108685e-07, + "loss": 0.2847, + "step": 2055 + }, + { + "epoch": 1.0341524889729048, + "grad_norm": 0.17546626349825928, + "learning_rate": 7.485255614988241e-07, + "loss": 0.2817, + "step": 2056 + }, + { + "epoch": 1.0346565847511027, + "grad_norm": 0.1775777311831324, + "learning_rate": 7.482924920886949e-07, + "loss": 0.2754, + "step": 2057 + }, + { + "epoch": 1.0351606805293005, + "grad_norm": 0.1749949082772431, + "learning_rate": 7.480593510477197e-07, + "loss": 0.2831, + "step": 2058 + }, + { + "epoch": 1.0356647763074984, + "grad_norm": 0.18632122267782072, + "learning_rate": 7.478261384431585e-07, + "loss": 0.2725, + "step": 2059 + }, + { + "epoch": 1.0361688720856963, + "grad_norm": 0.17774286945133272, + "learning_rate": 7.475928543422916e-07, + "loss": 0.2815, + "step": 2060 + }, + { + "epoch": 1.0366729678638942, + "grad_norm": 0.18490810909000924, + "learning_rate": 7.473594988124199e-07, + "loss": 0.2846, + "step": 2061 + }, + { + "epoch": 1.037177063642092, + "grad_norm": 0.1728261249927515, + "learning_rate": 7.471260719208649e-07, + "loss": 0.2689, + "step": 2062 + }, + { + "epoch": 1.03768115942029, + "grad_norm": 0.18211593138151266, + "learning_rate": 7.468925737349693e-07, + "loss": 0.2923, + "step": 2063 + }, + { + "epoch": 1.0381852551984878, + "grad_norm": 0.17343204712457802, + "learning_rate": 7.466590043220955e-07, + "loss": 0.2809, + "step": 2064 + }, + { + "epoch": 1.0386893509766855, + "grad_norm": 0.17100328567163497, + "learning_rate": 7.46425363749627e-07, + "loss": 0.2754, + "step": 2065 + }, + { + "epoch": 1.0391934467548833, + "grad_norm": 0.1818697379651732, + "learning_rate": 7.461916520849674e-07, + "loss": 0.2724, + "step": 2066 + }, + { + "epoch": 1.0396975425330812, + "grad_norm": 0.18227441948785947, + "learning_rate": 7.459578693955413e-07, + "loss": 0.2709, + "step": 2067 + }, + { + "epoch": 1.040201638311279, + "grad_norm": 0.1760139361145817, + "learning_rate": 7.457240157487935e-07, + "loss": 0.2806, + "step": 2068 + }, + { + "epoch": 1.040705734089477, + "grad_norm": 0.1795019228044322, + "learning_rate": 7.454900912121894e-07, + "loss": 0.2631, + "step": 2069 + }, + { + "epoch": 1.0412098298676749, + "grad_norm": 0.18888478288221203, + "learning_rate": 7.452560958532147e-07, + "loss": 0.2889, + "step": 2070 + }, + { + "epoch": 1.0417139256458727, + "grad_norm": 0.21616086911752216, + "learning_rate": 7.450220297393756e-07, + "loss": 0.2683, + "step": 2071 + }, + { + "epoch": 1.0422180214240706, + "grad_norm": 0.17939705286161006, + "learning_rate": 7.447878929381989e-07, + "loss": 0.293, + "step": 2072 + }, + { + "epoch": 1.0427221172022685, + "grad_norm": 0.1798424204037018, + "learning_rate": 7.445536855172313e-07, + "loss": 0.2706, + "step": 2073 + }, + { + "epoch": 1.0432262129804664, + "grad_norm": 0.17088142114120458, + "learning_rate": 7.443194075440405e-07, + "loss": 0.2819, + "step": 2074 + }, + { + "epoch": 1.0437303087586642, + "grad_norm": 0.17886842519012602, + "learning_rate": 7.44085059086214e-07, + "loss": 0.2893, + "step": 2075 + }, + { + "epoch": 1.0442344045368621, + "grad_norm": 0.1803195665228042, + "learning_rate": 7.4385064021136e-07, + "loss": 0.2581, + "step": 2076 + }, + { + "epoch": 1.0447385003150598, + "grad_norm": 0.21531931727822987, + "learning_rate": 7.436161509871069e-07, + "loss": 0.2852, + "step": 2077 + }, + { + "epoch": 1.0452425960932576, + "grad_norm": 0.16822216389718422, + "learning_rate": 7.433815914811033e-07, + "loss": 0.265, + "step": 2078 + }, + { + "epoch": 1.0457466918714555, + "grad_norm": 0.17669997866886272, + "learning_rate": 7.431469617610183e-07, + "loss": 0.272, + "step": 2079 + }, + { + "epoch": 1.0462507876496534, + "grad_norm": 0.17526090755685278, + "learning_rate": 7.429122618945409e-07, + "loss": 0.2741, + "step": 2080 + }, + { + "epoch": 1.0467548834278513, + "grad_norm": 0.17590646009524652, + "learning_rate": 7.426774919493808e-07, + "loss": 0.2825, + "step": 2081 + }, + { + "epoch": 1.0472589792060492, + "grad_norm": 0.172169218025432, + "learning_rate": 7.424426519932676e-07, + "loss": 0.2588, + "step": 2082 + }, + { + "epoch": 1.047763074984247, + "grad_norm": 0.1805765775734755, + "learning_rate": 7.422077420939511e-07, + "loss": 0.2686, + "step": 2083 + }, + { + "epoch": 1.048267170762445, + "grad_norm": 0.18419409997013503, + "learning_rate": 7.419727623192013e-07, + "loss": 0.273, + "step": 2084 + }, + { + "epoch": 1.0487712665406428, + "grad_norm": 0.18365937796388698, + "learning_rate": 7.417377127368087e-07, + "loss": 0.2884, + "step": 2085 + }, + { + "epoch": 1.0492753623188407, + "grad_norm": 0.1690690352871276, + "learning_rate": 7.415025934145836e-07, + "loss": 0.2711, + "step": 2086 + }, + { + "epoch": 1.0497794580970385, + "grad_norm": 0.17486780574416283, + "learning_rate": 7.412674044203561e-07, + "loss": 0.2664, + "step": 2087 + }, + { + "epoch": 1.0502835538752362, + "grad_norm": 0.1809124255952242, + "learning_rate": 7.410321458219771e-07, + "loss": 0.291, + "step": 2088 + }, + { + "epoch": 1.050787649653434, + "grad_norm": 0.17461075528139122, + "learning_rate": 7.407968176873169e-07, + "loss": 0.2829, + "step": 2089 + }, + { + "epoch": 1.051291745431632, + "grad_norm": 0.17457876173639864, + "learning_rate": 7.405614200842668e-07, + "loss": 0.2733, + "step": 2090 + }, + { + "epoch": 1.0517958412098298, + "grad_norm": 0.1729548254201044, + "learning_rate": 7.40325953080737e-07, + "loss": 0.2617, + "step": 2091 + }, + { + "epoch": 1.0522999369880277, + "grad_norm": 0.1731484045045451, + "learning_rate": 7.400904167446585e-07, + "loss": 0.2717, + "step": 2092 + }, + { + "epoch": 1.0528040327662256, + "grad_norm": 0.18214823178441017, + "learning_rate": 7.39854811143982e-07, + "loss": 0.2715, + "step": 2093 + }, + { + "epoch": 1.0533081285444235, + "grad_norm": 0.21466975211674327, + "learning_rate": 7.396191363466785e-07, + "loss": 0.2567, + "step": 2094 + }, + { + "epoch": 1.0538122243226213, + "grad_norm": 0.1830837150368865, + "learning_rate": 7.393833924207385e-07, + "loss": 0.2669, + "step": 2095 + }, + { + "epoch": 1.0543163201008192, + "grad_norm": 0.1772829383212441, + "learning_rate": 7.391475794341725e-07, + "loss": 0.2798, + "step": 2096 + }, + { + "epoch": 1.054820415879017, + "grad_norm": 0.17883144461072345, + "learning_rate": 7.389116974550114e-07, + "loss": 0.3041, + "step": 2097 + }, + { + "epoch": 1.055324511657215, + "grad_norm": 0.16977328934679697, + "learning_rate": 7.386757465513055e-07, + "loss": 0.2896, + "step": 2098 + }, + { + "epoch": 1.0558286074354126, + "grad_norm": 0.17447815375668077, + "learning_rate": 7.384397267911252e-07, + "loss": 0.2813, + "step": 2099 + }, + { + "epoch": 1.0563327032136105, + "grad_norm": 0.17549836356043094, + "learning_rate": 7.382036382425608e-07, + "loss": 0.2604, + "step": 2100 + }, + { + "epoch": 1.0568367989918084, + "grad_norm": 0.168631281997645, + "learning_rate": 7.379674809737226e-07, + "loss": 0.2595, + "step": 2101 + }, + { + "epoch": 1.0573408947700063, + "grad_norm": 0.1779153181273789, + "learning_rate": 7.377312550527399e-07, + "loss": 0.2769, + "step": 2102 + }, + { + "epoch": 1.0578449905482041, + "grad_norm": 0.17489911047054466, + "learning_rate": 7.37494960547763e-07, + "loss": 0.2638, + "step": 2103 + }, + { + "epoch": 1.058349086326402, + "grad_norm": 0.18710665444256072, + "learning_rate": 7.372585975269612e-07, + "loss": 0.2733, + "step": 2104 + }, + { + "epoch": 1.0588531821045999, + "grad_norm": 0.16631564764201226, + "learning_rate": 7.370221660585238e-07, + "loss": 0.2549, + "step": 2105 + }, + { + "epoch": 1.0593572778827978, + "grad_norm": 0.1763591739795769, + "learning_rate": 7.367856662106595e-07, + "loss": 0.2724, + "step": 2106 + }, + { + "epoch": 1.0598613736609956, + "grad_norm": 0.17005232527255398, + "learning_rate": 7.365490980515976e-07, + "loss": 0.294, + "step": 2107 + }, + { + "epoch": 1.0603654694391935, + "grad_norm": 0.18645746328153773, + "learning_rate": 7.36312461649586e-07, + "loss": 0.28, + "step": 2108 + }, + { + "epoch": 1.0608695652173914, + "grad_norm": 0.17883161921298946, + "learning_rate": 7.360757570728934e-07, + "loss": 0.2848, + "step": 2109 + }, + { + "epoch": 1.061373660995589, + "grad_norm": 0.16761850246628487, + "learning_rate": 7.358389843898071e-07, + "loss": 0.267, + "step": 2110 + }, + { + "epoch": 1.061877756773787, + "grad_norm": 0.16661816127263798, + "learning_rate": 7.356021436686347e-07, + "loss": 0.2669, + "step": 2111 + }, + { + "epoch": 1.0623818525519848, + "grad_norm": 0.17946012130007954, + "learning_rate": 7.353652349777033e-07, + "loss": 0.2973, + "step": 2112 + }, + { + "epoch": 1.0628859483301827, + "grad_norm": 0.18375956514983224, + "learning_rate": 7.351282583853597e-07, + "loss": 0.2679, + "step": 2113 + }, + { + "epoch": 1.0633900441083806, + "grad_norm": 0.17321969243454696, + "learning_rate": 7.348912139599701e-07, + "loss": 0.2699, + "step": 2114 + }, + { + "epoch": 1.0638941398865784, + "grad_norm": 0.17030523274157758, + "learning_rate": 7.346541017699204e-07, + "loss": 0.2752, + "step": 2115 + }, + { + "epoch": 1.0643982356647763, + "grad_norm": 0.17855378803648458, + "learning_rate": 7.344169218836161e-07, + "loss": 0.2634, + "step": 2116 + }, + { + "epoch": 1.0649023314429742, + "grad_norm": 0.17747877929878342, + "learning_rate": 7.341796743694817e-07, + "loss": 0.2796, + "step": 2117 + }, + { + "epoch": 1.065406427221172, + "grad_norm": 0.17074631177901406, + "learning_rate": 7.339423592959619e-07, + "loss": 0.2741, + "step": 2118 + }, + { + "epoch": 1.06591052299937, + "grad_norm": 0.16930821017014017, + "learning_rate": 7.337049767315207e-07, + "loss": 0.2669, + "step": 2119 + }, + { + "epoch": 1.0664146187775678, + "grad_norm": 0.17690264524984564, + "learning_rate": 7.334675267446415e-07, + "loss": 0.2725, + "step": 2120 + }, + { + "epoch": 1.0669187145557655, + "grad_norm": 0.17283246521488307, + "learning_rate": 7.33230009403827e-07, + "loss": 0.2883, + "step": 2121 + }, + { + "epoch": 1.0674228103339634, + "grad_norm": 0.1829853318315655, + "learning_rate": 7.329924247775997e-07, + "loss": 0.266, + "step": 2122 + }, + { + "epoch": 1.0679269061121612, + "grad_norm": 0.1685321993543116, + "learning_rate": 7.327547729345012e-07, + "loss": 0.2811, + "step": 2123 + }, + { + "epoch": 1.0684310018903591, + "grad_norm": 0.1849302770037306, + "learning_rate": 7.325170539430924e-07, + "loss": 0.2819, + "step": 2124 + }, + { + "epoch": 1.068935097668557, + "grad_norm": 0.1722656015767197, + "learning_rate": 7.32279267871954e-07, + "loss": 0.2689, + "step": 2125 + }, + { + "epoch": 1.0694391934467549, + "grad_norm": 0.1724271053593593, + "learning_rate": 7.320414147896857e-07, + "loss": 0.2667, + "step": 2126 + }, + { + "epoch": 1.0699432892249527, + "grad_norm": 0.17269435432280894, + "learning_rate": 7.318034947649064e-07, + "loss": 0.2799, + "step": 2127 + }, + { + "epoch": 1.0704473850031506, + "grad_norm": 0.17637659253320054, + "learning_rate": 7.31565507866255e-07, + "loss": 0.2916, + "step": 2128 + }, + { + "epoch": 1.0709514807813485, + "grad_norm": 0.1773899855628958, + "learning_rate": 7.313274541623891e-07, + "loss": 0.2817, + "step": 2129 + }, + { + "epoch": 1.0714555765595464, + "grad_norm": 0.18431910974801158, + "learning_rate": 7.310893337219857e-07, + "loss": 0.2909, + "step": 2130 + }, + { + "epoch": 1.0719596723377443, + "grad_norm": 0.16833157774674223, + "learning_rate": 7.30851146613741e-07, + "loss": 0.2765, + "step": 2131 + }, + { + "epoch": 1.0724637681159421, + "grad_norm": 0.17919301749031188, + "learning_rate": 7.306128929063705e-07, + "loss": 0.2882, + "step": 2132 + }, + { + "epoch": 1.0729678638941398, + "grad_norm": 0.17270532104058647, + "learning_rate": 7.303745726686091e-07, + "loss": 0.2781, + "step": 2133 + }, + { + "epoch": 1.0734719596723377, + "grad_norm": 0.1783614757197418, + "learning_rate": 7.301361859692103e-07, + "loss": 0.2754, + "step": 2134 + }, + { + "epoch": 1.0739760554505355, + "grad_norm": 0.17544697010356183, + "learning_rate": 7.298977328769476e-07, + "loss": 0.2854, + "step": 2135 + }, + { + "epoch": 1.0744801512287334, + "grad_norm": 0.18870456706038555, + "learning_rate": 7.296592134606133e-07, + "loss": 0.2766, + "step": 2136 + }, + { + "epoch": 1.0749842470069313, + "grad_norm": 0.18171100036742838, + "learning_rate": 7.294206277890185e-07, + "loss": 0.2744, + "step": 2137 + }, + { + "epoch": 1.0754883427851292, + "grad_norm": 0.1677025505094429, + "learning_rate": 7.291819759309936e-07, + "loss": 0.2647, + "step": 2138 + }, + { + "epoch": 1.075992438563327, + "grad_norm": 0.17288124268881677, + "learning_rate": 7.289432579553885e-07, + "loss": 0.2731, + "step": 2139 + }, + { + "epoch": 1.076496534341525, + "grad_norm": 0.1703425006119981, + "learning_rate": 7.287044739310717e-07, + "loss": 0.2795, + "step": 2140 + }, + { + "epoch": 1.0770006301197228, + "grad_norm": 0.20025547919726422, + "learning_rate": 7.284656239269308e-07, + "loss": 0.2877, + "step": 2141 + }, + { + "epoch": 1.0775047258979207, + "grad_norm": 0.17516136688479655, + "learning_rate": 7.282267080118727e-07, + "loss": 0.2929, + "step": 2142 + }, + { + "epoch": 1.0780088216761186, + "grad_norm": 0.17474524965964358, + "learning_rate": 7.279877262548232e-07, + "loss": 0.2768, + "step": 2143 + }, + { + "epoch": 1.0785129174543164, + "grad_norm": 0.17466518332420242, + "learning_rate": 7.277486787247269e-07, + "loss": 0.285, + "step": 2144 + }, + { + "epoch": 1.079017013232514, + "grad_norm": 0.1819043058750424, + "learning_rate": 7.275095654905477e-07, + "loss": 0.2952, + "step": 2145 + }, + { + "epoch": 1.079521109010712, + "grad_norm": 0.1737875425062245, + "learning_rate": 7.272703866212682e-07, + "loss": 0.2827, + "step": 2146 + }, + { + "epoch": 1.0800252047889098, + "grad_norm": 0.16958237624506534, + "learning_rate": 7.2703114218589e-07, + "loss": 0.268, + "step": 2147 + }, + { + "epoch": 1.0805293005671077, + "grad_norm": 0.17257636404154555, + "learning_rate": 7.267918322534336e-07, + "loss": 0.2971, + "step": 2148 + }, + { + "epoch": 1.0810333963453056, + "grad_norm": 0.16928675310322286, + "learning_rate": 7.265524568929386e-07, + "loss": 0.2736, + "step": 2149 + }, + { + "epoch": 1.0815374921235035, + "grad_norm": 0.1760838964224455, + "learning_rate": 7.263130161734632e-07, + "loss": 0.2849, + "step": 2150 + }, + { + "epoch": 1.0820415879017014, + "grad_norm": 0.18328166077952487, + "learning_rate": 7.260735101640845e-07, + "loss": 0.258, + "step": 2151 + }, + { + "epoch": 1.0825456836798992, + "grad_norm": 0.1739117009909244, + "learning_rate": 7.258339389338987e-07, + "loss": 0.2924, + "step": 2152 + }, + { + "epoch": 1.083049779458097, + "grad_norm": 0.1781115926870173, + "learning_rate": 7.255943025520203e-07, + "loss": 0.2596, + "step": 2153 + }, + { + "epoch": 1.083553875236295, + "grad_norm": 0.16844391904359102, + "learning_rate": 7.253546010875832e-07, + "loss": 0.2703, + "step": 2154 + }, + { + "epoch": 1.0840579710144929, + "grad_norm": 0.3278887015033608, + "learning_rate": 7.251148346097398e-07, + "loss": 0.2779, + "step": 2155 + }, + { + "epoch": 1.0845620667926905, + "grad_norm": 0.18990927841452848, + "learning_rate": 7.248750031876609e-07, + "loss": 0.2946, + "step": 2156 + }, + { + "epoch": 1.0850661625708884, + "grad_norm": 0.17850530711366283, + "learning_rate": 7.246351068905368e-07, + "loss": 0.2875, + "step": 2157 + }, + { + "epoch": 1.0855702583490863, + "grad_norm": 0.17728631017755883, + "learning_rate": 7.243951457875758e-07, + "loss": 0.2818, + "step": 2158 + }, + { + "epoch": 1.0860743541272841, + "grad_norm": 0.1774048928579683, + "learning_rate": 7.241551199480051e-07, + "loss": 0.3007, + "step": 2159 + }, + { + "epoch": 1.086578449905482, + "grad_norm": 0.17200567169289427, + "learning_rate": 7.239150294410712e-07, + "loss": 0.2717, + "step": 2160 + }, + { + "epoch": 1.08708254568368, + "grad_norm": 0.17241132084381322, + "learning_rate": 7.236748743360384e-07, + "loss": 0.2668, + "step": 2161 + }, + { + "epoch": 1.0875866414618778, + "grad_norm": 0.17166550486003698, + "learning_rate": 7.234346547021896e-07, + "loss": 0.2995, + "step": 2162 + }, + { + "epoch": 1.0880907372400757, + "grad_norm": 0.17146325451469707, + "learning_rate": 7.231943706088273e-07, + "loss": 0.2833, + "step": 2163 + }, + { + "epoch": 1.0885948330182735, + "grad_norm": 0.17437078445725077, + "learning_rate": 7.229540221252716e-07, + "loss": 0.2667, + "step": 2164 + }, + { + "epoch": 1.0890989287964714, + "grad_norm": 0.17975096501612886, + "learning_rate": 7.227136093208617e-07, + "loss": 0.2789, + "step": 2165 + }, + { + "epoch": 1.0896030245746693, + "grad_norm": 0.17177377044930098, + "learning_rate": 7.22473132264955e-07, + "loss": 0.2712, + "step": 2166 + }, + { + "epoch": 1.090107120352867, + "grad_norm": 0.16926098260953162, + "learning_rate": 7.22232591026928e-07, + "loss": 0.2655, + "step": 2167 + }, + { + "epoch": 1.0906112161310648, + "grad_norm": 0.1813817491695504, + "learning_rate": 7.21991985676175e-07, + "loss": 0.2782, + "step": 2168 + }, + { + "epoch": 1.0911153119092627, + "grad_norm": 0.1740637250383412, + "learning_rate": 7.217513162821094e-07, + "loss": 0.282, + "step": 2169 + }, + { + "epoch": 1.0916194076874606, + "grad_norm": 0.16909096484224836, + "learning_rate": 7.215105829141627e-07, + "loss": 0.2777, + "step": 2170 + }, + { + "epoch": 1.0921235034656585, + "grad_norm": 0.19169602165222183, + "learning_rate": 7.21269785641785e-07, + "loss": 0.2746, + "step": 2171 + }, + { + "epoch": 1.0926275992438563, + "grad_norm": 0.17606098979705256, + "learning_rate": 7.210289245344447e-07, + "loss": 0.2796, + "step": 2172 + }, + { + "epoch": 1.0931316950220542, + "grad_norm": 0.17081810769761613, + "learning_rate": 7.207879996616291e-07, + "loss": 0.272, + "step": 2173 + }, + { + "epoch": 1.093635790800252, + "grad_norm": 0.18390464264943393, + "learning_rate": 7.205470110928431e-07, + "loss": 0.2598, + "step": 2174 + }, + { + "epoch": 1.09413988657845, + "grad_norm": 0.1752617305053687, + "learning_rate": 7.203059588976107e-07, + "loss": 0.2702, + "step": 2175 + }, + { + "epoch": 1.0946439823566478, + "grad_norm": 0.17985065994012292, + "learning_rate": 7.20064843145474e-07, + "loss": 0.279, + "step": 2176 + }, + { + "epoch": 1.0951480781348457, + "grad_norm": 0.19788977536594046, + "learning_rate": 7.198236639059932e-07, + "loss": 0.2763, + "step": 2177 + }, + { + "epoch": 1.0956521739130434, + "grad_norm": 0.17078468603904942, + "learning_rate": 7.19582421248747e-07, + "loss": 0.2696, + "step": 2178 + }, + { + "epoch": 1.0961562696912412, + "grad_norm": 0.17485440247279033, + "learning_rate": 7.193411152433327e-07, + "loss": 0.2703, + "step": 2179 + }, + { + "epoch": 1.0966603654694391, + "grad_norm": 0.17440915689799694, + "learning_rate": 7.190997459593651e-07, + "loss": 0.272, + "step": 2180 + }, + { + "epoch": 1.097164461247637, + "grad_norm": 0.1798376956782012, + "learning_rate": 7.188583134664783e-07, + "loss": 0.2857, + "step": 2181 + }, + { + "epoch": 1.0976685570258349, + "grad_norm": 0.17843642267892243, + "learning_rate": 7.186168178343239e-07, + "loss": 0.3012, + "step": 2182 + }, + { + "epoch": 1.0981726528040328, + "grad_norm": 0.18526971863081118, + "learning_rate": 7.183752591325716e-07, + "loss": 0.2691, + "step": 2183 + }, + { + "epoch": 1.0986767485822306, + "grad_norm": 0.19144116070312203, + "learning_rate": 7.181336374309098e-07, + "loss": 0.2925, + "step": 2184 + }, + { + "epoch": 1.0991808443604285, + "grad_norm": 0.1727996053307209, + "learning_rate": 7.17891952799045e-07, + "loss": 0.2745, + "step": 2185 + }, + { + "epoch": 1.0996849401386264, + "grad_norm": 0.1706944820187512, + "learning_rate": 7.176502053067016e-07, + "loss": 0.2773, + "step": 2186 + }, + { + "epoch": 1.1001890359168243, + "grad_norm": 0.17180090645464358, + "learning_rate": 7.17408395023622e-07, + "loss": 0.2812, + "step": 2187 + }, + { + "epoch": 1.1006931316950221, + "grad_norm": 0.18042375948484118, + "learning_rate": 7.171665220195675e-07, + "loss": 0.2832, + "step": 2188 + }, + { + "epoch": 1.1011972274732198, + "grad_norm": 0.17833078926842466, + "learning_rate": 7.169245863643165e-07, + "loss": 0.2781, + "step": 2189 + }, + { + "epoch": 1.1011972274732198, + "eval_loss": 0.3109322786331177, + "eval_runtime": 18.3883, + "eval_samples_per_second": 46.497, + "eval_steps_per_second": 0.979, + "step": 2189 + }, + { + "epoch": 1.1017013232514177, + "grad_norm": 0.17668710642540988, + "learning_rate": 7.166825881276663e-07, + "loss": 0.2814, + "step": 2190 + }, + { + "epoch": 1.1022054190296156, + "grad_norm": 0.17892757631840098, + "learning_rate": 7.164405273794315e-07, + "loss": 0.287, + "step": 2191 + }, + { + "epoch": 1.1027095148078134, + "grad_norm": 0.19272913994143642, + "learning_rate": 7.161984041894453e-07, + "loss": 0.2671, + "step": 2192 + }, + { + "epoch": 1.1032136105860113, + "grad_norm": 0.1686523316395002, + "learning_rate": 7.159562186275589e-07, + "loss": 0.281, + "step": 2193 + }, + { + "epoch": 1.1037177063642092, + "grad_norm": 0.1802469776190519, + "learning_rate": 7.157139707636411e-07, + "loss": 0.2778, + "step": 2194 + }, + { + "epoch": 1.104221802142407, + "grad_norm": 0.17372788767408012, + "learning_rate": 7.15471660667579e-07, + "loss": 0.2766, + "step": 2195 + }, + { + "epoch": 1.104725897920605, + "grad_norm": 0.17515408548411832, + "learning_rate": 7.152292884092776e-07, + "loss": 0.2781, + "step": 2196 + }, + { + "epoch": 1.1052299936988028, + "grad_norm": 0.17120886057523532, + "learning_rate": 7.149868540586599e-07, + "loss": 0.2804, + "step": 2197 + }, + { + "epoch": 1.1057340894770007, + "grad_norm": 0.1967834646928095, + "learning_rate": 7.147443576856667e-07, + "loss": 0.2701, + "step": 2198 + }, + { + "epoch": 1.1062381852551986, + "grad_norm": 0.17730817500369223, + "learning_rate": 7.145017993602562e-07, + "loss": 0.2743, + "step": 2199 + }, + { + "epoch": 1.1067422810333964, + "grad_norm": 0.1963166032968741, + "learning_rate": 7.142591791524056e-07, + "loss": 0.2837, + "step": 2200 + }, + { + "epoch": 1.107246376811594, + "grad_norm": 0.17742060612270208, + "learning_rate": 7.14016497132109e-07, + "loss": 0.2878, + "step": 2201 + }, + { + "epoch": 1.107750472589792, + "grad_norm": 0.1746362136216461, + "learning_rate": 7.137737533693787e-07, + "loss": 0.2639, + "step": 2202 + }, + { + "epoch": 1.1082545683679899, + "grad_norm": 0.17804698672223912, + "learning_rate": 7.135309479342449e-07, + "loss": 0.2787, + "step": 2203 + }, + { + "epoch": 1.1087586641461877, + "grad_norm": 0.17579637022355174, + "learning_rate": 7.132880808967553e-07, + "loss": 0.2842, + "step": 2204 + }, + { + "epoch": 1.1092627599243856, + "grad_norm": 0.16989385004038, + "learning_rate": 7.130451523269757e-07, + "loss": 0.2677, + "step": 2205 + }, + { + "epoch": 1.1097668557025835, + "grad_norm": 0.17551046931385217, + "learning_rate": 7.128021622949894e-07, + "loss": 0.2797, + "step": 2206 + }, + { + "epoch": 1.1102709514807814, + "grad_norm": 0.17789389712157294, + "learning_rate": 7.125591108708973e-07, + "loss": 0.2638, + "step": 2207 + }, + { + "epoch": 1.1107750472589792, + "grad_norm": 0.1676683446730687, + "learning_rate": 7.123159981248187e-07, + "loss": 0.2743, + "step": 2208 + }, + { + "epoch": 1.1112791430371771, + "grad_norm": 0.17269955486897387, + "learning_rate": 7.120728241268897e-07, + "loss": 0.2753, + "step": 2209 + }, + { + "epoch": 1.111783238815375, + "grad_norm": 0.1772494652206068, + "learning_rate": 7.118295889472648e-07, + "loss": 0.2836, + "step": 2210 + }, + { + "epoch": 1.1122873345935729, + "grad_norm": 0.17063122243632267, + "learning_rate": 7.115862926561156e-07, + "loss": 0.2727, + "step": 2211 + }, + { + "epoch": 1.1127914303717708, + "grad_norm": 0.18443201636788845, + "learning_rate": 7.113429353236317e-07, + "loss": 0.2724, + "step": 2212 + }, + { + "epoch": 1.1132955261499684, + "grad_norm": 0.17187215404760892, + "learning_rate": 7.110995170200203e-07, + "loss": 0.2791, + "step": 2213 + }, + { + "epoch": 1.1137996219281663, + "grad_norm": 0.21059430107866162, + "learning_rate": 7.108560378155058e-07, + "loss": 0.2753, + "step": 2214 + }, + { + "epoch": 1.1143037177063642, + "grad_norm": 0.17008648209502905, + "learning_rate": 7.106124977803305e-07, + "loss": 0.2679, + "step": 2215 + }, + { + "epoch": 1.114807813484562, + "grad_norm": 0.17428666864442163, + "learning_rate": 7.103688969847544e-07, + "loss": 0.2788, + "step": 2216 + }, + { + "epoch": 1.11531190926276, + "grad_norm": 0.16978332708330493, + "learning_rate": 7.101252354990547e-07, + "loss": 0.2812, + "step": 2217 + }, + { + "epoch": 1.1158160050409578, + "grad_norm": 0.16763401695592228, + "learning_rate": 7.09881513393526e-07, + "loss": 0.2718, + "step": 2218 + }, + { + "epoch": 1.1163201008191557, + "grad_norm": 0.17695900270150158, + "learning_rate": 7.09637730738481e-07, + "loss": 0.2775, + "step": 2219 + }, + { + "epoch": 1.1168241965973535, + "grad_norm": 0.17577147289069614, + "learning_rate": 7.093938876042495e-07, + "loss": 0.2817, + "step": 2220 + }, + { + "epoch": 1.1173282923755514, + "grad_norm": 0.17875894680716742, + "learning_rate": 7.091499840611782e-07, + "loss": 0.2822, + "step": 2221 + }, + { + "epoch": 1.1178323881537493, + "grad_norm": 0.17471488732657428, + "learning_rate": 7.089060201796323e-07, + "loss": 0.2804, + "step": 2222 + }, + { + "epoch": 1.1183364839319472, + "grad_norm": 0.17146413279465827, + "learning_rate": 7.086619960299936e-07, + "loss": 0.2662, + "step": 2223 + }, + { + "epoch": 1.1188405797101448, + "grad_norm": 0.17497006413256116, + "learning_rate": 7.084179116826616e-07, + "loss": 0.273, + "step": 2224 + }, + { + "epoch": 1.1193446754883427, + "grad_norm": 0.17948306341455134, + "learning_rate": 7.081737672080533e-07, + "loss": 0.2772, + "step": 2225 + }, + { + "epoch": 1.1198487712665406, + "grad_norm": 0.18647108183544914, + "learning_rate": 7.079295626766026e-07, + "loss": 0.2788, + "step": 2226 + }, + { + "epoch": 1.1203528670447385, + "grad_norm": 0.1974823122970526, + "learning_rate": 7.076852981587613e-07, + "loss": 0.2993, + "step": 2227 + }, + { + "epoch": 1.1208569628229363, + "grad_norm": 0.17288627565030815, + "learning_rate": 7.07440973724998e-07, + "loss": 0.2775, + "step": 2228 + }, + { + "epoch": 1.1213610586011342, + "grad_norm": 0.18565848907871105, + "learning_rate": 7.071965894457987e-07, + "loss": 0.2763, + "step": 2229 + }, + { + "epoch": 1.121865154379332, + "grad_norm": 0.18033187048793917, + "learning_rate": 7.069521453916669e-07, + "loss": 0.2814, + "step": 2230 + }, + { + "epoch": 1.12236925015753, + "grad_norm": 0.17992401726222884, + "learning_rate": 7.067076416331233e-07, + "loss": 0.2729, + "step": 2231 + }, + { + "epoch": 1.1228733459357279, + "grad_norm": 0.17515567963879827, + "learning_rate": 7.064630782407053e-07, + "loss": 0.2971, + "step": 2232 + }, + { + "epoch": 1.1233774417139257, + "grad_norm": 0.16996972984196193, + "learning_rate": 7.062184552849683e-07, + "loss": 0.2752, + "step": 2233 + }, + { + "epoch": 1.1238815374921236, + "grad_norm": 0.1681151195763795, + "learning_rate": 7.059737728364844e-07, + "loss": 0.2764, + "step": 2234 + }, + { + "epoch": 1.1243856332703213, + "grad_norm": 0.17197455617986673, + "learning_rate": 7.05729030965843e-07, + "loss": 0.2732, + "step": 2235 + }, + { + "epoch": 1.1248897290485191, + "grad_norm": 0.17380812930247844, + "learning_rate": 7.054842297436506e-07, + "loss": 0.2713, + "step": 2236 + }, + { + "epoch": 1.125393824826717, + "grad_norm": 0.17146791708239711, + "learning_rate": 7.052393692405308e-07, + "loss": 0.274, + "step": 2237 + }, + { + "epoch": 1.125897920604915, + "grad_norm": 0.17522062714630593, + "learning_rate": 7.049944495271244e-07, + "loss": 0.291, + "step": 2238 + }, + { + "epoch": 1.1264020163831128, + "grad_norm": 0.18020842179105392, + "learning_rate": 7.047494706740891e-07, + "loss": 0.2776, + "step": 2239 + }, + { + "epoch": 1.1269061121613106, + "grad_norm": 0.17094614470132977, + "learning_rate": 7.045044327521e-07, + "loss": 0.2774, + "step": 2240 + }, + { + "epoch": 1.1274102079395085, + "grad_norm": 0.2013051302911893, + "learning_rate": 7.042593358318488e-07, + "loss": 0.274, + "step": 2241 + }, + { + "epoch": 1.1279143037177064, + "grad_norm": 0.17297868860952875, + "learning_rate": 7.040141799840446e-07, + "loss": 0.2651, + "step": 2242 + }, + { + "epoch": 1.1284183994959043, + "grad_norm": 0.17357556224692539, + "learning_rate": 7.037689652794132e-07, + "loss": 0.2742, + "step": 2243 + }, + { + "epoch": 1.1289224952741022, + "grad_norm": 0.1726295931526859, + "learning_rate": 7.035236917886977e-07, + "loss": 0.2749, + "step": 2244 + }, + { + "epoch": 1.1294265910523, + "grad_norm": 0.18408306057697696, + "learning_rate": 7.032783595826577e-07, + "loss": 0.2749, + "step": 2245 + }, + { + "epoch": 1.1299306868304977, + "grad_norm": 0.1760499835143201, + "learning_rate": 7.030329687320704e-07, + "loss": 0.2757, + "step": 2246 + }, + { + "epoch": 1.1304347826086956, + "grad_norm": 0.17625183802064207, + "learning_rate": 7.027875193077293e-07, + "loss": 0.2794, + "step": 2247 + }, + { + "epoch": 1.1309388783868934, + "grad_norm": 0.17473737339780746, + "learning_rate": 7.02542011380445e-07, + "loss": 0.2814, + "step": 2248 + }, + { + "epoch": 1.1314429741650913, + "grad_norm": 0.17699959608452862, + "learning_rate": 7.022964450210451e-07, + "loss": 0.2793, + "step": 2249 + }, + { + "epoch": 1.1319470699432892, + "grad_norm": 0.17541995578274833, + "learning_rate": 7.020508203003741e-07, + "loss": 0.2669, + "step": 2250 + }, + { + "epoch": 1.132451165721487, + "grad_norm": 0.1835401757832211, + "learning_rate": 7.01805137289293e-07, + "loss": 0.2743, + "step": 2251 + }, + { + "epoch": 1.132955261499685, + "grad_norm": 0.19020483202650845, + "learning_rate": 7.015593960586799e-07, + "loss": 0.2774, + "step": 2252 + }, + { + "epoch": 1.1334593572778828, + "grad_norm": 0.17000536309275166, + "learning_rate": 7.013135966794295e-07, + "loss": 0.2811, + "step": 2253 + }, + { + "epoch": 1.1339634530560807, + "grad_norm": 0.17801380513763565, + "learning_rate": 7.010677392224537e-07, + "loss": 0.2935, + "step": 2254 + }, + { + "epoch": 1.1344675488342786, + "grad_norm": 0.1787152664038223, + "learning_rate": 7.008218237586807e-07, + "loss": 0.2664, + "step": 2255 + }, + { + "epoch": 1.1349716446124765, + "grad_norm": 0.17254574236169345, + "learning_rate": 7.005758503590555e-07, + "loss": 0.2844, + "step": 2256 + }, + { + "epoch": 1.1354757403906741, + "grad_norm": 0.18663732747974232, + "learning_rate": 7.003298190945399e-07, + "loss": 0.2615, + "step": 2257 + }, + { + "epoch": 1.135979836168872, + "grad_norm": 0.18225915688322497, + "learning_rate": 7.000837300361127e-07, + "loss": 0.273, + "step": 2258 + }, + { + "epoch": 1.1364839319470699, + "grad_norm": 0.17525558203612415, + "learning_rate": 6.998375832547687e-07, + "loss": 0.2709, + "step": 2259 + }, + { + "epoch": 1.1369880277252677, + "grad_norm": 0.17721206051763022, + "learning_rate": 6.995913788215198e-07, + "loss": 0.2638, + "step": 2260 + }, + { + "epoch": 1.1374921235034656, + "grad_norm": 0.17478241116943685, + "learning_rate": 6.993451168073945e-07, + "loss": 0.2773, + "step": 2261 + }, + { + "epoch": 1.1379962192816635, + "grad_norm": 0.1774428076107305, + "learning_rate": 6.990987972834382e-07, + "loss": 0.2631, + "step": 2262 + }, + { + "epoch": 1.1385003150598614, + "grad_norm": 0.17410794951513323, + "learning_rate": 6.988524203207117e-07, + "loss": 0.2707, + "step": 2263 + }, + { + "epoch": 1.1390044108380593, + "grad_norm": 0.17441217480295762, + "learning_rate": 6.986059859902941e-07, + "loss": 0.2868, + "step": 2264 + }, + { + "epoch": 1.1395085066162571, + "grad_norm": 0.1747291161914758, + "learning_rate": 6.983594943632799e-07, + "loss": 0.2649, + "step": 2265 + }, + { + "epoch": 1.140012602394455, + "grad_norm": 0.17307228646736642, + "learning_rate": 6.981129455107802e-07, + "loss": 0.2861, + "step": 2266 + }, + { + "epoch": 1.1405166981726529, + "grad_norm": 0.18952720395194383, + "learning_rate": 6.978663395039231e-07, + "loss": 0.2865, + "step": 2267 + }, + { + "epoch": 1.1410207939508505, + "grad_norm": 0.17922645708493065, + "learning_rate": 6.976196764138526e-07, + "loss": 0.2808, + "step": 2268 + }, + { + "epoch": 1.1415248897290486, + "grad_norm": 0.17827854018956354, + "learning_rate": 6.973729563117297e-07, + "loss": 0.2667, + "step": 2269 + }, + { + "epoch": 1.1420289855072463, + "grad_norm": 0.18226122048319768, + "learning_rate": 6.971261792687315e-07, + "loss": 0.2784, + "step": 2270 + }, + { + "epoch": 1.1425330812854442, + "grad_norm": 0.1720926208778343, + "learning_rate": 6.968793453560518e-07, + "loss": 0.2696, + "step": 2271 + }, + { + "epoch": 1.143037177063642, + "grad_norm": 0.18305708913870564, + "learning_rate": 6.966324546449006e-07, + "loss": 0.281, + "step": 2272 + }, + { + "epoch": 1.14354127284184, + "grad_norm": 0.1731336224949858, + "learning_rate": 6.963855072065043e-07, + "loss": 0.2779, + "step": 2273 + }, + { + "epoch": 1.1440453686200378, + "grad_norm": 0.1748927423648997, + "learning_rate": 6.961385031121057e-07, + "loss": 0.2753, + "step": 2274 + }, + { + "epoch": 1.1445494643982357, + "grad_norm": 0.17333262216089212, + "learning_rate": 6.958914424329638e-07, + "loss": 0.2885, + "step": 2275 + }, + { + "epoch": 1.1450535601764336, + "grad_norm": 0.18183632440429925, + "learning_rate": 6.956443252403544e-07, + "loss": 0.2724, + "step": 2276 + }, + { + "epoch": 1.1455576559546314, + "grad_norm": 0.17479355568350904, + "learning_rate": 6.953971516055691e-07, + "loss": 0.2785, + "step": 2277 + }, + { + "epoch": 1.1460617517328293, + "grad_norm": 0.1785764248499925, + "learning_rate": 6.951499215999161e-07, + "loss": 0.2818, + "step": 2278 + }, + { + "epoch": 1.1465658475110272, + "grad_norm": 0.1707075256134137, + "learning_rate": 6.949026352947194e-07, + "loss": 0.2776, + "step": 2279 + }, + { + "epoch": 1.147069943289225, + "grad_norm": 0.17454520681763225, + "learning_rate": 6.946552927613201e-07, + "loss": 0.279, + "step": 2280 + }, + { + "epoch": 1.1475740390674227, + "grad_norm": 0.16725292243096776, + "learning_rate": 6.944078940710743e-07, + "loss": 0.2706, + "step": 2281 + }, + { + "epoch": 1.1480781348456206, + "grad_norm": 0.18158312079337943, + "learning_rate": 6.941604392953555e-07, + "loss": 0.2857, + "step": 2282 + }, + { + "epoch": 1.1485822306238185, + "grad_norm": 0.17686176298428924, + "learning_rate": 6.939129285055527e-07, + "loss": 0.2982, + "step": 2283 + }, + { + "epoch": 1.1490863264020164, + "grad_norm": 0.16814975626978187, + "learning_rate": 6.936653617730712e-07, + "loss": 0.2719, + "step": 2284 + }, + { + "epoch": 1.1495904221802142, + "grad_norm": 0.1759711441539806, + "learning_rate": 6.934177391693327e-07, + "loss": 0.2744, + "step": 2285 + }, + { + "epoch": 1.150094517958412, + "grad_norm": 0.17511838680958677, + "learning_rate": 6.931700607657744e-07, + "loss": 0.2739, + "step": 2286 + }, + { + "epoch": 1.15059861373661, + "grad_norm": 0.17740624835254679, + "learning_rate": 6.929223266338504e-07, + "loss": 0.269, + "step": 2287 + }, + { + "epoch": 1.1511027095148079, + "grad_norm": 0.1681514431068215, + "learning_rate": 6.926745368450301e-07, + "loss": 0.2749, + "step": 2288 + }, + { + "epoch": 1.1516068052930057, + "grad_norm": 0.1691426996495545, + "learning_rate": 6.924266914707995e-07, + "loss": 0.2644, + "step": 2289 + }, + { + "epoch": 1.1521109010712036, + "grad_norm": 0.1790330541665502, + "learning_rate": 6.921787905826605e-07, + "loss": 0.2823, + "step": 2290 + }, + { + "epoch": 1.1526149968494015, + "grad_norm": 0.1734370061342181, + "learning_rate": 6.919308342521308e-07, + "loss": 0.2764, + "step": 2291 + }, + { + "epoch": 1.1531190926275992, + "grad_norm": 0.18865399840145108, + "learning_rate": 6.916828225507443e-07, + "loss": 0.2811, + "step": 2292 + }, + { + "epoch": 1.153623188405797, + "grad_norm": 0.17616068936635845, + "learning_rate": 6.914347555500513e-07, + "loss": 0.2879, + "step": 2293 + }, + { + "epoch": 1.154127284183995, + "grad_norm": 0.17879090160590586, + "learning_rate": 6.911866333216169e-07, + "loss": 0.2717, + "step": 2294 + }, + { + "epoch": 1.1546313799621928, + "grad_norm": 0.18207500480662475, + "learning_rate": 6.909384559370233e-07, + "loss": 0.275, + "step": 2295 + }, + { + "epoch": 1.1551354757403907, + "grad_norm": 0.17522149059459446, + "learning_rate": 6.906902234678678e-07, + "loss": 0.2688, + "step": 2296 + }, + { + "epoch": 1.1556395715185885, + "grad_norm": 0.1938343793990561, + "learning_rate": 6.904419359857641e-07, + "loss": 0.2925, + "step": 2297 + }, + { + "epoch": 1.1561436672967864, + "grad_norm": 0.20218185185326207, + "learning_rate": 6.901935935623415e-07, + "loss": 0.2804, + "step": 2298 + }, + { + "epoch": 1.1566477630749843, + "grad_norm": 0.17255949907153817, + "learning_rate": 6.899451962692454e-07, + "loss": 0.2717, + "step": 2299 + }, + { + "epoch": 1.1571518588531822, + "grad_norm": 0.17432970101357595, + "learning_rate": 6.896967441781368e-07, + "loss": 0.2609, + "step": 2300 + }, + { + "epoch": 1.15765595463138, + "grad_norm": 0.17725867905667875, + "learning_rate": 6.894482373606927e-07, + "loss": 0.2877, + "step": 2301 + }, + { + "epoch": 1.158160050409578, + "grad_norm": 0.1817120717141377, + "learning_rate": 6.891996758886058e-07, + "loss": 0.2776, + "step": 2302 + }, + { + "epoch": 1.1586641461877756, + "grad_norm": 0.17327028373633524, + "learning_rate": 6.889510598335843e-07, + "loss": 0.2491, + "step": 2303 + }, + { + "epoch": 1.1591682419659735, + "grad_norm": 0.17396621817828842, + "learning_rate": 6.887023892673525e-07, + "loss": 0.2872, + "step": 2304 + }, + { + "epoch": 1.1596723377441713, + "grad_norm": 0.17083195902984658, + "learning_rate": 6.884536642616504e-07, + "loss": 0.2714, + "step": 2305 + }, + { + "epoch": 1.1601764335223692, + "grad_norm": 0.17991670145657968, + "learning_rate": 6.882048848882335e-07, + "loss": 0.2973, + "step": 2306 + }, + { + "epoch": 1.160680529300567, + "grad_norm": 0.17544125470825095, + "learning_rate": 6.879560512188733e-07, + "loss": 0.2767, + "step": 2307 + }, + { + "epoch": 1.161184625078765, + "grad_norm": 0.18423859716455426, + "learning_rate": 6.877071633253566e-07, + "loss": 0.2763, + "step": 2308 + }, + { + "epoch": 1.1616887208569628, + "grad_norm": 0.18681981813114995, + "learning_rate": 6.874582212794861e-07, + "loss": 0.2789, + "step": 2309 + }, + { + "epoch": 1.1621928166351607, + "grad_norm": 0.17640590031489775, + "learning_rate": 6.872092251530799e-07, + "loss": 0.288, + "step": 2310 + }, + { + "epoch": 1.1626969124133586, + "grad_norm": 0.17113861686922094, + "learning_rate": 6.869601750179721e-07, + "loss": 0.2721, + "step": 2311 + }, + { + "epoch": 1.1632010081915565, + "grad_norm": 0.1713692791805304, + "learning_rate": 6.867110709460118e-07, + "loss": 0.2767, + "step": 2312 + }, + { + "epoch": 1.1637051039697544, + "grad_norm": 0.1729727527955606, + "learning_rate": 6.864619130090642e-07, + "loss": 0.298, + "step": 2313 + }, + { + "epoch": 1.164209199747952, + "grad_norm": 0.18317780598435884, + "learning_rate": 6.862127012790098e-07, + "loss": 0.271, + "step": 2314 + }, + { + "epoch": 1.1647132955261499, + "grad_norm": 0.16947854182681354, + "learning_rate": 6.859634358277445e-07, + "loss": 0.2731, + "step": 2315 + }, + { + "epoch": 1.1652173913043478, + "grad_norm": 0.1764313134129964, + "learning_rate": 6.8571411672718e-07, + "loss": 0.2898, + "step": 2316 + }, + { + "epoch": 1.1657214870825456, + "grad_norm": 0.18128405805703027, + "learning_rate": 6.854647440492434e-07, + "loss": 0.2713, + "step": 2317 + }, + { + "epoch": 1.1662255828607435, + "grad_norm": 0.17302921586300837, + "learning_rate": 6.852153178658768e-07, + "loss": 0.2791, + "step": 2318 + }, + { + "epoch": 1.1667296786389414, + "grad_norm": 0.18238569158548426, + "learning_rate": 6.849658382490386e-07, + "loss": 0.2637, + "step": 2319 + }, + { + "epoch": 1.1672337744171393, + "grad_norm": 0.20560524437351874, + "learning_rate": 6.847163052707017e-07, + "loss": 0.2886, + "step": 2320 + }, + { + "epoch": 1.1677378701953371, + "grad_norm": 0.17279730398402016, + "learning_rate": 6.84466719002855e-07, + "loss": 0.276, + "step": 2321 + }, + { + "epoch": 1.168241965973535, + "grad_norm": 0.1748627881177702, + "learning_rate": 6.842170795175025e-07, + "loss": 0.2777, + "step": 2322 + }, + { + "epoch": 1.168746061751733, + "grad_norm": 0.20772510184287282, + "learning_rate": 6.839673868866639e-07, + "loss": 0.2745, + "step": 2323 + }, + { + "epoch": 1.1692501575299308, + "grad_norm": 0.1703532830956251, + "learning_rate": 6.837176411823738e-07, + "loss": 0.2631, + "step": 2324 + }, + { + "epoch": 1.1697542533081284, + "grad_norm": 0.1793111095271819, + "learning_rate": 6.834678424766822e-07, + "loss": 0.2818, + "step": 2325 + }, + { + "epoch": 1.1702583490863263, + "grad_norm": 0.17008601463744455, + "learning_rate": 6.832179908416546e-07, + "loss": 0.2637, + "step": 2326 + }, + { + "epoch": 1.1707624448645242, + "grad_norm": 0.16973624705065182, + "learning_rate": 6.829680863493717e-07, + "loss": 0.3012, + "step": 2327 + }, + { + "epoch": 1.171266540642722, + "grad_norm": 0.18216274445798897, + "learning_rate": 6.82718129071929e-07, + "loss": 0.2887, + "step": 2328 + }, + { + "epoch": 1.17177063642092, + "grad_norm": 0.18182861892057042, + "learning_rate": 6.824681190814383e-07, + "loss": 0.2825, + "step": 2329 + }, + { + "epoch": 1.1722747321991178, + "grad_norm": 0.18263373935726934, + "learning_rate": 6.822180564500254e-07, + "loss": 0.294, + "step": 2330 + }, + { + "epoch": 1.1727788279773157, + "grad_norm": 0.17351672150821956, + "learning_rate": 6.81967941249832e-07, + "loss": 0.2799, + "step": 2331 + }, + { + "epoch": 1.1732829237555136, + "grad_norm": 0.17364729038219348, + "learning_rate": 6.817177735530149e-07, + "loss": 0.2664, + "step": 2332 + }, + { + "epoch": 1.1737870195337115, + "grad_norm": 0.1808124777048507, + "learning_rate": 6.814675534317457e-07, + "loss": 0.2714, + "step": 2333 + }, + { + "epoch": 1.1742911153119093, + "grad_norm": 0.180877239519155, + "learning_rate": 6.812172809582114e-07, + "loss": 0.2828, + "step": 2334 + }, + { + "epoch": 1.1747952110901072, + "grad_norm": 0.17380368919234485, + "learning_rate": 6.809669562046142e-07, + "loss": 0.2737, + "step": 2335 + }, + { + "epoch": 1.1752993068683049, + "grad_norm": 0.19015792157284472, + "learning_rate": 6.807165792431712e-07, + "loss": 0.2909, + "step": 2336 + }, + { + "epoch": 1.175803402646503, + "grad_norm": 0.1739820937338746, + "learning_rate": 6.804661501461146e-07, + "loss": 0.2797, + "step": 2337 + }, + { + "epoch": 1.1763074984247006, + "grad_norm": 0.17829150310208366, + "learning_rate": 6.802156689856914e-07, + "loss": 0.272, + "step": 2338 + }, + { + "epoch": 1.1768115942028985, + "grad_norm": 0.17618058591368718, + "learning_rate": 6.799651358341644e-07, + "loss": 0.259, + "step": 2339 + }, + { + "epoch": 1.1773156899810964, + "grad_norm": 0.16972502373995604, + "learning_rate": 6.797145507638103e-07, + "loss": 0.2758, + "step": 2340 + }, + { + "epoch": 1.1778197857592942, + "grad_norm": 0.17376894673770923, + "learning_rate": 6.794639138469215e-07, + "loss": 0.2598, + "step": 2341 + }, + { + "epoch": 1.1783238815374921, + "grad_norm": 0.16890243175150166, + "learning_rate": 6.792132251558057e-07, + "loss": 0.2679, + "step": 2342 + }, + { + "epoch": 1.17882797731569, + "grad_norm": 0.16892397355410096, + "learning_rate": 6.789624847627842e-07, + "loss": 0.2713, + "step": 2343 + }, + { + "epoch": 1.1793320730938879, + "grad_norm": 0.18081668386093178, + "learning_rate": 6.787116927401947e-07, + "loss": 0.2839, + "step": 2344 + }, + { + "epoch": 1.1798361688720858, + "grad_norm": 0.1785462609641959, + "learning_rate": 6.784608491603887e-07, + "loss": 0.2767, + "step": 2345 + }, + { + "epoch": 1.1803402646502836, + "grad_norm": 0.17328874161653496, + "learning_rate": 6.782099540957334e-07, + "loss": 0.2792, + "step": 2346 + }, + { + "epoch": 1.1808443604284813, + "grad_norm": 0.17812431021686165, + "learning_rate": 6.779590076186103e-07, + "loss": 0.2756, + "step": 2347 + }, + { + "epoch": 1.1813484562066794, + "grad_norm": 0.18026369211177573, + "learning_rate": 6.777080098014157e-07, + "loss": 0.2829, + "step": 2348 + }, + { + "epoch": 1.181852551984877, + "grad_norm": 0.17224760523581167, + "learning_rate": 6.774569607165612e-07, + "loss": 0.2715, + "step": 2349 + }, + { + "epoch": 1.182356647763075, + "grad_norm": 0.17446195179033158, + "learning_rate": 6.772058604364728e-07, + "loss": 0.2816, + "step": 2350 + }, + { + "epoch": 1.1828607435412728, + "grad_norm": 0.18401389185741523, + "learning_rate": 6.769547090335915e-07, + "loss": 0.2756, + "step": 2351 + }, + { + "epoch": 1.1833648393194707, + "grad_norm": 0.17710337017025912, + "learning_rate": 6.767035065803728e-07, + "loss": 0.2662, + "step": 2352 + }, + { + "epoch": 1.1838689350976686, + "grad_norm": 0.17209284528655844, + "learning_rate": 6.76452253149287e-07, + "loss": 0.2643, + "step": 2353 + }, + { + "epoch": 1.1843730308758664, + "grad_norm": 0.19313801740333889, + "learning_rate": 6.762009488128193e-07, + "loss": 0.2859, + "step": 2354 + }, + { + "epoch": 1.1848771266540643, + "grad_norm": 0.185715764727419, + "learning_rate": 6.759495936434694e-07, + "loss": 0.2778, + "step": 2355 + }, + { + "epoch": 1.1853812224322622, + "grad_norm": 0.18586478407699625, + "learning_rate": 6.756981877137515e-07, + "loss": 0.2781, + "step": 2356 + }, + { + "epoch": 1.18588531821046, + "grad_norm": 0.18286265156559076, + "learning_rate": 6.754467310961951e-07, + "loss": 0.2941, + "step": 2357 + }, + { + "epoch": 1.186389413988658, + "grad_norm": 0.1827838570028727, + "learning_rate": 6.751952238633435e-07, + "loss": 0.2799, + "step": 2358 + }, + { + "epoch": 1.1868935097668558, + "grad_norm": 0.17564052156297236, + "learning_rate": 6.74943666087755e-07, + "loss": 0.2859, + "step": 2359 + }, + { + "epoch": 1.1873976055450535, + "grad_norm": 0.17276098802425446, + "learning_rate": 6.746920578420027e-07, + "loss": 0.273, + "step": 2360 + }, + { + "epoch": 1.1879017013232513, + "grad_norm": 0.16583786340074003, + "learning_rate": 6.744403991986737e-07, + "loss": 0.2751, + "step": 2361 + }, + { + "epoch": 1.1884057971014492, + "grad_norm": 0.1758298830227382, + "learning_rate": 6.741886902303703e-07, + "loss": 0.2817, + "step": 2362 + }, + { + "epoch": 1.188909892879647, + "grad_norm": 0.1786122710003466, + "learning_rate": 6.739369310097087e-07, + "loss": 0.2746, + "step": 2363 + }, + { + "epoch": 1.189413988657845, + "grad_norm": 0.1725746663086315, + "learning_rate": 6.7368512160932e-07, + "loss": 0.2697, + "step": 2364 + }, + { + "epoch": 1.1899180844360429, + "grad_norm": 0.17766797524511566, + "learning_rate": 6.734332621018497e-07, + "loss": 0.2731, + "step": 2365 + }, + { + "epoch": 1.1904221802142407, + "grad_norm": 0.1773919783826711, + "learning_rate": 6.731813525599576e-07, + "loss": 0.2811, + "step": 2366 + }, + { + "epoch": 1.1909262759924386, + "grad_norm": 0.18244881573424246, + "learning_rate": 6.72929393056318e-07, + "loss": 0.2733, + "step": 2367 + }, + { + "epoch": 1.1914303717706365, + "grad_norm": 0.17285562925317308, + "learning_rate": 6.7267738366362e-07, + "loss": 0.2691, + "step": 2368 + }, + { + "epoch": 1.1919344675488344, + "grad_norm": 0.18991372620629085, + "learning_rate": 6.724253244545663e-07, + "loss": 0.2787, + "step": 2369 + }, + { + "epoch": 1.1924385633270322, + "grad_norm": 0.17368277958423853, + "learning_rate": 6.721732155018747e-07, + "loss": 0.2746, + "step": 2370 + }, + { + "epoch": 1.19294265910523, + "grad_norm": 0.17143055764086812, + "learning_rate": 6.719210568782768e-07, + "loss": 0.2774, + "step": 2371 + }, + { + "epoch": 1.1934467548834278, + "grad_norm": 0.1723446011688051, + "learning_rate": 6.716688486565192e-07, + "loss": 0.2615, + "step": 2372 + }, + { + "epoch": 1.1939508506616257, + "grad_norm": 0.17415883235396237, + "learning_rate": 6.714165909093621e-07, + "loss": 0.2904, + "step": 2373 + }, + { + "epoch": 1.1944549464398235, + "grad_norm": 0.17281066848566462, + "learning_rate": 6.711642837095804e-07, + "loss": 0.2629, + "step": 2374 + }, + { + "epoch": 1.1949590422180214, + "grad_norm": 0.17706321838125264, + "learning_rate": 6.709119271299631e-07, + "loss": 0.2791, + "step": 2375 + }, + { + "epoch": 1.1954631379962193, + "grad_norm": 0.17409373969496145, + "learning_rate": 6.706595212433137e-07, + "loss": 0.286, + "step": 2376 + }, + { + "epoch": 1.1959672337744172, + "grad_norm": 0.17363748305455576, + "learning_rate": 6.704070661224496e-07, + "loss": 0.2809, + "step": 2377 + }, + { + "epoch": 1.196471329552615, + "grad_norm": 0.17224099285294484, + "learning_rate": 6.701545618402025e-07, + "loss": 0.2696, + "step": 2378 + }, + { + "epoch": 1.196975425330813, + "grad_norm": 0.17472439778273308, + "learning_rate": 6.699020084694183e-07, + "loss": 0.2719, + "step": 2379 + }, + { + "epoch": 1.1974795211090108, + "grad_norm": 0.17609087999551598, + "learning_rate": 6.69649406082957e-07, + "loss": 0.2687, + "step": 2380 + }, + { + "epoch": 1.1979836168872087, + "grad_norm": 0.17688131759879658, + "learning_rate": 6.693967547536932e-07, + "loss": 0.2839, + "step": 2381 + }, + { + "epoch": 1.1984877126654063, + "grad_norm": 0.1801110500802134, + "learning_rate": 6.69144054554515e-07, + "loss": 0.2898, + "step": 2382 + }, + { + "epoch": 1.1989918084436042, + "grad_norm": 0.17434074905342395, + "learning_rate": 6.688913055583247e-07, + "loss": 0.2673, + "step": 2383 + }, + { + "epoch": 1.199495904221802, + "grad_norm": 0.18206024523654654, + "learning_rate": 6.686385078380392e-07, + "loss": 0.2817, + "step": 2384 + }, + { + "epoch": 1.2, + "grad_norm": 0.1875715751646641, + "learning_rate": 6.683856614665887e-07, + "loss": 0.2694, + "step": 2385 + }, + { + "epoch": 1.2005040957781978, + "grad_norm": 0.17720280812976338, + "learning_rate": 6.68132766516918e-07, + "loss": 0.293, + "step": 2386 + }, + { + "epoch": 1.2010081915563957, + "grad_norm": 0.1827446975631408, + "learning_rate": 6.678798230619856e-07, + "loss": 0.2607, + "step": 2387 + }, + { + "epoch": 1.2015122873345936, + "grad_norm": 0.18309536687811928, + "learning_rate": 6.676268311747644e-07, + "loss": 0.2776, + "step": 2388 + }, + { + "epoch": 1.2015122873345936, + "eval_loss": 0.30996033549308777, + "eval_runtime": 18.406, + "eval_samples_per_second": 46.452, + "eval_steps_per_second": 0.978, + "step": 2388 + }, + { + "epoch": 1.2020163831127915, + "grad_norm": 0.17691733319401884, + "learning_rate": 6.673737909282406e-07, + "loss": 0.2905, + "step": 2389 + }, + { + "epoch": 1.2025204788909893, + "grad_norm": 0.17048139269250298, + "learning_rate": 6.671207023954151e-07, + "loss": 0.2683, + "step": 2390 + }, + { + "epoch": 1.2030245746691872, + "grad_norm": 0.18100250219396277, + "learning_rate": 6.66867565649302e-07, + "loss": 0.2676, + "step": 2391 + }, + { + "epoch": 1.203528670447385, + "grad_norm": 0.1723576811925902, + "learning_rate": 6.666143807629302e-07, + "loss": 0.2808, + "step": 2392 + }, + { + "epoch": 1.2040327662255828, + "grad_norm": 0.17272371476581297, + "learning_rate": 6.663611478093415e-07, + "loss": 0.2668, + "step": 2393 + }, + { + "epoch": 1.2045368620037806, + "grad_norm": 0.1852834091042434, + "learning_rate": 6.661078668615922e-07, + "loss": 0.276, + "step": 2394 + }, + { + "epoch": 1.2050409577819785, + "grad_norm": 0.17253248810106345, + "learning_rate": 6.658545379927523e-07, + "loss": 0.2737, + "step": 2395 + }, + { + "epoch": 1.2055450535601764, + "grad_norm": 0.17189335242974707, + "learning_rate": 6.656011612759056e-07, + "loss": 0.2671, + "step": 2396 + }, + { + "epoch": 1.2060491493383743, + "grad_norm": 0.17237484748444734, + "learning_rate": 6.653477367841497e-07, + "loss": 0.2702, + "step": 2397 + }, + { + "epoch": 1.2065532451165721, + "grad_norm": 0.17030567087724324, + "learning_rate": 6.65094264590596e-07, + "loss": 0.2688, + "step": 2398 + }, + { + "epoch": 1.20705734089477, + "grad_norm": 0.17311851732438963, + "learning_rate": 6.648407447683698e-07, + "loss": 0.2945, + "step": 2399 + }, + { + "epoch": 1.207561436672968, + "grad_norm": 0.1868284113694255, + "learning_rate": 6.645871773906098e-07, + "loss": 0.2685, + "step": 2400 + }, + { + "epoch": 1.2080655324511658, + "grad_norm": 0.1797570381187558, + "learning_rate": 6.643335625304687e-07, + "loss": 0.2778, + "step": 2401 + }, + { + "epoch": 1.2085696282293636, + "grad_norm": 0.1736947065045101, + "learning_rate": 6.640799002611127e-07, + "loss": 0.2701, + "step": 2402 + }, + { + "epoch": 1.2090737240075615, + "grad_norm": 0.16988523916524562, + "learning_rate": 6.638261906557219e-07, + "loss": 0.2716, + "step": 2403 + }, + { + "epoch": 1.2095778197857592, + "grad_norm": 0.17637575120619103, + "learning_rate": 6.635724337874902e-07, + "loss": 0.2792, + "step": 2404 + }, + { + "epoch": 1.210081915563957, + "grad_norm": 0.18457061836282274, + "learning_rate": 6.633186297296244e-07, + "loss": 0.2726, + "step": 2405 + }, + { + "epoch": 1.210586011342155, + "grad_norm": 0.1835538319807229, + "learning_rate": 6.630647785553456e-07, + "loss": 0.2778, + "step": 2406 + }, + { + "epoch": 1.2110901071203528, + "grad_norm": 0.17108814635241743, + "learning_rate": 6.628108803378884e-07, + "loss": 0.2596, + "step": 2407 + }, + { + "epoch": 1.2115942028985507, + "grad_norm": 0.17109777831078982, + "learning_rate": 6.625569351505008e-07, + "loss": 0.2799, + "step": 2408 + }, + { + "epoch": 1.2120982986767486, + "grad_norm": 0.17949433198481227, + "learning_rate": 6.623029430664444e-07, + "loss": 0.2717, + "step": 2409 + }, + { + "epoch": 1.2126023944549464, + "grad_norm": 0.18703173955181984, + "learning_rate": 6.620489041589942e-07, + "loss": 0.2734, + "step": 2410 + }, + { + "epoch": 1.2131064902331443, + "grad_norm": 0.16778996136487268, + "learning_rate": 6.617948185014392e-07, + "loss": 0.2673, + "step": 2411 + }, + { + "epoch": 1.2136105860113422, + "grad_norm": 0.17151013082627964, + "learning_rate": 6.615406861670811e-07, + "loss": 0.2848, + "step": 2412 + }, + { + "epoch": 1.21411468178954, + "grad_norm": 0.17205458561571332, + "learning_rate": 6.612865072292359e-07, + "loss": 0.2788, + "step": 2413 + }, + { + "epoch": 1.214618777567738, + "grad_norm": 0.1697154843584016, + "learning_rate": 6.610322817612326e-07, + "loss": 0.2782, + "step": 2414 + }, + { + "epoch": 1.2151228733459356, + "grad_norm": 0.17155395420115768, + "learning_rate": 6.607780098364133e-07, + "loss": 0.2757, + "step": 2415 + }, + { + "epoch": 1.2156269691241337, + "grad_norm": 0.17914770936359822, + "learning_rate": 6.605236915281343e-07, + "loss": 0.2625, + "step": 2416 + }, + { + "epoch": 1.2161310649023314, + "grad_norm": 0.17886990627397134, + "learning_rate": 6.602693269097646e-07, + "loss": 0.2826, + "step": 2417 + }, + { + "epoch": 1.2166351606805292, + "grad_norm": 0.17137302898724377, + "learning_rate": 6.600149160546868e-07, + "loss": 0.2654, + "step": 2418 + }, + { + "epoch": 1.2171392564587271, + "grad_norm": 0.16812006244582586, + "learning_rate": 6.597604590362972e-07, + "loss": 0.2693, + "step": 2419 + }, + { + "epoch": 1.217643352236925, + "grad_norm": 0.1967968966854081, + "learning_rate": 6.595059559280047e-07, + "loss": 0.2851, + "step": 2420 + }, + { + "epoch": 1.2181474480151229, + "grad_norm": 0.18024230166349509, + "learning_rate": 6.592514068032321e-07, + "loss": 0.2802, + "step": 2421 + }, + { + "epoch": 1.2186515437933207, + "grad_norm": 0.17680540350469492, + "learning_rate": 6.58996811735415e-07, + "loss": 0.267, + "step": 2422 + }, + { + "epoch": 1.2191556395715186, + "grad_norm": 0.17123047011311054, + "learning_rate": 6.587421707980027e-07, + "loss": 0.2691, + "step": 2423 + }, + { + "epoch": 1.2196597353497165, + "grad_norm": 0.1817858622939275, + "learning_rate": 6.584874840644575e-07, + "loss": 0.2695, + "step": 2424 + }, + { + "epoch": 1.2201638311279144, + "grad_norm": 0.17367383253762286, + "learning_rate": 6.582327516082549e-07, + "loss": 0.2836, + "step": 2425 + }, + { + "epoch": 1.2206679269061123, + "grad_norm": 0.17880252652432166, + "learning_rate": 6.579779735028836e-07, + "loss": 0.2733, + "step": 2426 + }, + { + "epoch": 1.2211720226843101, + "grad_norm": 0.18216013842602857, + "learning_rate": 6.577231498218457e-07, + "loss": 0.2563, + "step": 2427 + }, + { + "epoch": 1.2216761184625078, + "grad_norm": 0.17923344984805104, + "learning_rate": 6.574682806386559e-07, + "loss": 0.2727, + "step": 2428 + }, + { + "epoch": 1.2221802142407057, + "grad_norm": 0.17305807088265074, + "learning_rate": 6.572133660268428e-07, + "loss": 0.2786, + "step": 2429 + }, + { + "epoch": 1.2226843100189035, + "grad_norm": 0.1826130969086575, + "learning_rate": 6.569584060599475e-07, + "loss": 0.2639, + "step": 2430 + }, + { + "epoch": 1.2231884057971014, + "grad_norm": 0.17180169078282023, + "learning_rate": 6.567034008115242e-07, + "loss": 0.2567, + "step": 2431 + }, + { + "epoch": 1.2236925015752993, + "grad_norm": 0.17999665867139894, + "learning_rate": 6.564483503551406e-07, + "loss": 0.2868, + "step": 2432 + }, + { + "epoch": 1.2241965973534972, + "grad_norm": 0.1734485518145704, + "learning_rate": 6.56193254764377e-07, + "loss": 0.2842, + "step": 2433 + }, + { + "epoch": 1.224700693131695, + "grad_norm": 0.19524933134868758, + "learning_rate": 6.55938114112827e-07, + "loss": 0.2805, + "step": 2434 + }, + { + "epoch": 1.225204788909893, + "grad_norm": 0.3859143312648577, + "learning_rate": 6.556829284740972e-07, + "loss": 0.2793, + "step": 2435 + }, + { + "epoch": 1.2257088846880908, + "grad_norm": 0.16949921412202032, + "learning_rate": 6.554276979218069e-07, + "loss": 0.2633, + "step": 2436 + }, + { + "epoch": 1.2262129804662887, + "grad_norm": 0.17509147431622993, + "learning_rate": 6.551724225295885e-07, + "loss": 0.2585, + "step": 2437 + }, + { + "epoch": 1.2267170762444866, + "grad_norm": 0.18102430967796426, + "learning_rate": 6.549171023710874e-07, + "loss": 0.2568, + "step": 2438 + }, + { + "epoch": 1.2272211720226842, + "grad_norm": 0.17595819751957628, + "learning_rate": 6.54661737519962e-07, + "loss": 0.2678, + "step": 2439 + }, + { + "epoch": 1.227725267800882, + "grad_norm": 0.18034785174616086, + "learning_rate": 6.544063280498834e-07, + "loss": 0.286, + "step": 2440 + }, + { + "epoch": 1.22822936357908, + "grad_norm": 0.1751845129218413, + "learning_rate": 6.541508740345357e-07, + "loss": 0.2704, + "step": 2441 + }, + { + "epoch": 1.2287334593572778, + "grad_norm": 0.1707508154159352, + "learning_rate": 6.538953755476155e-07, + "loss": 0.2657, + "step": 2442 + }, + { + "epoch": 1.2292375551354757, + "grad_norm": 0.17571528214389306, + "learning_rate": 6.53639832662833e-07, + "loss": 0.2737, + "step": 2443 + }, + { + "epoch": 1.2297416509136736, + "grad_norm": 0.1767996956782402, + "learning_rate": 6.533842454539105e-07, + "loss": 0.2692, + "step": 2444 + }, + { + "epoch": 1.2302457466918715, + "grad_norm": 0.17677555995052904, + "learning_rate": 6.531286139945834e-07, + "loss": 0.2747, + "step": 2445 + }, + { + "epoch": 1.2307498424700694, + "grad_norm": 0.17418823992379287, + "learning_rate": 6.528729383585997e-07, + "loss": 0.2687, + "step": 2446 + }, + { + "epoch": 1.2312539382482672, + "grad_norm": 0.1981211293612195, + "learning_rate": 6.526172186197203e-07, + "loss": 0.2699, + "step": 2447 + }, + { + "epoch": 1.231758034026465, + "grad_norm": 0.1770440172204685, + "learning_rate": 6.523614548517187e-07, + "loss": 0.2758, + "step": 2448 + }, + { + "epoch": 1.232262129804663, + "grad_norm": 0.1733214708512733, + "learning_rate": 6.521056471283811e-07, + "loss": 0.2761, + "step": 2449 + }, + { + "epoch": 1.2327662255828606, + "grad_norm": 0.2036211297547966, + "learning_rate": 6.518497955235068e-07, + "loss": 0.268, + "step": 2450 + }, + { + "epoch": 1.2332703213610585, + "grad_norm": 0.17325758117743048, + "learning_rate": 6.515939001109069e-07, + "loss": 0.2748, + "step": 2451 + }, + { + "epoch": 1.2337744171392564, + "grad_norm": 0.17462788382443756, + "learning_rate": 6.513379609644062e-07, + "loss": 0.2782, + "step": 2452 + }, + { + "epoch": 1.2342785129174543, + "grad_norm": 0.1711968410986884, + "learning_rate": 6.51081978157841e-07, + "loss": 0.2707, + "step": 2453 + }, + { + "epoch": 1.2347826086956522, + "grad_norm": 0.1683105761384845, + "learning_rate": 6.50825951765061e-07, + "loss": 0.2773, + "step": 2454 + }, + { + "epoch": 1.23528670447385, + "grad_norm": 0.17554589173116267, + "learning_rate": 6.505698818599284e-07, + "loss": 0.2676, + "step": 2455 + }, + { + "epoch": 1.235790800252048, + "grad_norm": 0.1759694214712948, + "learning_rate": 6.503137685163173e-07, + "loss": 0.2749, + "step": 2456 + }, + { + "epoch": 1.2362948960302458, + "grad_norm": 0.17957710933654297, + "learning_rate": 6.500576118081155e-07, + "loss": 0.2762, + "step": 2457 + }, + { + "epoch": 1.2367989918084437, + "grad_norm": 0.19053173662723633, + "learning_rate": 6.49801411809222e-07, + "loss": 0.2803, + "step": 2458 + }, + { + "epoch": 1.2373030875866415, + "grad_norm": 0.17718428947532663, + "learning_rate": 6.495451685935494e-07, + "loss": 0.2729, + "step": 2459 + }, + { + "epoch": 1.2378071833648394, + "grad_norm": 0.17785327429842798, + "learning_rate": 6.492888822350219e-07, + "loss": 0.2661, + "step": 2460 + }, + { + "epoch": 1.238311279143037, + "grad_norm": 0.17300591946217833, + "learning_rate": 6.490325528075766e-07, + "loss": 0.2867, + "step": 2461 + }, + { + "epoch": 1.238815374921235, + "grad_norm": 0.16921354432191585, + "learning_rate": 6.487761803851631e-07, + "loss": 0.2711, + "step": 2462 + }, + { + "epoch": 1.2393194706994328, + "grad_norm": 0.17810885372654386, + "learning_rate": 6.485197650417431e-07, + "loss": 0.258, + "step": 2463 + }, + { + "epoch": 1.2398235664776307, + "grad_norm": 0.18103805033195008, + "learning_rate": 6.482633068512911e-07, + "loss": 0.2856, + "step": 2464 + }, + { + "epoch": 1.2403276622558286, + "grad_norm": 0.18360526541296773, + "learning_rate": 6.480068058877934e-07, + "loss": 0.2743, + "step": 2465 + }, + { + "epoch": 1.2408317580340265, + "grad_norm": 0.18884668384982423, + "learning_rate": 6.47750262225249e-07, + "loss": 0.2769, + "step": 2466 + }, + { + "epoch": 1.2413358538122243, + "grad_norm": 0.2002321224410455, + "learning_rate": 6.474936759376693e-07, + "loss": 0.2707, + "step": 2467 + }, + { + "epoch": 1.2418399495904222, + "grad_norm": 0.18325425135295045, + "learning_rate": 6.472370470990778e-07, + "loss": 0.2694, + "step": 2468 + }, + { + "epoch": 1.24234404536862, + "grad_norm": 0.1696869877874564, + "learning_rate": 6.469803757835102e-07, + "loss": 0.2747, + "step": 2469 + }, + { + "epoch": 1.242848141146818, + "grad_norm": 0.3486889386014526, + "learning_rate": 6.467236620650147e-07, + "loss": 0.2826, + "step": 2470 + }, + { + "epoch": 1.2433522369250158, + "grad_norm": 0.16956469304659022, + "learning_rate": 6.464669060176516e-07, + "loss": 0.2687, + "step": 2471 + }, + { + "epoch": 1.2438563327032135, + "grad_norm": 0.20746929532665012, + "learning_rate": 6.462101077154935e-07, + "loss": 0.2721, + "step": 2472 + }, + { + "epoch": 1.2443604284814114, + "grad_norm": 0.18257378343955985, + "learning_rate": 6.459532672326249e-07, + "loss": 0.2725, + "step": 2473 + }, + { + "epoch": 1.2448645242596093, + "grad_norm": 0.17681868108144572, + "learning_rate": 6.45696384643143e-07, + "loss": 0.2798, + "step": 2474 + }, + { + "epoch": 1.2453686200378071, + "grad_norm": 0.19498569967155788, + "learning_rate": 6.454394600211565e-07, + "loss": 0.276, + "step": 2475 + }, + { + "epoch": 1.245872715816005, + "grad_norm": 0.19909897682075145, + "learning_rate": 6.45182493440787e-07, + "loss": 0.2883, + "step": 2476 + }, + { + "epoch": 1.2463768115942029, + "grad_norm": 0.17632241811925964, + "learning_rate": 6.449254849761672e-07, + "loss": 0.2688, + "step": 2477 + }, + { + "epoch": 1.2468809073724008, + "grad_norm": 0.1762000666442149, + "learning_rate": 6.446684347014428e-07, + "loss": 0.2741, + "step": 2478 + }, + { + "epoch": 1.2473850031505986, + "grad_norm": 0.1773686838716517, + "learning_rate": 6.444113426907713e-07, + "loss": 0.2899, + "step": 2479 + }, + { + "epoch": 1.2478890989287965, + "grad_norm": 0.23756861929225664, + "learning_rate": 6.44154209018322e-07, + "loss": 0.2687, + "step": 2480 + }, + { + "epoch": 1.2483931947069944, + "grad_norm": 0.16938250561272578, + "learning_rate": 6.438970337582764e-07, + "loss": 0.2817, + "step": 2481 + }, + { + "epoch": 1.2488972904851923, + "grad_norm": 0.18157388578326444, + "learning_rate": 6.436398169848278e-07, + "loss": 0.2718, + "step": 2482 + }, + { + "epoch": 1.24940138626339, + "grad_norm": 0.19502196013884435, + "learning_rate": 6.43382558772182e-07, + "loss": 0.2768, + "step": 2483 + }, + { + "epoch": 1.249905482041588, + "grad_norm": 0.1734733146967059, + "learning_rate": 6.431252591945561e-07, + "loss": 0.2769, + "step": 2484 + }, + { + "epoch": 1.2504095778197857, + "grad_norm": 0.188235466172801, + "learning_rate": 6.428679183261796e-07, + "loss": 0.2705, + "step": 2485 + }, + { + "epoch": 1.2509136735979836, + "grad_norm": 0.19152028195774132, + "learning_rate": 6.426105362412935e-07, + "loss": 0.2732, + "step": 2486 + }, + { + "epoch": 1.2514177693761814, + "grad_norm": 0.1695103765753016, + "learning_rate": 6.423531130141513e-07, + "loss": 0.2753, + "step": 2487 + }, + { + "epoch": 1.2519218651543793, + "grad_norm": 0.17465424303832264, + "learning_rate": 6.420956487190177e-07, + "loss": 0.2744, + "step": 2488 + }, + { + "epoch": 1.2524259609325772, + "grad_norm": 0.17601676662066001, + "learning_rate": 6.418381434301698e-07, + "loss": 0.2639, + "step": 2489 + }, + { + "epoch": 1.252930056710775, + "grad_norm": 0.17576563315273216, + "learning_rate": 6.415805972218962e-07, + "loss": 0.2731, + "step": 2490 + }, + { + "epoch": 1.253434152488973, + "grad_norm": 0.1817582073444604, + "learning_rate": 6.413230101684972e-07, + "loss": 0.2915, + "step": 2491 + }, + { + "epoch": 1.2539382482671708, + "grad_norm": 0.1871857208126174, + "learning_rate": 6.410653823442853e-07, + "loss": 0.2669, + "step": 2492 + }, + { + "epoch": 1.2544423440453687, + "grad_norm": 0.17014233122771064, + "learning_rate": 6.408077138235843e-07, + "loss": 0.274, + "step": 2493 + }, + { + "epoch": 1.2549464398235664, + "grad_norm": 0.1714640417770811, + "learning_rate": 6.405500046807303e-07, + "loss": 0.2686, + "step": 2494 + }, + { + "epoch": 1.2554505356017645, + "grad_norm": 0.19243821454194698, + "learning_rate": 6.402922549900705e-07, + "loss": 0.2891, + "step": 2495 + }, + { + "epoch": 1.255954631379962, + "grad_norm": 0.17463123533718689, + "learning_rate": 6.400344648259644e-07, + "loss": 0.2699, + "step": 2496 + }, + { + "epoch": 1.25645872715816, + "grad_norm": 0.1910291136178679, + "learning_rate": 6.397766342627825e-07, + "loss": 0.2635, + "step": 2497 + }, + { + "epoch": 1.2569628229363579, + "grad_norm": 0.17138494155286757, + "learning_rate": 6.395187633749075e-07, + "loss": 0.2833, + "step": 2498 + }, + { + "epoch": 1.2574669187145557, + "grad_norm": 0.17443608504712982, + "learning_rate": 6.392608522367336e-07, + "loss": 0.2723, + "step": 2499 + }, + { + "epoch": 1.2579710144927536, + "grad_norm": 0.1699382749604669, + "learning_rate": 6.390029009226664e-07, + "loss": 0.2657, + "step": 2500 + }, + { + "epoch": 1.2584751102709515, + "grad_norm": 0.17760734553417154, + "learning_rate": 6.387449095071234e-07, + "loss": 0.2738, + "step": 2501 + }, + { + "epoch": 1.2589792060491494, + "grad_norm": 0.17525575952040232, + "learning_rate": 6.384868780645335e-07, + "loss": 0.2703, + "step": 2502 + }, + { + "epoch": 1.2594833018273472, + "grad_norm": 0.18894681759150508, + "learning_rate": 6.382288066693372e-07, + "loss": 0.2816, + "step": 2503 + }, + { + "epoch": 1.2599873976055451, + "grad_norm": 0.18849104352320012, + "learning_rate": 6.379706953959865e-07, + "loss": 0.2766, + "step": 2504 + }, + { + "epoch": 1.2604914933837428, + "grad_norm": 0.17320664007502315, + "learning_rate": 6.377125443189446e-07, + "loss": 0.2558, + "step": 2505 + }, + { + "epoch": 1.2609955891619409, + "grad_norm": 0.1981593202287322, + "learning_rate": 6.37454353512687e-07, + "loss": 0.2854, + "step": 2506 + }, + { + "epoch": 1.2614996849401385, + "grad_norm": 0.1674126260844481, + "learning_rate": 6.371961230516997e-07, + "loss": 0.2687, + "step": 2507 + }, + { + "epoch": 1.2620037807183364, + "grad_norm": 0.17667010185551682, + "learning_rate": 6.36937853010481e-07, + "loss": 0.2699, + "step": 2508 + }, + { + "epoch": 1.2625078764965343, + "grad_norm": 0.17818670333223607, + "learning_rate": 6.366795434635398e-07, + "loss": 0.2687, + "step": 2509 + }, + { + "epoch": 1.2630119722747322, + "grad_norm": 0.17694309827192736, + "learning_rate": 6.364211944853971e-07, + "loss": 0.2635, + "step": 2510 + }, + { + "epoch": 1.26351606805293, + "grad_norm": 0.17785156941257393, + "learning_rate": 6.361628061505849e-07, + "loss": 0.2831, + "step": 2511 + }, + { + "epoch": 1.264020163831128, + "grad_norm": 0.17826472857233217, + "learning_rate": 6.359043785336467e-07, + "loss": 0.2731, + "step": 2512 + }, + { + "epoch": 1.2645242596093258, + "grad_norm": 0.18356140095576604, + "learning_rate": 6.356459117091369e-07, + "loss": 0.2601, + "step": 2513 + }, + { + "epoch": 1.2650283553875237, + "grad_norm": 0.2119595973428602, + "learning_rate": 6.353874057516222e-07, + "loss": 0.2809, + "step": 2514 + }, + { + "epoch": 1.2655324511657216, + "grad_norm": 0.16952440646698724, + "learning_rate": 6.351288607356793e-07, + "loss": 0.2731, + "step": 2515 + }, + { + "epoch": 1.2660365469439194, + "grad_norm": 0.17460234803900188, + "learning_rate": 6.348702767358974e-07, + "loss": 0.2797, + "step": 2516 + }, + { + "epoch": 1.2665406427221173, + "grad_norm": 0.17160497843474512, + "learning_rate": 6.34611653826876e-07, + "loss": 0.2698, + "step": 2517 + }, + { + "epoch": 1.267044738500315, + "grad_norm": 0.18778464548766757, + "learning_rate": 6.343529920832263e-07, + "loss": 0.2573, + "step": 2518 + }, + { + "epoch": 1.2675488342785128, + "grad_norm": 0.172443493887068, + "learning_rate": 6.340942915795708e-07, + "loss": 0.2748, + "step": 2519 + }, + { + "epoch": 1.2680529300567107, + "grad_norm": 0.17168130332053846, + "learning_rate": 6.338355523905427e-07, + "loss": 0.2625, + "step": 2520 + }, + { + "epoch": 1.2685570258349086, + "grad_norm": 0.17567263307465228, + "learning_rate": 6.335767745907869e-07, + "loss": 0.2859, + "step": 2521 + }, + { + "epoch": 1.2690611216131065, + "grad_norm": 0.20501744023706306, + "learning_rate": 6.33317958254959e-07, + "loss": 0.2773, + "step": 2522 + }, + { + "epoch": 1.2695652173913043, + "grad_norm": 0.1693368131106986, + "learning_rate": 6.33059103457726e-07, + "loss": 0.2653, + "step": 2523 + }, + { + "epoch": 1.2700693131695022, + "grad_norm": 0.18006664625555702, + "learning_rate": 6.32800210273766e-07, + "loss": 0.2645, + "step": 2524 + }, + { + "epoch": 1.2705734089477, + "grad_norm": 0.17456080347662142, + "learning_rate": 6.32541278777768e-07, + "loss": 0.2699, + "step": 2525 + }, + { + "epoch": 1.271077504725898, + "grad_norm": 0.16574198775311022, + "learning_rate": 6.322823090444322e-07, + "loss": 0.2662, + "step": 2526 + }, + { + "epoch": 1.2715816005040959, + "grad_norm": 0.18598160934075628, + "learning_rate": 6.320233011484696e-07, + "loss": 0.2828, + "step": 2527 + }, + { + "epoch": 1.2720856962822937, + "grad_norm": 0.1711774666753222, + "learning_rate": 6.317642551646024e-07, + "loss": 0.2803, + "step": 2528 + }, + { + "epoch": 1.2725897920604914, + "grad_norm": 0.17441523118583016, + "learning_rate": 6.315051711675639e-07, + "loss": 0.2864, + "step": 2529 + }, + { + "epoch": 1.2730938878386895, + "grad_norm": 0.18168000027383474, + "learning_rate": 6.312460492320981e-07, + "loss": 0.2786, + "step": 2530 + }, + { + "epoch": 1.2735979836168871, + "grad_norm": 0.18114157608848858, + "learning_rate": 6.309868894329602e-07, + "loss": 0.2805, + "step": 2531 + }, + { + "epoch": 1.274102079395085, + "grad_norm": 0.18530369106217595, + "learning_rate": 6.30727691844916e-07, + "loss": 0.2804, + "step": 2532 + }, + { + "epoch": 1.274606175173283, + "grad_norm": 0.1927697807229042, + "learning_rate": 6.304684565427427e-07, + "loss": 0.2667, + "step": 2533 + }, + { + "epoch": 1.2751102709514808, + "grad_norm": 0.17177026835602924, + "learning_rate": 6.302091836012278e-07, + "loss": 0.2633, + "step": 2534 + }, + { + "epoch": 1.2756143667296787, + "grad_norm": 0.20642706373970884, + "learning_rate": 6.299498730951699e-07, + "loss": 0.2878, + "step": 2535 + }, + { + "epoch": 1.2761184625078765, + "grad_norm": 0.17594445837282194, + "learning_rate": 6.296905250993787e-07, + "loss": 0.2946, + "step": 2536 + }, + { + "epoch": 1.2766225582860744, + "grad_norm": 0.1762144552346574, + "learning_rate": 6.294311396886745e-07, + "loss": 0.2915, + "step": 2537 + }, + { + "epoch": 1.2771266540642723, + "grad_norm": 0.17045816209341122, + "learning_rate": 6.291717169378881e-07, + "loss": 0.2782, + "step": 2538 + }, + { + "epoch": 1.2776307498424702, + "grad_norm": 0.18383362547518797, + "learning_rate": 6.289122569218615e-07, + "loss": 0.2748, + "step": 2539 + }, + { + "epoch": 1.2781348456206678, + "grad_norm": 0.1807772669969172, + "learning_rate": 6.286527597154475e-07, + "loss": 0.2885, + "step": 2540 + }, + { + "epoch": 1.278638941398866, + "grad_norm": 0.19302775133426964, + "learning_rate": 6.283932253935094e-07, + "loss": 0.2886, + "step": 2541 + }, + { + "epoch": 1.2791430371770636, + "grad_norm": 0.1888729384221993, + "learning_rate": 6.28133654030921e-07, + "loss": 0.2807, + "step": 2542 + }, + { + "epoch": 1.2796471329552614, + "grad_norm": 0.17228549927667094, + "learning_rate": 6.278740457025671e-07, + "loss": 0.2754, + "step": 2543 + }, + { + "epoch": 1.2801512287334593, + "grad_norm": 0.1729966859664312, + "learning_rate": 6.276144004833432e-07, + "loss": 0.2828, + "step": 2544 + }, + { + "epoch": 1.2806553245116572, + "grad_norm": 0.17500523408983926, + "learning_rate": 6.273547184481554e-07, + "loss": 0.2679, + "step": 2545 + }, + { + "epoch": 1.281159420289855, + "grad_norm": 0.17974221115337133, + "learning_rate": 6.270949996719202e-07, + "loss": 0.28, + "step": 2546 + }, + { + "epoch": 1.281663516068053, + "grad_norm": 0.18105222606013233, + "learning_rate": 6.268352442295648e-07, + "loss": 0.2794, + "step": 2547 + }, + { + "epoch": 1.2821676118462508, + "grad_norm": 0.18180911542908265, + "learning_rate": 6.265754521960272e-07, + "loss": 0.2815, + "step": 2548 + }, + { + "epoch": 1.2826717076244487, + "grad_norm": 0.19054982521679337, + "learning_rate": 6.263156236462557e-07, + "loss": 0.2603, + "step": 2549 + }, + { + "epoch": 1.2831758034026466, + "grad_norm": 0.17388626076521252, + "learning_rate": 6.260557586552094e-07, + "loss": 0.2787, + "step": 2550 + }, + { + "epoch": 1.2836798991808442, + "grad_norm": 0.18270654576376585, + "learning_rate": 6.257958572978573e-07, + "loss": 0.2664, + "step": 2551 + }, + { + "epoch": 1.2841839949590423, + "grad_norm": 0.1727969110053315, + "learning_rate": 6.255359196491799e-07, + "loss": 0.2669, + "step": 2552 + }, + { + "epoch": 1.28468809073724, + "grad_norm": 0.17719931615423382, + "learning_rate": 6.252759457841672e-07, + "loss": 0.2806, + "step": 2553 + }, + { + "epoch": 1.2851921865154379, + "grad_norm": 0.16515633828315085, + "learning_rate": 6.250159357778202e-07, + "loss": 0.2804, + "step": 2554 + }, + { + "epoch": 1.2856962822936358, + "grad_norm": 0.18820483320106177, + "learning_rate": 6.247558897051503e-07, + "loss": 0.2965, + "step": 2555 + }, + { + "epoch": 1.2862003780718336, + "grad_norm": 0.17604374414885252, + "learning_rate": 6.244958076411789e-07, + "loss": 0.2622, + "step": 2556 + }, + { + "epoch": 1.2867044738500315, + "grad_norm": 0.1696182132757248, + "learning_rate": 6.242356896609383e-07, + "loss": 0.2742, + "step": 2557 + }, + { + "epoch": 1.2872085696282294, + "grad_norm": 0.1737771138671047, + "learning_rate": 6.239755358394707e-07, + "loss": 0.2667, + "step": 2558 + }, + { + "epoch": 1.2877126654064273, + "grad_norm": 0.1801598415185936, + "learning_rate": 6.237153462518291e-07, + "loss": 0.2654, + "step": 2559 + }, + { + "epoch": 1.2882167611846251, + "grad_norm": 0.1756137146667004, + "learning_rate": 6.234551209730765e-07, + "loss": 0.2947, + "step": 2560 + }, + { + "epoch": 1.288720856962823, + "grad_norm": 0.17233304314756842, + "learning_rate": 6.231948600782863e-07, + "loss": 0.2776, + "step": 2561 + }, + { + "epoch": 1.2892249527410207, + "grad_norm": 0.18074095930111286, + "learning_rate": 6.229345636425421e-07, + "loss": 0.2778, + "step": 2562 + }, + { + "epoch": 1.2897290485192188, + "grad_norm": 0.17475122322002845, + "learning_rate": 6.226742317409378e-07, + "loss": 0.2907, + "step": 2563 + }, + { + "epoch": 1.2902331442974164, + "grad_norm": 0.17347744444476335, + "learning_rate": 6.224138644485775e-07, + "loss": 0.2737, + "step": 2564 + }, + { + "epoch": 1.2907372400756143, + "grad_norm": 0.17186378287100884, + "learning_rate": 6.221534618405757e-07, + "loss": 0.277, + "step": 2565 + }, + { + "epoch": 1.2912413358538122, + "grad_norm": 0.1705178171148712, + "learning_rate": 6.218930239920568e-07, + "loss": 0.2685, + "step": 2566 + }, + { + "epoch": 1.29174543163201, + "grad_norm": 0.1786888372127578, + "learning_rate": 6.216325509781556e-07, + "loss": 0.2785, + "step": 2567 + }, + { + "epoch": 1.292249527410208, + "grad_norm": 0.17449611257529518, + "learning_rate": 6.213720428740168e-07, + "loss": 0.2653, + "step": 2568 + }, + { + "epoch": 1.2927536231884058, + "grad_norm": 0.168813740260223, + "learning_rate": 6.211114997547956e-07, + "loss": 0.2629, + "step": 2569 + }, + { + "epoch": 1.2932577189666037, + "grad_norm": 0.16790027679595843, + "learning_rate": 6.208509216956572e-07, + "loss": 0.2706, + "step": 2570 + }, + { + "epoch": 1.2937618147448016, + "grad_norm": 0.17099748648486032, + "learning_rate": 6.205903087717761e-07, + "loss": 0.2687, + "step": 2571 + }, + { + "epoch": 1.2942659105229994, + "grad_norm": 0.18162517160448968, + "learning_rate": 6.203296610583382e-07, + "loss": 0.2747, + "step": 2572 + }, + { + "epoch": 1.294770006301197, + "grad_norm": 0.17063028798101706, + "learning_rate": 6.200689786305383e-07, + "loss": 0.2761, + "step": 2573 + }, + { + "epoch": 1.2952741020793952, + "grad_norm": 0.17398933181018333, + "learning_rate": 6.19808261563582e-07, + "loss": 0.283, + "step": 2574 + }, + { + "epoch": 1.2957781978575929, + "grad_norm": 0.16776550305812016, + "learning_rate": 6.195475099326843e-07, + "loss": 0.2617, + "step": 2575 + }, + { + "epoch": 1.2962822936357907, + "grad_norm": 0.1747053664007132, + "learning_rate": 6.192867238130708e-07, + "loss": 0.2774, + "step": 2576 + }, + { + "epoch": 1.2967863894139886, + "grad_norm": 0.18115021944031587, + "learning_rate": 6.190259032799761e-07, + "loss": 0.2751, + "step": 2577 + }, + { + "epoch": 1.2972904851921865, + "grad_norm": 0.1790395768010079, + "learning_rate": 6.187650484086459e-07, + "loss": 0.29, + "step": 2578 + }, + { + "epoch": 1.2977945809703844, + "grad_norm": 0.1719106072500581, + "learning_rate": 6.185041592743348e-07, + "loss": 0.2729, + "step": 2579 + }, + { + "epoch": 1.2982986767485822, + "grad_norm": 0.17444897223126585, + "learning_rate": 6.182432359523079e-07, + "loss": 0.2789, + "step": 2580 + }, + { + "epoch": 1.2988027725267801, + "grad_norm": 0.1766242163102816, + "learning_rate": 6.179822785178398e-07, + "loss": 0.2583, + "step": 2581 + }, + { + "epoch": 1.299306868304978, + "grad_norm": 0.18162411025003278, + "learning_rate": 6.177212870462152e-07, + "loss": 0.2686, + "step": 2582 + }, + { + "epoch": 1.2998109640831759, + "grad_norm": 0.16955702326217229, + "learning_rate": 6.174602616127287e-07, + "loss": 0.2736, + "step": 2583 + }, + { + "epoch": 1.3003150598613735, + "grad_norm": 0.173479162516625, + "learning_rate": 6.171992022926841e-07, + "loss": 0.2738, + "step": 2584 + }, + { + "epoch": 1.3008191556395716, + "grad_norm": 0.17139404620083853, + "learning_rate": 6.16938109161396e-07, + "loss": 0.2757, + "step": 2585 + }, + { + "epoch": 1.3013232514177693, + "grad_norm": 0.17095006127373458, + "learning_rate": 6.166769822941877e-07, + "loss": 0.2748, + "step": 2586 + }, + { + "epoch": 1.3018273471959672, + "grad_norm": 0.17350708973766807, + "learning_rate": 6.164158217663926e-07, + "loss": 0.2729, + "step": 2587 + }, + { + "epoch": 1.3018273471959672, + "eval_loss": 0.3091413378715515, + "eval_runtime": 17.1268, + "eval_samples_per_second": 49.922, + "eval_steps_per_second": 1.051, + "step": 2587 + }, + { + "epoch": 1.302331442974165, + "grad_norm": 0.19869258969365142, + "learning_rate": 6.161546276533542e-07, + "loss": 0.2881, + "step": 2588 + }, + { + "epoch": 1.302835538752363, + "grad_norm": 0.1703478811098866, + "learning_rate": 6.158934000304251e-07, + "loss": 0.2701, + "step": 2589 + }, + { + "epoch": 1.3033396345305608, + "grad_norm": 0.1696150699647383, + "learning_rate": 6.156321389729682e-07, + "loss": 0.2839, + "step": 2590 + }, + { + "epoch": 1.3038437303087587, + "grad_norm": 0.1738118040743564, + "learning_rate": 6.153708445563555e-07, + "loss": 0.2852, + "step": 2591 + }, + { + "epoch": 1.3043478260869565, + "grad_norm": 0.1767377470005941, + "learning_rate": 6.151095168559688e-07, + "loss": 0.2769, + "step": 2592 + }, + { + "epoch": 1.3048519218651544, + "grad_norm": 0.17056286239729945, + "learning_rate": 6.148481559471995e-07, + "loss": 0.2634, + "step": 2593 + }, + { + "epoch": 1.3053560176433523, + "grad_norm": 0.1964952722351761, + "learning_rate": 6.145867619054487e-07, + "loss": 0.2821, + "step": 2594 + }, + { + "epoch": 1.3058601134215502, + "grad_norm": 0.17002975410785548, + "learning_rate": 6.143253348061271e-07, + "loss": 0.2751, + "step": 2595 + }, + { + "epoch": 1.306364209199748, + "grad_norm": 0.17294236211348307, + "learning_rate": 6.140638747246543e-07, + "loss": 0.2777, + "step": 2596 + }, + { + "epoch": 1.3068683049779457, + "grad_norm": 0.18702273486347906, + "learning_rate": 6.138023817364603e-07, + "loss": 0.2817, + "step": 2597 + }, + { + "epoch": 1.3073724007561438, + "grad_norm": 0.18257740617721263, + "learning_rate": 6.135408559169842e-07, + "loss": 0.2869, + "step": 2598 + }, + { + "epoch": 1.3078764965343415, + "grad_norm": 0.1679551881506779, + "learning_rate": 6.132792973416744e-07, + "loss": 0.2671, + "step": 2599 + }, + { + "epoch": 1.3083805923125393, + "grad_norm": 0.1754134210625823, + "learning_rate": 6.130177060859894e-07, + "loss": 0.302, + "step": 2600 + }, + { + "epoch": 1.3088846880907372, + "grad_norm": 0.1772876906689951, + "learning_rate": 6.12756082225396e-07, + "loss": 0.2783, + "step": 2601 + }, + { + "epoch": 1.309388783868935, + "grad_norm": 0.20064464760978862, + "learning_rate": 6.124944258353714e-07, + "loss": 0.2839, + "step": 2602 + }, + { + "epoch": 1.309892879647133, + "grad_norm": 0.18138531276139028, + "learning_rate": 6.122327369914018e-07, + "loss": 0.2928, + "step": 2603 + }, + { + "epoch": 1.3103969754253308, + "grad_norm": 0.17693406432408068, + "learning_rate": 6.119710157689828e-07, + "loss": 0.264, + "step": 2604 + }, + { + "epoch": 1.3109010712035287, + "grad_norm": 0.17240536698687978, + "learning_rate": 6.117092622436194e-07, + "loss": 0.2803, + "step": 2605 + }, + { + "epoch": 1.3114051669817266, + "grad_norm": 0.17830285202833615, + "learning_rate": 6.11447476490826e-07, + "loss": 0.271, + "step": 2606 + }, + { + "epoch": 1.3119092627599245, + "grad_norm": 0.18109922157734024, + "learning_rate": 6.111856585861259e-07, + "loss": 0.2673, + "step": 2607 + }, + { + "epoch": 1.3124133585381221, + "grad_norm": 0.17133961085931276, + "learning_rate": 6.10923808605052e-07, + "loss": 0.2726, + "step": 2608 + }, + { + "epoch": 1.3129174543163202, + "grad_norm": 0.17649723679339357, + "learning_rate": 6.106619266231467e-07, + "loss": 0.2674, + "step": 2609 + }, + { + "epoch": 1.3134215500945179, + "grad_norm": 0.17184213041429233, + "learning_rate": 6.104000127159608e-07, + "loss": 0.279, + "step": 2610 + }, + { + "epoch": 1.3139256458727158, + "grad_norm": 0.17468087267169044, + "learning_rate": 6.101380669590551e-07, + "loss": 0.279, + "step": 2611 + }, + { + "epoch": 1.3144297416509136, + "grad_norm": 0.17337912043291634, + "learning_rate": 6.098760894279995e-07, + "loss": 0.2685, + "step": 2612 + }, + { + "epoch": 1.3149338374291115, + "grad_norm": 0.18366133473396995, + "learning_rate": 6.096140801983727e-07, + "loss": 0.2711, + "step": 2613 + }, + { + "epoch": 1.3154379332073094, + "grad_norm": 0.17222644312128002, + "learning_rate": 6.093520393457627e-07, + "loss": 0.2631, + "step": 2614 + }, + { + "epoch": 1.3159420289855073, + "grad_norm": 0.17738930991704355, + "learning_rate": 6.09089966945767e-07, + "loss": 0.2743, + "step": 2615 + }, + { + "epoch": 1.3164461247637051, + "grad_norm": 0.17185117857715593, + "learning_rate": 6.088278630739915e-07, + "loss": 0.2772, + "step": 2616 + }, + { + "epoch": 1.316950220541903, + "grad_norm": 0.16984493121172142, + "learning_rate": 6.085657278060515e-07, + "loss": 0.2656, + "step": 2617 + }, + { + "epoch": 1.317454316320101, + "grad_norm": 0.17192888657461453, + "learning_rate": 6.083035612175716e-07, + "loss": 0.277, + "step": 2618 + }, + { + "epoch": 1.3179584120982986, + "grad_norm": 0.18354467561304685, + "learning_rate": 6.080413633841853e-07, + "loss": 0.2701, + "step": 2619 + }, + { + "epoch": 1.3184625078764967, + "grad_norm": 0.17503373460612875, + "learning_rate": 6.077791343815349e-07, + "loss": 0.2825, + "step": 2620 + }, + { + "epoch": 1.3189666036546943, + "grad_norm": 0.1706123234531778, + "learning_rate": 6.075168742852718e-07, + "loss": 0.2662, + "step": 2621 + }, + { + "epoch": 1.3194706994328922, + "grad_norm": 0.17412767194593945, + "learning_rate": 6.072545831710567e-07, + "loss": 0.2722, + "step": 2622 + }, + { + "epoch": 1.31997479521109, + "grad_norm": 0.17240484877752552, + "learning_rate": 6.069922611145587e-07, + "loss": 0.2686, + "step": 2623 + }, + { + "epoch": 1.320478890989288, + "grad_norm": 0.16827994239810876, + "learning_rate": 6.06729908191456e-07, + "loss": 0.2641, + "step": 2624 + }, + { + "epoch": 1.3209829867674858, + "grad_norm": 0.17930969669376187, + "learning_rate": 6.064675244774362e-07, + "loss": 0.2852, + "step": 2625 + }, + { + "epoch": 1.3214870825456837, + "grad_norm": 0.18006157470320514, + "learning_rate": 6.062051100481949e-07, + "loss": 0.2758, + "step": 2626 + }, + { + "epoch": 1.3219911783238816, + "grad_norm": 0.1727448964101327, + "learning_rate": 6.059426649794374e-07, + "loss": 0.2714, + "step": 2627 + }, + { + "epoch": 1.3224952741020795, + "grad_norm": 0.17424563952604652, + "learning_rate": 6.056801893468773e-07, + "loss": 0.2787, + "step": 2628 + }, + { + "epoch": 1.3229993698802773, + "grad_norm": 0.1731354768377102, + "learning_rate": 6.054176832262371e-07, + "loss": 0.2776, + "step": 2629 + }, + { + "epoch": 1.323503465658475, + "grad_norm": 0.1804386379598718, + "learning_rate": 6.051551466932485e-07, + "loss": 0.2651, + "step": 2630 + }, + { + "epoch": 1.324007561436673, + "grad_norm": 0.17399836046155562, + "learning_rate": 6.048925798236512e-07, + "loss": 0.2702, + "step": 2631 + }, + { + "epoch": 1.3245116572148707, + "grad_norm": 0.43353712827340424, + "learning_rate": 6.046299826931946e-07, + "loss": 0.2753, + "step": 2632 + }, + { + "epoch": 1.3250157529930686, + "grad_norm": 0.17183489960437298, + "learning_rate": 6.043673553776361e-07, + "loss": 0.2857, + "step": 2633 + }, + { + "epoch": 1.3255198487712665, + "grad_norm": 0.17519525464739322, + "learning_rate": 6.041046979527422e-07, + "loss": 0.2724, + "step": 2634 + }, + { + "epoch": 1.3260239445494644, + "grad_norm": 0.18357488474151853, + "learning_rate": 6.038420104942877e-07, + "loss": 0.2662, + "step": 2635 + }, + { + "epoch": 1.3265280403276623, + "grad_norm": 0.1859694085856373, + "learning_rate": 6.035792930780565e-07, + "loss": 0.2615, + "step": 2636 + }, + { + "epoch": 1.3270321361058601, + "grad_norm": 0.17450754260587856, + "learning_rate": 6.033165457798408e-07, + "loss": 0.2754, + "step": 2637 + }, + { + "epoch": 1.327536231884058, + "grad_norm": 0.1668193821161752, + "learning_rate": 6.030537686754419e-07, + "loss": 0.2698, + "step": 2638 + }, + { + "epoch": 1.3280403276622559, + "grad_norm": 0.1728884240425731, + "learning_rate": 6.027909618406689e-07, + "loss": 0.275, + "step": 2639 + }, + { + "epoch": 1.3285444234404538, + "grad_norm": 0.17398093656070354, + "learning_rate": 6.025281253513404e-07, + "loss": 0.2627, + "step": 2640 + }, + { + "epoch": 1.3290485192186514, + "grad_norm": 0.203938933296385, + "learning_rate": 6.022652592832827e-07, + "loss": 0.2782, + "step": 2641 + }, + { + "epoch": 1.3295526149968495, + "grad_norm": 0.18386883362802814, + "learning_rate": 6.02002363712331e-07, + "loss": 0.2748, + "step": 2642 + }, + { + "epoch": 1.3300567107750472, + "grad_norm": 0.17575912101228022, + "learning_rate": 6.017394387143294e-07, + "loss": 0.2752, + "step": 2643 + }, + { + "epoch": 1.330560806553245, + "grad_norm": 0.18067748484917678, + "learning_rate": 6.0147648436513e-07, + "loss": 0.2815, + "step": 2644 + }, + { + "epoch": 1.331064902331443, + "grad_norm": 0.16917344843892523, + "learning_rate": 6.012135007405936e-07, + "loss": 0.2648, + "step": 2645 + }, + { + "epoch": 1.3315689981096408, + "grad_norm": 0.17412949415096388, + "learning_rate": 6.009504879165891e-07, + "loss": 0.2754, + "step": 2646 + }, + { + "epoch": 1.3320730938878387, + "grad_norm": 0.18047299736884104, + "learning_rate": 6.006874459689942e-07, + "loss": 0.2694, + "step": 2647 + }, + { + "epoch": 1.3325771896660366, + "grad_norm": 0.19127600843343956, + "learning_rate": 6.004243749736947e-07, + "loss": 0.2705, + "step": 2648 + }, + { + "epoch": 1.3330812854442344, + "grad_norm": 0.17608841608371475, + "learning_rate": 6.001612750065853e-07, + "loss": 0.2797, + "step": 2649 + }, + { + "epoch": 1.3335853812224323, + "grad_norm": 0.17931850245102035, + "learning_rate": 5.998981461435685e-07, + "loss": 0.2937, + "step": 2650 + }, + { + "epoch": 1.3340894770006302, + "grad_norm": 0.18450380693233284, + "learning_rate": 5.996349884605552e-07, + "loss": 0.2702, + "step": 2651 + }, + { + "epoch": 1.3345935727788278, + "grad_norm": 0.17087115966825997, + "learning_rate": 5.993718020334652e-07, + "loss": 0.2762, + "step": 2652 + }, + { + "epoch": 1.335097668557026, + "grad_norm": 0.1718079561805658, + "learning_rate": 5.991085869382258e-07, + "loss": 0.2715, + "step": 2653 + }, + { + "epoch": 1.3356017643352236, + "grad_norm": 0.18602810395230512, + "learning_rate": 5.988453432507729e-07, + "loss": 0.2744, + "step": 2654 + }, + { + "epoch": 1.3361058601134215, + "grad_norm": 0.18389684807636075, + "learning_rate": 5.985820710470509e-07, + "loss": 0.2808, + "step": 2655 + }, + { + "epoch": 1.3366099558916194, + "grad_norm": 0.19592914957135987, + "learning_rate": 5.983187704030123e-07, + "loss": 0.2676, + "step": 2656 + }, + { + "epoch": 1.3371140516698172, + "grad_norm": 0.18190367471809632, + "learning_rate": 5.980554413946172e-07, + "loss": 0.2938, + "step": 2657 + }, + { + "epoch": 1.337618147448015, + "grad_norm": 0.1717978516962743, + "learning_rate": 5.977920840978346e-07, + "loss": 0.2865, + "step": 2658 + }, + { + "epoch": 1.338122243226213, + "grad_norm": 0.1860456269002095, + "learning_rate": 5.975286985886418e-07, + "loss": 0.2763, + "step": 2659 + }, + { + "epoch": 1.3386263390044109, + "grad_norm": 0.17913391357600741, + "learning_rate": 5.972652849430235e-07, + "loss": 0.2702, + "step": 2660 + }, + { + "epoch": 1.3391304347826087, + "grad_norm": 0.17254123661223592, + "learning_rate": 5.97001843236973e-07, + "loss": 0.2711, + "step": 2661 + }, + { + "epoch": 1.3396345305608066, + "grad_norm": 0.17083369763214978, + "learning_rate": 5.967383735464916e-07, + "loss": 0.2582, + "step": 2662 + }, + { + "epoch": 1.3401386263390045, + "grad_norm": 0.17895529015768472, + "learning_rate": 5.964748759475887e-07, + "loss": 0.2818, + "step": 2663 + }, + { + "epoch": 1.3406427221172024, + "grad_norm": 0.17751494439480603, + "learning_rate": 5.962113505162818e-07, + "loss": 0.2741, + "step": 2664 + }, + { + "epoch": 1.3411468178954, + "grad_norm": 0.1703756144111887, + "learning_rate": 5.959477973285961e-07, + "loss": 0.2643, + "step": 2665 + }, + { + "epoch": 1.3416509136735981, + "grad_norm": 0.16962499032704564, + "learning_rate": 5.956842164605654e-07, + "loss": 0.2632, + "step": 2666 + }, + { + "epoch": 1.3421550094517958, + "grad_norm": 0.16795341629826513, + "learning_rate": 5.954206079882311e-07, + "loss": 0.274, + "step": 2667 + }, + { + "epoch": 1.3426591052299937, + "grad_norm": 0.17092016847119307, + "learning_rate": 5.951569719876421e-07, + "loss": 0.2778, + "step": 2668 + }, + { + "epoch": 1.3431632010081915, + "grad_norm": 0.1790697008564231, + "learning_rate": 5.948933085348564e-07, + "loss": 0.2677, + "step": 2669 + }, + { + "epoch": 1.3436672967863894, + "grad_norm": 0.16930642394721995, + "learning_rate": 5.94629617705939e-07, + "loss": 0.2744, + "step": 2670 + }, + { + "epoch": 1.3441713925645873, + "grad_norm": 0.17294047568014345, + "learning_rate": 5.943658995769631e-07, + "loss": 0.2542, + "step": 2671 + }, + { + "epoch": 1.3446754883427852, + "grad_norm": 0.1767866393054919, + "learning_rate": 5.941021542240098e-07, + "loss": 0.259, + "step": 2672 + }, + { + "epoch": 1.345179584120983, + "grad_norm": 0.1684270679379651, + "learning_rate": 5.938383817231678e-07, + "loss": 0.2668, + "step": 2673 + }, + { + "epoch": 1.345683679899181, + "grad_norm": 0.17591755928235953, + "learning_rate": 5.93574582150534e-07, + "loss": 0.271, + "step": 2674 + }, + { + "epoch": 1.3461877756773788, + "grad_norm": 0.17125486033048526, + "learning_rate": 5.933107555822131e-07, + "loss": 0.2793, + "step": 2675 + }, + { + "epoch": 1.3466918714555765, + "grad_norm": 0.17748188736278603, + "learning_rate": 5.930469020943171e-07, + "loss": 0.2777, + "step": 2676 + }, + { + "epoch": 1.3471959672337745, + "grad_norm": 0.16672523194129693, + "learning_rate": 5.927830217629661e-07, + "loss": 0.2536, + "step": 2677 + }, + { + "epoch": 1.3477000630119722, + "grad_norm": 0.1713361784166428, + "learning_rate": 5.925191146642883e-07, + "loss": 0.2622, + "step": 2678 + }, + { + "epoch": 1.34820415879017, + "grad_norm": 0.17829104370453977, + "learning_rate": 5.92255180874419e-07, + "loss": 0.2817, + "step": 2679 + }, + { + "epoch": 1.348708254568368, + "grad_norm": 0.18860577855505326, + "learning_rate": 5.919912204695014e-07, + "loss": 0.2799, + "step": 2680 + }, + { + "epoch": 1.3492123503465658, + "grad_norm": 0.1833287597973856, + "learning_rate": 5.917272335256865e-07, + "loss": 0.2858, + "step": 2681 + }, + { + "epoch": 1.3497164461247637, + "grad_norm": 0.19807321067808822, + "learning_rate": 5.914632201191332e-07, + "loss": 0.2787, + "step": 2682 + }, + { + "epoch": 1.3502205419029616, + "grad_norm": 0.16859634450252498, + "learning_rate": 5.911991803260074e-07, + "loss": 0.2764, + "step": 2683 + }, + { + "epoch": 1.3507246376811595, + "grad_norm": 0.190261438179237, + "learning_rate": 5.909351142224829e-07, + "loss": 0.272, + "step": 2684 + }, + { + "epoch": 1.3512287334593573, + "grad_norm": 0.17519831750598353, + "learning_rate": 5.906710218847413e-07, + "loss": 0.2603, + "step": 2685 + }, + { + "epoch": 1.3517328292375552, + "grad_norm": 0.18778966952607162, + "learning_rate": 5.904069033889716e-07, + "loss": 0.261, + "step": 2686 + }, + { + "epoch": 1.3522369250157529, + "grad_norm": 0.17347841958582969, + "learning_rate": 5.901427588113703e-07, + "loss": 0.2885, + "step": 2687 + }, + { + "epoch": 1.352741020793951, + "grad_norm": 0.17645693666916326, + "learning_rate": 5.898785882281415e-07, + "loss": 0.2831, + "step": 2688 + }, + { + "epoch": 1.3532451165721486, + "grad_norm": 0.17182233696335003, + "learning_rate": 5.896143917154967e-07, + "loss": 0.2712, + "step": 2689 + }, + { + "epoch": 1.3537492123503465, + "grad_norm": 0.17603565137881136, + "learning_rate": 5.893501693496553e-07, + "loss": 0.2634, + "step": 2690 + }, + { + "epoch": 1.3542533081285444, + "grad_norm": 0.18263857001668504, + "learning_rate": 5.890859212068433e-07, + "loss": 0.2738, + "step": 2691 + }, + { + "epoch": 1.3547574039067423, + "grad_norm": 0.1730156353984436, + "learning_rate": 5.888216473632949e-07, + "loss": 0.2716, + "step": 2692 + }, + { + "epoch": 1.3552614996849401, + "grad_norm": 0.17823231943674622, + "learning_rate": 5.885573478952515e-07, + "loss": 0.275, + "step": 2693 + }, + { + "epoch": 1.355765595463138, + "grad_norm": 0.17236615330455812, + "learning_rate": 5.882930228789617e-07, + "loss": 0.2674, + "step": 2694 + }, + { + "epoch": 1.356269691241336, + "grad_norm": 0.16824543267551886, + "learning_rate": 5.880286723906817e-07, + "loss": 0.2711, + "step": 2695 + }, + { + "epoch": 1.3567737870195338, + "grad_norm": 0.17367807155908455, + "learning_rate": 5.877642965066753e-07, + "loss": 0.2839, + "step": 2696 + }, + { + "epoch": 1.3572778827977316, + "grad_norm": 0.17538723204136947, + "learning_rate": 5.874998953032128e-07, + "loss": 0.2686, + "step": 2697 + }, + { + "epoch": 1.3577819785759293, + "grad_norm": 0.17766393535437983, + "learning_rate": 5.872354688565727e-07, + "loss": 0.2816, + "step": 2698 + }, + { + "epoch": 1.3582860743541274, + "grad_norm": 0.1699641810797796, + "learning_rate": 5.869710172430401e-07, + "loss": 0.2605, + "step": 2699 + }, + { + "epoch": 1.358790170132325, + "grad_norm": 0.17422760346977517, + "learning_rate": 5.867065405389078e-07, + "loss": 0.2846, + "step": 2700 + }, + { + "epoch": 1.359294265910523, + "grad_norm": 0.18149647107592987, + "learning_rate": 5.864420388204757e-07, + "loss": 0.2749, + "step": 2701 + }, + { + "epoch": 1.3597983616887208, + "grad_norm": 0.1847991842938757, + "learning_rate": 5.86177512164051e-07, + "loss": 0.2695, + "step": 2702 + }, + { + "epoch": 1.3603024574669187, + "grad_norm": 0.16847607424615257, + "learning_rate": 5.859129606459477e-07, + "loss": 0.2702, + "step": 2703 + }, + { + "epoch": 1.3608065532451166, + "grad_norm": 0.17245367094502653, + "learning_rate": 5.856483843424875e-07, + "loss": 0.2667, + "step": 2704 + }, + { + "epoch": 1.3613106490233144, + "grad_norm": 0.16958701012632785, + "learning_rate": 5.853837833299991e-07, + "loss": 0.2748, + "step": 2705 + }, + { + "epoch": 1.3618147448015123, + "grad_norm": 0.17294794292884255, + "learning_rate": 5.85119157684818e-07, + "loss": 0.2742, + "step": 2706 + }, + { + "epoch": 1.3623188405797102, + "grad_norm": 0.16738628688725057, + "learning_rate": 5.848545074832873e-07, + "loss": 0.2549, + "step": 2707 + }, + { + "epoch": 1.362822936357908, + "grad_norm": 0.17652623817244345, + "learning_rate": 5.845898328017566e-07, + "loss": 0.2758, + "step": 2708 + }, + { + "epoch": 1.3633270321361057, + "grad_norm": 0.16927838446248678, + "learning_rate": 5.843251337165834e-07, + "loss": 0.2714, + "step": 2709 + }, + { + "epoch": 1.3638311279143038, + "grad_norm": 0.17423646348688704, + "learning_rate": 5.840604103041313e-07, + "loss": 0.2652, + "step": 2710 + }, + { + "epoch": 1.3643352236925015, + "grad_norm": 0.17007075110833333, + "learning_rate": 5.837956626407717e-07, + "loss": 0.2815, + "step": 2711 + }, + { + "epoch": 1.3648393194706994, + "grad_norm": 0.17563065467297476, + "learning_rate": 5.835308908028826e-07, + "loss": 0.2837, + "step": 2712 + }, + { + "epoch": 1.3653434152488972, + "grad_norm": 0.17681431025674674, + "learning_rate": 5.832660948668488e-07, + "loss": 0.2872, + "step": 2713 + }, + { + "epoch": 1.3658475110270951, + "grad_norm": 0.17713189828891057, + "learning_rate": 5.830012749090624e-07, + "loss": 0.2704, + "step": 2714 + }, + { + "epoch": 1.366351606805293, + "grad_norm": 0.17125824414564597, + "learning_rate": 5.827364310059224e-07, + "loss": 0.2698, + "step": 2715 + }, + { + "epoch": 1.3668557025834909, + "grad_norm": 0.1721925153952408, + "learning_rate": 5.824715632338345e-07, + "loss": 0.2614, + "step": 2716 + }, + { + "epoch": 1.3673597983616887, + "grad_norm": 0.1738669650774558, + "learning_rate": 5.822066716692116e-07, + "loss": 0.2891, + "step": 2717 + }, + { + "epoch": 1.3678638941398866, + "grad_norm": 0.17612032698684543, + "learning_rate": 5.819417563884732e-07, + "loss": 0.2888, + "step": 2718 + }, + { + "epoch": 1.3683679899180845, + "grad_norm": 0.1710729077993677, + "learning_rate": 5.816768174680457e-07, + "loss": 0.2763, + "step": 2719 + }, + { + "epoch": 1.3688720856962822, + "grad_norm": 0.17794493706059122, + "learning_rate": 5.814118549843623e-07, + "loss": 0.2623, + "step": 2720 + }, + { + "epoch": 1.3693761814744803, + "grad_norm": 0.17143323856959825, + "learning_rate": 5.81146869013863e-07, + "loss": 0.2647, + "step": 2721 + }, + { + "epoch": 1.369880277252678, + "grad_norm": 0.19765533176326622, + "learning_rate": 5.808818596329947e-07, + "loss": 0.2896, + "step": 2722 + }, + { + "epoch": 1.3703843730308758, + "grad_norm": 0.17134806523272622, + "learning_rate": 5.806168269182108e-07, + "loss": 0.2844, + "step": 2723 + }, + { + "epoch": 1.3708884688090737, + "grad_norm": 0.17764670109993702, + "learning_rate": 5.803517709459718e-07, + "loss": 0.2857, + "step": 2724 + }, + { + "epoch": 1.3713925645872715, + "grad_norm": 0.17405509742325434, + "learning_rate": 5.800866917927448e-07, + "loss": 0.2808, + "step": 2725 + }, + { + "epoch": 1.3718966603654694, + "grad_norm": 0.17704791599385142, + "learning_rate": 5.798215895350032e-07, + "loss": 0.2569, + "step": 2726 + }, + { + "epoch": 1.3724007561436673, + "grad_norm": 0.19139286392663546, + "learning_rate": 5.795564642492274e-07, + "loss": 0.2782, + "step": 2727 + }, + { + "epoch": 1.3729048519218652, + "grad_norm": 0.18973954678563565, + "learning_rate": 5.792913160119046e-07, + "loss": 0.2646, + "step": 2728 + }, + { + "epoch": 1.373408947700063, + "grad_norm": 0.1717813204204622, + "learning_rate": 5.790261448995283e-07, + "loss": 0.2752, + "step": 2729 + }, + { + "epoch": 1.373913043478261, + "grad_norm": 0.1707475223327584, + "learning_rate": 5.787609509885987e-07, + "loss": 0.2551, + "step": 2730 + }, + { + "epoch": 1.3744171392564588, + "grad_norm": 0.1715006155883366, + "learning_rate": 5.784957343556227e-07, + "loss": 0.2837, + "step": 2731 + }, + { + "epoch": 1.3749212350346567, + "grad_norm": 0.16563216070091388, + "learning_rate": 5.782304950771136e-07, + "loss": 0.2482, + "step": 2732 + }, + { + "epoch": 1.3754253308128543, + "grad_norm": 0.1701892292799077, + "learning_rate": 5.779652332295912e-07, + "loss": 0.2713, + "step": 2733 + }, + { + "epoch": 1.3759294265910522, + "grad_norm": 0.16506563913482972, + "learning_rate": 5.776999488895821e-07, + "loss": 0.2603, + "step": 2734 + }, + { + "epoch": 1.37643352236925, + "grad_norm": 0.17768766370328332, + "learning_rate": 5.774346421336191e-07, + "loss": 0.2691, + "step": 2735 + }, + { + "epoch": 1.376937618147448, + "grad_norm": 0.1821534720498173, + "learning_rate": 5.771693130382413e-07, + "loss": 0.2755, + "step": 2736 + }, + { + "epoch": 1.3774417139256458, + "grad_norm": 0.1719459414964014, + "learning_rate": 5.769039616799949e-07, + "loss": 0.2834, + "step": 2737 + }, + { + "epoch": 1.3779458097038437, + "grad_norm": 0.17228563061652777, + "learning_rate": 5.766385881354319e-07, + "loss": 0.2977, + "step": 2738 + }, + { + "epoch": 1.3784499054820416, + "grad_norm": 0.17256562211361462, + "learning_rate": 5.76373192481111e-07, + "loss": 0.2736, + "step": 2739 + }, + { + "epoch": 1.3789540012602395, + "grad_norm": 0.17577017025513997, + "learning_rate": 5.76107774793597e-07, + "loss": 0.2693, + "step": 2740 + }, + { + "epoch": 1.3794580970384374, + "grad_norm": 0.17615100356954397, + "learning_rate": 5.758423351494617e-07, + "loss": 0.2755, + "step": 2741 + }, + { + "epoch": 1.3799621928166352, + "grad_norm": 0.1769331316897802, + "learning_rate": 5.755768736252824e-07, + "loss": 0.2651, + "step": 2742 + }, + { + "epoch": 1.3804662885948331, + "grad_norm": 0.18475639772963626, + "learning_rate": 5.753113902976433e-07, + "loss": 0.2818, + "step": 2743 + }, + { + "epoch": 1.3809703843730308, + "grad_norm": 0.17618416233072068, + "learning_rate": 5.750458852431346e-07, + "loss": 0.2993, + "step": 2744 + }, + { + "epoch": 1.3814744801512289, + "grad_norm": 0.1855289675606494, + "learning_rate": 5.747803585383529e-07, + "loss": 0.2645, + "step": 2745 + }, + { + "epoch": 1.3819785759294265, + "grad_norm": 0.17112395237932768, + "learning_rate": 5.74514810259901e-07, + "loss": 0.2796, + "step": 2746 + }, + { + "epoch": 1.3824826717076244, + "grad_norm": 0.17069954299670645, + "learning_rate": 5.742492404843877e-07, + "loss": 0.2725, + "step": 2747 + }, + { + "epoch": 1.3829867674858223, + "grad_norm": 0.17473478086312352, + "learning_rate": 5.739836492884287e-07, + "loss": 0.2567, + "step": 2748 + }, + { + "epoch": 1.3834908632640202, + "grad_norm": 0.17717349784959838, + "learning_rate": 5.737180367486453e-07, + "loss": 0.2764, + "step": 2749 + }, + { + "epoch": 1.383994959042218, + "grad_norm": 0.18599321000957772, + "learning_rate": 5.734524029416648e-07, + "loss": 0.2776, + "step": 2750 + }, + { + "epoch": 1.384499054820416, + "grad_norm": 0.17518820340484337, + "learning_rate": 5.731867479441211e-07, + "loss": 0.2585, + "step": 2751 + }, + { + "epoch": 1.3850031505986138, + "grad_norm": 0.1792914417737057, + "learning_rate": 5.72921071832654e-07, + "loss": 0.282, + "step": 2752 + }, + { + "epoch": 1.3855072463768117, + "grad_norm": 0.17099619039747274, + "learning_rate": 5.726553746839094e-07, + "loss": 0.2632, + "step": 2753 + }, + { + "epoch": 1.3860113421550095, + "grad_norm": 0.18225019166152914, + "learning_rate": 5.72389656574539e-07, + "loss": 0.2695, + "step": 2754 + }, + { + "epoch": 1.3865154379332072, + "grad_norm": 0.1707269538625628, + "learning_rate": 5.721239175812014e-07, + "loss": 0.272, + "step": 2755 + }, + { + "epoch": 1.3870195337114053, + "grad_norm": 0.1796542283862304, + "learning_rate": 5.718581577805604e-07, + "loss": 0.2721, + "step": 2756 + }, + { + "epoch": 1.387523629489603, + "grad_norm": 0.1770367282404336, + "learning_rate": 5.71592377249286e-07, + "loss": 0.2735, + "step": 2757 + }, + { + "epoch": 1.3880277252678008, + "grad_norm": 0.16921478099539097, + "learning_rate": 5.713265760640541e-07, + "loss": 0.2778, + "step": 2758 + }, + { + "epoch": 1.3885318210459987, + "grad_norm": 0.17268757073355442, + "learning_rate": 5.71060754301547e-07, + "loss": 0.2748, + "step": 2759 + }, + { + "epoch": 1.3890359168241966, + "grad_norm": 0.17598942075530893, + "learning_rate": 5.707949120384523e-07, + "loss": 0.2724, + "step": 2760 + }, + { + "epoch": 1.3895400126023945, + "grad_norm": 0.18011915470443154, + "learning_rate": 5.705290493514642e-07, + "loss": 0.2891, + "step": 2761 + }, + { + "epoch": 1.3900441083805923, + "grad_norm": 0.17876735405789806, + "learning_rate": 5.702631663172822e-07, + "loss": 0.2772, + "step": 2762 + }, + { + "epoch": 1.3905482041587902, + "grad_norm": 0.17718778143728184, + "learning_rate": 5.69997263012612e-07, + "loss": 0.2719, + "step": 2763 + }, + { + "epoch": 1.391052299936988, + "grad_norm": 0.17114075203463552, + "learning_rate": 5.697313395141651e-07, + "loss": 0.2689, + "step": 2764 + }, + { + "epoch": 1.391556395715186, + "grad_norm": 0.1880075108323104, + "learning_rate": 5.694653958986586e-07, + "loss": 0.2893, + "step": 2765 + }, + { + "epoch": 1.3920604914933836, + "grad_norm": 0.177316108146064, + "learning_rate": 5.691994322428159e-07, + "loss": 0.2774, + "step": 2766 + }, + { + "epoch": 1.3925645872715817, + "grad_norm": 0.16915555475740024, + "learning_rate": 5.689334486233655e-07, + "loss": 0.2605, + "step": 2767 + }, + { + "epoch": 1.3930686830497794, + "grad_norm": 0.1706580418948374, + "learning_rate": 5.686674451170421e-07, + "loss": 0.2669, + "step": 2768 + }, + { + "epoch": 1.3935727788279773, + "grad_norm": 0.18166657991393095, + "learning_rate": 5.684014218005861e-07, + "loss": 0.271, + "step": 2769 + }, + { + "epoch": 1.3940768746061751, + "grad_norm": 0.18074355543323026, + "learning_rate": 5.68135378750744e-07, + "loss": 0.2787, + "step": 2770 + }, + { + "epoch": 1.394580970384373, + "grad_norm": 0.18191764036464922, + "learning_rate": 5.678693160442668e-07, + "loss": 0.2694, + "step": 2771 + }, + { + "epoch": 1.3950850661625709, + "grad_norm": 0.1742981275140808, + "learning_rate": 5.676032337579126e-07, + "loss": 0.2668, + "step": 2772 + }, + { + "epoch": 1.3955891619407688, + "grad_norm": 0.182525466069335, + "learning_rate": 5.673371319684443e-07, + "loss": 0.2704, + "step": 2773 + }, + { + "epoch": 1.3960932577189666, + "grad_norm": 0.18422795498525135, + "learning_rate": 5.670710107526303e-07, + "loss": 0.2554, + "step": 2774 + }, + { + "epoch": 1.3965973534971645, + "grad_norm": 0.16935758864299294, + "learning_rate": 5.668048701872453e-07, + "loss": 0.2787, + "step": 2775 + }, + { + "epoch": 1.3971014492753624, + "grad_norm": 0.17250150189867655, + "learning_rate": 5.665387103490691e-07, + "loss": 0.269, + "step": 2776 + }, + { + "epoch": 1.39760554505356, + "grad_norm": 0.16845458026081864, + "learning_rate": 5.662725313148872e-07, + "loss": 0.2591, + "step": 2777 + }, + { + "epoch": 1.3981096408317581, + "grad_norm": 0.19087750959177846, + "learning_rate": 5.660063331614905e-07, + "loss": 0.2684, + "step": 2778 + }, + { + "epoch": 1.3986137366099558, + "grad_norm": 0.17287405774573983, + "learning_rate": 5.657401159656757e-07, + "loss": 0.2707, + "step": 2779 + }, + { + "epoch": 1.3991178323881537, + "grad_norm": 0.17266422585724345, + "learning_rate": 5.654738798042445e-07, + "loss": 0.2669, + "step": 2780 + }, + { + "epoch": 1.3996219281663516, + "grad_norm": 0.1687413663962204, + "learning_rate": 5.652076247540045e-07, + "loss": 0.269, + "step": 2781 + }, + { + "epoch": 1.4001260239445494, + "grad_norm": 0.17337825335332843, + "learning_rate": 5.649413508917689e-07, + "loss": 0.2687, + "step": 2782 + }, + { + "epoch": 1.4006301197227473, + "grad_norm": 0.17254045056559464, + "learning_rate": 5.646750582943558e-07, + "loss": 0.2806, + "step": 2783 + }, + { + "epoch": 1.4011342155009452, + "grad_norm": 0.17604086047481585, + "learning_rate": 5.644087470385889e-07, + "loss": 0.2794, + "step": 2784 + }, + { + "epoch": 1.401638311279143, + "grad_norm": 0.17593248783979848, + "learning_rate": 5.641424172012976e-07, + "loss": 0.2687, + "step": 2785 + }, + { + "epoch": 1.402142407057341, + "grad_norm": 0.17677274912632954, + "learning_rate": 5.638760688593162e-07, + "loss": 0.2667, + "step": 2786 + }, + { + "epoch": 1.402142407057341, + "eval_loss": 0.3082324266433716, + "eval_runtime": 17.3029, + "eval_samples_per_second": 49.414, + "eval_steps_per_second": 1.04, + "step": 2786 + }, + { + "epoch": 1.4026465028355388, + "grad_norm": 0.17615085346486575, + "learning_rate": 5.636097020894849e-07, + "loss": 0.2652, + "step": 2787 + }, + { + "epoch": 1.4031505986137365, + "grad_norm": 0.18280305120298956, + "learning_rate": 5.633433169686483e-07, + "loss": 0.2622, + "step": 2788 + }, + { + "epoch": 1.4036546943919346, + "grad_norm": 0.1766526757543098, + "learning_rate": 5.630769135736573e-07, + "loss": 0.2855, + "step": 2789 + }, + { + "epoch": 1.4041587901701322, + "grad_norm": 0.18013844905463533, + "learning_rate": 5.628104919813673e-07, + "loss": 0.2864, + "step": 2790 + }, + { + "epoch": 1.40466288594833, + "grad_norm": 0.17308521362796225, + "learning_rate": 5.625440522686395e-07, + "loss": 0.2822, + "step": 2791 + }, + { + "epoch": 1.405166981726528, + "grad_norm": 0.17384117426113055, + "learning_rate": 5.622775945123401e-07, + "loss": 0.2726, + "step": 2792 + }, + { + "epoch": 1.4056710775047259, + "grad_norm": 0.18309399117577832, + "learning_rate": 5.620111187893404e-07, + "loss": 0.2632, + "step": 2793 + }, + { + "epoch": 1.4061751732829237, + "grad_norm": 0.17415826316818256, + "learning_rate": 5.617446251765173e-07, + "loss": 0.2731, + "step": 2794 + }, + { + "epoch": 1.4066792690611216, + "grad_norm": 0.1739100478730036, + "learning_rate": 5.614781137507521e-07, + "loss": 0.2762, + "step": 2795 + }, + { + "epoch": 1.4071833648393195, + "grad_norm": 0.1764423898652386, + "learning_rate": 5.612115845889321e-07, + "loss": 0.2849, + "step": 2796 + }, + { + "epoch": 1.4076874606175174, + "grad_norm": 0.1760604570799819, + "learning_rate": 5.60945037767949e-07, + "loss": 0.2719, + "step": 2797 + }, + { + "epoch": 1.4081915563957152, + "grad_norm": 0.17431566775862142, + "learning_rate": 5.606784733647e-07, + "loss": 0.253, + "step": 2798 + }, + { + "epoch": 1.4086956521739131, + "grad_norm": 0.17776800316760477, + "learning_rate": 5.604118914560873e-07, + "loss": 0.2699, + "step": 2799 + }, + { + "epoch": 1.409199747952111, + "grad_norm": 0.18918069069185012, + "learning_rate": 5.601452921190183e-07, + "loss": 0.2717, + "step": 2800 + }, + { + "epoch": 1.4097038437303087, + "grad_norm": 0.1732601715342729, + "learning_rate": 5.598786754304051e-07, + "loss": 0.2714, + "step": 2801 + }, + { + "epoch": 1.4102079395085065, + "grad_norm": 0.1791884506349457, + "learning_rate": 5.596120414671649e-07, + "loss": 0.2693, + "step": 2802 + }, + { + "epoch": 1.4107120352867044, + "grad_norm": 0.17551762309525726, + "learning_rate": 5.5934539030622e-07, + "loss": 0.2931, + "step": 2803 + }, + { + "epoch": 1.4112161310649023, + "grad_norm": 0.17653447291530958, + "learning_rate": 5.590787220244975e-07, + "loss": 0.279, + "step": 2804 + }, + { + "epoch": 1.4117202268431002, + "grad_norm": 0.1715844081659434, + "learning_rate": 5.588120366989299e-07, + "loss": 0.2692, + "step": 2805 + }, + { + "epoch": 1.412224322621298, + "grad_norm": 0.17195024969559972, + "learning_rate": 5.585453344064538e-07, + "loss": 0.2755, + "step": 2806 + }, + { + "epoch": 1.412728418399496, + "grad_norm": 0.17614434935253434, + "learning_rate": 5.582786152240116e-07, + "loss": 0.2868, + "step": 2807 + }, + { + "epoch": 1.4132325141776938, + "grad_norm": 0.17488803322082694, + "learning_rate": 5.580118792285497e-07, + "loss": 0.2785, + "step": 2808 + }, + { + "epoch": 1.4137366099558917, + "grad_norm": 0.17403718634941737, + "learning_rate": 5.577451264970203e-07, + "loss": 0.2779, + "step": 2809 + }, + { + "epoch": 1.4142407057340896, + "grad_norm": 0.180098783645297, + "learning_rate": 5.574783571063795e-07, + "loss": 0.2805, + "step": 2810 + }, + { + "epoch": 1.4147448015122874, + "grad_norm": 0.1735655015170712, + "learning_rate": 5.572115711335886e-07, + "loss": 0.2695, + "step": 2811 + }, + { + "epoch": 1.415248897290485, + "grad_norm": 0.1741615072776797, + "learning_rate": 5.56944768655614e-07, + "loss": 0.2585, + "step": 2812 + }, + { + "epoch": 1.4157529930686832, + "grad_norm": 0.1743862849176332, + "learning_rate": 5.566779497494264e-07, + "loss": 0.2979, + "step": 2813 + }, + { + "epoch": 1.4162570888468808, + "grad_norm": 0.17758390356185494, + "learning_rate": 5.564111144920013e-07, + "loss": 0.2745, + "step": 2814 + }, + { + "epoch": 1.4167611846250787, + "grad_norm": 0.1834026948019301, + "learning_rate": 5.561442629603192e-07, + "loss": 0.2768, + "step": 2815 + }, + { + "epoch": 1.4172652804032766, + "grad_norm": 0.17218386664662416, + "learning_rate": 5.55877395231365e-07, + "loss": 0.278, + "step": 2816 + }, + { + "epoch": 1.4177693761814745, + "grad_norm": 0.17269835730848343, + "learning_rate": 5.556105113821285e-07, + "loss": 0.272, + "step": 2817 + }, + { + "epoch": 1.4182734719596723, + "grad_norm": 0.19670398507490758, + "learning_rate": 5.553436114896037e-07, + "loss": 0.2511, + "step": 2818 + }, + { + "epoch": 1.4187775677378702, + "grad_norm": 0.1715178941839397, + "learning_rate": 5.5507669563079e-07, + "loss": 0.2735, + "step": 2819 + }, + { + "epoch": 1.419281663516068, + "grad_norm": 0.1729826705555197, + "learning_rate": 5.548097638826907e-07, + "loss": 0.2707, + "step": 2820 + }, + { + "epoch": 1.419785759294266, + "grad_norm": 0.1725819935420221, + "learning_rate": 5.545428163223142e-07, + "loss": 0.2671, + "step": 2821 + }, + { + "epoch": 1.4202898550724639, + "grad_norm": 0.17665111931235344, + "learning_rate": 5.542758530266729e-07, + "loss": 0.2538, + "step": 2822 + }, + { + "epoch": 1.4207939508506615, + "grad_norm": 0.17897097101640105, + "learning_rate": 5.540088740727843e-07, + "loss": 0.2697, + "step": 2823 + }, + { + "epoch": 1.4212980466288596, + "grad_norm": 0.1687081289149051, + "learning_rate": 5.537418795376702e-07, + "loss": 0.2828, + "step": 2824 + }, + { + "epoch": 1.4218021424070573, + "grad_norm": 0.20222752341146646, + "learning_rate": 5.534748694983567e-07, + "loss": 0.2713, + "step": 2825 + }, + { + "epoch": 1.4223062381852551, + "grad_norm": 0.17189172904171757, + "learning_rate": 5.532078440318746e-07, + "loss": 0.2698, + "step": 2826 + }, + { + "epoch": 1.422810333963453, + "grad_norm": 0.17345223486754652, + "learning_rate": 5.52940803215259e-07, + "loss": 0.276, + "step": 2827 + }, + { + "epoch": 1.423314429741651, + "grad_norm": 0.2080580963062512, + "learning_rate": 5.526737471255498e-07, + "loss": 0.2878, + "step": 2828 + }, + { + "epoch": 1.4238185255198488, + "grad_norm": 0.1993251803507769, + "learning_rate": 5.524066758397907e-07, + "loss": 0.2741, + "step": 2829 + }, + { + "epoch": 1.4243226212980467, + "grad_norm": 0.19657833333195346, + "learning_rate": 5.521395894350303e-07, + "loss": 0.2743, + "step": 2830 + }, + { + "epoch": 1.4248267170762445, + "grad_norm": 0.18203360278973735, + "learning_rate": 5.518724879883215e-07, + "loss": 0.2769, + "step": 2831 + }, + { + "epoch": 1.4253308128544424, + "grad_norm": 0.18264070770705262, + "learning_rate": 5.516053715767211e-07, + "loss": 0.2853, + "step": 2832 + }, + { + "epoch": 1.4258349086326403, + "grad_norm": 0.18164978911251264, + "learning_rate": 5.513382402772906e-07, + "loss": 0.2755, + "step": 2833 + }, + { + "epoch": 1.426339004410838, + "grad_norm": 0.1662574674843768, + "learning_rate": 5.510710941670959e-07, + "loss": 0.2689, + "step": 2834 + }, + { + "epoch": 1.426843100189036, + "grad_norm": 0.17319135764139246, + "learning_rate": 5.508039333232069e-07, + "loss": 0.2856, + "step": 2835 + }, + { + "epoch": 1.4273471959672337, + "grad_norm": 0.1827176229024098, + "learning_rate": 5.505367578226978e-07, + "loss": 0.2842, + "step": 2836 + }, + { + "epoch": 1.4278512917454316, + "grad_norm": 0.1908930430427692, + "learning_rate": 5.502695677426471e-07, + "loss": 0.2634, + "step": 2837 + }, + { + "epoch": 1.4283553875236294, + "grad_norm": 0.18109239755144405, + "learning_rate": 5.500023631601376e-07, + "loss": 0.2786, + "step": 2838 + }, + { + "epoch": 1.4288594833018273, + "grad_norm": 0.18601457798351573, + "learning_rate": 5.497351441522561e-07, + "loss": 0.2773, + "step": 2839 + }, + { + "epoch": 1.4293635790800252, + "grad_norm": 0.1851254103241904, + "learning_rate": 5.494679107960936e-07, + "loss": 0.2918, + "step": 2840 + }, + { + "epoch": 1.429867674858223, + "grad_norm": 0.1740395548507666, + "learning_rate": 5.492006631687451e-07, + "loss": 0.2681, + "step": 2841 + }, + { + "epoch": 1.430371770636421, + "grad_norm": 0.17944562058468594, + "learning_rate": 5.489334013473103e-07, + "loss": 0.2643, + "step": 2842 + }, + { + "epoch": 1.4308758664146188, + "grad_norm": 0.1721575725843985, + "learning_rate": 5.486661254088921e-07, + "loss": 0.2678, + "step": 2843 + }, + { + "epoch": 1.4313799621928167, + "grad_norm": 0.18124114212980594, + "learning_rate": 5.483988354305984e-07, + "loss": 0.286, + "step": 2844 + }, + { + "epoch": 1.4318840579710144, + "grad_norm": 0.17177819687786705, + "learning_rate": 5.481315314895404e-07, + "loss": 0.2883, + "step": 2845 + }, + { + "epoch": 1.4323881537492125, + "grad_norm": 0.17999663813819824, + "learning_rate": 5.478642136628338e-07, + "loss": 0.2506, + "step": 2846 + }, + { + "epoch": 1.4328922495274101, + "grad_norm": 0.1759171770648982, + "learning_rate": 5.47596882027598e-07, + "loss": 0.2735, + "step": 2847 + }, + { + "epoch": 1.433396345305608, + "grad_norm": 0.1816944331335307, + "learning_rate": 5.473295366609566e-07, + "loss": 0.2849, + "step": 2848 + }, + { + "epoch": 1.4339004410838059, + "grad_norm": 0.17100685390259318, + "learning_rate": 5.470621776400371e-07, + "loss": 0.2612, + "step": 2849 + }, + { + "epoch": 1.4344045368620038, + "grad_norm": 0.1819142852011383, + "learning_rate": 5.467948050419707e-07, + "loss": 0.2783, + "step": 2850 + }, + { + "epoch": 1.4349086326402016, + "grad_norm": 0.18361474818908685, + "learning_rate": 5.465274189438931e-07, + "loss": 0.284, + "step": 2851 + }, + { + "epoch": 1.4354127284183995, + "grad_norm": 0.1752427494123583, + "learning_rate": 5.462600194229432e-07, + "loss": 0.2823, + "step": 2852 + }, + { + "epoch": 1.4359168241965974, + "grad_norm": 0.1773020482286541, + "learning_rate": 5.459926065562643e-07, + "loss": 0.2686, + "step": 2853 + }, + { + "epoch": 1.4364209199747953, + "grad_norm": 0.17566660311428434, + "learning_rate": 5.457251804210035e-07, + "loss": 0.2668, + "step": 2854 + }, + { + "epoch": 1.4369250157529931, + "grad_norm": 0.17963004958574516, + "learning_rate": 5.454577410943113e-07, + "loss": 0.2713, + "step": 2855 + }, + { + "epoch": 1.4374291115311908, + "grad_norm": 0.17390970931818708, + "learning_rate": 5.451902886533424e-07, + "loss": 0.2681, + "step": 2856 + }, + { + "epoch": 1.437933207309389, + "grad_norm": 0.17729799370957, + "learning_rate": 5.449228231752551e-07, + "loss": 0.272, + "step": 2857 + }, + { + "epoch": 1.4384373030875865, + "grad_norm": 0.17507123221756268, + "learning_rate": 5.446553447372117e-07, + "loss": 0.2775, + "step": 2858 + }, + { + "epoch": 1.4389413988657844, + "grad_norm": 0.17681228025938894, + "learning_rate": 5.443878534163779e-07, + "loss": 0.2831, + "step": 2859 + }, + { + "epoch": 1.4394454946439823, + "grad_norm": 0.1821736618960098, + "learning_rate": 5.441203492899233e-07, + "loss": 0.2841, + "step": 2860 + }, + { + "epoch": 1.4399495904221802, + "grad_norm": 0.17903890868161446, + "learning_rate": 5.438528324350214e-07, + "loss": 0.278, + "step": 2861 + }, + { + "epoch": 1.440453686200378, + "grad_norm": 0.180429015598911, + "learning_rate": 5.435853029288489e-07, + "loss": 0.2746, + "step": 2862 + }, + { + "epoch": 1.440957781978576, + "grad_norm": 0.17355901123245668, + "learning_rate": 5.433177608485865e-07, + "loss": 0.2679, + "step": 2863 + }, + { + "epoch": 1.4414618777567738, + "grad_norm": 0.180173248420686, + "learning_rate": 5.430502062714184e-07, + "loss": 0.2698, + "step": 2864 + }, + { + "epoch": 1.4419659735349717, + "grad_norm": 0.17214222893696038, + "learning_rate": 5.427826392745325e-07, + "loss": 0.2769, + "step": 2865 + }, + { + "epoch": 1.4424700693131696, + "grad_norm": 0.1726707684780785, + "learning_rate": 5.425150599351201e-07, + "loss": 0.2819, + "step": 2866 + }, + { + "epoch": 1.4429741650913672, + "grad_norm": 0.17661401865470128, + "learning_rate": 5.422474683303765e-07, + "loss": 0.2633, + "step": 2867 + }, + { + "epoch": 1.4434782608695653, + "grad_norm": 0.16966464132316653, + "learning_rate": 5.419798645374998e-07, + "loss": 0.2749, + "step": 2868 + }, + { + "epoch": 1.443982356647763, + "grad_norm": 0.17637511222129923, + "learning_rate": 5.417122486336923e-07, + "loss": 0.2747, + "step": 2869 + }, + { + "epoch": 1.4444864524259609, + "grad_norm": 0.1747096047216983, + "learning_rate": 5.414446206961596e-07, + "loss": 0.2701, + "step": 2870 + }, + { + "epoch": 1.4449905482041587, + "grad_norm": 0.18391353933023288, + "learning_rate": 5.411769808021104e-07, + "loss": 0.2786, + "step": 2871 + }, + { + "epoch": 1.4454946439823566, + "grad_norm": 0.1731778758855043, + "learning_rate": 5.409093290287573e-07, + "loss": 0.2568, + "step": 2872 + }, + { + "epoch": 1.4459987397605545, + "grad_norm": 0.16898295126793264, + "learning_rate": 5.406416654533163e-07, + "loss": 0.2657, + "step": 2873 + }, + { + "epoch": 1.4465028355387524, + "grad_norm": 0.18058265233021242, + "learning_rate": 5.403739901530065e-07, + "loss": 0.2765, + "step": 2874 + }, + { + "epoch": 1.4470069313169502, + "grad_norm": 0.17654601160313765, + "learning_rate": 5.401063032050507e-07, + "loss": 0.2789, + "step": 2875 + }, + { + "epoch": 1.4475110270951481, + "grad_norm": 0.17294349882056775, + "learning_rate": 5.398386046866747e-07, + "loss": 0.268, + "step": 2876 + }, + { + "epoch": 1.448015122873346, + "grad_norm": 0.17407010490339622, + "learning_rate": 5.395708946751083e-07, + "loss": 0.2752, + "step": 2877 + }, + { + "epoch": 1.4485192186515439, + "grad_norm": 0.1869029531049797, + "learning_rate": 5.393031732475837e-07, + "loss": 0.2839, + "step": 2878 + }, + { + "epoch": 1.4490233144297417, + "grad_norm": 0.17047646712682293, + "learning_rate": 5.390354404813373e-07, + "loss": 0.2783, + "step": 2879 + }, + { + "epoch": 1.4495274102079394, + "grad_norm": 0.1740578915583476, + "learning_rate": 5.38767696453608e-07, + "loss": 0.2903, + "step": 2880 + }, + { + "epoch": 1.4500315059861375, + "grad_norm": 0.18937601572967064, + "learning_rate": 5.384999412416383e-07, + "loss": 0.2649, + "step": 2881 + }, + { + "epoch": 1.4505356017643352, + "grad_norm": 0.17078602897509845, + "learning_rate": 5.382321749226743e-07, + "loss": 0.2753, + "step": 2882 + }, + { + "epoch": 1.451039697542533, + "grad_norm": 0.18237399698730128, + "learning_rate": 5.379643975739647e-07, + "loss": 0.2991, + "step": 2883 + }, + { + "epoch": 1.451543793320731, + "grad_norm": 0.1741266975993158, + "learning_rate": 5.376966092727613e-07, + "loss": 0.271, + "step": 2884 + }, + { + "epoch": 1.4520478890989288, + "grad_norm": 0.17610657049121992, + "learning_rate": 5.374288100963198e-07, + "loss": 0.2889, + "step": 2885 + }, + { + "epoch": 1.4525519848771267, + "grad_norm": 0.17138137935782394, + "learning_rate": 5.371610001218983e-07, + "loss": 0.2649, + "step": 2886 + }, + { + "epoch": 1.4530560806553245, + "grad_norm": 0.17838838912402527, + "learning_rate": 5.368931794267586e-07, + "loss": 0.2753, + "step": 2887 + }, + { + "epoch": 1.4535601764335224, + "grad_norm": 0.1746687660444875, + "learning_rate": 5.366253480881651e-07, + "loss": 0.2763, + "step": 2888 + }, + { + "epoch": 1.4540642722117203, + "grad_norm": 0.17490010732782957, + "learning_rate": 5.363575061833856e-07, + "loss": 0.273, + "step": 2889 + }, + { + "epoch": 1.4545683679899182, + "grad_norm": 0.1753044578433687, + "learning_rate": 5.360896537896909e-07, + "loss": 0.2818, + "step": 2890 + }, + { + "epoch": 1.4550724637681158, + "grad_norm": 0.17281297026171832, + "learning_rate": 5.358217909843545e-07, + "loss": 0.2616, + "step": 2891 + }, + { + "epoch": 1.455576559546314, + "grad_norm": 0.17311583025271762, + "learning_rate": 5.355539178446535e-07, + "loss": 0.2705, + "step": 2892 + }, + { + "epoch": 1.4560806553245116, + "grad_norm": 0.17027692820248555, + "learning_rate": 5.352860344478673e-07, + "loss": 0.2736, + "step": 2893 + }, + { + "epoch": 1.4565847511027095, + "grad_norm": 0.17311501562837145, + "learning_rate": 5.350181408712787e-07, + "loss": 0.2563, + "step": 2894 + }, + { + "epoch": 1.4570888468809073, + "grad_norm": 0.17543624536918084, + "learning_rate": 5.347502371921735e-07, + "loss": 0.2736, + "step": 2895 + }, + { + "epoch": 1.4575929426591052, + "grad_norm": 0.19910330074338614, + "learning_rate": 5.3448232348784e-07, + "loss": 0.273, + "step": 2896 + }, + { + "epoch": 1.458097038437303, + "grad_norm": 0.1738967237730725, + "learning_rate": 5.342143998355698e-07, + "loss": 0.274, + "step": 2897 + }, + { + "epoch": 1.458601134215501, + "grad_norm": 0.17570081461431064, + "learning_rate": 5.339464663126574e-07, + "loss": 0.2759, + "step": 2898 + }, + { + "epoch": 1.4591052299936988, + "grad_norm": 0.17791619225009586, + "learning_rate": 5.336785229963996e-07, + "loss": 0.2623, + "step": 2899 + }, + { + "epoch": 1.4596093257718967, + "grad_norm": 0.17002260231876098, + "learning_rate": 5.334105699640965e-07, + "loss": 0.266, + "step": 2900 + }, + { + "epoch": 1.4601134215500946, + "grad_norm": 0.18429176939879613, + "learning_rate": 5.33142607293051e-07, + "loss": 0.2819, + "step": 2901 + }, + { + "epoch": 1.4606175173282923, + "grad_norm": 0.17868955547716125, + "learning_rate": 5.328746350605685e-07, + "loss": 0.2682, + "step": 2902 + }, + { + "epoch": 1.4611216131064904, + "grad_norm": 0.16918396641956726, + "learning_rate": 5.326066533439575e-07, + "loss": 0.2755, + "step": 2903 + }, + { + "epoch": 1.461625708884688, + "grad_norm": 0.17069887650764975, + "learning_rate": 5.323386622205291e-07, + "loss": 0.2801, + "step": 2904 + }, + { + "epoch": 1.462129804662886, + "grad_norm": 0.1701037068935557, + "learning_rate": 5.320706617675968e-07, + "loss": 0.2765, + "step": 2905 + }, + { + "epoch": 1.4626339004410838, + "grad_norm": 0.17789523122182876, + "learning_rate": 5.318026520624774e-07, + "loss": 0.2912, + "step": 2906 + }, + { + "epoch": 1.4631379962192816, + "grad_norm": 0.19749764941108588, + "learning_rate": 5.315346331824898e-07, + "loss": 0.2853, + "step": 2907 + }, + { + "epoch": 1.4636420919974795, + "grad_norm": 0.1756855169152155, + "learning_rate": 5.31266605204956e-07, + "loss": 0.2832, + "step": 2908 + }, + { + "epoch": 1.4641461877756774, + "grad_norm": 0.1786872687703457, + "learning_rate": 5.309985682072001e-07, + "loss": 0.2665, + "step": 2909 + }, + { + "epoch": 1.4646502835538753, + "grad_norm": 0.1877786342320192, + "learning_rate": 5.307305222665494e-07, + "loss": 0.2774, + "step": 2910 + }, + { + "epoch": 1.4651543793320732, + "grad_norm": 0.17127707594522085, + "learning_rate": 5.304624674603335e-07, + "loss": 0.2646, + "step": 2911 + }, + { + "epoch": 1.465658475110271, + "grad_norm": 0.2560823194379178, + "learning_rate": 5.301944038658842e-07, + "loss": 0.2669, + "step": 2912 + }, + { + "epoch": 1.4661625708884687, + "grad_norm": 0.1766829426286369, + "learning_rate": 5.299263315605367e-07, + "loss": 0.2781, + "step": 2913 + }, + { + "epoch": 1.4666666666666668, + "grad_norm": 0.16776123278488433, + "learning_rate": 5.296582506216277e-07, + "loss": 0.2616, + "step": 2914 + }, + { + "epoch": 1.4671707624448644, + "grad_norm": 0.17648227246454712, + "learning_rate": 5.293901611264971e-07, + "loss": 0.2649, + "step": 2915 + }, + { + "epoch": 1.4676748582230623, + "grad_norm": 0.17432483953665048, + "learning_rate": 5.291220631524872e-07, + "loss": 0.2687, + "step": 2916 + }, + { + "epoch": 1.4681789540012602, + "grad_norm": 0.20520459105179642, + "learning_rate": 5.288539567769424e-07, + "loss": 0.2862, + "step": 2917 + }, + { + "epoch": 1.468683049779458, + "grad_norm": 0.18543376500369232, + "learning_rate": 5.285858420772099e-07, + "loss": 0.2541, + "step": 2918 + }, + { + "epoch": 1.469187145557656, + "grad_norm": 0.17322589885199907, + "learning_rate": 5.283177191306389e-07, + "loss": 0.2748, + "step": 2919 + }, + { + "epoch": 1.4696912413358538, + "grad_norm": 0.18616715044944854, + "learning_rate": 5.280495880145814e-07, + "loss": 0.2791, + "step": 2920 + }, + { + "epoch": 1.4701953371140517, + "grad_norm": 0.17972067487672042, + "learning_rate": 5.277814488063918e-07, + "loss": 0.283, + "step": 2921 + }, + { + "epoch": 1.4706994328922496, + "grad_norm": 0.19354601757670176, + "learning_rate": 5.27513301583426e-07, + "loss": 0.2733, + "step": 2922 + }, + { + "epoch": 1.4712035286704475, + "grad_norm": 0.17386234930511313, + "learning_rate": 5.272451464230433e-07, + "loss": 0.2667, + "step": 2923 + }, + { + "epoch": 1.4717076244486451, + "grad_norm": 0.1747812393098067, + "learning_rate": 5.269769834026045e-07, + "loss": 0.2538, + "step": 2924 + }, + { + "epoch": 1.4722117202268432, + "grad_norm": 0.18216928171454855, + "learning_rate": 5.267088125994732e-07, + "loss": 0.2689, + "step": 2925 + }, + { + "epoch": 1.4727158160050409, + "grad_norm": 0.18671286603946396, + "learning_rate": 5.264406340910148e-07, + "loss": 0.2682, + "step": 2926 + }, + { + "epoch": 1.4732199117832387, + "grad_norm": 0.17551742566377754, + "learning_rate": 5.261724479545974e-07, + "loss": 0.2913, + "step": 2927 + }, + { + "epoch": 1.4737240075614366, + "grad_norm": 0.24593696823308445, + "learning_rate": 5.259042542675907e-07, + "loss": 0.2717, + "step": 2928 + }, + { + "epoch": 1.4742281033396345, + "grad_norm": 0.1716232359748941, + "learning_rate": 5.256360531073674e-07, + "loss": 0.2791, + "step": 2929 + }, + { + "epoch": 1.4747321991178324, + "grad_norm": 0.17889186640749882, + "learning_rate": 5.253678445513014e-07, + "loss": 0.2559, + "step": 2930 + }, + { + "epoch": 1.4752362948960303, + "grad_norm": 0.16660287168913102, + "learning_rate": 5.250996286767693e-07, + "loss": 0.2796, + "step": 2931 + }, + { + "epoch": 1.4757403906742281, + "grad_norm": 0.17208791704520798, + "learning_rate": 5.248314055611499e-07, + "loss": 0.2789, + "step": 2932 + }, + { + "epoch": 1.476244486452426, + "grad_norm": 0.16785449767453983, + "learning_rate": 5.245631752818238e-07, + "loss": 0.2517, + "step": 2933 + }, + { + "epoch": 1.4767485822306239, + "grad_norm": 0.17104687297149335, + "learning_rate": 5.242949379161739e-07, + "loss": 0.2669, + "step": 2934 + }, + { + "epoch": 1.4772526780088215, + "grad_norm": 0.17816748061415208, + "learning_rate": 5.240266935415847e-07, + "loss": 0.2761, + "step": 2935 + }, + { + "epoch": 1.4777567737870196, + "grad_norm": 0.17497683850506593, + "learning_rate": 5.237584422354435e-07, + "loss": 0.2691, + "step": 2936 + }, + { + "epoch": 1.4782608695652173, + "grad_norm": 0.17896943432738507, + "learning_rate": 5.234901840751388e-07, + "loss": 0.2691, + "step": 2937 + }, + { + "epoch": 1.4787649653434152, + "grad_norm": 0.18082033748198661, + "learning_rate": 5.232219191380614e-07, + "loss": 0.2726, + "step": 2938 + }, + { + "epoch": 1.479269061121613, + "grad_norm": 0.17731545802828333, + "learning_rate": 5.229536475016044e-07, + "loss": 0.2817, + "step": 2939 + }, + { + "epoch": 1.479773156899811, + "grad_norm": 0.17098014110692908, + "learning_rate": 5.22685369243162e-07, + "loss": 0.2714, + "step": 2940 + }, + { + "epoch": 1.4802772526780088, + "grad_norm": 0.1813522016663038, + "learning_rate": 5.224170844401313e-07, + "loss": 0.2846, + "step": 2941 + }, + { + "epoch": 1.4807813484562067, + "grad_norm": 0.17197034879530912, + "learning_rate": 5.221487931699107e-07, + "loss": 0.2667, + "step": 2942 + }, + { + "epoch": 1.4812854442344046, + "grad_norm": 0.16697715148870823, + "learning_rate": 5.218804955099003e-07, + "loss": 0.2627, + "step": 2943 + }, + { + "epoch": 1.4817895400126024, + "grad_norm": 0.17040842764616276, + "learning_rate": 5.216121915375026e-07, + "loss": 0.2577, + "step": 2944 + }, + { + "epoch": 1.4822936357908003, + "grad_norm": 0.17579731593772854, + "learning_rate": 5.213438813301214e-07, + "loss": 0.2811, + "step": 2945 + }, + { + "epoch": 1.4827977315689982, + "grad_norm": 0.17045185683058092, + "learning_rate": 5.210755649651627e-07, + "loss": 0.255, + "step": 2946 + }, + { + "epoch": 1.483301827347196, + "grad_norm": 0.17068201983517348, + "learning_rate": 5.20807242520034e-07, + "loss": 0.2743, + "step": 2947 + }, + { + "epoch": 1.4838059231253937, + "grad_norm": 0.18181611693726946, + "learning_rate": 5.205389140721448e-07, + "loss": 0.2732, + "step": 2948 + }, + { + "epoch": 1.4843100189035916, + "grad_norm": 0.1852929761291196, + "learning_rate": 5.202705796989061e-07, + "loss": 0.2801, + "step": 2949 + }, + { + "epoch": 1.4848141146817895, + "grad_norm": 0.17600122731072515, + "learning_rate": 5.200022394777308e-07, + "loss": 0.2716, + "step": 2950 + }, + { + "epoch": 1.4853182104599874, + "grad_norm": 0.1718236542287459, + "learning_rate": 5.197338934860332e-07, + "loss": 0.2674, + "step": 2951 + }, + { + "epoch": 1.4858223062381852, + "grad_norm": 0.1775790487723521, + "learning_rate": 5.194655418012295e-07, + "loss": 0.2825, + "step": 2952 + }, + { + "epoch": 1.486326402016383, + "grad_norm": 0.1807315629873165, + "learning_rate": 5.191971845007378e-07, + "loss": 0.3003, + "step": 2953 + }, + { + "epoch": 1.486830497794581, + "grad_norm": 0.17385001875969464, + "learning_rate": 5.189288216619773e-07, + "loss": 0.2586, + "step": 2954 + }, + { + "epoch": 1.4873345935727789, + "grad_norm": 0.17077436408789423, + "learning_rate": 5.186604533623689e-07, + "loss": 0.2774, + "step": 2955 + }, + { + "epoch": 1.4878386893509767, + "grad_norm": 0.17354882163404403, + "learning_rate": 5.183920796793353e-07, + "loss": 0.2695, + "step": 2956 + }, + { + "epoch": 1.4883427851291746, + "grad_norm": 0.18056872611734137, + "learning_rate": 5.181237006903007e-07, + "loss": 0.2575, + "step": 2957 + }, + { + "epoch": 1.4888468809073725, + "grad_norm": 0.1747297098238214, + "learning_rate": 5.178553164726906e-07, + "loss": 0.2832, + "step": 2958 + }, + { + "epoch": 1.4893509766855701, + "grad_norm": 0.17268688557025336, + "learning_rate": 5.175869271039325e-07, + "loss": 0.2745, + "step": 2959 + }, + { + "epoch": 1.4898550724637682, + "grad_norm": 0.1723723042058958, + "learning_rate": 5.173185326614546e-07, + "loss": 0.2835, + "step": 2960 + }, + { + "epoch": 1.490359168241966, + "grad_norm": 0.16999642870279735, + "learning_rate": 5.170501332226875e-07, + "loss": 0.2734, + "step": 2961 + }, + { + "epoch": 1.4908632640201638, + "grad_norm": 0.16872935699465505, + "learning_rate": 5.167817288650625e-07, + "loss": 0.2756, + "step": 2962 + }, + { + "epoch": 1.4913673597983617, + "grad_norm": 0.18195730992832942, + "learning_rate": 5.165133196660128e-07, + "loss": 0.275, + "step": 2963 + }, + { + "epoch": 1.4918714555765595, + "grad_norm": 0.1694600503693703, + "learning_rate": 5.162449057029725e-07, + "loss": 0.2677, + "step": 2964 + }, + { + "epoch": 1.4923755513547574, + "grad_norm": 0.17420508681132724, + "learning_rate": 5.159764870533777e-07, + "loss": 0.2804, + "step": 2965 + }, + { + "epoch": 1.4928796471329553, + "grad_norm": 0.16859261160349226, + "learning_rate": 5.157080637946654e-07, + "loss": 0.2705, + "step": 2966 + }, + { + "epoch": 1.4933837429111532, + "grad_norm": 0.17551194260816222, + "learning_rate": 5.154396360042738e-07, + "loss": 0.2592, + "step": 2967 + }, + { + "epoch": 1.493887838689351, + "grad_norm": 0.17227527863330538, + "learning_rate": 5.15171203759643e-07, + "loss": 0.2674, + "step": 2968 + }, + { + "epoch": 1.494391934467549, + "grad_norm": 0.1728755992481674, + "learning_rate": 5.149027671382138e-07, + "loss": 0.2705, + "step": 2969 + }, + { + "epoch": 1.4948960302457466, + "grad_norm": 0.17382519055950477, + "learning_rate": 5.146343262174286e-07, + "loss": 0.2652, + "step": 2970 + }, + { + "epoch": 1.4954001260239447, + "grad_norm": 0.1769092204357377, + "learning_rate": 5.14365881074731e-07, + "loss": 0.2767, + "step": 2971 + }, + { + "epoch": 1.4959042218021423, + "grad_norm": 0.17476416087544372, + "learning_rate": 5.140974317875657e-07, + "loss": 0.2753, + "step": 2972 + }, + { + "epoch": 1.4964083175803402, + "grad_norm": 0.169687985399867, + "learning_rate": 5.138289784333787e-07, + "loss": 0.2765, + "step": 2973 + }, + { + "epoch": 1.496912413358538, + "grad_norm": 0.17261770629037, + "learning_rate": 5.13560521089617e-07, + "loss": 0.2732, + "step": 2974 + }, + { + "epoch": 1.497416509136736, + "grad_norm": 0.17477899451723963, + "learning_rate": 5.13292059833729e-07, + "loss": 0.2599, + "step": 2975 + }, + { + "epoch": 1.4979206049149338, + "grad_norm": 0.18102926267539213, + "learning_rate": 5.13023594743164e-07, + "loss": 0.2674, + "step": 2976 + }, + { + "epoch": 1.4984247006931317, + "grad_norm": 0.17738927553973277, + "learning_rate": 5.127551258953727e-07, + "loss": 0.2672, + "step": 2977 + }, + { + "epoch": 1.4989287964713296, + "grad_norm": 0.18069762077371987, + "learning_rate": 5.124866533678066e-07, + "loss": 0.2665, + "step": 2978 + }, + { + "epoch": 1.4994328922495275, + "grad_norm": 0.17624639732046102, + "learning_rate": 5.122181772379182e-07, + "loss": 0.2847, + "step": 2979 + }, + { + "epoch": 1.4999369880277253, + "grad_norm": 0.21149596045503335, + "learning_rate": 5.119496975831616e-07, + "loss": 0.2755, + "step": 2980 + }, + { + "epoch": 1.500441083805923, + "grad_norm": 0.17200655792505817, + "learning_rate": 5.116812144809911e-07, + "loss": 0.2776, + "step": 2981 + }, + { + "epoch": 1.500945179584121, + "grad_norm": 0.199818514038348, + "learning_rate": 5.114127280088627e-07, + "loss": 0.2721, + "step": 2982 + }, + { + "epoch": 1.5014492753623188, + "grad_norm": 0.1767000934409416, + "learning_rate": 5.111442382442328e-07, + "loss": 0.2897, + "step": 2983 + }, + { + "epoch": 1.5019533711405169, + "grad_norm": 0.17657617178530624, + "learning_rate": 5.108757452645594e-07, + "loss": 0.2585, + "step": 2984 + }, + { + "epoch": 1.5024574669187145, + "grad_norm": 0.17907925642962816, + "learning_rate": 5.106072491473008e-07, + "loss": 0.2753, + "step": 2985 + }, + { + "epoch": 1.5024574669187145, + "eval_loss": 0.30766692757606506, + "eval_runtime": 17.4903, + "eval_samples_per_second": 48.884, + "eval_steps_per_second": 1.029, + "step": 2985 + }, + { + "epoch": 1.5029615626969124, + "grad_norm": 0.17603805841778422, + "learning_rate": 5.103387499699164e-07, + "loss": 0.2658, + "step": 2986 + }, + { + "epoch": 1.5034656584751103, + "grad_norm": 0.1772161820090902, + "learning_rate": 5.100702478098667e-07, + "loss": 0.2757, + "step": 2987 + }, + { + "epoch": 1.5039697542533081, + "grad_norm": 0.17226447203172757, + "learning_rate": 5.098017427446132e-07, + "loss": 0.2765, + "step": 2988 + }, + { + "epoch": 1.504473850031506, + "grad_norm": 0.17276877170127075, + "learning_rate": 5.095332348516172e-07, + "loss": 0.2752, + "step": 2989 + }, + { + "epoch": 1.504977945809704, + "grad_norm": 0.1847638673736269, + "learning_rate": 5.092647242083423e-07, + "loss": 0.2789, + "step": 2990 + }, + { + "epoch": 1.5054820415879018, + "grad_norm": 0.17901616149206137, + "learning_rate": 5.089962108922517e-07, + "loss": 0.2911, + "step": 2991 + }, + { + "epoch": 1.5059861373660994, + "grad_norm": 0.18718528180473848, + "learning_rate": 5.0872769498081e-07, + "loss": 0.2618, + "step": 2992 + }, + { + "epoch": 1.5064902331442975, + "grad_norm": 0.17561340990666174, + "learning_rate": 5.084591765514824e-07, + "loss": 0.2796, + "step": 2993 + }, + { + "epoch": 1.5069943289224952, + "grad_norm": 0.17573027328641988, + "learning_rate": 5.081906556817348e-07, + "loss": 0.2746, + "step": 2994 + }, + { + "epoch": 1.5074984247006933, + "grad_norm": 0.1946924830736341, + "learning_rate": 5.079221324490338e-07, + "loss": 0.2596, + "step": 2995 + }, + { + "epoch": 1.508002520478891, + "grad_norm": 0.19483610334363802, + "learning_rate": 5.076536069308466e-07, + "loss": 0.2661, + "step": 2996 + }, + { + "epoch": 1.5085066162570888, + "grad_norm": 0.17050635285106475, + "learning_rate": 5.073850792046411e-07, + "loss": 0.2845, + "step": 2997 + }, + { + "epoch": 1.5090107120352867, + "grad_norm": 0.17434918537651012, + "learning_rate": 5.071165493478862e-07, + "loss": 0.2763, + "step": 2998 + }, + { + "epoch": 1.5095148078134846, + "grad_norm": 0.17421166183775727, + "learning_rate": 5.068480174380507e-07, + "loss": 0.2849, + "step": 2999 + }, + { + "epoch": 1.5100189035916824, + "grad_norm": 0.17727506977234753, + "learning_rate": 5.065794835526047e-07, + "loss": 0.266, + "step": 3000 + }, + { + "epoch": 1.5105229993698803, + "grad_norm": 0.17988130416225281, + "learning_rate": 5.063109477690186e-07, + "loss": 0.2621, + "step": 3001 + }, + { + "epoch": 1.5110270951480782, + "grad_norm": 0.17698102799975615, + "learning_rate": 5.060424101647631e-07, + "loss": 0.2805, + "step": 3002 + }, + { + "epoch": 1.5115311909262759, + "grad_norm": 0.1800412666094284, + "learning_rate": 5.057738708173096e-07, + "loss": 0.2749, + "step": 3003 + }, + { + "epoch": 1.512035286704474, + "grad_norm": 0.17319416297445328, + "learning_rate": 5.055053298041302e-07, + "loss": 0.2782, + "step": 3004 + }, + { + "epoch": 1.5125393824826716, + "grad_norm": 0.1683959181018551, + "learning_rate": 5.052367872026971e-07, + "loss": 0.2595, + "step": 3005 + }, + { + "epoch": 1.5130434782608697, + "grad_norm": 0.19739859925036402, + "learning_rate": 5.049682430904835e-07, + "loss": 0.2671, + "step": 3006 + }, + { + "epoch": 1.5135475740390674, + "grad_norm": 0.17458057209940056, + "learning_rate": 5.046996975449624e-07, + "loss": 0.2812, + "step": 3007 + }, + { + "epoch": 1.5140516698172652, + "grad_norm": 0.17642668853441784, + "learning_rate": 5.044311506436077e-07, + "loss": 0.254, + "step": 3008 + }, + { + "epoch": 1.5145557655954631, + "grad_norm": 0.18216491050641825, + "learning_rate": 5.041626024638935e-07, + "loss": 0.2928, + "step": 3009 + }, + { + "epoch": 1.515059861373661, + "grad_norm": 0.16982395667266062, + "learning_rate": 5.038940530832944e-07, + "loss": 0.2792, + "step": 3010 + }, + { + "epoch": 1.5155639571518589, + "grad_norm": 0.17736924822640682, + "learning_rate": 5.03625502579285e-07, + "loss": 0.27, + "step": 3011 + }, + { + "epoch": 1.5160680529300568, + "grad_norm": 0.17990063325148373, + "learning_rate": 5.033569510293406e-07, + "loss": 0.2742, + "step": 3012 + }, + { + "epoch": 1.5165721487082546, + "grad_norm": 0.17269683446652181, + "learning_rate": 5.030883985109367e-07, + "loss": 0.2718, + "step": 3013 + }, + { + "epoch": 1.5170762444864523, + "grad_norm": 0.1713114478517227, + "learning_rate": 5.028198451015488e-07, + "loss": 0.292, + "step": 3014 + }, + { + "epoch": 1.5175803402646504, + "grad_norm": 0.17237373122427047, + "learning_rate": 5.025512908786531e-07, + "loss": 0.2708, + "step": 3015 + }, + { + "epoch": 1.518084436042848, + "grad_norm": 0.17979894721140827, + "learning_rate": 5.022827359197259e-07, + "loss": 0.2749, + "step": 3016 + }, + { + "epoch": 1.5185885318210461, + "grad_norm": 0.18029323092096203, + "learning_rate": 5.020141803022435e-07, + "loss": 0.2718, + "step": 3017 + }, + { + "epoch": 1.5190926275992438, + "grad_norm": 0.1709949058851969, + "learning_rate": 5.017456241036826e-07, + "loss": 0.2842, + "step": 3018 + }, + { + "epoch": 1.5195967233774417, + "grad_norm": 0.17615800092590866, + "learning_rate": 5.014770674015199e-07, + "loss": 0.2556, + "step": 3019 + }, + { + "epoch": 1.5201008191556395, + "grad_norm": 0.17565842258793046, + "learning_rate": 5.012085102732323e-07, + "loss": 0.2704, + "step": 3020 + }, + { + "epoch": 1.5206049149338374, + "grad_norm": 0.1706652936469358, + "learning_rate": 5.00939952796297e-07, + "loss": 0.2751, + "step": 3021 + }, + { + "epoch": 1.5211090107120353, + "grad_norm": 0.17904301457889313, + "learning_rate": 5.006713950481911e-07, + "loss": 0.2708, + "step": 3022 + }, + { + "epoch": 1.5216131064902332, + "grad_norm": 0.19348962154742488, + "learning_rate": 5.00402837106392e-07, + "loss": 0.2691, + "step": 3023 + }, + { + "epoch": 1.522117202268431, + "grad_norm": 0.17760169438436854, + "learning_rate": 5.001342790483769e-07, + "loss": 0.2812, + "step": 3024 + }, + { + "epoch": 1.5226212980466287, + "grad_norm": 0.1874622214559223, + "learning_rate": 4.998657209516231e-07, + "loss": 0.2703, + "step": 3025 + }, + { + "epoch": 1.5231253938248268, + "grad_norm": 0.19774473741326665, + "learning_rate": 4.995971628936078e-07, + "loss": 0.2642, + "step": 3026 + }, + { + "epoch": 1.5236294896030245, + "grad_norm": 0.19986370176179066, + "learning_rate": 4.993286049518088e-07, + "loss": 0.2949, + "step": 3027 + }, + { + "epoch": 1.5241335853812226, + "grad_norm": 0.16961474491016765, + "learning_rate": 4.990600472037029e-07, + "loss": 0.2767, + "step": 3028 + }, + { + "epoch": 1.5246376811594202, + "grad_norm": 0.18853446766118223, + "learning_rate": 4.987914897267678e-07, + "loss": 0.2781, + "step": 3029 + }, + { + "epoch": 1.525141776937618, + "grad_norm": 0.16987515007270199, + "learning_rate": 4.985229325984803e-07, + "loss": 0.2882, + "step": 3030 + }, + { + "epoch": 1.525645872715816, + "grad_norm": 0.1714486129335063, + "learning_rate": 4.982543758963174e-07, + "loss": 0.2704, + "step": 3031 + }, + { + "epoch": 1.5261499684940139, + "grad_norm": 0.16809069324008713, + "learning_rate": 4.979858196977566e-07, + "loss": 0.2591, + "step": 3032 + }, + { + "epoch": 1.5266540642722117, + "grad_norm": 0.1738515169060616, + "learning_rate": 4.977172640802741e-07, + "loss": 0.2582, + "step": 3033 + }, + { + "epoch": 1.5271581600504096, + "grad_norm": 0.17664113554519453, + "learning_rate": 4.974487091213469e-07, + "loss": 0.267, + "step": 3034 + }, + { + "epoch": 1.5276622558286075, + "grad_norm": 0.1916027084703846, + "learning_rate": 4.971801548984511e-07, + "loss": 0.2731, + "step": 3035 + }, + { + "epoch": 1.5281663516068051, + "grad_norm": 0.17313830459160134, + "learning_rate": 4.969116014890634e-07, + "loss": 0.2806, + "step": 3036 + }, + { + "epoch": 1.5286704473850032, + "grad_norm": 0.20413150350420245, + "learning_rate": 4.966430489706594e-07, + "loss": 0.2745, + "step": 3037 + }, + { + "epoch": 1.529174543163201, + "grad_norm": 0.1747943524307337, + "learning_rate": 4.96374497420715e-07, + "loss": 0.2634, + "step": 3038 + }, + { + "epoch": 1.529678638941399, + "grad_norm": 0.1728937587667476, + "learning_rate": 4.961059469167056e-07, + "loss": 0.2814, + "step": 3039 + }, + { + "epoch": 1.5301827347195966, + "grad_norm": 0.1743204044600501, + "learning_rate": 4.958373975361063e-07, + "loss": 0.2551, + "step": 3040 + }, + { + "epoch": 1.5306868304977945, + "grad_norm": 0.17502392690878435, + "learning_rate": 4.955688493563922e-07, + "loss": 0.267, + "step": 3041 + }, + { + "epoch": 1.5311909262759924, + "grad_norm": 0.19500065101179812, + "learning_rate": 4.953003024550375e-07, + "loss": 0.2819, + "step": 3042 + }, + { + "epoch": 1.5316950220541903, + "grad_norm": 0.17254980716871499, + "learning_rate": 4.950317569095166e-07, + "loss": 0.2649, + "step": 3043 + }, + { + "epoch": 1.5321991178323882, + "grad_norm": 0.1713124294788534, + "learning_rate": 4.94763212797303e-07, + "loss": 0.2563, + "step": 3044 + }, + { + "epoch": 1.532703213610586, + "grad_norm": 0.17577986512817073, + "learning_rate": 4.944946701958698e-07, + "loss": 0.273, + "step": 3045 + }, + { + "epoch": 1.533207309388784, + "grad_norm": 0.1701862456963333, + "learning_rate": 4.942261291826905e-07, + "loss": 0.2814, + "step": 3046 + }, + { + "epoch": 1.5337114051669816, + "grad_norm": 0.18127900976396943, + "learning_rate": 4.93957589835237e-07, + "loss": 0.2713, + "step": 3047 + }, + { + "epoch": 1.5342155009451797, + "grad_norm": 0.17254874589059224, + "learning_rate": 4.936890522309815e-07, + "loss": 0.2734, + "step": 3048 + }, + { + "epoch": 1.5347195967233773, + "grad_norm": 0.1717770510577077, + "learning_rate": 4.934205164473952e-07, + "loss": 0.2773, + "step": 3049 + }, + { + "epoch": 1.5352236925015754, + "grad_norm": 0.1908978528330254, + "learning_rate": 4.931519825619493e-07, + "loss": 0.2788, + "step": 3050 + }, + { + "epoch": 1.535727788279773, + "grad_norm": 0.171288059642186, + "learning_rate": 4.928834506521138e-07, + "loss": 0.2703, + "step": 3051 + }, + { + "epoch": 1.5362318840579712, + "grad_norm": 0.1675001073197896, + "learning_rate": 4.926149207953588e-07, + "loss": 0.2642, + "step": 3052 + }, + { + "epoch": 1.5367359798361688, + "grad_norm": 0.17021689322972322, + "learning_rate": 4.923463930691535e-07, + "loss": 0.2702, + "step": 3053 + }, + { + "epoch": 1.5372400756143667, + "grad_norm": 0.17371445641850253, + "learning_rate": 4.920778675509662e-07, + "loss": 0.2589, + "step": 3054 + }, + { + "epoch": 1.5377441713925646, + "grad_norm": 0.17478774431954305, + "learning_rate": 4.918093443182652e-07, + "loss": 0.2803, + "step": 3055 + }, + { + "epoch": 1.5382482671707625, + "grad_norm": 0.17591722779410152, + "learning_rate": 4.915408234485175e-07, + "loss": 0.2897, + "step": 3056 + }, + { + "epoch": 1.5387523629489603, + "grad_norm": 0.17280009330380483, + "learning_rate": 4.912723050191899e-07, + "loss": 0.2717, + "step": 3057 + }, + { + "epoch": 1.539256458727158, + "grad_norm": 0.16969841106908495, + "learning_rate": 4.910037891077482e-07, + "loss": 0.2602, + "step": 3058 + }, + { + "epoch": 1.539760554505356, + "grad_norm": 0.16865488414462979, + "learning_rate": 4.907352757916577e-07, + "loss": 0.2724, + "step": 3059 + }, + { + "epoch": 1.5402646502835537, + "grad_norm": 0.17584292304841348, + "learning_rate": 4.904667651483828e-07, + "loss": 0.2742, + "step": 3060 + }, + { + "epoch": 1.5407687460617518, + "grad_norm": 0.16673800323953514, + "learning_rate": 4.901982572553869e-07, + "loss": 0.2695, + "step": 3061 + }, + { + "epoch": 1.5412728418399495, + "grad_norm": 0.18326131122744185, + "learning_rate": 4.899297521901333e-07, + "loss": 0.2789, + "step": 3062 + }, + { + "epoch": 1.5417769376181476, + "grad_norm": 0.18805933934917177, + "learning_rate": 4.896612500300835e-07, + "loss": 0.272, + "step": 3063 + }, + { + "epoch": 1.5422810333963453, + "grad_norm": 0.1712806706085473, + "learning_rate": 4.893927508526993e-07, + "loss": 0.2669, + "step": 3064 + }, + { + "epoch": 1.5427851291745431, + "grad_norm": 0.1731870501531892, + "learning_rate": 4.891242547354406e-07, + "loss": 0.2812, + "step": 3065 + }, + { + "epoch": 1.543289224952741, + "grad_norm": 0.17506473097896597, + "learning_rate": 4.888557617557672e-07, + "loss": 0.2813, + "step": 3066 + }, + { + "epoch": 1.543793320730939, + "grad_norm": 0.17307157635246334, + "learning_rate": 4.885872719911375e-07, + "loss": 0.2695, + "step": 3067 + }, + { + "epoch": 1.5442974165091368, + "grad_norm": 0.17106830633462392, + "learning_rate": 4.883187855190089e-07, + "loss": 0.264, + "step": 3068 + }, + { + "epoch": 1.5448015122873346, + "grad_norm": 0.17296030737339352, + "learning_rate": 4.880503024168384e-07, + "loss": 0.2776, + "step": 3069 + }, + { + "epoch": 1.5453056080655325, + "grad_norm": 0.17398895118716073, + "learning_rate": 4.877818227620816e-07, + "loss": 0.2641, + "step": 3070 + }, + { + "epoch": 1.5458097038437302, + "grad_norm": 0.16983527604252388, + "learning_rate": 4.875133466321934e-07, + "loss": 0.2791, + "step": 3071 + }, + { + "epoch": 1.5463137996219283, + "grad_norm": 0.17446130630330534, + "learning_rate": 4.872448741046272e-07, + "loss": 0.2709, + "step": 3072 + }, + { + "epoch": 1.546817895400126, + "grad_norm": 0.17284082991627042, + "learning_rate": 4.86976405256836e-07, + "loss": 0.2671, + "step": 3073 + }, + { + "epoch": 1.547321991178324, + "grad_norm": 0.182632054641719, + "learning_rate": 4.867079401662711e-07, + "loss": 0.2776, + "step": 3074 + }, + { + "epoch": 1.5478260869565217, + "grad_norm": 0.17300501368762167, + "learning_rate": 4.864394789103829e-07, + "loss": 0.2794, + "step": 3075 + }, + { + "epoch": 1.5483301827347196, + "grad_norm": 0.17667332011696144, + "learning_rate": 4.861710215666213e-07, + "loss": 0.2853, + "step": 3076 + }, + { + "epoch": 1.5488342785129174, + "grad_norm": 0.16996586627278085, + "learning_rate": 4.859025682124341e-07, + "loss": 0.2786, + "step": 3077 + }, + { + "epoch": 1.5493383742911153, + "grad_norm": 0.17057715081373995, + "learning_rate": 4.85634118925269e-07, + "loss": 0.2691, + "step": 3078 + }, + { + "epoch": 1.5498424700693132, + "grad_norm": 0.17161445251025925, + "learning_rate": 4.853656737825713e-07, + "loss": 0.283, + "step": 3079 + }, + { + "epoch": 1.550346565847511, + "grad_norm": 0.17675037956109935, + "learning_rate": 4.850972328617863e-07, + "loss": 0.2721, + "step": 3080 + }, + { + "epoch": 1.550850661625709, + "grad_norm": 0.17208347939492113, + "learning_rate": 4.848287962403571e-07, + "loss": 0.2653, + "step": 3081 + }, + { + "epoch": 1.5513547574039066, + "grad_norm": 0.17004686976042202, + "learning_rate": 4.845603639957263e-07, + "loss": 0.2709, + "step": 3082 + }, + { + "epoch": 1.5518588531821047, + "grad_norm": 0.17995963542830856, + "learning_rate": 4.842919362053348e-07, + "loss": 0.265, + "step": 3083 + }, + { + "epoch": 1.5523629489603024, + "grad_norm": 0.18099808248082644, + "learning_rate": 4.840235129466222e-07, + "loss": 0.2651, + "step": 3084 + }, + { + "epoch": 1.5528670447385005, + "grad_norm": 0.17473760103531463, + "learning_rate": 4.837550942970275e-07, + "loss": 0.2841, + "step": 3085 + }, + { + "epoch": 1.5533711405166981, + "grad_norm": 0.19493429052326036, + "learning_rate": 4.834866803339872e-07, + "loss": 0.2789, + "step": 3086 + }, + { + "epoch": 1.553875236294896, + "grad_norm": 0.174886487259404, + "learning_rate": 4.832182711349374e-07, + "loss": 0.2897, + "step": 3087 + }, + { + "epoch": 1.5543793320730939, + "grad_norm": 0.1873744343078876, + "learning_rate": 4.829498667773126e-07, + "loss": 0.2596, + "step": 3088 + }, + { + "epoch": 1.5548834278512917, + "grad_norm": 0.1776819518952955, + "learning_rate": 4.826814673385454e-07, + "loss": 0.2592, + "step": 3089 + }, + { + "epoch": 1.5553875236294896, + "grad_norm": 0.17480879943963698, + "learning_rate": 4.824130728960677e-07, + "loss": 0.2854, + "step": 3090 + }, + { + "epoch": 1.5558916194076875, + "grad_norm": 0.17863519439054704, + "learning_rate": 4.821446835273093e-07, + "loss": 0.2848, + "step": 3091 + }, + { + "epoch": 1.5563957151858854, + "grad_norm": 0.20186594037979866, + "learning_rate": 4.818762993096994e-07, + "loss": 0.2742, + "step": 3092 + }, + { + "epoch": 1.556899810964083, + "grad_norm": 0.17253352705225278, + "learning_rate": 4.816079203206648e-07, + "loss": 0.2735, + "step": 3093 + }, + { + "epoch": 1.5574039067422811, + "grad_norm": 0.17021429772016514, + "learning_rate": 4.813395466376311e-07, + "loss": 0.2757, + "step": 3094 + }, + { + "epoch": 1.5579080025204788, + "grad_norm": 0.17831771845073935, + "learning_rate": 4.810711783380227e-07, + "loss": 0.2899, + "step": 3095 + }, + { + "epoch": 1.5584120982986769, + "grad_norm": 0.17539283313588858, + "learning_rate": 4.808028154992622e-07, + "loss": 0.2803, + "step": 3096 + }, + { + "epoch": 1.5589161940768745, + "grad_norm": 0.17570461231863696, + "learning_rate": 4.805344581987704e-07, + "loss": 0.2702, + "step": 3097 + }, + { + "epoch": 1.5594202898550724, + "grad_norm": 0.18256077525124403, + "learning_rate": 4.802661065139667e-07, + "loss": 0.279, + "step": 3098 + }, + { + "epoch": 1.5599243856332703, + "grad_norm": 0.1773908250652285, + "learning_rate": 4.799977605222693e-07, + "loss": 0.283, + "step": 3099 + }, + { + "epoch": 1.5604284814114682, + "grad_norm": 0.17701100191661834, + "learning_rate": 4.797294203010939e-07, + "loss": 0.2796, + "step": 3100 + }, + { + "epoch": 1.560932577189666, + "grad_norm": 0.17106773731405026, + "learning_rate": 4.794610859278552e-07, + "loss": 0.2669, + "step": 3101 + }, + { + "epoch": 1.561436672967864, + "grad_norm": 0.17235342667180142, + "learning_rate": 4.791927574799659e-07, + "loss": 0.2642, + "step": 3102 + }, + { + "epoch": 1.5619407687460618, + "grad_norm": 0.1791600844603549, + "learning_rate": 4.789244350348374e-07, + "loss": 0.2687, + "step": 3103 + }, + { + "epoch": 1.5624448645242595, + "grad_norm": 0.17661856365985878, + "learning_rate": 4.786561186698788e-07, + "loss": 0.2794, + "step": 3104 + }, + { + "epoch": 1.5629489603024576, + "grad_norm": 0.1755295092888704, + "learning_rate": 4.783878084624975e-07, + "loss": 0.2747, + "step": 3105 + }, + { + "epoch": 1.5634530560806552, + "grad_norm": 0.19305165095057605, + "learning_rate": 4.781195044900998e-07, + "loss": 0.2787, + "step": 3106 + }, + { + "epoch": 1.5639571518588533, + "grad_norm": 0.18063804583238724, + "learning_rate": 4.778512068300893e-07, + "loss": 0.2754, + "step": 3107 + }, + { + "epoch": 1.564461247637051, + "grad_norm": 0.18403206850653575, + "learning_rate": 4.775829155598686e-07, + "loss": 0.2778, + "step": 3108 + }, + { + "epoch": 1.5649653434152488, + "grad_norm": 0.1700956199275683, + "learning_rate": 4.773146307568379e-07, + "loss": 0.2727, + "step": 3109 + }, + { + "epoch": 1.5654694391934467, + "grad_norm": 0.18086574577750156, + "learning_rate": 4.770463524983956e-07, + "loss": 0.2912, + "step": 3110 + }, + { + "epoch": 1.5659735349716446, + "grad_norm": 0.17147115049908643, + "learning_rate": 4.7677808086193854e-07, + "loss": 0.27, + "step": 3111 + }, + { + "epoch": 1.5664776307498425, + "grad_norm": 0.1738895601221236, + "learning_rate": 4.7650981592486123e-07, + "loss": 0.2673, + "step": 3112 + }, + { + "epoch": 1.5669817265280404, + "grad_norm": 0.180723254015497, + "learning_rate": 4.7624155776455647e-07, + "loss": 0.2633, + "step": 3113 + }, + { + "epoch": 1.5674858223062382, + "grad_norm": 0.175819284778621, + "learning_rate": 4.7597330645841515e-07, + "loss": 0.2582, + "step": 3114 + }, + { + "epoch": 1.5679899180844359, + "grad_norm": 0.1753712910797449, + "learning_rate": 4.757050620838262e-07, + "loss": 0.2632, + "step": 3115 + }, + { + "epoch": 1.568494013862634, + "grad_norm": 0.17369096100282744, + "learning_rate": 4.754368247181761e-07, + "loss": 0.2798, + "step": 3116 + }, + { + "epoch": 1.5689981096408316, + "grad_norm": 0.17849770678240312, + "learning_rate": 4.751685944388501e-07, + "loss": 0.2842, + "step": 3117 + }, + { + "epoch": 1.5695022054190297, + "grad_norm": 0.19881328567873902, + "learning_rate": 4.749003713232308e-07, + "loss": 0.2733, + "step": 3118 + }, + { + "epoch": 1.5700063011972274, + "grad_norm": 0.1774218478893288, + "learning_rate": 4.7463215544869865e-07, + "loss": 0.2863, + "step": 3119 + }, + { + "epoch": 1.5705103969754255, + "grad_norm": 0.1733835174988626, + "learning_rate": 4.743639468926328e-07, + "loss": 0.2681, + "step": 3120 + }, + { + "epoch": 1.5710144927536231, + "grad_norm": 0.1736480466599747, + "learning_rate": 4.740957457324092e-07, + "loss": 0.2584, + "step": 3121 + }, + { + "epoch": 1.571518588531821, + "grad_norm": 0.17950796056364546, + "learning_rate": 4.738275520454027e-07, + "loss": 0.2508, + "step": 3122 + }, + { + "epoch": 1.572022684310019, + "grad_norm": 0.1778356015662689, + "learning_rate": 4.735593659089851e-07, + "loss": 0.2576, + "step": 3123 + }, + { + "epoch": 1.5725267800882168, + "grad_norm": 0.1687063104049516, + "learning_rate": 4.732911874005269e-07, + "loss": 0.2559, + "step": 3124 + }, + { + "epoch": 1.5730308758664147, + "grad_norm": 0.17493624040875075, + "learning_rate": 4.7302301659739547e-07, + "loss": 0.2648, + "step": 3125 + }, + { + "epoch": 1.5735349716446123, + "grad_norm": 0.17507468008329716, + "learning_rate": 4.7275485357695673e-07, + "loss": 0.2568, + "step": 3126 + }, + { + "epoch": 1.5740390674228104, + "grad_norm": 0.17975235409806142, + "learning_rate": 4.7248669841657404e-07, + "loss": 0.2776, + "step": 3127 + }, + { + "epoch": 1.574543163201008, + "grad_norm": 0.17454934150306223, + "learning_rate": 4.7221855119360824e-07, + "loss": 0.2681, + "step": 3128 + }, + { + "epoch": 1.5750472589792062, + "grad_norm": 0.18520223540546543, + "learning_rate": 4.7195041198541854e-07, + "loss": 0.2727, + "step": 3129 + }, + { + "epoch": 1.5755513547574038, + "grad_norm": 0.17055487749470938, + "learning_rate": 4.7168228086936096e-07, + "loss": 0.2692, + "step": 3130 + }, + { + "epoch": 1.576055450535602, + "grad_norm": 0.1751981137839451, + "learning_rate": 4.7141415792279015e-07, + "loss": 0.2754, + "step": 3131 + }, + { + "epoch": 1.5765595463137996, + "grad_norm": 0.180062722006383, + "learning_rate": 4.7114604322305747e-07, + "loss": 0.264, + "step": 3132 + }, + { + "epoch": 1.5770636420919975, + "grad_norm": 0.17530448742608076, + "learning_rate": 4.708779368475128e-07, + "loss": 0.2875, + "step": 3133 + }, + { + "epoch": 1.5775677378701953, + "grad_norm": 0.1716762643727378, + "learning_rate": 4.70609838873503e-07, + "loss": 0.27, + "step": 3134 + }, + { + "epoch": 1.5780718336483932, + "grad_norm": 0.17341353040640353, + "learning_rate": 4.703417493783723e-07, + "loss": 0.2623, + "step": 3135 + }, + { + "epoch": 1.578575929426591, + "grad_norm": 0.2080724128483008, + "learning_rate": 4.700736684394635e-07, + "loss": 0.2823, + "step": 3136 + }, + { + "epoch": 1.579080025204789, + "grad_norm": 0.17039241780253916, + "learning_rate": 4.6980559613411576e-07, + "loss": 0.2722, + "step": 3137 + }, + { + "epoch": 1.5795841209829868, + "grad_norm": 0.1742720965174898, + "learning_rate": 4.695375325396666e-07, + "loss": 0.2745, + "step": 3138 + }, + { + "epoch": 1.5800882167611845, + "grad_norm": 0.17895066474047672, + "learning_rate": 4.692694777334505e-07, + "loss": 0.2664, + "step": 3139 + }, + { + "epoch": 1.5805923125393826, + "grad_norm": 0.1779553651912266, + "learning_rate": 4.6900143179279984e-07, + "loss": 0.2729, + "step": 3140 + }, + { + "epoch": 1.5810964083175802, + "grad_norm": 0.1757573994664284, + "learning_rate": 4.687333947950441e-07, + "loss": 0.2628, + "step": 3141 + }, + { + "epoch": 1.5816005040957783, + "grad_norm": 0.17335221224324718, + "learning_rate": 4.684653668175102e-07, + "loss": 0.2564, + "step": 3142 + }, + { + "epoch": 1.582104599873976, + "grad_norm": 0.17526400475703355, + "learning_rate": 4.6819734793752257e-07, + "loss": 0.2724, + "step": 3143 + }, + { + "epoch": 1.5826086956521739, + "grad_norm": 0.17495007637044283, + "learning_rate": 4.679293382324031e-07, + "loss": 0.2844, + "step": 3144 + }, + { + "epoch": 1.5831127914303718, + "grad_norm": 0.17017524004684992, + "learning_rate": 4.67661337779471e-07, + "loss": 0.2772, + "step": 3145 + }, + { + "epoch": 1.5836168872085696, + "grad_norm": 0.17206013880701826, + "learning_rate": 4.6739334665604234e-07, + "loss": 0.2839, + "step": 3146 + }, + { + "epoch": 1.5841209829867675, + "grad_norm": 0.16939135952437226, + "learning_rate": 4.671253649394315e-07, + "loss": 0.2684, + "step": 3147 + }, + { + "epoch": 1.5846250787649654, + "grad_norm": 0.18799473978425724, + "learning_rate": 4.668573927069491e-07, + "loss": 0.2622, + "step": 3148 + }, + { + "epoch": 1.5851291745431633, + "grad_norm": 0.17747869099450092, + "learning_rate": 4.665894300359035e-07, + "loss": 0.2616, + "step": 3149 + }, + { + "epoch": 1.585633270321361, + "grad_norm": 0.17806445921975966, + "learning_rate": 4.6632147700360055e-07, + "loss": 0.2607, + "step": 3150 + }, + { + "epoch": 1.586137366099559, + "grad_norm": 0.17372413070642495, + "learning_rate": 4.6605353368734265e-07, + "loss": 0.2709, + "step": 3151 + }, + { + "epoch": 1.5866414618777567, + "grad_norm": 0.16458604740808605, + "learning_rate": 4.6578560016443013e-07, + "loss": 0.26, + "step": 3152 + }, + { + "epoch": 1.5871455576559548, + "grad_norm": 0.17882994895541682, + "learning_rate": 4.655176765121599e-07, + "loss": 0.27, + "step": 3153 + }, + { + "epoch": 1.5876496534341524, + "grad_norm": 0.17487944942600261, + "learning_rate": 4.652497628078266e-07, + "loss": 0.2579, + "step": 3154 + }, + { + "epoch": 1.5881537492123503, + "grad_norm": 0.17194013515573972, + "learning_rate": 4.6498185912872137e-07, + "loss": 0.2749, + "step": 3155 + }, + { + "epoch": 1.5886578449905482, + "grad_norm": 0.17344590975757027, + "learning_rate": 4.6471396555213273e-07, + "loss": 0.2625, + "step": 3156 + }, + { + "epoch": 1.589161940768746, + "grad_norm": 0.17496373280977637, + "learning_rate": 4.6444608215534657e-07, + "loss": 0.2715, + "step": 3157 + }, + { + "epoch": 1.589666036546944, + "grad_norm": 0.17877746835344072, + "learning_rate": 4.641782090156454e-07, + "loss": 0.2544, + "step": 3158 + }, + { + "epoch": 1.5901701323251418, + "grad_norm": 0.18160946077780746, + "learning_rate": 4.6391034621030903e-07, + "loss": 0.271, + "step": 3159 + }, + { + "epoch": 1.5906742281033397, + "grad_norm": 0.17284059925719977, + "learning_rate": 4.636424938166142e-07, + "loss": 0.2754, + "step": 3160 + }, + { + "epoch": 1.5911783238815373, + "grad_norm": 0.17028743922696857, + "learning_rate": 4.633746519118348e-07, + "loss": 0.263, + "step": 3161 + }, + { + "epoch": 1.5916824196597354, + "grad_norm": 0.1747381029040225, + "learning_rate": 4.631068205732413e-07, + "loss": 0.2684, + "step": 3162 + }, + { + "epoch": 1.592186515437933, + "grad_norm": 0.18722584514120708, + "learning_rate": 4.6283899987810164e-07, + "loss": 0.2854, + "step": 3163 + }, + { + "epoch": 1.5926906112161312, + "grad_norm": 0.16942535047104845, + "learning_rate": 4.6257118990368036e-07, + "loss": 0.2793, + "step": 3164 + }, + { + "epoch": 1.5931947069943289, + "grad_norm": 0.16883072945164623, + "learning_rate": 4.6230339072723874e-07, + "loss": 0.2707, + "step": 3165 + }, + { + "epoch": 1.5936988027725267, + "grad_norm": 0.17706329109752933, + "learning_rate": 4.6203560242603556e-07, + "loss": 0.2726, + "step": 3166 + }, + { + "epoch": 1.5942028985507246, + "grad_norm": 0.1686584353657218, + "learning_rate": 4.617678250773256e-07, + "loss": 0.2666, + "step": 3167 + }, + { + "epoch": 1.5947069943289225, + "grad_norm": 0.17166944845187698, + "learning_rate": 4.615000587583616e-07, + "loss": 0.2692, + "step": 3168 + }, + { + "epoch": 1.5952110901071204, + "grad_norm": 0.17764393811133042, + "learning_rate": 4.6123230354639194e-07, + "loss": 0.2578, + "step": 3169 + }, + { + "epoch": 1.5957151858853182, + "grad_norm": 0.18326712297594602, + "learning_rate": 4.6096455951866277e-07, + "loss": 0.2783, + "step": 3170 + }, + { + "epoch": 1.5962192816635161, + "grad_norm": 0.16700467021445017, + "learning_rate": 4.6069682675241626e-07, + "loss": 0.2729, + "step": 3171 + }, + { + "epoch": 1.5967233774417138, + "grad_norm": 0.17742240847400476, + "learning_rate": 4.6042910532489165e-07, + "loss": 0.2702, + "step": 3172 + }, + { + "epoch": 1.5972274732199119, + "grad_norm": 0.18272675092406107, + "learning_rate": 4.601613953133252e-07, + "loss": 0.2776, + "step": 3173 + }, + { + "epoch": 1.5977315689981095, + "grad_norm": 0.18849423602774695, + "learning_rate": 4.5989369679494935e-07, + "loss": 0.2831, + "step": 3174 + }, + { + "epoch": 1.5982356647763076, + "grad_norm": 0.19311234812321373, + "learning_rate": 4.5962600984699364e-07, + "loss": 0.271, + "step": 3175 + }, + { + "epoch": 1.5987397605545053, + "grad_norm": 0.17325148338292742, + "learning_rate": 4.593583345466837e-07, + "loss": 0.269, + "step": 3176 + }, + { + "epoch": 1.5992438563327032, + "grad_norm": 0.17501523218289874, + "learning_rate": 4.590906709712427e-07, + "loss": 0.2683, + "step": 3177 + }, + { + "epoch": 1.599747952110901, + "grad_norm": 0.18641330474821174, + "learning_rate": 4.588230191978898e-07, + "loss": 0.2681, + "step": 3178 + }, + { + "epoch": 1.600252047889099, + "grad_norm": 0.17265182176031832, + "learning_rate": 4.585553793038405e-07, + "loss": 0.2799, + "step": 3179 + }, + { + "epoch": 1.6007561436672968, + "grad_norm": 0.17101509452047053, + "learning_rate": 4.582877513663077e-07, + "loss": 0.2653, + "step": 3180 + }, + { + "epoch": 1.6012602394454947, + "grad_norm": 0.172204707085013, + "learning_rate": 4.5802013546250014e-07, + "loss": 0.2797, + "step": 3181 + }, + { + "epoch": 1.6017643352236925, + "grad_norm": 0.17967573980986776, + "learning_rate": 4.577525316696236e-07, + "loss": 0.262, + "step": 3182 + }, + { + "epoch": 1.6022684310018902, + "grad_norm": 0.17469624599365358, + "learning_rate": 4.574849400648797e-07, + "loss": 0.2636, + "step": 3183 + }, + { + "epoch": 1.6027725267800883, + "grad_norm": 0.1743708880339133, + "learning_rate": 4.5721736072546754e-07, + "loss": 0.2649, + "step": 3184 + }, + { + "epoch": 1.6027725267800883, + "eval_loss": 0.307079017162323, + "eval_runtime": 18.6674, + "eval_samples_per_second": 45.802, + "eval_steps_per_second": 0.964, + "step": 3184 + }, + { + "epoch": 1.603276622558286, + "grad_norm": 0.18603703776161118, + "learning_rate": 4.569497937285817e-07, + "loss": 0.2849, + "step": 3185 + }, + { + "epoch": 1.603780718336484, + "grad_norm": 0.1746753818130289, + "learning_rate": 4.566822391514135e-07, + "loss": 0.2644, + "step": 3186 + }, + { + "epoch": 1.6042848141146817, + "grad_norm": 0.19466418253252832, + "learning_rate": 4.5641469707115123e-07, + "loss": 0.2719, + "step": 3187 + }, + { + "epoch": 1.6047889098928798, + "grad_norm": 0.19722375083757057, + "learning_rate": 4.5614716756497856e-07, + "loss": 0.2765, + "step": 3188 + }, + { + "epoch": 1.6052930056710775, + "grad_norm": 0.17956583472244503, + "learning_rate": 4.5587965071007664e-07, + "loss": 0.2806, + "step": 3189 + }, + { + "epoch": 1.6057971014492753, + "grad_norm": 0.196228627957389, + "learning_rate": 4.556121465836221e-07, + "loss": 0.2753, + "step": 3190 + }, + { + "epoch": 1.6063011972274732, + "grad_norm": 0.1689632051276557, + "learning_rate": 4.553446552627884e-07, + "loss": 0.2714, + "step": 3191 + }, + { + "epoch": 1.606805293005671, + "grad_norm": 0.17456704042056104, + "learning_rate": 4.5507717682474475e-07, + "loss": 0.2732, + "step": 3192 + }, + { + "epoch": 1.607309388783869, + "grad_norm": 0.18246575730580003, + "learning_rate": 4.5480971134665765e-07, + "loss": 0.2684, + "step": 3193 + }, + { + "epoch": 1.6078134845620666, + "grad_norm": 0.16994370889715202, + "learning_rate": 4.545422589056888e-07, + "loss": 0.2581, + "step": 3194 + }, + { + "epoch": 1.6083175803402647, + "grad_norm": 0.1681073205628179, + "learning_rate": 4.5427481957899643e-07, + "loss": 0.2746, + "step": 3195 + }, + { + "epoch": 1.6088216761184624, + "grad_norm": 0.1935072839716343, + "learning_rate": 4.540073934437356e-07, + "loss": 0.2728, + "step": 3196 + }, + { + "epoch": 1.6093257718966605, + "grad_norm": 0.16743041342533738, + "learning_rate": 4.5373998057705667e-07, + "loss": 0.2709, + "step": 3197 + }, + { + "epoch": 1.6098298676748581, + "grad_norm": 0.18397921239376722, + "learning_rate": 4.53472581056107e-07, + "loss": 0.2801, + "step": 3198 + }, + { + "epoch": 1.6103339634530562, + "grad_norm": 0.17331537650260462, + "learning_rate": 4.5320519495802915e-07, + "loss": 0.2695, + "step": 3199 + }, + { + "epoch": 1.610838059231254, + "grad_norm": 0.17081500880990227, + "learning_rate": 4.5293782235996303e-07, + "loss": 0.271, + "step": 3200 + }, + { + "epoch": 1.6113421550094518, + "grad_norm": 0.17923914190076334, + "learning_rate": 4.526704633390435e-07, + "loss": 0.265, + "step": 3201 + }, + { + "epoch": 1.6118462507876496, + "grad_norm": 0.17772987495851134, + "learning_rate": 4.52403117972402e-07, + "loss": 0.2783, + "step": 3202 + }, + { + "epoch": 1.6123503465658475, + "grad_norm": 0.18238344872837062, + "learning_rate": 4.5213578633716627e-07, + "loss": 0.2756, + "step": 3203 + }, + { + "epoch": 1.6128544423440454, + "grad_norm": 0.17223510416546425, + "learning_rate": 4.5186846851045957e-07, + "loss": 0.2561, + "step": 3204 + }, + { + "epoch": 1.6133585381222433, + "grad_norm": 0.17827644377200683, + "learning_rate": 4.516011645694016e-07, + "loss": 0.2741, + "step": 3205 + }, + { + "epoch": 1.6138626339004412, + "grad_norm": 0.16814074106631602, + "learning_rate": 4.513338745911078e-07, + "loss": 0.2675, + "step": 3206 + }, + { + "epoch": 1.6143667296786388, + "grad_norm": 0.1774956008916992, + "learning_rate": 4.5106659865268973e-07, + "loss": 0.2697, + "step": 3207 + }, + { + "epoch": 1.614870825456837, + "grad_norm": 0.18296109118162968, + "learning_rate": 4.507993368312548e-07, + "loss": 0.2689, + "step": 3208 + }, + { + "epoch": 1.6153749212350346, + "grad_norm": 0.1858855435073671, + "learning_rate": 4.5053208920390646e-07, + "loss": 0.2669, + "step": 3209 + }, + { + "epoch": 1.6158790170132327, + "grad_norm": 0.17175203740808448, + "learning_rate": 4.5026485584774397e-07, + "loss": 0.2781, + "step": 3210 + }, + { + "epoch": 1.6163831127914303, + "grad_norm": 0.1792404257748635, + "learning_rate": 4.499976368398623e-07, + "loss": 0.2651, + "step": 3211 + }, + { + "epoch": 1.6168872085696282, + "grad_norm": 0.17147755827754907, + "learning_rate": 4.497304322573529e-07, + "loss": 0.2768, + "step": 3212 + }, + { + "epoch": 1.617391304347826, + "grad_norm": 0.1689049644593327, + "learning_rate": 4.494632421773021e-07, + "loss": 0.2644, + "step": 3213 + }, + { + "epoch": 1.617895400126024, + "grad_norm": 0.17249930136505048, + "learning_rate": 4.4919606667679314e-07, + "loss": 0.2695, + "step": 3214 + }, + { + "epoch": 1.6183994959042218, + "grad_norm": 0.17187403315174393, + "learning_rate": 4.489289058329042e-07, + "loss": 0.2753, + "step": 3215 + }, + { + "epoch": 1.6189035916824197, + "grad_norm": 0.17003988333355505, + "learning_rate": 4.4866175972270934e-07, + "loss": 0.2773, + "step": 3216 + }, + { + "epoch": 1.6194076874606176, + "grad_norm": 0.174941393420484, + "learning_rate": 4.4839462842327905e-07, + "loss": 0.2624, + "step": 3217 + }, + { + "epoch": 1.6199117832388152, + "grad_norm": 0.16731226660363765, + "learning_rate": 4.481275120116785e-07, + "loss": 0.2787, + "step": 3218 + }, + { + "epoch": 1.6204158790170133, + "grad_norm": 0.18079153758648603, + "learning_rate": 4.478604105649697e-07, + "loss": 0.2649, + "step": 3219 + }, + { + "epoch": 1.620919974795211, + "grad_norm": 0.1717217807780553, + "learning_rate": 4.475933241602093e-07, + "loss": 0.278, + "step": 3220 + }, + { + "epoch": 1.621424070573409, + "grad_norm": 0.1679242542537086, + "learning_rate": 4.473262528744502e-07, + "loss": 0.2614, + "step": 3221 + }, + { + "epoch": 1.6219281663516067, + "grad_norm": 0.17508722076028932, + "learning_rate": 4.47059196784741e-07, + "loss": 0.2681, + "step": 3222 + }, + { + "epoch": 1.6224322621298046, + "grad_norm": 0.17649494235855326, + "learning_rate": 4.467921559681255e-07, + "loss": 0.2636, + "step": 3223 + }, + { + "epoch": 1.6229363579080025, + "grad_norm": 0.171248906662282, + "learning_rate": 4.4652513050164344e-07, + "loss": 0.2675, + "step": 3224 + }, + { + "epoch": 1.6234404536862004, + "grad_norm": 0.1772203147323702, + "learning_rate": 4.462581204623298e-07, + "loss": 0.2676, + "step": 3225 + }, + { + "epoch": 1.6239445494643983, + "grad_norm": 0.17277840226247196, + "learning_rate": 4.4599112592721567e-07, + "loss": 0.2818, + "step": 3226 + }, + { + "epoch": 1.6244486452425961, + "grad_norm": 0.1888456571031224, + "learning_rate": 4.4572414697332694e-07, + "loss": 0.2582, + "step": 3227 + }, + { + "epoch": 1.624952741020794, + "grad_norm": 0.17481572073876417, + "learning_rate": 4.454571836776859e-07, + "loss": 0.2845, + "step": 3228 + }, + { + "epoch": 1.6254568367989917, + "grad_norm": 0.17113867643332575, + "learning_rate": 4.4519023611730913e-07, + "loss": 0.2628, + "step": 3229 + }, + { + "epoch": 1.6259609325771898, + "grad_norm": 0.1700358617278328, + "learning_rate": 4.4492330436920997e-07, + "loss": 0.272, + "step": 3230 + }, + { + "epoch": 1.6264650283553874, + "grad_norm": 0.17282908698853652, + "learning_rate": 4.4465638851039636e-07, + "loss": 0.2695, + "step": 3231 + }, + { + "epoch": 1.6269691241335855, + "grad_norm": 0.17626881997268046, + "learning_rate": 4.4438948861787164e-07, + "loss": 0.2744, + "step": 3232 + }, + { + "epoch": 1.6274732199117832, + "grad_norm": 0.1752662023406442, + "learning_rate": 4.4412260476863513e-07, + "loss": 0.2699, + "step": 3233 + }, + { + "epoch": 1.627977315689981, + "grad_norm": 0.18770480212311322, + "learning_rate": 4.4385573703968074e-07, + "loss": 0.2798, + "step": 3234 + }, + { + "epoch": 1.628481411468179, + "grad_norm": 0.17091794174949165, + "learning_rate": 4.435888855079987e-07, + "loss": 0.2554, + "step": 3235 + }, + { + "epoch": 1.6289855072463768, + "grad_norm": 0.18183277964556385, + "learning_rate": 4.4332205025057363e-07, + "loss": 0.2645, + "step": 3236 + }, + { + "epoch": 1.6294896030245747, + "grad_norm": 0.1721678222436179, + "learning_rate": 4.4305523134438603e-07, + "loss": 0.2714, + "step": 3237 + }, + { + "epoch": 1.6299936988027726, + "grad_norm": 0.17185890801401496, + "learning_rate": 4.427884288664114e-07, + "loss": 0.274, + "step": 3238 + }, + { + "epoch": 1.6304977945809704, + "grad_norm": 0.1705781529081766, + "learning_rate": 4.4252164289362055e-07, + "loss": 0.2689, + "step": 3239 + }, + { + "epoch": 1.631001890359168, + "grad_norm": 0.17045943428176855, + "learning_rate": 4.422548735029798e-07, + "loss": 0.2612, + "step": 3240 + }, + { + "epoch": 1.6315059861373662, + "grad_norm": 0.17057437607989734, + "learning_rate": 4.4198812077145014e-07, + "loss": 0.275, + "step": 3241 + }, + { + "epoch": 1.6320100819155638, + "grad_norm": 0.17813857592829013, + "learning_rate": 4.417213847759885e-07, + "loss": 0.2851, + "step": 3242 + }, + { + "epoch": 1.632514177693762, + "grad_norm": 0.1712522316623488, + "learning_rate": 4.4145466559354606e-07, + "loss": 0.2792, + "step": 3243 + }, + { + "epoch": 1.6330182734719596, + "grad_norm": 0.177089666182129, + "learning_rate": 4.4118796330107015e-07, + "loss": 0.2812, + "step": 3244 + }, + { + "epoch": 1.6335223692501575, + "grad_norm": 0.1804936269724506, + "learning_rate": 4.409212779755026e-07, + "loss": 0.2923, + "step": 3245 + }, + { + "epoch": 1.6340264650283554, + "grad_norm": 0.20349951530764857, + "learning_rate": 4.4065460969378e-07, + "loss": 0.2755, + "step": 3246 + }, + { + "epoch": 1.6345305608065532, + "grad_norm": 0.18026027565564015, + "learning_rate": 4.403879585328353e-07, + "loss": 0.27, + "step": 3247 + }, + { + "epoch": 1.6350346565847511, + "grad_norm": 0.17425845355042768, + "learning_rate": 4.4012132456959497e-07, + "loss": 0.2744, + "step": 3248 + }, + { + "epoch": 1.635538752362949, + "grad_norm": 0.17129421952442173, + "learning_rate": 4.3985470788098175e-07, + "loss": 0.2645, + "step": 3249 + }, + { + "epoch": 1.6360428481411469, + "grad_norm": 0.17611725539203973, + "learning_rate": 4.395881085439126e-07, + "loss": 0.2703, + "step": 3250 + }, + { + "epoch": 1.6365469439193445, + "grad_norm": 0.17504232119969135, + "learning_rate": 4.393215266353e-07, + "loss": 0.2543, + "step": 3251 + }, + { + "epoch": 1.6370510396975426, + "grad_norm": 0.17656439945011204, + "learning_rate": 4.3905496223205114e-07, + "loss": 0.2704, + "step": 3252 + }, + { + "epoch": 1.6375551354757403, + "grad_norm": 0.17063701784531435, + "learning_rate": 4.3878841541106805e-07, + "loss": 0.2889, + "step": 3253 + }, + { + "epoch": 1.6380592312539384, + "grad_norm": 0.16853391156671269, + "learning_rate": 4.385218862492479e-07, + "loss": 0.2796, + "step": 3254 + }, + { + "epoch": 1.638563327032136, + "grad_norm": 0.17227356978627587, + "learning_rate": 4.3825537482348274e-07, + "loss": 0.2848, + "step": 3255 + }, + { + "epoch": 1.6390674228103341, + "grad_norm": 0.17455061108226785, + "learning_rate": 4.3798888121065957e-07, + "loss": 0.2816, + "step": 3256 + }, + { + "epoch": 1.6395715185885318, + "grad_norm": 0.17637327346626572, + "learning_rate": 4.377224054876598e-07, + "loss": 0.2725, + "step": 3257 + }, + { + "epoch": 1.6400756143667297, + "grad_norm": 0.17175945902648324, + "learning_rate": 4.374559477313605e-07, + "loss": 0.2716, + "step": 3258 + }, + { + "epoch": 1.6405797101449275, + "grad_norm": 0.17464832215924994, + "learning_rate": 4.3718950801863263e-07, + "loss": 0.2816, + "step": 3259 + }, + { + "epoch": 1.6410838059231254, + "grad_norm": 0.17327368547696761, + "learning_rate": 4.369230864263428e-07, + "loss": 0.2708, + "step": 3260 + }, + { + "epoch": 1.6415879017013233, + "grad_norm": 0.17593115327303463, + "learning_rate": 4.3665668303135184e-07, + "loss": 0.271, + "step": 3261 + }, + { + "epoch": 1.642091997479521, + "grad_norm": 0.17355291392861003, + "learning_rate": 4.363902979105151e-07, + "loss": 0.2744, + "step": 3262 + }, + { + "epoch": 1.642596093257719, + "grad_norm": 0.17670933469610142, + "learning_rate": 4.361239311406837e-07, + "loss": 0.2767, + "step": 3263 + }, + { + "epoch": 1.6431001890359167, + "grad_norm": 0.1743548246208639, + "learning_rate": 4.358575827987022e-07, + "loss": 0.2877, + "step": 3264 + }, + { + "epoch": 1.6436042848141148, + "grad_norm": 0.1780920002901691, + "learning_rate": 4.3559125296141097e-07, + "loss": 0.2728, + "step": 3265 + }, + { + "epoch": 1.6441083805923125, + "grad_norm": 0.16978778461761998, + "learning_rate": 4.3532494170564413e-07, + "loss": 0.2638, + "step": 3266 + }, + { + "epoch": 1.6446124763705106, + "grad_norm": 0.19486305777071525, + "learning_rate": 4.35058649108231e-07, + "loss": 0.2745, + "step": 3267 + }, + { + "epoch": 1.6451165721487082, + "grad_norm": 0.18084492964621957, + "learning_rate": 4.3479237524599544e-07, + "loss": 0.2939, + "step": 3268 + }, + { + "epoch": 1.645620667926906, + "grad_norm": 0.17132944207036718, + "learning_rate": 4.345261201957556e-07, + "loss": 0.2664, + "step": 3269 + }, + { + "epoch": 1.646124763705104, + "grad_norm": 0.17437387694568648, + "learning_rate": 4.342598840343244e-07, + "loss": 0.2859, + "step": 3270 + }, + { + "epoch": 1.6466288594833018, + "grad_norm": 0.17301815118977484, + "learning_rate": 4.3399366683850946e-07, + "loss": 0.275, + "step": 3271 + }, + { + "epoch": 1.6471329552614997, + "grad_norm": 0.18122055882591992, + "learning_rate": 4.3372746868511284e-07, + "loss": 0.286, + "step": 3272 + }, + { + "epoch": 1.6476370510396976, + "grad_norm": 0.17723397432269952, + "learning_rate": 4.3346128965093077e-07, + "loss": 0.272, + "step": 3273 + }, + { + "epoch": 1.6481411468178955, + "grad_norm": 0.1801415778809958, + "learning_rate": 4.331951298127547e-07, + "loss": 0.2686, + "step": 3274 + }, + { + "epoch": 1.6486452425960931, + "grad_norm": 0.17591886578148874, + "learning_rate": 4.3292898924736976e-07, + "loss": 0.2549, + "step": 3275 + }, + { + "epoch": 1.6491493383742912, + "grad_norm": 0.1759321403355535, + "learning_rate": 4.326628680315558e-07, + "loss": 0.2836, + "step": 3276 + }, + { + "epoch": 1.6496534341524889, + "grad_norm": 0.17426684713872134, + "learning_rate": 4.323967662420874e-07, + "loss": 0.266, + "step": 3277 + }, + { + "epoch": 1.650157529930687, + "grad_norm": 0.1741569992164329, + "learning_rate": 4.3213068395573304e-07, + "loss": 0.2647, + "step": 3278 + }, + { + "epoch": 1.6506616257088846, + "grad_norm": 0.17222858934473642, + "learning_rate": 4.3186462124925616e-07, + "loss": 0.2735, + "step": 3279 + }, + { + "epoch": 1.6511657214870825, + "grad_norm": 0.1823409553601484, + "learning_rate": 4.315985781994137e-07, + "loss": 0.2756, + "step": 3280 + }, + { + "epoch": 1.6516698172652804, + "grad_norm": 0.18057706073850457, + "learning_rate": 4.3133255488295793e-07, + "loss": 0.2938, + "step": 3281 + }, + { + "epoch": 1.6521739130434783, + "grad_norm": 0.16850573295711344, + "learning_rate": 4.310665513766347e-07, + "loss": 0.258, + "step": 3282 + }, + { + "epoch": 1.6526780088216761, + "grad_norm": 0.17055663371284882, + "learning_rate": 4.308005677571842e-07, + "loss": 0.2773, + "step": 3283 + }, + { + "epoch": 1.653182104599874, + "grad_norm": 0.18266654108280814, + "learning_rate": 4.305346041013414e-07, + "loss": 0.2641, + "step": 3284 + }, + { + "epoch": 1.653686200378072, + "grad_norm": 0.18075413873305338, + "learning_rate": 4.302686604858349e-07, + "loss": 0.2729, + "step": 3285 + }, + { + "epoch": 1.6541902961562696, + "grad_norm": 0.16673471863682812, + "learning_rate": 4.3000273698738804e-07, + "loss": 0.2734, + "step": 3286 + }, + { + "epoch": 1.6546943919344677, + "grad_norm": 0.17371867860201198, + "learning_rate": 4.2973683368271775e-07, + "loss": 0.2755, + "step": 3287 + }, + { + "epoch": 1.6551984877126653, + "grad_norm": 0.17118499221022526, + "learning_rate": 4.2947095064853586e-07, + "loss": 0.2644, + "step": 3288 + }, + { + "epoch": 1.6557025834908634, + "grad_norm": 0.17084068049014356, + "learning_rate": 4.2920508796154755e-07, + "loss": 0.2749, + "step": 3289 + }, + { + "epoch": 1.656206679269061, + "grad_norm": 0.17288119445171216, + "learning_rate": 4.289392456984531e-07, + "loss": 0.2776, + "step": 3290 + }, + { + "epoch": 1.656710775047259, + "grad_norm": 0.17586923741312185, + "learning_rate": 4.2867342393594596e-07, + "loss": 0.2771, + "step": 3291 + }, + { + "epoch": 1.6572148708254568, + "grad_norm": 0.17101471939495422, + "learning_rate": 4.284076227507141e-07, + "loss": 0.263, + "step": 3292 + }, + { + "epoch": 1.6577189666036547, + "grad_norm": 0.16831960805801494, + "learning_rate": 4.2814184221943964e-07, + "loss": 0.2837, + "step": 3293 + }, + { + "epoch": 1.6582230623818526, + "grad_norm": 0.17622469939088511, + "learning_rate": 4.2787608241879847e-07, + "loss": 0.2639, + "step": 3294 + }, + { + "epoch": 1.6587271581600505, + "grad_norm": 0.16999028518865347, + "learning_rate": 4.2761034342546087e-07, + "loss": 0.2775, + "step": 3295 + }, + { + "epoch": 1.6592312539382483, + "grad_norm": 0.17456572812362198, + "learning_rate": 4.2734462531609063e-07, + "loss": 0.2736, + "step": 3296 + }, + { + "epoch": 1.659735349716446, + "grad_norm": 0.18460824603728268, + "learning_rate": 4.270789281673461e-07, + "loss": 0.2742, + "step": 3297 + }, + { + "epoch": 1.660239445494644, + "grad_norm": 0.17060321969936063, + "learning_rate": 4.26813252055879e-07, + "loss": 0.283, + "step": 3298 + }, + { + "epoch": 1.6607435412728417, + "grad_norm": 0.17365085031389696, + "learning_rate": 4.265475970583353e-07, + "loss": 0.2633, + "step": 3299 + }, + { + "epoch": 1.6612476370510398, + "grad_norm": 0.17714863665239125, + "learning_rate": 4.262819632513548e-07, + "loss": 0.268, + "step": 3300 + }, + { + "epoch": 1.6617517328292375, + "grad_norm": 0.1801000943195406, + "learning_rate": 4.260163507115712e-07, + "loss": 0.285, + "step": 3301 + }, + { + "epoch": 1.6622558286074354, + "grad_norm": 0.17866572538561554, + "learning_rate": 4.257507595156123e-07, + "loss": 0.2608, + "step": 3302 + }, + { + "epoch": 1.6627599243856332, + "grad_norm": 0.17544127376337787, + "learning_rate": 4.2548518974009906e-07, + "loss": 0.2845, + "step": 3303 + }, + { + "epoch": 1.6632640201638311, + "grad_norm": 0.17044959098671644, + "learning_rate": 4.2521964146164726e-07, + "loss": 0.267, + "step": 3304 + }, + { + "epoch": 1.663768115942029, + "grad_norm": 0.18277292620452268, + "learning_rate": 4.249541147568656e-07, + "loss": 0.2654, + "step": 3305 + }, + { + "epoch": 1.6642722117202269, + "grad_norm": 0.1700015852346814, + "learning_rate": 4.2468860970235676e-07, + "loss": 0.278, + "step": 3306 + }, + { + "epoch": 1.6647763074984248, + "grad_norm": 0.19816383195198753, + "learning_rate": 4.244231263747177e-07, + "loss": 0.2819, + "step": 3307 + }, + { + "epoch": 1.6652804032766224, + "grad_norm": 0.17293706700556136, + "learning_rate": 4.241576648505383e-07, + "loss": 0.2721, + "step": 3308 + }, + { + "epoch": 1.6657844990548205, + "grad_norm": 0.16833769270413493, + "learning_rate": 4.2389222520640297e-07, + "loss": 0.2629, + "step": 3309 + }, + { + "epoch": 1.6662885948330182, + "grad_norm": 0.1796549386579674, + "learning_rate": 4.2362680751888894e-07, + "loss": 0.281, + "step": 3310 + }, + { + "epoch": 1.6667926906112163, + "grad_norm": 0.17213164153650173, + "learning_rate": 4.2336141186456815e-07, + "loss": 0.278, + "step": 3311 + }, + { + "epoch": 1.667296786389414, + "grad_norm": 0.16665647034381148, + "learning_rate": 4.2309603832000523e-07, + "loss": 0.2677, + "step": 3312 + }, + { + "epoch": 1.6678008821676118, + "grad_norm": 0.19006288706354632, + "learning_rate": 4.2283068696175867e-07, + "loss": 0.2752, + "step": 3313 + }, + { + "epoch": 1.6683049779458097, + "grad_norm": 0.17508641308271217, + "learning_rate": 4.225653578663811e-07, + "loss": 0.2767, + "step": 3314 + }, + { + "epoch": 1.6688090737240076, + "grad_norm": 0.17313752219693374, + "learning_rate": 4.2230005111041793e-07, + "loss": 0.2752, + "step": 3315 + }, + { + "epoch": 1.6693131695022054, + "grad_norm": 0.17393250261360155, + "learning_rate": 4.2203476677040876e-07, + "loss": 0.2722, + "step": 3316 + }, + { + "epoch": 1.6698172652804033, + "grad_norm": 0.18422917201705832, + "learning_rate": 4.217695049228864e-07, + "loss": 0.271, + "step": 3317 + }, + { + "epoch": 1.6703213610586012, + "grad_norm": 0.1711040855484809, + "learning_rate": 4.2150426564437737e-07, + "loss": 0.2628, + "step": 3318 + }, + { + "epoch": 1.6708254568367988, + "grad_norm": 0.17294159684320054, + "learning_rate": 4.212390490114014e-07, + "loss": 0.2892, + "step": 3319 + }, + { + "epoch": 1.671329552614997, + "grad_norm": 0.18793079339537197, + "learning_rate": 4.2097385510047166e-07, + "loss": 0.2637, + "step": 3320 + }, + { + "epoch": 1.6718336483931946, + "grad_norm": 0.1773209876917145, + "learning_rate": 4.207086839880955e-07, + "loss": 0.27, + "step": 3321 + }, + { + "epoch": 1.6723377441713927, + "grad_norm": 0.17369452560686208, + "learning_rate": 4.204435357507725e-07, + "loss": 0.2695, + "step": 3322 + }, + { + "epoch": 1.6728418399495903, + "grad_norm": 0.17729418122505228, + "learning_rate": 4.201784104649969e-07, + "loss": 0.2631, + "step": 3323 + }, + { + "epoch": 1.6733459357277882, + "grad_norm": 0.17757333703014838, + "learning_rate": 4.199133082072552e-07, + "loss": 0.2761, + "step": 3324 + }, + { + "epoch": 1.673850031505986, + "grad_norm": 0.17481749922840548, + "learning_rate": 4.1964822905402817e-07, + "loss": 0.2948, + "step": 3325 + }, + { + "epoch": 1.674354127284184, + "grad_norm": 0.17607568877612143, + "learning_rate": 4.1938317308178903e-07, + "loss": 0.2789, + "step": 3326 + }, + { + "epoch": 1.6748582230623819, + "grad_norm": 0.17514273867736174, + "learning_rate": 4.191181403670054e-07, + "loss": 0.2693, + "step": 3327 + }, + { + "epoch": 1.6753623188405797, + "grad_norm": 0.16986181408491555, + "learning_rate": 4.1885313098613714e-07, + "loss": 0.2719, + "step": 3328 + }, + { + "epoch": 1.6758664146187776, + "grad_norm": 0.17536949245885125, + "learning_rate": 4.185881450156377e-07, + "loss": 0.294, + "step": 3329 + }, + { + "epoch": 1.6763705103969753, + "grad_norm": 0.1729391810469066, + "learning_rate": 4.183231825319544e-07, + "loss": 0.274, + "step": 3330 + }, + { + "epoch": 1.6768746061751734, + "grad_norm": 0.181209965516427, + "learning_rate": 4.1805824361152677e-07, + "loss": 0.2828, + "step": 3331 + }, + { + "epoch": 1.677378701953371, + "grad_norm": 0.17437527629077368, + "learning_rate": 4.177933283307884e-07, + "loss": 0.264, + "step": 3332 + }, + { + "epoch": 1.6778827977315691, + "grad_norm": 0.17749950810916038, + "learning_rate": 4.1752843676616533e-07, + "loss": 0.2774, + "step": 3333 + }, + { + "epoch": 1.6783868935097668, + "grad_norm": 0.17035105633216754, + "learning_rate": 4.1726356899407765e-07, + "loss": 0.2691, + "step": 3334 + }, + { + "epoch": 1.6788909892879649, + "grad_norm": 0.18282073284261757, + "learning_rate": 4.1699872509093774e-07, + "loss": 0.2774, + "step": 3335 + }, + { + "epoch": 1.6793950850661625, + "grad_norm": 0.1739086220354733, + "learning_rate": 4.167339051331513e-07, + "loss": 0.2761, + "step": 3336 + }, + { + "epoch": 1.6798991808443604, + "grad_norm": 0.178642966353365, + "learning_rate": 4.164691091971176e-07, + "loss": 0.2807, + "step": 3337 + }, + { + "epoch": 1.6804032766225583, + "grad_norm": 0.17158028839895453, + "learning_rate": 4.162043373592282e-07, + "loss": 0.2658, + "step": 3338 + }, + { + "epoch": 1.6809073724007562, + "grad_norm": 0.1761545331753215, + "learning_rate": 4.1593958969586864e-07, + "loss": 0.2683, + "step": 3339 + }, + { + "epoch": 1.681411468178954, + "grad_norm": 0.1790803594461759, + "learning_rate": 4.156748662834165e-07, + "loss": 0.2744, + "step": 3340 + }, + { + "epoch": 1.6819155639571517, + "grad_norm": 0.17622383520305496, + "learning_rate": 4.154101671982433e-07, + "loss": 0.2654, + "step": 3341 + }, + { + "epoch": 1.6824196597353498, + "grad_norm": 0.1749463588948772, + "learning_rate": 4.151454925167129e-07, + "loss": 0.292, + "step": 3342 + }, + { + "epoch": 1.6829237555135474, + "grad_norm": 0.17771616434870652, + "learning_rate": 4.14880842315182e-07, + "loss": 0.2689, + "step": 3343 + }, + { + "epoch": 1.6834278512917455, + "grad_norm": 0.171167947600746, + "learning_rate": 4.14616216670001e-07, + "loss": 0.2675, + "step": 3344 + }, + { + "epoch": 1.6839319470699432, + "grad_norm": 0.16943013435788834, + "learning_rate": 4.143516156575124e-07, + "loss": 0.2602, + "step": 3345 + }, + { + "epoch": 1.6844360428481413, + "grad_norm": 0.18285712491263306, + "learning_rate": 4.1408703935405234e-07, + "loss": 0.278, + "step": 3346 + }, + { + "epoch": 1.684940138626339, + "grad_norm": 0.17919325702734495, + "learning_rate": 4.1382248783594905e-07, + "loss": 0.2892, + "step": 3347 + }, + { + "epoch": 1.6854442344045368, + "grad_norm": 0.17248307230193116, + "learning_rate": 4.135579611795243e-07, + "loss": 0.2802, + "step": 3348 + }, + { + "epoch": 1.6859483301827347, + "grad_norm": 0.17017032309653435, + "learning_rate": 4.132934594610922e-07, + "loss": 0.2613, + "step": 3349 + }, + { + "epoch": 1.6864524259609326, + "grad_norm": 0.17232227547146478, + "learning_rate": 4.130289827569599e-07, + "loss": 0.28, + "step": 3350 + }, + { + "epoch": 1.6869565217391305, + "grad_norm": 0.19143726314461165, + "learning_rate": 4.127645311434275e-07, + "loss": 0.2882, + "step": 3351 + }, + { + "epoch": 1.6874606175173283, + "grad_norm": 0.17907586158542865, + "learning_rate": 4.125001046967871e-07, + "loss": 0.2795, + "step": 3352 + }, + { + "epoch": 1.6879647132955262, + "grad_norm": 0.1745434199070893, + "learning_rate": 4.122357034933248e-07, + "loss": 0.2827, + "step": 3353 + }, + { + "epoch": 1.6884688090737239, + "grad_norm": 0.18581346560412773, + "learning_rate": 4.119713276093181e-07, + "loss": 0.2744, + "step": 3354 + }, + { + "epoch": 1.688972904851922, + "grad_norm": 0.17050406426787684, + "learning_rate": 4.117069771210384e-07, + "loss": 0.2638, + "step": 3355 + }, + { + "epoch": 1.6894770006301196, + "grad_norm": 0.1678464885882017, + "learning_rate": 4.114426521047485e-07, + "loss": 0.274, + "step": 3356 + }, + { + "epoch": 1.6899810964083177, + "grad_norm": 0.18966378316531626, + "learning_rate": 4.1117835263670515e-07, + "loss": 0.2813, + "step": 3357 + }, + { + "epoch": 1.6904851921865154, + "grad_norm": 0.1700410651953894, + "learning_rate": 4.109140787931569e-07, + "loss": 0.261, + "step": 3358 + }, + { + "epoch": 1.6909892879647133, + "grad_norm": 0.165637909932199, + "learning_rate": 4.1064983065034476e-07, + "loss": 0.2654, + "step": 3359 + }, + { + "epoch": 1.6914933837429111, + "grad_norm": 0.1701023187284413, + "learning_rate": 4.103856082845032e-07, + "loss": 0.2779, + "step": 3360 + }, + { + "epoch": 1.691997479521109, + "grad_norm": 0.16764886438759627, + "learning_rate": 4.1012141177185846e-07, + "loss": 0.2495, + "step": 3361 + }, + { + "epoch": 1.692501575299307, + "grad_norm": 0.17964220409792864, + "learning_rate": 4.098572411886296e-07, + "loss": 0.2759, + "step": 3362 + }, + { + "epoch": 1.6930056710775048, + "grad_norm": 0.17356212055438042, + "learning_rate": 4.095930966110283e-07, + "loss": 0.2607, + "step": 3363 + }, + { + "epoch": 1.6935097668557026, + "grad_norm": 0.1748654301154168, + "learning_rate": 4.0932897811525866e-07, + "loss": 0.2647, + "step": 3364 + }, + { + "epoch": 1.6940138626339003, + "grad_norm": 0.17194284386041553, + "learning_rate": 4.0906488577751714e-07, + "loss": 0.2757, + "step": 3365 + }, + { + "epoch": 1.6945179584120984, + "grad_norm": 0.17475464121096163, + "learning_rate": 4.0880081967399265e-07, + "loss": 0.2766, + "step": 3366 + }, + { + "epoch": 1.695022054190296, + "grad_norm": 0.17506845809429164, + "learning_rate": 4.085367798808669e-07, + "loss": 0.2813, + "step": 3367 + }, + { + "epoch": 1.6955261499684942, + "grad_norm": 0.16997507190026173, + "learning_rate": 4.082727664743133e-07, + "loss": 0.2748, + "step": 3368 + }, + { + "epoch": 1.6960302457466918, + "grad_norm": 0.1823347794889387, + "learning_rate": 4.080087795304986e-07, + "loss": 0.2795, + "step": 3369 + }, + { + "epoch": 1.6965343415248897, + "grad_norm": 0.1714581235316788, + "learning_rate": 4.0774481912558103e-07, + "loss": 0.267, + "step": 3370 + }, + { + "epoch": 1.6970384373030876, + "grad_norm": 0.16918073676897602, + "learning_rate": 4.0748088533571174e-07, + "loss": 0.2659, + "step": 3371 + }, + { + "epoch": 1.6975425330812854, + "grad_norm": 0.1683780550275527, + "learning_rate": 4.072169782370339e-07, + "loss": 0.2792, + "step": 3372 + }, + { + "epoch": 1.6980466288594833, + "grad_norm": 0.1772465529025339, + "learning_rate": 4.069530979056829e-07, + "loss": 0.2675, + "step": 3373 + }, + { + "epoch": 1.6985507246376812, + "grad_norm": 0.20723872319464334, + "learning_rate": 4.06689244417787e-07, + "loss": 0.2761, + "step": 3374 + }, + { + "epoch": 1.699054820415879, + "grad_norm": 0.17653778927841088, + "learning_rate": 4.064254178494658e-07, + "loss": 0.278, + "step": 3375 + }, + { + "epoch": 1.6995589161940767, + "grad_norm": 0.1733117296454345, + "learning_rate": 4.0616161827683215e-07, + "loss": 0.2633, + "step": 3376 + }, + { + "epoch": 1.7000630119722748, + "grad_norm": 0.17322471461748393, + "learning_rate": 4.058978457759902e-07, + "loss": 0.2777, + "step": 3377 + }, + { + "epoch": 1.7005671077504725, + "grad_norm": 0.16784867847711188, + "learning_rate": 4.056341004230368e-07, + "loss": 0.2646, + "step": 3378 + }, + { + "epoch": 1.7010712035286706, + "grad_norm": 0.18157842039507768, + "learning_rate": 4.0537038229406093e-07, + "loss": 0.286, + "step": 3379 + }, + { + "epoch": 1.7015752993068682, + "grad_norm": 0.17284511881224912, + "learning_rate": 4.0510669146514356e-07, + "loss": 0.2667, + "step": 3380 + }, + { + "epoch": 1.7020793950850661, + "grad_norm": 0.1750808365681563, + "learning_rate": 4.0484302801235794e-07, + "loss": 0.2888, + "step": 3381 + }, + { + "epoch": 1.702583490863264, + "grad_norm": 0.17993571684627316, + "learning_rate": 4.0457939201176905e-07, + "loss": 0.2657, + "step": 3382 + }, + { + "epoch": 1.7030875866414619, + "grad_norm": 0.1860139337219952, + "learning_rate": 4.0431578353943464e-07, + "loss": 0.2712, + "step": 3383 + }, + { + "epoch": 1.7030875866414619, + "eval_loss": 0.3066566586494446, + "eval_runtime": 17.4494, + "eval_samples_per_second": 48.999, + "eval_steps_per_second": 1.032, + "step": 3383 + }, + { + "epoch": 1.7035916824196597, + "grad_norm": 0.1901404388554431, + "learning_rate": 4.0405220267140375e-07, + "loss": 0.2657, + "step": 3384 + }, + { + "epoch": 1.7040957781978576, + "grad_norm": 0.17602879795743603, + "learning_rate": 4.0378864948371824e-07, + "loss": 0.2814, + "step": 3385 + }, + { + "epoch": 1.7045998739760555, + "grad_norm": 0.17623442887630172, + "learning_rate": 4.035251240524114e-07, + "loss": 0.2811, + "step": 3386 + }, + { + "epoch": 1.7051039697542532, + "grad_norm": 0.18006049009972366, + "learning_rate": 4.0326162645350837e-07, + "loss": 0.2789, + "step": 3387 + }, + { + "epoch": 1.7056080655324513, + "grad_norm": 0.19068107643868426, + "learning_rate": 4.029981567630271e-07, + "loss": 0.2684, + "step": 3388 + }, + { + "epoch": 1.706112161310649, + "grad_norm": 0.17351592149290326, + "learning_rate": 4.027347150569765e-07, + "loss": 0.2635, + "step": 3389 + }, + { + "epoch": 1.706616257088847, + "grad_norm": 0.18358987651652428, + "learning_rate": 4.0247130141135825e-07, + "loss": 0.2971, + "step": 3390 + }, + { + "epoch": 1.7071203528670447, + "grad_norm": 0.1726651344266121, + "learning_rate": 4.0220791590216516e-07, + "loss": 0.2766, + "step": 3391 + }, + { + "epoch": 1.7076244486452425, + "grad_norm": 0.17121736158478892, + "learning_rate": 4.0194455860538284e-07, + "loss": 0.264, + "step": 3392 + }, + { + "epoch": 1.7081285444234404, + "grad_norm": 0.1723867701481724, + "learning_rate": 4.016812295969878e-07, + "loss": 0.2808, + "step": 3393 + }, + { + "epoch": 1.7086326402016383, + "grad_norm": 0.17772561012794783, + "learning_rate": 4.01417928952949e-07, + "loss": 0.2841, + "step": 3394 + }, + { + "epoch": 1.7091367359798362, + "grad_norm": 0.17268289420348543, + "learning_rate": 4.0115465674922697e-07, + "loss": 0.2715, + "step": 3395 + }, + { + "epoch": 1.709640831758034, + "grad_norm": 0.17538041239709554, + "learning_rate": 4.008914130617742e-07, + "loss": 0.2944, + "step": 3396 + }, + { + "epoch": 1.710144927536232, + "grad_norm": 0.1893702660671161, + "learning_rate": 4.006281979665349e-07, + "loss": 0.2851, + "step": 3397 + }, + { + "epoch": 1.7106490233144296, + "grad_norm": 0.17669490425000803, + "learning_rate": 4.003650115394446e-07, + "loss": 0.2619, + "step": 3398 + }, + { + "epoch": 1.7111531190926277, + "grad_norm": 0.17081475118990422, + "learning_rate": 4.0010185385643156e-07, + "loss": 0.2664, + "step": 3399 + }, + { + "epoch": 1.7116572148708253, + "grad_norm": 0.18139383971070436, + "learning_rate": 3.9983872499341463e-07, + "loss": 0.2722, + "step": 3400 + }, + { + "epoch": 1.7121613106490234, + "grad_norm": 0.16819965099097467, + "learning_rate": 3.9957562502630524e-07, + "loss": 0.2527, + "step": 3401 + }, + { + "epoch": 1.712665406427221, + "grad_norm": 0.17426197590337408, + "learning_rate": 3.99312554031006e-07, + "loss": 0.2675, + "step": 3402 + }, + { + "epoch": 1.7131695022054192, + "grad_norm": 0.1781326625588422, + "learning_rate": 3.99049512083411e-07, + "loss": 0.2769, + "step": 3403 + }, + { + "epoch": 1.7136735979836168, + "grad_norm": 0.18770153745691615, + "learning_rate": 3.9878649925940653e-07, + "loss": 0.266, + "step": 3404 + }, + { + "epoch": 1.7141776937618147, + "grad_norm": 0.17464611340669486, + "learning_rate": 3.9852351563486987e-07, + "loss": 0.2761, + "step": 3405 + }, + { + "epoch": 1.7146817895400126, + "grad_norm": 0.171635429988461, + "learning_rate": 3.9826056128567053e-07, + "loss": 0.2696, + "step": 3406 + }, + { + "epoch": 1.7151858853182105, + "grad_norm": 0.18495245117417844, + "learning_rate": 3.9799763628766895e-07, + "loss": 0.2975, + "step": 3407 + }, + { + "epoch": 1.7156899810964084, + "grad_norm": 0.169836479191478, + "learning_rate": 3.977347407167174e-07, + "loss": 0.2755, + "step": 3408 + }, + { + "epoch": 1.716194076874606, + "grad_norm": 0.17255997130578216, + "learning_rate": 3.9747187464865984e-07, + "loss": 0.2772, + "step": 3409 + }, + { + "epoch": 1.716698172652804, + "grad_norm": 0.16749153560856467, + "learning_rate": 3.972090381593311e-07, + "loss": 0.2736, + "step": 3410 + }, + { + "epoch": 1.7172022684310018, + "grad_norm": 0.17079123437345936, + "learning_rate": 3.9694623132455815e-07, + "loss": 0.2602, + "step": 3411 + }, + { + "epoch": 1.7177063642091999, + "grad_norm": 0.17321152114395624, + "learning_rate": 3.96683454220159e-07, + "loss": 0.275, + "step": 3412 + }, + { + "epoch": 1.7182104599873975, + "grad_norm": 0.16962109663638752, + "learning_rate": 3.964207069219435e-07, + "loss": 0.2732, + "step": 3413 + }, + { + "epoch": 1.7187145557655956, + "grad_norm": 0.1746549136635196, + "learning_rate": 3.9615798950571216e-07, + "loss": 0.2726, + "step": 3414 + }, + { + "epoch": 1.7192186515437933, + "grad_norm": 0.16750819214015739, + "learning_rate": 3.9589530204725787e-07, + "loss": 0.2659, + "step": 3415 + }, + { + "epoch": 1.7197227473219912, + "grad_norm": 0.1808474492424595, + "learning_rate": 3.956326446223639e-07, + "loss": 0.2684, + "step": 3416 + }, + { + "epoch": 1.720226843100189, + "grad_norm": 0.17539397294716638, + "learning_rate": 3.953700173068053e-07, + "loss": 0.2714, + "step": 3417 + }, + { + "epoch": 1.720730938878387, + "grad_norm": 0.17517500086130025, + "learning_rate": 3.9510742017634877e-07, + "loss": 0.2629, + "step": 3418 + }, + { + "epoch": 1.7212350346565848, + "grad_norm": 0.17325681094241782, + "learning_rate": 3.9484485330675153e-07, + "loss": 0.2714, + "step": 3419 + }, + { + "epoch": 1.7217391304347827, + "grad_norm": 0.17437360960471293, + "learning_rate": 3.9458231677376297e-07, + "loss": 0.2784, + "step": 3420 + }, + { + "epoch": 1.7222432262129805, + "grad_norm": 0.17254819902144547, + "learning_rate": 3.943198106531227e-07, + "loss": 0.2596, + "step": 3421 + }, + { + "epoch": 1.7227473219911782, + "grad_norm": 0.16965358248820828, + "learning_rate": 3.9405733502056267e-07, + "loss": 0.2705, + "step": 3422 + }, + { + "epoch": 1.7232514177693763, + "grad_norm": 0.17540405751695837, + "learning_rate": 3.9379488995180503e-07, + "loss": 0.275, + "step": 3423 + }, + { + "epoch": 1.723755513547574, + "grad_norm": 0.17044320687255016, + "learning_rate": 3.935324755225638e-07, + "loss": 0.261, + "step": 3424 + }, + { + "epoch": 1.724259609325772, + "grad_norm": 0.17079300009403095, + "learning_rate": 3.932700918085439e-07, + "loss": 0.2795, + "step": 3425 + }, + { + "epoch": 1.7247637051039697, + "grad_norm": 0.17492261380235571, + "learning_rate": 3.930077388854413e-07, + "loss": 0.2708, + "step": 3426 + }, + { + "epoch": 1.7252678008821676, + "grad_norm": 0.1697136489985747, + "learning_rate": 3.9274541682894325e-07, + "loss": 0.2646, + "step": 3427 + }, + { + "epoch": 1.7257718966603655, + "grad_norm": 0.17356605819677773, + "learning_rate": 3.9248312571472803e-07, + "loss": 0.2725, + "step": 3428 + }, + { + "epoch": 1.7262759924385633, + "grad_norm": 0.1798724083697512, + "learning_rate": 3.9222086561846515e-07, + "loss": 0.2689, + "step": 3429 + }, + { + "epoch": 1.7267800882167612, + "grad_norm": 0.172033557393297, + "learning_rate": 3.9195863661581466e-07, + "loss": 0.2675, + "step": 3430 + }, + { + "epoch": 1.727284183994959, + "grad_norm": 0.17813642880671784, + "learning_rate": 3.9169643878242836e-07, + "loss": 0.2538, + "step": 3431 + }, + { + "epoch": 1.727788279773157, + "grad_norm": 0.1744243470379689, + "learning_rate": 3.9143427219394857e-07, + "loss": 0.2719, + "step": 3432 + }, + { + "epoch": 1.7282923755513546, + "grad_norm": 0.19169988100579302, + "learning_rate": 3.911721369260086e-07, + "loss": 0.2729, + "step": 3433 + }, + { + "epoch": 1.7287964713295527, + "grad_norm": 0.1669226246028055, + "learning_rate": 3.9091003305423315e-07, + "loss": 0.2644, + "step": 3434 + }, + { + "epoch": 1.7293005671077504, + "grad_norm": 0.17848442108252346, + "learning_rate": 3.9064796065423716e-07, + "loss": 0.2765, + "step": 3435 + }, + { + "epoch": 1.7298046628859485, + "grad_norm": 0.17288599385442902, + "learning_rate": 3.9038591980162733e-07, + "loss": 0.2624, + "step": 3436 + }, + { + "epoch": 1.7303087586641461, + "grad_norm": 0.17418345316818656, + "learning_rate": 3.901239105720004e-07, + "loss": 0.2695, + "step": 3437 + }, + { + "epoch": 1.730812854442344, + "grad_norm": 0.16887161070123388, + "learning_rate": 3.8986193304094483e-07, + "loss": 0.269, + "step": 3438 + }, + { + "epoch": 1.7313169502205419, + "grad_norm": 0.1715448109776923, + "learning_rate": 3.8959998728403937e-07, + "loss": 0.2749, + "step": 3439 + }, + { + "epoch": 1.7318210459987398, + "grad_norm": 0.17162576830034787, + "learning_rate": 3.8933807337685344e-07, + "loss": 0.2866, + "step": 3440 + }, + { + "epoch": 1.7323251417769376, + "grad_norm": 0.17140705013497554, + "learning_rate": 3.89076191394948e-07, + "loss": 0.2515, + "step": 3441 + }, + { + "epoch": 1.7328292375551355, + "grad_norm": 0.17186684179031347, + "learning_rate": 3.8881434141387414e-07, + "loss": 0.2633, + "step": 3442 + }, + { + "epoch": 1.7333333333333334, + "grad_norm": 0.17247735874366607, + "learning_rate": 3.885525235091741e-07, + "loss": 0.2699, + "step": 3443 + }, + { + "epoch": 1.733837429111531, + "grad_norm": 0.19035869520485968, + "learning_rate": 3.8829073775638043e-07, + "loss": 0.2869, + "step": 3444 + }, + { + "epoch": 1.7343415248897291, + "grad_norm": 0.17126200809840406, + "learning_rate": 3.8802898423101715e-07, + "loss": 0.2659, + "step": 3445 + }, + { + "epoch": 1.7348456206679268, + "grad_norm": 0.17097316798308923, + "learning_rate": 3.877672630085983e-07, + "loss": 0.2747, + "step": 3446 + }, + { + "epoch": 1.735349716446125, + "grad_norm": 0.16663798950741054, + "learning_rate": 3.8750557416462863e-07, + "loss": 0.2666, + "step": 3447 + }, + { + "epoch": 1.7358538122243226, + "grad_norm": 0.17374809497432028, + "learning_rate": 3.8724391777460415e-07, + "loss": 0.2676, + "step": 3448 + }, + { + "epoch": 1.7363579080025204, + "grad_norm": 0.1705608920788751, + "learning_rate": 3.869822939140107e-07, + "loss": 0.273, + "step": 3449 + }, + { + "epoch": 1.7368620037807183, + "grad_norm": 0.1853493481955229, + "learning_rate": 3.867207026583255e-07, + "loss": 0.2858, + "step": 3450 + }, + { + "epoch": 1.7373660995589162, + "grad_norm": 0.1768536035338538, + "learning_rate": 3.8645914408301574e-07, + "loss": 0.2832, + "step": 3451 + }, + { + "epoch": 1.737870195337114, + "grad_norm": 0.16841766882214215, + "learning_rate": 3.861976182635397e-07, + "loss": 0.2623, + "step": 3452 + }, + { + "epoch": 1.738374291115312, + "grad_norm": 0.21467318021749093, + "learning_rate": 3.8593612527534585e-07, + "loss": 0.2769, + "step": 3453 + }, + { + "epoch": 1.7388783868935098, + "grad_norm": 0.17835594443596708, + "learning_rate": 3.8567466519387305e-07, + "loss": 0.2818, + "step": 3454 + }, + { + "epoch": 1.7393824826717075, + "grad_norm": 0.17944917802544677, + "learning_rate": 3.854132380945513e-07, + "loss": 0.285, + "step": 3455 + }, + { + "epoch": 1.7398865784499056, + "grad_norm": 0.1756271448121031, + "learning_rate": 3.8515184405280054e-07, + "loss": 0.2837, + "step": 3456 + }, + { + "epoch": 1.7403906742281032, + "grad_norm": 0.16873921221680957, + "learning_rate": 3.8489048314403124e-07, + "loss": 0.2668, + "step": 3457 + }, + { + "epoch": 1.7408947700063013, + "grad_norm": 0.1837468940499396, + "learning_rate": 3.846291554436445e-07, + "loss": 0.2771, + "step": 3458 + }, + { + "epoch": 1.741398865784499, + "grad_norm": 0.17119908182449262, + "learning_rate": 3.8436786102703186e-07, + "loss": 0.2642, + "step": 3459 + }, + { + "epoch": 1.7419029615626969, + "grad_norm": 0.17158536564497223, + "learning_rate": 3.841065999695748e-07, + "loss": 0.2716, + "step": 3460 + }, + { + "epoch": 1.7424070573408947, + "grad_norm": 0.18136191876998276, + "learning_rate": 3.838453723466459e-07, + "loss": 0.2701, + "step": 3461 + }, + { + "epoch": 1.7429111531190926, + "grad_norm": 0.17402251262948262, + "learning_rate": 3.8358417823360757e-07, + "loss": 0.2705, + "step": 3462 + }, + { + "epoch": 1.7434152488972905, + "grad_norm": 0.16714838859967632, + "learning_rate": 3.8332301770581244e-07, + "loss": 0.2705, + "step": 3463 + }, + { + "epoch": 1.7439193446754884, + "grad_norm": 0.16798057047478643, + "learning_rate": 3.8306189083860414e-07, + "loss": 0.265, + "step": 3464 + }, + { + "epoch": 1.7444234404536862, + "grad_norm": 0.1675965482058386, + "learning_rate": 3.8280079770731565e-07, + "loss": 0.2642, + "step": 3465 + }, + { + "epoch": 1.744927536231884, + "grad_norm": 0.1722718830074659, + "learning_rate": 3.8253973838727134e-07, + "loss": 0.2783, + "step": 3466 + }, + { + "epoch": 1.745431632010082, + "grad_norm": 0.17511740118469324, + "learning_rate": 3.822787129537846e-07, + "loss": 0.2548, + "step": 3467 + }, + { + "epoch": 1.7459357277882797, + "grad_norm": 0.1692592884492893, + "learning_rate": 3.820177214821602e-07, + "loss": 0.2756, + "step": 3468 + }, + { + "epoch": 1.7464398235664778, + "grad_norm": 0.1838015667126571, + "learning_rate": 3.8175676404769226e-07, + "loss": 0.2835, + "step": 3469 + }, + { + "epoch": 1.7469439193446754, + "grad_norm": 0.17221059457481896, + "learning_rate": 3.8149584072566516e-07, + "loss": 0.2679, + "step": 3470 + }, + { + "epoch": 1.7474480151228735, + "grad_norm": 0.17240520098361659, + "learning_rate": 3.8123495159135414e-07, + "loss": 0.2602, + "step": 3471 + }, + { + "epoch": 1.7479521109010712, + "grad_norm": 0.17645638344602915, + "learning_rate": 3.8097409672002384e-07, + "loss": 0.2664, + "step": 3472 + }, + { + "epoch": 1.748456206679269, + "grad_norm": 0.17119884887181155, + "learning_rate": 3.8071327618692926e-07, + "loss": 0.2771, + "step": 3473 + }, + { + "epoch": 1.748960302457467, + "grad_norm": 0.16737717354943357, + "learning_rate": 3.8045249006731554e-07, + "loss": 0.257, + "step": 3474 + }, + { + "epoch": 1.7494643982356648, + "grad_norm": 0.17011856009678725, + "learning_rate": 3.80191738436418e-07, + "loss": 0.2622, + "step": 3475 + }, + { + "epoch": 1.7499684940138627, + "grad_norm": 0.171796792732586, + "learning_rate": 3.799310213694618e-07, + "loss": 0.2818, + "step": 3476 + }, + { + "epoch": 1.7504725897920603, + "grad_norm": 0.1928931218051459, + "learning_rate": 3.796703389416619e-07, + "loss": 0.2763, + "step": 3477 + }, + { + "epoch": 1.7509766855702584, + "grad_norm": 0.20147336279423894, + "learning_rate": 3.7940969122822395e-07, + "loss": 0.2824, + "step": 3478 + }, + { + "epoch": 1.751480781348456, + "grad_norm": 0.175565231620901, + "learning_rate": 3.7914907830434296e-07, + "loss": 0.2864, + "step": 3479 + }, + { + "epoch": 1.7519848771266542, + "grad_norm": 0.17087917079117643, + "learning_rate": 3.7888850024520443e-07, + "loss": 0.2664, + "step": 3480 + }, + { + "epoch": 1.7524889729048518, + "grad_norm": 0.17664931365772008, + "learning_rate": 3.7862795712598307e-07, + "loss": 0.2758, + "step": 3481 + }, + { + "epoch": 1.75299306868305, + "grad_norm": 0.17456510661593722, + "learning_rate": 3.783674490218445e-07, + "loss": 0.2734, + "step": 3482 + }, + { + "epoch": 1.7534971644612476, + "grad_norm": 0.18609071641176797, + "learning_rate": 3.7810697600794335e-07, + "loss": 0.267, + "step": 3483 + }, + { + "epoch": 1.7540012602394455, + "grad_norm": 0.17283492786683485, + "learning_rate": 3.7784653815942436e-07, + "loss": 0.2837, + "step": 3484 + }, + { + "epoch": 1.7545053560176433, + "grad_norm": 0.17490689341326401, + "learning_rate": 3.775861355514226e-07, + "loss": 0.2799, + "step": 3485 + }, + { + "epoch": 1.7550094517958412, + "grad_norm": 0.17672370129936685, + "learning_rate": 3.7732576825906223e-07, + "loss": 0.2539, + "step": 3486 + }, + { + "epoch": 1.755513547574039, + "grad_norm": 0.17912383987865965, + "learning_rate": 3.7706543635745795e-07, + "loss": 0.2949, + "step": 3487 + }, + { + "epoch": 1.756017643352237, + "grad_norm": 0.1744911022180228, + "learning_rate": 3.768051399217137e-07, + "loss": 0.2803, + "step": 3488 + }, + { + "epoch": 1.7565217391304349, + "grad_norm": 0.17364431498626157, + "learning_rate": 3.7654487902692355e-07, + "loss": 0.2761, + "step": 3489 + }, + { + "epoch": 1.7570258349086325, + "grad_norm": 0.1746170171542789, + "learning_rate": 3.762846537481708e-07, + "loss": 0.2696, + "step": 3490 + }, + { + "epoch": 1.7575299306868306, + "grad_norm": 0.17817739668123456, + "learning_rate": 3.7602446416052926e-07, + "loss": 0.2801, + "step": 3491 + }, + { + "epoch": 1.7580340264650283, + "grad_norm": 0.17133707420290878, + "learning_rate": 3.757643103390618e-07, + "loss": 0.278, + "step": 3492 + }, + { + "epoch": 1.7585381222432264, + "grad_norm": 0.17227766863166188, + "learning_rate": 3.75504192358821e-07, + "loss": 0.2655, + "step": 3493 + }, + { + "epoch": 1.759042218021424, + "grad_norm": 0.1841860670831539, + "learning_rate": 3.7524411029484984e-07, + "loss": 0.284, + "step": 3494 + }, + { + "epoch": 1.759546313799622, + "grad_norm": 0.18128560254995008, + "learning_rate": 3.7498406422217967e-07, + "loss": 0.2688, + "step": 3495 + }, + { + "epoch": 1.7600504095778198, + "grad_norm": 0.1777006939689815, + "learning_rate": 3.7472405421583283e-07, + "loss": 0.2665, + "step": 3496 + }, + { + "epoch": 1.7605545053560177, + "grad_norm": 0.17651881222532234, + "learning_rate": 3.7446408035082e-07, + "loss": 0.2659, + "step": 3497 + }, + { + "epoch": 1.7610586011342155, + "grad_norm": 0.17121563039114812, + "learning_rate": 3.742041427021426e-07, + "loss": 0.2661, + "step": 3498 + }, + { + "epoch": 1.7615626969124134, + "grad_norm": 0.17086616322817438, + "learning_rate": 3.7394424134479085e-07, + "loss": 0.2748, + "step": 3499 + }, + { + "epoch": 1.7620667926906113, + "grad_norm": 0.1845306405915281, + "learning_rate": 3.736843763537443e-07, + "loss": 0.2666, + "step": 3500 + }, + { + "epoch": 1.762570888468809, + "grad_norm": 0.18385553877215485, + "learning_rate": 3.734245478039729e-07, + "loss": 0.2876, + "step": 3501 + }, + { + "epoch": 1.763074984247007, + "grad_norm": 0.1766369636170071, + "learning_rate": 3.7316475577043515e-07, + "loss": 0.2662, + "step": 3502 + }, + { + "epoch": 1.7635790800252047, + "grad_norm": 0.17153706729666343, + "learning_rate": 3.7290500032807994e-07, + "loss": 0.2629, + "step": 3503 + }, + { + "epoch": 1.7640831758034028, + "grad_norm": 0.17299192192087665, + "learning_rate": 3.726452815518446e-07, + "loss": 0.2619, + "step": 3504 + }, + { + "epoch": 1.7645872715816004, + "grad_norm": 0.19384856527161845, + "learning_rate": 3.7238559951665684e-07, + "loss": 0.264, + "step": 3505 + }, + { + "epoch": 1.7650913673597983, + "grad_norm": 0.17125569407787852, + "learning_rate": 3.721259542974329e-07, + "loss": 0.2736, + "step": 3506 + }, + { + "epoch": 1.7655954631379962, + "grad_norm": 0.17240058028350044, + "learning_rate": 3.7186634596907905e-07, + "loss": 0.2747, + "step": 3507 + }, + { + "epoch": 1.766099558916194, + "grad_norm": 0.1788859679300591, + "learning_rate": 3.716067746064907e-07, + "loss": 0.275, + "step": 3508 + }, + { + "epoch": 1.766603654694392, + "grad_norm": 0.17068346062555592, + "learning_rate": 3.7134724028455235e-07, + "loss": 0.2596, + "step": 3509 + }, + { + "epoch": 1.7671077504725898, + "grad_norm": 0.17273509001374454, + "learning_rate": 3.710877430781384e-07, + "loss": 0.2734, + "step": 3510 + }, + { + "epoch": 1.7676118462507877, + "grad_norm": 0.19441499623673344, + "learning_rate": 3.708282830621118e-07, + "loss": 0.2865, + "step": 3511 + }, + { + "epoch": 1.7681159420289854, + "grad_norm": 0.18157876524943378, + "learning_rate": 3.705688603113256e-07, + "loss": 0.2659, + "step": 3512 + }, + { + "epoch": 1.7686200378071835, + "grad_norm": 0.1755434232731158, + "learning_rate": 3.703094749006214e-07, + "loss": 0.2657, + "step": 3513 + }, + { + "epoch": 1.7691241335853811, + "grad_norm": 0.17616250911630144, + "learning_rate": 3.700501269048301e-07, + "loss": 0.2696, + "step": 3514 + }, + { + "epoch": 1.7696282293635792, + "grad_norm": 0.1717712747533756, + "learning_rate": 3.697908163987724e-07, + "loss": 0.263, + "step": 3515 + }, + { + "epoch": 1.7701323251417769, + "grad_norm": 0.1746899392847966, + "learning_rate": 3.6953154345725733e-07, + "loss": 0.2833, + "step": 3516 + }, + { + "epoch": 1.7706364209199748, + "grad_norm": 0.1705652748493287, + "learning_rate": 3.6927230815508403e-07, + "loss": 0.2637, + "step": 3517 + }, + { + "epoch": 1.7711405166981726, + "grad_norm": 0.18596072083429435, + "learning_rate": 3.6901311056703985e-07, + "loss": 0.2953, + "step": 3518 + }, + { + "epoch": 1.7716446124763705, + "grad_norm": 0.17652711756186662, + "learning_rate": 3.6875395076790186e-07, + "loss": 0.2861, + "step": 3519 + }, + { + "epoch": 1.7721487082545684, + "grad_norm": 0.17706846127443582, + "learning_rate": 3.684948288324362e-07, + "loss": 0.2646, + "step": 3520 + }, + { + "epoch": 1.7726528040327663, + "grad_norm": 0.17967101443910952, + "learning_rate": 3.6823574483539767e-07, + "loss": 0.2862, + "step": 3521 + }, + { + "epoch": 1.7731568998109641, + "grad_norm": 0.1747251079117009, + "learning_rate": 3.6797669885153046e-07, + "loss": 0.2683, + "step": 3522 + }, + { + "epoch": 1.7736609955891618, + "grad_norm": 0.1687110695892719, + "learning_rate": 3.6771769095556783e-07, + "loss": 0.2653, + "step": 3523 + }, + { + "epoch": 1.77416509136736, + "grad_norm": 0.17110856789881085, + "learning_rate": 3.6745872122223196e-07, + "loss": 0.2758, + "step": 3524 + }, + { + "epoch": 1.7746691871455575, + "grad_norm": 0.17460922566080275, + "learning_rate": 3.6719978972623385e-07, + "loss": 0.2753, + "step": 3525 + }, + { + "epoch": 1.7751732829237556, + "grad_norm": 0.1847111261299794, + "learning_rate": 3.669408965422739e-07, + "loss": 0.2731, + "step": 3526 + }, + { + "epoch": 1.7756773787019533, + "grad_norm": 0.17449754787956734, + "learning_rate": 3.6668204174504085e-07, + "loss": 0.2726, + "step": 3527 + }, + { + "epoch": 1.7761814744801512, + "grad_norm": 0.17338254071796924, + "learning_rate": 3.664232254092131e-07, + "loss": 0.2793, + "step": 3528 + }, + { + "epoch": 1.776685570258349, + "grad_norm": 0.17628576320824763, + "learning_rate": 3.6616444760945744e-07, + "loss": 0.2793, + "step": 3529 + }, + { + "epoch": 1.777189666036547, + "grad_norm": 0.1761083816514799, + "learning_rate": 3.659057084204292e-07, + "loss": 0.2762, + "step": 3530 + }, + { + "epoch": 1.7776937618147448, + "grad_norm": 0.17618815238730148, + "learning_rate": 3.656470079167737e-07, + "loss": 0.2723, + "step": 3531 + }, + { + "epoch": 1.7781978575929427, + "grad_norm": 0.2082525337835813, + "learning_rate": 3.6538834617312395e-07, + "loss": 0.2656, + "step": 3532 + }, + { + "epoch": 1.7787019533711406, + "grad_norm": 0.1740382361506214, + "learning_rate": 3.6512972326410274e-07, + "loss": 0.2874, + "step": 3533 + }, + { + "epoch": 1.7792060491493382, + "grad_norm": 0.18497511420153842, + "learning_rate": 3.6487113926432067e-07, + "loss": 0.2831, + "step": 3534 + }, + { + "epoch": 1.7797101449275363, + "grad_norm": 0.1868818820255656, + "learning_rate": 3.6461259424837787e-07, + "loss": 0.2704, + "step": 3535 + }, + { + "epoch": 1.780214240705734, + "grad_norm": 0.1747555047792214, + "learning_rate": 3.643540882908631e-07, + "loss": 0.2742, + "step": 3536 + }, + { + "epoch": 1.780718336483932, + "grad_norm": 0.16955294868925422, + "learning_rate": 3.640956214663534e-07, + "loss": 0.2805, + "step": 3537 + }, + { + "epoch": 1.7812224322621297, + "grad_norm": 0.17195016907202454, + "learning_rate": 3.638371938494151e-07, + "loss": 0.2726, + "step": 3538 + }, + { + "epoch": 1.7817265280403278, + "grad_norm": 0.17077392343569042, + "learning_rate": 3.635788055146028e-07, + "loss": 0.2695, + "step": 3539 + }, + { + "epoch": 1.7822306238185255, + "grad_norm": 0.16922021837013979, + "learning_rate": 3.633204565364602e-07, + "loss": 0.2607, + "step": 3540 + }, + { + "epoch": 1.7827347195967234, + "grad_norm": 0.17020929306602342, + "learning_rate": 3.63062146989519e-07, + "loss": 0.2774, + "step": 3541 + }, + { + "epoch": 1.7832388153749212, + "grad_norm": 0.18083426533544908, + "learning_rate": 3.628038769483002e-07, + "loss": 0.2841, + "step": 3542 + }, + { + "epoch": 1.7837429111531191, + "grad_norm": 0.1790034539370043, + "learning_rate": 3.625456464873131e-07, + "loss": 0.2713, + "step": 3543 + }, + { + "epoch": 1.784247006931317, + "grad_norm": 0.1729253600603901, + "learning_rate": 3.6228745568105534e-07, + "loss": 0.2714, + "step": 3544 + }, + { + "epoch": 1.7847511027095146, + "grad_norm": 0.20077594866662438, + "learning_rate": 3.620293046040137e-07, + "loss": 0.2694, + "step": 3545 + }, + { + "epoch": 1.7852551984877127, + "grad_norm": 0.17484350438222895, + "learning_rate": 3.6177119333066274e-07, + "loss": 0.2577, + "step": 3546 + }, + { + "epoch": 1.7857592942659104, + "grad_norm": 0.17015014074186688, + "learning_rate": 3.6151312193546653e-07, + "loss": 0.2756, + "step": 3547 + }, + { + "epoch": 1.7862633900441085, + "grad_norm": 0.18310485144032457, + "learning_rate": 3.612550904928765e-07, + "loss": 0.2605, + "step": 3548 + }, + { + "epoch": 1.7867674858223062, + "grad_norm": 0.1697602759063394, + "learning_rate": 3.6099709907733355e-07, + "loss": 0.277, + "step": 3549 + }, + { + "epoch": 1.7872715816005043, + "grad_norm": 0.17942080037574837, + "learning_rate": 3.6073914776326654e-07, + "loss": 0.2766, + "step": 3550 + }, + { + "epoch": 1.787775677378702, + "grad_norm": 0.1749196729191675, + "learning_rate": 3.6048123662509257e-07, + "loss": 0.2702, + "step": 3551 + }, + { + "epoch": 1.7882797731568998, + "grad_norm": 0.17142762886817356, + "learning_rate": 3.6022336573721757e-07, + "loss": 0.2646, + "step": 3552 + }, + { + "epoch": 1.7887838689350977, + "grad_norm": 0.16935198240355007, + "learning_rate": 3.599655351740356e-07, + "loss": 0.2799, + "step": 3553 + }, + { + "epoch": 1.7892879647132955, + "grad_norm": 0.17626517942064382, + "learning_rate": 3.597077450099295e-07, + "loss": 0.2608, + "step": 3554 + }, + { + "epoch": 1.7897920604914934, + "grad_norm": 0.1756679530760868, + "learning_rate": 3.5944999531926963e-07, + "loss": 0.2762, + "step": 3555 + }, + { + "epoch": 1.7902961562696913, + "grad_norm": 0.17715691284816204, + "learning_rate": 3.5919228617641564e-07, + "loss": 0.2753, + "step": 3556 + }, + { + "epoch": 1.7908002520478892, + "grad_norm": 0.1778053393171107, + "learning_rate": 3.5893461765571464e-07, + "loss": 0.2705, + "step": 3557 + }, + { + "epoch": 1.7913043478260868, + "grad_norm": 0.17959544599325605, + "learning_rate": 3.5867698983150277e-07, + "loss": 0.2629, + "step": 3558 + }, + { + "epoch": 1.791808443604285, + "grad_norm": 0.1662083229493581, + "learning_rate": 3.5841940277810395e-07, + "loss": 0.2628, + "step": 3559 + }, + { + "epoch": 1.7923125393824826, + "grad_norm": 0.16874268935126943, + "learning_rate": 3.581618565698301e-07, + "loss": 0.27, + "step": 3560 + }, + { + "epoch": 1.7928166351606807, + "grad_norm": 0.17322137849836952, + "learning_rate": 3.5790435128098227e-07, + "loss": 0.2669, + "step": 3561 + }, + { + "epoch": 1.7933207309388783, + "grad_norm": 0.1742075041952224, + "learning_rate": 3.576468869858486e-07, + "loss": 0.2726, + "step": 3562 + }, + { + "epoch": 1.7938248267170762, + "grad_norm": 0.168819659068297, + "learning_rate": 3.573894637587064e-07, + "loss": 0.2707, + "step": 3563 + }, + { + "epoch": 1.794328922495274, + "grad_norm": 0.19374260839838295, + "learning_rate": 3.571320816738205e-07, + "loss": 0.2779, + "step": 3564 + }, + { + "epoch": 1.794833018273472, + "grad_norm": 0.16947665412300375, + "learning_rate": 3.568747408054439e-07, + "loss": 0.2748, + "step": 3565 + }, + { + "epoch": 1.7953371140516698, + "grad_norm": 0.17469189198050658, + "learning_rate": 3.5661744122781804e-07, + "loss": 0.2618, + "step": 3566 + }, + { + "epoch": 1.7958412098298677, + "grad_norm": 0.1702481414901748, + "learning_rate": 3.5636018301517213e-07, + "loss": 0.2597, + "step": 3567 + }, + { + "epoch": 1.7963453056080656, + "grad_norm": 0.17222409493077534, + "learning_rate": 3.5610296624172365e-07, + "loss": 0.2649, + "step": 3568 + }, + { + "epoch": 1.7968494013862633, + "grad_norm": 0.16931592244630336, + "learning_rate": 3.55845790981678e-07, + "loss": 0.2731, + "step": 3569 + }, + { + "epoch": 1.7973534971644614, + "grad_norm": 0.17620807131039093, + "learning_rate": 3.555886573092287e-07, + "loss": 0.2749, + "step": 3570 + }, + { + "epoch": 1.797857592942659, + "grad_norm": 0.18052515979122832, + "learning_rate": 3.55331565298557e-07, + "loss": 0.2878, + "step": 3571 + }, + { + "epoch": 1.798361688720857, + "grad_norm": 0.17191185207152138, + "learning_rate": 3.550745150238328e-07, + "loss": 0.2862, + "step": 3572 + }, + { + "epoch": 1.7988657844990548, + "grad_norm": 0.1757024787598414, + "learning_rate": 3.548175065592132e-07, + "loss": 0.2649, + "step": 3573 + }, + { + "epoch": 1.7993698802772526, + "grad_norm": 0.1729303037797562, + "learning_rate": 3.545605399788434e-07, + "loss": 0.2562, + "step": 3574 + }, + { + "epoch": 1.7998739760554505, + "grad_norm": 0.17438697638244505, + "learning_rate": 3.543036153568571e-07, + "loss": 0.2601, + "step": 3575 + }, + { + "epoch": 1.8003780718336484, + "grad_norm": 0.1785781990776129, + "learning_rate": 3.5404673276737503e-07, + "loss": 0.2732, + "step": 3576 + }, + { + "epoch": 1.8008821676118463, + "grad_norm": 0.17317098668224498, + "learning_rate": 3.5378989228450664e-07, + "loss": 0.2797, + "step": 3577 + }, + { + "epoch": 1.8013862633900442, + "grad_norm": 0.1690137234088919, + "learning_rate": 3.5353309398234833e-07, + "loss": 0.2721, + "step": 3578 + }, + { + "epoch": 1.801890359168242, + "grad_norm": 0.17537848542244042, + "learning_rate": 3.5327633793498535e-07, + "loss": 0.2714, + "step": 3579 + }, + { + "epoch": 1.8023944549464397, + "grad_norm": 0.16986985399350213, + "learning_rate": 3.530196242164899e-07, + "loss": 0.2637, + "step": 3580 + }, + { + "epoch": 1.8028985507246378, + "grad_norm": 0.16970500175613376, + "learning_rate": 3.5276295290092225e-07, + "loss": 0.2919, + "step": 3581 + }, + { + "epoch": 1.8034026465028354, + "grad_norm": 0.1798669561885536, + "learning_rate": 3.525063240623307e-07, + "loss": 0.2743, + "step": 3582 + }, + { + "epoch": 1.8034026465028354, + "eval_loss": 0.3062366843223572, + "eval_runtime": 17.926, + "eval_samples_per_second": 47.696, + "eval_steps_per_second": 1.004, + "step": 3582 + }, + { + "epoch": 1.8039067422810335, + "grad_norm": 0.17610026348303112, + "learning_rate": 3.5224973777475096e-07, + "loss": 0.2622, + "step": 3583 + }, + { + "epoch": 1.8044108380592312, + "grad_norm": 0.1819103774919584, + "learning_rate": 3.519931941122066e-07, + "loss": 0.2749, + "step": 3584 + }, + { + "epoch": 1.804914933837429, + "grad_norm": 0.1717127969736599, + "learning_rate": 3.517366931487089e-07, + "loss": 0.2703, + "step": 3585 + }, + { + "epoch": 1.805419029615627, + "grad_norm": 0.17885330880659442, + "learning_rate": 3.514802349582569e-07, + "loss": 0.2907, + "step": 3586 + }, + { + "epoch": 1.8059231253938248, + "grad_norm": 0.18707259558766182, + "learning_rate": 3.5122381961483684e-07, + "loss": 0.276, + "step": 3587 + }, + { + "epoch": 1.8064272211720227, + "grad_norm": 0.1782099276673349, + "learning_rate": 3.509674471924234e-07, + "loss": 0.2716, + "step": 3588 + }, + { + "epoch": 1.8069313169502206, + "grad_norm": 0.17525854196465432, + "learning_rate": 3.507111177649783e-07, + "loss": 0.2575, + "step": 3589 + }, + { + "epoch": 1.8074354127284185, + "grad_norm": 0.18183285195462048, + "learning_rate": 3.5045483140645063e-07, + "loss": 0.2871, + "step": 3590 + }, + { + "epoch": 1.807939508506616, + "grad_norm": 0.16829292657965922, + "learning_rate": 3.50198588190778e-07, + "loss": 0.277, + "step": 3591 + }, + { + "epoch": 1.8084436042848142, + "grad_norm": 0.16750119184558296, + "learning_rate": 3.4994238819188446e-07, + "loss": 0.2688, + "step": 3592 + }, + { + "epoch": 1.8089477000630119, + "grad_norm": 0.19435552393670724, + "learning_rate": 3.496862314836826e-07, + "loss": 0.2752, + "step": 3593 + }, + { + "epoch": 1.80945179584121, + "grad_norm": 0.17151739988556397, + "learning_rate": 3.4943011814007154e-07, + "loss": 0.2752, + "step": 3594 + }, + { + "epoch": 1.8099558916194076, + "grad_norm": 0.17044681265940018, + "learning_rate": 3.4917404823493895e-07, + "loss": 0.2764, + "step": 3595 + }, + { + "epoch": 1.8104599873976055, + "grad_norm": 0.1706463322105864, + "learning_rate": 3.489180218421591e-07, + "loss": 0.2795, + "step": 3596 + }, + { + "epoch": 1.8109640831758034, + "grad_norm": 0.1743724675299324, + "learning_rate": 3.486620390355939e-07, + "loss": 0.2778, + "step": 3597 + }, + { + "epoch": 1.8114681789540013, + "grad_norm": 0.1672255541484769, + "learning_rate": 3.48406099889093e-07, + "loss": 0.2602, + "step": 3598 + }, + { + "epoch": 1.8119722747321991, + "grad_norm": 0.17213091184076965, + "learning_rate": 3.4815020447649325e-07, + "loss": 0.2662, + "step": 3599 + }, + { + "epoch": 1.812476370510397, + "grad_norm": 0.17382490282350782, + "learning_rate": 3.478943528716189e-07, + "loss": 0.2702, + "step": 3600 + }, + { + "epoch": 1.8129804662885949, + "grad_norm": 0.1707010922925502, + "learning_rate": 3.476385451482813e-07, + "loss": 0.2694, + "step": 3601 + }, + { + "epoch": 1.8134845620667925, + "grad_norm": 0.16861692088845995, + "learning_rate": 3.473827813802798e-07, + "loss": 0.2682, + "step": 3602 + }, + { + "epoch": 1.8139886578449906, + "grad_norm": 0.17226530366785384, + "learning_rate": 3.4712706164140044e-07, + "loss": 0.2707, + "step": 3603 + }, + { + "epoch": 1.8144927536231883, + "grad_norm": 0.17374229482438772, + "learning_rate": 3.468713860054166e-07, + "loss": 0.2665, + "step": 3604 + }, + { + "epoch": 1.8149968494013864, + "grad_norm": 0.17166711837546353, + "learning_rate": 3.466157545460895e-07, + "loss": 0.2614, + "step": 3605 + }, + { + "epoch": 1.815500945179584, + "grad_norm": 0.18055980554179216, + "learning_rate": 3.463601673371669e-07, + "loss": 0.286, + "step": 3606 + }, + { + "epoch": 1.816005040957782, + "grad_norm": 0.16875286919833732, + "learning_rate": 3.4610462445238447e-07, + "loss": 0.2754, + "step": 3607 + }, + { + "epoch": 1.8165091367359798, + "grad_norm": 0.17716433583796581, + "learning_rate": 3.4584912596546435e-07, + "loss": 0.2741, + "step": 3608 + }, + { + "epoch": 1.8170132325141777, + "grad_norm": 0.17475847439692482, + "learning_rate": 3.4559367195011663e-07, + "loss": 0.2709, + "step": 3609 + }, + { + "epoch": 1.8175173282923756, + "grad_norm": 0.17444775099348855, + "learning_rate": 3.4533826248003807e-07, + "loss": 0.2818, + "step": 3610 + }, + { + "epoch": 1.8180214240705734, + "grad_norm": 0.16496716406158368, + "learning_rate": 3.4508289762891253e-07, + "loss": 0.2597, + "step": 3611 + }, + { + "epoch": 1.8185255198487713, + "grad_norm": 0.17214388173996828, + "learning_rate": 3.448275774704116e-07, + "loss": 0.2858, + "step": 3612 + }, + { + "epoch": 1.819029615626969, + "grad_norm": 0.17019231591147854, + "learning_rate": 3.4457230207819317e-07, + "loss": 0.2678, + "step": 3613 + }, + { + "epoch": 1.819533711405167, + "grad_norm": 0.17699657912936148, + "learning_rate": 3.443170715259027e-07, + "loss": 0.2577, + "step": 3614 + }, + { + "epoch": 1.8200378071833647, + "grad_norm": 0.1752044669776778, + "learning_rate": 3.440618858871728e-07, + "loss": 0.2641, + "step": 3615 + }, + { + "epoch": 1.8205419029615628, + "grad_norm": 0.1796499070557965, + "learning_rate": 3.438067452356229e-07, + "loss": 0.2818, + "step": 3616 + }, + { + "epoch": 1.8210459987397605, + "grad_norm": 0.17743300718849356, + "learning_rate": 3.435516496448594e-07, + "loss": 0.2632, + "step": 3617 + }, + { + "epoch": 1.8215500945179586, + "grad_norm": 0.17048364525336646, + "learning_rate": 3.4329659918847574e-07, + "loss": 0.2797, + "step": 3618 + }, + { + "epoch": 1.8220541902961562, + "grad_norm": 0.17700606194697294, + "learning_rate": 3.430415939400526e-07, + "loss": 0.2738, + "step": 3619 + }, + { + "epoch": 1.822558286074354, + "grad_norm": 0.1732841514749814, + "learning_rate": 3.4278663397315714e-07, + "loss": 0.2767, + "step": 3620 + }, + { + "epoch": 1.823062381852552, + "grad_norm": 0.1746313075508639, + "learning_rate": 3.4253171936134407e-07, + "loss": 0.2737, + "step": 3621 + }, + { + "epoch": 1.8235664776307499, + "grad_norm": 0.16856675923392617, + "learning_rate": 3.4227685017815435e-07, + "loss": 0.2704, + "step": 3622 + }, + { + "epoch": 1.8240705734089477, + "grad_norm": 0.17817947743263224, + "learning_rate": 3.420220264971164e-07, + "loss": 0.2633, + "step": 3623 + }, + { + "epoch": 1.8245746691871454, + "grad_norm": 0.17390789109467564, + "learning_rate": 3.417672483917451e-07, + "loss": 0.2631, + "step": 3624 + }, + { + "epoch": 1.8250787649653435, + "grad_norm": 0.170222280271219, + "learning_rate": 3.4151251593554255e-07, + "loss": 0.2666, + "step": 3625 + }, + { + "epoch": 1.8255828607435411, + "grad_norm": 0.18503069700309455, + "learning_rate": 3.4125782920199744e-07, + "loss": 0.2727, + "step": 3626 + }, + { + "epoch": 1.8260869565217392, + "grad_norm": 0.17199894183725506, + "learning_rate": 3.41003188264585e-07, + "loss": 0.268, + "step": 3627 + }, + { + "epoch": 1.826591052299937, + "grad_norm": 0.17933515312179926, + "learning_rate": 3.407485931967681e-07, + "loss": 0.2793, + "step": 3628 + }, + { + "epoch": 1.827095148078135, + "grad_norm": 0.17059652406791323, + "learning_rate": 3.404940440719953e-07, + "loss": 0.2635, + "step": 3629 + }, + { + "epoch": 1.8275992438563327, + "grad_norm": 0.17365046482826368, + "learning_rate": 3.4023954096370285e-07, + "loss": 0.2762, + "step": 3630 + }, + { + "epoch": 1.8281033396345305, + "grad_norm": 0.17048013344794072, + "learning_rate": 3.399850839453131e-07, + "loss": 0.2672, + "step": 3631 + }, + { + "epoch": 1.8286074354127284, + "grad_norm": 0.18254834974101053, + "learning_rate": 3.397306730902355e-07, + "loss": 0.2855, + "step": 3632 + }, + { + "epoch": 1.8291115311909263, + "grad_norm": 0.1782057659837957, + "learning_rate": 3.394763084718659e-07, + "loss": 0.2723, + "step": 3633 + }, + { + "epoch": 1.8296156269691242, + "grad_norm": 0.1869729195366187, + "learning_rate": 3.3922199016358663e-07, + "loss": 0.2815, + "step": 3634 + }, + { + "epoch": 1.830119722747322, + "grad_norm": 0.17300425848362272, + "learning_rate": 3.389677182387676e-07, + "loss": 0.2766, + "step": 3635 + }, + { + "epoch": 1.83062381852552, + "grad_norm": 0.17409263469313568, + "learning_rate": 3.38713492770764e-07, + "loss": 0.2725, + "step": 3636 + }, + { + "epoch": 1.8311279143037176, + "grad_norm": 0.17168330174108185, + "learning_rate": 3.384593138329188e-07, + "loss": 0.2636, + "step": 3637 + }, + { + "epoch": 1.8316320100819157, + "grad_norm": 0.17321147308428417, + "learning_rate": 3.382051814985607e-07, + "loss": 0.2643, + "step": 3638 + }, + { + "epoch": 1.8321361058601133, + "grad_norm": 0.17382144255134466, + "learning_rate": 3.3795109584100565e-07, + "loss": 0.2774, + "step": 3639 + }, + { + "epoch": 1.8326402016383114, + "grad_norm": 0.17237782646607822, + "learning_rate": 3.376970569335557e-07, + "loss": 0.2912, + "step": 3640 + }, + { + "epoch": 1.833144297416509, + "grad_norm": 0.18252171874104944, + "learning_rate": 3.374430648494991e-07, + "loss": 0.2874, + "step": 3641 + }, + { + "epoch": 1.833648393194707, + "grad_norm": 0.17047186028552003, + "learning_rate": 3.371891196621116e-07, + "loss": 0.2752, + "step": 3642 + }, + { + "epoch": 1.8341524889729048, + "grad_norm": 0.1670392087043219, + "learning_rate": 3.3693522144465426e-07, + "loss": 0.2659, + "step": 3643 + }, + { + "epoch": 1.8346565847511027, + "grad_norm": 0.173528591535757, + "learning_rate": 3.366813702703757e-07, + "loss": 0.2817, + "step": 3644 + }, + { + "epoch": 1.8351606805293006, + "grad_norm": 0.17390840294297605, + "learning_rate": 3.364275662125099e-07, + "loss": 0.2736, + "step": 3645 + }, + { + "epoch": 1.8356647763074985, + "grad_norm": 0.16690990450267892, + "learning_rate": 3.361738093442781e-07, + "loss": 0.2603, + "step": 3646 + }, + { + "epoch": 1.8361688720856963, + "grad_norm": 0.17814929644569416, + "learning_rate": 3.359200997388874e-07, + "loss": 0.2645, + "step": 3647 + }, + { + "epoch": 1.836672967863894, + "grad_norm": 0.18228489937252948, + "learning_rate": 3.3566643746953136e-07, + "loss": 0.2847, + "step": 3648 + }, + { + "epoch": 1.837177063642092, + "grad_norm": 0.17393875067169687, + "learning_rate": 3.354128226093903e-07, + "loss": 0.2589, + "step": 3649 + }, + { + "epoch": 1.8376811594202898, + "grad_norm": 0.17147582807039788, + "learning_rate": 3.351592552316302e-07, + "loss": 0.2617, + "step": 3650 + }, + { + "epoch": 1.8381852551984879, + "grad_norm": 0.17359008786164207, + "learning_rate": 3.34905735409404e-07, + "loss": 0.2672, + "step": 3651 + }, + { + "epoch": 1.8386893509766855, + "grad_norm": 0.17091069458322225, + "learning_rate": 3.346522632158502e-07, + "loss": 0.2737, + "step": 3652 + }, + { + "epoch": 1.8391934467548834, + "grad_norm": 0.17036599489519516, + "learning_rate": 3.343988387240945e-07, + "loss": 0.2655, + "step": 3653 + }, + { + "epoch": 1.8396975425330813, + "grad_norm": 0.1751804328426936, + "learning_rate": 3.3414546200724765e-07, + "loss": 0.2778, + "step": 3654 + }, + { + "epoch": 1.8402016383112791, + "grad_norm": 0.1705462977459914, + "learning_rate": 3.338921331384078e-07, + "loss": 0.2775, + "step": 3655 + }, + { + "epoch": 1.840705734089477, + "grad_norm": 0.17625220305705913, + "learning_rate": 3.336388521906587e-07, + "loss": 0.271, + "step": 3656 + }, + { + "epoch": 1.841209829867675, + "grad_norm": 0.1711600986441844, + "learning_rate": 3.333856192370699e-07, + "loss": 0.2773, + "step": 3657 + }, + { + "epoch": 1.8417139256458728, + "grad_norm": 0.1861777517647817, + "learning_rate": 3.3313243435069796e-07, + "loss": 0.278, + "step": 3658 + }, + { + "epoch": 1.8422180214240704, + "grad_norm": 0.17302149508680212, + "learning_rate": 3.328792976045849e-07, + "loss": 0.2668, + "step": 3659 + }, + { + "epoch": 1.8427221172022685, + "grad_norm": 0.17199119227595985, + "learning_rate": 3.3262620907175935e-07, + "loss": 0.2591, + "step": 3660 + }, + { + "epoch": 1.8432262129804662, + "grad_norm": 0.17344070020135455, + "learning_rate": 3.323731688252356e-07, + "loss": 0.2716, + "step": 3661 + }, + { + "epoch": 1.8437303087586643, + "grad_norm": 0.1724615950455862, + "learning_rate": 3.3212017693801444e-07, + "loss": 0.2701, + "step": 3662 + }, + { + "epoch": 1.844234404536862, + "grad_norm": 0.1751562140997247, + "learning_rate": 3.3186723348308204e-07, + "loss": 0.2808, + "step": 3663 + }, + { + "epoch": 1.8447385003150598, + "grad_norm": 0.18510250956865856, + "learning_rate": 3.316143385334113e-07, + "loss": 0.2698, + "step": 3664 + }, + { + "epoch": 1.8452425960932577, + "grad_norm": 0.19779933795030186, + "learning_rate": 3.3136149216196094e-07, + "loss": 0.2812, + "step": 3665 + }, + { + "epoch": 1.8457466918714556, + "grad_norm": 0.17539119190444977, + "learning_rate": 3.311086944416752e-07, + "loss": 0.267, + "step": 3666 + }, + { + "epoch": 1.8462507876496534, + "grad_norm": 0.1688698268367981, + "learning_rate": 3.3085594544548506e-07, + "loss": 0.2706, + "step": 3667 + }, + { + "epoch": 1.8467548834278513, + "grad_norm": 0.16370546456235416, + "learning_rate": 3.306032452463067e-07, + "loss": 0.2593, + "step": 3668 + }, + { + "epoch": 1.8472589792060492, + "grad_norm": 0.16664793193380006, + "learning_rate": 3.303505939170429e-07, + "loss": 0.2747, + "step": 3669 + }, + { + "epoch": 1.8477630749842469, + "grad_norm": 0.1920173379347818, + "learning_rate": 3.3009799153058185e-07, + "loss": 0.2808, + "step": 3670 + }, + { + "epoch": 1.848267170762445, + "grad_norm": 0.17169232742511578, + "learning_rate": 3.298454381597976e-07, + "loss": 0.2737, + "step": 3671 + }, + { + "epoch": 1.8487712665406426, + "grad_norm": 0.1768506079726659, + "learning_rate": 3.295929338775505e-07, + "loss": 0.2821, + "step": 3672 + }, + { + "epoch": 1.8492753623188407, + "grad_norm": 0.17791535437923783, + "learning_rate": 3.2934047875668624e-07, + "loss": 0.3001, + "step": 3673 + }, + { + "epoch": 1.8497794580970384, + "grad_norm": 0.1739360428422386, + "learning_rate": 3.290880728700368e-07, + "loss": 0.2636, + "step": 3674 + }, + { + "epoch": 1.8502835538752362, + "grad_norm": 0.17787282666122797, + "learning_rate": 3.2883571629041964e-07, + "loss": 0.2658, + "step": 3675 + }, + { + "epoch": 1.8507876496534341, + "grad_norm": 0.17402992624760774, + "learning_rate": 3.2858340909063785e-07, + "loss": 0.2821, + "step": 3676 + }, + { + "epoch": 1.851291745431632, + "grad_norm": 0.16863049632313487, + "learning_rate": 3.283311513434809e-07, + "loss": 0.2762, + "step": 3677 + }, + { + "epoch": 1.8517958412098299, + "grad_norm": 0.17175916416523554, + "learning_rate": 3.280789431217231e-07, + "loss": 0.2646, + "step": 3678 + }, + { + "epoch": 1.8522999369880278, + "grad_norm": 0.1816439994226865, + "learning_rate": 3.278267844981254e-07, + "loss": 0.2751, + "step": 3679 + }, + { + "epoch": 1.8528040327662256, + "grad_norm": 0.17768054704900385, + "learning_rate": 3.275746755454337e-07, + "loss": 0.2934, + "step": 3680 + }, + { + "epoch": 1.8533081285444233, + "grad_norm": 0.17526569011593024, + "learning_rate": 3.2732261633638014e-07, + "loss": 0.2783, + "step": 3681 + }, + { + "epoch": 1.8538122243226214, + "grad_norm": 0.1683831095084952, + "learning_rate": 3.2707060694368185e-07, + "loss": 0.271, + "step": 3682 + }, + { + "epoch": 1.854316320100819, + "grad_norm": 0.184280262405193, + "learning_rate": 3.268186474400424e-07, + "loss": 0.2726, + "step": 3683 + }, + { + "epoch": 1.8548204158790171, + "grad_norm": 0.17522567840778597, + "learning_rate": 3.2656673789815045e-07, + "loss": 0.2847, + "step": 3684 + }, + { + "epoch": 1.8553245116572148, + "grad_norm": 0.17558489027707613, + "learning_rate": 3.2631487839067995e-07, + "loss": 0.2865, + "step": 3685 + }, + { + "epoch": 1.855828607435413, + "grad_norm": 0.17002382304061303, + "learning_rate": 3.260630689902913e-07, + "loss": 0.2491, + "step": 3686 + }, + { + "epoch": 1.8563327032136105, + "grad_norm": 0.17988503644984635, + "learning_rate": 3.2581130976962966e-07, + "loss": 0.2584, + "step": 3687 + }, + { + "epoch": 1.8568367989918084, + "grad_norm": 0.17012660875417476, + "learning_rate": 3.255596008013263e-07, + "loss": 0.2755, + "step": 3688 + }, + { + "epoch": 1.8573408947700063, + "grad_norm": 0.17339868792540347, + "learning_rate": 3.2530794215799726e-07, + "loss": 0.2957, + "step": 3689 + }, + { + "epoch": 1.8578449905482042, + "grad_norm": 0.17081604800735575, + "learning_rate": 3.2505633391224497e-07, + "loss": 0.2619, + "step": 3690 + }, + { + "epoch": 1.858349086326402, + "grad_norm": 0.17873255506104102, + "learning_rate": 3.248047761366566e-07, + "loss": 0.2494, + "step": 3691 + }, + { + "epoch": 1.8588531821045997, + "grad_norm": 0.18817722225475367, + "learning_rate": 3.2455326890380493e-07, + "loss": 0.2767, + "step": 3692 + }, + { + "epoch": 1.8593572778827978, + "grad_norm": 0.17152115532488543, + "learning_rate": 3.243018122862484e-07, + "loss": 0.2821, + "step": 3693 + }, + { + "epoch": 1.8598613736609955, + "grad_norm": 0.16839244253657323, + "learning_rate": 3.240504063565307e-07, + "loss": 0.2529, + "step": 3694 + }, + { + "epoch": 1.8603654694391936, + "grad_norm": 0.1730644160338181, + "learning_rate": 3.2379905118718075e-07, + "loss": 0.2669, + "step": 3695 + }, + { + "epoch": 1.8608695652173912, + "grad_norm": 0.1760485174615298, + "learning_rate": 3.2354774685071297e-07, + "loss": 0.2525, + "step": 3696 + }, + { + "epoch": 1.8613736609955893, + "grad_norm": 0.1724016942572103, + "learning_rate": 3.232964934196273e-07, + "loss": 0.2646, + "step": 3697 + }, + { + "epoch": 1.861877756773787, + "grad_norm": 0.1695018921199434, + "learning_rate": 3.230452909664084e-07, + "loss": 0.2597, + "step": 3698 + }, + { + "epoch": 1.8623818525519849, + "grad_norm": 0.17585413997700072, + "learning_rate": 3.2279413956352713e-07, + "loss": 0.2672, + "step": 3699 + }, + { + "epoch": 1.8628859483301827, + "grad_norm": 0.17301030936638456, + "learning_rate": 3.2254303928343886e-07, + "loss": 0.2656, + "step": 3700 + }, + { + "epoch": 1.8633900441083806, + "grad_norm": 0.17993787406188083, + "learning_rate": 3.2229199019858426e-07, + "loss": 0.2731, + "step": 3701 + }, + { + "epoch": 1.8638941398865785, + "grad_norm": 0.17406377463523626, + "learning_rate": 3.2204099238138986e-07, + "loss": 0.2663, + "step": 3702 + }, + { + "epoch": 1.8643982356647764, + "grad_norm": 0.17543670483670815, + "learning_rate": 3.217900459042666e-07, + "loss": 0.2744, + "step": 3703 + }, + { + "epoch": 1.8649023314429742, + "grad_norm": 0.18680717781511091, + "learning_rate": 3.2153915083961124e-07, + "loss": 0.2768, + "step": 3704 + }, + { + "epoch": 1.865406427221172, + "grad_norm": 0.1799452415248967, + "learning_rate": 3.2128830725980527e-07, + "loss": 0.2829, + "step": 3705 + }, + { + "epoch": 1.86591052299937, + "grad_norm": 0.17633674870398874, + "learning_rate": 3.210375152372157e-07, + "loss": 0.2732, + "step": 3706 + }, + { + "epoch": 1.8664146187775676, + "grad_norm": 0.17843417862899802, + "learning_rate": 3.207867748441945e-07, + "loss": 0.2672, + "step": 3707 + }, + { + "epoch": 1.8669187145557657, + "grad_norm": 0.1716848365869712, + "learning_rate": 3.2053608615307836e-07, + "loss": 0.2626, + "step": 3708 + }, + { + "epoch": 1.8674228103339634, + "grad_norm": 0.17365023761079248, + "learning_rate": 3.202854492361897e-07, + "loss": 0.2682, + "step": 3709 + }, + { + "epoch": 1.8679269061121613, + "grad_norm": 0.17180044980161524, + "learning_rate": 3.2003486416583566e-07, + "loss": 0.2616, + "step": 3710 + }, + { + "epoch": 1.8684310018903592, + "grad_norm": 0.1722207017048821, + "learning_rate": 3.1978433101430857e-07, + "loss": 0.2749, + "step": 3711 + }, + { + "epoch": 1.868935097668557, + "grad_norm": 0.16714750604744708, + "learning_rate": 3.1953384985388543e-07, + "loss": 0.2548, + "step": 3712 + }, + { + "epoch": 1.869439193446755, + "grad_norm": 0.17097699024786078, + "learning_rate": 3.192834207568288e-07, + "loss": 0.2771, + "step": 3713 + }, + { + "epoch": 1.8699432892249528, + "grad_norm": 0.18094945147962893, + "learning_rate": 3.1903304379538585e-07, + "loss": 0.2816, + "step": 3714 + }, + { + "epoch": 1.8704473850031507, + "grad_norm": 0.188619079337791, + "learning_rate": 3.1878271904178855e-07, + "loss": 0.2558, + "step": 3715 + }, + { + "epoch": 1.8709514807813483, + "grad_norm": 0.17733938393348464, + "learning_rate": 3.1853244656825446e-07, + "loss": 0.2689, + "step": 3716 + }, + { + "epoch": 1.8714555765595464, + "grad_norm": 0.17881276774598476, + "learning_rate": 3.1828222644698515e-07, + "loss": 0.2676, + "step": 3717 + }, + { + "epoch": 1.871959672337744, + "grad_norm": 0.18577840959577496, + "learning_rate": 3.1803205875016806e-07, + "loss": 0.2862, + "step": 3718 + }, + { + "epoch": 1.8724637681159422, + "grad_norm": 0.1708409232652426, + "learning_rate": 3.1778194354997456e-07, + "loss": 0.2637, + "step": 3719 + }, + { + "epoch": 1.8729678638941398, + "grad_norm": 0.17019804041538505, + "learning_rate": 3.1753188091856176e-07, + "loss": 0.2607, + "step": 3720 + }, + { + "epoch": 1.8734719596723377, + "grad_norm": 0.17419845830941813, + "learning_rate": 3.172818709280709e-07, + "loss": 0.2902, + "step": 3721 + }, + { + "epoch": 1.8739760554505356, + "grad_norm": 0.16739241329355717, + "learning_rate": 3.1703191365062843e-07, + "loss": 0.2526, + "step": 3722 + }, + { + "epoch": 1.8744801512287335, + "grad_norm": 0.1736290579522879, + "learning_rate": 3.167820091583455e-07, + "loss": 0.2879, + "step": 3723 + }, + { + "epoch": 1.8749842470069313, + "grad_norm": 0.1829860060599394, + "learning_rate": 3.1653215752331784e-07, + "loss": 0.2577, + "step": 3724 + }, + { + "epoch": 1.8754883427851292, + "grad_norm": 0.17208808352446284, + "learning_rate": 3.1628235881762624e-07, + "loss": 0.2624, + "step": 3725 + }, + { + "epoch": 1.875992438563327, + "grad_norm": 0.17226579687443258, + "learning_rate": 3.160326131133361e-07, + "loss": 0.2724, + "step": 3726 + }, + { + "epoch": 1.8764965343415247, + "grad_norm": 0.1758528503018195, + "learning_rate": 3.1578292048249743e-07, + "loss": 0.2635, + "step": 3727 + }, + { + "epoch": 1.8770006301197228, + "grad_norm": 0.17602460233894715, + "learning_rate": 3.1553328099714493e-07, + "loss": 0.2698, + "step": 3728 + }, + { + "epoch": 1.8775047258979205, + "grad_norm": 0.17916344278800375, + "learning_rate": 3.152836947292984e-07, + "loss": 0.2717, + "step": 3729 + }, + { + "epoch": 1.8780088216761186, + "grad_norm": 0.17570265601183882, + "learning_rate": 3.1503416175096156e-07, + "loss": 0.2794, + "step": 3730 + }, + { + "epoch": 1.8785129174543163, + "grad_norm": 0.1730642589307263, + "learning_rate": 3.147846821341231e-07, + "loss": 0.2521, + "step": 3731 + }, + { + "epoch": 1.8790170132325141, + "grad_norm": 0.1773966697187365, + "learning_rate": 3.145352559507567e-07, + "loss": 0.2722, + "step": 3732 + }, + { + "epoch": 1.879521109010712, + "grad_norm": 0.17094044431659802, + "learning_rate": 3.1428588327281993e-07, + "loss": 0.2758, + "step": 3733 + }, + { + "epoch": 1.8800252047889099, + "grad_norm": 0.1851002728085005, + "learning_rate": 3.140365641722555e-07, + "loss": 0.2734, + "step": 3734 + }, + { + "epoch": 1.8805293005671078, + "grad_norm": 0.18808599356233432, + "learning_rate": 3.137872987209902e-07, + "loss": 0.2826, + "step": 3735 + }, + { + "epoch": 1.8810333963453056, + "grad_norm": 0.16938800737251228, + "learning_rate": 3.1353808699093583e-07, + "loss": 0.2684, + "step": 3736 + }, + { + "epoch": 1.8815374921235035, + "grad_norm": 0.17470028516437633, + "learning_rate": 3.132889290539883e-07, + "loss": 0.2722, + "step": 3737 + }, + { + "epoch": 1.8820415879017012, + "grad_norm": 0.17323545442596444, + "learning_rate": 3.13039824982028e-07, + "loss": 0.2674, + "step": 3738 + }, + { + "epoch": 1.8825456836798993, + "grad_norm": 0.168222975337303, + "learning_rate": 3.127907748469201e-07, + "loss": 0.2596, + "step": 3739 + }, + { + "epoch": 1.883049779458097, + "grad_norm": 0.18129268141555105, + "learning_rate": 3.12541778720514e-07, + "loss": 0.27, + "step": 3740 + }, + { + "epoch": 1.883553875236295, + "grad_norm": 0.1747582775376678, + "learning_rate": 3.122928366746434e-07, + "loss": 0.263, + "step": 3741 + }, + { + "epoch": 1.8840579710144927, + "grad_norm": 0.17513777293951543, + "learning_rate": 3.1204394878112665e-07, + "loss": 0.2868, + "step": 3742 + }, + { + "epoch": 1.8845620667926906, + "grad_norm": 0.16905872732868277, + "learning_rate": 3.1179511511176646e-07, + "loss": 0.2661, + "step": 3743 + }, + { + "epoch": 1.8850661625708884, + "grad_norm": 0.1711316990404876, + "learning_rate": 3.1154633573834973e-07, + "loss": 0.2722, + "step": 3744 + }, + { + "epoch": 1.8855702583490863, + "grad_norm": 0.1739717737065482, + "learning_rate": 3.112976107326475e-07, + "loss": 0.268, + "step": 3745 + }, + { + "epoch": 1.8860743541272842, + "grad_norm": 0.17944928774305824, + "learning_rate": 3.110489401664158e-07, + "loss": 0.2812, + "step": 3746 + }, + { + "epoch": 1.886578449905482, + "grad_norm": 0.1762596235261297, + "learning_rate": 3.108003241113942e-07, + "loss": 0.2604, + "step": 3747 + }, + { + "epoch": 1.88708254568368, + "grad_norm": 0.17582649188250596, + "learning_rate": 3.1055176263930725e-07, + "loss": 0.27, + "step": 3748 + }, + { + "epoch": 1.8875866414618776, + "grad_norm": 0.18349586560030343, + "learning_rate": 3.10303255821863e-07, + "loss": 0.2731, + "step": 3749 + }, + { + "epoch": 1.8880907372400757, + "grad_norm": 0.18096878339273567, + "learning_rate": 3.100548037307546e-07, + "loss": 0.2817, + "step": 3750 + }, + { + "epoch": 1.8885948330182734, + "grad_norm": 0.17078057550904882, + "learning_rate": 3.0980640643765867e-07, + "loss": 0.2677, + "step": 3751 + }, + { + "epoch": 1.8890989287964715, + "grad_norm": 0.16893384397010494, + "learning_rate": 3.09558064014236e-07, + "loss": 0.2664, + "step": 3752 + }, + { + "epoch": 1.889603024574669, + "grad_norm": 0.18217985296544154, + "learning_rate": 3.093097765321324e-07, + "loss": 0.2688, + "step": 3753 + }, + { + "epoch": 1.8901071203528672, + "grad_norm": 0.17031724602389556, + "learning_rate": 3.0906154406297677e-07, + "loss": 0.2675, + "step": 3754 + }, + { + "epoch": 1.8906112161310649, + "grad_norm": 0.17612084822756136, + "learning_rate": 3.0881336667838313e-07, + "loss": 0.259, + "step": 3755 + }, + { + "epoch": 1.8911153119092627, + "grad_norm": 0.1661742309383176, + "learning_rate": 3.085652444499488e-07, + "loss": 0.2555, + "step": 3756 + }, + { + "epoch": 1.8916194076874606, + "grad_norm": 0.17479975552258276, + "learning_rate": 3.0831717744925556e-07, + "loss": 0.2719, + "step": 3757 + }, + { + "epoch": 1.8921235034656585, + "grad_norm": 0.1723634999690176, + "learning_rate": 3.080691657478691e-07, + "loss": 0.2728, + "step": 3758 + }, + { + "epoch": 1.8926275992438564, + "grad_norm": 0.17468461669614443, + "learning_rate": 3.0782120941733954e-07, + "loss": 0.2765, + "step": 3759 + }, + { + "epoch": 1.893131695022054, + "grad_norm": 0.1710752190489546, + "learning_rate": 3.075733085292006e-07, + "loss": 0.2687, + "step": 3760 + }, + { + "epoch": 1.8936357908002521, + "grad_norm": 0.16637775831353452, + "learning_rate": 3.0732546315496986e-07, + "loss": 0.2613, + "step": 3761 + }, + { + "epoch": 1.8941398865784498, + "grad_norm": 0.18290536724761589, + "learning_rate": 3.070776733661497e-07, + "loss": 0.2847, + "step": 3762 + }, + { + "epoch": 1.8946439823566479, + "grad_norm": 0.17059102780201477, + "learning_rate": 3.068299392342255e-07, + "loss": 0.2651, + "step": 3763 + }, + { + "epoch": 1.8951480781348455, + "grad_norm": 0.17464195983436098, + "learning_rate": 3.065822608306674e-07, + "loss": 0.2901, + "step": 3764 + }, + { + "epoch": 1.8956521739130436, + "grad_norm": 0.16589700319021694, + "learning_rate": 3.063346382269286e-07, + "loss": 0.2665, + "step": 3765 + }, + { + "epoch": 1.8961562696912413, + "grad_norm": 0.17656288621526112, + "learning_rate": 3.060870714944473e-07, + "loss": 0.2642, + "step": 3766 + }, + { + "epoch": 1.8966603654694392, + "grad_norm": 0.18513022931135426, + "learning_rate": 3.058395607046446e-07, + "loss": 0.2714, + "step": 3767 + }, + { + "epoch": 1.897164461247637, + "grad_norm": 0.17250542864882773, + "learning_rate": 3.0559210592892567e-07, + "loss": 0.2758, + "step": 3768 + }, + { + "epoch": 1.897668557025835, + "grad_norm": 0.17243208247681027, + "learning_rate": 3.053447072386801e-07, + "loss": 0.2769, + "step": 3769 + }, + { + "epoch": 1.8981726528040328, + "grad_norm": 0.17667298757324432, + "learning_rate": 3.050973647052805e-07, + "loss": 0.2703, + "step": 3770 + }, + { + "epoch": 1.8986767485822307, + "grad_norm": 0.16775068473994517, + "learning_rate": 3.0485007840008394e-07, + "loss": 0.2497, + "step": 3771 + }, + { + "epoch": 1.8991808443604286, + "grad_norm": 0.17502025512014624, + "learning_rate": 3.046028483944308e-07, + "loss": 0.2672, + "step": 3772 + }, + { + "epoch": 1.8996849401386262, + "grad_norm": 0.17430596589676, + "learning_rate": 3.043556747596456e-07, + "loss": 0.2753, + "step": 3773 + }, + { + "epoch": 1.9001890359168243, + "grad_norm": 0.17212088868648046, + "learning_rate": 3.0410855756703614e-07, + "loss": 0.2712, + "step": 3774 + }, + { + "epoch": 1.900693131695022, + "grad_norm": 0.17745212599865196, + "learning_rate": 3.0386149688789434e-07, + "loss": 0.2847, + "step": 3775 + }, + { + "epoch": 1.90119722747322, + "grad_norm": 0.17153573115828954, + "learning_rate": 3.036144927934958e-07, + "loss": 0.276, + "step": 3776 + }, + { + "epoch": 1.9017013232514177, + "grad_norm": 0.1780316189262351, + "learning_rate": 3.033675453550994e-07, + "loss": 0.2849, + "step": 3777 + }, + { + "epoch": 1.9022054190296156, + "grad_norm": 0.1832700985266359, + "learning_rate": 3.031206546439482e-07, + "loss": 0.2594, + "step": 3778 + }, + { + "epoch": 1.9027095148078135, + "grad_norm": 0.1699394230622764, + "learning_rate": 3.0287382073126837e-07, + "loss": 0.2766, + "step": 3779 + }, + { + "epoch": 1.9032136105860114, + "grad_norm": 0.17706015901322072, + "learning_rate": 3.0262704368827036e-07, + "loss": 0.2752, + "step": 3780 + }, + { + "epoch": 1.9037177063642092, + "grad_norm": 0.1775906710352642, + "learning_rate": 3.0238032358614753e-07, + "loss": 0.2721, + "step": 3781 + }, + { + "epoch": 1.9037177063642092, + "eval_loss": 0.3058184087276459, + "eval_runtime": 17.9595, + "eval_samples_per_second": 47.607, + "eval_steps_per_second": 1.002, + "step": 3781 + }, + { + "epoch": 1.904221802142407, + "grad_norm": 0.18004522794254016, + "learning_rate": 3.0213366049607703e-07, + "loss": 0.2564, + "step": 3782 + }, + { + "epoch": 1.904725897920605, + "grad_norm": 0.17630616450078446, + "learning_rate": 3.0188705448921994e-07, + "loss": 0.2788, + "step": 3783 + }, + { + "epoch": 1.9052299936988026, + "grad_norm": 0.17486538442100466, + "learning_rate": 3.0164050563672004e-07, + "loss": 0.2699, + "step": 3784 + }, + { + "epoch": 1.9057340894770007, + "grad_norm": 0.17611096571524854, + "learning_rate": 3.0139401400970586e-07, + "loss": 0.2754, + "step": 3785 + }, + { + "epoch": 1.9062381852551984, + "grad_norm": 0.1737782779159714, + "learning_rate": 3.0114757967928816e-07, + "loss": 0.2657, + "step": 3786 + }, + { + "epoch": 1.9067422810333965, + "grad_norm": 0.1739385161722817, + "learning_rate": 3.0090120271656194e-07, + "loss": 0.2653, + "step": 3787 + }, + { + "epoch": 1.9072463768115941, + "grad_norm": 0.17747427811726257, + "learning_rate": 3.0065488319260535e-07, + "loss": 0.2728, + "step": 3788 + }, + { + "epoch": 1.907750472589792, + "grad_norm": 0.17131014859330937, + "learning_rate": 3.004086211784802e-07, + "loss": 0.2675, + "step": 3789 + }, + { + "epoch": 1.90825456836799, + "grad_norm": 0.17148523515796457, + "learning_rate": 3.001624167452315e-07, + "loss": 0.2672, + "step": 3790 + }, + { + "epoch": 1.9087586641461878, + "grad_norm": 0.16882516072988113, + "learning_rate": 2.999162699638873e-07, + "loss": 0.2637, + "step": 3791 + }, + { + "epoch": 1.9092627599243857, + "grad_norm": 0.1786046049841935, + "learning_rate": 2.996701809054601e-07, + "loss": 0.2848, + "step": 3792 + }, + { + "epoch": 1.9097668557025835, + "grad_norm": 0.17145262177042117, + "learning_rate": 2.994241496409444e-07, + "loss": 0.2712, + "step": 3793 + }, + { + "epoch": 1.9102709514807814, + "grad_norm": 0.171516448246734, + "learning_rate": 2.991781762413194e-07, + "loss": 0.2847, + "step": 3794 + }, + { + "epoch": 1.910775047258979, + "grad_norm": 0.17763020432437332, + "learning_rate": 2.989322607775462e-07, + "loss": 0.2837, + "step": 3795 + }, + { + "epoch": 1.9112791430371772, + "grad_norm": 0.1746370724053624, + "learning_rate": 2.986864033205704e-07, + "loss": 0.2727, + "step": 3796 + }, + { + "epoch": 1.9117832388153748, + "grad_norm": 0.185899284676683, + "learning_rate": 2.984406039413202e-07, + "loss": 0.2887, + "step": 3797 + }, + { + "epoch": 1.912287334593573, + "grad_norm": 0.16862868846992204, + "learning_rate": 2.98194862710707e-07, + "loss": 0.2645, + "step": 3798 + }, + { + "epoch": 1.9127914303717706, + "grad_norm": 0.17734208433370674, + "learning_rate": 2.9794917969962595e-07, + "loss": 0.2769, + "step": 3799 + }, + { + "epoch": 1.9132955261499685, + "grad_norm": 0.1646715304601581, + "learning_rate": 2.977035549789548e-07, + "loss": 0.2596, + "step": 3800 + }, + { + "epoch": 1.9137996219281663, + "grad_norm": 0.1718225932856344, + "learning_rate": 2.9745798861955497e-07, + "loss": 0.281, + "step": 3801 + }, + { + "epoch": 1.9143037177063642, + "grad_norm": 0.17618806208047236, + "learning_rate": 2.972124806922707e-07, + "loss": 0.2662, + "step": 3802 + }, + { + "epoch": 1.914807813484562, + "grad_norm": 0.1719512615295124, + "learning_rate": 2.9696703126792967e-07, + "loss": 0.2669, + "step": 3803 + }, + { + "epoch": 1.91531190926276, + "grad_norm": 0.18136678946666576, + "learning_rate": 2.967216404173423e-07, + "loss": 0.2602, + "step": 3804 + }, + { + "epoch": 1.9158160050409578, + "grad_norm": 0.1746419414840373, + "learning_rate": 2.9647630821130234e-07, + "loss": 0.2723, + "step": 3805 + }, + { + "epoch": 1.9163201008191555, + "grad_norm": 0.17198386237681593, + "learning_rate": 2.9623103472058685e-07, + "loss": 0.276, + "step": 3806 + }, + { + "epoch": 1.9168241965973536, + "grad_norm": 0.17409294867594544, + "learning_rate": 2.959858200159554e-07, + "loss": 0.2668, + "step": 3807 + }, + { + "epoch": 1.9173282923755512, + "grad_norm": 0.16517572018109172, + "learning_rate": 2.9574066416815123e-07, + "loss": 0.2668, + "step": 3808 + }, + { + "epoch": 1.9178323881537493, + "grad_norm": 0.17327174886971222, + "learning_rate": 2.9549556724789995e-07, + "loss": 0.2654, + "step": 3809 + }, + { + "epoch": 1.918336483931947, + "grad_norm": 0.17123477242224328, + "learning_rate": 2.952505293259108e-07, + "loss": 0.2666, + "step": 3810 + }, + { + "epoch": 1.9188405797101449, + "grad_norm": 0.1711720330959814, + "learning_rate": 2.950055504728757e-07, + "loss": 0.275, + "step": 3811 + }, + { + "epoch": 1.9193446754883428, + "grad_norm": 0.17427388711209385, + "learning_rate": 2.9476063075946915e-07, + "loss": 0.2762, + "step": 3812 + }, + { + "epoch": 1.9198487712665406, + "grad_norm": 0.17268175591003512, + "learning_rate": 2.945157702563494e-07, + "loss": 0.2744, + "step": 3813 + }, + { + "epoch": 1.9203528670447385, + "grad_norm": 0.16809166747283374, + "learning_rate": 2.9427096903415694e-07, + "loss": 0.2778, + "step": 3814 + }, + { + "epoch": 1.9208569628229364, + "grad_norm": 0.17330157587108844, + "learning_rate": 2.940262271635156e-07, + "loss": 0.2768, + "step": 3815 + }, + { + "epoch": 1.9213610586011343, + "grad_norm": 0.1738154543130853, + "learning_rate": 2.9378154471503156e-07, + "loss": 0.2878, + "step": 3816 + }, + { + "epoch": 1.921865154379332, + "grad_norm": 0.18288271259832445, + "learning_rate": 2.9353692175929475e-07, + "loss": 0.2671, + "step": 3817 + }, + { + "epoch": 1.92236925015753, + "grad_norm": 0.18141220460600882, + "learning_rate": 2.9329235836687684e-07, + "loss": 0.2806, + "step": 3818 + }, + { + "epoch": 1.9228733459357277, + "grad_norm": 0.18790688920159956, + "learning_rate": 2.930478546083331e-07, + "loss": 0.2681, + "step": 3819 + }, + { + "epoch": 1.9233774417139258, + "grad_norm": 0.16964152403213711, + "learning_rate": 2.9280341055420133e-07, + "loss": 0.2664, + "step": 3820 + }, + { + "epoch": 1.9238815374921234, + "grad_norm": 0.1671671212800262, + "learning_rate": 2.9255902627500204e-07, + "loss": 0.2641, + "step": 3821 + }, + { + "epoch": 1.9243856332703215, + "grad_norm": 0.17197335612344677, + "learning_rate": 2.923147018412387e-07, + "loss": 0.2696, + "step": 3822 + }, + { + "epoch": 1.9248897290485192, + "grad_norm": 0.17243819294137702, + "learning_rate": 2.920704373233972e-07, + "loss": 0.2754, + "step": 3823 + }, + { + "epoch": 1.925393824826717, + "grad_norm": 0.17043519865324416, + "learning_rate": 2.918262327919466e-07, + "loss": 0.2715, + "step": 3824 + }, + { + "epoch": 1.925897920604915, + "grad_norm": 0.1722409742574077, + "learning_rate": 2.915820883173383e-07, + "loss": 0.2578, + "step": 3825 + }, + { + "epoch": 1.9264020163831128, + "grad_norm": 0.1740557480804691, + "learning_rate": 2.9133800397000627e-07, + "loss": 0.2765, + "step": 3826 + }, + { + "epoch": 1.9269061121613107, + "grad_norm": 0.1680406260361292, + "learning_rate": 2.910939798203677e-07, + "loss": 0.2795, + "step": 3827 + }, + { + "epoch": 1.9274102079395083, + "grad_norm": 0.17149022418835733, + "learning_rate": 2.9085001593882187e-07, + "loss": 0.2604, + "step": 3828 + }, + { + "epoch": 1.9279143037177064, + "grad_norm": 0.17010503345411235, + "learning_rate": 2.9060611239575085e-07, + "loss": 0.2827, + "step": 3829 + }, + { + "epoch": 1.928418399495904, + "grad_norm": 0.1749493958223586, + "learning_rate": 2.9036226926151897e-07, + "loss": 0.2758, + "step": 3830 + }, + { + "epoch": 1.9289224952741022, + "grad_norm": 0.17008445038688108, + "learning_rate": 2.90118486606474e-07, + "loss": 0.2645, + "step": 3831 + }, + { + "epoch": 1.9294265910522999, + "grad_norm": 0.17652489240339891, + "learning_rate": 2.898747645009454e-07, + "loss": 0.2746, + "step": 3832 + }, + { + "epoch": 1.929930686830498, + "grad_norm": 0.22700479625148187, + "learning_rate": 2.896311030152457e-07, + "loss": 0.2753, + "step": 3833 + }, + { + "epoch": 1.9304347826086956, + "grad_norm": 0.17427450545863746, + "learning_rate": 2.8938750221966965e-07, + "loss": 0.2703, + "step": 3834 + }, + { + "epoch": 1.9309388783868935, + "grad_norm": 0.17151048823577203, + "learning_rate": 2.891439621844943e-07, + "loss": 0.2595, + "step": 3835 + }, + { + "epoch": 1.9314429741650914, + "grad_norm": 0.17149156103858243, + "learning_rate": 2.8890048297997985e-07, + "loss": 0.252, + "step": 3836 + }, + { + "epoch": 1.9319470699432892, + "grad_norm": 0.16817818345200922, + "learning_rate": 2.886570646763682e-07, + "loss": 0.2643, + "step": 3837 + }, + { + "epoch": 1.9324511657214871, + "grad_norm": 0.1694609499296146, + "learning_rate": 2.8841370734388444e-07, + "loss": 0.2774, + "step": 3838 + }, + { + "epoch": 1.9329552614996848, + "grad_norm": 0.18663748392100837, + "learning_rate": 2.8817041105273513e-07, + "loss": 0.2794, + "step": 3839 + }, + { + "epoch": 1.9334593572778829, + "grad_norm": 0.17290683655437977, + "learning_rate": 2.8792717587311027e-07, + "loss": 0.2751, + "step": 3840 + }, + { + "epoch": 1.9339634530560805, + "grad_norm": 0.17362487357823306, + "learning_rate": 2.876840018751814e-07, + "loss": 0.263, + "step": 3841 + }, + { + "epoch": 1.9344675488342786, + "grad_norm": 0.18573004563502468, + "learning_rate": 2.8744088912910257e-07, + "loss": 0.2749, + "step": 3842 + }, + { + "epoch": 1.9349716446124763, + "grad_norm": 0.16915731492502192, + "learning_rate": 2.8719783770501074e-07, + "loss": 0.2589, + "step": 3843 + }, + { + "epoch": 1.9354757403906744, + "grad_norm": 0.17885497820565566, + "learning_rate": 2.8695484767302423e-07, + "loss": 0.2829, + "step": 3844 + }, + { + "epoch": 1.935979836168872, + "grad_norm": 0.16662065368985032, + "learning_rate": 2.8671191910324466e-07, + "loss": 0.2675, + "step": 3845 + }, + { + "epoch": 1.93648393194707, + "grad_norm": 0.18153383756126074, + "learning_rate": 2.86469052065755e-07, + "loss": 0.2636, + "step": 3846 + }, + { + "epoch": 1.9369880277252678, + "grad_norm": 0.17318438090821503, + "learning_rate": 2.8622624663062125e-07, + "loss": 0.2942, + "step": 3847 + }, + { + "epoch": 1.9374921235034657, + "grad_norm": 0.1733848907134762, + "learning_rate": 2.859835028678911e-07, + "loss": 0.2782, + "step": 3848 + }, + { + "epoch": 1.9379962192816635, + "grad_norm": 0.16974026130832987, + "learning_rate": 2.8574082084759434e-07, + "loss": 0.2549, + "step": 3849 + }, + { + "epoch": 1.9385003150598614, + "grad_norm": 0.17091600219366612, + "learning_rate": 2.854982006397438e-07, + "loss": 0.2838, + "step": 3850 + }, + { + "epoch": 1.9390044108380593, + "grad_norm": 0.17588856131135513, + "learning_rate": 2.852556423143333e-07, + "loss": 0.2667, + "step": 3851 + }, + { + "epoch": 1.939508506616257, + "grad_norm": 0.16743774165838593, + "learning_rate": 2.8501314594133996e-07, + "loss": 0.2672, + "step": 3852 + }, + { + "epoch": 1.940012602394455, + "grad_norm": 0.17244186575284368, + "learning_rate": 2.8477071159072206e-07, + "loss": 0.2764, + "step": 3853 + }, + { + "epoch": 1.9405166981726527, + "grad_norm": 0.17790819827202037, + "learning_rate": 2.845283393324208e-07, + "loss": 0.2717, + "step": 3854 + }, + { + "epoch": 1.9410207939508508, + "grad_norm": 0.17254105972174483, + "learning_rate": 2.8428602923635894e-07, + "loss": 0.28, + "step": 3855 + }, + { + "epoch": 1.9415248897290485, + "grad_norm": 0.16993890956105945, + "learning_rate": 2.84043781372441e-07, + "loss": 0.2677, + "step": 3856 + }, + { + "epoch": 1.9420289855072463, + "grad_norm": 0.17692085437275623, + "learning_rate": 2.838015958105547e-07, + "loss": 0.2878, + "step": 3857 + }, + { + "epoch": 1.9425330812854442, + "grad_norm": 0.175016522473398, + "learning_rate": 2.8355947262056865e-07, + "loss": 0.2736, + "step": 3858 + }, + { + "epoch": 1.943037177063642, + "grad_norm": 0.1845066084934637, + "learning_rate": 2.833174118723338e-07, + "loss": 0.2712, + "step": 3859 + }, + { + "epoch": 1.94354127284184, + "grad_norm": 0.17292158533010546, + "learning_rate": 2.8307541363568356e-07, + "loss": 0.2648, + "step": 3860 + }, + { + "epoch": 1.9440453686200379, + "grad_norm": 0.17403302200728327, + "learning_rate": 2.8283347798043265e-07, + "loss": 0.2601, + "step": 3861 + }, + { + "epoch": 1.9445494643982357, + "grad_norm": 0.17209410998540525, + "learning_rate": 2.825916049763779e-07, + "loss": 0.2885, + "step": 3862 + }, + { + "epoch": 1.9450535601764334, + "grad_norm": 0.1730295963728701, + "learning_rate": 2.8234979469329856e-07, + "loss": 0.2773, + "step": 3863 + }, + { + "epoch": 1.9455576559546315, + "grad_norm": 0.17409432528563223, + "learning_rate": 2.8210804720095516e-07, + "loss": 0.2762, + "step": 3864 + }, + { + "epoch": 1.9460617517328291, + "grad_norm": 0.18150338712543956, + "learning_rate": 2.818663625690902e-07, + "loss": 0.2729, + "step": 3865 + }, + { + "epoch": 1.9465658475110272, + "grad_norm": 0.17569994960621013, + "learning_rate": 2.8162474086742854e-07, + "loss": 0.2756, + "step": 3866 + }, + { + "epoch": 1.947069943289225, + "grad_norm": 0.17879985777229393, + "learning_rate": 2.813831821656762e-07, + "loss": 0.2919, + "step": 3867 + }, + { + "epoch": 1.9475740390674228, + "grad_norm": 0.17230743106972518, + "learning_rate": 2.811416865335217e-07, + "loss": 0.276, + "step": 3868 + }, + { + "epoch": 1.9480781348456206, + "grad_norm": 0.17984069998116, + "learning_rate": 2.8090025404063477e-07, + "loss": 0.2785, + "step": 3869 + }, + { + "epoch": 1.9485822306238185, + "grad_norm": 0.1744539834333857, + "learning_rate": 2.8065888475666745e-07, + "loss": 0.2884, + "step": 3870 + }, + { + "epoch": 1.9490863264020164, + "grad_norm": 0.18027869943306707, + "learning_rate": 2.80417578751253e-07, + "loss": 0.278, + "step": 3871 + }, + { + "epoch": 1.9495904221802143, + "grad_norm": 0.17865984324678172, + "learning_rate": 2.801763360940068e-07, + "loss": 0.2602, + "step": 3872 + }, + { + "epoch": 1.9500945179584122, + "grad_norm": 0.17671158591958067, + "learning_rate": 2.7993515685452613e-07, + "loss": 0.2779, + "step": 3873 + }, + { + "epoch": 1.9505986137366098, + "grad_norm": 0.1762952439624626, + "learning_rate": 2.796940411023892e-07, + "loss": 0.2593, + "step": 3874 + }, + { + "epoch": 1.951102709514808, + "grad_norm": 0.1823791079352628, + "learning_rate": 2.794529889071569e-07, + "loss": 0.2758, + "step": 3875 + }, + { + "epoch": 1.9516068052930056, + "grad_norm": 0.178230741517296, + "learning_rate": 2.792120003383709e-07, + "loss": 0.2628, + "step": 3876 + }, + { + "epoch": 1.9521109010712037, + "grad_norm": 0.17483875950866737, + "learning_rate": 2.7897107546555525e-07, + "loss": 0.2742, + "step": 3877 + }, + { + "epoch": 1.9526149968494013, + "grad_norm": 0.17465758090677505, + "learning_rate": 2.787302143582152e-07, + "loss": 0.276, + "step": 3878 + }, + { + "epoch": 1.9531190926275992, + "grad_norm": 0.17129898524450557, + "learning_rate": 2.784894170858373e-07, + "loss": 0.2652, + "step": 3879 + }, + { + "epoch": 1.953623188405797, + "grad_norm": 0.1658101810588016, + "learning_rate": 2.782486837178907e-07, + "loss": 0.263, + "step": 3880 + }, + { + "epoch": 1.954127284183995, + "grad_norm": 0.1755462609743805, + "learning_rate": 2.780080143238249e-07, + "loss": 0.2587, + "step": 3881 + }, + { + "epoch": 1.9546313799621928, + "grad_norm": 0.17509315570998626, + "learning_rate": 2.7776740897307203e-07, + "loss": 0.2666, + "step": 3882 + }, + { + "epoch": 1.9551354757403907, + "grad_norm": 0.16889327157699743, + "learning_rate": 2.7752686773504486e-07, + "loss": 0.2704, + "step": 3883 + }, + { + "epoch": 1.9556395715185886, + "grad_norm": 0.17329963412035843, + "learning_rate": 2.7728639067913826e-07, + "loss": 0.2765, + "step": 3884 + }, + { + "epoch": 1.9561436672967862, + "grad_norm": 0.17191012346840492, + "learning_rate": 2.7704597787472825e-07, + "loss": 0.2689, + "step": 3885 + }, + { + "epoch": 1.9566477630749843, + "grad_norm": 0.16839979801264043, + "learning_rate": 2.7680562939117265e-07, + "loss": 0.2665, + "step": 3886 + }, + { + "epoch": 1.957151858853182, + "grad_norm": 0.17446755444392845, + "learning_rate": 2.765653452978103e-07, + "loss": 0.2837, + "step": 3887 + }, + { + "epoch": 1.95765595463138, + "grad_norm": 0.1726644996987495, + "learning_rate": 2.7632512566396185e-07, + "loss": 0.2825, + "step": 3888 + }, + { + "epoch": 1.9581600504095777, + "grad_norm": 0.18240867556012677, + "learning_rate": 2.7608497055892877e-07, + "loss": 0.2799, + "step": 3889 + }, + { + "epoch": 1.9586641461877756, + "grad_norm": 0.1852368217976518, + "learning_rate": 2.758448800519948e-07, + "loss": 0.2774, + "step": 3890 + }, + { + "epoch": 1.9591682419659735, + "grad_norm": 0.17519851532629838, + "learning_rate": 2.756048542124244e-07, + "loss": 0.2664, + "step": 3891 + }, + { + "epoch": 1.9596723377441714, + "grad_norm": 0.17550478596249003, + "learning_rate": 2.7536489310946325e-07, + "loss": 0.2765, + "step": 3892 + }, + { + "epoch": 1.9601764335223693, + "grad_norm": 0.17986773014699392, + "learning_rate": 2.751249968123391e-07, + "loss": 0.2741, + "step": 3893 + }, + { + "epoch": 1.9606805293005671, + "grad_norm": 0.18286596209791772, + "learning_rate": 2.748851653902604e-07, + "loss": 0.274, + "step": 3894 + }, + { + "epoch": 1.961184625078765, + "grad_norm": 0.16845294749892417, + "learning_rate": 2.7464539891241677e-07, + "loss": 0.2706, + "step": 3895 + }, + { + "epoch": 1.9616887208569627, + "grad_norm": 0.1667625234228712, + "learning_rate": 2.744056974479798e-07, + "loss": 0.2689, + "step": 3896 + }, + { + "epoch": 1.9621928166351608, + "grad_norm": 0.17330081052295093, + "learning_rate": 2.741660610661013e-07, + "loss": 0.271, + "step": 3897 + }, + { + "epoch": 1.9626969124133584, + "grad_norm": 0.16872657786163298, + "learning_rate": 2.7392648983591547e-07, + "loss": 0.2622, + "step": 3898 + }, + { + "epoch": 1.9632010081915565, + "grad_norm": 0.17255810013786174, + "learning_rate": 2.736869838265368e-07, + "loss": 0.2772, + "step": 3899 + }, + { + "epoch": 1.9637051039697542, + "grad_norm": 0.18979660370274565, + "learning_rate": 2.7344754310706135e-07, + "loss": 0.2774, + "step": 3900 + }, + { + "epoch": 1.9642091997479523, + "grad_norm": 0.2005994736740754, + "learning_rate": 2.732081677465664e-07, + "loss": 0.2558, + "step": 3901 + }, + { + "epoch": 1.96471329552615, + "grad_norm": 0.16769942763481963, + "learning_rate": 2.7296885781410997e-07, + "loss": 0.2729, + "step": 3902 + }, + { + "epoch": 1.9652173913043478, + "grad_norm": 0.17196075848567396, + "learning_rate": 2.7272961337873184e-07, + "loss": 0.2597, + "step": 3903 + }, + { + "epoch": 1.9657214870825457, + "grad_norm": 0.17574737438377164, + "learning_rate": 2.724904345094522e-07, + "loss": 0.2787, + "step": 3904 + }, + { + "epoch": 1.9662255828607436, + "grad_norm": 0.1801545153423894, + "learning_rate": 2.7225132127527305e-07, + "loss": 0.2665, + "step": 3905 + }, + { + "epoch": 1.9667296786389414, + "grad_norm": 0.18767570216174043, + "learning_rate": 2.720122737451767e-07, + "loss": 0.2963, + "step": 3906 + }, + { + "epoch": 1.967233774417139, + "grad_norm": 0.17392592966729542, + "learning_rate": 2.717732919881273e-07, + "loss": 0.2664, + "step": 3907 + }, + { + "epoch": 1.9677378701953372, + "grad_norm": 0.18336201230753177, + "learning_rate": 2.715343760730693e-07, + "loss": 0.2722, + "step": 3908 + }, + { + "epoch": 1.9682419659735348, + "grad_norm": 0.17088548349889449, + "learning_rate": 2.7129552606892834e-07, + "loss": 0.2686, + "step": 3909 + }, + { + "epoch": 1.968746061751733, + "grad_norm": 0.17989017966945003, + "learning_rate": 2.710567420446116e-07, + "loss": 0.2661, + "step": 3910 + }, + { + "epoch": 1.9692501575299306, + "grad_norm": 0.17148613832259865, + "learning_rate": 2.708180240690063e-07, + "loss": 0.2682, + "step": 3911 + }, + { + "epoch": 1.9697542533081287, + "grad_norm": 0.16861185373135207, + "learning_rate": 2.705793722109816e-07, + "loss": 0.2704, + "step": 3912 + }, + { + "epoch": 1.9702583490863264, + "grad_norm": 0.19696241693330502, + "learning_rate": 2.7034078653938663e-07, + "loss": 0.2718, + "step": 3913 + }, + { + "epoch": 1.9707624448645242, + "grad_norm": 0.18050920428069256, + "learning_rate": 2.7010226712305227e-07, + "loss": 0.2849, + "step": 3914 + }, + { + "epoch": 1.971266540642722, + "grad_norm": 0.17461718966022174, + "learning_rate": 2.698638140307897e-07, + "loss": 0.2709, + "step": 3915 + }, + { + "epoch": 1.97177063642092, + "grad_norm": 0.16987029931279596, + "learning_rate": 2.6962542733139094e-07, + "loss": 0.2749, + "step": 3916 + }, + { + "epoch": 1.9722747321991179, + "grad_norm": 0.1780473567269595, + "learning_rate": 2.6938710709362953e-07, + "loss": 0.2801, + "step": 3917 + }, + { + "epoch": 1.9727788279773157, + "grad_norm": 0.18378204778035598, + "learning_rate": 2.691488533862589e-07, + "loss": 0.2737, + "step": 3918 + }, + { + "epoch": 1.9732829237555136, + "grad_norm": 0.17887945423278645, + "learning_rate": 2.689106662780143e-07, + "loss": 0.2733, + "step": 3919 + }, + { + "epoch": 1.9737870195337113, + "grad_norm": 0.16923234306379617, + "learning_rate": 2.686725458376109e-07, + "loss": 0.2718, + "step": 3920 + }, + { + "epoch": 1.9742911153119094, + "grad_norm": 0.17182101067457767, + "learning_rate": 2.684344921337449e-07, + "loss": 0.272, + "step": 3921 + }, + { + "epoch": 1.974795211090107, + "grad_norm": 0.1811448482903096, + "learning_rate": 2.681965052350935e-07, + "loss": 0.2757, + "step": 3922 + }, + { + "epoch": 1.9752993068683051, + "grad_norm": 0.17076536993145938, + "learning_rate": 2.6795858521031455e-07, + "loss": 0.272, + "step": 3923 + }, + { + "epoch": 1.9758034026465028, + "grad_norm": 0.17273807428844049, + "learning_rate": 2.6772073212804626e-07, + "loss": 0.2623, + "step": 3924 + }, + { + "epoch": 1.9763074984247007, + "grad_norm": 0.170028734836891, + "learning_rate": 2.674829460569077e-07, + "loss": 0.2783, + "step": 3925 + }, + { + "epoch": 1.9768115942028985, + "grad_norm": 0.16806227375301971, + "learning_rate": 2.67245227065499e-07, + "loss": 0.2556, + "step": 3926 + }, + { + "epoch": 1.9773156899810964, + "grad_norm": 0.17164028145315433, + "learning_rate": 2.6700757522240025e-07, + "loss": 0.2745, + "step": 3927 + }, + { + "epoch": 1.9778197857592943, + "grad_norm": 0.16995123735916823, + "learning_rate": 2.66769990596173e-07, + "loss": 0.2746, + "step": 3928 + }, + { + "epoch": 1.9783238815374922, + "grad_norm": 0.17598543808774827, + "learning_rate": 2.6653247325535843e-07, + "loss": 0.2622, + "step": 3929 + }, + { + "epoch": 1.97882797731569, + "grad_norm": 0.17359313560540118, + "learning_rate": 2.662950232684793e-07, + "loss": 0.2776, + "step": 3930 + }, + { + "epoch": 1.9793320730938877, + "grad_norm": 0.167656438622552, + "learning_rate": 2.6605764070403817e-07, + "loss": 0.262, + "step": 3931 + }, + { + "epoch": 1.9798361688720858, + "grad_norm": 0.17411531156307122, + "learning_rate": 2.6582032563051835e-07, + "loss": 0.2746, + "step": 3932 + }, + { + "epoch": 1.9803402646502835, + "grad_norm": 0.17074273937666753, + "learning_rate": 2.6558307811638415e-07, + "loss": 0.2745, + "step": 3933 + }, + { + "epoch": 1.9808443604284816, + "grad_norm": 0.17097062535567104, + "learning_rate": 2.653458982300795e-07, + "loss": 0.2762, + "step": 3934 + }, + { + "epoch": 1.9813484562066792, + "grad_norm": 0.1683002481107667, + "learning_rate": 2.6510878604002984e-07, + "loss": 0.28, + "step": 3935 + }, + { + "epoch": 1.981852551984877, + "grad_norm": 0.17106853535957925, + "learning_rate": 2.648717416146401e-07, + "loss": 0.2673, + "step": 3936 + }, + { + "epoch": 1.982356647763075, + "grad_norm": 0.18084388076626143, + "learning_rate": 2.6463476502229664e-07, + "loss": 0.2863, + "step": 3937 + }, + { + "epoch": 1.9828607435412728, + "grad_norm": 0.16736153224440417, + "learning_rate": 2.643978563313654e-07, + "loss": 0.2522, + "step": 3938 + }, + { + "epoch": 1.9833648393194707, + "grad_norm": 0.1708328607685619, + "learning_rate": 2.64161015610193e-07, + "loss": 0.2713, + "step": 3939 + }, + { + "epoch": 1.9838689350976686, + "grad_norm": 0.16701316011846237, + "learning_rate": 2.639242429271068e-07, + "loss": 0.2684, + "step": 3940 + }, + { + "epoch": 1.9843730308758665, + "grad_norm": 0.19205395390104565, + "learning_rate": 2.6368753835041384e-07, + "loss": 0.2652, + "step": 3941 + }, + { + "epoch": 1.9848771266540641, + "grad_norm": 0.1717816839539935, + "learning_rate": 2.634509019484025e-07, + "loss": 0.277, + "step": 3942 + }, + { + "epoch": 1.9853812224322622, + "grad_norm": 0.17022274259442283, + "learning_rate": 2.6321433378934035e-07, + "loss": 0.2609, + "step": 3943 + }, + { + "epoch": 1.9858853182104599, + "grad_norm": 0.17756035746010013, + "learning_rate": 2.629778339414763e-07, + "loss": 0.2712, + "step": 3944 + }, + { + "epoch": 1.986389413988658, + "grad_norm": 0.17227310821132044, + "learning_rate": 2.627414024730389e-07, + "loss": 0.2636, + "step": 3945 + }, + { + "epoch": 1.9868935097668556, + "grad_norm": 0.1759199181751579, + "learning_rate": 2.625050394522369e-07, + "loss": 0.2692, + "step": 3946 + }, + { + "epoch": 1.9873976055450535, + "grad_norm": 0.17477365439151, + "learning_rate": 2.6226874494726005e-07, + "loss": 0.2719, + "step": 3947 + }, + { + "epoch": 1.9879017013232514, + "grad_norm": 0.17546888078854525, + "learning_rate": 2.620325190262774e-07, + "loss": 0.2693, + "step": 3948 + }, + { + "epoch": 1.9884057971014493, + "grad_norm": 0.17339448401055704, + "learning_rate": 2.6179636175743905e-07, + "loss": 0.2735, + "step": 3949 + }, + { + "epoch": 1.9889098928796471, + "grad_norm": 0.1692734111192285, + "learning_rate": 2.615602732088748e-07, + "loss": 0.2712, + "step": 3950 + }, + { + "epoch": 1.989413988657845, + "grad_norm": 0.1779563761961173, + "learning_rate": 2.6132425344869446e-07, + "loss": 0.2596, + "step": 3951 + }, + { + "epoch": 1.989918084436043, + "grad_norm": 0.17689388931184377, + "learning_rate": 2.610883025449887e-07, + "loss": 0.2756, + "step": 3952 + }, + { + "epoch": 1.9904221802142406, + "grad_norm": 0.18739670056285473, + "learning_rate": 2.6085242056582764e-07, + "loss": 0.2713, + "step": 3953 + }, + { + "epoch": 1.9909262759924387, + "grad_norm": 0.17988497242704043, + "learning_rate": 2.6061660757926163e-07, + "loss": 0.272, + "step": 3954 + }, + { + "epoch": 1.9914303717706363, + "grad_norm": 0.1864336481558806, + "learning_rate": 2.6038086365332155e-07, + "loss": 0.2731, + "step": 3955 + }, + { + "epoch": 1.9919344675488344, + "grad_norm": 0.17348195377594694, + "learning_rate": 2.6014518885601803e-07, + "loss": 0.2747, + "step": 3956 + }, + { + "epoch": 1.992438563327032, + "grad_norm": 0.17263796785386343, + "learning_rate": 2.599095832553415e-07, + "loss": 0.2837, + "step": 3957 + }, + { + "epoch": 1.99294265910523, + "grad_norm": 0.17274927381019026, + "learning_rate": 2.596740469192631e-07, + "loss": 0.2724, + "step": 3958 + }, + { + "epoch": 1.9934467548834278, + "grad_norm": 0.18942508993924725, + "learning_rate": 2.594385799157333e-07, + "loss": 0.2704, + "step": 3959 + }, + { + "epoch": 1.9939508506616257, + "grad_norm": 0.16909085704359703, + "learning_rate": 2.59203182312683e-07, + "loss": 0.2658, + "step": 3960 + }, + { + "epoch": 1.9944549464398236, + "grad_norm": 0.17982713814067064, + "learning_rate": 2.5896785417802313e-07, + "loss": 0.3008, + "step": 3961 + }, + { + "epoch": 1.9949590422180214, + "grad_norm": 0.17137213119070738, + "learning_rate": 2.5873259557964395e-07, + "loss": 0.2798, + "step": 3962 + }, + { + "epoch": 1.9954631379962193, + "grad_norm": 0.1794592545298311, + "learning_rate": 2.584974065854165e-07, + "loss": 0.2686, + "step": 3963 + }, + { + "epoch": 1.995967233774417, + "grad_norm": 0.17623111078697679, + "learning_rate": 2.5826228726319116e-07, + "loss": 0.2792, + "step": 3964 + }, + { + "epoch": 1.996471329552615, + "grad_norm": 0.1704272750332749, + "learning_rate": 2.5802723768079857e-07, + "loss": 0.2706, + "step": 3965 + }, + { + "epoch": 1.9969754253308127, + "grad_norm": 0.17123275269483276, + "learning_rate": 2.5779225790604887e-07, + "loss": 0.28, + "step": 3966 + }, + { + "epoch": 1.9974795211090108, + "grad_norm": 0.170434187962785, + "learning_rate": 2.5755734800673243e-07, + "loss": 0.2765, + "step": 3967 + }, + { + "epoch": 1.9979836168872085, + "grad_norm": 0.1680976681438808, + "learning_rate": 2.573225080506193e-07, + "loss": 0.2617, + "step": 3968 + } + ], + "logging_steps": 1, + "max_steps": 5949, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 992, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 6240331729207296.0, + "train_batch_size": 6, + "trial_name": null, + "trial_params": null +}