diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,5801 @@ +{ + "best_metric": 2.115697145462036, + "best_model_checkpoint": "outputs/checkpoint-615", + "epoch": 1.0, + "eval_steps": 205, + "global_step": 818, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0012224938875305623, + "grad_norm": 2.384822130203247, + "learning_rate": 0.001, + "loss": 5.1789, + "step": 1 + }, + { + "epoch": 0.0024449877750611247, + "grad_norm": 1.9184950590133667, + "learning_rate": 0.001, + "loss": 4.254, + "step": 2 + }, + { + "epoch": 0.003667481662591687, + "grad_norm": 3.020171642303467, + "learning_rate": 0.001, + "loss": 3.3665, + "step": 3 + }, + { + "epoch": 0.004889975550122249, + "grad_norm": 5.194127559661865, + "learning_rate": 0.001, + "loss": 3.067, + "step": 4 + }, + { + "epoch": 0.006112469437652812, + "grad_norm": 12.575337409973145, + "learning_rate": 0.001, + "loss": 3.0469, + "step": 5 + }, + { + "epoch": 0.007334963325183374, + "grad_norm": 3.6407251358032227, + "learning_rate": 0.001, + "loss": 2.7362, + "step": 6 + }, + { + "epoch": 0.008557457212713936, + "grad_norm": 0.9856404662132263, + "learning_rate": 0.001, + "loss": 2.6537, + "step": 7 + }, + { + "epoch": 0.009779951100244499, + "grad_norm": 1.4989922046661377, + "learning_rate": 0.001, + "loss": 2.7092, + "step": 8 + }, + { + "epoch": 0.011002444987775062, + "grad_norm": 0.7976619601249695, + "learning_rate": 0.001, + "loss": 2.49, + "step": 9 + }, + { + "epoch": 0.012224938875305624, + "grad_norm": 1.2533477544784546, + "learning_rate": 0.001, + "loss": 2.531, + "step": 10 + }, + { + "epoch": 0.013447432762836185, + "grad_norm": 3.7765350341796875, + "learning_rate": 0.001, + "loss": 2.6799, + "step": 11 + }, + { + "epoch": 0.014669926650366748, + "grad_norm": 0.6018803119659424, + "learning_rate": 0.001, + "loss": 2.5433, + "step": 12 + }, + { + "epoch": 0.01589242053789731, + "grad_norm": 0.5885470509529114, + "learning_rate": 0.001, + "loss": 2.5155, + "step": 13 + }, + { + "epoch": 0.017114914425427872, + "grad_norm": 0.8284630179405212, + "learning_rate": 0.001, + "loss": 2.4638, + "step": 14 + }, + { + "epoch": 0.018337408312958436, + "grad_norm": 0.7927239537239075, + "learning_rate": 0.001, + "loss": 2.472, + "step": 15 + }, + { + "epoch": 0.019559902200488997, + "grad_norm": 0.5843257904052734, + "learning_rate": 0.001, + "loss": 2.4512, + "step": 16 + }, + { + "epoch": 0.02078239608801956, + "grad_norm": 2.5700955390930176, + "learning_rate": 0.001, + "loss": 2.5298, + "step": 17 + }, + { + "epoch": 0.022004889975550123, + "grad_norm": 3.5902013778686523, + "learning_rate": 0.001, + "loss": 2.4943, + "step": 18 + }, + { + "epoch": 0.023227383863080684, + "grad_norm": 0.5734415054321289, + "learning_rate": 0.001, + "loss": 2.4265, + "step": 19 + }, + { + "epoch": 0.02444987775061125, + "grad_norm": 0.5567678213119507, + "learning_rate": 0.001, + "loss": 2.4346, + "step": 20 + }, + { + "epoch": 0.02567237163814181, + "grad_norm": 8.68786334991455, + "learning_rate": 0.001, + "loss": 2.4593, + "step": 21 + }, + { + "epoch": 0.02689486552567237, + "grad_norm": 0.69892418384552, + "learning_rate": 0.001, + "loss": 2.4801, + "step": 22 + }, + { + "epoch": 0.028117359413202935, + "grad_norm": 0.5525035262107849, + "learning_rate": 0.001, + "loss": 2.5337, + "step": 23 + }, + { + "epoch": 0.029339853300733496, + "grad_norm": 0.4160382151603699, + "learning_rate": 0.001, + "loss": 2.2792, + "step": 24 + }, + { + "epoch": 0.030562347188264057, + "grad_norm": 0.44313889741897583, + "learning_rate": 0.001, + "loss": 2.2945, + "step": 25 + }, + { + "epoch": 0.03178484107579462, + "grad_norm": 19.726951599121094, + "learning_rate": 0.001, + "loss": 2.4251, + "step": 26 + }, + { + "epoch": 0.03300733496332518, + "grad_norm": 1.9587205648422241, + "learning_rate": 0.001, + "loss": 2.4371, + "step": 27 + }, + { + "epoch": 0.034229828850855744, + "grad_norm": 0.6212214231491089, + "learning_rate": 0.001, + "loss": 2.3065, + "step": 28 + }, + { + "epoch": 0.035452322738386305, + "grad_norm": 4.7371978759765625, + "learning_rate": 0.001, + "loss": 2.3882, + "step": 29 + }, + { + "epoch": 0.03667481662591687, + "grad_norm": 2.593173027038574, + "learning_rate": 0.001, + "loss": 2.4037, + "step": 30 + }, + { + "epoch": 0.037897310513447434, + "grad_norm": 0.5192613005638123, + "learning_rate": 0.001, + "loss": 2.375, + "step": 31 + }, + { + "epoch": 0.039119804400977995, + "grad_norm": 0.7379730343818665, + "learning_rate": 0.001, + "loss": 2.3854, + "step": 32 + }, + { + "epoch": 0.040342298288508556, + "grad_norm": 9.484269142150879, + "learning_rate": 0.001, + "loss": 2.418, + "step": 33 + }, + { + "epoch": 0.04156479217603912, + "grad_norm": 0.47596585750579834, + "learning_rate": 0.001, + "loss": 2.2716, + "step": 34 + }, + { + "epoch": 0.042787286063569685, + "grad_norm": 0.4927333891391754, + "learning_rate": 0.001, + "loss": 2.4094, + "step": 35 + }, + { + "epoch": 0.044009779951100246, + "grad_norm": 0.5788190364837646, + "learning_rate": 0.001, + "loss": 2.2373, + "step": 36 + }, + { + "epoch": 0.04523227383863081, + "grad_norm": 0.4687283933162689, + "learning_rate": 0.001, + "loss": 2.2792, + "step": 37 + }, + { + "epoch": 0.04645476772616137, + "grad_norm": 0.5552632212638855, + "learning_rate": 0.001, + "loss": 2.3337, + "step": 38 + }, + { + "epoch": 0.04767726161369193, + "grad_norm": 0.6140040755271912, + "learning_rate": 0.001, + "loss": 2.3333, + "step": 39 + }, + { + "epoch": 0.0488997555012225, + "grad_norm": 0.4478518068790436, + "learning_rate": 0.001, + "loss": 2.2584, + "step": 40 + }, + { + "epoch": 0.05012224938875306, + "grad_norm": 0.5146190524101257, + "learning_rate": 0.001, + "loss": 2.3386, + "step": 41 + }, + { + "epoch": 0.05134474327628362, + "grad_norm": 0.3977064788341522, + "learning_rate": 0.001, + "loss": 2.2329, + "step": 42 + }, + { + "epoch": 0.05256723716381418, + "grad_norm": 0.6183723211288452, + "learning_rate": 0.001, + "loss": 2.4065, + "step": 43 + }, + { + "epoch": 0.05378973105134474, + "grad_norm": 0.4606506824493408, + "learning_rate": 0.001, + "loss": 2.3218, + "step": 44 + }, + { + "epoch": 0.0550122249388753, + "grad_norm": 0.7891462445259094, + "learning_rate": 0.001, + "loss": 2.2771, + "step": 45 + }, + { + "epoch": 0.05623471882640587, + "grad_norm": 7.153961658477783, + "learning_rate": 0.001, + "loss": 2.2543, + "step": 46 + }, + { + "epoch": 0.05745721271393643, + "grad_norm": 0.5428357720375061, + "learning_rate": 0.001, + "loss": 2.341, + "step": 47 + }, + { + "epoch": 0.05867970660146699, + "grad_norm": 0.4498843550682068, + "learning_rate": 0.001, + "loss": 2.3183, + "step": 48 + }, + { + "epoch": 0.05990220048899755, + "grad_norm": 0.41269147396087646, + "learning_rate": 0.001, + "loss": 2.2966, + "step": 49 + }, + { + "epoch": 0.061124694376528114, + "grad_norm": 0.4155029356479645, + "learning_rate": 0.001, + "loss": 2.2618, + "step": 50 + }, + { + "epoch": 0.06234718826405868, + "grad_norm": 0.41635772585868835, + "learning_rate": 0.001, + "loss": 2.3018, + "step": 51 + }, + { + "epoch": 0.06356968215158924, + "grad_norm": 0.3913130760192871, + "learning_rate": 0.001, + "loss": 2.1975, + "step": 52 + }, + { + "epoch": 0.0647921760391198, + "grad_norm": 0.4053911864757538, + "learning_rate": 0.001, + "loss": 2.2506, + "step": 53 + }, + { + "epoch": 0.06601466992665037, + "grad_norm": 0.43186694383621216, + "learning_rate": 0.001, + "loss": 2.235, + "step": 54 + }, + { + "epoch": 0.06723716381418093, + "grad_norm": 0.43836215138435364, + "learning_rate": 0.001, + "loss": 2.3089, + "step": 55 + }, + { + "epoch": 0.06845965770171149, + "grad_norm": 0.3933768570423126, + "learning_rate": 0.001, + "loss": 2.3207, + "step": 56 + }, + { + "epoch": 0.06968215158924206, + "grad_norm": 0.3999539911746979, + "learning_rate": 0.001, + "loss": 2.2643, + "step": 57 + }, + { + "epoch": 0.07090464547677261, + "grad_norm": 0.3799598813056946, + "learning_rate": 0.001, + "loss": 2.2278, + "step": 58 + }, + { + "epoch": 0.07212713936430318, + "grad_norm": 0.42073073983192444, + "learning_rate": 0.001, + "loss": 2.2424, + "step": 59 + }, + { + "epoch": 0.07334963325183375, + "grad_norm": 0.38391533493995667, + "learning_rate": 0.001, + "loss": 2.3866, + "step": 60 + }, + { + "epoch": 0.0745721271393643, + "grad_norm": 0.443452924489975, + "learning_rate": 0.001, + "loss": 2.2264, + "step": 61 + }, + { + "epoch": 0.07579462102689487, + "grad_norm": 0.4261990785598755, + "learning_rate": 0.001, + "loss": 2.3909, + "step": 62 + }, + { + "epoch": 0.07701711491442542, + "grad_norm": 0.35094141960144043, + "learning_rate": 0.001, + "loss": 2.2517, + "step": 63 + }, + { + "epoch": 0.07823960880195599, + "grad_norm": 0.44420382380485535, + "learning_rate": 0.001, + "loss": 2.2818, + "step": 64 + }, + { + "epoch": 0.07946210268948656, + "grad_norm": 0.3850550055503845, + "learning_rate": 0.001, + "loss": 2.2548, + "step": 65 + }, + { + "epoch": 0.08068459657701711, + "grad_norm": 0.39684730768203735, + "learning_rate": 0.001, + "loss": 2.3575, + "step": 66 + }, + { + "epoch": 0.08190709046454768, + "grad_norm": 0.3885946273803711, + "learning_rate": 0.001, + "loss": 2.3153, + "step": 67 + }, + { + "epoch": 0.08312958435207823, + "grad_norm": 0.39877697825431824, + "learning_rate": 0.001, + "loss": 2.3019, + "step": 68 + }, + { + "epoch": 0.0843520782396088, + "grad_norm": 0.38227578997612, + "learning_rate": 0.001, + "loss": 2.3031, + "step": 69 + }, + { + "epoch": 0.08557457212713937, + "grad_norm": 0.41848722100257874, + "learning_rate": 0.001, + "loss": 2.2451, + "step": 70 + }, + { + "epoch": 0.08679706601466992, + "grad_norm": 0.37621256709098816, + "learning_rate": 0.001, + "loss": 2.2327, + "step": 71 + }, + { + "epoch": 0.08801955990220049, + "grad_norm": 0.45229101181030273, + "learning_rate": 0.001, + "loss": 2.2395, + "step": 72 + }, + { + "epoch": 0.08924205378973105, + "grad_norm": 0.3657698631286621, + "learning_rate": 0.001, + "loss": 2.2631, + "step": 73 + }, + { + "epoch": 0.09046454767726161, + "grad_norm": 0.40659746527671814, + "learning_rate": 0.001, + "loss": 2.2561, + "step": 74 + }, + { + "epoch": 0.09168704156479218, + "grad_norm": 0.3982115089893341, + "learning_rate": 0.001, + "loss": 2.2651, + "step": 75 + }, + { + "epoch": 0.09290953545232274, + "grad_norm": 0.37506604194641113, + "learning_rate": 0.001, + "loss": 2.2049, + "step": 76 + }, + { + "epoch": 0.0941320293398533, + "grad_norm": 0.41589951515197754, + "learning_rate": 0.001, + "loss": 2.3502, + "step": 77 + }, + { + "epoch": 0.09535452322738386, + "grad_norm": 0.36884135007858276, + "learning_rate": 0.001, + "loss": 2.2662, + "step": 78 + }, + { + "epoch": 0.09657701711491443, + "grad_norm": 0.4224059283733368, + "learning_rate": 0.001, + "loss": 2.2057, + "step": 79 + }, + { + "epoch": 0.097799511002445, + "grad_norm": 0.5124974846839905, + "learning_rate": 0.001, + "loss": 2.2066, + "step": 80 + }, + { + "epoch": 0.09902200488997555, + "grad_norm": 0.4109271168708801, + "learning_rate": 0.001, + "loss": 2.2588, + "step": 81 + }, + { + "epoch": 0.10024449877750612, + "grad_norm": 0.44107335805892944, + "learning_rate": 0.001, + "loss": 2.2589, + "step": 82 + }, + { + "epoch": 0.10146699266503667, + "grad_norm": 0.4264432191848755, + "learning_rate": 0.001, + "loss": 2.123, + "step": 83 + }, + { + "epoch": 0.10268948655256724, + "grad_norm": 0.457029789686203, + "learning_rate": 0.001, + "loss": 2.3831, + "step": 84 + }, + { + "epoch": 0.1039119804400978, + "grad_norm": 0.397605836391449, + "learning_rate": 0.001, + "loss": 2.1846, + "step": 85 + }, + { + "epoch": 0.10513447432762836, + "grad_norm": 0.40626272559165955, + "learning_rate": 0.001, + "loss": 2.2036, + "step": 86 + }, + { + "epoch": 0.10635696821515893, + "grad_norm": 0.41786232590675354, + "learning_rate": 0.001, + "loss": 2.2301, + "step": 87 + }, + { + "epoch": 0.10757946210268948, + "grad_norm": 0.37623491883277893, + "learning_rate": 0.001, + "loss": 2.1944, + "step": 88 + }, + { + "epoch": 0.10880195599022005, + "grad_norm": 0.36336657404899597, + "learning_rate": 0.001, + "loss": 2.2961, + "step": 89 + }, + { + "epoch": 0.1100244498777506, + "grad_norm": 0.45478343963623047, + "learning_rate": 0.001, + "loss": 2.2176, + "step": 90 + }, + { + "epoch": 0.11124694376528117, + "grad_norm": 0.4099563956260681, + "learning_rate": 0.001, + "loss": 2.2708, + "step": 91 + }, + { + "epoch": 0.11246943765281174, + "grad_norm": 0.37966397404670715, + "learning_rate": 0.001, + "loss": 2.2438, + "step": 92 + }, + { + "epoch": 0.1136919315403423, + "grad_norm": 0.40351352095603943, + "learning_rate": 0.001, + "loss": 2.1437, + "step": 93 + }, + { + "epoch": 0.11491442542787286, + "grad_norm": 0.43552902340888977, + "learning_rate": 0.001, + "loss": 2.2331, + "step": 94 + }, + { + "epoch": 0.11613691931540342, + "grad_norm": 0.36740803718566895, + "learning_rate": 0.001, + "loss": 2.1243, + "step": 95 + }, + { + "epoch": 0.11735941320293398, + "grad_norm": 0.37405940890312195, + "learning_rate": 0.001, + "loss": 2.2194, + "step": 96 + }, + { + "epoch": 0.11858190709046455, + "grad_norm": 0.3814330995082855, + "learning_rate": 0.001, + "loss": 2.3012, + "step": 97 + }, + { + "epoch": 0.1198044009779951, + "grad_norm": 0.3979599177837372, + "learning_rate": 0.001, + "loss": 2.3228, + "step": 98 + }, + { + "epoch": 0.12102689486552567, + "grad_norm": 0.3659515380859375, + "learning_rate": 0.001, + "loss": 2.2247, + "step": 99 + }, + { + "epoch": 0.12224938875305623, + "grad_norm": 0.3673119843006134, + "learning_rate": 0.001, + "loss": 2.2416, + "step": 100 + }, + { + "epoch": 0.1234718826405868, + "grad_norm": 0.39655619859695435, + "learning_rate": 0.001, + "loss": 2.2511, + "step": 101 + }, + { + "epoch": 0.12469437652811736, + "grad_norm": 0.4136020541191101, + "learning_rate": 0.001, + "loss": 2.2613, + "step": 102 + }, + { + "epoch": 0.12591687041564792, + "grad_norm": 0.42136579751968384, + "learning_rate": 0.001, + "loss": 2.1932, + "step": 103 + }, + { + "epoch": 0.1271393643031785, + "grad_norm": 0.4128344655036926, + "learning_rate": 0.001, + "loss": 2.2686, + "step": 104 + }, + { + "epoch": 0.12836185819070906, + "grad_norm": 0.3856717646121979, + "learning_rate": 0.001, + "loss": 2.2565, + "step": 105 + }, + { + "epoch": 0.1295843520782396, + "grad_norm": 0.43530967831611633, + "learning_rate": 0.001, + "loss": 2.2146, + "step": 106 + }, + { + "epoch": 0.13080684596577016, + "grad_norm": 0.4071982204914093, + "learning_rate": 0.001, + "loss": 2.2356, + "step": 107 + }, + { + "epoch": 0.13202933985330073, + "grad_norm": 0.3789448142051697, + "learning_rate": 0.001, + "loss": 2.2783, + "step": 108 + }, + { + "epoch": 0.1332518337408313, + "grad_norm": 0.42017051577568054, + "learning_rate": 0.001, + "loss": 2.1768, + "step": 109 + }, + { + "epoch": 0.13447432762836187, + "grad_norm": 0.43539750576019287, + "learning_rate": 0.001, + "loss": 2.181, + "step": 110 + }, + { + "epoch": 0.1356968215158924, + "grad_norm": 0.40269771218299866, + "learning_rate": 0.001, + "loss": 2.2085, + "step": 111 + }, + { + "epoch": 0.13691931540342298, + "grad_norm": 0.42498067021369934, + "learning_rate": 0.001, + "loss": 2.2269, + "step": 112 + }, + { + "epoch": 0.13814180929095354, + "grad_norm": 0.38461583852767944, + "learning_rate": 0.001, + "loss": 2.2906, + "step": 113 + }, + { + "epoch": 0.1393643031784841, + "grad_norm": 0.4348565638065338, + "learning_rate": 0.001, + "loss": 2.295, + "step": 114 + }, + { + "epoch": 0.14058679706601468, + "grad_norm": 0.4046652019023895, + "learning_rate": 0.001, + "loss": 2.181, + "step": 115 + }, + { + "epoch": 0.14180929095354522, + "grad_norm": 0.38174059987068176, + "learning_rate": 0.001, + "loss": 2.1752, + "step": 116 + }, + { + "epoch": 0.1430317848410758, + "grad_norm": 0.5675603747367859, + "learning_rate": 0.001, + "loss": 2.3263, + "step": 117 + }, + { + "epoch": 0.14425427872860636, + "grad_norm": 0.42327845096588135, + "learning_rate": 0.001, + "loss": 2.2042, + "step": 118 + }, + { + "epoch": 0.14547677261613692, + "grad_norm": 0.5511468052864075, + "learning_rate": 0.001, + "loss": 2.2331, + "step": 119 + }, + { + "epoch": 0.1466992665036675, + "grad_norm": 0.41198548674583435, + "learning_rate": 0.001, + "loss": 2.2646, + "step": 120 + }, + { + "epoch": 0.14792176039119803, + "grad_norm": 0.5606162548065186, + "learning_rate": 0.001, + "loss": 2.303, + "step": 121 + }, + { + "epoch": 0.1491442542787286, + "grad_norm": 0.5389887690544128, + "learning_rate": 0.001, + "loss": 2.2866, + "step": 122 + }, + { + "epoch": 0.15036674816625917, + "grad_norm": 0.47240230441093445, + "learning_rate": 0.001, + "loss": 2.3047, + "step": 123 + }, + { + "epoch": 0.15158924205378974, + "grad_norm": 0.3920021653175354, + "learning_rate": 0.001, + "loss": 2.2462, + "step": 124 + }, + { + "epoch": 0.1528117359413203, + "grad_norm": 0.5402709245681763, + "learning_rate": 0.001, + "loss": 2.2036, + "step": 125 + }, + { + "epoch": 0.15403422982885084, + "grad_norm": 0.40199369192123413, + "learning_rate": 0.001, + "loss": 2.1024, + "step": 126 + }, + { + "epoch": 0.1552567237163814, + "grad_norm": 0.4106731414794922, + "learning_rate": 0.001, + "loss": 2.2969, + "step": 127 + }, + { + "epoch": 0.15647921760391198, + "grad_norm": 0.38309475779533386, + "learning_rate": 0.001, + "loss": 2.3367, + "step": 128 + }, + { + "epoch": 0.15770171149144255, + "grad_norm": 0.3637794852256775, + "learning_rate": 0.001, + "loss": 2.2432, + "step": 129 + }, + { + "epoch": 0.15892420537897312, + "grad_norm": 0.4057571589946747, + "learning_rate": 0.001, + "loss": 2.2451, + "step": 130 + }, + { + "epoch": 0.16014669926650366, + "grad_norm": 0.3911604881286621, + "learning_rate": 0.001, + "loss": 2.2577, + "step": 131 + }, + { + "epoch": 0.16136919315403422, + "grad_norm": 0.4547809660434723, + "learning_rate": 0.001, + "loss": 2.2029, + "step": 132 + }, + { + "epoch": 0.1625916870415648, + "grad_norm": 0.3767680823802948, + "learning_rate": 0.001, + "loss": 2.3067, + "step": 133 + }, + { + "epoch": 0.16381418092909536, + "grad_norm": 0.36867696046829224, + "learning_rate": 0.001, + "loss": 2.1743, + "step": 134 + }, + { + "epoch": 0.16503667481662593, + "grad_norm": 0.3789012134075165, + "learning_rate": 0.001, + "loss": 2.1533, + "step": 135 + }, + { + "epoch": 0.16625916870415647, + "grad_norm": 0.3715672194957733, + "learning_rate": 0.001, + "loss": 2.1754, + "step": 136 + }, + { + "epoch": 0.16748166259168704, + "grad_norm": 0.4021635055541992, + "learning_rate": 0.001, + "loss": 2.16, + "step": 137 + }, + { + "epoch": 0.1687041564792176, + "grad_norm": 0.401578426361084, + "learning_rate": 0.001, + "loss": 2.245, + "step": 138 + }, + { + "epoch": 0.16992665036674817, + "grad_norm": 0.3970203697681427, + "learning_rate": 0.001, + "loss": 2.2107, + "step": 139 + }, + { + "epoch": 0.17114914425427874, + "grad_norm": 0.4019182026386261, + "learning_rate": 0.001, + "loss": 2.2779, + "step": 140 + }, + { + "epoch": 0.17237163814180928, + "grad_norm": 0.42052340507507324, + "learning_rate": 0.001, + "loss": 2.2395, + "step": 141 + }, + { + "epoch": 0.17359413202933985, + "grad_norm": 0.450577050447464, + "learning_rate": 0.001, + "loss": 2.3603, + "step": 142 + }, + { + "epoch": 0.17481662591687042, + "grad_norm": 0.37792298197746277, + "learning_rate": 0.001, + "loss": 2.1671, + "step": 143 + }, + { + "epoch": 0.17603911980440098, + "grad_norm": 0.3733505308628082, + "learning_rate": 0.001, + "loss": 2.1905, + "step": 144 + }, + { + "epoch": 0.17726161369193155, + "grad_norm": 0.376338928937912, + "learning_rate": 0.001, + "loss": 2.1718, + "step": 145 + }, + { + "epoch": 0.1784841075794621, + "grad_norm": 0.3833712637424469, + "learning_rate": 0.001, + "loss": 2.2652, + "step": 146 + }, + { + "epoch": 0.17970660146699266, + "grad_norm": 0.37632736563682556, + "learning_rate": 0.001, + "loss": 2.1476, + "step": 147 + }, + { + "epoch": 0.18092909535452323, + "grad_norm": 0.40533486008644104, + "learning_rate": 0.001, + "loss": 2.1678, + "step": 148 + }, + { + "epoch": 0.1821515892420538, + "grad_norm": 0.4485403597354889, + "learning_rate": 0.001, + "loss": 2.2362, + "step": 149 + }, + { + "epoch": 0.18337408312958436, + "grad_norm": 0.41201671957969666, + "learning_rate": 0.001, + "loss": 2.2703, + "step": 150 + }, + { + "epoch": 0.1845965770171149, + "grad_norm": 0.38323187828063965, + "learning_rate": 0.001, + "loss": 2.2436, + "step": 151 + }, + { + "epoch": 0.18581907090464547, + "grad_norm": 0.4398377537727356, + "learning_rate": 0.001, + "loss": 2.3432, + "step": 152 + }, + { + "epoch": 0.18704156479217604, + "grad_norm": 0.45339325070381165, + "learning_rate": 0.001, + "loss": 2.1949, + "step": 153 + }, + { + "epoch": 0.1882640586797066, + "grad_norm": 0.44392064213752747, + "learning_rate": 0.001, + "loss": 2.0682, + "step": 154 + }, + { + "epoch": 0.18948655256723718, + "grad_norm": 0.4051275849342346, + "learning_rate": 0.001, + "loss": 2.2742, + "step": 155 + }, + { + "epoch": 0.19070904645476772, + "grad_norm": 0.47490188479423523, + "learning_rate": 0.001, + "loss": 2.2003, + "step": 156 + }, + { + "epoch": 0.19193154034229828, + "grad_norm": 0.4116264879703522, + "learning_rate": 0.001, + "loss": 2.264, + "step": 157 + }, + { + "epoch": 0.19315403422982885, + "grad_norm": 0.3978973627090454, + "learning_rate": 0.001, + "loss": 2.1993, + "step": 158 + }, + { + "epoch": 0.19437652811735942, + "grad_norm": 0.4235959053039551, + "learning_rate": 0.001, + "loss": 2.2614, + "step": 159 + }, + { + "epoch": 0.19559902200489, + "grad_norm": 0.4421234428882599, + "learning_rate": 0.001, + "loss": 2.1779, + "step": 160 + }, + { + "epoch": 0.19682151589242053, + "grad_norm": 0.37337085604667664, + "learning_rate": 0.001, + "loss": 2.1856, + "step": 161 + }, + { + "epoch": 0.1980440097799511, + "grad_norm": 0.44785186648368835, + "learning_rate": 0.001, + "loss": 2.251, + "step": 162 + }, + { + "epoch": 0.19926650366748166, + "grad_norm": 0.4330941438674927, + "learning_rate": 0.001, + "loss": 2.1669, + "step": 163 + }, + { + "epoch": 0.20048899755501223, + "grad_norm": 0.38546377420425415, + "learning_rate": 0.001, + "loss": 2.1478, + "step": 164 + }, + { + "epoch": 0.2017114914425428, + "grad_norm": 0.3848446309566498, + "learning_rate": 0.001, + "loss": 2.1803, + "step": 165 + }, + { + "epoch": 0.20293398533007334, + "grad_norm": 0.41835349798202515, + "learning_rate": 0.001, + "loss": 2.2478, + "step": 166 + }, + { + "epoch": 0.2041564792176039, + "grad_norm": 0.3915431797504425, + "learning_rate": 0.001, + "loss": 2.168, + "step": 167 + }, + { + "epoch": 0.20537897310513448, + "grad_norm": 0.40515565872192383, + "learning_rate": 0.001, + "loss": 2.2641, + "step": 168 + }, + { + "epoch": 0.20660146699266504, + "grad_norm": 0.38732442259788513, + "learning_rate": 0.001, + "loss": 2.2896, + "step": 169 + }, + { + "epoch": 0.2078239608801956, + "grad_norm": 0.3959033489227295, + "learning_rate": 0.001, + "loss": 2.1561, + "step": 170 + }, + { + "epoch": 0.20904645476772615, + "grad_norm": 0.43555209040641785, + "learning_rate": 0.001, + "loss": 2.2183, + "step": 171 + }, + { + "epoch": 0.21026894865525672, + "grad_norm": 0.43636706471443176, + "learning_rate": 0.001, + "loss": 2.1785, + "step": 172 + }, + { + "epoch": 0.2114914425427873, + "grad_norm": 0.4059796631336212, + "learning_rate": 0.001, + "loss": 2.1839, + "step": 173 + }, + { + "epoch": 0.21271393643031786, + "grad_norm": 0.4584046006202698, + "learning_rate": 0.001, + "loss": 2.2967, + "step": 174 + }, + { + "epoch": 0.2139364303178484, + "grad_norm": 0.41237303614616394, + "learning_rate": 0.001, + "loss": 2.319, + "step": 175 + }, + { + "epoch": 0.21515892420537897, + "grad_norm": 0.44713032245635986, + "learning_rate": 0.001, + "loss": 2.2066, + "step": 176 + }, + { + "epoch": 0.21638141809290953, + "grad_norm": 0.4673137962818146, + "learning_rate": 0.001, + "loss": 2.1519, + "step": 177 + }, + { + "epoch": 0.2176039119804401, + "grad_norm": 0.47857725620269775, + "learning_rate": 0.001, + "loss": 2.2214, + "step": 178 + }, + { + "epoch": 0.21882640586797067, + "grad_norm": 0.40556231141090393, + "learning_rate": 0.001, + "loss": 2.1443, + "step": 179 + }, + { + "epoch": 0.2200488997555012, + "grad_norm": 0.40253594517707825, + "learning_rate": 0.001, + "loss": 2.209, + "step": 180 + }, + { + "epoch": 0.22127139364303178, + "grad_norm": 0.44094711542129517, + "learning_rate": 0.001, + "loss": 2.142, + "step": 181 + }, + { + "epoch": 0.22249388753056235, + "grad_norm": 0.40753334760665894, + "learning_rate": 0.001, + "loss": 2.1561, + "step": 182 + }, + { + "epoch": 0.2237163814180929, + "grad_norm": 0.49447524547576904, + "learning_rate": 0.001, + "loss": 2.2386, + "step": 183 + }, + { + "epoch": 0.22493887530562348, + "grad_norm": 0.42737525701522827, + "learning_rate": 0.001, + "loss": 2.2404, + "step": 184 + }, + { + "epoch": 0.22616136919315402, + "grad_norm": 0.4718843698501587, + "learning_rate": 0.001, + "loss": 2.179, + "step": 185 + }, + { + "epoch": 0.2273838630806846, + "grad_norm": 0.3781047761440277, + "learning_rate": 0.001, + "loss": 2.2067, + "step": 186 + }, + { + "epoch": 0.22860635696821516, + "grad_norm": 0.5081697702407837, + "learning_rate": 0.001, + "loss": 2.1621, + "step": 187 + }, + { + "epoch": 0.22982885085574573, + "grad_norm": 0.4034336507320404, + "learning_rate": 0.001, + "loss": 2.1191, + "step": 188 + }, + { + "epoch": 0.2310513447432763, + "grad_norm": 0.45380139350891113, + "learning_rate": 0.001, + "loss": 2.167, + "step": 189 + }, + { + "epoch": 0.23227383863080683, + "grad_norm": 0.47250330448150635, + "learning_rate": 0.001, + "loss": 2.2126, + "step": 190 + }, + { + "epoch": 0.2334963325183374, + "grad_norm": 0.43483713269233704, + "learning_rate": 0.001, + "loss": 2.2488, + "step": 191 + }, + { + "epoch": 0.23471882640586797, + "grad_norm": 0.5420922636985779, + "learning_rate": 0.001, + "loss": 2.2842, + "step": 192 + }, + { + "epoch": 0.23594132029339854, + "grad_norm": 0.46761560440063477, + "learning_rate": 0.001, + "loss": 2.2363, + "step": 193 + }, + { + "epoch": 0.2371638141809291, + "grad_norm": 0.43423908948898315, + "learning_rate": 0.001, + "loss": 2.1634, + "step": 194 + }, + { + "epoch": 0.23838630806845965, + "grad_norm": 0.4111950397491455, + "learning_rate": 0.001, + "loss": 2.1722, + "step": 195 + }, + { + "epoch": 0.2396088019559902, + "grad_norm": 0.3922325670719147, + "learning_rate": 0.001, + "loss": 2.1681, + "step": 196 + }, + { + "epoch": 0.24083129584352078, + "grad_norm": 0.45113420486450195, + "learning_rate": 0.001, + "loss": 2.1986, + "step": 197 + }, + { + "epoch": 0.24205378973105135, + "grad_norm": 0.42804262042045593, + "learning_rate": 0.001, + "loss": 2.2062, + "step": 198 + }, + { + "epoch": 0.24327628361858192, + "grad_norm": 0.4039282500743866, + "learning_rate": 0.001, + "loss": 2.2656, + "step": 199 + }, + { + "epoch": 0.24449877750611246, + "grad_norm": 0.4062311351299286, + "learning_rate": 0.001, + "loss": 2.2483, + "step": 200 + }, + { + "epoch": 0.24572127139364303, + "grad_norm": 0.4116600751876831, + "learning_rate": 0.001, + "loss": 2.1896, + "step": 201 + }, + { + "epoch": 0.2469437652811736, + "grad_norm": 0.44408801198005676, + "learning_rate": 0.001, + "loss": 2.1628, + "step": 202 + }, + { + "epoch": 0.24816625916870416, + "grad_norm": 0.4627872109413147, + "learning_rate": 0.001, + "loss": 2.1843, + "step": 203 + }, + { + "epoch": 0.24938875305623473, + "grad_norm": 0.4298408031463623, + "learning_rate": 0.001, + "loss": 2.1324, + "step": 204 + }, + { + "epoch": 0.2506112469437653, + "grad_norm": 0.4020983576774597, + "learning_rate": 0.001, + "loss": 2.2135, + "step": 205 + }, + { + "epoch": 0.2506112469437653, + "eval_loss": 2.182542324066162, + "eval_runtime": 69.0489, + "eval_samples_per_second": 151.559, + "eval_steps_per_second": 18.958, + "step": 205 + }, + { + "epoch": 0.25183374083129584, + "grad_norm": 0.6257454752922058, + "learning_rate": 0.001, + "loss": 2.1383, + "step": 206 + }, + { + "epoch": 0.2530562347188264, + "grad_norm": 0.4335342347621918, + "learning_rate": 0.001, + "loss": 2.2015, + "step": 207 + }, + { + "epoch": 0.254278728606357, + "grad_norm": 0.4160003364086151, + "learning_rate": 0.001, + "loss": 2.2308, + "step": 208 + }, + { + "epoch": 0.2555012224938875, + "grad_norm": 0.40494173765182495, + "learning_rate": 0.001, + "loss": 2.3659, + "step": 209 + }, + { + "epoch": 0.2567237163814181, + "grad_norm": 0.4622797966003418, + "learning_rate": 0.001, + "loss": 2.1818, + "step": 210 + }, + { + "epoch": 0.25794621026894865, + "grad_norm": 0.4066208004951477, + "learning_rate": 0.001, + "loss": 2.2376, + "step": 211 + }, + { + "epoch": 0.2591687041564792, + "grad_norm": 0.5869137048721313, + "learning_rate": 0.001, + "loss": 2.1886, + "step": 212 + }, + { + "epoch": 0.2603911980440098, + "grad_norm": 0.38838991522789, + "learning_rate": 0.001, + "loss": 2.2406, + "step": 213 + }, + { + "epoch": 0.2616136919315403, + "grad_norm": 0.4655460715293884, + "learning_rate": 0.001, + "loss": 2.2619, + "step": 214 + }, + { + "epoch": 0.2628361858190709, + "grad_norm": 0.46997955441474915, + "learning_rate": 0.001, + "loss": 2.1855, + "step": 215 + }, + { + "epoch": 0.26405867970660146, + "grad_norm": 0.4898509979248047, + "learning_rate": 0.001, + "loss": 2.2856, + "step": 216 + }, + { + "epoch": 0.265281173594132, + "grad_norm": 0.4614812731742859, + "learning_rate": 0.001, + "loss": 2.2034, + "step": 217 + }, + { + "epoch": 0.2665036674816626, + "grad_norm": 0.41966143250465393, + "learning_rate": 0.001, + "loss": 2.1964, + "step": 218 + }, + { + "epoch": 0.26772616136919314, + "grad_norm": 0.3985699713230133, + "learning_rate": 0.001, + "loss": 2.15, + "step": 219 + }, + { + "epoch": 0.26894865525672373, + "grad_norm": 0.42294928431510925, + "learning_rate": 0.001, + "loss": 2.2558, + "step": 220 + }, + { + "epoch": 0.2701711491442543, + "grad_norm": 0.4656440019607544, + "learning_rate": 0.001, + "loss": 2.2737, + "step": 221 + }, + { + "epoch": 0.2713936430317848, + "grad_norm": 0.4101804196834564, + "learning_rate": 0.001, + "loss": 2.2278, + "step": 222 + }, + { + "epoch": 0.2726161369193154, + "grad_norm": 0.5069494843482971, + "learning_rate": 0.001, + "loss": 2.1685, + "step": 223 + }, + { + "epoch": 0.27383863080684595, + "grad_norm": 0.42884835600852966, + "learning_rate": 0.001, + "loss": 2.1298, + "step": 224 + }, + { + "epoch": 0.27506112469437655, + "grad_norm": 0.508474588394165, + "learning_rate": 0.001, + "loss": 2.0927, + "step": 225 + }, + { + "epoch": 0.2762836185819071, + "grad_norm": 0.42199471592903137, + "learning_rate": 0.001, + "loss": 2.1884, + "step": 226 + }, + { + "epoch": 0.2775061124694376, + "grad_norm": 0.45699968934059143, + "learning_rate": 0.001, + "loss": 2.3467, + "step": 227 + }, + { + "epoch": 0.2787286063569682, + "grad_norm": 0.45894646644592285, + "learning_rate": 0.001, + "loss": 2.1106, + "step": 228 + }, + { + "epoch": 0.27995110024449876, + "grad_norm": 0.42262643575668335, + "learning_rate": 0.001, + "loss": 2.1146, + "step": 229 + }, + { + "epoch": 0.28117359413202936, + "grad_norm": 0.4845474064350128, + "learning_rate": 0.001, + "loss": 2.2117, + "step": 230 + }, + { + "epoch": 0.2823960880195599, + "grad_norm": 0.42860740423202515, + "learning_rate": 0.001, + "loss": 2.1881, + "step": 231 + }, + { + "epoch": 0.28361858190709044, + "grad_norm": 0.5572071671485901, + "learning_rate": 0.001, + "loss": 2.2525, + "step": 232 + }, + { + "epoch": 0.28484107579462103, + "grad_norm": 0.4548279941082001, + "learning_rate": 0.001, + "loss": 2.1974, + "step": 233 + }, + { + "epoch": 0.2860635696821516, + "grad_norm": 0.4927968382835388, + "learning_rate": 0.001, + "loss": 2.2972, + "step": 234 + }, + { + "epoch": 0.28728606356968217, + "grad_norm": 0.42159003019332886, + "learning_rate": 0.001, + "loss": 2.1951, + "step": 235 + }, + { + "epoch": 0.2885085574572127, + "grad_norm": 0.480201780796051, + "learning_rate": 0.001, + "loss": 2.1974, + "step": 236 + }, + { + "epoch": 0.28973105134474325, + "grad_norm": 0.4612812399864197, + "learning_rate": 0.001, + "loss": 2.2359, + "step": 237 + }, + { + "epoch": 0.29095354523227385, + "grad_norm": 0.411996454000473, + "learning_rate": 0.001, + "loss": 2.1805, + "step": 238 + }, + { + "epoch": 0.2921760391198044, + "grad_norm": 0.4566625952720642, + "learning_rate": 0.001, + "loss": 2.1987, + "step": 239 + }, + { + "epoch": 0.293398533007335, + "grad_norm": 0.41748395562171936, + "learning_rate": 0.001, + "loss": 2.2479, + "step": 240 + }, + { + "epoch": 0.2946210268948655, + "grad_norm": 0.4833139479160309, + "learning_rate": 0.001, + "loss": 2.2413, + "step": 241 + }, + { + "epoch": 0.29584352078239606, + "grad_norm": 0.45390957593917847, + "learning_rate": 0.001, + "loss": 2.2535, + "step": 242 + }, + { + "epoch": 0.29706601466992666, + "grad_norm": 0.49068349599838257, + "learning_rate": 0.001, + "loss": 2.2128, + "step": 243 + }, + { + "epoch": 0.2982885085574572, + "grad_norm": 0.48676741123199463, + "learning_rate": 0.001, + "loss": 2.17, + "step": 244 + }, + { + "epoch": 0.2995110024449878, + "grad_norm": 0.42673414945602417, + "learning_rate": 0.001, + "loss": 2.0798, + "step": 245 + }, + { + "epoch": 0.30073349633251834, + "grad_norm": 0.45610859990119934, + "learning_rate": 0.001, + "loss": 2.2348, + "step": 246 + }, + { + "epoch": 0.3019559902200489, + "grad_norm": 0.48875653743743896, + "learning_rate": 0.001, + "loss": 2.1261, + "step": 247 + }, + { + "epoch": 0.30317848410757947, + "grad_norm": 0.48908236622810364, + "learning_rate": 0.001, + "loss": 2.1355, + "step": 248 + }, + { + "epoch": 0.30440097799511, + "grad_norm": 0.4941488206386566, + "learning_rate": 0.001, + "loss": 2.2139, + "step": 249 + }, + { + "epoch": 0.3056234718826406, + "grad_norm": 0.4283684194087982, + "learning_rate": 0.001, + "loss": 2.1801, + "step": 250 + }, + { + "epoch": 0.30684596577017115, + "grad_norm": 0.4848301112651825, + "learning_rate": 0.001, + "loss": 2.1626, + "step": 251 + }, + { + "epoch": 0.3080684596577017, + "grad_norm": 0.38889309763908386, + "learning_rate": 0.001, + "loss": 2.1595, + "step": 252 + }, + { + "epoch": 0.3092909535452323, + "grad_norm": 0.46130338311195374, + "learning_rate": 0.001, + "loss": 2.1576, + "step": 253 + }, + { + "epoch": 0.3105134474327628, + "grad_norm": 0.45563745498657227, + "learning_rate": 0.001, + "loss": 2.2491, + "step": 254 + }, + { + "epoch": 0.3117359413202934, + "grad_norm": 0.47916093468666077, + "learning_rate": 0.001, + "loss": 2.2009, + "step": 255 + }, + { + "epoch": 0.31295843520782396, + "grad_norm": 0.49458763003349304, + "learning_rate": 0.001, + "loss": 2.2011, + "step": 256 + }, + { + "epoch": 0.3141809290953545, + "grad_norm": 0.4567379653453827, + "learning_rate": 0.001, + "loss": 2.218, + "step": 257 + }, + { + "epoch": 0.3154034229828851, + "grad_norm": 0.5005376935005188, + "learning_rate": 0.001, + "loss": 2.2661, + "step": 258 + }, + { + "epoch": 0.31662591687041564, + "grad_norm": 0.4395696222782135, + "learning_rate": 0.001, + "loss": 2.2497, + "step": 259 + }, + { + "epoch": 0.31784841075794623, + "grad_norm": 0.48259636759757996, + "learning_rate": 0.001, + "loss": 2.2019, + "step": 260 + }, + { + "epoch": 0.31907090464547677, + "grad_norm": 0.3922736942768097, + "learning_rate": 0.001, + "loss": 2.2352, + "step": 261 + }, + { + "epoch": 0.3202933985330073, + "grad_norm": 0.528638482093811, + "learning_rate": 0.001, + "loss": 2.1914, + "step": 262 + }, + { + "epoch": 0.3215158924205379, + "grad_norm": 0.50613933801651, + "learning_rate": 0.001, + "loss": 2.2448, + "step": 263 + }, + { + "epoch": 0.32273838630806845, + "grad_norm": 0.4539433717727661, + "learning_rate": 0.001, + "loss": 2.1952, + "step": 264 + }, + { + "epoch": 0.32396088019559904, + "grad_norm": 0.5500141382217407, + "learning_rate": 0.001, + "loss": 2.2235, + "step": 265 + }, + { + "epoch": 0.3251833740831296, + "grad_norm": 0.4346414804458618, + "learning_rate": 0.001, + "loss": 2.1393, + "step": 266 + }, + { + "epoch": 0.3264058679706601, + "grad_norm": 0.47232580184936523, + "learning_rate": 0.001, + "loss": 2.2068, + "step": 267 + }, + { + "epoch": 0.3276283618581907, + "grad_norm": 0.43433475494384766, + "learning_rate": 0.001, + "loss": 2.2397, + "step": 268 + }, + { + "epoch": 0.32885085574572126, + "grad_norm": 0.44475042819976807, + "learning_rate": 0.001, + "loss": 2.1364, + "step": 269 + }, + { + "epoch": 0.33007334963325186, + "grad_norm": 0.5216724276542664, + "learning_rate": 0.001, + "loss": 2.2284, + "step": 270 + }, + { + "epoch": 0.3312958435207824, + "grad_norm": 0.4575565755367279, + "learning_rate": 0.001, + "loss": 2.2487, + "step": 271 + }, + { + "epoch": 0.33251833740831294, + "grad_norm": 0.46748054027557373, + "learning_rate": 0.001, + "loss": 2.2365, + "step": 272 + }, + { + "epoch": 0.33374083129584353, + "grad_norm": 0.4664430618286133, + "learning_rate": 0.001, + "loss": 2.2382, + "step": 273 + }, + { + "epoch": 0.33496332518337407, + "grad_norm": 0.4394036531448364, + "learning_rate": 0.001, + "loss": 2.1527, + "step": 274 + }, + { + "epoch": 0.33618581907090467, + "grad_norm": 0.40508896112442017, + "learning_rate": 0.001, + "loss": 2.241, + "step": 275 + }, + { + "epoch": 0.3374083129584352, + "grad_norm": 0.4276115298271179, + "learning_rate": 0.001, + "loss": 2.1951, + "step": 276 + }, + { + "epoch": 0.33863080684596575, + "grad_norm": 0.4394541382789612, + "learning_rate": 0.001, + "loss": 2.191, + "step": 277 + }, + { + "epoch": 0.33985330073349634, + "grad_norm": 0.47757041454315186, + "learning_rate": 0.001, + "loss": 2.1546, + "step": 278 + }, + { + "epoch": 0.3410757946210269, + "grad_norm": 0.43888697028160095, + "learning_rate": 0.001, + "loss": 2.1439, + "step": 279 + }, + { + "epoch": 0.3422982885085575, + "grad_norm": 0.43808552622795105, + "learning_rate": 0.001, + "loss": 2.2013, + "step": 280 + }, + { + "epoch": 0.343520782396088, + "grad_norm": 0.441026508808136, + "learning_rate": 0.001, + "loss": 2.2773, + "step": 281 + }, + { + "epoch": 0.34474327628361856, + "grad_norm": 0.7664148807525635, + "learning_rate": 0.001, + "loss": 2.1903, + "step": 282 + }, + { + "epoch": 0.34596577017114916, + "grad_norm": 0.4416791796684265, + "learning_rate": 0.001, + "loss": 2.2962, + "step": 283 + }, + { + "epoch": 0.3471882640586797, + "grad_norm": 0.6133376955986023, + "learning_rate": 0.001, + "loss": 2.1729, + "step": 284 + }, + { + "epoch": 0.3484107579462103, + "grad_norm": 0.446942001581192, + "learning_rate": 0.001, + "loss": 2.201, + "step": 285 + }, + { + "epoch": 0.34963325183374083, + "grad_norm": 0.4628114700317383, + "learning_rate": 0.001, + "loss": 2.1246, + "step": 286 + }, + { + "epoch": 0.3508557457212714, + "grad_norm": 0.5027017593383789, + "learning_rate": 0.001, + "loss": 2.1607, + "step": 287 + }, + { + "epoch": 0.35207823960880197, + "grad_norm": 0.4833141267299652, + "learning_rate": 0.001, + "loss": 2.2449, + "step": 288 + }, + { + "epoch": 0.3533007334963325, + "grad_norm": 0.44004133343696594, + "learning_rate": 0.001, + "loss": 2.2503, + "step": 289 + }, + { + "epoch": 0.3545232273838631, + "grad_norm": 0.44651323556900024, + "learning_rate": 0.001, + "loss": 2.1717, + "step": 290 + }, + { + "epoch": 0.35574572127139364, + "grad_norm": 0.42337921261787415, + "learning_rate": 0.001, + "loss": 2.1461, + "step": 291 + }, + { + "epoch": 0.3569682151589242, + "grad_norm": 0.42791426181793213, + "learning_rate": 0.001, + "loss": 2.1238, + "step": 292 + }, + { + "epoch": 0.3581907090464548, + "grad_norm": 0.46949905157089233, + "learning_rate": 0.001, + "loss": 2.1909, + "step": 293 + }, + { + "epoch": 0.3594132029339853, + "grad_norm": 0.49196457862854004, + "learning_rate": 0.001, + "loss": 2.1772, + "step": 294 + }, + { + "epoch": 0.3606356968215159, + "grad_norm": 0.47412702441215515, + "learning_rate": 0.001, + "loss": 2.225, + "step": 295 + }, + { + "epoch": 0.36185819070904646, + "grad_norm": 0.46963196992874146, + "learning_rate": 0.001, + "loss": 2.2759, + "step": 296 + }, + { + "epoch": 0.363080684596577, + "grad_norm": 0.49510377645492554, + "learning_rate": 0.001, + "loss": 2.2625, + "step": 297 + }, + { + "epoch": 0.3643031784841076, + "grad_norm": 0.4306149482727051, + "learning_rate": 0.001, + "loss": 2.1667, + "step": 298 + }, + { + "epoch": 0.36552567237163813, + "grad_norm": 0.4583509862422943, + "learning_rate": 0.001, + "loss": 2.2367, + "step": 299 + }, + { + "epoch": 0.36674816625916873, + "grad_norm": 0.49827685952186584, + "learning_rate": 0.001, + "loss": 2.2043, + "step": 300 + }, + { + "epoch": 0.36797066014669927, + "grad_norm": 0.5875521898269653, + "learning_rate": 0.001, + "loss": 2.2675, + "step": 301 + }, + { + "epoch": 0.3691931540342298, + "grad_norm": 0.4727851152420044, + "learning_rate": 0.001, + "loss": 2.1613, + "step": 302 + }, + { + "epoch": 0.3704156479217604, + "grad_norm": 0.4806399941444397, + "learning_rate": 0.001, + "loss": 2.1929, + "step": 303 + }, + { + "epoch": 0.37163814180929094, + "grad_norm": 0.711615800857544, + "learning_rate": 0.001, + "loss": 2.2085, + "step": 304 + }, + { + "epoch": 0.37286063569682154, + "grad_norm": 0.5827208757400513, + "learning_rate": 0.001, + "loss": 2.2931, + "step": 305 + }, + { + "epoch": 0.3740831295843521, + "grad_norm": 0.4817729592323303, + "learning_rate": 0.001, + "loss": 2.1518, + "step": 306 + }, + { + "epoch": 0.3753056234718826, + "grad_norm": 0.4607214033603668, + "learning_rate": 0.001, + "loss": 2.2376, + "step": 307 + }, + { + "epoch": 0.3765281173594132, + "grad_norm": 0.5439569354057312, + "learning_rate": 0.001, + "loss": 2.214, + "step": 308 + }, + { + "epoch": 0.37775061124694376, + "grad_norm": 0.553310751914978, + "learning_rate": 0.001, + "loss": 2.1845, + "step": 309 + }, + { + "epoch": 0.37897310513447435, + "grad_norm": 0.5218061208724976, + "learning_rate": 0.001, + "loss": 2.1877, + "step": 310 + }, + { + "epoch": 0.3801955990220049, + "grad_norm": 0.44088688492774963, + "learning_rate": 0.001, + "loss": 2.1382, + "step": 311 + }, + { + "epoch": 0.38141809290953543, + "grad_norm": 0.46488451957702637, + "learning_rate": 0.001, + "loss": 2.2819, + "step": 312 + }, + { + "epoch": 0.38264058679706603, + "grad_norm": 0.46994853019714355, + "learning_rate": 0.001, + "loss": 2.1534, + "step": 313 + }, + { + "epoch": 0.38386308068459657, + "grad_norm": 0.47553324699401855, + "learning_rate": 0.001, + "loss": 2.2419, + "step": 314 + }, + { + "epoch": 0.38508557457212717, + "grad_norm": 0.4419028162956238, + "learning_rate": 0.001, + "loss": 2.0614, + "step": 315 + }, + { + "epoch": 0.3863080684596577, + "grad_norm": 0.482204794883728, + "learning_rate": 0.001, + "loss": 2.1259, + "step": 316 + }, + { + "epoch": 0.38753056234718825, + "grad_norm": 0.46489477157592773, + "learning_rate": 0.001, + "loss": 2.1542, + "step": 317 + }, + { + "epoch": 0.38875305623471884, + "grad_norm": 0.613818883895874, + "learning_rate": 0.001, + "loss": 2.151, + "step": 318 + }, + { + "epoch": 0.3899755501222494, + "grad_norm": 0.48042118549346924, + "learning_rate": 0.001, + "loss": 2.2729, + "step": 319 + }, + { + "epoch": 0.39119804400978, + "grad_norm": 0.5772485733032227, + "learning_rate": 0.001, + "loss": 2.1729, + "step": 320 + }, + { + "epoch": 0.3924205378973105, + "grad_norm": 0.9449200630187988, + "learning_rate": 0.001, + "loss": 2.2178, + "step": 321 + }, + { + "epoch": 0.39364303178484106, + "grad_norm": 0.4307827651500702, + "learning_rate": 0.001, + "loss": 2.1567, + "step": 322 + }, + { + "epoch": 0.39486552567237165, + "grad_norm": 0.5241131782531738, + "learning_rate": 0.001, + "loss": 2.1624, + "step": 323 + }, + { + "epoch": 0.3960880195599022, + "grad_norm": 0.5168140530586243, + "learning_rate": 0.001, + "loss": 2.2424, + "step": 324 + }, + { + "epoch": 0.3973105134474328, + "grad_norm": 0.47847840189933777, + "learning_rate": 0.001, + "loss": 2.1865, + "step": 325 + }, + { + "epoch": 0.39853300733496333, + "grad_norm": 0.559695303440094, + "learning_rate": 0.001, + "loss": 2.1683, + "step": 326 + }, + { + "epoch": 0.39975550122249387, + "grad_norm": 0.48280563950538635, + "learning_rate": 0.001, + "loss": 2.1433, + "step": 327 + }, + { + "epoch": 0.40097799511002447, + "grad_norm": 0.5894402861595154, + "learning_rate": 0.001, + "loss": 2.1939, + "step": 328 + }, + { + "epoch": 0.402200488997555, + "grad_norm": 0.4633193016052246, + "learning_rate": 0.001, + "loss": 2.2026, + "step": 329 + }, + { + "epoch": 0.4034229828850856, + "grad_norm": 0.502029538154602, + "learning_rate": 0.001, + "loss": 2.1984, + "step": 330 + }, + { + "epoch": 0.40464547677261614, + "grad_norm": 0.6570725440979004, + "learning_rate": 0.001, + "loss": 2.2083, + "step": 331 + }, + { + "epoch": 0.4058679706601467, + "grad_norm": 0.459811270236969, + "learning_rate": 0.001, + "loss": 2.2403, + "step": 332 + }, + { + "epoch": 0.4070904645476773, + "grad_norm": 0.6034518480300903, + "learning_rate": 0.001, + "loss": 2.1935, + "step": 333 + }, + { + "epoch": 0.4083129584352078, + "grad_norm": 0.47477465867996216, + "learning_rate": 0.001, + "loss": 2.2074, + "step": 334 + }, + { + "epoch": 0.4095354523227384, + "grad_norm": 0.5079094767570496, + "learning_rate": 0.001, + "loss": 2.2006, + "step": 335 + }, + { + "epoch": 0.41075794621026895, + "grad_norm": 0.5319966077804565, + "learning_rate": 0.001, + "loss": 2.1931, + "step": 336 + }, + { + "epoch": 0.4119804400977995, + "grad_norm": 0.4573460519313812, + "learning_rate": 0.001, + "loss": 2.2233, + "step": 337 + }, + { + "epoch": 0.4132029339853301, + "grad_norm": 0.47283318638801575, + "learning_rate": 0.001, + "loss": 2.1454, + "step": 338 + }, + { + "epoch": 0.41442542787286063, + "grad_norm": 0.43014955520629883, + "learning_rate": 0.001, + "loss": 2.1942, + "step": 339 + }, + { + "epoch": 0.4156479217603912, + "grad_norm": 0.4784499406814575, + "learning_rate": 0.001, + "loss": 2.1523, + "step": 340 + }, + { + "epoch": 0.41687041564792177, + "grad_norm": 0.44560521841049194, + "learning_rate": 0.001, + "loss": 2.1879, + "step": 341 + }, + { + "epoch": 0.4180929095354523, + "grad_norm": 0.42554497718811035, + "learning_rate": 0.001, + "loss": 2.1782, + "step": 342 + }, + { + "epoch": 0.4193154034229829, + "grad_norm": 0.6102154850959778, + "learning_rate": 0.001, + "loss": 2.2327, + "step": 343 + }, + { + "epoch": 0.42053789731051344, + "grad_norm": 0.45047393441200256, + "learning_rate": 0.001, + "loss": 2.2005, + "step": 344 + }, + { + "epoch": 0.421760391198044, + "grad_norm": 0.4865048825740814, + "learning_rate": 0.001, + "loss": 2.3121, + "step": 345 + }, + { + "epoch": 0.4229828850855746, + "grad_norm": 0.4858083426952362, + "learning_rate": 0.001, + "loss": 2.0612, + "step": 346 + }, + { + "epoch": 0.4242053789731051, + "grad_norm": 0.4916556477546692, + "learning_rate": 0.001, + "loss": 2.2065, + "step": 347 + }, + { + "epoch": 0.4254278728606357, + "grad_norm": 0.6283455491065979, + "learning_rate": 0.001, + "loss": 2.2506, + "step": 348 + }, + { + "epoch": 0.42665036674816625, + "grad_norm": 0.6359683275222778, + "learning_rate": 0.001, + "loss": 2.2236, + "step": 349 + }, + { + "epoch": 0.4278728606356968, + "grad_norm": 0.5602818727493286, + "learning_rate": 0.001, + "loss": 2.2154, + "step": 350 + }, + { + "epoch": 0.4290953545232274, + "grad_norm": 0.5634006261825562, + "learning_rate": 0.001, + "loss": 2.0715, + "step": 351 + }, + { + "epoch": 0.43031784841075793, + "grad_norm": 0.4602808654308319, + "learning_rate": 0.001, + "loss": 2.1809, + "step": 352 + }, + { + "epoch": 0.4315403422982885, + "grad_norm": 0.4815332591533661, + "learning_rate": 0.001, + "loss": 2.1637, + "step": 353 + }, + { + "epoch": 0.43276283618581907, + "grad_norm": 0.49894168972969055, + "learning_rate": 0.001, + "loss": 2.2466, + "step": 354 + }, + { + "epoch": 0.4339853300733496, + "grad_norm": 0.4808076322078705, + "learning_rate": 0.001, + "loss": 2.2192, + "step": 355 + }, + { + "epoch": 0.4352078239608802, + "grad_norm": 0.45529651641845703, + "learning_rate": 0.001, + "loss": 2.2279, + "step": 356 + }, + { + "epoch": 0.43643031784841074, + "grad_norm": 0.4804287552833557, + "learning_rate": 0.001, + "loss": 2.1756, + "step": 357 + }, + { + "epoch": 0.43765281173594134, + "grad_norm": 0.5576850175857544, + "learning_rate": 0.001, + "loss": 2.1637, + "step": 358 + }, + { + "epoch": 0.4388753056234719, + "grad_norm": 0.5175532698631287, + "learning_rate": 0.001, + "loss": 2.0891, + "step": 359 + }, + { + "epoch": 0.4400977995110024, + "grad_norm": 0.6010396480560303, + "learning_rate": 0.001, + "loss": 2.2515, + "step": 360 + }, + { + "epoch": 0.441320293398533, + "grad_norm": 0.6106312274932861, + "learning_rate": 0.001, + "loss": 2.1245, + "step": 361 + }, + { + "epoch": 0.44254278728606355, + "grad_norm": 0.5079056620597839, + "learning_rate": 0.001, + "loss": 2.1149, + "step": 362 + }, + { + "epoch": 0.44376528117359415, + "grad_norm": 0.5122853517532349, + "learning_rate": 0.001, + "loss": 2.175, + "step": 363 + }, + { + "epoch": 0.4449877750611247, + "grad_norm": 0.5180224776268005, + "learning_rate": 0.001, + "loss": 2.2279, + "step": 364 + }, + { + "epoch": 0.44621026894865523, + "grad_norm": 0.4952344298362732, + "learning_rate": 0.001, + "loss": 2.1543, + "step": 365 + }, + { + "epoch": 0.4474327628361858, + "grad_norm": 0.5394139885902405, + "learning_rate": 0.001, + "loss": 2.0964, + "step": 366 + }, + { + "epoch": 0.44865525672371637, + "grad_norm": 0.4426160454750061, + "learning_rate": 0.001, + "loss": 2.1959, + "step": 367 + }, + { + "epoch": 0.44987775061124696, + "grad_norm": 0.446934312582016, + "learning_rate": 0.001, + "loss": 2.2799, + "step": 368 + }, + { + "epoch": 0.4511002444987775, + "grad_norm": 0.6091988682746887, + "learning_rate": 0.001, + "loss": 2.2826, + "step": 369 + }, + { + "epoch": 0.45232273838630804, + "grad_norm": 0.6177111268043518, + "learning_rate": 0.001, + "loss": 2.1416, + "step": 370 + }, + { + "epoch": 0.45354523227383864, + "grad_norm": 0.6030283570289612, + "learning_rate": 0.001, + "loss": 2.1534, + "step": 371 + }, + { + "epoch": 0.4547677261613692, + "grad_norm": 0.7201472520828247, + "learning_rate": 0.001, + "loss": 2.1527, + "step": 372 + }, + { + "epoch": 0.4559902200488998, + "grad_norm": 0.5519202947616577, + "learning_rate": 0.001, + "loss": 2.1276, + "step": 373 + }, + { + "epoch": 0.4572127139364303, + "grad_norm": 0.665658712387085, + "learning_rate": 0.001, + "loss": 2.1326, + "step": 374 + }, + { + "epoch": 0.45843520782396086, + "grad_norm": 0.5527313947677612, + "learning_rate": 0.001, + "loss": 2.1636, + "step": 375 + }, + { + "epoch": 0.45965770171149145, + "grad_norm": 0.5481876134872437, + "learning_rate": 0.001, + "loss": 2.0782, + "step": 376 + }, + { + "epoch": 0.460880195599022, + "grad_norm": 0.5887396335601807, + "learning_rate": 0.001, + "loss": 2.1269, + "step": 377 + }, + { + "epoch": 0.4621026894865526, + "grad_norm": 0.47340717911720276, + "learning_rate": 0.001, + "loss": 2.1033, + "step": 378 + }, + { + "epoch": 0.4633251833740831, + "grad_norm": 0.5240494012832642, + "learning_rate": 0.001, + "loss": 2.1931, + "step": 379 + }, + { + "epoch": 0.46454767726161367, + "grad_norm": 0.7044278979301453, + "learning_rate": 0.001, + "loss": 2.2165, + "step": 380 + }, + { + "epoch": 0.46577017114914426, + "grad_norm": 0.4907272458076477, + "learning_rate": 0.001, + "loss": 2.1903, + "step": 381 + }, + { + "epoch": 0.4669926650366748, + "grad_norm": 0.588350236415863, + "learning_rate": 0.001, + "loss": 2.209, + "step": 382 + }, + { + "epoch": 0.4682151589242054, + "grad_norm": 0.43188443779945374, + "learning_rate": 0.001, + "loss": 2.2638, + "step": 383 + }, + { + "epoch": 0.46943765281173594, + "grad_norm": 0.5474991798400879, + "learning_rate": 0.001, + "loss": 2.1538, + "step": 384 + }, + { + "epoch": 0.4706601466992665, + "grad_norm": 0.4939529597759247, + "learning_rate": 0.001, + "loss": 2.1797, + "step": 385 + }, + { + "epoch": 0.4718826405867971, + "grad_norm": 0.46284395456314087, + "learning_rate": 0.001, + "loss": 2.1734, + "step": 386 + }, + { + "epoch": 0.4731051344743276, + "grad_norm": 0.4820462465286255, + "learning_rate": 0.001, + "loss": 2.2029, + "step": 387 + }, + { + "epoch": 0.4743276283618582, + "grad_norm": 0.5413796901702881, + "learning_rate": 0.001, + "loss": 2.2625, + "step": 388 + }, + { + "epoch": 0.47555012224938875, + "grad_norm": 0.5737091898918152, + "learning_rate": 0.001, + "loss": 2.2101, + "step": 389 + }, + { + "epoch": 0.4767726161369193, + "grad_norm": 0.6295710802078247, + "learning_rate": 0.001, + "loss": 2.2646, + "step": 390 + }, + { + "epoch": 0.4779951100244499, + "grad_norm": 0.5470510125160217, + "learning_rate": 0.001, + "loss": 2.2067, + "step": 391 + }, + { + "epoch": 0.4792176039119804, + "grad_norm": 0.7382400631904602, + "learning_rate": 0.001, + "loss": 2.1721, + "step": 392 + }, + { + "epoch": 0.480440097799511, + "grad_norm": 0.5777731537818909, + "learning_rate": 0.001, + "loss": 2.1658, + "step": 393 + }, + { + "epoch": 0.48166259168704156, + "grad_norm": 0.6507386565208435, + "learning_rate": 0.001, + "loss": 2.2107, + "step": 394 + }, + { + "epoch": 0.4828850855745721, + "grad_norm": 0.5742748379707336, + "learning_rate": 0.001, + "loss": 2.2308, + "step": 395 + }, + { + "epoch": 0.4841075794621027, + "grad_norm": 0.4340347349643707, + "learning_rate": 0.001, + "loss": 2.1719, + "step": 396 + }, + { + "epoch": 0.48533007334963324, + "grad_norm": 0.5274593234062195, + "learning_rate": 0.001, + "loss": 2.1833, + "step": 397 + }, + { + "epoch": 0.48655256723716384, + "grad_norm": 0.4945325553417206, + "learning_rate": 0.001, + "loss": 2.2275, + "step": 398 + }, + { + "epoch": 0.4877750611246944, + "grad_norm": 0.5010286569595337, + "learning_rate": 0.001, + "loss": 2.0742, + "step": 399 + }, + { + "epoch": 0.4889975550122249, + "grad_norm": 0.5433893799781799, + "learning_rate": 0.001, + "loss": 2.1805, + "step": 400 + }, + { + "epoch": 0.4902200488997555, + "grad_norm": 0.5283117294311523, + "learning_rate": 0.001, + "loss": 2.1663, + "step": 401 + }, + { + "epoch": 0.49144254278728605, + "grad_norm": 0.5704575181007385, + "learning_rate": 0.001, + "loss": 2.2328, + "step": 402 + }, + { + "epoch": 0.49266503667481665, + "grad_norm": 0.45715004205703735, + "learning_rate": 0.001, + "loss": 2.184, + "step": 403 + }, + { + "epoch": 0.4938875305623472, + "grad_norm": 0.5323057174682617, + "learning_rate": 0.001, + "loss": 2.1889, + "step": 404 + }, + { + "epoch": 0.49511002444987773, + "grad_norm": 0.5370688438415527, + "learning_rate": 0.001, + "loss": 2.1373, + "step": 405 + }, + { + "epoch": 0.4963325183374083, + "grad_norm": 0.45251837372779846, + "learning_rate": 0.001, + "loss": 2.1459, + "step": 406 + }, + { + "epoch": 0.49755501222493886, + "grad_norm": 0.4832480549812317, + "learning_rate": 0.001, + "loss": 2.1073, + "step": 407 + }, + { + "epoch": 0.49877750611246946, + "grad_norm": 0.5315654873847961, + "learning_rate": 0.001, + "loss": 2.16, + "step": 408 + }, + { + "epoch": 0.5, + "grad_norm": 0.5699260830879211, + "learning_rate": 0.001, + "loss": 2.2989, + "step": 409 + }, + { + "epoch": 0.5012224938875306, + "grad_norm": 0.49974632263183594, + "learning_rate": 0.001, + "loss": 2.1772, + "step": 410 + }, + { + "epoch": 0.5012224938875306, + "eval_loss": 2.1469788551330566, + "eval_runtime": 69.1494, + "eval_samples_per_second": 151.339, + "eval_steps_per_second": 18.93, + "step": 410 + }, + { + "epoch": 0.5024449877750611, + "grad_norm": 0.5740631818771362, + "learning_rate": 0.001, + "loss": 2.1783, + "step": 411 + }, + { + "epoch": 0.5036674816625917, + "grad_norm": 0.5050974488258362, + "learning_rate": 0.001, + "loss": 2.1696, + "step": 412 + }, + { + "epoch": 0.5048899755501223, + "grad_norm": 0.6442350149154663, + "learning_rate": 0.001, + "loss": 2.1755, + "step": 413 + }, + { + "epoch": 0.5061124694376528, + "grad_norm": 0.5595468282699585, + "learning_rate": 0.001, + "loss": 2.2138, + "step": 414 + }, + { + "epoch": 0.5073349633251834, + "grad_norm": 0.5576492547988892, + "learning_rate": 0.001, + "loss": 2.1927, + "step": 415 + }, + { + "epoch": 0.508557457212714, + "grad_norm": 1.1910855770111084, + "learning_rate": 0.001, + "loss": 2.1443, + "step": 416 + }, + { + "epoch": 0.5097799511002445, + "grad_norm": 0.5421645045280457, + "learning_rate": 0.001, + "loss": 2.1744, + "step": 417 + }, + { + "epoch": 0.511002444987775, + "grad_norm": 0.6474606394767761, + "learning_rate": 0.001, + "loss": 2.2957, + "step": 418 + }, + { + "epoch": 0.5122249388753056, + "grad_norm": 0.5093860626220703, + "learning_rate": 0.001, + "loss": 2.2137, + "step": 419 + }, + { + "epoch": 0.5134474327628362, + "grad_norm": 0.6543591022491455, + "learning_rate": 0.001, + "loss": 2.2671, + "step": 420 + }, + { + "epoch": 0.5146699266503667, + "grad_norm": 0.4889623522758484, + "learning_rate": 0.001, + "loss": 2.1874, + "step": 421 + }, + { + "epoch": 0.5158924205378973, + "grad_norm": 0.5349574089050293, + "learning_rate": 0.001, + "loss": 2.1826, + "step": 422 + }, + { + "epoch": 0.5171149144254279, + "grad_norm": 0.5584796071052551, + "learning_rate": 0.001, + "loss": 2.2241, + "step": 423 + }, + { + "epoch": 0.5183374083129584, + "grad_norm": 0.49419817328453064, + "learning_rate": 0.001, + "loss": 2.2039, + "step": 424 + }, + { + "epoch": 0.519559902200489, + "grad_norm": 0.6226582527160645, + "learning_rate": 0.001, + "loss": 2.1601, + "step": 425 + }, + { + "epoch": 0.5207823960880196, + "grad_norm": 0.4907011091709137, + "learning_rate": 0.001, + "loss": 2.1829, + "step": 426 + }, + { + "epoch": 0.5220048899755502, + "grad_norm": 0.6074104309082031, + "learning_rate": 0.001, + "loss": 2.1575, + "step": 427 + }, + { + "epoch": 0.5232273838630807, + "grad_norm": 0.5580583810806274, + "learning_rate": 0.001, + "loss": 2.1722, + "step": 428 + }, + { + "epoch": 0.5244498777506112, + "grad_norm": 0.5270822644233704, + "learning_rate": 0.001, + "loss": 2.2356, + "step": 429 + }, + { + "epoch": 0.5256723716381418, + "grad_norm": 0.5240160226821899, + "learning_rate": 0.001, + "loss": 2.187, + "step": 430 + }, + { + "epoch": 0.5268948655256723, + "grad_norm": 0.6706748604774475, + "learning_rate": 0.001, + "loss": 2.2156, + "step": 431 + }, + { + "epoch": 0.5281173594132029, + "grad_norm": 0.5267993807792664, + "learning_rate": 0.001, + "loss": 2.205, + "step": 432 + }, + { + "epoch": 0.5293398533007335, + "grad_norm": 0.6394602656364441, + "learning_rate": 0.001, + "loss": 2.203, + "step": 433 + }, + { + "epoch": 0.530562347188264, + "grad_norm": 0.544459879398346, + "learning_rate": 0.001, + "loss": 2.3439, + "step": 434 + }, + { + "epoch": 0.5317848410757946, + "grad_norm": 0.5763698220252991, + "learning_rate": 0.001, + "loss": 2.1619, + "step": 435 + }, + { + "epoch": 0.5330073349633252, + "grad_norm": 0.539488673210144, + "learning_rate": 0.001, + "loss": 2.2853, + "step": 436 + }, + { + "epoch": 0.5342298288508558, + "grad_norm": 0.5510901212692261, + "learning_rate": 0.001, + "loss": 2.2645, + "step": 437 + }, + { + "epoch": 0.5354523227383863, + "grad_norm": 0.5045850276947021, + "learning_rate": 0.001, + "loss": 2.2893, + "step": 438 + }, + { + "epoch": 0.5366748166259169, + "grad_norm": 0.48548564314842224, + "learning_rate": 0.001, + "loss": 2.3173, + "step": 439 + }, + { + "epoch": 0.5378973105134475, + "grad_norm": 0.5125932693481445, + "learning_rate": 0.001, + "loss": 2.2716, + "step": 440 + }, + { + "epoch": 0.539119804400978, + "grad_norm": 0.5173934102058411, + "learning_rate": 0.001, + "loss": 2.1719, + "step": 441 + }, + { + "epoch": 0.5403422982885085, + "grad_norm": 0.546996533870697, + "learning_rate": 0.001, + "loss": 2.3323, + "step": 442 + }, + { + "epoch": 0.5415647921760391, + "grad_norm": 0.48216158151626587, + "learning_rate": 0.001, + "loss": 2.1881, + "step": 443 + }, + { + "epoch": 0.5427872860635696, + "grad_norm": 0.44798579812049866, + "learning_rate": 0.001, + "loss": 2.1785, + "step": 444 + }, + { + "epoch": 0.5440097799511002, + "grad_norm": 0.4922938346862793, + "learning_rate": 0.001, + "loss": 2.1886, + "step": 445 + }, + { + "epoch": 0.5452322738386308, + "grad_norm": 0.564640998840332, + "learning_rate": 0.001, + "loss": 2.1519, + "step": 446 + }, + { + "epoch": 0.5464547677261614, + "grad_norm": 0.5763362050056458, + "learning_rate": 0.001, + "loss": 2.2067, + "step": 447 + }, + { + "epoch": 0.5476772616136919, + "grad_norm": 0.5354210138320923, + "learning_rate": 0.001, + "loss": 2.2425, + "step": 448 + }, + { + "epoch": 0.5488997555012225, + "grad_norm": 0.6192667484283447, + "learning_rate": 0.001, + "loss": 2.2426, + "step": 449 + }, + { + "epoch": 0.5501222493887531, + "grad_norm": 0.4782339632511139, + "learning_rate": 0.001, + "loss": 2.1758, + "step": 450 + }, + { + "epoch": 0.5513447432762836, + "grad_norm": 0.5111038684844971, + "learning_rate": 0.001, + "loss": 2.1927, + "step": 451 + }, + { + "epoch": 0.5525672371638142, + "grad_norm": 0.5513277649879456, + "learning_rate": 0.001, + "loss": 2.2836, + "step": 452 + }, + { + "epoch": 0.5537897310513448, + "grad_norm": 0.6026435494422913, + "learning_rate": 0.001, + "loss": 2.2628, + "step": 453 + }, + { + "epoch": 0.5550122249388753, + "grad_norm": 0.6273881793022156, + "learning_rate": 0.001, + "loss": 2.1867, + "step": 454 + }, + { + "epoch": 0.5562347188264058, + "grad_norm": 0.5088536143302917, + "learning_rate": 0.001, + "loss": 2.1296, + "step": 455 + }, + { + "epoch": 0.5574572127139364, + "grad_norm": 0.5127514600753784, + "learning_rate": 0.001, + "loss": 2.2092, + "step": 456 + }, + { + "epoch": 0.558679706601467, + "grad_norm": 0.6852807402610779, + "learning_rate": 0.001, + "loss": 2.2453, + "step": 457 + }, + { + "epoch": 0.5599022004889975, + "grad_norm": 0.5020545125007629, + "learning_rate": 0.001, + "loss": 2.2399, + "step": 458 + }, + { + "epoch": 0.5611246943765281, + "grad_norm": 0.5110552906990051, + "learning_rate": 0.001, + "loss": 2.1945, + "step": 459 + }, + { + "epoch": 0.5623471882640587, + "grad_norm": 0.6412564516067505, + "learning_rate": 0.001, + "loss": 2.1798, + "step": 460 + }, + { + "epoch": 0.5635696821515892, + "grad_norm": 0.4610752463340759, + "learning_rate": 0.001, + "loss": 2.1207, + "step": 461 + }, + { + "epoch": 0.5647921760391198, + "grad_norm": 0.6350216865539551, + "learning_rate": 0.001, + "loss": 2.3013, + "step": 462 + }, + { + "epoch": 0.5660146699266504, + "grad_norm": 0.5525195002555847, + "learning_rate": 0.001, + "loss": 2.1299, + "step": 463 + }, + { + "epoch": 0.5672371638141809, + "grad_norm": 0.5612464547157288, + "learning_rate": 0.001, + "loss": 2.2332, + "step": 464 + }, + { + "epoch": 0.5684596577017115, + "grad_norm": 0.5840879678726196, + "learning_rate": 0.001, + "loss": 2.259, + "step": 465 + }, + { + "epoch": 0.5696821515892421, + "grad_norm": 0.558628261089325, + "learning_rate": 0.001, + "loss": 2.1456, + "step": 466 + }, + { + "epoch": 0.5709046454767727, + "grad_norm": 0.5693490505218506, + "learning_rate": 0.001, + "loss": 2.195, + "step": 467 + }, + { + "epoch": 0.5721271393643031, + "grad_norm": 0.48814135789871216, + "learning_rate": 0.001, + "loss": 2.1886, + "step": 468 + }, + { + "epoch": 0.5733496332518337, + "grad_norm": 0.528095006942749, + "learning_rate": 0.001, + "loss": 2.2137, + "step": 469 + }, + { + "epoch": 0.5745721271393643, + "grad_norm": 0.5991528034210205, + "learning_rate": 0.001, + "loss": 2.1803, + "step": 470 + }, + { + "epoch": 0.5757946210268948, + "grad_norm": 0.5528990626335144, + "learning_rate": 0.001, + "loss": 2.1444, + "step": 471 + }, + { + "epoch": 0.5770171149144254, + "grad_norm": 0.5375529527664185, + "learning_rate": 0.001, + "loss": 2.1947, + "step": 472 + }, + { + "epoch": 0.578239608801956, + "grad_norm": 0.47309327125549316, + "learning_rate": 0.001, + "loss": 2.1808, + "step": 473 + }, + { + "epoch": 0.5794621026894865, + "grad_norm": 0.49250733852386475, + "learning_rate": 0.001, + "loss": 2.2268, + "step": 474 + }, + { + "epoch": 0.5806845965770171, + "grad_norm": 0.5195334553718567, + "learning_rate": 0.001, + "loss": 2.1929, + "step": 475 + }, + { + "epoch": 0.5819070904645477, + "grad_norm": 0.48456084728240967, + "learning_rate": 0.001, + "loss": 2.1217, + "step": 476 + }, + { + "epoch": 0.5831295843520783, + "grad_norm": 0.534269392490387, + "learning_rate": 0.001, + "loss": 2.1721, + "step": 477 + }, + { + "epoch": 0.5843520782396088, + "grad_norm": 0.5373458862304688, + "learning_rate": 0.001, + "loss": 2.3162, + "step": 478 + }, + { + "epoch": 0.5855745721271394, + "grad_norm": 0.5691243410110474, + "learning_rate": 0.001, + "loss": 2.215, + "step": 479 + }, + { + "epoch": 0.58679706601467, + "grad_norm": 0.5426737666130066, + "learning_rate": 0.001, + "loss": 2.1483, + "step": 480 + }, + { + "epoch": 0.5880195599022005, + "grad_norm": 0.5185466408729553, + "learning_rate": 0.001, + "loss": 2.1977, + "step": 481 + }, + { + "epoch": 0.589242053789731, + "grad_norm": 0.49309492111206055, + "learning_rate": 0.001, + "loss": 2.1648, + "step": 482 + }, + { + "epoch": 0.5904645476772616, + "grad_norm": 0.4933290183544159, + "learning_rate": 0.001, + "loss": 2.2618, + "step": 483 + }, + { + "epoch": 0.5916870415647921, + "grad_norm": 0.44854751229286194, + "learning_rate": 0.001, + "loss": 2.1669, + "step": 484 + }, + { + "epoch": 0.5929095354523227, + "grad_norm": 0.580627977848053, + "learning_rate": 0.001, + "loss": 2.212, + "step": 485 + }, + { + "epoch": 0.5941320293398533, + "grad_norm": 0.6000897884368896, + "learning_rate": 0.001, + "loss": 2.1923, + "step": 486 + }, + { + "epoch": 0.5953545232273839, + "grad_norm": 0.5180191397666931, + "learning_rate": 0.001, + "loss": 2.1981, + "step": 487 + }, + { + "epoch": 0.5965770171149144, + "grad_norm": 0.6499658823013306, + "learning_rate": 0.001, + "loss": 2.1392, + "step": 488 + }, + { + "epoch": 0.597799511002445, + "grad_norm": 0.5067393183708191, + "learning_rate": 0.001, + "loss": 2.2073, + "step": 489 + }, + { + "epoch": 0.5990220048899756, + "grad_norm": 0.5518601536750793, + "learning_rate": 0.001, + "loss": 2.1892, + "step": 490 + }, + { + "epoch": 0.6002444987775061, + "grad_norm": 0.5741767287254333, + "learning_rate": 0.001, + "loss": 2.2325, + "step": 491 + }, + { + "epoch": 0.6014669926650367, + "grad_norm": 0.47276920080184937, + "learning_rate": 0.001, + "loss": 2.1709, + "step": 492 + }, + { + "epoch": 0.6026894865525673, + "grad_norm": 0.6871250867843628, + "learning_rate": 0.001, + "loss": 2.2172, + "step": 493 + }, + { + "epoch": 0.6039119804400978, + "grad_norm": 0.6001262664794922, + "learning_rate": 0.001, + "loss": 2.1843, + "step": 494 + }, + { + "epoch": 0.6051344743276283, + "grad_norm": 0.6246949434280396, + "learning_rate": 0.001, + "loss": 2.2578, + "step": 495 + }, + { + "epoch": 0.6063569682151589, + "grad_norm": 0.599838137626648, + "learning_rate": 0.001, + "loss": 2.2204, + "step": 496 + }, + { + "epoch": 0.6075794621026895, + "grad_norm": 0.6918209195137024, + "learning_rate": 0.001, + "loss": 2.2305, + "step": 497 + }, + { + "epoch": 0.60880195599022, + "grad_norm": 0.5067518949508667, + "learning_rate": 0.001, + "loss": 2.2096, + "step": 498 + }, + { + "epoch": 0.6100244498777506, + "grad_norm": 0.6585003733634949, + "learning_rate": 0.001, + "loss": 2.1348, + "step": 499 + }, + { + "epoch": 0.6112469437652812, + "grad_norm": 0.5794422626495361, + "learning_rate": 0.001, + "loss": 2.1785, + "step": 500 + }, + { + "epoch": 0.6124694376528117, + "grad_norm": 0.5732021927833557, + "learning_rate": 0.001, + "loss": 2.1643, + "step": 501 + }, + { + "epoch": 0.6136919315403423, + "grad_norm": 0.5366373062133789, + "learning_rate": 0.001, + "loss": 2.1448, + "step": 502 + }, + { + "epoch": 0.6149144254278729, + "grad_norm": 0.5503267049789429, + "learning_rate": 0.001, + "loss": 2.1848, + "step": 503 + }, + { + "epoch": 0.6161369193154034, + "grad_norm": 0.5450462102890015, + "learning_rate": 0.001, + "loss": 2.1748, + "step": 504 + }, + { + "epoch": 0.617359413202934, + "grad_norm": 0.5018944144248962, + "learning_rate": 0.001, + "loss": 2.2055, + "step": 505 + }, + { + "epoch": 0.6185819070904646, + "grad_norm": 0.6032975316047668, + "learning_rate": 0.001, + "loss": 2.1492, + "step": 506 + }, + { + "epoch": 0.6198044009779952, + "grad_norm": 0.5328817367553711, + "learning_rate": 0.001, + "loss": 2.2036, + "step": 507 + }, + { + "epoch": 0.6210268948655256, + "grad_norm": 0.5828500986099243, + "learning_rate": 0.001, + "loss": 2.1992, + "step": 508 + }, + { + "epoch": 0.6222493887530562, + "grad_norm": 0.5160496234893799, + "learning_rate": 0.001, + "loss": 2.1988, + "step": 509 + }, + { + "epoch": 0.6234718826405868, + "grad_norm": 0.6087934374809265, + "learning_rate": 0.001, + "loss": 2.1834, + "step": 510 + }, + { + "epoch": 0.6246943765281173, + "grad_norm": 0.5729974508285522, + "learning_rate": 0.001, + "loss": 2.1361, + "step": 511 + }, + { + "epoch": 0.6259168704156479, + "grad_norm": 0.5637314319610596, + "learning_rate": 0.001, + "loss": 2.2719, + "step": 512 + }, + { + "epoch": 0.6271393643031785, + "grad_norm": 0.7113996744155884, + "learning_rate": 0.001, + "loss": 2.2268, + "step": 513 + }, + { + "epoch": 0.628361858190709, + "grad_norm": 0.5220823884010315, + "learning_rate": 0.001, + "loss": 2.2301, + "step": 514 + }, + { + "epoch": 0.6295843520782396, + "grad_norm": 0.5641618371009827, + "learning_rate": 0.001, + "loss": 2.1943, + "step": 515 + }, + { + "epoch": 0.6308068459657702, + "grad_norm": 0.587079644203186, + "learning_rate": 0.001, + "loss": 2.1861, + "step": 516 + }, + { + "epoch": 0.6320293398533008, + "grad_norm": 0.7265164256095886, + "learning_rate": 0.001, + "loss": 2.1136, + "step": 517 + }, + { + "epoch": 0.6332518337408313, + "grad_norm": 0.5387188196182251, + "learning_rate": 0.001, + "loss": 2.1507, + "step": 518 + }, + { + "epoch": 0.6344743276283619, + "grad_norm": 0.7700957655906677, + "learning_rate": 0.001, + "loss": 2.2727, + "step": 519 + }, + { + "epoch": 0.6356968215158925, + "grad_norm": 0.526703953742981, + "learning_rate": 0.001, + "loss": 2.1255, + "step": 520 + }, + { + "epoch": 0.636919315403423, + "grad_norm": 0.5161398649215698, + "learning_rate": 0.001, + "loss": 2.1244, + "step": 521 + }, + { + "epoch": 0.6381418092909535, + "grad_norm": 0.5797749757766724, + "learning_rate": 0.001, + "loss": 2.2473, + "step": 522 + }, + { + "epoch": 0.6393643031784841, + "grad_norm": 0.5566680431365967, + "learning_rate": 0.001, + "loss": 2.1835, + "step": 523 + }, + { + "epoch": 0.6405867970660146, + "grad_norm": 0.5569880604743958, + "learning_rate": 0.001, + "loss": 2.155, + "step": 524 + }, + { + "epoch": 0.6418092909535452, + "grad_norm": 0.5413252711296082, + "learning_rate": 0.001, + "loss": 2.2778, + "step": 525 + }, + { + "epoch": 0.6430317848410758, + "grad_norm": 0.4530647397041321, + "learning_rate": 0.001, + "loss": 2.1888, + "step": 526 + }, + { + "epoch": 0.6442542787286064, + "grad_norm": 0.5370563864707947, + "learning_rate": 0.001, + "loss": 2.242, + "step": 527 + }, + { + "epoch": 0.6454767726161369, + "grad_norm": 0.5274892449378967, + "learning_rate": 0.001, + "loss": 2.2493, + "step": 528 + }, + { + "epoch": 0.6466992665036675, + "grad_norm": 0.5889971852302551, + "learning_rate": 0.001, + "loss": 2.1055, + "step": 529 + }, + { + "epoch": 0.6479217603911981, + "grad_norm": 0.5334299206733704, + "learning_rate": 0.001, + "loss": 2.2133, + "step": 530 + }, + { + "epoch": 0.6491442542787286, + "grad_norm": 0.5480680465698242, + "learning_rate": 0.001, + "loss": 2.1973, + "step": 531 + }, + { + "epoch": 0.6503667481662592, + "grad_norm": 0.5218977332115173, + "learning_rate": 0.001, + "loss": 2.1938, + "step": 532 + }, + { + "epoch": 0.6515892420537898, + "grad_norm": 0.5179240107536316, + "learning_rate": 0.001, + "loss": 2.1262, + "step": 533 + }, + { + "epoch": 0.6528117359413202, + "grad_norm": 0.6605428457260132, + "learning_rate": 0.001, + "loss": 2.2826, + "step": 534 + }, + { + "epoch": 0.6540342298288508, + "grad_norm": 0.5191709995269775, + "learning_rate": 0.001, + "loss": 2.1649, + "step": 535 + }, + { + "epoch": 0.6552567237163814, + "grad_norm": 0.5000888109207153, + "learning_rate": 0.001, + "loss": 2.1345, + "step": 536 + }, + { + "epoch": 0.656479217603912, + "grad_norm": 0.5836233496665955, + "learning_rate": 0.001, + "loss": 2.1304, + "step": 537 + }, + { + "epoch": 0.6577017114914425, + "grad_norm": 0.672378420829773, + "learning_rate": 0.001, + "loss": 2.1928, + "step": 538 + }, + { + "epoch": 0.6589242053789731, + "grad_norm": 0.5583218932151794, + "learning_rate": 0.001, + "loss": 2.1173, + "step": 539 + }, + { + "epoch": 0.6601466992665037, + "grad_norm": 0.6046769618988037, + "learning_rate": 0.001, + "loss": 2.1138, + "step": 540 + }, + { + "epoch": 0.6613691931540342, + "grad_norm": 0.6253052353858948, + "learning_rate": 0.001, + "loss": 2.1529, + "step": 541 + }, + { + "epoch": 0.6625916870415648, + "grad_norm": 0.7345244884490967, + "learning_rate": 0.001, + "loss": 2.1945, + "step": 542 + }, + { + "epoch": 0.6638141809290954, + "grad_norm": 0.5053750276565552, + "learning_rate": 0.001, + "loss": 2.1233, + "step": 543 + }, + { + "epoch": 0.6650366748166259, + "grad_norm": 0.6003459095954895, + "learning_rate": 0.001, + "loss": 2.1655, + "step": 544 + }, + { + "epoch": 0.6662591687041565, + "grad_norm": 0.9038470387458801, + "learning_rate": 0.001, + "loss": 2.198, + "step": 545 + }, + { + "epoch": 0.6674816625916871, + "grad_norm": 0.6527109146118164, + "learning_rate": 0.001, + "loss": 2.2085, + "step": 546 + }, + { + "epoch": 0.6687041564792175, + "grad_norm": 0.6161382794380188, + "learning_rate": 0.001, + "loss": 2.0986, + "step": 547 + }, + { + "epoch": 0.6699266503667481, + "grad_norm": 0.5980858206748962, + "learning_rate": 0.001, + "loss": 2.2287, + "step": 548 + }, + { + "epoch": 0.6711491442542787, + "grad_norm": 0.5564101934432983, + "learning_rate": 0.001, + "loss": 2.2339, + "step": 549 + }, + { + "epoch": 0.6723716381418093, + "grad_norm": 0.5963722467422485, + "learning_rate": 0.001, + "loss": 2.1952, + "step": 550 + }, + { + "epoch": 0.6735941320293398, + "grad_norm": 0.5184721350669861, + "learning_rate": 0.001, + "loss": 2.1839, + "step": 551 + }, + { + "epoch": 0.6748166259168704, + "grad_norm": 0.5719816088676453, + "learning_rate": 0.001, + "loss": 2.1343, + "step": 552 + }, + { + "epoch": 0.676039119804401, + "grad_norm": 0.49916115403175354, + "learning_rate": 0.001, + "loss": 2.2026, + "step": 553 + }, + { + "epoch": 0.6772616136919315, + "grad_norm": 0.7603865265846252, + "learning_rate": 0.001, + "loss": 2.1293, + "step": 554 + }, + { + "epoch": 0.6784841075794621, + "grad_norm": 0.5817961096763611, + "learning_rate": 0.001, + "loss": 2.3228, + "step": 555 + }, + { + "epoch": 0.6797066014669927, + "grad_norm": 0.6436595916748047, + "learning_rate": 0.001, + "loss": 2.1515, + "step": 556 + }, + { + "epoch": 0.6809290953545232, + "grad_norm": 0.600204586982727, + "learning_rate": 0.001, + "loss": 2.2553, + "step": 557 + }, + { + "epoch": 0.6821515892420538, + "grad_norm": 0.5750772953033447, + "learning_rate": 0.001, + "loss": 2.2018, + "step": 558 + }, + { + "epoch": 0.6833740831295844, + "grad_norm": 0.6340410709381104, + "learning_rate": 0.001, + "loss": 2.2321, + "step": 559 + }, + { + "epoch": 0.684596577017115, + "grad_norm": 0.5944398641586304, + "learning_rate": 0.001, + "loss": 2.1864, + "step": 560 + }, + { + "epoch": 0.6858190709046454, + "grad_norm": 0.5228376984596252, + "learning_rate": 0.001, + "loss": 2.2398, + "step": 561 + }, + { + "epoch": 0.687041564792176, + "grad_norm": 0.7118151187896729, + "learning_rate": 0.001, + "loss": 2.3087, + "step": 562 + }, + { + "epoch": 0.6882640586797066, + "grad_norm": 0.6326483488082886, + "learning_rate": 0.001, + "loss": 2.1648, + "step": 563 + }, + { + "epoch": 0.6894865525672371, + "grad_norm": 0.6230838894844055, + "learning_rate": 0.001, + "loss": 2.1223, + "step": 564 + }, + { + "epoch": 0.6907090464547677, + "grad_norm": 0.5977362990379333, + "learning_rate": 0.001, + "loss": 2.2783, + "step": 565 + }, + { + "epoch": 0.6919315403422983, + "grad_norm": 0.5887629389762878, + "learning_rate": 0.001, + "loss": 2.03, + "step": 566 + }, + { + "epoch": 0.6931540342298288, + "grad_norm": 0.5934235453605652, + "learning_rate": 0.001, + "loss": 2.228, + "step": 567 + }, + { + "epoch": 0.6943765281173594, + "grad_norm": 0.5208036303520203, + "learning_rate": 0.001, + "loss": 2.3098, + "step": 568 + }, + { + "epoch": 0.69559902200489, + "grad_norm": 0.6961516737937927, + "learning_rate": 0.001, + "loss": 2.1223, + "step": 569 + }, + { + "epoch": 0.6968215158924206, + "grad_norm": 0.6044623255729675, + "learning_rate": 0.001, + "loss": 2.2311, + "step": 570 + }, + { + "epoch": 0.6980440097799511, + "grad_norm": 0.5864453911781311, + "learning_rate": 0.001, + "loss": 2.2621, + "step": 571 + }, + { + "epoch": 0.6992665036674817, + "grad_norm": 0.6569441556930542, + "learning_rate": 0.001, + "loss": 2.3103, + "step": 572 + }, + { + "epoch": 0.7004889975550123, + "grad_norm": 0.6718661189079285, + "learning_rate": 0.001, + "loss": 2.1022, + "step": 573 + }, + { + "epoch": 0.7017114914425427, + "grad_norm": 0.6719921231269836, + "learning_rate": 0.001, + "loss": 2.2599, + "step": 574 + }, + { + "epoch": 0.7029339853300733, + "grad_norm": 0.5817815661430359, + "learning_rate": 0.001, + "loss": 2.1637, + "step": 575 + }, + { + "epoch": 0.7041564792176039, + "grad_norm": 0.648612380027771, + "learning_rate": 0.001, + "loss": 2.2681, + "step": 576 + }, + { + "epoch": 0.7053789731051344, + "grad_norm": 0.6213855147361755, + "learning_rate": 0.001, + "loss": 2.1772, + "step": 577 + }, + { + "epoch": 0.706601466992665, + "grad_norm": 0.6786682605743408, + "learning_rate": 0.001, + "loss": 2.1969, + "step": 578 + }, + { + "epoch": 0.7078239608801956, + "grad_norm": 0.6275045275688171, + "learning_rate": 0.001, + "loss": 2.259, + "step": 579 + }, + { + "epoch": 0.7090464547677262, + "grad_norm": 0.6425896286964417, + "learning_rate": 0.001, + "loss": 2.263, + "step": 580 + }, + { + "epoch": 0.7102689486552567, + "grad_norm": 0.6005223989486694, + "learning_rate": 0.001, + "loss": 2.2789, + "step": 581 + }, + { + "epoch": 0.7114914425427873, + "grad_norm": 0.8881091475486755, + "learning_rate": 0.001, + "loss": 2.0891, + "step": 582 + }, + { + "epoch": 0.7127139364303179, + "grad_norm": 0.511212170124054, + "learning_rate": 0.001, + "loss": 2.1824, + "step": 583 + }, + { + "epoch": 0.7139364303178484, + "grad_norm": 0.5520249009132385, + "learning_rate": 0.001, + "loss": 2.2404, + "step": 584 + }, + { + "epoch": 0.715158924205379, + "grad_norm": 0.5928168296813965, + "learning_rate": 0.001, + "loss": 2.2272, + "step": 585 + }, + { + "epoch": 0.7163814180929096, + "grad_norm": 0.5859750509262085, + "learning_rate": 0.001, + "loss": 2.1354, + "step": 586 + }, + { + "epoch": 0.71760391198044, + "grad_norm": 0.5234774947166443, + "learning_rate": 0.001, + "loss": 2.2706, + "step": 587 + }, + { + "epoch": 0.7188264058679706, + "grad_norm": 0.6386865973472595, + "learning_rate": 0.001, + "loss": 2.2589, + "step": 588 + }, + { + "epoch": 0.7200488997555012, + "grad_norm": 0.6570860147476196, + "learning_rate": 0.001, + "loss": 2.1486, + "step": 589 + }, + { + "epoch": 0.7212713936430318, + "grad_norm": 0.6303104162216187, + "learning_rate": 0.001, + "loss": 2.1699, + "step": 590 + }, + { + "epoch": 0.7224938875305623, + "grad_norm": 0.6067413091659546, + "learning_rate": 0.001, + "loss": 2.1678, + "step": 591 + }, + { + "epoch": 0.7237163814180929, + "grad_norm": 0.7316265106201172, + "learning_rate": 0.001, + "loss": 2.1259, + "step": 592 + }, + { + "epoch": 0.7249388753056235, + "grad_norm": 0.5220648050308228, + "learning_rate": 0.001, + "loss": 2.2057, + "step": 593 + }, + { + "epoch": 0.726161369193154, + "grad_norm": 0.5792797803878784, + "learning_rate": 0.001, + "loss": 2.1987, + "step": 594 + }, + { + "epoch": 0.7273838630806846, + "grad_norm": 0.885420024394989, + "learning_rate": 0.001, + "loss": 2.2314, + "step": 595 + }, + { + "epoch": 0.7286063569682152, + "grad_norm": 0.5722411274909973, + "learning_rate": 0.001, + "loss": 2.094, + "step": 596 + }, + { + "epoch": 0.7298288508557457, + "grad_norm": 0.5585153698921204, + "learning_rate": 0.001, + "loss": 2.2757, + "step": 597 + }, + { + "epoch": 0.7310513447432763, + "grad_norm": 0.5866743922233582, + "learning_rate": 0.001, + "loss": 2.2134, + "step": 598 + }, + { + "epoch": 0.7322738386308069, + "grad_norm": 0.5329750180244446, + "learning_rate": 0.001, + "loss": 2.2929, + "step": 599 + }, + { + "epoch": 0.7334963325183375, + "grad_norm": 0.5793975591659546, + "learning_rate": 0.001, + "loss": 2.1822, + "step": 600 + }, + { + "epoch": 0.7347188264058679, + "grad_norm": 0.621846616268158, + "learning_rate": 0.001, + "loss": 2.2078, + "step": 601 + }, + { + "epoch": 0.7359413202933985, + "grad_norm": 0.6965752840042114, + "learning_rate": 0.001, + "loss": 2.1822, + "step": 602 + }, + { + "epoch": 0.7371638141809291, + "grad_norm": 0.626397430896759, + "learning_rate": 0.001, + "loss": 2.21, + "step": 603 + }, + { + "epoch": 0.7383863080684596, + "grad_norm": 0.8464508056640625, + "learning_rate": 0.001, + "loss": 2.18, + "step": 604 + }, + { + "epoch": 0.7396088019559902, + "grad_norm": 0.5484370589256287, + "learning_rate": 0.001, + "loss": 2.12, + "step": 605 + }, + { + "epoch": 0.7408312958435208, + "grad_norm": 0.7459133267402649, + "learning_rate": 0.001, + "loss": 2.1339, + "step": 606 + }, + { + "epoch": 0.7420537897310513, + "grad_norm": 0.7169046998023987, + "learning_rate": 0.001, + "loss": 2.296, + "step": 607 + }, + { + "epoch": 0.7432762836185819, + "grad_norm": 0.5717725157737732, + "learning_rate": 0.001, + "loss": 2.163, + "step": 608 + }, + { + "epoch": 0.7444987775061125, + "grad_norm": 0.6337913274765015, + "learning_rate": 0.001, + "loss": 2.1702, + "step": 609 + }, + { + "epoch": 0.7457212713936431, + "grad_norm": 0.6826694011688232, + "learning_rate": 0.001, + "loss": 2.2204, + "step": 610 + }, + { + "epoch": 0.7469437652811736, + "grad_norm": 0.6060184836387634, + "learning_rate": 0.001, + "loss": 2.1545, + "step": 611 + }, + { + "epoch": 0.7481662591687042, + "grad_norm": 0.6665242910385132, + "learning_rate": 0.001, + "loss": 2.2457, + "step": 612 + }, + { + "epoch": 0.7493887530562348, + "grad_norm": 0.5639733076095581, + "learning_rate": 0.001, + "loss": 2.1641, + "step": 613 + }, + { + "epoch": 0.7506112469437652, + "grad_norm": 0.6882290840148926, + "learning_rate": 0.001, + "loss": 2.1655, + "step": 614 + }, + { + "epoch": 0.7518337408312958, + "grad_norm": 0.5488794445991516, + "learning_rate": 0.001, + "loss": 2.2228, + "step": 615 + }, + { + "epoch": 0.7518337408312958, + "eval_loss": 2.115697145462036, + "eval_runtime": 70.7202, + "eval_samples_per_second": 147.978, + "eval_steps_per_second": 18.51, + "step": 615 + }, + { + "epoch": 0.7530562347188264, + "grad_norm": 0.5853757262229919, + "learning_rate": 0.001, + "loss": 2.1722, + "step": 616 + }, + { + "epoch": 0.7542787286063569, + "grad_norm": 0.6095851063728333, + "learning_rate": 0.001, + "loss": 2.1482, + "step": 617 + }, + { + "epoch": 0.7555012224938875, + "grad_norm": 0.7331103682518005, + "learning_rate": 0.001, + "loss": 2.2136, + "step": 618 + }, + { + "epoch": 0.7567237163814181, + "grad_norm": 0.5856543779373169, + "learning_rate": 0.001, + "loss": 2.2382, + "step": 619 + }, + { + "epoch": 0.7579462102689487, + "grad_norm": 0.8980780839920044, + "learning_rate": 0.001, + "loss": 2.1312, + "step": 620 + }, + { + "epoch": 0.7591687041564792, + "grad_norm": 0.5309576988220215, + "learning_rate": 0.001, + "loss": 2.1447, + "step": 621 + }, + { + "epoch": 0.7603911980440098, + "grad_norm": 0.5389253497123718, + "learning_rate": 0.001, + "loss": 2.1772, + "step": 622 + }, + { + "epoch": 0.7616136919315404, + "grad_norm": 0.6142221093177795, + "learning_rate": 0.001, + "loss": 2.1545, + "step": 623 + }, + { + "epoch": 0.7628361858190709, + "grad_norm": 0.6199280023574829, + "learning_rate": 0.001, + "loss": 2.1463, + "step": 624 + }, + { + "epoch": 0.7640586797066015, + "grad_norm": 0.6900845170021057, + "learning_rate": 0.001, + "loss": 2.1927, + "step": 625 + }, + { + "epoch": 0.7652811735941321, + "grad_norm": 0.4952549338340759, + "learning_rate": 0.001, + "loss": 2.2629, + "step": 626 + }, + { + "epoch": 0.7665036674816625, + "grad_norm": 0.7719544172286987, + "learning_rate": 0.001, + "loss": 2.2246, + "step": 627 + }, + { + "epoch": 0.7677261613691931, + "grad_norm": 0.6267576813697815, + "learning_rate": 0.001, + "loss": 2.1653, + "step": 628 + }, + { + "epoch": 0.7689486552567237, + "grad_norm": 0.5654335618019104, + "learning_rate": 0.001, + "loss": 2.2629, + "step": 629 + }, + { + "epoch": 0.7701711491442543, + "grad_norm": 0.5660375952720642, + "learning_rate": 0.001, + "loss": 2.1891, + "step": 630 + }, + { + "epoch": 0.7713936430317848, + "grad_norm": 0.5151835083961487, + "learning_rate": 0.001, + "loss": 2.1307, + "step": 631 + }, + { + "epoch": 0.7726161369193154, + "grad_norm": 0.615497350692749, + "learning_rate": 0.001, + "loss": 2.2797, + "step": 632 + }, + { + "epoch": 0.773838630806846, + "grad_norm": 0.627841055393219, + "learning_rate": 0.001, + "loss": 2.2369, + "step": 633 + }, + { + "epoch": 0.7750611246943765, + "grad_norm": 0.5649968981742859, + "learning_rate": 0.001, + "loss": 2.1355, + "step": 634 + }, + { + "epoch": 0.7762836185819071, + "grad_norm": 0.6311205625534058, + "learning_rate": 0.001, + "loss": 2.2487, + "step": 635 + }, + { + "epoch": 0.7775061124694377, + "grad_norm": 0.6173393130302429, + "learning_rate": 0.001, + "loss": 2.2116, + "step": 636 + }, + { + "epoch": 0.7787286063569682, + "grad_norm": 0.706934928894043, + "learning_rate": 0.001, + "loss": 2.1385, + "step": 637 + }, + { + "epoch": 0.7799511002444988, + "grad_norm": 0.6243404746055603, + "learning_rate": 0.001, + "loss": 2.2883, + "step": 638 + }, + { + "epoch": 0.7811735941320294, + "grad_norm": 0.6092289686203003, + "learning_rate": 0.001, + "loss": 2.248, + "step": 639 + }, + { + "epoch": 0.78239608801956, + "grad_norm": 0.6141385436058044, + "learning_rate": 0.001, + "loss": 2.3563, + "step": 640 + }, + { + "epoch": 0.7836185819070904, + "grad_norm": 0.5421451926231384, + "learning_rate": 0.001, + "loss": 2.2518, + "step": 641 + }, + { + "epoch": 0.784841075794621, + "grad_norm": 0.5878483653068542, + "learning_rate": 0.001, + "loss": 2.1793, + "step": 642 + }, + { + "epoch": 0.7860635696821516, + "grad_norm": 0.5330148339271545, + "learning_rate": 0.001, + "loss": 2.2476, + "step": 643 + }, + { + "epoch": 0.7872860635696821, + "grad_norm": 0.5393016934394836, + "learning_rate": 0.001, + "loss": 2.2135, + "step": 644 + }, + { + "epoch": 0.7885085574572127, + "grad_norm": 0.5908885598182678, + "learning_rate": 0.001, + "loss": 2.0988, + "step": 645 + }, + { + "epoch": 0.7897310513447433, + "grad_norm": 0.5079434514045715, + "learning_rate": 0.001, + "loss": 2.1475, + "step": 646 + }, + { + "epoch": 0.7909535452322738, + "grad_norm": 0.4999438226222992, + "learning_rate": 0.001, + "loss": 2.1897, + "step": 647 + }, + { + "epoch": 0.7921760391198044, + "grad_norm": 0.5183396339416504, + "learning_rate": 0.001, + "loss": 2.2245, + "step": 648 + }, + { + "epoch": 0.793398533007335, + "grad_norm": 0.5283822417259216, + "learning_rate": 0.001, + "loss": 2.238, + "step": 649 + }, + { + "epoch": 0.7946210268948656, + "grad_norm": 0.6713541150093079, + "learning_rate": 0.001, + "loss": 2.1341, + "step": 650 + }, + { + "epoch": 0.7958435207823961, + "grad_norm": 0.6195135712623596, + "learning_rate": 0.001, + "loss": 2.2756, + "step": 651 + }, + { + "epoch": 0.7970660146699267, + "grad_norm": 1.1329959630966187, + "learning_rate": 0.001, + "loss": 2.1616, + "step": 652 + }, + { + "epoch": 0.7982885085574573, + "grad_norm": 0.6215202212333679, + "learning_rate": 0.001, + "loss": 2.1917, + "step": 653 + }, + { + "epoch": 0.7995110024449877, + "grad_norm": 0.9555675983428955, + "learning_rate": 0.001, + "loss": 2.1186, + "step": 654 + }, + { + "epoch": 0.8007334963325183, + "grad_norm": 0.6120454668998718, + "learning_rate": 0.001, + "loss": 2.1704, + "step": 655 + }, + { + "epoch": 0.8019559902200489, + "grad_norm": 0.7060715556144714, + "learning_rate": 0.001, + "loss": 2.2127, + "step": 656 + }, + { + "epoch": 0.8031784841075794, + "grad_norm": 0.6769619584083557, + "learning_rate": 0.001, + "loss": 2.2006, + "step": 657 + }, + { + "epoch": 0.80440097799511, + "grad_norm": 0.5458390712738037, + "learning_rate": 0.001, + "loss": 2.2003, + "step": 658 + }, + { + "epoch": 0.8056234718826406, + "grad_norm": 0.6172414422035217, + "learning_rate": 0.001, + "loss": 2.221, + "step": 659 + }, + { + "epoch": 0.8068459657701712, + "grad_norm": 0.8861773014068604, + "learning_rate": 0.001, + "loss": 2.1507, + "step": 660 + }, + { + "epoch": 0.8080684596577017, + "grad_norm": 0.5643443465232849, + "learning_rate": 0.001, + "loss": 2.1747, + "step": 661 + }, + { + "epoch": 0.8092909535452323, + "grad_norm": 0.6740103363990784, + "learning_rate": 0.001, + "loss": 2.211, + "step": 662 + }, + { + "epoch": 0.8105134474327629, + "grad_norm": 0.6159763336181641, + "learning_rate": 0.001, + "loss": 2.311, + "step": 663 + }, + { + "epoch": 0.8117359413202934, + "grad_norm": 0.7396857142448425, + "learning_rate": 0.001, + "loss": 2.1574, + "step": 664 + }, + { + "epoch": 0.812958435207824, + "grad_norm": 0.7003642320632935, + "learning_rate": 0.001, + "loss": 2.1753, + "step": 665 + }, + { + "epoch": 0.8141809290953546, + "grad_norm": 0.735837996006012, + "learning_rate": 0.001, + "loss": 2.1408, + "step": 666 + }, + { + "epoch": 0.815403422982885, + "grad_norm": 0.6381567716598511, + "learning_rate": 0.001, + "loss": 2.1237, + "step": 667 + }, + { + "epoch": 0.8166259168704156, + "grad_norm": 0.6023222804069519, + "learning_rate": 0.001, + "loss": 2.2082, + "step": 668 + }, + { + "epoch": 0.8178484107579462, + "grad_norm": 0.7556263208389282, + "learning_rate": 0.001, + "loss": 2.1482, + "step": 669 + }, + { + "epoch": 0.8190709046454768, + "grad_norm": 0.685670018196106, + "learning_rate": 0.001, + "loss": 2.1409, + "step": 670 + }, + { + "epoch": 0.8202933985330073, + "grad_norm": 0.6988388895988464, + "learning_rate": 0.001, + "loss": 2.215, + "step": 671 + }, + { + "epoch": 0.8215158924205379, + "grad_norm": 0.7381739616394043, + "learning_rate": 0.001, + "loss": 2.1533, + "step": 672 + }, + { + "epoch": 0.8227383863080685, + "grad_norm": 0.6375918388366699, + "learning_rate": 0.001, + "loss": 2.1447, + "step": 673 + }, + { + "epoch": 0.823960880195599, + "grad_norm": 0.8125602006912231, + "learning_rate": 0.001, + "loss": 2.2065, + "step": 674 + }, + { + "epoch": 0.8251833740831296, + "grad_norm": 0.6340590119361877, + "learning_rate": 0.001, + "loss": 2.2653, + "step": 675 + }, + { + "epoch": 0.8264058679706602, + "grad_norm": 0.6410722732543945, + "learning_rate": 0.001, + "loss": 2.2131, + "step": 676 + }, + { + "epoch": 0.8276283618581907, + "grad_norm": 0.7203342318534851, + "learning_rate": 0.001, + "loss": 2.1969, + "step": 677 + }, + { + "epoch": 0.8288508557457213, + "grad_norm": 0.5189094543457031, + "learning_rate": 0.001, + "loss": 2.0506, + "step": 678 + }, + { + "epoch": 0.8300733496332519, + "grad_norm": 0.5186660885810852, + "learning_rate": 0.001, + "loss": 2.0897, + "step": 679 + }, + { + "epoch": 0.8312958435207825, + "grad_norm": 0.5656071901321411, + "learning_rate": 0.001, + "loss": 2.2073, + "step": 680 + }, + { + "epoch": 0.8325183374083129, + "grad_norm": 0.621357262134552, + "learning_rate": 0.001, + "loss": 2.3196, + "step": 681 + }, + { + "epoch": 0.8337408312958435, + "grad_norm": 0.6449140906333923, + "learning_rate": 0.001, + "loss": 2.1453, + "step": 682 + }, + { + "epoch": 0.8349633251833741, + "grad_norm": 0.5833041071891785, + "learning_rate": 0.001, + "loss": 2.1055, + "step": 683 + }, + { + "epoch": 0.8361858190709046, + "grad_norm": 0.5668090581893921, + "learning_rate": 0.001, + "loss": 2.2212, + "step": 684 + }, + { + "epoch": 0.8374083129584352, + "grad_norm": 0.611060619354248, + "learning_rate": 0.001, + "loss": 2.2337, + "step": 685 + }, + { + "epoch": 0.8386308068459658, + "grad_norm": 0.607295036315918, + "learning_rate": 0.001, + "loss": 2.1919, + "step": 686 + }, + { + "epoch": 0.8398533007334963, + "grad_norm": 0.6494394540786743, + "learning_rate": 0.001, + "loss": 2.1755, + "step": 687 + }, + { + "epoch": 0.8410757946210269, + "grad_norm": 0.650504469871521, + "learning_rate": 0.001, + "loss": 2.2382, + "step": 688 + }, + { + "epoch": 0.8422982885085575, + "grad_norm": 0.5527434349060059, + "learning_rate": 0.001, + "loss": 2.1191, + "step": 689 + }, + { + "epoch": 0.843520782396088, + "grad_norm": 0.6322658061981201, + "learning_rate": 0.001, + "loss": 2.1291, + "step": 690 + }, + { + "epoch": 0.8447432762836186, + "grad_norm": 0.6053141355514526, + "learning_rate": 0.001, + "loss": 2.169, + "step": 691 + }, + { + "epoch": 0.8459657701711492, + "grad_norm": 0.670330822467804, + "learning_rate": 0.001, + "loss": 2.1265, + "step": 692 + }, + { + "epoch": 0.8471882640586798, + "grad_norm": 0.810947835445404, + "learning_rate": 0.001, + "loss": 2.2218, + "step": 693 + }, + { + "epoch": 0.8484107579462102, + "grad_norm": 0.6541447639465332, + "learning_rate": 0.001, + "loss": 2.1557, + "step": 694 + }, + { + "epoch": 0.8496332518337408, + "grad_norm": 0.5765539407730103, + "learning_rate": 0.001, + "loss": 2.1568, + "step": 695 + }, + { + "epoch": 0.8508557457212714, + "grad_norm": 0.9142853617668152, + "learning_rate": 0.001, + "loss": 2.1001, + "step": 696 + }, + { + "epoch": 0.8520782396088019, + "grad_norm": 0.8350043296813965, + "learning_rate": 0.001, + "loss": 2.0955, + "step": 697 + }, + { + "epoch": 0.8533007334963325, + "grad_norm": 0.7408676743507385, + "learning_rate": 0.001, + "loss": 2.2104, + "step": 698 + }, + { + "epoch": 0.8545232273838631, + "grad_norm": 0.6313987970352173, + "learning_rate": 0.001, + "loss": 2.228, + "step": 699 + }, + { + "epoch": 0.8557457212713936, + "grad_norm": 0.503736138343811, + "learning_rate": 0.001, + "loss": 2.2701, + "step": 700 + }, + { + "epoch": 0.8569682151589242, + "grad_norm": 0.6567993760108948, + "learning_rate": 0.001, + "loss": 2.2163, + "step": 701 + }, + { + "epoch": 0.8581907090464548, + "grad_norm": 0.6446906328201294, + "learning_rate": 0.001, + "loss": 2.1723, + "step": 702 + }, + { + "epoch": 0.8594132029339854, + "grad_norm": 0.5486587882041931, + "learning_rate": 0.001, + "loss": 2.1876, + "step": 703 + }, + { + "epoch": 0.8606356968215159, + "grad_norm": 0.5563488602638245, + "learning_rate": 0.001, + "loss": 2.1077, + "step": 704 + }, + { + "epoch": 0.8618581907090465, + "grad_norm": 0.8015148043632507, + "learning_rate": 0.001, + "loss": 2.1492, + "step": 705 + }, + { + "epoch": 0.863080684596577, + "grad_norm": 0.5513556003570557, + "learning_rate": 0.001, + "loss": 2.207, + "step": 706 + }, + { + "epoch": 0.8643031784841075, + "grad_norm": 0.604444682598114, + "learning_rate": 0.001, + "loss": 2.2217, + "step": 707 + }, + { + "epoch": 0.8655256723716381, + "grad_norm": 0.5867708325386047, + "learning_rate": 0.001, + "loss": 2.236, + "step": 708 + }, + { + "epoch": 0.8667481662591687, + "grad_norm": 0.5340182781219482, + "learning_rate": 0.001, + "loss": 2.1848, + "step": 709 + }, + { + "epoch": 0.8679706601466992, + "grad_norm": 0.5035935044288635, + "learning_rate": 0.001, + "loss": 2.1546, + "step": 710 + }, + { + "epoch": 0.8691931540342298, + "grad_norm": 0.5513314604759216, + "learning_rate": 0.001, + "loss": 2.2459, + "step": 711 + }, + { + "epoch": 0.8704156479217604, + "grad_norm": 0.6878560781478882, + "learning_rate": 0.001, + "loss": 2.2302, + "step": 712 + }, + { + "epoch": 0.871638141809291, + "grad_norm": 0.5138952136039734, + "learning_rate": 0.001, + "loss": 2.186, + "step": 713 + }, + { + "epoch": 0.8728606356968215, + "grad_norm": 0.6376103162765503, + "learning_rate": 0.001, + "loss": 2.1625, + "step": 714 + }, + { + "epoch": 0.8740831295843521, + "grad_norm": 0.5404793620109558, + "learning_rate": 0.001, + "loss": 2.2442, + "step": 715 + }, + { + "epoch": 0.8753056234718827, + "grad_norm": 0.6397867798805237, + "learning_rate": 0.001, + "loss": 2.197, + "step": 716 + }, + { + "epoch": 0.8765281173594132, + "grad_norm": 0.5790246725082397, + "learning_rate": 0.001, + "loss": 2.1914, + "step": 717 + }, + { + "epoch": 0.8777506112469438, + "grad_norm": 0.5128389000892639, + "learning_rate": 0.001, + "loss": 2.2063, + "step": 718 + }, + { + "epoch": 0.8789731051344744, + "grad_norm": 0.5515012145042419, + "learning_rate": 0.001, + "loss": 2.1923, + "step": 719 + }, + { + "epoch": 0.8801955990220048, + "grad_norm": 0.5543470978736877, + "learning_rate": 0.001, + "loss": 2.2026, + "step": 720 + }, + { + "epoch": 0.8814180929095354, + "grad_norm": 0.7619017362594604, + "learning_rate": 0.001, + "loss": 2.2105, + "step": 721 + }, + { + "epoch": 0.882640586797066, + "grad_norm": 0.6313949227333069, + "learning_rate": 0.001, + "loss": 2.1459, + "step": 722 + }, + { + "epoch": 0.8838630806845966, + "grad_norm": 0.550726056098938, + "learning_rate": 0.001, + "loss": 2.3319, + "step": 723 + }, + { + "epoch": 0.8850855745721271, + "grad_norm": 0.7315700650215149, + "learning_rate": 0.001, + "loss": 2.2009, + "step": 724 + }, + { + "epoch": 0.8863080684596577, + "grad_norm": 0.705125629901886, + "learning_rate": 0.001, + "loss": 2.1458, + "step": 725 + }, + { + "epoch": 0.8875305623471883, + "grad_norm": 0.545971691608429, + "learning_rate": 0.001, + "loss": 2.161, + "step": 726 + }, + { + "epoch": 0.8887530562347188, + "grad_norm": 0.6012575030326843, + "learning_rate": 0.001, + "loss": 2.1907, + "step": 727 + }, + { + "epoch": 0.8899755501222494, + "grad_norm": 0.5882204174995422, + "learning_rate": 0.001, + "loss": 2.2835, + "step": 728 + }, + { + "epoch": 0.89119804400978, + "grad_norm": 0.5331866145133972, + "learning_rate": 0.001, + "loss": 2.2505, + "step": 729 + }, + { + "epoch": 0.8924205378973105, + "grad_norm": 0.5553421974182129, + "learning_rate": 0.001, + "loss": 2.1752, + "step": 730 + }, + { + "epoch": 0.8936430317848411, + "grad_norm": 0.576580822467804, + "learning_rate": 0.001, + "loss": 2.0457, + "step": 731 + }, + { + "epoch": 0.8948655256723717, + "grad_norm": 0.6258639693260193, + "learning_rate": 0.001, + "loss": 2.1565, + "step": 732 + }, + { + "epoch": 0.8960880195599022, + "grad_norm": 0.5421616435050964, + "learning_rate": 0.001, + "loss": 2.1701, + "step": 733 + }, + { + "epoch": 0.8973105134474327, + "grad_norm": 0.6916106939315796, + "learning_rate": 0.001, + "loss": 2.1443, + "step": 734 + }, + { + "epoch": 0.8985330073349633, + "grad_norm": 0.5956131219863892, + "learning_rate": 0.001, + "loss": 2.1897, + "step": 735 + }, + { + "epoch": 0.8997555012224939, + "grad_norm": 0.5249125361442566, + "learning_rate": 0.001, + "loss": 2.2231, + "step": 736 + }, + { + "epoch": 0.9009779951100244, + "grad_norm": 0.5568667650222778, + "learning_rate": 0.001, + "loss": 2.1758, + "step": 737 + }, + { + "epoch": 0.902200488997555, + "grad_norm": 0.5609161853790283, + "learning_rate": 0.001, + "loss": 2.1774, + "step": 738 + }, + { + "epoch": 0.9034229828850856, + "grad_norm": 0.5504893064498901, + "learning_rate": 0.001, + "loss": 2.1974, + "step": 739 + }, + { + "epoch": 0.9046454767726161, + "grad_norm": 0.526772141456604, + "learning_rate": 0.001, + "loss": 2.1867, + "step": 740 + }, + { + "epoch": 0.9058679706601467, + "grad_norm": 0.6314900517463684, + "learning_rate": 0.001, + "loss": 2.1958, + "step": 741 + }, + { + "epoch": 0.9070904645476773, + "grad_norm": 0.4864460229873657, + "learning_rate": 0.001, + "loss": 2.2753, + "step": 742 + }, + { + "epoch": 0.9083129584352079, + "grad_norm": 0.5816671848297119, + "learning_rate": 0.001, + "loss": 2.2385, + "step": 743 + }, + { + "epoch": 0.9095354523227384, + "grad_norm": 0.6227999329566956, + "learning_rate": 0.001, + "loss": 2.3065, + "step": 744 + }, + { + "epoch": 0.910757946210269, + "grad_norm": 0.5527008771896362, + "learning_rate": 0.001, + "loss": 2.184, + "step": 745 + }, + { + "epoch": 0.9119804400977995, + "grad_norm": 0.5377302169799805, + "learning_rate": 0.001, + "loss": 2.2083, + "step": 746 + }, + { + "epoch": 0.91320293398533, + "grad_norm": 0.7717643976211548, + "learning_rate": 0.001, + "loss": 2.2884, + "step": 747 + }, + { + "epoch": 0.9144254278728606, + "grad_norm": 0.5529236793518066, + "learning_rate": 0.001, + "loss": 2.1217, + "step": 748 + }, + { + "epoch": 0.9156479217603912, + "grad_norm": 0.557788074016571, + "learning_rate": 0.001, + "loss": 2.2013, + "step": 749 + }, + { + "epoch": 0.9168704156479217, + "grad_norm": 0.5905176401138306, + "learning_rate": 0.001, + "loss": 2.1523, + "step": 750 + }, + { + "epoch": 0.9180929095354523, + "grad_norm": 0.7020503878593445, + "learning_rate": 0.001, + "loss": 2.2644, + "step": 751 + }, + { + "epoch": 0.9193154034229829, + "grad_norm": 0.6087464094161987, + "learning_rate": 0.001, + "loss": 2.164, + "step": 752 + }, + { + "epoch": 0.9205378973105135, + "grad_norm": 0.7976818680763245, + "learning_rate": 0.001, + "loss": 2.2657, + "step": 753 + }, + { + "epoch": 0.921760391198044, + "grad_norm": 0.5396186709403992, + "learning_rate": 0.001, + "loss": 2.205, + "step": 754 + }, + { + "epoch": 0.9229828850855746, + "grad_norm": 0.7185906171798706, + "learning_rate": 0.001, + "loss": 2.1775, + "step": 755 + }, + { + "epoch": 0.9242053789731052, + "grad_norm": 0.5456767082214355, + "learning_rate": 0.001, + "loss": 2.1751, + "step": 756 + }, + { + "epoch": 0.9254278728606357, + "grad_norm": 0.6924658417701721, + "learning_rate": 0.001, + "loss": 2.1324, + "step": 757 + }, + { + "epoch": 0.9266503667481663, + "grad_norm": 0.5632027983665466, + "learning_rate": 0.001, + "loss": 2.1198, + "step": 758 + }, + { + "epoch": 0.9278728606356969, + "grad_norm": 0.6390364170074463, + "learning_rate": 0.001, + "loss": 2.261, + "step": 759 + }, + { + "epoch": 0.9290953545232273, + "grad_norm": 0.7371255159378052, + "learning_rate": 0.001, + "loss": 2.2915, + "step": 760 + }, + { + "epoch": 0.9303178484107579, + "grad_norm": 0.665230393409729, + "learning_rate": 0.001, + "loss": 2.1735, + "step": 761 + }, + { + "epoch": 0.9315403422982885, + "grad_norm": 0.5170220732688904, + "learning_rate": 0.001, + "loss": 2.251, + "step": 762 + }, + { + "epoch": 0.9327628361858191, + "grad_norm": 0.6142858266830444, + "learning_rate": 0.001, + "loss": 2.2155, + "step": 763 + }, + { + "epoch": 0.9339853300733496, + "grad_norm": 0.5615746974945068, + "learning_rate": 0.001, + "loss": 2.1899, + "step": 764 + }, + { + "epoch": 0.9352078239608802, + "grad_norm": 0.6055252552032471, + "learning_rate": 0.001, + "loss": 2.2205, + "step": 765 + }, + { + "epoch": 0.9364303178484108, + "grad_norm": 0.5220616459846497, + "learning_rate": 0.001, + "loss": 2.2125, + "step": 766 + }, + { + "epoch": 0.9376528117359413, + "grad_norm": 0.665969729423523, + "learning_rate": 0.001, + "loss": 2.2579, + "step": 767 + }, + { + "epoch": 0.9388753056234719, + "grad_norm": 0.8221463561058044, + "learning_rate": 0.001, + "loss": 2.2587, + "step": 768 + }, + { + "epoch": 0.9400977995110025, + "grad_norm": 0.5516085624694824, + "learning_rate": 0.001, + "loss": 2.1706, + "step": 769 + }, + { + "epoch": 0.941320293398533, + "grad_norm": 0.6319237351417542, + "learning_rate": 0.001, + "loss": 2.1506, + "step": 770 + }, + { + "epoch": 0.9425427872860636, + "grad_norm": 0.6836983561515808, + "learning_rate": 0.001, + "loss": 2.172, + "step": 771 + }, + { + "epoch": 0.9437652811735942, + "grad_norm": 0.5751057863235474, + "learning_rate": 0.001, + "loss": 2.1617, + "step": 772 + }, + { + "epoch": 0.9449877750611247, + "grad_norm": 0.7755267024040222, + "learning_rate": 0.001, + "loss": 2.1039, + "step": 773 + }, + { + "epoch": 0.9462102689486552, + "grad_norm": 0.5571411848068237, + "learning_rate": 0.001, + "loss": 2.1565, + "step": 774 + }, + { + "epoch": 0.9474327628361858, + "grad_norm": 0.5482550263404846, + "learning_rate": 0.001, + "loss": 2.2251, + "step": 775 + }, + { + "epoch": 0.9486552567237164, + "grad_norm": 0.5580353140830994, + "learning_rate": 0.001, + "loss": 2.2191, + "step": 776 + }, + { + "epoch": 0.9498777506112469, + "grad_norm": 0.5179866552352905, + "learning_rate": 0.001, + "loss": 2.1786, + "step": 777 + }, + { + "epoch": 0.9511002444987775, + "grad_norm": 0.6428946256637573, + "learning_rate": 0.001, + "loss": 2.2259, + "step": 778 + }, + { + "epoch": 0.9523227383863081, + "grad_norm": 0.5078992247581482, + "learning_rate": 0.001, + "loss": 2.1951, + "step": 779 + }, + { + "epoch": 0.9535452322738386, + "grad_norm": 0.6398012638092041, + "learning_rate": 0.001, + "loss": 2.1634, + "step": 780 + }, + { + "epoch": 0.9547677261613692, + "grad_norm": 0.6948238611221313, + "learning_rate": 0.001, + "loss": 2.2215, + "step": 781 + }, + { + "epoch": 0.9559902200488998, + "grad_norm": 0.6511239409446716, + "learning_rate": 0.001, + "loss": 2.305, + "step": 782 + }, + { + "epoch": 0.9572127139364304, + "grad_norm": 0.5249418020248413, + "learning_rate": 0.001, + "loss": 2.1631, + "step": 783 + }, + { + "epoch": 0.9584352078239609, + "grad_norm": 0.7171691656112671, + "learning_rate": 0.001, + "loss": 2.2198, + "step": 784 + }, + { + "epoch": 0.9596577017114915, + "grad_norm": 0.637071967124939, + "learning_rate": 0.001, + "loss": 2.2273, + "step": 785 + }, + { + "epoch": 0.960880195599022, + "grad_norm": 0.6645258069038391, + "learning_rate": 0.001, + "loss": 2.2887, + "step": 786 + }, + { + "epoch": 0.9621026894865525, + "grad_norm": 0.7602251172065735, + "learning_rate": 0.001, + "loss": 2.2839, + "step": 787 + }, + { + "epoch": 0.9633251833740831, + "grad_norm": 0.5988993644714355, + "learning_rate": 0.001, + "loss": 2.291, + "step": 788 + }, + { + "epoch": 0.9645476772616137, + "grad_norm": 0.6392702460289001, + "learning_rate": 0.001, + "loss": 2.1837, + "step": 789 + }, + { + "epoch": 0.9657701711491442, + "grad_norm": 0.5164241790771484, + "learning_rate": 0.001, + "loss": 2.1945, + "step": 790 + }, + { + "epoch": 0.9669926650366748, + "grad_norm": 0.6533142924308777, + "learning_rate": 0.001, + "loss": 2.258, + "step": 791 + }, + { + "epoch": 0.9682151589242054, + "grad_norm": 0.5421776175498962, + "learning_rate": 0.001, + "loss": 2.2432, + "step": 792 + }, + { + "epoch": 0.969437652811736, + "grad_norm": 0.9301859140396118, + "learning_rate": 0.001, + "loss": 2.175, + "step": 793 + }, + { + "epoch": 0.9706601466992665, + "grad_norm": 0.5094099640846252, + "learning_rate": 0.001, + "loss": 2.0958, + "step": 794 + }, + { + "epoch": 0.9718826405867971, + "grad_norm": 0.6982297897338867, + "learning_rate": 0.001, + "loss": 2.2227, + "step": 795 + }, + { + "epoch": 0.9731051344743277, + "grad_norm": 0.6126400232315063, + "learning_rate": 0.001, + "loss": 2.0977, + "step": 796 + }, + { + "epoch": 0.9743276283618582, + "grad_norm": 0.572688102722168, + "learning_rate": 0.001, + "loss": 2.1711, + "step": 797 + }, + { + "epoch": 0.9755501222493888, + "grad_norm": 0.5334901213645935, + "learning_rate": 0.001, + "loss": 2.177, + "step": 798 + }, + { + "epoch": 0.9767726161369193, + "grad_norm": 0.6569558382034302, + "learning_rate": 0.001, + "loss": 2.0747, + "step": 799 + }, + { + "epoch": 0.9779951100244498, + "grad_norm": 0.5903536677360535, + "learning_rate": 0.001, + "loss": 2.2039, + "step": 800 + }, + { + "epoch": 0.9792176039119804, + "grad_norm": 0.695564329624176, + "learning_rate": 0.001, + "loss": 2.1656, + "step": 801 + }, + { + "epoch": 0.980440097799511, + "grad_norm": 0.6315183043479919, + "learning_rate": 0.001, + "loss": 2.2292, + "step": 802 + }, + { + "epoch": 0.9816625916870416, + "grad_norm": 0.7556746006011963, + "learning_rate": 0.001, + "loss": 2.2316, + "step": 803 + }, + { + "epoch": 0.9828850855745721, + "grad_norm": 0.8313256502151489, + "learning_rate": 0.001, + "loss": 2.1614, + "step": 804 + }, + { + "epoch": 0.9841075794621027, + "grad_norm": 0.6595815420150757, + "learning_rate": 0.001, + "loss": 2.139, + "step": 805 + }, + { + "epoch": 0.9853300733496333, + "grad_norm": 0.6900365352630615, + "learning_rate": 0.001, + "loss": 2.1342, + "step": 806 + }, + { + "epoch": 0.9865525672371638, + "grad_norm": 0.9232804775238037, + "learning_rate": 0.001, + "loss": 2.2395, + "step": 807 + }, + { + "epoch": 0.9877750611246944, + "grad_norm": 0.6544688940048218, + "learning_rate": 0.001, + "loss": 2.1565, + "step": 808 + }, + { + "epoch": 0.988997555012225, + "grad_norm": 0.6720030903816223, + "learning_rate": 0.001, + "loss": 2.2108, + "step": 809 + }, + { + "epoch": 0.9902200488997555, + "grad_norm": 0.653714120388031, + "learning_rate": 0.001, + "loss": 2.121, + "step": 810 + }, + { + "epoch": 0.991442542787286, + "grad_norm": 0.5552472472190857, + "learning_rate": 0.001, + "loss": 2.1173, + "step": 811 + }, + { + "epoch": 0.9926650366748166, + "grad_norm": 0.5896117091178894, + "learning_rate": 0.001, + "loss": 2.2343, + "step": 812 + }, + { + "epoch": 0.9938875305623472, + "grad_norm": 0.5029270052909851, + "learning_rate": 0.001, + "loss": 2.1824, + "step": 813 + }, + { + "epoch": 0.9951100244498777, + "grad_norm": 0.6343913674354553, + "learning_rate": 0.001, + "loss": 2.1458, + "step": 814 + }, + { + "epoch": 0.9963325183374083, + "grad_norm": 0.5997858047485352, + "learning_rate": 0.001, + "loss": 2.1901, + "step": 815 + }, + { + "epoch": 0.9975550122249389, + "grad_norm": 0.5781143307685852, + "learning_rate": 0.001, + "loss": 2.1817, + "step": 816 + }, + { + "epoch": 0.9987775061124694, + "grad_norm": 0.6393863558769226, + "learning_rate": 0.001, + "loss": 2.1763, + "step": 817 + }, + { + "epoch": 1.0, + "grad_norm": 0.7026780247688293, + "learning_rate": 0.001, + "loss": 2.2279, + "step": 818 + }, + { + "epoch": 1.0, + "step": 818, + "total_flos": 3.557919619724083e+17, + "train_loss": 2.2239257339160425, + "train_runtime": 2338.1817, + "train_samples_per_second": 44.756, + "train_steps_per_second": 0.35 + } + ], + "logging_steps": 1, + "max_steps": 818, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 205, + "stateful_callbacks": { + "EarlyStoppingCallback": { + "args": { + "early_stopping_patience": 5, + "early_stopping_threshold": 0.0 + }, + "attributes": { + "early_stopping_patience_counter": 0 + } + }, + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 3.557919619724083e+17, + "train_batch_size": 128, + "trial_name": null, + "trial_params": null +}