{ "best_metric": null, "best_model_checkpoint": null, "epoch": 156.25, "eval_steps": 250, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0625, "grad_norm": 29172.793748496217, "learning_rate": 0.0, "loss": 233.2141, "num_input_tokens_seen": 71616, "step": 1 }, { "epoch": 0.0625, "eval_synth_IoU": 0.0, "eval_synth_MAE_x": 14.7373046875, "eval_synth_MAE_y": 43.91310119628906, "eval_synth_NUM_probability": 8.316255950546747e-08, "eval_synth_inside_bbox": 0.0, "eval_synth_loss": 9172.6328125, "eval_synth_loss_ce": 4.6333194971084595, "eval_synth_loss_xval": 9168.0, "eval_synth_runtime": 62.867, "eval_synth_samples_per_second": 2.036, "eval_synth_steps_per_second": 0.064, "num_input_tokens_seen": 71616, "step": 1 }, { "epoch": 0.0625, "loss": 8836.6484375, "loss_ce": 4.648736000061035, "loss_xval": 8832.0, "num_input_tokens_seen": 71616, "step": 1 }, { "epoch": 0.125, "grad_norm": 399652.4844158417, "learning_rate": 6.27684584633728e-06, "loss": 9956.6553, "num_input_tokens_seen": 143360, "step": 2 }, { "epoch": 0.125, "loss": 8644.6943359375, "loss_ce": 4.694034099578857, "loss_xval": 8640.0, "num_input_tokens_seen": 143360, "step": 2 }, { "epoch": 0.1875, "grad_norm": 404496.1754390865, "learning_rate": 9.948565289251939e-06, "loss": 10052.5811, "num_input_tokens_seen": 215040, "step": 3 }, { "epoch": 0.1875, "loss": 10884.490234375, "loss_ce": 4.490614891052246, "loss_xval": 10880.0, "num_input_tokens_seen": 215040, "step": 3 }, { "epoch": 0.25, "grad_norm": 192550.26822256536, "learning_rate": 1.255369169267456e-05, "loss": 3084.5933, "num_input_tokens_seen": 286848, "step": 4 }, { "epoch": 0.25, "loss": 3044.640380859375, "loss_ce": 4.640501499176025, "loss_xval": 3040.0, "num_input_tokens_seen": 286848, "step": 4 }, { "epoch": 0.3125, "grad_norm": 45415.22626884781, "learning_rate": 1.4574384717887574e-05, "loss": 373.7348, "num_input_tokens_seen": 358592, "step": 5 }, { "epoch": 0.3125, "loss": 460.8260192871094, "loss_ce": 4.826022148132324, "loss_xval": 456.0, "num_input_tokens_seen": 358592, "step": 5 }, { "epoch": 0.375, "grad_norm": 78593.80926834777, "learning_rate": 1.6225411135589218e-05, "loss": 934.9646, "num_input_tokens_seen": 430208, "step": 6 }, { "epoch": 0.375, "loss": 880.810791015625, "loss_ce": 4.810770034790039, "loss_xval": 876.0, "num_input_tokens_seen": 430208, "step": 6 }, { "epoch": 0.4375, "grad_norm": 28745.67233353964, "learning_rate": 1.762133408171179e-05, "loss": 258.9128, "num_input_tokens_seen": 501952, "step": 7 }, { "epoch": 0.4375, "loss": 208.93655395507812, "loss_ce": 4.936546325683594, "loss_xval": 204.0, "num_input_tokens_seen": 501952, "step": 7 }, { "epoch": 0.5, "grad_norm": 7699.87093071136, "learning_rate": 1.883053753901184e-05, "loss": 57.5219, "num_input_tokens_seen": 561088, "step": 8 }, { "epoch": 0.5, "loss": 62.05592346191406, "loss_ce": 4.8059234619140625, "loss_xval": 57.25, "num_input_tokens_seen": 561088, "step": 8 }, { "epoch": 0.5625, "grad_norm": 16712.558684854343, "learning_rate": 1.9897130578503877e-05, "loss": 95.5276, "num_input_tokens_seen": 632768, "step": 9 }, { "epoch": 0.5625, "loss": 103.8332290649414, "loss_ce": 4.8332319259643555, "loss_xval": 99.0, "num_input_tokens_seen": 632768, "step": 9 }, { "epoch": 0.625, "grad_norm": 17002.791469915122, "learning_rate": 2.0851230564224858e-05, "loss": 87.0461, "num_input_tokens_seen": 704320, "step": 10 }, { "epoch": 0.625, "loss": 110.79865264892578, "loss_ce": 4.798652648925781, "loss_xval": 106.0, "num_input_tokens_seen": 704320, "step": 10 }, { "epoch": 0.6875, "grad_norm": 5949.488232484429, "learning_rate": 2.1714318986131375e-05, "loss": 20.0366, "num_input_tokens_seen": 776064, "step": 11 }, { "epoch": 0.6875, "loss": 20.34089469909668, "loss_ce": 4.7783942222595215, "loss_xval": 15.5625, "num_input_tokens_seen": 776064, "step": 11 }, { "epoch": 0.75, "grad_norm": 4687.070808057291, "learning_rate": 2.2502256981926498e-05, "loss": 18.7925, "num_input_tokens_seen": 835200, "step": 12 }, { "epoch": 0.75, "loss": 21.56426429748535, "loss_ce": 4.689263820648193, "loss_xval": 16.875, "num_input_tokens_seen": 835200, "step": 12 }, { "epoch": 0.8125, "grad_norm": 3537.3965456054607, "learning_rate": 2.3227089674435412e-05, "loss": 22.4419, "num_input_tokens_seen": 906944, "step": 13 }, { "epoch": 0.8125, "loss": 25.58965301513672, "loss_ce": 4.964653968811035, "loss_xval": 20.625, "num_input_tokens_seen": 906944, "step": 13 }, { "epoch": 0.875, "grad_norm": 9298.24053084731, "learning_rate": 2.389817992804907e-05, "loss": 53.2494, "num_input_tokens_seen": 978624, "step": 14 }, { "epoch": 0.875, "loss": 74.64051818847656, "loss_ce": 5.1405205726623535, "loss_xval": 69.5, "num_input_tokens_seen": 978624, "step": 14 }, { "epoch": 0.9375, "grad_norm": 4254.288944119962, "learning_rate": 2.4522950007139514e-05, "loss": 22.9845, "num_input_tokens_seen": 1050368, "step": 15 }, { "epoch": 0.9375, "loss": 23.68242645263672, "loss_ce": 5.182426929473877, "loss_xval": 18.5, "num_input_tokens_seen": 1050368, "step": 15 }, { "epoch": 1.0, "grad_norm": 2730.8534629090714, "learning_rate": 2.510738338534912e-05, "loss": 11.6825, "num_input_tokens_seen": 1122112, "step": 16 }, { "epoch": 1.0, "loss": 12.665728569030762, "loss_ce": 5.321978569030762, "loss_xval": 7.34375, "num_input_tokens_seen": 1122112, "step": 16 }, { "epoch": 1.0625, "grad_norm": 1268.1179480482795, "learning_rate": 2.5656374157160173e-05, "loss": 9.2256, "num_input_tokens_seen": 1193664, "step": 17 }, { "epoch": 1.0625, "loss": 9.186548233032227, "loss_ce": 5.295922756195068, "loss_xval": 3.890625, "num_input_tokens_seen": 1193664, "step": 17 }, { "epoch": 1.125, "grad_norm": 1232.2475849082016, "learning_rate": 2.6173976424841157e-05, "loss": 8.0166, "num_input_tokens_seen": 1265408, "step": 18 }, { "epoch": 1.125, "loss": 8.235204696655273, "loss_ce": 5.641454696655273, "loss_xval": 2.59375, "num_input_tokens_seen": 1265408, "step": 18 }, { "epoch": 1.1875, "grad_norm": 930.3138210861072, "learning_rate": 2.666358616830022e-05, "loss": 7.102, "num_input_tokens_seen": 1324608, "step": 19 }, { "epoch": 1.1875, "loss": 6.875672817230225, "loss_ce": 5.766297817230225, "loss_xval": 1.109375, "num_input_tokens_seen": 1324608, "step": 19 }, { "epoch": 1.25, "grad_norm": 504.1642954319787, "learning_rate": 2.7128076410562138e-05, "loss": 6.509, "num_input_tokens_seen": 1383616, "step": 20 }, { "epoch": 1.25, "loss": 6.596158981323242, "loss_ce": 5.393033981323242, "loss_xval": 1.203125, "num_input_tokens_seen": 1383616, "step": 20 }, { "epoch": 1.3125, "grad_norm": 259.8236571378379, "learning_rate": 2.756989937096373e-05, "loss": 6.418, "num_input_tokens_seen": 1455232, "step": 21 }, { "epoch": 1.3125, "loss": 6.42617654800415, "loss_ce": 5.99453592300415, "loss_xval": 0.431640625, "num_input_tokens_seen": 1455232, "step": 21 }, { "epoch": 1.375, "grad_norm": 456.6421855261366, "learning_rate": 2.799116483246866e-05, "loss": 6.3956, "num_input_tokens_seen": 1526848, "step": 22 }, { "epoch": 1.375, "loss": 6.097779750823975, "loss_ce": 5.468873500823975, "loss_xval": 0.62890625, "num_input_tokens_seen": 1526848, "step": 22 }, { "epoch": 1.4375, "grad_norm": 566.2674289085138, "learning_rate": 2.8393701074525802e-05, "loss": 6.6971, "num_input_tokens_seen": 1598528, "step": 23 }, { "epoch": 1.4375, "loss": 6.89132833480835, "loss_ce": 5.68820333480835, "loss_xval": 1.203125, "num_input_tokens_seen": 1598528, "step": 23 }, { "epoch": 1.5, "grad_norm": 776.1354374337782, "learning_rate": 2.877910282826378e-05, "loss": 7.5419, "num_input_tokens_seen": 1670336, "step": 24 }, { "epoch": 1.5, "loss": 7.6928510665893555, "loss_ce": 6.2162885665893555, "loss_xval": 1.4765625, "num_input_tokens_seen": 1670336, "step": 24 }, { "epoch": 1.5625, "grad_norm": 487.7768849955227, "learning_rate": 2.9148769435775147e-05, "loss": 6.6087, "num_input_tokens_seen": 1741952, "step": 25 }, { "epoch": 1.5625, "loss": 6.729894638061523, "loss_ce": 5.921300888061523, "loss_xval": 0.80859375, "num_input_tokens_seen": 1741952, "step": 25 }, { "epoch": 1.625, "grad_norm": 126.59415634802222, "learning_rate": 2.950393552077269e-05, "loss": 5.676, "num_input_tokens_seen": 1788608, "step": 26 }, { "epoch": 1.625, "loss": 5.721405982971191, "loss_ce": 5.501679420471191, "loss_xval": 0.2197265625, "num_input_tokens_seen": 1788608, "step": 26 }, { "epoch": 1.6875, "grad_norm": 195.61738909493855, "learning_rate": 2.9845695867755812e-05, "loss": 5.8845, "num_input_tokens_seen": 1847680, "step": 27 }, { "epoch": 1.6875, "loss": 6.124744415283203, "loss_ce": 5.868885040283203, "loss_xval": 0.255859375, "num_input_tokens_seen": 1847680, "step": 27 }, { "epoch": 1.75, "grad_norm": 350.88859252845054, "learning_rate": 3.017502577438635e-05, "loss": 6.1343, "num_input_tokens_seen": 1919360, "step": 28 }, { "epoch": 1.75, "loss": 5.883708953857422, "loss_ce": 5.438396453857422, "loss_xval": 0.4453125, "num_input_tokens_seen": 1919360, "step": 28 }, { "epoch": 1.8125, "grad_norm": 447.93921756481274, "learning_rate": 3.049279783085195e-05, "loss": 6.0422, "num_input_tokens_seen": 1990976, "step": 29 }, { "epoch": 1.8125, "loss": 6.0027031898498535, "loss_ce": 5.3698906898498535, "loss_xval": 0.6328125, "num_input_tokens_seen": 1990976, "step": 29 }, { "epoch": 1.875, "grad_norm": 309.12130084336945, "learning_rate": 3.079979585347679e-05, "loss": 5.8828, "num_input_tokens_seen": 2062656, "step": 30 }, { "epoch": 1.875, "loss": 5.82305383682251, "loss_ce": 5.50078821182251, "loss_xval": 0.322265625, "num_input_tokens_seen": 2062656, "step": 30 }, { "epoch": 1.9375, "grad_norm": 274.02947169042403, "learning_rate": 3.1096726532791335e-05, "loss": 5.6858, "num_input_tokens_seen": 2134400, "step": 31 }, { "epoch": 1.9375, "loss": 5.657408237457275, "loss_ce": 5.414244174957275, "loss_xval": 0.2431640625, "num_input_tokens_seen": 2134400, "step": 31 }, { "epoch": 2.0, "grad_norm": 88.1339226566502, "learning_rate": 3.1384229231686404e-05, "loss": 5.118, "num_input_tokens_seen": 2205952, "step": 32 }, { "epoch": 2.0, "loss": 5.154168605804443, "loss_ce": 5.036981105804443, "loss_xval": 0.1171875, "num_input_tokens_seen": 2205952, "step": 32 }, { "epoch": 2.0625, "grad_norm": 230.87518409285667, "learning_rate": 3.1662884275383314e-05, "loss": 5.265, "num_input_tokens_seen": 2277632, "step": 33 }, { "epoch": 2.0625, "loss": 5.119575500488281, "loss_ce": 4.890083312988281, "loss_xval": 0.2294921875, "num_input_tokens_seen": 2277632, "step": 33 }, { "epoch": 2.125, "grad_norm": 301.8125776339725, "learning_rate": 3.1933220003497456e-05, "loss": 5.1474, "num_input_tokens_seen": 2336768, "step": 34 }, { "epoch": 2.125, "loss": 5.0475993156433105, "loss_ce": 4.6159586906433105, "loss_xval": 0.431640625, "num_input_tokens_seen": 2336768, "step": 34 }, { "epoch": 2.1875, "grad_norm": 362.43387778423926, "learning_rate": 3.219571879959937e-05, "loss": 5.2475, "num_input_tokens_seen": 2408448, "step": 35 }, { "epoch": 2.1875, "loss": 5.348223686218262, "loss_ce": 4.916583061218262, "loss_xval": 0.431640625, "num_input_tokens_seen": 2408448, "step": 35 }, { "epoch": 2.25, "grad_norm": 240.2118877111659, "learning_rate": 3.2450822271178436e-05, "loss": 4.8557, "num_input_tokens_seen": 2480064, "step": 36 }, { "epoch": 2.25, "loss": 4.754091262817383, "loss_ce": 4.496278762817383, "loss_xval": 0.2578125, "num_input_tokens_seen": 2480064, "step": 36 }, { "epoch": 2.3125, "grad_norm": 112.64967575290002, "learning_rate": 3.269893571973584e-05, "loss": 4.644, "num_input_tokens_seen": 2551616, "step": 37 }, { "epoch": 2.3125, "loss": 4.523304462432861, "loss_ce": 4.384632587432861, "loss_xval": 0.138671875, "num_input_tokens_seen": 2551616, "step": 37 }, { "epoch": 2.375, "grad_norm": 112.36137777715612, "learning_rate": 3.29404320146375e-05, "loss": 4.4596, "num_input_tokens_seen": 2623360, "step": 38 }, { "epoch": 2.375, "loss": 4.345686435699463, "loss_ce": 4.292219638824463, "loss_xval": 0.053466796875, "num_input_tokens_seen": 2623360, "step": 38 }, { "epoch": 2.4375, "grad_norm": 178.55025735379013, "learning_rate": 3.317565496368735e-05, "loss": 4.2165, "num_input_tokens_seen": 2682496, "step": 39 }, { "epoch": 2.4375, "loss": 4.244419097900391, "loss_ce": 4.049106597900391, "loss_xval": 0.1953125, "num_input_tokens_seen": 2682496, "step": 39 }, { "epoch": 2.5, "grad_norm": 315.4850255105284, "learning_rate": 3.340492225689942e-05, "loss": 4.4928, "num_input_tokens_seen": 2754048, "step": 40 }, { "epoch": 2.5, "loss": 4.407598972320557, "loss_ce": 4.122442722320557, "loss_xval": 0.28515625, "num_input_tokens_seen": 2754048, "step": 40 }, { "epoch": 2.5625, "grad_norm": 232.48313805962374, "learning_rate": 3.362852804672299e-05, "loss": 4.244, "num_input_tokens_seen": 2825728, "step": 41 }, { "epoch": 2.5625, "loss": 4.199363708496094, "loss_ce": 3.9737777709960938, "loss_xval": 0.2255859375, "num_input_tokens_seen": 2825728, "step": 41 }, { "epoch": 2.625, "grad_norm": 87.66313343151434, "learning_rate": 3.384674521730101e-05, "loss": 3.7806, "num_input_tokens_seen": 2884800, "step": 42 }, { "epoch": 2.625, "loss": 3.834822177886963, "loss_ce": 3.755720615386963, "loss_xval": 0.0791015625, "num_input_tokens_seen": 2884800, "step": 42 }, { "epoch": 2.6875, "grad_norm": 99.57816021937762, "learning_rate": 3.405982738667825e-05, "loss": 3.9589, "num_input_tokens_seen": 2956416, "step": 43 }, { "epoch": 2.6875, "loss": 3.855365753173828, "loss_ce": 3.767475128173828, "loss_xval": 0.087890625, "num_input_tokens_seen": 2956416, "step": 43 }, { "epoch": 2.75, "grad_norm": 157.27395398869928, "learning_rate": 3.4268010678805934e-05, "loss": 3.7278, "num_input_tokens_seen": 3028032, "step": 44 }, { "epoch": 2.75, "loss": 3.758025884628296, "loss_ce": 3.646697759628296, "loss_xval": 0.111328125, "num_input_tokens_seen": 3028032, "step": 44 }, { "epoch": 2.8125, "grad_norm": 199.10929648202688, "learning_rate": 3.447151529639145e-05, "loss": 3.6274, "num_input_tokens_seen": 3099648, "step": 45 }, { "epoch": 2.8125, "loss": 3.6010501384735107, "loss_ce": 3.4106204509735107, "loss_xval": 0.1904296875, "num_input_tokens_seen": 3099648, "step": 45 }, { "epoch": 2.875, "grad_norm": 188.23721951133405, "learning_rate": 3.467054692086308e-05, "loss": 3.4598, "num_input_tokens_seen": 3158720, "step": 46 }, { "epoch": 2.875, "loss": 3.4753458499908447, "loss_ce": 3.3415567874908447, "loss_xval": 0.1337890625, "num_input_tokens_seen": 3158720, "step": 46 }, { "epoch": 2.9375, "grad_norm": 52.48630854608108, "learning_rate": 3.486529796176414e-05, "loss": 3.3459, "num_input_tokens_seen": 3230400, "step": 47 }, { "epoch": 2.9375, "loss": 3.3899075984954834, "loss_ce": 3.3230130672454834, "loss_xval": 0.06689453125, "num_input_tokens_seen": 3230400, "step": 47 }, { "epoch": 3.0, "grad_norm": 78.14763974150075, "learning_rate": 3.505594867460106e-05, "loss": 3.2297, "num_input_tokens_seen": 3301952, "step": 48 }, { "epoch": 3.0, "loss": 3.199742555618286, "loss_ce": 3.146764039993286, "loss_xval": 0.052978515625, "num_input_tokens_seen": 3301952, "step": 48 }, { "epoch": 3.0625, "grad_norm": 107.09323318280654, "learning_rate": 3.524266816342358e-05, "loss": 3.1971, "num_input_tokens_seen": 3361088, "step": 49 }, { "epoch": 3.0625, "loss": 3.155874013900757, "loss_ce": 3.062124013900757, "loss_xval": 0.09375, "num_input_tokens_seen": 3361088, "step": 49 }, { "epoch": 3.125, "grad_norm": 175.2975707620488, "learning_rate": 3.542561528211243e-05, "loss": 3.178, "num_input_tokens_seen": 3432704, "step": 50 }, { "epoch": 3.125, "loss": 3.104872941970825, "loss_ce": 3.011611223220825, "loss_xval": 0.09326171875, "num_input_tokens_seen": 3432704, "step": 50 }, { "epoch": 3.1875, "grad_norm": 127.36496606890086, "learning_rate": 3.560493944641211e-05, "loss": 2.9993, "num_input_tokens_seen": 3504320, "step": 51 }, { "epoch": 3.1875, "eval_synth_IoU": 0.0066440212685847655, "eval_synth_MAE_x": 0.12607574462890625, "eval_synth_MAE_y": 0.268218994140625, "eval_synth_NUM_probability": 2.8884072733603716e-07, "eval_synth_inside_bbox": 0.0625, "eval_synth_loss": 2.902937412261963, "eval_synth_loss_ce": 2.8528276681900024, "eval_synth_loss_xval": 0.05010986328125, "eval_synth_runtime": 61.7514, "eval_synth_samples_per_second": 2.073, "eval_synth_steps_per_second": 0.065, "num_input_tokens_seen": 3504320, "step": 51 }, { "epoch": 3.1875, "loss": 2.87408447265625, "loss_ce": 2.84063720703125, "loss_xval": 0.033447265625, "num_input_tokens_seen": 3504320, "step": 51 }, { "epoch": 3.25, "grad_norm": 25.963210609727025, "learning_rate": 3.578078136710997e-05, "loss": 2.9028, "num_input_tokens_seen": 3576000, "step": 52 }, { "epoch": 3.25, "loss": 2.9049689769744873, "loss_ce": 2.8453986644744873, "loss_xval": 0.0595703125, "num_input_tokens_seen": 3576000, "step": 52 }, { "epoch": 3.3125, "grad_norm": 42.52841656307943, "learning_rate": 3.595327371337536e-05, "loss": 2.8712, "num_input_tokens_seen": 3647616, "step": 53 }, { "epoch": 3.3125, "loss": 2.880415678024292, "loss_ce": 2.812544584274292, "loss_xval": 0.06787109375, "num_input_tokens_seen": 3647616, "step": 53 }, { "epoch": 3.375, "grad_norm": 169.11167670262176, "learning_rate": 3.6122541714093095e-05, "loss": 2.845, "num_input_tokens_seen": 3719168, "step": 54 }, { "epoch": 3.375, "loss": 2.8389875888824463, "loss_ce": 2.7012922763824463, "loss_xval": 0.1376953125, "num_input_tokens_seen": 3719168, "step": 54 }, { "epoch": 3.4375, "grad_norm": 116.0472794869621, "learning_rate": 3.628870370401895e-05, "loss": 2.7699, "num_input_tokens_seen": 3790784, "step": 55 }, { "epoch": 3.4375, "loss": 2.774611234664917, "loss_ce": 2.682814359664917, "loss_xval": 0.091796875, "num_input_tokens_seen": 3790784, "step": 55 }, { "epoch": 3.5, "grad_norm": 69.64980837437214, "learning_rate": 3.6451871620723636e-05, "loss": 2.6313, "num_input_tokens_seen": 3849792, "step": 56 }, { "epoch": 3.5, "loss": 2.609570026397705, "loss_ce": 2.552441120147705, "loss_xval": 0.05712890625, "num_input_tokens_seen": 3849792, "step": 56 }, { "epoch": 3.5625, "grad_norm": 27.048681544392675, "learning_rate": 3.661215145755216e-05, "loss": 2.5762, "num_input_tokens_seen": 3921536, "step": 57 }, { "epoch": 3.5625, "loss": 2.5791242122650146, "loss_ce": 2.5366437435150146, "loss_xval": 0.04248046875, "num_input_tokens_seen": 3921536, "step": 57 }, { "epoch": 3.625, "grad_norm": 93.63238587072217, "learning_rate": 3.6769643677189227e-05, "loss": 2.5727, "num_input_tokens_seen": 3993280, "step": 58 }, { "epoch": 3.625, "loss": 2.563948154449463, "loss_ce": 2.504377841949463, "loss_xval": 0.0595703125, "num_input_tokens_seen": 3993280, "step": 58 }, { "epoch": 3.6875, "grad_norm": 126.16773744523488, "learning_rate": 3.692444358987175e-05, "loss": 2.5464, "num_input_tokens_seen": 4065024, "step": 59 }, { "epoch": 3.6875, "loss": 2.493313789367676, "loss_ce": 2.430325508117676, "loss_xval": 0.06298828125, "num_input_tokens_seen": 4065024, "step": 59 }, { "epoch": 3.75, "grad_norm": 117.97609678952398, "learning_rate": 3.707664169981407e-05, "loss": 2.4898, "num_input_tokens_seen": 4136768, "step": 60 }, { "epoch": 3.75, "loss": 2.492903232574463, "loss_ce": 2.384016513824463, "loss_xval": 0.10888671875, "num_input_tokens_seen": 4136768, "step": 60 }, { "epoch": 3.8125, "grad_norm": 33.00295945775187, "learning_rate": 3.7226324022999023e-05, "loss": 2.335, "num_input_tokens_seen": 4208512, "step": 61 }, { "epoch": 3.8125, "loss": 2.3376686573028564, "loss_ce": 2.2946999073028564, "loss_xval": 0.04296875, "num_input_tokens_seen": 4208512, "step": 61 }, { "epoch": 3.875, "grad_norm": 117.30328549034672, "learning_rate": 3.737357237912862e-05, "loss": 2.401, "num_input_tokens_seen": 4280192, "step": 62 }, { "epoch": 3.875, "loss": 2.417743682861328, "loss_ce": 2.324970245361328, "loss_xval": 0.0927734375, "num_input_tokens_seen": 4280192, "step": 62 }, { "epoch": 3.9375, "grad_norm": 111.69349851089615, "learning_rate": 3.751846466021567e-05, "loss": 2.2598, "num_input_tokens_seen": 4351872, "step": 63 }, { "epoch": 3.9375, "loss": 2.269806385040283, "loss_ce": 2.207306385040283, "loss_xval": 0.0625, "num_input_tokens_seen": 4351872, "step": 63 }, { "epoch": 4.0, "grad_norm": 28.60956298923437, "learning_rate": 3.766107507802368e-05, "loss": 2.2539, "num_input_tokens_seen": 4423424, "step": 64 }, { "epoch": 4.0, "loss": 2.2942304611206055, "loss_ce": 2.2344160079956055, "loss_xval": 0.059814453125, "num_input_tokens_seen": 4423424, "step": 64 }, { "epoch": 4.0625, "grad_norm": 18.836018192038598, "learning_rate": 3.7801474392322984e-05, "loss": 2.1486, "num_input_tokens_seen": 4495040, "step": 65 }, { "epoch": 4.0625, "loss": 2.1568853855133057, "loss_ce": 2.1168463230133057, "loss_xval": 0.0400390625, "num_input_tokens_seen": 4495040, "step": 65 }, { "epoch": 4.125, "grad_norm": 56.66330982290737, "learning_rate": 3.7939730121720594e-05, "loss": 2.1409, "num_input_tokens_seen": 4566720, "step": 66 }, { "epoch": 4.125, "loss": 2.1396796703338623, "loss_ce": 2.1030585765838623, "loss_xval": 0.03662109375, "num_input_tokens_seen": 4566720, "step": 66 }, { "epoch": 4.1875, "grad_norm": 70.88389886337272, "learning_rate": 3.807590673863634e-05, "loss": 2.0626, "num_input_tokens_seen": 4625792, "step": 67 }, { "epoch": 4.1875, "loss": 2.050058364868164, "loss_ce": 2.017343521118164, "loss_xval": 0.03271484375, "num_input_tokens_seen": 4625792, "step": 67 }, { "epoch": 4.25, "grad_norm": 55.513793058827545, "learning_rate": 3.8210065849834735e-05, "loss": 2.0257, "num_input_tokens_seen": 4684800, "step": 68 }, { "epoch": 4.25, "loss": 2.0458872318267822, "loss_ce": 2.0043833255767822, "loss_xval": 0.04150390625, "num_input_tokens_seen": 4684800, "step": 68 }, { "epoch": 4.3125, "grad_norm": 28.58848301667242, "learning_rate": 3.834226636377775e-05, "loss": 2.0008, "num_input_tokens_seen": 4756608, "step": 69 }, { "epoch": 4.3125, "loss": 2.0231857299804688, "loss_ce": 1.9741133451461792, "loss_xval": 0.049072265625, "num_input_tokens_seen": 4756608, "step": 69 }, { "epoch": 4.375, "grad_norm": 31.1682609533774, "learning_rate": 3.847256464593665e-05, "loss": 2.0347, "num_input_tokens_seen": 4828352, "step": 70 }, { "epoch": 4.375, "loss": 1.989190936088562, "loss_ce": 1.929376482963562, "loss_xval": 0.059814453125, "num_input_tokens_seen": 4828352, "step": 70 }, { "epoch": 4.4375, "grad_norm": 122.59637628732717, "learning_rate": 3.860101466308762e-05, "loss": 1.94, "num_input_tokens_seen": 4900160, "step": 71 }, { "epoch": 4.4375, "loss": 1.920021414756775, "loss_ce": 1.859474539756775, "loss_xval": 0.060546875, "num_input_tokens_seen": 4900160, "step": 71 }, { "epoch": 4.5, "grad_norm": 84.61405063476768, "learning_rate": 3.872766811751572e-05, "loss": 1.8873, "num_input_tokens_seen": 4971840, "step": 72 }, { "epoch": 4.5, "loss": 1.9320684671401978, "loss_ce": 1.8661504983901978, "loss_xval": 0.06591796875, "num_input_tokens_seen": 4971840, "step": 72 }, { "epoch": 4.5625, "grad_norm": 31.840479159637216, "learning_rate": 3.8852574571962525e-05, "loss": 1.8108, "num_input_tokens_seen": 5043584, "step": 73 }, { "epoch": 4.5625, "loss": 1.7685670852661133, "loss_ce": 1.7431764602661133, "loss_xval": 0.025390625, "num_input_tokens_seen": 5043584, "step": 73 }, { "epoch": 4.625, "grad_norm": 25.983925310667896, "learning_rate": 3.897578156607312e-05, "loss": 1.8146, "num_input_tokens_seen": 5102720, "step": 74 }, { "epoch": 4.625, "loss": 1.8115485906600952, "loss_ce": 1.7851814031600952, "loss_xval": 0.0263671875, "num_input_tokens_seen": 5102720, "step": 74 }, { "epoch": 4.6875, "grad_norm": 74.29589067086889, "learning_rate": 3.909733472502708e-05, "loss": 1.7631, "num_input_tokens_seen": 5174336, "step": 75 }, { "epoch": 4.6875, "loss": 1.7935320138931274, "loss_ce": 1.7349382638931274, "loss_xval": 0.05859375, "num_input_tokens_seen": 5174336, "step": 75 }, { "epoch": 4.75, "grad_norm": 88.02846317965282, "learning_rate": 3.921727786097478e-05, "loss": 1.7123, "num_input_tokens_seen": 5246016, "step": 76 }, { "epoch": 4.75, "loss": 1.746011734008789, "loss_ce": 1.680093765258789, "loss_xval": 0.06591796875, "num_input_tokens_seen": 5246016, "step": 76 }, { "epoch": 4.8125, "grad_norm": 16.541784065292656, "learning_rate": 3.933565306784317e-05, "loss": 1.6954, "num_input_tokens_seen": 5317568, "step": 77 }, { "epoch": 4.8125, "loss": 1.6550248861312866, "loss_ce": 1.6351274251937866, "loss_xval": 0.0198974609375, "num_input_tokens_seen": 5317568, "step": 77 }, { "epoch": 4.875, "grad_norm": 47.77120385942182, "learning_rate": 3.945250081002463e-05, "loss": 1.6259, "num_input_tokens_seen": 5389248, "step": 78 }, { "epoch": 4.875, "loss": 1.6267539262771606, "loss_ce": 1.5662070512771606, "loss_xval": 0.060546875, "num_input_tokens_seen": 5389248, "step": 78 }, { "epoch": 4.9375, "grad_norm": 102.39096093646586, "learning_rate": 3.9567860005416364e-05, "loss": 1.5948, "num_input_tokens_seen": 5460864, "step": 79 }, { "epoch": 4.9375, "loss": 1.622156023979187, "loss_ce": 1.566003680229187, "loss_xval": 0.05615234375, "num_input_tokens_seen": 5460864, "step": 79 }, { "epoch": 5.0, "grad_norm": 58.587913611747304, "learning_rate": 3.96817681032367e-05, "loss": 1.5486, "num_input_tokens_seen": 5532544, "step": 80 }, { "epoch": 5.0, "loss": 1.5358805656433105, "loss_ce": 1.5047526359558105, "loss_xval": 0.0311279296875, "num_input_tokens_seen": 5532544, "step": 80 }, { "epoch": 5.0625, "grad_norm": 48.16275511376751, "learning_rate": 3.9794261157007754e-05, "loss": 1.5092, "num_input_tokens_seen": 5604096, "step": 81 }, { "epoch": 5.0625, "loss": 1.4976553916931152, "loss_ce": 1.4751944541931152, "loss_xval": 0.0224609375, "num_input_tokens_seen": 5604096, "step": 81 }, { "epoch": 5.125, "grad_norm": 82.27402561349479, "learning_rate": 3.990537389306027e-05, "loss": 1.4799, "num_input_tokens_seen": 5663168, "step": 82 }, { "epoch": 5.125, "loss": 1.4801445007324219, "loss_ce": 1.4425468444824219, "loss_xval": 0.03759765625, "num_input_tokens_seen": 5663168, "step": 82 }, { "epoch": 5.1875, "grad_norm": 66.20344925699517, "learning_rate": 4.001513977488632e-05, "loss": 1.4557, "num_input_tokens_seen": 5722240, "step": 83 }, { "epoch": 5.1875, "loss": 1.4894322156906128, "loss_ce": 1.4515904188156128, "loss_xval": 0.037841796875, "num_input_tokens_seen": 5722240, "step": 83 }, { "epoch": 5.25, "grad_norm": 26.081096904088774, "learning_rate": 4.012359106363829e-05, "loss": 1.3982, "num_input_tokens_seen": 5793984, "step": 84 }, { "epoch": 5.25, "loss": 1.409085750579834, "loss_ce": 1.382108211517334, "loss_xval": 0.0269775390625, "num_input_tokens_seen": 5793984, "step": 84 }, { "epoch": 5.3125, "grad_norm": 82.13112661521336, "learning_rate": 4.023075887504775e-05, "loss": 1.3883, "num_input_tokens_seen": 5865600, "step": 85 }, { "epoch": 5.3125, "loss": 1.3842965364456177, "loss_ce": 1.3257027864456177, "loss_xval": 0.05859375, "num_input_tokens_seen": 5865600, "step": 85 }, { "epoch": 5.375, "grad_norm": 58.38568800603289, "learning_rate": 4.033667323301552e-05, "loss": 1.358, "num_input_tokens_seen": 5937152, "step": 86 }, { "epoch": 5.375, "loss": 1.357524037361145, "loss_ce": 1.327738881111145, "loss_xval": 0.02978515625, "num_input_tokens_seen": 5937152, "step": 86 }, { "epoch": 5.4375, "grad_norm": 29.264244807039464, "learning_rate": 4.0441363120103886e-05, "loss": 1.295, "num_input_tokens_seen": 6008960, "step": 87 }, { "epoch": 5.4375, "loss": 1.2950177192687988, "loss_ce": 1.2564435005187988, "loss_xval": 0.03857421875, "num_input_tokens_seen": 6008960, "step": 87 }, { "epoch": 5.5, "grad_norm": 58.26095726094696, "learning_rate": 4.054485652514322e-05, "loss": 1.2911, "num_input_tokens_seen": 6080640, "step": 88 }, { "epoch": 5.5, "loss": 1.3091117143630981, "loss_ce": 1.2700492143630981, "loss_xval": 0.0390625, "num_input_tokens_seen": 6080640, "step": 88 }, { "epoch": 5.5625, "grad_norm": 52.30535783546792, "learning_rate": 4.0647180488148894e-05, "loss": 1.2402, "num_input_tokens_seen": 6152192, "step": 89 }, { "epoch": 5.5625, "loss": 1.2014875411987305, "loss_ce": 1.1710920333862305, "loss_xval": 0.0303955078125, "num_input_tokens_seen": 6152192, "step": 89 }, { "epoch": 5.625, "grad_norm": 33.50300709768344, "learning_rate": 4.074836114272873e-05, "loss": 1.2258, "num_input_tokens_seen": 6223872, "step": 90 }, { "epoch": 5.625, "loss": 1.2106300592422485, "loss_ce": 1.1835304498672485, "loss_xval": 0.027099609375, "num_input_tokens_seen": 6223872, "step": 90 }, { "epoch": 5.6875, "grad_norm": 42.07708626662781, "learning_rate": 4.08484237561472e-05, "loss": 1.1905, "num_input_tokens_seen": 6295552, "step": 91 }, { "epoch": 5.6875, "loss": 1.1875571012496948, "loss_ce": 1.1519125699996948, "loss_xval": 0.03564453125, "num_input_tokens_seen": 6295552, "step": 91 }, { "epoch": 5.75, "grad_norm": 26.631374919077388, "learning_rate": 4.094739276720037e-05, "loss": 1.1531, "num_input_tokens_seen": 6367232, "step": 92 }, { "epoch": 5.75, "loss": 1.143760323524475, "loss_ce": 1.108848214149475, "loss_xval": 0.034912109375, "num_input_tokens_seen": 6367232, "step": 92 }, { "epoch": 5.8125, "grad_norm": 11.203893962892462, "learning_rate": 4.104529182204328e-05, "loss": 1.1346, "num_input_tokens_seen": 6438848, "step": 93 }, { "epoch": 5.8125, "loss": 1.1466225385665894, "loss_ce": 1.1186684370040894, "loss_xval": 0.0279541015625, "num_input_tokens_seen": 6438848, "step": 93 }, { "epoch": 5.875, "grad_norm": 16.897054232717412, "learning_rate": 4.114214380810143e-05, "loss": 1.1241, "num_input_tokens_seen": 6510528, "step": 94 }, { "epoch": 5.875, "loss": 1.109265923500061, "loss_ce": 1.076795220375061, "loss_xval": 0.032470703125, "num_input_tokens_seen": 6510528, "step": 94 }, { "epoch": 5.9375, "grad_norm": 20.24250335227694, "learning_rate": 4.1237970886187796e-05, "loss": 1.0701, "num_input_tokens_seen": 6582144, "step": 95 }, { "epoch": 5.9375, "loss": 1.067662000656128, "loss_ce": 1.053135633468628, "loss_xval": 0.0145263671875, "num_input_tokens_seen": 6582144, "step": 95 }, { "epoch": 6.0, "grad_norm": 45.49070360548281, "learning_rate": 4.1332794520938336e-05, "loss": 1.0772, "num_input_tokens_seen": 6653696, "step": 96 }, { "epoch": 6.0, "loss": 1.0987242460250854, "loss_ce": 1.0538023710250854, "loss_xval": 0.044921875, "num_input_tokens_seen": 6653696, "step": 96 }, { "epoch": 6.0625, "grad_norm": 35.75220054268847, "learning_rate": 4.1426635509670346e-05, "loss": 1.0282, "num_input_tokens_seen": 6725312, "step": 97 }, { "epoch": 6.0625, "loss": 1.017682433128357, "loss_ce": 0.9949773550033569, "loss_xval": 0.022705078125, "num_input_tokens_seen": 6725312, "step": 97 }, { "epoch": 6.125, "grad_norm": 47.313168802314216, "learning_rate": 4.151951400976087e-05, "loss": 1.0307, "num_input_tokens_seen": 6796864, "step": 98 }, { "epoch": 6.125, "loss": 1.0477159023284912, "loss_ce": 0.990831196308136, "loss_xval": 0.056884765625, "num_input_tokens_seen": 6796864, "step": 98 }, { "epoch": 6.1875, "grad_norm": 76.72953585821251, "learning_rate": 4.1611449564635246e-05, "loss": 1.0313, "num_input_tokens_seen": 6855936, "step": 99 }, { "epoch": 6.1875, "loss": 1.008699893951416, "loss_ce": 0.965242862701416, "loss_xval": 0.04345703125, "num_input_tokens_seen": 6855936, "step": 99 }, { "epoch": 6.25, "grad_norm": 17.94697157489963, "learning_rate": 4.1702461128449717e-05, "loss": 0.9989, "num_input_tokens_seen": 6915136, "step": 100 }, { "epoch": 6.25, "loss": 0.9928085803985596, "loss_ce": 0.9674179553985596, "loss_xval": 0.025390625, "num_input_tokens_seen": 6915136, "step": 100 }, { "epoch": 6.3125, "grad_norm": 33.2144512243145, "learning_rate": 4.179256708954579e-05, "loss": 0.9649, "num_input_tokens_seen": 6986752, "step": 101 }, { "epoch": 6.3125, "loss": 0.9760175943374634, "loss_ce": 0.9579511880874634, "loss_xval": 0.01806640625, "num_input_tokens_seen": 6986752, "step": 101 }, { "epoch": 6.375, "grad_norm": 83.07224534219694, "learning_rate": 4.188178529274939e-05, "loss": 0.9689, "num_input_tokens_seen": 7058496, "step": 102 }, { "epoch": 6.375, "loss": 0.9563613533973694, "loss_ce": 0.9209609627723694, "loss_xval": 0.035400390625, "num_input_tokens_seen": 7058496, "step": 102 }, { "epoch": 6.4375, "grad_norm": 11.471453835567821, "learning_rate": 4.197013306058202e-05, "loss": 0.9529, "num_input_tokens_seen": 7130048, "step": 103 }, { "epoch": 6.4375, "loss": 0.9473816752433777, "loss_ce": 0.9284607768058777, "loss_xval": 0.0189208984375, "num_input_tokens_seen": 7130048, "step": 103 }, { "epoch": 6.5, "grad_norm": 20.514656398784428, "learning_rate": 4.205762721344725e-05, "loss": 0.9252, "num_input_tokens_seen": 7201664, "step": 104 }, { "epoch": 6.5, "loss": 0.9248759746551514, "loss_ce": 0.9055888652801514, "loss_xval": 0.019287109375, "num_input_tokens_seen": 7201664, "step": 104 }, { "epoch": 6.5625, "grad_norm": 54.333103205036785, "learning_rate": 4.21442840888513e-05, "loss": 0.9309, "num_input_tokens_seen": 7273344, "step": 105 }, { "epoch": 6.5625, "loss": 0.9247029423713684, "loss_ce": 0.8912556767463684, "loss_xval": 0.033447265625, "num_input_tokens_seen": 7273344, "step": 105 }, { "epoch": 6.625, "grad_norm": 42.01503117611703, "learning_rate": 4.223011955971264e-05, "loss": 0.8987, "num_input_tokens_seen": 7345152, "step": 106 }, { "epoch": 6.625, "loss": 0.8918097019195557, "loss_ce": 0.8678839206695557, "loss_xval": 0.02392578125, "num_input_tokens_seen": 7345152, "step": 106 }, { "epoch": 6.6875, "grad_norm": 64.62949480171359, "learning_rate": 4.231514905181194e-05, "loss": 0.8985, "num_input_tokens_seen": 7416832, "step": 107 }, { "epoch": 6.6875, "loss": 0.8956446051597595, "loss_ce": 0.8673242926597595, "loss_xval": 0.0283203125, "num_input_tokens_seen": 7416832, "step": 107 }, { "epoch": 6.75, "grad_norm": 54.310221908362855, "learning_rate": 4.2399387560430375e-05, "loss": 0.8738, "num_input_tokens_seen": 7488576, "step": 108 }, { "epoch": 6.75, "loss": 0.8796428442001343, "loss_ce": 0.8507121801376343, "loss_xval": 0.0289306640625, "num_input_tokens_seen": 7488576, "step": 108 }, { "epoch": 6.8125, "grad_norm": 15.868960973624445, "learning_rate": 4.2482849666221134e-05, "loss": 0.8873, "num_input_tokens_seen": 7560128, "step": 109 }, { "epoch": 6.8125, "loss": 0.8466143012046814, "loss_ce": 0.8327593207359314, "loss_xval": 0.01385498046875, "num_input_tokens_seen": 7560128, "step": 109 }, { "epoch": 6.875, "grad_norm": 49.92541947914144, "learning_rate": 4.2565549550356234e-05, "loss": 0.8584, "num_input_tokens_seen": 7631808, "step": 110 }, { "epoch": 6.875, "loss": 0.840313732624054, "loss_ce": 0.815655529499054, "loss_xval": 0.024658203125, "num_input_tokens_seen": 7631808, "step": 110 }, { "epoch": 6.9375, "grad_norm": 36.27667896033985, "learning_rate": 4.2647501008987776e-05, "loss": 0.8169, "num_input_tokens_seen": 7703488, "step": 111 }, { "epoch": 6.9375, "loss": 0.8194260001182556, "loss_ce": 0.7958664298057556, "loss_xval": 0.0235595703125, "num_input_tokens_seen": 7703488, "step": 111 }, { "epoch": 7.0, "grad_norm": 28.119935449380055, "learning_rate": 4.272871746706091e-05, "loss": 0.8164, "num_input_tokens_seen": 7762496, "step": 112 }, { "epoch": 7.0, "loss": 0.8153378367424011, "loss_ce": 0.7873837351799011, "loss_xval": 0.0279541015625, "num_input_tokens_seen": 7762496, "step": 112 }, { "epoch": 7.0625, "grad_norm": 39.909688575941395, "learning_rate": 4.280921199151268e-05, "loss": 0.8144, "num_input_tokens_seen": 7834240, "step": 113 }, { "epoch": 7.0625, "loss": 0.8192792534828186, "loss_ce": 0.7809491753578186, "loss_xval": 0.038330078125, "num_input_tokens_seen": 7834240, "step": 113 }, { "epoch": 7.125, "grad_norm": 18.717218960055316, "learning_rate": 4.288899730388944e-05, "loss": 0.788, "num_input_tokens_seen": 7905984, "step": 114 }, { "epoch": 7.125, "loss": 0.7716240882873535, "loss_ce": 0.7491631507873535, "loss_xval": 0.0224609375, "num_input_tokens_seen": 7905984, "step": 114 }, { "epoch": 7.1875, "grad_norm": 60.9610528810964, "learning_rate": 4.296808579241338e-05, "loss": 0.7927, "num_input_tokens_seen": 7977600, "step": 115 }, { "epoch": 7.1875, "loss": 0.7663066387176514, "loss_ce": 0.7440898418426514, "loss_xval": 0.022216796875, "num_input_tokens_seen": 7977600, "step": 115 }, { "epoch": 7.25, "grad_norm": 9.439264469153885, "learning_rate": 4.3046489523526506e-05, "loss": 0.7562, "num_input_tokens_seen": 8036672, "step": 116 }, { "epoch": 7.25, "loss": 0.7505704760551453, "loss_ce": 0.7373258471488953, "loss_xval": 0.01324462890625, "num_input_tokens_seen": 8036672, "step": 116 }, { "epoch": 7.3125, "grad_norm": 14.835568869475779, "learning_rate": 4.312422025293929e-05, "loss": 0.7503, "num_input_tokens_seen": 8108352, "step": 117 }, { "epoch": 7.3125, "loss": 0.758650004863739, "loss_ce": 0.728498637676239, "loss_xval": 0.0301513671875, "num_input_tokens_seen": 8108352, "step": 117 }, { "epoch": 7.375, "grad_norm": 36.075858696561966, "learning_rate": 4.320128943620903e-05, "loss": 0.7338, "num_input_tokens_seen": 8180096, "step": 118 }, { "epoch": 7.375, "loss": 0.7231059670448303, "loss_ce": 0.7030864357948303, "loss_xval": 0.02001953125, "num_input_tokens_seen": 8180096, "step": 118 }, { "epoch": 7.4375, "grad_norm": 16.369509919688415, "learning_rate": 4.327770823887197e-05, "loss": 0.711, "num_input_tokens_seen": 8251648, "step": 119 }, { "epoch": 7.4375, "loss": 0.6975811123847961, "loss_ce": 0.6894634366035461, "loss_xval": 0.00811767578125, "num_input_tokens_seen": 8251648, "step": 119 }, { "epoch": 7.5, "grad_norm": 23.42065778482628, "learning_rate": 4.335348754615135e-05, "loss": 0.7175, "num_input_tokens_seen": 8323264, "step": 120 }, { "epoch": 7.5, "loss": 0.7115622758865356, "loss_ce": 0.6920310258865356, "loss_xval": 0.01953125, "num_input_tokens_seen": 8323264, "step": 120 }, { "epoch": 7.5625, "grad_norm": 11.954631803701059, "learning_rate": 4.342863797226275e-05, "loss": 0.6866, "num_input_tokens_seen": 8394880, "step": 121 }, { "epoch": 7.5625, "loss": 0.687512993812561, "loss_ce": 0.666028618812561, "loss_xval": 0.021484375, "num_input_tokens_seen": 8394880, "step": 121 }, { "epoch": 7.625, "grad_norm": 10.161558994787343, "learning_rate": 4.350316986933631e-05, "loss": 0.6903, "num_input_tokens_seen": 8466496, "step": 122 }, { "epoch": 7.625, "loss": 0.6849098801612854, "loss_ce": 0.6639137864112854, "loss_xval": 0.02099609375, "num_input_tokens_seen": 8466496, "step": 122 }, { "epoch": 7.6875, "grad_norm": 41.66894560375791, "learning_rate": 4.357709333597492e-05, "loss": 0.6819, "num_input_tokens_seen": 8538048, "step": 123 }, { "epoch": 7.6875, "loss": 0.6672234535217285, "loss_ce": 0.6472039222717285, "loss_xval": 0.02001953125, "num_input_tokens_seen": 8538048, "step": 123 }, { "epoch": 7.75, "grad_norm": 19.22585236986661, "learning_rate": 4.36504182254659e-05, "loss": 0.6671, "num_input_tokens_seen": 8609728, "step": 124 }, { "epoch": 7.75, "loss": 0.6756678819656372, "loss_ce": 0.6427088975906372, "loss_xval": 0.032958984375, "num_input_tokens_seen": 8609728, "step": 124 }, { "epoch": 7.8125, "grad_norm": 13.173396588019653, "learning_rate": 4.372315415366273e-05, "loss": 0.6541, "num_input_tokens_seen": 8668800, "step": 125 }, { "epoch": 7.8125, "loss": 0.6586058139801025, "loss_ce": 0.6227171421051025, "loss_xval": 0.035888671875, "num_input_tokens_seen": 8668800, "step": 125 }, { "epoch": 7.875, "grad_norm": 73.83611704448327, "learning_rate": 4.379531050655295e-05, "loss": 0.6407, "num_input_tokens_seen": 8740480, "step": 126 }, { "epoch": 7.875, "loss": 0.6286131739616394, "loss_ce": 0.6065184473991394, "loss_xval": 0.0220947265625, "num_input_tokens_seen": 8740480, "step": 126 }, { "epoch": 7.9375, "grad_norm": 20.34502886757861, "learning_rate": 4.386689644752683e-05, "loss": 0.6079, "num_input_tokens_seen": 8812032, "step": 127 }, { "epoch": 7.9375, "loss": 0.6007564067840576, "loss_ce": 0.5874507427215576, "loss_xval": 0.0133056640625, "num_input_tokens_seen": 8812032, "step": 127 }, { "epoch": 8.0, "grad_norm": 53.69771774292302, "learning_rate": 4.3937920924360956e-05, "loss": 0.6143, "num_input_tokens_seen": 8871232, "step": 128 }, { "epoch": 8.0, "loss": 0.6093257665634155, "loss_ce": 0.5786861181259155, "loss_xval": 0.0306396484375, "num_input_tokens_seen": 8871232, "step": 128 }, { "epoch": 8.0625, "grad_norm": 39.90429875950801, "learning_rate": 4.400839267593018e-05, "loss": 0.5816, "num_input_tokens_seen": 8930432, "step": 129 }, { "epoch": 8.0625, "loss": 0.5776582360267639, "loss_ce": 0.5531221032142639, "loss_xval": 0.0245361328125, "num_input_tokens_seen": 8930432, "step": 129 }, { "epoch": 8.125, "grad_norm": 42.8106886505879, "learning_rate": 4.407832023866026e-05, "loss": 0.5801, "num_input_tokens_seen": 9002176, "step": 130 }, { "epoch": 8.125, "loss": 0.564851701259613, "loss_ce": 0.541170060634613, "loss_xval": 0.023681640625, "num_input_tokens_seen": 9002176, "step": 130 }, { "epoch": 8.1875, "grad_norm": 74.80583526977246, "learning_rate": 4.414771195273343e-05, "loss": 0.5773, "num_input_tokens_seen": 9073856, "step": 131 }, { "epoch": 8.1875, "loss": 0.5918972492218018, "loss_ce": 0.5333034992218018, "loss_xval": 0.05859375, "num_input_tokens_seen": 9073856, "step": 131 }, { "epoch": 8.25, "grad_norm": 36.13689427412808, "learning_rate": 4.421657596805787e-05, "loss": 0.5285, "num_input_tokens_seen": 9145536, "step": 132 }, { "epoch": 8.25, "loss": 0.5381279587745667, "loss_ce": 0.5160332322120667, "loss_xval": 0.0220947265625, "num_input_tokens_seen": 9145536, "step": 132 }, { "epoch": 8.3125, "grad_norm": 36.18830775008529, "learning_rate": 4.4284920250012015e-05, "loss": 0.5293, "num_input_tokens_seen": 9217088, "step": 133 }, { "epoch": 8.3125, "loss": 0.5235571265220642, "loss_ce": 0.5017065405845642, "loss_xval": 0.0218505859375, "num_input_tokens_seen": 9217088, "step": 133 }, { "epoch": 8.375, "grad_norm": 29.863124074710544, "learning_rate": 4.435275258497362e-05, "loss": 0.5279, "num_input_tokens_seen": 9288704, "step": 134 }, { "epoch": 8.375, "loss": 0.5201796293258667, "loss_ce": 0.5000380277633667, "loss_xval": 0.0201416015625, "num_input_tokens_seen": 9288704, "step": 134 }, { "epoch": 8.4375, "grad_norm": 26.175864599633144, "learning_rate": 4.4420080585643395e-05, "loss": 0.5015, "num_input_tokens_seen": 9347904, "step": 135 }, { "epoch": 8.4375, "loss": 0.48246899247169495, "loss_ce": 0.46836987137794495, "loss_xval": 0.01409912109375, "num_input_tokens_seen": 9347904, "step": 135 }, { "epoch": 8.5, "grad_norm": 41.789246513263144, "learning_rate": 4.4486911696172015e-05, "loss": 0.4809, "num_input_tokens_seen": 9419648, "step": 136 }, { "epoch": 8.5, "loss": 0.48507535457611084, "loss_ce": 0.46285855770111084, "loss_xval": 0.022216796875, "num_input_tokens_seen": 9419648, "step": 136 }, { "epoch": 8.5625, "grad_norm": 12.475484859727866, "learning_rate": 4.4553253197099536e-05, "loss": 0.4717, "num_input_tokens_seen": 9491200, "step": 137 }, { "epoch": 8.5625, "loss": 0.4641610085964203, "loss_ce": 0.4516488015651703, "loss_xval": 0.01251220703125, "num_input_tokens_seen": 9491200, "step": 137 }, { "epoch": 8.625, "grad_norm": 57.1829240329704, "learning_rate": 4.461911221011503e-05, "loss": 0.4917, "num_input_tokens_seen": 9550400, "step": 138 }, { "epoch": 8.625, "loss": 0.500900149345398, "loss_ce": 0.45500174164772034, "loss_xval": 0.0458984375, "num_input_tokens_seen": 9550400, "step": 138 }, { "epoch": 8.6875, "grad_norm": 15.812633321792019, "learning_rate": 4.4684495702644406e-05, "loss": 0.4351, "num_input_tokens_seen": 9622080, "step": 139 }, { "epoch": 8.6875, "loss": 0.4303787052631378, "loss_ce": 0.4198806583881378, "loss_xval": 0.010498046875, "num_input_tokens_seen": 9622080, "step": 139 }, { "epoch": 8.75, "grad_norm": 33.83518875566213, "learning_rate": 4.474941049227392e-05, "loss": 0.4621, "num_input_tokens_seen": 9693760, "step": 140 }, { "epoch": 8.75, "loss": 0.45098790526390076, "loss_ce": 0.42376622557640076, "loss_xval": 0.0272216796875, "num_input_tokens_seen": 9693760, "step": 140 }, { "epoch": 8.8125, "grad_norm": 40.96321239486865, "learning_rate": 4.481386325101608e-05, "loss": 0.4164, "num_input_tokens_seen": 9765440, "step": 141 }, { "epoch": 8.8125, "loss": 0.41145414113998413, "loss_ce": 0.38545316457748413, "loss_xval": 0.0260009765625, "num_input_tokens_seen": 9765440, "step": 141 }, { "epoch": 8.875, "grad_norm": 37.39520904229126, "learning_rate": 4.48778605094249e-05, "loss": 0.3979, "num_input_tokens_seen": 9837184, "step": 142 }, { "epoch": 8.875, "loss": 0.4080401062965393, "loss_ce": 0.3820391297340393, "loss_xval": 0.0260009765625, "num_input_tokens_seen": 9837184, "step": 142 }, { "epoch": 8.9375, "grad_norm": 34.52606007669613, "learning_rate": 4.494140866056678e-05, "loss": 0.3795, "num_input_tokens_seen": 9908864, "step": 143 }, { "epoch": 8.9375, "loss": 0.3671216368675232, "loss_ce": 0.3521069884300232, "loss_xval": 0.0150146484375, "num_input_tokens_seen": 9908864, "step": 143 }, { "epoch": 9.0, "grad_norm": 30.32849867940823, "learning_rate": 4.5004513963852995e-05, "loss": 0.373, "num_input_tokens_seen": 9967936, "step": 144 }, { "epoch": 9.0, "loss": 0.37388211488723755, "loss_ce": 0.34702664613723755, "loss_xval": 0.02685546875, "num_input_tokens_seen": 9967936, "step": 144 }, { "epoch": 9.0625, "grad_norm": 33.853748342915004, "learning_rate": 4.5067182548739526e-05, "loss": 0.3726, "num_input_tokens_seen": 10039552, "step": 145 }, { "epoch": 9.0625, "loss": 0.370542973279953, "loss_ce": 0.351500004529953, "loss_xval": 0.01904296875, "num_input_tokens_seen": 10039552, "step": 145 }, { "epoch": 9.125, "grad_norm": 38.626009081815425, "learning_rate": 4.5129420418299804e-05, "loss": 0.3515, "num_input_tokens_seen": 10111168, "step": 146 }, { "epoch": 9.125, "loss": 0.3514055907726288, "loss_ce": 0.3271135985851288, "loss_xval": 0.0242919921875, "num_input_tokens_seen": 10111168, "step": 146 }, { "epoch": 9.1875, "grad_norm": 28.328723972964454, "learning_rate": 4.519123345267552e-05, "loss": 0.3439, "num_input_tokens_seen": 10182848, "step": 147 }, { "epoch": 9.1875, "loss": 0.3309248387813568, "loss_ce": 0.3159712255001068, "loss_xval": 0.01495361328125, "num_input_tokens_seen": 10182848, "step": 147 }, { "epoch": 9.25, "grad_norm": 35.40827826279614, "learning_rate": 4.5252627412410396e-05, "loss": 0.3319, "num_input_tokens_seen": 10241856, "step": 148 }, { "epoch": 9.25, "loss": 0.33163246512413025, "loss_ce": 0.31014809012413025, "loss_xval": 0.021484375, "num_input_tokens_seen": 10241856, "step": 148 }, { "epoch": 9.3125, "grad_norm": 30.87887626218836, "learning_rate": 4.531360794167177e-05, "loss": 0.3193, "num_input_tokens_seen": 10313664, "step": 149 }, { "epoch": 9.3125, "loss": 0.33782634139060974, "loss_ce": 0.30852946639060974, "loss_xval": 0.029296875, "num_input_tokens_seen": 10313664, "step": 149 }, { "epoch": 9.375, "grad_norm": 34.69416994013918, "learning_rate": 4.537418057136437e-05, "loss": 0.3064, "num_input_tokens_seen": 10385408, "step": 150 }, { "epoch": 9.375, "loss": 0.30474743247032166, "loss_ce": 0.29302868247032166, "loss_xval": 0.01171875, "num_input_tokens_seen": 10385408, "step": 150 }, { "epoch": 9.4375, "grad_norm": 16.01660241849197, "learning_rate": 4.543435072214071e-05, "loss": 0.2817, "num_input_tokens_seen": 10456960, "step": 151 }, { "epoch": 9.4375, "loss": 0.2802174985408783, "loss_ce": 0.2626393735408783, "loss_xval": 0.017578125, "num_input_tokens_seen": 10456960, "step": 151 }, { "epoch": 9.5, "grad_norm": 12.08386534630288, "learning_rate": 4.549412370731206e-05, "loss": 0.2964, "num_input_tokens_seen": 10528576, "step": 152 }, { "epoch": 9.5, "loss": 0.2941746115684509, "loss_ce": 0.2757419943809509, "loss_xval": 0.0184326171875, "num_input_tokens_seen": 10528576, "step": 152 }, { "epoch": 9.5625, "grad_norm": 46.64162266885969, "learning_rate": 4.555350473566404e-05, "loss": 0.2922, "num_input_tokens_seen": 10600192, "step": 153 }, { "epoch": 9.5625, "loss": 0.29297852516174316, "loss_ce": 0.26148438453674316, "loss_xval": 0.031494140625, "num_input_tokens_seen": 10600192, "step": 153 }, { "epoch": 9.625, "grad_norm": 18.386857979962915, "learning_rate": 4.561249891418045e-05, "loss": 0.2874, "num_input_tokens_seen": 10671872, "step": 154 }, { "epoch": 9.625, "loss": 0.2867387533187866, "loss_ce": 0.2618364095687866, "loss_xval": 0.02490234375, "num_input_tokens_seen": 10671872, "step": 154 }, { "epoch": 9.6875, "grad_norm": 95.87776763441651, "learning_rate": 4.5671111250678913e-05, "loss": 0.291, "num_input_tokens_seen": 10743552, "step": 155 }, { "epoch": 9.6875, "loss": 0.28771263360977173, "loss_ce": 0.24157007038593292, "loss_xval": 0.046142578125, "num_input_tokens_seen": 10743552, "step": 155 }, { "epoch": 9.75, "grad_norm": 44.96972476442496, "learning_rate": 4.572934665636191e-05, "loss": 0.2742, "num_input_tokens_seen": 10815296, "step": 156 }, { "epoch": 9.75, "loss": 0.27817845344543457, "loss_ce": 0.24399876594543457, "loss_xval": 0.0341796875, "num_input_tokens_seen": 10815296, "step": 156 }, { "epoch": 9.8125, "grad_norm": 56.099205952139485, "learning_rate": 4.5787209948286147e-05, "loss": 0.2734, "num_input_tokens_seen": 10886976, "step": 157 }, { "epoch": 9.8125, "loss": 0.26927852630615234, "loss_ce": 0.23485469818115234, "loss_xval": 0.034423828125, "num_input_tokens_seen": 10886976, "step": 157 }, { "epoch": 9.875, "grad_norm": 41.24759861660678, "learning_rate": 4.5844705851753643e-05, "loss": 0.2707, "num_input_tokens_seen": 10958720, "step": 158 }, { "epoch": 9.875, "loss": 0.2965546250343323, "loss_ce": 0.23698429763317108, "loss_xval": 0.0595703125, "num_input_tokens_seen": 10958720, "step": 158 }, { "epoch": 9.9375, "grad_norm": 45.405964399299954, "learning_rate": 4.59018390026273e-05, "loss": 0.2736, "num_input_tokens_seen": 11030272, "step": 159 }, { "epoch": 9.9375, "loss": 0.25298789143562317, "loss_ce": 0.22674277424812317, "loss_xval": 0.0262451171875, "num_input_tokens_seen": 11030272, "step": 159 }, { "epoch": 10.0, "grad_norm": 76.86054524094995, "learning_rate": 4.5958613949573976e-05, "loss": 0.2655, "num_input_tokens_seen": 11101952, "step": 160 }, { "epoch": 10.0, "loss": 0.2796672284603119, "loss_ce": 0.2396281659603119, "loss_xval": 0.0400390625, "num_input_tokens_seen": 11101952, "step": 160 }, { "epoch": 10.0625, "grad_norm": 19.932279617630357, "learning_rate": 4.6015035156237594e-05, "loss": 0.2454, "num_input_tokens_seen": 11161024, "step": 161 }, { "epoch": 10.0625, "loss": 0.2661590576171875, "loss_ce": 0.2231903076171875, "loss_xval": 0.04296875, "num_input_tokens_seen": 11161024, "step": 161 }, { "epoch": 10.125, "grad_norm": 135.34396180408, "learning_rate": 4.607110700334503e-05, "loss": 0.2876, "num_input_tokens_seen": 11232704, "step": 162 }, { "epoch": 10.125, "loss": 0.2808167040348053, "loss_ce": 0.2114807665348053, "loss_xval": 0.0693359375, "num_input_tokens_seen": 11232704, "step": 162 }, { "epoch": 10.1875, "grad_norm": 70.37309365651035, "learning_rate": 4.612683379074717e-05, "loss": 0.2476, "num_input_tokens_seen": 11304320, "step": 163 }, { "epoch": 10.1875, "loss": 0.25540295243263245, "loss_ce": 0.21634045243263245, "loss_xval": 0.0390625, "num_input_tokens_seen": 11304320, "step": 163 }, { "epoch": 10.25, "grad_norm": 102.89804491544115, "learning_rate": 4.6182219739397555e-05, "loss": 0.2656, "num_input_tokens_seen": 11376000, "step": 164 }, { "epoch": 10.25, "loss": 0.2687593102455139, "loss_ce": 0.21871048212051392, "loss_xval": 0.050048828125, "num_input_tokens_seen": 11376000, "step": 164 }, { "epoch": 10.3125, "grad_norm": 137.28397466828608, "learning_rate": 4.623726899327088e-05, "loss": 0.2716, "num_input_tokens_seen": 11447616, "step": 165 }, { "epoch": 10.3125, "loss": 0.2575090527534485, "loss_ce": 0.1998918503522873, "loss_xval": 0.0576171875, "num_input_tokens_seen": 11447616, "step": 165 }, { "epoch": 10.375, "grad_norm": 26.430941270572102, "learning_rate": 4.62919856212236e-05, "loss": 0.2221, "num_input_tokens_seen": 11519232, "step": 166 }, { "epoch": 10.375, "loss": 0.21056154370307922, "loss_ce": 0.19273927807807922, "loss_xval": 0.017822265625, "num_input_tokens_seen": 11519232, "step": 166 }, { "epoch": 10.4375, "grad_norm": 130.14886600445317, "learning_rate": 4.6346373618798503e-05, "loss": 0.2633, "num_input_tokens_seen": 11591040, "step": 167 }, { "epoch": 10.4375, "loss": 0.277815043926239, "loss_ce": 0.201643168926239, "loss_xval": 0.076171875, "num_input_tokens_seen": 11591040, "step": 167 }, { "epoch": 10.5, "grad_norm": 15.36888208828194, "learning_rate": 4.640043690997557e-05, "loss": 0.2105, "num_input_tokens_seen": 11650048, "step": 168 }, { "epoch": 10.5, "loss": 0.21321672201156616, "loss_ce": 0.19209855794906616, "loss_xval": 0.0211181640625, "num_input_tokens_seen": 11650048, "step": 168 }, { "epoch": 10.5625, "grad_norm": 129.1002672969776, "learning_rate": 4.6454179348870823e-05, "loss": 0.2569, "num_input_tokens_seen": 11721664, "step": 169 }, { "epoch": 10.5625, "loss": 0.25425177812576294, "loss_ce": 0.19126349687576294, "loss_xval": 0.06298828125, "num_input_tokens_seen": 11721664, "step": 169 }, { "epoch": 10.625, "grad_norm": 113.95606982901108, "learning_rate": 4.650760472138503e-05, "loss": 0.2451, "num_input_tokens_seen": 11793408, "step": 170 }, { "epoch": 10.625, "loss": 0.2487194985151291, "loss_ce": 0.1932995766401291, "loss_xval": 0.055419921875, "num_input_tokens_seen": 11793408, "step": 170 }, { "epoch": 10.6875, "grad_norm": 9.33210610394722, "learning_rate": 4.65607167468041e-05, "loss": 0.2122, "num_input_tokens_seen": 11852480, "step": 171 }, { "epoch": 10.6875, "loss": 0.20928899943828583, "loss_ce": 0.18218939006328583, "loss_xval": 0.027099609375, "num_input_tokens_seen": 11852480, "step": 171 }, { "epoch": 10.75, "grad_norm": 72.11959352369618, "learning_rate": 4.66135190793528e-05, "loss": 0.2087, "num_input_tokens_seen": 11924032, "step": 172 }, { "epoch": 10.75, "loss": 0.2318447381258011, "loss_ce": 0.1837490350008011, "loss_xval": 0.048095703125, "num_input_tokens_seen": 11924032, "step": 172 }, { "epoch": 10.8125, "grad_norm": 47.15140710616073, "learning_rate": 4.666601530970347e-05, "loss": 0.1902, "num_input_tokens_seen": 11995712, "step": 173 }, { "epoch": 10.8125, "loss": 0.1904245764017105, "loss_ce": 0.1722360998392105, "loss_xval": 0.0181884765625, "num_input_tokens_seen": 11995712, "step": 173 }, { "epoch": 10.875, "grad_norm": 10.571062509972561, "learning_rate": 4.6718208966441165e-05, "loss": 0.1882, "num_input_tokens_seen": 12067392, "step": 174 }, { "epoch": 10.875, "loss": 0.1886989027261734, "loss_ce": 0.1755153089761734, "loss_xval": 0.01318359375, "num_input_tokens_seen": 12067392, "step": 174 }, { "epoch": 10.9375, "grad_norm": 46.46707211476998, "learning_rate": 4.677010351748695e-05, "loss": 0.1882, "num_input_tokens_seen": 12139072, "step": 175 }, { "epoch": 10.9375, "loss": 0.1885238140821457, "loss_ce": 0.1647201031446457, "loss_xval": 0.0238037109375, "num_input_tokens_seen": 12139072, "step": 175 }, { "epoch": 11.0, "grad_norm": 42.51412908712922, "learning_rate": 4.682170237148049e-05, "loss": 0.1922, "num_input_tokens_seen": 12210624, "step": 176 }, { "epoch": 11.0, "loss": 0.1938706487417221, "loss_ce": 0.1682358831167221, "loss_xval": 0.025634765625, "num_input_tokens_seen": 12210624, "step": 176 }, { "epoch": 11.0625, "grad_norm": 20.645324063063473, "learning_rate": 4.6873008879123683e-05, "loss": 0.1954, "num_input_tokens_seen": 12269696, "step": 177 }, { "epoch": 11.0625, "loss": 0.1937786191701889, "loss_ce": 0.1749797910451889, "loss_xval": 0.018798828125, "num_input_tokens_seen": 12269696, "step": 177 }, { "epoch": 11.125, "grad_norm": 75.87090591068548, "learning_rate": 4.692402633448617e-05, "loss": 0.1903, "num_input_tokens_seen": 12341376, "step": 178 }, { "epoch": 11.125, "loss": 0.1830907016992569, "loss_ce": 0.1630711704492569, "loss_xval": 0.02001953125, "num_input_tokens_seen": 12341376, "step": 178 }, { "epoch": 11.1875, "grad_norm": 39.05028784592837, "learning_rate": 4.6974757976274554e-05, "loss": 0.1839, "num_input_tokens_seen": 12387840, "step": 179 }, { "epoch": 11.1875, "loss": 0.19825224578380585, "loss_ce": 0.17273955047130585, "loss_xval": 0.0255126953125, "num_input_tokens_seen": 12387840, "step": 179 }, { "epoch": 11.25, "grad_norm": 59.302577487431336, "learning_rate": 4.7025206989066015e-05, "loss": 0.1898, "num_input_tokens_seen": 12446912, "step": 180 }, { "epoch": 11.25, "loss": 0.18680866062641144, "loss_ce": 0.16556842625141144, "loss_xval": 0.021240234375, "num_input_tokens_seen": 12446912, "step": 180 }, { "epoch": 11.3125, "grad_norm": 132.3733658251234, "learning_rate": 4.7075376504507956e-05, "loss": 0.2344, "num_input_tokens_seen": 12518528, "step": 181 }, { "epoch": 11.3125, "loss": 0.22184261679649353, "loss_ce": 0.16031917929649353, "loss_xval": 0.0615234375, "num_input_tokens_seen": 12518528, "step": 181 }, { "epoch": 11.375, "grad_norm": 123.63188526561277, "learning_rate": 4.7125269602484475e-05, "loss": 0.2202, "num_input_tokens_seen": 12590272, "step": 182 }, { "epoch": 11.375, "loss": 0.1961844116449356, "loss_ce": 0.1507742553949356, "loss_xval": 0.04541015625, "num_input_tokens_seen": 12590272, "step": 182 }, { "epoch": 11.4375, "grad_norm": 28.488808197631567, "learning_rate": 4.717488931225096e-05, "loss": 0.1636, "num_input_tokens_seen": 12661952, "step": 183 }, { "epoch": 11.4375, "loss": 0.15562567114830017, "loss_ce": 0.14347967505455017, "loss_xval": 0.01214599609375, "num_input_tokens_seen": 12661952, "step": 183 }, { "epoch": 11.5, "grad_norm": 84.56249611227895, "learning_rate": 4.722423861353765e-05, "loss": 0.1893, "num_input_tokens_seen": 12733504, "step": 184 }, { "epoch": 11.5, "loss": 0.19987738132476807, "loss_ce": 0.15349066257476807, "loss_xval": 0.04638671875, "num_input_tokens_seen": 12733504, "step": 184 }, { "epoch": 11.5625, "grad_norm": 152.90061856756552, "learning_rate": 4.727332043762341e-05, "loss": 0.2362, "num_input_tokens_seen": 12805120, "step": 185 }, { "epoch": 11.5625, "loss": 0.22141288220882416, "loss_ce": 0.14963553845882416, "loss_xval": 0.07177734375, "num_input_tokens_seen": 12805120, "step": 185 }, { "epoch": 11.625, "grad_norm": 110.11923740163819, "learning_rate": 4.732213766838056e-05, "loss": 0.2016, "num_input_tokens_seen": 12876800, "step": 186 }, { "epoch": 11.625, "loss": 0.197799414396286, "loss_ce": 0.158004492521286, "loss_xval": 0.039794921875, "num_input_tokens_seen": 12876800, "step": 186 }, { "epoch": 11.6875, "grad_norm": 14.134948821400592, "learning_rate": 4.7370693143291545e-05, "loss": 0.1698, "num_input_tokens_seen": 12936000, "step": 187 }, { "epoch": 11.6875, "loss": 0.17875821888446808, "loss_ce": 0.15483243763446808, "loss_xval": 0.02392578125, "num_input_tokens_seen": 12936000, "step": 187 }, { "epoch": 11.75, "grad_norm": 97.72533976836097, "learning_rate": 4.74189896544387e-05, "loss": 0.1821, "num_input_tokens_seen": 13007680, "step": 188 }, { "epoch": 11.75, "loss": 0.17285184562206268, "loss_ce": 0.14892606437206268, "loss_xval": 0.02392578125, "num_input_tokens_seen": 13007680, "step": 188 }, { "epoch": 11.8125, "grad_norm": 101.84921282322784, "learning_rate": 4.746702994946761e-05, "loss": 0.1832, "num_input_tokens_seen": 13066880, "step": 189 }, { "epoch": 11.8125, "loss": 0.19510142505168915, "loss_ce": 0.13504283130168915, "loss_xval": 0.06005859375, "num_input_tokens_seen": 13066880, "step": 189 }, { "epoch": 11.875, "grad_norm": 24.2688075939221, "learning_rate": 4.7514816732525075e-05, "loss": 0.1571, "num_input_tokens_seen": 13138432, "step": 190 }, { "epoch": 11.875, "loss": 0.158127099275589, "loss_ce": 0.143905907869339, "loss_xval": 0.01422119140625, "num_input_tokens_seen": 13138432, "step": 190 }, { "epoch": 11.9375, "grad_norm": 78.76517648907875, "learning_rate": 4.7562352665172554e-05, "loss": 0.175, "num_input_tokens_seen": 13209984, "step": 191 }, { "epoch": 11.9375, "loss": 0.18069201707839966, "loss_ce": 0.15285998582839966, "loss_xval": 0.02783203125, "num_input_tokens_seen": 13209984, "step": 191 }, { "epoch": 12.0, "grad_norm": 149.9895595333202, "learning_rate": 4.760964036727562e-05, "loss": 0.2194, "num_input_tokens_seen": 13281664, "step": 192 }, { "epoch": 12.0, "loss": 0.2099841833114624, "loss_ce": 0.1377185583114624, "loss_xval": 0.072265625, "num_input_tokens_seen": 13281664, "step": 192 }, { "epoch": 12.0625, "grad_norm": 146.9176107213647, "learning_rate": 4.765668241787041e-05, "loss": 0.2137, "num_input_tokens_seen": 13353216, "step": 193 }, { "epoch": 12.0625, "loss": 0.2261110544204712, "loss_ce": 0.1435915231704712, "loss_xval": 0.08251953125, "num_input_tokens_seen": 13353216, "step": 193 }, { "epoch": 12.125, "grad_norm": 56.898685367206404, "learning_rate": 4.7703481356007625e-05, "loss": 0.1468, "num_input_tokens_seen": 13424832, "step": 194 }, { "epoch": 12.125, "loss": 0.14375604689121246, "loss_ce": 0.12715448439121246, "loss_xval": 0.0166015625, "num_input_tokens_seen": 13424832, "step": 194 }, { "epoch": 12.1875, "grad_norm": 39.22445120701935, "learning_rate": 4.775003968157493e-05, "loss": 0.1552, "num_input_tokens_seen": 13496512, "step": 195 }, { "epoch": 12.1875, "loss": 0.14577074348926544, "loss_ce": 0.13057298958301544, "loss_xval": 0.01519775390625, "num_input_tokens_seen": 13496512, "step": 195 }, { "epoch": 12.25, "grad_norm": 93.17339739911861, "learning_rate": 4.779635985609814e-05, "loss": 0.1683, "num_input_tokens_seen": 13568128, "step": 196 }, { "epoch": 12.25, "loss": 0.16052621603012085, "loss_ce": 0.12854379415512085, "loss_xval": 0.031982421875, "num_input_tokens_seen": 13568128, "step": 196 }, { "epoch": 12.3125, "grad_norm": 102.75877130957683, "learning_rate": 4.7842444303522264e-05, "loss": 0.1737, "num_input_tokens_seen": 13639808, "step": 197 }, { "epoch": 12.3125, "loss": 0.17900289595127106, "loss_ce": 0.13115133345127106, "loss_xval": 0.0478515625, "num_input_tokens_seen": 13639808, "step": 197 }, { "epoch": 12.375, "grad_norm": 84.31936070008948, "learning_rate": 4.7888295410972525e-05, "loss": 0.1567, "num_input_tokens_seen": 13711488, "step": 198 }, { "epoch": 12.375, "loss": 0.15471437573432922, "loss_ce": 0.11906984448432922, "loss_xval": 0.03564453125, "num_input_tokens_seen": 13711488, "step": 198 }, { "epoch": 12.4375, "grad_norm": 71.28851686781569, "learning_rate": 4.793391552949641e-05, "loss": 0.1591, "num_input_tokens_seen": 13770560, "step": 199 }, { "epoch": 12.4375, "loss": 0.15467874705791473, "loss_ce": 0.13002054393291473, "loss_xval": 0.024658203125, "num_input_tokens_seen": 13770560, "step": 199 }, { "epoch": 12.5, "grad_norm": 76.02012107203956, "learning_rate": 4.797930697478699e-05, "loss": 0.1488, "num_input_tokens_seen": 13842176, "step": 200 }, { "epoch": 12.5, "loss": 0.1352391541004181, "loss_ce": 0.1154637560248375, "loss_xval": 0.019775390625, "num_input_tokens_seen": 13842176, "step": 200 }, { "epoch": 12.5625, "grad_norm": 87.7457533680535, "learning_rate": 4.8024472027888286e-05, "loss": 0.1727, "num_input_tokens_seen": 13901248, "step": 201 }, { "epoch": 12.5625, "loss": 0.16884461045265198, "loss_ce": 0.12782898545265198, "loss_xval": 0.041015625, "num_input_tokens_seen": 13901248, "step": 201 }, { "epoch": 12.625, "grad_norm": 92.68577177289325, "learning_rate": 4.806941293588307e-05, "loss": 0.1663, "num_input_tokens_seen": 13972928, "step": 202 }, { "epoch": 12.625, "loss": 0.1651129573583603, "loss_ce": 0.1262945979833603, "loss_xval": 0.038818359375, "num_input_tokens_seen": 13972928, "step": 202 }, { "epoch": 12.6875, "grad_norm": 130.08538896402837, "learning_rate": 4.811413191256374e-05, "loss": 0.1892, "num_input_tokens_seen": 14044480, "step": 203 }, { "epoch": 12.6875, "loss": 0.17750969529151917, "loss_ce": 0.12038079649209976, "loss_xval": 0.05712890625, "num_input_tokens_seen": 14044480, "step": 203 }, { "epoch": 12.75, "grad_norm": 182.930050494289, "learning_rate": 4.815863113908667e-05, "loss": 0.2198, "num_input_tokens_seen": 14116160, "step": 204 }, { "epoch": 12.75, "loss": 0.21291296184062958, "loss_ce": 0.11086218059062958, "loss_xval": 0.10205078125, "num_input_tokens_seen": 14116160, "step": 204 }, { "epoch": 12.8125, "grad_norm": 169.78449355568006, "learning_rate": 4.820291276461056e-05, "loss": 0.2232, "num_input_tokens_seen": 14187840, "step": 205 }, { "epoch": 12.8125, "loss": 0.22619017958641052, "loss_ce": 0.13732299208641052, "loss_xval": 0.0888671875, "num_input_tokens_seen": 14187840, "step": 205 }, { "epoch": 12.875, "grad_norm": 77.39150620613387, "learning_rate": 4.82469789069193e-05, "loss": 0.1645, "num_input_tokens_seen": 14259584, "step": 206 }, { "epoch": 12.875, "loss": 0.16739881038665771, "loss_ce": 0.12980115413665771, "loss_xval": 0.03759765625, "num_input_tokens_seen": 14259584, "step": 206 }, { "epoch": 12.9375, "grad_norm": 59.71568867665327, "learning_rate": 4.829083165302968e-05, "loss": 0.138, "num_input_tokens_seen": 14318656, "step": 207 }, { "epoch": 12.9375, "loss": 0.14700518548488617, "loss_ce": 0.12198077142238617, "loss_xval": 0.0250244140625, "num_input_tokens_seen": 14318656, "step": 207 }, { "epoch": 13.0, "grad_norm": 176.08463177627178, "learning_rate": 4.833447305978453e-05, "loss": 0.2277, "num_input_tokens_seen": 14390272, "step": 208 }, { "epoch": 13.0, "loss": 0.21075105667114258, "loss_ce": 0.12530183792114258, "loss_xval": 0.08544921875, "num_input_tokens_seen": 14390272, "step": 208 }, { "epoch": 13.0625, "grad_norm": 207.07064896762472, "learning_rate": 4.83779051544316e-05, "loss": 0.2544, "num_input_tokens_seen": 14462080, "step": 209 }, { "epoch": 13.0625, "loss": 0.2749811112880707, "loss_ce": 0.12752017378807068, "loss_xval": 0.1474609375, "num_input_tokens_seen": 14462080, "step": 209 }, { "epoch": 13.125, "grad_norm": 152.6246485649904, "learning_rate": 4.842112993518858e-05, "loss": 0.2035, "num_input_tokens_seen": 14521152, "step": 210 }, { "epoch": 13.125, "loss": 0.21896807849407196, "loss_ce": 0.13303057849407196, "loss_xval": 0.0859375, "num_input_tokens_seen": 14521152, "step": 210 }, { "epoch": 13.1875, "grad_norm": 59.82558956835435, "learning_rate": 4.846414937179484e-05, "loss": 0.1316, "num_input_tokens_seen": 14592832, "step": 211 }, { "epoch": 13.1875, "loss": 0.12381850928068161, "loss_ce": 0.10794936865568161, "loss_xval": 0.015869140625, "num_input_tokens_seen": 14592832, "step": 211 }, { "epoch": 13.25, "grad_norm": 25.64597703986726, "learning_rate": 4.850696540604993e-05, "loss": 0.1209, "num_input_tokens_seen": 14651840, "step": 212 }, { "epoch": 13.25, "loss": 0.11560696363449097, "loss_ce": 0.10480374097824097, "loss_xval": 0.01080322265625, "num_input_tokens_seen": 14651840, "step": 212 }, { "epoch": 13.3125, "grad_norm": 69.3914547109082, "learning_rate": 4.8549579952339555e-05, "loss": 0.1522, "num_input_tokens_seen": 14723456, "step": 213 }, { "epoch": 13.3125, "loss": 0.15366515517234802, "loss_ce": 0.11753234267234802, "loss_xval": 0.0361328125, "num_input_tokens_seen": 14723456, "step": 213 }, { "epoch": 13.375, "grad_norm": 71.26381233983959, "learning_rate": 4.859199489814922e-05, "loss": 0.1408, "num_input_tokens_seen": 14782720, "step": 214 }, { "epoch": 13.375, "loss": 0.13961592316627502, "loss_ce": 0.11434736102819443, "loss_xval": 0.0252685546875, "num_input_tokens_seen": 14782720, "step": 214 }, { "epoch": 13.4375, "grad_norm": 87.29598783104798, "learning_rate": 4.863421210456582e-05, "loss": 0.1467, "num_input_tokens_seen": 14854400, "step": 215 }, { "epoch": 13.4375, "loss": 0.15197014808654785, "loss_ce": 0.11217523366212845, "loss_xval": 0.039794921875, "num_input_tokens_seen": 14854400, "step": 215 }, { "epoch": 13.5, "grad_norm": 137.423748091418, "learning_rate": 4.8676233406767654e-05, "loss": 0.183, "num_input_tokens_seen": 14913536, "step": 216 }, { "epoch": 13.5, "loss": 0.16861703991889954, "loss_ce": 0.10514048486948013, "loss_xval": 0.0634765625, "num_input_tokens_seen": 14913536, "step": 216 }, { "epoch": 13.5625, "grad_norm": 194.04712419667922, "learning_rate": 4.871806061450313e-05, "loss": 0.2344, "num_input_tokens_seen": 14985152, "step": 217 }, { "epoch": 13.5625, "loss": 0.23106953501701355, "loss_ce": 0.11144062131643295, "loss_xval": 0.11962890625, "num_input_tokens_seen": 14985152, "step": 217 }, { "epoch": 13.625, "grad_norm": 236.6670233746774, "learning_rate": 4.875969551255842e-05, "loss": 0.3039, "num_input_tokens_seen": 15044224, "step": 218 }, { "epoch": 13.625, "loss": 0.329833984375, "loss_ce": 0.1237793043255806, "loss_xval": 0.2060546875, "num_input_tokens_seen": 15044224, "step": 218 }, { "epoch": 13.6875, "grad_norm": 222.73052377666872, "learning_rate": 4.8801139861214464e-05, "loss": 0.2694, "num_input_tokens_seen": 15103296, "step": 219 }, { "epoch": 13.6875, "loss": 0.2580621540546417, "loss_ce": 0.10669496655464172, "loss_xval": 0.1513671875, "num_input_tokens_seen": 15103296, "step": 219 }, { "epoch": 13.75, "grad_norm": 134.04747220394003, "learning_rate": 4.884239539669351e-05, "loss": 0.1776, "num_input_tokens_seen": 15162368, "step": 220 }, { "epoch": 13.75, "loss": 0.1807592511177063, "loss_ce": 0.1089818999171257, "loss_xval": 0.07177734375, "num_input_tokens_seen": 15162368, "step": 220 }, { "epoch": 13.8125, "grad_norm": 8.718688250244332, "learning_rate": 4.8883463831595575e-05, "loss": 0.1181, "num_input_tokens_seen": 15221376, "step": 221 }, { "epoch": 13.8125, "loss": 0.11192178726196289, "loss_ce": 0.09953165054321289, "loss_xval": 0.01239013671875, "num_input_tokens_seen": 15221376, "step": 221 }, { "epoch": 13.875, "grad_norm": 156.2092295245801, "learning_rate": 4.8924346855325055e-05, "loss": 0.1788, "num_input_tokens_seen": 15292992, "step": 222 }, { "epoch": 13.875, "loss": 0.17930957674980164, "loss_ce": 0.09581348299980164, "loss_xval": 0.08349609375, "num_input_tokens_seen": 15292992, "step": 222 }, { "epoch": 13.9375, "grad_norm": 257.807381270319, "learning_rate": 4.896504613450767e-05, "loss": 0.3138, "num_input_tokens_seen": 15364672, "step": 223 }, { "epoch": 13.9375, "loss": 0.3046586215496063, "loss_ce": 0.10641643404960632, "loss_xval": 0.1982421875, "num_input_tokens_seen": 15364672, "step": 223 }, { "epoch": 14.0, "grad_norm": 261.8889355993732, "learning_rate": 4.900556331339819e-05, "loss": 0.3191, "num_input_tokens_seen": 15436352, "step": 224 }, { "epoch": 14.0, "loss": 0.25728708505630493, "loss_ce": 0.10103708505630493, "loss_xval": 0.15625, "num_input_tokens_seen": 15436352, "step": 224 }, { "epoch": 14.0625, "grad_norm": 194.10209172383972, "learning_rate": 4.904590001427903e-05, "loss": 0.2386, "num_input_tokens_seen": 15507904, "step": 225 }, { "epoch": 14.0625, "loss": 0.24090451002120972, "loss_ce": 0.10418576747179031, "loss_xval": 0.13671875, "num_input_tokens_seen": 15507904, "step": 225 }, { "epoch": 14.125, "grad_norm": 128.9053385322778, "learning_rate": 4.908605783784996e-05, "loss": 0.1716, "num_input_tokens_seen": 15579584, "step": 226 }, { "epoch": 14.125, "loss": 0.17217063903808594, "loss_ce": 0.09697532653808594, "loss_xval": 0.0751953125, "num_input_tokens_seen": 15579584, "step": 226 }, { "epoch": 14.1875, "grad_norm": 87.59732886995556, "learning_rate": 4.9126038363609304e-05, "loss": 0.1359, "num_input_tokens_seen": 15651200, "step": 227 }, { "epoch": 14.1875, "loss": 0.12465231120586395, "loss_ce": 0.09169332683086395, "loss_xval": 0.032958984375, "num_input_tokens_seen": 15651200, "step": 227 }, { "epoch": 14.25, "grad_norm": 70.56237270820588, "learning_rate": 4.916584315022672e-05, "loss": 0.145, "num_input_tokens_seen": 15722752, "step": 228 }, { "epoch": 14.25, "loss": 0.16532674431800842, "loss_ce": 0.09989706426858902, "loss_xval": 0.0654296875, "num_input_tokens_seen": 15722752, "step": 228 }, { "epoch": 14.3125, "grad_norm": 62.93559938979091, "learning_rate": 4.920547373590778e-05, "loss": 0.1368, "num_input_tokens_seen": 15781760, "step": 229 }, { "epoch": 14.3125, "loss": 0.13593743741512299, "loss_ce": 0.09785149991512299, "loss_xval": 0.0380859375, "num_input_tokens_seen": 15781760, "step": 229 }, { "epoch": 14.375, "grad_norm": 21.157191577955352, "learning_rate": 4.924493163875066e-05, "loss": 0.1233, "num_input_tokens_seen": 15853376, "step": 230 }, { "epoch": 14.375, "loss": 0.11231095343828201, "loss_ce": 0.09735734015703201, "loss_xval": 0.01495361328125, "num_input_tokens_seen": 15853376, "step": 230 }, { "epoch": 14.4375, "grad_norm": 59.78704566708404, "learning_rate": 4.9284218357095105e-05, "loss": 0.1293, "num_input_tokens_seen": 15924928, "step": 231 }, { "epoch": 14.4375, "loss": 0.13710638880729675, "loss_ce": 0.10219427198171616, "loss_xval": 0.034912109375, "num_input_tokens_seen": 15924928, "step": 231 }, { "epoch": 14.5, "grad_norm": 172.68273765724646, "learning_rate": 4.9323335369863785e-05, "loss": 0.2016, "num_input_tokens_seen": 15996608, "step": 232 }, { "epoch": 14.5, "loss": 0.21267253160476685, "loss_ce": 0.09206705540418625, "loss_xval": 0.12060546875, "num_input_tokens_seen": 15996608, "step": 232 }, { "epoch": 14.5625, "grad_norm": 322.95712013391943, "learning_rate": 4.936228413689641e-05, "loss": 0.4173, "num_input_tokens_seen": 16068288, "step": 233 }, { "epoch": 14.5625, "loss": 0.4134458899497986, "loss_ce": 0.08532088994979858, "loss_xval": 0.328125, "num_input_tokens_seen": 16068288, "step": 233 }, { "epoch": 14.625, "grad_norm": 451.15748506196144, "learning_rate": 4.940106609927657e-05, "loss": 0.7367, "num_input_tokens_seen": 16139968, "step": 234 }, { "epoch": 14.625, "loss": 0.7196348905563354, "loss_ce": 0.09072863310575485, "loss_xval": 0.62890625, "num_input_tokens_seen": 16139968, "step": 234 }, { "epoch": 14.6875, "grad_norm": 489.1108218774364, "learning_rate": 4.943968267965172e-05, "loss": 0.8289, "num_input_tokens_seen": 16211648, "step": 235 }, { "epoch": 14.6875, "loss": 0.8319761157035828, "loss_ce": 0.08588234335184097, "loss_xval": 0.74609375, "num_input_tokens_seen": 16211648, "step": 235 }, { "epoch": 14.75, "grad_norm": 388.11242092173126, "learning_rate": 4.947813528254631e-05, "loss": 0.5546, "num_input_tokens_seen": 16283264, "step": 236 }, { "epoch": 14.75, "loss": 0.5663637518882751, "loss_ce": 0.08003561198711395, "loss_xval": 0.486328125, "num_input_tokens_seen": 16283264, "step": 236 }, { "epoch": 14.8125, "grad_norm": 162.58093825153682, "learning_rate": 4.95164252946683e-05, "loss": 0.1844, "num_input_tokens_seen": 16342400, "step": 237 }, { "epoch": 14.8125, "loss": 0.18206077814102173, "loss_ce": 0.08831078559160233, "loss_xval": 0.09375, "num_input_tokens_seen": 16342400, "step": 237 }, { "epoch": 14.875, "grad_norm": 108.88258484145715, "learning_rate": 4.955455408520925e-05, "loss": 0.1338, "num_input_tokens_seen": 16414016, "step": 238 }, { "epoch": 14.875, "loss": 0.13403305411338806, "loss_ce": 0.08447249978780746, "loss_xval": 0.049560546875, "num_input_tokens_seen": 16414016, "step": 238 }, { "epoch": 14.9375, "grad_norm": 312.07861032238304, "learning_rate": 4.9592523006138054e-05, "loss": 0.4097, "num_input_tokens_seen": 16485632, "step": 239 }, { "epoch": 14.9375, "loss": 0.3987405002117157, "loss_ce": 0.0901467576622963, "loss_xval": 0.30859375, "num_input_tokens_seen": 16485632, "step": 239 }, { "epoch": 15.0, "grad_norm": 350.5215392823211, "learning_rate": 4.963033339248863e-05, "loss": 0.5014, "num_input_tokens_seen": 16557248, "step": 240 }, { "epoch": 15.0, "loss": 0.5261030197143555, "loss_ce": 0.08079053461551666, "loss_xval": 0.4453125, "num_input_tokens_seen": 16557248, "step": 240 }, { "epoch": 15.0625, "grad_norm": 192.24256534435753, "learning_rate": 4.9667986562641596e-05, "loss": 0.2191, "num_input_tokens_seen": 16628864, "step": 241 }, { "epoch": 15.0625, "loss": 0.22793100774288177, "loss_ce": 0.08730600774288177, "loss_xval": 0.140625, "num_input_tokens_seen": 16628864, "step": 241 }, { "epoch": 15.125, "grad_norm": 74.93058959493273, "learning_rate": 4.970548381860003e-05, "loss": 0.1164, "num_input_tokens_seen": 16700416, "step": 242 }, { "epoch": 15.125, "loss": 0.10929420590400696, "loss_ce": 0.08488014340400696, "loss_xval": 0.0244140625, "num_input_tokens_seen": 16700416, "step": 242 }, { "epoch": 15.1875, "grad_norm": 260.9104255182291, "learning_rate": 4.9742826446259686e-05, "loss": 0.3269, "num_input_tokens_seen": 16772032, "step": 243 }, { "epoch": 15.1875, "loss": 0.33294883370399475, "loss_ce": 0.08099570125341415, "loss_xval": 0.251953125, "num_input_tokens_seen": 16772032, "step": 243 }, { "epoch": 15.25, "grad_norm": 227.83423730454186, "learning_rate": 4.978001571567359e-05, "loss": 0.2694, "num_input_tokens_seen": 16843712, "step": 244 }, { "epoch": 15.25, "loss": 0.26590994000434875, "loss_ce": 0.08036306500434875, "loss_xval": 0.185546875, "num_input_tokens_seen": 16843712, "step": 244 }, { "epoch": 15.3125, "grad_norm": 13.556589572008898, "learning_rate": 4.981705288131116e-05, "loss": 0.1122, "num_input_tokens_seen": 16902784, "step": 245 }, { "epoch": 15.3125, "loss": 0.12127488106489182, "loss_ce": 0.09576218575239182, "loss_xval": 0.0255126953125, "num_input_tokens_seen": 16902784, "step": 245 }, { "epoch": 15.375, "grad_norm": 191.0737255553766, "learning_rate": 4.98539391823122e-05, "loss": 0.2406, "num_input_tokens_seen": 16974528, "step": 246 }, { "epoch": 15.375, "loss": 0.2360803782939911, "loss_ce": 0.09252569824457169, "loss_xval": 0.1435546875, "num_input_tokens_seen": 16974528, "step": 246 }, { "epoch": 15.4375, "grad_norm": 225.96150851651885, "learning_rate": 4.9890675842735636e-05, "loss": 0.288, "num_input_tokens_seen": 17046272, "step": 247 }, { "epoch": 15.4375, "loss": 0.28033164143562317, "loss_ce": 0.09185508638620377, "loss_xval": 0.1884765625, "num_input_tokens_seen": 17046272, "step": 247 }, { "epoch": 15.5, "grad_norm": 127.88649841176745, "learning_rate": 4.992726407180318e-05, "loss": 0.1589, "num_input_tokens_seen": 17117952, "step": 248 }, { "epoch": 15.5, "loss": 0.1766410917043686, "loss_ce": 0.08582077920436859, "loss_xval": 0.0908203125, "num_input_tokens_seen": 17117952, "step": 248 }, { "epoch": 15.5625, "grad_norm": 20.415464624373833, "learning_rate": 4.996370506413826e-05, "loss": 0.1034, "num_input_tokens_seen": 17189568, "step": 249 }, { "epoch": 15.5625, "loss": 0.09446126967668533, "loss_ce": 0.07938558608293533, "loss_xval": 0.01507568359375, "num_input_tokens_seen": 17189568, "step": 249 }, { "epoch": 15.625, "grad_norm": 143.26601409703582, "learning_rate": 5e-05, "loss": 0.1694, "num_input_tokens_seen": 17248704, "step": 250 }, { "epoch": 15.625, "eval_synth_IoU": 0.0, "eval_synth_MAE_x": 0.261452853679657, "eval_synth_MAE_y": 0.4066009521484375, "eval_synth_NUM_probability": 0.8209518492221832, "eval_synth_inside_bbox": 0.0, "eval_synth_loss": 0.20519158244132996, "eval_synth_loss_ce": 0.08092402294278145, "eval_synth_loss_xval": 0.124267578125, "eval_synth_runtime": 53.3374, "eval_synth_samples_per_second": 2.4, "eval_synth_steps_per_second": 0.075, "num_input_tokens_seen": 17248704, "step": 250 }, { "epoch": 15.625, "loss": 0.23013810813426971, "loss_ce": 0.08072404563426971, "loss_xval": 0.1494140625, "num_input_tokens_seen": 17248704, "step": 250 }, { "epoch": 15.6875, "grad_norm": 169.85650761281596, "learning_rate": 5e-05, "loss": 0.1935, "num_input_tokens_seen": 17320512, "step": 251 }, { "epoch": 15.6875, "loss": 0.1724509298801422, "loss_ce": 0.08065404742956161, "loss_xval": 0.091796875, "num_input_tokens_seen": 17320512, "step": 251 }, { "epoch": 15.75, "grad_norm": 64.15008625818228, "learning_rate": 5e-05, "loss": 0.1049, "num_input_tokens_seen": 17392192, "step": 252 }, { "epoch": 15.75, "loss": 0.09971656650304794, "loss_ce": 0.07945289462804794, "loss_xval": 0.020263671875, "num_input_tokens_seen": 17392192, "step": 252 }, { "epoch": 15.8125, "grad_norm": 103.9891196216456, "learning_rate": 5e-05, "loss": 0.1254, "num_input_tokens_seen": 17463936, "step": 253 }, { "epoch": 15.8125, "loss": 0.12425937503576279, "loss_ce": 0.07323398441076279, "loss_xval": 0.051025390625, "num_input_tokens_seen": 17463936, "step": 253 }, { "epoch": 15.875, "grad_norm": 204.58315274712749, "learning_rate": 5e-05, "loss": 0.2338, "num_input_tokens_seen": 17535488, "step": 254 }, { "epoch": 15.875, "loss": 0.2364085167646408, "loss_ce": 0.08406476676464081, "loss_xval": 0.15234375, "num_input_tokens_seen": 17535488, "step": 254 }, { "epoch": 15.9375, "grad_norm": 162.52441420980293, "learning_rate": 5e-05, "loss": 0.1916, "num_input_tokens_seen": 17607104, "step": 255 }, { "epoch": 15.9375, "loss": 0.20207734405994415, "loss_ce": 0.08098359405994415, "loss_xval": 0.12109375, "num_input_tokens_seen": 17607104, "step": 255 }, { "epoch": 16.0, "grad_norm": 11.653784354334272, "learning_rate": 5e-05, "loss": 0.0978, "num_input_tokens_seen": 17678720, "step": 256 }, { "epoch": 16.0, "loss": 0.08942156285047531, "loss_ce": 0.07111101597547531, "loss_xval": 0.018310546875, "num_input_tokens_seen": 17678720, "step": 256 }, { "epoch": 16.0625, "grad_norm": 158.11169008732344, "learning_rate": 5e-05, "loss": 0.1773, "num_input_tokens_seen": 17750400, "step": 257 }, { "epoch": 16.0625, "loss": 0.17226354777812958, "loss_ce": 0.07460729777812958, "loss_xval": 0.09765625, "num_input_tokens_seen": 17750400, "step": 257 }, { "epoch": 16.125, "grad_norm": 229.81576815742696, "learning_rate": 5e-05, "loss": 0.2733, "num_input_tokens_seen": 17822080, "step": 258 }, { "epoch": 16.125, "loss": 0.27170130610466003, "loss_ce": 0.07736537605524063, "loss_xval": 0.1943359375, "num_input_tokens_seen": 17822080, "step": 258 }, { "epoch": 16.1875, "grad_norm": 159.28923838871276, "learning_rate": 5e-05, "loss": 0.1674, "num_input_tokens_seen": 17881280, "step": 259 }, { "epoch": 16.1875, "loss": 0.1720913052558899, "loss_ce": 0.06857568025588989, "loss_xval": 0.103515625, "num_input_tokens_seen": 17881280, "step": 259 }, { "epoch": 16.25, "grad_norm": 5.26556670728744, "learning_rate": 5e-05, "loss": 0.0864, "num_input_tokens_seen": 17940416, "step": 260 }, { "epoch": 16.25, "loss": 0.08316627144813538, "loss_ce": 0.07922950387001038, "loss_xval": 0.003936767578125, "num_input_tokens_seen": 17940416, "step": 260 }, { "epoch": 16.3125, "grad_norm": 155.7591091021434, "learning_rate": 5e-05, "loss": 0.1733, "num_input_tokens_seen": 18012096, "step": 261 }, { "epoch": 16.3125, "loss": 0.16722114384174347, "loss_ce": 0.07688911259174347, "loss_xval": 0.09033203125, "num_input_tokens_seen": 18012096, "step": 261 }, { "epoch": 16.375, "grad_norm": 233.63878569347762, "learning_rate": 5e-05, "loss": 0.2803, "num_input_tokens_seen": 18083776, "step": 262 }, { "epoch": 16.375, "loss": 0.27516964077949524, "loss_ce": 0.07692745327949524, "loss_xval": 0.1982421875, "num_input_tokens_seen": 18083776, "step": 262 }, { "epoch": 16.4375, "grad_norm": 186.41794854520438, "learning_rate": 5e-05, "loss": 0.2068, "num_input_tokens_seen": 18155392, "step": 263 }, { "epoch": 16.4375, "loss": 0.21879586577415466, "loss_ce": 0.07524118572473526, "loss_xval": 0.1435546875, "num_input_tokens_seen": 18155392, "step": 263 }, { "epoch": 16.5, "grad_norm": 38.078416547409816, "learning_rate": 5e-05, "loss": 0.1028, "num_input_tokens_seen": 18226944, "step": 264 }, { "epoch": 16.5, "loss": 0.11282212287187576, "loss_ce": 0.08389145880937576, "loss_xval": 0.0289306640625, "num_input_tokens_seen": 18226944, "step": 264 }, { "epoch": 16.5625, "grad_norm": 131.60459627807813, "learning_rate": 5e-05, "loss": 0.1424, "num_input_tokens_seen": 18298496, "step": 265 }, { "epoch": 16.5625, "loss": 0.13576087355613708, "loss_ce": 0.07448158413171768, "loss_xval": 0.061279296875, "num_input_tokens_seen": 18298496, "step": 265 }, { "epoch": 16.625, "grad_norm": 223.23466176441613, "learning_rate": 5e-05, "loss": 0.2574, "num_input_tokens_seen": 18357632, "step": 266 }, { "epoch": 16.625, "loss": 0.27536073327064514, "loss_ce": 0.06832948327064514, "loss_xval": 0.20703125, "num_input_tokens_seen": 18357632, "step": 266 }, { "epoch": 16.6875, "grad_norm": 204.3555030951728, "learning_rate": 5e-05, "loss": 0.2297, "num_input_tokens_seen": 18429376, "step": 267 }, { "epoch": 16.6875, "loss": 0.225807785987854, "loss_ce": 0.073464035987854, "loss_xval": 0.15234375, "num_input_tokens_seen": 18429376, "step": 267 }, { "epoch": 16.75, "grad_norm": 91.23908279974412, "learning_rate": 5e-05, "loss": 0.1098, "num_input_tokens_seen": 18501056, "step": 268 }, { "epoch": 16.75, "loss": 0.11241857707500458, "loss_ce": 0.06822912395000458, "loss_xval": 0.044189453125, "num_input_tokens_seen": 18501056, "step": 268 }, { "epoch": 16.8125, "grad_norm": 59.27150232738392, "learning_rate": 5e-05, "loss": 0.0899, "num_input_tokens_seen": 18572736, "step": 269 }, { "epoch": 16.8125, "loss": 0.0962166041135788, "loss_ce": 0.0699714869260788, "loss_xval": 0.0262451171875, "num_input_tokens_seen": 18572736, "step": 269 }, { "epoch": 16.875, "grad_norm": 183.1381956429717, "learning_rate": 5e-05, "loss": 0.2093, "num_input_tokens_seen": 18632000, "step": 270 }, { "epoch": 16.875, "loss": 0.22140049934387207, "loss_ce": 0.08468174934387207, "loss_xval": 0.13671875, "num_input_tokens_seen": 18632000, "step": 270 }, { "epoch": 16.9375, "grad_norm": 214.36639578000916, "learning_rate": 5e-05, "loss": 0.2495, "num_input_tokens_seen": 18691136, "step": 271 }, { "epoch": 16.9375, "loss": 0.25600141286849976, "loss_ce": 0.07338422536849976, "loss_xval": 0.1826171875, "num_input_tokens_seen": 18691136, "step": 271 }, { "epoch": 17.0, "grad_norm": 127.84713265646646, "learning_rate": 5e-05, "loss": 0.1332, "num_input_tokens_seen": 18762816, "step": 272 }, { "epoch": 17.0, "loss": 0.139483243227005, "loss_ce": 0.067705899477005, "loss_xval": 0.07177734375, "num_input_tokens_seen": 18762816, "step": 272 }, { "epoch": 17.0625, "grad_norm": 4.898120648292572, "learning_rate": 5e-05, "loss": 0.0764, "num_input_tokens_seen": 18834432, "step": 273 }, { "epoch": 17.0625, "loss": 0.08028127998113632, "loss_ce": 0.06990530341863632, "loss_xval": 0.0103759765625, "num_input_tokens_seen": 18834432, "step": 273 }, { "epoch": 17.125, "grad_norm": 95.27997261371199, "learning_rate": 5e-05, "loss": 0.1044, "num_input_tokens_seen": 18906176, "step": 274 }, { "epoch": 17.125, "loss": 0.10349054634571075, "loss_ce": 0.06662531197071075, "loss_xval": 0.036865234375, "num_input_tokens_seen": 18906176, "step": 274 }, { "epoch": 17.1875, "grad_norm": 146.32212640213385, "learning_rate": 5e-05, "loss": 0.152, "num_input_tokens_seen": 18977920, "step": 275 }, { "epoch": 17.1875, "loss": 0.1592569351196289, "loss_ce": 0.0664834976196289, "loss_xval": 0.0927734375, "num_input_tokens_seen": 18977920, "step": 275 }, { "epoch": 17.25, "grad_norm": 142.43308471222042, "learning_rate": 5e-05, "loss": 0.1569, "num_input_tokens_seen": 19049536, "step": 276 }, { "epoch": 17.25, "loss": 0.15653030574321747, "loss_ce": 0.06229202449321747, "loss_xval": 0.09423828125, "num_input_tokens_seen": 19049536, "step": 276 }, { "epoch": 17.3125, "grad_norm": 91.99449362446872, "learning_rate": 5e-05, "loss": 0.0961, "num_input_tokens_seen": 19121280, "step": 277 }, { "epoch": 17.3125, "loss": 0.09561696648597717, "loss_ce": 0.055577900260686874, "loss_xval": 0.0400390625, "num_input_tokens_seen": 19121280, "step": 277 }, { "epoch": 17.375, "grad_norm": 21.1872764546701, "learning_rate": 5e-05, "loss": 0.0708, "num_input_tokens_seen": 19192896, "step": 278 }, { "epoch": 17.375, "loss": 0.07196405529975891, "loss_ce": 0.06250360608100891, "loss_xval": 0.00946044921875, "num_input_tokens_seen": 19192896, "step": 278 }, { "epoch": 17.4375, "grad_norm": 39.6768526052572, "learning_rate": 5e-05, "loss": 0.0671, "num_input_tokens_seen": 19264576, "step": 279 }, { "epoch": 17.4375, "loss": 0.07075974345207214, "loss_ce": 0.05574509873986244, "loss_xval": 0.0150146484375, "num_input_tokens_seen": 19264576, "step": 279 }, { "epoch": 17.5, "grad_norm": 77.921892367936, "learning_rate": 5e-05, "loss": 0.0893, "num_input_tokens_seen": 19336128, "step": 280 }, { "epoch": 17.5, "loss": 0.09789624810218811, "loss_ce": 0.06396070122718811, "loss_xval": 0.033935546875, "num_input_tokens_seen": 19336128, "step": 280 }, { "epoch": 17.5625, "grad_norm": 82.02369908492679, "learning_rate": 5e-05, "loss": 0.0943, "num_input_tokens_seen": 19395328, "step": 281 }, { "epoch": 17.5625, "loss": 0.09304691106081009, "loss_ce": 0.06655765324831009, "loss_xval": 0.0264892578125, "num_input_tokens_seen": 19395328, "step": 281 }, { "epoch": 17.625, "grad_norm": 42.31537870755725, "learning_rate": 5e-05, "loss": 0.0771, "num_input_tokens_seen": 19466880, "step": 282 }, { "epoch": 17.625, "loss": 0.0733356922864914, "loss_ce": 0.05801587179303169, "loss_xval": 0.01531982421875, "num_input_tokens_seen": 19466880, "step": 282 }, { "epoch": 17.6875, "grad_norm": 17.771330987667906, "learning_rate": 5e-05, "loss": 0.066, "num_input_tokens_seen": 19538624, "step": 283 }, { "epoch": 17.6875, "loss": 0.0675898939371109, "loss_ce": 0.0558101125061512, "loss_xval": 0.01177978515625, "num_input_tokens_seen": 19538624, "step": 283 }, { "epoch": 17.75, "grad_norm": 50.51106428941197, "learning_rate": 5e-05, "loss": 0.0796, "num_input_tokens_seen": 19610176, "step": 284 }, { "epoch": 17.75, "loss": 0.08379670232534409, "loss_ce": 0.06304474920034409, "loss_xval": 0.020751953125, "num_input_tokens_seen": 19610176, "step": 284 }, { "epoch": 17.8125, "grad_norm": 59.38081278481981, "learning_rate": 5e-05, "loss": 0.0827, "num_input_tokens_seen": 19669248, "step": 285 }, { "epoch": 17.8125, "loss": 0.08613239228725433, "loss_ce": 0.06074177101254463, "loss_xval": 0.025390625, "num_input_tokens_seen": 19669248, "step": 285 }, { "epoch": 17.875, "grad_norm": 86.85566163114785, "learning_rate": 5e-05, "loss": 0.0945, "num_input_tokens_seen": 19740864, "step": 286 }, { "epoch": 17.875, "loss": 0.0980193167924881, "loss_ce": 0.0587126798927784, "loss_xval": 0.039306640625, "num_input_tokens_seen": 19740864, "step": 286 }, { "epoch": 17.9375, "grad_norm": 131.48187852125864, "learning_rate": 5e-05, "loss": 0.127, "num_input_tokens_seen": 19812416, "step": 287 }, { "epoch": 17.9375, "loss": 0.12445548176765442, "loss_ce": 0.05951407551765442, "loss_xval": 0.06494140625, "num_input_tokens_seen": 19812416, "step": 287 }, { "epoch": 18.0, "grad_norm": 186.15300597167504, "learning_rate": 5e-05, "loss": 0.1944, "num_input_tokens_seen": 19884032, "step": 288 }, { "epoch": 18.0, "loss": 0.19328424334526062, "loss_ce": 0.05754205584526062, "loss_xval": 0.1357421875, "num_input_tokens_seen": 19884032, "step": 288 }, { "epoch": 18.0625, "grad_norm": 273.5012601742529, "learning_rate": 5e-05, "loss": 0.3323, "num_input_tokens_seen": 19955648, "step": 289 }, { "epoch": 18.0625, "loss": 0.33222687244415283, "loss_ce": 0.05292999744415283, "loss_xval": 0.279296875, "num_input_tokens_seen": 19955648, "step": 289 }, { "epoch": 18.125, "grad_norm": 388.36705185798957, "learning_rate": 5e-05, "loss": 0.6139, "num_input_tokens_seen": 20027328, "step": 290 }, { "epoch": 18.125, "loss": 0.6378867626190186, "loss_ce": 0.05194929242134094, "loss_xval": 0.5859375, "num_input_tokens_seen": 20027328, "step": 290 }, { "epoch": 18.1875, "grad_norm": 499.00867924698656, "learning_rate": 5e-05, "loss": 0.9746, "num_input_tokens_seen": 20098944, "step": 291 }, { "epoch": 18.1875, "loss": 0.9852690696716309, "loss_ce": 0.05167528986930847, "loss_xval": 0.93359375, "num_input_tokens_seen": 20098944, "step": 291 }, { "epoch": 18.25, "grad_norm": 554.1905120885209, "learning_rate": 5e-05, "loss": 1.1936, "num_input_tokens_seen": 20170496, "step": 292 }, { "epoch": 18.25, "loss": 1.2322349548339844, "loss_ce": 0.05254746600985527, "loss_xval": 1.1796875, "num_input_tokens_seen": 20170496, "step": 292 }, { "epoch": 18.3125, "grad_norm": 457.7168361435202, "learning_rate": 5e-05, "loss": 0.8529, "num_input_tokens_seen": 20242176, "step": 293 }, { "epoch": 18.3125, "loss": 0.8144187927246094, "loss_ce": 0.05660631135106087, "loss_xval": 0.7578125, "num_input_tokens_seen": 20242176, "step": 293 }, { "epoch": 18.375, "grad_norm": 152.99752414373336, "learning_rate": 5e-05, "loss": 0.157, "num_input_tokens_seen": 20313728, "step": 294 }, { "epoch": 18.375, "loss": 0.16795939207077026, "loss_ce": 0.054678138345479965, "loss_xval": 0.11328125, "num_input_tokens_seen": 20313728, "step": 294 }, { "epoch": 18.4375, "grad_norm": 233.74530405929963, "learning_rate": 5e-05, "loss": 0.2758, "num_input_tokens_seen": 20385344, "step": 295 }, { "epoch": 18.4375, "loss": 0.2857314646244049, "loss_ce": 0.04842676967382431, "loss_xval": 0.2373046875, "num_input_tokens_seen": 20385344, "step": 295 }, { "epoch": 18.5, "grad_norm": 475.0087558704663, "learning_rate": 5e-05, "loss": 0.9749, "num_input_tokens_seen": 20456960, "step": 296 }, { "epoch": 18.5, "loss": 0.9483780264854431, "loss_ce": 0.06947175413370132, "loss_xval": 0.87890625, "num_input_tokens_seen": 20456960, "step": 296 }, { "epoch": 18.5625, "grad_norm": 398.2800226521687, "learning_rate": 5e-05, "loss": 0.7156, "num_input_tokens_seen": 20528576, "step": 297 }, { "epoch": 18.5625, "loss": 0.6812264919281006, "loss_ce": 0.06403897702693939, "loss_xval": 0.6171875, "num_input_tokens_seen": 20528576, "step": 297 }, { "epoch": 18.625, "grad_norm": 44.51041055843072, "learning_rate": 5e-05, "loss": 0.09, "num_input_tokens_seen": 20600128, "step": 298 }, { "epoch": 18.625, "loss": 0.08670137822628021, "loss_ce": 0.05520723760128021, "loss_xval": 0.031494140625, "num_input_tokens_seen": 20600128, "step": 298 }, { "epoch": 18.6875, "grad_norm": 324.861214138624, "learning_rate": 5e-05, "loss": 0.5103, "num_input_tokens_seen": 20671808, "step": 299 }, { "epoch": 18.6875, "loss": 0.48987704515457153, "loss_ce": 0.06214268505573273, "loss_xval": 0.427734375, "num_input_tokens_seen": 20671808, "step": 299 }, { "epoch": 18.75, "grad_norm": 385.2579654281278, "learning_rate": 5e-05, "loss": 0.6728, "num_input_tokens_seen": 20743424, "step": 300 }, { "epoch": 18.75, "loss": 0.6869125366210938, "loss_ce": 0.05410004034638405, "loss_xval": 0.6328125, "num_input_tokens_seen": 20743424, "step": 300 }, { "epoch": 18.8125, "grad_norm": 77.86073528701853, "learning_rate": 5e-05, "loss": 0.1027, "num_input_tokens_seen": 20802496, "step": 301 }, { "epoch": 18.8125, "loss": 0.10991650074720383, "loss_ce": 0.06963329762220383, "loss_xval": 0.040283203125, "num_input_tokens_seen": 20802496, "step": 301 }, { "epoch": 18.875, "grad_norm": 292.0502032364089, "learning_rate": 5e-05, "loss": 0.432, "num_input_tokens_seen": 20874112, "step": 302 }, { "epoch": 18.875, "loss": 0.40421196818351746, "loss_ce": 0.060461968183517456, "loss_xval": 0.34375, "num_input_tokens_seen": 20874112, "step": 302 }, { "epoch": 18.9375, "grad_norm": 344.01605003997935, "learning_rate": 5e-05, "loss": 0.5894, "num_input_tokens_seen": 20933376, "step": 303 }, { "epoch": 18.9375, "loss": 0.5786466598510742, "loss_ce": 0.05911543220281601, "loss_xval": 0.51953125, "num_input_tokens_seen": 20933376, "step": 303 }, { "epoch": 19.0, "grad_norm": 46.29534379699135, "learning_rate": 5e-05, "loss": 0.0928, "num_input_tokens_seen": 21005056, "step": 304 }, { "epoch": 19.0, "loss": 0.08590758591890335, "loss_ce": 0.05978453904390335, "loss_xval": 0.026123046875, "num_input_tokens_seen": 21005056, "step": 304 }, { "epoch": 19.0625, "grad_norm": 258.9382904270406, "learning_rate": 5e-05, "loss": 0.3661, "num_input_tokens_seen": 21076736, "step": 305 }, { "epoch": 19.0625, "loss": 0.3459300398826599, "loss_ce": 0.06077377498149872, "loss_xval": 0.28515625, "num_input_tokens_seen": 21076736, "step": 305 }, { "epoch": 19.125, "grad_norm": 238.5090180219324, "learning_rate": 5e-05, "loss": 0.3337, "num_input_tokens_seen": 21148288, "step": 306 }, { "epoch": 19.125, "loss": 0.3305038511753082, "loss_ce": 0.06487884372472763, "loss_xval": 0.265625, "num_input_tokens_seen": 21148288, "step": 306 }, { "epoch": 19.1875, "grad_norm": 64.6083728076613, "learning_rate": 5e-05, "loss": 0.1134, "num_input_tokens_seen": 21219840, "step": 307 }, { "epoch": 19.1875, "loss": 0.10510671883821487, "loss_ce": 0.07019460946321487, "loss_xval": 0.034912109375, "num_input_tokens_seen": 21219840, "step": 307 }, { "epoch": 19.25, "grad_norm": 259.0824739486409, "learning_rate": 5e-05, "loss": 0.3807, "num_input_tokens_seen": 21291520, "step": 308 }, { "epoch": 19.25, "loss": 0.3581124544143677, "loss_ce": 0.06514371186494827, "loss_xval": 0.29296875, "num_input_tokens_seen": 21291520, "step": 308 }, { "epoch": 19.3125, "grad_norm": 104.19307087054078, "learning_rate": 5e-05, "loss": 0.1216, "num_input_tokens_seen": 21363072, "step": 309 }, { "epoch": 19.3125, "loss": 0.11511696875095367, "loss_ce": 0.056279078125953674, "loss_xval": 0.058837890625, "num_input_tokens_seen": 21363072, "step": 309 }, { "epoch": 19.375, "grad_norm": 169.93634652801455, "learning_rate": 5e-05, "loss": 0.2042, "num_input_tokens_seen": 21434624, "step": 310 }, { "epoch": 19.375, "loss": 0.22216372191905975, "loss_ce": 0.05419497564435005, "loss_xval": 0.16796875, "num_input_tokens_seen": 21434624, "step": 310 }, { "epoch": 19.4375, "grad_norm": 203.94069831695467, "learning_rate": 5e-05, "loss": 0.2734, "num_input_tokens_seen": 21506304, "step": 311 }, { "epoch": 19.4375, "loss": 0.2772751450538635, "loss_ce": 0.06438452750444412, "loss_xval": 0.212890625, "num_input_tokens_seen": 21506304, "step": 311 }, { "epoch": 19.5, "grad_norm": 18.024055507046423, "learning_rate": 5e-05, "loss": 0.0643, "num_input_tokens_seen": 21577920, "step": 312 }, { "epoch": 19.5, "loss": 0.06432458758354187, "loss_ce": 0.05309411510825157, "loss_xval": 0.01123046875, "num_input_tokens_seen": 21577920, "step": 312 }, { "epoch": 19.5625, "grad_norm": 184.21039649224565, "learning_rate": 5e-05, "loss": 0.237, "num_input_tokens_seen": 21649600, "step": 313 }, { "epoch": 19.5625, "loss": 0.2161283791065216, "loss_ce": 0.0559721365571022, "loss_xval": 0.16015625, "num_input_tokens_seen": 21649600, "step": 313 }, { "epoch": 19.625, "grad_norm": 108.20152056306442, "learning_rate": 5e-05, "loss": 0.1182, "num_input_tokens_seen": 21721408, "step": 314 }, { "epoch": 19.625, "loss": 0.12597306072711945, "loss_ce": 0.055660564452409744, "loss_xval": 0.0703125, "num_input_tokens_seen": 21721408, "step": 314 }, { "epoch": 19.6875, "grad_norm": 83.49633562498352, "learning_rate": 5e-05, "loss": 0.1044, "num_input_tokens_seen": 21793024, "step": 315 }, { "epoch": 19.6875, "loss": 0.1037326231598854, "loss_ce": 0.0629611387848854, "loss_xval": 0.040771484375, "num_input_tokens_seen": 21793024, "step": 315 }, { "epoch": 19.75, "grad_norm": 166.61290953107724, "learning_rate": 5e-05, "loss": 0.2003, "num_input_tokens_seen": 21864640, "step": 316 }, { "epoch": 19.75, "loss": 0.21483193337917328, "loss_ce": 0.05076942965388298, "loss_xval": 0.1640625, "num_input_tokens_seen": 21864640, "step": 316 }, { "epoch": 19.8125, "grad_norm": 71.69323940555286, "learning_rate": 5e-05, "loss": 0.0875, "num_input_tokens_seen": 21936320, "step": 317 }, { "epoch": 19.8125, "loss": 0.08265180885791779, "loss_ce": 0.048960406333208084, "loss_xval": 0.03369140625, "num_input_tokens_seen": 21936320, "step": 317 }, { "epoch": 19.875, "grad_norm": 82.73207269507293, "learning_rate": 5e-05, "loss": 0.1014, "num_input_tokens_seen": 22008000, "step": 318 }, { "epoch": 19.875, "loss": 0.09214404225349426, "loss_ce": 0.05527880787849426, "loss_xval": 0.036865234375, "num_input_tokens_seen": 22008000, "step": 318 }, { "epoch": 19.9375, "grad_norm": 129.00645154565726, "learning_rate": 5e-05, "loss": 0.1439, "num_input_tokens_seen": 22079744, "step": 319 }, { "epoch": 19.9375, "loss": 0.1347639113664627, "loss_ce": 0.051267821341753006, "loss_xval": 0.08349609375, "num_input_tokens_seen": 22079744, "step": 319 }, { "epoch": 20.0, "grad_norm": 53.870059829152936, "learning_rate": 5e-05, "loss": 0.0771, "num_input_tokens_seen": 22138816, "step": 320 }, { "epoch": 20.0, "loss": 0.07210846245288849, "loss_ce": 0.05465241149067879, "loss_xval": 0.0174560546875, "num_input_tokens_seen": 22138816, "step": 320 }, { "epoch": 20.0625, "grad_norm": 42.41968507185895, "learning_rate": 5e-05, "loss": 0.0691, "num_input_tokens_seen": 22210496, "step": 321 }, { "epoch": 20.0625, "loss": 0.06588022410869598, "loss_ce": 0.04927866533398628, "loss_xval": 0.0166015625, "num_input_tokens_seen": 22210496, "step": 321 }, { "epoch": 20.125, "grad_norm": 81.37358201753432, "learning_rate": 5e-05, "loss": 0.0889, "num_input_tokens_seen": 22282304, "step": 322 }, { "epoch": 20.125, "loss": 0.08118662238121033, "loss_ce": 0.05079111456871033, "loss_xval": 0.0303955078125, "num_input_tokens_seen": 22282304, "step": 322 }, { "epoch": 20.1875, "grad_norm": 31.96465123047603, "learning_rate": 5e-05, "loss": 0.0593, "num_input_tokens_seen": 22353856, "step": 323 }, { "epoch": 20.1875, "loss": 0.05360583961009979, "loss_ce": 0.04493884742259979, "loss_xval": 0.0086669921875, "num_input_tokens_seen": 22353856, "step": 323 }, { "epoch": 20.25, "grad_norm": 54.30901644053265, "learning_rate": 5e-05, "loss": 0.074, "num_input_tokens_seen": 22412928, "step": 324 }, { "epoch": 20.25, "loss": 0.07150323688983917, "loss_ce": 0.044525694102048874, "loss_xval": 0.0269775390625, "num_input_tokens_seen": 22412928, "step": 324 }, { "epoch": 20.3125, "grad_norm": 100.10696752851614, "learning_rate": 5e-05, "loss": 0.1114, "num_input_tokens_seen": 22484544, "step": 325 }, { "epoch": 20.3125, "loss": 0.11652743816375732, "loss_ce": 0.04767978563904762, "loss_xval": 0.06884765625, "num_input_tokens_seen": 22484544, "step": 325 }, { "epoch": 20.375, "grad_norm": 86.5660911617045, "learning_rate": 5e-05, "loss": 0.0969, "num_input_tokens_seen": 22556224, "step": 326 }, { "epoch": 20.375, "loss": 0.09553231298923492, "loss_ce": 0.048169028013944626, "loss_xval": 0.04736328125, "num_input_tokens_seen": 22556224, "step": 326 }, { "epoch": 20.4375, "grad_norm": 6.751561284312927, "learning_rate": 5e-05, "loss": 0.0512, "num_input_tokens_seen": 22627904, "step": 327 }, { "epoch": 20.4375, "loss": 0.0450252965092659, "loss_ce": 0.0397762730717659, "loss_xval": 0.0052490234375, "num_input_tokens_seen": 22627904, "step": 327 }, { "epoch": 20.5, "grad_norm": 102.19788973198584, "learning_rate": 5e-05, "loss": 0.1133, "num_input_tokens_seen": 22699520, "step": 328 }, { "epoch": 20.5, "loss": 0.10728923976421356, "loss_ce": 0.049672048538923264, "loss_xval": 0.0576171875, "num_input_tokens_seen": 22699520, "step": 328 }, { "epoch": 20.5625, "grad_norm": 129.43090976643876, "learning_rate": 5e-05, "loss": 0.1355, "num_input_tokens_seen": 22771136, "step": 329 }, { "epoch": 20.5625, "loss": 0.1352676898241043, "loss_ce": 0.04054112359881401, "loss_xval": 0.0947265625, "num_input_tokens_seen": 22771136, "step": 329 }, { "epoch": 20.625, "grad_norm": 24.348588574348376, "learning_rate": 5e-05, "loss": 0.0479, "num_input_tokens_seen": 22842688, "step": 330 }, { "epoch": 20.625, "loss": 0.04819801449775696, "loss_ce": 0.04111793637275696, "loss_xval": 0.007080078125, "num_input_tokens_seen": 22842688, "step": 330 }, { "epoch": 20.6875, "grad_norm": 138.02485486652458, "learning_rate": 5e-05, "loss": 0.1444, "num_input_tokens_seen": 22914368, "step": 331 }, { "epoch": 20.6875, "loss": 0.1612803339958191, "loss_ce": 0.04116315022110939, "loss_xval": 0.1201171875, "num_input_tokens_seen": 22914368, "step": 331 }, { "epoch": 20.75, "grad_norm": 177.826118184589, "learning_rate": 5e-05, "loss": 0.2107, "num_input_tokens_seen": 22986112, "step": 332 }, { "epoch": 20.75, "loss": 0.20077544450759888, "loss_ce": 0.039642639458179474, "loss_xval": 0.1611328125, "num_input_tokens_seen": 22986112, "step": 332 }, { "epoch": 20.8125, "grad_norm": 32.575859879051436, "learning_rate": 5e-05, "loss": 0.0547, "num_input_tokens_seen": 23057728, "step": 333 }, { "epoch": 20.8125, "loss": 0.053013481199741364, "loss_ce": 0.036167778074741364, "loss_xval": 0.016845703125, "num_input_tokens_seen": 23057728, "step": 333 }, { "epoch": 20.875, "grad_norm": 145.04876372864413, "learning_rate": 5e-05, "loss": 0.1634, "num_input_tokens_seen": 23129344, "step": 334 }, { "epoch": 20.875, "loss": 0.1619020253419876, "loss_ce": 0.04129656031727791, "loss_xval": 0.12060546875, "num_input_tokens_seen": 23129344, "step": 334 }, { "epoch": 20.9375, "grad_norm": 168.3578247321365, "learning_rate": 5e-05, "loss": 0.1975, "num_input_tokens_seen": 23188416, "step": 335 }, { "epoch": 20.9375, "loss": 0.1955825239419937, "loss_ce": 0.041285645216703415, "loss_xval": 0.154296875, "num_input_tokens_seen": 23188416, "step": 335 }, { "epoch": 21.0, "grad_norm": 12.803234575367828, "learning_rate": 5e-05, "loss": 0.0448, "num_input_tokens_seen": 23260160, "step": 336 }, { "epoch": 21.0, "loss": 0.04127725958824158, "loss_ce": 0.03199991583824158, "loss_xval": 0.00927734375, "num_input_tokens_seen": 23260160, "step": 336 }, { "epoch": 21.0625, "grad_norm": 159.70923416573038, "learning_rate": 5e-05, "loss": 0.1813, "num_input_tokens_seen": 23331712, "step": 337 }, { "epoch": 21.0625, "loss": 0.1771336793899536, "loss_ce": 0.034555550664663315, "loss_xval": 0.142578125, "num_input_tokens_seen": 23331712, "step": 337 }, { "epoch": 21.125, "grad_norm": 178.92852100380634, "learning_rate": 5e-05, "loss": 0.2198, "num_input_tokens_seen": 23403392, "step": 338 }, { "epoch": 21.125, "loss": 0.22689706087112427, "loss_ce": 0.03451424837112427, "loss_xval": 0.1923828125, "num_input_tokens_seen": 23403392, "step": 338 }, { "epoch": 21.1875, "grad_norm": 58.222787015758534, "learning_rate": 5e-05, "loss": 0.0678, "num_input_tokens_seen": 23474944, "step": 339 }, { "epoch": 21.1875, "loss": 0.060029055923223495, "loss_ce": 0.038422610610723495, "loss_xval": 0.0216064453125, "num_input_tokens_seen": 23474944, "step": 339 }, { "epoch": 21.25, "grad_norm": 85.83316430272406, "learning_rate": 5e-05, "loss": 0.0826, "num_input_tokens_seen": 23546560, "step": 340 }, { "epoch": 21.25, "loss": 0.092320516705513, "loss_ce": 0.0334826223552227, "loss_xval": 0.058837890625, "num_input_tokens_seen": 23546560, "step": 340 }, { "epoch": 21.3125, "grad_norm": 153.34178081295283, "learning_rate": 5e-05, "loss": 0.1732, "num_input_tokens_seen": 23605632, "step": 341 }, { "epoch": 21.3125, "loss": 0.18945997953414917, "loss_ce": 0.03223340958356857, "loss_xval": 0.1572265625, "num_input_tokens_seen": 23605632, "step": 341 }, { "epoch": 21.375, "grad_norm": 113.06064593355569, "learning_rate": 5e-05, "loss": 0.1126, "num_input_tokens_seen": 23652224, "step": 342 }, { "epoch": 21.375, "loss": 0.114508718252182, "loss_ce": 0.03150090202689171, "loss_xval": 0.0830078125, "num_input_tokens_seen": 23652224, "step": 342 }, { "epoch": 21.4375, "grad_norm": 9.502862995495352, "learning_rate": 5e-05, "loss": 0.0391, "num_input_tokens_seen": 23724032, "step": 343 }, { "epoch": 21.4375, "loss": 0.041819460690021515, "loss_ce": 0.032542116940021515, "loss_xval": 0.00927734375, "num_input_tokens_seen": 23724032, "step": 343 }, { "epoch": 21.5, "grad_norm": 93.54614585882905, "learning_rate": 5e-05, "loss": 0.0871, "num_input_tokens_seen": 23783104, "step": 344 }, { "epoch": 21.5, "loss": 0.07588471472263336, "loss_ce": 0.027300726622343063, "loss_xval": 0.048583984375, "num_input_tokens_seen": 23783104, "step": 344 }, { "epoch": 21.5625, "grad_norm": 125.48052241576953, "learning_rate": 5e-05, "loss": 0.1224, "num_input_tokens_seen": 23854720, "step": 345 }, { "epoch": 21.5625, "loss": 0.12017424404621124, "loss_ce": 0.02886565402150154, "loss_xval": 0.09130859375, "num_input_tokens_seen": 23854720, "step": 345 }, { "epoch": 21.625, "grad_norm": 57.39062178859967, "learning_rate": 5e-05, "loss": 0.0623, "num_input_tokens_seen": 23926336, "step": 346 }, { "epoch": 21.625, "loss": 0.06358948349952698, "loss_ce": 0.03258362039923668, "loss_xval": 0.031005859375, "num_input_tokens_seen": 23926336, "step": 346 }, { "epoch": 21.6875, "grad_norm": 42.191072428656106, "learning_rate": 5e-05, "loss": 0.0509, "num_input_tokens_seen": 23997952, "step": 347 }, { "epoch": 21.6875, "loss": 0.04895569011569023, "loss_ce": 0.03375793620944023, "loss_xval": 0.01519775390625, "num_input_tokens_seen": 23997952, "step": 347 }, { "epoch": 21.75, "grad_norm": 91.60524796764251, "learning_rate": 5e-05, "loss": 0.0813, "num_input_tokens_seen": 24069696, "step": 348 }, { "epoch": 21.75, "loss": 0.07642795890569687, "loss_ce": 0.02808811329305172, "loss_xval": 0.04833984375, "num_input_tokens_seen": 24069696, "step": 348 }, { "epoch": 21.8125, "grad_norm": 65.79991588306397, "learning_rate": 5e-05, "loss": 0.0599, "num_input_tokens_seen": 24141312, "step": 349 }, { "epoch": 21.8125, "loss": 0.04916411638259888, "loss_ce": 0.023041069507598877, "loss_xval": 0.026123046875, "num_input_tokens_seen": 24141312, "step": 349 }, { "epoch": 21.875, "grad_norm": 7.238730719380323, "learning_rate": 5e-05, "loss": 0.0391, "num_input_tokens_seen": 24212928, "step": 350 }, { "epoch": 21.875, "loss": 0.0389084629714489, "loss_ce": 0.0302414707839489, "loss_xval": 0.0086669921875, "num_input_tokens_seen": 24212928, "step": 350 }, { "epoch": 21.9375, "grad_norm": 67.62441618992753, "learning_rate": 5e-05, "loss": 0.058, "num_input_tokens_seen": 24259584, "step": 351 }, { "epoch": 21.9375, "loss": 0.061608459800481796, "loss_ce": 0.025231506675481796, "loss_xval": 0.036376953125, "num_input_tokens_seen": 24259584, "step": 351 }, { "epoch": 22.0, "grad_norm": 78.57853165326333, "learning_rate": 5e-05, "loss": 0.0642, "num_input_tokens_seen": 24331200, "step": 352 }, { "epoch": 22.0, "loss": 0.056535474956035614, "loss_ce": 0.022355787456035614, "loss_xval": 0.0341796875, "num_input_tokens_seen": 24331200, "step": 352 }, { "epoch": 22.0625, "grad_norm": 48.73817646103603, "learning_rate": 5e-05, "loss": 0.0446, "num_input_tokens_seen": 24377920, "step": 353 }, { "epoch": 22.0625, "loss": 0.04055093228816986, "loss_ce": 0.02523110806941986, "loss_xval": 0.01531982421875, "num_input_tokens_seen": 24377920, "step": 353 }, { "epoch": 22.125, "grad_norm": 22.41809225651893, "learning_rate": 5e-05, "loss": 0.0334, "num_input_tokens_seen": 24437120, "step": 354 }, { "epoch": 22.125, "loss": 0.03857526183128357, "loss_ce": 0.02502545714378357, "loss_xval": 0.0135498046875, "num_input_tokens_seen": 24437120, "step": 354 }, { "epoch": 22.1875, "grad_norm": 21.286114934711435, "learning_rate": 5e-05, "loss": 0.0337, "num_input_tokens_seen": 24508800, "step": 355 }, { "epoch": 22.1875, "loss": 0.03242785111069679, "loss_ce": 0.024066034704446793, "loss_xval": 0.00836181640625, "num_input_tokens_seen": 24508800, "step": 355 }, { "epoch": 22.25, "grad_norm": 27.768584251920654, "learning_rate": 5e-05, "loss": 0.0346, "num_input_tokens_seen": 24580480, "step": 356 }, { "epoch": 22.25, "loss": 0.03272373229265213, "loss_ce": 0.02051670290529728, "loss_xval": 0.01220703125, "num_input_tokens_seen": 24580480, "step": 356 }, { "epoch": 22.3125, "grad_norm": 33.0264274440666, "learning_rate": 5e-05, "loss": 0.0394, "num_input_tokens_seen": 24652096, "step": 357 }, { "epoch": 22.3125, "loss": 0.03478675335645676, "loss_ce": 0.023434214293956757, "loss_xval": 0.0113525390625, "num_input_tokens_seen": 24652096, "step": 357 }, { "epoch": 22.375, "grad_norm": 30.680450549270756, "learning_rate": 5e-05, "loss": 0.0307, "num_input_tokens_seen": 24723712, "step": 358 }, { "epoch": 22.375, "loss": 0.0311301089823246, "loss_ce": 0.0179465152323246, "loss_xval": 0.01318359375, "num_input_tokens_seen": 24723712, "step": 358 }, { "epoch": 22.4375, "grad_norm": 1.6152731376540712, "learning_rate": 5e-05, "loss": 0.0256, "num_input_tokens_seen": 24795264, "step": 359 }, { "epoch": 22.4375, "loss": 0.02527458965778351, "loss_ce": 0.02133782207965851, "loss_xval": 0.003936767578125, "num_input_tokens_seen": 24795264, "step": 359 }, { "epoch": 22.5, "grad_norm": 39.24958933219172, "learning_rate": 5e-05, "loss": 0.035, "num_input_tokens_seen": 24866880, "step": 360 }, { "epoch": 22.5, "loss": 0.04027906805276871, "loss_ce": 0.021114028990268707, "loss_xval": 0.0191650390625, "num_input_tokens_seen": 24866880, "step": 360 }, { "epoch": 22.5625, "grad_norm": 49.680752477148026, "learning_rate": 5e-05, "loss": 0.0416, "num_input_tokens_seen": 24938432, "step": 361 }, { "epoch": 22.5625, "loss": 0.04497775062918663, "loss_ce": 0.02337130531668663, "loss_xval": 0.0216064453125, "num_input_tokens_seen": 24938432, "step": 361 }, { "epoch": 22.625, "grad_norm": 30.514751566658266, "learning_rate": 5e-05, "loss": 0.0308, "num_input_tokens_seen": 25010112, "step": 362 }, { "epoch": 22.625, "loss": 0.03016595169901848, "loss_ce": 0.015151304192841053, "loss_xval": 0.0150146484375, "num_input_tokens_seen": 25010112, "step": 362 }, { "epoch": 22.6875, "grad_norm": 3.0108457596333102, "learning_rate": 5e-05, "loss": 0.0265, "num_input_tokens_seen": 25081728, "step": 363 }, { "epoch": 22.6875, "loss": 0.026027997955679893, "loss_ce": 0.017910322174429893, "loss_xval": 0.00811767578125, "num_input_tokens_seen": 25081728, "step": 363 }, { "epoch": 22.75, "grad_norm": 34.720989573408396, "learning_rate": 5e-05, "loss": 0.0276, "num_input_tokens_seen": 25153280, "step": 364 }, { "epoch": 22.75, "loss": 0.03153940662741661, "loss_ce": 0.014937843196094036, "loss_xval": 0.0166015625, "num_input_tokens_seen": 25153280, "step": 364 }, { "epoch": 22.8125, "grad_norm": 54.55908300103559, "learning_rate": 5e-05, "loss": 0.0403, "num_input_tokens_seen": 25212416, "step": 365 }, { "epoch": 22.8125, "loss": 0.04323313385248184, "loss_ce": 0.013936257921159267, "loss_xval": 0.029296875, "num_input_tokens_seen": 25212416, "step": 365 }, { "epoch": 22.875, "grad_norm": 58.489593389674056, "learning_rate": 5e-05, "loss": 0.0417, "num_input_tokens_seen": 25271488, "step": 366 }, { "epoch": 22.875, "loss": 0.04008274897933006, "loss_ce": 0.014081771485507488, "loss_xval": 0.0260009765625, "num_input_tokens_seen": 25271488, "step": 366 }, { "epoch": 22.9375, "grad_norm": 48.24943727129867, "learning_rate": 5e-05, "loss": 0.0368, "num_input_tokens_seen": 25330688, "step": 367 }, { "epoch": 22.9375, "loss": 0.041829679161310196, "loss_ce": 0.018025968223810196, "loss_xval": 0.0238037109375, "num_input_tokens_seen": 25330688, "step": 367 }, { "epoch": 23.0, "grad_norm": 28.82296636692516, "learning_rate": 5e-05, "loss": 0.0284, "num_input_tokens_seen": 25402240, "step": 368 }, { "epoch": 23.0, "loss": 0.026926452293992043, "loss_ce": 0.017343932762742043, "loss_xval": 0.00958251953125, "num_input_tokens_seen": 25402240, "step": 368 }, { "epoch": 23.0625, "grad_norm": 3.7841073776005842, "learning_rate": 5e-05, "loss": 0.019, "num_input_tokens_seen": 25461312, "step": 369 }, { "epoch": 23.0625, "loss": 0.018005449324846268, "loss_ce": 0.010284501127898693, "loss_xval": 0.007720947265625, "num_input_tokens_seen": 25461312, "step": 369 }, { "epoch": 23.125, "grad_norm": 35.588070766280204, "learning_rate": 5e-05, "loss": 0.0255, "num_input_tokens_seen": 25532992, "step": 370 }, { "epoch": 23.125, "loss": 0.02573539689183235, "loss_ce": 0.012612837366759777, "loss_xval": 0.01312255859375, "num_input_tokens_seen": 25532992, "step": 370 }, { "epoch": 23.1875, "grad_norm": 65.26191511837887, "learning_rate": 5e-05, "loss": 0.0442, "num_input_tokens_seen": 25592192, "step": 371 }, { "epoch": 23.1875, "loss": 0.04462482035160065, "loss_ce": 0.010689273476600647, "loss_xval": 0.033935546875, "num_input_tokens_seen": 25592192, "step": 371 }, { "epoch": 23.25, "grad_norm": 97.58296827423466, "learning_rate": 5e-05, "loss": 0.0662, "num_input_tokens_seen": 25651200, "step": 372 }, { "epoch": 23.25, "loss": 0.06839506328105927, "loss_ce": 0.010533735156059265, "loss_xval": 0.057861328125, "num_input_tokens_seen": 25651200, "step": 372 }, { "epoch": 23.3125, "grad_norm": 151.49134967464235, "learning_rate": 5e-05, "loss": 0.1385, "num_input_tokens_seen": 25722880, "step": 373 }, { "epoch": 23.3125, "loss": 0.14277715981006622, "loss_ce": 0.009964662604033947, "loss_xval": 0.1328125, "num_input_tokens_seen": 25722880, "step": 373 }, { "epoch": 23.375, "grad_norm": 243.7000987041097, "learning_rate": 5e-05, "loss": 0.339, "num_input_tokens_seen": 25794560, "step": 374 }, { "epoch": 23.375, "loss": 0.34568315744400024, "loss_ce": 0.009745652787387371, "loss_xval": 0.3359375, "num_input_tokens_seen": 25794560, "step": 374 }, { "epoch": 23.4375, "grad_norm": 387.16968867611166, "learning_rate": 5e-05, "loss": 0.8597, "num_input_tokens_seen": 25866304, "step": 375 }, { "epoch": 23.4375, "loss": 0.8643907308578491, "loss_ce": 0.008921988308429718, "loss_xval": 0.85546875, "num_input_tokens_seen": 25866304, "step": 375 }, { "epoch": 23.5, "grad_norm": 523.2692863493334, "learning_rate": 5e-05, "loss": 1.5678, "num_input_tokens_seen": 25925312, "step": 376 }, { "epoch": 23.5, "loss": 1.5135647058486938, "loss_ce": 0.013564717024564743, "loss_xval": 1.5, "num_input_tokens_seen": 25925312, "step": 376 }, { "epoch": 23.5625, "grad_norm": 482.0184380618385, "learning_rate": 5e-05, "loss": 1.4084, "num_input_tokens_seen": 25984512, "step": 377 }, { "epoch": 23.5625, "loss": 1.487014889717102, "loss_ce": 0.018264926970005035, "loss_xval": 1.46875, "num_input_tokens_seen": 25984512, "step": 377 }, { "epoch": 23.625, "grad_norm": 205.0593908826293, "learning_rate": 5e-05, "loss": 0.3361, "num_input_tokens_seen": 26056192, "step": 378 }, { "epoch": 23.625, "loss": 0.34420228004455566, "loss_ce": 0.02584289014339447, "loss_xval": 0.318359375, "num_input_tokens_seen": 26056192, "step": 378 }, { "epoch": 23.6875, "grad_norm": 132.83934380697207, "learning_rate": 5e-05, "loss": 0.2223, "num_input_tokens_seen": 26127872, "step": 379 }, { "epoch": 23.6875, "loss": 0.2154698669910431, "loss_ce": 0.04457143321633339, "loss_xval": 0.1708984375, "num_input_tokens_seen": 26127872, "step": 379 }, { "epoch": 23.75, "grad_norm": 306.0324430341869, "learning_rate": 5e-05, "loss": 0.8102, "num_input_tokens_seen": 26187008, "step": 380 }, { "epoch": 23.75, "loss": 0.7603037357330322, "loss_ce": 0.053272493183612823, "loss_xval": 0.70703125, "num_input_tokens_seen": 26187008, "step": 380 }, { "epoch": 23.8125, "grad_norm": 243.68404104415083, "learning_rate": 5e-05, "loss": 0.5698, "num_input_tokens_seen": 26258560, "step": 381 }, { "epoch": 23.8125, "loss": 0.5247402787208557, "loss_ce": 0.04622465744614601, "loss_xval": 0.478515625, "num_input_tokens_seen": 26258560, "step": 381 }, { "epoch": 23.875, "grad_norm": 37.919739354322424, "learning_rate": 5e-05, "loss": 0.0741, "num_input_tokens_seen": 26330368, "step": 382 }, { "epoch": 23.875, "loss": 0.07584744691848755, "loss_ce": 0.03605252504348755, "loss_xval": 0.039794921875, "num_input_tokens_seen": 26330368, "step": 382 }, { "epoch": 23.9375, "grad_norm": 191.36875903807635, "learning_rate": 5e-05, "loss": 0.3273, "num_input_tokens_seen": 26401920, "step": 383 }, { "epoch": 23.9375, "loss": 0.33099889755249023, "loss_ce": 0.026311399415135384, "loss_xval": 0.3046875, "num_input_tokens_seen": 26401920, "step": 383 }, { "epoch": 24.0, "grad_norm": 242.1428011928719, "learning_rate": 5e-05, "loss": 0.4589, "num_input_tokens_seen": 26473664, "step": 384 }, { "epoch": 24.0, "loss": 0.44113773107528687, "loss_ce": 0.023168986663222313, "loss_xval": 0.41796875, "num_input_tokens_seen": 26473664, "step": 384 }, { "epoch": 24.0625, "grad_norm": 30.636740642057283, "learning_rate": 5e-05, "loss": 0.0483, "num_input_tokens_seen": 26545344, "step": 385 }, { "epoch": 24.0625, "loss": 0.04922012984752655, "loss_ce": 0.0277357567101717, "loss_xval": 0.021484375, "num_input_tokens_seen": 26545344, "step": 385 }, { "epoch": 24.125, "grad_norm": 184.2297319008558, "learning_rate": 5e-05, "loss": 0.3037, "num_input_tokens_seen": 26617024, "step": 386 }, { "epoch": 24.125, "loss": 0.30065447092056274, "loss_ce": 0.0311232078820467, "loss_xval": 0.26953125, "num_input_tokens_seen": 26617024, "step": 386 }, { "epoch": 24.1875, "grad_norm": 148.02728447044228, "learning_rate": 5e-05, "loss": 0.2284, "num_input_tokens_seen": 26676096, "step": 387 }, { "epoch": 24.1875, "loss": 0.20117712020874023, "loss_ce": 0.026372438296675682, "loss_xval": 0.1748046875, "num_input_tokens_seen": 26676096, "step": 387 }, { "epoch": 24.25, "grad_norm": 48.7182625078638, "learning_rate": 5e-05, "loss": 0.0775, "num_input_tokens_seen": 26747712, "step": 388 }, { "epoch": 24.25, "loss": 0.0815923810005188, "loss_ce": 0.0259283185005188, "loss_xval": 0.0556640625, "num_input_tokens_seen": 26747712, "step": 388 }, { "epoch": 24.3125, "grad_norm": 159.84480277227877, "learning_rate": 5e-05, "loss": 0.2301, "num_input_tokens_seen": 26806848, "step": 389 }, { "epoch": 24.3125, "loss": 0.2049870491027832, "loss_ce": 0.025299543514847755, "loss_xval": 0.1796875, "num_input_tokens_seen": 26806848, "step": 389 }, { "epoch": 24.375, "grad_norm": 90.66012488974275, "learning_rate": 5e-05, "loss": 0.1106, "num_input_tokens_seen": 26865984, "step": 390 }, { "epoch": 24.375, "loss": 0.1021375060081482, "loss_ce": 0.022059379145503044, "loss_xval": 0.080078125, "num_input_tokens_seen": 26865984, "step": 390 }, { "epoch": 24.4375, "grad_norm": 91.60761564913557, "learning_rate": 5e-05, "loss": 0.1192, "num_input_tokens_seen": 26925120, "step": 391 }, { "epoch": 24.4375, "loss": 0.10391199588775635, "loss_ce": 0.028716687113046646, "loss_xval": 0.0751953125, "num_input_tokens_seen": 26925120, "step": 391 }, { "epoch": 24.5, "grad_norm": 192.7878292026773, "learning_rate": 5e-05, "loss": 0.3131, "num_input_tokens_seen": 26984320, "step": 392 }, { "epoch": 24.5, "loss": 0.31701016426086426, "loss_ce": 0.022088276222348213, "loss_xval": 0.294921875, "num_input_tokens_seen": 26984320, "step": 392 }, { "epoch": 24.5625, "grad_norm": 111.49127185827285, "learning_rate": 5e-05, "loss": 0.1325, "num_input_tokens_seen": 27055936, "step": 393 }, { "epoch": 24.5625, "loss": 0.14198513329029083, "loss_ce": 0.023332787677645683, "loss_xval": 0.11865234375, "num_input_tokens_seen": 27055936, "step": 393 }, { "epoch": 24.625, "grad_norm": 73.49139601180414, "learning_rate": 5e-05, "loss": 0.0894, "num_input_tokens_seen": 27115072, "step": 394 }, { "epoch": 24.625, "loss": 0.10783809423446655, "loss_ce": 0.026783406734466553, "loss_xval": 0.0810546875, "num_input_tokens_seen": 27115072, "step": 394 }, { "epoch": 24.6875, "grad_norm": 167.08639423336632, "learning_rate": 5e-05, "loss": 0.2457, "num_input_tokens_seen": 27186624, "step": 395 }, { "epoch": 24.6875, "loss": 0.24250024557113647, "loss_ce": 0.022773688659071922, "loss_xval": 0.2197265625, "num_input_tokens_seen": 27186624, "step": 395 }, { "epoch": 24.75, "grad_norm": 57.70197502018322, "learning_rate": 5e-05, "loss": 0.0598, "num_input_tokens_seen": 27245696, "step": 396 }, { "epoch": 24.75, "loss": 0.06535350531339645, "loss_ce": 0.030929675325751305, "loss_xval": 0.034423828125, "num_input_tokens_seen": 27245696, "step": 396 }, { "epoch": 24.8125, "grad_norm": 118.89098168669285, "learning_rate": 5e-05, "loss": 0.1317, "num_input_tokens_seen": 27304768, "step": 397 }, { "epoch": 24.8125, "loss": 0.125514954328537, "loss_ce": 0.019069643691182137, "loss_xval": 0.1064453125, "num_input_tokens_seen": 27304768, "step": 397 }, { "epoch": 24.875, "grad_norm": 169.94360101936388, "learning_rate": 5e-05, "loss": 0.2503, "num_input_tokens_seen": 27376320, "step": 398 }, { "epoch": 24.875, "loss": 0.2484622746706009, "loss_ce": 0.02287633717060089, "loss_xval": 0.2255859375, "num_input_tokens_seen": 27376320, "step": 398 }, { "epoch": 24.9375, "grad_norm": 63.0037046223998, "learning_rate": 5e-05, "loss": 0.0637, "num_input_tokens_seen": 27435456, "step": 399 }, { "epoch": 24.9375, "loss": 0.0754106268286705, "loss_ce": 0.01901414431631565, "loss_xval": 0.056396484375, "num_input_tokens_seen": 27435456, "step": 399 }, { "epoch": 25.0, "grad_norm": 83.98852289312464, "learning_rate": 5e-05, "loss": 0.0777, "num_input_tokens_seen": 27507072, "step": 400 }, { "epoch": 25.0, "loss": 0.07177116721868515, "loss_ce": 0.014642258174717426, "loss_xval": 0.05712890625, "num_input_tokens_seen": 27507072, "step": 400 }, { "epoch": 25.0625, "grad_norm": 96.74470968630307, "learning_rate": 5e-05, "loss": 0.0943, "num_input_tokens_seen": 27578752, "step": 401 }, { "epoch": 25.0625, "loss": 0.09034855663776398, "loss_ce": 0.018571211025118828, "loss_xval": 0.07177734375, "num_input_tokens_seen": 27578752, "step": 401 }, { "epoch": 25.125, "grad_norm": 28.095202197446476, "learning_rate": 5e-05, "loss": 0.0358, "num_input_tokens_seen": 27650368, "step": 402 }, { "epoch": 25.125, "loss": 0.034102149307727814, "loss_ce": 0.018110936507582664, "loss_xval": 0.0159912109375, "num_input_tokens_seen": 27650368, "step": 402 }, { "epoch": 25.1875, "grad_norm": 135.5835375799599, "learning_rate": 5e-05, "loss": 0.1754, "num_input_tokens_seen": 27722048, "step": 403 }, { "epoch": 25.1875, "loss": 0.17733702063560486, "loss_ce": 0.015227638185024261, "loss_xval": 0.162109375, "num_input_tokens_seen": 27722048, "step": 403 }, { "epoch": 25.25, "grad_norm": 82.8180389489728, "learning_rate": 5e-05, "loss": 0.0893, "num_input_tokens_seen": 27768640, "step": 404 }, { "epoch": 25.25, "loss": 0.09928497672080994, "loss_ce": 0.022624818608164787, "loss_xval": 0.07666015625, "num_input_tokens_seen": 27768640, "step": 404 }, { "epoch": 25.3125, "grad_norm": 84.89081123788657, "learning_rate": 5e-05, "loss": 0.0907, "num_input_tokens_seen": 27840256, "step": 405 }, { "epoch": 25.3125, "loss": 0.08249988406896591, "loss_ce": 0.012675662524998188, "loss_xval": 0.06982421875, "num_input_tokens_seen": 27840256, "step": 405 }, { "epoch": 25.375, "grad_norm": 155.09101131078668, "learning_rate": 5e-05, "loss": 0.237, "num_input_tokens_seen": 27899328, "step": 406 }, { "epoch": 25.375, "loss": 0.22836892306804657, "loss_ce": 0.015478293411433697, "loss_xval": 0.212890625, "num_input_tokens_seen": 27899328, "step": 406 }, { "epoch": 25.4375, "grad_norm": 46.687669868373305, "learning_rate": 5e-05, "loss": 0.0399, "num_input_tokens_seen": 27958400, "step": 407 }, { "epoch": 25.4375, "loss": 0.029898736625909805, "loss_ce": 0.01378545444458723, "loss_xval": 0.01611328125, "num_input_tokens_seen": 27958400, "step": 407 }, { "epoch": 25.5, "grad_norm": 106.1049029587732, "learning_rate": 5e-05, "loss": 0.1194, "num_input_tokens_seen": 28030080, "step": 408 }, { "epoch": 25.5, "loss": 0.10383109748363495, "loss_ce": 0.012522506527602673, "loss_xval": 0.09130859375, "num_input_tokens_seen": 28030080, "step": 408 }, { "epoch": 25.5625, "grad_norm": 96.26835574978439, "learning_rate": 5e-05, "loss": 0.1106, "num_input_tokens_seen": 28101696, "step": 409 }, { "epoch": 25.5625, "loss": 0.10796400904655457, "loss_ce": 0.011284318752586842, "loss_xval": 0.0966796875, "num_input_tokens_seen": 28101696, "step": 409 }, { "epoch": 25.625, "grad_norm": 70.72911312420754, "learning_rate": 5e-05, "loss": 0.0676, "num_input_tokens_seen": 28173248, "step": 410 }, { "epoch": 25.625, "loss": 0.06852303445339203, "loss_ce": 0.013347254134714603, "loss_xval": 0.05517578125, "num_input_tokens_seen": 28173248, "step": 410 }, { "epoch": 25.6875, "grad_norm": 142.01249737081872, "learning_rate": 5e-05, "loss": 0.1973, "num_input_tokens_seen": 28232320, "step": 411 }, { "epoch": 25.6875, "loss": 0.18970969319343567, "loss_ce": 0.011975325644016266, "loss_xval": 0.177734375, "num_input_tokens_seen": 28232320, "step": 411 }, { "epoch": 25.75, "grad_norm": 30.679026667393117, "learning_rate": 5e-05, "loss": 0.0309, "num_input_tokens_seen": 28304000, "step": 412 }, { "epoch": 25.75, "loss": 0.03227870911359787, "loss_ce": 0.01360194943845272, "loss_xval": 0.0186767578125, "num_input_tokens_seen": 28304000, "step": 412 }, { "epoch": 25.8125, "grad_norm": 107.5003547141714, "learning_rate": 5e-05, "loss": 0.1255, "num_input_tokens_seen": 28363136, "step": 413 }, { "epoch": 25.8125, "loss": 0.13056893646717072, "loss_ce": 0.010451752692461014, "loss_xval": 0.1201171875, "num_input_tokens_seen": 28363136, "step": 413 }, { "epoch": 25.875, "grad_norm": 108.86184718629066, "learning_rate": 5e-05, "loss": 0.122, "num_input_tokens_seen": 28434816, "step": 414 }, { "epoch": 25.875, "loss": 0.11903096735477448, "loss_ce": 0.0130739351734519, "loss_xval": 0.10595703125, "num_input_tokens_seen": 28434816, "step": 414 }, { "epoch": 25.9375, "grad_norm": 26.118808281545533, "learning_rate": 5e-05, "loss": 0.0236, "num_input_tokens_seen": 28493824, "step": 415 }, { "epoch": 25.9375, "loss": 0.02844841778278351, "loss_ce": 0.008184745907783508, "loss_xval": 0.020263671875, "num_input_tokens_seen": 28493824, "step": 415 }, { "epoch": 26.0, "grad_norm": 128.17331848976858, "learning_rate": 5e-05, "loss": 0.1698, "num_input_tokens_seen": 28565568, "step": 416 }, { "epoch": 26.0, "loss": 0.1835230439901352, "loss_ce": 0.01164803933352232, "loss_xval": 0.171875, "num_input_tokens_seen": 28565568, "step": 416 }, { "epoch": 26.0625, "grad_norm": 77.50573567485803, "learning_rate": 5e-05, "loss": 0.0784, "num_input_tokens_seen": 28637248, "step": 417 }, { "epoch": 26.0625, "loss": 0.07487931102514267, "loss_ce": 0.00847306102514267, "loss_xval": 0.06640625, "num_input_tokens_seen": 28637248, "step": 417 }, { "epoch": 26.125, "grad_norm": 46.99801436201133, "learning_rate": 5e-05, "loss": 0.0385, "num_input_tokens_seen": 28708800, "step": 418 }, { "epoch": 26.125, "loss": 0.02982092648744583, "loss_ce": 0.010411746799945831, "loss_xval": 0.0194091796875, "num_input_tokens_seen": 28708800, "step": 418 }, { "epoch": 26.1875, "grad_norm": 102.79239572865913, "learning_rate": 5e-05, "loss": 0.1212, "num_input_tokens_seen": 28780480, "step": 419 }, { "epoch": 26.1875, "loss": 0.12018196284770966, "loss_ce": 0.00738899689167738, "loss_xval": 0.11279296875, "num_input_tokens_seen": 28780480, "step": 419 }, { "epoch": 26.25, "grad_norm": 37.28295120694252, "learning_rate": 5e-05, "loss": 0.0298, "num_input_tokens_seen": 28852032, "step": 420 }, { "epoch": 26.25, "loss": 0.033372994512319565, "loss_ce": 0.0076161595061421394, "loss_xval": 0.0257568359375, "num_input_tokens_seen": 28852032, "step": 420 }, { "epoch": 26.3125, "grad_norm": 62.84732112776997, "learning_rate": 5e-05, "loss": 0.0532, "num_input_tokens_seen": 28911296, "step": 421 }, { "epoch": 26.3125, "loss": 0.04969404637813568, "loss_ce": 0.00892256386578083, "loss_xval": 0.040771484375, "num_input_tokens_seen": 28911296, "step": 421 }, { "epoch": 26.375, "grad_norm": 63.9094773158963, "learning_rate": 5e-05, "loss": 0.0522, "num_input_tokens_seen": 28982912, "step": 422 }, { "epoch": 26.375, "loss": 0.0577460378408432, "loss_ce": 0.006476507056504488, "loss_xval": 0.05126953125, "num_input_tokens_seen": 28982912, "step": 422 }, { "epoch": 26.4375, "grad_norm": 13.533038052106159, "learning_rate": 5e-05, "loss": 0.0189, "num_input_tokens_seen": 29054464, "step": 423 }, { "epoch": 26.4375, "loss": 0.020495440810918808, "loss_ce": 0.007678058464080095, "loss_xval": 0.0128173828125, "num_input_tokens_seen": 29054464, "step": 423 }, { "epoch": 26.5, "grad_norm": 73.36663422996799, "learning_rate": 5e-05, "loss": 0.078, "num_input_tokens_seen": 29113664, "step": 424 }, { "epoch": 26.5, "loss": 0.06784259527921677, "loss_ce": 0.0077840001322329044, "loss_xval": 0.06005859375, "num_input_tokens_seen": 29113664, "step": 424 }, { "epoch": 26.5625, "grad_norm": 73.33448547583748, "learning_rate": 5e-05, "loss": 0.0695, "num_input_tokens_seen": 29185472, "step": 425 }, { "epoch": 26.5625, "loss": 0.07413546741008759, "loss_ce": 0.0072409361600875854, "loss_xval": 0.06689453125, "num_input_tokens_seen": 29185472, "step": 425 }, { "epoch": 26.625, "grad_norm": 5.3644065133097625, "learning_rate": 5e-05, "loss": 0.0141, "num_input_tokens_seen": 29244672, "step": 426 }, { "epoch": 26.625, "loss": 0.01347382366657257, "loss_ce": 0.006485298741608858, "loss_xval": 0.006988525390625, "num_input_tokens_seen": 29244672, "step": 426 }, { "epoch": 26.6875, "grad_norm": 72.55059146303088, "learning_rate": 5e-05, "loss": 0.0645, "num_input_tokens_seen": 29316288, "step": 427 }, { "epoch": 26.6875, "loss": 0.07400896400213242, "loss_ce": 0.006626151502132416, "loss_xval": 0.0673828125, "num_input_tokens_seen": 29316288, "step": 427 }, { "epoch": 26.75, "grad_norm": 70.20787953586438, "learning_rate": 5e-05, "loss": 0.064, "num_input_tokens_seen": 29387840, "step": 428 }, { "epoch": 26.75, "loss": 0.06021607294678688, "loss_ce": 0.0069934166967868805, "loss_xval": 0.05322265625, "num_input_tokens_seen": 29387840, "step": 428 }, { "epoch": 26.8125, "grad_norm": 11.058123030300388, "learning_rate": 5e-05, "loss": 0.0194, "num_input_tokens_seen": 29459584, "step": 429 }, { "epoch": 26.8125, "loss": 0.02217293716967106, "loss_ce": 0.006059656385332346, "loss_xval": 0.01611328125, "num_input_tokens_seen": 29459584, "step": 429 }, { "epoch": 26.875, "grad_norm": 47.16874607456602, "learning_rate": 5e-05, "loss": 0.0409, "num_input_tokens_seen": 29518720, "step": 430 }, { "epoch": 26.875, "loss": 0.0351792611181736, "loss_ce": 0.0041734022088348866, "loss_xval": 0.031005859375, "num_input_tokens_seen": 29518720, "step": 430 }, { "epoch": 26.9375, "grad_norm": 52.91000873684464, "learning_rate": 5e-05, "loss": 0.0371, "num_input_tokens_seen": 29590400, "step": 431 }, { "epoch": 26.9375, "loss": 0.036790959537029266, "loss_ce": 0.004076115787029266, "loss_xval": 0.03271484375, "num_input_tokens_seen": 29590400, "step": 431 }, { "epoch": 27.0, "grad_norm": 5.600692805884, "learning_rate": 5e-05, "loss": 0.0128, "num_input_tokens_seen": 29661952, "step": 432 }, { "epoch": 27.0, "loss": 0.013566261157393456, "loss_ce": 0.007004982326179743, "loss_xval": 0.006561279296875, "num_input_tokens_seen": 29661952, "step": 432 }, { "epoch": 27.0625, "grad_norm": 48.03720973950958, "learning_rate": 5e-05, "loss": 0.0387, "num_input_tokens_seen": 29733504, "step": 433 }, { "epoch": 27.0625, "loss": 0.045741401612758636, "loss_ce": 0.006923042703419924, "loss_xval": 0.038818359375, "num_input_tokens_seen": 29733504, "step": 433 }, { "epoch": 27.125, "grad_norm": 44.271188473177446, "learning_rate": 5e-05, "loss": 0.0336, "num_input_tokens_seen": 29805248, "step": 434 }, { "epoch": 27.125, "loss": 0.028378482908010483, "loss_ce": 0.004696843214333057, "loss_xval": 0.023681640625, "num_input_tokens_seen": 29805248, "step": 434 }, { "epoch": 27.1875, "grad_norm": 2.5845658910605147, "learning_rate": 5e-05, "loss": 0.0095, "num_input_tokens_seen": 29876992, "step": 435 }, { "epoch": 27.1875, "loss": 0.009095221757888794, "loss_ce": 0.004914313089102507, "loss_xval": 0.004180908203125, "num_input_tokens_seen": 29876992, "step": 435 }, { "epoch": 27.25, "grad_norm": 54.92103062539981, "learning_rate": 5e-05, "loss": 0.0417, "num_input_tokens_seen": 29948544, "step": 436 }, { "epoch": 27.25, "loss": 0.03424233943223953, "loss_ce": 0.0042130411602556705, "loss_xval": 0.030029296875, "num_input_tokens_seen": 29948544, "step": 436 }, { "epoch": 27.3125, "grad_norm": 62.51473236387241, "learning_rate": 5e-05, "loss": 0.0471, "num_input_tokens_seen": 30020160, "step": 437 }, { "epoch": 27.3125, "loss": 0.051075927913188934, "loss_ce": 0.004689209628850222, "loss_xval": 0.04638671875, "num_input_tokens_seen": 30020160, "step": 437 }, { "epoch": 27.375, "grad_norm": 9.014784016109035, "learning_rate": 5e-05, "loss": 0.012, "num_input_tokens_seen": 30091840, "step": 438 }, { "epoch": 27.375, "loss": 0.011384607292711735, "loss_ce": 0.003968835808336735, "loss_xval": 0.007415771484375, "num_input_tokens_seen": 30091840, "step": 438 }, { "epoch": 27.4375, "grad_norm": 44.83606998875092, "learning_rate": 5e-05, "loss": 0.0279, "num_input_tokens_seen": 30151040, "step": 439 }, { "epoch": 27.4375, "loss": 0.029954617843031883, "loss_ce": 0.0046860636211931705, "loss_xval": 0.0252685546875, "num_input_tokens_seen": 30151040, "step": 439 }, { "epoch": 27.5, "grad_norm": 54.64195232542024, "learning_rate": 5e-05, "loss": 0.0437, "num_input_tokens_seen": 30222656, "step": 440 }, { "epoch": 27.5, "loss": 0.046291086822748184, "loss_ce": 0.004543041344732046, "loss_xval": 0.041748046875, "num_input_tokens_seen": 30222656, "step": 440 }, { "epoch": 27.5625, "grad_norm": 29.63204149481485, "learning_rate": 5e-05, "loss": 0.0154, "num_input_tokens_seen": 30294208, "step": 441 }, { "epoch": 27.5625, "loss": 0.018965570256114006, "loss_ce": 0.0038898871280252934, "loss_xval": 0.01507568359375, "num_input_tokens_seen": 30294208, "step": 441 }, { "epoch": 27.625, "grad_norm": 12.016289337786652, "learning_rate": 5e-05, "loss": 0.0084, "num_input_tokens_seen": 30353408, "step": 442 }, { "epoch": 27.625, "loss": 0.008693516254425049, "loss_ce": 0.002590001095086336, "loss_xval": 0.006103515625, "num_input_tokens_seen": 30353408, "step": 442 }, { "epoch": 27.6875, "grad_norm": 49.116878889158016, "learning_rate": 5e-05, "loss": 0.0294, "num_input_tokens_seen": 30412544, "step": 443 }, { "epoch": 27.6875, "loss": 0.023649927228689194, "loss_ce": 0.0030200453475117683, "loss_xval": 0.0206298828125, "num_input_tokens_seen": 30412544, "step": 443 }, { "epoch": 27.75, "grad_norm": 59.35697128703133, "learning_rate": 5e-05, "loss": 0.0421, "num_input_tokens_seen": 30484352, "step": 444 }, { "epoch": 27.75, "loss": 0.03642663359642029, "loss_ce": 0.00297936936840415, "loss_xval": 0.033447265625, "num_input_tokens_seen": 30484352, "step": 444 }, { "epoch": 27.8125, "grad_norm": 41.49363581547399, "learning_rate": 5e-05, "loss": 0.0252, "num_input_tokens_seen": 30555968, "step": 445 }, { "epoch": 27.8125, "loss": 0.021612796932458878, "loss_ce": 0.004156741313636303, "loss_xval": 0.0174560546875, "num_input_tokens_seen": 30555968, "step": 445 }, { "epoch": 27.875, "grad_norm": 4.900475318391131, "learning_rate": 5e-05, "loss": 0.0079, "num_input_tokens_seen": 30627648, "step": 446 }, { "epoch": 27.875, "loss": 0.008801901713013649, "loss_ce": 0.0037970193661749363, "loss_xval": 0.0050048828125, "num_input_tokens_seen": 30627648, "step": 446 }, { "epoch": 27.9375, "grad_norm": 29.975252846709004, "learning_rate": 5e-05, "loss": 0.0153, "num_input_tokens_seen": 30686848, "step": 447 }, { "epoch": 27.9375, "loss": 0.012868822552263737, "loss_ce": 0.003713548881933093, "loss_xval": 0.0091552734375, "num_input_tokens_seen": 30686848, "step": 447 }, { "epoch": 28.0, "grad_norm": 38.18920800664839, "learning_rate": 5e-05, "loss": 0.0244, "num_input_tokens_seen": 30758592, "step": 448 }, { "epoch": 28.0, "loss": 0.025430506095290184, "loss_ce": 0.002847498282790184, "loss_xval": 0.0225830078125, "num_input_tokens_seen": 30758592, "step": 448 }, { "epoch": 28.0625, "grad_norm": 22.862525129157383, "learning_rate": 5e-05, "loss": 0.0111, "num_input_tokens_seen": 30817728, "step": 449 }, { "epoch": 28.0625, "loss": 0.011791002005338669, "loss_ce": 0.0031850445084273815, "loss_xval": 0.00860595703125, "num_input_tokens_seen": 30817728, "step": 449 }, { "epoch": 28.125, "grad_norm": 9.087768044891721, "learning_rate": 5e-05, "loss": 0.0081, "num_input_tokens_seen": 30889280, "step": 450 }, { "epoch": 28.125, "loss": 0.005972502753138542, "loss_ce": 0.003103850409388542, "loss_xval": 0.00286865234375, "num_input_tokens_seen": 30889280, "step": 450 }, { "epoch": 28.1875, "grad_norm": 43.22152195751561, "learning_rate": 5e-05, "loss": 0.0248, "num_input_tokens_seen": 30960960, "step": 451 }, { "epoch": 28.1875, "loss": 0.025609012693166733, "loss_ce": 0.002659793710336089, "loss_xval": 0.02294921875, "num_input_tokens_seen": 30960960, "step": 451 }, { "epoch": 28.25, "grad_norm": 56.557234628273605, "learning_rate": 5e-05, "loss": 0.0364, "num_input_tokens_seen": 31032576, "step": 452 }, { "epoch": 28.25, "loss": 0.03853260725736618, "loss_ce": 0.003376358188688755, "loss_xval": 0.03515625, "num_input_tokens_seen": 31032576, "step": 452 }, { "epoch": 28.3125, "grad_norm": 49.96014160420832, "learning_rate": 5e-05, "loss": 0.0308, "num_input_tokens_seen": 31091584, "step": 453 }, { "epoch": 28.3125, "loss": 0.031207242980599403, "loss_ce": 0.0031310718040913343, "loss_xval": 0.028076171875, "num_input_tokens_seen": 31091584, "step": 453 }, { "epoch": 28.375, "grad_norm": 38.69272253503625, "learning_rate": 5e-05, "loss": 0.0205, "num_input_tokens_seen": 31163200, "step": 454 }, { "epoch": 28.375, "loss": 0.022096609696745872, "loss_ce": 0.002809500088915229, "loss_xval": 0.019287109375, "num_input_tokens_seen": 31163200, "step": 454 }, { "epoch": 28.4375, "grad_norm": 30.798340846655005, "learning_rate": 5e-05, "loss": 0.0167, "num_input_tokens_seen": 31234752, "step": 455 }, { "epoch": 28.4375, "loss": 0.0183558352291584, "loss_ce": 0.0032801516354084015, "loss_xval": 0.01507568359375, "num_input_tokens_seen": 31234752, "step": 455 }, { "epoch": 28.5, "grad_norm": 30.85733710728687, "learning_rate": 5e-05, "loss": 0.0146, "num_input_tokens_seen": 31306496, "step": 456 }, { "epoch": 28.5, "loss": 0.016646770760416985, "loss_ce": 0.0023645448964089155, "loss_xval": 0.0142822265625, "num_input_tokens_seen": 31306496, "step": 456 }, { "epoch": 28.5625, "grad_norm": 26.032608242571797, "learning_rate": 5e-05, "loss": 0.0133, "num_input_tokens_seen": 31365632, "step": 457 }, { "epoch": 28.5625, "loss": 0.01072744745761156, "loss_ce": 0.0023656312841922045, "loss_xval": 0.00836181640625, "num_input_tokens_seen": 31365632, "step": 457 }, { "epoch": 28.625, "grad_norm": 15.7544578200061, "learning_rate": 5e-05, "loss": 0.009, "num_input_tokens_seen": 31437248, "step": 458 }, { "epoch": 28.625, "loss": 0.008288905955851078, "loss_ce": 0.0020633197855204344, "loss_xval": 0.0062255859375, "num_input_tokens_seen": 31437248, "step": 458 }, { "epoch": 28.6875, "grad_norm": 11.918111014755505, "learning_rate": 5e-05, "loss": 0.0063, "num_input_tokens_seen": 31508928, "step": 459 }, { "epoch": 28.6875, "loss": 0.006751437671482563, "loss_ce": 0.002845187671482563, "loss_xval": 0.00390625, "num_input_tokens_seen": 31508928, "step": 459 }, { "epoch": 28.75, "grad_norm": 10.662391058200697, "learning_rate": 5e-05, "loss": 0.0061, "num_input_tokens_seen": 31568064, "step": 460 }, { "epoch": 28.75, "loss": 0.005551275797188282, "loss_ce": 0.0018586487276479602, "loss_xval": 0.003692626953125, "num_input_tokens_seen": 31568064, "step": 460 }, { "epoch": 28.8125, "grad_norm": 9.083266060823656, "learning_rate": 5e-05, "loss": 0.0063, "num_input_tokens_seen": 31639744, "step": 461 }, { "epoch": 28.8125, "loss": 0.007424565963447094, "loss_ce": 0.0026333059649914503, "loss_xval": 0.004791259765625, "num_input_tokens_seen": 31639744, "step": 461 }, { "epoch": 28.875, "grad_norm": 6.863640803613258, "learning_rate": 5e-05, "loss": 0.0051, "num_input_tokens_seen": 31698880, "step": 462 }, { "epoch": 28.875, "loss": 0.0041747791692614555, "loss_ce": 0.0018859605770558119, "loss_xval": 0.002288818359375, "num_input_tokens_seen": 31698880, "step": 462 }, { "epoch": 28.9375, "grad_norm": 2.9356043389315865, "learning_rate": 5e-05, "loss": 0.0058, "num_input_tokens_seen": 31770560, "step": 463 }, { "epoch": 28.9375, "loss": 0.005293050315231085, "loss_ce": 0.0015699057839810848, "loss_xval": 0.00372314453125, "num_input_tokens_seen": 31770560, "step": 463 }, { "epoch": 29.0, "grad_norm": 5.050925803376897, "learning_rate": 5e-05, "loss": 0.0067, "num_input_tokens_seen": 31842112, "step": 464 }, { "epoch": 29.0, "loss": 0.006694018840789795, "loss_ce": 0.001750171068124473, "loss_xval": 0.00494384765625, "num_input_tokens_seen": 31842112, "step": 464 }, { "epoch": 29.0625, "grad_norm": 22.310520898540933, "learning_rate": 5e-05, "loss": 0.0093, "num_input_tokens_seen": 31913728, "step": 465 }, { "epoch": 29.0625, "loss": 0.009156938642263412, "loss_ce": 0.002259966218844056, "loss_xval": 0.00689697265625, "num_input_tokens_seen": 31913728, "step": 465 }, { "epoch": 29.125, "grad_norm": 54.42750478439811, "learning_rate": 5e-05, "loss": 0.0321, "num_input_tokens_seen": 31985344, "step": 466 }, { "epoch": 29.125, "loss": 0.029004469513893127, "loss_ce": 0.0019048596732318401, "loss_xval": 0.027099609375, "num_input_tokens_seen": 31985344, "step": 466 }, { "epoch": 29.1875, "grad_norm": 89.64622913235212, "learning_rate": 5e-05, "loss": 0.0845, "num_input_tokens_seen": 32057024, "step": 467 }, { "epoch": 29.1875, "loss": 0.08670613914728165, "loss_ce": 0.0017451988533139229, "loss_xval": 0.0849609375, "num_input_tokens_seen": 32057024, "step": 467 }, { "epoch": 29.25, "grad_norm": 132.36809969049492, "learning_rate": 5e-05, "loss": 0.1769, "num_input_tokens_seen": 32128768, "step": 468 }, { "epoch": 29.25, "loss": 0.17079618573188782, "loss_ce": 0.0018508683424443007, "loss_xval": 0.1689453125, "num_input_tokens_seen": 32128768, "step": 468 }, { "epoch": 29.3125, "grad_norm": 198.67521722432534, "learning_rate": 5e-05, "loss": 0.3852, "num_input_tokens_seen": 32200320, "step": 469 }, { "epoch": 29.3125, "loss": 0.3854331970214844, "loss_ce": 0.002620699815452099, "loss_xval": 0.3828125, "num_input_tokens_seen": 32200320, "step": 469 }, { "epoch": 29.375, "grad_norm": 276.7772498319319, "learning_rate": 5e-05, "loss": 0.7342, "num_input_tokens_seen": 32272064, "step": 470 }, { "epoch": 29.375, "loss": 0.7204177379608154, "loss_ce": 0.0016677571693435311, "loss_xval": 0.71875, "num_input_tokens_seen": 32272064, "step": 470 }, { "epoch": 29.4375, "grad_norm": 323.3059130938597, "learning_rate": 5e-05, "loss": 1.1, "num_input_tokens_seen": 32343680, "step": 471 }, { "epoch": 29.4375, "loss": 1.1041285991668701, "loss_ce": 0.0025661007966846228, "loss_xval": 1.1015625, "num_input_tokens_seen": 32343680, "step": 471 }, { "epoch": 29.5, "grad_norm": 247.84388629445735, "learning_rate": 5e-05, "loss": 0.7679, "num_input_tokens_seen": 32415296, "step": 472 }, { "epoch": 29.5, "loss": 0.7610682845115662, "loss_ce": 0.011068296618759632, "loss_xval": 0.75, "num_input_tokens_seen": 32415296, "step": 472 }, { "epoch": 29.5625, "grad_norm": 79.33459788940463, "learning_rate": 5e-05, "loss": 0.1329, "num_input_tokens_seen": 32486976, "step": 473 }, { "epoch": 29.5625, "loss": 0.14590966701507568, "loss_ce": 0.01700342260301113, "loss_xval": 0.12890625, "num_input_tokens_seen": 32486976, "step": 473 }, { "epoch": 29.625, "grad_norm": 92.30159962135761, "learning_rate": 5e-05, "loss": 0.1728, "num_input_tokens_seen": 32546112, "step": 474 }, { "epoch": 29.625, "loss": 0.1546037346124649, "loss_ce": 0.020814666524529457, "loss_xval": 0.1337890625, "num_input_tokens_seen": 32546112, "step": 474 }, { "epoch": 29.6875, "grad_norm": 206.2393649928487, "learning_rate": 5e-05, "loss": 0.6537, "num_input_tokens_seen": 32617792, "step": 475 }, { "epoch": 29.6875, "loss": 0.6552282571792603, "loss_ce": 0.022415766492486, "loss_xval": 0.6328125, "num_input_tokens_seen": 32617792, "step": 475 }, { "epoch": 29.75, "grad_norm": 174.12080752050923, "learning_rate": 5e-05, "loss": 0.4548, "num_input_tokens_seen": 32689344, "step": 476 }, { "epoch": 29.75, "loss": 0.4094059467315674, "loss_ce": 0.018780935555696487, "loss_xval": 0.390625, "num_input_tokens_seen": 32689344, "step": 476 }, { "epoch": 29.8125, "grad_norm": 16.614748602973496, "learning_rate": 5e-05, "loss": 0.0486, "num_input_tokens_seen": 32761024, "step": 477 }, { "epoch": 29.8125, "loss": 0.05317946523427963, "loss_ce": 0.010210715234279633, "loss_xval": 0.04296875, "num_input_tokens_seen": 32761024, "step": 477 }, { "epoch": 29.875, "grad_norm": 155.42975100857723, "learning_rate": 5e-05, "loss": 0.2962, "num_input_tokens_seen": 32820224, "step": 478 }, { "epoch": 29.875, "loss": 0.3082675635814667, "loss_ce": 0.011392563581466675, "loss_xval": 0.296875, "num_input_tokens_seen": 32820224, "step": 478 }, { "epoch": 29.9375, "grad_norm": 175.08209465974755, "learning_rate": 5e-05, "loss": 0.4032, "num_input_tokens_seen": 32891904, "step": 479 }, { "epoch": 29.9375, "loss": 0.37343543767929077, "loss_ce": 0.008201051503419876, "loss_xval": 0.365234375, "num_input_tokens_seen": 32891904, "step": 479 }, { "epoch": 30.0, "grad_norm": 72.59815396689685, "learning_rate": 5e-05, "loss": 0.0902, "num_input_tokens_seen": 32963520, "step": 480 }, { "epoch": 30.0, "loss": 0.08620818704366684, "loss_ce": 0.01198943704366684, "loss_xval": 0.07421875, "num_input_tokens_seen": 32963520, "step": 480 }, { "epoch": 30.0625, "grad_norm": 84.90202896547324, "learning_rate": 5e-05, "loss": 0.1233, "num_input_tokens_seen": 33035072, "step": 481 }, { "epoch": 30.0625, "loss": 0.11216428875923157, "loss_ce": 0.02671506628394127, "loss_xval": 0.08544921875, "num_input_tokens_seen": 33035072, "step": 481 }, { "epoch": 30.125, "grad_norm": 185.0124554464278, "learning_rate": 5e-05, "loss": 0.4448, "num_input_tokens_seen": 33094144, "step": 482 }, { "epoch": 30.125, "loss": 0.4436917006969452, "loss_ce": 0.05306670442223549, "loss_xval": 0.390625, "num_input_tokens_seen": 33094144, "step": 482 }, { "epoch": 30.1875, "grad_norm": 160.28780193792343, "learning_rate": 5e-05, "loss": 0.3597, "num_input_tokens_seen": 33165760, "step": 483 }, { "epoch": 30.1875, "loss": 0.40504151582717896, "loss_ce": 0.03590088710188866, "loss_xval": 0.369140625, "num_input_tokens_seen": 33165760, "step": 483 }, { "epoch": 30.25, "grad_norm": 24.348194390421313, "learning_rate": 5e-05, "loss": 0.0453, "num_input_tokens_seen": 33237376, "step": 484 }, { "epoch": 30.25, "loss": 0.04609951376914978, "loss_ce": 0.01948818564414978, "loss_xval": 0.026611328125, "num_input_tokens_seen": 33237376, "step": 484 }, { "epoch": 30.3125, "grad_norm": 115.32401946114567, "learning_rate": 5e-05, "loss": 0.2025, "num_input_tokens_seen": 33309056, "step": 485 }, { "epoch": 30.3125, "loss": 0.1775987148284912, "loss_ce": 0.02037215791642666, "loss_xval": 0.1572265625, "num_input_tokens_seen": 33309056, "step": 485 }, { "epoch": 30.375, "grad_norm": 129.3704814022795, "learning_rate": 5e-05, "loss": 0.2717, "num_input_tokens_seen": 33380736, "step": 486 }, { "epoch": 30.375, "loss": 0.27861320972442627, "loss_ce": 0.026660069823265076, "loss_xval": 0.251953125, "num_input_tokens_seen": 33380736, "step": 486 }, { "epoch": 30.4375, "grad_norm": 18.90543693800268, "learning_rate": 5e-05, "loss": 0.0492, "num_input_tokens_seen": 33452352, "step": 487 }, { "epoch": 30.4375, "loss": 0.05479831621050835, "loss_ce": 0.018177222460508347, "loss_xval": 0.03662109375, "num_input_tokens_seen": 33452352, "step": 487 }, { "epoch": 30.5, "grad_norm": 98.72747960845389, "learning_rate": 5e-05, "loss": 0.1848, "num_input_tokens_seen": 33524032, "step": 488 }, { "epoch": 30.5, "loss": 0.1754639744758606, "loss_ce": 0.020190536975860596, "loss_xval": 0.1552734375, "num_input_tokens_seen": 33524032, "step": 488 }, { "epoch": 30.5625, "grad_norm": 108.46431537751594, "learning_rate": 5e-05, "loss": 0.244, "num_input_tokens_seen": 33595648, "step": 489 }, { "epoch": 30.5625, "loss": 0.24445466697216034, "loss_ce": 0.019845297560095787, "loss_xval": 0.224609375, "num_input_tokens_seen": 33595648, "step": 489 }, { "epoch": 30.625, "grad_norm": 16.58390160532647, "learning_rate": 5e-05, "loss": 0.0316, "num_input_tokens_seen": 33667328, "step": 490 }, { "epoch": 30.625, "loss": 0.027875222265720367, "loss_ce": 0.015485084615647793, "loss_xval": 0.01239013671875, "num_input_tokens_seen": 33667328, "step": 490 }, { "epoch": 30.6875, "grad_norm": 89.30391448559037, "learning_rate": 5e-05, "loss": 0.1477, "num_input_tokens_seen": 33738944, "step": 491 }, { "epoch": 30.6875, "loss": 0.14867781102657318, "loss_ce": 0.01684187538921833, "loss_xval": 0.1318359375, "num_input_tokens_seen": 33738944, "step": 491 }, { "epoch": 30.75, "grad_norm": 76.51561580318129, "learning_rate": 5e-05, "loss": 0.105, "num_input_tokens_seen": 33810688, "step": 492 }, { "epoch": 30.75, "loss": 0.09136239439249039, "loss_ce": 0.016655363142490387, "loss_xval": 0.07470703125, "num_input_tokens_seen": 33810688, "step": 492 }, { "epoch": 30.8125, "grad_norm": 47.40846111493751, "learning_rate": 5e-05, "loss": 0.0548, "num_input_tokens_seen": 33882304, "step": 493 }, { "epoch": 30.8125, "loss": 0.051707200706005096, "loss_ce": 0.015818530693650246, "loss_xval": 0.035888671875, "num_input_tokens_seen": 33882304, "step": 493 }, { "epoch": 30.875, "grad_norm": 116.85312574162306, "learning_rate": 5e-05, "loss": 0.2196, "num_input_tokens_seen": 33953856, "step": 494 }, { "epoch": 30.875, "loss": 0.21689002215862274, "loss_ce": 0.012788456864655018, "loss_xval": 0.2041015625, "num_input_tokens_seen": 33953856, "step": 494 }, { "epoch": 30.9375, "grad_norm": 33.56625863849918, "learning_rate": 5e-05, "loss": 0.066, "num_input_tokens_seen": 34025536, "step": 495 }, { "epoch": 30.9375, "loss": 0.07113610208034515, "loss_ce": 0.016936881467700005, "loss_xval": 0.05419921875, "num_input_tokens_seen": 34025536, "step": 495 }, { "epoch": 31.0, "grad_norm": 89.30015482102085, "learning_rate": 5e-05, "loss": 0.1394, "num_input_tokens_seen": 34097280, "step": 496 }, { "epoch": 31.0, "loss": 0.1428450495004654, "loss_ce": 0.011985674500465393, "loss_xval": 0.130859375, "num_input_tokens_seen": 34097280, "step": 496 }, { "epoch": 31.0625, "grad_norm": 111.67380604143827, "learning_rate": 5e-05, "loss": 0.1921, "num_input_tokens_seen": 34156352, "step": 497 }, { "epoch": 31.0625, "loss": 0.19294708967208862, "loss_ce": 0.01032990776002407, "loss_xval": 0.1826171875, "num_input_tokens_seen": 34156352, "step": 497 }, { "epoch": 31.125, "grad_norm": 15.672979639453287, "learning_rate": 5e-05, "loss": 0.04, "num_input_tokens_seen": 34228096, "step": 498 }, { "epoch": 31.125, "loss": 0.04462538659572601, "loss_ce": 0.010201560333371162, "loss_xval": 0.034423828125, "num_input_tokens_seen": 34228096, "step": 498 }, { "epoch": 31.1875, "grad_norm": 101.23147864018928, "learning_rate": 5e-05, "loss": 0.1685, "num_input_tokens_seen": 34299648, "step": 499 }, { "epoch": 31.1875, "loss": 0.14524954557418823, "loss_ce": 0.007554229814559221, "loss_xval": 0.1376953125, "num_input_tokens_seen": 34299648, "step": 499 }, { "epoch": 31.25, "grad_norm": 79.7314578785406, "learning_rate": 5e-05, "loss": 0.1155, "num_input_tokens_seen": 34371456, "step": 500 }, { "epoch": 31.25, "eval_synth_IoU": 0.00709187425673008, "eval_synth_MAE_x": 0.183349609375, "eval_synth_MAE_y": 0.22174072265625, "eval_synth_NUM_probability": 0.9508986622095108, "eval_synth_inside_bbox": 0.0, "eval_synth_loss": 0.03743727132678032, "eval_synth_loss_ce": 0.0061567522352561355, "eval_synth_loss_xval": 0.031280517578125, "eval_synth_runtime": 60.0762, "eval_synth_samples_per_second": 2.131, "eval_synth_steps_per_second": 0.067, "num_input_tokens_seen": 34371456, "step": 500 }, { "epoch": 31.25, "loss": 0.038767192512750626, "loss_ce": 0.004831646103411913, "loss_xval": 0.033935546875, "num_input_tokens_seen": 34371456, "step": 500 }, { "epoch": 31.3125, "grad_norm": 33.35997221627081, "learning_rate": 5e-05, "loss": 0.0416, "num_input_tokens_seen": 34443136, "step": 501 }, { "epoch": 31.3125, "loss": 0.03074466995894909, "loss_ce": 0.006330606993287802, "loss_xval": 0.0244140625, "num_input_tokens_seen": 34443136, "step": 501 }, { "epoch": 31.375, "grad_norm": 101.3020414172374, "learning_rate": 5e-05, "loss": 0.1878, "num_input_tokens_seen": 34514688, "step": 502 }, { "epoch": 31.375, "loss": 0.18065997958183289, "loss_ce": 0.0039021666161715984, "loss_xval": 0.1767578125, "num_input_tokens_seen": 34514688, "step": 502 }, { "epoch": 31.4375, "grad_norm": 58.21370222284822, "learning_rate": 5e-05, "loss": 0.0698, "num_input_tokens_seen": 34586432, "step": 503 }, { "epoch": 31.4375, "loss": 0.0838276594877243, "loss_ce": 0.004237812012434006, "loss_xval": 0.07958984375, "num_input_tokens_seen": 34586432, "step": 503 }, { "epoch": 31.5, "grad_norm": 41.29086127352125, "learning_rate": 5e-05, "loss": 0.0441, "num_input_tokens_seen": 34658048, "step": 504 }, { "epoch": 31.5, "loss": 0.03785308822989464, "loss_ce": 0.005138244479894638, "loss_xval": 0.03271484375, "num_input_tokens_seen": 34658048, "step": 504 }, { "epoch": 31.5625, "grad_norm": 85.45688443812506, "learning_rate": 5e-05, "loss": 0.1334, "num_input_tokens_seen": 34717184, "step": 505 }, { "epoch": 31.5625, "loss": 0.14185205101966858, "loss_ce": 0.006109867710620165, "loss_xval": 0.1357421875, "num_input_tokens_seen": 34717184, "step": 505 }, { "epoch": 31.625, "grad_norm": 32.42188566199593, "learning_rate": 5e-05, "loss": 0.0356, "num_input_tokens_seen": 34788736, "step": 506 }, { "epoch": 31.625, "loss": 0.03629305213689804, "loss_ce": 0.005287191364914179, "loss_xval": 0.031005859375, "num_input_tokens_seen": 34788736, "step": 506 }, { "epoch": 31.6875, "grad_norm": 49.39589605562242, "learning_rate": 5e-05, "loss": 0.0519, "num_input_tokens_seen": 34860352, "step": 507 }, { "epoch": 31.6875, "loss": 0.05294110253453255, "loss_ce": 0.0028922753408551216, "loss_xval": 0.050048828125, "num_input_tokens_seen": 34860352, "step": 507 }, { "epoch": 31.75, "grad_norm": 75.95056302700068, "learning_rate": 5e-05, "loss": 0.1077, "num_input_tokens_seen": 34931968, "step": 508 }, { "epoch": 31.75, "loss": 0.11320182681083679, "loss_ce": 0.0033385478891432285, "loss_xval": 0.10986328125, "num_input_tokens_seen": 34931968, "step": 508 }, { "epoch": 31.8125, "grad_norm": 19.239734144914703, "learning_rate": 5e-05, "loss": 0.016, "num_input_tokens_seen": 35003520, "step": 509 }, { "epoch": 31.8125, "loss": 0.01861836388707161, "loss_ce": 0.002383012091740966, "loss_xval": 0.0162353515625, "num_input_tokens_seen": 35003520, "step": 509 }, { "epoch": 31.875, "grad_norm": 63.359210101062615, "learning_rate": 5e-05, "loss": 0.0702, "num_input_tokens_seen": 35075200, "step": 510 }, { "epoch": 31.875, "loss": 0.06541571021080017, "loss_ce": 0.0024274298921227455, "loss_xval": 0.06298828125, "num_input_tokens_seen": 35075200, "step": 510 }, { "epoch": 31.9375, "grad_norm": 72.4722047412447, "learning_rate": 5e-05, "loss": 0.0872, "num_input_tokens_seen": 35146944, "step": 511 }, { "epoch": 31.9375, "loss": 0.09188695251941681, "loss_ce": 0.0025314840022474527, "loss_xval": 0.08935546875, "num_input_tokens_seen": 35146944, "step": 511 }, { "epoch": 32.0, "grad_norm": 3.152057996325055, "learning_rate": 5e-05, "loss": 0.0102, "num_input_tokens_seen": 35218560, "step": 512 }, { "epoch": 32.0, "loss": 0.009740998968482018, "loss_ce": 0.002325227949768305, "loss_xval": 0.007415771484375, "num_input_tokens_seen": 35218560, "step": 512 }, { "epoch": 32.0625, "grad_norm": 64.23363309997048, "learning_rate": 5e-05, "loss": 0.0727, "num_input_tokens_seen": 35290112, "step": 513 }, { "epoch": 32.0625, "loss": 0.07529156655073166, "loss_ce": 0.002049379050731659, "loss_xval": 0.0732421875, "num_input_tokens_seen": 35290112, "step": 513 }, { "epoch": 32.125, "grad_norm": 60.79170962146762, "learning_rate": 5e-05, "loss": 0.0633, "num_input_tokens_seen": 35361856, "step": 514 }, { "epoch": 32.125, "loss": 0.062045708298683167, "loss_ce": 0.002231256105005741, "loss_xval": 0.059814453125, "num_input_tokens_seen": 35361856, "step": 514 }, { "epoch": 32.1875, "grad_norm": 3.960201489811377, "learning_rate": 5e-05, "loss": 0.0083, "num_input_tokens_seen": 35433472, "step": 515 }, { "epoch": 32.1875, "loss": 0.008324277587234974, "loss_ce": 0.0019461041083559394, "loss_xval": 0.006378173828125, "num_input_tokens_seen": 35433472, "step": 515 }, { "epoch": 32.25, "grad_norm": 51.90847741220123, "learning_rate": 5e-05, "loss": 0.05, "num_input_tokens_seen": 35505088, "step": 516 }, { "epoch": 32.25, "loss": 0.047266796231269836, "loss_ce": 0.002344920299947262, "loss_xval": 0.044921875, "num_input_tokens_seen": 35505088, "step": 516 }, { "epoch": 32.3125, "grad_norm": 33.40482968325523, "learning_rate": 5e-05, "loss": 0.0274, "num_input_tokens_seen": 35576640, "step": 517 }, { "epoch": 32.3125, "loss": 0.028853023424744606, "loss_ce": 0.0019975542090833187, "loss_xval": 0.02685546875, "num_input_tokens_seen": 35576640, "step": 517 }, { "epoch": 32.375, "grad_norm": 19.400593021580494, "learning_rate": 5e-05, "loss": 0.0125, "num_input_tokens_seen": 35648192, "step": 518 }, { "epoch": 32.375, "loss": 0.013727393001317978, "loss_ce": 0.002252783626317978, "loss_xval": 0.011474609375, "num_input_tokens_seen": 35648192, "step": 518 }, { "epoch": 32.4375, "grad_norm": 46.919959828504645, "learning_rate": 5e-05, "loss": 0.0414, "num_input_tokens_seen": 35719936, "step": 519 }, { "epoch": 32.4375, "loss": 0.034388408064842224, "loss_ce": 0.0019177051726728678, "loss_xval": 0.032470703125, "num_input_tokens_seen": 35719936, "step": 519 }, { "epoch": 32.5, "grad_norm": 25.058903049742, "learning_rate": 5e-05, "loss": 0.0189, "num_input_tokens_seen": 35791680, "step": 520 }, { "epoch": 32.5, "loss": 0.017532430589199066, "loss_ce": 0.00196846597827971, "loss_xval": 0.01556396484375, "num_input_tokens_seen": 35791680, "step": 520 }, { "epoch": 32.5625, "grad_norm": 13.897068822398493, "learning_rate": 5e-05, "loss": 0.0139, "num_input_tokens_seen": 35863232, "step": 521 }, { "epoch": 32.5625, "loss": 0.013923520222306252, "loss_ce": 0.002876157173886895, "loss_xval": 0.01104736328125, "num_input_tokens_seen": 35863232, "step": 521 }, { "epoch": 32.625, "grad_norm": 26.52398400048985, "learning_rate": 5e-05, "loss": 0.0198, "num_input_tokens_seen": 35934784, "step": 522 }, { "epoch": 32.625, "loss": 0.019185151904821396, "loss_ce": 0.0023394490126520395, "loss_xval": 0.016845703125, "num_input_tokens_seen": 35934784, "step": 522 }, { "epoch": 32.6875, "grad_norm": 18.375483995081037, "learning_rate": 5e-05, "loss": 0.0114, "num_input_tokens_seen": 36006464, "step": 523 }, { "epoch": 32.6875, "loss": 0.01426710095256567, "loss_ce": 0.0019379997393116355, "loss_xval": 0.0123291015625, "num_input_tokens_seen": 36006464, "step": 523 }, { "epoch": 32.75, "grad_norm": 6.104764631350985, "learning_rate": 5e-05, "loss": 0.0082, "num_input_tokens_seen": 36078016, "step": 524 }, { "epoch": 32.75, "loss": 0.006929485592991114, "loss_ce": 0.0029622004367411137, "loss_xval": 0.00396728515625, "num_input_tokens_seen": 36078016, "step": 524 }, { "epoch": 32.8125, "grad_norm": 3.738792946367771, "learning_rate": 5e-05, "loss": 0.0067, "num_input_tokens_seen": 36149696, "step": 525 }, { "epoch": 32.8125, "loss": 0.004459495190531015, "loss_ce": 0.0013924787053838372, "loss_xval": 0.0030670166015625, "num_input_tokens_seen": 36149696, "step": 525 }, { "epoch": 32.875, "grad_norm": 4.412211012903223, "learning_rate": 5e-05, "loss": 0.0054, "num_input_tokens_seen": 36221312, "step": 526 }, { "epoch": 32.875, "loss": 0.005546785891056061, "loss_ce": 0.0018236414762213826, "loss_xval": 0.00372314453125, "num_input_tokens_seen": 36221312, "step": 526 }, { "epoch": 32.9375, "grad_norm": 5.271021579259849, "learning_rate": 5e-05, "loss": 0.006, "num_input_tokens_seen": 36280448, "step": 527 }, { "epoch": 32.9375, "loss": 0.006776329595595598, "loss_ce": 0.0017409290885552764, "loss_xval": 0.005035400390625, "num_input_tokens_seen": 36280448, "step": 527 }, { "epoch": 33.0, "grad_norm": 8.210751475709698, "learning_rate": 5e-05, "loss": 0.0064, "num_input_tokens_seen": 36352128, "step": 528 }, { "epoch": 33.0, "loss": 0.005801305174827576, "loss_ce": 0.0019866079092025757, "loss_xval": 0.003814697265625, "num_input_tokens_seen": 36352128, "step": 528 }, { "epoch": 33.0625, "grad_norm": 9.888229864178133, "learning_rate": 5e-05, "loss": 0.0075, "num_input_tokens_seen": 36411264, "step": 529 }, { "epoch": 33.0625, "loss": 0.00706527940928936, "loss_ce": 0.0013890096452087164, "loss_xval": 0.00567626953125, "num_input_tokens_seen": 36411264, "step": 529 }, { "epoch": 33.125, "grad_norm": 14.21697984091777, "learning_rate": 5e-05, "loss": 0.0084, "num_input_tokens_seen": 36483008, "step": 530 }, { "epoch": 33.125, "loss": 0.008422967046499252, "loss_ce": 0.0016175472410395741, "loss_xval": 0.006805419921875, "num_input_tokens_seen": 36483008, "step": 530 }, { "epoch": 33.1875, "grad_norm": 11.435123159471466, "learning_rate": 5e-05, "loss": 0.0074, "num_input_tokens_seen": 36542080, "step": 531 }, { "epoch": 33.1875, "loss": 0.00780049292370677, "loss_ce": 0.0013002488994970918, "loss_xval": 0.006500244140625, "num_input_tokens_seen": 36542080, "step": 531 }, { "epoch": 33.25, "grad_norm": 2.0343152338608244, "learning_rate": 5e-05, "loss": 0.0051, "num_input_tokens_seen": 36613696, "step": 532 }, { "epoch": 33.25, "loss": 0.005734146572649479, "loss_ce": 0.001064957003109157, "loss_xval": 0.004669189453125, "num_input_tokens_seen": 36613696, "step": 532 }, { "epoch": 33.3125, "grad_norm": 5.0903071876766806, "learning_rate": 5e-05, "loss": 0.0053, "num_input_tokens_seen": 36685312, "step": 533 }, { "epoch": 33.3125, "loss": 0.004778821021318436, "loss_ce": 0.001009899890050292, "loss_xval": 0.0037689208984375, "num_input_tokens_seen": 36685312, "step": 533 }, { "epoch": 33.375, "grad_norm": 7.323274835426254, "learning_rate": 5e-05, "loss": 0.0051, "num_input_tokens_seen": 36757056, "step": 534 }, { "epoch": 33.375, "loss": 0.0067153251729905605, "loss_ce": 0.0011916436487808824, "loss_xval": 0.005523681640625, "num_input_tokens_seen": 36757056, "step": 534 }, { "epoch": 33.4375, "grad_norm": 8.30632649383284, "learning_rate": 5e-05, "loss": 0.0061, "num_input_tokens_seen": 36816256, "step": 535 }, { "epoch": 33.4375, "loss": 0.0058524045161902905, "loss_ce": 0.0010916623286902905, "loss_xval": 0.0047607421875, "num_input_tokens_seen": 36816256, "step": 535 }, { "epoch": 33.5, "grad_norm": 14.708122951546155, "learning_rate": 5e-05, "loss": 0.0068, "num_input_tokens_seen": 36887936, "step": 536 }, { "epoch": 33.5, "loss": 0.007070864085108042, "loss_ce": 0.0012725242413580418, "loss_xval": 0.00579833984375, "num_input_tokens_seen": 36887936, "step": 536 }, { "epoch": 33.5625, "grad_norm": 16.30334818048439, "learning_rate": 5e-05, "loss": 0.0071, "num_input_tokens_seen": 36959552, "step": 537 }, { "epoch": 33.5625, "loss": 0.00768459215760231, "loss_ce": 0.0009707247372716665, "loss_xval": 0.0067138671875, "num_input_tokens_seen": 36959552, "step": 537 }, { "epoch": 33.625, "grad_norm": 5.956799141206181, "learning_rate": 5e-05, "loss": 0.0051, "num_input_tokens_seen": 37018624, "step": 538 }, { "epoch": 33.625, "loss": 0.0054640574380755424, "loss_ce": 0.0009169381810352206, "loss_xval": 0.004547119140625, "num_input_tokens_seen": 37018624, "step": 538 }, { "epoch": 33.6875, "grad_norm": 5.938199449973524, "learning_rate": 5e-05, "loss": 0.006, "num_input_tokens_seen": 37077696, "step": 539 }, { "epoch": 33.6875, "loss": 0.0063777221366763115, "loss_ce": 0.0008235230925492942, "loss_xval": 0.00555419921875, "num_input_tokens_seen": 37077696, "step": 539 }, { "epoch": 33.75, "grad_norm": 9.67535270082586, "learning_rate": 5e-05, "loss": 0.0053, "num_input_tokens_seen": 37149312, "step": 540 }, { "epoch": 33.75, "loss": 0.0036917172838002443, "loss_ce": 0.0008230649982579052, "loss_xval": 0.00286865234375, "num_input_tokens_seen": 37149312, "step": 540 }, { "epoch": 33.8125, "grad_norm": 7.6229017292072285, "learning_rate": 5e-05, "loss": 0.007, "num_input_tokens_seen": 37220992, "step": 541 }, { "epoch": 33.8125, "loss": 0.0077782683074474335, "loss_ce": 0.0009118132875300944, "loss_xval": 0.006866455078125, "num_input_tokens_seen": 37220992, "step": 541 }, { "epoch": 33.875, "grad_norm": 2.035502392885776, "learning_rate": 5e-05, "loss": 0.0031, "num_input_tokens_seen": 37292544, "step": 542 }, { "epoch": 33.875, "loss": 0.0037268272135406733, "loss_ce": 0.0008581749279983342, "loss_xval": 0.00286865234375, "num_input_tokens_seen": 37292544, "step": 542 }, { "epoch": 33.9375, "grad_norm": 2.7394241925506413, "learning_rate": 5e-05, "loss": 0.003, "num_input_tokens_seen": 37364096, "step": 543 }, { "epoch": 33.9375, "loss": 0.0030486832838505507, "loss_ce": 0.0009429703350178897, "loss_xval": 0.002105712890625, "num_input_tokens_seen": 37364096, "step": 543 }, { "epoch": 34.0, "grad_norm": 6.268802781769899, "learning_rate": 5e-05, "loss": 0.0032, "num_input_tokens_seen": 37435776, "step": 544 }, { "epoch": 34.0, "loss": 0.002665142063051462, "loss_ce": 0.0008188285282813013, "loss_xval": 0.0018463134765625, "num_input_tokens_seen": 37435776, "step": 544 }, { "epoch": 34.0625, "grad_norm": 5.029103856315326, "learning_rate": 5e-05, "loss": 0.004, "num_input_tokens_seen": 37507392, "step": 545 }, { "epoch": 34.0625, "loss": 0.004083414562046528, "loss_ce": 0.0006807046011090279, "loss_xval": 0.0034027099609375, "num_input_tokens_seen": 37507392, "step": 545 }, { "epoch": 34.125, "grad_norm": 1.2852869284529702, "learning_rate": 5e-05, "loss": 0.0023, "num_input_tokens_seen": 37566528, "step": 546 }, { "epoch": 34.125, "loss": 0.001781684928573668, "loss_ce": 0.000744087272323668, "loss_xval": 0.00103759765625, "num_input_tokens_seen": 37566528, "step": 546 }, { "epoch": 34.1875, "grad_norm": 4.627840663233622, "learning_rate": 5e-05, "loss": 0.0024, "num_input_tokens_seen": 37638208, "step": 547 }, { "epoch": 34.1875, "loss": 0.0028193281032145023, "loss_ce": 0.0006983564235270023, "loss_xval": 0.0021209716796875, "num_input_tokens_seen": 37638208, "step": 547 }, { "epoch": 34.25, "grad_norm": 7.831680161882376, "learning_rate": 5e-05, "loss": 0.0037, "num_input_tokens_seen": 37709760, "step": 548 }, { "epoch": 34.25, "loss": 0.0046610175631940365, "loss_ce": 0.0008615790284238756, "loss_xval": 0.0037994384765625, "num_input_tokens_seen": 37709760, "step": 548 }, { "epoch": 34.3125, "grad_norm": 3.0887274586067153, "learning_rate": 5e-05, "loss": 0.0033, "num_input_tokens_seen": 37781376, "step": 549 }, { "epoch": 34.3125, "loss": 0.0022106545511633158, "loss_ce": 0.0007686989265494049, "loss_xval": 0.00144195556640625, "num_input_tokens_seen": 37781376, "step": 549 }, { "epoch": 34.375, "grad_norm": 3.0136299204466135, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 37840512, "step": 550 }, { "epoch": 34.375, "loss": 0.002881981898099184, "loss_ce": 0.000547387229744345, "loss_xval": 0.0023345947265625, "num_input_tokens_seen": 37840512, "step": 550 }, { "epoch": 34.4375, "grad_norm": 5.8966589422552955, "learning_rate": 5e-05, "loss": 0.0033, "num_input_tokens_seen": 37912256, "step": 551 }, { "epoch": 34.4375, "loss": 0.003600254189223051, "loss_ce": 0.000594272802118212, "loss_xval": 0.0030059814453125, "num_input_tokens_seen": 37912256, "step": 551 }, { "epoch": 34.5, "grad_norm": 6.267751347166229, "learning_rate": 5e-05, "loss": 0.0033, "num_input_tokens_seen": 37971392, "step": 552 }, { "epoch": 34.5, "loss": 0.003300173208117485, "loss_ce": 0.0008282493217848241, "loss_xval": 0.002471923828125, "num_input_tokens_seen": 37971392, "step": 552 }, { "epoch": 34.5625, "grad_norm": 4.66913309713646, "learning_rate": 5e-05, "loss": 0.0032, "num_input_tokens_seen": 38030336, "step": 553 }, { "epoch": 34.5625, "loss": 0.003951496444642544, "loss_ce": 0.000747150566894561, "loss_xval": 0.003204345703125, "num_input_tokens_seen": 38030336, "step": 553 }, { "epoch": 34.625, "grad_norm": 4.052412059769818, "learning_rate": 5e-05, "loss": 0.0028, "num_input_tokens_seen": 38089408, "step": 554 }, { "epoch": 34.625, "loss": 0.0021589105017483234, "loss_ce": 0.0005872553447261453, "loss_xval": 0.0015716552734375, "num_input_tokens_seen": 38089408, "step": 554 }, { "epoch": 34.6875, "grad_norm": 2.423240420462405, "learning_rate": 5e-05, "loss": 0.0027, "num_input_tokens_seen": 38161024, "step": 555 }, { "epoch": 34.6875, "loss": 0.0019102065125480294, "loss_ce": 0.0006437270203605294, "loss_xval": 0.0012664794921875, "num_input_tokens_seen": 38161024, "step": 555 }, { "epoch": 34.75, "grad_norm": 2.3341505500246424, "learning_rate": 5e-05, "loss": 0.0023, "num_input_tokens_seen": 38232640, "step": 556 }, { "epoch": 34.75, "loss": 0.00247854832559824, "loss_ce": 0.0007237876416184008, "loss_xval": 0.0017547607421875, "num_input_tokens_seen": 38232640, "step": 556 }, { "epoch": 34.8125, "grad_norm": 4.626196368155673, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 38291712, "step": 557 }, { "epoch": 34.8125, "loss": 0.0018908249912783504, "loss_ce": 0.0005175339756533504, "loss_xval": 0.001373291015625, "num_input_tokens_seen": 38291712, "step": 557 }, { "epoch": 34.875, "grad_norm": 9.620699963063549, "learning_rate": 5e-05, "loss": 0.0027, "num_input_tokens_seen": 38363392, "step": 558 }, { "epoch": 34.875, "loss": 0.0025881037581712008, "loss_ce": 0.0006197200273163617, "loss_xval": 0.0019683837890625, "num_input_tokens_seen": 38363392, "step": 558 }, { "epoch": 34.9375, "grad_norm": 20.22670183452228, "learning_rate": 5e-05, "loss": 0.0078, "num_input_tokens_seen": 38435008, "step": 559 }, { "epoch": 34.9375, "loss": 0.007559607736766338, "loss_ce": 0.0006626348476856947, "loss_xval": 0.00689697265625, "num_input_tokens_seen": 38435008, "step": 559 }, { "epoch": 35.0, "grad_norm": 41.86820033044202, "learning_rate": 5e-05, "loss": 0.0284, "num_input_tokens_seen": 38494016, "step": 560 }, { "epoch": 35.0, "loss": 0.03127765655517578, "loss_ce": 0.0005159361753612757, "loss_xval": 0.03076171875, "num_input_tokens_seen": 38494016, "step": 560 }, { "epoch": 35.0625, "grad_norm": 90.8855239560365, "learning_rate": 5e-05, "loss": 0.1259, "num_input_tokens_seen": 38565760, "step": 561 }, { "epoch": 35.0625, "loss": 0.13142003118991852, "loss_ce": 0.0005606548511423171, "loss_xval": 0.130859375, "num_input_tokens_seen": 38565760, "step": 561 }, { "epoch": 35.125, "grad_norm": 191.29554767759743, "learning_rate": 5e-05, "loss": 0.5571, "num_input_tokens_seen": 38637312, "step": 562 }, { "epoch": 35.125, "loss": 0.5629037618637085, "loss_ce": 0.0004037575563415885, "loss_xval": 0.5625, "num_input_tokens_seen": 38637312, "step": 562 }, { "epoch": 35.1875, "grad_norm": 350.50346982799715, "learning_rate": 5e-05, "loss": 1.8912, "num_input_tokens_seen": 38709056, "step": 563 }, { "epoch": 35.1875, "loss": 1.883461833000183, "loss_ce": 0.0006492895190604031, "loss_xval": 1.8828125, "num_input_tokens_seen": 38709056, "step": 563 }, { "epoch": 35.25, "grad_norm": 366.21743192231634, "learning_rate": 5e-05, "loss": 2.7019, "num_input_tokens_seen": 38768128, "step": 564 }, { "epoch": 35.25, "loss": 2.739633321762085, "loss_ce": 0.0052583953365683556, "loss_xval": 2.734375, "num_input_tokens_seen": 38768128, "step": 564 }, { "epoch": 35.3125, "grad_norm": 551.2575174025078, "learning_rate": 5e-05, "loss": 5.1266, "num_input_tokens_seen": 38839872, "step": 565 }, { "epoch": 35.3125, "loss": 5.231325149536133, "loss_ce": 0.10632535815238953, "loss_xval": 5.125, "num_input_tokens_seen": 38839872, "step": 565 }, { "epoch": 35.375, "grad_norm": 274.40929267866454, "learning_rate": 5e-05, "loss": 1.5613, "num_input_tokens_seen": 38911488, "step": 566 }, { "epoch": 35.375, "loss": 1.4925724267959595, "loss_ce": 0.04725995287299156, "loss_xval": 1.4453125, "num_input_tokens_seen": 38911488, "step": 566 }, { "epoch": 35.4375, "grad_norm": 770.5286052127881, "learning_rate": 5e-05, "loss": 5.75, "num_input_tokens_seen": 38970624, "step": 567 }, { "epoch": 35.4375, "loss": 5.361253261566162, "loss_ce": 0.3300033211708069, "loss_xval": 5.03125, "num_input_tokens_seen": 38970624, "step": 567 }, { "epoch": 35.5, "grad_norm": 1251.903653622121, "learning_rate": 5e-05, "loss": 14.7884, "num_input_tokens_seen": 39042304, "step": 568 }, { "epoch": 35.5, "loss": 14.415470123291016, "loss_ce": 0.5404703617095947, "loss_xval": 13.875, "num_input_tokens_seen": 39042304, "step": 568 }, { "epoch": 35.5625, "grad_norm": 938.7359719425729, "learning_rate": 5e-05, "loss": 10.5851, "num_input_tokens_seen": 39113984, "step": 569 }, { "epoch": 35.5625, "loss": 10.34769058227539, "loss_ce": 1.5351901054382324, "loss_xval": 8.8125, "num_input_tokens_seen": 39113984, "step": 569 }, { "epoch": 35.625, "grad_norm": 1230.5398739652574, "learning_rate": 5e-05, "loss": 22.0495, "num_input_tokens_seen": 39185728, "step": 570 }, { "epoch": 35.625, "loss": 21.479244232177734, "loss_ce": 0.4792449176311493, "loss_xval": 21.0, "num_input_tokens_seen": 39185728, "step": 570 }, { "epoch": 35.6875, "grad_norm": 696.4657801334389, "learning_rate": 5e-05, "loss": 5.2556, "num_input_tokens_seen": 39257280, "step": 571 }, { "epoch": 35.6875, "loss": 5.062924385070801, "loss_ce": 0.46917423605918884, "loss_xval": 4.59375, "num_input_tokens_seen": 39257280, "step": 571 }, { "epoch": 35.75, "grad_norm": 869.8728621056691, "learning_rate": 5e-05, "loss": 5.5111, "num_input_tokens_seen": 39328896, "step": 572 }, { "epoch": 35.75, "loss": 5.267719745635986, "loss_ce": 0.7052197456359863, "loss_xval": 4.5625, "num_input_tokens_seen": 39328896, "step": 572 }, { "epoch": 35.8125, "grad_norm": 3549.498225625143, "learning_rate": 5e-05, "loss": 31.3322, "num_input_tokens_seen": 39400512, "step": 573 }, { "epoch": 35.8125, "loss": 35.222599029541016, "loss_ce": 0.7225989699363708, "loss_xval": 34.5, "num_input_tokens_seen": 39400512, "step": 573 }, { "epoch": 35.875, "grad_norm": 1180.2634837582707, "learning_rate": 5e-05, "loss": 9.5381, "num_input_tokens_seen": 39472192, "step": 574 }, { "epoch": 35.875, "loss": 9.109892845153809, "loss_ce": 0.5473928451538086, "loss_xval": 8.5625, "num_input_tokens_seen": 39472192, "step": 574 }, { "epoch": 35.9375, "grad_norm": 116.29402504782887, "learning_rate": 5e-05, "loss": 0.9392, "num_input_tokens_seen": 39543744, "step": 575 }, { "epoch": 35.9375, "loss": 0.8281427621841431, "loss_ce": 0.5703302621841431, "loss_xval": 0.2578125, "num_input_tokens_seen": 39543744, "step": 575 }, { "epoch": 36.0, "grad_norm": 918.9252968800259, "learning_rate": 5e-05, "loss": 7.6593, "num_input_tokens_seen": 39615296, "step": 576 }, { "epoch": 36.0, "loss": 7.5638298988342285, "loss_ce": 1.0325798988342285, "loss_xval": 6.53125, "num_input_tokens_seen": 39615296, "step": 576 }, { "epoch": 36.0625, "grad_norm": 358.6239331313297, "learning_rate": 5e-05, "loss": 2.1741, "num_input_tokens_seen": 39686912, "step": 577 }, { "epoch": 36.0625, "loss": 2.2433958053588867, "loss_ce": 1.1730833053588867, "loss_xval": 1.0703125, "num_input_tokens_seen": 39686912, "step": 577 }, { "epoch": 36.125, "grad_norm": 702.9413630456082, "learning_rate": 5e-05, "loss": 4.5602, "num_input_tokens_seen": 39758592, "step": 578 }, { "epoch": 36.125, "loss": 4.851856231689453, "loss_ce": 0.961230993270874, "loss_xval": 3.890625, "num_input_tokens_seen": 39758592, "step": 578 }, { "epoch": 36.1875, "grad_norm": 602.1591127964381, "learning_rate": 5e-05, "loss": 3.6184, "num_input_tokens_seen": 39830144, "step": 579 }, { "epoch": 36.1875, "loss": 3.764401435852051, "loss_ce": 0.545651376247406, "loss_xval": 3.21875, "num_input_tokens_seen": 39830144, "step": 579 }, { "epoch": 36.25, "grad_norm": 401.1552345637159, "learning_rate": 5e-05, "loss": 1.8375, "num_input_tokens_seen": 39901760, "step": 580 }, { "epoch": 36.25, "loss": 1.8537099361419678, "loss_ce": 0.38495996594429016, "loss_xval": 1.46875, "num_input_tokens_seen": 39901760, "step": 580 }, { "epoch": 36.3125, "grad_norm": 744.8830304932653, "learning_rate": 5e-05, "loss": 4.9107, "num_input_tokens_seen": 39973312, "step": 581 }, { "epoch": 36.3125, "loss": 4.981729984283447, "loss_ce": 0.35673001408576965, "loss_xval": 4.625, "num_input_tokens_seen": 39973312, "step": 581 }, { "epoch": 36.375, "grad_norm": 41.1117289556165, "learning_rate": 5e-05, "loss": 0.3746, "num_input_tokens_seen": 40019840, "step": 582 }, { "epoch": 36.375, "loss": 0.372890442609787, "loss_ce": 0.291347473859787, "loss_xval": 0.08154296875, "num_input_tokens_seen": 40019840, "step": 582 }, { "epoch": 36.4375, "grad_norm": 522.7553105033345, "learning_rate": 5e-05, "loss": 2.9091, "num_input_tokens_seen": 40091456, "step": 583 }, { "epoch": 36.4375, "loss": 2.9245858192443848, "loss_ce": 0.2683357298374176, "loss_xval": 2.65625, "num_input_tokens_seen": 40091456, "step": 583 }, { "epoch": 36.5, "grad_norm": 100.93606363922494, "learning_rate": 5e-05, "loss": 0.386, "num_input_tokens_seen": 40163136, "step": 584 }, { "epoch": 36.5, "loss": 0.4024519622325897, "loss_ce": 0.2530378997325897, "loss_xval": 0.1494140625, "num_input_tokens_seen": 40163136, "step": 584 }, { "epoch": 36.5625, "grad_norm": 455.6989539550385, "learning_rate": 5e-05, "loss": 2.2411, "num_input_tokens_seen": 40222336, "step": 585 }, { "epoch": 36.5625, "loss": 2.1417577266693115, "loss_ce": 0.2589452564716339, "loss_xval": 1.8828125, "num_input_tokens_seen": 40222336, "step": 585 }, { "epoch": 36.625, "grad_norm": 164.02950173346085, "learning_rate": 5e-05, "loss": 0.5776, "num_input_tokens_seen": 40281408, "step": 586 }, { "epoch": 36.625, "loss": 0.591841995716095, "loss_ce": 0.25004512071609497, "loss_xval": 0.341796875, "num_input_tokens_seen": 40281408, "step": 586 }, { "epoch": 36.6875, "grad_norm": 286.35296257900984, "learning_rate": 5e-05, "loss": 1.1763, "num_input_tokens_seen": 40353152, "step": 587 }, { "epoch": 36.6875, "loss": 1.0982005596160889, "loss_ce": 0.25445055961608887, "loss_xval": 0.84375, "num_input_tokens_seen": 40353152, "step": 587 }, { "epoch": 36.75, "grad_norm": 263.32879398104865, "learning_rate": 5e-05, "loss": 1.0375, "num_input_tokens_seen": 40412224, "step": 588 }, { "epoch": 36.75, "loss": 1.0531641244888306, "loss_ce": 0.24457040429115295, "loss_xval": 0.80859375, "num_input_tokens_seen": 40412224, "step": 588 }, { "epoch": 36.8125, "grad_norm": 151.29925845397975, "learning_rate": 5e-05, "loss": 0.513, "num_input_tokens_seen": 40483904, "step": 589 }, { "epoch": 36.8125, "loss": 0.5355762243270874, "loss_ce": 0.2328418642282486, "loss_xval": 0.302734375, "num_input_tokens_seen": 40483904, "step": 589 }, { "epoch": 36.875, "grad_norm": 311.1048830892808, "learning_rate": 5e-05, "loss": 1.2992, "num_input_tokens_seen": 40555648, "step": 590 }, { "epoch": 36.875, "loss": 1.2941535711288452, "loss_ce": 0.23165352642536163, "loss_xval": 1.0625, "num_input_tokens_seen": 40555648, "step": 590 }, { "epoch": 36.9375, "grad_norm": 25.12563712108501, "learning_rate": 5e-05, "loss": 0.2608, "num_input_tokens_seen": 40614848, "step": 591 }, { "epoch": 36.9375, "loss": 0.2656458020210266, "loss_ce": 0.22218875586986542, "loss_xval": 0.04345703125, "num_input_tokens_seen": 40614848, "step": 591 }, { "epoch": 37.0, "grad_norm": 265.0904914487781, "learning_rate": 5e-05, "loss": 1.0982, "num_input_tokens_seen": 40686400, "step": 592 }, { "epoch": 37.0, "loss": 1.1134018898010254, "loss_ce": 0.2149643748998642, "loss_xval": 0.8984375, "num_input_tokens_seen": 40686400, "step": 592 }, { "epoch": 37.0625, "grad_norm": 98.20828086212038, "learning_rate": 5e-05, "loss": 0.3804, "num_input_tokens_seen": 40758016, "step": 593 }, { "epoch": 37.0625, "loss": 0.37509244680404663, "loss_ce": 0.20614711940288544, "loss_xval": 0.1689453125, "num_input_tokens_seen": 40758016, "step": 593 }, { "epoch": 37.125, "grad_norm": 156.23063382829764, "learning_rate": 5e-05, "loss": 0.5599, "num_input_tokens_seen": 40817088, "step": 594 }, { "epoch": 37.125, "loss": 0.5345146656036377, "loss_ce": 0.1985771507024765, "loss_xval": 0.3359375, "num_input_tokens_seen": 40817088, "step": 594 }, { "epoch": 37.1875, "grad_norm": 205.7645154073751, "learning_rate": 5e-05, "loss": 0.7541, "num_input_tokens_seen": 40888704, "step": 595 }, { "epoch": 37.1875, "loss": 0.7232792973518372, "loss_ce": 0.19593554735183716, "loss_xval": 0.52734375, "num_input_tokens_seen": 40888704, "step": 595 }, { "epoch": 37.25, "grad_norm": 54.92131899982118, "learning_rate": 5e-05, "loss": 0.2654, "num_input_tokens_seen": 40947776, "step": 596 }, { "epoch": 37.25, "loss": 0.27080121636390686, "loss_ce": 0.20146527886390686, "loss_xval": 0.0693359375, "num_input_tokens_seen": 40947776, "step": 596 }, { "epoch": 37.3125, "grad_norm": 205.73367835848026, "learning_rate": 5e-05, "loss": 0.7528, "num_input_tokens_seen": 41019456, "step": 597 }, { "epoch": 37.3125, "loss": 0.7497467994689941, "loss_ce": 0.18334054946899414, "loss_xval": 0.56640625, "num_input_tokens_seen": 41019456, "step": 597 }, { "epoch": 37.375, "grad_norm": 60.87860178293434, "learning_rate": 5e-05, "loss": 0.2509, "num_input_tokens_seen": 41078528, "step": 598 }, { "epoch": 37.375, "loss": 0.238224059343338, "loss_ce": 0.179630309343338, "loss_xval": 0.05859375, "num_input_tokens_seen": 41078528, "step": 598 }, { "epoch": 37.4375, "grad_norm": 140.53096665849083, "learning_rate": 5e-05, "loss": 0.4592, "num_input_tokens_seen": 41150080, "step": 599 }, { "epoch": 37.4375, "loss": 0.4615119397640228, "loss_ce": 0.17440256476402283, "loss_xval": 0.287109375, "num_input_tokens_seen": 41150080, "step": 599 }, { "epoch": 37.5, "grad_norm": 141.48099157738406, "learning_rate": 5e-05, "loss": 0.4551, "num_input_tokens_seen": 41221760, "step": 600 }, { "epoch": 37.5, "loss": 0.44427230954170227, "loss_ce": 0.17278793454170227, "loss_xval": 0.271484375, "num_input_tokens_seen": 41221760, "step": 600 }, { "epoch": 37.5625, "grad_norm": 56.75102315773255, "learning_rate": 5e-05, "loss": 0.2472, "num_input_tokens_seen": 41293568, "step": 601 }, { "epoch": 37.5625, "loss": 0.23851189017295837, "loss_ce": 0.16771110892295837, "loss_xval": 0.07080078125, "num_input_tokens_seen": 41293568, "step": 601 }, { "epoch": 37.625, "grad_norm": 149.9406290414657, "learning_rate": 5e-05, "loss": 0.4996, "num_input_tokens_seen": 41365184, "step": 602 }, { "epoch": 37.625, "loss": 0.4996224045753479, "loss_ce": 0.1714974194765091, "loss_xval": 0.328125, "num_input_tokens_seen": 41365184, "step": 602 }, { "epoch": 37.6875, "grad_norm": 29.47624024193657, "learning_rate": 5e-05, "loss": 0.1971, "num_input_tokens_seen": 41436800, "step": 603 }, { "epoch": 37.6875, "loss": 0.19674646854400635, "loss_ce": 0.16159021854400635, "loss_xval": 0.03515625, "num_input_tokens_seen": 41436800, "step": 603 }, { "epoch": 37.75, "grad_norm": 125.28948276890881, "learning_rate": 5e-05, "loss": 0.3971, "num_input_tokens_seen": 41508480, "step": 604 }, { "epoch": 37.75, "loss": 0.45287567377090454, "loss_ce": 0.16186004877090454, "loss_xval": 0.291015625, "num_input_tokens_seen": 41508480, "step": 604 }, { "epoch": 37.8125, "grad_norm": 95.54904903701473, "learning_rate": 5e-05, "loss": 0.3014, "num_input_tokens_seen": 41580224, "step": 605 }, { "epoch": 37.8125, "loss": 0.31282493472099304, "loss_ce": 0.16145774722099304, "loss_xval": 0.1513671875, "num_input_tokens_seen": 41580224, "step": 605 }, { "epoch": 37.875, "grad_norm": 78.91158457455924, "learning_rate": 5e-05, "loss": 0.2589, "num_input_tokens_seen": 41651904, "step": 606 }, { "epoch": 37.875, "loss": 0.2595600187778473, "loss_ce": 0.1628803312778473, "loss_xval": 0.0966796875, "num_input_tokens_seen": 41651904, "step": 606 }, { "epoch": 37.9375, "grad_norm": 117.50889229373615, "learning_rate": 5e-05, "loss": 0.3626, "num_input_tokens_seen": 41723520, "step": 607 }, { "epoch": 37.9375, "loss": 0.3645981252193451, "loss_ce": 0.1546371877193451, "loss_xval": 0.2099609375, "num_input_tokens_seen": 41723520, "step": 607 }, { "epoch": 38.0, "grad_norm": 11.030866607250644, "learning_rate": 5e-05, "loss": 0.1699, "num_input_tokens_seen": 41795136, "step": 608 }, { "epoch": 38.0, "loss": 0.16384030878543854, "loss_ce": 0.15279294550418854, "loss_xval": 0.01104736328125, "num_input_tokens_seen": 41795136, "step": 608 }, { "epoch": 38.0625, "grad_norm": 115.85949003037487, "learning_rate": 5e-05, "loss": 0.3556, "num_input_tokens_seen": 41866752, "step": 609 }, { "epoch": 38.0625, "loss": 0.373709499835968, "loss_ce": 0.15105323493480682, "loss_xval": 0.22265625, "num_input_tokens_seen": 41866752, "step": 609 }, { "epoch": 38.125, "grad_norm": 35.99753609550145, "learning_rate": 5e-05, "loss": 0.1821, "num_input_tokens_seen": 41938432, "step": 610 }, { "epoch": 38.125, "loss": 0.18259596824645996, "loss_ce": 0.14841628074645996, "loss_xval": 0.0341796875, "num_input_tokens_seen": 41938432, "step": 610 }, { "epoch": 38.1875, "grad_norm": 67.90140780338362, "learning_rate": 5e-05, "loss": 0.228, "num_input_tokens_seen": 42009984, "step": 611 }, { "epoch": 38.1875, "loss": 0.23608772456645966, "loss_ce": 0.14966194331645966, "loss_xval": 0.08642578125, "num_input_tokens_seen": 42009984, "step": 611 }, { "epoch": 38.25, "grad_norm": 76.47480758466817, "learning_rate": 5e-05, "loss": 0.2357, "num_input_tokens_seen": 42081664, "step": 612 }, { "epoch": 38.25, "loss": 0.2421186864376068, "loss_ce": 0.1469038426876068, "loss_xval": 0.09521484375, "num_input_tokens_seen": 42081664, "step": 612 }, { "epoch": 38.3125, "grad_norm": 16.84212415909238, "learning_rate": 5e-05, "loss": 0.156, "num_input_tokens_seen": 42153216, "step": 613 }, { "epoch": 38.3125, "loss": 0.1534736603498459, "loss_ce": 0.1434028595685959, "loss_xval": 0.01007080078125, "num_input_tokens_seen": 42153216, "step": 613 }, { "epoch": 38.375, "grad_norm": 88.89162301325827, "learning_rate": 5e-05, "loss": 0.2588, "num_input_tokens_seen": 42224832, "step": 614 }, { "epoch": 38.375, "loss": 0.268532931804657, "loss_ce": 0.14255638420581818, "loss_xval": 0.1259765625, "num_input_tokens_seen": 42224832, "step": 614 }, { "epoch": 38.4375, "grad_norm": 23.5430509471996, "learning_rate": 5e-05, "loss": 0.158, "num_input_tokens_seen": 42296512, "step": 615 }, { "epoch": 38.4375, "loss": 0.16037799417972565, "loss_ce": 0.14243365824222565, "loss_xval": 0.0179443359375, "num_input_tokens_seen": 42296512, "step": 615 }, { "epoch": 38.5, "grad_norm": 60.88480572029057, "learning_rate": 5e-05, "loss": 0.2048, "num_input_tokens_seen": 42368064, "step": 616 }, { "epoch": 38.5, "loss": 0.21100540459156036, "loss_ce": 0.14362259209156036, "loss_xval": 0.0673828125, "num_input_tokens_seen": 42368064, "step": 616 }, { "epoch": 38.5625, "grad_norm": 66.58093227045465, "learning_rate": 5e-05, "loss": 0.2093, "num_input_tokens_seen": 42427072, "step": 617 }, { "epoch": 38.5625, "loss": 0.21216745674610138, "loss_ce": 0.13746042549610138, "loss_xval": 0.07470703125, "num_input_tokens_seen": 42427072, "step": 617 }, { "epoch": 38.625, "grad_norm": 41.32810840119761, "learning_rate": 5e-05, "loss": 0.1741, "num_input_tokens_seen": 42498752, "step": 618 }, { "epoch": 38.625, "loss": 0.17929083108901978, "loss_ce": 0.13827520608901978, "loss_xval": 0.041015625, "num_input_tokens_seen": 42498752, "step": 618 }, { "epoch": 38.6875, "grad_norm": 71.79194449299779, "learning_rate": 5e-05, "loss": 0.2116, "num_input_tokens_seen": 42570496, "step": 619 }, { "epoch": 38.6875, "loss": 0.21144595742225647, "loss_ce": 0.13234439492225647, "loss_xval": 0.0791015625, "num_input_tokens_seen": 42570496, "step": 619 }, { "epoch": 38.75, "grad_norm": 7.438002395643336, "learning_rate": 5e-05, "loss": 0.14, "num_input_tokens_seen": 42642176, "step": 620 }, { "epoch": 38.75, "loss": 0.14000988006591797, "loss_ce": 0.13289928436279297, "loss_xval": 0.007110595703125, "num_input_tokens_seen": 42642176, "step": 620 }, { "epoch": 38.8125, "grad_norm": 58.768235228479554, "learning_rate": 5e-05, "loss": 0.1878, "num_input_tokens_seen": 42713728, "step": 621 }, { "epoch": 38.8125, "loss": 0.17987307906150818, "loss_ce": 0.13421878218650818, "loss_xval": 0.045654296875, "num_input_tokens_seen": 42713728, "step": 621 }, { "epoch": 38.875, "grad_norm": 31.55788652090228, "learning_rate": 5e-05, "loss": 0.149, "num_input_tokens_seen": 42772864, "step": 622 }, { "epoch": 38.875, "loss": 0.14639230072498322, "loss_ce": 0.13143868744373322, "loss_xval": 0.01495361328125, "num_input_tokens_seen": 42772864, "step": 622 }, { "epoch": 38.9375, "grad_norm": 50.2062141649387, "learning_rate": 5e-05, "loss": 0.174, "num_input_tokens_seen": 42844416, "step": 623 }, { "epoch": 38.9375, "loss": 0.17014646530151367, "loss_ce": 0.13181638717651367, "loss_xval": 0.038330078125, "num_input_tokens_seen": 42844416, "step": 623 }, { "epoch": 39.0, "grad_norm": 39.727476739538965, "learning_rate": 5e-05, "loss": 0.1575, "num_input_tokens_seen": 42903552, "step": 624 }, { "epoch": 39.0, "loss": 0.15018722414970398, "loss_ce": 0.12662765383720398, "loss_xval": 0.0235595703125, "num_input_tokens_seen": 42903552, "step": 624 }, { "epoch": 39.0625, "grad_norm": 19.034774434189423, "learning_rate": 5e-05, "loss": 0.1347, "num_input_tokens_seen": 42975296, "step": 625 }, { "epoch": 39.0625, "loss": 0.13135939836502075, "loss_ce": 0.12476760149002075, "loss_xval": 0.006591796875, "num_input_tokens_seen": 42975296, "step": 625 }, { "epoch": 39.125, "grad_norm": 56.67017709198661, "learning_rate": 5e-05, "loss": 0.1763, "num_input_tokens_seen": 43034432, "step": 626 }, { "epoch": 39.125, "loss": 0.17652033269405365, "loss_ce": 0.12305353581905365, "loss_xval": 0.053466796875, "num_input_tokens_seen": 43034432, "step": 626 }, { "epoch": 39.1875, "grad_norm": 3.7113754650312445, "learning_rate": 5e-05, "loss": 0.1303, "num_input_tokens_seen": 43106112, "step": 627 }, { "epoch": 39.1875, "loss": 0.12929722666740417, "loss_ce": 0.12178990989923477, "loss_xval": 0.00750732421875, "num_input_tokens_seen": 43106112, "step": 627 }, { "epoch": 39.25, "grad_norm": 54.51094923701809, "learning_rate": 5e-05, "loss": 0.1703, "num_input_tokens_seen": 43177792, "step": 628 }, { "epoch": 39.25, "loss": 0.18063339591026306, "loss_ce": 0.12106308341026306, "loss_xval": 0.0595703125, "num_input_tokens_seen": 43177792, "step": 628 }, { "epoch": 39.3125, "grad_norm": 19.084346976100914, "learning_rate": 5e-05, "loss": 0.134, "num_input_tokens_seen": 43249472, "step": 629 }, { "epoch": 39.3125, "loss": 0.13764870166778564, "loss_ce": 0.12574684619903564, "loss_xval": 0.01190185546875, "num_input_tokens_seen": 43249472, "step": 629 }, { "epoch": 39.375, "grad_norm": 58.28364550359889, "learning_rate": 5e-05, "loss": 0.1704, "num_input_tokens_seen": 43321152, "step": 630 }, { "epoch": 39.375, "loss": 0.17048946022987366, "loss_ce": 0.11946406215429306, "loss_xval": 0.051025390625, "num_input_tokens_seen": 43321152, "step": 630 }, { "epoch": 39.4375, "grad_norm": 26.589554454909614, "learning_rate": 5e-05, "loss": 0.1319, "num_input_tokens_seen": 43392768, "step": 631 }, { "epoch": 39.4375, "loss": 0.1374756097793579, "loss_ce": 0.11879885196685791, "loss_xval": 0.0186767578125, "num_input_tokens_seen": 43392768, "step": 631 }, { "epoch": 39.5, "grad_norm": 48.98562574688163, "learning_rate": 5e-05, "loss": 0.153, "num_input_tokens_seen": 43464448, "step": 632 }, { "epoch": 39.5, "loss": 0.1484987884759903, "loss_ce": 0.1145632416009903, "loss_xval": 0.033935546875, "num_input_tokens_seen": 43464448, "step": 632 }, { "epoch": 39.5625, "grad_norm": 38.604355288622315, "learning_rate": 5e-05, "loss": 0.1457, "num_input_tokens_seen": 43536064, "step": 633 }, { "epoch": 39.5625, "loss": 0.144775390625, "loss_ce": 0.1132812425494194, "loss_xval": 0.031494140625, "num_input_tokens_seen": 43536064, "step": 633 }, { "epoch": 39.625, "grad_norm": 39.82814144958052, "learning_rate": 5e-05, "loss": 0.1402, "num_input_tokens_seen": 43607872, "step": 634 }, { "epoch": 39.625, "loss": 0.1378423571586609, "loss_ce": 0.11391657590866089, "loss_xval": 0.02392578125, "num_input_tokens_seen": 43607872, "step": 634 }, { "epoch": 39.6875, "grad_norm": 42.46176254037517, "learning_rate": 5e-05, "loss": 0.1436, "num_input_tokens_seen": 43679488, "step": 635 }, { "epoch": 39.6875, "loss": 0.1384507417678833, "loss_ce": 0.1140366792678833, "loss_xval": 0.0244140625, "num_input_tokens_seen": 43679488, "step": 635 }, { "epoch": 39.75, "grad_norm": 24.345686051123515, "learning_rate": 5e-05, "loss": 0.1243, "num_input_tokens_seen": 43738560, "step": 636 }, { "epoch": 39.75, "loss": 0.12430451065301895, "loss_ce": 0.10898468643426895, "loss_xval": 0.01531982421875, "num_input_tokens_seen": 43738560, "step": 636 }, { "epoch": 39.8125, "grad_norm": 43.87853861651247, "learning_rate": 5e-05, "loss": 0.1395, "num_input_tokens_seen": 43797568, "step": 637 }, { "epoch": 39.8125, "loss": 0.1355985701084137, "loss_ce": 0.1086210310459137, "loss_xval": 0.0269775390625, "num_input_tokens_seen": 43797568, "step": 637 }, { "epoch": 39.875, "grad_norm": 6.588837352574171, "learning_rate": 5e-05, "loss": 0.1169, "num_input_tokens_seen": 43869184, "step": 638 }, { "epoch": 39.875, "loss": 0.11672370135784149, "loss_ce": 0.11235968768596649, "loss_xval": 0.004364013671875, "num_input_tokens_seen": 43869184, "step": 638 }, { "epoch": 39.9375, "grad_norm": 48.67125834183872, "learning_rate": 5e-05, "loss": 0.1465, "num_input_tokens_seen": 43940800, "step": 639 }, { "epoch": 39.9375, "loss": 0.14488358795642853, "loss_ce": 0.10997147858142853, "loss_xval": 0.034912109375, "num_input_tokens_seen": 43940800, "step": 639 }, { "epoch": 40.0, "grad_norm": 4.979504580343616, "learning_rate": 5e-05, "loss": 0.1124, "num_input_tokens_seen": 44012416, "step": 640 }, { "epoch": 40.0, "loss": 0.11065511405467987, "loss_ce": 0.10659627616405487, "loss_xval": 0.004058837890625, "num_input_tokens_seen": 44012416, "step": 640 }, { "epoch": 40.0625, "grad_norm": 51.66416460471596, "learning_rate": 5e-05, "loss": 0.1475, "num_input_tokens_seen": 44084160, "step": 641 }, { "epoch": 40.0625, "loss": 0.14437633752822876, "loss_ce": 0.10360485315322876, "loss_xval": 0.040771484375, "num_input_tokens_seen": 44084160, "step": 641 }, { "epoch": 40.125, "grad_norm": 14.566490513755292, "learning_rate": 5e-05, "loss": 0.1104, "num_input_tokens_seen": 44155712, "step": 642 }, { "epoch": 40.125, "loss": 0.10976142436265945, "loss_ce": 0.10231513530015945, "loss_xval": 0.0074462890625, "num_input_tokens_seen": 44155712, "step": 642 }, { "epoch": 40.1875, "grad_norm": 61.00209116862136, "learning_rate": 5e-05, "loss": 0.1549, "num_input_tokens_seen": 44227392, "step": 643 }, { "epoch": 40.1875, "loss": 0.15537142753601074, "loss_ce": 0.09921907633543015, "loss_xval": 0.05615234375, "num_input_tokens_seen": 44227392, "step": 643 }, { "epoch": 40.25, "grad_norm": 11.347214980938771, "learning_rate": 5e-05, "loss": 0.1063, "num_input_tokens_seen": 44286336, "step": 644 }, { "epoch": 40.25, "loss": 0.10684747993946075, "loss_ce": 0.10077448189258575, "loss_xval": 0.006072998046875, "num_input_tokens_seen": 44286336, "step": 644 }, { "epoch": 40.3125, "grad_norm": 57.017450004203646, "learning_rate": 5e-05, "loss": 0.148, "num_input_tokens_seen": 44358016, "step": 645 }, { "epoch": 40.3125, "loss": 0.14290070533752441, "loss_ce": 0.09773469716310501, "loss_xval": 0.045166015625, "num_input_tokens_seen": 44358016, "step": 645 }, { "epoch": 40.375, "grad_norm": 16.615742633522416, "learning_rate": 5e-05, "loss": 0.1075, "num_input_tokens_seen": 44429760, "step": 646 }, { "epoch": 40.375, "loss": 0.10806427896022797, "loss_ce": 0.09756623208522797, "loss_xval": 0.010498046875, "num_input_tokens_seen": 44429760, "step": 646 }, { "epoch": 40.4375, "grad_norm": 58.16047818456758, "learning_rate": 5e-05, "loss": 0.1494, "num_input_tokens_seen": 44488832, "step": 647 }, { "epoch": 40.4375, "loss": 0.1497732698917389, "loss_ce": 0.09679476171731949, "loss_xval": 0.052978515625, "num_input_tokens_seen": 44488832, "step": 647 }, { "epoch": 40.5, "grad_norm": 16.82982616979513, "learning_rate": 5e-05, "loss": 0.104, "num_input_tokens_seen": 44560448, "step": 648 }, { "epoch": 40.5, "loss": 0.10390922427177429, "loss_ce": 0.09426566958427429, "loss_xval": 0.0096435546875, "num_input_tokens_seen": 44560448, "step": 648 }, { "epoch": 40.5625, "grad_norm": 59.937320299463295, "learning_rate": 5e-05, "loss": 0.1482, "num_input_tokens_seen": 44632064, "step": 649 }, { "epoch": 40.5625, "loss": 0.14854532480239868, "loss_ce": 0.09263712167739868, "loss_xval": 0.055908203125, "num_input_tokens_seen": 44632064, "step": 649 }, { "epoch": 40.625, "grad_norm": 8.174049550372988, "learning_rate": 5e-05, "loss": 0.0969, "num_input_tokens_seen": 44703680, "step": 650 }, { "epoch": 40.625, "loss": 0.09525209665298462, "loss_ce": 0.09174257516860962, "loss_xval": 0.003509521484375, "num_input_tokens_seen": 44703680, "step": 650 }, { "epoch": 40.6875, "grad_norm": 55.966905853940496, "learning_rate": 5e-05, "loss": 0.1393, "num_input_tokens_seen": 44775296, "step": 651 }, { "epoch": 40.6875, "loss": 0.1412389874458313, "loss_ce": 0.0909460112452507, "loss_xval": 0.05029296875, "num_input_tokens_seen": 44775296, "step": 651 }, { "epoch": 40.75, "grad_norm": 0.8837807743869168, "learning_rate": 5e-05, "loss": 0.0934, "num_input_tokens_seen": 44846848, "step": 652 }, { "epoch": 40.75, "loss": 0.09453385323286057, "loss_ce": 0.08836930245161057, "loss_xval": 0.00616455078125, "num_input_tokens_seen": 44846848, "step": 652 }, { "epoch": 40.8125, "grad_norm": 46.75400872468516, "learning_rate": 5e-05, "loss": 0.1207, "num_input_tokens_seen": 44918464, "step": 653 }, { "epoch": 40.8125, "loss": 0.1174541562795639, "loss_ce": 0.0849834531545639, "loss_xval": 0.032470703125, "num_input_tokens_seen": 44918464, "step": 653 }, { "epoch": 40.875, "grad_norm": 5.538176318189277, "learning_rate": 5e-05, "loss": 0.0958, "num_input_tokens_seen": 44990016, "step": 654 }, { "epoch": 40.875, "loss": 0.10027727484703064, "loss_ce": 0.09420427680015564, "loss_xval": 0.006072998046875, "num_input_tokens_seen": 44990016, "step": 654 }, { "epoch": 40.9375, "grad_norm": 50.93686152495069, "learning_rate": 5e-05, "loss": 0.1246, "num_input_tokens_seen": 45061632, "step": 655 }, { "epoch": 40.9375, "loss": 0.11831493675708771, "loss_ce": 0.08560009300708771, "loss_xval": 0.03271484375, "num_input_tokens_seen": 45061632, "step": 655 }, { "epoch": 41.0, "grad_norm": 6.387264661998158, "learning_rate": 5e-05, "loss": 0.0879, "num_input_tokens_seen": 45133312, "step": 656 }, { "epoch": 41.0, "loss": 0.08795207738876343, "loss_ce": 0.08313030004501343, "loss_xval": 0.00482177734375, "num_input_tokens_seen": 45133312, "step": 656 }, { "epoch": 41.0625, "grad_norm": 38.77820198456228, "learning_rate": 5e-05, "loss": 0.1052, "num_input_tokens_seen": 45204992, "step": 657 }, { "epoch": 41.0625, "loss": 0.10423070937395096, "loss_ce": 0.08054906874895096, "loss_xval": 0.023681640625, "num_input_tokens_seen": 45204992, "step": 657 }, { "epoch": 41.125, "grad_norm": 7.696398023612668, "learning_rate": 5e-05, "loss": 0.0888, "num_input_tokens_seen": 45276736, "step": 658 }, { "epoch": 41.125, "loss": 0.08869247138500214, "loss_ce": 0.08310775458812714, "loss_xval": 0.005584716796875, "num_input_tokens_seen": 45276736, "step": 658 }, { "epoch": 41.1875, "grad_norm": 42.066906629424764, "learning_rate": 5e-05, "loss": 0.1093, "num_input_tokens_seen": 45348352, "step": 659 }, { "epoch": 41.1875, "loss": 0.10560226440429688, "loss_ce": 0.07996749877929688, "loss_xval": 0.025634765625, "num_input_tokens_seen": 45348352, "step": 659 }, { "epoch": 41.25, "grad_norm": 24.179182286884714, "learning_rate": 5e-05, "loss": 0.0898, "num_input_tokens_seen": 45420096, "step": 660 }, { "epoch": 41.25, "loss": 0.08728927373886108, "loss_ce": 0.07892745733261108, "loss_xval": 0.00836181640625, "num_input_tokens_seen": 45420096, "step": 660 }, { "epoch": 41.3125, "grad_norm": 37.78980795468517, "learning_rate": 5e-05, "loss": 0.1027, "num_input_tokens_seen": 45491712, "step": 661 }, { "epoch": 41.3125, "loss": 0.09915024042129517, "loss_ce": 0.07766586542129517, "loss_xval": 0.021484375, "num_input_tokens_seen": 45491712, "step": 661 }, { "epoch": 41.375, "grad_norm": 33.059393467942876, "learning_rate": 5e-05, "loss": 0.0992, "num_input_tokens_seen": 45563392, "step": 662 }, { "epoch": 41.375, "loss": 0.10039538890123367, "loss_ce": 0.08000964671373367, "loss_xval": 0.0203857421875, "num_input_tokens_seen": 45563392, "step": 662 }, { "epoch": 41.4375, "grad_norm": 32.69051951121018, "learning_rate": 5e-05, "loss": 0.0912, "num_input_tokens_seen": 45635008, "step": 663 }, { "epoch": 41.4375, "loss": 0.09104805439710617, "loss_ce": 0.07212715595960617, "loss_xval": 0.0189208984375, "num_input_tokens_seen": 45635008, "step": 663 }, { "epoch": 41.5, "grad_norm": 44.721200169778065, "learning_rate": 5e-05, "loss": 0.1053, "num_input_tokens_seen": 45694016, "step": 664 }, { "epoch": 41.5, "loss": 0.10693207383155823, "loss_ce": 0.07580414414405823, "loss_xval": 0.0311279296875, "num_input_tokens_seen": 45694016, "step": 664 }, { "epoch": 41.5625, "grad_norm": 10.834594654550962, "learning_rate": 5e-05, "loss": 0.0782, "num_input_tokens_seen": 45765824, "step": 665 }, { "epoch": 41.5625, "loss": 0.07746880501508713, "loss_ce": 0.07350151985883713, "loss_xval": 0.00396728515625, "num_input_tokens_seen": 45765824, "step": 665 }, { "epoch": 41.625, "grad_norm": 42.53339591806573, "learning_rate": 5e-05, "loss": 0.1004, "num_input_tokens_seen": 45837504, "step": 666 }, { "epoch": 41.625, "loss": 0.1011945903301239, "loss_ce": 0.0706770122051239, "loss_xval": 0.030517578125, "num_input_tokens_seen": 45837504, "step": 666 }, { "epoch": 41.6875, "grad_norm": 6.314878555402224, "learning_rate": 5e-05, "loss": 0.0723, "num_input_tokens_seen": 45909120, "step": 667 }, { "epoch": 41.6875, "loss": 0.07241284847259521, "loss_ce": 0.06925427913665771, "loss_xval": 0.0031585693359375, "num_input_tokens_seen": 45909120, "step": 667 }, { "epoch": 41.75, "grad_norm": 38.929875736582176, "learning_rate": 5e-05, "loss": 0.0937, "num_input_tokens_seen": 45980736, "step": 668 }, { "epoch": 41.75, "loss": 0.09472913295030594, "loss_ce": 0.07043714076280594, "loss_xval": 0.0242919921875, "num_input_tokens_seen": 45980736, "step": 668 }, { "epoch": 41.8125, "grad_norm": 24.81714823157948, "learning_rate": 5e-05, "loss": 0.0811, "num_input_tokens_seen": 46052416, "step": 669 }, { "epoch": 41.8125, "loss": 0.08008313924074173, "loss_ce": 0.06952405720949173, "loss_xval": 0.01055908203125, "num_input_tokens_seen": 46052416, "step": 669 }, { "epoch": 41.875, "grad_norm": 27.465588929973283, "learning_rate": 5e-05, "loss": 0.0793, "num_input_tokens_seen": 46111424, "step": 670 }, { "epoch": 41.875, "loss": 0.07748164236545563, "loss_ce": 0.06686152517795563, "loss_xval": 0.0106201171875, "num_input_tokens_seen": 46111424, "step": 670 }, { "epoch": 41.9375, "grad_norm": 46.14633889565823, "learning_rate": 5e-05, "loss": 0.0991, "num_input_tokens_seen": 46183104, "step": 671 }, { "epoch": 41.9375, "loss": 0.10552562028169632, "loss_ce": 0.06890452653169632, "loss_xval": 0.03662109375, "num_input_tokens_seen": 46183104, "step": 671 }, { "epoch": 42.0, "grad_norm": 12.898889615697131, "learning_rate": 5e-05, "loss": 0.0699, "num_input_tokens_seen": 46242176, "step": 672 }, { "epoch": 42.0, "loss": 0.07312614470720291, "loss_ce": 0.06470329314470291, "loss_xval": 0.0084228515625, "num_input_tokens_seen": 46242176, "step": 672 }, { "epoch": 42.0625, "grad_norm": 22.250142954382216, "learning_rate": 5e-05, "loss": 0.0728, "num_input_tokens_seen": 46313920, "step": 673 }, { "epoch": 42.0625, "loss": 0.0726141631603241, "loss_ce": 0.0619330070912838, "loss_xval": 0.01068115234375, "num_input_tokens_seen": 46313920, "step": 673 }, { "epoch": 42.125, "grad_norm": 15.733860421093425, "learning_rate": 5e-05, "loss": 0.0683, "num_input_tokens_seen": 46385664, "step": 674 }, { "epoch": 42.125, "loss": 0.06907017529010773, "loss_ce": 0.06220372021198273, "loss_xval": 0.006866455078125, "num_input_tokens_seen": 46385664, "step": 674 }, { "epoch": 42.1875, "grad_norm": 16.41276651460441, "learning_rate": 5e-05, "loss": 0.0708, "num_input_tokens_seen": 46457408, "step": 675 }, { "epoch": 42.1875, "loss": 0.0687999352812767, "loss_ce": 0.0634288415312767, "loss_xval": 0.00537109375, "num_input_tokens_seen": 46457408, "step": 675 }, { "epoch": 42.25, "grad_norm": 32.91585484605843, "learning_rate": 5e-05, "loss": 0.0785, "num_input_tokens_seen": 46528960, "step": 676 }, { "epoch": 42.25, "loss": 0.07857609540224075, "loss_ce": 0.06258488446474075, "loss_xval": 0.0159912109375, "num_input_tokens_seen": 46528960, "step": 676 }, { "epoch": 42.3125, "grad_norm": 13.792516811760255, "learning_rate": 5e-05, "loss": 0.0672, "num_input_tokens_seen": 46588096, "step": 677 }, { "epoch": 42.3125, "loss": 0.07050401717424393, "loss_ce": 0.06086046248674393, "loss_xval": 0.0096435546875, "num_input_tokens_seen": 46588096, "step": 677 }, { "epoch": 42.375, "grad_norm": 15.583831144343767, "learning_rate": 5e-05, "loss": 0.0654, "num_input_tokens_seen": 46659648, "step": 678 }, { "epoch": 42.375, "loss": 0.06597653776407242, "loss_ce": 0.05791989713907242, "loss_xval": 0.008056640625, "num_input_tokens_seen": 46659648, "step": 678 }, { "epoch": 42.4375, "grad_norm": 13.894096468825236, "learning_rate": 5e-05, "loss": 0.067, "num_input_tokens_seen": 46731264, "step": 679 }, { "epoch": 42.4375, "loss": 0.06740245968103409, "loss_ce": 0.06007824093103409, "loss_xval": 0.00732421875, "num_input_tokens_seen": 46731264, "step": 679 }, { "epoch": 42.5, "grad_norm": 11.877728077317133, "learning_rate": 5e-05, "loss": 0.0619, "num_input_tokens_seen": 46802944, "step": 680 }, { "epoch": 42.5, "loss": 0.06254500895738602, "loss_ce": 0.05720443278551102, "loss_xval": 0.005340576171875, "num_input_tokens_seen": 46802944, "step": 680 }, { "epoch": 42.5625, "grad_norm": 27.267415158619585, "learning_rate": 5e-05, "loss": 0.0697, "num_input_tokens_seen": 46874496, "step": 681 }, { "epoch": 42.5625, "loss": 0.06751132011413574, "loss_ce": 0.05512118339538574, "loss_xval": 0.01239013671875, "num_input_tokens_seen": 46874496, "step": 681 }, { "epoch": 42.625, "grad_norm": 16.835417062638697, "learning_rate": 5e-05, "loss": 0.0592, "num_input_tokens_seen": 46946176, "step": 682 }, { "epoch": 42.625, "loss": 0.06013280153274536, "loss_ce": 0.05412083864212036, "loss_xval": 0.006011962890625, "num_input_tokens_seen": 46946176, "step": 682 }, { "epoch": 42.6875, "grad_norm": 12.117372223184038, "learning_rate": 5e-05, "loss": 0.0611, "num_input_tokens_seen": 47017792, "step": 683 }, { "epoch": 42.6875, "loss": 0.05970475450158119, "loss_ce": 0.05415055528283119, "loss_xval": 0.00555419921875, "num_input_tokens_seen": 47017792, "step": 683 }, { "epoch": 42.75, "grad_norm": 42.62280927507315, "learning_rate": 5e-05, "loss": 0.0806, "num_input_tokens_seen": 47089344, "step": 684 }, { "epoch": 42.75, "loss": 0.08083441853523254, "loss_ce": 0.053246524184942245, "loss_xval": 0.027587890625, "num_input_tokens_seen": 47089344, "step": 684 }, { "epoch": 42.8125, "grad_norm": 48.03632893846273, "learning_rate": 5e-05, "loss": 0.085, "num_input_tokens_seen": 47148352, "step": 685 }, { "epoch": 42.8125, "loss": 0.08726750314235687, "loss_ce": 0.05333195626735687, "loss_xval": 0.033935546875, "num_input_tokens_seen": 47148352, "step": 685 }, { "epoch": 42.875, "grad_norm": 18.319266290733662, "learning_rate": 5e-05, "loss": 0.058, "num_input_tokens_seen": 47220032, "step": 686 }, { "epoch": 42.875, "loss": 0.05868493393063545, "loss_ce": 0.05191003158688545, "loss_xval": 0.00677490234375, "num_input_tokens_seen": 47220032, "step": 686 }, { "epoch": 42.9375, "grad_norm": 20.324921894587835, "learning_rate": 5e-05, "loss": 0.0575, "num_input_tokens_seen": 47291648, "step": 687 }, { "epoch": 42.9375, "loss": 0.057490069419145584, "loss_ce": 0.049769122153520584, "loss_xval": 0.007720947265625, "num_input_tokens_seen": 47291648, "step": 687 }, { "epoch": 43.0, "grad_norm": 35.3073557714966, "learning_rate": 5e-05, "loss": 0.0695, "num_input_tokens_seen": 47363456, "step": 688 }, { "epoch": 43.0, "loss": 0.07189452648162842, "loss_ce": 0.04870116710662842, "loss_xval": 0.023193359375, "num_input_tokens_seen": 47363456, "step": 688 }, { "epoch": 43.0625, "grad_norm": 24.857812599646923, "learning_rate": 5e-05, "loss": 0.0629, "num_input_tokens_seen": 47435008, "step": 689 }, { "epoch": 43.0625, "loss": 0.06210353970527649, "loss_ce": 0.04714992642402649, "loss_xval": 0.01495361328125, "num_input_tokens_seen": 47435008, "step": 689 }, { "epoch": 43.125, "grad_norm": 0.8296193514274374, "learning_rate": 5e-05, "loss": 0.0515, "num_input_tokens_seen": 47506752, "step": 690 }, { "epoch": 43.125, "loss": 0.05312083289027214, "loss_ce": 0.04974864050745964, "loss_xval": 0.0033721923828125, "num_input_tokens_seen": 47506752, "step": 690 }, { "epoch": 43.1875, "grad_norm": 26.00094135796779, "learning_rate": 5e-05, "loss": 0.0599, "num_input_tokens_seen": 47578368, "step": 691 }, { "epoch": 43.1875, "loss": 0.06024426594376564, "loss_ce": 0.04614514485001564, "loss_xval": 0.01409912109375, "num_input_tokens_seen": 47578368, "step": 691 }, { "epoch": 43.25, "grad_norm": 32.288835474595736, "learning_rate": 5e-05, "loss": 0.0623, "num_input_tokens_seen": 47637504, "step": 692 }, { "epoch": 43.25, "loss": 0.06244288384914398, "loss_ce": 0.04535304009914398, "loss_xval": 0.01708984375, "num_input_tokens_seen": 47637504, "step": 692 }, { "epoch": 43.3125, "grad_norm": 10.261597614874594, "learning_rate": 5e-05, "loss": 0.0508, "num_input_tokens_seen": 47709248, "step": 693 }, { "epoch": 43.3125, "loss": 0.051792070269584656, "loss_ce": 0.043308183550834656, "loss_xval": 0.00848388671875, "num_input_tokens_seen": 47709248, "step": 693 }, { "epoch": 43.375, "grad_norm": 21.415064135225904, "learning_rate": 5e-05, "loss": 0.0521, "num_input_tokens_seen": 47780928, "step": 694 }, { "epoch": 43.375, "loss": 0.053237855434417725, "loss_ce": 0.042983949184417725, "loss_xval": 0.01025390625, "num_input_tokens_seen": 47780928, "step": 694 }, { "epoch": 43.4375, "grad_norm": 43.97768401612803, "learning_rate": 5e-05, "loss": 0.072, "num_input_tokens_seen": 47852544, "step": 695 }, { "epoch": 43.4375, "loss": 0.0703013688325882, "loss_ce": 0.043201759457588196, "loss_xval": 0.027099609375, "num_input_tokens_seen": 47852544, "step": 695 }, { "epoch": 43.5, "grad_norm": 50.50532393833621, "learning_rate": 5e-05, "loss": 0.0806, "num_input_tokens_seen": 47924096, "step": 696 }, { "epoch": 43.5, "loss": 0.08449646830558777, "loss_ce": 0.04226014390587807, "loss_xval": 0.042236328125, "num_input_tokens_seen": 47924096, "step": 696 }, { "epoch": 43.5625, "grad_norm": 39.98741018375859, "learning_rate": 5e-05, "loss": 0.0663, "num_input_tokens_seen": 47995648, "step": 697 }, { "epoch": 43.5625, "loss": 0.0640028789639473, "loss_ce": 0.041175730526447296, "loss_xval": 0.0228271484375, "num_input_tokens_seen": 47995648, "step": 697 }, { "epoch": 43.625, "grad_norm": 19.82453199076641, "learning_rate": 5e-05, "loss": 0.0557, "num_input_tokens_seen": 48067392, "step": 698 }, { "epoch": 43.625, "loss": 0.06127607077360153, "loss_ce": 0.04015790671110153, "loss_xval": 0.0211181640625, "num_input_tokens_seen": 48067392, "step": 698 }, { "epoch": 43.6875, "grad_norm": 3.178978415114982, "learning_rate": 5e-05, "loss": 0.0444, "num_input_tokens_seen": 48139008, "step": 699 }, { "epoch": 43.6875, "loss": 0.0443594753742218, "loss_ce": 0.0406668484210968, "loss_xval": 0.003692626953125, "num_input_tokens_seen": 48139008, "step": 699 }, { "epoch": 43.75, "grad_norm": 19.867462089830223, "learning_rate": 5e-05, "loss": 0.0472, "num_input_tokens_seen": 48210688, "step": 700 }, { "epoch": 43.75, "loss": 0.04552736133337021, "loss_ce": 0.03808107227087021, "loss_xval": 0.0074462890625, "num_input_tokens_seen": 48210688, "step": 700 }, { "epoch": 43.8125, "grad_norm": 30.11916544309693, "learning_rate": 5e-05, "loss": 0.0548, "num_input_tokens_seen": 48282304, "step": 701 }, { "epoch": 43.8125, "loss": 0.05572984740138054, "loss_ce": 0.03973863646388054, "loss_xval": 0.0159912109375, "num_input_tokens_seen": 48282304, "step": 701 }, { "epoch": 43.875, "grad_norm": 33.42329124347825, "learning_rate": 5e-05, "loss": 0.0578, "num_input_tokens_seen": 48353856, "step": 702 }, { "epoch": 43.875, "loss": 0.059752244502305984, "loss_ce": 0.038878221064805984, "loss_xval": 0.0208740234375, "num_input_tokens_seen": 48353856, "step": 702 }, { "epoch": 43.9375, "grad_norm": 34.79651760572601, "learning_rate": 5e-05, "loss": 0.0606, "num_input_tokens_seen": 48412992, "step": 703 }, { "epoch": 43.9375, "loss": 0.057222314178943634, "loss_ce": 0.034273095428943634, "loss_xval": 0.02294921875, "num_input_tokens_seen": 48412992, "step": 703 }, { "epoch": 44.0, "grad_norm": 42.63367419447805, "learning_rate": 5e-05, "loss": 0.0643, "num_input_tokens_seen": 48484672, "step": 704 }, { "epoch": 44.0, "loss": 0.061634887009859085, "loss_ce": 0.038319457322359085, "loss_xval": 0.0233154296875, "num_input_tokens_seen": 48484672, "step": 704 }, { "epoch": 44.0625, "grad_norm": 56.91280811688065, "learning_rate": 5e-05, "loss": 0.0814, "num_input_tokens_seen": 48556352, "step": 705 }, { "epoch": 44.0625, "loss": 0.07580693066120148, "loss_ce": 0.034791309386491776, "loss_xval": 0.041015625, "num_input_tokens_seen": 48556352, "step": 705 }, { "epoch": 44.125, "grad_norm": 78.36616983589818, "learning_rate": 5e-05, "loss": 0.125, "num_input_tokens_seen": 48627968, "step": 706 }, { "epoch": 44.125, "loss": 0.12948673963546753, "loss_ce": 0.03671329841017723, "loss_xval": 0.0927734375, "num_input_tokens_seen": 48627968, "step": 706 }, { "epoch": 44.1875, "grad_norm": 106.25507358737354, "learning_rate": 5e-05, "loss": 0.1908, "num_input_tokens_seen": 48699584, "step": 707 }, { "epoch": 44.1875, "loss": 0.19240882992744446, "loss_ce": 0.03615882247686386, "loss_xval": 0.15625, "num_input_tokens_seen": 48699584, "step": 707 }, { "epoch": 44.25, "grad_norm": 139.4602745129167, "learning_rate": 5e-05, "loss": 0.3064, "num_input_tokens_seen": 48771328, "step": 708 }, { "epoch": 44.25, "loss": 0.31655657291412354, "loss_ce": 0.035306572914123535, "loss_xval": 0.28125, "num_input_tokens_seen": 48771328, "step": 708 }, { "epoch": 44.3125, "grad_norm": 175.85977776893148, "learning_rate": 5e-05, "loss": 0.4576, "num_input_tokens_seen": 48843008, "step": 709 }, { "epoch": 44.3125, "loss": 0.45811718702316284, "loss_ce": 0.03428905829787254, "loss_xval": 0.423828125, "num_input_tokens_seen": 48843008, "step": 709 }, { "epoch": 44.375, "grad_norm": 192.97655324199405, "learning_rate": 5e-05, "loss": 0.5435, "num_input_tokens_seen": 48914560, "step": 710 }, { "epoch": 44.375, "loss": 0.5439357757568359, "loss_ce": 0.036123279482126236, "loss_xval": 0.5078125, "num_input_tokens_seen": 48914560, "step": 710 }, { "epoch": 44.4375, "grad_norm": 165.50638734522764, "learning_rate": 5e-05, "loss": 0.4227, "num_input_tokens_seen": 48986240, "step": 711 }, { "epoch": 44.4375, "loss": 0.41279712319374084, "loss_ce": 0.03193773701786995, "loss_xval": 0.380859375, "num_input_tokens_seen": 48986240, "step": 711 }, { "epoch": 44.5, "grad_norm": 84.90065968277804, "learning_rate": 5e-05, "loss": 0.1522, "num_input_tokens_seen": 49057856, "step": 712 }, { "epoch": 44.5, "loss": 0.15466076135635376, "loss_ce": 0.03844981640577316, "loss_xval": 0.1162109375, "num_input_tokens_seen": 49057856, "step": 712 }, { "epoch": 44.5625, "grad_norm": 35.09701586106571, "learning_rate": 5e-05, "loss": 0.0663, "num_input_tokens_seen": 49129408, "step": 713 }, { "epoch": 44.5625, "loss": 0.0639449805021286, "loss_ce": 0.0364791564643383, "loss_xval": 0.0274658203125, "num_input_tokens_seen": 49129408, "step": 713 }, { "epoch": 44.625, "grad_norm": 129.3152090152038, "learning_rate": 5e-05, "loss": 0.2909, "num_input_tokens_seen": 49201088, "step": 714 }, { "epoch": 44.625, "loss": 0.2906899154186249, "loss_ce": 0.04166646674275398, "loss_xval": 0.2490234375, "num_input_tokens_seen": 49201088, "step": 714 }, { "epoch": 44.6875, "grad_norm": 136.13492585050543, "learning_rate": 5e-05, "loss": 0.3203, "num_input_tokens_seen": 49272832, "step": 715 }, { "epoch": 44.6875, "loss": 0.315001517534256, "loss_ce": 0.043517131358385086, "loss_xval": 0.271484375, "num_input_tokens_seen": 49272832, "step": 715 }, { "epoch": 44.75, "grad_norm": 54.00318687230522, "learning_rate": 5e-05, "loss": 0.0901, "num_input_tokens_seen": 49344576, "step": 716 }, { "epoch": 44.75, "loss": 0.09366574883460999, "loss_ce": 0.039466530084609985, "loss_xval": 0.05419921875, "num_input_tokens_seen": 49344576, "step": 716 }, { "epoch": 44.8125, "grad_norm": 59.18192953802604, "learning_rate": 5e-05, "loss": 0.1019, "num_input_tokens_seen": 49416192, "step": 717 }, { "epoch": 44.8125, "loss": 0.10018584132194519, "loss_ce": 0.04037138447165489, "loss_xval": 0.059814453125, "num_input_tokens_seen": 49416192, "step": 717 }, { "epoch": 44.875, "grad_norm": 116.34551079767814, "learning_rate": 5e-05, "loss": 0.256, "num_input_tokens_seen": 49487744, "step": 718 }, { "epoch": 44.875, "loss": 0.24698016047477722, "loss_ce": 0.042878590524196625, "loss_xval": 0.2041015625, "num_input_tokens_seen": 49487744, "step": 718 }, { "epoch": 44.9375, "grad_norm": 72.96403398639748, "learning_rate": 5e-05, "loss": 0.1235, "num_input_tokens_seen": 49546880, "step": 719 }, { "epoch": 44.9375, "loss": 0.1306336671113968, "loss_ce": 0.03688367083668709, "loss_xval": 0.09375, "num_input_tokens_seen": 49546880, "step": 719 }, { "epoch": 45.0, "grad_norm": 34.10621333807227, "learning_rate": 5e-05, "loss": 0.062, "num_input_tokens_seen": 49618496, "step": 720 }, { "epoch": 45.0, "loss": 0.06327150762081146, "loss_ce": 0.03885744512081146, "loss_xval": 0.0244140625, "num_input_tokens_seen": 49618496, "step": 720 }, { "epoch": 45.0625, "grad_norm": 106.42200449605606, "learning_rate": 5e-05, "loss": 0.2205, "num_input_tokens_seen": 49690048, "step": 721 }, { "epoch": 45.0625, "loss": 0.23118966817855835, "loss_ce": 0.03685373440384865, "loss_xval": 0.1943359375, "num_input_tokens_seen": 49690048, "step": 721 }, { "epoch": 45.125, "grad_norm": 83.45709473306052, "learning_rate": 5e-05, "loss": 0.1568, "num_input_tokens_seen": 49761728, "step": 722 }, { "epoch": 45.125, "loss": 0.14840547740459442, "loss_ce": 0.03658906742930412, "loss_xval": 0.11181640625, "num_input_tokens_seen": 49761728, "step": 722 }, { "epoch": 45.1875, "grad_norm": 2.3389858812521966, "learning_rate": 5e-05, "loss": 0.0446, "num_input_tokens_seen": 49833280, "step": 723 }, { "epoch": 45.1875, "loss": 0.04393548145890236, "loss_ce": 0.03792351856827736, "loss_xval": 0.006011962890625, "num_input_tokens_seen": 49833280, "step": 723 }, { "epoch": 45.25, "grad_norm": 72.71402722285599, "learning_rate": 5e-05, "loss": 0.1276, "num_input_tokens_seen": 49904896, "step": 724 }, { "epoch": 45.25, "loss": 0.1313050538301468, "loss_ce": 0.03267224133014679, "loss_xval": 0.0986328125, "num_input_tokens_seen": 49904896, "step": 724 }, { "epoch": 45.3125, "grad_norm": 86.73820778200127, "learning_rate": 5e-05, "loss": 0.166, "num_input_tokens_seen": 49976448, "step": 725 }, { "epoch": 45.3125, "loss": 0.17679466307163239, "loss_ce": 0.039099350571632385, "loss_xval": 0.1376953125, "num_input_tokens_seen": 49976448, "step": 725 }, { "epoch": 45.375, "grad_norm": 41.67951897794212, "learning_rate": 5e-05, "loss": 0.0656, "num_input_tokens_seen": 50048000, "step": 726 }, { "epoch": 45.375, "loss": 0.06182122975587845, "loss_ce": 0.03350091725587845, "loss_xval": 0.0283203125, "num_input_tokens_seen": 50048000, "step": 726 }, { "epoch": 45.4375, "grad_norm": 25.223047984935373, "learning_rate": 5e-05, "loss": 0.0496, "num_input_tokens_seen": 50119680, "step": 727 }, { "epoch": 45.4375, "loss": 0.04861375689506531, "loss_ce": 0.03701707720756531, "loss_xval": 0.0115966796875, "num_input_tokens_seen": 50119680, "step": 727 }, { "epoch": 45.5, "grad_norm": 58.577934750268255, "learning_rate": 5e-05, "loss": 0.0958, "num_input_tokens_seen": 50191424, "step": 728 }, { "epoch": 45.5, "loss": 0.09291598200798035, "loss_ce": 0.03432223200798035, "loss_xval": 0.05859375, "num_input_tokens_seen": 50191424, "step": 728 }, { "epoch": 45.5625, "grad_norm": 39.21227207563384, "learning_rate": 5e-05, "loss": 0.0598, "num_input_tokens_seen": 50263168, "step": 729 }, { "epoch": 45.5625, "loss": 0.061409905552864075, "loss_ce": 0.028695063665509224, "loss_xval": 0.03271484375, "num_input_tokens_seen": 50263168, "step": 729 }, { "epoch": 45.625, "grad_norm": 9.434706828644057, "learning_rate": 5e-05, "loss": 0.0343, "num_input_tokens_seen": 50334720, "step": 730 }, { "epoch": 45.625, "loss": 0.03275488317012787, "loss_ce": 0.027292238548398018, "loss_xval": 0.005462646484375, "num_input_tokens_seen": 50334720, "step": 730 }, { "epoch": 45.6875, "grad_norm": 44.995512555068565, "learning_rate": 5e-05, "loss": 0.0675, "num_input_tokens_seen": 50393856, "step": 731 }, { "epoch": 45.6875, "loss": 0.0645364299416542, "loss_ce": 0.029868463054299355, "loss_xval": 0.03466796875, "num_input_tokens_seen": 50393856, "step": 731 }, { "epoch": 45.75, "grad_norm": 41.09193323924135, "learning_rate": 5e-05, "loss": 0.0608, "num_input_tokens_seen": 50452928, "step": 732 }, { "epoch": 45.75, "loss": 0.05605471879243851, "loss_ce": 0.026513701304793358, "loss_xval": 0.029541015625, "num_input_tokens_seen": 50452928, "step": 732 }, { "epoch": 45.8125, "grad_norm": 7.2153893512299145, "learning_rate": 5e-05, "loss": 0.0316, "num_input_tokens_seen": 50512064, "step": 733 }, { "epoch": 45.8125, "loss": 0.030238093808293343, "loss_ce": 0.026057185605168343, "loss_xval": 0.004180908203125, "num_input_tokens_seen": 50512064, "step": 733 }, { "epoch": 45.875, "grad_norm": 29.270123435698675, "learning_rate": 5e-05, "loss": 0.0448, "num_input_tokens_seen": 50583616, "step": 734 }, { "epoch": 45.875, "loss": 0.04943709075450897, "loss_ce": 0.026609940454363823, "loss_xval": 0.0228271484375, "num_input_tokens_seen": 50583616, "step": 734 }, { "epoch": 45.9375, "grad_norm": 43.11011675650782, "learning_rate": 5e-05, "loss": 0.0611, "num_input_tokens_seen": 50655360, "step": 735 }, { "epoch": 45.9375, "loss": 0.05930478125810623, "loss_ce": 0.02610165812075138, "loss_xval": 0.033203125, "num_input_tokens_seen": 50655360, "step": 735 }, { "epoch": 46.0, "grad_norm": 26.0936617683548, "learning_rate": 5e-05, "loss": 0.0423, "num_input_tokens_seen": 50726912, "step": 736 }, { "epoch": 46.0, "loss": 0.045453645288944244, "loss_ce": 0.024213409051299095, "loss_xval": 0.021240234375, "num_input_tokens_seen": 50726912, "step": 736 }, { "epoch": 46.0625, "grad_norm": 7.645360379252364, "learning_rate": 5e-05, "loss": 0.0248, "num_input_tokens_seen": 50786112, "step": 737 }, { "epoch": 46.0625, "loss": 0.024106912314891815, "loss_ce": 0.019864968955516815, "loss_xval": 0.004241943359375, "num_input_tokens_seen": 50786112, "step": 737 }, { "epoch": 46.125, "grad_norm": 36.16606847044462, "learning_rate": 5e-05, "loss": 0.0496, "num_input_tokens_seen": 50857792, "step": 738 }, { "epoch": 46.125, "loss": 0.048444587737321854, "loss_ce": 0.025495368987321854, "loss_xval": 0.02294921875, "num_input_tokens_seen": 50857792, "step": 738 }, { "epoch": 46.1875, "grad_norm": 43.482696946571494, "learning_rate": 5e-05, "loss": 0.059, "num_input_tokens_seen": 50929344, "step": 739 }, { "epoch": 46.1875, "loss": 0.057134151458740234, "loss_ce": 0.024175165221095085, "loss_xval": 0.032958984375, "num_input_tokens_seen": 50929344, "step": 739 }, { "epoch": 46.25, "grad_norm": 22.770786901935196, "learning_rate": 5e-05, "loss": 0.0308, "num_input_tokens_seen": 50988480, "step": 740 }, { "epoch": 46.25, "loss": 0.029636967927217484, "loss_ce": 0.017124760895967484, "loss_xval": 0.01251220703125, "num_input_tokens_seen": 50988480, "step": 740 }, { "epoch": 46.3125, "grad_norm": 16.404867876806033, "learning_rate": 5e-05, "loss": 0.0272, "num_input_tokens_seen": 51060160, "step": 741 }, { "epoch": 46.3125, "loss": 0.026602361351251602, "loss_ce": 0.019644353538751602, "loss_xval": 0.0069580078125, "num_input_tokens_seen": 51060160, "step": 741 }, { "epoch": 46.375, "grad_norm": 54.91413940931545, "learning_rate": 5e-05, "loss": 0.0718, "num_input_tokens_seen": 51119296, "step": 742 }, { "epoch": 46.375, "loss": 0.07138977199792862, "loss_ce": 0.01670227386057377, "loss_xval": 0.0546875, "num_input_tokens_seen": 51119296, "step": 742 }, { "epoch": 46.4375, "grad_norm": 71.93891750065025, "learning_rate": 5e-05, "loss": 0.1097, "num_input_tokens_seen": 51190912, "step": 743 }, { "epoch": 46.4375, "loss": 0.11477219313383102, "loss_ce": 0.017115944996476173, "loss_xval": 0.09765625, "num_input_tokens_seen": 51190912, "step": 743 }, { "epoch": 46.5, "grad_norm": 59.730350011881576, "learning_rate": 5e-05, "loss": 0.0768, "num_input_tokens_seen": 51262656, "step": 744 }, { "epoch": 46.5, "loss": 0.07721823453903198, "loss_ce": 0.013253391720354557, "loss_xval": 0.06396484375, "num_input_tokens_seen": 51262656, "step": 744 }, { "epoch": 46.5625, "grad_norm": 24.58129799871284, "learning_rate": 5e-05, "loss": 0.0272, "num_input_tokens_seen": 51334208, "step": 745 }, { "epoch": 46.5625, "loss": 0.02467159926891327, "loss_ce": 0.014539762400090694, "loss_xval": 0.0101318359375, "num_input_tokens_seen": 51334208, "step": 745 }, { "epoch": 46.625, "grad_norm": 18.53325748451815, "learning_rate": 5e-05, "loss": 0.0218, "num_input_tokens_seen": 51405824, "step": 746 }, { "epoch": 46.625, "loss": 0.017218932509422302, "loss_ce": 0.011847837828099728, "loss_xval": 0.00537109375, "num_input_tokens_seen": 51405824, "step": 746 }, { "epoch": 46.6875, "grad_norm": 50.9913159022942, "learning_rate": 5e-05, "loss": 0.0579, "num_input_tokens_seen": 51477440, "step": 747 }, { "epoch": 46.6875, "loss": 0.059509001672267914, "loss_ce": 0.009216032922267914, "loss_xval": 0.05029296875, "num_input_tokens_seen": 51477440, "step": 747 }, { "epoch": 46.75, "grad_norm": 63.66315135326273, "learning_rate": 5e-05, "loss": 0.0821, "num_input_tokens_seen": 51549120, "step": 748 }, { "epoch": 46.75, "loss": 0.0909615084528923, "loss_ce": 0.008930260315537453, "loss_xval": 0.08203125, "num_input_tokens_seen": 51549120, "step": 748 }, { "epoch": 46.8125, "grad_norm": 53.49089256030126, "learning_rate": 5e-05, "loss": 0.0614, "num_input_tokens_seen": 51608256, "step": 749 }, { "epoch": 46.8125, "loss": 0.05685190483927727, "loss_ce": 0.008023779839277267, "loss_xval": 0.048828125, "num_input_tokens_seen": 51608256, "step": 749 }, { "epoch": 46.875, "grad_norm": 22.96802504271489, "learning_rate": 5e-05, "loss": 0.0204, "num_input_tokens_seen": 51679936, "step": 750 }, { "epoch": 46.875, "eval_synth_IoU": 0.0007812945987097919, "eval_synth_MAE_x": 0.0928192138671875, "eval_synth_MAE_y": 0.088897705078125, "eval_synth_NUM_probability": 0.9719446450471878, "eval_synth_inside_bbox": 0.0, "eval_synth_loss": 0.015559088438749313, "eval_synth_loss_ce": 0.006327520357444882, "eval_synth_loss_xval": 0.0092315673828125, "eval_synth_runtime": 55.1359, "eval_synth_samples_per_second": 2.322, "eval_synth_steps_per_second": 0.073, "num_input_tokens_seen": 51679936, "step": 750 }, { "epoch": 46.875, "loss": 0.018887830898165703, "loss_ce": 0.0071080452762544155, "loss_xval": 0.01177978515625, "num_input_tokens_seen": 51679936, "step": 750 }, { "epoch": 46.9375, "grad_norm": 19.42051608033023, "learning_rate": 5e-05, "loss": 0.0179, "num_input_tokens_seen": 51751680, "step": 751 }, { "epoch": 46.9375, "loss": 0.012810613960027695, "loss_ce": 0.007103826384991407, "loss_xval": 0.005706787109375, "num_input_tokens_seen": 51751680, "step": 751 }, { "epoch": 47.0, "grad_norm": 54.658150025767185, "learning_rate": 5e-05, "loss": 0.0623, "num_input_tokens_seen": 51823232, "step": 752 }, { "epoch": 47.0, "loss": 0.06768595427274704, "loss_ce": 0.006650795694440603, "loss_xval": 0.06103515625, "num_input_tokens_seen": 51823232, "step": 752 }, { "epoch": 47.0625, "grad_norm": 69.07348114452765, "learning_rate": 5e-05, "loss": 0.0919, "num_input_tokens_seen": 51894912, "step": 753 }, { "epoch": 47.0625, "loss": 0.09093870967626572, "loss_ce": 0.005489490460604429, "loss_xval": 0.08544921875, "num_input_tokens_seen": 51894912, "step": 753 }, { "epoch": 47.125, "grad_norm": 60.03217462112842, "learning_rate": 5e-05, "loss": 0.0746, "num_input_tokens_seen": 51966656, "step": 754 }, { "epoch": 47.125, "loss": 0.07557681947946548, "loss_ce": 0.004776036832481623, "loss_xval": 0.07080078125, "num_input_tokens_seen": 51966656, "step": 754 }, { "epoch": 47.1875, "grad_norm": 43.325007150716914, "learning_rate": 5e-05, "loss": 0.0462, "num_input_tokens_seen": 52038336, "step": 755 }, { "epoch": 47.1875, "loss": 0.04101795330643654, "loss_ce": 0.004396860953420401, "loss_xval": 0.03662109375, "num_input_tokens_seen": 52038336, "step": 755 }, { "epoch": 47.25, "grad_norm": 27.442651321523833, "learning_rate": 5e-05, "loss": 0.0226, "num_input_tokens_seen": 52109888, "step": 756 }, { "epoch": 47.25, "loss": 0.017444264143705368, "loss_ce": 0.004260670859366655, "loss_xval": 0.01318359375, "num_input_tokens_seen": 52109888, "step": 756 }, { "epoch": 47.3125, "grad_norm": 8.615957725807633, "learning_rate": 5e-05, "loss": 0.013, "num_input_tokens_seen": 52181632, "step": 757 }, { "epoch": 47.3125, "loss": 0.014540612697601318, "loss_ce": 0.004530847072601318, "loss_xval": 0.010009765625, "num_input_tokens_seen": 52181632, "step": 757 }, { "epoch": 47.375, "grad_norm": 10.469872066339313, "learning_rate": 5e-05, "loss": 0.011, "num_input_tokens_seen": 52240640, "step": 758 }, { "epoch": 47.375, "loss": 0.011658506467938423, "loss_ce": 0.003540830686688423, "loss_xval": 0.00811767578125, "num_input_tokens_seen": 52240640, "step": 758 }, { "epoch": 47.4375, "grad_norm": 22.950363116153852, "learning_rate": 5e-05, "loss": 0.0188, "num_input_tokens_seen": 52312320, "step": 759 }, { "epoch": 47.4375, "loss": 0.01578526385128498, "loss_ce": 0.003212022129446268, "loss_xval": 0.0125732421875, "num_input_tokens_seen": 52312320, "step": 759 }, { "epoch": 47.5, "grad_norm": 24.16847740208771, "learning_rate": 5e-05, "loss": 0.0171, "num_input_tokens_seen": 52384000, "step": 760 }, { "epoch": 47.5, "loss": 0.01860148459672928, "loss_ce": 0.0032206252217292786, "loss_xval": 0.015380859375, "num_input_tokens_seen": 52384000, "step": 760 }, { "epoch": 47.5625, "grad_norm": 17.844345643001546, "learning_rate": 5e-05, "loss": 0.0128, "num_input_tokens_seen": 52455680, "step": 761 }, { "epoch": 47.5625, "loss": 0.013861645944416523, "loss_ce": 0.003424634225666523, "loss_xval": 0.01043701171875, "num_input_tokens_seen": 52455680, "step": 761 }, { "epoch": 47.625, "grad_norm": 18.057741465889073, "learning_rate": 5e-05, "loss": 0.0106, "num_input_tokens_seen": 52527296, "step": 762 }, { "epoch": 47.625, "loss": 0.011138019151985645, "loss_ce": 0.002715167822316289, "loss_xval": 0.0084228515625, "num_input_tokens_seen": 52527296, "step": 762 }, { "epoch": 47.6875, "grad_norm": 25.332474387451597, "learning_rate": 5e-05, "loss": 0.0169, "num_input_tokens_seen": 52598976, "step": 763 }, { "epoch": 47.6875, "loss": 0.017574511468410492, "loss_ce": 0.0027429680339992046, "loss_xval": 0.01483154296875, "num_input_tokens_seen": 52598976, "step": 763 }, { "epoch": 47.75, "grad_norm": 35.70284428611629, "learning_rate": 5e-05, "loss": 0.0292, "num_input_tokens_seen": 52658240, "step": 764 }, { "epoch": 47.75, "loss": 0.029903825372457504, "loss_ce": 0.0024380050599575043, "loss_xval": 0.0274658203125, "num_input_tokens_seen": 52658240, "step": 764 }, { "epoch": 47.8125, "grad_norm": 49.93003045007072, "learning_rate": 5e-05, "loss": 0.0493, "num_input_tokens_seen": 52729920, "step": 765 }, { "epoch": 47.8125, "loss": 0.047652266919612885, "loss_ce": 0.002486249664798379, "loss_xval": 0.045166015625, "num_input_tokens_seen": 52729920, "step": 765 }, { "epoch": 47.875, "grad_norm": 67.404459413659, "learning_rate": 5e-05, "loss": 0.0829, "num_input_tokens_seen": 52801728, "step": 766 }, { "epoch": 47.875, "loss": 0.0774126872420311, "loss_ce": 0.0017290961695834994, "loss_xval": 0.07568359375, "num_input_tokens_seen": 52801728, "step": 766 }, { "epoch": 47.9375, "grad_norm": 90.78005663657689, "learning_rate": 5e-05, "loss": 0.149, "num_input_tokens_seen": 52860864, "step": 767 }, { "epoch": 47.9375, "loss": 0.14267286658287048, "loss_ce": 0.0020478684455156326, "loss_xval": 0.140625, "num_input_tokens_seen": 52860864, "step": 767 }, { "epoch": 48.0, "grad_norm": 120.70776113463316, "learning_rate": 5e-05, "loss": 0.258, "num_input_tokens_seen": 52920064, "step": 768 }, { "epoch": 48.0, "loss": 0.2656416893005371, "loss_ce": 0.0019698101095855236, "loss_xval": 0.263671875, "num_input_tokens_seen": 52920064, "step": 768 }, { "epoch": 48.0625, "grad_norm": 154.26644625699004, "learning_rate": 5e-05, "loss": 0.4307, "num_input_tokens_seen": 52991744, "step": 769 }, { "epoch": 48.0625, "loss": 0.42958393692970276, "loss_ce": 0.0018495743861421943, "loss_xval": 0.427734375, "num_input_tokens_seen": 52991744, "step": 769 }, { "epoch": 48.125, "grad_norm": 183.65303034626527, "learning_rate": 5e-05, "loss": 0.6016, "num_input_tokens_seen": 53063424, "step": 770 }, { "epoch": 48.125, "loss": 0.5957886576652527, "loss_ce": 0.002038649283349514, "loss_xval": 0.59375, "num_input_tokens_seen": 53063424, "step": 770 }, { "epoch": 48.1875, "grad_norm": 180.1180856007345, "learning_rate": 5e-05, "loss": 0.5999, "num_input_tokens_seen": 53135104, "step": 771 }, { "epoch": 48.1875, "loss": 0.5726340413093567, "loss_ce": 0.0023215236142277718, "loss_xval": 0.5703125, "num_input_tokens_seen": 53135104, "step": 771 }, { "epoch": 48.25, "grad_norm": 125.47094176348381, "learning_rate": 5e-05, "loss": 0.2987, "num_input_tokens_seen": 53206784, "step": 772 }, { "epoch": 48.25, "loss": 0.2898358702659607, "loss_ce": 0.002726496197283268, "loss_xval": 0.287109375, "num_input_tokens_seen": 53206784, "step": 772 }, { "epoch": 48.3125, "grad_norm": 24.422705550921062, "learning_rate": 5e-05, "loss": 0.0202, "num_input_tokens_seen": 53278528, "step": 773 }, { "epoch": 48.3125, "loss": 0.015276818536221981, "loss_ce": 0.003924279473721981, "loss_xval": 0.0113525390625, "num_input_tokens_seen": 53278528, "step": 773 }, { "epoch": 48.375, "grad_norm": 81.92908246583572, "learning_rate": 5e-05, "loss": 0.1362, "num_input_tokens_seen": 53350144, "step": 774 }, { "epoch": 48.375, "loss": 0.1354779452085495, "loss_ce": 0.003642010036855936, "loss_xval": 0.1318359375, "num_input_tokens_seen": 53350144, "step": 774 }, { "epoch": 48.4375, "grad_norm": 132.71379220552564, "learning_rate": 5e-05, "loss": 0.3499, "num_input_tokens_seen": 53421824, "step": 775 }, { "epoch": 48.4375, "loss": 0.3635503947734833, "loss_ce": 0.00417538033798337, "loss_xval": 0.359375, "num_input_tokens_seen": 53421824, "step": 775 }, { "epoch": 48.5, "grad_norm": 103.21943678223388, "learning_rate": 5e-05, "loss": 0.2241, "num_input_tokens_seen": 53493504, "step": 776 }, { "epoch": 48.5, "loss": 0.22680360078811646, "loss_ce": 0.005123913753777742, "loss_xval": 0.2216796875, "num_input_tokens_seen": 53493504, "step": 776 }, { "epoch": 48.5625, "grad_norm": 20.388756780472935, "learning_rate": 5e-05, "loss": 0.0227, "num_input_tokens_seen": 53565120, "step": 777 }, { "epoch": 48.5625, "loss": 0.016845600679516792, "loss_ce": 0.00463856989517808, "loss_xval": 0.01220703125, "num_input_tokens_seen": 53565120, "step": 777 }, { "epoch": 48.625, "grad_norm": 54.35238290952437, "learning_rate": 5e-05, "loss": 0.0736, "num_input_tokens_seen": 53624320, "step": 778 }, { "epoch": 48.625, "loss": 0.08257697522640228, "loss_ce": 0.005916816648095846, "loss_xval": 0.07666015625, "num_input_tokens_seen": 53624320, "step": 778 }, { "epoch": 48.6875, "grad_norm": 69.92504449745852, "learning_rate": 5e-05, "loss": 0.1096, "num_input_tokens_seen": 53683520, "step": 779 }, { "epoch": 48.6875, "loss": 0.11179614067077637, "loss_ce": 0.005350826773792505, "loss_xval": 0.1064453125, "num_input_tokens_seen": 53683520, "step": 779 }, { "epoch": 48.75, "grad_norm": 31.00750248115586, "learning_rate": 5e-05, "loss": 0.0312, "num_input_tokens_seen": 53755072, "step": 780 }, { "epoch": 48.75, "loss": 0.03486330807209015, "loss_ce": 0.006176783703267574, "loss_xval": 0.0286865234375, "num_input_tokens_seen": 53755072, "step": 780 }, { "epoch": 48.8125, "grad_norm": 26.982223682099708, "learning_rate": 5e-05, "loss": 0.0294, "num_input_tokens_seen": 53826752, "step": 781 }, { "epoch": 48.8125, "loss": 0.02688555046916008, "loss_ce": 0.005401174537837505, "loss_xval": 0.021484375, "num_input_tokens_seen": 53826752, "step": 781 }, { "epoch": 48.875, "grad_norm": 55.79318187391173, "learning_rate": 5e-05, "loss": 0.0742, "num_input_tokens_seen": 53898496, "step": 782 }, { "epoch": 48.875, "loss": 0.08878543227910995, "loss_ce": 0.004801060538738966, "loss_xval": 0.083984375, "num_input_tokens_seen": 53898496, "step": 782 }, { "epoch": 48.9375, "grad_norm": 31.524279884792552, "learning_rate": 5e-05, "loss": 0.0291, "num_input_tokens_seen": 53970176, "step": 783 }, { "epoch": 48.9375, "loss": 0.024802180007100105, "loss_ce": 0.004416438285261393, "loss_xval": 0.0203857421875, "num_input_tokens_seen": 53970176, "step": 783 }, { "epoch": 49.0, "grad_norm": 18.41228108143884, "learning_rate": 5e-05, "loss": 0.0153, "num_input_tokens_seen": 54041792, "step": 784 }, { "epoch": 49.0, "loss": 0.017882753163576126, "loss_ce": 0.0043329475447535515, "loss_xval": 0.0135498046875, "num_input_tokens_seen": 54041792, "step": 784 }, { "epoch": 49.0625, "grad_norm": 55.099401878604795, "learning_rate": 5e-05, "loss": 0.0738, "num_input_tokens_seen": 54113344, "step": 785 }, { "epoch": 49.0625, "loss": 0.0743025466799736, "loss_ce": 0.0039900485426187515, "loss_xval": 0.0703125, "num_input_tokens_seen": 54113344, "step": 785 }, { "epoch": 49.125, "grad_norm": 54.48862222878034, "learning_rate": 5e-05, "loss": 0.0694, "num_input_tokens_seen": 54184896, "step": 786 }, { "epoch": 49.125, "loss": 0.06297452747821808, "loss_ce": 0.004136639181524515, "loss_xval": 0.058837890625, "num_input_tokens_seen": 54184896, "step": 786 }, { "epoch": 49.1875, "grad_norm": 25.23820781380214, "learning_rate": 5e-05, "loss": 0.0204, "num_input_tokens_seen": 54256576, "step": 787 }, { "epoch": 49.1875, "loss": 0.01891055330634117, "loss_ce": 0.0038959055673331022, "loss_xval": 0.0150146484375, "num_input_tokens_seen": 54256576, "step": 787 }, { "epoch": 49.25, "grad_norm": 6.017261666530016, "learning_rate": 5e-05, "loss": 0.0074, "num_input_tokens_seen": 54328384, "step": 788 }, { "epoch": 49.25, "loss": 0.006588565185666084, "loss_ce": 0.0036131010856479406, "loss_xval": 0.0029754638671875, "num_input_tokens_seen": 54328384, "step": 788 }, { "epoch": 49.3125, "grad_norm": 27.539892076431858, "learning_rate": 5e-05, "loss": 0.022, "num_input_tokens_seen": 54387520, "step": 789 }, { "epoch": 49.3125, "loss": 0.021436044946312904, "loss_ce": 0.004712412133812904, "loss_xval": 0.0167236328125, "num_input_tokens_seen": 54387520, "step": 789 }, { "epoch": 49.375, "grad_norm": 32.542488691104325, "learning_rate": 5e-05, "loss": 0.0303, "num_input_tokens_seen": 54459136, "step": 790 }, { "epoch": 49.375, "loss": 0.030650347471237183, "loss_ce": 0.005992144346237183, "loss_xval": 0.024658203125, "num_input_tokens_seen": 54459136, "step": 790 }, { "epoch": 49.4375, "grad_norm": 20.238510613982434, "learning_rate": 5e-05, "loss": 0.0137, "num_input_tokens_seen": 54530752, "step": 791 }, { "epoch": 49.4375, "loss": 0.011637124232947826, "loss_ce": 0.0035194484516978264, "loss_xval": 0.00811767578125, "num_input_tokens_seen": 54530752, "step": 791 }, { "epoch": 49.5, "grad_norm": 1.5253884409622565, "learning_rate": 5e-05, "loss": 0.0056, "num_input_tokens_seen": 54602368, "step": 792 }, { "epoch": 49.5, "loss": 0.005845654755830765, "loss_ce": 0.002915967023000121, "loss_xval": 0.0029296875, "num_input_tokens_seen": 54602368, "step": 792 }, { "epoch": 49.5625, "grad_norm": 21.408122372907325, "learning_rate": 5e-05, "loss": 0.0151, "num_input_tokens_seen": 54673920, "step": 793 }, { "epoch": 49.5625, "loss": 0.017070988193154335, "loss_ce": 0.0031549730338156223, "loss_xval": 0.013916015625, "num_input_tokens_seen": 54673920, "step": 793 }, { "epoch": 49.625, "grad_norm": 32.00954556671291, "learning_rate": 5e-05, "loss": 0.0274, "num_input_tokens_seen": 54733120, "step": 794 }, { "epoch": 49.625, "loss": 0.026889847591519356, "loss_ce": 0.0027199252508580685, "loss_xval": 0.024169921875, "num_input_tokens_seen": 54733120, "step": 794 }, { "epoch": 49.6875, "grad_norm": 32.08281797834498, "learning_rate": 5e-05, "loss": 0.0261, "num_input_tokens_seen": 54804800, "step": 795 }, { "epoch": 49.6875, "loss": 0.02954932302236557, "loss_ce": 0.00257178395986557, "loss_xval": 0.0269775390625, "num_input_tokens_seen": 54804800, "step": 795 }, { "epoch": 49.75, "grad_norm": 23.6610997266782, "learning_rate": 5e-05, "loss": 0.0171, "num_input_tokens_seen": 54876352, "step": 796 }, { "epoch": 49.75, "loss": 0.016563788056373596, "loss_ce": 0.002464667893946171, "loss_xval": 0.01409912109375, "num_input_tokens_seen": 54876352, "step": 796 }, { "epoch": 49.8125, "grad_norm": 10.80261321499895, "learning_rate": 5e-05, "loss": 0.0069, "num_input_tokens_seen": 54947968, "step": 797 }, { "epoch": 49.8125, "loss": 0.008322998881340027, "loss_ce": 0.0021584476344287395, "loss_xval": 0.00616455078125, "num_input_tokens_seen": 54947968, "step": 797 }, { "epoch": 49.875, "grad_norm": 1.9500645700249208, "learning_rate": 5e-05, "loss": 0.0046, "num_input_tokens_seen": 55007168, "step": 798 }, { "epoch": 49.875, "loss": 0.004437156952917576, "loss_ce": 0.0020873036701232195, "loss_xval": 0.002349853515625, "num_input_tokens_seen": 55007168, "step": 798 }, { "epoch": 49.9375, "grad_norm": 11.979543997613995, "learning_rate": 5e-05, "loss": 0.0073, "num_input_tokens_seen": 55066176, "step": 799 }, { "epoch": 49.9375, "loss": 0.00749744800850749, "loss_ce": 0.00227894214913249, "loss_xval": 0.005218505859375, "num_input_tokens_seen": 55066176, "step": 799 }, { "epoch": 50.0, "grad_norm": 19.932370797577512, "learning_rate": 5e-05, "loss": 0.0134, "num_input_tokens_seen": 55137984, "step": 800 }, { "epoch": 50.0, "loss": 0.013562907464802265, "loss_ce": 0.0020272627007216215, "loss_xval": 0.01153564453125, "num_input_tokens_seen": 55137984, "step": 800 }, { "epoch": 50.0625, "grad_norm": 26.848126234216654, "learning_rate": 5e-05, "loss": 0.021, "num_input_tokens_seen": 55197120, "step": 801 }, { "epoch": 50.0625, "loss": 0.018240634351968765, "loss_ce": 0.0020052818581461906, "loss_xval": 0.0162353515625, "num_input_tokens_seen": 55197120, "step": 801 }, { "epoch": 50.125, "grad_norm": 26.692372871133674, "learning_rate": 5e-05, "loss": 0.0185, "num_input_tokens_seen": 55268736, "step": 802 }, { "epoch": 50.125, "loss": 0.01761055923998356, "loss_ce": 0.0016193479532375932, "loss_xval": 0.0159912109375, "num_input_tokens_seen": 55268736, "step": 802 }, { "epoch": 50.1875, "grad_norm": 18.658201527903525, "learning_rate": 5e-05, "loss": 0.0113, "num_input_tokens_seen": 55340480, "step": 803 }, { "epoch": 50.1875, "loss": 0.012265665456652641, "loss_ce": 0.0016455486183986068, "loss_xval": 0.0106201171875, "num_input_tokens_seen": 55340480, "step": 803 }, { "epoch": 50.25, "grad_norm": 8.941056023600712, "learning_rate": 5e-05, "loss": 0.0051, "num_input_tokens_seen": 55412032, "step": 804 }, { "epoch": 50.25, "loss": 0.005654824897646904, "loss_ce": 0.001626504585146904, "loss_xval": 0.0040283203125, "num_input_tokens_seen": 55412032, "step": 804 }, { "epoch": 50.3125, "grad_norm": 0.7011671163304171, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 55483712, "step": 805 }, { "epoch": 50.3125, "loss": 0.0031639118678867817, "loss_ce": 0.0016151447780430317, "loss_xval": 0.00154876708984375, "num_input_tokens_seen": 55483712, "step": 805 }, { "epoch": 50.375, "grad_norm": 11.37878069005779, "learning_rate": 5e-05, "loss": 0.0061, "num_input_tokens_seen": 55555328, "step": 806 }, { "epoch": 50.375, "loss": 0.005364611279219389, "loss_ce": 0.0017330195987597108, "loss_xval": 0.003631591796875, "num_input_tokens_seen": 55555328, "step": 806 }, { "epoch": 50.4375, "grad_norm": 22.824887834021848, "learning_rate": 5e-05, "loss": 0.0151, "num_input_tokens_seen": 55626944, "step": 807 }, { "epoch": 50.4375, "loss": 0.01386871188879013, "loss_ce": 0.0016006450168788433, "loss_xval": 0.01226806640625, "num_input_tokens_seen": 55626944, "step": 807 }, { "epoch": 50.5, "grad_norm": 32.869491648733195, "learning_rate": 5e-05, "loss": 0.0267, "num_input_tokens_seen": 55698496, "step": 808 }, { "epoch": 50.5, "loss": 0.024559233337640762, "loss_ce": 0.001487944507971406, "loss_xval": 0.0230712890625, "num_input_tokens_seen": 55698496, "step": 808 }, { "epoch": 50.5625, "grad_norm": 42.62504396274984, "learning_rate": 5e-05, "loss": 0.0419, "num_input_tokens_seen": 55770112, "step": 809 }, { "epoch": 50.5625, "loss": 0.03527969866991043, "loss_ce": 0.0013441527262330055, "loss_xval": 0.033935546875, "num_input_tokens_seen": 55770112, "step": 809 }, { "epoch": 50.625, "grad_norm": 55.812846213983434, "learning_rate": 5e-05, "loss": 0.0697, "num_input_tokens_seen": 55841792, "step": 810 }, { "epoch": 50.625, "loss": 0.07067803293466568, "loss_ce": 0.0013420972973108292, "loss_xval": 0.0693359375, "num_input_tokens_seen": 55841792, "step": 810 }, { "epoch": 50.6875, "grad_norm": 73.57934830845704, "learning_rate": 5e-05, "loss": 0.1194, "num_input_tokens_seen": 55913344, "step": 811 }, { "epoch": 50.6875, "loss": 0.12286809086799622, "loss_ce": 0.0012860629940405488, "loss_xval": 0.12158203125, "num_input_tokens_seen": 55913344, "step": 811 }, { "epoch": 50.75, "grad_norm": 92.5057681700446, "learning_rate": 5e-05, "loss": 0.1853, "num_input_tokens_seen": 55985024, "step": 812 }, { "epoch": 50.75, "loss": 0.17899751663208008, "loss_ce": 0.0012631455902010202, "loss_xval": 0.177734375, "num_input_tokens_seen": 55985024, "step": 812 }, { "epoch": 50.8125, "grad_norm": 110.50358208916595, "learning_rate": 5e-05, "loss": 0.2668, "num_input_tokens_seen": 56044032, "step": 813 }, { "epoch": 50.8125, "loss": 0.27068424224853516, "loss_ce": 0.0011529907351359725, "loss_xval": 0.26953125, "num_input_tokens_seen": 56044032, "step": 813 }, { "epoch": 50.875, "grad_norm": 120.20554805484593, "learning_rate": 5e-05, "loss": 0.3158, "num_input_tokens_seen": 56115776, "step": 814 }, { "epoch": 50.875, "loss": 0.31771913170814514, "loss_ce": 0.0013128790305927396, "loss_xval": 0.31640625, "num_input_tokens_seen": 56115776, "step": 814 }, { "epoch": 50.9375, "grad_norm": 114.74657469509621, "learning_rate": 5e-05, "loss": 0.2944, "num_input_tokens_seen": 56187456, "step": 815 }, { "epoch": 50.9375, "loss": 0.2767355144023895, "loss_ce": 0.0013448747340589762, "loss_xval": 0.275390625, "num_input_tokens_seen": 56187456, "step": 815 }, { "epoch": 51.0, "grad_norm": 90.33489565175944, "learning_rate": 5e-05, "loss": 0.1812, "num_input_tokens_seen": 56259072, "step": 816 }, { "epoch": 51.0, "loss": 0.17537222802639008, "loss_ce": 0.001544097438454628, "loss_xval": 0.173828125, "num_input_tokens_seen": 56259072, "step": 816 }, { "epoch": 51.0625, "grad_norm": 44.040262786271, "learning_rate": 5e-05, "loss": 0.047, "num_input_tokens_seen": 56330688, "step": 817 }, { "epoch": 51.0625, "loss": 0.04732108861207962, "loss_ce": 0.0019109330605715513, "loss_xval": 0.04541015625, "num_input_tokens_seen": 56330688, "step": 817 }, { "epoch": 51.125, "grad_norm": 8.197144710350221, "learning_rate": 5e-05, "loss": 0.0061, "num_input_tokens_seen": 56402304, "step": 818 }, { "epoch": 51.125, "loss": 0.00620028143748641, "loss_ce": 0.00195833807811141, "loss_xval": 0.004241943359375, "num_input_tokens_seen": 56402304, "step": 818 }, { "epoch": 51.1875, "grad_norm": 51.23137999157995, "learning_rate": 5e-05, "loss": 0.068, "num_input_tokens_seen": 56473984, "step": 819 }, { "epoch": 51.1875, "loss": 0.06873885542154312, "loss_ce": 0.0018443216104060411, "loss_xval": 0.06689453125, "num_input_tokens_seen": 56473984, "step": 819 }, { "epoch": 51.25, "grad_norm": 75.80088109055204, "learning_rate": 5e-05, "loss": 0.1361, "num_input_tokens_seen": 56545600, "step": 820 }, { "epoch": 51.25, "loss": 0.13631145656108856, "loss_ce": 0.002522395458072424, "loss_xval": 0.1337890625, "num_input_tokens_seen": 56545600, "step": 820 }, { "epoch": 51.3125, "grad_norm": 73.5638812770527, "learning_rate": 5e-05, "loss": 0.1285, "num_input_tokens_seen": 56617216, "step": 821 }, { "epoch": 51.3125, "loss": 0.13125012814998627, "loss_ce": 0.002343877451494336, "loss_xval": 0.12890625, "num_input_tokens_seen": 56617216, "step": 821 }, { "epoch": 51.375, "grad_norm": 50.491024677513764, "learning_rate": 5e-05, "loss": 0.0624, "num_input_tokens_seen": 56688960, "step": 822 }, { "epoch": 51.375, "loss": 0.06392044574022293, "loss_ce": 0.0021528673823922873, "loss_xval": 0.061767578125, "num_input_tokens_seen": 56688960, "step": 822 }, { "epoch": 51.4375, "grad_norm": 13.627391662624422, "learning_rate": 5e-05, "loss": 0.008, "num_input_tokens_seen": 56760512, "step": 823 }, { "epoch": 51.4375, "loss": 0.009278560988605022, "loss_ce": 0.0019848598167300224, "loss_xval": 0.007293701171875, "num_input_tokens_seen": 56760512, "step": 823 }, { "epoch": 51.5, "grad_norm": 21.93003165557537, "learning_rate": 5e-05, "loss": 0.0162, "num_input_tokens_seen": 56832256, "step": 824 }, { "epoch": 51.5, "loss": 0.018117789179086685, "loss_ce": 0.0020045076962560415, "loss_xval": 0.01611328125, "num_input_tokens_seen": 56832256, "step": 824 }, { "epoch": 51.5625, "grad_norm": 37.6954295759489, "learning_rate": 5e-05, "loss": 0.0405, "num_input_tokens_seen": 56904000, "step": 825 }, { "epoch": 51.5625, "loss": 0.04147803410887718, "loss_ce": 0.006565925199538469, "loss_xval": 0.034912109375, "num_input_tokens_seen": 56904000, "step": 825 }, { "epoch": 51.625, "grad_norm": 37.611391142785784, "learning_rate": 5e-05, "loss": 0.0376, "num_input_tokens_seen": 56975616, "step": 826 }, { "epoch": 51.625, "loss": 0.04057503119111061, "loss_ce": 0.0020008108112961054, "loss_xval": 0.03857421875, "num_input_tokens_seen": 56975616, "step": 826 }, { "epoch": 51.6875, "grad_norm": 29.196912703371584, "learning_rate": 5e-05, "loss": 0.0231, "num_input_tokens_seen": 57047296, "step": 827 }, { "epoch": 51.6875, "loss": 0.024808669462800026, "loss_ce": 0.001981521723791957, "loss_xval": 0.0228271484375, "num_input_tokens_seen": 57047296, "step": 827 }, { "epoch": 51.75, "grad_norm": 16.54743485919119, "learning_rate": 5e-05, "loss": 0.011, "num_input_tokens_seen": 57106496, "step": 828 }, { "epoch": 51.75, "loss": 0.011654208414256573, "loss_ce": 0.0019496186869218946, "loss_xval": 0.00970458984375, "num_input_tokens_seen": 57106496, "step": 828 }, { "epoch": 51.8125, "grad_norm": 3.7849532984052625, "learning_rate": 5e-05, "loss": 0.0054, "num_input_tokens_seen": 57178176, "step": 829 }, { "epoch": 51.8125, "loss": 0.0048974500969052315, "loss_ce": 0.0018914688844233751, "loss_xval": 0.0030059814453125, "num_input_tokens_seen": 57178176, "step": 829 }, { "epoch": 51.875, "grad_norm": 6.105116572368139, "learning_rate": 5e-05, "loss": 0.0068, "num_input_tokens_seen": 57249792, "step": 830 }, { "epoch": 51.875, "loss": 0.0064654904417693615, "loss_ce": 0.0019183713011443615, "loss_xval": 0.004547119140625, "num_input_tokens_seen": 57249792, "step": 830 }, { "epoch": 51.9375, "grad_norm": 13.59154846215083, "learning_rate": 5e-05, "loss": 0.0089, "num_input_tokens_seen": 57321536, "step": 831 }, { "epoch": 51.9375, "loss": 0.008973270654678345, "loss_ce": 0.002167850499972701, "loss_xval": 0.006805419921875, "num_input_tokens_seen": 57321536, "step": 831 }, { "epoch": 52.0, "grad_norm": 22.67058972281316, "learning_rate": 5e-05, "loss": 0.0185, "num_input_tokens_seen": 57393216, "step": 832 }, { "epoch": 52.0, "loss": 0.022312132641673088, "loss_ce": 0.002536741318181157, "loss_xval": 0.019775390625, "num_input_tokens_seen": 57393216, "step": 832 }, { "epoch": 52.0625, "grad_norm": 31.26119262513664, "learning_rate": 5e-05, "loss": 0.0276, "num_input_tokens_seen": 57465024, "step": 833 }, { "epoch": 52.0625, "loss": 0.027556775137782097, "loss_ce": 0.001799939782358706, "loss_xval": 0.0257568359375, "num_input_tokens_seen": 57465024, "step": 833 }, { "epoch": 52.125, "grad_norm": 39.97021875837148, "learning_rate": 5e-05, "loss": 0.0408, "num_input_tokens_seen": 57536832, "step": 834 }, { "epoch": 52.125, "loss": 0.03928814083337784, "loss_ce": 0.0016904838848859072, "loss_xval": 0.03759765625, "num_input_tokens_seen": 57536832, "step": 834 }, { "epoch": 52.1875, "grad_norm": 54.09472417184387, "learning_rate": 5e-05, "loss": 0.072, "num_input_tokens_seen": 57608576, "step": 835 }, { "epoch": 52.1875, "loss": 0.07196825742721558, "loss_ce": 0.0016557550989091396, "loss_xval": 0.0703125, "num_input_tokens_seen": 57608576, "step": 835 }, { "epoch": 52.25, "grad_norm": 71.16024527891, "learning_rate": 5e-05, "loss": 0.1223, "num_input_tokens_seen": 57680320, "step": 836 }, { "epoch": 52.25, "loss": 0.12591731548309326, "loss_ce": 0.0014055914944037795, "loss_xval": 0.12451171875, "num_input_tokens_seen": 57680320, "step": 836 }, { "epoch": 52.3125, "grad_norm": 86.84082121350787, "learning_rate": 5e-05, "loss": 0.1791, "num_input_tokens_seen": 57751872, "step": 837 }, { "epoch": 52.3125, "loss": 0.18296390771865845, "loss_ce": 0.0013232819037511945, "loss_xval": 0.181640625, "num_input_tokens_seen": 57751872, "step": 837 }, { "epoch": 52.375, "grad_norm": 96.18561268671853, "learning_rate": 5e-05, "loss": 0.2226, "num_input_tokens_seen": 57823488, "step": 838 }, { "epoch": 52.375, "loss": 0.22117449343204498, "loss_ce": 0.001447926159016788, "loss_xval": 0.2197265625, "num_input_tokens_seen": 57823488, "step": 838 }, { "epoch": 52.4375, "grad_norm": 91.7109355876937, "learning_rate": 5e-05, "loss": 0.2035, "num_input_tokens_seen": 57895104, "step": 839 }, { "epoch": 52.4375, "loss": 0.20843414962291718, "loss_ce": 0.001402897178195417, "loss_xval": 0.20703125, "num_input_tokens_seen": 57895104, "step": 839 }, { "epoch": 52.5, "grad_norm": 62.32489949280706, "learning_rate": 5e-05, "loss": 0.0974, "num_input_tokens_seen": 57966784, "step": 840 }, { "epoch": 52.5, "loss": 0.10197938978672028, "loss_ce": 0.0013934536837041378, "loss_xval": 0.1005859375, "num_input_tokens_seen": 57966784, "step": 840 }, { "epoch": 52.5625, "grad_norm": 16.85693768146133, "learning_rate": 5e-05, "loss": 0.0162, "num_input_tokens_seen": 58038528, "step": 841 }, { "epoch": 52.5625, "loss": 0.015338940545916557, "loss_ce": 0.001422925153747201, "loss_xval": 0.013916015625, "num_input_tokens_seen": 58038528, "step": 841 }, { "epoch": 52.625, "grad_norm": 26.557199399991575, "learning_rate": 5e-05, "loss": 0.0313, "num_input_tokens_seen": 58110144, "step": 842 }, { "epoch": 52.625, "loss": 0.032941192388534546, "loss_ce": 0.001691192970611155, "loss_xval": 0.03125, "num_input_tokens_seen": 58110144, "step": 842 }, { "epoch": 52.6875, "grad_norm": 61.834426831310644, "learning_rate": 5e-05, "loss": 0.1024, "num_input_tokens_seen": 58181888, "step": 843 }, { "epoch": 52.6875, "loss": 0.11410608887672424, "loss_ce": 0.0013131167506799102, "loss_xval": 0.11279296875, "num_input_tokens_seen": 58181888, "step": 843 }, { "epoch": 52.75, "grad_norm": 85.75039908293095, "learning_rate": 5e-05, "loss": 0.1846, "num_input_tokens_seen": 58253568, "step": 844 }, { "epoch": 52.75, "loss": 0.1811586171388626, "loss_ce": 0.0014711236581206322, "loss_xval": 0.1796875, "num_input_tokens_seen": 58253568, "step": 844 }, { "epoch": 52.8125, "grad_norm": 93.53596735754489, "learning_rate": 5e-05, "loss": 0.2221, "num_input_tokens_seen": 58325184, "step": 845 }, { "epoch": 52.8125, "loss": 0.22300904989242554, "loss_ce": 0.001329358434304595, "loss_xval": 0.2216796875, "num_input_tokens_seen": 58325184, "step": 845 }, { "epoch": 52.875, "grad_norm": 77.09685538155797, "learning_rate": 5e-05, "loss": 0.1562, "num_input_tokens_seen": 58396928, "step": 846 }, { "epoch": 52.875, "loss": 0.15464122593402863, "loss_ce": 0.0013209060998633504, "loss_xval": 0.1533203125, "num_input_tokens_seen": 58396928, "step": 846 }, { "epoch": 52.9375, "grad_norm": 40.01764226139686, "learning_rate": 5e-05, "loss": 0.0554, "num_input_tokens_seen": 58468544, "step": 847 }, { "epoch": 52.9375, "loss": 0.04871372878551483, "loss_ce": 0.0013504456728696823, "loss_xval": 0.04736328125, "num_input_tokens_seen": 58468544, "step": 847 }, { "epoch": 53.0, "grad_norm": 3.160508660622948, "learning_rate": 5e-05, "loss": 0.0141, "num_input_tokens_seen": 58540224, "step": 848 }, { "epoch": 53.0, "loss": 0.013072260655462742, "loss_ce": 0.0014145454624667764, "loss_xval": 0.01165771484375, "num_input_tokens_seen": 58540224, "step": 848 }, { "epoch": 53.0625, "grad_norm": 36.930479844854915, "learning_rate": 5e-05, "loss": 0.0432, "num_input_tokens_seen": 58611840, "step": 849 }, { "epoch": 53.0625, "loss": 0.04419334605336189, "loss_ce": 0.0014687355142086744, "loss_xval": 0.042724609375, "num_input_tokens_seen": 58611840, "step": 849 }, { "epoch": 53.125, "grad_norm": 49.85794519163775, "learning_rate": 5e-05, "loss": 0.0681, "num_input_tokens_seen": 58670912, "step": 850 }, { "epoch": 53.125, "loss": 0.06584130972623825, "loss_ce": 0.001388181815855205, "loss_xval": 0.064453125, "num_input_tokens_seen": 58670912, "step": 850 }, { "epoch": 53.1875, "grad_norm": 41.56149633060318, "learning_rate": 5e-05, "loss": 0.0478, "num_input_tokens_seen": 58742592, "step": 851 }, { "epoch": 53.1875, "loss": 0.04808543249964714, "loss_ce": 0.0019428534433245659, "loss_xval": 0.046142578125, "num_input_tokens_seen": 58742592, "step": 851 }, { "epoch": 53.25, "grad_norm": 20.379106913212443, "learning_rate": 5e-05, "loss": 0.015, "num_input_tokens_seen": 58814208, "step": 852 }, { "epoch": 53.25, "loss": 0.013091476634144783, "loss_ce": 0.0019220428075641394, "loss_xval": 0.01116943359375, "num_input_tokens_seen": 58814208, "step": 852 }, { "epoch": 53.3125, "grad_norm": 4.5415851008096935, "learning_rate": 5e-05, "loss": 0.0061, "num_input_tokens_seen": 58885888, "step": 853 }, { "epoch": 53.3125, "loss": 0.0059251245111227036, "loss_ce": 0.0018968043150380254, "loss_xval": 0.0040283203125, "num_input_tokens_seen": 58885888, "step": 853 }, { "epoch": 53.375, "grad_norm": 22.50266574558752, "learning_rate": 5e-05, "loss": 0.0207, "num_input_tokens_seen": 58957504, "step": 854 }, { "epoch": 53.375, "loss": 0.022456642240285873, "loss_ce": 0.0015826192684471607, "loss_xval": 0.0208740234375, "num_input_tokens_seen": 58957504, "step": 854 }, { "epoch": 53.4375, "grad_norm": 29.03587347070425, "learning_rate": 5e-05, "loss": 0.0281, "num_input_tokens_seen": 59029248, "step": 855 }, { "epoch": 53.4375, "loss": 0.029709484428167343, "loss_ce": 0.0018774535274133086, "loss_xval": 0.02783203125, "num_input_tokens_seen": 59029248, "step": 855 }, { "epoch": 53.5, "grad_norm": 28.401153106004635, "learning_rate": 5e-05, "loss": 0.0272, "num_input_tokens_seen": 59088384, "step": 856 }, { "epoch": 53.5, "loss": 0.026484563946723938, "loss_ce": 0.0014601502334699035, "loss_xval": 0.0250244140625, "num_input_tokens_seen": 59088384, "step": 856 }, { "epoch": 53.5625, "grad_norm": 27.042709640622366, "learning_rate": 5e-05, "loss": 0.0238, "num_input_tokens_seen": 59160128, "step": 857 }, { "epoch": 53.5625, "loss": 0.021871447563171387, "loss_ce": 0.0016077766194939613, "loss_xval": 0.020263671875, "num_input_tokens_seen": 59160128, "step": 857 }, { "epoch": 53.625, "grad_norm": 22.177055726651922, "learning_rate": 5e-05, "loss": 0.0167, "num_input_tokens_seen": 59231744, "step": 858 }, { "epoch": 53.625, "loss": 0.016048060730099678, "loss_ce": 0.0016437633894383907, "loss_xval": 0.014404296875, "num_input_tokens_seen": 59231744, "step": 858 }, { "epoch": 53.6875, "grad_norm": 13.365363544559107, "learning_rate": 5e-05, "loss": 0.0081, "num_input_tokens_seen": 59303360, "step": 859 }, { "epoch": 53.6875, "loss": 0.00661351066082716, "loss_ce": 0.0013950045686215162, "loss_xval": 0.005218505859375, "num_input_tokens_seen": 59303360, "step": 859 }, { "epoch": 53.75, "grad_norm": 2.290110524547106, "learning_rate": 5e-05, "loss": 0.0028, "num_input_tokens_seen": 59375040, "step": 860 }, { "epoch": 53.75, "loss": 0.003171001560986042, "loss_ce": 0.0015306818531826138, "loss_xval": 0.00164031982421875, "num_input_tokens_seen": 59375040, "step": 860 }, { "epoch": 53.8125, "grad_norm": 11.050180430092226, "learning_rate": 5e-05, "loss": 0.0077, "num_input_tokens_seen": 59434176, "step": 861 }, { "epoch": 53.8125, "loss": 0.00867583230137825, "loss_ce": 0.0013821311295032501, "loss_xval": 0.007293701171875, "num_input_tokens_seen": 59434176, "step": 861 }, { "epoch": 53.875, "grad_norm": 23.083052407895174, "learning_rate": 5e-05, "loss": 0.0189, "num_input_tokens_seen": 59505856, "step": 862 }, { "epoch": 53.875, "loss": 0.01968074031174183, "loss_ce": 0.0013701926218345761, "loss_xval": 0.018310546875, "num_input_tokens_seen": 59505856, "step": 862 }, { "epoch": 53.9375, "grad_norm": 31.603491961859262, "learning_rate": 5e-05, "loss": 0.031, "num_input_tokens_seen": 59577536, "step": 863 }, { "epoch": 53.9375, "loss": 0.02824649214744568, "loss_ce": 0.0013910232810303569, "loss_xval": 0.02685546875, "num_input_tokens_seen": 59577536, "step": 863 }, { "epoch": 54.0, "grad_norm": 37.373668693249535, "learning_rate": 5e-05, "loss": 0.0404, "num_input_tokens_seen": 59649216, "step": 864 }, { "epoch": 54.0, "loss": 0.04348291456699371, "loss_ce": 0.0012465871404856443, "loss_xval": 0.042236328125, "num_input_tokens_seen": 59649216, "step": 864 }, { "epoch": 54.0625, "grad_norm": 42.12211569032457, "learning_rate": 5e-05, "loss": 0.0502, "num_input_tokens_seen": 59720832, "step": 865 }, { "epoch": 54.0625, "loss": 0.05193883180618286, "loss_ce": 0.0011575802927836776, "loss_xval": 0.05078125, "num_input_tokens_seen": 59720832, "step": 865 }, { "epoch": 54.125, "grad_norm": 47.381010910843735, "learning_rate": 5e-05, "loss": 0.0632, "num_input_tokens_seen": 59792512, "step": 866 }, { "epoch": 54.125, "loss": 0.06457292288541794, "loss_ce": 0.0010963573586195707, "loss_xval": 0.0634765625, "num_input_tokens_seen": 59792512, "step": 866 }, { "epoch": 54.1875, "grad_norm": 54.87808466607328, "learning_rate": 5e-05, "loss": 0.0844, "num_input_tokens_seen": 59864128, "step": 867 }, { "epoch": 54.1875, "loss": 0.08025375008583069, "loss_ce": 0.0011521849082782865, "loss_xval": 0.0791015625, "num_input_tokens_seen": 59864128, "step": 867 }, { "epoch": 54.25, "grad_norm": 61.93344752696612, "learning_rate": 5e-05, "loss": 0.1067, "num_input_tokens_seen": 59923200, "step": 868 }, { "epoch": 54.25, "loss": 0.10697054117918015, "loss_ce": 0.0010135115589946508, "loss_xval": 0.10595703125, "num_input_tokens_seen": 59923200, "step": 868 }, { "epoch": 54.3125, "grad_norm": 63.59808744718001, "learning_rate": 5e-05, "loss": 0.1134, "num_input_tokens_seen": 59982336, "step": 869 }, { "epoch": 54.3125, "loss": 0.11188820004463196, "loss_ce": 0.001048356993123889, "loss_xval": 0.11083984375, "num_input_tokens_seen": 59982336, "step": 869 }, { "epoch": 54.375, "grad_norm": 56.94998811770308, "learning_rate": 5e-05, "loss": 0.0915, "num_input_tokens_seen": 60041472, "step": 870 }, { "epoch": 54.375, "loss": 0.0917876660823822, "loss_ce": 0.0009673540480434895, "loss_xval": 0.0908203125, "num_input_tokens_seen": 60041472, "step": 870 }, { "epoch": 54.4375, "grad_norm": 41.91686845647309, "learning_rate": 5e-05, "loss": 0.0506, "num_input_tokens_seen": 60113216, "step": 871 }, { "epoch": 54.4375, "loss": 0.045974716544151306, "loss_ce": 0.0010528410784900188, "loss_xval": 0.044921875, "num_input_tokens_seen": 60113216, "step": 871 }, { "epoch": 54.5, "grad_norm": 23.635588748282537, "learning_rate": 5e-05, "loss": 0.0175, "num_input_tokens_seen": 60184832, "step": 872 }, { "epoch": 54.5, "loss": 0.016869032755494118, "loss_ce": 0.0009998929454013705, "loss_xval": 0.015869140625, "num_input_tokens_seen": 60184832, "step": 872 }, { "epoch": 54.5625, "grad_norm": 5.082441886348326, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 60256576, "step": 873 }, { "epoch": 54.5625, "loss": 0.003460080362856388, "loss_ce": 0.0010491915745660663, "loss_xval": 0.002410888671875, "num_input_tokens_seen": 60256576, "step": 873 }, { "epoch": 54.625, "grad_norm": 13.605292859198338, "learning_rate": 5e-05, "loss": 0.0076, "num_input_tokens_seen": 60328128, "step": 874 }, { "epoch": 54.625, "loss": 0.00831228494644165, "loss_ce": 0.0009270310983993113, "loss_xval": 0.00738525390625, "num_input_tokens_seen": 60328128, "step": 874 }, { "epoch": 54.6875, "grad_norm": 30.014976579341187, "learning_rate": 5e-05, "loss": 0.0269, "num_input_tokens_seen": 60387264, "step": 875 }, { "epoch": 54.6875, "loss": 0.025903020054101944, "loss_ce": 0.001000675605610013, "loss_xval": 0.02490234375, "num_input_tokens_seen": 60387264, "step": 875 }, { "epoch": 54.75, "grad_norm": 41.671989374196215, "learning_rate": 5e-05, "loss": 0.0508, "num_input_tokens_seen": 60458880, "step": 876 }, { "epoch": 54.75, "loss": 0.04798507317900658, "loss_ce": 0.001110074925236404, "loss_xval": 0.046875, "num_input_tokens_seen": 60458880, "step": 876 }, { "epoch": 54.8125, "grad_norm": 48.6149903146668, "learning_rate": 5e-05, "loss": 0.0684, "num_input_tokens_seen": 60530496, "step": 877 }, { "epoch": 54.8125, "loss": 0.07246832549571991, "loss_ce": 0.0011792667210102081, "loss_xval": 0.0712890625, "num_input_tokens_seen": 60530496, "step": 877 }, { "epoch": 54.875, "grad_norm": 52.85622195242736, "learning_rate": 5e-05, "loss": 0.0813, "num_input_tokens_seen": 60602112, "step": 878 }, { "epoch": 54.875, "loss": 0.07911968231201172, "loss_ce": 0.0009946819627657533, "loss_xval": 0.078125, "num_input_tokens_seen": 60602112, "step": 878 }, { "epoch": 54.9375, "grad_norm": 55.263215932973885, "learning_rate": 5e-05, "loss": 0.0894, "num_input_tokens_seen": 60673728, "step": 879 }, { "epoch": 54.9375, "loss": 0.08692839741706848, "loss_ce": 0.0009908954380080104, "loss_xval": 0.0859375, "num_input_tokens_seen": 60673728, "step": 879 }, { "epoch": 55.0, "grad_norm": 54.75293670657225, "learning_rate": 5e-05, "loss": 0.0876, "num_input_tokens_seen": 60745280, "step": 880 }, { "epoch": 55.0, "loss": 0.085189089179039, "loss_ce": 0.0012047119671478868, "loss_xval": 0.083984375, "num_input_tokens_seen": 60745280, "step": 880 }, { "epoch": 55.0625, "grad_norm": 49.616545082817495, "learning_rate": 5e-05, "loss": 0.0727, "num_input_tokens_seen": 60816896, "step": 881 }, { "epoch": 55.0625, "loss": 0.07568308711051941, "loss_ce": 0.0009760558605194092, "loss_xval": 0.07470703125, "num_input_tokens_seen": 60816896, "step": 881 }, { "epoch": 55.125, "grad_norm": 41.843744921751586, "learning_rate": 5e-05, "loss": 0.0519, "num_input_tokens_seen": 60876096, "step": 882 }, { "epoch": 55.125, "loss": 0.04788345843553543, "loss_ce": 0.0010084599489346147, "loss_xval": 0.046875, "num_input_tokens_seen": 60876096, "step": 882 }, { "epoch": 55.1875, "grad_norm": 31.37590948848245, "learning_rate": 5e-05, "loss": 0.0304, "num_input_tokens_seen": 60935104, "step": 883 }, { "epoch": 55.1875, "loss": 0.033360954374074936, "loss_ce": 0.0008902526460587978, "loss_xval": 0.032470703125, "num_input_tokens_seen": 60935104, "step": 883 }, { "epoch": 55.25, "grad_norm": 15.947873383411899, "learning_rate": 5e-05, "loss": 0.0102, "num_input_tokens_seen": 61006656, "step": 884 }, { "epoch": 55.25, "loss": 0.009581946767866611, "loss_ce": 0.000914954871404916, "loss_xval": 0.0086669921875, "num_input_tokens_seen": 61006656, "step": 884 }, { "epoch": 55.3125, "grad_norm": 0.661683516786272, "learning_rate": 5e-05, "loss": 0.0037, "num_input_tokens_seen": 61078272, "step": 885 }, { "epoch": 55.3125, "loss": 0.0037136925384402275, "loss_ce": 0.0009976280853152275, "loss_xval": 0.002716064453125, "num_input_tokens_seen": 61078272, "step": 885 }, { "epoch": 55.375, "grad_norm": 11.31514067465619, "learning_rate": 5e-05, "loss": 0.0066, "num_input_tokens_seen": 61137408, "step": 886 }, { "epoch": 55.375, "loss": 0.006238732486963272, "loss_ce": 0.0008371211588382721, "loss_xval": 0.005401611328125, "num_input_tokens_seen": 61137408, "step": 886 }, { "epoch": 55.4375, "grad_norm": 18.242504586556592, "learning_rate": 5e-05, "loss": 0.0123, "num_input_tokens_seen": 61209024, "step": 887 }, { "epoch": 55.4375, "loss": 0.013944677077233791, "loss_ce": 0.000822118716314435, "loss_xval": 0.01312255859375, "num_input_tokens_seen": 61209024, "step": 887 }, { "epoch": 55.5, "grad_norm": 24.271649102582902, "learning_rate": 5e-05, "loss": 0.019, "num_input_tokens_seen": 61280704, "step": 888 }, { "epoch": 55.5, "loss": 0.01951012946665287, "loss_ce": 0.0008333711884915829, "loss_xval": 0.0186767578125, "num_input_tokens_seen": 61280704, "step": 888 }, { "epoch": 55.5625, "grad_norm": 32.47503220114088, "learning_rate": 5e-05, "loss": 0.0325, "num_input_tokens_seen": 61339776, "step": 889 }, { "epoch": 55.5625, "loss": 0.03378288447856903, "loss_ce": 0.0008238990558311343, "loss_xval": 0.032958984375, "num_input_tokens_seen": 61339776, "step": 889 }, { "epoch": 55.625, "grad_norm": 44.32927293806377, "learning_rate": 5e-05, "loss": 0.058, "num_input_tokens_seen": 61399040, "step": 890 }, { "epoch": 55.625, "loss": 0.053077153861522675, "loss_ce": 0.0008310599368996918, "loss_xval": 0.05224609375, "num_input_tokens_seen": 61399040, "step": 890 }, { "epoch": 55.6875, "grad_norm": 61.68685014028904, "learning_rate": 5e-05, "loss": 0.1114, "num_input_tokens_seen": 61470656, "step": 891 }, { "epoch": 55.6875, "loss": 0.11456962674856186, "loss_ce": 0.0008000986417755485, "loss_xval": 0.11376953125, "num_input_tokens_seen": 61470656, "step": 891 }, { "epoch": 55.75, "grad_norm": 83.67046241533913, "learning_rate": 5e-05, "loss": 0.204, "num_input_tokens_seen": 61542208, "step": 892 }, { "epoch": 55.75, "loss": 0.20592321455478668, "loss_ce": 0.0008450897876173258, "loss_xval": 0.205078125, "num_input_tokens_seen": 61542208, "step": 892 }, { "epoch": 55.8125, "grad_norm": 105.657566184581, "learning_rate": 5e-05, "loss": 0.326, "num_input_tokens_seen": 61601344, "step": 893 }, { "epoch": 55.8125, "loss": 0.3270096182823181, "loss_ce": 0.0008377549238502979, "loss_xval": 0.326171875, "num_input_tokens_seen": 61601344, "step": 893 }, { "epoch": 55.875, "grad_norm": 116.78406432128128, "learning_rate": 5e-05, "loss": 0.4071, "num_input_tokens_seen": 61672896, "step": 894 }, { "epoch": 55.875, "loss": 0.4032019376754761, "loss_ce": 0.0008581792353652418, "loss_xval": 0.40234375, "num_input_tokens_seen": 61672896, "step": 894 }, { "epoch": 55.9375, "grad_norm": 101.11755702111572, "learning_rate": 5e-05, "loss": 0.3116, "num_input_tokens_seen": 61744512, "step": 895 }, { "epoch": 55.9375, "loss": 0.309630423784256, "loss_ce": 0.001036668079905212, "loss_xval": 0.30859375, "num_input_tokens_seen": 61744512, "step": 895 }, { "epoch": 56.0, "grad_norm": 52.970602494824114, "learning_rate": 5e-05, "loss": 0.089, "num_input_tokens_seen": 61803648, "step": 896 }, { "epoch": 56.0, "loss": 0.09438222646713257, "loss_ce": 0.001120504573918879, "loss_xval": 0.09326171875, "num_input_tokens_seen": 61803648, "step": 896 }, { "epoch": 56.0625, "grad_norm": 11.375753228798327, "learning_rate": 5e-05, "loss": 0.0088, "num_input_tokens_seen": 61875328, "step": 897 }, { "epoch": 56.0625, "loss": 0.007805442437529564, "loss_ce": 0.0012441633734852076, "loss_xval": 0.006561279296875, "num_input_tokens_seen": 61875328, "step": 897 }, { "epoch": 56.125, "grad_norm": 62.90691314652422, "learning_rate": 5e-05, "loss": 0.1301, "num_input_tokens_seen": 61946880, "step": 898 }, { "epoch": 56.125, "loss": 0.12718553841114044, "loss_ce": 0.001208969741128385, "loss_xval": 0.1259765625, "num_input_tokens_seen": 61946880, "step": 898 }, { "epoch": 56.1875, "grad_norm": 84.84208103644006, "learning_rate": 5e-05, "loss": 0.2331, "num_input_tokens_seen": 62006016, "step": 899 }, { "epoch": 56.1875, "loss": 0.22287721931934357, "loss_ce": 0.0011975381057709455, "loss_xval": 0.2216796875, "num_input_tokens_seen": 62006016, "step": 899 }, { "epoch": 56.25, "grad_norm": 67.01970442288534, "learning_rate": 5e-05, "loss": 0.1478, "num_input_tokens_seen": 62065216, "step": 900 }, { "epoch": 56.25, "loss": 0.14876626431941986, "loss_ce": 0.001305332756601274, "loss_xval": 0.1474609375, "num_input_tokens_seen": 62065216, "step": 900 }, { "epoch": 56.3125, "grad_norm": 20.31969946362518, "learning_rate": 5e-05, "loss": 0.0165, "num_input_tokens_seen": 62136768, "step": 901 }, { "epoch": 56.3125, "loss": 0.018054738640785217, "loss_ce": 0.0013311064103618264, "loss_xval": 0.0167236328125, "num_input_tokens_seen": 62136768, "step": 901 }, { "epoch": 56.375, "grad_norm": 26.062323010289205, "learning_rate": 5e-05, "loss": 0.0262, "num_input_tokens_seen": 62208320, "step": 902 }, { "epoch": 56.375, "loss": 0.025609955191612244, "loss_ce": 0.001195893157273531, "loss_xval": 0.0244140625, "num_input_tokens_seen": 62208320, "step": 902 }, { "epoch": 56.4375, "grad_norm": 52.54992119672312, "learning_rate": 5e-05, "loss": 0.0925, "num_input_tokens_seen": 62279872, "step": 903 }, { "epoch": 56.4375, "loss": 0.09242052584886551, "loss_ce": 0.001111934194341302, "loss_xval": 0.09130859375, "num_input_tokens_seen": 62279872, "step": 903 }, { "epoch": 56.5, "grad_norm": 51.81828437755531, "learning_rate": 5e-05, "loss": 0.0896, "num_input_tokens_seen": 62351488, "step": 904 }, { "epoch": 56.5, "loss": 0.08236326277256012, "loss_ce": 0.001308571663685143, "loss_xval": 0.0810546875, "num_input_tokens_seen": 62351488, "step": 904 }, { "epoch": 56.5625, "grad_norm": 27.88396242758177, "learning_rate": 5e-05, "loss": 0.0283, "num_input_tokens_seen": 62423168, "step": 905 }, { "epoch": 56.5625, "loss": 0.02919887565076351, "loss_ce": 0.0012447733897715807, "loss_xval": 0.0279541015625, "num_input_tokens_seen": 62423168, "step": 905 }, { "epoch": 56.625, "grad_norm": 7.343225292992126, "learning_rate": 5e-05, "loss": 0.0042, "num_input_tokens_seen": 62494912, "step": 906 }, { "epoch": 56.625, "loss": 0.004097954370081425, "loss_ce": 0.0011530080810189247, "loss_xval": 0.0029449462890625, "num_input_tokens_seen": 62494912, "step": 906 }, { "epoch": 56.6875, "grad_norm": 35.796164062283395, "learning_rate": 5e-05, "loss": 0.0438, "num_input_tokens_seen": 62566464, "step": 907 }, { "epoch": 56.6875, "loss": 0.04529685154557228, "loss_ce": 0.0011073980713263154, "loss_xval": 0.044189453125, "num_input_tokens_seen": 62566464, "step": 907 }, { "epoch": 56.75, "grad_norm": 42.181166309964844, "learning_rate": 5e-05, "loss": 0.0593, "num_input_tokens_seen": 62638080, "step": 908 }, { "epoch": 56.75, "loss": 0.05803143233060837, "loss_ce": 0.0011466656578704715, "loss_xval": 0.056884765625, "num_input_tokens_seen": 62638080, "step": 908 }, { "epoch": 56.8125, "grad_norm": 28.04768382436155, "learning_rate": 5e-05, "loss": 0.0284, "num_input_tokens_seen": 62697280, "step": 909 }, { "epoch": 56.8125, "loss": 0.028996262699365616, "loss_ce": 0.0010421618353575468, "loss_xval": 0.0279541015625, "num_input_tokens_seen": 62697280, "step": 909 }, { "epoch": 56.875, "grad_norm": 4.604248343503638, "learning_rate": 5e-05, "loss": 0.0035, "num_input_tokens_seen": 62768896, "step": 910 }, { "epoch": 56.875, "loss": 0.0034466590732336044, "loss_ce": 0.0011120643466711044, "loss_xval": 0.0023345947265625, "num_input_tokens_seen": 62768896, "step": 910 }, { "epoch": 56.9375, "grad_norm": 16.717241202153325, "learning_rate": 5e-05, "loss": 0.0116, "num_input_tokens_seen": 62840512, "step": 911 }, { "epoch": 56.9375, "loss": 0.013039151206612587, "loss_ce": 0.0010762609308585525, "loss_xval": 0.011962890625, "num_input_tokens_seen": 62840512, "step": 911 }, { "epoch": 57.0, "grad_norm": 26.396870079955534, "learning_rate": 5e-05, "loss": 0.0255, "num_input_tokens_seen": 62912128, "step": 912 }, { "epoch": 57.0, "loss": 0.026932695880532265, "loss_ce": 0.001053789397701621, "loss_xval": 0.02587890625, "num_input_tokens_seen": 62912128, "step": 912 }, { "epoch": 57.0625, "grad_norm": 22.369721128961444, "learning_rate": 5e-05, "loss": 0.0187, "num_input_tokens_seen": 62983808, "step": 913 }, { "epoch": 57.0625, "loss": 0.017884692177176476, "loss_ce": 0.0010389896342530847, "loss_xval": 0.016845703125, "num_input_tokens_seen": 62983808, "step": 913 }, { "epoch": 57.125, "grad_norm": 8.173019447239868, "learning_rate": 5e-05, "loss": 0.0044, "num_input_tokens_seen": 63055552, "step": 914 }, { "epoch": 57.125, "loss": 0.004705994389951229, "loss_ce": 0.001059143920429051, "loss_xval": 0.0036468505859375, "num_input_tokens_seen": 63055552, "step": 914 }, { "epoch": 57.1875, "grad_norm": 10.20139356248153, "learning_rate": 5e-05, "loss": 0.0059, "num_input_tokens_seen": 63127296, "step": 915 }, { "epoch": 57.1875, "loss": 0.005690255202353001, "loss_ce": 0.0009905482875183225, "loss_xval": 0.00469970703125, "num_input_tokens_seen": 63127296, "step": 915 }, { "epoch": 57.25, "grad_norm": 23.837598745509396, "learning_rate": 5e-05, "loss": 0.0218, "num_input_tokens_seen": 63199040, "step": 916 }, { "epoch": 57.25, "loss": 0.022357981652021408, "loss_ce": 0.001117747975513339, "loss_xval": 0.021240234375, "num_input_tokens_seen": 63199040, "step": 916 }, { "epoch": 57.3125, "grad_norm": 24.947858111336256, "learning_rate": 5e-05, "loss": 0.0237, "num_input_tokens_seen": 63270720, "step": 917 }, { "epoch": 57.3125, "loss": 0.023415779694914818, "loss_ce": 0.0009548426023684442, "loss_xval": 0.0224609375, "num_input_tokens_seen": 63270720, "step": 917 }, { "epoch": 57.375, "grad_norm": 16.266573389443806, "learning_rate": 5e-05, "loss": 0.0115, "num_input_tokens_seen": 63342400, "step": 918 }, { "epoch": 57.375, "loss": 0.010243535041809082, "loss_ce": 0.0009051557281054556, "loss_xval": 0.00933837890625, "num_input_tokens_seen": 63342400, "step": 918 }, { "epoch": 57.4375, "grad_norm": 4.543084565084577, "learning_rate": 5e-05, "loss": 0.0025, "num_input_tokens_seen": 63401600, "step": 919 }, { "epoch": 57.4375, "loss": 0.002306486712768674, "loss_ce": 0.0009179369662888348, "loss_xval": 0.0013885498046875, "num_input_tokens_seen": 63401600, "step": 919 }, { "epoch": 57.5, "grad_norm": 6.932013648934979, "learning_rate": 5e-05, "loss": 0.0034, "num_input_tokens_seen": 63473344, "step": 920 }, { "epoch": 57.5, "loss": 0.0029933510813862085, "loss_ce": 0.0008266030927188694, "loss_xval": 0.002166748046875, "num_input_tokens_seen": 63473344, "step": 920 }, { "epoch": 57.5625, "grad_norm": 15.423595588632633, "learning_rate": 5e-05, "loss": 0.0101, "num_input_tokens_seen": 63544960, "step": 921 }, { "epoch": 57.5625, "loss": 0.008886837400496006, "loss_ce": 0.0010133021278306842, "loss_xval": 0.00787353515625, "num_input_tokens_seen": 63544960, "step": 921 }, { "epoch": 57.625, "grad_norm": 17.914449695553976, "learning_rate": 5e-05, "loss": 0.0126, "num_input_tokens_seen": 63616640, "step": 922 }, { "epoch": 57.625, "loss": 0.013764435425400734, "loss_ce": 0.0008249821839854121, "loss_xval": 0.012939453125, "num_input_tokens_seen": 63616640, "step": 922 }, { "epoch": 57.6875, "grad_norm": 14.011915071038517, "learning_rate": 5e-05, "loss": 0.0087, "num_input_tokens_seen": 63688192, "step": 923 }, { "epoch": 57.6875, "loss": 0.0082526421174407, "loss_ce": 0.0008063528803177178, "loss_xval": 0.0074462890625, "num_input_tokens_seen": 63688192, "step": 923 }, { "epoch": 57.75, "grad_norm": 5.3692390458080865, "learning_rate": 5e-05, "loss": 0.0028, "num_input_tokens_seen": 63759744, "step": 924 }, { "epoch": 57.75, "loss": 0.002704136073589325, "loss_ce": 0.0007891579298302531, "loss_xval": 0.00191497802734375, "num_input_tokens_seen": 63759744, "step": 924 }, { "epoch": 57.8125, "grad_norm": 5.053317204803564, "learning_rate": 5e-05, "loss": 0.0028, "num_input_tokens_seen": 63818752, "step": 925 }, { "epoch": 57.8125, "loss": 0.002319210208952427, "loss_ce": 0.0006865198374725878, "loss_xval": 0.0016326904296875, "num_input_tokens_seen": 63818752, "step": 925 }, { "epoch": 57.875, "grad_norm": 13.882537585237738, "learning_rate": 5e-05, "loss": 0.0084, "num_input_tokens_seen": 63890560, "step": 926 }, { "epoch": 57.875, "loss": 0.0076802195981144905, "loss_ce": 0.0008137643453665078, "loss_xval": 0.006866455078125, "num_input_tokens_seen": 63890560, "step": 926 }, { "epoch": 57.9375, "grad_norm": 20.089664118763753, "learning_rate": 5e-05, "loss": 0.0156, "num_input_tokens_seen": 63962240, "step": 927 }, { "epoch": 57.9375, "loss": 0.014369064942002296, "loss_ce": 0.0006971897091716528, "loss_xval": 0.013671875, "num_input_tokens_seen": 63962240, "step": 927 }, { "epoch": 58.0, "grad_norm": 23.570347705929276, "learning_rate": 5e-05, "loss": 0.021, "num_input_tokens_seen": 64033920, "step": 928 }, { "epoch": 58.0, "loss": 0.019682860001921654, "loss_ce": 0.0007619610987603664, "loss_xval": 0.0189208984375, "num_input_tokens_seen": 64033920, "step": 928 }, { "epoch": 58.0625, "grad_norm": 24.207978511849383, "learning_rate": 5e-05, "loss": 0.0219, "num_input_tokens_seen": 64105664, "step": 929 }, { "epoch": 58.0625, "loss": 0.02138231135904789, "loss_ce": 0.0006303590489551425, "loss_xval": 0.020751953125, "num_input_tokens_seen": 64105664, "step": 929 }, { "epoch": 58.125, "grad_norm": 23.42852044875404, "learning_rate": 5e-05, "loss": 0.0207, "num_input_tokens_seen": 64177216, "step": 930 }, { "epoch": 58.125, "loss": 0.02136334218084812, "loss_ce": 0.0006113895797170699, "loss_xval": 0.020751953125, "num_input_tokens_seen": 64177216, "step": 930 }, { "epoch": 58.1875, "grad_norm": 22.312109718917235, "learning_rate": 5e-05, "loss": 0.0189, "num_input_tokens_seen": 64236288, "step": 931 }, { "epoch": 58.1875, "loss": 0.019073408097028732, "loss_ce": 0.0006407916662283242, "loss_xval": 0.0184326171875, "num_input_tokens_seen": 64236288, "step": 931 }, { "epoch": 58.25, "grad_norm": 21.508149968679223, "learning_rate": 5e-05, "loss": 0.0176, "num_input_tokens_seen": 64307840, "step": 932 }, { "epoch": 58.25, "loss": 0.01990123651921749, "loss_ce": 0.0006141266785562038, "loss_xval": 0.019287109375, "num_input_tokens_seen": 64307840, "step": 932 }, { "epoch": 58.3125, "grad_norm": 22.156431498551758, "learning_rate": 5e-05, "loss": 0.0185, "num_input_tokens_seen": 64379392, "step": 933 }, { "epoch": 58.3125, "loss": 0.018906189128756523, "loss_ce": 0.0005956426030024886, "loss_xval": 0.018310546875, "num_input_tokens_seen": 64379392, "step": 933 }, { "epoch": 58.375, "grad_norm": 23.568803424797704, "learning_rate": 5e-05, "loss": 0.0206, "num_input_tokens_seen": 64450944, "step": 934 }, { "epoch": 58.375, "loss": 0.020614925771951675, "loss_ce": 0.0005953953368589282, "loss_xval": 0.02001953125, "num_input_tokens_seen": 64450944, "step": 934 }, { "epoch": 58.4375, "grad_norm": 24.20906636976166, "learning_rate": 5e-05, "loss": 0.0217, "num_input_tokens_seen": 64522624, "step": 935 }, { "epoch": 58.4375, "loss": 0.02329493872821331, "loss_ce": 0.000589860079344362, "loss_xval": 0.022705078125, "num_input_tokens_seen": 64522624, "step": 935 }, { "epoch": 58.5, "grad_norm": 25.28492068445306, "learning_rate": 5e-05, "loss": 0.024, "num_input_tokens_seen": 64594304, "step": 936 }, { "epoch": 58.5, "loss": 0.02387586608529091, "loss_ce": 0.0005604361067526042, "loss_xval": 0.0233154296875, "num_input_tokens_seen": 64594304, "step": 936 }, { "epoch": 58.5625, "grad_norm": 29.171874285488098, "learning_rate": 5e-05, "loss": 0.0312, "num_input_tokens_seen": 64665984, "step": 937 }, { "epoch": 58.5625, "loss": 0.03120674006640911, "loss_ce": 0.0005670919199474156, "loss_xval": 0.0306396484375, "num_input_tokens_seen": 64665984, "step": 937 }, { "epoch": 58.625, "grad_norm": 36.220269709923116, "learning_rate": 5e-05, "loss": 0.0474, "num_input_tokens_seen": 64737600, "step": 938 }, { "epoch": 58.625, "loss": 0.04941752925515175, "loss_ce": 0.0005894028581678867, "loss_xval": 0.048828125, "num_input_tokens_seen": 64737600, "step": 938 }, { "epoch": 58.6875, "grad_norm": 44.53325592874753, "learning_rate": 5e-05, "loss": 0.0716, "num_input_tokens_seen": 64809152, "step": 939 }, { "epoch": 58.6875, "loss": 0.0684363916516304, "loss_ce": 0.0005652988911606371, "loss_xval": 0.06787109375, "num_input_tokens_seen": 64809152, "step": 939 }, { "epoch": 58.75, "grad_norm": 53.332865645494145, "learning_rate": 5e-05, "loss": 0.1024, "num_input_tokens_seen": 64880704, "step": 940 }, { "epoch": 58.75, "loss": 0.10312040895223618, "loss_ce": 0.0005813451134599745, "loss_xval": 0.1025390625, "num_input_tokens_seen": 64880704, "step": 940 }, { "epoch": 58.8125, "grad_norm": 61.78611720533413, "learning_rate": 5e-05, "loss": 0.1383, "num_input_tokens_seen": 64952448, "step": 941 }, { "epoch": 58.8125, "loss": 0.14115650951862335, "loss_ce": 0.0005315167945809662, "loss_xval": 0.140625, "num_input_tokens_seen": 64952448, "step": 941 }, { "epoch": 58.875, "grad_norm": 67.57119829473461, "learning_rate": 5e-05, "loss": 0.1657, "num_input_tokens_seen": 65024000, "step": 942 }, { "epoch": 58.875, "loss": 0.16276301443576813, "loss_ce": 0.0006536326254718006, "loss_xval": 0.162109375, "num_input_tokens_seen": 65024000, "step": 942 }, { "epoch": 58.9375, "grad_norm": 67.3748450718513, "learning_rate": 5e-05, "loss": 0.1666, "num_input_tokens_seen": 65095552, "step": 943 }, { "epoch": 58.9375, "loss": 0.17051354050636292, "loss_ce": 0.0005916593945585191, "loss_xval": 0.169921875, "num_input_tokens_seen": 65095552, "step": 943 }, { "epoch": 59.0, "grad_norm": 58.048157118684784, "learning_rate": 5e-05, "loss": 0.1269, "num_input_tokens_seen": 65167104, "step": 944 }, { "epoch": 59.0, "loss": 0.1286008656024933, "loss_ce": 0.0006711802561767399, "loss_xval": 0.1279296875, "num_input_tokens_seen": 65167104, "step": 944 }, { "epoch": 59.0625, "grad_norm": 38.19997686993994, "learning_rate": 5e-05, "loss": 0.0568, "num_input_tokens_seen": 65238720, "step": 945 }, { "epoch": 59.0625, "loss": 0.05981079116463661, "loss_ce": 0.0007287596818059683, "loss_xval": 0.05908203125, "num_input_tokens_seen": 65238720, "step": 945 }, { "epoch": 59.125, "grad_norm": 9.525180721691116, "learning_rate": 5e-05, "loss": 0.0057, "num_input_tokens_seen": 65310336, "step": 946 }, { "epoch": 59.125, "loss": 0.006437521893531084, "loss_ce": 0.0007612523622810841, "loss_xval": 0.00567626953125, "num_input_tokens_seen": 65310336, "step": 946 }, { "epoch": 59.1875, "grad_norm": 20.86777318321889, "learning_rate": 5e-05, "loss": 0.0183, "num_input_tokens_seen": 65382016, "step": 947 }, { "epoch": 59.1875, "loss": 0.01804313436150551, "loss_ce": 0.0008312196005135775, "loss_xval": 0.0172119140625, "num_input_tokens_seen": 65382016, "step": 947 }, { "epoch": 59.25, "grad_norm": 44.605782594914494, "learning_rate": 5e-05, "loss": 0.0775, "num_input_tokens_seen": 65441152, "step": 948 }, { "epoch": 59.25, "loss": 0.07944431900978088, "loss_ce": 0.0008310358389280736, "loss_xval": 0.07861328125, "num_input_tokens_seen": 65441152, "step": 948 }, { "epoch": 59.3125, "grad_norm": 57.15983641908541, "learning_rate": 5e-05, "loss": 0.1251, "num_input_tokens_seen": 65512832, "step": 949 }, { "epoch": 59.3125, "loss": 0.12679226696491241, "loss_ce": 0.0008157067350111902, "loss_xval": 0.1259765625, "num_input_tokens_seen": 65512832, "step": 949 }, { "epoch": 59.375, "grad_norm": 52.388511589162, "learning_rate": 5e-05, "loss": 0.1072, "num_input_tokens_seen": 65584384, "step": 950 }, { "epoch": 59.375, "loss": 0.10724907368421555, "loss_ce": 0.0008037587977014482, "loss_xval": 0.1064453125, "num_input_tokens_seen": 65584384, "step": 950 }, { "epoch": 59.4375, "grad_norm": 29.62629590918632, "learning_rate": 5e-05, "loss": 0.036, "num_input_tokens_seen": 65655936, "step": 951 }, { "epoch": 59.4375, "loss": 0.03387866169214249, "loss_ce": 0.0009196769678965211, "loss_xval": 0.032958984375, "num_input_tokens_seen": 65655936, "step": 951 }, { "epoch": 59.5, "grad_norm": 3.4575314631603558, "learning_rate": 5e-05, "loss": 0.0026, "num_input_tokens_seen": 65727552, "step": 952 }, { "epoch": 59.5, "loss": 0.002521348185837269, "loss_ce": 0.0008428815053775907, "loss_xval": 0.001678466796875, "num_input_tokens_seen": 65727552, "step": 952 }, { "epoch": 59.5625, "grad_norm": 34.69990940993319, "learning_rate": 5e-05, "loss": 0.0484, "num_input_tokens_seen": 65799168, "step": 953 }, { "epoch": 59.5625, "loss": 0.05138184130191803, "loss_ce": 0.0008447307045571506, "loss_xval": 0.050537109375, "num_input_tokens_seen": 65799168, "step": 953 }, { "epoch": 59.625, "grad_norm": 54.85586285901504, "learning_rate": 5e-05, "loss": 0.1207, "num_input_tokens_seen": 65870848, "step": 954 }, { "epoch": 59.625, "loss": 0.12335727363824844, "loss_ce": 0.000798677618149668, "loss_xval": 0.12255859375, "num_input_tokens_seen": 65870848, "step": 954 }, { "epoch": 59.6875, "grad_norm": 58.16518006510348, "learning_rate": 5e-05, "loss": 0.1336, "num_input_tokens_seen": 65942592, "step": 955 }, { "epoch": 59.6875, "loss": 0.1276523470878601, "loss_ce": 0.0006992240669205785, "loss_xval": 0.126953125, "num_input_tokens_seen": 65942592, "step": 955 }, { "epoch": 59.75, "grad_norm": 41.478621513823285, "learning_rate": 5e-05, "loss": 0.0689, "num_input_tokens_seen": 66001664, "step": 956 }, { "epoch": 59.75, "loss": 0.06770110130310059, "loss_ce": 0.0008065711590461433, "loss_xval": 0.06689453125, "num_input_tokens_seen": 66001664, "step": 956 }, { "epoch": 59.8125, "grad_norm": 12.798023338883505, "learning_rate": 5e-05, "loss": 0.0084, "num_input_tokens_seen": 66073216, "step": 957 }, { "epoch": 59.8125, "loss": 0.010349994525313377, "loss_ce": 0.0008895453647710383, "loss_xval": 0.00946044921875, "num_input_tokens_seen": 66073216, "step": 957 }, { "epoch": 59.875, "grad_norm": 14.997701812579756, "learning_rate": 5e-05, "loss": 0.0113, "num_input_tokens_seen": 66145024, "step": 958 }, { "epoch": 59.875, "loss": 0.011212315410375595, "loss_ce": 0.0007753035170026124, "loss_xval": 0.01043701171875, "num_input_tokens_seen": 66145024, "step": 958 }, { "epoch": 59.9375, "grad_norm": 31.64754420739134, "learning_rate": 5e-05, "loss": 0.0409, "num_input_tokens_seen": 66204032, "step": 959 }, { "epoch": 59.9375, "loss": 0.046755120158195496, "loss_ce": 0.0008566842298023403, "loss_xval": 0.0458984375, "num_input_tokens_seen": 66204032, "step": 959 }, { "epoch": 60.0, "grad_norm": 33.04461809351946, "learning_rate": 5e-05, "loss": 0.0446, "num_input_tokens_seen": 66275776, "step": 960 }, { "epoch": 60.0, "loss": 0.041984446346759796, "loss_ce": 0.0007246798486448824, "loss_xval": 0.041259765625, "num_input_tokens_seen": 66275776, "step": 960 }, { "epoch": 60.0625, "grad_norm": 23.900029676188776, "learning_rate": 5e-05, "loss": 0.0245, "num_input_tokens_seen": 66347456, "step": 961 }, { "epoch": 60.0625, "loss": 0.026303108781576157, "loss_ce": 0.0007904136436991394, "loss_xval": 0.0255126953125, "num_input_tokens_seen": 66347456, "step": 961 }, { "epoch": 60.125, "grad_norm": 10.796280459241752, "learning_rate": 5e-05, "loss": 0.0065, "num_input_tokens_seen": 66419008, "step": 962 }, { "epoch": 60.125, "loss": 0.007391719613224268, "loss_ce": 0.0007388876401819289, "loss_xval": 0.00665283203125, "num_input_tokens_seen": 66419008, "step": 962 }, { "epoch": 60.1875, "grad_norm": 3.5390655467739354, "learning_rate": 5e-05, "loss": 0.0038, "num_input_tokens_seen": 66490560, "step": 963 }, { "epoch": 60.1875, "loss": 0.003932563588023186, "loss_ce": 0.0006824417505413294, "loss_xval": 0.0032501220703125, "num_input_tokens_seen": 66490560, "step": 963 }, { "epoch": 60.25, "grad_norm": 14.230928746177904, "learning_rate": 5e-05, "loss": 0.0109, "num_input_tokens_seen": 66562304, "step": 964 }, { "epoch": 60.25, "loss": 0.00826042890548706, "loss_ce": 0.0007225867011584342, "loss_xval": 0.007537841796875, "num_input_tokens_seen": 66562304, "step": 964 }, { "epoch": 60.3125, "grad_norm": 15.974909633110373, "learning_rate": 5e-05, "loss": 0.0126, "num_input_tokens_seen": 66634112, "step": 965 }, { "epoch": 60.3125, "loss": 0.016383003443479538, "loss_ce": 0.0006359326071105897, "loss_xval": 0.0157470703125, "num_input_tokens_seen": 66634112, "step": 965 }, { "epoch": 60.375, "grad_norm": 10.808098917481177, "learning_rate": 5e-05, "loss": 0.0061, "num_input_tokens_seen": 66705728, "step": 966 }, { "epoch": 60.375, "loss": 0.0064287637360394, "loss_ce": 0.0005999064305797219, "loss_xval": 0.005828857421875, "num_input_tokens_seen": 66705728, "step": 966 }, { "epoch": 60.4375, "grad_norm": 5.528738803033689, "learning_rate": 5e-05, "loss": 0.0042, "num_input_tokens_seen": 66764800, "step": 967 }, { "epoch": 60.4375, "loss": 0.0019519556080922484, "loss_ce": 0.0006091821705922484, "loss_xval": 0.0013427734375, "num_input_tokens_seen": 66764800, "step": 967 }, { "epoch": 60.5, "grad_norm": 0.7874106581706314, "learning_rate": 5e-05, "loss": 0.0021, "num_input_tokens_seen": 66836352, "step": 968 }, { "epoch": 60.5, "loss": 0.002332956064492464, "loss_ce": 0.000601083564106375, "loss_xval": 0.00173187255859375, "num_input_tokens_seen": 66836352, "step": 968 }, { "epoch": 60.5625, "grad_norm": 3.489877042850535, "learning_rate": 5e-05, "loss": 0.0031, "num_input_tokens_seen": 66907968, "step": 969 }, { "epoch": 60.5625, "loss": 0.0026276027783751488, "loss_ce": 0.0005981839494779706, "loss_xval": 0.0020294189453125, "num_input_tokens_seen": 66907968, "step": 969 }, { "epoch": 60.625, "grad_norm": 3.717508401531591, "learning_rate": 5e-05, "loss": 0.0021, "num_input_tokens_seen": 66979584, "step": 970 }, { "epoch": 60.625, "loss": 0.0015087537467479706, "loss_ce": 0.0005398206412792206, "loss_xval": 0.00096893310546875, "num_input_tokens_seen": 66979584, "step": 970 }, { "epoch": 60.6875, "grad_norm": 0.5407000248907325, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 67051200, "step": 971 }, { "epoch": 60.6875, "loss": 0.0016631756443530321, "loss_ce": 0.000610319257248193, "loss_xval": 0.0010528564453125, "num_input_tokens_seen": 67051200, "step": 971 }, { "epoch": 60.75, "grad_norm": 1.6571346214836031, "learning_rate": 5e-05, "loss": 0.0021, "num_input_tokens_seen": 67122816, "step": 972 }, { "epoch": 60.75, "loss": 0.001810828223824501, "loss_ce": 0.000544348731637001, "loss_xval": 0.0012664794921875, "num_input_tokens_seen": 67122816, "step": 972 }, { "epoch": 60.8125, "grad_norm": 1.5450918420386779, "learning_rate": 5e-05, "loss": 0.0021, "num_input_tokens_seen": 67194368, "step": 973 }, { "epoch": 60.8125, "loss": 0.0022611215244978666, "loss_ce": 0.0005368783604353666, "loss_xval": 0.0017242431640625, "num_input_tokens_seen": 67194368, "step": 973 }, { "epoch": 60.875, "grad_norm": 0.7884652256068236, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 67265920, "step": 974 }, { "epoch": 60.875, "loss": 0.0010354053229093552, "loss_ce": 0.0005013477057218552, "loss_xval": 0.0005340576171875, "num_input_tokens_seen": 67265920, "step": 974 }, { "epoch": 60.9375, "grad_norm": 0.5366298158606575, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 67337600, "step": 975 }, { "epoch": 60.9375, "loss": 0.0011774263111874461, "loss_ce": 0.0005022248951718211, "loss_xval": 0.000675201416015625, "num_input_tokens_seen": 67337600, "step": 975 }, { "epoch": 61.0, "grad_norm": 2.310955169102163, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 67409280, "step": 976 }, { "epoch": 61.0, "loss": 0.001500699669122696, "loss_ce": 0.00048599025467410684, "loss_xval": 0.00101470947265625, "num_input_tokens_seen": 67409280, "step": 976 }, { "epoch": 61.0625, "grad_norm": 3.349124799438185, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 67480896, "step": 977 }, { "epoch": 61.0625, "loss": 0.0016312211519107223, "loss_ce": 0.00044103560503572226, "loss_xval": 0.001190185546875, "num_input_tokens_seen": 67480896, "step": 977 }, { "epoch": 61.125, "grad_norm": 3.8676898786882665, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 67552512, "step": 978 }, { "epoch": 61.125, "loss": 0.0016393068945035338, "loss_ce": 0.0004414919239934534, "loss_xval": 0.00119781494140625, "num_input_tokens_seen": 67552512, "step": 978 }, { "epoch": 61.1875, "grad_norm": 4.27868966415631, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 67624128, "step": 979 }, { "epoch": 61.1875, "loss": 0.0018708731513470411, "loss_ce": 0.00045180582674220204, "loss_xval": 0.0014190673828125, "num_input_tokens_seen": 67624128, "step": 979 }, { "epoch": 61.25, "grad_norm": 5.103740631510975, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 67695808, "step": 980 }, { "epoch": 61.25, "loss": 0.001763427397236228, "loss_ce": 0.0003901363234035671, "loss_xval": 0.001373291015625, "num_input_tokens_seen": 67695808, "step": 980 }, { "epoch": 61.3125, "grad_norm": 7.877201524071439, "learning_rate": 5e-05, "loss": 0.0036, "num_input_tokens_seen": 67754944, "step": 981 }, { "epoch": 61.3125, "loss": 0.003038346767425537, "loss_ce": 0.00042909375042654574, "loss_xval": 0.0026092529296875, "num_input_tokens_seen": 67754944, "step": 981 }, { "epoch": 61.375, "grad_norm": 13.964321159085115, "learning_rate": 5e-05, "loss": 0.0089, "num_input_tokens_seen": 67826624, "step": 982 }, { "epoch": 61.375, "loss": 0.009481735527515411, "loss_ce": 0.00038749678060412407, "loss_xval": 0.00909423828125, "num_input_tokens_seen": 67826624, "step": 982 }, { "epoch": 61.4375, "grad_norm": 25.125654309287086, "learning_rate": 5e-05, "loss": 0.0268, "num_input_tokens_seen": 67873344, "step": 983 }, { "epoch": 61.4375, "loss": 0.02721680887043476, "loss_ce": 0.000361339480150491, "loss_xval": 0.02685546875, "num_input_tokens_seen": 67873344, "step": 983 }, { "epoch": 61.5, "grad_norm": 45.26377203729771, "learning_rate": 5e-05, "loss": 0.087, "num_input_tokens_seen": 67944960, "step": 984 }, { "epoch": 61.5, "loss": 0.08778023719787598, "loss_ce": 0.0003778930695261806, "loss_xval": 0.08740234375, "num_input_tokens_seen": 67944960, "step": 984 }, { "epoch": 61.5625, "grad_norm": 79.46728193534139, "learning_rate": 5e-05, "loss": 0.266, "num_input_tokens_seen": 68016576, "step": 985 }, { "epoch": 61.5625, "loss": 0.2660321593284607, "loss_ce": 0.0004071468429174274, "loss_xval": 0.265625, "num_input_tokens_seen": 68016576, "step": 985 }, { "epoch": 61.625, "grad_norm": 121.02400174402173, "learning_rate": 5e-05, "loss": 0.6274, "num_input_tokens_seen": 68088256, "step": 986 }, { "epoch": 61.625, "loss": 0.6254241466522217, "loss_ce": 0.0004241722053848207, "loss_xval": 0.625, "num_input_tokens_seen": 68088256, "step": 986 }, { "epoch": 61.6875, "grad_norm": 137.4454848089739, "learning_rate": 5e-05, "loss": 0.8307, "num_input_tokens_seen": 68159808, "step": 987 }, { "epoch": 61.6875, "loss": 0.8248781561851501, "loss_ce": 0.0006593933212570846, "loss_xval": 0.82421875, "num_input_tokens_seen": 68159808, "step": 987 }, { "epoch": 61.75, "grad_norm": 87.03113064555924, "learning_rate": 5e-05, "loss": 0.3425, "num_input_tokens_seen": 68231360, "step": 988 }, { "epoch": 61.75, "loss": 0.3444867432117462, "loss_ce": 0.0007367533398792148, "loss_xval": 0.34375, "num_input_tokens_seen": 68231360, "step": 988 }, { "epoch": 61.8125, "grad_norm": 10.68486256134069, "learning_rate": 5e-05, "loss": 0.0083, "num_input_tokens_seen": 68303040, "step": 989 }, { "epoch": 61.8125, "loss": 0.008067439310252666, "loss_ce": 0.0006516677094623446, "loss_xval": 0.007415771484375, "num_input_tokens_seen": 68303040, "step": 989 }, { "epoch": 61.875, "grad_norm": 71.27713716064294, "learning_rate": 5e-05, "loss": 0.2234, "num_input_tokens_seen": 68362112, "step": 990 }, { "epoch": 61.875, "loss": 0.2184855192899704, "loss_ce": 0.0007120799855329096, "loss_xval": 0.2177734375, "num_input_tokens_seen": 68362112, "step": 990 }, { "epoch": 61.9375, "grad_norm": 150.67348138910384, "learning_rate": 5e-05, "loss": 0.9576, "num_input_tokens_seen": 68433728, "step": 991 }, { "epoch": 61.9375, "loss": 0.9381079077720642, "loss_ce": 0.0006078826263546944, "loss_xval": 0.9375, "num_input_tokens_seen": 68433728, "step": 991 }, { "epoch": 62.0, "grad_norm": 160.0697716894221, "learning_rate": 5e-05, "loss": 0.9985, "num_input_tokens_seen": 68505280, "step": 992 }, { "epoch": 62.0, "loss": 1.0446293354034424, "loss_ce": 0.005566796753555536, "loss_xval": 1.0390625, "num_input_tokens_seen": 68505280, "step": 992 }, { "epoch": 62.0625, "grad_norm": 54.725170062410925, "learning_rate": 5e-05, "loss": 0.1661, "num_input_tokens_seen": 68576896, "step": 993 }, { "epoch": 62.0625, "loss": 0.14711056649684906, "loss_ce": 0.001602747361175716, "loss_xval": 0.1455078125, "num_input_tokens_seen": 68576896, "step": 993 }, { "epoch": 62.125, "grad_norm": 57.095522982761416, "learning_rate": 5e-05, "loss": 0.2123, "num_input_tokens_seen": 68648448, "step": 994 }, { "epoch": 62.125, "loss": 0.19358542561531067, "loss_ce": 0.009991676546633244, "loss_xval": 0.18359375, "num_input_tokens_seen": 68648448, "step": 994 }, { "epoch": 62.1875, "grad_norm": 129.51112432923358, "learning_rate": 5e-05, "loss": 0.9835, "num_input_tokens_seen": 68720128, "step": 995 }, { "epoch": 62.1875, "loss": 1.0289194583892822, "loss_ce": 0.028919465839862823, "loss_xval": 1.0, "num_input_tokens_seen": 68720128, "step": 995 }, { "epoch": 62.25, "grad_norm": 236.6345678485908, "learning_rate": 5e-05, "loss": 3.1, "num_input_tokens_seen": 68791744, "step": 996 }, { "epoch": 62.25, "loss": 3.211564302444458, "loss_ce": 0.03968937322497368, "loss_xval": 3.171875, "num_input_tokens_seen": 68791744, "step": 996 }, { "epoch": 62.3125, "grad_norm": 225.87529604424057, "learning_rate": 5e-05, "loss": 2.516, "num_input_tokens_seen": 68851008, "step": 997 }, { "epoch": 62.3125, "loss": 2.585651397705078, "loss_ce": 0.00752646429464221, "loss_xval": 2.578125, "num_input_tokens_seen": 68851008, "step": 997 }, { "epoch": 62.375, "grad_norm": 22.233360328809926, "learning_rate": 5e-05, "loss": 0.1002, "num_input_tokens_seen": 68922624, "step": 998 }, { "epoch": 62.375, "loss": 0.08905638754367828, "loss_ce": 0.06305541098117828, "loss_xval": 0.0260009765625, "num_input_tokens_seen": 68922624, "step": 998 }, { "epoch": 62.4375, "grad_norm": 330.7636326856827, "learning_rate": 5e-05, "loss": 3.8083, "num_input_tokens_seen": 68994176, "step": 999 }, { "epoch": 62.4375, "loss": 3.894900321960449, "loss_ce": 0.03552544116973877, "loss_xval": 3.859375, "num_input_tokens_seen": 68994176, "step": 999 }, { "epoch": 62.5, "grad_norm": 180.3650646320968, "learning_rate": 5e-05, "loss": 1.4232, "num_input_tokens_seen": 69065728, "step": 1000 }, { "epoch": 62.5, "eval_synth_IoU": 0.0, "eval_synth_MAE_x": 1.335601806640625, "eval_synth_MAE_y": 1.298309326171875, "eval_synth_NUM_probability": 0.9881191998720169, "eval_synth_inside_bbox": 0.0, "eval_synth_loss": 1.8374046087265015, "eval_synth_loss_ce": 0.015139079187065363, "eval_synth_loss_xval": 1.822265625, "eval_synth_runtime": 63.5264, "eval_synth_samples_per_second": 2.015, "eval_synth_steps_per_second": 0.063, "num_input_tokens_seen": 69065728, "step": 1000 }, { "epoch": 62.5, "loss": 1.84526789188385, "loss_ce": 0.017142947763204575, "loss_xval": 1.828125, "num_input_tokens_seen": 69065728, "step": 1000 }, { "epoch": 62.5625, "grad_norm": 200.80735424840003, "learning_rate": 5e-05, "loss": 1.8535, "num_input_tokens_seen": 69124736, "step": 1001 }, { "epoch": 62.5625, "loss": 1.8449586629867554, "loss_ce": 0.016833681613206863, "loss_xval": 1.828125, "num_input_tokens_seen": 69124736, "step": 1001 }, { "epoch": 62.625, "grad_norm": 114.3159442962799, "learning_rate": 5e-05, "loss": 0.565, "num_input_tokens_seen": 69183872, "step": 1002 }, { "epoch": 62.625, "loss": 0.5889109373092651, "loss_ce": 0.018598422408103943, "loss_xval": 0.5703125, "num_input_tokens_seen": 69183872, "step": 1002 }, { "epoch": 62.6875, "grad_norm": 186.98765624810193, "learning_rate": 5e-05, "loss": 1.5284, "num_input_tokens_seen": 69255424, "step": 1003 }, { "epoch": 62.6875, "loss": 1.472475528717041, "loss_ce": 0.011538065969944, "loss_xval": 1.4609375, "num_input_tokens_seen": 69255424, "step": 1003 }, { "epoch": 62.75, "grad_norm": 66.58049614141694, "learning_rate": 5e-05, "loss": 0.2263, "num_input_tokens_seen": 69326976, "step": 1004 }, { "epoch": 62.75, "loss": 0.23296989500522614, "loss_ce": 0.012266767211258411, "loss_xval": 0.220703125, "num_input_tokens_seen": 69326976, "step": 1004 }, { "epoch": 62.8125, "grad_norm": 159.02076786011705, "learning_rate": 5e-05, "loss": 1.1836, "num_input_tokens_seen": 69398656, "step": 1005 }, { "epoch": 62.8125, "loss": 1.1524460315704346, "loss_ce": 0.00400858698412776, "loss_xval": 1.1484375, "num_input_tokens_seen": 69398656, "step": 1005 }, { "epoch": 62.875, "grad_norm": 7.503047980086942, "learning_rate": 5e-05, "loss": 0.0494, "num_input_tokens_seen": 69470336, "step": 1006 }, { "epoch": 62.875, "loss": 0.04918452724814415, "loss_ce": 0.0037743712309747934, "loss_xval": 0.04541015625, "num_input_tokens_seen": 69470336, "step": 1006 }, { "epoch": 62.9375, "grad_norm": 149.27054672521433, "learning_rate": 5e-05, "loss": 1.0062, "num_input_tokens_seen": 69542080, "step": 1007 }, { "epoch": 62.9375, "loss": 0.8986896872520447, "loss_ce": 0.004158430732786655, "loss_xval": 0.89453125, "num_input_tokens_seen": 69542080, "step": 1007 }, { "epoch": 63.0, "grad_norm": 47.09901096637737, "learning_rate": 5e-05, "loss": 0.1531, "num_input_tokens_seen": 69613760, "step": 1008 }, { "epoch": 63.0, "loss": 0.16589389741420746, "loss_ce": 0.0067142159678041935, "loss_xval": 0.1591796875, "num_input_tokens_seen": 69613760, "step": 1008 }, { "epoch": 63.0625, "grad_norm": 70.27234087538793, "learning_rate": 5e-05, "loss": 0.3167, "num_input_tokens_seen": 69685376, "step": 1009 }, { "epoch": 63.0625, "loss": 0.31789273023605347, "loss_ce": 0.019064605236053467, "loss_xval": 0.298828125, "num_input_tokens_seen": 69685376, "step": 1009 }, { "epoch": 63.125, "grad_norm": 72.23082569297506, "learning_rate": 5e-05, "loss": 0.349, "num_input_tokens_seen": 69757056, "step": 1010 }, { "epoch": 63.125, "loss": 0.352620929479599, "loss_ce": 0.012777186930179596, "loss_xval": 0.33984375, "num_input_tokens_seen": 69757056, "step": 1010 }, { "epoch": 63.1875, "grad_norm": 22.212031709182547, "learning_rate": 5e-05, "loss": 0.0616, "num_input_tokens_seen": 69828672, "step": 1011 }, { "epoch": 63.1875, "loss": 0.06873536854982376, "loss_ce": 0.013803728856146336, "loss_xval": 0.054931640625, "num_input_tokens_seen": 69828672, "step": 1011 }, { "epoch": 63.25, "grad_norm": 72.80873328440106, "learning_rate": 5e-05, "loss": 0.3807, "num_input_tokens_seen": 69900288, "step": 1012 }, { "epoch": 63.25, "loss": 0.4027869403362274, "loss_ce": 0.012161929160356522, "loss_xval": 0.390625, "num_input_tokens_seen": 69900288, "step": 1012 }, { "epoch": 63.3125, "grad_norm": 17.47620030362804, "learning_rate": 5e-05, "loss": 0.0398, "num_input_tokens_seen": 69971904, "step": 1013 }, { "epoch": 63.3125, "loss": 0.03510742262005806, "loss_ce": 0.013623046688735485, "loss_xval": 0.021484375, "num_input_tokens_seen": 69971904, "step": 1013 }, { "epoch": 63.375, "grad_norm": 55.426872710066, "learning_rate": 5e-05, "loss": 0.2349, "num_input_tokens_seen": 70043520, "step": 1014 }, { "epoch": 63.375, "loss": 0.22966395318508148, "loss_ce": 0.012867072597146034, "loss_xval": 0.216796875, "num_input_tokens_seen": 70043520, "step": 1014 }, { "epoch": 63.4375, "grad_norm": 44.943384288182784, "learning_rate": 5e-05, "loss": 0.1586, "num_input_tokens_seen": 70115264, "step": 1015 }, { "epoch": 63.4375, "loss": 0.15268297493457794, "loss_ce": 0.012057974934577942, "loss_xval": 0.140625, "num_input_tokens_seen": 70115264, "step": 1015 }, { "epoch": 63.5, "grad_norm": 20.501575432429476, "learning_rate": 5e-05, "loss": 0.0473, "num_input_tokens_seen": 70186816, "step": 1016 }, { "epoch": 63.5, "loss": 0.047375284135341644, "loss_ce": 0.011242473497986794, "loss_xval": 0.0361328125, "num_input_tokens_seen": 70186816, "step": 1016 }, { "epoch": 63.5625, "grad_norm": 50.11477187146609, "learning_rate": 5e-05, "loss": 0.1953, "num_input_tokens_seen": 70258368, "step": 1017 }, { "epoch": 63.5625, "loss": 0.2005818635225296, "loss_ce": 0.009175607934594154, "loss_xval": 0.19140625, "num_input_tokens_seen": 70258368, "step": 1017 }, { "epoch": 63.625, "grad_norm": 17.595502628261446, "learning_rate": 5e-05, "loss": 0.0397, "num_input_tokens_seen": 70329920, "step": 1018 }, { "epoch": 63.625, "loss": 0.044486869126558304, "loss_ce": 0.007133353501558304, "loss_xval": 0.037353515625, "num_input_tokens_seen": 70329920, "step": 1018 }, { "epoch": 63.6875, "grad_norm": 34.4850294390446, "learning_rate": 5e-05, "loss": 0.0961, "num_input_tokens_seen": 70401536, "step": 1019 }, { "epoch": 63.6875, "loss": 0.11015944927930832, "loss_ce": 0.005667262244969606, "loss_xval": 0.1044921875, "num_input_tokens_seen": 70401536, "step": 1019 }, { "epoch": 63.75, "grad_norm": 41.12267515604947, "learning_rate": 5e-05, "loss": 0.1271, "num_input_tokens_seen": 70473088, "step": 1020 }, { "epoch": 63.75, "loss": 0.12851181626319885, "loss_ce": 0.0054649352096021175, "loss_xval": 0.123046875, "num_input_tokens_seen": 70473088, "step": 1020 }, { "epoch": 63.8125, "grad_norm": 1.6562020306690155, "learning_rate": 5e-05, "loss": 0.0132, "num_input_tokens_seen": 70544768, "step": 1021 }, { "epoch": 63.8125, "loss": 0.009423565119504929, "loss_ce": 0.005029033403843641, "loss_xval": 0.00439453125, "num_input_tokens_seen": 70544768, "step": 1021 }, { "epoch": 63.875, "grad_norm": 40.425729593839435, "learning_rate": 5e-05, "loss": 0.1248, "num_input_tokens_seen": 70616320, "step": 1022 }, { "epoch": 63.875, "loss": 0.12477166950702667, "loss_ce": 0.005142764188349247, "loss_xval": 0.11962890625, "num_input_tokens_seen": 70616320, "step": 1022 }, { "epoch": 63.9375, "grad_norm": 26.761025062310726, "learning_rate": 5e-05, "loss": 0.0593, "num_input_tokens_seen": 70687936, "step": 1023 }, { "epoch": 63.9375, "loss": 0.05864199995994568, "loss_ce": 0.004686920437961817, "loss_xval": 0.053955078125, "num_input_tokens_seen": 70687936, "step": 1023 }, { "epoch": 64.0, "grad_norm": 16.220804961953952, "learning_rate": 5e-05, "loss": 0.0266, "num_input_tokens_seen": 70759744, "step": 1024 }, { "epoch": 64.0, "loss": 0.03134244307875633, "loss_ce": 0.0038766234647482634, "loss_xval": 0.0274658203125, "num_input_tokens_seen": 70759744, "step": 1024 }, { "epoch": 64.0625, "grad_norm": 37.128469054500776, "learning_rate": 5e-05, "loss": 0.1006, "num_input_tokens_seen": 70831424, "step": 1025 }, { "epoch": 64.0625, "loss": 0.10453501343727112, "loss_ce": 0.003949078731238842, "loss_xval": 0.1005859375, "num_input_tokens_seen": 70831424, "step": 1025 }, { "epoch": 64.125, "grad_norm": 9.66025199247302, "learning_rate": 5e-05, "loss": 0.0183, "num_input_tokens_seen": 70903168, "step": 1026 }, { "epoch": 64.125, "loss": 0.017569195479154587, "loss_ce": 0.003653179621323943, "loss_xval": 0.013916015625, "num_input_tokens_seen": 70903168, "step": 1026 }, { "epoch": 64.1875, "grad_norm": 29.50918018984489, "learning_rate": 5e-05, "loss": 0.0682, "num_input_tokens_seen": 70974848, "step": 1027 }, { "epoch": 64.1875, "loss": 0.06166379898786545, "loss_ce": 0.0035583314020186663, "loss_xval": 0.05810546875, "num_input_tokens_seen": 70974848, "step": 1027 }, { "epoch": 64.25, "grad_norm": 24.911312204437753, "learning_rate": 5e-05, "loss": 0.0505, "num_input_tokens_seen": 71046400, "step": 1028 }, { "epoch": 64.25, "loss": 0.05654432252049446, "loss_ce": 0.0035658059641718864, "loss_xval": 0.052978515625, "num_input_tokens_seen": 71046400, "step": 1028 }, { "epoch": 64.3125, "grad_norm": 9.45537493788124, "learning_rate": 5e-05, "loss": 0.0149, "num_input_tokens_seen": 71105472, "step": 1029 }, { "epoch": 64.3125, "loss": 0.015377216972410679, "loss_ce": 0.0030481156427413225, "loss_xval": 0.0123291015625, "num_input_tokens_seen": 71105472, "step": 1029 }, { "epoch": 64.375, "grad_norm": 26.640750365389675, "learning_rate": 5e-05, "loss": 0.0554, "num_input_tokens_seen": 71177088, "step": 1030 }, { "epoch": 64.375, "loss": 0.05298246070742607, "loss_ce": 0.00293363188393414, "loss_xval": 0.050048828125, "num_input_tokens_seen": 71177088, "step": 1030 }, { "epoch": 64.4375, "grad_norm": 11.291872650185505, "learning_rate": 5e-05, "loss": 0.0144, "num_input_tokens_seen": 71248768, "step": 1031 }, { "epoch": 64.4375, "loss": 0.014845769852399826, "loss_ce": 0.002821844071149826, "loss_xval": 0.01202392578125, "num_input_tokens_seen": 71248768, "step": 1031 }, { "epoch": 64.5, "grad_norm": 18.072573126276268, "learning_rate": 5e-05, "loss": 0.0278, "num_input_tokens_seen": 71320384, "step": 1032 }, { "epoch": 64.5, "loss": 0.027823323383927345, "loss_ce": 0.0029209794010967016, "loss_xval": 0.02490234375, "num_input_tokens_seen": 71320384, "step": 1032 }, { "epoch": 64.5625, "grad_norm": 21.662435019484295, "learning_rate": 5e-05, "loss": 0.0382, "num_input_tokens_seen": 71392000, "step": 1033 }, { "epoch": 64.5625, "loss": 0.03982299193739891, "loss_ce": 0.002713618101552129, "loss_xval": 0.037109375, "num_input_tokens_seen": 71392000, "step": 1033 }, { "epoch": 64.625, "grad_norm": 2.838038638389152, "learning_rate": 5e-05, "loss": 0.0068, "num_input_tokens_seen": 71463680, "step": 1034 }, { "epoch": 64.625, "loss": 0.006255846470594406, "loss_ce": 0.002441149204969406, "loss_xval": 0.003814697265625, "num_input_tokens_seen": 71463680, "step": 1034 }, { "epoch": 64.6875, "grad_norm": 20.100523420312953, "learning_rate": 5e-05, "loss": 0.0371, "num_input_tokens_seen": 71535488, "step": 1035 }, { "epoch": 64.6875, "loss": 0.03598330169916153, "loss_ce": 0.0022918949835002422, "loss_xval": 0.03369140625, "num_input_tokens_seen": 71535488, "step": 1035 }, { "epoch": 64.75, "grad_norm": 11.602017826304696, "learning_rate": 5e-05, "loss": 0.0152, "num_input_tokens_seen": 71594624, "step": 1036 }, { "epoch": 64.75, "loss": 0.016812482848763466, "loss_ce": 0.0021640451159328222, "loss_xval": 0.0146484375, "num_input_tokens_seen": 71594624, "step": 1036 }, { "epoch": 64.8125, "grad_norm": 8.661482672312363, "learning_rate": 5e-05, "loss": 0.0096, "num_input_tokens_seen": 71666368, "step": 1037 }, { "epoch": 64.8125, "loss": 0.010122916661202908, "loss_ce": 0.002249381737783551, "loss_xval": 0.00787353515625, "num_input_tokens_seen": 71666368, "step": 1037 }, { "epoch": 64.875, "grad_norm": 18.271595554722843, "learning_rate": 5e-05, "loss": 0.0279, "num_input_tokens_seen": 71738048, "step": 1038 }, { "epoch": 64.875, "loss": 0.03008742816746235, "loss_ce": 0.0022553973831236362, "loss_xval": 0.02783203125, "num_input_tokens_seen": 71738048, "step": 1038 }, { "epoch": 64.9375, "grad_norm": 3.060520876484566, "learning_rate": 5e-05, "loss": 0.005, "num_input_tokens_seen": 71809600, "step": 1039 }, { "epoch": 64.9375, "loss": 0.006218142807483673, "loss_ce": 0.002128787338733673, "loss_xval": 0.00408935546875, "num_input_tokens_seen": 71809600, "step": 1039 }, { "epoch": 65.0, "grad_norm": 13.986125332437513, "learning_rate": 5e-05, "loss": 0.0174, "num_input_tokens_seen": 71868672, "step": 1040 }, { "epoch": 65.0, "loss": 0.019594108685851097, "loss_ce": 0.001893913489766419, "loss_xval": 0.0177001953125, "num_input_tokens_seen": 71868672, "step": 1040 }, { "epoch": 65.0625, "grad_norm": 12.209464179139154, "learning_rate": 5e-05, "loss": 0.0145, "num_input_tokens_seen": 71940480, "step": 1041 }, { "epoch": 65.0625, "loss": 0.014442279934883118, "loss_ce": 0.0018690372817218304, "loss_xval": 0.0125732421875, "num_input_tokens_seen": 71940480, "step": 1041 }, { "epoch": 65.125, "grad_norm": 4.50926374607243, "learning_rate": 5e-05, "loss": 0.0058, "num_input_tokens_seen": 72012096, "step": 1042 }, { "epoch": 65.125, "loss": 0.0058934856206178665, "loss_ce": 0.0019262002315372229, "loss_xval": 0.00396728515625, "num_input_tokens_seen": 72012096, "step": 1042 }, { "epoch": 65.1875, "grad_norm": 15.853797766114015, "learning_rate": 5e-05, "loss": 0.0215, "num_input_tokens_seen": 72083648, "step": 1043 }, { "epoch": 65.1875, "loss": 0.021349458023905754, "loss_ce": 0.0021844184957444668, "loss_xval": 0.0191650390625, "num_input_tokens_seen": 72083648, "step": 1043 }, { "epoch": 65.25, "grad_norm": 3.198409504037858, "learning_rate": 5e-05, "loss": 0.0047, "num_input_tokens_seen": 72155328, "step": 1044 }, { "epoch": 65.25, "loss": 0.004548709373921156, "loss_ce": 0.0017258335137739778, "loss_xval": 0.0028228759765625, "num_input_tokens_seen": 72155328, "step": 1044 }, { "epoch": 65.3125, "grad_norm": 11.095617305917825, "learning_rate": 5e-05, "loss": 0.0121, "num_input_tokens_seen": 72226944, "step": 1045 }, { "epoch": 65.3125, "loss": 0.011855102144181728, "loss_ce": 0.0016622307011857629, "loss_xval": 0.01019287109375, "num_input_tokens_seen": 72226944, "step": 1045 }, { "epoch": 65.375, "grad_norm": 10.122140050011588, "learning_rate": 5e-05, "loss": 0.0107, "num_input_tokens_seen": 72298496, "step": 1046 }, { "epoch": 65.375, "loss": 0.011897360906004906, "loss_ce": 0.001582419266924262, "loss_xval": 0.01031494140625, "num_input_tokens_seen": 72298496, "step": 1046 }, { "epoch": 65.4375, "grad_norm": 7.1144618783809745, "learning_rate": 5e-05, "loss": 0.0066, "num_input_tokens_seen": 72370112, "step": 1047 }, { "epoch": 65.4375, "loss": 0.006526774261146784, "loss_ce": 0.0016134442994371057, "loss_xval": 0.004913330078125, "num_input_tokens_seen": 72370112, "step": 1047 }, { "epoch": 65.5, "grad_norm": 10.69983363857111, "learning_rate": 5e-05, "loss": 0.0109, "num_input_tokens_seen": 72429184, "step": 1048 }, { "epoch": 65.5, "loss": 0.010820340365171432, "loss_ce": 0.0014209267683327198, "loss_xval": 0.0093994140625, "num_input_tokens_seen": 72429184, "step": 1048 }, { "epoch": 65.5625, "grad_norm": 0.7117590553510648, "learning_rate": 5e-05, "loss": 0.0038, "num_input_tokens_seen": 72500800, "step": 1049 }, { "epoch": 65.5625, "loss": 0.0037945066578686237, "loss_ce": 0.0013683591969311237, "loss_xval": 0.0024261474609375, "num_input_tokens_seen": 72500800, "step": 1049 }, { "epoch": 65.625, "grad_norm": 10.867323388580141, "learning_rate": 5e-05, "loss": 0.0111, "num_input_tokens_seen": 72559936, "step": 1050 }, { "epoch": 65.625, "loss": 0.011647101491689682, "loss_ce": 0.001271124929189682, "loss_xval": 0.0103759765625, "num_input_tokens_seen": 72559936, "step": 1050 }, { "epoch": 65.6875, "grad_norm": 5.686620656543113, "learning_rate": 5e-05, "loss": 0.005, "num_input_tokens_seen": 72631616, "step": 1051 }, { "epoch": 65.6875, "loss": 0.005414774175733328, "loss_ce": 0.0012949011288583279, "loss_xval": 0.004119873046875, "num_input_tokens_seen": 72631616, "step": 1051 }, { "epoch": 65.75, "grad_norm": 7.442335265025644, "learning_rate": 5e-05, "loss": 0.0071, "num_input_tokens_seen": 72703296, "step": 1052 }, { "epoch": 65.75, "loss": 0.007290867157280445, "loss_ce": 0.001217869110405445, "loss_xval": 0.006072998046875, "num_input_tokens_seen": 72703296, "step": 1052 }, { "epoch": 65.8125, "grad_norm": 8.994490972618683, "learning_rate": 5e-05, "loss": 0.0095, "num_input_tokens_seen": 72774912, "step": 1053 }, { "epoch": 65.8125, "loss": 0.010024000890552998, "loss_ce": 0.0012349386233836412, "loss_xval": 0.0087890625, "num_input_tokens_seen": 72774912, "step": 1053 }, { "epoch": 65.875, "grad_norm": 4.616509642337401, "learning_rate": 5e-05, "loss": 0.004, "num_input_tokens_seen": 72846528, "step": 1054 }, { "epoch": 65.875, "loss": 0.0034966999664902687, "loss_ce": 0.0011926227016374469, "loss_xval": 0.0023040771484375, "num_input_tokens_seen": 72846528, "step": 1054 }, { "epoch": 65.9375, "grad_norm": 8.391185923447743, "learning_rate": 5e-05, "loss": 0.0065, "num_input_tokens_seen": 72918080, "step": 1055 }, { "epoch": 65.9375, "loss": 0.006700682453811169, "loss_ce": 0.0011464833514764905, "loss_xval": 0.00555419921875, "num_input_tokens_seen": 72918080, "step": 1055 }, { "epoch": 66.0, "grad_norm": 0.5200151132607369, "learning_rate": 5e-05, "loss": 0.0024, "num_input_tokens_seen": 72989632, "step": 1056 }, { "epoch": 66.0, "loss": 0.0027731633745133877, "loss_ce": 0.0015372015768662095, "loss_xval": 0.0012359619140625, "num_input_tokens_seen": 72989632, "step": 1056 }, { "epoch": 66.0625, "grad_norm": 7.8363630439888, "learning_rate": 5e-05, "loss": 0.0064, "num_input_tokens_seen": 73061248, "step": 1057 }, { "epoch": 66.0625, "loss": 0.005550441797822714, "loss_ce": 0.0010948755079880357, "loss_xval": 0.00445556640625, "num_input_tokens_seen": 73061248, "step": 1057 }, { "epoch": 66.125, "grad_norm": 2.6651868002357277, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 73132800, "step": 1058 }, { "epoch": 66.125, "loss": 0.0026142490096390247, "loss_ce": 0.0011112582869827747, "loss_xval": 0.00150299072265625, "num_input_tokens_seen": 73132800, "step": 1058 }, { "epoch": 66.1875, "grad_norm": 4.598939057870578, "learning_rate": 5e-05, "loss": 0.0036, "num_input_tokens_seen": 73204352, "step": 1059 }, { "epoch": 66.1875, "loss": 0.003604610450565815, "loss_ce": 0.001025875099003315, "loss_xval": 0.0025787353515625, "num_input_tokens_seen": 73204352, "step": 1059 }, { "epoch": 66.25, "grad_norm": 5.179290730834164, "learning_rate": 5e-05, "loss": 0.0038, "num_input_tokens_seen": 73275968, "step": 1060 }, { "epoch": 66.25, "loss": 0.0038217175751924515, "loss_ce": 0.0009988414822146297, "loss_xval": 0.0028228759765625, "num_input_tokens_seen": 73275968, "step": 1060 }, { "epoch": 66.3125, "grad_norm": 2.314749895515334, "learning_rate": 5e-05, "loss": 0.0023, "num_input_tokens_seen": 73347648, "step": 1061 }, { "epoch": 66.3125, "loss": 0.0025231381878256798, "loss_ce": 0.0010277768597006798, "loss_xval": 0.001495361328125, "num_input_tokens_seen": 73347648, "step": 1061 }, { "epoch": 66.375, "grad_norm": 7.116350353107652, "learning_rate": 5e-05, "loss": 0.005, "num_input_tokens_seen": 73419392, "step": 1062 }, { "epoch": 66.375, "loss": 0.005443810019642115, "loss_ce": 0.0010187613079324365, "loss_xval": 0.004425048828125, "num_input_tokens_seen": 73419392, "step": 1062 }, { "epoch": 66.4375, "grad_norm": 0.7000035546873251, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 73490944, "step": 1063 }, { "epoch": 66.4375, "loss": 0.001345249591395259, "loss_ce": 0.0008722271886654198, "loss_xval": 0.0004730224609375, "num_input_tokens_seen": 73490944, "step": 1063 }, { "epoch": 66.5, "grad_norm": 7.391093285153892, "learning_rate": 5e-05, "loss": 0.0054, "num_input_tokens_seen": 73562624, "step": 1064 }, { "epoch": 66.5, "loss": 0.005488572642207146, "loss_ce": 0.0008499008254148066, "loss_xval": 0.004638671875, "num_input_tokens_seen": 73562624, "step": 1064 }, { "epoch": 66.5625, "grad_norm": 1.4615064596624099, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 73634240, "step": 1065 }, { "epoch": 66.5625, "loss": 0.0013411077670753002, "loss_ce": 0.0008528265752829611, "loss_xval": 0.00048828125, "num_input_tokens_seen": 73634240, "step": 1065 }, { "epoch": 66.625, "grad_norm": 7.578130916384667, "learning_rate": 5e-05, "loss": 0.0062, "num_input_tokens_seen": 73705792, "step": 1066 }, { "epoch": 66.625, "loss": 0.006180023308843374, "loss_ce": 0.000778412155341357, "loss_xval": 0.005401611328125, "num_input_tokens_seen": 73705792, "step": 1066 }, { "epoch": 66.6875, "grad_norm": 2.7945854003731605, "learning_rate": 5e-05, "loss": 0.0023, "num_input_tokens_seen": 73777472, "step": 1067 }, { "epoch": 66.6875, "loss": 0.002022083615884185, "loss_ce": 0.0008166392799466848, "loss_xval": 0.0012054443359375, "num_input_tokens_seen": 73777472, "step": 1067 }, { "epoch": 66.75, "grad_norm": 7.82940226419613, "learning_rate": 5e-05, "loss": 0.0054, "num_input_tokens_seen": 73849088, "step": 1068 }, { "epoch": 66.75, "loss": 0.005442717578262091, "loss_ce": 0.0007735280669294298, "loss_xval": 0.004669189453125, "num_input_tokens_seen": 73849088, "step": 1068 }, { "epoch": 66.8125, "grad_norm": 2.9098149809574982, "learning_rate": 5e-05, "loss": 0.0021, "num_input_tokens_seen": 73920640, "step": 1069 }, { "epoch": 66.8125, "loss": 0.0020586801692843437, "loss_ce": 0.0007006479427218437, "loss_xval": 0.0013580322265625, "num_input_tokens_seen": 73920640, "step": 1069 }, { "epoch": 66.875, "grad_norm": 5.130710698587081, "learning_rate": 5e-05, "loss": 0.0031, "num_input_tokens_seen": 73979776, "step": 1070 }, { "epoch": 66.875, "loss": 0.003260532859712839, "loss_ce": 0.000727573991753161, "loss_xval": 0.002532958984375, "num_input_tokens_seen": 73979776, "step": 1070 }, { "epoch": 66.9375, "grad_norm": 5.263835846590998, "learning_rate": 5e-05, "loss": 0.0032, "num_input_tokens_seen": 74051456, "step": 1071 }, { "epoch": 66.9375, "loss": 0.0030756406486034393, "loss_ce": 0.0006952694384381175, "loss_xval": 0.00238037109375, "num_input_tokens_seen": 74051456, "step": 1071 }, { "epoch": 67.0, "grad_norm": 4.423793604769365, "learning_rate": 5e-05, "loss": 0.0027, "num_input_tokens_seen": 74123136, "step": 1072 }, { "epoch": 67.0, "loss": 0.003088552039116621, "loss_ce": 0.0007234398508444428, "loss_xval": 0.0023651123046875, "num_input_tokens_seen": 74123136, "step": 1072 }, { "epoch": 67.0625, "grad_norm": 5.4151559989195714, "learning_rate": 5e-05, "loss": 0.0035, "num_input_tokens_seen": 74169728, "step": 1073 }, { "epoch": 67.0625, "loss": 0.00415863236412406, "loss_ce": 0.0006643697270192206, "loss_xval": 0.0034942626953125, "num_input_tokens_seen": 74169728, "step": 1073 }, { "epoch": 67.125, "grad_norm": 2.6697138026778893, "learning_rate": 5e-05, "loss": 0.0021, "num_input_tokens_seen": 74241280, "step": 1074 }, { "epoch": 67.125, "loss": 0.0019654666539281607, "loss_ce": 0.0006760990363545716, "loss_xval": 0.00128936767578125, "num_input_tokens_seen": 74241280, "step": 1074 }, { "epoch": 67.1875, "grad_norm": 6.466851461930606, "learning_rate": 5e-05, "loss": 0.0041, "num_input_tokens_seen": 74312832, "step": 1075 }, { "epoch": 67.1875, "loss": 0.004360595252364874, "loss_ce": 0.0006984857027418911, "loss_xval": 0.003662109375, "num_input_tokens_seen": 74312832, "step": 1075 }, { "epoch": 67.25, "grad_norm": 2.748059317089271, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 74384448, "step": 1076 }, { "epoch": 67.25, "loss": 0.0014462934341281652, "loss_ce": 0.0006108746747486293, "loss_xval": 0.000835418701171875, "num_input_tokens_seen": 74384448, "step": 1076 }, { "epoch": 67.3125, "grad_norm": 6.290131306911363, "learning_rate": 5e-05, "loss": 0.0036, "num_input_tokens_seen": 74456000, "step": 1077 }, { "epoch": 67.3125, "loss": 0.004040678031742573, "loss_ce": 0.0006227090489119291, "loss_xval": 0.00341796875, "num_input_tokens_seen": 74456000, "step": 1077 }, { "epoch": 67.375, "grad_norm": 3.3652699857297126, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 74527680, "step": 1078 }, { "epoch": 67.375, "loss": 0.0017457085195928812, "loss_ce": 0.0005936700035817921, "loss_xval": 0.00115203857421875, "num_input_tokens_seen": 74527680, "step": 1078 }, { "epoch": 67.4375, "grad_norm": 4.970270177767509, "learning_rate": 5e-05, "loss": 0.0026, "num_input_tokens_seen": 74599360, "step": 1079 }, { "epoch": 67.4375, "loss": 0.002559410873800516, "loss_ce": 0.0005910270265303552, "loss_xval": 0.0019683837890625, "num_input_tokens_seen": 74599360, "step": 1079 }, { "epoch": 67.5, "grad_norm": 3.2857842297194555, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 74671040, "step": 1080 }, { "epoch": 67.5, "loss": 0.001640387112274766, "loss_ce": 0.0005493836360983551, "loss_xval": 0.00109100341796875, "num_input_tokens_seen": 74671040, "step": 1080 }, { "epoch": 67.5625, "grad_norm": 3.3166996297328875, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 74730112, "step": 1081 }, { "epoch": 67.5625, "loss": 0.0017551060300320387, "loss_ce": 0.0005115147796459496, "loss_xval": 0.00124359130859375, "num_input_tokens_seen": 74730112, "step": 1081 }, { "epoch": 67.625, "grad_norm": 2.5686066250891417, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 74801792, "step": 1082 }, { "epoch": 67.625, "loss": 0.0016136132180690765, "loss_ce": 0.0005531273782253265, "loss_xval": 0.00106048583984375, "num_input_tokens_seen": 74801792, "step": 1082 }, { "epoch": 67.6875, "grad_norm": 2.109128772753182, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 74860992, "step": 1083 }, { "epoch": 67.6875, "loss": 0.001649228623136878, "loss_ce": 0.0005505957524292171, "loss_xval": 0.0010986328125, "num_input_tokens_seen": 74860992, "step": 1083 }, { "epoch": 67.75, "grad_norm": 2.3087502620833376, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 74932672, "step": 1084 }, { "epoch": 67.75, "loss": 0.0018915177788585424, "loss_ce": 0.0005258561577647924, "loss_xval": 0.00136566162109375, "num_input_tokens_seen": 74932672, "step": 1084 }, { "epoch": 67.8125, "grad_norm": 1.606598249894243, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 75004352, "step": 1085 }, { "epoch": 67.8125, "loss": 0.001132636098191142, "loss_ce": 0.000495581713039428, "loss_xval": 0.000637054443359375, "num_input_tokens_seen": 75004352, "step": 1085 }, { "epoch": 67.875, "grad_norm": 2.3263075112685443, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 75075904, "step": 1086 }, { "epoch": 67.875, "loss": 0.0011369904968887568, "loss_ce": 0.0005418977816589177, "loss_xval": 0.0005950927734375, "num_input_tokens_seen": 75075904, "step": 1086 }, { "epoch": 67.9375, "grad_norm": 1.9644424594312697, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 75147520, "step": 1087 }, { "epoch": 67.9375, "loss": 0.001179728889837861, "loss_ce": 0.00047400989569723606, "loss_xval": 0.000705718994140625, "num_input_tokens_seen": 75147520, "step": 1087 }, { "epoch": 68.0, "grad_norm": 4.111860002263121, "learning_rate": 5e-05, "loss": 0.002, "num_input_tokens_seen": 75219136, "step": 1088 }, { "epoch": 68.0, "loss": 0.002233484061434865, "loss_ce": 0.00047109383740462363, "loss_xval": 0.00176239013671875, "num_input_tokens_seen": 75219136, "step": 1088 }, { "epoch": 68.0625, "grad_norm": 0.582760051076, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 75290816, "step": 1089 }, { "epoch": 68.0625, "loss": 0.0007814638083800673, "loss_ce": 0.0004781954048667103, "loss_xval": 0.0003032684326171875, "num_input_tokens_seen": 75290816, "step": 1089 }, { "epoch": 68.125, "grad_norm": 3.3103458752612496, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 75362624, "step": 1090 }, { "epoch": 68.125, "loss": 0.0014409287832677364, "loss_ce": 0.00044910749420523643, "loss_xval": 0.0009918212890625, "num_input_tokens_seen": 75362624, "step": 1090 }, { "epoch": 68.1875, "grad_norm": 2.086643414735248, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 75434432, "step": 1091 }, { "epoch": 68.1875, "loss": 0.0010076756589114666, "loss_ce": 0.00042021225090138614, "loss_xval": 0.00058746337890625, "num_input_tokens_seen": 75434432, "step": 1091 }, { "epoch": 68.25, "grad_norm": 2.022816660904633, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 75506048, "step": 1092 }, { "epoch": 68.25, "loss": 0.0015586587833240628, "loss_ce": 0.0004447671817615628, "loss_xval": 0.0011138916015625, "num_input_tokens_seen": 75506048, "step": 1092 }, { "epoch": 68.3125, "grad_norm": 3.020208647060878, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 75577600, "step": 1093 }, { "epoch": 68.3125, "loss": 0.0018930145306512713, "loss_ce": 0.00043580017518252134, "loss_xval": 0.00145721435546875, "num_input_tokens_seen": 75577600, "step": 1093 }, { "epoch": 68.375, "grad_norm": 0.3506489901589061, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 75649216, "step": 1094 }, { "epoch": 68.375, "loss": 0.0009144733194261789, "loss_ce": 0.0004109333094675094, "loss_xval": 0.0005035400390625, "num_input_tokens_seen": 75649216, "step": 1094 }, { "epoch": 68.4375, "grad_norm": 2.0988202553666055, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 75720832, "step": 1095 }, { "epoch": 68.4375, "loss": 0.0012659782078117132, "loss_ce": 0.00041530077578499913, "loss_xval": 0.000850677490234375, "num_input_tokens_seen": 75720832, "step": 1095 }, { "epoch": 68.5, "grad_norm": 1.105888327952167, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 75792448, "step": 1096 }, { "epoch": 68.5, "loss": 0.0007757066050544381, "loss_ce": 0.00043047647341154516, "loss_xval": 0.0003452301025390625, "num_input_tokens_seen": 75792448, "step": 1096 }, { "epoch": 68.5625, "grad_norm": 1.9460696134458952, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 75864256, "step": 1097 }, { "epoch": 68.5625, "loss": 0.0008472871268168092, "loss_ce": 0.0004124116094317287, "loss_xval": 0.00043487548828125, "num_input_tokens_seen": 75864256, "step": 1097 }, { "epoch": 68.625, "grad_norm": 2.96760282304176, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 75923392, "step": 1098 }, { "epoch": 68.625, "loss": 0.0010627032024785876, "loss_ce": 0.0003875018155667931, "loss_xval": 0.000675201416015625, "num_input_tokens_seen": 75923392, "step": 1098 }, { "epoch": 68.6875, "grad_norm": 0.43332427108493793, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 75982464, "step": 1099 }, { "epoch": 68.6875, "loss": 0.0010092169977724552, "loss_ce": 0.00037216261262074113, "loss_xval": 0.000637054443359375, "num_input_tokens_seen": 75982464, "step": 1099 }, { "epoch": 68.75, "grad_norm": 2.2587393753504292, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 76054080, "step": 1100 }, { "epoch": 68.75, "loss": 0.0009440279682166874, "loss_ce": 0.00037563807563856244, "loss_xval": 0.000568389892578125, "num_input_tokens_seen": 76054080, "step": 1100 }, { "epoch": 68.8125, "grad_norm": 1.448401941251065, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 76125760, "step": 1101 }, { "epoch": 68.8125, "loss": 0.0006880094297230244, "loss_ce": 0.0003752042830456048, "loss_xval": 0.00031280517578125, "num_input_tokens_seen": 76125760, "step": 1101 }, { "epoch": 68.875, "grad_norm": 1.0144090906009593, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 76197376, "step": 1102 }, { "epoch": 68.875, "loss": 0.0008232088293880224, "loss_ce": 0.00038261126610450447, "loss_xval": 0.0004405975341796875, "num_input_tokens_seen": 76197376, "step": 1102 }, { "epoch": 68.9375, "grad_norm": 1.502076433418958, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 76268992, "step": 1103 }, { "epoch": 68.9375, "loss": 0.0007593849441036582, "loss_ce": 0.00036647109664045274, "loss_xval": 0.000392913818359375, "num_input_tokens_seen": 76268992, "step": 1103 }, { "epoch": 69.0, "grad_norm": 0.2226085749394477, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 76340608, "step": 1104 }, { "epoch": 69.0, "loss": 0.0006789171602576971, "loss_ce": 0.0003718340303748846, "loss_xval": 0.0003070831298828125, "num_input_tokens_seen": 76340608, "step": 1104 }, { "epoch": 69.0625, "grad_norm": 1.1949850171733325, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 76412288, "step": 1105 }, { "epoch": 69.0625, "loss": 0.0007067075930535793, "loss_ce": 0.0003652921586763114, "loss_xval": 0.0003414154052734375, "num_input_tokens_seen": 76412288, "step": 1105 }, { "epoch": 69.125, "grad_norm": 0.22225027753658388, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 76483840, "step": 1106 }, { "epoch": 69.125, "loss": 0.000622923020273447, "loss_ce": 0.00034826481714844704, "loss_xval": 0.000274658203125, "num_input_tokens_seen": 76483840, "step": 1106 }, { "epoch": 69.1875, "grad_norm": 1.6311751923551645, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 76555456, "step": 1107 }, { "epoch": 69.1875, "loss": 0.000752802356146276, "loss_ce": 0.00034844447509385645, "loss_xval": 0.00040435791015625, "num_input_tokens_seen": 76555456, "step": 1107 }, { "epoch": 69.25, "grad_norm": 0.90413859652661, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 76614656, "step": 1108 }, { "epoch": 69.25, "loss": 0.0005864095874130726, "loss_ce": 0.00032701014424674213, "loss_xval": 0.0002593994140625, "num_input_tokens_seen": 76614656, "step": 1108 }, { "epoch": 69.3125, "grad_norm": 0.8893801851428039, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 76686208, "step": 1109 }, { "epoch": 69.3125, "loss": 0.0005970131605863571, "loss_ce": 0.0003452431410551071, "loss_xval": 0.00025177001953125, "num_input_tokens_seen": 76686208, "step": 1109 }, { "epoch": 69.375, "grad_norm": 1.8984967276637597, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 76745280, "step": 1110 }, { "epoch": 69.375, "loss": 0.0007707853801548481, "loss_ce": 0.00030920698191039264, "loss_xval": 0.000461578369140625, "num_input_tokens_seen": 76745280, "step": 1110 }, { "epoch": 69.4375, "grad_norm": 1.1870451841541478, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 76816832, "step": 1111 }, { "epoch": 69.4375, "loss": 0.0005835613701492548, "loss_ce": 0.00032034722971729934, "loss_xval": 0.000263214111328125, "num_input_tokens_seen": 76816832, "step": 1111 }, { "epoch": 69.5, "grad_norm": 0.9232265195058813, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 76888384, "step": 1112 }, { "epoch": 69.5, "loss": 0.0006464470061473548, "loss_ce": 0.00030503160087391734, "loss_xval": 0.0003414154052734375, "num_input_tokens_seen": 76888384, "step": 1112 }, { "epoch": 69.5625, "grad_norm": 2.1241217130665517, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 76960064, "step": 1113 }, { "epoch": 69.5625, "loss": 0.0007790784584358335, "loss_ce": 0.00030987069476395845, "loss_xval": 0.000469207763671875, "num_input_tokens_seen": 76960064, "step": 1113 }, { "epoch": 69.625, "grad_norm": 0.5368007405442083, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 77031616, "step": 1114 }, { "epoch": 69.625, "loss": 0.0010094753233715892, "loss_ce": 0.0003037563001271337, "loss_xval": 0.000705718994140625, "num_input_tokens_seen": 77031616, "step": 1114 }, { "epoch": 69.6875, "grad_norm": 2.8898095165154576, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 77090688, "step": 1115 }, { "epoch": 69.6875, "loss": 0.0013308109482750297, "loss_ce": 0.00029321329202502966, "loss_xval": 0.00103759765625, "num_input_tokens_seen": 77090688, "step": 1115 }, { "epoch": 69.75, "grad_norm": 4.702732081058997, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 77149888, "step": 1116 }, { "epoch": 69.75, "loss": 0.0017137245740741491, "loss_ce": 0.0002946572203654796, "loss_xval": 0.0014190673828125, "num_input_tokens_seen": 77149888, "step": 1116 }, { "epoch": 69.8125, "grad_norm": 3.5802036236691444, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 77221440, "step": 1117 }, { "epoch": 69.8125, "loss": 0.0014380008215084672, "loss_ce": 0.0002859622472897172, "loss_xval": 0.00115203857421875, "num_input_tokens_seen": 77221440, "step": 1117 }, { "epoch": 69.875, "grad_norm": 0.6476686940933929, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 77293120, "step": 1118 }, { "epoch": 69.875, "loss": 0.0006478414870798588, "loss_ce": 0.00029116732184775174, "loss_xval": 0.0003566741943359375, "num_input_tokens_seen": 77293120, "step": 1118 }, { "epoch": 69.9375, "grad_norm": 2.1153515585154823, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 77364736, "step": 1119 }, { "epoch": 69.9375, "loss": 0.000696200062520802, "loss_ce": 0.00028612007736228406, "loss_xval": 0.0004100799560546875, "num_input_tokens_seen": 77364736, "step": 1119 }, { "epoch": 70.0, "grad_norm": 2.784751995439613, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 77423936, "step": 1120 }, { "epoch": 70.0, "loss": 0.000890074297785759, "loss_ce": 0.00026446394622325897, "loss_xval": 0.0006256103515625, "num_input_tokens_seen": 77423936, "step": 1120 }, { "epoch": 70.0625, "grad_norm": 1.5326245030717471, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 77495616, "step": 1121 }, { "epoch": 70.0625, "loss": 0.0007288586348295212, "loss_ce": 0.0002653728879522532, "loss_xval": 0.0004634857177734375, "num_input_tokens_seen": 77495616, "step": 1121 }, { "epoch": 70.125, "grad_norm": 0.09610609543339954, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 77567232, "step": 1122 }, { "epoch": 70.125, "loss": 0.0005251829279586673, "loss_ce": 0.0002562467707321048, "loss_xval": 0.0002689361572265625, "num_input_tokens_seen": 77567232, "step": 1122 }, { "epoch": 70.1875, "grad_norm": 0.746399639745206, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 77638784, "step": 1123 }, { "epoch": 70.1875, "loss": 0.00044554518535733223, "loss_ce": 0.00025862501934170723, "loss_xval": 0.000186920166015625, "num_input_tokens_seen": 77638784, "step": 1123 }, { "epoch": 70.25, "grad_norm": 0.5062431402486854, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 77710400, "step": 1124 }, { "epoch": 70.25, "loss": 0.0004995564231649041, "loss_ce": 0.0002477864036336541, "loss_xval": 0.00025177001953125, "num_input_tokens_seen": 77710400, "step": 1124 }, { "epoch": 70.3125, "grad_norm": 0.7834653348132636, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 77781952, "step": 1125 }, { "epoch": 70.3125, "loss": 0.0007250700728036463, "loss_ce": 0.0002558623091317713, "loss_xval": 0.000469207763671875, "num_input_tokens_seen": 77781952, "step": 1125 }, { "epoch": 70.375, "grad_norm": 2.6329741353793525, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 77841024, "step": 1126 }, { "epoch": 70.375, "loss": 0.0008808316779322922, "loss_ce": 0.00023996253730729222, "loss_xval": 0.000640869140625, "num_input_tokens_seen": 77841024, "step": 1126 }, { "epoch": 70.4375, "grad_norm": 3.810101421457146, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 77912704, "step": 1127 }, { "epoch": 70.4375, "loss": 0.001295228023082018, "loss_ce": 0.00025763033772818744, "loss_xval": 0.00103759765625, "num_input_tokens_seen": 77912704, "step": 1127 }, { "epoch": 70.5, "grad_norm": 3.3135496242953315, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 77984256, "step": 1128 }, { "epoch": 70.5, "loss": 0.0012748048175126314, "loss_ce": 0.0002524659794289619, "loss_xval": 0.0010223388671875, "num_input_tokens_seen": 77984256, "step": 1128 }, { "epoch": 70.5625, "grad_norm": 1.2567102717320116, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 78043392, "step": 1129 }, { "epoch": 70.5625, "loss": 0.0005960181588307023, "loss_ce": 0.00024697332992218435, "loss_xval": 0.0003490447998046875, "num_input_tokens_seen": 78043392, "step": 1129 }, { "epoch": 70.625, "grad_norm": 1.5373568630363612, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 78115200, "step": 1130 }, { "epoch": 70.625, "loss": 0.0009142133640125394, "loss_ce": 0.00024282665981445462, "loss_xval": 0.00067138671875, "num_input_tokens_seen": 78115200, "step": 1130 }, { "epoch": 70.6875, "grad_norm": 3.3277541930648953, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 78161792, "step": 1131 }, { "epoch": 70.6875, "loss": 0.001277487725019455, "loss_ce": 0.0002170019142795354, "loss_xval": 0.00106048583984375, "num_input_tokens_seen": 78161792, "step": 1131 }, { "epoch": 70.75, "grad_norm": 3.077346914693892, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 78220864, "step": 1132 }, { "epoch": 70.75, "loss": 0.0009054588153958321, "loss_ce": 0.00022644268756266683, "loss_xval": 0.00067901611328125, "num_input_tokens_seen": 78220864, "step": 1132 }, { "epoch": 70.8125, "grad_norm": 1.3757736730193164, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 78292672, "step": 1133 }, { "epoch": 70.8125, "loss": 0.0008023668196983635, "loss_ce": 0.00023016221530269831, "loss_xval": 0.00057220458984375, "num_input_tokens_seen": 78292672, "step": 1133 }, { "epoch": 70.875, "grad_norm": 0.6192579612834873, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 78364352, "step": 1134 }, { "epoch": 70.875, "loss": 0.0005293790600262582, "loss_ce": 0.0002222959155915305, "loss_xval": 0.0003070831298828125, "num_input_tokens_seen": 78364352, "step": 1134 }, { "epoch": 70.9375, "grad_norm": 1.7480415437279953, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 78435904, "step": 1135 }, { "epoch": 70.9375, "loss": 0.0006328938179649413, "loss_ce": 0.00021327711874619126, "loss_xval": 0.00041961669921875, "num_input_tokens_seen": 78435904, "step": 1135 }, { "epoch": 71.0, "grad_norm": 2.4914839158123705, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 78507520, "step": 1136 }, { "epoch": 71.0, "loss": 0.0007470841519534588, "loss_ce": 0.00022828533838037401, "loss_xval": 0.000518798828125, "num_input_tokens_seen": 78507520, "step": 1136 }, { "epoch": 71.0625, "grad_norm": 3.127032348632522, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 78579264, "step": 1137 }, { "epoch": 71.0625, "loss": 0.0009275174816139042, "loss_ce": 0.000217983775655739, "loss_xval": 0.00070953369140625, "num_input_tokens_seen": 78579264, "step": 1137 }, { "epoch": 71.125, "grad_norm": 3.296218627743644, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 78650880, "step": 1138 }, { "epoch": 71.125, "loss": 0.0013986143749207258, "loss_ce": 0.00021605828078463674, "loss_xval": 0.00118255615234375, "num_input_tokens_seen": 78650880, "step": 1138 }, { "epoch": 71.1875, "grad_norm": 3.2523286688234703, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 78722624, "step": 1139 }, { "epoch": 71.1875, "loss": 0.0010947894770652056, "loss_ce": 0.00020977971144020557, "loss_xval": 0.000885009765625, "num_input_tokens_seen": 78722624, "step": 1139 }, { "epoch": 71.25, "grad_norm": 2.8340559225912716, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 78794240, "step": 1140 }, { "epoch": 71.25, "loss": 0.0009315769420936704, "loss_ce": 0.0001991550379898399, "loss_xval": 0.000732421875, "num_input_tokens_seen": 78794240, "step": 1140 }, { "epoch": 71.3125, "grad_norm": 2.1018467475347307, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 78866048, "step": 1141 }, { "epoch": 71.3125, "loss": 0.0006598670734092593, "loss_ce": 0.00020210337243042886, "loss_xval": 0.000457763671875, "num_input_tokens_seen": 78866048, "step": 1141 }, { "epoch": 71.375, "grad_norm": 1.585719690902742, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 78925056, "step": 1142 }, { "epoch": 71.375, "loss": 0.0005244898493401706, "loss_ce": 0.00018688915588427335, "loss_xval": 0.0003376007080078125, "num_input_tokens_seen": 78925056, "step": 1142 }, { "epoch": 71.4375, "grad_norm": 1.5981146071845063, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 78996608, "step": 1143 }, { "epoch": 71.4375, "loss": 0.0005193843389861286, "loss_ce": 0.00019704240548890084, "loss_xval": 0.0003223419189453125, "num_input_tokens_seen": 78996608, "step": 1143 }, { "epoch": 71.5, "grad_norm": 2.743565029475743, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 79068288, "step": 1144 }, { "epoch": 71.5, "loss": 0.0009858801495283842, "loss_ce": 0.00020005246915388852, "loss_xval": 0.00078582763671875, "num_input_tokens_seen": 79068288, "step": 1144 }, { "epoch": 71.5625, "grad_norm": 4.49463450679341, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 79127488, "step": 1145 }, { "epoch": 71.5625, "loss": 0.0016861867625266314, "loss_ce": 0.00019845488714054227, "loss_xval": 0.00148773193359375, "num_input_tokens_seen": 79127488, "step": 1145 }, { "epoch": 71.625, "grad_norm": 6.002361391410423, "learning_rate": 5e-05, "loss": 0.0028, "num_input_tokens_seen": 79199104, "step": 1146 }, { "epoch": 71.625, "loss": 0.0026474776677787304, "loss_ce": 0.00019081255595665425, "loss_xval": 0.0024566650390625, "num_input_tokens_seen": 79199104, "step": 1146 }, { "epoch": 71.6875, "grad_norm": 7.37613968419637, "learning_rate": 5e-05, "loss": 0.0039, "num_input_tokens_seen": 79270784, "step": 1147 }, { "epoch": 71.6875, "loss": 0.004187567625194788, "loss_ce": 0.00018976477440446615, "loss_xval": 0.003997802734375, "num_input_tokens_seen": 79270784, "step": 1147 }, { "epoch": 71.75, "grad_norm": 9.433800009175698, "learning_rate": 5e-05, "loss": 0.006, "num_input_tokens_seen": 79329984, "step": 1148 }, { "epoch": 71.75, "loss": 0.0059251803904771805, "loss_ce": 0.00018787590670399368, "loss_xval": 0.0057373046875, "num_input_tokens_seen": 79329984, "step": 1148 }, { "epoch": 71.8125, "grad_norm": 13.554320713358411, "learning_rate": 5e-05, "loss": 0.0121, "num_input_tokens_seen": 79401664, "step": 1149 }, { "epoch": 71.8125, "loss": 0.012463638558983803, "loss_ce": 0.00019557222549337894, "loss_xval": 0.01226806640625, "num_input_tokens_seen": 79401664, "step": 1149 }, { "epoch": 71.875, "grad_norm": 20.450152288934543, "learning_rate": 5e-05, "loss": 0.0273, "num_input_tokens_seen": 79460672, "step": 1150 }, { "epoch": 71.875, "loss": 0.02654273435473442, "loss_ce": 0.00017554643272887915, "loss_xval": 0.0263671875, "num_input_tokens_seen": 79460672, "step": 1150 }, { "epoch": 71.9375, "grad_norm": 30.27927271328012, "learning_rate": 5e-05, "loss": 0.0601, "num_input_tokens_seen": 79532416, "step": 1151 }, { "epoch": 71.9375, "loss": 0.05904359370470047, "loss_ce": 0.0002057042729575187, "loss_xval": 0.058837890625, "num_input_tokens_seen": 79532416, "step": 1151 }, { "epoch": 72.0, "grad_norm": 42.395333437272235, "learning_rate": 5e-05, "loss": 0.1191, "num_input_tokens_seen": 79603968, "step": 1152 }, { "epoch": 72.0, "loss": 0.11884753406047821, "loss_ce": 0.00019518662884365767, "loss_xval": 0.11865234375, "num_input_tokens_seen": 79603968, "step": 1152 }, { "epoch": 72.0625, "grad_norm": 51.66495567019675, "learning_rate": 5e-05, "loss": 0.1765, "num_input_tokens_seen": 79675648, "step": 1153 }, { "epoch": 72.0625, "loss": 0.17893804609775543, "loss_ce": 0.0002271079138154164, "loss_xval": 0.1787109375, "num_input_tokens_seen": 79675648, "step": 1153 }, { "epoch": 72.125, "grad_norm": 45.35518512350621, "learning_rate": 5e-05, "loss": 0.1428, "num_input_tokens_seen": 79747328, "step": 1154 }, { "epoch": 72.125, "loss": 0.1486750841140747, "loss_ce": 0.00023758277529850602, "loss_xval": 0.1484375, "num_input_tokens_seen": 79747328, "step": 1154 }, { "epoch": 72.1875, "grad_norm": 19.70199552558819, "learning_rate": 5e-05, "loss": 0.0293, "num_input_tokens_seen": 79806400, "step": 1155 }, { "epoch": 72.1875, "loss": 0.028222400695085526, "loss_ce": 0.0002682995982468128, "loss_xval": 0.0279541015625, "num_input_tokens_seen": 79806400, "step": 1155 }, { "epoch": 72.25, "grad_norm": 13.444270932502397, "learning_rate": 5e-05, "loss": 0.0167, "num_input_tokens_seen": 79878016, "step": 1156 }, { "epoch": 72.25, "loss": 0.015745338052511215, "loss_ce": 0.0003034429391846061, "loss_xval": 0.01544189453125, "num_input_tokens_seen": 79878016, "step": 1156 }, { "epoch": 72.3125, "grad_norm": 34.97534035039033, "learning_rate": 5e-05, "loss": 0.0904, "num_input_tokens_seen": 79949760, "step": 1157 }, { "epoch": 72.3125, "loss": 0.09801744669675827, "loss_ce": 0.00036119777359999716, "loss_xval": 0.09765625, "num_input_tokens_seen": 79949760, "step": 1157 }, { "epoch": 72.375, "grad_norm": 29.571068751807424, "learning_rate": 5e-05, "loss": 0.0668, "num_input_tokens_seen": 80021312, "step": 1158 }, { "epoch": 72.375, "loss": 0.06681597232818604, "loss_ce": 0.0004097215423826128, "loss_xval": 0.06640625, "num_input_tokens_seen": 80021312, "step": 1158 }, { "epoch": 72.4375, "grad_norm": 3.12352961523283, "learning_rate": 5e-05, "loss": 0.009, "num_input_tokens_seen": 80092992, "step": 1159 }, { "epoch": 72.4375, "loss": 0.00865169707685709, "loss_ce": 0.00047298570279963315, "loss_xval": 0.0081787109375, "num_input_tokens_seen": 80092992, "step": 1159 }, { "epoch": 72.5, "grad_norm": 22.360989695703942, "learning_rate": 5e-05, "loss": 0.0462, "num_input_tokens_seen": 80152000, "step": 1160 }, { "epoch": 72.5, "loss": 0.04107421636581421, "loss_ce": 0.0005468718591146171, "loss_xval": 0.04052734375, "num_input_tokens_seen": 80152000, "step": 1160 }, { "epoch": 72.5625, "grad_norm": 27.510295556022815, "learning_rate": 5e-05, "loss": 0.0605, "num_input_tokens_seen": 80211200, "step": 1161 }, { "epoch": 72.5625, "loss": 0.062126122415065765, "loss_ce": 0.0006026857881806791, "loss_xval": 0.0615234375, "num_input_tokens_seen": 80211200, "step": 1161 }, { "epoch": 72.625, "grad_norm": 10.877303396663827, "learning_rate": 5e-05, "loss": 0.013, "num_input_tokens_seen": 80282816, "step": 1162 }, { "epoch": 72.625, "loss": 0.012666767463088036, "loss_ce": 0.0007649122853763402, "loss_xval": 0.01190185546875, "num_input_tokens_seen": 80282816, "step": 1162 }, { "epoch": 72.6875, "grad_norm": 11.656950051709288, "learning_rate": 5e-05, "loss": 0.0151, "num_input_tokens_seen": 80354560, "step": 1163 }, { "epoch": 72.6875, "loss": 0.01623488776385784, "loss_ce": 0.000609888113103807, "loss_xval": 0.015625, "num_input_tokens_seen": 80354560, "step": 1163 }, { "epoch": 72.75, "grad_norm": 20.38091104817712, "learning_rate": 5e-05, "loss": 0.0376, "num_input_tokens_seen": 80426112, "step": 1164 }, { "epoch": 72.75, "loss": 0.041903380304574966, "loss_ce": 0.0006436131079681218, "loss_xval": 0.041259765625, "num_input_tokens_seen": 80426112, "step": 1164 }, { "epoch": 72.8125, "grad_norm": 11.945638795307774, "learning_rate": 5e-05, "loss": 0.0157, "num_input_tokens_seen": 80485184, "step": 1165 }, { "epoch": 72.8125, "loss": 0.01640426367521286, "loss_ce": 0.0006571935955435038, "loss_xval": 0.0157470703125, "num_input_tokens_seen": 80485184, "step": 1165 }, { "epoch": 72.875, "grad_norm": 3.9872152859369683, "learning_rate": 5e-05, "loss": 0.0044, "num_input_tokens_seen": 80544256, "step": 1166 }, { "epoch": 72.875, "loss": 0.004912215750664473, "loss_ce": 0.0007007899112068117, "loss_xval": 0.00421142578125, "num_input_tokens_seen": 80544256, "step": 1166 }, { "epoch": 72.9375, "grad_norm": 13.697934139156507, "learning_rate": 5e-05, "loss": 0.0184, "num_input_tokens_seen": 80615936, "step": 1167 }, { "epoch": 72.9375, "loss": 0.0189347043633461, "loss_ce": 0.0007462279172614217, "loss_xval": 0.0181884765625, "num_input_tokens_seen": 80615936, "step": 1167 }, { "epoch": 73.0, "grad_norm": 12.216265019337566, "learning_rate": 5e-05, "loss": 0.0165, "num_input_tokens_seen": 80687616, "step": 1168 }, { "epoch": 73.0, "loss": 0.01614932343363762, "loss_ce": 0.0007684631855227053, "loss_xval": 0.015380859375, "num_input_tokens_seen": 80687616, "step": 1168 }, { "epoch": 73.0625, "grad_norm": 2.0683251647835994, "learning_rate": 5e-05, "loss": 0.0037, "num_input_tokens_seen": 80759296, "step": 1169 }, { "epoch": 73.0625, "loss": 0.0035584524739533663, "loss_ce": 0.0007966115954332054, "loss_xval": 0.0027618408203125, "num_input_tokens_seen": 80759296, "step": 1169 }, { "epoch": 73.125, "grad_norm": 8.472099059482757, "learning_rate": 5e-05, "loss": 0.0093, "num_input_tokens_seen": 80830912, "step": 1170 }, { "epoch": 73.125, "loss": 0.00800013355910778, "loss_ce": 0.000797984772361815, "loss_xval": 0.0072021484375, "num_input_tokens_seen": 80830912, "step": 1170 }, { "epoch": 73.1875, "grad_norm": 10.998131460264046, "learning_rate": 5e-05, "loss": 0.0119, "num_input_tokens_seen": 80902656, "step": 1171 }, { "epoch": 73.1875, "loss": 0.0125557417050004, "loss_ce": 0.0008369915885850787, "loss_xval": 0.01171875, "num_input_tokens_seen": 80902656, "step": 1171 }, { "epoch": 73.25, "grad_norm": 4.3344978035814625, "learning_rate": 5e-05, "loss": 0.004, "num_input_tokens_seen": 80961792, "step": 1172 }, { "epoch": 73.25, "loss": 0.003933137748390436, "loss_ce": 0.0008356033940799534, "loss_xval": 0.0030975341796875, "num_input_tokens_seen": 80961792, "step": 1172 }, { "epoch": 73.3125, "grad_norm": 6.32022394282333, "learning_rate": 5e-05, "loss": 0.007, "num_input_tokens_seen": 81033344, "step": 1173 }, { "epoch": 73.3125, "loss": 0.006512059364467859, "loss_ce": 0.0008052723133005202, "loss_xval": 0.005706787109375, "num_input_tokens_seen": 81033344, "step": 1173 }, { "epoch": 73.375, "grad_norm": 13.37202980190419, "learning_rate": 5e-05, "loss": 0.0169, "num_input_tokens_seen": 81105024, "step": 1174 }, { "epoch": 73.375, "loss": 0.018025677651166916, "loss_ce": 0.0008137636468745768, "loss_xval": 0.0172119140625, "num_input_tokens_seen": 81105024, "step": 1174 }, { "epoch": 73.4375, "grad_norm": 10.779752378641776, "learning_rate": 5e-05, "loss": 0.0118, "num_input_tokens_seen": 81176640, "step": 1175 }, { "epoch": 73.4375, "loss": 0.01216103881597519, "loss_ce": 0.0008085001609288156, "loss_xval": 0.0113525390625, "num_input_tokens_seen": 81176640, "step": 1175 }, { "epoch": 73.5, "grad_norm": 0.7641412974344073, "learning_rate": 5e-05, "loss": 0.0022, "num_input_tokens_seen": 81235904, "step": 1176 }, { "epoch": 73.5, "loss": 0.0018806563457474113, "loss_ce": 0.0007896529277786613, "loss_xval": 0.00109100341796875, "num_input_tokens_seen": 81235904, "step": 1176 }, { "epoch": 73.5625, "grad_norm": 9.594310910818406, "learning_rate": 5e-05, "loss": 0.0093, "num_input_tokens_seen": 81307520, "step": 1177 }, { "epoch": 73.5625, "loss": 0.009962813928723335, "loss_ce": 0.0008075400837697089, "loss_xval": 0.0091552734375, "num_input_tokens_seen": 81307520, "step": 1177 }, { "epoch": 73.625, "grad_norm": 13.113512591536526, "learning_rate": 5e-05, "loss": 0.0151, "num_input_tokens_seen": 81379264, "step": 1178 }, { "epoch": 73.625, "loss": 0.015093508176505566, "loss_ce": 0.0008112816140055656, "loss_xval": 0.0142822265625, "num_input_tokens_seen": 81379264, "step": 1178 }, { "epoch": 73.6875, "grad_norm": 8.21336511687051, "learning_rate": 5e-05, "loss": 0.007, "num_input_tokens_seen": 81438464, "step": 1179 }, { "epoch": 73.6875, "loss": 0.006963968276977539, "loss_ce": 0.0007383823394775391, "loss_xval": 0.0062255859375, "num_input_tokens_seen": 81438464, "step": 1179 }, { "epoch": 73.75, "grad_norm": 1.3305949124151968, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 81497600, "step": 1180 }, { "epoch": 73.75, "loss": 0.0021093811374157667, "loss_ce": 0.0006826843600720167, "loss_xval": 0.00142669677734375, "num_input_tokens_seen": 81497600, "step": 1180 }, { "epoch": 73.8125, "grad_norm": 10.757753909600044, "learning_rate": 5e-05, "loss": 0.0106, "num_input_tokens_seen": 81569280, "step": 1181 }, { "epoch": 73.8125, "loss": 0.010827873833477497, "loss_ce": 0.0006960381288081408, "loss_xval": 0.0101318359375, "num_input_tokens_seen": 81569280, "step": 1181 }, { "epoch": 73.875, "grad_norm": 14.785007365776059, "learning_rate": 5e-05, "loss": 0.0185, "num_input_tokens_seen": 81640832, "step": 1182 }, { "epoch": 73.875, "loss": 0.01923806220293045, "loss_ce": 0.0006833748193457723, "loss_xval": 0.0185546875, "num_input_tokens_seen": 81640832, "step": 1182 }, { "epoch": 73.9375, "grad_norm": 12.13125384920187, "learning_rate": 5e-05, "loss": 0.0128, "num_input_tokens_seen": 81712640, "step": 1183 }, { "epoch": 73.9375, "loss": 0.012981466948986053, "loss_ce": 0.0006523649790324271, "loss_xval": 0.0123291015625, "num_input_tokens_seen": 81712640, "step": 1183 }, { "epoch": 74.0, "grad_norm": 5.163790877656547, "learning_rate": 5e-05, "loss": 0.0033, "num_input_tokens_seen": 81784320, "step": 1184 }, { "epoch": 74.0, "loss": 0.003778281854465604, "loss_ce": 0.0006197125767357647, "loss_xval": 0.0031585693359375, "num_input_tokens_seen": 81784320, "step": 1184 }, { "epoch": 74.0625, "grad_norm": 2.7271002962089423, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 81855872, "step": 1185 }, { "epoch": 74.0625, "loss": 0.0015725651755928993, "loss_ce": 0.0005960026173852384, "loss_xval": 0.0009765625, "num_input_tokens_seen": 81855872, "step": 1185 }, { "epoch": 74.125, "grad_norm": 7.825771598203474, "learning_rate": 5e-05, "loss": 0.0063, "num_input_tokens_seen": 81927488, "step": 1186 }, { "epoch": 74.125, "loss": 0.006267584394663572, "loss_ce": 0.0005913149216212332, "loss_xval": 0.00567626953125, "num_input_tokens_seen": 81927488, "step": 1186 }, { "epoch": 74.1875, "grad_norm": 8.782250449852858, "learning_rate": 5e-05, "loss": 0.0072, "num_input_tokens_seen": 81986496, "step": 1187 }, { "epoch": 74.1875, "loss": 0.006883226800709963, "loss_ce": 0.0005355706671252847, "loss_xval": 0.00634765625, "num_input_tokens_seen": 81986496, "step": 1187 }, { "epoch": 74.25, "grad_norm": 6.2170072955900055, "learning_rate": 5e-05, "loss": 0.0041, "num_input_tokens_seen": 82045760, "step": 1188 }, { "epoch": 74.25, "loss": 0.004353879485279322, "loss_ce": 0.0005239234305918217, "loss_xval": 0.0038299560546875, "num_input_tokens_seen": 82045760, "step": 1188 }, { "epoch": 74.3125, "grad_norm": 1.7663809319742363, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 82117312, "step": 1189 }, { "epoch": 74.3125, "loss": 0.0013176639331504703, "loss_ce": 0.0005203922046348453, "loss_xval": 0.000797271728515625, "num_input_tokens_seen": 82117312, "step": 1189 }, { "epoch": 74.375, "grad_norm": 2.6607146417355954, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 82188992, "step": 1190 }, { "epoch": 74.375, "loss": 0.0011832985328510404, "loss_ce": 0.0004852089623454958, "loss_xval": 0.000698089599609375, "num_input_tokens_seen": 82188992, "step": 1190 }, { "epoch": 74.4375, "grad_norm": 4.697353229448369, "learning_rate": 5e-05, "loss": 0.0031, "num_input_tokens_seen": 82248128, "step": 1191 }, { "epoch": 74.4375, "loss": 0.003033492248505354, "loss_ce": 0.00045475686783902347, "loss_xval": 0.0025787353515625, "num_input_tokens_seen": 82248128, "step": 1191 }, { "epoch": 74.5, "grad_norm": 3.393077012844604, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 82319680, "step": 1192 }, { "epoch": 74.5, "loss": 0.001984121510758996, "loss_ce": 0.0004429837572388351, "loss_xval": 0.0015411376953125, "num_input_tokens_seen": 82319680, "step": 1192 }, { "epoch": 74.5625, "grad_norm": 0.958210115408945, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 82378688, "step": 1193 }, { "epoch": 74.5625, "loss": 0.001094202627427876, "loss_ce": 0.00040374239324592054, "loss_xval": 0.000690460205078125, "num_input_tokens_seen": 82378688, "step": 1193 }, { "epoch": 74.625, "grad_norm": 1.133347023469092, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 82450304, "step": 1194 }, { "epoch": 74.625, "loss": 0.0007766564376652241, "loss_ce": 0.0004180748655926436, "loss_xval": 0.00035858154296875, "num_input_tokens_seen": 82450304, "step": 1194 }, { "epoch": 74.6875, "grad_norm": 2.3412767010384474, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 82509312, "step": 1195 }, { "epoch": 74.6875, "loss": 0.0011598513228818774, "loss_ce": 0.0003625796234700829, "loss_xval": 0.000797271728515625, "num_input_tokens_seen": 82509312, "step": 1195 }, { "epoch": 74.75, "grad_norm": 3.5261338234594457, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 82568512, "step": 1196 }, { "epoch": 74.75, "loss": 0.0019028933020308614, "loss_ce": 0.00034649684675969183, "loss_xval": 0.001556396484375, "num_input_tokens_seen": 82568512, "step": 1196 }, { "epoch": 74.8125, "grad_norm": 5.722321680343235, "learning_rate": 5e-05, "loss": 0.0032, "num_input_tokens_seen": 82627584, "step": 1197 }, { "epoch": 74.8125, "loss": 0.0031568289268761873, "loss_ce": 0.00034921171027235687, "loss_xval": 0.0028076171875, "num_input_tokens_seen": 82627584, "step": 1197 }, { "epoch": 74.875, "grad_norm": 8.312268126186806, "learning_rate": 5e-05, "loss": 0.0061, "num_input_tokens_seen": 82699264, "step": 1198 }, { "epoch": 74.875, "loss": 0.00578534509986639, "loss_ce": 0.0003226986445952207, "loss_xval": 0.005462646484375, "num_input_tokens_seen": 82699264, "step": 1198 }, { "epoch": 74.9375, "grad_norm": 11.608810372977517, "learning_rate": 5e-05, "loss": 0.0112, "num_input_tokens_seen": 82770944, "step": 1199 }, { "epoch": 74.9375, "loss": 0.011615313589572906, "loss_ce": 0.00032380997436121106, "loss_xval": 0.01129150390625, "num_input_tokens_seen": 82770944, "step": 1199 }, { "epoch": 75.0, "grad_norm": 15.768164554755671, "learning_rate": 5e-05, "loss": 0.0196, "num_input_tokens_seen": 82817408, "step": 1200 }, { "epoch": 75.0, "loss": 0.019836699590086937, "loss_ce": 0.00030544938636012375, "loss_xval": 0.01953125, "num_input_tokens_seen": 82817408, "step": 1200 }, { "epoch": 75.0625, "grad_norm": 20.13083247408963, "learning_rate": 5e-05, "loss": 0.0319, "num_input_tokens_seen": 82889024, "step": 1201 }, { "epoch": 75.0625, "loss": 0.03227750584483147, "loss_ce": 0.0002950829511974007, "loss_xval": 0.031982421875, "num_input_tokens_seen": 82889024, "step": 1201 }, { "epoch": 75.125, "grad_norm": 23.608974934533943, "learning_rate": 5e-05, "loss": 0.0441, "num_input_tokens_seen": 82960640, "step": 1202 }, { "epoch": 75.125, "loss": 0.04472793638706207, "loss_ce": 0.0002943412109743804, "loss_xval": 0.04443359375, "num_input_tokens_seen": 82960640, "step": 1202 }, { "epoch": 75.1875, "grad_norm": 25.185805677940856, "learning_rate": 5e-05, "loss": 0.0503, "num_input_tokens_seen": 83032320, "step": 1203 }, { "epoch": 75.1875, "loss": 0.050356145948171616, "loss_ce": 0.0003073193074669689, "loss_xval": 0.050048828125, "num_input_tokens_seen": 83032320, "step": 1203 }, { "epoch": 75.25, "grad_norm": 24.398977418417044, "learning_rate": 5e-05, "loss": 0.0472, "num_input_tokens_seen": 83103936, "step": 1204 }, { "epoch": 75.25, "loss": 0.04962233826518059, "loss_ce": 0.0003059330047108233, "loss_xval": 0.04931640625, "num_input_tokens_seen": 83103936, "step": 1204 }, { "epoch": 75.3125, "grad_norm": 19.919419001296145, "learning_rate": 5e-05, "loss": 0.0321, "num_input_tokens_seen": 83175552, "step": 1205 }, { "epoch": 75.3125, "loss": 0.03275446221232414, "loss_ce": 0.0002837601350620389, "loss_xval": 0.032470703125, "num_input_tokens_seen": 83175552, "step": 1205 }, { "epoch": 75.375, "grad_norm": 11.768760718587416, "learning_rate": 5e-05, "loss": 0.0115, "num_input_tokens_seen": 83247104, "step": 1206 }, { "epoch": 75.375, "loss": 0.012369153089821339, "loss_ce": 0.0002841917157638818, "loss_xval": 0.0120849609375, "num_input_tokens_seen": 83247104, "step": 1206 }, { "epoch": 75.4375, "grad_norm": 1.8368035999056054, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 83318656, "step": 1207 }, { "epoch": 75.4375, "loss": 0.001351103070192039, "loss_ce": 0.00029061720124445856, "loss_xval": 0.00106048583984375, "num_input_tokens_seen": 83318656, "step": 1207 }, { "epoch": 75.5, "grad_norm": 7.80666118483688, "learning_rate": 5e-05, "loss": 0.006, "num_input_tokens_seen": 83390336, "step": 1208 }, { "epoch": 75.5, "loss": 0.005994611419737339, "loss_ce": 0.0002878241066355258, "loss_xval": 0.005706787109375, "num_input_tokens_seen": 83390336, "step": 1208 }, { "epoch": 75.5625, "grad_norm": 15.776520230822296, "learning_rate": 5e-05, "loss": 0.0218, "num_input_tokens_seen": 83461952, "step": 1209 }, { "epoch": 75.5625, "loss": 0.01836695894598961, "loss_ce": 0.0003005519974976778, "loss_xval": 0.01806640625, "num_input_tokens_seen": 83461952, "step": 1209 }, { "epoch": 75.625, "grad_norm": 20.701223909897063, "learning_rate": 5e-05, "loss": 0.0364, "num_input_tokens_seen": 83533504, "step": 1210 }, { "epoch": 75.625, "loss": 0.03570494055747986, "loss_ce": 0.00030455051455646753, "loss_xval": 0.035400390625, "num_input_tokens_seen": 83533504, "step": 1210 }, { "epoch": 75.6875, "grad_norm": 21.527798508931795, "learning_rate": 5e-05, "loss": 0.0386, "num_input_tokens_seen": 83605248, "step": 1211 }, { "epoch": 75.6875, "loss": 0.03861980512738228, "loss_ce": 0.0002897270314861089, "loss_xval": 0.038330078125, "num_input_tokens_seen": 83605248, "step": 1211 }, { "epoch": 75.75, "grad_norm": 17.81662722607838, "learning_rate": 5e-05, "loss": 0.0267, "num_input_tokens_seen": 83676800, "step": 1212 }, { "epoch": 75.75, "loss": 0.0250775758177042, "loss_ce": 0.0002973016817122698, "loss_xval": 0.0247802734375, "num_input_tokens_seen": 83676800, "step": 1212 }, { "epoch": 75.8125, "grad_norm": 10.896037597453502, "learning_rate": 5e-05, "loss": 0.0107, "num_input_tokens_seen": 83748544, "step": 1213 }, { "epoch": 75.8125, "loss": 0.011996936053037643, "loss_ce": 0.0002781857911031693, "loss_xval": 0.01171875, "num_input_tokens_seen": 83748544, "step": 1213 }, { "epoch": 75.875, "grad_norm": 2.36863658926043, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 83820352, "step": 1214 }, { "epoch": 75.875, "loss": 0.0016460572369396687, "loss_ce": 0.0002803955867420882, "loss_xval": 0.00136566162109375, "num_input_tokens_seen": 83820352, "step": 1214 }, { "epoch": 75.9375, "grad_norm": 5.758967969538332, "learning_rate": 5e-05, "loss": 0.0037, "num_input_tokens_seen": 83879424, "step": 1215 }, { "epoch": 75.9375, "loss": 0.0028607631102204323, "loss_ce": 0.0002820278750732541, "loss_xval": 0.0025787353515625, "num_input_tokens_seen": 83879424, "step": 1215 }, { "epoch": 76.0, "grad_norm": 11.882970539678618, "learning_rate": 5e-05, "loss": 0.0123, "num_input_tokens_seen": 83951040, "step": 1216 }, { "epoch": 76.0, "loss": 0.012303365394473076, "loss_ce": 0.000279439176665619, "loss_xval": 0.01202392578125, "num_input_tokens_seen": 83951040, "step": 1216 }, { "epoch": 76.0625, "grad_norm": 15.547437781822687, "learning_rate": 5e-05, "loss": 0.0205, "num_input_tokens_seen": 84022656, "step": 1217 }, { "epoch": 76.0625, "loss": 0.0207960307598114, "loss_ce": 0.0002882190456148237, "loss_xval": 0.0205078125, "num_input_tokens_seen": 84022656, "step": 1217 }, { "epoch": 76.125, "grad_norm": 16.543148649304666, "learning_rate": 5e-05, "loss": 0.023, "num_input_tokens_seen": 84094400, "step": 1218 }, { "epoch": 76.125, "loss": 0.02309388294816017, "loss_ce": 0.0002667343069333583, "loss_xval": 0.0228271484375, "num_input_tokens_seen": 84094400, "step": 1218 }, { "epoch": 76.1875, "grad_norm": 14.45577440531413, "learning_rate": 5e-05, "loss": 0.0176, "num_input_tokens_seen": 84165952, "step": 1219 }, { "epoch": 76.1875, "loss": 0.019056590273976326, "loss_ce": 0.00025776130496524274, "loss_xval": 0.018798828125, "num_input_tokens_seen": 84165952, "step": 1219 }, { "epoch": 76.25, "grad_norm": 9.135506928731544, "learning_rate": 5e-05, "loss": 0.0081, "num_input_tokens_seen": 84237568, "step": 1220 }, { "epoch": 76.25, "loss": 0.008334951475262642, "loss_ce": 0.00027831082115881145, "loss_xval": 0.008056640625, "num_input_tokens_seen": 84237568, "step": 1220 }, { "epoch": 76.3125, "grad_norm": 1.4668907368775246, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 84309120, "step": 1221 }, { "epoch": 76.3125, "loss": 0.0018034669337794185, "loss_ce": 0.0002699586621019989, "loss_xval": 0.00153350830078125, "num_input_tokens_seen": 84309120, "step": 1221 }, { "epoch": 76.375, "grad_norm": 7.317495664094737, "learning_rate": 5e-05, "loss": 0.0056, "num_input_tokens_seen": 84380864, "step": 1222 }, { "epoch": 76.375, "loss": 0.003733080578967929, "loss_ce": 0.0002845943090505898, "loss_xval": 0.003448486328125, "num_input_tokens_seen": 84380864, "step": 1222 }, { "epoch": 76.4375, "grad_norm": 14.681200730949367, "learning_rate": 5e-05, "loss": 0.0187, "num_input_tokens_seen": 84452416, "step": 1223 }, { "epoch": 76.4375, "loss": 0.01689068041741848, "loss_ce": 0.000289117539068684, "loss_xval": 0.0166015625, "num_input_tokens_seen": 84452416, "step": 1223 }, { "epoch": 76.5, "grad_norm": 18.602396130334732, "learning_rate": 5e-05, "loss": 0.0293, "num_input_tokens_seen": 84523968, "step": 1224 }, { "epoch": 76.5, "loss": 0.02823912911117077, "loss_ce": 0.0002850280434358865, "loss_xval": 0.0279541015625, "num_input_tokens_seen": 84523968, "step": 1224 }, { "epoch": 76.5625, "grad_norm": 19.009596458273673, "learning_rate": 5e-05, "loss": 0.0308, "num_input_tokens_seen": 84595648, "step": 1225 }, { "epoch": 76.5625, "loss": 0.03032798133790493, "loss_ce": 0.0002986853360198438, "loss_xval": 0.030029296875, "num_input_tokens_seen": 84595648, "step": 1225 }, { "epoch": 76.625, "grad_norm": 17.267936941710204, "learning_rate": 5e-05, "loss": 0.0257, "num_input_tokens_seen": 84667392, "step": 1226 }, { "epoch": 76.625, "loss": 0.02483111433684826, "loss_ce": 0.00029498201911337674, "loss_xval": 0.0245361328125, "num_input_tokens_seen": 84667392, "step": 1226 }, { "epoch": 76.6875, "grad_norm": 14.417852806473686, "learning_rate": 5e-05, "loss": 0.0183, "num_input_tokens_seen": 84739008, "step": 1227 }, { "epoch": 76.6875, "loss": 0.016663750633597374, "loss_ce": 0.0003063289914280176, "loss_xval": 0.016357421875, "num_input_tokens_seen": 84739008, "step": 1227 }, { "epoch": 76.75, "grad_norm": 10.706297841004377, "learning_rate": 5e-05, "loss": 0.0105, "num_input_tokens_seen": 84810688, "step": 1228 }, { "epoch": 76.75, "loss": 0.010646898299455643, "loss_ce": 0.0003319565439596772, "loss_xval": 0.01031494140625, "num_input_tokens_seen": 84810688, "step": 1228 }, { "epoch": 76.8125, "grad_norm": 6.725381663583949, "learning_rate": 5e-05, "loss": 0.0047, "num_input_tokens_seen": 84882368, "step": 1229 }, { "epoch": 76.8125, "loss": 0.004713365808129311, "loss_ce": 0.0003188344999216497, "loss_xval": 0.00439453125, "num_input_tokens_seen": 84882368, "step": 1229 }, { "epoch": 76.875, "grad_norm": 3.2416371982539625, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 84953920, "step": 1230 }, { "epoch": 76.875, "loss": 0.0016462351195514202, "loss_ce": 0.00034160862560383976, "loss_xval": 0.00130462646484375, "num_input_tokens_seen": 84953920, "step": 1230 }, { "epoch": 76.9375, "grad_norm": 0.8319526045639284, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 85025536, "step": 1231 }, { "epoch": 76.9375, "loss": 0.0012074424885213375, "loss_ce": 0.0003300620592199266, "loss_xval": 0.00087738037109375, "num_input_tokens_seen": 85025536, "step": 1231 }, { "epoch": 77.0, "grad_norm": 0.4998901065208127, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 85097088, "step": 1232 }, { "epoch": 77.0, "loss": 0.0010170785244554281, "loss_ce": 0.00031898898305371404, "loss_xval": 0.000698089599609375, "num_input_tokens_seen": 85097088, "step": 1232 }, { "epoch": 77.0625, "grad_norm": 1.5322908679654579, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 85168640, "step": 1233 }, { "epoch": 77.0625, "loss": 0.0006979235913604498, "loss_ce": 0.00031454648706130683, "loss_xval": 0.0003833770751953125, "num_input_tokens_seen": 85168640, "step": 1233 }, { "epoch": 77.125, "grad_norm": 2.774667522264961, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 85227712, "step": 1234 }, { "epoch": 77.125, "loss": 0.0014237510040402412, "loss_ce": 0.0003174888261128217, "loss_xval": 0.00110626220703125, "num_input_tokens_seen": 85227712, "step": 1234 }, { "epoch": 77.1875, "grad_norm": 4.7024190141487265, "learning_rate": 5e-05, "loss": 0.0026, "num_input_tokens_seen": 85299264, "step": 1235 }, { "epoch": 77.1875, "loss": 0.002505771117284894, "loss_ce": 0.00030850546318106353, "loss_xval": 0.002197265625, "num_input_tokens_seen": 85299264, "step": 1235 }, { "epoch": 77.25, "grad_norm": 7.645943356947957, "learning_rate": 5e-05, "loss": 0.0056, "num_input_tokens_seen": 85370944, "step": 1236 }, { "epoch": 77.25, "loss": 0.005701807327568531, "loss_ce": 0.0003001959703397006, "loss_xval": 0.005401611328125, "num_input_tokens_seen": 85370944, "step": 1236 }, { "epoch": 77.3125, "grad_norm": 11.666851267281478, "learning_rate": 5e-05, "loss": 0.0124, "num_input_tokens_seen": 85442560, "step": 1237 }, { "epoch": 77.3125, "loss": 0.012451760470867157, "loss_ce": 0.0003057646390516311, "loss_xval": 0.01214599609375, "num_input_tokens_seen": 85442560, "step": 1237 }, { "epoch": 77.375, "grad_norm": 17.347565722362265, "learning_rate": 5e-05, "loss": 0.0265, "num_input_tokens_seen": 85501696, "step": 1238 }, { "epoch": 77.375, "loss": 0.02691800892353058, "loss_ce": 0.00030668015824630857, "loss_xval": 0.026611328125, "num_input_tokens_seen": 85501696, "step": 1238 }, { "epoch": 77.4375, "grad_norm": 25.38116280556335, "learning_rate": 5e-05, "loss": 0.0555, "num_input_tokens_seen": 85573376, "step": 1239 }, { "epoch": 77.4375, "loss": 0.0549708791077137, "loss_ce": 0.0002833788748830557, "loss_xval": 0.0546875, "num_input_tokens_seen": 85573376, "step": 1239 }, { "epoch": 77.5, "grad_norm": 35.527800123958905, "learning_rate": 5e-05, "loss": 0.1089, "num_input_tokens_seen": 85644928, "step": 1240 }, { "epoch": 77.5, "loss": 0.10820645838975906, "loss_ce": 0.0002962992584798485, "loss_xval": 0.10791015625, "num_input_tokens_seen": 85644928, "step": 1240 }, { "epoch": 77.5625, "grad_norm": 43.15199387931728, "learning_rate": 5e-05, "loss": 0.1634, "num_input_tokens_seen": 85716544, "step": 1241 }, { "epoch": 77.5625, "loss": 0.16338858008384705, "loss_ce": 0.00030264799715951085, "loss_xval": 0.1630859375, "num_input_tokens_seen": 85716544, "step": 1241 }, { "epoch": 77.625, "grad_norm": 40.2416273483037, "learning_rate": 5e-05, "loss": 0.1444, "num_input_tokens_seen": 85788160, "step": 1242 }, { "epoch": 77.625, "loss": 0.14094926416873932, "loss_ce": 0.0003242639359086752, "loss_xval": 0.140625, "num_input_tokens_seen": 85788160, "step": 1242 }, { "epoch": 77.6875, "grad_norm": 20.986646115749792, "learning_rate": 5e-05, "loss": 0.042, "num_input_tokens_seen": 85847232, "step": 1243 }, { "epoch": 77.6875, "loss": 0.03987127169966698, "loss_ce": 0.0003204921376891434, "loss_xval": 0.03955078125, "num_input_tokens_seen": 85847232, "step": 1243 }, { "epoch": 77.75, "grad_norm": 7.662557391421135, "learning_rate": 5e-05, "loss": 0.0077, "num_input_tokens_seen": 85918848, "step": 1244 }, { "epoch": 77.75, "loss": 0.008253362961113453, "loss_ce": 0.00037982812500558794, "loss_xval": 0.00787353515625, "num_input_tokens_seen": 85918848, "step": 1244 }, { "epoch": 77.8125, "grad_norm": 28.73130524451353, "learning_rate": 5e-05, "loss": 0.0761, "num_input_tokens_seen": 85990528, "step": 1245 }, { "epoch": 77.8125, "loss": 0.07705868035554886, "loss_ce": 0.0003985270159319043, "loss_xval": 0.07666015625, "num_input_tokens_seen": 85990528, "step": 1245 }, { "epoch": 77.875, "grad_norm": 30.854670471636584, "learning_rate": 5e-05, "loss": 0.0883, "num_input_tokens_seen": 86062272, "step": 1246 }, { "epoch": 77.875, "loss": 0.08785340934991837, "loss_ce": 0.0004510624276008457, "loss_xval": 0.08740234375, "num_input_tokens_seen": 86062272, "step": 1246 }, { "epoch": 77.9375, "grad_norm": 14.028656578933234, "learning_rate": 5e-05, "loss": 0.0191, "num_input_tokens_seen": 86121344, "step": 1247 }, { "epoch": 77.9375, "loss": 0.019599292427301407, "loss_ce": 0.0004342528118286282, "loss_xval": 0.0191650390625, "num_input_tokens_seen": 86121344, "step": 1247 }, { "epoch": 78.0, "grad_norm": 10.269793907601676, "learning_rate": 5e-05, "loss": 0.0109, "num_input_tokens_seen": 86192960, "step": 1248 }, { "epoch": 78.0, "loss": 0.01089974120259285, "loss_ce": 0.0004627294256351888, "loss_xval": 0.01043701171875, "num_input_tokens_seen": 86192960, "step": 1248 }, { "epoch": 78.0625, "grad_norm": 26.15659721116164, "learning_rate": 5e-05, "loss": 0.0637, "num_input_tokens_seen": 86264512, "step": 1249 }, { "epoch": 78.0625, "loss": 0.06197069212794304, "loss_ce": 0.00044725643238052726, "loss_xval": 0.0615234375, "num_input_tokens_seen": 86264512, "step": 1249 }, { "epoch": 78.125, "grad_norm": 23.947367489720282, "learning_rate": 5e-05, "loss": 0.0541, "num_input_tokens_seen": 86323712, "step": 1250 }, { "epoch": 78.125, "eval_synth_IoU": 0.15380492992699146, "eval_synth_MAE_x": 0.037353515625, "eval_synth_MAE_y": 0.0395660400390625, "eval_synth_NUM_probability": 0.993680864572525, "eval_synth_inside_bbox": 0.3125, "eval_synth_loss": 0.003601398319005966, "eval_synth_loss_ce": 0.000526752628502436, "eval_synth_loss_xval": 0.00307464599609375, "eval_synth_runtime": 58.4846, "eval_synth_samples_per_second": 2.189, "eval_synth_steps_per_second": 0.068, "num_input_tokens_seen": 86323712, "step": 1250 }, { "epoch": 78.125, "loss": 0.0037252199836075306, "loss_ce": 0.0005208743968978524, "loss_xval": 0.003204345703125, "num_input_tokens_seen": 86323712, "step": 1250 }, { "epoch": 78.1875, "grad_norm": 5.597444260463171, "learning_rate": 5e-05, "loss": 0.0041, "num_input_tokens_seen": 86395328, "step": 1251 }, { "epoch": 78.1875, "loss": 0.004415246658027172, "loss_ce": 0.0005700316396541893, "loss_xval": 0.00384521484375, "num_input_tokens_seen": 86395328, "step": 1251 }, { "epoch": 78.25, "grad_norm": 14.920542235110513, "learning_rate": 5e-05, "loss": 0.0223, "num_input_tokens_seen": 86467072, "step": 1252 }, { "epoch": 78.25, "loss": 0.022936955094337463, "loss_ce": 0.0005980880814604461, "loss_xval": 0.0223388671875, "num_input_tokens_seen": 86467072, "step": 1252 }, { "epoch": 78.3125, "grad_norm": 21.22270271731356, "learning_rate": 5e-05, "loss": 0.0454, "num_input_tokens_seen": 86538688, "step": 1253 }, { "epoch": 78.3125, "loss": 0.04360269755125046, "loss_ce": 0.0006339486571960151, "loss_xval": 0.04296875, "num_input_tokens_seen": 86538688, "step": 1253 }, { "epoch": 78.375, "grad_norm": 9.998335500552542, "learning_rate": 5e-05, "loss": 0.0112, "num_input_tokens_seen": 86610432, "step": 1254 }, { "epoch": 78.375, "loss": 0.010832501575350761, "loss_ce": 0.0007006655796431005, "loss_xval": 0.0101318359375, "num_input_tokens_seen": 86610432, "step": 1254 }, { "epoch": 78.4375, "grad_norm": 7.9508214231797805, "learning_rate": 5e-05, "loss": 0.0073, "num_input_tokens_seen": 86669504, "step": 1255 }, { "epoch": 78.4375, "loss": 0.00764887360855937, "loss_ce": 0.0007213835488073528, "loss_xval": 0.006927490234375, "num_input_tokens_seen": 86669504, "step": 1255 }, { "epoch": 78.5, "grad_norm": 17.42214602129036, "learning_rate": 5e-05, "loss": 0.0309, "num_input_tokens_seen": 86741184, "step": 1256 }, { "epoch": 78.5, "loss": 0.03059588000178337, "loss_ce": 0.0006886525661684573, "loss_xval": 0.0299072265625, "num_input_tokens_seen": 86741184, "step": 1256 }, { "epoch": 78.5625, "grad_norm": 12.183816197766062, "learning_rate": 5e-05, "loss": 0.0161, "num_input_tokens_seen": 86812864, "step": 1257 }, { "epoch": 78.5625, "loss": 0.015264628455042839, "loss_ce": 0.0007382616749964654, "loss_xval": 0.0145263671875, "num_input_tokens_seen": 86812864, "step": 1257 }, { "epoch": 78.625, "grad_norm": 1.3140939069078006, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 86884480, "step": 1258 }, { "epoch": 78.625, "loss": 0.0012654373422265053, "loss_ce": 0.0006741593242622912, "loss_xval": 0.000591278076171875, "num_input_tokens_seen": 86884480, "step": 1258 }, { "epoch": 78.6875, "grad_norm": 12.321355731714885, "learning_rate": 5e-05, "loss": 0.0159, "num_input_tokens_seen": 86956032, "step": 1259 }, { "epoch": 78.6875, "loss": 0.017447177320718765, "loss_ce": 0.000601475010626018, "loss_xval": 0.016845703125, "num_input_tokens_seen": 86956032, "step": 1259 }, { "epoch": 78.75, "grad_norm": 12.148726058787897, "learning_rate": 5e-05, "loss": 0.0152, "num_input_tokens_seen": 87027776, "step": 1260 }, { "epoch": 78.75, "loss": 0.015706263482570648, "loss_ce": 0.0005695446161553264, "loss_xval": 0.01513671875, "num_input_tokens_seen": 87027776, "step": 1260 }, { "epoch": 78.8125, "grad_norm": 1.4145298228038716, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 87099520, "step": 1261 }, { "epoch": 78.8125, "loss": 0.001163907814770937, "loss_ce": 0.000515409279614687, "loss_xval": 0.00064849853515625, "num_input_tokens_seen": 87099520, "step": 1261 }, { "epoch": 78.875, "grad_norm": 10.189589879843748, "learning_rate": 5e-05, "loss": 0.0109, "num_input_tokens_seen": 87171264, "step": 1262 }, { "epoch": 78.875, "loss": 0.01154272723942995, "loss_ce": 0.0004953640745952725, "loss_xval": 0.01104736328125, "num_input_tokens_seen": 87171264, "step": 1262 }, { "epoch": 78.9375, "grad_norm": 13.848987821502545, "learning_rate": 5e-05, "loss": 0.0193, "num_input_tokens_seen": 87242944, "step": 1263 }, { "epoch": 78.9375, "loss": 0.019630515947937965, "loss_ce": 0.00046547639067284763, "loss_xval": 0.0191650390625, "num_input_tokens_seen": 87242944, "step": 1263 }, { "epoch": 79.0, "grad_norm": 7.633121068960464, "learning_rate": 5e-05, "loss": 0.0067, "num_input_tokens_seen": 87314560, "step": 1264 }, { "epoch": 79.0, "loss": 0.006737522780895233, "loss_ce": 0.0004509016580414027, "loss_xval": 0.00628662109375, "num_input_tokens_seen": 87314560, "step": 1264 }, { "epoch": 79.0625, "grad_norm": 2.5095848968654546, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 87386240, "step": 1265 }, { "epoch": 79.0625, "loss": 0.0015422420110553503, "loss_ce": 0.0004664973239414394, "loss_xval": 0.00107574462890625, "num_input_tokens_seen": 87386240, "step": 1265 }, { "epoch": 79.125, "grad_norm": 8.394106185074616, "learning_rate": 5e-05, "loss": 0.0081, "num_input_tokens_seen": 87457856, "step": 1266 }, { "epoch": 79.125, "loss": 0.008274378255009651, "loss_ce": 0.000461878371424973, "loss_xval": 0.0078125, "num_input_tokens_seen": 87457856, "step": 1266 }, { "epoch": 79.1875, "grad_norm": 6.9888938015426545, "learning_rate": 5e-05, "loss": 0.0056, "num_input_tokens_seen": 87529472, "step": 1267 }, { "epoch": 79.1875, "loss": 0.005406632088124752, "loss_ce": 0.0004322669410612434, "loss_xval": 0.004974365234375, "num_input_tokens_seen": 87529472, "step": 1267 }, { "epoch": 79.25, "grad_norm": 0.6307274121771996, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 87601152, "step": 1268 }, { "epoch": 79.25, "loss": 0.0011359049240127206, "loss_ce": 0.00043400059803389013, "loss_xval": 0.000701904296875, "num_input_tokens_seen": 87601152, "step": 1268 }, { "epoch": 79.3125, "grad_norm": 6.587295085954337, "learning_rate": 5e-05, "loss": 0.0052, "num_input_tokens_seen": 87672896, "step": 1269 }, { "epoch": 79.3125, "loss": 0.005824776366353035, "loss_ce": 0.000392647460103035, "loss_xval": 0.00543212890625, "num_input_tokens_seen": 87672896, "step": 1269 }, { "epoch": 79.375, "grad_norm": 9.86644422051795, "learning_rate": 5e-05, "loss": 0.0105, "num_input_tokens_seen": 87744576, "step": 1270 }, { "epoch": 79.375, "loss": 0.010824608616530895, "loss_ce": 0.0003875968395732343, "loss_xval": 0.01043701171875, "num_input_tokens_seen": 87744576, "step": 1270 }, { "epoch": 79.4375, "grad_norm": 7.150141739852848, "learning_rate": 5e-05, "loss": 0.0058, "num_input_tokens_seen": 87816256, "step": 1271 }, { "epoch": 79.4375, "loss": 0.005546413827687502, "loss_ce": 0.0003584257501643151, "loss_xval": 0.00518798828125, "num_input_tokens_seen": 87816256, "step": 1271 }, { "epoch": 79.5, "grad_norm": 0.6032163165017892, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 87875392, "step": 1272 }, { "epoch": 79.5, "loss": 0.0005711402627639472, "loss_ce": 0.000364192936103791, "loss_xval": 0.00020694732666015625, "num_input_tokens_seen": 87875392, "step": 1272 }, { "epoch": 79.5625, "grad_norm": 5.324726389319908, "learning_rate": 5e-05, "loss": 0.0036, "num_input_tokens_seen": 87947200, "step": 1273 }, { "epoch": 79.5625, "loss": 0.003648017067462206, "loss_ce": 0.0003521185426507145, "loss_xval": 0.0032958984375, "num_input_tokens_seen": 87947200, "step": 1273 }, { "epoch": 79.625, "grad_norm": 7.363803798232587, "learning_rate": 5e-05, "loss": 0.0062, "num_input_tokens_seen": 88018944, "step": 1274 }, { "epoch": 79.625, "loss": 0.005804733373224735, "loss_ce": 0.0003420870052650571, "loss_xval": 0.005462646484375, "num_input_tokens_seen": 88018944, "step": 1274 }, { "epoch": 79.6875, "grad_norm": 5.834627935141895, "learning_rate": 5e-05, "loss": 0.004, "num_input_tokens_seen": 88090560, "step": 1275 }, { "epoch": 79.6875, "loss": 0.0035827215760946274, "loss_ce": 0.00033259938936680555, "loss_xval": 0.0032501220703125, "num_input_tokens_seen": 88090560, "step": 1275 }, { "epoch": 79.75, "grad_norm": 2.5866018005038907, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 88162240, "step": 1276 }, { "epoch": 79.75, "loss": 0.001492985524237156, "loss_ce": 0.00033331758459098637, "loss_xval": 0.00115966796875, "num_input_tokens_seen": 88162240, "step": 1276 }, { "epoch": 79.8125, "grad_norm": 1.1924939995655466, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 88233984, "step": 1277 }, { "epoch": 79.8125, "loss": 0.0008020081440918148, "loss_ce": 0.00032135628862306476, "loss_xval": 0.00048065185546875, "num_input_tokens_seen": 88233984, "step": 1277 }, { "epoch": 79.875, "grad_norm": 4.411322436249555, "learning_rate": 5e-05, "loss": 0.0025, "num_input_tokens_seen": 88305536, "step": 1278 }, { "epoch": 79.875, "loss": 0.00267414515838027, "loss_ce": 0.0002937741228379309, "loss_xval": 0.00238037109375, "num_input_tokens_seen": 88305536, "step": 1278 }, { "epoch": 79.9375, "grad_norm": 5.861055974831472, "learning_rate": 5e-05, "loss": 0.0038, "num_input_tokens_seen": 88377088, "step": 1279 }, { "epoch": 79.9375, "loss": 0.00425551924854517, "loss_ce": 0.0002882343251258135, "loss_xval": 0.00396728515625, "num_input_tokens_seen": 88377088, "step": 1279 }, { "epoch": 80.0, "grad_norm": 4.045180702609077, "learning_rate": 5e-05, "loss": 0.0023, "num_input_tokens_seen": 88448704, "step": 1280 }, { "epoch": 80.0, "loss": 0.002019032370299101, "loss_ce": 0.00027190105174668133, "loss_xval": 0.00174713134765625, "num_input_tokens_seen": 88448704, "step": 1280 }, { "epoch": 80.0625, "grad_norm": 0.6975818335879924, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 88520448, "step": 1281 }, { "epoch": 80.0625, "loss": 0.0006273279432207346, "loss_ce": 0.00026111697661690414, "loss_xval": 0.0003662109375, "num_input_tokens_seen": 88520448, "step": 1281 }, { "epoch": 80.125, "grad_norm": 6.415093584282375, "learning_rate": 5e-05, "loss": 0.0046, "num_input_tokens_seen": 88592064, "step": 1282 }, { "epoch": 80.125, "loss": 0.004382018465548754, "loss_ce": 0.0002621453022584319, "loss_xval": 0.004119873046875, "num_input_tokens_seen": 88592064, "step": 1282 }, { "epoch": 80.1875, "grad_norm": 10.984504435288581, "learning_rate": 5e-05, "loss": 0.0123, "num_input_tokens_seen": 88651200, "step": 1283 }, { "epoch": 80.1875, "loss": 0.011899075470864773, "loss_ce": 0.00024136043793987483, "loss_xval": 0.01165771484375, "num_input_tokens_seen": 88651200, "step": 1283 }, { "epoch": 80.25, "grad_norm": 12.976863346590147, "learning_rate": 5e-05, "loss": 0.0171, "num_input_tokens_seen": 88722816, "step": 1284 }, { "epoch": 80.25, "loss": 0.017806636169552803, "loss_ce": 0.00022851029643788934, "loss_xval": 0.017578125, "num_input_tokens_seen": 88722816, "step": 1284 }, { "epoch": 80.3125, "grad_norm": 12.000667507600303, "learning_rate": 5e-05, "loss": 0.0145, "num_input_tokens_seen": 88794560, "step": 1285 }, { "epoch": 80.3125, "loss": 0.01462092436850071, "loss_ce": 0.0002166273188777268, "loss_xval": 0.014404296875, "num_input_tokens_seen": 88794560, "step": 1285 }, { "epoch": 80.375, "grad_norm": 8.338492384447054, "learning_rate": 5e-05, "loss": 0.0074, "num_input_tokens_seen": 88866240, "step": 1286 }, { "epoch": 80.375, "loss": 0.007325833663344383, "loss_ce": 0.0002152378874598071, "loss_xval": 0.007110595703125, "num_input_tokens_seen": 88866240, "step": 1286 }, { "epoch": 80.4375, "grad_norm": 3.1245423806255186, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 88937984, "step": 1287 }, { "epoch": 80.4375, "loss": 0.0016321254661306739, "loss_ce": 0.00020542873244266957, "loss_xval": 0.00142669677734375, "num_input_tokens_seen": 88937984, "step": 1287 }, { "epoch": 80.5, "grad_norm": 1.778896332245402, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 89009536, "step": 1288 }, { "epoch": 80.5, "loss": 0.0011494061909615993, "loss_ce": 0.00019573181634768844, "loss_xval": 0.00095367431640625, "num_input_tokens_seen": 89009536, "step": 1288 }, { "epoch": 80.5625, "grad_norm": 5.718502417530253, "learning_rate": 5e-05, "loss": 0.0038, "num_input_tokens_seen": 89081152, "step": 1289 }, { "epoch": 80.5625, "loss": 0.0042812880128622055, "loss_ce": 0.00019193251500837505, "loss_xval": 0.00408935546875, "num_input_tokens_seen": 89081152, "step": 1289 }, { "epoch": 80.625, "grad_norm": 8.627519762670474, "learning_rate": 5e-05, "loss": 0.0077, "num_input_tokens_seen": 89152768, "step": 1290 }, { "epoch": 80.625, "loss": 0.0074982293881475925, "loss_ce": 0.00020452811440918595, "loss_xval": 0.007293701171875, "num_input_tokens_seen": 89152768, "step": 1290 }, { "epoch": 80.6875, "grad_norm": 10.07743867452318, "learning_rate": 5e-05, "loss": 0.0104, "num_input_tokens_seen": 89211968, "step": 1291 }, { "epoch": 80.6875, "loss": 0.009577931836247444, "loss_ce": 0.00017851768643595278, "loss_xval": 0.0093994140625, "num_input_tokens_seen": 89211968, "step": 1291 }, { "epoch": 80.75, "grad_norm": 10.289217040543756, "learning_rate": 5e-05, "loss": 0.0109, "num_input_tokens_seen": 89283712, "step": 1292 }, { "epoch": 80.75, "loss": 0.010684765875339508, "loss_ce": 0.0001867193350335583, "loss_xval": 0.010498046875, "num_input_tokens_seen": 89283712, "step": 1292 }, { "epoch": 80.8125, "grad_norm": 9.518871705471712, "learning_rate": 5e-05, "loss": 0.0094, "num_input_tokens_seen": 89355328, "step": 1293 }, { "epoch": 80.8125, "loss": 0.009457839652895927, "loss_ce": 0.00018049543723464012, "loss_xval": 0.00927734375, "num_input_tokens_seen": 89355328, "step": 1293 }, { "epoch": 80.875, "grad_norm": 8.006495123595153, "learning_rate": 5e-05, "loss": 0.0067, "num_input_tokens_seen": 89426880, "step": 1294 }, { "epoch": 80.875, "loss": 0.006589856464415789, "loss_ce": 0.00018116526189260185, "loss_xval": 0.00640869140625, "num_input_tokens_seen": 89426880, "step": 1294 }, { "epoch": 80.9375, "grad_norm": 5.763032519327169, "learning_rate": 5e-05, "loss": 0.0037, "num_input_tokens_seen": 89498624, "step": 1295 }, { "epoch": 80.9375, "loss": 0.003941795788705349, "loss_ce": 0.00018813367933034897, "loss_xval": 0.003753662109375, "num_input_tokens_seen": 89498624, "step": 1295 }, { "epoch": 81.0, "grad_norm": 2.828151506659076, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 89570176, "step": 1296 }, { "epoch": 81.0, "loss": 0.001258036820217967, "loss_ce": 0.00017466276767663658, "loss_xval": 0.0010833740234375, "num_input_tokens_seen": 89570176, "step": 1296 }, { "epoch": 81.0625, "grad_norm": 0.09979255913719957, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 89641856, "step": 1297 }, { "epoch": 81.0625, "loss": 0.0003645333054009825, "loss_ce": 0.000183335185283795, "loss_xval": 0.0001811981201171875, "num_input_tokens_seen": 89641856, "step": 1297 }, { "epoch": 81.125, "grad_norm": 2.4045827761146454, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 89713472, "step": 1298 }, { "epoch": 81.125, "loss": 0.0010376586578786373, "loss_ce": 0.00017172243678942323, "loss_xval": 0.000865936279296875, "num_input_tokens_seen": 89713472, "step": 1298 }, { "epoch": 81.1875, "grad_norm": 4.5778954748282175, "learning_rate": 5e-05, "loss": 0.0024, "num_input_tokens_seen": 89785152, "step": 1299 }, { "epoch": 81.1875, "loss": 0.0022919767070561647, "loss_ce": 0.00018626371456775814, "loss_xval": 0.002105712890625, "num_input_tokens_seen": 89785152, "step": 1299 }, { "epoch": 81.25, "grad_norm": 6.917729798556076, "learning_rate": 5e-05, "loss": 0.0051, "num_input_tokens_seen": 89856832, "step": 1300 }, { "epoch": 81.25, "loss": 0.004717848729342222, "loss_ce": 0.00017072970513254404, "loss_xval": 0.004547119140625, "num_input_tokens_seen": 89856832, "step": 1300 }, { "epoch": 81.3125, "grad_norm": 9.803192834949183, "learning_rate": 5e-05, "loss": 0.0103, "num_input_tokens_seen": 89928448, "step": 1301 }, { "epoch": 81.3125, "loss": 0.009035426191985607, "loss_ce": 0.00018532808462623507, "loss_xval": 0.00885009765625, "num_input_tokens_seen": 89928448, "step": 1301 }, { "epoch": 81.375, "grad_norm": 13.892361525694142, "learning_rate": 5e-05, "loss": 0.02, "num_input_tokens_seen": 89987584, "step": 1302 }, { "epoch": 81.375, "loss": 0.019351232796907425, "loss_ce": 0.00018619366164784878, "loss_xval": 0.0191650390625, "num_input_tokens_seen": 89987584, "step": 1302 }, { "epoch": 81.4375, "grad_norm": 19.455423704004556, "learning_rate": 5e-05, "loss": 0.0386, "num_input_tokens_seen": 90059264, "step": 1303 }, { "epoch": 81.4375, "loss": 0.038518596440553665, "loss_ce": 0.00018851927598007023, "loss_xval": 0.038330078125, "num_input_tokens_seen": 90059264, "step": 1303 }, { "epoch": 81.5, "grad_norm": 26.25043747926682, "learning_rate": 5e-05, "loss": 0.0708, "num_input_tokens_seen": 90130816, "step": 1304 }, { "epoch": 81.5, "loss": 0.07098682224750519, "loss_ce": 0.00018604137585498393, "loss_xval": 0.07080078125, "num_input_tokens_seen": 90130816, "step": 1304 }, { "epoch": 81.5625, "grad_norm": 32.9316261955588, "learning_rate": 5e-05, "loss": 0.111, "num_input_tokens_seen": 90202368, "step": 1305 }, { "epoch": 81.5625, "loss": 0.11202113330364227, "loss_ce": 0.0002047246671281755, "loss_xval": 0.11181640625, "num_input_tokens_seen": 90202368, "step": 1305 }, { "epoch": 81.625, "grad_norm": 34.395804303815375, "learning_rate": 5e-05, "loss": 0.1213, "num_input_tokens_seen": 90274048, "step": 1306 }, { "epoch": 81.625, "loss": 0.12080544233322144, "loss_ce": 0.00019997703202534467, "loss_xval": 0.12060546875, "num_input_tokens_seen": 90274048, "step": 1306 }, { "epoch": 81.6875, "grad_norm": 23.92537655157102, "learning_rate": 5e-05, "loss": 0.0609, "num_input_tokens_seen": 90345728, "step": 1307 }, { "epoch": 81.6875, "loss": 0.06155260279774666, "loss_ce": 0.0002733046712819487, "loss_xval": 0.061279296875, "num_input_tokens_seen": 90345728, "step": 1307 }, { "epoch": 81.75, "grad_norm": 5.603796248217643, "learning_rate": 5e-05, "loss": 0.0052, "num_input_tokens_seen": 90417280, "step": 1308 }, { "epoch": 81.75, "loss": 0.005875443108379841, "loss_ce": 0.0004127967986278236, "loss_xval": 0.005462646484375, "num_input_tokens_seen": 90417280, "step": 1308 }, { "epoch": 81.8125, "grad_norm": 13.120661509399742, "learning_rate": 5e-05, "loss": 0.0215, "num_input_tokens_seen": 90489024, "step": 1309 }, { "epoch": 81.8125, "loss": 0.020262904465198517, "loss_ce": 0.00048751308349892497, "loss_xval": 0.019775390625, "num_input_tokens_seen": 90489024, "step": 1309 }, { "epoch": 81.875, "grad_norm": 27.54321692725346, "learning_rate": 5e-05, "loss": 0.0844, "num_input_tokens_seen": 90560704, "step": 1310 }, { "epoch": 81.875, "loss": 0.08634451031684875, "loss_ce": 0.0004070091526955366, "loss_xval": 0.0859375, "num_input_tokens_seen": 90560704, "step": 1310 }, { "epoch": 81.9375, "grad_norm": 27.0470695659577, "learning_rate": 5e-05, "loss": 0.081, "num_input_tokens_seen": 90632512, "step": 1311 }, { "epoch": 81.9375, "loss": 0.07950714975595474, "loss_ce": 0.00040558731416240335, "loss_xval": 0.0791015625, "num_input_tokens_seen": 90632512, "step": 1311 }, { "epoch": 82.0, "grad_norm": 8.319309467414499, "learning_rate": 5e-05, "loss": 0.0086, "num_input_tokens_seen": 90691520, "step": 1312 }, { "epoch": 82.0, "loss": 0.007567130029201508, "loss_ce": 0.0003954994026571512, "loss_xval": 0.007171630859375, "num_input_tokens_seen": 90691520, "step": 1312 }, { "epoch": 82.0625, "grad_norm": 13.83520003951101, "learning_rate": 5e-05, "loss": 0.0214, "num_input_tokens_seen": 90763264, "step": 1313 }, { "epoch": 82.0625, "loss": 0.020780092105269432, "loss_ce": 0.00039435079088434577, "loss_xval": 0.0203857421875, "num_input_tokens_seen": 90763264, "step": 1313 }, { "epoch": 82.125, "grad_norm": 26.233744470895388, "learning_rate": 5e-05, "loss": 0.0736, "num_input_tokens_seen": 90835072, "step": 1314 }, { "epoch": 82.125, "loss": 0.07211720943450928, "loss_ce": 0.000339865597197786, "loss_xval": 0.07177734375, "num_input_tokens_seen": 90835072, "step": 1314 }, { "epoch": 82.1875, "grad_norm": 21.966334899550386, "learning_rate": 5e-05, "loss": 0.0553, "num_input_tokens_seen": 90906816, "step": 1315 }, { "epoch": 82.1875, "loss": 0.05788750201463699, "loss_ce": 0.0005144541501067579, "loss_xval": 0.057373046875, "num_input_tokens_seen": 90906816, "step": 1315 }, { "epoch": 82.25, "grad_norm": 7.992168817180659, "learning_rate": 5e-05, "loss": 0.0092, "num_input_tokens_seen": 90978368, "step": 1316 }, { "epoch": 82.25, "loss": 0.0099477618932724, "loss_ce": 0.0009755940409377217, "loss_xval": 0.00897216796875, "num_input_tokens_seen": 90978368, "step": 1316 }, { "epoch": 82.3125, "grad_norm": 9.853948345579273, "learning_rate": 5e-05, "loss": 0.0131, "num_input_tokens_seen": 91050048, "step": 1317 }, { "epoch": 82.3125, "loss": 0.01295376755297184, "loss_ce": 0.0011129477061331272, "loss_xval": 0.0118408203125, "num_input_tokens_seen": 91050048, "step": 1317 }, { "epoch": 82.375, "grad_norm": 24.121509566174467, "learning_rate": 5e-05, "loss": 0.0712, "num_input_tokens_seen": 91109312, "step": 1318 }, { "epoch": 82.375, "loss": 0.07309217005968094, "loss_ce": 0.0008265475044026971, "loss_xval": 0.072265625, "num_input_tokens_seen": 91109312, "step": 1318 }, { "epoch": 82.4375, "grad_norm": 22.443026799692635, "learning_rate": 5e-05, "loss": 0.0631, "num_input_tokens_seen": 91180992, "step": 1319 }, { "epoch": 82.4375, "loss": 0.06288768351078033, "loss_ce": 0.001120108994655311, "loss_xval": 0.061767578125, "num_input_tokens_seen": 91180992, "step": 1319 }, { "epoch": 82.5, "grad_norm": 9.204746494814867, "learning_rate": 5e-05, "loss": 0.0156, "num_input_tokens_seen": 91252736, "step": 1320 }, { "epoch": 82.5, "loss": 0.013615299016237259, "loss_ce": 0.0006758461822755635, "loss_xval": 0.012939453125, "num_input_tokens_seen": 91252736, "step": 1320 }, { "epoch": 82.5625, "grad_norm": 7.041680380778854, "learning_rate": 5e-05, "loss": 0.0107, "num_input_tokens_seen": 91324352, "step": 1321 }, { "epoch": 82.5625, "loss": 0.010888535529375076, "loss_ce": 0.0004515242180787027, "loss_xval": 0.01043701171875, "num_input_tokens_seen": 91324352, "step": 1321 }, { "epoch": 82.625, "grad_norm": 20.68685335144807, "learning_rate": 5e-05, "loss": 0.0477, "num_input_tokens_seen": 91395968, "step": 1322 }, { "epoch": 82.625, "loss": 0.04815371334552765, "loss_ce": 0.0003021496522706002, "loss_xval": 0.0478515625, "num_input_tokens_seen": 91395968, "step": 1322 }, { "epoch": 82.6875, "grad_norm": 19.53341497193577, "learning_rate": 5e-05, "loss": 0.0443, "num_input_tokens_seen": 91455040, "step": 1323 }, { "epoch": 82.6875, "loss": 0.0440186932682991, "loss_ce": 0.00031752054928801954, "loss_xval": 0.043701171875, "num_input_tokens_seen": 91455040, "step": 1323 }, { "epoch": 82.75, "grad_norm": 5.313685312356574, "learning_rate": 5e-05, "loss": 0.0094, "num_input_tokens_seen": 91526784, "step": 1324 }, { "epoch": 82.75, "loss": 0.007937032729387283, "loss_ce": 0.00027712108567357063, "loss_xval": 0.007659912109375, "num_input_tokens_seen": 91526784, "step": 1324 }, { "epoch": 82.8125, "grad_norm": 10.570006105733498, "learning_rate": 5e-05, "loss": 0.0166, "num_input_tokens_seen": 91598528, "step": 1325 }, { "epoch": 82.8125, "loss": 0.015571070834994316, "loss_ce": 0.00025124690728262067, "loss_xval": 0.01531982421875, "num_input_tokens_seen": 91598528, "step": 1325 }, { "epoch": 82.875, "grad_norm": 15.549359921026063, "learning_rate": 5e-05, "loss": 0.0284, "num_input_tokens_seen": 91670144, "step": 1326 }, { "epoch": 82.875, "loss": 0.02837221696972847, "loss_ce": 0.0002960452693514526, "loss_xval": 0.028076171875, "num_input_tokens_seen": 91670144, "step": 1326 }, { "epoch": 82.9375, "grad_norm": 6.965176644189292, "learning_rate": 5e-05, "loss": 0.0062, "num_input_tokens_seen": 91741824, "step": 1327 }, { "epoch": 82.9375, "loss": 0.006542420946061611, "loss_ce": 0.00025579993962310255, "loss_xval": 0.00628662109375, "num_input_tokens_seen": 91741824, "step": 1327 }, { "epoch": 83.0, "grad_norm": 6.505690399306705, "learning_rate": 5e-05, "loss": 0.0068, "num_input_tokens_seen": 91801024, "step": 1328 }, { "epoch": 83.0, "loss": 0.007964624091982841, "loss_ce": 0.00027419428806751966, "loss_xval": 0.0076904296875, "num_input_tokens_seen": 91801024, "step": 1328 }, { "epoch": 83.0625, "grad_norm": 13.084367095717914, "learning_rate": 5e-05, "loss": 0.0218, "num_input_tokens_seen": 91872640, "step": 1329 }, { "epoch": 83.0625, "loss": 0.0210590660572052, "loss_ce": 0.0003071120008826256, "loss_xval": 0.020751953125, "num_input_tokens_seen": 91872640, "step": 1329 }, { "epoch": 83.125, "grad_norm": 6.6777848034082945, "learning_rate": 5e-05, "loss": 0.0067, "num_input_tokens_seen": 91944384, "step": 1330 }, { "epoch": 83.125, "loss": 0.006794905290007591, "loss_ce": 0.0003251784946769476, "loss_xval": 0.0064697265625, "num_input_tokens_seen": 91944384, "step": 1330 }, { "epoch": 83.1875, "grad_norm": 6.060488788965237, "learning_rate": 5e-05, "loss": 0.0054, "num_input_tokens_seen": 92016064, "step": 1331 }, { "epoch": 83.1875, "loss": 0.004320111591368914, "loss_ce": 0.0003223086823709309, "loss_xval": 0.003997802734375, "num_input_tokens_seen": 92016064, "step": 1331 }, { "epoch": 83.25, "grad_norm": 11.275035340765044, "learning_rate": 5e-05, "loss": 0.0158, "num_input_tokens_seen": 92087680, "step": 1332 }, { "epoch": 83.25, "loss": 0.015535068698227406, "loss_ce": 0.0003983501228503883, "loss_xval": 0.01513671875, "num_input_tokens_seen": 92087680, "step": 1332 }, { "epoch": 83.3125, "grad_norm": 6.306319413443722, "learning_rate": 5e-05, "loss": 0.0065, "num_input_tokens_seen": 92159232, "step": 1333 }, { "epoch": 83.3125, "loss": 0.006737944670021534, "loss_ce": 0.0004208058526273817, "loss_xval": 0.006317138671875, "num_input_tokens_seen": 92159232, "step": 1333 }, { "epoch": 83.375, "grad_norm": 3.0254192286907, "learning_rate": 5e-05, "loss": 0.002, "num_input_tokens_seen": 92230784, "step": 1334 }, { "epoch": 83.375, "loss": 0.0021276480983942747, "loss_ce": 0.00041866363608278334, "loss_xval": 0.001708984375, "num_input_tokens_seen": 92230784, "step": 1334 }, { "epoch": 83.4375, "grad_norm": 8.935617589851892, "learning_rate": 5e-05, "loss": 0.0103, "num_input_tokens_seen": 92302336, "step": 1335 }, { "epoch": 83.4375, "loss": 0.010258061811327934, "loss_ce": 0.00043140165507793427, "loss_xval": 0.00982666015625, "num_input_tokens_seen": 92302336, "step": 1335 }, { "epoch": 83.5, "grad_norm": 6.017498085566332, "learning_rate": 5e-05, "loss": 0.0054, "num_input_tokens_seen": 92373888, "step": 1336 }, { "epoch": 83.5, "loss": 0.005582468118518591, "loss_ce": 0.00045551499351859093, "loss_xval": 0.005126953125, "num_input_tokens_seen": 92373888, "step": 1336 }, { "epoch": 83.5625, "grad_norm": 1.867077741626802, "learning_rate": 5e-05, "loss": 0.002, "num_input_tokens_seen": 92445632, "step": 1337 }, { "epoch": 83.5625, "loss": 0.0021143611520528793, "loss_ce": 0.00045115326065570116, "loss_xval": 0.0016632080078125, "num_input_tokens_seen": 92445632, "step": 1337 }, { "epoch": 83.625, "grad_norm": 6.6407270625012105, "learning_rate": 5e-05, "loss": 0.0067, "num_input_tokens_seen": 92504768, "step": 1338 }, { "epoch": 83.625, "loss": 0.006132456939667463, "loss_ce": 0.0003951520484406501, "loss_xval": 0.0057373046875, "num_input_tokens_seen": 92504768, "step": 1338 }, { "epoch": 83.6875, "grad_norm": 5.393994327897671, "learning_rate": 5e-05, "loss": 0.0044, "num_input_tokens_seen": 92563840, "step": 1339 }, { "epoch": 83.6875, "loss": 0.0039986069314181805, "loss_ce": 0.0003670152509585023, "loss_xval": 0.003631591796875, "num_input_tokens_seen": 92563840, "step": 1339 }, { "epoch": 83.75, "grad_norm": 0.187030626807252, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 92635456, "step": 1340 }, { "epoch": 83.75, "loss": 0.0009429383790120482, "loss_ce": 0.0003821779100690037, "loss_xval": 0.000560760498046875, "num_input_tokens_seen": 92635456, "step": 1340 }, { "epoch": 83.8125, "grad_norm": 5.5623352632301355, "learning_rate": 5e-05, "loss": 0.0049, "num_input_tokens_seen": 92707072, "step": 1341 }, { "epoch": 83.8125, "loss": 0.004972436930984259, "loss_ce": 0.0003642825176939368, "loss_xval": 0.004608154296875, "num_input_tokens_seen": 92707072, "step": 1341 }, { "epoch": 83.875, "grad_norm": 7.908228587710838, "learning_rate": 5e-05, "loss": 0.0089, "num_input_tokens_seen": 92778752, "step": 1342 }, { "epoch": 83.875, "loss": 0.0085558220744133, "loss_ce": 0.00037711087497882545, "loss_xval": 0.0081787109375, "num_input_tokens_seen": 92778752, "step": 1342 }, { "epoch": 83.9375, "grad_norm": 6.82307396320819, "learning_rate": 5e-05, "loss": 0.0063, "num_input_tokens_seen": 92850304, "step": 1343 }, { "epoch": 83.9375, "loss": 0.00607724254950881, "loss_ce": 0.0003399380366317928, "loss_xval": 0.0057373046875, "num_input_tokens_seen": 92850304, "step": 1343 }, { "epoch": 84.0, "grad_norm": 2.2725296771540027, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 92921856, "step": 1344 }, { "epoch": 84.0, "loss": 0.0013987963320687413, "loss_ce": 0.0003001635486725718, "loss_xval": 0.0010986328125, "num_input_tokens_seen": 92921856, "step": 1344 }, { "epoch": 84.0625, "grad_norm": 2.9586250079478553, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 92993536, "step": 1345 }, { "epoch": 84.0625, "loss": 0.002096587559208274, "loss_ce": 0.00030367981526069343, "loss_xval": 0.00179290771484375, "num_input_tokens_seen": 92993536, "step": 1345 }, { "epoch": 84.125, "grad_norm": 5.5977867942047705, "learning_rate": 5e-05, "loss": 0.0047, "num_input_tokens_seen": 93065152, "step": 1346 }, { "epoch": 84.125, "loss": 0.004932393319904804, "loss_ce": 0.00029372161952778697, "loss_xval": 0.004638671875, "num_input_tokens_seen": 93065152, "step": 1346 }, { "epoch": 84.1875, "grad_norm": 4.7846782687239475, "learning_rate": 5e-05, "loss": 0.0034, "num_input_tokens_seen": 93136768, "step": 1347 }, { "epoch": 84.1875, "loss": 0.0036556622944772243, "loss_ce": 0.00026821118080988526, "loss_xval": 0.003387451171875, "num_input_tokens_seen": 93136768, "step": 1347 }, { "epoch": 84.25, "grad_norm": 1.0287294637288882, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 93208512, "step": 1348 }, { "epoch": 84.25, "loss": 0.000610416114795953, "loss_ce": 0.00025946396635845304, "loss_xval": 0.0003509521484375, "num_input_tokens_seen": 93208512, "step": 1348 }, { "epoch": 84.3125, "grad_norm": 3.4560656948515134, "learning_rate": 5e-05, "loss": 0.002, "num_input_tokens_seen": 93280128, "step": 1349 }, { "epoch": 84.3125, "loss": 0.002060085302218795, "loss_ce": 0.00025191876920871437, "loss_xval": 0.00180816650390625, "num_input_tokens_seen": 93280128, "step": 1349 }, { "epoch": 84.375, "grad_norm": 5.959650224848103, "learning_rate": 5e-05, "loss": 0.0049, "num_input_tokens_seen": 93351744, "step": 1350 }, { "epoch": 84.375, "loss": 0.004852754529565573, "loss_ce": 0.0002446004073135555, "loss_xval": 0.004608154296875, "num_input_tokens_seen": 93351744, "step": 1350 }, { "epoch": 84.4375, "grad_norm": 5.097639920887682, "learning_rate": 5e-05, "loss": 0.0037, "num_input_tokens_seen": 93423360, "step": 1351 }, { "epoch": 84.4375, "loss": 0.0037487023510038853, "loss_ce": 0.00023918086662888527, "loss_xval": 0.003509521484375, "num_input_tokens_seen": 93423360, "step": 1351 }, { "epoch": 84.5, "grad_norm": 0.9345982088961234, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 93494912, "step": 1352 }, { "epoch": 84.5, "loss": 0.0008009417215362191, "loss_ce": 0.0001905901444843039, "loss_xval": 0.0006103515625, "num_input_tokens_seen": 93494912, "step": 1352 }, { "epoch": 84.5625, "grad_norm": 4.545730632277391, "learning_rate": 5e-05, "loss": 0.003, "num_input_tokens_seen": 93566592, "step": 1353 }, { "epoch": 84.5625, "loss": 0.0027575090061873198, "loss_ce": 0.00019403238547965884, "loss_xval": 0.0025634765625, "num_input_tokens_seen": 93566592, "step": 1353 }, { "epoch": 84.625, "grad_norm": 8.556585400390404, "learning_rate": 5e-05, "loss": 0.0091, "num_input_tokens_seen": 93638272, "step": 1354 }, { "epoch": 84.625, "loss": 0.009279740042984486, "loss_ce": 0.00018550171807873994, "loss_xval": 0.00909423828125, "num_input_tokens_seen": 93638272, "step": 1354 }, { "epoch": 84.6875, "grad_norm": 8.62215731743346, "learning_rate": 5e-05, "loss": 0.0094, "num_input_tokens_seen": 93697408, "step": 1355 }, { "epoch": 84.6875, "loss": 0.009744501672685146, "loss_ce": 0.00016198224329855293, "loss_xval": 0.00958251953125, "num_input_tokens_seen": 93697408, "step": 1355 }, { "epoch": 84.75, "grad_norm": 4.452801107041619, "learning_rate": 5e-05, "loss": 0.0031, "num_input_tokens_seen": 93769024, "step": 1356 }, { "epoch": 84.75, "loss": 0.0040631224401295185, "loss_ce": 0.00015687257109675556, "loss_xval": 0.00390625, "num_input_tokens_seen": 93769024, "step": 1356 }, { "epoch": 84.8125, "grad_norm": 1.6587461632003737, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 93840576, "step": 1357 }, { "epoch": 84.8125, "loss": 0.000456514535471797, "loss_ce": 0.00015896816330496222, "loss_xval": 0.00029754638671875, "num_input_tokens_seen": 93840576, "step": 1357 }, { "epoch": 84.875, "grad_norm": 6.452086587448419, "learning_rate": 5e-05, "loss": 0.0056, "num_input_tokens_seen": 93912192, "step": 1358 }, { "epoch": 84.875, "loss": 0.005854357499629259, "loss_ce": 0.00014757020107936114, "loss_xval": 0.005706787109375, "num_input_tokens_seen": 93912192, "step": 1358 }, { "epoch": 84.9375, "grad_norm": 7.768455428908589, "learning_rate": 5e-05, "loss": 0.0075, "num_input_tokens_seen": 93971392, "step": 1359 }, { "epoch": 84.9375, "loss": 0.007658465765416622, "loss_ce": 0.00015114153211470693, "loss_xval": 0.00750732421875, "num_input_tokens_seen": 93971392, "step": 1359 }, { "epoch": 85.0, "grad_norm": 6.261992830665613, "learning_rate": 5e-05, "loss": 0.0054, "num_input_tokens_seen": 94030528, "step": 1360 }, { "epoch": 85.0, "loss": 0.005399479065090418, "loss_ce": 0.00015045571490190923, "loss_xval": 0.0052490234375, "num_input_tokens_seen": 94030528, "step": 1360 }, { "epoch": 85.0625, "grad_norm": 3.5746365441520247, "learning_rate": 5e-05, "loss": 0.0022, "num_input_tokens_seen": 94102144, "step": 1361 }, { "epoch": 85.0625, "loss": 0.002178808907046914, "loss_ce": 0.0001493899617344141, "loss_xval": 0.0020294189453125, "num_input_tokens_seen": 94102144, "step": 1361 }, { "epoch": 85.125, "grad_norm": 0.4716400038577123, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 94161216, "step": 1362 }, { "epoch": 85.125, "loss": 0.0006105655338615179, "loss_ce": 0.00014898713561706245, "loss_xval": 0.000461578369140625, "num_input_tokens_seen": 94161216, "step": 1362 }, { "epoch": 85.1875, "grad_norm": 2.4725061164222364, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 94232768, "step": 1363 }, { "epoch": 85.1875, "loss": 0.0010596337961032987, "loss_ce": 0.0001479211205150932, "loss_xval": 0.000911712646484375, "num_input_tokens_seen": 94232768, "step": 1363 }, { "epoch": 85.25, "grad_norm": 4.09294530167, "learning_rate": 5e-05, "loss": 0.0024, "num_input_tokens_seen": 94304512, "step": 1364 }, { "epoch": 85.25, "loss": 0.0021935063414275646, "loss_ce": 0.00014882863615639508, "loss_xval": 0.002044677734375, "num_input_tokens_seen": 94304512, "step": 1364 }, { "epoch": 85.3125, "grad_norm": 4.489692593923753, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 94376128, "step": 1365 }, { "epoch": 85.3125, "loss": 0.003198940772563219, "loss_ce": 0.00014718300371896476, "loss_xval": 0.0030517578125, "num_input_tokens_seen": 94376128, "step": 1365 }, { "epoch": 85.375, "grad_norm": 4.618347858672993, "learning_rate": 5e-05, "loss": 0.003, "num_input_tokens_seen": 94447744, "step": 1366 }, { "epoch": 85.375, "loss": 0.0029601159039884806, "loss_ce": 0.00013723997108172625, "loss_xval": 0.0028228759765625, "num_input_tokens_seen": 94447744, "step": 1366 }, { "epoch": 85.4375, "grad_norm": 4.74594135568652, "learning_rate": 5e-05, "loss": 0.003, "num_input_tokens_seen": 94519296, "step": 1367 }, { "epoch": 85.4375, "loss": 0.002849689219146967, "loss_ce": 0.0001336246496066451, "loss_xval": 0.002716064453125, "num_input_tokens_seen": 94519296, "step": 1367 }, { "epoch": 85.5, "grad_norm": 4.213946298872178, "learning_rate": 5e-05, "loss": 0.0024, "num_input_tokens_seen": 94591040, "step": 1368 }, { "epoch": 85.5, "loss": 0.002137300791218877, "loss_ce": 0.00013839948223903775, "loss_xval": 0.0019989013671875, "num_input_tokens_seen": 94591040, "step": 1368 }, { "epoch": 85.5625, "grad_norm": 3.0831016186464044, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 94662656, "step": 1369 }, { "epoch": 85.5625, "loss": 0.0013986109988763928, "loss_ce": 0.00012450206850189716, "loss_xval": 0.00127410888671875, "num_input_tokens_seen": 94662656, "step": 1369 }, { "epoch": 85.625, "grad_norm": 1.6788028144692124, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 94721664, "step": 1370 }, { "epoch": 85.625, "loss": 0.0006253900937736034, "loss_ce": 0.0001371088292216882, "loss_xval": 0.00048828125, "num_input_tokens_seen": 94721664, "step": 1370 }, { "epoch": 85.6875, "grad_norm": 0.27781799054399664, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 94793216, "step": 1371 }, { "epoch": 85.6875, "loss": 0.00037374423118308187, "loss_ce": 0.0001400940091116354, "loss_xval": 0.00023365020751953125, "num_input_tokens_seen": 94793216, "step": 1371 }, { "epoch": 85.75, "grad_norm": 2.7779533251465924, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 94864768, "step": 1372 }, { "epoch": 85.75, "loss": 0.0012504992773756385, "loss_ce": 0.00012897825217805803, "loss_xval": 0.00112152099609375, "num_input_tokens_seen": 94864768, "step": 1372 }, { "epoch": 85.8125, "grad_norm": 5.650012776195728, "learning_rate": 5e-05, "loss": 0.0042, "num_input_tokens_seen": 94936512, "step": 1373 }, { "epoch": 85.8125, "loss": 0.004096949473023415, "loss_ce": 0.00012966437498107553, "loss_xval": 0.00396728515625, "num_input_tokens_seen": 94936512, "step": 1373 }, { "epoch": 85.875, "grad_norm": 8.657061946127184, "learning_rate": 5e-05, "loss": 0.0093, "num_input_tokens_seen": 95008128, "step": 1374 }, { "epoch": 85.875, "loss": 0.010076146572828293, "loss_ce": 0.00012741591490339488, "loss_xval": 0.00994873046875, "num_input_tokens_seen": 95008128, "step": 1374 }, { "epoch": 85.9375, "grad_norm": 11.424991293675157, "learning_rate": 5e-05, "loss": 0.016, "num_input_tokens_seen": 95067200, "step": 1375 }, { "epoch": 85.9375, "loss": 0.015012609772384167, "loss_ce": 0.00012003149458905682, "loss_xval": 0.014892578125, "num_input_tokens_seen": 95067200, "step": 1375 }, { "epoch": 86.0, "grad_norm": 14.5645994997339, "learning_rate": 5e-05, "loss": 0.0256, "num_input_tokens_seen": 95138944, "step": 1376 }, { "epoch": 86.0, "loss": 0.02514888532459736, "loss_ce": 0.00012447111657820642, "loss_xval": 0.0250244140625, "num_input_tokens_seen": 95138944, "step": 1376 }, { "epoch": 86.0625, "grad_norm": 18.481510231126762, "learning_rate": 5e-05, "loss": 0.0419, "num_input_tokens_seen": 95210624, "step": 1377 }, { "epoch": 86.0625, "loss": 0.04261020943522453, "loss_ce": 0.00012974253331776708, "loss_xval": 0.04248046875, "num_input_tokens_seen": 95210624, "step": 1377 }, { "epoch": 86.125, "grad_norm": 21.992243575699817, "learning_rate": 5e-05, "loss": 0.06, "num_input_tokens_seen": 95282176, "step": 1378 }, { "epoch": 86.125, "loss": 0.06021275743842125, "loss_ce": 0.00015416437236126512, "loss_xval": 0.06005859375, "num_input_tokens_seen": 95282176, "step": 1378 }, { "epoch": 86.1875, "grad_norm": 22.56849610163038, "learning_rate": 5e-05, "loss": 0.0639, "num_input_tokens_seen": 95353728, "step": 1379 }, { "epoch": 86.1875, "loss": 0.06363598257303238, "loss_ce": 0.00015941797755658627, "loss_xval": 0.0634765625, "num_input_tokens_seen": 95353728, "step": 1379 }, { "epoch": 86.25, "grad_norm": 16.94868086603522, "learning_rate": 5e-05, "loss": 0.0378, "num_input_tokens_seen": 95412864, "step": 1380 }, { "epoch": 86.25, "loss": 0.03410815820097923, "loss_ce": 0.00017261072935070843, "loss_xval": 0.033935546875, "num_input_tokens_seen": 95412864, "step": 1380 }, { "epoch": 86.3125, "grad_norm": 4.895642035480853, "learning_rate": 5e-05, "loss": 0.0045, "num_input_tokens_seen": 95484544, "step": 1381 }, { "epoch": 86.3125, "loss": 0.004222327843308449, "loss_ce": 0.00022452489065472037, "loss_xval": 0.003997802734375, "num_input_tokens_seen": 95484544, "step": 1381 }, { "epoch": 86.375, "grad_norm": 8.953947922684396, "learning_rate": 5e-05, "loss": 0.0114, "num_input_tokens_seen": 95556224, "step": 1382 }, { "epoch": 86.375, "loss": 0.011928737163543701, "loss_ce": 0.0002710219123400748, "loss_xval": 0.01165771484375, "num_input_tokens_seen": 95556224, "step": 1382 }, { "epoch": 86.4375, "grad_norm": 18.1527503843166, "learning_rate": 5e-05, "loss": 0.0429, "num_input_tokens_seen": 95627968, "step": 1383 }, { "epoch": 86.4375, "loss": 0.04327051341533661, "loss_ce": 0.0003017636190634221, "loss_xval": 0.04296875, "num_input_tokens_seen": 95627968, "step": 1383 }, { "epoch": 86.5, "grad_norm": 18.33660375652762, "learning_rate": 5e-05, "loss": 0.0448, "num_input_tokens_seen": 95687104, "step": 1384 }, { "epoch": 86.5, "loss": 0.044277459383010864, "loss_ce": 0.000332146737491712, "loss_xval": 0.0439453125, "num_input_tokens_seen": 95687104, "step": 1384 }, { "epoch": 86.5625, "grad_norm": 9.742972210972088, "learning_rate": 5e-05, "loss": 0.0147, "num_input_tokens_seen": 95758848, "step": 1385 }, { "epoch": 86.5625, "loss": 0.014891230501234531, "loss_ce": 0.0003648633719421923, "loss_xval": 0.0145263671875, "num_input_tokens_seen": 95758848, "step": 1385 }, { "epoch": 86.625, "grad_norm": 2.0753446635970603, "learning_rate": 5e-05, "loss": 0.0022, "num_input_tokens_seen": 95830464, "step": 1386 }, { "epoch": 86.625, "loss": 0.002193968975916505, "loss_ce": 0.00040106126107275486, "loss_xval": 0.00179290771484375, "num_input_tokens_seen": 95830464, "step": 1386 }, { "epoch": 86.6875, "grad_norm": 10.612091415688838, "learning_rate": 5e-05, "loss": 0.0159, "num_input_tokens_seen": 95902208, "step": 1387 }, { "epoch": 86.6875, "loss": 0.01625114679336548, "loss_ce": 0.00038200628478080034, "loss_xval": 0.015869140625, "num_input_tokens_seen": 95902208, "step": 1387 }, { "epoch": 86.75, "grad_norm": 12.023490316206656, "learning_rate": 5e-05, "loss": 0.0197, "num_input_tokens_seen": 95961216, "step": 1388 }, { "epoch": 86.75, "loss": 0.020631328225135803, "loss_ce": 0.0003676554188132286, "loss_xval": 0.020263671875, "num_input_tokens_seen": 95961216, "step": 1388 }, { "epoch": 86.8125, "grad_norm": 6.192574013263385, "learning_rate": 5e-05, "loss": 0.0063, "num_input_tokens_seen": 96032896, "step": 1389 }, { "epoch": 86.8125, "loss": 0.0059670875780284405, "loss_ce": 0.00041288844658993185, "loss_xval": 0.00555419921875, "num_input_tokens_seen": 96032896, "step": 1389 }, { "epoch": 86.875, "grad_norm": 2.6492488243139736, "learning_rate": 5e-05, "loss": 0.003, "num_input_tokens_seen": 96104640, "step": 1390 }, { "epoch": 86.875, "loss": 0.0033788911532610655, "loss_ce": 0.00041868616244755685, "loss_xval": 0.002960205078125, "num_input_tokens_seen": 96104640, "step": 1390 }, { "epoch": 86.9375, "grad_norm": 8.729217015659232, "learning_rate": 5e-05, "loss": 0.0121, "num_input_tokens_seen": 96176192, "step": 1391 }, { "epoch": 86.9375, "loss": 0.01327319536358118, "loss_ce": 0.0003947777731809765, "loss_xval": 0.01287841796875, "num_input_tokens_seen": 96176192, "step": 1391 }, { "epoch": 87.0, "grad_norm": 8.728910170528094, "learning_rate": 5e-05, "loss": 0.0119, "num_input_tokens_seen": 96247808, "step": 1392 }, { "epoch": 87.0, "loss": 0.011554991826415062, "loss_ce": 0.00038555837818421423, "loss_xval": 0.01116943359375, "num_input_tokens_seen": 96247808, "step": 1392 }, { "epoch": 87.0625, "grad_norm": 3.355145966560444, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 96306816, "step": 1393 }, { "epoch": 87.0625, "loss": 0.002884861547499895, "loss_ce": 0.0003519024758134037, "loss_xval": 0.002532958984375, "num_input_tokens_seen": 96306816, "step": 1393 }, { "epoch": 87.125, "grad_norm": 4.138859429284309, "learning_rate": 5e-05, "loss": 0.0031, "num_input_tokens_seen": 96365952, "step": 1394 }, { "epoch": 87.125, "loss": 0.0029549452010542154, "loss_ce": 0.0003151747805532068, "loss_xval": 0.0026397705078125, "num_input_tokens_seen": 96365952, "step": 1394 }, { "epoch": 87.1875, "grad_norm": 8.873096729500526, "learning_rate": 5e-05, "loss": 0.0114, "num_input_tokens_seen": 96437568, "step": 1395 }, { "epoch": 87.1875, "loss": 0.011341361328959465, "loss_ce": 0.0002939981932286173, "loss_xval": 0.01104736328125, "num_input_tokens_seen": 96437568, "step": 1395 }, { "epoch": 87.25, "grad_norm": 7.362971805252709, "learning_rate": 5e-05, "loss": 0.0079, "num_input_tokens_seen": 96509184, "step": 1396 }, { "epoch": 87.25, "loss": 0.007855205796658993, "loss_ce": 0.0002563287562225014, "loss_xval": 0.007598876953125, "num_input_tokens_seen": 96509184, "step": 1396 }, { "epoch": 87.3125, "grad_norm": 0.7905654077163219, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 96580800, "step": 1397 }, { "epoch": 87.3125, "loss": 0.0012294021435081959, "loss_ce": 0.0002566543407738209, "loss_xval": 0.000972747802734375, "num_input_tokens_seen": 96580800, "step": 1397 }, { "epoch": 87.375, "grad_norm": 6.997986291690594, "learning_rate": 5e-05, "loss": 0.0069, "num_input_tokens_seen": 96652416, "step": 1398 }, { "epoch": 87.375, "loss": 0.007005076855421066, "loss_ce": 0.0002301743079442531, "loss_xval": 0.00677490234375, "num_input_tokens_seen": 96652416, "step": 1398 }, { "epoch": 87.4375, "grad_norm": 12.697177281449116, "learning_rate": 5e-05, "loss": 0.0217, "num_input_tokens_seen": 96724032, "step": 1399 }, { "epoch": 87.4375, "loss": 0.021074065938591957, "loss_ce": 0.0002000429667532444, "loss_xval": 0.0208740234375, "num_input_tokens_seen": 96724032, "step": 1399 }, { "epoch": 87.5, "grad_norm": 13.521911622196056, "learning_rate": 5e-05, "loss": 0.0247, "num_input_tokens_seen": 96795584, "step": 1400 }, { "epoch": 87.5, "loss": 0.026104973629117012, "loss_ce": 0.00022606826678384095, "loss_xval": 0.02587890625, "num_input_tokens_seen": 96795584, "step": 1400 }, { "epoch": 87.5625, "grad_norm": 7.620328150939222, "learning_rate": 5e-05, "loss": 0.0099, "num_input_tokens_seen": 96867136, "step": 1401 }, { "epoch": 87.5625, "loss": 0.009842384606599808, "loss_ce": 0.0002598651044536382, "loss_xval": 0.00958251953125, "num_input_tokens_seen": 96867136, "step": 1401 }, { "epoch": 87.625, "grad_norm": 2.257605245863537, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 96938688, "step": 1402 }, { "epoch": 87.625, "loss": 0.0028382607270032167, "loss_ce": 0.00027478416450321674, "loss_xval": 0.0025634765625, "num_input_tokens_seen": 96938688, "step": 1402 }, { "epoch": 87.6875, "grad_norm": 9.331684847900517, "learning_rate": 5e-05, "loss": 0.013, "num_input_tokens_seen": 96997824, "step": 1403 }, { "epoch": 87.6875, "loss": 0.013780038803815842, "loss_ce": 0.00029126947629265487, "loss_xval": 0.01348876953125, "num_input_tokens_seen": 96997824, "step": 1403 }, { "epoch": 87.75, "grad_norm": 11.815568839124595, "learning_rate": 5e-05, "loss": 0.0203, "num_input_tokens_seen": 97069376, "step": 1404 }, { "epoch": 87.75, "loss": 0.02077043429017067, "loss_ce": 0.0003846912004519254, "loss_xval": 0.0203857421875, "num_input_tokens_seen": 97069376, "step": 1404 }, { "epoch": 87.8125, "grad_norm": 10.458438509888028, "learning_rate": 5e-05, "loss": 0.0157, "num_input_tokens_seen": 97141120, "step": 1405 }, { "epoch": 87.8125, "loss": 0.01642109826207161, "loss_ce": 0.00030781730310991406, "loss_xval": 0.01611328125, "num_input_tokens_seen": 97141120, "step": 1405 }, { "epoch": 87.875, "grad_norm": 4.899426763532995, "learning_rate": 5e-05, "loss": 0.004, "num_input_tokens_seen": 97212800, "step": 1406 }, { "epoch": 87.875, "loss": 0.004273131489753723, "loss_ce": 0.0002753287262748927, "loss_xval": 0.003997802734375, "num_input_tokens_seen": 97212800, "step": 1406 }, { "epoch": 87.9375, "grad_norm": 2.572351585425516, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 97284352, "step": 1407 }, { "epoch": 87.9375, "loss": 0.0018727717688307166, "loss_ce": 0.0002553401282057166, "loss_xval": 0.001617431640625, "num_input_tokens_seen": 97284352, "step": 1407 }, { "epoch": 88.0, "grad_norm": 7.9539379483395205, "learning_rate": 5e-05, "loss": 0.0095, "num_input_tokens_seen": 97356032, "step": 1408 }, { "epoch": 88.0, "loss": 0.009988140314817429, "loss_ce": 0.00028355044196359813, "loss_xval": 0.00970458984375, "num_input_tokens_seen": 97356032, "step": 1408 }, { "epoch": 88.0625, "grad_norm": 8.125767744555626, "learning_rate": 5e-05, "loss": 0.0105, "num_input_tokens_seen": 97427584, "step": 1409 }, { "epoch": 88.0625, "loss": 0.010983756743371487, "loss_ce": 0.00024156902509275824, "loss_xval": 0.0107421875, "num_input_tokens_seen": 97427584, "step": 1409 }, { "epoch": 88.125, "grad_norm": 2.498880075441956, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 97499392, "step": 1410 }, { "epoch": 88.125, "loss": 0.0017704254714772105, "loss_ce": 0.0002369171561440453, "loss_xval": 0.00153350830078125, "num_input_tokens_seen": 97499392, "step": 1410 }, { "epoch": 88.1875, "grad_norm": 4.088092554901874, "learning_rate": 5e-05, "loss": 0.0032, "num_input_tokens_seen": 97558464, "step": 1411 }, { "epoch": 88.1875, "loss": 0.0027891267091035843, "loss_ce": 0.00021039124112576246, "loss_xval": 0.0025787353515625, "num_input_tokens_seen": 97558464, "step": 1411 }, { "epoch": 88.25, "grad_norm": 7.781016703274088, "learning_rate": 5e-05, "loss": 0.009, "num_input_tokens_seen": 97617472, "step": 1412 }, { "epoch": 88.25, "loss": 0.009119339287281036, "loss_ce": 0.00020820641657337546, "loss_xval": 0.0089111328125, "num_input_tokens_seen": 97617472, "step": 1412 }, { "epoch": 88.3125, "grad_norm": 8.188874335876337, "learning_rate": 5e-05, "loss": 0.01, "num_input_tokens_seen": 97689024, "step": 1413 }, { "epoch": 88.3125, "loss": 0.010654650628566742, "loss_ce": 0.00021763896802440286, "loss_xval": 0.01043701171875, "num_input_tokens_seen": 97689024, "step": 1413 }, { "epoch": 88.375, "grad_norm": 5.9478870558346175, "learning_rate": 5e-05, "loss": 0.0055, "num_input_tokens_seen": 97760768, "step": 1414 }, { "epoch": 88.375, "loss": 0.005853202193975449, "loss_ce": 0.00020745031361002475, "loss_xval": 0.005645751953125, "num_input_tokens_seen": 97760768, "step": 1414 }, { "epoch": 88.4375, "grad_norm": 1.475954179206284, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 97820032, "step": 1415 }, { "epoch": 88.4375, "loss": 0.00100472301710397, "loss_ce": 0.0001960072258953005, "loss_xval": 0.0008087158203125, "num_input_tokens_seen": 97820032, "step": 1415 }, { "epoch": 88.5, "grad_norm": 4.114678583328582, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 97891584, "step": 1416 }, { "epoch": 88.5, "loss": 0.0029814133886247873, "loss_ce": 0.00018905493197962642, "loss_xval": 0.0027923583984375, "num_input_tokens_seen": 97891584, "step": 1416 }, { "epoch": 88.5625, "grad_norm": 8.631794997393147, "learning_rate": 5e-05, "loss": 0.0109, "num_input_tokens_seen": 97963392, "step": 1417 }, { "epoch": 88.5625, "loss": 0.010813172906637192, "loss_ce": 0.00019305601017549634, "loss_xval": 0.0106201171875, "num_input_tokens_seen": 97963392, "step": 1417 }, { "epoch": 88.625, "grad_norm": 10.541015255541664, "learning_rate": 5e-05, "loss": 0.0158, "num_input_tokens_seen": 98035008, "step": 1418 }, { "epoch": 88.625, "loss": 0.015680886805057526, "loss_ce": 0.00017795769963413477, "loss_xval": 0.0155029296875, "num_input_tokens_seen": 98035008, "step": 1418 }, { "epoch": 88.6875, "grad_norm": 9.71623274897056, "learning_rate": 5e-05, "loss": 0.0138, "num_input_tokens_seen": 98106752, "step": 1419 }, { "epoch": 88.6875, "loss": 0.014093155972659588, "loss_ce": 0.0001771400129655376, "loss_xval": 0.013916015625, "num_input_tokens_seen": 98106752, "step": 1419 }, { "epoch": 88.75, "grad_norm": 6.3053819258526875, "learning_rate": 5e-05, "loss": 0.0063, "num_input_tokens_seen": 98178432, "step": 1420 }, { "epoch": 88.75, "loss": 0.006050786003470421, "loss_ce": 0.00016089326527435333, "loss_xval": 0.005889892578125, "num_input_tokens_seen": 98178432, "step": 1420 }, { "epoch": 88.8125, "grad_norm": 1.3613681611204922, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 98249984, "step": 1421 }, { "epoch": 88.8125, "loss": 0.0017392839072272182, "loss_ce": 0.00014474045019596815, "loss_xval": 0.00159454345703125, "num_input_tokens_seen": 98249984, "step": 1421 }, { "epoch": 88.875, "grad_norm": 2.8573323171257665, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 98321536, "step": 1422 }, { "epoch": 88.875, "loss": 0.0017971351044252515, "loss_ce": 0.0001415565056959167, "loss_xval": 0.00165557861328125, "num_input_tokens_seen": 98321536, "step": 1422 }, { "epoch": 88.9375, "grad_norm": 4.949223939521938, "learning_rate": 5e-05, "loss": 0.004, "num_input_tokens_seen": 98393088, "step": 1423 }, { "epoch": 88.9375, "loss": 0.0042897784151136875, "loss_ce": 0.0001393879938405007, "loss_xval": 0.004150390625, "num_input_tokens_seen": 98393088, "step": 1423 }, { "epoch": 89.0, "grad_norm": 4.876705980586143, "learning_rate": 5e-05, "loss": 0.0048, "num_input_tokens_seen": 98464640, "step": 1424 }, { "epoch": 89.0, "loss": 0.005188289098441601, "loss_ce": 0.00012237107148393989, "loss_xval": 0.00506591796875, "num_input_tokens_seen": 98464640, "step": 1424 }, { "epoch": 89.0625, "grad_norm": 2.4669840278626824, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 98536256, "step": 1425 }, { "epoch": 89.0625, "loss": 0.0016833170084282756, "loss_ce": 0.0001269204803975299, "loss_xval": 0.001556396484375, "num_input_tokens_seen": 98536256, "step": 1425 }, { "epoch": 89.125, "grad_norm": 1.6542139949837114, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 98607872, "step": 1426 }, { "epoch": 89.125, "loss": 0.0010792359244078398, "loss_ce": 0.00012174686708021909, "loss_xval": 0.000957489013671875, "num_input_tokens_seen": 98607872, "step": 1426 }, { "epoch": 89.1875, "grad_norm": 5.219650660467849, "learning_rate": 5e-05, "loss": 0.0049, "num_input_tokens_seen": 98666944, "step": 1427 }, { "epoch": 89.1875, "loss": 0.005092137027531862, "loss_ce": 0.00011777185864048079, "loss_xval": 0.004974365234375, "num_input_tokens_seen": 98666944, "step": 1427 }, { "epoch": 89.25, "grad_norm": 7.222756845472301, "learning_rate": 5e-05, "loss": 0.0078, "num_input_tokens_seen": 98738624, "step": 1428 }, { "epoch": 89.25, "loss": 0.008125527761876583, "loss_ce": 0.00012992256961297244, "loss_xval": 0.00799560546875, "num_input_tokens_seen": 98738624, "step": 1428 }, { "epoch": 89.3125, "grad_norm": 8.234607449616483, "learning_rate": 5e-05, "loss": 0.0097, "num_input_tokens_seen": 98810240, "step": 1429 }, { "epoch": 89.3125, "loss": 0.010082121938467026, "loss_ce": 0.0001333911350229755, "loss_xval": 0.00994873046875, "num_input_tokens_seen": 98810240, "step": 1429 }, { "epoch": 89.375, "grad_norm": 8.465055746180632, "learning_rate": 5e-05, "loss": 0.0105, "num_input_tokens_seen": 98881856, "step": 1430 }, { "epoch": 89.375, "loss": 0.01027833390980959, "loss_ce": 0.00014649789955001324, "loss_xval": 0.0101318359375, "num_input_tokens_seen": 98881856, "step": 1430 }, { "epoch": 89.4375, "grad_norm": 7.673367194062242, "learning_rate": 5e-05, "loss": 0.0088, "num_input_tokens_seen": 98953536, "step": 1431 }, { "epoch": 89.4375, "loss": 0.009366064332425594, "loss_ce": 0.00014975547674112022, "loss_xval": 0.00921630859375, "num_input_tokens_seen": 98953536, "step": 1431 }, { "epoch": 89.5, "grad_norm": 5.9578760054636515, "learning_rate": 5e-05, "loss": 0.0053, "num_input_tokens_seen": 99025088, "step": 1432 }, { "epoch": 89.5, "loss": 0.005480596330016851, "loss_ce": 0.00017053764895536005, "loss_xval": 0.00531005859375, "num_input_tokens_seen": 99025088, "step": 1432 }, { "epoch": 89.5625, "grad_norm": 3.8362523636362607, "learning_rate": 5e-05, "loss": 0.0024, "num_input_tokens_seen": 99084352, "step": 1433 }, { "epoch": 89.5625, "loss": 0.002257879823446274, "loss_ce": 0.00018268443818669766, "loss_xval": 0.0020751953125, "num_input_tokens_seen": 99084352, "step": 1433 }, { "epoch": 89.625, "grad_norm": 1.6603316778373378, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 99143360, "step": 1434 }, { "epoch": 89.625, "loss": 0.0008298221509903669, "loss_ce": 0.00017750888946466148, "loss_xval": 0.000652313232421875, "num_input_tokens_seen": 99143360, "step": 1434 }, { "epoch": 89.6875, "grad_norm": 0.25305967288664716, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 99202432, "step": 1435 }, { "epoch": 89.6875, "loss": 0.0005389668513089418, "loss_ce": 0.0001765706401783973, "loss_xval": 0.000362396240234375, "num_input_tokens_seen": 99202432, "step": 1435 }, { "epoch": 89.75, "grad_norm": 1.7938476568634658, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 99274176, "step": 1436 }, { "epoch": 89.75, "loss": 0.001010698964819312, "loss_ce": 0.00018672439910005778, "loss_xval": 0.000823974609375, "num_input_tokens_seen": 99274176, "step": 1436 }, { "epoch": 89.8125, "grad_norm": 3.1259805404663594, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 99345856, "step": 1437 }, { "epoch": 89.8125, "loss": 0.001759349019266665, "loss_ce": 0.00018769371672533453, "loss_xval": 0.0015716552734375, "num_input_tokens_seen": 99345856, "step": 1437 }, { "epoch": 89.875, "grad_norm": 4.692768165309171, "learning_rate": 5e-05, "loss": 0.0036, "num_input_tokens_seen": 99417408, "step": 1438 }, { "epoch": 89.875, "loss": 0.0036474072840064764, "loss_ce": 0.00016840336320456117, "loss_xval": 0.00347900390625, "num_input_tokens_seen": 99417408, "step": 1438 }, { "epoch": 89.9375, "grad_norm": 6.687045552794276, "learning_rate": 5e-05, "loss": 0.0067, "num_input_tokens_seen": 99489152, "step": 1439 }, { "epoch": 89.9375, "loss": 0.006873737554997206, "loss_ce": 0.00015987036749720573, "loss_xval": 0.0067138671875, "num_input_tokens_seen": 99489152, "step": 1439 }, { "epoch": 90.0, "grad_norm": 9.163106584929274, "learning_rate": 5e-05, "loss": 0.0123, "num_input_tokens_seen": 99560832, "step": 1440 }, { "epoch": 90.0, "loss": 0.011944948695600033, "loss_ce": 0.00016516332107130438, "loss_xval": 0.01177978515625, "num_input_tokens_seen": 99560832, "step": 1440 }, { "epoch": 90.0625, "grad_norm": 12.669250965673333, "learning_rate": 5e-05, "loss": 0.0229, "num_input_tokens_seen": 99619968, "step": 1441 }, { "epoch": 90.0625, "loss": 0.02285129763185978, "loss_ce": 0.00014621867740061134, "loss_xval": 0.022705078125, "num_input_tokens_seen": 99619968, "step": 1441 }, { "epoch": 90.125, "grad_norm": 16.549156532771427, "learning_rate": 5e-05, "loss": 0.0393, "num_input_tokens_seen": 99691520, "step": 1442 }, { "epoch": 90.125, "loss": 0.03847173973917961, "loss_ce": 0.00014166282198857516, "loss_xval": 0.038330078125, "num_input_tokens_seen": 99691520, "step": 1442 }, { "epoch": 90.1875, "grad_norm": 18.803814282510302, "learning_rate": 5e-05, "loss": 0.0506, "num_input_tokens_seen": 99763264, "step": 1443 }, { "epoch": 90.1875, "loss": 0.0509246364235878, "loss_ce": 0.00014338496839627624, "loss_xval": 0.05078125, "num_input_tokens_seen": 99763264, "step": 1443 }, { "epoch": 90.25, "grad_norm": 16.951817413186063, "learning_rate": 5e-05, "loss": 0.0417, "num_input_tokens_seen": 99822400, "step": 1444 }, { "epoch": 90.25, "loss": 0.039222631603479385, "loss_ce": 0.00016012991545721889, "loss_xval": 0.0390625, "num_input_tokens_seen": 99822400, "step": 1444 }, { "epoch": 90.3125, "grad_norm": 10.18017997837843, "learning_rate": 5e-05, "loss": 0.0156, "num_input_tokens_seen": 99893952, "step": 1445 }, { "epoch": 90.3125, "loss": 0.016544576734304428, "loss_ce": 0.00018715558690018952, "loss_xval": 0.016357421875, "num_input_tokens_seen": 99893952, "step": 1445 }, { "epoch": 90.375, "grad_norm": 0.6092748133457243, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 99965632, "step": 1446 }, { "epoch": 90.375, "loss": 0.0005741502973251045, "loss_ce": 0.0002155687689082697, "loss_xval": 0.00035858154296875, "num_input_tokens_seen": 99965632, "step": 1446 }, { "epoch": 90.4375, "grad_norm": 8.441956699548394, "learning_rate": 5e-05, "loss": 0.0112, "num_input_tokens_seen": 100037248, "step": 1447 }, { "epoch": 90.4375, "loss": 0.012076275423169136, "loss_ce": 0.0002354555472265929, "loss_xval": 0.0118408203125, "num_input_tokens_seen": 100037248, "step": 1447 }, { "epoch": 90.5, "grad_norm": 13.906676576538521, "learning_rate": 5e-05, "loss": 0.0294, "num_input_tokens_seen": 100108928, "step": 1448 }, { "epoch": 90.5, "loss": 0.02933783084154129, "loss_ce": 0.00028509661206044257, "loss_xval": 0.029052734375, "num_input_tokens_seen": 100108928, "step": 1448 }, { "epoch": 90.5625, "grad_norm": 13.606772895897334, "learning_rate": 5e-05, "loss": 0.0282, "num_input_tokens_seen": 100180608, "step": 1449 }, { "epoch": 90.5625, "loss": 0.027996975928544998, "loss_ce": 0.0002870155731216073, "loss_xval": 0.0277099609375, "num_input_tokens_seen": 100180608, "step": 1449 }, { "epoch": 90.625, "grad_norm": 7.634030291353978, "learning_rate": 5e-05, "loss": 0.0096, "num_input_tokens_seen": 100252224, "step": 1450 }, { "epoch": 90.625, "loss": 0.009694787673652172, "loss_ce": 0.00029537358204834163, "loss_xval": 0.0093994140625, "num_input_tokens_seen": 100252224, "step": 1450 }, { "epoch": 90.6875, "grad_norm": 1.3682169589201283, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 100311232, "step": 1451 }, { "epoch": 90.6875, "loss": 0.0008320095366798341, "loss_ce": 0.00029795191949233413, "loss_xval": 0.0005340576171875, "num_input_tokens_seen": 100311232, "step": 1451 }, { "epoch": 90.75, "grad_norm": 9.032418326904276, "learning_rate": 5e-05, "loss": 0.0128, "num_input_tokens_seen": 100370432, "step": 1452 }, { "epoch": 90.75, "loss": 0.013181054033339024, "loss_ce": 0.0003026359772775322, "loss_xval": 0.01287841796875, "num_input_tokens_seen": 100370432, "step": 1452 }, { "epoch": 90.8125, "grad_norm": 11.59426157318603, "learning_rate": 5e-05, "loss": 0.0208, "num_input_tokens_seen": 100442112, "step": 1453 }, { "epoch": 90.8125, "loss": 0.022128943353891373, "loss_ce": 0.0002783581439871341, "loss_xval": 0.0218505859375, "num_input_tokens_seen": 100442112, "step": 1453 }, { "epoch": 90.875, "grad_norm": 8.386193910116718, "learning_rate": 5e-05, "loss": 0.0112, "num_input_tokens_seen": 100501248, "step": 1454 }, { "epoch": 90.875, "loss": 0.010896342806518078, "loss_ce": 0.0002762254443950951, "loss_xval": 0.0106201171875, "num_input_tokens_seen": 100501248, "step": 1454 }, { "epoch": 90.9375, "grad_norm": 2.1259181598992742, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 100572800, "step": 1455 }, { "epoch": 90.9375, "loss": 0.0011982765281572938, "loss_ce": 0.00027130506350658834, "loss_xval": 0.000926971435546875, "num_input_tokens_seen": 100572800, "step": 1455 }, { "epoch": 91.0, "grad_norm": 3.138463209234726, "learning_rate": 5e-05, "loss": 0.0021, "num_input_tokens_seen": 100644480, "step": 1456 }, { "epoch": 91.0, "loss": 0.0022985711693763733, "loss_ce": 0.0002538934932090342, "loss_xval": 0.002044677734375, "num_input_tokens_seen": 100644480, "step": 1456 }, { "epoch": 91.0625, "grad_norm": 4.981874046721102, "learning_rate": 5e-05, "loss": 0.0042, "num_input_tokens_seen": 100716096, "step": 1457 }, { "epoch": 91.0625, "loss": 0.004420939367264509, "loss_ce": 0.00024003109137993306, "loss_xval": 0.004180908203125, "num_input_tokens_seen": 100716096, "step": 1457 }, { "epoch": 91.125, "grad_norm": 3.6465351426806363, "learning_rate": 5e-05, "loss": 0.0024, "num_input_tokens_seen": 100787712, "step": 1458 }, { "epoch": 91.125, "loss": 0.002575915539637208, "loss_ce": 0.00022606206766795367, "loss_xval": 0.002349853515625, "num_input_tokens_seen": 100787712, "step": 1458 }, { "epoch": 91.1875, "grad_norm": 0.5411738473532007, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 100859264, "step": 1459 }, { "epoch": 91.1875, "loss": 0.0013069550041109324, "loss_ce": 0.0001930634316522628, "loss_xval": 0.0011138916015625, "num_input_tokens_seen": 100859264, "step": 1459 }, { "epoch": 91.25, "grad_norm": 3.3943456146166944, "learning_rate": 5e-05, "loss": 0.0021, "num_input_tokens_seen": 100930816, "step": 1460 }, { "epoch": 91.25, "loss": 0.0021058768033981323, "loss_ce": 0.00020615763787645847, "loss_xval": 0.00189971923828125, "num_input_tokens_seen": 100930816, "step": 1460 }, { "epoch": 91.3125, "grad_norm": 6.824281318521774, "learning_rate": 5e-05, "loss": 0.0071, "num_input_tokens_seen": 101002432, "step": 1461 }, { "epoch": 91.3125, "loss": 0.006831796374171972, "loss_ce": 0.0001789645612007007, "loss_xval": 0.00665283203125, "num_input_tokens_seen": 101002432, "step": 1461 }, { "epoch": 91.375, "grad_norm": 8.703061439614308, "learning_rate": 5e-05, "loss": 0.0119, "num_input_tokens_seen": 101061632, "step": 1462 }, { "epoch": 91.375, "loss": 0.011036021634936333, "loss_ce": 0.00017176421533804387, "loss_xval": 0.0108642578125, "num_input_tokens_seen": 101061632, "step": 1462 }, { "epoch": 91.4375, "grad_norm": 8.61245755703454, "learning_rate": 5e-05, "loss": 0.0116, "num_input_tokens_seen": 101120704, "step": 1463 }, { "epoch": 91.4375, "loss": 0.011173286475241184, "loss_ce": 0.0001869582774816081, "loss_xval": 0.010986328125, "num_input_tokens_seen": 101120704, "step": 1463 }, { "epoch": 91.5, "grad_norm": 6.233009274737832, "learning_rate": 5e-05, "loss": 0.0061, "num_input_tokens_seen": 101192256, "step": 1464 }, { "epoch": 91.5, "loss": 0.005891819950193167, "loss_ce": 0.000185032666195184, "loss_xval": 0.005706787109375, "num_input_tokens_seen": 101192256, "step": 1464 }, { "epoch": 91.5625, "grad_norm": 2.021738887516537, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 101263872, "step": 1465 }, { "epoch": 91.5625, "loss": 0.0013423706404864788, "loss_ce": 0.00020559091353788972, "loss_xval": 0.00113677978515625, "num_input_tokens_seen": 101263872, "step": 1465 }, { "epoch": 91.625, "grad_norm": 2.568438390324364, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 101335424, "step": 1466 }, { "epoch": 91.625, "loss": 0.0018030558712780476, "loss_ce": 0.00021614175057038665, "loss_xval": 0.0015869140625, "num_input_tokens_seen": 101335424, "step": 1466 }, { "epoch": 91.6875, "grad_norm": 6.616184071416912, "learning_rate": 5e-05, "loss": 0.0073, "num_input_tokens_seen": 101407040, "step": 1467 }, { "epoch": 91.6875, "loss": 0.007182391360402107, "loss_ce": 0.00022438356245402247, "loss_xval": 0.0069580078125, "num_input_tokens_seen": 101407040, "step": 1467 }, { "epoch": 91.75, "grad_norm": 9.052048701564082, "learning_rate": 5e-05, "loss": 0.0133, "num_input_tokens_seen": 101478784, "step": 1468 }, { "epoch": 91.75, "loss": 0.01281468290835619, "loss_ce": 0.00024144031340256333, "loss_xval": 0.0125732421875, "num_input_tokens_seen": 101478784, "step": 1468 }, { "epoch": 91.8125, "grad_norm": 8.216207441201835, "learning_rate": 5e-05, "loss": 0.0112, "num_input_tokens_seen": 101550336, "step": 1469 }, { "epoch": 91.8125, "loss": 0.012428391724824905, "loss_ce": 0.00022136066399980336, "loss_xval": 0.01220703125, "num_input_tokens_seen": 101550336, "step": 1469 }, { "epoch": 91.875, "grad_norm": 4.215063838216148, "learning_rate": 5e-05, "loss": 0.0032, "num_input_tokens_seen": 101622080, "step": 1470 }, { "epoch": 91.875, "loss": 0.0033252511639147997, "loss_ce": 0.00022771699877921492, "loss_xval": 0.0030975341796875, "num_input_tokens_seen": 101622080, "step": 1470 }, { "epoch": 91.9375, "grad_norm": 0.9309762576951055, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 101693760, "step": 1471 }, { "epoch": 91.9375, "loss": 0.00047848786925897, "loss_ce": 0.0002133663947461173, "loss_xval": 0.0002651214599609375, "num_input_tokens_seen": 101693760, "step": 1471 }, { "epoch": 92.0, "grad_norm": 5.291556637101302, "learning_rate": 5e-05, "loss": 0.0048, "num_input_tokens_seen": 101765312, "step": 1472 }, { "epoch": 92.0, "loss": 0.005367064382880926, "loss_ce": 0.0002095934614771977, "loss_xval": 0.005157470703125, "num_input_tokens_seen": 101765312, "step": 1472 }, { "epoch": 92.0625, "grad_norm": 8.002296405575832, "learning_rate": 5e-05, "loss": 0.0104, "num_input_tokens_seen": 101837056, "step": 1473 }, { "epoch": 92.0625, "loss": 0.010928795672953129, "loss_ce": 0.0001866081147454679, "loss_xval": 0.0107421875, "num_input_tokens_seen": 101837056, "step": 1473 }, { "epoch": 92.125, "grad_norm": 8.903838830767665, "learning_rate": 5e-05, "loss": 0.0126, "num_input_tokens_seen": 101908672, "step": 1474 }, { "epoch": 92.125, "loss": 0.01206380594521761, "loss_ce": 0.0001619504182599485, "loss_xval": 0.01190185546875, "num_input_tokens_seen": 101908672, "step": 1474 }, { "epoch": 92.1875, "grad_norm": 7.604459502826674, "learning_rate": 5e-05, "loss": 0.0093, "num_input_tokens_seen": 101980416, "step": 1475 }, { "epoch": 92.1875, "loss": 0.009634369052946568, "loss_ce": 0.00017392008157912642, "loss_xval": 0.00946044921875, "num_input_tokens_seen": 101980416, "step": 1475 }, { "epoch": 92.25, "grad_norm": 4.488691635855987, "learning_rate": 5e-05, "loss": 0.0034, "num_input_tokens_seen": 102052032, "step": 1476 }, { "epoch": 92.25, "loss": 0.0033483817242085934, "loss_ce": 0.0001592948829056695, "loss_xval": 0.0031890869140625, "num_input_tokens_seen": 102052032, "step": 1476 }, { "epoch": 92.3125, "grad_norm": 1.0491006631465516, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 102123712, "step": 1477 }, { "epoch": 92.3125, "loss": 0.00053489237325266, "loss_ce": 0.0001591446780366823, "loss_xval": 0.0003757476806640625, "num_input_tokens_seen": 102123712, "step": 1477 }, { "epoch": 92.375, "grad_norm": 1.5456909708775723, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 102170240, "step": 1478 }, { "epoch": 92.375, "loss": 0.0006626353715546429, "loss_ce": 0.00015528064977843314, "loss_xval": 0.000507354736328125, "num_input_tokens_seen": 102170240, "step": 1478 }, { "epoch": 92.4375, "grad_norm": 3.0534220131793335, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 102229376, "step": 1479 }, { "epoch": 92.4375, "loss": 0.0015860440907999873, "loss_ce": 0.00015171787526924163, "loss_xval": 0.001434326171875, "num_input_tokens_seen": 102229376, "step": 1479 }, { "epoch": 92.5, "grad_norm": 3.6563123643106765, "learning_rate": 5e-05, "loss": 0.0024, "num_input_tokens_seen": 102288512, "step": 1480 }, { "epoch": 92.5, "loss": 0.0022240960970520973, "loss_ce": 0.00013364202459342778, "loss_xval": 0.0020904541015625, "num_input_tokens_seen": 102288512, "step": 1480 }, { "epoch": 92.5625, "grad_norm": 3.5907319476119204, "learning_rate": 5e-05, "loss": 0.0023, "num_input_tokens_seen": 102360064, "step": 1481 }, { "epoch": 92.5625, "loss": 0.002281162654981017, "loss_ce": 0.00014493215712718666, "loss_xval": 0.00213623046875, "num_input_tokens_seen": 102360064, "step": 1481 }, { "epoch": 92.625, "grad_norm": 3.197964126736755, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 102419072, "step": 1482 }, { "epoch": 92.625, "loss": 0.0018821260891854763, "loss_ce": 0.00012736536154989153, "loss_xval": 0.0017547607421875, "num_input_tokens_seen": 102419072, "step": 1482 }, { "epoch": 92.6875, "grad_norm": 2.5057744516558538, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 102490624, "step": 1483 }, { "epoch": 92.6875, "loss": 0.0014125545276328921, "loss_ce": 0.00013081621727906168, "loss_xval": 0.00128173828125, "num_input_tokens_seen": 102490624, "step": 1483 }, { "epoch": 92.75, "grad_norm": 2.006927847620975, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 102562368, "step": 1484 }, { "epoch": 92.75, "loss": 0.0013389154337346554, "loss_ce": 0.00012584170326590538, "loss_xval": 0.00121307373046875, "num_input_tokens_seen": 102562368, "step": 1484 }, { "epoch": 92.8125, "grad_norm": 2.350273933391494, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 102608960, "step": 1485 }, { "epoch": 92.8125, "loss": 0.0011312231654301286, "loss_ce": 0.00010888425458688289, "loss_xval": 0.0010223388671875, "num_input_tokens_seen": 102608960, "step": 1485 }, { "epoch": 92.875, "grad_norm": 3.297160408486596, "learning_rate": 5e-05, "loss": 0.002, "num_input_tokens_seen": 102680576, "step": 1486 }, { "epoch": 92.875, "loss": 0.0019285711459815502, "loss_ce": 0.00011277526937192306, "loss_xval": 0.0018157958984375, "num_input_tokens_seen": 102680576, "step": 1486 }, { "epoch": 92.9375, "grad_norm": 4.720119891390996, "learning_rate": 5e-05, "loss": 0.0037, "num_input_tokens_seen": 102752192, "step": 1487 }, { "epoch": 92.9375, "loss": 0.0036649664398282766, "loss_ce": 0.00010966858826577663, "loss_xval": 0.0035552978515625, "num_input_tokens_seen": 102752192, "step": 1487 }, { "epoch": 93.0, "grad_norm": 6.9058946446826495, "learning_rate": 5e-05, "loss": 0.0075, "num_input_tokens_seen": 102823872, "step": 1488 }, { "epoch": 93.0, "loss": 0.007736025843769312, "loss_ce": 0.00010663115972420201, "loss_xval": 0.00762939453125, "num_input_tokens_seen": 102823872, "step": 1488 }, { "epoch": 93.0625, "grad_norm": 9.544356321091449, "learning_rate": 5e-05, "loss": 0.0142, "num_input_tokens_seen": 102895616, "step": 1489 }, { "epoch": 93.0625, "loss": 0.014089690521359444, "loss_ce": 0.0001126401184592396, "loss_xval": 0.01397705078125, "num_input_tokens_seen": 102895616, "step": 1489 }, { "epoch": 93.125, "grad_norm": 12.144914124818996, "learning_rate": 5e-05, "loss": 0.0237, "num_input_tokens_seen": 102967296, "step": 1490 }, { "epoch": 93.125, "loss": 0.02307068556547165, "loss_ce": 0.00012146684457547963, "loss_xval": 0.02294921875, "num_input_tokens_seen": 102967296, "step": 1490 }, { "epoch": 93.1875, "grad_norm": 13.277750354610223, "learning_rate": 5e-05, "loss": 0.0293, "num_input_tokens_seen": 103038912, "step": 1491 }, { "epoch": 93.1875, "loss": 0.02955476939678192, "loss_ce": 0.000135824506287463, "loss_xval": 0.0294189453125, "num_input_tokens_seen": 103038912, "step": 1491 }, { "epoch": 93.25, "grad_norm": 11.655739983208555, "learning_rate": 5e-05, "loss": 0.0228, "num_input_tokens_seen": 103110464, "step": 1492 }, { "epoch": 93.25, "loss": 0.02409106306731701, "loss_ce": 0.000165281118825078, "loss_xval": 0.02392578125, "num_input_tokens_seen": 103110464, "step": 1492 }, { "epoch": 93.3125, "grad_norm": 8.403740244338909, "learning_rate": 5e-05, "loss": 0.012, "num_input_tokens_seen": 103182080, "step": 1493 }, { "epoch": 93.3125, "loss": 0.012799791991710663, "loss_ce": 0.00022654981876257807, "loss_xval": 0.0125732421875, "num_input_tokens_seen": 103182080, "step": 1493 }, { "epoch": 93.375, "grad_norm": 4.172359561436788, "learning_rate": 5e-05, "loss": 0.0038, "num_input_tokens_seen": 103253760, "step": 1494 }, { "epoch": 93.375, "loss": 0.004045870620757341, "loss_ce": 0.00021591474069282413, "loss_xval": 0.0038299560546875, "num_input_tokens_seen": 103253760, "step": 1494 }, { "epoch": 93.4375, "grad_norm": 0.447412754121555, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 103325440, "step": 1495 }, { "epoch": 93.4375, "loss": 0.0008496865048073232, "loss_ce": 0.00022407615324482322, "loss_xval": 0.0006256103515625, "num_input_tokens_seen": 103325440, "step": 1495 }, { "epoch": 93.5, "grad_norm": 3.8982735632213625, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 103397120, "step": 1496 }, { "epoch": 93.5, "loss": 0.002992264460772276, "loss_ce": 0.00021516485139727592, "loss_xval": 0.002777099609375, "num_input_tokens_seen": 103397120, "step": 1496 }, { "epoch": 93.5625, "grad_norm": 6.281101491606525, "learning_rate": 5e-05, "loss": 0.007, "num_input_tokens_seen": 103456320, "step": 1497 }, { "epoch": 93.5625, "loss": 0.0074848029762506485, "loss_ce": 0.00019110173161607236, "loss_xval": 0.007293701171875, "num_input_tokens_seen": 103456320, "step": 1497 }, { "epoch": 93.625, "grad_norm": 7.413049338313138, "learning_rate": 5e-05, "loss": 0.0096, "num_input_tokens_seen": 103527936, "step": 1498 }, { "epoch": 93.625, "loss": 0.00933548528701067, "loss_ce": 0.00018021151481661946, "loss_xval": 0.0091552734375, "num_input_tokens_seen": 103527936, "step": 1498 }, { "epoch": 93.6875, "grad_norm": 7.626232550227615, "learning_rate": 5e-05, "loss": 0.01, "num_input_tokens_seen": 103599680, "step": 1499 }, { "epoch": 93.6875, "loss": 0.010206174105405807, "loss_ce": 0.00019640830578282475, "loss_xval": 0.010009765625, "num_input_tokens_seen": 103599680, "step": 1499 }, { "epoch": 93.75, "grad_norm": 7.359830796258682, "learning_rate": 5e-05, "loss": 0.0094, "num_input_tokens_seen": 103671424, "step": 1500 }, { "epoch": 93.75, "eval_synth_IoU": 0.014503192156553268, "eval_synth_MAE_x": 0.064971923828125, "eval_synth_MAE_y": 0.065216064453125, "eval_synth_NUM_probability": 0.9975821077823639, "eval_synth_inside_bbox": 0.0625, "eval_synth_loss": 0.006595642305910587, "eval_synth_loss_ce": 0.00020220893929945305, "eval_synth_loss_xval": 0.0063934326171875, "eval_synth_runtime": 53.8671, "eval_synth_samples_per_second": 2.376, "eval_synth_steps_per_second": 0.074, "num_input_tokens_seen": 103671424, "step": 1500 }, { "epoch": 93.75, "loss": 0.0063739619217813015, "loss_ce": 0.0002094111987389624, "loss_xval": 0.00616455078125, "num_input_tokens_seen": 103671424, "step": 1500 }, { "epoch": 93.8125, "grad_norm": 5.815031205342289, "learning_rate": 5e-05, "loss": 0.0061, "num_input_tokens_seen": 103743104, "step": 1501 }, { "epoch": 93.8125, "loss": 0.005885062273591757, "loss_ce": 0.00020879291696473956, "loss_xval": 0.00567626953125, "num_input_tokens_seen": 103743104, "step": 1501 }, { "epoch": 93.875, "grad_norm": 2.802506755096174, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 103814720, "step": 1502 }, { "epoch": 93.875, "loss": 0.0018814080394804478, "loss_ce": 0.00021819998801220208, "loss_xval": 0.0016632080078125, "num_input_tokens_seen": 103814720, "step": 1502 }, { "epoch": 93.9375, "grad_norm": 0.7991744215551432, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 103886400, "step": 1503 }, { "epoch": 93.9375, "loss": 0.0005315585294738412, "loss_ce": 0.0002130313077941537, "loss_xval": 0.0003185272216796875, "num_input_tokens_seen": 103886400, "step": 1503 }, { "epoch": 94.0, "grad_norm": 4.320325374935247, "learning_rate": 5e-05, "loss": 0.0036, "num_input_tokens_seen": 103957952, "step": 1504 }, { "epoch": 94.0, "loss": 0.0035868578124791384, "loss_ce": 0.00021466535690706223, "loss_xval": 0.0033721923828125, "num_input_tokens_seen": 103957952, "step": 1504 }, { "epoch": 94.0625, "grad_norm": 7.803107358622946, "learning_rate": 5e-05, "loss": 0.0105, "num_input_tokens_seen": 104017152, "step": 1505 }, { "epoch": 94.0625, "loss": 0.010636304505169392, "loss_ce": 0.00019929290283471346, "loss_xval": 0.01043701171875, "num_input_tokens_seen": 104017152, "step": 1505 }, { "epoch": 94.125, "grad_norm": 11.06673981346744, "learning_rate": 5e-05, "loss": 0.0201, "num_input_tokens_seen": 104088832, "step": 1506 }, { "epoch": 94.125, "loss": 0.0195931363850832, "loss_ce": 0.00018395722145214677, "loss_xval": 0.0194091796875, "num_input_tokens_seen": 104088832, "step": 1506 }, { "epoch": 94.1875, "grad_norm": 12.747970197272299, "learning_rate": 5e-05, "loss": 0.0263, "num_input_tokens_seen": 104160576, "step": 1507 }, { "epoch": 94.1875, "loss": 0.02580292895436287, "loss_ce": 0.00016816369316074997, "loss_xval": 0.025634765625, "num_input_tokens_seen": 104160576, "step": 1507 }, { "epoch": 94.25, "grad_norm": 11.777468484766578, "learning_rate": 5e-05, "loss": 0.0234, "num_input_tokens_seen": 104232192, "step": 1508 }, { "epoch": 94.25, "loss": 0.02337353304028511, "loss_ce": 0.0001801743928808719, "loss_xval": 0.023193359375, "num_input_tokens_seen": 104232192, "step": 1508 }, { "epoch": 94.3125, "grad_norm": 8.40786241418314, "learning_rate": 5e-05, "loss": 0.0126, "num_input_tokens_seen": 104291264, "step": 1509 }, { "epoch": 94.3125, "loss": 0.014077544212341309, "loss_ce": 0.00016152839816641062, "loss_xval": 0.013916015625, "num_input_tokens_seen": 104291264, "step": 1509 }, { "epoch": 94.375, "grad_norm": 3.4373041650499183, "learning_rate": 5e-05, "loss": 0.0023, "num_input_tokens_seen": 104350336, "step": 1510 }, { "epoch": 94.375, "loss": 0.0023855778854340315, "loss_ce": 0.0002035711077041924, "loss_xval": 0.0021820068359375, "num_input_tokens_seen": 104350336, "step": 1510 }, { "epoch": 94.4375, "grad_norm": 1.8541259301263842, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 104421888, "step": 1511 }, { "epoch": 94.4375, "loss": 0.0016115899197757244, "loss_ce": 0.0002382989041507244, "loss_xval": 0.001373291015625, "num_input_tokens_seen": 104421888, "step": 1511 }, { "epoch": 94.5, "grad_norm": 6.309518960354388, "learning_rate": 5e-05, "loss": 0.0083, "num_input_tokens_seen": 104493440, "step": 1512 }, { "epoch": 94.5, "loss": 0.00801610667258501, "loss_ce": 0.0002646417706273496, "loss_xval": 0.00775146484375, "num_input_tokens_seen": 104493440, "step": 1512 }, { "epoch": 94.5625, "grad_norm": 9.278758980180998, "learning_rate": 5e-05, "loss": 0.0152, "num_input_tokens_seen": 104552576, "step": 1513 }, { "epoch": 94.5625, "loss": 0.014277543872594833, "loss_ce": 0.0002394582115812227, "loss_xval": 0.0140380859375, "num_input_tokens_seen": 104552576, "step": 1513 }, { "epoch": 94.625, "grad_norm": 10.669631347212485, "learning_rate": 5e-05, "loss": 0.0201, "num_input_tokens_seen": 104624256, "step": 1514 }, { "epoch": 94.625, "loss": 0.01994253136217594, "loss_ce": 0.00028921186458319426, "loss_xval": 0.0196533203125, "num_input_tokens_seen": 104624256, "step": 1514 }, { "epoch": 94.6875, "grad_norm": 9.71471363982105, "learning_rate": 5e-05, "loss": 0.0167, "num_input_tokens_seen": 104695808, "step": 1515 }, { "epoch": 94.6875, "loss": 0.016933368518948555, "loss_ce": 0.0002097350952681154, "loss_xval": 0.0167236328125, "num_input_tokens_seen": 104695808, "step": 1515 }, { "epoch": 94.75, "grad_norm": 5.316889455778196, "learning_rate": 5e-05, "loss": 0.0056, "num_input_tokens_seen": 104767488, "step": 1516 }, { "epoch": 94.75, "loss": 0.00562123442068696, "loss_ce": 0.00018910533981397748, "loss_xval": 0.00543212890625, "num_input_tokens_seen": 104767488, "step": 1516 }, { "epoch": 94.8125, "grad_norm": 0.8871147780921804, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 104839168, "step": 1517 }, { "epoch": 94.8125, "loss": 0.000770560756791383, "loss_ce": 0.00017928268061950803, "loss_xval": 0.000591278076171875, "num_input_tokens_seen": 104839168, "step": 1517 }, { "epoch": 94.875, "grad_norm": 6.706770298722494, "learning_rate": 5e-05, "loss": 0.0087, "num_input_tokens_seen": 104910720, "step": 1518 }, { "epoch": 94.875, "loss": 0.007364541292190552, "loss_ce": 0.0001623926218599081, "loss_xval": 0.0072021484375, "num_input_tokens_seen": 104910720, "step": 1518 }, { "epoch": 94.9375, "grad_norm": 11.540691922864541, "learning_rate": 5e-05, "loss": 0.0225, "num_input_tokens_seen": 104982336, "step": 1519 }, { "epoch": 94.9375, "loss": 0.022339239716529846, "loss_ce": 0.0001224419247591868, "loss_xval": 0.022216796875, "num_input_tokens_seen": 104982336, "step": 1519 }, { "epoch": 95.0, "grad_norm": 13.257087366622384, "learning_rate": 5e-05, "loss": 0.0303, "num_input_tokens_seen": 105053888, "step": 1520 }, { "epoch": 95.0, "loss": 0.030590539798140526, "loss_ce": 0.00019503226212691516, "loss_xval": 0.0303955078125, "num_input_tokens_seen": 105053888, "step": 1520 }, { "epoch": 95.0625, "grad_norm": 11.574854226916827, "learning_rate": 5e-05, "loss": 0.0233, "num_input_tokens_seen": 105125504, "step": 1521 }, { "epoch": 95.0625, "loss": 0.022616026923060417, "loss_ce": 0.00027716063777916133, "loss_xval": 0.0223388671875, "num_input_tokens_seen": 105125504, "step": 1521 }, { "epoch": 95.125, "grad_norm": 6.765160516068692, "learning_rate": 5e-05, "loss": 0.0081, "num_input_tokens_seen": 105197056, "step": 1522 }, { "epoch": 95.125, "loss": 0.008212330751121044, "loss_ce": 0.00021672547154594213, "loss_xval": 0.00799560546875, "num_input_tokens_seen": 105197056, "step": 1522 }, { "epoch": 95.1875, "grad_norm": 0.9655238738340344, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 105268672, "step": 1523 }, { "epoch": 95.1875, "loss": 0.0006662615924142301, "loss_ce": 0.00021231263235677034, "loss_xval": 0.000453948974609375, "num_input_tokens_seen": 105268672, "step": 1523 }, { "epoch": 95.25, "grad_norm": 7.545374574995911, "learning_rate": 5e-05, "loss": 0.01, "num_input_tokens_seen": 105340288, "step": 1524 }, { "epoch": 95.25, "loss": 0.010226485319435596, "loss_ce": 0.00021672008733730763, "loss_xval": 0.010009765625, "num_input_tokens_seen": 105340288, "step": 1524 }, { "epoch": 95.3125, "grad_norm": 9.974132967357392, "learning_rate": 5e-05, "loss": 0.0166, "num_input_tokens_seen": 105411904, "step": 1525 }, { "epoch": 95.3125, "loss": 0.016069557517766953, "loss_ce": 0.00020041617972310632, "loss_xval": 0.015869140625, "num_input_tokens_seen": 105411904, "step": 1525 }, { "epoch": 95.375, "grad_norm": 8.400691776915371, "learning_rate": 5e-05, "loss": 0.0124, "num_input_tokens_seen": 105483520, "step": 1526 }, { "epoch": 95.375, "loss": 0.012161885388195515, "loss_ce": 0.00019899505423381925, "loss_xval": 0.011962890625, "num_input_tokens_seen": 105483520, "step": 1526 }, { "epoch": 95.4375, "grad_norm": 3.1287308542841585, "learning_rate": 5e-05, "loss": 0.0024, "num_input_tokens_seen": 105555136, "step": 1527 }, { "epoch": 95.4375, "loss": 0.002447618404403329, "loss_ce": 0.0002198353031417355, "loss_xval": 0.002227783203125, "num_input_tokens_seen": 105555136, "step": 1527 }, { "epoch": 95.5, "grad_norm": 3.333597100843683, "learning_rate": 5e-05, "loss": 0.0026, "num_input_tokens_seen": 105626752, "step": 1528 }, { "epoch": 95.5, "loss": 0.0024574645794928074, "loss_ce": 0.0002449401072226465, "loss_xval": 0.0022125244140625, "num_input_tokens_seen": 105626752, "step": 1528 }, { "epoch": 95.5625, "grad_norm": 7.238995915388532, "learning_rate": 5e-05, "loss": 0.0095, "num_input_tokens_seen": 105698304, "step": 1529 }, { "epoch": 95.5625, "loss": 0.008781913667917252, "loss_ce": 0.0002369913418078795, "loss_xval": 0.008544921875, "num_input_tokens_seen": 105698304, "step": 1529 }, { "epoch": 95.625, "grad_norm": 7.5002230114016655, "learning_rate": 5e-05, "loss": 0.0105, "num_input_tokens_seen": 105769856, "step": 1530 }, { "epoch": 95.625, "loss": 0.01023135520517826, "loss_ce": 0.00028262505657039583, "loss_xval": 0.00994873046875, "num_input_tokens_seen": 105769856, "step": 1530 }, { "epoch": 95.6875, "grad_norm": 4.072986944781151, "learning_rate": 5e-05, "loss": 0.0035, "num_input_tokens_seen": 105841472, "step": 1531 }, { "epoch": 95.6875, "loss": 0.0039795455522835255, "loss_ce": 0.0002869187155738473, "loss_xval": 0.003692626953125, "num_input_tokens_seen": 105841472, "step": 1531 }, { "epoch": 95.75, "grad_norm": 1.6817132275661484, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 105900544, "step": 1532 }, { "epoch": 95.75, "loss": 0.0010310329962521791, "loss_ce": 0.00031386996852234006, "loss_xval": 0.0007171630859375, "num_input_tokens_seen": 105900544, "step": 1532 }, { "epoch": 95.8125, "grad_norm": 6.534209892896805, "learning_rate": 5e-05, "loss": 0.0082, "num_input_tokens_seen": 105959552, "step": 1533 }, { "epoch": 95.8125, "loss": 0.008419894613325596, "loss_ce": 0.0003022188611794263, "loss_xval": 0.00811767578125, "num_input_tokens_seen": 105959552, "step": 1533 }, { "epoch": 95.875, "grad_norm": 7.322113681309952, "learning_rate": 5e-05, "loss": 0.0103, "num_input_tokens_seen": 106018752, "step": 1534 }, { "epoch": 95.875, "loss": 0.010474001057446003, "loss_ce": 0.00028112975996918976, "loss_xval": 0.01019287109375, "num_input_tokens_seen": 106018752, "step": 1534 }, { "epoch": 95.9375, "grad_norm": 3.3700594012413823, "learning_rate": 5e-05, "loss": 0.0028, "num_input_tokens_seen": 106090432, "step": 1535 }, { "epoch": 95.9375, "loss": 0.002817351371049881, "loss_ce": 0.000269133597612381, "loss_xval": 0.0025482177734375, "num_input_tokens_seen": 106090432, "step": 1535 }, { "epoch": 96.0, "grad_norm": 2.0784202889921293, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 106149632, "step": 1536 }, { "epoch": 96.0, "loss": 0.0012789510656148195, "loss_ce": 0.00027950035291723907, "loss_xval": 0.00099945068359375, "num_input_tokens_seen": 106149632, "step": 1536 }, { "epoch": 96.0625, "grad_norm": 5.73115808037762, "learning_rate": 5e-05, "loss": 0.0067, "num_input_tokens_seen": 106221184, "step": 1537 }, { "epoch": 96.0625, "loss": 0.007430717349052429, "loss_ce": 0.00022856892610434443, "loss_xval": 0.0072021484375, "num_input_tokens_seen": 106221184, "step": 1537 }, { "epoch": 96.125, "grad_norm": 5.778802439492285, "learning_rate": 5e-05, "loss": 0.0068, "num_input_tokens_seen": 106292800, "step": 1538 }, { "epoch": 96.125, "loss": 0.00656101806089282, "loss_ce": 0.00018284417456015944, "loss_xval": 0.006378173828125, "num_input_tokens_seen": 106292800, "step": 1538 }, { "epoch": 96.1875, "grad_norm": 2.748174730459637, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 106351936, "step": 1539 }, { "epoch": 96.1875, "loss": 0.0019633835181593895, "loss_ce": 0.00016284632147289813, "loss_xval": 0.001800537109375, "num_input_tokens_seen": 106351936, "step": 1539 }, { "epoch": 96.25, "grad_norm": 1.075487575940013, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 106423552, "step": 1540 }, { "epoch": 96.25, "loss": 0.0006161235505715013, "loss_ce": 0.0001526378619018942, "loss_xval": 0.0004634857177734375, "num_input_tokens_seen": 106423552, "step": 1540 }, { "epoch": 96.3125, "grad_norm": 4.298651762979521, "learning_rate": 5e-05, "loss": 0.0036, "num_input_tokens_seen": 106495232, "step": 1541 }, { "epoch": 96.3125, "loss": 0.0032136535737663507, "loss_ce": 0.00013137809582985938, "loss_xval": 0.003082275390625, "num_input_tokens_seen": 106495232, "step": 1541 }, { "epoch": 96.375, "grad_norm": 5.9612708922485815, "learning_rate": 5e-05, "loss": 0.0064, "num_input_tokens_seen": 106566848, "step": 1542 }, { "epoch": 96.375, "loss": 0.00659773126244545, "loss_ce": 0.0001280044816667214, "loss_xval": 0.0064697265625, "num_input_tokens_seen": 106566848, "step": 1542 }, { "epoch": 96.4375, "grad_norm": 5.412804621160427, "learning_rate": 5e-05, "loss": 0.0053, "num_input_tokens_seen": 106625984, "step": 1543 }, { "epoch": 96.4375, "loss": 0.005133685190230608, "loss_ce": 9.828498150454834e-05, "loss_xval": 0.005035400390625, "num_input_tokens_seen": 106625984, "step": 1543 }, { "epoch": 96.5, "grad_norm": 3.1861755295026826, "learning_rate": 5e-05, "loss": 0.0026, "num_input_tokens_seen": 106685184, "step": 1544 }, { "epoch": 96.5, "loss": 0.002085526008158922, "loss_ce": 8.662457548780367e-05, "loss_xval": 0.0019989013671875, "num_input_tokens_seen": 106685184, "step": 1544 }, { "epoch": 96.5625, "grad_norm": 0.28285281962099296, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 106756800, "step": 1545 }, { "epoch": 96.5625, "loss": 0.0006361733539961278, "loss_ce": 9.067163045983762e-05, "loss_xval": 0.000545501708984375, "num_input_tokens_seen": 106756800, "step": 1545 }, { "epoch": 96.625, "grad_norm": 2.8705565420322605, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 106828416, "step": 1546 }, { "epoch": 96.625, "loss": 0.001644703559577465, "loss_ce": 8.830706792650744e-05, "loss_xval": 0.001556396484375, "num_input_tokens_seen": 106828416, "step": 1546 }, { "epoch": 96.6875, "grad_norm": 4.6414381546442725, "learning_rate": 5e-05, "loss": 0.0042, "num_input_tokens_seen": 106900032, "step": 1547 }, { "epoch": 96.6875, "loss": 0.004356001503765583, "loss_ce": 8.354033343493938e-05, "loss_xval": 0.0042724609375, "num_input_tokens_seen": 106900032, "step": 1547 }, { "epoch": 96.75, "grad_norm": 4.447232076049476, "learning_rate": 5e-05, "loss": 0.0038, "num_input_tokens_seen": 106959168, "step": 1548 }, { "epoch": 96.75, "loss": 0.003473155666142702, "loss_ce": 8.57045961311087e-05, "loss_xval": 0.003387451171875, "num_input_tokens_seen": 106959168, "step": 1548 }, { "epoch": 96.8125, "grad_norm": 2.9373942860318802, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 107030784, "step": 1549 }, { "epoch": 96.8125, "loss": 0.001669206889346242, "loss_ce": 8.229284139815718e-05, "loss_xval": 0.0015869140625, "num_input_tokens_seen": 107030784, "step": 1549 }, { "epoch": 96.875, "grad_norm": 0.9303865075128541, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 107102464, "step": 1550 }, { "epoch": 96.875, "loss": 0.00041434323065914214, "loss_ce": 7.864987856009975e-05, "loss_xval": 0.000335693359375, "num_input_tokens_seen": 107102464, "step": 1550 }, { "epoch": 96.9375, "grad_norm": 1.138452302146148, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 107174080, "step": 1551 }, { "epoch": 96.9375, "loss": 0.0004917146870866418, "loss_ce": 8.544942102162167e-05, "loss_xval": 0.0004062652587890625, "num_input_tokens_seen": 107174080, "step": 1551 }, { "epoch": 97.0, "grad_norm": 2.6210738731498995, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 107245632, "step": 1552 }, { "epoch": 97.0, "loss": 0.001443982939235866, "loss_ce": 8.595068356953561e-05, "loss_xval": 0.0013580322265625, "num_input_tokens_seen": 107245632, "step": 1552 }, { "epoch": 97.0625, "grad_norm": 3.311687916584373, "learning_rate": 5e-05, "loss": 0.0022, "num_input_tokens_seen": 107317248, "step": 1553 }, { "epoch": 97.0625, "loss": 0.0018252766458317637, "loss_ce": 8.577468543080613e-05, "loss_xval": 0.001739501953125, "num_input_tokens_seen": 107317248, "step": 1553 }, { "epoch": 97.125, "grad_norm": 3.4443077065766827, "learning_rate": 5e-05, "loss": 0.0023, "num_input_tokens_seen": 107388800, "step": 1554 }, { "epoch": 97.125, "loss": 0.002176129724830389, "loss_ce": 8.567561599193141e-05, "loss_xval": 0.0020904541015625, "num_input_tokens_seen": 107388800, "step": 1554 }, { "epoch": 97.1875, "grad_norm": 3.232124177935162, "learning_rate": 5e-05, "loss": 0.0022, "num_input_tokens_seen": 107460480, "step": 1555 }, { "epoch": 97.1875, "loss": 0.002326987450942397, "loss_ce": 8.394538599532098e-05, "loss_xval": 0.0022430419921875, "num_input_tokens_seen": 107460480, "step": 1555 }, { "epoch": 97.25, "grad_norm": 2.5180593657209966, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 107532160, "step": 1556 }, { "epoch": 97.25, "loss": 0.0013235878432169557, "loss_ce": 8.762597281020135e-05, "loss_xval": 0.0012359619140625, "num_input_tokens_seen": 107532160, "step": 1556 }, { "epoch": 97.3125, "grad_norm": 0.9263637828252639, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 107603776, "step": 1557 }, { "epoch": 97.3125, "loss": 0.0005748713156208396, "loss_ce": 8.659008744871244e-05, "loss_xval": 0.00048828125, "num_input_tokens_seen": 107603776, "step": 1557 }, { "epoch": 97.375, "grad_norm": 0.9979947084252878, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 107662784, "step": 1558 }, { "epoch": 97.375, "loss": 0.0005600899457931519, "loss_ce": 8.706748485565186e-05, "loss_xval": 0.0004730224609375, "num_input_tokens_seen": 107662784, "step": 1558 }, { "epoch": 97.4375, "grad_norm": 2.174623829666188, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 107734528, "step": 1559 }, { "epoch": 97.4375, "loss": 0.0013285287423059344, "loss_ce": 8.493742643622681e-05, "loss_xval": 0.00124359130859375, "num_input_tokens_seen": 107734528, "step": 1559 }, { "epoch": 97.5, "grad_norm": 2.704914360205697, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 107806144, "step": 1560 }, { "epoch": 97.5, "loss": 0.0014250859385356307, "loss_ce": 8.231253013946116e-05, "loss_xval": 0.0013427734375, "num_input_tokens_seen": 107806144, "step": 1560 }, { "epoch": 97.5625, "grad_norm": 3.0964133170468706, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 107877824, "step": 1561 }, { "epoch": 97.5625, "loss": 0.0015938394935801625, "loss_ce": 8.321935456478968e-05, "loss_xval": 0.0015106201171875, "num_input_tokens_seen": 107877824, "step": 1561 }, { "epoch": 97.625, "grad_norm": 3.339458327215024, "learning_rate": 5e-05, "loss": 0.0023, "num_input_tokens_seen": 107949376, "step": 1562 }, { "epoch": 97.625, "loss": 0.002547947457060218, "loss_ce": 7.602353434776887e-05, "loss_xval": 0.002471923828125, "num_input_tokens_seen": 107949376, "step": 1562 }, { "epoch": 97.6875, "grad_norm": 3.1582407893570537, "learning_rate": 5e-05, "loss": 0.0022, "num_input_tokens_seen": 108020992, "step": 1563 }, { "epoch": 97.6875, "loss": 0.002089640824124217, "loss_ce": 7.548071152996272e-05, "loss_xval": 0.00201416015625, "num_input_tokens_seen": 108020992, "step": 1563 }, { "epoch": 97.75, "grad_norm": 2.575145553500417, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 108092608, "step": 1564 }, { "epoch": 97.75, "loss": 0.0014476650394499302, "loss_ce": 7.437399472109973e-05, "loss_xval": 0.001373291015625, "num_input_tokens_seen": 108092608, "step": 1564 }, { "epoch": 97.8125, "grad_norm": 1.9222053814262445, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 108164288, "step": 1565 }, { "epoch": 97.8125, "loss": 0.0007965641561895609, "loss_ce": 7.558635115856305e-05, "loss_xval": 0.000720977783203125, "num_input_tokens_seen": 108164288, "step": 1565 }, { "epoch": 97.875, "grad_norm": 1.6299140847499538, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 108235840, "step": 1566 }, { "epoch": 97.875, "loss": 0.0008366935653612018, "loss_ce": 7.375411223620176e-05, "loss_xval": 0.000762939453125, "num_input_tokens_seen": 108235840, "step": 1566 }, { "epoch": 97.9375, "grad_norm": 2.2908526062343384, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 108307392, "step": 1567 }, { "epoch": 97.9375, "loss": 0.0012096071150153875, "loss_ce": 7.282734441105276e-05, "loss_xval": 0.00113677978515625, "num_input_tokens_seen": 108307392, "step": 1567 }, { "epoch": 98.0, "grad_norm": 4.153602107714487, "learning_rate": 5e-05, "loss": 0.0032, "num_input_tokens_seen": 108379008, "step": 1568 }, { "epoch": 98.0, "loss": 0.0032553914934396744, "loss_ce": 8.156333933584392e-05, "loss_xval": 0.003173828125, "num_input_tokens_seen": 108379008, "step": 1568 }, { "epoch": 98.0625, "grad_norm": 7.1710467414549335, "learning_rate": 5e-05, "loss": 0.0092, "num_input_tokens_seen": 108450688, "step": 1569 }, { "epoch": 98.0625, "loss": 0.009295488707721233, "loss_ce": 7.918039773358032e-05, "loss_xval": 0.00921630859375, "num_input_tokens_seen": 108450688, "step": 1569 }, { "epoch": 98.125, "grad_norm": 11.662438031029577, "learning_rate": 5e-05, "loss": 0.0241, "num_input_tokens_seen": 108522304, "step": 1570 }, { "epoch": 98.125, "loss": 0.024007223546504974, "loss_ce": 8.144209277816117e-05, "loss_xval": 0.02392578125, "num_input_tokens_seen": 108522304, "step": 1570 }, { "epoch": 98.1875, "grad_norm": 17.407733059595365, "learning_rate": 5e-05, "loss": 0.0543, "num_input_tokens_seen": 108593856, "step": 1571 }, { "epoch": 98.1875, "loss": 0.055038176476955414, "loss_ce": 0.00010653500794433057, "loss_xval": 0.054931640625, "num_input_tokens_seen": 108593856, "step": 1571 }, { "epoch": 98.25, "grad_norm": 20.927165200757347, "learning_rate": 5e-05, "loss": 0.0795, "num_input_tokens_seen": 108665536, "step": 1572 }, { "epoch": 98.25, "loss": 0.08215904235839844, "loss_ce": 0.00012779254757333547, "loss_xval": 0.08203125, "num_input_tokens_seen": 108665536, "step": 1572 }, { "epoch": 98.3125, "grad_norm": 16.725522171740963, "learning_rate": 5e-05, "loss": 0.0514, "num_input_tokens_seen": 108737152, "step": 1573 }, { "epoch": 98.3125, "loss": 0.048741865903139114, "loss_ce": 0.0001578811788931489, "loss_xval": 0.048583984375, "num_input_tokens_seen": 108737152, "step": 1573 }, { "epoch": 98.375, "grad_norm": 5.127003707099649, "learning_rate": 5e-05, "loss": 0.0054, "num_input_tokens_seen": 108808768, "step": 1574 }, { "epoch": 98.375, "loss": 0.00516909547150135, "loss_ce": 0.00019473010615911335, "loss_xval": 0.004974365234375, "num_input_tokens_seen": 108808768, "step": 1574 }, { "epoch": 98.4375, "grad_norm": 7.22636403212046, "learning_rate": 5e-05, "loss": 0.0106, "num_input_tokens_seen": 108880576, "step": 1575 }, { "epoch": 98.4375, "loss": 0.010036150924861431, "loss_ce": 0.000270525662926957, "loss_xval": 0.009765625, "num_input_tokens_seen": 108880576, "step": 1575 }, { "epoch": 98.5, "grad_norm": 15.10241100748922, "learning_rate": 5e-05, "loss": 0.0419, "num_input_tokens_seen": 108952192, "step": 1576 }, { "epoch": 98.5, "loss": 0.04190199449658394, "loss_ce": 0.00039808667497709394, "loss_xval": 0.04150390625, "num_input_tokens_seen": 108952192, "step": 1576 }, { "epoch": 98.5625, "grad_norm": 14.14376880473154, "learning_rate": 5e-05, "loss": 0.0382, "num_input_tokens_seen": 109023872, "step": 1577 }, { "epoch": 98.5625, "loss": 0.03786517307162285, "loss_ce": 0.0005116576212458313, "loss_xval": 0.037353515625, "num_input_tokens_seen": 109023872, "step": 1577 }, { "epoch": 98.625, "grad_norm": 5.188315109714636, "learning_rate": 5e-05, "loss": 0.0064, "num_input_tokens_seen": 109083072, "step": 1578 }, { "epoch": 98.625, "loss": 0.005667213816195726, "loss_ce": 0.0007233662181533873, "loss_xval": 0.00494384765625, "num_input_tokens_seen": 109083072, "step": 1578 }, { "epoch": 98.6875, "grad_norm": 6.888484289652325, "learning_rate": 5e-05, "loss": 0.0114, "num_input_tokens_seen": 109154816, "step": 1579 }, { "epoch": 98.6875, "loss": 0.011372015811502934, "loss_ce": 0.0008129340712912381, "loss_xval": 0.01055908203125, "num_input_tokens_seen": 109154816, "step": 1579 }, { "epoch": 98.75, "grad_norm": 15.72317232842836, "learning_rate": 5e-05, "loss": 0.0508, "num_input_tokens_seen": 109226368, "step": 1580 }, { "epoch": 98.75, "loss": 0.049659743905067444, "loss_ce": 0.0008316197781823575, "loss_xval": 0.048828125, "num_input_tokens_seen": 109226368, "step": 1580 }, { "epoch": 98.8125, "grad_norm": 13.773559500694164, "learning_rate": 5e-05, "loss": 0.0367, "num_input_tokens_seen": 109298048, "step": 1581 }, { "epoch": 98.8125, "loss": 0.036477576941251755, "loss_ce": 0.00034476383007131517, "loss_xval": 0.0361328125, "num_input_tokens_seen": 109298048, "step": 1581 }, { "epoch": 98.875, "grad_norm": 3.4372123434645796, "learning_rate": 5e-05, "loss": 0.0043, "num_input_tokens_seen": 109369728, "step": 1582 }, { "epoch": 98.875, "loss": 0.003365415846928954, "loss_ce": 0.00025262293638661504, "loss_xval": 0.00311279296875, "num_input_tokens_seen": 109369728, "step": 1582 }, { "epoch": 98.9375, "grad_norm": 7.8152476972547005, "learning_rate": 5e-05, "loss": 0.0114, "num_input_tokens_seen": 109428800, "step": 1583 }, { "epoch": 98.9375, "loss": 0.01139904372394085, "loss_ce": 0.00022961030481383204, "loss_xval": 0.01116943359375, "num_input_tokens_seen": 109428800, "step": 1583 }, { "epoch": 99.0, "grad_norm": 18.65440543806313, "learning_rate": 5e-05, "loss": 0.0614, "num_input_tokens_seen": 109475392, "step": 1584 }, { "epoch": 99.0, "loss": 0.059953078627586365, "loss_ce": 0.0003827674372587353, "loss_xval": 0.0595703125, "num_input_tokens_seen": 109475392, "step": 1584 }, { "epoch": 99.0625, "grad_norm": 23.83843810097326, "learning_rate": 5e-05, "loss": 0.0944, "num_input_tokens_seen": 109546944, "step": 1585 }, { "epoch": 99.0625, "loss": 0.09535447508096695, "loss_ce": 0.0001396313455188647, "loss_xval": 0.09521484375, "num_input_tokens_seen": 109546944, "step": 1585 }, { "epoch": 99.125, "grad_norm": 17.427719670772035, "learning_rate": 5e-05, "loss": 0.0516, "num_input_tokens_seen": 109618688, "step": 1586 }, { "epoch": 99.125, "loss": 0.051441460847854614, "loss_ce": 0.00017192953964695334, "loss_xval": 0.05126953125, "num_input_tokens_seen": 109618688, "step": 1586 }, { "epoch": 99.1875, "grad_norm": 6.166135907550393, "learning_rate": 5e-05, "loss": 0.0097, "num_input_tokens_seen": 109690496, "step": 1587 }, { "epoch": 99.1875, "loss": 0.010387794114649296, "loss_ce": 0.0007442396599799395, "loss_xval": 0.0096435546875, "num_input_tokens_seen": 109690496, "step": 1587 }, { "epoch": 99.25, "grad_norm": 7.035162997340315, "learning_rate": 5e-05, "loss": 0.0145, "num_input_tokens_seen": 109762176, "step": 1588 }, { "epoch": 99.25, "loss": 0.015704365447163582, "loss_ce": 0.0018493849784135818, "loss_xval": 0.01385498046875, "num_input_tokens_seen": 109762176, "step": 1588 }, { "epoch": 99.3125, "grad_norm": 22.907493652120387, "learning_rate": 5e-05, "loss": 0.1104, "num_input_tokens_seen": 109821248, "step": 1589 }, { "epoch": 99.3125, "loss": 0.1109476163983345, "loss_ce": 0.0025491821579635143, "loss_xval": 0.1083984375, "num_input_tokens_seen": 109821248, "step": 1589 }, { "epoch": 99.375, "grad_norm": 27.005672270421268, "learning_rate": 5e-05, "loss": 0.148, "num_input_tokens_seen": 109893056, "step": 1590 }, { "epoch": 99.375, "loss": 0.14455649256706238, "loss_ce": 0.0010018125176429749, "loss_xval": 0.1435546875, "num_input_tokens_seen": 109893056, "step": 1590 }, { "epoch": 99.4375, "grad_norm": 18.030635499664317, "learning_rate": 5e-05, "loss": 0.059, "num_input_tokens_seen": 109952064, "step": 1591 }, { "epoch": 99.4375, "loss": 0.05704944580793381, "loss_ce": 0.00016467941168230027, "loss_xval": 0.056884765625, "num_input_tokens_seen": 109952064, "step": 1591 }, { "epoch": 99.5, "grad_norm": 6.114988855419912, "learning_rate": 5e-05, "loss": 0.0084, "num_input_tokens_seen": 110023680, "step": 1592 }, { "epoch": 99.5, "loss": 0.009071971289813519, "loss_ce": 0.00016083818627521396, "loss_xval": 0.0089111328125, "num_input_tokens_seen": 110023680, "step": 1592 }, { "epoch": 99.5625, "grad_norm": 25.734581206903535, "learning_rate": 5e-05, "loss": 0.1091, "num_input_tokens_seen": 110095424, "step": 1593 }, { "epoch": 99.5625, "loss": 0.11003206670284271, "loss_ce": 0.0001687873445916921, "loss_xval": 0.10986328125, "num_input_tokens_seen": 110095424, "step": 1593 }, { "epoch": 99.625, "grad_norm": 16.73317359001743, "learning_rate": 5e-05, "loss": 0.0489, "num_input_tokens_seen": 110167168, "step": 1594 }, { "epoch": 99.625, "loss": 0.04463024437427521, "loss_ce": 0.00019665084255393595, "loss_xval": 0.04443359375, "num_input_tokens_seen": 110167168, "step": 1594 }, { "epoch": 99.6875, "grad_norm": 5.752505101398109, "learning_rate": 5e-05, "loss": 0.0102, "num_input_tokens_seen": 110238848, "step": 1595 }, { "epoch": 99.6875, "loss": 0.009299618192017078, "loss_ce": 0.00026641503791324794, "loss_xval": 0.009033203125, "num_input_tokens_seen": 110238848, "step": 1595 }, { "epoch": 99.75, "grad_norm": 16.170946032727255, "learning_rate": 5e-05, "loss": 0.0483, "num_input_tokens_seen": 110310464, "step": 1596 }, { "epoch": 99.75, "loss": 0.04650229960680008, "loss_ce": 0.00035972270416095853, "loss_xval": 0.046142578125, "num_input_tokens_seen": 110310464, "step": 1596 }, { "epoch": 99.8125, "grad_norm": 5.235616533671838, "learning_rate": 5e-05, "loss": 0.0061, "num_input_tokens_seen": 110382016, "step": 1597 }, { "epoch": 99.8125, "loss": 0.006151093170046806, "loss_ce": 0.0006579289911314845, "loss_xval": 0.0054931640625, "num_input_tokens_seen": 110382016, "step": 1597 }, { "epoch": 99.875, "grad_norm": 12.563475707020881, "learning_rate": 5e-05, "loss": 0.0318, "num_input_tokens_seen": 110441216, "step": 1598 }, { "epoch": 99.875, "loss": 0.03234691545367241, "loss_ce": 0.0015851972857490182, "loss_xval": 0.03076171875, "num_input_tokens_seen": 110441216, "step": 1598 }, { "epoch": 99.9375, "grad_norm": 14.430502102016037, "learning_rate": 5e-05, "loss": 0.0399, "num_input_tokens_seen": 110512832, "step": 1599 }, { "epoch": 99.9375, "loss": 0.04030544310808182, "loss_ce": 0.0007546603446826339, "loss_xval": 0.03955078125, "num_input_tokens_seen": 110512832, "step": 1599 }, { "epoch": 100.0, "grad_norm": 2.025038232206116, "learning_rate": 5e-05, "loss": 0.002, "num_input_tokens_seen": 110571904, "step": 1600 }, { "epoch": 100.0, "loss": 0.0020840922370553017, "loss_ce": 0.0009473123354837298, "loss_xval": 0.00113677978515625, "num_input_tokens_seen": 110571904, "step": 1600 }, { "epoch": 100.0625, "grad_norm": 12.756881989338398, "learning_rate": 5e-05, "loss": 0.0334, "num_input_tokens_seen": 110643584, "step": 1601 }, { "epoch": 100.0625, "loss": 0.0334443598985672, "loss_ce": 0.0009736571228131652, "loss_xval": 0.032470703125, "num_input_tokens_seen": 110643584, "step": 1601 }, { "epoch": 100.125, "grad_norm": 1.3560502459791763, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 110702720, "step": 1602 }, { "epoch": 100.125, "loss": 0.0013581677339971066, "loss_ce": 0.0006143018254078925, "loss_xval": 0.000743865966796875, "num_input_tokens_seen": 110702720, "step": 1602 }, { "epoch": 100.1875, "grad_norm": 11.49146267669002, "learning_rate": 5e-05, "loss": 0.0282, "num_input_tokens_seen": 110774336, "step": 1603 }, { "epoch": 100.1875, "loss": 0.027602650225162506, "loss_ce": 0.0005030405009165406, "loss_xval": 0.027099609375, "num_input_tokens_seen": 110774336, "step": 1603 }, { "epoch": 100.25, "grad_norm": 2.8865605520789948, "learning_rate": 5e-05, "loss": 0.0032, "num_input_tokens_seen": 110845888, "step": 1604 }, { "epoch": 100.25, "loss": 0.0031700225081294775, "loss_ce": 0.0004844755749218166, "loss_xval": 0.002685546875, "num_input_tokens_seen": 110845888, "step": 1604 }, { "epoch": 100.3125, "grad_norm": 9.938249020872993, "learning_rate": 5e-05, "loss": 0.0205, "num_input_tokens_seen": 110917632, "step": 1605 }, { "epoch": 100.3125, "loss": 0.020480969920754433, "loss_ce": 0.0004614384670276195, "loss_xval": 0.02001953125, "num_input_tokens_seen": 110917632, "step": 1605 }, { "epoch": 100.375, "grad_norm": 535.267256561788, "learning_rate": 5e-05, "loss": 0.5603, "num_input_tokens_seen": 110989312, "step": 1606 }, { "epoch": 100.375, "loss": 0.0017519124085083604, "loss_ce": 0.0004015095764771104, "loss_xval": 0.00135040283203125, "num_input_tokens_seen": 110989312, "step": 1606 }, { "epoch": 100.4375, "grad_norm": 26.803573221116935, "learning_rate": 5e-05, "loss": 0.1485, "num_input_tokens_seen": 111060928, "step": 1607 }, { "epoch": 100.4375, "loss": 0.13052555918693542, "loss_ce": 0.12610051035881042, "loss_xval": 0.004425048828125, "num_input_tokens_seen": 111060928, "step": 1607 }, { "epoch": 100.5, "grad_norm": 11.96771523717426, "learning_rate": 5e-05, "loss": 0.7261, "num_input_tokens_seen": 111132608, "step": 1608 }, { "epoch": 100.5, "loss": 0.728778064250946, "loss_ce": 0.7280418276786804, "loss_xval": 0.000736236572265625, "num_input_tokens_seen": 111132608, "step": 1608 }, { "epoch": 100.5625, "grad_norm": 11.58786077220693, "learning_rate": 5e-05, "loss": 0.7126, "num_input_tokens_seen": 111204288, "step": 1609 }, { "epoch": 100.5625, "loss": 0.7263501286506653, "loss_ce": 0.7207348942756653, "loss_xval": 0.005615234375, "num_input_tokens_seen": 111204288, "step": 1609 }, { "epoch": 100.625, "grad_norm": 7.627950979392913, "learning_rate": 5e-05, "loss": 0.5433, "num_input_tokens_seen": 111275968, "step": 1610 }, { "epoch": 100.625, "loss": 0.5421339273452759, "loss_ce": 0.5413748025894165, "loss_xval": 0.000759124755859375, "num_input_tokens_seen": 111275968, "step": 1610 }, { "epoch": 100.6875, "grad_norm": 8.926215867422837, "learning_rate": 5e-05, "loss": 0.5094, "num_input_tokens_seen": 111335168, "step": 1611 }, { "epoch": 100.6875, "loss": 0.508723258972168, "loss_ce": 0.5062055587768555, "loss_xval": 0.0025177001953125, "num_input_tokens_seen": 111335168, "step": 1611 }, { "epoch": 100.75, "grad_norm": 13.96864154608611, "learning_rate": 5e-05, "loss": 0.4793, "num_input_tokens_seen": 111406848, "step": 1612 }, { "epoch": 100.75, "loss": 0.4821075201034546, "loss_ce": 0.4637969732284546, "loss_xval": 0.018310546875, "num_input_tokens_seen": 111406848, "step": 1612 }, { "epoch": 100.8125, "grad_norm": 32.770614154641045, "learning_rate": 5e-05, "loss": 0.4225, "num_input_tokens_seen": 111478528, "step": 1613 }, { "epoch": 100.8125, "loss": 0.42495107650756836, "loss_ce": 0.29604482650756836, "loss_xval": 0.12890625, "num_input_tokens_seen": 111478528, "step": 1613 }, { "epoch": 100.875, "grad_norm": 96.20222280752, "learning_rate": 5e-05, "loss": 1.2346, "num_input_tokens_seen": 111537600, "step": 1614 }, { "epoch": 100.875, "loss": 1.218656063079834, "loss_ce": 0.1795935183763504, "loss_xval": 1.0390625, "num_input_tokens_seen": 111537600, "step": 1614 }, { "epoch": 100.9375, "grad_norm": 98.88955953502278, "learning_rate": 5e-05, "loss": 1.2119, "num_input_tokens_seen": 111609280, "step": 1615 }, { "epoch": 100.9375, "loss": 1.1993708610534668, "loss_ce": 0.1681208610534668, "loss_xval": 1.03125, "num_input_tokens_seen": 111609280, "step": 1615 }, { "epoch": 101.0, "grad_norm": 22.017518689198113, "learning_rate": 5e-05, "loss": 0.2187, "num_input_tokens_seen": 111680896, "step": 1616 }, { "epoch": 101.0, "loss": 0.2185109406709671, "loss_ce": 0.1611378937959671, "loss_xval": 0.057373046875, "num_input_tokens_seen": 111680896, "step": 1616 }, { "epoch": 101.0625, "grad_norm": 70.8242896529797, "learning_rate": 5e-05, "loss": 0.7584, "num_input_tokens_seen": 111752576, "step": 1617 }, { "epoch": 101.0625, "loss": 0.74258953332901, "loss_ce": 0.1293082982301712, "loss_xval": 0.61328125, "num_input_tokens_seen": 111752576, "step": 1617 }, { "epoch": 101.125, "grad_norm": 27.584765102526394, "learning_rate": 5e-05, "loss": 0.1862, "num_input_tokens_seen": 111824128, "step": 1618 }, { "epoch": 101.125, "loss": 0.18330301344394684, "loss_ce": 0.07636941969394684, "loss_xval": 0.10693359375, "num_input_tokens_seen": 111824128, "step": 1618 }, { "epoch": 101.1875, "grad_norm": 45.35551615786325, "learning_rate": 5e-05, "loss": 0.3777, "num_input_tokens_seen": 111895872, "step": 1619 }, { "epoch": 101.1875, "loss": 0.37971189618110657, "loss_ce": 0.06135252118110657, "loss_xval": 0.318359375, "num_input_tokens_seen": 111895872, "step": 1619 }, { "epoch": 101.25, "grad_norm": 43.72179545283442, "learning_rate": 5e-05, "loss": 0.3603, "num_input_tokens_seen": 111967616, "step": 1620 }, { "epoch": 101.25, "loss": 0.36618638038635254, "loss_ce": 0.06345201283693314, "loss_xval": 0.302734375, "num_input_tokens_seen": 111967616, "step": 1620 }, { "epoch": 101.3125, "grad_norm": 10.257659607571505, "learning_rate": 5e-05, "loss": 0.066, "num_input_tokens_seen": 112039296, "step": 1621 }, { "epoch": 101.3125, "loss": 0.06621536612510681, "loss_ce": 0.05034622177481651, "loss_xval": 0.015869140625, "num_input_tokens_seen": 112039296, "step": 1621 }, { "epoch": 101.375, "grad_norm": 47.05730933565881, "learning_rate": 5e-05, "loss": 0.4014, "num_input_tokens_seen": 112098368, "step": 1622 }, { "epoch": 101.375, "loss": 0.3984343707561493, "loss_ce": 0.046871863305568695, "loss_xval": 0.3515625, "num_input_tokens_seen": 112098368, "step": 1622 }, { "epoch": 101.4375, "grad_norm": 21.400823346508936, "learning_rate": 5e-05, "loss": 0.1173, "num_input_tokens_seen": 112157568, "step": 1623 }, { "epoch": 101.4375, "loss": 0.11608298122882843, "loss_ce": 0.04088767245411873, "loss_xval": 0.0751953125, "num_input_tokens_seen": 112157568, "step": 1623 }, { "epoch": 101.5, "grad_norm": 23.821125025940553, "learning_rate": 5e-05, "loss": 0.1299, "num_input_tokens_seen": 112229248, "step": 1624 }, { "epoch": 101.5, "loss": 0.13002723455429077, "loss_ce": 0.03383582830429077, "loss_xval": 0.09619140625, "num_input_tokens_seen": 112229248, "step": 1624 }, { "epoch": 101.5625, "grad_norm": 36.06573205221317, "learning_rate": 5e-05, "loss": 0.2493, "num_input_tokens_seen": 112300864, "step": 1625 }, { "epoch": 101.5625, "loss": 0.24369105696678162, "loss_ce": 0.022011367604136467, "loss_xval": 0.2216796875, "num_input_tokens_seen": 112300864, "step": 1625 }, { "epoch": 101.625, "grad_norm": 8.543920957277138, "learning_rate": 5e-05, "loss": 0.0293, "num_input_tokens_seen": 112372544, "step": 1626 }, { "epoch": 101.625, "loss": 0.027967257425189018, "loss_ce": 0.014356417581439018, "loss_xval": 0.01361083984375, "num_input_tokens_seen": 112372544, "step": 1626 }, { "epoch": 101.6875, "grad_norm": 24.866755936364093, "learning_rate": 5e-05, "loss": 0.1224, "num_input_tokens_seen": 112444224, "step": 1627 }, { "epoch": 101.6875, "loss": 0.12345461547374725, "loss_ce": 0.013591330498456955, "loss_xval": 0.10986328125, "num_input_tokens_seen": 112444224, "step": 1627 }, { "epoch": 101.75, "grad_norm": 29.743958865983373, "learning_rate": 5e-05, "loss": 0.163, "num_input_tokens_seen": 112515840, "step": 1628 }, { "epoch": 101.75, "loss": 0.1601444035768509, "loss_ce": 0.006824086420238018, "loss_xval": 0.1533203125, "num_input_tokens_seen": 112515840, "step": 1628 }, { "epoch": 101.8125, "grad_norm": 3.2598818341929277, "learning_rate": 5e-05, "loss": 0.0088, "num_input_tokens_seen": 112587392, "step": 1629 }, { "epoch": 101.8125, "loss": 0.009183799847960472, "loss_ce": 0.005124961957335472, "loss_xval": 0.004058837890625, "num_input_tokens_seen": 112587392, "step": 1629 }, { "epoch": 101.875, "grad_norm": 23.646893975459562, "learning_rate": 5e-05, "loss": 0.1019, "num_input_tokens_seen": 112659136, "step": 1630 }, { "epoch": 101.875, "loss": 0.10114867240190506, "loss_ce": 0.003980706911534071, "loss_xval": 0.09716796875, "num_input_tokens_seen": 112659136, "step": 1630 }, { "epoch": 101.9375, "grad_norm": 22.947677597928312, "learning_rate": 5e-05, "loss": 0.0949, "num_input_tokens_seen": 112730752, "step": 1631 }, { "epoch": 101.9375, "loss": 0.0934811681509018, "loss_ce": 0.0026608556509017944, "loss_xval": 0.0908203125, "num_input_tokens_seen": 112730752, "step": 1631 }, { "epoch": 102.0, "grad_norm": 0.9599640145062192, "learning_rate": 5e-05, "loss": 0.0033, "num_input_tokens_seen": 112802368, "step": 1632 }, { "epoch": 102.0, "loss": 0.0026651208754628897, "loss_ce": 0.0018411462660878897, "loss_xval": 0.000823974609375, "num_input_tokens_seen": 112802368, "step": 1632 }, { "epoch": 102.0625, "grad_norm": 23.756242845065035, "learning_rate": 5e-05, "loss": 0.0953, "num_input_tokens_seen": 112874112, "step": 1633 }, { "epoch": 102.0625, "loss": 0.0883241519331932, "loss_ce": 0.0018983713816851377, "loss_xval": 0.08642578125, "num_input_tokens_seen": 112874112, "step": 1633 }, { "epoch": 102.125, "grad_norm": 16.368851868634717, "learning_rate": 5e-05, "loss": 0.0489, "num_input_tokens_seen": 112945664, "step": 1634 }, { "epoch": 102.125, "loss": 0.04897579923272133, "loss_ce": 0.001368378521874547, "loss_xval": 0.047607421875, "num_input_tokens_seen": 112945664, "step": 1634 }, { "epoch": 102.1875, "grad_norm": 4.815410162153499, "learning_rate": 5e-05, "loss": 0.0061, "num_input_tokens_seen": 113017280, "step": 1635 }, { "epoch": 102.1875, "loss": 0.005683029070496559, "loss_ce": 0.0011053922353312373, "loss_xval": 0.00457763671875, "num_input_tokens_seen": 113017280, "step": 1635 }, { "epoch": 102.25, "grad_norm": 18.835360111676962, "learning_rate": 5e-05, "loss": 0.0649, "num_input_tokens_seen": 113088960, "step": 1636 }, { "epoch": 102.25, "loss": 0.06543883681297302, "loss_ce": 0.0009857096010819077, "loss_xval": 0.064453125, "num_input_tokens_seen": 113088960, "step": 1636 }, { "epoch": 102.3125, "grad_norm": 11.221442126844604, "learning_rate": 5e-05, "loss": 0.025, "num_input_tokens_seen": 113160640, "step": 1637 }, { "epoch": 102.3125, "loss": 0.026167068630456924, "loss_ce": 0.0010205849539488554, "loss_xval": 0.025146484375, "num_input_tokens_seen": 113160640, "step": 1637 }, { "epoch": 102.375, "grad_norm": 6.333657334181233, "learning_rate": 5e-05, "loss": 0.0099, "num_input_tokens_seen": 113219712, "step": 1638 }, { "epoch": 102.375, "loss": 0.01122201420366764, "loss_ce": 0.0008460377575829625, "loss_xval": 0.0103759765625, "num_input_tokens_seen": 113219712, "step": 1638 }, { "epoch": 102.4375, "grad_norm": 16.67463922925151, "learning_rate": 5e-05, "loss": 0.053, "num_input_tokens_seen": 113291328, "step": 1639 }, { "epoch": 102.4375, "loss": 0.05274589732289314, "loss_ce": 0.0007439448963850737, "loss_xval": 0.052001953125, "num_input_tokens_seen": 113291328, "step": 1639 }, { "epoch": 102.5, "grad_norm": 7.620879504670172, "learning_rate": 5e-05, "loss": 0.0124, "num_input_tokens_seen": 113362944, "step": 1640 }, { "epoch": 102.5, "loss": 0.012734380550682545, "loss_ce": 0.0007714895764365792, "loss_xval": 0.011962890625, "num_input_tokens_seen": 113362944, "step": 1640 }, { "epoch": 102.5625, "grad_norm": 8.532472679636887, "learning_rate": 5e-05, "loss": 0.0149, "num_input_tokens_seen": 113434624, "step": 1641 }, { "epoch": 102.5625, "loss": 0.016723984852433205, "loss_ce": 0.0007327737403102219, "loss_xval": 0.0159912109375, "num_input_tokens_seen": 113434624, "step": 1641 }, { "epoch": 102.625, "grad_norm": 13.787586527116975, "learning_rate": 5e-05, "loss": 0.0371, "num_input_tokens_seen": 113506240, "step": 1642 }, { "epoch": 102.625, "loss": 0.03597182780504227, "loss_ce": 0.0008155779214575887, "loss_xval": 0.03515625, "num_input_tokens_seen": 113506240, "step": 1642 }, { "epoch": 102.6875, "grad_norm": 4.768758139645163, "learning_rate": 5e-05, "loss": 0.0059, "num_input_tokens_seen": 113577792, "step": 1643 }, { "epoch": 102.6875, "loss": 0.006672807969152927, "loss_ce": 0.0007829151581972837, "loss_xval": 0.005889892578125, "num_input_tokens_seen": 113577792, "step": 1643 }, { "epoch": 102.75, "grad_norm": 8.60501362437254, "learning_rate": 5e-05, "loss": 0.0153, "num_input_tokens_seen": 113649472, "step": 1644 }, { "epoch": 102.75, "loss": 0.01445347536355257, "loss_ce": 0.0007816006545908749, "loss_xval": 0.013671875, "num_input_tokens_seen": 113649472, "step": 1644 }, { "epoch": 102.8125, "grad_norm": 11.367159098499652, "learning_rate": 5e-05, "loss": 0.0266, "num_input_tokens_seen": 113721152, "step": 1645 }, { "epoch": 102.8125, "loss": 0.026272470131516457, "loss_ce": 0.0007597756339237094, "loss_xval": 0.0255126953125, "num_input_tokens_seen": 113721152, "step": 1645 }, { "epoch": 102.875, "grad_norm": 3.3770240355971866, "learning_rate": 5e-05, "loss": 0.0035, "num_input_tokens_seen": 113780160, "step": 1646 }, { "epoch": 102.875, "loss": 0.0036508911289274693, "loss_ce": 0.0007974976324476302, "loss_xval": 0.0028533935546875, "num_input_tokens_seen": 113780160, "step": 1646 }, { "epoch": 102.9375, "grad_norm": 7.827881100394053, "learning_rate": 5e-05, "loss": 0.0133, "num_input_tokens_seen": 113851776, "step": 1647 }, { "epoch": 102.9375, "loss": 0.012910965830087662, "loss_ce": 0.0007039343472570181, "loss_xval": 0.01220703125, "num_input_tokens_seen": 113851776, "step": 1647 }, { "epoch": 103.0, "grad_norm": 10.520670672072665, "learning_rate": 5e-05, "loss": 0.0232, "num_input_tokens_seen": 113910784, "step": 1648 }, { "epoch": 103.0, "loss": 0.01958302967250347, "loss_ce": 0.0007842010818421841, "loss_xval": 0.018798828125, "num_input_tokens_seen": 113910784, "step": 1648 }, { "epoch": 103.0625, "grad_norm": 1.5553846635183586, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 113969920, "step": 1649 }, { "epoch": 103.0625, "loss": 0.0020108516328036785, "loss_ce": 0.0007672603824175894, "loss_xval": 0.00124359130859375, "num_input_tokens_seen": 113969920, "step": 1649 }, { "epoch": 103.125, "grad_norm": 7.754582670099876, "learning_rate": 5e-05, "loss": 0.013, "num_input_tokens_seen": 114041600, "step": 1650 }, { "epoch": 103.125, "loss": 0.013059111312031746, "loss_ce": 0.0007300098077394068, "loss_xval": 0.0123291015625, "num_input_tokens_seen": 114041600, "step": 1650 }, { "epoch": 103.1875, "grad_norm": 8.891149925560153, "learning_rate": 5e-05, "loss": 0.0167, "num_input_tokens_seen": 114113216, "step": 1651 }, { "epoch": 103.1875, "loss": 0.016599709168076515, "loss_ce": 0.0008526380988769233, "loss_xval": 0.0157470703125, "num_input_tokens_seen": 114113216, "step": 1651 }, { "epoch": 103.25, "grad_norm": 0.12653154608843153, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 114184960, "step": 1652 }, { "epoch": 103.25, "loss": 0.0012427711626514792, "loss_ce": 0.0008498573442921042, "loss_xval": 0.000392913818359375, "num_input_tokens_seen": 114184960, "step": 1652 }, { "epoch": 103.3125, "grad_norm": 7.470563693408022, "learning_rate": 5e-05, "loss": 0.0121, "num_input_tokens_seen": 114256640, "step": 1653 }, { "epoch": 103.3125, "loss": 0.012393683195114136, "loss_ce": 0.0007970034494064748, "loss_xval": 0.0115966796875, "num_input_tokens_seen": 114256640, "step": 1653 }, { "epoch": 103.375, "grad_norm": 7.080284493962458, "learning_rate": 5e-05, "loss": 0.0112, "num_input_tokens_seen": 114328192, "step": 1654 }, { "epoch": 103.375, "loss": 0.011119197122752666, "loss_ce": 0.0008042553090490401, "loss_xval": 0.01031494140625, "num_input_tokens_seen": 114328192, "step": 1654 }, { "epoch": 103.4375, "grad_norm": 1.1158407425238983, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 114399808, "step": 1655 }, { "epoch": 103.4375, "loss": 0.001563415164127946, "loss_ce": 0.0007966610719449818, "loss_xval": 0.000766754150390625, "num_input_tokens_seen": 114399808, "step": 1655 }, { "epoch": 103.5, "grad_norm": 6.957097591177087, "learning_rate": 5e-05, "loss": 0.0112, "num_input_tokens_seen": 114471424, "step": 1656 }, { "epoch": 103.5, "loss": 0.012271175161004066, "loss_ce": 0.0007965656695887446, "loss_xval": 0.011474609375, "num_input_tokens_seen": 114471424, "step": 1656 }, { "epoch": 103.5625, "grad_norm": 5.014957813994443, "learning_rate": 5e-05, "loss": 0.0063, "num_input_tokens_seen": 114530496, "step": 1657 }, { "epoch": 103.5625, "loss": 0.006433679256588221, "loss_ce": 0.0007574096089228988, "loss_xval": 0.00567626953125, "num_input_tokens_seen": 114530496, "step": 1657 }, { "epoch": 103.625, "grad_norm": 1.9327560449349166, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 114589504, "step": 1658 }, { "epoch": 103.625, "loss": 0.0017968413885682821, "loss_ce": 0.0007363555487245321, "loss_xval": 0.00106048583984375, "num_input_tokens_seen": 114589504, "step": 1658 }, { "epoch": 103.6875, "grad_norm": 6.253396113562765, "learning_rate": 5e-05, "loss": 0.009, "num_input_tokens_seen": 114648640, "step": 1659 }, { "epoch": 103.6875, "loss": 0.008760355412960052, "loss_ce": 0.0007037151954136789, "loss_xval": 0.008056640625, "num_input_tokens_seen": 114648640, "step": 1659 }, { "epoch": 103.75, "grad_norm": 3.665027561445157, "learning_rate": 5e-05, "loss": 0.0037, "num_input_tokens_seen": 114707648, "step": 1660 }, { "epoch": 103.75, "loss": 0.0037118252366781235, "loss_ce": 0.0007211025804281235, "loss_xval": 0.00299072265625, "num_input_tokens_seen": 114707648, "step": 1660 }, { "epoch": 103.8125, "grad_norm": 2.8591045359372713, "learning_rate": 5e-05, "loss": 0.0027, "num_input_tokens_seen": 114779264, "step": 1661 }, { "epoch": 103.8125, "loss": 0.0027316624764353037, "loss_ce": 0.0006717260112054646, "loss_xval": 0.0020599365234375, "num_input_tokens_seen": 114779264, "step": 1661 }, { "epoch": 103.875, "grad_norm": 5.895781742475335, "learning_rate": 5e-05, "loss": 0.0081, "num_input_tokens_seen": 114851072, "step": 1662 }, { "epoch": 103.875, "loss": 0.007647594437003136, "loss_ce": 0.0006285517010837793, "loss_xval": 0.00701904296875, "num_input_tokens_seen": 114851072, "step": 1662 }, { "epoch": 103.9375, "grad_norm": 1.5133022469671882, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 114910080, "step": 1663 }, { "epoch": 103.9375, "loss": 0.0016773771494626999, "loss_ce": 0.0007656645029783249, "loss_xval": 0.000911712646484375, "num_input_tokens_seen": 114910080, "step": 1663 }, { "epoch": 104.0, "grad_norm": 3.610526889212173, "learning_rate": 5e-05, "loss": 0.0038, "num_input_tokens_seen": 114981632, "step": 1664 }, { "epoch": 104.0, "loss": 0.003930710256099701, "loss_ce": 0.0007263646693900228, "loss_xval": 0.003204345703125, "num_input_tokens_seen": 114981632, "step": 1664 }, { "epoch": 104.0625, "grad_norm": 4.220355319451222, "learning_rate": 5e-05, "loss": 0.0047, "num_input_tokens_seen": 115053184, "step": 1665 }, { "epoch": 104.0625, "loss": 0.005380589049309492, "loss_ce": 0.0006198466871865094, "loss_xval": 0.0047607421875, "num_input_tokens_seen": 115053184, "step": 1665 }, { "epoch": 104.125, "grad_norm": 0.07905389863386036, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 115124800, "step": 1666 }, { "epoch": 104.125, "loss": 0.0010264108423143625, "loss_ce": 0.0006354043725878, "loss_xval": 0.0003910064697265625, "num_input_tokens_seen": 115124800, "step": 1666 }, { "epoch": 104.1875, "grad_norm": 3.7039766082303043, "learning_rate": 5e-05, "loss": 0.0038, "num_input_tokens_seen": 115196416, "step": 1667 }, { "epoch": 104.1875, "loss": 0.0037187503185123205, "loss_ce": 0.0007127689314074814, "loss_xval": 0.0030059814453125, "num_input_tokens_seen": 115196416, "step": 1667 }, { "epoch": 104.25, "grad_norm": 3.05507735450659, "learning_rate": 5e-05, "loss": 0.0027, "num_input_tokens_seen": 115268032, "step": 1668 }, { "epoch": 104.25, "loss": 0.00257797259837389, "loss_ce": 0.0005943299038335681, "loss_xval": 0.001983642578125, "num_input_tokens_seen": 115268032, "step": 1668 }, { "epoch": 104.3125, "grad_norm": 1.0814883240904307, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 115339840, "step": 1669 }, { "epoch": 104.3125, "loss": 0.0011010868474841118, "loss_ce": 0.0005326969549059868, "loss_xval": 0.000568389892578125, "num_input_tokens_seen": 115339840, "step": 1669 }, { "epoch": 104.375, "grad_norm": 3.690749536698239, "learning_rate": 5e-05, "loss": 0.0035, "num_input_tokens_seen": 115411392, "step": 1670 }, { "epoch": 104.375, "loss": 0.0035830074921250343, "loss_ce": 0.0005770259886048734, "loss_xval": 0.0030059814453125, "num_input_tokens_seen": 115411392, "step": 1670 }, { "epoch": 104.4375, "grad_norm": 1.8211360411562902, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 115470464, "step": 1671 }, { "epoch": 104.4375, "loss": 0.001722783432342112, "loss_ce": 0.0006241506198421121, "loss_xval": 0.0010986328125, "num_input_tokens_seen": 115470464, "step": 1671 }, { "epoch": 104.5, "grad_norm": 2.054217315006852, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 115542016, "step": 1672 }, { "epoch": 104.5, "loss": 0.0015874525997787714, "loss_ce": 0.00048119042185135186, "loss_xval": 0.00110626220703125, "num_input_tokens_seen": 115542016, "step": 1672 }, { "epoch": 104.5625, "grad_norm": 3.3353779554239678, "learning_rate": 5e-05, "loss": 0.003, "num_input_tokens_seen": 115613760, "step": 1673 }, { "epoch": 104.5625, "loss": 0.0029215288814157248, "loss_ce": 0.0005258990568108857, "loss_xval": 0.0023956298828125, "num_input_tokens_seen": 115613760, "step": 1673 }, { "epoch": 104.625, "grad_norm": 0.39359180484763523, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 115685376, "step": 1674 }, { "epoch": 104.625, "loss": 0.0007368091028183699, "loss_ce": 0.0005298617761582136, "loss_xval": 0.00020694732666015625, "num_input_tokens_seen": 115685376, "step": 1674 }, { "epoch": 104.6875, "grad_norm": 3.1102239317177895, "learning_rate": 5e-05, "loss": 0.0026, "num_input_tokens_seen": 115744512, "step": 1675 }, { "epoch": 104.6875, "loss": 0.0027107985224574804, "loss_ce": 0.00048301531933248043, "loss_xval": 0.002227783203125, "num_input_tokens_seen": 115744512, "step": 1675 }, { "epoch": 104.75, "grad_norm": 2.048859685459329, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 115816064, "step": 1676 }, { "epoch": 104.75, "loss": 0.0014386232942342758, "loss_ce": 0.00043154318700544536, "loss_xval": 0.001007080078125, "num_input_tokens_seen": 115816064, "step": 1676 }, { "epoch": 104.8125, "grad_norm": 1.3415920474072838, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 115875136, "step": 1677 }, { "epoch": 104.8125, "loss": 0.0009290423477068543, "loss_ce": 0.0004293169768061489, "loss_xval": 0.000499725341796875, "num_input_tokens_seen": 115875136, "step": 1677 }, { "epoch": 104.875, "grad_norm": 2.761071000900736, "learning_rate": 5e-05, "loss": 0.0021, "num_input_tokens_seen": 115946816, "step": 1678 }, { "epoch": 104.875, "loss": 0.002234385348856449, "loss_ce": 0.00043384829768911004, "loss_xval": 0.001800537109375, "num_input_tokens_seen": 115946816, "step": 1678 }, { "epoch": 104.9375, "grad_norm": 0.35031847383260256, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 116018368, "step": 1679 }, { "epoch": 104.9375, "loss": 0.0006341906264424324, "loss_ce": 0.0004215212247800082, "loss_xval": 0.00021266937255859375, "num_input_tokens_seen": 116018368, "step": 1679 }, { "epoch": 105.0, "grad_norm": 2.156864692866578, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 116089984, "step": 1680 }, { "epoch": 105.0, "loss": 0.0013213963247835636, "loss_ce": 0.0003448338247835636, "loss_xval": 0.0009765625, "num_input_tokens_seen": 116089984, "step": 1680 }, { "epoch": 105.0625, "grad_norm": 1.561713135829587, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 116161600, "step": 1681 }, { "epoch": 105.0625, "loss": 0.0013053379952907562, "loss_ce": 0.00038981062243692577, "loss_xval": 0.00091552734375, "num_input_tokens_seen": 116161600, "step": 1681 }, { "epoch": 105.125, "grad_norm": 1.2670996351838228, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 116233280, "step": 1682 }, { "epoch": 105.125, "loss": 0.0008212352404370904, "loss_ce": 0.0003024364123120904, "loss_xval": 0.000518798828125, "num_input_tokens_seen": 116233280, "step": 1682 }, { "epoch": 105.1875, "grad_norm": 1.765763884638601, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 116292352, "step": 1683 }, { "epoch": 105.1875, "loss": 0.0009669559658505023, "loss_ce": 0.00026886636624112725, "loss_xval": 0.000698089599609375, "num_input_tokens_seen": 116292352, "step": 1683 }, { "epoch": 105.25, "grad_norm": 0.057805268793867895, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 116364032, "step": 1684 }, { "epoch": 105.25, "loss": 0.0003976496518589556, "loss_ce": 0.0003156336606480181, "loss_xval": 8.20159912109375e-05, "num_input_tokens_seen": 116364032, "step": 1684 }, { "epoch": 105.3125, "grad_norm": 1.6781220119027798, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 116435648, "step": 1685 }, { "epoch": 105.3125, "loss": 0.0009655409958213568, "loss_ce": 0.0003055983397644013, "loss_xval": 0.000659942626953125, "num_input_tokens_seen": 116435648, "step": 1685 }, { "epoch": 105.375, "grad_norm": 0.6101874326629925, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 116507328, "step": 1686 }, { "epoch": 105.375, "loss": 0.00042850777390412986, "loss_ce": 0.0002482633281033486, "loss_xval": 0.00018024444580078125, "num_input_tokens_seen": 116507328, "step": 1686 }, { "epoch": 105.4375, "grad_norm": 0.9843981208119894, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 116578944, "step": 1687 }, { "epoch": 105.4375, "loss": 0.0005340241477824748, "loss_ce": 0.00027081003645434976, "loss_xval": 0.000263214111328125, "num_input_tokens_seen": 116578944, "step": 1687 }, { "epoch": 105.5, "grad_norm": 1.1536326062306255, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 116638144, "step": 1688 }, { "epoch": 105.5, "loss": 0.0008680825121700764, "loss_ce": 0.0002806191623676568, "loss_xval": 0.00058746337890625, "num_input_tokens_seen": 116638144, "step": 1688 }, { "epoch": 105.5625, "grad_norm": 0.5324269020281875, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 116709824, "step": 1689 }, { "epoch": 105.5625, "loss": 0.0004137876385357231, "loss_ce": 0.00029076365171931684, "loss_xval": 0.00012302398681640625, "num_input_tokens_seen": 116709824, "step": 1689 }, { "epoch": 105.625, "grad_norm": 1.2980282215142769, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 116781504, "step": 1690 }, { "epoch": 105.625, "loss": 0.0009148502722382545, "loss_ce": 0.00024727825075387955, "loss_xval": 0.000667572021484375, "num_input_tokens_seen": 116781504, "step": 1690 }, { "epoch": 105.6875, "grad_norm": 0.29562102696361137, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 116840640, "step": 1691 }, { "epoch": 105.6875, "loss": 0.0003725567366927862, "loss_ce": 0.0002333202719455585, "loss_xval": 0.0001392364501953125, "num_input_tokens_seen": 116840640, "step": 1691 }, { "epoch": 105.75, "grad_norm": 1.0232409981555282, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 116912256, "step": 1692 }, { "epoch": 105.75, "loss": 0.0005665783537551761, "loss_ce": 0.00021371884213294834, "loss_xval": 0.0003528594970703125, "num_input_tokens_seen": 116912256, "step": 1692 }, { "epoch": 105.8125, "grad_norm": 0.34358201221822143, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 116983936, "step": 1693 }, { "epoch": 105.8125, "loss": 0.0003722688998095691, "loss_ce": 0.00023589345801156014, "loss_xval": 0.00013637542724609375, "num_input_tokens_seen": 116983936, "step": 1693 }, { "epoch": 105.875, "grad_norm": 1.2389221421776981, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 117055616, "step": 1694 }, { "epoch": 105.875, "loss": 0.0006702013779431581, "loss_ce": 0.00023914058692753315, "loss_xval": 0.000431060791015625, "num_input_tokens_seen": 117055616, "step": 1694 }, { "epoch": 105.9375, "grad_norm": 0.62070963216232, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 117114688, "step": 1695 }, { "epoch": 105.9375, "loss": 0.0003860698197968304, "loss_ce": 0.0001896128960652277, "loss_xval": 0.0001964569091796875, "num_input_tokens_seen": 117114688, "step": 1695 }, { "epoch": 106.0, "grad_norm": 1.151688414326741, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 117186368, "step": 1696 }, { "epoch": 106.0, "loss": 0.0006602061912417412, "loss_ce": 0.00020625718752853572, "loss_xval": 0.000453948974609375, "num_input_tokens_seen": 117186368, "step": 1696 }, { "epoch": 106.0625, "grad_norm": 0.9367465334806727, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 117258176, "step": 1697 }, { "epoch": 106.0625, "loss": 0.00044638433610089123, "loss_ce": 0.00018317022477276623, "loss_xval": 0.000263214111328125, "num_input_tokens_seen": 117258176, "step": 1697 }, { "epoch": 106.125, "grad_norm": 1.0215678861138757, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 117329728, "step": 1698 }, { "epoch": 106.125, "loss": 0.000491351296659559, "loss_ce": 0.00017663878679741174, "loss_xval": 0.0003147125244140625, "num_input_tokens_seen": 117329728, "step": 1698 }, { "epoch": 106.1875, "grad_norm": 1.0125715492817908, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 117401408, "step": 1699 }, { "epoch": 106.1875, "loss": 0.0005752349970862269, "loss_ce": 0.0001746918132994324, "loss_xval": 0.000400543212890625, "num_input_tokens_seen": 117401408, "step": 1699 }, { "epoch": 106.25, "grad_norm": 1.066618829875019, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 117473216, "step": 1700 }, { "epoch": 106.25, "loss": 0.00048783019883558154, "loss_ce": 0.00015785889991093427, "loss_xval": 0.0003299713134765625, "num_input_tokens_seen": 117473216, "step": 1700 }, { "epoch": 106.3125, "grad_norm": 0.6928800245749516, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 117544768, "step": 1701 }, { "epoch": 106.3125, "loss": 0.00034172070445492864, "loss_ce": 0.00015289320435840636, "loss_xval": 0.0001888275146484375, "num_input_tokens_seen": 117544768, "step": 1701 }, { "epoch": 106.375, "grad_norm": 0.6562982159952676, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 117616512, "step": 1702 }, { "epoch": 106.375, "loss": 0.0002900107065215707, "loss_ce": 0.00017795395979192108, "loss_xval": 0.00011205673217773438, "num_input_tokens_seen": 117616512, "step": 1702 }, { "epoch": 106.4375, "grad_norm": 0.8476027009117076, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 117675712, "step": 1703 }, { "epoch": 106.4375, "loss": 0.00037799362326040864, "loss_ce": 0.00015960219025146216, "loss_xval": 0.00021839141845703125, "num_input_tokens_seen": 117675712, "step": 1703 }, { "epoch": 106.5, "grad_norm": 0.30678107087599277, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 117734784, "step": 1704 }, { "epoch": 106.5, "loss": 0.00021619546168949455, "loss_ce": 0.00012798058742191643, "loss_xval": 8.821487426757812e-05, "num_input_tokens_seen": 117734784, "step": 1704 }, { "epoch": 106.5625, "grad_norm": 0.9752061317620482, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 117806592, "step": 1705 }, { "epoch": 106.5625, "loss": 0.000510626588948071, "loss_ce": 0.00015204506053123623, "loss_xval": 0.00035858154296875, "num_input_tokens_seen": 117806592, "step": 1705 }, { "epoch": 106.625, "grad_norm": 0.11033428610619243, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 117878336, "step": 1706 }, { "epoch": 106.625, "loss": 0.00024724070681259036, "loss_ce": 0.00014472071779891849, "loss_xval": 0.00010251998901367188, "num_input_tokens_seen": 117878336, "step": 1706 }, { "epoch": 106.6875, "grad_norm": 0.8970902998903458, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 117949952, "step": 1707 }, { "epoch": 106.6875, "loss": 0.000420129275880754, "loss_ce": 0.00013593431503977627, "loss_xval": 0.0002841949462890625, "num_input_tokens_seen": 117949952, "step": 1707 }, { "epoch": 106.75, "grad_norm": 0.0864938089783715, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 118021504, "step": 1708 }, { "epoch": 106.75, "loss": 0.00023897687788121402, "loss_ce": 0.00012930433149449527, "loss_xval": 0.00010967254638671875, "num_input_tokens_seen": 118021504, "step": 1708 }, { "epoch": 106.8125, "grad_norm": 0.8691233585505428, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 118093120, "step": 1709 }, { "epoch": 106.8125, "loss": 0.00035278405994176865, "loss_ce": 0.00015060511941555887, "loss_xval": 0.000202178955078125, "num_input_tokens_seen": 118093120, "step": 1709 }, { "epoch": 106.875, "grad_norm": 0.2920927641095561, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 118164864, "step": 1710 }, { "epoch": 106.875, "loss": 0.0003268433501943946, "loss_ce": 0.00013038642646279186, "loss_xval": 0.0001964569091796875, "num_input_tokens_seen": 118164864, "step": 1710 }, { "epoch": 106.9375, "grad_norm": 0.6542078281721475, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 118236480, "step": 1711 }, { "epoch": 106.9375, "loss": 0.00030111701926216483, "loss_ce": 0.00014566810568794608, "loss_xval": 0.00015544891357421875, "num_input_tokens_seen": 118236480, "step": 1711 }, { "epoch": 107.0, "grad_norm": 0.45484764465508853, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 118308032, "step": 1712 }, { "epoch": 107.0, "loss": 0.00029993202770128846, "loss_ce": 0.00013590004527941346, "loss_xval": 0.000164031982421875, "num_input_tokens_seen": 118308032, "step": 1712 }, { "epoch": 107.0625, "grad_norm": 0.1475064477319675, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 118379712, "step": 1713 }, { "epoch": 107.0625, "loss": 0.00025952374562621117, "loss_ce": 0.00013268507609609514, "loss_xval": 0.00012683868408203125, "num_input_tokens_seen": 118379712, "step": 1713 }, { "epoch": 107.125, "grad_norm": 0.25357679220631113, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 118438720, "step": 1714 }, { "epoch": 107.125, "loss": 0.00023810943821445107, "loss_ce": 0.00012557586887851357, "loss_xval": 0.0001125335693359375, "num_input_tokens_seen": 118438720, "step": 1714 }, { "epoch": 107.1875, "grad_norm": 0.0193646792988071, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 118510336, "step": 1715 }, { "epoch": 107.1875, "loss": 0.00020649436919484288, "loss_ce": 0.00012018684356007725, "loss_xval": 8.630752563476562e-05, "num_input_tokens_seen": 118510336, "step": 1715 }, { "epoch": 107.25, "grad_norm": 0.24234549892618124, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 118581888, "step": 1716 }, { "epoch": 107.25, "loss": 0.00017738385940901935, "loss_ce": 0.00011253401316935197, "loss_xval": 6.4849853515625e-05, "num_input_tokens_seen": 118581888, "step": 1716 }, { "epoch": 107.3125, "grad_norm": 0.16171066044051577, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 118653504, "step": 1717 }, { "epoch": 107.3125, "loss": 0.0002221949107479304, "loss_ce": 0.00011490655015222728, "loss_xval": 0.00010728836059570312, "num_input_tokens_seen": 118653504, "step": 1717 }, { "epoch": 107.375, "grad_norm": 0.23147773169592833, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 118725184, "step": 1718 }, { "epoch": 107.375, "loss": 0.00021182381897233427, "loss_ce": 9.976707951864228e-05, "loss_xval": 0.00011205673217773438, "num_input_tokens_seen": 118725184, "step": 1718 }, { "epoch": 107.4375, "grad_norm": 0.44545970064903945, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 118796800, "step": 1719 }, { "epoch": 107.4375, "loss": 0.0002488536119926721, "loss_ce": 9.817306272452697e-05, "loss_xval": 0.0001506805419921875, "num_input_tokens_seen": 118796800, "step": 1719 }, { "epoch": 107.5, "grad_norm": 0.04274000590970272, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 118868416, "step": 1720 }, { "epoch": 107.5, "loss": 0.0001888109982246533, "loss_ce": 0.0001010729611152783, "loss_xval": 8.7738037109375e-05, "num_input_tokens_seen": 118868416, "step": 1720 }, { "epoch": 107.5625, "grad_norm": 0.348588123620522, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 118940032, "step": 1721 }, { "epoch": 107.5625, "loss": 0.00023122888524085283, "loss_ce": 0.00010915856546489522, "loss_xval": 0.0001220703125, "num_input_tokens_seen": 118940032, "step": 1721 }, { "epoch": 107.625, "grad_norm": 0.13433877341461867, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 119011712, "step": 1722 }, { "epoch": 107.625, "loss": 0.00021391075279098004, "loss_ce": 0.00010662239219527692, "loss_xval": 0.00010728836059570312, "num_input_tokens_seen": 119011712, "step": 1722 }, { "epoch": 107.6875, "grad_norm": 0.6809539631450718, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 119083264, "step": 1723 }, { "epoch": 107.6875, "loss": 0.00023257522843778133, "loss_ce": 9.715346823213622e-05, "loss_xval": 0.0001354217529296875, "num_input_tokens_seen": 119083264, "step": 1723 }, { "epoch": 107.75, "grad_norm": 0.1889751093111315, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 119154880, "step": 1724 }, { "epoch": 107.75, "loss": 0.00014927498705219477, "loss_ce": 9.658448107074946e-05, "loss_xval": 5.269050598144531e-05, "num_input_tokens_seen": 119154880, "step": 1724 }, { "epoch": 107.8125, "grad_norm": 0.49261580074161004, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 119226496, "step": 1725 }, { "epoch": 107.8125, "loss": 0.00027312879683449864, "loss_ce": 0.00010337477579014376, "loss_xval": 0.0001697540283203125, "num_input_tokens_seen": 119226496, "step": 1725 }, { "epoch": 107.875, "grad_norm": 0.1872703937412782, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 119285440, "step": 1726 }, { "epoch": 107.875, "loss": 0.00020403148664627224, "loss_ce": 9.197475446853787e-05, "loss_xval": 0.00011205673217773438, "num_input_tokens_seen": 119285440, "step": 1726 }, { "epoch": 107.9375, "grad_norm": 0.44040056668876976, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 119357120, "step": 1727 }, { "epoch": 107.9375, "loss": 0.0002309640112798661, "loss_ce": 9.554225107422099e-05, "loss_xval": 0.0001354217529296875, "num_input_tokens_seen": 119357120, "step": 1727 }, { "epoch": 108.0, "grad_norm": 0.4350724614073691, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 119428800, "step": 1728 }, { "epoch": 108.0, "loss": 0.000171815074281767, "loss_ce": 8.503071148879826e-05, "loss_xval": 8.678436279296875e-05, "num_input_tokens_seen": 119428800, "step": 1728 }, { "epoch": 108.0625, "grad_norm": 0.12261902416252166, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 119500480, "step": 1729 }, { "epoch": 108.0625, "loss": 0.00015668789274059236, "loss_ce": 0.00010542790550971404, "loss_xval": 5.125999450683594e-05, "num_input_tokens_seen": 119500480, "step": 1729 }, { "epoch": 108.125, "grad_norm": 0.04068237379475871, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 119559680, "step": 1730 }, { "epoch": 108.125, "loss": 0.00014538603136315942, "loss_ce": 8.292036363855004e-05, "loss_xval": 6.246566772460938e-05, "num_input_tokens_seen": 119559680, "step": 1730 }, { "epoch": 108.1875, "grad_norm": 0.21295300564952035, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 119631424, "step": 1731 }, { "epoch": 108.1875, "loss": 0.00018561960314400494, "loss_ce": 7.117867789929733e-05, "loss_xval": 0.00011444091796875, "num_input_tokens_seen": 119631424, "step": 1731 }, { "epoch": 108.25, "grad_norm": 0.6202458907209969, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 119702976, "step": 1732 }, { "epoch": 108.25, "loss": 0.00030045691528357565, "loss_ce": 8.015814091777429e-05, "loss_xval": 0.00022029876708984375, "num_input_tokens_seen": 119702976, "step": 1732 }, { "epoch": 108.3125, "grad_norm": 0.3733458594831618, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 119774656, "step": 1733 }, { "epoch": 108.3125, "loss": 0.00019437010632827878, "loss_ce": 8.517439709976315e-05, "loss_xval": 0.00010919570922851562, "num_input_tokens_seen": 119774656, "step": 1733 }, { "epoch": 108.375, "grad_norm": 0.788826552316628, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 119846336, "step": 1734 }, { "epoch": 108.375, "loss": 0.0002434041816741228, "loss_ce": 7.93721919762902e-05, "loss_xval": 0.000164031982421875, "num_input_tokens_seen": 119846336, "step": 1734 }, { "epoch": 108.4375, "grad_norm": 1.4110098626578793, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 119918080, "step": 1735 }, { "epoch": 108.4375, "loss": 0.0004943107487633824, "loss_ce": 9.376756497658789e-05, "loss_xval": 0.000400543212890625, "num_input_tokens_seen": 119918080, "step": 1735 }, { "epoch": 108.5, "grad_norm": 0.2677482367315443, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 119989632, "step": 1736 }, { "epoch": 108.5, "loss": 0.0001648281468078494, "loss_ce": 7.709010242251679e-05, "loss_xval": 8.7738037109375e-05, "num_input_tokens_seen": 119989632, "step": 1736 }, { "epoch": 108.5625, "grad_norm": 1.4965836171714064, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 120061376, "step": 1737 }, { "epoch": 108.5625, "loss": 0.0004614101198967546, "loss_ce": 6.849630881333724e-05, "loss_xval": 0.000392913818359375, "num_input_tokens_seen": 120061376, "step": 1737 }, { "epoch": 108.625, "grad_norm": 1.7362134972145584, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 120132928, "step": 1738 }, { "epoch": 108.625, "loss": 0.000654931936878711, "loss_ce": 8.65420515765436e-05, "loss_xval": 0.000568389892578125, "num_input_tokens_seen": 120132928, "step": 1738 }, { "epoch": 108.6875, "grad_norm": 0.10497585603783582, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 120192192, "step": 1739 }, { "epoch": 108.6875, "loss": 0.00011801804066635668, "loss_ce": 7.1526417741552e-05, "loss_xval": 4.649162292480469e-05, "num_input_tokens_seen": 120192192, "step": 1739 }, { "epoch": 108.75, "grad_norm": 1.9549083462496235, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 120264000, "step": 1740 }, { "epoch": 108.75, "loss": 0.0008286124211736023, "loss_ce": 7.330238440772519e-05, "loss_xval": 0.00075531005859375, "num_input_tokens_seen": 120264000, "step": 1740 }, { "epoch": 108.8125, "grad_norm": 1.64600119762836, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 120323136, "step": 1741 }, { "epoch": 108.8125, "loss": 0.0006632714066654444, "loss_ce": 7.199330138973892e-05, "loss_xval": 0.000591278076171875, "num_input_tokens_seen": 120323136, "step": 1741 }, { "epoch": 108.875, "grad_norm": 0.31792835465951985, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 120394688, "step": 1742 }, { "epoch": 108.875, "loss": 0.00013815359852742404, "loss_ce": 7.091955922078341e-05, "loss_xval": 6.723403930664062e-05, "num_input_tokens_seen": 120394688, "step": 1742 }, { "epoch": 108.9375, "grad_norm": 1.8780950319980536, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 120466432, "step": 1743 }, { "epoch": 108.9375, "loss": 0.0008036716608330607, "loss_ce": 7.12497640051879e-05, "loss_xval": 0.000732421875, "num_input_tokens_seen": 120466432, "step": 1743 }, { "epoch": 109.0, "grad_norm": 1.738570960504345, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 120525568, "step": 1744 }, { "epoch": 109.0, "loss": 0.0007079385104589164, "loss_ce": 7.088408892741427e-05, "loss_xval": 0.000637054443359375, "num_input_tokens_seen": 120525568, "step": 1744 }, { "epoch": 109.0625, "grad_norm": 0.06762341183011107, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 120584704, "step": 1745 }, { "epoch": 109.0625, "loss": 0.0001587302831467241, "loss_ce": 6.527020741486922e-05, "loss_xval": 9.34600830078125e-05, "num_input_tokens_seen": 120584704, "step": 1745 }, { "epoch": 109.125, "grad_norm": 1.6084002598386766, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 120656384, "step": 1746 }, { "epoch": 109.125, "loss": 0.0006961078615859151, "loss_ce": 7.049749547149986e-05, "loss_xval": 0.0006256103515625, "num_input_tokens_seen": 120656384, "step": 1746 }, { "epoch": 109.1875, "grad_norm": 1.8576904874062472, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 120728000, "step": 1747 }, { "epoch": 109.1875, "loss": 0.0008069484028965235, "loss_ce": 8.597062696935609e-05, "loss_xval": 0.000720977783203125, "num_input_tokens_seen": 120728000, "step": 1747 }, { "epoch": 109.25, "grad_norm": 0.4006157882739807, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 120799552, "step": 1748 }, { "epoch": 109.25, "loss": 0.0001714739773888141, "loss_ce": 6.561612826772034e-05, "loss_xval": 0.00010585784912109375, "num_input_tokens_seen": 120799552, "step": 1748 }, { "epoch": 109.3125, "grad_norm": 1.7118247767867223, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 120871168, "step": 1749 }, { "epoch": 109.3125, "loss": 0.0007678983965888619, "loss_ce": 6.21793806203641e-05, "loss_xval": 0.000705718994140625, "num_input_tokens_seen": 120871168, "step": 1749 }, { "epoch": 109.375, "grad_norm": 2.6824813308961724, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 120930240, "step": 1750 }, { "epoch": 109.375, "eval_synth_IoU": 0.06953348591923714, "eval_synth_MAE_x": 0.02272796630859375, "eval_synth_MAE_y": 0.023560963571071625, "eval_synth_NUM_probability": 0.9992043226957321, "eval_synth_inside_bbox": 0.1875, "eval_synth_loss": 0.0006334885256364942, "eval_synth_loss_ce": 7.082072261255234e-05, "eval_synth_loss_xval": 0.0005626678466796875, "eval_synth_runtime": 58.1486, "eval_synth_samples_per_second": 2.201, "eval_synth_steps_per_second": 0.069, "num_input_tokens_seen": 120930240, "step": 1750 }, { "epoch": 109.375, "loss": 0.0005780013161711395, "loss_ce": 6.301718531176448e-05, "loss_xval": 0.000514984130859375, "num_input_tokens_seen": 120930240, "step": 1750 }, { "epoch": 109.4375, "grad_norm": 1.579022788002766, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 121001920, "step": 1751 }, { "epoch": 109.4375, "loss": 0.0007101305527612567, "loss_ce": 7.307608029805124e-05, "loss_xval": 0.000637054443359375, "num_input_tokens_seen": 121001920, "step": 1751 }, { "epoch": 109.5, "grad_norm": 0.7241471043715434, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 121073472, "step": 1752 }, { "epoch": 109.5, "loss": 0.00023262601462192833, "loss_ce": 6.76403651596047e-05, "loss_xval": 0.00016498565673828125, "num_input_tokens_seen": 121073472, "step": 1752 }, { "epoch": 109.5625, "grad_norm": 2.1732530424914747, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 121145088, "step": 1753 }, { "epoch": 109.5625, "loss": 0.0010409480892121792, "loss_ce": 6.820028647780418e-05, "loss_xval": 0.000972747802734375, "num_input_tokens_seen": 121145088, "step": 1753 }, { "epoch": 109.625, "grad_norm": 1.7185759244489052, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 121216640, "step": 1754 }, { "epoch": 109.625, "loss": 0.0007494474994018674, "loss_ce": 6.2801998865325e-05, "loss_xval": 0.0006866455078125, "num_input_tokens_seen": 121216640, "step": 1754 }, { "epoch": 109.6875, "grad_norm": 0.25256629473919784, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 121288256, "step": 1755 }, { "epoch": 109.6875, "loss": 0.00019676412921398878, "loss_ce": 6.038870196789503e-05, "loss_xval": 0.00013637542724609375, "num_input_tokens_seen": 121288256, "step": 1755 }, { "epoch": 109.75, "grad_norm": 2.321287818343787, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 121347520, "step": 1756 }, { "epoch": 109.75, "loss": 0.0010661354754120111, "loss_ce": 6.668480637017637e-05, "loss_xval": 0.00099945068359375, "num_input_tokens_seen": 121347520, "step": 1756 }, { "epoch": 109.8125, "grad_norm": 3.325342099713657, "learning_rate": 5e-05, "loss": 0.0022, "num_input_tokens_seen": 121419072, "step": 1757 }, { "epoch": 109.8125, "loss": 0.0022488064132630825, "loss_ce": 6.679958460154012e-05, "loss_xval": 0.0021820068359375, "num_input_tokens_seen": 121419072, "step": 1757 }, { "epoch": 109.875, "grad_norm": 3.001496617326765, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 121490688, "step": 1758 }, { "epoch": 109.875, "loss": 0.001875939778983593, "loss_ce": 6.014389146002941e-05, "loss_xval": 0.0018157958984375, "num_input_tokens_seen": 121490688, "step": 1758 }, { "epoch": 109.9375, "grad_norm": 1.6426331612052185, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 121562304, "step": 1759 }, { "epoch": 109.9375, "loss": 0.0006351894699037075, "loss_ce": 7.061428914312273e-05, "loss_xval": 0.0005645751953125, "num_input_tokens_seen": 121562304, "step": 1759 }, { "epoch": 110.0, "grad_norm": 0.14074563650367997, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 121633856, "step": 1760 }, { "epoch": 110.0, "loss": 0.00021442785509862006, "loss_ce": 6.851567741250619e-05, "loss_xval": 0.00014591217041015625, "num_input_tokens_seen": 121633856, "step": 1760 }, { "epoch": 110.0625, "grad_norm": 1.5574652891852028, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 121705536, "step": 1761 }, { "epoch": 110.0625, "loss": 0.0006171943969093263, "loss_ce": 6.406330066965893e-05, "loss_xval": 0.000553131103515625, "num_input_tokens_seen": 121705536, "step": 1761 }, { "epoch": 110.125, "grad_norm": 2.288040547426898, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 121777280, "step": 1762 }, { "epoch": 110.125, "loss": 0.0011686455691233277, "loss_ce": 7.00127420714125e-05, "loss_xval": 0.0010986328125, "num_input_tokens_seen": 121777280, "step": 1762 }, { "epoch": 110.1875, "grad_norm": 2.129444147921264, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 121836480, "step": 1763 }, { "epoch": 110.1875, "loss": 0.0010745597537606955, "loss_ce": 5.985027382848784e-05, "loss_xval": 0.00101470947265625, "num_input_tokens_seen": 121836480, "step": 1763 }, { "epoch": 110.25, "grad_norm": 1.3842064988364935, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 121908032, "step": 1764 }, { "epoch": 110.25, "loss": 0.000571783515624702, "loss_ce": 6.824348383815959e-05, "loss_xval": 0.0005035400390625, "num_input_tokens_seen": 121908032, "step": 1764 }, { "epoch": 110.3125, "grad_norm": 0.35625028169896966, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 121967168, "step": 1765 }, { "epoch": 110.3125, "loss": 0.00019284302834421396, "loss_ce": 6.505066266981885e-05, "loss_xval": 0.0001277923583984375, "num_input_tokens_seen": 121967168, "step": 1765 }, { "epoch": 110.375, "grad_norm": 0.598471501459658, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 122038784, "step": 1766 }, { "epoch": 110.375, "loss": 0.00024847392342053354, "loss_ce": 7.490520511055365e-05, "loss_xval": 0.0001735687255859375, "num_input_tokens_seen": 122038784, "step": 1766 }, { "epoch": 110.4375, "grad_norm": 1.177837228473711, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 122110528, "step": 1767 }, { "epoch": 110.4375, "loss": 0.00036571957753039896, "loss_ce": 6.435850082198158e-05, "loss_xval": 0.000301361083984375, "num_input_tokens_seen": 122110528, "step": 1767 }, { "epoch": 110.5, "grad_norm": 1.4204740711254538, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 122182208, "step": 1768 }, { "epoch": 110.5, "loss": 0.0005603337194770575, "loss_ce": 6.823776493547484e-05, "loss_xval": 0.000492095947265625, "num_input_tokens_seen": 122182208, "step": 1768 }, { "epoch": 110.5625, "grad_norm": 1.6478446466696302, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 122253952, "step": 1769 }, { "epoch": 110.5625, "loss": 0.0006315922364592552, "loss_ce": 7.08317311364226e-05, "loss_xval": 0.000560760498046875, "num_input_tokens_seen": 122253952, "step": 1769 }, { "epoch": 110.625, "grad_norm": 1.9859732568458073, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 122325504, "step": 1770 }, { "epoch": 110.625, "loss": 0.0007901331409811974, "loss_ce": 6.534063868457451e-05, "loss_xval": 0.00072479248046875, "num_input_tokens_seen": 122325504, "step": 1770 }, { "epoch": 110.6875, "grad_norm": 2.3332948078117006, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 122384704, "step": 1771 }, { "epoch": 110.6875, "loss": 0.001085848081856966, "loss_ce": 7.113863102858886e-05, "loss_xval": 0.00101470947265625, "num_input_tokens_seen": 122384704, "step": 1771 }, { "epoch": 110.75, "grad_norm": 2.6076680683930005, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 122456448, "step": 1772 }, { "epoch": 110.75, "loss": 0.0012727116700261831, "loss_ce": 6.72673195367679e-05, "loss_xval": 0.0012054443359375, "num_input_tokens_seen": 122456448, "step": 1772 }, { "epoch": 110.8125, "grad_norm": 2.9984594669057536, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 122528000, "step": 1773 }, { "epoch": 110.8125, "loss": 0.0018300075316801667, "loss_ce": 5.998796768835746e-05, "loss_xval": 0.00177001953125, "num_input_tokens_seen": 122528000, "step": 1773 }, { "epoch": 110.875, "grad_norm": 3.598605406045106, "learning_rate": 5e-05, "loss": 0.0025, "num_input_tokens_seen": 122599616, "step": 1774 }, { "epoch": 110.875, "loss": 0.0025792140513658524, "loss_ce": 6.151380512164906e-05, "loss_xval": 0.0025177001953125, "num_input_tokens_seen": 122599616, "step": 1774 }, { "epoch": 110.9375, "grad_norm": 4.1017815818888055, "learning_rate": 5e-05, "loss": 0.0033, "num_input_tokens_seen": 122671232, "step": 1775 }, { "epoch": 110.9375, "loss": 0.003606092417612672, "loss_ce": 6.605328235309571e-05, "loss_xval": 0.0035400390625, "num_input_tokens_seen": 122671232, "step": 1775 }, { "epoch": 111.0, "grad_norm": 4.37458727859117, "learning_rate": 5e-05, "loss": 0.0037, "num_input_tokens_seen": 122742848, "step": 1776 }, { "epoch": 111.0, "loss": 0.0038295018021017313, "loss_ce": 7.583970727864653e-05, "loss_xval": 0.003753662109375, "num_input_tokens_seen": 122742848, "step": 1776 }, { "epoch": 111.0625, "grad_norm": 4.258724618254019, "learning_rate": 5e-05, "loss": 0.0036, "num_input_tokens_seen": 122814592, "step": 1777 }, { "epoch": 111.0625, "loss": 0.0033998247236013412, "loss_ce": 7.3408788011875e-05, "loss_xval": 0.003326416015625, "num_input_tokens_seen": 122814592, "step": 1777 }, { "epoch": 111.125, "grad_norm": 3.6412286226566386, "learning_rate": 5e-05, "loss": 0.0028, "num_input_tokens_seen": 122886208, "step": 1778 }, { "epoch": 111.125, "loss": 0.0030623050406575203, "loss_ce": 7.158235530368984e-05, "loss_xval": 0.00299072265625, "num_input_tokens_seen": 122886208, "step": 1778 }, { "epoch": 111.1875, "grad_norm": 2.301903912547345, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 122957760, "step": 1779 }, { "epoch": 111.1875, "loss": 0.001317744143307209, "loss_ce": 7.415279105771333e-05, "loss_xval": 0.00124359130859375, "num_input_tokens_seen": 122957760, "step": 1779 }, { "epoch": 111.25, "grad_norm": 0.1169441178714884, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 123029504, "step": 1780 }, { "epoch": 111.25, "loss": 0.0003047786885872483, "loss_ce": 7.875786104705185e-05, "loss_xval": 0.00022602081298828125, "num_input_tokens_seen": 123029504, "step": 1780 }, { "epoch": 111.3125, "grad_norm": 2.676408616858053, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 123101184, "step": 1781 }, { "epoch": 111.3125, "loss": 0.001780191552825272, "loss_ce": 7.883652142481878e-05, "loss_xval": 0.00170135498046875, "num_input_tokens_seen": 123101184, "step": 1781 }, { "epoch": 111.375, "grad_norm": 5.026821121010569, "learning_rate": 5e-05, "loss": 0.0051, "num_input_tokens_seen": 123172992, "step": 1782 }, { "epoch": 111.375, "loss": 0.005271148402243853, "loss_ce": 8.315989543916658e-05, "loss_xval": 0.00518798828125, "num_input_tokens_seen": 123172992, "step": 1782 }, { "epoch": 111.4375, "grad_norm": 7.014898222716356, "learning_rate": 5e-05, "loss": 0.0097, "num_input_tokens_seen": 123244608, "step": 1783 }, { "epoch": 111.4375, "loss": 0.009538796730339527, "loss_ce": 7.834726420696825e-05, "loss_xval": 0.00946044921875, "num_input_tokens_seen": 123244608, "step": 1783 }, { "epoch": 111.5, "grad_norm": 8.57632780067167, "learning_rate": 5e-05, "loss": 0.0143, "num_input_tokens_seen": 123316224, "step": 1784 }, { "epoch": 111.5, "loss": 0.014065294526517391, "loss_ce": 8.824347605695948e-05, "loss_xval": 0.01397705078125, "num_input_tokens_seen": 123316224, "step": 1784 }, { "epoch": 111.5625, "grad_norm": 8.879834626558313, "learning_rate": 5e-05, "loss": 0.0154, "num_input_tokens_seen": 123375360, "step": 1785 }, { "epoch": 111.5625, "loss": 0.015540524385869503, "loss_ce": 9.86294326139614e-05, "loss_xval": 0.01544189453125, "num_input_tokens_seen": 123375360, "step": 1785 }, { "epoch": 111.625, "grad_norm": 7.156892270448019, "learning_rate": 5e-05, "loss": 0.0104, "num_input_tokens_seen": 123446976, "step": 1786 }, { "epoch": 111.625, "loss": 0.010456508956849575, "loss_ce": 0.00014156715769786388, "loss_xval": 0.01031494140625, "num_input_tokens_seen": 123446976, "step": 1786 }, { "epoch": 111.6875, "grad_norm": 3.4572267030304777, "learning_rate": 5e-05, "loss": 0.0026, "num_input_tokens_seen": 123518528, "step": 1787 }, { "epoch": 111.6875, "loss": 0.0025650986935943365, "loss_ce": 0.00012369242904242128, "loss_xval": 0.00244140625, "num_input_tokens_seen": 123518528, "step": 1787 }, { "epoch": 111.75, "grad_norm": 0.8988618961510938, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 123590208, "step": 1788 }, { "epoch": 111.75, "loss": 0.00045180803863331676, "loss_ce": 0.00015044694009702653, "loss_xval": 0.000301361083984375, "num_input_tokens_seen": 123590208, "step": 1788 }, { "epoch": 111.8125, "grad_norm": 4.2547116814228225, "learning_rate": 5e-05, "loss": 0.0039, "num_input_tokens_seen": 123649344, "step": 1789 }, { "epoch": 111.8125, "loss": 0.0041702440939843655, "loss_ce": 0.00017244124319404364, "loss_xval": 0.003997802734375, "num_input_tokens_seen": 123649344, "step": 1789 }, { "epoch": 111.875, "grad_norm": 5.244946357466308, "learning_rate": 5e-05, "loss": 0.0059, "num_input_tokens_seen": 123721024, "step": 1790 }, { "epoch": 111.875, "loss": 0.006065647583454847, "loss_ce": 0.00020627248159144074, "loss_xval": 0.005859375, "num_input_tokens_seen": 123721024, "step": 1790 }, { "epoch": 111.9375, "grad_norm": 3.45428635969614, "learning_rate": 5e-05, "loss": 0.0028, "num_input_tokens_seen": 123792576, "step": 1791 }, { "epoch": 111.9375, "loss": 0.0029510704334825277, "loss_ce": 0.00023500603856518865, "loss_xval": 0.002716064453125, "num_input_tokens_seen": 123792576, "step": 1791 }, { "epoch": 112.0, "grad_norm": 0.4139652085726806, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 123851712, "step": 1792 }, { "epoch": 112.0, "loss": 0.0004514800093602389, "loss_ce": 0.00020829305867664516, "loss_xval": 0.00024318695068359375, "num_input_tokens_seen": 123851712, "step": 1792 }, { "epoch": 112.0625, "grad_norm": 4.381395109553456, "learning_rate": 5e-05, "loss": 0.0043, "num_input_tokens_seen": 123923328, "step": 1793 }, { "epoch": 112.0625, "loss": 0.004271045792847872, "loss_ce": 0.00024272565497085452, "loss_xval": 0.0040283203125, "num_input_tokens_seen": 123923328, "step": 1793 }, { "epoch": 112.125, "grad_norm": 5.976886761872509, "learning_rate": 5e-05, "loss": 0.0079, "num_input_tokens_seen": 123995136, "step": 1794 }, { "epoch": 112.125, "loss": 0.007744569797068834, "loss_ce": 0.00026776306913234293, "loss_xval": 0.007476806640625, "num_input_tokens_seen": 123995136, "step": 1794 }, { "epoch": 112.1875, "grad_norm": 4.295502004175747, "learning_rate": 5e-05, "loss": 0.0042, "num_input_tokens_seen": 124066688, "step": 1795 }, { "epoch": 112.1875, "loss": 0.0045149922370910645, "loss_ce": 0.0002730487904045731, "loss_xval": 0.004241943359375, "num_input_tokens_seen": 124066688, "step": 1795 }, { "epoch": 112.25, "grad_norm": 0.5322849004613961, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 124138368, "step": 1796 }, { "epoch": 112.25, "loss": 0.0004541517118923366, "loss_ce": 0.00029107340378686786, "loss_xval": 0.00016307830810546875, "num_input_tokens_seen": 124138368, "step": 1796 }, { "epoch": 112.3125, "grad_norm": 3.1494265214095427, "learning_rate": 5e-05, "loss": 0.0025, "num_input_tokens_seen": 124210176, "step": 1797 }, { "epoch": 112.3125, "loss": 0.002794926520437002, "loss_ce": 0.00030774399056099355, "loss_xval": 0.0024871826171875, "num_input_tokens_seen": 124210176, "step": 1797 }, { "epoch": 112.375, "grad_norm": 5.114076833623887, "learning_rate": 5e-05, "loss": 0.006, "num_input_tokens_seen": 124281856, "step": 1798 }, { "epoch": 112.375, "loss": 0.006075653247535229, "loss_ce": 0.000338348385412246, "loss_xval": 0.0057373046875, "num_input_tokens_seen": 124281856, "step": 1798 }, { "epoch": 112.4375, "grad_norm": 4.469994657184072, "learning_rate": 5e-05, "loss": 0.0048, "num_input_tokens_seen": 124353408, "step": 1799 }, { "epoch": 112.4375, "loss": 0.004576649982482195, "loss_ce": 0.0003957415756303817, "loss_xval": 0.004180908203125, "num_input_tokens_seen": 124353408, "step": 1799 }, { "epoch": 112.5, "grad_norm": 1.534273645092906, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 124425088, "step": 1800 }, { "epoch": 112.5, "loss": 0.0012304669944569468, "loss_ce": 0.0003836042305920273, "loss_xval": 0.00084686279296875, "num_input_tokens_seen": 124425088, "step": 1800 }, { "epoch": 112.5625, "grad_norm": 1.6878461682804873, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 124496768, "step": 1801 }, { "epoch": 112.5625, "loss": 0.0012798135867342353, "loss_ce": 0.00033758333302102983, "loss_xval": 0.000942230224609375, "num_input_tokens_seen": 124496768, "step": 1801 }, { "epoch": 112.625, "grad_norm": 3.394315017740658, "learning_rate": 5e-05, "loss": 0.003, "num_input_tokens_seen": 124555840, "step": 1802 }, { "epoch": 112.625, "loss": 0.0028806827031075954, "loss_ce": 0.00034772377694025636, "loss_xval": 0.002532958984375, "num_input_tokens_seen": 124555840, "step": 1802 }, { "epoch": 112.6875, "grad_norm": 3.0832818029007254, "learning_rate": 5e-05, "loss": 0.0025, "num_input_tokens_seen": 124627456, "step": 1803 }, { "epoch": 112.6875, "loss": 0.002533203223720193, "loss_ce": 0.0003817139659076929, "loss_xval": 0.0021514892578125, "num_input_tokens_seen": 124627456, "step": 1803 }, { "epoch": 112.75, "grad_norm": 1.014619500099443, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 124699072, "step": 1804 }, { "epoch": 112.75, "loss": 0.0006874875398352742, "loss_ce": 0.0003308133454993367, "loss_xval": 0.0003566741943359375, "num_input_tokens_seen": 124699072, "step": 1804 }, { "epoch": 112.8125, "grad_norm": 1.7040306007143686, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 124770688, "step": 1805 }, { "epoch": 112.8125, "loss": 0.0010238613467663527, "loss_ce": 0.00030288362177088857, "loss_xval": 0.000720977783203125, "num_input_tokens_seen": 124770688, "step": 1805 }, { "epoch": 112.875, "grad_norm": 3.357459877271798, "learning_rate": 5e-05, "loss": 0.0028, "num_input_tokens_seen": 124842304, "step": 1806 }, { "epoch": 112.875, "loss": 0.002878846600651741, "loss_ce": 0.0003306289145257324, "loss_xval": 0.0025482177734375, "num_input_tokens_seen": 124842304, "step": 1806 }, { "epoch": 112.9375, "grad_norm": 3.4384850104981792, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 124913856, "step": 1807 }, { "epoch": 112.9375, "loss": 0.00293713784776628, "loss_ce": 0.0003126260999124497, "loss_xval": 0.00262451171875, "num_input_tokens_seen": 124913856, "step": 1807 }, { "epoch": 113.0, "grad_norm": 2.036426365495195, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 124985408, "step": 1808 }, { "epoch": 113.0, "loss": 0.001254330389201641, "loss_ce": 0.0002548796765040606, "loss_xval": 0.00099945068359375, "num_input_tokens_seen": 124985408, "step": 1808 }, { "epoch": 113.0625, "grad_norm": 0.357243663340432, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 125044544, "step": 1809 }, { "epoch": 113.0625, "loss": 0.0005287157255224884, "loss_ce": 0.0002931581693701446, "loss_xval": 0.00023555755615234375, "num_input_tokens_seen": 125044544, "step": 1809 }, { "epoch": 113.125, "grad_norm": 2.53961593001829, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 125103616, "step": 1810 }, { "epoch": 113.125, "loss": 0.0017481737304478884, "loss_ce": 0.00026044173864647746, "loss_xval": 0.00148773193359375, "num_input_tokens_seen": 125103616, "step": 1810 }, { "epoch": 113.1875, "grad_norm": 3.941602045394896, "learning_rate": 5e-05, "loss": 0.0036, "num_input_tokens_seen": 125175296, "step": 1811 }, { "epoch": 113.1875, "loss": 0.0037777761463075876, "loss_ce": 0.0002377371274633333, "loss_xval": 0.0035400390625, "num_input_tokens_seen": 125175296, "step": 1811 }, { "epoch": 113.25, "grad_norm": 4.1087198992751075, "learning_rate": 5e-05, "loss": 0.0039, "num_input_tokens_seen": 125246976, "step": 1812 }, { "epoch": 113.25, "loss": 0.004106709733605385, "loss_ce": 0.00023097720986697823, "loss_xval": 0.003875732421875, "num_input_tokens_seen": 125246976, "step": 1812 }, { "epoch": 113.3125, "grad_norm": 3.2760181391357883, "learning_rate": 5e-05, "loss": 0.0026, "num_input_tokens_seen": 125318656, "step": 1813 }, { "epoch": 113.3125, "loss": 0.002692433074116707, "loss_ce": 0.00022050918778404593, "loss_xval": 0.002471923828125, "num_input_tokens_seen": 125318656, "step": 1813 }, { "epoch": 113.375, "grad_norm": 1.9032552197242543, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 125390336, "step": 1814 }, { "epoch": 113.375, "loss": 0.001287812483496964, "loss_ce": 0.00021969727822579443, "loss_xval": 0.001068115234375, "num_input_tokens_seen": 125390336, "step": 1814 }, { "epoch": 113.4375, "grad_norm": 0.43698459078532, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 125462016, "step": 1815 }, { "epoch": 113.4375, "loss": 0.00039364141412079334, "loss_ce": 0.00020672124810516834, "loss_xval": 0.000186920166015625, "num_input_tokens_seen": 125462016, "step": 1815 }, { "epoch": 113.5, "grad_norm": 0.8694389485657945, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 125533632, "step": 1816 }, { "epoch": 113.5, "loss": 0.0004984450060874224, "loss_ce": 0.00017801046487875283, "loss_xval": 0.0003204345703125, "num_input_tokens_seen": 125533632, "step": 1816 }, { "epoch": 113.5625, "grad_norm": 1.8447178924289218, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 125605184, "step": 1817 }, { "epoch": 113.5625, "loss": 0.0012481531593948603, "loss_ce": 0.0001800378959160298, "loss_xval": 0.001068115234375, "num_input_tokens_seen": 125605184, "step": 1817 }, { "epoch": 113.625, "grad_norm": 2.4990876372368533, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 125676800, "step": 1818 }, { "epoch": 113.625, "loss": 0.0014198764692991972, "loss_ce": 0.00018391459889244288, "loss_xval": 0.0012359619140625, "num_input_tokens_seen": 125676800, "step": 1818 }, { "epoch": 113.6875, "grad_norm": 3.34970916140925, "learning_rate": 5e-05, "loss": 0.0027, "num_input_tokens_seen": 125748416, "step": 1819 }, { "epoch": 113.6875, "loss": 0.0026685905177146196, "loss_ce": 0.00015089042426552624, "loss_xval": 0.0025177001953125, "num_input_tokens_seen": 125748416, "step": 1819 }, { "epoch": 113.75, "grad_norm": 4.698109000689552, "learning_rate": 5e-05, "loss": 0.0048, "num_input_tokens_seen": 125807424, "step": 1820 }, { "epoch": 113.75, "loss": 0.00491245137527585, "loss_ce": 0.000151709042256698, "loss_xval": 0.0047607421875, "num_input_tokens_seen": 125807424, "step": 1820 }, { "epoch": 113.8125, "grad_norm": 6.367293742130005, "learning_rate": 5e-05, "loss": 0.0086, "num_input_tokens_seen": 125879104, "step": 1821 }, { "epoch": 113.8125, "loss": 0.008711196482181549, "loss_ce": 0.0001662747235968709, "loss_xval": 0.008544921875, "num_input_tokens_seen": 125879104, "step": 1821 }, { "epoch": 113.875, "grad_norm": 7.886708899895091, "learning_rate": 5e-05, "loss": 0.0131, "num_input_tokens_seen": 125950784, "step": 1822 }, { "epoch": 113.875, "loss": 0.01309604849666357, "loss_ce": 0.00015659494965802878, "loss_xval": 0.012939453125, "num_input_tokens_seen": 125950784, "step": 1822 }, { "epoch": 113.9375, "grad_norm": 8.846094764017112, "learning_rate": 5e-05, "loss": 0.0164, "num_input_tokens_seen": 126022464, "step": 1823 }, { "epoch": 113.9375, "loss": 0.016144748777151108, "loss_ce": 0.00015353858179878443, "loss_xval": 0.0159912109375, "num_input_tokens_seen": 126022464, "step": 1823 }, { "epoch": 114.0, "grad_norm": 8.424385504717284, "learning_rate": 5e-05, "loss": 0.015, "num_input_tokens_seen": 126094016, "step": 1824 }, { "epoch": 114.0, "loss": 0.015658153221011162, "loss_ce": 0.00015522287867497653, "loss_xval": 0.0155029296875, "num_input_tokens_seen": 126094016, "step": 1824 }, { "epoch": 114.0625, "grad_norm": 6.13634496945625, "learning_rate": 5e-05, "loss": 0.0082, "num_input_tokens_seen": 126165696, "step": 1825 }, { "epoch": 114.0625, "loss": 0.008386150002479553, "loss_ce": 0.00020743937056977302, "loss_xval": 0.0081787109375, "num_input_tokens_seen": 126165696, "step": 1825 }, { "epoch": 114.125, "grad_norm": 2.491914014905266, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 126237248, "step": 1826 }, { "epoch": 114.125, "loss": 0.0016902722418308258, "loss_ce": 0.00020254029368516058, "loss_xval": 0.00148773193359375, "num_input_tokens_seen": 126237248, "step": 1826 }, { "epoch": 114.1875, "grad_norm": 1.2848642236060528, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 126308928, "step": 1827 }, { "epoch": 114.1875, "loss": 0.0008566059404984117, "loss_ce": 0.0002424397098366171, "loss_xval": 0.000614166259765625, "num_input_tokens_seen": 126308928, "step": 1827 }, { "epoch": 114.25, "grad_norm": 3.851711734139488, "learning_rate": 5e-05, "loss": 0.0036, "num_input_tokens_seen": 126380480, "step": 1828 }, { "epoch": 114.25, "loss": 0.003448725678026676, "loss_ce": 0.0002901563420891762, "loss_xval": 0.0031585693359375, "num_input_tokens_seen": 126380480, "step": 1828 }, { "epoch": 114.3125, "grad_norm": 4.443643290686806, "learning_rate": 5e-05, "loss": 0.0047, "num_input_tokens_seen": 126452160, "step": 1829 }, { "epoch": 114.3125, "loss": 0.004588674288243055, "loss_ce": 0.00025517813628539443, "loss_xval": 0.00433349609375, "num_input_tokens_seen": 126452160, "step": 1829 }, { "epoch": 114.375, "grad_norm": 3.386106327231258, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 126523840, "step": 1830 }, { "epoch": 114.375, "loss": 0.0029100049287080765, "loss_ce": 0.00031601072987541556, "loss_xval": 0.002593994140625, "num_input_tokens_seen": 126523840, "step": 1830 }, { "epoch": 114.4375, "grad_norm": 1.9251156894869557, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 126595392, "step": 1831 }, { "epoch": 114.4375, "loss": 0.001533187460154295, "loss_ce": 0.0003582606732379645, "loss_xval": 0.0011749267578125, "num_input_tokens_seen": 126595392, "step": 1831 }, { "epoch": 114.5, "grad_norm": 0.5046940819061044, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 126667008, "step": 1832 }, { "epoch": 114.5, "loss": 0.0006944824708625674, "loss_ce": 0.0003339935792610049, "loss_xval": 0.0003604888916015625, "num_input_tokens_seen": 126667008, "step": 1832 }, { "epoch": 114.5625, "grad_norm": 0.9328194283619768, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 126738624, "step": 1833 }, { "epoch": 114.5625, "loss": 0.0007143397815525532, "loss_ce": 0.00035003622178919613, "loss_xval": 0.0003643035888671875, "num_input_tokens_seen": 126738624, "step": 1833 }, { "epoch": 114.625, "grad_norm": 1.5869105539345048, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 126810304, "step": 1834 }, { "epoch": 114.625, "loss": 0.0008443781407549977, "loss_ce": 0.0003255793417338282, "loss_xval": 0.000518798828125, "num_input_tokens_seen": 126810304, "step": 1834 }, { "epoch": 114.6875, "grad_norm": 1.1512002350734496, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 126881856, "step": 1835 }, { "epoch": 114.6875, "loss": 0.0007516160840168595, "loss_ce": 0.0003300920652691275, "loss_xval": 0.0004215240478515625, "num_input_tokens_seen": 126881856, "step": 1835 }, { "epoch": 114.75, "grad_norm": 0.15802612896501098, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 126953536, "step": 1836 }, { "epoch": 114.75, "loss": 0.0006052050739526749, "loss_ce": 0.0003229174471925944, "loss_xval": 0.00028228759765625, "num_input_tokens_seen": 126953536, "step": 1836 }, { "epoch": 114.8125, "grad_norm": 1.384495541854308, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 127025344, "step": 1837 }, { "epoch": 114.8125, "loss": 0.0007899554912000895, "loss_ce": 0.0003283771511632949, "loss_xval": 0.000461578369140625, "num_input_tokens_seen": 127025344, "step": 1837 }, { "epoch": 114.875, "grad_norm": 2.5352190525658584, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 127097152, "step": 1838 }, { "epoch": 114.875, "loss": 0.0019897250458598137, "loss_ce": 0.0002883701235987246, "loss_xval": 0.00170135498046875, "num_input_tokens_seen": 127097152, "step": 1838 }, { "epoch": 114.9375, "grad_norm": 3.571308713408931, "learning_rate": 5e-05, "loss": 0.0032, "num_input_tokens_seen": 127168768, "step": 1839 }, { "epoch": 114.9375, "loss": 0.003045588731765747, "loss_ce": 0.0002532302460167557, "loss_xval": 0.0027923583984375, "num_input_tokens_seen": 127168768, "step": 1839 }, { "epoch": 115.0, "grad_norm": 4.6709850727627344, "learning_rate": 5e-05, "loss": 0.0051, "num_input_tokens_seen": 127227840, "step": 1840 }, { "epoch": 115.0, "loss": 0.004910253453999758, "loss_ce": 0.0002715814916882664, "loss_xval": 0.004638671875, "num_input_tokens_seen": 127227840, "step": 1840 }, { "epoch": 115.0625, "grad_norm": 5.525343372867988, "learning_rate": 5e-05, "loss": 0.0069, "num_input_tokens_seen": 127299520, "step": 1841 }, { "epoch": 115.0625, "loss": 0.0067585185170173645, "loss_ce": 0.00022775662364438176, "loss_xval": 0.00653076171875, "num_input_tokens_seen": 127299520, "step": 1841 }, { "epoch": 115.125, "grad_norm": 5.74424661005289, "learning_rate": 5e-05, "loss": 0.0074, "num_input_tokens_seen": 127371136, "step": 1842 }, { "epoch": 115.125, "loss": 0.00769760413095355, "loss_ce": 0.00022079766495153308, "loss_xval": 0.007476806640625, "num_input_tokens_seen": 127371136, "step": 1842 }, { "epoch": 115.1875, "grad_norm": 5.468632833948617, "learning_rate": 5e-05, "loss": 0.0068, "num_input_tokens_seen": 127442880, "step": 1843 }, { "epoch": 115.1875, "loss": 0.006827354431152344, "loss_ce": 0.0002050399052677676, "loss_xval": 0.006622314453125, "num_input_tokens_seen": 127442880, "step": 1843 }, { "epoch": 115.25, "grad_norm": 4.465560051973801, "learning_rate": 5e-05, "loss": 0.0047, "num_input_tokens_seen": 127514560, "step": 1844 }, { "epoch": 115.25, "loss": 0.004257781431078911, "loss_ce": 0.0002294613077538088, "loss_xval": 0.0040283203125, "num_input_tokens_seen": 127514560, "step": 1844 }, { "epoch": 115.3125, "grad_norm": 2.294471964577755, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 127586240, "step": 1845 }, { "epoch": 115.3125, "loss": 0.0017640250734984875, "loss_ce": 0.00023051683092489839, "loss_xval": 0.00153350830078125, "num_input_tokens_seen": 127586240, "step": 1845 }, { "epoch": 115.375, "grad_norm": 0.5333315440861227, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 127657856, "step": 1846 }, { "epoch": 115.375, "loss": 0.0005950859049335122, "loss_ce": 0.00021552354155573994, "loss_xval": 0.0003795623779296875, "num_input_tokens_seen": 127657856, "step": 1846 }, { "epoch": 115.4375, "grad_norm": 2.9353209276907544, "learning_rate": 5e-05, "loss": 0.0022, "num_input_tokens_seen": 127729472, "step": 1847 }, { "epoch": 115.4375, "loss": 0.002198727335780859, "loss_ce": 0.00024560230667702854, "loss_xval": 0.001953125, "num_input_tokens_seen": 127729472, "step": 1847 }, { "epoch": 115.5, "grad_norm": 4.950894094113668, "learning_rate": 5e-05, "loss": 0.0057, "num_input_tokens_seen": 127801152, "step": 1848 }, { "epoch": 115.5, "loss": 0.005807612556964159, "loss_ce": 0.00022289558546617627, "loss_xval": 0.005584716796875, "num_input_tokens_seen": 127801152, "step": 1848 }, { "epoch": 115.5625, "grad_norm": 6.535244946174206, "learning_rate": 5e-05, "loss": 0.0094, "num_input_tokens_seen": 127872768, "step": 1849 }, { "epoch": 115.5625, "loss": 0.009224435314536095, "loss_ce": 0.00019123233505524695, "loss_xval": 0.009033203125, "num_input_tokens_seen": 127872768, "step": 1849 }, { "epoch": 115.625, "grad_norm": 7.165273740342595, "learning_rate": 5e-05, "loss": 0.0113, "num_input_tokens_seen": 127931968, "step": 1850 }, { "epoch": 115.625, "loss": 0.01137017086148262, "loss_ce": 0.00020073754421900958, "loss_xval": 0.01116943359375, "num_input_tokens_seen": 127931968, "step": 1850 }, { "epoch": 115.6875, "grad_norm": 6.6107148447934145, "learning_rate": 5e-05, "loss": 0.0098, "num_input_tokens_seen": 128003584, "step": 1851 }, { "epoch": 115.6875, "loss": 0.01010982133448124, "loss_ce": 0.00022212599287740886, "loss_xval": 0.0098876953125, "num_input_tokens_seen": 128003584, "step": 1851 }, { "epoch": 115.75, "grad_norm": 5.118953223208179, "learning_rate": 5e-05, "loss": 0.0059, "num_input_tokens_seen": 128075200, "step": 1852 }, { "epoch": 115.75, "loss": 0.0058305393904447556, "loss_ce": 0.0002153048844775185, "loss_xval": 0.005615234375, "num_input_tokens_seen": 128075200, "step": 1852 }, { "epoch": 115.8125, "grad_norm": 2.865843839872992, "learning_rate": 5e-05, "loss": 0.0021, "num_input_tokens_seen": 128146944, "step": 1853 }, { "epoch": 115.8125, "loss": 0.0022164033725857735, "loss_ce": 0.00020224327454343438, "loss_xval": 0.00201416015625, "num_input_tokens_seen": 128146944, "step": 1853 }, { "epoch": 115.875, "grad_norm": 0.1575066950867638, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 128218624, "step": 1854 }, { "epoch": 115.875, "loss": 0.0004789860104210675, "loss_ce": 0.00022149394499137998, "loss_xval": 0.0002574920654296875, "num_input_tokens_seen": 128218624, "step": 1854 }, { "epoch": 115.9375, "grad_norm": 2.6798771718241228, "learning_rate": 5e-05, "loss": 0.002, "num_input_tokens_seen": 128290176, "step": 1855 }, { "epoch": 115.9375, "loss": 0.002353307791054249, "loss_ce": 0.0002475949004292488, "loss_xval": 0.002105712890625, "num_input_tokens_seen": 128290176, "step": 1855 }, { "epoch": 116.0, "grad_norm": 4.719576085613883, "learning_rate": 5e-05, "loss": 0.0054, "num_input_tokens_seen": 128361728, "step": 1856 }, { "epoch": 116.0, "loss": 0.005584153346717358, "loss_ce": 0.00027409486938267946, "loss_xval": 0.00531005859375, "num_input_tokens_seen": 128361728, "step": 1856 }, { "epoch": 116.0625, "grad_norm": 5.490063572209329, "learning_rate": 5e-05, "loss": 0.0072, "num_input_tokens_seen": 128433280, "step": 1857 }, { "epoch": 116.0625, "loss": 0.0072921644896268845, "loss_ce": 0.0002731215790845454, "loss_xval": 0.00701904296875, "num_input_tokens_seen": 128433280, "step": 1857 }, { "epoch": 116.125, "grad_norm": 4.931355390541039, "learning_rate": 5e-05, "loss": 0.0061, "num_input_tokens_seen": 128504832, "step": 1858 }, { "epoch": 116.125, "loss": 0.005588597152382135, "loss_ce": 0.00024802106781862676, "loss_xval": 0.005340576171875, "num_input_tokens_seen": 128504832, "step": 1858 }, { "epoch": 116.1875, "grad_norm": 3.4001732010288155, "learning_rate": 5e-05, "loss": 0.0031, "num_input_tokens_seen": 128576384, "step": 1859 }, { "epoch": 116.1875, "loss": 0.003236029762774706, "loss_ce": 0.0003216010518372059, "loss_xval": 0.0029144287109375, "num_input_tokens_seen": 128576384, "step": 1859 }, { "epoch": 116.25, "grad_norm": 1.2787257238501162, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 128648064, "step": 1860 }, { "epoch": 116.25, "loss": 0.0010641159024089575, "loss_ce": 0.0002554001403041184, "loss_xval": 0.0008087158203125, "num_input_tokens_seen": 128648064, "step": 1860 }, { "epoch": 116.3125, "grad_norm": 1.2800305157776937, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 128719808, "step": 1861 }, { "epoch": 116.3125, "loss": 0.0008978757541626692, "loss_ce": 0.0002646360080689192, "loss_xval": 0.00063323974609375, "num_input_tokens_seen": 128719808, "step": 1861 }, { "epoch": 116.375, "grad_norm": 3.485110576462092, "learning_rate": 5e-05, "loss": 0.0032, "num_input_tokens_seen": 128791424, "step": 1862 }, { "epoch": 116.375, "loss": 0.0031012576073408127, "loss_ce": 0.0002478641690686345, "loss_xval": 0.0028533935546875, "num_input_tokens_seen": 128791424, "step": 1862 }, { "epoch": 116.4375, "grad_norm": 4.646144569167161, "learning_rate": 5e-05, "loss": 0.0052, "num_input_tokens_seen": 128863104, "step": 1863 }, { "epoch": 116.4375, "loss": 0.0047195046208798885, "loss_ce": 0.00029445585096254945, "loss_xval": 0.004425048828125, "num_input_tokens_seen": 128863104, "step": 1863 }, { "epoch": 116.5, "grad_norm": 4.549980043580312, "learning_rate": 5e-05, "loss": 0.0051, "num_input_tokens_seen": 128934720, "step": 1864 }, { "epoch": 116.5, "loss": 0.0052688755095005035, "loss_ce": 0.000263992726104334, "loss_xval": 0.0050048828125, "num_input_tokens_seen": 128934720, "step": 1864 }, { "epoch": 116.5625, "grad_norm": 3.0547045447153636, "learning_rate": 5e-05, "loss": 0.0026, "num_input_tokens_seen": 129006336, "step": 1865 }, { "epoch": 116.5625, "loss": 0.0026751989498734474, "loss_ce": 0.00024905154714360833, "loss_xval": 0.0024261474609375, "num_input_tokens_seen": 129006336, "step": 1865 }, { "epoch": 116.625, "grad_norm": 0.5058449032568803, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 129077888, "step": 1866 }, { "epoch": 116.625, "loss": 0.0005971361533738673, "loss_ce": 0.0002442766563035548, "loss_xval": 0.0003528594970703125, "num_input_tokens_seen": 129077888, "step": 1866 }, { "epoch": 116.6875, "grad_norm": 2.4142117198655995, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 129149568, "step": 1867 }, { "epoch": 116.6875, "loss": 0.0016070627607405186, "loss_ce": 0.000271918746875599, "loss_xval": 0.00133514404296875, "num_input_tokens_seen": 129149568, "step": 1867 }, { "epoch": 116.75, "grad_norm": 5.0569314143632935, "learning_rate": 5e-05, "loss": 0.006, "num_input_tokens_seen": 129221184, "step": 1868 }, { "epoch": 116.75, "loss": 0.006282792426645756, "loss_ce": 0.0002097943506669253, "loss_xval": 0.006072998046875, "num_input_tokens_seen": 129221184, "step": 1868 }, { "epoch": 116.8125, "grad_norm": 7.532738506332238, "learning_rate": 5e-05, "loss": 0.0129, "num_input_tokens_seen": 129292800, "step": 1869 }, { "epoch": 116.8125, "loss": 0.013102911412715912, "loss_ce": 0.0002244935603812337, "loss_xval": 0.01287841796875, "num_input_tokens_seen": 129292800, "step": 1869 }, { "epoch": 116.875, "grad_norm": 9.355417713150146, "learning_rate": 5e-05, "loss": 0.0197, "num_input_tokens_seen": 129351872, "step": 1870 }, { "epoch": 116.875, "loss": 0.019691236317157745, "loss_ce": 0.00028205677517689764, "loss_xval": 0.0194091796875, "num_input_tokens_seen": 129351872, "step": 1870 }, { "epoch": 116.9375, "grad_norm": 9.051156641686775, "learning_rate": 5e-05, "loss": 0.0189, "num_input_tokens_seen": 129423488, "step": 1871 }, { "epoch": 116.9375, "loss": 0.01793898269534111, "loss_ce": 0.00023878809588495642, "loss_xval": 0.0177001953125, "num_input_tokens_seen": 129423488, "step": 1871 }, { "epoch": 117.0, "grad_norm": 5.676332544955607, "learning_rate": 5e-05, "loss": 0.0082, "num_input_tokens_seen": 129495168, "step": 1872 }, { "epoch": 117.0, "loss": 0.006899159401655197, "loss_ce": 0.0003378799301572144, "loss_xval": 0.006561279296875, "num_input_tokens_seen": 129495168, "step": 1872 }, { "epoch": 117.0625, "grad_norm": 0.24712015306062302, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 129566848, "step": 1873 }, { "epoch": 117.0625, "loss": 0.000838251318782568, "loss_ce": 0.000342340674251318, "loss_xval": 0.00049591064453125, "num_input_tokens_seen": 129566848, "step": 1873 }, { "epoch": 117.125, "grad_norm": 5.538933794151056, "learning_rate": 5e-05, "loss": 0.0074, "num_input_tokens_seen": 129638464, "step": 1874 }, { "epoch": 117.125, "loss": 0.007606185507029295, "loss_ce": 0.0004040371277369559, "loss_xval": 0.0072021484375, "num_input_tokens_seen": 129638464, "step": 1874 }, { "epoch": 117.1875, "grad_norm": 7.734305494423269, "learning_rate": 5e-05, "loss": 0.0141, "num_input_tokens_seen": 129710080, "step": 1875 }, { "epoch": 117.1875, "loss": 0.013922973535954952, "loss_ce": 0.00037316849920898676, "loss_xval": 0.0135498046875, "num_input_tokens_seen": 129710080, "step": 1875 }, { "epoch": 117.25, "grad_norm": 6.0224839866175826, "learning_rate": 5e-05, "loss": 0.0091, "num_input_tokens_seen": 129781760, "step": 1876 }, { "epoch": 117.25, "loss": 0.010079562664031982, "loss_ce": 0.00037497258745133877, "loss_xval": 0.00970458984375, "num_input_tokens_seen": 129781760, "step": 1876 }, { "epoch": 117.3125, "grad_norm": 1.8034154244820024, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 129840832, "step": 1877 }, { "epoch": 117.3125, "loss": 0.0012554076965898275, "loss_ce": 0.000412359629990533, "loss_xval": 0.000843048095703125, "num_input_tokens_seen": 129840832, "step": 1877 }, { "epoch": 117.375, "grad_norm": 2.803885585326666, "learning_rate": 5e-05, "loss": 0.0024, "num_input_tokens_seen": 129912448, "step": 1878 }, { "epoch": 117.375, "loss": 0.0023236768320202827, "loss_ce": 0.0003934399283025414, "loss_xval": 0.00193023681640625, "num_input_tokens_seen": 129912448, "step": 1878 }, { "epoch": 117.4375, "grad_norm": 5.350260126337846, "learning_rate": 5e-05, "loss": 0.0073, "num_input_tokens_seen": 129984064, "step": 1879 }, { "epoch": 117.4375, "loss": 0.007126620039343834, "loss_ce": 0.0004737878334708512, "loss_xval": 0.00665283203125, "num_input_tokens_seen": 129984064, "step": 1879 }, { "epoch": 117.5, "grad_norm": 4.234827947508276, "learning_rate": 5e-05, "loss": 0.0047, "num_input_tokens_seen": 130055616, "step": 1880 }, { "epoch": 117.5, "loss": 0.004482659976929426, "loss_ce": 0.00045433969353325665, "loss_xval": 0.0040283203125, "num_input_tokens_seen": 130055616, "step": 1880 }, { "epoch": 117.5625, "grad_norm": 0.40401382532919683, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 130127232, "step": 1881 }, { "epoch": 117.5625, "loss": 0.0006351525080390275, "loss_ce": 0.00046349113108590245, "loss_xval": 0.000171661376953125, "num_input_tokens_seen": 130127232, "step": 1881 }, { "epoch": 117.625, "grad_norm": 3.515571860108049, "learning_rate": 5e-05, "loss": 0.0035, "num_input_tokens_seen": 130198848, "step": 1882 }, { "epoch": 117.625, "loss": 0.0034939174074679613, "loss_ce": 0.00041164198773913085, "loss_xval": 0.003082275390625, "num_input_tokens_seen": 130198848, "step": 1882 }, { "epoch": 117.6875, "grad_norm": 5.162499769376859, "learning_rate": 5e-05, "loss": 0.0067, "num_input_tokens_seen": 130270528, "step": 1883 }, { "epoch": 117.6875, "loss": 0.006677284371107817, "loss_ce": 0.0003906633937731385, "loss_xval": 0.00628662109375, "num_input_tokens_seen": 130270528, "step": 1883 }, { "epoch": 117.75, "grad_norm": 3.7342067107303967, "learning_rate": 5e-05, "loss": 0.0038, "num_input_tokens_seen": 130329536, "step": 1884 }, { "epoch": 117.75, "loss": 0.0037594609893858433, "loss_ce": 0.00032623333390802145, "loss_xval": 0.0034332275390625, "num_input_tokens_seen": 130329536, "step": 1884 }, { "epoch": 117.8125, "grad_norm": 0.4550271944054085, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 130388672, "step": 1885 }, { "epoch": 117.8125, "loss": 0.0004483010561671108, "loss_ce": 0.00033290646388195455, "loss_xval": 0.00011539459228515625, "num_input_tokens_seen": 130388672, "step": 1885 }, { "epoch": 117.875, "grad_norm": 2.5099531673200572, "learning_rate": 5e-05, "loss": 0.0021, "num_input_tokens_seen": 130460480, "step": 1886 }, { "epoch": 117.875, "loss": 0.0020212167873978615, "loss_ce": 0.0002969737397506833, "loss_xval": 0.0017242431640625, "num_input_tokens_seen": 130460480, "step": 1886 }, { "epoch": 117.9375, "grad_norm": 3.7727950222198268, "learning_rate": 5e-05, "loss": 0.0037, "num_input_tokens_seen": 130532032, "step": 1887 }, { "epoch": 117.9375, "loss": 0.0038818458560854197, "loss_ce": 0.0002502540301065892, "loss_xval": 0.003631591796875, "num_input_tokens_seen": 130532032, "step": 1887 }, { "epoch": 118.0, "grad_norm": 3.3771414897134897, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 130591104, "step": 1888 }, { "epoch": 118.0, "loss": 0.0025949934497475624, "loss_ce": 0.00024513984681107104, "loss_xval": 0.002349853515625, "num_input_tokens_seen": 130591104, "step": 1888 }, { "epoch": 118.0625, "grad_norm": 1.8822174090023722, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 130662656, "step": 1889 }, { "epoch": 118.0625, "loss": 0.0014085653237998486, "loss_ce": 0.00023363855143543333, "loss_xval": 0.0011749267578125, "num_input_tokens_seen": 130662656, "step": 1889 }, { "epoch": 118.125, "grad_norm": 0.22173542948007965, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 130734400, "step": 1890 }, { "epoch": 118.125, "loss": 0.00043304392602294683, "loss_ce": 0.0001889032864710316, "loss_xval": 0.000244140625, "num_input_tokens_seen": 130734400, "step": 1890 }, { "epoch": 118.1875, "grad_norm": 2.3935184610564866, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 130793408, "step": 1891 }, { "epoch": 118.1875, "loss": 0.0013886751839891076, "loss_ce": 0.0001756014535203576, "loss_xval": 0.00121307373046875, "num_input_tokens_seen": 130793408, "step": 1891 }, { "epoch": 118.25, "grad_norm": 4.620740343148861, "learning_rate": 5e-05, "loss": 0.005, "num_input_tokens_seen": 130865088, "step": 1892 }, { "epoch": 118.25, "loss": 0.005361003335565329, "loss_ce": 0.00017301514162681997, "loss_xval": 0.00518798828125, "num_input_tokens_seen": 130865088, "step": 1892 }, { "epoch": 118.3125, "grad_norm": 6.107267740441691, "learning_rate": 5e-05, "loss": 0.0084, "num_input_tokens_seen": 130936704, "step": 1893 }, { "epoch": 118.3125, "loss": 0.00820300355553627, "loss_ce": 0.00014636323612648994, "loss_xval": 0.008056640625, "num_input_tokens_seen": 130936704, "step": 1893 }, { "epoch": 118.375, "grad_norm": 6.598447170792395, "learning_rate": 5e-05, "loss": 0.0098, "num_input_tokens_seen": 131008256, "step": 1894 }, { "epoch": 118.375, "loss": 0.00978381372988224, "loss_ce": 0.00014025870768819004, "loss_xval": 0.0096435546875, "num_input_tokens_seen": 131008256, "step": 1894 }, { "epoch": 118.4375, "grad_norm": 5.97610984602622, "learning_rate": 5e-05, "loss": 0.0082, "num_input_tokens_seen": 131067264, "step": 1895 }, { "epoch": 118.4375, "loss": 0.0079050837084651, "loss_ce": 0.00015361930127255619, "loss_xval": 0.00775146484375, "num_input_tokens_seen": 131067264, "step": 1895 }, { "epoch": 118.5, "grad_norm": 3.8296539045437776, "learning_rate": 5e-05, "loss": 0.0036, "num_input_tokens_seen": 131138944, "step": 1896 }, { "epoch": 118.5, "loss": 0.003549471264705062, "loss_ce": 0.0002077965618809685, "loss_xval": 0.0033416748046875, "num_input_tokens_seen": 131138944, "step": 1896 }, { "epoch": 118.5625, "grad_norm": 0.5031231600520943, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 131210496, "step": 1897 }, { "epoch": 118.5625, "loss": 0.00042222841875627637, "loss_ce": 0.00021242006914690137, "loss_xval": 0.000209808349609375, "num_input_tokens_seen": 131210496, "step": 1897 }, { "epoch": 118.625, "grad_norm": 2.7708119254564774, "learning_rate": 5e-05, "loss": 0.0021, "num_input_tokens_seen": 131282176, "step": 1898 }, { "epoch": 118.625, "loss": 0.0018496726406738162, "loss_ce": 0.00027801733813248575, "loss_xval": 0.0015716552734375, "num_input_tokens_seen": 131282176, "step": 1898 }, { "epoch": 118.6875, "grad_norm": 4.503105217437891, "learning_rate": 5e-05, "loss": 0.005, "num_input_tokens_seen": 131353792, "step": 1899 }, { "epoch": 118.6875, "loss": 0.005182057619094849, "loss_ce": 0.00023820977366995066, "loss_xval": 0.00494384765625, "num_input_tokens_seen": 131353792, "step": 1899 }, { "epoch": 118.75, "grad_norm": 4.32176320487952, "learning_rate": 5e-05, "loss": 0.0047, "num_input_tokens_seen": 131425408, "step": 1900 }, { "epoch": 118.75, "loss": 0.004484056495130062, "loss_ce": 0.0002726308593992144, "loss_xval": 0.00421142578125, "num_input_tokens_seen": 131425408, "step": 1900 }, { "epoch": 118.8125, "grad_norm": 2.625321999415006, "learning_rate": 5e-05, "loss": 0.002, "num_input_tokens_seen": 131484544, "step": 1901 }, { "epoch": 118.8125, "loss": 0.0019633802585303783, "loss_ce": 0.00030017219251021743, "loss_xval": 0.0016632080078125, "num_input_tokens_seen": 131484544, "step": 1901 }, { "epoch": 118.875, "grad_norm": 0.08851930350664257, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 131556160, "step": 1902 }, { "epoch": 118.875, "loss": 0.0004228780453559011, "loss_ce": 0.0002989003842230886, "loss_xval": 0.0001239776611328125, "num_input_tokens_seen": 131556160, "step": 1902 }, { "epoch": 118.9375, "grad_norm": 2.4963959699155094, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 131602752, "step": 1903 }, { "epoch": 118.9375, "loss": 0.0017550225602462888, "loss_ce": 0.00027492005028761923, "loss_xval": 0.0014801025390625, "num_input_tokens_seen": 131602752, "step": 1903 }, { "epoch": 119.0, "grad_norm": 4.442726334947545, "learning_rate": 5e-05, "loss": 0.005, "num_input_tokens_seen": 131674304, "step": 1904 }, { "epoch": 119.0, "loss": 0.005132955964654684, "loss_ce": 0.00025014366838149726, "loss_xval": 0.0048828125, "num_input_tokens_seen": 131674304, "step": 1904 }, { "epoch": 119.0625, "grad_norm": 5.2266029914078125, "learning_rate": 5e-05, "loss": 0.0068, "num_input_tokens_seen": 131745856, "step": 1905 }, { "epoch": 119.0625, "loss": 0.006491591222584248, "loss_ce": 0.0002660052268765867, "loss_xval": 0.0062255859375, "num_input_tokens_seen": 131745856, "step": 1905 }, { "epoch": 119.125, "grad_norm": 4.301569943752984, "learning_rate": 5e-05, "loss": 0.0047, "num_input_tokens_seen": 131817408, "step": 1906 }, { "epoch": 119.125, "loss": 0.0044795856811106205, "loss_ce": 0.0002681601035874337, "loss_xval": 0.00421142578125, "num_input_tokens_seen": 131817408, "step": 1906 }, { "epoch": 119.1875, "grad_norm": 1.8553873164934465, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 131889024, "step": 1907 }, { "epoch": 119.1875, "loss": 0.0012664373498409986, "loss_ce": 0.0002517279062885791, "loss_xval": 0.00101470947265625, "num_input_tokens_seen": 131889024, "step": 1907 }, { "epoch": 119.25, "grad_norm": 0.8385600978339574, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 131960704, "step": 1908 }, { "epoch": 119.25, "loss": 0.0005585540202446282, "loss_ce": 0.0002533782389946282, "loss_xval": 0.00030517578125, "num_input_tokens_seen": 131960704, "step": 1908 }, { "epoch": 119.3125, "grad_norm": 2.8320554472079498, "learning_rate": 5e-05, "loss": 0.0023, "num_input_tokens_seen": 132032384, "step": 1909 }, { "epoch": 119.3125, "loss": 0.002158309333026409, "loss_ce": 0.00023570179473608732, "loss_xval": 0.001922607421875, "num_input_tokens_seen": 132032384, "step": 1909 }, { "epoch": 119.375, "grad_norm": 4.023752035863257, "learning_rate": 5e-05, "loss": 0.004, "num_input_tokens_seen": 132104064, "step": 1910 }, { "epoch": 119.375, "loss": 0.0039420705288648605, "loss_ce": 0.00023418469936586916, "loss_xval": 0.0037078857421875, "num_input_tokens_seen": 132104064, "step": 1910 }, { "epoch": 119.4375, "grad_norm": 4.339042522343855, "learning_rate": 5e-05, "loss": 0.0046, "num_input_tokens_seen": 132175616, "step": 1911 }, { "epoch": 119.4375, "loss": 0.004613110329955816, "loss_ce": 0.00021857912361156195, "loss_xval": 0.00439453125, "num_input_tokens_seen": 132175616, "step": 1911 }, { "epoch": 119.5, "grad_norm": 3.577390118173682, "learning_rate": 5e-05, "loss": 0.0032, "num_input_tokens_seen": 132247296, "step": 1912 }, { "epoch": 119.5, "loss": 0.0032142093405127525, "loss_ce": 0.0001777103025233373, "loss_xval": 0.0030364990234375, "num_input_tokens_seen": 132247296, "step": 1912 }, { "epoch": 119.5625, "grad_norm": 1.843513969482811, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 132318912, "step": 1913 }, { "epoch": 119.5625, "loss": 0.0010311749065294862, "loss_ce": 0.0001919415226439014, "loss_xval": 0.0008392333984375, "num_input_tokens_seen": 132318912, "step": 1913 }, { "epoch": 119.625, "grad_norm": 0.3208426926764874, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 132390592, "step": 1914 }, { "epoch": 119.625, "loss": 0.0003046026104129851, "loss_ce": 0.00020398998458404094, "loss_xval": 0.00010061264038085938, "num_input_tokens_seen": 132390592, "step": 1914 }, { "epoch": 119.6875, "grad_norm": 2.265117670760533, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 132462336, "step": 1915 }, { "epoch": 119.6875, "loss": 0.0014705532230436802, "loss_ce": 0.00015829737822059542, "loss_xval": 0.001312255859375, "num_input_tokens_seen": 132462336, "step": 1915 }, { "epoch": 119.75, "grad_norm": 3.5286223420609417, "learning_rate": 5e-05, "loss": 0.0031, "num_input_tokens_seen": 132534016, "step": 1916 }, { "epoch": 119.75, "loss": 0.003026542253792286, "loss_ce": 0.00017314877186436206, "loss_xval": 0.0028533935546875, "num_input_tokens_seen": 132534016, "step": 1916 }, { "epoch": 119.8125, "grad_norm": 3.8044836959593145, "learning_rate": 5e-05, "loss": 0.0035, "num_input_tokens_seen": 132593216, "step": 1917 }, { "epoch": 119.8125, "loss": 0.0033919373527169228, "loss_ce": 0.00014181526785250753, "loss_xval": 0.0032501220703125, "num_input_tokens_seen": 132593216, "step": 1917 }, { "epoch": 119.875, "grad_norm": 3.434972428474604, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 132664832, "step": 1918 }, { "epoch": 119.875, "loss": 0.0028979736380279064, "loss_ce": 0.00013613273040391505, "loss_xval": 0.0027618408203125, "num_input_tokens_seen": 132664832, "step": 1918 }, { "epoch": 119.9375, "grad_norm": 2.858343072749603, "learning_rate": 5e-05, "loss": 0.0021, "num_input_tokens_seen": 132736512, "step": 1919 }, { "epoch": 119.9375, "loss": 0.0021448051556944847, "loss_ce": 0.00013064501399639994, "loss_xval": 0.00201416015625, "num_input_tokens_seen": 132736512, "step": 1919 }, { "epoch": 120.0, "grad_norm": 2.0550669726698705, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 132808256, "step": 1920 }, { "epoch": 120.0, "loss": 0.0010152951581403613, "loss_ce": 0.00014554419612977654, "loss_xval": 0.0008697509765625, "num_input_tokens_seen": 132808256, "step": 1920 }, { "epoch": 120.0625, "grad_norm": 0.9455822939077928, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 132880000, "step": 1921 }, { "epoch": 120.0625, "loss": 0.0003893776738550514, "loss_ce": 0.0001242562138941139, "loss_xval": 0.0002651214599609375, "num_input_tokens_seen": 132880000, "step": 1921 }, { "epoch": 120.125, "grad_norm": 0.038142623378173694, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 132951616, "step": 1922 }, { "epoch": 120.125, "loss": 0.00025403310428373516, "loss_ce": 0.00013625432620756328, "loss_xval": 0.00011777877807617188, "num_input_tokens_seen": 132951616, "step": 1922 }, { "epoch": 120.1875, "grad_norm": 0.517983666843393, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 133023296, "step": 1923 }, { "epoch": 120.1875, "loss": 0.0002651893300935626, "loss_ce": 0.00013072125148028135, "loss_xval": 0.00013446807861328125, "num_input_tokens_seen": 133023296, "step": 1923 }, { "epoch": 120.25, "grad_norm": 0.5278042857056948, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 133094912, "step": 1924 }, { "epoch": 120.25, "loss": 0.00028297933749854565, "loss_ce": 0.00011322532373014838, "loss_xval": 0.0001697540283203125, "num_input_tokens_seen": 133094912, "step": 1924 }, { "epoch": 120.3125, "grad_norm": 0.19533164384864551, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 133166592, "step": 1925 }, { "epoch": 120.3125, "loss": 0.0001814560528146103, "loss_ce": 0.00012256666377652436, "loss_xval": 5.888938903808594e-05, "num_input_tokens_seen": 133166592, "step": 1925 }, { "epoch": 120.375, "grad_norm": 0.26742184823554127, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 133238208, "step": 1926 }, { "epoch": 120.375, "loss": 0.00021792534971609712, "loss_ce": 0.000109206470369827, "loss_xval": 0.0001087188720703125, "num_input_tokens_seen": 133238208, "step": 1926 }, { "epoch": 120.4375, "grad_norm": 0.6238052263308073, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 133297344, "step": 1927 }, { "epoch": 120.4375, "loss": 0.0002904341381508857, "loss_ce": 9.874559327727184e-05, "loss_xval": 0.00019168853759765625, "num_input_tokens_seen": 133297344, "step": 1927 }, { "epoch": 120.5, "grad_norm": 0.8868703067433092, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 133369024, "step": 1928 }, { "epoch": 120.5, "loss": 0.00046274211490526795, "loss_ce": 8.508710016030818e-05, "loss_xval": 0.000377655029296875, "num_input_tokens_seen": 133369024, "step": 1928 }, { "epoch": 120.5625, "grad_norm": 1.3927712553743166, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 133440704, "step": 1929 }, { "epoch": 120.5625, "loss": 0.0006976852891966701, "loss_ce": 9.114840213442221e-05, "loss_xval": 0.000606536865234375, "num_input_tokens_seen": 133440704, "step": 1929 }, { "epoch": 120.625, "grad_norm": 2.6743346780382424, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 133512256, "step": 1930 }, { "epoch": 120.625, "loss": 0.00175421591848135, "loss_ce": 8.337857434526086e-05, "loss_xval": 0.00167083740234375, "num_input_tokens_seen": 133512256, "step": 1930 }, { "epoch": 120.6875, "grad_norm": 5.5814413905267255, "learning_rate": 5e-05, "loss": 0.0071, "num_input_tokens_seen": 133571392, "step": 1931 }, { "epoch": 120.6875, "loss": 0.007034452632069588, "loss_ce": 7.644500874448568e-05, "loss_xval": 0.0069580078125, "num_input_tokens_seen": 133571392, "step": 1931 }, { "epoch": 120.75, "grad_norm": 11.156357970593493, "learning_rate": 5e-05, "loss": 0.0282, "num_input_tokens_seen": 133643008, "step": 1932 }, { "epoch": 120.75, "loss": 0.028513383120298386, "loss_ce": 7.099966751411557e-05, "loss_xval": 0.0284423828125, "num_input_tokens_seen": 133643008, "step": 1932 }, { "epoch": 120.8125, "grad_norm": 19.248052553147836, "learning_rate": 5e-05, "loss": 0.086, "num_input_tokens_seen": 133714624, "step": 1933 }, { "epoch": 120.8125, "loss": 0.085076242685318, "loss_ce": 0.00011530210758792236, "loss_xval": 0.0849609375, "num_input_tokens_seen": 133714624, "step": 1933 }, { "epoch": 120.875, "grad_norm": 20.62678420915086, "learning_rate": 5e-05, "loss": 0.1021, "num_input_tokens_seen": 133773696, "step": 1934 }, { "epoch": 120.875, "loss": 0.10283531993627548, "loss_ce": 0.00029625691240653396, "loss_xval": 0.1025390625, "num_input_tokens_seen": 133773696, "step": 1934 }, { "epoch": 120.9375, "grad_norm": 7.228987067817792, "learning_rate": 5e-05, "loss": 0.0146, "num_input_tokens_seen": 133845440, "step": 1935 }, { "epoch": 120.9375, "loss": 0.015175651758909225, "loss_ce": 0.001259636483155191, "loss_xval": 0.013916015625, "num_input_tokens_seen": 133845440, "step": 1935 }, { "epoch": 121.0, "grad_norm": 9.84998372390247, "learning_rate": 5e-05, "loss": 0.0271, "num_input_tokens_seen": 133917056, "step": 1936 }, { "epoch": 121.0, "loss": 0.02591935358941555, "loss_ce": 0.0019935716409236193, "loss_xval": 0.02392578125, "num_input_tokens_seen": 133917056, "step": 1936 }, { "epoch": 121.0625, "grad_norm": 18.91736705034326, "learning_rate": 5e-05, "loss": 0.0873, "num_input_tokens_seen": 133988736, "step": 1937 }, { "epoch": 121.0625, "loss": 0.08870452642440796, "loss_ce": 0.0017904606647789478, "loss_xval": 0.0869140625, "num_input_tokens_seen": 133988736, "step": 1937 }, { "epoch": 121.125, "grad_norm": 11.215001835063116, "learning_rate": 5e-05, "loss": 0.0317, "num_input_tokens_seen": 134060416, "step": 1938 }, { "epoch": 121.125, "loss": 0.030903557315468788, "loss_ce": 0.0012404713779687881, "loss_xval": 0.0296630859375, "num_input_tokens_seen": 134060416, "step": 1938 }, { "epoch": 121.1875, "grad_norm": 3.4933702646752485, "learning_rate": 5e-05, "loss": 0.0058, "num_input_tokens_seen": 134132160, "step": 1939 }, { "epoch": 121.1875, "loss": 0.006178136914968491, "loss_ce": 0.0012953245313838124, "loss_xval": 0.0048828125, "num_input_tokens_seen": 134132160, "step": 1939 }, { "epoch": 121.25, "grad_norm": 25.058268735271895, "learning_rate": 5e-05, "loss": 0.1257, "num_input_tokens_seen": 134203904, "step": 1940 }, { "epoch": 121.25, "loss": 0.1271267533302307, "loss_ce": 0.0011501964181661606, "loss_xval": 0.1259765625, "num_input_tokens_seen": 134203904, "step": 1940 }, { "epoch": 121.3125, "grad_norm": 30.920576182316488, "learning_rate": 5e-05, "loss": 0.2169, "num_input_tokens_seen": 134275456, "step": 1941 }, { "epoch": 121.3125, "loss": 0.21448010206222534, "loss_ce": 0.0006129105458967388, "loss_xval": 0.2138671875, "num_input_tokens_seen": 134275456, "step": 1941 }, { "epoch": 121.375, "grad_norm": 4.388403747873867, "learning_rate": 5e-05, "loss": 0.0117, "num_input_tokens_seen": 134347072, "step": 1942 }, { "epoch": 121.375, "loss": 0.008787467144429684, "loss_ce": 0.0010360024170950055, "loss_xval": 0.00775146484375, "num_input_tokens_seen": 134347072, "step": 1942 }, { "epoch": 121.4375, "grad_norm": 28.057472943062788, "learning_rate": 5e-05, "loss": 0.1948, "num_input_tokens_seen": 134406208, "step": 1943 }, { "epoch": 121.4375, "loss": 0.1948278546333313, "loss_ce": 0.00146848289296031, "loss_xval": 0.193359375, "num_input_tokens_seen": 134406208, "step": 1943 }, { "epoch": 121.5, "grad_norm": 8.07372460241718, "learning_rate": 5e-05, "loss": 0.0311, "num_input_tokens_seen": 134477824, "step": 1944 }, { "epoch": 121.5, "loss": 0.03582604229450226, "loss_ce": 0.008238151669502258, "loss_xval": 0.027587890625, "num_input_tokens_seen": 134477824, "step": 1944 }, { "epoch": 121.5625, "grad_norm": 19.59842984931545, "learning_rate": 5e-05, "loss": 0.1091, "num_input_tokens_seen": 134549440, "step": 1945 }, { "epoch": 121.5625, "loss": 0.10925983637571335, "loss_ce": 0.0164863970130682, "loss_xval": 0.0927734375, "num_input_tokens_seen": 134549440, "step": 1945 }, { "epoch": 121.625, "grad_norm": 25.373277244650303, "learning_rate": 5e-05, "loss": 0.1628, "num_input_tokens_seen": 134621120, "step": 1946 }, { "epoch": 121.625, "loss": 0.15820538997650146, "loss_ce": 0.003908507991582155, "loss_xval": 0.154296875, "num_input_tokens_seen": 134621120, "step": 1946 }, { "epoch": 121.6875, "grad_norm": 5.837645263257035, "learning_rate": 5e-05, "loss": 0.0126, "num_input_tokens_seen": 134692672, "step": 1947 }, { "epoch": 121.6875, "loss": 0.01283172331750393, "loss_ce": 0.0018453949596732855, "loss_xval": 0.010986328125, "num_input_tokens_seen": 134692672, "step": 1947 }, { "epoch": 121.75, "grad_norm": 21.944172020399566, "learning_rate": 5e-05, "loss": 0.115, "num_input_tokens_seen": 134764288, "step": 1948 }, { "epoch": 121.75, "loss": 0.11614223569631577, "loss_ce": 0.0009078634320758283, "loss_xval": 0.115234375, "num_input_tokens_seen": 134764288, "step": 1948 }, { "epoch": 121.8125, "grad_norm": 4.671667038476155, "learning_rate": 5e-05, "loss": 0.0094, "num_input_tokens_seen": 134835968, "step": 1949 }, { "epoch": 121.8125, "loss": 0.007974156178534031, "loss_ce": 0.0005278667085804045, "loss_xval": 0.0074462890625, "num_input_tokens_seen": 134835968, "step": 1949 }, { "epoch": 121.875, "grad_norm": 17.329389706599958, "learning_rate": 5e-05, "loss": 0.075, "num_input_tokens_seen": 134907520, "step": 1950 }, { "epoch": 121.875, "loss": 0.07992716878652573, "loss_ce": 0.0003373274812474847, "loss_xval": 0.07958984375, "num_input_tokens_seen": 134907520, "step": 1950 }, { "epoch": 121.9375, "grad_norm": 4.712997049900013, "learning_rate": 5e-05, "loss": 0.007, "num_input_tokens_seen": 134979072, "step": 1951 }, { "epoch": 121.9375, "loss": 0.006857059895992279, "loss_ce": 0.0002652631083037704, "loss_xval": 0.006591796875, "num_input_tokens_seen": 134979072, "step": 1951 }, { "epoch": 122.0, "grad_norm": 15.58583942859578, "learning_rate": 5e-05, "loss": 0.0579, "num_input_tokens_seen": 135050752, "step": 1952 }, { "epoch": 122.0, "loss": 0.05805974081158638, "loss_ce": 0.00019841421453747898, "loss_xval": 0.057861328125, "num_input_tokens_seen": 135050752, "step": 1952 }, { "epoch": 122.0625, "grad_norm": 7.025742904997995, "learning_rate": 5e-05, "loss": 0.0174, "num_input_tokens_seen": 135122496, "step": 1953 }, { "epoch": 122.0625, "loss": 0.017606699839234352, "loss_ce": 0.0001506455009803176, "loss_xval": 0.0174560546875, "num_input_tokens_seen": 135122496, "step": 1953 }, { "epoch": 122.125, "grad_norm": 15.468862719989644, "learning_rate": 5e-05, "loss": 0.0571, "num_input_tokens_seen": 135181632, "step": 1954 }, { "epoch": 122.125, "loss": 0.056041352450847626, "loss_ce": 0.00013315118849277496, "loss_xval": 0.055908203125, "num_input_tokens_seen": 135181632, "step": 1954 }, { "epoch": 122.1875, "grad_norm": 5.551718809172979, "learning_rate": 5e-05, "loss": 0.0092, "num_input_tokens_seen": 135253248, "step": 1955 }, { "epoch": 122.1875, "loss": 0.008621604181826115, "loss_ce": 0.00013771781232208014, "loss_xval": 0.00848388671875, "num_input_tokens_seen": 135253248, "step": 1955 }, { "epoch": 122.25, "grad_norm": 12.265454105803553, "learning_rate": 5e-05, "loss": 0.0357, "num_input_tokens_seen": 135324800, "step": 1956 }, { "epoch": 122.25, "loss": 0.036517731845378876, "loss_ce": 0.00014077810919843614, "loss_xval": 0.036376953125, "num_input_tokens_seen": 135324800, "step": 1956 }, { "epoch": 122.3125, "grad_norm": 6.687800353400105, "learning_rate": 5e-05, "loss": 0.0118, "num_input_tokens_seen": 135396416, "step": 1957 }, { "epoch": 122.3125, "loss": 0.01237054355442524, "loss_ce": 0.00016351198428310454, "loss_xval": 0.01220703125, "num_input_tokens_seen": 135396416, "step": 1957 }, { "epoch": 122.375, "grad_norm": 10.946746665319191, "learning_rate": 5e-05, "loss": 0.0298, "num_input_tokens_seen": 135467968, "step": 1958 }, { "epoch": 122.375, "loss": 0.030561501160264015, "loss_ce": 0.00016599331866018474, "loss_xval": 0.0303955078125, "num_input_tokens_seen": 135467968, "step": 1958 }, { "epoch": 122.4375, "grad_norm": 6.228161151916856, "learning_rate": 5e-05, "loss": 0.0103, "num_input_tokens_seen": 135539520, "step": 1959 }, { "epoch": 122.4375, "loss": 0.010677915997803211, "loss_ce": 0.00024090414808597416, "loss_xval": 0.01043701171875, "num_input_tokens_seen": 135539520, "step": 1959 }, { "epoch": 122.5, "grad_norm": 5.822338375589962, "learning_rate": 5e-05, "loss": 0.0092, "num_input_tokens_seen": 135611264, "step": 1960 }, { "epoch": 122.5, "loss": 0.010117021389305592, "loss_ce": 0.00029036152409389615, "loss_xval": 0.00982666015625, "num_input_tokens_seen": 135611264, "step": 1960 }, { "epoch": 122.5625, "grad_norm": 8.336276382534857, "learning_rate": 5e-05, "loss": 0.0189, "num_input_tokens_seen": 135682816, "step": 1961 }, { "epoch": 122.5625, "loss": 0.02094731293618679, "loss_ce": 0.00031743032741360366, "loss_xval": 0.0206298828125, "num_input_tokens_seen": 135682816, "step": 1961 }, { "epoch": 122.625, "grad_norm": 1.6092316524284045, "learning_rate": 5e-05, "loss": 0.0048, "num_input_tokens_seen": 135754432, "step": 1962 }, { "epoch": 122.625, "loss": 0.005259785335510969, "loss_ce": 0.000407490530051291, "loss_xval": 0.004852294921875, "num_input_tokens_seen": 135754432, "step": 1962 }, { "epoch": 122.6875, "grad_norm": 8.359250061226241, "learning_rate": 5e-05, "loss": 0.0202, "num_input_tokens_seen": 135825984, "step": 1963 }, { "epoch": 122.6875, "loss": 0.019757412374019623, "loss_ce": 0.00047030241694301367, "loss_xval": 0.019287109375, "num_input_tokens_seen": 135825984, "step": 1963 }, { "epoch": 122.75, "grad_norm": 3.774240853149926, "learning_rate": 5e-05, "loss": 0.0062, "num_input_tokens_seen": 135897728, "step": 1964 }, { "epoch": 122.75, "loss": 0.005050990264862776, "loss_ce": 0.0005649062804877758, "loss_xval": 0.004486083984375, "num_input_tokens_seen": 135897728, "step": 1964 }, { "epoch": 122.8125, "grad_norm": 6.56217554022863, "learning_rate": 5e-05, "loss": 0.0123, "num_input_tokens_seen": 135969408, "step": 1965 }, { "epoch": 122.8125, "loss": 0.013142449781298637, "loss_ce": 0.000630243041086942, "loss_xval": 0.01251220703125, "num_input_tokens_seen": 135969408, "step": 1965 }, { "epoch": 122.875, "grad_norm": 6.069851309504797, "learning_rate": 5e-05, "loss": 0.0105, "num_input_tokens_seen": 136041088, "step": 1966 }, { "epoch": 122.875, "loss": 0.00996310357004404, "loss_ce": 0.0007467948598787189, "loss_xval": 0.00921630859375, "num_input_tokens_seen": 136041088, "step": 1966 }, { "epoch": 122.9375, "grad_norm": 2.64188694224837, "learning_rate": 5e-05, "loss": 0.004, "num_input_tokens_seen": 136112704, "step": 1967 }, { "epoch": 122.9375, "loss": 0.003580670803785324, "loss_ce": 0.000742536096367985, "loss_xval": 0.002838134765625, "num_input_tokens_seen": 136112704, "step": 1967 }, { "epoch": 123.0, "grad_norm": 6.8665627223274255, "learning_rate": 5e-05, "loss": 0.0147, "num_input_tokens_seen": 136184256, "step": 1968 }, { "epoch": 123.0, "loss": 0.012735115364193916, "loss_ce": 0.0008942951099015772, "loss_xval": 0.0118408203125, "num_input_tokens_seen": 136184256, "step": 1968 }, { "epoch": 123.0625, "grad_norm": 0.5807229440386033, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 136255872, "step": 1969 }, { "epoch": 123.0625, "loss": 0.001586060388945043, "loss_ce": 0.0008002327522262931, "loss_xval": 0.00078582763671875, "num_input_tokens_seen": 136255872, "step": 1969 }, { "epoch": 123.125, "grad_norm": 5.465627938504707, "learning_rate": 5e-05, "loss": 0.0088, "num_input_tokens_seen": 136327424, "step": 1970 }, { "epoch": 123.125, "loss": 0.008768781088292599, "loss_ce": 0.0008342110668309033, "loss_xval": 0.0079345703125, "num_input_tokens_seen": 136327424, "step": 1970 }, { "epoch": 123.1875, "grad_norm": 3.2688936917904052, "learning_rate": 5e-05, "loss": 0.004, "num_input_tokens_seen": 136399168, "step": 1971 }, { "epoch": 123.1875, "loss": 0.003709967015311122, "loss_ce": 0.000887090980540961, "loss_xval": 0.0028228759765625, "num_input_tokens_seen": 136399168, "step": 1971 }, { "epoch": 123.25, "grad_norm": 2.859825800308269, "learning_rate": 5e-05, "loss": 0.0042, "num_input_tokens_seen": 136470784, "step": 1972 }, { "epoch": 123.25, "loss": 0.0040792059153318405, "loss_ce": 0.0008901189430616796, "loss_xval": 0.0031890869140625, "num_input_tokens_seen": 136470784, "step": 1972 }, { "epoch": 123.3125, "grad_norm": 4.779754176105408, "learning_rate": 5e-05, "loss": 0.008, "num_input_tokens_seen": 136529984, "step": 1973 }, { "epoch": 123.3125, "loss": 0.008246231824159622, "loss_ce": 0.0008304607472382486, "loss_xval": 0.007415771484375, "num_input_tokens_seen": 136529984, "step": 1973 }, { "epoch": 123.375, "grad_norm": 0.39980392059776326, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 136601664, "step": 1974 }, { "epoch": 123.375, "loss": 0.0014047473669052124, "loss_ce": 0.0008020252571441233, "loss_xval": 0.00060272216796875, "num_input_tokens_seen": 136601664, "step": 1974 }, { "epoch": 123.4375, "grad_norm": 4.187773290468926, "learning_rate": 5e-05, "loss": 0.0062, "num_input_tokens_seen": 136673344, "step": 1975 }, { "epoch": 123.4375, "loss": 0.006777295842766762, "loss_ce": 0.0007348151411861181, "loss_xval": 0.00604248046875, "num_input_tokens_seen": 136673344, "step": 1975 }, { "epoch": 123.5, "grad_norm": 1.7687218212830937, "learning_rate": 5e-05, "loss": 0.0021, "num_input_tokens_seen": 136732416, "step": 1976 }, { "epoch": 123.5, "loss": 0.002407355234026909, "loss_ce": 0.0007212591008283198, "loss_xval": 0.00168609619140625, "num_input_tokens_seen": 136732416, "step": 1976 }, { "epoch": 123.5625, "grad_norm": 2.6514844366807613, "learning_rate": 5e-05, "loss": 0.0033, "num_input_tokens_seen": 136804160, "step": 1977 }, { "epoch": 123.5625, "loss": 0.0029455279000103474, "loss_ce": 0.0006109331152401865, "loss_xval": 0.0023345947265625, "num_input_tokens_seen": 136804160, "step": 1977 }, { "epoch": 123.625, "grad_norm": 3.0990904073083367, "learning_rate": 5e-05, "loss": 0.004, "num_input_tokens_seen": 136875840, "step": 1978 }, { "epoch": 123.625, "loss": 0.003793952288106084, "loss_ce": 0.0004980538506060839, "loss_xval": 0.0032958984375, "num_input_tokens_seen": 136875840, "step": 1978 }, { "epoch": 123.6875, "grad_norm": 1.103623406702498, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 136947584, "step": 1979 }, { "epoch": 123.6875, "loss": 0.0019718646071851254, "loss_ce": 0.0004917620099149644, "loss_xval": 0.0014801025390625, "num_input_tokens_seen": 136947584, "step": 1979 }, { "epoch": 123.75, "grad_norm": 3.151175428198334, "learning_rate": 5e-05, "loss": 0.0034, "num_input_tokens_seen": 137006656, "step": 1980 }, { "epoch": 123.75, "loss": 0.0033435134682804346, "loss_ce": 0.0004443435464054346, "loss_xval": 0.002899169921875, "num_input_tokens_seen": 137006656, "step": 1980 }, { "epoch": 123.8125, "grad_norm": 0.6259623426254022, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 137065728, "step": 1981 }, { "epoch": 123.8125, "loss": 0.0006881886511109769, "loss_ce": 0.00039445696165785193, "loss_xval": 0.000293731689453125, "num_input_tokens_seen": 137065728, "step": 1981 }, { "epoch": 123.875, "grad_norm": 3.066975260146319, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 137137408, "step": 1982 }, { "epoch": 123.875, "loss": 0.002845182316377759, "loss_ce": 0.00037325851735658944, "loss_xval": 0.002471923828125, "num_input_tokens_seen": 137137408, "step": 1982 }, { "epoch": 123.9375, "grad_norm": 1.3733118331002803, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 137209024, "step": 1983 }, { "epoch": 123.9375, "loss": 0.0013497625477612019, "loss_ce": 0.0003426824987400323, "loss_xval": 0.001007080078125, "num_input_tokens_seen": 137209024, "step": 1983 }, { "epoch": 124.0, "grad_norm": 2.5286701579932305, "learning_rate": 5e-05, "loss": 0.0025, "num_input_tokens_seen": 137280704, "step": 1984 }, { "epoch": 124.0, "loss": 0.002422866877168417, "loss_ce": 0.0003018952556885779, "loss_xval": 0.0021209716796875, "num_input_tokens_seen": 137280704, "step": 1984 }, { "epoch": 124.0625, "grad_norm": 1.6838102534552042, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 137339648, "step": 1985 }, { "epoch": 124.0625, "loss": 0.0013198907254263759, "loss_ce": 0.00027466367464512587, "loss_xval": 0.00104522705078125, "num_input_tokens_seen": 137339648, "step": 1985 }, { "epoch": 124.125, "grad_norm": 1.870493706709494, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 137398912, "step": 1986 }, { "epoch": 124.125, "loss": 0.0017201672308146954, "loss_ce": 0.0002476940571796149, "loss_xval": 0.00147247314453125, "num_input_tokens_seen": 137398912, "step": 1986 }, { "epoch": 124.1875, "grad_norm": 1.8316167682634479, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 137470464, "step": 1987 }, { "epoch": 124.1875, "loss": 0.0013492691796272993, "loss_ce": 0.0002201187307946384, "loss_xval": 0.001129150390625, "num_input_tokens_seen": 137470464, "step": 1987 }, { "epoch": 124.25, "grad_norm": 1.3896300670799442, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 137542144, "step": 1988 }, { "epoch": 124.25, "loss": 0.000932362221647054, "loss_ce": 0.00021138445299584419, "loss_xval": 0.000720977783203125, "num_input_tokens_seen": 137542144, "step": 1988 }, { "epoch": 124.3125, "grad_norm": 1.8757599594813787, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 137613760, "step": 1989 }, { "epoch": 124.3125, "loss": 0.0013382441829890013, "loss_ce": 0.0001938350615091622, "loss_xval": 0.0011444091796875, "num_input_tokens_seen": 137613760, "step": 1989 }, { "epoch": 124.375, "grad_norm": 1.0058499642413008, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 137685504, "step": 1990 }, { "epoch": 124.375, "loss": 0.000767315796110779, "loss_ce": 0.00019129650900140405, "loss_xval": 0.000576019287109375, "num_input_tokens_seen": 137685504, "step": 1990 }, { "epoch": 124.4375, "grad_norm": 1.7611596455705885, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 137757120, "step": 1991 }, { "epoch": 124.4375, "loss": 0.0010359555017203093, "loss_ce": 0.00015476047701667994, "loss_xval": 0.000881195068359375, "num_input_tokens_seen": 137757120, "step": 1991 }, { "epoch": 124.5, "grad_norm": 0.7570485915024486, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 137816256, "step": 1992 }, { "epoch": 124.5, "loss": 0.0004390325048007071, "loss_ce": 0.00014911549806129187, "loss_xval": 0.0002899169921875, "num_input_tokens_seen": 137816256, "step": 1992 }, { "epoch": 124.5625, "grad_norm": 1.321495989789071, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 137887872, "step": 1993 }, { "epoch": 124.5625, "loss": 0.000663900631479919, "loss_ce": 0.00014128712064120919, "loss_xval": 0.000522613525390625, "num_input_tokens_seen": 137887872, "step": 1993 }, { "epoch": 124.625, "grad_norm": 0.2112346218673502, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 137946880, "step": 1994 }, { "epoch": 124.625, "loss": 0.0002966892207041383, "loss_ce": 0.00012502782919909805, "loss_xval": 0.000171661376953125, "num_input_tokens_seen": 137946880, "step": 1994 }, { "epoch": 124.6875, "grad_norm": 1.5159169466326932, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 138018496, "step": 1995 }, { "epoch": 124.6875, "loss": 0.0007277600234374404, "loss_ce": 0.00011359379277564585, "loss_xval": 0.000614166259765625, "num_input_tokens_seen": 138018496, "step": 1995 }, { "epoch": 124.75, "grad_norm": 0.32384576031497636, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 138090048, "step": 1996 }, { "epoch": 124.75, "loss": 0.00020585546735674143, "loss_ce": 0.00011621007433859631, "loss_xval": 8.96453857421875e-05, "num_input_tokens_seen": 138090048, "step": 1996 }, { "epoch": 124.8125, "grad_norm": 1.2856806695187504, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 138161664, "step": 1997 }, { "epoch": 124.8125, "loss": 0.0009543233318254352, "loss_ce": 0.00011127521429443732, "loss_xval": 0.000843048095703125, "num_input_tokens_seen": 138161664, "step": 1997 }, { "epoch": 124.875, "grad_norm": 0.273387107407299, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 138233216, "step": 1998 }, { "epoch": 124.875, "loss": 0.00030335847986862063, "loss_ce": 0.00010690158524084836, "loss_xval": 0.0001964569091796875, "num_input_tokens_seen": 138233216, "step": 1998 }, { "epoch": 124.9375, "grad_norm": 1.2811936909150359, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 138304832, "step": 1999 }, { "epoch": 124.9375, "loss": 0.0005950164049863815, "loss_ce": 9.529103408567607e-05, "loss_xval": 0.000499725341796875, "num_input_tokens_seen": 138304832, "step": 1999 }, { "epoch": 125.0, "grad_norm": 0.6247624824023181, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 138363904, "step": 2000 }, { "epoch": 125.0, "eval_synth_IoU": 0.16027227230370045, "eval_synth_MAE_x": 0.00927734375, "eval_synth_MAE_y": 0.016754150390625, "eval_synth_NUM_probability": 0.9989532977342606, "eval_synth_inside_bbox": 0.6875, "eval_synth_loss": 0.00046615718747489154, "eval_synth_loss_ce": 8.802525371720549e-05, "eval_synth_loss_xval": 0.0003781318664550781, "eval_synth_runtime": 63.8838, "eval_synth_samples_per_second": 2.004, "eval_synth_steps_per_second": 0.063, "num_input_tokens_seen": 138363904, "step": 2000 }, { "epoch": 125.0, "loss": 0.0005669583915732801, "loss_ce": 9.20285820029676e-05, "loss_xval": 0.0004749298095703125, "num_input_tokens_seen": 138363904, "step": 2000 }, { "epoch": 125.0625, "grad_norm": 1.1289912466788816, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 138435456, "step": 2001 }, { "epoch": 125.0625, "loss": 0.0005536347161978483, "loss_ce": 8.824166434351355e-05, "loss_xval": 0.00046539306640625, "num_input_tokens_seen": 138435456, "step": 2001 }, { "epoch": 125.125, "grad_norm": 1.3321695183277404, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 138507264, "step": 2002 }, { "epoch": 125.125, "loss": 0.0006004355382174253, "loss_ce": 8.163671736838296e-05, "loss_xval": 0.000518798828125, "num_input_tokens_seen": 138507264, "step": 2002 }, { "epoch": 125.1875, "grad_norm": 0.1435252998460747, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 138578880, "step": 2003 }, { "epoch": 125.1875, "loss": 0.0001957011700142175, "loss_ce": 8.078341488726437e-05, "loss_xval": 0.00011491775512695312, "num_input_tokens_seen": 138578880, "step": 2003 }, { "epoch": 125.25, "grad_norm": 0.7789113344012044, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 138650496, "step": 2004 }, { "epoch": 125.25, "loss": 0.00030843037529848516, "loss_ce": 7.47801605029963e-05, "loss_xval": 0.00023365020751953125, "num_input_tokens_seen": 138650496, "step": 2004 }, { "epoch": 125.3125, "grad_norm": 0.13352900111691626, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 138722112, "step": 2005 }, { "epoch": 125.3125, "loss": 0.0002294059086125344, "loss_ce": 7.49106693547219e-05, "loss_xval": 0.0001544952392578125, "num_input_tokens_seen": 138722112, "step": 2005 }, { "epoch": 125.375, "grad_norm": 0.6646610600448061, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 138793664, "step": 2006 }, { "epoch": 125.375, "loss": 0.00029537681257352233, "loss_ce": 7.221702253445983e-05, "loss_xval": 0.0002231597900390625, "num_input_tokens_seen": 138793664, "step": 2006 }, { "epoch": 125.4375, "grad_norm": 0.2662452337215074, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 138865408, "step": 2007 }, { "epoch": 125.4375, "loss": 0.00023515126667916775, "loss_ce": 6.444357131840661e-05, "loss_xval": 0.00017070770263671875, "num_input_tokens_seen": 138865408, "step": 2007 }, { "epoch": 125.5, "grad_norm": 0.2014708435449277, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 138937024, "step": 2008 }, { "epoch": 125.5, "loss": 0.00019856503058690578, "loss_ce": 6.600430060643703e-05, "loss_xval": 0.00013256072998046875, "num_input_tokens_seen": 138937024, "step": 2008 }, { "epoch": 125.5625, "grad_norm": 0.37715980519558245, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 138996160, "step": 2009 }, { "epoch": 125.5625, "loss": 0.00028794267564080656, "loss_ce": 6.001452129567042e-05, "loss_xval": 0.00022792816162109375, "num_input_tokens_seen": 138996160, "step": 2009 }, { "epoch": 125.625, "grad_norm": 0.15972412896600788, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 139067776, "step": 2010 }, { "epoch": 125.625, "loss": 0.00024122698232531548, "loss_ce": 6.479722651420161e-05, "loss_xval": 0.00017642974853515625, "num_input_tokens_seen": 139067776, "step": 2010 }, { "epoch": 125.6875, "grad_norm": 0.21785802377503982, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 139139392, "step": 2011 }, { "epoch": 125.6875, "loss": 0.0001753378746798262, "loss_ce": 5.994327875669114e-05, "loss_xval": 0.00011539459228515625, "num_input_tokens_seen": 139139392, "step": 2011 }, { "epoch": 125.75, "grad_norm": 0.3215632556672757, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 139211200, "step": 2012 }, { "epoch": 125.75, "loss": 0.0001643118157517165, "loss_ce": 5.893080742680468e-05, "loss_xval": 0.00010538101196289062, "num_input_tokens_seen": 139211200, "step": 2012 }, { "epoch": 125.8125, "grad_norm": 0.0709285778957573, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 139282944, "step": 2013 }, { "epoch": 125.8125, "loss": 9.688051795819774e-05, "loss_ce": 5.7064615248236805e-05, "loss_xval": 3.981590270996094e-05, "num_input_tokens_seen": 139282944, "step": 2013 }, { "epoch": 125.875, "grad_norm": 0.2572826224861247, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 139341952, "step": 2014 }, { "epoch": 125.875, "loss": 0.00015328649897128344, "loss_ce": 5.267385859042406e-05, "loss_xval": 0.00010061264038085938, "num_input_tokens_seen": 139341952, "step": 2014 }, { "epoch": 125.9375, "grad_norm": 0.09740524760517949, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 139413760, "step": 2015 }, { "epoch": 125.9375, "loss": 0.00014573728549294174, "loss_ce": 5.132352816872299e-05, "loss_xval": 9.441375732421875e-05, "num_input_tokens_seen": 139413760, "step": 2015 }, { "epoch": 126.0, "grad_norm": 0.2981512293786988, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 139485376, "step": 2016 }, { "epoch": 126.0, "loss": 0.00012508727377280593, "loss_ce": 4.7362816985696554e-05, "loss_xval": 7.772445678710938e-05, "num_input_tokens_seen": 139485376, "step": 2016 }, { "epoch": 126.0625, "grad_norm": 0.23777569105293894, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 139557120, "step": 2017 }, { "epoch": 126.0625, "loss": 0.0001335121924057603, "loss_ce": 4.7204663133015856e-05, "loss_xval": 8.630752563476562e-05, "num_input_tokens_seen": 139557120, "step": 2017 }, { "epoch": 126.125, "grad_norm": 0.08930608778804024, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 139616256, "step": 2018 }, { "epoch": 126.125, "loss": 0.00016115653852466494, "loss_ce": 4.671562055591494e-05, "loss_xval": 0.00011444091796875, "num_input_tokens_seen": 139616256, "step": 2018 }, { "epoch": 126.1875, "grad_norm": 0.27264871121824047, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 139687808, "step": 2019 }, { "epoch": 126.1875, "loss": 0.00015747947327326983, "loss_ce": 4.589958189171739e-05, "loss_xval": 0.00011157989501953125, "num_input_tokens_seen": 139687808, "step": 2019 }, { "epoch": 126.25, "grad_norm": 0.260372602529021, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 139759424, "step": 2020 }, { "epoch": 126.25, "loss": 9.420116839464754e-05, "loss_ce": 4.3179588828934357e-05, "loss_xval": 5.1021575927734375e-05, "num_input_tokens_seen": 139759424, "step": 2020 }, { "epoch": 126.3125, "grad_norm": 0.024071873095662125, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 139831104, "step": 2021 }, { "epoch": 126.3125, "loss": 9.950671665137634e-05, "loss_ce": 4.1094164771493524e-05, "loss_xval": 5.841255187988281e-05, "num_input_tokens_seen": 139831104, "step": 2021 }, { "epoch": 126.375, "grad_norm": 0.10003631958517323, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 139902784, "step": 2022 }, { "epoch": 126.375, "loss": 8.994160452857614e-05, "loss_ce": 4.05889586545527e-05, "loss_xval": 4.935264587402344e-05, "num_input_tokens_seen": 139902784, "step": 2022 }, { "epoch": 126.4375, "grad_norm": 0.5174751342213327, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 139961856, "step": 2023 }, { "epoch": 126.4375, "loss": 0.0002707853855099529, "loss_ce": 4.094986798008904e-05, "loss_xval": 0.00022983551025390625, "num_input_tokens_seen": 139961856, "step": 2023 }, { "epoch": 126.5, "grad_norm": 1.1338815551773325, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 140033408, "step": 2024 }, { "epoch": 126.5, "loss": 0.0003625124227255583, "loss_ce": 4.20778633269947e-05, "loss_xval": 0.0003204345703125, "num_input_tokens_seen": 140033408, "step": 2024 }, { "epoch": 126.5625, "grad_norm": 0.6000982973521324, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 140105088, "step": 2025 }, { "epoch": 126.5625, "loss": 0.0002064650325337425, "loss_ce": 3.861834920826368e-05, "loss_xval": 0.0001678466796875, "num_input_tokens_seen": 140105088, "step": 2025 }, { "epoch": 126.625, "grad_norm": 0.6923015797971452, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 140176640, "step": 2026 }, { "epoch": 126.625, "loss": 0.0002409739390714094, "loss_ce": 3.879498763126321e-05, "loss_xval": 0.000202178955078125, "num_input_tokens_seen": 140176640, "step": 2026 }, { "epoch": 126.6875, "grad_norm": 1.3144685472037902, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 140248320, "step": 2027 }, { "epoch": 126.6875, "loss": 0.0006072742398828268, "loss_ce": 3.888433275278658e-05, "loss_xval": 0.000568389892578125, "num_input_tokens_seen": 140248320, "step": 2027 }, { "epoch": 126.75, "grad_norm": 0.6489809024564193, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 140319936, "step": 2028 }, { "epoch": 126.75, "loss": 0.00017327992827631533, "loss_ce": 3.7858171708649024e-05, "loss_xval": 0.0001354217529296875, "num_input_tokens_seen": 140319936, "step": 2028 }, { "epoch": 126.8125, "grad_norm": 0.6426814506642058, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 140391552, "step": 2029 }, { "epoch": 126.8125, "loss": 0.00019704973965417594, "loss_ce": 3.587877654354088e-05, "loss_xval": 0.00016117095947265625, "num_input_tokens_seen": 140391552, "step": 2029 }, { "epoch": 126.875, "grad_norm": 1.5260237419116756, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 140463232, "step": 2030 }, { "epoch": 126.875, "loss": 0.0006509263766929507, "loss_ce": 3.676014603115618e-05, "loss_xval": 0.000614166259765625, "num_input_tokens_seen": 140463232, "step": 2030 }, { "epoch": 126.9375, "grad_norm": 1.409406677975525, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 140534784, "step": 2031 }, { "epoch": 126.9375, "loss": 0.0006154632428660989, "loss_ce": 3.562926212907769e-05, "loss_xval": 0.000579833984375, "num_input_tokens_seen": 140534784, "step": 2031 }, { "epoch": 127.0, "grad_norm": 0.5766126788840545, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 140606336, "step": 2032 }, { "epoch": 127.0, "loss": 0.00019185608834959567, "loss_ce": 3.831452704616822e-05, "loss_xval": 0.00015354156494140625, "num_input_tokens_seen": 140606336, "step": 2032 }, { "epoch": 127.0625, "grad_norm": 0.2498634473100581, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 140665408, "step": 2033 }, { "epoch": 127.0625, "loss": 9.204051457345486e-05, "loss_ce": 3.6012152122566476e-05, "loss_xval": 5.602836608886719e-05, "num_input_tokens_seen": 140665408, "step": 2033 }, { "epoch": 127.125, "grad_norm": 0.6422050609056498, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 140736960, "step": 2034 }, { "epoch": 127.125, "loss": 0.00023144778970163316, "loss_ce": 3.499088415992446e-05, "loss_xval": 0.0001964569091796875, "num_input_tokens_seen": 140736960, "step": 2034 }, { "epoch": 127.1875, "grad_norm": 0.5595138398678299, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 140808576, "step": 2035 }, { "epoch": 127.1875, "loss": 0.00017433673201594502, "loss_ce": 3.3192936825798824e-05, "loss_xval": 0.000141143798828125, "num_input_tokens_seen": 140808576, "step": 2035 }, { "epoch": 127.25, "grad_norm": 0.22115814221362357, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 140880256, "step": 2036 }, { "epoch": 127.25, "loss": 6.543470954056829e-05, "loss_ce": 3.575160008040257e-05, "loss_xval": 2.968311309814453e-05, "num_input_tokens_seen": 140880256, "step": 2036 }, { "epoch": 127.3125, "grad_norm": 0.20004183815073995, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 140939456, "step": 2037 }, { "epoch": 127.3125, "loss": 0.0001192227064166218, "loss_ce": 3.291518078185618e-05, "loss_xval": 8.630752563476562e-05, "num_input_tokens_seen": 140939456, "step": 2037 }, { "epoch": 127.375, "grad_norm": 0.627095105597688, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 140985984, "step": 2038 }, { "epoch": 127.375, "loss": 0.00016552691522520036, "loss_ce": 3.487353751552291e-05, "loss_xval": 0.00013065338134765625, "num_input_tokens_seen": 140985984, "step": 2038 }, { "epoch": 127.4375, "grad_norm": 0.8872909986246116, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 141057536, "step": 2039 }, { "epoch": 127.4375, "loss": 0.00025953692966140807, "loss_ce": 3.1608771678293124e-05, "loss_xval": 0.00022792816162109375, "num_input_tokens_seen": 141057536, "step": 2039 }, { "epoch": 127.5, "grad_norm": 0.7824631232935765, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 141129216, "step": 2040 }, { "epoch": 127.5, "loss": 0.00019397796131670475, "loss_ce": 3.376067979843356e-05, "loss_xval": 0.00016021728515625, "num_input_tokens_seen": 141129216, "step": 2040 }, { "epoch": 127.5625, "grad_norm": 0.3417990588828484, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 141200768, "step": 2041 }, { "epoch": 127.5625, "loss": 9.184810187434778e-05, "loss_ce": 3.12897827825509e-05, "loss_xval": 6.0558319091796875e-05, "num_input_tokens_seen": 141200768, "step": 2041 }, { "epoch": 127.625, "grad_norm": 0.06222981770637875, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 141272384, "step": 2042 }, { "epoch": 127.625, "loss": 9.499691077508032e-05, "loss_ce": 3.205440953024663e-05, "loss_xval": 6.29425048828125e-05, "num_input_tokens_seen": 141272384, "step": 2042 }, { "epoch": 127.6875, "grad_norm": 0.11726662055733453, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 141344192, "step": 2043 }, { "epoch": 127.6875, "loss": 8.967838220996782e-05, "loss_ce": 3.1742667488288134e-05, "loss_xval": 5.793571472167969e-05, "num_input_tokens_seen": 141344192, "step": 2043 }, { "epoch": 127.75, "grad_norm": 0.15529934399109852, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 141415808, "step": 2044 }, { "epoch": 127.75, "loss": 9.700076043372974e-05, "loss_ce": 2.881304499169346e-05, "loss_xval": 6.818771362304688e-05, "num_input_tokens_seen": 141415808, "step": 2044 }, { "epoch": 127.8125, "grad_norm": 0.10366291324035277, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 141487360, "step": 2045 }, { "epoch": 127.8125, "loss": 7.866997475503013e-05, "loss_ce": 3.0271003197412938e-05, "loss_xval": 4.839897155761719e-05, "num_input_tokens_seen": 141487360, "step": 2045 }, { "epoch": 127.875, "grad_norm": 0.18112455961803986, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 141558912, "step": 2046 }, { "epoch": 127.875, "loss": 0.00012426248576957732, "loss_ce": 3.127923991996795e-05, "loss_xval": 9.298324584960938e-05, "num_input_tokens_seen": 141558912, "step": 2046 }, { "epoch": 127.9375, "grad_norm": 0.45311283789773527, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 141630656, "step": 2047 }, { "epoch": 127.9375, "loss": 0.00011793743760790676, "loss_ce": 3.067623401875608e-05, "loss_xval": 8.726119995117188e-05, "num_input_tokens_seen": 141630656, "step": 2047 }, { "epoch": 128.0, "grad_norm": 0.7398344509970638, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 141702400, "step": 2048 }, { "epoch": 128.0, "loss": 0.00020481945830397308, "loss_ce": 3.0297051125671715e-05, "loss_xval": 0.00017452239990234375, "num_input_tokens_seen": 141702400, "step": 2048 }, { "epoch": 128.0625, "grad_norm": 0.9385140841705101, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 141773952, "step": 2049 }, { "epoch": 128.0625, "loss": 0.00026439601788297296, "loss_ce": 2.9792136047035456e-05, "loss_xval": 0.0002346038818359375, "num_input_tokens_seen": 141773952, "step": 2049 }, { "epoch": 128.125, "grad_norm": 1.1834843798666885, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 141845632, "step": 2050 }, { "epoch": 128.125, "loss": 0.0004072985320817679, "loss_ce": 2.9643508241861127e-05, "loss_xval": 0.000377655029296875, "num_input_tokens_seen": 141845632, "step": 2050 }, { "epoch": 128.1875, "grad_norm": 1.6238155697563705, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 141917376, "step": 2051 }, { "epoch": 128.1875, "loss": 0.0006558331078849733, "loss_ce": 3.0222730856621638e-05, "loss_xval": 0.0006256103515625, "num_input_tokens_seen": 141917376, "step": 2051 }, { "epoch": 128.25, "grad_norm": 2.281162313109408, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 141989184, "step": 2052 }, { "epoch": 128.25, "loss": 0.001286553917452693, "loss_ce": 2.7703841624315828e-05, "loss_xval": 0.00125885009765625, "num_input_tokens_seen": 141989184, "step": 2052 }, { "epoch": 128.3125, "grad_norm": 3.1514891072102236, "learning_rate": 5e-05, "loss": 0.0024, "num_input_tokens_seen": 142060800, "step": 2053 }, { "epoch": 128.3125, "loss": 0.002576857805252075, "loss_ce": 2.864013185899239e-05, "loss_xval": 0.0025482177734375, "num_input_tokens_seen": 142060800, "step": 2053 }, { "epoch": 128.375, "grad_norm": 4.284793735662272, "learning_rate": 5e-05, "loss": 0.0044, "num_input_tokens_seen": 142132416, "step": 2054 }, { "epoch": 128.375, "loss": 0.004516967572271824, "loss_ce": 3.088352968916297e-05, "loss_xval": 0.004486083984375, "num_input_tokens_seen": 142132416, "step": 2054 }, { "epoch": 128.4375, "grad_norm": 5.376791980859641, "learning_rate": 5e-05, "loss": 0.0069, "num_input_tokens_seen": 142204224, "step": 2055 }, { "epoch": 128.4375, "loss": 0.007232699543237686, "loss_ce": 3.055127672269009e-05, "loss_xval": 0.0072021484375, "num_input_tokens_seen": 142204224, "step": 2055 }, { "epoch": 128.5, "grad_norm": 6.1098489395843805, "learning_rate": 5e-05, "loss": 0.0091, "num_input_tokens_seen": 142275904, "step": 2056 }, { "epoch": 128.5, "loss": 0.00888771377503872, "loss_ce": 3.7616420740960166e-05, "loss_xval": 0.00885009765625, "num_input_tokens_seen": 142275904, "step": 2056 }, { "epoch": 128.5625, "grad_norm": 5.633904203121933, "learning_rate": 5e-05, "loss": 0.0078, "num_input_tokens_seen": 142335040, "step": 2057 }, { "epoch": 128.5625, "loss": 0.007855950854718685, "loss_ce": 4.3451062083477154e-05, "loss_xval": 0.0078125, "num_input_tokens_seen": 142335040, "step": 2057 }, { "epoch": 128.625, "grad_norm": 3.200725435628059, "learning_rate": 5e-05, "loss": 0.0027, "num_input_tokens_seen": 142406656, "step": 2058 }, { "epoch": 128.625, "loss": 0.0025145290419459343, "loss_ce": 5.7864039263222367e-05, "loss_xval": 0.0024566650390625, "num_input_tokens_seen": 142406656, "step": 2058 }, { "epoch": 128.6875, "grad_norm": 0.34371424886412993, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 142478208, "step": 2059 }, { "epoch": 128.6875, "loss": 0.0002074497169815004, "loss_ce": 7.393530540866777e-05, "loss_xval": 0.000133514404296875, "num_input_tokens_seen": 142478208, "step": 2059 }, { "epoch": 128.75, "grad_norm": 3.282261970447706, "learning_rate": 5e-05, "loss": 0.003, "num_input_tokens_seen": 142549760, "step": 2060 }, { "epoch": 128.75, "loss": 0.002972968854010105, "loss_ce": 8.905776485335082e-05, "loss_xval": 0.0028839111328125, "num_input_tokens_seen": 142549760, "step": 2060 }, { "epoch": 128.8125, "grad_norm": 3.694021790210018, "learning_rate": 5e-05, "loss": 0.0037, "num_input_tokens_seen": 142621376, "step": 2061 }, { "epoch": 128.8125, "loss": 0.003910902887582779, "loss_ce": 0.00011146435281261802, "loss_xval": 0.0037994384765625, "num_input_tokens_seen": 142621376, "step": 2061 }, { "epoch": 128.875, "grad_norm": 1.7190069916683812, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 142680512, "step": 2062 }, { "epoch": 128.875, "loss": 0.00120463315397501, "loss_ce": 0.00013651789049617946, "loss_xval": 0.001068115234375, "num_input_tokens_seen": 142680512, "step": 2062 }, { "epoch": 128.9375, "grad_norm": 0.9999859186357049, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 142752192, "step": 2063 }, { "epoch": 128.9375, "loss": 0.0004924352397210896, "loss_ce": 0.00016055656305979937, "loss_xval": 0.000331878662109375, "num_input_tokens_seen": 142752192, "step": 2063 }, { "epoch": 129.0, "grad_norm": 2.440647575670156, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 142811136, "step": 2064 }, { "epoch": 129.0, "loss": 0.0016980627551674843, "loss_ce": 0.00018744260887615383, "loss_xval": 0.0015106201171875, "num_input_tokens_seen": 142811136, "step": 2064 }, { "epoch": 129.0625, "grad_norm": 1.5520947196070725, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 142882752, "step": 2065 }, { "epoch": 129.0625, "loss": 0.0009497711434960365, "loss_ce": 0.00021353457123041153, "loss_xval": 0.000736236572265625, "num_input_tokens_seen": 142882752, "step": 2065 }, { "epoch": 129.125, "grad_norm": 0.6450613769787013, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 142954432, "step": 2066 }, { "epoch": 129.125, "loss": 0.00037603528471663594, "loss_ce": 0.00021677168842870742, "loss_xval": 0.00015926361083984375, "num_input_tokens_seen": 142954432, "step": 2066 }, { "epoch": 129.1875, "grad_norm": 2.277810195811712, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 143026048, "step": 2067 }, { "epoch": 129.1875, "loss": 0.001686818664893508, "loss_ce": 0.00021434557856991887, "loss_xval": 0.00147247314453125, "num_input_tokens_seen": 143026048, "step": 2067 }, { "epoch": 129.25, "grad_norm": 2.140687373653537, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 143085056, "step": 2068 }, { "epoch": 129.25, "loss": 0.0012608192628249526, "loss_ce": 0.0002308510447619483, "loss_xval": 0.00102996826171875, "num_input_tokens_seen": 143085056, "step": 2068 }, { "epoch": 129.3125, "grad_norm": 0.5652085826944148, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 143156800, "step": 2069 }, { "epoch": 129.3125, "loss": 0.0005040373071096838, "loss_ce": 0.00022937910398468375, "loss_xval": 0.000274658203125, "num_input_tokens_seen": 143156800, "step": 2069 }, { "epoch": 129.375, "grad_norm": 1.2559795587036462, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 143228352, "step": 2070 }, { "epoch": 129.375, "loss": 0.000903485284652561, "loss_ce": 0.00022065445955377072, "loss_xval": 0.000682830810546875, "num_input_tokens_seen": 143228352, "step": 2070 }, { "epoch": 129.4375, "grad_norm": 2.088837487238198, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 143287360, "step": 2071 }, { "epoch": 129.4375, "loss": 0.0016689603216946125, "loss_ce": 0.00022700471163261682, "loss_xval": 0.00144195556640625, "num_input_tokens_seen": 143287360, "step": 2071 }, { "epoch": 129.5, "grad_norm": 1.6954721292442427, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 143359168, "step": 2072 }, { "epoch": 129.5, "loss": 0.0010234940564259887, "loss_ce": 0.000226222284254618, "loss_xval": 0.000797271728515625, "num_input_tokens_seen": 143359168, "step": 2072 }, { "epoch": 129.5625, "grad_norm": 0.39508711281801384, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 143430976, "step": 2073 }, { "epoch": 129.5625, "loss": 0.0003998870379291475, "loss_ce": 0.00019961541693191975, "loss_xval": 0.0002002716064453125, "num_input_tokens_seen": 143430976, "step": 2073 }, { "epoch": 129.625, "grad_norm": 0.9832453768116473, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 143490112, "step": 2074 }, { "epoch": 129.625, "loss": 0.000548589276149869, "loss_ce": 0.00017284158093389124, "loss_xval": 0.0003757476806640625, "num_input_tokens_seen": 143490112, "step": 2074 }, { "epoch": 129.6875, "grad_norm": 1.47372069184567, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 143549120, "step": 2075 }, { "epoch": 129.6875, "loss": 0.0007509666029363871, "loss_ce": 0.00017113260400947183, "loss_xval": 0.000579833984375, "num_input_tokens_seen": 143549120, "step": 2075 }, { "epoch": 129.75, "grad_norm": 0.8547622412274952, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 143595712, "step": 2076 }, { "epoch": 129.75, "loss": 0.00036665465449914336, "loss_ce": 0.00014254120469558984, "loss_xval": 0.00022411346435546875, "num_input_tokens_seen": 143595712, "step": 2076 }, { "epoch": 129.8125, "grad_norm": 0.44202400008651155, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 143667328, "step": 2077 }, { "epoch": 129.8125, "loss": 0.00024111934180837125, "loss_ce": 0.00013478465552907437, "loss_xval": 0.00010633468627929688, "num_input_tokens_seen": 143667328, "step": 2077 }, { "epoch": 129.875, "grad_norm": 1.6873056781973697, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 143726464, "step": 2078 }, { "epoch": 129.875, "loss": 0.001029672333970666, "loss_ce": 0.00012177436292404309, "loss_xval": 0.00090789794921875, "num_input_tokens_seen": 143726464, "step": 2078 }, { "epoch": 129.9375, "grad_norm": 2.158001366610662, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 143798080, "step": 2079 }, { "epoch": 129.9375, "loss": 0.0013578996295109391, "loss_ce": 0.00011430833546910435, "loss_xval": 0.00124359130859375, "num_input_tokens_seen": 143798080, "step": 2079 }, { "epoch": 130.0, "grad_norm": 1.5856936274924123, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 143869760, "step": 2080 }, { "epoch": 130.0, "loss": 0.0008600328583270311, "loss_ce": 0.00010090813157148659, "loss_xval": 0.000759124755859375, "num_input_tokens_seen": 143869760, "step": 2080 }, { "epoch": 130.0625, "grad_norm": 0.4768186065708803, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 143941440, "step": 2081 }, { "epoch": 130.0625, "loss": 0.00021693602320738137, "loss_ce": 9.295836207456887e-05, "loss_xval": 0.0001239776611328125, "num_input_tokens_seen": 143941440, "step": 2081 }, { "epoch": 130.125, "grad_norm": 0.7141073631281202, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 144013184, "step": 2082 }, { "epoch": 130.125, "loss": 0.0002918837999459356, "loss_ce": 8.684382191859186e-05, "loss_xval": 0.00020503997802734375, "num_input_tokens_seen": 144013184, "step": 2082 }, { "epoch": 130.1875, "grad_norm": 1.6396061250062675, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 144084736, "step": 2083 }, { "epoch": 130.1875, "loss": 0.0009414113592356443, "loss_ce": 8.691917901160195e-05, "loss_xval": 0.0008544921875, "num_input_tokens_seen": 144084736, "step": 2083 }, { "epoch": 130.25, "grad_norm": 2.4179746537506874, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 144156352, "step": 2084 }, { "epoch": 130.25, "loss": 0.0014471329050138593, "loss_ce": 7.384183845715597e-05, "loss_xval": 0.001373291015625, "num_input_tokens_seen": 144156352, "step": 2084 }, { "epoch": 130.3125, "grad_norm": 3.316065081632679, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 144227904, "step": 2085 }, { "epoch": 130.3125, "loss": 0.002950825961306691, "loss_ce": 6.691478483844548e-05, "loss_xval": 0.0028839111328125, "num_input_tokens_seen": 144227904, "step": 2085 }, { "epoch": 130.375, "grad_norm": 4.422820340803834, "learning_rate": 5e-05, "loss": 0.0051, "num_input_tokens_seen": 144299584, "step": 2086 }, { "epoch": 130.375, "loss": 0.005191377829760313, "loss_ce": 6.442474841605872e-05, "loss_xval": 0.005126953125, "num_input_tokens_seen": 144299584, "step": 2086 }, { "epoch": 130.4375, "grad_norm": 5.654507138086931, "learning_rate": 5e-05, "loss": 0.0083, "num_input_tokens_seen": 144371328, "step": 2087 }, { "epoch": 130.4375, "loss": 0.008368929848074913, "loss_ce": 6.814829976065084e-05, "loss_xval": 0.00830078125, "num_input_tokens_seen": 144371328, "step": 2087 }, { "epoch": 130.5, "grad_norm": 6.5448532185426105, "learning_rate": 5e-05, "loss": 0.011, "num_input_tokens_seen": 144430464, "step": 2088 }, { "epoch": 130.5, "loss": 0.010691306553781033, "loss_ce": 7.118981739040464e-05, "loss_xval": 0.0106201171875, "num_input_tokens_seen": 144430464, "step": 2088 }, { "epoch": 130.5625, "grad_norm": 6.274424619511378, "learning_rate": 5e-05, "loss": 0.0103, "num_input_tokens_seen": 144502080, "step": 2089 }, { "epoch": 130.5625, "loss": 0.010163553059101105, "loss_ce": 9.275234333472326e-05, "loss_xval": 0.01007080078125, "num_input_tokens_seen": 144502080, "step": 2089 }, { "epoch": 130.625, "grad_norm": 4.177287100411339, "learning_rate": 5e-05, "loss": 0.0047, "num_input_tokens_seen": 144561152, "step": 2090 }, { "epoch": 130.625, "loss": 0.004846962168812752, "loss_ce": 0.00011673758126562461, "loss_xval": 0.004730224609375, "num_input_tokens_seen": 144561152, "step": 2090 }, { "epoch": 130.6875, "grad_norm": 0.49295514443893185, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 144620288, "step": 2091 }, { "epoch": 130.6875, "loss": 0.00044583145063370466, "loss_ce": 0.00014256300346460193, "loss_xval": 0.0003032684326171875, "num_input_tokens_seen": 144620288, "step": 2091 }, { "epoch": 130.75, "grad_norm": 3.0607890432683558, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 144691904, "step": 2092 }, { "epoch": 130.75, "loss": 0.0030132883694022894, "loss_ce": 0.00017515364743303508, "loss_xval": 0.002838134765625, "num_input_tokens_seen": 144691904, "step": 2092 }, { "epoch": 130.8125, "grad_norm": 5.011675862355414, "learning_rate": 5e-05, "loss": 0.007, "num_input_tokens_seen": 144763456, "step": 2093 }, { "epoch": 130.8125, "loss": 0.007125661708414555, "loss_ce": 0.0002286891103722155, "loss_xval": 0.00689697265625, "num_input_tokens_seen": 144763456, "step": 2093 }, { "epoch": 130.875, "grad_norm": 4.665674019487356, "learning_rate": 5e-05, "loss": 0.0061, "num_input_tokens_seen": 144835072, "step": 2094 }, { "epoch": 130.875, "loss": 0.006237682420760393, "loss_ce": 0.00022571970475837588, "loss_xval": 0.006011962890625, "num_input_tokens_seen": 144835072, "step": 2094 }, { "epoch": 130.9375, "grad_norm": 2.327435721813015, "learning_rate": 5e-05, "loss": 0.0021, "num_input_tokens_seen": 144906752, "step": 2095 }, { "epoch": 130.9375, "loss": 0.001996598206460476, "loss_ce": 0.00027235501329414546, "loss_xval": 0.0017242431640625, "num_input_tokens_seen": 144906752, "step": 2095 }, { "epoch": 131.0, "grad_norm": 0.7726657536137062, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 144953280, "step": 2096 }, { "epoch": 131.0, "loss": 0.0011610279325395823, "loss_ce": 0.00027983286418020725, "loss_xval": 0.000881195068359375, "num_input_tokens_seen": 144953280, "step": 2096 }, { "epoch": 131.0625, "grad_norm": 3.181122525381395, "learning_rate": 5e-05, "loss": 0.0032, "num_input_tokens_seen": 145024832, "step": 2097 }, { "epoch": 131.0625, "loss": 0.0032270171213895082, "loss_ce": 0.00028207077411934733, "loss_xval": 0.0029449462890625, "num_input_tokens_seen": 145024832, "step": 2097 }, { "epoch": 131.125, "grad_norm": 3.6210500111995345, "learning_rate": 5e-05, "loss": 0.004, "num_input_tokens_seen": 145096448, "step": 2098 }, { "epoch": 131.125, "loss": 0.003765873610973358, "loss_ce": 0.00028686958830803633, "loss_xval": 0.00347900390625, "num_input_tokens_seen": 145096448, "step": 2098 }, { "epoch": 131.1875, "grad_norm": 2.1818267846684427, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 145168128, "step": 2099 }, { "epoch": 131.1875, "loss": 0.0021016704849898815, "loss_ce": 0.0002782451338134706, "loss_xval": 0.00182342529296875, "num_input_tokens_seen": 145168128, "step": 2099 }, { "epoch": 131.25, "grad_norm": 0.09386964420034875, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 145239744, "step": 2100 }, { "epoch": 131.25, "loss": 0.0003963925701100379, "loss_ce": 0.00025048039969988167, "loss_xval": 0.00014591217041015625, "num_input_tokens_seen": 145239744, "step": 2100 }, { "epoch": 131.3125, "grad_norm": 2.1667057386452147, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 145298816, "step": 2101 }, { "epoch": 131.3125, "loss": 0.001640781294554472, "loss_ce": 0.00023697275901213288, "loss_xval": 0.00140380859375, "num_input_tokens_seen": 145298816, "step": 2101 }, { "epoch": 131.375, "grad_norm": 3.06446925765169, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 145370560, "step": 2102 }, { "epoch": 131.375, "loss": 0.002880053361877799, "loss_ce": 0.00022502410865854472, "loss_xval": 0.002655029296875, "num_input_tokens_seen": 145370560, "step": 2102 }, { "epoch": 131.4375, "grad_norm": 2.4814137009230643, "learning_rate": 5e-05, "loss": 0.002, "num_input_tokens_seen": 145442240, "step": 2103 }, { "epoch": 131.4375, "loss": 0.00198269821703434, "loss_ce": 0.0001897904003271833, "loss_xval": 0.00179290771484375, "num_input_tokens_seen": 145442240, "step": 2103 }, { "epoch": 131.5, "grad_norm": 0.8091662677058721, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 145513856, "step": 2104 }, { "epoch": 131.5, "loss": 0.000518153072334826, "loss_ce": 0.00016910828708205372, "loss_xval": 0.0003490447998046875, "num_input_tokens_seen": 145513856, "step": 2104 }, { "epoch": 131.5625, "grad_norm": 0.9711896831709398, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 145585408, "step": 2105 }, { "epoch": 131.5625, "loss": 0.000512247032020241, "loss_ce": 0.0001593875203980133, "loss_xval": 0.0003528594970703125, "num_input_tokens_seen": 145585408, "step": 2105 }, { "epoch": 131.625, "grad_norm": 2.0220649876272074, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 145657152, "step": 2106 }, { "epoch": 131.625, "loss": 0.0014408582355827093, "loss_ce": 0.00013623179984278977, "loss_xval": 0.00130462646484375, "num_input_tokens_seen": 145657152, "step": 2106 }, { "epoch": 131.6875, "grad_norm": 2.1693323682100005, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 145728704, "step": 2107 }, { "epoch": 131.6875, "loss": 0.0013570053270086646, "loss_ce": 0.00013630217290483415, "loss_xval": 0.001220703125, "num_input_tokens_seen": 145728704, "step": 2107 }, { "epoch": 131.75, "grad_norm": 1.9971593096020999, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 145800384, "step": 2108 }, { "epoch": 131.75, "loss": 0.0013030003756284714, "loss_ce": 0.0001128148433053866, "loss_xval": 0.001190185546875, "num_input_tokens_seen": 145800384, "step": 2108 }, { "epoch": 131.8125, "grad_norm": 1.9509462926255905, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 145872064, "step": 2109 }, { "epoch": 131.8125, "loss": 0.0012032217346131802, "loss_ce": 0.00011221828026464209, "loss_xval": 0.00109100341796875, "num_input_tokens_seen": 145872064, "step": 2109 }, { "epoch": 131.875, "grad_norm": 2.164767534194291, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 145943808, "step": 2110 }, { "epoch": 131.875, "loss": 0.0012747022556141019, "loss_ce": 9.977544686989859e-05, "loss_xval": 0.0011749267578125, "num_input_tokens_seen": 145943808, "step": 2110 }, { "epoch": 131.9375, "grad_norm": 2.577591041548048, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 146015360, "step": 2111 }, { "epoch": 131.9375, "loss": 0.0018587567610666156, "loss_ce": 9.636659524403512e-05, "loss_xval": 0.00176239013671875, "num_input_tokens_seen": 146015360, "step": 2111 }, { "epoch": 132.0, "grad_norm": 2.897437033309384, "learning_rate": 5e-05, "loss": 0.0023, "num_input_tokens_seen": 146086976, "step": 2112 }, { "epoch": 132.0, "loss": 0.002194196917116642, "loss_ce": 8.848396828398108e-05, "loss_xval": 0.002105712890625, "num_input_tokens_seen": 146086976, "step": 2112 }, { "epoch": 132.0625, "grad_norm": 3.1496740804235337, "learning_rate": 5e-05, "loss": 0.0028, "num_input_tokens_seen": 146158592, "step": 2113 }, { "epoch": 132.0625, "loss": 0.0026031185407191515, "loss_ce": 0.00010067722178064287, "loss_xval": 0.00250244140625, "num_input_tokens_seen": 146158592, "step": 2113 }, { "epoch": 132.125, "grad_norm": 3.213446574083063, "learning_rate": 5e-05, "loss": 0.0028, "num_input_tokens_seen": 146230144, "step": 2114 }, { "epoch": 132.125, "loss": 0.0028038991149514914, "loss_ce": 8.783464727457613e-05, "loss_xval": 0.002716064453125, "num_input_tokens_seen": 146230144, "step": 2114 }, { "epoch": 132.1875, "grad_norm": 3.317823485630597, "learning_rate": 5e-05, "loss": 0.0031, "num_input_tokens_seen": 146301888, "step": 2115 }, { "epoch": 132.1875, "loss": 0.002983665093779564, "loss_ce": 9.97540118987672e-05, "loss_xval": 0.0028839111328125, "num_input_tokens_seen": 146301888, "step": 2115 }, { "epoch": 132.25, "grad_norm": 3.2938515342402424, "learning_rate": 5e-05, "loss": 0.003, "num_input_tokens_seen": 146373568, "step": 2116 }, { "epoch": 132.25, "loss": 0.0033804993145167828, "loss_ce": 9.985957149183378e-05, "loss_xval": 0.0032806396484375, "num_input_tokens_seen": 146373568, "step": 2116 }, { "epoch": 132.3125, "grad_norm": 2.94541207651058, "learning_rate": 5e-05, "loss": 0.0025, "num_input_tokens_seen": 146445248, "step": 2117 }, { "epoch": 132.3125, "loss": 0.0024823653511703014, "loss_ce": 0.00011725301010301337, "loss_xval": 0.0023651123046875, "num_input_tokens_seen": 146445248, "step": 2117 }, { "epoch": 132.375, "grad_norm": 2.4504378031553036, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 146516928, "step": 2118 }, { "epoch": 132.375, "loss": 0.0016645942814648151, "loss_ce": 0.00012345658615231514, "loss_xval": 0.0015411376953125, "num_input_tokens_seen": 146516928, "step": 2118 }, { "epoch": 132.4375, "grad_norm": 1.5718108744843646, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 146576128, "step": 2119 }, { "epoch": 132.4375, "loss": 0.0009304281556978822, "loss_ce": 0.00013315639807842672, "loss_xval": 0.000797271728515625, "num_input_tokens_seen": 146576128, "step": 2119 }, { "epoch": 132.5, "grad_norm": 0.4789646132744658, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 146647936, "step": 2120 }, { "epoch": 132.5, "loss": 0.0004606024594977498, "loss_ce": 0.00014970461779739708, "loss_xval": 0.0003108978271484375, "num_input_tokens_seen": 146647936, "step": 2120 }, { "epoch": 132.5625, "grad_norm": 0.6269730419291779, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 146707136, "step": 2121 }, { "epoch": 132.5625, "loss": 0.0003991887788288295, "loss_ce": 0.0001493260933784768, "loss_xval": 0.0002498626708984375, "num_input_tokens_seen": 146707136, "step": 2121 }, { "epoch": 132.625, "grad_norm": 1.7977778352649605, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 146778816, "step": 2122 }, { "epoch": 132.625, "loss": 0.0011801546206697822, "loss_ce": 0.0001425569789716974, "loss_xval": 0.00103759765625, "num_input_tokens_seen": 146778816, "step": 2122 }, { "epoch": 132.6875, "grad_norm": 3.1212230939689753, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 146850432, "step": 2123 }, { "epoch": 132.6875, "loss": 0.0028741902206093073, "loss_ce": 0.00015812575293239206, "loss_xval": 0.002716064453125, "num_input_tokens_seen": 146850432, "step": 2123 }, { "epoch": 132.75, "grad_norm": 4.587900160641939, "learning_rate": 5e-05, "loss": 0.0059, "num_input_tokens_seen": 146922048, "step": 2124 }, { "epoch": 132.75, "loss": 0.005815291311591864, "loss_ce": 0.00013902169303037226, "loss_xval": 0.00567626953125, "num_input_tokens_seen": 146922048, "step": 2124 }, { "epoch": 132.8125, "grad_norm": 5.973389673402774, "learning_rate": 5e-05, "loss": 0.0098, "num_input_tokens_seen": 146993664, "step": 2125 }, { "epoch": 132.8125, "loss": 0.009735052473843098, "loss_ce": 0.00015253292804118246, "loss_xval": 0.00958251953125, "num_input_tokens_seen": 146993664, "step": 2125 }, { "epoch": 132.875, "grad_norm": 6.805012275520879, "learning_rate": 5e-05, "loss": 0.013, "num_input_tokens_seen": 147065344, "step": 2126 }, { "epoch": 132.875, "loss": 0.012403740547597408, "loss_ce": 0.0001356737338937819, "loss_xval": 0.01226806640625, "num_input_tokens_seen": 147065344, "step": 2126 }, { "epoch": 132.9375, "grad_norm": 6.354336216661166, "learning_rate": 5e-05, "loss": 0.0112, "num_input_tokens_seen": 147136960, "step": 2127 }, { "epoch": 132.9375, "loss": 0.010768568143248558, "loss_ce": 0.00014845086843706667, "loss_xval": 0.0106201171875, "num_input_tokens_seen": 147136960, "step": 2127 }, { "epoch": 133.0, "grad_norm": 4.233638122891625, "learning_rate": 5e-05, "loss": 0.0051, "num_input_tokens_seen": 147208576, "step": 2128 }, { "epoch": 133.0, "loss": 0.005150423850864172, "loss_ce": 0.00017605879111215472, "loss_xval": 0.004974365234375, "num_input_tokens_seen": 147208576, "step": 2128 }, { "epoch": 133.0625, "grad_norm": 1.0086102085646704, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 147280192, "step": 2129 }, { "epoch": 133.0625, "loss": 0.0004604866844601929, "loss_ce": 0.0001972725585801527, "loss_xval": 0.000263214111328125, "num_input_tokens_seen": 147280192, "step": 2129 }, { "epoch": 133.125, "grad_norm": 2.2402200838579125, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 147351808, "step": 2130 }, { "epoch": 133.125, "loss": 0.0016773329116404057, "loss_ce": 0.0002048597380053252, "loss_xval": 0.00147247314453125, "num_input_tokens_seen": 147351808, "step": 2130 }, { "epoch": 133.1875, "grad_norm": 4.178564998809066, "learning_rate": 5e-05, "loss": 0.0051, "num_input_tokens_seen": 147411008, "step": 2131 }, { "epoch": 133.1875, "loss": 0.0049562654457986355, "loss_ce": 0.00022604072000831366, "loss_xval": 0.004730224609375, "num_input_tokens_seen": 147411008, "step": 2131 }, { "epoch": 133.25, "grad_norm": 4.097662993556067, "learning_rate": 5e-05, "loss": 0.0052, "num_input_tokens_seen": 147482688, "step": 2132 }, { "epoch": 133.25, "loss": 0.005465524271130562, "loss_ce": 0.00021650105190929025, "loss_xval": 0.0052490234375, "num_input_tokens_seen": 147482688, "step": 2132 }, { "epoch": 133.3125, "grad_norm": 2.351868917270304, "learning_rate": 5e-05, "loss": 0.002, "num_input_tokens_seen": 147554368, "step": 2133 }, { "epoch": 133.3125, "loss": 0.0020998907275497913, "loss_ce": 0.00024594776914454997, "loss_xval": 0.00185394287109375, "num_input_tokens_seen": 147554368, "step": 2133 }, { "epoch": 133.375, "grad_norm": 0.09088466157172155, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 147626048, "step": 2134 }, { "epoch": 133.375, "loss": 0.000461443851236254, "loss_ce": 0.0002697553136385977, "loss_xval": 0.00019168853759765625, "num_input_tokens_seen": 147626048, "step": 2134 }, { "epoch": 133.4375, "grad_norm": 2.034847356599081, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 147697664, "step": 2135 }, { "epoch": 133.4375, "loss": 0.0017401942750439048, "loss_ce": 0.00028297994867898524, "loss_xval": 0.00145721435546875, "num_input_tokens_seen": 147697664, "step": 2135 }, { "epoch": 133.5, "grad_norm": 2.841930809682247, "learning_rate": 5e-05, "loss": 0.0026, "num_input_tokens_seen": 147769344, "step": 2136 }, { "epoch": 133.5, "loss": 0.002519955625757575, "loss_ce": 0.0002769135753624141, "loss_xval": 0.0022430419921875, "num_input_tokens_seen": 147769344, "step": 2136 }, { "epoch": 133.5625, "grad_norm": 2.349514500940194, "learning_rate": 5e-05, "loss": 0.002, "num_input_tokens_seen": 147841024, "step": 2137 }, { "epoch": 133.5625, "loss": 0.002035383600741625, "loss_ce": 0.0002577346167527139, "loss_xval": 0.00177764892578125, "num_input_tokens_seen": 147841024, "step": 2137 }, { "epoch": 133.625, "grad_norm": 0.7448406078444589, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 147912704, "step": 2138 }, { "epoch": 133.625, "loss": 0.0006271991296671331, "loss_ce": 0.0002381000085733831, "loss_xval": 0.00038909912109375, "num_input_tokens_seen": 147912704, "step": 2138 }, { "epoch": 133.6875, "grad_norm": 1.4929476874380203, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 147971776, "step": 2139 }, { "epoch": 133.6875, "loss": 0.0014256933936849236, "loss_ce": 0.00020499022502917796, "loss_xval": 0.001220703125, "num_input_tokens_seen": 147971776, "step": 2139 }, { "epoch": 133.75, "grad_norm": 3.5753967779590026, "learning_rate": 5e-05, "loss": 0.0038, "num_input_tokens_seen": 148043520, "step": 2140 }, { "epoch": 133.75, "loss": 0.0038607781752943993, "loss_ce": 0.00018340993847232312, "loss_xval": 0.0036773681640625, "num_input_tokens_seen": 148043520, "step": 2140 }, { "epoch": 133.8125, "grad_norm": 4.669850830222451, "learning_rate": 5e-05, "loss": 0.0062, "num_input_tokens_seen": 148102592, "step": 2141 }, { "epoch": 133.8125, "loss": 0.006373754236847162, "loss_ce": 0.00017868586292024702, "loss_xval": 0.006195068359375, "num_input_tokens_seen": 148102592, "step": 2141 }, { "epoch": 133.875, "grad_norm": 4.273147287943639, "learning_rate": 5e-05, "loss": 0.0052, "num_input_tokens_seen": 148174208, "step": 2142 }, { "epoch": 133.875, "loss": 0.005067807622253895, "loss_ce": 0.0001544774859212339, "loss_xval": 0.004913330078125, "num_input_tokens_seen": 148174208, "step": 2142 }, { "epoch": 133.9375, "grad_norm": 2.557900459431935, "learning_rate": 5e-05, "loss": 0.0023, "num_input_tokens_seen": 148245824, "step": 2143 }, { "epoch": 133.9375, "loss": 0.002185113960877061, "loss_ce": 0.00015569495735689998, "loss_xval": 0.0020294189453125, "num_input_tokens_seen": 148245824, "step": 2143 }, { "epoch": 134.0, "grad_norm": 0.1244516336618763, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 148317632, "step": 2144 }, { "epoch": 134.0, "loss": 0.00032432418083772063, "loss_ce": 0.0001583848352311179, "loss_xval": 0.0001659393310546875, "num_input_tokens_seen": 148317632, "step": 2144 }, { "epoch": 134.0625, "grad_norm": 2.3715035560448148, "learning_rate": 5e-05, "loss": 0.002, "num_input_tokens_seen": 148389312, "step": 2145 }, { "epoch": 134.0625, "loss": 0.001994270598515868, "loss_ce": 0.00015558644372504205, "loss_xval": 0.00183868408203125, "num_input_tokens_seen": 148389312, "step": 2145 }, { "epoch": 134.125, "grad_norm": 4.106114568684724, "learning_rate": 5e-05, "loss": 0.0051, "num_input_tokens_seen": 148461056, "step": 2146 }, { "epoch": 134.125, "loss": 0.005479307379573584, "loss_ce": 0.00013873119314666837, "loss_xval": 0.005340576171875, "num_input_tokens_seen": 148461056, "step": 2146 }, { "epoch": 134.1875, "grad_norm": 4.820552638251836, "learning_rate": 5e-05, "loss": 0.0066, "num_input_tokens_seen": 148532736, "step": 2147 }, { "epoch": 134.1875, "loss": 0.006738653406500816, "loss_ce": 0.00014685651694890112, "loss_xval": 0.006591796875, "num_input_tokens_seen": 148532736, "step": 2147 }, { "epoch": 134.25, "grad_norm": 4.291935062772253, "learning_rate": 5e-05, "loss": 0.0052, "num_input_tokens_seen": 148604480, "step": 2148 }, { "epoch": 134.25, "loss": 0.005339866038411856, "loss_ce": 0.00015187790268100798, "loss_xval": 0.00518798828125, "num_input_tokens_seen": 148604480, "step": 2148 }, { "epoch": 134.3125, "grad_norm": 2.494981976416252, "learning_rate": 5e-05, "loss": 0.002, "num_input_tokens_seen": 148676160, "step": 2149 }, { "epoch": 134.3125, "loss": 0.0019246542360633612, "loss_ce": 0.00016989343566820025, "loss_xval": 0.0017547607421875, "num_input_tokens_seen": 148676160, "step": 2149 }, { "epoch": 134.375, "grad_norm": 0.22830845799667687, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 148747840, "step": 2150 }, { "epoch": 134.375, "loss": 0.00036656681913882494, "loss_ce": 0.0001806003274396062, "loss_xval": 0.00018596649169921875, "num_input_tokens_seen": 148747840, "step": 2150 }, { "epoch": 134.4375, "grad_norm": 2.847110043325801, "learning_rate": 5e-05, "loss": 0.0025, "num_input_tokens_seen": 148806976, "step": 2151 }, { "epoch": 134.4375, "loss": 0.0023445640690624714, "loss_ce": 0.0001625571894692257, "loss_xval": 0.0021820068359375, "num_input_tokens_seen": 148806976, "step": 2151 }, { "epoch": 134.5, "grad_norm": 4.3579190454001875, "learning_rate": 5e-05, "loss": 0.0055, "num_input_tokens_seen": 148878784, "step": 2152 }, { "epoch": 134.5, "loss": 0.005705356132239103, "loss_ce": 0.000181674535269849, "loss_xval": 0.005523681640625, "num_input_tokens_seen": 148878784, "step": 2152 }, { "epoch": 134.5625, "grad_norm": 4.161126822103258, "learning_rate": 5e-05, "loss": 0.0051, "num_input_tokens_seen": 148950400, "step": 2153 }, { "epoch": 134.5625, "loss": 0.004884625319391489, "loss_ce": 0.00015440078277606517, "loss_xval": 0.004730224609375, "num_input_tokens_seen": 148950400, "step": 2153 }, { "epoch": 134.625, "grad_norm": 2.6864583102223, "learning_rate": 5e-05, "loss": 0.0023, "num_input_tokens_seen": 149022080, "step": 2154 }, { "epoch": 134.625, "loss": 0.002241534413769841, "loss_ce": 0.00016633901395834982, "loss_xval": 0.0020751953125, "num_input_tokens_seen": 149022080, "step": 2154 }, { "epoch": 134.6875, "grad_norm": 0.5679632082483581, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 149081216, "step": 2155 }, { "epoch": 134.6875, "loss": 0.00033710565185174346, "loss_ce": 0.00016639796376693994, "loss_xval": 0.00017070770263671875, "num_input_tokens_seen": 149081216, "step": 2155 }, { "epoch": 134.75, "grad_norm": 1.6203994046512498, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 149140352, "step": 2156 }, { "epoch": 134.75, "loss": 0.001092609716579318, "loss_ce": 0.00015037944831419736, "loss_xval": 0.000942230224609375, "num_input_tokens_seen": 149140352, "step": 2156 }, { "epoch": 134.8125, "grad_norm": 3.2646001056365086, "learning_rate": 5e-05, "loss": 0.0031, "num_input_tokens_seen": 149199552, "step": 2157 }, { "epoch": 134.8125, "loss": 0.003142598317936063, "loss_ce": 0.0001366168726235628, "loss_xval": 0.0030059814453125, "num_input_tokens_seen": 149199552, "step": 2157 }, { "epoch": 134.875, "grad_norm": 3.9872981802680254, "learning_rate": 5e-05, "loss": 0.0045, "num_input_tokens_seen": 149271360, "step": 2158 }, { "epoch": 134.875, "loss": 0.004656321369111538, "loss_ce": 0.00013972001033835113, "loss_xval": 0.0045166015625, "num_input_tokens_seen": 149271360, "step": 2158 }, { "epoch": 134.9375, "grad_norm": 3.9922012140639787, "learning_rate": 5e-05, "loss": 0.0045, "num_input_tokens_seen": 149343168, "step": 2159 }, { "epoch": 134.9375, "loss": 0.004001868888735771, "loss_ce": 0.00012613640865311027, "loss_xval": 0.003875732421875, "num_input_tokens_seen": 149343168, "step": 2159 }, { "epoch": 135.0, "grad_norm": 3.2265108874032564, "learning_rate": 5e-05, "loss": 0.0031, "num_input_tokens_seen": 149402240, "step": 2160 }, { "epoch": 135.0, "loss": 0.0029423029627650976, "loss_ce": 0.00014994456432759762, "loss_xval": 0.0027923583984375, "num_input_tokens_seen": 149402240, "step": 2160 }, { "epoch": 135.0625, "grad_norm": 1.7072210926221831, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 149473792, "step": 2161 }, { "epoch": 135.0625, "loss": 0.0012776124058291316, "loss_ce": 0.00013320324069354683, "loss_xval": 0.0011444091796875, "num_input_tokens_seen": 149473792, "step": 2161 }, { "epoch": 135.125, "grad_norm": 0.1241373477669299, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 149545408, "step": 2162 }, { "epoch": 135.125, "loss": 0.00028155322070233524, "loss_ce": 0.00015185351367108524, "loss_xval": 0.00012969970703125, "num_input_tokens_seen": 149545408, "step": 2162 }, { "epoch": 135.1875, "grad_norm": 1.903590505377728, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 149616960, "step": 2163 }, { "epoch": 135.1875, "loss": 0.0014339193003252149, "loss_ce": 0.0001521810336271301, "loss_xval": 0.00128173828125, "num_input_tokens_seen": 149616960, "step": 2163 }, { "epoch": 135.25, "grad_norm": 3.573030706002393, "learning_rate": 5e-05, "loss": 0.0043, "num_input_tokens_seen": 149688640, "step": 2164 }, { "epoch": 135.25, "loss": 0.003992053214460611, "loss_ce": 0.0001468385016778484, "loss_xval": 0.00384521484375, "num_input_tokens_seen": 149688640, "step": 2164 }, { "epoch": 135.3125, "grad_norm": 4.742522445976156, "learning_rate": 5e-05, "loss": 0.0067, "num_input_tokens_seen": 149760192, "step": 2165 }, { "epoch": 135.3125, "loss": 0.006116765085607767, "loss_ce": 0.00013531999138649553, "loss_xval": 0.0059814453125, "num_input_tokens_seen": 149760192, "step": 2165 }, { "epoch": 135.375, "grad_norm": 4.418326558097359, "learning_rate": 5e-05, "loss": 0.0074, "num_input_tokens_seen": 149819456, "step": 2166 }, { "epoch": 135.375, "loss": 0.007084544748067856, "loss_ce": 0.00012653697922360152, "loss_xval": 0.0069580078125, "num_input_tokens_seen": 149819456, "step": 2166 }, { "epoch": 135.4375, "grad_norm": 2.5258406225652026, "learning_rate": 5e-05, "loss": 0.0027, "num_input_tokens_seen": 149891072, "step": 2167 }, { "epoch": 135.4375, "loss": 0.0028270725160837173, "loss_ce": 0.0001567845029057935, "loss_xval": 0.0026702880859375, "num_input_tokens_seen": 149891072, "step": 2167 }, { "epoch": 135.5, "grad_norm": 0.43194665823243344, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 149950336, "step": 2168 }, { "epoch": 135.5, "loss": 0.0009768784511834383, "loss_ce": 0.00015671852452214807, "loss_xval": 0.000820159912109375, "num_input_tokens_seen": 149950336, "step": 2168 }, { "epoch": 135.5625, "grad_norm": 4.093186764659218, "learning_rate": 5e-05, "loss": 0.0061, "num_input_tokens_seen": 150022144, "step": 2169 }, { "epoch": 135.5625, "loss": 0.00625575752928853, "loss_ce": 0.00015224194794427603, "loss_xval": 0.006103515625, "num_input_tokens_seen": 150022144, "step": 2169 }, { "epoch": 135.625, "grad_norm": 7.19927736392235, "learning_rate": 5e-05, "loss": 0.0146, "num_input_tokens_seen": 150093824, "step": 2170 }, { "epoch": 135.625, "loss": 0.014271882362663746, "loss_ce": 0.00017276135622523725, "loss_xval": 0.01409912109375, "num_input_tokens_seen": 150093824, "step": 2170 }, { "epoch": 135.6875, "grad_norm": 7.1791670497950335, "learning_rate": 5e-05, "loss": 0.0179, "num_input_tokens_seen": 150165504, "step": 2171 }, { "epoch": 135.6875, "loss": 0.016541821882128716, "loss_ce": 0.00018440044368617237, "loss_xval": 0.016357421875, "num_input_tokens_seen": 150165504, "step": 2171 }, { "epoch": 135.75, "grad_norm": 2.6245323142450956, "learning_rate": 5e-05, "loss": 0.004, "num_input_tokens_seen": 150237120, "step": 2172 }, { "epoch": 135.75, "loss": 0.0037419276777654886, "loss_ce": 0.00024766495334915817, "loss_xval": 0.0034942626953125, "num_input_tokens_seen": 150237120, "step": 2172 }, { "epoch": 135.8125, "grad_norm": 2.9451622408931923, "learning_rate": 5e-05, "loss": 0.003, "num_input_tokens_seen": 150308736, "step": 2173 }, { "epoch": 135.8125, "loss": 0.0027326305862516165, "loss_ce": 0.00035225943429395556, "loss_xval": 0.00238037109375, "num_input_tokens_seen": 150308736, "step": 2173 }, { "epoch": 135.875, "grad_norm": 7.313853082086013, "learning_rate": 5e-05, "loss": 0.017, "num_input_tokens_seen": 150367808, "step": 2174 }, { "epoch": 135.875, "loss": 0.01791013963520527, "loss_ce": 0.00045408555888570845, "loss_xval": 0.0174560546875, "num_input_tokens_seen": 150367808, "step": 2174 }, { "epoch": 135.9375, "grad_norm": 9.56986347418184, "learning_rate": 5e-05, "loss": 0.0256, "num_input_tokens_seen": 150439360, "step": 2175 }, { "epoch": 135.9375, "loss": 0.024953246116638184, "loss_ce": 0.0002950435155071318, "loss_xval": 0.024658203125, "num_input_tokens_seen": 150439360, "step": 2175 }, { "epoch": 136.0, "grad_norm": 6.600646933001922, "learning_rate": 5e-05, "loss": 0.013, "num_input_tokens_seen": 150510912, "step": 2176 }, { "epoch": 136.0, "loss": 0.013693580403923988, "loss_ce": 0.0002048104361165315, "loss_xval": 0.01348876953125, "num_input_tokens_seen": 150510912, "step": 2176 }, { "epoch": 136.0625, "grad_norm": 0.6427743574819383, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 150582592, "step": 2177 }, { "epoch": 136.0625, "loss": 0.0012838091934099793, "loss_ce": 0.00023095273354556412, "loss_xval": 0.0010528564453125, "num_input_tokens_seen": 150582592, "step": 2177 }, { "epoch": 136.125, "grad_norm": 7.175273253244792, "learning_rate": 5e-05, "loss": 0.0143, "num_input_tokens_seen": 150654272, "step": 2178 }, { "epoch": 136.125, "loss": 0.01449158787727356, "loss_ce": 0.00027039687847718596, "loss_xval": 0.01422119140625, "num_input_tokens_seen": 150654272, "step": 2178 }, { "epoch": 136.1875, "grad_norm": 9.420888468605963, "learning_rate": 5e-05, "loss": 0.0252, "num_input_tokens_seen": 150725824, "step": 2179 }, { "epoch": 136.1875, "loss": 0.026520689949393272, "loss_ce": 0.00027557314024306834, "loss_xval": 0.0262451171875, "num_input_tokens_seen": 150725824, "step": 2179 }, { "epoch": 136.25, "grad_norm": 4.459816136929259, "learning_rate": 5e-05, "loss": 0.0068, "num_input_tokens_seen": 150797376, "step": 2180 }, { "epoch": 136.25, "loss": 0.005994915496557951, "loss_ce": 0.0005322691868059337, "loss_xval": 0.005462646484375, "num_input_tokens_seen": 150797376, "step": 2180 }, { "epoch": 136.3125, "grad_norm": 0.676377349039084, "learning_rate": 5e-05, "loss": 0.002, "num_input_tokens_seen": 150856448, "step": 2181 }, { "epoch": 136.3125, "loss": 0.0020002590026706457, "loss_ce": 0.0012296901550143957, "loss_xval": 0.00077056884765625, "num_input_tokens_seen": 150856448, "step": 2181 }, { "epoch": 136.375, "grad_norm": 7.099016476439983, "learning_rate": 5e-05, "loss": 0.0182, "num_input_tokens_seen": 150928064, "step": 2182 }, { "epoch": 136.375, "loss": 0.018470434471964836, "loss_ce": 0.0022350833751261234, "loss_xval": 0.0162353515625, "num_input_tokens_seen": 150928064, "step": 2182 }, { "epoch": 136.4375, "grad_norm": 16.96350016913768, "learning_rate": 5e-05, "loss": 0.0817, "num_input_tokens_seen": 150999616, "step": 2183 }, { "epoch": 136.4375, "loss": 0.08347728848457336, "loss_ce": 0.0004694793897215277, "loss_xval": 0.0830078125, "num_input_tokens_seen": 150999616, "step": 2183 }, { "epoch": 136.5, "grad_norm": 12.780990377411227, "learning_rate": 5e-05, "loss": 0.0439, "num_input_tokens_seen": 151071296, "step": 2184 }, { "epoch": 136.5, "loss": 0.04217681288719177, "loss_ce": 0.00018462415027897805, "loss_xval": 0.0419921875, "num_input_tokens_seen": 151071296, "step": 2184 }, { "epoch": 136.5625, "grad_norm": 4.135673579801127, "learning_rate": 5e-05, "loss": 0.0082, "num_input_tokens_seen": 151142912, "step": 2185 }, { "epoch": 136.5625, "loss": 0.008142843842506409, "loss_ce": 0.0002693082788027823, "loss_xval": 0.00787353515625, "num_input_tokens_seen": 151142912, "step": 2185 }, { "epoch": 136.625, "grad_norm": 16.228388083310122, "learning_rate": 5e-05, "loss": 0.0732, "num_input_tokens_seen": 151202112, "step": 2186 }, { "epoch": 136.625, "loss": 0.06874268501996994, "loss_ce": 0.0003833128430414945, "loss_xval": 0.068359375, "num_input_tokens_seen": 151202112, "step": 2186 }, { "epoch": 136.6875, "grad_norm": 16.39062976495596, "learning_rate": 5e-05, "loss": 0.0769, "num_input_tokens_seen": 151261376, "step": 2187 }, { "epoch": 136.6875, "loss": 0.07935876399278641, "loss_ce": 0.0007454807055182755, "loss_xval": 0.07861328125, "num_input_tokens_seen": 151261376, "step": 2187 }, { "epoch": 136.75, "grad_norm": 3.1005737558272437, "learning_rate": 5e-05, "loss": 0.0056, "num_input_tokens_seen": 151333120, "step": 2188 }, { "epoch": 136.75, "loss": 0.00870249792933464, "loss_ce": 0.0007068926934152842, "loss_xval": 0.00799560546875, "num_input_tokens_seen": 151333120, "step": 2188 }, { "epoch": 136.8125, "grad_norm": 13.490271293549755, "learning_rate": 5e-05, "loss": 0.0461, "num_input_tokens_seen": 151404736, "step": 2189 }, { "epoch": 136.8125, "loss": 0.04683200642466545, "loss_ce": 0.000689428299665451, "loss_xval": 0.046142578125, "num_input_tokens_seen": 151404736, "step": 2189 }, { "epoch": 136.875, "grad_norm": 21.577246738555573, "learning_rate": 5e-05, "loss": 0.1245, "num_input_tokens_seen": 151476352, "step": 2190 }, { "epoch": 136.875, "loss": 0.12032702565193176, "loss_ce": 0.0006981170736253262, "loss_xval": 0.11962890625, "num_input_tokens_seen": 151476352, "step": 2190 }, { "epoch": 136.9375, "grad_norm": 6.149390085768126, "learning_rate": 5e-05, "loss": 0.0113, "num_input_tokens_seen": 151548096, "step": 2191 }, { "epoch": 136.9375, "loss": 0.010628869757056236, "loss_ce": 0.0005580692086368799, "loss_xval": 0.01007080078125, "num_input_tokens_seen": 151548096, "step": 2191 }, { "epoch": 137.0, "grad_norm": 13.650308173024213, "learning_rate": 5e-05, "loss": 0.052, "num_input_tokens_seen": 151619648, "step": 2192 }, { "epoch": 137.0, "loss": 0.05068320035934448, "loss_ce": 0.00039023064891807735, "loss_xval": 0.05029296875, "num_input_tokens_seen": 151619648, "step": 2192 }, { "epoch": 137.0625, "grad_norm": 9.617224665229953, "learning_rate": 5e-05, "loss": 0.0281, "num_input_tokens_seen": 151678784, "step": 2193 }, { "epoch": 137.0625, "loss": 0.03268560767173767, "loss_ce": 0.0002149032079614699, "loss_xval": 0.032470703125, "num_input_tokens_seen": 151678784, "step": 2193 }, { "epoch": 137.125, "grad_norm": 7.486163953767819, "learning_rate": 5e-05, "loss": 0.0159, "num_input_tokens_seen": 151750336, "step": 2194 }, { "epoch": 137.125, "loss": 0.016298949718475342, "loss_ce": 0.00030773921753279865, "loss_xval": 0.0159912109375, "num_input_tokens_seen": 151750336, "step": 2194 }, { "epoch": 137.1875, "grad_norm": 10.21288790843152, "learning_rate": 5e-05, "loss": 0.0293, "num_input_tokens_seen": 151821952, "step": 2195 }, { "epoch": 137.1875, "loss": 0.02932918071746826, "loss_ce": 0.0005205870256759226, "loss_xval": 0.02880859375, "num_input_tokens_seen": 151821952, "step": 2195 }, { "epoch": 137.25, "grad_norm": 3.702340244739552, "learning_rate": 5e-05, "loss": 0.0088, "num_input_tokens_seen": 151893568, "step": 2196 }, { "epoch": 137.25, "loss": 0.009085440076887608, "loss_ce": 0.0007846589433029294, "loss_xval": 0.00830078125, "num_input_tokens_seen": 151893568, "step": 2196 }, { "epoch": 137.3125, "grad_norm": 9.37196438277608, "learning_rate": 5e-05, "loss": 0.0257, "num_input_tokens_seen": 151965184, "step": 2197 }, { "epoch": 137.3125, "loss": 0.02497875690460205, "loss_ce": 0.0008088357863016427, "loss_xval": 0.024169921875, "num_input_tokens_seen": 151965184, "step": 2197 }, { "epoch": 137.375, "grad_norm": 0.5164767426303001, "learning_rate": 5e-05, "loss": 0.0025, "num_input_tokens_seen": 152036800, "step": 2198 }, { "epoch": 137.375, "loss": 0.0027048829942941666, "loss_ce": 0.0008814577013254166, "loss_xval": 0.00182342529296875, "num_input_tokens_seen": 152036800, "step": 2198 }, { "epoch": 137.4375, "grad_norm": 7.568909428786799, "learning_rate": 5e-05, "loss": 0.0209, "num_input_tokens_seen": 152108416, "step": 2199 }, { "epoch": 137.4375, "loss": 0.015644783154129982, "loss_ce": 0.0008132405928336084, "loss_xval": 0.01483154296875, "num_input_tokens_seen": 152108416, "step": 2199 }, { "epoch": 137.5, "grad_norm": 0.9429202499800853, "learning_rate": 5e-05, "loss": 0.0027, "num_input_tokens_seen": 152167488, "step": 2200 }, { "epoch": 137.5, "loss": 0.002534096594899893, "loss_ce": 0.0008937768288888037, "loss_xval": 0.00164031982421875, "num_input_tokens_seen": 152167488, "step": 2200 }, { "epoch": 137.5625, "grad_norm": 6.852523911842224, "learning_rate": 5e-05, "loss": 0.0141, "num_input_tokens_seen": 152239104, "step": 2201 }, { "epoch": 137.5625, "loss": 0.013777682557702065, "loss_ce": 0.00096029945416376, "loss_xval": 0.0128173828125, "num_input_tokens_seen": 152239104, "step": 2201 }, { "epoch": 137.625, "grad_norm": 1.6369849402409489, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 152310784, "step": 2202 }, { "epoch": 137.625, "loss": 0.0015844183508306742, "loss_ce": 0.0007680730777792633, "loss_xval": 0.00081634521484375, "num_input_tokens_seen": 152310784, "step": 2202 }, { "epoch": 137.6875, "grad_norm": 6.529793333660272, "learning_rate": 5e-05, "loss": 0.0126, "num_input_tokens_seen": 152369856, "step": 2203 }, { "epoch": 137.6875, "loss": 0.012915344908833504, "loss_ce": 0.0005862429388798773, "loss_xval": 0.0123291015625, "num_input_tokens_seen": 152369856, "step": 2203 }, { "epoch": 137.75, "grad_norm": 1.3590339013771888, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 152441664, "step": 2204 }, { "epoch": 137.75, "loss": 0.001615358516573906, "loss_ce": 0.0005777608603239059, "loss_xval": 0.00103759765625, "num_input_tokens_seen": 152441664, "step": 2204 }, { "epoch": 137.8125, "grad_norm": 6.429611279367293, "learning_rate": 5e-05, "loss": 0.0121, "num_input_tokens_seen": 152500800, "step": 2205 }, { "epoch": 137.8125, "loss": 0.011699453927576542, "loss_ce": 0.00046898474101908505, "loss_xval": 0.01123046875, "num_input_tokens_seen": 152500800, "step": 2205 }, { "epoch": 137.875, "grad_norm": 1.5553170722890897, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 152572352, "step": 2206 }, { "epoch": 137.875, "loss": 0.0012894327519461513, "loss_ce": 0.00032049964647740126, "loss_xval": 0.00096893310546875, "num_input_tokens_seen": 152572352, "step": 2206 }, { "epoch": 137.9375, "grad_norm": 6.308887726050781, "learning_rate": 5e-05, "loss": 0.0124, "num_input_tokens_seen": 152619008, "step": 2207 }, { "epoch": 137.9375, "loss": 0.012343852780759335, "loss_ce": 0.00025889204698614776, "loss_xval": 0.0120849609375, "num_input_tokens_seen": 152619008, "step": 2207 }, { "epoch": 138.0, "grad_norm": 1.514388520820637, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 152690624, "step": 2208 }, { "epoch": 138.0, "loss": 0.0014354572631418705, "loss_ce": 0.00023001297086011618, "loss_xval": 0.0012054443359375, "num_input_tokens_seen": 152690624, "step": 2208 }, { "epoch": 138.0625, "grad_norm": 6.194249119347115, "learning_rate": 5e-05, "loss": 0.011, "num_input_tokens_seen": 152762176, "step": 2209 }, { "epoch": 138.0625, "loss": 0.010418097488582134, "loss_ce": 0.00022522661311086267, "loss_xval": 0.01019287109375, "num_input_tokens_seen": 152762176, "step": 2209 }, { "epoch": 138.125, "grad_norm": 1.1014088449740902, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 152833792, "step": 2210 }, { "epoch": 138.125, "loss": 0.000695666647516191, "loss_ce": 0.00016160900122486055, "loss_xval": 0.0005340576171875, "num_input_tokens_seen": 152833792, "step": 2210 }, { "epoch": 138.1875, "grad_norm": 5.586581449559117, "learning_rate": 5e-05, "loss": 0.009, "num_input_tokens_seen": 152905344, "step": 2211 }, { "epoch": 138.1875, "loss": 0.009005383588373661, "loss_ce": 0.00015528571384493262, "loss_xval": 0.00885009765625, "num_input_tokens_seen": 152905344, "step": 2211 }, { "epoch": 138.25, "grad_norm": 0.6119646356804296, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 152977024, "step": 2212 }, { "epoch": 138.25, "loss": 0.0005884505808353424, "loss_ce": 0.00015166777302511036, "loss_xval": 0.0004367828369140625, "num_input_tokens_seen": 152977024, "step": 2212 }, { "epoch": 138.3125, "grad_norm": 4.730973741739341, "learning_rate": 5e-05, "loss": 0.0065, "num_input_tokens_seen": 153048640, "step": 2213 }, { "epoch": 138.3125, "loss": 0.007002365775406361, "loss_ce": 0.0001359108428005129, "loss_xval": 0.006866455078125, "num_input_tokens_seen": 153048640, "step": 2213 }, { "epoch": 138.375, "grad_norm": 0.49462988250905304, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 153107776, "step": 2214 }, { "epoch": 138.375, "loss": 0.0005698601016774774, "loss_ce": 0.0001292625820497051, "loss_xval": 0.0004405975341796875, "num_input_tokens_seen": 153107776, "step": 2214 }, { "epoch": 138.4375, "grad_norm": 4.070162378919028, "learning_rate": 5e-05, "loss": 0.0052, "num_input_tokens_seen": 153179328, "step": 2215 }, { "epoch": 138.4375, "loss": 0.005405760835856199, "loss_ce": 0.00012622002395801246, "loss_xval": 0.005279541015625, "num_input_tokens_seen": 153179328, "step": 2215 }, { "epoch": 138.5, "grad_norm": 1.2962029399375476, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 153238336, "step": 2216 }, { "epoch": 138.5, "loss": 0.0007740218425169587, "loss_ce": 0.00011789389100158587, "loss_xval": 0.0006561279296875, "num_input_tokens_seen": 153238336, "step": 2216 }, { "epoch": 138.5625, "grad_norm": 3.585443961923812, "learning_rate": 5e-05, "loss": 0.004, "num_input_tokens_seen": 153297472, "step": 2217 }, { "epoch": 138.5625, "loss": 0.00407800730317831, "loss_ce": 0.0001107221978600137, "loss_xval": 0.00396728515625, "num_input_tokens_seen": 153297472, "step": 2217 }, { "epoch": 138.625, "grad_norm": 1.5923921681887396, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 153369152, "step": 2218 }, { "epoch": 138.625, "loss": 0.0010032439604401588, "loss_ce": 0.0001220489211846143, "loss_xval": 0.000881195068359375, "num_input_tokens_seen": 153369152, "step": 2218 }, { "epoch": 138.6875, "grad_norm": 3.071232683319909, "learning_rate": 5e-05, "loss": 0.0031, "num_input_tokens_seen": 153440832, "step": 2219 }, { "epoch": 138.6875, "loss": 0.0028993317391723394, "loss_ce": 0.0001222321152454242, "loss_xval": 0.002777099609375, "num_input_tokens_seen": 153440832, "step": 2219 }, { "epoch": 138.75, "grad_norm": 1.7176997414034458, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 153512576, "step": 2220 }, { "epoch": 138.75, "loss": 0.0010029050754383206, "loss_ce": 0.00011789533164119348, "loss_xval": 0.000885009765625, "num_input_tokens_seen": 153512576, "step": 2220 }, { "epoch": 138.8125, "grad_norm": 2.804938035327702, "learning_rate": 5e-05, "loss": 0.0025, "num_input_tokens_seen": 153571776, "step": 2221 }, { "epoch": 138.8125, "loss": 0.003006437560543418, "loss_ce": 0.00012252634041942656, "loss_xval": 0.0028839111328125, "num_input_tokens_seen": 153571776, "step": 2221 }, { "epoch": 138.875, "grad_norm": 1.4055190533044555, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 153643584, "step": 2222 }, { "epoch": 138.875, "loss": 0.0008755518938414752, "loss_ce": 0.00012405651796143502, "loss_xval": 0.000751495361328125, "num_input_tokens_seen": 153643584, "step": 2222 }, { "epoch": 138.9375, "grad_norm": 2.110967150498914, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 153702720, "step": 2223 }, { "epoch": 138.9375, "loss": 0.0020835991017520428, "loss_ce": 0.00011521533451741561, "loss_xval": 0.0019683837890625, "num_input_tokens_seen": 153702720, "step": 2223 }, { "epoch": 139.0, "grad_norm": 1.4203094135983487, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 153761920, "step": 2224 }, { "epoch": 139.0, "loss": 0.0012334496714174747, "loss_ce": 0.00011955812806263566, "loss_xval": 0.0011138916015625, "num_input_tokens_seen": 153761920, "step": 2224 }, { "epoch": 139.0625, "grad_norm": 1.5418461161142287, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 153833600, "step": 2225 }, { "epoch": 139.0625, "loss": 0.0013548274291679263, "loss_ce": 0.00011123609874630347, "loss_xval": 0.00124359130859375, "num_input_tokens_seen": 153833600, "step": 2225 }, { "epoch": 139.125, "grad_norm": 1.3799738043250926, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 153905216, "step": 2226 }, { "epoch": 139.125, "loss": 0.0008790802676230669, "loss_ce": 0.0001161408144980669, "loss_xval": 0.000762939453125, "num_input_tokens_seen": 153905216, "step": 2226 }, { "epoch": 139.1875, "grad_norm": 1.5584645779359327, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 153976768, "step": 2227 }, { "epoch": 139.1875, "loss": 0.0011807398404926062, "loss_ce": 0.00012025403702864423, "loss_xval": 0.00106048583984375, "num_input_tokens_seen": 153976768, "step": 2227 }, { "epoch": 139.25, "grad_norm": 0.8157982009158394, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 154048448, "step": 2228 }, { "epoch": 139.25, "loss": 0.0007961892988532782, "loss_ce": 0.00011717317102011293, "loss_xval": 0.00067901611328125, "num_input_tokens_seen": 154048448, "step": 2228 }, { "epoch": 139.3125, "grad_norm": 1.4322162772025198, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 154120000, "step": 2229 }, { "epoch": 139.3125, "loss": 0.001034379587508738, "loss_ce": 0.00011503753921715543, "loss_xval": 0.000919342041015625, "num_input_tokens_seen": 154120000, "step": 2229 }, { "epoch": 139.375, "grad_norm": 0.47032266578243526, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 154191616, "step": 2230 }, { "epoch": 139.375, "loss": 0.00034551139106042683, "loss_ce": 0.00011281486513325945, "loss_xval": 0.000232696533203125, "num_input_tokens_seen": 154191616, "step": 2230 }, { "epoch": 139.4375, "grad_norm": 1.2886053491749265, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 154263232, "step": 2231 }, { "epoch": 139.4375, "loss": 0.0007385425269603729, "loss_ce": 0.00010148809087695554, "loss_xval": 0.000637054443359375, "num_input_tokens_seen": 154263232, "step": 2231 }, { "epoch": 139.5, "grad_norm": 0.17049475117003512, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 154334848, "step": 2232 }, { "epoch": 139.5, "loss": 0.0004746349295601249, "loss_ce": 9.507254435447976e-05, "loss_xval": 0.0003795623779296875, "num_input_tokens_seen": 154334848, "step": 2232 }, { "epoch": 139.5625, "grad_norm": 0.2630249658768459, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 154406400, "step": 2233 }, { "epoch": 139.5625, "loss": 0.0004748214269056916, "loss_ce": 9.14443371584639e-05, "loss_xval": 0.0003833770751953125, "num_input_tokens_seen": 154406400, "step": 2233 }, { "epoch": 139.625, "grad_norm": 0.311810875046848, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 154478016, "step": 2234 }, { "epoch": 139.625, "loss": 0.0003131224075332284, "loss_ce": 8.805528341326863e-05, "loss_xval": 0.000225067138671875, "num_input_tokens_seen": 154478016, "step": 2234 }, { "epoch": 139.6875, "grad_norm": 0.3015887855796754, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 154549696, "step": 2235 }, { "epoch": 139.6875, "loss": 0.00023762644559610635, "loss_ce": 8.503855497110635e-05, "loss_xval": 0.000152587890625, "num_input_tokens_seen": 154549696, "step": 2235 }, { "epoch": 139.75, "grad_norm": 0.28683055185678186, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 154621312, "step": 2236 }, { "epoch": 139.75, "loss": 0.0003172550059389323, "loss_ce": 8.169744251063094e-05, "loss_xval": 0.00023555755615234375, "num_input_tokens_seen": 154621312, "step": 2236 }, { "epoch": 139.8125, "grad_norm": 0.44450358647372484, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 154693056, "step": 2237 }, { "epoch": 139.8125, "loss": 0.00025231545441783965, "loss_ce": 7.588571315864101e-05, "loss_xval": 0.00017642974853515625, "num_input_tokens_seen": 154693056, "step": 2237 }, { "epoch": 139.875, "grad_norm": 0.35186947388869055, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 154764672, "step": 2238 }, { "epoch": 139.875, "loss": 0.00023929460439831018, "loss_ce": 7.14479319867678e-05, "loss_xval": 0.0001678466796875, "num_input_tokens_seen": 154764672, "step": 2238 }, { "epoch": 139.9375, "grad_norm": 0.845929231225681, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 154836288, "step": 2239 }, { "epoch": 139.9375, "loss": 0.00032878026831895113, "loss_ce": 6.365880835801363e-05, "loss_xval": 0.0002651214599609375, "num_input_tokens_seen": 154836288, "step": 2239 }, { "epoch": 140.0, "grad_norm": 0.2749791957448786, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 154907968, "step": 2240 }, { "epoch": 140.0, "loss": 0.0003007893683388829, "loss_ce": 6.332447082968429e-05, "loss_xval": 0.00023746490478515625, "num_input_tokens_seen": 154907968, "step": 2240 }, { "epoch": 140.0625, "grad_norm": 0.6196150387867916, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 154967168, "step": 2241 }, { "epoch": 140.0625, "loss": 0.0003579995536711067, "loss_ce": 5.663846604875289e-05, "loss_xval": 0.000301361083984375, "num_input_tokens_seen": 154967168, "step": 2241 }, { "epoch": 140.125, "grad_norm": 0.6804395287117125, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 155038784, "step": 2242 }, { "epoch": 140.125, "loss": 0.00025112146977335215, "loss_ce": 5.180353036848828e-05, "loss_xval": 0.00019931793212890625, "num_input_tokens_seen": 155038784, "step": 2242 }, { "epoch": 140.1875, "grad_norm": 0.13514985352534017, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 155110336, "step": 2243 }, { "epoch": 140.1875, "loss": 0.00020603234588634223, "loss_ce": 4.772240572492592e-05, "loss_xval": 0.0001583099365234375, "num_input_tokens_seen": 155110336, "step": 2243 }, { "epoch": 140.25, "grad_norm": 0.7497696258118558, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 155181888, "step": 2244 }, { "epoch": 140.25, "loss": 0.00037656372296623886, "loss_ce": 4.849975448451005e-05, "loss_xval": 0.00032806396484375, "num_input_tokens_seen": 155181888, "step": 2244 }, { "epoch": 140.3125, "grad_norm": 0.41926920477007684, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 155253504, "step": 2245 }, { "epoch": 140.3125, "loss": 0.0003734802012331784, "loss_ce": 4.732359229819849e-05, "loss_xval": 0.0003261566162109375, "num_input_tokens_seen": 155253504, "step": 2245 }, { "epoch": 140.375, "grad_norm": 0.07113424112628354, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 155325120, "step": 2246 }, { "epoch": 140.375, "loss": 0.00014185221516527236, "loss_ce": 4.3146916141267866e-05, "loss_xval": 9.870529174804688e-05, "num_input_tokens_seen": 155325120, "step": 2246 }, { "epoch": 140.4375, "grad_norm": 0.25364112394243704, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 155396736, "step": 2247 }, { "epoch": 140.4375, "loss": 0.00019017353770323098, "loss_ce": 4.044667730340734e-05, "loss_xval": 0.00014972686767578125, "num_input_tokens_seen": 155396736, "step": 2247 }, { "epoch": 140.5, "grad_norm": 0.8181657596738321, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 155468416, "step": 2248 }, { "epoch": 140.5, "loss": 0.0003228874411433935, "loss_ce": 3.869248030241579e-05, "loss_xval": 0.0002841949462890625, "num_input_tokens_seen": 155468416, "step": 2248 }, { "epoch": 140.5625, "grad_norm": 0.8271386434304645, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 155540160, "step": 2249 }, { "epoch": 140.5625, "loss": 0.0003159397456329316, "loss_ce": 3.9374201151076704e-05, "loss_xval": 0.0002765655517578125, "num_input_tokens_seen": 155540160, "step": 2249 }, { "epoch": 140.625, "grad_norm": 0.11107590119883078, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 155611968, "step": 2250 }, { "epoch": 140.625, "eval_synth_IoU": 0.29707459546625614, "eval_synth_MAE_x": 0.014007568359375, "eval_synth_MAE_y": 0.0155181884765625, "eval_synth_NUM_probability": 0.9995821416378021, "eval_synth_inside_bbox": 0.5, "eval_synth_loss": 0.0002646126667968929, "eval_synth_loss_ce": 3.5015559660678264e-05, "eval_synth_loss_xval": 0.0002295970916748047, "eval_synth_runtime": 62.9843, "eval_synth_samples_per_second": 2.032, "eval_synth_steps_per_second": 0.064, "num_input_tokens_seen": 155611968, "step": 2250 }, { "epoch": 140.625, "loss": 0.0002962798753287643, "loss_ce": 3.49731017195154e-05, "loss_xval": 0.0002613067626953125, "num_input_tokens_seen": 155611968, "step": 2250 }, { "epoch": 140.6875, "grad_norm": 0.6278575183312798, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 155683776, "step": 2251 }, { "epoch": 140.6875, "loss": 0.0002068749745376408, "loss_ce": 3.425991963013075e-05, "loss_xval": 0.00017261505126953125, "num_input_tokens_seen": 155683776, "step": 2251 }, { "epoch": 140.75, "grad_norm": 0.6664386753012557, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 155755392, "step": 2252 }, { "epoch": 140.75, "loss": 0.00036714281304739416, "loss_ce": 3.335680958116427e-05, "loss_xval": 0.0003337860107421875, "num_input_tokens_seen": 155755392, "step": 2252 }, { "epoch": 140.8125, "grad_norm": 0.06576501359392857, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 155826944, "step": 2253 }, { "epoch": 140.8125, "loss": 0.00013523247616831213, "loss_ce": 3.223565363441594e-05, "loss_xval": 0.000102996826171875, "num_input_tokens_seen": 155826944, "step": 2253 }, { "epoch": 140.875, "grad_norm": 0.7849105738929436, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 155898560, "step": 2254 }, { "epoch": 140.875, "loss": 0.0003013724344782531, "loss_ce": 2.8621574529097416e-05, "loss_xval": 0.0002727508544921875, "num_input_tokens_seen": 155898560, "step": 2254 }, { "epoch": 140.9375, "grad_norm": 0.9380444255904111, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 155970112, "step": 2255 }, { "epoch": 140.9375, "loss": 0.0003526804503053427, "loss_ce": 2.8431173632270657e-05, "loss_xval": 0.000324249267578125, "num_input_tokens_seen": 155970112, "step": 2255 }, { "epoch": 141.0, "grad_norm": 0.44775255648791673, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 156041728, "step": 2256 }, { "epoch": 141.0, "loss": 0.00012957978469785303, "loss_ce": 2.7059801141149364e-05, "loss_xval": 0.00010251998901367188, "num_input_tokens_seen": 156041728, "step": 2256 }, { "epoch": 141.0625, "grad_norm": 0.126081080342943, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 156100864, "step": 2257 }, { "epoch": 141.0625, "loss": 8.322107169078663e-05, "loss_ce": 2.647744804562535e-05, "loss_xval": 5.6743621826171875e-05, "num_input_tokens_seen": 156100864, "step": 2257 }, { "epoch": 141.125, "grad_norm": 0.47681506835538257, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 156159872, "step": 2258 }, { "epoch": 141.125, "loss": 0.00012125995272072032, "loss_ce": 2.398517244728282e-05, "loss_xval": 9.72747802734375e-05, "num_input_tokens_seen": 156159872, "step": 2258 }, { "epoch": 141.1875, "grad_norm": 0.4726416575728405, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 156231552, "step": 2259 }, { "epoch": 141.1875, "loss": 0.00021909164206590503, "loss_ce": 2.644942833285313e-05, "loss_xval": 0.0001926422119140625, "num_input_tokens_seen": 156231552, "step": 2259 }, { "epoch": 141.25, "grad_norm": 0.1718543214570315, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 156303296, "step": 2260 }, { "epoch": 141.25, "loss": 0.00012607949611265212, "loss_ce": 2.5943692889995873e-05, "loss_xval": 0.00010013580322265625, "num_input_tokens_seen": 156303296, "step": 2260 }, { "epoch": 141.3125, "grad_norm": 0.191379048394649, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 156362432, "step": 2261 }, { "epoch": 141.3125, "loss": 6.542243500007316e-05, "loss_ce": 2.3460766897187568e-05, "loss_xval": 4.1961669921875e-05, "num_input_tokens_seen": 156362432, "step": 2261 }, { "epoch": 141.375, "grad_norm": 0.5373399154129548, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 156434048, "step": 2262 }, { "epoch": 141.375, "loss": 0.00020634174870792776, "loss_ce": 2.2282611098489724e-05, "loss_xval": 0.00018405914306640625, "num_input_tokens_seen": 156434048, "step": 2262 }, { "epoch": 141.4375, "grad_norm": 0.7771818980975708, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 156505600, "step": 2263 }, { "epoch": 141.4375, "loss": 0.00023935161880217493, "loss_ce": 2.1913880118518136e-05, "loss_xval": 0.000217437744140625, "num_input_tokens_seen": 156505600, "step": 2263 }, { "epoch": 141.5, "grad_norm": 0.652937392279704, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 156577152, "step": 2264 }, { "epoch": 141.5, "loss": 0.00015698786592110991, "loss_ce": 2.347346344322432e-05, "loss_xval": 0.000133514404296875, "num_input_tokens_seen": 156577152, "step": 2264 }, { "epoch": 141.5625, "grad_norm": 0.24034733399568411, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 156648832, "step": 2265 }, { "epoch": 141.5625, "loss": 8.453571354039013e-05, "loss_ce": 2.1116375137353316e-05, "loss_xval": 6.341934204101562e-05, "num_input_tokens_seen": 156648832, "step": 2265 }, { "epoch": 141.625, "grad_norm": 0.05137380518404474, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 156707968, "step": 2266 }, { "epoch": 141.625, "loss": 8.02580761956051e-05, "loss_ce": 1.9938175682909787e-05, "loss_xval": 6.031990051269531e-05, "num_input_tokens_seen": 156707968, "step": 2266 }, { "epoch": 141.6875, "grad_norm": 0.07237913080118438, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 156779584, "step": 2267 }, { "epoch": 141.6875, "loss": 3.4921242331620306e-05, "loss_ce": 2.0020079318783246e-05, "loss_xval": 1.4901161193847656e-05, "num_input_tokens_seen": 156779584, "step": 2267 }, { "epoch": 141.75, "grad_norm": 0.02706912992405218, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 156838592, "step": 2268 }, { "epoch": 141.75, "loss": 8.904261630959809e-05, "loss_ce": 1.8470716895535588e-05, "loss_xval": 7.05718994140625e-05, "num_input_tokens_seen": 156838592, "step": 2268 }, { "epoch": 141.8125, "grad_norm": 0.12553347339518686, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 156910144, "step": 2269 }, { "epoch": 141.8125, "loss": 6.425999163184315e-05, "loss_ce": 1.8960461602546275e-05, "loss_xval": 4.5299530029296875e-05, "num_input_tokens_seen": 156910144, "step": 2269 }, { "epoch": 141.875, "grad_norm": 0.4635155560761945, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 156969344, "step": 2270 }, { "epoch": 141.875, "loss": 0.0001502758968854323, "loss_ce": 1.6761494407546706e-05, "loss_xval": 0.000133514404296875, "num_input_tokens_seen": 156969344, "step": 2270 }, { "epoch": 141.9375, "grad_norm": 0.8880865872714322, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 157041024, "step": 2271 }, { "epoch": 141.9375, "loss": 0.00034242760739289224, "loss_ce": 1.8178337995777838e-05, "loss_xval": 0.000324249267578125, "num_input_tokens_seen": 157041024, "step": 2271 }, { "epoch": 142.0, "grad_norm": 1.02132370067152, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 157112768, "step": 2272 }, { "epoch": 142.0, "loss": 0.00038059926009736955, "loss_ce": 1.629567486816086e-05, "loss_xval": 0.0003643035888671875, "num_input_tokens_seen": 157112768, "step": 2272 }, { "epoch": 142.0625, "grad_norm": 0.754057194678211, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 157184384, "step": 2273 }, { "epoch": 142.0625, "loss": 0.0002003343979595229, "loss_ce": 1.627525307412725e-05, "loss_xval": 0.00018405914306640625, "num_input_tokens_seen": 157184384, "step": 2273 }, { "epoch": 142.125, "grad_norm": 0.270742358060926, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 157256192, "step": 2274 }, { "epoch": 142.125, "loss": 0.00013197364751249552, "loss_ce": 1.7532724086777307e-05, "loss_xval": 0.00011444091796875, "num_input_tokens_seen": 157256192, "step": 2274 }, { "epoch": 142.1875, "grad_norm": 0.3032083858971255, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 157315328, "step": 2275 }, { "epoch": 142.1875, "loss": 8.614677062723786e-05, "loss_ce": 1.5574874851154163e-05, "loss_xval": 7.05718994140625e-05, "num_input_tokens_seen": 157315328, "step": 2275 }, { "epoch": 142.25, "grad_norm": 0.8531261556843668, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 157386880, "step": 2276 }, { "epoch": 142.25, "loss": 0.00036126747727394104, "loss_ce": 1.6037383829825558e-05, "loss_xval": 0.0003452301025390625, "num_input_tokens_seen": 157386880, "step": 2276 }, { "epoch": 142.3125, "grad_norm": 1.2143602578373374, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 157458560, "step": 2277 }, { "epoch": 142.3125, "loss": 0.0003886088670697063, "loss_ce": 1.6675890947226435e-05, "loss_xval": 0.0003719329833984375, "num_input_tokens_seen": 157458560, "step": 2277 }, { "epoch": 142.375, "grad_norm": 1.4565740954306055, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 157530112, "step": 2278 }, { "epoch": 142.375, "loss": 0.0005817589699290693, "loss_ce": 1.7183778254548088e-05, "loss_xval": 0.0005645751953125, "num_input_tokens_seen": 157530112, "step": 2278 }, { "epoch": 142.4375, "grad_norm": 1.7198112420019938, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 157601920, "step": 2279 }, { "epoch": 142.4375, "loss": 0.0009235304314643145, "loss_ce": 1.5632504073437303e-05, "loss_xval": 0.00090789794921875, "num_input_tokens_seen": 157601920, "step": 2279 }, { "epoch": 142.5, "grad_norm": 2.0291451474600204, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 157673664, "step": 2280 }, { "epoch": 142.5, "loss": 0.001222676713950932, "loss_ce": 1.7232428945135325e-05, "loss_xval": 0.0012054443359375, "num_input_tokens_seen": 157673664, "step": 2280 }, { "epoch": 142.5625, "grad_norm": 2.4538085825998506, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 157745280, "step": 2281 }, { "epoch": 142.5625, "loss": 0.0017871184973046184, "loss_ce": 1.7099009710364044e-05, "loss_xval": 0.00177001953125, "num_input_tokens_seen": 157745280, "step": 2281 }, { "epoch": 142.625, "grad_norm": 3.101466685210458, "learning_rate": 5e-05, "loss": 0.0027, "num_input_tokens_seen": 157816960, "step": 2282 }, { "epoch": 142.625, "loss": 0.0027955160476267338, "loss_ce": 1.8416507373331115e-05, "loss_xval": 0.002777099609375, "num_input_tokens_seen": 157816960, "step": 2282 }, { "epoch": 142.6875, "grad_norm": 3.860516054775248, "learning_rate": 5e-05, "loss": 0.0042, "num_input_tokens_seen": 157875968, "step": 2283 }, { "epoch": 142.6875, "loss": 0.004079585429280996, "loss_ce": 2.0747500457218848e-05, "loss_xval": 0.004058837890625, "num_input_tokens_seen": 157875968, "step": 2283 }, { "epoch": 142.75, "grad_norm": 4.391002128654902, "learning_rate": 5e-05, "loss": 0.0054, "num_input_tokens_seen": 157947584, "step": 2284 }, { "epoch": 142.75, "loss": 0.005272651091217995, "loss_ce": 2.362753548368346e-05, "loss_xval": 0.0052490234375, "num_input_tokens_seen": 157947584, "step": 2284 }, { "epoch": 142.8125, "grad_norm": 3.8481785567403946, "learning_rate": 5e-05, "loss": 0.0043, "num_input_tokens_seen": 158019264, "step": 2285 }, { "epoch": 142.8125, "loss": 0.00454552099108696, "loss_ce": 2.8919605028931983e-05, "loss_xval": 0.0045166015625, "num_input_tokens_seen": 158019264, "step": 2285 }, { "epoch": 142.875, "grad_norm": 1.639097733325779, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 158091008, "step": 2286 }, { "epoch": 142.875, "loss": 0.0011131562059745193, "loss_ce": 3.741156615433283e-05, "loss_xval": 0.00107574462890625, "num_input_tokens_seen": 158091008, "step": 2286 }, { "epoch": 142.9375, "grad_norm": 1.261991233490499, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 158162624, "step": 2287 }, { "epoch": 142.9375, "loss": 0.0009148393291980028, "loss_ce": 4.8903075366979465e-05, "loss_xval": 0.000865936279296875, "num_input_tokens_seen": 158162624, "step": 2287 }, { "epoch": 143.0, "grad_norm": 3.2012555194906236, "learning_rate": 5e-05, "loss": 0.0032, "num_input_tokens_seen": 158234240, "step": 2288 }, { "epoch": 143.0, "loss": 0.0035402164794504642, "loss_ce": 6.121250044088811e-05, "loss_xval": 0.00347900390625, "num_input_tokens_seen": 158234240, "step": 2288 }, { "epoch": 143.0625, "grad_norm": 3.072103026451075, "learning_rate": 5e-05, "loss": 0.0029, "num_input_tokens_seen": 158305920, "step": 2289 }, { "epoch": 143.0625, "loss": 0.003025761106982827, "loss_ce": 8.081478881649673e-05, "loss_xval": 0.0029449462890625, "num_input_tokens_seen": 158305920, "step": 2289 }, { "epoch": 143.125, "grad_norm": 0.9733620436290504, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 158377472, "step": 2290 }, { "epoch": 143.125, "loss": 0.0005783883389085531, "loss_ce": 0.00010345852206228301, "loss_xval": 0.0004749298095703125, "num_input_tokens_seen": 158377472, "step": 2290 }, { "epoch": 143.1875, "grad_norm": 1.6681225600357712, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 158449152, "step": 2291 }, { "epoch": 143.1875, "loss": 0.0009542530169710517, "loss_ce": 0.00012264902761671692, "loss_xval": 0.00083160400390625, "num_input_tokens_seen": 158449152, "step": 2291 }, { "epoch": 143.25, "grad_norm": 3.0001199342153058, "learning_rate": 5e-05, "loss": 0.003, "num_input_tokens_seen": 158520832, "step": 2292 }, { "epoch": 143.25, "loss": 0.00304748909547925, "loss_ce": 0.00014831911539658904, "loss_xval": 0.002899169921875, "num_input_tokens_seen": 158520832, "step": 2292 }, { "epoch": 143.3125, "grad_norm": 2.267618572064086, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 158592512, "step": 2293 }, { "epoch": 143.3125, "loss": 0.002161857206374407, "loss_ce": 0.00017821461369749159, "loss_xval": 0.001983642578125, "num_input_tokens_seen": 158592512, "step": 2293 }, { "epoch": 143.375, "grad_norm": 0.04030482510571815, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 158651520, "step": 2294 }, { "epoch": 143.375, "loss": 0.00026899820659309626, "loss_ce": 0.00017506130097899586, "loss_xval": 9.393692016601562e-05, "num_input_tokens_seen": 158651520, "step": 2294 }, { "epoch": 143.4375, "grad_norm": 2.254898691584632, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 158723200, "step": 2295 }, { "epoch": 143.4375, "loss": 0.001831955392844975, "loss_ce": 0.00019163561228197068, "loss_xval": 0.00164031982421875, "num_input_tokens_seen": 158723200, "step": 2295 }, { "epoch": 143.5, "grad_norm": 2.7482044925183335, "learning_rate": 5e-05, "loss": 0.0025, "num_input_tokens_seen": 158794816, "step": 2296 }, { "epoch": 143.5, "loss": 0.0025814685504883528, "loss_ce": 0.0001858387258835137, "loss_xval": 0.0023956298828125, "num_input_tokens_seen": 158794816, "step": 2296 }, { "epoch": 143.5625, "grad_norm": 1.1240351697063617, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 158866432, "step": 2297 }, { "epoch": 143.5625, "loss": 0.0007681635906919837, "loss_ce": 0.00019977372721768916, "loss_xval": 0.000568389892578125, "num_input_tokens_seen": 158866432, "step": 2297 }, { "epoch": 143.625, "grad_norm": 1.1891412391354403, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 158938112, "step": 2298 }, { "epoch": 143.625, "loss": 0.0009352611377835274, "loss_ce": 0.0001837657910073176, "loss_xval": 0.000751495361328125, "num_input_tokens_seen": 158938112, "step": 2298 }, { "epoch": 143.6875, "grad_norm": 2.247141443842513, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 159009792, "step": 2299 }, { "epoch": 143.6875, "loss": 0.002149116713553667, "loss_ce": 0.00016547413542866707, "loss_xval": 0.001983642578125, "num_input_tokens_seen": 159009792, "step": 2299 }, { "epoch": 143.75, "grad_norm": 1.6416382389026558, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 159081472, "step": 2300 }, { "epoch": 143.75, "loss": 0.001146484981290996, "loss_ce": 0.00016229308675974607, "loss_xval": 0.00098419189453125, "num_input_tokens_seen": 159081472, "step": 2300 }, { "epoch": 143.8125, "grad_norm": 0.349722522096258, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 159153088, "step": 2301 }, { "epoch": 143.8125, "loss": 0.00040729466127231717, "loss_ce": 0.00014217320131137967, "loss_xval": 0.0002651214599609375, "num_input_tokens_seen": 159153088, "step": 2301 }, { "epoch": 143.875, "grad_norm": 0.6319018735463661, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 159224640, "step": 2302 }, { "epoch": 143.875, "loss": 0.0005437061772681773, "loss_ce": 0.00012790417531505227, "loss_xval": 0.000415802001953125, "num_input_tokens_seen": 159224640, "step": 2302 }, { "epoch": 143.9375, "grad_norm": 0.8852029961655974, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 159296256, "step": 2303 }, { "epoch": 143.9375, "loss": 0.000454133638413623, "loss_ce": 0.00010890352859860286, "loss_xval": 0.0003452301025390625, "num_input_tokens_seen": 159296256, "step": 2303 }, { "epoch": 144.0, "grad_norm": 0.7714757947019761, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 159368064, "step": 2304 }, { "epoch": 144.0, "loss": 0.00044721277663484216, "loss_ce": 0.00010579736408544704, "loss_xval": 0.0003414154052734375, "num_input_tokens_seen": 159368064, "step": 2304 }, { "epoch": 144.0625, "grad_norm": 0.4846986541212903, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 159439616, "step": 2305 }, { "epoch": 144.0625, "loss": 0.0004524993128143251, "loss_ce": 9.391778439749032e-05, "loss_xval": 0.00035858154296875, "num_input_tokens_seen": 159439616, "step": 2305 }, { "epoch": 144.125, "grad_norm": 0.220384214054421, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 159511296, "step": 2306 }, { "epoch": 144.125, "loss": 0.00016809874796308577, "loss_ce": 7.797653233865276e-05, "loss_xval": 9.012222290039062e-05, "num_input_tokens_seen": 159511296, "step": 2306 }, { "epoch": 144.1875, "grad_norm": 1.0475055335972265, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 159583104, "step": 2307 }, { "epoch": 144.1875, "loss": 0.00048047397285699844, "loss_ce": 6.276463682297617e-05, "loss_xval": 0.0004177093505859375, "num_input_tokens_seen": 159583104, "step": 2307 }, { "epoch": 144.25, "grad_norm": 1.7401840496492975, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 159654848, "step": 2308 }, { "epoch": 144.25, "loss": 0.0011681739706546068, "loss_ce": 5.428240183391608e-05, "loss_xval": 0.0011138916015625, "num_input_tokens_seen": 159654848, "step": 2308 }, { "epoch": 144.3125, "grad_norm": 2.008997027680627, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 159726528, "step": 2309 }, { "epoch": 144.3125, "loss": 0.0014396762708202004, "loss_ce": 5.112646613270044e-05, "loss_xval": 0.0013885498046875, "num_input_tokens_seen": 159726528, "step": 2309 }, { "epoch": 144.375, "grad_norm": 1.7834433012025868, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 159798208, "step": 2310 }, { "epoch": 144.375, "loss": 0.001046067220158875, "loss_ce": 4.661651837523095e-05, "loss_xval": 0.00099945068359375, "num_input_tokens_seen": 159798208, "step": 2310 }, { "epoch": 144.4375, "grad_norm": 1.4923854766570368, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 159857344, "step": 2311 }, { "epoch": 144.4375, "loss": 0.0011433206964284182, "loss_ce": 4.468788392841816e-05, "loss_xval": 0.0010986328125, "num_input_tokens_seen": 159857344, "step": 2311 }, { "epoch": 144.5, "grad_norm": 1.4666471946527897, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 159928896, "step": 2312 }, { "epoch": 144.5, "loss": 0.0006469779764302075, "loss_ce": 4.044111119583249e-05, "loss_xval": 0.000606536865234375, "num_input_tokens_seen": 159928896, "step": 2312 }, { "epoch": 144.5625, "grad_norm": 1.4521669594107383, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 160000576, "step": 2313 }, { "epoch": 144.5625, "loss": 0.0008465639548376203, "loss_ce": 3.784813452512026e-05, "loss_xval": 0.0008087158203125, "num_input_tokens_seen": 160000576, "step": 2313 }, { "epoch": 144.625, "grad_norm": 1.3065765034926327, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 160072256, "step": 2314 }, { "epoch": 144.625, "loss": 0.0010280825663357973, "loss_ce": 3.6261219065636396e-05, "loss_xval": 0.0009918212890625, "num_input_tokens_seen": 160072256, "step": 2314 }, { "epoch": 144.6875, "grad_norm": 1.2298249889341375, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 160143808, "step": 2315 }, { "epoch": 144.6875, "loss": 0.0005464506102725863, "loss_ce": 3.5281202144687995e-05, "loss_xval": 0.00051116943359375, "num_input_tokens_seen": 160143808, "step": 2315 }, { "epoch": 144.75, "grad_norm": 1.1456463289722179, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 160215552, "step": 2316 }, { "epoch": 144.75, "loss": 0.0007918947958387434, "loss_ce": 3.277005453128368e-05, "loss_xval": 0.000759124755859375, "num_input_tokens_seen": 160215552, "step": 2316 }, { "epoch": 144.8125, "grad_norm": 0.9536307015781831, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 160287232, "step": 2317 }, { "epoch": 144.8125, "loss": 0.0003948156663682312, "loss_ce": 3.0512072044075467e-05, "loss_xval": 0.0003643035888671875, "num_input_tokens_seen": 160287232, "step": 2317 }, { "epoch": 144.875, "grad_norm": 0.8927383093885575, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 160358912, "step": 2318 }, { "epoch": 144.875, "loss": 0.000286363298073411, "loss_ce": 3.07785885524936e-05, "loss_xval": 0.000255584716796875, "num_input_tokens_seen": 160358912, "step": 2318 }, { "epoch": 144.9375, "grad_norm": 1.0475096611391477, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 160430464, "step": 2319 }, { "epoch": 144.9375, "loss": 0.00042939482955262065, "loss_ce": 2.8851631213910878e-05, "loss_xval": 0.000400543212890625, "num_input_tokens_seen": 160430464, "step": 2319 }, { "epoch": 145.0, "grad_norm": 1.2569209874717198, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 160502144, "step": 2320 }, { "epoch": 145.0, "loss": 0.0005893835914321244, "loss_ce": 2.86231061181752e-05, "loss_xval": 0.000560760498046875, "num_input_tokens_seen": 160502144, "step": 2320 }, { "epoch": 145.0625, "grad_norm": 1.433665622859712, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 160573824, "step": 2321 }, { "epoch": 145.0625, "loss": 0.0007936473120935261, "loss_ce": 2.6893150788964704e-05, "loss_xval": 0.000766754150390625, "num_input_tokens_seen": 160573824, "step": 2321 }, { "epoch": 145.125, "grad_norm": 1.6109769417554887, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 160645504, "step": 2322 }, { "epoch": 145.125, "loss": 0.0008781353244557977, "loss_ce": 2.7457821488496847e-05, "loss_xval": 0.000850677490234375, "num_input_tokens_seen": 160645504, "step": 2322 }, { "epoch": 145.1875, "grad_norm": 2.0578679521087397, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 160717056, "step": 2323 }, { "epoch": 145.1875, "loss": 0.001340178889222443, "loss_ce": 2.7923082598135807e-05, "loss_xval": 0.001312255859375, "num_input_tokens_seen": 160717056, "step": 2323 }, { "epoch": 145.25, "grad_norm": 3.059981768578544, "learning_rate": 5e-05, "loss": 0.0028, "num_input_tokens_seen": 160776128, "step": 2324 }, { "epoch": 145.25, "loss": 0.0027596692088991404, "loss_ce": 2.8345857572276145e-05, "loss_xval": 0.0027313232421875, "num_input_tokens_seen": 160776128, "step": 2324 }, { "epoch": 145.3125, "grad_norm": 4.841719922986072, "learning_rate": 5e-05, "loss": 0.0067, "num_input_tokens_seen": 160847808, "step": 2325 }, { "epoch": 145.3125, "loss": 0.006712182424962521, "loss_ce": 2.8832586394855753e-05, "loss_xval": 0.006683349609375, "num_input_tokens_seen": 160847808, "step": 2325 }, { "epoch": 145.375, "grad_norm": 7.633571031981438, "learning_rate": 5e-05, "loss": 0.0166, "num_input_tokens_seen": 160919552, "step": 2326 }, { "epoch": 145.375, "loss": 0.01700500398874283, "loss_ce": 3.7230420275591314e-05, "loss_xval": 0.0169677734375, "num_input_tokens_seen": 160919552, "step": 2326 }, { "epoch": 145.4375, "grad_norm": 10.222211247378048, "learning_rate": 5e-05, "loss": 0.03, "num_input_tokens_seen": 160991232, "step": 2327 }, { "epoch": 145.4375, "loss": 0.029842397198081017, "loss_ce": 5.724148650187999e-05, "loss_xval": 0.02978515625, "num_input_tokens_seen": 160991232, "step": 2327 }, { "epoch": 145.5, "grad_norm": 8.580259887263983, "learning_rate": 5e-05, "loss": 0.0219, "num_input_tokens_seen": 161063040, "step": 2328 }, { "epoch": 145.5, "loss": 0.02100929617881775, "loss_ce": 0.0001352726249024272, "loss_xval": 0.0208740234375, "num_input_tokens_seen": 161063040, "step": 2328 }, { "epoch": 145.5625, "grad_norm": 1.5104478122402045, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 161134656, "step": 2329 }, { "epoch": 145.5625, "loss": 0.0015624697552993894, "loss_ce": 0.0003341372648719698, "loss_xval": 0.00122833251953125, "num_input_tokens_seen": 161134656, "step": 2329 }, { "epoch": 145.625, "grad_norm": 5.965994091283684, "learning_rate": 5e-05, "loss": 0.0117, "num_input_tokens_seen": 161206400, "step": 2330 }, { "epoch": 145.625, "loss": 0.011319183744490147, "loss_ce": 0.000699066964443773, "loss_xval": 0.0106201171875, "num_input_tokens_seen": 161206400, "step": 2330 }, { "epoch": 145.6875, "grad_norm": 7.511859289700143, "learning_rate": 5e-05, "loss": 0.0182, "num_input_tokens_seen": 161278080, "step": 2331 }, { "epoch": 145.6875, "loss": 0.018168123438954353, "loss_ce": 0.0008341383654624224, "loss_xval": 0.017333984375, "num_input_tokens_seen": 161278080, "step": 2331 }, { "epoch": 145.75, "grad_norm": 1.5647029181604932, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 161349632, "step": 2332 }, { "epoch": 145.75, "loss": 0.0016789359506219625, "loss_ce": 0.0007977409404702485, "loss_xval": 0.000881195068359375, "num_input_tokens_seen": 161349632, "step": 2332 }, { "epoch": 145.8125, "grad_norm": 5.257327290104701, "learning_rate": 5e-05, "loss": 0.0097, "num_input_tokens_seen": 161421312, "step": 2333 }, { "epoch": 145.8125, "loss": 0.009456822648644447, "loss_ce": 0.0006067253416404128, "loss_xval": 0.00885009765625, "num_input_tokens_seen": 161421312, "step": 2333 }, { "epoch": 145.875, "grad_norm": 4.94192237617853, "learning_rate": 5e-05, "loss": 0.008, "num_input_tokens_seen": 161492928, "step": 2334 }, { "epoch": 145.875, "loss": 0.007787576876580715, "loss_ce": 0.0004023227375000715, "loss_xval": 0.00738525390625, "num_input_tokens_seen": 161492928, "step": 2334 }, { "epoch": 145.9375, "grad_norm": 1.2994286275910956, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 161564544, "step": 2335 }, { "epoch": 145.9375, "loss": 0.0014959691325202584, "loss_ce": 0.00026763658388517797, "loss_xval": 0.00122833251953125, "num_input_tokens_seen": 161564544, "step": 2335 }, { "epoch": 146.0, "grad_norm": 5.7923874654102, "learning_rate": 5e-05, "loss": 0.0101, "num_input_tokens_seen": 161636160, "step": 2336 }, { "epoch": 146.0, "loss": 0.010387586429715157, "loss_ce": 0.00019471513223834336, "loss_xval": 0.01019287109375, "num_input_tokens_seen": 161636160, "step": 2336 }, { "epoch": 146.0625, "grad_norm": 4.061666459172769, "learning_rate": 5e-05, "loss": 0.0052, "num_input_tokens_seen": 161707776, "step": 2337 }, { "epoch": 146.0625, "loss": 0.005342346150428057, "loss_ce": 0.00015435769455507398, "loss_xval": 0.00518798828125, "num_input_tokens_seen": 161707776, "step": 2337 }, { "epoch": 146.125, "grad_norm": 1.5401101359509008, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 161779392, "step": 2338 }, { "epoch": 146.125, "loss": 0.0011722747003659606, "loss_ce": 0.0001346770004602149, "loss_xval": 0.00103759765625, "num_input_tokens_seen": 161779392, "step": 2338 }, { "epoch": 146.1875, "grad_norm": 5.173482264622772, "learning_rate": 5e-05, "loss": 0.008, "num_input_tokens_seen": 161851008, "step": 2339 }, { "epoch": 146.1875, "loss": 0.008483229205012321, "loss_ce": 0.00012141237675677985, "loss_xval": 0.00836181640625, "num_input_tokens_seen": 161851008, "step": 2339 }, { "epoch": 146.25, "grad_norm": 3.4134114365896546, "learning_rate": 5e-05, "loss": 0.0037, "num_input_tokens_seen": 161922624, "step": 2340 }, { "epoch": 146.25, "loss": 0.003694977844133973, "loss_ce": 0.00012442127626854926, "loss_xval": 0.003570556640625, "num_input_tokens_seen": 161922624, "step": 2340 }, { "epoch": 146.3125, "grad_norm": 1.4282072315824867, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 161981760, "step": 2341 }, { "epoch": 146.3125, "loss": 0.0013863763306289911, "loss_ce": 0.0001351555692963302, "loss_xval": 0.001251220703125, "num_input_tokens_seen": 161981760, "step": 2341 }, { "epoch": 146.375, "grad_norm": 4.02508908047131, "learning_rate": 5e-05, "loss": 0.0051, "num_input_tokens_seen": 162053312, "step": 2342 }, { "epoch": 146.375, "loss": 0.005358177702873945, "loss_ce": 0.0001396719308104366, "loss_xval": 0.005218505859375, "num_input_tokens_seen": 162053312, "step": 2342 }, { "epoch": 146.4375, "grad_norm": 2.0923616011747375, "learning_rate": 5e-05, "loss": 0.002, "num_input_tokens_seen": 162124928, "step": 2343 }, { "epoch": 146.4375, "loss": 0.0017234979895874858, "loss_ce": 0.00015184275980573148, "loss_xval": 0.0015716552734375, "num_input_tokens_seen": 162124928, "step": 2343 }, { "epoch": 146.5, "grad_norm": 1.746073179556703, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 162196736, "step": 2344 }, { "epoch": 146.5, "loss": 0.0013283995212987065, "loss_ce": 0.00016110220167320222, "loss_xval": 0.00116729736328125, "num_input_tokens_seen": 162196736, "step": 2344 }, { "epoch": 146.5625, "grad_norm": 3.4177429260977448, "learning_rate": 5e-05, "loss": 0.004, "num_input_tokens_seen": 162268352, "step": 2345 }, { "epoch": 146.5625, "loss": 0.004274115432053804, "loss_ce": 0.00018476003606338054, "loss_xval": 0.00408935546875, "num_input_tokens_seen": 162268352, "step": 2345 }, { "epoch": 146.625, "grad_norm": 1.3166728482788537, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 162340032, "step": 2346 }, { "epoch": 146.625, "loss": 0.0010671853087842464, "loss_ce": 0.000178360816789791, "loss_xval": 0.000888824462890625, "num_input_tokens_seen": 162340032, "step": 2346 }, { "epoch": 146.6875, "grad_norm": 2.075985271349164, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 162386560, "step": 2347 }, { "epoch": 146.6875, "loss": 0.0016639826353639364, "loss_ce": 0.00018388015450909734, "loss_xval": 0.0014801025390625, "num_input_tokens_seen": 162386560, "step": 2347 }, { "epoch": 146.75, "grad_norm": 2.8229531028952852, "learning_rate": 5e-05, "loss": 0.0028, "num_input_tokens_seen": 162458176, "step": 2348 }, { "epoch": 146.75, "loss": 0.002624648157507181, "loss_ce": 0.00018324196571484208, "loss_xval": 0.00244140625, "num_input_tokens_seen": 162458176, "step": 2348 }, { "epoch": 146.8125, "grad_norm": 0.5651772904000949, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 162529856, "step": 2349 }, { "epoch": 146.8125, "loss": 0.0006011626101098955, "loss_ce": 0.00017391651635989547, "loss_xval": 0.00042724609375, "num_input_tokens_seen": 162529856, "step": 2349 }, { "epoch": 146.875, "grad_norm": 2.122147012123423, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 162589056, "step": 2350 }, { "epoch": 146.875, "loss": 0.0016808974323794246, "loss_ce": 0.0001550184824736789, "loss_xval": 0.00152587890625, "num_input_tokens_seen": 162589056, "step": 2350 }, { "epoch": 146.9375, "grad_norm": 2.6642079199279696, "learning_rate": 5e-05, "loss": 0.0024, "num_input_tokens_seen": 162660736, "step": 2351 }, { "epoch": 146.9375, "loss": 0.002070722868666053, "loss_ce": 0.00014811537403147668, "loss_xval": 0.001922607421875, "num_input_tokens_seen": 162660736, "step": 2351 }, { "epoch": 147.0, "grad_norm": 0.6804208855864794, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 162719872, "step": 2352 }, { "epoch": 147.0, "loss": 0.00033290323335677385, "loss_ce": 0.00012500221782829612, "loss_xval": 0.0002079010009765625, "num_input_tokens_seen": 162719872, "step": 2352 }, { "epoch": 147.0625, "grad_norm": 2.030024025671497, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 162791488, "step": 2353 }, { "epoch": 147.0625, "loss": 0.0017423724057152867, "loss_ce": 0.00010968195419991389, "loss_xval": 0.0016326904296875, "num_input_tokens_seen": 162791488, "step": 2353 }, { "epoch": 147.125, "grad_norm": 3.117284781056457, "learning_rate": 5e-05, "loss": 0.0031, "num_input_tokens_seen": 162863296, "step": 2354 }, { "epoch": 147.125, "loss": 0.003642292460426688, "loss_ce": 0.00010225331061519682, "loss_xval": 0.0035400390625, "num_input_tokens_seen": 162863296, "step": 2354 }, { "epoch": 147.1875, "grad_norm": 1.6553163910169943, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 162934848, "step": 2355 }, { "epoch": 147.1875, "loss": 0.0010774503462016582, "loss_ce": 9.325839346274734e-05, "loss_xval": 0.00098419189453125, "num_input_tokens_seen": 162934848, "step": 2355 }, { "epoch": 147.25, "grad_norm": 0.9701196269738218, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 163006528, "step": 2356 }, { "epoch": 147.25, "loss": 0.00039144710171967745, "loss_ce": 8.817866182653233e-05, "loss_xval": 0.0003032684326171875, "num_input_tokens_seen": 163006528, "step": 2356 }, { "epoch": 147.3125, "grad_norm": 2.569815599589962, "learning_rate": 5e-05, "loss": 0.0022, "num_input_tokens_seen": 163078208, "step": 2357 }, { "epoch": 147.3125, "loss": 0.0022654198110103607, "loss_ce": 8.341299690073356e-05, "loss_xval": 0.0021820068359375, "num_input_tokens_seen": 163078208, "step": 2357 }, { "epoch": 147.375, "grad_norm": 2.0197217575491, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 163149888, "step": 2358 }, { "epoch": 147.375, "loss": 0.0012817709939554334, "loss_ce": 7.632669439772144e-05, "loss_xval": 0.0012054443359375, "num_input_tokens_seen": 163149888, "step": 2358 }, { "epoch": 147.4375, "grad_norm": 0.16068342052611778, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 163221632, "step": 2359 }, { "epoch": 147.4375, "loss": 0.0002521135611459613, "loss_ce": 7.28227969375439e-05, "loss_xval": 0.000179290771484375, "num_input_tokens_seen": 163221632, "step": 2359 }, { "epoch": 147.5, "grad_norm": 2.114308270336181, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 163280768, "step": 2360 }, { "epoch": 147.5, "loss": 0.0013699112460017204, "loss_ce": 6.52848175377585e-05, "loss_xval": 0.00130462646484375, "num_input_tokens_seen": 163280768, "step": 2360 }, { "epoch": 147.5625, "grad_norm": 2.6919587894761263, "learning_rate": 5e-05, "loss": 0.0024, "num_input_tokens_seen": 163352384, "step": 2361 }, { "epoch": 147.5625, "loss": 0.002398441778495908, "loss_ce": 6.384702282957733e-05, "loss_xval": 0.0023345947265625, "num_input_tokens_seen": 163352384, "step": 2361 }, { "epoch": 147.625, "grad_norm": 1.5939603759228747, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 163424064, "step": 2362 }, { "epoch": 147.625, "loss": 0.001161682652309537, "loss_ce": 6.304984708549455e-05, "loss_xval": 0.0010986328125, "num_input_tokens_seen": 163424064, "step": 2362 }, { "epoch": 147.6875, "grad_norm": 0.46960435548578017, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 163495744, "step": 2363 }, { "epoch": 147.6875, "loss": 0.0001773663389030844, "loss_ce": 5.815705662826076e-05, "loss_xval": 0.00011920928955078125, "num_input_tokens_seen": 163495744, "step": 2363 }, { "epoch": 147.75, "grad_norm": 1.94759574317629, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 163567360, "step": 2364 }, { "epoch": 147.75, "loss": 0.0014991631032899022, "loss_ce": 5.720752233173698e-05, "loss_xval": 0.00144195556640625, "num_input_tokens_seen": 163567360, "step": 2364 }, { "epoch": 147.8125, "grad_norm": 1.7810217172520648, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 163638912, "step": 2365 }, { "epoch": 147.8125, "loss": 0.0013713567750528455, "loss_ce": 5.1471488404786214e-05, "loss_xval": 0.00131988525390625, "num_input_tokens_seen": 163638912, "step": 2365 }, { "epoch": 147.875, "grad_norm": 0.2965164840091746, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 163698048, "step": 2366 }, { "epoch": 147.875, "loss": 0.00019151023298036307, "loss_ce": 5.4181127779884264e-05, "loss_xval": 0.0001373291015625, "num_input_tokens_seen": 163698048, "step": 2366 }, { "epoch": 147.9375, "grad_norm": 1.2371936328233522, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 163744640, "step": 2367 }, { "epoch": 147.9375, "loss": 0.0007852339185774326, "loss_ce": 5.662675903295167e-05, "loss_xval": 0.000728607177734375, "num_input_tokens_seen": 163744640, "step": 2367 }, { "epoch": 148.0, "grad_norm": 1.8258004590520762, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 163816256, "step": 2368 }, { "epoch": 148.0, "loss": 0.001227924833074212, "loss_ce": 5.299809708958492e-05, "loss_xval": 0.0011749267578125, "num_input_tokens_seen": 163816256, "step": 2368 }, { "epoch": 148.0625, "grad_norm": 1.3028197191754545, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 163887872, "step": 2369 }, { "epoch": 148.0625, "loss": 0.0006705644191242754, "loss_ce": 5.2583487558877096e-05, "loss_xval": 0.00061798095703125, "num_input_tokens_seen": 163887872, "step": 2369 }, { "epoch": 148.125, "grad_norm": 0.2217973624577362, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 163959424, "step": 2370 }, { "epoch": 148.125, "loss": 0.0006748749292455614, "loss_ce": 4.926458859699778e-05, "loss_xval": 0.0006256103515625, "num_input_tokens_seen": 163959424, "step": 2370 }, { "epoch": 148.1875, "grad_norm": 1.423999264306517, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 164031040, "step": 2371 }, { "epoch": 148.1875, "loss": 0.0007270199712365866, "loss_ce": 4.8003839765442535e-05, "loss_xval": 0.00067901611328125, "num_input_tokens_seen": 164031040, "step": 2371 }, { "epoch": 148.25, "grad_norm": 1.8733486817133544, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 164090176, "step": 2372 }, { "epoch": 148.25, "loss": 0.00138192274607718, "loss_ce": 4.677875404013321e-05, "loss_xval": 0.00133514404296875, "num_input_tokens_seen": 164090176, "step": 2372 }, { "epoch": 148.3125, "grad_norm": 1.1859644410491554, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 164161920, "step": 2373 }, { "epoch": 148.3125, "loss": 0.0008659652667120099, "loss_ce": 4.962003367836587e-05, "loss_xval": 0.00081634521484375, "num_input_tokens_seen": 164161920, "step": 2373 }, { "epoch": 148.375, "grad_norm": 0.18254656740800682, "learning_rate": 5e-05, "loss": 0.0001, "num_input_tokens_seen": 164233664, "step": 2374 }, { "epoch": 148.375, "loss": 8.343144145328552e-05, "loss_ce": 4.2661868064897135e-05, "loss_xval": 4.076957702636719e-05, "num_input_tokens_seen": 164233664, "step": 2374 }, { "epoch": 148.4375, "grad_norm": 1.4045512358573804, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 164292800, "step": 2375 }, { "epoch": 148.4375, "loss": 0.0006769074825569987, "loss_ce": 4.366773646324873e-05, "loss_xval": 0.00063323974609375, "num_input_tokens_seen": 164292800, "step": 2375 }, { "epoch": 148.5, "grad_norm": 1.761208723522083, "learning_rate": 5e-05, "loss": 0.0013, "num_input_tokens_seen": 164364416, "step": 2376 }, { "epoch": 148.5, "loss": 0.00126472651027143, "loss_ce": 4.402335616759956e-05, "loss_xval": 0.001220703125, "num_input_tokens_seen": 164364416, "step": 2376 }, { "epoch": 148.5625, "grad_norm": 1.2882790294587565, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 164436096, "step": 2377 }, { "epoch": 148.5625, "loss": 0.0005924896104261279, "loss_ce": 3.9358477806672454e-05, "loss_xval": 0.000553131103515625, "num_input_tokens_seen": 164436096, "step": 2377 }, { "epoch": 148.625, "grad_norm": 0.666004216117137, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 164507712, "step": 2378 }, { "epoch": 148.625, "loss": 0.0004235539527144283, "loss_ce": 4.208423706586473e-05, "loss_xval": 0.0003814697265625, "num_input_tokens_seen": 164507712, "step": 2378 }, { "epoch": 148.6875, "grad_norm": 0.2717273144022413, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 164566784, "step": 2379 }, { "epoch": 148.6875, "loss": 0.0004639588587451726, "loss_ce": 4.43421486124862e-05, "loss_xval": 0.00041961669921875, "num_input_tokens_seen": 164566784, "step": 2379 }, { "epoch": 148.75, "grad_norm": 1.1169222811849129, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 164625856, "step": 2380 }, { "epoch": 148.75, "loss": 0.0005799496429972351, "loss_ce": 4.207733218208887e-05, "loss_xval": 0.000537872314453125, "num_input_tokens_seen": 164625856, "step": 2380 }, { "epoch": 148.8125, "grad_norm": 1.9921527724459633, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 164697472, "step": 2381 }, { "epoch": 148.8125, "loss": 0.0016376186395063996, "loss_ce": 4.307520794100128e-05, "loss_xval": 0.00159454345703125, "num_input_tokens_seen": 164697472, "step": 2381 }, { "epoch": 148.875, "grad_norm": 2.5562549292946337, "learning_rate": 5e-05, "loss": 0.0022, "num_input_tokens_seen": 164756480, "step": 2382 }, { "epoch": 148.875, "loss": 0.002530452562496066, "loss_ce": 4.326990165282041e-05, "loss_xval": 0.0024871826171875, "num_input_tokens_seen": 164756480, "step": 2382 }, { "epoch": 148.9375, "grad_norm": 2.5740588770110158, "learning_rate": 5e-05, "loss": 0.0022, "num_input_tokens_seen": 164828032, "step": 2383 }, { "epoch": 148.9375, "loss": 0.0023500563111156225, "loss_ce": 4.597923543769866e-05, "loss_xval": 0.0023040771484375, "num_input_tokens_seen": 164828032, "step": 2383 }, { "epoch": 149.0, "grad_norm": 1.9237964044249054, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 164899648, "step": 2384 }, { "epoch": 149.0, "loss": 0.001942421542480588, "loss_ce": 5.033174966229126e-05, "loss_xval": 0.00189208984375, "num_input_tokens_seen": 164899648, "step": 2384 }, { "epoch": 149.0625, "grad_norm": 0.9794175832805015, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 164946240, "step": 2385 }, { "epoch": 149.0625, "loss": 0.00033468782203271985, "loss_ce": 5.430756209534593e-05, "loss_xval": 0.0002803802490234375, "num_input_tokens_seen": 164946240, "step": 2385 }, { "epoch": 149.125, "grad_norm": 0.4144373637048414, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 165017856, "step": 2386 }, { "epoch": 149.125, "loss": 0.0006403362494893372, "loss_ce": 5.6687582400627434e-05, "loss_xval": 0.000583648681640625, "num_input_tokens_seen": 165017856, "step": 2386 }, { "epoch": 149.1875, "grad_norm": 0.23057807805512553, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 165077056, "step": 2387 }, { "epoch": 149.1875, "loss": 0.00042012392077594995, "loss_ce": 5.963501462247223e-05, "loss_xval": 0.0003604888916015625, "num_input_tokens_seen": 165077056, "step": 2387 }, { "epoch": 149.25, "grad_norm": 0.11669737874547069, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 165148608, "step": 2388 }, { "epoch": 149.25, "loss": 0.00020264298655092716, "loss_ce": 5.959184272796847e-05, "loss_xval": 0.0001430511474609375, "num_input_tokens_seen": 165148608, "step": 2388 }, { "epoch": 149.3125, "grad_norm": 0.4484633697241466, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 165220224, "step": 2389 }, { "epoch": 149.3125, "loss": 0.0004950024886056781, "loss_ce": 6.775640940759331e-05, "loss_xval": 0.00042724609375, "num_input_tokens_seen": 165220224, "step": 2389 }, { "epoch": 149.375, "grad_norm": 1.0541395280626393, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 165291904, "step": 2390 }, { "epoch": 149.375, "loss": 0.0005390362348407507, "loss_ce": 6.0291731642792e-05, "loss_xval": 0.0004787445068359375, "num_input_tokens_seen": 165291904, "step": 2390 }, { "epoch": 149.4375, "grad_norm": 2.2593428653272807, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 165363648, "step": 2391 }, { "epoch": 149.4375, "loss": 0.0015220154309645295, "loss_ce": 5.7171691878465936e-05, "loss_xval": 0.00146484375, "num_input_tokens_seen": 165363648, "step": 2391 }, { "epoch": 149.5, "grad_norm": 3.7638655610361695, "learning_rate": 5e-05, "loss": 0.0044, "num_input_tokens_seen": 165435264, "step": 2392 }, { "epoch": 149.5, "loss": 0.004354399163275957, "loss_ce": 5.142059671925381e-05, "loss_xval": 0.004302978515625, "num_input_tokens_seen": 165435264, "step": 2392 }, { "epoch": 149.5625, "grad_norm": 5.0545420851926, "learning_rate": 5e-05, "loss": 0.0077, "num_input_tokens_seen": 165506816, "step": 2393 }, { "epoch": 149.5625, "loss": 0.00762578472495079, "loss_ce": 5.742551002185792e-05, "loss_xval": 0.007568359375, "num_input_tokens_seen": 165506816, "step": 2393 }, { "epoch": 149.625, "grad_norm": 5.500015390501117, "learning_rate": 5e-05, "loss": 0.009, "num_input_tokens_seen": 165578496, "step": 2394 }, { "epoch": 149.625, "loss": 0.008792513981461525, "loss_ce": 6.448627391364425e-05, "loss_xval": 0.00872802734375, "num_input_tokens_seen": 165578496, "step": 2394 }, { "epoch": 149.6875, "grad_norm": 4.239923001902639, "learning_rate": 5e-05, "loss": 0.0059, "num_input_tokens_seen": 165637632, "step": 2395 }, { "epoch": 149.6875, "loss": 0.006338251288980246, "loss_ce": 8.214791887439787e-05, "loss_xval": 0.006256103515625, "num_input_tokens_seen": 165637632, "step": 2395 }, { "epoch": 149.75, "grad_norm": 1.4885826961334672, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 165709312, "step": 2396 }, { "epoch": 149.75, "loss": 0.002050133654847741, "loss_ce": 0.00011226750939385965, "loss_xval": 0.0019378662109375, "num_input_tokens_seen": 165709312, "step": 2396 }, { "epoch": 149.8125, "grad_norm": 0.8805698261264313, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 165780864, "step": 2397 }, { "epoch": 149.8125, "loss": 0.0015474815154448152, "loss_ce": 0.00016656114894431084, "loss_xval": 0.00138092041015625, "num_input_tokens_seen": 165780864, "step": 2397 }, { "epoch": 149.875, "grad_norm": 1.5547521412813465, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 165852672, "step": 2398 }, { "epoch": 149.875, "loss": 0.0012131972471252084, "loss_ce": 0.00021374659263528883, "loss_xval": 0.00099945068359375, "num_input_tokens_seen": 165852672, "step": 2398 }, { "epoch": 149.9375, "grad_norm": 1.5844453429196825, "learning_rate": 5e-05, "loss": 0.0022, "num_input_tokens_seen": 165924352, "step": 2399 }, { "epoch": 149.9375, "loss": 0.0024127454962581396, "loss_ce": 0.00026125620934180915, "loss_xval": 0.0021514892578125, "num_input_tokens_seen": 165924352, "step": 2399 }, { "epoch": 150.0, "grad_norm": 1.3534716861597587, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 165996032, "step": 2400 }, { "epoch": 150.0, "loss": 0.0009662243537604809, "loss_ce": 0.0002681347250472754, "loss_xval": 0.000698089599609375, "num_input_tokens_seen": 165996032, "step": 2400 }, { "epoch": 150.0625, "grad_norm": 1.0737007932266736, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 166067584, "step": 2401 }, { "epoch": 150.0625, "loss": 0.0017657603602856398, "loss_ce": 0.0002475108194630593, "loss_xval": 0.00151824951171875, "num_input_tokens_seen": 166067584, "step": 2401 }, { "epoch": 150.125, "grad_norm": 0.6989581212698932, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 166126720, "step": 2402 }, { "epoch": 150.125, "loss": 0.0008287295931950212, "loss_ce": 0.00019930452981498092, "loss_xval": 0.000629425048828125, "num_input_tokens_seen": 166126720, "step": 2402 }, { "epoch": 150.1875, "grad_norm": 0.5112130537543741, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 166198272, "step": 2403 }, { "epoch": 150.1875, "loss": 0.0005174290854483843, "loss_ce": 0.00014358872431330383, "loss_xval": 0.00037384033203125, "num_input_tokens_seen": 166198272, "step": 2403 }, { "epoch": 150.25, "grad_norm": 0.3919207549294726, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 166270016, "step": 2404 }, { "epoch": 150.25, "loss": 0.0008279854082502425, "loss_ce": 0.00011082231503678486, "loss_xval": 0.0007171630859375, "num_input_tokens_seen": 166270016, "step": 2404 }, { "epoch": 150.3125, "grad_norm": 0.10739100581299145, "learning_rate": 5e-05, "loss": 0.0003, "num_input_tokens_seen": 166341760, "step": 2405 }, { "epoch": 150.3125, "loss": 0.00023223445168696344, "loss_ce": 8.346126560354605e-05, "loss_xval": 0.000148773193359375, "num_input_tokens_seen": 166341760, "step": 2405 }, { "epoch": 150.375, "grad_norm": 0.45975874155091206, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 166413376, "step": 2406 }, { "epoch": 150.375, "loss": 0.0007082895026542246, "loss_ce": 6.360567203955725e-05, "loss_xval": 0.000644683837890625, "num_input_tokens_seen": 166413376, "step": 2406 }, { "epoch": 150.4375, "grad_norm": 0.9410291482374249, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 166485056, "step": 2407 }, { "epoch": 150.4375, "loss": 0.0007459745393134654, "loss_ce": 4.407024243846536e-05, "loss_xval": 0.000701904296875, "num_input_tokens_seen": 166485056, "step": 2407 }, { "epoch": 150.5, "grad_norm": 1.6421583770726258, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 166556672, "step": 2408 }, { "epoch": 150.5, "loss": 0.0009627325343899429, "loss_ce": 3.576111339498311e-05, "loss_xval": 0.000926971435546875, "num_input_tokens_seen": 166556672, "step": 2408 }, { "epoch": 150.5625, "grad_norm": 2.7839599851905446, "learning_rate": 5e-05, "loss": 0.0028, "num_input_tokens_seen": 166628224, "step": 2409 }, { "epoch": 150.5625, "loss": 0.0030045805033296347, "loss_ce": 2.9116721634636633e-05, "loss_xval": 0.0029754638671875, "num_input_tokens_seen": 166628224, "step": 2409 }, { "epoch": 150.625, "grad_norm": 5.54840185171238, "learning_rate": 5e-05, "loss": 0.0092, "num_input_tokens_seen": 166699904, "step": 2410 }, { "epoch": 150.625, "loss": 0.009306657128036022, "loss_ce": 2.9313479899428785e-05, "loss_xval": 0.00927734375, "num_input_tokens_seen": 166699904, "step": 2410 }, { "epoch": 150.6875, "grad_norm": 10.46276851301423, "learning_rate": 5e-05, "loss": 0.0329, "num_input_tokens_seen": 166771456, "step": 2411 }, { "epoch": 150.6875, "loss": 0.03250272199511528, "loss_ce": 3.201870640623383e-05, "loss_xval": 0.032470703125, "num_input_tokens_seen": 166771456, "step": 2411 }, { "epoch": 150.75, "grad_norm": 12.49379731434549, "learning_rate": 5e-05, "loss": 0.0504, "num_input_tokens_seen": 166843136, "step": 2412 }, { "epoch": 150.75, "loss": 0.048925530165433884, "loss_ce": 9.740592940943316e-05, "loss_xval": 0.048828125, "num_input_tokens_seen": 166843136, "step": 2412 }, { "epoch": 150.8125, "grad_norm": 8.792072173625062, "learning_rate": 5e-05, "loss": 0.0248, "num_input_tokens_seen": 166914688, "step": 2413 }, { "epoch": 150.8125, "loss": 0.024670902639627457, "loss_ce": 0.0009892623638734221, "loss_xval": 0.023681640625, "num_input_tokens_seen": 166914688, "step": 2413 }, { "epoch": 150.875, "grad_norm": 4.2342744701784, "learning_rate": 5e-05, "loss": 0.0065, "num_input_tokens_seen": 166986304, "step": 2414 }, { "epoch": 150.875, "loss": 0.00638669403269887, "loss_ce": 0.000863012217450887, "loss_xval": 0.005523681640625, "num_input_tokens_seen": 166986304, "step": 2414 }, { "epoch": 150.9375, "grad_norm": 0.4366523557031741, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 167057984, "step": 2415 }, { "epoch": 150.9375, "loss": 0.0010362620232626796, "loss_ce": 0.0005537028191611171, "loss_xval": 0.0004825592041015625, "num_input_tokens_seen": 167057984, "step": 2415 }, { "epoch": 151.0, "grad_norm": 6.031583553086854, "learning_rate": 5e-05, "loss": 0.011, "num_input_tokens_seen": 167129664, "step": 2416 }, { "epoch": 151.0, "loss": 0.011357029899954796, "loss_ce": 0.0003707021242007613, "loss_xval": 0.010986328125, "num_input_tokens_seen": 167129664, "step": 2416 }, { "epoch": 151.0625, "grad_norm": 15.787760958608375, "learning_rate": 5e-05, "loss": 0.0699, "num_input_tokens_seen": 167201280, "step": 2417 }, { "epoch": 151.0625, "loss": 0.07035034149885178, "loss_ce": 0.0005261220503598452, "loss_xval": 0.06982421875, "num_input_tokens_seen": 167201280, "step": 2417 }, { "epoch": 151.125, "grad_norm": 17.264080440378386, "learning_rate": 5e-05, "loss": 0.0875, "num_input_tokens_seen": 167272896, "step": 2418 }, { "epoch": 151.125, "loss": 0.08509006351232529, "loss_ce": 0.0001291264343308285, "loss_xval": 0.0849609375, "num_input_tokens_seen": 167272896, "step": 2418 }, { "epoch": 151.1875, "grad_norm": 5.293257878290243, "learning_rate": 5e-05, "loss": 0.0098, "num_input_tokens_seen": 167344640, "step": 2419 }, { "epoch": 151.1875, "loss": 0.009484825655817986, "loss_ce": 0.0001464468368794769, "loss_xval": 0.00933837890625, "num_input_tokens_seen": 167344640, "step": 2419 }, { "epoch": 151.25, "grad_norm": 8.234025956692294, "learning_rate": 5e-05, "loss": 0.0196, "num_input_tokens_seen": 167403840, "step": 2420 }, { "epoch": 151.25, "loss": 0.020006585866212845, "loss_ce": 0.00023119535762816668, "loss_xval": 0.019775390625, "num_input_tokens_seen": 167403840, "step": 2420 }, { "epoch": 151.3125, "grad_norm": 26.46834646209185, "learning_rate": 5e-05, "loss": 0.1832, "num_input_tokens_seen": 167475392, "step": 2421 }, { "epoch": 151.3125, "loss": 0.1851268708705902, "loss_ce": 0.0005565655883401632, "loss_xval": 0.1845703125, "num_input_tokens_seen": 167475392, "step": 2421 }, { "epoch": 151.375, "grad_norm": 22.697539209910644, "learning_rate": 5e-05, "loss": 0.1425, "num_input_tokens_seen": 167534400, "step": 2422 }, { "epoch": 151.375, "loss": 0.1417573243379593, "loss_ce": 0.00015575841825921088, "loss_xval": 0.1416015625, "num_input_tokens_seen": 167534400, "step": 2422 }, { "epoch": 151.4375, "grad_norm": 7.6623304718247995, "learning_rate": 5e-05, "loss": 0.0178, "num_input_tokens_seen": 167593472, "step": 2423 }, { "epoch": 151.4375, "loss": 0.017808010801672935, "loss_ce": 0.00022988590353634208, "loss_xval": 0.017578125, "num_input_tokens_seen": 167593472, "step": 2423 }, { "epoch": 151.5, "grad_norm": 414.1100107255893, "learning_rate": 5e-05, "loss": 0.9645, "num_input_tokens_seen": 167652608, "step": 2424 }, { "epoch": 151.5, "loss": 1.7413725852966309, "loss_ce": 0.030435139313340187, "loss_xval": 1.7109375, "num_input_tokens_seen": 167652608, "step": 2424 }, { "epoch": 151.5625, "grad_norm": 19.262266220946344, "learning_rate": 5e-05, "loss": 0.0932, "num_input_tokens_seen": 167724288, "step": 2425 }, { "epoch": 151.5625, "loss": 0.09090714901685715, "loss_ce": 0.0005751141579821706, "loss_xval": 0.09033203125, "num_input_tokens_seen": 167724288, "step": 2425 }, { "epoch": 151.625, "grad_norm": 24.36110882222433, "learning_rate": 5e-05, "loss": 0.1085, "num_input_tokens_seen": 167795904, "step": 2426 }, { "epoch": 151.625, "loss": 0.12360428273677826, "loss_ce": 0.08698318898677826, "loss_xval": 0.03662109375, "num_input_tokens_seen": 167795904, "step": 2426 }, { "epoch": 151.6875, "grad_norm": 36.983860252487354, "learning_rate": 5e-05, "loss": 0.5427, "num_input_tokens_seen": 167867456, "step": 2427 }, { "epoch": 151.6875, "loss": 0.5592447519302368, "loss_ce": 0.4049479067325592, "loss_xval": 0.154296875, "num_input_tokens_seen": 167867456, "step": 2427 }, { "epoch": 151.75, "grad_norm": 16.237935621266285, "learning_rate": 5e-05, "loss": 0.4207, "num_input_tokens_seen": 167939136, "step": 2428 }, { "epoch": 151.75, "loss": 0.4261643588542938, "loss_ce": 0.4107834994792938, "loss_xval": 0.015380859375, "num_input_tokens_seen": 167939136, "step": 2428 }, { "epoch": 151.8125, "grad_norm": 19.680429926189632, "learning_rate": 5e-05, "loss": 0.4199, "num_input_tokens_seen": 168010752, "step": 2429 }, { "epoch": 151.8125, "loss": 0.4241836667060852, "loss_ce": 0.3348281979560852, "loss_xval": 0.08935546875, "num_input_tokens_seen": 168010752, "step": 2429 }, { "epoch": 151.875, "grad_norm": 18.703801220606653, "learning_rate": 5e-05, "loss": 0.3537, "num_input_tokens_seen": 168069824, "step": 2430 }, { "epoch": 151.875, "loss": 0.35095417499542236, "loss_ce": 0.26648151874542236, "loss_xval": 0.08447265625, "num_input_tokens_seen": 168069824, "step": 2430 }, { "epoch": 151.9375, "grad_norm": 15.645070978296983, "learning_rate": 5e-05, "loss": 0.3064, "num_input_tokens_seen": 168141440, "step": 2431 }, { "epoch": 151.9375, "loss": 0.30103811621665955, "loss_ce": 0.26881155371665955, "loss_xval": 0.0322265625, "num_input_tokens_seen": 168141440, "step": 2431 }, { "epoch": 152.0, "grad_norm": 21.225905190040425, "learning_rate": 5e-05, "loss": 0.2675, "num_input_tokens_seen": 168213056, "step": 2432 }, { "epoch": 152.0, "loss": 0.2683914303779602, "loss_ce": 0.1848953515291214, "loss_xval": 0.08349609375, "num_input_tokens_seen": 168213056, "step": 2432 }, { "epoch": 152.0625, "grad_norm": 12.611375752594181, "learning_rate": 5e-05, "loss": 0.1306, "num_input_tokens_seen": 168272192, "step": 2433 }, { "epoch": 152.0625, "loss": 0.13391664624214172, "loss_ce": 0.09827210754156113, "loss_xval": 0.03564453125, "num_input_tokens_seen": 168272192, "step": 2433 }, { "epoch": 152.125, "grad_norm": 17.538312214800563, "learning_rate": 5e-05, "loss": 0.1296, "num_input_tokens_seen": 168343808, "step": 2434 }, { "epoch": 152.125, "loss": 0.1320444941520691, "loss_ce": 0.07052105665206909, "loss_xval": 0.0615234375, "num_input_tokens_seen": 168343808, "step": 2434 }, { "epoch": 152.1875, "grad_norm": 19.61837353575689, "learning_rate": 5e-05, "loss": 0.1222, "num_input_tokens_seen": 168415360, "step": 2435 }, { "epoch": 152.1875, "loss": 0.12624847888946533, "loss_ce": 0.047146908938884735, "loss_xval": 0.0791015625, "num_input_tokens_seen": 168415360, "step": 2435 }, { "epoch": 152.25, "grad_norm": 8.839200180151096, "learning_rate": 5e-05, "loss": 0.0516, "num_input_tokens_seen": 168487104, "step": 2436 }, { "epoch": 152.25, "loss": 0.050722721964120865, "loss_ce": 0.033877018839120865, "loss_xval": 0.016845703125, "num_input_tokens_seen": 168487104, "step": 2436 }, { "epoch": 152.3125, "grad_norm": 29.340307873660837, "learning_rate": 5e-05, "loss": 0.1746, "num_input_tokens_seen": 168558720, "step": 2437 }, { "epoch": 152.3125, "loss": 0.17294269800186157, "loss_ce": 0.024505192413926125, "loss_xval": 0.1484375, "num_input_tokens_seen": 168558720, "step": 2437 }, { "epoch": 152.375, "grad_norm": 19.262572996746773, "learning_rate": 5e-05, "loss": 0.0831, "num_input_tokens_seen": 168630336, "step": 2438 }, { "epoch": 152.375, "loss": 0.08717883378267288, "loss_ce": 0.021260865032672882, "loss_xval": 0.06591796875, "num_input_tokens_seen": 168630336, "step": 2438 }, { "epoch": 152.4375, "grad_norm": 9.476712746612705, "learning_rate": 5e-05, "loss": 0.0225, "num_input_tokens_seen": 168702080, "step": 2439 }, { "epoch": 152.4375, "loss": 0.02292168326675892, "loss_ce": 0.0061980499885976315, "loss_xval": 0.0167236328125, "num_input_tokens_seen": 168702080, "step": 2439 }, { "epoch": 152.5, "grad_norm": 27.06563178685452, "learning_rate": 5e-05, "loss": 0.1195, "num_input_tokens_seen": 168773760, "step": 2440 }, { "epoch": 152.5, "loss": 0.11924093216657639, "loss_ce": 0.004006555303931236, "loss_xval": 0.115234375, "num_input_tokens_seen": 168773760, "step": 2440 }, { "epoch": 152.5625, "grad_norm": 16.280688867399473, "learning_rate": 5e-05, "loss": 0.0452, "num_input_tokens_seen": 168845376, "step": 2441 }, { "epoch": 152.5625, "loss": 0.04708874225616455, "loss_ce": 0.00265514780767262, "loss_xval": 0.04443359375, "num_input_tokens_seen": 168845376, "step": 2441 }, { "epoch": 152.625, "grad_norm": 9.938469294554265, "learning_rate": 5e-05, "loss": 0.0191, "num_input_tokens_seen": 168904512, "step": 2442 }, { "epoch": 152.625, "loss": 0.018490314483642578, "loss_ce": 0.003231526119634509, "loss_xval": 0.0152587890625, "num_input_tokens_seen": 168904512, "step": 2442 }, { "epoch": 152.6875, "grad_norm": 23.010114349229884, "learning_rate": 5e-05, "loss": 0.0849, "num_input_tokens_seen": 168963520, "step": 2443 }, { "epoch": 152.6875, "loss": 0.08742774277925491, "loss_ce": 0.0029550848994404078, "loss_xval": 0.08447265625, "num_input_tokens_seen": 168963520, "step": 2443 }, { "epoch": 152.75, "grad_norm": 9.040468310430352, "learning_rate": 5e-05, "loss": 0.0153, "num_input_tokens_seen": 169035072, "step": 2444 }, { "epoch": 152.75, "loss": 0.016200613230466843, "loss_ce": 0.0022235626820474863, "loss_xval": 0.01397705078125, "num_input_tokens_seen": 169035072, "step": 2444 }, { "epoch": 152.8125, "grad_norm": 15.46627308177994, "learning_rate": 5e-05, "loss": 0.0391, "num_input_tokens_seen": 169106624, "step": 2445 }, { "epoch": 152.8125, "loss": 0.037854935973882675, "loss_ce": 0.001233842340297997, "loss_xval": 0.03662109375, "num_input_tokens_seen": 169106624, "step": 2445 }, { "epoch": 152.875, "grad_norm": 17.74470932258292, "learning_rate": 5e-05, "loss": 0.0512, "num_input_tokens_seen": 169178304, "step": 2446 }, { "epoch": 152.875, "loss": 0.049263544380664825, "loss_ce": 0.00043541795457713306, "loss_xval": 0.048828125, "num_input_tokens_seen": 169178304, "step": 2446 }, { "epoch": 152.9375, "grad_norm": 3.983319638379511, "learning_rate": 5e-05, "loss": 0.0033, "num_input_tokens_seen": 169250048, "step": 2447 }, { "epoch": 152.9375, "loss": 0.00340116024017334, "loss_ce": 0.0002883673587348312, "loss_xval": 0.00311279296875, "num_input_tokens_seen": 169250048, "step": 2447 }, { "epoch": 153.0, "grad_norm": 17.052741587322256, "learning_rate": 5e-05, "loss": 0.0486, "num_input_tokens_seen": 169296640, "step": 2448 }, { "epoch": 153.0, "loss": 0.04775082692503929, "loss_ce": 0.00014340641791932285, "loss_xval": 0.047607421875, "num_input_tokens_seen": 169296640, "step": 2448 }, { "epoch": 153.0625, "grad_norm": 1.6488709856004677, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 169368192, "step": 2449 }, { "epoch": 153.0625, "loss": 0.0007192580960690975, "loss_ce": 7.457426545443013e-05, "loss_xval": 0.000644683837890625, "num_input_tokens_seen": 169368192, "step": 2449 }, { "epoch": 153.125, "grad_norm": 14.82132478454114, "learning_rate": 5e-05, "loss": 0.038, "num_input_tokens_seen": 169439744, "step": 2450 }, { "epoch": 153.125, "loss": 0.0388912670314312, "loss_ce": 7.290825305972248e-05, "loss_xval": 0.038818359375, "num_input_tokens_seen": 169439744, "step": 2450 }, { "epoch": 153.1875, "grad_norm": 4.269698392907984, "learning_rate": 5e-05, "loss": 0.0039, "num_input_tokens_seen": 169511424, "step": 2451 }, { "epoch": 153.1875, "loss": 0.0037585701793432236, "loss_ce": 3.5425644455244765e-05, "loss_xval": 0.00372314453125, "num_input_tokens_seen": 169511424, "step": 2451 }, { "epoch": 153.25, "grad_norm": 11.674239325590117, "learning_rate": 5e-05, "loss": 0.0252, "num_input_tokens_seen": 169583104, "step": 2452 }, { "epoch": 153.25, "loss": 0.02480572834610939, "loss_ce": 2.5454932256252505e-05, "loss_xval": 0.0247802734375, "num_input_tokens_seen": 169583104, "step": 2452 }, { "epoch": 153.3125, "grad_norm": 3.212616261973195, "learning_rate": 5e-05, "loss": 0.0022, "num_input_tokens_seen": 169654784, "step": 2453 }, { "epoch": 153.3125, "loss": 0.0025852012913674116, "loss_ce": 2.172468521166593e-05, "loss_xval": 0.0025634765625, "num_input_tokens_seen": 169654784, "step": 2453 }, { "epoch": 153.375, "grad_norm": 10.599579969728131, "learning_rate": 5e-05, "loss": 0.0206, "num_input_tokens_seen": 169726464, "step": 2454 }, { "epoch": 153.375, "loss": 0.020038805902004242, "loss_ce": 1.9274504666100256e-05, "loss_xval": 0.02001953125, "num_input_tokens_seen": 169726464, "step": 2454 }, { "epoch": 153.4375, "grad_norm": 2.329570429065563, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 169798144, "step": 2455 }, { "epoch": 153.4375, "loss": 0.0016645665746182203, "loss_ce": 1.6617326764389873e-05, "loss_xval": 0.00164794921875, "num_input_tokens_seen": 169798144, "step": 2455 }, { "epoch": 153.5, "grad_norm": 9.745183346329203, "learning_rate": 5e-05, "loss": 0.0184, "num_input_tokens_seen": 169869760, "step": 2456 }, { "epoch": 153.5, "loss": 0.017713962122797966, "loss_ce": 1.3767366908723488e-05, "loss_xval": 0.0177001953125, "num_input_tokens_seen": 169869760, "step": 2456 }, { "epoch": 153.5625, "grad_norm": 1.2033565802466124, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 169941568, "step": 2457 }, { "epoch": 153.5625, "loss": 0.0006536798318848014, "loss_ce": 1.2810680345864967e-05, "loss_xval": 0.000640869140625, "num_input_tokens_seen": 169941568, "step": 2457 }, { "epoch": 153.625, "grad_norm": 8.976723632694345, "learning_rate": 5e-05, "loss": 0.0157, "num_input_tokens_seen": 170013248, "step": 2458 }, { "epoch": 153.625, "loss": 0.016246860846877098, "loss_ce": 1.150855405285256e-05, "loss_xval": 0.0162353515625, "num_input_tokens_seen": 170013248, "step": 2458 }, { "epoch": 153.6875, "grad_norm": 0.5399703579578968, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 170085056, "step": 2459 }, { "epoch": 153.6875, "loss": 0.00033668929245322943, "loss_ce": 1.244001578015741e-05, "loss_xval": 0.000324249267578125, "num_input_tokens_seen": 170085056, "step": 2459 }, { "epoch": 153.75, "grad_norm": 7.471261265286304, "learning_rate": 5e-05, "loss": 0.0113, "num_input_tokens_seen": 170156736, "step": 2460 }, { "epoch": 153.75, "loss": 0.011607798747718334, "loss_ce": 1.1119185728603043e-05, "loss_xval": 0.0115966796875, "num_input_tokens_seen": 170156736, "step": 2460 }, { "epoch": 153.8125, "grad_norm": 1.004765225632213, "learning_rate": 5e-05, "loss": 0.0005, "num_input_tokens_seen": 170215872, "step": 2461 }, { "epoch": 153.8125, "loss": 0.0003412454097997397, "loss_ce": 1.1274102689640131e-05, "loss_xval": 0.0003299713134765625, "num_input_tokens_seen": 170215872, "step": 2461 }, { "epoch": 153.875, "grad_norm": 6.8509049121426315, "learning_rate": 5e-05, "loss": 0.0095, "num_input_tokens_seen": 170287488, "step": 2462 }, { "epoch": 153.875, "loss": 0.009349527768790722, "loss_ce": 1.1149150850542355e-05, "loss_xval": 0.00933837890625, "num_input_tokens_seen": 170287488, "step": 2462 }, { "epoch": 153.9375, "grad_norm": 1.4588749386358972, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 170359168, "step": 2463 }, { "epoch": 153.9375, "loss": 0.0006519771413877606, "loss_ce": 1.1107976206403691e-05, "loss_xval": 0.000640869140625, "num_input_tokens_seen": 170359168, "step": 2463 }, { "epoch": 154.0, "grad_norm": 5.5031631339173215, "learning_rate": 5e-05, "loss": 0.0063, "num_input_tokens_seen": 170430848, "step": 2464 }, { "epoch": 154.0, "loss": 0.006387988105416298, "loss_ce": 9.814341865421738e-06, "loss_xval": 0.006378173828125, "num_input_tokens_seen": 170430848, "step": 2464 }, { "epoch": 154.0625, "grad_norm": 2.0060406491376064, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 170502400, "step": 2465 }, { "epoch": 154.0625, "loss": 0.0010102284140884876, "loss_ce": 1.077772230928531e-05, "loss_xval": 0.00099945068359375, "num_input_tokens_seen": 170502400, "step": 2465 }, { "epoch": 154.125, "grad_norm": 4.1404109225822765, "learning_rate": 5e-05, "loss": 0.0037, "num_input_tokens_seen": 170573952, "step": 2466 }, { "epoch": 154.125, "loss": 0.003657120745629072, "loss_ce": 1.0270115126331802e-05, "loss_xval": 0.0036468505859375, "num_input_tokens_seen": 170573952, "step": 2466 }, { "epoch": 154.1875, "grad_norm": 2.731704866940803, "learning_rate": 5e-05, "loss": 0.0017, "num_input_tokens_seen": 170645632, "step": 2467 }, { "epoch": 154.1875, "loss": 0.0018024840392172337, "loss_ce": 9.576368938724045e-06, "loss_xval": 0.00179290771484375, "num_input_tokens_seen": 170645632, "step": 2467 }, { "epoch": 154.25, "grad_norm": 3.563697178179098, "learning_rate": 5e-05, "loss": 0.0027, "num_input_tokens_seen": 170717312, "step": 2468 }, { "epoch": 154.25, "loss": 0.002802206901833415, "loss_ce": 9.848467925621662e-06, "loss_xval": 0.0027923583984375, "num_input_tokens_seen": 170717312, "step": 2468 }, { "epoch": 154.3125, "grad_norm": 2.8417179115709756, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 170788864, "step": 2469 }, { "epoch": 154.3125, "loss": 0.001655792584642768, "loss_ce": 7.84333678893745e-06, "loss_xval": 0.00164794921875, "num_input_tokens_seen": 170788864, "step": 2469 }, { "epoch": 154.375, "grad_norm": 2.631168310040131, "learning_rate": 5e-05, "loss": 0.0016, "num_input_tokens_seen": 170860608, "step": 2470 }, { "epoch": 154.375, "loss": 0.0017291078111156821, "loss_ce": 1.2494046131905634e-05, "loss_xval": 0.00171661376953125, "num_input_tokens_seen": 170860608, "step": 2470 }, { "epoch": 154.4375, "grad_norm": 3.2617330357890206, "learning_rate": 5e-05, "loss": 0.0023, "num_input_tokens_seen": 170932160, "step": 2471 }, { "epoch": 154.4375, "loss": 0.002359155099838972, "loss_ce": 9.301562386099249e-06, "loss_xval": 0.002349853515625, "num_input_tokens_seen": 170932160, "step": 2471 }, { "epoch": 154.5, "grad_norm": 2.4545855076784964, "learning_rate": 5e-05, "loss": 0.0015, "num_input_tokens_seen": 171003840, "step": 2472 }, { "epoch": 154.5, "loss": 0.0010087847476825118, "loss_ce": 9.334085007139947e-06, "loss_xval": 0.00099945068359375, "num_input_tokens_seen": 171003840, "step": 2472 }, { "epoch": 154.5625, "grad_norm": 3.054959949582806, "learning_rate": 5e-05, "loss": 0.0022, "num_input_tokens_seen": 171075456, "step": 2473 }, { "epoch": 154.5625, "loss": 0.002541671507060528, "loss_ce": 8.712435374036431e-06, "loss_xval": 0.002532958984375, "num_input_tokens_seen": 171075456, "step": 2473 }, { "epoch": 154.625, "grad_norm": 2.1776287269885204, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 171147072, "step": 2474 }, { "epoch": 154.625, "loss": 0.001070039696060121, "loss_ce": 9.55386713030748e-06, "loss_xval": 0.00106048583984375, "num_input_tokens_seen": 171147072, "step": 2474 }, { "epoch": 154.6875, "grad_norm": 2.901123518547072, "learning_rate": 5e-05, "loss": 0.0019, "num_input_tokens_seen": 171218624, "step": 2475 }, { "epoch": 154.6875, "loss": 0.001939892885275185, "loss_ce": 9.656041584094055e-06, "loss_xval": 0.00193023681640625, "num_input_tokens_seen": 171218624, "step": 2475 }, { "epoch": 154.75, "grad_norm": 1.8316559975125357, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 171290368, "step": 2476 }, { "epoch": 154.75, "loss": 0.0008991528302431107, "loss_ce": 1.0328377356927376e-05, "loss_xval": 0.000888824462890625, "num_input_tokens_seen": 171290368, "step": 2476 }, { "epoch": 154.8125, "grad_norm": 2.8623043914660786, "learning_rate": 5e-05, "loss": 0.0018, "num_input_tokens_seen": 171361984, "step": 2477 }, { "epoch": 154.8125, "loss": 0.0019163988763466477, "loss_ce": 9.050200787896756e-06, "loss_xval": 0.0019073486328125, "num_input_tokens_seen": 171361984, "step": 2477 }, { "epoch": 154.875, "grad_norm": 1.79429224420876, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 171433536, "step": 2478 }, { "epoch": 154.875, "loss": 0.0011088310275226831, "loss_ce": 1.019821866066195e-05, "loss_xval": 0.0010986328125, "num_input_tokens_seen": 171433536, "step": 2478 }, { "epoch": 154.9375, "grad_norm": 2.4579784433113057, "learning_rate": 5e-05, "loss": 0.0014, "num_input_tokens_seen": 171505280, "step": 2479 }, { "epoch": 154.9375, "loss": 0.0012454879470169544, "loss_ce": 9.526052963337861e-06, "loss_xval": 0.0012359619140625, "num_input_tokens_seen": 171505280, "step": 2479 }, { "epoch": 155.0, "grad_norm": 1.7145579880094552, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 171577024, "step": 2480 }, { "epoch": 155.0, "loss": 0.0006987936212681234, "loss_ce": 8.33344256534474e-06, "loss_xval": 0.000690460205078125, "num_input_tokens_seen": 171577024, "step": 2480 }, { "epoch": 155.0625, "grad_norm": 2.2600772332493424, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 171636160, "step": 2481 }, { "epoch": 155.0625, "loss": 0.0012454873649403453, "loss_ce": 9.525448149361182e-06, "loss_xval": 0.0012359619140625, "num_input_tokens_seen": 171636160, "step": 2481 }, { "epoch": 155.125, "grad_norm": 1.8448501855644037, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 171695360, "step": 2482 }, { "epoch": 155.125, "loss": 0.0007585145649500191, "loss_ce": 7.0192181738093495e-06, "loss_xval": 0.000751495361328125, "num_input_tokens_seen": 171695360, "step": 2482 }, { "epoch": 155.1875, "grad_norm": 1.8702917832781996, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 171767040, "step": 2483 }, { "epoch": 155.1875, "loss": 0.0008324088994413614, "loss_ce": 8.434298251813743e-06, "loss_xval": 0.000823974609375, "num_input_tokens_seen": 171767040, "step": 2483 }, { "epoch": 155.25, "grad_norm": 1.6708108711761513, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 171838656, "step": 2484 }, { "epoch": 155.25, "loss": 0.0006913580000400543, "loss_ce": 8.52719404065283e-06, "loss_xval": 0.000682830810546875, "num_input_tokens_seen": 171838656, "step": 2484 }, { "epoch": 155.3125, "grad_norm": 1.8771915125255452, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 171910272, "step": 2485 }, { "epoch": 155.3125, "loss": 0.0007790317176841199, "loss_ce": 8.462846381007694e-06, "loss_xval": 0.00077056884765625, "num_input_tokens_seen": 171910272, "step": 2485 }, { "epoch": 155.375, "grad_norm": 1.7608488549611228, "learning_rate": 5e-05, "loss": 0.0008, "num_input_tokens_seen": 171981952, "step": 2486 }, { "epoch": 155.375, "loss": 0.0008362592780031264, "loss_ce": 8.469964086543769e-06, "loss_xval": 0.000827789306640625, "num_input_tokens_seen": 171981952, "step": 2486 }, { "epoch": 155.4375, "grad_norm": 1.9982541590104177, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 172053568, "step": 2487 }, { "epoch": 155.4375, "loss": 0.0011517609236761928, "loss_ce": 7.351757176365936e-06, "loss_xval": 0.0011444091796875, "num_input_tokens_seen": 172053568, "step": 2487 }, { "epoch": 155.5, "grad_norm": 2.2192158162225946, "learning_rate": 5e-05, "loss": 0.0012, "num_input_tokens_seen": 172125312, "step": 2488 }, { "epoch": 155.5, "loss": 0.0011686511570587754, "loss_ce": 8.983131920103915e-06, "loss_xval": 0.00115966796875, "num_input_tokens_seen": 172125312, "step": 2488 }, { "epoch": 155.5625, "grad_norm": 1.4457901672409814, "learning_rate": 5e-05, "loss": 0.0006, "num_input_tokens_seen": 172184512, "step": 2489 }, { "epoch": 155.5625, "loss": 0.0005124812014400959, "loss_ce": 8.941165106080007e-06, "loss_xval": 0.0005035400390625, "num_input_tokens_seen": 172184512, "step": 2489 }, { "epoch": 155.625, "grad_norm": 2.1011507510571357, "learning_rate": 5e-05, "loss": 0.0011, "num_input_tokens_seen": 172243648, "step": 2490 }, { "epoch": 155.625, "loss": 0.0011147982440888882, "loss_ce": 8.536085260857362e-06, "loss_xval": 0.00110626220703125, "num_input_tokens_seen": 172243648, "step": 2490 }, { "epoch": 155.6875, "grad_norm": 1.056409217218583, "learning_rate": 5e-05, "loss": 0.0004, "num_input_tokens_seen": 172315264, "step": 2491 }, { "epoch": 155.6875, "loss": 0.0003501183819025755, "loss_ce": 8.702984814590309e-06, "loss_xval": 0.0003414154052734375, "num_input_tokens_seen": 172315264, "step": 2491 }, { "epoch": 155.75, "grad_norm": 1.9594186214660128, "learning_rate": 5e-05, "loss": 0.001, "num_input_tokens_seen": 172374400, "step": 2492 }, { "epoch": 155.75, "loss": 0.0011073722271248698, "loss_ce": 8.739401891943999e-06, "loss_xval": 0.0010986328125, "num_input_tokens_seen": 172374400, "step": 2492 }, { "epoch": 155.8125, "grad_norm": 0.6302802377590286, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 172446016, "step": 2493 }, { "epoch": 155.8125, "loss": 0.0002539145352784544, "loss_ce": 7.866553460189607e-06, "loss_xval": 0.0002460479736328125, "num_input_tokens_seen": 172446016, "step": 2493 }, { "epoch": 155.875, "grad_norm": 1.6048452374088327, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 172517632, "step": 2494 }, { "epoch": 155.875, "loss": 0.000572717166505754, "loss_ce": 8.141959369822871e-06, "loss_xval": 0.0005645751953125, "num_input_tokens_seen": 172517632, "step": 2494 }, { "epoch": 155.9375, "grad_norm": 0.49816692649252725, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 172589248, "step": 2495 }, { "epoch": 155.9375, "loss": 0.00022121713845990598, "loss_ce": 8.54776499181753e-06, "loss_xval": 0.00021266937255859375, "num_input_tokens_seen": 172589248, "step": 2495 }, { "epoch": 156.0, "grad_norm": 1.689080638561562, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 172660800, "step": 2496 }, { "epoch": 156.0, "loss": 0.0008247463265433908, "loss_ce": 8.401118066103663e-06, "loss_xval": 0.00081634521484375, "num_input_tokens_seen": 172660800, "step": 2496 }, { "epoch": 156.0625, "grad_norm": 0.23182185348303827, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 172719872, "step": 2497 }, { "epoch": 156.0625, "loss": 0.0002188103972002864, "loss_ce": 8.048378731473349e-06, "loss_xval": 0.00021076202392578125, "num_input_tokens_seen": 172719872, "step": 2497 }, { "epoch": 156.125, "grad_norm": 1.880333994977742, "learning_rate": 5e-05, "loss": 0.0009, "num_input_tokens_seen": 172778880, "step": 2498 }, { "epoch": 156.125, "loss": 0.0008566674077883363, "loss_ce": 9.804623005038593e-06, "loss_xval": 0.00084686279296875, "num_input_tokens_seen": 172778880, "step": 2498 }, { "epoch": 156.1875, "grad_norm": 0.2540775595755331, "learning_rate": 5e-05, "loss": 0.0002, "num_input_tokens_seen": 172850496, "step": 2499 }, { "epoch": 156.1875, "loss": 0.0002453178749419749, "loss_ce": 9.760318789631128e-06, "loss_xval": 0.00023555755615234375, "num_input_tokens_seen": 172850496, "step": 2499 }, { "epoch": 156.25, "grad_norm": 1.6266601835135746, "learning_rate": 5e-05, "loss": 0.0007, "num_input_tokens_seen": 172922048, "step": 2500 }, { "epoch": 156.25, "eval_synth_IoU": 0.28413987159729004, "eval_synth_MAE_x": 0.01047515869140625, "eval_synth_MAE_y": 0.0106201171875, "eval_synth_NUM_probability": 0.9999175518751144, "eval_synth_inside_bbox": 0.6875, "eval_synth_loss": 0.00015532341785728931, "eval_synth_loss_ce": 8.934383231462562e-06, "eval_synth_loss_xval": 0.00014638900756835938, "eval_synth_runtime": 60.3629, "eval_synth_samples_per_second": 2.121, "eval_synth_steps_per_second": 0.066, "num_input_tokens_seen": 172922048, "step": 2500 } ], "logging_steps": 1.0, "max_steps": 3000, "num_input_tokens_seen": 172922048, "num_train_epochs": 188, "save_steps": 250, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1414729900425216.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }