{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0012150668286757, "eval_steps": 103, "global_step": 206, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.3593529462814331, "learning_rate": 2e-05, "loss": 0.4337, "step": 1 }, { "epoch": 0.0, "eval_loss": 0.3783022463321686, "eval_runtime": 12.9552, "eval_samples_per_second": 1.93, "eval_steps_per_second": 1.93, "step": 1 }, { "epoch": 0.01, "grad_norm": 0.3430859446525574, "learning_rate": 4e-05, "loss": 0.4605, "step": 2 }, { "epoch": 0.01, "grad_norm": 0.3275960683822632, "learning_rate": 6e-05, "loss": 0.39, "step": 3 }, { "epoch": 0.02, "grad_norm": 0.3221317529678345, "learning_rate": 8e-05, "loss": 0.3372, "step": 4 }, { "epoch": 0.02, "grad_norm": 0.3926704227924347, "learning_rate": 0.0001, "loss": 0.3333, "step": 5 }, { "epoch": 0.03, "grad_norm": 0.2960835099220276, "learning_rate": 0.00012, "loss": 0.3671, "step": 6 }, { "epoch": 0.03, "grad_norm": 0.3393571078777313, "learning_rate": 0.00014, "loss": 0.327, "step": 7 }, { "epoch": 0.04, "grad_norm": 0.2799758017063141, "learning_rate": 0.00016, "loss": 0.2933, "step": 8 }, { "epoch": 0.04, "grad_norm": 0.3084808886051178, "learning_rate": 0.00018, "loss": 0.3505, "step": 9 }, { "epoch": 0.05, "grad_norm": 0.23642300069332123, "learning_rate": 0.0002, "loss": 0.3289, "step": 10 }, { "epoch": 0.05, "grad_norm": 0.369229793548584, "learning_rate": 0.00019999691576447898, "loss": 0.3049, "step": 11 }, { "epoch": 0.06, "grad_norm": 0.2706857919692993, "learning_rate": 0.00019998766324816607, "loss": 0.3425, "step": 12 }, { "epoch": 0.06, "grad_norm": 0.21327799558639526, "learning_rate": 0.00019997224302180006, "loss": 0.2686, "step": 13 }, { "epoch": 0.07, "grad_norm": 0.26732948422431946, "learning_rate": 0.00019995065603657316, "loss": 0.2987, "step": 14 }, { "epoch": 0.07, "grad_norm": 0.2009548544883728, "learning_rate": 0.0001999229036240723, "loss": 0.2668, "step": 15 }, { "epoch": 0.08, "grad_norm": 0.23616977035999298, "learning_rate": 0.00019988898749619702, "loss": 0.2962, "step": 16 }, { "epoch": 0.08, "grad_norm": 0.18399174511432648, "learning_rate": 0.00019984890974505381, "loss": 0.2238, "step": 17 }, { "epoch": 0.09, "grad_norm": 0.24744661152362823, "learning_rate": 0.00019980267284282717, "loss": 0.2628, "step": 18 }, { "epoch": 0.09, "grad_norm": 0.22109034657478333, "learning_rate": 0.00019975027964162702, "loss": 0.3143, "step": 19 }, { "epoch": 0.1, "grad_norm": 0.21471348404884338, "learning_rate": 0.0001996917333733128, "loss": 0.3436, "step": 20 }, { "epoch": 0.1, "grad_norm": 0.2112409919500351, "learning_rate": 0.00019962703764929413, "loss": 0.2714, "step": 21 }, { "epoch": 0.11, "grad_norm": 0.19044426083564758, "learning_rate": 0.00019955619646030802, "loss": 0.2335, "step": 22 }, { "epoch": 0.11, "grad_norm": 0.21945005655288696, "learning_rate": 0.00019947921417617267, "loss": 0.2381, "step": 23 }, { "epoch": 0.12, "grad_norm": 0.1864914447069168, "learning_rate": 0.000199396095545518, "loss": 0.287, "step": 24 }, { "epoch": 0.12, "grad_norm": 0.2413000762462616, "learning_rate": 0.00019930684569549264, "loss": 0.1813, "step": 25 }, { "epoch": 0.13, "grad_norm": 0.20038250088691711, "learning_rate": 0.0001992114701314478, "loss": 0.2722, "step": 26 }, { "epoch": 0.13, "grad_norm": 0.16239097714424133, "learning_rate": 0.0001991099747365975, "loss": 0.1917, "step": 27 }, { "epoch": 0.14, "grad_norm": 0.19039247930049896, "learning_rate": 0.00019900236577165576, "loss": 0.2665, "step": 28 }, { "epoch": 0.14, "grad_norm": 0.17717348039150238, "learning_rate": 0.0001988886498744505, "loss": 0.2667, "step": 29 }, { "epoch": 0.15, "grad_norm": 0.18755286931991577, "learning_rate": 0.00019876883405951377, "loss": 0.1775, "step": 30 }, { "epoch": 0.15, "grad_norm": 0.1621539294719696, "learning_rate": 0.00019864292571764955, "loss": 0.2292, "step": 31 }, { "epoch": 0.16, "grad_norm": 0.1834522783756256, "learning_rate": 0.0001985109326154774, "loss": 0.2327, "step": 32 }, { "epoch": 0.16, "grad_norm": 0.18009088933467865, "learning_rate": 0.00019837286289495361, "loss": 0.2325, "step": 33 }, { "epoch": 0.17, "grad_norm": 0.16372188925743103, "learning_rate": 0.0001982287250728689, "loss": 0.2748, "step": 34 }, { "epoch": 0.17, "grad_norm": 0.16966991126537323, "learning_rate": 0.00019807852804032305, "loss": 0.2353, "step": 35 }, { "epoch": 0.17, "grad_norm": 0.18791744112968445, "learning_rate": 0.00019792228106217658, "loss": 0.2693, "step": 36 }, { "epoch": 0.18, "grad_norm": 0.16828736662864685, "learning_rate": 0.0001977599937764791, "loss": 0.1752, "step": 37 }, { "epoch": 0.18, "grad_norm": 0.1636415272951126, "learning_rate": 0.00019759167619387476, "loss": 0.2961, "step": 38 }, { "epoch": 0.19, "grad_norm": 0.152165487408638, "learning_rate": 0.00019741733869698495, "loss": 0.1954, "step": 39 }, { "epoch": 0.19, "grad_norm": 0.1838696449995041, "learning_rate": 0.00019723699203976766, "loss": 0.2039, "step": 40 }, { "epoch": 0.2, "grad_norm": 0.170082688331604, "learning_rate": 0.00019705064734685425, "loss": 0.2228, "step": 41 }, { "epoch": 0.2, "grad_norm": 0.1489935666322708, "learning_rate": 0.0001968583161128631, "loss": 0.185, "step": 42 }, { "epoch": 0.21, "grad_norm": 0.1501648873090744, "learning_rate": 0.00019666001020169073, "loss": 0.2198, "step": 43 }, { "epoch": 0.21, "grad_norm": 0.24484610557556152, "learning_rate": 0.00019645574184577982, "loss": 0.2514, "step": 44 }, { "epoch": 0.22, "grad_norm": 0.16400669515132904, "learning_rate": 0.00019624552364536473, "loss": 0.283, "step": 45 }, { "epoch": 0.22, "grad_norm": 0.15034696459770203, "learning_rate": 0.0001960293685676943, "loss": 0.1637, "step": 46 }, { "epoch": 0.23, "grad_norm": 0.15385685861110687, "learning_rate": 0.00019580728994623195, "loss": 0.2905, "step": 47 }, { "epoch": 0.23, "grad_norm": 0.16004744172096252, "learning_rate": 0.00019557930147983302, "loss": 0.2173, "step": 48 }, { "epoch": 0.24, "grad_norm": 0.17182469367980957, "learning_rate": 0.0001953454172319001, "loss": 0.2147, "step": 49 }, { "epoch": 0.24, "grad_norm": 0.1576426774263382, "learning_rate": 0.00019510565162951537, "loss": 0.284, "step": 50 }, { "epoch": 0.25, "grad_norm": 0.14034013450145721, "learning_rate": 0.00019486001946255046, "loss": 0.1671, "step": 51 }, { "epoch": 0.25, "grad_norm": 0.1770170032978058, "learning_rate": 0.00019460853588275454, "loss": 0.205, "step": 52 }, { "epoch": 0.26, "grad_norm": 0.14077183604240417, "learning_rate": 0.00019435121640281938, "loss": 0.209, "step": 53 }, { "epoch": 0.26, "grad_norm": 0.2193373441696167, "learning_rate": 0.00019408807689542257, "loss": 0.2125, "step": 54 }, { "epoch": 0.27, "grad_norm": 0.1439114212989807, "learning_rate": 0.00019381913359224842, "loss": 0.1611, "step": 55 }, { "epoch": 0.27, "grad_norm": 0.16939564049243927, "learning_rate": 0.00019354440308298675, "loss": 0.2189, "step": 56 }, { "epoch": 0.28, "grad_norm": 0.17016972601413727, "learning_rate": 0.00019326390231430942, "loss": 0.2129, "step": 57 }, { "epoch": 0.28, "grad_norm": 0.16602838039398193, "learning_rate": 0.00019297764858882514, "loss": 0.23, "step": 58 }, { "epoch": 0.29, "grad_norm": 0.21853233873844147, "learning_rate": 0.00019268565956401208, "loss": 0.1879, "step": 59 }, { "epoch": 0.29, "grad_norm": 0.18649564683437347, "learning_rate": 0.0001923879532511287, "loss": 0.2063, "step": 60 }, { "epoch": 0.3, "grad_norm": 0.16304822266101837, "learning_rate": 0.00019208454801410266, "loss": 0.2182, "step": 61 }, { "epoch": 0.3, "grad_norm": 0.1357814073562622, "learning_rate": 0.00019177546256839812, "loss": 0.1912, "step": 62 }, { "epoch": 0.31, "grad_norm": 0.1645856499671936, "learning_rate": 0.00019146071597986138, "loss": 0.2498, "step": 63 }, { "epoch": 0.31, "grad_norm": 0.20048978924751282, "learning_rate": 0.00019114032766354453, "loss": 0.3207, "step": 64 }, { "epoch": 0.32, "grad_norm": 0.15142077207565308, "learning_rate": 0.00019081431738250814, "loss": 0.2102, "step": 65 }, { "epoch": 0.32, "grad_norm": 0.15482479333877563, "learning_rate": 0.00019048270524660196, "loss": 0.2706, "step": 66 }, { "epoch": 0.33, "grad_norm": 0.15685780346393585, "learning_rate": 0.00019014551171122457, "loss": 0.1898, "step": 67 }, { "epoch": 0.33, "grad_norm": 0.18191099166870117, "learning_rate": 0.00018980275757606157, "loss": 0.2838, "step": 68 }, { "epoch": 0.34, "grad_norm": 0.1533990502357483, "learning_rate": 0.0001894544639838025, "loss": 0.1954, "step": 69 }, { "epoch": 0.34, "grad_norm": 0.14591443538665771, "learning_rate": 0.0001891006524188368, "loss": 0.248, "step": 70 }, { "epoch": 0.35, "grad_norm": 0.19209636747837067, "learning_rate": 0.00018874134470592835, "loss": 0.2677, "step": 71 }, { "epoch": 0.35, "grad_norm": 0.14589589834213257, "learning_rate": 0.00018837656300886937, "loss": 0.2031, "step": 72 }, { "epoch": 0.35, "grad_norm": 0.17039361596107483, "learning_rate": 0.00018800632982911322, "loss": 0.2679, "step": 73 }, { "epoch": 0.36, "grad_norm": 0.1550627052783966, "learning_rate": 0.00018763066800438636, "loss": 0.2082, "step": 74 }, { "epoch": 0.36, "grad_norm": 0.15761853754520416, "learning_rate": 0.00018724960070727972, "loss": 0.2542, "step": 75 }, { "epoch": 0.37, "grad_norm": 0.1586439311504364, "learning_rate": 0.00018686315144381913, "loss": 0.1673, "step": 76 }, { "epoch": 0.37, "grad_norm": 0.16859450936317444, "learning_rate": 0.0001864713440520155, "loss": 0.2679, "step": 77 }, { "epoch": 0.38, "grad_norm": 0.15938322246074677, "learning_rate": 0.0001860742027003944, "loss": 0.2812, "step": 78 }, { "epoch": 0.38, "grad_norm": 0.15844044089317322, "learning_rate": 0.00018567175188650498, "loss": 0.2443, "step": 79 }, { "epoch": 0.39, "grad_norm": 0.1673026829957962, "learning_rate": 0.00018526401643540922, "loss": 0.1635, "step": 80 }, { "epoch": 0.39, "grad_norm": 0.18364693224430084, "learning_rate": 0.00018485102149815038, "loss": 0.2023, "step": 81 }, { "epoch": 0.4, "grad_norm": 0.17101280391216278, "learning_rate": 0.00018443279255020152, "loss": 0.1827, "step": 82 }, { "epoch": 0.4, "grad_norm": 0.15244990587234497, "learning_rate": 0.0001840093553898942, "loss": 0.2362, "step": 83 }, { "epoch": 0.41, "grad_norm": 0.16682085394859314, "learning_rate": 0.00018358073613682706, "loss": 0.2006, "step": 84 }, { "epoch": 0.41, "grad_norm": 0.18180780112743378, "learning_rate": 0.00018314696123025454, "loss": 0.2275, "step": 85 }, { "epoch": 0.42, "grad_norm": 0.1552531123161316, "learning_rate": 0.00018270805742745617, "loss": 0.2165, "step": 86 }, { "epoch": 0.42, "grad_norm": 0.1649930328130722, "learning_rate": 0.000182264051802086, "loss": 0.2714, "step": 87 }, { "epoch": 0.43, "grad_norm": 0.16403907537460327, "learning_rate": 0.00018181497174250236, "loss": 0.1963, "step": 88 }, { "epoch": 0.43, "grad_norm": 0.1535252332687378, "learning_rate": 0.00018136084495007872, "loss": 0.1944, "step": 89 }, { "epoch": 0.44, "grad_norm": 0.17279598116874695, "learning_rate": 0.00018090169943749476, "loss": 0.2072, "step": 90 }, { "epoch": 0.44, "grad_norm": 0.16793234646320343, "learning_rate": 0.00018043756352700846, "loss": 0.1753, "step": 91 }, { "epoch": 0.45, "grad_norm": 0.2446856051683426, "learning_rate": 0.00017996846584870908, "loss": 0.1976, "step": 92 }, { "epoch": 0.45, "grad_norm": 0.20265690982341766, "learning_rate": 0.000179494435338751, "loss": 0.2656, "step": 93 }, { "epoch": 0.46, "grad_norm": 0.1686105579137802, "learning_rate": 0.00017901550123756906, "loss": 0.2032, "step": 94 }, { "epoch": 0.46, "grad_norm": 0.16633199155330658, "learning_rate": 0.00017853169308807448, "loss": 0.1939, "step": 95 }, { "epoch": 0.47, "grad_norm": 0.15133249759674072, "learning_rate": 0.000178043040733833, "loss": 0.214, "step": 96 }, { "epoch": 0.47, "grad_norm": 0.1566823273897171, "learning_rate": 0.00017754957431722346, "loss": 0.1764, "step": 97 }, { "epoch": 0.48, "grad_norm": 0.15705688297748566, "learning_rate": 0.00017705132427757895, "loss": 0.1941, "step": 98 }, { "epoch": 0.48, "grad_norm": 0.1403038203716278, "learning_rate": 0.00017654832134930882, "loss": 0.171, "step": 99 }, { "epoch": 0.49, "grad_norm": 0.14009466767311096, "learning_rate": 0.0001760405965600031, "loss": 0.2162, "step": 100 }, { "epoch": 0.49, "grad_norm": 0.14049287140369415, "learning_rate": 0.00017552818122851838, "loss": 0.1668, "step": 101 }, { "epoch": 0.5, "grad_norm": 0.1339244395494461, "learning_rate": 0.00017501110696304596, "loss": 0.1511, "step": 102 }, { "epoch": 0.5, "grad_norm": 0.13987047970294952, "learning_rate": 0.00017448940565916222, "loss": 0.2537, "step": 103 }, { "epoch": 0.5, "eval_loss": 0.2344847470521927, "eval_runtime": 13.3513, "eval_samples_per_second": 1.872, "eval_steps_per_second": 1.872, "step": 103 }, { "epoch": 0.51, "grad_norm": 0.1495981365442276, "learning_rate": 0.000173963109497861, "loss": 0.2417, "step": 104 }, { "epoch": 0.51, "grad_norm": 0.129630908370018, "learning_rate": 0.00017343225094356855, "loss": 0.1913, "step": 105 }, { "epoch": 0.52, "grad_norm": 0.17110762000083923, "learning_rate": 0.00017289686274214118, "loss": 0.3276, "step": 106 }, { "epoch": 0.52, "grad_norm": 0.1344563066959381, "learning_rate": 0.00017235697791884494, "loss": 0.1805, "step": 107 }, { "epoch": 0.52, "grad_norm": 0.14707708358764648, "learning_rate": 0.00017181262977631888, "loss": 0.198, "step": 108 }, { "epoch": 0.53, "grad_norm": 0.14540620148181915, "learning_rate": 0.00017126385189252053, "loss": 0.1923, "step": 109 }, { "epoch": 0.53, "grad_norm": 0.14539222419261932, "learning_rate": 0.00017071067811865476, "loss": 0.1822, "step": 110 }, { "epoch": 0.54, "grad_norm": 0.16808092594146729, "learning_rate": 0.0001701531425770856, "loss": 0.2384, "step": 111 }, { "epoch": 0.54, "grad_norm": 0.1316782385110855, "learning_rate": 0.00016959127965923142, "loss": 0.1967, "step": 112 }, { "epoch": 0.55, "grad_norm": 0.15255320072174072, "learning_rate": 0.00016902512402344373, "loss": 0.245, "step": 113 }, { "epoch": 0.55, "grad_norm": 0.152465358376503, "learning_rate": 0.00016845471059286887, "loss": 0.1942, "step": 114 }, { "epoch": 0.56, "grad_norm": 0.15641257166862488, "learning_rate": 0.0001678800745532942, "loss": 0.2091, "step": 115 }, { "epoch": 0.56, "grad_norm": 0.16389591991901398, "learning_rate": 0.00016730125135097735, "loss": 0.2006, "step": 116 }, { "epoch": 0.57, "grad_norm": 0.1631624549627304, "learning_rate": 0.00016671827669045998, "loss": 0.2716, "step": 117 }, { "epoch": 0.57, "grad_norm": 0.16853250563144684, "learning_rate": 0.00016613118653236518, "loss": 0.2376, "step": 118 }, { "epoch": 0.58, "grad_norm": 0.13790781795978546, "learning_rate": 0.0001655400170911794, "loss": 0.1943, "step": 119 }, { "epoch": 0.58, "grad_norm": 0.2789172828197479, "learning_rate": 0.00016494480483301836, "loss": 0.3018, "step": 120 }, { "epoch": 0.59, "grad_norm": 0.14194965362548828, "learning_rate": 0.0001643455864733779, "loss": 0.2133, "step": 121 }, { "epoch": 0.59, "grad_norm": 0.13147993385791779, "learning_rate": 0.000163742398974869, "loss": 0.2281, "step": 122 }, { "epoch": 0.6, "grad_norm": 0.16474007070064545, "learning_rate": 0.00016313527954493778, "loss": 0.2507, "step": 123 }, { "epoch": 0.6, "grad_norm": 0.14342117309570312, "learning_rate": 0.00016252426563357055, "loss": 0.2118, "step": 124 }, { "epoch": 0.61, "grad_norm": 0.13474318385124207, "learning_rate": 0.00016190939493098344, "loss": 0.2172, "step": 125 }, { "epoch": 0.61, "grad_norm": 0.12630698084831238, "learning_rate": 0.00016129070536529766, "loss": 0.1549, "step": 126 }, { "epoch": 0.62, "grad_norm": 0.12225893884897232, "learning_rate": 0.00016066823510019998, "loss": 0.2312, "step": 127 }, { "epoch": 0.62, "grad_norm": 0.17443059384822845, "learning_rate": 0.00016004202253258842, "loss": 0.3303, "step": 128 }, { "epoch": 0.63, "grad_norm": 0.1610797941684723, "learning_rate": 0.00015941210629020388, "loss": 0.2463, "step": 129 }, { "epoch": 0.63, "grad_norm": 0.14372020959854126, "learning_rate": 0.00015877852522924732, "loss": 0.3351, "step": 130 }, { "epoch": 0.64, "grad_norm": 0.15952154994010925, "learning_rate": 0.00015814131843198308, "loss": 0.2147, "step": 131 }, { "epoch": 0.64, "grad_norm": 0.18614554405212402, "learning_rate": 0.00015750052520432787, "loss": 0.1983, "step": 132 }, { "epoch": 0.65, "grad_norm": 0.15100689232349396, "learning_rate": 0.0001568561850734264, "loss": 0.182, "step": 133 }, { "epoch": 0.65, "grad_norm": 0.15041258931159973, "learning_rate": 0.00015620833778521307, "loss": 0.1979, "step": 134 }, { "epoch": 0.66, "grad_norm": 0.14446201920509338, "learning_rate": 0.00015555702330196023, "loss": 0.2039, "step": 135 }, { "epoch": 0.66, "grad_norm": 0.12656830251216888, "learning_rate": 0.0001549022817998132, "loss": 0.2032, "step": 136 }, { "epoch": 0.67, "grad_norm": 0.147608682513237, "learning_rate": 0.00015424415366631188, "loss": 0.1889, "step": 137 }, { "epoch": 0.67, "grad_norm": 0.1701640486717224, "learning_rate": 0.00015358267949789966, "loss": 0.2654, "step": 138 }, { "epoch": 0.68, "grad_norm": 0.1424136757850647, "learning_rate": 0.00015291790009741907, "loss": 0.2052, "step": 139 }, { "epoch": 0.68, "grad_norm": 0.1711564064025879, "learning_rate": 0.0001522498564715949, "loss": 0.226, "step": 140 }, { "epoch": 0.69, "grad_norm": 0.17195338010787964, "learning_rate": 0.00015157858982850475, "loss": 0.2366, "step": 141 }, { "epoch": 0.69, "grad_norm": 0.1596439927816391, "learning_rate": 0.00015090414157503714, "loss": 0.2387, "step": 142 }, { "epoch": 0.7, "grad_norm": 0.1538238674402237, "learning_rate": 0.00015022655331433727, "loss": 0.2326, "step": 143 }, { "epoch": 0.7, "grad_norm": 0.13117662072181702, "learning_rate": 0.00014954586684324078, "loss": 0.1571, "step": 144 }, { "epoch": 0.7, "grad_norm": 0.1380062848329544, "learning_rate": 0.00014886212414969553, "loss": 0.1581, "step": 145 }, { "epoch": 0.71, "grad_norm": 0.15343894064426422, "learning_rate": 0.00014817536741017152, "loss": 0.2154, "step": 146 }, { "epoch": 0.71, "grad_norm": 0.12983113527297974, "learning_rate": 0.00014748563898705946, "loss": 0.1988, "step": 147 }, { "epoch": 0.72, "grad_norm": 0.1460677534341812, "learning_rate": 0.00014679298142605734, "loss": 0.208, "step": 148 }, { "epoch": 0.72, "grad_norm": 0.18640218675136566, "learning_rate": 0.00014609743745354624, "loss": 0.2976, "step": 149 }, { "epoch": 0.73, "grad_norm": 0.1410883516073227, "learning_rate": 0.00014539904997395468, "loss": 0.1779, "step": 150 }, { "epoch": 0.73, "grad_norm": 0.1352994590997696, "learning_rate": 0.00014469786206711214, "loss": 0.1932, "step": 151 }, { "epoch": 0.74, "grad_norm": 0.16276930272579193, "learning_rate": 0.00014399391698559152, "loss": 0.19, "step": 152 }, { "epoch": 0.74, "grad_norm": 0.1330641359090805, "learning_rate": 0.00014328725815204144, "loss": 0.1529, "step": 153 }, { "epoch": 0.75, "grad_norm": 0.1461140215396881, "learning_rate": 0.00014257792915650728, "loss": 0.1822, "step": 154 }, { "epoch": 0.75, "grad_norm": 0.1400836855173111, "learning_rate": 0.0001418659737537428, "loss": 0.1984, "step": 155 }, { "epoch": 0.76, "grad_norm": 0.15338997542858124, "learning_rate": 0.00014115143586051088, "loss": 0.1856, "step": 156 }, { "epoch": 0.76, "grad_norm": 0.15661920607089996, "learning_rate": 0.00014043435955287452, "loss": 0.2219, "step": 157 }, { "epoch": 0.77, "grad_norm": 0.22158733010292053, "learning_rate": 0.00013971478906347806, "loss": 0.3549, "step": 158 }, { "epoch": 0.77, "grad_norm": 0.1338592767715454, "learning_rate": 0.00013899276877881884, "loss": 0.1785, "step": 159 }, { "epoch": 0.78, "grad_norm": 0.14615756273269653, "learning_rate": 0.000138268343236509, "loss": 0.2202, "step": 160 }, { "epoch": 0.78, "grad_norm": 0.1247783973813057, "learning_rate": 0.00013754155712252832, "loss": 0.2008, "step": 161 }, { "epoch": 0.79, "grad_norm": 0.18145664036273956, "learning_rate": 0.00013681245526846783, "loss": 0.2144, "step": 162 }, { "epoch": 0.79, "grad_norm": 0.1396200805902481, "learning_rate": 0.0001360810826487642, "loss": 0.2194, "step": 163 }, { "epoch": 0.8, "grad_norm": 0.15508796274662018, "learning_rate": 0.00013534748437792573, "loss": 0.2766, "step": 164 }, { "epoch": 0.8, "grad_norm": 0.15434060990810394, "learning_rate": 0.0001346117057077493, "loss": 0.188, "step": 165 }, { "epoch": 0.81, "grad_norm": 0.14731119573116302, "learning_rate": 0.00013387379202452917, "loss": 0.233, "step": 166 }, { "epoch": 0.81, "grad_norm": 0.1317255198955536, "learning_rate": 0.0001331337888462571, "loss": 0.233, "step": 167 }, { "epoch": 0.82, "grad_norm": 0.16242042183876038, "learning_rate": 0.00013239174181981495, "loss": 0.2541, "step": 168 }, { "epoch": 0.82, "grad_norm": 0.14940407872200012, "learning_rate": 0.00013164769671815862, "loss": 0.2244, "step": 169 }, { "epoch": 0.83, "grad_norm": 0.14262273907661438, "learning_rate": 0.00013090169943749476, "loss": 0.2458, "step": 170 }, { "epoch": 0.83, "grad_norm": 0.13066165149211884, "learning_rate": 0.00013015379599444957, "loss": 0.1865, "step": 171 }, { "epoch": 0.84, "grad_norm": 0.14521218836307526, "learning_rate": 0.0001294040325232304, "loss": 0.1838, "step": 172 }, { "epoch": 0.84, "grad_norm": 0.14943768084049225, "learning_rate": 0.00012865245527277986, "loss": 0.2369, "step": 173 }, { "epoch": 0.85, "grad_norm": 0.14182843267917633, "learning_rate": 0.00012789911060392294, "loss": 0.1953, "step": 174 }, { "epoch": 0.85, "grad_norm": 0.16210182011127472, "learning_rate": 0.00012714404498650743, "loss": 0.2624, "step": 175 }, { "epoch": 0.86, "grad_norm": 0.15228134393692017, "learning_rate": 0.0001263873049965373, "loss": 0.2441, "step": 176 }, { "epoch": 0.86, "grad_norm": 0.13629144430160522, "learning_rate": 0.00012562893731329967, "loss": 0.2003, "step": 177 }, { "epoch": 0.87, "grad_norm": 0.15732020139694214, "learning_rate": 0.0001248689887164855, "loss": 0.2311, "step": 178 }, { "epoch": 0.87, "grad_norm": 0.13804040849208832, "learning_rate": 0.00012410750608330388, "loss": 0.1768, "step": 179 }, { "epoch": 0.87, "grad_norm": 0.13766342401504517, "learning_rate": 0.00012334453638559057, "loss": 0.2182, "step": 180 }, { "epoch": 0.88, "grad_norm": 0.15981349349021912, "learning_rate": 0.0001225801266869104, "loss": 0.2061, "step": 181 }, { "epoch": 0.88, "grad_norm": 0.12819038331508636, "learning_rate": 0.00012181432413965428, "loss": 0.1549, "step": 182 }, { "epoch": 0.89, "grad_norm": 0.13831038773059845, "learning_rate": 0.00012104717598213056, "loss": 0.1653, "step": 183 }, { "epoch": 0.89, "grad_norm": 0.16673411428928375, "learning_rate": 0.00012027872953565125, "loss": 0.1933, "step": 184 }, { "epoch": 0.9, "grad_norm": 0.14682242274284363, "learning_rate": 0.00011950903220161285, "loss": 0.202, "step": 185 }, { "epoch": 0.9, "grad_norm": 0.1462392807006836, "learning_rate": 0.00011873813145857249, "loss": 0.2377, "step": 186 }, { "epoch": 0.91, "grad_norm": 0.12843555212020874, "learning_rate": 0.00011796607485931928, "loss": 0.2125, "step": 187 }, { "epoch": 0.91, "grad_norm": 0.15481220185756683, "learning_rate": 0.00011719291002794096, "loss": 0.2539, "step": 188 }, { "epoch": 0.92, "grad_norm": 0.13950768113136292, "learning_rate": 0.0001164186846568863, "loss": 0.1972, "step": 189 }, { "epoch": 0.92, "grad_norm": 0.11621260643005371, "learning_rate": 0.0001156434465040231, "loss": 0.1856, "step": 190 }, { "epoch": 0.93, "grad_norm": 0.14099712669849396, "learning_rate": 0.00011486724338969232, "loss": 0.1916, "step": 191 }, { "epoch": 0.93, "grad_norm": 0.1282750517129898, "learning_rate": 0.00011409012319375827, "loss": 0.1836, "step": 192 }, { "epoch": 0.94, "grad_norm": 0.15066012740135193, "learning_rate": 0.00011331213385265524, "loss": 0.2432, "step": 193 }, { "epoch": 0.94, "grad_norm": 0.14334805309772491, "learning_rate": 0.00011253332335643043, "loss": 0.1749, "step": 194 }, { "epoch": 0.95, "grad_norm": 0.15670594573020935, "learning_rate": 0.00011175373974578378, "loss": 0.2479, "step": 195 }, { "epoch": 0.95, "grad_norm": 0.15438470244407654, "learning_rate": 0.00011097343110910452, "loss": 0.2356, "step": 196 }, { "epoch": 0.96, "grad_norm": 0.1420874148607254, "learning_rate": 0.000110192445579505, "loss": 0.2662, "step": 197 }, { "epoch": 0.96, "grad_norm": 0.1418399214744568, "learning_rate": 0.00010941083133185146, "loss": 0.1785, "step": 198 }, { "epoch": 0.97, "grad_norm": 0.1280946284532547, "learning_rate": 0.00010862863657979237, "loss": 0.1652, "step": 199 }, { "epoch": 0.97, "grad_norm": 0.14323323965072632, "learning_rate": 0.0001078459095727845, "loss": 0.2104, "step": 200 }, { "epoch": 0.98, "grad_norm": 0.10913383960723877, "learning_rate": 0.00010706269859311669, "loss": 0.1448, "step": 201 }, { "epoch": 0.98, "grad_norm": 0.12007103115320206, "learning_rate": 0.00010627905195293135, "loss": 0.1263, "step": 202 }, { "epoch": 0.99, "grad_norm": 0.1433536857366562, "learning_rate": 0.0001054950179912446, "loss": 0.1831, "step": 203 }, { "epoch": 0.99, "grad_norm": 0.1542392522096634, "learning_rate": 0.00010471064507096426, "loss": 0.2828, "step": 204 }, { "epoch": 1.0, "grad_norm": 0.18856601417064667, "learning_rate": 0.00010392598157590688, "loss": 0.183, "step": 205 }, { "epoch": 1.0, "grad_norm": 0.13536609709262848, "learning_rate": 0.00010314107590781284, "loss": 0.2161, "step": 206 }, { "epoch": 1.0, "eval_loss": 0.22577418386936188, "eval_runtime": 13.2186, "eval_samples_per_second": 1.891, "eval_steps_per_second": 1.891, "step": 206 } ], "logging_steps": 1, "max_steps": 410, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 103, "total_flos": 1.27754961965192e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }