{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.4773519163763065, "eval_steps": 500, "global_step": 180, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.013937282229965157, "grad_norm": 93.75749969482422, "learning_rate": 2.2e-06, "loss": 1.2675, "step": 1 }, { "epoch": 0.027874564459930314, "grad_norm": 66.60814666748047, "learning_rate": 4.4e-06, "loss": 1.3582, "step": 2 }, { "epoch": 0.041811846689895474, "grad_norm": 77.9828109741211, "learning_rate": 6.5999999999999995e-06, "loss": 1.2567, "step": 3 }, { "epoch": 0.05574912891986063, "grad_norm": 17.5104923248291, "learning_rate": 8.8e-06, "loss": 1.1877, "step": 4 }, { "epoch": 0.06968641114982578, "grad_norm": 10.045647621154785, "learning_rate": 1.1e-05, "loss": 1.1663, "step": 5 }, { "epoch": 0.08362369337979095, "grad_norm": 12.031865119934082, "learning_rate": 1.0999372667896238e-05, "loss": 1.1769, "step": 6 }, { "epoch": 0.0975609756097561, "grad_norm": 6.24857234954834, "learning_rate": 1.0997490814692433e-05, "loss": 1.1703, "step": 7 }, { "epoch": 0.11149825783972125, "grad_norm": 4.496493816375732, "learning_rate": 1.0994354869678378e-05, "loss": 1.1608, "step": 8 }, { "epoch": 0.1254355400696864, "grad_norm": 4.9181036949157715, "learning_rate": 1.0989965548228246e-05, "loss": 1.1165, "step": 9 }, { "epoch": 0.13937282229965156, "grad_norm": 3.872187614440918, "learning_rate": 1.0984323851637407e-05, "loss": 1.1308, "step": 10 }, { "epoch": 0.15331010452961671, "grad_norm": 4.341373920440674, "learning_rate": 1.0977431066894e-05, "loss": 1.1245, "step": 11 }, { "epoch": 0.1672473867595819, "grad_norm": 3.867882013320923, "learning_rate": 1.0969288766385357e-05, "loss": 1.1486, "step": 12 }, { "epoch": 0.18118466898954705, "grad_norm": 3.2795205116271973, "learning_rate": 1.0959898807539295e-05, "loss": 1.123, "step": 13 }, { "epoch": 0.1951219512195122, "grad_norm": 3.429868698120117, "learning_rate": 1.0949263332400415e-05, "loss": 1.1169, "step": 14 }, { "epoch": 0.20905923344947736, "grad_norm": 3.0912277698516846, "learning_rate": 1.0937384767141438e-05, "loss": 1.0832, "step": 15 }, { "epoch": 0.2229965156794425, "grad_norm": 4.55407190322876, "learning_rate": 1.0924265821509758e-05, "loss": 1.1018, "step": 16 }, { "epoch": 0.23693379790940766, "grad_norm": 3.4495749473571777, "learning_rate": 1.090990948820929e-05, "loss": 1.1186, "step": 17 }, { "epoch": 0.2508710801393728, "grad_norm": 6.985757827758789, "learning_rate": 1.0894319042217767e-05, "loss": 1.1135, "step": 18 }, { "epoch": 0.26480836236933797, "grad_norm": 3.7354323863983154, "learning_rate": 1.0877498040039657e-05, "loss": 1.1064, "step": 19 }, { "epoch": 0.2787456445993031, "grad_norm": 4.375406265258789, "learning_rate": 1.0859450318894847e-05, "loss": 1.0767, "step": 20 }, { "epoch": 0.2926829268292683, "grad_norm": 4.374971389770508, "learning_rate": 1.0840179995843286e-05, "loss": 1.1072, "step": 21 }, { "epoch": 0.30662020905923343, "grad_norm": 3.129504442214966, "learning_rate": 1.0819691466845815e-05, "loss": 1.0414, "step": 22 }, { "epoch": 0.3205574912891986, "grad_norm": 4.991695404052734, "learning_rate": 1.079798940576134e-05, "loss": 1.0773, "step": 23 }, { "epoch": 0.3344947735191638, "grad_norm": 3.570375919342041, "learning_rate": 1.0775078763280638e-05, "loss": 1.0709, "step": 24 }, { "epoch": 0.34843205574912894, "grad_norm": 7.718966007232666, "learning_rate": 1.0750964765797001e-05, "loss": 1.1019, "step": 25 }, { "epoch": 0.3623693379790941, "grad_norm": 3.459160327911377, "learning_rate": 1.072565291421398e-05, "loss": 1.0505, "step": 26 }, { "epoch": 0.37630662020905925, "grad_norm": 5.686498641967773, "learning_rate": 1.069914898269052e-05, "loss": 1.0897, "step": 27 }, { "epoch": 0.3902439024390244, "grad_norm": 3.622288465499878, "learning_rate": 1.067145901732376e-05, "loss": 1.0817, "step": 28 }, { "epoch": 0.40418118466898956, "grad_norm": 6.713829040527344, "learning_rate": 1.0642589334769783e-05, "loss": 1.0597, "step": 29 }, { "epoch": 0.4181184668989547, "grad_norm": 3.2670044898986816, "learning_rate": 1.061254652080265e-05, "loss": 1.0631, "step": 30 }, { "epoch": 0.43205574912891986, "grad_norm": 3.0448334217071533, "learning_rate": 1.0581337428812077e-05, "loss": 1.0697, "step": 31 }, { "epoch": 0.445993031358885, "grad_norm": 4.583492755889893, "learning_rate": 1.0548969178239997e-05, "loss": 1.0564, "step": 32 }, { "epoch": 0.45993031358885017, "grad_norm": 3.7002651691436768, "learning_rate": 1.0515449152956496e-05, "loss": 1.0852, "step": 33 }, { "epoch": 0.4738675958188153, "grad_norm": 3.1550400257110596, "learning_rate": 1.0480784999575381e-05, "loss": 1.0576, "step": 34 }, { "epoch": 0.4878048780487805, "grad_norm": 3.1954734325408936, "learning_rate": 1.0444984625709842e-05, "loss": 1.0965, "step": 35 }, { "epoch": 0.5017421602787456, "grad_norm": 2.8396148681640625, "learning_rate": 1.0408056198168555e-05, "loss": 1.0401, "step": 36 }, { "epoch": 0.5156794425087108, "grad_norm": 10.67453384399414, "learning_rate": 1.0370008141092654e-05, "loss": 1.0909, "step": 37 }, { "epoch": 0.5296167247386759, "grad_norm": 3.554422378540039, "learning_rate": 1.0330849134034033e-05, "loss": 1.0534, "step": 38 }, { "epoch": 0.5435540069686411, "grad_norm": 3.5049169063568115, "learning_rate": 1.0290588109975334e-05, "loss": 1.0366, "step": 39 }, { "epoch": 0.5574912891986062, "grad_norm": 3.9955601692199707, "learning_rate": 1.024923425329217e-05, "loss": 1.0264, "step": 40 }, { "epoch": 0.5714285714285714, "grad_norm": 3.2370924949645996, "learning_rate": 1.0206796997657961e-05, "loss": 1.0282, "step": 41 }, { "epoch": 0.5853658536585366, "grad_norm": 3.0653462409973145, "learning_rate": 1.0163286023891926e-05, "loss": 1.066, "step": 42 }, { "epoch": 0.5993031358885017, "grad_norm": 2.9653961658477783, "learning_rate": 1.011871125775069e-05, "loss": 1.068, "step": 43 }, { "epoch": 0.6132404181184669, "grad_norm": 2.899329423904419, "learning_rate": 1.0073082867664e-05, "loss": 1.0075, "step": 44 }, { "epoch": 0.627177700348432, "grad_norm": 3.231595277786255, "learning_rate": 1.002641126241511e-05, "loss": 1.0579, "step": 45 }, { "epoch": 0.6411149825783972, "grad_norm": 3.7711715698242188, "learning_rate": 9.978707088766316e-06, "loss": 1.0511, "step": 46 }, { "epoch": 0.6550522648083623, "grad_norm": 3.1697182655334473, "learning_rate": 9.929981229030202e-06, "loss": 1.0785, "step": 47 }, { "epoch": 0.6689895470383276, "grad_norm": 2.8038201332092285, "learning_rate": 9.88024479858717e-06, "loss": 1.0389, "step": 48 }, { "epoch": 0.6829268292682927, "grad_norm": 3.2184159755706787, "learning_rate": 9.829509143349775e-06, "loss": 1.0625, "step": 49 }, { "epoch": 0.6968641114982579, "grad_norm": 2.9576430320739746, "learning_rate": 9.77778583717451e-06, "loss": 1.0217, "step": 50 }, { "epoch": 0.710801393728223, "grad_norm": 2.918567657470703, "learning_rate": 9.725086679221542e-06, "loss": 1.0106, "step": 51 }, { "epoch": 0.7247386759581882, "grad_norm": 2.8690834045410156, "learning_rate": 9.671423691263104e-06, "loss": 1.0427, "step": 52 }, { "epoch": 0.7386759581881533, "grad_norm": 2.8049697875976562, "learning_rate": 9.616809114941055e-06, "loss": 1.0553, "step": 53 }, { "epoch": 0.7526132404181185, "grad_norm": 3.5655879974365234, "learning_rate": 9.561255408974332e-06, "loss": 1.0308, "step": 54 }, { "epoch": 0.7665505226480837, "grad_norm": 3.0086355209350586, "learning_rate": 9.504775246316836e-06, "loss": 1.0625, "step": 55 }, { "epoch": 0.7804878048780488, "grad_norm": 3.2106897830963135, "learning_rate": 9.447381511266482e-06, "loss": 1.0582, "step": 56 }, { "epoch": 0.794425087108014, "grad_norm": 3.013434410095215, "learning_rate": 9.38908729652601e-06, "loss": 1.0501, "step": 57 }, { "epoch": 0.8083623693379791, "grad_norm": 3.194031000137329, "learning_rate": 9.32990590021629e-06, "loss": 1.0385, "step": 58 }, { "epoch": 0.8222996515679443, "grad_norm": 2.9916939735412598, "learning_rate": 9.269850822842717e-06, "loss": 0.9978, "step": 59 }, { "epoch": 0.8362369337979094, "grad_norm": 2.90720796585083, "learning_rate": 9.208935764215487e-06, "loss": 1.038, "step": 60 }, { "epoch": 0.8501742160278746, "grad_norm": 2.983062505722046, "learning_rate": 9.147174620324374e-06, "loss": 1.0137, "step": 61 }, { "epoch": 0.8641114982578397, "grad_norm": 3.021130084991455, "learning_rate": 9.084581480168767e-06, "loss": 1.0388, "step": 62 }, { "epoch": 0.8780487804878049, "grad_norm": 3.831815242767334, "learning_rate": 9.021170622543684e-06, "loss": 1.0482, "step": 63 }, { "epoch": 0.89198606271777, "grad_norm": 3.5826456546783447, "learning_rate": 8.956956512782476e-06, "loss": 1.0494, "step": 64 }, { "epoch": 0.9059233449477352, "grad_norm": 3.445178747177124, "learning_rate": 8.891953799456987e-06, "loss": 0.9976, "step": 65 }, { "epoch": 0.9198606271777003, "grad_norm": 2.8443377017974854, "learning_rate": 8.826177311035906e-06, "loss": 1.0646, "step": 66 }, { "epoch": 0.9337979094076655, "grad_norm": 2.8210973739624023, "learning_rate": 8.759642052502092e-06, "loss": 1.0602, "step": 67 }, { "epoch": 0.9477351916376306, "grad_norm": 2.9282021522521973, "learning_rate": 8.692363201929623e-06, "loss": 1.041, "step": 68 }, { "epoch": 0.9616724738675958, "grad_norm": 2.916203260421753, "learning_rate": 8.624356107021355e-06, "loss": 1.035, "step": 69 }, { "epoch": 0.975609756097561, "grad_norm": 2.7474899291992188, "learning_rate": 8.555636281607811e-06, "loss": 1.0443, "step": 70 }, { "epoch": 0.9895470383275261, "grad_norm": 3.2409560680389404, "learning_rate": 8.486219402108133e-06, "loss": 1.0502, "step": 71 }, { "epoch": 1.0034843205574913, "grad_norm": 3.2006499767303467, "learning_rate": 8.416121303953973e-06, "loss": 1.0337, "step": 72 }, { "epoch": 1.0017421602787457, "grad_norm": 3.021101713180542, "learning_rate": 8.345357977977113e-06, "loss": 1.0164, "step": 73 }, { "epoch": 1.0156794425087108, "grad_norm": 3.0675883293151855, "learning_rate": 8.273945566761604e-06, "loss": 0.9739, "step": 74 }, { "epoch": 1.029616724738676, "grad_norm": 2.925316095352173, "learning_rate": 8.201900360961325e-06, "loss": 0.9607, "step": 75 }, { "epoch": 1.043554006968641, "grad_norm": 3.2823657989501953, "learning_rate": 8.12923879558374e-06, "loss": 0.9313, "step": 76 }, { "epoch": 1.0574912891986064, "grad_norm": 2.809762477874756, "learning_rate": 8.055977446240727e-06, "loss": 0.9682, "step": 77 }, { "epoch": 1.0714285714285714, "grad_norm": 3.0586483478546143, "learning_rate": 7.982133025367346e-06, "loss": 0.9674, "step": 78 }, { "epoch": 1.0853658536585367, "grad_norm": 2.9541120529174805, "learning_rate": 7.907722378409371e-06, "loss": 0.9691, "step": 79 }, { "epoch": 1.0993031358885017, "grad_norm": 2.965197801589966, "learning_rate": 7.83276247998052e-06, "loss": 0.9537, "step": 80 }, { "epoch": 1.113240418118467, "grad_norm": 3.096423625946045, "learning_rate": 7.757270429990162e-06, "loss": 0.9631, "step": 81 }, { "epoch": 1.127177700348432, "grad_norm": 3.0303800106048584, "learning_rate": 7.681263449742493e-06, "loss": 0.9774, "step": 82 }, { "epoch": 1.1411149825783973, "grad_norm": 3.2755138874053955, "learning_rate": 7.604758878007994e-06, "loss": 0.9589, "step": 83 }, { "epoch": 1.1550522648083623, "grad_norm": 3.0583481788635254, "learning_rate": 7.527774167068094e-06, "loss": 0.9313, "step": 84 }, { "epoch": 1.1689895470383276, "grad_norm": 2.8140718936920166, "learning_rate": 7.4503268787339455e-06, "loss": 0.9137, "step": 85 }, { "epoch": 1.1829268292682926, "grad_norm": 3.0150392055511475, "learning_rate": 7.372434680340213e-06, "loss": 0.9324, "step": 86 }, { "epoch": 1.1968641114982579, "grad_norm": 3.1816203594207764, "learning_rate": 7.294115340714782e-06, "loss": 0.9835, "step": 87 }, { "epoch": 1.210801393728223, "grad_norm": 3.011570930480957, "learning_rate": 7.215386726125319e-06, "loss": 0.9395, "step": 88 }, { "epoch": 1.2247386759581882, "grad_norm": 4.090625286102295, "learning_rate": 7.1362667962036166e-06, "loss": 0.9682, "step": 89 }, { "epoch": 1.2386759581881532, "grad_norm": 3.290282964706421, "learning_rate": 7.056773599848612e-06, "loss": 0.9487, "step": 90 }, { "epoch": 1.2526132404181185, "grad_norm": 3.567591667175293, "learning_rate": 6.976925271109072e-06, "loss": 0.9459, "step": 91 }, { "epoch": 1.2665505226480835, "grad_norm": 2.817159414291382, "learning_rate": 6.8967400250468335e-06, "loss": 0.968, "step": 92 }, { "epoch": 1.2804878048780488, "grad_norm": 3.8883535861968994, "learning_rate": 6.816236153581568e-06, "loss": 0.938, "step": 93 }, { "epoch": 1.294425087108014, "grad_norm": 2.920037269592285, "learning_rate": 6.735432021318023e-06, "loss": 0.9401, "step": 94 }, { "epoch": 1.3083623693379791, "grad_norm": 2.851327896118164, "learning_rate": 6.654346061356661e-06, "loss": 0.9636, "step": 95 }, { "epoch": 1.3222996515679442, "grad_norm": 3.1404778957366943, "learning_rate": 6.572996771088706e-06, "loss": 0.9665, "step": 96 }, { "epoch": 1.3362369337979094, "grad_norm": 4.205505847930908, "learning_rate": 6.491402707976482e-06, "loss": 0.945, "step": 97 }, { "epoch": 1.3501742160278747, "grad_norm": 2.9891207218170166, "learning_rate": 6.409582485320087e-06, "loss": 0.9554, "step": 98 }, { "epoch": 1.3641114982578397, "grad_norm": 2.9016053676605225, "learning_rate": 6.327554768011307e-06, "loss": 0.9613, "step": 99 }, { "epoch": 1.3780487804878048, "grad_norm": 2.954751968383789, "learning_rate": 6.245338268275765e-06, "loss": 0.9358, "step": 100 }, { "epoch": 1.39198606271777, "grad_norm": 3.2333548069000244, "learning_rate": 6.162951741404276e-06, "loss": 0.9573, "step": 101 }, { "epoch": 1.4059233449477353, "grad_norm": 3.4226996898651123, "learning_rate": 6.080413981474379e-06, "loss": 0.9294, "step": 102 }, { "epoch": 1.4198606271777003, "grad_norm": 3.169379711151123, "learning_rate": 5.9977438170630085e-06, "loss": 0.9195, "step": 103 }, { "epoch": 1.4337979094076654, "grad_norm": 3.147132396697998, "learning_rate": 5.914960106951313e-06, "loss": 0.9524, "step": 104 }, { "epoch": 1.4477351916376306, "grad_norm": 2.7678744792938232, "learning_rate": 5.832081735822573e-06, "loss": 0.9234, "step": 105 }, { "epoch": 1.461672473867596, "grad_norm": 2.8116793632507324, "learning_rate": 5.749127609954215e-06, "loss": 0.9619, "step": 106 }, { "epoch": 1.475609756097561, "grad_norm": 3.6426944732666016, "learning_rate": 5.666116652904889e-06, "loss": 0.9435, "step": 107 }, { "epoch": 1.489547038327526, "grad_norm": 4.532324314117432, "learning_rate": 5.5830678011976225e-06, "loss": 0.9538, "step": 108 }, { "epoch": 1.5034843205574913, "grad_norm": 3.7293591499328613, "learning_rate": 5.5e-06, "loss": 0.9595, "step": 109 }, { "epoch": 1.5174216027874565, "grad_norm": 2.8792057037353516, "learning_rate": 5.416932198802378e-06, "loss": 0.9498, "step": 110 }, { "epoch": 1.5313588850174216, "grad_norm": 2.9074580669403076, "learning_rate": 5.333883347095112e-06, "loss": 0.9572, "step": 111 }, { "epoch": 1.5452961672473866, "grad_norm": 4.807355880737305, "learning_rate": 5.250872390045787e-06, "loss": 0.9527, "step": 112 }, { "epoch": 1.5592334494773519, "grad_norm": 4.229637622833252, "learning_rate": 5.167918264177426e-06, "loss": 0.9275, "step": 113 }, { "epoch": 1.5731707317073171, "grad_norm": 3.07944393157959, "learning_rate": 5.085039893048687e-06, "loss": 0.9565, "step": 114 }, { "epoch": 1.5871080139372822, "grad_norm": 2.8650782108306885, "learning_rate": 5.002256182936992e-06, "loss": 0.9493, "step": 115 }, { "epoch": 1.6010452961672472, "grad_norm": 2.928194284439087, "learning_rate": 4.919586018525621e-06, "loss": 0.9513, "step": 116 }, { "epoch": 1.6149825783972127, "grad_norm": 3.433330535888672, "learning_rate": 4.837048258595723e-06, "loss": 0.978, "step": 117 }, { "epoch": 1.6289198606271778, "grad_norm": 2.8581085205078125, "learning_rate": 4.754661731724237e-06, "loss": 0.9366, "step": 118 }, { "epoch": 1.6428571428571428, "grad_norm": 2.897559404373169, "learning_rate": 4.672445231988693e-06, "loss": 0.9641, "step": 119 }, { "epoch": 1.656794425087108, "grad_norm": 2.8979365825653076, "learning_rate": 4.590417514679912e-06, "loss": 0.9316, "step": 120 }, { "epoch": 1.6707317073170733, "grad_norm": 3.090183734893799, "learning_rate": 4.508597292023518e-06, "loss": 0.9545, "step": 121 }, { "epoch": 1.6846689895470384, "grad_norm": 2.9215645790100098, "learning_rate": 4.427003228911295e-06, "loss": 0.9169, "step": 122 }, { "epoch": 1.6986062717770034, "grad_norm": 4.066997528076172, "learning_rate": 4.345653938643339e-06, "loss": 0.9357, "step": 123 }, { "epoch": 1.7125435540069687, "grad_norm": 3.2360036373138428, "learning_rate": 4.2645679786819796e-06, "loss": 0.9205, "step": 124 }, { "epoch": 1.726480836236934, "grad_norm": 2.890538215637207, "learning_rate": 4.1837638464184334e-06, "loss": 0.9355, "step": 125 }, { "epoch": 1.740418118466899, "grad_norm": 3.4630892276763916, "learning_rate": 4.103259974953166e-06, "loss": 0.9263, "step": 126 }, { "epoch": 1.754355400696864, "grad_norm": 3.326503276824951, "learning_rate": 4.023074728890927e-06, "loss": 0.9154, "step": 127 }, { "epoch": 1.7682926829268293, "grad_norm": 3.6348044872283936, "learning_rate": 3.943226400151388e-06, "loss": 0.944, "step": 128 }, { "epoch": 1.7822299651567945, "grad_norm": 3.0831995010375977, "learning_rate": 3.863733203796385e-06, "loss": 0.9457, "step": 129 }, { "epoch": 1.7961672473867596, "grad_norm": 3.1765310764312744, "learning_rate": 3.784613273874681e-06, "loss": 0.9329, "step": 130 }, { "epoch": 1.8101045296167246, "grad_norm": 2.767540693283081, "learning_rate": 3.70588465928522e-06, "loss": 0.9285, "step": 131 }, { "epoch": 1.82404181184669, "grad_norm": 2.718041181564331, "learning_rate": 3.6275653196597856e-06, "loss": 0.9767, "step": 132 }, { "epoch": 1.8379790940766552, "grad_norm": 4.337693691253662, "learning_rate": 3.5496731212660538e-06, "loss": 0.9351, "step": 133 }, { "epoch": 1.8519163763066202, "grad_norm": 3.106288194656372, "learning_rate": 3.472225832931907e-06, "loss": 0.9554, "step": 134 }, { "epoch": 1.8658536585365852, "grad_norm": 3.2049343585968018, "learning_rate": 3.3952411219920066e-06, "loss": 0.9601, "step": 135 }, { "epoch": 1.8797909407665505, "grad_norm": 4.705850601196289, "learning_rate": 3.318736550257507e-06, "loss": 0.9595, "step": 136 }, { "epoch": 1.8937282229965158, "grad_norm": 3.335909128189087, "learning_rate": 3.2427295700098385e-06, "loss": 0.9889, "step": 137 }, { "epoch": 1.9076655052264808, "grad_norm": 2.938302993774414, "learning_rate": 3.1672375200194797e-06, "loss": 0.9306, "step": 138 }, { "epoch": 1.9216027874564459, "grad_norm": 3.15813946723938, "learning_rate": 3.092277621590627e-06, "loss": 0.9332, "step": 139 }, { "epoch": 1.9355400696864111, "grad_norm": 2.869309902191162, "learning_rate": 3.0178669746326567e-06, "loss": 0.9496, "step": 140 }, { "epoch": 1.9494773519163764, "grad_norm": 2.8482539653778076, "learning_rate": 2.9440225537592728e-06, "loss": 0.9406, "step": 141 }, { "epoch": 1.9634146341463414, "grad_norm": 3.2523257732391357, "learning_rate": 2.8707612044162595e-06, "loss": 0.9377, "step": 142 }, { "epoch": 1.9773519163763065, "grad_norm": 3.42722487449646, "learning_rate": 2.7980996390386755e-06, "loss": 0.9622, "step": 143 }, { "epoch": 1.9912891986062717, "grad_norm": 3.1534297466278076, "learning_rate": 2.7260544332383964e-06, "loss": 0.9219, "step": 144 }, { "epoch": 2.005226480836237, "grad_norm": 2.9020955562591553, "learning_rate": 2.654642022022887e-06, "loss": 0.9233, "step": 145 }, { "epoch": 2.0034843205574915, "grad_norm": 6.1046247482299805, "learning_rate": 2.5838786960460267e-06, "loss": 0.9226, "step": 146 }, { "epoch": 2.0174216027874565, "grad_norm": 2.791771411895752, "learning_rate": 2.513780597891867e-06, "loss": 0.8854, "step": 147 }, { "epoch": 2.0313588850174216, "grad_norm": 4.039772033691406, "learning_rate": 2.444363718392189e-06, "loss": 0.9067, "step": 148 }, { "epoch": 2.0452961672473866, "grad_norm": 3.059945821762085, "learning_rate": 2.3756438929786434e-06, "loss": 0.872, "step": 149 }, { "epoch": 2.059233449477352, "grad_norm": 3.355379819869995, "learning_rate": 2.3076367980703774e-06, "loss": 0.8897, "step": 150 }, { "epoch": 2.073170731707317, "grad_norm": 3.4992244243621826, "learning_rate": 2.240357947497908e-06, "loss": 0.9425, "step": 151 }, { "epoch": 2.087108013937282, "grad_norm": 3.829111099243164, "learning_rate": 2.173822688964094e-06, "loss": 0.9141, "step": 152 }, { "epoch": 2.1010452961672472, "grad_norm": 2.9267561435699463, "learning_rate": 2.108046200543013e-06, "loss": 0.9238, "step": 153 }, { "epoch": 2.1149825783972127, "grad_norm": 3.307797908782959, "learning_rate": 2.0430434872175245e-06, "loss": 0.9014, "step": 154 }, { "epoch": 2.1289198606271778, "grad_norm": 3.0536694526672363, "learning_rate": 1.9788293774563163e-06, "loss": 0.8967, "step": 155 }, { "epoch": 2.142857142857143, "grad_norm": 3.2300519943237305, "learning_rate": 1.9154185198312327e-06, "loss": 0.8817, "step": 156 }, { "epoch": 2.156794425087108, "grad_norm": 3.410097599029541, "learning_rate": 1.8528253796756277e-06, "loss": 0.9151, "step": 157 }, { "epoch": 2.1707317073170733, "grad_norm": 3.1198341846466064, "learning_rate": 1.7910642357845122e-06, "loss": 0.9359, "step": 158 }, { "epoch": 2.1846689895470384, "grad_norm": 3.0908079147338867, "learning_rate": 1.7301491771572808e-06, "loss": 0.8697, "step": 159 }, { "epoch": 2.1986062717770034, "grad_norm": 45.06842803955078, "learning_rate": 1.67009409978371e-06, "loss": 0.8753, "step": 160 }, { "epoch": 2.2125435540069684, "grad_norm": 3.0929031372070312, "learning_rate": 1.610912703473989e-06, "loss": 0.9032, "step": 161 }, { "epoch": 2.226480836236934, "grad_norm": 3.174290180206299, "learning_rate": 1.5526184887335188e-06, "loss": 0.9066, "step": 162 }, { "epoch": 2.240418118466899, "grad_norm": 3.2078001499176025, "learning_rate": 1.4952247536831645e-06, "loss": 0.8793, "step": 163 }, { "epoch": 2.254355400696864, "grad_norm": 3.8318631649017334, "learning_rate": 1.438744591025668e-06, "loss": 0.8881, "step": 164 }, { "epoch": 2.2682926829268295, "grad_norm": 3.460146188735962, "learning_rate": 1.3831908850589433e-06, "loss": 0.9087, "step": 165 }, { "epoch": 2.2822299651567945, "grad_norm": 3.1915969848632812, "learning_rate": 1.3285763087368974e-06, "loss": 0.9007, "step": 166 }, { "epoch": 2.2961672473867596, "grad_norm": 3.0315561294555664, "learning_rate": 1.2749133207784575e-06, "loss": 0.879, "step": 167 }, { "epoch": 2.3101045296167246, "grad_norm": 4.681447505950928, "learning_rate": 1.2222141628254902e-06, "loss": 0.9268, "step": 168 }, { "epoch": 2.3240418118466897, "grad_norm": 2.946652889251709, "learning_rate": 1.1704908566502246e-06, "loss": 0.8952, "step": 169 }, { "epoch": 2.337979094076655, "grad_norm": 2.944952964782715, "learning_rate": 1.1197552014128314e-06, "loss": 0.8807, "step": 170 }, { "epoch": 2.35191637630662, "grad_norm": 3.2808735370635986, "learning_rate": 1.0700187709697969e-06, "loss": 0.8722, "step": 171 }, { "epoch": 2.3658536585365852, "grad_norm": 3.887355327606201, "learning_rate": 1.0212929112336848e-06, "loss": 0.8863, "step": 172 }, { "epoch": 2.3797909407665507, "grad_norm": 3.0687787532806396, "learning_rate": 9.7358873758489e-07, "loss": 0.9093, "step": 173 }, { "epoch": 2.3937282229965158, "grad_norm": 2.890662431716919, "learning_rate": 9.269171323360006e-07, "loss": 0.8987, "step": 174 }, { "epoch": 2.407665505226481, "grad_norm": 3.5224292278289795, "learning_rate": 8.812887422493117e-07, "loss": 0.9008, "step": 175 }, { "epoch": 2.421602787456446, "grad_norm": 7.938294410705566, "learning_rate": 8.367139761080734e-07, "loss": 0.8774, "step": 176 }, { "epoch": 2.435540069686411, "grad_norm": 3.240544557571411, "learning_rate": 7.932030023420393e-07, "loss": 0.9178, "step": 177 }, { "epoch": 2.4494773519163764, "grad_norm": 3.17809796333313, "learning_rate": 7.507657467078292e-07, "loss": 0.8881, "step": 178 }, { "epoch": 2.4634146341463414, "grad_norm": 3.075671911239624, "learning_rate": 7.094118900246642e-07, "loss": 0.94, "step": 179 }, { "epoch": 2.4773519163763065, "grad_norm": 3.6002347469329834, "learning_rate": 6.691508659659682e-07, "loss": 0.8895, "step": 180 } ], "logging_steps": 1, "max_steps": 213, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 36, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 6.164757741828047e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }