{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9781591263650546, "eval_steps": 80, "global_step": 320, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0062402496099844, "grad_norm": 19.714784622192383, "learning_rate": 2.0000000000000002e-07, "loss": 1.3589, "step": 1 }, { "epoch": 0.0062402496099844, "eval_loss": 1.3540421724319458, "eval_runtime": 132.5999, "eval_samples_per_second": 102.195, "eval_steps_per_second": 6.388, "step": 1 }, { "epoch": 0.0124804992199688, "grad_norm": 20.498014450073242, "learning_rate": 4.0000000000000003e-07, "loss": 1.3662, "step": 2 }, { "epoch": 0.0187207488299532, "grad_norm": 19.82619285583496, "learning_rate": 6.000000000000001e-07, "loss": 1.3336, "step": 3 }, { "epoch": 0.0249609984399376, "grad_norm": 18.423460006713867, "learning_rate": 8.000000000000001e-07, "loss": 1.3555, "step": 4 }, { "epoch": 0.031201248049921998, "grad_norm": 16.555850982666016, "learning_rate": 1.0000000000000002e-06, "loss": 1.3527, "step": 5 }, { "epoch": 0.0374414976599064, "grad_norm": 10.684965133666992, "learning_rate": 1.2000000000000002e-06, "loss": 1.3491, "step": 6 }, { "epoch": 0.0436817472698908, "grad_norm": 8.396592140197754, "learning_rate": 1.4000000000000001e-06, "loss": 1.3181, "step": 7 }, { "epoch": 0.0499219968798752, "grad_norm": 3.145500421524048, "learning_rate": 1.6000000000000001e-06, "loss": 1.2984, "step": 8 }, { "epoch": 0.056162246489859596, "grad_norm": 2.981050491333008, "learning_rate": 1.8000000000000001e-06, "loss": 1.2901, "step": 9 }, { "epoch": 0.062402496099843996, "grad_norm": 2.741509199142456, "learning_rate": 2.0000000000000003e-06, "loss": 1.2948, "step": 10 }, { "epoch": 0.0686427457098284, "grad_norm": 3.8496174812316895, "learning_rate": 2.2e-06, "loss": 1.2524, "step": 11 }, { "epoch": 0.0748829953198128, "grad_norm": 3.039551258087158, "learning_rate": 2.4000000000000003e-06, "loss": 1.2369, "step": 12 }, { "epoch": 0.0811232449297972, "grad_norm": 2.215259313583374, "learning_rate": 2.6e-06, "loss": 1.244, "step": 13 }, { "epoch": 0.0873634945397816, "grad_norm": 1.4627336263656616, "learning_rate": 2.8000000000000003e-06, "loss": 1.2201, "step": 14 }, { "epoch": 0.093603744149766, "grad_norm": 2.0100812911987305, "learning_rate": 3e-06, "loss": 1.2097, "step": 15 }, { "epoch": 0.0998439937597504, "grad_norm": 2.0757627487182617, "learning_rate": 3.2000000000000003e-06, "loss": 1.2058, "step": 16 }, { "epoch": 0.1060842433697348, "grad_norm": 1.6582179069519043, "learning_rate": 3.4000000000000005e-06, "loss": 1.1775, "step": 17 }, { "epoch": 0.11232449297971919, "grad_norm": 1.2454713582992554, "learning_rate": 3.6000000000000003e-06, "loss": 1.1405, "step": 18 }, { "epoch": 0.11856474258970359, "grad_norm": 1.0032132863998413, "learning_rate": 3.8000000000000005e-06, "loss": 1.1442, "step": 19 }, { "epoch": 0.12480499219968799, "grad_norm": 1.3543955087661743, "learning_rate": 4.000000000000001e-06, "loss": 1.1874, "step": 20 }, { "epoch": 0.1310452418096724, "grad_norm": 1.2795507907867432, "learning_rate": 4.2000000000000004e-06, "loss": 1.1423, "step": 21 }, { "epoch": 0.1372854914196568, "grad_norm": 1.0040080547332764, "learning_rate": 4.4e-06, "loss": 1.1269, "step": 22 }, { "epoch": 0.1435257410296412, "grad_norm": 0.9706005454063416, "learning_rate": 4.600000000000001e-06, "loss": 1.1508, "step": 23 }, { "epoch": 0.1497659906396256, "grad_norm": 0.905784547328949, "learning_rate": 4.800000000000001e-06, "loss": 1.1003, "step": 24 }, { "epoch": 0.15600624024961, "grad_norm": 0.8688749074935913, "learning_rate": 5e-06, "loss": 1.1046, "step": 25 }, { "epoch": 0.1622464898595944, "grad_norm": 0.7418661713600159, "learning_rate": 5.2e-06, "loss": 1.0736, "step": 26 }, { "epoch": 0.1684867394695788, "grad_norm": 0.7218017578125, "learning_rate": 5.400000000000001e-06, "loss": 1.0924, "step": 27 }, { "epoch": 0.1747269890795632, "grad_norm": 0.7364180684089661, "learning_rate": 5.600000000000001e-06, "loss": 1.0666, "step": 28 }, { "epoch": 0.1809672386895476, "grad_norm": 0.6347681879997253, "learning_rate": 5.8e-06, "loss": 1.0533, "step": 29 }, { "epoch": 0.187207488299532, "grad_norm": 0.672021210193634, "learning_rate": 6e-06, "loss": 1.0719, "step": 30 }, { "epoch": 0.1934477379095164, "grad_norm": 0.6880649328231812, "learning_rate": 6.200000000000001e-06, "loss": 1.0555, "step": 31 }, { "epoch": 0.1996879875195008, "grad_norm": 0.5669052004814148, "learning_rate": 6.4000000000000006e-06, "loss": 1.0845, "step": 32 }, { "epoch": 0.2059282371294852, "grad_norm": 0.6051258444786072, "learning_rate": 6.600000000000001e-06, "loss": 1.0656, "step": 33 }, { "epoch": 0.2121684867394696, "grad_norm": 0.5937217473983765, "learning_rate": 6.800000000000001e-06, "loss": 1.0738, "step": 34 }, { "epoch": 0.21840873634945399, "grad_norm": 0.5861482620239258, "learning_rate": 7e-06, "loss": 1.0497, "step": 35 }, { "epoch": 0.22464898595943839, "grad_norm": 0.5939168334007263, "learning_rate": 7.2000000000000005e-06, "loss": 1.0657, "step": 36 }, { "epoch": 0.23088923556942278, "grad_norm": 0.5843105316162109, "learning_rate": 7.4e-06, "loss": 1.0498, "step": 37 }, { "epoch": 0.23712948517940718, "grad_norm": 0.5303648710250854, "learning_rate": 7.600000000000001e-06, "loss": 1.0604, "step": 38 }, { "epoch": 0.24336973478939158, "grad_norm": 0.558338463306427, "learning_rate": 7.800000000000002e-06, "loss": 1.0383, "step": 39 }, { "epoch": 0.24960998439937598, "grad_norm": 0.49629613757133484, "learning_rate": 8.000000000000001e-06, "loss": 1.0521, "step": 40 }, { "epoch": 0.25585023400936036, "grad_norm": 0.5873180627822876, "learning_rate": 8.2e-06, "loss": 1.0403, "step": 41 }, { "epoch": 0.2620904836193448, "grad_norm": 0.5466005802154541, "learning_rate": 8.400000000000001e-06, "loss": 1.0127, "step": 42 }, { "epoch": 0.26833073322932915, "grad_norm": 0.5514444708824158, "learning_rate": 8.6e-06, "loss": 1.0399, "step": 43 }, { "epoch": 0.2745709828393136, "grad_norm": 0.5304705500602722, "learning_rate": 8.8e-06, "loss": 1.0057, "step": 44 }, { "epoch": 0.28081123244929795, "grad_norm": 0.5105130076408386, "learning_rate": 9e-06, "loss": 1.0174, "step": 45 }, { "epoch": 0.2870514820592824, "grad_norm": 0.533640444278717, "learning_rate": 9.200000000000002e-06, "loss": 1.0342, "step": 46 }, { "epoch": 0.29329173166926675, "grad_norm": 0.48208147287368774, "learning_rate": 9.4e-06, "loss": 1.0195, "step": 47 }, { "epoch": 0.2995319812792512, "grad_norm": 0.5069381594657898, "learning_rate": 9.600000000000001e-06, "loss": 1.0382, "step": 48 }, { "epoch": 0.30577223088923555, "grad_norm": 0.4819696843624115, "learning_rate": 9.800000000000001e-06, "loss": 1.05, "step": 49 }, { "epoch": 0.31201248049922, "grad_norm": 0.5414313673973083, "learning_rate": 1e-05, "loss": 1.0245, "step": 50 }, { "epoch": 0.31825273010920435, "grad_norm": 0.4769354462623596, "learning_rate": 1.02e-05, "loss": 1.005, "step": 51 }, { "epoch": 0.3244929797191888, "grad_norm": 0.5051629543304443, "learning_rate": 1.04e-05, "loss": 1.0158, "step": 52 }, { "epoch": 0.33073322932917315, "grad_norm": 0.5432644486427307, "learning_rate": 1.0600000000000002e-05, "loss": 1.0122, "step": 53 }, { "epoch": 0.3369734789391576, "grad_norm": 0.4705195128917694, "learning_rate": 1.0800000000000002e-05, "loss": 1.0053, "step": 54 }, { "epoch": 0.34321372854914195, "grad_norm": 0.5468801856040955, "learning_rate": 1.1000000000000001e-05, "loss": 1.0173, "step": 55 }, { "epoch": 0.3494539781591264, "grad_norm": 0.6218928694725037, "learning_rate": 1.1200000000000001e-05, "loss": 0.9974, "step": 56 }, { "epoch": 0.35569422776911075, "grad_norm": 0.532873272895813, "learning_rate": 1.14e-05, "loss": 1.006, "step": 57 }, { "epoch": 0.3619344773790952, "grad_norm": 0.48144450783729553, "learning_rate": 1.16e-05, "loss": 1.0025, "step": 58 }, { "epoch": 0.36817472698907955, "grad_norm": 0.5385976433753967, "learning_rate": 1.18e-05, "loss": 0.9976, "step": 59 }, { "epoch": 0.374414976599064, "grad_norm": 0.5179689526557922, "learning_rate": 1.2e-05, "loss": 0.9988, "step": 60 }, { "epoch": 0.38065522620904835, "grad_norm": 0.4646259844303131, "learning_rate": 1.22e-05, "loss": 0.9959, "step": 61 }, { "epoch": 0.3868954758190328, "grad_norm": 0.5259431004524231, "learning_rate": 1.2400000000000002e-05, "loss": 0.9933, "step": 62 }, { "epoch": 0.39313572542901715, "grad_norm": 0.5602505803108215, "learning_rate": 1.2600000000000001e-05, "loss": 0.9732, "step": 63 }, { "epoch": 0.3993759750390016, "grad_norm": 0.5400233864784241, "learning_rate": 1.2800000000000001e-05, "loss": 1.0021, "step": 64 }, { "epoch": 0.40561622464898595, "grad_norm": 0.5008605718612671, "learning_rate": 1.3000000000000001e-05, "loss": 1.0098, "step": 65 }, { "epoch": 0.4118564742589704, "grad_norm": 0.5215092301368713, "learning_rate": 1.3200000000000002e-05, "loss": 0.9805, "step": 66 }, { "epoch": 0.41809672386895474, "grad_norm": 0.6043874025344849, "learning_rate": 1.3400000000000002e-05, "loss": 0.9651, "step": 67 }, { "epoch": 0.4243369734789392, "grad_norm": 0.5744293928146362, "learning_rate": 1.3600000000000002e-05, "loss": 0.9715, "step": 68 }, { "epoch": 0.43057722308892354, "grad_norm": 0.6228943467140198, "learning_rate": 1.38e-05, "loss": 0.9942, "step": 69 }, { "epoch": 0.43681747269890797, "grad_norm": 0.6340550780296326, "learning_rate": 1.4e-05, "loss": 1.0278, "step": 70 }, { "epoch": 0.44305772230889234, "grad_norm": 0.6537193655967712, "learning_rate": 1.4200000000000001e-05, "loss": 1.005, "step": 71 }, { "epoch": 0.44929797191887677, "grad_norm": 0.6706846356391907, "learning_rate": 1.4400000000000001e-05, "loss": 0.9736, "step": 72 }, { "epoch": 0.45553822152886114, "grad_norm": 0.5686175227165222, "learning_rate": 1.46e-05, "loss": 0.9753, "step": 73 }, { "epoch": 0.46177847113884557, "grad_norm": 0.5182248950004578, "learning_rate": 1.48e-05, "loss": 0.9964, "step": 74 }, { "epoch": 0.46801872074882994, "grad_norm": 0.5445067286491394, "learning_rate": 1.5000000000000002e-05, "loss": 0.9702, "step": 75 }, { "epoch": 0.47425897035881437, "grad_norm": 0.6168459057807922, "learning_rate": 1.5200000000000002e-05, "loss": 0.9791, "step": 76 }, { "epoch": 0.48049921996879874, "grad_norm": 0.6475315093994141, "learning_rate": 1.54e-05, "loss": 0.98, "step": 77 }, { "epoch": 0.48673946957878317, "grad_norm": 0.8365716934204102, "learning_rate": 1.5600000000000003e-05, "loss": 0.987, "step": 78 }, { "epoch": 0.49297971918876754, "grad_norm": 1.0882554054260254, "learning_rate": 1.58e-05, "loss": 0.9796, "step": 79 }, { "epoch": 0.49921996879875197, "grad_norm": 1.109529972076416, "learning_rate": 1.6000000000000003e-05, "loss": 0.9825, "step": 80 }, { "epoch": 0.49921996879875197, "eval_loss": 0.9798125624656677, "eval_runtime": 132.8615, "eval_samples_per_second": 101.993, "eval_steps_per_second": 6.375, "step": 80 }, { "epoch": 0.5054602184087363, "grad_norm": 0.9999867677688599, "learning_rate": 1.62e-05, "loss": 0.9591, "step": 81 }, { "epoch": 0.5117004680187207, "grad_norm": 0.8042426705360413, "learning_rate": 1.64e-05, "loss": 0.9832, "step": 82 }, { "epoch": 0.5179407176287052, "grad_norm": 0.5843170881271362, "learning_rate": 1.66e-05, "loss": 0.9769, "step": 83 }, { "epoch": 0.5241809672386896, "grad_norm": 0.6988096237182617, "learning_rate": 1.6800000000000002e-05, "loss": 0.9556, "step": 84 }, { "epoch": 0.5304212168486739, "grad_norm": 0.7298963665962219, "learning_rate": 1.7e-05, "loss": 0.983, "step": 85 }, { "epoch": 0.5366614664586583, "grad_norm": 0.7856214046478271, "learning_rate": 1.72e-05, "loss": 0.9541, "step": 86 }, { "epoch": 0.5429017160686428, "grad_norm": 0.6896259188652039, "learning_rate": 1.7400000000000003e-05, "loss": 0.983, "step": 87 }, { "epoch": 0.5491419656786272, "grad_norm": 0.5555576682090759, "learning_rate": 1.76e-05, "loss": 0.9395, "step": 88 }, { "epoch": 0.5553822152886115, "grad_norm": 0.6107622385025024, "learning_rate": 1.7800000000000002e-05, "loss": 0.9601, "step": 89 }, { "epoch": 0.5616224648985959, "grad_norm": 0.7116836309432983, "learning_rate": 1.8e-05, "loss": 0.9647, "step": 90 }, { "epoch": 0.5678627145085804, "grad_norm": 0.5782715082168579, "learning_rate": 1.8200000000000002e-05, "loss": 0.9604, "step": 91 }, { "epoch": 0.5741029641185648, "grad_norm": 0.512100338935852, "learning_rate": 1.8400000000000003e-05, "loss": 0.9433, "step": 92 }, { "epoch": 0.5803432137285491, "grad_norm": 0.6315212249755859, "learning_rate": 1.86e-05, "loss": 0.969, "step": 93 }, { "epoch": 0.5865834633385335, "grad_norm": 0.6883739233016968, "learning_rate": 1.88e-05, "loss": 0.9832, "step": 94 }, { "epoch": 0.592823712948518, "grad_norm": 0.6760767102241516, "learning_rate": 1.9e-05, "loss": 0.9414, "step": 95 }, { "epoch": 0.5990639625585024, "grad_norm": 0.6577237248420715, "learning_rate": 1.9200000000000003e-05, "loss": 0.9748, "step": 96 }, { "epoch": 0.6053042121684867, "grad_norm": 0.9515029788017273, "learning_rate": 1.94e-05, "loss": 0.9863, "step": 97 }, { "epoch": 0.6115444617784711, "grad_norm": 1.40570068359375, "learning_rate": 1.9600000000000002e-05, "loss": 0.9445, "step": 98 }, { "epoch": 0.6177847113884556, "grad_norm": 0.9026833176612854, "learning_rate": 1.98e-05, "loss": 0.9436, "step": 99 }, { "epoch": 0.62402496099844, "grad_norm": 0.6666714549064636, "learning_rate": 2e-05, "loss": 0.9832, "step": 100 }, { "epoch": 0.6302652106084243, "grad_norm": 0.8047837018966675, "learning_rate": 1.9998980430094333e-05, "loss": 0.9498, "step": 101 }, { "epoch": 0.6365054602184087, "grad_norm": 0.9035269618034363, "learning_rate": 1.9995921928281893e-05, "loss": 0.9541, "step": 102 }, { "epoch": 0.6427457098283932, "grad_norm": 1.027601718902588, "learning_rate": 1.9990825118233958e-05, "loss": 0.9786, "step": 103 }, { "epoch": 0.6489859594383776, "grad_norm": 1.1459457874298096, "learning_rate": 1.9983691039261358e-05, "loss": 0.9482, "step": 104 }, { "epoch": 0.6552262090483619, "grad_norm": 0.7179874777793884, "learning_rate": 1.9974521146102535e-05, "loss": 0.9743, "step": 105 }, { "epoch": 0.6614664586583463, "grad_norm": 0.6881632208824158, "learning_rate": 1.9963317308626916e-05, "loss": 0.9797, "step": 106 }, { "epoch": 0.6677067082683308, "grad_norm": 0.7822304368019104, "learning_rate": 1.9950081811453598e-05, "loss": 0.9682, "step": 107 }, { "epoch": 0.6739469578783152, "grad_norm": 0.8269001841545105, "learning_rate": 1.99348173534855e-05, "loss": 0.9455, "step": 108 }, { "epoch": 0.6801872074882995, "grad_norm": 0.8077254295349121, "learning_rate": 1.991752704735903e-05, "loss": 0.9243, "step": 109 }, { "epoch": 0.6864274570982839, "grad_norm": 0.8119699954986572, "learning_rate": 1.989821441880933e-05, "loss": 0.9273, "step": 110 }, { "epoch": 0.6926677067082684, "grad_norm": 0.8220670223236084, "learning_rate": 1.9876883405951378e-05, "loss": 0.9455, "step": 111 }, { "epoch": 0.6989079563182528, "grad_norm": 0.8622007966041565, "learning_rate": 1.9853538358476933e-05, "loss": 0.9624, "step": 112 }, { "epoch": 0.7051482059282371, "grad_norm": 0.8222960233688354, "learning_rate": 1.9828184036767556e-05, "loss": 0.955, "step": 113 }, { "epoch": 0.7113884555382215, "grad_norm": 0.62811678647995, "learning_rate": 1.9800825610923937e-05, "loss": 0.9551, "step": 114 }, { "epoch": 0.717628705148206, "grad_norm": 0.7614508271217346, "learning_rate": 1.9771468659711595e-05, "loss": 0.9413, "step": 115 }, { "epoch": 0.7238689547581904, "grad_norm": 0.6695716977119446, "learning_rate": 1.9740119169423337e-05, "loss": 0.9384, "step": 116 }, { "epoch": 0.7301092043681747, "grad_norm": 0.5493482947349548, "learning_rate": 1.9706783532658528e-05, "loss": 0.9601, "step": 117 }, { "epoch": 0.7363494539781591, "grad_norm": 0.7798200249671936, "learning_rate": 1.9671468547019575e-05, "loss": 0.9555, "step": 118 }, { "epoch": 0.7425897035881436, "grad_norm": 0.8122205138206482, "learning_rate": 1.963418141372579e-05, "loss": 0.9351, "step": 119 }, { "epoch": 0.748829953198128, "grad_norm": 0.6351688504219055, "learning_rate": 1.9594929736144978e-05, "loss": 0.9517, "step": 120 }, { "epoch": 0.7550702028081123, "grad_norm": 0.8507185578346252, "learning_rate": 1.955372151824297e-05, "loss": 0.9482, "step": 121 }, { "epoch": 0.7613104524180967, "grad_norm": 1.057692050933838, "learning_rate": 1.9510565162951538e-05, "loss": 0.9626, "step": 122 }, { "epoch": 0.7675507020280812, "grad_norm": 0.789968729019165, "learning_rate": 1.94654694704549e-05, "loss": 0.9504, "step": 123 }, { "epoch": 0.7737909516380655, "grad_norm": 0.8988214731216431, "learning_rate": 1.941844363639525e-05, "loss": 0.9339, "step": 124 }, { "epoch": 0.7800312012480499, "grad_norm": 0.6798993945121765, "learning_rate": 1.936949724999762e-05, "loss": 0.9387, "step": 125 }, { "epoch": 0.7862714508580343, "grad_norm": 0.7597091794013977, "learning_rate": 1.9318640292114526e-05, "loss": 0.9884, "step": 126 }, { "epoch": 0.7925117004680188, "grad_norm": 0.9357583522796631, "learning_rate": 1.9265883133190715e-05, "loss": 0.9382, "step": 127 }, { "epoch": 0.7987519500780031, "grad_norm": 0.8738594055175781, "learning_rate": 1.92112365311485e-05, "loss": 0.9482, "step": 128 }, { "epoch": 0.8049921996879875, "grad_norm": 0.8523539900779724, "learning_rate": 1.9154711629194062e-05, "loss": 0.9299, "step": 129 }, { "epoch": 0.8112324492979719, "grad_norm": 0.5781116485595703, "learning_rate": 1.9096319953545186e-05, "loss": 0.9636, "step": 130 }, { "epoch": 0.8174726989079563, "grad_norm": 0.7737751007080078, "learning_rate": 1.9036073411080917e-05, "loss": 0.9482, "step": 131 }, { "epoch": 0.8237129485179407, "grad_norm": 0.7203546762466431, "learning_rate": 1.8973984286913584e-05, "loss": 0.9298, "step": 132 }, { "epoch": 0.8299531981279251, "grad_norm": 0.5875493288040161, "learning_rate": 1.891006524188368e-05, "loss": 0.9239, "step": 133 }, { "epoch": 0.8361934477379095, "grad_norm": 0.7981539964675903, "learning_rate": 1.8844329309978146e-05, "loss": 0.9546, "step": 134 }, { "epoch": 0.8424336973478939, "grad_norm": 0.7623902559280396, "learning_rate": 1.8776789895672557e-05, "loss": 0.9335, "step": 135 }, { "epoch": 0.8486739469578783, "grad_norm": 0.6350914239883423, "learning_rate": 1.8707460771197773e-05, "loss": 0.9585, "step": 136 }, { "epoch": 0.8549141965678627, "grad_norm": 0.6981391310691833, "learning_rate": 1.863635607373157e-05, "loss": 0.9271, "step": 137 }, { "epoch": 0.8611544461778471, "grad_norm": 0.7900795936584473, "learning_rate": 1.856349030251589e-05, "loss": 0.9022, "step": 138 }, { "epoch": 0.8673946957878315, "grad_norm": 0.7494855523109436, "learning_rate": 1.8488878315900228e-05, "loss": 0.9534, "step": 139 }, { "epoch": 0.8736349453978159, "grad_norm": 0.5757277011871338, "learning_rate": 1.8412535328311813e-05, "loss": 0.9397, "step": 140 }, { "epoch": 0.8798751950078003, "grad_norm": 0.6893640756607056, "learning_rate": 1.8334476907153177e-05, "loss": 0.952, "step": 141 }, { "epoch": 0.8861154446177847, "grad_norm": 0.7050842046737671, "learning_rate": 1.825471896962774e-05, "loss": 0.9417, "step": 142 }, { "epoch": 0.8923556942277691, "grad_norm": 0.5544989109039307, "learning_rate": 1.817327777949407e-05, "loss": 0.9008, "step": 143 }, { "epoch": 0.8985959438377535, "grad_norm": 0.6469840407371521, "learning_rate": 1.8090169943749477e-05, "loss": 0.9471, "step": 144 }, { "epoch": 0.9048361934477379, "grad_norm": 0.6894209384918213, "learning_rate": 1.8005412409243604e-05, "loss": 0.9553, "step": 145 }, { "epoch": 0.9110764430577223, "grad_norm": 0.5356501936912537, "learning_rate": 1.7919022459222754e-05, "loss": 0.9496, "step": 146 }, { "epoch": 0.9173166926677067, "grad_norm": 0.6416233777999878, "learning_rate": 1.7831017709805555e-05, "loss": 0.9558, "step": 147 }, { "epoch": 0.9235569422776911, "grad_norm": 0.7085059881210327, "learning_rate": 1.7741416106390828e-05, "loss": 0.9168, "step": 148 }, { "epoch": 0.9297971918876755, "grad_norm": 0.6492967009544373, "learning_rate": 1.7650235919998234e-05, "loss": 0.9065, "step": 149 }, { "epoch": 0.9360374414976599, "grad_norm": 0.7753322124481201, "learning_rate": 1.7557495743542586e-05, "loss": 0.9285, "step": 150 }, { "epoch": 0.9422776911076443, "grad_norm": 0.6451005935668945, "learning_rate": 1.7463214488042472e-05, "loss": 0.9567, "step": 151 }, { "epoch": 0.9485179407176287, "grad_norm": 0.4824198782444, "learning_rate": 1.736741137876405e-05, "loss": 0.905, "step": 152 }, { "epoch": 0.9547581903276131, "grad_norm": 0.5846424102783203, "learning_rate": 1.727010595130074e-05, "loss": 0.9426, "step": 153 }, { "epoch": 0.9609984399375975, "grad_norm": 0.5984904170036316, "learning_rate": 1.7171318047589637e-05, "loss": 0.9398, "step": 154 }, { "epoch": 0.9672386895475819, "grad_norm": 0.545465886592865, "learning_rate": 1.7071067811865477e-05, "loss": 0.9185, "step": 155 }, { "epoch": 0.9734789391575663, "grad_norm": 0.5724261403083801, "learning_rate": 1.696937568655294e-05, "loss": 0.9222, "step": 156 }, { "epoch": 0.9797191887675507, "grad_norm": 0.5120018124580383, "learning_rate": 1.6866262408098134e-05, "loss": 0.93, "step": 157 }, { "epoch": 0.9859594383775351, "grad_norm": 0.5575640797615051, "learning_rate": 1.6761749002740195e-05, "loss": 0.9483, "step": 158 }, { "epoch": 0.9921996879875195, "grad_norm": 0.603184163570404, "learning_rate": 1.6655856782223682e-05, "loss": 0.9394, "step": 159 }, { "epoch": 0.9984399375975039, "grad_norm": 0.536756157875061, "learning_rate": 1.6548607339452853e-05, "loss": 0.9227, "step": 160 }, { "epoch": 0.9984399375975039, "eval_loss": 0.9286661744117737, "eval_runtime": 134.8358, "eval_samples_per_second": 100.5, "eval_steps_per_second": 6.282, "step": 160 }, { "epoch": 1.0046801872074882, "grad_norm": 0.5680767297744751, "learning_rate": 1.6440022544088553e-05, "loss": 0.9263, "step": 161 }, { "epoch": 1.0109204368174727, "grad_norm": 0.6374309062957764, "learning_rate": 1.6330124538088705e-05, "loss": 0.942, "step": 162 }, { "epoch": 1.0171606864274572, "grad_norm": 0.5749344825744629, "learning_rate": 1.6218935731193223e-05, "loss": 0.9264, "step": 163 }, { "epoch": 1.0046801872074882, "grad_norm": 0.9867531657218933, "learning_rate": 1.6106478796354382e-05, "loss": 0.8513, "step": 164 }, { "epoch": 1.0109204368174727, "grad_norm": 0.7155735492706299, "learning_rate": 1.599277666511347e-05, "loss": 0.8023, "step": 165 }, { "epoch": 1.0171606864274572, "grad_norm": 1.3990559577941895, "learning_rate": 1.5877852522924733e-05, "loss": 0.7992, "step": 166 }, { "epoch": 1.0234009360374414, "grad_norm": 0.9339443445205688, "learning_rate": 1.576172980442753e-05, "loss": 0.7982, "step": 167 }, { "epoch": 1.029641185647426, "grad_norm": 0.8383383750915527, "learning_rate": 1.5644432188667695e-05, "loss": 0.7764, "step": 168 }, { "epoch": 1.0358814352574104, "grad_norm": 0.8227719664573669, "learning_rate": 1.5525983594269026e-05, "loss": 0.7985, "step": 169 }, { "epoch": 1.0421216848673946, "grad_norm": 0.8231950998306274, "learning_rate": 1.5406408174555978e-05, "loss": 0.8097, "step": 170 }, { "epoch": 1.0483619344773791, "grad_norm": 1.1454997062683105, "learning_rate": 1.528573031262842e-05, "loss": 0.7779, "step": 171 }, { "epoch": 1.0546021840873634, "grad_norm": 0.6939067840576172, "learning_rate": 1.5163974616389621e-05, "loss": 0.8217, "step": 172 }, { "epoch": 1.0608424336973479, "grad_norm": 0.7526265978813171, "learning_rate": 1.504116591352832e-05, "loss": 0.7886, "step": 173 }, { "epoch": 1.0670826833073324, "grad_norm": 0.7892379760742188, "learning_rate": 1.491732924645604e-05, "loss": 0.7959, "step": 174 }, { "epoch": 1.0733229329173166, "grad_norm": 0.7279461026191711, "learning_rate": 1.479248986720057e-05, "loss": 0.7977, "step": 175 }, { "epoch": 1.079563182527301, "grad_norm": 0.7360721230506897, "learning_rate": 1.4666673232256738e-05, "loss": 0.7883, "step": 176 }, { "epoch": 1.0858034321372856, "grad_norm": 0.6525989174842834, "learning_rate": 1.4539904997395468e-05, "loss": 0.785, "step": 177 }, { "epoch": 1.0920436817472698, "grad_norm": 0.7803720235824585, "learning_rate": 1.4412211012432213e-05, "loss": 0.7998, "step": 178 }, { "epoch": 1.0982839313572543, "grad_norm": 0.6588256359100342, "learning_rate": 1.4283617315955815e-05, "loss": 0.7999, "step": 179 }, { "epoch": 1.1045241809672386, "grad_norm": 0.5983767509460449, "learning_rate": 1.4154150130018867e-05, "loss": 0.7848, "step": 180 }, { "epoch": 1.110764430577223, "grad_norm": 0.641603946685791, "learning_rate": 1.4023835854790682e-05, "loss": 0.7937, "step": 181 }, { "epoch": 1.1170046801872076, "grad_norm": 0.6453792452812195, "learning_rate": 1.3892701063173917e-05, "loss": 0.8004, "step": 182 }, { "epoch": 1.1232449297971918, "grad_norm": 0.6428067088127136, "learning_rate": 1.3760772495385998e-05, "loss": 0.792, "step": 183 }, { "epoch": 1.1294851794071763, "grad_norm": 0.6279442310333252, "learning_rate": 1.362807705350641e-05, "loss": 0.7859, "step": 184 }, { "epoch": 1.1357254290171608, "grad_norm": 0.6000891327857971, "learning_rate": 1.3494641795990986e-05, "loss": 0.8039, "step": 185 }, { "epoch": 1.141965678627145, "grad_norm": 0.6628398895263672, "learning_rate": 1.3360493932154301e-05, "loss": 0.7829, "step": 186 }, { "epoch": 1.1482059282371295, "grad_norm": 0.6268762946128845, "learning_rate": 1.3225660816621342e-05, "loss": 0.778, "step": 187 }, { "epoch": 1.154446177847114, "grad_norm": 0.639062225818634, "learning_rate": 1.3090169943749475e-05, "loss": 0.7796, "step": 188 }, { "epoch": 1.1606864274570983, "grad_norm": 0.6048714518547058, "learning_rate": 1.2954048942022002e-05, "loss": 0.7883, "step": 189 }, { "epoch": 1.1669266770670828, "grad_norm": 0.5929723381996155, "learning_rate": 1.2817325568414299e-05, "loss": 0.7736, "step": 190 }, { "epoch": 1.173166926677067, "grad_norm": 0.5971985459327698, "learning_rate": 1.2680027702733791e-05, "loss": 0.8008, "step": 191 }, { "epoch": 1.1794071762870515, "grad_norm": 0.6460970044136047, "learning_rate": 1.2542183341934873e-05, "loss": 0.7867, "step": 192 }, { "epoch": 1.185647425897036, "grad_norm": 0.5345771908760071, "learning_rate": 1.2403820594409926e-05, "loss": 0.7808, "step": 193 }, { "epoch": 1.1918876755070202, "grad_norm": 0.6704164743423462, "learning_rate": 1.2264967674257647e-05, "loss": 0.7785, "step": 194 }, { "epoch": 1.1981279251170047, "grad_norm": 0.5182461738586426, "learning_rate": 1.2125652895529766e-05, "loss": 0.7907, "step": 195 }, { "epoch": 1.204368174726989, "grad_norm": 0.6421562433242798, "learning_rate": 1.1985904666457455e-05, "loss": 0.7914, "step": 196 }, { "epoch": 1.2106084243369735, "grad_norm": 0.5846896171569824, "learning_rate": 1.1845751483658454e-05, "loss": 0.7631, "step": 197 }, { "epoch": 1.216848673946958, "grad_norm": 0.5582466721534729, "learning_rate": 1.170522192632624e-05, "loss": 0.7912, "step": 198 }, { "epoch": 1.2230889235569422, "grad_norm": 0.5527791976928711, "learning_rate": 1.156434465040231e-05, "loss": 0.7938, "step": 199 }, { "epoch": 1.2293291731669267, "grad_norm": 0.5673221945762634, "learning_rate": 1.1423148382732854e-05, "loss": 0.7947, "step": 200 }, { "epoch": 1.2355694227769112, "grad_norm": 0.5078392028808594, "learning_rate": 1.1281661915210931e-05, "loss": 0.7771, "step": 201 }, { "epoch": 1.2418096723868954, "grad_norm": 0.5475752353668213, "learning_rate": 1.1139914098905406e-05, "loss": 0.7781, "step": 202 }, { "epoch": 1.24804992199688, "grad_norm": 0.5290600657463074, "learning_rate": 1.0997933838177828e-05, "loss": 0.7622, "step": 203 }, { "epoch": 1.2542901716068644, "grad_norm": 0.4957723915576935, "learning_rate": 1.08557500847884e-05, "loss": 0.7857, "step": 204 }, { "epoch": 1.2605304212168487, "grad_norm": 0.5119233727455139, "learning_rate": 1.0713391831992324e-05, "loss": 0.7585, "step": 205 }, { "epoch": 1.2667706708268331, "grad_norm": 0.5187195539474487, "learning_rate": 1.0570888108627682e-05, "loss": 0.7885, "step": 206 }, { "epoch": 1.2730109204368174, "grad_norm": 0.5066515803337097, "learning_rate": 1.0428267973196027e-05, "loss": 0.7691, "step": 207 }, { "epoch": 1.2792511700468019, "grad_norm": 0.48673221468925476, "learning_rate": 1.0285560507936962e-05, "loss": 0.7715, "step": 208 }, { "epoch": 1.2854914196567861, "grad_norm": 0.5083721876144409, "learning_rate": 1.0142794812897874e-05, "loss": 0.7812, "step": 209 }, { "epoch": 1.2917316692667706, "grad_norm": 0.5033391118049622, "learning_rate": 1e-05, "loss": 0.7756, "step": 210 }, { "epoch": 1.2979719188767551, "grad_norm": 0.532008945941925, "learning_rate": 9.85720518710213e-06, "loss": 0.7898, "step": 211 }, { "epoch": 1.3042121684867394, "grad_norm": 0.5123456716537476, "learning_rate": 9.71443949206304e-06, "loss": 0.7779, "step": 212 }, { "epoch": 1.3104524180967239, "grad_norm": 0.48444995284080505, "learning_rate": 9.571732026803978e-06, "loss": 0.7598, "step": 213 }, { "epoch": 1.3166926677067083, "grad_norm": 0.5265589356422424, "learning_rate": 9.42911189137232e-06, "loss": 0.783, "step": 214 }, { "epoch": 1.3229329173166926, "grad_norm": 0.5039641261100769, "learning_rate": 9.286608168007678e-06, "loss": 0.7798, "step": 215 }, { "epoch": 1.329173166926677, "grad_norm": 0.5092752575874329, "learning_rate": 9.144249915211605e-06, "loss": 0.7635, "step": 216 }, { "epoch": 1.3354134165366616, "grad_norm": 0.5394583940505981, "learning_rate": 9.002066161822174e-06, "loss": 0.7999, "step": 217 }, { "epoch": 1.3416536661466458, "grad_norm": 0.5039361119270325, "learning_rate": 8.860085901094595e-06, "loss": 0.7745, "step": 218 }, { "epoch": 1.3478939157566303, "grad_norm": 0.5087692737579346, "learning_rate": 8.718338084789074e-06, "loss": 0.8093, "step": 219 }, { "epoch": 1.3541341653666148, "grad_norm": 0.5368974208831787, "learning_rate": 8.576851617267151e-06, "loss": 0.7596, "step": 220 }, { "epoch": 1.360374414976599, "grad_norm": 0.5123481750488281, "learning_rate": 8.43565534959769e-06, "loss": 0.7665, "step": 221 }, { "epoch": 1.3666146645865835, "grad_norm": 0.5048606395721436, "learning_rate": 8.294778073673762e-06, "loss": 0.769, "step": 222 }, { "epoch": 1.3728549141965678, "grad_norm": 0.5156130790710449, "learning_rate": 8.154248516341547e-06, "loss": 0.7989, "step": 223 }, { "epoch": 1.3790951638065523, "grad_norm": 0.5228000283241272, "learning_rate": 8.014095333542548e-06, "loss": 0.7775, "step": 224 }, { "epoch": 1.3853354134165365, "grad_norm": 0.5356248617172241, "learning_rate": 7.874347104470234e-06, "loss": 0.7629, "step": 225 }, { "epoch": 1.391575663026521, "grad_norm": 0.49624764919281006, "learning_rate": 7.735032325742355e-06, "loss": 0.8026, "step": 226 }, { "epoch": 1.3978159126365055, "grad_norm": 0.5164198279380798, "learning_rate": 7.596179405590076e-06, "loss": 0.7803, "step": 227 }, { "epoch": 1.4040561622464898, "grad_norm": 0.47972792387008667, "learning_rate": 7.4578166580651335e-06, "loss": 0.7783, "step": 228 }, { "epoch": 1.4102964118564743, "grad_norm": 0.4989663064479828, "learning_rate": 7.319972297266215e-06, "loss": 0.7728, "step": 229 }, { "epoch": 1.4165366614664587, "grad_norm": 0.5109092593193054, "learning_rate": 7.182674431585703e-06, "loss": 0.7919, "step": 230 }, { "epoch": 1.422776911076443, "grad_norm": 0.5069689750671387, "learning_rate": 7.045951057978001e-06, "loss": 0.8015, "step": 231 }, { "epoch": 1.4290171606864275, "grad_norm": 0.5074580907821655, "learning_rate": 6.909830056250527e-06, "loss": 0.7844, "step": 232 }, { "epoch": 1.435257410296412, "grad_norm": 0.47711381316185, "learning_rate": 6.774339183378663e-06, "loss": 0.7601, "step": 233 }, { "epoch": 1.4414976599063962, "grad_norm": 0.4926273822784424, "learning_rate": 6.639506067845698e-06, "loss": 0.7904, "step": 234 }, { "epoch": 1.4477379095163807, "grad_norm": 0.5004563927650452, "learning_rate": 6.505358204009018e-06, "loss": 0.7638, "step": 235 }, { "epoch": 1.4539781591263652, "grad_norm": 0.5014521479606628, "learning_rate": 6.3719229464935915e-06, "loss": 0.794, "step": 236 }, { "epoch": 1.4602184087363494, "grad_norm": 0.5181212425231934, "learning_rate": 6.239227504614004e-06, "loss": 0.7495, "step": 237 }, { "epoch": 1.466458658346334, "grad_norm": 0.5317255258560181, "learning_rate": 6.107298936826086e-06, "loss": 0.7884, "step": 238 }, { "epoch": 1.4726989079563182, "grad_norm": 0.5126049518585205, "learning_rate": 5.9761641452093225e-06, "loss": 0.7869, "step": 239 }, { "epoch": 1.4789391575663027, "grad_norm": 0.5322765707969666, "learning_rate": 5.845849869981137e-06, "loss": 0.7712, "step": 240 }, { "epoch": 1.4789391575663027, "eval_loss": 0.9110648036003113, "eval_runtime": 134.5659, "eval_samples_per_second": 100.702, "eval_steps_per_second": 6.294, "step": 240 }, { "epoch": 1.485179407176287, "grad_norm": 0.5006371140480042, "learning_rate": 5.716382684044191e-06, "loss": 0.7939, "step": 241 }, { "epoch": 1.4914196567862714, "grad_norm": 0.5099849104881287, "learning_rate": 5.587788987567785e-06, "loss": 0.771, "step": 242 }, { "epoch": 1.497659906396256, "grad_norm": 0.5397711396217346, "learning_rate": 5.460095002604533e-06, "loss": 0.7642, "step": 243 }, { "epoch": 1.5039001560062402, "grad_norm": 0.47754916548728943, "learning_rate": 5.333326767743263e-06, "loss": 0.7848, "step": 244 }, { "epoch": 1.5101404056162246, "grad_norm": 0.5114724636077881, "learning_rate": 5.207510132799436e-06, "loss": 0.7743, "step": 245 }, { "epoch": 1.5163806552262091, "grad_norm": 0.4859448969364166, "learning_rate": 5.082670753543961e-06, "loss": 0.7748, "step": 246 }, { "epoch": 1.5226209048361934, "grad_norm": 0.46158257126808167, "learning_rate": 4.958834086471683e-06, "loss": 0.7951, "step": 247 }, { "epoch": 1.5288611544461779, "grad_norm": 0.48695865273475647, "learning_rate": 4.836025383610382e-06, "loss": 0.7968, "step": 248 }, { "epoch": 1.5351014040561624, "grad_norm": 0.4924914240837097, "learning_rate": 4.714269687371581e-06, "loss": 0.792, "step": 249 }, { "epoch": 1.5413416536661466, "grad_norm": 0.5044175982475281, "learning_rate": 4.593591825444028e-06, "loss": 0.781, "step": 250 }, { "epoch": 1.547581903276131, "grad_norm": 0.4598456919193268, "learning_rate": 4.474016405730973e-06, "loss": 0.794, "step": 251 }, { "epoch": 1.5538221528861156, "grad_norm": 0.48866939544677734, "learning_rate": 4.355567811332311e-06, "loss": 0.7853, "step": 252 }, { "epoch": 1.5600624024960998, "grad_norm": 0.4878495931625366, "learning_rate": 4.2382701955724724e-06, "loss": 0.743, "step": 253 }, { "epoch": 1.566302652106084, "grad_norm": 0.4770466387271881, "learning_rate": 4.12214747707527e-06, "loss": 0.7442, "step": 254 }, { "epoch": 1.5725429017160688, "grad_norm": 0.4467732012271881, "learning_rate": 4.007223334886531e-06, "loss": 0.7611, "step": 255 }, { "epoch": 1.578783151326053, "grad_norm": 0.47116416692733765, "learning_rate": 3.893521203645618e-06, "loss": 0.7921, "step": 256 }, { "epoch": 1.5850234009360373, "grad_norm": 0.468517005443573, "learning_rate": 3.78106426880678e-06, "loss": 0.7811, "step": 257 }, { "epoch": 1.5912636505460218, "grad_norm": 0.46981289982795715, "learning_rate": 3.6698754619112974e-06, "loss": 0.7756, "step": 258 }, { "epoch": 1.5975039001560063, "grad_norm": 0.45571863651275635, "learning_rate": 3.5599774559114475e-06, "loss": 0.7469, "step": 259 }, { "epoch": 1.6037441497659906, "grad_norm": 0.4486157298088074, "learning_rate": 3.4513926605471504e-06, "loss": 0.7566, "step": 260 }, { "epoch": 1.609984399375975, "grad_norm": 0.47735777497291565, "learning_rate": 3.344143217776319e-06, "loss": 0.7753, "step": 261 }, { "epoch": 1.6162246489859595, "grad_norm": 0.4546492099761963, "learning_rate": 3.2382509972598087e-06, "loss": 0.7741, "step": 262 }, { "epoch": 1.6224648985959438, "grad_norm": 0.4690036177635193, "learning_rate": 3.133737591901864e-06, "loss": 0.7693, "step": 263 }, { "epoch": 1.6287051482059283, "grad_norm": 0.4607780873775482, "learning_rate": 3.0306243134470668e-06, "loss": 0.7312, "step": 264 }, { "epoch": 1.6349453978159127, "grad_norm": 0.4510229825973511, "learning_rate": 2.9289321881345257e-06, "loss": 0.7705, "step": 265 }, { "epoch": 1.641185647425897, "grad_norm": 0.43797171115875244, "learning_rate": 2.8286819524103657e-06, "loss": 0.7666, "step": 266 }, { "epoch": 1.6474258970358813, "grad_norm": 0.45275524258613586, "learning_rate": 2.7298940486992654e-06, "loss": 0.8028, "step": 267 }, { "epoch": 1.653666146645866, "grad_norm": 0.4479183852672577, "learning_rate": 2.6325886212359496e-06, "loss": 0.7647, "step": 268 }, { "epoch": 1.6599063962558502, "grad_norm": 0.4575185775756836, "learning_rate": 2.5367855119575314e-06, "loss": 0.7731, "step": 269 }, { "epoch": 1.6661466458658345, "grad_norm": 0.45092037320137024, "learning_rate": 2.4425042564574186e-06, "loss": 0.7895, "step": 270 }, { "epoch": 1.672386895475819, "grad_norm": 0.4297903776168823, "learning_rate": 2.3497640800017687e-06, "loss": 0.7597, "step": 271 }, { "epoch": 1.6786271450858035, "grad_norm": 0.43731772899627686, "learning_rate": 2.2585838936091753e-06, "loss": 0.7775, "step": 272 }, { "epoch": 1.6848673946957877, "grad_norm": 0.4288908541202545, "learning_rate": 2.1689822901944456e-06, "loss": 0.787, "step": 273 }, { "epoch": 1.6911076443057722, "grad_norm": 0.42555147409439087, "learning_rate": 2.0809775407772505e-06, "loss": 0.7751, "step": 274 }, { "epoch": 1.6973478939157567, "grad_norm": 0.45276904106140137, "learning_rate": 1.994587590756397e-06, "loss": 0.7893, "step": 275 }, { "epoch": 1.703588143525741, "grad_norm": 0.44294846057891846, "learning_rate": 1.9098300562505266e-06, "loss": 0.7794, "step": 276 }, { "epoch": 1.7098283931357254, "grad_norm": 0.4307778775691986, "learning_rate": 1.826722220505931e-06, "loss": 0.7791, "step": 277 }, { "epoch": 1.71606864274571, "grad_norm": 0.43245357275009155, "learning_rate": 1.74528103037226e-06, "loss": 0.7497, "step": 278 }, { "epoch": 1.7223088923556942, "grad_norm": 0.4251644015312195, "learning_rate": 1.6655230928468257e-06, "loss": 0.7743, "step": 279 }, { "epoch": 1.7285491419656787, "grad_norm": 0.4321819543838501, "learning_rate": 1.587464671688187e-06, "loss": 0.7522, "step": 280 }, { "epoch": 1.7347893915756631, "grad_norm": 0.4292062222957611, "learning_rate": 1.5111216840997745e-06, "loss": 0.7698, "step": 281 }, { "epoch": 1.7410296411856474, "grad_norm": 0.45093026757240295, "learning_rate": 1.436509697484111e-06, "loss": 0.7647, "step": 282 }, { "epoch": 1.7472698907956317, "grad_norm": 0.42376089096069336, "learning_rate": 1.3636439262684299e-06, "loss": 0.7556, "step": 283 }, { "epoch": 1.7535101404056164, "grad_norm": 0.4337958097457886, "learning_rate": 1.2925392288022299e-06, "loss": 0.7756, "step": 284 }, { "epoch": 1.7597503900156006, "grad_norm": 0.4319595992565155, "learning_rate": 1.2232101043274437e-06, "loss": 0.7816, "step": 285 }, { "epoch": 1.765990639625585, "grad_norm": 0.4335976243019104, "learning_rate": 1.1556706900218572e-06, "loss": 0.7902, "step": 286 }, { "epoch": 1.7722308892355694, "grad_norm": 0.4158109724521637, "learning_rate": 1.0899347581163222e-06, "loss": 0.7396, "step": 287 }, { "epoch": 1.7784711388455539, "grad_norm": 0.42235442996025085, "learning_rate": 1.0260157130864178e-06, "loss": 0.7747, "step": 288 }, { "epoch": 1.7847113884555381, "grad_norm": 0.43022453784942627, "learning_rate": 9.63926588919083e-07, "loss": 0.7616, "step": 289 }, { "epoch": 1.7909516380655226, "grad_norm": 0.42115500569343567, "learning_rate": 9.036800464548157e-07, "loss": 0.7609, "step": 290 }, { "epoch": 1.797191887675507, "grad_norm": 0.43961915373802185, "learning_rate": 8.4528837080594e-07, "loss": 0.7665, "step": 291 }, { "epoch": 1.8034321372854913, "grad_norm": 0.42159175872802734, "learning_rate": 7.887634688515e-07, "loss": 0.7684, "step": 292 }, { "epoch": 1.8096723868954758, "grad_norm": 0.4211348295211792, "learning_rate": 7.341168668092857e-07, "loss": 0.7701, "step": 293 }, { "epoch": 1.8159126365054603, "grad_norm": 0.447083055973053, "learning_rate": 6.813597078854772e-07, "loss": 0.8051, "step": 294 }, { "epoch": 1.8221528861154446, "grad_norm": 0.42439502477645874, "learning_rate": 6.305027500023841e-07, "loss": 0.7794, "step": 295 }, { "epoch": 1.828393135725429, "grad_norm": 0.43265220522880554, "learning_rate": 5.815563636047539e-07, "loss": 0.7722, "step": 296 }, { "epoch": 1.8346333853354135, "grad_norm": 0.4244794249534607, "learning_rate": 5.345305295450997e-07, "loss": 0.7638, "step": 297 }, { "epoch": 1.8408736349453978, "grad_norm": 0.41255486011505127, "learning_rate": 4.894348370484648e-07, "loss": 0.772, "step": 298 }, { "epoch": 1.847113884555382, "grad_norm": 0.4242880344390869, "learning_rate": 4.4627848175703315e-07, "loss": 0.7643, "step": 299 }, { "epoch": 1.8533541341653668, "grad_norm": 0.41673314571380615, "learning_rate": 4.0507026385502747e-07, "loss": 0.7601, "step": 300 }, { "epoch": 1.859594383775351, "grad_norm": 0.42518967390060425, "learning_rate": 3.658185862742103e-07, "loss": 0.7699, "step": 301 }, { "epoch": 1.8658346333853353, "grad_norm": 0.42029449343681335, "learning_rate": 3.2853145298042954e-07, "loss": 0.7498, "step": 302 }, { "epoch": 1.8720748829953198, "grad_norm": 0.4201337695121765, "learning_rate": 2.93216467341475e-07, "loss": 0.7626, "step": 303 }, { "epoch": 1.8783151326053042, "grad_norm": 0.42505332827568054, "learning_rate": 2.5988083057666534e-07, "loss": 0.774, "step": 304 }, { "epoch": 1.8845553822152885, "grad_norm": 0.41834697127342224, "learning_rate": 2.2853134028840594e-07, "loss": 0.7638, "step": 305 }, { "epoch": 1.890795631825273, "grad_norm": 0.4278232455253601, "learning_rate": 1.9917438907606556e-07, "loss": 0.7906, "step": 306 }, { "epoch": 1.8970358814352575, "grad_norm": 0.429078608751297, "learning_rate": 1.7181596323244453e-07, "loss": 0.7839, "step": 307 }, { "epoch": 1.9032761310452417, "grad_norm": 0.4142579138278961, "learning_rate": 1.464616415230702e-07, "loss": 0.7687, "step": 308 }, { "epoch": 1.9095163806552262, "grad_norm": 0.40971171855926514, "learning_rate": 1.231165940486234e-07, "loss": 0.7647, "step": 309 }, { "epoch": 1.9157566302652107, "grad_norm": 0.4336109161376953, "learning_rate": 1.0178558119067316e-07, "loss": 0.7691, "step": 310 }, { "epoch": 1.921996879875195, "grad_norm": 0.40623047947883606, "learning_rate": 8.247295264097288e-08, "loss": 0.7728, "step": 311 }, { "epoch": 1.9282371294851794, "grad_norm": 0.4205041527748108, "learning_rate": 6.51826465144978e-08, "loss": 0.7533, "step": 312 }, { "epoch": 1.934477379095164, "grad_norm": 0.416535347700119, "learning_rate": 4.991818854640396e-08, "loss": 0.7826, "step": 313 }, { "epoch": 1.9407176287051482, "grad_norm": 0.41483184695243835, "learning_rate": 3.668269137308666e-08, "loss": 0.7688, "step": 314 }, { "epoch": 1.9469578783151325, "grad_norm": 0.4072718322277069, "learning_rate": 2.547885389746485e-08, "loss": 0.7943, "step": 315 }, { "epoch": 1.9531981279251172, "grad_norm": 0.413289338350296, "learning_rate": 1.630896073864352e-08, "loss": 0.7867, "step": 316 }, { "epoch": 1.9594383775351014, "grad_norm": 0.4177180528640747, "learning_rate": 9.174881766043086e-09, "loss": 0.781, "step": 317 }, { "epoch": 1.9656786271450857, "grad_norm": 0.41807225346565247, "learning_rate": 4.0780717181077015e-09, "loss": 0.769, "step": 318 }, { "epoch": 1.9719188767550702, "grad_norm": 0.41558825969696045, "learning_rate": 1.019569905666984e-09, "loss": 0.7504, "step": 319 }, { "epoch": 1.9781591263650546, "grad_norm": 0.4160574674606323, "learning_rate": 0.0, "loss": 0.8025, "step": 320 }, { "epoch": 1.9781591263650546, "eval_loss": 0.903252899646759, "eval_runtime": 134.5566, "eval_samples_per_second": 100.709, "eval_steps_per_second": 6.295, "step": 320 } ], "logging_steps": 1, "max_steps": 320, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 80, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9476972312723456e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }