{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.9781591263650546,
  "eval_steps": 80,
  "global_step": 320,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0062402496099844,
      "grad_norm": 19.714784622192383,
      "learning_rate": 2.0000000000000002e-07,
      "loss": 1.3589,
      "step": 1
    },
    {
      "epoch": 0.0062402496099844,
      "eval_loss": 1.3540421724319458,
      "eval_runtime": 132.5999,
      "eval_samples_per_second": 102.195,
      "eval_steps_per_second": 6.388,
      "step": 1
    },
    {
      "epoch": 0.0124804992199688,
      "grad_norm": 20.498014450073242,
      "learning_rate": 4.0000000000000003e-07,
      "loss": 1.3662,
      "step": 2
    },
    {
      "epoch": 0.0187207488299532,
      "grad_norm": 19.82619285583496,
      "learning_rate": 6.000000000000001e-07,
      "loss": 1.3336,
      "step": 3
    },
    {
      "epoch": 0.0249609984399376,
      "grad_norm": 18.423460006713867,
      "learning_rate": 8.000000000000001e-07,
      "loss": 1.3555,
      "step": 4
    },
    {
      "epoch": 0.031201248049921998,
      "grad_norm": 16.555850982666016,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.3527,
      "step": 5
    },
    {
      "epoch": 0.0374414976599064,
      "grad_norm": 10.684965133666992,
      "learning_rate": 1.2000000000000002e-06,
      "loss": 1.3491,
      "step": 6
    },
    {
      "epoch": 0.0436817472698908,
      "grad_norm": 8.396592140197754,
      "learning_rate": 1.4000000000000001e-06,
      "loss": 1.3181,
      "step": 7
    },
    {
      "epoch": 0.0499219968798752,
      "grad_norm": 3.145500421524048,
      "learning_rate": 1.6000000000000001e-06,
      "loss": 1.2984,
      "step": 8
    },
    {
      "epoch": 0.056162246489859596,
      "grad_norm": 2.981050491333008,
      "learning_rate": 1.8000000000000001e-06,
      "loss": 1.2901,
      "step": 9
    },
    {
      "epoch": 0.062402496099843996,
      "grad_norm": 2.741509199142456,
      "learning_rate": 2.0000000000000003e-06,
      "loss": 1.2948,
      "step": 10
    },
    {
      "epoch": 0.0686427457098284,
      "grad_norm": 3.8496174812316895,
      "learning_rate": 2.2e-06,
      "loss": 1.2524,
      "step": 11
    },
    {
      "epoch": 0.0748829953198128,
      "grad_norm": 3.039551258087158,
      "learning_rate": 2.4000000000000003e-06,
      "loss": 1.2369,
      "step": 12
    },
    {
      "epoch": 0.0811232449297972,
      "grad_norm": 2.215259313583374,
      "learning_rate": 2.6e-06,
      "loss": 1.244,
      "step": 13
    },
    {
      "epoch": 0.0873634945397816,
      "grad_norm": 1.4627336263656616,
      "learning_rate": 2.8000000000000003e-06,
      "loss": 1.2201,
      "step": 14
    },
    {
      "epoch": 0.093603744149766,
      "grad_norm": 2.0100812911987305,
      "learning_rate": 3e-06,
      "loss": 1.2097,
      "step": 15
    },
    {
      "epoch": 0.0998439937597504,
      "grad_norm": 2.0757627487182617,
      "learning_rate": 3.2000000000000003e-06,
      "loss": 1.2058,
      "step": 16
    },
    {
      "epoch": 0.1060842433697348,
      "grad_norm": 1.6582179069519043,
      "learning_rate": 3.4000000000000005e-06,
      "loss": 1.1775,
      "step": 17
    },
    {
      "epoch": 0.11232449297971919,
      "grad_norm": 1.2454713582992554,
      "learning_rate": 3.6000000000000003e-06,
      "loss": 1.1405,
      "step": 18
    },
    {
      "epoch": 0.11856474258970359,
      "grad_norm": 1.0032132863998413,
      "learning_rate": 3.8000000000000005e-06,
      "loss": 1.1442,
      "step": 19
    },
    {
      "epoch": 0.12480499219968799,
      "grad_norm": 1.3543955087661743,
      "learning_rate": 4.000000000000001e-06,
      "loss": 1.1874,
      "step": 20
    },
    {
      "epoch": 0.1310452418096724,
      "grad_norm": 1.2795507907867432,
      "learning_rate": 4.2000000000000004e-06,
      "loss": 1.1423,
      "step": 21
    },
    {
      "epoch": 0.1372854914196568,
      "grad_norm": 1.0040080547332764,
      "learning_rate": 4.4e-06,
      "loss": 1.1269,
      "step": 22
    },
    {
      "epoch": 0.1435257410296412,
      "grad_norm": 0.9706005454063416,
      "learning_rate": 4.600000000000001e-06,
      "loss": 1.1508,
      "step": 23
    },
    {
      "epoch": 0.1497659906396256,
      "grad_norm": 0.905784547328949,
      "learning_rate": 4.800000000000001e-06,
      "loss": 1.1003,
      "step": 24
    },
    {
      "epoch": 0.15600624024961,
      "grad_norm": 0.8688749074935913,
      "learning_rate": 5e-06,
      "loss": 1.1046,
      "step": 25
    },
    {
      "epoch": 0.1622464898595944,
      "grad_norm": 0.7418661713600159,
      "learning_rate": 5.2e-06,
      "loss": 1.0736,
      "step": 26
    },
    {
      "epoch": 0.1684867394695788,
      "grad_norm": 0.7218017578125,
      "learning_rate": 5.400000000000001e-06,
      "loss": 1.0924,
      "step": 27
    },
    {
      "epoch": 0.1747269890795632,
      "grad_norm": 0.7364180684089661,
      "learning_rate": 5.600000000000001e-06,
      "loss": 1.0666,
      "step": 28
    },
    {
      "epoch": 0.1809672386895476,
      "grad_norm": 0.6347681879997253,
      "learning_rate": 5.8e-06,
      "loss": 1.0533,
      "step": 29
    },
    {
      "epoch": 0.187207488299532,
      "grad_norm": 0.672021210193634,
      "learning_rate": 6e-06,
      "loss": 1.0719,
      "step": 30
    },
    {
      "epoch": 0.1934477379095164,
      "grad_norm": 0.6880649328231812,
      "learning_rate": 6.200000000000001e-06,
      "loss": 1.0555,
      "step": 31
    },
    {
      "epoch": 0.1996879875195008,
      "grad_norm": 0.5669052004814148,
      "learning_rate": 6.4000000000000006e-06,
      "loss": 1.0845,
      "step": 32
    },
    {
      "epoch": 0.2059282371294852,
      "grad_norm": 0.6051258444786072,
      "learning_rate": 6.600000000000001e-06,
      "loss": 1.0656,
      "step": 33
    },
    {
      "epoch": 0.2121684867394696,
      "grad_norm": 0.5937217473983765,
      "learning_rate": 6.800000000000001e-06,
      "loss": 1.0738,
      "step": 34
    },
    {
      "epoch": 0.21840873634945399,
      "grad_norm": 0.5861482620239258,
      "learning_rate": 7e-06,
      "loss": 1.0497,
      "step": 35
    },
    {
      "epoch": 0.22464898595943839,
      "grad_norm": 0.5939168334007263,
      "learning_rate": 7.2000000000000005e-06,
      "loss": 1.0657,
      "step": 36
    },
    {
      "epoch": 0.23088923556942278,
      "grad_norm": 0.5843105316162109,
      "learning_rate": 7.4e-06,
      "loss": 1.0498,
      "step": 37
    },
    {
      "epoch": 0.23712948517940718,
      "grad_norm": 0.5303648710250854,
      "learning_rate": 7.600000000000001e-06,
      "loss": 1.0604,
      "step": 38
    },
    {
      "epoch": 0.24336973478939158,
      "grad_norm": 0.558338463306427,
      "learning_rate": 7.800000000000002e-06,
      "loss": 1.0383,
      "step": 39
    },
    {
      "epoch": 0.24960998439937598,
      "grad_norm": 0.49629613757133484,
      "learning_rate": 8.000000000000001e-06,
      "loss": 1.0521,
      "step": 40
    },
    {
      "epoch": 0.25585023400936036,
      "grad_norm": 0.5873180627822876,
      "learning_rate": 8.2e-06,
      "loss": 1.0403,
      "step": 41
    },
    {
      "epoch": 0.2620904836193448,
      "grad_norm": 0.5466005802154541,
      "learning_rate": 8.400000000000001e-06,
      "loss": 1.0127,
      "step": 42
    },
    {
      "epoch": 0.26833073322932915,
      "grad_norm": 0.5514444708824158,
      "learning_rate": 8.6e-06,
      "loss": 1.0399,
      "step": 43
    },
    {
      "epoch": 0.2745709828393136,
      "grad_norm": 0.5304705500602722,
      "learning_rate": 8.8e-06,
      "loss": 1.0057,
      "step": 44
    },
    {
      "epoch": 0.28081123244929795,
      "grad_norm": 0.5105130076408386,
      "learning_rate": 9e-06,
      "loss": 1.0174,
      "step": 45
    },
    {
      "epoch": 0.2870514820592824,
      "grad_norm": 0.533640444278717,
      "learning_rate": 9.200000000000002e-06,
      "loss": 1.0342,
      "step": 46
    },
    {
      "epoch": 0.29329173166926675,
      "grad_norm": 0.48208147287368774,
      "learning_rate": 9.4e-06,
      "loss": 1.0195,
      "step": 47
    },
    {
      "epoch": 0.2995319812792512,
      "grad_norm": 0.5069381594657898,
      "learning_rate": 9.600000000000001e-06,
      "loss": 1.0382,
      "step": 48
    },
    {
      "epoch": 0.30577223088923555,
      "grad_norm": 0.4819696843624115,
      "learning_rate": 9.800000000000001e-06,
      "loss": 1.05,
      "step": 49
    },
    {
      "epoch": 0.31201248049922,
      "grad_norm": 0.5414313673973083,
      "learning_rate": 1e-05,
      "loss": 1.0245,
      "step": 50
    },
    {
      "epoch": 0.31825273010920435,
      "grad_norm": 0.4769354462623596,
      "learning_rate": 1.02e-05,
      "loss": 1.005,
      "step": 51
    },
    {
      "epoch": 0.3244929797191888,
      "grad_norm": 0.5051629543304443,
      "learning_rate": 1.04e-05,
      "loss": 1.0158,
      "step": 52
    },
    {
      "epoch": 0.33073322932917315,
      "grad_norm": 0.5432644486427307,
      "learning_rate": 1.0600000000000002e-05,
      "loss": 1.0122,
      "step": 53
    },
    {
      "epoch": 0.3369734789391576,
      "grad_norm": 0.4705195128917694,
      "learning_rate": 1.0800000000000002e-05,
      "loss": 1.0053,
      "step": 54
    },
    {
      "epoch": 0.34321372854914195,
      "grad_norm": 0.5468801856040955,
      "learning_rate": 1.1000000000000001e-05,
      "loss": 1.0173,
      "step": 55
    },
    {
      "epoch": 0.3494539781591264,
      "grad_norm": 0.6218928694725037,
      "learning_rate": 1.1200000000000001e-05,
      "loss": 0.9974,
      "step": 56
    },
    {
      "epoch": 0.35569422776911075,
      "grad_norm": 0.532873272895813,
      "learning_rate": 1.14e-05,
      "loss": 1.006,
      "step": 57
    },
    {
      "epoch": 0.3619344773790952,
      "grad_norm": 0.48144450783729553,
      "learning_rate": 1.16e-05,
      "loss": 1.0025,
      "step": 58
    },
    {
      "epoch": 0.36817472698907955,
      "grad_norm": 0.5385976433753967,
      "learning_rate": 1.18e-05,
      "loss": 0.9976,
      "step": 59
    },
    {
      "epoch": 0.374414976599064,
      "grad_norm": 0.5179689526557922,
      "learning_rate": 1.2e-05,
      "loss": 0.9988,
      "step": 60
    },
    {
      "epoch": 0.38065522620904835,
      "grad_norm": 0.4646259844303131,
      "learning_rate": 1.22e-05,
      "loss": 0.9959,
      "step": 61
    },
    {
      "epoch": 0.3868954758190328,
      "grad_norm": 0.5259431004524231,
      "learning_rate": 1.2400000000000002e-05,
      "loss": 0.9933,
      "step": 62
    },
    {
      "epoch": 0.39313572542901715,
      "grad_norm": 0.5602505803108215,
      "learning_rate": 1.2600000000000001e-05,
      "loss": 0.9732,
      "step": 63
    },
    {
      "epoch": 0.3993759750390016,
      "grad_norm": 0.5400233864784241,
      "learning_rate": 1.2800000000000001e-05,
      "loss": 1.0021,
      "step": 64
    },
    {
      "epoch": 0.40561622464898595,
      "grad_norm": 0.5008605718612671,
      "learning_rate": 1.3000000000000001e-05,
      "loss": 1.0098,
      "step": 65
    },
    {
      "epoch": 0.4118564742589704,
      "grad_norm": 0.5215092301368713,
      "learning_rate": 1.3200000000000002e-05,
      "loss": 0.9805,
      "step": 66
    },
    {
      "epoch": 0.41809672386895474,
      "grad_norm": 0.6043874025344849,
      "learning_rate": 1.3400000000000002e-05,
      "loss": 0.9651,
      "step": 67
    },
    {
      "epoch": 0.4243369734789392,
      "grad_norm": 0.5744293928146362,
      "learning_rate": 1.3600000000000002e-05,
      "loss": 0.9715,
      "step": 68
    },
    {
      "epoch": 0.43057722308892354,
      "grad_norm": 0.6228943467140198,
      "learning_rate": 1.38e-05,
      "loss": 0.9942,
      "step": 69
    },
    {
      "epoch": 0.43681747269890797,
      "grad_norm": 0.6340550780296326,
      "learning_rate": 1.4e-05,
      "loss": 1.0278,
      "step": 70
    },
    {
      "epoch": 0.44305772230889234,
      "grad_norm": 0.6537193655967712,
      "learning_rate": 1.4200000000000001e-05,
      "loss": 1.005,
      "step": 71
    },
    {
      "epoch": 0.44929797191887677,
      "grad_norm": 0.6706846356391907,
      "learning_rate": 1.4400000000000001e-05,
      "loss": 0.9736,
      "step": 72
    },
    {
      "epoch": 0.45553822152886114,
      "grad_norm": 0.5686175227165222,
      "learning_rate": 1.46e-05,
      "loss": 0.9753,
      "step": 73
    },
    {
      "epoch": 0.46177847113884557,
      "grad_norm": 0.5182248950004578,
      "learning_rate": 1.48e-05,
      "loss": 0.9964,
      "step": 74
    },
    {
      "epoch": 0.46801872074882994,
      "grad_norm": 0.5445067286491394,
      "learning_rate": 1.5000000000000002e-05,
      "loss": 0.9702,
      "step": 75
    },
    {
      "epoch": 0.47425897035881437,
      "grad_norm": 0.6168459057807922,
      "learning_rate": 1.5200000000000002e-05,
      "loss": 0.9791,
      "step": 76
    },
    {
      "epoch": 0.48049921996879874,
      "grad_norm": 0.6475315093994141,
      "learning_rate": 1.54e-05,
      "loss": 0.98,
      "step": 77
    },
    {
      "epoch": 0.48673946957878317,
      "grad_norm": 0.8365716934204102,
      "learning_rate": 1.5600000000000003e-05,
      "loss": 0.987,
      "step": 78
    },
    {
      "epoch": 0.49297971918876754,
      "grad_norm": 1.0882554054260254,
      "learning_rate": 1.58e-05,
      "loss": 0.9796,
      "step": 79
    },
    {
      "epoch": 0.49921996879875197,
      "grad_norm": 1.109529972076416,
      "learning_rate": 1.6000000000000003e-05,
      "loss": 0.9825,
      "step": 80
    },
    {
      "epoch": 0.49921996879875197,
      "eval_loss": 0.9798125624656677,
      "eval_runtime": 132.8615,
      "eval_samples_per_second": 101.993,
      "eval_steps_per_second": 6.375,
      "step": 80
    },
    {
      "epoch": 0.5054602184087363,
      "grad_norm": 0.9999867677688599,
      "learning_rate": 1.62e-05,
      "loss": 0.9591,
      "step": 81
    },
    {
      "epoch": 0.5117004680187207,
      "grad_norm": 0.8042426705360413,
      "learning_rate": 1.64e-05,
      "loss": 0.9832,
      "step": 82
    },
    {
      "epoch": 0.5179407176287052,
      "grad_norm": 0.5843170881271362,
      "learning_rate": 1.66e-05,
      "loss": 0.9769,
      "step": 83
    },
    {
      "epoch": 0.5241809672386896,
      "grad_norm": 0.6988096237182617,
      "learning_rate": 1.6800000000000002e-05,
      "loss": 0.9556,
      "step": 84
    },
    {
      "epoch": 0.5304212168486739,
      "grad_norm": 0.7298963665962219,
      "learning_rate": 1.7e-05,
      "loss": 0.983,
      "step": 85
    },
    {
      "epoch": 0.5366614664586583,
      "grad_norm": 0.7856214046478271,
      "learning_rate": 1.72e-05,
      "loss": 0.9541,
      "step": 86
    },
    {
      "epoch": 0.5429017160686428,
      "grad_norm": 0.6896259188652039,
      "learning_rate": 1.7400000000000003e-05,
      "loss": 0.983,
      "step": 87
    },
    {
      "epoch": 0.5491419656786272,
      "grad_norm": 0.5555576682090759,
      "learning_rate": 1.76e-05,
      "loss": 0.9395,
      "step": 88
    },
    {
      "epoch": 0.5553822152886115,
      "grad_norm": 0.6107622385025024,
      "learning_rate": 1.7800000000000002e-05,
      "loss": 0.9601,
      "step": 89
    },
    {
      "epoch": 0.5616224648985959,
      "grad_norm": 0.7116836309432983,
      "learning_rate": 1.8e-05,
      "loss": 0.9647,
      "step": 90
    },
    {
      "epoch": 0.5678627145085804,
      "grad_norm": 0.5782715082168579,
      "learning_rate": 1.8200000000000002e-05,
      "loss": 0.9604,
      "step": 91
    },
    {
      "epoch": 0.5741029641185648,
      "grad_norm": 0.512100338935852,
      "learning_rate": 1.8400000000000003e-05,
      "loss": 0.9433,
      "step": 92
    },
    {
      "epoch": 0.5803432137285491,
      "grad_norm": 0.6315212249755859,
      "learning_rate": 1.86e-05,
      "loss": 0.969,
      "step": 93
    },
    {
      "epoch": 0.5865834633385335,
      "grad_norm": 0.6883739233016968,
      "learning_rate": 1.88e-05,
      "loss": 0.9832,
      "step": 94
    },
    {
      "epoch": 0.592823712948518,
      "grad_norm": 0.6760767102241516,
      "learning_rate": 1.9e-05,
      "loss": 0.9414,
      "step": 95
    },
    {
      "epoch": 0.5990639625585024,
      "grad_norm": 0.6577237248420715,
      "learning_rate": 1.9200000000000003e-05,
      "loss": 0.9748,
      "step": 96
    },
    {
      "epoch": 0.6053042121684867,
      "grad_norm": 0.9515029788017273,
      "learning_rate": 1.94e-05,
      "loss": 0.9863,
      "step": 97
    },
    {
      "epoch": 0.6115444617784711,
      "grad_norm": 1.40570068359375,
      "learning_rate": 1.9600000000000002e-05,
      "loss": 0.9445,
      "step": 98
    },
    {
      "epoch": 0.6177847113884556,
      "grad_norm": 0.9026833176612854,
      "learning_rate": 1.98e-05,
      "loss": 0.9436,
      "step": 99
    },
    {
      "epoch": 0.62402496099844,
      "grad_norm": 0.6666714549064636,
      "learning_rate": 2e-05,
      "loss": 0.9832,
      "step": 100
    },
    {
      "epoch": 0.6302652106084243,
      "grad_norm": 0.8047837018966675,
      "learning_rate": 1.9998980430094333e-05,
      "loss": 0.9498,
      "step": 101
    },
    {
      "epoch": 0.6365054602184087,
      "grad_norm": 0.9035269618034363,
      "learning_rate": 1.9995921928281893e-05,
      "loss": 0.9541,
      "step": 102
    },
    {
      "epoch": 0.6427457098283932,
      "grad_norm": 1.027601718902588,
      "learning_rate": 1.9990825118233958e-05,
      "loss": 0.9786,
      "step": 103
    },
    {
      "epoch": 0.6489859594383776,
      "grad_norm": 1.1459457874298096,
      "learning_rate": 1.9983691039261358e-05,
      "loss": 0.9482,
      "step": 104
    },
    {
      "epoch": 0.6552262090483619,
      "grad_norm": 0.7179874777793884,
      "learning_rate": 1.9974521146102535e-05,
      "loss": 0.9743,
      "step": 105
    },
    {
      "epoch": 0.6614664586583463,
      "grad_norm": 0.6881632208824158,
      "learning_rate": 1.9963317308626916e-05,
      "loss": 0.9797,
      "step": 106
    },
    {
      "epoch": 0.6677067082683308,
      "grad_norm": 0.7822304368019104,
      "learning_rate": 1.9950081811453598e-05,
      "loss": 0.9682,
      "step": 107
    },
    {
      "epoch": 0.6739469578783152,
      "grad_norm": 0.8269001841545105,
      "learning_rate": 1.99348173534855e-05,
      "loss": 0.9455,
      "step": 108
    },
    {
      "epoch": 0.6801872074882995,
      "grad_norm": 0.8077254295349121,
      "learning_rate": 1.991752704735903e-05,
      "loss": 0.9243,
      "step": 109
    },
    {
      "epoch": 0.6864274570982839,
      "grad_norm": 0.8119699954986572,
      "learning_rate": 1.989821441880933e-05,
      "loss": 0.9273,
      "step": 110
    },
    {
      "epoch": 0.6926677067082684,
      "grad_norm": 0.8220670223236084,
      "learning_rate": 1.9876883405951378e-05,
      "loss": 0.9455,
      "step": 111
    },
    {
      "epoch": 0.6989079563182528,
      "grad_norm": 0.8622007966041565,
      "learning_rate": 1.9853538358476933e-05,
      "loss": 0.9624,
      "step": 112
    },
    {
      "epoch": 0.7051482059282371,
      "grad_norm": 0.8222960233688354,
      "learning_rate": 1.9828184036767556e-05,
      "loss": 0.955,
      "step": 113
    },
    {
      "epoch": 0.7113884555382215,
      "grad_norm": 0.62811678647995,
      "learning_rate": 1.9800825610923937e-05,
      "loss": 0.9551,
      "step": 114
    },
    {
      "epoch": 0.717628705148206,
      "grad_norm": 0.7614508271217346,
      "learning_rate": 1.9771468659711595e-05,
      "loss": 0.9413,
      "step": 115
    },
    {
      "epoch": 0.7238689547581904,
      "grad_norm": 0.6695716977119446,
      "learning_rate": 1.9740119169423337e-05,
      "loss": 0.9384,
      "step": 116
    },
    {
      "epoch": 0.7301092043681747,
      "grad_norm": 0.5493482947349548,
      "learning_rate": 1.9706783532658528e-05,
      "loss": 0.9601,
      "step": 117
    },
    {
      "epoch": 0.7363494539781591,
      "grad_norm": 0.7798200249671936,
      "learning_rate": 1.9671468547019575e-05,
      "loss": 0.9555,
      "step": 118
    },
    {
      "epoch": 0.7425897035881436,
      "grad_norm": 0.8122205138206482,
      "learning_rate": 1.963418141372579e-05,
      "loss": 0.9351,
      "step": 119
    },
    {
      "epoch": 0.748829953198128,
      "grad_norm": 0.6351688504219055,
      "learning_rate": 1.9594929736144978e-05,
      "loss": 0.9517,
      "step": 120
    },
    {
      "epoch": 0.7550702028081123,
      "grad_norm": 0.8507185578346252,
      "learning_rate": 1.955372151824297e-05,
      "loss": 0.9482,
      "step": 121
    },
    {
      "epoch": 0.7613104524180967,
      "grad_norm": 1.057692050933838,
      "learning_rate": 1.9510565162951538e-05,
      "loss": 0.9626,
      "step": 122
    },
    {
      "epoch": 0.7675507020280812,
      "grad_norm": 0.789968729019165,
      "learning_rate": 1.94654694704549e-05,
      "loss": 0.9504,
      "step": 123
    },
    {
      "epoch": 0.7737909516380655,
      "grad_norm": 0.8988214731216431,
      "learning_rate": 1.941844363639525e-05,
      "loss": 0.9339,
      "step": 124
    },
    {
      "epoch": 0.7800312012480499,
      "grad_norm": 0.6798993945121765,
      "learning_rate": 1.936949724999762e-05,
      "loss": 0.9387,
      "step": 125
    },
    {
      "epoch": 0.7862714508580343,
      "grad_norm": 0.7597091794013977,
      "learning_rate": 1.9318640292114526e-05,
      "loss": 0.9884,
      "step": 126
    },
    {
      "epoch": 0.7925117004680188,
      "grad_norm": 0.9357583522796631,
      "learning_rate": 1.9265883133190715e-05,
      "loss": 0.9382,
      "step": 127
    },
    {
      "epoch": 0.7987519500780031,
      "grad_norm": 0.8738594055175781,
      "learning_rate": 1.92112365311485e-05,
      "loss": 0.9482,
      "step": 128
    },
    {
      "epoch": 0.8049921996879875,
      "grad_norm": 0.8523539900779724,
      "learning_rate": 1.9154711629194062e-05,
      "loss": 0.9299,
      "step": 129
    },
    {
      "epoch": 0.8112324492979719,
      "grad_norm": 0.5781116485595703,
      "learning_rate": 1.9096319953545186e-05,
      "loss": 0.9636,
      "step": 130
    },
    {
      "epoch": 0.8174726989079563,
      "grad_norm": 0.7737751007080078,
      "learning_rate": 1.9036073411080917e-05,
      "loss": 0.9482,
      "step": 131
    },
    {
      "epoch": 0.8237129485179407,
      "grad_norm": 0.7203546762466431,
      "learning_rate": 1.8973984286913584e-05,
      "loss": 0.9298,
      "step": 132
    },
    {
      "epoch": 0.8299531981279251,
      "grad_norm": 0.5875493288040161,
      "learning_rate": 1.891006524188368e-05,
      "loss": 0.9239,
      "step": 133
    },
    {
      "epoch": 0.8361934477379095,
      "grad_norm": 0.7981539964675903,
      "learning_rate": 1.8844329309978146e-05,
      "loss": 0.9546,
      "step": 134
    },
    {
      "epoch": 0.8424336973478939,
      "grad_norm": 0.7623902559280396,
      "learning_rate": 1.8776789895672557e-05,
      "loss": 0.9335,
      "step": 135
    },
    {
      "epoch": 0.8486739469578783,
      "grad_norm": 0.6350914239883423,
      "learning_rate": 1.8707460771197773e-05,
      "loss": 0.9585,
      "step": 136
    },
    {
      "epoch": 0.8549141965678627,
      "grad_norm": 0.6981391310691833,
      "learning_rate": 1.863635607373157e-05,
      "loss": 0.9271,
      "step": 137
    },
    {
      "epoch": 0.8611544461778471,
      "grad_norm": 0.7900795936584473,
      "learning_rate": 1.856349030251589e-05,
      "loss": 0.9022,
      "step": 138
    },
    {
      "epoch": 0.8673946957878315,
      "grad_norm": 0.7494855523109436,
      "learning_rate": 1.8488878315900228e-05,
      "loss": 0.9534,
      "step": 139
    },
    {
      "epoch": 0.8736349453978159,
      "grad_norm": 0.5757277011871338,
      "learning_rate": 1.8412535328311813e-05,
      "loss": 0.9397,
      "step": 140
    },
    {
      "epoch": 0.8798751950078003,
      "grad_norm": 0.6893640756607056,
      "learning_rate": 1.8334476907153177e-05,
      "loss": 0.952,
      "step": 141
    },
    {
      "epoch": 0.8861154446177847,
      "grad_norm": 0.7050842046737671,
      "learning_rate": 1.825471896962774e-05,
      "loss": 0.9417,
      "step": 142
    },
    {
      "epoch": 0.8923556942277691,
      "grad_norm": 0.5544989109039307,
      "learning_rate": 1.817327777949407e-05,
      "loss": 0.9008,
      "step": 143
    },
    {
      "epoch": 0.8985959438377535,
      "grad_norm": 0.6469840407371521,
      "learning_rate": 1.8090169943749477e-05,
      "loss": 0.9471,
      "step": 144
    },
    {
      "epoch": 0.9048361934477379,
      "grad_norm": 0.6894209384918213,
      "learning_rate": 1.8005412409243604e-05,
      "loss": 0.9553,
      "step": 145
    },
    {
      "epoch": 0.9110764430577223,
      "grad_norm": 0.5356501936912537,
      "learning_rate": 1.7919022459222754e-05,
      "loss": 0.9496,
      "step": 146
    },
    {
      "epoch": 0.9173166926677067,
      "grad_norm": 0.6416233777999878,
      "learning_rate": 1.7831017709805555e-05,
      "loss": 0.9558,
      "step": 147
    },
    {
      "epoch": 0.9235569422776911,
      "grad_norm": 0.7085059881210327,
      "learning_rate": 1.7741416106390828e-05,
      "loss": 0.9168,
      "step": 148
    },
    {
      "epoch": 0.9297971918876755,
      "grad_norm": 0.6492967009544373,
      "learning_rate": 1.7650235919998234e-05,
      "loss": 0.9065,
      "step": 149
    },
    {
      "epoch": 0.9360374414976599,
      "grad_norm": 0.7753322124481201,
      "learning_rate": 1.7557495743542586e-05,
      "loss": 0.9285,
      "step": 150
    },
    {
      "epoch": 0.9422776911076443,
      "grad_norm": 0.6451005935668945,
      "learning_rate": 1.7463214488042472e-05,
      "loss": 0.9567,
      "step": 151
    },
    {
      "epoch": 0.9485179407176287,
      "grad_norm": 0.4824198782444,
      "learning_rate": 1.736741137876405e-05,
      "loss": 0.905,
      "step": 152
    },
    {
      "epoch": 0.9547581903276131,
      "grad_norm": 0.5846424102783203,
      "learning_rate": 1.727010595130074e-05,
      "loss": 0.9426,
      "step": 153
    },
    {
      "epoch": 0.9609984399375975,
      "grad_norm": 0.5984904170036316,
      "learning_rate": 1.7171318047589637e-05,
      "loss": 0.9398,
      "step": 154
    },
    {
      "epoch": 0.9672386895475819,
      "grad_norm": 0.545465886592865,
      "learning_rate": 1.7071067811865477e-05,
      "loss": 0.9185,
      "step": 155
    },
    {
      "epoch": 0.9734789391575663,
      "grad_norm": 0.5724261403083801,
      "learning_rate": 1.696937568655294e-05,
      "loss": 0.9222,
      "step": 156
    },
    {
      "epoch": 0.9797191887675507,
      "grad_norm": 0.5120018124580383,
      "learning_rate": 1.6866262408098134e-05,
      "loss": 0.93,
      "step": 157
    },
    {
      "epoch": 0.9859594383775351,
      "grad_norm": 0.5575640797615051,
      "learning_rate": 1.6761749002740195e-05,
      "loss": 0.9483,
      "step": 158
    },
    {
      "epoch": 0.9921996879875195,
      "grad_norm": 0.603184163570404,
      "learning_rate": 1.6655856782223682e-05,
      "loss": 0.9394,
      "step": 159
    },
    {
      "epoch": 0.9984399375975039,
      "grad_norm": 0.536756157875061,
      "learning_rate": 1.6548607339452853e-05,
      "loss": 0.9227,
      "step": 160
    },
    {
      "epoch": 0.9984399375975039,
      "eval_loss": 0.9286661744117737,
      "eval_runtime": 134.8358,
      "eval_samples_per_second": 100.5,
      "eval_steps_per_second": 6.282,
      "step": 160
    },
    {
      "epoch": 1.0046801872074882,
      "grad_norm": 0.5680767297744751,
      "learning_rate": 1.6440022544088553e-05,
      "loss": 0.9263,
      "step": 161
    },
    {
      "epoch": 1.0109204368174727,
      "grad_norm": 0.6374309062957764,
      "learning_rate": 1.6330124538088705e-05,
      "loss": 0.942,
      "step": 162
    },
    {
      "epoch": 1.0171606864274572,
      "grad_norm": 0.5749344825744629,
      "learning_rate": 1.6218935731193223e-05,
      "loss": 0.9264,
      "step": 163
    },
    {
      "epoch": 1.0046801872074882,
      "grad_norm": 0.9867531657218933,
      "learning_rate": 1.6106478796354382e-05,
      "loss": 0.8513,
      "step": 164
    },
    {
      "epoch": 1.0109204368174727,
      "grad_norm": 0.7155735492706299,
      "learning_rate": 1.599277666511347e-05,
      "loss": 0.8023,
      "step": 165
    },
    {
      "epoch": 1.0171606864274572,
      "grad_norm": 1.3990559577941895,
      "learning_rate": 1.5877852522924733e-05,
      "loss": 0.7992,
      "step": 166
    },
    {
      "epoch": 1.0234009360374414,
      "grad_norm": 0.9339443445205688,
      "learning_rate": 1.576172980442753e-05,
      "loss": 0.7982,
      "step": 167
    },
    {
      "epoch": 1.029641185647426,
      "grad_norm": 0.8383383750915527,
      "learning_rate": 1.5644432188667695e-05,
      "loss": 0.7764,
      "step": 168
    },
    {
      "epoch": 1.0358814352574104,
      "grad_norm": 0.8227719664573669,
      "learning_rate": 1.5525983594269026e-05,
      "loss": 0.7985,
      "step": 169
    },
    {
      "epoch": 1.0421216848673946,
      "grad_norm": 0.8231950998306274,
      "learning_rate": 1.5406408174555978e-05,
      "loss": 0.8097,
      "step": 170
    },
    {
      "epoch": 1.0483619344773791,
      "grad_norm": 1.1454997062683105,
      "learning_rate": 1.528573031262842e-05,
      "loss": 0.7779,
      "step": 171
    },
    {
      "epoch": 1.0546021840873634,
      "grad_norm": 0.6939067840576172,
      "learning_rate": 1.5163974616389621e-05,
      "loss": 0.8217,
      "step": 172
    },
    {
      "epoch": 1.0608424336973479,
      "grad_norm": 0.7526265978813171,
      "learning_rate": 1.504116591352832e-05,
      "loss": 0.7886,
      "step": 173
    },
    {
      "epoch": 1.0670826833073324,
      "grad_norm": 0.7892379760742188,
      "learning_rate": 1.491732924645604e-05,
      "loss": 0.7959,
      "step": 174
    },
    {
      "epoch": 1.0733229329173166,
      "grad_norm": 0.7279461026191711,
      "learning_rate": 1.479248986720057e-05,
      "loss": 0.7977,
      "step": 175
    },
    {
      "epoch": 1.079563182527301,
      "grad_norm": 0.7360721230506897,
      "learning_rate": 1.4666673232256738e-05,
      "loss": 0.7883,
      "step": 176
    },
    {
      "epoch": 1.0858034321372856,
      "grad_norm": 0.6525989174842834,
      "learning_rate": 1.4539904997395468e-05,
      "loss": 0.785,
      "step": 177
    },
    {
      "epoch": 1.0920436817472698,
      "grad_norm": 0.7803720235824585,
      "learning_rate": 1.4412211012432213e-05,
      "loss": 0.7998,
      "step": 178
    },
    {
      "epoch": 1.0982839313572543,
      "grad_norm": 0.6588256359100342,
      "learning_rate": 1.4283617315955815e-05,
      "loss": 0.7999,
      "step": 179
    },
    {
      "epoch": 1.1045241809672386,
      "grad_norm": 0.5983767509460449,
      "learning_rate": 1.4154150130018867e-05,
      "loss": 0.7848,
      "step": 180
    },
    {
      "epoch": 1.110764430577223,
      "grad_norm": 0.641603946685791,
      "learning_rate": 1.4023835854790682e-05,
      "loss": 0.7937,
      "step": 181
    },
    {
      "epoch": 1.1170046801872076,
      "grad_norm": 0.6453792452812195,
      "learning_rate": 1.3892701063173917e-05,
      "loss": 0.8004,
      "step": 182
    },
    {
      "epoch": 1.1232449297971918,
      "grad_norm": 0.6428067088127136,
      "learning_rate": 1.3760772495385998e-05,
      "loss": 0.792,
      "step": 183
    },
    {
      "epoch": 1.1294851794071763,
      "grad_norm": 0.6279442310333252,
      "learning_rate": 1.362807705350641e-05,
      "loss": 0.7859,
      "step": 184
    },
    {
      "epoch": 1.1357254290171608,
      "grad_norm": 0.6000891327857971,
      "learning_rate": 1.3494641795990986e-05,
      "loss": 0.8039,
      "step": 185
    },
    {
      "epoch": 1.141965678627145,
      "grad_norm": 0.6628398895263672,
      "learning_rate": 1.3360493932154301e-05,
      "loss": 0.7829,
      "step": 186
    },
    {
      "epoch": 1.1482059282371295,
      "grad_norm": 0.6268762946128845,
      "learning_rate": 1.3225660816621342e-05,
      "loss": 0.778,
      "step": 187
    },
    {
      "epoch": 1.154446177847114,
      "grad_norm": 0.639062225818634,
      "learning_rate": 1.3090169943749475e-05,
      "loss": 0.7796,
      "step": 188
    },
    {
      "epoch": 1.1606864274570983,
      "grad_norm": 0.6048714518547058,
      "learning_rate": 1.2954048942022002e-05,
      "loss": 0.7883,
      "step": 189
    },
    {
      "epoch": 1.1669266770670828,
      "grad_norm": 0.5929723381996155,
      "learning_rate": 1.2817325568414299e-05,
      "loss": 0.7736,
      "step": 190
    },
    {
      "epoch": 1.173166926677067,
      "grad_norm": 0.5971985459327698,
      "learning_rate": 1.2680027702733791e-05,
      "loss": 0.8008,
      "step": 191
    },
    {
      "epoch": 1.1794071762870515,
      "grad_norm": 0.6460970044136047,
      "learning_rate": 1.2542183341934873e-05,
      "loss": 0.7867,
      "step": 192
    },
    {
      "epoch": 1.185647425897036,
      "grad_norm": 0.5345771908760071,
      "learning_rate": 1.2403820594409926e-05,
      "loss": 0.7808,
      "step": 193
    },
    {
      "epoch": 1.1918876755070202,
      "grad_norm": 0.6704164743423462,
      "learning_rate": 1.2264967674257647e-05,
      "loss": 0.7785,
      "step": 194
    },
    {
      "epoch": 1.1981279251170047,
      "grad_norm": 0.5182461738586426,
      "learning_rate": 1.2125652895529766e-05,
      "loss": 0.7907,
      "step": 195
    },
    {
      "epoch": 1.204368174726989,
      "grad_norm": 0.6421562433242798,
      "learning_rate": 1.1985904666457455e-05,
      "loss": 0.7914,
      "step": 196
    },
    {
      "epoch": 1.2106084243369735,
      "grad_norm": 0.5846896171569824,
      "learning_rate": 1.1845751483658454e-05,
      "loss": 0.7631,
      "step": 197
    },
    {
      "epoch": 1.216848673946958,
      "grad_norm": 0.5582466721534729,
      "learning_rate": 1.170522192632624e-05,
      "loss": 0.7912,
      "step": 198
    },
    {
      "epoch": 1.2230889235569422,
      "grad_norm": 0.5527791976928711,
      "learning_rate": 1.156434465040231e-05,
      "loss": 0.7938,
      "step": 199
    },
    {
      "epoch": 1.2293291731669267,
      "grad_norm": 0.5673221945762634,
      "learning_rate": 1.1423148382732854e-05,
      "loss": 0.7947,
      "step": 200
    },
    {
      "epoch": 1.2355694227769112,
      "grad_norm": 0.5078392028808594,
      "learning_rate": 1.1281661915210931e-05,
      "loss": 0.7771,
      "step": 201
    },
    {
      "epoch": 1.2418096723868954,
      "grad_norm": 0.5475752353668213,
      "learning_rate": 1.1139914098905406e-05,
      "loss": 0.7781,
      "step": 202
    },
    {
      "epoch": 1.24804992199688,
      "grad_norm": 0.5290600657463074,
      "learning_rate": 1.0997933838177828e-05,
      "loss": 0.7622,
      "step": 203
    },
    {
      "epoch": 1.2542901716068644,
      "grad_norm": 0.4957723915576935,
      "learning_rate": 1.08557500847884e-05,
      "loss": 0.7857,
      "step": 204
    },
    {
      "epoch": 1.2605304212168487,
      "grad_norm": 0.5119233727455139,
      "learning_rate": 1.0713391831992324e-05,
      "loss": 0.7585,
      "step": 205
    },
    {
      "epoch": 1.2667706708268331,
      "grad_norm": 0.5187195539474487,
      "learning_rate": 1.0570888108627682e-05,
      "loss": 0.7885,
      "step": 206
    },
    {
      "epoch": 1.2730109204368174,
      "grad_norm": 0.5066515803337097,
      "learning_rate": 1.0428267973196027e-05,
      "loss": 0.7691,
      "step": 207
    },
    {
      "epoch": 1.2792511700468019,
      "grad_norm": 0.48673221468925476,
      "learning_rate": 1.0285560507936962e-05,
      "loss": 0.7715,
      "step": 208
    },
    {
      "epoch": 1.2854914196567861,
      "grad_norm": 0.5083721876144409,
      "learning_rate": 1.0142794812897874e-05,
      "loss": 0.7812,
      "step": 209
    },
    {
      "epoch": 1.2917316692667706,
      "grad_norm": 0.5033391118049622,
      "learning_rate": 1e-05,
      "loss": 0.7756,
      "step": 210
    },
    {
      "epoch": 1.2979719188767551,
      "grad_norm": 0.532008945941925,
      "learning_rate": 9.85720518710213e-06,
      "loss": 0.7898,
      "step": 211
    },
    {
      "epoch": 1.3042121684867394,
      "grad_norm": 0.5123456716537476,
      "learning_rate": 9.71443949206304e-06,
      "loss": 0.7779,
      "step": 212
    },
    {
      "epoch": 1.3104524180967239,
      "grad_norm": 0.48444995284080505,
      "learning_rate": 9.571732026803978e-06,
      "loss": 0.7598,
      "step": 213
    },
    {
      "epoch": 1.3166926677067083,
      "grad_norm": 0.5265589356422424,
      "learning_rate": 9.42911189137232e-06,
      "loss": 0.783,
      "step": 214
    },
    {
      "epoch": 1.3229329173166926,
      "grad_norm": 0.5039641261100769,
      "learning_rate": 9.286608168007678e-06,
      "loss": 0.7798,
      "step": 215
    },
    {
      "epoch": 1.329173166926677,
      "grad_norm": 0.5092752575874329,
      "learning_rate": 9.144249915211605e-06,
      "loss": 0.7635,
      "step": 216
    },
    {
      "epoch": 1.3354134165366616,
      "grad_norm": 0.5394583940505981,
      "learning_rate": 9.002066161822174e-06,
      "loss": 0.7999,
      "step": 217
    },
    {
      "epoch": 1.3416536661466458,
      "grad_norm": 0.5039361119270325,
      "learning_rate": 8.860085901094595e-06,
      "loss": 0.7745,
      "step": 218
    },
    {
      "epoch": 1.3478939157566303,
      "grad_norm": 0.5087692737579346,
      "learning_rate": 8.718338084789074e-06,
      "loss": 0.8093,
      "step": 219
    },
    {
      "epoch": 1.3541341653666148,
      "grad_norm": 0.5368974208831787,
      "learning_rate": 8.576851617267151e-06,
      "loss": 0.7596,
      "step": 220
    },
    {
      "epoch": 1.360374414976599,
      "grad_norm": 0.5123481750488281,
      "learning_rate": 8.43565534959769e-06,
      "loss": 0.7665,
      "step": 221
    },
    {
      "epoch": 1.3666146645865835,
      "grad_norm": 0.5048606395721436,
      "learning_rate": 8.294778073673762e-06,
      "loss": 0.769,
      "step": 222
    },
    {
      "epoch": 1.3728549141965678,
      "grad_norm": 0.5156130790710449,
      "learning_rate": 8.154248516341547e-06,
      "loss": 0.7989,
      "step": 223
    },
    {
      "epoch": 1.3790951638065523,
      "grad_norm": 0.5228000283241272,
      "learning_rate": 8.014095333542548e-06,
      "loss": 0.7775,
      "step": 224
    },
    {
      "epoch": 1.3853354134165365,
      "grad_norm": 0.5356248617172241,
      "learning_rate": 7.874347104470234e-06,
      "loss": 0.7629,
      "step": 225
    },
    {
      "epoch": 1.391575663026521,
      "grad_norm": 0.49624764919281006,
      "learning_rate": 7.735032325742355e-06,
      "loss": 0.8026,
      "step": 226
    },
    {
      "epoch": 1.3978159126365055,
      "grad_norm": 0.5164198279380798,
      "learning_rate": 7.596179405590076e-06,
      "loss": 0.7803,
      "step": 227
    },
    {
      "epoch": 1.4040561622464898,
      "grad_norm": 0.47972792387008667,
      "learning_rate": 7.4578166580651335e-06,
      "loss": 0.7783,
      "step": 228
    },
    {
      "epoch": 1.4102964118564743,
      "grad_norm": 0.4989663064479828,
      "learning_rate": 7.319972297266215e-06,
      "loss": 0.7728,
      "step": 229
    },
    {
      "epoch": 1.4165366614664587,
      "grad_norm": 0.5109092593193054,
      "learning_rate": 7.182674431585703e-06,
      "loss": 0.7919,
      "step": 230
    },
    {
      "epoch": 1.422776911076443,
      "grad_norm": 0.5069689750671387,
      "learning_rate": 7.045951057978001e-06,
      "loss": 0.8015,
      "step": 231
    },
    {
      "epoch": 1.4290171606864275,
      "grad_norm": 0.5074580907821655,
      "learning_rate": 6.909830056250527e-06,
      "loss": 0.7844,
      "step": 232
    },
    {
      "epoch": 1.435257410296412,
      "grad_norm": 0.47711381316185,
      "learning_rate": 6.774339183378663e-06,
      "loss": 0.7601,
      "step": 233
    },
    {
      "epoch": 1.4414976599063962,
      "grad_norm": 0.4926273822784424,
      "learning_rate": 6.639506067845698e-06,
      "loss": 0.7904,
      "step": 234
    },
    {
      "epoch": 1.4477379095163807,
      "grad_norm": 0.5004563927650452,
      "learning_rate": 6.505358204009018e-06,
      "loss": 0.7638,
      "step": 235
    },
    {
      "epoch": 1.4539781591263652,
      "grad_norm": 0.5014521479606628,
      "learning_rate": 6.3719229464935915e-06,
      "loss": 0.794,
      "step": 236
    },
    {
      "epoch": 1.4602184087363494,
      "grad_norm": 0.5181212425231934,
      "learning_rate": 6.239227504614004e-06,
      "loss": 0.7495,
      "step": 237
    },
    {
      "epoch": 1.466458658346334,
      "grad_norm": 0.5317255258560181,
      "learning_rate": 6.107298936826086e-06,
      "loss": 0.7884,
      "step": 238
    },
    {
      "epoch": 1.4726989079563182,
      "grad_norm": 0.5126049518585205,
      "learning_rate": 5.9761641452093225e-06,
      "loss": 0.7869,
      "step": 239
    },
    {
      "epoch": 1.4789391575663027,
      "grad_norm": 0.5322765707969666,
      "learning_rate": 5.845849869981137e-06,
      "loss": 0.7712,
      "step": 240
    },
    {
      "epoch": 1.4789391575663027,
      "eval_loss": 0.9110648036003113,
      "eval_runtime": 134.5659,
      "eval_samples_per_second": 100.702,
      "eval_steps_per_second": 6.294,
      "step": 240
    },
    {
      "epoch": 1.485179407176287,
      "grad_norm": 0.5006371140480042,
      "learning_rate": 5.716382684044191e-06,
      "loss": 0.7939,
      "step": 241
    },
    {
      "epoch": 1.4914196567862714,
      "grad_norm": 0.5099849104881287,
      "learning_rate": 5.587788987567785e-06,
      "loss": 0.771,
      "step": 242
    },
    {
      "epoch": 1.497659906396256,
      "grad_norm": 0.5397711396217346,
      "learning_rate": 5.460095002604533e-06,
      "loss": 0.7642,
      "step": 243
    },
    {
      "epoch": 1.5039001560062402,
      "grad_norm": 0.47754916548728943,
      "learning_rate": 5.333326767743263e-06,
      "loss": 0.7848,
      "step": 244
    },
    {
      "epoch": 1.5101404056162246,
      "grad_norm": 0.5114724636077881,
      "learning_rate": 5.207510132799436e-06,
      "loss": 0.7743,
      "step": 245
    },
    {
      "epoch": 1.5163806552262091,
      "grad_norm": 0.4859448969364166,
      "learning_rate": 5.082670753543961e-06,
      "loss": 0.7748,
      "step": 246
    },
    {
      "epoch": 1.5226209048361934,
      "grad_norm": 0.46158257126808167,
      "learning_rate": 4.958834086471683e-06,
      "loss": 0.7951,
      "step": 247
    },
    {
      "epoch": 1.5288611544461779,
      "grad_norm": 0.48695865273475647,
      "learning_rate": 4.836025383610382e-06,
      "loss": 0.7968,
      "step": 248
    },
    {
      "epoch": 1.5351014040561624,
      "grad_norm": 0.4924914240837097,
      "learning_rate": 4.714269687371581e-06,
      "loss": 0.792,
      "step": 249
    },
    {
      "epoch": 1.5413416536661466,
      "grad_norm": 0.5044175982475281,
      "learning_rate": 4.593591825444028e-06,
      "loss": 0.781,
      "step": 250
    },
    {
      "epoch": 1.547581903276131,
      "grad_norm": 0.4598456919193268,
      "learning_rate": 4.474016405730973e-06,
      "loss": 0.794,
      "step": 251
    },
    {
      "epoch": 1.5538221528861156,
      "grad_norm": 0.48866939544677734,
      "learning_rate": 4.355567811332311e-06,
      "loss": 0.7853,
      "step": 252
    },
    {
      "epoch": 1.5600624024960998,
      "grad_norm": 0.4878495931625366,
      "learning_rate": 4.2382701955724724e-06,
      "loss": 0.743,
      "step": 253
    },
    {
      "epoch": 1.566302652106084,
      "grad_norm": 0.4770466387271881,
      "learning_rate": 4.12214747707527e-06,
      "loss": 0.7442,
      "step": 254
    },
    {
      "epoch": 1.5725429017160688,
      "grad_norm": 0.4467732012271881,
      "learning_rate": 4.007223334886531e-06,
      "loss": 0.7611,
      "step": 255
    },
    {
      "epoch": 1.578783151326053,
      "grad_norm": 0.47116416692733765,
      "learning_rate": 3.893521203645618e-06,
      "loss": 0.7921,
      "step": 256
    },
    {
      "epoch": 1.5850234009360373,
      "grad_norm": 0.468517005443573,
      "learning_rate": 3.78106426880678e-06,
      "loss": 0.7811,
      "step": 257
    },
    {
      "epoch": 1.5912636505460218,
      "grad_norm": 0.46981289982795715,
      "learning_rate": 3.6698754619112974e-06,
      "loss": 0.7756,
      "step": 258
    },
    {
      "epoch": 1.5975039001560063,
      "grad_norm": 0.45571863651275635,
      "learning_rate": 3.5599774559114475e-06,
      "loss": 0.7469,
      "step": 259
    },
    {
      "epoch": 1.6037441497659906,
      "grad_norm": 0.4486157298088074,
      "learning_rate": 3.4513926605471504e-06,
      "loss": 0.7566,
      "step": 260
    },
    {
      "epoch": 1.609984399375975,
      "grad_norm": 0.47735777497291565,
      "learning_rate": 3.344143217776319e-06,
      "loss": 0.7753,
      "step": 261
    },
    {
      "epoch": 1.6162246489859595,
      "grad_norm": 0.4546492099761963,
      "learning_rate": 3.2382509972598087e-06,
      "loss": 0.7741,
      "step": 262
    },
    {
      "epoch": 1.6224648985959438,
      "grad_norm": 0.4690036177635193,
      "learning_rate": 3.133737591901864e-06,
      "loss": 0.7693,
      "step": 263
    },
    {
      "epoch": 1.6287051482059283,
      "grad_norm": 0.4607780873775482,
      "learning_rate": 3.0306243134470668e-06,
      "loss": 0.7312,
      "step": 264
    },
    {
      "epoch": 1.6349453978159127,
      "grad_norm": 0.4510229825973511,
      "learning_rate": 2.9289321881345257e-06,
      "loss": 0.7705,
      "step": 265
    },
    {
      "epoch": 1.641185647425897,
      "grad_norm": 0.43797171115875244,
      "learning_rate": 2.8286819524103657e-06,
      "loss": 0.7666,
      "step": 266
    },
    {
      "epoch": 1.6474258970358813,
      "grad_norm": 0.45275524258613586,
      "learning_rate": 2.7298940486992654e-06,
      "loss": 0.8028,
      "step": 267
    },
    {
      "epoch": 1.653666146645866,
      "grad_norm": 0.4479183852672577,
      "learning_rate": 2.6325886212359496e-06,
      "loss": 0.7647,
      "step": 268
    },
    {
      "epoch": 1.6599063962558502,
      "grad_norm": 0.4575185775756836,
      "learning_rate": 2.5367855119575314e-06,
      "loss": 0.7731,
      "step": 269
    },
    {
      "epoch": 1.6661466458658345,
      "grad_norm": 0.45092037320137024,
      "learning_rate": 2.4425042564574186e-06,
      "loss": 0.7895,
      "step": 270
    },
    {
      "epoch": 1.672386895475819,
      "grad_norm": 0.4297903776168823,
      "learning_rate": 2.3497640800017687e-06,
      "loss": 0.7597,
      "step": 271
    },
    {
      "epoch": 1.6786271450858035,
      "grad_norm": 0.43731772899627686,
      "learning_rate": 2.2585838936091753e-06,
      "loss": 0.7775,
      "step": 272
    },
    {
      "epoch": 1.6848673946957877,
      "grad_norm": 0.4288908541202545,
      "learning_rate": 2.1689822901944456e-06,
      "loss": 0.787,
      "step": 273
    },
    {
      "epoch": 1.6911076443057722,
      "grad_norm": 0.42555147409439087,
      "learning_rate": 2.0809775407772505e-06,
      "loss": 0.7751,
      "step": 274
    },
    {
      "epoch": 1.6973478939157567,
      "grad_norm": 0.45276904106140137,
      "learning_rate": 1.994587590756397e-06,
      "loss": 0.7893,
      "step": 275
    },
    {
      "epoch": 1.703588143525741,
      "grad_norm": 0.44294846057891846,
      "learning_rate": 1.9098300562505266e-06,
      "loss": 0.7794,
      "step": 276
    },
    {
      "epoch": 1.7098283931357254,
      "grad_norm": 0.4307778775691986,
      "learning_rate": 1.826722220505931e-06,
      "loss": 0.7791,
      "step": 277
    },
    {
      "epoch": 1.71606864274571,
      "grad_norm": 0.43245357275009155,
      "learning_rate": 1.74528103037226e-06,
      "loss": 0.7497,
      "step": 278
    },
    {
      "epoch": 1.7223088923556942,
      "grad_norm": 0.4251644015312195,
      "learning_rate": 1.6655230928468257e-06,
      "loss": 0.7743,
      "step": 279
    },
    {
      "epoch": 1.7285491419656787,
      "grad_norm": 0.4321819543838501,
      "learning_rate": 1.587464671688187e-06,
      "loss": 0.7522,
      "step": 280
    },
    {
      "epoch": 1.7347893915756631,
      "grad_norm": 0.4292062222957611,
      "learning_rate": 1.5111216840997745e-06,
      "loss": 0.7698,
      "step": 281
    },
    {
      "epoch": 1.7410296411856474,
      "grad_norm": 0.45093026757240295,
      "learning_rate": 1.436509697484111e-06,
      "loss": 0.7647,
      "step": 282
    },
    {
      "epoch": 1.7472698907956317,
      "grad_norm": 0.42376089096069336,
      "learning_rate": 1.3636439262684299e-06,
      "loss": 0.7556,
      "step": 283
    },
    {
      "epoch": 1.7535101404056164,
      "grad_norm": 0.4337958097457886,
      "learning_rate": 1.2925392288022299e-06,
      "loss": 0.7756,
      "step": 284
    },
    {
      "epoch": 1.7597503900156006,
      "grad_norm": 0.4319595992565155,
      "learning_rate": 1.2232101043274437e-06,
      "loss": 0.7816,
      "step": 285
    },
    {
      "epoch": 1.765990639625585,
      "grad_norm": 0.4335976243019104,
      "learning_rate": 1.1556706900218572e-06,
      "loss": 0.7902,
      "step": 286
    },
    {
      "epoch": 1.7722308892355694,
      "grad_norm": 0.4158109724521637,
      "learning_rate": 1.0899347581163222e-06,
      "loss": 0.7396,
      "step": 287
    },
    {
      "epoch": 1.7784711388455539,
      "grad_norm": 0.42235442996025085,
      "learning_rate": 1.0260157130864178e-06,
      "loss": 0.7747,
      "step": 288
    },
    {
      "epoch": 1.7847113884555381,
      "grad_norm": 0.43022453784942627,
      "learning_rate": 9.63926588919083e-07,
      "loss": 0.7616,
      "step": 289
    },
    {
      "epoch": 1.7909516380655226,
      "grad_norm": 0.42115500569343567,
      "learning_rate": 9.036800464548157e-07,
      "loss": 0.7609,
      "step": 290
    },
    {
      "epoch": 1.797191887675507,
      "grad_norm": 0.43961915373802185,
      "learning_rate": 8.4528837080594e-07,
      "loss": 0.7665,
      "step": 291
    },
    {
      "epoch": 1.8034321372854913,
      "grad_norm": 0.42159175872802734,
      "learning_rate": 7.887634688515e-07,
      "loss": 0.7684,
      "step": 292
    },
    {
      "epoch": 1.8096723868954758,
      "grad_norm": 0.4211348295211792,
      "learning_rate": 7.341168668092857e-07,
      "loss": 0.7701,
      "step": 293
    },
    {
      "epoch": 1.8159126365054603,
      "grad_norm": 0.447083055973053,
      "learning_rate": 6.813597078854772e-07,
      "loss": 0.8051,
      "step": 294
    },
    {
      "epoch": 1.8221528861154446,
      "grad_norm": 0.42439502477645874,
      "learning_rate": 6.305027500023841e-07,
      "loss": 0.7794,
      "step": 295
    },
    {
      "epoch": 1.828393135725429,
      "grad_norm": 0.43265220522880554,
      "learning_rate": 5.815563636047539e-07,
      "loss": 0.7722,
      "step": 296
    },
    {
      "epoch": 1.8346333853354135,
      "grad_norm": 0.4244794249534607,
      "learning_rate": 5.345305295450997e-07,
      "loss": 0.7638,
      "step": 297
    },
    {
      "epoch": 1.8408736349453978,
      "grad_norm": 0.41255486011505127,
      "learning_rate": 4.894348370484648e-07,
      "loss": 0.772,
      "step": 298
    },
    {
      "epoch": 1.847113884555382,
      "grad_norm": 0.4242880344390869,
      "learning_rate": 4.4627848175703315e-07,
      "loss": 0.7643,
      "step": 299
    },
    {
      "epoch": 1.8533541341653668,
      "grad_norm": 0.41673314571380615,
      "learning_rate": 4.0507026385502747e-07,
      "loss": 0.7601,
      "step": 300
    },
    {
      "epoch": 1.859594383775351,
      "grad_norm": 0.42518967390060425,
      "learning_rate": 3.658185862742103e-07,
      "loss": 0.7699,
      "step": 301
    },
    {
      "epoch": 1.8658346333853353,
      "grad_norm": 0.42029449343681335,
      "learning_rate": 3.2853145298042954e-07,
      "loss": 0.7498,
      "step": 302
    },
    {
      "epoch": 1.8720748829953198,
      "grad_norm": 0.4201337695121765,
      "learning_rate": 2.93216467341475e-07,
      "loss": 0.7626,
      "step": 303
    },
    {
      "epoch": 1.8783151326053042,
      "grad_norm": 0.42505332827568054,
      "learning_rate": 2.5988083057666534e-07,
      "loss": 0.774,
      "step": 304
    },
    {
      "epoch": 1.8845553822152885,
      "grad_norm": 0.41834697127342224,
      "learning_rate": 2.2853134028840594e-07,
      "loss": 0.7638,
      "step": 305
    },
    {
      "epoch": 1.890795631825273,
      "grad_norm": 0.4278232455253601,
      "learning_rate": 1.9917438907606556e-07,
      "loss": 0.7906,
      "step": 306
    },
    {
      "epoch": 1.8970358814352575,
      "grad_norm": 0.429078608751297,
      "learning_rate": 1.7181596323244453e-07,
      "loss": 0.7839,
      "step": 307
    },
    {
      "epoch": 1.9032761310452417,
      "grad_norm": 0.4142579138278961,
      "learning_rate": 1.464616415230702e-07,
      "loss": 0.7687,
      "step": 308
    },
    {
      "epoch": 1.9095163806552262,
      "grad_norm": 0.40971171855926514,
      "learning_rate": 1.231165940486234e-07,
      "loss": 0.7647,
      "step": 309
    },
    {
      "epoch": 1.9157566302652107,
      "grad_norm": 0.4336109161376953,
      "learning_rate": 1.0178558119067316e-07,
      "loss": 0.7691,
      "step": 310
    },
    {
      "epoch": 1.921996879875195,
      "grad_norm": 0.40623047947883606,
      "learning_rate": 8.247295264097288e-08,
      "loss": 0.7728,
      "step": 311
    },
    {
      "epoch": 1.9282371294851794,
      "grad_norm": 0.4205041527748108,
      "learning_rate": 6.51826465144978e-08,
      "loss": 0.7533,
      "step": 312
    },
    {
      "epoch": 1.934477379095164,
      "grad_norm": 0.416535347700119,
      "learning_rate": 4.991818854640396e-08,
      "loss": 0.7826,
      "step": 313
    },
    {
      "epoch": 1.9407176287051482,
      "grad_norm": 0.41483184695243835,
      "learning_rate": 3.668269137308666e-08,
      "loss": 0.7688,
      "step": 314
    },
    {
      "epoch": 1.9469578783151325,
      "grad_norm": 0.4072718322277069,
      "learning_rate": 2.547885389746485e-08,
      "loss": 0.7943,
      "step": 315
    },
    {
      "epoch": 1.9531981279251172,
      "grad_norm": 0.413289338350296,
      "learning_rate": 1.630896073864352e-08,
      "loss": 0.7867,
      "step": 316
    },
    {
      "epoch": 1.9594383775351014,
      "grad_norm": 0.4177180528640747,
      "learning_rate": 9.174881766043086e-09,
      "loss": 0.781,
      "step": 317
    },
    {
      "epoch": 1.9656786271450857,
      "grad_norm": 0.41807225346565247,
      "learning_rate": 4.0780717181077015e-09,
      "loss": 0.769,
      "step": 318
    },
    {
      "epoch": 1.9719188767550702,
      "grad_norm": 0.41558825969696045,
      "learning_rate": 1.019569905666984e-09,
      "loss": 0.7504,
      "step": 319
    },
    {
      "epoch": 1.9781591263650546,
      "grad_norm": 0.4160574674606323,
      "learning_rate": 0.0,
      "loss": 0.8025,
      "step": 320
    },
    {
      "epoch": 1.9781591263650546,
      "eval_loss": 0.903252899646759,
      "eval_runtime": 134.5566,
      "eval_samples_per_second": 100.709,
      "eval_steps_per_second": 6.295,
      "step": 320
    }
  ],
  "logging_steps": 1,
  "max_steps": 320,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 80,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1.9476972312723456e+18,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}