Meta-Llama-3.1-8B-Claude / trainer_state.json
Undi95's picture
Upload folder using huggingface_hub
f086430 verified
raw
history blame
56.6 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.9781591263650546,
"eval_steps": 80,
"global_step": 320,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0062402496099844,
"grad_norm": 19.714784622192383,
"learning_rate": 2.0000000000000002e-07,
"loss": 1.3589,
"step": 1
},
{
"epoch": 0.0062402496099844,
"eval_loss": 1.3540421724319458,
"eval_runtime": 132.5999,
"eval_samples_per_second": 102.195,
"eval_steps_per_second": 6.388,
"step": 1
},
{
"epoch": 0.0124804992199688,
"grad_norm": 20.498014450073242,
"learning_rate": 4.0000000000000003e-07,
"loss": 1.3662,
"step": 2
},
{
"epoch": 0.0187207488299532,
"grad_norm": 19.82619285583496,
"learning_rate": 6.000000000000001e-07,
"loss": 1.3336,
"step": 3
},
{
"epoch": 0.0249609984399376,
"grad_norm": 18.423460006713867,
"learning_rate": 8.000000000000001e-07,
"loss": 1.3555,
"step": 4
},
{
"epoch": 0.031201248049921998,
"grad_norm": 16.555850982666016,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.3527,
"step": 5
},
{
"epoch": 0.0374414976599064,
"grad_norm": 10.684965133666992,
"learning_rate": 1.2000000000000002e-06,
"loss": 1.3491,
"step": 6
},
{
"epoch": 0.0436817472698908,
"grad_norm": 8.396592140197754,
"learning_rate": 1.4000000000000001e-06,
"loss": 1.3181,
"step": 7
},
{
"epoch": 0.0499219968798752,
"grad_norm": 3.145500421524048,
"learning_rate": 1.6000000000000001e-06,
"loss": 1.2984,
"step": 8
},
{
"epoch": 0.056162246489859596,
"grad_norm": 2.981050491333008,
"learning_rate": 1.8000000000000001e-06,
"loss": 1.2901,
"step": 9
},
{
"epoch": 0.062402496099843996,
"grad_norm": 2.741509199142456,
"learning_rate": 2.0000000000000003e-06,
"loss": 1.2948,
"step": 10
},
{
"epoch": 0.0686427457098284,
"grad_norm": 3.8496174812316895,
"learning_rate": 2.2e-06,
"loss": 1.2524,
"step": 11
},
{
"epoch": 0.0748829953198128,
"grad_norm": 3.039551258087158,
"learning_rate": 2.4000000000000003e-06,
"loss": 1.2369,
"step": 12
},
{
"epoch": 0.0811232449297972,
"grad_norm": 2.215259313583374,
"learning_rate": 2.6e-06,
"loss": 1.244,
"step": 13
},
{
"epoch": 0.0873634945397816,
"grad_norm": 1.4627336263656616,
"learning_rate": 2.8000000000000003e-06,
"loss": 1.2201,
"step": 14
},
{
"epoch": 0.093603744149766,
"grad_norm": 2.0100812911987305,
"learning_rate": 3e-06,
"loss": 1.2097,
"step": 15
},
{
"epoch": 0.0998439937597504,
"grad_norm": 2.0757627487182617,
"learning_rate": 3.2000000000000003e-06,
"loss": 1.2058,
"step": 16
},
{
"epoch": 0.1060842433697348,
"grad_norm": 1.6582179069519043,
"learning_rate": 3.4000000000000005e-06,
"loss": 1.1775,
"step": 17
},
{
"epoch": 0.11232449297971919,
"grad_norm": 1.2454713582992554,
"learning_rate": 3.6000000000000003e-06,
"loss": 1.1405,
"step": 18
},
{
"epoch": 0.11856474258970359,
"grad_norm": 1.0032132863998413,
"learning_rate": 3.8000000000000005e-06,
"loss": 1.1442,
"step": 19
},
{
"epoch": 0.12480499219968799,
"grad_norm": 1.3543955087661743,
"learning_rate": 4.000000000000001e-06,
"loss": 1.1874,
"step": 20
},
{
"epoch": 0.1310452418096724,
"grad_norm": 1.2795507907867432,
"learning_rate": 4.2000000000000004e-06,
"loss": 1.1423,
"step": 21
},
{
"epoch": 0.1372854914196568,
"grad_norm": 1.0040080547332764,
"learning_rate": 4.4e-06,
"loss": 1.1269,
"step": 22
},
{
"epoch": 0.1435257410296412,
"grad_norm": 0.9706005454063416,
"learning_rate": 4.600000000000001e-06,
"loss": 1.1508,
"step": 23
},
{
"epoch": 0.1497659906396256,
"grad_norm": 0.905784547328949,
"learning_rate": 4.800000000000001e-06,
"loss": 1.1003,
"step": 24
},
{
"epoch": 0.15600624024961,
"grad_norm": 0.8688749074935913,
"learning_rate": 5e-06,
"loss": 1.1046,
"step": 25
},
{
"epoch": 0.1622464898595944,
"grad_norm": 0.7418661713600159,
"learning_rate": 5.2e-06,
"loss": 1.0736,
"step": 26
},
{
"epoch": 0.1684867394695788,
"grad_norm": 0.7218017578125,
"learning_rate": 5.400000000000001e-06,
"loss": 1.0924,
"step": 27
},
{
"epoch": 0.1747269890795632,
"grad_norm": 0.7364180684089661,
"learning_rate": 5.600000000000001e-06,
"loss": 1.0666,
"step": 28
},
{
"epoch": 0.1809672386895476,
"grad_norm": 0.6347681879997253,
"learning_rate": 5.8e-06,
"loss": 1.0533,
"step": 29
},
{
"epoch": 0.187207488299532,
"grad_norm": 0.672021210193634,
"learning_rate": 6e-06,
"loss": 1.0719,
"step": 30
},
{
"epoch": 0.1934477379095164,
"grad_norm": 0.6880649328231812,
"learning_rate": 6.200000000000001e-06,
"loss": 1.0555,
"step": 31
},
{
"epoch": 0.1996879875195008,
"grad_norm": 0.5669052004814148,
"learning_rate": 6.4000000000000006e-06,
"loss": 1.0845,
"step": 32
},
{
"epoch": 0.2059282371294852,
"grad_norm": 0.6051258444786072,
"learning_rate": 6.600000000000001e-06,
"loss": 1.0656,
"step": 33
},
{
"epoch": 0.2121684867394696,
"grad_norm": 0.5937217473983765,
"learning_rate": 6.800000000000001e-06,
"loss": 1.0738,
"step": 34
},
{
"epoch": 0.21840873634945399,
"grad_norm": 0.5861482620239258,
"learning_rate": 7e-06,
"loss": 1.0497,
"step": 35
},
{
"epoch": 0.22464898595943839,
"grad_norm": 0.5939168334007263,
"learning_rate": 7.2000000000000005e-06,
"loss": 1.0657,
"step": 36
},
{
"epoch": 0.23088923556942278,
"grad_norm": 0.5843105316162109,
"learning_rate": 7.4e-06,
"loss": 1.0498,
"step": 37
},
{
"epoch": 0.23712948517940718,
"grad_norm": 0.5303648710250854,
"learning_rate": 7.600000000000001e-06,
"loss": 1.0604,
"step": 38
},
{
"epoch": 0.24336973478939158,
"grad_norm": 0.558338463306427,
"learning_rate": 7.800000000000002e-06,
"loss": 1.0383,
"step": 39
},
{
"epoch": 0.24960998439937598,
"grad_norm": 0.49629613757133484,
"learning_rate": 8.000000000000001e-06,
"loss": 1.0521,
"step": 40
},
{
"epoch": 0.25585023400936036,
"grad_norm": 0.5873180627822876,
"learning_rate": 8.2e-06,
"loss": 1.0403,
"step": 41
},
{
"epoch": 0.2620904836193448,
"grad_norm": 0.5466005802154541,
"learning_rate": 8.400000000000001e-06,
"loss": 1.0127,
"step": 42
},
{
"epoch": 0.26833073322932915,
"grad_norm": 0.5514444708824158,
"learning_rate": 8.6e-06,
"loss": 1.0399,
"step": 43
},
{
"epoch": 0.2745709828393136,
"grad_norm": 0.5304705500602722,
"learning_rate": 8.8e-06,
"loss": 1.0057,
"step": 44
},
{
"epoch": 0.28081123244929795,
"grad_norm": 0.5105130076408386,
"learning_rate": 9e-06,
"loss": 1.0174,
"step": 45
},
{
"epoch": 0.2870514820592824,
"grad_norm": 0.533640444278717,
"learning_rate": 9.200000000000002e-06,
"loss": 1.0342,
"step": 46
},
{
"epoch": 0.29329173166926675,
"grad_norm": 0.48208147287368774,
"learning_rate": 9.4e-06,
"loss": 1.0195,
"step": 47
},
{
"epoch": 0.2995319812792512,
"grad_norm": 0.5069381594657898,
"learning_rate": 9.600000000000001e-06,
"loss": 1.0382,
"step": 48
},
{
"epoch": 0.30577223088923555,
"grad_norm": 0.4819696843624115,
"learning_rate": 9.800000000000001e-06,
"loss": 1.05,
"step": 49
},
{
"epoch": 0.31201248049922,
"grad_norm": 0.5414313673973083,
"learning_rate": 1e-05,
"loss": 1.0245,
"step": 50
},
{
"epoch": 0.31825273010920435,
"grad_norm": 0.4769354462623596,
"learning_rate": 1.02e-05,
"loss": 1.005,
"step": 51
},
{
"epoch": 0.3244929797191888,
"grad_norm": 0.5051629543304443,
"learning_rate": 1.04e-05,
"loss": 1.0158,
"step": 52
},
{
"epoch": 0.33073322932917315,
"grad_norm": 0.5432644486427307,
"learning_rate": 1.0600000000000002e-05,
"loss": 1.0122,
"step": 53
},
{
"epoch": 0.3369734789391576,
"grad_norm": 0.4705195128917694,
"learning_rate": 1.0800000000000002e-05,
"loss": 1.0053,
"step": 54
},
{
"epoch": 0.34321372854914195,
"grad_norm": 0.5468801856040955,
"learning_rate": 1.1000000000000001e-05,
"loss": 1.0173,
"step": 55
},
{
"epoch": 0.3494539781591264,
"grad_norm": 0.6218928694725037,
"learning_rate": 1.1200000000000001e-05,
"loss": 0.9974,
"step": 56
},
{
"epoch": 0.35569422776911075,
"grad_norm": 0.532873272895813,
"learning_rate": 1.14e-05,
"loss": 1.006,
"step": 57
},
{
"epoch": 0.3619344773790952,
"grad_norm": 0.48144450783729553,
"learning_rate": 1.16e-05,
"loss": 1.0025,
"step": 58
},
{
"epoch": 0.36817472698907955,
"grad_norm": 0.5385976433753967,
"learning_rate": 1.18e-05,
"loss": 0.9976,
"step": 59
},
{
"epoch": 0.374414976599064,
"grad_norm": 0.5179689526557922,
"learning_rate": 1.2e-05,
"loss": 0.9988,
"step": 60
},
{
"epoch": 0.38065522620904835,
"grad_norm": 0.4646259844303131,
"learning_rate": 1.22e-05,
"loss": 0.9959,
"step": 61
},
{
"epoch": 0.3868954758190328,
"grad_norm": 0.5259431004524231,
"learning_rate": 1.2400000000000002e-05,
"loss": 0.9933,
"step": 62
},
{
"epoch": 0.39313572542901715,
"grad_norm": 0.5602505803108215,
"learning_rate": 1.2600000000000001e-05,
"loss": 0.9732,
"step": 63
},
{
"epoch": 0.3993759750390016,
"grad_norm": 0.5400233864784241,
"learning_rate": 1.2800000000000001e-05,
"loss": 1.0021,
"step": 64
},
{
"epoch": 0.40561622464898595,
"grad_norm": 0.5008605718612671,
"learning_rate": 1.3000000000000001e-05,
"loss": 1.0098,
"step": 65
},
{
"epoch": 0.4118564742589704,
"grad_norm": 0.5215092301368713,
"learning_rate": 1.3200000000000002e-05,
"loss": 0.9805,
"step": 66
},
{
"epoch": 0.41809672386895474,
"grad_norm": 0.6043874025344849,
"learning_rate": 1.3400000000000002e-05,
"loss": 0.9651,
"step": 67
},
{
"epoch": 0.4243369734789392,
"grad_norm": 0.5744293928146362,
"learning_rate": 1.3600000000000002e-05,
"loss": 0.9715,
"step": 68
},
{
"epoch": 0.43057722308892354,
"grad_norm": 0.6228943467140198,
"learning_rate": 1.38e-05,
"loss": 0.9942,
"step": 69
},
{
"epoch": 0.43681747269890797,
"grad_norm": 0.6340550780296326,
"learning_rate": 1.4e-05,
"loss": 1.0278,
"step": 70
},
{
"epoch": 0.44305772230889234,
"grad_norm": 0.6537193655967712,
"learning_rate": 1.4200000000000001e-05,
"loss": 1.005,
"step": 71
},
{
"epoch": 0.44929797191887677,
"grad_norm": 0.6706846356391907,
"learning_rate": 1.4400000000000001e-05,
"loss": 0.9736,
"step": 72
},
{
"epoch": 0.45553822152886114,
"grad_norm": 0.5686175227165222,
"learning_rate": 1.46e-05,
"loss": 0.9753,
"step": 73
},
{
"epoch": 0.46177847113884557,
"grad_norm": 0.5182248950004578,
"learning_rate": 1.48e-05,
"loss": 0.9964,
"step": 74
},
{
"epoch": 0.46801872074882994,
"grad_norm": 0.5445067286491394,
"learning_rate": 1.5000000000000002e-05,
"loss": 0.9702,
"step": 75
},
{
"epoch": 0.47425897035881437,
"grad_norm": 0.6168459057807922,
"learning_rate": 1.5200000000000002e-05,
"loss": 0.9791,
"step": 76
},
{
"epoch": 0.48049921996879874,
"grad_norm": 0.6475315093994141,
"learning_rate": 1.54e-05,
"loss": 0.98,
"step": 77
},
{
"epoch": 0.48673946957878317,
"grad_norm": 0.8365716934204102,
"learning_rate": 1.5600000000000003e-05,
"loss": 0.987,
"step": 78
},
{
"epoch": 0.49297971918876754,
"grad_norm": 1.0882554054260254,
"learning_rate": 1.58e-05,
"loss": 0.9796,
"step": 79
},
{
"epoch": 0.49921996879875197,
"grad_norm": 1.109529972076416,
"learning_rate": 1.6000000000000003e-05,
"loss": 0.9825,
"step": 80
},
{
"epoch": 0.49921996879875197,
"eval_loss": 0.9798125624656677,
"eval_runtime": 132.8615,
"eval_samples_per_second": 101.993,
"eval_steps_per_second": 6.375,
"step": 80
},
{
"epoch": 0.5054602184087363,
"grad_norm": 0.9999867677688599,
"learning_rate": 1.62e-05,
"loss": 0.9591,
"step": 81
},
{
"epoch": 0.5117004680187207,
"grad_norm": 0.8042426705360413,
"learning_rate": 1.64e-05,
"loss": 0.9832,
"step": 82
},
{
"epoch": 0.5179407176287052,
"grad_norm": 0.5843170881271362,
"learning_rate": 1.66e-05,
"loss": 0.9769,
"step": 83
},
{
"epoch": 0.5241809672386896,
"grad_norm": 0.6988096237182617,
"learning_rate": 1.6800000000000002e-05,
"loss": 0.9556,
"step": 84
},
{
"epoch": 0.5304212168486739,
"grad_norm": 0.7298963665962219,
"learning_rate": 1.7e-05,
"loss": 0.983,
"step": 85
},
{
"epoch": 0.5366614664586583,
"grad_norm": 0.7856214046478271,
"learning_rate": 1.72e-05,
"loss": 0.9541,
"step": 86
},
{
"epoch": 0.5429017160686428,
"grad_norm": 0.6896259188652039,
"learning_rate": 1.7400000000000003e-05,
"loss": 0.983,
"step": 87
},
{
"epoch": 0.5491419656786272,
"grad_norm": 0.5555576682090759,
"learning_rate": 1.76e-05,
"loss": 0.9395,
"step": 88
},
{
"epoch": 0.5553822152886115,
"grad_norm": 0.6107622385025024,
"learning_rate": 1.7800000000000002e-05,
"loss": 0.9601,
"step": 89
},
{
"epoch": 0.5616224648985959,
"grad_norm": 0.7116836309432983,
"learning_rate": 1.8e-05,
"loss": 0.9647,
"step": 90
},
{
"epoch": 0.5678627145085804,
"grad_norm": 0.5782715082168579,
"learning_rate": 1.8200000000000002e-05,
"loss": 0.9604,
"step": 91
},
{
"epoch": 0.5741029641185648,
"grad_norm": 0.512100338935852,
"learning_rate": 1.8400000000000003e-05,
"loss": 0.9433,
"step": 92
},
{
"epoch": 0.5803432137285491,
"grad_norm": 0.6315212249755859,
"learning_rate": 1.86e-05,
"loss": 0.969,
"step": 93
},
{
"epoch": 0.5865834633385335,
"grad_norm": 0.6883739233016968,
"learning_rate": 1.88e-05,
"loss": 0.9832,
"step": 94
},
{
"epoch": 0.592823712948518,
"grad_norm": 0.6760767102241516,
"learning_rate": 1.9e-05,
"loss": 0.9414,
"step": 95
},
{
"epoch": 0.5990639625585024,
"grad_norm": 0.6577237248420715,
"learning_rate": 1.9200000000000003e-05,
"loss": 0.9748,
"step": 96
},
{
"epoch": 0.6053042121684867,
"grad_norm": 0.9515029788017273,
"learning_rate": 1.94e-05,
"loss": 0.9863,
"step": 97
},
{
"epoch": 0.6115444617784711,
"grad_norm": 1.40570068359375,
"learning_rate": 1.9600000000000002e-05,
"loss": 0.9445,
"step": 98
},
{
"epoch": 0.6177847113884556,
"grad_norm": 0.9026833176612854,
"learning_rate": 1.98e-05,
"loss": 0.9436,
"step": 99
},
{
"epoch": 0.62402496099844,
"grad_norm": 0.6666714549064636,
"learning_rate": 2e-05,
"loss": 0.9832,
"step": 100
},
{
"epoch": 0.6302652106084243,
"grad_norm": 0.8047837018966675,
"learning_rate": 1.9998980430094333e-05,
"loss": 0.9498,
"step": 101
},
{
"epoch": 0.6365054602184087,
"grad_norm": 0.9035269618034363,
"learning_rate": 1.9995921928281893e-05,
"loss": 0.9541,
"step": 102
},
{
"epoch": 0.6427457098283932,
"grad_norm": 1.027601718902588,
"learning_rate": 1.9990825118233958e-05,
"loss": 0.9786,
"step": 103
},
{
"epoch": 0.6489859594383776,
"grad_norm": 1.1459457874298096,
"learning_rate": 1.9983691039261358e-05,
"loss": 0.9482,
"step": 104
},
{
"epoch": 0.6552262090483619,
"grad_norm": 0.7179874777793884,
"learning_rate": 1.9974521146102535e-05,
"loss": 0.9743,
"step": 105
},
{
"epoch": 0.6614664586583463,
"grad_norm": 0.6881632208824158,
"learning_rate": 1.9963317308626916e-05,
"loss": 0.9797,
"step": 106
},
{
"epoch": 0.6677067082683308,
"grad_norm": 0.7822304368019104,
"learning_rate": 1.9950081811453598e-05,
"loss": 0.9682,
"step": 107
},
{
"epoch": 0.6739469578783152,
"grad_norm": 0.8269001841545105,
"learning_rate": 1.99348173534855e-05,
"loss": 0.9455,
"step": 108
},
{
"epoch": 0.6801872074882995,
"grad_norm": 0.8077254295349121,
"learning_rate": 1.991752704735903e-05,
"loss": 0.9243,
"step": 109
},
{
"epoch": 0.6864274570982839,
"grad_norm": 0.8119699954986572,
"learning_rate": 1.989821441880933e-05,
"loss": 0.9273,
"step": 110
},
{
"epoch": 0.6926677067082684,
"grad_norm": 0.8220670223236084,
"learning_rate": 1.9876883405951378e-05,
"loss": 0.9455,
"step": 111
},
{
"epoch": 0.6989079563182528,
"grad_norm": 0.8622007966041565,
"learning_rate": 1.9853538358476933e-05,
"loss": 0.9624,
"step": 112
},
{
"epoch": 0.7051482059282371,
"grad_norm": 0.8222960233688354,
"learning_rate": 1.9828184036767556e-05,
"loss": 0.955,
"step": 113
},
{
"epoch": 0.7113884555382215,
"grad_norm": 0.62811678647995,
"learning_rate": 1.9800825610923937e-05,
"loss": 0.9551,
"step": 114
},
{
"epoch": 0.717628705148206,
"grad_norm": 0.7614508271217346,
"learning_rate": 1.9771468659711595e-05,
"loss": 0.9413,
"step": 115
},
{
"epoch": 0.7238689547581904,
"grad_norm": 0.6695716977119446,
"learning_rate": 1.9740119169423337e-05,
"loss": 0.9384,
"step": 116
},
{
"epoch": 0.7301092043681747,
"grad_norm": 0.5493482947349548,
"learning_rate": 1.9706783532658528e-05,
"loss": 0.9601,
"step": 117
},
{
"epoch": 0.7363494539781591,
"grad_norm": 0.7798200249671936,
"learning_rate": 1.9671468547019575e-05,
"loss": 0.9555,
"step": 118
},
{
"epoch": 0.7425897035881436,
"grad_norm": 0.8122205138206482,
"learning_rate": 1.963418141372579e-05,
"loss": 0.9351,
"step": 119
},
{
"epoch": 0.748829953198128,
"grad_norm": 0.6351688504219055,
"learning_rate": 1.9594929736144978e-05,
"loss": 0.9517,
"step": 120
},
{
"epoch": 0.7550702028081123,
"grad_norm": 0.8507185578346252,
"learning_rate": 1.955372151824297e-05,
"loss": 0.9482,
"step": 121
},
{
"epoch": 0.7613104524180967,
"grad_norm": 1.057692050933838,
"learning_rate": 1.9510565162951538e-05,
"loss": 0.9626,
"step": 122
},
{
"epoch": 0.7675507020280812,
"grad_norm": 0.789968729019165,
"learning_rate": 1.94654694704549e-05,
"loss": 0.9504,
"step": 123
},
{
"epoch": 0.7737909516380655,
"grad_norm": 0.8988214731216431,
"learning_rate": 1.941844363639525e-05,
"loss": 0.9339,
"step": 124
},
{
"epoch": 0.7800312012480499,
"grad_norm": 0.6798993945121765,
"learning_rate": 1.936949724999762e-05,
"loss": 0.9387,
"step": 125
},
{
"epoch": 0.7862714508580343,
"grad_norm": 0.7597091794013977,
"learning_rate": 1.9318640292114526e-05,
"loss": 0.9884,
"step": 126
},
{
"epoch": 0.7925117004680188,
"grad_norm": 0.9357583522796631,
"learning_rate": 1.9265883133190715e-05,
"loss": 0.9382,
"step": 127
},
{
"epoch": 0.7987519500780031,
"grad_norm": 0.8738594055175781,
"learning_rate": 1.92112365311485e-05,
"loss": 0.9482,
"step": 128
},
{
"epoch": 0.8049921996879875,
"grad_norm": 0.8523539900779724,
"learning_rate": 1.9154711629194062e-05,
"loss": 0.9299,
"step": 129
},
{
"epoch": 0.8112324492979719,
"grad_norm": 0.5781116485595703,
"learning_rate": 1.9096319953545186e-05,
"loss": 0.9636,
"step": 130
},
{
"epoch": 0.8174726989079563,
"grad_norm": 0.7737751007080078,
"learning_rate": 1.9036073411080917e-05,
"loss": 0.9482,
"step": 131
},
{
"epoch": 0.8237129485179407,
"grad_norm": 0.7203546762466431,
"learning_rate": 1.8973984286913584e-05,
"loss": 0.9298,
"step": 132
},
{
"epoch": 0.8299531981279251,
"grad_norm": 0.5875493288040161,
"learning_rate": 1.891006524188368e-05,
"loss": 0.9239,
"step": 133
},
{
"epoch": 0.8361934477379095,
"grad_norm": 0.7981539964675903,
"learning_rate": 1.8844329309978146e-05,
"loss": 0.9546,
"step": 134
},
{
"epoch": 0.8424336973478939,
"grad_norm": 0.7623902559280396,
"learning_rate": 1.8776789895672557e-05,
"loss": 0.9335,
"step": 135
},
{
"epoch": 0.8486739469578783,
"grad_norm": 0.6350914239883423,
"learning_rate": 1.8707460771197773e-05,
"loss": 0.9585,
"step": 136
},
{
"epoch": 0.8549141965678627,
"grad_norm": 0.6981391310691833,
"learning_rate": 1.863635607373157e-05,
"loss": 0.9271,
"step": 137
},
{
"epoch": 0.8611544461778471,
"grad_norm": 0.7900795936584473,
"learning_rate": 1.856349030251589e-05,
"loss": 0.9022,
"step": 138
},
{
"epoch": 0.8673946957878315,
"grad_norm": 0.7494855523109436,
"learning_rate": 1.8488878315900228e-05,
"loss": 0.9534,
"step": 139
},
{
"epoch": 0.8736349453978159,
"grad_norm": 0.5757277011871338,
"learning_rate": 1.8412535328311813e-05,
"loss": 0.9397,
"step": 140
},
{
"epoch": 0.8798751950078003,
"grad_norm": 0.6893640756607056,
"learning_rate": 1.8334476907153177e-05,
"loss": 0.952,
"step": 141
},
{
"epoch": 0.8861154446177847,
"grad_norm": 0.7050842046737671,
"learning_rate": 1.825471896962774e-05,
"loss": 0.9417,
"step": 142
},
{
"epoch": 0.8923556942277691,
"grad_norm": 0.5544989109039307,
"learning_rate": 1.817327777949407e-05,
"loss": 0.9008,
"step": 143
},
{
"epoch": 0.8985959438377535,
"grad_norm": 0.6469840407371521,
"learning_rate": 1.8090169943749477e-05,
"loss": 0.9471,
"step": 144
},
{
"epoch": 0.9048361934477379,
"grad_norm": 0.6894209384918213,
"learning_rate": 1.8005412409243604e-05,
"loss": 0.9553,
"step": 145
},
{
"epoch": 0.9110764430577223,
"grad_norm": 0.5356501936912537,
"learning_rate": 1.7919022459222754e-05,
"loss": 0.9496,
"step": 146
},
{
"epoch": 0.9173166926677067,
"grad_norm": 0.6416233777999878,
"learning_rate": 1.7831017709805555e-05,
"loss": 0.9558,
"step": 147
},
{
"epoch": 0.9235569422776911,
"grad_norm": 0.7085059881210327,
"learning_rate": 1.7741416106390828e-05,
"loss": 0.9168,
"step": 148
},
{
"epoch": 0.9297971918876755,
"grad_norm": 0.6492967009544373,
"learning_rate": 1.7650235919998234e-05,
"loss": 0.9065,
"step": 149
},
{
"epoch": 0.9360374414976599,
"grad_norm": 0.7753322124481201,
"learning_rate": 1.7557495743542586e-05,
"loss": 0.9285,
"step": 150
},
{
"epoch": 0.9422776911076443,
"grad_norm": 0.6451005935668945,
"learning_rate": 1.7463214488042472e-05,
"loss": 0.9567,
"step": 151
},
{
"epoch": 0.9485179407176287,
"grad_norm": 0.4824198782444,
"learning_rate": 1.736741137876405e-05,
"loss": 0.905,
"step": 152
},
{
"epoch": 0.9547581903276131,
"grad_norm": 0.5846424102783203,
"learning_rate": 1.727010595130074e-05,
"loss": 0.9426,
"step": 153
},
{
"epoch": 0.9609984399375975,
"grad_norm": 0.5984904170036316,
"learning_rate": 1.7171318047589637e-05,
"loss": 0.9398,
"step": 154
},
{
"epoch": 0.9672386895475819,
"grad_norm": 0.545465886592865,
"learning_rate": 1.7071067811865477e-05,
"loss": 0.9185,
"step": 155
},
{
"epoch": 0.9734789391575663,
"grad_norm": 0.5724261403083801,
"learning_rate": 1.696937568655294e-05,
"loss": 0.9222,
"step": 156
},
{
"epoch": 0.9797191887675507,
"grad_norm": 0.5120018124580383,
"learning_rate": 1.6866262408098134e-05,
"loss": 0.93,
"step": 157
},
{
"epoch": 0.9859594383775351,
"grad_norm": 0.5575640797615051,
"learning_rate": 1.6761749002740195e-05,
"loss": 0.9483,
"step": 158
},
{
"epoch": 0.9921996879875195,
"grad_norm": 0.603184163570404,
"learning_rate": 1.6655856782223682e-05,
"loss": 0.9394,
"step": 159
},
{
"epoch": 0.9984399375975039,
"grad_norm": 0.536756157875061,
"learning_rate": 1.6548607339452853e-05,
"loss": 0.9227,
"step": 160
},
{
"epoch": 0.9984399375975039,
"eval_loss": 0.9286661744117737,
"eval_runtime": 134.8358,
"eval_samples_per_second": 100.5,
"eval_steps_per_second": 6.282,
"step": 160
},
{
"epoch": 1.0046801872074882,
"grad_norm": 0.5680767297744751,
"learning_rate": 1.6440022544088553e-05,
"loss": 0.9263,
"step": 161
},
{
"epoch": 1.0109204368174727,
"grad_norm": 0.6374309062957764,
"learning_rate": 1.6330124538088705e-05,
"loss": 0.942,
"step": 162
},
{
"epoch": 1.0171606864274572,
"grad_norm": 0.5749344825744629,
"learning_rate": 1.6218935731193223e-05,
"loss": 0.9264,
"step": 163
},
{
"epoch": 1.0046801872074882,
"grad_norm": 0.9867531657218933,
"learning_rate": 1.6106478796354382e-05,
"loss": 0.8513,
"step": 164
},
{
"epoch": 1.0109204368174727,
"grad_norm": 0.7155735492706299,
"learning_rate": 1.599277666511347e-05,
"loss": 0.8023,
"step": 165
},
{
"epoch": 1.0171606864274572,
"grad_norm": 1.3990559577941895,
"learning_rate": 1.5877852522924733e-05,
"loss": 0.7992,
"step": 166
},
{
"epoch": 1.0234009360374414,
"grad_norm": 0.9339443445205688,
"learning_rate": 1.576172980442753e-05,
"loss": 0.7982,
"step": 167
},
{
"epoch": 1.029641185647426,
"grad_norm": 0.8383383750915527,
"learning_rate": 1.5644432188667695e-05,
"loss": 0.7764,
"step": 168
},
{
"epoch": 1.0358814352574104,
"grad_norm": 0.8227719664573669,
"learning_rate": 1.5525983594269026e-05,
"loss": 0.7985,
"step": 169
},
{
"epoch": 1.0421216848673946,
"grad_norm": 0.8231950998306274,
"learning_rate": 1.5406408174555978e-05,
"loss": 0.8097,
"step": 170
},
{
"epoch": 1.0483619344773791,
"grad_norm": 1.1454997062683105,
"learning_rate": 1.528573031262842e-05,
"loss": 0.7779,
"step": 171
},
{
"epoch": 1.0546021840873634,
"grad_norm": 0.6939067840576172,
"learning_rate": 1.5163974616389621e-05,
"loss": 0.8217,
"step": 172
},
{
"epoch": 1.0608424336973479,
"grad_norm": 0.7526265978813171,
"learning_rate": 1.504116591352832e-05,
"loss": 0.7886,
"step": 173
},
{
"epoch": 1.0670826833073324,
"grad_norm": 0.7892379760742188,
"learning_rate": 1.491732924645604e-05,
"loss": 0.7959,
"step": 174
},
{
"epoch": 1.0733229329173166,
"grad_norm": 0.7279461026191711,
"learning_rate": 1.479248986720057e-05,
"loss": 0.7977,
"step": 175
},
{
"epoch": 1.079563182527301,
"grad_norm": 0.7360721230506897,
"learning_rate": 1.4666673232256738e-05,
"loss": 0.7883,
"step": 176
},
{
"epoch": 1.0858034321372856,
"grad_norm": 0.6525989174842834,
"learning_rate": 1.4539904997395468e-05,
"loss": 0.785,
"step": 177
},
{
"epoch": 1.0920436817472698,
"grad_norm": 0.7803720235824585,
"learning_rate": 1.4412211012432213e-05,
"loss": 0.7998,
"step": 178
},
{
"epoch": 1.0982839313572543,
"grad_norm": 0.6588256359100342,
"learning_rate": 1.4283617315955815e-05,
"loss": 0.7999,
"step": 179
},
{
"epoch": 1.1045241809672386,
"grad_norm": 0.5983767509460449,
"learning_rate": 1.4154150130018867e-05,
"loss": 0.7848,
"step": 180
},
{
"epoch": 1.110764430577223,
"grad_norm": 0.641603946685791,
"learning_rate": 1.4023835854790682e-05,
"loss": 0.7937,
"step": 181
},
{
"epoch": 1.1170046801872076,
"grad_norm": 0.6453792452812195,
"learning_rate": 1.3892701063173917e-05,
"loss": 0.8004,
"step": 182
},
{
"epoch": 1.1232449297971918,
"grad_norm": 0.6428067088127136,
"learning_rate": 1.3760772495385998e-05,
"loss": 0.792,
"step": 183
},
{
"epoch": 1.1294851794071763,
"grad_norm": 0.6279442310333252,
"learning_rate": 1.362807705350641e-05,
"loss": 0.7859,
"step": 184
},
{
"epoch": 1.1357254290171608,
"grad_norm": 0.6000891327857971,
"learning_rate": 1.3494641795990986e-05,
"loss": 0.8039,
"step": 185
},
{
"epoch": 1.141965678627145,
"grad_norm": 0.6628398895263672,
"learning_rate": 1.3360493932154301e-05,
"loss": 0.7829,
"step": 186
},
{
"epoch": 1.1482059282371295,
"grad_norm": 0.6268762946128845,
"learning_rate": 1.3225660816621342e-05,
"loss": 0.778,
"step": 187
},
{
"epoch": 1.154446177847114,
"grad_norm": 0.639062225818634,
"learning_rate": 1.3090169943749475e-05,
"loss": 0.7796,
"step": 188
},
{
"epoch": 1.1606864274570983,
"grad_norm": 0.6048714518547058,
"learning_rate": 1.2954048942022002e-05,
"loss": 0.7883,
"step": 189
},
{
"epoch": 1.1669266770670828,
"grad_norm": 0.5929723381996155,
"learning_rate": 1.2817325568414299e-05,
"loss": 0.7736,
"step": 190
},
{
"epoch": 1.173166926677067,
"grad_norm": 0.5971985459327698,
"learning_rate": 1.2680027702733791e-05,
"loss": 0.8008,
"step": 191
},
{
"epoch": 1.1794071762870515,
"grad_norm": 0.6460970044136047,
"learning_rate": 1.2542183341934873e-05,
"loss": 0.7867,
"step": 192
},
{
"epoch": 1.185647425897036,
"grad_norm": 0.5345771908760071,
"learning_rate": 1.2403820594409926e-05,
"loss": 0.7808,
"step": 193
},
{
"epoch": 1.1918876755070202,
"grad_norm": 0.6704164743423462,
"learning_rate": 1.2264967674257647e-05,
"loss": 0.7785,
"step": 194
},
{
"epoch": 1.1981279251170047,
"grad_norm": 0.5182461738586426,
"learning_rate": 1.2125652895529766e-05,
"loss": 0.7907,
"step": 195
},
{
"epoch": 1.204368174726989,
"grad_norm": 0.6421562433242798,
"learning_rate": 1.1985904666457455e-05,
"loss": 0.7914,
"step": 196
},
{
"epoch": 1.2106084243369735,
"grad_norm": 0.5846896171569824,
"learning_rate": 1.1845751483658454e-05,
"loss": 0.7631,
"step": 197
},
{
"epoch": 1.216848673946958,
"grad_norm": 0.5582466721534729,
"learning_rate": 1.170522192632624e-05,
"loss": 0.7912,
"step": 198
},
{
"epoch": 1.2230889235569422,
"grad_norm": 0.5527791976928711,
"learning_rate": 1.156434465040231e-05,
"loss": 0.7938,
"step": 199
},
{
"epoch": 1.2293291731669267,
"grad_norm": 0.5673221945762634,
"learning_rate": 1.1423148382732854e-05,
"loss": 0.7947,
"step": 200
},
{
"epoch": 1.2355694227769112,
"grad_norm": 0.5078392028808594,
"learning_rate": 1.1281661915210931e-05,
"loss": 0.7771,
"step": 201
},
{
"epoch": 1.2418096723868954,
"grad_norm": 0.5475752353668213,
"learning_rate": 1.1139914098905406e-05,
"loss": 0.7781,
"step": 202
},
{
"epoch": 1.24804992199688,
"grad_norm": 0.5290600657463074,
"learning_rate": 1.0997933838177828e-05,
"loss": 0.7622,
"step": 203
},
{
"epoch": 1.2542901716068644,
"grad_norm": 0.4957723915576935,
"learning_rate": 1.08557500847884e-05,
"loss": 0.7857,
"step": 204
},
{
"epoch": 1.2605304212168487,
"grad_norm": 0.5119233727455139,
"learning_rate": 1.0713391831992324e-05,
"loss": 0.7585,
"step": 205
},
{
"epoch": 1.2667706708268331,
"grad_norm": 0.5187195539474487,
"learning_rate": 1.0570888108627682e-05,
"loss": 0.7885,
"step": 206
},
{
"epoch": 1.2730109204368174,
"grad_norm": 0.5066515803337097,
"learning_rate": 1.0428267973196027e-05,
"loss": 0.7691,
"step": 207
},
{
"epoch": 1.2792511700468019,
"grad_norm": 0.48673221468925476,
"learning_rate": 1.0285560507936962e-05,
"loss": 0.7715,
"step": 208
},
{
"epoch": 1.2854914196567861,
"grad_norm": 0.5083721876144409,
"learning_rate": 1.0142794812897874e-05,
"loss": 0.7812,
"step": 209
},
{
"epoch": 1.2917316692667706,
"grad_norm": 0.5033391118049622,
"learning_rate": 1e-05,
"loss": 0.7756,
"step": 210
},
{
"epoch": 1.2979719188767551,
"grad_norm": 0.532008945941925,
"learning_rate": 9.85720518710213e-06,
"loss": 0.7898,
"step": 211
},
{
"epoch": 1.3042121684867394,
"grad_norm": 0.5123456716537476,
"learning_rate": 9.71443949206304e-06,
"loss": 0.7779,
"step": 212
},
{
"epoch": 1.3104524180967239,
"grad_norm": 0.48444995284080505,
"learning_rate": 9.571732026803978e-06,
"loss": 0.7598,
"step": 213
},
{
"epoch": 1.3166926677067083,
"grad_norm": 0.5265589356422424,
"learning_rate": 9.42911189137232e-06,
"loss": 0.783,
"step": 214
},
{
"epoch": 1.3229329173166926,
"grad_norm": 0.5039641261100769,
"learning_rate": 9.286608168007678e-06,
"loss": 0.7798,
"step": 215
},
{
"epoch": 1.329173166926677,
"grad_norm": 0.5092752575874329,
"learning_rate": 9.144249915211605e-06,
"loss": 0.7635,
"step": 216
},
{
"epoch": 1.3354134165366616,
"grad_norm": 0.5394583940505981,
"learning_rate": 9.002066161822174e-06,
"loss": 0.7999,
"step": 217
},
{
"epoch": 1.3416536661466458,
"grad_norm": 0.5039361119270325,
"learning_rate": 8.860085901094595e-06,
"loss": 0.7745,
"step": 218
},
{
"epoch": 1.3478939157566303,
"grad_norm": 0.5087692737579346,
"learning_rate": 8.718338084789074e-06,
"loss": 0.8093,
"step": 219
},
{
"epoch": 1.3541341653666148,
"grad_norm": 0.5368974208831787,
"learning_rate": 8.576851617267151e-06,
"loss": 0.7596,
"step": 220
},
{
"epoch": 1.360374414976599,
"grad_norm": 0.5123481750488281,
"learning_rate": 8.43565534959769e-06,
"loss": 0.7665,
"step": 221
},
{
"epoch": 1.3666146645865835,
"grad_norm": 0.5048606395721436,
"learning_rate": 8.294778073673762e-06,
"loss": 0.769,
"step": 222
},
{
"epoch": 1.3728549141965678,
"grad_norm": 0.5156130790710449,
"learning_rate": 8.154248516341547e-06,
"loss": 0.7989,
"step": 223
},
{
"epoch": 1.3790951638065523,
"grad_norm": 0.5228000283241272,
"learning_rate": 8.014095333542548e-06,
"loss": 0.7775,
"step": 224
},
{
"epoch": 1.3853354134165365,
"grad_norm": 0.5356248617172241,
"learning_rate": 7.874347104470234e-06,
"loss": 0.7629,
"step": 225
},
{
"epoch": 1.391575663026521,
"grad_norm": 0.49624764919281006,
"learning_rate": 7.735032325742355e-06,
"loss": 0.8026,
"step": 226
},
{
"epoch": 1.3978159126365055,
"grad_norm": 0.5164198279380798,
"learning_rate": 7.596179405590076e-06,
"loss": 0.7803,
"step": 227
},
{
"epoch": 1.4040561622464898,
"grad_norm": 0.47972792387008667,
"learning_rate": 7.4578166580651335e-06,
"loss": 0.7783,
"step": 228
},
{
"epoch": 1.4102964118564743,
"grad_norm": 0.4989663064479828,
"learning_rate": 7.319972297266215e-06,
"loss": 0.7728,
"step": 229
},
{
"epoch": 1.4165366614664587,
"grad_norm": 0.5109092593193054,
"learning_rate": 7.182674431585703e-06,
"loss": 0.7919,
"step": 230
},
{
"epoch": 1.422776911076443,
"grad_norm": 0.5069689750671387,
"learning_rate": 7.045951057978001e-06,
"loss": 0.8015,
"step": 231
},
{
"epoch": 1.4290171606864275,
"grad_norm": 0.5074580907821655,
"learning_rate": 6.909830056250527e-06,
"loss": 0.7844,
"step": 232
},
{
"epoch": 1.435257410296412,
"grad_norm": 0.47711381316185,
"learning_rate": 6.774339183378663e-06,
"loss": 0.7601,
"step": 233
},
{
"epoch": 1.4414976599063962,
"grad_norm": 0.4926273822784424,
"learning_rate": 6.639506067845698e-06,
"loss": 0.7904,
"step": 234
},
{
"epoch": 1.4477379095163807,
"grad_norm": 0.5004563927650452,
"learning_rate": 6.505358204009018e-06,
"loss": 0.7638,
"step": 235
},
{
"epoch": 1.4539781591263652,
"grad_norm": 0.5014521479606628,
"learning_rate": 6.3719229464935915e-06,
"loss": 0.794,
"step": 236
},
{
"epoch": 1.4602184087363494,
"grad_norm": 0.5181212425231934,
"learning_rate": 6.239227504614004e-06,
"loss": 0.7495,
"step": 237
},
{
"epoch": 1.466458658346334,
"grad_norm": 0.5317255258560181,
"learning_rate": 6.107298936826086e-06,
"loss": 0.7884,
"step": 238
},
{
"epoch": 1.4726989079563182,
"grad_norm": 0.5126049518585205,
"learning_rate": 5.9761641452093225e-06,
"loss": 0.7869,
"step": 239
},
{
"epoch": 1.4789391575663027,
"grad_norm": 0.5322765707969666,
"learning_rate": 5.845849869981137e-06,
"loss": 0.7712,
"step": 240
},
{
"epoch": 1.4789391575663027,
"eval_loss": 0.9110648036003113,
"eval_runtime": 134.5659,
"eval_samples_per_second": 100.702,
"eval_steps_per_second": 6.294,
"step": 240
},
{
"epoch": 1.485179407176287,
"grad_norm": 0.5006371140480042,
"learning_rate": 5.716382684044191e-06,
"loss": 0.7939,
"step": 241
},
{
"epoch": 1.4914196567862714,
"grad_norm": 0.5099849104881287,
"learning_rate": 5.587788987567785e-06,
"loss": 0.771,
"step": 242
},
{
"epoch": 1.497659906396256,
"grad_norm": 0.5397711396217346,
"learning_rate": 5.460095002604533e-06,
"loss": 0.7642,
"step": 243
},
{
"epoch": 1.5039001560062402,
"grad_norm": 0.47754916548728943,
"learning_rate": 5.333326767743263e-06,
"loss": 0.7848,
"step": 244
},
{
"epoch": 1.5101404056162246,
"grad_norm": 0.5114724636077881,
"learning_rate": 5.207510132799436e-06,
"loss": 0.7743,
"step": 245
},
{
"epoch": 1.5163806552262091,
"grad_norm": 0.4859448969364166,
"learning_rate": 5.082670753543961e-06,
"loss": 0.7748,
"step": 246
},
{
"epoch": 1.5226209048361934,
"grad_norm": 0.46158257126808167,
"learning_rate": 4.958834086471683e-06,
"loss": 0.7951,
"step": 247
},
{
"epoch": 1.5288611544461779,
"grad_norm": 0.48695865273475647,
"learning_rate": 4.836025383610382e-06,
"loss": 0.7968,
"step": 248
},
{
"epoch": 1.5351014040561624,
"grad_norm": 0.4924914240837097,
"learning_rate": 4.714269687371581e-06,
"loss": 0.792,
"step": 249
},
{
"epoch": 1.5413416536661466,
"grad_norm": 0.5044175982475281,
"learning_rate": 4.593591825444028e-06,
"loss": 0.781,
"step": 250
},
{
"epoch": 1.547581903276131,
"grad_norm": 0.4598456919193268,
"learning_rate": 4.474016405730973e-06,
"loss": 0.794,
"step": 251
},
{
"epoch": 1.5538221528861156,
"grad_norm": 0.48866939544677734,
"learning_rate": 4.355567811332311e-06,
"loss": 0.7853,
"step": 252
},
{
"epoch": 1.5600624024960998,
"grad_norm": 0.4878495931625366,
"learning_rate": 4.2382701955724724e-06,
"loss": 0.743,
"step": 253
},
{
"epoch": 1.566302652106084,
"grad_norm": 0.4770466387271881,
"learning_rate": 4.12214747707527e-06,
"loss": 0.7442,
"step": 254
},
{
"epoch": 1.5725429017160688,
"grad_norm": 0.4467732012271881,
"learning_rate": 4.007223334886531e-06,
"loss": 0.7611,
"step": 255
},
{
"epoch": 1.578783151326053,
"grad_norm": 0.47116416692733765,
"learning_rate": 3.893521203645618e-06,
"loss": 0.7921,
"step": 256
},
{
"epoch": 1.5850234009360373,
"grad_norm": 0.468517005443573,
"learning_rate": 3.78106426880678e-06,
"loss": 0.7811,
"step": 257
},
{
"epoch": 1.5912636505460218,
"grad_norm": 0.46981289982795715,
"learning_rate": 3.6698754619112974e-06,
"loss": 0.7756,
"step": 258
},
{
"epoch": 1.5975039001560063,
"grad_norm": 0.45571863651275635,
"learning_rate": 3.5599774559114475e-06,
"loss": 0.7469,
"step": 259
},
{
"epoch": 1.6037441497659906,
"grad_norm": 0.4486157298088074,
"learning_rate": 3.4513926605471504e-06,
"loss": 0.7566,
"step": 260
},
{
"epoch": 1.609984399375975,
"grad_norm": 0.47735777497291565,
"learning_rate": 3.344143217776319e-06,
"loss": 0.7753,
"step": 261
},
{
"epoch": 1.6162246489859595,
"grad_norm": 0.4546492099761963,
"learning_rate": 3.2382509972598087e-06,
"loss": 0.7741,
"step": 262
},
{
"epoch": 1.6224648985959438,
"grad_norm": 0.4690036177635193,
"learning_rate": 3.133737591901864e-06,
"loss": 0.7693,
"step": 263
},
{
"epoch": 1.6287051482059283,
"grad_norm": 0.4607780873775482,
"learning_rate": 3.0306243134470668e-06,
"loss": 0.7312,
"step": 264
},
{
"epoch": 1.6349453978159127,
"grad_norm": 0.4510229825973511,
"learning_rate": 2.9289321881345257e-06,
"loss": 0.7705,
"step": 265
},
{
"epoch": 1.641185647425897,
"grad_norm": 0.43797171115875244,
"learning_rate": 2.8286819524103657e-06,
"loss": 0.7666,
"step": 266
},
{
"epoch": 1.6474258970358813,
"grad_norm": 0.45275524258613586,
"learning_rate": 2.7298940486992654e-06,
"loss": 0.8028,
"step": 267
},
{
"epoch": 1.653666146645866,
"grad_norm": 0.4479183852672577,
"learning_rate": 2.6325886212359496e-06,
"loss": 0.7647,
"step": 268
},
{
"epoch": 1.6599063962558502,
"grad_norm": 0.4575185775756836,
"learning_rate": 2.5367855119575314e-06,
"loss": 0.7731,
"step": 269
},
{
"epoch": 1.6661466458658345,
"grad_norm": 0.45092037320137024,
"learning_rate": 2.4425042564574186e-06,
"loss": 0.7895,
"step": 270
},
{
"epoch": 1.672386895475819,
"grad_norm": 0.4297903776168823,
"learning_rate": 2.3497640800017687e-06,
"loss": 0.7597,
"step": 271
},
{
"epoch": 1.6786271450858035,
"grad_norm": 0.43731772899627686,
"learning_rate": 2.2585838936091753e-06,
"loss": 0.7775,
"step": 272
},
{
"epoch": 1.6848673946957877,
"grad_norm": 0.4288908541202545,
"learning_rate": 2.1689822901944456e-06,
"loss": 0.787,
"step": 273
},
{
"epoch": 1.6911076443057722,
"grad_norm": 0.42555147409439087,
"learning_rate": 2.0809775407772505e-06,
"loss": 0.7751,
"step": 274
},
{
"epoch": 1.6973478939157567,
"grad_norm": 0.45276904106140137,
"learning_rate": 1.994587590756397e-06,
"loss": 0.7893,
"step": 275
},
{
"epoch": 1.703588143525741,
"grad_norm": 0.44294846057891846,
"learning_rate": 1.9098300562505266e-06,
"loss": 0.7794,
"step": 276
},
{
"epoch": 1.7098283931357254,
"grad_norm": 0.4307778775691986,
"learning_rate": 1.826722220505931e-06,
"loss": 0.7791,
"step": 277
},
{
"epoch": 1.71606864274571,
"grad_norm": 0.43245357275009155,
"learning_rate": 1.74528103037226e-06,
"loss": 0.7497,
"step": 278
},
{
"epoch": 1.7223088923556942,
"grad_norm": 0.4251644015312195,
"learning_rate": 1.6655230928468257e-06,
"loss": 0.7743,
"step": 279
},
{
"epoch": 1.7285491419656787,
"grad_norm": 0.4321819543838501,
"learning_rate": 1.587464671688187e-06,
"loss": 0.7522,
"step": 280
},
{
"epoch": 1.7347893915756631,
"grad_norm": 0.4292062222957611,
"learning_rate": 1.5111216840997745e-06,
"loss": 0.7698,
"step": 281
},
{
"epoch": 1.7410296411856474,
"grad_norm": 0.45093026757240295,
"learning_rate": 1.436509697484111e-06,
"loss": 0.7647,
"step": 282
},
{
"epoch": 1.7472698907956317,
"grad_norm": 0.42376089096069336,
"learning_rate": 1.3636439262684299e-06,
"loss": 0.7556,
"step": 283
},
{
"epoch": 1.7535101404056164,
"grad_norm": 0.4337958097457886,
"learning_rate": 1.2925392288022299e-06,
"loss": 0.7756,
"step": 284
},
{
"epoch": 1.7597503900156006,
"grad_norm": 0.4319595992565155,
"learning_rate": 1.2232101043274437e-06,
"loss": 0.7816,
"step": 285
},
{
"epoch": 1.765990639625585,
"grad_norm": 0.4335976243019104,
"learning_rate": 1.1556706900218572e-06,
"loss": 0.7902,
"step": 286
},
{
"epoch": 1.7722308892355694,
"grad_norm": 0.4158109724521637,
"learning_rate": 1.0899347581163222e-06,
"loss": 0.7396,
"step": 287
},
{
"epoch": 1.7784711388455539,
"grad_norm": 0.42235442996025085,
"learning_rate": 1.0260157130864178e-06,
"loss": 0.7747,
"step": 288
},
{
"epoch": 1.7847113884555381,
"grad_norm": 0.43022453784942627,
"learning_rate": 9.63926588919083e-07,
"loss": 0.7616,
"step": 289
},
{
"epoch": 1.7909516380655226,
"grad_norm": 0.42115500569343567,
"learning_rate": 9.036800464548157e-07,
"loss": 0.7609,
"step": 290
},
{
"epoch": 1.797191887675507,
"grad_norm": 0.43961915373802185,
"learning_rate": 8.4528837080594e-07,
"loss": 0.7665,
"step": 291
},
{
"epoch": 1.8034321372854913,
"grad_norm": 0.42159175872802734,
"learning_rate": 7.887634688515e-07,
"loss": 0.7684,
"step": 292
},
{
"epoch": 1.8096723868954758,
"grad_norm": 0.4211348295211792,
"learning_rate": 7.341168668092857e-07,
"loss": 0.7701,
"step": 293
},
{
"epoch": 1.8159126365054603,
"grad_norm": 0.447083055973053,
"learning_rate": 6.813597078854772e-07,
"loss": 0.8051,
"step": 294
},
{
"epoch": 1.8221528861154446,
"grad_norm": 0.42439502477645874,
"learning_rate": 6.305027500023841e-07,
"loss": 0.7794,
"step": 295
},
{
"epoch": 1.828393135725429,
"grad_norm": 0.43265220522880554,
"learning_rate": 5.815563636047539e-07,
"loss": 0.7722,
"step": 296
},
{
"epoch": 1.8346333853354135,
"grad_norm": 0.4244794249534607,
"learning_rate": 5.345305295450997e-07,
"loss": 0.7638,
"step": 297
},
{
"epoch": 1.8408736349453978,
"grad_norm": 0.41255486011505127,
"learning_rate": 4.894348370484648e-07,
"loss": 0.772,
"step": 298
},
{
"epoch": 1.847113884555382,
"grad_norm": 0.4242880344390869,
"learning_rate": 4.4627848175703315e-07,
"loss": 0.7643,
"step": 299
},
{
"epoch": 1.8533541341653668,
"grad_norm": 0.41673314571380615,
"learning_rate": 4.0507026385502747e-07,
"loss": 0.7601,
"step": 300
},
{
"epoch": 1.859594383775351,
"grad_norm": 0.42518967390060425,
"learning_rate": 3.658185862742103e-07,
"loss": 0.7699,
"step": 301
},
{
"epoch": 1.8658346333853353,
"grad_norm": 0.42029449343681335,
"learning_rate": 3.2853145298042954e-07,
"loss": 0.7498,
"step": 302
},
{
"epoch": 1.8720748829953198,
"grad_norm": 0.4201337695121765,
"learning_rate": 2.93216467341475e-07,
"loss": 0.7626,
"step": 303
},
{
"epoch": 1.8783151326053042,
"grad_norm": 0.42505332827568054,
"learning_rate": 2.5988083057666534e-07,
"loss": 0.774,
"step": 304
},
{
"epoch": 1.8845553822152885,
"grad_norm": 0.41834697127342224,
"learning_rate": 2.2853134028840594e-07,
"loss": 0.7638,
"step": 305
},
{
"epoch": 1.890795631825273,
"grad_norm": 0.4278232455253601,
"learning_rate": 1.9917438907606556e-07,
"loss": 0.7906,
"step": 306
},
{
"epoch": 1.8970358814352575,
"grad_norm": 0.429078608751297,
"learning_rate": 1.7181596323244453e-07,
"loss": 0.7839,
"step": 307
},
{
"epoch": 1.9032761310452417,
"grad_norm": 0.4142579138278961,
"learning_rate": 1.464616415230702e-07,
"loss": 0.7687,
"step": 308
},
{
"epoch": 1.9095163806552262,
"grad_norm": 0.40971171855926514,
"learning_rate": 1.231165940486234e-07,
"loss": 0.7647,
"step": 309
},
{
"epoch": 1.9157566302652107,
"grad_norm": 0.4336109161376953,
"learning_rate": 1.0178558119067316e-07,
"loss": 0.7691,
"step": 310
},
{
"epoch": 1.921996879875195,
"grad_norm": 0.40623047947883606,
"learning_rate": 8.247295264097288e-08,
"loss": 0.7728,
"step": 311
},
{
"epoch": 1.9282371294851794,
"grad_norm": 0.4205041527748108,
"learning_rate": 6.51826465144978e-08,
"loss": 0.7533,
"step": 312
},
{
"epoch": 1.934477379095164,
"grad_norm": 0.416535347700119,
"learning_rate": 4.991818854640396e-08,
"loss": 0.7826,
"step": 313
},
{
"epoch": 1.9407176287051482,
"grad_norm": 0.41483184695243835,
"learning_rate": 3.668269137308666e-08,
"loss": 0.7688,
"step": 314
},
{
"epoch": 1.9469578783151325,
"grad_norm": 0.4072718322277069,
"learning_rate": 2.547885389746485e-08,
"loss": 0.7943,
"step": 315
},
{
"epoch": 1.9531981279251172,
"grad_norm": 0.413289338350296,
"learning_rate": 1.630896073864352e-08,
"loss": 0.7867,
"step": 316
},
{
"epoch": 1.9594383775351014,
"grad_norm": 0.4177180528640747,
"learning_rate": 9.174881766043086e-09,
"loss": 0.781,
"step": 317
},
{
"epoch": 1.9656786271450857,
"grad_norm": 0.41807225346565247,
"learning_rate": 4.0780717181077015e-09,
"loss": 0.769,
"step": 318
},
{
"epoch": 1.9719188767550702,
"grad_norm": 0.41558825969696045,
"learning_rate": 1.019569905666984e-09,
"loss": 0.7504,
"step": 319
},
{
"epoch": 1.9781591263650546,
"grad_norm": 0.4160574674606323,
"learning_rate": 0.0,
"loss": 0.8025,
"step": 320
},
{
"epoch": 1.9781591263650546,
"eval_loss": 0.903252899646759,
"eval_runtime": 134.5566,
"eval_samples_per_second": 100.709,
"eval_steps_per_second": 6.295,
"step": 320
}
],
"logging_steps": 1,
"max_steps": 320,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 80,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.9476972312723456e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}