{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.0,
  "eval_steps": 500,
  "global_step": 141420,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.07071135624381275,
      "grad_norm": 1.3512808084487915,
      "learning_rate": 9.375e-06,
      "loss": 5.8359,
      "step": 1000
    },
    {
      "epoch": 0.1414227124876255,
      "grad_norm": 1.6514761447906494,
      "learning_rate": 1.875e-05,
      "loss": 3.8313,
      "step": 2000
    },
    {
      "epoch": 0.21213406873143828,
      "grad_norm": 1.4630589485168457,
      "learning_rate": 2.8125e-05,
      "loss": 3.4288,
      "step": 3000
    },
    {
      "epoch": 0.282845424975251,
      "grad_norm": 1.2321913242340088,
      "learning_rate": 3.75e-05,
      "loss": 3.202,
      "step": 4000
    },
    {
      "epoch": 0.3535567812190638,
      "grad_norm": 1.084225058555603,
      "learning_rate": 4.6874999999999994e-05,
      "loss": 3.0637,
      "step": 5000
    },
    {
      "epoch": 0.42426813746287656,
      "grad_norm": 0.9784078598022461,
      "learning_rate": 5.625e-05,
      "loss": 2.9404,
      "step": 6000
    },
    {
      "epoch": 0.4949794937066893,
      "grad_norm": 0.9349497556686401,
      "learning_rate": 6.5625e-05,
      "loss": 2.8482,
      "step": 7000
    },
    {
      "epoch": 0.565690849950502,
      "grad_norm": 0.7968702912330627,
      "learning_rate": 7.5e-05,
      "loss": 2.7644,
      "step": 8000
    },
    {
      "epoch": 0.6364022061943148,
      "grad_norm": 0.7970491647720337,
      "learning_rate": 8.437499999999999e-05,
      "loss": 2.7035,
      "step": 9000
    },
    {
      "epoch": 0.7071135624381276,
      "grad_norm": 0.808237612247467,
      "learning_rate": 9.374999999999999e-05,
      "loss": 2.6559,
      "step": 10000
    },
    {
      "epoch": 0.7778249186819404,
      "grad_norm": 0.6886675953865051,
      "learning_rate": 0.00010312499999999999,
      "loss": 2.5948,
      "step": 11000
    },
    {
      "epoch": 0.8485362749257531,
      "grad_norm": 0.7730551362037659,
      "learning_rate": 0.0001125,
      "loss": 2.5558,
      "step": 12000
    },
    {
      "epoch": 0.9192476311695659,
      "grad_norm": 0.6982184052467346,
      "learning_rate": 0.000121865625,
      "loss": 2.5352,
      "step": 13000
    },
    {
      "epoch": 0.9899589874133786,
      "grad_norm": 0.666688084602356,
      "learning_rate": 0.000131240625,
      "loss": 2.5065,
      "step": 14000
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.49123160368602076,
      "eval_loss": 2.7265632152557373,
      "eval_runtime": 100.7986,
      "eval_samples_per_second": 464.967,
      "eval_steps_per_second": 7.272,
      "step": 14142
    },
    {
      "epoch": 1.0606703436571914,
      "grad_norm": 0.6360883116722107,
      "learning_rate": 0.00014060625,
      "loss": 2.449,
      "step": 15000
    },
    {
      "epoch": 1.131381699901004,
      "grad_norm": 0.659411609172821,
      "learning_rate": 0.00014998125,
      "loss": 2.4338,
      "step": 16000
    },
    {
      "epoch": 1.2020930561448169,
      "grad_norm": 0.6129982471466064,
      "learning_rate": 0.000159346875,
      "loss": 2.4172,
      "step": 17000
    },
    {
      "epoch": 1.2728044123886297,
      "grad_norm": 0.6230875849723816,
      "learning_rate": 0.00016872187499999999,
      "loss": 2.4099,
      "step": 18000
    },
    {
      "epoch": 1.3435157686324424,
      "grad_norm": 0.5862132906913757,
      "learning_rate": 0.00017808749999999999,
      "loss": 2.403,
      "step": 19000
    },
    {
      "epoch": 1.414227124876255,
      "grad_norm": 0.5603518486022949,
      "learning_rate": 0.00018746249999999998,
      "loss": 2.3744,
      "step": 20000
    },
    {
      "epoch": 1.4849384811200679,
      "grad_norm": 0.5778409242630005,
      "learning_rate": 0.00019682812499999998,
      "loss": 2.3861,
      "step": 21000
    },
    {
      "epoch": 1.5556498373638807,
      "grad_norm": 0.5504118800163269,
      "learning_rate": 0.00020620312499999998,
      "loss": 2.3661,
      "step": 22000
    },
    {
      "epoch": 1.6263611936076934,
      "grad_norm": 0.5299841165542603,
      "learning_rate": 0.00021556874999999998,
      "loss": 2.371,
      "step": 23000
    },
    {
      "epoch": 1.697072549851506,
      "grad_norm": 0.4794081747531891,
      "learning_rate": 0.00022494374999999998,
      "loss": 2.3571,
      "step": 24000
    },
    {
      "epoch": 1.7677839060953189,
      "grad_norm": 0.5159916877746582,
      "learning_rate": 0.00023430937499999997,
      "loss": 2.3483,
      "step": 25000
    },
    {
      "epoch": 1.8384952623391317,
      "grad_norm": 0.4809582233428955,
      "learning_rate": 0.00024368437499999997,
      "loss": 2.3403,
      "step": 26000
    },
    {
      "epoch": 1.9092066185829444,
      "grad_norm": 0.4775105118751526,
      "learning_rate": 0.00025305,
      "loss": 2.3318,
      "step": 27000
    },
    {
      "epoch": 1.979917974826757,
      "grad_norm": 0.46707651019096375,
      "learning_rate": 0.000262425,
      "loss": 2.3323,
      "step": 28000
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.5073636729232759,
      "eval_loss": 2.5641860961914062,
      "eval_runtime": 101.8267,
      "eval_samples_per_second": 460.272,
      "eval_steps_per_second": 7.199,
      "step": 28284
    },
    {
      "epoch": 2.05062933107057,
      "grad_norm": 0.4617663025856018,
      "learning_rate": 0.000271790625,
      "loss": 2.2778,
      "step": 29000
    },
    {
      "epoch": 2.1213406873143827,
      "grad_norm": 0.4646793007850647,
      "learning_rate": 0.000281165625,
      "loss": 2.2697,
      "step": 30000
    },
    {
      "epoch": 2.1920520435581956,
      "grad_norm": 0.4711323082447052,
      "learning_rate": 0.00029053124999999994,
      "loss": 2.2687,
      "step": 31000
    },
    {
      "epoch": 2.262763399802008,
      "grad_norm": 0.4392153322696686,
      "learning_rate": 0.000299896875,
      "loss": 2.263,
      "step": 32000
    },
    {
      "epoch": 2.333474756045821,
      "grad_norm": 0.4188326299190521,
      "learning_rate": 0.00029728842990312557,
      "loss": 2.2572,
      "step": 33000
    },
    {
      "epoch": 2.4041861122896337,
      "grad_norm": 0.4100128710269928,
      "learning_rate": 0.0002945467007859623,
      "loss": 2.2616,
      "step": 34000
    },
    {
      "epoch": 2.4748974685334466,
      "grad_norm": 0.3786410391330719,
      "learning_rate": 0.0002918049716687991,
      "loss": 2.2527,
      "step": 35000
    },
    {
      "epoch": 2.5456088247772595,
      "grad_norm": 0.40753409266471863,
      "learning_rate": 0.00028906598428075306,
      "loss": 2.2385,
      "step": 36000
    },
    {
      "epoch": 2.616320181021072,
      "grad_norm": 0.3649774193763733,
      "learning_rate": 0.000286326996892707,
      "loss": 2.2364,
      "step": 37000
    },
    {
      "epoch": 2.6870315372648848,
      "grad_norm": 0.35535377264022827,
      "learning_rate": 0.00028358526777554375,
      "loss": 2.2312,
      "step": 38000
    },
    {
      "epoch": 2.7577428935086976,
      "grad_norm": 0.3544033169746399,
      "learning_rate": 0.0002808462803874977,
      "loss": 2.2245,
      "step": 39000
    },
    {
      "epoch": 2.82845424975251,
      "grad_norm": 0.3812442719936371,
      "learning_rate": 0.00027810455127033444,
      "loss": 2.2202,
      "step": 40000
    },
    {
      "epoch": 2.899165605996323,
      "grad_norm": 0.3528529405593872,
      "learning_rate": 0.00027536282215317125,
      "loss": 2.2127,
      "step": 41000
    },
    {
      "epoch": 2.9698769622401358,
      "grad_norm": 0.36804184317588806,
      "learning_rate": 0.000272621093036008,
      "loss": 2.2158,
      "step": 42000
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.518365513011292,
      "eval_loss": 2.4669742584228516,
      "eval_runtime": 101.8636,
      "eval_samples_per_second": 460.105,
      "eval_steps_per_second": 7.196,
      "step": 42426
    },
    {
      "epoch": 3.0405883184839486,
      "grad_norm": 0.4022282063961029,
      "learning_rate": 0.00026988210564796194,
      "loss": 2.1561,
      "step": 43000
    },
    {
      "epoch": 3.1112996747277615,
      "grad_norm": 0.36605340242385864,
      "learning_rate": 0.00026714037653079875,
      "loss": 2.1226,
      "step": 44000
    },
    {
      "epoch": 3.182011030971574,
      "grad_norm": 0.34304675459861755,
      "learning_rate": 0.0002644013891427527,
      "loss": 2.1217,
      "step": 45000
    },
    {
      "epoch": 3.2527223872153868,
      "grad_norm": 0.35023054480552673,
      "learning_rate": 0.00026165966002558944,
      "loss": 2.1274,
      "step": 46000
    },
    {
      "epoch": 3.3234337434591996,
      "grad_norm": 0.3564451038837433,
      "learning_rate": 0.00025891793090842625,
      "loss": 2.1314,
      "step": 47000
    },
    {
      "epoch": 3.3941450997030125,
      "grad_norm": 0.3799506425857544,
      "learning_rate": 0.000256176201791263,
      "loss": 2.1193,
      "step": 48000
    },
    {
      "epoch": 3.464856455946825,
      "grad_norm": 0.3519129455089569,
      "learning_rate": 0.00025343721440321693,
      "loss": 2.1219,
      "step": 49000
    },
    {
      "epoch": 3.5355678121906378,
      "grad_norm": 0.3426956534385681,
      "learning_rate": 0.00025069548528605374,
      "loss": 2.1172,
      "step": 50000
    },
    {
      "epoch": 3.6062791684344506,
      "grad_norm": 0.3629501760005951,
      "learning_rate": 0.0002479564978980076,
      "loss": 2.1177,
      "step": 51000
    },
    {
      "epoch": 3.6769905246782635,
      "grad_norm": 0.3377913534641266,
      "learning_rate": 0.00024521476878084443,
      "loss": 2.1175,
      "step": 52000
    },
    {
      "epoch": 3.747701880922076,
      "grad_norm": 0.339662104845047,
      "learning_rate": 0.00024247303966368121,
      "loss": 2.1186,
      "step": 53000
    },
    {
      "epoch": 3.8184132371658888,
      "grad_norm": 0.34301745891571045,
      "learning_rate": 0.00023973405227563515,
      "loss": 2.1155,
      "step": 54000
    },
    {
      "epoch": 3.8891245934097016,
      "grad_norm": 0.3543142080307007,
      "learning_rate": 0.00023699232315847193,
      "loss": 2.1056,
      "step": 55000
    },
    {
      "epoch": 3.9598359496535145,
      "grad_norm": 0.3132970929145813,
      "learning_rate": 0.0002342533357704259,
      "loss": 2.1109,
      "step": 56000
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.5248612794140942,
      "eval_loss": 2.4178478717803955,
      "eval_runtime": 102.1506,
      "eval_samples_per_second": 458.813,
      "eval_steps_per_second": 7.176,
      "step": 56568
    },
    {
      "epoch": 4.030547305897327,
      "grad_norm": 0.3517058491706848,
      "learning_rate": 0.00023151160665326262,
      "loss": 2.055,
      "step": 57000
    },
    {
      "epoch": 4.10125866214114,
      "grad_norm": 0.35281458497047424,
      "learning_rate": 0.00022877261926521658,
      "loss": 2.0072,
      "step": 58000
    },
    {
      "epoch": 4.171970018384952,
      "grad_norm": 0.3426903188228607,
      "learning_rate": 0.00022603089014805336,
      "loss": 2.0172,
      "step": 59000
    },
    {
      "epoch": 4.2426813746287655,
      "grad_norm": 0.35942381620407104,
      "learning_rate": 0.0002232919027600073,
      "loss": 2.0161,
      "step": 60000
    },
    {
      "epoch": 4.313392730872578,
      "grad_norm": 0.36592814326286316,
      "learning_rate": 0.00022055017364284408,
      "loss": 2.0166,
      "step": 61000
    },
    {
      "epoch": 4.384104087116391,
      "grad_norm": 0.3915741741657257,
      "learning_rate": 0.000217811186254798,
      "loss": 2.0206,
      "step": 62000
    },
    {
      "epoch": 4.454815443360204,
      "grad_norm": 0.3634374439716339,
      "learning_rate": 0.0002150694571376348,
      "loss": 2.0227,
      "step": 63000
    },
    {
      "epoch": 4.525526799604016,
      "grad_norm": 0.38214239478111267,
      "learning_rate": 0.00021233046974958875,
      "loss": 2.0201,
      "step": 64000
    },
    {
      "epoch": 4.596238155847829,
      "grad_norm": 0.33417147397994995,
      "learning_rate": 0.00020958874063242548,
      "loss": 2.0242,
      "step": 65000
    },
    {
      "epoch": 4.666949512091642,
      "grad_norm": 0.3682495057582855,
      "learning_rate": 0.00020684975324437944,
      "loss": 2.0351,
      "step": 66000
    },
    {
      "epoch": 4.737660868335455,
      "grad_norm": 0.36803096532821655,
      "learning_rate": 0.00020410802412721622,
      "loss": 2.0257,
      "step": 67000
    },
    {
      "epoch": 4.8083722245792675,
      "grad_norm": 0.3615093529224396,
      "learning_rate": 0.00020136903673917015,
      "loss": 2.033,
      "step": 68000
    },
    {
      "epoch": 4.87908358082308,
      "grad_norm": 0.3575078845024109,
      "learning_rate": 0.00019862730762200694,
      "loss": 2.0305,
      "step": 69000
    },
    {
      "epoch": 4.949794937066893,
      "grad_norm": 0.34489330649375916,
      "learning_rate": 0.00019588832023396087,
      "loss": 2.0194,
      "step": 70000
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.5280184201411432,
      "eval_loss": 2.400068998336792,
      "eval_runtime": 102.0615,
      "eval_samples_per_second": 459.213,
      "eval_steps_per_second": 7.182,
      "step": 70710
    },
    {
      "epoch": 5.020506293310706,
      "grad_norm": 0.3643249273300171,
      "learning_rate": 0.00019314659111679765,
      "loss": 1.9864,
      "step": 71000
    },
    {
      "epoch": 5.091217649554518,
      "grad_norm": 0.4295579195022583,
      "learning_rate": 0.00019040760372875156,
      "loss": 1.9059,
      "step": 72000
    },
    {
      "epoch": 5.161929005798331,
      "grad_norm": 0.3856581151485443,
      "learning_rate": 0.00018766587461158834,
      "loss": 1.9127,
      "step": 73000
    },
    {
      "epoch": 5.232640362042144,
      "grad_norm": 0.39465850591659546,
      "learning_rate": 0.0001849268872235423,
      "loss": 1.918,
      "step": 74000
    },
    {
      "epoch": 5.303351718285957,
      "grad_norm": 0.37704503536224365,
      "learning_rate": 0.00018218515810637905,
      "loss": 1.9239,
      "step": 75000
    },
    {
      "epoch": 5.3740630745297695,
      "grad_norm": 0.3656931221485138,
      "learning_rate": 0.00017944342898921584,
      "loss": 1.9284,
      "step": 76000
    },
    {
      "epoch": 5.444774430773582,
      "grad_norm": 0.3781563341617584,
      "learning_rate": 0.0001767044416011698,
      "loss": 1.9319,
      "step": 77000
    },
    {
      "epoch": 5.515485787017395,
      "grad_norm": 0.38474106788635254,
      "learning_rate": 0.00017396271248400655,
      "loss": 1.9363,
      "step": 78000
    },
    {
      "epoch": 5.586197143261208,
      "grad_norm": 0.35963472723960876,
      "learning_rate": 0.0001712237250959605,
      "loss": 1.9504,
      "step": 79000
    },
    {
      "epoch": 5.65690849950502,
      "grad_norm": 0.38337036967277527,
      "learning_rate": 0.0001684819959787973,
      "loss": 1.9397,
      "step": 80000
    },
    {
      "epoch": 5.727619855748833,
      "grad_norm": 0.41345345973968506,
      "learning_rate": 0.00016574300859075123,
      "loss": 1.9517,
      "step": 81000
    },
    {
      "epoch": 5.798331211992646,
      "grad_norm": 0.4042370021343231,
      "learning_rate": 0.000163001279473588,
      "loss": 1.9455,
      "step": 82000
    },
    {
      "epoch": 5.869042568236459,
      "grad_norm": 0.3603546619415283,
      "learning_rate": 0.00016026229208554192,
      "loss": 1.9401,
      "step": 83000
    },
    {
      "epoch": 5.9397539244802715,
      "grad_norm": 0.3735804557800293,
      "learning_rate": 0.0001575205629683787,
      "loss": 1.938,
      "step": 84000
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.5290693059969069,
      "eval_loss": 2.4066717624664307,
      "eval_runtime": 102.8194,
      "eval_samples_per_second": 455.828,
      "eval_steps_per_second": 7.129,
      "step": 84852
    },
    {
      "epoch": 6.010465280724084,
      "grad_norm": 0.3881630599498749,
      "learning_rate": 0.00015478157558033266,
      "loss": 1.9349,
      "step": 85000
    },
    {
      "epoch": 6.081176636967897,
      "grad_norm": 0.41385406255722046,
      "learning_rate": 0.0001520398464631694,
      "loss": 1.8133,
      "step": 86000
    },
    {
      "epoch": 6.15188799321171,
      "grad_norm": 0.4141051173210144,
      "learning_rate": 0.0001492981173460062,
      "loss": 1.8269,
      "step": 87000
    },
    {
      "epoch": 6.222599349455523,
      "grad_norm": 0.43294477462768555,
      "learning_rate": 0.00014655912995796013,
      "loss": 1.8367,
      "step": 88000
    },
    {
      "epoch": 6.293310705699335,
      "grad_norm": 0.40869224071502686,
      "learning_rate": 0.0001438174008407969,
      "loss": 1.8283,
      "step": 89000
    },
    {
      "epoch": 6.364022061943148,
      "grad_norm": 0.43988147377967834,
      "learning_rate": 0.00014107841345275087,
      "loss": 1.8398,
      "step": 90000
    },
    {
      "epoch": 6.434733418186961,
      "grad_norm": 0.4311169385910034,
      "learning_rate": 0.00013833668433558763,
      "loss": 1.8467,
      "step": 91000
    },
    {
      "epoch": 6.5054447744307735,
      "grad_norm": 0.4262569844722748,
      "learning_rate": 0.00013559769694754156,
      "loss": 1.8504,
      "step": 92000
    },
    {
      "epoch": 6.576156130674587,
      "grad_norm": 0.4127192199230194,
      "learning_rate": 0.00013285596783037834,
      "loss": 1.8553,
      "step": 93000
    },
    {
      "epoch": 6.646867486918399,
      "grad_norm": 0.4223586916923523,
      "learning_rate": 0.0001301169804423323,
      "loss": 1.8472,
      "step": 94000
    },
    {
      "epoch": 6.717578843162212,
      "grad_norm": 0.40577736496925354,
      "learning_rate": 0.00012737525132516906,
      "loss": 1.8497,
      "step": 95000
    },
    {
      "epoch": 6.788290199406025,
      "grad_norm": 0.42064401507377625,
      "learning_rate": 0.000124636263937123,
      "loss": 1.8625,
      "step": 96000
    },
    {
      "epoch": 6.859001555649837,
      "grad_norm": 0.4312768578529358,
      "learning_rate": 0.00012189453481995978,
      "loss": 1.8687,
      "step": 97000
    },
    {
      "epoch": 6.92971291189365,
      "grad_norm": 0.41861584782600403,
      "learning_rate": 0.00011915554743191372,
      "loss": 1.8569,
      "step": 98000
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.5282999459696228,
      "eval_loss": 2.4313058853149414,
      "eval_runtime": 102.0616,
      "eval_samples_per_second": 459.213,
      "eval_steps_per_second": 7.182,
      "step": 98994
    },
    {
      "epoch": 7.000424268137463,
      "grad_norm": 0.40982282161712646,
      "learning_rate": 0.00011641381831475049,
      "loss": 1.8623,
      "step": 99000
    },
    {
      "epoch": 7.0711356243812755,
      "grad_norm": 0.5019235610961914,
      "learning_rate": 0.00011367208919758727,
      "loss": 1.7161,
      "step": 100000
    },
    {
      "epoch": 7.141846980625088,
      "grad_norm": 0.44862595200538635,
      "learning_rate": 0.00011093310180954122,
      "loss": 1.7258,
      "step": 101000
    },
    {
      "epoch": 7.212558336868901,
      "grad_norm": 0.515501856803894,
      "learning_rate": 0.00010819137269237798,
      "loss": 1.7396,
      "step": 102000
    },
    {
      "epoch": 7.283269693112714,
      "grad_norm": 0.49428591132164,
      "learning_rate": 0.00010545238530433192,
      "loss": 1.7388,
      "step": 103000
    },
    {
      "epoch": 7.353981049356527,
      "grad_norm": 0.5093473196029663,
      "learning_rate": 0.0001027106561871687,
      "loss": 1.7468,
      "step": 104000
    },
    {
      "epoch": 7.424692405600339,
      "grad_norm": 0.4532296061515808,
      "learning_rate": 9.997166879912265e-05,
      "loss": 1.7588,
      "step": 105000
    },
    {
      "epoch": 7.495403761844152,
      "grad_norm": 0.4764035940170288,
      "learning_rate": 9.722993968195941e-05,
      "loss": 1.7627,
      "step": 106000
    },
    {
      "epoch": 7.566115118087965,
      "grad_norm": 0.46253088116645813,
      "learning_rate": 9.449095229391335e-05,
      "loss": 1.7655,
      "step": 107000
    },
    {
      "epoch": 7.6368264743317775,
      "grad_norm": 0.5122313499450684,
      "learning_rate": 9.174922317675013e-05,
      "loss": 1.7677,
      "step": 108000
    },
    {
      "epoch": 7.707537830575591,
      "grad_norm": 0.4764326214790344,
      "learning_rate": 8.901023578870408e-05,
      "loss": 1.7715,
      "step": 109000
    },
    {
      "epoch": 7.778249186819403,
      "grad_norm": 0.47460654377937317,
      "learning_rate": 8.626850667154085e-05,
      "loss": 1.7665,
      "step": 110000
    },
    {
      "epoch": 7.848960543063216,
      "grad_norm": 0.4747351109981537,
      "learning_rate": 8.352951928349478e-05,
      "loss": 1.7796,
      "step": 111000
    },
    {
      "epoch": 7.919671899307029,
      "grad_norm": 0.47340136766433716,
      "learning_rate": 8.078779016633157e-05,
      "loss": 1.7741,
      "step": 112000
    },
    {
      "epoch": 7.990383255550841,
      "grad_norm": 0.47387179732322693,
      "learning_rate": 7.804880277828549e-05,
      "loss": 1.7668,
      "step": 113000
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.5260272330369687,
      "eval_loss": 2.4765777587890625,
      "eval_runtime": 102.0203,
      "eval_samples_per_second": 459.399,
      "eval_steps_per_second": 7.185,
      "step": 113136
    },
    {
      "epoch": 8.061094611794655,
      "grad_norm": 0.5383424162864685,
      "learning_rate": 7.530707366112228e-05,
      "loss": 1.6391,
      "step": 114000
    },
    {
      "epoch": 8.131805968038467,
      "grad_norm": 0.5288543105125427,
      "learning_rate": 7.256808627307622e-05,
      "loss": 1.6373,
      "step": 115000
    },
    {
      "epoch": 8.20251732428228,
      "grad_norm": 0.5492115616798401,
      "learning_rate": 6.982635715591299e-05,
      "loss": 1.6506,
      "step": 116000
    },
    {
      "epoch": 8.273228680526092,
      "grad_norm": 0.579088568687439,
      "learning_rate": 6.708736976786694e-05,
      "loss": 1.6573,
      "step": 117000
    },
    {
      "epoch": 8.343940036769904,
      "grad_norm": 0.5820958614349365,
      "learning_rate": 6.43456406507037e-05,
      "loss": 1.6593,
      "step": 118000
    },
    {
      "epoch": 8.414651393013719,
      "grad_norm": 0.4972964823246002,
      "learning_rate": 6.160391153354047e-05,
      "loss": 1.6636,
      "step": 119000
    },
    {
      "epoch": 8.485362749257531,
      "grad_norm": 0.537187933921814,
      "learning_rate": 5.886492414549442e-05,
      "loss": 1.6654,
      "step": 120000
    },
    {
      "epoch": 8.556074105501343,
      "grad_norm": 0.5469970107078552,
      "learning_rate": 5.61231950283312e-05,
      "loss": 1.6739,
      "step": 121000
    },
    {
      "epoch": 8.626785461745156,
      "grad_norm": 0.5517230033874512,
      "learning_rate": 5.3384207640285136e-05,
      "loss": 1.676,
      "step": 122000
    },
    {
      "epoch": 8.697496817988968,
      "grad_norm": 0.5250716805458069,
      "learning_rate": 5.064247852312191e-05,
      "loss": 1.6689,
      "step": 123000
    },
    {
      "epoch": 8.768208174232782,
      "grad_norm": 0.5485174059867859,
      "learning_rate": 4.790349113507585e-05,
      "loss": 1.6701,
      "step": 124000
    },
    {
      "epoch": 8.838919530476595,
      "grad_norm": 0.5471927523612976,
      "learning_rate": 4.516176201791263e-05,
      "loss": 1.6723,
      "step": 125000
    },
    {
      "epoch": 8.909630886720407,
      "grad_norm": 0.4902847409248352,
      "learning_rate": 4.2422774629866566e-05,
      "loss": 1.6744,
      "step": 126000
    },
    {
      "epoch": 8.98034224296422,
      "grad_norm": 0.5765844583511353,
      "learning_rate": 3.968104551270334e-05,
      "loss": 1.6733,
      "step": 127000
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.5228845380469165,
      "eval_loss": 2.5416789054870605,
      "eval_runtime": 102.2087,
      "eval_samples_per_second": 458.552,
      "eval_steps_per_second": 7.172,
      "step": 127278
    },
    {
      "epoch": 9.051053599208032,
      "grad_norm": 0.5943803191184998,
      "learning_rate": 3.693931639554012e-05,
      "loss": 1.586,
      "step": 128000
    },
    {
      "epoch": 9.121764955451846,
      "grad_norm": 0.5351667404174805,
      "learning_rate": 3.420032900749406e-05,
      "loss": 1.5515,
      "step": 129000
    },
    {
      "epoch": 9.192476311695659,
      "grad_norm": 0.6331674456596375,
      "learning_rate": 3.145859989033083e-05,
      "loss": 1.5732,
      "step": 130000
    },
    {
      "epoch": 9.263187667939471,
      "grad_norm": 0.5774397253990173,
      "learning_rate": 2.8719612502284773e-05,
      "loss": 1.5673,
      "step": 131000
    },
    {
      "epoch": 9.333899024183284,
      "grad_norm": 0.5716825723648071,
      "learning_rate": 2.5977883385121545e-05,
      "loss": 1.5681,
      "step": 132000
    },
    {
      "epoch": 9.404610380427096,
      "grad_norm": 0.5614861249923706,
      "learning_rate": 2.3238895997075488e-05,
      "loss": 1.5691,
      "step": 133000
    },
    {
      "epoch": 9.47532173667091,
      "grad_norm": 0.5972283482551575,
      "learning_rate": 2.0497166879912264e-05,
      "loss": 1.5756,
      "step": 134000
    },
    {
      "epoch": 9.546033092914723,
      "grad_norm": 0.6400578022003174,
      "learning_rate": 1.7758179491866203e-05,
      "loss": 1.5708,
      "step": 135000
    },
    {
      "epoch": 9.616744449158535,
      "grad_norm": 0.5687015056610107,
      "learning_rate": 1.5016450374702977e-05,
      "loss": 1.5724,
      "step": 136000
    },
    {
      "epoch": 9.687455805402347,
      "grad_norm": 0.6058946251869202,
      "learning_rate": 1.2277462986656918e-05,
      "loss": 1.5676,
      "step": 137000
    },
    {
      "epoch": 9.75816716164616,
      "grad_norm": 0.6315460801124573,
      "learning_rate": 9.535733869493694e-06,
      "loss": 1.5771,
      "step": 138000
    },
    {
      "epoch": 9.828878517889972,
      "grad_norm": 0.5992266535758972,
      "learning_rate": 6.796746481447632e-06,
      "loss": 1.5753,
      "step": 139000
    },
    {
      "epoch": 9.899589874133786,
      "grad_norm": 0.5792600512504578,
      "learning_rate": 4.055017364284408e-06,
      "loss": 1.5747,
      "step": 140000
    },
    {
      "epoch": 9.970301230377599,
      "grad_norm": 0.5985902547836304,
      "learning_rate": 1.3132882471211842e-06,
      "loss": 1.579,
      "step": 141000
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.5183717396220663,
      "eval_loss": 2.6186234951019287,
      "eval_runtime": 102.0754,
      "eval_samples_per_second": 459.151,
      "eval_steps_per_second": 7.181,
      "step": 141420
    },
    {
      "epoch": 10.0,
      "step": 141420,
      "total_flos": 6.171008476428288e+17,
      "train_loss": 2.062552438357513,
      "train_runtime": 24003.715,
      "train_samples_per_second": 188.522,
      "train_steps_per_second": 5.892
    }
  ],
  "logging_steps": 1000,
  "max_steps": 141420,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 5000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 6.171008476428288e+17,
  "train_batch_size": 32,
  "trial_name": null,
  "trial_params": null
}