|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.9995267392333176,
  "eval_steps": 500,
  "global_step": 1584,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.01,
      "grad_norm": 3.7190074920654297,
      "learning_rate": 4.9995083170283816e-05,
      "loss": 2.9245,
      "step": 10
    },
    {
      "epoch": 0.01,
      "grad_norm": 3.431870222091675,
      "learning_rate": 4.998033461515242e-05,
      "loss": 2.0053,
      "step": 20
    },
    {
      "epoch": 0.02,
      "grad_norm": 2.3315682411193848,
      "learning_rate": 4.9955760135896534e-05,
      "loss": 1.888,
      "step": 30
    },
    {
      "epoch": 0.03,
      "grad_norm": 3.2937276363372803,
      "learning_rate": 4.992136939879856e-05,
      "loss": 1.8447,
      "step": 40
    },
    {
      "epoch": 0.03,
      "grad_norm": 2.7375714778900146,
      "learning_rate": 4.9877175931330346e-05,
      "loss": 1.8212,
      "step": 50
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.15061092376709,
      "learning_rate": 4.982319711683221e-05,
      "loss": 1.793,
      "step": 60
    },
    {
      "epoch": 0.04,
      "grad_norm": 2.0427424907684326,
      "learning_rate": 4.975945418767529e-05,
      "loss": 1.756,
      "step": 70
    },
    {
      "epoch": 0.05,
      "grad_norm": 2.107785224914551,
      "learning_rate": 4.968597221690986e-05,
      "loss": 1.7285,
      "step": 80
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.100552558898926,
      "learning_rate": 4.96027801084029e-05,
      "loss": 1.7297,
      "step": 90
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.2227377891540527,
      "learning_rate": 4.950991058546893e-05,
      "loss": 1.7602,
      "step": 100
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.535144567489624,
      "learning_rate": 4.940740017799833e-05,
      "loss": 1.7433,
      "step": 110
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.6522979736328125,
      "learning_rate": 4.929528920808854e-05,
      "loss": 1.7363,
      "step": 120
    },
    {
      "epoch": 0.08,
      "grad_norm": 2.8091869354248047,
      "learning_rate": 4.917362177418342e-05,
      "loss": 1.6872,
      "step": 130
    },
    {
      "epoch": 0.09,
      "grad_norm": 2.1017510890960693,
      "learning_rate": 4.904244573372733e-05,
      "loss": 1.7084,
      "step": 140
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.6424258947372437,
      "learning_rate": 4.8901812684340564e-05,
      "loss": 1.6997,
      "step": 150
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.4547488689422607,
      "learning_rate": 4.8751777943523634e-05,
      "loss": 1.6747,
      "step": 160
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.6251146793365479,
      "learning_rate": 4.8592400526898314e-05,
      "loss": 1.6836,
      "step": 170
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.098386526107788,
      "learning_rate": 4.842374312499405e-05,
      "loss": 1.6552,
      "step": 180
    },
    {
      "epoch": 0.12,
      "grad_norm": 2.2387640476226807,
      "learning_rate": 4.824587207858888e-05,
      "loss": 1.6489,
      "step": 190
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.7299611568450928,
      "learning_rate": 4.805885735261454e-05,
      "loss": 1.6576,
      "step": 200
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.5701665878295898,
      "learning_rate": 4.786277250863599e-05,
      "loss": 1.6533,
      "step": 210
    },
    {
      "epoch": 0.14,
      "grad_norm": 2.417296886444092,
      "learning_rate": 4.765769467591625e-05,
      "loss": 1.6356,
      "step": 220
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.2636029720306396,
      "learning_rate": 4.744370452107789e-05,
      "loss": 1.6389,
      "step": 230
    },
    {
      "epoch": 0.15,
      "grad_norm": 1.576324224472046,
      "learning_rate": 4.722088621637309e-05,
      "loss": 1.6546,
      "step": 240
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.9720542430877686,
      "learning_rate": 4.698932740657479e-05,
      "loss": 1.6354,
      "step": 250
    },
    {
      "epoch": 0.16,
      "grad_norm": 1.5250279903411865,
      "learning_rate": 4.6749119174501975e-05,
      "loss": 1.6342,
      "step": 260
    },
    {
      "epoch": 0.17,
      "grad_norm": 2.4737966060638428,
      "learning_rate": 4.6500356005192514e-05,
      "loss": 1.6407,
      "step": 270
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.2792372703552246,
      "learning_rate": 4.6243135748737864e-05,
      "loss": 1.6339,
      "step": 280
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.5593037605285645,
      "learning_rate": 4.597755958179406e-05,
      "loss": 1.6095,
      "step": 290
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.3141404390335083,
      "learning_rate": 4.570373196778427e-05,
      "loss": 1.6036,
      "step": 300
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.2617065906524658,
      "learning_rate": 4.5421760615808474e-05,
      "loss": 1.6244,
      "step": 310
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.64117431640625,
      "learning_rate": 4.513175643827647e-05,
      "loss": 1.6449,
      "step": 320
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.7132749557495117,
      "learning_rate": 4.4833833507280884e-05,
      "loss": 1.5948,
      "step": 330
    },
    {
      "epoch": 0.21,
      "grad_norm": 2.1323654651641846,
      "learning_rate": 4.4528109009727336e-05,
      "loss": 1.627,
      "step": 340
    },
    {
      "epoch": 0.22,
      "grad_norm": 2.253115653991699,
      "learning_rate": 4.42147032012394e-05,
      "loss": 1.6151,
      "step": 350
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.6143097877502441,
      "learning_rate": 4.389373935885646e-05,
      "loss": 1.5838,
      "step": 360
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.3353707790374756,
      "learning_rate": 4.356534373254316e-05,
      "loss": 1.5935,
      "step": 370
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.283742904663086,
      "learning_rate": 4.322964549552943e-05,
      "loss": 1.6015,
      "step": 380
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.437249779701233,
      "learning_rate": 4.288677669350066e-05,
      "loss": 1.577,
      "step": 390
    },
    {
      "epoch": 0.25,
      "grad_norm": 1.5190638303756714,
      "learning_rate": 4.2536872192658036e-05,
      "loss": 1.5843,
      "step": 400
    },
    {
      "epoch": 0.26,
      "grad_norm": 2.1320886611938477,
      "learning_rate": 4.218006962666934e-05,
      "loss": 1.6145,
      "step": 410
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.0696591138839722,
      "learning_rate": 4.181650934253132e-05,
      "loss": 1.5601,
      "step": 420
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.3149545192718506,
      "learning_rate": 4.144633434536467e-05,
      "loss": 1.5664,
      "step": 430
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.3661577701568604,
      "learning_rate": 4.1069690242163484e-05,
      "loss": 1.6002,
      "step": 440
    },
    {
      "epoch": 0.28,
      "grad_norm": 1.6984481811523438,
      "learning_rate": 4.06867251845213e-05,
      "loss": 1.576,
      "step": 450
    },
    {
      "epoch": 0.29,
      "grad_norm": 1.2728784084320068,
      "learning_rate": 4.0297589810356165e-05,
      "loss": 1.5448,
      "step": 460
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.4147616624832153,
      "learning_rate": 3.9902437184657784e-05,
      "loss": 1.5595,
      "step": 470
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.2289011478424072,
      "learning_rate": 3.9501422739279956e-05,
      "loss": 1.5628,
      "step": 480
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.5690233707427979,
      "learning_rate": 3.909470421180201e-05,
      "loss": 1.5731,
      "step": 490
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.4935098886489868,
      "learning_rate": 3.8682441583483314e-05,
      "loss": 1.545,
      "step": 500
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.2939772605895996,
      "learning_rate": 3.8264797016335205e-05,
      "loss": 1.5793,
      "step": 510
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.2150651216506958,
      "learning_rate": 3.7841934789335164e-05,
      "loss": 1.5378,
      "step": 520
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.2153139114379883,
      "learning_rate": 3.741402123380828e-05,
      "loss": 1.5345,
      "step": 530
    },
    {
      "epoch": 0.34,
      "grad_norm": 1.290591835975647,
      "learning_rate": 3.6981224668001424e-05,
      "loss": 1.5517,
      "step": 540
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.1924967765808105,
      "learning_rate": 3.654371533087586e-05,
      "loss": 1.5472,
      "step": 550
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.6345056295394897,
      "learning_rate": 3.610166531514436e-05,
      "loss": 1.5564,
      "step": 560
    },
    {
      "epoch": 0.36,
      "grad_norm": 2.185119867324829,
      "learning_rate": 3.565524849957921e-05,
      "loss": 1.5574,
      "step": 570
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.3646321296691895,
      "learning_rate": 3.520464048061758e-05,
      "loss": 1.5584,
      "step": 580
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.2333228588104248,
      "learning_rate": 3.47500185032913e-05,
      "loss": 1.518,
      "step": 590
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.3945318460464478,
      "learning_rate": 3.4291561391508185e-05,
      "loss": 1.5339,
      "step": 600
    },
    {
      "epoch": 0.38,
      "grad_norm": 1.304306149482727,
      "learning_rate": 3.3829449477712324e-05,
      "loss": 1.5339,
      "step": 610
    },
    {
      "epoch": 0.39,
      "grad_norm": 1.6393932104110718,
      "learning_rate": 3.336386453195088e-05,
      "loss": 1.5399,
      "step": 620
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.2000635862350464,
      "learning_rate": 3.2894989690375626e-05,
      "loss": 1.5233,
      "step": 630
    },
    {
      "epoch": 0.4,
      "grad_norm": 1.1479601860046387,
      "learning_rate": 3.2423009383206876e-05,
      "loss": 1.538,
      "step": 640
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.1483389139175415,
      "learning_rate": 3.194810926218861e-05,
      "loss": 1.528,
      "step": 650
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.2403253316879272,
      "learning_rate": 3.147047612756302e-05,
      "loss": 1.5307,
      "step": 660
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.3997712135314941,
      "learning_rate": 3.099029785459328e-05,
      "loss": 1.4915,
      "step": 670
    },
    {
      "epoch": 0.43,
      "grad_norm": 1.2010352611541748,
      "learning_rate": 3.0507763319663517e-05,
      "loss": 1.5268,
      "step": 680
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.0670932531356812,
      "learning_rate": 3.002306232598497e-05,
      "loss": 1.5273,
      "step": 690
    },
    {
      "epoch": 0.44,
      "grad_norm": 1.2283655405044556,
      "learning_rate": 2.9536385528937567e-05,
      "loss": 1.5273,
      "step": 700
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.1306476593017578,
      "learning_rate": 2.9047924361076345e-05,
      "loss": 1.5072,
      "step": 710
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.1699943542480469,
      "learning_rate": 2.8557870956832132e-05,
      "loss": 1.4856,
      "step": 720
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.2550854682922363,
      "learning_rate": 2.8066418076936167e-05,
      "loss": 1.4983,
      "step": 730
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.0610970258712769,
      "learning_rate": 2.7573759032598366e-05,
      "loss": 1.5518,
      "step": 740
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.1754754781723022,
      "learning_rate": 2.7080087609469062e-05,
      "loss": 1.4998,
      "step": 750
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.1955766677856445,
      "learning_rate": 2.6585597991414114e-05,
      "loss": 1.5109,
      "step": 760
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.0891656875610352,
      "learning_rate": 2.6090484684133404e-05,
      "loss": 1.5007,
      "step": 770
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.0880335569381714,
      "learning_rate": 2.5594942438652688e-05,
      "loss": 1.5049,
      "step": 780
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.345954418182373,
      "learning_rate": 2.509916617471903e-05,
      "loss": 1.5154,
      "step": 790
    },
    {
      "epoch": 0.5,
      "grad_norm": 1.1668224334716797,
      "learning_rate": 2.46033509041298e-05,
      "loss": 1.4883,
      "step": 800
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.055127501487732,
      "learning_rate": 2.410769165402549e-05,
      "loss": 1.5053,
      "step": 810
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.0528500080108643,
      "learning_rate": 2.3612383390176503e-05,
      "loss": 1.4871,
      "step": 820
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.328258991241455,
      "learning_rate": 2.3117620940294048e-05,
      "loss": 1.5037,
      "step": 830
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.0326772928237915,
      "learning_rate": 2.2623598917395438e-05,
      "loss": 1.4525,
      "step": 840
    },
    {
      "epoch": 0.54,
      "grad_norm": 3.057058811187744,
      "learning_rate": 2.213051164325366e-05,
      "loss": 1.4898,
      "step": 850
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.1190940141677856,
      "learning_rate": 2.1638553071961708e-05,
      "loss": 1.488,
      "step": 860
    },
    {
      "epoch": 0.55,
      "grad_norm": 1.1501041650772095,
      "learning_rate": 2.1147916713641367e-05,
      "loss": 1.4711,
      "step": 870
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.090022325515747,
      "learning_rate": 2.0658795558326743e-05,
      "loss": 1.488,
      "step": 880
    },
    {
      "epoch": 0.56,
      "grad_norm": 1.0642565488815308,
      "learning_rate": 2.017138200005236e-05,
      "loss": 1.4791,
      "step": 890
    },
    {
      "epoch": 0.57,
      "grad_norm": 1.3562296628952026,
      "learning_rate": 1.9685867761175584e-05,
      "loss": 1.4956,
      "step": 900
    },
    {
      "epoch": 0.57,
      "grad_norm": 1.2069261074066162,
      "learning_rate": 1.9202443816963425e-05,
      "loss": 1.4918,
      "step": 910
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.3227437734603882,
      "learning_rate": 1.872130032047302e-05,
      "loss": 1.4577,
      "step": 920
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.0784181356430054,
      "learning_rate": 1.824262652775568e-05,
      "loss": 1.4888,
      "step": 930
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.000135898590088,
      "learning_rate": 1.7766610723413684e-05,
      "loss": 1.4673,
      "step": 940
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.136026382446289,
      "learning_rate": 1.7293440146539196e-05,
      "loss": 1.4779,
      "step": 950
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.123252272605896,
      "learning_rate": 1.682330091706446e-05,
      "loss": 1.4583,
      "step": 960
    },
    {
      "epoch": 0.61,
      "grad_norm": 1.0559343099594116,
      "learning_rate": 1.6356377962552238e-05,
      "loss": 1.4471,
      "step": 970
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.0266658067703247,
      "learning_rate": 1.589285494545514e-05,
      "loss": 1.4632,
      "step": 980
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.1371444463729858,
      "learning_rate": 1.5432914190872757e-05,
      "loss": 1.4732,
      "step": 990
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.1203784942626953,
      "learning_rate": 1.4976736614834664e-05,
      "loss": 1.452,
      "step": 1000
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.0037944316864014,
      "learning_rate": 1.4524501653137787e-05,
      "loss": 1.461,
      "step": 1010
    },
    {
      "epoch": 0.64,
      "grad_norm": 1.1353282928466797,
      "learning_rate": 1.4076387190766017e-05,
      "loss": 1.4538,
      "step": 1020
    },
    {
      "epoch": 0.65,
      "grad_norm": 1.1203887462615967,
      "learning_rate": 1.363256949191972e-05,
      "loss": 1.4681,
      "step": 1030
    },
    {
      "epoch": 0.66,
      "grad_norm": 1.0686651468276978,
      "learning_rate": 1.3193223130682936e-05,
      "loss": 1.4548,
      "step": 1040
    },
    {
      "epoch": 0.66,
      "grad_norm": 1.0339988470077515,
      "learning_rate": 1.2758520922355226e-05,
      "loss": 1.4535,
      "step": 1050
    },
    {
      "epoch": 0.67,
      "grad_norm": 1.4555269479751587,
      "learning_rate": 1.2328633855475429e-05,
      "loss": 1.4621,
      "step": 1060
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.0318940877914429,
      "learning_rate": 1.1903731024563966e-05,
      "loss": 1.4621,
      "step": 1070
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.084612488746643,
      "learning_rate": 1.148397956361007e-05,
      "loss": 1.4636,
      "step": 1080
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.0705621242523193,
      "learning_rate": 1.106954458033026e-05,
      "loss": 1.4495,
      "step": 1090
    },
    {
      "epoch": 0.69,
      "grad_norm": 1.050857424736023,
      "learning_rate": 1.0660589091223855e-05,
      "loss": 1.4395,
      "step": 1100
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.0744839906692505,
      "learning_rate": 1.025727395745095e-05,
      "loss": 1.4583,
      "step": 1110
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.0446105003356934,
      "learning_rate": 9.859757821558337e-06,
      "loss": 1.4606,
      "step": 1120
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.1479051113128662,
      "learning_rate": 9.468197045077976e-06,
      "loss": 1.454,
      "step": 1130
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.985953152179718,
      "learning_rate": 9.082745647022797e-06,
      "loss": 1.4654,
      "step": 1140
    },
    {
      "epoch": 0.73,
      "grad_norm": 1.1085201501846313,
      "learning_rate": 8.703555243303835e-06,
      "loss": 1.4526,
      "step": 1150
    },
    {
      "epoch": 0.73,
      "grad_norm": 1.2304482460021973,
      "learning_rate": 8.330774987092712e-06,
      "loss": 1.448,
      "step": 1160
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.0740071535110474,
      "learning_rate": 7.96455151015272e-06,
      "loss": 1.4606,
      "step": 1170
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.0380760431289673,
      "learning_rate": 7.605028865161809e-06,
      "loss": 1.4661,
      "step": 1180
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.1115810871124268,
      "learning_rate": 7.25234846904993e-06,
      "loss": 1.4567,
      "step": 1190
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.9248858094215393,
      "learning_rate": 6.906649047373246e-06,
      "loss": 1.4372,
      "step": 1200
    },
    {
      "epoch": 0.76,
      "grad_norm": 1.0288389921188354,
      "learning_rate": 6.568066579746901e-06,
      "loss": 1.4542,
      "step": 1210
    },
    {
      "epoch": 0.77,
      "grad_norm": 1.0125763416290283,
      "learning_rate": 6.2367342463579475e-06,
      "loss": 1.4426,
      "step": 1220
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.9536031484603882,
      "learning_rate": 5.912782375579412e-06,
      "loss": 1.4292,
      "step": 1230
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.993061363697052,
      "learning_rate": 5.596338392706077e-06,
      "loss": 1.432,
      "step": 1240
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.9642956852912903,
      "learning_rate": 5.2875267698322325e-06,
      "loss": 1.4427,
      "step": 1250
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.9925894737243652,
      "learning_rate": 4.986468976890993e-06,
      "loss": 1.4199,
      "step": 1260
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.0030889511108398,
      "learning_rate": 4.693283433874565e-06,
      "loss": 1.4253,
      "step": 1270
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.986602783203125,
      "learning_rate": 4.408085464254183e-06,
      "loss": 1.4382,
      "step": 1280
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.9463419318199158,
      "learning_rate": 4.130987249617993e-06,
      "loss": 1.439,
      "step": 1290
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.9418216347694397,
      "learning_rate": 3.8620977855448935e-06,
      "loss": 1.4322,
      "step": 1300
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.067226529121399,
      "learning_rate": 3.601522838731461e-06,
      "loss": 1.4305,
      "step": 1310
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.9662885665893555,
      "learning_rate": 3.3493649053890326e-06,
      "loss": 1.4188,
      "step": 1320
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.1397868394851685,
      "learning_rate": 3.1057231709272077e-06,
      "loss": 1.4426,
      "step": 1330
    },
    {
      "epoch": 0.85,
      "grad_norm": 1.0030759572982788,
      "learning_rate": 2.8706934709395892e-06,
      "loss": 1.4185,
      "step": 1340
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.9549908638000488,
      "learning_rate": 2.6443682535072177e-06,
      "loss": 1.4276,
      "step": 1350
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.9839365482330322,
      "learning_rate": 2.4268365428344736e-06,
      "loss": 1.4174,
      "step": 1360
    },
    {
      "epoch": 0.86,
      "grad_norm": 0.954189121723175,
      "learning_rate": 2.21818390423168e-06,
      "loss": 1.441,
      "step": 1370
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.9914742708206177,
      "learning_rate": 2.0184924104583613e-06,
      "loss": 1.4322,
      "step": 1380
    },
    {
      "epoch": 0.88,
      "grad_norm": 0.9965653419494629,
      "learning_rate": 1.8278406094401623e-06,
      "loss": 1.411,
      "step": 1390
    },
    {
      "epoch": 0.88,
      "grad_norm": 1.0744175910949707,
      "learning_rate": 1.6463034933723337e-06,
      "loss": 1.4368,
      "step": 1400
    },
    {
      "epoch": 0.89,
      "grad_norm": 0.9871243238449097,
      "learning_rate": 1.4739524692218314e-06,
      "loss": 1.396,
      "step": 1410
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.9976981282234192,
      "learning_rate": 1.3108553306396265e-06,
      "loss": 1.439,
      "step": 1420
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.9817109704017639,
      "learning_rate": 1.1570762312943295e-06,
      "loss": 1.4113,
      "step": 1430
    },
    {
      "epoch": 0.91,
      "grad_norm": 0.9741029143333435,
      "learning_rate": 1.0126756596375686e-06,
      "loss": 1.4438,
      "step": 1440
    },
    {
      "epoch": 0.91,
      "grad_norm": 1.0171328783035278,
      "learning_rate": 8.777104151110826e-07,
      "loss": 1.4365,
      "step": 1450
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.980021595954895,
      "learning_rate": 7.522335858048707e-07,
      "loss": 1.4355,
      "step": 1460
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.9966154098510742,
      "learning_rate": 6.362945275751736e-07,
      "loss": 1.431,
      "step": 1470
    },
    {
      "epoch": 0.93,
      "grad_norm": 0.9687898755073547,
      "learning_rate": 5.299388446305343e-07,
      "loss": 1.4057,
      "step": 1480
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.9906119704246521,
      "learning_rate": 4.3320837159353813e-07,
      "loss": 1.421,
      "step": 1490
    },
    {
      "epoch": 0.95,
      "grad_norm": 1.0227527618408203,
      "learning_rate": 3.4614115704533767e-07,
      "loss": 1.4319,
      "step": 1500
    },
    {
      "epoch": 0.95,
      "grad_norm": 1.0115277767181396,
      "learning_rate": 2.687714485593462e-07,
      "loss": 1.4295,
      "step": 1510
    },
    {
      "epoch": 0.96,
      "grad_norm": 0.993654727935791,
      "learning_rate": 2.011296792301165e-07,
      "loss": 1.4294,
      "step": 1520
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.8775748014450073,
      "learning_rate": 1.4324245570256633e-07,
      "loss": 1.4562,
      "step": 1530
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.9754842519760132,
      "learning_rate": 9.513254770636137e-08,
      "loss": 1.4447,
      "step": 1540
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.9996697902679443,
      "learning_rate": 5.681887909952388e-08,
      "loss": 1.4229,
      "step": 1550
    },
    {
      "epoch": 0.98,
      "grad_norm": 0.9914098381996155,
      "learning_rate": 2.831652042480093e-08,
      "loss": 1.4458,
      "step": 1560
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.9639108777046204,
      "learning_rate": 9.636682981720158e-09,
      "loss": 1.4267,
      "step": 1570
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.9515108466148376,
      "learning_rate": 7.867144166728846e-10,
      "loss": 1.4373,
      "step": 1580
    },
    {
      "epoch": 1.0,
      "step": 1584,
      "total_flos": 1.1098698583858217e+18,
      "train_loss": 1.5383612829627413,
      "train_runtime": 4681.1872,
      "train_samples_per_second": 21.666,
      "train_steps_per_second": 0.338
    }
  ],
  "logging_steps": 10,
  "max_steps": 1584,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 1000,
  "total_flos": 1.1098698583858217e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}