|
{ |
|
"best_metric": 0.564625850340136, |
|
"best_model_checkpoint": "/data/users/bking2/tod_zero/outputs/runs/finetune/starcoder_3b/mar_18_otj9Am5v/offline_label/bqag3oyb/checkpoint-22400", |
|
"epoch": 0.7928838672910627, |
|
"eval_steps": 3200, |
|
"global_step": 32000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.6985061168670654, |
|
"learning_rate": 2.5e-06, |
|
"loss": 1.1039, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.080778956413269, |
|
"learning_rate": 5e-06, |
|
"loss": 0.2136, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.0957249402999878, |
|
"learning_rate": 4.9999696912850374e-06, |
|
"loss": 0.1732, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 1.0785242319107056, |
|
"learning_rate": 4.999878765875043e-06, |
|
"loss": 0.1636, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.1675797700881958, |
|
"learning_rate": 4.999727225974682e-06, |
|
"loss": 0.1622, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.1212489604949951, |
|
"learning_rate": 4.999515075258341e-06, |
|
"loss": 0.1633, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.9962964653968811, |
|
"learning_rate": 4.999242318870029e-06, |
|
"loss": 0.149, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.0880298614501953, |
|
"learning_rate": 4.998908963423264e-06, |
|
"loss": 0.1558, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.0359498262405396, |
|
"learning_rate": 4.998515017000907e-06, |
|
"loss": 0.1558, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 1.019607663154602, |
|
"learning_rate": 4.998060489154965e-06, |
|
"loss": 0.1504, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.9753773808479309, |
|
"learning_rate": 4.997545390906362e-06, |
|
"loss": 0.1502, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.9316331148147583, |
|
"learning_rate": 4.996969734744671e-06, |
|
"loss": 0.1445, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.8932620286941528, |
|
"learning_rate": 4.99633353462781e-06, |
|
"loss": 0.1419, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.8961341977119446, |
|
"learning_rate": 4.995636805981707e-06, |
|
"loss": 0.1424, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 1.0110450983047485, |
|
"learning_rate": 4.99487956569992e-06, |
|
"loss": 0.1484, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.9437933564186096, |
|
"learning_rate": 4.994061832143235e-06, |
|
"loss": 0.1382, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.836251974105835, |
|
"learning_rate": 4.993183625139212e-06, |
|
"loss": 0.1448, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.9206503629684448, |
|
"learning_rate": 4.992244965981714e-06, |
|
"loss": 0.1408, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.9278163313865662, |
|
"learning_rate": 4.991245877430382e-06, |
|
"loss": 0.1396, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 0.9222483038902283, |
|
"learning_rate": 4.990186383710089e-06, |
|
"loss": 0.1387, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.9481703042984009, |
|
"learning_rate": 4.9890665105103484e-06, |
|
"loss": 0.1391, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.865689218044281, |
|
"learning_rate": 4.987886284984695e-06, |
|
"loss": 0.1329, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.9181361794471741, |
|
"learning_rate": 4.986645735750025e-06, |
|
"loss": 0.1399, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.942889928817749, |
|
"learning_rate": 4.985344892885899e-06, |
|
"loss": 0.144, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.9355672001838684, |
|
"learning_rate": 4.98398378793382e-06, |
|
"loss": 0.1412, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.8571763634681702, |
|
"learning_rate": 4.982562453896458e-06, |
|
"loss": 0.1323, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.8957087397575378, |
|
"learning_rate": 4.9810809252368615e-06, |
|
"loss": 0.1356, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 0.9295082688331604, |
|
"learning_rate": 4.979539237877615e-06, |
|
"loss": 0.1332, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.8543279767036438, |
|
"learning_rate": 4.977937429199968e-06, |
|
"loss": 0.1335, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.9446925520896912, |
|
"learning_rate": 4.976275538042932e-06, |
|
"loss": 0.1336, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.8666749596595764, |
|
"learning_rate": 4.974553604702332e-06, |
|
"loss": 0.1369, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.9225461483001709, |
|
"learning_rate": 4.972771670929841e-06, |
|
"loss": 0.1326, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.6894574165344238, |
|
"learning_rate": 4.970929779931955e-06, |
|
"loss": 0.1317, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.8522433638572693, |
|
"learning_rate": 4.969027976368954e-06, |
|
"loss": 0.134, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.8407069444656372, |
|
"learning_rate": 4.967066306353816e-06, |
|
"loss": 0.13, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 0.901373565196991, |
|
"learning_rate": 4.9650448174510986e-06, |
|
"loss": 0.1347, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.8298419713973999, |
|
"learning_rate": 4.9629635586757865e-06, |
|
"loss": 0.1329, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.7622634172439575, |
|
"learning_rate": 4.960822580492103e-06, |
|
"loss": 0.1248, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.8928254842758179, |
|
"learning_rate": 4.958621934812286e-06, |
|
"loss": 0.1349, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.830184817314148, |
|
"learning_rate": 4.95636167499533e-06, |
|
"loss": 0.1247, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.9439135193824768, |
|
"learning_rate": 4.9540418558456915e-06, |
|
"loss": 0.1346, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.8274710178375244, |
|
"learning_rate": 4.951662533611959e-06, |
|
"loss": 0.1321, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.8595709204673767, |
|
"learning_rate": 4.9492237659854946e-06, |
|
"loss": 0.131, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 0.864810049533844, |
|
"learning_rate": 4.9467256120990255e-06, |
|
"loss": 0.1286, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.7884636521339417, |
|
"learning_rate": 4.9441681325252215e-06, |
|
"loss": 0.132, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.7718741297721863, |
|
"learning_rate": 4.941551389275217e-06, |
|
"loss": 0.1293, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.8145634531974792, |
|
"learning_rate": 4.938875445797112e-06, |
|
"loss": 0.1348, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.8029401898384094, |
|
"learning_rate": 4.936140366974434e-06, |
|
"loss": 0.1309, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.7573081851005554, |
|
"learning_rate": 4.933346219124562e-06, |
|
"loss": 0.1257, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.8588012456893921, |
|
"learning_rate": 4.93049306999712e-06, |
|
"loss": 0.1342, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.7663673758506775, |
|
"learning_rate": 4.927580988772336e-06, |
|
"loss": 0.1263, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 0.6892865300178528, |
|
"learning_rate": 4.9246100460593606e-06, |
|
"loss": 0.1304, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.7494785785675049, |
|
"learning_rate": 4.92158031389456e-06, |
|
"loss": 0.1258, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.7757202982902527, |
|
"learning_rate": 4.918491865739763e-06, |
|
"loss": 0.1284, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.8363822102546692, |
|
"learning_rate": 4.915344776480487e-06, |
|
"loss": 0.1271, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.8193435668945312, |
|
"learning_rate": 4.912139122424118e-06, |
|
"loss": 0.1309, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.7342443466186523, |
|
"learning_rate": 4.908874981298058e-06, |
|
"loss": 0.1319, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.7340241074562073, |
|
"learning_rate": 4.9055524322478456e-06, |
|
"loss": 0.1263, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.8974625468254089, |
|
"learning_rate": 4.902171555835236e-06, |
|
"loss": 0.1272, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 0.8946986794471741, |
|
"learning_rate": 4.8987324340362445e-06, |
|
"loss": 0.1316, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.8382922410964966, |
|
"learning_rate": 4.895235150239159e-06, |
|
"loss": 0.1309, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.7496135234832764, |
|
"learning_rate": 4.891679789242524e-06, |
|
"loss": 0.1271, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.9334201812744141, |
|
"learning_rate": 4.8880664372530765e-06, |
|
"loss": 0.1264, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.879549503326416, |
|
"learning_rate": 4.884395181883661e-06, |
|
"loss": 0.1295, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.8264172673225403, |
|
"learning_rate": 4.880666112151104e-06, |
|
"loss": 0.1239, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.7288581728935242, |
|
"learning_rate": 4.876879318474056e-06, |
|
"loss": 0.1308, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.6938970685005188, |
|
"learning_rate": 4.873034892670795e-06, |
|
"loss": 0.124, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.7392057776451111, |
|
"learning_rate": 4.869132927957007e-06, |
|
"loss": 0.129, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.7360211610794067, |
|
"learning_rate": 4.8651735189435205e-06, |
|
"loss": 0.1218, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.8656970262527466, |
|
"learning_rate": 4.861156761634014e-06, |
|
"loss": 0.1271, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.7724373936653137, |
|
"learning_rate": 4.857082753422691e-06, |
|
"loss": 0.1266, |
|
"step": 3550 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.7553298473358154, |
|
"learning_rate": 4.852951593091914e-06, |
|
"loss": 0.1251, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.8519923090934753, |
|
"learning_rate": 4.848763380809811e-06, |
|
"loss": 0.1253, |
|
"step": 3650 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.7823351621627808, |
|
"learning_rate": 4.844518218127849e-06, |
|
"loss": 0.1234, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.863345742225647, |
|
"learning_rate": 4.840216207978368e-06, |
|
"loss": 0.1298, |
|
"step": 3750 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 0.729887068271637, |
|
"learning_rate": 4.835857454672087e-06, |
|
"loss": 0.1293, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.8041313290596008, |
|
"learning_rate": 4.831442063895575e-06, |
|
"loss": 0.125, |
|
"step": 3850 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6497385501861572, |
|
"learning_rate": 4.8269701427086905e-06, |
|
"loss": 0.1272, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.7458536624908447, |
|
"learning_rate": 4.822441799541979e-06, |
|
"loss": 0.1251, |
|
"step": 3950 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.7111815810203552, |
|
"learning_rate": 4.8178571441940515e-06, |
|
"loss": 0.1209, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.8121939301490784, |
|
"learning_rate": 4.813216287828917e-06, |
|
"loss": 0.1209, |
|
"step": 4050 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6625011563301086, |
|
"learning_rate": 4.808519342973289e-06, |
|
"loss": 0.1277, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.8151265978813171, |
|
"learning_rate": 4.80376642351386e-06, |
|
"loss": 0.1268, |
|
"step": 4150 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.6908511519432068, |
|
"learning_rate": 4.798957644694533e-06, |
|
"loss": 0.1247, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.653842568397522, |
|
"learning_rate": 4.794093123113635e-06, |
|
"loss": 0.1257, |
|
"step": 4250 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.8020724654197693, |
|
"learning_rate": 4.789172976721082e-06, |
|
"loss": 0.121, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.7690026164054871, |
|
"learning_rate": 4.7841973248155275e-06, |
|
"loss": 0.1265, |
|
"step": 4350 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.8199232220649719, |
|
"learning_rate": 4.779166288041463e-06, |
|
"loss": 0.1233, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.6871870756149292, |
|
"learning_rate": 4.7740799883862966e-06, |
|
"loss": 0.1252, |
|
"step": 4450 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.8392221927642822, |
|
"learning_rate": 4.7689385491773934e-06, |
|
"loss": 0.1294, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.678961455821991, |
|
"learning_rate": 4.7637420950790855e-06, |
|
"loss": 0.1205, |
|
"step": 4550 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 0.8034936785697937, |
|
"learning_rate": 4.75849075208965e-06, |
|
"loss": 0.1237, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.5964626669883728, |
|
"learning_rate": 4.7531846475382526e-06, |
|
"loss": 0.1196, |
|
"step": 4650 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.7667948007583618, |
|
"learning_rate": 4.7478239100818626e-06, |
|
"loss": 0.1215, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.896281898021698, |
|
"learning_rate": 4.742408669702131e-06, |
|
"loss": 0.1253, |
|
"step": 4750 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.6606175303459167, |
|
"learning_rate": 4.736939057702239e-06, |
|
"loss": 0.1199, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.7194575071334839, |
|
"learning_rate": 4.731415206703714e-06, |
|
"loss": 0.1275, |
|
"step": 4850 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.7007895112037659, |
|
"learning_rate": 4.725837250643218e-06, |
|
"loss": 0.1241, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.703335702419281, |
|
"learning_rate": 4.720205324769296e-06, |
|
"loss": 0.1209, |
|
"step": 4950 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 0.7912906408309937, |
|
"learning_rate": 4.714519565639095e-06, |
|
"loss": 0.1205, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.7368320226669312, |
|
"learning_rate": 4.708780111115058e-06, |
|
"loss": 0.1267, |
|
"step": 5050 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.8405609726905823, |
|
"learning_rate": 4.702987100361578e-06, |
|
"loss": 0.1217, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.768998384475708, |
|
"learning_rate": 4.697140673841624e-06, |
|
"loss": 0.1235, |
|
"step": 5150 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.730161726474762, |
|
"learning_rate": 4.6912409733133365e-06, |
|
"loss": 0.1234, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.7618039846420288, |
|
"learning_rate": 4.685288141826589e-06, |
|
"loss": 0.1206, |
|
"step": 5250 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.7404894828796387, |
|
"learning_rate": 4.679282323719519e-06, |
|
"loss": 0.1207, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.8315970301628113, |
|
"learning_rate": 4.67322366461503e-06, |
|
"loss": 0.1197, |
|
"step": 5350 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 0.8174512386322021, |
|
"learning_rate": 4.66711231141726e-06, |
|
"loss": 0.1246, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.7582206130027771, |
|
"learning_rate": 4.660948412308018e-06, |
|
"loss": 0.1179, |
|
"step": 5450 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.7047674655914307, |
|
"learning_rate": 4.654732116743193e-06, |
|
"loss": 0.124, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.7401306629180908, |
|
"learning_rate": 4.64846357544913e-06, |
|
"loss": 0.1209, |
|
"step": 5550 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.7297683954238892, |
|
"learning_rate": 4.642142940418973e-06, |
|
"loss": 0.124, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.6836703419685364, |
|
"learning_rate": 4.635770364908984e-06, |
|
"loss": 0.1188, |
|
"step": 5650 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.6775400042533875, |
|
"learning_rate": 4.629346003434822e-06, |
|
"loss": 0.1211, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.8052581548690796, |
|
"learning_rate": 4.622870011767798e-06, |
|
"loss": 0.1203, |
|
"step": 5750 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.7253400087356567, |
|
"learning_rate": 4.616342546931103e-06, |
|
"loss": 0.1226, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 0.6844501495361328, |
|
"learning_rate": 4.609763767195991e-06, |
|
"loss": 0.1208, |
|
"step": 5850 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.7401016354560852, |
|
"learning_rate": 4.603133832077953e-06, |
|
"loss": 0.1192, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.8344407081604004, |
|
"learning_rate": 4.596452902332839e-06, |
|
"loss": 0.1198, |
|
"step": 5950 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.6959987878799438, |
|
"learning_rate": 4.589721139952964e-06, |
|
"loss": 0.1156, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.6912865042686462, |
|
"learning_rate": 4.582938708163183e-06, |
|
"loss": 0.1151, |
|
"step": 6050 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.8227457404136658, |
|
"learning_rate": 4.576105771416928e-06, |
|
"loss": 0.1196, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.7104191184043884, |
|
"learning_rate": 4.569222495392227e-06, |
|
"loss": 0.1228, |
|
"step": 6150 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.816095232963562, |
|
"learning_rate": 4.562289046987679e-06, |
|
"loss": 0.125, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 0.7309590578079224, |
|
"learning_rate": 4.555305594318414e-06, |
|
"loss": 0.1229, |
|
"step": 6250 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6685283184051514, |
|
"learning_rate": 4.548272306712013e-06, |
|
"loss": 0.1204, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7148935794830322, |
|
"learning_rate": 4.541189354704403e-06, |
|
"loss": 0.1213, |
|
"step": 6350 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7746471762657166, |
|
"learning_rate": 4.534056910035724e-06, |
|
"loss": 0.1176, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6856210231781006, |
|
"learning_rate": 4.5268751456461605e-06, |
|
"loss": 0.1214, |
|
"step": 6450 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7244325280189514, |
|
"learning_rate": 4.5196442356717526e-06, |
|
"loss": 0.119, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7556556463241577, |
|
"learning_rate": 4.512364355440172e-06, |
|
"loss": 0.1236, |
|
"step": 6550 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.7055052518844604, |
|
"learning_rate": 4.505035681466472e-06, |
|
"loss": 0.119, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.6788272261619568, |
|
"learning_rate": 4.497658391448803e-06, |
|
"loss": 0.1189, |
|
"step": 6650 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.8098623752593994, |
|
"learning_rate": 4.49023266426411e-06, |
|
"loss": 0.1178, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.5360668301582336, |
|
"learning_rate": 4.482758679963792e-06, |
|
"loss": 0.1123, |
|
"step": 6750 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.7503378987312317, |
|
"learning_rate": 4.475236619769336e-06, |
|
"loss": 0.1232, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.8074086308479309, |
|
"learning_rate": 4.4676666660679265e-06, |
|
"loss": 0.1124, |
|
"step": 6850 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.7047454118728638, |
|
"learning_rate": 4.460049002408018e-06, |
|
"loss": 0.1224, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.7454392313957214, |
|
"learning_rate": 4.452383813494887e-06, |
|
"loss": 0.1198, |
|
"step": 6950 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.7296658158302307, |
|
"learning_rate": 4.444671285186155e-06, |
|
"loss": 0.1191, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 0.7346377372741699, |
|
"learning_rate": 4.4369116044872786e-06, |
|
"loss": 0.123, |
|
"step": 7050 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.7199191451072693, |
|
"learning_rate": 4.42910495954702e-06, |
|
"loss": 0.1226, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.8274137377738953, |
|
"learning_rate": 4.421251539652879e-06, |
|
"loss": 0.1165, |
|
"step": 7150 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.6081553101539612, |
|
"learning_rate": 4.413351535226507e-06, |
|
"loss": 0.1205, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.6576844453811646, |
|
"learning_rate": 4.4054051378190915e-06, |
|
"loss": 0.1196, |
|
"step": 7250 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.8263300657272339, |
|
"learning_rate": 4.397412540106707e-06, |
|
"loss": 0.1194, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.7084447145462036, |
|
"learning_rate": 4.3893739358856465e-06, |
|
"loss": 0.1167, |
|
"step": 7350 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.6882210969924927, |
|
"learning_rate": 4.38128952006772e-06, |
|
"loss": 0.1179, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 0.5834758281707764, |
|
"learning_rate": 4.373159488675533e-06, |
|
"loss": 0.1126, |
|
"step": 7450 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.6505135893821716, |
|
"learning_rate": 4.364984038837727e-06, |
|
"loss": 0.1225, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.6714935898780823, |
|
"learning_rate": 4.356763368784207e-06, |
|
"loss": 0.1178, |
|
"step": 7550 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.7283095121383667, |
|
"learning_rate": 4.348497677841328e-06, |
|
"loss": 0.1184, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.6789336800575256, |
|
"learning_rate": 4.340187166427067e-06, |
|
"loss": 0.1166, |
|
"step": 7650 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.6609737277030945, |
|
"learning_rate": 4.331832036046162e-06, |
|
"loss": 0.1171, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.8023338317871094, |
|
"learning_rate": 4.323432489285223e-06, |
|
"loss": 0.1169, |
|
"step": 7750 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.6348956227302551, |
|
"learning_rate": 4.3149887298078275e-06, |
|
"loss": 0.1199, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 0.7053850293159485, |
|
"learning_rate": 4.306500962349573e-06, |
|
"loss": 0.1216, |
|
"step": 7850 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7582606077194214, |
|
"learning_rate": 4.2979693927131205e-06, |
|
"loss": 0.1191, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7093271017074585, |
|
"learning_rate": 4.289394227763199e-06, |
|
"loss": 0.1132, |
|
"step": 7950 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.8069114685058594, |
|
"learning_rate": 4.2807756754215926e-06, |
|
"loss": 0.1163, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7039481997489929, |
|
"learning_rate": 4.272113944662099e-06, |
|
"loss": 0.1236, |
|
"step": 8050 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7971010208129883, |
|
"learning_rate": 4.263409245505461e-06, |
|
"loss": 0.1211, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7244572639465332, |
|
"learning_rate": 4.254661789014274e-06, |
|
"loss": 0.1151, |
|
"step": 8150 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7881250381469727, |
|
"learning_rate": 4.2458717872878715e-06, |
|
"loss": 0.1169, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.7103307843208313, |
|
"learning_rate": 4.237039453457179e-06, |
|
"loss": 0.1208, |
|
"step": 8250 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.7782537937164307, |
|
"learning_rate": 4.228165001679547e-06, |
|
"loss": 0.1229, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.6521384119987488, |
|
"learning_rate": 4.219248647133559e-06, |
|
"loss": 0.1203, |
|
"step": 8350 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.7174645662307739, |
|
"learning_rate": 4.210290606013813e-06, |
|
"loss": 0.116, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.8697425723075867, |
|
"learning_rate": 4.2012910955256825e-06, |
|
"loss": 0.1138, |
|
"step": 8450 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.6629298329353333, |
|
"learning_rate": 4.192250333880045e-06, |
|
"loss": 0.1177, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.6568993330001831, |
|
"learning_rate": 4.183168540287995e-06, |
|
"loss": 0.1177, |
|
"step": 8550 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.6727361083030701, |
|
"learning_rate": 4.174045934955527e-06, |
|
"loss": 0.1166, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 0.7124900221824646, |
|
"learning_rate": 4.164882739078197e-06, |
|
"loss": 0.1199, |
|
"step": 8650 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.6333892941474915, |
|
"learning_rate": 4.155679174835758e-06, |
|
"loss": 0.1158, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.6718785762786865, |
|
"learning_rate": 4.146435465386776e-06, |
|
"loss": 0.1155, |
|
"step": 8750 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.7972610592842102, |
|
"learning_rate": 4.137151834863213e-06, |
|
"loss": 0.1165, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.6914675831794739, |
|
"learning_rate": 4.1278285083649985e-06, |
|
"loss": 0.115, |
|
"step": 8850 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.7176665663719177, |
|
"learning_rate": 4.11846571195457e-06, |
|
"loss": 0.1191, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.6305205821990967, |
|
"learning_rate": 4.1090636726513875e-06, |
|
"loss": 0.1162, |
|
"step": 8950 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.655022144317627, |
|
"learning_rate": 4.0996226184264355e-06, |
|
"loss": 0.1172, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 0.7785295248031616, |
|
"learning_rate": 4.090142778196692e-06, |
|
"loss": 0.1188, |
|
"step": 9050 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.6232600212097168, |
|
"learning_rate": 4.080624381819577e-06, |
|
"loss": 0.1172, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.7053920030593872, |
|
"learning_rate": 4.071067660087379e-06, |
|
"loss": 0.111, |
|
"step": 9150 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.7314987182617188, |
|
"learning_rate": 4.061472844721664e-06, |
|
"loss": 0.1238, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.7953878045082092, |
|
"learning_rate": 4.05184016836765e-06, |
|
"loss": 0.1131, |
|
"step": 9250 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.840351402759552, |
|
"learning_rate": 4.042169864588571e-06, |
|
"loss": 0.1117, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.8227623701095581, |
|
"learning_rate": 4.032462167860012e-06, |
|
"loss": 0.1151, |
|
"step": 9350 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.6529046297073364, |
|
"learning_rate": 4.022717313564223e-06, |
|
"loss": 0.1164, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 0.7209616899490356, |
|
"learning_rate": 4.012935537984414e-06, |
|
"loss": 0.1142, |
|
"step": 9450 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.6554023623466492, |
|
"learning_rate": 4.0031170782990214e-06, |
|
"loss": 0.1177, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.8092223405838013, |
|
"learning_rate": 3.993262172575962e-06, |
|
"loss": 0.1155, |
|
"step": 9550 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.7681934833526611, |
|
"learning_rate": 3.983371059766862e-06, |
|
"loss": 0.116, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.7766609787940979, |
|
"learning_rate": 3.973443979701252e-06, |
|
"loss": 0.1158, |
|
"step": 9650 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.6570335030555725, |
|
"learning_rate": 3.963481173080768e-06, |
|
"loss": 0.1151, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.7423933744430542, |
|
"learning_rate": 3.9534828814733e-06, |
|
"loss": 0.1159, |
|
"step": 9750 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.8370651006698608, |
|
"learning_rate": 3.943449347307146e-06, |
|
"loss": 0.1197, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.6548042297363281, |
|
"learning_rate": 3.9333808138651265e-06, |
|
"loss": 0.114, |
|
"step": 9850 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.7364439368247986, |
|
"learning_rate": 3.923277525278691e-06, |
|
"loss": 0.1129, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.669224202632904, |
|
"learning_rate": 3.913139726521993e-06, |
|
"loss": 0.1158, |
|
"step": 9950 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.834517240524292, |
|
"learning_rate": 3.9029676634059565e-06, |
|
"loss": 0.1159, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.6793729066848755, |
|
"learning_rate": 3.89276158257231e-06, |
|
"loss": 0.1139, |
|
"step": 10050 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.7160745859146118, |
|
"learning_rate": 3.882521731487609e-06, |
|
"loss": 0.1158, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.6966678500175476, |
|
"learning_rate": 3.872248358437236e-06, |
|
"loss": 0.1164, |
|
"step": 10150 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.7358713746070862, |
|
"learning_rate": 3.861941712519379e-06, |
|
"loss": 0.1144, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 0.7294729351997375, |
|
"learning_rate": 3.8516020436389945e-06, |
|
"loss": 0.1191, |
|
"step": 10250 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7433848977088928, |
|
"learning_rate": 3.841229602501742e-06, |
|
"loss": 0.112, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7290607690811157, |
|
"learning_rate": 3.8308246406079116e-06, |
|
"loss": 0.1131, |
|
"step": 10350 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7990119457244873, |
|
"learning_rate": 3.820387410246324e-06, |
|
"loss": 0.1155, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7290509939193726, |
|
"learning_rate": 3.809918164488208e-06, |
|
"loss": 0.1179, |
|
"step": 10450 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7162163257598877, |
|
"learning_rate": 3.7994171571810756e-06, |
|
"loss": 0.114, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7141215801239014, |
|
"learning_rate": 3.788884642942555e-06, |
|
"loss": 0.1163, |
|
"step": 10550 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.7095829844474792, |
|
"learning_rate": 3.7783208771542237e-06, |
|
"loss": 0.1133, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 0.752619743347168, |
|
"learning_rate": 3.7677261159554145e-06, |
|
"loss": 0.1191, |
|
"step": 10650 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.7586488127708435, |
|
"learning_rate": 3.757100616237006e-06, |
|
"loss": 0.1183, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.6924918293952942, |
|
"learning_rate": 3.746444635635191e-06, |
|
"loss": 0.113, |
|
"step": 10750 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.6969543695449829, |
|
"learning_rate": 3.735758432525234e-06, |
|
"loss": 0.1096, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.7095456719398499, |
|
"learning_rate": 3.725042266015201e-06, |
|
"loss": 0.1145, |
|
"step": 10850 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.7116329669952393, |
|
"learning_rate": 3.7142963959396805e-06, |
|
"loss": 0.1104, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.7710102200508118, |
|
"learning_rate": 3.7035210828534846e-06, |
|
"loss": 0.118, |
|
"step": 10950 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.5557773113250732, |
|
"learning_rate": 3.692716588025327e-06, |
|
"loss": 0.1168, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.27, |
|
"grad_norm": 0.769664466381073, |
|
"learning_rate": 3.68188317343149e-06, |
|
"loss": 0.1166, |
|
"step": 11050 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6898650527000427, |
|
"learning_rate": 3.671021101749476e-06, |
|
"loss": 0.1154, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.7608383297920227, |
|
"learning_rate": 3.6601306363516297e-06, |
|
"loss": 0.1138, |
|
"step": 11150 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6353211998939514, |
|
"learning_rate": 3.649212041298763e-06, |
|
"loss": 0.1121, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.7474289536476135, |
|
"learning_rate": 3.638265581333742e-06, |
|
"loss": 0.1169, |
|
"step": 11250 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6579691171646118, |
|
"learning_rate": 3.627291521875076e-06, |
|
"loss": 0.1102, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6420250535011292, |
|
"learning_rate": 3.616290129010476e-06, |
|
"loss": 0.111, |
|
"step": 11350 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6057188510894775, |
|
"learning_rate": 3.605261669490407e-06, |
|
"loss": 0.1125, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.5284745693206787, |
|
"learning_rate": 3.5942064107216183e-06, |
|
"loss": 0.1051, |
|
"step": 11450 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 0.6904653906822205, |
|
"learning_rate": 3.5831246207606597e-06, |
|
"loss": 0.1197, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.6652848124504089, |
|
"learning_rate": 3.57201656830738e-06, |
|
"loss": 0.1116, |
|
"step": 11550 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.7260887026786804, |
|
"learning_rate": 3.5608825226984168e-06, |
|
"loss": 0.1108, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.7659937143325806, |
|
"learning_rate": 3.549722753900662e-06, |
|
"loss": 0.1142, |
|
"step": 11650 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.6617051362991333, |
|
"learning_rate": 3.5385375325047167e-06, |
|
"loss": 0.1126, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.7225079536437988, |
|
"learning_rate": 3.5273271297183302e-06, |
|
"loss": 0.1159, |
|
"step": 11750 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.5886747241020203, |
|
"learning_rate": 3.516091817359825e-06, |
|
"loss": 0.111, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.7085060477256775, |
|
"learning_rate": 3.5048318678515052e-06, |
|
"loss": 0.1123, |
|
"step": 11850 |
|
}, |
|
{ |
|
"epoch": 0.29, |
|
"grad_norm": 0.7402300834655762, |
|
"learning_rate": 3.493547554213051e-06, |
|
"loss": 0.1161, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.7527924180030823, |
|
"learning_rate": 3.482239150054898e-06, |
|
"loss": 0.1147, |
|
"step": 11950 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.6737644076347351, |
|
"learning_rate": 3.470906929571605e-06, |
|
"loss": 0.1105, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.7493759393692017, |
|
"learning_rate": 3.459551167535205e-06, |
|
"loss": 0.1145, |
|
"step": 12050 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.7661862373352051, |
|
"learning_rate": 3.4481721392885415e-06, |
|
"loss": 0.1136, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.6885182857513428, |
|
"learning_rate": 3.4367701207385944e-06, |
|
"loss": 0.1122, |
|
"step": 12150 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.8971922993659973, |
|
"learning_rate": 3.425345388349787e-06, |
|
"loss": 0.108, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.6365148425102234, |
|
"learning_rate": 3.4138982191372838e-06, |
|
"loss": 0.113, |
|
"step": 12250 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.627581775188446, |
|
"learning_rate": 3.402428890660279e-06, |
|
"loss": 0.1094, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.73035728931427, |
|
"learning_rate": 3.390937681015256e-06, |
|
"loss": 0.1132, |
|
"step": 12350 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.5905104875564575, |
|
"learning_rate": 3.379424868829254e-06, |
|
"loss": 0.1126, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.6537535786628723, |
|
"learning_rate": 3.367890733253108e-06, |
|
"loss": 0.1137, |
|
"step": 12450 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.718180239200592, |
|
"learning_rate": 3.3563355539546795e-06, |
|
"loss": 0.1169, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.6057806611061096, |
|
"learning_rate": 3.3447596111120767e-06, |
|
"loss": 0.1127, |
|
"step": 12550 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.8001242876052856, |
|
"learning_rate": 3.333163185406861e-06, |
|
"loss": 0.1124, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.6936395168304443, |
|
"learning_rate": 3.321546558017243e-06, |
|
"loss": 0.1089, |
|
"step": 12650 |
|
}, |
|
{ |
|
"epoch": 0.31, |
|
"grad_norm": 0.664017915725708, |
|
"learning_rate": 3.309910010611259e-06, |
|
"loss": 0.1137, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6639358997344971, |
|
"learning_rate": 3.29825382533995e-06, |
|
"loss": 0.1152, |
|
"step": 12750 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6084735989570618, |
|
"learning_rate": 3.286578284830513e-06, |
|
"loss": 0.1117, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6548373103141785, |
|
"learning_rate": 3.2748836721794514e-06, |
|
"loss": 0.1146, |
|
"step": 12850 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7516295313835144, |
|
"learning_rate": 3.263170270945709e-06, |
|
"loss": 0.1095, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6284793615341187, |
|
"learning_rate": 3.2514383651437987e-06, |
|
"loss": 0.1122, |
|
"step": 12950 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7250240445137024, |
|
"learning_rate": 3.239688239236911e-06, |
|
"loss": 0.1156, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.6905579566955566, |
|
"learning_rate": 3.2279201781300206e-06, |
|
"loss": 0.1143, |
|
"step": 13050 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.7570493221282959, |
|
"learning_rate": 3.2161344671629736e-06, |
|
"loss": 0.1177, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.7020050287246704, |
|
"learning_rate": 3.2043313921035747e-06, |
|
"loss": 0.1054, |
|
"step": 13150 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.7082750201225281, |
|
"learning_rate": 3.1925112391406534e-06, |
|
"loss": 0.1118, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.6312862634658813, |
|
"learning_rate": 3.1806742948771276e-06, |
|
"loss": 0.1111, |
|
"step": 13250 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.6235171556472778, |
|
"learning_rate": 3.168820846323053e-06, |
|
"loss": 0.1082, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.7088466882705688, |
|
"learning_rate": 3.1569511808886633e-06, |
|
"loss": 0.1114, |
|
"step": 13350 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.7077568173408508, |
|
"learning_rate": 3.1450655863774053e-06, |
|
"loss": 0.1128, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 0.6110339760780334, |
|
"learning_rate": 3.1331643509789553e-06, |
|
"loss": 0.1099, |
|
"step": 13450 |
|
}, |
|
{ |
|
"epoch": 0.33, |
|
"grad_norm": 1.1900233030319214, |
|
"learning_rate": 3.121247763262235e-06, |
|
"loss": 0.1155, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.7000632882118225, |
|
"learning_rate": 3.1093161121684118e-06, |
|
"loss": 0.1166, |
|
"step": 13550 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.7171856164932251, |
|
"learning_rate": 3.097369687003896e-06, |
|
"loss": 0.1129, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.7927132248878479, |
|
"learning_rate": 3.085408777433323e-06, |
|
"loss": 0.1126, |
|
"step": 13650 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.7410084009170532, |
|
"learning_rate": 3.0734336734725327e-06, |
|
"loss": 0.116, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.8224751949310303, |
|
"learning_rate": 3.0614446654815346e-06, |
|
"loss": 0.1119, |
|
"step": 13750 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.790632963180542, |
|
"learning_rate": 3.049442044157469e-06, |
|
"loss": 0.1122, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6657869815826416, |
|
"learning_rate": 3.0374261005275606e-06, |
|
"loss": 0.1105, |
|
"step": 13850 |
|
}, |
|
{ |
|
"epoch": 0.34, |
|
"grad_norm": 0.6833150386810303, |
|
"learning_rate": 3.025397125942056e-06, |
|
"loss": 0.1158, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.7890775203704834, |
|
"learning_rate": 3.0133554120671653e-06, |
|
"loss": 0.1158, |
|
"step": 13950 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.6974292397499084, |
|
"learning_rate": 3.001301250877987e-06, |
|
"loss": 0.1121, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.7317948341369629, |
|
"learning_rate": 2.9892349346514306e-06, |
|
"loss": 0.1111, |
|
"step": 14050 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.8558988571166992, |
|
"learning_rate": 2.977156755959126e-06, |
|
"loss": 0.113, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.6563243865966797, |
|
"learning_rate": 2.9650670076603342e-06, |
|
"loss": 0.1121, |
|
"step": 14150 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.7113382816314697, |
|
"learning_rate": 2.952965982894844e-06, |
|
"loss": 0.1082, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.6555420160293579, |
|
"learning_rate": 2.9408539750758625e-06, |
|
"loss": 0.1123, |
|
"step": 14250 |
|
}, |
|
{ |
|
"epoch": 0.35, |
|
"grad_norm": 0.7071360945701599, |
|
"learning_rate": 2.9287312778829047e-06, |
|
"loss": 0.1146, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.6903111934661865, |
|
"learning_rate": 2.9165981852546688e-06, |
|
"loss": 0.1151, |
|
"step": 14350 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.6903436779975891, |
|
"learning_rate": 2.9044549913819125e-06, |
|
"loss": 0.111, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.7512865662574768, |
|
"learning_rate": 2.892301990700316e-06, |
|
"loss": 0.1081, |
|
"step": 14450 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.6243235468864441, |
|
"learning_rate": 2.8801394778833475e-06, |
|
"loss": 0.1101, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.761928141117096, |
|
"learning_rate": 2.8679677478351147e-06, |
|
"loss": 0.1124, |
|
"step": 14550 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.6263371706008911, |
|
"learning_rate": 2.8557870956832135e-06, |
|
"loss": 0.1107, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.5973655581474304, |
|
"learning_rate": 2.8435978167715753e-06, |
|
"loss": 0.1108, |
|
"step": 14650 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 0.6588687896728516, |
|
"learning_rate": 2.8314002066533053e-06, |
|
"loss": 0.1109, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.640466034412384, |
|
"learning_rate": 2.8191945610835138e-06, |
|
"loss": 0.1071, |
|
"step": 14750 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.7230612635612488, |
|
"learning_rate": 2.8069811760121463e-06, |
|
"loss": 0.1079, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6982682347297668, |
|
"learning_rate": 2.794760347576809e-06, |
|
"loss": 0.1137, |
|
"step": 14850 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6350643038749695, |
|
"learning_rate": 2.7825323720955853e-06, |
|
"loss": 0.1053, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.7661166191101074, |
|
"learning_rate": 2.7702975460598545e-06, |
|
"loss": 0.1109, |
|
"step": 14950 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6969360113143921, |
|
"learning_rate": 2.7580561661271015e-06, |
|
"loss": 0.1099, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.8628710508346558, |
|
"learning_rate": 2.7458085291137213e-06, |
|
"loss": 0.1073, |
|
"step": 15050 |
|
}, |
|
{ |
|
"epoch": 0.37, |
|
"grad_norm": 0.6863730549812317, |
|
"learning_rate": 2.733554931987825e-06, |
|
"loss": 0.1067, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.6106501817703247, |
|
"learning_rate": 2.7212956718620404e-06, |
|
"loss": 0.1083, |
|
"step": 15150 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.7088560461997986, |
|
"learning_rate": 2.709031045986302e-06, |
|
"loss": 0.1074, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.6315382122993469, |
|
"learning_rate": 2.6967613517406514e-06, |
|
"loss": 0.11, |
|
"step": 15250 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.8497392535209656, |
|
"learning_rate": 2.68448688662802e-06, |
|
"loss": 0.1063, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.7048566341400146, |
|
"learning_rate": 2.6722079482670196e-06, |
|
"loss": 0.1044, |
|
"step": 15350 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.6477850675582886, |
|
"learning_rate": 2.6599248343847243e-06, |
|
"loss": 0.112, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.5776751041412354, |
|
"learning_rate": 2.6476378428094523e-06, |
|
"loss": 0.1085, |
|
"step": 15450 |
|
}, |
|
{ |
|
"epoch": 0.38, |
|
"grad_norm": 0.7395492792129517, |
|
"learning_rate": 2.6353472714635443e-06, |
|
"loss": 0.1084, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.7547775506973267, |
|
"learning_rate": 2.6230534183561385e-06, |
|
"loss": 0.1131, |
|
"step": 15550 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.6937977075576782, |
|
"learning_rate": 2.6107565815759473e-06, |
|
"loss": 0.1097, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.6883909702301025, |
|
"learning_rate": 2.598457059284027e-06, |
|
"loss": 0.114, |
|
"step": 15650 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.7592599987983704, |
|
"learning_rate": 2.5861551497065497e-06, |
|
"loss": 0.1093, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.6865835189819336, |
|
"learning_rate": 2.5738511511275716e-06, |
|
"loss": 0.1127, |
|
"step": 15750 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.6286877989768982, |
|
"learning_rate": 2.5615453618818033e-06, |
|
"loss": 0.1143, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.6750493049621582, |
|
"learning_rate": 2.5492380803473705e-06, |
|
"loss": 0.1079, |
|
"step": 15850 |
|
}, |
|
{ |
|
"epoch": 0.39, |
|
"grad_norm": 0.6905640959739685, |
|
"learning_rate": 2.5369296049385837e-06, |
|
"loss": 0.1096, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.6613180041313171, |
|
"learning_rate": 2.5246202340987004e-06, |
|
"loss": 0.1103, |
|
"step": 15950 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7040888071060181, |
|
"learning_rate": 2.5123102662926912e-06, |
|
"loss": 0.1038, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7578396797180176, |
|
"learning_rate": 2.5e-06, |
|
"loss": 0.1087, |
|
"step": 16050 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.587769627571106, |
|
"learning_rate": 2.4876897337073105e-06, |
|
"loss": 0.1075, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.688367486000061, |
|
"learning_rate": 2.475379765901301e-06, |
|
"loss": 0.1088, |
|
"step": 16150 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7524093985557556, |
|
"learning_rate": 2.4630703950614176e-06, |
|
"loss": 0.1094, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.6418857574462891, |
|
"learning_rate": 2.45076191965263e-06, |
|
"loss": 0.1057, |
|
"step": 16250 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.7005917429924011, |
|
"learning_rate": 2.4384546381181975e-06, |
|
"loss": 0.111, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.5830152630805969, |
|
"learning_rate": 2.4261488488724284e-06, |
|
"loss": 0.1066, |
|
"step": 16350 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.6886410117149353, |
|
"learning_rate": 2.413844850293451e-06, |
|
"loss": 0.1075, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.7370522022247314, |
|
"learning_rate": 2.4015429407159746e-06, |
|
"loss": 0.1081, |
|
"step": 16450 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.681659996509552, |
|
"learning_rate": 2.3892434184240536e-06, |
|
"loss": 0.1091, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.8225075006484985, |
|
"learning_rate": 2.3769465816438627e-06, |
|
"loss": 0.1119, |
|
"step": 16550 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.8517566919326782, |
|
"learning_rate": 2.3646527285364565e-06, |
|
"loss": 0.1167, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.7776892185211182, |
|
"learning_rate": 2.3523621571905485e-06, |
|
"loss": 0.1091, |
|
"step": 16650 |
|
}, |
|
{ |
|
"epoch": 0.41, |
|
"grad_norm": 0.6754997968673706, |
|
"learning_rate": 2.340075165615276e-06, |
|
"loss": 0.107, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.6805914640426636, |
|
"learning_rate": 2.3277920517329813e-06, |
|
"loss": 0.1178, |
|
"step": 16750 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.7494375705718994, |
|
"learning_rate": 2.315513113371981e-06, |
|
"loss": 0.1122, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.7486910223960876, |
|
"learning_rate": 2.303238648259349e-06, |
|
"loss": 0.1109, |
|
"step": 16850 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.7235914468765259, |
|
"learning_rate": 2.2909689540136986e-06, |
|
"loss": 0.1081, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.9256291389465332, |
|
"learning_rate": 2.27870432813796e-06, |
|
"loss": 0.1097, |
|
"step": 16950 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.7309363484382629, |
|
"learning_rate": 2.2664450680121757e-06, |
|
"loss": 0.1137, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.6349431872367859, |
|
"learning_rate": 2.254191470886279e-06, |
|
"loss": 0.1058, |
|
"step": 17050 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.7812219262123108, |
|
"learning_rate": 2.241943833872899e-06, |
|
"loss": 0.1109, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 0.42, |
|
"grad_norm": 0.7502079010009766, |
|
"learning_rate": 2.2297024539401463e-06, |
|
"loss": 0.1092, |
|
"step": 17150 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6485741138458252, |
|
"learning_rate": 2.2174676279044155e-06, |
|
"loss": 0.1086, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6442984938621521, |
|
"learning_rate": 2.2052396524231924e-06, |
|
"loss": 0.1026, |
|
"step": 17250 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6773645281791687, |
|
"learning_rate": 2.193018823987854e-06, |
|
"loss": 0.1055, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7892882227897644, |
|
"learning_rate": 2.180805438916487e-06, |
|
"loss": 0.1077, |
|
"step": 17350 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.8464773297309875, |
|
"learning_rate": 2.1685997933466947e-06, |
|
"loss": 0.1083, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7581765651702881, |
|
"learning_rate": 2.1564021832284255e-06, |
|
"loss": 0.1081, |
|
"step": 17450 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.7814217805862427, |
|
"learning_rate": 2.1442129043167877e-06, |
|
"loss": 0.1067, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.43, |
|
"grad_norm": 0.6047621965408325, |
|
"learning_rate": 2.1320322521648857e-06, |
|
"loss": 0.1065, |
|
"step": 17550 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.7465390563011169, |
|
"learning_rate": 2.119860522116653e-06, |
|
"loss": 0.1058, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.7619149684906006, |
|
"learning_rate": 2.1076980092996845e-06, |
|
"loss": 0.109, |
|
"step": 17650 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.7714558839797974, |
|
"learning_rate": 2.0955450086180883e-06, |
|
"loss": 0.1089, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.6427506804466248, |
|
"learning_rate": 2.083401814745332e-06, |
|
"loss": 0.1086, |
|
"step": 17750 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.6711663603782654, |
|
"learning_rate": 2.071268722117096e-06, |
|
"loss": 0.107, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.7448789477348328, |
|
"learning_rate": 2.0591460249241383e-06, |
|
"loss": 0.1054, |
|
"step": 17850 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.6524811387062073, |
|
"learning_rate": 2.0470340171051567e-06, |
|
"loss": 0.1135, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 0.6184554100036621, |
|
"learning_rate": 2.034932992339666e-06, |
|
"loss": 0.1093, |
|
"step": 17950 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7184391021728516, |
|
"learning_rate": 2.022843244040874e-06, |
|
"loss": 0.1039, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6897612810134888, |
|
"learning_rate": 2.0107650653485707e-06, |
|
"loss": 0.111, |
|
"step": 18050 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7289073467254639, |
|
"learning_rate": 1.998698749122014e-06, |
|
"loss": 0.1083, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6289246678352356, |
|
"learning_rate": 1.986644587932835e-06, |
|
"loss": 0.1062, |
|
"step": 18150 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.7373926639556885, |
|
"learning_rate": 1.9746028740579453e-06, |
|
"loss": 0.1083, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6934205293655396, |
|
"learning_rate": 1.96257389947244e-06, |
|
"loss": 0.108, |
|
"step": 18250 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6576781868934631, |
|
"learning_rate": 1.9505579558425315e-06, |
|
"loss": 0.105, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 0.45, |
|
"grad_norm": 0.6750394105911255, |
|
"learning_rate": 1.938555334518466e-06, |
|
"loss": 0.1098, |
|
"step": 18350 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.6316762566566467, |
|
"learning_rate": 1.926566326527468e-06, |
|
"loss": 0.1063, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.7245384454727173, |
|
"learning_rate": 1.914591222566678e-06, |
|
"loss": 0.1066, |
|
"step": 18450 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.6794847249984741, |
|
"learning_rate": 1.9026303129961049e-06, |
|
"loss": 0.1081, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.6249060034751892, |
|
"learning_rate": 1.8906838878315886e-06, |
|
"loss": 0.1092, |
|
"step": 18550 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.7857920527458191, |
|
"learning_rate": 1.878752236737765e-06, |
|
"loss": 0.1088, |
|
"step": 18600 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.6644686460494995, |
|
"learning_rate": 1.8668356490210449e-06, |
|
"loss": 0.1003, |
|
"step": 18650 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.8369055986404419, |
|
"learning_rate": 1.8549344136225946e-06, |
|
"loss": 0.1091, |
|
"step": 18700 |
|
}, |
|
{ |
|
"epoch": 0.46, |
|
"grad_norm": 0.7103849649429321, |
|
"learning_rate": 1.8430488191113373e-06, |
|
"loss": 0.1016, |
|
"step": 18750 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.655951976776123, |
|
"learning_rate": 1.8311791536769485e-06, |
|
"loss": 0.1063, |
|
"step": 18800 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.7587403059005737, |
|
"learning_rate": 1.819325705122873e-06, |
|
"loss": 0.1043, |
|
"step": 18850 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.7093736529350281, |
|
"learning_rate": 1.8074887608593477e-06, |
|
"loss": 0.1092, |
|
"step": 18900 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.5937142968177795, |
|
"learning_rate": 1.7956686078964257e-06, |
|
"loss": 0.1045, |
|
"step": 18950 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6976079344749451, |
|
"learning_rate": 1.7838655328370268e-06, |
|
"loss": 0.1127, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6277263760566711, |
|
"learning_rate": 1.7720798218699798e-06, |
|
"loss": 0.1115, |
|
"step": 19050 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.7694005966186523, |
|
"learning_rate": 1.7603117607630892e-06, |
|
"loss": 0.1068, |
|
"step": 19100 |
|
}, |
|
{ |
|
"epoch": 0.47, |
|
"grad_norm": 0.6981104612350464, |
|
"learning_rate": 1.7485616348562023e-06, |
|
"loss": 0.1053, |
|
"step": 19150 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.816888153553009, |
|
"learning_rate": 1.7368297290542918e-06, |
|
"loss": 0.1084, |
|
"step": 19200 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7193763852119446, |
|
"learning_rate": 1.72511632782055e-06, |
|
"loss": 0.1056, |
|
"step": 19250 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.6017695069313049, |
|
"learning_rate": 1.7134217151694873e-06, |
|
"loss": 0.1033, |
|
"step": 19300 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7289357781410217, |
|
"learning_rate": 1.7017461746600506e-06, |
|
"loss": 0.1092, |
|
"step": 19350 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7104734182357788, |
|
"learning_rate": 1.690089989388741e-06, |
|
"loss": 0.1075, |
|
"step": 19400 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.707067608833313, |
|
"learning_rate": 1.678453441982758e-06, |
|
"loss": 0.1093, |
|
"step": 19450 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7596837878227234, |
|
"learning_rate": 1.66683681459314e-06, |
|
"loss": 0.1042, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.7431414723396301, |
|
"learning_rate": 1.6552403888879243e-06, |
|
"loss": 0.1034, |
|
"step": 19550 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.7935433387756348, |
|
"learning_rate": 1.6436644460453218e-06, |
|
"loss": 0.1101, |
|
"step": 19600 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.7206917405128479, |
|
"learning_rate": 1.6321092667468926e-06, |
|
"loss": 0.104, |
|
"step": 19650 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.7832915782928467, |
|
"learning_rate": 1.6205751311707463e-06, |
|
"loss": 0.106, |
|
"step": 19700 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.6786001324653625, |
|
"learning_rate": 1.6090623189847443e-06, |
|
"loss": 0.1044, |
|
"step": 19750 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.739486038684845, |
|
"learning_rate": 1.5975711093397223e-06, |
|
"loss": 0.1058, |
|
"step": 19800 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.6627418398857117, |
|
"learning_rate": 1.5861017808627167e-06, |
|
"loss": 0.1111, |
|
"step": 19850 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.7300294637680054, |
|
"learning_rate": 1.574654611650214e-06, |
|
"loss": 0.1076, |
|
"step": 19900 |
|
}, |
|
{ |
|
"epoch": 0.49, |
|
"grad_norm": 0.8272718191146851, |
|
"learning_rate": 1.5632298792614064e-06, |
|
"loss": 0.1078, |
|
"step": 19950 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.7260649800300598, |
|
"learning_rate": 1.5518278607114585e-06, |
|
"loss": 0.1068, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.7374041080474854, |
|
"learning_rate": 1.540448832464796e-06, |
|
"loss": 0.1083, |
|
"step": 20050 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.7387140989303589, |
|
"learning_rate": 1.5290930704283953e-06, |
|
"loss": 0.104, |
|
"step": 20100 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.784419059753418, |
|
"learning_rate": 1.517760849945103e-06, |
|
"loss": 0.1073, |
|
"step": 20150 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.6998342275619507, |
|
"learning_rate": 1.5064524457869506e-06, |
|
"loss": 0.1079, |
|
"step": 20200 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.7088004946708679, |
|
"learning_rate": 1.4951681321484952e-06, |
|
"loss": 0.1051, |
|
"step": 20250 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.8070769906044006, |
|
"learning_rate": 1.4839081826401756e-06, |
|
"loss": 0.1053, |
|
"step": 20300 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.7509734034538269, |
|
"learning_rate": 1.47267287028167e-06, |
|
"loss": 0.1087, |
|
"step": 20350 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.7456051111221313, |
|
"learning_rate": 1.4614624674952843e-06, |
|
"loss": 0.1058, |
|
"step": 20400 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.7578474879264832, |
|
"learning_rate": 1.4502772460993387e-06, |
|
"loss": 0.1087, |
|
"step": 20450 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.7839920520782471, |
|
"learning_rate": 1.4391174773015836e-06, |
|
"loss": 0.1081, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.7195042371749878, |
|
"learning_rate": 1.4279834316926217e-06, |
|
"loss": 0.1072, |
|
"step": 20550 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.6250014901161194, |
|
"learning_rate": 1.4168753792393413e-06, |
|
"loss": 0.1052, |
|
"step": 20600 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.5411588549613953, |
|
"learning_rate": 1.405793589278382e-06, |
|
"loss": 0.1015, |
|
"step": 20650 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.7470389008522034, |
|
"learning_rate": 1.394738330509593e-06, |
|
"loss": 0.1043, |
|
"step": 20700 |
|
}, |
|
{ |
|
"epoch": 0.51, |
|
"grad_norm": 0.7059767842292786, |
|
"learning_rate": 1.3837098709895246e-06, |
|
"loss": 0.1054, |
|
"step": 20750 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.8912293314933777, |
|
"learning_rate": 1.3727084781249251e-06, |
|
"loss": 0.1066, |
|
"step": 20800 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7008835673332214, |
|
"learning_rate": 1.3617344186662585e-06, |
|
"loss": 0.105, |
|
"step": 20850 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.8228752017021179, |
|
"learning_rate": 1.3507879587012378e-06, |
|
"loss": 0.1056, |
|
"step": 20900 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7580632567405701, |
|
"learning_rate": 1.3398693636483707e-06, |
|
"loss": 0.1061, |
|
"step": 20950 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.5401976704597473, |
|
"learning_rate": 1.328978898250525e-06, |
|
"loss": 0.1054, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.7178921103477478, |
|
"learning_rate": 1.31811682656851e-06, |
|
"loss": 0.1048, |
|
"step": 21050 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.701740026473999, |
|
"learning_rate": 1.307283411974674e-06, |
|
"loss": 0.1044, |
|
"step": 21100 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 0.6609808206558228, |
|
"learning_rate": 1.2964789171465164e-06, |
|
"loss": 0.1033, |
|
"step": 21150 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.8305937051773071, |
|
"learning_rate": 1.2857036040603204e-06, |
|
"loss": 0.1073, |
|
"step": 21200 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7639765739440918, |
|
"learning_rate": 1.2749577339848007e-06, |
|
"loss": 0.1092, |
|
"step": 21250 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7896683216094971, |
|
"learning_rate": 1.2642415674747675e-06, |
|
"loss": 0.1016, |
|
"step": 21300 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7455915808677673, |
|
"learning_rate": 1.25355536436481e-06, |
|
"loss": 0.1063, |
|
"step": 21350 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.6086365580558777, |
|
"learning_rate": 1.2428993837629943e-06, |
|
"loss": 0.1059, |
|
"step": 21400 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7196105122566223, |
|
"learning_rate": 1.2322738840445867e-06, |
|
"loss": 0.1019, |
|
"step": 21450 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.7521807551383972, |
|
"learning_rate": 1.2216791228457778e-06, |
|
"loss": 0.1079, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.53, |
|
"grad_norm": 0.6882525682449341, |
|
"learning_rate": 1.2111153570574454e-06, |
|
"loss": 0.1058, |
|
"step": 21550 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.7660160064697266, |
|
"learning_rate": 1.2005828428189256e-06, |
|
"loss": 0.1048, |
|
"step": 21600 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.7205012440681458, |
|
"learning_rate": 1.1900818355117918e-06, |
|
"loss": 0.1028, |
|
"step": 21650 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.6535946726799011, |
|
"learning_rate": 1.1796125897536782e-06, |
|
"loss": 0.1061, |
|
"step": 21700 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.7131094336509705, |
|
"learning_rate": 1.1691753593920884e-06, |
|
"loss": 0.1053, |
|
"step": 21750 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.8452409505844116, |
|
"learning_rate": 1.1587703974982583e-06, |
|
"loss": 0.104, |
|
"step": 21800 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.7793323993682861, |
|
"learning_rate": 1.148397956361007e-06, |
|
"loss": 0.1055, |
|
"step": 21850 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.6791970729827881, |
|
"learning_rate": 1.1380582874806208e-06, |
|
"loss": 0.1047, |
|
"step": 21900 |
|
}, |
|
{ |
|
"epoch": 0.54, |
|
"grad_norm": 0.7349908351898193, |
|
"learning_rate": 1.127751641562765e-06, |
|
"loss": 0.1038, |
|
"step": 21950 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.61646968126297, |
|
"learning_rate": 1.1174782685123919e-06, |
|
"loss": 0.1047, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.8146559000015259, |
|
"learning_rate": 1.107238417427691e-06, |
|
"loss": 0.1041, |
|
"step": 22050 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.7460653185844421, |
|
"learning_rate": 1.0970323365940443e-06, |
|
"loss": 0.1069, |
|
"step": 22100 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.7389132976531982, |
|
"learning_rate": 1.0868602734780075e-06, |
|
"loss": 0.1038, |
|
"step": 22150 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.8619258999824524, |
|
"learning_rate": 1.0767224747213102e-06, |
|
"loss": 0.1029, |
|
"step": 22200 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.814842939376831, |
|
"learning_rate": 1.0666191861348741e-06, |
|
"loss": 0.1063, |
|
"step": 22250 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.7106545567512512, |
|
"learning_rate": 1.0565506526928548e-06, |
|
"loss": 0.1023, |
|
"step": 22300 |
|
}, |
|
{ |
|
"epoch": 0.55, |
|
"grad_norm": 0.7529650330543518, |
|
"learning_rate": 1.0465171185267007e-06, |
|
"loss": 0.1068, |
|
"step": 22350 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.7822823524475098, |
|
"learning_rate": 1.036518826919233e-06, |
|
"loss": 0.1063, |
|
"step": 22400 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.7902446389198303, |
|
"learning_rate": 1.0265560202987474e-06, |
|
"loss": 0.1038, |
|
"step": 22450 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.8036994338035583, |
|
"learning_rate": 1.0166289402331391e-06, |
|
"loss": 0.1015, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.6801638603210449, |
|
"learning_rate": 1.006737827424038e-06, |
|
"loss": 0.1115, |
|
"step": 22550 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.66480952501297, |
|
"learning_rate": 9.9688292170098e-07, |
|
"loss": 0.1067, |
|
"step": 22600 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.6099157929420471, |
|
"learning_rate": 9.870644620155878e-07, |
|
"loss": 0.1002, |
|
"step": 22650 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.7186796069145203, |
|
"learning_rate": 9.77282686435777e-07, |
|
"loss": 0.1035, |
|
"step": 22700 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.7684562802314758, |
|
"learning_rate": 9.67537832139989e-07, |
|
"loss": 0.1047, |
|
"step": 22750 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.804244875907898, |
|
"learning_rate": 9.578301354114292e-07, |
|
"loss": 0.104, |
|
"step": 22800 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.7283428311347961, |
|
"learning_rate": 9.481598316323504e-07, |
|
"loss": 0.1053, |
|
"step": 22850 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.7441770434379578, |
|
"learning_rate": 9.385271552783376e-07, |
|
"loss": 0.1003, |
|
"step": 22900 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.6516032814979553, |
|
"learning_rate": 9.289323399126216e-07, |
|
"loss": 0.1041, |
|
"step": 22950 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.7602024078369141, |
|
"learning_rate": 9.193756181804248e-07, |
|
"loss": 0.1047, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.70171058177948, |
|
"learning_rate": 9.098572218033084e-07, |
|
"loss": 0.1043, |
|
"step": 23050 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.6728523969650269, |
|
"learning_rate": 9.003773815735644e-07, |
|
"loss": 0.103, |
|
"step": 23100 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.7401735186576843, |
|
"learning_rate": 8.90936327348613e-07, |
|
"loss": 0.106, |
|
"step": 23150 |
|
}, |
|
{ |
|
"epoch": 0.57, |
|
"grad_norm": 0.7361724972724915, |
|
"learning_rate": 8.815342880454312e-07, |
|
"loss": 0.1044, |
|
"step": 23200 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.8209415674209595, |
|
"learning_rate": 8.721714916350019e-07, |
|
"loss": 0.1063, |
|
"step": 23250 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.8205540776252747, |
|
"learning_rate": 8.628481651367876e-07, |
|
"loss": 0.1046, |
|
"step": 23300 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6642114520072937, |
|
"learning_rate": 8.535645346132246e-07, |
|
"loss": 0.108, |
|
"step": 23350 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6761398315429688, |
|
"learning_rate": 8.443208251642418e-07, |
|
"loss": 0.1041, |
|
"step": 23400 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.821124255657196, |
|
"learning_rate": 8.351172609218033e-07, |
|
"loss": 0.1036, |
|
"step": 23450 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6441815495491028, |
|
"learning_rate": 8.259540650444736e-07, |
|
"loss": 0.1103, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.6896815299987793, |
|
"learning_rate": 8.168314597120059e-07, |
|
"loss": 0.1049, |
|
"step": 23550 |
|
}, |
|
{ |
|
"epoch": 0.58, |
|
"grad_norm": 0.7527472972869873, |
|
"learning_rate": 8.077496661199557e-07, |
|
"loss": 0.1094, |
|
"step": 23600 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.788384199142456, |
|
"learning_rate": 7.987089044743182e-07, |
|
"loss": 0.1071, |
|
"step": 23650 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.7462133169174194, |
|
"learning_rate": 7.897093939861878e-07, |
|
"loss": 0.1071, |
|
"step": 23700 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.6221845746040344, |
|
"learning_rate": 7.807513528664415e-07, |
|
"loss": 0.1026, |
|
"step": 23750 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.7602658271789551, |
|
"learning_rate": 7.71834998320454e-07, |
|
"loss": 0.1064, |
|
"step": 23800 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.6582159399986267, |
|
"learning_rate": 7.629605465428211e-07, |
|
"loss": 0.1013, |
|
"step": 23850 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.9127190709114075, |
|
"learning_rate": 7.541282127121291e-07, |
|
"loss": 0.1037, |
|
"step": 23900 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.7410569787025452, |
|
"learning_rate": 7.453382109857269e-07, |
|
"loss": 0.1041, |
|
"step": 23950 |
|
}, |
|
{ |
|
"epoch": 0.59, |
|
"grad_norm": 0.6176531314849854, |
|
"learning_rate": 7.365907544945398e-07, |
|
"loss": 0.1076, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.6940147280693054, |
|
"learning_rate": 7.27886055337902e-07, |
|
"loss": 0.1036, |
|
"step": 24050 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.7587239146232605, |
|
"learning_rate": 7.192243245784075e-07, |
|
"loss": 0.1047, |
|
"step": 24100 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.6673498153686523, |
|
"learning_rate": 7.106057722368012e-07, |
|
"loss": 0.1021, |
|
"step": 24150 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.7008479833602905, |
|
"learning_rate": 7.020306072868804e-07, |
|
"loss": 0.1056, |
|
"step": 24200 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.6808987259864807, |
|
"learning_rate": 6.934990376504269e-07, |
|
"loss": 0.1005, |
|
"step": 24250 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.6858727931976318, |
|
"learning_rate": 6.850112701921735e-07, |
|
"loss": 0.1043, |
|
"step": 24300 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.804255485534668, |
|
"learning_rate": 6.76567510714777e-07, |
|
"loss": 0.1027, |
|
"step": 24350 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.7274004817008972, |
|
"learning_rate": 6.681679639538388e-07, |
|
"loss": 0.1053, |
|
"step": 24400 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.6244350075721741, |
|
"learning_rate": 6.598128335729332e-07, |
|
"loss": 0.1016, |
|
"step": 24450 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.8558462858200073, |
|
"learning_rate": 6.515023221586722e-07, |
|
"loss": 0.1015, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.678629457950592, |
|
"learning_rate": 6.432366312157933e-07, |
|
"loss": 0.1057, |
|
"step": 24550 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.6484153866767883, |
|
"learning_rate": 6.35015961162273e-07, |
|
"loss": 0.1015, |
|
"step": 24600 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.7038909196853638, |
|
"learning_rate": 6.268405113244677e-07, |
|
"loss": 0.1065, |
|
"step": 24650 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.8573065996170044, |
|
"learning_rate": 6.187104799322805e-07, |
|
"loss": 0.1029, |
|
"step": 24700 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 0.8027458190917969, |
|
"learning_rate": 6.106260641143547e-07, |
|
"loss": 0.1032, |
|
"step": 24750 |
|
}, |
|
{ |
|
"epoch": 0.61, |
|
"grad_norm": 1.0594960451126099, |
|
"learning_rate": 6.025874598932937e-07, |
|
"loss": 0.1016, |
|
"step": 24800 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.7134573459625244, |
|
"learning_rate": 5.945948621809092e-07, |
|
"loss": 0.1075, |
|
"step": 24850 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.732449471950531, |
|
"learning_rate": 5.866484647734935e-07, |
|
"loss": 0.1025, |
|
"step": 24900 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.7451493740081787, |
|
"learning_rate": 5.787484603471221e-07, |
|
"loss": 0.1048, |
|
"step": 24950 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.6442447304725647, |
|
"learning_rate": 5.708950404529812e-07, |
|
"loss": 0.0989, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.6639313101768494, |
|
"learning_rate": 5.630883955127211e-07, |
|
"loss": 0.106, |
|
"step": 25050 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.8314816951751709, |
|
"learning_rate": 5.553287148138462e-07, |
|
"loss": 0.1078, |
|
"step": 25100 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.6702859997749329, |
|
"learning_rate": 5.47616186505113e-07, |
|
"loss": 0.1063, |
|
"step": 25150 |
|
}, |
|
{ |
|
"epoch": 0.62, |
|
"grad_norm": 0.6533217430114746, |
|
"learning_rate": 5.399509975919828e-07, |
|
"loss": 0.1015, |
|
"step": 25200 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.7391800284385681, |
|
"learning_rate": 5.323333339320739e-07, |
|
"loss": 0.1027, |
|
"step": 25250 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.6083396077156067, |
|
"learning_rate": 5.247633802306637e-07, |
|
"loss": 0.1045, |
|
"step": 25300 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.8118199110031128, |
|
"learning_rate": 5.172413200362092e-07, |
|
"loss": 0.1021, |
|
"step": 25350 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.7648591995239258, |
|
"learning_rate": 5.097673357358906e-07, |
|
"loss": 0.1046, |
|
"step": 25400 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.7872638702392578, |
|
"learning_rate": 5.023416085511976e-07, |
|
"loss": 0.1024, |
|
"step": 25450 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.8803538084030151, |
|
"learning_rate": 4.949643185335288e-07, |
|
"loss": 0.1055, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.7241789698600769, |
|
"learning_rate": 4.876356445598279e-07, |
|
"loss": 0.1049, |
|
"step": 25550 |
|
}, |
|
{ |
|
"epoch": 0.63, |
|
"grad_norm": 0.7358962893486023, |
|
"learning_rate": 4.803557643282486e-07, |
|
"loss": 0.1027, |
|
"step": 25600 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.9544288516044617, |
|
"learning_rate": 4.731248543538405e-07, |
|
"loss": 0.1075, |
|
"step": 25650 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.7101548314094543, |
|
"learning_rate": 4.6594308996427696e-07, |
|
"loss": 0.1051, |
|
"step": 25700 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.7388818264007568, |
|
"learning_rate": 4.588106452955973e-07, |
|
"loss": 0.1059, |
|
"step": 25750 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.7218101024627686, |
|
"learning_rate": 4.517276932879877e-07, |
|
"loss": 0.1042, |
|
"step": 25800 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.7213023900985718, |
|
"learning_rate": 4.446944056815866e-07, |
|
"loss": 0.1036, |
|
"step": 25850 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.7361165881156921, |
|
"learning_rate": 4.377109530123216e-07, |
|
"loss": 0.1019, |
|
"step": 25900 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.7176214456558228, |
|
"learning_rate": 4.307775046077739e-07, |
|
"loss": 0.1054, |
|
"step": 25950 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.8045145273208618, |
|
"learning_rate": 4.2389422858307244e-07, |
|
"loss": 0.1017, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.9221975803375244, |
|
"learning_rate": 4.1706129183681834e-07, |
|
"loss": 0.1016, |
|
"step": 26050 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.749830961227417, |
|
"learning_rate": 4.10278860047037e-07, |
|
"loss": 0.1008, |
|
"step": 26100 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.719508707523346, |
|
"learning_rate": 4.035470976671621e-07, |
|
"loss": 0.1026, |
|
"step": 26150 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.7058319449424744, |
|
"learning_rate": 3.9686616792204677e-07, |
|
"loss": 0.1009, |
|
"step": 26200 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.6976651549339294, |
|
"learning_rate": 3.902362328040091e-07, |
|
"loss": 0.1057, |
|
"step": 26250 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.8557724952697754, |
|
"learning_rate": 3.836574530688983e-07, |
|
"loss": 0.1045, |
|
"step": 26300 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.6408629417419434, |
|
"learning_rate": 3.7712998823220243e-07, |
|
"loss": 0.1025, |
|
"step": 26350 |
|
}, |
|
{ |
|
"epoch": 0.65, |
|
"grad_norm": 0.6862276196479797, |
|
"learning_rate": 3.7065399656517955e-07, |
|
"loss": 0.1009, |
|
"step": 26400 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.6881060600280762, |
|
"learning_rate": 3.6422963509101626e-07, |
|
"loss": 0.1016, |
|
"step": 26450 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.7063938975334167, |
|
"learning_rate": 3.578570595810274e-07, |
|
"loss": 0.1059, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.7486748695373535, |
|
"learning_rate": 3.515364245508704e-07, |
|
"loss": 0.1042, |
|
"step": 26550 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.7359747886657715, |
|
"learning_rate": 3.452678832568071e-07, |
|
"loss": 0.1056, |
|
"step": 26600 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.666910707950592, |
|
"learning_rate": 3.390515876919831e-07, |
|
"loss": 0.105, |
|
"step": 26650 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.7717428207397461, |
|
"learning_rate": 3.328876885827406e-07, |
|
"loss": 0.0995, |
|
"step": 26700 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.6869860887527466, |
|
"learning_rate": 3.267763353849704e-07, |
|
"loss": 0.103, |
|
"step": 26750 |
|
}, |
|
{ |
|
"epoch": 0.66, |
|
"grad_norm": 0.7728705406188965, |
|
"learning_rate": 3.207176762804814e-07, |
|
"loss": 0.1002, |
|
"step": 26800 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.670854389667511, |
|
"learning_rate": 3.1471185817341153e-07, |
|
"loss": 0.1047, |
|
"step": 26850 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.8178605437278748, |
|
"learning_rate": 3.0875902668666386e-07, |
|
"loss": 0.0986, |
|
"step": 26900 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.700291097164154, |
|
"learning_rate": 3.0285932615837646e-07, |
|
"loss": 0.1027, |
|
"step": 26950 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.8143967986106873, |
|
"learning_rate": 2.970128996384228e-07, |
|
"loss": 0.0978, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.5957488417625427, |
|
"learning_rate": 2.9121988888494297e-07, |
|
"loss": 0.1012, |
|
"step": 27050 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.8061670660972595, |
|
"learning_rate": 2.854804343609058e-07, |
|
"loss": 0.1018, |
|
"step": 27100 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.7996898293495178, |
|
"learning_rate": 2.7979467523070484e-07, |
|
"loss": 0.103, |
|
"step": 27150 |
|
}, |
|
{ |
|
"epoch": 0.67, |
|
"grad_norm": 0.712619960308075, |
|
"learning_rate": 2.741627493567822e-07, |
|
"loss": 0.1058, |
|
"step": 27200 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.9328902959823608, |
|
"learning_rate": 2.685847932962868e-07, |
|
"loss": 0.1032, |
|
"step": 27250 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.642899751663208, |
|
"learning_rate": 2.630609422977623e-07, |
|
"loss": 0.1038, |
|
"step": 27300 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.8721086382865906, |
|
"learning_rate": 2.575913302978697e-07, |
|
"loss": 0.0986, |
|
"step": 27350 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6639971733093262, |
|
"learning_rate": 2.5217608991813774e-07, |
|
"loss": 0.1029, |
|
"step": 27400 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6409977078437805, |
|
"learning_rate": 2.468153524617478e-07, |
|
"loss": 0.1008, |
|
"step": 27450 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.8069073557853699, |
|
"learning_rate": 2.4150924791035037e-07, |
|
"loss": 0.1038, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.6896743774414062, |
|
"learning_rate": 2.3625790492091544e-07, |
|
"loss": 0.1013, |
|
"step": 27550 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 0.7691417932510376, |
|
"learning_rate": 2.3106145082260777e-07, |
|
"loss": 0.1015, |
|
"step": 27600 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7701432704925537, |
|
"learning_rate": 2.2592001161370392e-07, |
|
"loss": 0.1023, |
|
"step": 27650 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7143141031265259, |
|
"learning_rate": 2.2083371195853797e-07, |
|
"loss": 0.1034, |
|
"step": 27700 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7811148762702942, |
|
"learning_rate": 2.158026751844733e-07, |
|
"loss": 0.1052, |
|
"step": 27750 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.6876391172409058, |
|
"learning_rate": 2.1082702327891918e-07, |
|
"loss": 0.0985, |
|
"step": 27800 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7319098711013794, |
|
"learning_rate": 2.0590687688636619e-07, |
|
"loss": 0.1036, |
|
"step": 27850 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7728010416030884, |
|
"learning_rate": 2.0104235530546745e-07, |
|
"loss": 0.1025, |
|
"step": 27900 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.7075905203819275, |
|
"learning_rate": 1.9623357648614088e-07, |
|
"loss": 0.1025, |
|
"step": 27950 |
|
}, |
|
{ |
|
"epoch": 0.69, |
|
"grad_norm": 0.8835753202438354, |
|
"learning_rate": 1.914806570267111e-07, |
|
"loss": 0.1045, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6914262771606445, |
|
"learning_rate": 1.8678371217108387e-07, |
|
"loss": 0.1012, |
|
"step": 28050 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.7682451605796814, |
|
"learning_rate": 1.821428558059493e-07, |
|
"loss": 0.0987, |
|
"step": 28100 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6042952537536621, |
|
"learning_rate": 1.7755820045802146e-07, |
|
"loss": 0.1033, |
|
"step": 28150 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.9065592885017395, |
|
"learning_rate": 1.7302985729131e-07, |
|
"loss": 0.1067, |
|
"step": 28200 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.7775442004203796, |
|
"learning_rate": 1.6855793610442484e-07, |
|
"loss": 0.1019, |
|
"step": 28250 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.7581862807273865, |
|
"learning_rate": 1.6414254532791357e-07, |
|
"loss": 0.1043, |
|
"step": 28300 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.7499451041221619, |
|
"learning_rate": 1.5978379202163275e-07, |
|
"loss": 0.1046, |
|
"step": 28350 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.7929815053939819, |
|
"learning_rate": 1.554817818721513e-07, |
|
"loss": 0.1054, |
|
"step": 28400 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.8825467228889465, |
|
"learning_rate": 1.51236619190189e-07, |
|
"loss": 0.1034, |
|
"step": 28450 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.6325812339782715, |
|
"learning_rate": 1.4704840690808658e-07, |
|
"loss": 0.105, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.6791960000991821, |
|
"learning_rate": 1.4291724657730904e-07, |
|
"loss": 0.1054, |
|
"step": 28550 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.696438729763031, |
|
"learning_rate": 1.3884323836598656e-07, |
|
"loss": 0.1007, |
|
"step": 28600 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.638707160949707, |
|
"learning_rate": 1.348264810564809e-07, |
|
"loss": 0.0989, |
|
"step": 28650 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.9399768114089966, |
|
"learning_rate": 1.3086707204299415e-07, |
|
"loss": 0.1041, |
|
"step": 28700 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.7409209609031677, |
|
"learning_rate": 1.269651073292058e-07, |
|
"loss": 0.1028, |
|
"step": 28750 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.7949596047401428, |
|
"learning_rate": 1.2312068152594448e-07, |
|
"loss": 0.1025, |
|
"step": 28800 |
|
}, |
|
{ |
|
"epoch": 0.71, |
|
"grad_norm": 0.8088447451591492, |
|
"learning_rate": 1.1933388784889617e-07, |
|
"loss": 0.1034, |
|
"step": 28850 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7876179814338684, |
|
"learning_rate": 1.1560481811633911e-07, |
|
"loss": 0.1007, |
|
"step": 28900 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.6866023540496826, |
|
"learning_rate": 1.1193356274692424e-07, |
|
"loss": 0.1031, |
|
"step": 28950 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.8255860805511475, |
|
"learning_rate": 1.0832021075747712e-07, |
|
"loss": 0.1072, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7951275706291199, |
|
"learning_rate": 1.047648497608414e-07, |
|
"loss": 0.1029, |
|
"step": 29050 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.7167026996612549, |
|
"learning_rate": 1.0126756596375687e-07, |
|
"loss": 0.1039, |
|
"step": 29100 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.75605708360672, |
|
"learning_rate": 9.782844416476423e-08, |
|
"loss": 0.1022, |
|
"step": 29150 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.72404545545578, |
|
"learning_rate": 9.444756775215446e-08, |
|
"loss": 0.1044, |
|
"step": 29200 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.8219017386436462, |
|
"learning_rate": 9.112501870194273e-08, |
|
"loss": 0.1024, |
|
"step": 29250 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.7620697617530823, |
|
"learning_rate": 8.786087757588269e-08, |
|
"loss": 0.1039, |
|
"step": 29300 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.7205595970153809, |
|
"learning_rate": 8.465522351951305e-08, |
|
"loss": 0.1047, |
|
"step": 29350 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.7058899998664856, |
|
"learning_rate": 8.150813426023752e-08, |
|
"loss": 0.1031, |
|
"step": 29400 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.7188001871109009, |
|
"learning_rate": 7.841968610544121e-08, |
|
"loss": 0.1008, |
|
"step": 29450 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.672663152217865, |
|
"learning_rate": 7.538995394063996e-08, |
|
"loss": 0.1022, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.7694998979568481, |
|
"learning_rate": 7.241901122766515e-08, |
|
"loss": 0.1014, |
|
"step": 29550 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.6881873607635498, |
|
"learning_rate": 6.950693000288056e-08, |
|
"loss": 0.1003, |
|
"step": 29600 |
|
}, |
|
{ |
|
"epoch": 0.73, |
|
"grad_norm": 0.8163633942604065, |
|
"learning_rate": 6.665378087543889e-08, |
|
"loss": 0.1019, |
|
"step": 29650 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.6912686228752136, |
|
"learning_rate": 6.385963302556642e-08, |
|
"loss": 0.1024, |
|
"step": 29700 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.8492407202720642, |
|
"learning_rate": 6.112455420288821e-08, |
|
"loss": 0.1039, |
|
"step": 29750 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.737253725528717, |
|
"learning_rate": 5.844861072478336e-08, |
|
"loss": 0.1051, |
|
"step": 29800 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.6771920919418335, |
|
"learning_rate": 5.583186747477848e-08, |
|
"loss": 0.0996, |
|
"step": 29850 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.6680823564529419, |
|
"learning_rate": 5.32743879009745e-08, |
|
"loss": 0.1027, |
|
"step": 29900 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.7818738222122192, |
|
"learning_rate": 5.077623401450599e-08, |
|
"loss": 0.1027, |
|
"step": 29950 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.7418414950370789, |
|
"learning_rate": 4.8337466388040935e-08, |
|
"loss": 0.1027, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.74, |
|
"grad_norm": 0.6610372066497803, |
|
"learning_rate": 4.595814415430916e-08, |
|
"loss": 0.0983, |
|
"step": 30050 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.9139551520347595, |
|
"learning_rate": 4.3638325004670134e-08, |
|
"loss": 0.1096, |
|
"step": 30100 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.7789270877838135, |
|
"learning_rate": 4.1378065187714365e-08, |
|
"loss": 0.1013, |
|
"step": 30150 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.7473266124725342, |
|
"learning_rate": 3.917741950789727e-08, |
|
"loss": 0.1048, |
|
"step": 30200 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.7920830249786377, |
|
"learning_rate": 3.703644132421386e-08, |
|
"loss": 0.1003, |
|
"step": 30250 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.8591673970222473, |
|
"learning_rate": 3.4955182548901956e-08, |
|
"loss": 0.102, |
|
"step": 30300 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.8582797646522522, |
|
"learning_rate": 3.293369364618465e-08, |
|
"loss": 0.1014, |
|
"step": 30350 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.7193279266357422, |
|
"learning_rate": 3.097202363104679e-08, |
|
"loss": 0.1026, |
|
"step": 30400 |
|
}, |
|
{ |
|
"epoch": 0.75, |
|
"grad_norm": 0.8688130378723145, |
|
"learning_rate": 2.9070220068045663e-08, |
|
"loss": 0.1055, |
|
"step": 30450 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.7748174071311951, |
|
"learning_rate": 2.722832907015971e-08, |
|
"loss": 0.1026, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.5935469269752502, |
|
"learning_rate": 2.544639529766829e-08, |
|
"loss": 0.1047, |
|
"step": 30550 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.6561381816864014, |
|
"learning_rate": 2.3724461957068955e-08, |
|
"loss": 0.1002, |
|
"step": 30600 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.7781614065170288, |
|
"learning_rate": 2.206257080003188e-08, |
|
"loss": 0.1064, |
|
"step": 30650 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.7374931573867798, |
|
"learning_rate": 2.0460762122385124e-08, |
|
"loss": 0.096, |
|
"step": 30700 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.6997425556182861, |
|
"learning_rate": 1.8919074763138757e-08, |
|
"loss": 0.101, |
|
"step": 30750 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.8059677481651306, |
|
"learning_rate": 1.7437546103542814e-08, |
|
"loss": 0.1011, |
|
"step": 30800 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 0.7518121600151062, |
|
"learning_rate": 1.6016212066181368e-08, |
|
"loss": 0.0995, |
|
"step": 30850 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.7425240278244019, |
|
"learning_rate": 1.4655107114101008e-08, |
|
"loss": 0.1071, |
|
"step": 30900 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.7220152020454407, |
|
"learning_rate": 1.3354264249975379e-08, |
|
"loss": 0.1011, |
|
"step": 30950 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.7005524635314941, |
|
"learning_rate": 1.2113715015304728e-08, |
|
"loss": 0.1035, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.6991942524909973, |
|
"learning_rate": 1.0933489489651783e-08, |
|
"loss": 0.1073, |
|
"step": 31050 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.7447285652160645, |
|
"learning_rate": 9.81361628991151e-09, |
|
"loss": 0.1046, |
|
"step": 31100 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.734191358089447, |
|
"learning_rate": 8.754122569618329e-09, |
|
"loss": 0.1, |
|
"step": 31150 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.7444181442260742, |
|
"learning_rate": 7.755034018286644e-09, |
|
"loss": 0.1079, |
|
"step": 31200 |
|
}, |
|
{ |
|
"epoch": 0.77, |
|
"grad_norm": 0.723635196685791, |
|
"learning_rate": 6.816374860788566e-09, |
|
"loss": 0.1055, |
|
"step": 31250 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.6592646837234497, |
|
"learning_rate": 5.938167856766319e-09, |
|
"loss": 0.0998, |
|
"step": 31300 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.8482509255409241, |
|
"learning_rate": 5.120434300080745e-09, |
|
"loss": 0.1018, |
|
"step": 31350 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.7359779477119446, |
|
"learning_rate": 4.363194018293937e-09, |
|
"loss": 0.1013, |
|
"step": 31400 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.704291820526123, |
|
"learning_rate": 3.666465372190453e-09, |
|
"loss": 0.1016, |
|
"step": 31450 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.6474089622497559, |
|
"learning_rate": 3.030265255329623e-09, |
|
"loss": 0.1018, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.75139319896698, |
|
"learning_rate": 2.4546090936383717e-09, |
|
"loss": 0.1078, |
|
"step": 31550 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.8348722457885742, |
|
"learning_rate": 1.9395108450351308e-09, |
|
"loss": 0.1052, |
|
"step": 31600 |
|
}, |
|
{ |
|
"epoch": 0.78, |
|
"grad_norm": 0.6401920914649963, |
|
"learning_rate": 1.4849829990931653e-09, |
|
"loss": 0.102, |
|
"step": 31650 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.6777997612953186, |
|
"learning_rate": 1.0910365767358155e-09, |
|
"loss": 0.1032, |
|
"step": 31700 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.808435320854187, |
|
"learning_rate": 7.576811299714326e-10, |
|
"loss": 0.1034, |
|
"step": 31750 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.7167596817016602, |
|
"learning_rate": 4.849247416599534e-10, |
|
"loss": 0.1047, |
|
"step": 31800 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.7499010562896729, |
|
"learning_rate": 2.727740253177791e-10, |
|
"loss": 0.1045, |
|
"step": 31850 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.7393851280212402, |
|
"learning_rate": 1.2123412495762543e-10, |
|
"loss": 0.1063, |
|
"step": 31900 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.7504858374595642, |
|
"learning_rate": 3.0308714963067644e-11, |
|
"loss": 0.1045, |
|
"step": 31950 |
|
}, |
|
{ |
|
"epoch": 0.79, |
|
"grad_norm": 0.8671845197677612, |
|
"learning_rate": 0.0, |
|
"loss": 0.1015, |
|
"step": 32000 |
|
} |
|
], |
|
"logging_steps": 50, |
|
"max_steps": 32000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 3200, |
|
"total_flos": 2.174443399415808e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|