|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 4.0, |
|
"eval_steps": 500, |
|
"global_step": 17216, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.023234200743494422, |
|
"grad_norm": 0.33875614404678345, |
|
"learning_rate": 1.1149825783972125e-06, |
|
"loss": 3.2044, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.046468401486988845, |
|
"grad_norm": 0.09059225022792816, |
|
"learning_rate": 2.2764227642276426e-06, |
|
"loss": 3.1582, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.06970260223048327, |
|
"grad_norm": 0.24917162954807281, |
|
"learning_rate": 3.4378629500580724e-06, |
|
"loss": 3.1608, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.09293680297397769, |
|
"grad_norm": 0.465605229139328, |
|
"learning_rate": 4.599303135888502e-06, |
|
"loss": 3.1981, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.11617100371747212, |
|
"grad_norm": 0.27495619654655457, |
|
"learning_rate": 5.7607433217189324e-06, |
|
"loss": 3.1815, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.13940520446096655, |
|
"grad_norm": 0.19188807904720306, |
|
"learning_rate": 6.922183507549362e-06, |
|
"loss": 3.1294, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.16263940520446096, |
|
"grad_norm": 0.5246957540512085, |
|
"learning_rate": 8.083623693379791e-06, |
|
"loss": 3.0677, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.18587360594795538, |
|
"grad_norm": 0.258408784866333, |
|
"learning_rate": 9.24506387921022e-06, |
|
"loss": 2.9832, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.20910780669144982, |
|
"grad_norm": 0.31014084815979004, |
|
"learning_rate": 1.0406504065040652e-05, |
|
"loss": 2.9743, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.23234200743494424, |
|
"grad_norm": 0.4873325824737549, |
|
"learning_rate": 1.1567944250871081e-05, |
|
"loss": 2.8721, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.2555762081784387, |
|
"grad_norm": 0.7442412972450256, |
|
"learning_rate": 1.272938443670151e-05, |
|
"loss": 2.7949, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.2788104089219331, |
|
"grad_norm": 0.6129536628723145, |
|
"learning_rate": 1.389082462253194e-05, |
|
"loss": 2.6381, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.3020446096654275, |
|
"grad_norm": 0.5687291026115417, |
|
"learning_rate": 1.5052264808362371e-05, |
|
"loss": 2.4031, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.3252788104089219, |
|
"grad_norm": 0.6154528856277466, |
|
"learning_rate": 1.62137049941928e-05, |
|
"loss": 2.107, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.34851301115241634, |
|
"grad_norm": 0.8730382323265076, |
|
"learning_rate": 1.7375145180023228e-05, |
|
"loss": 1.981, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.37174721189591076, |
|
"grad_norm": 0.6668545603752136, |
|
"learning_rate": 1.8536585365853663e-05, |
|
"loss": 1.9311, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.3949814126394052, |
|
"grad_norm": 0.6021186709403992, |
|
"learning_rate": 1.969802555168409e-05, |
|
"loss": 1.8733, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.41821561338289964, |
|
"grad_norm": 0.8815124034881592, |
|
"learning_rate": 1.9904479153220604e-05, |
|
"loss": 1.8434, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.44144981412639406, |
|
"grad_norm": 1.1727079153060913, |
|
"learning_rate": 1.9775396927843037e-05, |
|
"loss": 1.8051, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.4646840148698885, |
|
"grad_norm": 1.1215996742248535, |
|
"learning_rate": 1.964631470246547e-05, |
|
"loss": 1.733, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.4879182156133829, |
|
"grad_norm": 1.1965365409851074, |
|
"learning_rate": 1.9517232477087907e-05, |
|
"loss": 1.6994, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.5111524163568774, |
|
"grad_norm": 1.2489936351776123, |
|
"learning_rate": 1.938815025171034e-05, |
|
"loss": 1.6529, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.5343866171003717, |
|
"grad_norm": 1.5988222360610962, |
|
"learning_rate": 1.9259068026332776e-05, |
|
"loss": 1.5897, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.5576208178438662, |
|
"grad_norm": 0.6558517217636108, |
|
"learning_rate": 1.912998580095521e-05, |
|
"loss": 1.5099, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.5808550185873605, |
|
"grad_norm": 0.7629631757736206, |
|
"learning_rate": 1.900219439783142e-05, |
|
"loss": 1.4466, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.604089219330855, |
|
"grad_norm": 0.9707331657409668, |
|
"learning_rate": 1.8873112172453855e-05, |
|
"loss": 1.3949, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.6273234200743495, |
|
"grad_norm": 0.849176287651062, |
|
"learning_rate": 1.874402994707629e-05, |
|
"loss": 1.3449, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.6505576208178439, |
|
"grad_norm": 0.460151731967926, |
|
"learning_rate": 1.8614947721698724e-05, |
|
"loss": 1.3182, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.6737918215613383, |
|
"grad_norm": 0.652923047542572, |
|
"learning_rate": 1.8485865496321157e-05, |
|
"loss": 1.2623, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.6970260223048327, |
|
"grad_norm": 0.5269683599472046, |
|
"learning_rate": 1.8356783270943594e-05, |
|
"loss": 1.2059, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.7202602230483272, |
|
"grad_norm": 0.6761623024940491, |
|
"learning_rate": 1.8227701045566027e-05, |
|
"loss": 1.1477, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.7434944237918215, |
|
"grad_norm": 0.4611155390739441, |
|
"learning_rate": 1.809861882018846e-05, |
|
"loss": 1.1063, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.766728624535316, |
|
"grad_norm": 1.20090913772583, |
|
"learning_rate": 1.7969536594810896e-05, |
|
"loss": 1.0812, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.7899628252788105, |
|
"grad_norm": 0.5198754072189331, |
|
"learning_rate": 1.7840454369433332e-05, |
|
"loss": 1.0637, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.8131970260223048, |
|
"grad_norm": 0.7287588119506836, |
|
"learning_rate": 1.7711372144055765e-05, |
|
"loss": 1.0311, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.8364312267657993, |
|
"grad_norm": 0.850121021270752, |
|
"learning_rate": 1.75822899186782e-05, |
|
"loss": 0.9687, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.8596654275092936, |
|
"grad_norm": 0.5256717801094055, |
|
"learning_rate": 1.7453207693300635e-05, |
|
"loss": 0.8706, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.8828996282527881, |
|
"grad_norm": 0.6515185236930847, |
|
"learning_rate": 1.7324125467923068e-05, |
|
"loss": 0.8474, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.9061338289962825, |
|
"grad_norm": 0.8604176640510559, |
|
"learning_rate": 1.7195043242545504e-05, |
|
"loss": 0.8302, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.929368029739777, |
|
"grad_norm": 0.3369189202785492, |
|
"learning_rate": 1.7065961017167937e-05, |
|
"loss": 0.7959, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.9526022304832714, |
|
"grad_norm": 0.4804532527923584, |
|
"learning_rate": 1.6936878791790373e-05, |
|
"loss": 0.7945, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.9758364312267658, |
|
"grad_norm": 0.3839660882949829, |
|
"learning_rate": 1.6807796566412806e-05, |
|
"loss": 0.7975, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.9990706319702602, |
|
"grad_norm": 0.31136325001716614, |
|
"learning_rate": 1.667871434103524e-05, |
|
"loss": 0.7804, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 1.0223048327137547, |
|
"grad_norm": 0.2822754681110382, |
|
"learning_rate": 1.6549632115657676e-05, |
|
"loss": 0.7502, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 1.045539033457249, |
|
"grad_norm": 0.3364527225494385, |
|
"learning_rate": 1.6420549890280112e-05, |
|
"loss": 0.747, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.0687732342007434, |
|
"grad_norm": 0.45242545008659363, |
|
"learning_rate": 1.6291467664902545e-05, |
|
"loss": 0.7263, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 1.092007434944238, |
|
"grad_norm": 0.2541595995426178, |
|
"learning_rate": 1.6162385439524978e-05, |
|
"loss": 0.7311, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 1.1152416356877324, |
|
"grad_norm": 0.32410866022109985, |
|
"learning_rate": 1.6033303214147415e-05, |
|
"loss": 0.7213, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 1.1384758364312269, |
|
"grad_norm": 0.28702208399772644, |
|
"learning_rate": 1.5904220988769848e-05, |
|
"loss": 0.7103, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 1.161710037174721, |
|
"grad_norm": 0.2637524902820587, |
|
"learning_rate": 1.577513876339228e-05, |
|
"loss": 0.7033, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.1849442379182156, |
|
"grad_norm": 0.38048645853996277, |
|
"learning_rate": 1.5646056538014717e-05, |
|
"loss": 0.7111, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 1.20817843866171, |
|
"grad_norm": 0.22926197946071625, |
|
"learning_rate": 1.5516974312637153e-05, |
|
"loss": 0.7053, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 1.2314126394052045, |
|
"grad_norm": 0.2666023373603821, |
|
"learning_rate": 1.5387892087259586e-05, |
|
"loss": 0.6915, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 1.2546468401486988, |
|
"grad_norm": 0.2618410587310791, |
|
"learning_rate": 1.525880986188202e-05, |
|
"loss": 0.6843, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 1.2778810408921932, |
|
"grad_norm": 0.24479706585407257, |
|
"learning_rate": 1.5129727636504454e-05, |
|
"loss": 0.6775, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.3011152416356877, |
|
"grad_norm": 0.19555561244487762, |
|
"learning_rate": 1.5000645411126889e-05, |
|
"loss": 0.6601, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 1.3243494423791822, |
|
"grad_norm": 0.2121550738811493, |
|
"learning_rate": 1.4871563185749323e-05, |
|
"loss": 0.6625, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 1.3475836431226766, |
|
"grad_norm": 0.36492133140563965, |
|
"learning_rate": 1.474248096037176e-05, |
|
"loss": 0.6567, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 1.370817843866171, |
|
"grad_norm": 0.28411343693733215, |
|
"learning_rate": 1.4613398734994193e-05, |
|
"loss": 0.6424, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 1.3940520446096654, |
|
"grad_norm": 0.3487832248210907, |
|
"learning_rate": 1.4484316509616627e-05, |
|
"loss": 0.6508, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.4172862453531598, |
|
"grad_norm": 0.4025629758834839, |
|
"learning_rate": 1.4355234284239062e-05, |
|
"loss": 0.6374, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 1.4405204460966543, |
|
"grad_norm": 0.31936919689178467, |
|
"learning_rate": 1.4226152058861495e-05, |
|
"loss": 0.6462, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 1.4637546468401488, |
|
"grad_norm": 0.27360206842422485, |
|
"learning_rate": 1.409706983348393e-05, |
|
"loss": 0.6382, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 1.486988847583643, |
|
"grad_norm": 0.35483697056770325, |
|
"learning_rate": 1.3967987608106366e-05, |
|
"loss": 0.6274, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 1.5102230483271375, |
|
"grad_norm": 0.30311813950538635, |
|
"learning_rate": 1.38389053827288e-05, |
|
"loss": 0.6258, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.533457249070632, |
|
"grad_norm": 0.3184954524040222, |
|
"learning_rate": 1.3709823157351234e-05, |
|
"loss": 0.6313, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 1.5566914498141264, |
|
"grad_norm": 0.2632908821105957, |
|
"learning_rate": 1.3580740931973668e-05, |
|
"loss": 0.6217, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 1.579925650557621, |
|
"grad_norm": 0.22145096957683563, |
|
"learning_rate": 1.3451658706596103e-05, |
|
"loss": 0.6245, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 1.6031598513011152, |
|
"grad_norm": 0.5008528828620911, |
|
"learning_rate": 1.3322576481218536e-05, |
|
"loss": 0.6187, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 1.6263940520446096, |
|
"grad_norm": 0.25452372431755066, |
|
"learning_rate": 1.3193494255840972e-05, |
|
"loss": 0.6084, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.649628252788104, |
|
"grad_norm": 0.3917735815048218, |
|
"learning_rate": 1.3064412030463407e-05, |
|
"loss": 0.6088, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 1.6728624535315983, |
|
"grad_norm": 0.28736940026283264, |
|
"learning_rate": 1.2935329805085842e-05, |
|
"loss": 0.6084, |
|
"step": 7200 |
|
}, |
|
{ |
|
"epoch": 1.696096654275093, |
|
"grad_norm": 0.3900860548019409, |
|
"learning_rate": 1.2807538401962051e-05, |
|
"loss": 0.6017, |
|
"step": 7300 |
|
}, |
|
{ |
|
"epoch": 1.7193308550185873, |
|
"grad_norm": 0.2482582926750183, |
|
"learning_rate": 1.2678456176584486e-05, |
|
"loss": 0.5964, |
|
"step": 7400 |
|
}, |
|
{ |
|
"epoch": 1.7425650557620818, |
|
"grad_norm": 0.2464774250984192, |
|
"learning_rate": 1.254937395120692e-05, |
|
"loss": 0.5929, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.7657992565055762, |
|
"grad_norm": 0.36112162470817566, |
|
"learning_rate": 1.2420291725829354e-05, |
|
"loss": 0.5913, |
|
"step": 7600 |
|
}, |
|
{ |
|
"epoch": 1.7890334572490705, |
|
"grad_norm": 0.30204829573631287, |
|
"learning_rate": 1.2291209500451788e-05, |
|
"loss": 0.5804, |
|
"step": 7700 |
|
}, |
|
{ |
|
"epoch": 1.8122676579925652, |
|
"grad_norm": 0.2731075584888458, |
|
"learning_rate": 1.2162127275074223e-05, |
|
"loss": 0.5881, |
|
"step": 7800 |
|
}, |
|
{ |
|
"epoch": 1.8355018587360594, |
|
"grad_norm": 0.24604862928390503, |
|
"learning_rate": 1.2033045049696656e-05, |
|
"loss": 0.5679, |
|
"step": 7900 |
|
}, |
|
{ |
|
"epoch": 1.858736059479554, |
|
"grad_norm": 0.3449194133281708, |
|
"learning_rate": 1.1903962824319092e-05, |
|
"loss": 0.582, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.8819702602230484, |
|
"grad_norm": 0.310375452041626, |
|
"learning_rate": 1.1774880598941527e-05, |
|
"loss": 0.575, |
|
"step": 8100 |
|
}, |
|
{ |
|
"epoch": 1.9052044609665426, |
|
"grad_norm": 0.28315114974975586, |
|
"learning_rate": 1.1645798373563962e-05, |
|
"loss": 0.5722, |
|
"step": 8200 |
|
}, |
|
{ |
|
"epoch": 1.9284386617100373, |
|
"grad_norm": 0.3091906011104584, |
|
"learning_rate": 1.1516716148186395e-05, |
|
"loss": 0.5533, |
|
"step": 8300 |
|
}, |
|
{ |
|
"epoch": 1.9516728624535316, |
|
"grad_norm": 0.28990840911865234, |
|
"learning_rate": 1.138763392280883e-05, |
|
"loss": 0.5724, |
|
"step": 8400 |
|
}, |
|
{ |
|
"epoch": 1.974907063197026, |
|
"grad_norm": 0.44591304659843445, |
|
"learning_rate": 1.1258551697431264e-05, |
|
"loss": 0.5701, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.9981412639405205, |
|
"grad_norm": 0.26404786109924316, |
|
"learning_rate": 1.11294694720537e-05, |
|
"loss": 0.553, |
|
"step": 8600 |
|
}, |
|
{ |
|
"epoch": 2.0213754646840147, |
|
"grad_norm": 0.2843058705329895, |
|
"learning_rate": 1.1000387246676133e-05, |
|
"loss": 0.5631, |
|
"step": 8700 |
|
}, |
|
{ |
|
"epoch": 2.0446096654275094, |
|
"grad_norm": 0.20029422640800476, |
|
"learning_rate": 1.0871305021298568e-05, |
|
"loss": 0.5495, |
|
"step": 8800 |
|
}, |
|
{ |
|
"epoch": 2.0678438661710037, |
|
"grad_norm": 0.26215997338294983, |
|
"learning_rate": 1.0742222795921003e-05, |
|
"loss": 0.5562, |
|
"step": 8900 |
|
}, |
|
{ |
|
"epoch": 2.091078066914498, |
|
"grad_norm": 0.29611942172050476, |
|
"learning_rate": 1.0613140570543436e-05, |
|
"loss": 0.5541, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.1143122676579926, |
|
"grad_norm": 0.2809213697910309, |
|
"learning_rate": 1.048405834516587e-05, |
|
"loss": 0.5429, |
|
"step": 9100 |
|
}, |
|
{ |
|
"epoch": 2.137546468401487, |
|
"grad_norm": 0.4684973657131195, |
|
"learning_rate": 1.0354976119788307e-05, |
|
"loss": 0.5518, |
|
"step": 9200 |
|
}, |
|
{ |
|
"epoch": 2.1607806691449816, |
|
"grad_norm": 0.2790776193141937, |
|
"learning_rate": 1.0225893894410741e-05, |
|
"loss": 0.5485, |
|
"step": 9300 |
|
}, |
|
{ |
|
"epoch": 2.184014869888476, |
|
"grad_norm": 0.24624982476234436, |
|
"learning_rate": 1.0096811669033174e-05, |
|
"loss": 0.5434, |
|
"step": 9400 |
|
}, |
|
{ |
|
"epoch": 2.20724907063197, |
|
"grad_norm": 0.27161070704460144, |
|
"learning_rate": 9.967729443655609e-06, |
|
"loss": 0.5503, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.2304832713754648, |
|
"grad_norm": 0.2635902166366577, |
|
"learning_rate": 9.838647218278044e-06, |
|
"loss": 0.538, |
|
"step": 9600 |
|
}, |
|
{ |
|
"epoch": 2.253717472118959, |
|
"grad_norm": 0.35729700326919556, |
|
"learning_rate": 9.709564992900478e-06, |
|
"loss": 0.5376, |
|
"step": 9700 |
|
}, |
|
{ |
|
"epoch": 2.2769516728624537, |
|
"grad_norm": 0.224281907081604, |
|
"learning_rate": 9.580482767522913e-06, |
|
"loss": 0.5423, |
|
"step": 9800 |
|
}, |
|
{ |
|
"epoch": 2.300185873605948, |
|
"grad_norm": 0.2016523778438568, |
|
"learning_rate": 9.451400542145348e-06, |
|
"loss": 0.54, |
|
"step": 9900 |
|
}, |
|
{ |
|
"epoch": 2.323420074349442, |
|
"grad_norm": 0.3719424605369568, |
|
"learning_rate": 9.322318316767782e-06, |
|
"loss": 0.5326, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.346654275092937, |
|
"grad_norm": 0.22268572449684143, |
|
"learning_rate": 9.193236091390217e-06, |
|
"loss": 0.5379, |
|
"step": 10100 |
|
}, |
|
{ |
|
"epoch": 2.369888475836431, |
|
"grad_norm": 0.3181590735912323, |
|
"learning_rate": 9.06415386601265e-06, |
|
"loss": 0.5328, |
|
"step": 10200 |
|
}, |
|
{ |
|
"epoch": 2.393122676579926, |
|
"grad_norm": 0.2703763246536255, |
|
"learning_rate": 8.935071640635087e-06, |
|
"loss": 0.5276, |
|
"step": 10300 |
|
}, |
|
{ |
|
"epoch": 2.41635687732342, |
|
"grad_norm": 0.2698732912540436, |
|
"learning_rate": 8.80598941525752e-06, |
|
"loss": 0.5338, |
|
"step": 10400 |
|
}, |
|
{ |
|
"epoch": 2.4395910780669143, |
|
"grad_norm": 0.2765790820121765, |
|
"learning_rate": 8.676907189879954e-06, |
|
"loss": 0.5418, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 2.462825278810409, |
|
"grad_norm": 0.36516493558883667, |
|
"learning_rate": 8.547824964502389e-06, |
|
"loss": 0.5249, |
|
"step": 10600 |
|
}, |
|
{ |
|
"epoch": 2.4860594795539033, |
|
"grad_norm": 0.23371903598308563, |
|
"learning_rate": 8.418742739124824e-06, |
|
"loss": 0.5318, |
|
"step": 10700 |
|
}, |
|
{ |
|
"epoch": 2.5092936802973975, |
|
"grad_norm": 0.23883387446403503, |
|
"learning_rate": 8.289660513747258e-06, |
|
"loss": 0.5336, |
|
"step": 10800 |
|
}, |
|
{ |
|
"epoch": 2.532527881040892, |
|
"grad_norm": 0.23600026965141296, |
|
"learning_rate": 8.160578288369693e-06, |
|
"loss": 0.5207, |
|
"step": 10900 |
|
}, |
|
{ |
|
"epoch": 2.5557620817843865, |
|
"grad_norm": 0.22283987700939178, |
|
"learning_rate": 8.031496062992128e-06, |
|
"loss": 0.5261, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 2.578996282527881, |
|
"grad_norm": 0.3077383041381836, |
|
"learning_rate": 7.90241383761456e-06, |
|
"loss": 0.5117, |
|
"step": 11100 |
|
}, |
|
{ |
|
"epoch": 2.6022304832713754, |
|
"grad_norm": 0.24372899532318115, |
|
"learning_rate": 7.773331612236995e-06, |
|
"loss": 0.5251, |
|
"step": 11200 |
|
}, |
|
{ |
|
"epoch": 2.6254646840148697, |
|
"grad_norm": 0.3168962001800537, |
|
"learning_rate": 7.64424938685943e-06, |
|
"loss": 0.5238, |
|
"step": 11300 |
|
}, |
|
{ |
|
"epoch": 2.6486988847583643, |
|
"grad_norm": 0.2522094249725342, |
|
"learning_rate": 7.515167161481865e-06, |
|
"loss": 0.5141, |
|
"step": 11400 |
|
}, |
|
{ |
|
"epoch": 2.6719330855018586, |
|
"grad_norm": 0.4139024317264557, |
|
"learning_rate": 7.3860849361042984e-06, |
|
"loss": 0.5185, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 2.6951672862453533, |
|
"grad_norm": 0.2781153619289398, |
|
"learning_rate": 7.257002710726734e-06, |
|
"loss": 0.5121, |
|
"step": 11600 |
|
}, |
|
{ |
|
"epoch": 2.7184014869888475, |
|
"grad_norm": 0.38515913486480713, |
|
"learning_rate": 7.127920485349168e-06, |
|
"loss": 0.5178, |
|
"step": 11700 |
|
}, |
|
{ |
|
"epoch": 2.741635687732342, |
|
"grad_norm": 0.33289971947669983, |
|
"learning_rate": 6.998838259971602e-06, |
|
"loss": 0.5124, |
|
"step": 11800 |
|
}, |
|
{ |
|
"epoch": 2.7648698884758365, |
|
"grad_norm": 0.36876046657562256, |
|
"learning_rate": 6.871046856847813e-06, |
|
"loss": 0.5137, |
|
"step": 11900 |
|
}, |
|
{ |
|
"epoch": 2.7881040892193307, |
|
"grad_norm": 0.28098130226135254, |
|
"learning_rate": 6.7419646314702466e-06, |
|
"loss": 0.509, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 2.8113382899628254, |
|
"grad_norm": 0.32521939277648926, |
|
"learning_rate": 6.612882406092681e-06, |
|
"loss": 0.512, |
|
"step": 12100 |
|
}, |
|
{ |
|
"epoch": 2.8345724907063197, |
|
"grad_norm": 0.23627902567386627, |
|
"learning_rate": 6.483800180715116e-06, |
|
"loss": 0.5084, |
|
"step": 12200 |
|
}, |
|
{ |
|
"epoch": 2.857806691449814, |
|
"grad_norm": 0.23111554980278015, |
|
"learning_rate": 6.354717955337551e-06, |
|
"loss": 0.517, |
|
"step": 12300 |
|
}, |
|
{ |
|
"epoch": 2.8810408921933086, |
|
"grad_norm": 0.3062553107738495, |
|
"learning_rate": 6.2256357299599844e-06, |
|
"loss": 0.5063, |
|
"step": 12400 |
|
}, |
|
{ |
|
"epoch": 2.904275092936803, |
|
"grad_norm": 0.3274383842945099, |
|
"learning_rate": 6.09655350458242e-06, |
|
"loss": 0.5066, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 2.9275092936802976, |
|
"grad_norm": 0.25803956389427185, |
|
"learning_rate": 5.967471279204854e-06, |
|
"loss": 0.5064, |
|
"step": 12600 |
|
}, |
|
{ |
|
"epoch": 2.950743494423792, |
|
"grad_norm": 0.29026666283607483, |
|
"learning_rate": 5.838389053827288e-06, |
|
"loss": 0.5088, |
|
"step": 12700 |
|
}, |
|
{ |
|
"epoch": 2.973977695167286, |
|
"grad_norm": 0.36228805780410767, |
|
"learning_rate": 5.709306828449723e-06, |
|
"loss": 0.507, |
|
"step": 12800 |
|
}, |
|
{ |
|
"epoch": 2.9972118959107807, |
|
"grad_norm": 0.2669726014137268, |
|
"learning_rate": 5.580224603072157e-06, |
|
"loss": 0.4934, |
|
"step": 12900 |
|
}, |
|
{ |
|
"epoch": 3.020446096654275, |
|
"grad_norm": 0.24396216869354248, |
|
"learning_rate": 5.451142377694592e-06, |
|
"loss": 0.5099, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 3.0436802973977697, |
|
"grad_norm": 0.25540581345558167, |
|
"learning_rate": 5.322060152317027e-06, |
|
"loss": 0.5037, |
|
"step": 13100 |
|
}, |
|
{ |
|
"epoch": 3.066914498141264, |
|
"grad_norm": 0.1964583396911621, |
|
"learning_rate": 5.192977926939461e-06, |
|
"loss": 0.5055, |
|
"step": 13200 |
|
}, |
|
{ |
|
"epoch": 3.090148698884758, |
|
"grad_norm": 0.2318154275417328, |
|
"learning_rate": 5.063895701561895e-06, |
|
"loss": 0.5041, |
|
"step": 13300 |
|
}, |
|
{ |
|
"epoch": 3.113382899628253, |
|
"grad_norm": 0.28110265731811523, |
|
"learning_rate": 4.9348134761843295e-06, |
|
"loss": 0.5043, |
|
"step": 13400 |
|
}, |
|
{ |
|
"epoch": 3.136617100371747, |
|
"grad_norm": 0.3360753357410431, |
|
"learning_rate": 4.805731250806764e-06, |
|
"loss": 0.4915, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 3.159851301115242, |
|
"grad_norm": 0.3044135868549347, |
|
"learning_rate": 4.676649025429199e-06, |
|
"loss": 0.499, |
|
"step": 13600 |
|
}, |
|
{ |
|
"epoch": 3.183085501858736, |
|
"grad_norm": 0.28163620829582214, |
|
"learning_rate": 4.547566800051634e-06, |
|
"loss": 0.4996, |
|
"step": 13700 |
|
}, |
|
{ |
|
"epoch": 3.2063197026022303, |
|
"grad_norm": 0.23853909969329834, |
|
"learning_rate": 4.418484574674068e-06, |
|
"loss": 0.5073, |
|
"step": 13800 |
|
}, |
|
{ |
|
"epoch": 3.229553903345725, |
|
"grad_norm": 0.25510174036026, |
|
"learning_rate": 4.289402349296502e-06, |
|
"loss": 0.4988, |
|
"step": 13900 |
|
}, |
|
{ |
|
"epoch": 3.2527881040892193, |
|
"grad_norm": 0.650174081325531, |
|
"learning_rate": 4.160320123918937e-06, |
|
"loss": 0.5024, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 3.276022304832714, |
|
"grad_norm": 0.36293137073516846, |
|
"learning_rate": 4.0312378985413715e-06, |
|
"loss": 0.4913, |
|
"step": 14100 |
|
}, |
|
{ |
|
"epoch": 3.299256505576208, |
|
"grad_norm": 0.35399818420410156, |
|
"learning_rate": 3.902155673163805e-06, |
|
"loss": 0.4993, |
|
"step": 14200 |
|
}, |
|
{ |
|
"epoch": 3.3224907063197024, |
|
"grad_norm": 0.2553289830684662, |
|
"learning_rate": 3.7730734477862404e-06, |
|
"loss": 0.5017, |
|
"step": 14300 |
|
}, |
|
{ |
|
"epoch": 3.345724907063197, |
|
"grad_norm": 0.25535061955451965, |
|
"learning_rate": 3.643991222408675e-06, |
|
"loss": 0.4895, |
|
"step": 14400 |
|
}, |
|
{ |
|
"epoch": 3.3689591078066914, |
|
"grad_norm": 0.2772742509841919, |
|
"learning_rate": 3.514908997031109e-06, |
|
"loss": 0.4954, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 3.392193308550186, |
|
"grad_norm": 0.26105812191963196, |
|
"learning_rate": 3.387117593907319e-06, |
|
"loss": 0.4964, |
|
"step": 14600 |
|
}, |
|
{ |
|
"epoch": 3.4154275092936803, |
|
"grad_norm": 0.2538992166519165, |
|
"learning_rate": 3.258035368529754e-06, |
|
"loss": 0.4985, |
|
"step": 14700 |
|
}, |
|
{ |
|
"epoch": 3.4386617100371746, |
|
"grad_norm": 0.2889178693294525, |
|
"learning_rate": 3.128953143152188e-06, |
|
"loss": 0.4969, |
|
"step": 14800 |
|
}, |
|
{ |
|
"epoch": 3.4618959107806693, |
|
"grad_norm": 0.28792130947113037, |
|
"learning_rate": 2.9998709177746228e-06, |
|
"loss": 0.4985, |
|
"step": 14900 |
|
}, |
|
{ |
|
"epoch": 3.4851301115241635, |
|
"grad_norm": 0.36826494336128235, |
|
"learning_rate": 2.8707886923970575e-06, |
|
"loss": 0.4937, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 3.508364312267658, |
|
"grad_norm": 0.24432937800884247, |
|
"learning_rate": 2.7417064670194917e-06, |
|
"loss": 0.4892, |
|
"step": 15100 |
|
}, |
|
{ |
|
"epoch": 3.5315985130111525, |
|
"grad_norm": 0.36436623334884644, |
|
"learning_rate": 2.6126242416419264e-06, |
|
"loss": 0.5029, |
|
"step": 15200 |
|
}, |
|
{ |
|
"epoch": 3.5548327137546467, |
|
"grad_norm": 0.3257830739021301, |
|
"learning_rate": 2.4835420162643606e-06, |
|
"loss": 0.484, |
|
"step": 15300 |
|
}, |
|
{ |
|
"epoch": 3.5780669144981414, |
|
"grad_norm": 0.20910651981830597, |
|
"learning_rate": 2.354459790886795e-06, |
|
"loss": 0.4934, |
|
"step": 15400 |
|
}, |
|
{ |
|
"epoch": 3.6013011152416357, |
|
"grad_norm": 0.27706313133239746, |
|
"learning_rate": 2.2253775655092296e-06, |
|
"loss": 0.4972, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 3.6245353159851303, |
|
"grad_norm": 0.28043028712272644, |
|
"learning_rate": 2.0962953401316643e-06, |
|
"loss": 0.4878, |
|
"step": 15600 |
|
}, |
|
{ |
|
"epoch": 3.6477695167286246, |
|
"grad_norm": 0.34835153818130493, |
|
"learning_rate": 1.9672131147540985e-06, |
|
"loss": 0.4954, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 3.671003717472119, |
|
"grad_norm": 0.3561202585697174, |
|
"learning_rate": 1.838130889376533e-06, |
|
"loss": 0.4992, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 3.6942379182156135, |
|
"grad_norm": 0.2767621576786041, |
|
"learning_rate": 1.7090486639989677e-06, |
|
"loss": 0.4982, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 3.717472118959108, |
|
"grad_norm": 0.22851090133190155, |
|
"learning_rate": 1.579966438621402e-06, |
|
"loss": 0.498, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 3.7407063197026025, |
|
"grad_norm": 0.28282201290130615, |
|
"learning_rate": 1.4508842132438364e-06, |
|
"loss": 0.4898, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 3.7639405204460967, |
|
"grad_norm": 0.24474182724952698, |
|
"learning_rate": 1.3218019878662709e-06, |
|
"loss": 0.501, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 3.787174721189591, |
|
"grad_norm": 0.27427938580513, |
|
"learning_rate": 1.1927197624887055e-06, |
|
"loss": 0.4966, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 3.8104089219330852, |
|
"grad_norm": 0.38391393423080444, |
|
"learning_rate": 1.0636375371111398e-06, |
|
"loss": 0.4941, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 3.83364312267658, |
|
"grad_norm": 0.3098974823951721, |
|
"learning_rate": 9.345553117335744e-07, |
|
"loss": 0.4879, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 3.8568773234200746, |
|
"grad_norm": 0.2817577123641968, |
|
"learning_rate": 8.054730863560088e-07, |
|
"loss": 0.4925, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 3.880111524163569, |
|
"grad_norm": 0.3037372827529907, |
|
"learning_rate": 6.763908609784433e-07, |
|
"loss": 0.4927, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 3.903345724907063, |
|
"grad_norm": 0.2850995659828186, |
|
"learning_rate": 5.473086356008779e-07, |
|
"loss": 0.4909, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 3.9265799256505574, |
|
"grad_norm": 0.25115731358528137, |
|
"learning_rate": 4.182264102233123e-07, |
|
"loss": 0.5, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 3.949814126394052, |
|
"grad_norm": 0.4323899745941162, |
|
"learning_rate": 2.8914418484574677e-07, |
|
"loss": 0.4861, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 3.9730483271375467, |
|
"grad_norm": 0.30076873302459717, |
|
"learning_rate": 1.6006195946818127e-07, |
|
"loss": 0.4855, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 3.996282527881041, |
|
"grad_norm": 0.2874129116535187, |
|
"learning_rate": 3.097973409061573e-08, |
|
"loss": 0.4957, |
|
"step": 17200 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 17216, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 4, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.805111076121907e+16, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|