|
{ |
|
"best_metric": 5.531345844268799, |
|
"best_model_checkpoint": "./results/models/checkpoint-58916", |
|
"epoch": 13.0, |
|
"eval_steps": 500, |
|
"global_step": 58916, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.11032656663724624, |
|
"grad_norm": 0.07373046875, |
|
"learning_rate": 0.00199558693733451, |
|
"loss": 6.7817, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.22065313327449249, |
|
"grad_norm": 0.10693359375, |
|
"learning_rate": 0.0019911738746690205, |
|
"loss": 6.1767, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.33097969991173876, |
|
"grad_norm": 0.10595703125, |
|
"learning_rate": 0.0019867608120035306, |
|
"loss": 6.0418, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.44130626654898497, |
|
"grad_norm": 0.08642578125, |
|
"learning_rate": 0.0019823477493380406, |
|
"loss": 5.9811, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.5516328331862312, |
|
"grad_norm": 0.11572265625, |
|
"learning_rate": 0.0019779346866725506, |
|
"loss": 5.9448, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.6619593998234775, |
|
"grad_norm": 0.07958984375, |
|
"learning_rate": 0.001973521624007061, |
|
"loss": 5.9146, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.7722859664607238, |
|
"grad_norm": 0.10205078125, |
|
"learning_rate": 0.001969108561341571, |
|
"loss": 5.8869, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.8826125330979699, |
|
"grad_norm": 0.09814453125, |
|
"learning_rate": 0.001964695498676081, |
|
"loss": 5.8551, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.9929390997352162, |
|
"grad_norm": 0.1025390625, |
|
"learning_rate": 0.001960282436010591, |
|
"loss": 5.8427, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 5.741388320922852, |
|
"eval_runtime": 7.2342, |
|
"eval_samples_per_second": 69.116, |
|
"eval_steps_per_second": 1.106, |
|
"step": 4532 |
|
}, |
|
{ |
|
"epoch": 1.1032656663724625, |
|
"grad_norm": 0.09912109375, |
|
"learning_rate": 0.0019558693733451016, |
|
"loss": 5.8385, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 1.2135922330097086, |
|
"grad_norm": 0.10302734375, |
|
"learning_rate": 0.0019514563106796117, |
|
"loss": 5.8289, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 1.323918799646955, |
|
"grad_norm": 0.11181640625, |
|
"learning_rate": 0.001947043248014122, |
|
"loss": 5.8157, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 1.4342453662842012, |
|
"grad_norm": 0.11376953125, |
|
"learning_rate": 0.001942630185348632, |
|
"loss": 5.7893, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 1.5445719329214476, |
|
"grad_norm": 0.1171875, |
|
"learning_rate": 0.0019382171226831422, |
|
"loss": 5.7912, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 1.6548984995586937, |
|
"grad_norm": 0.119140625, |
|
"learning_rate": 0.0019338040600176522, |
|
"loss": 5.7802, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 1.7652250661959399, |
|
"grad_norm": 0.109375, |
|
"learning_rate": 0.0019293909973521625, |
|
"loss": 5.7805, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 1.8755516328331863, |
|
"grad_norm": 0.10693359375, |
|
"learning_rate": 0.0019249779346866725, |
|
"loss": 5.765, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 1.9858781994704324, |
|
"grad_norm": 0.1064453125, |
|
"learning_rate": 0.0019205648720211827, |
|
"loss": 5.7674, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 5.6664910316467285, |
|
"eval_runtime": 6.8932, |
|
"eval_samples_per_second": 72.535, |
|
"eval_steps_per_second": 1.161, |
|
"step": 9064 |
|
}, |
|
{ |
|
"epoch": 2.096204766107679, |
|
"grad_norm": 0.11572265625, |
|
"learning_rate": 0.001916151809355693, |
|
"loss": 5.7568, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 2.206531332744925, |
|
"grad_norm": 0.109375, |
|
"learning_rate": 0.001911738746690203, |
|
"loss": 5.7619, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 2.316857899382171, |
|
"grad_norm": 0.12060546875, |
|
"learning_rate": 0.0019073256840247133, |
|
"loss": 5.7423, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 2.4271844660194173, |
|
"grad_norm": 0.1181640625, |
|
"learning_rate": 0.0019029126213592233, |
|
"loss": 5.7502, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 2.537511032656664, |
|
"grad_norm": 0.109375, |
|
"learning_rate": 0.0018984995586937335, |
|
"loss": 5.7371, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 2.64783759929391, |
|
"grad_norm": 0.1142578125, |
|
"learning_rate": 0.0018940864960282436, |
|
"loss": 5.7316, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 2.758164165931156, |
|
"grad_norm": 0.10595703125, |
|
"learning_rate": 0.0018896734333627538, |
|
"loss": 5.7315, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 2.8684907325684024, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 0.001885260370697264, |
|
"loss": 5.7301, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 2.978817299205649, |
|
"grad_norm": 0.140625, |
|
"learning_rate": 0.001880847308031774, |
|
"loss": 5.728, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 5.630899429321289, |
|
"eval_runtime": 7.3076, |
|
"eval_samples_per_second": 68.422, |
|
"eval_steps_per_second": 1.095, |
|
"step": 13596 |
|
}, |
|
{ |
|
"epoch": 3.089143865842895, |
|
"grad_norm": 0.11572265625, |
|
"learning_rate": 0.001876434245366284, |
|
"loss": 5.724, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 3.1994704324801413, |
|
"grad_norm": 0.109375, |
|
"learning_rate": 0.0018720211827007946, |
|
"loss": 5.7181, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 3.3097969991173875, |
|
"grad_norm": 0.11865234375, |
|
"learning_rate": 0.0018676081200353046, |
|
"loss": 5.7201, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 3.4201235657546336, |
|
"grad_norm": 0.1240234375, |
|
"learning_rate": 0.0018631950573698146, |
|
"loss": 5.718, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 3.5304501323918798, |
|
"grad_norm": 0.1328125, |
|
"learning_rate": 0.0018587819947043247, |
|
"loss": 5.6959, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 3.6407766990291264, |
|
"grad_norm": 0.11181640625, |
|
"learning_rate": 0.0018543689320388351, |
|
"loss": 5.7112, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 3.7511032656663725, |
|
"grad_norm": 0.1220703125, |
|
"learning_rate": 0.0018499558693733451, |
|
"loss": 5.6986, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 3.8614298323036187, |
|
"grad_norm": 0.1318359375, |
|
"learning_rate": 0.0018455428067078552, |
|
"loss": 5.7103, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 3.971756398940865, |
|
"grad_norm": 0.12158203125, |
|
"learning_rate": 0.0018411297440423656, |
|
"loss": 5.7054, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 5.601424694061279, |
|
"eval_runtime": 6.7523, |
|
"eval_samples_per_second": 74.049, |
|
"eval_steps_per_second": 1.185, |
|
"step": 18128 |
|
}, |
|
{ |
|
"epoch": 4.0820829655781115, |
|
"grad_norm": 0.126953125, |
|
"learning_rate": 0.0018367166813768757, |
|
"loss": 5.6929, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 4.192409532215358, |
|
"grad_norm": 0.1298828125, |
|
"learning_rate": 0.0018323036187113857, |
|
"loss": 5.6868, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 4.302736098852604, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 0.0018278905560458957, |
|
"loss": 5.6955, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 4.41306266548985, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 0.0018234774933804062, |
|
"loss": 5.6879, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 4.523389232127096, |
|
"grad_norm": 0.16796875, |
|
"learning_rate": 0.0018190644307149162, |
|
"loss": 5.6919, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 4.633715798764342, |
|
"grad_norm": 0.1416015625, |
|
"learning_rate": 0.0018146513680494262, |
|
"loss": 5.6906, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 4.744042365401588, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 0.0018102383053839365, |
|
"loss": 5.6761, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 4.854368932038835, |
|
"grad_norm": 0.1298828125, |
|
"learning_rate": 0.0018058252427184467, |
|
"loss": 5.6859, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 4.964695498676081, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 0.0018014121800529568, |
|
"loss": 5.6948, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 5.584114074707031, |
|
"eval_runtime": 6.7803, |
|
"eval_samples_per_second": 73.743, |
|
"eval_steps_per_second": 1.18, |
|
"step": 22660 |
|
}, |
|
{ |
|
"epoch": 5.075022065313328, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 0.001796999117387467, |
|
"loss": 5.6773, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 5.185348631950574, |
|
"grad_norm": 0.1494140625, |
|
"learning_rate": 0.001792586054721977, |
|
"loss": 5.6741, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 5.29567519858782, |
|
"grad_norm": 0.138671875, |
|
"learning_rate": 0.0017881729920564873, |
|
"loss": 5.6694, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 5.406001765225066, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 0.0017837599293909973, |
|
"loss": 5.6803, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 5.516328331862312, |
|
"grad_norm": 0.1611328125, |
|
"learning_rate": 0.0017793468667255076, |
|
"loss": 5.6683, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 5.626654898499559, |
|
"grad_norm": 0.1318359375, |
|
"learning_rate": 0.0017749338040600176, |
|
"loss": 5.6717, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 5.736981465136805, |
|
"grad_norm": 0.126953125, |
|
"learning_rate": 0.0017705207413945278, |
|
"loss": 5.671, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 5.847308031774051, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 0.001766107678729038, |
|
"loss": 5.6843, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 5.957634598411297, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 0.0017616946160635481, |
|
"loss": 5.6726, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 5.570712566375732, |
|
"eval_runtime": 6.7662, |
|
"eval_samples_per_second": 73.896, |
|
"eval_steps_per_second": 1.182, |
|
"step": 27192 |
|
}, |
|
{ |
|
"epoch": 6.067961165048544, |
|
"grad_norm": 0.1455078125, |
|
"learning_rate": 0.0017572815533980584, |
|
"loss": 5.6475, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 6.17828773168579, |
|
"grad_norm": 0.1328125, |
|
"learning_rate": 0.0017528684907325684, |
|
"loss": 5.6556, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 6.288614298323036, |
|
"grad_norm": 0.142578125, |
|
"learning_rate": 0.0017484554280670786, |
|
"loss": 5.6642, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 6.398940864960283, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 0.0017440423654015887, |
|
"loss": 5.6667, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 6.509267431597529, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 0.001739629302736099, |
|
"loss": 5.6611, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 6.619593998234775, |
|
"grad_norm": 0.1376953125, |
|
"learning_rate": 0.0017352162400706092, |
|
"loss": 5.6541, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 6.729920564872021, |
|
"grad_norm": 0.146484375, |
|
"learning_rate": 0.0017308031774051192, |
|
"loss": 5.6624, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 6.840247131509267, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 0.0017263901147396292, |
|
"loss": 5.6582, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 6.950573698146513, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 0.0017219770520741397, |
|
"loss": 5.6548, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 5.559887886047363, |
|
"eval_runtime": 6.8808, |
|
"eval_samples_per_second": 72.666, |
|
"eval_steps_per_second": 1.163, |
|
"step": 31724 |
|
}, |
|
{ |
|
"epoch": 7.0609002647837595, |
|
"grad_norm": 0.154296875, |
|
"learning_rate": 0.0017175639894086497, |
|
"loss": 5.6465, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 7.171226831421007, |
|
"grad_norm": 0.1298828125, |
|
"learning_rate": 0.0017131509267431597, |
|
"loss": 5.6571, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 7.281553398058253, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 0.0017087378640776698, |
|
"loss": 5.6475, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 7.391879964695499, |
|
"grad_norm": 0.138671875, |
|
"learning_rate": 0.0017043248014121802, |
|
"loss": 5.6503, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 7.502206531332745, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 0.0016999117387466903, |
|
"loss": 5.6645, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 7.612533097969991, |
|
"grad_norm": 0.1396484375, |
|
"learning_rate": 0.0016954986760812003, |
|
"loss": 5.6403, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 7.722859664607237, |
|
"grad_norm": 0.1259765625, |
|
"learning_rate": 0.0016910856134157105, |
|
"loss": 5.6528, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 7.8331862312444835, |
|
"grad_norm": 0.138671875, |
|
"learning_rate": 0.0016866725507502208, |
|
"loss": 5.6419, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 7.94351279788173, |
|
"grad_norm": 0.1396484375, |
|
"learning_rate": 0.0016822594880847308, |
|
"loss": 5.6511, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 5.553098678588867, |
|
"eval_runtime": 8.0507, |
|
"eval_samples_per_second": 62.106, |
|
"eval_steps_per_second": 0.994, |
|
"step": 36256 |
|
}, |
|
{ |
|
"epoch": 8.053839364518977, |
|
"grad_norm": 0.1689453125, |
|
"learning_rate": 0.0016778464254192408, |
|
"loss": 5.6478, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 8.164165931156223, |
|
"grad_norm": 0.16796875, |
|
"learning_rate": 0.0016734333627537513, |
|
"loss": 5.6245, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 8.274492497793469, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 0.0016690203000882613, |
|
"loss": 5.6376, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 8.384819064430715, |
|
"grad_norm": 0.2099609375, |
|
"learning_rate": 0.0016646072374227714, |
|
"loss": 5.6465, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 8.495145631067961, |
|
"grad_norm": 0.1552734375, |
|
"learning_rate": 0.0016601941747572816, |
|
"loss": 5.6546, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 8.605472197705208, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 0.0016557811120917918, |
|
"loss": 5.6396, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 8.715798764342454, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 0.0016513680494263019, |
|
"loss": 5.6524, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 8.8261253309797, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 0.0016469549867608121, |
|
"loss": 5.6368, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 8.936451897616946, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 0.0016425419240953221, |
|
"loss": 5.6372, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 5.547245979309082, |
|
"eval_runtime": 6.7444, |
|
"eval_samples_per_second": 74.136, |
|
"eval_steps_per_second": 1.186, |
|
"step": 40788 |
|
}, |
|
{ |
|
"epoch": 9.046778464254192, |
|
"grad_norm": 0.1572265625, |
|
"learning_rate": 0.0016381288614298324, |
|
"loss": 5.6392, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 9.157105030891438, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 0.0016337157987643424, |
|
"loss": 5.6347, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 9.267431597528685, |
|
"grad_norm": 0.2041015625, |
|
"learning_rate": 0.0016293027360988527, |
|
"loss": 5.6439, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 9.37775816416593, |
|
"grad_norm": 0.15625, |
|
"learning_rate": 0.0016248896734333627, |
|
"loss": 5.6422, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 9.488084730803177, |
|
"grad_norm": 0.17578125, |
|
"learning_rate": 0.001620476610767873, |
|
"loss": 5.6266, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 9.598411297440423, |
|
"grad_norm": 0.140625, |
|
"learning_rate": 0.0016160635481023832, |
|
"loss": 5.6408, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 9.70873786407767, |
|
"grad_norm": 0.1328125, |
|
"learning_rate": 0.0016116504854368932, |
|
"loss": 5.6357, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 9.819064430714917, |
|
"grad_norm": 0.1474609375, |
|
"learning_rate": 0.0016072374227714032, |
|
"loss": 5.6387, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 9.929390997352161, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 0.0016028243601059135, |
|
"loss": 5.6347, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 5.540464401245117, |
|
"eval_runtime": 7.1327, |
|
"eval_samples_per_second": 70.1, |
|
"eval_steps_per_second": 1.122, |
|
"step": 45320 |
|
}, |
|
{ |
|
"epoch": 10.03971756398941, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 0.0015984112974404237, |
|
"loss": 5.6329, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 10.150044130626656, |
|
"grad_norm": 0.1728515625, |
|
"learning_rate": 0.0015939982347749338, |
|
"loss": 5.6368, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 10.260370697263902, |
|
"grad_norm": 0.13671875, |
|
"learning_rate": 0.001589585172109444, |
|
"loss": 5.6292, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 10.370697263901148, |
|
"grad_norm": 0.1396484375, |
|
"learning_rate": 0.0015851721094439543, |
|
"loss": 5.64, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 10.481023830538394, |
|
"grad_norm": 0.1845703125, |
|
"learning_rate": 0.0015807590467784643, |
|
"loss": 5.63, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 10.59135039717564, |
|
"grad_norm": 0.1826171875, |
|
"learning_rate": 0.0015763459841129743, |
|
"loss": 5.6312, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 10.701676963812886, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 0.0015719329214474848, |
|
"loss": 5.6319, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 10.812003530450133, |
|
"grad_norm": 0.1533203125, |
|
"learning_rate": 0.0015675198587819948, |
|
"loss": 5.6298, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 10.922330097087379, |
|
"grad_norm": 0.1904296875, |
|
"learning_rate": 0.0015631067961165048, |
|
"loss": 5.6281, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_loss": 5.537391662597656, |
|
"eval_runtime": 6.6373, |
|
"eval_samples_per_second": 75.332, |
|
"eval_steps_per_second": 1.205, |
|
"step": 49852 |
|
}, |
|
{ |
|
"epoch": 11.032656663724625, |
|
"grad_norm": 0.1513671875, |
|
"learning_rate": 0.0015586937334510149, |
|
"loss": 5.6283, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 11.142983230361871, |
|
"grad_norm": 0.1435546875, |
|
"learning_rate": 0.0015542806707855253, |
|
"loss": 5.6227, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 11.253309796999117, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 0.0015498676081200354, |
|
"loss": 5.6277, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 11.363636363636363, |
|
"grad_norm": 0.134765625, |
|
"learning_rate": 0.0015454545454545454, |
|
"loss": 5.6274, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 11.47396293027361, |
|
"grad_norm": 0.140625, |
|
"learning_rate": 0.0015410414827890556, |
|
"loss": 5.6306, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 11.584289496910856, |
|
"grad_norm": 0.150390625, |
|
"learning_rate": 0.0015366284201235659, |
|
"loss": 5.63, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 11.694616063548102, |
|
"grad_norm": 0.1982421875, |
|
"learning_rate": 0.001532215357458076, |
|
"loss": 5.6296, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 11.804942630185348, |
|
"grad_norm": 0.2490234375, |
|
"learning_rate": 0.001527802294792586, |
|
"loss": 5.6257, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 11.915269196822594, |
|
"grad_norm": 0.1767578125, |
|
"learning_rate": 0.0015233892321270964, |
|
"loss": 5.6299, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_loss": 5.5359296798706055, |
|
"eval_runtime": 7.1034, |
|
"eval_samples_per_second": 70.388, |
|
"eval_steps_per_second": 1.126, |
|
"step": 54384 |
|
}, |
|
{ |
|
"epoch": 12.02559576345984, |
|
"grad_norm": 0.22265625, |
|
"learning_rate": 0.0015189761694616064, |
|
"loss": 5.6219, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 12.135922330097088, |
|
"grad_norm": 0.162109375, |
|
"learning_rate": 0.0015145631067961165, |
|
"loss": 5.6342, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 12.246248896734334, |
|
"grad_norm": 0.1669921875, |
|
"learning_rate": 0.0015101500441306267, |
|
"loss": 5.629, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 12.35657546337158, |
|
"grad_norm": 0.1865234375, |
|
"learning_rate": 0.001505736981465137, |
|
"loss": 5.6198, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 12.466902030008827, |
|
"grad_norm": 0.16015625, |
|
"learning_rate": 0.001501323918799647, |
|
"loss": 5.6308, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 12.577228596646073, |
|
"grad_norm": 0.265625, |
|
"learning_rate": 0.0014969108561341572, |
|
"loss": 5.6205, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 12.687555163283319, |
|
"grad_norm": 0.1416015625, |
|
"learning_rate": 0.0014924977934686673, |
|
"loss": 5.6075, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 12.797881729920565, |
|
"grad_norm": 0.1708984375, |
|
"learning_rate": 0.0014880847308031775, |
|
"loss": 5.6294, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 12.908208296557811, |
|
"grad_norm": 0.1484375, |
|
"learning_rate": 0.0014836716681376875, |
|
"loss": 5.6225, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 13.0, |
|
"eval_loss": 5.531345844268799, |
|
"eval_runtime": 6.7386, |
|
"eval_samples_per_second": 74.2, |
|
"eval_steps_per_second": 1.187, |
|
"step": 58916 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 226600, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 50, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"EarlyStoppingCallback": { |
|
"args": { |
|
"early_stopping_patience": 3, |
|
"early_stopping_threshold": 0.0 |
|
}, |
|
"attributes": { |
|
"early_stopping_patience_counter": 0 |
|
} |
|
}, |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.800284808610376e+17, |
|
"train_batch_size": 64, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|