{
  "best_metric": 5.177720546722412,
  "best_model_checkpoint": "../checkpoints/nf-distilbart_12_3-rqnsf-lagtrain-augmented/checkpoint-212000",
  "epoch": 2.953510079549729,
  "global_step": 212000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.03,
      "gate_score": 0.4928,
      "learning_rate": 0.0,
      "loss": 573.0775,
      "nf_loss": 569.8872,
      "ppl": 32.0588,
      "step": 2000
    },
    {
      "epoch": 0.06,
      "gate_score": 0.4835,
      "learning_rate": 0.0,
      "loss": 341.9544,
      "nf_loss": 339.8841,
      "ppl": 9.249,
      "step": 4000
    },
    {
      "epoch": 0.06,
      "eval_loss": 256.18603515625,
      "eval_nf_loss": 253.6398468017578,
      "eval_perplexity": 13.794677734375,
      "eval_runtime": 646.8656,
      "eval_samples_per_second": 20.666,
      "eval_steps_per_second": 2.583,
      "step": 4000
    },
    {
      "epoch": 0.08,
      "gate_score": 0.468,
      "learning_rate": 0.0,
      "loss": 216.1715,
      "nf_loss": 214.5228,
      "ppl": 5.9033,
      "step": 6000
    },
    {
      "epoch": 0.11,
      "gate_score": 0.4488,
      "learning_rate": 0.0,
      "loss": 143.1113,
      "nf_loss": 141.682,
      "ppl": 4.666,
      "step": 8000
    },
    {
      "epoch": 0.11,
      "eval_loss": 111.98719024658203,
      "eval_nf_loss": 109.95372009277344,
      "eval_perplexity": 8.187104225158691,
      "eval_runtime": 836.6322,
      "eval_samples_per_second": 15.978,
      "eval_steps_per_second": 1.997,
      "step": 8000
    },
    {
      "epoch": 0.14,
      "gate_score": 0.4331,
      "learning_rate": 0.0,
      "loss": 97.2206,
      "nf_loss": 95.8896,
      "ppl": 4.2098,
      "step": 10000
    },
    {
      "epoch": 0.17,
      "gate_score": 0.4146,
      "learning_rate": 0.0,
      "loss": 72.4608,
      "nf_loss": 71.1644,
      "ppl": 4.0666,
      "step": 12000
    },
    {
      "epoch": 0.17,
      "eval_loss": 60.48736572265625,
      "eval_nf_loss": 58.55850601196289,
      "eval_perplexity": 7.357144355773926,
      "eval_runtime": 847.0188,
      "eval_samples_per_second": 15.782,
      "eval_steps_per_second": 1.973,
      "step": 12000
    },
    {
      "epoch": 0.2,
      "gate_score": 0.3943,
      "learning_rate": 0.0,
      "loss": 58.0258,
      "nf_loss": 56.7432,
      "ppl": 4.0134,
      "step": 14000
    },
    {
      "epoch": 0.22,
      "gate_score": 0.373,
      "learning_rate": 0.0,
      "loss": 45.4137,
      "nf_loss": 44.1607,
      "ppl": 3.8803,
      "step": 16000
    },
    {
      "epoch": 0.22,
      "eval_loss": 35.780086517333984,
      "eval_nf_loss": 33.906742095947266,
      "eval_perplexity": 6.943728923797607,
      "eval_runtime": 829.3361,
      "eval_samples_per_second": 16.119,
      "eval_steps_per_second": 2.015,
      "step": 16000
    },
    {
      "epoch": 0.25,
      "gate_score": 0.3552,
      "learning_rate": 0.0,
      "loss": 33.838,
      "nf_loss": 32.5964,
      "ppl": 3.8548,
      "step": 18000
    },
    {
      "epoch": 0.28,
      "gate_score": 0.3346,
      "learning_rate": 0.0,
      "loss": 28.827,
      "nf_loss": 27.5837,
      "ppl": 3.8361,
      "step": 20000
    },
    {
      "epoch": 0.28,
      "eval_loss": 24.475879669189453,
      "eval_nf_loss": 22.578872680664062,
      "eval_perplexity": 7.1263651847839355,
      "eval_runtime": 878.8848,
      "eval_samples_per_second": 15.21,
      "eval_steps_per_second": 1.901,
      "step": 20000
    },
    {
      "epoch": 0.31,
      "gate_score": 0.3166,
      "learning_rate": 0.0,
      "loss": 25.3406,
      "nf_loss": 24.0903,
      "ppl": 3.8804,
      "step": 22000
    },
    {
      "epoch": 0.33,
      "gate_score": 0.2976,
      "learning_rate": 0.0,
      "loss": 22.6997,
      "nf_loss": 21.451,
      "ppl": 3.8523,
      "step": 24000
    },
    {
      "epoch": 0.33,
      "eval_loss": 20.067764282226562,
      "eval_nf_loss": 18.16155242919922,
      "eval_perplexity": 7.192458152770996,
      "eval_runtime": 909.546,
      "eval_samples_per_second": 14.697,
      "eval_steps_per_second": 1.837,
      "step": 24000
    },
    {
      "epoch": 0.36,
      "gate_score": 0.2793,
      "learning_rate": 0.0,
      "loss": 20.7264,
      "nf_loss": 19.4539,
      "ppl": 3.9698,
      "step": 26000
    },
    {
      "epoch": 0.39,
      "gate_score": 0.2666,
      "learning_rate": 0.0,
      "loss": 19.1146,
      "nf_loss": 17.8629,
      "ppl": 3.8907,
      "step": 28000
    },
    {
      "epoch": 0.39,
      "eval_loss": 16.49034309387207,
      "eval_nf_loss": 14.588922500610352,
      "eval_perplexity": 7.155718803405762,
      "eval_runtime": 909.9257,
      "eval_samples_per_second": 14.691,
      "eval_steps_per_second": 1.836,
      "step": 28000
    },
    {
      "epoch": 0.42,
      "gate_score": 0.2556,
      "learning_rate": 0.0,
      "loss": 17.6585,
      "nf_loss": 16.365,
      "ppl": 4.0564,
      "step": 30000
    },
    {
      "epoch": 0.45,
      "gate_score": 0.2421,
      "learning_rate": 0.0,
      "loss": 16.8685,
      "nf_loss": 15.5951,
      "ppl": 3.9944,
      "step": 32000
    },
    {
      "epoch": 0.45,
      "eval_loss": 15.092597961425781,
      "eval_nf_loss": 13.178274154663086,
      "eval_perplexity": 7.254034996032715,
      "eval_runtime": 911.9245,
      "eval_samples_per_second": 14.659,
      "eval_steps_per_second": 1.832,
      "step": 32000
    },
    {
      "epoch": 0.47,
      "gate_score": 0.2294,
      "learning_rate": 0.0,
      "loss": 15.8886,
      "nf_loss": 14.6156,
      "ppl": 3.9955,
      "step": 34000
    },
    {
      "epoch": 0.5,
      "gate_score": 0.218,
      "learning_rate": 0.0,
      "loss": 14.9949,
      "nf_loss": 13.7324,
      "ppl": 3.9536,
      "step": 36000
    },
    {
      "epoch": 0.5,
      "eval_loss": 13.548192977905273,
      "eval_nf_loss": 11.632102012634277,
      "eval_perplexity": 7.279270172119141,
      "eval_runtime": 927.089,
      "eval_samples_per_second": 14.419,
      "eval_steps_per_second": 1.802,
      "step": 36000
    },
    {
      "epoch": 0.53,
      "gate_score": 0.2095,
      "learning_rate": 0.0,
      "loss": 14.5153,
      "nf_loss": 13.2641,
      "ppl": 3.8886,
      "step": 38000
    },
    {
      "epoch": 0.56,
      "gate_score": 0.2065,
      "learning_rate": 0.0,
      "loss": 13.8227,
      "nf_loss": 12.5901,
      "ppl": 3.8375,
      "step": 40000
    },
    {
      "epoch": 0.56,
      "eval_loss": 12.412004470825195,
      "eval_nf_loss": 10.545845985412598,
      "eval_perplexity": 6.89225435256958,
      "eval_runtime": 905.0462,
      "eval_samples_per_second": 14.771,
      "eval_steps_per_second": 1.846,
      "step": 40000
    },
    {
      "epoch": 0.59,
      "gate_score": 0.2009,
      "learning_rate": 0.0,
      "loss": 13.3939,
      "nf_loss": 12.1663,
      "ppl": 3.7926,
      "step": 42000
    },
    {
      "epoch": 0.61,
      "gate_score": 0.1935,
      "learning_rate": 0.0,
      "loss": 13.0096,
      "nf_loss": 11.7892,
      "ppl": 3.758,
      "step": 44000
    },
    {
      "epoch": 0.61,
      "eval_loss": 11.815178871154785,
      "eval_nf_loss": 9.942044258117676,
      "eval_perplexity": 6.960797309875488,
      "eval_runtime": 909.5617,
      "eval_samples_per_second": 14.697,
      "eval_steps_per_second": 1.837,
      "step": 44000
    },
    {
      "epoch": 0.64,
      "gate_score": 0.1903,
      "learning_rate": 0.0,
      "loss": 12.4269,
      "nf_loss": 11.2151,
      "ppl": 3.7435,
      "step": 46000
    },
    {
      "epoch": 0.67,
      "gate_score": 0.1854,
      "learning_rate": 0.0,
      "loss": 11.9829,
      "nf_loss": 10.7658,
      "ppl": 3.7626,
      "step": 48000
    },
    {
      "epoch": 0.67,
      "eval_loss": 10.730130195617676,
      "eval_nf_loss": 8.879775047302246,
      "eval_perplexity": 6.794580459594727,
      "eval_runtime": 900.5692,
      "eval_samples_per_second": 14.844,
      "eval_steps_per_second": 1.855,
      "step": 48000
    },
    {
      "epoch": 0.7,
      "gate_score": 0.1842,
      "learning_rate": 0.0,
      "loss": 11.6604,
      "nf_loss": 10.459,
      "ppl": 3.6987,
      "step": 50000
    },
    {
      "epoch": 0.72,
      "gate_score": 0.1778,
      "learning_rate": 0.0,
      "loss": 11.3837,
      "nf_loss": 10.1987,
      "ppl": 3.6203,
      "step": 52000
    },
    {
      "epoch": 0.72,
      "eval_loss": 10.283514976501465,
      "eval_nf_loss": 8.462200164794922,
      "eval_perplexity": 6.589704513549805,
      "eval_runtime": 897.7218,
      "eval_samples_per_second": 14.891,
      "eval_steps_per_second": 1.861,
      "step": 52000
    },
    {
      "epoch": 0.75,
      "gate_score": 0.1743,
      "learning_rate": 0.0,
      "loss": 11.1345,
      "nf_loss": 9.9505,
      "ppl": 3.6226,
      "step": 54000
    },
    {
      "epoch": 0.78,
      "gate_score": 0.1681,
      "learning_rate": 0.0,
      "loss": 10.913,
      "nf_loss": 9.7401,
      "ppl": 3.5839,
      "step": 56000
    },
    {
      "epoch": 0.78,
      "eval_loss": 9.798712730407715,
      "eval_nf_loss": 7.988397598266602,
      "eval_perplexity": 6.516669750213623,
      "eval_runtime": 883.3578,
      "eval_samples_per_second": 15.133,
      "eval_steps_per_second": 1.892,
      "step": 56000
    },
    {
      "epoch": 0.81,
      "gate_score": 0.1714,
      "learning_rate": 0.0,
      "loss": 10.7764,
      "nf_loss": 9.605,
      "ppl": 3.5672,
      "step": 58000
    },
    {
      "epoch": 0.84,
      "gate_score": 0.1655,
      "learning_rate": 0.0,
      "loss": 10.5627,
      "nf_loss": 9.4126,
      "ppl": 3.5056,
      "step": 60000
    },
    {
      "epoch": 0.84,
      "eval_loss": 9.538662910461426,
      "eval_nf_loss": 7.738859176635742,
      "eval_perplexity": 6.450651168823242,
      "eval_runtime": 876.1125,
      "eval_samples_per_second": 15.258,
      "eval_steps_per_second": 1.907,
      "step": 60000
    },
    {
      "epoch": 0.86,
      "gate_score": 0.1661,
      "learning_rate": 0.0,
      "loss": 10.4422,
      "nf_loss": 9.2887,
      "ppl": 3.523,
      "step": 62000
    },
    {
      "epoch": 0.89,
      "gate_score": 0.1643,
      "learning_rate": 0.0,
      "loss": 10.238,
      "nf_loss": 9.092,
      "ppl": 3.4769,
      "step": 64000
    },
    {
      "epoch": 0.89,
      "eval_loss": 9.2942533493042,
      "eval_nf_loss": 7.510049819946289,
      "eval_perplexity": 6.345398426055908,
      "eval_runtime": 873.6279,
      "eval_samples_per_second": 15.302,
      "eval_steps_per_second": 1.913,
      "step": 64000
    },
    {
      "epoch": 0.92,
      "gate_score": 0.1625,
      "learning_rate": 0.0,
      "loss": 10.1516,
      "nf_loss": 9.0024,
      "ppl": 3.4898,
      "step": 66000
    },
    {
      "epoch": 0.95,
      "gate_score": 0.1609,
      "learning_rate": 0.0,
      "loss": 9.9819,
      "nf_loss": 8.8616,
      "ppl": 3.3834,
      "step": 68000
    },
    {
      "epoch": 0.95,
      "eval_loss": 9.13103199005127,
      "eval_nf_loss": 7.360151290893555,
      "eval_perplexity": 6.2570929527282715,
      "eval_runtime": 892.7167,
      "eval_samples_per_second": 14.975,
      "eval_steps_per_second": 1.872,
      "step": 68000
    },
    {
      "epoch": 0.98,
      "gate_score": 0.1605,
      "learning_rate": 0.0,
      "loss": 9.9759,
      "nf_loss": 8.8506,
      "ppl": 3.3913,
      "step": 70000
    },
    {
      "epoch": 1.0,
      "gate_score": 0.161,
      "learning_rate": 5.5250000000000005e-06,
      "loss": 9.9267,
      "nf_loss": 8.805,
      "ppl": 3.3955,
      "step": 72000
    },
    {
      "epoch": 1.0,
      "eval_loss": 9.130549430847168,
      "eval_nf_loss": 7.364365577697754,
      "eval_perplexity": 6.226567268371582,
      "eval_runtime": 865.1706,
      "eval_samples_per_second": 15.451,
      "eval_steps_per_second": 1.931,
      "step": 72000
    },
    {
      "epoch": 1.03,
      "gate_score": 0.1525,
      "learning_rate": 4.9922293335593894e-05,
      "loss": 10.6431,
      "nf_loss": 9.4961,
      "ppl": 3.4863,
      "step": 74000
    },
    {
      "epoch": 1.06,
      "gate_score": 0.1281,
      "learning_rate": 4.921622232583111e-05,
      "loss": 11.018,
      "nf_loss": 9.7948,
      "ppl": 3.8142,
      "step": 76000
    },
    {
      "epoch": 1.06,
      "eval_loss": 10.00072193145752,
      "eval_nf_loss": 8.153450965881348,
      "eval_perplexity": 6.768190383911133,
      "eval_runtime": 618.9198,
      "eval_samples_per_second": 21.599,
      "eval_steps_per_second": 2.7,
      "step": 76000
    },
    {
      "epoch": 1.09,
      "gate_score": 0.1074,
      "learning_rate": 4.850979810395739e-05,
      "loss": 10.5531,
      "nf_loss": 9.3486,
      "ppl": 3.7186,
      "step": 78000
    },
    {
      "epoch": 1.11,
      "gate_score": 0.0924,
      "learning_rate": 4.780408030630554e-05,
      "loss": 10.0575,
      "nf_loss": 8.8613,
      "ppl": 3.7072,
      "step": 80000
    },
    {
      "epoch": 1.11,
      "eval_loss": 9.039822578430176,
      "eval_nf_loss": 7.206754684448242,
      "eval_perplexity": 6.669988632202148,
      "eval_runtime": 639.9491,
      "eval_samples_per_second": 20.889,
      "eval_steps_per_second": 2.611,
      "step": 80000
    },
    {
      "epoch": 1.14,
      "gate_score": 0.0874,
      "learning_rate": 4.7097656084431825e-05,
      "loss": 9.7084,
      "nf_loss": 8.5025,
      "ppl": 3.7371,
      "step": 82000
    },
    {
      "epoch": 1.17,
      "gate_score": 0.0759,
      "learning_rate": 4.639158507466904e-05,
      "loss": 9.3613,
      "nf_loss": 8.1756,
      "ppl": 3.6444,
      "step": 84000
    },
    {
      "epoch": 1.17,
      "eval_loss": 8.67480182647705,
      "eval_nf_loss": 6.853387832641602,
      "eval_perplexity": 6.5951128005981445,
      "eval_runtime": 655.8198,
      "eval_samples_per_second": 20.384,
      "eval_steps_per_second": 2.548,
      "step": 84000
    },
    {
      "epoch": 1.2,
      "gate_score": 0.0725,
      "learning_rate": 4.5685160852795324e-05,
      "loss": 8.9712,
      "nf_loss": 7.7979,
      "ppl": 3.5998,
      "step": 86000
    },
    {
      "epoch": 1.23,
      "gate_score": 0.0666,
      "learning_rate": 4.497908984303254e-05,
      "loss": 8.7453,
      "nf_loss": 7.5777,
      "ppl": 3.5764,
      "step": 88000
    },
    {
      "epoch": 1.23,
      "eval_loss": 7.947096347808838,
      "eval_nf_loss": 6.137327671051025,
      "eval_perplexity": 6.509429454803467,
      "eval_runtime": 654.1896,
      "eval_samples_per_second": 20.434,
      "eval_steps_per_second": 2.554,
      "step": 88000
    },
    {
      "epoch": 1.25,
      "gate_score": 0.0657,
      "learning_rate": 4.4273018833269757e-05,
      "loss": 8.4051,
      "nf_loss": 7.2351,
      "ppl": 3.5928,
      "step": 90000
    },
    {
      "epoch": 1.28,
      "gate_score": 0.0632,
      "learning_rate": 4.356694782350697e-05,
      "loss": 8.2083,
      "nf_loss": 7.0471,
      "ppl": 3.5522,
      "step": 92000
    },
    {
      "epoch": 1.28,
      "eval_loss": 7.776934623718262,
      "eval_nf_loss": 5.991891384124756,
      "eval_perplexity": 6.346434593200684,
      "eval_runtime": 664.9292,
      "eval_samples_per_second": 20.104,
      "eval_steps_per_second": 2.513,
      "step": 92000
    },
    {
      "epoch": 1.31,
      "gate_score": 0.0614,
      "learning_rate": 4.2860523601633256e-05,
      "loss": 7.8299,
      "nf_loss": 6.6969,
      "ppl": 3.4547,
      "step": 94000
    },
    {
      "epoch": 1.34,
      "gate_score": 0.0596,
      "learning_rate": 4.215445259187047e-05,
      "loss": 7.6361,
      "nf_loss": 6.4891,
      "ppl": 3.5083,
      "step": 96000
    },
    {
      "epoch": 1.34,
      "eval_loss": 7.583176612854004,
      "eval_nf_loss": 5.804019927978516,
      "eval_perplexity": 6.316826343536377,
      "eval_runtime": 649.9826,
      "eval_samples_per_second": 20.567,
      "eval_steps_per_second": 2.571,
      "step": 96000
    },
    {
      "epoch": 1.37,
      "gate_score": 0.0567,
      "learning_rate": 4.144838158210769e-05,
      "loss": 7.4853,
      "nf_loss": 6.36,
      "ppl": 3.4233,
      "step": 98000
    },
    {
      "epoch": 1.39,
      "gate_score": 0.0557,
      "learning_rate": 4.074195736023397e-05,
      "loss": 7.2881,
      "nf_loss": 6.1511,
      "ppl": 3.4723,
      "step": 100000
    },
    {
      "epoch": 1.39,
      "eval_loss": 7.400701999664307,
      "eval_nf_loss": 5.625098705291748,
      "eval_perplexity": 6.289206504821777,
      "eval_runtime": 667.0031,
      "eval_samples_per_second": 20.042,
      "eval_steps_per_second": 2.505,
      "step": 100000
    },
    {
      "epoch": 1.42,
      "gate_score": 0.054,
      "learning_rate": 4.003588635047119e-05,
      "loss": 7.165,
      "nf_loss": 6.0472,
      "ppl": 3.3942,
      "step": 102000
    },
    {
      "epoch": 1.45,
      "gate_score": 0.0532,
      "learning_rate": 3.9329815340708403e-05,
      "loss": 6.9929,
      "nf_loss": 5.864,
      "ppl": 3.45,
      "step": 104000
    },
    {
      "epoch": 1.45,
      "eval_loss": 6.6446003913879395,
      "eval_nf_loss": 4.897329330444336,
      "eval_perplexity": 6.099964141845703,
      "eval_runtime": 665.1582,
      "eval_samples_per_second": 20.097,
      "eval_steps_per_second": 2.512,
      "step": 104000
    },
    {
      "epoch": 1.48,
      "gate_score": 0.0508,
      "learning_rate": 3.8623391118834686e-05,
      "loss": 6.8638,
      "nf_loss": 5.7412,
      "ppl": 3.4016,
      "step": 106000
    },
    {
      "epoch": 1.5,
      "gate_score": 0.048,
      "learning_rate": 3.79173201090719e-05,
      "loss": 6.7439,
      "nf_loss": 5.6285,
      "ppl": 3.3851,
      "step": 108000
    },
    {
      "epoch": 1.5,
      "eval_loss": 6.440381050109863,
      "eval_nf_loss": 4.687047004699707,
      "eval_perplexity": 6.147556304931641,
      "eval_runtime": 671.2658,
      "eval_samples_per_second": 19.915,
      "eval_steps_per_second": 2.489,
      "step": 108000
    },
    {
      "epoch": 1.53,
      "gate_score": 0.0489,
      "learning_rate": 3.721124909930912e-05,
      "loss": 6.6258,
      "nf_loss": 5.5227,
      "ppl": 3.34,
      "step": 110000
    },
    {
      "epoch": 1.56,
      "gate_score": 0.0476,
      "learning_rate": 3.65048248774354e-05,
      "loss": 6.5119,
      "nf_loss": 5.4155,
      "ppl": 3.3128,
      "step": 112000
    },
    {
      "epoch": 1.56,
      "eval_loss": 6.296238422393799,
      "eval_nf_loss": 4.570157527923584,
      "eval_perplexity": 5.967574119567871,
      "eval_runtime": 658.6675,
      "eval_samples_per_second": 20.296,
      "eval_steps_per_second": 2.537,
      "step": 112000
    },
    {
      "epoch": 1.59,
      "gate_score": 0.0482,
      "learning_rate": 3.579910707978356e-05,
      "loss": 6.3285,
      "nf_loss": 5.2244,
      "ppl": 3.3628,
      "step": 114000
    },
    {
      "epoch": 1.62,
      "gate_score": 0.0482,
      "learning_rate": 3.5092682857909834e-05,
      "loss": 6.2206,
      "nf_loss": 5.1234,
      "ppl": 3.3327,
      "step": 116000
    },
    {
      "epoch": 1.62,
      "eval_loss": 6.046633720397949,
      "eval_nf_loss": 4.321578025817871,
      "eval_perplexity": 5.9654130935668945,
      "eval_runtime": 653.7138,
      "eval_samples_per_second": 20.449,
      "eval_steps_per_second": 2.556,
      "step": 116000
    },
    {
      "epoch": 1.64,
      "gate_score": 0.0489,
      "learning_rate": 3.438625863603611e-05,
      "loss": 6.0978,
      "nf_loss": 5.0037,
      "ppl": 3.3053,
      "step": 118000
    },
    {
      "epoch": 1.67,
      "gate_score": 0.0468,
      "learning_rate": 3.3680540838384266e-05,
      "loss": 6.0198,
      "nf_loss": 4.9354,
      "ppl": 3.2737,
      "step": 120000
    },
    {
      "epoch": 1.67,
      "eval_loss": 5.779796123504639,
      "eval_nf_loss": 4.067526817321777,
      "eval_perplexity": 5.880522727966309,
      "eval_runtime": 641.0411,
      "eval_samples_per_second": 20.854,
      "eval_steps_per_second": 2.607,
      "step": 120000
    },
    {
      "epoch": 1.7,
      "gate_score": 0.0425,
      "learning_rate": 3.297446982862148e-05,
      "loss": 5.907,
      "nf_loss": 4.8253,
      "ppl": 3.2734,
      "step": 122000
    },
    {
      "epoch": 1.73,
      "gate_score": 0.0432,
      "learning_rate": 3.2268045606747765e-05,
      "loss": 5.7937,
      "nf_loss": 4.7167,
      "ppl": 3.2463,
      "step": 124000
    },
    {
      "epoch": 1.73,
      "eval_loss": 5.693485260009766,
      "eval_nf_loss": 3.997667074203491,
      "eval_perplexity": 5.778055667877197,
      "eval_runtime": 629.6888,
      "eval_samples_per_second": 21.23,
      "eval_steps_per_second": 2.654,
      "step": 124000
    },
    {
      "epoch": 1.76,
      "gate_score": 0.0413,
      "learning_rate": 3.156197459698498e-05,
      "loss": 5.6708,
      "nf_loss": 4.5839,
      "ppl": 3.2695,
      "step": 126000
    },
    {
      "epoch": 1.78,
      "gate_score": 0.0423,
      "learning_rate": 3.08559035872222e-05,
      "loss": 5.5759,
      "nf_loss": 4.5071,
      "ppl": 3.2127,
      "step": 128000
    },
    {
      "epoch": 1.78,
      "eval_loss": 5.489774227142334,
      "eval_nf_loss": 3.8035385608673096,
      "eval_perplexity": 5.725496292114258,
      "eval_runtime": 629.3114,
      "eval_samples_per_second": 21.242,
      "eval_steps_per_second": 2.655,
      "step": 128000
    },
    {
      "epoch": 1.81,
      "gate_score": 0.0415,
      "learning_rate": 3.0149832577459418e-05,
      "loss": 5.4762,
      "nf_loss": 4.4182,
      "ppl": 3.189,
      "step": 130000
    },
    {
      "epoch": 1.84,
      "gate_score": 0.043,
      "learning_rate": 2.9443761567696637e-05,
      "loss": 5.4304,
      "nf_loss": 4.3627,
      "ppl": 3.2132,
      "step": 132000
    },
    {
      "epoch": 1.84,
      "eval_loss": 5.458424091339111,
      "eval_nf_loss": 3.7697036266326904,
      "eval_perplexity": 5.744439125061035,
      "eval_runtime": 641.9111,
      "eval_samples_per_second": 20.825,
      "eval_steps_per_second": 2.603,
      "step": 132000
    },
    {
      "epoch": 1.87,
      "gate_score": 0.0438,
      "learning_rate": 2.8737337345822913e-05,
      "loss": 5.3504,
      "nf_loss": 4.2998,
      "ppl": 3.1531,
      "step": 134000
    },
    {
      "epoch": 1.89,
      "gate_score": 0.0429,
      "learning_rate": 2.803126633606013e-05,
      "loss": 5.2695,
      "nf_loss": 4.2383,
      "ppl": 3.0881,
      "step": 136000
    },
    {
      "epoch": 1.89,
      "eval_loss": 5.256904125213623,
      "eval_nf_loss": 3.5766844749450684,
      "eval_perplexity": 5.6950812339782715,
      "eval_runtime": 660.429,
      "eval_samples_per_second": 20.241,
      "eval_steps_per_second": 2.53,
      "step": 136000
    },
    {
      "epoch": 1.92,
      "gate_score": 0.0421,
      "learning_rate": 2.7324842114186412e-05,
      "loss": 5.2219,
      "nf_loss": 4.1712,
      "ppl": 3.1497,
      "step": 138000
    },
    {
      "epoch": 1.95,
      "gate_score": 0.0417,
      "learning_rate": 2.661877110442363e-05,
      "loss": 5.1817,
      "nf_loss": 4.1256,
      "ppl": 3.1777,
      "step": 140000
    },
    {
      "epoch": 1.95,
      "eval_loss": 5.158201217651367,
      "eval_nf_loss": 3.487734794616699,
      "eval_perplexity": 5.636016845703125,
      "eval_runtime": 644.0563,
      "eval_samples_per_second": 20.756,
      "eval_steps_per_second": 2.594,
      "step": 140000
    },
    {
      "epoch": 1.98,
      "gate_score": 0.0422,
      "learning_rate": 2.5913053306771785e-05,
      "loss": 5.1108,
      "nf_loss": 4.0728,
      "ppl": 3.1163,
      "step": 142000
    },
    {
      "epoch": 2.01,
      "gate_score": 0.0402,
      "learning_rate": 2.5206982297009e-05,
      "loss": 5.0219,
      "nf_loss": 4.0038,
      "ppl": 3.0436,
      "step": 144000
    },
    {
      "epoch": 2.01,
      "eval_loss": 5.062300205230713,
      "eval_nf_loss": 3.3892769813537598,
      "eval_perplexity": 5.655203819274902,
      "eval_runtime": 654.7044,
      "eval_samples_per_second": 20.418,
      "eval_steps_per_second": 2.552,
      "step": 144000
    },
    {
      "epoch": 2.03,
      "gate_score": 0.0407,
      "learning_rate": 2.4500558075135284e-05,
      "loss": 4.8922,
      "nf_loss": 3.9548,
      "ppl": 2.775,
      "step": 146000
    },
    {
      "epoch": 2.06,
      "gate_score": 0.0409,
      "learning_rate": 2.379413385326156e-05,
      "loss": 4.8179,
      "nf_loss": 3.8903,
      "ppl": 2.7374,
      "step": 148000
    },
    {
      "epoch": 2.06,
      "eval_loss": 4.891648769378662,
      "eval_nf_loss": 3.21927547454834,
      "eval_perplexity": 5.654395580291748,
      "eval_runtime": 649.5705,
      "eval_samples_per_second": 20.58,
      "eval_steps_per_second": 2.572,
      "step": 148000
    },
    {
      "epoch": 2.09,
      "gate_score": 0.0431,
      "learning_rate": 2.308806284349878e-05,
      "loss": 4.7494,
      "nf_loss": 3.8238,
      "ppl": 2.7455,
      "step": 150000
    },
    {
      "epoch": 2.12,
      "gate_score": 0.0424,
      "learning_rate": 2.2381991833735996e-05,
      "loss": 4.7155,
      "nf_loss": 3.7946,
      "ppl": 2.7302,
      "step": 152000
    },
    {
      "epoch": 2.12,
      "eval_loss": 4.806872844696045,
      "eval_nf_loss": 3.148994207382202,
      "eval_perplexity": 5.567600250244141,
      "eval_runtime": 630.6434,
      "eval_samples_per_second": 21.197,
      "eval_steps_per_second": 2.65,
      "step": 152000
    },
    {
      "epoch": 2.15,
      "gate_score": 0.0408,
      "learning_rate": 2.1675567611862275e-05,
      "loss": 4.6944,
      "nf_loss": 3.7643,
      "ppl": 2.7512,
      "step": 154000
    },
    {
      "epoch": 2.17,
      "gate_score": 0.0426,
      "learning_rate": 2.0969496602099495e-05,
      "loss": 4.6805,
      "nf_loss": 3.7557,
      "ppl": 2.7415,
      "step": 156000
    },
    {
      "epoch": 2.17,
      "eval_loss": 4.722836494445801,
      "eval_nf_loss": 3.066758632659912,
      "eval_perplexity": 5.557853698730469,
      "eval_runtime": 604.3797,
      "eval_samples_per_second": 22.119,
      "eval_steps_per_second": 2.765,
      "step": 156000
    },
    {
      "epoch": 2.2,
      "gate_score": 0.0438,
      "learning_rate": 2.026342559233671e-05,
      "loss": 4.5943,
      "nf_loss": 3.6721,
      "ppl": 2.722,
      "step": 158000
    },
    {
      "epoch": 2.23,
      "gate_score": 0.0447,
      "learning_rate": 1.9557354582573927e-05,
      "loss": 4.5781,
      "nf_loss": 3.6564,
      "ppl": 2.728,
      "step": 160000
    },
    {
      "epoch": 2.23,
      "eval_loss": 4.671295166015625,
      "eval_nf_loss": 3.0203938484191895,
      "eval_perplexity": 5.526300430297852,
      "eval_runtime": 604.9582,
      "eval_samples_per_second": 22.097,
      "eval_steps_per_second": 2.762,
      "step": 160000
    },
    {
      "epoch": 2.26,
      "gate_score": 0.0436,
      "learning_rate": 1.8851283572811147e-05,
      "loss": 4.5642,
      "nf_loss": 3.6404,
      "ppl": 2.7413,
      "step": 162000
    },
    {
      "epoch": 2.28,
      "gate_score": 0.0453,
      "learning_rate": 1.8144859350937426e-05,
      "loss": 4.4834,
      "nf_loss": 3.5645,
      "ppl": 2.7145,
      "step": 164000
    },
    {
      "epoch": 2.28,
      "eval_loss": 4.710845947265625,
      "eval_nf_loss": 3.063795566558838,
      "eval_perplexity": 5.505098342895508,
      "eval_runtime": 601.1865,
      "eval_samples_per_second": 22.236,
      "eval_steps_per_second": 2.78,
      "step": 164000
    },
    {
      "epoch": 2.31,
      "gate_score": 0.0442,
      "learning_rate": 1.7438788341174643e-05,
      "loss": 4.4998,
      "nf_loss": 3.5735,
      "ppl": 2.7361,
      "step": 166000
    },
    {
      "epoch": 2.34,
      "gate_score": 0.0442,
      "learning_rate": 1.6733070543522796e-05,
      "loss": 4.4172,
      "nf_loss": 3.4989,
      "ppl": 2.7118,
      "step": 168000
    },
    {
      "epoch": 2.34,
      "eval_loss": 4.5222930908203125,
      "eval_nf_loss": 2.8794257640838623,
      "eval_perplexity": 5.477485179901123,
      "eval_runtime": 609.6448,
      "eval_samples_per_second": 21.928,
      "eval_steps_per_second": 2.741,
      "step": 168000
    },
    {
      "epoch": 2.37,
      "gate_score": 0.0464,
      "learning_rate": 1.602664632164908e-05,
      "loss": 4.3817,
      "nf_loss": 3.4671,
      "ppl": 2.7031,
      "step": 170000
    },
    {
      "epoch": 2.4,
      "gate_score": 0.0447,
      "learning_rate": 1.5320575311886295e-05,
      "loss": 4.3776,
      "nf_loss": 3.4701,
      "ppl": 2.6834,
      "step": 172000
    },
    {
      "epoch": 2.4,
      "eval_loss": 4.446962356567383,
      "eval_nf_loss": 2.8145503997802734,
      "eval_perplexity": 5.419012546539307,
      "eval_runtime": 591.7923,
      "eval_samples_per_second": 22.589,
      "eval_steps_per_second": 2.824,
      "step": 172000
    },
    {
      "epoch": 2.42,
      "gate_score": 0.0438,
      "learning_rate": 1.4614151090012576e-05,
      "loss": 4.34,
      "nf_loss": 3.4303,
      "ppl": 2.6917,
      "step": 174000
    },
    {
      "epoch": 2.45,
      "gate_score": 0.0433,
      "learning_rate": 1.390808008024979e-05,
      "loss": 4.3096,
      "nf_loss": 3.4,
      "ppl": 2.6915,
      "step": 176000
    },
    {
      "epoch": 2.45,
      "eval_loss": 4.412790298461914,
      "eval_nf_loss": 2.7833149433135986,
      "eval_perplexity": 5.404773712158203,
      "eval_runtime": 593.2765,
      "eval_samples_per_second": 22.532,
      "eval_steps_per_second": 2.817,
      "step": 176000
    },
    {
      "epoch": 2.48,
      "gate_score": 0.0449,
      "learning_rate": 1.3201655858376072e-05,
      "loss": 4.289,
      "nf_loss": 3.3752,
      "ppl": 2.7063,
      "step": 178000
    },
    {
      "epoch": 2.51,
      "gate_score": 0.0431,
      "learning_rate": 1.2495938060724226e-05,
      "loss": 4.2453,
      "nf_loss": 3.3433,
      "ppl": 2.6679,
      "step": 180000
    },
    {
      "epoch": 2.51,
      "eval_loss": 4.363792896270752,
      "eval_nf_loss": 2.7436203956604004,
      "eval_perplexity": 5.350350856781006,
      "eval_runtime": 592.6812,
      "eval_samples_per_second": 22.555,
      "eval_steps_per_second": 2.819,
      "step": 180000
    },
    {
      "epoch": 2.54,
      "gate_score": 0.0444,
      "learning_rate": 1.1789867050961444e-05,
      "loss": 4.1812,
      "nf_loss": 3.2773,
      "ppl": 2.6769,
      "step": 182000
    },
    {
      "epoch": 2.56,
      "gate_score": 0.0439,
      "learning_rate": 1.1083442829087724e-05,
      "loss": 4.2063,
      "nf_loss": 3.3076,
      "ppl": 2.6592,
      "step": 184000
    },
    {
      "epoch": 2.56,
      "eval_loss": 4.401766777038574,
      "eval_nf_loss": 2.7845916748046875,
      "eval_perplexity": 5.334909915924072,
      "eval_runtime": 593.569,
      "eval_samples_per_second": 22.521,
      "eval_steps_per_second": 2.815,
      "step": 184000
    },
    {
      "epoch": 2.59,
      "gate_score": 0.0435,
      "learning_rate": 1.0377371819324942e-05,
      "loss": 4.1697,
      "nf_loss": 3.2627,
      "ppl": 2.6781,
      "step": 186000
    },
    {
      "epoch": 2.62,
      "gate_score": 0.0455,
      "learning_rate": 9.67130080956216e-06,
      "loss": 4.1413,
      "nf_loss": 3.2508,
      "ppl": 2.6383,
      "step": 188000
    },
    {
      "epoch": 2.62,
      "eval_loss": 4.287137031555176,
      "eval_nf_loss": 2.6753435134887695,
      "eval_perplexity": 5.3036627769470215,
      "eval_runtime": 592.6474,
      "eval_samples_per_second": 22.556,
      "eval_steps_per_second": 2.82,
      "step": 188000
    },
    {
      "epoch": 2.65,
      "gate_score": 0.0438,
      "learning_rate": 8.964876587688439e-06,
      "loss": 4.1244,
      "nf_loss": 3.2269,
      "ppl": 2.6537,
      "step": 190000
    },
    {
      "epoch": 2.67,
      "gate_score": 0.0446,
      "learning_rate": 8.258805577925655e-06,
      "loss": 4.0845,
      "nf_loss": 3.1966,
      "ppl": 2.6201,
      "step": 192000
    },
    {
      "epoch": 2.67,
      "eval_loss": 4.247885704040527,
      "eval_nf_loss": 2.640995502471924,
      "eval_perplexity": 5.275094985961914,
      "eval_runtime": 591.9491,
      "eval_samples_per_second": 22.583,
      "eval_steps_per_second": 2.823,
      "step": 192000
    },
    {
      "epoch": 2.7,
      "gate_score": 0.0445,
      "learning_rate": 7.552734568162874e-06,
      "loss": 4.0604,
      "nf_loss": 3.1769,
      "ppl": 2.6091,
      "step": 194000
    },
    {
      "epoch": 2.73,
      "gate_score": 0.0433,
      "learning_rate": 6.846310346289154e-06,
      "loss": 4.06,
      "nf_loss": 3.1758,
      "ppl": 2.6114,
      "step": 196000
    },
    {
      "epoch": 2.73,
      "eval_loss": 4.171822547912598,
      "eval_nf_loss": 2.567200183868408,
      "eval_perplexity": 5.264888763427734,
      "eval_runtime": 589.4312,
      "eval_samples_per_second": 22.679,
      "eval_steps_per_second": 2.835,
      "step": 196000
    },
    {
      "epoch": 2.76,
      "gate_score": 0.0436,
      "learning_rate": 6.140239336526371e-06,
      "loss": 4.0426,
      "nf_loss": 3.1603,
      "ppl": 2.6136,
      "step": 198000
    },
    {
      "epoch": 2.79,
      "gate_score": 0.0444,
      "learning_rate": 5.4338151146526515e-06,
      "loss": 4.0352,
      "nf_loss": 3.1654,
      "ppl": 2.5709,
      "step": 200000
    },
    {
      "epoch": 2.79,
      "eval_loss": 4.141200542449951,
      "eval_nf_loss": 2.539963483810425,
      "eval_perplexity": 5.245651721954346,
      "eval_runtime": 622.9626,
      "eval_samples_per_second": 21.459,
      "eval_steps_per_second": 2.682,
      "step": 200000
    },
    {
      "epoch": 2.81,
      "gate_score": 0.0442,
      "learning_rate": 4.727744104889869e-06,
      "loss": 3.9924,
      "nf_loss": 3.1183,
      "ppl": 2.5982,
      "step": 202000
    },
    {
      "epoch": 2.84,
      "gate_score": 0.0437,
      "learning_rate": 4.021673095127087e-06,
      "loss": 3.982,
      "nf_loss": 3.0969,
      "ppl": 2.6167,
      "step": 204000
    },
    {
      "epoch": 2.84,
      "eval_loss": 4.099726676940918,
      "eval_nf_loss": 2.5044403076171875,
      "eval_perplexity": 5.212295055389404,
      "eval_runtime": 628.0641,
      "eval_samples_per_second": 21.284,
      "eval_steps_per_second": 2.661,
      "step": 204000
    },
    {
      "epoch": 2.87,
      "gate_score": 0.044,
      "learning_rate": 3.315248873253366e-06,
      "loss": 3.9519,
      "nf_loss": 3.0778,
      "ppl": 2.5849,
      "step": 206000
    },
    {
      "epoch": 2.9,
      "gate_score": 0.0427,
      "learning_rate": 2.6091778634905835e-06,
      "loss": 3.9487,
      "nf_loss": 3.082,
      "ppl": 2.5664,
      "step": 208000
    },
    {
      "epoch": 2.9,
      "eval_loss": 4.066439151763916,
      "eval_nf_loss": 2.474716901779175,
      "eval_perplexity": 5.1935272216796875,
      "eval_runtime": 625.0657,
      "eval_samples_per_second": 21.387,
      "eval_steps_per_second": 2.673,
      "step": 208000
    },
    {
      "epoch": 2.93,
      "gate_score": 0.0425,
      "learning_rate": 1.9031068537278006e-06,
      "loss": 3.9389,
      "nf_loss": 3.0634,
      "ppl": 2.5914,
      "step": 210000
    },
    {
      "epoch": 2.95,
      "gate_score": 0.0424,
      "learning_rate": 1.196682631854081e-06,
      "loss": 3.9603,
      "nf_loss": 3.0926,
      "ppl": 2.5703,
      "step": 212000
    },
    {
      "epoch": 2.95,
      "eval_loss": 4.057208061218262,
      "eval_nf_loss": 2.4683539867401123,
      "eval_perplexity": 5.177720546722412,
      "eval_runtime": 618.2401,
      "eval_samples_per_second": 21.623,
      "eval_steps_per_second": 2.703,
      "step": 212000
    }
  ],
  "max_steps": 215337,
  "num_train_epochs": 3,
  "total_flos": 2.1693560999113897e+18,
  "trial_name": null,
  "trial_params": null
}