{ "best_metric": 5.177720546722412, "best_model_checkpoint": "../checkpoints/nf-distilbart_12_3-rqnsf-lagtrain-augmented/checkpoint-212000", "epoch": 2.953510079549729, "global_step": 212000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.03, "gate_score": 0.4928, "learning_rate": 0.0, "loss": 573.0775, "nf_loss": 569.8872, "ppl": 32.0588, "step": 2000 }, { "epoch": 0.06, "gate_score": 0.4835, "learning_rate": 0.0, "loss": 341.9544, "nf_loss": 339.8841, "ppl": 9.249, "step": 4000 }, { "epoch": 0.06, "eval_loss": 256.18603515625, "eval_nf_loss": 253.6398468017578, "eval_perplexity": 13.794677734375, "eval_runtime": 646.8656, "eval_samples_per_second": 20.666, "eval_steps_per_second": 2.583, "step": 4000 }, { "epoch": 0.08, "gate_score": 0.468, "learning_rate": 0.0, "loss": 216.1715, "nf_loss": 214.5228, "ppl": 5.9033, "step": 6000 }, { "epoch": 0.11, "gate_score": 0.4488, "learning_rate": 0.0, "loss": 143.1113, "nf_loss": 141.682, "ppl": 4.666, "step": 8000 }, { "epoch": 0.11, "eval_loss": 111.98719024658203, "eval_nf_loss": 109.95372009277344, "eval_perplexity": 8.187104225158691, "eval_runtime": 836.6322, "eval_samples_per_second": 15.978, "eval_steps_per_second": 1.997, "step": 8000 }, { "epoch": 0.14, "gate_score": 0.4331, "learning_rate": 0.0, "loss": 97.2206, "nf_loss": 95.8896, "ppl": 4.2098, "step": 10000 }, { "epoch": 0.17, "gate_score": 0.4146, "learning_rate": 0.0, "loss": 72.4608, "nf_loss": 71.1644, "ppl": 4.0666, "step": 12000 }, { "epoch": 0.17, "eval_loss": 60.48736572265625, "eval_nf_loss": 58.55850601196289, "eval_perplexity": 7.357144355773926, "eval_runtime": 847.0188, "eval_samples_per_second": 15.782, "eval_steps_per_second": 1.973, "step": 12000 }, { "epoch": 0.2, "gate_score": 0.3943, "learning_rate": 0.0, "loss": 58.0258, "nf_loss": 56.7432, "ppl": 4.0134, "step": 14000 }, { "epoch": 0.22, "gate_score": 0.373, "learning_rate": 0.0, "loss": 45.4137, "nf_loss": 44.1607, "ppl": 3.8803, "step": 16000 }, { "epoch": 0.22, "eval_loss": 35.780086517333984, "eval_nf_loss": 33.906742095947266, "eval_perplexity": 6.943728923797607, "eval_runtime": 829.3361, "eval_samples_per_second": 16.119, "eval_steps_per_second": 2.015, "step": 16000 }, { "epoch": 0.25, "gate_score": 0.3552, "learning_rate": 0.0, "loss": 33.838, "nf_loss": 32.5964, "ppl": 3.8548, "step": 18000 }, { "epoch": 0.28, "gate_score": 0.3346, "learning_rate": 0.0, "loss": 28.827, "nf_loss": 27.5837, "ppl": 3.8361, "step": 20000 }, { "epoch": 0.28, "eval_loss": 24.475879669189453, "eval_nf_loss": 22.578872680664062, "eval_perplexity": 7.1263651847839355, "eval_runtime": 878.8848, "eval_samples_per_second": 15.21, "eval_steps_per_second": 1.901, "step": 20000 }, { "epoch": 0.31, "gate_score": 0.3166, "learning_rate": 0.0, "loss": 25.3406, "nf_loss": 24.0903, "ppl": 3.8804, "step": 22000 }, { "epoch": 0.33, "gate_score": 0.2976, "learning_rate": 0.0, "loss": 22.6997, "nf_loss": 21.451, "ppl": 3.8523, "step": 24000 }, { "epoch": 0.33, "eval_loss": 20.067764282226562, "eval_nf_loss": 18.16155242919922, "eval_perplexity": 7.192458152770996, "eval_runtime": 909.546, "eval_samples_per_second": 14.697, "eval_steps_per_second": 1.837, "step": 24000 }, { "epoch": 0.36, "gate_score": 0.2793, "learning_rate": 0.0, "loss": 20.7264, "nf_loss": 19.4539, "ppl": 3.9698, "step": 26000 }, { "epoch": 0.39, "gate_score": 0.2666, "learning_rate": 0.0, "loss": 19.1146, "nf_loss": 17.8629, "ppl": 3.8907, "step": 28000 }, { "epoch": 0.39, "eval_loss": 16.49034309387207, "eval_nf_loss": 14.588922500610352, "eval_perplexity": 7.155718803405762, "eval_runtime": 909.9257, "eval_samples_per_second": 14.691, "eval_steps_per_second": 1.836, "step": 28000 }, { "epoch": 0.42, "gate_score": 0.2556, "learning_rate": 0.0, "loss": 17.6585, "nf_loss": 16.365, "ppl": 4.0564, "step": 30000 }, { "epoch": 0.45, "gate_score": 0.2421, "learning_rate": 0.0, "loss": 16.8685, "nf_loss": 15.5951, "ppl": 3.9944, "step": 32000 }, { "epoch": 0.45, "eval_loss": 15.092597961425781, "eval_nf_loss": 13.178274154663086, "eval_perplexity": 7.254034996032715, "eval_runtime": 911.9245, "eval_samples_per_second": 14.659, "eval_steps_per_second": 1.832, "step": 32000 }, { "epoch": 0.47, "gate_score": 0.2294, "learning_rate": 0.0, "loss": 15.8886, "nf_loss": 14.6156, "ppl": 3.9955, "step": 34000 }, { "epoch": 0.5, "gate_score": 0.218, "learning_rate": 0.0, "loss": 14.9949, "nf_loss": 13.7324, "ppl": 3.9536, "step": 36000 }, { "epoch": 0.5, "eval_loss": 13.548192977905273, "eval_nf_loss": 11.632102012634277, "eval_perplexity": 7.279270172119141, "eval_runtime": 927.089, "eval_samples_per_second": 14.419, "eval_steps_per_second": 1.802, "step": 36000 }, { "epoch": 0.53, "gate_score": 0.2095, "learning_rate": 0.0, "loss": 14.5153, "nf_loss": 13.2641, "ppl": 3.8886, "step": 38000 }, { "epoch": 0.56, "gate_score": 0.2065, "learning_rate": 0.0, "loss": 13.8227, "nf_loss": 12.5901, "ppl": 3.8375, "step": 40000 }, { "epoch": 0.56, "eval_loss": 12.412004470825195, "eval_nf_loss": 10.545845985412598, "eval_perplexity": 6.89225435256958, "eval_runtime": 905.0462, "eval_samples_per_second": 14.771, "eval_steps_per_second": 1.846, "step": 40000 }, { "epoch": 0.59, "gate_score": 0.2009, "learning_rate": 0.0, "loss": 13.3939, "nf_loss": 12.1663, "ppl": 3.7926, "step": 42000 }, { "epoch": 0.61, "gate_score": 0.1935, "learning_rate": 0.0, "loss": 13.0096, "nf_loss": 11.7892, "ppl": 3.758, "step": 44000 }, { "epoch": 0.61, "eval_loss": 11.815178871154785, "eval_nf_loss": 9.942044258117676, "eval_perplexity": 6.960797309875488, "eval_runtime": 909.5617, "eval_samples_per_second": 14.697, "eval_steps_per_second": 1.837, "step": 44000 }, { "epoch": 0.64, "gate_score": 0.1903, "learning_rate": 0.0, "loss": 12.4269, "nf_loss": 11.2151, "ppl": 3.7435, "step": 46000 }, { "epoch": 0.67, "gate_score": 0.1854, "learning_rate": 0.0, "loss": 11.9829, "nf_loss": 10.7658, "ppl": 3.7626, "step": 48000 }, { "epoch": 0.67, "eval_loss": 10.730130195617676, "eval_nf_loss": 8.879775047302246, "eval_perplexity": 6.794580459594727, "eval_runtime": 900.5692, "eval_samples_per_second": 14.844, "eval_steps_per_second": 1.855, "step": 48000 }, { "epoch": 0.7, "gate_score": 0.1842, "learning_rate": 0.0, "loss": 11.6604, "nf_loss": 10.459, "ppl": 3.6987, "step": 50000 }, { "epoch": 0.72, "gate_score": 0.1778, "learning_rate": 0.0, "loss": 11.3837, "nf_loss": 10.1987, "ppl": 3.6203, "step": 52000 }, { "epoch": 0.72, "eval_loss": 10.283514976501465, "eval_nf_loss": 8.462200164794922, "eval_perplexity": 6.589704513549805, "eval_runtime": 897.7218, "eval_samples_per_second": 14.891, "eval_steps_per_second": 1.861, "step": 52000 }, { "epoch": 0.75, "gate_score": 0.1743, "learning_rate": 0.0, "loss": 11.1345, "nf_loss": 9.9505, "ppl": 3.6226, "step": 54000 }, { "epoch": 0.78, "gate_score": 0.1681, "learning_rate": 0.0, "loss": 10.913, "nf_loss": 9.7401, "ppl": 3.5839, "step": 56000 }, { "epoch": 0.78, "eval_loss": 9.798712730407715, "eval_nf_loss": 7.988397598266602, "eval_perplexity": 6.516669750213623, "eval_runtime": 883.3578, "eval_samples_per_second": 15.133, "eval_steps_per_second": 1.892, "step": 56000 }, { "epoch": 0.81, "gate_score": 0.1714, "learning_rate": 0.0, "loss": 10.7764, "nf_loss": 9.605, "ppl": 3.5672, "step": 58000 }, { "epoch": 0.84, "gate_score": 0.1655, "learning_rate": 0.0, "loss": 10.5627, "nf_loss": 9.4126, "ppl": 3.5056, "step": 60000 }, { "epoch": 0.84, "eval_loss": 9.538662910461426, "eval_nf_loss": 7.738859176635742, "eval_perplexity": 6.450651168823242, "eval_runtime": 876.1125, "eval_samples_per_second": 15.258, "eval_steps_per_second": 1.907, "step": 60000 }, { "epoch": 0.86, "gate_score": 0.1661, "learning_rate": 0.0, "loss": 10.4422, "nf_loss": 9.2887, "ppl": 3.523, "step": 62000 }, { "epoch": 0.89, "gate_score": 0.1643, "learning_rate": 0.0, "loss": 10.238, "nf_loss": 9.092, "ppl": 3.4769, "step": 64000 }, { "epoch": 0.89, "eval_loss": 9.2942533493042, "eval_nf_loss": 7.510049819946289, "eval_perplexity": 6.345398426055908, "eval_runtime": 873.6279, "eval_samples_per_second": 15.302, "eval_steps_per_second": 1.913, "step": 64000 }, { "epoch": 0.92, "gate_score": 0.1625, "learning_rate": 0.0, "loss": 10.1516, "nf_loss": 9.0024, "ppl": 3.4898, "step": 66000 }, { "epoch": 0.95, "gate_score": 0.1609, "learning_rate": 0.0, "loss": 9.9819, "nf_loss": 8.8616, "ppl": 3.3834, "step": 68000 }, { "epoch": 0.95, "eval_loss": 9.13103199005127, "eval_nf_loss": 7.360151290893555, "eval_perplexity": 6.2570929527282715, "eval_runtime": 892.7167, "eval_samples_per_second": 14.975, "eval_steps_per_second": 1.872, "step": 68000 }, { "epoch": 0.98, "gate_score": 0.1605, "learning_rate": 0.0, "loss": 9.9759, "nf_loss": 8.8506, "ppl": 3.3913, "step": 70000 }, { "epoch": 1.0, "gate_score": 0.161, "learning_rate": 5.5250000000000005e-06, "loss": 9.9267, "nf_loss": 8.805, "ppl": 3.3955, "step": 72000 }, { "epoch": 1.0, "eval_loss": 9.130549430847168, "eval_nf_loss": 7.364365577697754, "eval_perplexity": 6.226567268371582, "eval_runtime": 865.1706, "eval_samples_per_second": 15.451, "eval_steps_per_second": 1.931, "step": 72000 }, { "epoch": 1.03, "gate_score": 0.1525, "learning_rate": 4.9922293335593894e-05, "loss": 10.6431, "nf_loss": 9.4961, "ppl": 3.4863, "step": 74000 }, { "epoch": 1.06, "gate_score": 0.1281, "learning_rate": 4.921622232583111e-05, "loss": 11.018, "nf_loss": 9.7948, "ppl": 3.8142, "step": 76000 }, { "epoch": 1.06, "eval_loss": 10.00072193145752, "eval_nf_loss": 8.153450965881348, "eval_perplexity": 6.768190383911133, "eval_runtime": 618.9198, "eval_samples_per_second": 21.599, "eval_steps_per_second": 2.7, "step": 76000 }, { "epoch": 1.09, "gate_score": 0.1074, "learning_rate": 4.850979810395739e-05, "loss": 10.5531, "nf_loss": 9.3486, "ppl": 3.7186, "step": 78000 }, { "epoch": 1.11, "gate_score": 0.0924, "learning_rate": 4.780408030630554e-05, "loss": 10.0575, "nf_loss": 8.8613, "ppl": 3.7072, "step": 80000 }, { "epoch": 1.11, "eval_loss": 9.039822578430176, "eval_nf_loss": 7.206754684448242, "eval_perplexity": 6.669988632202148, "eval_runtime": 639.9491, "eval_samples_per_second": 20.889, "eval_steps_per_second": 2.611, "step": 80000 }, { "epoch": 1.14, "gate_score": 0.0874, "learning_rate": 4.7097656084431825e-05, "loss": 9.7084, "nf_loss": 8.5025, "ppl": 3.7371, "step": 82000 }, { "epoch": 1.17, "gate_score": 0.0759, "learning_rate": 4.639158507466904e-05, "loss": 9.3613, "nf_loss": 8.1756, "ppl": 3.6444, "step": 84000 }, { "epoch": 1.17, "eval_loss": 8.67480182647705, "eval_nf_loss": 6.853387832641602, "eval_perplexity": 6.5951128005981445, "eval_runtime": 655.8198, "eval_samples_per_second": 20.384, "eval_steps_per_second": 2.548, "step": 84000 }, { "epoch": 1.2, "gate_score": 0.0725, "learning_rate": 4.5685160852795324e-05, "loss": 8.9712, "nf_loss": 7.7979, "ppl": 3.5998, "step": 86000 }, { "epoch": 1.23, "gate_score": 0.0666, "learning_rate": 4.497908984303254e-05, "loss": 8.7453, "nf_loss": 7.5777, "ppl": 3.5764, "step": 88000 }, { "epoch": 1.23, "eval_loss": 7.947096347808838, "eval_nf_loss": 6.137327671051025, "eval_perplexity": 6.509429454803467, "eval_runtime": 654.1896, "eval_samples_per_second": 20.434, "eval_steps_per_second": 2.554, "step": 88000 }, { "epoch": 1.25, "gate_score": 0.0657, "learning_rate": 4.4273018833269757e-05, "loss": 8.4051, "nf_loss": 7.2351, "ppl": 3.5928, "step": 90000 }, { "epoch": 1.28, "gate_score": 0.0632, "learning_rate": 4.356694782350697e-05, "loss": 8.2083, "nf_loss": 7.0471, "ppl": 3.5522, "step": 92000 }, { "epoch": 1.28, "eval_loss": 7.776934623718262, "eval_nf_loss": 5.991891384124756, "eval_perplexity": 6.346434593200684, "eval_runtime": 664.9292, "eval_samples_per_second": 20.104, "eval_steps_per_second": 2.513, "step": 92000 }, { "epoch": 1.31, "gate_score": 0.0614, "learning_rate": 4.2860523601633256e-05, "loss": 7.8299, "nf_loss": 6.6969, "ppl": 3.4547, "step": 94000 }, { "epoch": 1.34, "gate_score": 0.0596, "learning_rate": 4.215445259187047e-05, "loss": 7.6361, "nf_loss": 6.4891, "ppl": 3.5083, "step": 96000 }, { "epoch": 1.34, "eval_loss": 7.583176612854004, "eval_nf_loss": 5.804019927978516, "eval_perplexity": 6.316826343536377, "eval_runtime": 649.9826, "eval_samples_per_second": 20.567, "eval_steps_per_second": 2.571, "step": 96000 }, { "epoch": 1.37, "gate_score": 0.0567, "learning_rate": 4.144838158210769e-05, "loss": 7.4853, "nf_loss": 6.36, "ppl": 3.4233, "step": 98000 }, { "epoch": 1.39, "gate_score": 0.0557, "learning_rate": 4.074195736023397e-05, "loss": 7.2881, "nf_loss": 6.1511, "ppl": 3.4723, "step": 100000 }, { "epoch": 1.39, "eval_loss": 7.400701999664307, "eval_nf_loss": 5.625098705291748, "eval_perplexity": 6.289206504821777, "eval_runtime": 667.0031, "eval_samples_per_second": 20.042, "eval_steps_per_second": 2.505, "step": 100000 }, { "epoch": 1.42, "gate_score": 0.054, "learning_rate": 4.003588635047119e-05, "loss": 7.165, "nf_loss": 6.0472, "ppl": 3.3942, "step": 102000 }, { "epoch": 1.45, "gate_score": 0.0532, "learning_rate": 3.9329815340708403e-05, "loss": 6.9929, "nf_loss": 5.864, "ppl": 3.45, "step": 104000 }, { "epoch": 1.45, "eval_loss": 6.6446003913879395, "eval_nf_loss": 4.897329330444336, "eval_perplexity": 6.099964141845703, "eval_runtime": 665.1582, "eval_samples_per_second": 20.097, "eval_steps_per_second": 2.512, "step": 104000 }, { "epoch": 1.48, "gate_score": 0.0508, "learning_rate": 3.8623391118834686e-05, "loss": 6.8638, "nf_loss": 5.7412, "ppl": 3.4016, "step": 106000 }, { "epoch": 1.5, "gate_score": 0.048, "learning_rate": 3.79173201090719e-05, "loss": 6.7439, "nf_loss": 5.6285, "ppl": 3.3851, "step": 108000 }, { "epoch": 1.5, "eval_loss": 6.440381050109863, "eval_nf_loss": 4.687047004699707, "eval_perplexity": 6.147556304931641, "eval_runtime": 671.2658, "eval_samples_per_second": 19.915, "eval_steps_per_second": 2.489, "step": 108000 }, { "epoch": 1.53, "gate_score": 0.0489, "learning_rate": 3.721124909930912e-05, "loss": 6.6258, "nf_loss": 5.5227, "ppl": 3.34, "step": 110000 }, { "epoch": 1.56, "gate_score": 0.0476, "learning_rate": 3.65048248774354e-05, "loss": 6.5119, "nf_loss": 5.4155, "ppl": 3.3128, "step": 112000 }, { "epoch": 1.56, "eval_loss": 6.296238422393799, "eval_nf_loss": 4.570157527923584, "eval_perplexity": 5.967574119567871, "eval_runtime": 658.6675, "eval_samples_per_second": 20.296, "eval_steps_per_second": 2.537, "step": 112000 }, { "epoch": 1.59, "gate_score": 0.0482, "learning_rate": 3.579910707978356e-05, "loss": 6.3285, "nf_loss": 5.2244, "ppl": 3.3628, "step": 114000 }, { "epoch": 1.62, "gate_score": 0.0482, "learning_rate": 3.5092682857909834e-05, "loss": 6.2206, "nf_loss": 5.1234, "ppl": 3.3327, "step": 116000 }, { "epoch": 1.62, "eval_loss": 6.046633720397949, "eval_nf_loss": 4.321578025817871, "eval_perplexity": 5.9654130935668945, "eval_runtime": 653.7138, "eval_samples_per_second": 20.449, "eval_steps_per_second": 2.556, "step": 116000 }, { "epoch": 1.64, "gate_score": 0.0489, "learning_rate": 3.438625863603611e-05, "loss": 6.0978, "nf_loss": 5.0037, "ppl": 3.3053, "step": 118000 }, { "epoch": 1.67, "gate_score": 0.0468, "learning_rate": 3.3680540838384266e-05, "loss": 6.0198, "nf_loss": 4.9354, "ppl": 3.2737, "step": 120000 }, { "epoch": 1.67, "eval_loss": 5.779796123504639, "eval_nf_loss": 4.067526817321777, "eval_perplexity": 5.880522727966309, "eval_runtime": 641.0411, "eval_samples_per_second": 20.854, "eval_steps_per_second": 2.607, "step": 120000 }, { "epoch": 1.7, "gate_score": 0.0425, "learning_rate": 3.297446982862148e-05, "loss": 5.907, "nf_loss": 4.8253, "ppl": 3.2734, "step": 122000 }, { "epoch": 1.73, "gate_score": 0.0432, "learning_rate": 3.2268045606747765e-05, "loss": 5.7937, "nf_loss": 4.7167, "ppl": 3.2463, "step": 124000 }, { "epoch": 1.73, "eval_loss": 5.693485260009766, "eval_nf_loss": 3.997667074203491, "eval_perplexity": 5.778055667877197, "eval_runtime": 629.6888, "eval_samples_per_second": 21.23, "eval_steps_per_second": 2.654, "step": 124000 }, { "epoch": 1.76, "gate_score": 0.0413, "learning_rate": 3.156197459698498e-05, "loss": 5.6708, "nf_loss": 4.5839, "ppl": 3.2695, "step": 126000 }, { "epoch": 1.78, "gate_score": 0.0423, "learning_rate": 3.08559035872222e-05, "loss": 5.5759, "nf_loss": 4.5071, "ppl": 3.2127, "step": 128000 }, { "epoch": 1.78, "eval_loss": 5.489774227142334, "eval_nf_loss": 3.8035385608673096, "eval_perplexity": 5.725496292114258, "eval_runtime": 629.3114, "eval_samples_per_second": 21.242, "eval_steps_per_second": 2.655, "step": 128000 }, { "epoch": 1.81, "gate_score": 0.0415, "learning_rate": 3.0149832577459418e-05, "loss": 5.4762, "nf_loss": 4.4182, "ppl": 3.189, "step": 130000 }, { "epoch": 1.84, "gate_score": 0.043, "learning_rate": 2.9443761567696637e-05, "loss": 5.4304, "nf_loss": 4.3627, "ppl": 3.2132, "step": 132000 }, { "epoch": 1.84, "eval_loss": 5.458424091339111, "eval_nf_loss": 3.7697036266326904, "eval_perplexity": 5.744439125061035, "eval_runtime": 641.9111, "eval_samples_per_second": 20.825, "eval_steps_per_second": 2.603, "step": 132000 }, { "epoch": 1.87, "gate_score": 0.0438, "learning_rate": 2.8737337345822913e-05, "loss": 5.3504, "nf_loss": 4.2998, "ppl": 3.1531, "step": 134000 }, { "epoch": 1.89, "gate_score": 0.0429, "learning_rate": 2.803126633606013e-05, "loss": 5.2695, "nf_loss": 4.2383, "ppl": 3.0881, "step": 136000 }, { "epoch": 1.89, "eval_loss": 5.256904125213623, "eval_nf_loss": 3.5766844749450684, "eval_perplexity": 5.6950812339782715, "eval_runtime": 660.429, "eval_samples_per_second": 20.241, "eval_steps_per_second": 2.53, "step": 136000 }, { "epoch": 1.92, "gate_score": 0.0421, "learning_rate": 2.7324842114186412e-05, "loss": 5.2219, "nf_loss": 4.1712, "ppl": 3.1497, "step": 138000 }, { "epoch": 1.95, "gate_score": 0.0417, "learning_rate": 2.661877110442363e-05, "loss": 5.1817, "nf_loss": 4.1256, "ppl": 3.1777, "step": 140000 }, { "epoch": 1.95, "eval_loss": 5.158201217651367, "eval_nf_loss": 3.487734794616699, "eval_perplexity": 5.636016845703125, "eval_runtime": 644.0563, "eval_samples_per_second": 20.756, "eval_steps_per_second": 2.594, "step": 140000 }, { "epoch": 1.98, "gate_score": 0.0422, "learning_rate": 2.5913053306771785e-05, "loss": 5.1108, "nf_loss": 4.0728, "ppl": 3.1163, "step": 142000 }, { "epoch": 2.01, "gate_score": 0.0402, "learning_rate": 2.5206982297009e-05, "loss": 5.0219, "nf_loss": 4.0038, "ppl": 3.0436, "step": 144000 }, { "epoch": 2.01, "eval_loss": 5.062300205230713, "eval_nf_loss": 3.3892769813537598, "eval_perplexity": 5.655203819274902, "eval_runtime": 654.7044, "eval_samples_per_second": 20.418, "eval_steps_per_second": 2.552, "step": 144000 }, { "epoch": 2.03, "gate_score": 0.0407, "learning_rate": 2.4500558075135284e-05, "loss": 4.8922, "nf_loss": 3.9548, "ppl": 2.775, "step": 146000 }, { "epoch": 2.06, "gate_score": 0.0409, "learning_rate": 2.379413385326156e-05, "loss": 4.8179, "nf_loss": 3.8903, "ppl": 2.7374, "step": 148000 }, { "epoch": 2.06, "eval_loss": 4.891648769378662, "eval_nf_loss": 3.21927547454834, "eval_perplexity": 5.654395580291748, "eval_runtime": 649.5705, "eval_samples_per_second": 20.58, "eval_steps_per_second": 2.572, "step": 148000 }, { "epoch": 2.09, "gate_score": 0.0431, "learning_rate": 2.308806284349878e-05, "loss": 4.7494, "nf_loss": 3.8238, "ppl": 2.7455, "step": 150000 }, { "epoch": 2.12, "gate_score": 0.0424, "learning_rate": 2.2381991833735996e-05, "loss": 4.7155, "nf_loss": 3.7946, "ppl": 2.7302, "step": 152000 }, { "epoch": 2.12, "eval_loss": 4.806872844696045, "eval_nf_loss": 3.148994207382202, "eval_perplexity": 5.567600250244141, "eval_runtime": 630.6434, "eval_samples_per_second": 21.197, "eval_steps_per_second": 2.65, "step": 152000 }, { "epoch": 2.15, "gate_score": 0.0408, "learning_rate": 2.1675567611862275e-05, "loss": 4.6944, "nf_loss": 3.7643, "ppl": 2.7512, "step": 154000 }, { "epoch": 2.17, "gate_score": 0.0426, "learning_rate": 2.0969496602099495e-05, "loss": 4.6805, "nf_loss": 3.7557, "ppl": 2.7415, "step": 156000 }, { "epoch": 2.17, "eval_loss": 4.722836494445801, "eval_nf_loss": 3.066758632659912, "eval_perplexity": 5.557853698730469, "eval_runtime": 604.3797, "eval_samples_per_second": 22.119, "eval_steps_per_second": 2.765, "step": 156000 }, { "epoch": 2.2, "gate_score": 0.0438, "learning_rate": 2.026342559233671e-05, "loss": 4.5943, "nf_loss": 3.6721, "ppl": 2.722, "step": 158000 }, { "epoch": 2.23, "gate_score": 0.0447, "learning_rate": 1.9557354582573927e-05, "loss": 4.5781, "nf_loss": 3.6564, "ppl": 2.728, "step": 160000 }, { "epoch": 2.23, "eval_loss": 4.671295166015625, "eval_nf_loss": 3.0203938484191895, "eval_perplexity": 5.526300430297852, "eval_runtime": 604.9582, "eval_samples_per_second": 22.097, "eval_steps_per_second": 2.762, "step": 160000 }, { "epoch": 2.26, "gate_score": 0.0436, "learning_rate": 1.8851283572811147e-05, "loss": 4.5642, "nf_loss": 3.6404, "ppl": 2.7413, "step": 162000 }, { "epoch": 2.28, "gate_score": 0.0453, "learning_rate": 1.8144859350937426e-05, "loss": 4.4834, "nf_loss": 3.5645, "ppl": 2.7145, "step": 164000 }, { "epoch": 2.28, "eval_loss": 4.710845947265625, "eval_nf_loss": 3.063795566558838, "eval_perplexity": 5.505098342895508, "eval_runtime": 601.1865, "eval_samples_per_second": 22.236, "eval_steps_per_second": 2.78, "step": 164000 }, { "epoch": 2.31, "gate_score": 0.0442, "learning_rate": 1.7438788341174643e-05, "loss": 4.4998, "nf_loss": 3.5735, "ppl": 2.7361, "step": 166000 }, { "epoch": 2.34, "gate_score": 0.0442, "learning_rate": 1.6733070543522796e-05, "loss": 4.4172, "nf_loss": 3.4989, "ppl": 2.7118, "step": 168000 }, { "epoch": 2.34, "eval_loss": 4.5222930908203125, "eval_nf_loss": 2.8794257640838623, "eval_perplexity": 5.477485179901123, "eval_runtime": 609.6448, "eval_samples_per_second": 21.928, "eval_steps_per_second": 2.741, "step": 168000 }, { "epoch": 2.37, "gate_score": 0.0464, "learning_rate": 1.602664632164908e-05, "loss": 4.3817, "nf_loss": 3.4671, "ppl": 2.7031, "step": 170000 }, { "epoch": 2.4, "gate_score": 0.0447, "learning_rate": 1.5320575311886295e-05, "loss": 4.3776, "nf_loss": 3.4701, "ppl": 2.6834, "step": 172000 }, { "epoch": 2.4, "eval_loss": 4.446962356567383, "eval_nf_loss": 2.8145503997802734, "eval_perplexity": 5.419012546539307, "eval_runtime": 591.7923, "eval_samples_per_second": 22.589, "eval_steps_per_second": 2.824, "step": 172000 }, { "epoch": 2.42, "gate_score": 0.0438, "learning_rate": 1.4614151090012576e-05, "loss": 4.34, "nf_loss": 3.4303, "ppl": 2.6917, "step": 174000 }, { "epoch": 2.45, "gate_score": 0.0433, "learning_rate": 1.390808008024979e-05, "loss": 4.3096, "nf_loss": 3.4, "ppl": 2.6915, "step": 176000 }, { "epoch": 2.45, "eval_loss": 4.412790298461914, "eval_nf_loss": 2.7833149433135986, "eval_perplexity": 5.404773712158203, "eval_runtime": 593.2765, "eval_samples_per_second": 22.532, "eval_steps_per_second": 2.817, "step": 176000 }, { "epoch": 2.48, "gate_score": 0.0449, "learning_rate": 1.3201655858376072e-05, "loss": 4.289, "nf_loss": 3.3752, "ppl": 2.7063, "step": 178000 }, { "epoch": 2.51, "gate_score": 0.0431, "learning_rate": 1.2495938060724226e-05, "loss": 4.2453, "nf_loss": 3.3433, "ppl": 2.6679, "step": 180000 }, { "epoch": 2.51, "eval_loss": 4.363792896270752, "eval_nf_loss": 2.7436203956604004, "eval_perplexity": 5.350350856781006, "eval_runtime": 592.6812, "eval_samples_per_second": 22.555, "eval_steps_per_second": 2.819, "step": 180000 }, { "epoch": 2.54, "gate_score": 0.0444, "learning_rate": 1.1789867050961444e-05, "loss": 4.1812, "nf_loss": 3.2773, "ppl": 2.6769, "step": 182000 }, { "epoch": 2.56, "gate_score": 0.0439, "learning_rate": 1.1083442829087724e-05, "loss": 4.2063, "nf_loss": 3.3076, "ppl": 2.6592, "step": 184000 }, { "epoch": 2.56, "eval_loss": 4.401766777038574, "eval_nf_loss": 2.7845916748046875, "eval_perplexity": 5.334909915924072, "eval_runtime": 593.569, "eval_samples_per_second": 22.521, "eval_steps_per_second": 2.815, "step": 184000 }, { "epoch": 2.59, "gate_score": 0.0435, "learning_rate": 1.0377371819324942e-05, "loss": 4.1697, "nf_loss": 3.2627, "ppl": 2.6781, "step": 186000 }, { "epoch": 2.62, "gate_score": 0.0455, "learning_rate": 9.67130080956216e-06, "loss": 4.1413, "nf_loss": 3.2508, "ppl": 2.6383, "step": 188000 }, { "epoch": 2.62, "eval_loss": 4.287137031555176, "eval_nf_loss": 2.6753435134887695, "eval_perplexity": 5.3036627769470215, "eval_runtime": 592.6474, "eval_samples_per_second": 22.556, "eval_steps_per_second": 2.82, "step": 188000 }, { "epoch": 2.65, "gate_score": 0.0438, "learning_rate": 8.964876587688439e-06, "loss": 4.1244, "nf_loss": 3.2269, "ppl": 2.6537, "step": 190000 }, { "epoch": 2.67, "gate_score": 0.0446, "learning_rate": 8.258805577925655e-06, "loss": 4.0845, "nf_loss": 3.1966, "ppl": 2.6201, "step": 192000 }, { "epoch": 2.67, "eval_loss": 4.247885704040527, "eval_nf_loss": 2.640995502471924, "eval_perplexity": 5.275094985961914, "eval_runtime": 591.9491, "eval_samples_per_second": 22.583, "eval_steps_per_second": 2.823, "step": 192000 }, { "epoch": 2.7, "gate_score": 0.0445, "learning_rate": 7.552734568162874e-06, "loss": 4.0604, "nf_loss": 3.1769, "ppl": 2.6091, "step": 194000 }, { "epoch": 2.73, "gate_score": 0.0433, "learning_rate": 6.846310346289154e-06, "loss": 4.06, "nf_loss": 3.1758, "ppl": 2.6114, "step": 196000 }, { "epoch": 2.73, "eval_loss": 4.171822547912598, "eval_nf_loss": 2.567200183868408, "eval_perplexity": 5.264888763427734, "eval_runtime": 589.4312, "eval_samples_per_second": 22.679, "eval_steps_per_second": 2.835, "step": 196000 }, { "epoch": 2.76, "gate_score": 0.0436, "learning_rate": 6.140239336526371e-06, "loss": 4.0426, "nf_loss": 3.1603, "ppl": 2.6136, "step": 198000 }, { "epoch": 2.79, "gate_score": 0.0444, "learning_rate": 5.4338151146526515e-06, "loss": 4.0352, "nf_loss": 3.1654, "ppl": 2.5709, "step": 200000 }, { "epoch": 2.79, "eval_loss": 4.141200542449951, "eval_nf_loss": 2.539963483810425, "eval_perplexity": 5.245651721954346, "eval_runtime": 622.9626, "eval_samples_per_second": 21.459, "eval_steps_per_second": 2.682, "step": 200000 }, { "epoch": 2.81, "gate_score": 0.0442, "learning_rate": 4.727744104889869e-06, "loss": 3.9924, "nf_loss": 3.1183, "ppl": 2.5982, "step": 202000 }, { "epoch": 2.84, "gate_score": 0.0437, "learning_rate": 4.021673095127087e-06, "loss": 3.982, "nf_loss": 3.0969, "ppl": 2.6167, "step": 204000 }, { "epoch": 2.84, "eval_loss": 4.099726676940918, "eval_nf_loss": 2.5044403076171875, "eval_perplexity": 5.212295055389404, "eval_runtime": 628.0641, "eval_samples_per_second": 21.284, "eval_steps_per_second": 2.661, "step": 204000 }, { "epoch": 2.87, "gate_score": 0.044, "learning_rate": 3.315248873253366e-06, "loss": 3.9519, "nf_loss": 3.0778, "ppl": 2.5849, "step": 206000 }, { "epoch": 2.9, "gate_score": 0.0427, "learning_rate": 2.6091778634905835e-06, "loss": 3.9487, "nf_loss": 3.082, "ppl": 2.5664, "step": 208000 }, { "epoch": 2.9, "eval_loss": 4.066439151763916, "eval_nf_loss": 2.474716901779175, "eval_perplexity": 5.1935272216796875, "eval_runtime": 625.0657, "eval_samples_per_second": 21.387, "eval_steps_per_second": 2.673, "step": 208000 }, { "epoch": 2.93, "gate_score": 0.0425, "learning_rate": 1.9031068537278006e-06, "loss": 3.9389, "nf_loss": 3.0634, "ppl": 2.5914, "step": 210000 }, { "epoch": 2.95, "gate_score": 0.0424, "learning_rate": 1.196682631854081e-06, "loss": 3.9603, "nf_loss": 3.0926, "ppl": 2.5703, "step": 212000 }, { "epoch": 2.95, "eval_loss": 4.057208061218262, "eval_nf_loss": 2.4683539867401123, "eval_perplexity": 5.177720546722412, "eval_runtime": 618.2401, "eval_samples_per_second": 21.623, "eval_steps_per_second": 2.703, "step": 212000 } ], "max_steps": 215337, "num_train_epochs": 3, "total_flos": 2.1693560999113897e+18, "trial_name": null, "trial_params": null }