{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 782, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 6.329113924050633e-09, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -403.3199157714844, "logps/real": -443.6107177734375, "loss": 4.7453, "rewards/accuracies": 0.5, "rewards/generated": -12.943833351135254, "rewards/margins": -2.53641414642334, "rewards/real": -15.48024845123291, "step": 1 }, { "epoch": 0.01, "learning_rate": 6.329113924050633e-08, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -487.2853088378906, "logps/real": -384.8033142089844, "loss": 3.4516, "rewards/accuracies": 0.6111111044883728, "rewards/generated": -19.51491928100586, "rewards/margins": 6.083561420440674, "rewards/real": -13.431358337402344, "step": 10 }, { "epoch": 0.03, "learning_rate": 1.2658227848101266e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -473.446044921875, "logps/real": -413.171630859375, "loss": 3.9705, "rewards/accuracies": 0.637499988079071, "rewards/generated": -18.367900848388672, "rewards/margins": 3.9024269580841064, "rewards/real": -14.465472221374512, "step": 20 }, { "epoch": 0.04, "learning_rate": 1.89873417721519e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -510.4585876464844, "logps/real": -408.74212646484375, "loss": 3.6955, "rewards/accuracies": 0.6875, "rewards/generated": -21.059778213500977, "rewards/margins": 7.762242317199707, "rewards/real": -13.297533988952637, "step": 30 }, { "epoch": 0.05, "learning_rate": 2.5316455696202533e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -451.734130859375, "logps/real": -368.16998291015625, "loss": 3.9056, "rewards/accuracies": 0.625, "rewards/generated": -16.805126190185547, "rewards/margins": 4.4605584144592285, "rewards/real": -12.344568252563477, "step": 40 }, { "epoch": 0.06, "learning_rate": 3.1645569620253163e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -552.9141845703125, "logps/real": -397.95318603515625, "loss": 2.9742, "rewards/accuracies": 0.737500011920929, "rewards/generated": -25.695674896240234, "rewards/margins": 10.653547286987305, "rewards/real": -15.042126655578613, "step": 50 }, { "epoch": 0.08, "learning_rate": 3.79746835443038e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -509.5240783691406, "logps/real": -390.6533203125, "loss": 2.5682, "rewards/accuracies": 0.737500011920929, "rewards/generated": -22.568603515625, "rewards/margins": 7.904494285583496, "rewards/real": -14.664111137390137, "step": 60 }, { "epoch": 0.09, "learning_rate": 4.4303797468354424e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -554.5638427734375, "logps/real": -382.84832763671875, "loss": 1.7793, "rewards/accuracies": 0.8374999761581421, "rewards/generated": -28.215290069580078, "rewards/margins": 14.945714950561523, "rewards/real": -13.269571304321289, "step": 70 }, { "epoch": 0.1, "learning_rate": 4.992887624466572e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -574.03759765625, "logps/real": -402.69232177734375, "loss": 1.4194, "rewards/accuracies": 0.800000011920929, "rewards/generated": -28.118602752685547, "rewards/margins": 12.523630142211914, "rewards/real": -15.594972610473633, "step": 80 }, { "epoch": 0.12, "learning_rate": 4.92176386913229e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -577.7903442382812, "logps/real": -401.81561279296875, "loss": 1.2635, "rewards/accuracies": 0.7749999761581421, "rewards/generated": -30.364093780517578, "rewards/margins": 14.755389213562012, "rewards/real": -15.608701705932617, "step": 90 }, { "epoch": 0.13, "learning_rate": 4.850640113798008e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -654.4276733398438, "logps/real": -442.893798828125, "loss": 0.9213, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -35.57740020751953, "rewards/margins": 18.7423038482666, "rewards/real": -16.835100173950195, "step": 100 }, { "epoch": 0.14, "learning_rate": 4.779516358463727e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -637.9244384765625, "logps/real": -426.7471618652344, "loss": 1.0798, "rewards/accuracies": 0.875, "rewards/generated": -35.024356842041016, "rewards/margins": 18.848251342773438, "rewards/real": -16.17610740661621, "step": 110 }, { "epoch": 0.15, "learning_rate": 4.7083926031294454e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -647.3193969726562, "logps/real": -419.98095703125, "loss": 0.7428, "rewards/accuracies": 0.9375, "rewards/generated": -35.827938079833984, "rewards/margins": 21.43227767944336, "rewards/real": -14.395665168762207, "step": 120 }, { "epoch": 0.17, "learning_rate": 4.6372688477951633e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -647.6121826171875, "logps/real": -413.42401123046875, "loss": 0.6566, "rewards/accuracies": 0.875, "rewards/generated": -35.408538818359375, "rewards/margins": 20.003904342651367, "rewards/real": -15.404635429382324, "step": 130 }, { "epoch": 0.18, "learning_rate": 4.5661450924608817e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -697.9991455078125, "logps/real": -401.5552673339844, "loss": 0.9421, "rewards/accuracies": 0.9375, "rewards/generated": -38.38452911376953, "rewards/margins": 23.595943450927734, "rewards/real": -14.788581848144531, "step": 140 }, { "epoch": 0.19, "learning_rate": 4.4950213371266e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -672.8363037109375, "logps/real": -401.61590576171875, "loss": 0.6587, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -38.686241149902344, "rewards/margins": 23.072139739990234, "rewards/real": -15.614102363586426, "step": 150 }, { "epoch": 0.2, "learning_rate": 4.4238975817923186e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -641.0380859375, "logps/real": -377.9221496582031, "loss": 0.6261, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -35.72499084472656, "rewards/margins": 21.804719924926758, "rewards/real": -13.920272827148438, "step": 160 }, { "epoch": 0.22, "learning_rate": 4.3527738264580364e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -676.1116943359375, "logps/real": -410.857421875, "loss": 0.6189, "rewards/accuracies": 0.9125000238418579, "rewards/generated": -37.884979248046875, "rewards/margins": 23.008445739746094, "rewards/real": -14.876535415649414, "step": 170 }, { "epoch": 0.23, "learning_rate": 4.2816500711237554e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -654.6505737304688, "logps/real": -370.24224853515625, "loss": 0.3247, "rewards/accuracies": 0.9375, "rewards/generated": -36.68430709838867, "rewards/margins": 23.027376174926758, "rewards/real": -13.656933784484863, "step": 180 }, { "epoch": 0.24, "learning_rate": 4.2105263157894733e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -665.4993286132812, "logps/real": -364.58880615234375, "loss": 0.5709, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -37.38275146484375, "rewards/margins": 25.380136489868164, "rewards/real": -12.002609252929688, "step": 190 }, { "epoch": 0.26, "learning_rate": 4.1394025604551917e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -684.380615234375, "logps/real": -392.02288818359375, "loss": 0.6823, "rewards/accuracies": 0.8999999761581421, "rewards/generated": -38.82280349731445, "rewards/margins": 24.800228118896484, "rewards/real": -14.02257251739502, "step": 200 }, { "epoch": 0.27, "learning_rate": 4.06827880512091e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -741.0132446289062, "logps/real": -383.6390075683594, "loss": 0.5369, "rewards/accuracies": 0.925000011920929, "rewards/generated": -43.17304229736328, "rewards/margins": 28.80326271057129, "rewards/real": -14.369776725769043, "step": 210 }, { "epoch": 0.28, "learning_rate": 3.9971550497866285e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -665.7184448242188, "logps/real": -401.6400146484375, "loss": 0.3696, "rewards/accuracies": 0.9375, "rewards/generated": -38.721866607666016, "rewards/margins": 24.154865264892578, "rewards/real": -14.567001342773438, "step": 220 }, { "epoch": 0.29, "learning_rate": 3.926031294452347e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -681.5037231445312, "logps/real": -394.03509521484375, "loss": 0.2532, "rewards/accuracies": 0.949999988079071, "rewards/generated": -38.438758850097656, "rewards/margins": 23.877817153930664, "rewards/real": -14.560938835144043, "step": 230 }, { "epoch": 0.31, "learning_rate": 3.8549075391180653e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -705.4946899414062, "logps/real": -414.25146484375, "loss": 0.4719, "rewards/accuracies": 0.925000011920929, "rewards/generated": -40.05225372314453, "rewards/margins": 24.665658950805664, "rewards/real": -15.386594772338867, "step": 240 }, { "epoch": 0.32, "learning_rate": 3.783783783783784e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -718.469970703125, "logps/real": -393.9696350097656, "loss": 0.326, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -43.078765869140625, "rewards/margins": 28.980609893798828, "rewards/real": -14.09815788269043, "step": 250 }, { "epoch": 0.33, "learning_rate": 3.7126600284495016e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -721.9268798828125, "logps/real": -403.2998046875, "loss": 0.4804, "rewards/accuracies": 0.9375, "rewards/generated": -41.29940414428711, "rewards/margins": 26.352609634399414, "rewards/real": -14.946797370910645, "step": 260 }, { "epoch": 0.35, "learning_rate": 3.6415362731152206e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -690.9553833007812, "logps/real": -374.5037536621094, "loss": 0.3827, "rewards/accuracies": 0.949999988079071, "rewards/generated": -40.69682693481445, "rewards/margins": 26.715112686157227, "rewards/real": -13.981710433959961, "step": 270 }, { "epoch": 0.36, "learning_rate": 3.5704125177809385e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -725.0104370117188, "logps/real": -406.488037109375, "loss": 0.4101, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -42.66962432861328, "rewards/margins": 27.242467880249023, "rewards/real": -15.427154541015625, "step": 280 }, { "epoch": 0.37, "learning_rate": 3.4992887624466574e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -697.3206787109375, "logps/real": -400.65753173828125, "loss": 0.3629, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -39.595970153808594, "rewards/margins": 24.689983367919922, "rewards/real": -14.90599250793457, "step": 290 }, { "epoch": 0.38, "learning_rate": 3.4281650071123753e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -711.7612915039062, "logps/real": -388.0057678222656, "loss": 0.2122, "rewards/accuracies": 0.949999988079071, "rewards/generated": -42.39947509765625, "rewards/margins": 27.109222412109375, "rewards/real": -15.290254592895508, "step": 300 }, { "epoch": 0.4, "learning_rate": 3.3570412517780937e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -713.0751953125, "logps/real": -423.7640686035156, "loss": 0.3657, "rewards/accuracies": 0.9125000238418579, "rewards/generated": -41.861183166503906, "rewards/margins": 26.213199615478516, "rewards/real": -15.647982597351074, "step": 310 }, { "epoch": 0.41, "learning_rate": 3.285917496443812e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -695.4622192382812, "logps/real": -386.4324645996094, "loss": 0.3005, "rewards/accuracies": 0.925000011920929, "rewards/generated": -39.45234298706055, "rewards/margins": 25.696590423583984, "rewards/real": -13.755752563476562, "step": 320 }, { "epoch": 0.42, "learning_rate": 3.2147937411095305e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -774.9369506835938, "logps/real": -426.2724609375, "loss": 0.2325, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -48.17813491821289, "rewards/margins": 31.29937171936035, "rewards/real": -16.87876319885254, "step": 330 }, { "epoch": 0.43, "learning_rate": 3.1436699857752484e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -714.9234008789062, "logps/real": -395.0069885253906, "loss": 0.2981, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -41.547645568847656, "rewards/margins": 27.00480079650879, "rewards/real": -14.54284381866455, "step": 340 }, { "epoch": 0.45, "learning_rate": 3.0725462304409674e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -721.8656005859375, "logps/real": -400.1606750488281, "loss": 0.2019, "rewards/accuracies": 0.9375, "rewards/generated": -44.031803131103516, "rewards/margins": 28.76715087890625, "rewards/real": -15.264646530151367, "step": 350 }, { "epoch": 0.46, "learning_rate": 3.001422475106685e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -728.89599609375, "logps/real": -470.1971130371094, "loss": 0.5931, "rewards/accuracies": 0.9375, "rewards/generated": -43.573814392089844, "rewards/margins": 24.9804630279541, "rewards/real": -18.593351364135742, "step": 360 }, { "epoch": 0.47, "learning_rate": 2.9302987197724037e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -667.4353637695312, "logps/real": -369.68328857421875, "loss": 0.4031, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -38.71464920043945, "rewards/margins": 24.013246536254883, "rewards/real": -14.701400756835938, "step": 370 }, { "epoch": 0.49, "learning_rate": 2.8591749644381226e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -708.6209716796875, "logps/real": -395.7604675292969, "loss": 0.3691, "rewards/accuracies": 0.9375, "rewards/generated": -42.240318298339844, "rewards/margins": 26.32097816467285, "rewards/real": -15.919347763061523, "step": 380 }, { "epoch": 0.5, "learning_rate": 2.7880512091038405e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -766.4454345703125, "logps/real": -423.30755615234375, "loss": 0.2758, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -47.435791015625, "rewards/margins": 30.477214813232422, "rewards/real": -16.958572387695312, "step": 390 }, { "epoch": 0.51, "learning_rate": 2.716927453769559e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -768.6455078125, "logps/real": -449.0997009277344, "loss": 0.3455, "rewards/accuracies": 0.949999988079071, "rewards/generated": -47.01398849487305, "rewards/margins": 28.302906036376953, "rewards/real": -18.711084365844727, "step": 400 }, { "epoch": 0.52, "learning_rate": 2.6458036984352773e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -734.2765502929688, "logps/real": -447.5357971191406, "loss": 0.375, "rewards/accuracies": 0.9375, "rewards/generated": -46.2476921081543, "rewards/margins": 27.413455963134766, "rewards/real": -18.834239959716797, "step": 410 }, { "epoch": 0.54, "learning_rate": 2.574679943100996e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -730.5106201171875, "logps/real": -453.9371643066406, "loss": 0.0816, "rewards/accuracies": 0.925000011920929, "rewards/generated": -45.583648681640625, "rewards/margins": 27.334802627563477, "rewards/real": -18.248844146728516, "step": 420 }, { "epoch": 0.55, "learning_rate": 2.5035561877667136e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -766.5603637695312, "logps/real": -438.89788818359375, "loss": 0.1735, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -46.9077262878418, "rewards/margins": 29.52130126953125, "rewards/real": -17.386432647705078, "step": 430 }, { "epoch": 0.56, "learning_rate": 2.4324324324324326e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -781.421630859375, "logps/real": -405.1501770019531, "loss": 0.3965, "rewards/accuracies": 0.987500011920929, "rewards/generated": -49.6665153503418, "rewards/margins": 33.62762451171875, "rewards/real": -16.03889274597168, "step": 440 }, { "epoch": 0.58, "learning_rate": 2.3613086770981507e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -760.2469482421875, "logps/real": -404.4407653808594, "loss": 0.4128, "rewards/accuracies": 0.949999988079071, "rewards/generated": -47.4620361328125, "rewards/margins": 30.4351863861084, "rewards/real": -17.026851654052734, "step": 450 }, { "epoch": 0.59, "learning_rate": 2.290184921763869e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -820.1843872070312, "logps/real": -420.02587890625, "loss": 0.1306, "rewards/accuracies": 1.0, "rewards/generated": -51.192138671875, "rewards/margins": 35.38946533203125, "rewards/real": -15.8026704788208, "step": 460 }, { "epoch": 0.6, "learning_rate": 2.2190611664295875e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -759.2930908203125, "logps/real": -438.888916015625, "loss": 0.2487, "rewards/accuracies": 0.925000011920929, "rewards/generated": -45.420188903808594, "rewards/margins": 27.34377670288086, "rewards/real": -18.0764102935791, "step": 470 }, { "epoch": 0.61, "learning_rate": 2.1479374110953057e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -777.7584838867188, "logps/real": -388.6899108886719, "loss": 0.3922, "rewards/accuracies": 0.949999988079071, "rewards/generated": -48.69025421142578, "rewards/margins": 32.126991271972656, "rewards/real": -16.563264846801758, "step": 480 }, { "epoch": 0.63, "learning_rate": 2.076813655761024e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -804.2822875976562, "logps/real": -438.558837890625, "loss": 0.2558, "rewards/accuracies": 0.987500011920929, "rewards/generated": -50.539390563964844, "rewards/margins": 31.528533935546875, "rewards/real": -19.0108585357666, "step": 490 }, { "epoch": 0.64, "learning_rate": 2.0056899004267425e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -799.6202392578125, "logps/real": -483.55096435546875, "loss": 0.2945, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -50.81739807128906, "rewards/margins": 30.668746948242188, "rewards/real": -20.14865493774414, "step": 500 }, { "epoch": 0.64, "eval_logits/generated": -Infinity, "eval_logits/real": -Infinity, "eval_logps/generated": -509.32861328125, "eval_logps/real": -313.0279541015625, "eval_loss": 0.1748354285955429, "eval_rewards/accuracies": 0.9442675113677979, "eval_rewards/generated": -21.789350509643555, "eval_rewards/margins": 15.725247383117676, "eval_rewards/real": -6.0641021728515625, "eval_runtime": 590.026, "eval_samples_per_second": 8.474, "eval_steps_per_second": 0.266, "step": 500 }, { "epoch": 0.65, "learning_rate": 1.9345661450924607e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -757.3658447265625, "logps/real": -412.8812561035156, "loss": 0.2523, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -45.53099822998047, "rewards/margins": 29.076038360595703, "rewards/real": -16.454959869384766, "step": 510 }, { "epoch": 0.66, "learning_rate": 1.863442389758179e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -780.4212036132812, "logps/real": -437.5557556152344, "loss": 0.2147, "rewards/accuracies": 0.987500011920929, "rewards/generated": -49.51309585571289, "rewards/margins": 32.93183517456055, "rewards/real": -16.581256866455078, "step": 520 }, { "epoch": 0.68, "learning_rate": 1.7923186344238975e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -735.9564208984375, "logps/real": -407.6651306152344, "loss": 0.2975, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -44.98466491699219, "rewards/margins": 29.423254013061523, "rewards/real": -15.561413764953613, "step": 530 }, { "epoch": 0.69, "learning_rate": 1.721194879089616e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -807.4033203125, "logps/real": -429.4000549316406, "loss": 0.2254, "rewards/accuracies": 0.949999988079071, "rewards/generated": -51.46739959716797, "rewards/margins": 32.75578689575195, "rewards/real": -18.711612701416016, "step": 540 }, { "epoch": 0.7, "learning_rate": 1.650071123755334e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -811.77099609375, "logps/real": -425.84649658203125, "loss": 0.3105, "rewards/accuracies": 0.949999988079071, "rewards/generated": -52.33929443359375, "rewards/margins": 34.459083557128906, "rewards/real": -17.88020896911621, "step": 550 }, { "epoch": 0.72, "learning_rate": 1.5789473684210525e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -788.242919921875, "logps/real": -440.695556640625, "loss": 0.1702, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -49.456298828125, "rewards/margins": 31.41439437866211, "rewards/real": -18.041906356811523, "step": 560 }, { "epoch": 0.73, "learning_rate": 1.507823613086771e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -752.26611328125, "logps/real": -427.10113525390625, "loss": 0.3281, "rewards/accuracies": 0.9375, "rewards/generated": -47.79071807861328, "rewards/margins": 30.950246810913086, "rewards/real": -16.84047508239746, "step": 570 }, { "epoch": 0.74, "learning_rate": 1.436699857752489e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -826.0186767578125, "logps/real": -424.66680908203125, "loss": 0.3042, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -51.909454345703125, "rewards/margins": 34.6719856262207, "rewards/real": -17.237468719482422, "step": 580 }, { "epoch": 0.75, "learning_rate": 1.3655761024182077e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -816.61962890625, "logps/real": -420.100830078125, "loss": 0.2635, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -52.91407012939453, "rewards/margins": 36.203983306884766, "rewards/real": -16.710086822509766, "step": 590 }, { "epoch": 0.77, "learning_rate": 1.2944523470839261e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -820.4493408203125, "logps/real": -428.4378967285156, "loss": 0.347, "rewards/accuracies": 0.9375, "rewards/generated": -51.51692581176758, "rewards/margins": 34.30039596557617, "rewards/real": -17.216527938842773, "step": 600 }, { "epoch": 0.78, "learning_rate": 1.2233285917496443e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -768.8092041015625, "logps/real": -420.62200927734375, "loss": 0.0588, "rewards/accuracies": 0.987500011920929, "rewards/generated": -47.871437072753906, "rewards/margins": 32.33845901489258, "rewards/real": -15.532976150512695, "step": 610 }, { "epoch": 0.79, "learning_rate": 1.1522048364153626e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -786.0013427734375, "logps/real": -442.2757873535156, "loss": 0.2483, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -50.278663635253906, "rewards/margins": 33.101016998291016, "rewards/real": -17.17764663696289, "step": 620 }, { "epoch": 0.81, "learning_rate": 1.0810810810810811e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -787.63818359375, "logps/real": -404.19122314453125, "loss": 0.1336, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -50.15456008911133, "rewards/margins": 33.99864959716797, "rewards/real": -16.15591049194336, "step": 630 }, { "epoch": 0.82, "learning_rate": 1.0099573257467994e-07, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -830.99462890625, "logps/real": -457.3229064941406, "loss": 0.2925, "rewards/accuracies": 0.949999988079071, "rewards/generated": -53.06385040283203, "rewards/margins": 34.269439697265625, "rewards/real": -18.794404983520508, "step": 640 }, { "epoch": 0.83, "learning_rate": 9.388335704125178e-08, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -777.55517578125, "logps/real": -405.3326110839844, "loss": 0.137, "rewards/accuracies": 1.0, "rewards/generated": -49.94211196899414, "rewards/margins": 32.77922821044922, "rewards/real": -17.16288948059082, "step": 650 }, { "epoch": 0.84, "learning_rate": 8.677098150782361e-08, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -775.7921142578125, "logps/real": -408.35302734375, "loss": 0.0992, "rewards/accuracies": 0.987500011920929, "rewards/generated": -48.498905181884766, "rewards/margins": 32.695289611816406, "rewards/real": -15.803617477416992, "step": 660 }, { "epoch": 0.86, "learning_rate": 7.965860597439544e-08, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -764.4813232421875, "logps/real": -427.140625, "loss": 0.2396, "rewards/accuracies": 0.925000011920929, "rewards/generated": -48.443748474121094, "rewards/margins": 31.320148468017578, "rewards/real": -17.123600006103516, "step": 670 }, { "epoch": 0.87, "learning_rate": 7.254623044096728e-08, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -803.4662475585938, "logps/real": -435.402099609375, "loss": 0.2362, "rewards/accuracies": 0.987500011920929, "rewards/generated": -51.644203186035156, "rewards/margins": 33.23621368408203, "rewards/real": -18.40799331665039, "step": 680 }, { "epoch": 0.88, "learning_rate": 6.543385490753911e-08, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -863.4392700195312, "logps/real": -439.1971130371094, "loss": 0.1949, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -55.93394088745117, "rewards/margins": 36.83943176269531, "rewards/real": -19.094507217407227, "step": 690 }, { "epoch": 0.9, "learning_rate": 5.832147937411095e-08, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -779.9309692382812, "logps/real": -411.91455078125, "loss": 0.2666, "rewards/accuracies": 0.9375, "rewards/generated": -47.416011810302734, "rewards/margins": 29.500951766967773, "rewards/real": -17.915063858032227, "step": 700 }, { "epoch": 0.91, "learning_rate": 5.120910384068278e-08, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -826.6921997070312, "logps/real": -447.87762451171875, "loss": 0.258, "rewards/accuracies": 0.949999988079071, "rewards/generated": -53.74137496948242, "rewards/margins": 35.463531494140625, "rewards/real": -18.277841567993164, "step": 710 }, { "epoch": 0.92, "learning_rate": 4.4096728307254624e-08, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -834.2398681640625, "logps/real": -427.90966796875, "loss": 0.1272, "rewards/accuracies": 0.987500011920929, "rewards/generated": -53.82038497924805, "rewards/margins": 36.249298095703125, "rewards/real": -17.571086883544922, "step": 720 }, { "epoch": 0.93, "learning_rate": 3.698435277382646e-08, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -838.7642822265625, "logps/real": -436.14642333984375, "loss": 0.2078, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -55.14348602294922, "rewards/margins": 37.05797576904297, "rewards/real": -18.08551597595215, "step": 730 }, { "epoch": 0.95, "learning_rate": 2.9871977240398294e-08, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -798.5349731445312, "logps/real": -427.6378479003906, "loss": 0.2621, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -52.086219787597656, "rewards/margins": 34.135223388671875, "rewards/real": -17.95099449157715, "step": 740 }, { "epoch": 0.96, "learning_rate": 2.275960170697013e-08, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -835.6724853515625, "logps/real": -414.00152587890625, "loss": 0.1564, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -52.597930908203125, "rewards/margins": 36.33824920654297, "rewards/real": -16.259681701660156, "step": 750 }, { "epoch": 0.97, "learning_rate": 1.564722617354196e-08, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -850.904296875, "logps/real": -392.68310546875, "loss": 0.1515, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -55.202125549316406, "rewards/margins": 38.95097351074219, "rewards/real": -16.25115394592285, "step": 760 }, { "epoch": 0.98, "learning_rate": 8.534850640113798e-09, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -814.2432250976562, "logps/real": -415.373779296875, "loss": 0.1867, "rewards/accuracies": 0.9624999761581421, "rewards/generated": -53.38776397705078, "rewards/margins": 37.246055603027344, "rewards/real": -16.141704559326172, "step": 770 }, { "epoch": 1.0, "learning_rate": 1.422475106685633e-09, "logits/generated": -Infinity, "logits/real": -Infinity, "logps/generated": -811.1026611328125, "logps/real": -388.99835205078125, "loss": 0.2403, "rewards/accuracies": 0.9750000238418579, "rewards/generated": -51.92308807373047, "rewards/margins": 35.41986846923828, "rewards/real": -16.503215789794922, "step": 780 }, { "epoch": 1.0, "step": 782, "total_flos": 0.0, "train_loss": 0.6284351689954493, "train_runtime": 6560.8804, "train_samples_per_second": 3.81, "train_steps_per_second": 0.119 } ], "logging_steps": 10, "max_steps": 782, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }