diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,3730 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 2421, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 61.79818261705088, + "learning_rate": 2.05761316872428e-09, + "logits/chosen": -3.5, + "logits/rejected": -1.4140625, + "logps/chosen": -262.0, + "logps/rejected": -788.0, + "loss": 0.6914, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "grad_norm": 71.93619140446947, + "learning_rate": 2.0576131687242796e-08, + "logits/chosen": -1.9140625, + "logits/rejected": -3.0625, + "logps/chosen": -648.0, + "logps/rejected": -760.0, + "loss": 0.6928, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.033447265625, + "rewards/margins": -0.033447265625, + "rewards/rejected": 0.0, + "step": 10 + }, + { + "epoch": 0.02, + "grad_norm": 95.51795034006538, + "learning_rate": 4.115226337448559e-08, + "logits/chosen": -1.828125, + "logits/rejected": -2.671875, + "logps/chosen": -484.0, + "logps/rejected": -352.0, + "loss": 0.69, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": 0.02001953125, + "rewards/margins": 0.0, + "rewards/rejected": 0.02001953125, + "step": 20 + }, + { + "epoch": 0.04, + "grad_norm": 83.15780879075649, + "learning_rate": 6.172839506172839e-08, + "logits/chosen": -1.8671875, + "logits/rejected": -1.75, + "logps/chosen": -404.0, + "logps/rejected": -540.0, + "loss": 0.6933, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -0.0150146484375, + "rewards/margins": -0.02001953125, + "rewards/rejected": 0.0050048828125, + "step": 30 + }, + { + "epoch": 0.05, + "grad_norm": 78.74493611311031, + "learning_rate": 8.230452674897118e-08, + "logits/chosen": -2.015625, + "logits/rejected": -2.265625, + "logps/chosen": -448.0, + "logps/rejected": -600.0, + "loss": 0.692, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0250244140625, + "rewards/margins": 0.06005859375, + "rewards/rejected": -0.03515625, + "step": 40 + }, + { + "epoch": 0.06, + "grad_norm": 84.22871977974292, + "learning_rate": 1.02880658436214e-07, + "logits/chosen": -1.6640625, + "logits/rejected": -2.421875, + "logps/chosen": -592.0, + "logps/rejected": -580.0, + "loss": 0.6922, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -0.0400390625, + "rewards/margins": -0.030029296875, + "rewards/rejected": -0.010009765625, + "step": 50 + }, + { + "epoch": 0.07, + "grad_norm": 57.8386192430028, + "learning_rate": 1.2345679012345677e-07, + "logits/chosen": -2.03125, + "logits/rejected": -2.9375, + "logps/chosen": -362.0, + "logps/rejected": -368.0, + "loss": 0.6872, + "rewards/accuracies": 0.10000000149011612, + "rewards/chosen": -0.0050048828125, + "rewards/margins": -0.044921875, + "rewards/rejected": 0.0400390625, + "step": 60 + }, + { + "epoch": 0.09, + "grad_norm": 108.85645337456651, + "learning_rate": 1.4403292181069958e-07, + "logits/chosen": -2.140625, + "logits/rejected": -2.4375, + "logps/chosen": -460.0, + "logps/rejected": -468.0, + "loss": 0.6888, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.05517578125, + "rewards/margins": 0.0751953125, + "rewards/rejected": -0.02001953125, + "step": 70 + }, + { + "epoch": 0.1, + "grad_norm": 70.2187244330304, + "learning_rate": 1.6460905349794237e-07, + "logits/chosen": -1.8984375, + "logits/rejected": -2.28125, + "logps/chosen": -516.0, + "logps/rejected": -506.0, + "loss": 0.687, + "rewards/accuracies": 0.20000000298023224, + "rewards/chosen": -0.050048828125, + "rewards/margins": -0.03515625, + "rewards/rejected": -0.0150146484375, + "step": 80 + }, + { + "epoch": 0.11, + "grad_norm": 54.744834672425526, + "learning_rate": 1.8518518518518516e-07, + "logits/chosen": -1.7578125, + "logits/rejected": -2.078125, + "logps/chosen": -532.0, + "logps/rejected": -660.0, + "loss": 0.677, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": 0.050048828125, + "rewards/margins": 0.06494140625, + "rewards/rejected": -0.0150146484375, + "step": 90 + }, + { + "epoch": 0.12, + "grad_norm": 98.04764626666417, + "learning_rate": 2.05761316872428e-07, + "logits/chosen": -1.6015625, + "logits/rejected": -2.234375, + "logps/chosen": -560.0, + "logps/rejected": -660.0, + "loss": 0.6657, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": 0.03515625, + "rewards/margins": -0.0301513671875, + "rewards/rejected": 0.06494140625, + "step": 100 + }, + { + "epoch": 0.14, + "grad_norm": 78.67800061099335, + "learning_rate": 2.2633744855967078e-07, + "logits/chosen": -2.03125, + "logits/rejected": -2.703125, + "logps/chosen": -484.0, + "logps/rejected": -428.0, + "loss": 0.6618, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.054931640625, + "rewards/margins": 0.06982421875, + "rewards/rejected": -0.01507568359375, + "step": 110 + }, + { + "epoch": 0.15, + "grad_norm": 98.90783538853094, + "learning_rate": 2.4691358024691354e-07, + "logits/chosen": -1.8984375, + "logits/rejected": -1.71875, + "logps/chosen": -668.0, + "logps/rejected": -752.0, + "loss": 0.6576, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.25, + "rewards/margins": 0.205078125, + "rewards/rejected": 0.045166015625, + "step": 120 + }, + { + "epoch": 0.16, + "grad_norm": 183.377786164052, + "learning_rate": 2.6748971193415635e-07, + "logits/chosen": -2.140625, + "logits/rejected": -2.78125, + "logps/chosen": -408.0, + "logps/rejected": -358.0, + "loss": 0.6568, + "rewards/accuracies": 0.30000001192092896, + "rewards/chosen": -0.09521484375, + "rewards/margins": 0.02001953125, + "rewards/rejected": -0.115234375, + "step": 130 + }, + { + "epoch": 0.17, + "grad_norm": 75.16806808811697, + "learning_rate": 2.8806584362139917e-07, + "logits/chosen": -1.359375, + "logits/rejected": -2.21875, + "logps/chosen": -668.0, + "logps/rejected": -556.0, + "loss": 0.6513, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.0703125, + "rewards/margins": 0.0400390625, + "rewards/rejected": -0.1103515625, + "step": 140 + }, + { + "epoch": 0.19, + "grad_norm": 59.26706314390388, + "learning_rate": 3.086419753086419e-07, + "logits/chosen": -2.390625, + "logits/rejected": -2.1875, + "logps/chosen": -356.0, + "logps/rejected": -382.0, + "loss": 0.6289, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.125, + "rewards/margins": 0.1357421875, + "rewards/rejected": -0.010009765625, + "step": 150 + }, + { + "epoch": 0.2, + "grad_norm": 103.75204061254972, + "learning_rate": 3.2921810699588474e-07, + "logits/chosen": -2.46875, + "logits/rejected": -1.6171875, + "logps/chosen": -398.0, + "logps/rejected": -486.0, + "loss": 0.6381, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.0703125, + "rewards/margins": 0.08056640625, + "rewards/rejected": -0.150390625, + "step": 160 + }, + { + "epoch": 0.21, + "grad_norm": 113.86192442502461, + "learning_rate": 3.4979423868312755e-07, + "logits/chosen": -2.3125, + "logits/rejected": -1.9140625, + "logps/chosen": -450.0, + "logps/rejected": -464.0, + "loss": 0.6266, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.205078125, + "rewards/margins": 0.1455078125, + "rewards/rejected": -0.3515625, + "step": 170 + }, + { + "epoch": 0.22, + "grad_norm": 63.20529708150695, + "learning_rate": 3.703703703703703e-07, + "logits/chosen": -1.8046875, + "logits/rejected": -2.8125, + "logps/chosen": -744.0, + "logps/rejected": -584.0, + "loss": 0.6384, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.0849609375, + "rewards/margins": 0.1806640625, + "rewards/rejected": -0.265625, + "step": 180 + }, + { + "epoch": 0.24, + "grad_norm": 83.60084870315862, + "learning_rate": 3.909465020576131e-07, + "logits/chosen": -1.7578125, + "logits/rejected": -1.828125, + "logps/chosen": -716.0, + "logps/rejected": -848.0, + "loss": 0.5983, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.1201171875, + "rewards/margins": 0.400390625, + "rewards/rejected": -0.51953125, + "step": 190 + }, + { + "epoch": 0.25, + "grad_norm": 62.24290659916015, + "learning_rate": 4.11522633744856e-07, + "logits/chosen": -2.125, + "logits/rejected": -2.28125, + "logps/chosen": -536.0, + "logps/rejected": -576.0, + "loss": 0.5939, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.23046875, + "rewards/margins": 0.49609375, + "rewards/rejected": -0.265625, + "step": 200 + }, + { + "epoch": 0.26, + "grad_norm": 54.79737482658103, + "learning_rate": 4.320987654320987e-07, + "logits/chosen": -1.9765625, + "logits/rejected": -2.203125, + "logps/chosen": -494.0, + "logps/rejected": -648.0, + "loss": 0.6176, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1103515625, + "rewards/margins": 0.56640625, + "rewards/rejected": -0.455078125, + "step": 210 + }, + { + "epoch": 0.27, + "grad_norm": 75.36398911732266, + "learning_rate": 4.5267489711934156e-07, + "logits/chosen": -2.046875, + "logits/rejected": -2.03125, + "logps/chosen": -512.0, + "logps/rejected": -390.0, + "loss": 0.6214, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.1806640625, + "rewards/margins": -0.0103759765625, + "rewards/rejected": -0.169921875, + "step": 220 + }, + { + "epoch": 0.29, + "grad_norm": 65.42502577887264, + "learning_rate": 4.732510288065844e-07, + "logits/chosen": -2.40625, + "logits/rejected": -2.3125, + "logps/chosen": -652.0, + "logps/rejected": -504.0, + "loss": 0.5634, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.134765625, + "rewards/margins": 0.359375, + "rewards/rejected": -0.49609375, + "step": 230 + }, + { + "epoch": 0.3, + "grad_norm": 101.59685524071844, + "learning_rate": 4.938271604938271e-07, + "logits/chosen": -1.6953125, + "logits/rejected": -1.6640625, + "logps/chosen": -704.0, + "logps/rejected": -548.0, + "loss": 0.5768, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.37109375, + "rewards/margins": 0.1103515625, + "rewards/rejected": -0.48046875, + "step": 240 + }, + { + "epoch": 0.31, + "grad_norm": 56.985215974106104, + "learning_rate": 4.999872565682321e-07, + "logits/chosen": -1.4921875, + "logits/rejected": -2.109375, + "logps/chosen": -732.0, + "logps/rejected": -496.0, + "loss": 0.5665, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.205078125, + "rewards/margins": 0.6328125, + "rewards/rejected": -0.42578125, + "step": 250 + }, + { + "epoch": 0.32, + "grad_norm": 60.61672615933723, + "learning_rate": 4.999248428870611e-07, + "logits/chosen": -1.890625, + "logits/rejected": -3.453125, + "logps/chosen": -588.0, + "logps/rejected": -430.0, + "loss": 0.5575, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.1396484375, + "rewards/margins": 0.37109375, + "rewards/rejected": -0.51171875, + "step": 260 + }, + { + "epoch": 0.33, + "grad_norm": 171.89078996410038, + "learning_rate": 4.99810431295357e-07, + "logits/chosen": -1.640625, + "logits/rejected": -2.140625, + "logps/chosen": -688.0, + "logps/rejected": -488.0, + "loss": 0.5472, + "rewards/accuracies": 0.4000000059604645, + "rewards/chosen": -0.828125, + "rewards/margins": 0.005462646484375, + "rewards/rejected": -0.8359375, + "step": 270 + }, + { + "epoch": 0.35, + "grad_norm": 132.22262423790198, + "learning_rate": 4.99644045596931e-07, + "logits/chosen": -1.9375, + "logits/rejected": -2.875, + "logps/chosen": -462.0, + "logps/rejected": -356.0, + "loss": 0.5785, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.2451171875, + "rewards/margins": 0.474609375, + "rewards/rejected": -0.71875, + "step": 280 + }, + { + "epoch": 0.36, + "grad_norm": 123.26679640601527, + "learning_rate": 4.994257204090243e-07, + "logits/chosen": -1.609375, + "logits/rejected": -3.0, + "logps/chosen": -580.0, + "logps/rejected": -476.0, + "loss": 0.5198, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.42578125, + "rewards/margins": 0.5703125, + "rewards/rejected": -0.99609375, + "step": 290 + }, + { + "epoch": 0.37, + "grad_norm": 47.55857001285771, + "learning_rate": 4.991555011551073e-07, + "logits/chosen": -1.53125, + "logits/rejected": -2.203125, + "logps/chosen": -624.0, + "logps/rejected": -620.0, + "loss": 0.5123, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.07470703125, + "rewards/margins": 0.88671875, + "rewards/rejected": -0.9609375, + "step": 300 + }, + { + "epoch": 0.38, + "grad_norm": 57.83726580735741, + "learning_rate": 4.988334440554274e-07, + "logits/chosen": -1.6796875, + "logits/rejected": -2.0, + "logps/chosen": -800.0, + "logps/rejected": -632.0, + "loss": 0.5449, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.10498046875, + "rewards/margins": 0.5234375, + "rewards/rejected": -0.6328125, + "step": 310 + }, + { + "epoch": 0.4, + "grad_norm": 63.9187247476568, + "learning_rate": 4.984596161153135e-07, + "logits/chosen": -1.96875, + "logits/rejected": -1.6015625, + "logps/chosen": -612.0, + "logps/rejected": -664.0, + "loss": 0.5372, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.8515625, + "rewards/margins": 0.33984375, + "rewards/rejected": -1.1875, + "step": 320 + }, + { + "epoch": 0.41, + "grad_norm": 59.66658958585498, + "learning_rate": 4.980340951112345e-07, + "logits/chosen": -1.59375, + "logits/rejected": -2.4375, + "logps/chosen": -652.0, + "logps/rejected": -612.0, + "loss": 0.5528, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.65625, + "rewards/margins": 0.58984375, + "rewards/rejected": -1.25, + "step": 330 + }, + { + "epoch": 0.42, + "grad_norm": 51.91101244482548, + "learning_rate": 4.975569695746179e-07, + "logits/chosen": -1.4140625, + "logits/rejected": -2.703125, + "logps/chosen": -716.0, + "logps/rejected": -496.0, + "loss": 0.5169, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.259765625, + "rewards/margins": 0.4140625, + "rewards/rejected": -0.671875, + "step": 340 + }, + { + "epoch": 0.43, + "grad_norm": 72.46042088468953, + "learning_rate": 4.970283387734303e-07, + "logits/chosen": -2.34375, + "logits/rejected": -1.96875, + "logps/chosen": -552.0, + "logps/rejected": -494.0, + "loss": 0.5549, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.546875, + "rewards/margins": 0.1943359375, + "rewards/rejected": -0.7421875, + "step": 350 + }, + { + "epoch": 0.45, + "grad_norm": 98.36392274253446, + "learning_rate": 4.964483126915245e-07, + "logits/chosen": -1.9375, + "logits/rejected": -1.6796875, + "logps/chosen": -552.0, + "logps/rejected": -524.0, + "loss": 0.5237, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.375, + "rewards/margins": 0.62109375, + "rewards/rejected": -0.99609375, + "step": 360 + }, + { + "epoch": 0.46, + "grad_norm": 35.92451209539994, + "learning_rate": 4.958170120057565e-07, + "logits/chosen": -1.875, + "logits/rejected": -3.46875, + "logps/chosen": -438.0, + "logps/rejected": -400.0, + "loss": 0.514, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.39453125, + "rewards/margins": 0.75390625, + "rewards/rejected": -1.1484375, + "step": 370 + }, + { + "epoch": 0.47, + "grad_norm": 43.212042789239504, + "learning_rate": 4.951345680608787e-07, + "logits/chosen": -1.3984375, + "logits/rejected": -2.65625, + "logps/chosen": -660.0, + "logps/rejected": -616.0, + "loss": 0.5045, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.140625, + "rewards/margins": 0.201171875, + "rewards/rejected": -1.34375, + "step": 380 + }, + { + "epoch": 0.48, + "grad_norm": 55.92901896058768, + "learning_rate": 4.944011228422125e-07, + "logits/chosen": -1.6875, + "logits/rejected": -2.03125, + "logps/chosen": -536.0, + "logps/rejected": -508.0, + "loss": 0.5521, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.6953125, + "rewards/margins": 0.427734375, + "rewards/rejected": -1.125, + "step": 390 + }, + { + "epoch": 0.5, + "grad_norm": 48.27467634078171, + "learning_rate": 4.936168289461084e-07, + "logits/chosen": -2.25, + "logits/rejected": -2.046875, + "logps/chosen": -390.0, + "logps/rejected": -452.0, + "loss": 0.5116, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.76953125, + "rewards/margins": 0.2353515625, + "rewards/rejected": -1.0078125, + "step": 400 + }, + { + "epoch": 0.51, + "grad_norm": 43.39520482895247, + "learning_rate": 4.92781849548197e-07, + "logits/chosen": -1.6640625, + "logits/rejected": -1.6796875, + "logps/chosen": -580.0, + "logps/rejected": -548.0, + "loss": 0.5259, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.0234375, + "rewards/margins": 0.5859375, + "rewards/rejected": -1.609375, + "step": 410 + }, + { + "epoch": 0.52, + "grad_norm": 53.58876818695228, + "learning_rate": 4.918963583694396e-07, + "logits/chosen": -1.765625, + "logits/rejected": -1.5234375, + "logps/chosen": -548.0, + "logps/rejected": -596.0, + "loss": 0.516, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.93359375, + "rewards/margins": 0.59765625, + "rewards/rejected": -1.53125, + "step": 420 + }, + { + "epoch": 0.53, + "grad_norm": 195.21265227218643, + "learning_rate": 4.909605396399855e-07, + "logits/chosen": -1.9375, + "logits/rejected": -1.625, + "logps/chosen": -414.0, + "logps/rejected": -756.0, + "loss": 0.5513, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.64453125, + "rewards/margins": 1.6640625, + "rewards/rejected": -2.3125, + "step": 430 + }, + { + "epoch": 0.55, + "grad_norm": 46.48860995329084, + "learning_rate": 4.899745880608417e-07, + "logits/chosen": -1.5, + "logits/rejected": -1.921875, + "logps/chosen": -668.0, + "logps/rejected": -664.0, + "loss": 0.7219, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.921875, + "rewards/margins": 0.462890625, + "rewards/rejected": -1.3828125, + "step": 440 + }, + { + "epoch": 0.56, + "grad_norm": 42.31847046504421, + "learning_rate": 4.889387087633647e-07, + "logits/chosen": -1.4296875, + "logits/rejected": -2.84375, + "logps/chosen": -612.0, + "logps/rejected": -600.0, + "loss": 0.5171, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.69921875, + "rewards/margins": 0.6328125, + "rewards/rejected": -1.328125, + "step": 450 + }, + { + "epoch": 0.57, + "grad_norm": 43.44689835885094, + "learning_rate": 4.878531172665815e-07, + "logits/chosen": -1.890625, + "logits/rejected": -2.75, + "logps/chosen": -608.0, + "logps/rejected": -544.0, + "loss": 0.4904, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.169921875, + "rewards/margins": 0.9453125, + "rewards/rejected": -1.1171875, + "step": 460 + }, + { + "epoch": 0.58, + "grad_norm": 47.14528734842805, + "learning_rate": 4.867180394323509e-07, + "logits/chosen": -1.5546875, + "logits/rejected": -2.1875, + "logps/chosen": -700.0, + "logps/rejected": -732.0, + "loss": 0.5632, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0, + "rewards/margins": 1.3359375, + "rewards/rejected": -2.328125, + "step": 470 + }, + { + "epoch": 0.59, + "grad_norm": 43.11262095050651, + "learning_rate": 4.855337114183711e-07, + "logits/chosen": -1.421875, + "logits/rejected": -1.5859375, + "logps/chosen": -644.0, + "logps/rejected": -692.0, + "loss": 0.4951, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.330078125, + "rewards/margins": 1.53125, + "rewards/rejected": -1.859375, + "step": 480 + }, + { + "epoch": 0.61, + "grad_norm": 45.38075851262632, + "learning_rate": 4.843003796290469e-07, + "logits/chosen": -1.8046875, + "logits/rejected": -2.15625, + "logps/chosen": -772.0, + "logps/rejected": -576.0, + "loss": 0.5214, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.375, + "rewards/margins": 0.71484375, + "rewards/rejected": -1.09375, + "step": 490 + }, + { + "epoch": 0.62, + "grad_norm": 50.08648685280834, + "learning_rate": 4.830183006642236e-07, + "logits/chosen": -2.40625, + "logits/rejected": -1.9765625, + "logps/chosen": -444.0, + "logps/rejected": -536.0, + "loss": 0.5388, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.83203125, + "rewards/margins": 0.34765625, + "rewards/rejected": -1.1796875, + "step": 500 + }, + { + "epoch": 0.62, + "eval_logits/chosen": -1.0859375, + "eval_logits/rejected": -1.0078125, + "eval_logps/chosen": -540.0, + "eval_logps/rejected": -620.0, + "eval_loss": 0.49058592319488525, + "eval_rewards/accuracies": 0.8055555820465088, + "eval_rewards/chosen": -0.7578125, + "eval_rewards/margins": 0.86328125, + "eval_rewards/rejected": -1.625, + "eval_runtime": 50.5034, + "eval_samples_per_second": 20.791, + "eval_steps_per_second": 0.178, + "step": 500 + }, + { + "epoch": 0.63, + "grad_norm": 45.830329324613906, + "learning_rate": 4.816877412658007e-07, + "logits/chosen": -1.5546875, + "logits/rejected": -1.84375, + "logps/chosen": -772.0, + "logps/rejected": -616.0, + "loss": 0.4923, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.6171875, + "rewards/margins": 0.921875, + "rewards/rejected": -1.5390625, + "step": 510 + }, + { + "epoch": 0.64, + "grad_norm": 53.190960939965166, + "learning_rate": 4.80308978262235e-07, + "logits/chosen": -1.515625, + "logits/rejected": -2.609375, + "logps/chosen": -548.0, + "logps/rejected": -502.0, + "loss": 0.4409, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.88671875, + "rewards/margins": 0.84375, + "rewards/rejected": -1.7265625, + "step": 520 + }, + { + "epoch": 0.66, + "grad_norm": 45.04181332411229, + "learning_rate": 4.788822985109449e-07, + "logits/chosen": -1.875, + "logits/rejected": -1.9140625, + "logps/chosen": -432.0, + "logps/rejected": -448.0, + "loss": 0.5368, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.78515625, + "rewards/margins": 0.50390625, + "rewards/rejected": -1.2890625, + "step": 530 + }, + { + "epoch": 0.67, + "grad_norm": 40.85074772821745, + "learning_rate": 4.774079988386296e-07, + "logits/chosen": -1.9765625, + "logits/rejected": -1.640625, + "logps/chosen": -436.0, + "logps/rejected": -464.0, + "loss": 0.5083, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.94140625, + "rewards/margins": 0.2197265625, + "rewards/rejected": -1.15625, + "step": 540 + }, + { + "epoch": 0.68, + "grad_norm": 45.70832159940248, + "learning_rate": 4.7588638597951173e-07, + "logits/chosen": -1.1171875, + "logits/rejected": -2.265625, + "logps/chosen": -748.0, + "logps/rejected": -524.0, + "loss": 0.4754, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.287109375, + "rewards/margins": 1.640625, + "rewards/rejected": -1.3515625, + "step": 550 + }, + { + "epoch": 0.69, + "grad_norm": 59.530448462467945, + "learning_rate": 4.7431777651152103e-07, + "logits/chosen": -1.4765625, + "logits/rejected": -2.09375, + "logps/chosen": -464.0, + "logps/rejected": -452.0, + "loss": 0.5154, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.890625, + "rewards/margins": 0.380859375, + "rewards/rejected": -1.2734375, + "step": 560 + }, + { + "epoch": 0.71, + "grad_norm": 60.197735477558666, + "learning_rate": 4.727024967904284e-07, + "logits/chosen": -1.890625, + "logits/rejected": -1.4375, + "logps/chosen": -414.0, + "logps/rejected": -564.0, + "loss": 0.5178, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.7421875, + "rewards/margins": -0.025634765625, + "rewards/rejected": -0.71484375, + "step": 570 + }, + { + "epoch": 0.72, + "grad_norm": 42.26765729774977, + "learning_rate": 4.710408828819463e-07, + "logits/chosen": -1.609375, + "logits/rejected": -2.359375, + "logps/chosen": -552.0, + "logps/rejected": -524.0, + "loss": 0.5026, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2890625, + "rewards/margins": 0.6484375, + "rewards/rejected": -1.9375, + "step": 580 + }, + { + "epoch": 0.73, + "grad_norm": 50.89266004351078, + "learning_rate": 4.6933328049180937e-07, + "logits/chosen": -1.890625, + "logits/rejected": -2.140625, + "logps/chosen": -496.0, + "logps/rejected": -580.0, + "loss": 0.4856, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.734375, + "rewards/margins": 0.421875, + "rewards/rejected": -1.15625, + "step": 590 + }, + { + "epoch": 0.74, + "grad_norm": 47.93987624657981, + "learning_rate": 4.6758004489384815e-07, + "logits/chosen": -1.7734375, + "logits/rejected": -2.046875, + "logps/chosen": -408.0, + "logps/rejected": -544.0, + "loss": 0.4713, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -1.2109375, + "rewards/margins": 0.23828125, + "rewards/rejected": -1.4453125, + "step": 600 + }, + { + "epoch": 0.76, + "grad_norm": 48.72108104567496, + "learning_rate": 4.6578154085607323e-07, + "logits/chosen": -2.8125, + "logits/rejected": -2.21875, + "logps/chosen": -580.0, + "logps/rejected": -824.0, + "loss": 0.488, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.51953125, + "rewards/margins": 1.4609375, + "rewards/rejected": -1.984375, + "step": 610 + }, + { + "epoch": 0.77, + "grad_norm": 55.85530255600864, + "learning_rate": 4.639381425647841e-07, + "logits/chosen": -1.3984375, + "logits/rejected": -1.5703125, + "logps/chosen": -502.0, + "logps/rejected": -576.0, + "loss": 0.5025, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.64453125, + "rewards/margins": 1.0, + "rewards/rejected": -1.6484375, + "step": 620 + }, + { + "epoch": 0.78, + "grad_norm": 59.68466050286291, + "learning_rate": 4.6205023354671735e-07, + "logits/chosen": -1.5390625, + "logits/rejected": -2.03125, + "logps/chosen": -460.0, + "logps/rejected": -446.0, + "loss": 0.4818, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.77734375, + "rewards/margins": 0.875, + "rewards/rejected": -1.6484375, + "step": 630 + }, + { + "epoch": 0.79, + "grad_norm": 44.610719658440864, + "learning_rate": 4.601182065892529e-07, + "logits/chosen": -1.5625, + "logits/rejected": -1.453125, + "logps/chosen": -556.0, + "logps/rejected": -460.0, + "loss": 0.476, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.91015625, + "rewards/margins": 0.53515625, + "rewards/rejected": -1.4453125, + "step": 640 + }, + { + "epoch": 0.81, + "grad_norm": 49.65621035961898, + "learning_rate": 4.581424636586928e-07, + "logits/chosen": -1.671875, + "logits/rejected": -2.203125, + "logps/chosen": -456.0, + "logps/rejected": -536.0, + "loss": 0.5334, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.83984375, + "rewards/margins": 0.2060546875, + "rewards/rejected": -1.046875, + "step": 650 + }, + { + "epoch": 0.82, + "grad_norm": 37.22285340163814, + "learning_rate": 4.561234158166305e-07, + "logits/chosen": -1.265625, + "logits/rejected": -1.3046875, + "logps/chosen": -540.0, + "logps/rejected": -524.0, + "loss": 0.4929, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.87109375, + "rewards/margins": 0.75, + "rewards/rejected": -1.6171875, + "step": 660 + }, + { + "epoch": 0.83, + "grad_norm": 72.63982166912706, + "learning_rate": 4.5406148313442753e-07, + "logits/chosen": -2.140625, + "logits/rejected": -2.34375, + "logps/chosen": -354.0, + "logps/rejected": -372.0, + "loss": 0.4682, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": -0.98046875, + "rewards/margins": 0.7109375, + "rewards/rejected": -1.6875, + "step": 670 + }, + { + "epoch": 0.84, + "grad_norm": 49.88245474539256, + "learning_rate": 4.519570946058162e-07, + "logits/chosen": -1.3046875, + "logits/rejected": -1.6796875, + "logps/chosen": -482.0, + "logps/rejected": -556.0, + "loss": 0.4673, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.84375, + "rewards/margins": 1.0234375, + "rewards/rejected": -1.8671875, + "step": 680 + }, + { + "epoch": 0.86, + "grad_norm": 36.17988110201782, + "learning_rate": 4.4981068805764545e-07, + "logits/chosen": -1.2109375, + "logits/rejected": -2.125, + "logps/chosen": -700.0, + "logps/rejected": -668.0, + "loss": 0.4494, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.4765625, + "rewards/margins": -0.234375, + "rewards/rejected": -1.2421875, + "step": 690 + }, + { + "epoch": 0.87, + "grad_norm": 61.21171348805659, + "learning_rate": 4.4762271005878913e-07, + "logits/chosen": -1.765625, + "logits/rejected": -1.9765625, + "logps/chosen": -388.0, + "logps/rejected": -446.0, + "loss": 0.4665, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.2578125, + "rewards/margins": 0.55859375, + "rewards/rejected": -1.8203125, + "step": 700 + }, + { + "epoch": 0.88, + "grad_norm": 52.27627457515467, + "learning_rate": 4.4539361582723586e-07, + "logits/chosen": -1.3515625, + "logits/rejected": -1.453125, + "logps/chosen": -668.0, + "logps/rejected": -892.0, + "loss": 0.4806, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8828125, + "rewards/margins": 1.46875, + "rewards/rejected": -3.34375, + "step": 710 + }, + { + "epoch": 0.89, + "grad_norm": 65.21759338991524, + "learning_rate": 4.431238691353784e-07, + "logits/chosen": -1.7265625, + "logits/rejected": -1.9921875, + "logps/chosen": -486.0, + "logps/rejected": -536.0, + "loss": 0.4816, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.125, + "rewards/margins": 1.75, + "rewards/rejected": -2.875, + "step": 720 + }, + { + "epoch": 0.9, + "grad_norm": 51.27261353179044, + "learning_rate": 4.408139422135241e-07, + "logits/chosen": -1.4140625, + "logits/rejected": -2.078125, + "logps/chosen": -462.0, + "logps/rejected": -428.0, + "loss": 0.4583, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2890625, + "rewards/margins": 1.859375, + "rewards/rejected": -3.15625, + "step": 730 + }, + { + "epoch": 0.92, + "grad_norm": 64.31719063334275, + "learning_rate": 4.3846431565164596e-07, + "logits/chosen": -1.390625, + "logits/rejected": -1.3515625, + "logps/chosen": -584.0, + "logps/rejected": -608.0, + "loss": 1.897, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.828125, + "rewards/margins": 1.9453125, + "rewards/rejected": -3.78125, + "step": 740 + }, + { + "epoch": 0.93, + "grad_norm": 46.19657074878751, + "learning_rate": 4.360754782993929e-07, + "logits/chosen": -1.4765625, + "logits/rejected": -2.25, + "logps/chosen": -484.0, + "logps/rejected": -434.0, + "loss": 0.465, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.328125, + "rewards/margins": 0.7578125, + "rewards/rejected": -2.078125, + "step": 750 + }, + { + "epoch": 0.94, + "grad_norm": 55.55089778313834, + "learning_rate": 4.336479271643833e-07, + "logits/chosen": -1.4765625, + "logits/rejected": -1.4609375, + "logps/chosen": -488.0, + "logps/rejected": -462.0, + "loss": 0.49, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.2265625, + "rewards/margins": 0.5390625, + "rewards/rejected": -1.765625, + "step": 760 + }, + { + "epoch": 0.95, + "grad_norm": 77.27127183474478, + "learning_rate": 4.3118216730880015e-07, + "logits/chosen": -1.5859375, + "logits/rejected": -2.515625, + "logps/chosen": -552.0, + "logps/rejected": -520.0, + "loss": 0.4607, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0703125, + "rewards/margins": 1.0859375, + "rewards/rejected": -2.15625, + "step": 770 + }, + { + "epoch": 0.97, + "grad_norm": 61.141458979311125, + "learning_rate": 4.286787117443108e-07, + "logits/chosen": -1.203125, + "logits/rejected": -1.3984375, + "logps/chosen": -588.0, + "logps/rejected": -580.0, + "loss": 0.4711, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1796875, + "rewards/margins": 1.1796875, + "rewards/rejected": -2.359375, + "step": 780 + }, + { + "epoch": 0.98, + "grad_norm": 48.22890637233016, + "learning_rate": 4.261380813253328e-07, + "logits/chosen": -1.828125, + "logits/rejected": -2.578125, + "logps/chosen": -448.0, + "logps/rejected": -418.0, + "loss": 0.4414, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.1640625, + "rewards/margins": 0.4375, + "rewards/rejected": -1.6015625, + "step": 790 + }, + { + "epoch": 0.99, + "grad_norm": 34.09372820587962, + "learning_rate": 4.2356080464066784e-07, + "logits/chosen": -1.3671875, + "logits/rejected": -3.171875, + "logps/chosen": -536.0, + "logps/rejected": -464.0, + "loss": 0.4354, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.140625, + "rewards/margins": 1.5390625, + "rewards/rejected": -2.6875, + "step": 800 + }, + { + "epoch": 1.0, + "grad_norm": 40.13298242319205, + "learning_rate": 4.2094741790352673e-07, + "logits/chosen": -1.859375, + "logits/rejected": -2.203125, + "logps/chosen": -516.0, + "logps/rejected": -624.0, + "loss": 0.3855, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.03125, + "rewards/margins": 1.203125, + "rewards/rejected": -2.234375, + "step": 810 + }, + { + "epoch": 1.02, + "grad_norm": 24.15795246510711, + "learning_rate": 4.1829846483996813e-07, + "logits/chosen": -1.265625, + "logits/rejected": -2.015625, + "logps/chosen": -884.0, + "logps/rejected": -696.0, + "loss": 0.2332, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.154296875, + "rewards/margins": 2.125, + "rewards/rejected": -2.28125, + "step": 820 + }, + { + "epoch": 1.03, + "grad_norm": 40.12239181322821, + "learning_rate": 4.156144965757735e-07, + "logits/chosen": -2.828125, + "logits/rejected": -2.40625, + "logps/chosen": -416.0, + "logps/rejected": -464.0, + "loss": 0.264, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.484375, + "rewards/margins": 1.7734375, + "rewards/rejected": -3.25, + "step": 830 + }, + { + "epoch": 1.04, + "grad_norm": 22.033239175244603, + "learning_rate": 4.128960715217839e-07, + "logits/chosen": -1.6015625, + "logits/rejected": -2.375, + "logps/chosen": -636.0, + "logps/rejected": -572.0, + "loss": 0.2482, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.7109375, + "rewards/margins": 2.125, + "rewards/rejected": -2.828125, + "step": 840 + }, + { + "epoch": 1.05, + "grad_norm": 24.492268202299986, + "learning_rate": 4.1014375525771963e-07, + "logits/chosen": -1.21875, + "logits/rejected": -1.875, + "logps/chosen": -636.0, + "logps/rejected": -600.0, + "loss": 0.2447, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0361328125, + "rewards/margins": 2.453125, + "rewards/rejected": -2.421875, + "step": 850 + }, + { + "epoch": 1.07, + "grad_norm": 31.641742017850387, + "learning_rate": 4.0735812041450926e-07, + "logits/chosen": -1.578125, + "logits/rejected": -1.6484375, + "logps/chosen": -596.0, + "logps/rejected": -596.0, + "loss": 0.2428, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.5390625, + "rewards/margins": 2.5625, + "rewards/rejected": -3.09375, + "step": 860 + }, + { + "epoch": 1.08, + "grad_norm": 44.35625008242433, + "learning_rate": 4.045397465551513e-07, + "logits/chosen": -1.5234375, + "logits/rejected": -1.796875, + "logps/chosen": -520.0, + "logps/rejected": -596.0, + "loss": 0.2393, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.0859375, + "rewards/margins": 1.171875, + "rewards/rejected": -2.265625, + "step": 870 + }, + { + "epoch": 1.09, + "grad_norm": 47.415350341880725, + "learning_rate": 4.0168922005413384e-07, + "logits/chosen": -1.1171875, + "logits/rejected": -1.7109375, + "logps/chosen": -692.0, + "logps/rejected": -612.0, + "loss": 0.2804, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.65625, + "rewards/margins": 1.5, + "rewards/rejected": -2.15625, + "step": 880 + }, + { + "epoch": 1.1, + "grad_norm": 31.910462711815494, + "learning_rate": 3.988071339754366e-07, + "logits/chosen": -1.296875, + "logits/rejected": -1.765625, + "logps/chosen": -632.0, + "logps/rejected": -736.0, + "loss": 0.2293, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21875, + "rewards/margins": 2.296875, + "rewards/rejected": -2.515625, + "step": 890 + }, + { + "epoch": 1.12, + "grad_norm": 27.7996029933054, + "learning_rate": 3.958940879491418e-07, + "logits/chosen": -1.4296875, + "logits/rejected": -1.984375, + "logps/chosen": -568.0, + "logps/rejected": -498.0, + "loss": 0.2693, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -0.84765625, + "rewards/margins": 1.734375, + "rewards/rejected": -2.578125, + "step": 900 + }, + { + "epoch": 1.13, + "grad_norm": 24.76648851906884, + "learning_rate": 3.9295068804667823e-07, + "logits/chosen": -1.3515625, + "logits/rejected": -2.015625, + "logps/chosen": -498.0, + "logps/rejected": -494.0, + "loss": 0.2089, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.73046875, + "rewards/margins": 2.203125, + "rewards/rejected": -2.9375, + "step": 910 + }, + { + "epoch": 1.14, + "grad_norm": 23.922743861511947, + "learning_rate": 3.899775466547261e-07, + "logits/chosen": -1.625, + "logits/rejected": -1.9609375, + "logps/chosen": -556.0, + "logps/rejected": -532.0, + "loss": 0.231, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.86328125, + "rewards/margins": 1.8046875, + "rewards/rejected": -2.671875, + "step": 920 + }, + { + "epoch": 1.15, + "grad_norm": 51.66225882393323, + "learning_rate": 3.8697528234780674e-07, + "logits/chosen": -2.015625, + "logits/rejected": -1.6640625, + "logps/chosen": -400.0, + "logps/rejected": -612.0, + "loss": 0.9229, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.515625, + "rewards/margins": 2.140625, + "rewards/rejected": -3.640625, + "step": 930 + }, + { + "epoch": 1.16, + "grad_norm": 38.82610683477862, + "learning_rate": 3.839445197595863e-07, + "logits/chosen": -1.8671875, + "logits/rejected": -2.046875, + "logps/chosen": -418.0, + "logps/rejected": -378.0, + "loss": 0.231, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.71484375, + "rewards/margins": 2.03125, + "rewards/rejected": -2.75, + "step": 940 + }, + { + "epoch": 1.18, + "grad_norm": 22.389795986878887, + "learning_rate": 3.8088588945291734e-07, + "logits/chosen": -1.3515625, + "logits/rejected": -2.453125, + "logps/chosen": -568.0, + "logps/rejected": -496.0, + "loss": 0.2115, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.90625, + "rewards/margins": 1.8203125, + "rewards/rejected": -2.734375, + "step": 950 + }, + { + "epoch": 1.19, + "grad_norm": 22.583198913209376, + "learning_rate": 3.778000277886483e-07, + "logits/chosen": -1.3984375, + "logits/rejected": -2.046875, + "logps/chosen": -668.0, + "logps/rejected": -676.0, + "loss": 0.2358, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.326171875, + "rewards/margins": 2.78125, + "rewards/rejected": -3.109375, + "step": 960 + }, + { + "epoch": 1.2, + "grad_norm": 21.507764548315773, + "learning_rate": 3.746875767932255e-07, + "logits/chosen": -1.609375, + "logits/rejected": -2.234375, + "logps/chosen": -588.0, + "logps/rejected": -668.0, + "loss": 0.2342, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.453125, + "rewards/margins": 2.078125, + "rewards/rejected": -3.53125, + "step": 970 + }, + { + "epoch": 1.21, + "grad_norm": 23.64514805939643, + "learning_rate": 3.7154918402511714e-07, + "logits/chosen": -1.5078125, + "logits/rejected": -3.703125, + "logps/chosen": -536.0, + "logps/rejected": -436.0, + "loss": 0.2427, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.66015625, + "rewards/margins": 2.46875, + "rewards/rejected": -3.125, + "step": 980 + }, + { + "epoch": 1.23, + "grad_norm": 25.279762808321895, + "learning_rate": 3.6838550244008573e-07, + "logits/chosen": -1.9765625, + "logits/rejected": -1.5234375, + "logps/chosen": -556.0, + "logps/rejected": -592.0, + "loss": 0.2198, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.875, + "rewards/margins": 1.6015625, + "rewards/rejected": -2.484375, + "step": 990 + }, + { + "epoch": 1.24, + "grad_norm": 23.158055403708982, + "learning_rate": 3.651971902553381e-07, + "logits/chosen": -1.3984375, + "logits/rejected": -2.609375, + "logps/chosen": -564.0, + "logps/rejected": -486.0, + "loss": 0.2367, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.63671875, + "rewards/margins": 2.515625, + "rewards/rejected": -3.140625, + "step": 1000 + }, + { + "epoch": 1.24, + "eval_logits/chosen": -1.0859375, + "eval_logits/rejected": -0.9921875, + "eval_logps/chosen": -564.0, + "eval_logps/rejected": -656.0, + "eval_loss": 0.4058724045753479, + "eval_rewards/accuracies": 0.8888888955116272, + "eval_rewards/chosen": -1.90625, + "eval_rewards/margins": 1.5625, + "eval_rewards/rejected": -3.453125, + "eval_runtime": 50.8495, + "eval_samples_per_second": 20.649, + "eval_steps_per_second": 0.177, + "step": 1000 + }, + { + "epoch": 1.25, + "grad_norm": 39.231752515284, + "learning_rate": 3.6198491081258066e-07, + "logits/chosen": -1.1484375, + "logits/rejected": -1.8828125, + "logps/chosen": -604.0, + "logps/rejected": -494.0, + "loss": 0.2308, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.265625, + "rewards/margins": 1.734375, + "rewards/rejected": -3.0, + "step": 1010 + }, + { + "epoch": 1.26, + "grad_norm": 29.796507962550486, + "learning_rate": 3.58749332440008e-07, + "logits/chosen": -1.359375, + "logits/rejected": -1.6015625, + "logps/chosen": -584.0, + "logps/rejected": -556.0, + "loss": 0.2362, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.2421875, + "rewards/margins": 1.640625, + "rewards/rejected": -2.875, + "step": 1020 + }, + { + "epoch": 1.28, + "grad_norm": 21.63597550992159, + "learning_rate": 3.55491128313255e-07, + "logits/chosen": -2.3125, + "logits/rejected": -1.7734375, + "logps/chosen": -496.0, + "logps/rejected": -548.0, + "loss": 0.2107, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5703125, + "rewards/margins": 2.5, + "rewards/rejected": -4.0625, + "step": 1030 + }, + { + "epoch": 1.29, + "grad_norm": 14.702295094535923, + "learning_rate": 3.522109763153392e-07, + "logits/chosen": -1.546875, + "logits/rejected": -1.6328125, + "logps/chosen": -442.0, + "logps/rejected": -520.0, + "loss": 0.2199, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.3984375, + "rewards/margins": 1.796875, + "rewards/rejected": -3.203125, + "step": 1040 + }, + { + "epoch": 1.3, + "grad_norm": 21.29596218246951, + "learning_rate": 3.489095588956249e-07, + "logits/chosen": -1.21875, + "logits/rejected": -1.890625, + "logps/chosen": -648.0, + "logps/rejected": -668.0, + "loss": 0.2325, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.7890625, + "rewards/margins": 2.53125, + "rewards/rejected": -3.328125, + "step": 1050 + }, + { + "epoch": 1.31, + "grad_norm": 52.09643118168795, + "learning_rate": 3.455875629278363e-07, + "logits/chosen": -1.5390625, + "logits/rejected": -2.140625, + "logps/chosen": -540.0, + "logps/rejected": -536.0, + "loss": 0.2484, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.2578125, + "rewards/margins": 2.265625, + "rewards/rejected": -3.515625, + "step": 1060 + }, + { + "epoch": 1.33, + "grad_norm": 35.57618950847242, + "learning_rate": 3.4224567956715085e-07, + "logits/chosen": -1.671875, + "logits/rejected": -1.8515625, + "logps/chosen": -536.0, + "logps/rejected": -564.0, + "loss": 0.2062, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.703125, + "rewards/margins": 1.8203125, + "rewards/rejected": -2.515625, + "step": 1070 + }, + { + "epoch": 1.34, + "grad_norm": 20.490288705612254, + "learning_rate": 3.388846041064012e-07, + "logits/chosen": -1.671875, + "logits/rejected": -2.25, + "logps/chosen": -796.0, + "logps/rejected": -604.0, + "loss": 0.2027, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0078125, + "rewards/margins": 1.8125, + "rewards/rejected": -2.828125, + "step": 1080 + }, + { + "epoch": 1.35, + "grad_norm": 25.587126609494646, + "learning_rate": 3.355050358314172e-07, + "logits/chosen": -1.125, + "logits/rejected": -1.078125, + "logps/chosen": -728.0, + "logps/rejected": -732.0, + "loss": 0.2549, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3203125, + "rewards/margins": 2.125, + "rewards/rejected": -3.4375, + "step": 1090 + }, + { + "epoch": 1.36, + "grad_norm": 32.538109074719216, + "learning_rate": 3.321076778755358e-07, + "logits/chosen": -1.8359375, + "logits/rejected": -1.3125, + "logps/chosen": -572.0, + "logps/rejected": -568.0, + "loss": 0.2117, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.359375, + "rewards/margins": 1.90625, + "rewards/rejected": -3.265625, + "step": 1100 + }, + { + "epoch": 1.38, + "grad_norm": 19.634746831169892, + "learning_rate": 3.2869323707331176e-07, + "logits/chosen": -1.453125, + "logits/rejected": -1.96875, + "logps/chosen": -490.0, + "logps/rejected": -536.0, + "loss": 0.217, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.390625, + "rewards/margins": 2.171875, + "rewards/rejected": -3.5625, + "step": 1110 + }, + { + "epoch": 1.39, + "grad_norm": 52.58964934069639, + "learning_rate": 3.2526242381345766e-07, + "logits/chosen": -2.21875, + "logits/rejected": -2.390625, + "logps/chosen": -484.0, + "logps/rejected": -652.0, + "loss": 0.2209, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.1875, + "rewards/margins": 3.359375, + "rewards/rejected": -4.53125, + "step": 1120 + }, + { + "epoch": 1.4, + "grad_norm": 26.692041535833823, + "learning_rate": 3.218159518910443e-07, + "logits/chosen": -1.65625, + "logits/rejected": -1.7265625, + "logps/chosen": -540.0, + "logps/rejected": -580.0, + "loss": 0.2288, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.6328125, + "rewards/margins": 2.015625, + "rewards/rejected": -3.640625, + "step": 1130 + }, + { + "epoch": 1.41, + "grad_norm": 36.803960598284185, + "learning_rate": 3.183545383589927e-07, + "logits/chosen": -1.1484375, + "logits/rejected": -1.5078125, + "logps/chosen": -628.0, + "logps/rejected": -620.0, + "loss": 0.2016, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4375, + "rewards/margins": 2.0625, + "rewards/rejected": -3.5, + "step": 1140 + }, + { + "epoch": 1.43, + "grad_norm": 62.57539343068982, + "learning_rate": 3.148789033788889e-07, + "logits/chosen": -1.3359375, + "logits/rejected": -1.5, + "logps/chosen": -504.0, + "logps/rejected": -524.0, + "loss": 0.1919, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0546875, + "rewards/margins": 2.234375, + "rewards/rejected": -3.28125, + "step": 1150 + }, + { + "epoch": 1.44, + "grad_norm": 27.545095580039987, + "learning_rate": 3.113897700711502e-07, + "logits/chosen": -0.9375, + "logits/rejected": -1.7578125, + "logps/chosen": -412.0, + "logps/rejected": -728.0, + "loss": 0.2339, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.484375, + "rewards/margins": 2.71875, + "rewards/rejected": -4.1875, + "step": 1160 + }, + { + "epoch": 1.45, + "grad_norm": 20.69956429305614, + "learning_rate": 3.078878643645778e-07, + "logits/chosen": -1.7109375, + "logits/rejected": -1.4296875, + "logps/chosen": -564.0, + "logps/rejected": -628.0, + "loss": 0.2147, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.921875, + "rewards/margins": 2.25, + "rewards/rejected": -4.1875, + "step": 1170 + }, + { + "epoch": 1.46, + "grad_norm": 30.998852642526103, + "learning_rate": 3.0437391484532403e-07, + "logits/chosen": -1.1328125, + "logits/rejected": -1.9921875, + "logps/chosen": -804.0, + "logps/rejected": -600.0, + "loss": 0.2046, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.203125, + "rewards/margins": 2.390625, + "rewards/rejected": -3.59375, + "step": 1180 + }, + { + "epoch": 1.47, + "grad_norm": 20.204732181232373, + "learning_rate": 3.0084865260530666e-07, + "logits/chosen": -1.6328125, + "logits/rejected": -1.1875, + "logps/chosen": -552.0, + "logps/rejected": -780.0, + "loss": 0.2196, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.92578125, + "rewards/margins": 3.203125, + "rewards/rejected": -4.125, + "step": 1190 + }, + { + "epoch": 1.49, + "grad_norm": 38.16020262352693, + "learning_rate": 2.9731281109010253e-07, + "logits/chosen": -1.8203125, + "logits/rejected": -1.8359375, + "logps/chosen": -446.0, + "logps/rejected": -484.0, + "loss": 0.233, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.640625, + "rewards/margins": 2.40625, + "rewards/rejected": -4.0625, + "step": 1200 + }, + { + "epoch": 1.5, + "grad_norm": 29.06302627643078, + "learning_rate": 2.937671259463512e-07, + "logits/chosen": -1.8515625, + "logits/rejected": -1.625, + "logps/chosen": -536.0, + "logps/rejected": -648.0, + "loss": 0.2302, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.296875, + "rewards/margins": 3.484375, + "rewards/rejected": -4.78125, + "step": 1210 + }, + { + "epoch": 1.51, + "grad_norm": 91.98272670818265, + "learning_rate": 2.9021233486869994e-07, + "logits/chosen": -1.4375, + "logits/rejected": -1.7421875, + "logps/chosen": -672.0, + "logps/rejected": -560.0, + "loss": 0.2402, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.484375, + "rewards/margins": 1.6953125, + "rewards/rejected": -3.1875, + "step": 1220 + }, + { + "epoch": 1.52, + "grad_norm": 30.373071126803815, + "learning_rate": 2.8664917744632423e-07, + "logits/chosen": -1.6484375, + "logits/rejected": -1.875, + "logps/chosen": -544.0, + "logps/rejected": -640.0, + "loss": 0.2211, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.90625, + "rewards/margins": 2.40625, + "rewards/rejected": -4.3125, + "step": 1230 + }, + { + "epoch": 1.54, + "grad_norm": 24.064215386247714, + "learning_rate": 2.830783950090522e-07, + "logits/chosen": -1.2265625, + "logits/rejected": -1.734375, + "logps/chosen": -528.0, + "logps/rejected": -462.0, + "loss": 0.2278, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.875, + "rewards/margins": 1.96875, + "rewards/rejected": -3.828125, + "step": 1240 + }, + { + "epoch": 1.55, + "grad_norm": 38.77040858523863, + "learning_rate": 2.7950073047312855e-07, + "logits/chosen": -1.2109375, + "logits/rejected": -1.7109375, + "logps/chosen": -640.0, + "logps/rejected": -752.0, + "loss": 0.2113, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8125, + "rewards/margins": 2.40625, + "rewards/rejected": -4.21875, + "step": 1250 + }, + { + "epoch": 1.56, + "grad_norm": 36.115163088163186, + "learning_rate": 2.759169281866472e-07, + "logits/chosen": -1.671875, + "logits/rejected": -2.34375, + "logps/chosen": -460.0, + "logps/rejected": -430.0, + "loss": 0.2048, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.953125, + "rewards/margins": 1.7734375, + "rewards/rejected": -3.71875, + "step": 1260 + }, + { + "epoch": 1.57, + "grad_norm": 21.74767552298878, + "learning_rate": 2.72327733774687e-07, + "logits/chosen": -1.28125, + "logits/rejected": -1.875, + "logps/chosen": -752.0, + "logps/rejected": -588.0, + "loss": 0.2159, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.8125, + "rewards/margins": 2.40625, + "rewards/rejected": -3.21875, + "step": 1270 + }, + { + "epoch": 1.59, + "grad_norm": 29.90116775368825, + "learning_rate": 2.6873389398418085e-07, + "logits/chosen": -1.4453125, + "logits/rejected": -1.9921875, + "logps/chosen": -420.0, + "logps/rejected": -490.0, + "loss": 0.2191, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.390625, + "rewards/margins": 1.59375, + "rewards/rejected": -3.984375, + "step": 1280 + }, + { + "epoch": 1.6, + "grad_norm": 23.793756365026436, + "learning_rate": 2.6513615652855246e-07, + "logits/chosen": -1.5546875, + "logits/rejected": -1.3046875, + "logps/chosen": -572.0, + "logps/rejected": -600.0, + "loss": 0.2011, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.84375, + "rewards/margins": 1.53125, + "rewards/rejected": -3.375, + "step": 1290 + }, + { + "epoch": 1.61, + "grad_norm": 43.491509201941334, + "learning_rate": 2.6153526993215085e-07, + "logits/chosen": -1.65625, + "logits/rejected": -1.6171875, + "logps/chosen": -500.0, + "logps/rejected": -548.0, + "loss": 0.1999, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.375, + "rewards/margins": 2.34375, + "rewards/rejected": -4.71875, + "step": 1300 + }, + { + "epoch": 1.62, + "grad_norm": 43.19768159653058, + "learning_rate": 2.579319833745169e-07, + "logits/chosen": -1.546875, + "logits/rejected": -1.859375, + "logps/chosen": -460.0, + "logps/rejected": -406.0, + "loss": 0.2023, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.65625, + "rewards/margins": 2.171875, + "rewards/rejected": -3.828125, + "step": 1310 + }, + { + "epoch": 1.64, + "grad_norm": 35.267489669162956, + "learning_rate": 2.5432704653451374e-07, + "logits/chosen": -1.3515625, + "logits/rejected": -1.234375, + "logps/chosen": -532.0, + "logps/rejected": -728.0, + "loss": 0.1962, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.046875, + "rewards/margins": 3.109375, + "rewards/rejected": -5.15625, + "step": 1320 + }, + { + "epoch": 1.65, + "grad_norm": 17.971641900070907, + "learning_rate": 2.5072120943435246e-07, + "logits/chosen": -1.1953125, + "logits/rejected": -1.4765625, + "logps/chosen": -784.0, + "logps/rejected": -808.0, + "loss": 0.1927, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.453125, + "rewards/margins": 2.765625, + "rewards/rejected": -4.21875, + "step": 1330 + }, + { + "epoch": 1.66, + "grad_norm": 26.236832847315178, + "learning_rate": 2.471152222835471e-07, + "logits/chosen": -2.1875, + "logits/rejected": -1.5859375, + "logps/chosen": -684.0, + "logps/rejected": -624.0, + "loss": 0.2161, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.078125, + "rewards/margins": 3.125, + "rewards/rejected": -4.1875, + "step": 1340 + }, + { + "epoch": 1.67, + "grad_norm": 25.147006316460192, + "learning_rate": 2.4350983532283043e-07, + "logits/chosen": -1.4296875, + "logits/rejected": -1.1484375, + "logps/chosen": -472.0, + "logps/rejected": -592.0, + "loss": 0.189, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -1.34375, + "rewards/margins": 2.125, + "rewards/rejected": -3.46875, + "step": 1350 + }, + { + "epoch": 1.69, + "grad_norm": 54.14713597288178, + "learning_rate": 2.39905798668063e-07, + "logits/chosen": -1.34375, + "logits/rejected": -1.6875, + "logps/chosen": -510.0, + "logps/rejected": -568.0, + "loss": 0.2336, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8359375, + "rewards/margins": 2.484375, + "rewards/rejected": -4.3125, + "step": 1360 + }, + { + "epoch": 1.7, + "grad_norm": 21.942609458104677, + "learning_rate": 2.3630386215416878e-07, + "logits/chosen": -1.5390625, + "logits/rejected": -1.8671875, + "logps/chosen": -620.0, + "logps/rejected": -620.0, + "loss": 0.1966, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.1484375, + "rewards/margins": 3.0, + "rewards/rejected": -4.15625, + "step": 1370 + }, + { + "epoch": 1.71, + "grad_norm": 72.72707544492208, + "learning_rate": 2.3270477517912835e-07, + "logits/chosen": -1.1953125, + "logits/rejected": -1.1171875, + "logps/chosen": -716.0, + "logps/rejected": -708.0, + "loss": 0.2211, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.1875, + "rewards/margins": 1.84375, + "rewards/rejected": -4.03125, + "step": 1380 + }, + { + "epoch": 1.72, + "grad_norm": 22.411175858752106, + "learning_rate": 2.291092865480641e-07, + "logits/chosen": -1.4375, + "logits/rejected": -1.7890625, + "logps/chosen": -568.0, + "logps/rejected": -680.0, + "loss": 0.2027, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.7109375, + "rewards/margins": 3.09375, + "rewards/rejected": -4.8125, + "step": 1390 + }, + { + "epoch": 1.73, + "grad_norm": 124.47455963689552, + "learning_rate": 2.2551814431744758e-07, + "logits/chosen": -1.4765625, + "logits/rejected": -1.5, + "logps/chosen": -592.0, + "logps/rejected": -600.0, + "loss": 0.3274, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9765625, + "rewards/margins": 2.40625, + "rewards/rejected": -4.375, + "step": 1400 + }, + { + "epoch": 1.75, + "grad_norm": 40.10464388387894, + "learning_rate": 2.2193209563946382e-07, + "logits/chosen": -1.1875, + "logits/rejected": -1.859375, + "logps/chosen": -740.0, + "logps/rejected": -604.0, + "loss": 0.2608, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.34375, + "rewards/margins": 2.984375, + "rewards/rejected": -5.3125, + "step": 1410 + }, + { + "epoch": 1.76, + "grad_norm": 139.91739653913288, + "learning_rate": 2.1835188660656265e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.4921875, + "logps/chosen": -600.0, + "logps/rejected": -580.0, + "loss": 0.1985, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.546875, + "rewards/margins": 2.125, + "rewards/rejected": -3.671875, + "step": 1420 + }, + { + "epoch": 1.77, + "grad_norm": 45.52923168908793, + "learning_rate": 2.147782620962314e-07, + "logits/chosen": -1.5, + "logits/rejected": -1.7109375, + "logps/chosen": -544.0, + "logps/rejected": -540.0, + "loss": 0.2292, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4375, + "rewards/margins": 2.140625, + "rewards/rejected": -3.578125, + "step": 1430 + }, + { + "epoch": 1.78, + "grad_norm": 55.0384600231688, + "learning_rate": 2.112119656160199e-07, + "logits/chosen": -1.15625, + "logits/rejected": -1.25, + "logps/chosen": -620.0, + "logps/rejected": -700.0, + "loss": 0.2028, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.8046875, + "rewards/margins": 1.6953125, + "rewards/rejected": -3.484375, + "step": 1440 + }, + { + "epoch": 1.8, + "grad_norm": 18.297415510730342, + "learning_rate": 2.0765373914885047e-07, + "logits/chosen": -1.8203125, + "logits/rejected": -1.5234375, + "logps/chosen": -418.0, + "logps/rejected": -508.0, + "loss": 0.2187, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.3828125, + "rewards/margins": 1.5234375, + "rewards/rejected": -2.90625, + "step": 1450 + }, + { + "epoch": 1.81, + "grad_norm": 25.14760440570739, + "learning_rate": 2.0410432299864556e-07, + "logits/chosen": -1.46875, + "logits/rejected": -1.3828125, + "logps/chosen": -584.0, + "logps/rejected": -816.0, + "loss": 0.2162, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.125, + "rewards/margins": 2.9375, + "rewards/rejected": -5.0625, + "step": 1460 + }, + { + "epoch": 1.82, + "grad_norm": 25.279928980019168, + "learning_rate": 2.0056445563630423e-07, + "logits/chosen": -1.9609375, + "logits/rejected": -1.5625, + "logps/chosen": -532.0, + "logps/rejected": -604.0, + "loss": 0.217, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.40625, + "rewards/margins": 2.046875, + "rewards/rejected": -3.46875, + "step": 1470 + }, + { + "epoch": 1.83, + "grad_norm": 31.005511813027777, + "learning_rate": 1.9703487354606018e-07, + "logits/chosen": -2.640625, + "logits/rejected": -1.96875, + "logps/chosen": -564.0, + "logps/rejected": -628.0, + "loss": 0.2051, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.25, + "rewards/margins": 3.265625, + "rewards/rejected": -4.53125, + "step": 1480 + }, + { + "epoch": 1.85, + "grad_norm": 17.907410467346242, + "learning_rate": 1.935163110722533e-07, + "logits/chosen": -1.578125, + "logits/rejected": -1.8515625, + "logps/chosen": -672.0, + "logps/rejected": -528.0, + "loss": 0.2019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.82421875, + "rewards/margins": 2.359375, + "rewards/rejected": -3.171875, + "step": 1490 + }, + { + "epoch": 1.86, + "grad_norm": 37.56302682379733, + "learning_rate": 1.900095002665459e-07, + "logits/chosen": -1.375, + "logits/rejected": -1.5859375, + "logps/chosen": -544.0, + "logps/rejected": -668.0, + "loss": 0.2247, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.4609375, + "rewards/margins": 2.109375, + "rewards/rejected": -3.5625, + "step": 1500 + }, + { + "epoch": 1.86, + "eval_logits/chosen": -1.0546875, + "eval_logits/rejected": -0.9765625, + "eval_logps/chosen": -568.0, + "eval_logps/rejected": -668.0, + "eval_loss": 0.3828948140144348, + "eval_rewards/accuracies": 0.7777777910232544, + "eval_rewards/chosen": -2.203125, + "eval_rewards/margins": 1.8125, + "eval_rewards/rejected": -4.03125, + "eval_runtime": 49.0823, + "eval_samples_per_second": 21.393, + "eval_steps_per_second": 0.183, + "step": 1500 + }, + { + "epoch": 1.87, + "grad_norm": 142.03485499088688, + "learning_rate": 1.8651517073561673e-07, + "logits/chosen": -1.9140625, + "logits/rejected": -1.84375, + "logps/chosen": -516.0, + "logps/rejected": -444.0, + "loss": 0.2354, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0546875, + "rewards/margins": 1.6640625, + "rewards/rejected": -2.71875, + "step": 1510 + }, + { + "epoch": 1.88, + "grad_norm": 20.904872000144213, + "learning_rate": 1.8303404948936285e-07, + "logits/chosen": -1.5625, + "logits/rejected": -1.3828125, + "logps/chosen": -466.0, + "logps/rejected": -492.0, + "loss": 0.2063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.98046875, + "rewards/margins": 2.203125, + "rewards/rejected": -3.171875, + "step": 1520 + }, + { + "epoch": 1.9, + "grad_norm": 23.082974237096174, + "learning_rate": 1.7956686078964255e-07, + "logits/chosen": -1.375, + "logits/rejected": -1.4375, + "logps/chosen": -528.0, + "logps/rejected": -656.0, + "loss": 0.2083, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.421875, + "rewards/margins": 3.09375, + "rewards/rejected": -4.5, + "step": 1530 + }, + { + "epoch": 1.91, + "grad_norm": 65.20385510486626, + "learning_rate": 1.7611432599958924e-07, + "logits/chosen": -1.9140625, + "logits/rejected": -2.234375, + "logps/chosen": -352.0, + "logps/rejected": -392.0, + "loss": 0.2083, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3515625, + "rewards/margins": 1.9609375, + "rewards/rejected": -3.3125, + "step": 1540 + }, + { + "epoch": 1.92, + "grad_norm": 26.23633528972487, + "learning_rate": 1.726771634335293e-07, + "logits/chosen": -1.4609375, + "logits/rejected": -2.0625, + "logps/chosen": -492.0, + "logps/rejected": -456.0, + "loss": 0.2321, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.0703125, + "rewards/margins": 1.6015625, + "rewards/rejected": -2.671875, + "step": 1550 + }, + { + "epoch": 1.93, + "grad_norm": 49.10798542538174, + "learning_rate": 1.6925608820753325e-07, + "logits/chosen": -0.83203125, + "logits/rejected": -1.1953125, + "logps/chosen": -708.0, + "logps/rejected": -880.0, + "loss": 0.2232, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.86328125, + "rewards/margins": 2.625, + "rewards/rejected": -3.484375, + "step": 1560 + }, + { + "epoch": 1.95, + "grad_norm": 52.60647313486629, + "learning_rate": 1.6585181209063321e-07, + "logits/chosen": -1.71875, + "logits/rejected": -1.4921875, + "logps/chosen": -472.0, + "logps/rejected": -704.0, + "loss": 0.1907, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5546875, + "rewards/margins": 2.578125, + "rewards/rejected": -4.125, + "step": 1570 + }, + { + "epoch": 1.96, + "grad_norm": 24.213468426964006, + "learning_rate": 1.6246504335673625e-07, + "logits/chosen": -1.0390625, + "logits/rejected": -1.4453125, + "logps/chosen": -668.0, + "logps/rejected": -856.0, + "loss": 0.2086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.9375, + "rewards/margins": 2.15625, + "rewards/rejected": -3.09375, + "step": 1580 + }, + { + "epoch": 1.97, + "grad_norm": 47.733763616697836, + "learning_rate": 1.590964866372652e-07, + "logits/chosen": -1.09375, + "logits/rejected": -1.2734375, + "logps/chosen": -636.0, + "logps/rejected": -784.0, + "loss": 0.2083, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.3515625, + "rewards/margins": 2.828125, + "rewards/rejected": -4.1875, + "step": 1590 + }, + { + "epoch": 1.98, + "grad_norm": 37.63668160123638, + "learning_rate": 1.5574684277455685e-07, + "logits/chosen": -1.765625, + "logits/rejected": -1.1953125, + "logps/chosen": -464.0, + "logps/rejected": -640.0, + "loss": 0.22, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -1.5234375, + "rewards/margins": 1.765625, + "rewards/rejected": -3.28125, + "step": 1600 + }, + { + "epoch": 2.0, + "grad_norm": 20.84423028894674, + "learning_rate": 1.5241680867604905e-07, + "logits/chosen": -1.0078125, + "logits/rejected": -2.34375, + "logps/chosen": -660.0, + "logps/rejected": -624.0, + "loss": 0.2062, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -0.73046875, + "rewards/margins": 2.265625, + "rewards/rejected": -3.0, + "step": 1610 + }, + { + "epoch": 2.01, + "grad_norm": 15.238180752697565, + "learning_rate": 1.4910707716928586e-07, + "logits/chosen": -1.75, + "logits/rejected": -2.375, + "logps/chosen": -568.0, + "logps/rejected": -696.0, + "loss": 0.1306, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.828125, + "rewards/margins": 3.359375, + "rewards/rejected": -5.1875, + "step": 1620 + }, + { + "epoch": 2.02, + "grad_norm": 13.583277201205796, + "learning_rate": 1.4581833685777228e-07, + "logits/chosen": -1.34375, + "logits/rejected": -1.578125, + "logps/chosen": -552.0, + "logps/rejected": -640.0, + "loss": 0.1173, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5234375, + "rewards/margins": 2.65625, + "rewards/rejected": -4.1875, + "step": 1630 + }, + { + "epoch": 2.03, + "grad_norm": 14.86440122341942, + "learning_rate": 1.4255127197770707e-07, + "logits/chosen": -1.4609375, + "logits/rejected": -1.3828125, + "logps/chosen": -434.0, + "logps/rejected": -552.0, + "loss": 0.1149, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.65625, + "rewards/margins": 2.515625, + "rewards/rejected": -4.1875, + "step": 1640 + }, + { + "epoch": 2.04, + "grad_norm": 15.578800057924948, + "learning_rate": 1.3930656225562474e-07, + "logits/chosen": -1.6640625, + "logits/rejected": -1.515625, + "logps/chosen": -540.0, + "logps/rejected": -620.0, + "loss": 0.1074, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.90625, + "rewards/margins": 3.28125, + "rewards/rejected": -5.1875, + "step": 1650 + }, + { + "epoch": 2.06, + "grad_norm": 13.991553452696552, + "learning_rate": 1.360848827669756e-07, + "logits/chosen": -1.421875, + "logits/rejected": -1.2265625, + "logps/chosen": -524.0, + "logps/rejected": -520.0, + "loss": 0.1255, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7578125, + "rewards/margins": 2.734375, + "rewards/rejected": -4.5, + "step": 1660 + }, + { + "epoch": 2.07, + "grad_norm": 21.745298822673373, + "learning_rate": 1.3288690379567314e-07, + "logits/chosen": -1.4140625, + "logits/rejected": -1.84375, + "logps/chosen": -506.0, + "logps/rejected": -544.0, + "loss": 0.123, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.78125, + "rewards/margins": 2.171875, + "rewards/rejected": -3.953125, + "step": 1670 + }, + { + "epoch": 2.08, + "grad_norm": 15.334862616251963, + "learning_rate": 1.2971329069463932e-07, + "logits/chosen": -1.328125, + "logits/rejected": -1.8984375, + "logps/chosen": -632.0, + "logps/rejected": -672.0, + "loss": 0.1169, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7421875, + "rewards/margins": 3.03125, + "rewards/rejected": -4.78125, + "step": 1680 + }, + { + "epoch": 2.09, + "grad_norm": 36.5802518789977, + "learning_rate": 1.2656470374737434e-07, + "logits/chosen": -1.1875, + "logits/rejected": -1.3671875, + "logps/chosen": -716.0, + "logps/rejected": -1024.0, + "loss": 0.1232, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.859375, + "rewards/margins": 4.25, + "rewards/rejected": -6.125, + "step": 1690 + }, + { + "epoch": 2.11, + "grad_norm": 35.20242961161644, + "learning_rate": 1.2344179803058264e-07, + "logits/chosen": -1.2578125, + "logits/rejected": -1.9921875, + "logps/chosen": -528.0, + "logps/rejected": -624.0, + "loss": 0.1247, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.1875, + "rewards/margins": 2.25, + "rewards/rejected": -4.4375, + "step": 1700 + }, + { + "epoch": 2.12, + "grad_norm": 20.682912146389263, + "learning_rate": 1.203452232778807e-07, + "logits/chosen": -1.4375, + "logits/rejected": -1.6015625, + "logps/chosen": -748.0, + "logps/rejected": -824.0, + "loss": 0.1213, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.78125, + "rewards/margins": 3.921875, + "rewards/rejected": -5.71875, + "step": 1710 + }, + { + "epoch": 2.13, + "grad_norm": 21.01075482943445, + "learning_rate": 1.1727562374461788e-07, + "logits/chosen": -1.9765625, + "logits/rejected": -1.515625, + "logps/chosen": -532.0, + "logps/rejected": -620.0, + "loss": 0.1279, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5859375, + "rewards/margins": 3.53125, + "rewards/rejected": -5.125, + "step": 1720 + }, + { + "epoch": 2.14, + "grad_norm": 29.30233670676864, + "learning_rate": 1.142336380738361e-07, + "logits/chosen": -1.3203125, + "logits/rejected": -1.109375, + "logps/chosen": -564.0, + "logps/rejected": -544.0, + "loss": 0.1133, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.3125, + "rewards/margins": 2.90625, + "rewards/rejected": -5.21875, + "step": 1730 + }, + { + "epoch": 2.16, + "grad_norm": 16.664591107532367, + "learning_rate": 1.1121989916339756e-07, + "logits/chosen": -1.203125, + "logits/rejected": -2.9375, + "logps/chosen": -732.0, + "logps/rejected": -624.0, + "loss": 0.1121, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.46875, + "rewards/margins": 3.109375, + "rewards/rejected": -5.59375, + "step": 1740 + }, + { + "epoch": 2.17, + "grad_norm": 83.44488397290417, + "learning_rate": 1.0823503403430734e-07, + "logits/chosen": -1.25, + "logits/rejected": -1.5546875, + "logps/chosen": -648.0, + "logps/rejected": -508.0, + "loss": 0.1218, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.859375, + "rewards/margins": 2.90625, + "rewards/rejected": -4.78125, + "step": 1750 + }, + { + "epoch": 2.18, + "grad_norm": 15.585689114051172, + "learning_rate": 1.0527966370025964e-07, + "logits/chosen": -1.125, + "logits/rejected": -1.7578125, + "logps/chosen": -716.0, + "logps/rejected": -692.0, + "loss": 0.1205, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.015625, + "rewards/margins": 2.875, + "rewards/rejected": -4.875, + "step": 1760 + }, + { + "epoch": 2.19, + "grad_norm": 10.765461249613185, + "learning_rate": 1.0235440303843302e-07, + "logits/chosen": -1.2109375, + "logits/rejected": -1.796875, + "logps/chosen": -500.0, + "logps/rejected": -636.0, + "loss": 0.1099, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.59375, + "rewards/margins": 2.515625, + "rewards/rejected": -5.125, + "step": 1770 + }, + { + "epoch": 2.21, + "grad_norm": 12.386913795936541, + "learning_rate": 9.945986066156248e-08, + "logits/chosen": -1.59375, + "logits/rejected": -1.8828125, + "logps/chosen": -498.0, + "logps/rejected": -576.0, + "loss": 0.108, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5703125, + "rewards/margins": 3.5625, + "rewards/rejected": -5.125, + "step": 1780 + }, + { + "epoch": 2.22, + "grad_norm": 16.61091563337375, + "learning_rate": 9.659663879131503e-08, + "logits/chosen": -1.265625, + "logits/rejected": -1.3125, + "logps/chosen": -560.0, + "logps/rejected": -528.0, + "loss": 0.125, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.40625, + "rewards/margins": 2.71875, + "rewards/rejected": -5.125, + "step": 1790 + }, + { + "epoch": 2.23, + "grad_norm": 24.411403141380244, + "learning_rate": 9.376533313299542e-08, + "logits/chosen": -1.2265625, + "logits/rejected": -2.21875, + "logps/chosen": -772.0, + "logps/rejected": -660.0, + "loss": 0.111, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.078125, + "rewards/margins": 2.9375, + "rewards/rejected": -5.0, + "step": 1800 + }, + { + "epoch": 2.24, + "grad_norm": 15.505538034971874, + "learning_rate": 9.096653275160641e-08, + "logits/chosen": -1.5390625, + "logits/rejected": -1.59375, + "logps/chosen": -492.0, + "logps/rejected": -576.0, + "loss": 0.1229, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7109375, + "rewards/margins": 2.765625, + "rewards/rejected": -4.46875, + "step": 1810 + }, + { + "epoch": 2.26, + "grad_norm": 15.710939806805685, + "learning_rate": 8.820081994929207e-08, + "logits/chosen": -1.7421875, + "logits/rejected": -2.03125, + "logps/chosen": -724.0, + "logps/rejected": -656.0, + "loss": 0.1194, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.4375, + "rewards/margins": 2.4375, + "rewards/rejected": -4.875, + "step": 1820 + }, + { + "epoch": 2.27, + "grad_norm": 15.364495322714388, + "learning_rate": 8.546877014418671e-08, + "logits/chosen": -1.9296875, + "logits/rejected": -2.15625, + "logps/chosen": -496.0, + "logps/rejected": -532.0, + "loss": 0.1282, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.046875, + "rewards/margins": 2.484375, + "rewards/rejected": -4.53125, + "step": 1830 + }, + { + "epoch": 2.28, + "grad_norm": 13.756725707803474, + "learning_rate": 8.277095175069738e-08, + "logits/chosen": -1.3984375, + "logits/rejected": -1.4609375, + "logps/chosen": -532.0, + "logps/rejected": -552.0, + "loss": 0.1072, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.390625, + "rewards/margins": 2.9375, + "rewards/rejected": -5.3125, + "step": 1840 + }, + { + "epoch": 2.29, + "grad_norm": 23.80983088717624, + "learning_rate": 8.010792606124228e-08, + "logits/chosen": -1.0703125, + "logits/rejected": -1.0546875, + "logps/chosen": -672.0, + "logps/rejected": -680.0, + "loss": 0.1081, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.890625, + "rewards/margins": 3.53125, + "rewards/rejected": -5.40625, + "step": 1850 + }, + { + "epoch": 2.3, + "grad_norm": 17.774551031970322, + "learning_rate": 7.748024712947204e-08, + "logits/chosen": -1.3984375, + "logits/rejected": -1.2421875, + "logps/chosen": -636.0, + "logps/rejected": -652.0, + "loss": 0.1291, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.28125, + "rewards/margins": 2.34375, + "rewards/rejected": -4.625, + "step": 1860 + }, + { + "epoch": 2.32, + "grad_norm": 14.943619749566544, + "learning_rate": 7.488846165499596e-08, + "logits/chosen": -1.3984375, + "logits/rejected": -1.9609375, + "logps/chosen": -572.0, + "logps/rejected": -684.0, + "loss": 0.1282, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.734375, + "rewards/margins": 3.515625, + "rewards/rejected": -6.25, + "step": 1870 + }, + { + "epoch": 2.33, + "grad_norm": 16.562697765445648, + "learning_rate": 7.233310886963942e-08, + "logits/chosen": -1.375, + "logits/rejected": -1.3984375, + "logps/chosen": -474.0, + "logps/rejected": -544.0, + "loss": 0.1229, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.28125, + "rewards/margins": 2.5, + "rewards/rejected": -4.78125, + "step": 1880 + }, + { + "epoch": 2.34, + "grad_norm": 16.237370125481036, + "learning_rate": 6.981472042525416e-08, + "logits/chosen": -1.515625, + "logits/rejected": -1.765625, + "logps/chosen": -640.0, + "logps/rejected": -588.0, + "loss": 0.1077, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.078125, + "rewards/margins": 2.890625, + "rewards/rejected": -4.96875, + "step": 1890 + }, + { + "epoch": 2.35, + "grad_norm": 12.54574310017106, + "learning_rate": 6.7333820283106e-08, + "logits/chosen": -0.94921875, + "logits/rejected": -1.3828125, + "logps/chosen": -696.0, + "logps/rejected": -840.0, + "loss": 0.1192, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.578125, + "rewards/margins": 4.125, + "rewards/rejected": -5.71875, + "step": 1900 + }, + { + "epoch": 2.37, + "grad_norm": 17.067698975214256, + "learning_rate": 6.48909246048622e-08, + "logits/chosen": -1.6953125, + "logits/rejected": -1.6015625, + "logps/chosen": -490.0, + "logps/rejected": -560.0, + "loss": 0.1259, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.078125, + "rewards/margins": 3.03125, + "rewards/rejected": -5.09375, + "step": 1910 + }, + { + "epoch": 2.38, + "grad_norm": 17.315279196446202, + "learning_rate": 6.248654164520237e-08, + "logits/chosen": -1.2890625, + "logits/rejected": -1.4609375, + "logps/chosen": -458.0, + "logps/rejected": -426.0, + "loss": 0.1221, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.515625, + "rewards/margins": 2.09375, + "rewards/rejected": -4.625, + "step": 1920 + }, + { + "epoch": 2.39, + "grad_norm": 15.397715588828959, + "learning_rate": 6.012117164607347e-08, + "logits/chosen": -0.90625, + "logits/rejected": -1.4921875, + "logps/chosen": -796.0, + "logps/rejected": -708.0, + "loss": 0.109, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.5078125, + "rewards/margins": 3.78125, + "rewards/rejected": -5.3125, + "step": 1930 + }, + { + "epoch": 2.4, + "grad_norm": 45.120668890615434, + "learning_rate": 5.779530673261279e-08, + "logits/chosen": -1.0703125, + "logits/rejected": -1.8125, + "logps/chosen": -612.0, + "logps/rejected": -820.0, + "loss": 0.0907, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7265625, + "rewards/margins": 3.5625, + "rewards/rejected": -5.3125, + "step": 1940 + }, + { + "epoch": 2.42, + "grad_norm": 12.405391170841106, + "learning_rate": 5.5509430810758817e-08, + "logits/chosen": -1.0234375, + "logits/rejected": -1.5703125, + "logps/chosen": -800.0, + "logps/rejected": -848.0, + "loss": 0.1012, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.28125, + "rewards/margins": 3.375, + "rewards/rejected": -5.65625, + "step": 1950 + }, + { + "epoch": 2.43, + "grad_norm": 13.29830997717489, + "learning_rate": 5.3264019466573053e-08, + "logits/chosen": -1.03125, + "logits/rejected": -1.8203125, + "logps/chosen": -660.0, + "logps/rejected": -588.0, + "loss": 0.0953, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.859375, + "rewards/margins": 3.640625, + "rewards/rejected": -5.5, + "step": 1960 + }, + { + "epoch": 2.44, + "grad_norm": 15.306039857091942, + "learning_rate": 5.105953986729195e-08, + "logits/chosen": -1.6171875, + "logits/rejected": -1.2109375, + "logps/chosen": -576.0, + "logps/rejected": -732.0, + "loss": 0.1057, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4375, + "rewards/margins": 3.140625, + "rewards/rejected": -5.5625, + "step": 1970 + }, + { + "epoch": 2.45, + "grad_norm": 17.90216325537495, + "learning_rate": 4.889645066413112e-08, + "logits/chosen": -1.125, + "logits/rejected": -1.5546875, + "logps/chosen": -568.0, + "logps/rejected": -612.0, + "loss": 0.1165, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.203125, + "rewards/margins": 2.4375, + "rewards/rejected": -4.625, + "step": 1980 + }, + { + "epoch": 2.47, + "grad_norm": 13.665308623510128, + "learning_rate": 4.67752018968606e-08, + "logits/chosen": -1.1328125, + "logits/rejected": -1.4765625, + "logps/chosen": -624.0, + "logps/rejected": -592.0, + "loss": 0.0942, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.1875, + "rewards/margins": 3.3125, + "rewards/rejected": -5.5, + "step": 1990 + }, + { + "epoch": 2.48, + "grad_norm": 14.773793343841884, + "learning_rate": 4.4696234900172744e-08, + "logits/chosen": -1.5703125, + "logits/rejected": -1.0546875, + "logps/chosen": -540.0, + "logps/rejected": -824.0, + "loss": 0.1132, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.4375, + "rewards/margins": 3.46875, + "rewards/rejected": -5.90625, + "step": 2000 + }, + { + "epoch": 2.48, + "eval_logits/chosen": -1.0625, + "eval_logits/rejected": -0.96484375, + "eval_logps/chosen": -592.0, + "eval_logps/rejected": -696.0, + "eval_loss": 0.37350770831108093, + "eval_rewards/accuracies": 0.75, + "eval_rewards/chosen": -3.3125, + "eval_rewards/margins": 2.09375, + "eval_rewards/rejected": -5.40625, + "eval_runtime": 49.8427, + "eval_samples_per_second": 21.066, + "eval_steps_per_second": 0.181, + "step": 2000 + }, + { + "epoch": 2.49, + "grad_norm": 15.779889705391266, + "learning_rate": 4.265998221186023e-08, + "logits/chosen": -1.2421875, + "logits/rejected": -1.1640625, + "logps/chosen": -592.0, + "logps/rejected": -560.0, + "loss": 0.1073, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.546875, + "rewards/margins": 3.03125, + "rewards/rejected": -5.5625, + "step": 2010 + }, + { + "epoch": 2.5, + "grad_norm": 27.097026355265392, + "learning_rate": 4.0666867482825135e-08, + "logits/chosen": -1.0859375, + "logits/rejected": -1.0859375, + "logps/chosen": -668.0, + "logps/rejected": -548.0, + "loss": 0.117, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.953125, + "rewards/margins": 3.359375, + "rewards/rejected": -5.3125, + "step": 2020 + }, + { + "epoch": 2.52, + "grad_norm": 13.442767252473281, + "learning_rate": 3.871730538893611e-08, + "logits/chosen": -1.3515625, + "logits/rejected": -1.9609375, + "logps/chosen": -736.0, + "logps/rejected": -740.0, + "loss": 0.1108, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.421875, + "rewards/margins": 3.328125, + "rewards/rejected": -4.75, + "step": 2030 + }, + { + "epoch": 2.53, + "grad_norm": 17.818418586870905, + "learning_rate": 3.681170154475391e-08, + "logits/chosen": -1.625, + "logits/rejected": -1.546875, + "logps/chosen": -442.0, + "logps/rejected": -620.0, + "loss": 0.1236, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.90625, + "rewards/margins": 2.5625, + "rewards/rejected": -4.46875, + "step": 2040 + }, + { + "epoch": 2.54, + "grad_norm": 16.487242563455915, + "learning_rate": 3.495045241914105e-08, + "logits/chosen": -1.09375, + "logits/rejected": -2.546875, + "logps/chosen": -584.0, + "logps/rejected": -672.0, + "loss": 0.1079, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.25, + "rewards/margins": 4.09375, + "rewards/rejected": -6.34375, + "step": 2050 + }, + { + "epoch": 2.55, + "grad_norm": 15.802897986414916, + "learning_rate": 3.313394525277527e-08, + "logits/chosen": -1.4609375, + "logits/rejected": -1.3125, + "logps/chosen": -482.0, + "logps/rejected": -572.0, + "loss": 0.0979, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.265625, + "rewards/margins": 3.078125, + "rewards/rejected": -5.34375, + "step": 2060 + }, + { + "epoch": 2.57, + "grad_norm": 11.010895733144364, + "learning_rate": 3.1362557977582e-08, + "logits/chosen": -1.1953125, + "logits/rejected": -1.2109375, + "logps/chosen": -482.0, + "logps/rejected": -506.0, + "loss": 0.1009, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.65625, + "rewards/margins": 1.8515625, + "rewards/rejected": -4.5, + "step": 2070 + }, + { + "epoch": 2.58, + "grad_norm": 19.922568800825218, + "learning_rate": 2.963665913810451e-08, + "logits/chosen": -1.0078125, + "logits/rejected": -2.40625, + "logps/chosen": -712.0, + "logps/rejected": -588.0, + "loss": 0.1016, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.34375, + "rewards/margins": 2.796875, + "rewards/rejected": -5.125, + "step": 2080 + }, + { + "epoch": 2.59, + "grad_norm": 14.614675732714645, + "learning_rate": 2.7956607814826366e-08, + "logits/chosen": -1.2109375, + "logits/rejected": -1.078125, + "logps/chosen": -732.0, + "logps/rejected": -712.0, + "loss": 0.1244, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.15625, + "rewards/margins": 3.171875, + "rewards/rejected": -5.34375, + "step": 2090 + }, + { + "epoch": 2.6, + "grad_norm": 13.459903874571827, + "learning_rate": 2.632275354946342e-08, + "logits/chosen": -0.9375, + "logits/rejected": -2.328125, + "logps/chosen": -470.0, + "logps/rejected": -386.0, + "loss": 0.1195, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.8359375, + "rewards/margins": 2.0625, + "rewards/rejected": -3.90625, + "step": 2100 + }, + { + "epoch": 2.61, + "grad_norm": 11.287103509304053, + "learning_rate": 2.4735436272239922e-08, + "logits/chosen": -1.7109375, + "logits/rejected": -2.484375, + "logps/chosen": -500.0, + "logps/rejected": -612.0, + "loss": 0.096, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.03125, + "rewards/margins": 3.71875, + "rewards/rejected": -5.75, + "step": 2110 + }, + { + "epoch": 2.63, + "grad_norm": 9.246071212037243, + "learning_rate": 2.319498623116492e-08, + "logits/chosen": -2.203125, + "logits/rejected": -1.578125, + "logps/chosen": -564.0, + "logps/rejected": -796.0, + "loss": 0.1098, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.125, + "rewards/margins": 4.15625, + "rewards/rejected": -7.28125, + "step": 2120 + }, + { + "epoch": 2.64, + "grad_norm": 25.87859572275051, + "learning_rate": 2.1701723923322673e-08, + "logits/chosen": -1.7265625, + "logits/rejected": -1.8984375, + "logps/chosen": -516.0, + "logps/rejected": -644.0, + "loss": 0.1225, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.484375, + "rewards/margins": 3.4375, + "rewards/rejected": -5.9375, + "step": 2130 + }, + { + "epoch": 2.65, + "grad_norm": 22.24550487415097, + "learning_rate": 2.0255960028191798e-08, + "logits/chosen": -1.5703125, + "logits/rejected": -1.703125, + "logps/chosen": -502.0, + "logps/rejected": -552.0, + "loss": 0.1103, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.375, + "rewards/margins": 3.0625, + "rewards/rejected": -5.4375, + "step": 2140 + }, + { + "epoch": 2.66, + "grad_norm": 22.983007966397018, + "learning_rate": 1.8857995343007167e-08, + "logits/chosen": -1.8125, + "logits/rejected": -1.3828125, + "logps/chosen": -728.0, + "logps/rejected": -900.0, + "loss": 0.1094, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.96875, + "rewards/margins": 3.96875, + "rewards/rejected": -5.9375, + "step": 2150 + }, + { + "epoch": 2.68, + "grad_norm": 25.097623669642626, + "learning_rate": 1.7508120720177795e-08, + "logits/chosen": -1.1796875, + "logits/rejected": -1.03125, + "logps/chosen": -568.0, + "logps/rejected": -604.0, + "loss": 0.1184, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.78125, + "rewards/margins": 2.078125, + "rewards/rejected": -4.875, + "step": 2160 + }, + { + "epoch": 2.69, + "grad_norm": 32.20516783572916, + "learning_rate": 1.6206617006773753e-08, + "logits/chosen": -0.78515625, + "logits/rejected": -2.078125, + "logps/chosen": -736.0, + "logps/rejected": -556.0, + "loss": 0.1038, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -1.5078125, + "rewards/margins": 2.828125, + "rewards/rejected": -4.34375, + "step": 2170 + }, + { + "epoch": 2.7, + "grad_norm": 11.93984865185982, + "learning_rate": 1.4953754986094886e-08, + "logits/chosen": -1.5859375, + "logits/rejected": -1.6328125, + "logps/chosen": -568.0, + "logps/rejected": -580.0, + "loss": 0.1095, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7109375, + "rewards/margins": 2.640625, + "rewards/rejected": -4.34375, + "step": 2180 + }, + { + "epoch": 2.71, + "grad_norm": 30.015474257847476, + "learning_rate": 1.3749795321332885e-08, + "logits/chosen": -1.265625, + "logits/rejected": -1.5390625, + "logps/chosen": -664.0, + "logps/rejected": -804.0, + "loss": 0.1274, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.96875, + "rewards/margins": 3.421875, + "rewards/rejected": -5.40625, + "step": 2190 + }, + { + "epoch": 2.73, + "grad_norm": 20.828265625017977, + "learning_rate": 1.2594988501339665e-08, + "logits/chosen": -1.1796875, + "logits/rejected": -1.796875, + "logps/chosen": -628.0, + "logps/rejected": -684.0, + "loss": 0.1094, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.71875, + "rewards/margins": 3.109375, + "rewards/rejected": -5.8125, + "step": 2200 + }, + { + "epoch": 2.74, + "grad_norm": 22.34873903931498, + "learning_rate": 1.148957478851173e-08, + "logits/chosen": -1.515625, + "logits/rejected": -1.375, + "logps/chosen": -604.0, + "logps/rejected": -572.0, + "loss": 0.1136, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.328125, + "rewards/margins": 2.515625, + "rewards/rejected": -4.84375, + "step": 2210 + }, + { + "epoch": 2.75, + "grad_norm": 15.346930767785905, + "learning_rate": 1.0433784168802805e-08, + "logits/chosen": -1.3125, + "logits/rejected": -1.578125, + "logps/chosen": -624.0, + "logps/rejected": -820.0, + "loss": 0.1239, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.03125, + "rewards/margins": 3.015625, + "rewards/rejected": -6.0625, + "step": 2220 + }, + { + "epoch": 2.76, + "grad_norm": 16.847862725204944, + "learning_rate": 9.427836303874115e-09, + "logits/chosen": -1.1640625, + "logits/rejected": -2.03125, + "logps/chosen": -568.0, + "logps/rejected": -648.0, + "loss": 0.1139, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.921875, + "rewards/margins": 4.0, + "rewards/rejected": -6.9375, + "step": 2230 + }, + { + "epoch": 2.78, + "grad_norm": 11.648381674685714, + "learning_rate": 8.47194048539307e-09, + "logits/chosen": -1.015625, + "logits/rejected": -1.6875, + "logps/chosen": -880.0, + "logps/rejected": -708.0, + "loss": 0.114, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.515625, + "rewards/margins": 3.515625, + "rewards/rejected": -6.03125, + "step": 2240 + }, + { + "epoch": 2.79, + "grad_norm": 18.35912272257459, + "learning_rate": 7.566295591489052e-09, + "logits/chosen": -1.4765625, + "logits/rejected": -1.421875, + "logps/chosen": -604.0, + "logps/rejected": -676.0, + "loss": 0.1191, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9140625, + "rewards/margins": 3.015625, + "rewards/rejected": -4.9375, + "step": 2250 + }, + { + "epoch": 2.8, + "grad_norm": 13.681536454598728, + "learning_rate": 6.71109004537615e-09, + "logits/chosen": -1.125, + "logits/rejected": -1.1875, + "logps/chosen": -604.0, + "logps/rejected": -664.0, + "loss": 0.0951, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.46875, + "rewards/margins": 3.0625, + "rewards/rejected": -5.53125, + "step": 2260 + }, + { + "epoch": 2.81, + "grad_norm": 9.950566904078942, + "learning_rate": 5.906501776150763e-09, + "logits/chosen": -1.0, + "logits/rejected": -2.921875, + "logps/chosen": -712.0, + "logps/rejected": -600.0, + "loss": 0.1083, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8515625, + "rewards/margins": 3.109375, + "rewards/rejected": -4.96875, + "step": 2270 + }, + { + "epoch": 2.83, + "grad_norm": 22.085690818869246, + "learning_rate": 5.152698181772857e-09, + "logits/chosen": -1.140625, + "logits/rejected": -1.546875, + "logps/chosen": -572.0, + "logps/rejected": -760.0, + "loss": 0.1072, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.125, + "rewards/margins": 2.71875, + "rewards/rejected": -4.84375, + "step": 2280 + }, + { + "epoch": 2.84, + "grad_norm": 21.353543333604414, + "learning_rate": 4.449836094238019e-09, + "logits/chosen": -1.171875, + "logits/rejected": -1.890625, + "logps/chosen": -620.0, + "logps/rejected": -486.0, + "loss": 0.0966, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.9453125, + "rewards/margins": 3.21875, + "rewards/rejected": -5.15625, + "step": 2290 + }, + { + "epoch": 2.85, + "grad_norm": 11.392655664834834, + "learning_rate": 3.798061746947995e-09, + "logits/chosen": -1.09375, + "logits/rejected": -1.546875, + "logps/chosen": -500.0, + "logps/rejected": -568.0, + "loss": 0.1108, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": -2.40625, + "rewards/margins": 2.078125, + "rewards/rejected": -4.5, + "step": 2300 + }, + { + "epoch": 2.86, + "grad_norm": 16.899400188846528, + "learning_rate": 3.1975107442860637e-09, + "logits/chosen": -1.953125, + "logits/rejected": -1.453125, + "logps/chosen": -494.0, + "logps/rejected": -928.0, + "loss": 0.6379, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.453125, + "rewards/margins": 4.59375, + "rewards/rejected": -7.0625, + "step": 2310 + }, + { + "epoch": 2.87, + "grad_norm": 18.0993042603957, + "learning_rate": 2.6483080334041287e-09, + "logits/chosen": -1.328125, + "logits/rejected": -1.453125, + "logps/chosen": -652.0, + "logps/rejected": -816.0, + "loss": 0.0927, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.078125, + "rewards/margins": 3.625, + "rewards/rejected": -5.6875, + "step": 2320 + }, + { + "epoch": 2.89, + "grad_norm": 18.326061508758997, + "learning_rate": 2.1505678782269e-09, + "logits/chosen": -1.3125, + "logits/rejected": -2.1875, + "logps/chosen": -668.0, + "logps/rejected": -572.0, + "loss": 0.1068, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.125, + "rewards/margins": 3.1875, + "rewards/rejected": -5.3125, + "step": 2330 + }, + { + "epoch": 2.9, + "grad_norm": 20.00896083752618, + "learning_rate": 1.7043938356787467e-09, + "logits/chosen": -1.4921875, + "logits/rejected": -1.3125, + "logps/chosen": -354.0, + "logps/rejected": -556.0, + "loss": 0.1215, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.109375, + "rewards/margins": 2.765625, + "rewards/rejected": -4.875, + "step": 2340 + }, + { + "epoch": 2.91, + "grad_norm": 33.844584510123674, + "learning_rate": 1.30987873413832e-09, + "logits/chosen": -0.94140625, + "logits/rejected": -1.109375, + "logps/chosen": -588.0, + "logps/rejected": -660.0, + "loss": 0.1078, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.375, + "rewards/margins": 2.859375, + "rewards/rejected": -5.21875, + "step": 2350 + }, + { + "epoch": 2.92, + "grad_norm": 13.322271542703083, + "learning_rate": 9.671046541251393e-10, + "logits/chosen": -1.09375, + "logits/rejected": -2.046875, + "logps/chosen": -672.0, + "logps/rejected": -576.0, + "loss": 0.1159, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.7890625, + "rewards/margins": 3.765625, + "rewards/rejected": -5.5625, + "step": 2360 + }, + { + "epoch": 2.94, + "grad_norm": 28.085054792653725, + "learning_rate": 6.761429112225326e-10, + "logits/chosen": -1.1484375, + "logits/rejected": -0.7734375, + "logps/chosen": -688.0, + "logps/rejected": -908.0, + "loss": 0.0952, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.8203125, + "rewards/margins": 3.71875, + "rewards/rejected": -5.53125, + "step": 2370 + }, + { + "epoch": 2.95, + "grad_norm": 20.97872609812394, + "learning_rate": 4.370540412399759e-10, + "logits/chosen": -1.859375, + "logits/rejected": -1.8125, + "logps/chosen": -564.0, + "logps/rejected": -684.0, + "loss": 0.1135, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.328125, + "rewards/margins": 3.609375, + "rewards/rejected": -5.9375, + "step": 2380 + }, + { + "epoch": 2.96, + "grad_norm": 19.210485937449167, + "learning_rate": 2.498877876184191e-10, + "logits/chosen": -1.4765625, + "logits/rejected": -1.046875, + "logps/chosen": -688.0, + "logps/rejected": -664.0, + "loss": 0.1025, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.6015625, + "rewards/margins": 3.125, + "rewards/rejected": -4.71875, + "step": 2390 + }, + { + "epoch": 2.97, + "grad_norm": 12.682964834422256, + "learning_rate": 1.1468309108100816e-10, + "logits/chosen": -1.1171875, + "logits/rejected": -1.3125, + "logps/chosen": -456.0, + "logps/rejected": -540.0, + "loss": 0.1049, + "rewards/accuracies": 1.0, + "rewards/chosen": -3.0, + "rewards/margins": 3.109375, + "rewards/rejected": -6.125, + "step": 2400 + }, + { + "epoch": 2.99, + "grad_norm": 16.080027717160323, + "learning_rate": 3.146808153123293e-11, + "logits/chosen": -1.234375, + "logits/rejected": -2.03125, + "logps/chosen": -492.0, + "logps/rejected": -532.0, + "loss": 0.1196, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.109375, + "rewards/margins": 2.046875, + "rewards/rejected": -4.15625, + "step": 2410 + }, + { + "epoch": 3.0, + "grad_norm": 13.95546245482141, + "learning_rate": 2.60072200469752e-13, + "logits/chosen": -1.2265625, + "logits/rejected": -2.421875, + "logps/chosen": -624.0, + "logps/rejected": -482.0, + "loss": 0.1097, + "rewards/accuracies": 1.0, + "rewards/chosen": -2.34375, + "rewards/margins": 2.84375, + "rewards/rejected": -5.1875, + "step": 2420 + } + ], + "logging_steps": 10, + "max_steps": 2421, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}