{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9989071038251366, "eval_steps": 400, "global_step": 457, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01092896174863388, "grad_norm": 358.5620562461214, "learning_rate": 1.0869565217391303e-07, "logits/chosen": -1.0113575458526611, "logits/rejected": -1.0064939260482788, "logps/chosen": -0.2803983986377716, "logps/rejected": -0.2860395908355713, "loss": 5.2315, "rewards/accuracies": 0.53125, "rewards/chosen": -2.8039839267730713, "rewards/margins": 0.056411754339933395, "rewards/rejected": -2.860395669937134, "semantic_entropy": 0.7518940567970276, "step": 5 }, { "epoch": 0.02185792349726776, "grad_norm": 233.11198507849488, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -1.056563138961792, "logits/rejected": -1.0053507089614868, "logps/chosen": -0.2568749487400055, "logps/rejected": -0.27021342515945435, "loss": 5.206, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.568748950958252, "rewards/margins": 0.13338527083396912, "rewards/rejected": -2.702134370803833, "semantic_entropy": 0.7094504237174988, "step": 10 }, { "epoch": 0.03278688524590164, "grad_norm": 210.79380420075688, "learning_rate": 3.260869565217391e-07, "logits/chosen": -1.0091139078140259, "logits/rejected": -0.9631060361862183, "logps/chosen": -0.2674282491207123, "logps/rejected": -0.27336788177490234, "loss": 5.1278, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.6742825508117676, "rewards/margins": 0.05939612537622452, "rewards/rejected": -2.7336785793304443, "semantic_entropy": 0.7274739742279053, "step": 15 }, { "epoch": 0.04371584699453552, "grad_norm": 375.0299222500816, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -0.9487798810005188, "logits/rejected": -0.8998070955276489, "logps/chosen": -0.2723819613456726, "logps/rejected": -0.28497135639190674, "loss": 5.222, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.7238194942474365, "rewards/margins": 0.1258939504623413, "rewards/rejected": -2.8497138023376465, "semantic_entropy": 0.7452942132949829, "step": 20 }, { "epoch": 0.0546448087431694, "grad_norm": 299.8105605992341, "learning_rate": 5.434782608695652e-07, "logits/chosen": -0.9505411386489868, "logits/rejected": -0.8759678602218628, "logps/chosen": -0.27557066082954407, "logps/rejected": -0.29382389783859253, "loss": 5.1223, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.755706548690796, "rewards/margins": 0.1825324147939682, "rewards/rejected": -2.9382388591766357, "semantic_entropy": 0.7546485662460327, "step": 25 }, { "epoch": 0.06557377049180328, "grad_norm": 300.511189676041, "learning_rate": 6.521739130434782e-07, "logits/chosen": -1.0557540655136108, "logits/rejected": -0.990186333656311, "logps/chosen": -0.267598032951355, "logps/rejected": -0.28400346636772156, "loss": 5.1993, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.6759800910949707, "rewards/margins": 0.1640542596578598, "rewards/rejected": -2.8400347232818604, "semantic_entropy": 0.7248786091804504, "step": 30 }, { "epoch": 0.07650273224043716, "grad_norm": 186.14457687250982, "learning_rate": 7.608695652173913e-07, "logits/chosen": -1.0076847076416016, "logits/rejected": -0.9405019879341125, "logps/chosen": -0.2580474615097046, "logps/rejected": -0.279694139957428, "loss": 5.0812, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.580474615097046, "rewards/margins": 0.2164669930934906, "rewards/rejected": -2.7969415187835693, "semantic_entropy": 0.7201561331748962, "step": 35 }, { "epoch": 0.08743169398907104, "grad_norm": 223.67562662995005, "learning_rate": 8.695652173913043e-07, "logits/chosen": -0.9655061960220337, "logits/rejected": -0.9045132398605347, "logps/chosen": -0.2820321023464203, "logps/rejected": -0.2990434765815735, "loss": 5.3116, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.820321559906006, "rewards/margins": 0.17011362314224243, "rewards/rejected": -2.9904348850250244, "semantic_entropy": 0.7597802877426147, "step": 40 }, { "epoch": 0.09836065573770492, "grad_norm": 131.00541589477967, "learning_rate": 9.782608695652173e-07, "logits/chosen": -1.0179851055145264, "logits/rejected": -0.9359539151191711, "logps/chosen": -0.2855134606361389, "logps/rejected": -0.3078162968158722, "loss": 4.919, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.8551342487335205, "rewards/margins": 0.22302868962287903, "rewards/rejected": -3.078162908554077, "semantic_entropy": 0.7599790096282959, "step": 45 }, { "epoch": 0.1092896174863388, "grad_norm": 360.5211336376954, "learning_rate": 9.997663088532014e-07, "logits/chosen": -0.9701619148254395, "logits/rejected": -0.887865424156189, "logps/chosen": -0.28138467669487, "logps/rejected": -0.28913217782974243, "loss": 5.1014, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.813847064971924, "rewards/margins": 0.07747481018304825, "rewards/rejected": -2.891321897506714, "semantic_entropy": 0.7510823607444763, "step": 50 }, { "epoch": 0.12021857923497267, "grad_norm": 162.42445549849919, "learning_rate": 9.98817312944725e-07, "logits/chosen": -1.0028311014175415, "logits/rejected": -0.8850187063217163, "logps/chosen": -0.27895885705947876, "logps/rejected": -0.3143305778503418, "loss": 4.8519, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.7895889282226562, "rewards/margins": 0.3537173271179199, "rewards/rejected": -3.143306255340576, "semantic_entropy": 0.7608937621116638, "step": 55 }, { "epoch": 0.13114754098360656, "grad_norm": 120.127890644715, "learning_rate": 9.971397915250336e-07, "logits/chosen": -1.0246554613113403, "logits/rejected": -0.9786936640739441, "logps/chosen": -0.26926669478416443, "logps/rejected": -0.3113505244255066, "loss": 4.6493, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.692667245864868, "rewards/margins": 0.42083826661109924, "rewards/rejected": -3.1135053634643555, "semantic_entropy": 0.7576217651367188, "step": 60 }, { "epoch": 0.14207650273224043, "grad_norm": 214.34275301709462, "learning_rate": 9.94736194623663e-07, "logits/chosen": -0.9859918355941772, "logits/rejected": -0.9181090593338013, "logps/chosen": -0.3084973096847534, "logps/rejected": -0.3399081826210022, "loss": 4.9942, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.084973096847534, "rewards/margins": 0.31410855054855347, "rewards/rejected": -3.3990814685821533, "semantic_entropy": 0.8060176968574524, "step": 65 }, { "epoch": 0.15300546448087432, "grad_norm": 544.9393207205336, "learning_rate": 9.916100327075037e-07, "logits/chosen": -0.9521551132202148, "logits/rejected": -0.9334642291069031, "logps/chosen": -0.29637694358825684, "logps/rejected": -0.3240143656730652, "loss": 4.7857, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.9637694358825684, "rewards/margins": 0.27637460827827454, "rewards/rejected": -3.2401435375213623, "semantic_entropy": 0.7712014317512512, "step": 70 }, { "epoch": 0.16393442622950818, "grad_norm": 123.2584567983735, "learning_rate": 9.877658715537428e-07, "logits/chosen": -0.9457874298095703, "logits/rejected": -0.9289388656616211, "logps/chosen": -0.31344500184059143, "logps/rejected": -0.3473803997039795, "loss": 4.8544, "rewards/accuracies": 0.625, "rewards/chosen": -3.1344501972198486, "rewards/margins": 0.33935409784317017, "rewards/rejected": -3.473804473876953, "semantic_entropy": 0.8020299077033997, "step": 75 }, { "epoch": 0.17486338797814208, "grad_norm": 136.55956527115663, "learning_rate": 9.832093255815216e-07, "logits/chosen": -0.9409273266792297, "logits/rejected": -0.8797443509101868, "logps/chosen": -0.3109641373157501, "logps/rejected": -0.3346250057220459, "loss": 4.7853, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.1096413135528564, "rewards/margins": 0.23660895228385925, "rewards/rejected": -3.346250534057617, "semantic_entropy": 0.7786868810653687, "step": 80 }, { "epoch": 0.18579234972677597, "grad_norm": 202.99615488786125, "learning_rate": 9.779470496520441e-07, "logits/chosen": -0.9390329122543335, "logits/rejected": -0.89063560962677, "logps/chosen": -0.31133827567100525, "logps/rejected": -0.3710102438926697, "loss": 4.6394, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.1133828163146973, "rewards/margins": 0.5967196226119995, "rewards/rejected": -3.7101027965545654, "semantic_entropy": 0.8020746111869812, "step": 85 }, { "epoch": 0.19672131147540983, "grad_norm": 147.8274369377715, "learning_rate": 9.719867293491144e-07, "logits/chosen": -0.9997242093086243, "logits/rejected": -0.9218411445617676, "logps/chosen": -0.33917468786239624, "logps/rejected": -0.3771332800388336, "loss": 4.7466, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -3.391746997833252, "rewards/margins": 0.37958595156669617, "rewards/rejected": -3.7713329792022705, "semantic_entropy": 0.8518031239509583, "step": 90 }, { "epoch": 0.20765027322404372, "grad_norm": 150.5965714109422, "learning_rate": 9.653370697542987e-07, "logits/chosen": -0.949033260345459, "logits/rejected": -0.9504354596138, "logps/chosen": -0.34006524085998535, "logps/rejected": -0.3670746684074402, "loss": 4.5023, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -3.4006524085998535, "rewards/margins": 0.27009397745132446, "rewards/rejected": -3.6707465648651123, "semantic_entropy": 0.8378095626831055, "step": 95 }, { "epoch": 0.2185792349726776, "grad_norm": 205.55603229224332, "learning_rate": 9.580077827331037e-07, "logits/chosen": -0.9659525752067566, "logits/rejected": -0.9214147329330444, "logps/chosen": -0.37815189361572266, "logps/rejected": -0.4408392012119293, "loss": 4.4615, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -3.7815189361572266, "rewards/margins": 0.6268730163574219, "rewards/rejected": -4.408391952514648, "semantic_entropy": 0.8833344578742981, "step": 100 }, { "epoch": 0.22950819672131148, "grad_norm": 168.46156689250674, "learning_rate": 9.500095727507419e-07, "logits/chosen": -1.0187479257583618, "logits/rejected": -0.9876689910888672, "logps/chosen": -0.3568420708179474, "logps/rejected": -0.4102093577384949, "loss": 4.4882, "rewards/accuracies": 0.625, "rewards/chosen": -3.568420886993408, "rewards/margins": 0.5336726903915405, "rewards/rejected": -4.102093696594238, "semantic_entropy": 0.8596333265304565, "step": 105 }, { "epoch": 0.24043715846994534, "grad_norm": 163.05795657074705, "learning_rate": 9.413541212382004e-07, "logits/chosen": -1.013091802597046, "logits/rejected": -0.9950464367866516, "logps/chosen": -0.37098902463912964, "logps/rejected": -0.45781344175338745, "loss": 4.3876, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.709890842437744, "rewards/margins": 0.8682438135147095, "rewards/rejected": -4.578134059906006, "semantic_entropy": 0.9028980135917664, "step": 110 }, { "epoch": 0.25136612021857924, "grad_norm": 159.92608074418774, "learning_rate": 9.320540695314438e-07, "logits/chosen": -1.0206435918807983, "logits/rejected": -0.9817934036254883, "logps/chosen": -0.37152332067489624, "logps/rejected": -0.4835759103298187, "loss": 4.1862, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.715233325958252, "rewards/margins": 1.1205257177352905, "rewards/rejected": -4.835759162902832, "semantic_entropy": 0.8893026113510132, "step": 115 }, { "epoch": 0.26229508196721313, "grad_norm": 126.53869344949423, "learning_rate": 9.221230004086721e-07, "logits/chosen": -1.0430892705917358, "logits/rejected": -0.9731811285018921, "logps/chosen": -0.38018742203712463, "logps/rejected": -0.44267600774765015, "loss": 4.3042, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -3.8018736839294434, "rewards/margins": 0.6248863935470581, "rewards/rejected": -4.426759719848633, "semantic_entropy": 0.9051868319511414, "step": 120 }, { "epoch": 0.273224043715847, "grad_norm": 330.21792860177044, "learning_rate": 9.11575418252596e-07, "logits/chosen": -0.9472485780715942, "logits/rejected": -0.9132539629936218, "logps/chosen": -0.396454393863678, "logps/rejected": -0.4733741283416748, "loss": 4.0912, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -3.9645442962646484, "rewards/margins": 0.7691973447799683, "rewards/rejected": -4.733741283416748, "semantic_entropy": 0.9098461866378784, "step": 125 }, { "epoch": 0.28415300546448086, "grad_norm": 136.7218535491149, "learning_rate": 9.004267278667031e-07, "logits/chosen": -0.9847833514213562, "logits/rejected": -0.9780336618423462, "logps/chosen": -0.4162030816078186, "logps/rejected": -0.5500742793083191, "loss": 4.1012, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -4.162030220031738, "rewards/margins": 1.3387129306793213, "rewards/rejected": -5.500743389129639, "semantic_entropy": 0.9023580551147461, "step": 130 }, { "epoch": 0.29508196721311475, "grad_norm": 114.59260330847327, "learning_rate": 8.886932119764565e-07, "logits/chosen": -1.018243670463562, "logits/rejected": -0.934001624584198, "logps/chosen": -0.40604203939437866, "logps/rejected": -0.521537184715271, "loss": 3.933, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -4.060420036315918, "rewards/margins": 1.154951810836792, "rewards/rejected": -5.215372562408447, "semantic_entropy": 0.9238536953926086, "step": 135 }, { "epoch": 0.30601092896174864, "grad_norm": 151.22961611030988, "learning_rate": 8.763920074482809e-07, "logits/chosen": -1.02057683467865, "logits/rejected": -0.9660438299179077, "logps/chosen": -0.43255481123924255, "logps/rejected": -0.5811036229133606, "loss": 3.5202, "rewards/accuracies": 0.71875, "rewards/chosen": -4.32554817199707, "rewards/margins": 1.485487699508667, "rewards/rejected": -5.811036109924316, "semantic_entropy": 0.9533751606941223, "step": 140 }, { "epoch": 0.31693989071038253, "grad_norm": 171.9485188405326, "learning_rate": 8.635410802610723e-07, "logits/chosen": -1.0028008222579956, "logits/rejected": -0.9838630557060242, "logps/chosen": -0.4235480725765228, "logps/rejected": -0.49707216024398804, "loss": 3.7809, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -4.235480785369873, "rewards/margins": 0.7352409958839417, "rewards/rejected": -4.970722198486328, "semantic_entropy": 0.9417489767074585, "step": 145 }, { "epoch": 0.32786885245901637, "grad_norm": 135.1994551251087, "learning_rate": 8.501591992667849e-07, "logits/chosen": -1.0660603046417236, "logits/rejected": -1.0330188274383545, "logps/chosen": -0.458204448223114, "logps/rejected": -0.6369711756706238, "loss": 3.6343, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -4.58204460144043, "rewards/margins": 1.7876676321029663, "rewards/rejected": -6.369711875915527, "semantic_entropy": 0.9522278904914856, "step": 150 }, { "epoch": 0.33879781420765026, "grad_norm": 123.13149743090504, "learning_rate": 8.362659087784152e-07, "logits/chosen": -0.9992470741271973, "logits/rejected": -0.946983814239502, "logps/chosen": -0.45955339074134827, "logps/rejected": -0.5732384324073792, "loss": 3.6764, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.5955328941345215, "rewards/margins": 1.1368510723114014, "rewards/rejected": -5.732384204864502, "semantic_entropy": 0.9510468244552612, "step": 155 }, { "epoch": 0.34972677595628415, "grad_norm": 245.75242739092204, "learning_rate": 8.218815000254231e-07, "logits/chosen": -1.0509494543075562, "logits/rejected": -0.9942834973335266, "logps/chosen": -0.5233258008956909, "logps/rejected": -0.6250703930854797, "loss": 3.7281, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -5.233258247375488, "rewards/margins": 1.0174460411071777, "rewards/rejected": -6.250703811645508, "semantic_entropy": 0.9671458005905151, "step": 160 }, { "epoch": 0.36065573770491804, "grad_norm": 158.28325663577223, "learning_rate": 8.07026981518276e-07, "logits/chosen": -1.0312683582305908, "logits/rejected": -0.9780334234237671, "logps/chosen": -0.5141728520393372, "logps/rejected": -0.6252259016036987, "loss": 3.6091, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -5.141728401184082, "rewards/margins": 1.1105303764343262, "rewards/rejected": -6.252258777618408, "semantic_entropy": 0.9860242605209351, "step": 165 }, { "epoch": 0.37158469945355194, "grad_norm": 136.70967385020978, "learning_rate": 7.917240483654e-07, "logits/chosen": -1.0271486043930054, "logits/rejected": -0.9650506973266602, "logps/chosen": -0.5105966329574585, "logps/rejected": -0.6147508025169373, "loss": 3.7493, "rewards/accuracies": 0.71875, "rewards/chosen": -5.105965614318848, "rewards/margins": 1.0415422916412354, "rewards/rejected": -6.147508144378662, "semantic_entropy": 0.9853399991989136, "step": 170 }, { "epoch": 0.3825136612021858, "grad_norm": 192.89943041726585, "learning_rate": 7.759950505873521e-07, "logits/chosen": -1.0819661617279053, "logits/rejected": -1.050621747970581, "logps/chosen": -0.5413715839385986, "logps/rejected": -0.6315657496452332, "loss": 3.5161, "rewards/accuracies": 0.65625, "rewards/chosen": -5.413715362548828, "rewards/margins": 0.9019424319267273, "rewards/rejected": -6.315657615661621, "semantic_entropy": 0.9645811319351196, "step": 175 }, { "epoch": 0.39344262295081966, "grad_norm": 152.89832261081025, "learning_rate": 7.598629604744872e-07, "logits/chosen": -1.0869873762130737, "logits/rejected": -1.0785080194473267, "logps/chosen": -0.5242325067520142, "logps/rejected": -0.7012667655944824, "loss": 3.3521, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -5.242325305938721, "rewards/margins": 1.7703425884246826, "rewards/rejected": -7.012667655944824, "semantic_entropy": 1.0011693239212036, "step": 180 }, { "epoch": 0.40437158469945356, "grad_norm": 150.16456326171243, "learning_rate": 7.433513390357989e-07, "logits/chosen": -1.1106479167938232, "logits/rejected": -1.1203057765960693, "logps/chosen": -0.5613775253295898, "logps/rejected": -0.7369329333305359, "loss": 3.3325, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -5.61377477645874, "rewards/margins": 1.755553960800171, "rewards/rejected": -7.36932897567749, "semantic_entropy": 1.0098798274993896, "step": 185 }, { "epoch": 0.41530054644808745, "grad_norm": 177.03981164325452, "learning_rate": 7.264843015879321e-07, "logits/chosen": -1.118817925453186, "logits/rejected": -1.075656771659851, "logps/chosen": -0.5620681047439575, "logps/rejected": -0.7573049664497375, "loss": 3.3889, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -5.620680332183838, "rewards/margins": 1.9523694515228271, "rewards/rejected": -7.573049068450928, "semantic_entropy": 0.998263955116272, "step": 190 }, { "epoch": 0.4262295081967213, "grad_norm": 126.76170022708793, "learning_rate": 7.092864825346266e-07, "logits/chosen": -1.1385692358016968, "logits/rejected": -1.1158863306045532, "logps/chosen": -0.6738488078117371, "logps/rejected": -0.8919061422348022, "loss": 3.431, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -6.73848819732666, "rewards/margins": 2.180572986602783, "rewards/rejected": -8.919061660766602, "semantic_entropy": 0.9914792776107788, "step": 195 }, { "epoch": 0.4371584699453552, "grad_norm": 146.12776815167663, "learning_rate": 6.917829993880302e-07, "logits/chosen": -1.1107165813446045, "logits/rejected": -1.0282893180847168, "logps/chosen": -0.6398170590400696, "logps/rejected": -0.8206952810287476, "loss": 3.2848, "rewards/accuracies": 0.78125, "rewards/chosen": -6.398170471191406, "rewards/margins": 1.8087825775146484, "rewards/rejected": -8.206953048706055, "semantic_entropy": 1.0069334506988525, "step": 200 }, { "epoch": 0.44808743169398907, "grad_norm": 114.70543290599623, "learning_rate": 6.739994160844309e-07, "logits/chosen": -1.0968120098114014, "logits/rejected": -1.1100647449493408, "logps/chosen": -0.6191304326057434, "logps/rejected": -0.8075233697891235, "loss": 3.113, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -6.1913042068481445, "rewards/margins": 1.8839294910430908, "rewards/rejected": -8.075233459472656, "semantic_entropy": 1.0141515731811523, "step": 205 }, { "epoch": 0.45901639344262296, "grad_norm": 207.23320066468338, "learning_rate": 6.559617056479827e-07, "logits/chosen": -1.1229215860366821, "logits/rejected": -1.1227028369903564, "logps/chosen": -0.6795850396156311, "logps/rejected": -0.9186260104179382, "loss": 3.1442, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -6.7958502769470215, "rewards/margins": 2.3904099464416504, "rewards/rejected": -9.186259269714355, "semantic_entropy": 0.9720403552055359, "step": 210 }, { "epoch": 0.46994535519125685, "grad_norm": 154.50502979258272, "learning_rate": 6.376962122569567e-07, "logits/chosen": -1.1284302473068237, "logits/rejected": -1.0769437551498413, "logps/chosen": -0.6974600553512573, "logps/rejected": -0.9393995404243469, "loss": 3.4054, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.974600315093994, "rewards/margins": 2.4193947315216064, "rewards/rejected": -9.39399528503418, "semantic_entropy": 1.0159441232681274, "step": 215 }, { "epoch": 0.4808743169398907, "grad_norm": 138.6943703552703, "learning_rate": 6.192296127679192e-07, "logits/chosen": -1.1995110511779785, "logits/rejected": -1.1735769510269165, "logps/chosen": -0.7303147912025452, "logps/rejected": -0.9362783432006836, "loss": 3.1146, "rewards/accuracies": 0.75, "rewards/chosen": -7.303147792816162, "rewards/margins": 2.0596346855163574, "rewards/rejected": -9.362783432006836, "semantic_entropy": 0.9887905120849609, "step": 220 }, { "epoch": 0.4918032786885246, "grad_norm": 120.75185480754473, "learning_rate": 6.005888777540319e-07, "logits/chosen": -1.2185839414596558, "logits/rejected": -1.1783679723739624, "logps/chosen": -0.7471610307693481, "logps/rejected": -0.9852075576782227, "loss": 3.1221, "rewards/accuracies": 0.8125, "rewards/chosen": -7.471610069274902, "rewards/margins": 2.380465030670166, "rewards/rejected": -9.852075576782227, "semantic_entropy": 0.9979068040847778, "step": 225 }, { "epoch": 0.5027322404371585, "grad_norm": 122.09354415954165, "learning_rate": 5.818012321143773e-07, "logits/chosen": -1.1167972087860107, "logits/rejected": -1.1158543825149536, "logps/chosen": -0.7506524920463562, "logps/rejected": -1.0082188844680786, "loss": 3.1675, "rewards/accuracies": 0.8125, "rewards/chosen": -7.50652551651001, "rewards/margins": 2.5756633281707764, "rewards/rejected": -10.082188606262207, "semantic_entropy": 0.993320643901825, "step": 230 }, { "epoch": 0.5136612021857924, "grad_norm": 147.5670346142303, "learning_rate": 5.628941153118388e-07, "logits/chosen": -1.1291579008102417, "logits/rejected": -1.0918009281158447, "logps/chosen": -0.775924563407898, "logps/rejected": -1.0093990564346313, "loss": 2.9612, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -7.759246826171875, "rewards/margins": 2.3347439765930176, "rewards/rejected": -10.093989372253418, "semantic_entropy": 0.981053352355957, "step": 235 }, { "epoch": 0.5245901639344263, "grad_norm": 133.14046441252583, "learning_rate": 5.438951412976098e-07, "logits/chosen": -1.1797640323638916, "logits/rejected": -1.1900447607040405, "logps/chosen": -0.7702494859695435, "logps/rejected": -1.056715488433838, "loss": 2.7098, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -7.702495574951172, "rewards/margins": 2.864659547805786, "rewards/rejected": -10.567155838012695, "semantic_entropy": 0.991602897644043, "step": 240 }, { "epoch": 0.5355191256830601, "grad_norm": 133.0444183672, "learning_rate": 5.248320581808619e-07, "logits/chosen": -1.1087061166763306, "logits/rejected": -1.0653207302093506, "logps/chosen": -0.792129397392273, "logps/rejected": -1.075703501701355, "loss": 2.9413, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -7.921294212341309, "rewards/margins": 2.835740566253662, "rewards/rejected": -10.757036209106445, "semantic_entropy": 0.9732475280761719, "step": 245 }, { "epoch": 0.546448087431694, "grad_norm": 122.1994564086186, "learning_rate": 5.057327077024744e-07, "logits/chosen": -1.1738090515136719, "logits/rejected": -1.1409178972244263, "logps/chosen": -0.806613564491272, "logps/rejected": -1.0335161685943604, "loss": 3.0864, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -8.06613540649414, "rewards/margins": 2.2690269947052, "rewards/rejected": -10.335162162780762, "semantic_entropy": 0.9622586965560913, "step": 250 }, { "epoch": 0.5573770491803278, "grad_norm": 232.10642636329976, "learning_rate": 4.866249845720132e-07, "logits/chosen": -1.1730918884277344, "logits/rejected": -1.1448460817337036, "logps/chosen": -0.8790150880813599, "logps/rejected": -1.1891063451766968, "loss": 2.8134, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.790151596069336, "rewards/margins": 3.1009116172790527, "rewards/rejected": -11.891061782836914, "semantic_entropy": 0.9595390558242798, "step": 255 }, { "epoch": 0.5683060109289617, "grad_norm": 131.2755348822229, "learning_rate": 4.675367957273505e-07, "logits/chosen": -1.141157865524292, "logits/rejected": -1.1335737705230713, "logps/chosen": -0.8586977124214172, "logps/rejected": -1.1454424858093262, "loss": 2.8892, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -8.586977005004883, "rewards/margins": 2.8674476146698, "rewards/rejected": -11.454424858093262, "semantic_entropy": 0.9425627589225769, "step": 260 }, { "epoch": 0.5792349726775956, "grad_norm": 140.64129624293926, "learning_rate": 4.4849601957642285e-07, "logits/chosen": -1.1671048402786255, "logits/rejected": -1.1342850923538208, "logps/chosen": -0.884295642375946, "logps/rejected": -1.1824935674667358, "loss": 2.8754, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -8.84295654296875, "rewards/margins": 2.9819793701171875, "rewards/rejected": -11.824935913085938, "semantic_entropy": 0.9442464709281921, "step": 265 }, { "epoch": 0.5901639344262295, "grad_norm": 147.4714622434341, "learning_rate": 4.295304652806592e-07, "logits/chosen": -1.161628246307373, "logits/rejected": -1.1403006315231323, "logps/chosen": -0.9028175473213196, "logps/rejected": -1.2475910186767578, "loss": 2.6617, "rewards/accuracies": 0.8125, "rewards/chosen": -9.028175354003906, "rewards/margins": 3.447734832763672, "rewards/rejected": -12.475909233093262, "semantic_entropy": 0.9366561770439148, "step": 270 }, { "epoch": 0.6010928961748634, "grad_norm": 103.487924065158, "learning_rate": 4.106678321395433e-07, "logits/chosen": -1.149213194847107, "logits/rejected": -1.091584324836731, "logps/chosen": -0.944200873374939, "logps/rejected": -1.1400898694992065, "loss": 2.9059, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -9.442008972167969, "rewards/margins": 1.9588892459869385, "rewards/rejected": -11.400897979736328, "semantic_entropy": 0.9353906512260437, "step": 275 }, { "epoch": 0.6120218579234973, "grad_norm": 134.66475933979152, "learning_rate": 3.9193566913562915e-07, "logits/chosen": -1.112555742263794, "logits/rejected": -1.1183173656463623, "logps/chosen": -0.9673782587051392, "logps/rejected": -1.330810308456421, "loss": 2.8642, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -9.673782348632812, "rewards/margins": 3.6343204975128174, "rewards/rejected": -13.30810260772705, "semantic_entropy": 0.8982528448104858, "step": 280 }, { "epoch": 0.6229508196721312, "grad_norm": 122.90153644042246, "learning_rate": 3.7336133469909623e-07, "logits/chosen": -1.2511584758758545, "logits/rejected": -1.2238370180130005, "logps/chosen": -0.9596298933029175, "logps/rejected": -1.3390361070632935, "loss": 2.5195, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -9.59630012512207, "rewards/margins": 3.794060468673706, "rewards/rejected": -13.390359878540039, "semantic_entropy": 0.9061362147331238, "step": 285 }, { "epoch": 0.6338797814207651, "grad_norm": 155.0095981855771, "learning_rate": 3.549719567506076e-07, "logits/chosen": -1.190308690071106, "logits/rejected": -1.1568708419799805, "logps/chosen": -1.0390634536743164, "logps/rejected": -1.3806968927383423, "loss": 2.8348, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -10.390633583068848, "rewards/margins": 3.4163336753845215, "rewards/rejected": -13.806968688964844, "semantic_entropy": 0.8884953260421753, "step": 290 }, { "epoch": 0.644808743169399, "grad_norm": 128.49285257186182, "learning_rate": 3.3679439308082774e-07, "logits/chosen": -1.1692029237747192, "logits/rejected": -1.172719120979309, "logps/chosen": -1.0511713027954102, "logps/rejected": -1.4550367593765259, "loss": 2.2826, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -10.511713027954102, "rewards/margins": 4.0386552810668945, "rewards/rejected": -14.55036735534668, "semantic_entropy": 0.878921389579773, "step": 295 }, { "epoch": 0.6557377049180327, "grad_norm": 113.97800775884723, "learning_rate": 3.1885519212446716e-07, "logits/chosen": -1.2200191020965576, "logits/rejected": -1.209221601486206, "logps/chosen": -1.0968992710113525, "logps/rejected": -1.4734210968017578, "loss": 2.5534, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -10.968993186950684, "rewards/margins": 3.7652194499969482, "rewards/rejected": -14.734212875366211, "semantic_entropy": 0.8491713404655457, "step": 300 }, { "epoch": 0.6666666666666666, "grad_norm": 153.40901841678263, "learning_rate": 3.0118055418614295e-07, "logits/chosen": -1.210296392440796, "logits/rejected": -1.166100263595581, "logps/chosen": -1.0915873050689697, "logps/rejected": -1.5003201961517334, "loss": 2.7077, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -10.915873527526855, "rewards/margins": 4.087328910827637, "rewards/rejected": -15.003199577331543, "semantic_entropy": 0.851865291595459, "step": 305 }, { "epoch": 0.6775956284153005, "grad_norm": 188.92294277142176, "learning_rate": 2.83796293174686e-07, "logits/chosen": -1.144047498703003, "logits/rejected": -1.1552207469940186, "logps/chosen": -1.1141725778579712, "logps/rejected": -1.5596559047698975, "loss": 2.9239, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -11.141725540161133, "rewards/margins": 4.454832077026367, "rewards/rejected": -15.5965576171875, "semantic_entropy": 0.8461610078811646, "step": 310 }, { "epoch": 0.6885245901639344, "grad_norm": 158.1416950642293, "learning_rate": 2.6672779890178046e-07, "logits/chosen": -1.1891381740570068, "logits/rejected": -1.1934657096862793, "logps/chosen": -1.1909846067428589, "logps/rejected": -1.4780324697494507, "loss": 2.6254, "rewards/accuracies": 0.75, "rewards/chosen": -11.9098482131958, "rewards/margins": 2.8704779148101807, "rewards/rejected": -14.780324935913086, "semantic_entropy": 0.8265172243118286, "step": 315 }, { "epoch": 0.6994535519125683, "grad_norm": 117.90491663822478, "learning_rate": 2.500000000000001e-07, "logits/chosen": -1.2632883787155151, "logits/rejected": -1.2213891744613647, "logps/chosen": -1.2055654525756836, "logps/rejected": -1.6077735424041748, "loss": 2.6783, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -12.05565357208252, "rewards/margins": 4.022082805633545, "rewards/rejected": -16.077735900878906, "semantic_entropy": 0.8192012906074524, "step": 320 }, { "epoch": 0.7103825136612022, "grad_norm": 164.40962706267163, "learning_rate": 2.3363732751439923e-07, "logits/chosen": -1.2160775661468506, "logits/rejected": -1.205322027206421, "logps/chosen": -1.1661893129348755, "logps/rejected": -1.5277663469314575, "loss": 2.6624, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -11.661892890930176, "rewards/margins": 3.615769863128662, "rewards/rejected": -15.27766227722168, "semantic_entropy": 0.823623538017273, "step": 325 }, { "epoch": 0.7213114754098361, "grad_norm": 107.16070244814924, "learning_rate": 2.1766367922083283e-07, "logits/chosen": -1.1548130512237549, "logits/rejected": -1.1359449625015259, "logps/chosen": -1.1202858686447144, "logps/rejected": -1.626604676246643, "loss": 2.5554, "rewards/accuracies": 0.84375, "rewards/chosen": -11.20285701751709, "rewards/margins": 5.063187599182129, "rewards/rejected": -16.26604461669922, "semantic_entropy": 0.8284898996353149, "step": 330 }, { "epoch": 0.73224043715847, "grad_norm": 143.54805923440733, "learning_rate": 2.021023847231202e-07, "logits/chosen": -1.1354625225067139, "logits/rejected": -1.1055997610092163, "logps/chosen": -1.2420397996902466, "logps/rejected": -1.6342121362686157, "loss": 2.5194, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -12.42039680480957, "rewards/margins": 3.9217231273651123, "rewards/rejected": -16.342121124267578, "semantic_entropy": 0.8055655360221863, "step": 335 }, { "epoch": 0.7431693989071039, "grad_norm": 143.90601843179704, "learning_rate": 1.869761713800254e-07, "logits/chosen": -1.159055471420288, "logits/rejected": -1.1217933893203735, "logps/chosen": -1.216088056564331, "logps/rejected": -1.6224826574325562, "loss": 2.6871, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -12.160881996154785, "rewards/margins": 4.0639448165893555, "rewards/rejected": -16.22482681274414, "semantic_entropy": 0.7940818667411804, "step": 340 }, { "epoch": 0.7540983606557377, "grad_norm": 146.77126754239146, "learning_rate": 1.7230713111182164e-07, "logits/chosen": -1.2234665155410767, "logits/rejected": -1.2274234294891357, "logps/chosen": -1.2689043283462524, "logps/rejected": -1.6999661922454834, "loss": 2.6683, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -12.689043998718262, "rewards/margins": 4.310617923736572, "rewards/rejected": -16.99966049194336, "semantic_entropy": 0.7804916501045227, "step": 345 }, { "epoch": 0.7650273224043715, "grad_norm": 156.59132030322806, "learning_rate": 1.5811668813491696e-07, "logits/chosen": -1.2142775058746338, "logits/rejected": -1.1987954378128052, "logps/chosen": -1.1977766752243042, "logps/rejected": -1.5605593919754028, "loss": 2.6148, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -11.977766036987305, "rewards/margins": 3.62782621383667, "rewards/rejected": -15.605592727661133, "semantic_entropy": 0.8086638450622559, "step": 350 }, { "epoch": 0.7759562841530054, "grad_norm": 144.3372503884412, "learning_rate": 1.4442556767166369e-07, "logits/chosen": -1.1762679815292358, "logits/rejected": -1.1479089260101318, "logps/chosen": -1.193447470664978, "logps/rejected": -1.6105226278305054, "loss": 2.5697, "rewards/accuracies": 0.84375, "rewards/chosen": -11.934475898742676, "rewards/margins": 4.170751094818115, "rewards/rejected": -16.105228424072266, "semantic_entropy": 0.8080552220344543, "step": 355 }, { "epoch": 0.7868852459016393, "grad_norm": 148.5272314733488, "learning_rate": 1.312537656810549e-07, "logits/chosen": -1.135874629020691, "logits/rejected": -1.1405253410339355, "logps/chosen": -1.242232322692871, "logps/rejected": -1.6316314935684204, "loss": 2.629, "rewards/accuracies": 0.75, "rewards/chosen": -12.422323226928711, "rewards/margins": 3.8939926624298096, "rewards/rejected": -16.316316604614258, "semantic_entropy": 0.8038153648376465, "step": 360 }, { "epoch": 0.7978142076502732, "grad_norm": 189.12080572010774, "learning_rate": 1.1862051965451214e-07, "logits/chosen": -1.216966152191162, "logits/rejected": -1.2227869033813477, "logps/chosen": -1.2868871688842773, "logps/rejected": -1.718698263168335, "loss": 2.5421, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -12.868871688842773, "rewards/margins": 4.318110466003418, "rewards/rejected": -17.18698501586914, "semantic_entropy": 0.7714166045188904, "step": 365 }, { "epoch": 0.8087431693989071, "grad_norm": 130.34992194933218, "learning_rate": 1.0654428051942138e-07, "logits/chosen": -1.2347556352615356, "logits/rejected": -1.2042248249053955, "logps/chosen": -1.3064343929290771, "logps/rejected": -1.777515172958374, "loss": 2.6716, "rewards/accuracies": 0.78125, "rewards/chosen": -13.06434440612793, "rewards/margins": 4.710807800292969, "rewards/rejected": -17.7751522064209, "semantic_entropy": 0.7663524150848389, "step": 370 }, { "epoch": 0.819672131147541, "grad_norm": 120.52547478396558, "learning_rate": 9.504268569144763e-08, "logits/chosen": -1.2310011386871338, "logits/rejected": -1.179221749305725, "logps/chosen": -1.2675514221191406, "logps/rejected": -1.7146999835968018, "loss": 2.5273, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -12.675516128540039, "rewards/margins": 4.4714837074279785, "rewards/rejected": -17.14699935913086, "semantic_entropy": 0.7719418406486511, "step": 375 }, { "epoch": 0.8306010928961749, "grad_norm": 134.85584452412903, "learning_rate": 8.413253331499049e-08, "logits/chosen": -1.1171958446502686, "logits/rejected": -1.1349446773529053, "logps/chosen": -1.2904092073440552, "logps/rejected": -1.6963565349578857, "loss": 2.5116, "rewards/accuracies": 0.84375, "rewards/chosen": -12.904090881347656, "rewards/margins": 4.059475898742676, "rewards/rejected": -16.96356773376465, "semantic_entropy": 0.7810186147689819, "step": 380 }, { "epoch": 0.8415300546448088, "grad_norm": 137.9880749898867, "learning_rate": 7.382975772939865e-08, "logits/chosen": -1.219100832939148, "logits/rejected": -1.2109915018081665, "logps/chosen": -1.3701751232147217, "logps/rejected": -1.809171438217163, "loss": 2.8231, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -13.701749801635742, "rewards/margins": 4.389963626861572, "rewards/rejected": -18.09171485900879, "semantic_entropy": 0.7575483918190002, "step": 385 }, { "epoch": 0.8524590163934426, "grad_norm": 192.7458723286622, "learning_rate": 6.414940619677734e-08, "logits/chosen": -1.2024834156036377, "logits/rejected": -1.1897705793380737, "logps/chosen": -1.2859004735946655, "logps/rejected": -1.8171262741088867, "loss": 2.4732, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -12.85900592803955, "rewards/margins": 5.312258243560791, "rewards/rejected": -18.171262741088867, "semantic_entropy": 0.77564936876297, "step": 390 }, { "epoch": 0.8633879781420765, "grad_norm": 182.36527261020933, "learning_rate": 5.5105616925376296e-08, "logits/chosen": -1.1923558712005615, "logits/rejected": -1.1765515804290771, "logps/chosen": -1.3574202060699463, "logps/rejected": -1.7072795629501343, "loss": 2.4702, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -13.574200630187988, "rewards/margins": 3.4985928535461426, "rewards/rejected": -17.07279396057129, "semantic_entropy": 0.7606626749038696, "step": 395 }, { "epoch": 0.8743169398907104, "grad_norm": 156.32689835666156, "learning_rate": 4.6711598420656976e-08, "logits/chosen": -1.144141435623169, "logits/rejected": -1.1195942163467407, "logps/chosen": -1.3408299684524536, "logps/rejected": -1.8310232162475586, "loss": 2.3262, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -13.408299446105957, "rewards/margins": 4.901931285858154, "rewards/rejected": -18.310230255126953, "semantic_entropy": 0.7442251443862915, "step": 400 }, { "epoch": 0.8743169398907104, "eval_logits/chosen": -1.3813101053237915, "eval_logits/rejected": -1.3473328351974487, "eval_logps/chosen": -1.2879669666290283, "eval_logps/rejected": -1.739721655845642, "eval_loss": 2.5607941150665283, "eval_rewards/accuracies": 0.8072289228439331, "eval_rewards/chosen": -12.879671096801758, "eval_rewards/margins": 4.517546653747559, "eval_rewards/rejected": -17.397218704223633, "eval_runtime": 36.6792, "eval_samples_per_second": 35.933, "eval_semantic_entropy": 0.7719007730484009, "eval_steps_per_second": 2.263, "step": 400 }, { "epoch": 0.8852459016393442, "grad_norm": 148.89570610138972, "learning_rate": 3.897961019419516e-08, "logits/chosen": -1.1572606563568115, "logits/rejected": -1.1004865169525146, "logps/chosen": -1.2273566722869873, "logps/rejected": -1.640681266784668, "loss": 2.4505, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -12.273566246032715, "rewards/margins": 4.13324499130249, "rewards/rejected": -16.406810760498047, "semantic_entropy": 0.7908967137336731, "step": 405 }, { "epoch": 0.8961748633879781, "grad_norm": 135.13513698899922, "learning_rate": 3.192094485859526e-08, "logits/chosen": -1.164189100265503, "logits/rejected": -1.1957439184188843, "logps/chosen": -1.340441346168518, "logps/rejected": -1.8204329013824463, "loss": 2.5453, "rewards/accuracies": 0.78125, "rewards/chosen": -13.404413223266602, "rewards/margins": 4.799916744232178, "rewards/rejected": -18.204330444335938, "semantic_entropy": 0.7424927949905396, "step": 410 }, { "epoch": 0.907103825136612, "grad_norm": 138.1252447502212, "learning_rate": 2.5545911634565265e-08, "logits/chosen": -1.2133328914642334, "logits/rejected": -1.217905879020691, "logps/chosen": -1.3356597423553467, "logps/rejected": -1.8359012603759766, "loss": 2.719, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -13.356595993041992, "rewards/margins": 5.002416133880615, "rewards/rejected": -18.359012603759766, "semantic_entropy": 0.7578593492507935, "step": 415 }, { "epoch": 0.9180327868852459, "grad_norm": 133.36843556957237, "learning_rate": 1.9863821294241522e-08, "logits/chosen": -1.2045748233795166, "logits/rejected": -1.188718557357788, "logps/chosen": -1.3029712438583374, "logps/rejected": -1.7860324382781982, "loss": 2.3706, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -13.029711723327637, "rewards/margins": 4.830612659454346, "rewards/rejected": -17.86032485961914, "semantic_entropy": 0.7500916123390198, "step": 420 }, { "epoch": 0.9289617486338798, "grad_norm": 139.42311056578998, "learning_rate": 1.4882972562753615e-08, "logits/chosen": -1.2088757753372192, "logits/rejected": -1.2028796672821045, "logps/chosen": -1.4182978868484497, "logps/rejected": -1.899735450744629, "loss": 2.6805, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.182978630065918, "rewards/margins": 4.814376354217529, "rewards/rejected": -18.997356414794922, "semantic_entropy": 0.7167907953262329, "step": 425 }, { "epoch": 0.9398907103825137, "grad_norm": 138.05690419269808, "learning_rate": 1.0610639997888915e-08, "logits/chosen": -1.1360652446746826, "logits/rejected": -1.1394312381744385, "logps/chosen": -1.265080451965332, "logps/rejected": -1.7739555835723877, "loss": 2.2297, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -12.65080451965332, "rewards/margins": 5.0887532234191895, "rewards/rejected": -17.73955535888672, "semantic_entropy": 0.7776240110397339, "step": 430 }, { "epoch": 0.9508196721311475, "grad_norm": 156.31606318677163, "learning_rate": 7.053063365559997e-09, "logits/chosen": -1.1962147951126099, "logits/rejected": -1.2261369228363037, "logps/chosen": -1.330643892288208, "logps/rejected": -1.8400895595550537, "loss": 2.397, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -13.306437492370605, "rewards/margins": 5.094459056854248, "rewards/rejected": -18.400897979736328, "semantic_entropy": 0.7367173433303833, "step": 435 }, { "epoch": 0.9617486338797814, "grad_norm": 137.08276067025474, "learning_rate": 4.215438526591064e-09, "logits/chosen": -1.1730583906173706, "logits/rejected": -1.140211582183838, "logps/chosen": -1.3757076263427734, "logps/rejected": -1.7452484369277954, "loss": 2.5885, "rewards/accuracies": 0.78125, "rewards/chosen": -13.757074356079102, "rewards/margins": 3.6954073905944824, "rewards/rejected": -17.452484130859375, "semantic_entropy": 0.7384070754051208, "step": 440 }, { "epoch": 0.9726775956284153, "grad_norm": 153.84420514847153, "learning_rate": 2.1019098481337426e-09, "logits/chosen": -1.2059695720672607, "logits/rejected": -1.1932657957077026, "logps/chosen": -1.2917059659957886, "logps/rejected": -1.7843118906021118, "loss": 2.4083, "rewards/accuracies": 0.8125, "rewards/chosen": -12.917058944702148, "rewards/margins": 4.926059246063232, "rewards/rejected": -17.843120574951172, "semantic_entropy": 0.7641780376434326, "step": 445 }, { "epoch": 0.9836065573770492, "grad_norm": 169.02746393168292, "learning_rate": 7.155641507955445e-10, "logits/chosen": -1.122040033340454, "logits/rejected": -1.1226763725280762, "logps/chosen": -1.3791415691375732, "logps/rejected": -1.798413872718811, "loss": 2.6799, "rewards/accuracies": 0.8125, "rewards/chosen": -13.791415214538574, "rewards/margins": 4.192723274230957, "rewards/rejected": -17.98413848876953, "semantic_entropy": 0.7384847402572632, "step": 450 }, { "epoch": 0.994535519125683, "grad_norm": 160.91531446057363, "learning_rate": 5.842620032053824e-11, "logits/chosen": -1.1409598588943481, "logits/rejected": -1.139664649963379, "logps/chosen": -1.3967525959014893, "logps/rejected": -1.73647940158844, "loss": 2.866, "rewards/accuracies": 0.75, "rewards/chosen": -13.96752643585205, "rewards/margins": 3.3972675800323486, "rewards/rejected": -17.364791870117188, "semantic_entropy": 0.7418167591094971, "step": 455 }, { "epoch": 0.9989071038251366, "step": 457, "total_flos": 0.0, "train_loss": 3.4314852449513107, "train_runtime": 5934.8281, "train_samples_per_second": 9.867, "train_steps_per_second": 0.077 } ], "logging_steps": 5, "max_steps": 457, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }