{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020920502092050207, "grad_norm": 4.2377470321442345, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -2.516148090362549, "logits/rejected": -2.4595022201538086, "logps/chosen": -1.2051799297332764, "logps/rejected": -1.1685211658477783, "loss": -0.0047, "rewards/accuracies": 0.5, "rewards/chosen": 0.3819315433502197, "rewards/margins": -0.0013609901070594788, "rewards/rejected": 0.3832925260066986, "step": 1 }, { "epoch": 0.0041841004184100415, "grad_norm": 4.616200091383757, "learning_rate": 2.083333333333333e-08, "logits/chosen": -2.4128036499023438, "logits/rejected": -2.479793071746826, "logps/chosen": -1.2863150835037231, "logps/rejected": -1.0149686336517334, "loss": -0.0103, "rewards/accuracies": 0.4375, "rewards/chosen": 0.34382855892181396, "rewards/margins": -0.057850055396556854, "rewards/rejected": 0.4016786217689514, "step": 2 }, { "epoch": 0.006276150627615063, "grad_norm": 3.700184874168768, "learning_rate": 3.125e-08, "logits/chosen": -2.678924083709717, "logits/rejected": -2.668107032775879, "logps/chosen": -0.8378489017486572, "logps/rejected": -1.1761590242385864, "loss": -0.0625, "rewards/accuracies": 0.75, "rewards/chosen": 0.44425511360168457, "rewards/margins": 0.11912594735622406, "rewards/rejected": 0.3251291811466217, "step": 3 }, { "epoch": 0.008368200836820083, "grad_norm": 3.754675176110523, "learning_rate": 4.166666666666666e-08, "logits/chosen": -2.6950135231018066, "logits/rejected": -2.7032535076141357, "logps/chosen": -0.9283154010772705, "logps/rejected": -0.842531681060791, "loss": -0.0123, "rewards/accuracies": 0.375, "rewards/chosen": 0.4067751169204712, "rewards/margins": -0.049619611352682114, "rewards/rejected": 0.4563947319984436, "step": 4 }, { "epoch": 0.010460251046025104, "grad_norm": 3.75972351843305, "learning_rate": 5.208333333333333e-08, "logits/chosen": -2.7333128452301025, "logits/rejected": -2.6697793006896973, "logps/chosen": -0.8777127265930176, "logps/rejected": -1.043122410774231, "loss": -0.0324, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4418213367462158, "rewards/margins": 0.02298763580620289, "rewards/rejected": 0.41883373260498047, "step": 5 }, { "epoch": 0.012552301255230125, "grad_norm": 3.7912148566272696, "learning_rate": 6.25e-08, "logits/chosen": -2.8148372173309326, "logits/rejected": -2.755297899246216, "logps/chosen": -1.0822432041168213, "logps/rejected": -1.2278616428375244, "loss": -0.015, "rewards/accuracies": 0.5, "rewards/chosen": 0.35870179533958435, "rewards/margins": 0.02633710391819477, "rewards/rejected": 0.3323647081851959, "step": 6 }, { "epoch": 0.014644351464435146, "grad_norm": 4.260569724370287, "learning_rate": 7.291666666666667e-08, "logits/chosen": -2.7257280349731445, "logits/rejected": -2.6599254608154297, "logps/chosen": -1.0147705078125, "logps/rejected": -0.9999724626541138, "loss": -0.0232, "rewards/accuracies": 0.625, "rewards/chosen": 0.38983920216560364, "rewards/margins": 0.007026904262602329, "rewards/rejected": 0.3828122913837433, "step": 7 }, { "epoch": 0.016736401673640166, "grad_norm": 4.363722516784404, "learning_rate": 8.333333333333333e-08, "logits/chosen": -2.8212952613830566, "logits/rejected": -2.65653920173645, "logps/chosen": -0.971604585647583, "logps/rejected": -1.1535077095031738, "loss": -0.0233, "rewards/accuracies": 0.75, "rewards/chosen": 0.4037427604198456, "rewards/margins": 0.0658140704035759, "rewards/rejected": 0.3379287123680115, "step": 8 }, { "epoch": 0.01882845188284519, "grad_norm": 4.498762808376351, "learning_rate": 9.375e-08, "logits/chosen": -2.692415237426758, "logits/rejected": -2.6526036262512207, "logps/chosen": -0.9371532201766968, "logps/rejected": -1.1771211624145508, "loss": -0.0324, "rewards/accuracies": 0.625, "rewards/chosen": 0.4131394028663635, "rewards/margins": 0.04278501123189926, "rewards/rejected": 0.37035441398620605, "step": 9 }, { "epoch": 0.02092050209205021, "grad_norm": 4.174864879865781, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.5466957092285156, "logits/rejected": -2.448245048522949, "logps/chosen": -1.09194016456604, "logps/rejected": -1.0789859294891357, "loss": -0.0433, "rewards/accuracies": 0.5625, "rewards/chosen": 0.382916659116745, "rewards/margins": 0.029688388109207153, "rewards/rejected": 0.35322827100753784, "step": 10 }, { "epoch": 0.02301255230125523, "grad_norm": 3.8103486541857396, "learning_rate": 1.1458333333333332e-07, "logits/chosen": -2.755258083343506, "logits/rejected": -2.5368123054504395, "logps/chosen": -0.8456466794013977, "logps/rejected": -1.1355555057525635, "loss": -0.0339, "rewards/accuracies": 0.75, "rewards/chosen": 0.44307994842529297, "rewards/margins": 0.10706950724124908, "rewards/rejected": 0.3360104560852051, "step": 11 }, { "epoch": 0.02510460251046025, "grad_norm": 3.6062416909583246, "learning_rate": 1.25e-07, "logits/chosen": -2.8309640884399414, "logits/rejected": -2.8085203170776367, "logps/chosen": -1.0716187953948975, "logps/rejected": -0.9773526191711426, "loss": -0.0023, "rewards/accuracies": 0.5, "rewards/chosen": 0.3665681481361389, "rewards/margins": -0.025565478950738907, "rewards/rejected": 0.3921336233615875, "step": 12 }, { "epoch": 0.027196652719665274, "grad_norm": 4.894676212438303, "learning_rate": 1.3541666666666666e-07, "logits/chosen": -2.646806001663208, "logits/rejected": -2.4753153324127197, "logps/chosen": -0.9714970588684082, "logps/rejected": -1.3113722801208496, "loss": -0.0457, "rewards/accuracies": 0.6875, "rewards/chosen": 0.406857430934906, "rewards/margins": 0.11795446276664734, "rewards/rejected": 0.2889029383659363, "step": 13 }, { "epoch": 0.029288702928870293, "grad_norm": 3.971879581235679, "learning_rate": 1.4583333333333335e-07, "logits/chosen": -2.527503252029419, "logits/rejected": -2.5251338481903076, "logps/chosen": -1.6883941888809204, "logps/rejected": -1.28494393825531, "loss": -0.0479, "rewards/accuracies": 0.375, "rewards/chosen": 0.3116728961467743, "rewards/margins": 0.010951630771160126, "rewards/rejected": 0.30072125792503357, "step": 14 }, { "epoch": 0.03138075313807531, "grad_norm": 4.3835348530172755, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -2.7007007598876953, "logits/rejected": -2.6135330200195312, "logps/chosen": -0.9208273887634277, "logps/rejected": -1.1965676546096802, "loss": -0.0258, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4235038161277771, "rewards/margins": 0.0690305233001709, "rewards/rejected": 0.3544732928276062, "step": 15 }, { "epoch": 0.03347280334728033, "grad_norm": 4.802927181660066, "learning_rate": 1.6666666666666665e-07, "logits/chosen": -2.7196192741394043, "logits/rejected": -2.704707145690918, "logps/chosen": -1.0318691730499268, "logps/rejected": -1.019579291343689, "loss": -0.0248, "rewards/accuracies": 0.3125, "rewards/chosen": 0.37734121084213257, "rewards/margins": -0.018189536407589912, "rewards/rejected": 0.3955307602882385, "step": 16 }, { "epoch": 0.03556485355648536, "grad_norm": 3.624971192344114, "learning_rate": 1.7708333333333334e-07, "logits/chosen": -2.555588960647583, "logits/rejected": -2.5805137157440186, "logps/chosen": -0.8474312424659729, "logps/rejected": -0.9811149835586548, "loss": -0.0411, "rewards/accuracies": 0.75, "rewards/chosen": 0.44956469535827637, "rewards/margins": 0.05308142676949501, "rewards/rejected": 0.39648327231407166, "step": 17 }, { "epoch": 0.03765690376569038, "grad_norm": 4.118639811082369, "learning_rate": 1.875e-07, "logits/chosen": -2.728419303894043, "logits/rejected": -2.6613481044769287, "logps/chosen": -0.9432893991470337, "logps/rejected": -1.0357279777526855, "loss": -0.0093, "rewards/accuracies": 0.5625, "rewards/chosen": 0.40460628271102905, "rewards/margins": 0.005164351314306259, "rewards/rejected": 0.3994419276714325, "step": 18 }, { "epoch": 0.0397489539748954, "grad_norm": 4.70344861713647, "learning_rate": 1.9791666666666664e-07, "logits/chosen": -2.632000684738159, "logits/rejected": -2.435832977294922, "logps/chosen": -1.014740228652954, "logps/rejected": -1.2633649110794067, "loss": -0.044, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3806372284889221, "rewards/margins": 0.07575057446956635, "rewards/rejected": 0.30488666892051697, "step": 19 }, { "epoch": 0.04184100418410042, "grad_norm": 4.0685356493856935, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.7355053424835205, "logits/rejected": -2.6346383094787598, "logps/chosen": -1.0173437595367432, "logps/rejected": -1.0325090885162354, "loss": -0.0442, "rewards/accuracies": 0.625, "rewards/chosen": 0.3999063968658447, "rewards/margins": 0.006554506719112396, "rewards/rejected": 0.39335188269615173, "step": 20 }, { "epoch": 0.043933054393305436, "grad_norm": 4.535495847198634, "learning_rate": 2.1875e-07, "logits/chosen": -2.667809247970581, "logits/rejected": -2.7679128646850586, "logps/chosen": -0.9066150188446045, "logps/rejected": -1.042630672454834, "loss": -0.0631, "rewards/accuracies": 0.625, "rewards/chosen": 0.4318792223930359, "rewards/margins": 0.058360084891319275, "rewards/rejected": 0.3735191226005554, "step": 21 }, { "epoch": 0.04602510460251046, "grad_norm": 5.626761266687518, "learning_rate": 2.2916666666666663e-07, "logits/chosen": -2.7311110496520996, "logits/rejected": -2.657087802886963, "logps/chosen": -0.8237791657447815, "logps/rejected": -0.9569952487945557, "loss": -0.0306, "rewards/accuracies": 0.625, "rewards/chosen": 0.4589225649833679, "rewards/margins": 0.0617038793861866, "rewards/rejected": 0.3972187042236328, "step": 22 }, { "epoch": 0.04811715481171548, "grad_norm": 4.31976215520191, "learning_rate": 2.3958333333333335e-07, "logits/chosen": -2.5237855911254883, "logits/rejected": -2.485886573791504, "logps/chosen": -1.017724633216858, "logps/rejected": -1.073349118232727, "loss": -0.0512, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4005371034145355, "rewards/margins": 0.02935650758445263, "rewards/rejected": 0.37118059396743774, "step": 23 }, { "epoch": 0.0502092050209205, "grad_norm": 4.392600704421736, "learning_rate": 2.5e-07, "logits/chosen": -2.5538222789764404, "logits/rejected": -2.4816856384277344, "logps/chosen": -1.0112228393554688, "logps/rejected": -1.232728362083435, "loss": -0.0263, "rewards/accuracies": 0.5625, "rewards/chosen": 0.39029499888420105, "rewards/margins": 0.03938312828540802, "rewards/rejected": 0.35091185569763184, "step": 24 }, { "epoch": 0.05230125523012552, "grad_norm": 4.19926916757524, "learning_rate": 2.604166666666667e-07, "logits/chosen": -2.5918517112731934, "logits/rejected": -2.532200813293457, "logps/chosen": -0.8845524787902832, "logps/rejected": -0.7892788648605347, "loss": -0.0345, "rewards/accuracies": 0.3125, "rewards/chosen": 0.4380142092704773, "rewards/margins": -0.0516534224152565, "rewards/rejected": 0.4896676540374756, "step": 25 }, { "epoch": 0.05439330543933055, "grad_norm": 6.458780023037348, "learning_rate": 2.708333333333333e-07, "logits/chosen": -2.755127429962158, "logits/rejected": -2.6158037185668945, "logps/chosen": -0.9921547174453735, "logps/rejected": -1.0211769342422485, "loss": -0.0173, "rewards/accuracies": 0.625, "rewards/chosen": 0.3931275010108948, "rewards/margins": 0.010015063919126987, "rewards/rejected": 0.38311243057250977, "step": 26 }, { "epoch": 0.056485355648535567, "grad_norm": 5.258249431893916, "learning_rate": 2.8125e-07, "logits/chosen": -2.682135581970215, "logits/rejected": -2.555710792541504, "logps/chosen": -0.941645622253418, "logps/rejected": -1.0752073526382446, "loss": -0.0419, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4056612253189087, "rewards/margins": 0.021622449159622192, "rewards/rejected": 0.3840388059616089, "step": 27 }, { "epoch": 0.058577405857740586, "grad_norm": 4.888961752916909, "learning_rate": 2.916666666666667e-07, "logits/chosen": -2.7251815795898438, "logits/rejected": -2.621352195739746, "logps/chosen": -0.9367978572845459, "logps/rejected": -1.0780291557312012, "loss": -0.0295, "rewards/accuracies": 0.625, "rewards/chosen": 0.4540524184703827, "rewards/margins": 0.06387701630592346, "rewards/rejected": 0.39017540216445923, "step": 28 }, { "epoch": 0.060669456066945605, "grad_norm": 3.776672366733632, "learning_rate": 3.020833333333333e-07, "logits/chosen": -2.8015384674072266, "logits/rejected": -2.614657402038574, "logps/chosen": -0.8712697625160217, "logps/rejected": -0.9271025061607361, "loss": -0.0434, "rewards/accuracies": 0.375, "rewards/chosen": 0.4297908544540405, "rewards/margins": 0.002205170691013336, "rewards/rejected": 0.4275856614112854, "step": 29 }, { "epoch": 0.06276150627615062, "grad_norm": 5.648597556731242, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.6692140102386475, "logits/rejected": -2.67612624168396, "logps/chosen": -0.7322691679000854, "logps/rejected": -1.0025076866149902, "loss": -0.0516, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5026121139526367, "rewards/margins": 0.10446584224700928, "rewards/rejected": 0.39814627170562744, "step": 30 }, { "epoch": 0.06485355648535565, "grad_norm": 4.03210676684634, "learning_rate": 3.2291666666666666e-07, "logits/chosen": -2.7173359394073486, "logits/rejected": -2.658566951751709, "logps/chosen": -0.7469158172607422, "logps/rejected": -0.8179668188095093, "loss": -0.0637, "rewards/accuracies": 0.4375, "rewards/chosen": 0.48906075954437256, "rewards/margins": 0.03415411710739136, "rewards/rejected": 0.4549066722393036, "step": 31 }, { "epoch": 0.06694560669456066, "grad_norm": 4.276452711129558, "learning_rate": 3.333333333333333e-07, "logits/chosen": -2.8544907569885254, "logits/rejected": -2.737621307373047, "logps/chosen": -0.9466021060943604, "logps/rejected": -0.9423712491989136, "loss": -0.0211, "rewards/accuracies": 0.5625, "rewards/chosen": 0.40121617913246155, "rewards/margins": -0.005811708979308605, "rewards/rejected": 0.4070279002189636, "step": 32 }, { "epoch": 0.06903765690376569, "grad_norm": 4.391804275431816, "learning_rate": 3.4375e-07, "logits/chosen": -2.7092764377593994, "logits/rejected": -2.723233699798584, "logps/chosen": -0.7280018329620361, "logps/rejected": -0.8956056833267212, "loss": -0.0404, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5123242139816284, "rewards/margins": 0.07963782548904419, "rewards/rejected": 0.4326864182949066, "step": 33 }, { "epoch": 0.07112970711297072, "grad_norm": 4.277257017884931, "learning_rate": 3.541666666666667e-07, "logits/chosen": -2.6140832901000977, "logits/rejected": -2.687009811401367, "logps/chosen": -0.944001317024231, "logps/rejected": -1.021315336227417, "loss": -0.039, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4302595257759094, "rewards/margins": 0.0396876186132431, "rewards/rejected": 0.3905719220638275, "step": 34 }, { "epoch": 0.07322175732217573, "grad_norm": 4.903229579454535, "learning_rate": 3.645833333333333e-07, "logits/chosen": -2.6962552070617676, "logits/rejected": -2.609804630279541, "logps/chosen": -0.9382450580596924, "logps/rejected": -1.2346339225769043, "loss": -0.0714, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4204990267753601, "rewards/margins": 0.09388677775859833, "rewards/rejected": 0.3266122341156006, "step": 35 }, { "epoch": 0.07531380753138076, "grad_norm": 4.611264633523621, "learning_rate": 3.75e-07, "logits/chosen": -2.652193069458008, "logits/rejected": -2.685980796813965, "logps/chosen": -0.8813173770904541, "logps/rejected": -1.0486414432525635, "loss": -0.033, "rewards/accuracies": 0.5625, "rewards/chosen": 0.42412978410720825, "rewards/margins": 0.0556933730840683, "rewards/rejected": 0.36843645572662354, "step": 36 }, { "epoch": 0.07740585774058577, "grad_norm": 4.492060757383237, "learning_rate": 3.8541666666666665e-07, "logits/chosen": -2.502615451812744, "logits/rejected": -2.550412178039551, "logps/chosen": -0.8484123945236206, "logps/rejected": -0.8843783140182495, "loss": -0.0398, "rewards/accuracies": 0.5625, "rewards/chosen": 0.45689207315444946, "rewards/margins": 0.010739020071923733, "rewards/rejected": 0.44615304470062256, "step": 37 }, { "epoch": 0.0794979079497908, "grad_norm": 4.489890717193878, "learning_rate": 3.958333333333333e-07, "logits/chosen": -2.6360530853271484, "logits/rejected": -2.594210147857666, "logps/chosen": -0.9129270315170288, "logps/rejected": -0.8058098554611206, "loss": -0.0472, "rewards/accuracies": 0.5, "rewards/chosen": 0.4257778525352478, "rewards/margins": -0.04449187219142914, "rewards/rejected": 0.47026970982551575, "step": 38 }, { "epoch": 0.08158995815899582, "grad_norm": 6.383961680323578, "learning_rate": 4.0625e-07, "logits/chosen": -2.7180564403533936, "logits/rejected": -2.7788286209106445, "logps/chosen": -0.7782948613166809, "logps/rejected": -1.1029548645019531, "loss": -0.0721, "rewards/accuracies": 0.625, "rewards/chosen": 0.4835827648639679, "rewards/margins": 0.11756639182567596, "rewards/rejected": 0.36601632833480835, "step": 39 }, { "epoch": 0.08368200836820083, "grad_norm": 4.574297558165053, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.6807751655578613, "logits/rejected": -2.6066842079162598, "logps/chosen": -0.7192291617393494, "logps/rejected": -1.1977105140686035, "loss": -0.0487, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5016716718673706, "rewards/margins": 0.15893656015396118, "rewards/rejected": 0.3427351117134094, "step": 40 }, { "epoch": 0.08577405857740586, "grad_norm": 4.670542855578557, "learning_rate": 4.270833333333333e-07, "logits/chosen": -2.750822067260742, "logits/rejected": -2.598273754119873, "logps/chosen": -0.8036954402923584, "logps/rejected": -0.8826763033866882, "loss": -0.0595, "rewards/accuracies": 0.5, "rewards/chosen": 0.47965848445892334, "rewards/margins": 0.03851592540740967, "rewards/rejected": 0.4411425292491913, "step": 41 }, { "epoch": 0.08786610878661087, "grad_norm": 5.6444060526232205, "learning_rate": 4.375e-07, "logits/chosen": -2.7300307750701904, "logits/rejected": -2.8059144020080566, "logps/chosen": -0.871497631072998, "logps/rejected": -1.0136091709136963, "loss": -0.0639, "rewards/accuracies": 0.5625, "rewards/chosen": 0.44451814889907837, "rewards/margins": 0.012301255017518997, "rewards/rejected": 0.4322168827056885, "step": 42 }, { "epoch": 0.0899581589958159, "grad_norm": 4.7276809756186555, "learning_rate": 4.479166666666667e-07, "logits/chosen": -2.8357558250427246, "logits/rejected": -2.8672068119049072, "logps/chosen": -0.8881194591522217, "logps/rejected": -0.800782322883606, "loss": -0.0444, "rewards/accuracies": 0.4375, "rewards/chosen": 0.42225128412246704, "rewards/margins": -0.049045875668525696, "rewards/rejected": 0.47129717469215393, "step": 43 }, { "epoch": 0.09205020920502092, "grad_norm": 5.168209747900235, "learning_rate": 4.5833333333333327e-07, "logits/chosen": -2.6327948570251465, "logits/rejected": -2.5793819427490234, "logps/chosen": -0.8625282049179077, "logps/rejected": -0.9531125426292419, "loss": -0.0613, "rewards/accuracies": 0.625, "rewards/chosen": 0.4718647599220276, "rewards/margins": 0.022850003093481064, "rewards/rejected": 0.44901472330093384, "step": 44 }, { "epoch": 0.09414225941422594, "grad_norm": 5.748488517798206, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -2.7688827514648438, "logits/rejected": -2.7678184509277344, "logps/chosen": -0.9253497123718262, "logps/rejected": -1.012501835823059, "loss": -0.0772, "rewards/accuracies": 0.5, "rewards/chosen": 0.44503989815711975, "rewards/margins": 0.041792869567871094, "rewards/rejected": 0.40324705839157104, "step": 45 }, { "epoch": 0.09623430962343096, "grad_norm": 5.4053973810033265, "learning_rate": 4.791666666666667e-07, "logits/chosen": -2.6743528842926025, "logits/rejected": -2.6169381141662598, "logps/chosen": -0.9037365913391113, "logps/rejected": -1.1369562149047852, "loss": -0.039, "rewards/accuracies": 0.6875, "rewards/chosen": 0.42915260791778564, "rewards/margins": 0.09240510314702988, "rewards/rejected": 0.33674749732017517, "step": 46 }, { "epoch": 0.09832635983263599, "grad_norm": 5.260086542730235, "learning_rate": 4.895833333333333e-07, "logits/chosen": -2.643252372741699, "logits/rejected": -2.6564273834228516, "logps/chosen": -0.711982250213623, "logps/rejected": -0.9268280863761902, "loss": -0.0643, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5240494608879089, "rewards/margins": 0.1118624359369278, "rewards/rejected": 0.41218701004981995, "step": 47 }, { "epoch": 0.100418410041841, "grad_norm": 4.998054698179204, "learning_rate": 5e-07, "logits/chosen": -2.601804256439209, "logits/rejected": -2.592262029647827, "logps/chosen": -0.9648156762123108, "logps/rejected": -1.016688346862793, "loss": -0.0447, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4200252592563629, "rewards/margins": 0.016244657337665558, "rewards/rejected": 0.40378057956695557, "step": 48 }, { "epoch": 0.10251046025104603, "grad_norm": 5.789088857902208, "learning_rate": 4.999933277714308e-07, "logits/chosen": -2.596520185470581, "logits/rejected": -2.591111421585083, "logps/chosen": -0.9277907609939575, "logps/rejected": -0.9234358668327332, "loss": -0.0426, "rewards/accuracies": 0.625, "rewards/chosen": 0.4210990071296692, "rewards/margins": 0.0025722142308950424, "rewards/rejected": 0.4185267984867096, "step": 49 }, { "epoch": 0.10460251046025104, "grad_norm": 5.3386843580079235, "learning_rate": 4.999733114418725e-07, "logits/chosen": -2.7607293128967285, "logits/rejected": -2.6781206130981445, "logps/chosen": -0.699967622756958, "logps/rejected": -0.9246343374252319, "loss": -0.0689, "rewards/accuracies": 0.5625, "rewards/chosen": 0.532953679561615, "rewards/margins": 0.11697375774383545, "rewards/rejected": 0.41597992181777954, "step": 50 }, { "epoch": 0.10669456066945607, "grad_norm": 5.916200359756027, "learning_rate": 4.999399520797532e-07, "logits/chosen": -2.6939072608947754, "logits/rejected": -2.692051887512207, "logps/chosen": -0.8488726615905762, "logps/rejected": -1.0264009237289429, "loss": -0.0331, "rewards/accuracies": 0.625, "rewards/chosen": 0.44875848293304443, "rewards/margins": 0.03353102505207062, "rewards/rejected": 0.415227472782135, "step": 51 }, { "epoch": 0.1087866108786611, "grad_norm": 6.06972427169266, "learning_rate": 4.998932514657231e-07, "logits/chosen": -2.7914843559265137, "logits/rejected": -2.7075424194335938, "logps/chosen": -0.9562678337097168, "logps/rejected": -1.2075998783111572, "loss": -0.0542, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4133673906326294, "rewards/margins": 0.04625946655869484, "rewards/rejected": 0.36710792779922485, "step": 52 }, { "epoch": 0.1108786610878661, "grad_norm": 5.089942679391605, "learning_rate": 4.998332120925598e-07, "logits/chosen": -2.753302574157715, "logits/rejected": -2.7128305435180664, "logps/chosen": -0.8735638856887817, "logps/rejected": -0.884076714515686, "loss": -0.04, "rewards/accuracies": 0.4375, "rewards/chosen": 0.454987108707428, "rewards/margins": 0.017186364158988, "rewards/rejected": 0.4378007650375366, "step": 53 }, { "epoch": 0.11297071129707113, "grad_norm": 6.052397500406735, "learning_rate": 4.997598371650346e-07, "logits/chosen": -2.581367015838623, "logits/rejected": -2.5348823070526123, "logps/chosen": -1.1102932691574097, "logps/rejected": -1.1976356506347656, "loss": -0.0424, "rewards/accuracies": 0.5625, "rewards/chosen": 0.36265528202056885, "rewards/margins": 0.038111936300992966, "rewards/rejected": 0.3245433568954468, "step": 54 }, { "epoch": 0.11506276150627615, "grad_norm": 7.582726283106986, "learning_rate": 4.996731305997416e-07, "logits/chosen": -2.6058666706085205, "logits/rejected": -2.5393056869506836, "logps/chosen": -0.8814643621444702, "logps/rejected": -1.1655168533325195, "loss": -0.0653, "rewards/accuracies": 0.625, "rewards/chosen": 0.4387378692626953, "rewards/margins": 0.08904074132442474, "rewards/rejected": 0.34969714283943176, "step": 55 }, { "epoch": 0.11715481171548117, "grad_norm": 6.882707748741758, "learning_rate": 4.995730970248893e-07, "logits/chosen": -2.767383575439453, "logits/rejected": -2.616359233856201, "logps/chosen": -0.7647976279258728, "logps/rejected": -0.9351394772529602, "loss": -0.1056, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4906325042247772, "rewards/margins": 0.06625062227249146, "rewards/rejected": 0.42438188195228577, "step": 56 }, { "epoch": 0.1192468619246862, "grad_norm": 5.54458889124346, "learning_rate": 4.994597417800523e-07, "logits/chosen": -2.6890878677368164, "logits/rejected": -2.6082592010498047, "logps/chosen": -1.0842435359954834, "logps/rejected": -1.4014520645141602, "loss": -0.0148, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3719639182090759, "rewards/margins": 0.05614328384399414, "rewards/rejected": 0.3158206641674042, "step": 57 }, { "epoch": 0.12133891213389121, "grad_norm": 6.9373603325608695, "learning_rate": 4.993330709158879e-07, "logits/chosen": -2.7441017627716064, "logits/rejected": -2.5297117233276367, "logps/chosen": -1.0221364498138428, "logps/rejected": -1.1408171653747559, "loss": -0.0863, "rewards/accuracies": 0.4375, "rewards/chosen": 0.39980924129486084, "rewards/margins": 0.021861765533685684, "rewards/rejected": 0.37794747948646545, "step": 58 }, { "epoch": 0.12343096234309624, "grad_norm": 6.321561447114049, "learning_rate": 4.991930911938115e-07, "logits/chosen": -2.4681005477905273, "logits/rejected": -2.4990711212158203, "logps/chosen": -1.0633734464645386, "logps/rejected": -1.114372730255127, "loss": -0.0631, "rewards/accuracies": 0.5, "rewards/chosen": 0.41653022170066833, "rewards/margins": 0.05260680615901947, "rewards/rejected": 0.36392340064048767, "step": 59 }, { "epoch": 0.12552301255230125, "grad_norm": 7.821830122083976, "learning_rate": 4.990398100856366e-07, "logits/chosen": -2.7289726734161377, "logits/rejected": -2.6343679428100586, "logps/chosen": -0.9890714883804321, "logps/rejected": -1.0246083736419678, "loss": -0.0631, "rewards/accuracies": 0.5625, "rewards/chosen": 0.39659732580184937, "rewards/margins": -0.005929629318416119, "rewards/rejected": 0.40252697467803955, "step": 60 }, { "epoch": 0.12761506276150628, "grad_norm": 5.543365313785047, "learning_rate": 4.988732357731762e-07, "logits/chosen": -2.670830011367798, "logits/rejected": -2.719724655151367, "logps/chosen": -0.8051760792732239, "logps/rejected": -1.1917810440063477, "loss": -0.0634, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4671447277069092, "rewards/margins": 0.12195159494876862, "rewards/rejected": 0.34519311785697937, "step": 61 }, { "epoch": 0.1297071129707113, "grad_norm": 6.158656545510282, "learning_rate": 4.986933771478051e-07, "logits/chosen": -2.5070548057556152, "logits/rejected": -2.5650229454040527, "logps/chosen": -0.8839155435562134, "logps/rejected": -1.074038028717041, "loss": -0.0583, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4455738067626953, "rewards/margins": 0.05701454356312752, "rewards/rejected": 0.3885592222213745, "step": 62 }, { "epoch": 0.13179916317991633, "grad_norm": 6.601566833855826, "learning_rate": 4.985002438099865e-07, "logits/chosen": -2.7178516387939453, "logits/rejected": -2.614670753479004, "logps/chosen": -0.9592885971069336, "logps/rejected": -1.2345027923583984, "loss": -0.0602, "rewards/accuracies": 0.75, "rewards/chosen": 0.41584277153015137, "rewards/margins": 0.07030172646045685, "rewards/rejected": 0.3455410599708557, "step": 63 }, { "epoch": 0.13389121338912133, "grad_norm": 5.110638476831358, "learning_rate": 4.982938460687582e-07, "logits/chosen": -2.6705799102783203, "logits/rejected": -2.535891532897949, "logps/chosen": -1.0088186264038086, "logps/rejected": -1.3167262077331543, "loss": -0.0636, "rewards/accuracies": 0.6875, "rewards/chosen": 0.379120796918869, "rewards/margins": 0.07601502537727356, "rewards/rejected": 0.30310577154159546, "step": 64 }, { "epoch": 0.13598326359832635, "grad_norm": 6.39928910360316, "learning_rate": 4.980741949411839e-07, "logits/chosen": -2.5929765701293945, "logits/rejected": -2.5973377227783203, "logps/chosen": -1.2004098892211914, "logps/rejected": -1.2652997970581055, "loss": -0.0478, "rewards/accuracies": 0.375, "rewards/chosen": 0.3200799822807312, "rewards/margins": 0.005203430540859699, "rewards/rejected": 0.314876526594162, "step": 65 }, { "epoch": 0.13807531380753138, "grad_norm": 5.664762551027314, "learning_rate": 4.978413021517633e-07, "logits/chosen": -2.7091989517211914, "logits/rejected": -2.6626734733581543, "logps/chosen": -0.9367485046386719, "logps/rejected": -1.2151732444763184, "loss": -0.0621, "rewards/accuracies": 0.5625, "rewards/chosen": 0.43186360597610474, "rewards/margins": 0.11135440319776535, "rewards/rejected": 0.32050925493240356, "step": 66 }, { "epoch": 0.1401673640167364, "grad_norm": 6.952965346657238, "learning_rate": 4.975951801318083e-07, "logits/chosen": -2.5098729133605957, "logits/rejected": -2.414456844329834, "logps/chosen": -0.8484750986099243, "logps/rejected": -0.9892241358757019, "loss": -0.0918, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4698827266693115, "rewards/margins": 0.06307347118854523, "rewards/rejected": 0.4068092703819275, "step": 67 }, { "epoch": 0.14225941422594143, "grad_norm": 6.436841023614087, "learning_rate": 4.973358420187775e-07, "logits/chosen": -2.7483773231506348, "logits/rejected": -2.7305080890655518, "logps/chosen": -1.0968064069747925, "logps/rejected": -0.9938352108001709, "loss": -0.066, "rewards/accuracies": 0.5, "rewards/chosen": 0.3873424530029297, "rewards/margins": -0.041090674698352814, "rewards/rejected": 0.4284331202507019, "step": 68 }, { "epoch": 0.14435146443514643, "grad_norm": 21.319389236605137, "learning_rate": 4.970633016555764e-07, "logits/chosen": -2.6799473762512207, "logits/rejected": -2.551419734954834, "logps/chosen": -0.8439831733703613, "logps/rejected": -1.3273664712905884, "loss": -0.0879, "rewards/accuracies": 0.6875, "rewards/chosen": 0.46591490507125854, "rewards/margins": 0.14239582419395447, "rewards/rejected": 0.3235190808773041, "step": 69 }, { "epoch": 0.14644351464435146, "grad_norm": 6.7666133308501, "learning_rate": 4.967775735898179e-07, "logits/chosen": -2.6862854957580566, "logits/rejected": -2.689319133758545, "logps/chosen": -0.9402115345001221, "logps/rejected": -1.3846523761749268, "loss": -0.0862, "rewards/accuracies": 0.75, "rewards/chosen": 0.4224025309085846, "rewards/margins": 0.1333645135164261, "rewards/rejected": 0.2890380024909973, "step": 70 }, { "epoch": 0.14853556485355648, "grad_norm": 6.681866135481183, "learning_rate": 4.964786730730454e-07, "logits/chosen": -2.7253994941711426, "logits/rejected": -2.776564598083496, "logps/chosen": -1.0395264625549316, "logps/rejected": -1.0701773166656494, "loss": -0.0683, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3953511118888855, "rewards/margins": 0.028210703283548355, "rewards/rejected": 0.36714041233062744, "step": 71 }, { "epoch": 0.1506276150627615, "grad_norm": 7.723837222390713, "learning_rate": 4.961666160599197e-07, "logits/chosen": -2.6773457527160645, "logits/rejected": -2.59285306930542, "logps/chosen": -0.9283171892166138, "logps/rejected": -1.0602527856826782, "loss": -0.0795, "rewards/accuracies": 0.75, "rewards/chosen": 0.4250871241092682, "rewards/margins": 0.025532707571983337, "rewards/rejected": 0.39955443143844604, "step": 72 }, { "epoch": 0.15271966527196654, "grad_norm": 6.789946293344587, "learning_rate": 4.958414192073665e-07, "logits/chosen": -2.546414613723755, "logits/rejected": -2.4631576538085938, "logps/chosen": -0.9542316794395447, "logps/rejected": -1.1909294128417969, "loss": -0.0186, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4049053192138672, "rewards/margins": 0.05104460567235947, "rewards/rejected": 0.3538607060909271, "step": 73 }, { "epoch": 0.15481171548117154, "grad_norm": 7.548673490492515, "learning_rate": 4.955030998736876e-07, "logits/chosen": -2.74623966217041, "logits/rejected": -2.6763722896575928, "logps/chosen": -1.0954599380493164, "logps/rejected": -1.460294485092163, "loss": -0.068, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3597659468650818, "rewards/margins": 0.06558110564947128, "rewards/rejected": 0.2941848635673523, "step": 74 }, { "epoch": 0.15690376569037656, "grad_norm": 7.870243634302642, "learning_rate": 4.951516761176343e-07, "logits/chosen": -2.6385092735290527, "logits/rejected": -2.5714449882507324, "logps/chosen": -0.8212575912475586, "logps/rejected": -1.3720622062683105, "loss": -0.0754, "rewards/accuracies": 0.625, "rewards/chosen": 0.4903402030467987, "rewards/margins": 0.16850018501281738, "rewards/rejected": 0.3218400180339813, "step": 75 }, { "epoch": 0.1589958158995816, "grad_norm": 7.740159216532024, "learning_rate": 4.947871666974437e-07, "logits/chosen": -2.746283531188965, "logits/rejected": -2.6653685569763184, "logps/chosen": -1.0666463375091553, "logps/rejected": -1.5027016401290894, "loss": -0.0814, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3813169598579407, "rewards/margins": 0.11095122247934341, "rewards/rejected": 0.27036574482917786, "step": 76 }, { "epoch": 0.16108786610878661, "grad_norm": 7.976694731779206, "learning_rate": 4.944095910698372e-07, "logits/chosen": -2.6141693592071533, "logits/rejected": -2.6142563819885254, "logps/chosen": -1.0128289461135864, "logps/rejected": -1.2213771343231201, "loss": -0.0688, "rewards/accuracies": 0.625, "rewards/chosen": 0.3892231583595276, "rewards/margins": 0.02057480998337269, "rewards/rejected": 0.36864835023880005, "step": 77 }, { "epoch": 0.16317991631799164, "grad_norm": 12.788059053105092, "learning_rate": 4.940189693889818e-07, "logits/chosen": -2.5218615531921387, "logits/rejected": -2.475954055786133, "logps/chosen": -1.17843759059906, "logps/rejected": -1.431633472442627, "loss": -0.079, "rewards/accuracies": 0.75, "rewards/chosen": 0.3605992794036865, "rewards/margins": 0.08995417505502701, "rewards/rejected": 0.2706451416015625, "step": 78 }, { "epoch": 0.16527196652719664, "grad_norm": 7.9687380323133175, "learning_rate": 4.936153225054146e-07, "logits/chosen": -2.453888416290283, "logits/rejected": -2.4906387329101562, "logps/chosen": -1.0334270000457764, "logps/rejected": -1.2166098356246948, "loss": -0.0811, "rewards/accuracies": 0.625, "rewards/chosen": 0.40555617213249207, "rewards/margins": 0.07155266404151917, "rewards/rejected": 0.3340035080909729, "step": 79 }, { "epoch": 0.16736401673640167, "grad_norm": 7.103423794773496, "learning_rate": 4.931986719649298e-07, "logits/chosen": -2.58834171295166, "logits/rejected": -2.602987766265869, "logps/chosen": -0.9456465840339661, "logps/rejected": -1.0439833402633667, "loss": -0.062, "rewards/accuracies": 0.625, "rewards/chosen": 0.4474758505821228, "rewards/margins": 0.05571586266160011, "rewards/rejected": 0.391759991645813, "step": 80 }, { "epoch": 0.1694560669456067, "grad_norm": 8.009957315662204, "learning_rate": 4.927690400074286e-07, "logits/chosen": -2.6588869094848633, "logits/rejected": -2.6168625354766846, "logps/chosen": -0.9099507927894592, "logps/rejected": -1.2698771953582764, "loss": -0.0448, "rewards/accuracies": 0.625, "rewards/chosen": 0.45280569791793823, "rewards/margins": 0.09746626019477844, "rewards/rejected": 0.3553394079208374, "step": 81 }, { "epoch": 0.17154811715481172, "grad_norm": 7.621644329163717, "learning_rate": 4.923264495657319e-07, "logits/chosen": -2.6462247371673584, "logits/rejected": -2.540278434753418, "logps/chosen": -0.9409681558609009, "logps/rejected": -1.4178993701934814, "loss": -0.1086, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4165344536304474, "rewards/margins": 0.11931312829256058, "rewards/rejected": 0.2972213327884674, "step": 82 }, { "epoch": 0.17364016736401675, "grad_norm": 6.497792969659968, "learning_rate": 4.918709242643563e-07, "logits/chosen": -2.6598472595214844, "logits/rejected": -2.5916948318481445, "logps/chosen": -0.94767165184021, "logps/rejected": -1.136979341506958, "loss": -0.0992, "rewards/accuracies": 0.625, "rewards/chosen": 0.39976003766059875, "rewards/margins": 0.04650614410638809, "rewards/rejected": 0.35325387120246887, "step": 83 }, { "epoch": 0.17573221757322174, "grad_norm": 5.447470282604431, "learning_rate": 4.914024884182534e-07, "logits/chosen": -2.7488155364990234, "logits/rejected": -2.7466177940368652, "logps/chosen": -0.9067939519882202, "logps/rejected": -1.152085542678833, "loss": -0.0763, "rewards/accuracies": 0.625, "rewards/chosen": 0.4365091919898987, "rewards/margins": 0.1028919368982315, "rewards/rejected": 0.33361726999282837, "step": 84 }, { "epoch": 0.17782426778242677, "grad_norm": 9.656999379927793, "learning_rate": 4.909211670315114e-07, "logits/chosen": -2.494537115097046, "logits/rejected": -2.4501397609710693, "logps/chosen": -1.0578653812408447, "logps/rejected": -1.2545416355133057, "loss": -0.0752, "rewards/accuracies": 0.5625, "rewards/chosen": 0.40090253949165344, "rewards/margins": 0.0684221088886261, "rewards/rejected": 0.33248043060302734, "step": 85 }, { "epoch": 0.1799163179916318, "grad_norm": 6.686833143408991, "learning_rate": 4.904269857960208e-07, "logits/chosen": -2.5811610221862793, "logits/rejected": -2.5554800033569336, "logps/chosen": -0.9244771003723145, "logps/rejected": -0.9853077530860901, "loss": -0.0852, "rewards/accuracies": 0.5625, "rewards/chosen": 0.43252524733543396, "rewards/margins": 0.02168216183781624, "rewards/rejected": 0.4108430743217468, "step": 86 }, { "epoch": 0.18200836820083682, "grad_norm": 9.23900218513629, "learning_rate": 4.899199710901028e-07, "logits/chosen": -2.7979397773742676, "logits/rejected": -2.7201385498046875, "logps/chosen": -0.9246209859848022, "logps/rejected": -1.1718610525131226, "loss": -0.0907, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4397488832473755, "rewards/margins": 0.09631224721670151, "rewards/rejected": 0.34343665838241577, "step": 87 }, { "epoch": 0.18410041841004185, "grad_norm": 9.234428289424285, "learning_rate": 4.894001499771015e-07, "logits/chosen": -2.5294344425201416, "logits/rejected": -2.4418015480041504, "logps/chosen": -1.093963384628296, "logps/rejected": -2.0233564376831055, "loss": -0.0818, "rewards/accuracies": 0.8125, "rewards/chosen": 0.39119696617126465, "rewards/margins": 0.1488228142261505, "rewards/rejected": 0.24237412214279175, "step": 88 }, { "epoch": 0.18619246861924685, "grad_norm": 12.083165310097053, "learning_rate": 4.888675502039391e-07, "logits/chosen": -2.485434055328369, "logits/rejected": -2.4613311290740967, "logps/chosen": -1.15748929977417, "logps/rejected": -1.3351922035217285, "loss": -0.0771, "rewards/accuracies": 0.4375, "rewards/chosen": 0.33443617820739746, "rewards/margins": -0.015506994910538197, "rewards/rejected": 0.3499431908130646, "step": 89 }, { "epoch": 0.18828451882845187, "grad_norm": 5.745618394681697, "learning_rate": 4.883222001996351e-07, "logits/chosen": -2.551848888397217, "logits/rejected": -2.6118860244750977, "logps/chosen": -1.160627841949463, "logps/rejected": -1.5292195081710815, "loss": -0.0656, "rewards/accuracies": 0.5, "rewards/chosen": 0.36388182640075684, "rewards/margins": 0.056162893772125244, "rewards/rejected": 0.3077189028263092, "step": 90 }, { "epoch": 0.1903765690376569, "grad_norm": 6.048392960412249, "learning_rate": 4.877641290737883e-07, "logits/chosen": -2.544996738433838, "logits/rejected": -2.5666582584381104, "logps/chosen": -1.1059722900390625, "logps/rejected": -1.0704509019851685, "loss": -0.0755, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4008127450942993, "rewards/margins": -0.0058725010603666306, "rewards/rejected": 0.4066852629184723, "step": 91 }, { "epoch": 0.19246861924686193, "grad_norm": 7.750849031271762, "learning_rate": 4.871933666150239e-07, "logits/chosen": -2.5515635013580322, "logits/rejected": -2.622760534286499, "logps/chosen": -1.0890980958938599, "logps/rejected": -1.2788047790527344, "loss": -0.0606, "rewards/accuracies": 0.625, "rewards/chosen": 0.3743430972099304, "rewards/margins": 0.04916682466864586, "rewards/rejected": 0.32517626881599426, "step": 92 }, { "epoch": 0.19456066945606695, "grad_norm": 6.133733012127484, "learning_rate": 4.866099432894024e-07, "logits/chosen": -2.6669888496398926, "logits/rejected": -2.5700578689575195, "logps/chosen": -0.6280112266540527, "logps/rejected": -1.3432271480560303, "loss": -0.0976, "rewards/accuracies": 1.0, "rewards/chosen": 0.56821608543396, "rewards/margins": 0.256414532661438, "rewards/rejected": 0.311801552772522, "step": 93 }, { "epoch": 0.19665271966527198, "grad_norm": 7.408700272090894, "learning_rate": 4.860138902387939e-07, "logits/chosen": -2.6542887687683105, "logits/rejected": -2.572908401489258, "logps/chosen": -0.769626259803772, "logps/rejected": -0.9871947169303894, "loss": -0.0605, "rewards/accuracies": 0.625, "rewards/chosen": 0.5172251462936401, "rewards/margins": 0.06419370323419571, "rewards/rejected": 0.45303142070770264, "step": 94 }, { "epoch": 0.19874476987447698, "grad_norm": 5.427432507236315, "learning_rate": 4.854052392792161e-07, "logits/chosen": -2.437551975250244, "logits/rejected": -2.353766918182373, "logps/chosen": -1.1325689554214478, "logps/rejected": -1.2256555557250977, "loss": -0.0596, "rewards/accuracies": 0.5, "rewards/chosen": 0.36106187105178833, "rewards/margins": 0.02263234183192253, "rewards/rejected": 0.3384295105934143, "step": 95 }, { "epoch": 0.200836820083682, "grad_norm": 8.701179547204811, "learning_rate": 4.847840228991356e-07, "logits/chosen": -2.6898810863494873, "logits/rejected": -2.622732639312744, "logps/chosen": -0.8455623388290405, "logps/rejected": -1.298283338546753, "loss": -0.0478, "rewards/accuracies": 0.75, "rewards/chosen": 0.47173362970352173, "rewards/margins": 0.14020122587680817, "rewards/rejected": 0.33153235912323, "step": 96 }, { "epoch": 0.20292887029288703, "grad_norm": 9.967523481438473, "learning_rate": 4.841502742577338e-07, "logits/chosen": -2.6468942165374756, "logits/rejected": -2.573026180267334, "logps/chosen": -1.0249431133270264, "logps/rejected": -1.4148008823394775, "loss": -0.0399, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3901897668838501, "rewards/margins": 0.03252024948596954, "rewards/rejected": 0.35766950249671936, "step": 97 }, { "epoch": 0.20502092050209206, "grad_norm": 7.633760266532135, "learning_rate": 4.83504027183137e-07, "logits/chosen": -2.7580676078796387, "logits/rejected": -2.746480941772461, "logps/chosen": -0.9797180891036987, "logps/rejected": -1.188408374786377, "loss": -0.0867, "rewards/accuracies": 0.5625, "rewards/chosen": 0.42061182856559753, "rewards/margins": 0.09095914661884308, "rewards/rejected": 0.32965266704559326, "step": 98 }, { "epoch": 0.20711297071129708, "grad_norm": 9.235928116007752, "learning_rate": 4.828453161706108e-07, "logits/chosen": -2.788665771484375, "logits/rejected": -2.838707447052002, "logps/chosen": -0.8068978786468506, "logps/rejected": -1.2045230865478516, "loss": -0.1209, "rewards/accuracies": 0.6875, "rewards/chosen": 0.48726317286491394, "rewards/margins": 0.1484646499156952, "rewards/rejected": 0.33879852294921875, "step": 99 }, { "epoch": 0.20920502092050208, "grad_norm": 10.315885970527864, "learning_rate": 4.821741763807186e-07, "logits/chosen": -2.7912654876708984, "logits/rejected": -2.681771755218506, "logps/chosen": -1.0527245998382568, "logps/rejected": -1.1655709743499756, "loss": -0.0844, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3953736424446106, "rewards/margins": -0.006716027855873108, "rewards/rejected": 0.4020896852016449, "step": 100 }, { "epoch": 0.2112970711297071, "grad_norm": 10.580144647139496, "learning_rate": 4.81490643637445e-07, "logits/chosen": -2.5700273513793945, "logits/rejected": -2.616337299346924, "logps/chosen": -0.8749042749404907, "logps/rejected": -1.345421314239502, "loss": -0.107, "rewards/accuracies": 0.6875, "rewards/chosen": 0.46963128447532654, "rewards/margins": 0.13532906770706177, "rewards/rejected": 0.3343021869659424, "step": 101 }, { "epoch": 0.21338912133891214, "grad_norm": 7.233574775615707, "learning_rate": 4.807947544262838e-07, "logits/chosen": -2.5869598388671875, "logits/rejected": -2.5668182373046875, "logps/chosen": -1.0655555725097656, "logps/rejected": -1.5141360759735107, "loss": -0.1207, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3982890546321869, "rewards/margins": 0.09518561512231827, "rewards/rejected": 0.30310338735580444, "step": 102 }, { "epoch": 0.21548117154811716, "grad_norm": 8.314364804924969, "learning_rate": 4.800865458922898e-07, "logits/chosen": -2.965330123901367, "logits/rejected": -2.9011240005493164, "logps/chosen": -0.8784880638122559, "logps/rejected": -1.2078497409820557, "loss": -0.0707, "rewards/accuracies": 0.75, "rewards/chosen": 0.4488745331764221, "rewards/margins": 0.09526941180229187, "rewards/rejected": 0.35360509157180786, "step": 103 }, { "epoch": 0.2175732217573222, "grad_norm": 7.240646906240465, "learning_rate": 4.793660558380969e-07, "logits/chosen": -2.8919410705566406, "logits/rejected": -2.908057451248169, "logps/chosen": -1.297257661819458, "logps/rejected": -1.2795225381851196, "loss": -0.0768, "rewards/accuracies": 0.4375, "rewards/chosen": 0.31585755944252014, "rewards/margins": -0.03636983036994934, "rewards/rejected": 0.3522273600101471, "step": 104 }, { "epoch": 0.2196652719665272, "grad_norm": 13.885756163895579, "learning_rate": 4.786333227218995e-07, "logits/chosen": -2.655101776123047, "logits/rejected": -2.53055477142334, "logps/chosen": -1.0020012855529785, "logps/rejected": -1.7252869606018066, "loss": -0.131, "rewards/accuracies": 0.625, "rewards/chosen": 0.4196831285953522, "rewards/margins": 0.16872525215148926, "rewards/rejected": 0.2509578466415405, "step": 105 }, { "epoch": 0.2217573221757322, "grad_norm": 5.7928363890391195, "learning_rate": 4.778883856554003e-07, "logits/chosen": -2.6295785903930664, "logits/rejected": -2.560263156890869, "logps/chosen": -0.9247240424156189, "logps/rejected": -1.521117925643921, "loss": -0.1343, "rewards/accuracies": 0.75, "rewards/chosen": 0.42524152994155884, "rewards/margins": 0.1525372862815857, "rewards/rejected": 0.27270421385765076, "step": 106 }, { "epoch": 0.22384937238493724, "grad_norm": 10.72110537794039, "learning_rate": 4.771312844017224e-07, "logits/chosen": -2.7769789695739746, "logits/rejected": -2.6794891357421875, "logps/chosen": -1.0048589706420898, "logps/rejected": -1.5365948677062988, "loss": -0.1032, "rewards/accuracies": 0.5625, "rewards/chosen": 0.40189632773399353, "rewards/margins": 0.09801431000232697, "rewards/rejected": 0.30388203263282776, "step": 107 }, { "epoch": 0.22594142259414227, "grad_norm": 7.750878315038949, "learning_rate": 4.7636205937328664e-07, "logits/chosen": -2.4595589637756348, "logits/rejected": -2.406452178955078, "logps/chosen": -0.9035621285438538, "logps/rejected": -1.3715593814849854, "loss": -0.0912, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4404214024543762, "rewards/margins": 0.10055311024188995, "rewards/rejected": 0.33986830711364746, "step": 108 }, { "epoch": 0.2280334728033473, "grad_norm": 8.013088010734279, "learning_rate": 4.755807516296547e-07, "logits/chosen": -2.734206199645996, "logits/rejected": -2.6361465454101562, "logps/chosen": -1.279822826385498, "logps/rejected": -1.9017040729522705, "loss": -0.0997, "rewards/accuracies": 0.5, "rewards/chosen": 0.30485787987709045, "rewards/margins": 0.04763868451118469, "rewards/rejected": 0.25721919536590576, "step": 109 }, { "epoch": 0.2301255230125523, "grad_norm": 10.361495087309358, "learning_rate": 4.747874028753375e-07, "logits/chosen": -2.663100242614746, "logits/rejected": -2.608872890472412, "logps/chosen": -1.1176433563232422, "logps/rejected": -1.4879114627838135, "loss": -0.0849, "rewards/accuracies": 0.625, "rewards/chosen": 0.39366304874420166, "rewards/margins": 0.08537843823432922, "rewards/rejected": 0.30828458070755005, "step": 110 }, { "epoch": 0.23221757322175732, "grad_norm": 11.489203651128713, "learning_rate": 4.739820554575686e-07, "logits/chosen": -2.7516872882843018, "logits/rejected": -2.7274041175842285, "logps/chosen": -1.2375142574310303, "logps/rejected": -1.3109242916107178, "loss": -0.0329, "rewards/accuracies": 0.5625, "rewards/chosen": 0.35172292590141296, "rewards/margins": 0.028698619455099106, "rewards/rejected": 0.32302427291870117, "step": 111 }, { "epoch": 0.23430962343096234, "grad_norm": 7.973354310908626, "learning_rate": 4.731647523640445e-07, "logits/chosen": -2.761908531188965, "logits/rejected": -2.641033172607422, "logps/chosen": -0.9457468390464783, "logps/rejected": -1.5787360668182373, "loss": -0.1048, "rewards/accuracies": 0.75, "rewards/chosen": 0.416814386844635, "rewards/margins": 0.12927794456481934, "rewards/rejected": 0.2875364422798157, "step": 112 }, { "epoch": 0.23640167364016737, "grad_norm": 8.416401572506357, "learning_rate": 4.723355372206297e-07, "logits/chosen": -2.749239921569824, "logits/rejected": -2.667978525161743, "logps/chosen": -0.738419771194458, "logps/rejected": -1.6119505167007446, "loss": -0.1138, "rewards/accuracies": 0.75, "rewards/chosen": 0.515089213848114, "rewards/margins": 0.2025749832391739, "rewards/rejected": 0.31251418590545654, "step": 113 }, { "epoch": 0.2384937238493724, "grad_norm": 8.941134480770472, "learning_rate": 4.714944542890278e-07, "logits/chosen": -2.7977280616760254, "logits/rejected": -2.7237446308135986, "logps/chosen": -1.195307970046997, "logps/rejected": -1.7132177352905273, "loss": -0.1042, "rewards/accuracies": 0.5, "rewards/chosen": 0.3529362082481384, "rewards/margins": 0.005412375554442406, "rewards/rejected": 0.3475238084793091, "step": 114 }, { "epoch": 0.2405857740585774, "grad_norm": 8.619242640102716, "learning_rate": 4.706415484644195e-07, "logits/chosen": -2.7845406532287598, "logits/rejected": -2.7088088989257812, "logps/chosen": -1.0112650394439697, "logps/rejected": -1.4609363079071045, "loss": -0.0881, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4237975776195526, "rewards/margins": 0.11371254920959473, "rewards/rejected": 0.3100849986076355, "step": 115 }, { "epoch": 0.24267782426778242, "grad_norm": 15.569708812933248, "learning_rate": 4.6977686527306555e-07, "logits/chosen": -2.7467026710510254, "logits/rejected": -2.6555519104003906, "logps/chosen": -1.3848240375518799, "logps/rejected": -1.6398433446884155, "loss": -0.0615, "rewards/accuracies": 0.6875, "rewards/chosen": 0.32322824001312256, "rewards/margins": 0.04797099530696869, "rewards/rejected": 0.2752572298049927, "step": 116 }, { "epoch": 0.24476987447698745, "grad_norm": 7.401683574617976, "learning_rate": 4.6890045086987707e-07, "logits/chosen": -2.8689966201782227, "logits/rejected": -2.7104268074035645, "logps/chosen": -0.9026917219161987, "logps/rejected": -1.6604235172271729, "loss": -0.0947, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4347726106643677, "rewards/margins": 0.16332250833511353, "rewards/rejected": 0.27145010232925415, "step": 117 }, { "epoch": 0.24686192468619247, "grad_norm": 7.629104260103111, "learning_rate": 4.680123520359519e-07, "logits/chosen": -2.6418347358703613, "logits/rejected": -2.5293517112731934, "logps/chosen": -1.1596897840499878, "logps/rejected": -1.622258186340332, "loss": -0.1004, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3867058753967285, "rewards/margins": 0.13067524135112762, "rewards/rejected": 0.2560306191444397, "step": 118 }, { "epoch": 0.2489539748953975, "grad_norm": 8.780545352344028, "learning_rate": 4.671126161760772e-07, "logits/chosen": -2.7948989868164062, "logits/rejected": -2.7727484703063965, "logps/chosen": -1.3009717464447021, "logps/rejected": -2.0594563484191895, "loss": -0.0531, "rewards/accuracies": 0.625, "rewards/chosen": 0.3208620548248291, "rewards/margins": 0.0681702196598053, "rewards/rejected": 0.2526918053627014, "step": 119 }, { "epoch": 0.2510460251046025, "grad_norm": 8.141226660227753, "learning_rate": 4.662012913161997e-07, "logits/chosen": -2.8210859298706055, "logits/rejected": -2.7449235916137695, "logps/chosen": -1.0188672542572021, "logps/rejected": -1.4339046478271484, "loss": -0.0647, "rewards/accuracies": 0.625, "rewards/chosen": 0.41782328486442566, "rewards/margins": 0.07959190011024475, "rewards/rejected": 0.3382313847541809, "step": 120 }, { "epoch": 0.25313807531380755, "grad_norm": 8.356022071440359, "learning_rate": 4.6527842610086124e-07, "logits/chosen": -2.499788761138916, "logits/rejected": -2.588809013366699, "logps/chosen": -1.0250036716461182, "logps/rejected": -1.3243354558944702, "loss": -0.0935, "rewards/accuracies": 0.5625, "rewards/chosen": 0.43090829253196716, "rewards/margins": 0.053986430168151855, "rewards/rejected": 0.3769218921661377, "step": 121 }, { "epoch": 0.25523012552301255, "grad_norm": 8.069688534688002, "learning_rate": 4.6434406979060327e-07, "logits/chosen": -2.7623844146728516, "logits/rejected": -2.692293167114258, "logps/chosen": -1.4179835319519043, "logps/rejected": -1.441861867904663, "loss": -0.0593, "rewards/accuracies": 0.3125, "rewards/chosen": 0.2650303244590759, "rewards/margins": -0.03164853900671005, "rewards/rejected": 0.2966788411140442, "step": 122 }, { "epoch": 0.25732217573221755, "grad_norm": 13.346564467347195, "learning_rate": 4.6339827225933657e-07, "logits/chosen": -2.778604030609131, "logits/rejected": -2.7227749824523926, "logps/chosen": -0.8697854280471802, "logps/rejected": -1.2359058856964111, "loss": -0.0895, "rewards/accuracies": 0.8125, "rewards/chosen": 0.45396870374679565, "rewards/margins": 0.10900671780109406, "rewards/rejected": 0.3449620008468628, "step": 123 }, { "epoch": 0.2594142259414226, "grad_norm": 7.9417348062875055, "learning_rate": 4.6244108399167977e-07, "logits/chosen": -2.67881441116333, "logits/rejected": -2.4599831104278564, "logps/chosen": -1.3219633102416992, "logps/rejected": -1.7368388175964355, "loss": -0.0727, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3538292646408081, "rewards/margins": 0.05511477217078209, "rewards/rejected": 0.2987144887447357, "step": 124 }, { "epoch": 0.2615062761506276, "grad_norm": 9.180985197503443, "learning_rate": 4.614725560802639e-07, "logits/chosen": -2.7183783054351807, "logits/rejected": -2.6761581897735596, "logps/chosen": -0.976803183555603, "logps/rejected": -1.0104469060897827, "loss": -0.0768, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4247945249080658, "rewards/margins": 0.011689772829413414, "rewards/rejected": 0.41310471296310425, "step": 125 }, { "epoch": 0.26359832635983266, "grad_norm": 8.86553923626234, "learning_rate": 4.60492740223006e-07, "logits/chosen": -2.7192089557647705, "logits/rejected": -2.6593987941741943, "logps/chosen": -1.2293155193328857, "logps/rejected": -1.3931902647018433, "loss": -0.0804, "rewards/accuracies": 0.5625, "rewards/chosen": 0.37537840008735657, "rewards/margins": 0.05658911168575287, "rewards/rejected": 0.3187893033027649, "step": 126 }, { "epoch": 0.26569037656903766, "grad_norm": 9.100936304907126, "learning_rate": 4.595016887203488e-07, "logits/chosen": -2.7341713905334473, "logits/rejected": -2.625302314758301, "logps/chosen": -1.227184772491455, "logps/rejected": -1.8654588460922241, "loss": -0.0752, "rewards/accuracies": 0.625, "rewards/chosen": 0.34461599588394165, "rewards/margins": 0.10101988911628723, "rewards/rejected": 0.24359610676765442, "step": 127 }, { "epoch": 0.26778242677824265, "grad_norm": 10.631080592900888, "learning_rate": 4.584994544724695e-07, "logits/chosen": -2.548196315765381, "logits/rejected": -2.4987902641296387, "logps/chosen": -1.1368122100830078, "logps/rejected": -2.0986969470977783, "loss": -0.1204, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3744341731071472, "rewards/margins": 0.12809962034225464, "rewards/rejected": 0.24633455276489258, "step": 128 }, { "epoch": 0.2698744769874477, "grad_norm": 13.719744878682432, "learning_rate": 4.574860909764559e-07, "logits/chosen": -2.5039968490600586, "logits/rejected": -2.5294370651245117, "logps/chosen": -1.5889067649841309, "logps/rejected": -1.7118114233016968, "loss": -0.1109, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3196980655193329, "rewards/margins": 0.10617660731077194, "rewards/rejected": 0.21352145075798035, "step": 129 }, { "epoch": 0.2719665271966527, "grad_norm": 15.366716836335764, "learning_rate": 4.5646165232345103e-07, "logits/chosen": -2.7375288009643555, "logits/rejected": -2.6242637634277344, "logps/chosen": -1.133746862411499, "logps/rejected": -1.0736860036849976, "loss": -0.105, "rewards/accuracies": 0.375, "rewards/chosen": 0.41827988624572754, "rewards/margins": -0.03402038663625717, "rewards/rejected": 0.4523002803325653, "step": 130 }, { "epoch": 0.27405857740585776, "grad_norm": 15.603864611701567, "learning_rate": 4.554261931957657e-07, "logits/chosen": -2.7278056144714355, "logits/rejected": -2.5975325107574463, "logps/chosen": -0.9498021602630615, "logps/rejected": -1.1297776699066162, "loss": -0.0668, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4634658396244049, "rewards/margins": 0.07323861122131348, "rewards/rejected": 0.39022722840309143, "step": 131 }, { "epoch": 0.27615062761506276, "grad_norm": 10.531086272373418, "learning_rate": 4.5437976886395955e-07, "logits/chosen": -2.705713987350464, "logits/rejected": -2.652479410171509, "logps/chosen": -1.1467740535736084, "logps/rejected": -1.4576821327209473, "loss": -0.0988, "rewards/accuracies": 0.75, "rewards/chosen": 0.41082507371902466, "rewards/margins": 0.09115354716777802, "rewards/rejected": 0.31967151165008545, "step": 132 }, { "epoch": 0.27824267782426776, "grad_norm": 7.399259334304283, "learning_rate": 4.5332243518389136e-07, "logits/chosen": -2.725745677947998, "logits/rejected": -2.7743215560913086, "logps/chosen": -1.1210857629776, "logps/rejected": -1.1685874462127686, "loss": -0.093, "rewards/accuracies": 0.375, "rewards/chosen": 0.4526345133781433, "rewards/margins": 0.03685787320137024, "rewards/rejected": 0.41577666997909546, "step": 133 }, { "epoch": 0.2803347280334728, "grad_norm": 7.297983481533, "learning_rate": 4.5225424859373684e-07, "logits/chosen": -2.640211582183838, "logits/rejected": -2.570061206817627, "logps/chosen": -0.9512884020805359, "logps/rejected": -1.7315542697906494, "loss": -0.1001, "rewards/accuracies": 0.625, "rewards/chosen": 0.406036376953125, "rewards/margins": 0.10577087104320526, "rewards/rejected": 0.30026549100875854, "step": 134 }, { "epoch": 0.2824267782426778, "grad_norm": 8.614770102330974, "learning_rate": 4.511752661109768e-07, "logits/chosen": -2.5034797191619873, "logits/rejected": -2.482116460800171, "logps/chosen": -0.8747888207435608, "logps/rejected": -1.3776814937591553, "loss": -0.0835, "rewards/accuracies": 0.75, "rewards/chosen": 0.5053682327270508, "rewards/margins": 0.13714160025119781, "rewards/rejected": 0.36822664737701416, "step": 135 }, { "epoch": 0.28451882845188287, "grad_norm": 11.511732667746932, "learning_rate": 4.5008554532935316e-07, "logits/chosen": -2.2463009357452393, "logits/rejected": -2.418936014175415, "logps/chosen": -1.3389081954956055, "logps/rejected": -1.4335492849349976, "loss": -0.1004, "rewards/accuracies": 0.6875, "rewards/chosen": 0.33481383323669434, "rewards/margins": 0.04980039596557617, "rewards/rejected": 0.28501343727111816, "step": 136 }, { "epoch": 0.28661087866108786, "grad_norm": 7.62656299018194, "learning_rate": 4.4898514441579493e-07, "logits/chosen": -2.730118751525879, "logits/rejected": -2.7033190727233887, "logps/chosen": -1.2007243633270264, "logps/rejected": -1.3021783828735352, "loss": -0.0571, "rewards/accuracies": 0.5625, "rewards/chosen": 0.34427547454833984, "rewards/margins": 0.009376163594424725, "rewards/rejected": 0.33489930629730225, "step": 137 }, { "epoch": 0.28870292887029286, "grad_norm": 10.818515405254571, "learning_rate": 4.478741221073135e-07, "logits/chosen": -2.742260694503784, "logits/rejected": -2.5423145294189453, "logps/chosen": -0.9980774521827698, "logps/rejected": -1.3085711002349854, "loss": -0.0653, "rewards/accuracies": 0.625, "rewards/chosen": 0.4080244302749634, "rewards/margins": 0.05671762675046921, "rewards/rejected": 0.3513067960739136, "step": 138 }, { "epoch": 0.2907949790794979, "grad_norm": 11.277914045373983, "learning_rate": 4.467525377078671e-07, "logits/chosen": -2.218235492706299, "logits/rejected": -2.426461696624756, "logps/chosen": -1.1532562971115112, "logps/rejected": -1.4017515182495117, "loss": -0.0578, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4083666205406189, "rewards/margins": 0.056754596531391144, "rewards/rejected": 0.35161203145980835, "step": 139 }, { "epoch": 0.2928870292887029, "grad_norm": 14.584077361972543, "learning_rate": 4.456204510851956e-07, "logits/chosen": -2.7155497074127197, "logits/rejected": -2.58384370803833, "logps/chosen": -1.0665032863616943, "logps/rejected": -1.7976462841033936, "loss": -0.1131, "rewards/accuracies": 0.75, "rewards/chosen": 0.38510066270828247, "rewards/margins": 0.15970441699028015, "rewards/rejected": 0.22539621591567993, "step": 140 }, { "epoch": 0.29497907949790797, "grad_norm": 13.90438178725477, "learning_rate": 4.444779226676246e-07, "logits/chosen": -2.7173407077789307, "logits/rejected": -2.7211666107177734, "logps/chosen": -1.0276908874511719, "logps/rejected": -1.4164080619812012, "loss": -0.1078, "rewards/accuracies": 0.6875, "rewards/chosen": 0.40510475635528564, "rewards/margins": 0.08837657421827316, "rewards/rejected": 0.3167282044887543, "step": 141 }, { "epoch": 0.29707112970711297, "grad_norm": 12.235049719980324, "learning_rate": 4.4332501344084005e-07, "logits/chosen": -2.552258014678955, "logits/rejected": -2.592665195465088, "logps/chosen": -0.9130370616912842, "logps/rejected": -1.2232286930084229, "loss": -0.0768, "rewards/accuracies": 0.625, "rewards/chosen": 0.4437675476074219, "rewards/margins": 0.04495351016521454, "rewards/rejected": 0.39881402254104614, "step": 142 }, { "epoch": 0.29916317991631797, "grad_norm": 21.089753672362736, "learning_rate": 4.4216178494463295e-07, "logits/chosen": -2.7459824085235596, "logits/rejected": -2.619235038757324, "logps/chosen": -1.0598134994506836, "logps/rejected": -1.9267642498016357, "loss": -0.1071, "rewards/accuracies": 0.8125, "rewards/chosen": 0.36971956491470337, "rewards/margins": 0.14974790811538696, "rewards/rejected": 0.2199716567993164, "step": 143 }, { "epoch": 0.301255230125523, "grad_norm": 14.159186928029824, "learning_rate": 4.4098829926961477e-07, "logits/chosen": -2.5897159576416016, "logits/rejected": -2.516749858856201, "logps/chosen": -0.7705448269844055, "logps/rejected": -1.9500865936279297, "loss": -0.0738, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5220432281494141, "rewards/margins": 0.24915724992752075, "rewards/rejected": 0.2728859782218933, "step": 144 }, { "epoch": 0.303347280334728, "grad_norm": 25.852536388047355, "learning_rate": 4.398046190539024e-07, "logits/chosen": -2.5979108810424805, "logits/rejected": -2.5280308723449707, "logps/chosen": -1.0106014013290405, "logps/rejected": -2.4643092155456543, "loss": -0.1157, "rewards/accuracies": 0.875, "rewards/chosen": 0.3909301459789276, "rewards/margins": 0.12234397977590561, "rewards/rejected": 0.2685861587524414, "step": 145 }, { "epoch": 0.3054393305439331, "grad_norm": 37.83743366982954, "learning_rate": 4.3861080747977566e-07, "logits/chosen": -2.741292953491211, "logits/rejected": -2.7124412059783936, "logps/chosen": -1.1303961277008057, "logps/rejected": -2.0690033435821533, "loss": -0.1239, "rewards/accuracies": 0.875, "rewards/chosen": 0.38765209913253784, "rewards/margins": 0.16262786090373993, "rewards/rejected": 0.22502422332763672, "step": 146 }, { "epoch": 0.3075313807531381, "grad_norm": 15.24433968980372, "learning_rate": 4.37406928270304e-07, "logits/chosen": -2.6818463802337646, "logits/rejected": -2.603290557861328, "logps/chosen": -1.3489582538604736, "logps/rejected": -1.6901756525039673, "loss": -0.0567, "rewards/accuracies": 0.6875, "rewards/chosen": 0.31866884231567383, "rewards/margins": 0.022746426984667778, "rewards/rejected": 0.2959223985671997, "step": 147 }, { "epoch": 0.30962343096234307, "grad_norm": 22.776756565596372, "learning_rate": 4.3619304568594546e-07, "logits/chosen": -2.6372740268707275, "logits/rejected": -2.6938648223876953, "logps/chosen": -1.3485358953475952, "logps/rejected": -1.5879672765731812, "loss": -0.0709, "rewards/accuracies": 0.75, "rewards/chosen": 0.33045515418052673, "rewards/margins": 0.09428148716688156, "rewards/rejected": 0.23617368936538696, "step": 148 }, { "epoch": 0.3117154811715481, "grad_norm": 14.832493676126875, "learning_rate": 4.349692245211165e-07, "logits/chosen": -2.8095154762268066, "logits/rejected": -2.6838107109069824, "logps/chosen": -1.02547025680542, "logps/rejected": -1.4738640785217285, "loss": -0.0792, "rewards/accuracies": 0.625, "rewards/chosen": 0.414419949054718, "rewards/margins": 0.08410301804542542, "rewards/rejected": 0.3303169012069702, "step": 149 }, { "epoch": 0.3138075313807531, "grad_norm": 18.310776769390138, "learning_rate": 4.337355301007335e-07, "logits/chosen": -2.5838117599487305, "logits/rejected": -2.431469440460205, "logps/chosen": -1.4929683208465576, "logps/rejected": -1.7874231338500977, "loss": -0.0836, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3493340015411377, "rewards/margins": 0.10493455827236176, "rewards/rejected": 0.24439939856529236, "step": 150 }, { "epoch": 0.3158995815899582, "grad_norm": 9.964326552753194, "learning_rate": 4.324920282767256e-07, "logits/chosen": -2.6689319610595703, "logits/rejected": -2.5450775623321533, "logps/chosen": -1.054506778717041, "logps/rejected": -1.81797194480896, "loss": -0.1199, "rewards/accuracies": 0.625, "rewards/chosen": 0.3724176287651062, "rewards/margins": 0.1260986328125, "rewards/rejected": 0.2463189959526062, "step": 151 }, { "epoch": 0.3179916317991632, "grad_norm": 14.781469879773143, "learning_rate": 4.312387854245201e-07, "logits/chosen": -2.4934237003326416, "logits/rejected": -2.5184531211853027, "logps/chosen": -1.202325701713562, "logps/rejected": -1.8260574340820312, "loss": -0.1168, "rewards/accuracies": 0.6875, "rewards/chosen": 0.34124889969825745, "rewards/margins": 0.11295446753501892, "rewards/rejected": 0.22829441726207733, "step": 152 }, { "epoch": 0.3200836820083682, "grad_norm": 13.249819772322319, "learning_rate": 4.2997586843949896e-07, "logits/chosen": -2.612116813659668, "logits/rejected": -2.5220794677734375, "logps/chosen": -1.3319296836853027, "logps/rejected": -1.688657283782959, "loss": -0.1192, "rewards/accuracies": 0.5625, "rewards/chosen": 0.31724148988723755, "rewards/margins": 0.07862979173660278, "rewards/rejected": 0.23861169815063477, "step": 153 }, { "epoch": 0.32217573221757323, "grad_norm": 9.882053994484261, "learning_rate": 4.287033447334286e-07, "logits/chosen": -2.6856942176818848, "logits/rejected": -2.6089882850646973, "logps/chosen": -1.3274683952331543, "logps/rejected": -2.0857741832733154, "loss": -0.0792, "rewards/accuracies": 0.75, "rewards/chosen": 0.3238699436187744, "rewards/margins": 0.08010490238666534, "rewards/rejected": 0.24376502633094788, "step": 154 }, { "epoch": 0.32426778242677823, "grad_norm": 20.74435826093936, "learning_rate": 4.2742128223086115e-07, "logits/chosen": -2.469430446624756, "logits/rejected": -2.3157687187194824, "logps/chosen": -1.229305624961853, "logps/rejected": -1.7900751829147339, "loss": -0.0827, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3742397427558899, "rewards/margins": 0.09423865377902985, "rewards/rejected": 0.28000104427337646, "step": 155 }, { "epoch": 0.3263598326359833, "grad_norm": 17.4329604936243, "learning_rate": 4.261297493655092e-07, "logits/chosen": -2.436098575592041, "logits/rejected": -2.3914339542388916, "logps/chosen": -1.217616081237793, "logps/rejected": -1.8459382057189941, "loss": -0.1423, "rewards/accuracies": 0.75, "rewards/chosen": 0.4322535991668701, "rewards/margins": 0.1469898223876953, "rewards/rejected": 0.2852637767791748, "step": 156 }, { "epoch": 0.3284518828451883, "grad_norm": 17.563732823445005, "learning_rate": 4.2482881507659244e-07, "logits/chosen": -2.4747214317321777, "logits/rejected": -2.4113662242889404, "logps/chosen": -1.379757046699524, "logps/rejected": -1.7662575244903564, "loss": -0.1266, "rewards/accuracies": 0.6875, "rewards/chosen": 0.37260758876800537, "rewards/margins": 0.06532790511846542, "rewards/rejected": 0.30727970600128174, "step": 157 }, { "epoch": 0.3305439330543933, "grad_norm": 8.540815754507376, "learning_rate": 4.235185488051585e-07, "logits/chosen": -2.3964672088623047, "logits/rejected": -2.3802266120910645, "logps/chosen": -1.4923787117004395, "logps/rejected": -1.987583875656128, "loss": -0.1227, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3038066029548645, "rewards/margins": 0.06362194567918777, "rewards/rejected": 0.24018466472625732, "step": 158 }, { "epoch": 0.33263598326359833, "grad_norm": 9.25947209544519, "learning_rate": 4.2219902049037554e-07, "logits/chosen": -2.479762077331543, "logits/rejected": -2.4210665225982666, "logps/chosen": -1.38946533203125, "logps/rejected": -1.7284841537475586, "loss": -0.0544, "rewards/accuracies": 0.4375, "rewards/chosen": 0.29429730772972107, "rewards/margins": 0.0212214644998312, "rewards/rejected": 0.27307581901550293, "step": 159 }, { "epoch": 0.33472803347280333, "grad_norm": 12.44176743490638, "learning_rate": 4.2087030056579986e-07, "logits/chosen": -2.319620132446289, "logits/rejected": -2.326211452484131, "logps/chosen": -0.8733711242675781, "logps/rejected": -1.2630772590637207, "loss": -0.0928, "rewards/accuracies": 0.75, "rewards/chosen": 0.4575880169868469, "rewards/margins": 0.08910935372114182, "rewards/rejected": 0.3684786856174469, "step": 160 }, { "epoch": 0.3368200836820084, "grad_norm": 23.431018635077386, "learning_rate": 4.1953245995561577e-07, "logits/chosen": -2.76431941986084, "logits/rejected": -2.6128687858581543, "logps/chosen": -0.8526709079742432, "logps/rejected": -1.4783575534820557, "loss": -0.1187, "rewards/accuracies": 0.625, "rewards/chosen": 0.4951631426811218, "rewards/margins": 0.10067776590585709, "rewards/rejected": 0.39448535442352295, "step": 161 }, { "epoch": 0.3389121338912134, "grad_norm": 15.623223043488867, "learning_rate": 4.1818557007085e-07, "logits/chosen": -2.620527982711792, "logits/rejected": -2.532945156097412, "logps/chosen": -1.390239953994751, "logps/rejected": -1.7109453678131104, "loss": -0.091, "rewards/accuracies": 0.625, "rewards/chosen": 0.33600008487701416, "rewards/margins": 0.07325442135334015, "rewards/rejected": 0.2627456486225128, "step": 162 }, { "epoch": 0.3410041841004184, "grad_norm": 12.224980553609598, "learning_rate": 4.1682970280555987e-07, "logits/chosen": -2.6704134941101074, "logits/rejected": -2.5877957344055176, "logps/chosen": -0.9367567896842957, "logps/rejected": -1.9059950113296509, "loss": -0.1361, "rewards/accuracies": 0.875, "rewards/chosen": 0.46736544370651245, "rewards/margins": 0.20542237162590027, "rewards/rejected": 0.26194310188293457, "step": 163 }, { "epoch": 0.34309623430962344, "grad_norm": 12.801839720123297, "learning_rate": 4.154649305329958e-07, "logits/chosen": -2.562652111053467, "logits/rejected": -2.575808048248291, "logps/chosen": -1.4792587757110596, "logps/rejected": -1.5863935947418213, "loss": -0.1133, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3257550001144409, "rewards/margins": 0.05095459520816803, "rewards/rejected": 0.2748003900051117, "step": 164 }, { "epoch": 0.34518828451882844, "grad_norm": 12.577792600421537, "learning_rate": 4.140913261017382e-07, "logits/chosen": -2.5608842372894287, "logits/rejected": -2.502084732055664, "logps/chosen": -0.9237871170043945, "logps/rejected": -1.2350677251815796, "loss": -0.0699, "rewards/accuracies": 0.5625, "rewards/chosen": 0.44716089963912964, "rewards/margins": 0.09571586549282074, "rewards/rejected": 0.3514450192451477, "step": 165 }, { "epoch": 0.3472803347280335, "grad_norm": 25.043359615671452, "learning_rate": 4.127089628318089e-07, "logits/chosen": -2.6735382080078125, "logits/rejected": -2.5704262256622314, "logps/chosen": -1.2471284866333008, "logps/rejected": -1.6530365943908691, "loss": -0.0821, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3380359411239624, "rewards/margins": 0.031453195959329605, "rewards/rejected": 0.3065827488899231, "step": 166 }, { "epoch": 0.3493723849372385, "grad_norm": 14.126453426156447, "learning_rate": 4.113179145107575e-07, "logits/chosen": -2.7621216773986816, "logits/rejected": -2.6524105072021484, "logps/chosen": -0.7899508476257324, "logps/rejected": -1.3169074058532715, "loss": -0.084, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4821957051753998, "rewards/margins": 0.10919322073459625, "rewards/rejected": 0.3730024993419647, "step": 167 }, { "epoch": 0.3514644351464435, "grad_norm": 8.684294843492529, "learning_rate": 4.099182553897228e-07, "logits/chosen": -2.5085854530334473, "logits/rejected": -2.3533246517181396, "logps/chosen": -1.205338716506958, "logps/rejected": -1.7073439359664917, "loss": -0.09, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3430725932121277, "rewards/margins": 0.0864902213215828, "rewards/rejected": 0.2565823793411255, "step": 168 }, { "epoch": 0.35355648535564854, "grad_norm": 10.933055039872562, "learning_rate": 4.0851006017946945e-07, "logits/chosen": -2.7597827911376953, "logits/rejected": -2.5856375694274902, "logps/chosen": -1.1774203777313232, "logps/rejected": -2.170247793197632, "loss": -0.1448, "rewards/accuracies": 0.75, "rewards/chosen": 0.39301756024360657, "rewards/margins": 0.15624675154685974, "rewards/rejected": 0.23677079379558563, "step": 169 }, { "epoch": 0.35564853556485354, "grad_norm": 11.726931388893034, "learning_rate": 4.070934040463998e-07, "logits/chosen": -2.6450562477111816, "logits/rejected": -2.5120420455932617, "logps/chosen": -1.0560662746429443, "logps/rejected": -1.673842191696167, "loss": -0.0984, "rewards/accuracies": 0.75, "rewards/chosen": 0.4195786118507385, "rewards/margins": 0.17711637914180756, "rewards/rejected": 0.24246223270893097, "step": 170 }, { "epoch": 0.3577405857740586, "grad_norm": 10.741121040812216, "learning_rate": 4.056683626085422e-07, "logits/chosen": -2.6740009784698486, "logits/rejected": -2.5694196224212646, "logps/chosen": -0.98111891746521, "logps/rejected": -1.4604544639587402, "loss": -0.1277, "rewards/accuracies": 0.5, "rewards/chosen": 0.41521942615509033, "rewards/margins": 0.03261597827076912, "rewards/rejected": 0.3826034665107727, "step": 171 }, { "epoch": 0.3598326359832636, "grad_norm": 12.230786885035174, "learning_rate": 4.042350119315141e-07, "logits/chosen": -2.764572858810425, "logits/rejected": -2.6556856632232666, "logps/chosen": -1.0518836975097656, "logps/rejected": -1.255865454673767, "loss": -0.0712, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3684941828250885, "rewards/margins": 0.009007595479488373, "rewards/rejected": 0.35948657989501953, "step": 172 }, { "epoch": 0.3619246861924686, "grad_norm": 14.615276527777311, "learning_rate": 4.027934285244623e-07, "logits/chosen": -2.2994682788848877, "logits/rejected": -2.2448410987854004, "logps/chosen": -1.4600330591201782, "logps/rejected": -1.3843920230865479, "loss": -0.0957, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3354830741882324, "rewards/margins": -0.007916823029518127, "rewards/rejected": 0.34339988231658936, "step": 173 }, { "epoch": 0.36401673640167365, "grad_norm": 17.754495400703274, "learning_rate": 4.0134368933597864e-07, "logits/chosen": -2.5806937217712402, "logits/rejected": -2.5036139488220215, "logps/chosen": -1.0033631324768066, "logps/rejected": -1.66176438331604, "loss": -0.1324, "rewards/accuracies": 0.6875, "rewards/chosen": 0.43012845516204834, "rewards/margins": 0.09277307987213135, "rewards/rejected": 0.337355375289917, "step": 174 }, { "epoch": 0.36610878661087864, "grad_norm": 36.05147178915256, "learning_rate": 3.9988587174999306e-07, "logits/chosen": -2.775186538696289, "logits/rejected": -2.6564929485321045, "logps/chosen": -1.3308205604553223, "logps/rejected": -1.4604084491729736, "loss": -0.1109, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3784557282924652, "rewards/margins": 0.07769946753978729, "rewards/rejected": 0.3007562458515167, "step": 175 }, { "epoch": 0.3682008368200837, "grad_norm": 8.497342760203038, "learning_rate": 3.9842005358164267e-07, "logits/chosen": -2.674741268157959, "logits/rejected": -2.57438063621521, "logps/chosen": -1.2102327346801758, "logps/rejected": -1.3143596649169922, "loss": -0.0969, "rewards/accuracies": 0.5, "rewards/chosen": 0.36521029472351074, "rewards/margins": 0.0020803166553378105, "rewards/rejected": 0.36313000321388245, "step": 176 }, { "epoch": 0.3702928870292887, "grad_norm": 10.985890442670515, "learning_rate": 3.9694631307311825e-07, "logits/chosen": -2.6628575325012207, "logits/rejected": -2.4451780319213867, "logps/chosen": -0.8929375410079956, "logps/rejected": -1.682444453239441, "loss": -0.0701, "rewards/accuracies": 0.75, "rewards/chosen": 0.4489145576953888, "rewards/margins": 0.19804592430591583, "rewards/rejected": 0.25086861848831177, "step": 177 }, { "epoch": 0.3723849372384937, "grad_norm": 11.399134258390868, "learning_rate": 3.954647288894882e-07, "logits/chosen": -2.5748448371887207, "logits/rejected": -2.4727160930633545, "logps/chosen": -0.9902825355529785, "logps/rejected": -1.9029393196105957, "loss": -0.0998, "rewards/accuracies": 0.75, "rewards/chosen": 0.40634915232658386, "rewards/margins": 0.1376405507326126, "rewards/rejected": 0.26870861649513245, "step": 178 }, { "epoch": 0.37447698744769875, "grad_norm": 17.72907495829811, "learning_rate": 3.9397538011449896e-07, "logits/chosen": -2.5171046257019043, "logits/rejected": -2.4481072425842285, "logps/chosen": -1.2923526763916016, "logps/rejected": -1.5680699348449707, "loss": -0.1458, "rewards/accuracies": 0.625, "rewards/chosen": 0.37985485792160034, "rewards/margins": 0.10051047801971436, "rewards/rejected": 0.279344379901886, "step": 179 }, { "epoch": 0.37656903765690375, "grad_norm": 13.449254488225826, "learning_rate": 3.9247834624635404e-07, "logits/chosen": -2.5202701091766357, "logits/rejected": -2.5015406608581543, "logps/chosen": -1.03229820728302, "logps/rejected": -1.5690972805023193, "loss": -0.0937, "rewards/accuracies": 0.5, "rewards/chosen": 0.4546021819114685, "rewards/margins": 0.09776400029659271, "rewards/rejected": 0.3568382263183594, "step": 180 }, { "epoch": 0.3786610878661088, "grad_norm": 27.750872074205276, "learning_rate": 3.9097370719347065e-07, "logits/chosen": -2.4913382530212402, "logits/rejected": -2.458721160888672, "logps/chosen": -1.2403842210769653, "logps/rejected": -1.6784918308258057, "loss": -0.0936, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4432637095451355, "rewards/margins": 0.09729886054992676, "rewards/rejected": 0.34596481919288635, "step": 181 }, { "epoch": 0.3807531380753138, "grad_norm": 10.808603632088001, "learning_rate": 3.894615432702143e-07, "logits/chosen": -2.595398187637329, "logits/rejected": -2.599726915359497, "logps/chosen": -1.179286003112793, "logps/rejected": -1.8391259908676147, "loss": -0.1385, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3741753101348877, "rewards/margins": 0.12403026968240738, "rewards/rejected": 0.2501450181007385, "step": 182 }, { "epoch": 0.38284518828451886, "grad_norm": 30.462769845824713, "learning_rate": 3.879419351926115e-07, "logits/chosen": -2.8511743545532227, "logits/rejected": -2.758679151535034, "logps/chosen": -1.5136802196502686, "logps/rejected": -1.9941767454147339, "loss": -0.1107, "rewards/accuracies": 0.5, "rewards/chosen": 0.3550741672515869, "rewards/margins": 0.037071578204631805, "rewards/rejected": 0.3180025815963745, "step": 183 }, { "epoch": 0.38493723849372385, "grad_norm": 20.80095620525014, "learning_rate": 3.864149640740416e-07, "logits/chosen": -2.501633882522583, "logits/rejected": -2.5069472789764404, "logps/chosen": -0.8953936100006104, "logps/rejected": -2.3201770782470703, "loss": -0.1123, "rewards/accuracies": 0.8125, "rewards/chosen": 0.47941428422927856, "rewards/margins": 0.2701411843299866, "rewards/rejected": 0.20927312970161438, "step": 184 }, { "epoch": 0.38702928870292885, "grad_norm": 10.92510807498925, "learning_rate": 3.848807114209074e-07, "logits/chosen": -2.6651406288146973, "logits/rejected": -2.560551166534424, "logps/chosen": -1.2817895412445068, "logps/rejected": -1.9381226301193237, "loss": -0.0943, "rewards/accuracies": 0.4375, "rewards/chosen": 0.32449212670326233, "rewards/margins": 0.07041622698307037, "rewards/rejected": 0.25407588481903076, "step": 185 }, { "epoch": 0.3891213389121339, "grad_norm": 12.044202139443744, "learning_rate": 3.833392591282838e-07, "logits/chosen": -2.7797350883483887, "logits/rejected": -2.7092323303222656, "logps/chosen": -1.0567940473556519, "logps/rejected": -1.476905107498169, "loss": -0.0953, "rewards/accuracies": 0.75, "rewards/chosen": 0.39627107977867126, "rewards/margins": 0.1051664873957634, "rewards/rejected": 0.29110458493232727, "step": 186 }, { "epoch": 0.3912133891213389, "grad_norm": 18.417800983461795, "learning_rate": 3.8179068947554705e-07, "logits/chosen": -2.527510643005371, "logits/rejected": -2.4806833267211914, "logps/chosen": -1.0980507135391235, "logps/rejected": -1.4430592060089111, "loss": -0.1071, "rewards/accuracies": 0.75, "rewards/chosen": 0.37078559398651123, "rewards/margins": 0.07708199322223663, "rewards/rejected": 0.293703556060791, "step": 187 }, { "epoch": 0.39330543933054396, "grad_norm": 10.619572229193354, "learning_rate": 3.8023508512198257e-07, "logits/chosen": -2.6688337326049805, "logits/rejected": -2.5510902404785156, "logps/chosen": -1.0620704889297485, "logps/rejected": -1.6755778789520264, "loss": -0.1078, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3925957679748535, "rewards/margins": 0.10504680871963501, "rewards/rejected": 0.2875489592552185, "step": 188 }, { "epoch": 0.39539748953974896, "grad_norm": 13.725621477241875, "learning_rate": 3.786725291023728e-07, "logits/chosen": -2.6156606674194336, "logits/rejected": -2.729259490966797, "logps/chosen": -1.2324016094207764, "logps/rejected": -1.929736852645874, "loss": -0.114, "rewards/accuracies": 0.75, "rewards/chosen": 0.36698228120803833, "rewards/margins": 0.09162534028291702, "rewards/rejected": 0.2753569483757019, "step": 189 }, { "epoch": 0.39748953974895396, "grad_norm": 17.142971438072294, "learning_rate": 3.7710310482256523e-07, "logits/chosen": -2.6821255683898926, "logits/rejected": -2.609424114227295, "logps/chosen": -1.1436941623687744, "logps/rejected": -1.7539355754852295, "loss": -0.1211, "rewards/accuracies": 0.875, "rewards/chosen": 0.3664313554763794, "rewards/margins": 0.1225995272397995, "rewards/rejected": 0.2438318431377411, "step": 190 }, { "epoch": 0.399581589958159, "grad_norm": 10.808377047655993, "learning_rate": 3.7552689605501986e-07, "logits/chosen": -2.8227596282958984, "logits/rejected": -2.7516233921051025, "logps/chosen": -1.1025787591934204, "logps/rejected": -1.4772980213165283, "loss": -0.1236, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3938792943954468, "rewards/margins": 0.12577448785305023, "rewards/rejected": 0.26810479164123535, "step": 191 }, { "epoch": 0.401673640167364, "grad_norm": 9.051612364731888, "learning_rate": 3.7394398693433794e-07, "logits/chosen": -2.725497007369995, "logits/rejected": -2.604818820953369, "logps/chosen": -1.1002895832061768, "logps/rejected": -1.7089571952819824, "loss": -0.1199, "rewards/accuracies": 0.5, "rewards/chosen": 0.4390842914581299, "rewards/margins": 0.12097970396280289, "rewards/rejected": 0.3181045949459076, "step": 192 }, { "epoch": 0.40376569037656906, "grad_norm": 7.1679248444670325, "learning_rate": 3.7235446195277136e-07, "logits/chosen": -2.573514938354492, "logits/rejected": -2.376397132873535, "logps/chosen": -1.1410503387451172, "logps/rejected": -1.619983196258545, "loss": -0.1093, "rewards/accuracies": 0.5, "rewards/chosen": 0.36380043625831604, "rewards/margins": 0.03999757021665573, "rewards/rejected": 0.3238028883934021, "step": 193 }, { "epoch": 0.40585774058577406, "grad_norm": 13.32634634691774, "learning_rate": 3.7075840595571194e-07, "logits/chosen": -2.483315944671631, "logits/rejected": -2.51577091217041, "logps/chosen": -1.5389375686645508, "logps/rejected": -1.8842053413391113, "loss": -0.0454, "rewards/accuracies": 0.6875, "rewards/chosen": 0.31732869148254395, "rewards/margins": 0.020423030480742455, "rewards/rejected": 0.29690563678741455, "step": 194 }, { "epoch": 0.40794979079497906, "grad_norm": 6.509324354981061, "learning_rate": 3.691559041371631e-07, "logits/chosen": -2.787278652191162, "logits/rejected": -2.7446131706237793, "logps/chosen": -1.2347677946090698, "logps/rejected": -1.3611440658569336, "loss": -0.0695, "rewards/accuracies": 0.625, "rewards/chosen": 0.3504650890827179, "rewards/margins": 0.03077811747789383, "rewards/rejected": 0.3196869492530823, "step": 195 }, { "epoch": 0.4100418410041841, "grad_norm": 9.647934576973586, "learning_rate": 3.6754704203519204e-07, "logits/chosen": -2.79685640335083, "logits/rejected": -2.7126359939575195, "logps/chosen": -0.9442129731178284, "logps/rejected": -1.1807160377502441, "loss": -0.1063, "rewards/accuracies": 0.5625, "rewards/chosen": 0.435344934463501, "rewards/margins": 0.0697101503610611, "rewards/rejected": 0.3656347990036011, "step": 196 }, { "epoch": 0.4121338912133891, "grad_norm": 7.90674670924584, "learning_rate": 3.659319055273644e-07, "logits/chosen": -2.6790804862976074, "logits/rejected": -2.638993740081787, "logps/chosen": -1.2018253803253174, "logps/rejected": -1.8742297887802124, "loss": -0.1294, "rewards/accuracies": 0.75, "rewards/chosen": 0.3638248145580292, "rewards/margins": 0.11370320618152618, "rewards/rejected": 0.2501215934753418, "step": 197 }, { "epoch": 0.41422594142259417, "grad_norm": 7.339508519985613, "learning_rate": 3.643105808261596e-07, "logits/chosen": -2.8512470722198486, "logits/rejected": -2.8233699798583984, "logps/chosen": -0.9459013938903809, "logps/rejected": -1.6173968315124512, "loss": -0.1092, "rewards/accuracies": 0.6875, "rewards/chosen": 0.43863311409950256, "rewards/margins": 0.17252716422080994, "rewards/rejected": 0.2661059498786926, "step": 198 }, { "epoch": 0.41631799163179917, "grad_norm": 8.240459127134638, "learning_rate": 3.626831544743697e-07, "logits/chosen": -2.6968235969543457, "logits/rejected": -2.6434929370880127, "logps/chosen": -0.81470787525177, "logps/rejected": -2.106025218963623, "loss": -0.0904, "rewards/accuracies": 0.75, "rewards/chosen": 0.4797627925872803, "rewards/margins": 0.17039833962917328, "rewards/rejected": 0.3093644380569458, "step": 199 }, { "epoch": 0.41841004184100417, "grad_norm": 6.44111013570436, "learning_rate": 3.610497133404795e-07, "logits/chosen": -2.7517199516296387, "logits/rejected": -2.6663498878479004, "logps/chosen": -1.2116317749023438, "logps/rejected": -1.853126883506775, "loss": -0.0994, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3775237798690796, "rewards/margins": 0.12298993766307831, "rewards/rejected": 0.2545338273048401, "step": 200 }, { "epoch": 0.4205020920502092, "grad_norm": 13.368515581855178, "learning_rate": 3.594103446140297e-07, "logits/chosen": -2.706216335296631, "logits/rejected": -2.6910247802734375, "logps/chosen": -1.0522258281707764, "logps/rejected": -2.026273727416992, "loss": -0.1004, "rewards/accuracies": 0.625, "rewards/chosen": 0.39808568358421326, "rewards/margins": 0.06647545844316483, "rewards/rejected": 0.33161020278930664, "step": 201 }, { "epoch": 0.4225941422594142, "grad_norm": 10.440505640054965, "learning_rate": 3.5776513580096313e-07, "logits/chosen": -2.8171496391296387, "logits/rejected": -2.7153167724609375, "logps/chosen": -1.0142191648483276, "logps/rejected": -1.8539962768554688, "loss": -0.112, "rewards/accuracies": 0.625, "rewards/chosen": 0.419700026512146, "rewards/margins": 0.18621891736984253, "rewards/rejected": 0.23348110914230347, "step": 202 }, { "epoch": 0.4246861924686193, "grad_norm": 12.957381005981867, "learning_rate": 3.5611417471895376e-07, "logits/chosen": -2.5837535858154297, "logits/rejected": -2.5695419311523438, "logps/chosen": -1.0546152591705322, "logps/rejected": -1.3897829055786133, "loss": -0.1011, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4112699031829834, "rewards/margins": 0.07434143126010895, "rewards/rejected": 0.33692848682403564, "step": 203 }, { "epoch": 0.42677824267782427, "grad_norm": 11.867645234753555, "learning_rate": 3.5445754949271924e-07, "logits/chosen": -2.698544979095459, "logits/rejected": -2.6663873195648193, "logps/chosen": -1.3959226608276367, "logps/rejected": -1.8983134031295776, "loss": -0.1, "rewards/accuracies": 0.625, "rewards/chosen": 0.32857778668403625, "rewards/margins": 0.07399465888738632, "rewards/rejected": 0.25458312034606934, "step": 204 }, { "epoch": 0.42887029288702927, "grad_norm": 11.364438697927202, "learning_rate": 3.5279534854931674e-07, "logits/chosen": -2.551753044128418, "logits/rejected": -2.554995536804199, "logps/chosen": -0.8723805546760559, "logps/rejected": -1.1063939332962036, "loss": -0.1265, "rewards/accuracies": 0.5625, "rewards/chosen": 0.5010254979133606, "rewards/margins": 0.12164342403411865, "rewards/rejected": 0.37938204407691956, "step": 205 }, { "epoch": 0.4309623430962343, "grad_norm": 8.03697196685394, "learning_rate": 3.511276606134234e-07, "logits/chosen": -2.748340606689453, "logits/rejected": -2.7050065994262695, "logps/chosen": -1.1861228942871094, "logps/rejected": -1.7552168369293213, "loss": -0.1095, "rewards/accuracies": 0.75, "rewards/chosen": 0.3975922465324402, "rewards/margins": 0.14437630772590637, "rewards/rejected": 0.2532159090042114, "step": 206 }, { "epoch": 0.4330543933054393, "grad_norm": 13.082858779626731, "learning_rate": 3.4945457470259987e-07, "logits/chosen": -2.761401653289795, "logits/rejected": -2.730571746826172, "logps/chosen": -1.2381715774536133, "logps/rejected": -1.7709894180297852, "loss": -0.099, "rewards/accuracies": 0.5625, "rewards/chosen": 0.32908713817596436, "rewards/margins": -0.023325469344854355, "rewards/rejected": 0.352412611246109, "step": 207 }, { "epoch": 0.4351464435146444, "grad_norm": 10.455595309048718, "learning_rate": 3.4777618012253895e-07, "logits/chosen": -2.8608574867248535, "logits/rejected": -2.6432385444641113, "logps/chosen": -1.0563664436340332, "logps/rejected": -1.7969838380813599, "loss": -0.0882, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4247441291809082, "rewards/margins": 0.1520257294178009, "rewards/rejected": 0.2727183699607849, "step": 208 }, { "epoch": 0.4372384937238494, "grad_norm": 9.638630516945211, "learning_rate": 3.4609256646229903e-07, "logits/chosen": -2.3889663219451904, "logits/rejected": -2.1771650314331055, "logps/chosen": -1.2622408866882324, "logps/rejected": -1.732050895690918, "loss": -0.095, "rewards/accuracies": 0.625, "rewards/chosen": 0.4195970594882965, "rewards/margins": 0.1015428677201271, "rewards/rejected": 0.31805419921875, "step": 209 }, { "epoch": 0.4393305439330544, "grad_norm": 9.12468771204172, "learning_rate": 3.4440382358952115e-07, "logits/chosen": -2.6972784996032715, "logits/rejected": -2.6877102851867676, "logps/chosen": -1.0190376043319702, "logps/rejected": -1.5476694107055664, "loss": -0.0858, "rewards/accuracies": 0.6875, "rewards/chosen": 0.42147475481033325, "rewards/margins": 0.12910205125808716, "rewards/rejected": 0.2923726737499237, "step": 210 }, { "epoch": 0.44142259414225943, "grad_norm": 32.3610083786221, "learning_rate": 3.4271004164563294e-07, "logits/chosen": -2.7253365516662598, "logits/rejected": -2.6379623413085938, "logps/chosen": -1.332204818725586, "logps/rejected": -1.8204054832458496, "loss": -0.1333, "rewards/accuracies": 0.5625, "rewards/chosen": 0.32043054699897766, "rewards/margins": 0.06172497570514679, "rewards/rejected": 0.25870558619499207, "step": 211 }, { "epoch": 0.4435146443514644, "grad_norm": 10.842115823256194, "learning_rate": 3.410113110410366e-07, "logits/chosen": -2.586061477661133, "logits/rejected": -2.564213514328003, "logps/chosen": -2.031338691711426, "logps/rejected": -2.1223268508911133, "loss": -0.0833, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3001391291618347, "rewards/margins": 0.08526147902011871, "rewards/rejected": 0.2148776352405548, "step": 212 }, { "epoch": 0.4456066945606695, "grad_norm": 9.514760245882242, "learning_rate": 3.3930772245028317e-07, "logits/chosen": -2.7689568996429443, "logits/rejected": -2.7311832904815674, "logps/chosen": -1.108777642250061, "logps/rejected": -1.4788711071014404, "loss": -0.0744, "rewards/accuracies": 0.75, "rewards/chosen": 0.36296340823173523, "rewards/margins": 0.10396925359964371, "rewards/rejected": 0.2589941620826721, "step": 213 }, { "epoch": 0.4476987447698745, "grad_norm": 18.402743823262046, "learning_rate": 3.3759936680723233e-07, "logits/chosen": -2.7321577072143555, "logits/rejected": -2.66558837890625, "logps/chosen": -1.065527081489563, "logps/rejected": -2.6747887134552, "loss": -0.1122, "rewards/accuracies": 0.75, "rewards/chosen": 0.36968982219696045, "rewards/margins": 0.13903871178627014, "rewards/rejected": 0.2306510955095291, "step": 214 }, { "epoch": 0.4497907949790795, "grad_norm": 18.17225957514382, "learning_rate": 3.3588633530019866e-07, "logits/chosen": -2.7257394790649414, "logits/rejected": -2.6584632396698, "logps/chosen": -1.2919788360595703, "logps/rejected": -2.163170099258423, "loss": -0.0922, "rewards/accuracies": 0.6875, "rewards/chosen": 0.324113667011261, "rewards/margins": 0.10252688080072403, "rewards/rejected": 0.22158677875995636, "step": 215 }, { "epoch": 0.45188284518828453, "grad_norm": 9.915365267290412, "learning_rate": 3.341687193670843e-07, "logits/chosen": -2.619964599609375, "logits/rejected": -2.6678147315979004, "logps/chosen": -1.0964298248291016, "logps/rejected": -1.6010725498199463, "loss": -0.1037, "rewards/accuracies": 0.6875, "rewards/chosen": 0.363621324300766, "rewards/margins": 0.09185197204351425, "rewards/rejected": 0.27176934480667114, "step": 216 }, { "epoch": 0.45397489539748953, "grad_norm": 12.895209885390825, "learning_rate": 3.3244661069049806e-07, "logits/chosen": -2.659514904022217, "logits/rejected": -2.7047770023345947, "logps/chosen": -1.2583937644958496, "logps/rejected": -1.487951636314392, "loss": -0.1083, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3770939111709595, "rewards/margins": 0.06072164326906204, "rewards/rejected": 0.316372275352478, "step": 217 }, { "epoch": 0.4560669456066946, "grad_norm": 10.495110963013996, "learning_rate": 3.3072010119286155e-07, "logits/chosen": -2.5267844200134277, "logits/rejected": -2.553617477416992, "logps/chosen": -1.634291410446167, "logps/rejected": -2.40535569190979, "loss": -0.1146, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3271016478538513, "rewards/margins": 0.12336497008800507, "rewards/rejected": 0.20373669266700745, "step": 218 }, { "epoch": 0.4581589958158996, "grad_norm": 17.247831788836987, "learning_rate": 3.289892830315028e-07, "logits/chosen": -2.6873631477355957, "logits/rejected": -2.706193447113037, "logps/chosen": -1.4748320579528809, "logps/rejected": -1.753096103668213, "loss": -0.111, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3177873194217682, "rewards/margins": 0.0883328765630722, "rewards/rejected": 0.2294544279575348, "step": 219 }, { "epoch": 0.4602510460251046, "grad_norm": 11.987244502458385, "learning_rate": 3.272542485937368e-07, "logits/chosen": -2.706116199493408, "logits/rejected": -2.677891254425049, "logps/chosen": -1.3741241693496704, "logps/rejected": -2.315911293029785, "loss": -0.1119, "rewards/accuracies": 0.5, "rewards/chosen": 0.2902843952178955, "rewards/margins": 0.04889319837093353, "rewards/rejected": 0.24139118194580078, "step": 220 }, { "epoch": 0.46234309623430964, "grad_norm": 12.418005612952522, "learning_rate": 3.2551509049193444e-07, "logits/chosen": -2.2188100814819336, "logits/rejected": -2.2306671142578125, "logps/chosen": -1.245575189590454, "logps/rejected": -1.4142975807189941, "loss": -0.0828, "rewards/accuracies": 0.4375, "rewards/chosen": 0.36663031578063965, "rewards/margins": 0.045284636318683624, "rewards/rejected": 0.32134565711021423, "step": 221 }, { "epoch": 0.46443514644351463, "grad_norm": 20.459625679835504, "learning_rate": 3.2377190155857864e-07, "logits/chosen": -2.6385624408721924, "logits/rejected": -2.5443832874298096, "logps/chosen": -1.0716688632965088, "logps/rejected": -2.228652000427246, "loss": -0.1143, "rewards/accuracies": 0.625, "rewards/chosen": 0.3814164102077484, "rewards/margins": 0.11980819702148438, "rewards/rejected": 0.26160821318626404, "step": 222 }, { "epoch": 0.4665271966527197, "grad_norm": 9.310053766507147, "learning_rate": 3.220247748413094e-07, "logits/chosen": -2.6578550338745117, "logits/rejected": -2.5362112522125244, "logps/chosen": -1.8237850666046143, "logps/rejected": -2.437105655670166, "loss": -0.1295, "rewards/accuracies": 0.6875, "rewards/chosen": 0.29475274682044983, "rewards/margins": 0.08544115722179413, "rewards/rejected": 0.2093116044998169, "step": 223 }, { "epoch": 0.4686192468619247, "grad_norm": 26.131595636840544, "learning_rate": 3.2027380359795706e-07, "logits/chosen": -2.605294942855835, "logits/rejected": -2.6494808197021484, "logps/chosen": -1.6993333101272583, "logps/rejected": -2.158817768096924, "loss": -0.1396, "rewards/accuracies": 0.625, "rewards/chosen": 0.29801005125045776, "rewards/margins": 0.04826948419213295, "rewards/rejected": 0.24974055588245392, "step": 224 }, { "epoch": 0.4707112970711297, "grad_norm": 15.164313053378295, "learning_rate": 3.185190812915646e-07, "logits/chosen": -2.7706186771392822, "logits/rejected": -2.7250330448150635, "logps/chosen": -1.3935312032699585, "logps/rejected": -1.8978990316390991, "loss": -0.0839, "rewards/accuracies": 0.625, "rewards/chosen": 0.37241330742836, "rewards/margins": 0.08906591683626175, "rewards/rejected": 0.28334739804267883, "step": 225 }, { "epoch": 0.47280334728033474, "grad_norm": 12.335340044509186, "learning_rate": 3.167607015853983e-07, "logits/chosen": -2.734358787536621, "logits/rejected": -2.7213616371154785, "logps/chosen": -1.0013186931610107, "logps/rejected": -1.8874804973602295, "loss": -0.1055, "rewards/accuracies": 0.75, "rewards/chosen": 0.3800060451030731, "rewards/margins": 0.12340433895587921, "rewards/rejected": 0.2566016912460327, "step": 226 }, { "epoch": 0.47489539748953974, "grad_norm": 13.155370252466945, "learning_rate": 3.149987583379485e-07, "logits/chosen": -2.5443460941314697, "logits/rejected": -2.6617841720581055, "logps/chosen": -1.8027722835540771, "logps/rejected": -1.8264943361282349, "loss": -0.0883, "rewards/accuracies": 0.3125, "rewards/chosen": 0.29171156883239746, "rewards/margins": -0.008143829181790352, "rewards/rejected": 0.29985541105270386, "step": 227 }, { "epoch": 0.4769874476987448, "grad_norm": 9.64227092918857, "learning_rate": 3.1323334559792015e-07, "logits/chosen": -2.615062713623047, "logits/rejected": -2.5575170516967773, "logps/chosen": -1.3669350147247314, "logps/rejected": -2.0923495292663574, "loss": -0.0975, "rewards/accuracies": 0.625, "rewards/chosen": 0.3245178461074829, "rewards/margins": 0.047479961067438126, "rewards/rejected": 0.2770378887653351, "step": 228 }, { "epoch": 0.4790794979079498, "grad_norm": 27.480598402768067, "learning_rate": 3.114645575992116e-07, "logits/chosen": -2.754039764404297, "logits/rejected": -2.71734619140625, "logps/chosen": -1.1088240146636963, "logps/rejected": -1.2668546438217163, "loss": -0.1377, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3654661178588867, "rewards/margins": 0.04204658791422844, "rewards/rejected": 0.3234195411205292, "step": 229 }, { "epoch": 0.4811715481171548, "grad_norm": 29.703059245061358, "learning_rate": 3.096924887558854e-07, "logits/chosen": -2.4283628463745117, "logits/rejected": -2.3639044761657715, "logps/chosen": -1.721587061882019, "logps/rejected": -1.948933720588684, "loss": -0.095, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2999853789806366, "rewards/margins": 0.036863137036561966, "rewards/rejected": 0.2631222605705261, "step": 230 }, { "epoch": 0.48326359832635984, "grad_norm": 16.287734133486, "learning_rate": 3.079172336571286e-07, "logits/chosen": -2.5601136684417725, "logits/rejected": -2.435715675354004, "logps/chosen": -1.1398509740829468, "logps/rejected": -1.23728346824646, "loss": -0.1092, "rewards/accuracies": 0.5, "rewards/chosen": 0.37845513224601746, "rewards/margins": 0.035441212356090546, "rewards/rejected": 0.3430139422416687, "step": 231 }, { "epoch": 0.48535564853556484, "grad_norm": 21.134919315037024, "learning_rate": 3.061388870622033e-07, "logits/chosen": -2.6344518661499023, "logits/rejected": -2.6067075729370117, "logps/chosen": -1.2356458902359009, "logps/rejected": -1.4378910064697266, "loss": -0.0805, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3725782036781311, "rewards/margins": 0.08114241808652878, "rewards/rejected": 0.29143577814102173, "step": 232 }, { "epoch": 0.4874476987447699, "grad_norm": 16.14848191061866, "learning_rate": 3.0435754389538925e-07, "logits/chosen": -2.6492137908935547, "logits/rejected": -2.6512625217437744, "logps/chosen": -0.9026902914047241, "logps/rejected": -1.7054013013839722, "loss": -0.1189, "rewards/accuracies": 0.875, "rewards/chosen": 0.45766186714172363, "rewards/margins": 0.2092771828174591, "rewards/rejected": 0.24838465452194214, "step": 233 }, { "epoch": 0.4895397489539749, "grad_norm": 13.080452441868529, "learning_rate": 3.0257329924091654e-07, "logits/chosen": -2.4482171535491943, "logits/rejected": -2.4012904167175293, "logps/chosen": -1.2280887365341187, "logps/rejected": -2.007750988006592, "loss": -0.1044, "rewards/accuracies": 0.6875, "rewards/chosen": 0.34713730216026306, "rewards/margins": 0.0972638949751854, "rewards/rejected": 0.24987338483333588, "step": 234 }, { "epoch": 0.4916317991631799, "grad_norm": 14.805770182211953, "learning_rate": 3.007862483378906e-07, "logits/chosen": -2.4385581016540527, "logits/rejected": -2.5235042572021484, "logps/chosen": -0.982105553150177, "logps/rejected": -2.118596315383911, "loss": -0.1241, "rewards/accuracies": 0.625, "rewards/chosen": 0.4126356542110443, "rewards/margins": 0.11890283226966858, "rewards/rejected": 0.29373282194137573, "step": 235 }, { "epoch": 0.49372384937238495, "grad_norm": 17.000928068072675, "learning_rate": 2.989964865752079e-07, "logits/chosen": -2.6185555458068848, "logits/rejected": -2.569633960723877, "logps/chosen": -1.3694891929626465, "logps/rejected": -2.023630380630493, "loss": -0.1363, "rewards/accuracies": 0.75, "rewards/chosen": 0.4494704008102417, "rewards/margins": 0.11591412127017975, "rewards/rejected": 0.33355626463890076, "step": 236 }, { "epoch": 0.49581589958158995, "grad_norm": 11.97132435495436, "learning_rate": 2.97204109486465e-07, "logits/chosen": -2.6144487857818604, "logits/rejected": -2.5463814735412598, "logps/chosen": -1.415244460105896, "logps/rejected": -2.0929644107818604, "loss": -0.0947, "rewards/accuracies": 0.625, "rewards/chosen": 0.34909066557884216, "rewards/margins": 0.054297953844070435, "rewards/rejected": 0.29479271173477173, "step": 237 }, { "epoch": 0.497907949790795, "grad_norm": 13.176263587191198, "learning_rate": 2.954092127448591e-07, "logits/chosen": -2.405916690826416, "logits/rejected": -2.5460305213928223, "logps/chosen": -1.3031535148620605, "logps/rejected": -1.7486495971679688, "loss": -0.0982, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3606809377670288, "rewards/margins": 0.08783643692731857, "rewards/rejected": 0.27284449338912964, "step": 238 }, { "epoch": 0.5, "grad_norm": 10.727719660590079, "learning_rate": 2.9361189215808057e-07, "logits/chosen": -2.478989601135254, "logits/rejected": -2.439676284790039, "logps/chosen": -1.472078800201416, "logps/rejected": -1.6831552982330322, "loss": -0.1018, "rewards/accuracies": 0.625, "rewards/chosen": 0.35690170526504517, "rewards/margins": 0.0915931761264801, "rewards/rejected": 0.26530852913856506, "step": 239 }, { "epoch": 0.502092050209205, "grad_norm": 6.3268417752750254, "learning_rate": 2.9181224366319943e-07, "logits/chosen": -2.3423075675964355, "logits/rejected": -2.273186683654785, "logps/chosen": -0.9384135007858276, "logps/rejected": -2.9800286293029785, "loss": -0.1227, "rewards/accuracies": 0.875, "rewards/chosen": 0.4423084259033203, "rewards/margins": 0.2813805341720581, "rewards/rejected": 0.1609278917312622, "step": 240 }, { "epoch": 0.50418410041841, "grad_norm": 12.995796697837475, "learning_rate": 2.900103633215447e-07, "logits/chosen": -2.729633331298828, "logits/rejected": -2.6653881072998047, "logps/chosen": -0.8296223878860474, "logps/rejected": -1.6522765159606934, "loss": -0.1, "rewards/accuracies": 0.875, "rewards/chosen": 0.48354005813598633, "rewards/margins": 0.20459306240081787, "rewards/rejected": 0.27894699573516846, "step": 241 }, { "epoch": 0.5062761506276151, "grad_norm": 18.75630509223207, "learning_rate": 2.882063473135763e-07, "logits/chosen": -2.5082345008850098, "logits/rejected": -2.5162205696105957, "logps/chosen": -1.2960155010223389, "logps/rejected": -2.2314138412475586, "loss": -0.1344, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4037836790084839, "rewards/margins": 0.16876041889190674, "rewards/rejected": 0.23502326011657715, "step": 242 }, { "epoch": 0.5083682008368201, "grad_norm": 8.482528682337579, "learning_rate": 2.864002919337512e-07, "logits/chosen": -2.416985511779785, "logits/rejected": -2.3670668601989746, "logps/chosen": -1.2545527219772339, "logps/rejected": -1.3394420146942139, "loss": -0.1234, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3523995876312256, "rewards/margins": 0.04634542763233185, "rewards/rejected": 0.30605414509773254, "step": 243 }, { "epoch": 0.5104602510460251, "grad_norm": 10.719105172459228, "learning_rate": 2.8459229358538404e-07, "logits/chosen": -2.5898609161376953, "logits/rejected": -2.541146755218506, "logps/chosen": -1.346826434135437, "logps/rejected": -1.5069986581802368, "loss": -0.1222, "rewards/accuracies": 0.625, "rewards/chosen": 0.3515220880508423, "rewards/margins": 0.023752853274345398, "rewards/rejected": 0.3277692198753357, "step": 244 }, { "epoch": 0.5125523012552301, "grad_norm": 11.8510977063605, "learning_rate": 2.827824487755007e-07, "logits/chosen": -2.5046603679656982, "logits/rejected": -2.5287513732910156, "logps/chosen": -0.9352712035179138, "logps/rejected": -1.6383126974105835, "loss": -0.1216, "rewards/accuracies": 0.6875, "rewards/chosen": 0.43307042121887207, "rewards/margins": 0.14084957540035248, "rewards/rejected": 0.2922208607196808, "step": 245 }, { "epoch": 0.5146443514644351, "grad_norm": 15.503410702429422, "learning_rate": 2.8097085410968694e-07, "logits/chosen": -2.634338855743408, "logits/rejected": -2.51353120803833, "logps/chosen": -1.4211082458496094, "logps/rejected": -2.5676441192626953, "loss": -0.0967, "rewards/accuracies": 0.8125, "rewards/chosen": 0.36743271350860596, "rewards/margins": 0.1599361002445221, "rewards/rejected": 0.20749661326408386, "step": 246 }, { "epoch": 0.5167364016736402, "grad_norm": 14.807369583073374, "learning_rate": 2.7915760628693253e-07, "logits/chosen": -2.6446118354797363, "logits/rejected": -2.5478878021240234, "logps/chosen": -1.0431950092315674, "logps/rejected": -2.3127846717834473, "loss": -0.1089, "rewards/accuracies": 1.0, "rewards/chosen": 0.4418485760688782, "rewards/margins": 0.20923131704330444, "rewards/rejected": 0.23261725902557373, "step": 247 }, { "epoch": 0.5188284518828452, "grad_norm": 101.83732528062832, "learning_rate": 2.7734280209446865e-07, "logits/chosen": -2.5180978775024414, "logits/rejected": -2.453972339630127, "logps/chosen": -0.8324543237686157, "logps/rejected": -1.6864728927612305, "loss": -0.1014, "rewards/accuracies": 0.75, "rewards/chosen": 0.46302682161331177, "rewards/margins": 0.18120057880878448, "rewards/rejected": 0.2818262279033661, "step": 248 }, { "epoch": 0.5209205020920502, "grad_norm": 9.351081141015694, "learning_rate": 2.755265384026023e-07, "logits/chosen": -2.600846290588379, "logits/rejected": -2.6015748977661133, "logps/chosen": -1.2091116905212402, "logps/rejected": -1.1987733840942383, "loss": -0.0511, "rewards/accuracies": 0.5, "rewards/chosen": 0.3433293104171753, "rewards/margins": 0.025443512946367264, "rewards/rejected": 0.31788578629493713, "step": 249 }, { "epoch": 0.5230125523012552, "grad_norm": 6.870687623461676, "learning_rate": 2.7370891215954565e-07, "logits/chosen": -2.494401454925537, "logits/rejected": -2.4487247467041016, "logps/chosen": -1.596292495727539, "logps/rejected": -1.5297508239746094, "loss": -0.1328, "rewards/accuracies": 0.375, "rewards/chosen": 0.34035491943359375, "rewards/margins": 0.039452340453863144, "rewards/rejected": 0.3009026050567627, "step": 250 }, { "epoch": 0.5251046025104602, "grad_norm": 13.661153720373681, "learning_rate": 2.7189002038624057e-07, "logits/chosen": -2.5769824981689453, "logits/rejected": -2.481782913208008, "logps/chosen": -1.1278202533721924, "logps/rejected": -1.621917963027954, "loss": -0.0929, "rewards/accuracies": 0.625, "rewards/chosen": 0.37641996145248413, "rewards/margins": 0.02839614823460579, "rewards/rejected": 0.34802380204200745, "step": 251 }, { "epoch": 0.5271966527196653, "grad_norm": 24.04163690716132, "learning_rate": 2.7006996017118027e-07, "logits/chosen": -2.618551254272461, "logits/rejected": -2.5014870166778564, "logps/chosen": -1.0564649105072021, "logps/rejected": -1.7426464557647705, "loss": -0.0936, "rewards/accuracies": 0.75, "rewards/chosen": 0.3888087868690491, "rewards/margins": 0.12157319486141205, "rewards/rejected": 0.2672356367111206, "step": 252 }, { "epoch": 0.5292887029288703, "grad_norm": 14.053938365943903, "learning_rate": 2.682488286652269e-07, "logits/chosen": -2.354100227355957, "logits/rejected": -2.320981502532959, "logps/chosen": -1.0166618824005127, "logps/rejected": -2.0148091316223145, "loss": -0.1304, "rewards/accuracies": 0.75, "rewards/chosen": 0.4282534122467041, "rewards/margins": 0.19922780990600586, "rewards/rejected": 0.22902560234069824, "step": 253 }, { "epoch": 0.5313807531380753, "grad_norm": 9.783298850662142, "learning_rate": 2.6642672307642573e-07, "logits/chosen": -2.5941338539123535, "logits/rejected": -2.4625022411346436, "logps/chosen": -1.620993971824646, "logps/rejected": -1.5014917850494385, "loss": -0.1108, "rewards/accuracies": 0.5625, "rewards/chosen": 0.350075900554657, "rewards/margins": 0.060651615262031555, "rewards/rejected": 0.2894243001937866, "step": 254 }, { "epoch": 0.5334728033472803, "grad_norm": 43.27055232189887, "learning_rate": 2.646037406648165e-07, "logits/chosen": -2.4940779209136963, "logits/rejected": -2.4904818534851074, "logps/chosen": -1.324521541595459, "logps/rejected": -1.9451937675476074, "loss": -0.0874, "rewards/accuracies": 0.625, "rewards/chosen": 0.32973212003707886, "rewards/margins": 0.11690256744623184, "rewards/rejected": 0.2128295600414276, "step": 255 }, { "epoch": 0.5355648535564853, "grad_norm": 32.69660335713406, "learning_rate": 2.6277997873724176e-07, "logits/chosen": -2.576993465423584, "logits/rejected": -2.549190044403076, "logps/chosen": -1.1641985177993774, "logps/rejected": -1.3727514743804932, "loss": -0.1371, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3553909659385681, "rewards/margins": 0.04894494265317917, "rewards/rejected": 0.30644604563713074, "step": 256 }, { "epoch": 0.5376569037656904, "grad_norm": 17.152292597708968, "learning_rate": 2.609555346421532e-07, "logits/chosen": -2.582218647003174, "logits/rejected": -2.4460763931274414, "logps/chosen": -0.8872560262680054, "logps/rejected": -2.193643093109131, "loss": -0.1292, "rewards/accuracies": 0.75, "rewards/chosen": 0.4860767722129822, "rewards/margins": 0.2561631202697754, "rewards/rejected": 0.2299136519432068, "step": 257 }, { "epoch": 0.5397489539748954, "grad_norm": 20.420927555489797, "learning_rate": 2.5913050576441473e-07, "logits/chosen": -2.537754535675049, "logits/rejected": -2.533719062805176, "logps/chosen": -1.4319506883621216, "logps/rejected": -1.8760106563568115, "loss": -0.1314, "rewards/accuracies": 0.625, "rewards/chosen": 0.41588470339775085, "rewards/margins": 0.10261406004428864, "rewards/rejected": 0.313270628452301, "step": 258 }, { "epoch": 0.5418410041841004, "grad_norm": 24.83960755558801, "learning_rate": 2.5730498952010496e-07, "logits/chosen": -2.3227603435516357, "logits/rejected": -2.322937488555908, "logps/chosen": -1.1083595752716064, "logps/rejected": -1.3232736587524414, "loss": -0.1028, "rewards/accuracies": 0.625, "rewards/chosen": 0.375250905752182, "rewards/margins": 0.0630226731300354, "rewards/rejected": 0.3122282326221466, "step": 259 }, { "epoch": 0.5439330543933054, "grad_norm": 14.323617407100764, "learning_rate": 2.55479083351317e-07, "logits/chosen": -2.5259079933166504, "logits/rejected": -2.4510679244995117, "logps/chosen": -0.9364595413208008, "logps/rejected": -2.3588366508483887, "loss": -0.0809, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4500068426132202, "rewards/margins": 0.18414944410324097, "rewards/rejected": 0.26585739850997925, "step": 260 }, { "epoch": 0.5460251046025104, "grad_norm": 17.10022407964933, "learning_rate": 2.536528847209573e-07, "logits/chosen": -2.4352588653564453, "logits/rejected": -2.2389445304870605, "logps/chosen": -1.1492376327514648, "logps/rejected": -1.8517694473266602, "loss": -0.124, "rewards/accuracies": 0.625, "rewards/chosen": 0.4028159976005554, "rewards/margins": 0.12413935363292694, "rewards/rejected": 0.2786766290664673, "step": 261 }, { "epoch": 0.5481171548117155, "grad_norm": 10.190814456155364, "learning_rate": 2.5182649110754325e-07, "logits/chosen": -2.412586212158203, "logits/rejected": -2.4488370418548584, "logps/chosen": -1.2817234992980957, "logps/rejected": -1.8377283811569214, "loss": -0.1022, "rewards/accuracies": 0.75, "rewards/chosen": 0.38602763414382935, "rewards/margins": 0.09022464603185654, "rewards/rejected": 0.2958029508590698, "step": 262 }, { "epoch": 0.5502092050209205, "grad_norm": 8.793628588219915, "learning_rate": 2.5e-07, "logits/chosen": -2.462949275970459, "logits/rejected": -2.5068588256835938, "logps/chosen": -1.3072988986968994, "logps/rejected": -1.7679378986358643, "loss": -0.0738, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3691004514694214, "rewards/margins": 0.07263248413801193, "rewards/rejected": 0.29646795988082886, "step": 263 }, { "epoch": 0.5523012552301255, "grad_norm": 15.380045241350123, "learning_rate": 2.4817350889245673e-07, "logits/chosen": -2.341583013534546, "logits/rejected": -2.3073818683624268, "logps/chosen": -1.3789433240890503, "logps/rejected": -2.094804048538208, "loss": -0.1176, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3920101821422577, "rewards/margins": 0.10554394125938416, "rewards/rejected": 0.28646624088287354, "step": 264 }, { "epoch": 0.5543933054393305, "grad_norm": 8.58809117665816, "learning_rate": 2.463471152790427e-07, "logits/chosen": -2.4648337364196777, "logits/rejected": -2.2489168643951416, "logps/chosen": -1.2276692390441895, "logps/rejected": -2.249476909637451, "loss": -0.1457, "rewards/accuracies": 0.8125, "rewards/chosen": 0.48874348402023315, "rewards/margins": 0.22283288836479187, "rewards/rejected": 0.2659105658531189, "step": 265 }, { "epoch": 0.5564853556485355, "grad_norm": 11.286756360470665, "learning_rate": 2.44520916648683e-07, "logits/chosen": -2.3162903785705566, "logits/rejected": -2.3765616416931152, "logps/chosen": -1.3577649593353271, "logps/rejected": -2.117098808288574, "loss": -0.1085, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3324105441570282, "rewards/margins": 0.09969653189182281, "rewards/rejected": 0.23271401226520538, "step": 266 }, { "epoch": 0.5585774058577406, "grad_norm": 15.174465033011408, "learning_rate": 2.426950104798951e-07, "logits/chosen": -2.4470949172973633, "logits/rejected": -2.42891788482666, "logps/chosen": -1.2459077835083008, "logps/rejected": -1.9684381484985352, "loss": -0.11, "rewards/accuracies": 0.9375, "rewards/chosen": 0.40111368894577026, "rewards/margins": 0.14265714585781097, "rewards/rejected": 0.2584565281867981, "step": 267 }, { "epoch": 0.5606694560669456, "grad_norm": 14.450224054308865, "learning_rate": 2.4086949423558525e-07, "logits/chosen": -2.407813549041748, "logits/rejected": -2.3005266189575195, "logps/chosen": -1.276888370513916, "logps/rejected": -2.612614631652832, "loss": -0.112, "rewards/accuracies": 0.75, "rewards/chosen": 0.39179039001464844, "rewards/margins": 0.14099331200122833, "rewards/rejected": 0.2507970631122589, "step": 268 }, { "epoch": 0.5627615062761506, "grad_norm": 17.153862681122654, "learning_rate": 2.3904446535784686e-07, "logits/chosen": -2.3327040672302246, "logits/rejected": -2.223817825317383, "logps/chosen": -1.4948809146881104, "logps/rejected": -2.7149105072021484, "loss": -0.1135, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3385244309902191, "rewards/margins": 0.10032963752746582, "rewards/rejected": 0.2381947934627533, "step": 269 }, { "epoch": 0.5648535564853556, "grad_norm": 16.515843039005382, "learning_rate": 2.3722002126275822e-07, "logits/chosen": -2.504027843475342, "logits/rejected": -2.405820369720459, "logps/chosen": -0.9141228199005127, "logps/rejected": -1.5929572582244873, "loss": -0.1279, "rewards/accuracies": 0.75, "rewards/chosen": 0.42940253019332886, "rewards/margins": 0.11588534712791443, "rewards/rejected": 0.3135172128677368, "step": 270 }, { "epoch": 0.5669456066945606, "grad_norm": 10.230006000297033, "learning_rate": 2.353962593351835e-07, "logits/chosen": -2.282273292541504, "logits/rejected": -2.239558696746826, "logps/chosen": -1.2985116243362427, "logps/rejected": -2.147219181060791, "loss": -0.1401, "rewards/accuracies": 0.8125, "rewards/chosen": 0.33533766865730286, "rewards/margins": 0.10907959938049316, "rewards/rejected": 0.2262580394744873, "step": 271 }, { "epoch": 0.5690376569037657, "grad_norm": 29.235986727351534, "learning_rate": 2.3357327692357428e-07, "logits/chosen": -2.3464512825012207, "logits/rejected": -2.2939443588256836, "logps/chosen": -0.868351936340332, "logps/rejected": -2.0506629943847656, "loss": -0.1217, "rewards/accuracies": 0.75, "rewards/chosen": 0.46230238676071167, "rewards/margins": 0.1395689845085144, "rewards/rejected": 0.32273340225219727, "step": 272 }, { "epoch": 0.5711297071129707, "grad_norm": 13.500310258479074, "learning_rate": 2.317511713347731e-07, "logits/chosen": -2.6007192134857178, "logits/rejected": -2.519045352935791, "logps/chosen": -0.9955112934112549, "logps/rejected": -2.4560132026672363, "loss": -0.1451, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4444941282272339, "rewards/margins": 0.19739052653312683, "rewards/rejected": 0.24710360169410706, "step": 273 }, { "epoch": 0.5732217573221757, "grad_norm": 16.783212103053867, "learning_rate": 2.2993003982881973e-07, "logits/chosen": -2.2793025970458984, "logits/rejected": -2.195977210998535, "logps/chosen": -1.0133543014526367, "logps/rejected": -2.022432804107666, "loss": -0.1397, "rewards/accuracies": 0.875, "rewards/chosen": 0.38419491052627563, "rewards/margins": 0.14749623835086823, "rewards/rejected": 0.2366986870765686, "step": 274 }, { "epoch": 0.5753138075313807, "grad_norm": 10.232857115492903, "learning_rate": 2.2810997961375938e-07, "logits/chosen": -2.620478630065918, "logits/rejected": -2.368046283721924, "logps/chosen": -0.8385424613952637, "logps/rejected": -1.756354570388794, "loss": -0.1605, "rewards/accuracies": 0.75, "rewards/chosen": 0.4779413044452667, "rewards/margins": 0.16384732723236084, "rewards/rejected": 0.31409400701522827, "step": 275 }, { "epoch": 0.5774058577405857, "grad_norm": 9.441049880789874, "learning_rate": 2.2629108784045436e-07, "logits/chosen": -2.2932982444763184, "logits/rejected": -2.1024169921875, "logps/chosen": -1.0233564376831055, "logps/rejected": -1.8084924221038818, "loss": -0.1577, "rewards/accuracies": 0.8125, "rewards/chosen": 0.39267247915267944, "rewards/margins": 0.17666901648044586, "rewards/rejected": 0.21600347757339478, "step": 276 }, { "epoch": 0.5794979079497908, "grad_norm": 11.23380249432245, "learning_rate": 2.2447346159739768e-07, "logits/chosen": -2.1626408100128174, "logits/rejected": -2.164907455444336, "logps/chosen": -1.130321979522705, "logps/rejected": -1.553159475326538, "loss": -0.0833, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3877987563610077, "rewards/margins": 0.061269715428352356, "rewards/rejected": 0.32652902603149414, "step": 277 }, { "epoch": 0.5815899581589958, "grad_norm": 15.096559124916823, "learning_rate": 2.2265719790553146e-07, "logits/chosen": -2.020289897918701, "logits/rejected": -2.020287036895752, "logps/chosen": -1.4989287853240967, "logps/rejected": -1.5906364917755127, "loss": -0.1001, "rewards/accuracies": 0.5, "rewards/chosen": 0.3539296090602875, "rewards/margins": 0.014707334339618683, "rewards/rejected": 0.3392222821712494, "step": 278 }, { "epoch": 0.5836820083682008, "grad_norm": 10.528790195156935, "learning_rate": 2.2084239371306752e-07, "logits/chosen": -2.0334033966064453, "logits/rejected": -1.881544589996338, "logps/chosen": -0.9108498096466064, "logps/rejected": -1.809361457824707, "loss": -0.0869, "rewards/accuracies": 0.875, "rewards/chosen": 0.4324657917022705, "rewards/margins": 0.1985461413860321, "rewards/rejected": 0.23391962051391602, "step": 279 }, { "epoch": 0.5857740585774058, "grad_norm": 10.4888057073251, "learning_rate": 2.19029145890313e-07, "logits/chosen": -2.1263463497161865, "logits/rejected": -1.9130442142486572, "logps/chosen": -0.9421380758285522, "logps/rejected": -2.6061289310455322, "loss": -0.136, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4234393835067749, "rewards/margins": 0.15696920454502106, "rewards/rejected": 0.26647016406059265, "step": 280 }, { "epoch": 0.5878661087866108, "grad_norm": 10.261955546016459, "learning_rate": 2.172175512244993e-07, "logits/chosen": -1.8044922351837158, "logits/rejected": -1.7952802181243896, "logps/chosen": -1.216686725616455, "logps/rejected": -1.8076926469802856, "loss": -0.1096, "rewards/accuracies": 0.75, "rewards/chosen": 0.35631251335144043, "rewards/margins": 0.12221503257751465, "rewards/rejected": 0.2340974658727646, "step": 281 }, { "epoch": 0.5899581589958159, "grad_norm": 9.452698854031054, "learning_rate": 2.154077064146159e-07, "logits/chosen": -2.2612810134887695, "logits/rejected": -2.135688543319702, "logps/chosen": -1.256422519683838, "logps/rejected": -2.5949811935424805, "loss": -0.1141, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3647221326828003, "rewards/margins": 0.1851029098033905, "rewards/rejected": 0.17961925268173218, "step": 282 }, { "epoch": 0.5920502092050209, "grad_norm": 12.99298258428552, "learning_rate": 2.1359970806624884e-07, "logits/chosen": -2.015012741088867, "logits/rejected": -1.8471283912658691, "logps/chosen": -1.1125614643096924, "logps/rejected": -3.3590145111083984, "loss": -0.1401, "rewards/accuracies": 0.6875, "rewards/chosen": 0.39575979113578796, "rewards/margins": 0.21206815540790558, "rewards/rejected": 0.1836916208267212, "step": 283 }, { "epoch": 0.5941422594142259, "grad_norm": 11.402998787496921, "learning_rate": 2.1179365268642375e-07, "logits/chosen": -1.88407301902771, "logits/rejected": -1.6973862648010254, "logps/chosen": -1.3595623970031738, "logps/rejected": -1.9901232719421387, "loss": -0.1053, "rewards/accuracies": 0.5, "rewards/chosen": 0.3310593366622925, "rewards/margins": 0.025551561266183853, "rewards/rejected": 0.3055077791213989, "step": 284 }, { "epoch": 0.5962343096234309, "grad_norm": 12.544217446992135, "learning_rate": 2.0998963667845536e-07, "logits/chosen": -2.0112407207489014, "logits/rejected": -1.8508641719818115, "logps/chosen": -1.3053799867630005, "logps/rejected": -1.789559006690979, "loss": -0.0668, "rewards/accuracies": 0.5, "rewards/chosen": 0.36560434103012085, "rewards/margins": 0.05719471722841263, "rewards/rejected": 0.3084096312522888, "step": 285 }, { "epoch": 0.5983263598326359, "grad_norm": 9.227582418093855, "learning_rate": 2.0818775633680055e-07, "logits/chosen": -2.041214942932129, "logits/rejected": -1.606358528137207, "logps/chosen": -1.0451130867004395, "logps/rejected": -3.085855484008789, "loss": -0.1153, "rewards/accuracies": 0.75, "rewards/chosen": 0.41002923250198364, "rewards/margins": 0.2596610188484192, "rewards/rejected": 0.15036822855472565, "step": 286 }, { "epoch": 0.600418410041841, "grad_norm": 10.879171338781033, "learning_rate": 2.0638810784191946e-07, "logits/chosen": -1.8780803680419922, "logits/rejected": -1.659158706665039, "logps/chosen": -1.0779826641082764, "logps/rejected": -1.470560073852539, "loss": -0.1425, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4114425778388977, "rewards/margins": 0.05348636955022812, "rewards/rejected": 0.357956200838089, "step": 287 }, { "epoch": 0.602510460251046, "grad_norm": 24.091905733565373, "learning_rate": 2.0459078725514089e-07, "logits/chosen": -1.4315662384033203, "logits/rejected": -1.4450922012329102, "logps/chosen": -1.0725127458572388, "logps/rejected": -1.8540666103363037, "loss": -0.105, "rewards/accuracies": 0.5, "rewards/chosen": 0.4384384751319885, "rewards/margins": 0.028723157942295074, "rewards/rejected": 0.40971535444259644, "step": 288 }, { "epoch": 0.604602510460251, "grad_norm": 7.747661853144717, "learning_rate": 2.027958905135349e-07, "logits/chosen": -1.7538211345672607, "logits/rejected": -1.6950899362564087, "logps/chosen": -1.7596023082733154, "logps/rejected": -2.0811004638671875, "loss": -0.0933, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3173483610153198, "rewards/margins": 0.06009531766176224, "rewards/rejected": 0.2572530210018158, "step": 289 }, { "epoch": 0.606694560669456, "grad_norm": 12.087581194789344, "learning_rate": 2.0100351342479216e-07, "logits/chosen": -1.7584887742996216, "logits/rejected": -1.574345588684082, "logps/chosen": -0.9665372967720032, "logps/rejected": -2.1095662117004395, "loss": -0.1048, "rewards/accuracies": 0.75, "rewards/chosen": 0.4177166223526001, "rewards/margins": 0.2137906551361084, "rewards/rejected": 0.2039259523153305, "step": 290 }, { "epoch": 0.608786610878661, "grad_norm": 8.499022932555569, "learning_rate": 1.9921375166210945e-07, "logits/chosen": -1.7692201137542725, "logits/rejected": -1.7351100444793701, "logps/chosen": -1.3378679752349854, "logps/rejected": -2.4489097595214844, "loss": -0.165, "rewards/accuracies": 0.625, "rewards/chosen": 0.32389402389526367, "rewards/margins": 0.0668807253241539, "rewards/rejected": 0.2570132911205292, "step": 291 }, { "epoch": 0.6108786610878661, "grad_norm": 9.761211893478931, "learning_rate": 1.9742670075908349e-07, "logits/chosen": -1.6519404649734497, "logits/rejected": -1.5875215530395508, "logps/chosen": -1.360038161277771, "logps/rejected": -1.602811336517334, "loss": -0.099, "rewards/accuracies": 0.625, "rewards/chosen": 0.37947362661361694, "rewards/margins": 0.0840110331773758, "rewards/rejected": 0.29546260833740234, "step": 292 }, { "epoch": 0.6129707112970711, "grad_norm": 13.035324813489078, "learning_rate": 1.9564245610461078e-07, "logits/chosen": -1.7272558212280273, "logits/rejected": -1.4798274040222168, "logps/chosen": -1.3200814723968506, "logps/rejected": -1.9456639289855957, "loss": -0.102, "rewards/accuracies": 0.5625, "rewards/chosen": 0.31104105710983276, "rewards/margins": 0.06010560691356659, "rewards/rejected": 0.250935435295105, "step": 293 }, { "epoch": 0.6150627615062761, "grad_norm": 12.373073851835231, "learning_rate": 1.938611129377967e-07, "logits/chosen": -1.4790561199188232, "logits/rejected": -1.4483731985092163, "logps/chosen": -1.1588627099990845, "logps/rejected": -1.8893986940383911, "loss": -0.1182, "rewards/accuracies": 0.625, "rewards/chosen": 0.40000268816947937, "rewards/margins": 0.1388007402420044, "rewards/rejected": 0.261201947927475, "step": 294 }, { "epoch": 0.6171548117154811, "grad_norm": 13.570956779826611, "learning_rate": 1.920827663428714e-07, "logits/chosen": -1.3182231187820435, "logits/rejected": -1.210131287574768, "logps/chosen": -1.8925442695617676, "logps/rejected": -2.5153698921203613, "loss": -0.1257, "rewards/accuracies": 0.625, "rewards/chosen": 0.22584283351898193, "rewards/margins": 0.044571515172719955, "rewards/rejected": 0.18127131462097168, "step": 295 }, { "epoch": 0.6192468619246861, "grad_norm": 12.244728666938812, "learning_rate": 1.9030751124411448e-07, "logits/chosen": -1.202715277671814, "logits/rejected": -1.2500357627868652, "logps/chosen": -1.0252957344055176, "logps/rejected": -2.197416305541992, "loss": -0.1301, "rewards/accuracies": 0.75, "rewards/chosen": 0.4244975447654724, "rewards/margins": 0.19387859106063843, "rewards/rejected": 0.2306189239025116, "step": 296 }, { "epoch": 0.6213389121338913, "grad_norm": 11.318104404213964, "learning_rate": 1.8853544240078836e-07, "logits/chosen": -1.6210575103759766, "logits/rejected": -1.6088294982910156, "logps/chosen": -0.868645966053009, "logps/rejected": -2.105026960372925, "loss": -0.0987, "rewards/accuracies": 0.5625, "rewards/chosen": 0.461618572473526, "rewards/margins": 0.15263152122497559, "rewards/rejected": 0.3089870810508728, "step": 297 }, { "epoch": 0.6234309623430963, "grad_norm": 13.963854313724484, "learning_rate": 1.8676665440207977e-07, "logits/chosen": -1.3747458457946777, "logits/rejected": -1.2494533061981201, "logps/chosen": -1.5411226749420166, "logps/rejected": -2.833916664123535, "loss": -0.1238, "rewards/accuracies": 0.875, "rewards/chosen": 0.30651533603668213, "rewards/margins": 0.11065477132797241, "rewards/rejected": 0.1958606094121933, "step": 298 }, { "epoch": 0.6255230125523012, "grad_norm": 12.924584175076303, "learning_rate": 1.850012416620515e-07, "logits/chosen": -1.06024169921875, "logits/rejected": -1.0010329484939575, "logps/chosen": -1.9282222986221313, "logps/rejected": -2.097105026245117, "loss": -0.1203, "rewards/accuracies": 0.625, "rewards/chosen": 0.2811262607574463, "rewards/margins": 0.0608399361371994, "rewards/rejected": 0.2202863246202469, "step": 299 }, { "epoch": 0.6276150627615062, "grad_norm": 14.77377885396883, "learning_rate": 1.8323929841460178e-07, "logits/chosen": -1.4455621242523193, "logits/rejected": -1.5248115062713623, "logps/chosen": -1.1223115921020508, "logps/rejected": -1.5705149173736572, "loss": -0.1138, "rewards/accuracies": 0.5625, "rewards/chosen": 0.37588104605674744, "rewards/margins": 0.06724551320075989, "rewards/rejected": 0.30863553285598755, "step": 300 }, { "epoch": 0.6297071129707112, "grad_norm": 12.758467910292707, "learning_rate": 1.8148091870843552e-07, "logits/chosen": -1.3640754222869873, "logits/rejected": -1.0591566562652588, "logps/chosen": -1.2672115564346313, "logps/rejected": -2.286756992340088, "loss": -0.1324, "rewards/accuracies": 0.75, "rewards/chosen": 0.44048044085502625, "rewards/margins": 0.191227987408638, "rewards/rejected": 0.24925243854522705, "step": 301 }, { "epoch": 0.6317991631799164, "grad_norm": 12.745982416001658, "learning_rate": 1.7972619640204294e-07, "logits/chosen": -1.4261343479156494, "logits/rejected": -1.451894760131836, "logps/chosen": -1.2100127935409546, "logps/rejected": -1.9714622497558594, "loss": -0.1214, "rewards/accuracies": 0.625, "rewards/chosen": 0.40259698033332825, "rewards/margins": 0.1440141350030899, "rewards/rejected": 0.25858286023139954, "step": 302 }, { "epoch": 0.6338912133891214, "grad_norm": 42.33215506608496, "learning_rate": 1.779752251586906e-07, "logits/chosen": -1.2698376178741455, "logits/rejected": -1.4541172981262207, "logps/chosen": -1.0781916379928589, "logps/rejected": -2.400745153427124, "loss": -0.1352, "rewards/accuracies": 0.5, "rewards/chosen": 0.40851694345474243, "rewards/margins": 0.07796123623847961, "rewards/rejected": 0.3305557072162628, "step": 303 }, { "epoch": 0.6359832635983264, "grad_norm": 11.646923762516986, "learning_rate": 1.7622809844142137e-07, "logits/chosen": -1.804600477218628, "logits/rejected": -1.4111582040786743, "logps/chosen": -1.0599024295806885, "logps/rejected": -1.7829433679580688, "loss": -0.141, "rewards/accuracies": 0.6875, "rewards/chosen": 0.43998903036117554, "rewards/margins": 0.12246303260326385, "rewards/rejected": 0.31752604246139526, "step": 304 }, { "epoch": 0.6380753138075314, "grad_norm": 17.322291060310846, "learning_rate": 1.7448490950806548e-07, "logits/chosen": -0.9987385272979736, "logits/rejected": -1.0168724060058594, "logps/chosen": -1.1187671422958374, "logps/rejected": -1.6882683038711548, "loss": -0.1489, "rewards/accuracies": 0.875, "rewards/chosen": 0.4853442311286926, "rewards/margins": 0.17401915788650513, "rewards/rejected": 0.3113250732421875, "step": 305 }, { "epoch": 0.6401673640167364, "grad_norm": 48.50507886583888, "learning_rate": 1.7274575140626315e-07, "logits/chosen": -1.501349925994873, "logits/rejected": -1.0330767631530762, "logps/chosen": -0.9468430280685425, "logps/rejected": -2.1863956451416016, "loss": -0.1216, "rewards/accuracies": 0.625, "rewards/chosen": 0.4197174608707428, "rewards/margins": 0.1767096370458603, "rewards/rejected": 0.2430078387260437, "step": 306 }, { "epoch": 0.6422594142259415, "grad_norm": 33.49834381108919, "learning_rate": 1.7101071696849718e-07, "logits/chosen": -0.9369056224822998, "logits/rejected": -0.749775767326355, "logps/chosen": -1.204647421836853, "logps/rejected": -2.528010606765747, "loss": -0.1797, "rewards/accuracies": 0.8125, "rewards/chosen": 0.41651594638824463, "rewards/margins": 0.20134611427783966, "rewards/rejected": 0.21516986191272736, "step": 307 }, { "epoch": 0.6443514644351465, "grad_norm": 9.165843117450295, "learning_rate": 1.692798988071385e-07, "logits/chosen": -1.354283094406128, "logits/rejected": -1.3673646450042725, "logps/chosen": -1.1920933723449707, "logps/rejected": -3.002389907836914, "loss": -0.0909, "rewards/accuracies": 0.625, "rewards/chosen": 0.3539876341819763, "rewards/margins": 0.09995920956134796, "rewards/rejected": 0.25402843952178955, "step": 308 }, { "epoch": 0.6464435146443515, "grad_norm": 39.53221497092988, "learning_rate": 1.6755338930950192e-07, "logits/chosen": -1.3984010219573975, "logits/rejected": -1.153122067451477, "logps/chosen": -1.0089532136917114, "logps/rejected": -2.453733444213867, "loss": -0.1695, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4267140030860901, "rewards/margins": 0.13178855180740356, "rewards/rejected": 0.2949254512786865, "step": 309 }, { "epoch": 0.6485355648535565, "grad_norm": 22.69036225057899, "learning_rate": 1.6583128063291573e-07, "logits/chosen": -1.415753960609436, "logits/rejected": -1.2994680404663086, "logps/chosen": -0.9845906496047974, "logps/rejected": -1.4354872703552246, "loss": -0.1213, "rewards/accuracies": 0.625, "rewards/chosen": 0.39985141158103943, "rewards/margins": 0.08858242630958557, "rewards/rejected": 0.31126895546913147, "step": 310 }, { "epoch": 0.6506276150627615, "grad_norm": 11.735736386990032, "learning_rate": 1.6411366469980134e-07, "logits/chosen": -0.8768999576568604, "logits/rejected": -0.7540444135665894, "logps/chosen": -1.6087219715118408, "logps/rejected": -2.7079243659973145, "loss": -0.1349, "rewards/accuracies": 0.625, "rewards/chosen": 0.3981132507324219, "rewards/margins": 0.15352198481559753, "rewards/rejected": 0.24459126591682434, "step": 311 }, { "epoch": 0.6527196652719666, "grad_norm": 36.93233096234923, "learning_rate": 1.6240063319276764e-07, "logits/chosen": -1.090734601020813, "logits/rejected": -0.999484658241272, "logps/chosen": -1.0787606239318848, "logps/rejected": -2.9695515632629395, "loss": -0.1377, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4704767167568207, "rewards/margins": 0.2525039315223694, "rewards/rejected": 0.21797281503677368, "step": 312 }, { "epoch": 0.6548117154811716, "grad_norm": 12.512648020048866, "learning_rate": 1.606922775497168e-07, "logits/chosen": -1.3213591575622559, "logits/rejected": -0.8296306133270264, "logps/chosen": -1.1784396171569824, "logps/rejected": -2.6272573471069336, "loss": -0.134, "rewards/accuracies": 0.75, "rewards/chosen": 0.33794206380844116, "rewards/margins": 0.11368609964847565, "rewards/rejected": 0.22425594925880432, "step": 313 }, { "epoch": 0.6569037656903766, "grad_norm": 33.98215330381615, "learning_rate": 1.5898868895896332e-07, "logits/chosen": -1.473046898841858, "logits/rejected": -0.959273099899292, "logps/chosen": -1.1853554248809814, "logps/rejected": -2.4030003547668457, "loss": -0.1059, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4078238606452942, "rewards/margins": 0.12516064941883087, "rewards/rejected": 0.2826631963253021, "step": 314 }, { "epoch": 0.6589958158995816, "grad_norm": 11.777819227549552, "learning_rate": 1.572899583543671e-07, "logits/chosen": -1.2090966701507568, "logits/rejected": -0.9002936482429504, "logps/chosen": -1.2504746913909912, "logps/rejected": -3.4912562370300293, "loss": -0.1467, "rewards/accuracies": 0.75, "rewards/chosen": 0.3819003105163574, "rewards/margins": 0.18813729286193848, "rewards/rejected": 0.19376301765441895, "step": 315 }, { "epoch": 0.6610878661087866, "grad_norm": 13.64501155485151, "learning_rate": 1.5559617641047885e-07, "logits/chosen": -1.0406343936920166, "logits/rejected": -0.9147306680679321, "logps/chosen": -1.650768518447876, "logps/rejected": -2.1240224838256836, "loss": -0.129, "rewards/accuracies": 0.625, "rewards/chosen": 0.2685588300228119, "rewards/margins": 0.01526784710586071, "rewards/rejected": 0.25329098105430603, "step": 316 }, { "epoch": 0.6631799163179917, "grad_norm": 17.681504166772957, "learning_rate": 1.5390743353770108e-07, "logits/chosen": -0.7469021081924438, "logits/rejected": -0.7831078767776489, "logps/chosen": -1.285473346710205, "logps/rejected": -1.6377302408218384, "loss": -0.0996, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4055963158607483, "rewards/margins": 0.03280503302812576, "rewards/rejected": 0.3727912902832031, "step": 317 }, { "epoch": 0.6652719665271967, "grad_norm": 23.101940308036763, "learning_rate": 1.5222381987746102e-07, "logits/chosen": -0.8882025480270386, "logits/rejected": -0.9792768955230713, "logps/chosen": -1.6120126247406006, "logps/rejected": -2.5856385231018066, "loss": -0.1461, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3878062963485718, "rewards/margins": 0.1964162290096283, "rewards/rejected": 0.1913900524377823, "step": 318 }, { "epoch": 0.6673640167364017, "grad_norm": 32.62994604510173, "learning_rate": 1.5054542529740008e-07, "logits/chosen": -1.0213111639022827, "logits/rejected": -0.7104781866073608, "logps/chosen": -1.222242832183838, "logps/rejected": -3.8115944862365723, "loss": -0.1556, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3644482493400574, "rewards/margins": 0.15496206283569336, "rewards/rejected": 0.209486186504364, "step": 319 }, { "epoch": 0.6694560669456067, "grad_norm": 13.051692014015307, "learning_rate": 1.488723393865766e-07, "logits/chosen": -1.4067423343658447, "logits/rejected": -1.2527976036071777, "logps/chosen": -1.1540534496307373, "logps/rejected": -2.087472438812256, "loss": -0.1364, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5028185844421387, "rewards/margins": 0.1457022875547409, "rewards/rejected": 0.3571162819862366, "step": 320 }, { "epoch": 0.6715481171548117, "grad_norm": 24.77692590150978, "learning_rate": 1.472046514506832e-07, "logits/chosen": -1.0069209337234497, "logits/rejected": -0.8981058597564697, "logps/chosen": -1.629122018814087, "logps/rejected": -1.7869584560394287, "loss": -0.0875, "rewards/accuracies": 0.4375, "rewards/chosen": 0.31096604466438293, "rewards/margins": 0.06203940510749817, "rewards/rejected": 0.24892663955688477, "step": 321 }, { "epoch": 0.6736401673640168, "grad_norm": 17.05484586075126, "learning_rate": 1.4554245050728084e-07, "logits/chosen": -1.1591291427612305, "logits/rejected": -0.8904301524162292, "logps/chosen": -1.1888909339904785, "logps/rejected": -2.005044937133789, "loss": -0.1115, "rewards/accuracies": 0.625, "rewards/chosen": 0.405101478099823, "rewards/margins": 0.13060510158538818, "rewards/rejected": 0.2744963467121124, "step": 322 }, { "epoch": 0.6757322175732218, "grad_norm": 11.322008720616964, "learning_rate": 1.4388582528104627e-07, "logits/chosen": -0.8311055302619934, "logits/rejected": -0.7126474380493164, "logps/chosen": -1.374341607093811, "logps/rejected": -2.404649019241333, "loss": -0.0939, "rewards/accuracies": 0.75, "rewards/chosen": 0.37304264307022095, "rewards/margins": 0.05022872984409332, "rewards/rejected": 0.3228139281272888, "step": 323 }, { "epoch": 0.6778242677824268, "grad_norm": 14.488311265640249, "learning_rate": 1.422348641990369e-07, "logits/chosen": -1.0909366607666016, "logits/rejected": -0.9740039706230164, "logps/chosen": -0.7895447015762329, "logps/rejected": -2.2779781818389893, "loss": -0.1645, "rewards/accuracies": 0.8125, "rewards/chosen": 0.49242639541625977, "rewards/margins": 0.2595653533935547, "rewards/rejected": 0.23286104202270508, "step": 324 }, { "epoch": 0.6799163179916318, "grad_norm": 9.655770397929885, "learning_rate": 1.4058965538597032e-07, "logits/chosen": -1.1042437553405762, "logits/rejected": -1.1286996603012085, "logps/chosen": -1.1713753938674927, "logps/rejected": -1.9938318729400635, "loss": -0.0939, "rewards/accuracies": 0.75, "rewards/chosen": 0.38073980808258057, "rewards/margins": 0.1361331045627594, "rewards/rejected": 0.24460668861865997, "step": 325 }, { "epoch": 0.6820083682008368, "grad_norm": 13.037016765822761, "learning_rate": 1.3895028665952057e-07, "logits/chosen": -0.8333834409713745, "logits/rejected": -0.8366643190383911, "logps/chosen": -1.1039419174194336, "logps/rejected": -2.8780007362365723, "loss": -0.1369, "rewards/accuracies": 0.75, "rewards/chosen": 0.41021984815597534, "rewards/margins": 0.18466591835021973, "rewards/rejected": 0.22555390000343323, "step": 326 }, { "epoch": 0.6841004184100419, "grad_norm": 14.265017443250814, "learning_rate": 1.3731684552563027e-07, "logits/chosen": -0.5523157119750977, "logits/rejected": -0.542705774307251, "logps/chosen": -0.7236326336860657, "logps/rejected": -1.5846632719039917, "loss": -0.1217, "rewards/accuracies": 0.75, "rewards/chosen": 0.5142608880996704, "rewards/margins": 0.1967753767967224, "rewards/rejected": 0.317485511302948, "step": 327 }, { "epoch": 0.6861924686192469, "grad_norm": 10.0387668669992, "learning_rate": 1.3568941917384036e-07, "logits/chosen": -0.8576334118843079, "logits/rejected": -0.7204816937446594, "logps/chosen": -1.0021593570709229, "logps/rejected": -2.57088565826416, "loss": -0.1258, "rewards/accuracies": 0.6875, "rewards/chosen": 0.42235422134399414, "rewards/margins": 0.15459594130516052, "rewards/rejected": 0.26775825023651123, "step": 328 }, { "epoch": 0.6882845188284519, "grad_norm": 10.581183347813186, "learning_rate": 1.3406809447263568e-07, "logits/chosen": -1.0595145225524902, "logits/rejected": -0.8824669122695923, "logps/chosen": -1.6525616645812988, "logps/rejected": -2.7960574626922607, "loss": -0.0999, "rewards/accuracies": 0.625, "rewards/chosen": 0.24829578399658203, "rewards/margins": 0.045187197625637054, "rewards/rejected": 0.20310857892036438, "step": 329 }, { "epoch": 0.6903765690376569, "grad_norm": 17.448093710233614, "learning_rate": 1.3245295796480788e-07, "logits/chosen": -0.8543559908866882, "logits/rejected": -0.49344658851623535, "logps/chosen": -1.0859427452087402, "logps/rejected": -3.5764997005462646, "loss": -0.1058, "rewards/accuracies": 0.875, "rewards/chosen": 0.41744154691696167, "rewards/margins": 0.21425436437129974, "rewards/rejected": 0.20318715274333954, "step": 330 }, { "epoch": 0.6924686192468619, "grad_norm": 16.730200153254856, "learning_rate": 1.3084409586283694e-07, "logits/chosen": -1.2231147289276123, "logits/rejected": -0.7146123051643372, "logps/chosen": -2.151677131652832, "logps/rejected": -2.705674409866333, "loss": -0.106, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3055073022842407, "rewards/margins": 0.035416390746831894, "rewards/rejected": 0.27009090781211853, "step": 331 }, { "epoch": 0.694560669456067, "grad_norm": 13.156926153571998, "learning_rate": 1.2924159404428801e-07, "logits/chosen": -0.7178107500076294, "logits/rejected": -0.36026304960250854, "logps/chosen": -1.1973762512207031, "logps/rejected": -3.018155574798584, "loss": -0.1277, "rewards/accuracies": 0.6875, "rewards/chosen": 0.367809534072876, "rewards/margins": 0.16245080530643463, "rewards/rejected": 0.20535872876644135, "step": 332 }, { "epoch": 0.696652719665272, "grad_norm": 14.96194258618006, "learning_rate": 1.2764553804722867e-07, "logits/chosen": -0.6460230946540833, "logits/rejected": -0.623856782913208, "logps/chosen": -1.4669647216796875, "logps/rejected": -1.899721622467041, "loss": -0.0925, "rewards/accuracies": 0.6875, "rewards/chosen": 0.41646426916122437, "rewards/margins": 0.09561848640441895, "rewards/rejected": 0.3208458125591278, "step": 333 }, { "epoch": 0.698744769874477, "grad_norm": 10.920052001046809, "learning_rate": 1.2605601306566204e-07, "logits/chosen": -1.2113707065582275, "logits/rejected": -0.9189119338989258, "logps/chosen": -1.2759509086608887, "logps/rejected": -2.9539761543273926, "loss": -0.1303, "rewards/accuracies": 0.75, "rewards/chosen": 0.3761143088340759, "rewards/margins": 0.11043473333120346, "rewards/rejected": 0.26567959785461426, "step": 334 }, { "epoch": 0.700836820083682, "grad_norm": 12.026438516396775, "learning_rate": 1.2447310394498017e-07, "logits/chosen": -0.3498826324939728, "logits/rejected": -0.2449064552783966, "logps/chosen": -1.938281536102295, "logps/rejected": -2.0428366661071777, "loss": -0.1389, "rewards/accuracies": 0.75, "rewards/chosen": 0.3499003052711487, "rewards/margins": 0.08844777196645737, "rewards/rejected": 0.2614525258541107, "step": 335 }, { "epoch": 0.702928870292887, "grad_norm": 10.793477695654982, "learning_rate": 1.2289689517743472e-07, "logits/chosen": -0.5094469785690308, "logits/rejected": -0.2816145718097687, "logps/chosen": -1.3401274681091309, "logps/rejected": -2.9226505756378174, "loss": -0.1241, "rewards/accuracies": 0.75, "rewards/chosen": 0.37236952781677246, "rewards/margins": 0.20558218657970428, "rewards/rejected": 0.16678734123706818, "step": 336 }, { "epoch": 0.7050209205020921, "grad_norm": 10.41218722859423, "learning_rate": 1.213274708976271e-07, "logits/chosen": -0.6510695815086365, "logits/rejected": -0.739233136177063, "logps/chosen": -1.454422950744629, "logps/rejected": -2.9711012840270996, "loss": -0.1458, "rewards/accuracies": 0.8125, "rewards/chosen": 0.29552698135375977, "rewards/margins": 0.11168254911899567, "rewards/rejected": 0.1838444173336029, "step": 337 }, { "epoch": 0.7071129707112971, "grad_norm": 17.793530880939215, "learning_rate": 1.1976491487801746e-07, "logits/chosen": -0.9943718910217285, "logits/rejected": -0.8745123147964478, "logps/chosen": -1.1566295623779297, "logps/rejected": -3.3676934242248535, "loss": -0.1562, "rewards/accuracies": 0.625, "rewards/chosen": 0.38487157225608826, "rewards/margins": 0.10210902243852615, "rewards/rejected": 0.2827625572681427, "step": 338 }, { "epoch": 0.7092050209205021, "grad_norm": 16.609393373696328, "learning_rate": 1.1820931052445297e-07, "logits/chosen": -0.9806454181671143, "logits/rejected": -0.5159525871276855, "logps/chosen": -1.153395414352417, "logps/rejected": -3.263624429702759, "loss": -0.1205, "rewards/accuracies": 0.75, "rewards/chosen": 0.3399513363838196, "rewards/margins": 0.1355111300945282, "rewards/rejected": 0.204440176486969, "step": 339 }, { "epoch": 0.7112970711297071, "grad_norm": 18.60139351823708, "learning_rate": 1.1666074087171627e-07, "logits/chosen": -0.8198652267456055, "logits/rejected": -0.4478958547115326, "logps/chosen": -1.401387333869934, "logps/rejected": -3.135948657989502, "loss": -0.1524, "rewards/accuracies": 0.625, "rewards/chosen": 0.35824739933013916, "rewards/margins": 0.18047404289245605, "rewards/rejected": 0.1777733713388443, "step": 340 }, { "epoch": 0.7133891213389121, "grad_norm": 11.26052872387053, "learning_rate": 1.1511928857909264e-07, "logits/chosen": -0.6239542961120605, "logits/rejected": -0.25442153215408325, "logps/chosen": -1.1070218086242676, "logps/rejected": -2.229344367980957, "loss": -0.0917, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3640150725841522, "rewards/margins": 0.13586899638175964, "rewards/rejected": 0.22814607620239258, "step": 341 }, { "epoch": 0.7154811715481172, "grad_norm": 13.071141355219329, "learning_rate": 1.1358503592595837e-07, "logits/chosen": -0.30327486991882324, "logits/rejected": -0.11083739995956421, "logps/chosen": -2.030562400817871, "logps/rejected": -3.745598793029785, "loss": -0.1459, "rewards/accuracies": 0.8125, "rewards/chosen": 0.30320000648498535, "rewards/margins": 0.10328814387321472, "rewards/rejected": 0.19991187751293182, "step": 342 }, { "epoch": 0.7175732217573222, "grad_norm": 35.84080060882081, "learning_rate": 1.120580648073885e-07, "logits/chosen": -0.28081023693084717, "logits/rejected": 0.07680314779281616, "logps/chosen": -1.5106937885284424, "logps/rejected": -2.175105094909668, "loss": -0.1121, "rewards/accuracies": 0.6875, "rewards/chosen": 0.38606128096580505, "rewards/margins": 0.07659916579723358, "rewards/rejected": 0.30946213006973267, "step": 343 }, { "epoch": 0.7196652719665272, "grad_norm": 12.317160716663446, "learning_rate": 1.1053845672978565e-07, "logits/chosen": -1.054025411605835, "logits/rejected": -0.6850357055664062, "logps/chosen": -1.185429334640503, "logps/rejected": -2.5092053413391113, "loss": -0.1204, "rewards/accuracies": 0.75, "rewards/chosen": 0.38027000427246094, "rewards/margins": 0.06081349402666092, "rewards/rejected": 0.3194565176963806, "step": 344 }, { "epoch": 0.7217573221757322, "grad_norm": 16.46100980347196, "learning_rate": 1.090262928065293e-07, "logits/chosen": -1.0338250398635864, "logits/rejected": -0.9750658273696899, "logps/chosen": -0.9890057444572449, "logps/rejected": -1.7097139358520508, "loss": -0.108, "rewards/accuracies": 0.5625, "rewards/chosen": 0.42490583658218384, "rewards/margins": 0.10276540368795395, "rewards/rejected": 0.3221404552459717, "step": 345 }, { "epoch": 0.7238493723849372, "grad_norm": 19.269194310225696, "learning_rate": 1.0752165375364591e-07, "logits/chosen": 0.17747777700424194, "logits/rejected": 0.3185412287712097, "logps/chosen": -1.8695478439331055, "logps/rejected": -2.6686084270477295, "loss": -0.1522, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3071568012237549, "rewards/margins": 0.05259630084037781, "rewards/rejected": 0.25456053018569946, "step": 346 }, { "epoch": 0.7259414225941423, "grad_norm": 13.005216285581936, "learning_rate": 1.060246198855011e-07, "logits/chosen": -0.6884080171585083, "logits/rejected": 0.031216230243444443, "logps/chosen": -1.4426461458206177, "logps/rejected": -2.789125919342041, "loss": -0.0979, "rewards/accuracies": 0.75, "rewards/chosen": 0.36756807565689087, "rewards/margins": 0.15506842732429504, "rewards/rejected": 0.21249966323375702, "step": 347 }, { "epoch": 0.7280334728033473, "grad_norm": 22.121610737491377, "learning_rate": 1.0453527111051183e-07, "logits/chosen": -1.3621861934661865, "logits/rejected": -0.784406304359436, "logps/chosen": -1.073019027709961, "logps/rejected": -3.2600555419921875, "loss": -0.1192, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4461975693702698, "rewards/margins": 0.14909616112709045, "rewards/rejected": 0.2971014380455017, "step": 348 }, { "epoch": 0.7301255230125523, "grad_norm": 10.880303503739961, "learning_rate": 1.0305368692688174e-07, "logits/chosen": -0.3833010792732239, "logits/rejected": -0.0003327280282974243, "logps/chosen": -2.284611225128174, "logps/rejected": -2.494032144546509, "loss": -0.1503, "rewards/accuracies": 0.5625, "rewards/chosen": 0.31178295612335205, "rewards/margins": 0.11959420889616013, "rewards/rejected": 0.19218875467777252, "step": 349 }, { "epoch": 0.7322175732217573, "grad_norm": 17.100515864214906, "learning_rate": 1.0157994641835734e-07, "logits/chosen": -0.2467648983001709, "logits/rejected": -0.18056590855121613, "logps/chosen": -2.0999393463134766, "logps/rejected": -2.614711046218872, "loss": -0.1453, "rewards/accuracies": 0.5625, "rewards/chosen": 0.37043696641921997, "rewards/margins": 0.08482887595891953, "rewards/rejected": 0.28560805320739746, "step": 350 }, { "epoch": 0.7343096234309623, "grad_norm": 28.629645757692103, "learning_rate": 1.0011412825000693e-07, "logits/chosen": -1.1214914321899414, "logits/rejected": -0.7452467083930969, "logps/chosen": -1.2375651597976685, "logps/rejected": -2.4114208221435547, "loss": -0.0998, "rewards/accuracies": 0.625, "rewards/chosen": 0.3861534595489502, "rewards/margins": 0.0308099165558815, "rewards/rejected": 0.3553435802459717, "step": 351 }, { "epoch": 0.7364016736401674, "grad_norm": 12.024191314088965, "learning_rate": 9.865631066402136e-08, "logits/chosen": -0.7538611888885498, "logits/rejected": 0.19186115264892578, "logps/chosen": -1.4035093784332275, "logps/rejected": -4.363006591796875, "loss": -0.1394, "rewards/accuracies": 0.875, "rewards/chosen": 0.3268209993839264, "rewards/margins": 0.18925338983535767, "rewards/rejected": 0.13756762444972992, "step": 352 }, { "epoch": 0.7384937238493724, "grad_norm": 15.314163055416728, "learning_rate": 9.720657147553767e-08, "logits/chosen": -0.4137316942214966, "logits/rejected": -0.34769952297210693, "logps/chosen": -2.0995492935180664, "logps/rejected": -2.4585986137390137, "loss": -0.093, "rewards/accuracies": 0.5, "rewards/chosen": 0.28392699360847473, "rewards/margins": 0.05607965961098671, "rewards/rejected": 0.22784735262393951, "step": 353 }, { "epoch": 0.7405857740585774, "grad_norm": 16.82859831332979, "learning_rate": 9.57649880684859e-08, "logits/chosen": -0.16023807227611542, "logits/rejected": -0.07635152339935303, "logps/chosen": -1.9627785682678223, "logps/rejected": -1.6600819826126099, "loss": -0.1113, "rewards/accuracies": 0.5625, "rewards/chosen": 0.337715744972229, "rewards/margins": -0.00830451026558876, "rewards/rejected": 0.3460202217102051, "step": 354 }, { "epoch": 0.7426778242677824, "grad_norm": 17.97313380627762, "learning_rate": 9.433163739145771e-08, "logits/chosen": -0.5657111406326294, "logits/rejected": -0.043357543647289276, "logps/chosen": -1.0140752792358398, "logps/rejected": -2.526522636413574, "loss": -0.1323, "rewards/accuracies": 0.75, "rewards/chosen": 0.40150293707847595, "rewards/margins": 0.20014238357543945, "rewards/rejected": 0.2013605535030365, "step": 355 }, { "epoch": 0.7447698744769874, "grad_norm": 21.86574720605052, "learning_rate": 9.290659595360017e-08, "logits/chosen": -0.07870801538228989, "logits/rejected": -0.09383751451969147, "logps/chosen": -1.0062921047210693, "logps/rejected": -2.758099317550659, "loss": -0.1152, "rewards/accuracies": 0.9375, "rewards/chosen": 0.45154866576194763, "rewards/margins": 0.24462944269180298, "rewards/rejected": 0.20691920816898346, "step": 356 }, { "epoch": 0.7468619246861925, "grad_norm": 11.516028470925384, "learning_rate": 9.148993982053058e-08, "logits/chosen": -0.04537372291088104, "logits/rejected": 0.28835880756378174, "logps/chosen": -1.351635217666626, "logps/rejected": -2.764091968536377, "loss": -0.1137, "rewards/accuracies": 0.6875, "rewards/chosen": 0.37033578753471375, "rewards/margins": 0.13014227151870728, "rewards/rejected": 0.24019351601600647, "step": 357 }, { "epoch": 0.7489539748953975, "grad_norm": 17.569715002068367, "learning_rate": 9.008174461027723e-08, "logits/chosen": -0.7079002261161804, "logits/rejected": -0.3616589903831482, "logps/chosen": -1.2153748273849487, "logps/rejected": -2.612088680267334, "loss": -0.1094, "rewards/accuracies": 0.625, "rewards/chosen": 0.38530054688453674, "rewards/margins": 0.10429896414279938, "rewards/rejected": 0.28100159764289856, "step": 358 }, { "epoch": 0.7510460251046025, "grad_norm": 11.265866090520127, "learning_rate": 8.868208548924253e-08, "logits/chosen": -0.3887017071247101, "logits/rejected": -0.23636029660701752, "logps/chosen": -0.9961744546890259, "logps/rejected": -2.2566308975219727, "loss": -0.1156, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4412422776222229, "rewards/margins": 0.13439972698688507, "rewards/rejected": 0.30684253573417664, "step": 359 }, { "epoch": 0.7531380753138075, "grad_norm": 17.208413235507344, "learning_rate": 8.729103716819111e-08, "logits/chosen": -0.8207125663757324, "logits/rejected": -0.43636664748191833, "logps/chosen": -2.236236095428467, "logps/rejected": -2.8618106842041016, "loss": -0.1482, "rewards/accuracies": 0.75, "rewards/chosen": 0.3289279341697693, "rewards/margins": 0.11165976524353027, "rewards/rejected": 0.2172681838274002, "step": 360 }, { "epoch": 0.7552301255230126, "grad_norm": 10.315633401806636, "learning_rate": 8.590867389826179e-08, "logits/chosen": -0.05628104507923126, "logits/rejected": 0.12740841507911682, "logps/chosen": -0.8035219311714172, "logps/rejected": -2.464738368988037, "loss": -0.1833, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5340550541877747, "rewards/margins": 0.24898886680603027, "rewards/rejected": 0.2850662171840668, "step": 361 }, { "epoch": 0.7573221757322176, "grad_norm": 44.322778195778604, "learning_rate": 8.453506946700417e-08, "logits/chosen": -0.7813249826431274, "logits/rejected": -0.02990594506263733, "logps/chosen": -0.9563344120979309, "logps/rejected": -3.9870352745056152, "loss": -0.143, "rewards/accuracies": 0.75, "rewards/chosen": 0.42556053400039673, "rewards/margins": 0.20505291223526, "rewards/rejected": 0.2205076366662979, "step": 362 }, { "epoch": 0.7594142259414226, "grad_norm": 12.418318943630904, "learning_rate": 8.317029719444016e-08, "logits/chosen": -0.534734845161438, "logits/rejected": 0.015497885644435883, "logps/chosen": -1.012712001800537, "logps/rejected": -2.580301284790039, "loss": -0.1126, "rewards/accuracies": 0.8125, "rewards/chosen": 0.41058653593063354, "rewards/margins": 0.1472766250371933, "rewards/rejected": 0.26330992579460144, "step": 363 }, { "epoch": 0.7615062761506276, "grad_norm": 14.003047066565102, "learning_rate": 8.181442992915e-08, "logits/chosen": -0.52370285987854, "logits/rejected": 0.1399349719285965, "logps/chosen": -0.8610312938690186, "logps/rejected": -3.0481395721435547, "loss": -0.1402, "rewards/accuracies": 0.9375, "rewards/chosen": 0.4761735200881958, "rewards/margins": 0.25835907459259033, "rewards/rejected": 0.21781444549560547, "step": 364 }, { "epoch": 0.7635983263598326, "grad_norm": 27.853135912779752, "learning_rate": 8.046754004438428e-08, "logits/chosen": -0.38113510608673096, "logits/rejected": -0.05555605888366699, "logps/chosen": -0.7260129451751709, "logps/rejected": -1.8457127809524536, "loss": -0.1276, "rewards/accuracies": 0.8125, "rewards/chosen": 0.5552431344985962, "rewards/margins": 0.20019498467445374, "rewards/rejected": 0.35504812002182007, "step": 365 }, { "epoch": 0.7656903765690377, "grad_norm": 14.711785420166494, "learning_rate": 7.912969943420017e-08, "logits/chosen": -1.2947417497634888, "logits/rejected": -0.7500016689300537, "logps/chosen": -1.219745397567749, "logps/rejected": -1.6371362209320068, "loss": -0.1534, "rewards/accuracies": 0.625, "rewards/chosen": 0.36199867725372314, "rewards/margins": 0.12112035602331161, "rewards/rejected": 0.24087829887866974, "step": 366 }, { "epoch": 0.7677824267782427, "grad_norm": 11.746982917584226, "learning_rate": 7.780097950962447e-08, "logits/chosen": -0.529086172580719, "logits/rejected": -0.30336976051330566, "logps/chosen": -1.449676513671875, "logps/rejected": -1.9631965160369873, "loss": -0.148, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3482644557952881, "rewards/margins": 0.08891519904136658, "rewards/rejected": 0.2593492269515991, "step": 367 }, { "epoch": 0.7698744769874477, "grad_norm": 18.018124250700236, "learning_rate": 7.648145119484151e-08, "logits/chosen": -0.7858597040176392, "logits/rejected": -0.34618625044822693, "logps/chosen": -1.6457475423812866, "logps/rejected": -2.3563523292541504, "loss": -0.1509, "rewards/accuracies": 0.625, "rewards/chosen": 0.37516093254089355, "rewards/margins": 0.11906899511814117, "rewards/rejected": 0.2560918927192688, "step": 368 }, { "epoch": 0.7719665271966527, "grad_norm": 14.496994233180718, "learning_rate": 7.517118492340748e-08, "logits/chosen": -0.7869740128517151, "logits/rejected": -0.7773195505142212, "logps/chosen": -1.2835150957107544, "logps/rejected": -2.575712203979492, "loss": -0.0977, "rewards/accuracies": 0.625, "rewards/chosen": 0.31397247314453125, "rewards/margins": 0.12309419363737106, "rewards/rejected": 0.19087830185890198, "step": 369 }, { "epoch": 0.7740585774058577, "grad_norm": 25.21651844990452, "learning_rate": 7.387025063449081e-08, "logits/chosen": 0.0035008257254958153, "logits/rejected": 0.4158563017845154, "logps/chosen": -1.4392738342285156, "logps/rejected": -3.455766439437866, "loss": -0.1568, "rewards/accuracies": 0.75, "rewards/chosen": 0.34259992837905884, "rewards/margins": 0.16605782508850098, "rewards/rejected": 0.17654210329055786, "step": 370 }, { "epoch": 0.7761506276150628, "grad_norm": 12.841556705216126, "learning_rate": 7.257871776913879e-08, "logits/chosen": -1.1575801372528076, "logits/rejected": -0.5147409439086914, "logps/chosen": -0.9548391699790955, "logps/rejected": -3.196890115737915, "loss": -0.1368, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4133971333503723, "rewards/margins": 0.14503958821296692, "rewards/rejected": 0.2683575451374054, "step": 371 }, { "epoch": 0.7782426778242678, "grad_norm": 14.050240424328877, "learning_rate": 7.129665526657145e-08, "logits/chosen": -0.8727412819862366, "logits/rejected": -0.41230928897857666, "logps/chosen": -1.1312958002090454, "logps/rejected": -2.5632314682006836, "loss": -0.1652, "rewards/accuracies": 0.75, "rewards/chosen": 0.39264926314353943, "rewards/margins": 0.13636496663093567, "rewards/rejected": 0.25628429651260376, "step": 372 }, { "epoch": 0.7803347280334728, "grad_norm": 13.116388311344872, "learning_rate": 7.002413156050108e-08, "logits/chosen": -0.6466270685195923, "logits/rejected": 0.143938809633255, "logps/chosen": -1.2247765064239502, "logps/rejected": -2.1847689151763916, "loss": -0.1197, "rewards/accuracies": 0.75, "rewards/chosen": 0.3555133640766144, "rewards/margins": 0.09387169033288956, "rewards/rejected": 0.2616417109966278, "step": 373 }, { "epoch": 0.7824267782426778, "grad_norm": 23.736331901716355, "learning_rate": 6.876121457547995e-08, "logits/chosen": -1.3204679489135742, "logits/rejected": -1.0910612344741821, "logps/chosen": -0.9571521282196045, "logps/rejected": -2.0240230560302734, "loss": -0.1045, "rewards/accuracies": 0.75, "rewards/chosen": 0.4545818269252777, "rewards/margins": 0.09085176885128021, "rewards/rejected": 0.3637300729751587, "step": 374 }, { "epoch": 0.7845188284518828, "grad_norm": 24.818348846229174, "learning_rate": 6.75079717232744e-08, "logits/chosen": -0.06902044266462326, "logits/rejected": 0.6411824226379395, "logps/chosen": -1.6761717796325684, "logps/rejected": -2.5190796852111816, "loss": -0.0937, "rewards/accuracies": 0.625, "rewards/chosen": 0.28744983673095703, "rewards/margins": -0.014971615746617317, "rewards/rejected": 0.3024214804172516, "step": 375 }, { "epoch": 0.7866108786610879, "grad_norm": 16.412707062669462, "learning_rate": 6.626446989926652e-08, "logits/chosen": -0.7313116192817688, "logits/rejected": -0.14149028062820435, "logps/chosen": -1.0154411792755127, "logps/rejected": -2.468728542327881, "loss": -0.1287, "rewards/accuracies": 0.75, "rewards/chosen": 0.39400801062583923, "rewards/margins": 0.13069072365760803, "rewards/rejected": 0.2633172869682312, "step": 376 }, { "epoch": 0.7887029288702929, "grad_norm": 12.947593105914633, "learning_rate": 6.503077547888352e-08, "logits/chosen": -0.3051021099090576, "logits/rejected": -0.031502705067396164, "logps/chosen": -1.4966380596160889, "logps/rejected": -1.756742000579834, "loss": -0.1317, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2928207516670227, "rewards/margins": 0.045941371470689774, "rewards/rejected": 0.24687938392162323, "step": 377 }, { "epoch": 0.7907949790794979, "grad_norm": 22.150989160775218, "learning_rate": 6.380695431405453e-08, "logits/chosen": -0.8521575331687927, "logits/rejected": -0.49384939670562744, "logps/chosen": -0.8695181012153625, "logps/rejected": -2.132112979888916, "loss": -0.0904, "rewards/accuracies": 0.5625, "rewards/chosen": 0.45235487818717957, "rewards/margins": 0.09790237247943878, "rewards/rejected": 0.354452520608902, "step": 378 }, { "epoch": 0.7928870292887029, "grad_norm": 17.656613830598737, "learning_rate": 6.259307172969606e-08, "logits/chosen": -0.6174445152282715, "logits/rejected": 0.07970046997070312, "logps/chosen": -1.4945907592773438, "logps/rejected": -3.1515700817108154, "loss": -0.1211, "rewards/accuracies": 0.5625, "rewards/chosen": 0.357840359210968, "rewards/margins": 0.05095478519797325, "rewards/rejected": 0.30688557028770447, "step": 379 }, { "epoch": 0.7949790794979079, "grad_norm": 15.194872161839614, "learning_rate": 6.138919252022435e-08, "logits/chosen": -0.35832735896110535, "logits/rejected": -0.3035891056060791, "logps/chosen": -1.24945867061615, "logps/rejected": -1.789233684539795, "loss": -0.0939, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3697297275066376, "rewards/margins": 0.10971508920192719, "rewards/rejected": 0.2600146532058716, "step": 380 }, { "epoch": 0.797071129707113, "grad_norm": 19.862030211955297, "learning_rate": 6.019538094609759e-08, "logits/chosen": -0.2610815465450287, "logits/rejected": -0.4965687394142151, "logps/chosen": -0.9594101309776306, "logps/rejected": -1.6051406860351562, "loss": -0.1072, "rewards/accuracies": 0.625, "rewards/chosen": 0.4468555748462677, "rewards/margins": 0.13224181532859802, "rewards/rejected": 0.3146137595176697, "step": 381 }, { "epoch": 0.799163179916318, "grad_norm": 16.771708981733756, "learning_rate": 5.9011700730385224e-08, "logits/chosen": -0.19101418554782867, "logits/rejected": -0.03149036690592766, "logps/chosen": -1.246549129486084, "logps/rejected": -3.112955331802368, "loss": -0.1322, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3705316185951233, "rewards/margins": 0.0991310402750969, "rewards/rejected": 0.2714005708694458, "step": 382 }, { "epoch": 0.801255230125523, "grad_norm": 38.34589726312402, "learning_rate": 5.7838215055366954e-08, "logits/chosen": -1.1628611087799072, "logits/rejected": -0.803926408290863, "logps/chosen": -1.4910624027252197, "logps/rejected": -1.700842261314392, "loss": -0.1221, "rewards/accuracies": 0.5, "rewards/chosen": 0.3370550870895386, "rewards/margins": 0.01984988898038864, "rewards/rejected": 0.31720519065856934, "step": 383 }, { "epoch": 0.803347280334728, "grad_norm": 15.644651216768494, "learning_rate": 5.6674986559160004e-08, "logits/chosen": -0.9479022026062012, "logits/rejected": -0.10572785139083862, "logps/chosen": -0.906363844871521, "logps/rejected": -2.1339168548583984, "loss": -0.1254, "rewards/accuracies": 0.875, "rewards/chosen": 0.4436798393726349, "rewards/margins": 0.23987531661987305, "rewards/rejected": 0.20380452275276184, "step": 384 }, { "epoch": 0.805439330543933, "grad_norm": 19.05289765194646, "learning_rate": 5.552207733237543e-08, "logits/chosen": -0.6024627685546875, "logits/rejected": -0.5707870721817017, "logps/chosen": -0.8243768215179443, "logps/rejected": -1.8555145263671875, "loss": -0.1289, "rewards/accuracies": 0.625, "rewards/chosen": 0.4963781237602234, "rewards/margins": 0.2237432599067688, "rewards/rejected": 0.2726348638534546, "step": 385 }, { "epoch": 0.8075313807531381, "grad_norm": 15.4528964096149, "learning_rate": 5.4379548914804427e-08, "logits/chosen": -0.016455668956041336, "logits/rejected": 0.3638351559638977, "logps/chosen": -1.2458593845367432, "logps/rejected": -2.3567447662353516, "loss": -0.1144, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3728832006454468, "rewards/margins": 0.11920465528964996, "rewards/rejected": 0.253678560256958, "step": 386 }, { "epoch": 0.8096234309623431, "grad_norm": 22.637025787930654, "learning_rate": 5.324746229213281e-08, "logits/chosen": -0.13823582231998444, "logits/rejected": 0.23312069475650787, "logps/chosen": -1.2766966819763184, "logps/rejected": -1.7796881198883057, "loss": -0.0925, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3433237671852112, "rewards/margins": 0.09567471593618393, "rewards/rejected": 0.24764902889728546, "step": 387 }, { "epoch": 0.8117154811715481, "grad_norm": 14.718678468178092, "learning_rate": 5.212587789268649e-08, "logits/chosen": -0.2695750594139099, "logits/rejected": -0.005407601594924927, "logps/chosen": -1.4397711753845215, "logps/rejected": -1.629962682723999, "loss": -0.0973, "rewards/accuracies": 0.625, "rewards/chosen": 0.33621859550476074, "rewards/margins": 0.033973321318626404, "rewards/rejected": 0.30224525928497314, "step": 388 }, { "epoch": 0.8138075313807531, "grad_norm": 20.171091314172557, "learning_rate": 5.101485558420504e-08, "logits/chosen": 0.19560030102729797, "logits/rejected": 0.34746694564819336, "logps/chosen": -1.6387301683425903, "logps/rejected": -4.318758010864258, "loss": -0.0964, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3025053143501282, "rewards/margins": 0.17127913236618042, "rewards/rejected": 0.13122616708278656, "step": 389 }, { "epoch": 0.8158995815899581, "grad_norm": 19.414432686359987, "learning_rate": 4.991445467064689e-08, "logits/chosen": -0.273246169090271, "logits/rejected": -0.431201308965683, "logps/chosen": -0.9642292857170105, "logps/rejected": -1.8679355382919312, "loss": -0.1217, "rewards/accuracies": 0.75, "rewards/chosen": 0.4235716760158539, "rewards/margins": 0.08978331089019775, "rewards/rejected": 0.33378836512565613, "step": 390 }, { "epoch": 0.8179916317991632, "grad_norm": 16.85819113660886, "learning_rate": 4.882473388902322e-08, "logits/chosen": -0.500401496887207, "logits/rejected": -0.2619732618331909, "logps/chosen": -1.076308250427246, "logps/rejected": -2.224181890487671, "loss": -0.0881, "rewards/accuracies": 0.625, "rewards/chosen": 0.4291846454143524, "rewards/margins": 0.11726579070091248, "rewards/rejected": 0.31191885471343994, "step": 391 }, { "epoch": 0.8200836820083682, "grad_norm": 15.5960410352012, "learning_rate": 4.774575140626316e-08, "logits/chosen": -0.5160663723945618, "logits/rejected": 0.43592193722724915, "logps/chosen": -0.9588484168052673, "logps/rejected": -2.9696645736694336, "loss": -0.1598, "rewards/accuracies": 0.875, "rewards/chosen": 0.45355111360549927, "rewards/margins": 0.257058709859848, "rewards/rejected": 0.19649238884449005, "step": 392 }, { "epoch": 0.8221757322175732, "grad_norm": 23.839231530419294, "learning_rate": 4.667756481610866e-08, "logits/chosen": -0.3702046275138855, "logits/rejected": -0.35005468130111694, "logps/chosen": -0.9770090579986572, "logps/rejected": -1.1811765432357788, "loss": -0.1075, "rewards/accuracies": 0.625, "rewards/chosen": 0.40616047382354736, "rewards/margins": 0.03360208123922348, "rewards/rejected": 0.3725584149360657, "step": 393 }, { "epoch": 0.8242677824267782, "grad_norm": 16.344564093013464, "learning_rate": 4.562023113604041e-08, "logits/chosen": -0.8614406585693359, "logits/rejected": -0.667637825012207, "logps/chosen": -0.942477822303772, "logps/rejected": -1.798526644706726, "loss": -0.1132, "rewards/accuracies": 0.625, "rewards/chosen": 0.4376535415649414, "rewards/margins": 0.10072365403175354, "rewards/rejected": 0.33692991733551025, "step": 394 }, { "epoch": 0.8263598326359832, "grad_norm": 18.59736703264403, "learning_rate": 4.4573806804234335e-08, "logits/chosen": -0.6730791330337524, "logits/rejected": -0.10565708577632904, "logps/chosen": -0.9278669953346252, "logps/rejected": -2.0849101543426514, "loss": -0.1275, "rewards/accuracies": 0.5625, "rewards/chosen": 0.4342162609100342, "rewards/margins": 0.16311287879943848, "rewards/rejected": 0.2711033821105957, "step": 395 }, { "epoch": 0.8284518828451883, "grad_norm": 17.260327771943437, "learning_rate": 4.3538347676548956e-08, "logits/chosen": -0.22298240661621094, "logits/rejected": -0.0226747989654541, "logps/chosen": -1.0462150573730469, "logps/rejected": -2.898242235183716, "loss": -0.1041, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4490687847137451, "rewards/margins": 0.21503090858459473, "rewards/rejected": 0.23403790593147278, "step": 396 }, { "epoch": 0.8305439330543933, "grad_norm": 10.738289775824189, "learning_rate": 4.251390902354413e-08, "logits/chosen": -0.19089946150779724, "logits/rejected": -0.2588718831539154, "logps/chosen": -1.3638062477111816, "logps/rejected": -1.6540424823760986, "loss": -0.1205, "rewards/accuracies": 0.625, "rewards/chosen": 0.34393906593322754, "rewards/margins": 0.06465037912130356, "rewards/rejected": 0.279288649559021, "step": 397 }, { "epoch": 0.8326359832635983, "grad_norm": 14.000882257182658, "learning_rate": 4.1500545527530544e-08, "logits/chosen": -0.4493892192840576, "logits/rejected": -0.09688322246074677, "logps/chosen": -1.4314875602722168, "logps/rejected": -1.8489952087402344, "loss": -0.0961, "rewards/accuracies": 0.625, "rewards/chosen": 0.358940064907074, "rewards/margins": 0.13328352570533752, "rewards/rejected": 0.22565653920173645, "step": 398 }, { "epoch": 0.8347280334728033, "grad_norm": 15.545368370778581, "learning_rate": 4.0498311279651196e-08, "logits/chosen": -0.07205243408679962, "logits/rejected": 0.23499715328216553, "logps/chosen": -1.2373902797698975, "logps/rejected": -3.2700839042663574, "loss": -0.0877, "rewards/accuracies": 0.8125, "rewards/chosen": 0.34146398305892944, "rewards/margins": 0.13711421191692352, "rewards/rejected": 0.20434975624084473, "step": 399 }, { "epoch": 0.8368200836820083, "grad_norm": 18.918377276587357, "learning_rate": 3.9507259776993954e-08, "logits/chosen": -0.5115982294082642, "logits/rejected": -0.6578488349914551, "logps/chosen": -1.8198411464691162, "logps/rejected": -2.025270938873291, "loss": -0.1423, "rewards/accuracies": 0.5625, "rewards/chosen": 0.41336411237716675, "rewards/margins": 0.07312976568937302, "rewards/rejected": 0.3402343690395355, "step": 400 }, { "epoch": 0.8389121338912134, "grad_norm": 11.634156261583618, "learning_rate": 3.8527443919736006e-08, "logits/chosen": -0.9072321653366089, "logits/rejected": -0.7701002359390259, "logps/chosen": -1.0199377536773682, "logps/rejected": -3.217071533203125, "loss": -0.1353, "rewards/accuracies": 0.75, "rewards/chosen": 0.4319874048233032, "rewards/margins": 0.19452467560768127, "rewards/rejected": 0.23746272921562195, "step": 401 }, { "epoch": 0.8410041841004184, "grad_norm": 12.4551196253915, "learning_rate": 3.755891600832026e-08, "logits/chosen": -0.25930002331733704, "logits/rejected": 0.25398126244544983, "logps/chosen": -2.0748448371887207, "logps/rejected": -2.5072836875915527, "loss": -0.1016, "rewards/accuracies": 0.625, "rewards/chosen": 0.39124736189842224, "rewards/margins": 0.09978035092353821, "rewards/rejected": 0.29146701097488403, "step": 402 }, { "epoch": 0.8430962343096234, "grad_norm": 10.222174114537948, "learning_rate": 3.660172774066339e-08, "logits/chosen": -0.05241062492132187, "logits/rejected": 0.2334146946668625, "logps/chosen": -1.0232222080230713, "logps/rejected": -2.3795347213745117, "loss": -0.137, "rewards/accuracies": 0.75, "rewards/chosen": 0.3805456757545471, "rewards/margins": 0.16428259015083313, "rewards/rejected": 0.21626310050487518, "step": 403 }, { "epoch": 0.8451882845188284, "grad_norm": 15.992043901254412, "learning_rate": 3.565593020939678e-08, "logits/chosen": -0.27851253747940063, "logits/rejected": -0.0054774656891822815, "logps/chosen": -1.8243669271469116, "logps/rejected": -2.437450885772705, "loss": -0.1127, "rewards/accuracies": 0.5, "rewards/chosen": 0.33263060450553894, "rewards/margins": 0.05638391897082329, "rewards/rejected": 0.27624669671058655, "step": 404 }, { "epoch": 0.8472803347280334, "grad_norm": 25.026948366336036, "learning_rate": 3.472157389913874e-08, "logits/chosen": -0.805883526802063, "logits/rejected": -0.528130292892456, "logps/chosen": -1.4973548650741577, "logps/rejected": -2.4731860160827637, "loss": -0.1533, "rewards/accuracies": 0.75, "rewards/chosen": 0.3797512650489807, "rewards/margins": 0.11224304139614105, "rewards/rejected": 0.26750820875167847, "step": 405 }, { "epoch": 0.8493723849372385, "grad_norm": 18.95659135912971, "learning_rate": 3.3798708683800305e-08, "logits/chosen": -0.06515583395957947, "logits/rejected": -0.24208559095859528, "logps/chosen": -1.6230918169021606, "logps/rejected": -2.0216987133026123, "loss": -0.0905, "rewards/accuracies": 0.5625, "rewards/chosen": 0.2974805235862732, "rewards/margins": 0.04249989986419678, "rewards/rejected": 0.2549806237220764, "step": 406 }, { "epoch": 0.8514644351464435, "grad_norm": 15.13226749066972, "learning_rate": 3.288738382392273e-08, "logits/chosen": -0.37732651829719543, "logits/rejected": 0.23026621341705322, "logps/chosen": -1.16318941116333, "logps/rejected": -3.1813478469848633, "loss": -0.1584, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3486320972442627, "rewards/margins": 0.1388455033302307, "rewards/rejected": 0.2097865641117096, "step": 407 }, { "epoch": 0.8535564853556485, "grad_norm": 12.92143630698041, "learning_rate": 3.198764796404807e-08, "logits/chosen": -0.5920988321304321, "logits/rejected": -0.5197505950927734, "logps/chosen": -1.685182809829712, "logps/rejected": -2.676419973373413, "loss": -0.1647, "rewards/accuracies": 0.6875, "rewards/chosen": 0.32835954427719116, "rewards/margins": 0.06703662127256393, "rewards/rejected": 0.26132291555404663, "step": 408 }, { "epoch": 0.8556485355648535, "grad_norm": 15.060307639983238, "learning_rate": 3.109954913012294e-08, "logits/chosen": -0.05348600819706917, "logits/rejected": 0.5084730386734009, "logps/chosen": -1.1922996044158936, "logps/rejected": -3.422697067260742, "loss": -0.1294, "rewards/accuracies": 0.8125, "rewards/chosen": 0.400797963142395, "rewards/margins": 0.2237713634967804, "rewards/rejected": 0.17702659964561462, "step": 409 }, { "epoch": 0.8577405857740585, "grad_norm": 10.856906615820568, "learning_rate": 3.022313472693447e-08, "logits/chosen": -0.7400836944580078, "logits/rejected": -0.9220219850540161, "logps/chosen": -0.9642383456230164, "logps/rejected": -1.5164411067962646, "loss": -0.1233, "rewards/accuracies": 0.625, "rewards/chosen": 0.4663574695587158, "rewards/margins": 0.1306409388780594, "rewards/rejected": 0.33571651577949524, "step": 410 }, { "epoch": 0.8598326359832636, "grad_norm": 11.044705837575496, "learning_rate": 2.935845153558053e-08, "logits/chosen": -0.6763345003128052, "logits/rejected": -0.42656803131103516, "logps/chosen": -1.0390863418579102, "logps/rejected": -2.544398784637451, "loss": -0.1433, "rewards/accuracies": 0.75, "rewards/chosen": 0.4070824384689331, "rewards/margins": 0.17951026558876038, "rewards/rejected": 0.22757217288017273, "step": 411 }, { "epoch": 0.8619246861924686, "grad_norm": 14.117311799051569, "learning_rate": 2.8505545710972107e-08, "logits/chosen": -0.008791878819465637, "logits/rejected": 0.23330970108509064, "logps/chosen": -1.5099177360534668, "logps/rejected": -1.9858248233795166, "loss": -0.0957, "rewards/accuracies": 0.5, "rewards/chosen": 0.3444157838821411, "rewards/margins": 0.04557289183139801, "rewards/rejected": 0.2988429069519043, "step": 412 }, { "epoch": 0.8640167364016736, "grad_norm": 21.343019352885765, "learning_rate": 2.766446277937029e-08, "logits/chosen": -0.3148643970489502, "logits/rejected": -0.20236754417419434, "logps/chosen": -1.3142023086547852, "logps/rejected": -2.6143746376037598, "loss": -0.0726, "rewards/accuracies": 0.625, "rewards/chosen": 0.3833717703819275, "rewards/margins": 0.0287872813642025, "rewards/rejected": 0.3545844852924347, "step": 413 }, { "epoch": 0.8661087866108786, "grad_norm": 9.868482152917004, "learning_rate": 2.683524763595546e-08, "logits/chosen": 0.10180674493312836, "logits/rejected": -0.06073710322380066, "logps/chosen": -0.9547271728515625, "logps/rejected": -1.5085153579711914, "loss": -0.1442, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4392520785331726, "rewards/margins": 0.11705724895000458, "rewards/rejected": 0.3221948742866516, "step": 414 }, { "epoch": 0.8682008368200836, "grad_norm": 24.104841754222196, "learning_rate": 2.601794454243139e-08, "logits/chosen": -0.3390297591686249, "logits/rejected": 0.024038255214691162, "logps/chosen": -1.0783426761627197, "logps/rejected": -2.5162158012390137, "loss": -0.1205, "rewards/accuracies": 0.75, "rewards/chosen": 0.435734361410141, "rewards/margins": 0.15422660112380981, "rewards/rejected": 0.2815077602863312, "step": 415 }, { "epoch": 0.8702928870292888, "grad_norm": 14.28427980032814, "learning_rate": 2.521259712466256e-08, "logits/chosen": -1.2510604858398438, "logits/rejected": -0.2848242223262787, "logps/chosen": -0.8153286576271057, "logps/rejected": -1.7719354629516602, "loss": -0.1589, "rewards/accuracies": 0.875, "rewards/chosen": 0.5010541677474976, "rewards/margins": 0.23067042231559753, "rewards/rejected": 0.27038371562957764, "step": 416 }, { "epoch": 0.8723849372384938, "grad_norm": 25.588656821141594, "learning_rate": 2.4419248370345285e-08, "logits/chosen": 0.3352106511592865, "logits/rejected": 0.3673442602157593, "logps/chosen": -1.1408625841140747, "logps/rejected": -3.054860830307007, "loss": -0.126, "rewards/accuracies": 0.6875, "rewards/chosen": 0.39492154121398926, "rewards/margins": 0.11788132786750793, "rewards/rejected": 0.2770402431488037, "step": 417 }, { "epoch": 0.8744769874476988, "grad_norm": 10.077506491437301, "learning_rate": 2.3637940626713342e-08, "logits/chosen": 0.2162850946187973, "logits/rejected": 0.19268055260181427, "logps/chosen": -1.1083879470825195, "logps/rejected": -2.3523571491241455, "loss": -0.1301, "rewards/accuracies": 0.75, "rewards/chosen": 0.41468891501426697, "rewards/margins": 0.19711005687713623, "rewards/rejected": 0.21757885813713074, "step": 418 }, { "epoch": 0.8765690376569037, "grad_norm": 14.531005857324054, "learning_rate": 2.2868715598277578e-08, "logits/chosen": -0.6414574384689331, "logits/rejected": -0.19284185767173767, "logps/chosen": -1.3791618347167969, "logps/rejected": -1.8712342977523804, "loss": -0.1405, "rewards/accuracies": 0.6875, "rewards/chosen": 0.35826003551483154, "rewards/margins": 0.051309142261743546, "rewards/rejected": 0.3069508671760559, "step": 419 }, { "epoch": 0.8786610878661087, "grad_norm": 15.518123245027137, "learning_rate": 2.2111614344599684e-08, "logits/chosen": -0.6647897362709045, "logits/rejected": -0.1362893283367157, "logps/chosen": -1.5059080123901367, "logps/rejected": -2.9713315963745117, "loss": -0.122, "rewards/accuracies": 0.75, "rewards/chosen": 0.3615904450416565, "rewards/margins": 0.1611703634262085, "rewards/rejected": 0.2004200667142868, "step": 420 }, { "epoch": 0.8807531380753139, "grad_norm": 18.186740715196795, "learning_rate": 2.1366677278100486e-08, "logits/chosen": -0.7590320110321045, "logits/rejected": -0.011863499879837036, "logps/chosen": -1.4375065565109253, "logps/rejected": -2.3867299556732178, "loss": -0.1402, "rewards/accuracies": 0.5625, "rewards/chosen": 0.40658998489379883, "rewards/margins": 0.12375946342945099, "rewards/rejected": 0.28283050656318665, "step": 421 }, { "epoch": 0.8828451882845189, "grad_norm": 14.824655173088397, "learning_rate": 2.0633944161903145e-08, "logits/chosen": -0.28729909658432007, "logits/rejected": 0.09423608332872391, "logps/chosen": -1.6937799453735352, "logps/rejected": -2.76290225982666, "loss": -0.14, "rewards/accuracies": 0.5, "rewards/chosen": 0.33846837282180786, "rewards/margins": 0.10761270672082901, "rewards/rejected": 0.23085570335388184, "step": 422 }, { "epoch": 0.8849372384937239, "grad_norm": 14.454926513569529, "learning_rate": 1.991345410771017e-08, "logits/chosen": -0.23677058517932892, "logits/rejected": 0.2690431773662567, "logps/chosen": -1.7377506494522095, "logps/rejected": -2.0279836654663086, "loss": -0.1366, "rewards/accuracies": 0.625, "rewards/chosen": 0.31104129552841187, "rewards/margins": 0.1051572784781456, "rewards/rejected": 0.20588400959968567, "step": 423 }, { "epoch": 0.8870292887029289, "grad_norm": 13.709071053517471, "learning_rate": 1.9205245573716195e-08, "logits/chosen": -0.5881615877151489, "logits/rejected": -0.2020711451768875, "logps/chosen": -1.4259986877441406, "logps/rejected": -3.365917682647705, "loss": -0.1618, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4160638153553009, "rewards/margins": 0.1756630837917328, "rewards/rejected": 0.2404007464647293, "step": 424 }, { "epoch": 0.8891213389121339, "grad_norm": 15.42709086141289, "learning_rate": 1.850935636255496e-08, "logits/chosen": -0.4001805782318115, "logits/rejected": 0.3592504858970642, "logps/chosen": -1.2519683837890625, "logps/rejected": -2.341618061065674, "loss": -0.1113, "rewards/accuracies": 0.6875, "rewards/chosen": 0.39130017161369324, "rewards/margins": 0.1382875144481659, "rewards/rejected": 0.25301262736320496, "step": 425 }, { "epoch": 0.891213389121339, "grad_norm": 18.239206963415292, "learning_rate": 1.7825823619281448e-08, "logits/chosen": -0.4060337245464325, "logits/rejected": -0.25071170926094055, "logps/chosen": -0.7171350717544556, "logps/rejected": -2.6098623275756836, "loss": -0.1455, "rewards/accuracies": 0.6875, "rewards/chosen": 0.5387618541717529, "rewards/margins": 0.19522696733474731, "rewards/rejected": 0.3435348570346832, "step": 426 }, { "epoch": 0.893305439330544, "grad_norm": 13.151951030853512, "learning_rate": 1.7154683829389283e-08, "logits/chosen": 0.037064895033836365, "logits/rejected": 0.4181457757949829, "logps/chosen": -2.020270347595215, "logps/rejected": -2.110900402069092, "loss": -0.1494, "rewards/accuracies": 0.375, "rewards/chosen": 0.3166952133178711, "rewards/margins": 0.030689965933561325, "rewards/rejected": 0.28600525856018066, "step": 427 }, { "epoch": 0.895397489539749, "grad_norm": 30.11848854193685, "learning_rate": 1.649597281686302e-08, "logits/chosen": 0.27454110980033875, "logits/rejected": 0.2602013349533081, "logps/chosen": -1.5725305080413818, "logps/rejected": -2.114314556121826, "loss": -0.1403, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3760819137096405, "rewards/margins": 0.09325071424245834, "rewards/rejected": 0.28283122181892395, "step": 428 }, { "epoch": 0.897489539748954, "grad_norm": 49.208669602647255, "learning_rate": 1.584972574226623e-08, "logits/chosen": -0.2047557532787323, "logits/rejected": 0.5918644070625305, "logps/chosen": -1.154144525527954, "logps/rejected": -3.3555400371551514, "loss": -0.139, "rewards/accuracies": 0.875, "rewards/chosen": 0.41403210163116455, "rewards/margins": 0.12810006737709045, "rewards/rejected": 0.2859320640563965, "step": 429 }, { "epoch": 0.899581589958159, "grad_norm": 20.955344541279917, "learning_rate": 1.521597710086439e-08, "logits/chosen": 0.13177312910556793, "logits/rejected": 0.5117599964141846, "logps/chosen": -1.241478681564331, "logps/rejected": -1.3989903926849365, "loss": -0.1086, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3930959701538086, "rewards/margins": 0.03417598456144333, "rewards/rejected": 0.3589199483394623, "step": 430 }, { "epoch": 0.9016736401673641, "grad_norm": 14.619595042411454, "learning_rate": 1.459476072078386e-08, "logits/chosen": 0.24203670024871826, "logits/rejected": 0.2092907428741455, "logps/chosen": -1.1378140449523926, "logps/rejected": -1.448889970779419, "loss": -0.1118, "rewards/accuracies": 0.3125, "rewards/chosen": 0.4876505732536316, "rewards/margins": 0.054524172097444534, "rewards/rejected": 0.43312641978263855, "step": 431 }, { "epoch": 0.9037656903765691, "grad_norm": 20.394218134088426, "learning_rate": 1.3986109761206093e-08, "logits/chosen": -0.41305041313171387, "logits/rejected": -0.09600323438644409, "logps/chosen": -1.150968313217163, "logps/rejected": -2.4742114543914795, "loss": -0.1391, "rewards/accuracies": 0.5625, "rewards/chosen": 0.36665695905685425, "rewards/margins": 0.12305985391139984, "rewards/rejected": 0.2435971200466156, "step": 432 }, { "epoch": 0.9058577405857741, "grad_norm": 18.209878374392186, "learning_rate": 1.3390056710597647e-08, "logits/chosen": -0.21727851033210754, "logits/rejected": 0.14106367528438568, "logps/chosen": -1.2363967895507812, "logps/rejected": -2.26884126663208, "loss": -0.1446, "rewards/accuracies": 0.625, "rewards/chosen": 0.3572999835014343, "rewards/margins": 0.11691320687532425, "rewards/rejected": 0.24038679897785187, "step": 433 }, { "epoch": 0.9079497907949791, "grad_norm": 11.047404016240506, "learning_rate": 1.280663338497609e-08, "logits/chosen": -0.43556827306747437, "logits/rejected": -0.133283793926239, "logps/chosen": -1.3448657989501953, "logps/rejected": -2.900851011276245, "loss": -0.1378, "rewards/accuracies": 0.75, "rewards/chosen": 0.33240455389022827, "rewards/margins": 0.14208939671516418, "rewards/rejected": 0.19031518697738647, "step": 434 }, { "epoch": 0.9100418410041841, "grad_norm": 18.197187537565902, "learning_rate": 1.2235870926211616e-08, "logits/chosen": -0.5432617664337158, "logits/rejected": 0.13345128297805786, "logps/chosen": -1.0295956134796143, "logps/rejected": -3.0868406295776367, "loss": -0.1274, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4138888418674469, "rewards/margins": 0.20933057367801666, "rewards/rejected": 0.20455826818943024, "step": 435 }, { "epoch": 0.9121338912133892, "grad_norm": 17.261325822213802, "learning_rate": 1.1677799800364957e-08, "logits/chosen": -1.115002989768982, "logits/rejected": -0.11034034192562103, "logps/chosen": -1.1753132343292236, "logps/rejected": -2.2985410690307617, "loss": -0.1525, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3633139729499817, "rewards/margins": 0.10394985973834991, "rewards/rejected": 0.2593640685081482, "step": 436 }, { "epoch": 0.9142259414225942, "grad_norm": 34.53685792981264, "learning_rate": 1.1132449796060873e-08, "logits/chosen": -0.3196355700492859, "logits/rejected": -0.32760536670684814, "logps/chosen": -1.0820286273956299, "logps/rejected": -2.6322782039642334, "loss": -0.1472, "rewards/accuracies": 0.875, "rewards/chosen": 0.4171540141105652, "rewards/margins": 0.19411149621009827, "rewards/rejected": 0.22304251790046692, "step": 437 }, { "epoch": 0.9163179916317992, "grad_norm": 23.26394464843546, "learning_rate": 1.0599850022898537e-08, "logits/chosen": -0.21112748980522156, "logits/rejected": -0.7489008903503418, "logps/chosen": -1.442376971244812, "logps/rejected": -1.3414108753204346, "loss": -0.1214, "rewards/accuracies": 0.6875, "rewards/chosen": 0.39732199907302856, "rewards/margins": 0.025272248312830925, "rewards/rejected": 0.3720497786998749, "step": 438 }, { "epoch": 0.9184100418410042, "grad_norm": 16.039680866672114, "learning_rate": 1.0080028909897232e-08, "logits/chosen": 0.08464886993169785, "logits/rejected": -0.09728383272886276, "logps/chosen": -1.6759049892425537, "logps/rejected": -2.3375589847564697, "loss": -0.1067, "rewards/accuracies": 0.625, "rewards/chosen": 0.3139100968837738, "rewards/margins": 0.07879811525344849, "rewards/rejected": 0.2351119965314865, "step": 439 }, { "epoch": 0.9205020920502092, "grad_norm": 14.377105293354811, "learning_rate": 9.57301420397924e-09, "logits/chosen": -0.8353943824768066, "logits/rejected": -0.9240339994430542, "logps/chosen": -1.6584469079971313, "logps/rejected": -3.0757076740264893, "loss": -0.1096, "rewards/accuracies": 0.625, "rewards/chosen": 0.43850457668304443, "rewards/margins": 0.1471424549818039, "rewards/rejected": 0.29136213660240173, "step": 440 }, { "epoch": 0.9225941422594143, "grad_norm": 18.35983958853663, "learning_rate": 9.078832968488632e-09, "logits/chosen": -0.48855262994766235, "logits/rejected": -0.07575897127389908, "logps/chosen": -1.3988516330718994, "logps/rejected": -3.0933828353881836, "loss": -0.1606, "rewards/accuracies": 0.6875, "rewards/chosen": 0.33013659715652466, "rewards/margins": 0.0989568755030632, "rewards/rejected": 0.23117972910404205, "step": 441 }, { "epoch": 0.9246861924686193, "grad_norm": 12.572841961166487, "learning_rate": 8.597511581746625e-09, "logits/chosen": 0.07626624405384064, "logits/rejected": 0.3884120285511017, "logps/chosen": -1.4382637739181519, "logps/rejected": -3.394392967224121, "loss": -0.1189, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2986642122268677, "rewards/margins": 0.108258455991745, "rewards/rejected": 0.1904057264328003, "step": 442 }, { "epoch": 0.9267782426778243, "grad_norm": 11.08282465711567, "learning_rate": 8.129075735643698e-09, "logits/chosen": 0.036822497844696045, "logits/rejected": 0.4752381145954132, "logps/chosen": -1.358346939086914, "logps/rejected": -2.069066047668457, "loss": -0.1501, "rewards/accuracies": 0.5625, "rewards/chosen": 0.3432375192642212, "rewards/margins": 0.052591562271118164, "rewards/rejected": 0.290645956993103, "step": 443 }, { "epoch": 0.9288702928870293, "grad_norm": 16.456766535225622, "learning_rate": 7.673550434268123e-09, "logits/chosen": -0.1680659055709839, "logits/rejected": 0.33423447608947754, "logps/chosen": -1.4859250783920288, "logps/rejected": -2.750816583633423, "loss": -0.121, "rewards/accuracies": 0.75, "rewards/chosen": 0.3396714925765991, "rewards/margins": 0.14422327280044556, "rewards/rejected": 0.19544823467731476, "step": 444 }, { "epoch": 0.9309623430962343, "grad_norm": 13.963716082092512, "learning_rate": 7.230959992571367e-09, "logits/chosen": -0.5167222023010254, "logits/rejected": -0.10349351167678833, "logps/chosen": -2.2473559379577637, "logps/rejected": -1.9560575485229492, "loss": -0.1235, "rewards/accuracies": 0.4375, "rewards/chosen": 0.3042965829372406, "rewards/margins": 0.0013524312525987625, "rewards/rejected": 0.302944153547287, "step": 445 }, { "epoch": 0.9330543933054394, "grad_norm": 11.73510057260775, "learning_rate": 6.801328035070136e-09, "logits/chosen": -0.1064242273569107, "logits/rejected": -0.47670555114746094, "logps/chosen": -1.6268620491027832, "logps/rejected": -2.1553955078125, "loss": -0.1086, "rewards/accuracies": 0.6875, "rewards/chosen": 0.369182288646698, "rewards/margins": 0.05523061379790306, "rewards/rejected": 0.31395167112350464, "step": 446 }, { "epoch": 0.9351464435146444, "grad_norm": 21.99025718659999, "learning_rate": 6.38467749458535e-09, "logits/chosen": -0.5817941427230835, "logits/rejected": -0.2126074880361557, "logps/chosen": -0.6642739176750183, "logps/rejected": -1.7573022842407227, "loss": -0.1693, "rewards/accuracies": 0.75, "rewards/chosen": 0.581842303276062, "rewards/margins": 0.2932204008102417, "rewards/rejected": 0.2886218726634979, "step": 447 }, { "epoch": 0.9372384937238494, "grad_norm": 20.176181702826447, "learning_rate": 5.981030611018234e-09, "logits/chosen": -0.23663604259490967, "logits/rejected": -0.3075418472290039, "logps/chosen": -0.8663835525512695, "logps/rejected": -1.8129829168319702, "loss": -0.1573, "rewards/accuracies": 0.75, "rewards/chosen": 0.4720425307750702, "rewards/margins": 0.13673511147499084, "rewards/rejected": 0.33530741930007935, "step": 448 }, { "epoch": 0.9393305439330544, "grad_norm": 11.998624086972171, "learning_rate": 5.590408930162799e-09, "logits/chosen": -0.9552597999572754, "logits/rejected": -0.2985736131668091, "logps/chosen": -1.1250615119934082, "logps/rejected": -2.544997215270996, "loss": -0.1118, "rewards/accuracies": 0.625, "rewards/chosen": 0.40850841999053955, "rewards/margins": 0.10064040124416351, "rewards/rejected": 0.30786800384521484, "step": 449 }, { "epoch": 0.9414225941422594, "grad_norm": 17.17071890301872, "learning_rate": 5.212833302556258e-09, "logits/chosen": 0.05753253027796745, "logits/rejected": 0.7037515044212341, "logps/chosen": -1.349974274635315, "logps/rejected": -1.7757717370986938, "loss": -0.1235, "rewards/accuracies": 0.4375, "rewards/chosen": 0.40113726258277893, "rewards/margins": 0.038026921451091766, "rewards/rejected": 0.36311033368110657, "step": 450 }, { "epoch": 0.9435146443514645, "grad_norm": 13.909230390768446, "learning_rate": 4.848323882365668e-09, "logits/chosen": -0.5183601379394531, "logits/rejected": 0.12597893178462982, "logps/chosen": -1.328192949295044, "logps/rejected": -2.0817277431488037, "loss": -0.0979, "rewards/accuracies": 0.75, "rewards/chosen": 0.3509393632411957, "rewards/margins": 0.10210657864809036, "rewards/rejected": 0.2488327920436859, "step": 451 }, { "epoch": 0.9456066945606695, "grad_norm": 14.71491478198279, "learning_rate": 4.496900126312431e-09, "logits/chosen": -0.2766793668270111, "logits/rejected": 0.13515634834766388, "logps/chosen": -1.4848310947418213, "logps/rejected": -2.497296094894409, "loss": -0.0911, "rewards/accuracies": 0.625, "rewards/chosen": 0.3426211476325989, "rewards/margins": 0.09051606804132462, "rewards/rejected": 0.25210511684417725, "step": 452 }, { "epoch": 0.9476987447698745, "grad_norm": 13.65972842123855, "learning_rate": 4.158580792633482e-09, "logits/chosen": -0.19712916016578674, "logits/rejected": 0.3826109766960144, "logps/chosen": -1.3620458841323853, "logps/rejected": -3.217412233352661, "loss": -0.1318, "rewards/accuracies": 0.8125, "rewards/chosen": 0.310494989156723, "rewards/margins": 0.16613849997520447, "rewards/rejected": 0.14435648918151855, "step": 453 }, { "epoch": 0.9497907949790795, "grad_norm": 18.05968836248289, "learning_rate": 3.833383940080231e-09, "logits/chosen": 0.03498596325516701, "logits/rejected": 0.11552795022726059, "logps/chosen": -2.0567967891693115, "logps/rejected": -1.346252679824829, "loss": -0.1251, "rewards/accuracies": 0.5, "rewards/chosen": 0.33275869488716125, "rewards/margins": 0.012848753482103348, "rewards/rejected": 0.319909930229187, "step": 454 }, { "epoch": 0.9518828451882845, "grad_norm": 12.21768160390563, "learning_rate": 3.521326926954532e-09, "logits/chosen": -0.4226292371749878, "logits/rejected": -0.23755401372909546, "logps/chosen": -0.9351893067359924, "logps/rejected": -2.4535017013549805, "loss": -0.0938, "rewards/accuracies": 0.625, "rewards/chosen": 0.4389905333518982, "rewards/margins": 0.11947016417980194, "rewards/rejected": 0.31952038407325745, "step": 455 }, { "epoch": 0.9539748953974896, "grad_norm": 18.098045353171948, "learning_rate": 3.2224264101821108e-09, "logits/chosen": -0.09337201714515686, "logits/rejected": 0.5950417518615723, "logps/chosen": -1.2980998754501343, "logps/rejected": -2.468187093734741, "loss": -0.1534, "rewards/accuracies": 0.625, "rewards/chosen": 0.3416418433189392, "rewards/margins": 0.13695421814918518, "rewards/rejected": 0.20468764007091522, "step": 456 }, { "epoch": 0.9560669456066946, "grad_norm": 26.130154479920918, "learning_rate": 2.936698344423505e-09, "logits/chosen": 0.07752484083175659, "logits/rejected": 0.2245202213525772, "logps/chosen": -1.3137603998184204, "logps/rejected": -2.435314893722534, "loss": -0.1571, "rewards/accuracies": 0.75, "rewards/chosen": 0.36322876811027527, "rewards/margins": 0.16007305681705475, "rewards/rejected": 0.2031557261943817, "step": 457 }, { "epoch": 0.9581589958158996, "grad_norm": 10.469709392256348, "learning_rate": 2.664157981222437e-09, "logits/chosen": -0.37826135754585266, "logits/rejected": -0.2887250483036041, "logps/chosen": -1.4897507429122925, "logps/rejected": -2.7404723167419434, "loss": -0.1025, "rewards/accuracies": 0.6875, "rewards/chosen": 0.3629001975059509, "rewards/margins": 0.11764570325613022, "rewards/rejected": 0.2452545166015625, "step": 458 }, { "epoch": 0.9602510460251046, "grad_norm": 19.507854738140097, "learning_rate": 2.4048198681917154e-09, "logits/chosen": 0.14118759334087372, "logits/rejected": 0.5922274589538574, "logps/chosen": -1.4373829364776611, "logps/rejected": -2.7581567764282227, "loss": -0.142, "rewards/accuracies": 0.5, "rewards/chosen": 0.41010457277297974, "rewards/margins": 0.15131796896457672, "rewards/rejected": 0.2587866187095642, "step": 459 }, { "epoch": 0.9623430962343096, "grad_norm": 18.02135980384631, "learning_rate": 2.158697848236607e-09, "logits/chosen": -0.6226168870925903, "logits/rejected": -0.31880009174346924, "logps/chosen": -1.2920911312103271, "logps/rejected": -2.5839664936065674, "loss": -0.1252, "rewards/accuracies": 0.8125, "rewards/chosen": 0.38381361961364746, "rewards/margins": 0.1413259208202362, "rewards/rejected": 0.24248769879341125, "step": 460 }, { "epoch": 0.9644351464435147, "grad_norm": 17.032613549200434, "learning_rate": 1.9258050588161766e-09, "logits/chosen": 0.1924222707748413, "logits/rejected": 0.715542733669281, "logps/chosen": -1.2721333503723145, "logps/rejected": -4.255842685699463, "loss": -0.1525, "rewards/accuracies": 0.9375, "rewards/chosen": 0.3516067862510681, "rewards/margins": 0.1841929852962494, "rewards/rejected": 0.16741378605365753, "step": 461 }, { "epoch": 0.9665271966527197, "grad_norm": 34.36895188270403, "learning_rate": 1.7061539312417107e-09, "logits/chosen": -0.29315185546875, "logits/rejected": 0.2312111258506775, "logps/chosen": -1.306030511856079, "logps/rejected": -3.0991930961608887, "loss": -0.1276, "rewards/accuracies": 0.75, "rewards/chosen": 0.3960568308830261, "rewards/margins": 0.16641941666603088, "rewards/rejected": 0.22963739931583405, "step": 462 }, { "epoch": 0.9686192468619247, "grad_norm": 19.667410540117263, "learning_rate": 1.4997561900135236e-09, "logits/chosen": -0.6279085874557495, "logits/rejected": -0.18156293034553528, "logps/chosen": -1.0435895919799805, "logps/rejected": -2.9352269172668457, "loss": -0.1557, "rewards/accuracies": 0.75, "rewards/chosen": 0.4187704920768738, "rewards/margins": 0.16668733954429626, "rewards/rejected": 0.2520831525325775, "step": 463 }, { "epoch": 0.9707112970711297, "grad_norm": 37.90773614924713, "learning_rate": 1.3066228521948219e-09, "logits/chosen": -0.0598616898059845, "logits/rejected": 0.47852182388305664, "logps/chosen": -1.1657414436340332, "logps/rejected": -2.9400830268859863, "loss": -0.136, "rewards/accuracies": 0.8125, "rewards/chosen": 0.38903993368148804, "rewards/margins": 0.2122332900762558, "rewards/rejected": 0.17680665850639343, "step": 464 }, { "epoch": 0.9728033472803347, "grad_norm": 32.18135370625648, "learning_rate": 1.126764226823812e-09, "logits/chosen": -0.33765825629234314, "logits/rejected": 0.25973930954933167, "logps/chosen": -1.73961341381073, "logps/rejected": -2.8940162658691406, "loss": -0.1681, "rewards/accuracies": 0.8125, "rewards/chosen": 0.3287394344806671, "rewards/margins": 0.11775898933410645, "rewards/rejected": 0.21098044514656067, "step": 465 }, { "epoch": 0.9748953974895398, "grad_norm": 15.098617874452946, "learning_rate": 9.60189914363363e-10, "logits/chosen": 0.35089248418807983, "logits/rejected": 0.9048178791999817, "logps/chosen": -1.9484138488769531, "logps/rejected": -2.5396575927734375, "loss": -0.1436, "rewards/accuracies": 0.75, "rewards/chosen": 0.44588300585746765, "rewards/margins": 0.2035720944404602, "rewards/rejected": 0.24231091141700745, "step": 466 }, { "epoch": 0.9769874476987448, "grad_norm": 26.9169004762807, "learning_rate": 8.069088061885276e-10, "logits/chosen": 0.1534384936094284, "logits/rejected": 0.5771427750587463, "logps/chosen": -0.8172132968902588, "logps/rejected": -3.5388541221618652, "loss": -0.1277, "rewards/accuracies": 0.75, "rewards/chosen": 0.46066659688949585, "rewards/margins": 0.21784912049770355, "rewards/rejected": 0.2428174614906311, "step": 467 }, { "epoch": 0.9790794979079498, "grad_norm": 17.911143221851358, "learning_rate": 6.66929084112089e-10, "logits/chosen": -0.29413968324661255, "logits/rejected": 0.13598132133483887, "logps/chosen": -1.2387174367904663, "logps/rejected": -1.7768938541412354, "loss": -0.0878, "rewards/accuracies": 0.6875, "rewards/chosen": 0.4200688600540161, "rewards/margins": 0.1287473887205124, "rewards/rejected": 0.2913214862346649, "step": 468 }, { "epoch": 0.9811715481171548, "grad_norm": 13.151996858359748, "learning_rate": 5.402582199476036e-10, "logits/chosen": -0.49788159132003784, "logits/rejected": -0.18531841039657593, "logps/chosen": -0.8361096382141113, "logps/rejected": -3.4101362228393555, "loss": -0.1225, "rewards/accuracies": 0.8125, "rewards/chosen": 0.4737487733364105, "rewards/margins": 0.23892131447792053, "rewards/rejected": 0.23482745885849, "step": 469 }, { "epoch": 0.9832635983263598, "grad_norm": 16.63754739733106, "learning_rate": 4.269029751107489e-10, "logits/chosen": -0.5837739706039429, "logits/rejected": -0.30460917949676514, "logps/chosen": -0.9492983222007751, "logps/rejected": -2.2384424209594727, "loss": -0.152, "rewards/accuracies": 0.75, "rewards/chosen": 0.47352638840675354, "rewards/margins": 0.12935146689414978, "rewards/rejected": 0.34417492151260376, "step": 470 }, { "epoch": 0.9853556485355649, "grad_norm": 26.79474982462166, "learning_rate": 3.2686940025836164e-10, "logits/chosen": -0.20093563199043274, "logits/rejected": -0.20078404247760773, "logps/chosen": -1.623030662536621, "logps/rejected": -1.8910236358642578, "loss": -0.1135, "rewards/accuracies": 0.625, "rewards/chosen": 0.3414144217967987, "rewards/margins": 0.0637202039361, "rewards/rejected": 0.2776942253112793, "step": 471 }, { "epoch": 0.9874476987447699, "grad_norm": 26.02885361868872, "learning_rate": 2.4016283496544607e-10, "logits/chosen": -0.08121422678232193, "logits/rejected": -0.05113856494426727, "logps/chosen": -1.5861430168151855, "logps/rejected": -2.3078389167785645, "loss": -0.1663, "rewards/accuracies": 0.75, "rewards/chosen": 0.35104459524154663, "rewards/margins": 0.11668679118156433, "rewards/rejected": 0.2343578040599823, "step": 472 }, { "epoch": 0.9895397489539749, "grad_norm": 21.891539506236764, "learning_rate": 1.6678790744015236e-10, "logits/chosen": -0.7576691508293152, "logits/rejected": -0.04828827083110809, "logps/chosen": -1.8233122825622559, "logps/rejected": -2.146613121032715, "loss": -0.1304, "rewards/accuracies": 0.6875, "rewards/chosen": 0.268475741147995, "rewards/margins": 0.07678797096014023, "rewards/rejected": 0.19168779253959656, "step": 473 }, { "epoch": 0.9916317991631799, "grad_norm": 28.502708975557262, "learning_rate": 1.0674853427683484e-10, "logits/chosen": 0.16506695747375488, "logits/rejected": -0.4553059935569763, "logps/chosen": -1.0200731754302979, "logps/rejected": -1.8700056076049805, "loss": -0.1593, "rewards/accuracies": 0.4375, "rewards/chosen": 0.4241200089454651, "rewards/margins": 0.11907336115837097, "rewards/rejected": 0.30504661798477173, "step": 474 }, { "epoch": 0.9937238493723849, "grad_norm": 18.524168156983546, "learning_rate": 6.004792024680294e-11, "logits/chosen": 0.46647167205810547, "logits/rejected": 0.5237947702407837, "logps/chosen": -1.175705909729004, "logps/rejected": -2.5198819637298584, "loss": -0.1397, "rewards/accuracies": 0.75, "rewards/chosen": 0.4669370651245117, "rewards/margins": 0.23335213959217072, "rewards/rejected": 0.2335849404335022, "step": 475 }, { "epoch": 0.99581589958159, "grad_norm": 18.476908526093876, "learning_rate": 2.6688558127485604e-11, "logits/chosen": -0.22382637858390808, "logits/rejected": -0.0033440515398979187, "logps/chosen": -1.3147752285003662, "logps/rejected": -2.3031740188598633, "loss": -0.1175, "rewards/accuracies": 0.75, "rewards/chosen": 0.36867618560791016, "rewards/margins": 0.1478295624256134, "rewards/rejected": 0.22084660828113556, "step": 476 }, { "epoch": 0.997907949790795, "grad_norm": 60.634529292954234, "learning_rate": 6.672228569148952e-12, "logits/chosen": -0.9228725433349609, "logits/rejected": -0.574424147605896, "logps/chosen": -1.5599159002304077, "logps/rejected": -1.694455623626709, "loss": -0.1238, "rewards/accuracies": 0.5, "rewards/chosen": 0.28848904371261597, "rewards/margins": 0.02120237797498703, "rewards/rejected": 0.26728665828704834, "step": 477 }, { "epoch": 1.0, "grad_norm": 12.74207673062675, "learning_rate": 0.0, "logits/chosen": -0.6011393070220947, "logits/rejected": -0.007772140204906464, "logps/chosen": -1.1837854385375977, "logps/rejected": -2.4837894439697266, "loss": -0.0854, "rewards/accuracies": 0.6875, "rewards/chosen": 0.35520443320274353, "rewards/margins": 0.12285250425338745, "rewards/rejected": 0.23235191404819489, "step": 478 } ], "logging_steps": 1, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }