{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999297541394882, "eval_steps": 400, "global_step": 5604, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002676032781401572, "grad_norm": 5.007836359233225, "learning_rate": 8.9126559714795e-09, "logits/chosen": -0.0686589926481247, "logits/rejected": 0.14136984944343567, "logps/chosen": -1.7160040140151978, "logps/rejected": -1.889505386352539, "loss": 0.7102, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.7160040140151978, "rewards/margins": 0.1735011637210846, "rewards/rejected": -1.889505386352539, "sft_loss": 1.468440294265747, "step": 5 }, { "epoch": 0.005352065562803144, "grad_norm": 9.54345972209578, "learning_rate": 1.7825311942959e-08, "logits/chosen": -0.007096876855939627, "logits/rejected": 0.11429889500141144, "logps/chosen": -1.802136778831482, "logps/rejected": -1.8458713293075562, "loss": 0.7922, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.802136778831482, "rewards/margins": 0.043734706938266754, "rewards/rejected": -1.8458713293075562, "sft_loss": 1.5083144903182983, "step": 10 }, { "epoch": 0.008028098344204716, "grad_norm": 10.76009186748774, "learning_rate": 2.67379679144385e-08, "logits/chosen": -0.03930598497390747, "logits/rejected": 0.06061038374900818, "logps/chosen": -1.635589838027954, "logps/rejected": -1.7648627758026123, "loss": 0.7673, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.635589838027954, "rewards/margins": 0.12927308678627014, "rewards/rejected": -1.7648627758026123, "sft_loss": 1.5007826089859009, "step": 15 }, { "epoch": 0.010704131125606288, "grad_norm": 5.0343169890815735, "learning_rate": 3.5650623885918e-08, "logits/chosen": -0.039493732154369354, "logits/rejected": 0.04849349707365036, "logps/chosen": -1.7248096466064453, "logps/rejected": -1.8056780099868774, "loss": 0.791, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.7248096466064453, "rewards/margins": 0.08086834847927094, "rewards/rejected": -1.8056780099868774, "sft_loss": 1.5005706548690796, "step": 20 }, { "epoch": 0.013380163907007862, "grad_norm": 16.01029536696111, "learning_rate": 4.45632798573975e-08, "logits/chosen": -0.06505529582500458, "logits/rejected": 0.020799441263079643, "logps/chosen": -1.8701883554458618, "logps/rejected": -1.7795698642730713, "loss": 0.8987, "rewards/accuracies": 0.3812499940395355, "rewards/chosen": -1.8701883554458618, "rewards/margins": -0.09061814099550247, "rewards/rejected": -1.7795698642730713, "sft_loss": 1.5459671020507812, "step": 25 }, { "epoch": 0.016056196688409432, "grad_norm": 8.419472984626248, "learning_rate": 5.3475935828877e-08, "logits/chosen": -0.09517794847488403, "logits/rejected": 8.840263035381213e-05, "logps/chosen": -1.9080820083618164, "logps/rejected": -1.8322795629501343, "loss": 0.8508, "rewards/accuracies": 0.4375, "rewards/chosen": -1.9080820083618164, "rewards/margins": -0.07580234855413437, "rewards/rejected": -1.8322795629501343, "sft_loss": 1.6464264392852783, "step": 30 }, { "epoch": 0.018732229469811006, "grad_norm": 9.603595781210746, "learning_rate": 6.23885918003565e-08, "logits/chosen": -0.047932375222444534, "logits/rejected": 0.11526918411254883, "logps/chosen": -1.8458175659179688, "logps/rejected": -1.9971166849136353, "loss": 0.8152, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.8458175659179688, "rewards/margins": 0.15129896998405457, "rewards/rejected": -1.9971166849136353, "sft_loss": 1.5614925622940063, "step": 35 }, { "epoch": 0.021408262251212576, "grad_norm": 9.043739776427191, "learning_rate": 7.1301247771836e-08, "logits/chosen": 0.0321493037045002, "logits/rejected": 0.21030040085315704, "logps/chosen": -1.8800443410873413, "logps/rejected": -1.742314100265503, "loss": 0.8692, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -1.8800443410873413, "rewards/margins": -0.13773025572299957, "rewards/rejected": -1.742314100265503, "sft_loss": 1.5186289548873901, "step": 40 }, { "epoch": 0.02408429503261415, "grad_norm": 14.676100184258393, "learning_rate": 8.021390374331551e-08, "logits/chosen": 0.027946826070547104, "logits/rejected": 0.23048114776611328, "logps/chosen": -1.8351805210113525, "logps/rejected": -1.8704410791397095, "loss": 0.8339, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.8351805210113525, "rewards/margins": 0.03526050224900246, "rewards/rejected": -1.8704410791397095, "sft_loss": 1.5355684757232666, "step": 45 }, { "epoch": 0.026760327814015723, "grad_norm": 11.460982576779914, "learning_rate": 8.9126559714795e-08, "logits/chosen": -0.04780071973800659, "logits/rejected": 0.10815383493900299, "logps/chosen": -1.8967111110687256, "logps/rejected": -1.7773425579071045, "loss": 0.8844, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.8967111110687256, "rewards/margins": -0.11936845630407333, "rewards/rejected": -1.7773425579071045, "sft_loss": 1.5827276706695557, "step": 50 }, { "epoch": 0.029436360595417294, "grad_norm": 7.361779322900751, "learning_rate": 9.80392156862745e-08, "logits/chosen": -0.12230806052684784, "logits/rejected": 0.0970315933227539, "logps/chosen": -1.830540657043457, "logps/rejected": -1.8639686107635498, "loss": 0.8545, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.830540657043457, "rewards/margins": 0.03342791646718979, "rewards/rejected": -1.8639686107635498, "sft_loss": 1.5817229747772217, "step": 55 }, { "epoch": 0.032112393376818864, "grad_norm": 7.008292330029091, "learning_rate": 1.06951871657754e-07, "logits/chosen": -0.09996206313371658, "logits/rejected": 0.09421779960393906, "logps/chosen": -1.7850735187530518, "logps/rejected": -1.890228271484375, "loss": 0.7729, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.7850735187530518, "rewards/margins": 0.10515467822551727, "rewards/rejected": -1.890228271484375, "sft_loss": 1.5428695678710938, "step": 60 }, { "epoch": 0.03478842615822044, "grad_norm": 5.8428555921928265, "learning_rate": 1.158645276292335e-07, "logits/chosen": -0.01847922056913376, "logits/rejected": 0.13318422436714172, "logps/chosen": -1.6318439245224, "logps/rejected": -1.7609220743179321, "loss": 0.7337, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.6318439245224, "rewards/margins": 0.12907817959785461, "rewards/rejected": -1.7609220743179321, "sft_loss": 1.4719539880752563, "step": 65 }, { "epoch": 0.03746445893962201, "grad_norm": 11.535699964755835, "learning_rate": 1.24777183600713e-07, "logits/chosen": -0.06903735548257828, "logits/rejected": 0.08338301628828049, "logps/chosen": -1.7622463703155518, "logps/rejected": -1.8080501556396484, "loss": 0.8392, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -1.7622463703155518, "rewards/margins": 0.045803725719451904, "rewards/rejected": -1.8080501556396484, "sft_loss": 1.6276578903198242, "step": 70 }, { "epoch": 0.04014049172102358, "grad_norm": 12.274354805500392, "learning_rate": 1.3368983957219251e-07, "logits/chosen": -0.049991387873888016, "logits/rejected": 0.13196972012519836, "logps/chosen": -1.7706562280654907, "logps/rejected": -2.0294816493988037, "loss": 0.7273, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.7706562280654907, "rewards/margins": 0.25882548093795776, "rewards/rejected": -2.0294816493988037, "sft_loss": 1.5631906986236572, "step": 75 }, { "epoch": 0.04281652450242515, "grad_norm": 8.065691812498399, "learning_rate": 1.42602495543672e-07, "logits/chosen": 0.007000925950706005, "logits/rejected": 0.11386320739984512, "logps/chosen": -1.7066723108291626, "logps/rejected": -1.739593505859375, "loss": 0.7943, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.7066723108291626, "rewards/margins": 0.03292134404182434, "rewards/rejected": -1.739593505859375, "sft_loss": 1.5200769901275635, "step": 80 }, { "epoch": 0.04549255728382673, "grad_norm": 5.087382142819339, "learning_rate": 1.5151515151515152e-07, "logits/chosen": -0.13984394073486328, "logits/rejected": 0.11329865455627441, "logps/chosen": -1.770267128944397, "logps/rejected": -1.9428346157073975, "loss": 0.7577, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.770267128944397, "rewards/margins": 0.17256739735603333, "rewards/rejected": -1.9428346157073975, "sft_loss": 1.488335371017456, "step": 85 }, { "epoch": 0.0481685900652283, "grad_norm": 14.454887817717022, "learning_rate": 1.6042780748663102e-07, "logits/chosen": 0.10436830669641495, "logits/rejected": 0.06718367338180542, "logps/chosen": -1.7230262756347656, "logps/rejected": -1.757142424583435, "loss": 0.8211, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.7230262756347656, "rewards/margins": 0.03411626070737839, "rewards/rejected": -1.757142424583435, "sft_loss": 1.4506069421768188, "step": 90 }, { "epoch": 0.05084462284662987, "grad_norm": 5.421984139338058, "learning_rate": 1.693404634581105e-07, "logits/chosen": -0.06644740700721741, "logits/rejected": 0.08747534453868866, "logps/chosen": -1.7685035467147827, "logps/rejected": -1.884555459022522, "loss": 0.7823, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.7685035467147827, "rewards/margins": 0.11605201661586761, "rewards/rejected": -1.884555459022522, "sft_loss": 1.5129237174987793, "step": 95 }, { "epoch": 0.05352065562803145, "grad_norm": 4.635883271568906, "learning_rate": 1.7825311942959e-07, "logits/chosen": -0.027575846761465073, "logits/rejected": 0.040067609399557114, "logps/chosen": -1.6706348657608032, "logps/rejected": -1.7762954235076904, "loss": 0.7543, "rewards/accuracies": 0.5, "rewards/chosen": -1.6706348657608032, "rewards/margins": 0.1056608110666275, "rewards/rejected": -1.7762954235076904, "sft_loss": 1.4829761981964111, "step": 100 }, { "epoch": 0.05619668840943302, "grad_norm": 9.791136385045762, "learning_rate": 1.8716577540106952e-07, "logits/chosen": 0.06704328954219818, "logits/rejected": 0.09551501274108887, "logps/chosen": -1.6210968494415283, "logps/rejected": -1.790696144104004, "loss": 0.7318, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.6210968494415283, "rewards/margins": 0.16959939897060394, "rewards/rejected": -1.790696144104004, "sft_loss": 1.4277700185775757, "step": 105 }, { "epoch": 0.05887272119083459, "grad_norm": 6.014061678484121, "learning_rate": 1.96078431372549e-07, "logits/chosen": 0.0035847374238073826, "logits/rejected": 0.10112349689006805, "logps/chosen": -1.6423265933990479, "logps/rejected": -1.6988475322723389, "loss": 0.7825, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.6423265933990479, "rewards/margins": 0.05652119964361191, "rewards/rejected": -1.6988475322723389, "sft_loss": 1.4510281085968018, "step": 110 }, { "epoch": 0.06154875397223616, "grad_norm": 9.636781087531212, "learning_rate": 2.049910873440285e-07, "logits/chosen": 0.0216152872890234, "logits/rejected": 0.23170170187950134, "logps/chosen": -1.6151697635650635, "logps/rejected": -1.8834221363067627, "loss": 0.6893, "rewards/accuracies": 0.625, "rewards/chosen": -1.6151697635650635, "rewards/margins": 0.2682521939277649, "rewards/rejected": -1.8834221363067627, "sft_loss": 1.5399529933929443, "step": 115 }, { "epoch": 0.06422478675363773, "grad_norm": 5.837092063890294, "learning_rate": 2.13903743315508e-07, "logits/chosen": -0.09482669830322266, "logits/rejected": 0.07769424468278885, "logps/chosen": -1.670412302017212, "logps/rejected": -1.7851431369781494, "loss": 0.7476, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.670412302017212, "rewards/margins": 0.11473057419061661, "rewards/rejected": -1.7851431369781494, "sft_loss": 1.52475905418396, "step": 120 }, { "epoch": 0.0669008195350393, "grad_norm": 4.61651137551121, "learning_rate": 2.2281639928698751e-07, "logits/chosen": -0.0943833515048027, "logits/rejected": 0.035781342536211014, "logps/chosen": -1.601912498474121, "logps/rejected": -1.5632137060165405, "loss": 0.7998, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.601912498474121, "rewards/margins": -0.03869876265525818, "rewards/rejected": -1.5632137060165405, "sft_loss": 1.4977911710739136, "step": 125 }, { "epoch": 0.06957685231644088, "grad_norm": 8.07442245018302, "learning_rate": 2.31729055258467e-07, "logits/chosen": 0.04960859939455986, "logits/rejected": 0.18758010864257812, "logps/chosen": -1.6405330896377563, "logps/rejected": -1.759902000427246, "loss": 0.7124, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.6405330896377563, "rewards/margins": 0.11936911195516586, "rewards/rejected": -1.759902000427246, "sft_loss": 1.5563743114471436, "step": 130 }, { "epoch": 0.07225288509784245, "grad_norm": 15.363477991757385, "learning_rate": 2.406417112299465e-07, "logits/chosen": -0.0477161630988121, "logits/rejected": 0.07339635491371155, "logps/chosen": -1.6893672943115234, "logps/rejected": -1.7175499200820923, "loss": 0.7915, "rewards/accuracies": 0.5, "rewards/chosen": -1.6893672943115234, "rewards/margins": 0.028182348236441612, "rewards/rejected": -1.7175499200820923, "sft_loss": 1.4941551685333252, "step": 135 }, { "epoch": 0.07492891787924402, "grad_norm": 8.438465773901987, "learning_rate": 2.49554367201426e-07, "logits/chosen": -0.04275672882795334, "logits/rejected": 0.12644967436790466, "logps/chosen": -1.6506645679473877, "logps/rejected": -1.7775901556015015, "loss": 0.7327, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.6506645679473877, "rewards/margins": 0.12692561745643616, "rewards/rejected": -1.7775901556015015, "sft_loss": 1.5400911569595337, "step": 140 }, { "epoch": 0.0776049506606456, "grad_norm": 9.072723379650128, "learning_rate": 2.5846702317290554e-07, "logits/chosen": -0.02842085435986519, "logits/rejected": 0.12878260016441345, "logps/chosen": -1.56234610080719, "logps/rejected": -1.6720802783966064, "loss": 0.7311, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -1.56234610080719, "rewards/margins": 0.10973384231328964, "rewards/rejected": -1.6720802783966064, "sft_loss": 1.4896290302276611, "step": 145 }, { "epoch": 0.08028098344204716, "grad_norm": 11.917688310921138, "learning_rate": 2.6737967914438503e-07, "logits/chosen": -0.08208204805850983, "logits/rejected": 0.0795825868844986, "logps/chosen": -1.5122708082199097, "logps/rejected": -1.5111819505691528, "loss": 0.7804, "rewards/accuracies": 0.5, "rewards/chosen": -1.5122708082199097, "rewards/margins": -0.0010888517135754228, "rewards/rejected": -1.5111819505691528, "sft_loss": 1.3492343425750732, "step": 150 }, { "epoch": 0.08295701622344874, "grad_norm": 7.993571495447725, "learning_rate": 2.762923351158645e-07, "logits/chosen": -0.04924124851822853, "logits/rejected": 0.004975716583430767, "logps/chosen": -1.5232518911361694, "logps/rejected": -1.6216939687728882, "loss": 0.7286, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5232518911361694, "rewards/margins": 0.098441943526268, "rewards/rejected": -1.6216939687728882, "sft_loss": 1.4322659969329834, "step": 155 }, { "epoch": 0.0856330490048503, "grad_norm": 7.499988200221608, "learning_rate": 2.85204991087344e-07, "logits/chosen": -0.1464027464389801, "logits/rejected": -0.003987524192780256, "logps/chosen": -1.6318111419677734, "logps/rejected": -1.6092545986175537, "loss": 0.805, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -1.6318111419677734, "rewards/margins": -0.02255646511912346, "rewards/rejected": -1.6092545986175537, "sft_loss": 1.4808762073516846, "step": 160 }, { "epoch": 0.08830908178625188, "grad_norm": 7.056161316365935, "learning_rate": 2.941176470588235e-07, "logits/chosen": -0.07448373734951019, "logits/rejected": 0.09501216560602188, "logps/chosen": -1.4764728546142578, "logps/rejected": -1.5967134237289429, "loss": 0.7312, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.4764728546142578, "rewards/margins": 0.1202404722571373, "rewards/rejected": -1.5967134237289429, "sft_loss": 1.372351050376892, "step": 165 }, { "epoch": 0.09098511456765346, "grad_norm": 13.80003838568036, "learning_rate": 3.0303030303030305e-07, "logits/chosen": -0.09532450139522552, "logits/rejected": -0.03916095569729805, "logps/chosen": -1.5926252603530884, "logps/rejected": -1.6508491039276123, "loss": 0.7664, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.5926252603530884, "rewards/margins": 0.0582241527736187, "rewards/rejected": -1.6508491039276123, "sft_loss": 1.4843275547027588, "step": 170 }, { "epoch": 0.09366114734905502, "grad_norm": 7.915014598111359, "learning_rate": 3.1194295900178254e-07, "logits/chosen": 0.04395443946123123, "logits/rejected": 0.04227130487561226, "logps/chosen": -1.4485992193222046, "logps/rejected": -1.549605369567871, "loss": 0.73, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.4485992193222046, "rewards/margins": 0.1010061502456665, "rewards/rejected": -1.549605369567871, "sft_loss": 1.4186640977859497, "step": 175 }, { "epoch": 0.0963371801304566, "grad_norm": 7.065980369491947, "learning_rate": 3.2085561497326203e-07, "logits/chosen": -0.0706276148557663, "logits/rejected": -0.07220318913459778, "logps/chosen": -1.4419519901275635, "logps/rejected": -1.6293582916259766, "loss": 0.7267, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4419519901275635, "rewards/margins": 0.18740621209144592, "rewards/rejected": -1.6293582916259766, "sft_loss": 1.414329171180725, "step": 180 }, { "epoch": 0.09901321291185818, "grad_norm": 6.786407778120364, "learning_rate": 3.297682709447415e-07, "logits/chosen": -0.16844771802425385, "logits/rejected": -0.08266721665859222, "logps/chosen": -1.3996317386627197, "logps/rejected": -1.4599860906600952, "loss": 0.7491, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.3996317386627197, "rewards/margins": 0.0603543221950531, "rewards/rejected": -1.4599860906600952, "sft_loss": 1.3800976276397705, "step": 185 }, { "epoch": 0.10168924569325974, "grad_norm": 7.300767239806578, "learning_rate": 3.38680926916221e-07, "logits/chosen": -0.0973626971244812, "logits/rejected": 0.022847438231110573, "logps/chosen": -1.3400040864944458, "logps/rejected": -1.474424123764038, "loss": 0.6999, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3400040864944458, "rewards/margins": 0.13442011177539825, "rewards/rejected": -1.474424123764038, "sft_loss": 1.327915906906128, "step": 190 }, { "epoch": 0.10436527847466132, "grad_norm": 4.64361809307048, "learning_rate": 3.475935828877005e-07, "logits/chosen": -0.007407195866107941, "logits/rejected": 0.14789652824401855, "logps/chosen": -1.2831056118011475, "logps/rejected": -1.4523200988769531, "loss": 0.6801, "rewards/accuracies": 0.59375, "rewards/chosen": -1.2831056118011475, "rewards/margins": 0.16921459138393402, "rewards/rejected": -1.4523200988769531, "sft_loss": 1.3099342584609985, "step": 195 }, { "epoch": 0.1070413112560629, "grad_norm": 14.575211650225327, "learning_rate": 3.5650623885918e-07, "logits/chosen": -0.11465966701507568, "logits/rejected": 0.021672243252396584, "logps/chosen": -1.4081348180770874, "logps/rejected": -1.4467805624008179, "loss": 0.7459, "rewards/accuracies": 0.53125, "rewards/chosen": -1.4081348180770874, "rewards/margins": 0.03864575922489166, "rewards/rejected": -1.4467805624008179, "sft_loss": 1.4106323719024658, "step": 200 }, { "epoch": 0.10971734403746446, "grad_norm": 10.129209630022192, "learning_rate": 3.654188948306595e-07, "logits/chosen": -0.10025770962238312, "logits/rejected": 0.03735864534974098, "logps/chosen": -1.3229440450668335, "logps/rejected": -1.3909223079681396, "loss": 0.7343, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3229440450668335, "rewards/margins": 0.06797824800014496, "rewards/rejected": -1.3909223079681396, "sft_loss": 1.301814079284668, "step": 205 }, { "epoch": 0.11239337681886603, "grad_norm": 7.458541707417236, "learning_rate": 3.7433155080213904e-07, "logits/chosen": -0.18131954967975616, "logits/rejected": 0.0006055116537027061, "logps/chosen": -1.3974190950393677, "logps/rejected": -1.5311411619186401, "loss": 0.7184, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3974190950393677, "rewards/margins": 0.13372211158275604, "rewards/rejected": -1.5311411619186401, "sft_loss": 1.3603262901306152, "step": 210 }, { "epoch": 0.1150694096002676, "grad_norm": 5.689970505577374, "learning_rate": 3.8324420677361853e-07, "logits/chosen": -0.21051593124866486, "logits/rejected": 0.03054944798350334, "logps/chosen": -1.4112319946289062, "logps/rejected": -1.481041431427002, "loss": 0.7194, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4112319946289062, "rewards/margins": 0.06980942189693451, "rewards/rejected": -1.481041431427002, "sft_loss": 1.383886456489563, "step": 215 }, { "epoch": 0.11774544238166917, "grad_norm": 12.530800803177595, "learning_rate": 3.92156862745098e-07, "logits/chosen": 0.034305017441511154, "logits/rejected": 0.13380172848701477, "logps/chosen": -1.3511821031570435, "logps/rejected": -1.5188138484954834, "loss": 0.6987, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3511821031570435, "rewards/margins": 0.16763189435005188, "rewards/rejected": -1.5188138484954834, "sft_loss": 1.360771656036377, "step": 220 }, { "epoch": 0.12042147516307075, "grad_norm": 4.653708323105172, "learning_rate": 4.010695187165775e-07, "logits/chosen": -0.12555362284183502, "logits/rejected": 0.04124899208545685, "logps/chosen": -1.353245496749878, "logps/rejected": -1.4943913221359253, "loss": 0.6904, "rewards/accuracies": 0.5625, "rewards/chosen": -1.353245496749878, "rewards/margins": 0.14114579558372498, "rewards/rejected": -1.4943913221359253, "sft_loss": 1.3382834196090698, "step": 225 }, { "epoch": 0.12309750794447231, "grad_norm": 5.33403533842893, "learning_rate": 4.09982174688057e-07, "logits/chosen": -0.031155142933130264, "logits/rejected": 0.045900508761405945, "logps/chosen": -1.3820817470550537, "logps/rejected": -1.5517305135726929, "loss": 0.6978, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3820817470550537, "rewards/margins": 0.16964863240718842, "rewards/rejected": -1.5517305135726929, "sft_loss": 1.3105311393737793, "step": 230 }, { "epoch": 0.1257735407258739, "grad_norm": 8.706238550801956, "learning_rate": 4.188948306595365e-07, "logits/chosen": -0.032305438071489334, "logits/rejected": 0.09844879806041718, "logps/chosen": -1.3472206592559814, "logps/rejected": -1.5170398950576782, "loss": 0.6835, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3472206592559814, "rewards/margins": 0.16981934010982513, "rewards/rejected": -1.5170398950576782, "sft_loss": 1.3197492361068726, "step": 235 }, { "epoch": 0.12844957350727546, "grad_norm": 4.084985722679438, "learning_rate": 4.27807486631016e-07, "logits/chosen": -0.05923212692141533, "logits/rejected": 0.06422128528356552, "logps/chosen": -1.3529850244522095, "logps/rejected": -1.5478001832962036, "loss": 0.6863, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3529850244522095, "rewards/margins": 0.19481512904167175, "rewards/rejected": -1.5478001832962036, "sft_loss": 1.3814536333084106, "step": 240 }, { "epoch": 0.13112560628867703, "grad_norm": 6.9924765863104374, "learning_rate": 4.3672014260249554e-07, "logits/chosen": 0.0005951419589109719, "logits/rejected": 0.11696485430002213, "logps/chosen": -1.4723063707351685, "logps/rejected": -1.5162469148635864, "loss": 0.748, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4723063707351685, "rewards/margins": 0.0439404733479023, "rewards/rejected": -1.5162469148635864, "sft_loss": 1.464321255683899, "step": 245 }, { "epoch": 0.1338016390700786, "grad_norm": 11.619283656380444, "learning_rate": 4.4563279857397503e-07, "logits/chosen": -0.10031737387180328, "logits/rejected": 0.05885583162307739, "logps/chosen": -1.3780508041381836, "logps/rejected": -1.4445879459381104, "loss": 0.7493, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.3780508041381836, "rewards/margins": 0.06653715670108795, "rewards/rejected": -1.4445879459381104, "sft_loss": 1.3511133193969727, "step": 250 }, { "epoch": 0.1364776718514802, "grad_norm": 8.252245686424416, "learning_rate": 4.545454545454545e-07, "logits/chosen": -0.04596395045518875, "logits/rejected": 0.09472165256738663, "logps/chosen": -1.327968955039978, "logps/rejected": -1.4561989307403564, "loss": 0.6962, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.327968955039978, "rewards/margins": 0.12822984158992767, "rewards/rejected": -1.4561989307403564, "sft_loss": 1.278329849243164, "step": 255 }, { "epoch": 0.13915370463288176, "grad_norm": 5.269952180322018, "learning_rate": 4.63458110516934e-07, "logits/chosen": -0.2541922330856323, "logits/rejected": -0.1493658423423767, "logps/chosen": -1.4302836656570435, "logps/rejected": -1.5907987356185913, "loss": 0.6847, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4302836656570435, "rewards/margins": 0.1605151742696762, "rewards/rejected": -1.5907987356185913, "sft_loss": 1.414287805557251, "step": 260 }, { "epoch": 0.1418297374142833, "grad_norm": 7.611663736654725, "learning_rate": 4.723707664884135e-07, "logits/chosen": -0.09324956685304642, "logits/rejected": -0.008047522976994514, "logps/chosen": -1.42179536819458, "logps/rejected": -1.5962066650390625, "loss": 0.699, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.42179536819458, "rewards/margins": 0.1744113266468048, "rewards/rejected": -1.5962066650390625, "sft_loss": 1.4485552310943604, "step": 265 }, { "epoch": 0.1445057701956849, "grad_norm": 4.296214658770507, "learning_rate": 4.81283422459893e-07, "logits/chosen": -0.09900476038455963, "logits/rejected": 0.034304648637771606, "logps/chosen": -1.3736134767532349, "logps/rejected": -1.4784634113311768, "loss": 0.704, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3736134767532349, "rewards/margins": 0.10485007613897324, "rewards/rejected": -1.4784634113311768, "sft_loss": 1.3692827224731445, "step": 270 }, { "epoch": 0.14718180297708647, "grad_norm": 5.364644455566007, "learning_rate": 4.901960784313725e-07, "logits/chosen": -0.04653949290513992, "logits/rejected": 0.05055801197886467, "logps/chosen": -1.3204883337020874, "logps/rejected": -1.5095690488815308, "loss": 0.6815, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3204883337020874, "rewards/margins": 0.18908073008060455, "rewards/rejected": -1.5095690488815308, "sft_loss": 1.2910387516021729, "step": 275 }, { "epoch": 0.14985783575848804, "grad_norm": 6.003631973478166, "learning_rate": 4.99108734402852e-07, "logits/chosen": -0.10684315115213394, "logits/rejected": 0.05572965741157532, "logps/chosen": -1.391960859298706, "logps/rejected": -1.5029346942901611, "loss": 0.7129, "rewards/accuracies": 0.5625, "rewards/chosen": -1.391960859298706, "rewards/margins": 0.11097397655248642, "rewards/rejected": -1.5029346942901611, "sft_loss": 1.3595014810562134, "step": 280 }, { "epoch": 0.15253386853988962, "grad_norm": 6.480892543259098, "learning_rate": 5.080213903743315e-07, "logits/chosen": -0.07482358068227768, "logits/rejected": 0.06803113967180252, "logps/chosen": -1.3924249410629272, "logps/rejected": -1.4932241439819336, "loss": 0.727, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3924249410629272, "rewards/margins": 0.1007990688085556, "rewards/rejected": -1.4932241439819336, "sft_loss": 1.4270622730255127, "step": 285 }, { "epoch": 0.1552099013212912, "grad_norm": 6.616276424059221, "learning_rate": 5.169340463458111e-07, "logits/chosen": -0.12157426029443741, "logits/rejected": 0.18175940215587616, "logps/chosen": -1.414035439491272, "logps/rejected": -1.5550696849822998, "loss": 0.6967, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.414035439491272, "rewards/margins": 0.14103442430496216, "rewards/rejected": -1.5550696849822998, "sft_loss": 1.3971197605133057, "step": 290 }, { "epoch": 0.15788593410269275, "grad_norm": 7.138403141788474, "learning_rate": 5.258467023172905e-07, "logits/chosen": -0.05374305695295334, "logits/rejected": 0.003037288784980774, "logps/chosen": -1.3270564079284668, "logps/rejected": -1.4653503894805908, "loss": 0.691, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.3270564079284668, "rewards/margins": 0.13829405605793, "rewards/rejected": -1.4653503894805908, "sft_loss": 1.3143774271011353, "step": 295 }, { "epoch": 0.16056196688409433, "grad_norm": 5.567542205148069, "learning_rate": 5.347593582887701e-07, "logits/chosen": -0.0868334174156189, "logits/rejected": 0.08281473815441132, "logps/chosen": -1.3654279708862305, "logps/rejected": -1.4594337940216064, "loss": 0.7129, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3654279708862305, "rewards/margins": 0.09400572627782822, "rewards/rejected": -1.4594337940216064, "sft_loss": 1.4067200422286987, "step": 300 }, { "epoch": 0.1632379996654959, "grad_norm": 4.6470009273355535, "learning_rate": 5.436720142602496e-07, "logits/chosen": -0.010817406699061394, "logits/rejected": 0.0604841411113739, "logps/chosen": -1.4783557653427124, "logps/rejected": -1.4822551012039185, "loss": 0.7712, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -1.4783557653427124, "rewards/margins": 0.0038991898763924837, "rewards/rejected": -1.4822551012039185, "sft_loss": 1.456894874572754, "step": 305 }, { "epoch": 0.16591403244689748, "grad_norm": 7.661301028049097, "learning_rate": 5.52584670231729e-07, "logits/chosen": -0.20085179805755615, "logits/rejected": -0.10509626567363739, "logps/chosen": -1.4366939067840576, "logps/rejected": -1.537825345993042, "loss": 0.7375, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.4366939067840576, "rewards/margins": 0.10113133490085602, "rewards/rejected": -1.537825345993042, "sft_loss": 1.4186075925827026, "step": 310 }, { "epoch": 0.16859006522829906, "grad_norm": 8.327325233802375, "learning_rate": 5.614973262032086e-07, "logits/chosen": -0.02356710098683834, "logits/rejected": 0.13607418537139893, "logps/chosen": -1.429595708847046, "logps/rejected": -1.6014493703842163, "loss": 0.7139, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.429595708847046, "rewards/margins": 0.17185349762439728, "rewards/rejected": -1.6014493703842163, "sft_loss": 1.4313127994537354, "step": 315 }, { "epoch": 0.1712660980097006, "grad_norm": 4.427112532919185, "learning_rate": 5.70409982174688e-07, "logits/chosen": -0.06657937169075012, "logits/rejected": 0.06765355914831161, "logps/chosen": -1.3815885782241821, "logps/rejected": -1.445359230041504, "loss": 0.7261, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.3815885782241821, "rewards/margins": 0.06377061456441879, "rewards/rejected": -1.445359230041504, "sft_loss": 1.3856720924377441, "step": 320 }, { "epoch": 0.17394213079110218, "grad_norm": 7.529891394125554, "learning_rate": 5.793226381461676e-07, "logits/chosen": -0.15291689336299896, "logits/rejected": -0.03675685077905655, "logps/chosen": -1.3926328420639038, "logps/rejected": -1.6686160564422607, "loss": 0.6705, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3926328420639038, "rewards/margins": 0.2759833335876465, "rewards/rejected": -1.6686160564422607, "sft_loss": 1.453553557395935, "step": 325 }, { "epoch": 0.17661816357250376, "grad_norm": 10.632517925413287, "learning_rate": 5.88235294117647e-07, "logits/chosen": -0.023518884554505348, "logits/rejected": 0.12941356003284454, "logps/chosen": -1.4052571058273315, "logps/rejected": -1.6226956844329834, "loss": 0.673, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4052571058273315, "rewards/margins": 0.21743862330913544, "rewards/rejected": -1.6226956844329834, "sft_loss": 1.4034149646759033, "step": 330 }, { "epoch": 0.17929419635390534, "grad_norm": 11.086324499799552, "learning_rate": 5.971479500891266e-07, "logits/chosen": 0.021479438990354538, "logits/rejected": 0.12531518936157227, "logps/chosen": -1.42387855052948, "logps/rejected": -1.4794212579727173, "loss": 0.724, "rewards/accuracies": 0.5, "rewards/chosen": -1.42387855052948, "rewards/margins": 0.05554261803627014, "rewards/rejected": -1.4794212579727173, "sft_loss": 1.3950622081756592, "step": 335 }, { "epoch": 0.18197022913530692, "grad_norm": 10.388558650210811, "learning_rate": 6.060606060606061e-07, "logits/chosen": -0.03921655938029289, "logits/rejected": 0.11117533594369888, "logps/chosen": -1.5144026279449463, "logps/rejected": -1.6030089855194092, "loss": 0.7484, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.5144026279449463, "rewards/margins": 0.08860644698143005, "rewards/rejected": -1.6030089855194092, "sft_loss": 1.4564971923828125, "step": 340 }, { "epoch": 0.1846462619167085, "grad_norm": 10.196942882530658, "learning_rate": 6.149732620320855e-07, "logits/chosen": 0.047799251973629, "logits/rejected": 0.0809812992811203, "logps/chosen": -1.4201780557632446, "logps/rejected": -1.5810493230819702, "loss": 0.6994, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4201780557632446, "rewards/margins": 0.16087140142917633, "rewards/rejected": -1.5810493230819702, "sft_loss": 1.4226887226104736, "step": 345 }, { "epoch": 0.18732229469811004, "grad_norm": 8.817739794382337, "learning_rate": 6.238859180035651e-07, "logits/chosen": 0.005632379557937384, "logits/rejected": 0.10080881416797638, "logps/chosen": -1.3695688247680664, "logps/rejected": -1.4985499382019043, "loss": 0.7175, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.3695688247680664, "rewards/margins": 0.12898120284080505, "rewards/rejected": -1.4985499382019043, "sft_loss": 1.3992822170257568, "step": 350 }, { "epoch": 0.18999832747951162, "grad_norm": 6.017069129228117, "learning_rate": 6.327985739750445e-07, "logits/chosen": -0.11661320924758911, "logits/rejected": 0.10363030433654785, "logps/chosen": -1.4653418064117432, "logps/rejected": -1.5187338590621948, "loss": 0.7471, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.4653418064117432, "rewards/margins": 0.05339198186993599, "rewards/rejected": -1.5187338590621948, "sft_loss": 1.460890769958496, "step": 355 }, { "epoch": 0.1926743602609132, "grad_norm": 5.990445407054706, "learning_rate": 6.417112299465241e-07, "logits/chosen": -0.07277899235486984, "logits/rejected": 0.009725173935294151, "logps/chosen": -1.4071974754333496, "logps/rejected": -1.5529217720031738, "loss": 0.7105, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4071974754333496, "rewards/margins": 0.14572428166866302, "rewards/rejected": -1.5529217720031738, "sft_loss": 1.3624567985534668, "step": 360 }, { "epoch": 0.19535039304231477, "grad_norm": 10.46336449448534, "learning_rate": 6.506238859180035e-07, "logits/chosen": -0.016026372089982033, "logits/rejected": 0.06777580082416534, "logps/chosen": -1.3677482604980469, "logps/rejected": -1.4697232246398926, "loss": 0.7235, "rewards/accuracies": 0.625, "rewards/chosen": -1.3677482604980469, "rewards/margins": 0.10197494179010391, "rewards/rejected": -1.4697232246398926, "sft_loss": 1.3233493566513062, "step": 365 }, { "epoch": 0.19802642582371635, "grad_norm": 6.70527148965871, "learning_rate": 6.59536541889483e-07, "logits/chosen": -0.03155245631933212, "logits/rejected": 0.06311879307031631, "logps/chosen": -1.3529279232025146, "logps/rejected": -1.4192687273025513, "loss": 0.7291, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3529279232025146, "rewards/margins": 0.0663408562541008, "rewards/rejected": -1.4192687273025513, "sft_loss": 1.3164355754852295, "step": 370 }, { "epoch": 0.2007024586051179, "grad_norm": 6.213682685842821, "learning_rate": 6.684491978609626e-07, "logits/chosen": -0.10043720155954361, "logits/rejected": 0.0535179078578949, "logps/chosen": -1.332228422164917, "logps/rejected": -1.5000375509262085, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.332228422164917, "rewards/margins": 0.16780902445316315, "rewards/rejected": -1.5000375509262085, "sft_loss": 1.3726236820220947, "step": 375 }, { "epoch": 0.20337849138651948, "grad_norm": 5.956018695417195, "learning_rate": 6.77361853832442e-07, "logits/chosen": -0.06339693814516068, "logits/rejected": 0.02334422990679741, "logps/chosen": -1.3552122116088867, "logps/rejected": -1.5462400913238525, "loss": 0.6773, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3552122116088867, "rewards/margins": 0.1910279095172882, "rewards/rejected": -1.5462400913238525, "sft_loss": 1.3529400825500488, "step": 380 }, { "epoch": 0.20605452416792105, "grad_norm": 4.1253425708857, "learning_rate": 6.862745098039216e-07, "logits/chosen": -0.02446773275732994, "logits/rejected": 0.0541352704167366, "logps/chosen": -1.4543194770812988, "logps/rejected": -1.466575026512146, "loss": 0.7721, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.4543194770812988, "rewards/margins": 0.012255474925041199, "rewards/rejected": -1.466575026512146, "sft_loss": 1.4507054090499878, "step": 385 }, { "epoch": 0.20873055694932263, "grad_norm": 8.907745404115765, "learning_rate": 6.95187165775401e-07, "logits/chosen": 0.06300806254148483, "logits/rejected": 0.23759731650352478, "logps/chosen": -1.454111933708191, "logps/rejected": -1.5439735651016235, "loss": 0.7385, "rewards/accuracies": 0.5, "rewards/chosen": -1.454111933708191, "rewards/margins": 0.08986148983240128, "rewards/rejected": -1.5439735651016235, "sft_loss": 1.4432179927825928, "step": 390 }, { "epoch": 0.2114065897307242, "grad_norm": 7.03912409933743, "learning_rate": 7.040998217468806e-07, "logits/chosen": -0.07751835882663727, "logits/rejected": 0.08364540338516235, "logps/chosen": -1.403145432472229, "logps/rejected": -1.4548805952072144, "loss": 0.7233, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.403145432472229, "rewards/margins": 0.05173531919717789, "rewards/rejected": -1.4548805952072144, "sft_loss": 1.3948204517364502, "step": 395 }, { "epoch": 0.2140826225121258, "grad_norm": 11.640061370540616, "learning_rate": 7.1301247771836e-07, "logits/chosen": 0.048798851668834686, "logits/rejected": 0.1456696093082428, "logps/chosen": -1.4081380367279053, "logps/rejected": -1.5352369546890259, "loss": 0.7019, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4081380367279053, "rewards/margins": 0.12709888815879822, "rewards/rejected": -1.5352369546890259, "sft_loss": 1.371368646621704, "step": 400 }, { "epoch": 0.2140826225121258, "eval_logits/chosen": 0.21375156939029694, "eval_logits/rejected": 0.2993115484714508, "eval_logps/chosen": -1.4375065565109253, "eval_logps/rejected": -1.6032114028930664, "eval_loss": 0.6977089643478394, "eval_rewards/accuracies": 0.5630564093589783, "eval_rewards/chosen": -1.4375065565109253, "eval_rewards/margins": 0.16570471227169037, "eval_rewards/rejected": -1.6032114028930664, "eval_runtime": 49.8197, "eval_samples_per_second": 26.997, "eval_sft_loss": 1.4218635559082031, "eval_steps_per_second": 6.764, "step": 400 }, { "epoch": 0.21675865529352734, "grad_norm": 6.375356479218084, "learning_rate": 7.219251336898395e-07, "logits/chosen": -0.04546252638101578, "logits/rejected": 0.04799600690603256, "logps/chosen": -1.4380704164505005, "logps/rejected": -1.5441672801971436, "loss": 0.723, "rewards/accuracies": 0.5, "rewards/chosen": -1.4380704164505005, "rewards/margins": 0.10609670728445053, "rewards/rejected": -1.5441672801971436, "sft_loss": 1.3966643810272217, "step": 405 }, { "epoch": 0.2194346880749289, "grad_norm": 9.424186700278675, "learning_rate": 7.30837789661319e-07, "logits/chosen": -0.0008656397694721818, "logits/rejected": 0.13088323175907135, "logps/chosen": -1.4057133197784424, "logps/rejected": -1.5324071645736694, "loss": 0.7085, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.4057133197784424, "rewards/margins": 0.12669387459754944, "rewards/rejected": -1.5324071645736694, "sft_loss": 1.4035170078277588, "step": 410 }, { "epoch": 0.2221107208563305, "grad_norm": 5.854251356791351, "learning_rate": 7.397504456327985e-07, "logits/chosen": -0.028660956770181656, "logits/rejected": 0.012854715809226036, "logps/chosen": -1.3972389698028564, "logps/rejected": -1.5771993398666382, "loss": 0.6901, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3972389698028564, "rewards/margins": 0.17996013164520264, "rewards/rejected": -1.5771993398666382, "sft_loss": 1.380652666091919, "step": 415 }, { "epoch": 0.22478675363773207, "grad_norm": 5.815774407814431, "learning_rate": 7.486631016042781e-07, "logits/chosen": -0.029275968670845032, "logits/rejected": 0.16776686906814575, "logps/chosen": -1.361322283744812, "logps/rejected": -1.4909484386444092, "loss": 0.7079, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.361322283744812, "rewards/margins": 0.1296263039112091, "rewards/rejected": -1.4909484386444092, "sft_loss": 1.3848352432250977, "step": 420 }, { "epoch": 0.22746278641913364, "grad_norm": 7.325771410324191, "learning_rate": 7.575757575757575e-07, "logits/chosen": -0.07150163501501083, "logits/rejected": 0.1298082172870636, "logps/chosen": -1.417018175125122, "logps/rejected": -1.6245094537734985, "loss": 0.6712, "rewards/accuracies": 0.625, "rewards/chosen": -1.417018175125122, "rewards/margins": 0.20749130845069885, "rewards/rejected": -1.6245094537734985, "sft_loss": 1.457844853401184, "step": 425 }, { "epoch": 0.2301388192005352, "grad_norm": 7.679720354675964, "learning_rate": 7.664884135472371e-07, "logits/chosen": -0.11018051207065582, "logits/rejected": 0.08682509511709213, "logps/chosen": -1.4306937456130981, "logps/rejected": -1.6434657573699951, "loss": 0.6759, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4306937456130981, "rewards/margins": 0.2127722203731537, "rewards/rejected": -1.6434657573699951, "sft_loss": 1.4518373012542725, "step": 430 }, { "epoch": 0.23281485198193677, "grad_norm": 8.169121159593331, "learning_rate": 7.754010695187165e-07, "logits/chosen": -0.02098211646080017, "logits/rejected": 0.07202951610088348, "logps/chosen": -1.31760573387146, "logps/rejected": -1.4644668102264404, "loss": 0.6876, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.31760573387146, "rewards/margins": 0.14686112105846405, "rewards/rejected": -1.4644668102264404, "sft_loss": 1.356937289237976, "step": 435 }, { "epoch": 0.23549088476333835, "grad_norm": 5.008493811115046, "learning_rate": 7.84313725490196e-07, "logits/chosen": -0.009568731300532818, "logits/rejected": 0.0893973559141159, "logps/chosen": -1.3814446926116943, "logps/rejected": -1.5284905433654785, "loss": 0.6879, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.3814446926116943, "rewards/margins": 0.14704598486423492, "rewards/rejected": -1.5284905433654785, "sft_loss": 1.3872871398925781, "step": 440 }, { "epoch": 0.23816691754473993, "grad_norm": 6.714628106326379, "learning_rate": 7.932263814616755e-07, "logits/chosen": -0.07186368852853775, "logits/rejected": 0.04148939996957779, "logps/chosen": -1.4153398275375366, "logps/rejected": -1.6106348037719727, "loss": 0.6975, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4153398275375366, "rewards/margins": 0.19529494643211365, "rewards/rejected": -1.6106348037719727, "sft_loss": 1.4277794361114502, "step": 445 }, { "epoch": 0.2408429503261415, "grad_norm": 9.004914374033351, "learning_rate": 8.02139037433155e-07, "logits/chosen": 0.016086876392364502, "logits/rejected": 0.14900197088718414, "logps/chosen": -1.4307498931884766, "logps/rejected": -1.599519968032837, "loss": 0.6705, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.4307498931884766, "rewards/margins": 0.1687699854373932, "rewards/rejected": -1.599519968032837, "sft_loss": 1.3819632530212402, "step": 450 }, { "epoch": 0.24351898310754308, "grad_norm": 6.10990838233586, "learning_rate": 8.110516934046346e-07, "logits/chosen": 0.01322667021304369, "logits/rejected": 0.10899517685174942, "logps/chosen": -1.3523483276367188, "logps/rejected": -1.5968722105026245, "loss": 0.6662, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3523483276367188, "rewards/margins": 0.24452391266822815, "rewards/rejected": -1.5968722105026245, "sft_loss": 1.3561742305755615, "step": 455 }, { "epoch": 0.24619501588894463, "grad_norm": 6.549737853320029, "learning_rate": 8.19964349376114e-07, "logits/chosen": -0.12617279589176178, "logits/rejected": 0.0016090974677354097, "logps/chosen": -1.479893445968628, "logps/rejected": -1.5853257179260254, "loss": 0.7197, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.479893445968628, "rewards/margins": 0.1054321900010109, "rewards/rejected": -1.5853257179260254, "sft_loss": 1.4918811321258545, "step": 460 }, { "epoch": 0.2488710486703462, "grad_norm": 6.783572560298443, "learning_rate": 8.288770053475936e-07, "logits/chosen": 0.12295699119567871, "logits/rejected": 0.14290973544120789, "logps/chosen": -1.4391229152679443, "logps/rejected": -1.633167028427124, "loss": 0.7003, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.4391229152679443, "rewards/margins": 0.1940440833568573, "rewards/rejected": -1.633167028427124, "sft_loss": 1.4124929904937744, "step": 465 }, { "epoch": 0.2515470814517478, "grad_norm": 6.340493689806178, "learning_rate": 8.37789661319073e-07, "logits/chosen": 0.15927724540233612, "logits/rejected": 0.10868068784475327, "logps/chosen": -1.3759548664093018, "logps/rejected": -1.5975778102874756, "loss": 0.6741, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3759548664093018, "rewards/margins": 0.22162313759326935, "rewards/rejected": -1.5975778102874756, "sft_loss": 1.3803473711013794, "step": 470 }, { "epoch": 0.25422311423314936, "grad_norm": 5.897942882416216, "learning_rate": 8.467023172905525e-07, "logits/chosen": -0.06146723031997681, "logits/rejected": 0.08610192686319351, "logps/chosen": -1.4093339443206787, "logps/rejected": -1.7378123998641968, "loss": 0.6464, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.4093339443206787, "rewards/margins": 0.3284783959388733, "rewards/rejected": -1.7378123998641968, "sft_loss": 1.4273899793624878, "step": 475 }, { "epoch": 0.2568991470145509, "grad_norm": 6.98144339229632, "learning_rate": 8.55614973262032e-07, "logits/chosen": -0.059163518249988556, "logits/rejected": 0.14092698693275452, "logps/chosen": -1.3883863687515259, "logps/rejected": -1.507129430770874, "loss": 0.7007, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.3883863687515259, "rewards/margins": 0.11874298751354218, "rewards/rejected": -1.507129430770874, "sft_loss": 1.3975694179534912, "step": 480 }, { "epoch": 0.2595751797959525, "grad_norm": 7.168833653699756, "learning_rate": 8.645276292335115e-07, "logits/chosen": -0.0012844301527366042, "logits/rejected": 0.04530390724539757, "logps/chosen": -1.5146420001983643, "logps/rejected": -1.6267915964126587, "loss": 0.7237, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.5146420001983643, "rewards/margins": 0.11214945465326309, "rewards/rejected": -1.6267915964126587, "sft_loss": 1.478551983833313, "step": 485 }, { "epoch": 0.26225121257735406, "grad_norm": 5.927038210209542, "learning_rate": 8.734402852049911e-07, "logits/chosen": 0.0074288249015808105, "logits/rejected": 0.07527925074100494, "logps/chosen": -1.4502588510513306, "logps/rejected": -1.5668810606002808, "loss": 0.7244, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.4502588510513306, "rewards/margins": 0.116622194647789, "rewards/rejected": -1.5668810606002808, "sft_loss": 1.4134190082550049, "step": 490 }, { "epoch": 0.26492724535875567, "grad_norm": 7.74594919759278, "learning_rate": 8.823529411764705e-07, "logits/chosen": -0.05722605064511299, "logits/rejected": -0.031624868512153625, "logps/chosen": -1.4506053924560547, "logps/rejected": -1.5846562385559082, "loss": 0.7083, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.4506053924560547, "rewards/margins": 0.13405072689056396, "rewards/rejected": -1.5846562385559082, "sft_loss": 1.4903643131256104, "step": 495 }, { "epoch": 0.2676032781401572, "grad_norm": 5.9093862472476335, "learning_rate": 8.912655971479501e-07, "logits/chosen": -0.05097437649965286, "logits/rejected": 0.05235857516527176, "logps/chosen": -1.3838527202606201, "logps/rejected": -1.5711188316345215, "loss": 0.6943, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.3838527202606201, "rewards/margins": 0.1872660517692566, "rewards/rejected": -1.5711188316345215, "sft_loss": 1.3825076818466187, "step": 500 }, { "epoch": 0.27027931092155877, "grad_norm": 9.934857874379231, "learning_rate": 9.001782531194295e-07, "logits/chosen": -0.09654500335454941, "logits/rejected": 0.04827988147735596, "logps/chosen": -1.4918394088745117, "logps/rejected": -1.5619932413101196, "loss": 0.7319, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4918394088745117, "rewards/margins": 0.07015396654605865, "rewards/rejected": -1.5619932413101196, "sft_loss": 1.4723923206329346, "step": 505 }, { "epoch": 0.2729553437029604, "grad_norm": 6.643593763195416, "learning_rate": 9.09090909090909e-07, "logits/chosen": 0.07212212681770325, "logits/rejected": 0.1325608640909195, "logps/chosen": -1.4403663873672485, "logps/rejected": -1.6584949493408203, "loss": 0.6729, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4403663873672485, "rewards/margins": 0.2181284874677658, "rewards/rejected": -1.6584949493408203, "sft_loss": 1.3849339485168457, "step": 510 }, { "epoch": 0.2756313764843619, "grad_norm": 5.457019277334743, "learning_rate": 9.180035650623885e-07, "logits/chosen": 0.0359685979783535, "logits/rejected": 0.13570207357406616, "logps/chosen": -1.3674839735031128, "logps/rejected": -1.5590038299560547, "loss": 0.6877, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.3674839735031128, "rewards/margins": 0.19151967763900757, "rewards/rejected": -1.5590038299560547, "sft_loss": 1.3747385740280151, "step": 515 }, { "epoch": 0.27830740926576353, "grad_norm": 5.525007303010948, "learning_rate": 9.26916221033868e-07, "logits/chosen": -0.08311732113361359, "logits/rejected": 0.056603431701660156, "logps/chosen": -1.423223853111267, "logps/rejected": -1.5819368362426758, "loss": 0.702, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.423223853111267, "rewards/margins": 0.1587129533290863, "rewards/rejected": -1.5819368362426758, "sft_loss": 1.498355507850647, "step": 520 }, { "epoch": 0.2809834420471651, "grad_norm": 11.421683960091872, "learning_rate": 9.358288770053476e-07, "logits/chosen": 0.0943758636713028, "logits/rejected": 0.17001059651374817, "logps/chosen": -1.4106147289276123, "logps/rejected": -1.6605732440948486, "loss": 0.6754, "rewards/accuracies": 0.59375, "rewards/chosen": -1.4106147289276123, "rewards/margins": 0.24995842576026917, "rewards/rejected": -1.6605732440948486, "sft_loss": 1.474737524986267, "step": 525 }, { "epoch": 0.2836594748285666, "grad_norm": 5.183308574444876, "learning_rate": 9.44741532976827e-07, "logits/chosen": 0.06468604505062103, "logits/rejected": 0.15001121163368225, "logps/chosen": -1.3696916103363037, "logps/rejected": -1.5476267337799072, "loss": 0.6941, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3696916103363037, "rewards/margins": 0.1779351830482483, "rewards/rejected": -1.5476267337799072, "sft_loss": 1.3246756792068481, "step": 530 }, { "epoch": 0.28633550760996823, "grad_norm": 5.118190343811381, "learning_rate": 9.536541889483066e-07, "logits/chosen": -0.10416406393051147, "logits/rejected": 0.16024169325828552, "logps/chosen": -1.3779942989349365, "logps/rejected": -1.509367823600769, "loss": 0.6947, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3779942989349365, "rewards/margins": 0.13137365877628326, "rewards/rejected": -1.509367823600769, "sft_loss": 1.3444318771362305, "step": 535 }, { "epoch": 0.2890115403913698, "grad_norm": 4.741957444228488, "learning_rate": 9.62566844919786e-07, "logits/chosen": 0.0058511835522949696, "logits/rejected": 0.08141259849071503, "logps/chosen": -1.5181024074554443, "logps/rejected": -1.6562395095825195, "loss": 0.7145, "rewards/accuracies": 0.53125, "rewards/chosen": -1.5181024074554443, "rewards/margins": 0.13813704252243042, "rewards/rejected": -1.6562395095825195, "sft_loss": 1.5423694849014282, "step": 540 }, { "epoch": 0.2916875731727714, "grad_norm": 6.190035659114328, "learning_rate": 9.714795008912655e-07, "logits/chosen": -0.09001535922288895, "logits/rejected": 0.1136549562215805, "logps/chosen": -1.4268022775650024, "logps/rejected": -1.618949294090271, "loss": 0.6708, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.4268022775650024, "rewards/margins": 0.1921471804380417, "rewards/rejected": -1.618949294090271, "sft_loss": 1.430987000465393, "step": 545 }, { "epoch": 0.29436360595417294, "grad_norm": 6.200801616546889, "learning_rate": 9.80392156862745e-07, "logits/chosen": 0.05723940208554268, "logits/rejected": 0.1289665400981903, "logps/chosen": -1.4569398164749146, "logps/rejected": -1.6450210809707642, "loss": 0.6722, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.4569398164749146, "rewards/margins": 0.18808124959468842, "rewards/rejected": -1.6450210809707642, "sft_loss": 1.4220554828643799, "step": 550 }, { "epoch": 0.2970396387355745, "grad_norm": 11.232439041213702, "learning_rate": 9.893048128342244e-07, "logits/chosen": -0.03822972625494003, "logits/rejected": 0.09498357772827148, "logps/chosen": -1.550957441329956, "logps/rejected": -1.6663873195648193, "loss": 0.7298, "rewards/accuracies": 0.5625, "rewards/chosen": -1.550957441329956, "rewards/margins": 0.11543013900518417, "rewards/rejected": -1.6663873195648193, "sft_loss": 1.5272772312164307, "step": 555 }, { "epoch": 0.2997156715169761, "grad_norm": 7.814214005511332, "learning_rate": 9.98217468805704e-07, "logits/chosen": 0.07431139051914215, "logits/rejected": 0.09394214302301407, "logps/chosen": -1.399718165397644, "logps/rejected": -1.6008237600326538, "loss": 0.6723, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.399718165397644, "rewards/margins": 0.20110544562339783, "rewards/rejected": -1.6008237600326538, "sft_loss": 1.5155736207962036, "step": 560 }, { "epoch": 0.30239170429837764, "grad_norm": 5.516404765486951, "learning_rate": 9.999984476788462e-07, "logits/chosen": 0.03611458092927933, "logits/rejected": 0.0971362441778183, "logps/chosen": -1.5159542560577393, "logps/rejected": -1.7062089443206787, "loss": 0.6873, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.5159542560577393, "rewards/margins": 0.1902548372745514, "rewards/rejected": -1.7062089443206787, "sft_loss": 1.521512746810913, "step": 565 }, { "epoch": 0.30506773707977924, "grad_norm": 10.217360300757083, "learning_rate": 9.999921413906797e-07, "logits/chosen": -0.02934812568128109, "logits/rejected": 0.19856297969818115, "logps/chosen": -1.4896576404571533, "logps/rejected": -1.6452884674072266, "loss": 0.7056, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.4896576404571533, "rewards/margins": 0.15563085675239563, "rewards/rejected": -1.6452884674072266, "sft_loss": 1.5293747186660767, "step": 570 }, { "epoch": 0.3077437698611808, "grad_norm": 6.126063460820979, "learning_rate": 9.999809841765644e-07, "logits/chosen": -0.015764957293868065, "logits/rejected": 0.047070231288671494, "logps/chosen": -1.4180388450622559, "logps/rejected": -1.6173263788223267, "loss": 0.6796, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4180388450622559, "rewards/margins": 0.1992875337600708, "rewards/rejected": -1.6173263788223267, "sft_loss": 1.4322500228881836, "step": 575 }, { "epoch": 0.3104198026425824, "grad_norm": 5.442692462822735, "learning_rate": 9.999649761447477e-07, "logits/chosen": -0.022562870755791664, "logits/rejected": 0.13785383105278015, "logps/chosen": -1.4369986057281494, "logps/rejected": -1.7041908502578735, "loss": 0.6478, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4369986057281494, "rewards/margins": 0.26719212532043457, "rewards/rejected": -1.7041908502578735, "sft_loss": 1.439800500869751, "step": 580 }, { "epoch": 0.31309583542398395, "grad_norm": 7.033602039336304, "learning_rate": 9.999441174505398e-07, "logits/chosen": -0.05476289987564087, "logits/rejected": 0.055011261254549026, "logps/chosen": -1.6148452758789062, "logps/rejected": -1.7512447834014893, "loss": 0.7277, "rewards/accuracies": 0.5625, "rewards/chosen": -1.6148452758789062, "rewards/margins": 0.13639959692955017, "rewards/rejected": -1.7512447834014893, "sft_loss": 1.599481225013733, "step": 585 }, { "epoch": 0.3157718682053855, "grad_norm": 10.075903252412404, "learning_rate": 9.999184082963116e-07, "logits/chosen": -0.011929292231798172, "logits/rejected": 0.11828957498073578, "logps/chosen": -1.5608313083648682, "logps/rejected": -1.6579519510269165, "loss": 0.721, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5608313083648682, "rewards/margins": 0.09712080657482147, "rewards/rejected": -1.6579519510269165, "sft_loss": 1.5709166526794434, "step": 590 }, { "epoch": 0.3184479009867871, "grad_norm": 7.47194767395495, "learning_rate": 9.998878489314937e-07, "logits/chosen": 0.03971802070736885, "logits/rejected": 0.17081709206104279, "logps/chosen": -1.4810861349105835, "logps/rejected": -1.6688859462738037, "loss": 0.6828, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.4810861349105835, "rewards/margins": 0.18779988586902618, "rewards/rejected": -1.6688859462738037, "sft_loss": 1.4875504970550537, "step": 595 }, { "epoch": 0.32112393376818865, "grad_norm": 5.715123632371702, "learning_rate": 9.99852439652573e-07, "logits/chosen": -0.008820680901408195, "logits/rejected": 0.14663805067539215, "logps/chosen": -1.513209581375122, "logps/rejected": -1.6487150192260742, "loss": 0.6979, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.513209581375122, "rewards/margins": 0.13550536334514618, "rewards/rejected": -1.6487150192260742, "sft_loss": 1.5156606435775757, "step": 600 }, { "epoch": 0.32379996654959026, "grad_norm": 7.759220702730302, "learning_rate": 9.998121808030904e-07, "logits/chosen": -0.06416098773479462, "logits/rejected": 0.021558623760938644, "logps/chosen": -1.6154426336288452, "logps/rejected": -1.846312165260315, "loss": 0.6851, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.6154426336288452, "rewards/margins": 0.23086953163146973, "rewards/rejected": -1.846312165260315, "sft_loss": 1.6153326034545898, "step": 605 }, { "epoch": 0.3264759993309918, "grad_norm": 15.63102859711399, "learning_rate": 9.997670727736379e-07, "logits/chosen": 0.06927318871021271, "logits/rejected": 0.2384120672941208, "logps/chosen": -1.572723388671875, "logps/rejected": -1.791204810142517, "loss": 0.6892, "rewards/accuracies": 0.59375, "rewards/chosen": -1.572723388671875, "rewards/margins": 0.21848134696483612, "rewards/rejected": -1.791204810142517, "sft_loss": 1.5574983358383179, "step": 610 }, { "epoch": 0.32915203211239336, "grad_norm": 5.568616824813846, "learning_rate": 9.99717116001853e-07, "logits/chosen": -0.04871724173426628, "logits/rejected": 0.055701516568660736, "logps/chosen": -1.562664270401001, "logps/rejected": -1.8564201593399048, "loss": 0.66, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.562664270401001, "rewards/margins": 0.29375597834587097, "rewards/rejected": -1.8564201593399048, "sft_loss": 1.5629616975784302, "step": 615 }, { "epoch": 0.33182806489379496, "grad_norm": 6.02384270726035, "learning_rate": 9.996623109724173e-07, "logits/chosen": 0.04730135574936867, "logits/rejected": 0.11533886194229126, "logps/chosen": -1.6759907007217407, "logps/rejected": -1.8699016571044922, "loss": 0.6981, "rewards/accuracies": 0.59375, "rewards/chosen": -1.6759907007217407, "rewards/margins": 0.19391095638275146, "rewards/rejected": -1.8699016571044922, "sft_loss": 1.651746392250061, "step": 620 }, { "epoch": 0.3345040976751965, "grad_norm": 8.76762564401349, "learning_rate": 9.996026582170488e-07, "logits/chosen": 0.08321405947208405, "logits/rejected": 0.20709185302257538, "logps/chosen": -1.5465686321258545, "logps/rejected": -1.8440536260604858, "loss": 0.6417, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.5465686321258545, "rewards/margins": 0.297484815120697, "rewards/rejected": -1.8440536260604858, "sft_loss": 1.563207745552063, "step": 625 }, { "epoch": 0.3371801304565981, "grad_norm": 9.201337338268745, "learning_rate": 9.995381583144996e-07, "logits/chosen": 0.020206613466143608, "logits/rejected": 0.13258489966392517, "logps/chosen": -1.6068108081817627, "logps/rejected": -1.890876054763794, "loss": 0.6448, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6068108081817627, "rewards/margins": 0.28406545519828796, "rewards/rejected": -1.890876054763794, "sft_loss": 1.561948537826538, "step": 630 }, { "epoch": 0.33985616323799966, "grad_norm": 5.093884861429177, "learning_rate": 9.994688118905471e-07, "logits/chosen": 0.011320598423480988, "logits/rejected": 0.2581380009651184, "logps/chosen": -1.7048816680908203, "logps/rejected": -1.9319698810577393, "loss": 0.6912, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.7048816680908203, "rewards/margins": 0.22708837687969208, "rewards/rejected": -1.9319698810577393, "sft_loss": 1.6976354122161865, "step": 635 }, { "epoch": 0.3425321960194012, "grad_norm": 17.736312901847896, "learning_rate": 9.993946196179912e-07, "logits/chosen": -0.06087531894445419, "logits/rejected": 0.1462034285068512, "logps/chosen": -1.681366205215454, "logps/rejected": -1.9161033630371094, "loss": 0.6828, "rewards/accuracies": 0.53125, "rewards/chosen": -1.681366205215454, "rewards/margins": 0.23473712801933289, "rewards/rejected": -1.9161033630371094, "sft_loss": 1.709673523902893, "step": 640 }, { "epoch": 0.3452082288008028, "grad_norm": 8.314358446774131, "learning_rate": 9.993155822166455e-07, "logits/chosen": -0.03272467106580734, "logits/rejected": 0.05644283443689346, "logps/chosen": -1.6156227588653564, "logps/rejected": -1.9258009195327759, "loss": 0.6386, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.6156227588653564, "rewards/margins": 0.3101782202720642, "rewards/rejected": -1.9258009195327759, "sft_loss": 1.5771795511245728, "step": 645 }, { "epoch": 0.34788426158220437, "grad_norm": 9.800112089550767, "learning_rate": 9.992317004533313e-07, "logits/chosen": 0.0005621820455417037, "logits/rejected": 0.1446131467819214, "logps/chosen": -1.7680717706680298, "logps/rejected": -2.068042278289795, "loss": 0.6571, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.7680717706680298, "rewards/margins": 0.2999705374240875, "rewards/rejected": -2.068042278289795, "sft_loss": 1.7745215892791748, "step": 650 }, { "epoch": 0.350560294363606, "grad_norm": 8.891072766513426, "learning_rate": 9.991429751418696e-07, "logits/chosen": 0.06538809090852737, "logits/rejected": 0.08319473266601562, "logps/chosen": -1.7189629077911377, "logps/rejected": -2.026162624359131, "loss": 0.6864, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.7189629077911377, "rewards/margins": 0.3071998655796051, "rewards/rejected": -2.026162624359131, "sft_loss": 1.7231667041778564, "step": 655 }, { "epoch": 0.3532363271450075, "grad_norm": 8.35891612764567, "learning_rate": 9.99049407143074e-07, "logits/chosen": 0.046678341925144196, "logits/rejected": 0.1763012558221817, "logps/chosen": -1.7010895013809204, "logps/rejected": -1.878273367881775, "loss": 0.705, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.7010895013809204, "rewards/margins": 0.17718379199504852, "rewards/rejected": -1.878273367881775, "sft_loss": 1.7096478939056396, "step": 660 }, { "epoch": 0.35591235992640907, "grad_norm": 5.872723655675756, "learning_rate": 9.989509973647416e-07, "logits/chosen": 0.02392008528113365, "logits/rejected": 0.16449224948883057, "logps/chosen": -1.6240675449371338, "logps/rejected": -1.9025075435638428, "loss": 0.6608, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6240675449371338, "rewards/margins": 0.27844005823135376, "rewards/rejected": -1.9025075435638428, "sft_loss": 1.6689106225967407, "step": 665 }, { "epoch": 0.3585883927078107, "grad_norm": 7.4402063008554835, "learning_rate": 9.988477467616445e-07, "logits/chosen": -0.0179903507232666, "logits/rejected": 0.18532223999500275, "logps/chosen": -1.6726337671279907, "logps/rejected": -1.9018869400024414, "loss": 0.6558, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.6726337671279907, "rewards/margins": 0.22925344109535217, "rewards/rejected": -1.9018869400024414, "sft_loss": 1.7706172466278076, "step": 670 }, { "epoch": 0.3612644254892122, "grad_norm": 9.22882511476252, "learning_rate": 9.987396563355205e-07, "logits/chosen": -0.024912597611546516, "logits/rejected": 0.04954840987920761, "logps/chosen": -1.6799808740615845, "logps/rejected": -2.0296683311462402, "loss": 0.637, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6799808740615845, "rewards/margins": 0.3496876657009125, "rewards/rejected": -2.0296683311462402, "sft_loss": 1.7392793893814087, "step": 675 }, { "epoch": 0.36394045827061383, "grad_norm": 7.178215796653519, "learning_rate": 9.986267271350631e-07, "logits/chosen": 0.08153931051492691, "logits/rejected": 0.24789385497570038, "logps/chosen": -1.7651420831680298, "logps/rejected": -2.004348039627075, "loss": 0.7226, "rewards/accuracies": 0.5625, "rewards/chosen": -1.7651420831680298, "rewards/margins": 0.2392059862613678, "rewards/rejected": -2.004348039627075, "sft_loss": 1.7212005853652954, "step": 680 }, { "epoch": 0.3666164910520154, "grad_norm": 17.26491478757877, "learning_rate": 9.985089602559123e-07, "logits/chosen": 0.037060752511024475, "logits/rejected": 0.1910381019115448, "logps/chosen": -1.7412440776824951, "logps/rejected": -2.0073153972625732, "loss": 0.6763, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.7412440776824951, "rewards/margins": 0.2660714387893677, "rewards/rejected": -2.0073153972625732, "sft_loss": 1.7309767007827759, "step": 685 }, { "epoch": 0.369292523833417, "grad_norm": 9.390663949447314, "learning_rate": 9.983863568406428e-07, "logits/chosen": 0.07388173043727875, "logits/rejected": 0.11109878122806549, "logps/chosen": -1.765343427658081, "logps/rejected": -2.0304367542266846, "loss": 0.6819, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.765343427658081, "rewards/margins": 0.26509329676628113, "rewards/rejected": -2.0304367542266846, "sft_loss": 1.8135614395141602, "step": 690 }, { "epoch": 0.37196855661481854, "grad_norm": 6.290396110648321, "learning_rate": 9.982589180787532e-07, "logits/chosen": 0.03160684183239937, "logits/rejected": 0.11958432197570801, "logps/chosen": -1.6528549194335938, "logps/rejected": -1.9509315490722656, "loss": 0.6564, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.6528549194335938, "rewards/margins": 0.29807668924331665, "rewards/rejected": -1.9509315490722656, "sft_loss": 1.7110259532928467, "step": 695 }, { "epoch": 0.3746445893962201, "grad_norm": 10.086599389475353, "learning_rate": 9.981266452066553e-07, "logits/chosen": -0.09080805629491806, "logits/rejected": 0.04179905354976654, "logps/chosen": -1.8715393543243408, "logps/rejected": -2.0836453437805176, "loss": 0.6784, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.8715393543243408, "rewards/margins": 0.21210607886314392, "rewards/rejected": -2.0836453437805176, "sft_loss": 1.8276889324188232, "step": 700 }, { "epoch": 0.3773206221776217, "grad_norm": 9.64164987311192, "learning_rate": 9.979895395076608e-07, "logits/chosen": -0.04067766293883324, "logits/rejected": 0.12535516917705536, "logps/chosen": -1.8169269561767578, "logps/rejected": -2.19791841506958, "loss": 0.6354, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.8169269561767578, "rewards/margins": 0.3809918165206909, "rewards/rejected": -2.19791841506958, "sft_loss": 1.835120439529419, "step": 705 }, { "epoch": 0.37999665495902324, "grad_norm": 9.28735932443622, "learning_rate": 9.9784760231197e-07, "logits/chosen": 0.05085369944572449, "logits/rejected": 0.1413969099521637, "logps/chosen": -1.8504364490509033, "logps/rejected": -2.155524492263794, "loss": 0.6419, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.8504364490509033, "rewards/margins": 0.3050883710384369, "rewards/rejected": -2.155524492263794, "sft_loss": 1.8197228908538818, "step": 710 }, { "epoch": 0.38267268774042484, "grad_norm": 12.115110830896828, "learning_rate": 9.97700834996658e-07, "logits/chosen": 0.011143917217850685, "logits/rejected": 0.17604656517505646, "logps/chosen": -1.9947057962417603, "logps/rejected": -2.267084836959839, "loss": 0.6649, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.9947057962417603, "rewards/margins": 0.27237898111343384, "rewards/rejected": -2.267084836959839, "sft_loss": 1.9120969772338867, "step": 715 }, { "epoch": 0.3853487205218264, "grad_norm": 11.624644878116946, "learning_rate": 9.97549238985662e-07, "logits/chosen": 0.08611107617616653, "logits/rejected": 0.27575188875198364, "logps/chosen": -2.0711281299591064, "logps/rejected": -2.4074230194091797, "loss": 0.6771, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0711281299591064, "rewards/margins": 0.33629506826400757, "rewards/rejected": -2.4074230194091797, "sft_loss": 2.0715701580047607, "step": 720 }, { "epoch": 0.38802475330322794, "grad_norm": 9.108287386730169, "learning_rate": 9.973928157497674e-07, "logits/chosen": -0.04209893196821213, "logits/rejected": 0.10033257305622101, "logps/chosen": -1.863908052444458, "logps/rejected": -2.286168336868286, "loss": 0.6031, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.863908052444458, "rewards/margins": 0.422260582447052, "rewards/rejected": -2.286168336868286, "sft_loss": 1.9227043390274048, "step": 725 }, { "epoch": 0.39070078608462955, "grad_norm": 14.030613682567305, "learning_rate": 9.972315668065927e-07, "logits/chosen": -0.06971832364797592, "logits/rejected": 0.08316639065742493, "logps/chosen": -2.050490617752075, "logps/rejected": -2.3599963188171387, "loss": 0.6707, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.050490617752075, "rewards/margins": 0.3095058500766754, "rewards/rejected": -2.3599963188171387, "sft_loss": 2.0619983673095703, "step": 730 }, { "epoch": 0.3933768188660311, "grad_norm": 7.885707770222676, "learning_rate": 9.97065493720576e-07, "logits/chosen": -0.050217293202877045, "logits/rejected": 0.04937911778688431, "logps/chosen": -2.032588005065918, "logps/rejected": -2.2886781692504883, "loss": 0.6714, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.032588005065918, "rewards/margins": 0.2560901343822479, "rewards/rejected": -2.2886781692504883, "sft_loss": 2.058027505874634, "step": 735 }, { "epoch": 0.3960528516474327, "grad_norm": 15.556154977262148, "learning_rate": 9.968945981029594e-07, "logits/chosen": -0.03946347162127495, "logits/rejected": 0.12837369740009308, "logps/chosen": -2.0982556343078613, "logps/rejected": -2.4328927993774414, "loss": 0.655, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.0982556343078613, "rewards/margins": 0.3346374034881592, "rewards/rejected": -2.4328927993774414, "sft_loss": 2.1010241508483887, "step": 740 }, { "epoch": 0.39872888442883425, "grad_norm": 6.841990333720714, "learning_rate": 9.967188816117726e-07, "logits/chosen": 0.06810925900936127, "logits/rejected": 0.14391477406024933, "logps/chosen": -2.0970871448516846, "logps/rejected": -2.4689412117004395, "loss": 0.6865, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.0970871448516846, "rewards/margins": 0.37185385823249817, "rewards/rejected": -2.4689412117004395, "sft_loss": 2.0899500846862793, "step": 745 }, { "epoch": 0.4014049172102358, "grad_norm": 9.963614775990674, "learning_rate": 9.965383459518179e-07, "logits/chosen": -0.005502223968505859, "logits/rejected": 0.15343348681926727, "logps/chosen": -2.0381226539611816, "logps/rejected": -2.447211742401123, "loss": 0.632, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0381226539611816, "rewards/margins": 0.40908899903297424, "rewards/rejected": -2.447211742401123, "sft_loss": 2.032761812210083, "step": 750 }, { "epoch": 0.4040809499916374, "grad_norm": 9.315325168804089, "learning_rate": 9.963529928746533e-07, "logits/chosen": 0.04696832224726677, "logits/rejected": 0.17964449524879456, "logps/chosen": -2.0477912425994873, "logps/rejected": -2.3599047660827637, "loss": 0.6803, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.0477912425994873, "rewards/margins": 0.31211379170417786, "rewards/rejected": -2.3599047660827637, "sft_loss": 2.053529977798462, "step": 755 }, { "epoch": 0.40675698277303896, "grad_norm": 6.797423613480653, "learning_rate": 9.961628241785746e-07, "logits/chosen": -0.04901718348264694, "logits/rejected": 0.028940856456756592, "logps/chosen": -2.0668463706970215, "logps/rejected": -2.386622905731201, "loss": 0.6586, "rewards/accuracies": 0.625, "rewards/chosen": -2.0668463706970215, "rewards/margins": 0.3197762370109558, "rewards/rejected": -2.386622905731201, "sft_loss": 2.104837656021118, "step": 760 }, { "epoch": 0.40943301555444056, "grad_norm": 9.903022378126753, "learning_rate": 9.959678417085998e-07, "logits/chosen": 0.015145739540457726, "logits/rejected": 0.11417007446289062, "logps/chosen": -1.9795106649398804, "logps/rejected": -2.254296064376831, "loss": 0.6665, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9795106649398804, "rewards/margins": 0.2747856080532074, "rewards/rejected": -2.254296064376831, "sft_loss": 1.9884569644927979, "step": 765 }, { "epoch": 0.4121090483358421, "grad_norm": 12.022757569057635, "learning_rate": 9.957680473564493e-07, "logits/chosen": 0.09810831397771835, "logits/rejected": 0.22772392630577087, "logps/chosen": -1.9476169347763062, "logps/rejected": -2.4173974990844727, "loss": 0.6122, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9476169347763062, "rewards/margins": 0.4697801470756531, "rewards/rejected": -2.4173974990844727, "sft_loss": 1.9548561573028564, "step": 770 }, { "epoch": 0.41478508111724366, "grad_norm": 6.422764925697639, "learning_rate": 9.95563443060529e-07, "logits/chosen": -0.1036531925201416, "logits/rejected": 0.06811682879924774, "logps/chosen": -2.0023977756500244, "logps/rejected": -2.3354830741882324, "loss": 0.6719, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.0023977756500244, "rewards/margins": 0.3330851197242737, "rewards/rejected": -2.3354830741882324, "sft_loss": 1.9613491296768188, "step": 775 }, { "epoch": 0.41746111389864526, "grad_norm": 9.881302782545758, "learning_rate": 9.95354030805911e-07, "logits/chosen": -0.15749771893024445, "logits/rejected": -0.008543589152395725, "logps/chosen": -2.031306028366089, "logps/rejected": -2.3502228260040283, "loss": 0.6487, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.031306028366089, "rewards/margins": 0.318916380405426, "rewards/rejected": -2.3502228260040283, "sft_loss": 2.0914266109466553, "step": 780 }, { "epoch": 0.4201371466800468, "grad_norm": 8.31230234799272, "learning_rate": 9.951398126243133e-07, "logits/chosen": -0.02615680918097496, "logits/rejected": 0.09981267154216766, "logps/chosen": -1.9661918878555298, "logps/rejected": -2.4096293449401855, "loss": 0.6199, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.9661918878555298, "rewards/margins": 0.44343748688697815, "rewards/rejected": -2.4096293449401855, "sft_loss": 1.9984050989151, "step": 785 }, { "epoch": 0.4228131794614484, "grad_norm": 11.315300797578953, "learning_rate": 9.94920790594082e-07, "logits/chosen": -0.060059063136577606, "logits/rejected": 0.06812725216150284, "logps/chosen": -1.9654719829559326, "logps/rejected": -2.4030096530914307, "loss": 0.6103, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9654719829559326, "rewards/margins": 0.4375377595424652, "rewards/rejected": -2.4030096530914307, "sft_loss": 1.9656639099121094, "step": 790 }, { "epoch": 0.42548921224284997, "grad_norm": 22.755632032076623, "learning_rate": 9.946969668401696e-07, "logits/chosen": -0.07668205350637436, "logits/rejected": 0.11146184056997299, "logps/chosen": -2.0439791679382324, "logps/rejected": -2.5747227668762207, "loss": 0.632, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0439791679382324, "rewards/margins": 0.5307438969612122, "rewards/rejected": -2.5747227668762207, "sft_loss": 2.075232744216919, "step": 795 }, { "epoch": 0.4281652450242516, "grad_norm": 8.03414191880417, "learning_rate": 9.944683435341155e-07, "logits/chosen": -0.04020323604345322, "logits/rejected": 0.03881923481822014, "logps/chosen": -2.0494797229766846, "logps/rejected": -2.466977596282959, "loss": 0.6225, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0494797229766846, "rewards/margins": 0.41749781370162964, "rewards/rejected": -2.466977596282959, "sft_loss": 2.019458055496216, "step": 800 }, { "epoch": 0.4281652450242516, "eval_logits/chosen": 0.256985604763031, "eval_logits/rejected": 0.34294527769088745, "eval_logps/chosen": -2.0770034790039062, "eval_logps/rejected": -2.5396463871002197, "eval_loss": 0.6192032694816589, "eval_rewards/accuracies": 0.6669139266014099, "eval_rewards/chosen": -2.0770034790039062, "eval_rewards/margins": 0.46264272928237915, "eval_rewards/rejected": -2.5396463871002197, "eval_runtime": 51.5427, "eval_samples_per_second": 26.095, "eval_sft_loss": 2.057264804840088, "eval_steps_per_second": 6.538, "step": 800 }, { "epoch": 0.4308412778056531, "grad_norm": 11.213423217245992, "learning_rate": 9.942349228940236e-07, "logits/chosen": -0.08227229118347168, "logits/rejected": 0.07269458472728729, "logps/chosen": -2.0386552810668945, "logps/rejected": -2.636596441268921, "loss": 0.5819, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0386552810668945, "rewards/margins": 0.5979411602020264, "rewards/rejected": -2.636596441268921, "sft_loss": 2.055975914001465, "step": 805 }, { "epoch": 0.43351731058705467, "grad_norm": 9.383016112080094, "learning_rate": 9.939967071845424e-07, "logits/chosen": 0.019783342257142067, "logits/rejected": 0.0919952541589737, "logps/chosen": -2.1240878105163574, "logps/rejected": -2.4889702796936035, "loss": 0.6429, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1240878105163574, "rewards/margins": 0.3648822009563446, "rewards/rejected": -2.4889702796936035, "sft_loss": 2.1461615562438965, "step": 810 }, { "epoch": 0.4361933433684563, "grad_norm": 14.285322895558972, "learning_rate": 9.937536987168413e-07, "logits/chosen": 0.05799783021211624, "logits/rejected": 0.18904006481170654, "logps/chosen": -2.104933977127075, "logps/rejected": -2.6830811500549316, "loss": 0.6267, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.104933977127075, "rewards/margins": 0.5781470537185669, "rewards/rejected": -2.6830811500549316, "sft_loss": 2.1595168113708496, "step": 815 }, { "epoch": 0.4388693761498578, "grad_norm": 11.488414591128015, "learning_rate": 9.935058998485896e-07, "logits/chosen": 0.0572284460067749, "logits/rejected": 0.10458134114742279, "logps/chosen": -2.1640007495880127, "logps/rejected": -2.5970141887664795, "loss": 0.6493, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.1640007495880127, "rewards/margins": 0.43301382660865784, "rewards/rejected": -2.5970141887664795, "sft_loss": 2.146779775619507, "step": 820 }, { "epoch": 0.44154540893125943, "grad_norm": 17.889890042811206, "learning_rate": 9.932533129839333e-07, "logits/chosen": 0.008310935460031033, "logits/rejected": 0.12683984637260437, "logps/chosen": -2.0136170387268066, "logps/rejected": -2.5307841300964355, "loss": 0.6103, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.0136170387268066, "rewards/margins": 0.5171666145324707, "rewards/rejected": -2.5307841300964355, "sft_loss": 2.116117000579834, "step": 825 }, { "epoch": 0.444221441712661, "grad_norm": 10.465878733412262, "learning_rate": 9.929959405734711e-07, "logits/chosen": 0.10247880220413208, "logits/rejected": 0.26821228861808777, "logps/chosen": -2.043735980987549, "logps/rejected": -2.4065542221069336, "loss": 0.6423, "rewards/accuracies": 0.65625, "rewards/chosen": -2.043735980987549, "rewards/margins": 0.36281818151474, "rewards/rejected": -2.4065542221069336, "sft_loss": 2.042168378829956, "step": 830 }, { "epoch": 0.44689747449406253, "grad_norm": 8.980836413849559, "learning_rate": 9.927337851142314e-07, "logits/chosen": 0.042201682925224304, "logits/rejected": 0.1663995385169983, "logps/chosen": -1.9975996017456055, "logps/rejected": -2.3808562755584717, "loss": 0.6439, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.9975996017456055, "rewards/margins": 0.38325658440589905, "rewards/rejected": -2.3808562755584717, "sft_loss": 2.103684902191162, "step": 835 }, { "epoch": 0.44957350727546413, "grad_norm": 8.385913958085657, "learning_rate": 9.924668491496474e-07, "logits/chosen": 0.02358698472380638, "logits/rejected": 0.19346722960472107, "logps/chosen": -2.174470901489258, "logps/rejected": -2.5670955181121826, "loss": 0.6596, "rewards/accuracies": 0.625, "rewards/chosen": -2.174470901489258, "rewards/margins": 0.3926246762275696, "rewards/rejected": -2.5670955181121826, "sft_loss": 2.2588775157928467, "step": 840 }, { "epoch": 0.4522495400568657, "grad_norm": 6.069412517957521, "learning_rate": 9.92195135269533e-07, "logits/chosen": 0.08882583677768707, "logits/rejected": 0.1478150337934494, "logps/chosen": -2.1246047019958496, "logps/rejected": -2.388331174850464, "loss": 0.6976, "rewards/accuracies": 0.5625, "rewards/chosen": -2.1246047019958496, "rewards/margins": 0.2637265920639038, "rewards/rejected": -2.388331174850464, "sft_loss": 2.252500534057617, "step": 845 }, { "epoch": 0.4549255728382673, "grad_norm": 9.418875341176285, "learning_rate": 9.919186461100574e-07, "logits/chosen": 0.040856409817934036, "logits/rejected": 0.1182093620300293, "logps/chosen": -2.1500442028045654, "logps/rejected": -2.470322608947754, "loss": 0.646, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1500442028045654, "rewards/margins": 0.3202785849571228, "rewards/rejected": -2.470322608947754, "sft_loss": 2.192167043685913, "step": 850 }, { "epoch": 0.45760160561966884, "grad_norm": 13.78367496344772, "learning_rate": 9.9163738435372e-07, "logits/chosen": 0.005407715681940317, "logits/rejected": 0.15354683995246887, "logps/chosen": -2.170201301574707, "logps/rejected": -2.624565601348877, "loss": 0.6577, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.170201301574707, "rewards/margins": 0.454364150762558, "rewards/rejected": -2.624565601348877, "sft_loss": 2.2026968002319336, "step": 855 }, { "epoch": 0.4602776384010704, "grad_norm": 7.739859753372353, "learning_rate": 9.913513527293234e-07, "logits/chosen": -0.03920990601181984, "logits/rejected": 0.12746313214302063, "logps/chosen": -2.2391293048858643, "logps/rejected": -2.791182279586792, "loss": 0.6081, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.2391293048858643, "rewards/margins": 0.5520530343055725, "rewards/rejected": -2.791182279586792, "sft_loss": 2.3047616481781006, "step": 860 }, { "epoch": 0.462953671182472, "grad_norm": 23.85659179365519, "learning_rate": 9.910605540119474e-07, "logits/chosen": 0.03437185287475586, "logits/rejected": 0.1299118548631668, "logps/chosen": -2.114776849746704, "logps/rejected": -2.64350962638855, "loss": 0.6475, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.114776849746704, "rewards/margins": 0.5287328958511353, "rewards/rejected": -2.64350962638855, "sft_loss": 2.149594783782959, "step": 865 }, { "epoch": 0.46562970396387354, "grad_norm": 7.068050548964723, "learning_rate": 9.907649910229227e-07, "logits/chosen": -0.06576119363307953, "logits/rejected": 0.19930952787399292, "logps/chosen": -2.1443984508514404, "logps/rejected": -2.663065195083618, "loss": 0.6076, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.1443984508514404, "rewards/margins": 0.5186666250228882, "rewards/rejected": -2.663065195083618, "sft_loss": 2.212402820587158, "step": 870 }, { "epoch": 0.46830573674527515, "grad_norm": 11.088938035410244, "learning_rate": 9.90464666629803e-07, "logits/chosen": 0.03996530547738075, "logits/rejected": 0.12184039503335953, "logps/chosen": -2.2576634883880615, "logps/rejected": -2.614593982696533, "loss": 0.7126, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.2576634883880615, "rewards/margins": 0.35693034529685974, "rewards/rejected": -2.614593982696533, "sft_loss": 2.228409767150879, "step": 875 }, { "epoch": 0.4709817695266767, "grad_norm": 8.271474030444953, "learning_rate": 9.901595837463363e-07, "logits/chosen": 0.0436701737344265, "logits/rejected": 0.22366786003112793, "logps/chosen": -2.2589449882507324, "logps/rejected": -2.7298736572265625, "loss": 0.6217, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.2589449882507324, "rewards/margins": 0.47092896699905396, "rewards/rejected": -2.7298736572265625, "sft_loss": 2.1659958362579346, "step": 880 }, { "epoch": 0.47365780230807825, "grad_norm": 9.519114072202234, "learning_rate": 9.898497453324384e-07, "logits/chosen": -0.06962232291698456, "logits/rejected": 0.014059120789170265, "logps/chosen": -2.2020440101623535, "logps/rejected": -2.7084178924560547, "loss": 0.5909, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.2020440101623535, "rewards/margins": 0.5063741207122803, "rewards/rejected": -2.7084178924560547, "sft_loss": 2.2662155628204346, "step": 885 }, { "epoch": 0.47633383508947985, "grad_norm": 6.855135000012716, "learning_rate": 9.895351543941628e-07, "logits/chosen": -0.14909687638282776, "logits/rejected": -0.0239469762891531, "logps/chosen": -2.174975872039795, "logps/rejected": -2.598846673965454, "loss": 0.6291, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.174975872039795, "rewards/margins": 0.4238705039024353, "rewards/rejected": -2.598846673965454, "sft_loss": 2.251343250274658, "step": 890 }, { "epoch": 0.4790098678708814, "grad_norm": 11.9995465160683, "learning_rate": 9.892158139836724e-07, "logits/chosen": 0.0655425637960434, "logits/rejected": 0.17073342204093933, "logps/chosen": -2.043718099594116, "logps/rejected": -2.407827854156494, "loss": 0.6423, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.043718099594116, "rewards/margins": 0.3641095757484436, "rewards/rejected": -2.407827854156494, "sft_loss": 2.106400966644287, "step": 895 }, { "epoch": 0.481685900652283, "grad_norm": 9.267739718647467, "learning_rate": 9.88891727199209e-07, "logits/chosen": -0.07653886079788208, "logits/rejected": -0.0005800087237730622, "logps/chosen": -2.0188803672790527, "logps/rejected": -2.47680926322937, "loss": 0.6263, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0188803672790527, "rewards/margins": 0.45792898535728455, "rewards/rejected": -2.47680926322937, "sft_loss": 2.075723886489868, "step": 900 }, { "epoch": 0.48436193343368455, "grad_norm": 9.434825016785153, "learning_rate": 9.885628971850641e-07, "logits/chosen": -0.005064345896244049, "logits/rejected": 0.18762926757335663, "logps/chosen": -2.159120559692383, "logps/rejected": -2.6442129611968994, "loss": 0.6447, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.159120559692383, "rewards/margins": 0.4850922226905823, "rewards/rejected": -2.6442129611968994, "sft_loss": 2.236520528793335, "step": 905 }, { "epoch": 0.48703796621508616, "grad_norm": 6.504563353175588, "learning_rate": 9.882293271315481e-07, "logits/chosen": -0.020215703174471855, "logits/rejected": 0.07510758936405182, "logps/chosen": -2.161691904067993, "logps/rejected": -2.5602848529815674, "loss": 0.6645, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.161691904067993, "rewards/margins": 0.3985927700996399, "rewards/rejected": -2.5602848529815674, "sft_loss": 2.1818904876708984, "step": 910 }, { "epoch": 0.4897139989964877, "grad_norm": 8.088147446053588, "learning_rate": 9.878910202749589e-07, "logits/chosen": -0.030447423458099365, "logits/rejected": 0.1630699634552002, "logps/chosen": -2.1108880043029785, "logps/rejected": -2.5778558254241943, "loss": 0.6127, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.1108880043029785, "rewards/margins": 0.4669678807258606, "rewards/rejected": -2.5778558254241943, "sft_loss": 2.157522439956665, "step": 915 }, { "epoch": 0.49239003177788926, "grad_norm": 8.554766860204774, "learning_rate": 9.875479798975512e-07, "logits/chosen": 0.05800174921751022, "logits/rejected": 0.2097451388835907, "logps/chosen": -2.054659366607666, "logps/rejected": -2.5831289291381836, "loss": 0.6255, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.054659366607666, "rewards/margins": 0.5284695625305176, "rewards/rejected": -2.5831289291381836, "sft_loss": 2.185912609100342, "step": 920 }, { "epoch": 0.49506606455929086, "grad_norm": 11.034199446323399, "learning_rate": 9.87200209327504e-07, "logits/chosen": -0.027339598163962364, "logits/rejected": 0.1445145457983017, "logps/chosen": -2.3099098205566406, "logps/rejected": -2.737062454223633, "loss": 0.626, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.3099098205566406, "rewards/margins": 0.427152544260025, "rewards/rejected": -2.737062454223633, "sft_loss": 2.298609972000122, "step": 925 }, { "epoch": 0.4977420973406924, "grad_norm": 12.015830078755586, "learning_rate": 9.868477119388894e-07, "logits/chosen": -0.043401092290878296, "logits/rejected": 0.07257211208343506, "logps/chosen": -2.1922214031219482, "logps/rejected": -2.7571864128112793, "loss": 0.6264, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1922214031219482, "rewards/margins": 0.5649651288986206, "rewards/rejected": -2.7571864128112793, "sft_loss": 2.2458460330963135, "step": 930 }, { "epoch": 0.500418130122094, "grad_norm": 8.727604479465507, "learning_rate": 9.864904911516383e-07, "logits/chosen": 0.019951870664954185, "logits/rejected": 0.06993341445922852, "logps/chosen": -2.2700858116149902, "logps/rejected": -2.6822025775909424, "loss": 0.6514, "rewards/accuracies": 0.625, "rewards/chosen": -2.2700858116149902, "rewards/margins": 0.4121168255805969, "rewards/rejected": -2.6822025775909424, "sft_loss": 2.372786283493042, "step": 935 }, { "epoch": 0.5030941629034956, "grad_norm": 9.246727041827082, "learning_rate": 9.861285504315084e-07, "logits/chosen": -0.009649311192333698, "logits/rejected": 0.10374053567647934, "logps/chosen": -2.1735637187957764, "logps/rejected": -2.6011998653411865, "loss": 0.6233, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.1735637187957764, "rewards/margins": 0.427636057138443, "rewards/rejected": -2.6011998653411865, "sft_loss": 2.2195963859558105, "step": 940 }, { "epoch": 0.5057701956848971, "grad_norm": 9.023576880931797, "learning_rate": 9.857618932900502e-07, "logits/chosen": -0.045220039784908295, "logits/rejected": 0.10452475398778915, "logps/chosen": -2.1646695137023926, "logps/rejected": -2.719238758087158, "loss": 0.5769, "rewards/accuracies": 0.71875, "rewards/chosen": -2.1646695137023926, "rewards/margins": 0.554568886756897, "rewards/rejected": -2.719238758087158, "sft_loss": 2.2067770957946777, "step": 945 }, { "epoch": 0.5084462284662987, "grad_norm": 9.790405508612462, "learning_rate": 9.853905232845727e-07, "logits/chosen": -0.0507919080555439, "logits/rejected": 0.12988582253456116, "logps/chosen": -2.31115984916687, "logps/rejected": -2.7202539443969727, "loss": 0.6656, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.31115984916687, "rewards/margins": 0.4090944230556488, "rewards/rejected": -2.7202539443969727, "sft_loss": 2.3003106117248535, "step": 950 }, { "epoch": 0.5111222612477003, "grad_norm": 9.675479981290406, "learning_rate": 9.850144440181095e-07, "logits/chosen": 0.007676619105041027, "logits/rejected": 0.23354463279247284, "logps/chosen": -2.39599347114563, "logps/rejected": -2.8343722820281982, "loss": 0.6345, "rewards/accuracies": 0.65625, "rewards/chosen": -2.39599347114563, "rewards/margins": 0.4383786618709564, "rewards/rejected": -2.8343722820281982, "sft_loss": 2.4643120765686035, "step": 955 }, { "epoch": 0.5137982940291018, "grad_norm": 8.612987321414094, "learning_rate": 9.846336591393832e-07, "logits/chosen": -0.025504469871520996, "logits/rejected": 0.12795531749725342, "logps/chosen": -2.4095778465270996, "logps/rejected": -2.8609580993652344, "loss": 0.635, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.4095778465270996, "rewards/margins": 0.4513804018497467, "rewards/rejected": -2.8609580993652344, "sft_loss": 2.4834115505218506, "step": 960 }, { "epoch": 0.5164743268105034, "grad_norm": 9.36794990504209, "learning_rate": 9.842481723427704e-07, "logits/chosen": 0.09307606518268585, "logits/rejected": 0.10523343086242676, "logps/chosen": -2.4937140941619873, "logps/rejected": -2.9816248416900635, "loss": 0.658, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.4937140941619873, "rewards/margins": 0.487910658121109, "rewards/rejected": -2.9816248416900635, "sft_loss": 2.5522685050964355, "step": 965 }, { "epoch": 0.519150359591905, "grad_norm": 8.475134352425844, "learning_rate": 9.838579873682658e-07, "logits/chosen": 0.059538520872592926, "logits/rejected": 0.07206230610609055, "logps/chosen": -2.302995204925537, "logps/rejected": -2.6984548568725586, "loss": 0.6553, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.302995204925537, "rewards/margins": 0.3954595923423767, "rewards/rejected": -2.6984548568725586, "sft_loss": 2.3630242347717285, "step": 970 }, { "epoch": 0.5218263923733065, "grad_norm": 8.725772671105329, "learning_rate": 9.834631080014457e-07, "logits/chosen": -0.06823412328958511, "logits/rejected": 0.13702434301376343, "logps/chosen": -2.2840070724487305, "logps/rejected": -2.745213747024536, "loss": 0.5947, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.2840070724487305, "rewards/margins": 0.4612065255641937, "rewards/rejected": -2.745213747024536, "sft_loss": 2.383709669113159, "step": 975 }, { "epoch": 0.5245024251547081, "grad_norm": 14.382330843645065, "learning_rate": 9.830635380734312e-07, "logits/chosen": -0.07350875437259674, "logits/rejected": 0.12759271264076233, "logps/chosen": -2.4287877082824707, "logps/rejected": -2.8255655765533447, "loss": 0.6507, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.4287877082824707, "rewards/margins": 0.39677804708480835, "rewards/rejected": -2.8255655765533447, "sft_loss": 2.4610254764556885, "step": 980 }, { "epoch": 0.5271784579361097, "grad_norm": 10.325135195234868, "learning_rate": 9.826592814608517e-07, "logits/chosen": 0.010499343276023865, "logits/rejected": 0.22065281867980957, "logps/chosen": -2.3261592388153076, "logps/rejected": -2.746830463409424, "loss": 0.6346, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.3261592388153076, "rewards/margins": 0.42067116498947144, "rewards/rejected": -2.746830463409424, "sft_loss": 2.400534152984619, "step": 985 }, { "epoch": 0.5298544907175113, "grad_norm": 10.580647081118688, "learning_rate": 9.822503420858067e-07, "logits/chosen": 0.09105484187602997, "logits/rejected": 0.11873571574687958, "logps/chosen": -2.158053398132324, "logps/rejected": -2.67893385887146, "loss": 0.6008, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.158053398132324, "rewards/margins": 0.5208802819252014, "rewards/rejected": -2.67893385887146, "sft_loss": 2.3424346446990967, "step": 990 }, { "epoch": 0.5325305234989128, "grad_norm": 9.96487400254228, "learning_rate": 9.818367239158277e-07, "logits/chosen": 0.09545004367828369, "logits/rejected": 0.18055422604084015, "logps/chosen": -2.286459445953369, "logps/rejected": -2.673666000366211, "loss": 0.6709, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.286459445953369, "rewards/margins": 0.38720664381980896, "rewards/rejected": -2.673666000366211, "sft_loss": 2.4550933837890625, "step": 995 }, { "epoch": 0.5352065562803144, "grad_norm": 10.265248076385419, "learning_rate": 9.8141843096384e-07, "logits/chosen": 0.06765373051166534, "logits/rejected": 0.19860796630382538, "logps/chosen": -2.3884854316711426, "logps/rejected": -2.9137723445892334, "loss": 0.6063, "rewards/accuracies": 0.65625, "rewards/chosen": -2.3884854316711426, "rewards/margins": 0.5252869129180908, "rewards/rejected": -2.9137723445892334, "sft_loss": 2.4504899978637695, "step": 1000 }, { "epoch": 0.537882589061716, "grad_norm": 11.740882940902146, "learning_rate": 9.809954672881237e-07, "logits/chosen": 0.05435361713171005, "logits/rejected": 0.23164144158363342, "logps/chosen": -2.520017147064209, "logps/rejected": -2.9663870334625244, "loss": 0.6627, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.520017147064209, "rewards/margins": 0.44636982679367065, "rewards/rejected": -2.9663870334625244, "sft_loss": 2.5899224281311035, "step": 1005 }, { "epoch": 0.5405586218431175, "grad_norm": 8.925497581186566, "learning_rate": 9.80567836992274e-07, "logits/chosen": 0.03229910135269165, "logits/rejected": 0.23886017501354218, "logps/chosen": -2.2775940895080566, "logps/rejected": -2.876896381378174, "loss": 0.6006, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2775940895080566, "rewards/margins": 0.5993021726608276, "rewards/rejected": -2.876896381378174, "sft_loss": 2.4006941318511963, "step": 1010 }, { "epoch": 0.5432346546245191, "grad_norm": 8.44126902041844, "learning_rate": 9.801355442251625e-07, "logits/chosen": 0.0005810469156131148, "logits/rejected": 0.17584313452243805, "logps/chosen": -2.3316941261291504, "logps/rejected": -2.8272385597229004, "loss": 0.6297, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.3316941261291504, "rewards/margins": 0.49554443359375, "rewards/rejected": -2.8272385597229004, "sft_loss": 2.4663829803466797, "step": 1015 }, { "epoch": 0.5459106874059207, "grad_norm": 12.02768296687771, "learning_rate": 9.796985931808949e-07, "logits/chosen": 0.005703767295926809, "logits/rejected": 0.15967944264411926, "logps/chosen": -2.399937391281128, "logps/rejected": -2.9279513359069824, "loss": 0.5931, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.399937391281128, "rewards/margins": 0.5280137062072754, "rewards/rejected": -2.9279513359069824, "sft_loss": 2.5254905223846436, "step": 1020 }, { "epoch": 0.5485867201873222, "grad_norm": 10.475937269154038, "learning_rate": 9.792569880987724e-07, "logits/chosen": -0.039773181080818176, "logits/rejected": 0.0928923636674881, "logps/chosen": -2.4386563301086426, "logps/rejected": -3.032752513885498, "loss": 0.5996, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.4386563301086426, "rewards/margins": 0.5940964221954346, "rewards/rejected": -3.032752513885498, "sft_loss": 2.555467128753662, "step": 1025 }, { "epoch": 0.5512627529687238, "grad_norm": 14.15306373994566, "learning_rate": 9.788107332632493e-07, "logits/chosen": 0.023439515382051468, "logits/rejected": 0.11487237364053726, "logps/chosen": -2.4161739349365234, "logps/rejected": -2.874342918395996, "loss": 0.6551, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.4161739349365234, "rewards/margins": 0.458169162273407, "rewards/rejected": -2.874342918395996, "sft_loss": 2.5372936725616455, "step": 1030 }, { "epoch": 0.5539387857501255, "grad_norm": 8.847741885689816, "learning_rate": 9.783598330038924e-07, "logits/chosen": -0.024633217602968216, "logits/rejected": 0.10054673254489899, "logps/chosen": -2.4228594303131104, "logps/rejected": -2.8241658210754395, "loss": 0.6474, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.4228594303131104, "rewards/margins": 0.4013066291809082, "rewards/rejected": -2.8241658210754395, "sft_loss": 2.4797160625457764, "step": 1035 }, { "epoch": 0.5566148185315271, "grad_norm": 13.358408093082733, "learning_rate": 9.779042916953376e-07, "logits/chosen": 0.022121794521808624, "logits/rejected": 0.20207636058330536, "logps/chosen": -2.1366634368896484, "logps/rejected": -2.747373104095459, "loss": 0.5924, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1366634368896484, "rewards/margins": 0.6107093095779419, "rewards/rejected": -2.747373104095459, "sft_loss": 2.242213487625122, "step": 1040 }, { "epoch": 0.5592908513129285, "grad_norm": 7.435232618941414, "learning_rate": 9.774441137572487e-07, "logits/chosen": -0.04853493347764015, "logits/rejected": 0.11352036148309708, "logps/chosen": -2.305412769317627, "logps/rejected": -2.8964731693267822, "loss": 0.586, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.305412769317627, "rewards/margins": 0.5910605788230896, "rewards/rejected": -2.8964731693267822, "sft_loss": 2.4367892742156982, "step": 1045 }, { "epoch": 0.5619668840943302, "grad_norm": 10.496072454599288, "learning_rate": 9.76979303654274e-07, "logits/chosen": -0.07789639383554459, "logits/rejected": 0.03533010184764862, "logps/chosen": -2.4677746295928955, "logps/rejected": -3.0698654651641846, "loss": 0.595, "rewards/accuracies": 0.6875, "rewards/chosen": -2.4677746295928955, "rewards/margins": 0.6020905375480652, "rewards/rejected": -3.0698654651641846, "sft_loss": 2.5504465103149414, "step": 1050 }, { "epoch": 0.5646429168757318, "grad_norm": 12.156821575710055, "learning_rate": 9.765098658960035e-07, "logits/chosen": 0.011598305776715279, "logits/rejected": 0.09234117716550827, "logps/chosen": -2.443835496902466, "logps/rejected": -2.99617600440979, "loss": 0.5969, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.443835496902466, "rewards/margins": 0.5523403882980347, "rewards/rejected": -2.99617600440979, "sft_loss": 2.48038649559021, "step": 1055 }, { "epoch": 0.5673189496571333, "grad_norm": 12.482189019423522, "learning_rate": 9.76035805036924e-07, "logits/chosen": 0.06962551921606064, "logits/rejected": 0.2640361189842224, "logps/chosen": -2.56504487991333, "logps/rejected": -3.0283312797546387, "loss": 0.6309, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.56504487991333, "rewards/margins": 0.4632865786552429, "rewards/rejected": -3.0283312797546387, "sft_loss": 2.5605111122131348, "step": 1060 }, { "epoch": 0.5699949824385349, "grad_norm": 10.862661070793598, "learning_rate": 9.755571256763764e-07, "logits/chosen": 0.050166137516498566, "logits/rejected": 0.18997737765312195, "logps/chosen": -2.4118528366088867, "logps/rejected": -3.04915189743042, "loss": 0.5701, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.4118528366088867, "rewards/margins": 0.6372988820075989, "rewards/rejected": -3.04915189743042, "sft_loss": 2.5698437690734863, "step": 1065 }, { "epoch": 0.5726710152199365, "grad_norm": 8.554618594416997, "learning_rate": 9.750738324585097e-07, "logits/chosen": -0.05651255324482918, "logits/rejected": 0.20479531586170197, "logps/chosen": -2.5034983158111572, "logps/rejected": -3.0640318393707275, "loss": 0.592, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.5034983158111572, "rewards/margins": 0.5605336427688599, "rewards/rejected": -3.0640318393707275, "sft_loss": 2.6030170917510986, "step": 1070 }, { "epoch": 0.5753470480013381, "grad_norm": 7.239548206687595, "learning_rate": 9.74585930072237e-07, "logits/chosen": 0.032971903681755066, "logits/rejected": 0.17771092057228088, "logps/chosen": -2.311800956726074, "logps/rejected": -2.924129009246826, "loss": 0.6031, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.311800956726074, "rewards/margins": 0.6123279929161072, "rewards/rejected": -2.924129009246826, "sft_loss": 2.4453539848327637, "step": 1075 }, { "epoch": 0.5780230807827396, "grad_norm": 9.823708542127223, "learning_rate": 9.740934232511892e-07, "logits/chosen": -0.05281168222427368, "logits/rejected": 0.06951533257961273, "logps/chosen": -2.492975950241089, "logps/rejected": -3.0146288871765137, "loss": 0.6193, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.492975950241089, "rewards/margins": 0.52165287733078, "rewards/rejected": -3.0146288871765137, "sft_loss": 2.663308620452881, "step": 1080 }, { "epoch": 0.5806991135641412, "grad_norm": 11.971129664715203, "learning_rate": 9.735963167736698e-07, "logits/chosen": 0.019016049802303314, "logits/rejected": 0.2044394314289093, "logps/chosen": -2.41227388381958, "logps/rejected": -2.7654335498809814, "loss": 0.6744, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.41227388381958, "rewards/margins": 0.35315990447998047, "rewards/rejected": -2.7654335498809814, "sft_loss": 2.4880402088165283, "step": 1085 }, { "epoch": 0.5833751463455428, "grad_norm": 8.836003071123491, "learning_rate": 9.730946154626078e-07, "logits/chosen": 0.028936797752976418, "logits/rejected": 0.1407909095287323, "logps/chosen": -2.507131576538086, "logps/rejected": -2.9046547412872314, "loss": 0.672, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.507131576538086, "rewards/margins": 0.397522896528244, "rewards/rejected": -2.9046547412872314, "sft_loss": 2.6242282390594482, "step": 1090 }, { "epoch": 0.5860511791269443, "grad_norm": 13.877213002027947, "learning_rate": 9.725883241855117e-07, "logits/chosen": -0.10876253992319107, "logits/rejected": 0.05406232550740242, "logps/chosen": -2.434305191040039, "logps/rejected": -3.0059266090393066, "loss": 0.5916, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.434305191040039, "rewards/margins": 0.5716217756271362, "rewards/rejected": -3.0059266090393066, "sft_loss": 2.587421178817749, "step": 1095 }, { "epoch": 0.5887272119083459, "grad_norm": 9.283286246160356, "learning_rate": 9.720774478544218e-07, "logits/chosen": 0.015044411644339561, "logits/rejected": 0.1409268081188202, "logps/chosen": -2.320230722427368, "logps/rejected": -2.980865478515625, "loss": 0.6017, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.320230722427368, "rewards/margins": 0.6606348752975464, "rewards/rejected": -2.980865478515625, "sft_loss": 2.453669309616089, "step": 1100 }, { "epoch": 0.5914032446897475, "grad_norm": 9.587947641293601, "learning_rate": 9.715619914258624e-07, "logits/chosen": -0.07415847480297089, "logits/rejected": 0.025357436388731003, "logps/chosen": -2.4371161460876465, "logps/rejected": -2.8709144592285156, "loss": 0.6434, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.4371161460876465, "rewards/margins": 0.4337983727455139, "rewards/rejected": -2.8709144592285156, "sft_loss": 2.452127456665039, "step": 1105 }, { "epoch": 0.594079277471149, "grad_norm": 12.43499144947583, "learning_rate": 9.710419599007937e-07, "logits/chosen": -0.020220275968313217, "logits/rejected": 0.12485732138156891, "logps/chosen": -2.4217236042022705, "logps/rejected": -2.8118643760681152, "loss": 0.6412, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.4217236042022705, "rewards/margins": 0.39014047384262085, "rewards/rejected": -2.8118643760681152, "sft_loss": 2.513850212097168, "step": 1110 }, { "epoch": 0.5967553102525506, "grad_norm": 11.869515874986861, "learning_rate": 9.705173583245643e-07, "logits/chosen": 0.03815209120512009, "logits/rejected": 0.20307651162147522, "logps/chosen": -2.2028329372406006, "logps/rejected": -2.7756314277648926, "loss": 0.5967, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.2028329372406006, "rewards/margins": 0.5727984309196472, "rewards/rejected": -2.7756314277648926, "sft_loss": 2.2485616207122803, "step": 1115 }, { "epoch": 0.5994313430339522, "grad_norm": 7.423096137669163, "learning_rate": 9.699881917868609e-07, "logits/chosen": -0.13720020651817322, "logits/rejected": -0.012343773618340492, "logps/chosen": -2.245342969894409, "logps/rejected": -2.795599937438965, "loss": 0.6001, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.245342969894409, "rewards/margins": 0.55025714635849, "rewards/rejected": -2.795599937438965, "sft_loss": 2.408857822418213, "step": 1120 }, { "epoch": 0.6021073758153538, "grad_norm": 9.075458489681749, "learning_rate": 9.694544654216594e-07, "logits/chosen": -0.1065760999917984, "logits/rejected": 0.10421280562877655, "logps/chosen": -2.33565092086792, "logps/rejected": -2.943451404571533, "loss": 0.5765, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.33565092086792, "rewards/margins": 0.6078003644943237, "rewards/rejected": -2.943451404571533, "sft_loss": 2.3767237663269043, "step": 1125 }, { "epoch": 0.6047834085967553, "grad_norm": 12.459931336585251, "learning_rate": 9.689161844071755e-07, "logits/chosen": 0.05389224365353584, "logits/rejected": 0.1359303891658783, "logps/chosen": -2.204026699066162, "logps/rejected": -2.6704185009002686, "loss": 0.6297, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.204026699066162, "rewards/margins": 0.4663916230201721, "rewards/rejected": -2.6704185009002686, "sft_loss": 2.2297959327697754, "step": 1130 }, { "epoch": 0.6074594413781569, "grad_norm": 9.910833434965273, "learning_rate": 9.683733539658138e-07, "logits/chosen": -0.034460533410310745, "logits/rejected": 0.15427373349666595, "logps/chosen": -2.3623645305633545, "logps/rejected": -2.85990047454834, "loss": 0.6379, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.3623645305633545, "rewards/margins": 0.4975360929965973, "rewards/rejected": -2.85990047454834, "sft_loss": 2.3342556953430176, "step": 1135 }, { "epoch": 0.6101354741595585, "grad_norm": 10.119642658770305, "learning_rate": 9.678259793641178e-07, "logits/chosen": -0.005990887992084026, "logits/rejected": 0.048573773354291916, "logps/chosen": -2.343507766723633, "logps/rejected": -2.6922554969787598, "loss": 0.6553, "rewards/accuracies": 0.625, "rewards/chosen": -2.343507766723633, "rewards/margins": 0.3487474322319031, "rewards/rejected": -2.6922554969787598, "sft_loss": 2.4332337379455566, "step": 1140 }, { "epoch": 0.61281150694096, "grad_norm": 8.549538226355667, "learning_rate": 9.672740659127183e-07, "logits/chosen": -0.14116328954696655, "logits/rejected": 0.010193651542067528, "logps/chosen": -2.280282974243164, "logps/rejected": -2.8611793518066406, "loss": 0.6175, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.280282974243164, "rewards/margins": 0.580896258354187, "rewards/rejected": -2.8611793518066406, "sft_loss": 2.414921283721924, "step": 1145 }, { "epoch": 0.6154875397223616, "grad_norm": 9.265583223855089, "learning_rate": 9.667176189662818e-07, "logits/chosen": -0.11378173530101776, "logits/rejected": 0.03218189626932144, "logps/chosen": -2.2497658729553223, "logps/rejected": -2.8140835762023926, "loss": 0.5923, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.2497658729553223, "rewards/margins": 0.5643175840377808, "rewards/rejected": -2.8140835762023926, "sft_loss": 2.3082830905914307, "step": 1150 }, { "epoch": 0.6181635725037632, "grad_norm": 8.945456930621933, "learning_rate": 9.661566439234592e-07, "logits/chosen": -0.010784052312374115, "logits/rejected": 0.10385878384113312, "logps/chosen": -2.259997606277466, "logps/rejected": -2.689664840698242, "loss": 0.6281, "rewards/accuracies": 0.65625, "rewards/chosen": -2.259997606277466, "rewards/margins": 0.42966747283935547, "rewards/rejected": -2.689664840698242, "sft_loss": 2.386997699737549, "step": 1155 }, { "epoch": 0.6208396052851648, "grad_norm": 9.913210782246972, "learning_rate": 9.655911462268327e-07, "logits/chosen": 0.058179665356874466, "logits/rejected": 0.17801596224308014, "logps/chosen": -2.320453643798828, "logps/rejected": -2.9494757652282715, "loss": 0.547, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.320453643798828, "rewards/margins": 0.6290220022201538, "rewards/rejected": -2.9494757652282715, "sft_loss": 2.536233901977539, "step": 1160 }, { "epoch": 0.6235156380665663, "grad_norm": 9.03448641219398, "learning_rate": 9.650211313628636e-07, "logits/chosen": -0.026484167203307152, "logits/rejected": 0.07927101105451584, "logps/chosen": -2.41178560256958, "logps/rejected": -2.841890811920166, "loss": 0.6468, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.41178560256958, "rewards/margins": 0.4301057457923889, "rewards/rejected": -2.841890811920166, "sft_loss": 2.56024169921875, "step": 1165 }, { "epoch": 0.6261916708479679, "grad_norm": 7.4627355054266555, "learning_rate": 9.644466048618386e-07, "logits/chosen": -0.013878998346626759, "logits/rejected": 0.15998892486095428, "logps/chosen": -2.7046947479248047, "logps/rejected": -3.2395119667053223, "loss": 0.6503, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.7046947479248047, "rewards/margins": 0.534817099571228, "rewards/rejected": -3.2395119667053223, "sft_loss": 2.7282543182373047, "step": 1170 }, { "epoch": 0.6288677036293695, "grad_norm": 8.984602343546689, "learning_rate": 9.63867572297816e-07, "logits/chosen": -0.025428790599107742, "logits/rejected": 0.18897771835327148, "logps/chosen": -2.431147336959839, "logps/rejected": -2.9441728591918945, "loss": 0.625, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.431147336959839, "rewards/margins": 0.5130254030227661, "rewards/rejected": -2.9441728591918945, "sft_loss": 2.5837855339050293, "step": 1175 }, { "epoch": 0.631543736410771, "grad_norm": 10.646614025720375, "learning_rate": 9.632840392885727e-07, "logits/chosen": -0.05773182958364487, "logits/rejected": 0.12018024921417236, "logps/chosen": -2.6617088317871094, "logps/rejected": -3.2409186363220215, "loss": 0.6207, "rewards/accuracies": 0.6875, "rewards/chosen": -2.6617088317871094, "rewards/margins": 0.5792103409767151, "rewards/rejected": -3.2409186363220215, "sft_loss": 2.7341196537017822, "step": 1180 }, { "epoch": 0.6342197691921726, "grad_norm": 8.685344328099887, "learning_rate": 9.626960114955483e-07, "logits/chosen": 0.0071663991548120975, "logits/rejected": 0.1699160784482956, "logps/chosen": -2.555429458618164, "logps/rejected": -3.1707100868225098, "loss": 0.6035, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.555429458618164, "rewards/margins": 0.6152806282043457, "rewards/rejected": -3.1707100868225098, "sft_loss": 2.6178455352783203, "step": 1185 }, { "epoch": 0.6368958019735742, "grad_norm": 11.533136482896289, "learning_rate": 9.621034946237909e-07, "logits/chosen": -0.05947018787264824, "logits/rejected": 0.1066945418715477, "logps/chosen": -2.5827643871307373, "logps/rejected": -3.259899854660034, "loss": 0.5655, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.5827643871307373, "rewards/margins": 0.6771354675292969, "rewards/rejected": -3.259899854660034, "sft_loss": 2.7543718814849854, "step": 1190 }, { "epoch": 0.6395718347549757, "grad_norm": 9.564838984193049, "learning_rate": 9.615064944219021e-07, "logits/chosen": 0.01643727719783783, "logits/rejected": 0.1448705494403839, "logps/chosen": -2.372252941131592, "logps/rejected": -3.0352444648742676, "loss": 0.5645, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.372252941131592, "rewards/margins": 0.6629914045333862, "rewards/rejected": -3.0352444648742676, "sft_loss": 2.5685365200042725, "step": 1195 }, { "epoch": 0.6422478675363773, "grad_norm": 15.41012291667016, "learning_rate": 9.609050166819803e-07, "logits/chosen": -0.037835728377103806, "logits/rejected": 0.043718576431274414, "logps/chosen": -2.5800116062164307, "logps/rejected": -3.1163458824157715, "loss": 0.6242, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.5800116062164307, "rewards/margins": 0.5363345146179199, "rewards/rejected": -3.1163458824157715, "sft_loss": 2.6192474365234375, "step": 1200 }, { "epoch": 0.6422478675363773, "eval_logits/chosen": 0.41017210483551025, "eval_logits/rejected": 0.523736834526062, "eval_logps/chosen": -2.484957456588745, "eval_logps/rejected": -3.1039416790008545, "eval_loss": 0.5881574749946594, "eval_rewards/accuracies": 0.6973294019699097, "eval_rewards/chosen": -2.484957456588745, "eval_rewards/margins": 0.618984043598175, "eval_rewards/rejected": -3.1039416790008545, "eval_runtime": 52.8258, "eval_samples_per_second": 25.461, "eval_sft_loss": 2.6278696060180664, "eval_steps_per_second": 6.379, "step": 1200 }, { "epoch": 0.6449239003177789, "grad_norm": 16.223702023822415, "learning_rate": 9.602990672395653e-07, "logits/chosen": -0.12028829753398895, "logits/rejected": 0.09198782593011856, "logps/chosen": -2.447338342666626, "logps/rejected": -3.0379064083099365, "loss": 0.5845, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.447338342666626, "rewards/margins": 0.5905681848526001, "rewards/rejected": -3.0379064083099365, "sft_loss": 2.548774003982544, "step": 1205 }, { "epoch": 0.6475999330991805, "grad_norm": 10.103767921832775, "learning_rate": 9.59688651973581e-07, "logits/chosen": -0.04002942889928818, "logits/rejected": 0.17909970879554749, "logps/chosen": -2.391770124435425, "logps/rejected": -2.9228744506835938, "loss": 0.5863, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.391770124435425, "rewards/margins": 0.5311041474342346, "rewards/rejected": -2.9228744506835938, "sft_loss": 2.507202625274658, "step": 1210 }, { "epoch": 0.650275965880582, "grad_norm": 9.570882970595996, "learning_rate": 9.590737768062792e-07, "logits/chosen": -0.08841636031866074, "logits/rejected": 0.05188380554318428, "logps/chosen": -2.521583080291748, "logps/rejected": -2.9929375648498535, "loss": 0.6329, "rewards/accuracies": 0.6875, "rewards/chosen": -2.521583080291748, "rewards/margins": 0.47135478258132935, "rewards/rejected": -2.9929375648498535, "sft_loss": 2.6055068969726562, "step": 1215 }, { "epoch": 0.6529519986619836, "grad_norm": 9.152708059844768, "learning_rate": 9.584544477031816e-07, "logits/chosen": 0.09380490332841873, "logits/rejected": 0.22841504216194153, "logps/chosen": -2.2076313495635986, "logps/rejected": -2.7225444316864014, "loss": 0.6128, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.2076313495635986, "rewards/margins": 0.5149132013320923, "rewards/rejected": -2.7225444316864014, "sft_loss": 2.280226707458496, "step": 1220 }, { "epoch": 0.6556280314433852, "grad_norm": 10.507226530549053, "learning_rate": 9.578306706730215e-07, "logits/chosen": -0.13376358151435852, "logits/rejected": 0.10572312772274017, "logps/chosen": -2.4945626258850098, "logps/rejected": -2.9013962745666504, "loss": 0.6674, "rewards/accuracies": 0.65625, "rewards/chosen": -2.4945626258850098, "rewards/margins": 0.40683332085609436, "rewards/rejected": -2.9013962745666504, "sft_loss": 2.568493366241455, "step": 1225 }, { "epoch": 0.6583040642247867, "grad_norm": 11.20669415454124, "learning_rate": 9.572024517676865e-07, "logits/chosen": -0.012587158009409904, "logits/rejected": 0.10158940404653549, "logps/chosen": -2.4850094318389893, "logps/rejected": -2.9105517864227295, "loss": 0.649, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.4850094318389893, "rewards/margins": 0.4255426526069641, "rewards/rejected": -2.9105517864227295, "sft_loss": 2.536282777786255, "step": 1230 }, { "epoch": 0.6609800970061883, "grad_norm": 8.63873296459481, "learning_rate": 9.565697970821593e-07, "logits/chosen": 0.01712236925959587, "logits/rejected": 0.17170551419258118, "logps/chosen": -2.3881125450134277, "logps/rejected": -2.8042643070220947, "loss": 0.6276, "rewards/accuracies": 0.65625, "rewards/chosen": -2.3881125450134277, "rewards/margins": 0.41615214943885803, "rewards/rejected": -2.8042643070220947, "sft_loss": 2.5537092685699463, "step": 1235 }, { "epoch": 0.6636561297875899, "grad_norm": 9.518280928455988, "learning_rate": 9.559327127544585e-07, "logits/chosen": -0.09290903806686401, "logits/rejected": 0.05830955505371094, "logps/chosen": -2.2869534492492676, "logps/rejected": -2.7384681701660156, "loss": 0.6042, "rewards/accuracies": 0.65625, "rewards/chosen": -2.2869534492492676, "rewards/margins": 0.45151472091674805, "rewards/rejected": -2.7384681701660156, "sft_loss": 2.4760611057281494, "step": 1240 }, { "epoch": 0.6663321625689914, "grad_norm": 10.55673759047217, "learning_rate": 9.552912049655789e-07, "logits/chosen": -0.01964261755347252, "logits/rejected": 0.18300272524356842, "logps/chosen": -2.2489116191864014, "logps/rejected": -2.731536626815796, "loss": 0.6167, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.2489116191864014, "rewards/margins": 0.4826253056526184, "rewards/rejected": -2.731536626815796, "sft_loss": 2.314009189605713, "step": 1245 }, { "epoch": 0.669008195350393, "grad_norm": 12.081308503661694, "learning_rate": 9.546452799394315e-07, "logits/chosen": -0.00539967929944396, "logits/rejected": 0.21461212635040283, "logps/chosen": -2.4177112579345703, "logps/rejected": -2.815523147583008, "loss": 0.6565, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.4177112579345703, "rewards/margins": 0.3978119492530823, "rewards/rejected": -2.815523147583008, "sft_loss": 2.4331181049346924, "step": 1250 }, { "epoch": 0.6716842281317946, "grad_norm": 11.044936559753248, "learning_rate": 9.539949439427846e-07, "logits/chosen": -0.0206326711922884, "logits/rejected": 0.1170809417963028, "logps/chosen": -2.331955671310425, "logps/rejected": -2.8401269912719727, "loss": 0.61, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.331955671310425, "rewards/margins": 0.5081711411476135, "rewards/rejected": -2.8401269912719727, "sft_loss": 2.543001651763916, "step": 1255 }, { "epoch": 0.6743602609131962, "grad_norm": 8.165133631615923, "learning_rate": 9.533402032852002e-07, "logits/chosen": -0.05226556584239006, "logits/rejected": 0.09908227622509003, "logps/chosen": -2.407578468322754, "logps/rejected": -3.0979156494140625, "loss": 0.5715, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.407578468322754, "rewards/margins": 0.6903371214866638, "rewards/rejected": -3.0979156494140625, "sft_loss": 2.6117091178894043, "step": 1260 }, { "epoch": 0.6770362936945977, "grad_norm": 12.847697606224473, "learning_rate": 9.526810643189754e-07, "logits/chosen": 0.031186480075120926, "logits/rejected": 0.2108938992023468, "logps/chosen": -2.4767446517944336, "logps/rejected": -3.136263608932495, "loss": 0.5793, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.4767446517944336, "rewards/margins": 0.659518837928772, "rewards/rejected": -3.136263608932495, "sft_loss": 2.5776114463806152, "step": 1265 }, { "epoch": 0.6797123264759993, "grad_norm": 13.802014339563467, "learning_rate": 9.52017533439079e-07, "logits/chosen": -0.0694260522723198, "logits/rejected": 0.029919719323515892, "logps/chosen": -2.591400146484375, "logps/rejected": -3.1500930786132812, "loss": 0.6103, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.591400146484375, "rewards/margins": 0.5586927533149719, "rewards/rejected": -3.1500930786132812, "sft_loss": 2.695070743560791, "step": 1270 }, { "epoch": 0.6823883592574009, "grad_norm": 9.669757404786512, "learning_rate": 9.513496170830909e-07, "logits/chosen": -0.05950520560145378, "logits/rejected": 0.06504709273576736, "logps/chosen": -2.6838889122009277, "logps/rejected": -3.1975209712982178, "loss": 0.6543, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.6838889122009277, "rewards/margins": 0.51363205909729, "rewards/rejected": -3.1975209712982178, "sft_loss": 2.694319725036621, "step": 1275 }, { "epoch": 0.6850643920388024, "grad_norm": 10.902718843836002, "learning_rate": 9.506773217311382e-07, "logits/chosen": -0.05056373029947281, "logits/rejected": 0.13073304295539856, "logps/chosen": -2.54476261138916, "logps/rejected": -3.0694384574890137, "loss": 0.622, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.54476261138916, "rewards/margins": 0.5246758460998535, "rewards/rejected": -3.0694384574890137, "sft_loss": 2.6479544639587402, "step": 1280 }, { "epoch": 0.687740424820204, "grad_norm": 10.933403727554007, "learning_rate": 9.500006539058334e-07, "logits/chosen": -0.0024568967055529356, "logits/rejected": 0.16251808404922485, "logps/chosen": -2.3788487911224365, "logps/rejected": -2.8265020847320557, "loss": 0.6192, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.3788487911224365, "rewards/margins": 0.44765329360961914, "rewards/rejected": -2.8265020847320557, "sft_loss": 2.450951099395752, "step": 1285 }, { "epoch": 0.6904164576016056, "grad_norm": 11.691876400461265, "learning_rate": 9.493196201722109e-07, "logits/chosen": -0.14926186203956604, "logits/rejected": 0.02142159640789032, "logps/chosen": -2.5109152793884277, "logps/rejected": -2.903718948364258, "loss": 0.6638, "rewards/accuracies": 0.625, "rewards/chosen": -2.5109152793884277, "rewards/margins": 0.39280396699905396, "rewards/rejected": -2.903718948364258, "sft_loss": 2.608675479888916, "step": 1290 }, { "epoch": 0.6930924903830072, "grad_norm": 7.651439069393412, "learning_rate": 9.486342271376628e-07, "logits/chosen": -0.005381585098803043, "logits/rejected": -0.002708807587623596, "logps/chosen": -2.4760279655456543, "logps/rejected": -3.0964133739471436, "loss": 0.5893, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.4760279655456543, "rewards/margins": 0.6203856468200684, "rewards/rejected": -3.0964133739471436, "sft_loss": 2.5709683895111084, "step": 1295 }, { "epoch": 0.6957685231644087, "grad_norm": 11.078381942425255, "learning_rate": 9.479444814518755e-07, "logits/chosen": -0.041806433349847794, "logits/rejected": 0.25304484367370605, "logps/chosen": -2.383418321609497, "logps/rejected": -3.0692639350891113, "loss": 0.5741, "rewards/accuracies": 0.6875, "rewards/chosen": -2.383418321609497, "rewards/margins": 0.6858457326889038, "rewards/rejected": -3.0692639350891113, "sft_loss": 2.5278241634368896, "step": 1300 }, { "epoch": 0.6984445559458103, "grad_norm": 8.606399599948285, "learning_rate": 9.472503898067645e-07, "logits/chosen": 0.09567885100841522, "logits/rejected": 0.15037801861763, "logps/chosen": -2.4444668292999268, "logps/rejected": -2.9069840908050537, "loss": 0.6422, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.4444668292999268, "rewards/margins": 0.4625173509120941, "rewards/rejected": -2.9069840908050537, "sft_loss": 2.4670987129211426, "step": 1305 }, { "epoch": 0.701120588727212, "grad_norm": 9.235939474310552, "learning_rate": 9.465519589364099e-07, "logits/chosen": 0.08462213724851608, "logits/rejected": 0.1842813789844513, "logps/chosen": -2.3209452629089355, "logps/rejected": -2.9042229652404785, "loss": 0.5986, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3209452629089355, "rewards/margins": 0.583277702331543, "rewards/rejected": -2.9042229652404785, "sft_loss": 2.447727680206299, "step": 1310 }, { "epoch": 0.7037966215086134, "grad_norm": 11.057609220966931, "learning_rate": 9.458491956169914e-07, "logits/chosen": -0.0232564527541399, "logits/rejected": 0.1688549518585205, "logps/chosen": -2.376094341278076, "logps/rejected": -2.959827423095703, "loss": 0.5972, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.376094341278076, "rewards/margins": 0.5837332010269165, "rewards/rejected": -2.959827423095703, "sft_loss": 2.442779302597046, "step": 1315 }, { "epoch": 0.706472654290015, "grad_norm": 8.547817393114872, "learning_rate": 9.451421066667215e-07, "logits/chosen": -0.14264704287052155, "logits/rejected": 0.07813362777233124, "logps/chosen": -2.3628363609313965, "logps/rejected": -2.8726606369018555, "loss": 0.6029, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.3628363609313965, "rewards/margins": 0.5098242163658142, "rewards/rejected": -2.8726606369018555, "sft_loss": 2.477353572845459, "step": 1320 }, { "epoch": 0.7091486870714167, "grad_norm": 15.063495170861021, "learning_rate": 9.444306989457805e-07, "logits/chosen": 0.05425446107983589, "logits/rejected": 0.19093184173107147, "logps/chosen": -2.3016774654388428, "logps/rejected": -2.8103010654449463, "loss": 0.6641, "rewards/accuracies": 0.625, "rewards/chosen": -2.3016774654388428, "rewards/margins": 0.5086237192153931, "rewards/rejected": -2.8103010654449463, "sft_loss": 2.3117482662200928, "step": 1325 }, { "epoch": 0.7118247198528181, "grad_norm": 11.299849791199572, "learning_rate": 9.437149793562489e-07, "logits/chosen": 0.008106740191578865, "logits/rejected": 0.13419048488140106, "logps/chosen": -2.317131280899048, "logps/rejected": -2.8257088661193848, "loss": 0.6247, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.317131280899048, "rewards/margins": 0.5085776448249817, "rewards/rejected": -2.8257088661193848, "sft_loss": 2.4108710289001465, "step": 1330 }, { "epoch": 0.7145007526342197, "grad_norm": 11.67637808191526, "learning_rate": 9.429949548420417e-07, "logits/chosen": 0.0023697116412222385, "logits/rejected": 0.10824587196111679, "logps/chosen": -2.3033547401428223, "logps/rejected": -2.787705421447754, "loss": 0.6166, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.3033547401428223, "rewards/margins": 0.4843505918979645, "rewards/rejected": -2.787705421447754, "sft_loss": 2.342329502105713, "step": 1335 }, { "epoch": 0.7171767854156214, "grad_norm": 12.35202949763023, "learning_rate": 9.422706323888396e-07, "logits/chosen": 0.02033412829041481, "logits/rejected": 0.07009680569171906, "logps/chosen": -2.1604971885681152, "logps/rejected": -2.5448131561279297, "loss": 0.6474, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1604971885681152, "rewards/margins": 0.38431602716445923, "rewards/rejected": -2.5448131561279297, "sft_loss": 2.2100844383239746, "step": 1340 }, { "epoch": 0.719852818197023, "grad_norm": 8.235804001921842, "learning_rate": 9.415420190240225e-07, "logits/chosen": 0.06834589689970016, "logits/rejected": 0.2915201485157013, "logps/chosen": -2.198119640350342, "logps/rejected": -2.784534215927124, "loss": 0.5475, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.198119640350342, "rewards/margins": 0.586414635181427, "rewards/rejected": -2.784534215927124, "sft_loss": 2.3332037925720215, "step": 1345 }, { "epoch": 0.7225288509784245, "grad_norm": 12.578137433589989, "learning_rate": 9.408091218166002e-07, "logits/chosen": 0.04810570925474167, "logits/rejected": 0.1264292299747467, "logps/chosen": -2.316530227661133, "logps/rejected": -2.6347362995147705, "loss": 0.6768, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.316530227661133, "rewards/margins": 0.31820613145828247, "rewards/rejected": -2.6347362995147705, "sft_loss": 2.4868788719177246, "step": 1350 }, { "epoch": 0.7252048837598261, "grad_norm": 9.732474560078147, "learning_rate": 9.400719478771449e-07, "logits/chosen": -0.0023418336641043425, "logits/rejected": 0.32972416281700134, "logps/chosen": -2.483259916305542, "logps/rejected": -2.9924492835998535, "loss": 0.6039, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.483259916305542, "rewards/margins": 0.5091896057128906, "rewards/rejected": -2.9924492835998535, "sft_loss": 2.556227445602417, "step": 1355 }, { "epoch": 0.7278809165412277, "grad_norm": 10.784811466778459, "learning_rate": 9.393305043577209e-07, "logits/chosen": -0.07075022161006927, "logits/rejected": 0.07518994808197021, "logps/chosen": -2.6700491905212402, "logps/rejected": -3.307894229888916, "loss": 0.5849, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.6700491905212402, "rewards/margins": 0.6378449201583862, "rewards/rejected": -3.307894229888916, "sft_loss": 2.8535122871398926, "step": 1360 }, { "epoch": 0.7305569493226292, "grad_norm": 8.152593413202151, "learning_rate": 9.38584798451817e-07, "logits/chosen": 0.001338806701824069, "logits/rejected": 0.16451290249824524, "logps/chosen": -2.5478076934814453, "logps/rejected": -3.1517438888549805, "loss": 0.5846, "rewards/accuracies": 0.6875, "rewards/chosen": -2.5478076934814453, "rewards/margins": 0.6039361953735352, "rewards/rejected": -3.1517438888549805, "sft_loss": 2.6479318141937256, "step": 1365 }, { "epoch": 0.7332329821040308, "grad_norm": 44.588830934101054, "learning_rate": 9.37834837394275e-07, "logits/chosen": 0.01660953089594841, "logits/rejected": 0.1651841104030609, "logps/chosen": -2.634829521179199, "logps/rejected": -3.4330337047576904, "loss": 0.5846, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.634829521179199, "rewards/margins": 0.7982040643692017, "rewards/rejected": -3.4330337047576904, "sft_loss": 2.690223455429077, "step": 1370 }, { "epoch": 0.7359090148854324, "grad_norm": 7.931378725379154, "learning_rate": 9.370806284612203e-07, "logits/chosen": -0.03525816649198532, "logits/rejected": 0.1411258727312088, "logps/chosen": -2.5949692726135254, "logps/rejected": -3.309791088104248, "loss": 0.5696, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.5949692726135254, "rewards/margins": 0.7148216962814331, "rewards/rejected": -3.309791088104248, "sft_loss": 2.707218885421753, "step": 1375 }, { "epoch": 0.738585047666834, "grad_norm": 8.866169013098796, "learning_rate": 9.363221789699912e-07, "logits/chosen": -0.07670523971319199, "logits/rejected": 0.08060277998447418, "logps/chosen": -2.572488784790039, "logps/rejected": -3.0087974071502686, "loss": 0.6832, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.572488784790039, "rewards/margins": 0.43630871176719666, "rewards/rejected": -3.0087974071502686, "sft_loss": 2.633976459503174, "step": 1380 }, { "epoch": 0.7412610804482355, "grad_norm": 19.933929857933897, "learning_rate": 9.355594962790682e-07, "logits/chosen": -0.03613440319895744, "logits/rejected": 0.12450988590717316, "logps/chosen": -2.275702476501465, "logps/rejected": -2.871594190597534, "loss": 0.6035, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.275702476501465, "rewards/margins": 0.5958917737007141, "rewards/rejected": -2.871594190597534, "sft_loss": 2.397634983062744, "step": 1385 }, { "epoch": 0.7439371132296371, "grad_norm": 9.498790915187232, "learning_rate": 9.34792587788002e-07, "logits/chosen": 0.04346933960914612, "logits/rejected": 0.19381602108478546, "logps/chosen": -2.349087715148926, "logps/rejected": -2.825218677520752, "loss": 0.6281, "rewards/accuracies": 0.625, "rewards/chosen": -2.349087715148926, "rewards/margins": 0.47613126039505005, "rewards/rejected": -2.825218677520752, "sft_loss": 2.442924737930298, "step": 1390 }, { "epoch": 0.7466131460110387, "grad_norm": 8.605497264714288, "learning_rate": 9.34021460937342e-07, "logits/chosen": 0.06417609751224518, "logits/rejected": 0.16199856996536255, "logps/chosen": -2.3879554271698, "logps/rejected": -2.80842924118042, "loss": 0.6328, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.3879554271698, "rewards/margins": 0.4204736649990082, "rewards/rejected": -2.80842924118042, "sft_loss": 2.4789719581604004, "step": 1395 }, { "epoch": 0.7492891787924402, "grad_norm": 8.410478037971695, "learning_rate": 9.332461232085646e-07, "logits/chosen": -0.16510483622550964, "logits/rejected": 0.0025481381453573704, "logps/chosen": -2.527270793914795, "logps/rejected": -2.992276668548584, "loss": 0.6225, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.527270793914795, "rewards/margins": 0.46500545740127563, "rewards/rejected": -2.992276668548584, "sft_loss": 2.6221566200256348, "step": 1400 }, { "epoch": 0.7519652115738418, "grad_norm": 10.206815132123339, "learning_rate": 9.324665821239998e-07, "logits/chosen": -0.04792945086956024, "logits/rejected": 0.17230018973350525, "logps/chosen": -2.290905714035034, "logps/rejected": -2.963562250137329, "loss": 0.6188, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.290905714035034, "rewards/margins": 0.6726564168930054, "rewards/rejected": -2.963562250137329, "sft_loss": 2.4523534774780273, "step": 1405 }, { "epoch": 0.7546412443552434, "grad_norm": 12.703305381633276, "learning_rate": 9.316828452467583e-07, "logits/chosen": -0.04610484838485718, "logits/rejected": 0.16724538803100586, "logps/chosen": -2.3713042736053467, "logps/rejected": -2.9829983711242676, "loss": 0.5651, "rewards/accuracies": 0.71875, "rewards/chosen": -2.3713042736053467, "rewards/margins": 0.6116942167282104, "rewards/rejected": -2.9829983711242676, "sft_loss": 2.5226478576660156, "step": 1410 }, { "epoch": 0.7573172771366449, "grad_norm": 13.513252526025813, "learning_rate": 9.30894920180659e-07, "logits/chosen": 0.08314958959817886, "logits/rejected": 0.2487107217311859, "logps/chosen": -2.300837278366089, "logps/rejected": -2.700249671936035, "loss": 0.638, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.300837278366089, "rewards/margins": 0.39941245317459106, "rewards/rejected": -2.700249671936035, "sft_loss": 2.325178623199463, "step": 1415 }, { "epoch": 0.7599933099180465, "grad_norm": 7.612690497794865, "learning_rate": 9.301028145701543e-07, "logits/chosen": 0.07180485874414444, "logits/rejected": 0.24491927027702332, "logps/chosen": -2.378814220428467, "logps/rejected": -3.0824999809265137, "loss": 0.5982, "rewards/accuracies": 0.65625, "rewards/chosen": -2.378814220428467, "rewards/margins": 0.7036858797073364, "rewards/rejected": -3.0824999809265137, "sft_loss": 2.5600998401641846, "step": 1420 }, { "epoch": 0.7626693426994481, "grad_norm": 8.394035519991178, "learning_rate": 9.293065361002563e-07, "logits/chosen": 0.07057957351207733, "logits/rejected": 0.2046077698469162, "logps/chosen": -2.426069736480713, "logps/rejected": -3.0615506172180176, "loss": 0.598, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.426069736480713, "rewards/margins": 0.6354812383651733, "rewards/rejected": -3.0615506172180176, "sft_loss": 2.4885902404785156, "step": 1425 }, { "epoch": 0.7653453754808497, "grad_norm": 13.024495600475117, "learning_rate": 9.285060924964622e-07, "logits/chosen": -0.04211791604757309, "logits/rejected": 0.11966993659734726, "logps/chosen": -2.557664632797241, "logps/rejected": -3.0726795196533203, "loss": 0.6104, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.557664632797241, "rewards/margins": 0.5150147676467896, "rewards/rejected": -3.0726795196533203, "sft_loss": 2.638887882232666, "step": 1430 }, { "epoch": 0.7680214082622512, "grad_norm": 11.315382949699499, "learning_rate": 9.277014915246792e-07, "logits/chosen": 0.10380347073078156, "logits/rejected": 0.17050564289093018, "logps/chosen": -2.4016692638397217, "logps/rejected": -3.118354082107544, "loss": 0.575, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.4016692638397217, "rewards/margins": 0.7166846990585327, "rewards/rejected": -3.118354082107544, "sft_loss": 2.5354397296905518, "step": 1435 }, { "epoch": 0.7706974410436528, "grad_norm": 7.414405749714338, "learning_rate": 9.268927409911498e-07, "logits/chosen": 0.020688209682703018, "logits/rejected": 0.15002763271331787, "logps/chosen": -2.38792085647583, "logps/rejected": -2.8667197227478027, "loss": 0.6295, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.38792085647583, "rewards/margins": 0.47879910469055176, "rewards/rejected": -2.8667197227478027, "sft_loss": 2.567570924758911, "step": 1440 }, { "epoch": 0.7733734738250544, "grad_norm": 13.793743690118733, "learning_rate": 9.260798487423749e-07, "logits/chosen": -0.0633191242814064, "logits/rejected": 0.18369950354099274, "logps/chosen": -2.3912315368652344, "logps/rejected": -2.8711366653442383, "loss": 0.6286, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.3912315368652344, "rewards/margins": 0.4799051284790039, "rewards/rejected": -2.8711366653442383, "sft_loss": 2.5537047386169434, "step": 1445 }, { "epoch": 0.7760495066064559, "grad_norm": 13.521948035260047, "learning_rate": 9.252628226650389e-07, "logits/chosen": 0.04702399671077728, "logits/rejected": 0.16084954142570496, "logps/chosen": -2.424561023712158, "logps/rejected": -2.8729541301727295, "loss": 0.671, "rewards/accuracies": 0.65625, "rewards/chosen": -2.424561023712158, "rewards/margins": 0.4483933448791504, "rewards/rejected": -2.8729541301727295, "sft_loss": 2.5094757080078125, "step": 1450 }, { "epoch": 0.7787255393878575, "grad_norm": 11.039814398221159, "learning_rate": 9.244416706859321e-07, "logits/chosen": -0.022870201617479324, "logits/rejected": 0.1717710942029953, "logps/chosen": -2.3271360397338867, "logps/rejected": -2.869342803955078, "loss": 0.6107, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3271360397338867, "rewards/margins": 0.542206883430481, "rewards/rejected": -2.869342803955078, "sft_loss": 2.42985463142395, "step": 1455 }, { "epoch": 0.7814015721692591, "grad_norm": 7.698204509511495, "learning_rate": 9.23616400771875e-07, "logits/chosen": 0.023630857467651367, "logits/rejected": 0.23449544608592987, "logps/chosen": -2.3770511150360107, "logps/rejected": -2.9637835025787354, "loss": 0.5966, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.3770511150360107, "rewards/margins": 0.5867326855659485, "rewards/rejected": -2.9637835025787354, "sft_loss": 2.427887201309204, "step": 1460 }, { "epoch": 0.7840776049506607, "grad_norm": 8.187687061644198, "learning_rate": 9.227870209296395e-07, "logits/chosen": 0.016844961792230606, "logits/rejected": 0.16727712750434875, "logps/chosen": -2.491568088531494, "logps/rejected": -2.9410855770111084, "loss": 0.6333, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.491568088531494, "rewards/margins": 0.4495179057121277, "rewards/rejected": -2.9410855770111084, "sft_loss": 2.5922093391418457, "step": 1465 }, { "epoch": 0.7867536377320622, "grad_norm": 10.159632993115585, "learning_rate": 9.219535392058728e-07, "logits/chosen": -0.017890067771077156, "logits/rejected": 0.014517772011458874, "logps/chosen": -2.4694104194641113, "logps/rejected": -2.9818406105041504, "loss": 0.627, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.4694104194641113, "rewards/margins": 0.5124302506446838, "rewards/rejected": -2.9818406105041504, "sft_loss": 2.578446626663208, "step": 1470 }, { "epoch": 0.7894296705134638, "grad_norm": 9.98388075126376, "learning_rate": 9.211159636870181e-07, "logits/chosen": -0.051668472588062286, "logits/rejected": 0.15935666859149933, "logps/chosen": -2.43375301361084, "logps/rejected": -3.010441541671753, "loss": 0.6019, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.43375301361084, "rewards/margins": 0.5766886472702026, "rewards/rejected": -3.010441541671753, "sft_loss": 2.519343137741089, "step": 1475 }, { "epoch": 0.7921057032948654, "grad_norm": 9.826631961075899, "learning_rate": 9.202743024992367e-07, "logits/chosen": 0.06925741583108902, "logits/rejected": 0.19104215502738953, "logps/chosen": -2.2985386848449707, "logps/rejected": -2.9673094749450684, "loss": 0.5893, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.2985386848449707, "rewards/margins": 0.6687710881233215, "rewards/rejected": -2.9673094749450684, "sft_loss": 2.3903632164001465, "step": 1480 }, { "epoch": 0.7947817360762669, "grad_norm": 14.190865641317497, "learning_rate": 9.194285638083293e-07, "logits/chosen": 0.054853539913892746, "logits/rejected": 0.24246785044670105, "logps/chosen": -2.4935526847839355, "logps/rejected": -3.1982533931732178, "loss": 0.5546, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.4935526847839355, "rewards/margins": 0.7047004699707031, "rewards/rejected": -3.1982533931732178, "sft_loss": 2.608858585357666, "step": 1485 }, { "epoch": 0.7974577688576685, "grad_norm": 12.488609995267533, "learning_rate": 9.185787558196562e-07, "logits/chosen": -0.029964571818709373, "logits/rejected": 0.11130587756633759, "logps/chosen": -2.4313859939575195, "logps/rejected": -2.966317892074585, "loss": 0.6163, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.4313859939575195, "rewards/margins": 0.534932017326355, "rewards/rejected": -2.966317892074585, "sft_loss": 2.5397000312805176, "step": 1490 }, { "epoch": 0.8001338016390701, "grad_norm": 10.465880185412507, "learning_rate": 9.177248867780583e-07, "logits/chosen": 0.029187191277742386, "logits/rejected": 0.1466110199689865, "logps/chosen": -2.6569457054138184, "logps/rejected": -3.035051107406616, "loss": 0.6588, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.6569457054138184, "rewards/margins": 0.3781054615974426, "rewards/rejected": -3.035051107406616, "sft_loss": 2.8448221683502197, "step": 1495 }, { "epoch": 0.8028098344204716, "grad_norm": 10.168222832018843, "learning_rate": 9.168669649677769e-07, "logits/chosen": -0.03170633316040039, "logits/rejected": 0.09394307434558868, "logps/chosen": -2.513032913208008, "logps/rejected": -3.018311023712158, "loss": 0.6592, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.513032913208008, "rewards/margins": 0.5052781105041504, "rewards/rejected": -3.018311023712158, "sft_loss": 2.691277027130127, "step": 1500 }, { "epoch": 0.8054858672018732, "grad_norm": 10.59003170893267, "learning_rate": 9.16004998712373e-07, "logits/chosen": 0.04619743674993515, "logits/rejected": 0.13593070209026337, "logps/chosen": -2.49043607711792, "logps/rejected": -3.106337785720825, "loss": 0.5759, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.49043607711792, "rewards/margins": 0.6159020662307739, "rewards/rejected": -3.106337785720825, "sft_loss": 2.5797295570373535, "step": 1505 }, { "epoch": 0.8081618999832748, "grad_norm": 6.3951303624507565, "learning_rate": 9.151389963746472e-07, "logits/chosen": -0.03683237358927727, "logits/rejected": 0.29354020953178406, "logps/chosen": -2.4399712085723877, "logps/rejected": -3.1233441829681396, "loss": 0.548, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.4399712085723877, "rewards/margins": 0.6833727359771729, "rewards/rejected": -3.1233441829681396, "sft_loss": 2.5200114250183105, "step": 1510 }, { "epoch": 0.8108379327646764, "grad_norm": 8.83858247981277, "learning_rate": 9.142689663565577e-07, "logits/chosen": 0.06057562306523323, "logits/rejected": 0.1377892941236496, "logps/chosen": -2.4171109199523926, "logps/rejected": -3.0059268474578857, "loss": 0.5867, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.4171109199523926, "rewards/margins": 0.5888158082962036, "rewards/rejected": -3.0059268474578857, "sft_loss": 2.5347301959991455, "step": 1515 }, { "epoch": 0.8135139655460779, "grad_norm": 10.59050926766106, "learning_rate": 9.133949170991397e-07, "logits/chosen": 0.021306898444890976, "logits/rejected": 0.13196924328804016, "logps/chosen": -2.4130589962005615, "logps/rejected": -3.0544638633728027, "loss": 0.5634, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.4130589962005615, "rewards/margins": 0.6414052248001099, "rewards/rejected": -3.0544638633728027, "sft_loss": 2.662980794906616, "step": 1520 }, { "epoch": 0.8161899983274795, "grad_norm": 9.759469171962795, "learning_rate": 9.125168570824231e-07, "logits/chosen": -0.017821846529841423, "logits/rejected": 0.1999787837266922, "logps/chosen": -2.5519795417785645, "logps/rejected": -3.0707650184631348, "loss": 0.6221, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.5519795417785645, "rewards/margins": 0.5187851190567017, "rewards/rejected": -3.0707650184631348, "sft_loss": 2.6400365829467773, "step": 1525 }, { "epoch": 0.8188660311088811, "grad_norm": 10.399775137815965, "learning_rate": 9.116347948253496e-07, "logits/chosen": -0.026381874457001686, "logits/rejected": 0.1348962038755417, "logps/chosen": -2.6443264484405518, "logps/rejected": -3.1459546089172363, "loss": 0.6222, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.6443264484405518, "rewards/margins": 0.5016285181045532, "rewards/rejected": -3.1459546089172363, "sft_loss": 2.7468667030334473, "step": 1530 }, { "epoch": 0.8215420638902826, "grad_norm": 10.118881166792805, "learning_rate": 9.107487388856916e-07, "logits/chosen": -0.0058399587869644165, "logits/rejected": 0.21891054511070251, "logps/chosen": -2.5201256275177, "logps/rejected": -3.0925190448760986, "loss": 0.5805, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.5201256275177, "rewards/margins": 0.5723938345909119, "rewards/rejected": -3.0925190448760986, "sft_loss": 2.6686787605285645, "step": 1535 }, { "epoch": 0.8242180966716842, "grad_norm": 14.069021014111197, "learning_rate": 9.098586978599673e-07, "logits/chosen": 0.05855223536491394, "logits/rejected": 0.25015169382095337, "logps/chosen": -2.5722360610961914, "logps/rejected": -3.3856213092803955, "loss": 0.5534, "rewards/accuracies": 0.71875, "rewards/chosen": -2.5722360610961914, "rewards/margins": 0.8133853077888489, "rewards/rejected": -3.3856213092803955, "sft_loss": 2.6835830211639404, "step": 1540 }, { "epoch": 0.8268941294530858, "grad_norm": 8.476476232068334, "learning_rate": 9.089646803833588e-07, "logits/chosen": 0.06817345321178436, "logits/rejected": 0.2557623088359833, "logps/chosen": -2.548581838607788, "logps/rejected": -3.107095241546631, "loss": 0.6084, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.548581838607788, "rewards/margins": 0.558513343334198, "rewards/rejected": -3.107095241546631, "sft_loss": 2.7562756538391113, "step": 1545 }, { "epoch": 0.8295701622344873, "grad_norm": 10.063918914577242, "learning_rate": 9.080666951296276e-07, "logits/chosen": -0.09717416018247604, "logits/rejected": 0.22753658890724182, "logps/chosen": -2.5381972789764404, "logps/rejected": -3.340608596801758, "loss": 0.5353, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.5381972789764404, "rewards/margins": 0.8024111986160278, "rewards/rejected": -3.340608596801758, "sft_loss": 2.6446547508239746, "step": 1550 }, { "epoch": 0.8322461950158889, "grad_norm": 10.929038771965324, "learning_rate": 9.071647508110305e-07, "logits/chosen": -0.09855834394693375, "logits/rejected": 0.22255852818489075, "logps/chosen": -2.6067984104156494, "logps/rejected": -3.4245991706848145, "loss": 0.5868, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.6067984104156494, "rewards/margins": 0.8178008198738098, "rewards/rejected": -3.4245991706848145, "sft_loss": 2.7024035453796387, "step": 1555 }, { "epoch": 0.8349222277972905, "grad_norm": 10.825492368892702, "learning_rate": 9.062588561782354e-07, "logits/chosen": 0.045937247574329376, "logits/rejected": 0.1359720081090927, "logps/chosen": -2.725828170776367, "logps/rejected": -3.294826030731201, "loss": 0.6275, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.725828170776367, "rewards/margins": 0.5689979195594788, "rewards/rejected": -3.294826030731201, "sft_loss": 2.925290107727051, "step": 1560 }, { "epoch": 0.8375982605786921, "grad_norm": 8.38173658353747, "learning_rate": 9.053490200202358e-07, "logits/chosen": 0.056266169995069504, "logits/rejected": 0.1544162929058075, "logps/chosen": -2.730910062789917, "logps/rejected": -3.2816321849823, "loss": 0.6223, "rewards/accuracies": 0.71875, "rewards/chosen": -2.730910062789917, "rewards/margins": 0.5507221221923828, "rewards/rejected": -3.2816321849823, "sft_loss": 2.8730056285858154, "step": 1565 }, { "epoch": 0.8402742933600936, "grad_norm": 14.039872862864177, "learning_rate": 9.044352511642661e-07, "logits/chosen": 0.02374919317662716, "logits/rejected": 0.07336236536502838, "logps/chosen": -2.6218318939208984, "logps/rejected": -3.0729284286499023, "loss": 0.6783, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.6218318939208984, "rewards/margins": 0.45109647512435913, "rewards/rejected": -3.0729284286499023, "sft_loss": 2.8568921089172363, "step": 1570 }, { "epoch": 0.8429503261414952, "grad_norm": 8.102782982998859, "learning_rate": 9.03517558475716e-07, "logits/chosen": 0.02059529349207878, "logits/rejected": 0.1358802318572998, "logps/chosen": -2.311969518661499, "logps/rejected": -2.7871482372283936, "loss": 0.6045, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.311969518661499, "rewards/margins": 0.47517871856689453, "rewards/rejected": -2.7871482372283936, "sft_loss": 2.404184341430664, "step": 1575 }, { "epoch": 0.8456263589228968, "grad_norm": 10.945963523938573, "learning_rate": 9.025959508580436e-07, "logits/chosen": 0.05906160920858383, "logits/rejected": 0.3352259695529938, "logps/chosen": -2.40116548538208, "logps/rejected": -2.9874892234802246, "loss": 0.586, "rewards/accuracies": 0.71875, "rewards/chosen": -2.40116548538208, "rewards/margins": 0.586323618888855, "rewards/rejected": -2.9874892234802246, "sft_loss": 2.513141632080078, "step": 1580 }, { "epoch": 0.8483023917042983, "grad_norm": 7.322528639219694, "learning_rate": 9.016704372526905e-07, "logits/chosen": 0.016174791380763054, "logits/rejected": 0.21014456450939178, "logps/chosen": -2.2829861640930176, "logps/rejected": -2.883354663848877, "loss": 0.5812, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.2829861640930176, "rewards/margins": 0.6003685593605042, "rewards/rejected": -2.883354663848877, "sft_loss": 2.4132537841796875, "step": 1585 }, { "epoch": 0.8509784244856999, "grad_norm": 14.393486048168747, "learning_rate": 9.007410266389934e-07, "logits/chosen": -0.07246458530426025, "logits/rejected": 0.015409506857395172, "logps/chosen": -2.3226370811462402, "logps/rejected": -2.7810959815979004, "loss": 0.6211, "rewards/accuracies": 0.65625, "rewards/chosen": -2.3226370811462402, "rewards/margins": 0.4584590792655945, "rewards/rejected": -2.7810959815979004, "sft_loss": 2.4499218463897705, "step": 1590 }, { "epoch": 0.8536544572671015, "grad_norm": 13.456133600052352, "learning_rate": 8.998077280340981e-07, "logits/chosen": 0.018017996102571487, "logits/rejected": 0.09234277158975601, "logps/chosen": -2.4913840293884277, "logps/rejected": -2.8565590381622314, "loss": 0.6712, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.4913840293884277, "rewards/margins": 0.36517494916915894, "rewards/rejected": -2.8565590381622314, "sft_loss": 2.5182454586029053, "step": 1595 }, { "epoch": 0.8563304900485031, "grad_norm": 10.74731582976374, "learning_rate": 8.988705504928722e-07, "logits/chosen": -0.10593881458044052, "logits/rejected": 0.11520107090473175, "logps/chosen": -2.420809268951416, "logps/rejected": -3.160034418106079, "loss": 0.5405, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.420809268951416, "rewards/margins": 0.7392248511314392, "rewards/rejected": -3.160034418106079, "sft_loss": 2.5499579906463623, "step": 1600 }, { "epoch": 0.8563304900485031, "eval_logits/chosen": 0.3042474389076233, "eval_logits/rejected": 0.41216760873794556, "eval_logps/chosen": -2.415980100631714, "eval_logps/rejected": -3.020190477371216, "eval_loss": 0.578135073184967, "eval_rewards/accuracies": 0.7091988325119019, "eval_rewards/chosen": -2.415980100631714, "eval_rewards/margins": 0.6042105555534363, "eval_rewards/rejected": -3.020190477371216, "eval_runtime": 52.1108, "eval_samples_per_second": 25.81, "eval_sft_loss": 2.5442447662353516, "eval_steps_per_second": 6.467, "step": 1600 }, { "epoch": 0.8590065228299046, "grad_norm": 10.524729461130852, "learning_rate": 8.979295031078157e-07, "logits/chosen": -0.09900583326816559, "logits/rejected": 0.1608930081129074, "logps/chosen": -2.5220437049865723, "logps/rejected": -3.159421920776367, "loss": 0.5715, "rewards/accuracies": 0.71875, "rewards/chosen": -2.5220437049865723, "rewards/margins": 0.6373783946037292, "rewards/rejected": -3.159421920776367, "sft_loss": 2.5957248210906982, "step": 1605 }, { "epoch": 0.8616825556113062, "grad_norm": 10.311884692182046, "learning_rate": 8.969845950089751e-07, "logits/chosen": -0.10433954000473022, "logits/rejected": 0.09323252737522125, "logps/chosen": -2.5022635459899902, "logps/rejected": -3.2369015216827393, "loss": 0.5502, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.5022635459899902, "rewards/margins": 0.7346378564834595, "rewards/rejected": -3.2369015216827393, "sft_loss": 2.668539524078369, "step": 1610 }, { "epoch": 0.8643585883927078, "grad_norm": 13.504489740740828, "learning_rate": 8.960358353638526e-07, "logits/chosen": -0.04470493271946907, "logits/rejected": 0.09468583017587662, "logps/chosen": -2.56691312789917, "logps/rejected": -3.183248996734619, "loss": 0.6348, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.56691312789917, "rewards/margins": 0.6163356900215149, "rewards/rejected": -3.183248996734619, "sft_loss": 2.6765799522399902, "step": 1615 }, { "epoch": 0.8670346211741093, "grad_norm": 9.87102588828472, "learning_rate": 8.950832333773184e-07, "logits/chosen": -0.029353905469179153, "logits/rejected": 0.15252381563186646, "logps/chosen": -2.49654483795166, "logps/rejected": -3.065964937210083, "loss": 0.634, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.49654483795166, "rewards/margins": 0.5694200992584229, "rewards/rejected": -3.065964937210083, "sft_loss": 2.660219430923462, "step": 1620 }, { "epoch": 0.869710653955511, "grad_norm": 11.89297302689208, "learning_rate": 8.941267982915213e-07, "logits/chosen": 0.02455337718129158, "logits/rejected": 0.08116547763347626, "logps/chosen": -2.5555217266082764, "logps/rejected": -2.864656686782837, "loss": 0.7054, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.5555217266082764, "rewards/margins": 0.30913475155830383, "rewards/rejected": -2.864656686782837, "sft_loss": 2.5726044178009033, "step": 1625 }, { "epoch": 0.8723866867369126, "grad_norm": 8.067304083731935, "learning_rate": 8.931665393857983e-07, "logits/chosen": -0.003184753004461527, "logits/rejected": 0.15830065310001373, "logps/chosen": -2.301793336868286, "logps/rejected": -2.8337371349334717, "loss": 0.5993, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.301793336868286, "rewards/margins": 0.5319440364837646, "rewards/rejected": -2.8337371349334717, "sft_loss": 2.388474225997925, "step": 1630 }, { "epoch": 0.875062719518314, "grad_norm": 7.820656606757321, "learning_rate": 8.922024659765861e-07, "logits/chosen": -0.11717693507671356, "logits/rejected": 0.031198328360915184, "logps/chosen": -2.133420467376709, "logps/rejected": -2.7510008811950684, "loss": 0.5645, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.133420467376709, "rewards/margins": 0.6175805330276489, "rewards/rejected": -2.7510008811950684, "sft_loss": 2.2334976196289062, "step": 1635 }, { "epoch": 0.8777387522997157, "grad_norm": 8.589411461863673, "learning_rate": 8.912345874173288e-07, "logits/chosen": -0.07331643998622894, "logits/rejected": 0.06835556030273438, "logps/chosen": -2.201294183731079, "logps/rejected": -2.838127374649048, "loss": 0.5824, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.201294183731079, "rewards/margins": 0.6368333101272583, "rewards/rejected": -2.838127374649048, "sft_loss": 2.3204426765441895, "step": 1640 }, { "epoch": 0.8804147850811173, "grad_norm": 9.556488632247595, "learning_rate": 8.902629130983885e-07, "logits/chosen": -0.008688343688845634, "logits/rejected": 0.0492224246263504, "logps/chosen": -2.273496389389038, "logps/rejected": -2.669645309448242, "loss": 0.6292, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.273496389389038, "rewards/margins": 0.3961489200592041, "rewards/rejected": -2.669645309448242, "sft_loss": 2.380650281906128, "step": 1645 }, { "epoch": 0.8830908178625189, "grad_norm": 11.532073827027904, "learning_rate": 8.892874524469537e-07, "logits/chosen": 0.0749739557504654, "logits/rejected": 0.14436577260494232, "logps/chosen": -2.218055248260498, "logps/rejected": -2.7674765586853027, "loss": 0.5729, "rewards/accuracies": 0.71875, "rewards/chosen": -2.218055248260498, "rewards/margins": 0.5494211912155151, "rewards/rejected": -2.7674765586853027, "sft_loss": 2.2503247261047363, "step": 1650 }, { "epoch": 0.8857668506439204, "grad_norm": 11.091732222379273, "learning_rate": 8.883082149269478e-07, "logits/chosen": -0.04484002664685249, "logits/rejected": 0.0897749662399292, "logps/chosen": -2.3275647163391113, "logps/rejected": -2.8501715660095215, "loss": 0.5955, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.3275647163391113, "rewards/margins": 0.5226072072982788, "rewards/rejected": -2.8501715660095215, "sft_loss": 2.369812488555908, "step": 1655 }, { "epoch": 0.888442883425322, "grad_norm": 9.42136788562333, "learning_rate": 8.873252100389377e-07, "logits/chosen": -0.004197058267891407, "logits/rejected": 0.049018494784832, "logps/chosen": -2.1921660900115967, "logps/rejected": -2.7342629432678223, "loss": 0.5732, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.1921660900115967, "rewards/margins": 0.5420966744422913, "rewards/rejected": -2.7342629432678223, "sft_loss": 2.228539228439331, "step": 1660 }, { "epoch": 0.8911189162067236, "grad_norm": 11.62999518562216, "learning_rate": 8.863384473200411e-07, "logits/chosen": 0.006611555814743042, "logits/rejected": 0.07799387723207474, "logps/chosen": -2.5068271160125732, "logps/rejected": -2.906339168548584, "loss": 0.6524, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.5068271160125732, "rewards/margins": 0.3995123505592346, "rewards/rejected": -2.906339168548584, "sft_loss": 2.572601795196533, "step": 1665 }, { "epoch": 0.8937949489881251, "grad_norm": 9.897573518256243, "learning_rate": 8.853479363438342e-07, "logits/chosen": 0.030647706240415573, "logits/rejected": 0.21590964496135712, "logps/chosen": -2.4557127952575684, "logps/rejected": -2.8481664657592773, "loss": 0.6855, "rewards/accuracies": 0.625, "rewards/chosen": -2.4557127952575684, "rewards/margins": 0.39245349168777466, "rewards/rejected": -2.8481664657592773, "sft_loss": 2.4239420890808105, "step": 1670 }, { "epoch": 0.8964709817695267, "grad_norm": 9.069166113077122, "learning_rate": 8.843536867202588e-07, "logits/chosen": 0.002661994192749262, "logits/rejected": 0.2346363514661789, "logps/chosen": -2.4013454914093018, "logps/rejected": -3.0166726112365723, "loss": 0.5964, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.4013454914093018, "rewards/margins": 0.6153267621994019, "rewards/rejected": -3.0166726112365723, "sft_loss": 2.511767864227295, "step": 1675 }, { "epoch": 0.8991470145509283, "grad_norm": 10.264602842040526, "learning_rate": 8.833557080955292e-07, "logits/chosen": -0.10588578134775162, "logits/rejected": 0.02353302203118801, "logps/chosen": -2.397885322570801, "logps/rejected": -2.8403050899505615, "loss": 0.6325, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.397885322570801, "rewards/margins": 0.4424198269844055, "rewards/rejected": -2.8403050899505615, "sft_loss": 2.5135016441345215, "step": 1680 }, { "epoch": 0.9018230473323299, "grad_norm": 8.803254226319224, "learning_rate": 8.823540101520381e-07, "logits/chosen": -0.04913238063454628, "logits/rejected": 0.19184985756874084, "logps/chosen": -2.2383077144622803, "logps/rejected": -2.8146414756774902, "loss": 0.6112, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.2383077144622803, "rewards/margins": 0.5763339400291443, "rewards/rejected": -2.8146414756774902, "sft_loss": 2.3560469150543213, "step": 1685 }, { "epoch": 0.9044990801137314, "grad_norm": 8.00691293957899, "learning_rate": 8.813486026082637e-07, "logits/chosen": -0.04420467093586922, "logits/rejected": 0.1645149141550064, "logps/chosen": -2.1973953247070312, "logps/rejected": -2.8400979042053223, "loss": 0.5517, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.1973953247070312, "rewards/margins": 0.6427024602890015, "rewards/rejected": -2.8400979042053223, "sft_loss": 2.3609063625335693, "step": 1690 }, { "epoch": 0.907175112895133, "grad_norm": 13.186135400387222, "learning_rate": 8.803394952186742e-07, "logits/chosen": -0.20445886254310608, "logits/rejected": -0.030445415526628494, "logps/chosen": -2.309983491897583, "logps/rejected": -2.8405935764312744, "loss": 0.5976, "rewards/accuracies": 0.71875, "rewards/chosen": -2.309983491897583, "rewards/margins": 0.5306099653244019, "rewards/rejected": -2.8405935764312744, "sft_loss": 2.475982189178467, "step": 1695 }, { "epoch": 0.9098511456765346, "grad_norm": 11.275274668930086, "learning_rate": 8.793266977736342e-07, "logits/chosen": 0.0015526011120527983, "logits/rejected": -0.050358422100543976, "logps/chosen": -2.424626111984253, "logps/rejected": -2.747622489929199, "loss": 0.655, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.424626111984253, "rewards/margins": 0.3229961097240448, "rewards/rejected": -2.747622489929199, "sft_loss": 2.554354190826416, "step": 1700 }, { "epoch": 0.9125271784579361, "grad_norm": 13.117118001873978, "learning_rate": 8.783102200993085e-07, "logits/chosen": -0.037245552986860275, "logits/rejected": 0.11655652523040771, "logps/chosen": -2.402632236480713, "logps/rejected": -3.000462293624878, "loss": 0.5785, "rewards/accuracies": 0.71875, "rewards/chosen": -2.402632236480713, "rewards/margins": 0.5978304743766785, "rewards/rejected": -3.000462293624878, "sft_loss": 2.527595043182373, "step": 1705 }, { "epoch": 0.9152032112393377, "grad_norm": 9.46908294460337, "learning_rate": 8.772900720575683e-07, "logits/chosen": -0.05901443958282471, "logits/rejected": 0.04909048229455948, "logps/chosen": -2.5413331985473633, "logps/rejected": -2.999572992324829, "loss": 0.6188, "rewards/accuracies": 0.6875, "rewards/chosen": -2.5413331985473633, "rewards/margins": 0.45823970437049866, "rewards/rejected": -2.999572992324829, "sft_loss": 2.6973278522491455, "step": 1710 }, { "epoch": 0.9178792440207393, "grad_norm": 12.101459296969413, "learning_rate": 8.762662635458944e-07, "logits/chosen": -0.07247239351272583, "logits/rejected": 0.1388850212097168, "logps/chosen": -2.650653123855591, "logps/rejected": -3.1614272594451904, "loss": 0.6523, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.650653123855591, "rewards/margins": 0.5107744932174683, "rewards/rejected": -3.1614272594451904, "sft_loss": 2.730468988418579, "step": 1715 }, { "epoch": 0.9205552768021408, "grad_norm": 11.961155999673137, "learning_rate": 8.752388044972811e-07, "logits/chosen": -0.05106500908732414, "logits/rejected": 0.0330757237970829, "logps/chosen": -2.454437732696533, "logps/rejected": -3.1570143699645996, "loss": 0.565, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.454437732696533, "rewards/margins": 0.7025768160820007, "rewards/rejected": -3.1570143699645996, "sft_loss": 2.619330644607544, "step": 1720 }, { "epoch": 0.9232313095835424, "grad_norm": 8.617608037704702, "learning_rate": 8.74207704880141e-07, "logits/chosen": -0.022454053163528442, "logits/rejected": 0.0959378033876419, "logps/chosen": -2.598407745361328, "logps/rejected": -3.4198384284973145, "loss": 0.5128, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.598407745361328, "rewards/margins": 0.8214303851127625, "rewards/rejected": -3.4198384284973145, "sft_loss": 2.8084254264831543, "step": 1725 }, { "epoch": 0.925907342364944, "grad_norm": 10.519713288799583, "learning_rate": 8.731729746982068e-07, "logits/chosen": 0.01740475744009018, "logits/rejected": 0.10223283618688583, "logps/chosen": -2.5419623851776123, "logps/rejected": -3.128298282623291, "loss": 0.5918, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5419623851776123, "rewards/margins": 0.5863358378410339, "rewards/rejected": -3.128298282623291, "sft_loss": 2.7975895404815674, "step": 1730 }, { "epoch": 0.9285833751463456, "grad_norm": 10.455643617501638, "learning_rate": 8.721346239904355e-07, "logits/chosen": -0.16528668999671936, "logits/rejected": 0.045746032148599625, "logps/chosen": -2.6919751167297363, "logps/rejected": -3.503297805786133, "loss": 0.5759, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.6919751167297363, "rewards/margins": 0.8113226890563965, "rewards/rejected": -3.503297805786133, "sft_loss": 2.817317247390747, "step": 1735 }, { "epoch": 0.9312594079277471, "grad_norm": 10.852266542046127, "learning_rate": 8.710926628309101e-07, "logits/chosen": -0.10590909421443939, "logits/rejected": 0.08133234828710556, "logps/chosen": -2.810664653778076, "logps/rejected": -3.4224929809570312, "loss": 0.578, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.810664653778076, "rewards/margins": 0.6118277311325073, "rewards/rejected": -3.4224929809570312, "sft_loss": 2.9222919940948486, "step": 1740 }, { "epoch": 0.9339354407091487, "grad_norm": 7.61006798221914, "learning_rate": 8.700471013287424e-07, "logits/chosen": 0.021407146006822586, "logits/rejected": 0.06234606355428696, "logps/chosen": -2.5792348384857178, "logps/rejected": -3.1805102825164795, "loss": 0.5893, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.5792348384857178, "rewards/margins": 0.6012751460075378, "rewards/rejected": -3.1805102825164795, "sft_loss": 2.7608916759490967, "step": 1745 }, { "epoch": 0.9366114734905503, "grad_norm": 14.860283835159844, "learning_rate": 8.689979496279746e-07, "logits/chosen": -0.027105966582894325, "logits/rejected": 0.03506525978446007, "logps/chosen": -2.8316097259521484, "logps/rejected": -3.2912864685058594, "loss": 0.6747, "rewards/accuracies": 0.625, "rewards/chosen": -2.8316097259521484, "rewards/margins": 0.45967674255371094, "rewards/rejected": -3.2912864685058594, "sft_loss": 2.979743480682373, "step": 1750 }, { "epoch": 0.9392875062719518, "grad_norm": 10.10079478152837, "learning_rate": 8.679452179074811e-07, "logits/chosen": -0.028563061729073524, "logits/rejected": 0.10825793445110321, "logps/chosen": -2.69927716255188, "logps/rejected": -3.3682494163513184, "loss": 0.5521, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.69927716255188, "rewards/margins": 0.6689725518226624, "rewards/rejected": -3.3682494163513184, "sft_loss": 2.9125723838806152, "step": 1755 }, { "epoch": 0.9419635390533534, "grad_norm": 13.235932611878656, "learning_rate": 8.668889163808698e-07, "logits/chosen": -0.03027569130063057, "logits/rejected": 0.12222941219806671, "logps/chosen": -2.6722209453582764, "logps/rejected": -3.266571521759033, "loss": 0.5746, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.6722209453582764, "rewards/margins": 0.5943503379821777, "rewards/rejected": -3.266571521759033, "sft_loss": 2.8436102867126465, "step": 1760 }, { "epoch": 0.944639571834755, "grad_norm": 11.58412404023222, "learning_rate": 8.658290552963827e-07, "logits/chosen": 0.03251715749502182, "logits/rejected": 0.07301442325115204, "logps/chosen": -2.695906162261963, "logps/rejected": -3.2821033000946045, "loss": 0.6197, "rewards/accuracies": 0.6875, "rewards/chosen": -2.695906162261963, "rewards/margins": 0.5861972570419312, "rewards/rejected": -3.2821033000946045, "sft_loss": 2.8826968669891357, "step": 1765 }, { "epoch": 0.9473156046161565, "grad_norm": 10.649089680608327, "learning_rate": 8.647656449367966e-07, "logits/chosen": 0.020825685933232307, "logits/rejected": 0.19587847590446472, "logps/chosen": -2.665592908859253, "logps/rejected": -3.1587936878204346, "loss": 0.6363, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.665592908859253, "rewards/margins": 0.49320077896118164, "rewards/rejected": -3.1587936878204346, "sft_loss": 2.8860023021698, "step": 1770 }, { "epoch": 0.9499916373975581, "grad_norm": 9.615398550070553, "learning_rate": 8.636986956193235e-07, "logits/chosen": -0.04956268146634102, "logits/rejected": 0.08047482371330261, "logps/chosen": -2.4707024097442627, "logps/rejected": -3.0871739387512207, "loss": 0.584, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.4707024097442627, "rewards/margins": 0.6164714694023132, "rewards/rejected": -3.0871739387512207, "sft_loss": 2.6601004600524902, "step": 1775 }, { "epoch": 0.9526676701789597, "grad_norm": 9.694860492684317, "learning_rate": 8.626282176955104e-07, "logits/chosen": -0.030627835541963577, "logits/rejected": 0.10882334411144257, "logps/chosen": -2.3581511974334717, "logps/rejected": -2.9486238956451416, "loss": 0.5797, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.3581511974334717, "rewards/margins": 0.5904725193977356, "rewards/rejected": -2.9486238956451416, "sft_loss": 2.451658248901367, "step": 1780 }, { "epoch": 0.9553437029603613, "grad_norm": 14.376924824437532, "learning_rate": 8.615542215511389e-07, "logits/chosen": 0.03919892758131027, "logits/rejected": 0.11939878761768341, "logps/chosen": -2.399656295776367, "logps/rejected": -2.767712116241455, "loss": 0.6465, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.399656295776367, "rewards/margins": 0.3680557310581207, "rewards/rejected": -2.767712116241455, "sft_loss": 2.459564208984375, "step": 1785 }, { "epoch": 0.9580197357417628, "grad_norm": 11.271406982128253, "learning_rate": 8.604767176061241e-07, "logits/chosen": 0.10714355856180191, "logits/rejected": 0.19515064358711243, "logps/chosen": -2.5030503273010254, "logps/rejected": -2.955298900604248, "loss": 0.615, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.5030503273010254, "rewards/margins": 0.4522481858730316, "rewards/rejected": -2.955298900604248, "sft_loss": 2.6175243854522705, "step": 1790 }, { "epoch": 0.9606957685231644, "grad_norm": 7.772687838765346, "learning_rate": 8.593957163144141e-07, "logits/chosen": -0.03184313327074051, "logits/rejected": 0.1284581869840622, "logps/chosen": -2.3338072299957275, "logps/rejected": -2.968812942504883, "loss": 0.5771, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.3338072299957275, "rewards/margins": 0.6350058913230896, "rewards/rejected": -2.968812942504883, "sft_loss": 2.5412683486938477, "step": 1795 }, { "epoch": 0.963371801304566, "grad_norm": 8.865298487683116, "learning_rate": 8.58311228163888e-07, "logits/chosen": -0.03143691271543503, "logits/rejected": 0.06218884512782097, "logps/chosen": -2.478829860687256, "logps/rejected": -2.9742467403411865, "loss": 0.6032, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.478829860687256, "rewards/margins": 0.4954164922237396, "rewards/rejected": -2.9742467403411865, "sft_loss": 2.5903961658477783, "step": 1800 }, { "epoch": 0.9660478340859675, "grad_norm": 11.02813751654619, "learning_rate": 8.57223263676255e-07, "logits/chosen": -0.13884618878364563, "logits/rejected": 0.01260617095977068, "logps/chosen": -2.3413610458374023, "logps/rejected": -3.1076226234436035, "loss": 0.5275, "rewards/accuracies": 0.75, "rewards/chosen": -2.3413610458374023, "rewards/margins": 0.7662616968154907, "rewards/rejected": -3.1076226234436035, "sft_loss": 2.465789318084717, "step": 1805 }, { "epoch": 0.9687238668673691, "grad_norm": 9.841681672444608, "learning_rate": 8.561318334069511e-07, "logits/chosen": -0.0021059750579297543, "logits/rejected": 0.15467438101768494, "logps/chosen": -2.4257614612579346, "logps/rejected": -3.022879123687744, "loss": 0.573, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.4257614612579346, "rewards/margins": 0.5971179008483887, "rewards/rejected": -3.022879123687744, "sft_loss": 2.515134811401367, "step": 1810 }, { "epoch": 0.9713998996487707, "grad_norm": 9.870310295591283, "learning_rate": 8.550369479450375e-07, "logits/chosen": -0.04107099026441574, "logits/rejected": 0.12032093107700348, "logps/chosen": -2.587172031402588, "logps/rejected": -3.255514144897461, "loss": 0.5651, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.587172031402588, "rewards/margins": 0.6683421730995178, "rewards/rejected": -3.255514144897461, "sft_loss": 2.731900453567505, "step": 1815 }, { "epoch": 0.9740759324301723, "grad_norm": 13.783549029664979, "learning_rate": 8.539386179130977e-07, "logits/chosen": 0.013371935114264488, "logits/rejected": 0.07807054370641708, "logps/chosen": -2.649529218673706, "logps/rejected": -3.255265712738037, "loss": 0.5839, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.649529218673706, "rewards/margins": 0.6057366728782654, "rewards/rejected": -3.255265712738037, "sft_loss": 2.7066922187805176, "step": 1820 }, { "epoch": 0.9767519652115738, "grad_norm": 10.932540143596936, "learning_rate": 8.528368539671347e-07, "logits/chosen": -0.08376463502645493, "logits/rejected": 0.10879947245121002, "logps/chosen": -2.476346492767334, "logps/rejected": -3.3493971824645996, "loss": 0.5491, "rewards/accuracies": 0.71875, "rewards/chosen": -2.476346492767334, "rewards/margins": 0.8730506896972656, "rewards/rejected": -3.3493971824645996, "sft_loss": 2.656287908554077, "step": 1825 }, { "epoch": 0.9794279979929754, "grad_norm": 11.332806366276783, "learning_rate": 8.51731666796467e-07, "logits/chosen": 0.06971029192209244, "logits/rejected": 0.1344754993915558, "logps/chosen": -2.857238292694092, "logps/rejected": -3.4709160327911377, "loss": 0.5962, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.857238292694092, "rewards/margins": 0.613677442073822, "rewards/rejected": -3.4709160327911377, "sft_loss": 2.995389461517334, "step": 1830 }, { "epoch": 0.982104030774377, "grad_norm": 12.236061486827184, "learning_rate": 8.506230671236254e-07, "logits/chosen": -0.05031920596957207, "logits/rejected": 0.04754648730158806, "logps/chosen": -2.865511417388916, "logps/rejected": -3.368098497390747, "loss": 0.6121, "rewards/accuracies": 0.6875, "rewards/chosen": -2.865511417388916, "rewards/margins": 0.5025866627693176, "rewards/rejected": -3.368098497390747, "sft_loss": 3.0100932121276855, "step": 1835 }, { "epoch": 0.9847800635557785, "grad_norm": 10.42805969789397, "learning_rate": 8.495110657042488e-07, "logits/chosen": 0.007626816630363464, "logits/rejected": 0.1920599341392517, "logps/chosen": -3.0169951915740967, "logps/rejected": -3.7608039379119873, "loss": 0.5544, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.0169951915740967, "rewards/margins": 0.7438088655471802, "rewards/rejected": -3.7608039379119873, "sft_loss": 3.2395145893096924, "step": 1840 }, { "epoch": 0.9874560963371801, "grad_norm": 18.67805455718663, "learning_rate": 8.483956733269799e-07, "logits/chosen": -0.031309835612773895, "logits/rejected": 0.08653328567743301, "logps/chosen": -3.035778045654297, "logps/rejected": -3.7503821849823, "loss": 0.5996, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.035778045654297, "rewards/margins": 0.7146042585372925, "rewards/rejected": -3.7503821849823, "sft_loss": 3.326650619506836, "step": 1845 }, { "epoch": 0.9901321291185817, "grad_norm": 15.850563785858194, "learning_rate": 8.472769008133602e-07, "logits/chosen": -0.18987888097763062, "logits/rejected": -0.025769507512450218, "logps/chosen": -3.2544639110565186, "logps/rejected": -3.8407649993896484, "loss": 0.6206, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.2544639110565186, "rewards/margins": 0.5863012671470642, "rewards/rejected": -3.8407649993896484, "sft_loss": 3.4116978645324707, "step": 1850 }, { "epoch": 0.9928081618999832, "grad_norm": 13.542824139028458, "learning_rate": 8.461547590177259e-07, "logits/chosen": -0.07165570557117462, "logits/rejected": 0.09050522744655609, "logps/chosen": -3.0256094932556152, "logps/rejected": -3.739541530609131, "loss": 0.6175, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -3.0256094932556152, "rewards/margins": 0.7139323353767395, "rewards/rejected": -3.739541530609131, "sft_loss": 3.331972599029541, "step": 1855 }, { "epoch": 0.9954841946813848, "grad_norm": 14.004127538179633, "learning_rate": 8.450292588271014e-07, "logits/chosen": -0.059675056487321854, "logits/rejected": 0.08022672683000565, "logps/chosen": -3.128673791885376, "logps/rejected": -3.766014814376831, "loss": 0.5919, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.128673791885376, "rewards/margins": 0.6373409032821655, "rewards/rejected": -3.766014814376831, "sft_loss": 3.2620816230773926, "step": 1860 }, { "epoch": 0.9981602274627864, "grad_norm": 13.526350959371076, "learning_rate": 8.439004111610945e-07, "logits/chosen": -0.06711464375257492, "logits/rejected": 0.0229620523750782, "logps/chosen": -2.7440898418426514, "logps/rejected": -3.351423978805542, "loss": 0.6055, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.7440898418426514, "rewards/margins": 0.6073340773582458, "rewards/rejected": -3.351423978805542, "sft_loss": 2.9087913036346436, "step": 1865 }, { "epoch": 1.000836260244188, "grad_norm": 11.344183671884279, "learning_rate": 8.427682269717901e-07, "logits/chosen": -0.10571829974651337, "logits/rejected": 0.05615962669253349, "logps/chosen": -2.7725422382354736, "logps/rejected": -3.4601359367370605, "loss": 0.5566, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.7725422382354736, "rewards/margins": 0.6875935792922974, "rewards/rejected": -3.4601359367370605, "sft_loss": 2.911839246749878, "step": 1870 }, { "epoch": 1.0035122930255895, "grad_norm": 11.766277223905877, "learning_rate": 8.416327172436446e-07, "logits/chosen": -0.16360296308994293, "logits/rejected": -0.006053599528968334, "logps/chosen": -2.719698667526245, "logps/rejected": -3.208998441696167, "loss": 0.6163, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.719698667526245, "rewards/margins": 0.48930010199546814, "rewards/rejected": -3.208998441696167, "sft_loss": 2.786731719970703, "step": 1875 }, { "epoch": 1.0061883258069912, "grad_norm": 11.76137503676261, "learning_rate": 8.404938929933778e-07, "logits/chosen": -0.015264851041138172, "logits/rejected": 0.15779271721839905, "logps/chosen": -2.614290952682495, "logps/rejected": -3.4814743995666504, "loss": 0.515, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.614290952682495, "rewards/margins": 0.8671833276748657, "rewards/rejected": -3.4814743995666504, "sft_loss": 2.8025989532470703, "step": 1880 }, { "epoch": 1.0088643585883927, "grad_norm": 9.733281383374747, "learning_rate": 8.39351765269868e-07, "logits/chosen": -0.08949162811040878, "logits/rejected": 0.004027103073894978, "logps/chosen": -2.561148166656494, "logps/rejected": -3.174879789352417, "loss": 0.6062, "rewards/accuracies": 0.6875, "rewards/chosen": -2.561148166656494, "rewards/margins": 0.6137315034866333, "rewards/rejected": -3.174879789352417, "sft_loss": 2.714580774307251, "step": 1885 }, { "epoch": 1.0115403913697942, "grad_norm": 13.196842268820903, "learning_rate": 8.382063451540431e-07, "logits/chosen": -0.08461443334817886, "logits/rejected": 0.16043774783611298, "logps/chosen": -2.725468397140503, "logps/rejected": -3.4064719676971436, "loss": 0.5576, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.725468397140503, "rewards/margins": 0.6810039281845093, "rewards/rejected": -3.4064719676971436, "sft_loss": 2.99931001663208, "step": 1890 }, { "epoch": 1.014216424151196, "grad_norm": 10.52990082304153, "learning_rate": 8.370576437587742e-07, "logits/chosen": -0.03379444777965546, "logits/rejected": 0.010876533575356007, "logps/chosen": -2.713357448577881, "logps/rejected": -3.2912967205047607, "loss": 0.5837, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.713357448577881, "rewards/margins": 0.5779389142990112, "rewards/rejected": -3.2912967205047607, "sft_loss": 2.830855369567871, "step": 1895 }, { "epoch": 1.0168924569325974, "grad_norm": 9.671101865530032, "learning_rate": 8.359056722287674e-07, "logits/chosen": -0.1582552045583725, "logits/rejected": 0.1438843011856079, "logps/chosen": -2.714686870574951, "logps/rejected": -3.3958747386932373, "loss": 0.5765, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.714686870574951, "rewards/margins": 0.6811872720718384, "rewards/rejected": -3.3958747386932373, "sft_loss": 2.9313085079193115, "step": 1900 }, { "epoch": 1.019568489713999, "grad_norm": 8.817897366753009, "learning_rate": 8.347504417404553e-07, "logits/chosen": -0.03570292145013809, "logits/rejected": 0.1323067843914032, "logps/chosen": -2.7149858474731445, "logps/rejected": -3.3264877796173096, "loss": 0.6033, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.7149858474731445, "rewards/margins": 0.6115021109580994, "rewards/rejected": -3.3264877796173096, "sft_loss": 2.8391032218933105, "step": 1905 }, { "epoch": 1.0222445224954007, "grad_norm": 9.514633202698919, "learning_rate": 8.335919635018893e-07, "logits/chosen": -0.16271661221981049, "logits/rejected": -0.003299406263977289, "logps/chosen": -2.598848581314087, "logps/rejected": -3.1364142894744873, "loss": 0.5932, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.598848581314087, "rewards/margins": 0.5375655889511108, "rewards/rejected": -3.1364142894744873, "sft_loss": 2.7851195335388184, "step": 1910 }, { "epoch": 1.0249205552768021, "grad_norm": 9.029115764201192, "learning_rate": 8.324302487526303e-07, "logits/chosen": -0.09520978480577469, "logits/rejected": 0.021425556391477585, "logps/chosen": -2.681286334991455, "logps/rejected": -3.2381675243377686, "loss": 0.5917, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.681286334991455, "rewards/margins": 0.5568810701370239, "rewards/rejected": -3.2381675243377686, "sft_loss": 2.8256735801696777, "step": 1915 }, { "epoch": 1.0275965880582036, "grad_norm": 7.915959995679739, "learning_rate": 8.312653087636398e-07, "logits/chosen": -0.12838247418403625, "logits/rejected": -0.032163430005311966, "logps/chosen": -2.5076041221618652, "logps/rejected": -3.203805923461914, "loss": 0.5654, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.5076041221618652, "rewards/margins": 0.6962019205093384, "rewards/rejected": -3.203805923461914, "sft_loss": 2.7093334197998047, "step": 1920 }, { "epoch": 1.0302726208396054, "grad_norm": 12.434460694472266, "learning_rate": 8.300971548371711e-07, "logits/chosen": -0.21484927833080292, "logits/rejected": 0.02849237620830536, "logps/chosen": -2.7233753204345703, "logps/rejected": -3.3476593494415283, "loss": 0.5705, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.7233753204345703, "rewards/margins": 0.624284029006958, "rewards/rejected": -3.3476593494415283, "sft_loss": 2.8742966651916504, "step": 1925 }, { "epoch": 1.0329486536210069, "grad_norm": 13.463123545165725, "learning_rate": 8.289257983066582e-07, "logits/chosen": -0.14655368030071259, "logits/rejected": 0.009099148213863373, "logps/chosen": -2.6282095909118652, "logps/rejected": -3.3094677925109863, "loss": 0.555, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.6282095909118652, "rewards/margins": 0.6812580227851868, "rewards/rejected": -3.3094677925109863, "sft_loss": 2.856292486190796, "step": 1930 }, { "epoch": 1.0356246864024083, "grad_norm": 15.604488504855244, "learning_rate": 8.277512505366077e-07, "logits/chosen": -0.18012908101081848, "logits/rejected": 0.05511124059557915, "logps/chosen": -2.7234902381896973, "logps/rejected": -3.4398033618927, "loss": 0.5666, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.7234902381896973, "rewards/margins": 0.7163132429122925, "rewards/rejected": -3.4398033618927, "sft_loss": 2.827632427215576, "step": 1935 }, { "epoch": 1.03830071918381, "grad_norm": 9.738078522357188, "learning_rate": 8.265735229224868e-07, "logits/chosen": -0.10542136430740356, "logits/rejected": 0.033803604543209076, "logps/chosen": -2.6598103046417236, "logps/rejected": -3.473357677459717, "loss": 0.5437, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.6598103046417236, "rewards/margins": 0.8135476112365723, "rewards/rejected": -3.473357677459717, "sft_loss": 2.6809260845184326, "step": 1940 }, { "epoch": 1.0409767519652116, "grad_norm": 7.820147815952518, "learning_rate": 8.253926268906144e-07, "logits/chosen": -0.19088909029960632, "logits/rejected": -0.005916637368500233, "logps/chosen": -2.7106292247772217, "logps/rejected": -3.611953020095825, "loss": 0.5106, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.7106292247772217, "rewards/margins": 0.9013236165046692, "rewards/rejected": -3.611953020095825, "sft_loss": 2.822298526763916, "step": 1945 }, { "epoch": 1.043652784746613, "grad_norm": 11.916728732236763, "learning_rate": 8.242085738980487e-07, "logits/chosen": -0.09535542875528336, "logits/rejected": 0.16036078333854675, "logps/chosen": -2.753713369369507, "logps/rejected": -3.4758613109588623, "loss": 0.5628, "rewards/accuracies": 0.75, "rewards/chosen": -2.753713369369507, "rewards/margins": 0.7221483588218689, "rewards/rejected": -3.4758613109588623, "sft_loss": 2.8452465534210205, "step": 1950 }, { "epoch": 1.0463288175280148, "grad_norm": 12.036744464835651, "learning_rate": 8.230213754324772e-07, "logits/chosen": -0.12327710539102554, "logits/rejected": -0.037312399595975876, "logps/chosen": -2.7057414054870605, "logps/rejected": -3.308201313018799, "loss": 0.5719, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7057414054870605, "rewards/margins": 0.602459728717804, "rewards/rejected": -3.308201313018799, "sft_loss": 2.8045592308044434, "step": 1955 }, { "epoch": 1.0490048503094163, "grad_norm": 11.974887547568208, "learning_rate": 8.218310430121045e-07, "logits/chosen": -0.10618670284748077, "logits/rejected": -0.06071647256612778, "logps/chosen": -2.6146535873413086, "logps/rejected": -3.202754259109497, "loss": 0.5934, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.6146535873413086, "rewards/margins": 0.5881003737449646, "rewards/rejected": -3.202754259109497, "sft_loss": 2.719392776489258, "step": 1960 }, { "epoch": 1.051680883090818, "grad_norm": 9.87186260378657, "learning_rate": 8.20637588185541e-07, "logits/chosen": -0.06013760715723038, "logits/rejected": 0.028408635407686234, "logps/chosen": -2.637633800506592, "logps/rejected": -3.5640335083007812, "loss": 0.5077, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.637633800506592, "rewards/margins": 0.926399827003479, "rewards/rejected": -3.5640335083007812, "sft_loss": 2.814918041229248, "step": 1965 }, { "epoch": 1.0543569158722195, "grad_norm": 8.711974662345696, "learning_rate": 8.194410225316906e-07, "logits/chosen": -0.13953541219234467, "logits/rejected": 0.04353134706616402, "logps/chosen": -2.487884521484375, "logps/rejected": -3.113614320755005, "loss": 0.5679, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.487884521484375, "rewards/margins": 0.6257299780845642, "rewards/rejected": -3.113614320755005, "sft_loss": 2.620357036590576, "step": 1970 }, { "epoch": 1.057032948653621, "grad_norm": 11.306716862032234, "learning_rate": 8.182413576596385e-07, "logits/chosen": 0.0013474032748490572, "logits/rejected": 0.08792434632778168, "logps/chosen": -2.478005886077881, "logps/rejected": -3.1275486946105957, "loss": 0.5732, "rewards/accuracies": 0.6875, "rewards/chosen": -2.478005886077881, "rewards/margins": 0.6495428085327148, "rewards/rejected": -3.1275486946105957, "sft_loss": 2.6386067867279053, "step": 1975 }, { "epoch": 1.0597089814350227, "grad_norm": 11.784456427703102, "learning_rate": 8.170386052085389e-07, "logits/chosen": 0.014350226148962975, "logits/rejected": 0.14056164026260376, "logps/chosen": -2.672428607940674, "logps/rejected": -3.3221981525421143, "loss": 0.6038, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.672428607940674, "rewards/margins": 0.6497694253921509, "rewards/rejected": -3.3221981525421143, "sft_loss": 2.784529685974121, "step": 1980 }, { "epoch": 1.0623850142164242, "grad_norm": 10.737430694761672, "learning_rate": 8.158327768475008e-07, "logits/chosen": -0.05563334748148918, "logits/rejected": 0.1250276118516922, "logps/chosen": -2.6196036338806152, "logps/rejected": -3.1013073921203613, "loss": 0.6358, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.6196036338806152, "rewards/margins": 0.48170357942581177, "rewards/rejected": -3.1013073921203613, "sft_loss": 2.7226316928863525, "step": 1985 }, { "epoch": 1.0650610469978257, "grad_norm": 12.96106628879117, "learning_rate": 8.146238842754767e-07, "logits/chosen": -0.1020413488149643, "logits/rejected": 0.020089680328965187, "logps/chosen": -2.629999876022339, "logps/rejected": -3.177628993988037, "loss": 0.5901, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.629999876022339, "rewards/margins": 0.5476290583610535, "rewards/rejected": -3.177628993988037, "sft_loss": 2.7194972038269043, "step": 1990 }, { "epoch": 1.0677370797792274, "grad_norm": 13.03319722387789, "learning_rate": 8.134119392211476e-07, "logits/chosen": 0.02129880152642727, "logits/rejected": 0.20938155055046082, "logps/chosen": -2.5209169387817383, "logps/rejected": -3.362943172454834, "loss": 0.5345, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5209169387817383, "rewards/margins": 0.8420262336730957, "rewards/rejected": -3.362943172454834, "sft_loss": 2.6732523441314697, "step": 1995 }, { "epoch": 1.0704131125606289, "grad_norm": 16.541188511965675, "learning_rate": 8.121969534428094e-07, "logits/chosen": -0.11407023668289185, "logits/rejected": 0.07046560198068619, "logps/chosen": -2.650700807571411, "logps/rejected": -3.288252592086792, "loss": 0.6195, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.650700807571411, "rewards/margins": 0.6375521421432495, "rewards/rejected": -3.288252592086792, "sft_loss": 2.7548530101776123, "step": 2000 }, { "epoch": 1.0704131125606289, "eval_logits/chosen": 0.3371480703353882, "eval_logits/rejected": 0.45726171135902405, "eval_logps/chosen": -2.545128107070923, "eval_logps/rejected": -3.2527496814727783, "eval_loss": 0.5672591328620911, "eval_rewards/accuracies": 0.7129080295562744, "eval_rewards/chosen": -2.545128107070923, "eval_rewards/margins": 0.7076213359832764, "eval_rewards/rejected": -3.2527496814727783, "eval_runtime": 49.6983, "eval_samples_per_second": 27.063, "eval_sft_loss": 2.7121262550354004, "eval_steps_per_second": 6.781, "step": 2000 }, { "epoch": 1.0730891453420304, "grad_norm": 12.203435349479575, "learning_rate": 8.109789387282599e-07, "logits/chosen": -0.03285417705774307, "logits/rejected": 0.04845278710126877, "logps/chosen": -2.5501391887664795, "logps/rejected": -3.1087799072265625, "loss": 0.6308, "rewards/accuracies": 0.6875, "rewards/chosen": -2.5501391887664795, "rewards/margins": 0.5586405396461487, "rewards/rejected": -3.1087799072265625, "sft_loss": 2.6897871494293213, "step": 2005 }, { "epoch": 1.075765178123432, "grad_norm": 15.12578662488471, "learning_rate": 8.097579068946827e-07, "logits/chosen": -0.0025532320141792297, "logits/rejected": 0.14059139788150787, "logps/chosen": -2.384495973587036, "logps/rejected": -3.02142596244812, "loss": 0.5619, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.384495973587036, "rewards/margins": 0.636929988861084, "rewards/rejected": -3.02142596244812, "sft_loss": 2.541128635406494, "step": 2010 }, { "epoch": 1.0784412109048336, "grad_norm": 10.919169894003698, "learning_rate": 8.085338697885344e-07, "logits/chosen": -0.020297734066843987, "logits/rejected": 0.12689833343029022, "logps/chosen": -2.4526207447052, "logps/rejected": -3.089092969894409, "loss": 0.5716, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.4526207447052, "rewards/margins": 0.6364722847938538, "rewards/rejected": -3.089092969894409, "sft_loss": 2.522400140762329, "step": 2015 }, { "epoch": 1.081117243686235, "grad_norm": 12.437741570509774, "learning_rate": 8.073068392854282e-07, "logits/chosen": -0.16363094747066498, "logits/rejected": 0.08789423853158951, "logps/chosen": -2.60274600982666, "logps/rejected": -3.3508548736572266, "loss": 0.5272, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.60274600982666, "rewards/margins": 0.7481087446212769, "rewards/rejected": -3.3508548736572266, "sft_loss": 2.661445379257202, "step": 2020 }, { "epoch": 1.0837932764676368, "grad_norm": 9.100173797791186, "learning_rate": 8.060768272900193e-07, "logits/chosen": -0.024624507874250412, "logits/rejected": 0.15064984560012817, "logps/chosen": -2.4519011974334717, "logps/rejected": -3.172219753265381, "loss": 0.5684, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.4519011974334717, "rewards/margins": 0.7203187942504883, "rewards/rejected": -3.172219753265381, "sft_loss": 2.6379141807556152, "step": 2025 }, { "epoch": 1.0864693092490383, "grad_norm": 9.418736880908407, "learning_rate": 8.0484384573589e-07, "logits/chosen": -0.07204429060220718, "logits/rejected": -0.027175098657608032, "logps/chosen": -2.4381704330444336, "logps/rejected": -3.055497169494629, "loss": 0.5735, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.4381704330444336, "rewards/margins": 0.6173266172409058, "rewards/rejected": -3.055497169494629, "sft_loss": 2.570493698120117, "step": 2030 }, { "epoch": 1.0891453420304398, "grad_norm": 14.029045781506598, "learning_rate": 8.03607906585432e-07, "logits/chosen": -0.09846794605255127, "logits/rejected": 0.08520887792110443, "logps/chosen": -2.560640335083008, "logps/rejected": -3.2208092212677, "loss": 0.5881, "rewards/accuracies": 0.71875, "rewards/chosen": -2.560640335083008, "rewards/margins": 0.6601688265800476, "rewards/rejected": -3.2208092212677, "sft_loss": 2.736283779144287, "step": 2035 }, { "epoch": 1.0918213748118415, "grad_norm": 26.009942660064883, "learning_rate": 8.023690218297329e-07, "logits/chosen": -0.17696990072727203, "logits/rejected": -0.08916838467121124, "logps/chosen": -2.5884757041931152, "logps/rejected": -3.3237998485565186, "loss": 0.5634, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.5884757041931152, "rewards/margins": 0.7353242635726929, "rewards/rejected": -3.3237998485565186, "sft_loss": 2.683675527572632, "step": 2040 }, { "epoch": 1.094497407593243, "grad_norm": 12.131765431140455, "learning_rate": 8.01127203488458e-07, "logits/chosen": -0.040083326399326324, "logits/rejected": 0.01661711558699608, "logps/chosen": -2.7470474243164062, "logps/rejected": -3.420219898223877, "loss": 0.5626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.7470474243164062, "rewards/margins": 0.6731725335121155, "rewards/rejected": -3.420219898223877, "sft_loss": 2.886335849761963, "step": 2045 }, { "epoch": 1.0971734403746445, "grad_norm": 13.591583277242973, "learning_rate": 7.998824636097339e-07, "logits/chosen": -0.17442944645881653, "logits/rejected": -0.010418994352221489, "logps/chosen": -2.714845895767212, "logps/rejected": -3.4485251903533936, "loss": 0.5542, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.714845895767212, "rewards/margins": 0.7336792945861816, "rewards/rejected": -3.4485251903533936, "sft_loss": 2.9100375175476074, "step": 2050 }, { "epoch": 1.0998494731560462, "grad_norm": 16.109926012977844, "learning_rate": 7.986348142700328e-07, "logits/chosen": -0.11374533176422119, "logits/rejected": 0.06635691970586777, "logps/chosen": -2.861820936203003, "logps/rejected": -3.820934772491455, "loss": 0.5276, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.861820936203003, "rewards/margins": 0.9591139554977417, "rewards/rejected": -3.820934772491455, "sft_loss": 3.154653310775757, "step": 2055 }, { "epoch": 1.1025255059374477, "grad_norm": 16.53595169896097, "learning_rate": 7.973842675740539e-07, "logits/chosen": -0.05169066786766052, "logits/rejected": 0.02870332822203636, "logps/chosen": -2.8666768074035645, "logps/rejected": -3.7568085193634033, "loss": 0.5237, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.8666768074035645, "rewards/margins": 0.8901316523551941, "rewards/rejected": -3.7568085193634033, "sft_loss": 3.1012189388275146, "step": 2060 }, { "epoch": 1.1052015387188494, "grad_norm": 14.205600074036031, "learning_rate": 7.961308356546066e-07, "logits/chosen": -0.08631865680217743, "logits/rejected": 0.09614025056362152, "logps/chosen": -3.0432827472686768, "logps/rejected": -3.993431806564331, "loss": 0.5182, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.0432827472686768, "rewards/margins": 0.9501487016677856, "rewards/rejected": -3.993431806564331, "sft_loss": 3.184915065765381, "step": 2065 }, { "epoch": 1.107877571500251, "grad_norm": 18.08939313670418, "learning_rate": 7.948745306724931e-07, "logits/chosen": -0.08003537356853485, "logits/rejected": 0.1082262396812439, "logps/chosen": -2.843501091003418, "logps/rejected": -3.8439784049987793, "loss": 0.4818, "rewards/accuracies": 0.78125, "rewards/chosen": -2.843501091003418, "rewards/margins": 1.0004774332046509, "rewards/rejected": -3.8439784049987793, "sft_loss": 2.974362850189209, "step": 2070 }, { "epoch": 1.1105536042816524, "grad_norm": 17.045952758047306, "learning_rate": 7.936153648163897e-07, "logits/chosen": -0.150905042886734, "logits/rejected": -0.01554001122713089, "logps/chosen": -3.081815719604492, "logps/rejected": -3.82598876953125, "loss": 0.5928, "rewards/accuracies": 0.71875, "rewards/chosen": -3.081815719604492, "rewards/margins": 0.7441730499267578, "rewards/rejected": -3.82598876953125, "sft_loss": 3.387814998626709, "step": 2075 }, { "epoch": 1.1132296370630541, "grad_norm": 13.732378711313746, "learning_rate": 7.92353350302729e-07, "logits/chosen": -0.20131368935108185, "logits/rejected": 0.002260335488244891, "logps/chosen": -2.891542911529541, "logps/rejected": -3.7180428504943848, "loss": 0.5424, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.891542911529541, "rewards/margins": 0.826499342918396, "rewards/rejected": -3.7180428504943848, "sft_loss": 3.1301307678222656, "step": 2080 }, { "epoch": 1.1159056698444556, "grad_norm": 16.660204890320408, "learning_rate": 7.910884993755816e-07, "logits/chosen": -0.15335850417613983, "logits/rejected": -0.04589155316352844, "logps/chosen": -2.9131200313568115, "logps/rejected": -3.7110915184020996, "loss": 0.5577, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.9131200313568115, "rewards/margins": 0.7979711294174194, "rewards/rejected": -3.7110915184020996, "sft_loss": 3.070864200592041, "step": 2085 }, { "epoch": 1.118581702625857, "grad_norm": 13.701172511940262, "learning_rate": 7.898208243065367e-07, "logits/chosen": -0.18213477730751038, "logits/rejected": -0.1654680222272873, "logps/chosen": -2.819640874862671, "logps/rejected": -3.414013624191284, "loss": 0.6072, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.819640874862671, "rewards/margins": 0.5943728685379028, "rewards/rejected": -3.414013624191284, "sft_loss": 3.111002206802368, "step": 2090 }, { "epoch": 1.1212577354072588, "grad_norm": 16.53325718701653, "learning_rate": 7.88550337394583e-07, "logits/chosen": -0.21913953125476837, "logits/rejected": -0.05338749289512634, "logps/chosen": -3.027158498764038, "logps/rejected": -3.6880135536193848, "loss": 0.5923, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -3.027158498764038, "rewards/margins": 0.6608546376228333, "rewards/rejected": -3.6880135536193848, "sft_loss": 3.1259403228759766, "step": 2095 }, { "epoch": 1.1239337681886603, "grad_norm": 15.264267957216644, "learning_rate": 7.872770509659905e-07, "logits/chosen": -0.0772733986377716, "logits/rejected": -0.007263128645718098, "logps/chosen": -2.935605525970459, "logps/rejected": -3.6056618690490723, "loss": 0.5867, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.935605525970459, "rewards/margins": 0.6700563430786133, "rewards/rejected": -3.6056618690490723, "sft_loss": 3.046522855758667, "step": 2100 }, { "epoch": 1.1266098009700618, "grad_norm": 14.343161073716592, "learning_rate": 7.860009773741896e-07, "logits/chosen": -0.064370296895504, "logits/rejected": 0.11259357631206512, "logps/chosen": -2.7505581378936768, "logps/rejected": -3.5925374031066895, "loss": 0.5111, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7505581378936768, "rewards/margins": 0.8419793844223022, "rewards/rejected": -3.5925374031066895, "sft_loss": 2.818540096282959, "step": 2105 }, { "epoch": 1.1292858337514635, "grad_norm": 15.151824872919203, "learning_rate": 7.84722128999652e-07, "logits/chosen": -0.12127542495727539, "logits/rejected": 0.044526226818561554, "logps/chosen": -2.701420307159424, "logps/rejected": -3.569472551345825, "loss": 0.5615, "rewards/accuracies": 0.75, "rewards/chosen": -2.701420307159424, "rewards/margins": 0.8680523633956909, "rewards/rejected": -3.569472551345825, "sft_loss": 2.8453078269958496, "step": 2110 }, { "epoch": 1.131961866532865, "grad_norm": 13.018946550229275, "learning_rate": 7.834405182497699e-07, "logits/chosen": -0.039768896996974945, "logits/rejected": 0.02067652717232704, "logps/chosen": -2.7957961559295654, "logps/rejected": -3.5757274627685547, "loss": 0.5708, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7957961559295654, "rewards/margins": 0.779931366443634, "rewards/rejected": -3.5757274627685547, "sft_loss": 2.929673910140991, "step": 2115 }, { "epoch": 1.1346378993142665, "grad_norm": 15.20906486976137, "learning_rate": 7.821561575587368e-07, "logits/chosen": -0.1733914464712143, "logits/rejected": -0.12084399163722992, "logps/chosen": -2.7386631965637207, "logps/rejected": -3.3611443042755127, "loss": 0.5547, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.7386631965637207, "rewards/margins": 0.6224810481071472, "rewards/rejected": -3.3611443042755127, "sft_loss": 2.9166512489318848, "step": 2120 }, { "epoch": 1.1373139320956682, "grad_norm": 10.09351337769588, "learning_rate": 7.808690593874254e-07, "logits/chosen": -0.17015834152698517, "logits/rejected": -0.0607013925909996, "logps/chosen": -2.801271677017212, "logps/rejected": -3.549854278564453, "loss": 0.5552, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.801271677017212, "rewards/margins": 0.7485824823379517, "rewards/rejected": -3.549854278564453, "sft_loss": 2.9816415309906006, "step": 2125 }, { "epoch": 1.1399899648770697, "grad_norm": 13.731817537809986, "learning_rate": 7.79579236223268e-07, "logits/chosen": -0.10042476654052734, "logits/rejected": 0.1891237497329712, "logps/chosen": -2.7051987648010254, "logps/rejected": -3.6020240783691406, "loss": 0.5231, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7051987648010254, "rewards/margins": 0.8968254923820496, "rewards/rejected": -3.6020240783691406, "sft_loss": 2.8918728828430176, "step": 2130 }, { "epoch": 1.1426659976584714, "grad_norm": 13.688072489475777, "learning_rate": 7.782867005801346e-07, "logits/chosen": -0.11776898056268692, "logits/rejected": 0.11693098396062851, "logps/chosen": -2.7273452281951904, "logps/rejected": -3.5392394065856934, "loss": 0.5442, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.7273452281951904, "rewards/margins": 0.8118942975997925, "rewards/rejected": -3.5392394065856934, "sft_loss": 2.8112258911132812, "step": 2135 }, { "epoch": 1.145342030439873, "grad_norm": 17.846087519185875, "learning_rate": 7.769914649982117e-07, "logits/chosen": -0.11212320625782013, "logits/rejected": 0.05280379205942154, "logps/chosen": -2.698925018310547, "logps/rejected": -3.4237124919891357, "loss": 0.5754, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.698925018310547, "rewards/margins": 0.7247874736785889, "rewards/rejected": -3.4237124919891357, "sft_loss": 2.8307290077209473, "step": 2140 }, { "epoch": 1.1480180632212744, "grad_norm": 14.376053087200567, "learning_rate": 7.756935420438803e-07, "logits/chosen": -0.09898360073566437, "logits/rejected": 0.021031454205513, "logps/chosen": -2.5822346210479736, "logps/rejected": -3.566603899002075, "loss": 0.519, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.5822346210479736, "rewards/margins": 0.9843686819076538, "rewards/rejected": -3.566603899002075, "sft_loss": 2.718616008758545, "step": 2145 }, { "epoch": 1.1506940960026761, "grad_norm": 11.203256988700122, "learning_rate": 7.743929443095951e-07, "logits/chosen": -0.12101199477910995, "logits/rejected": -0.04032517224550247, "logps/chosen": -2.7327792644500732, "logps/rejected": -3.550232410430908, "loss": 0.5139, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7327792644500732, "rewards/margins": 0.8174529075622559, "rewards/rejected": -3.550232410430908, "sft_loss": 2.8194966316223145, "step": 2150 }, { "epoch": 1.1533701287840776, "grad_norm": 13.665641869802482, "learning_rate": 7.730896844137609e-07, "logits/chosen": -0.06756674498319626, "logits/rejected": 0.04592638835310936, "logps/chosen": -2.945727586746216, "logps/rejected": -3.566514492034912, "loss": 0.608, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.945727586746216, "rewards/margins": 0.6207873821258545, "rewards/rejected": -3.566514492034912, "sft_loss": 3.0970585346221924, "step": 2155 }, { "epoch": 1.1560461615654791, "grad_norm": 16.169765648383418, "learning_rate": 7.717837750006106e-07, "logits/chosen": -0.09466465562582016, "logits/rejected": 0.007165367715060711, "logps/chosen": -2.7117316722869873, "logps/rejected": -3.6370861530303955, "loss": 0.5396, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7117316722869873, "rewards/margins": 0.9253547787666321, "rewards/rejected": -3.6370861530303955, "sft_loss": 2.8119845390319824, "step": 2160 }, { "epoch": 1.1587221943468808, "grad_norm": 13.68381522374296, "learning_rate": 7.704752287400832e-07, "logits/chosen": -0.07723397761583328, "logits/rejected": 0.1379738301038742, "logps/chosen": -2.777071475982666, "logps/rejected": -3.6475002765655518, "loss": 0.5489, "rewards/accuracies": 0.75, "rewards/chosen": -2.777071475982666, "rewards/margins": 0.8704291582107544, "rewards/rejected": -3.6475002765655518, "sft_loss": 2.910811424255371, "step": 2165 }, { "epoch": 1.1613982271282823, "grad_norm": 9.552484298341081, "learning_rate": 7.691640583277004e-07, "logits/chosen": -0.08300045132637024, "logits/rejected": 0.10612811893224716, "logps/chosen": -2.6649529933929443, "logps/rejected": -3.496000289916992, "loss": 0.5528, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.6649529933929443, "rewards/margins": 0.8310472369194031, "rewards/rejected": -3.496000289916992, "sft_loss": 2.795423984527588, "step": 2170 }, { "epoch": 1.1640742599096838, "grad_norm": 10.432443980432863, "learning_rate": 7.678502764844433e-07, "logits/chosen": -0.13375064730644226, "logits/rejected": 0.08962388336658478, "logps/chosen": -2.653404712677002, "logps/rejected": -3.332920551300049, "loss": 0.5569, "rewards/accuracies": 0.75, "rewards/chosen": -2.653404712677002, "rewards/margins": 0.679515540599823, "rewards/rejected": -3.332920551300049, "sft_loss": 2.7964444160461426, "step": 2175 }, { "epoch": 1.1667502926910855, "grad_norm": 10.949292698095055, "learning_rate": 7.665338959566288e-07, "logits/chosen": -0.09721340239048004, "logits/rejected": 0.022823205217719078, "logps/chosen": -2.5667824745178223, "logps/rejected": -3.463876724243164, "loss": 0.5048, "rewards/accuracies": 0.78125, "rewards/chosen": -2.5667824745178223, "rewards/margins": 0.8970942497253418, "rewards/rejected": -3.463876724243164, "sft_loss": 2.7630155086517334, "step": 2180 }, { "epoch": 1.169426325472487, "grad_norm": 16.319514759464216, "learning_rate": 7.652149295157868e-07, "logits/chosen": -0.019224589690566063, "logits/rejected": 0.17753568291664124, "logps/chosen": -2.734881639480591, "logps/rejected": -3.398794651031494, "loss": 0.5615, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.734881639480591, "rewards/margins": 0.6639131307601929, "rewards/rejected": -3.398794651031494, "sft_loss": 2.8227455615997314, "step": 2185 }, { "epoch": 1.1721023582538885, "grad_norm": 11.860038540569555, "learning_rate": 7.638933899585354e-07, "logits/chosen": 0.06096430495381355, "logits/rejected": 0.0971364825963974, "logps/chosen": -2.660165309906006, "logps/rejected": -3.4485931396484375, "loss": 0.5471, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.660165309906006, "rewards/margins": 0.7884277105331421, "rewards/rejected": -3.4485931396484375, "sft_loss": 2.973078727722168, "step": 2190 }, { "epoch": 1.1747783910352902, "grad_norm": 12.804501417380948, "learning_rate": 7.625692901064573e-07, "logits/chosen": -0.04396064206957817, "logits/rejected": 0.08522314578294754, "logps/chosen": -2.8927605152130127, "logps/rejected": -3.7299964427948, "loss": 0.5893, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.8927605152130127, "rewards/margins": 0.8372361063957214, "rewards/rejected": -3.7299964427948, "sft_loss": 3.15350604057312, "step": 2195 }, { "epoch": 1.1774544238166917, "grad_norm": 11.705850285187246, "learning_rate": 7.61242642805975e-07, "logits/chosen": -0.0731678232550621, "logits/rejected": -0.07998080551624298, "logps/chosen": -2.737246036529541, "logps/rejected": -3.451080322265625, "loss": 0.5729, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.737246036529541, "rewards/margins": 0.7138343453407288, "rewards/rejected": -3.451080322265625, "sft_loss": 2.9744622707366943, "step": 2200 }, { "epoch": 1.1801304565980932, "grad_norm": 11.625726585306893, "learning_rate": 7.599134609282266e-07, "logits/chosen": -0.15962481498718262, "logits/rejected": 0.07510614395141602, "logps/chosen": -2.7328078746795654, "logps/rejected": -3.451481580734253, "loss": 0.5604, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7328078746795654, "rewards/margins": 0.7186736464500427, "rewards/rejected": -3.451481580734253, "sft_loss": 2.8423523902893066, "step": 2205 }, { "epoch": 1.182806489379495, "grad_norm": 11.640982228332257, "learning_rate": 7.585817573689402e-07, "logits/chosen": -0.1442055106163025, "logits/rejected": 0.0014272450935095549, "logps/chosen": -2.437304735183716, "logps/rejected": -3.400106906890869, "loss": 0.492, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.437304735183716, "rewards/margins": 0.9628024101257324, "rewards/rejected": -3.400106906890869, "sft_loss": 2.632190227508545, "step": 2210 }, { "epoch": 1.1854825221608964, "grad_norm": 11.954836265342767, "learning_rate": 7.572475450483098e-07, "logits/chosen": -0.13480237126350403, "logits/rejected": -0.02519914135336876, "logps/chosen": -2.741669178009033, "logps/rejected": -3.451484203338623, "loss": 0.5723, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.741669178009033, "rewards/margins": 0.7098146677017212, "rewards/rejected": -3.451484203338623, "sft_loss": 2.809096097946167, "step": 2215 }, { "epoch": 1.188158554942298, "grad_norm": 13.788219036304126, "learning_rate": 7.559108369108689e-07, "logits/chosen": -0.19939342141151428, "logits/rejected": -0.054556954652071, "logps/chosen": -2.576840877532959, "logps/rejected": -3.2807235717773438, "loss": 0.5834, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.576840877532959, "rewards/margins": 0.7038829922676086, "rewards/rejected": -3.2807235717773438, "sft_loss": 2.7406558990478516, "step": 2220 }, { "epoch": 1.1908345877236997, "grad_norm": 11.214294947271048, "learning_rate": 7.54571645925366e-07, "logits/chosen": -0.20111767947673798, "logits/rejected": 0.10563284158706665, "logps/chosen": -2.5377230644226074, "logps/rejected": -3.4687418937683105, "loss": 0.5078, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.5377230644226074, "rewards/margins": 0.9310193061828613, "rewards/rejected": -3.4687418937683105, "sft_loss": 2.7131781578063965, "step": 2225 }, { "epoch": 1.1935106205051011, "grad_norm": 14.038927448722399, "learning_rate": 7.532299850846378e-07, "logits/chosen": -0.1545659601688385, "logits/rejected": 0.048467788845300674, "logps/chosen": -2.569761037826538, "logps/rejected": -3.4898669719696045, "loss": 0.5239, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.569761037826538, "rewards/margins": 0.9201061129570007, "rewards/rejected": -3.4898669719696045, "sft_loss": 2.6959354877471924, "step": 2230 }, { "epoch": 1.1961866532865026, "grad_norm": 22.202008098046015, "learning_rate": 7.518858674054838e-07, "logits/chosen": -0.16763782501220703, "logits/rejected": 0.07074837386608124, "logps/chosen": -2.594499111175537, "logps/rejected": -3.4791977405548096, "loss": 0.5313, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.594499111175537, "rewards/margins": 0.8846985101699829, "rewards/rejected": -3.4791977405548096, "sft_loss": 2.7077810764312744, "step": 2235 }, { "epoch": 1.1988626860679044, "grad_norm": 11.638140383512662, "learning_rate": 7.505393059285394e-07, "logits/chosen": -0.1514950543642044, "logits/rejected": 0.05857670307159424, "logps/chosen": -2.7575697898864746, "logps/rejected": -3.520163059234619, "loss": 0.5629, "rewards/accuracies": 0.75, "rewards/chosen": -2.7575697898864746, "rewards/margins": 0.7625933885574341, "rewards/rejected": -3.520163059234619, "sft_loss": 2.9731178283691406, "step": 2240 }, { "epoch": 1.2015387188493059, "grad_norm": 13.490389163955774, "learning_rate": 7.491903137181501e-07, "logits/chosen": -0.06201281026005745, "logits/rejected": -0.00964923482388258, "logps/chosen": -2.607433795928955, "logps/rejected": -3.3961021900177, "loss": 0.5365, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.607433795928955, "rewards/margins": 0.7886683940887451, "rewards/rejected": -3.3961021900177, "sft_loss": 2.8041486740112305, "step": 2245 }, { "epoch": 1.2042147516307076, "grad_norm": 13.885635892847203, "learning_rate": 7.478389038622441e-07, "logits/chosen": 0.021996164694428444, "logits/rejected": 0.07282289117574692, "logps/chosen": -2.7160027027130127, "logps/rejected": -3.5241007804870605, "loss": 0.5597, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.7160027027130127, "rewards/margins": 0.8080977201461792, "rewards/rejected": -3.5241007804870605, "sft_loss": 2.846470594406128, "step": 2250 }, { "epoch": 1.206890784412109, "grad_norm": 17.65495152779927, "learning_rate": 7.46485089472206e-07, "logits/chosen": -0.07781971246004105, "logits/rejected": 0.011274236254394054, "logps/chosen": -2.707087993621826, "logps/rejected": -3.4074695110321045, "loss": 0.6023, "rewards/accuracies": 0.71875, "rewards/chosen": -2.707087993621826, "rewards/margins": 0.7003816366195679, "rewards/rejected": -3.4074695110321045, "sft_loss": 2.811058521270752, "step": 2255 }, { "epoch": 1.2095668171935106, "grad_norm": 12.9064548583628, "learning_rate": 7.451288836827487e-07, "logits/chosen": -0.034752242267131805, "logits/rejected": -0.03955184295773506, "logps/chosen": -2.5647547245025635, "logps/rejected": -3.176440954208374, "loss": 0.5846, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.5647547245025635, "rewards/margins": 0.6116862893104553, "rewards/rejected": -3.176440954208374, "sft_loss": 2.714322328567505, "step": 2260 }, { "epoch": 1.2122428499749123, "grad_norm": 11.576293376589549, "learning_rate": 7.437702996517869e-07, "logits/chosen": -0.0587887242436409, "logits/rejected": 0.057128388434648514, "logps/chosen": -2.555610179901123, "logps/rejected": -3.3883907794952393, "loss": 0.5334, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.555610179901123, "rewards/margins": 0.8327801823616028, "rewards/rejected": -3.3883907794952393, "sft_loss": 2.7125325202941895, "step": 2265 }, { "epoch": 1.2149188827563138, "grad_norm": 16.459688569958313, "learning_rate": 7.424093505603087e-07, "logits/chosen": -0.21486875414848328, "logits/rejected": 0.0009250387665815651, "logps/chosen": -2.632251501083374, "logps/rejected": -3.5194313526153564, "loss": 0.5191, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.632251501083374, "rewards/margins": 0.8871792554855347, "rewards/rejected": -3.5194313526153564, "sft_loss": 2.7042503356933594, "step": 2270 }, { "epoch": 1.2175949155377153, "grad_norm": 12.02172575643201, "learning_rate": 7.410460496122482e-07, "logits/chosen": -0.11882486194372177, "logits/rejected": 0.061978042125701904, "logps/chosen": -2.511664867401123, "logps/rejected": -3.44714093208313, "loss": 0.5004, "rewards/accuracies": 0.78125, "rewards/chosen": -2.511664867401123, "rewards/margins": 0.9354757070541382, "rewards/rejected": -3.44714093208313, "sft_loss": 2.6550300121307373, "step": 2275 }, { "epoch": 1.220270948319117, "grad_norm": 12.938920832651148, "learning_rate": 7.396804100343572e-07, "logits/chosen": -0.21035528182983398, "logits/rejected": 0.024009039625525475, "logps/chosen": -2.495543956756592, "logps/rejected": -3.2615630626678467, "loss": 0.5356, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.495543956756592, "rewards/margins": 0.7660187482833862, "rewards/rejected": -3.2615630626678467, "sft_loss": 2.665358304977417, "step": 2280 }, { "epoch": 1.2229469811005185, "grad_norm": 10.660246082448193, "learning_rate": 7.383124450760768e-07, "logits/chosen": -0.1773698329925537, "logits/rejected": 0.060549378395080566, "logps/chosen": -2.765693426132202, "logps/rejected": -3.6653060913085938, "loss": 0.5215, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.765693426132202, "rewards/margins": 0.8996122479438782, "rewards/rejected": -3.6653060913085938, "sft_loss": 2.8947532176971436, "step": 2285 }, { "epoch": 1.22562301388192, "grad_norm": 11.952228684288013, "learning_rate": 7.369421680094091e-07, "logits/chosen": -0.25170189142227173, "logits/rejected": -0.06527785211801529, "logps/chosen": -2.6405110359191895, "logps/rejected": -3.569425106048584, "loss": 0.5445, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.6405110359191895, "rewards/margins": 0.9289140701293945, "rewards/rejected": -3.569425106048584, "sft_loss": 2.804739236831665, "step": 2290 }, { "epoch": 1.2282990466633217, "grad_norm": 14.512637560124094, "learning_rate": 7.355695921287881e-07, "logits/chosen": -0.18567724525928497, "logits/rejected": -0.07959018647670746, "logps/chosen": -2.851104259490967, "logps/rejected": -3.550295352935791, "loss": 0.604, "rewards/accuracies": 0.6875, "rewards/chosen": -2.851104259490967, "rewards/margins": 0.6991909742355347, "rewards/rejected": -3.550295352935791, "sft_loss": 3.084052324295044, "step": 2295 }, { "epoch": 1.2309750794447232, "grad_norm": 17.14862593143908, "learning_rate": 7.341947307509513e-07, "logits/chosen": -0.14178717136383057, "logits/rejected": 0.025927435606718063, "logps/chosen": -2.7473275661468506, "logps/rejected": -3.539985179901123, "loss": 0.584, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7473275661468506, "rewards/margins": 0.7926574945449829, "rewards/rejected": -3.539985179901123, "sft_loss": 2.9406330585479736, "step": 2300 }, { "epoch": 1.233651112226125, "grad_norm": 14.567551554858559, "learning_rate": 7.328175972148094e-07, "logits/chosen": -0.12181315571069717, "logits/rejected": 0.027345454320311546, "logps/chosen": -2.9586310386657715, "logps/rejected": -3.7822742462158203, "loss": 0.5585, "rewards/accuracies": 0.75, "rewards/chosen": -2.9586310386657715, "rewards/margins": 0.8236430883407593, "rewards/rejected": -3.7822742462158203, "sft_loss": 3.0613818168640137, "step": 2305 }, { "epoch": 1.2363271450075264, "grad_norm": 14.46195645390294, "learning_rate": 7.314382048813185e-07, "logits/chosen": -0.07838133722543716, "logits/rejected": 0.24125882983207703, "logps/chosen": -2.6525206565856934, "logps/rejected": -3.5842769145965576, "loss": 0.5071, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.6525206565856934, "rewards/margins": 0.9317564964294434, "rewards/rejected": -3.5842769145965576, "sft_loss": 2.7743771076202393, "step": 2310 }, { "epoch": 1.2390031777889279, "grad_norm": 12.667723186738346, "learning_rate": 7.300565671333486e-07, "logits/chosen": -0.16325172781944275, "logits/rejected": 0.09019894897937775, "logps/chosen": -2.7748184204101562, "logps/rejected": -3.6521975994110107, "loss": 0.5157, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7748184204101562, "rewards/margins": 0.8773792386054993, "rewards/rejected": -3.6521975994110107, "sft_loss": 2.919607400894165, "step": 2315 }, { "epoch": 1.2416792105703296, "grad_norm": 9.821970272156106, "learning_rate": 7.286726973755554e-07, "logits/chosen": -0.0405716709792614, "logits/rejected": 0.012422848492860794, "logps/chosen": -2.7273387908935547, "logps/rejected": -3.563554048538208, "loss": 0.5221, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7273387908935547, "rewards/margins": 0.8362150192260742, "rewards/rejected": -3.563554048538208, "sft_loss": 2.8721890449523926, "step": 2320 }, { "epoch": 1.244355243351731, "grad_norm": 11.2576869894753, "learning_rate": 7.272866090342493e-07, "logits/chosen": 0.014239540323615074, "logits/rejected": 0.10396864265203476, "logps/chosen": -2.7915005683898926, "logps/rejected": -3.6830050945281982, "loss": 0.5032, "rewards/accuracies": 0.78125, "rewards/chosen": -2.7915005683898926, "rewards/margins": 0.8915045857429504, "rewards/rejected": -3.6830050945281982, "sft_loss": 2.8475332260131836, "step": 2325 }, { "epoch": 1.2470312761331326, "grad_norm": 14.886226508028688, "learning_rate": 7.258983155572656e-07, "logits/chosen": -0.1416918933391571, "logits/rejected": -0.013908380642533302, "logps/chosen": -2.7864866256713867, "logps/rejected": -3.65608549118042, "loss": 0.5579, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.7864866256713867, "rewards/margins": 0.8695987462997437, "rewards/rejected": -3.65608549118042, "sft_loss": 3.0220208168029785, "step": 2330 }, { "epoch": 1.2497073089145343, "grad_norm": 11.78564603000481, "learning_rate": 7.245078304138335e-07, "logits/chosen": -0.04754914715886116, "logits/rejected": 0.0625465139746666, "logps/chosen": -2.9272003173828125, "logps/rejected": -3.7455573081970215, "loss": 0.5553, "rewards/accuracies": 0.75, "rewards/chosen": -2.9272003173828125, "rewards/margins": 0.8183562159538269, "rewards/rejected": -3.7455573081970215, "sft_loss": 3.047041416168213, "step": 2335 }, { "epoch": 1.2523833416959358, "grad_norm": 11.619307815290895, "learning_rate": 7.231151670944462e-07, "logits/chosen": -0.22953256964683533, "logits/rejected": 0.018267208710312843, "logps/chosen": -2.9490160942077637, "logps/rejected": -3.734051465988159, "loss": 0.5658, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.9490160942077637, "rewards/margins": 0.7850354313850403, "rewards/rejected": -3.734051465988159, "sft_loss": 3.06479811668396, "step": 2340 }, { "epoch": 1.2550593744773373, "grad_norm": 11.88694919613225, "learning_rate": 7.217203391107291e-07, "logits/chosen": -0.15705767273902893, "logits/rejected": 0.05266653373837471, "logps/chosen": -2.8118395805358887, "logps/rejected": -3.7154057025909424, "loss": 0.5396, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.8118395805358887, "rewards/margins": 0.9035660028457642, "rewards/rejected": -3.7154057025909424, "sft_loss": 2.9647486209869385, "step": 2345 }, { "epoch": 1.257735407258739, "grad_norm": 11.194729124579853, "learning_rate": 7.203233599953096e-07, "logits/chosen": -0.11801191419363022, "logits/rejected": 0.08426766842603683, "logps/chosen": -2.867255449295044, "logps/rejected": -3.711289882659912, "loss": 0.5432, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.867255449295044, "rewards/margins": 0.8440347909927368, "rewards/rejected": -3.711289882659912, "sft_loss": 2.996035099029541, "step": 2350 }, { "epoch": 1.2604114400401405, "grad_norm": 15.396629241648583, "learning_rate": 7.189242433016852e-07, "logits/chosen": -0.07606076449155807, "logits/rejected": 0.08415577560663223, "logps/chosen": -2.6616203784942627, "logps/rejected": -3.712498188018799, "loss": 0.5173, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6616203784942627, "rewards/margins": 1.0508776903152466, "rewards/rejected": -3.712498188018799, "sft_loss": 2.822685480117798, "step": 2355 }, { "epoch": 1.263087472821542, "grad_norm": 15.779769905906738, "learning_rate": 7.17523002604092e-07, "logits/chosen": -0.1172393336892128, "logits/rejected": 0.0867505595088005, "logps/chosen": -2.902477979660034, "logps/rejected": -3.823699474334717, "loss": 0.5439, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.902477979660034, "rewards/margins": 0.9212223291397095, "rewards/rejected": -3.823699474334717, "sft_loss": 3.1079139709472656, "step": 2360 }, { "epoch": 1.2657635056029437, "grad_norm": 14.826732984520188, "learning_rate": 7.161196514973734e-07, "logits/chosen": -0.0951496809720993, "logits/rejected": 0.10926233232021332, "logps/chosen": -2.767425775527954, "logps/rejected": -3.7156193256378174, "loss": 0.5329, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.767425775527954, "rewards/margins": 0.9481937289237976, "rewards/rejected": -3.7156193256378174, "sft_loss": 2.93123197555542, "step": 2365 }, { "epoch": 1.2684395383843452, "grad_norm": 15.424680439322838, "learning_rate": 7.147142035968483e-07, "logits/chosen": -0.05707361549139023, "logits/rejected": 0.14730793237686157, "logps/chosen": -2.9719011783599854, "logps/rejected": -3.8209996223449707, "loss": 0.5497, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.9719011783599854, "rewards/margins": 0.8490983247756958, "rewards/rejected": -3.8209996223449707, "sft_loss": 3.1319332122802734, "step": 2370 }, { "epoch": 1.2711155711657467, "grad_norm": 16.015488945324908, "learning_rate": 7.133066725381781e-07, "logits/chosen": -0.19962266087532043, "logits/rejected": 0.032320786267519, "logps/chosen": -2.7625584602355957, "logps/rejected": -3.652691602706909, "loss": 0.5326, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.7625584602355957, "rewards/margins": 0.8901335597038269, "rewards/rejected": -3.652691602706909, "sft_loss": 2.890148639678955, "step": 2375 }, { "epoch": 1.2737916039471484, "grad_norm": 13.416203673025201, "learning_rate": 7.118970719772354e-07, "logits/chosen": -0.1521248072385788, "logits/rejected": 0.05423276498913765, "logps/chosen": -2.9326486587524414, "logps/rejected": -3.9643986225128174, "loss": 0.5213, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.9326486587524414, "rewards/margins": 1.0317497253417969, "rewards/rejected": -3.9643986225128174, "sft_loss": 3.080808162689209, "step": 2380 }, { "epoch": 1.27646763672855, "grad_norm": 16.355261452513187, "learning_rate": 7.104854155899711e-07, "logits/chosen": -0.056324996054172516, "logits/rejected": 0.09308980405330658, "logps/chosen": -2.9391448497772217, "logps/rejected": -3.81658673286438, "loss": 0.5471, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.9391448497772217, "rewards/margins": 0.8774418830871582, "rewards/rejected": -3.81658673286438, "sft_loss": 3.0427327156066895, "step": 2385 }, { "epoch": 1.2791436695099514, "grad_norm": 15.660593397945485, "learning_rate": 7.090717170722817e-07, "logits/chosen": -0.07351523637771606, "logits/rejected": 0.0036463707219809294, "logps/chosen": -2.8803794384002686, "logps/rejected": -3.9230926036834717, "loss": 0.4815, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.8803794384002686, "rewards/margins": 1.0427131652832031, "rewards/rejected": -3.9230926036834717, "sft_loss": 3.0287270545959473, "step": 2390 }, { "epoch": 1.2818197022913531, "grad_norm": 13.5979207659121, "learning_rate": 7.076559901398762e-07, "logits/chosen": -0.2557370960712433, "logits/rejected": -0.08898217976093292, "logps/chosen": -2.7281875610351562, "logps/rejected": -3.4912161827087402, "loss": 0.5508, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.7281875610351562, "rewards/margins": 0.7630285024642944, "rewards/rejected": -3.4912161827087402, "sft_loss": 2.8979766368865967, "step": 2395 }, { "epoch": 1.2844957350727546, "grad_norm": 21.81683023409898, "learning_rate": 7.062382485281436e-07, "logits/chosen": -0.17235462367534637, "logits/rejected": -0.011511986143887043, "logps/chosen": -2.8245368003845215, "logps/rejected": -3.442866086959839, "loss": 0.5895, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.8245368003845215, "rewards/margins": 0.6183292269706726, "rewards/rejected": -3.442866086959839, "sft_loss": 2.9664804935455322, "step": 2400 }, { "epoch": 1.2844957350727546, "eval_logits/chosen": 0.21740129590034485, "eval_logits/rejected": 0.3361586034297943, "eval_logps/chosen": -2.896199941635132, "eval_logps/rejected": -3.7486469745635986, "eval_loss": 0.5589631199836731, "eval_rewards/accuracies": 0.7321958541870117, "eval_rewards/chosen": -2.896199941635132, "eval_rewards/margins": 0.8524471521377563, "eval_rewards/rejected": -3.7486469745635986, "eval_runtime": 49.6629, "eval_samples_per_second": 27.083, "eval_sft_loss": 3.063127279281616, "eval_steps_per_second": 6.786, "step": 2400 }, { "epoch": 1.287171767854156, "grad_norm": 10.169197101778261, "learning_rate": 7.048185059920193e-07, "logits/chosen": -0.13088330626487732, "logits/rejected": 0.033510565757751465, "logps/chosen": -2.812943935394287, "logps/rejected": -3.8146815299987793, "loss": 0.5221, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.812943935394287, "rewards/margins": 1.0017378330230713, "rewards/rejected": -3.8146815299987793, "sft_loss": 2.9715335369110107, "step": 2405 }, { "epoch": 1.2898478006355578, "grad_norm": 15.283932968431893, "learning_rate": 7.033967763058516e-07, "logits/chosen": -0.21596041321754456, "logits/rejected": 0.0054006329737603664, "logps/chosen": -2.8387198448181152, "logps/rejected": -3.5503010749816895, "loss": 0.5535, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.8387198448181152, "rewards/margins": 0.7115810513496399, "rewards/rejected": -3.5503010749816895, "sft_loss": 2.988356113433838, "step": 2410 }, { "epoch": 1.2925238334169593, "grad_norm": 12.239190981465617, "learning_rate": 7.019730732632681e-07, "logits/chosen": -0.06232795864343643, "logits/rejected": 0.05048118159174919, "logps/chosen": -2.8041563034057617, "logps/rejected": -3.745462417602539, "loss": 0.5402, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.8041563034057617, "rewards/margins": 0.941305935382843, "rewards/rejected": -3.745462417602539, "sft_loss": 2.9846417903900146, "step": 2415 }, { "epoch": 1.2951998661983608, "grad_norm": 10.981673187841457, "learning_rate": 7.005474106770418e-07, "logits/chosen": -0.18740686774253845, "logits/rejected": -0.04007058963179588, "logps/chosen": -2.7511918544769287, "logps/rejected": -3.655221939086914, "loss": 0.5035, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7511918544769287, "rewards/margins": 0.9040305018424988, "rewards/rejected": -3.655221939086914, "sft_loss": 2.9835357666015625, "step": 2420 }, { "epoch": 1.2978758989797625, "grad_norm": 11.537702916654943, "learning_rate": 6.991198023789577e-07, "logits/chosen": -0.10398707538843155, "logits/rejected": 0.002530190395191312, "logps/chosen": -2.594005584716797, "logps/rejected": -3.307220935821533, "loss": 0.5364, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.594005584716797, "rewards/margins": 0.7132154703140259, "rewards/rejected": -3.307220935821533, "sft_loss": 2.844587802886963, "step": 2425 }, { "epoch": 1.300551931761164, "grad_norm": 14.650164088227385, "learning_rate": 6.976902622196776e-07, "logits/chosen": -0.13141386210918427, "logits/rejected": -0.04375090077519417, "logps/chosen": -2.753594160079956, "logps/rejected": -3.594632625579834, "loss": 0.5434, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.753594160079956, "rewards/margins": 0.8410388231277466, "rewards/rejected": -3.594632625579834, "sft_loss": 2.9324729442596436, "step": 2430 }, { "epoch": 1.3032279645425655, "grad_norm": 12.054791406137, "learning_rate": 6.962588040686064e-07, "logits/chosen": -0.13357993960380554, "logits/rejected": 0.04774869233369827, "logps/chosen": -2.727971315383911, "logps/rejected": -3.395655870437622, "loss": 0.6061, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.727971315383911, "rewards/margins": 0.6676840782165527, "rewards/rejected": -3.395655870437622, "sft_loss": 2.8951988220214844, "step": 2435 }, { "epoch": 1.3059039973239672, "grad_norm": 13.956191863559187, "learning_rate": 6.948254418137573e-07, "logits/chosen": -0.152082160115242, "logits/rejected": 0.005078119225800037, "logps/chosen": -2.735032558441162, "logps/rejected": -3.594900131225586, "loss": 0.5572, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.735032558441162, "rewards/margins": 0.8598672747612, "rewards/rejected": -3.594900131225586, "sft_loss": 2.8514065742492676, "step": 2440 }, { "epoch": 1.3085800301053687, "grad_norm": 17.880626952480096, "learning_rate": 6.933901893616174e-07, "logits/chosen": -0.18042948842048645, "logits/rejected": 0.001165248453617096, "logps/chosen": -2.8338892459869385, "logps/rejected": -3.564720630645752, "loss": 0.5791, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.8338892459869385, "rewards/margins": 0.7308312654495239, "rewards/rejected": -3.564720630645752, "sft_loss": 2.9869532585144043, "step": 2445 }, { "epoch": 1.3112560628867704, "grad_norm": 17.936263693494023, "learning_rate": 6.919530606370121e-07, "logits/chosen": -0.11919529736042023, "logits/rejected": 0.06414364278316498, "logps/chosen": -2.6797900199890137, "logps/rejected": -3.5703368186950684, "loss": 0.5222, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.6797900199890137, "rewards/margins": 0.8905467987060547, "rewards/rejected": -3.5703368186950684, "sft_loss": 2.842641592025757, "step": 2450 }, { "epoch": 1.313932095668172, "grad_norm": 11.778029290115247, "learning_rate": 6.905140695829706e-07, "logits/chosen": -0.2211964875459671, "logits/rejected": 0.10638616979122162, "logps/chosen": -2.8779773712158203, "logps/rejected": -3.729775905609131, "loss": 0.5211, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.8779773712158203, "rewards/margins": 0.8517991304397583, "rewards/rejected": -3.729775905609131, "sft_loss": 2.9841933250427246, "step": 2455 }, { "epoch": 1.3166081284495736, "grad_norm": 21.232965650496542, "learning_rate": 6.890732301605904e-07, "logits/chosen": -0.1100403293967247, "logits/rejected": 0.023740727454423904, "logps/chosen": -2.82916522026062, "logps/rejected": -3.5652172565460205, "loss": 0.5753, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.82916522026062, "rewards/margins": 0.7360517382621765, "rewards/rejected": -3.5652172565460205, "sft_loss": 2.94977068901062, "step": 2460 }, { "epoch": 1.3192841612309751, "grad_norm": 13.816982282247295, "learning_rate": 6.876305563489021e-07, "logits/chosen": -0.13921356201171875, "logits/rejected": -0.0037501081824302673, "logps/chosen": -3.0031192302703857, "logps/rejected": -4.045921802520752, "loss": 0.4954, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0031192302703857, "rewards/margins": 1.0428025722503662, "rewards/rejected": -4.045921802520752, "sft_loss": 3.080378770828247, "step": 2465 }, { "epoch": 1.3219601940123766, "grad_norm": 17.284787525126063, "learning_rate": 6.861860621447331e-07, "logits/chosen": -0.24131083488464355, "logits/rejected": -0.09177212417125702, "logps/chosen": -3.058800220489502, "logps/rejected": -3.75386118888855, "loss": 0.5922, "rewards/accuracies": 0.71875, "rewards/chosen": -3.058800220489502, "rewards/margins": 0.6950610280036926, "rewards/rejected": -3.75386118888855, "sft_loss": 3.283332347869873, "step": 2470 }, { "epoch": 1.3246362267937783, "grad_norm": 16.4037189947279, "learning_rate": 6.847397615625725e-07, "logits/chosen": -0.05851219221949577, "logits/rejected": 0.018921365961432457, "logps/chosen": -3.0433387756347656, "logps/rejected": -3.802337646484375, "loss": 0.563, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.0433387756347656, "rewards/margins": 0.7589989900588989, "rewards/rejected": -3.802337646484375, "sft_loss": 3.2078189849853516, "step": 2475 }, { "epoch": 1.3273122595751798, "grad_norm": 12.783966343392903, "learning_rate": 6.83291668634435e-07, "logits/chosen": -0.23455576598644257, "logits/rejected": 0.007899327203631401, "logps/chosen": -2.9427683353424072, "logps/rejected": -3.9789185523986816, "loss": 0.5023, "rewards/accuracies": 0.78125, "rewards/chosen": -2.9427683353424072, "rewards/margins": 1.0361502170562744, "rewards/rejected": -3.9789185523986816, "sft_loss": 3.2492759227752686, "step": 2480 }, { "epoch": 1.3299882923565813, "grad_norm": 14.533663119876245, "learning_rate": 6.818417974097246e-07, "logits/chosen": 0.0017811127472668886, "logits/rejected": 0.1938328891992569, "logps/chosen": -2.8244447708129883, "logps/rejected": -3.9046339988708496, "loss": 0.5096, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.8244447708129883, "rewards/margins": 1.0801894664764404, "rewards/rejected": -3.9046339988708496, "sft_loss": 3.1318705081939697, "step": 2485 }, { "epoch": 1.332664325137983, "grad_norm": 13.898077604356237, "learning_rate": 6.803901619550981e-07, "logits/chosen": -0.16150932013988495, "logits/rejected": -0.07232292741537094, "logps/chosen": -2.845247983932495, "logps/rejected": -3.7361931800842285, "loss": 0.5081, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.845247983932495, "rewards/margins": 0.89094477891922, "rewards/rejected": -3.7361931800842285, "sft_loss": 3.053283214569092, "step": 2490 }, { "epoch": 1.3353403579193845, "grad_norm": 15.257198775978162, "learning_rate": 6.789367763543292e-07, "logits/chosen": -0.09079436212778091, "logits/rejected": -0.049105338752269745, "logps/chosen": -2.799342632293701, "logps/rejected": -3.526744842529297, "loss": 0.5961, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.799342632293701, "rewards/margins": 0.7274022102355957, "rewards/rejected": -3.526744842529297, "sft_loss": 3.0104057788848877, "step": 2495 }, { "epoch": 1.338016390700786, "grad_norm": 12.661016212031267, "learning_rate": 6.774816547081714e-07, "logits/chosen": -0.08757440000772476, "logits/rejected": 0.11619249731302261, "logps/chosen": -2.7563064098358154, "logps/rejected": -3.429248809814453, "loss": 0.56, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.7563064098358154, "rewards/margins": 0.6729423999786377, "rewards/rejected": -3.429248809814453, "sft_loss": 2.9776875972747803, "step": 2500 }, { "epoch": 1.3406924234821878, "grad_norm": 11.3369202045845, "learning_rate": 6.760248111342211e-07, "logits/chosen": -0.11019454151391983, "logits/rejected": 0.09622526913881302, "logps/chosen": -2.58648419380188, "logps/rejected": -3.4114139080047607, "loss": 0.5313, "rewards/accuracies": 0.75, "rewards/chosen": -2.58648419380188, "rewards/margins": 0.8249297142028809, "rewards/rejected": -3.4114139080047607, "sft_loss": 2.7188868522644043, "step": 2505 }, { "epoch": 1.3433684562635893, "grad_norm": 11.930669828343557, "learning_rate": 6.745662597667813e-07, "logits/chosen": -0.18405844271183014, "logits/rejected": -0.00788338202983141, "logps/chosen": -2.5469136238098145, "logps/rejected": -3.428063154220581, "loss": 0.504, "rewards/accuracies": 0.75, "rewards/chosen": -2.5469136238098145, "rewards/margins": 0.88114994764328, "rewards/rejected": -3.428063154220581, "sft_loss": 2.7372565269470215, "step": 2510 }, { "epoch": 1.3460444890449907, "grad_norm": 13.572330915296204, "learning_rate": 6.731060147567236e-07, "logits/chosen": -0.07396905869245529, "logits/rejected": 0.04314111918210983, "logps/chosen": -2.5699820518493652, "logps/rejected": -3.4824867248535156, "loss": 0.5057, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5699820518493652, "rewards/margins": 0.9125045537948608, "rewards/rejected": -3.4824867248535156, "sft_loss": 2.786505699157715, "step": 2515 }, { "epoch": 1.3487205218263925, "grad_norm": 13.632209758213088, "learning_rate": 6.716440902713515e-07, "logits/chosen": -0.1771983802318573, "logits/rejected": -0.0806611105799675, "logps/chosen": -2.7183549404144287, "logps/rejected": -3.488394260406494, "loss": 0.5214, "rewards/accuracies": 0.75, "rewards/chosen": -2.7183549404144287, "rewards/margins": 0.7700392007827759, "rewards/rejected": -3.488394260406494, "sft_loss": 2.750516414642334, "step": 2520 }, { "epoch": 1.351396554607794, "grad_norm": 16.99291377369377, "learning_rate": 6.701805004942627e-07, "logits/chosen": -0.16392597556114197, "logits/rejected": -0.068141408264637, "logps/chosen": -2.8260693550109863, "logps/rejected": -3.63167142868042, "loss": 0.5541, "rewards/accuracies": 0.71875, "rewards/chosen": -2.8260693550109863, "rewards/margins": 0.8056022524833679, "rewards/rejected": -3.63167142868042, "sft_loss": 3.0697553157806396, "step": 2525 }, { "epoch": 1.3540725873891954, "grad_norm": 17.881106078580785, "learning_rate": 6.687152596252119e-07, "logits/chosen": -0.13811075687408447, "logits/rejected": -0.05306004732847214, "logps/chosen": -2.8596765995025635, "logps/rejected": -3.5510857105255127, "loss": 0.6078, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.8596765995025635, "rewards/margins": 0.6914095878601074, "rewards/rejected": -3.5510857105255127, "sft_loss": 3.048046112060547, "step": 2530 }, { "epoch": 1.3567486201705972, "grad_norm": 14.497796493869934, "learning_rate": 6.672483818799722e-07, "logits/chosen": -0.22085972130298615, "logits/rejected": -0.03556264936923981, "logps/chosen": -2.8032827377319336, "logps/rejected": -3.651709794998169, "loss": 0.5368, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.8032827377319336, "rewards/margins": 0.848427414894104, "rewards/rejected": -3.651709794998169, "sft_loss": 2.9784340858459473, "step": 2535 }, { "epoch": 1.3594246529519987, "grad_norm": 16.23050143054709, "learning_rate": 6.657798814901978e-07, "logits/chosen": -0.18565736711025238, "logits/rejected": 0.02686428092420101, "logps/chosen": -2.8961615562438965, "logps/rejected": -3.6319243907928467, "loss": 0.5747, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8961615562438965, "rewards/margins": 0.7357627749443054, "rewards/rejected": -3.6319243907928467, "sft_loss": 3.0940470695495605, "step": 2540 }, { "epoch": 1.3621006857334002, "grad_norm": 14.795515293991508, "learning_rate": 6.643097727032863e-07, "logits/chosen": -0.20490925014019012, "logits/rejected": 0.03412886708974838, "logps/chosen": -2.802849292755127, "logps/rejected": -3.7249903678894043, "loss": 0.5106, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.802849292755127, "rewards/margins": 0.9221410751342773, "rewards/rejected": -3.7249903678894043, "sft_loss": 2.9522287845611572, "step": 2545 }, { "epoch": 1.3647767185148019, "grad_norm": 13.311934939525496, "learning_rate": 6.628380697822392e-07, "logits/chosen": -0.16037167608737946, "logits/rejected": 0.039409227669239044, "logps/chosen": -2.8980062007904053, "logps/rejected": -3.645191192626953, "loss": 0.5538, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.8980062007904053, "rewards/margins": 0.7471850514411926, "rewards/rejected": -3.645191192626953, "sft_loss": 3.0559191703796387, "step": 2550 }, { "epoch": 1.3674527512962034, "grad_norm": 19.606325206974198, "learning_rate": 6.61364787005525e-07, "logits/chosen": -0.12282273918390274, "logits/rejected": 0.030886346474289894, "logps/chosen": -2.7401130199432373, "logps/rejected": -3.7287514209747314, "loss": 0.5452, "rewards/accuracies": 0.75, "rewards/chosen": -2.7401130199432373, "rewards/margins": 0.988638699054718, "rewards/rejected": -3.7287514209747314, "sft_loss": 2.9616169929504395, "step": 2555 }, { "epoch": 1.3701287840776049, "grad_norm": 18.41661304276396, "learning_rate": 6.598899386669395e-07, "logits/chosen": -0.11560845375061035, "logits/rejected": 0.032873429358005524, "logps/chosen": -2.8558077812194824, "logps/rejected": -3.606076717376709, "loss": 0.5747, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.8558077812194824, "rewards/margins": 0.7502694129943848, "rewards/rejected": -3.606076717376709, "sft_loss": 2.979921817779541, "step": 2560 }, { "epoch": 1.3728048168590066, "grad_norm": 20.287732924933724, "learning_rate": 6.584135390754679e-07, "logits/chosen": -0.14809802174568176, "logits/rejected": 0.026063639670610428, "logps/chosen": -2.7513139247894287, "logps/rejected": -3.6620700359344482, "loss": 0.5366, "rewards/accuracies": 0.75, "rewards/chosen": -2.7513139247894287, "rewards/margins": 0.9107562899589539, "rewards/rejected": -3.6620700359344482, "sft_loss": 2.9361300468444824, "step": 2565 }, { "epoch": 1.375480849640408, "grad_norm": 9.657853877904172, "learning_rate": 6.569356025551454e-07, "logits/chosen": -0.10108263790607452, "logits/rejected": -0.0015401586424559355, "logps/chosen": -2.671628952026367, "logps/rejected": -3.570929765701294, "loss": 0.5249, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.671628952026367, "rewards/margins": 0.8993002772331238, "rewards/rejected": -3.570929765701294, "sft_loss": 2.78222918510437, "step": 2570 }, { "epoch": 1.3781568824218096, "grad_norm": 12.249073564214969, "learning_rate": 6.554561434449186e-07, "logits/chosen": -0.2132205218076706, "logits/rejected": -0.01806102879345417, "logps/chosen": -2.6709792613983154, "logps/rejected": -3.5337345600128174, "loss": 0.5402, "rewards/accuracies": 0.75, "rewards/chosen": -2.6709792613983154, "rewards/margins": 0.8627556562423706, "rewards/rejected": -3.5337345600128174, "sft_loss": 2.8137805461883545, "step": 2575 }, { "epoch": 1.3808329152032113, "grad_norm": 18.244963709261334, "learning_rate": 6.539751760985063e-07, "logits/chosen": -0.15387776494026184, "logits/rejected": -0.03826805576682091, "logps/chosen": -2.8103771209716797, "logps/rejected": -3.4488766193389893, "loss": 0.5825, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.8103771209716797, "rewards/margins": 0.6384997963905334, "rewards/rejected": -3.4488766193389893, "sft_loss": 2.9886860847473145, "step": 2580 }, { "epoch": 1.3835089479846128, "grad_norm": 11.372926174115369, "learning_rate": 6.524927148842602e-07, "logits/chosen": -0.051814544945955276, "logits/rejected": 0.14694175124168396, "logps/chosen": -2.651075839996338, "logps/rejected": -3.5731492042541504, "loss": 0.505, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.651075839996338, "rewards/margins": 0.9220731854438782, "rewards/rejected": -3.5731492042541504, "sft_loss": 2.774786949157715, "step": 2585 }, { "epoch": 1.3861849807660143, "grad_norm": 16.02538304897232, "learning_rate": 6.510087741850254e-07, "logits/chosen": -0.15899457037448883, "logits/rejected": -0.0023765608202666044, "logps/chosen": -2.590641498565674, "logps/rejected": -3.3977553844451904, "loss": 0.5546, "rewards/accuracies": 0.71875, "rewards/chosen": -2.590641498565674, "rewards/margins": 0.8071142435073853, "rewards/rejected": -3.3977553844451904, "sft_loss": 2.838257074356079, "step": 2590 }, { "epoch": 1.388861013547416, "grad_norm": 13.562954231672789, "learning_rate": 6.495233683980012e-07, "logits/chosen": -0.13240444660186768, "logits/rejected": -0.061270572245121, "logps/chosen": -2.760969638824463, "logps/rejected": -3.5130043029785156, "loss": 0.5545, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.760969638824463, "rewards/margins": 0.7520343661308289, "rewards/rejected": -3.5130043029785156, "sft_loss": 2.855792999267578, "step": 2595 }, { "epoch": 1.3915370463288175, "grad_norm": 15.00286944204792, "learning_rate": 6.480365119346011e-07, "logits/chosen": -0.03778408467769623, "logits/rejected": 0.13565590977668762, "logps/chosen": -2.7521817684173584, "logps/rejected": -3.5368714332580566, "loss": 0.5358, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7521817684173584, "rewards/margins": 0.7846895456314087, "rewards/rejected": -3.5368714332580566, "sft_loss": 2.9168272018432617, "step": 2600 }, { "epoch": 1.394213079110219, "grad_norm": 13.34637580930508, "learning_rate": 6.465482192203129e-07, "logits/chosen": -0.018246622756123543, "logits/rejected": 0.07032772898674011, "logps/chosen": -2.7906553745269775, "logps/rejected": -3.6011791229248047, "loss": 0.5307, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.7906553745269775, "rewards/margins": 0.8105236887931824, "rewards/rejected": -3.6011791229248047, "sft_loss": 3.0532116889953613, "step": 2605 }, { "epoch": 1.3968891118916207, "grad_norm": 24.566874069177967, "learning_rate": 6.45058504694559e-07, "logits/chosen": -0.021194588392972946, "logits/rejected": 0.04253927990794182, "logps/chosen": -2.8475394248962402, "logps/rejected": -3.7430832386016846, "loss": 0.5366, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.8475394248962402, "rewards/margins": 0.8955437541007996, "rewards/rejected": -3.7430832386016846, "sft_loss": 3.0145957469940186, "step": 2610 }, { "epoch": 1.3995651446730222, "grad_norm": 22.213587799390993, "learning_rate": 6.435673828105564e-07, "logits/chosen": -0.13955837488174438, "logits/rejected": 0.028280243277549744, "logps/chosen": -2.7947468757629395, "logps/rejected": -3.759547710418701, "loss": 0.5256, "rewards/accuracies": 0.75, "rewards/chosen": -2.7947468757629395, "rewards/margins": 0.9648011922836304, "rewards/rejected": -3.759547710418701, "sft_loss": 3.0343430042266846, "step": 2615 }, { "epoch": 1.402241177454424, "grad_norm": 16.48086056628069, "learning_rate": 6.420748680351763e-07, "logits/chosen": -0.05696401000022888, "logits/rejected": -0.09101828187704086, "logps/chosen": -2.867499828338623, "logps/rejected": -3.5520408153533936, "loss": 0.5869, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.867499828338623, "rewards/margins": 0.6845409274101257, "rewards/rejected": -3.5520408153533936, "sft_loss": 3.1165659427642822, "step": 2620 }, { "epoch": 1.4049172102358254, "grad_norm": 24.259711379457958, "learning_rate": 6.405809748488032e-07, "logits/chosen": -0.14697353541851044, "logits/rejected": 0.04247549921274185, "logps/chosen": -2.9035472869873047, "logps/rejected": -3.8812873363494873, "loss": 0.5561, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.9035472869873047, "rewards/margins": 0.9777399301528931, "rewards/rejected": -3.8812873363494873, "sft_loss": 3.024491310119629, "step": 2625 }, { "epoch": 1.4075932430172269, "grad_norm": 13.22720645000827, "learning_rate": 6.390857177451956e-07, "logits/chosen": -0.24623966217041016, "logits/rejected": -0.02795444056391716, "logps/chosen": -2.919384241104126, "logps/rejected": -3.8018722534179688, "loss": 0.5353, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.919384241104126, "rewards/margins": 0.882487952709198, "rewards/rejected": -3.8018722534179688, "sft_loss": 3.0856080055236816, "step": 2630 }, { "epoch": 1.4102692757986286, "grad_norm": 16.391043845520393, "learning_rate": 6.375891112313445e-07, "logits/chosen": -0.2169569432735443, "logits/rejected": -0.0841716006398201, "logps/chosen": -2.943756580352783, "logps/rejected": -3.8317313194274902, "loss": 0.5336, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.943756580352783, "rewards/margins": 0.887974739074707, "rewards/rejected": -3.8317313194274902, "sft_loss": 3.124804973602295, "step": 2635 }, { "epoch": 1.41294530858003, "grad_norm": 16.571457287546348, "learning_rate": 6.360911698273326e-07, "logits/chosen": -0.14374461770057678, "logits/rejected": 0.007679411675781012, "logps/chosen": -3.0234484672546387, "logps/rejected": -3.7668967247009277, "loss": 0.575, "rewards/accuracies": 0.75, "rewards/chosen": -3.0234484672546387, "rewards/margins": 0.7434485554695129, "rewards/rejected": -3.7668967247009277, "sft_loss": 3.1646761894226074, "step": 2640 }, { "epoch": 1.4156213413614318, "grad_norm": 15.362718817664941, "learning_rate": 6.345919080661944e-07, "logits/chosen": -0.14833353459835052, "logits/rejected": -0.04470009729266167, "logps/chosen": -2.707223892211914, "logps/rejected": -3.673488140106201, "loss": 0.4991, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.707223892211914, "rewards/margins": 0.9662643671035767, "rewards/rejected": -3.673488140106201, "sft_loss": 2.848567485809326, "step": 2645 }, { "epoch": 1.4182973741428333, "grad_norm": 13.379887652672586, "learning_rate": 6.330913404937737e-07, "logits/chosen": -0.21039438247680664, "logits/rejected": -0.033975306898355484, "logps/chosen": -2.8401217460632324, "logps/rejected": -3.8723747730255127, "loss": 0.5236, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.8401217460632324, "rewards/margins": 1.032252550125122, "rewards/rejected": -3.8723747730255127, "sft_loss": 2.9957895278930664, "step": 2650 }, { "epoch": 1.4209734069242348, "grad_norm": 15.738869793942214, "learning_rate": 6.315894816685838e-07, "logits/chosen": -0.15092626214027405, "logits/rejected": 0.03545082360506058, "logps/chosen": -2.785090208053589, "logps/rejected": -3.711087465286255, "loss": 0.4916, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.785090208053589, "rewards/margins": 0.9259971380233765, "rewards/rejected": -3.711087465286255, "sft_loss": 3.063774585723877, "step": 2655 }, { "epoch": 1.4236494397056365, "grad_norm": 14.72542003850956, "learning_rate": 6.300863461616657e-07, "logits/chosen": -0.11617982387542725, "logits/rejected": -0.014439836144447327, "logps/chosen": -2.838334560394287, "logps/rejected": -3.5807464122772217, "loss": 0.6031, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.838334560394287, "rewards/margins": 0.7424119710922241, "rewards/rejected": -3.5807464122772217, "sft_loss": 3.023712635040283, "step": 2660 }, { "epoch": 1.426325472487038, "grad_norm": 12.299955789891118, "learning_rate": 6.285819485564465e-07, "logits/chosen": -0.2647419273853302, "logits/rejected": -0.07578029483556747, "logps/chosen": -2.828758716583252, "logps/rejected": -3.6691207885742188, "loss": 0.517, "rewards/accuracies": 0.78125, "rewards/chosen": -2.828758716583252, "rewards/margins": 0.8403621912002563, "rewards/rejected": -3.6691207885742188, "sft_loss": 3.0283281803131104, "step": 2665 }, { "epoch": 1.4290015052684395, "grad_norm": 14.38249715299272, "learning_rate": 6.270763034485986e-07, "logits/chosen": -0.07366399466991425, "logits/rejected": 0.05366234853863716, "logps/chosen": -2.936784267425537, "logps/rejected": -3.766112804412842, "loss": 0.5265, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.936784267425537, "rewards/margins": 0.8293284177780151, "rewards/rejected": -3.766112804412842, "sft_loss": 3.0304007530212402, "step": 2670 }, { "epoch": 1.4316775380498412, "grad_norm": 21.464765721123907, "learning_rate": 6.255694254458972e-07, "logits/chosen": -0.13733455538749695, "logits/rejected": 0.061878375709056854, "logps/chosen": -2.890540838241577, "logps/rejected": -3.669931411743164, "loss": 0.5913, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.890540838241577, "rewards/margins": 0.7793907523155212, "rewards/rejected": -3.669931411743164, "sft_loss": 2.913121461868286, "step": 2675 }, { "epoch": 1.4343535708312427, "grad_norm": 22.08383332497907, "learning_rate": 6.240613291680795e-07, "logits/chosen": -0.20060577988624573, "logits/rejected": 0.010845445096492767, "logps/chosen": -2.7031121253967285, "logps/rejected": -3.4497039318084717, "loss": 0.5899, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7031121253967285, "rewards/margins": 0.7465916872024536, "rewards/rejected": -3.4497039318084717, "sft_loss": 2.8224806785583496, "step": 2680 }, { "epoch": 1.4370296036126442, "grad_norm": 11.542264244056486, "learning_rate": 6.225520292467021e-07, "logits/chosen": -0.2530103027820587, "logits/rejected": 0.029706627130508423, "logps/chosen": -2.5867018699645996, "logps/rejected": -3.494511842727661, "loss": 0.4921, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.5867018699645996, "rewards/margins": 0.9078100323677063, "rewards/rejected": -3.494511842727661, "sft_loss": 2.7249536514282227, "step": 2685 }, { "epoch": 1.439705636394046, "grad_norm": 24.402269527036708, "learning_rate": 6.210415403249993e-07, "logits/chosen": -0.34314805269241333, "logits/rejected": -0.03575535863637924, "logps/chosen": -2.68799090385437, "logps/rejected": -3.5274910926818848, "loss": 0.5629, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.68799090385437, "rewards/margins": 0.8395000696182251, "rewards/rejected": -3.5274910926818848, "sft_loss": 2.7762532234191895, "step": 2690 }, { "epoch": 1.4423816691754474, "grad_norm": 16.204604712741396, "learning_rate": 6.195298770577415e-07, "logits/chosen": -0.08360803127288818, "logits/rejected": -0.07277211546897888, "logps/chosen": -2.678548812866211, "logps/rejected": -3.5122742652893066, "loss": 0.5523, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.678548812866211, "rewards/margins": 0.83372563123703, "rewards/rejected": -3.5122742652893066, "sft_loss": 2.7697131633758545, "step": 2695 }, { "epoch": 1.445057701956849, "grad_norm": 10.0804571859839, "learning_rate": 6.180170541110923e-07, "logits/chosen": -0.16857047379016876, "logits/rejected": 0.06029961258172989, "logps/chosen": -2.7331137657165527, "logps/rejected": -3.5940475463867188, "loss": 0.5378, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7331137657165527, "rewards/margins": 0.860933780670166, "rewards/rejected": -3.5940475463867188, "sft_loss": 2.9450149536132812, "step": 2700 }, { "epoch": 1.4477337347382506, "grad_norm": 12.99917532888424, "learning_rate": 6.165030861624663e-07, "logits/chosen": -0.269326388835907, "logits/rejected": 0.01531993132084608, "logps/chosen": -2.7049174308776855, "logps/rejected": -3.7752068042755127, "loss": 0.4784, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7049174308776855, "rewards/margins": 1.0702893733978271, "rewards/rejected": -3.7752068042755127, "sft_loss": 2.769993305206299, "step": 2705 }, { "epoch": 1.4504097675196521, "grad_norm": 15.836204895419154, "learning_rate": 6.149879879003876e-07, "logits/chosen": -0.11991055309772491, "logits/rejected": -0.10178259760141373, "logps/chosen": -2.7728562355041504, "logps/rejected": -3.674401044845581, "loss": 0.5164, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.7728562355041504, "rewards/margins": 0.9015445709228516, "rewards/rejected": -3.674401044845581, "sft_loss": 2.895869731903076, "step": 2710 }, { "epoch": 1.4530858003010536, "grad_norm": 11.375428045803405, "learning_rate": 6.13471774024346e-07, "logits/chosen": -0.2894328534603119, "logits/rejected": -0.14026543498039246, "logps/chosen": -2.712986469268799, "logps/rejected": -3.547203779220581, "loss": 0.5218, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.712986469268799, "rewards/margins": 0.8342171907424927, "rewards/rejected": -3.547203779220581, "sft_loss": 2.893850326538086, "step": 2715 }, { "epoch": 1.4557618330824553, "grad_norm": 12.298204703115006, "learning_rate": 6.119544592446551e-07, "logits/chosen": -0.2140885889530182, "logits/rejected": -0.06003519147634506, "logps/chosen": -2.8017475605010986, "logps/rejected": -3.537752628326416, "loss": 0.5506, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.8017475605010986, "rewards/margins": 0.7360051870346069, "rewards/rejected": -3.537752628326416, "sft_loss": 2.888538122177124, "step": 2720 }, { "epoch": 1.4584378658638568, "grad_norm": 12.601854566833609, "learning_rate": 6.104360582823096e-07, "logits/chosen": -0.17153920233249664, "logits/rejected": -0.04675662890076637, "logps/chosen": -2.7769007682800293, "logps/rejected": -3.5771872997283936, "loss": 0.5458, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.7769007682800293, "rewards/margins": 0.8002867698669434, "rewards/rejected": -3.5771872997283936, "sft_loss": 2.9473347663879395, "step": 2725 }, { "epoch": 1.4611138986452583, "grad_norm": 18.364600743337366, "learning_rate": 6.089165858688423e-07, "logits/chosen": -0.18823352456092834, "logits/rejected": 0.02625465765595436, "logps/chosen": -2.734922170639038, "logps/rejected": -3.678147077560425, "loss": 0.5376, "rewards/accuracies": 0.71875, "rewards/chosen": -2.734922170639038, "rewards/margins": 0.9432247877120972, "rewards/rejected": -3.678147077560425, "sft_loss": 2.9276623725891113, "step": 2730 }, { "epoch": 1.46378993142666, "grad_norm": 11.07782177923181, "learning_rate": 6.073960567461811e-07, "logits/chosen": -0.20135828852653503, "logits/rejected": 0.026552096009254456, "logps/chosen": -2.5619168281555176, "logps/rejected": -3.5634264945983887, "loss": 0.4779, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.5619168281555176, "rewards/margins": 1.0015099048614502, "rewards/rejected": -3.5634264945983887, "sft_loss": 2.7775845527648926, "step": 2735 }, { "epoch": 1.4664659642080615, "grad_norm": 13.098442571344702, "learning_rate": 6.058744856665065e-07, "logits/chosen": -0.2057342827320099, "logits/rejected": -0.06957806646823883, "logps/chosen": -2.756690502166748, "logps/rejected": -3.7283108234405518, "loss": 0.5006, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.756690502166748, "rewards/margins": 0.9716199040412903, "rewards/rejected": -3.7283108234405518, "sft_loss": 2.9453227519989014, "step": 2740 }, { "epoch": 1.469141996989463, "grad_norm": 14.17214298836207, "learning_rate": 6.043518873921074e-07, "logits/chosen": -0.2255820780992508, "logits/rejected": -0.038780681788921356, "logps/chosen": -2.7394678592681885, "logps/rejected": -3.5877277851104736, "loss": 0.5144, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7394678592681885, "rewards/margins": 0.8482600450515747, "rewards/rejected": -3.5877277851104736, "sft_loss": 2.8428146839141846, "step": 2745 }, { "epoch": 1.4718180297708647, "grad_norm": 16.81159688729916, "learning_rate": 6.028282766952393e-07, "logits/chosen": -0.15135575830936432, "logits/rejected": -0.013321302831172943, "logps/chosen": -2.8603711128234863, "logps/rejected": -3.800457000732422, "loss": 0.5129, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8603711128234863, "rewards/margins": 0.9400860071182251, "rewards/rejected": -3.800457000732422, "sft_loss": 2.9740004539489746, "step": 2750 }, { "epoch": 1.4744940625522662, "grad_norm": 22.76507570539442, "learning_rate": 6.013036683579798e-07, "logits/chosen": -0.10337056964635849, "logits/rejected": 0.040584810078144073, "logps/chosen": -2.8502213954925537, "logps/rejected": -3.8350982666015625, "loss": 0.5026, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.8502213954925537, "rewards/margins": 0.9848769307136536, "rewards/rejected": -3.8350982666015625, "sft_loss": 3.0348498821258545, "step": 2755 }, { "epoch": 1.4771700953336677, "grad_norm": 14.477476737366594, "learning_rate": 5.997780771720854e-07, "logits/chosen": -0.2488655149936676, "logits/rejected": -0.011253480799496174, "logps/chosen": -2.973107099533081, "logps/rejected": -4.006959438323975, "loss": 0.4991, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.973107099533081, "rewards/margins": 1.033852458000183, "rewards/rejected": -4.006959438323975, "sft_loss": 3.1735317707061768, "step": 2760 }, { "epoch": 1.4798461281150694, "grad_norm": 20.500259800881533, "learning_rate": 5.982515179388486e-07, "logits/chosen": -0.14365240931510925, "logits/rejected": 0.004178575240075588, "logps/chosen": -2.9276845455169678, "logps/rejected": -3.8023598194122314, "loss": 0.5415, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.9276845455169678, "rewards/margins": 0.8746751546859741, "rewards/rejected": -3.8023598194122314, "sft_loss": 3.1712915897369385, "step": 2765 }, { "epoch": 1.482522160896471, "grad_norm": 14.336439014965194, "learning_rate": 5.967240054689541e-07, "logits/chosen": -0.2544417083263397, "logits/rejected": -0.1564224660396576, "logps/chosen": -2.945641040802002, "logps/rejected": -3.751100540161133, "loss": 0.5694, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.945641040802002, "rewards/margins": 0.8054594993591309, "rewards/rejected": -3.751100540161133, "sft_loss": 3.140873670578003, "step": 2770 }, { "epoch": 1.4851981936778724, "grad_norm": 16.712349793859868, "learning_rate": 5.951955545823342e-07, "logits/chosen": -0.14263644814491272, "logits/rejected": -0.03438568860292435, "logps/chosen": -3.0389950275421143, "logps/rejected": -4.015830039978027, "loss": 0.5333, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.0389950275421143, "rewards/margins": 0.9768354296684265, "rewards/rejected": -4.015830039978027, "sft_loss": 3.184089422225952, "step": 2775 }, { "epoch": 1.4878742264592741, "grad_norm": 13.43688652506777, "learning_rate": 5.936661801080263e-07, "logits/chosen": -0.14340347051620483, "logits/rejected": -0.0075989714823663235, "logps/chosen": -3.104412794113159, "logps/rejected": -3.8261265754699707, "loss": 0.6125, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.104412794113159, "rewards/margins": 0.7217133641242981, "rewards/rejected": -3.8261265754699707, "sft_loss": 3.140293598175049, "step": 2780 }, { "epoch": 1.4905502592406756, "grad_norm": 11.308275521719805, "learning_rate": 5.92135896884028e-07, "logits/chosen": -0.20025837421417236, "logits/rejected": -0.013455281034111977, "logps/chosen": -3.028421401977539, "logps/rejected": -3.971764087677002, "loss": 0.5263, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.028421401977539, "rewards/margins": 0.9433425664901733, "rewards/rejected": -3.971764087677002, "sft_loss": 3.062842845916748, "step": 2785 }, { "epoch": 1.4932262920220774, "grad_norm": 18.922995983184755, "learning_rate": 5.906047197571541e-07, "logits/chosen": -0.12423284351825714, "logits/rejected": -0.1412658989429474, "logps/chosen": -2.9017343521118164, "logps/rejected": -3.7224926948547363, "loss": 0.5747, "rewards/accuracies": 0.71875, "rewards/chosen": -2.9017343521118164, "rewards/margins": 0.820758044719696, "rewards/rejected": -3.7224926948547363, "sft_loss": 3.1434712409973145, "step": 2790 }, { "epoch": 1.4959023248034788, "grad_norm": 12.450582467267319, "learning_rate": 5.890726635828919e-07, "logits/chosen": -0.041972313076257706, "logits/rejected": -0.03934203460812569, "logps/chosen": -2.6451210975646973, "logps/rejected": -3.56543231010437, "loss": 0.5393, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.6451210975646973, "rewards/margins": 0.9203113317489624, "rewards/rejected": -3.56543231010437, "sft_loss": 2.757323741912842, "step": 2795 }, { "epoch": 1.4985783575848803, "grad_norm": 18.413613848092343, "learning_rate": 5.875397432252569e-07, "logits/chosen": -0.19127897918224335, "logits/rejected": -0.10696186125278473, "logps/chosen": -2.6481118202209473, "logps/rejected": -3.434417247772217, "loss": 0.5512, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.6481118202209473, "rewards/margins": 0.7863054275512695, "rewards/rejected": -3.434417247772217, "sft_loss": 2.8000905513763428, "step": 2800 }, { "epoch": 1.4985783575848803, "eval_logits/chosen": 0.1750119924545288, "eval_logits/rejected": 0.28923624753952026, "eval_logps/chosen": -2.7513301372528076, "eval_logps/rejected": -3.5751004219055176, "eval_loss": 0.5563209652900696, "eval_rewards/accuracies": 0.7203264236450195, "eval_rewards/chosen": -2.7513301372528076, "eval_rewards/margins": 0.8237702250480652, "eval_rewards/rejected": -3.5751004219055176, "eval_runtime": 50.2195, "eval_samples_per_second": 26.782, "eval_sft_loss": 2.9053328037261963, "eval_steps_per_second": 6.711, "step": 2800 }, { "epoch": 1.5012543903662818, "grad_norm": 10.41761647965276, "learning_rate": 5.860059735566491e-07, "logits/chosen": -0.332925945520401, "logits/rejected": -0.13850803673267365, "logps/chosen": -2.547250270843506, "logps/rejected": -3.3842644691467285, "loss": 0.5283, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.547250270843506, "rewards/margins": 0.837013840675354, "rewards/rejected": -3.3842644691467285, "sft_loss": 2.7107858657836914, "step": 2805 }, { "epoch": 1.5039304231476835, "grad_norm": 15.905805367045609, "learning_rate": 5.844713694577087e-07, "logits/chosen": -0.169723242521286, "logits/rejected": -0.06621355563402176, "logps/chosen": -2.638108968734741, "logps/rejected": -3.4682857990264893, "loss": 0.5378, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.638108968734741, "rewards/margins": 0.8301769495010376, "rewards/rejected": -3.4682857990264893, "sft_loss": 2.8118090629577637, "step": 2810 }, { "epoch": 1.5066064559290853, "grad_norm": 10.777745910778588, "learning_rate": 5.829359458171714e-07, "logits/chosen": -0.1262860745191574, "logits/rejected": 0.008149102330207825, "logps/chosen": -2.671485662460327, "logps/rejected": -3.7072227001190186, "loss": 0.4716, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.671485662460327, "rewards/margins": 1.0357367992401123, "rewards/rejected": -3.7072227001190186, "sft_loss": 2.7742977142333984, "step": 2815 }, { "epoch": 1.5092824887104868, "grad_norm": 13.279294927026276, "learning_rate": 5.81399717531724e-07, "logits/chosen": -0.21378269791603088, "logits/rejected": 0.0385824516415596, "logps/chosen": -2.8625640869140625, "logps/rejected": -3.6599457263946533, "loss": 0.5676, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8625640869140625, "rewards/margins": 0.7973818182945251, "rewards/rejected": -3.6599457263946533, "sft_loss": 3.028451919555664, "step": 2820 }, { "epoch": 1.5119585214918883, "grad_norm": 13.266146592543238, "learning_rate": 5.798626995058602e-07, "logits/chosen": -0.2696044445037842, "logits/rejected": -0.037425920367240906, "logps/chosen": -2.9013805389404297, "logps/rejected": -3.7950050830841064, "loss": 0.5276, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.9013805389404297, "rewards/margins": 0.893624484539032, "rewards/rejected": -3.7950050830841064, "sft_loss": 3.0186824798583984, "step": 2825 }, { "epoch": 1.51463455427329, "grad_norm": 15.104956220264441, "learning_rate": 5.783249066517354e-07, "logits/chosen": -0.17140641808509827, "logits/rejected": -0.01820288598537445, "logps/chosen": -2.724626064300537, "logps/rejected": -3.6410300731658936, "loss": 0.5089, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.724626064300537, "rewards/margins": 0.9164039492607117, "rewards/rejected": -3.6410300731658936, "sft_loss": 2.8896212577819824, "step": 2830 }, { "epoch": 1.5173105870546915, "grad_norm": 33.067123229566405, "learning_rate": 5.767863538890228e-07, "logits/chosen": -0.19309866428375244, "logits/rejected": -0.01465575397014618, "logps/chosen": -2.8873116970062256, "logps/rejected": -3.890298843383789, "loss": 0.5101, "rewards/accuracies": 0.75, "rewards/chosen": -2.8873116970062256, "rewards/margins": 1.0029871463775635, "rewards/rejected": -3.890298843383789, "sft_loss": 3.0265347957611084, "step": 2835 }, { "epoch": 1.519986619836093, "grad_norm": 16.68826289031432, "learning_rate": 5.75247056144768e-07, "logits/chosen": -0.18705160915851593, "logits/rejected": -0.09919128566980362, "logps/chosen": -2.9234955310821533, "logps/rejected": -3.677717685699463, "loss": 0.5891, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.9234955310821533, "rewards/margins": 0.7542222738265991, "rewards/rejected": -3.677717685699463, "sft_loss": 3.084500789642334, "step": 2840 }, { "epoch": 1.5226626526174947, "grad_norm": 18.399868318413944, "learning_rate": 5.737070283532444e-07, "logits/chosen": -0.14178113639354706, "logits/rejected": -0.03312011808156967, "logps/chosen": -2.937467336654663, "logps/rejected": -3.7148735523223877, "loss": 0.6212, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.937467336654663, "rewards/margins": 0.7774060964584351, "rewards/rejected": -3.7148735523223877, "sft_loss": 2.9519081115722656, "step": 2845 }, { "epoch": 1.5253386853988962, "grad_norm": 12.134442423216148, "learning_rate": 5.721662854558084e-07, "logits/chosen": -0.19238229095935822, "logits/rejected": -0.06883934885263443, "logps/chosen": -2.799609422683716, "logps/rejected": -3.813912868499756, "loss": 0.4851, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.799609422683716, "rewards/margins": 1.0143041610717773, "rewards/rejected": -3.813912868499756, "sft_loss": 2.959176540374756, "step": 2850 }, { "epoch": 1.5280147181802977, "grad_norm": 12.525962827150428, "learning_rate": 5.706248424007545e-07, "logits/chosen": -0.22883224487304688, "logits/rejected": -0.01344398595392704, "logps/chosen": -2.968219041824341, "logps/rejected": -3.826395034790039, "loss": 0.5412, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.968219041824341, "rewards/margins": 0.8581761121749878, "rewards/rejected": -3.826395034790039, "sft_loss": 3.1020607948303223, "step": 2855 }, { "epoch": 1.5306907509616994, "grad_norm": 14.910898523551785, "learning_rate": 5.690827141431699e-07, "logits/chosen": -0.2574513554573059, "logits/rejected": -0.019537249580025673, "logps/chosen": -2.833799123764038, "logps/rejected": -3.614091157913208, "loss": 0.5313, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.833799123764038, "rewards/margins": 0.7802920341491699, "rewards/rejected": -3.614091157913208, "sft_loss": 2.9435155391693115, "step": 2860 }, { "epoch": 1.5333667837431009, "grad_norm": 19.252025215234966, "learning_rate": 5.675399156447897e-07, "logits/chosen": -0.2882543206214905, "logits/rejected": -0.1258959323167801, "logps/chosen": -2.9176371097564697, "logps/rejected": -3.582078456878662, "loss": 0.5927, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.9176371097564697, "rewards/margins": 0.6644415259361267, "rewards/rejected": -3.582078456878662, "sft_loss": 3.088707447052002, "step": 2865 }, { "epoch": 1.5360428165245024, "grad_norm": 14.712122093475505, "learning_rate": 5.659964618738515e-07, "logits/chosen": -0.20669814944267273, "logits/rejected": -0.06127766892313957, "logps/chosen": -2.803532123565674, "logps/rejected": -3.5541293621063232, "loss": 0.5624, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.803532123565674, "rewards/margins": 0.7505972981452942, "rewards/rejected": -3.5541293621063232, "sft_loss": 2.925236701965332, "step": 2870 }, { "epoch": 1.538718849305904, "grad_norm": 17.12798243385786, "learning_rate": 5.644523678049509e-07, "logits/chosen": -0.2009466141462326, "logits/rejected": -0.05111227184534073, "logps/chosen": -2.793321132659912, "logps/rejected": -3.577846050262451, "loss": 0.5411, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.793321132659912, "rewards/margins": 0.7845247983932495, "rewards/rejected": -3.577846050262451, "sft_loss": 2.881221055984497, "step": 2875 }, { "epoch": 1.5413948820873056, "grad_norm": 15.641371082674475, "learning_rate": 5.629076484188952e-07, "logits/chosen": -0.05735234171152115, "logits/rejected": 0.07924290746450424, "logps/chosen": -2.6026575565338135, "logps/rejected": -3.470081329345703, "loss": 0.5192, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.6026575565338135, "rewards/margins": 0.8674238324165344, "rewards/rejected": -3.470081329345703, "sft_loss": 2.746920108795166, "step": 2880 }, { "epoch": 1.544070914868707, "grad_norm": 13.5763732852673, "learning_rate": 5.613623187025587e-07, "logits/chosen": -0.17112448811531067, "logits/rejected": -0.008452496491372585, "logps/chosen": -2.730259418487549, "logps/rejected": -3.6526596546173096, "loss": 0.5129, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.730259418487549, "rewards/margins": 0.9224007725715637, "rewards/rejected": -3.6526596546173096, "sft_loss": 2.858396291732788, "step": 2885 }, { "epoch": 1.5467469476501088, "grad_norm": 10.988273230358, "learning_rate": 5.598163936487369e-07, "logits/chosen": -0.2011490762233734, "logits/rejected": 0.03283718600869179, "logps/chosen": -2.794330358505249, "logps/rejected": -3.887986660003662, "loss": 0.4753, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.794330358505249, "rewards/margins": 1.0936561822891235, "rewards/rejected": -3.887986660003662, "sft_loss": 2.8787245750427246, "step": 2890 }, { "epoch": 1.5494229804315103, "grad_norm": 15.782631448647228, "learning_rate": 5.582698882560017e-07, "logits/chosen": -0.17405319213867188, "logits/rejected": 0.021756382659077644, "logps/chosen": -2.754683017730713, "logps/rejected": -3.586590528488159, "loss": 0.5615, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.754683017730713, "rewards/margins": 0.8319074511528015, "rewards/rejected": -3.586590528488159, "sft_loss": 2.8560948371887207, "step": 2895 }, { "epoch": 1.5520990132129118, "grad_norm": 11.187595581042258, "learning_rate": 5.567228175285549e-07, "logits/chosen": -0.11427092552185059, "logits/rejected": 0.023462316021323204, "logps/chosen": -2.862224578857422, "logps/rejected": -3.868736743927002, "loss": 0.4852, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.862224578857422, "rewards/margins": 1.006511926651001, "rewards/rejected": -3.868736743927002, "sft_loss": 2.9960880279541016, "step": 2900 }, { "epoch": 1.5547750459943135, "grad_norm": 15.811635282671729, "learning_rate": 5.551751964760838e-07, "logits/chosen": -0.0515236034989357, "logits/rejected": -0.028668904677033424, "logps/chosen": -2.864414930343628, "logps/rejected": -3.809762954711914, "loss": 0.5122, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.864414930343628, "rewards/margins": 0.9453479647636414, "rewards/rejected": -3.809762954711914, "sft_loss": 3.0091872215270996, "step": 2905 }, { "epoch": 1.557451078775715, "grad_norm": 19.555853889432004, "learning_rate": 5.536270401136145e-07, "logits/chosen": -0.17987671494483948, "logits/rejected": -0.03466006740927696, "logps/chosen": -2.9958698749542236, "logps/rejected": -3.8351073265075684, "loss": 0.5502, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.9958698749542236, "rewards/margins": 0.8392373323440552, "rewards/rejected": -3.8351073265075684, "sft_loss": 3.2206878662109375, "step": 2910 }, { "epoch": 1.5601271115571165, "grad_norm": 19.875087782818508, "learning_rate": 5.520783634613667e-07, "logits/chosen": -0.11352671682834625, "logits/rejected": 0.12970559298992157, "logps/chosen": -2.9497323036193848, "logps/rejected": -3.9853293895721436, "loss": 0.5179, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.9497323036193848, "rewards/margins": 1.0355972051620483, "rewards/rejected": -3.9853293895721436, "sft_loss": 3.243316650390625, "step": 2915 }, { "epoch": 1.5628031443385182, "grad_norm": 21.14982034068398, "learning_rate": 5.505291815446082e-07, "logits/chosen": -0.13817700743675232, "logits/rejected": 0.0019467368256300688, "logps/chosen": -3.080827236175537, "logps/rejected": -4.003448009490967, "loss": 0.5712, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.080827236175537, "rewards/margins": 0.9226205945014954, "rewards/rejected": -4.003448009490967, "sft_loss": 3.302227020263672, "step": 2920 }, { "epoch": 1.5654791771199197, "grad_norm": 16.15882869564911, "learning_rate": 5.489795093935089e-07, "logits/chosen": -0.11244082450866699, "logits/rejected": 0.02112450823187828, "logps/chosen": -2.877242088317871, "logps/rejected": -3.7602596282958984, "loss": 0.565, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.877242088317871, "rewards/margins": 0.8830181360244751, "rewards/rejected": -3.7602596282958984, "sft_loss": 3.063002347946167, "step": 2925 }, { "epoch": 1.5681552099013212, "grad_norm": 14.919029677524945, "learning_rate": 5.474293620429946e-07, "logits/chosen": -0.26579058170318604, "logits/rejected": -0.04727768152952194, "logps/chosen": -2.7573142051696777, "logps/rejected": -3.9940857887268066, "loss": 0.474, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7573142051696777, "rewards/margins": 1.2367714643478394, "rewards/rejected": -3.9940857887268066, "sft_loss": 3.0310394763946533, "step": 2930 }, { "epoch": 1.570831242682723, "grad_norm": 16.447429484587722, "learning_rate": 5.458787545326018e-07, "logits/chosen": -0.21073214709758759, "logits/rejected": -0.06848563253879547, "logps/chosen": -2.97898530960083, "logps/rejected": -3.864051103591919, "loss": 0.5328, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.97898530960083, "rewards/margins": 0.8850658535957336, "rewards/rejected": -3.864051103591919, "sft_loss": 3.101569652557373, "step": 2935 }, { "epoch": 1.5735072754641244, "grad_norm": 15.746122226760354, "learning_rate": 5.443277019063311e-07, "logits/chosen": -0.251105397939682, "logits/rejected": -0.05666235834360123, "logps/chosen": -2.954221487045288, "logps/rejected": -4.072968006134033, "loss": 0.5307, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.954221487045288, "rewards/margins": 1.118746280670166, "rewards/rejected": -4.072968006134033, "sft_loss": 3.1267192363739014, "step": 2940 }, { "epoch": 1.5761833082455259, "grad_norm": 20.737025541581826, "learning_rate": 5.427762192125023e-07, "logits/chosen": -0.18963779509067535, "logits/rejected": -0.04109461233019829, "logps/chosen": -2.91569447517395, "logps/rejected": -3.7815444469451904, "loss": 0.5441, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.91569447517395, "rewards/margins": 0.8658501505851746, "rewards/rejected": -3.7815444469451904, "sft_loss": 3.0128207206726074, "step": 2945 }, { "epoch": 1.5788593410269276, "grad_norm": 18.26540381429684, "learning_rate": 5.41224321503607e-07, "logits/chosen": -0.18460258841514587, "logits/rejected": 0.1022685170173645, "logps/chosen": -2.8344292640686035, "logps/rejected": -3.855309247970581, "loss": 0.4855, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.8344292640686035, "rewards/margins": 1.020879864692688, "rewards/rejected": -3.855309247970581, "sft_loss": 2.9823155403137207, "step": 2950 }, { "epoch": 1.5815353738083293, "grad_norm": 16.947184497240457, "learning_rate": 5.396720238361637e-07, "logits/chosen": -0.13508784770965576, "logits/rejected": 0.007212462835013866, "logps/chosen": -2.922914981842041, "logps/rejected": -3.78143310546875, "loss": 0.5484, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.922914981842041, "rewards/margins": 0.8585184216499329, "rewards/rejected": -3.78143310546875, "sft_loss": 3.1644444465637207, "step": 2955 }, { "epoch": 1.5842114065897306, "grad_norm": 10.8241391582058, "learning_rate": 5.381193412705711e-07, "logits/chosen": -0.258370041847229, "logits/rejected": -0.08109404146671295, "logps/chosen": -2.8407623767852783, "logps/rejected": -3.7270267009735107, "loss": 0.5045, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.8407623767852783, "rewards/margins": 0.8862646818161011, "rewards/rejected": -3.7270267009735107, "sft_loss": 2.9842042922973633, "step": 2960 }, { "epoch": 1.5868874393711323, "grad_norm": 12.363580477888307, "learning_rate": 5.365662888709622e-07, "logits/chosen": -0.2032860517501831, "logits/rejected": -0.04430514574050903, "logps/chosen": -2.98106050491333, "logps/rejected": -4.009333610534668, "loss": 0.5141, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.98106050491333, "rewards/margins": 1.0282728672027588, "rewards/rejected": -4.009333610534668, "sft_loss": 3.1617591381073, "step": 2965 }, { "epoch": 1.589563472152534, "grad_norm": 23.654763880923753, "learning_rate": 5.350128817050585e-07, "logits/chosen": -0.2287582904100418, "logits/rejected": -0.0005988016491755843, "logps/chosen": -3.0337624549865723, "logps/rejected": -3.9518356323242188, "loss": 0.5504, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.0337624549865723, "rewards/margins": 0.9180733561515808, "rewards/rejected": -3.9518356323242188, "sft_loss": 3.1794168949127197, "step": 2970 }, { "epoch": 1.5922395049339353, "grad_norm": 18.4826698939958, "learning_rate": 5.334591348440229e-07, "logits/chosen": -0.16902866959571838, "logits/rejected": 0.02397763356566429, "logps/chosen": -2.8789451122283936, "logps/rejected": -3.7171578407287598, "loss": 0.5406, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.8789451122283936, "rewards/margins": 0.8382126688957214, "rewards/rejected": -3.7171578407287598, "sft_loss": 3.027923107147217, "step": 2975 }, { "epoch": 1.594915537715337, "grad_norm": 12.616492772128158, "learning_rate": 5.319050633623141e-07, "logits/chosen": -0.22738048434257507, "logits/rejected": -0.019261473789811134, "logps/chosen": -2.9477336406707764, "logps/rejected": -3.7788281440734863, "loss": 0.5202, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9477336406707764, "rewards/margins": 0.8310942649841309, "rewards/rejected": -3.7788281440734863, "sft_loss": 3.102329730987549, "step": 2980 }, { "epoch": 1.5975915704967387, "grad_norm": 17.002375245551736, "learning_rate": 5.303506823375409e-07, "logits/chosen": -0.22807928919792175, "logits/rejected": 0.0325554795563221, "logps/chosen": -2.9772753715515137, "logps/rejected": -4.074067115783691, "loss": 0.5154, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.9772753715515137, "rewards/margins": 1.0967915058135986, "rewards/rejected": -4.074067115783691, "sft_loss": 3.0923879146575928, "step": 2985 }, { "epoch": 1.60026760327814, "grad_norm": 13.948824541303509, "learning_rate": 5.287960068503143e-07, "logits/chosen": -0.2504068613052368, "logits/rejected": 0.0015707932179793715, "logps/chosen": -2.912179470062256, "logps/rejected": -3.9256370067596436, "loss": 0.5103, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.912179470062256, "rewards/margins": 1.0134575366973877, "rewards/rejected": -3.9256370067596436, "sft_loss": 3.058387517929077, "step": 2990 }, { "epoch": 1.6029436360595417, "grad_norm": 17.29165259872914, "learning_rate": 5.272410519841032e-07, "logits/chosen": -0.14839215576648712, "logits/rejected": 0.014627779833972454, "logps/chosen": -3.000635862350464, "logps/rejected": -4.1716814041137695, "loss": 0.4851, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.000635862350464, "rewards/margins": 1.1710458993911743, "rewards/rejected": -4.1716814041137695, "sft_loss": 3.2312474250793457, "step": 2995 }, { "epoch": 1.6056196688409434, "grad_norm": 11.933730730663703, "learning_rate": 5.256858328250861e-07, "logits/chosen": -0.20798341929912567, "logits/rejected": 0.023721005767583847, "logps/chosen": -2.970376968383789, "logps/rejected": -3.7593460083007812, "loss": 0.5722, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.970376968383789, "rewards/margins": 0.7889685034751892, "rewards/rejected": -3.7593460083007812, "sft_loss": 3.0607645511627197, "step": 3000 }, { "epoch": 1.608295701622345, "grad_norm": 28.095890549973145, "learning_rate": 5.241303644620063e-07, "logits/chosen": -0.30875033140182495, "logits/rejected": -0.09672629833221436, "logps/chosen": -2.978696823120117, "logps/rejected": -3.6481051445007324, "loss": 0.6136, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.978696823120117, "rewards/margins": 0.6694087982177734, "rewards/rejected": -3.6481051445007324, "sft_loss": 3.108910083770752, "step": 3005 }, { "epoch": 1.6109717344037464, "grad_norm": 19.960336015122536, "learning_rate": 5.225746619860248e-07, "logits/chosen": -0.25689318776130676, "logits/rejected": -0.09589236229658127, "logps/chosen": -2.924680709838867, "logps/rejected": -3.665656328201294, "loss": 0.6023, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.924680709838867, "rewards/margins": 0.7409757375717163, "rewards/rejected": -3.665656328201294, "sft_loss": 3.052212953567505, "step": 3010 }, { "epoch": 1.6136477671851481, "grad_norm": 19.465256166030574, "learning_rate": 5.210187404905735e-07, "logits/chosen": -0.03861024230718613, "logits/rejected": 0.05268191546201706, "logps/chosen": -2.995497226715088, "logps/rejected": -3.8167786598205566, "loss": 0.5544, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.995497226715088, "rewards/margins": 0.8212817311286926, "rewards/rejected": -3.8167786598205566, "sft_loss": 3.1183435916900635, "step": 3015 }, { "epoch": 1.6163237999665496, "grad_norm": 14.179972871464367, "learning_rate": 5.194626150712098e-07, "logits/chosen": -0.22885003685951233, "logits/rejected": -0.08128555119037628, "logps/chosen": -2.917171001434326, "logps/rejected": -3.7479748725891113, "loss": 0.5307, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.917171001434326, "rewards/margins": 0.8308032751083374, "rewards/rejected": -3.7479748725891113, "sft_loss": 3.0888686180114746, "step": 3020 }, { "epoch": 1.6189998327479511, "grad_norm": 13.640509203922976, "learning_rate": 5.179063008254695e-07, "logits/chosen": -0.21364569664001465, "logits/rejected": -0.00813610665500164, "logps/chosen": -2.7993874549865723, "logps/rejected": -3.598403215408325, "loss": 0.561, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.7993874549865723, "rewards/margins": 0.799015998840332, "rewards/rejected": -3.598403215408325, "sft_loss": 2.9994029998779297, "step": 3025 }, { "epoch": 1.6216758655293528, "grad_norm": 14.4445070123789, "learning_rate": 5.163498128527199e-07, "logits/chosen": -0.15052303671836853, "logits/rejected": 0.02527775429189205, "logps/chosen": -2.982490301132202, "logps/rejected": -3.844486951828003, "loss": 0.5513, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.982490301132202, "rewards/margins": 0.8619967699050903, "rewards/rejected": -3.844486951828003, "sft_loss": 3.0802769660949707, "step": 3030 }, { "epoch": 1.6243518983107543, "grad_norm": 15.812415149593381, "learning_rate": 5.147931662540144e-07, "logits/chosen": -0.06317378580570221, "logits/rejected": 0.1144840270280838, "logps/chosen": -2.8584022521972656, "logps/rejected": -3.5950756072998047, "loss": 0.5475, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.8584022521972656, "rewards/margins": 0.7366732954978943, "rewards/rejected": -3.5950756072998047, "sft_loss": 3.009364366531372, "step": 3035 }, { "epoch": 1.6270279310921558, "grad_norm": 11.468441941469427, "learning_rate": 5.132363761319449e-07, "logits/chosen": -0.1439296305179596, "logits/rejected": -0.049841322004795074, "logps/chosen": -2.809378147125244, "logps/rejected": -3.8419647216796875, "loss": 0.5151, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.809378147125244, "rewards/margins": 1.0325868129730225, "rewards/rejected": -3.8419647216796875, "sft_loss": 2.957885265350342, "step": 3040 }, { "epoch": 1.6297039638735575, "grad_norm": 34.802718071815235, "learning_rate": 5.116794575904962e-07, "logits/chosen": -0.13249622285366058, "logits/rejected": -0.0025797567795962095, "logps/chosen": -2.711029529571533, "logps/rejected": -3.486485004425049, "loss": 0.5684, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.711029529571533, "rewards/margins": 0.7754554748535156, "rewards/rejected": -3.486485004425049, "sft_loss": 2.8514134883880615, "step": 3045 }, { "epoch": 1.632379996654959, "grad_norm": 10.892875604035309, "learning_rate": 5.101224257348987e-07, "logits/chosen": -0.15007975697517395, "logits/rejected": 0.019764890894293785, "logps/chosen": -2.878706455230713, "logps/rejected": -3.9150726795196533, "loss": 0.4774, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.878706455230713, "rewards/margins": 1.0363662242889404, "rewards/rejected": -3.9150726795196533, "sft_loss": 3.086242198944092, "step": 3050 }, { "epoch": 1.6350560294363605, "grad_norm": 14.483283525837482, "learning_rate": 5.085652956714823e-07, "logits/chosen": -0.21432361006736755, "logits/rejected": -0.02309691347181797, "logps/chosen": -2.9688711166381836, "logps/rejected": -3.8170647621154785, "loss": 0.5388, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.9688711166381836, "rewards/margins": 0.8481934666633606, "rewards/rejected": -3.8170647621154785, "sft_loss": 3.095407009124756, "step": 3055 }, { "epoch": 1.6377320622177622, "grad_norm": 13.001074946425849, "learning_rate": 5.070080825075298e-07, "logits/chosen": -0.1922428458929062, "logits/rejected": 0.04582958295941353, "logps/chosen": -2.91438364982605, "logps/rejected": -3.7429771423339844, "loss": 0.5892, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.91438364982605, "rewards/margins": 0.8285935521125793, "rewards/rejected": -3.7429771423339844, "sft_loss": 3.1438403129577637, "step": 3060 }, { "epoch": 1.6404080949991637, "grad_norm": 13.995641813546811, "learning_rate": 5.0545080135113e-07, "logits/chosen": -0.13181616365909576, "logits/rejected": -0.02712242677807808, "logps/chosen": -2.84912371635437, "logps/rejected": -3.7584915161132812, "loss": 0.5505, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.84912371635437, "rewards/margins": 0.909367561340332, "rewards/rejected": -3.7584915161132812, "sft_loss": 3.021531343460083, "step": 3065 }, { "epoch": 1.6430841277805652, "grad_norm": 18.420291032139474, "learning_rate": 5.038934673110316e-07, "logits/chosen": -0.20530517399311066, "logits/rejected": -0.06820012629032135, "logps/chosen": -2.9428317546844482, "logps/rejected": -3.816471815109253, "loss": 0.5638, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.9428317546844482, "rewards/margins": 0.8736408352851868, "rewards/rejected": -3.816471815109253, "sft_loss": 3.122080087661743, "step": 3070 }, { "epoch": 1.645760160561967, "grad_norm": 12.210492099926979, "learning_rate": 5.023360954964963e-07, "logits/chosen": -0.2356480062007904, "logits/rejected": -0.13236093521118164, "logps/chosen": -2.6807234287261963, "logps/rejected": -3.5748507976531982, "loss": 0.4923, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.6807234287261963, "rewards/margins": 0.8941277265548706, "rewards/rejected": -3.5748507976531982, "sft_loss": 2.826641082763672, "step": 3075 }, { "epoch": 1.6484361933433684, "grad_norm": 14.451314916093441, "learning_rate": 5.007787010171524e-07, "logits/chosen": -0.30715638399124146, "logits/rejected": -0.0917990654706955, "logps/chosen": -2.5948257446289062, "logps/rejected": -3.5277962684631348, "loss": 0.4782, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.5948257446289062, "rewards/margins": 0.932970404624939, "rewards/rejected": -3.5277962684631348, "sft_loss": 2.8153226375579834, "step": 3080 }, { "epoch": 1.65111222612477, "grad_norm": 16.093189988405726, "learning_rate": 4.992212989828477e-07, "logits/chosen": -0.12263667583465576, "logits/rejected": -0.08358993381261826, "logps/chosen": -2.74635648727417, "logps/rejected": -3.4704620838165283, "loss": 0.5703, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.74635648727417, "rewards/margins": 0.7241055369377136, "rewards/rejected": -3.4704620838165283, "sft_loss": 2.9041481018066406, "step": 3085 }, { "epoch": 1.6537882589061716, "grad_norm": 14.303938596204755, "learning_rate": 4.976639045035036e-07, "logits/chosen": -0.10676582902669907, "logits/rejected": -0.01227900106459856, "logps/chosen": -2.698171854019165, "logps/rejected": -3.3960328102111816, "loss": 0.6148, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.698171854019165, "rewards/margins": 0.697860836982727, "rewards/rejected": -3.3960328102111816, "sft_loss": 2.8922371864318848, "step": 3090 }, { "epoch": 1.6564642916875731, "grad_norm": 12.995678732494168, "learning_rate": 4.961065326889683e-07, "logits/chosen": -0.13217754662036896, "logits/rejected": 0.052339475601911545, "logps/chosen": -2.795322895050049, "logps/rejected": -3.633164644241333, "loss": 0.5317, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.795322895050049, "rewards/margins": 0.8378413915634155, "rewards/rejected": -3.633164644241333, "sft_loss": 2.9511818885803223, "step": 3095 }, { "epoch": 1.6591403244689746, "grad_norm": 16.08044230088277, "learning_rate": 4.9454919864887e-07, "logits/chosen": -0.255813330411911, "logits/rejected": -0.1016959398984909, "logps/chosen": -2.697873592376709, "logps/rejected": -3.5687255859375, "loss": 0.5228, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.697873592376709, "rewards/margins": 0.8708522915840149, "rewards/rejected": -3.5687255859375, "sft_loss": 2.96101975440979, "step": 3100 }, { "epoch": 1.6618163572503764, "grad_norm": 19.529351787541103, "learning_rate": 4.929919174924701e-07, "logits/chosen": -0.26780134439468384, "logits/rejected": -0.045740462839603424, "logps/chosen": -2.713799238204956, "logps/rejected": -3.4403533935546875, "loss": 0.5532, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.713799238204956, "rewards/margins": 0.7265541553497314, "rewards/rejected": -3.4403533935546875, "sft_loss": 2.929337501525879, "step": 3105 }, { "epoch": 1.6644923900317778, "grad_norm": 12.31980115365111, "learning_rate": 4.914347043285177e-07, "logits/chosen": -0.15741178393363953, "logits/rejected": -0.007818855345249176, "logps/chosen": -2.7308452129364014, "logps/rejected": -3.5612876415252686, "loss": 0.5259, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.7308452129364014, "rewards/margins": 0.8304422497749329, "rewards/rejected": -3.5612876415252686, "sft_loss": 2.8026375770568848, "step": 3110 }, { "epoch": 1.6671684228131793, "grad_norm": 13.205443574074472, "learning_rate": 4.898775742651013e-07, "logits/chosen": -0.1403086930513382, "logits/rejected": 0.01723460480570793, "logps/chosen": -2.7844042778015137, "logps/rejected": -3.7625949382781982, "loss": 0.4815, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.7844042778015137, "rewards/margins": 0.9781904220581055, "rewards/rejected": -3.7625949382781982, "sft_loss": 3.0160343647003174, "step": 3115 }, { "epoch": 1.669844455594581, "grad_norm": 11.706451190194656, "learning_rate": 4.883205424095037e-07, "logits/chosen": -0.27177152037620544, "logits/rejected": -0.07603542506694794, "logps/chosen": -2.9312081336975098, "logps/rejected": -3.8443360328674316, "loss": 0.5256, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.9312081336975098, "rewards/margins": 0.9131277203559875, "rewards/rejected": -3.8443360328674316, "sft_loss": 3.074876308441162, "step": 3120 }, { "epoch": 1.6725204883759828, "grad_norm": 13.913970753690924, "learning_rate": 4.86763623868055e-07, "logits/chosen": -0.184868723154068, "logits/rejected": -0.03205538168549538, "logps/chosen": -2.962169885635376, "logps/rejected": -3.8724663257598877, "loss": 0.5351, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.962169885635376, "rewards/margins": 0.910296618938446, "rewards/rejected": -3.8724663257598877, "sft_loss": 3.0643038749694824, "step": 3125 }, { "epoch": 1.675196521157384, "grad_norm": 13.159797786943738, "learning_rate": 4.852068337459856e-07, "logits/chosen": -0.12268374860286713, "logits/rejected": 0.09164348989725113, "logps/chosen": -3.1019387245178223, "logps/rejected": -3.940359592437744, "loss": 0.5308, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.1019387245178223, "rewards/margins": 0.8384206891059875, "rewards/rejected": -3.940359592437744, "sft_loss": 3.2816097736358643, "step": 3130 }, { "epoch": 1.6778725539387858, "grad_norm": 15.4563527851753, "learning_rate": 4.8365018714728e-07, "logits/chosen": -0.11665117740631104, "logits/rejected": -0.003966017626225948, "logps/chosen": -3.2221763134002686, "logps/rejected": -3.9856514930725098, "loss": 0.5706, "rewards/accuracies": 0.71875, "rewards/chosen": -3.2221763134002686, "rewards/margins": 0.763475239276886, "rewards/rejected": -3.9856514930725098, "sft_loss": 3.3253014087677, "step": 3135 }, { "epoch": 1.6805485867201875, "grad_norm": 17.822799045657792, "learning_rate": 4.820936991745304e-07, "logits/chosen": -0.3635734021663666, "logits/rejected": -0.2108539342880249, "logps/chosen": -2.9746642112731934, "logps/rejected": -3.8151726722717285, "loss": 0.5206, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.9746642112731934, "rewards/margins": 0.8405085802078247, "rewards/rejected": -3.8151726722717285, "sft_loss": 3.126021146774292, "step": 3140 }, { "epoch": 1.6832246195015887, "grad_norm": 13.688291338768908, "learning_rate": 4.8053738492879e-07, "logits/chosen": -0.15512339770793915, "logits/rejected": 0.007311081979423761, "logps/chosen": -2.8693430423736572, "logps/rejected": -3.8841490745544434, "loss": 0.522, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.8693430423736572, "rewards/margins": 1.0148061513900757, "rewards/rejected": -3.8841490745544434, "sft_loss": 2.981611967086792, "step": 3145 }, { "epoch": 1.6859006522829905, "grad_norm": 13.620392772007163, "learning_rate": 4.789812595094265e-07, "logits/chosen": -0.2888887822628021, "logits/rejected": -0.1388740837574005, "logps/chosen": -2.93782377243042, "logps/rejected": -3.9592788219451904, "loss": 0.4875, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.93782377243042, "rewards/margins": 1.0214550495147705, "rewards/rejected": -3.9592788219451904, "sft_loss": 3.0410799980163574, "step": 3150 }, { "epoch": 1.6885766850643922, "grad_norm": 12.646316434762811, "learning_rate": 4.774253380139752e-07, "logits/chosen": -0.3319533169269562, "logits/rejected": -0.17273344099521637, "logps/chosen": -2.779099702835083, "logps/rejected": -3.8044273853302, "loss": 0.4882, "rewards/accuracies": 0.78125, "rewards/chosen": -2.779099702835083, "rewards/margins": 1.0253279209136963, "rewards/rejected": -3.8044273853302, "sft_loss": 2.975861072540283, "step": 3155 }, { "epoch": 1.6912527178457935, "grad_norm": 16.896373974454246, "learning_rate": 4.758696355379936e-07, "logits/chosen": -0.15754546225070953, "logits/rejected": -0.19510313868522644, "logps/chosen": -2.8744874000549316, "logps/rejected": -3.818066358566284, "loss": 0.5135, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.8744874000549316, "rewards/margins": 0.9435788989067078, "rewards/rejected": -3.818066358566284, "sft_loss": 3.1294708251953125, "step": 3160 }, { "epoch": 1.6939287506271952, "grad_norm": 15.788673901400218, "learning_rate": 4.743141671749138e-07, "logits/chosen": -0.3557151257991791, "logits/rejected": -0.18027469515800476, "logps/chosen": -3.043041706085205, "logps/rejected": -3.759451389312744, "loss": 0.6138, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.043041706085205, "rewards/margins": 0.7164098024368286, "rewards/rejected": -3.759451389312744, "sft_loss": 3.246708393096924, "step": 3165 }, { "epoch": 1.6966047834085969, "grad_norm": 14.237283392007136, "learning_rate": 4.727589480158968e-07, "logits/chosen": -0.2369193732738495, "logits/rejected": -0.11112294346094131, "logps/chosen": -2.933332920074463, "logps/rejected": -3.9403202533721924, "loss": 0.4988, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.933332920074463, "rewards/margins": 1.0069873332977295, "rewards/rejected": -3.9403202533721924, "sft_loss": 3.1049280166625977, "step": 3170 }, { "epoch": 1.6992808161899984, "grad_norm": 19.168604149317066, "learning_rate": 4.712039931496855e-07, "logits/chosen": -0.2870144844055176, "logits/rejected": -0.11162833869457245, "logps/chosen": -3.0423762798309326, "logps/rejected": -3.660609722137451, "loss": 0.6636, "rewards/accuracies": 0.6875, "rewards/chosen": -3.0423762798309326, "rewards/margins": 0.6182333827018738, "rewards/rejected": -3.660609722137451, "sft_loss": 3.242758274078369, "step": 3175 }, { "epoch": 1.7019568489713999, "grad_norm": 17.257914139514913, "learning_rate": 4.6964931766245905e-07, "logits/chosen": -0.1425977647304535, "logits/rejected": -0.0717199370265007, "logps/chosen": -2.940380573272705, "logps/rejected": -3.841813564300537, "loss": 0.5339, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.940380573272705, "rewards/margins": 0.9014331102371216, "rewards/rejected": -3.841813564300537, "sft_loss": 3.0481081008911133, "step": 3180 }, { "epoch": 1.7046328817528016, "grad_norm": 15.206261455382656, "learning_rate": 4.6809493663768575e-07, "logits/chosen": -0.1936657726764679, "logits/rejected": -0.12913763523101807, "logps/chosen": -2.8649189472198486, "logps/rejected": -3.4266788959503174, "loss": 0.6247, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.8649189472198486, "rewards/margins": 0.5617601275444031, "rewards/rejected": -3.4266788959503174, "sft_loss": 3.061222553253174, "step": 3185 }, { "epoch": 1.707308914534203, "grad_norm": 12.229514415619054, "learning_rate": 4.6654086515597716e-07, "logits/chosen": -0.3212862014770508, "logits/rejected": -0.11561963707208633, "logps/chosen": -2.8837294578552246, "logps/rejected": -3.8425660133361816, "loss": 0.4869, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.8837294578552246, "rewards/margins": 0.9588366746902466, "rewards/rejected": -3.8425660133361816, "sft_loss": 3.031167507171631, "step": 3190 }, { "epoch": 1.7099849473156046, "grad_norm": 12.193657302291385, "learning_rate": 4.6498711829494154e-07, "logits/chosen": -0.30257949233055115, "logits/rejected": -0.16699732840061188, "logps/chosen": -2.770297050476074, "logps/rejected": -3.7176918983459473, "loss": 0.5174, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.770297050476074, "rewards/margins": 0.9473945498466492, "rewards/rejected": -3.7176918983459473, "sft_loss": 2.859004259109497, "step": 3195 }, { "epoch": 1.7126609800970063, "grad_norm": 16.39615605105048, "learning_rate": 4.6343371112903777e-07, "logits/chosen": -0.18592138588428497, "logits/rejected": -0.01131738256663084, "logps/chosen": -2.9315178394317627, "logps/rejected": -3.722820281982422, "loss": 0.5766, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.9315178394317627, "rewards/margins": 0.7913025617599487, "rewards/rejected": -3.722820281982422, "sft_loss": 3.017145872116089, "step": 3200 }, { "epoch": 1.7126609800970063, "eval_logits/chosen": 0.15615563094615936, "eval_logits/rejected": 0.267660915851593, "eval_logps/chosen": -2.8133504390716553, "eval_logps/rejected": -3.665544271469116, "eval_loss": 0.551977813243866, "eval_rewards/accuracies": 0.7262611389160156, "eval_rewards/chosen": -2.8133504390716553, "eval_rewards/margins": 0.8521937727928162, "eval_rewards/rejected": -3.665544271469116, "eval_runtime": 50.0415, "eval_samples_per_second": 26.878, "eval_sft_loss": 2.964270830154419, "eval_steps_per_second": 6.734, "step": 3200 }, { "epoch": 1.7153370128784078, "grad_norm": 11.957138109955915, "learning_rate": 4.618806587294291e-07, "logits/chosen": -0.33529722690582275, "logits/rejected": -0.19825676083564758, "logps/chosen": -2.708866596221924, "logps/rejected": -3.683584213256836, "loss": 0.4965, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.708866596221924, "rewards/margins": 0.9747177958488464, "rewards/rejected": -3.683584213256836, "sft_loss": 2.8329052925109863, "step": 3205 }, { "epoch": 1.7180130456598093, "grad_norm": 18.133437006125078, "learning_rate": 4.603279761638365e-07, "logits/chosen": -0.3051992952823639, "logits/rejected": -0.1710871011018753, "logps/chosen": -2.7860054969787598, "logps/rejected": -3.4999566078186035, "loss": 0.5994, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.7860054969787598, "rewards/margins": 0.7139514684677124, "rewards/rejected": -3.4999566078186035, "sft_loss": 2.8975796699523926, "step": 3210 }, { "epoch": 1.720689078441211, "grad_norm": 14.432897472663257, "learning_rate": 4.5877567849639315e-07, "logits/chosen": -0.21736936271190643, "logits/rejected": -0.09302875399589539, "logps/chosen": -2.8191990852355957, "logps/rejected": -3.7241218090057373, "loss": 0.524, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.8191990852355957, "rewards/margins": 0.9049233198165894, "rewards/rejected": -3.7241218090057373, "sft_loss": 2.898805618286133, "step": 3215 }, { "epoch": 1.7233651112226125, "grad_norm": 13.15421311840138, "learning_rate": 4.572237807874979e-07, "logits/chosen": -0.28606656193733215, "logits/rejected": 0.012532521970570087, "logps/chosen": -3.0481009483337402, "logps/rejected": -3.9428951740264893, "loss": 0.5897, "rewards/accuracies": 0.71875, "rewards/chosen": -3.0481009483337402, "rewards/margins": 0.8947939872741699, "rewards/rejected": -3.9428951740264893, "sft_loss": 3.056109666824341, "step": 3220 }, { "epoch": 1.726041144004014, "grad_norm": 14.062265382351862, "learning_rate": 4.5567229809366895e-07, "logits/chosen": -0.2212449610233307, "logits/rejected": -0.07261637598276138, "logps/chosen": -2.7670116424560547, "logps/rejected": -3.6225428581237793, "loss": 0.5298, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.7670116424560547, "rewards/margins": 0.8555313348770142, "rewards/rejected": -3.6225428581237793, "sft_loss": 2.9393343925476074, "step": 3225 }, { "epoch": 1.7287171767854157, "grad_norm": 21.977791827793503, "learning_rate": 4.541212454673984e-07, "logits/chosen": -0.2686312794685364, "logits/rejected": -0.08200834691524506, "logps/chosen": -2.868399143218994, "logps/rejected": -4.024724006652832, "loss": 0.4992, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.868399143218994, "rewards/margins": 1.156325101852417, "rewards/rejected": -4.024724006652832, "sft_loss": 2.982067584991455, "step": 3230 }, { "epoch": 1.7313932095668172, "grad_norm": 13.672277376114732, "learning_rate": 4.525706379570055e-07, "logits/chosen": -0.2825610041618347, "logits/rejected": -0.16836941242218018, "logps/chosen": -2.8134896755218506, "logps/rejected": -3.7119224071502686, "loss": 0.5161, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.8134896755218506, "rewards/margins": 0.8984330296516418, "rewards/rejected": -3.7119224071502686, "sft_loss": 2.921664237976074, "step": 3235 }, { "epoch": 1.7340692423482187, "grad_norm": 14.546138304989343, "learning_rate": 4.510204906064911e-07, "logits/chosen": -0.19505472481250763, "logits/rejected": -0.030590301379561424, "logps/chosen": -2.7975356578826904, "logps/rejected": -3.816844940185547, "loss": 0.5086, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.7975356578826904, "rewards/margins": 1.0193090438842773, "rewards/rejected": -3.816844940185547, "sft_loss": 2.81569242477417, "step": 3240 }, { "epoch": 1.7367452751296204, "grad_norm": 14.787915348963589, "learning_rate": 4.4947081845539177e-07, "logits/chosen": -0.35870975255966187, "logits/rejected": -0.206782728433609, "logps/chosen": -2.8665363788604736, "logps/rejected": -3.6919448375701904, "loss": 0.56, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.8665363788604736, "rewards/margins": 0.8254083395004272, "rewards/rejected": -3.6919448375701904, "sft_loss": 2.9097416400909424, "step": 3245 }, { "epoch": 1.739421307911022, "grad_norm": 12.871323062130514, "learning_rate": 4.479216365386333e-07, "logits/chosen": -0.19920530915260315, "logits/rejected": 0.00923833716660738, "logps/chosen": -2.8280484676361084, "logps/rejected": -3.8082661628723145, "loss": 0.5145, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.8280484676361084, "rewards/margins": 0.980217456817627, "rewards/rejected": -3.8082661628723145, "sft_loss": 2.869939088821411, "step": 3250 }, { "epoch": 1.7420973406924234, "grad_norm": 13.069487032537221, "learning_rate": 4.4637295988638555e-07, "logits/chosen": -0.21338963508605957, "logits/rejected": -0.09642849117517471, "logps/chosen": -2.8135313987731934, "logps/rejected": -3.6506245136260986, "loss": 0.5479, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8135313987731934, "rewards/margins": 0.8370929956436157, "rewards/rejected": -3.6506245136260986, "sft_loss": 2.9300215244293213, "step": 3255 }, { "epoch": 1.744773373473825, "grad_norm": 21.564495906900454, "learning_rate": 4.4482480352391623e-07, "logits/chosen": -0.28455790877342224, "logits/rejected": -0.12763725221157074, "logps/chosen": -2.817474603652954, "logps/rejected": -3.723423480987549, "loss": 0.5153, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.817474603652954, "rewards/margins": 0.90594881772995, "rewards/rejected": -3.723423480987549, "sft_loss": 2.937216281890869, "step": 3260 }, { "epoch": 1.7474494062552266, "grad_norm": 23.117393791841398, "learning_rate": 4.4327718247144507e-07, "logits/chosen": -0.19001540541648865, "logits/rejected": -0.04242430999875069, "logps/chosen": -2.7809228897094727, "logps/rejected": -3.6734752655029297, "loss": 0.5425, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.7809228897094727, "rewards/margins": 0.8925522565841675, "rewards/rejected": -3.6734752655029297, "sft_loss": 2.970322608947754, "step": 3265 }, { "epoch": 1.750125439036628, "grad_norm": 18.135415288579836, "learning_rate": 4.417301117439984e-07, "logits/chosen": -0.238087460398674, "logits/rejected": -0.09954921901226044, "logps/chosen": -2.83843994140625, "logps/rejected": -3.768962860107422, "loss": 0.531, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.83843994140625, "rewards/margins": 0.9305224418640137, "rewards/rejected": -3.768962860107422, "sft_loss": 2.9210972785949707, "step": 3270 }, { "epoch": 1.7528014718180298, "grad_norm": 16.001325089443043, "learning_rate": 4.401836063512631e-07, "logits/chosen": -0.30430033802986145, "logits/rejected": 0.05234812945127487, "logps/chosen": -2.756070613861084, "logps/rejected": -3.744997501373291, "loss": 0.496, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.756070613861084, "rewards/margins": 0.9889270663261414, "rewards/rejected": -3.744997501373291, "sft_loss": 2.9325056076049805, "step": 3275 }, { "epoch": 1.7554775045994313, "grad_norm": 19.97497988116218, "learning_rate": 4.386376812974413e-07, "logits/chosen": -0.252798855304718, "logits/rejected": -0.16332267224788666, "logps/chosen": -2.5628037452697754, "logps/rejected": -3.5728652477264404, "loss": 0.5096, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.5628037452697754, "rewards/margins": 1.0100618600845337, "rewards/rejected": -3.5728652477264404, "sft_loss": 2.757195472717285, "step": 3280 }, { "epoch": 1.7581535373808328, "grad_norm": 13.525894288892985, "learning_rate": 4.370923515811048e-07, "logits/chosen": -0.2700463831424713, "logits/rejected": -0.023473823443055153, "logps/chosen": -2.79638934135437, "logps/rejected": -3.7672476768493652, "loss": 0.5042, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.79638934135437, "rewards/margins": 0.9708584547042847, "rewards/rejected": -3.7672476768493652, "sft_loss": 2.8965699672698975, "step": 3285 }, { "epoch": 1.7608295701622345, "grad_norm": 14.699034871188948, "learning_rate": 4.35547632195049e-07, "logits/chosen": -0.21370744705200195, "logits/rejected": -0.08334928005933762, "logps/chosen": -2.721179246902466, "logps/rejected": -3.579859495162964, "loss": 0.5228, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.721179246902466, "rewards/margins": 0.858680248260498, "rewards/rejected": -3.579859495162964, "sft_loss": 2.878352165222168, "step": 3290 }, { "epoch": 1.763505602943636, "grad_norm": 17.573298503902322, "learning_rate": 4.340035381261484e-07, "logits/chosen": -0.21696865558624268, "logits/rejected": -0.09547743201255798, "logps/chosen": -2.980989456176758, "logps/rejected": -3.8878490924835205, "loss": 0.5625, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.980989456176758, "rewards/margins": 0.9068597555160522, "rewards/rejected": -3.8878490924835205, "sft_loss": 3.0405678749084473, "step": 3295 }, { "epoch": 1.7661816357250375, "grad_norm": 24.841053261828165, "learning_rate": 4.324600843552104e-07, "logits/chosen": -0.31977978348731995, "logits/rejected": -0.14189480245113373, "logps/chosen": -3.106128692626953, "logps/rejected": -4.027562618255615, "loss": 0.5568, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.106128692626953, "rewards/margins": 0.9214338064193726, "rewards/rejected": -4.027562618255615, "sft_loss": 3.2608656883239746, "step": 3300 }, { "epoch": 1.7688576685064392, "grad_norm": 17.014889686723876, "learning_rate": 4.309172858568302e-07, "logits/chosen": -0.3131139874458313, "logits/rejected": -0.08883488923311234, "logps/chosen": -3.0094892978668213, "logps/rejected": -3.9629604816436768, "loss": 0.5439, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.0094892978668213, "rewards/margins": 0.9534710645675659, "rewards/rejected": -3.9629604816436768, "sft_loss": 3.136040210723877, "step": 3305 }, { "epoch": 1.771533701287841, "grad_norm": 20.074429307266534, "learning_rate": 4.293751575992455e-07, "logits/chosen": -0.12032179534435272, "logits/rejected": -0.06570522487163544, "logps/chosen": -3.0344769954681396, "logps/rejected": -3.9326884746551514, "loss": 0.5096, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.0344769954681396, "rewards/margins": 0.898211658000946, "rewards/rejected": -3.9326884746551514, "sft_loss": 3.2100040912628174, "step": 3310 }, { "epoch": 1.7742097340692422, "grad_norm": 26.469130295066087, "learning_rate": 4.278337145441916e-07, "logits/chosen": -0.26177433133125305, "logits/rejected": -0.05351484566926956, "logps/chosen": -3.0022337436676025, "logps/rejected": -3.882744312286377, "loss": 0.5514, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.0022337436676025, "rewards/margins": 0.880510151386261, "rewards/rejected": -3.882744312286377, "sft_loss": 3.1298012733459473, "step": 3315 }, { "epoch": 1.776885766850644, "grad_norm": 13.810006794454017, "learning_rate": 4.262929716467556e-07, "logits/chosen": -0.24092534184455872, "logits/rejected": 0.03226093575358391, "logps/chosen": -2.9213504791259766, "logps/rejected": -3.9801437854766846, "loss": 0.5226, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.9213504791259766, "rewards/margins": 1.0587931871414185, "rewards/rejected": -3.9801437854766846, "sft_loss": 3.0345194339752197, "step": 3320 }, { "epoch": 1.7795617996320456, "grad_norm": 13.141757845299274, "learning_rate": 4.247529438552321e-07, "logits/chosen": -0.25433140993118286, "logits/rejected": -0.04708702117204666, "logps/chosen": -2.885637044906616, "logps/rejected": -3.7773375511169434, "loss": 0.5466, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.885637044906616, "rewards/margins": 0.8917006254196167, "rewards/rejected": -3.7773375511169434, "sft_loss": 3.123898983001709, "step": 3325 }, { "epoch": 1.782237832413447, "grad_norm": 15.999497786068854, "learning_rate": 4.232136461109773e-07, "logits/chosen": -0.19254162907600403, "logits/rejected": -0.05719348043203354, "logps/chosen": -2.7694904804229736, "logps/rejected": -3.805851697921753, "loss": 0.4953, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7694904804229736, "rewards/margins": 1.0363613367080688, "rewards/rejected": -3.805851697921753, "sft_loss": 2.980656623840332, "step": 3330 }, { "epoch": 1.7849138651948486, "grad_norm": 24.841428177881934, "learning_rate": 4.216750933482646e-07, "logits/chosen": -0.21872751414775848, "logits/rejected": -0.017170961946249008, "logps/chosen": -3.009554386138916, "logps/rejected": -3.8017420768737793, "loss": 0.563, "rewards/accuracies": 0.75, "rewards/chosen": -3.009554386138916, "rewards/margins": 0.7921879887580872, "rewards/rejected": -3.8017420768737793, "sft_loss": 3.075871467590332, "step": 3335 }, { "epoch": 1.7875898979762503, "grad_norm": 36.892800216700245, "learning_rate": 4.2013730049413986e-07, "logits/chosen": -0.19581346213817596, "logits/rejected": -0.0030058815609663725, "logps/chosen": -2.7792811393737793, "logps/rejected": -3.8306567668914795, "loss": 0.4923, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.7792811393737793, "rewards/margins": 1.0513756275177002, "rewards/rejected": -3.8306567668914795, "sft_loss": 2.9651989936828613, "step": 3340 }, { "epoch": 1.7902659307576518, "grad_norm": 15.101609080190148, "learning_rate": 4.1860028246827594e-07, "logits/chosen": -0.19188782572746277, "logits/rejected": 0.03688093274831772, "logps/chosen": -2.7170400619506836, "logps/rejected": -3.64373779296875, "loss": 0.5098, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.7170400619506836, "rewards/margins": 0.9266974329948425, "rewards/rejected": -3.64373779296875, "sft_loss": 2.9208431243896484, "step": 3345 }, { "epoch": 1.7929419635390533, "grad_norm": 15.293234158359965, "learning_rate": 4.170640541828285e-07, "logits/chosen": -0.31982487440109253, "logits/rejected": -0.16904591023921967, "logps/chosen": -2.936917543411255, "logps/rejected": -3.794989824295044, "loss": 0.5417, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.936917543411255, "rewards/margins": 0.8580719828605652, "rewards/rejected": -3.794989824295044, "sft_loss": 3.077047348022461, "step": 3350 }, { "epoch": 1.795617996320455, "grad_norm": 19.198648656415443, "learning_rate": 4.1552863054229116e-07, "logits/chosen": -0.10441069304943085, "logits/rejected": -0.027602875605225563, "logps/chosen": -3.047877788543701, "logps/rejected": -3.7978546619415283, "loss": 0.6106, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -3.047877788543701, "rewards/margins": 0.7499769330024719, "rewards/rejected": -3.7978546619415283, "sft_loss": 3.1034159660339355, "step": 3355 }, { "epoch": 1.7982940291018565, "grad_norm": 16.89787950921183, "learning_rate": 4.139940264433508e-07, "logits/chosen": -0.2672869861125946, "logits/rejected": -0.043167419731616974, "logps/chosen": -2.7406458854675293, "logps/rejected": -3.6946983337402344, "loss": 0.5234, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7406458854675293, "rewards/margins": 0.9540519714355469, "rewards/rejected": -3.6946983337402344, "sft_loss": 2.8566882610321045, "step": 3360 }, { "epoch": 1.800970061883258, "grad_norm": 15.023275891915496, "learning_rate": 4.1246025677474303e-07, "logits/chosen": -0.29182684421539307, "logits/rejected": -0.07425285875797272, "logps/chosen": -2.939763069152832, "logps/rejected": -3.802227735519409, "loss": 0.5382, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.939763069152832, "rewards/margins": 0.8624647259712219, "rewards/rejected": -3.802227735519409, "sft_loss": 3.107689380645752, "step": 3365 }, { "epoch": 1.8036460946646597, "grad_norm": 15.364491576601871, "learning_rate": 4.10927336417108e-07, "logits/chosen": -0.2519490122795105, "logits/rejected": -0.06419762223958969, "logps/chosen": -2.964268684387207, "logps/rejected": -3.6328086853027344, "loss": 0.6315, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.964268684387207, "rewards/margins": 0.6685395240783691, "rewards/rejected": -3.6328086853027344, "sft_loss": 3.074256658554077, "step": 3370 }, { "epoch": 1.8063221274460612, "grad_norm": 21.966152974131624, "learning_rate": 4.093952802428457e-07, "logits/chosen": -0.09806279838085175, "logits/rejected": -0.012893694452941418, "logps/chosen": -3.0203330516815186, "logps/rejected": -3.806386947631836, "loss": 0.6036, "rewards/accuracies": 0.6875, "rewards/chosen": -3.0203330516815186, "rewards/margins": 0.7860537767410278, "rewards/rejected": -3.806386947631836, "sft_loss": 3.112948417663574, "step": 3375 }, { "epoch": 1.8089981602274627, "grad_norm": 13.016070631296792, "learning_rate": 4.0786410311597184e-07, "logits/chosen": -0.3160150945186615, "logits/rejected": -0.10090694576501846, "logps/chosen": -2.8167786598205566, "logps/rejected": -3.6615512371063232, "loss": 0.5482, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.8167786598205566, "rewards/margins": 0.8447723388671875, "rewards/rejected": -3.6615512371063232, "sft_loss": 2.863129138946533, "step": 3380 }, { "epoch": 1.8116741930088645, "grad_norm": 14.525805888243237, "learning_rate": 4.063338198919737e-07, "logits/chosen": -0.2736918330192566, "logits/rejected": -0.23787431418895721, "logps/chosen": -2.814265012741089, "logps/rejected": -3.5256824493408203, "loss": 0.5887, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.814265012741089, "rewards/margins": 0.7114171981811523, "rewards/rejected": -3.5256824493408203, "sft_loss": 2.904420852661133, "step": 3385 }, { "epoch": 1.814350225790266, "grad_norm": 22.23247451095, "learning_rate": 4.0480444541766575e-07, "logits/chosen": -0.2675407826900482, "logits/rejected": -0.09931263327598572, "logps/chosen": -2.8064563274383545, "logps/rejected": -3.5337014198303223, "loss": 0.5936, "rewards/accuracies": 0.6875, "rewards/chosen": -2.8064563274383545, "rewards/margins": 0.727245032787323, "rewards/rejected": -3.5337014198303223, "sft_loss": 2.860211133956909, "step": 3390 }, { "epoch": 1.8170262585716674, "grad_norm": 14.488352793649376, "learning_rate": 4.0327599453104606e-07, "logits/chosen": -0.30500540137290955, "logits/rejected": -0.12484397739171982, "logps/chosen": -2.6433451175689697, "logps/rejected": -3.6233038902282715, "loss": 0.497, "rewards/accuracies": 0.78125, "rewards/chosen": -2.6433451175689697, "rewards/margins": 0.9799593091011047, "rewards/rejected": -3.6233038902282715, "sft_loss": 2.772120952606201, "step": 3395 }, { "epoch": 1.8197022913530692, "grad_norm": 18.939007337615287, "learning_rate": 4.017484820611514e-07, "logits/chosen": -0.22828754782676697, "logits/rejected": -0.0670999065041542, "logps/chosen": -2.731137275695801, "logps/rejected": -3.5642154216766357, "loss": 0.5295, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.731137275695801, "rewards/margins": 0.8330783843994141, "rewards/rejected": -3.5642154216766357, "sft_loss": 2.819455623626709, "step": 3400 }, { "epoch": 1.8223783241344707, "grad_norm": 13.642105553707072, "learning_rate": 4.002219228279148e-07, "logits/chosen": -0.2647199034690857, "logits/rejected": -0.09628833085298538, "logps/chosen": -2.7119967937469482, "logps/rejected": -3.5323944091796875, "loss": 0.5146, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.7119967937469482, "rewards/margins": 0.8203978538513184, "rewards/rejected": -3.5323944091796875, "sft_loss": 2.823310613632202, "step": 3405 }, { "epoch": 1.8250543569158721, "grad_norm": 15.604402083872163, "learning_rate": 3.9869633164202045e-07, "logits/chosen": -0.2539847493171692, "logits/rejected": 0.020132040604948997, "logps/chosen": -2.849400043487549, "logps/rejected": -3.6440062522888184, "loss": 0.5334, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.849400043487549, "rewards/margins": 0.7946061491966248, "rewards/rejected": -3.6440062522888184, "sft_loss": 2.8739774227142334, "step": 3410 }, { "epoch": 1.8277303896972739, "grad_norm": 21.899598807926793, "learning_rate": 3.9717172330476077e-07, "logits/chosen": -0.2510986924171448, "logits/rejected": -0.10071302950382233, "logps/chosen": -2.827031135559082, "logps/rejected": -3.734854221343994, "loss": 0.5492, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.827031135559082, "rewards/margins": 0.9078229665756226, "rewards/rejected": -3.734854221343994, "sft_loss": 2.97774076461792, "step": 3415 }, { "epoch": 1.8304064224786754, "grad_norm": 20.063555498521275, "learning_rate": 3.956481126078927e-07, "logits/chosen": -0.14432759582996368, "logits/rejected": -0.011064152233302593, "logps/chosen": -2.895400047302246, "logps/rejected": -3.739138126373291, "loss": 0.5999, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.895400047302246, "rewards/margins": 0.8437372446060181, "rewards/rejected": -3.739138126373291, "sft_loss": 3.0360639095306396, "step": 3420 }, { "epoch": 1.8330824552600768, "grad_norm": 13.483525350613622, "learning_rate": 3.941255143334937e-07, "logits/chosen": -0.2542577385902405, "logits/rejected": -0.19230565428733826, "logps/chosen": -2.8025808334350586, "logps/rejected": -3.6844921112060547, "loss": 0.5318, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8025808334350586, "rewards/margins": 0.8819111585617065, "rewards/rejected": -3.6844921112060547, "sft_loss": 2.8600831031799316, "step": 3425 }, { "epoch": 1.8357584880414786, "grad_norm": 16.43243776872195, "learning_rate": 3.9260394325381895e-07, "logits/chosen": -0.22831299901008606, "logits/rejected": -0.075655996799469, "logps/chosen": -2.699906587600708, "logps/rejected": -3.8140316009521484, "loss": 0.4896, "rewards/accuracies": 0.84375, "rewards/chosen": -2.699906587600708, "rewards/margins": 1.1141245365142822, "rewards/rejected": -3.8140316009521484, "sft_loss": 2.791806697845459, "step": 3430 }, { "epoch": 1.83843452082288, "grad_norm": 15.428523896946192, "learning_rate": 3.9108341413115784e-07, "logits/chosen": -0.24065284430980682, "logits/rejected": -0.09862431138753891, "logps/chosen": -2.6614131927490234, "logps/rejected": -3.627790927886963, "loss": 0.4677, "rewards/accuracies": 0.84375, "rewards/chosen": -2.6614131927490234, "rewards/margins": 0.9663776159286499, "rewards/rejected": -3.627790927886963, "sft_loss": 2.8119826316833496, "step": 3435 }, { "epoch": 1.8411105536042816, "grad_norm": 19.851445861786452, "learning_rate": 3.895639417176905e-07, "logits/chosen": -0.2709447145462036, "logits/rejected": -0.14585816860198975, "logps/chosen": -2.7599568367004395, "logps/rejected": -3.7462761402130127, "loss": 0.5487, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.7599568367004395, "rewards/margins": 0.9863188862800598, "rewards/rejected": -3.7462761402130127, "sft_loss": 2.869521141052246, "step": 3440 }, { "epoch": 1.8437865863856833, "grad_norm": 15.225388707062304, "learning_rate": 3.8804554075534497e-07, "logits/chosen": -0.2733847498893738, "logits/rejected": -0.026109689846634865, "logps/chosen": -2.805610418319702, "logps/rejected": -3.7957942485809326, "loss": 0.5148, "rewards/accuracies": 0.75, "rewards/chosen": -2.805610418319702, "rewards/margins": 0.9901833534240723, "rewards/rejected": -3.7957942485809326, "sft_loss": 2.947772264480591, "step": 3445 }, { "epoch": 1.8464626191670848, "grad_norm": 15.99874798191993, "learning_rate": 3.8652822597565403e-07, "logits/chosen": -0.3326260447502136, "logits/rejected": -0.12364955246448517, "logps/chosen": -2.821265459060669, "logps/rejected": -3.783513307571411, "loss": 0.5188, "rewards/accuracies": 0.75, "rewards/chosen": -2.821265459060669, "rewards/margins": 0.9622477293014526, "rewards/rejected": -3.783513307571411, "sft_loss": 2.948594570159912, "step": 3450 }, { "epoch": 1.8491386519484863, "grad_norm": 17.173300996571587, "learning_rate": 3.850120120996123e-07, "logits/chosen": -0.23530642688274384, "logits/rejected": 0.0075013055466115475, "logps/chosen": -2.9990508556365967, "logps/rejected": -3.8762524127960205, "loss": 0.562, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.9990508556365967, "rewards/margins": 0.8772016763687134, "rewards/rejected": -3.8762524127960205, "sft_loss": 3.0761046409606934, "step": 3455 }, { "epoch": 1.851814684729888, "grad_norm": 17.42784087365534, "learning_rate": 3.8349691383753356e-07, "logits/chosen": -0.13078606128692627, "logits/rejected": 0.005738553591072559, "logps/chosen": -2.811361789703369, "logps/rejected": -3.7540740966796875, "loss": 0.5298, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.811361789703369, "rewards/margins": 0.9427124261856079, "rewards/rejected": -3.7540740966796875, "sft_loss": 2.885227680206299, "step": 3460 }, { "epoch": 1.8544907175112895, "grad_norm": 11.970284039619953, "learning_rate": 3.819829458889078e-07, "logits/chosen": -0.23028437793254852, "logits/rejected": -0.10200443118810654, "logps/chosen": -2.7681221961975098, "logps/rejected": -3.6479923725128174, "loss": 0.534, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -2.7681221961975098, "rewards/margins": 0.8798701167106628, "rewards/rejected": -3.6479923725128174, "sft_loss": 2.8739943504333496, "step": 3465 }, { "epoch": 1.857166750292691, "grad_norm": 15.818645996946636, "learning_rate": 3.804701229422585e-07, "logits/chosen": -0.2516114115715027, "logits/rejected": -0.13954241573810577, "logps/chosen": -2.9505364894866943, "logps/rejected": -3.8601531982421875, "loss": 0.5392, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.9505364894866943, "rewards/margins": 0.9096164703369141, "rewards/rejected": -3.8601531982421875, "sft_loss": 3.0391030311584473, "step": 3470 }, { "epoch": 1.8598427830740927, "grad_norm": 18.35439781552601, "learning_rate": 3.789584596750007e-07, "logits/chosen": -0.22998587787151337, "logits/rejected": -0.16458949446678162, "logps/chosen": -2.8295726776123047, "logps/rejected": -3.6990628242492676, "loss": 0.5392, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.8295726776123047, "rewards/margins": 0.8694899678230286, "rewards/rejected": -3.6990628242492676, "sft_loss": 2.8997833728790283, "step": 3475 }, { "epoch": 1.8625188158554944, "grad_norm": 15.377452961017998, "learning_rate": 3.77447970753298e-07, "logits/chosen": -0.11249478161334991, "logits/rejected": -0.07554732263088226, "logps/chosen": -2.8798892498016357, "logps/rejected": -3.7367751598358154, "loss": 0.543, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.8798892498016357, "rewards/margins": 0.8568856120109558, "rewards/rejected": -3.7367751598358154, "sft_loss": 3.0244550704956055, "step": 3480 }, { "epoch": 1.8651948486368957, "grad_norm": 17.636805330875724, "learning_rate": 3.7593867083192057e-07, "logits/chosen": -0.19624033570289612, "logits/rejected": -0.01715945638716221, "logps/chosen": -2.7213451862335205, "logps/rejected": -3.5876636505126953, "loss": 0.555, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.7213451862335205, "rewards/margins": 0.8663187026977539, "rewards/rejected": -3.5876636505126953, "sft_loss": 2.8986878395080566, "step": 3485 }, { "epoch": 1.8678708814182974, "grad_norm": 16.273522498705884, "learning_rate": 3.7443057455410276e-07, "logits/chosen": -0.19358979165554047, "logits/rejected": -0.05072442814707756, "logps/chosen": -2.7149434089660645, "logps/rejected": -3.714277982711792, "loss": 0.478, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.7149434089660645, "rewards/margins": 0.999334454536438, "rewards/rejected": -3.714277982711792, "sft_loss": 2.915403127670288, "step": 3490 }, { "epoch": 1.870546914199699, "grad_norm": 13.653360966622175, "learning_rate": 3.7292369655140145e-07, "logits/chosen": -0.29655593633651733, "logits/rejected": -0.09064863622188568, "logps/chosen": -2.846242904663086, "logps/rejected": -3.625659465789795, "loss": 0.5148, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.846242904663086, "rewards/margins": 0.779416561126709, "rewards/rejected": -3.625659465789795, "sft_loss": 3.0332603454589844, "step": 3495 }, { "epoch": 1.8732229469811004, "grad_norm": 15.100840540865368, "learning_rate": 3.714180514435534e-07, "logits/chosen": -0.16411438584327698, "logits/rejected": 0.02352045103907585, "logps/chosen": -2.780078411102295, "logps/rejected": -3.7550857067108154, "loss": 0.5162, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.780078411102295, "rewards/margins": 0.9750074148178101, "rewards/rejected": -3.7550857067108154, "sft_loss": 2.971510648727417, "step": 3500 }, { "epoch": 1.875898979762502, "grad_norm": 18.8741615601585, "learning_rate": 3.6991365383833426e-07, "logits/chosen": -0.2565234303474426, "logits/rejected": -0.07370957732200623, "logps/chosen": -2.813046932220459, "logps/rejected": -3.7858214378356934, "loss": 0.4985, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.813046932220459, "rewards/margins": 0.9727746248245239, "rewards/rejected": -3.7858214378356934, "sft_loss": 3.0226640701293945, "step": 3505 }, { "epoch": 1.8785750125439038, "grad_norm": 19.983117937550535, "learning_rate": 3.684105183314162e-07, "logits/chosen": -0.2373221218585968, "logits/rejected": -0.12379314005374908, "logps/chosen": -2.7259597778320312, "logps/rejected": -3.637960433959961, "loss": 0.5015, "rewards/accuracies": 0.8125, "rewards/chosen": -2.7259597778320312, "rewards/margins": 0.9120001792907715, "rewards/rejected": -3.637960433959961, "sft_loss": 2.852381467819214, "step": 3510 }, { "epoch": 1.881251045325305, "grad_norm": 18.11630276511099, "learning_rate": 3.669086595062263e-07, "logits/chosen": -0.23923330008983612, "logits/rejected": -0.0036049566697329283, "logps/chosen": -2.9402759075164795, "logps/rejected": -3.849766492843628, "loss": 0.5202, "rewards/accuracies": 0.75, "rewards/chosen": -2.9402759075164795, "rewards/margins": 0.9094909429550171, "rewards/rejected": -3.849766492843628, "sft_loss": 3.062849521636963, "step": 3515 }, { "epoch": 1.8839270781067068, "grad_norm": 14.118071827863625, "learning_rate": 3.654080919338056e-07, "logits/chosen": -0.28001868724823, "logits/rejected": -0.07703977078199387, "logps/chosen": -2.8357691764831543, "logps/rejected": -3.7062485218048096, "loss": 0.5369, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.8357691764831543, "rewards/margins": 0.8704794049263, "rewards/rejected": -3.7062485218048096, "sft_loss": 3.0368707180023193, "step": 3520 }, { "epoch": 1.8866031108881085, "grad_norm": 16.525389106043953, "learning_rate": 3.639088301726673e-07, "logits/chosen": -0.20582440495491028, "logits/rejected": 0.0367545410990715, "logps/chosen": -2.8971991539001465, "logps/rejected": -3.7518577575683594, "loss": 0.5588, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.8971991539001465, "rewards/margins": 0.8546587824821472, "rewards/rejected": -3.7518577575683594, "sft_loss": 3.0653936862945557, "step": 3525 }, { "epoch": 1.88927914366951, "grad_norm": 20.9316555158969, "learning_rate": 3.624108887686556e-07, "logits/chosen": -0.20897629857063293, "logits/rejected": -0.11267737299203873, "logps/chosen": -2.93704891204834, "logps/rejected": -3.8252410888671875, "loss": 0.5127, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.93704891204834, "rewards/margins": 0.888192355632782, "rewards/rejected": -3.8252410888671875, "sft_loss": 3.1720235347747803, "step": 3530 }, { "epoch": 1.8919551764509115, "grad_norm": 12.859754973839012, "learning_rate": 3.6091428225480433e-07, "logits/chosen": -0.2824031412601471, "logits/rejected": -0.11336223781108856, "logps/chosen": -2.920830488204956, "logps/rejected": -3.8924171924591064, "loss": 0.5286, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.920830488204956, "rewards/margins": 0.9715864062309265, "rewards/rejected": -3.8924171924591064, "sft_loss": 3.1959595680236816, "step": 3535 }, { "epoch": 1.8946312092323132, "grad_norm": 20.170217014416863, "learning_rate": 3.5941902515119674e-07, "logits/chosen": -0.27359622716903687, "logits/rejected": -0.008305774070322514, "logps/chosen": -2.9588866233825684, "logps/rejected": -3.740037202835083, "loss": 0.5759, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.9588866233825684, "rewards/margins": 0.7811505198478699, "rewards/rejected": -3.740037202835083, "sft_loss": 3.141091823577881, "step": 3540 }, { "epoch": 1.8973072420137147, "grad_norm": 18.208678592847644, "learning_rate": 3.5792513196482373e-07, "logits/chosen": -0.38286128640174866, "logits/rejected": -0.04241828992962837, "logps/chosen": -2.8372414112091064, "logps/rejected": -3.7894434928894043, "loss": 0.4823, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -2.8372414112091064, "rewards/margins": 0.952202320098877, "rewards/rejected": -3.7894434928894043, "sft_loss": 2.954357624053955, "step": 3545 }, { "epoch": 1.8999832747951162, "grad_norm": 16.611233931970162, "learning_rate": 3.5643261718944346e-07, "logits/chosen": -0.13990192115306854, "logits/rejected": -0.02662300132215023, "logps/chosen": -2.934990644454956, "logps/rejected": -3.732720136642456, "loss": 0.5683, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.934990644454956, "rewards/margins": 0.7977299094200134, "rewards/rejected": -3.732720136642456, "sft_loss": 2.920933246612549, "step": 3550 }, { "epoch": 1.902659307576518, "grad_norm": 14.196617770384604, "learning_rate": 3.5494149530544087e-07, "logits/chosen": -0.29803937673568726, "logits/rejected": -0.16058126091957092, "logps/chosen": -2.825559139251709, "logps/rejected": -3.727395534515381, "loss": 0.5747, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.825559139251709, "rewards/margins": 0.9018365740776062, "rewards/rejected": -3.727395534515381, "sft_loss": 2.9392800331115723, "step": 3555 }, { "epoch": 1.9053353403579194, "grad_norm": 17.050120292129, "learning_rate": 3.534517807796871e-07, "logits/chosen": -0.24503159523010254, "logits/rejected": -0.10288417339324951, "logps/chosen": -2.7696847915649414, "logps/rejected": -3.6267178058624268, "loss": 0.5248, "rewards/accuracies": 0.71875, "rewards/chosen": -2.7696847915649414, "rewards/margins": 0.8570332527160645, "rewards/rejected": -3.6267178058624268, "sft_loss": 2.9060094356536865, "step": 3560 }, { "epoch": 1.908011373139321, "grad_norm": 15.24198881267226, "learning_rate": 3.519634880653988e-07, "logits/chosen": -0.20890846848487854, "logits/rejected": -0.07936038076877594, "logps/chosen": -2.9502017498016357, "logps/rejected": -3.9783225059509277, "loss": 0.5096, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.9502017498016357, "rewards/margins": 1.028120756149292, "rewards/rejected": -3.9783225059509277, "sft_loss": 3.110295057296753, "step": 3565 }, { "epoch": 1.9106874059207226, "grad_norm": 13.753923022395519, "learning_rate": 3.504766316019987e-07, "logits/chosen": -0.28474587202072144, "logits/rejected": -0.05706968158483505, "logps/chosen": -2.75578236579895, "logps/rejected": -3.6933975219726562, "loss": 0.4884, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.75578236579895, "rewards/margins": 0.9376150965690613, "rewards/rejected": -3.6933975219726562, "sft_loss": 2.853646755218506, "step": 3570 }, { "epoch": 1.913363438702124, "grad_norm": 12.325171877103088, "learning_rate": 3.489912258149745e-07, "logits/chosen": -0.16082385182380676, "logits/rejected": -0.013875825330615044, "logps/chosen": -2.8332247734069824, "logps/rejected": -3.8153090476989746, "loss": 0.545, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.8332247734069824, "rewards/margins": 0.9820839166641235, "rewards/rejected": -3.8153090476989746, "sft_loss": 2.911475419998169, "step": 3575 }, { "epoch": 1.9160394714835256, "grad_norm": 12.90296161207828, "learning_rate": 3.475072851157397e-07, "logits/chosen": -0.20278310775756836, "logits/rejected": -0.11776771396398544, "logps/chosen": -2.810978889465332, "logps/rejected": -3.799607038497925, "loss": 0.4928, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.810978889465332, "rewards/margins": 0.9886280298233032, "rewards/rejected": -3.799607038497925, "sft_loss": 2.984903573989868, "step": 3580 }, { "epoch": 1.9187155042649273, "grad_norm": 13.919607678377089, "learning_rate": 3.460248239014936e-07, "logits/chosen": -0.1239798441529274, "logits/rejected": -0.034661222249269485, "logps/chosen": -2.9402782917022705, "logps/rejected": -3.8812129497528076, "loss": 0.5112, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.9402782917022705, "rewards/margins": 0.9409344792366028, "rewards/rejected": -3.8812129497528076, "sft_loss": 3.1353812217712402, "step": 3585 }, { "epoch": 1.9213915370463288, "grad_norm": 13.995097148662646, "learning_rate": 3.4454385655508134e-07, "logits/chosen": -0.16172340512275696, "logits/rejected": -0.08319219201803207, "logps/chosen": -2.9241456985473633, "logps/rejected": -3.6937434673309326, "loss": 0.5888, "rewards/accuracies": 0.65625, "rewards/chosen": -2.9241456985473633, "rewards/margins": 0.7695978879928589, "rewards/rejected": -3.6937434673309326, "sft_loss": 3.051616668701172, "step": 3590 }, { "epoch": 1.9240675698277303, "grad_norm": 11.714401564603223, "learning_rate": 3.4306439744485447e-07, "logits/chosen": -0.2790454626083374, "logits/rejected": -0.04525241255760193, "logps/chosen": -2.9379630088806152, "logps/rejected": -3.804042100906372, "loss": 0.5476, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.9379630088806152, "rewards/margins": 0.8660792112350464, "rewards/rejected": -3.804042100906372, "sft_loss": 2.9716274738311768, "step": 3595 }, { "epoch": 1.926743602609132, "grad_norm": 16.019929188124948, "learning_rate": 3.415864609245322e-07, "logits/chosen": -0.17339129745960236, "logits/rejected": 0.01918947696685791, "logps/chosen": -2.9560437202453613, "logps/rejected": -3.9043736457824707, "loss": 0.5625, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.9560437202453613, "rewards/margins": 0.9483293294906616, "rewards/rejected": -3.9043736457824707, "sft_loss": 3.161888360977173, "step": 3600 }, { "epoch": 1.926743602609132, "eval_logits/chosen": 0.24408887326717377, "eval_logits/rejected": 0.36701610684394836, "eval_logps/chosen": -2.859692335128784, "eval_logps/rejected": -3.7385363578796387, "eval_loss": 0.5478324294090271, "eval_rewards/accuracies": 0.7255192995071411, "eval_rewards/chosen": -2.859692335128784, "eval_rewards/margins": 0.878844141960144, "eval_rewards/rejected": -3.7385363578796387, "eval_runtime": 50.1277, "eval_samples_per_second": 26.831, "eval_sft_loss": 3.0563251972198486, "eval_steps_per_second": 6.723, "step": 3600 }, { "epoch": 1.9294196353905335, "grad_norm": 15.270554541376217, "learning_rate": 3.401100613330605e-07, "logits/chosen": -0.23438136279582977, "logits/rejected": -0.19709154963493347, "logps/chosen": -2.741083860397339, "logps/rejected": -3.5722594261169434, "loss": 0.5405, "rewards/accuracies": 0.71875, "rewards/chosen": -2.741083860397339, "rewards/margins": 0.8311758041381836, "rewards/rejected": -3.5722594261169434, "sft_loss": 2.9489662647247314, "step": 3605 }, { "epoch": 1.932095668171935, "grad_norm": 14.471201812410206, "learning_rate": 3.3863521299447514e-07, "logits/chosen": -0.2784094214439392, "logits/rejected": -0.0939759910106659, "logps/chosen": -2.7622246742248535, "logps/rejected": -3.6392664909362793, "loss": 0.5004, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.7622246742248535, "rewards/margins": 0.8770421147346497, "rewards/rejected": -3.6392664909362793, "sft_loss": 2.948101758956909, "step": 3610 }, { "epoch": 1.9347717009533367, "grad_norm": 15.877868402206891, "learning_rate": 3.371619302177609e-07, "logits/chosen": -0.15976648032665253, "logits/rejected": 0.0016909487312659621, "logps/chosen": -2.9228010177612305, "logps/rejected": -3.85103178024292, "loss": 0.5133, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.9228010177612305, "rewards/margins": 0.9282311201095581, "rewards/rejected": -3.85103178024292, "sft_loss": 3.01761794090271, "step": 3615 }, { "epoch": 1.9374477337347382, "grad_norm": 21.44056032546373, "learning_rate": 3.3569022729671393e-07, "logits/chosen": -0.1950550526380539, "logits/rejected": -0.07320324331521988, "logps/chosen": -3.0602307319641113, "logps/rejected": -3.849700927734375, "loss": 0.5615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.0602307319641113, "rewards/margins": 0.7894704937934875, "rewards/rejected": -3.849700927734375, "sft_loss": 3.249194622039795, "step": 3620 }, { "epoch": 1.9401237665161397, "grad_norm": 16.019071416415215, "learning_rate": 3.342201185098024e-07, "logits/chosen": -0.15915197134017944, "logits/rejected": -0.1630932092666626, "logps/chosen": -2.8209826946258545, "logps/rejected": -3.661130905151367, "loss": 0.5292, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.8209826946258545, "rewards/margins": 0.8401481509208679, "rewards/rejected": -3.661130905151367, "sft_loss": 2.9431231021881104, "step": 3625 }, { "epoch": 1.9427997992975414, "grad_norm": 15.905398766305188, "learning_rate": 3.3275161812002807e-07, "logits/chosen": -0.21201694011688232, "logits/rejected": -0.15958142280578613, "logps/chosen": -2.9415431022644043, "logps/rejected": -3.8751723766326904, "loss": 0.5645, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.9415431022644043, "rewards/margins": 0.9336287379264832, "rewards/rejected": -3.8751723766326904, "sft_loss": 3.1352477073669434, "step": 3630 }, { "epoch": 1.945475832078943, "grad_norm": 13.58100398003522, "learning_rate": 3.312847403747883e-07, "logits/chosen": -0.272103875875473, "logits/rejected": -0.13591468334197998, "logps/chosen": -2.8277993202209473, "logps/rejected": -3.806042432785034, "loss": 0.4966, "rewards/accuracies": 0.78125, "rewards/chosen": -2.8277993202209473, "rewards/margins": 0.9782431721687317, "rewards/rejected": -3.806042432785034, "sft_loss": 2.9858555793762207, "step": 3635 }, { "epoch": 1.9481518648603444, "grad_norm": 15.315418911990719, "learning_rate": 3.2981949950573733e-07, "logits/chosen": -0.21761062741279602, "logits/rejected": -0.12431806325912476, "logps/chosen": -2.9406609535217285, "logps/rejected": -3.739722490310669, "loss": 0.5394, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.9406609535217285, "rewards/margins": 0.7990615367889404, "rewards/rejected": -3.739722490310669, "sft_loss": 3.1177260875701904, "step": 3640 }, { "epoch": 1.9508278976417461, "grad_norm": 14.116155538156312, "learning_rate": 3.283559097286486e-07, "logits/chosen": -0.24495331943035126, "logits/rejected": -0.08875279128551483, "logps/chosen": -2.9264073371887207, "logps/rejected": -3.5987040996551514, "loss": 0.5746, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.9264073371887207, "rewards/margins": 0.6722965240478516, "rewards/rejected": -3.5987040996551514, "sft_loss": 3.0629003047943115, "step": 3645 }, { "epoch": 1.9535039304231478, "grad_norm": 17.25268549247319, "learning_rate": 3.268939852432765e-07, "logits/chosen": -0.29995864629745483, "logits/rejected": -0.17995992302894592, "logps/chosen": -3.043863296508789, "logps/rejected": -3.762787342071533, "loss": 0.5848, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -3.043863296508789, "rewards/margins": 0.7189238667488098, "rewards/rejected": -3.762787342071533, "sft_loss": 3.2166271209716797, "step": 3650 }, { "epoch": 1.9561799632045491, "grad_norm": 21.013060863581366, "learning_rate": 3.254337402332187e-07, "logits/chosen": -0.21988160908222198, "logits/rejected": -0.0836673155426979, "logps/chosen": -2.945904493331909, "logps/rejected": -3.8223743438720703, "loss": 0.5408, "rewards/accuracies": 0.75, "rewards/chosen": -2.945904493331909, "rewards/margins": 0.876469612121582, "rewards/rejected": -3.8223743438720703, "sft_loss": 3.0382790565490723, "step": 3655 }, { "epoch": 1.9588559959859508, "grad_norm": 13.024525263591862, "learning_rate": 3.239751888657788e-07, "logits/chosen": -0.2342025488615036, "logits/rejected": -0.06337428838014603, "logps/chosen": -2.926807165145874, "logps/rejected": -3.78998064994812, "loss": 0.5428, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.926807165145874, "rewards/margins": 0.8631734848022461, "rewards/rejected": -3.78998064994812, "sft_loss": 3.1329076290130615, "step": 3660 }, { "epoch": 1.9615320287673526, "grad_norm": 14.471523363662126, "learning_rate": 3.2251834529182856e-07, "logits/chosen": -0.2262742817401886, "logits/rejected": -0.10412702709436417, "logps/chosen": -2.7210283279418945, "logps/rejected": -3.677239179611206, "loss": 0.5285, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.7210283279418945, "rewards/margins": 0.9562112092971802, "rewards/rejected": -3.677239179611206, "sft_loss": 2.808276414871216, "step": 3665 }, { "epoch": 1.9642080615487538, "grad_norm": 13.1135910496621, "learning_rate": 3.2106322364567075e-07, "logits/chosen": -0.22553035616874695, "logits/rejected": -0.0694446787238121, "logps/chosen": -2.8147988319396973, "logps/rejected": -3.906506299972534, "loss": 0.4704, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.8147988319396973, "rewards/margins": 1.091707468032837, "rewards/rejected": -3.906506299972534, "sft_loss": 3.0503079891204834, "step": 3670 }, { "epoch": 1.9668840943301555, "grad_norm": 15.40942759554274, "learning_rate": 3.1960983804490183e-07, "logits/chosen": -0.2783694863319397, "logits/rejected": -0.10769981145858765, "logps/chosen": -3.0342190265655518, "logps/rejected": -3.9841713905334473, "loss": 0.5712, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -3.0342190265655518, "rewards/margins": 0.9499521255493164, "rewards/rejected": -3.9841713905334473, "sft_loss": 3.1858086585998535, "step": 3675 }, { "epoch": 1.9695601271115573, "grad_norm": 14.517911574149721, "learning_rate": 3.1815820259027537e-07, "logits/chosen": -0.26202255487442017, "logits/rejected": -0.11341051012277603, "logps/chosen": -2.6334633827209473, "logps/rejected": -3.5371620655059814, "loss": 0.4942, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.6334633827209473, "rewards/margins": 0.9036985635757446, "rewards/rejected": -3.5371620655059814, "sft_loss": 2.8220176696777344, "step": 3680 }, { "epoch": 1.9722361598929585, "grad_norm": 18.456034604568856, "learning_rate": 3.16708331365565e-07, "logits/chosen": -0.27187708020210266, "logits/rejected": -0.1584286093711853, "logps/chosen": -2.8514575958251953, "logps/rejected": -3.7883028984069824, "loss": 0.5393, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.8514575958251953, "rewards/margins": 0.936845600605011, "rewards/rejected": -3.7883028984069824, "sft_loss": 3.0907986164093018, "step": 3685 }, { "epoch": 1.9749121926743602, "grad_norm": 12.80151079203658, "learning_rate": 3.152602384374275e-07, "logits/chosen": -0.2671336829662323, "logits/rejected": -0.05357428267598152, "logps/chosen": -2.947993040084839, "logps/rejected": -3.795607089996338, "loss": 0.5436, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.947993040084839, "rewards/margins": 0.8476136922836304, "rewards/rejected": -3.795607089996338, "sft_loss": 3.047563076019287, "step": 3690 }, { "epoch": 1.977588225455762, "grad_norm": 16.203253604205564, "learning_rate": 3.1381393785526697e-07, "logits/chosen": -0.27621060609817505, "logits/rejected": -0.1704329550266266, "logps/chosen": -2.8942766189575195, "logps/rejected": -3.7458534240722656, "loss": 0.552, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.8942766189575195, "rewards/margins": 0.8515765070915222, "rewards/rejected": -3.7458534240722656, "sft_loss": 3.07773494720459, "step": 3695 }, { "epoch": 1.9802642582371635, "grad_norm": 14.229932105580046, "learning_rate": 3.123694436510979e-07, "logits/chosen": -0.19697576761245728, "logits/rejected": -0.03812349587678909, "logps/chosen": -2.7251546382904053, "logps/rejected": -3.6232516765594482, "loss": 0.5101, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.7251546382904053, "rewards/margins": 0.8980968594551086, "rewards/rejected": -3.6232516765594482, "sft_loss": 2.8783936500549316, "step": 3700 }, { "epoch": 1.982940291018565, "grad_norm": 17.557722448319616, "learning_rate": 3.1092676983940946e-07, "logits/chosen": -0.2563186287879944, "logits/rejected": -0.16103769838809967, "logps/chosen": -2.7463276386260986, "logps/rejected": -3.7818381786346436, "loss": 0.4905, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.7463276386260986, "rewards/margins": 1.035510540008545, "rewards/rejected": -3.7818381786346436, "sft_loss": 2.89477801322937, "step": 3705 }, { "epoch": 1.9856163237999667, "grad_norm": 16.39475902018491, "learning_rate": 3.094859304170293e-07, "logits/chosen": -0.06748291850090027, "logits/rejected": -0.017182841897010803, "logps/chosen": -2.8282206058502197, "logps/rejected": -3.6515190601348877, "loss": 0.5719, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.8282206058502197, "rewards/margins": 0.823298454284668, "rewards/rejected": -3.6515190601348877, "sft_loss": 3.028571367263794, "step": 3710 }, { "epoch": 1.9882923565813682, "grad_norm": 15.992315690449052, "learning_rate": 3.0804693936298795e-07, "logits/chosen": -0.17725564539432526, "logits/rejected": -0.10359780490398407, "logps/chosen": -2.8508968353271484, "logps/rejected": -3.920119047164917, "loss": 0.4938, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.8508968353271484, "rewards/margins": 1.069222331047058, "rewards/rejected": -3.920119047164917, "sft_loss": 3.0350348949432373, "step": 3715 }, { "epoch": 1.9909683893627697, "grad_norm": 14.889695737319636, "learning_rate": 3.066098106383826e-07, "logits/chosen": -0.24103884398937225, "logits/rejected": -0.11680476367473602, "logps/chosen": -2.8061740398406982, "logps/rejected": -3.612884521484375, "loss": 0.5453, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.8061740398406982, "rewards/margins": 0.806710422039032, "rewards/rejected": -3.612884521484375, "sft_loss": 2.8877646923065186, "step": 3720 }, { "epoch": 1.9936444221441714, "grad_norm": 14.487555363306377, "learning_rate": 3.0517455818624263e-07, "logits/chosen": -0.29338186979293823, "logits/rejected": -0.17560279369354248, "logps/chosen": -2.8024487495422363, "logps/rejected": -3.7356905937194824, "loss": 0.5019, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -2.8024487495422363, "rewards/margins": 0.9332420229911804, "rewards/rejected": -3.7356905937194824, "sft_loss": 3.047268867492676, "step": 3725 }, { "epoch": 1.9963204549255729, "grad_norm": 13.336958480256456, "learning_rate": 3.037411959313936e-07, "logits/chosen": -0.2274896800518036, "logits/rejected": -0.06707743555307388, "logps/chosen": -2.781489849090576, "logps/rejected": -3.660910129547119, "loss": 0.4864, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.781489849090576, "rewards/margins": 0.8794196844100952, "rewards/rejected": -3.660910129547119, "sft_loss": 2.9758944511413574, "step": 3730 }, { "epoch": 1.9989964877069744, "grad_norm": 16.98708079592768, "learning_rate": 3.023097377803224e-07, "logits/chosen": -0.14046138525009155, "logits/rejected": -0.0367421992123127, "logps/chosen": -2.9978785514831543, "logps/rejected": -3.839390993118286, "loss": 0.5814, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.9978785514831543, "rewards/margins": 0.8415123820304871, "rewards/rejected": -3.839390993118286, "sft_loss": 3.11506724357605, "step": 3735 }, { "epoch": 2.001672520488376, "grad_norm": 14.889260074472366, "learning_rate": 3.008801976210423e-07, "logits/chosen": -0.1395779699087143, "logits/rejected": -0.06194404885172844, "logps/chosen": -3.1212987899780273, "logps/rejected": -3.968535900115967, "loss": 0.5346, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.1212987899780273, "rewards/margins": 0.8472372889518738, "rewards/rejected": -3.968535900115967, "sft_loss": 3.2119712829589844, "step": 3740 }, { "epoch": 2.0043485532697773, "grad_norm": 14.158552312571112, "learning_rate": 2.994525893229581e-07, "logits/chosen": -0.16429729759693146, "logits/rejected": -0.05637521669268608, "logps/chosen": -2.970836877822876, "logps/rejected": -4.122104167938232, "loss": 0.4368, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.970836877822876, "rewards/margins": 1.1512675285339355, "rewards/rejected": -4.122104167938232, "sft_loss": 3.106628894805908, "step": 3745 }, { "epoch": 2.007024586051179, "grad_norm": 13.97493978051257, "learning_rate": 2.98026926736732e-07, "logits/chosen": -0.24882233142852783, "logits/rejected": -0.1182834729552269, "logps/chosen": -2.7795872688293457, "logps/rejected": -3.8998725414276123, "loss": 0.4589, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.7795872688293457, "rewards/margins": 1.1202853918075562, "rewards/rejected": -3.8998725414276123, "sft_loss": 3.035322427749634, "step": 3750 }, { "epoch": 2.0097006188325808, "grad_norm": 14.726312849685929, "learning_rate": 2.9660322369414846e-07, "logits/chosen": -0.24929046630859375, "logits/rejected": -0.0745077133178711, "logps/chosen": -2.9807448387145996, "logps/rejected": -4.177748203277588, "loss": 0.448, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.9807448387145996, "rewards/margins": 1.197003722190857, "rewards/rejected": -4.177748203277588, "sft_loss": 3.2656562328338623, "step": 3755 }, { "epoch": 2.0123766516139825, "grad_norm": 12.955406923867912, "learning_rate": 2.9518149400798063e-07, "logits/chosen": -0.27042073011398315, "logits/rejected": -0.1892492026090622, "logps/chosen": -3.0288338661193848, "logps/rejected": -4.288145065307617, "loss": 0.4451, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0288338661193848, "rewards/margins": 1.259311318397522, "rewards/rejected": -4.288145065307617, "sft_loss": 3.255460739135742, "step": 3760 }, { "epoch": 2.0150526843953838, "grad_norm": 21.026865395750054, "learning_rate": 2.9376175147185633e-07, "logits/chosen": -0.22757235169410706, "logits/rejected": 0.01834225468337536, "logps/chosen": -3.2413315773010254, "logps/rejected": -4.376957416534424, "loss": 0.5095, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2413315773010254, "rewards/margins": 1.1356260776519775, "rewards/rejected": -4.376957416534424, "sft_loss": 3.3733558654785156, "step": 3765 }, { "epoch": 2.0177287171767855, "grad_norm": 23.06269740719491, "learning_rate": 2.9234400986012376e-07, "logits/chosen": -0.3123210072517395, "logits/rejected": -0.10822536796331406, "logps/chosen": -3.0676515102386475, "logps/rejected": -4.425958156585693, "loss": 0.4553, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.0676515102386475, "rewards/margins": 1.3583061695098877, "rewards/rejected": -4.425958156585693, "sft_loss": 3.320253849029541, "step": 3770 }, { "epoch": 2.020404749958187, "grad_norm": 19.322245740661582, "learning_rate": 2.9092828292771817e-07, "logits/chosen": -0.18818703293800354, "logits/rejected": -0.12292011082172394, "logps/chosen": -3.184173583984375, "logps/rejected": -4.315966606140137, "loss": 0.476, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.184173583984375, "rewards/margins": 1.1317930221557617, "rewards/rejected": -4.315966606140137, "sft_loss": 3.3012537956237793, "step": 3775 }, { "epoch": 2.0230807827395885, "grad_norm": 13.414764005285019, "learning_rate": 2.8951458441002875e-07, "logits/chosen": -0.22242359817028046, "logits/rejected": -0.1718529760837555, "logps/chosen": -3.073848009109497, "logps/rejected": -4.259646415710449, "loss": 0.4569, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.073848009109497, "rewards/margins": 1.1857987642288208, "rewards/rejected": -4.259646415710449, "sft_loss": 3.235039472579956, "step": 3780 }, { "epoch": 2.02575681552099, "grad_norm": 12.06602753174118, "learning_rate": 2.881029280227643e-07, "logits/chosen": -0.2631004750728607, "logits/rejected": -0.06374208629131317, "logps/chosen": -3.1589930057525635, "logps/rejected": -4.320956230163574, "loss": 0.4747, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.1589930057525635, "rewards/margins": 1.1619632244110107, "rewards/rejected": -4.320956230163574, "sft_loss": 3.288616895675659, "step": 3785 }, { "epoch": 2.028432848302392, "grad_norm": 11.64879865176554, "learning_rate": 2.8669332746182177e-07, "logits/chosen": -0.3327762186527252, "logits/rejected": -0.13205161690711975, "logps/chosen": -3.0247673988342285, "logps/rejected": -4.253329277038574, "loss": 0.4586, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0247673988342285, "rewards/margins": 1.2285616397857666, "rewards/rejected": -4.253329277038574, "sft_loss": 3.222275972366333, "step": 3790 }, { "epoch": 2.031108881083793, "grad_norm": 15.875083033580784, "learning_rate": 2.8528579640315156e-07, "logits/chosen": -0.20314481854438782, "logits/rejected": -0.16578742861747742, "logps/chosen": -2.9212849140167236, "logps/rejected": -3.900330066680908, "loss": 0.5052, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.9212849140167236, "rewards/margins": 0.979045033454895, "rewards/rejected": -3.900330066680908, "sft_loss": 3.1145448684692383, "step": 3795 }, { "epoch": 2.033784913865195, "grad_norm": 23.02598168268979, "learning_rate": 2.8388034850262646e-07, "logits/chosen": -0.23519723117351532, "logits/rejected": -0.06606370955705643, "logps/chosen": -3.071678876876831, "logps/rejected": -4.214540958404541, "loss": 0.4638, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.071678876876831, "rewards/margins": 1.142861247062683, "rewards/rejected": -4.214540958404541, "sft_loss": 3.3099563121795654, "step": 3800 }, { "epoch": 2.0364609466465966, "grad_norm": 19.864249915483093, "learning_rate": 2.824769973959079e-07, "logits/chosen": -0.25034523010253906, "logits/rejected": -0.08123587816953659, "logps/chosen": -2.9977831840515137, "logps/rejected": -4.137685298919678, "loss": 0.4488, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.9977831840515137, "rewards/margins": 1.1399024724960327, "rewards/rejected": -4.137685298919678, "sft_loss": 3.1640961170196533, "step": 3805 }, { "epoch": 2.039136979427998, "grad_norm": 15.379477022203476, "learning_rate": 2.81075756698315e-07, "logits/chosen": -0.13126808404922485, "logits/rejected": -0.017083149403333664, "logps/chosen": -2.9781837463378906, "logps/rejected": -4.2689385414123535, "loss": 0.3996, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.9781837463378906, "rewards/margins": 1.290755033493042, "rewards/rejected": -4.2689385414123535, "sft_loss": 3.0259854793548584, "step": 3810 }, { "epoch": 2.0418130122093996, "grad_norm": 14.558688116421909, "learning_rate": 2.7967664000469035e-07, "logits/chosen": -0.31151649355888367, "logits/rejected": -0.16419212520122528, "logps/chosen": -3.105769634246826, "logps/rejected": -4.2047200202941895, "loss": 0.4558, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.105769634246826, "rewards/margins": 1.098949909210205, "rewards/rejected": -4.2047200202941895, "sft_loss": 3.147165536880493, "step": 3815 }, { "epoch": 2.0444890449908013, "grad_norm": 16.17521936665668, "learning_rate": 2.7827966088927095e-07, "logits/chosen": -0.33099812269210815, "logits/rejected": -0.04387823864817619, "logps/chosen": -3.1706812381744385, "logps/rejected": -4.422829627990723, "loss": 0.4525, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.1706812381744385, "rewards/margins": 1.2521488666534424, "rewards/rejected": -4.422829627990723, "sft_loss": 3.3813254833221436, "step": 3820 }, { "epoch": 2.0471650777722026, "grad_norm": 16.53156057156711, "learning_rate": 2.768848329055538e-07, "logits/chosen": -0.20927992463111877, "logits/rejected": -0.11633219569921494, "logps/chosen": -3.0672831535339355, "logps/rejected": -4.292285919189453, "loss": 0.4382, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.0672831535339355, "rewards/margins": 1.225002646446228, "rewards/rejected": -4.292285919189453, "sft_loss": 3.2913658618927, "step": 3825 }, { "epoch": 2.0498411105536043, "grad_norm": 19.536733489255592, "learning_rate": 2.7549216958616657e-07, "logits/chosen": -0.330092191696167, "logits/rejected": -0.1299157738685608, "logps/chosen": -3.3392879962921143, "logps/rejected": -4.565664291381836, "loss": 0.4668, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.3392879962921143, "rewards/margins": 1.2263761758804321, "rewards/rejected": -4.565664291381836, "sft_loss": 3.4869301319122314, "step": 3830 }, { "epoch": 2.052517143335006, "grad_norm": 14.002002293057318, "learning_rate": 2.741016844427344e-07, "logits/chosen": -0.22635014355182648, "logits/rejected": -0.006752826273441315, "logps/chosen": -3.167531728744507, "logps/rejected": -4.436515808105469, "loss": 0.4436, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.167531728744507, "rewards/margins": 1.2689836025238037, "rewards/rejected": -4.436515808105469, "sft_loss": 3.422799587249756, "step": 3835 }, { "epoch": 2.0551931761164073, "grad_norm": 14.720630160755997, "learning_rate": 2.7271339096575073e-07, "logits/chosen": -0.17786608636379242, "logits/rejected": -0.00022650808386970311, "logps/chosen": -3.0252976417541504, "logps/rejected": -4.256524562835693, "loss": 0.4593, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0252976417541504, "rewards/margins": 1.2312266826629639, "rewards/rejected": -4.256524562835693, "sft_loss": 3.2365882396698, "step": 3840 }, { "epoch": 2.057869208897809, "grad_norm": 14.236715915279673, "learning_rate": 2.713273026244446e-07, "logits/chosen": -0.3513855040073395, "logits/rejected": -0.06968530267477036, "logps/chosen": -3.218956470489502, "logps/rejected": -4.501593112945557, "loss": 0.4086, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.218956470489502, "rewards/margins": 1.2826364040374756, "rewards/rejected": -4.501593112945557, "sft_loss": 3.32515287399292, "step": 3845 }, { "epoch": 2.0605452416792107, "grad_norm": 17.603244982199804, "learning_rate": 2.6994343286665156e-07, "logits/chosen": -0.2870510220527649, "logits/rejected": -0.05223512649536133, "logps/chosen": -3.2820980548858643, "logps/rejected": -4.297463893890381, "loss": 0.507, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.2820980548858643, "rewards/margins": 1.0153658390045166, "rewards/rejected": -4.297463893890381, "sft_loss": 3.4639363288879395, "step": 3850 }, { "epoch": 2.063221274460612, "grad_norm": 18.77331919350168, "learning_rate": 2.6856179511868156e-07, "logits/chosen": -0.21658697724342346, "logits/rejected": 0.019917303696274757, "logps/chosen": -3.235149383544922, "logps/rejected": -4.659626007080078, "loss": 0.4438, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.235149383544922, "rewards/margins": 1.4244766235351562, "rewards/rejected": -4.659626007080078, "sft_loss": 3.3648905754089355, "step": 3855 }, { "epoch": 2.0658973072420137, "grad_norm": 15.47100105468003, "learning_rate": 2.6718240278519056e-07, "logits/chosen": -0.18325772881507874, "logits/rejected": -0.0073938206769526005, "logps/chosen": -3.246943950653076, "logps/rejected": -4.498908042907715, "loss": 0.4531, "rewards/accuracies": 0.84375, "rewards/chosen": -3.246943950653076, "rewards/margins": 1.2519642114639282, "rewards/rejected": -4.498908042907715, "sft_loss": 3.366358995437622, "step": 3860 }, { "epoch": 2.0685733400234154, "grad_norm": 19.67939759305146, "learning_rate": 2.6580526924904866e-07, "logits/chosen": -0.3614625930786133, "logits/rejected": -0.16604101657867432, "logps/chosen": -3.1864423751831055, "logps/rejected": -4.37761926651001, "loss": 0.4572, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1864423751831055, "rewards/margins": 1.1911770105361938, "rewards/rejected": -4.37761926651001, "sft_loss": 3.3482697010040283, "step": 3865 }, { "epoch": 2.0712493728048167, "grad_norm": 16.813465827880535, "learning_rate": 2.6443040787121186e-07, "logits/chosen": -0.34609368443489075, "logits/rejected": -0.20742973685264587, "logps/chosen": -3.072408676147461, "logps/rejected": -4.248513221740723, "loss": 0.4607, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.072408676147461, "rewards/margins": 1.1761040687561035, "rewards/rejected": -4.248513221740723, "sft_loss": 3.3169169425964355, "step": 3870 }, { "epoch": 2.0739254055862184, "grad_norm": 15.706136306241692, "learning_rate": 2.6305783199059084e-07, "logits/chosen": -0.2473127543926239, "logits/rejected": -0.11334657669067383, "logps/chosen": -3.205875873565674, "logps/rejected": -4.373950004577637, "loss": 0.4714, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.205875873565674, "rewards/margins": 1.168074131011963, "rewards/rejected": -4.373950004577637, "sft_loss": 3.4339394569396973, "step": 3875 }, { "epoch": 2.07660143836762, "grad_norm": 18.912546324244307, "learning_rate": 2.6168755492392324e-07, "logits/chosen": -0.3082582652568817, "logits/rejected": -0.09416376054286957, "logps/chosen": -2.921729564666748, "logps/rejected": -4.220498085021973, "loss": 0.4275, "rewards/accuracies": 0.84375, "rewards/chosen": -2.921729564666748, "rewards/margins": 1.2987688779830933, "rewards/rejected": -4.220498085021973, "sft_loss": 3.0420355796813965, "step": 3880 }, { "epoch": 2.0792774711490214, "grad_norm": 16.556211849700116, "learning_rate": 2.6031958996564274e-07, "logits/chosen": -0.29506513476371765, "logits/rejected": -0.14849936962127686, "logps/chosen": -2.925961971282959, "logps/rejected": -4.301008224487305, "loss": 0.4222, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.925961971282959, "rewards/margins": 1.3750462532043457, "rewards/rejected": -4.301008224487305, "sft_loss": 3.147723913192749, "step": 3885 }, { "epoch": 2.081953503930423, "grad_norm": 17.664433888001273, "learning_rate": 2.589539503877518e-07, "logits/chosen": -0.21779341995716095, "logits/rejected": -0.10009002685546875, "logps/chosen": -3.122642755508423, "logps/rejected": -4.227787971496582, "loss": 0.4997, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.122642755508423, "rewards/margins": 1.1051450967788696, "rewards/rejected": -4.227787971496582, "sft_loss": 3.3117828369140625, "step": 3890 }, { "epoch": 2.084629536711825, "grad_norm": 13.11518902003022, "learning_rate": 2.5759064943969125e-07, "logits/chosen": -0.29696202278137207, "logits/rejected": -0.024793457239866257, "logps/chosen": -3.1317408084869385, "logps/rejected": -4.314787864685059, "loss": 0.4773, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.1317408084869385, "rewards/margins": 1.1830475330352783, "rewards/rejected": -4.314787864685059, "sft_loss": 3.2217929363250732, "step": 3895 }, { "epoch": 2.087305569493226, "grad_norm": 14.38161932062421, "learning_rate": 2.562297003482131e-07, "logits/chosen": -0.19218644499778748, "logits/rejected": -0.13539327681064606, "logps/chosen": -3.163686752319336, "logps/rejected": -4.396138668060303, "loss": 0.4345, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.163686752319336, "rewards/margins": 1.2324519157409668, "rewards/rejected": -4.396138668060303, "sft_loss": 3.3125, "step": 3900 }, { "epoch": 2.089981602274628, "grad_norm": 17.436176806374725, "learning_rate": 2.548711163172512e-07, "logits/chosen": -0.21015885472297668, "logits/rejected": -0.09632865339517593, "logps/chosen": -3.2756333351135254, "logps/rejected": -4.456031799316406, "loss": 0.4933, "rewards/accuracies": 0.78125, "rewards/chosen": -3.2756333351135254, "rewards/margins": 1.180398941040039, "rewards/rejected": -4.456031799316406, "sft_loss": 3.378589630126953, "step": 3905 }, { "epoch": 2.0926576350560295, "grad_norm": 16.21246166612964, "learning_rate": 2.53514910527794e-07, "logits/chosen": -0.2180435210466385, "logits/rejected": -0.05177057906985283, "logps/chosen": -3.026956081390381, "logps/rejected": -4.208635330200195, "loss": 0.445, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.026956081390381, "rewards/margins": 1.181679129600525, "rewards/rejected": -4.208635330200195, "sft_loss": 3.2399840354919434, "step": 3910 }, { "epoch": 2.095333667837431, "grad_norm": 21.6260859408156, "learning_rate": 2.5216109613775573e-07, "logits/chosen": -0.2627881169319153, "logits/rejected": -0.0699787437915802, "logps/chosen": -3.3080992698669434, "logps/rejected": -4.408918380737305, "loss": 0.5084, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3080992698669434, "rewards/margins": 1.1008193492889404, "rewards/rejected": -4.408918380737305, "sft_loss": 3.465599536895752, "step": 3915 }, { "epoch": 2.0980097006188325, "grad_norm": 15.82214573668947, "learning_rate": 2.5080968628184993e-07, "logits/chosen": -0.24287445843219757, "logits/rejected": -0.05494864657521248, "logps/chosen": -3.1612963676452637, "logps/rejected": -4.5922040939331055, "loss": 0.4206, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1612963676452637, "rewards/margins": 1.4309074878692627, "rewards/rejected": -4.5922040939331055, "sft_loss": 3.247044324874878, "step": 3920 }, { "epoch": 2.1006857334002342, "grad_norm": 14.561044409130913, "learning_rate": 2.494606940714605e-07, "logits/chosen": -0.23816195130348206, "logits/rejected": -0.12281879037618637, "logps/chosen": -3.0465781688690186, "logps/rejected": -4.356485843658447, "loss": 0.4415, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.0465781688690186, "rewards/margins": 1.30990731716156, "rewards/rejected": -4.356485843658447, "sft_loss": 3.24699068069458, "step": 3925 }, { "epoch": 2.103361766181636, "grad_norm": 13.12575769056934, "learning_rate": 2.4811413259451625e-07, "logits/chosen": -0.3366604447364807, "logits/rejected": -0.11346001923084259, "logps/chosen": -3.1869969367980957, "logps/rejected": -4.55192756652832, "loss": 0.4457, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.1869969367980957, "rewards/margins": 1.3649308681488037, "rewards/rejected": -4.55192756652832, "sft_loss": 3.3118603229522705, "step": 3930 }, { "epoch": 2.106037798963037, "grad_norm": 14.285361906378304, "learning_rate": 2.46770014915362e-07, "logits/chosen": -0.25430670380592346, "logits/rejected": -0.12929832935333252, "logps/chosen": -3.152146577835083, "logps/rejected": -4.497963905334473, "loss": 0.4376, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.152146577835083, "rewards/margins": 1.3458168506622314, "rewards/rejected": -4.497963905334473, "sft_loss": 3.266214370727539, "step": 3935 }, { "epoch": 2.108713831744439, "grad_norm": 25.152286266240516, "learning_rate": 2.45428354074634e-07, "logits/chosen": -0.2446119487285614, "logits/rejected": -0.1413499414920807, "logps/chosen": -3.1099953651428223, "logps/rejected": -4.482461929321289, "loss": 0.4381, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.1099953651428223, "rewards/margins": 1.3724663257598877, "rewards/rejected": -4.482461929321289, "sft_loss": 3.1997177600860596, "step": 3940 }, { "epoch": 2.1113898645258407, "grad_norm": 19.556528330778583, "learning_rate": 2.4408916308913105e-07, "logits/chosen": -0.28477686643600464, "logits/rejected": -0.06802688539028168, "logps/chosen": -3.315284013748169, "logps/rejected": -4.377490043640137, "loss": 0.5111, "rewards/accuracies": 0.75, "rewards/chosen": -3.315284013748169, "rewards/margins": 1.0622055530548096, "rewards/rejected": -4.377490043640137, "sft_loss": 3.5316920280456543, "step": 3945 }, { "epoch": 2.114065897307242, "grad_norm": 26.935194574649227, "learning_rate": 2.4275245495169025e-07, "logits/chosen": -0.18549111485481262, "logits/rejected": 0.003963217604905367, "logps/chosen": -3.165619373321533, "logps/rejected": -4.436346054077148, "loss": 0.4532, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.165619373321533, "rewards/margins": 1.2707264423370361, "rewards/rejected": -4.436346054077148, "sft_loss": 3.3000197410583496, "step": 3950 }, { "epoch": 2.1167419300886436, "grad_norm": 18.550851560974007, "learning_rate": 2.414182426310597e-07, "logits/chosen": -0.28685250878334045, "logits/rejected": -0.19694644212722778, "logps/chosen": -3.164374828338623, "logps/rejected": -4.608752250671387, "loss": 0.4438, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.164374828338623, "rewards/margins": 1.444377064704895, "rewards/rejected": -4.608752250671387, "sft_loss": 3.336451292037964, "step": 3955 }, { "epoch": 2.1194179628700454, "grad_norm": 14.200310042989717, "learning_rate": 2.400865390717734e-07, "logits/chosen": -0.22424855828285217, "logits/rejected": -0.09154853969812393, "logps/chosen": -3.138336658477783, "logps/rejected": -4.685622215270996, "loss": 0.4002, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.138336658477783, "rewards/margins": 1.5472863912582397, "rewards/rejected": -4.685622215270996, "sft_loss": 3.3620047569274902, "step": 3960 }, { "epoch": 2.1220939956514466, "grad_norm": 20.604088367208963, "learning_rate": 2.3875735719402475e-07, "logits/chosen": -0.24657312035560608, "logits/rejected": -0.06659922748804092, "logps/chosen": -3.314816951751709, "logps/rejected": -4.729458808898926, "loss": 0.4234, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.314816951751709, "rewards/margins": 1.414642333984375, "rewards/rejected": -4.729458808898926, "sft_loss": 3.615452289581299, "step": 3965 }, { "epoch": 2.1247700284328483, "grad_norm": 16.19776372970058, "learning_rate": 2.3743070989354258e-07, "logits/chosen": -0.2126585692167282, "logits/rejected": -0.06898088753223419, "logps/chosen": -3.199605703353882, "logps/rejected": -4.575144290924072, "loss": 0.4858, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.199605703353882, "rewards/margins": 1.3755385875701904, "rewards/rejected": -4.575144290924072, "sft_loss": 3.5406627655029297, "step": 3970 }, { "epoch": 2.12744606121425, "grad_norm": 15.578792534141156, "learning_rate": 2.3610661004146454e-07, "logits/chosen": -0.19905905425548553, "logits/rejected": -0.027686545625329018, "logps/chosen": -3.0088601112365723, "logps/rejected": -4.292378902435303, "loss": 0.42, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.0088601112365723, "rewards/margins": 1.2835190296173096, "rewards/rejected": -4.292378902435303, "sft_loss": 3.179039478302002, "step": 3975 }, { "epoch": 2.1301220939956513, "grad_norm": 17.082743353736696, "learning_rate": 2.3478507048421314e-07, "logits/chosen": -0.26677241921424866, "logits/rejected": -0.1466524749994278, "logps/chosen": -2.986079692840576, "logps/rejected": -4.280380725860596, "loss": 0.4531, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.986079692840576, "rewards/margins": 1.2943007946014404, "rewards/rejected": -4.280380725860596, "sft_loss": 3.35664439201355, "step": 3980 }, { "epoch": 2.132798126777053, "grad_norm": 24.458290951910765, "learning_rate": 2.334661040433713e-07, "logits/chosen": -0.34359875321388245, "logits/rejected": -0.19697892665863037, "logps/chosen": -3.104626178741455, "logps/rejected": -4.322833061218262, "loss": 0.4684, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.104626178741455, "rewards/margins": 1.218206763267517, "rewards/rejected": -4.322833061218262, "sft_loss": 3.375357151031494, "step": 3985 }, { "epoch": 2.1354741595584548, "grad_norm": 15.926635139586462, "learning_rate": 2.321497235155568e-07, "logits/chosen": -0.3426477909088135, "logits/rejected": -0.17477676272392273, "logps/chosen": -3.032522678375244, "logps/rejected": -4.347482204437256, "loss": 0.4145, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.032522678375244, "rewards/margins": 1.3149597644805908, "rewards/rejected": -4.347482204437256, "sft_loss": 3.267840623855591, "step": 3990 }, { "epoch": 2.138150192339856, "grad_norm": 21.33616482570865, "learning_rate": 2.3083594167229965e-07, "logits/chosen": -0.362566739320755, "logits/rejected": -0.062093090265989304, "logps/chosen": -3.2226223945617676, "logps/rejected": -4.450702667236328, "loss": 0.47, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.2226223945617676, "rewards/margins": 1.2280802726745605, "rewards/rejected": -4.450702667236328, "sft_loss": 3.400771379470825, "step": 3995 }, { "epoch": 2.1408262251212578, "grad_norm": 24.458228224023852, "learning_rate": 2.295247712599167e-07, "logits/chosen": -0.2583773732185364, "logits/rejected": -0.14837753772735596, "logps/chosen": -3.1129956245422363, "logps/rejected": -4.425230026245117, "loss": 0.4702, "rewards/accuracies": 0.75, "rewards/chosen": -3.1129956245422363, "rewards/margins": 1.3122342824935913, "rewards/rejected": -4.425230026245117, "sft_loss": 3.2709991931915283, "step": 4000 }, { "epoch": 2.1408262251212578, "eval_logits/chosen": 0.11977552622556686, "eval_logits/rejected": 0.23950588703155518, "eval_logps/chosen": -3.3071038722991943, "eval_logps/rejected": -4.328495025634766, "eval_loss": 0.5591782927513123, "eval_rewards/accuracies": 0.7240356206893921, "eval_rewards/chosen": -3.3071038722991943, "eval_rewards/margins": 1.0213911533355713, "eval_rewards/rejected": -4.328495025634766, "eval_runtime": 51.006, "eval_samples_per_second": 26.369, "eval_sft_loss": 3.511914014816284, "eval_steps_per_second": 6.607, "step": 4000 }, { "epoch": 2.1435022579026595, "grad_norm": 15.306638141971865, "learning_rate": 2.2821622499938948e-07, "logits/chosen": -0.23230516910552979, "logits/rejected": 0.016825273633003235, "logps/chosen": -3.3846182823181152, "logps/rejected": -4.501579761505127, "loss": 0.4927, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3846182823181152, "rewards/margins": 1.1169617176055908, "rewards/rejected": -4.501579761505127, "sft_loss": 3.4896881580352783, "step": 4005 }, { "epoch": 2.1461782906840607, "grad_norm": 17.626588110370694, "learning_rate": 2.269103155862391e-07, "logits/chosen": -0.3005710244178772, "logits/rejected": -0.16064947843551636, "logps/chosen": -3.107633113861084, "logps/rejected": -4.182787895202637, "loss": 0.4992, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.107633113861084, "rewards/margins": 1.0751547813415527, "rewards/rejected": -4.182787895202637, "sft_loss": 3.2684624195098877, "step": 4010 }, { "epoch": 2.1488543234654625, "grad_norm": 14.643533459411545, "learning_rate": 2.2560705569040483e-07, "logits/chosen": -0.2704944908618927, "logits/rejected": 0.012443220242857933, "logps/chosen": -3.1479153633117676, "logps/rejected": -4.254956245422363, "loss": 0.5012, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.1479153633117676, "rewards/margins": 1.107041597366333, "rewards/rejected": -4.254956245422363, "sft_loss": 3.331916093826294, "step": 4015 }, { "epoch": 2.151530356246864, "grad_norm": 13.935643009683375, "learning_rate": 2.2430645795611963e-07, "logits/chosen": -0.35706567764282227, "logits/rejected": -0.17552462220191956, "logps/chosen": -3.247520923614502, "logps/rejected": -4.49759578704834, "loss": 0.448, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.247520923614502, "rewards/margins": 1.250075340270996, "rewards/rejected": -4.49759578704834, "sft_loss": 3.426701307296753, "step": 4020 }, { "epoch": 2.1542063890282654, "grad_norm": 21.06771273172547, "learning_rate": 2.230085350017884e-07, "logits/chosen": -0.26404106616973877, "logits/rejected": -0.12290849536657333, "logps/chosen": -3.0938751697540283, "logps/rejected": -4.20880651473999, "loss": 0.4914, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.0938751697540283, "rewards/margins": 1.1149311065673828, "rewards/rejected": -4.20880651473999, "sft_loss": 3.2841713428497314, "step": 4025 }, { "epoch": 2.156882421809667, "grad_norm": 13.90057948341053, "learning_rate": 2.2171329941986554e-07, "logits/chosen": -0.32073497772216797, "logits/rejected": -0.19450710713863373, "logps/chosen": -3.0455260276794434, "logps/rejected": -4.355099201202393, "loss": 0.3986, "rewards/accuracies": 0.875, "rewards/chosen": -3.0455260276794434, "rewards/margins": 1.3095730543136597, "rewards/rejected": -4.355099201202393, "sft_loss": 3.1791176795959473, "step": 4030 }, { "epoch": 2.159558454591069, "grad_norm": 17.86472232480832, "learning_rate": 2.2042076377673202e-07, "logits/chosen": -0.26653456687927246, "logits/rejected": -0.24267368018627167, "logps/chosen": -3.016118288040161, "logps/rejected": -4.04874324798584, "loss": 0.4991, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.016118288040161, "rewards/margins": 1.0326251983642578, "rewards/rejected": -4.04874324798584, "sft_loss": 3.2150261402130127, "step": 4035 }, { "epoch": 2.16223448737247, "grad_norm": 17.784423087700567, "learning_rate": 2.1913094061257476e-07, "logits/chosen": -0.2587827444076538, "logits/rejected": -0.23740462958812714, "logps/chosen": -3.0443973541259766, "logps/rejected": -4.254450798034668, "loss": 0.433, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.0443973541259766, "rewards/margins": 1.2100534439086914, "rewards/rejected": -4.254450798034668, "sft_loss": 3.1775622367858887, "step": 4040 }, { "epoch": 2.164910520153872, "grad_norm": 17.995001216291413, "learning_rate": 2.178438424412633e-07, "logits/chosen": -0.25195929408073425, "logits/rejected": -0.09768708795309067, "logps/chosen": -3.103498935699463, "logps/rejected": -4.227923393249512, "loss": 0.4917, "rewards/accuracies": 0.75, "rewards/chosen": -3.103498935699463, "rewards/margins": 1.1244243383407593, "rewards/rejected": -4.227923393249512, "sft_loss": 3.284104108810425, "step": 4045 }, { "epoch": 2.1675865529352736, "grad_norm": 22.167275790320975, "learning_rate": 2.165594817502302e-07, "logits/chosen": -0.32782620191574097, "logits/rejected": -0.1630801260471344, "logps/chosen": -3.380352020263672, "logps/rejected": -4.39149808883667, "loss": 0.521, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.380352020263672, "rewards/margins": 1.0111463069915771, "rewards/rejected": -4.39149808883667, "sft_loss": 3.6027674674987793, "step": 4050 }, { "epoch": 2.170262585716675, "grad_norm": 18.15649918008833, "learning_rate": 2.1527787100034806e-07, "logits/chosen": -0.2120496779680252, "logits/rejected": -0.1167217493057251, "logps/chosen": -3.2311127185821533, "logps/rejected": -4.212703227996826, "loss": 0.4863, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2311127185821533, "rewards/margins": 0.9815902709960938, "rewards/rejected": -4.212703227996826, "sft_loss": 3.3510711193084717, "step": 4055 }, { "epoch": 2.1729386184980766, "grad_norm": 17.64261135465353, "learning_rate": 2.1399902262581037e-07, "logits/chosen": -0.1881551444530487, "logits/rejected": 0.0027097121346741915, "logps/chosen": -3.259291887283325, "logps/rejected": -4.351016044616699, "loss": 0.5074, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.259291887283325, "rewards/margins": 1.0917246341705322, "rewards/rejected": -4.351016044616699, "sft_loss": 3.5189929008483887, "step": 4060 }, { "epoch": 2.1756146512794783, "grad_norm": 20.66924206249742, "learning_rate": 2.127229490340094e-07, "logits/chosen": -0.30917078256607056, "logits/rejected": -0.19973380863666534, "logps/chosen": -3.1926956176757812, "logps/rejected": -4.505526065826416, "loss": 0.4338, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.1926956176757812, "rewards/margins": 1.3128302097320557, "rewards/rejected": -4.505526065826416, "sft_loss": 3.4061808586120605, "step": 4065 }, { "epoch": 2.1782906840608796, "grad_norm": 24.810042393157584, "learning_rate": 2.1144966260541698e-07, "logits/chosen": -0.2025396078824997, "logits/rejected": 0.03697749227285385, "logps/chosen": -3.2227625846862793, "logps/rejected": -4.55974817276001, "loss": 0.4911, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.2227625846862793, "rewards/margins": 1.33698570728302, "rewards/rejected": -4.55974817276001, "sft_loss": 3.47778582572937, "step": 4070 }, { "epoch": 2.1809667168422813, "grad_norm": 17.448281363095408, "learning_rate": 2.1017917569346332e-07, "logits/chosen": -0.2623835802078247, "logits/rejected": -0.03134022280573845, "logps/chosen": -3.1874072551727295, "logps/rejected": -4.3760247230529785, "loss": 0.4427, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.1874072551727295, "rewards/margins": 1.188617467880249, "rewards/rejected": -4.3760247230529785, "sft_loss": 3.296792984008789, "step": 4075 }, { "epoch": 2.183642749623683, "grad_norm": 16.51718107754312, "learning_rate": 2.0891150062441837e-07, "logits/chosen": -0.30696502327919006, "logits/rejected": -0.14113807678222656, "logps/chosen": -3.2767624855041504, "logps/rejected": -4.557929039001465, "loss": 0.4563, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2767624855041504, "rewards/margins": 1.2811663150787354, "rewards/rejected": -4.557929039001465, "sft_loss": 3.394482135772705, "step": 4080 }, { "epoch": 2.1863187824050843, "grad_norm": 14.918701464472884, "learning_rate": 2.0764664969727086e-07, "logits/chosen": -0.22599594295024872, "logits/rejected": -0.140573650598526, "logps/chosen": -3.057241678237915, "logps/rejected": -4.294919013977051, "loss": 0.4291, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.057241678237915, "rewards/margins": 1.2376779317855835, "rewards/rejected": -4.294919013977051, "sft_loss": 3.1836822032928467, "step": 4085 }, { "epoch": 2.188994815186486, "grad_norm": 18.39548959285876, "learning_rate": 2.0638463518361033e-07, "logits/chosen": -0.3747314214706421, "logits/rejected": -0.12570782005786896, "logps/chosen": -3.098694324493408, "logps/rejected": -4.325566291809082, "loss": 0.4412, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.098694324493408, "rewards/margins": 1.2268718481063843, "rewards/rejected": -4.325566291809082, "sft_loss": 3.2901692390441895, "step": 4090 }, { "epoch": 2.1916708479678877, "grad_norm": 22.101033579028414, "learning_rate": 2.0512546932750702e-07, "logits/chosen": -0.29583343863487244, "logits/rejected": -0.16991981863975525, "logps/chosen": -3.3528189659118652, "logps/rejected": -4.421876907348633, "loss": 0.487, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.3528189659118652, "rewards/margins": 1.0690577030181885, "rewards/rejected": -4.421876907348633, "sft_loss": 3.499919891357422, "step": 4095 }, { "epoch": 2.194346880749289, "grad_norm": 19.77436652933417, "learning_rate": 2.0386916434539343e-07, "logits/chosen": -0.230976864695549, "logits/rejected": -0.025323525071144104, "logps/chosen": -3.058785915374756, "logps/rejected": -4.476016044616699, "loss": 0.3986, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.058785915374756, "rewards/margins": 1.4172298908233643, "rewards/rejected": -4.476016044616699, "sft_loss": 3.337048053741455, "step": 4100 }, { "epoch": 2.1970229135306907, "grad_norm": 17.634780116643828, "learning_rate": 2.0261573242594627e-07, "logits/chosen": -0.2085772007703781, "logits/rejected": 0.01965431496500969, "logps/chosen": -3.4317848682403564, "logps/rejected": -4.651656150817871, "loss": 0.4657, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.4317848682403564, "rewards/margins": 1.219870924949646, "rewards/rejected": -4.651656150817871, "sft_loss": 3.484034776687622, "step": 4105 }, { "epoch": 2.1996989463120924, "grad_norm": 27.154747986305658, "learning_rate": 2.0136518572996724e-07, "logits/chosen": -0.23308996856212616, "logits/rejected": 0.0261714868247509, "logps/chosen": -3.182469129562378, "logps/rejected": -4.524494171142578, "loss": 0.443, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.182469129562378, "rewards/margins": 1.342024803161621, "rewards/rejected": -4.524494171142578, "sft_loss": 3.384852170944214, "step": 4110 }, { "epoch": 2.202374979093494, "grad_norm": 18.084337518881036, "learning_rate": 2.0011753639026617e-07, "logits/chosen": -0.18893378973007202, "logits/rejected": -0.055924855172634125, "logps/chosen": -3.2787792682647705, "logps/rejected": -4.478785514831543, "loss": 0.4588, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.2787792682647705, "rewards/margins": 1.2000062465667725, "rewards/rejected": -4.478785514831543, "sft_loss": 3.457000732421875, "step": 4115 }, { "epoch": 2.2050510118748954, "grad_norm": 20.83277114561886, "learning_rate": 1.988727965115421e-07, "logits/chosen": -0.24318411946296692, "logits/rejected": -0.09616503119468689, "logps/chosen": -3.1696531772613525, "logps/rejected": -4.420243740081787, "loss": 0.4518, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1696531772613525, "rewards/margins": 1.250590443611145, "rewards/rejected": -4.420243740081787, "sft_loss": 3.418315887451172, "step": 4120 }, { "epoch": 2.207727044656297, "grad_norm": 16.275212367228942, "learning_rate": 1.9763097817026713e-07, "logits/chosen": -0.29338568449020386, "logits/rejected": -0.045070819556713104, "logps/chosen": -3.2043399810791016, "logps/rejected": -4.738910675048828, "loss": 0.3962, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.2043399810791016, "rewards/margins": 1.5345704555511475, "rewards/rejected": -4.738910675048828, "sft_loss": 3.4029903411865234, "step": 4125 }, { "epoch": 2.210403077437699, "grad_norm": 16.95359835468456, "learning_rate": 1.9639209341456796e-07, "logits/chosen": -0.14141836762428284, "logits/rejected": -0.013434246182441711, "logps/chosen": -3.340695858001709, "logps/rejected": -4.655120372772217, "loss": 0.462, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.340695858001709, "rewards/margins": 1.3144241571426392, "rewards/rejected": -4.655120372772217, "sft_loss": 3.5574791431427, "step": 4130 }, { "epoch": 2.2130791102191, "grad_norm": 14.211923543904996, "learning_rate": 1.951561542641102e-07, "logits/chosen": -0.10375924408435822, "logits/rejected": -0.10831058025360107, "logps/chosen": -3.3312363624572754, "logps/rejected": -4.6244707107543945, "loss": 0.5066, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3312363624572754, "rewards/margins": 1.2932345867156982, "rewards/rejected": -4.6244707107543945, "sft_loss": 3.4978721141815186, "step": 4135 }, { "epoch": 2.215755143000502, "grad_norm": 18.96497521571202, "learning_rate": 1.939231727099806e-07, "logits/chosen": -0.36096328496932983, "logits/rejected": -0.27226656675338745, "logps/chosen": -3.230943202972412, "logps/rejected": -4.421606540679932, "loss": 0.4769, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.230943202972412, "rewards/margins": 1.1906629800796509, "rewards/rejected": -4.421606540679932, "sft_loss": 3.3588504791259766, "step": 4140 }, { "epoch": 2.2184311757819035, "grad_norm": 17.525435531912834, "learning_rate": 1.926931607145719e-07, "logits/chosen": -0.09570387005805969, "logits/rejected": 0.07199952751398087, "logps/chosen": -3.4613890647888184, "logps/rejected": -4.609736442565918, "loss": 0.4874, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.4613890647888184, "rewards/margins": 1.148347020149231, "rewards/rejected": -4.609736442565918, "sft_loss": 3.61700439453125, "step": 4145 }, { "epoch": 2.221107208563305, "grad_norm": 15.292425629356083, "learning_rate": 1.9146613021146564e-07, "logits/chosen": -0.20875485241413116, "logits/rejected": -0.06416013836860657, "logps/chosen": -3.036895990371704, "logps/rejected": -4.218943119049072, "loss": 0.4766, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.036895990371704, "rewards/margins": 1.1820474863052368, "rewards/rejected": -4.218943119049072, "sft_loss": 3.233987808227539, "step": 4150 }, { "epoch": 2.2237832413447065, "grad_norm": 18.999966934345977, "learning_rate": 1.9024209310531736e-07, "logits/chosen": -0.17762920260429382, "logits/rejected": -0.15139761567115784, "logps/chosen": -3.201995372772217, "logps/rejected": -4.4034857749938965, "loss": 0.463, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.201995372772217, "rewards/margins": 1.2014906406402588, "rewards/rejected": -4.4034857749938965, "sft_loss": 3.354133129119873, "step": 4155 }, { "epoch": 2.2264592741261082, "grad_norm": 24.036186461034966, "learning_rate": 1.890210612717401e-07, "logits/chosen": -0.22414183616638184, "logits/rejected": -0.04488285258412361, "logps/chosen": -3.2703185081481934, "logps/rejected": -4.445802688598633, "loss": 0.4562, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.2703185081481934, "rewards/margins": 1.1754839420318604, "rewards/rejected": -4.445802688598633, "sft_loss": 3.437479019165039, "step": 4160 }, { "epoch": 2.2291353069075095, "grad_norm": 20.103343151615263, "learning_rate": 1.8780304655719054e-07, "logits/chosen": -0.21300466358661652, "logits/rejected": -0.032338377088308334, "logps/chosen": -3.191817283630371, "logps/rejected": -4.5335612297058105, "loss": 0.4473, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.191817283630371, "rewards/margins": 1.3417439460754395, "rewards/rejected": -4.5335612297058105, "sft_loss": 3.402716875076294, "step": 4165 }, { "epoch": 2.231811339688911, "grad_norm": 27.46925171279301, "learning_rate": 1.865880607788523e-07, "logits/chosen": -0.037474703043699265, "logits/rejected": 0.050803374499082565, "logps/chosen": -3.1691462993621826, "logps/rejected": -4.388891696929932, "loss": 0.4655, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.1691462993621826, "rewards/margins": 1.2197450399398804, "rewards/rejected": -4.388891696929932, "sft_loss": 3.487267255783081, "step": 4170 }, { "epoch": 2.234487372470313, "grad_norm": 23.89090887577621, "learning_rate": 1.8537611572452316e-07, "logits/chosen": -0.2057659924030304, "logits/rejected": -0.08916045725345612, "logps/chosen": -3.134235143661499, "logps/rejected": -4.19904088973999, "loss": 0.4793, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.134235143661499, "rewards/margins": 1.0648062229156494, "rewards/rejected": -4.19904088973999, "sft_loss": 3.349034547805786, "step": 4175 }, { "epoch": 2.237163405251714, "grad_norm": 17.81412265801276, "learning_rate": 1.84167223152499e-07, "logits/chosen": -0.2543816566467285, "logits/rejected": 0.004586332943290472, "logps/chosen": -3.1645865440368652, "logps/rejected": -4.45761251449585, "loss": 0.438, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.1645865440368652, "rewards/margins": 1.2930257320404053, "rewards/rejected": -4.45761251449585, "sft_loss": 3.3859825134277344, "step": 4180 }, { "epoch": 2.239839438033116, "grad_norm": 21.95605652435121, "learning_rate": 1.8296139479146112e-07, "logits/chosen": -0.2591497302055359, "logits/rejected": -0.19623364508152008, "logps/chosen": -3.06785249710083, "logps/rejected": -4.301652908325195, "loss": 0.4886, "rewards/accuracies": 0.78125, "rewards/chosen": -3.06785249710083, "rewards/margins": 1.2338005304336548, "rewards/rejected": -4.301652908325195, "sft_loss": 3.2426631450653076, "step": 4185 }, { "epoch": 2.2425154708145176, "grad_norm": 17.555340000578923, "learning_rate": 1.8175864234036132e-07, "logits/chosen": -0.07663850486278534, "logits/rejected": 0.023242291063070297, "logps/chosen": -3.1073408126831055, "logps/rejected": -4.364688873291016, "loss": 0.4727, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1073408126831055, "rewards/margins": 1.2573484182357788, "rewards/rejected": -4.364688873291016, "sft_loss": 3.2235629558563232, "step": 4190 }, { "epoch": 2.245191503595919, "grad_norm": 16.461587993114804, "learning_rate": 1.805589774683094e-07, "logits/chosen": -0.33367663621902466, "logits/rejected": -0.15988479554653168, "logps/chosen": -3.0423474311828613, "logps/rejected": -4.084300518035889, "loss": 0.4826, "rewards/accuracies": 0.78125, "rewards/chosen": -3.0423474311828613, "rewards/margins": 1.0419528484344482, "rewards/rejected": -4.084300518035889, "sft_loss": 3.2633557319641113, "step": 4195 }, { "epoch": 2.2478675363773206, "grad_norm": 19.07017109089392, "learning_rate": 1.79362411814459e-07, "logits/chosen": -0.0908198133111, "logits/rejected": -0.10554500669240952, "logps/chosen": -3.247302532196045, "logps/rejected": -4.188872814178467, "loss": 0.5486, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -3.247302532196045, "rewards/margins": 0.9415702819824219, "rewards/rejected": -4.188872814178467, "sft_loss": 3.4122467041015625, "step": 4200 }, { "epoch": 2.2505435691587223, "grad_norm": 15.418712620736029, "learning_rate": 1.7816895698789552e-07, "logits/chosen": -0.2686876654624939, "logits/rejected": -0.1589660346508026, "logps/chosen": -3.0749001502990723, "logps/rejected": -4.215682506561279, "loss": 0.4496, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.0749001502990723, "rewards/margins": 1.140782117843628, "rewards/rejected": -4.215682506561279, "sft_loss": 3.247279405593872, "step": 4205 }, { "epoch": 2.2532196019401236, "grad_norm": 14.329212400039616, "learning_rate": 1.7697862456752271e-07, "logits/chosen": -0.2475365698337555, "logits/rejected": -0.07634967565536499, "logps/chosen": -3.143423318862915, "logps/rejected": -4.52433967590332, "loss": 0.4364, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.143423318862915, "rewards/margins": 1.3809159994125366, "rewards/rejected": -4.52433967590332, "sft_loss": 3.3332431316375732, "step": 4210 }, { "epoch": 2.2558956347215253, "grad_norm": 18.48070952568049, "learning_rate": 1.7579142610195124e-07, "logits/chosen": -0.21497571468353271, "logits/rejected": -0.01970536634325981, "logps/chosen": -3.140994071960449, "logps/rejected": -4.3509111404418945, "loss": 0.4771, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.140994071960449, "rewards/margins": 1.2099171876907349, "rewards/rejected": -4.3509111404418945, "sft_loss": 3.2334461212158203, "step": 4215 }, { "epoch": 2.258571667502927, "grad_norm": 13.278055635617374, "learning_rate": 1.7460737310938568e-07, "logits/chosen": -0.2955940067768097, "logits/rejected": -0.019859764724969864, "logps/chosen": -2.979950428009033, "logps/rejected": -4.360445499420166, "loss": 0.413, "rewards/accuracies": 0.84375, "rewards/chosen": -2.979950428009033, "rewards/margins": 1.3804947137832642, "rewards/rejected": -4.360445499420166, "sft_loss": 3.183371067047119, "step": 4220 }, { "epoch": 2.2612477002843283, "grad_norm": 14.165936883872977, "learning_rate": 1.734264770775133e-07, "logits/chosen": -0.25720852613449097, "logits/rejected": 0.03133372589945793, "logps/chosen": -3.1578311920166016, "logps/rejected": -4.32140588760376, "loss": 0.4857, "rewards/accuracies": 0.75, "rewards/chosen": -3.1578311920166016, "rewards/margins": 1.1635749340057373, "rewards/rejected": -4.32140588760376, "sft_loss": 3.2801055908203125, "step": 4225 }, { "epoch": 2.26392373306573, "grad_norm": 16.085409013269818, "learning_rate": 1.7224874946339241e-07, "logits/chosen": -0.23832368850708008, "logits/rejected": -0.12906375527381897, "logps/chosen": -3.1299870014190674, "logps/rejected": -4.382236480712891, "loss": 0.4664, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.1299870014190674, "rewards/margins": 1.2522494792938232, "rewards/rejected": -4.382236480712891, "sft_loss": 3.1819064617156982, "step": 4230 }, { "epoch": 2.2665997658471317, "grad_norm": 14.745450418612242, "learning_rate": 1.7107420169334186e-07, "logits/chosen": -0.19955693185329437, "logits/rejected": -0.08378178626298904, "logps/chosen": -3.154358386993408, "logps/rejected": -4.298888206481934, "loss": 0.486, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.154358386993408, "rewards/margins": 1.1445305347442627, "rewards/rejected": -4.298888206481934, "sft_loss": 3.3586158752441406, "step": 4235 }, { "epoch": 2.269275798628533, "grad_norm": 16.213106248209986, "learning_rate": 1.6990284516282893e-07, "logits/chosen": -0.21659307181835175, "logits/rejected": -0.07814633846282959, "logps/chosen": -3.015805721282959, "logps/rejected": -4.245603561401367, "loss": 0.438, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.015805721282959, "rewards/margins": 1.2297978401184082, "rewards/rejected": -4.245603561401367, "sft_loss": 3.2131104469299316, "step": 4240 }, { "epoch": 2.2719518314099347, "grad_norm": 19.323395120992416, "learning_rate": 1.687346912363602e-07, "logits/chosen": -0.23957762122154236, "logits/rejected": -0.04110132157802582, "logps/chosen": -3.1102561950683594, "logps/rejected": -4.384282112121582, "loss": 0.4427, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1102561950683594, "rewards/margins": 1.2740260362625122, "rewards/rejected": -4.384282112121582, "sft_loss": 3.2916996479034424, "step": 4245 }, { "epoch": 2.2746278641913364, "grad_norm": 16.581508891925253, "learning_rate": 1.675697512473697e-07, "logits/chosen": -0.255021333694458, "logits/rejected": -0.0009258926147595048, "logps/chosen": -3.1822104454040527, "logps/rejected": -4.498100757598877, "loss": 0.4329, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.1822104454040527, "rewards/margins": 1.3158907890319824, "rewards/rejected": -4.498100757598877, "sft_loss": 3.278721332550049, "step": 4250 }, { "epoch": 2.2773038969727377, "grad_norm": 18.7455944053278, "learning_rate": 1.6640803649811087e-07, "logits/chosen": -0.23095567524433136, "logits/rejected": 0.0882527232170105, "logps/chosen": -3.2245659828186035, "logps/rejected": -4.542080879211426, "loss": 0.435, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.2245659828186035, "rewards/margins": 1.3175151348114014, "rewards/rejected": -4.542080879211426, "sft_loss": 3.3141732215881348, "step": 4255 }, { "epoch": 2.2799799297541394, "grad_norm": 21.117638663135256, "learning_rate": 1.6524955825954472e-07, "logits/chosen": -0.1814691722393036, "logits/rejected": -0.07471819221973419, "logps/chosen": -3.1403911113739014, "logps/rejected": -4.332183837890625, "loss": 0.4644, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1403911113739014, "rewards/margins": 1.1917924880981445, "rewards/rejected": -4.332183837890625, "sft_loss": 3.2049949169158936, "step": 4260 }, { "epoch": 2.282655962535541, "grad_norm": 15.627427866447883, "learning_rate": 1.6409432777123277e-07, "logits/chosen": -0.2731134295463562, "logits/rejected": -0.046000413596630096, "logps/chosen": -3.2276968955993652, "logps/rejected": -4.660434722900391, "loss": 0.4399, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.2276968955993652, "rewards/margins": 1.432737946510315, "rewards/rejected": -4.660434722900391, "sft_loss": 3.427386522293091, "step": 4265 }, { "epoch": 2.285331995316943, "grad_norm": 18.08578740103005, "learning_rate": 1.6294235624122577e-07, "logits/chosen": -0.138192817568779, "logits/rejected": 0.15803620219230652, "logps/chosen": -3.313798189163208, "logps/rejected": -4.528017997741699, "loss": 0.4846, "rewards/accuracies": 0.75, "rewards/chosen": -3.313798189163208, "rewards/margins": 1.2142199277877808, "rewards/rejected": -4.528017997741699, "sft_loss": 3.413130521774292, "step": 4270 }, { "epoch": 2.288008028098344, "grad_norm": 18.778224971256115, "learning_rate": 1.6179365484595697e-07, "logits/chosen": -0.20826223492622375, "logits/rejected": -0.047176480293273926, "logps/chosen": -3.294180393218994, "logps/rejected": -4.4106550216674805, "loss": 0.4984, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.294180393218994, "rewards/margins": 1.1164753437042236, "rewards/rejected": -4.4106550216674805, "sft_loss": 3.4413559436798096, "step": 4275 }, { "epoch": 2.290684060879746, "grad_norm": 17.130370672952917, "learning_rate": 1.60648234730132e-07, "logits/chosen": -0.22210946679115295, "logits/rejected": -0.08392616361379623, "logps/chosen": -3.220755100250244, "logps/rejected": -4.539450645446777, "loss": 0.4222, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.220755100250244, "rewards/margins": 1.3186959028244019, "rewards/rejected": -4.539450645446777, "sft_loss": 3.375286102294922, "step": 4280 }, { "epoch": 2.293360093661147, "grad_norm": 29.69947353654119, "learning_rate": 1.595061070066222e-07, "logits/chosen": -0.16046538949012756, "logits/rejected": -0.1463548094034195, "logps/chosen": -3.1819510459899902, "logps/rejected": -4.588136196136475, "loss": 0.4208, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1819510459899902, "rewards/margins": 1.4061849117279053, "rewards/rejected": -4.588136196136475, "sft_loss": 3.3452370166778564, "step": 4285 }, { "epoch": 2.296036126442549, "grad_norm": 25.87574365274802, "learning_rate": 1.5836728275635542e-07, "logits/chosen": -0.3043631911277771, "logits/rejected": -0.08773352950811386, "logps/chosen": -3.3570823669433594, "logps/rejected": -4.547373294830322, "loss": 0.4848, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.3570823669433594, "rewards/margins": 1.1902905702590942, "rewards/rejected": -4.547373294830322, "sft_loss": 3.4400806427001953, "step": 4290 }, { "epoch": 2.2987121592239506, "grad_norm": 23.134324933736, "learning_rate": 1.5723177302820984e-07, "logits/chosen": -0.2654676139354706, "logits/rejected": -0.1498531848192215, "logps/chosen": -3.3259081840515137, "logps/rejected": -4.402651786804199, "loss": 0.4778, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.3259081840515137, "rewards/margins": 1.076743483543396, "rewards/rejected": -4.402651786804199, "sft_loss": 3.395968198776245, "step": 4295 }, { "epoch": 2.3013881920053523, "grad_norm": 17.202436809824984, "learning_rate": 1.5609958883890544e-07, "logits/chosen": -0.18384288251399994, "logits/rejected": -0.0234028659760952, "logps/chosen": -3.232166290283203, "logps/rejected": -4.441971778869629, "loss": 0.4294, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.232166290283203, "rewards/margins": 1.2098052501678467, "rewards/rejected": -4.441971778869629, "sft_loss": 3.3029544353485107, "step": 4300 }, { "epoch": 2.3040642247867535, "grad_norm": 15.9017208826614, "learning_rate": 1.5497074117289865e-07, "logits/chosen": -0.3002277910709381, "logits/rejected": -0.1476120501756668, "logps/chosen": -3.1275136470794678, "logps/rejected": -4.544375419616699, "loss": 0.4294, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.1275136470794678, "rewards/margins": 1.416861653327942, "rewards/rejected": -4.544375419616699, "sft_loss": 3.3830864429473877, "step": 4305 }, { "epoch": 2.3067402575681553, "grad_norm": 17.702231436246656, "learning_rate": 1.5384524098227402e-07, "logits/chosen": -0.24221201241016388, "logits/rejected": 0.003594371723011136, "logps/chosen": -3.3001155853271484, "logps/rejected": -4.792794227600098, "loss": 0.3949, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.3001155853271484, "rewards/margins": 1.4926788806915283, "rewards/rejected": -4.792794227600098, "sft_loss": 3.482017993927002, "step": 4310 }, { "epoch": 2.3094162903495565, "grad_norm": 19.830644859083993, "learning_rate": 1.5272309918663974e-07, "logits/chosen": -0.21288709342479706, "logits/rejected": -0.005537429358810186, "logps/chosen": -3.3459548950195312, "logps/rejected": -4.385908126831055, "loss": 0.5239, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.3459548950195312, "rewards/margins": 1.0399537086486816, "rewards/rejected": -4.385908126831055, "sft_loss": 3.5866851806640625, "step": 4315 }, { "epoch": 2.3120923231309582, "grad_norm": 17.455952905286868, "learning_rate": 1.516043266730201e-07, "logits/chosen": -0.2241850644350052, "logits/rejected": -0.025332236662507057, "logps/chosen": -3.29628324508667, "logps/rejected": -4.559080600738525, "loss": 0.4467, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.29628324508667, "rewards/margins": 1.2627968788146973, "rewards/rejected": -4.559080600738525, "sft_loss": 3.4109034538269043, "step": 4320 }, { "epoch": 2.31476835591236, "grad_norm": 29.111777110334728, "learning_rate": 1.504889342957512e-07, "logits/chosen": -0.20829851925373077, "logits/rejected": -0.0016296729445457458, "logps/chosen": -3.3091533184051514, "logps/rejected": -4.5012311935424805, "loss": 0.5246, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.3091533184051514, "rewards/margins": 1.1920771598815918, "rewards/rejected": -4.5012311935424805, "sft_loss": 3.4718575477600098, "step": 4325 }, { "epoch": 2.3174443886937617, "grad_norm": 19.87629918740408, "learning_rate": 1.4937693287637453e-07, "logits/chosen": -0.22584417462348938, "logits/rejected": -0.039196573197841644, "logps/chosen": -3.3293514251708984, "logps/rejected": -4.511477470397949, "loss": 0.4871, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3293514251708984, "rewards/margins": 1.1821262836456299, "rewards/rejected": -4.511477470397949, "sft_loss": 3.4103927612304688, "step": 4330 }, { "epoch": 2.320120421475163, "grad_norm": 16.06874907463283, "learning_rate": 1.4826833320353305e-07, "logits/chosen": -0.22309021651744843, "logits/rejected": -0.1022438034415245, "logps/chosen": -3.1893844604492188, "logps/rejected": -4.463017463684082, "loss": 0.4437, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.1893844604492188, "rewards/margins": 1.2736327648162842, "rewards/rejected": -4.463017463684082, "sft_loss": 3.256054639816284, "step": 4335 }, { "epoch": 2.3227964542565647, "grad_norm": 21.328061639796502, "learning_rate": 1.4716314603286528e-07, "logits/chosen": -0.2732570767402649, "logits/rejected": -0.022668302059173584, "logps/chosen": -3.1785480976104736, "logps/rejected": -4.607804298400879, "loss": 0.3982, "rewards/accuracies": 0.84375, "rewards/chosen": -3.1785480976104736, "rewards/margins": 1.429255723953247, "rewards/rejected": -4.607804298400879, "sft_loss": 3.4070191383361816, "step": 4340 }, { "epoch": 2.3254724870379664, "grad_norm": 30.04601648990877, "learning_rate": 1.4606138208690233e-07, "logits/chosen": -0.2265649288892746, "logits/rejected": -0.12993165850639343, "logps/chosen": -3.307100772857666, "logps/rejected": -4.453924655914307, "loss": 0.5078, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.307100772857666, "rewards/margins": 1.1468241214752197, "rewards/rejected": -4.453924655914307, "sft_loss": 3.414353847503662, "step": 4345 }, { "epoch": 2.3281485198193677, "grad_norm": 16.1389666815883, "learning_rate": 1.4496305205496251e-07, "logits/chosen": -0.1859712153673172, "logits/rejected": -0.08136789500713348, "logps/chosen": -3.3702845573425293, "logps/rejected": -4.737895965576172, "loss": 0.4478, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.3702845573425293, "rewards/margins": 1.3676111698150635, "rewards/rejected": -4.737895965576172, "sft_loss": 3.4948036670684814, "step": 4350 }, { "epoch": 2.3308245526007694, "grad_norm": 15.328700754654726, "learning_rate": 1.4386816659304895e-07, "logits/chosen": -0.31797105073928833, "logits/rejected": -0.13324691355228424, "logps/chosen": -3.2428574562072754, "logps/rejected": -4.482692718505859, "loss": 0.4206, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -3.2428574562072754, "rewards/margins": 1.2398353815078735, "rewards/rejected": -4.482692718505859, "sft_loss": 3.4420928955078125, "step": 4355 }, { "epoch": 2.333500585382171, "grad_norm": 19.534348069559012, "learning_rate": 1.4277673632374492e-07, "logits/chosen": -0.28887489438056946, "logits/rejected": 0.003976461477577686, "logps/chosen": -3.2906270027160645, "logps/rejected": -4.527071952819824, "loss": 0.451, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.2906270027160645, "rewards/margins": 1.2364448308944702, "rewards/rejected": -4.527071952819824, "sft_loss": 3.4399611949920654, "step": 4360 }, { "epoch": 2.3361766181635724, "grad_norm": 15.875875054436587, "learning_rate": 1.416887718361119e-07, "logits/chosen": -0.12166640907526016, "logits/rejected": -0.08416490256786346, "logps/chosen": -3.2442119121551514, "logps/rejected": -4.411725044250488, "loss": 0.4747, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.2442119121551514, "rewards/margins": 1.1675128936767578, "rewards/rejected": -4.411725044250488, "sft_loss": 3.3793673515319824, "step": 4365 }, { "epoch": 2.338852650944974, "grad_norm": 21.08263580693356, "learning_rate": 1.406042836855859e-07, "logits/chosen": -0.1815926879644394, "logits/rejected": -0.031064201146364212, "logps/chosen": -3.109001874923706, "logps/rejected": -4.556884765625, "loss": 0.3986, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.109001874923706, "rewards/margins": 1.4478830099105835, "rewards/rejected": -4.556884765625, "sft_loss": 3.2362945079803467, "step": 4370 }, { "epoch": 2.341528683726376, "grad_norm": 23.273463422383422, "learning_rate": 1.3952328239387595e-07, "logits/chosen": -0.30537500977516174, "logits/rejected": -0.017103061079978943, "logps/chosen": -3.161263942718506, "logps/rejected": -4.516915798187256, "loss": 0.4535, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.161263942718506, "rewards/margins": 1.355651617050171, "rewards/rejected": -4.516915798187256, "sft_loss": 3.4046154022216797, "step": 4375 }, { "epoch": 2.344204716507777, "grad_norm": 17.58767621350646, "learning_rate": 1.3844577844886109e-07, "logits/chosen": -0.30194127559661865, "logits/rejected": -0.034553758800029755, "logps/chosen": -3.2942283153533936, "logps/rejected": -4.528740882873535, "loss": 0.4589, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2942283153533936, "rewards/margins": 1.2345125675201416, "rewards/rejected": -4.528740882873535, "sft_loss": 3.4428882598876953, "step": 4380 }, { "epoch": 2.346880749289179, "grad_norm": 21.7044093632797, "learning_rate": 1.3737178230448955e-07, "logits/chosen": -0.2997553050518036, "logits/rejected": -0.13085004687309265, "logps/chosen": -3.28190541267395, "logps/rejected": -4.441516399383545, "loss": 0.4993, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.28190541267395, "rewards/margins": 1.1596112251281738, "rewards/rejected": -4.441516399383545, "sft_loss": 3.4178061485290527, "step": 4385 }, { "epoch": 2.3495567820705805, "grad_norm": 15.029912018578793, "learning_rate": 1.363013043806764e-07, "logits/chosen": -0.2474454939365387, "logits/rejected": -0.08195488899946213, "logps/chosen": -3.1726186275482178, "logps/rejected": -4.371608257293701, "loss": 0.4506, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.1726186275482178, "rewards/margins": 1.1989902257919312, "rewards/rejected": -4.371608257293701, "sft_loss": 3.3866195678710938, "step": 4390 }, { "epoch": 2.3522328148519818, "grad_norm": 18.499020289783466, "learning_rate": 1.352343550632034e-07, "logits/chosen": -0.2626621127128601, "logits/rejected": -0.05871880054473877, "logps/chosen": -3.1768505573272705, "logps/rejected": -4.5565314292907715, "loss": 0.4419, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.1768505573272705, "rewards/margins": 1.3796809911727905, "rewards/rejected": -4.5565314292907715, "sft_loss": 3.2931389808654785, "step": 4395 }, { "epoch": 2.3549088476333835, "grad_norm": 15.70069909859834, "learning_rate": 1.3417094470361722e-07, "logits/chosen": -0.256940633058548, "logits/rejected": -0.08759448677301407, "logps/chosen": -3.290684938430786, "logps/rejected": -4.433601379394531, "loss": 0.4882, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.290684938430786, "rewards/margins": 1.1429167985916138, "rewards/rejected": -4.433601379394531, "sft_loss": 3.5128185749053955, "step": 4400 }, { "epoch": 2.3549088476333835, "eval_logits/chosen": 0.1602650135755539, "eval_logits/rejected": 0.28520262241363525, "eval_logps/chosen": -3.3795173168182373, "eval_logps/rejected": -4.435532569885254, "eval_loss": 0.560058057308197, "eval_rewards/accuracies": 0.7270029783248901, "eval_rewards/chosen": -3.3795173168182373, "eval_rewards/margins": 1.0560152530670166, "eval_rewards/rejected": -4.435532569885254, "eval_runtime": 51.3281, "eval_samples_per_second": 26.204, "eval_sft_loss": 3.520069122314453, "eval_steps_per_second": 6.566, "step": 4400 }, { "epoch": 2.357584880414785, "grad_norm": 17.075591569473236, "learning_rate": 1.3311108361913015e-07, "logits/chosen": -0.2871529161930084, "logits/rejected": -0.22709603607654572, "logps/chosen": -3.151932954788208, "logps/rejected": -4.415287971496582, "loss": 0.4184, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.151932954788208, "rewards/margins": 1.2633541822433472, "rewards/rejected": -4.415287971496582, "sft_loss": 3.2805488109588623, "step": 4405 }, { "epoch": 2.3602609131961865, "grad_norm": 14.639353219888728, "learning_rate": 1.3205478209251874e-07, "logits/chosen": -0.20357315242290497, "logits/rejected": -0.0712977796792984, "logps/chosen": -3.374850034713745, "logps/rejected": -4.77631950378418, "loss": 0.4383, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.374850034713745, "rewards/margins": 1.401469349861145, "rewards/rejected": -4.77631950378418, "sft_loss": 3.5529561042785645, "step": 4410 }, { "epoch": 2.362936945977588, "grad_norm": 14.59497078621693, "learning_rate": 1.310020503720254e-07, "logits/chosen": -0.21676579117774963, "logits/rejected": 0.007253182120621204, "logps/chosen": -3.2841403484344482, "logps/rejected": -4.623076438903809, "loss": 0.4617, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2841403484344482, "rewards/margins": 1.3389358520507812, "rewards/rejected": -4.623076438903809, "sft_loss": 3.3688995838165283, "step": 4415 }, { "epoch": 2.36561297875899, "grad_norm": 19.331359521271466, "learning_rate": 1.2995289867125752e-07, "logits/chosen": -0.22291286289691925, "logits/rejected": -0.1075035110116005, "logps/chosen": -3.3117096424102783, "logps/rejected": -4.385438442230225, "loss": 0.4872, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3117096424102783, "rewards/margins": 1.0737292766571045, "rewards/rejected": -4.385438442230225, "sft_loss": 3.410828113555908, "step": 4420 }, { "epoch": 2.368289011540391, "grad_norm": 15.560749043815754, "learning_rate": 1.2890733716908986e-07, "logits/chosen": -0.22724834084510803, "logits/rejected": -0.11599922180175781, "logps/chosen": -3.066779375076294, "logps/rejected": -4.344001293182373, "loss": 0.3823, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -3.066779375076294, "rewards/margins": 1.2772223949432373, "rewards/rejected": -4.344001293182373, "sft_loss": 3.265014171600342, "step": 4425 }, { "epoch": 2.370965044321793, "grad_norm": 21.602036138164067, "learning_rate": 1.2786537600956454e-07, "logits/chosen": -0.28180190920829773, "logits/rejected": -0.04159663990139961, "logps/chosen": -3.2508647441864014, "logps/rejected": -4.5376296043396, "loss": 0.4401, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.2508647441864014, "rewards/margins": 1.2867653369903564, "rewards/rejected": -4.5376296043396, "sft_loss": 3.342550754547119, "step": 4430 }, { "epoch": 2.3736410771031946, "grad_norm": 15.007896979622949, "learning_rate": 1.268270253017933e-07, "logits/chosen": -0.26863187551498413, "logits/rejected": -0.023631075397133827, "logps/chosen": -3.238671064376831, "logps/rejected": -4.531284332275391, "loss": 0.4468, "rewards/accuracies": 0.78125, "rewards/chosen": -3.238671064376831, "rewards/margins": 1.2926135063171387, "rewards/rejected": -4.531284332275391, "sft_loss": 3.482830047607422, "step": 4435 }, { "epoch": 2.376317109884596, "grad_norm": 15.623102106634239, "learning_rate": 1.257922951198591e-07, "logits/chosen": -0.3622228503227234, "logits/rejected": -0.029821058735251427, "logps/chosen": -3.1601808071136475, "logps/rejected": -4.420136451721191, "loss": 0.4584, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.1601808071136475, "rewards/margins": 1.2599560022354126, "rewards/rejected": -4.420136451721191, "sft_loss": 3.3236122131347656, "step": 4440 }, { "epoch": 2.3789931426659976, "grad_norm": 25.78092111116869, "learning_rate": 1.24761195502719e-07, "logits/chosen": -0.2920494079589844, "logits/rejected": -0.02239333651959896, "logps/chosen": -3.278372287750244, "logps/rejected": -4.271549224853516, "loss": 0.5356, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -3.278372287750244, "rewards/margins": 0.9931772947311401, "rewards/rejected": -4.271549224853516, "sft_loss": 3.4388442039489746, "step": 4445 }, { "epoch": 2.3816691754473993, "grad_norm": 21.12682235871762, "learning_rate": 1.2373373645410573e-07, "logits/chosen": -0.2140198051929474, "logits/rejected": -0.035846177488565445, "logps/chosen": -3.3080954551696777, "logps/rejected": -4.657708644866943, "loss": 0.4588, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.3080954551696777, "rewards/margins": 1.3496134281158447, "rewards/rejected": -4.657708644866943, "sft_loss": 3.464963436126709, "step": 4450 }, { "epoch": 2.384345208228801, "grad_norm": 19.12954636525194, "learning_rate": 1.2270992794243175e-07, "logits/chosen": -0.30315282940864563, "logits/rejected": -0.14045214653015137, "logps/chosen": -3.2173290252685547, "logps/rejected": -4.5085835456848145, "loss": 0.452, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.2173290252685547, "rewards/margins": 1.2912541627883911, "rewards/rejected": -4.5085835456848145, "sft_loss": 3.383535861968994, "step": 4455 }, { "epoch": 2.3870212410102023, "grad_norm": 12.617700363992414, "learning_rate": 1.2168977990069147e-07, "logits/chosen": -0.29781395196914673, "logits/rejected": -0.04217588156461716, "logps/chosen": -3.1576359272003174, "logps/rejected": -4.424050331115723, "loss": 0.4447, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.1576359272003174, "rewards/margins": 1.2664148807525635, "rewards/rejected": -4.424050331115723, "sft_loss": 3.398397922515869, "step": 4460 }, { "epoch": 2.389697273791604, "grad_norm": 21.343624441388958, "learning_rate": 1.206733022263659e-07, "logits/chosen": -0.2910090386867523, "logits/rejected": -0.040756385773420334, "logps/chosen": -3.4104666709899902, "logps/rejected": -4.600484371185303, "loss": 0.5062, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.4104666709899902, "rewards/margins": 1.1900174617767334, "rewards/rejected": -4.600484371185303, "sft_loss": 3.4890716075897217, "step": 4465 }, { "epoch": 2.3923733065730053, "grad_norm": 15.448295342736577, "learning_rate": 1.1966050478132572e-07, "logits/chosen": -0.20816560089588165, "logits/rejected": -0.08821313083171844, "logps/chosen": -3.159154176712036, "logps/rejected": -4.3316545486450195, "loss": 0.4997, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.159154176712036, "rewards/margins": 1.1724998950958252, "rewards/rejected": -4.3316545486450195, "sft_loss": 3.3820927143096924, "step": 4470 }, { "epoch": 2.395049339354407, "grad_norm": 20.533095086063796, "learning_rate": 1.1865139739173635e-07, "logits/chosen": -0.2584991157054901, "logits/rejected": -0.008012844249606133, "logps/chosen": -3.2850277423858643, "logps/rejected": -4.464526176452637, "loss": 0.4552, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.2850277423858643, "rewards/margins": 1.1794984340667725, "rewards/rejected": -4.464526176452637, "sft_loss": 3.3650975227355957, "step": 4475 }, { "epoch": 2.3977253721358087, "grad_norm": 18.04195391600987, "learning_rate": 1.1764598984796187e-07, "logits/chosen": -0.30945929884910583, "logits/rejected": -0.12186364829540253, "logps/chosen": -3.1606342792510986, "logps/rejected": -4.291658401489258, "loss": 0.4505, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1606342792510986, "rewards/margins": 1.1310243606567383, "rewards/rejected": -4.291658401489258, "sft_loss": 3.3320159912109375, "step": 4480 }, { "epoch": 2.4004014049172104, "grad_norm": 21.2521243699826, "learning_rate": 1.1664429190447095e-07, "logits/chosen": -0.16520674526691437, "logits/rejected": -0.06304512917995453, "logps/chosen": -3.2640156745910645, "logps/rejected": -4.534733772277832, "loss": 0.4605, "rewards/accuracies": 0.78125, "rewards/chosen": -3.2640156745910645, "rewards/margins": 1.2707182168960571, "rewards/rejected": -4.534733772277832, "sft_loss": 3.390946865081787, "step": 4485 }, { "epoch": 2.4030774376986117, "grad_norm": 23.375242113989113, "learning_rate": 1.1564631327974122e-07, "logits/chosen": -0.26219305396080017, "logits/rejected": -0.012254145927727222, "logps/chosen": -3.255249500274658, "logps/rejected": -4.6344709396362305, "loss": 0.4409, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.255249500274658, "rewards/margins": 1.3792214393615723, "rewards/rejected": -4.6344709396362305, "sft_loss": 3.460409641265869, "step": 4490 }, { "epoch": 2.4057534704800134, "grad_norm": 17.363393487201144, "learning_rate": 1.1465206365616587e-07, "logits/chosen": -0.3416324853897095, "logits/rejected": -0.07559507340192795, "logps/chosen": -3.3181662559509277, "logps/rejected": -4.4394731521606445, "loss": 0.4865, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.3181662559509277, "rewards/margins": 1.1213066577911377, "rewards/rejected": -4.4394731521606445, "sft_loss": 3.470642566680908, "step": 4495 }, { "epoch": 2.408429503261415, "grad_norm": 17.222664032980234, "learning_rate": 1.1366155267995887e-07, "logits/chosen": -0.19300106167793274, "logits/rejected": -0.13747408986091614, "logps/chosen": -3.1528539657592773, "logps/rejected": -4.371302604675293, "loss": 0.4426, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1528539657592773, "rewards/margins": 1.218449354171753, "rewards/rejected": -4.371302604675293, "sft_loss": 3.3510098457336426, "step": 4500 }, { "epoch": 2.4111055360428164, "grad_norm": 18.88858469752771, "learning_rate": 1.1267478996106228e-07, "logits/chosen": -0.24253828823566437, "logits/rejected": 0.018031585961580276, "logps/chosen": -3.231499195098877, "logps/rejected": -4.309110164642334, "loss": 0.4907, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.231499195098877, "rewards/margins": 1.077610731124878, "rewards/rejected": -4.309110164642334, "sft_loss": 3.3755271434783936, "step": 4505 }, { "epoch": 2.413781568824218, "grad_norm": 16.997664747937158, "learning_rate": 1.116917850730521e-07, "logits/chosen": -0.26118624210357666, "logits/rejected": -0.07530129700899124, "logps/chosen": -3.262040615081787, "logps/rejected": -4.351678371429443, "loss": 0.5123, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.262040615081787, "rewards/margins": 1.0896382331848145, "rewards/rejected": -4.351678371429443, "sft_loss": 3.352389097213745, "step": 4510 }, { "epoch": 2.41645760160562, "grad_norm": 16.598529612655412, "learning_rate": 1.1071254755304637e-07, "logits/chosen": -0.23019631206989288, "logits/rejected": -0.12985627353191376, "logps/chosen": -3.1173079013824463, "logps/rejected": -4.230454921722412, "loss": 0.5031, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.1173079013824463, "rewards/margins": 1.1131466627120972, "rewards/rejected": -4.230454921722412, "sft_loss": 3.243206024169922, "step": 4515 }, { "epoch": 2.419133634387021, "grad_norm": 17.53402921610423, "learning_rate": 1.0973708690161143e-07, "logits/chosen": -0.275122731924057, "logits/rejected": -0.12595012784004211, "logps/chosen": -3.2308623790740967, "logps/rejected": -4.467679023742676, "loss": 0.4458, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.2308623790740967, "rewards/margins": 1.2368162870407104, "rewards/rejected": -4.467679023742676, "sft_loss": 3.35723614692688, "step": 4520 }, { "epoch": 2.421809667168423, "grad_norm": 24.399068677605378, "learning_rate": 1.0876541258267119e-07, "logits/chosen": -0.31348997354507446, "logits/rejected": -0.07134351134300232, "logps/chosen": -3.2641544342041016, "logps/rejected": -4.654932975769043, "loss": 0.4228, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.2641544342041016, "rewards/margins": 1.39077889919281, "rewards/rejected": -4.654932975769043, "sft_loss": 3.3911826610565186, "step": 4525 }, { "epoch": 2.4244856999498245, "grad_norm": 19.399546547220996, "learning_rate": 1.0779753402341379e-07, "logits/chosen": -0.2883759140968323, "logits/rejected": -0.15745703876018524, "logps/chosen": -3.215301990509033, "logps/rejected": -4.2147536277771, "loss": 0.503, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.215301990509033, "rewards/margins": 0.9994519948959351, "rewards/rejected": -4.2147536277771, "sft_loss": 3.293431520462036, "step": 4530 }, { "epoch": 2.427161732731226, "grad_norm": 20.335051010407632, "learning_rate": 1.0683346061420157e-07, "logits/chosen": -0.15196876227855682, "logits/rejected": -0.02178243175148964, "logps/chosen": -3.069855213165283, "logps/rejected": -4.323353290557861, "loss": 0.479, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.069855213165283, "rewards/margins": 1.2534980773925781, "rewards/rejected": -4.323353290557861, "sft_loss": 3.3428139686584473, "step": 4535 }, { "epoch": 2.4298377655126275, "grad_norm": 17.277048138271496, "learning_rate": 1.0587320170847874e-07, "logits/chosen": -0.2203926295042038, "logits/rejected": -0.07835905998945236, "logps/chosen": -3.045024871826172, "logps/rejected": -4.137242317199707, "loss": 0.4971, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.045024871826172, "rewards/margins": 1.0922179222106934, "rewards/rejected": -4.137242317199707, "sft_loss": 3.2310516834259033, "step": 4540 }, { "epoch": 2.4325137982940293, "grad_norm": 15.126039297100926, "learning_rate": 1.0491676662268156e-07, "logits/chosen": -0.16868285834789276, "logits/rejected": -0.036202650517225266, "logps/chosen": -3.1142802238464355, "logps/rejected": -4.321670055389404, "loss": 0.4742, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1142802238464355, "rewards/margins": 1.2073900699615479, "rewards/rejected": -4.321670055389404, "sft_loss": 3.253473997116089, "step": 4545 }, { "epoch": 2.4351898310754305, "grad_norm": 25.445883199357247, "learning_rate": 1.0396416463614732e-07, "logits/chosen": -0.2712944746017456, "logits/rejected": -0.12396593391895294, "logps/chosen": -3.0776920318603516, "logps/rejected": -4.236860275268555, "loss": 0.4902, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.0776920318603516, "rewards/margins": 1.1591686010360718, "rewards/rejected": -4.236860275268555, "sft_loss": 3.244083881378174, "step": 4550 }, { "epoch": 2.4378658638568322, "grad_norm": 16.405844130937062, "learning_rate": 1.0301540499102479e-07, "logits/chosen": -0.20970351994037628, "logits/rejected": -0.0833965465426445, "logps/chosen": -3.313142776489258, "logps/rejected": -4.349070072174072, "loss": 0.5185, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.313142776489258, "rewards/margins": 1.0359264612197876, "rewards/rejected": -4.349070072174072, "sft_loss": 3.5295753479003906, "step": 4555 }, { "epoch": 2.440541896638234, "grad_norm": 20.700026707905256, "learning_rate": 1.0207049689218405e-07, "logits/chosen": -0.26775461435317993, "logits/rejected": -0.011146956123411655, "logps/chosen": -3.2772324085235596, "logps/rejected": -4.6195173263549805, "loss": 0.4729, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.2772324085235596, "rewards/margins": 1.342284917831421, "rewards/rejected": -4.6195173263549805, "sft_loss": 3.369673252105713, "step": 4560 }, { "epoch": 2.4432179294196352, "grad_norm": 15.269201220575487, "learning_rate": 1.0112944950712782e-07, "logits/chosen": -0.2290545403957367, "logits/rejected": -0.06699430197477341, "logps/chosen": -3.1508140563964844, "logps/rejected": -4.458221435546875, "loss": 0.4326, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.1508140563964844, "rewards/margins": 1.307407259941101, "rewards/rejected": -4.458221435546875, "sft_loss": 3.215066909790039, "step": 4565 }, { "epoch": 2.445893962201037, "grad_norm": 19.307766819089892, "learning_rate": 1.0019227196590174e-07, "logits/chosen": -0.1831798106431961, "logits/rejected": 0.01658095046877861, "logps/chosen": -3.2275054454803467, "logps/rejected": -4.416240692138672, "loss": 0.4878, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2275054454803467, "rewards/margins": 1.1887353658676147, "rewards/rejected": -4.416240692138672, "sft_loss": 3.3482584953308105, "step": 4570 }, { "epoch": 2.4485699949824387, "grad_norm": 17.31052507702184, "learning_rate": 9.925897336100664e-08, "logits/chosen": -0.13451997935771942, "logits/rejected": -0.033201027661561966, "logps/chosen": -3.103423595428467, "logps/rejected": -4.447488307952881, "loss": 0.4124, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.103423595428467, "rewards/margins": 1.3440649509429932, "rewards/rejected": -4.447488307952881, "sft_loss": 3.27244234085083, "step": 4575 }, { "epoch": 2.45124602776384, "grad_norm": 21.874282682259533, "learning_rate": 9.832956274730946e-08, "logits/chosen": -0.19495446979999542, "logits/rejected": -0.11899854987859726, "logps/chosen": -3.0447566509246826, "logps/rejected": -4.065402984619141, "loss": 0.5101, "rewards/accuracies": 0.75, "rewards/chosen": -3.0447566509246826, "rewards/margins": 1.020646333694458, "rewards/rejected": -4.065402984619141, "sft_loss": 3.20434308052063, "step": 4580 }, { "epoch": 2.4539220605452416, "grad_norm": 18.616466987766536, "learning_rate": 9.740404914195633e-08, "logits/chosen": -0.22558899223804474, "logits/rejected": -0.0379658117890358, "logps/chosen": -3.157686710357666, "logps/rejected": -4.34769868850708, "loss": 0.4652, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.157686710357666, "rewards/margins": 1.190011739730835, "rewards/rejected": -4.34769868850708, "sft_loss": 3.3493080139160156, "step": 4585 }, { "epoch": 2.4565980933266434, "grad_norm": 12.986218562492288, "learning_rate": 9.648244152428392e-08, "logits/chosen": -0.28454527258872986, "logits/rejected": -0.11226551234722137, "logps/chosen": -3.0807158946990967, "logps/rejected": -4.219048500061035, "loss": 0.4639, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0807158946990967, "rewards/margins": 1.1383326053619385, "rewards/rejected": -4.219048500061035, "sft_loss": 3.284151077270508, "step": 4590 }, { "epoch": 2.4592741261080446, "grad_norm": 18.4713320773109, "learning_rate": 9.556474883573379e-08, "logits/chosen": -0.2847925126552582, "logits/rejected": -0.12773391604423523, "logps/chosen": -3.047091007232666, "logps/rejected": -4.422765254974365, "loss": 0.4642, "rewards/accuracies": 0.78125, "rewards/chosen": -3.047091007232666, "rewards/margins": 1.3756738901138306, "rewards/rejected": -4.422765254974365, "sft_loss": 3.1898136138916016, "step": 4595 }, { "epoch": 2.4619501588894463, "grad_norm": 13.479154984246103, "learning_rate": 9.465097997976412e-08, "logits/chosen": -0.25030630826950073, "logits/rejected": 0.016726698726415634, "logps/chosen": -3.1371049880981445, "logps/rejected": -4.500518798828125, "loss": 0.421, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.1371049880981445, "rewards/margins": 1.363413691520691, "rewards/rejected": -4.500518798828125, "sft_loss": 3.3470330238342285, "step": 4600 }, { "epoch": 2.464626191670848, "grad_norm": 17.62918485569358, "learning_rate": 9.374114382176457e-08, "logits/chosen": -0.2329416275024414, "logits/rejected": -0.02414068579673767, "logps/chosen": -3.3003411293029785, "logps/rejected": -4.541224002838135, "loss": 0.4822, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.3003411293029785, "rewards/margins": 1.2408829927444458, "rewards/rejected": -4.541224002838135, "sft_loss": 3.5061416625976562, "step": 4605 }, { "epoch": 2.46730222445225, "grad_norm": 18.57472213271513, "learning_rate": 9.283524918896945e-08, "logits/chosen": -0.26498347520828247, "logits/rejected": -0.09776406735181808, "logps/chosen": -3.2808425426483154, "logps/rejected": -4.538356781005859, "loss": 0.4765, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.2808425426483154, "rewards/margins": 1.2575145959854126, "rewards/rejected": -4.538356781005859, "sft_loss": 3.3779423236846924, "step": 4610 }, { "epoch": 2.469978257233651, "grad_norm": 17.256523520599416, "learning_rate": 9.193330487037232e-08, "logits/chosen": -0.2040288746356964, "logits/rejected": 0.01880154386162758, "logps/chosen": -3.298178195953369, "logps/rejected": -4.639803886413574, "loss": 0.4535, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.298178195953369, "rewards/margins": 1.3416259288787842, "rewards/rejected": -4.639803886413574, "sft_loss": 3.4679908752441406, "step": 4615 }, { "epoch": 2.4726542900150528, "grad_norm": 16.880230708349227, "learning_rate": 9.103531961664118e-08, "logits/chosen": -0.20273037254810333, "logits/rejected": 0.042335350066423416, "logps/chosen": -3.030534029006958, "logps/rejected": -4.304599761962891, "loss": 0.4104, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.030534029006958, "rewards/margins": 1.2740657329559326, "rewards/rejected": -4.304599761962891, "sft_loss": 3.2851033210754395, "step": 4620 }, { "epoch": 2.475330322796454, "grad_norm": 17.230042736206773, "learning_rate": 9.014130214003269e-08, "logits/chosen": -0.2727370858192444, "logits/rejected": -0.240513414144516, "logps/chosen": -3.1677136421203613, "logps/rejected": -4.453676223754883, "loss": 0.4537, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.1677136421203613, "rewards/margins": 1.285962462425232, "rewards/rejected": -4.453676223754883, "sft_loss": 3.2961056232452393, "step": 4625 }, { "epoch": 2.4780063555778558, "grad_norm": 19.477289020834274, "learning_rate": 8.925126111430848e-08, "logits/chosen": -0.1317545622587204, "logits/rejected": -0.0074939606711268425, "logps/chosen": -3.152268886566162, "logps/rejected": -4.429300308227539, "loss": 0.4496, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.152268886566162, "rewards/margins": 1.277031660079956, "rewards/rejected": -4.429300308227539, "sft_loss": 3.366619825363159, "step": 4630 }, { "epoch": 2.4806823883592575, "grad_norm": 21.47276325117744, "learning_rate": 8.83652051746504e-08, "logits/chosen": -0.1325719654560089, "logits/rejected": 0.07381542026996613, "logps/chosen": -3.3022446632385254, "logps/rejected": -4.655218601226807, "loss": 0.4566, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.3022446632385254, "rewards/margins": 1.3529744148254395, "rewards/rejected": -4.655218601226807, "sft_loss": 3.4520957469940186, "step": 4635 }, { "epoch": 2.483358421140659, "grad_norm": 15.970436118864976, "learning_rate": 8.748314291757696e-08, "logits/chosen": -0.19755138456821442, "logits/rejected": -0.045385174453258514, "logps/chosen": -3.224997043609619, "logps/rejected": -4.416315078735352, "loss": 0.4601, "rewards/accuracies": 0.8125, "rewards/chosen": -3.224997043609619, "rewards/margins": 1.1913175582885742, "rewards/rejected": -4.416315078735352, "sft_loss": 3.3598480224609375, "step": 4640 }, { "epoch": 2.4860344539220605, "grad_norm": 17.382939228249764, "learning_rate": 8.660508290086032e-08, "logits/chosen": -0.23071089386940002, "logits/rejected": -0.038423918187618256, "logps/chosen": -3.2272229194641113, "logps/rejected": -4.533236503601074, "loss": 0.4435, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.2272229194641113, "rewards/margins": 1.306014060974121, "rewards/rejected": -4.533236503601074, "sft_loss": 3.416638135910034, "step": 4645 }, { "epoch": 2.488710486703462, "grad_norm": 21.010187765561493, "learning_rate": 8.573103364344231e-08, "logits/chosen": -0.292992502450943, "logits/rejected": -0.0057989866472780704, "logps/chosen": -3.13838267326355, "logps/rejected": -4.402870178222656, "loss": 0.4478, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.13838267326355, "rewards/margins": 1.2644875049591064, "rewards/rejected": -4.402870178222656, "sft_loss": 3.1901614665985107, "step": 4650 }, { "epoch": 2.4913865194848634, "grad_norm": 20.868876516980052, "learning_rate": 8.486100362535292e-08, "logits/chosen": -0.2437940388917923, "logits/rejected": -0.0457330122590065, "logps/chosen": -3.2630112171173096, "logps/rejected": -4.354058742523193, "loss": 0.4887, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.2630112171173096, "rewards/margins": 1.091047763824463, "rewards/rejected": -4.354058742523193, "sft_loss": 3.476834774017334, "step": 4655 }, { "epoch": 2.494062552266265, "grad_norm": 14.923011648501324, "learning_rate": 8.399500128762693e-08, "logits/chosen": -0.20912370085716248, "logits/rejected": -0.05193439871072769, "logps/chosen": -3.2698898315429688, "logps/rejected": -4.5080246925354, "loss": 0.4386, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.2698898315429688, "rewards/margins": 1.2381350994110107, "rewards/rejected": -4.5080246925354, "sft_loss": 3.34809947013855, "step": 4660 }, { "epoch": 2.496738585047667, "grad_norm": 19.72037202498631, "learning_rate": 8.313303503222313e-08, "logits/chosen": -0.2273527830839157, "logits/rejected": -0.1118924468755722, "logps/chosen": -3.168461561203003, "logps/rejected": -4.356649875640869, "loss": 0.4783, "rewards/accuracies": 0.78125, "rewards/chosen": -3.168461561203003, "rewards/margins": 1.1881887912750244, "rewards/rejected": -4.356649875640869, "sft_loss": 3.3031134605407715, "step": 4665 }, { "epoch": 2.4994146178290686, "grad_norm": 20.934731505678513, "learning_rate": 8.227511322194164e-08, "logits/chosen": -0.24165849387645721, "logits/rejected": -0.05451079457998276, "logps/chosen": -3.0880351066589355, "logps/rejected": -4.163828372955322, "loss": 0.4786, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.0880351066589355, "rewards/margins": 1.0757930278778076, "rewards/rejected": -4.163828372955322, "sft_loss": 3.1622893810272217, "step": 4670 }, { "epoch": 2.50209065061047, "grad_norm": 26.497265788901736, "learning_rate": 8.142124418034385e-08, "logits/chosen": -0.15390422940254211, "logits/rejected": 0.05969760939478874, "logps/chosen": -3.1597721576690674, "logps/rejected": -4.265137672424316, "loss": 0.5263, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.1597721576690674, "rewards/margins": 1.1053650379180908, "rewards/rejected": -4.265137672424316, "sft_loss": 3.3072402477264404, "step": 4675 }, { "epoch": 2.5047666833918716, "grad_norm": 22.961685650281634, "learning_rate": 8.057143619167073e-08, "logits/chosen": -0.14675331115722656, "logits/rejected": -0.010864680632948875, "logps/chosen": -3.1171987056732178, "logps/rejected": -4.339387893676758, "loss": 0.48, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1171987056732178, "rewards/margins": 1.2221894264221191, "rewards/rejected": -4.339387893676758, "sft_loss": 3.211840867996216, "step": 4680 }, { "epoch": 2.507442716173273, "grad_norm": 12.264521340299805, "learning_rate": 7.97256975007633e-08, "logits/chosen": -0.24845509231090546, "logits/rejected": 0.022098522633314133, "logps/chosen": -3.1032533645629883, "logps/rejected": -4.3729448318481445, "loss": 0.434, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.1032533645629883, "rewards/margins": 1.2696917057037354, "rewards/rejected": -4.3729448318481445, "sft_loss": 3.2458713054656982, "step": 4685 }, { "epoch": 2.5101187489546746, "grad_norm": 17.017415580056074, "learning_rate": 7.888403631298186e-08, "logits/chosen": -0.15666857361793518, "logits/rejected": -0.0690535455942154, "logps/chosen": -3.0472021102905273, "logps/rejected": -4.24338960647583, "loss": 0.4723, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.0472021102905273, "rewards/margins": 1.1961876153945923, "rewards/rejected": -4.24338960647583, "sft_loss": 3.148030996322632, "step": 4690 }, { "epoch": 2.5127947817360763, "grad_norm": 15.608211137207162, "learning_rate": 7.804646079412719e-08, "logits/chosen": -0.17254853248596191, "logits/rejected": 0.058733534067869186, "logps/chosen": -3.291088819503784, "logps/rejected": -4.588837146759033, "loss": 0.455, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.291088819503784, "rewards/margins": 1.2977479696273804, "rewards/rejected": -4.588837146759033, "sft_loss": 3.428480863571167, "step": 4695 }, { "epoch": 2.515470814517478, "grad_norm": 16.368817132261125, "learning_rate": 7.72129790703604e-08, "logits/chosen": -0.28066256642341614, "logits/rejected": -0.10028629004955292, "logps/chosen": -3.1184608936309814, "logps/rejected": -4.201938629150391, "loss": 0.4842, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1184608936309814, "rewards/margins": 1.0834776163101196, "rewards/rejected": -4.201938629150391, "sft_loss": 3.340056896209717, "step": 4700 }, { "epoch": 2.5181468472988793, "grad_norm": 21.002405475983192, "learning_rate": 7.638359922812504e-08, "logits/chosen": -0.16717539727687836, "logits/rejected": -0.06876533478498459, "logps/chosen": -3.1695704460144043, "logps/rejected": -4.404815673828125, "loss": 0.481, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1695704460144043, "rewards/margins": 1.2352455854415894, "rewards/rejected": -4.404815673828125, "sft_loss": 3.212214708328247, "step": 4705 }, { "epoch": 2.520822880080281, "grad_norm": 21.183191402271422, "learning_rate": 7.555832931406774e-08, "logits/chosen": -0.2358173429965973, "logits/rejected": 0.015494110994040966, "logps/chosen": -3.171396493911743, "logps/rejected": -4.4364213943481445, "loss": 0.454, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.171396493911743, "rewards/margins": 1.2650253772735596, "rewards/rejected": -4.4364213943481445, "sft_loss": 3.3194046020507812, "step": 4710 }, { "epoch": 2.5234989128616827, "grad_norm": 15.518506735899832, "learning_rate": 7.47371773349611e-08, "logits/chosen": -0.17813552916049957, "logits/rejected": -0.13089530169963837, "logps/chosen": -3.214340925216675, "logps/rejected": -4.631179332733154, "loss": 0.4016, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.214340925216675, "rewards/margins": 1.4168381690979004, "rewards/rejected": -4.631179332733154, "sft_loss": 3.372272491455078, "step": 4715 }, { "epoch": 2.526174945643084, "grad_norm": 17.855029324563343, "learning_rate": 7.392015125762496e-08, "logits/chosen": -0.2225007265806198, "logits/rejected": -0.036283355206251144, "logps/chosen": -3.060300350189209, "logps/rejected": -4.420039176940918, "loss": 0.4043, "rewards/accuracies": 0.875, "rewards/chosen": -3.060300350189209, "rewards/margins": 1.3597384691238403, "rewards/rejected": -4.420039176940918, "sft_loss": 3.1908531188964844, "step": 4720 }, { "epoch": 2.5288509784244857, "grad_norm": 18.797398384421548, "learning_rate": 7.310725900885018e-08, "logits/chosen": -0.25443488359451294, "logits/rejected": -0.1568032056093216, "logps/chosen": -3.246868133544922, "logps/rejected": -4.453755855560303, "loss": 0.5097, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.246868133544922, "rewards/margins": 1.2068877220153809, "rewards/rejected": -4.453755855560303, "sft_loss": 3.4013404846191406, "step": 4725 }, { "epoch": 2.5315270112058874, "grad_norm": 18.674359255751934, "learning_rate": 7.229850847532076e-08, "logits/chosen": -0.18834123015403748, "logits/rejected": 0.01394510269165039, "logps/chosen": -3.13272762298584, "logps/rejected": -4.519242286682129, "loss": 0.4157, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.13272762298584, "rewards/margins": 1.3865143060684204, "rewards/rejected": -4.519242286682129, "sft_loss": 3.35652494430542, "step": 4730 }, { "epoch": 2.5342030439872887, "grad_norm": 18.356699406304966, "learning_rate": 7.149390750353779e-08, "logits/chosen": -0.11418505012989044, "logits/rejected": -0.14707979559898376, "logps/chosen": -3.3050544261932373, "logps/rejected": -4.507040977478027, "loss": 0.4266, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.3050544261932373, "rewards/margins": 1.2019859552383423, "rewards/rejected": -4.507040977478027, "sft_loss": 3.4293060302734375, "step": 4735 }, { "epoch": 2.5368790767686904, "grad_norm": 13.183960579601973, "learning_rate": 7.069346389974374e-08, "logits/chosen": -0.23427972197532654, "logits/rejected": -0.03690224885940552, "logps/chosen": -3.3357748985290527, "logps/rejected": -4.470101833343506, "loss": 0.491, "rewards/accuracies": 0.78125, "rewards/chosen": -3.3357748985290527, "rewards/margins": 1.1343269348144531, "rewards/rejected": -4.470101833343506, "sft_loss": 3.506957530975342, "step": 4740 }, { "epoch": 2.539555109550092, "grad_norm": 19.94553345695387, "learning_rate": 6.989718542984563e-08, "logits/chosen": -0.2038155049085617, "logits/rejected": -0.12303312122821808, "logps/chosen": -3.2963204383850098, "logps/rejected": -4.538512229919434, "loss": 0.4771, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.2963204383850098, "rewards/margins": 1.242192029953003, "rewards/rejected": -4.538512229919434, "sft_loss": 3.4833292961120605, "step": 4745 }, { "epoch": 2.5422311423314934, "grad_norm": 17.211013156643542, "learning_rate": 6.9105079819341e-08, "logits/chosen": -0.17637403309345245, "logits/rejected": 0.12716087698936462, "logps/chosen": -3.2101187705993652, "logps/rejected": -4.727436542510986, "loss": 0.3764, "rewards/accuracies": 0.875, "rewards/chosen": -3.2101187705993652, "rewards/margins": 1.517317771911621, "rewards/rejected": -4.727436542510986, "sft_loss": 3.330015182495117, "step": 4750 }, { "epoch": 2.544907175112895, "grad_norm": 18.59545753629147, "learning_rate": 6.831715475324163e-08, "logits/chosen": -0.25424811244010925, "logits/rejected": -0.03308358043432236, "logps/chosen": -3.2420711517333984, "logps/rejected": -4.640331745147705, "loss": 0.4475, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2420711517333984, "rewards/margins": 1.3982599973678589, "rewards/rejected": -4.640331745147705, "sft_loss": 3.4349277019500732, "step": 4755 }, { "epoch": 2.547583207894297, "grad_norm": 16.47927838456307, "learning_rate": 6.753341787600026e-08, "logits/chosen": -0.3032090961933136, "logits/rejected": -0.17475828528404236, "logps/chosen": -3.182875633239746, "logps/rejected": -4.66715145111084, "loss": 0.3949, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -3.182875633239746, "rewards/margins": 1.4842758178710938, "rewards/rejected": -4.66715145111084, "sft_loss": 3.4110794067382812, "step": 4760 }, { "epoch": 2.5502592406756985, "grad_norm": 20.809819346399998, "learning_rate": 6.67538767914353e-08, "logits/chosen": -0.2470838725566864, "logits/rejected": -0.023734260350465775, "logps/chosen": -3.3000004291534424, "logps/rejected": -4.46333646774292, "loss": 0.482, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.3000004291534424, "rewards/margins": 1.1633365154266357, "rewards/rejected": -4.46333646774292, "sft_loss": 3.459524631500244, "step": 4765 }, { "epoch": 2.5529352734571, "grad_norm": 22.254439383465648, "learning_rate": 6.597853906265793e-08, "logits/chosen": -0.2009831666946411, "logits/rejected": -0.017525160685181618, "logps/chosen": -3.3135883808135986, "logps/rejected": -4.76369047164917, "loss": 0.4309, "rewards/accuracies": 0.8125, "rewards/chosen": -3.3135883808135986, "rewards/margins": 1.4501022100448608, "rewards/rejected": -4.76369047164917, "sft_loss": 3.384169816970825, "step": 4770 }, { "epoch": 2.5556113062385015, "grad_norm": 21.898141341281875, "learning_rate": 6.5207412211998e-08, "logits/chosen": -0.1036507710814476, "logits/rejected": 0.020888470113277435, "logps/chosen": -3.3246219158172607, "logps/rejected": -4.676529884338379, "loss": 0.4777, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.3246219158172607, "rewards/margins": 1.3519079685211182, "rewards/rejected": -4.676529884338379, "sft_loss": 3.3856797218322754, "step": 4775 }, { "epoch": 2.558287339019903, "grad_norm": 17.081647007030167, "learning_rate": 6.444050372093186e-08, "logits/chosen": -0.26109281182289124, "logits/rejected": -0.11180180311203003, "logps/chosen": -3.223278045654297, "logps/rejected": -4.432318687438965, "loss": 0.4501, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.223278045654297, "rewards/margins": 1.2090413570404053, "rewards/rejected": -4.432318687438965, "sft_loss": 3.346792697906494, "step": 4780 }, { "epoch": 2.5609633718013045, "grad_norm": 19.517956039813384, "learning_rate": 6.367782103000873e-08, "logits/chosen": -0.20629271864891052, "logits/rejected": -0.1311255395412445, "logps/chosen": -3.2419943809509277, "logps/rejected": -4.231849670410156, "loss": 0.5059, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.2419943809509277, "rewards/margins": 0.9898551106452942, "rewards/rejected": -4.231849670410156, "sft_loss": 3.343217372894287, "step": 4785 }, { "epoch": 2.5636394045827062, "grad_norm": 17.066269107224198, "learning_rate": 6.29193715387798e-08, "logits/chosen": -0.2514341175556183, "logits/rejected": -0.09561355412006378, "logps/chosen": -3.2857754230499268, "logps/rejected": -4.550307273864746, "loss": 0.4694, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.2857754230499268, "rewards/margins": 1.2645318508148193, "rewards/rejected": -4.550307273864746, "sft_loss": 3.3797073364257812, "step": 4790 }, { "epoch": 2.566315437364108, "grad_norm": 26.19436760931643, "learning_rate": 6.216516260572502e-08, "logits/chosen": -0.19456951320171356, "logits/rejected": -0.04334023594856262, "logps/chosen": -3.3537120819091797, "logps/rejected": -4.570885181427002, "loss": 0.49, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.3537120819091797, "rewards/margins": 1.2171725034713745, "rewards/rejected": -4.570885181427002, "sft_loss": 3.438067674636841, "step": 4795 }, { "epoch": 2.568991470145509, "grad_norm": 15.876151554491901, "learning_rate": 6.141520154818297e-08, "logits/chosen": -0.23168060183525085, "logits/rejected": -0.09211653470993042, "logps/chosen": -3.15653395652771, "logps/rejected": -4.232216835021973, "loss": 0.4952, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.15653395652771, "rewards/margins": 1.0756828784942627, "rewards/rejected": -4.232216835021973, "sft_loss": 3.413440227508545, "step": 4800 }, { "epoch": 2.568991470145509, "eval_logits/chosen": 0.1935766041278839, "eval_logits/rejected": 0.32095426321029663, "eval_logps/chosen": -3.3064892292022705, "eval_logps/rejected": -4.356972694396973, "eval_loss": 0.5579966306686401, "eval_rewards/accuracies": 0.7232937812805176, "eval_rewards/chosen": -3.3064892292022705, "eval_rewards/margins": 1.050482988357544, "eval_rewards/rejected": -4.356972694396973, "eval_runtime": 49.257, "eval_samples_per_second": 27.306, "eval_sft_loss": 3.4401662349700928, "eval_steps_per_second": 6.842, "step": 4800 }, { "epoch": 2.571667502926911, "grad_norm": 25.098344738779687, "learning_rate": 6.066949564227897e-08, "logits/chosen": -0.2947044372558594, "logits/rejected": -0.14726006984710693, "logps/chosen": -3.2008023262023926, "logps/rejected": -4.374998092651367, "loss": 0.506, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.2008023262023926, "rewards/margins": 1.1741957664489746, "rewards/rejected": -4.374998092651367, "sft_loss": 3.3073036670684814, "step": 4805 }, { "epoch": 2.574343535708312, "grad_norm": 17.540367301185075, "learning_rate": 5.992805212285523e-08, "logits/chosen": -0.23283466696739197, "logits/rejected": -0.10140033066272736, "logps/chosen": -3.1717512607574463, "logps/rejected": -4.4409589767456055, "loss": 0.4676, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.1717512607574463, "rewards/margins": 1.2692081928253174, "rewards/rejected": -4.4409589767456055, "sft_loss": 3.3278002738952637, "step": 4810 }, { "epoch": 2.577019568489714, "grad_norm": 22.638111065501505, "learning_rate": 5.9190878183399684e-08, "logits/chosen": -0.2398640662431717, "logits/rejected": -0.06665889918804169, "logps/chosen": -2.9983909130096436, "logps/rejected": -4.388098239898682, "loss": 0.4861, "rewards/accuracies": 0.75, "rewards/chosen": -2.9983909130096436, "rewards/margins": 1.3897074460983276, "rewards/rejected": -4.388098239898682, "sft_loss": 3.2251102924346924, "step": 4815 }, { "epoch": 2.5796956012711156, "grad_norm": 25.42150384905573, "learning_rate": 5.845798097597748e-08, "logits/chosen": -0.19961199164390564, "logits/rejected": -0.07595672458410263, "logps/chosen": -3.2427210807800293, "logps/rejected": -4.271923542022705, "loss": 0.494, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.2427210807800293, "rewards/margins": 1.029201626777649, "rewards/rejected": -4.271923542022705, "sft_loss": 3.2815985679626465, "step": 4820 }, { "epoch": 2.5823716340525174, "grad_norm": 18.515170835784442, "learning_rate": 5.772936761116026e-08, "logits/chosen": -0.19458623230457306, "logits/rejected": -0.0016370117664337158, "logps/chosen": -3.1516411304473877, "logps/rejected": -4.336785793304443, "loss": 0.4557, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.1516411304473877, "rewards/margins": 1.1851444244384766, "rewards/rejected": -4.336785793304443, "sft_loss": 3.2261505126953125, "step": 4825 }, { "epoch": 2.5850476668339186, "grad_norm": 23.696243181872738, "learning_rate": 5.700504515795829e-08, "logits/chosen": -0.2503166198730469, "logits/rejected": -0.04413991421461105, "logps/chosen": -3.2913098335266113, "logps/rejected": -4.465832710266113, "loss": 0.4647, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.2913098335266113, "rewards/margins": 1.1745226383209229, "rewards/rejected": -4.465832710266113, "sft_loss": 3.4344570636749268, "step": 4830 }, { "epoch": 2.5877236996153203, "grad_norm": 19.82750433438805, "learning_rate": 5.628502064375101e-08, "logits/chosen": -0.35966330766677856, "logits/rejected": -0.12091150134801865, "logps/chosen": -3.070188283920288, "logps/rejected": -4.504461288452148, "loss": 0.4042, "rewards/accuracies": 0.875, "rewards/chosen": -3.070188283920288, "rewards/margins": 1.4342727661132812, "rewards/rejected": -4.504461288452148, "sft_loss": 3.1828949451446533, "step": 4835 }, { "epoch": 2.5903997323967216, "grad_norm": 23.980809155950052, "learning_rate": 5.55693010542197e-08, "logits/chosen": -0.3103236258029938, "logits/rejected": -0.028705209493637085, "logps/chosen": -3.0640907287597656, "logps/rejected": -4.454373359680176, "loss": 0.3997, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.0640907287597656, "rewards/margins": 1.3902822732925415, "rewards/rejected": -4.454373359680176, "sft_loss": 3.190805435180664, "step": 4840 }, { "epoch": 2.5930757651781233, "grad_norm": 19.274243518176583, "learning_rate": 5.485789333327856e-08, "logits/chosen": -0.2074917107820511, "logits/rejected": -0.13893333077430725, "logps/chosen": -3.134070634841919, "logps/rejected": -4.3168511390686035, "loss": 0.4809, "rewards/accuracies": 0.75, "rewards/chosen": -3.134070634841919, "rewards/margins": 1.1827805042266846, "rewards/rejected": -4.3168511390686035, "sft_loss": 3.336045026779175, "step": 4845 }, { "epoch": 2.595751797959525, "grad_norm": 21.548530065007366, "learning_rate": 5.4150804383008675e-08, "logits/chosen": -0.3585580289363861, "logits/rejected": -0.15167434513568878, "logps/chosen": -3.2743637561798096, "logps/rejected": -4.633362770080566, "loss": 0.4475, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.2743637561798096, "rewards/margins": 1.3589991331100464, "rewards/rejected": -4.633362770080566, "sft_loss": 3.377138614654541, "step": 4850 }, { "epoch": 2.5984278307409268, "grad_norm": 19.91785176089767, "learning_rate": 5.344804106359002e-08, "logits/chosen": -0.1923711746931076, "logits/rejected": 0.0029977380763739347, "logps/chosen": -3.04135799407959, "logps/rejected": -4.348074436187744, "loss": 0.4699, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.04135799407959, "rewards/margins": 1.3067158460617065, "rewards/rejected": -4.348074436187744, "sft_loss": 3.242629289627075, "step": 4855 }, { "epoch": 2.601103863522328, "grad_norm": 22.53903758877158, "learning_rate": 5.274961019323559e-08, "logits/chosen": -0.2610209584236145, "logits/rejected": -0.14107844233512878, "logps/chosen": -2.971292495727539, "logps/rejected": -4.198145866394043, "loss": 0.4488, "rewards/accuracies": 0.8125, "rewards/chosen": -2.971292495727539, "rewards/margins": 1.226853370666504, "rewards/rejected": -4.198145866394043, "sft_loss": 3.202307939529419, "step": 4860 }, { "epoch": 2.6037798963037297, "grad_norm": 10.943657723423541, "learning_rate": 5.205551854812451e-08, "logits/chosen": -0.3178286552429199, "logits/rejected": -0.18322348594665527, "logps/chosen": -3.2609658241271973, "logps/rejected": -4.589636325836182, "loss": 0.4319, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.2609658241271973, "rewards/margins": 1.328670859336853, "rewards/rejected": -4.589636325836182, "sft_loss": 3.3975632190704346, "step": 4865 }, { "epoch": 2.606455929085131, "grad_norm": 16.361947175383595, "learning_rate": 5.1365772862337177e-08, "logits/chosen": -0.20097629725933075, "logits/rejected": -0.04026917368173599, "logps/chosen": -2.9677042961120605, "logps/rejected": -4.50205659866333, "loss": 0.3729, "rewards/accuracies": 0.875, "rewards/chosen": -2.9677042961120605, "rewards/margins": 1.5343520641326904, "rewards/rejected": -4.50205659866333, "sft_loss": 3.053894519805908, "step": 4870 }, { "epoch": 2.6091319618665327, "grad_norm": 24.718626158852302, "learning_rate": 5.068037982778905e-08, "logits/chosen": -0.16204114258289337, "logits/rejected": -0.04081437736749649, "logps/chosen": -3.022818088531494, "logps/rejected": -4.330333232879639, "loss": 0.4815, "rewards/accuracies": 0.78125, "rewards/chosen": -3.022818088531494, "rewards/margins": 1.307515263557434, "rewards/rejected": -4.330333232879639, "sft_loss": 3.2373275756835938, "step": 4875 }, { "epoch": 2.6118079946479344, "grad_norm": 13.09594561850234, "learning_rate": 4.999934609416656e-08, "logits/chosen": -0.11323384195566177, "logits/rejected": 0.052326686680316925, "logps/chosen": -3.061737537384033, "logps/rejected": -4.523990631103516, "loss": 0.4264, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.061737537384033, "rewards/margins": 1.4622533321380615, "rewards/rejected": -4.523990631103516, "sft_loss": 3.2748007774353027, "step": 4880 }, { "epoch": 2.614484027429336, "grad_norm": 17.96320824985329, "learning_rate": 4.932267826886183e-08, "logits/chosen": -0.1457149088382721, "logits/rejected": -0.06468679010868073, "logps/chosen": -3.215524196624756, "logps/rejected": -4.595486640930176, "loss": 0.4412, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.215524196624756, "rewards/margins": 1.3799618482589722, "rewards/rejected": -4.595486640930176, "sft_loss": 3.408809185028076, "step": 4885 }, { "epoch": 2.6171600602107374, "grad_norm": 18.882118102463277, "learning_rate": 4.8650382916909206e-08, "logits/chosen": -0.32069796323776245, "logits/rejected": -0.09536401927471161, "logps/chosen": -3.2022228240966797, "logps/rejected": -4.488520622253418, "loss": 0.4706, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.2022228240966797, "rewards/margins": 1.2862979173660278, "rewards/rejected": -4.488520622253418, "sft_loss": 3.4009814262390137, "step": 4890 }, { "epoch": 2.619836092992139, "grad_norm": 15.427171984599685, "learning_rate": 4.7982466560920976e-08, "logits/chosen": -0.21969299018383026, "logits/rejected": -0.10536585748195648, "logps/chosen": -3.188559055328369, "logps/rejected": -4.181743621826172, "loss": 0.5336, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.188559055328369, "rewards/margins": 0.9931844472885132, "rewards/rejected": -4.181743621826172, "sft_loss": 3.355111598968506, "step": 4895 }, { "epoch": 2.622512125773541, "grad_norm": 21.961264722235224, "learning_rate": 4.7318935681024685e-08, "logits/chosen": -0.15709593892097473, "logits/rejected": 0.04801901429891586, "logps/chosen": -3.1420769691467285, "logps/rejected": -4.425694942474365, "loss": 0.4329, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.1420769691467285, "rewards/margins": 1.2836177349090576, "rewards/rejected": -4.425694942474365, "sft_loss": 3.3105030059814453, "step": 4900 }, { "epoch": 2.625188158554942, "grad_norm": 15.599904597557535, "learning_rate": 4.6659796714799745e-08, "logits/chosen": -0.1877932846546173, "logits/rejected": 0.018878992646932602, "logps/chosen": -3.198349714279175, "logps/rejected": -4.627870082855225, "loss": 0.4091, "rewards/accuracies": 0.84375, "rewards/chosen": -3.198349714279175, "rewards/margins": 1.4295203685760498, "rewards/rejected": -4.627870082855225, "sft_loss": 3.4322593212127686, "step": 4905 }, { "epoch": 2.627864191336344, "grad_norm": 16.54048979952472, "learning_rate": 4.60050560572155e-08, "logits/chosen": -0.22201958298683167, "logits/rejected": -0.22817635536193848, "logps/chosen": -3.1096553802490234, "logps/rejected": -4.568678855895996, "loss": 0.4544, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1096553802490234, "rewards/margins": 1.45902419090271, "rewards/rejected": -4.568678855895996, "sft_loss": 3.2503561973571777, "step": 4910 }, { "epoch": 2.6305402241177456, "grad_norm": 21.385888084225407, "learning_rate": 4.535472006056834e-08, "logits/chosen": -0.22342491149902344, "logits/rejected": -0.040875911712646484, "logps/chosen": -3.0525319576263428, "logps/rejected": -4.305843353271484, "loss": 0.46, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.0525319576263428, "rewards/margins": 1.2533115148544312, "rewards/rejected": -4.305843353271484, "sft_loss": 3.2571754455566406, "step": 4915 }, { "epoch": 2.6332162568991473, "grad_norm": 18.92057715118689, "learning_rate": 4.470879503442132e-08, "logits/chosen": -0.23863354325294495, "logits/rejected": -0.08575797080993652, "logps/chosen": -3.198439598083496, "logps/rejected": -4.544912338256836, "loss": 0.4611, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.198439598083496, "rewards/margins": 1.3464728593826294, "rewards/rejected": -4.544912338256836, "sft_loss": 3.3559088706970215, "step": 4920 }, { "epoch": 2.6358922896805486, "grad_norm": 16.23594963751572, "learning_rate": 4.406728724554154e-08, "logits/chosen": -0.36170822381973267, "logits/rejected": -0.05511064454913139, "logps/chosen": -3.168303966522217, "logps/rejected": -4.5181884765625, "loss": 0.4467, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.168303966522217, "rewards/margins": 1.3498847484588623, "rewards/rejected": -4.5181884765625, "sft_loss": 3.367772340774536, "step": 4925 }, { "epoch": 2.6385683224619503, "grad_norm": 15.157398299332206, "learning_rate": 4.3430202917840664e-08, "logits/chosen": -0.2529342770576477, "logits/rejected": -0.003716734703630209, "logps/chosen": -3.219101667404175, "logps/rejected": -4.630505561828613, "loss": 0.4364, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.219101667404175, "rewards/margins": 1.4114038944244385, "rewards/rejected": -4.630505561828613, "sft_loss": 3.2867751121520996, "step": 4930 }, { "epoch": 2.6412443552433515, "grad_norm": 25.37551161557867, "learning_rate": 4.279754823231346e-08, "logits/chosen": -0.2770128846168518, "logits/rejected": -0.029742831364274025, "logps/chosen": -3.149597644805908, "logps/rejected": -4.365159034729004, "loss": 0.4824, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.149597644805908, "rewards/margins": 1.2155616283416748, "rewards/rejected": -4.365159034729004, "sft_loss": 3.2964344024658203, "step": 4935 }, { "epoch": 2.6439203880247533, "grad_norm": 15.297478140529776, "learning_rate": 4.216932932697859e-08, "logits/chosen": -0.2701972723007202, "logits/rejected": -0.12614202499389648, "logps/chosen": -3.1263914108276367, "logps/rejected": -4.205435752868652, "loss": 0.4734, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.1263914108276367, "rewards/margins": 1.0790444612503052, "rewards/rejected": -4.205435752868652, "sft_loss": 3.333831787109375, "step": 4940 }, { "epoch": 2.646596420806155, "grad_norm": 22.327646162788575, "learning_rate": 4.154555229681844e-08, "logits/chosen": -0.23707015812397003, "logits/rejected": 0.0013411410618573427, "logps/chosen": -3.1890475749969482, "logps/rejected": -4.5590715408325195, "loss": 0.4182, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.1890475749969482, "rewards/margins": 1.3700246810913086, "rewards/rejected": -4.5590715408325195, "sft_loss": 3.3222873210906982, "step": 4945 }, { "epoch": 2.6492724535875567, "grad_norm": 24.20219575368579, "learning_rate": 4.092622319372069e-08, "logits/chosen": -0.19819870591163635, "logits/rejected": 0.018138539046049118, "logps/chosen": -3.139855146408081, "logps/rejected": -4.3406782150268555, "loss": 0.5064, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.139855146408081, "rewards/margins": 1.200823187828064, "rewards/rejected": -4.3406782150268555, "sft_loss": 3.278446912765503, "step": 4950 }, { "epoch": 2.651948486368958, "grad_norm": 22.468355517091446, "learning_rate": 4.031134802641889e-08, "logits/chosen": -0.2404346764087677, "logits/rejected": -0.163672536611557, "logps/chosen": -3.2455620765686035, "logps/rejected": -4.417026519775391, "loss": 0.4541, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.2455620765686035, "rewards/margins": 1.1714643239974976, "rewards/rejected": -4.417026519775391, "sft_loss": 3.4159579277038574, "step": 4955 }, { "epoch": 2.6546245191503597, "grad_norm": 16.006458569884785, "learning_rate": 3.970093276043468e-08, "logits/chosen": -0.20295009016990662, "logits/rejected": -0.056883059442043304, "logps/chosen": -3.079036235809326, "logps/rejected": -4.43948221206665, "loss": 0.4306, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.079036235809326, "rewards/margins": 1.3604458570480347, "rewards/rejected": -4.43948221206665, "sft_loss": 3.2619762420654297, "step": 4960 }, { "epoch": 2.657300551931761, "grad_norm": 21.707948896911393, "learning_rate": 3.9094983318019584e-08, "logits/chosen": -0.28873950242996216, "logits/rejected": -0.11398355662822723, "logps/chosen": -3.094452381134033, "logps/rejected": -4.441015720367432, "loss": 0.4314, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.094452381134033, "rewards/margins": 1.3465631008148193, "rewards/rejected": -4.441015720367432, "sft_loss": 3.330700397491455, "step": 4965 }, { "epoch": 2.6599765847131627, "grad_norm": 16.677700003919625, "learning_rate": 3.849350557809789e-08, "logits/chosen": -0.14867134392261505, "logits/rejected": -0.04668787866830826, "logps/chosen": -2.994612693786621, "logps/rejected": -4.390146255493164, "loss": 0.4199, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.994612693786621, "rewards/margins": 1.395533800125122, "rewards/rejected": -4.390146255493164, "sft_loss": 3.0476841926574707, "step": 4970 }, { "epoch": 2.6626526174945644, "grad_norm": 17.366292153784123, "learning_rate": 3.789650537620903e-08, "logits/chosen": -0.18064144253730774, "logits/rejected": -0.10504363477230072, "logps/chosen": -3.2243492603302, "logps/rejected": -4.436784744262695, "loss": 0.4472, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.2243492603302, "rewards/margins": 1.2124359607696533, "rewards/rejected": -4.436784744262695, "sft_loss": 3.332610607147217, "step": 4975 }, { "epoch": 2.665328650275966, "grad_norm": 17.975524302441073, "learning_rate": 3.730398850445182e-08, "logits/chosen": -0.11991802603006363, "logits/rejected": -0.06612943112850189, "logps/chosen": -3.369271755218506, "logps/rejected": -4.555083274841309, "loss": 0.4937, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.369271755218506, "rewards/margins": 1.1858118772506714, "rewards/rejected": -4.555083274841309, "sft_loss": 3.3891067504882812, "step": 4980 }, { "epoch": 2.6680046830573674, "grad_norm": 18.379547604926636, "learning_rate": 3.671596071142735e-08, "logits/chosen": -0.1973075568675995, "logits/rejected": 0.03738722950220108, "logps/chosen": -3.0913569927215576, "logps/rejected": -4.440980434417725, "loss": 0.4823, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.0913569927215576, "rewards/margins": 1.349623441696167, "rewards/rejected": -4.440980434417725, "sft_loss": 3.18648362159729, "step": 4985 }, { "epoch": 2.670680715838769, "grad_norm": 18.614410610882675, "learning_rate": 3.6132427702183996e-08, "logits/chosen": -0.3264130651950836, "logits/rejected": -0.11211379617452621, "logps/chosen": -3.138422966003418, "logps/rejected": -4.492595672607422, "loss": 0.42, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.138422966003418, "rewards/margins": 1.3541723489761353, "rewards/rejected": -4.492595672607422, "sft_loss": 3.29201078414917, "step": 4990 }, { "epoch": 2.6733567486201704, "grad_norm": 16.98717184658078, "learning_rate": 3.555339513816147e-08, "logits/chosen": -0.2549840807914734, "logits/rejected": -0.23621661961078644, "logps/chosen": -3.2311882972717285, "logps/rejected": -4.25803279876709, "loss": 0.5272, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2311882972717285, "rewards/margins": 1.026845097541809, "rewards/rejected": -4.25803279876709, "sft_loss": 3.410748243331909, "step": 4995 }, { "epoch": 2.676032781401572, "grad_norm": 19.140590465296466, "learning_rate": 3.497886863713639e-08, "logits/chosen": -0.23707112669944763, "logits/rejected": -0.17840634286403656, "logps/chosen": -3.2316176891326904, "logps/rejected": -4.3862199783325195, "loss": 0.515, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.2316176891326904, "rewards/margins": 1.1546024084091187, "rewards/rejected": -4.3862199783325195, "sft_loss": 3.41558575630188, "step": 5000 }, { "epoch": 2.678708814182974, "grad_norm": 19.91206142487075, "learning_rate": 3.440885377316721e-08, "logits/chosen": -0.18267245590686798, "logits/rejected": -0.09775508940219879, "logps/chosen": -3.204019546508789, "logps/rejected": -4.2952752113342285, "loss": 0.4578, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.204019546508789, "rewards/margins": 1.091255784034729, "rewards/rejected": -4.2952752113342285, "sft_loss": 3.3476028442382812, "step": 5005 }, { "epoch": 2.6813848469643755, "grad_norm": 20.561056582271576, "learning_rate": 3.384335607654082e-08, "logits/chosen": -0.16455042362213135, "logits/rejected": -0.049335233867168427, "logps/chosen": -3.2577567100524902, "logps/rejected": -4.439981460571289, "loss": 0.4627, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.2577567100524902, "rewards/margins": 1.1822245121002197, "rewards/rejected": -4.439981460571289, "sft_loss": 3.3594517707824707, "step": 5010 }, { "epoch": 2.684060879745777, "grad_norm": 20.92663548817318, "learning_rate": 3.328238103371811e-08, "logits/chosen": -0.24483826756477356, "logits/rejected": -0.14432474970817566, "logps/chosen": -3.205540180206299, "logps/rejected": -4.408441066741943, "loss": 0.4572, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.205540180206299, "rewards/margins": 1.2029006481170654, "rewards/rejected": -4.408441066741943, "sft_loss": 3.2874832153320312, "step": 5015 }, { "epoch": 2.6867369125271785, "grad_norm": 24.79855818848997, "learning_rate": 3.272593408728169e-08, "logits/chosen": -0.28618526458740234, "logits/rejected": 0.0043426095508039, "logps/chosen": -3.109508991241455, "logps/rejected": -4.331753730773926, "loss": 0.4717, "rewards/accuracies": 0.78125, "rewards/chosen": -3.109508991241455, "rewards/margins": 1.2222447395324707, "rewards/rejected": -4.331753730773926, "sft_loss": 3.335289478302002, "step": 5020 }, { "epoch": 2.6894129453085798, "grad_norm": 16.22444594329588, "learning_rate": 3.217402063588204e-08, "logits/chosen": -0.2819980978965759, "logits/rejected": -0.08756112307310104, "logps/chosen": -3.1473426818847656, "logps/rejected": -4.334247589111328, "loss": 0.4742, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.1473426818847656, "rewards/margins": 1.1869052648544312, "rewards/rejected": -4.334247589111328, "sft_loss": 3.2710742950439453, "step": 5025 }, { "epoch": 2.6920889780899815, "grad_norm": 14.71109917318996, "learning_rate": 3.162664603418608e-08, "logits/chosen": -0.2247733771800995, "logits/rejected": -0.10569185018539429, "logps/chosen": -3.121309995651245, "logps/rejected": -4.529058933258057, "loss": 0.4511, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.121309995651245, "rewards/margins": 1.4077484607696533, "rewards/rejected": -4.529058933258057, "sft_loss": 3.2245278358459473, "step": 5030 }, { "epoch": 2.694765010871383, "grad_norm": 25.465988896577702, "learning_rate": 3.1083815592824416e-08, "logits/chosen": -0.2195226401090622, "logits/rejected": -0.06730414927005768, "logps/chosen": -3.2314651012420654, "logps/rejected": -4.430628776550293, "loss": 0.4673, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.2314651012420654, "rewards/margins": 1.1991630792617798, "rewards/rejected": -4.430628776550293, "sft_loss": 3.365612506866455, "step": 5035 }, { "epoch": 2.697441043652785, "grad_norm": 17.294025816863705, "learning_rate": 3.054553457834053e-08, "logits/chosen": -0.03186682611703873, "logits/rejected": -0.05140848085284233, "logps/chosen": -3.281543254852295, "logps/rejected": -4.466519355773926, "loss": 0.4721, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.281543254852295, "rewards/margins": 1.184975504875183, "rewards/rejected": -4.466519355773926, "sft_loss": 3.3809001445770264, "step": 5040 }, { "epoch": 2.700117076434186, "grad_norm": 22.62680292614812, "learning_rate": 3.0011808213139036e-08, "logits/chosen": -0.12738078832626343, "logits/rejected": -0.1271544247865677, "logps/chosen": -3.167606830596924, "logps/rejected": -4.394031524658203, "loss": 0.4707, "rewards/accuracies": 0.78125, "rewards/chosen": -3.167606830596924, "rewards/margins": 1.2264244556427002, "rewards/rejected": -4.394031524658203, "sft_loss": 3.285980224609375, "step": 5045 }, { "epoch": 2.702793109215588, "grad_norm": 17.79576892462079, "learning_rate": 2.948264167543568e-08, "logits/chosen": -0.2033161222934723, "logits/rejected": -0.09692879766225815, "logps/chosen": -2.983433246612549, "logps/rejected": -4.184727668762207, "loss": 0.4358, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -2.983433246612549, "rewards/margins": 1.2012943029403687, "rewards/rejected": -4.184727668762207, "sft_loss": 3.1466856002807617, "step": 5050 }, { "epoch": 2.7054691419969896, "grad_norm": 17.513471774648877, "learning_rate": 2.8958040099206216e-08, "logits/chosen": -0.33082449436187744, "logits/rejected": -0.20554928481578827, "logps/chosen": -2.940002679824829, "logps/rejected": -4.253390312194824, "loss": 0.4264, "rewards/accuracies": 0.8125, "rewards/chosen": -2.940002679824829, "rewards/margins": 1.3133876323699951, "rewards/rejected": -4.253390312194824, "sft_loss": 3.0926265716552734, "step": 5055 }, { "epoch": 2.708145174778391, "grad_norm": 22.424994647222533, "learning_rate": 2.843800857413775e-08, "logits/chosen": -0.17994250357151031, "logits/rejected": -0.08553121984004974, "logps/chosen": -3.0669708251953125, "logps/rejected": -4.204202651977539, "loss": 0.4954, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.0669708251953125, "rewards/margins": 1.137231469154358, "rewards/rejected": -4.204202651977539, "sft_loss": 3.2526612281799316, "step": 5060 }, { "epoch": 2.7108212075597926, "grad_norm": 19.948217087692427, "learning_rate": 2.7922552145578203e-08, "logits/chosen": -0.22585833072662354, "logits/rejected": 0.07776106148958206, "logps/chosen": -3.027068614959717, "logps/rejected": -4.328622341156006, "loss": 0.4434, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.027068614959717, "rewards/margins": 1.30155348777771, "rewards/rejected": -4.328622341156006, "sft_loss": 3.1880125999450684, "step": 5065 }, { "epoch": 2.7134972403411943, "grad_norm": 22.78413021377979, "learning_rate": 2.7411675814488277e-08, "logits/chosen": -0.1200130432844162, "logits/rejected": 0.06273828446865082, "logps/chosen": -3.0594770908355713, "logps/rejected": -4.220132350921631, "loss": 0.4505, "rewards/accuracies": 0.8125, "rewards/chosen": -3.0594770908355713, "rewards/margins": 1.1606553792953491, "rewards/rejected": -4.220132350921631, "sft_loss": 3.2853927612304688, "step": 5070 }, { "epoch": 2.7161732731225956, "grad_norm": 26.089826211640414, "learning_rate": 2.690538453739216e-08, "logits/chosen": -0.17602473497390747, "logits/rejected": -0.085169717669487, "logps/chosen": -3.109769105911255, "logps/rejected": -4.0252203941345215, "loss": 0.5595, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -3.109769105911255, "rewards/margins": 0.9154506921768188, "rewards/rejected": -4.0252203941345215, "sft_loss": 3.303795576095581, "step": 5075 }, { "epoch": 2.7188493059039973, "grad_norm": 16.519227680810705, "learning_rate": 2.6403683226330298e-08, "logits/chosen": -0.2547515034675598, "logits/rejected": -0.05338859558105469, "logps/chosen": -3.140300750732422, "logps/rejected": -4.311095714569092, "loss": 0.4865, "rewards/accuracies": 0.78125, "rewards/chosen": -3.140300750732422, "rewards/margins": 1.1707950830459595, "rewards/rejected": -4.311095714569092, "sft_loss": 3.2734668254852295, "step": 5080 }, { "epoch": 2.721525338685399, "grad_norm": 36.29648134115861, "learning_rate": 2.5906576748810804e-08, "logits/chosen": -0.26947951316833496, "logits/rejected": -0.11737142503261566, "logps/chosen": -3.053846836090088, "logps/rejected": -4.512036323547363, "loss": 0.413, "rewards/accuracies": 0.84375, "rewards/chosen": -3.053846836090088, "rewards/margins": 1.4581893682479858, "rewards/rejected": -4.512036323547363, "sft_loss": 3.2287800312042236, "step": 5085 }, { "epoch": 2.7242013714668003, "grad_norm": 22.703866516058344, "learning_rate": 2.5414069927763016e-08, "logits/chosen": -0.3520892262458801, "logits/rejected": -0.11236388981342316, "logps/chosen": -3.2202491760253906, "logps/rejected": -4.552691459655762, "loss": 0.439, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.2202491760253906, "rewards/margins": 1.3324424028396606, "rewards/rejected": -4.552691459655762, "sft_loss": 3.3652408123016357, "step": 5090 }, { "epoch": 2.726877404248202, "grad_norm": 15.012282479125542, "learning_rate": 2.4926167541490185e-08, "logits/chosen": -0.370398610830307, "logits/rejected": -0.10767936706542969, "logps/chosen": -3.0924620628356934, "logps/rejected": -4.428684234619141, "loss": 0.4472, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.0924620628356934, "rewards/margins": 1.3362222909927368, "rewards/rejected": -4.428684234619141, "sft_loss": 3.2309207916259766, "step": 5095 }, { "epoch": 2.7295534370296037, "grad_norm": 12.801368025013259, "learning_rate": 2.4442874323623574e-08, "logits/chosen": -0.1572403907775879, "logits/rejected": 0.01683208718895912, "logps/chosen": -3.1529765129089355, "logps/rejected": -4.545765399932861, "loss": 0.4614, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1529765129089355, "rewards/margins": 1.3927887678146362, "rewards/rejected": -4.545765399932861, "sft_loss": 3.323516368865967, "step": 5100 }, { "epoch": 2.7322294698110055, "grad_norm": 19.38537449809977, "learning_rate": 2.396419496307589e-08, "logits/chosen": -0.20657595992088318, "logits/rejected": 0.0014978349208831787, "logps/chosen": -3.275643825531006, "logps/rejected": -4.5586090087890625, "loss": 0.4631, "rewards/accuracies": 0.8125, "rewards/chosen": -3.275643825531006, "rewards/margins": 1.2829653024673462, "rewards/rejected": -4.5586090087890625, "sft_loss": 3.4100615978240967, "step": 5105 }, { "epoch": 2.7349055025924067, "grad_norm": 19.297910442054516, "learning_rate": 2.349013410399653e-08, "logits/chosen": -0.21427297592163086, "logits/rejected": -0.06879232078790665, "logps/chosen": -3.067349672317505, "logps/rejected": -4.291683673858643, "loss": 0.468, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.067349672317505, "rewards/margins": 1.2243340015411377, "rewards/rejected": -4.291683673858643, "sft_loss": 3.214346408843994, "step": 5110 }, { "epoch": 2.7375815353738084, "grad_norm": 15.978901754824962, "learning_rate": 2.3020696345725954e-08, "logits/chosen": -0.29842501878738403, "logits/rejected": -0.017918001860380173, "logps/chosen": -3.176945209503174, "logps/rejected": -4.515137672424316, "loss": 0.4066, "rewards/accuracies": 0.84375, "rewards/chosen": -3.176945209503174, "rewards/margins": 1.3381928205490112, "rewards/rejected": -4.515137672424316, "sft_loss": 3.2655911445617676, "step": 5115 }, { "epoch": 2.7402575681552097, "grad_norm": 25.512112118049004, "learning_rate": 2.2555886242751398e-08, "logits/chosen": -0.23838338255882263, "logits/rejected": -0.1361640989780426, "logps/chosen": -3.1773295402526855, "logps/rejected": -4.491230010986328, "loss": 0.4194, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.1773295402526855, "rewards/margins": 1.313900351524353, "rewards/rejected": -4.491230010986328, "sft_loss": 3.287815809249878, "step": 5120 }, { "epoch": 2.7429336009366114, "grad_norm": 28.17408344751377, "learning_rate": 2.2095708304662453e-08, "logits/chosen": -0.35994330048561096, "logits/rejected": -0.07787968963384628, "logps/chosen": -3.064584255218506, "logps/rejected": -4.313921928405762, "loss": 0.4529, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.064584255218506, "rewards/margins": 1.2493374347686768, "rewards/rejected": -4.313921928405762, "sft_loss": 3.270467758178711, "step": 5125 }, { "epoch": 2.745609633718013, "grad_norm": 17.656274722525687, "learning_rate": 2.16401669961076e-08, "logits/chosen": -0.3636077344417572, "logits/rejected": -0.12653307616710663, "logps/chosen": -3.1638081073760986, "logps/rejected": -4.486863613128662, "loss": 0.4416, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1638081073760986, "rewards/margins": 1.3230552673339844, "rewards/rejected": -4.486863613128662, "sft_loss": 3.383385181427002, "step": 5130 }, { "epoch": 2.748285666499415, "grad_norm": 25.407359996428223, "learning_rate": 2.1189266736750532e-08, "logits/chosen": -0.14153358340263367, "logits/rejected": -0.0551319494843483, "logps/chosen": -3.1332790851593018, "logps/rejected": -4.269174098968506, "loss": 0.4787, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.1332790851593018, "rewards/margins": 1.135895013809204, "rewards/rejected": -4.269174098968506, "sft_loss": 3.3371098041534424, "step": 5135 }, { "epoch": 2.750961699280816, "grad_norm": 17.329397758719967, "learning_rate": 2.0743011901227623e-08, "logits/chosen": -0.17390862107276917, "logits/rejected": 0.027312543243169785, "logps/chosen": -3.2330029010772705, "logps/rejected": -4.444141387939453, "loss": 0.4505, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2330029010772705, "rewards/margins": 1.2111393213272095, "rewards/rejected": -4.444141387939453, "sft_loss": 3.3181076049804688, "step": 5140 }, { "epoch": 2.753637732062218, "grad_norm": 25.672394019334682, "learning_rate": 2.030140681910508e-08, "logits/chosen": -0.18414786458015442, "logits/rejected": -0.015248274430632591, "logps/chosen": -3.1611456871032715, "logps/rejected": -4.285699844360352, "loss": 0.5, "rewards/accuracies": 0.75, "rewards/chosen": -3.1611456871032715, "rewards/margins": 1.1245542764663696, "rewards/rejected": -4.285699844360352, "sft_loss": 3.3388450145721436, "step": 5145 }, { "epoch": 2.756313764843619, "grad_norm": 16.777482622717063, "learning_rate": 1.986445577483753e-08, "logits/chosen": -0.27730321884155273, "logits/rejected": -0.11832135915756226, "logps/chosen": -3.099640369415283, "logps/rejected": -4.361514091491699, "loss": 0.4583, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.099640369415283, "rewards/margins": 1.2618744373321533, "rewards/rejected": -4.361514091491699, "sft_loss": 3.2630677223205566, "step": 5150 }, { "epoch": 2.758989797625021, "grad_norm": 16.454912586585895, "learning_rate": 1.9432163007725765e-08, "logits/chosen": -0.29692643880844116, "logits/rejected": -0.17871159315109253, "logps/chosen": -3.115957260131836, "logps/rejected": -4.290701389312744, "loss": 0.4671, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.115957260131836, "rewards/margins": 1.174743890762329, "rewards/rejected": -4.290701389312744, "sft_loss": 3.319054365158081, "step": 5155 }, { "epoch": 2.7616658304064226, "grad_norm": 15.672889445754924, "learning_rate": 1.9004532711876297e-08, "logits/chosen": -0.22892217338085175, "logits/rejected": -0.17264311015605927, "logps/chosen": -3.000776767730713, "logps/rejected": -4.343716144561768, "loss": 0.4387, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.000776767730713, "rewards/margins": 1.342938780784607, "rewards/rejected": -4.343716144561768, "sft_loss": 3.2002334594726562, "step": 5160 }, { "epoch": 2.7643418631878243, "grad_norm": 18.34683396965878, "learning_rate": 1.8581569036159928e-08, "logits/chosen": -0.2559751272201538, "logits/rejected": -0.019560130313038826, "logps/chosen": -3.0496087074279785, "logps/rejected": -4.328253269195557, "loss": 0.437, "rewards/accuracies": 0.84375, "rewards/chosen": -3.0496087074279785, "rewards/margins": 1.27864408493042, "rewards/rejected": -4.328253269195557, "sft_loss": 3.156299114227295, "step": 5165 }, { "epoch": 2.7670178959692255, "grad_norm": 13.638033094970202, "learning_rate": 1.8163276084172285e-08, "logits/chosen": -0.22779671847820282, "logits/rejected": -0.06113610416650772, "logps/chosen": -3.1405844688415527, "logps/rejected": -4.442783355712891, "loss": 0.4255, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.1405844688415527, "rewards/margins": 1.3021987676620483, "rewards/rejected": -4.442783355712891, "sft_loss": 3.345402479171753, "step": 5170 }, { "epoch": 2.7696939287506273, "grad_norm": 18.255590078165085, "learning_rate": 1.7749657914193194e-08, "logits/chosen": -0.2028944194316864, "logits/rejected": -0.08475472033023834, "logps/chosen": -3.2638847827911377, "logps/rejected": -4.632298469543457, "loss": 0.3987, "rewards/accuracies": 0.84375, "rewards/chosen": -3.2638847827911377, "rewards/margins": 1.3684141635894775, "rewards/rejected": -4.632298469543457, "sft_loss": 3.346186876296997, "step": 5175 }, { "epoch": 2.7723699615320285, "grad_norm": 24.772714135947187, "learning_rate": 1.7340718539148203e-08, "logits/chosen": -0.16636791825294495, "logits/rejected": -0.08039329200983047, "logps/chosen": -3.3139376640319824, "logps/rejected": -4.398904323577881, "loss": 0.4981, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.3139376640319824, "rewards/margins": 1.0849663019180298, "rewards/rejected": -4.398904323577881, "sft_loss": 3.5393283367156982, "step": 5180 }, { "epoch": 2.7750459943134302, "grad_norm": 15.997616887752065, "learning_rate": 1.6936461926568724e-08, "logits/chosen": -0.17959436774253845, "logits/rejected": 0.0007152434554882348, "logps/chosen": -3.1098248958587646, "logps/rejected": -4.493106365203857, "loss": 0.4645, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.1098248958587646, "rewards/margins": 1.3832814693450928, "rewards/rejected": -4.493106365203857, "sft_loss": 3.354473829269409, "step": 5185 }, { "epoch": 2.777722027094832, "grad_norm": 19.2292266193858, "learning_rate": 1.6536891998554346e-08, "logits/chosen": -0.32440274953842163, "logits/rejected": -0.11405463516712189, "logps/chosen": -3.0503506660461426, "logps/rejected": -4.309788227081299, "loss": 0.4564, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0503506660461426, "rewards/margins": 1.259437918663025, "rewards/rejected": -4.309788227081299, "sft_loss": 3.2820911407470703, "step": 5190 }, { "epoch": 2.7803980598762337, "grad_norm": 18.601636609926377, "learning_rate": 1.6142012631734093e-08, "logits/chosen": -0.1657555103302002, "logits/rejected": 0.01971413753926754, "logps/chosen": -3.1654350757598877, "logps/rejected": -4.4513959884643555, "loss": 0.4367, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.1654350757598877, "rewards/margins": 1.2859606742858887, "rewards/rejected": -4.4513959884643555, "sft_loss": 3.287670135498047, "step": 5195 }, { "epoch": 2.783074092657635, "grad_norm": 23.10107289402409, "learning_rate": 1.575182765722949e-08, "logits/chosen": -0.311906635761261, "logits/rejected": -0.10322600603103638, "logps/chosen": -3.0944015979766846, "logps/rejected": -4.473546028137207, "loss": 0.4272, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.0944015979766846, "rewards/margins": 1.3791444301605225, "rewards/rejected": -4.473546028137207, "sft_loss": 3.301250457763672, "step": 5200 }, { "epoch": 2.783074092657635, "eval_logits/chosen": 0.22810406982898712, "eval_logits/rejected": 0.35918039083480835, "eval_logps/chosen": -3.313793182373047, "eval_logps/rejected": -4.3619208335876465, "eval_loss": 0.5578520894050598, "eval_rewards/accuracies": 0.7232937812805176, "eval_rewards/chosen": -3.313793182373047, "eval_rewards/margins": 1.0481278896331787, "eval_rewards/rejected": -4.3619208335876465, "eval_runtime": 50.946, "eval_samples_per_second": 26.401, "eval_sft_loss": 3.4522531032562256, "eval_steps_per_second": 6.615, "step": 5200 }, { "epoch": 2.7857501254390367, "grad_norm": 11.63526468440465, "learning_rate": 1.536634086061672e-08, "logits/chosen": -0.1776561737060547, "logits/rejected": -0.08043881505727768, "logps/chosen": -3.1294918060302734, "logps/rejected": -4.408661842346191, "loss": 0.4416, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1294918060302734, "rewards/margins": 1.2791701555252075, "rewards/rejected": -4.408661842346191, "sft_loss": 3.2111599445343018, "step": 5205 }, { "epoch": 2.788426158220438, "grad_norm": 16.73777140478313, "learning_rate": 1.4985555981890495e-08, "logits/chosen": -0.20793786644935608, "logits/rejected": -0.06591515988111496, "logps/chosen": -3.1309242248535156, "logps/rejected": -4.449841499328613, "loss": 0.4562, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.1309242248535156, "rewards/margins": 1.31891667842865, "rewards/rejected": -4.449841499328613, "sft_loss": 3.2599101066589355, "step": 5210 }, { "epoch": 2.7911021910018396, "grad_norm": 16.52444894688706, "learning_rate": 1.4609476715427226e-08, "logits/chosen": -0.21759703755378723, "logits/rejected": -0.0982574075460434, "logps/chosen": -3.0120937824249268, "logps/rejected": -4.364049434661865, "loss": 0.4262, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.0120937824249268, "rewards/margins": 1.3519560098648071, "rewards/rejected": -4.364049434661865, "sft_loss": 3.1805450916290283, "step": 5215 }, { "epoch": 2.7937782237832414, "grad_norm": 16.345210936985595, "learning_rate": 1.4238106709949792e-08, "logits/chosen": -0.2661355137825012, "logits/rejected": -0.13124528527259827, "logps/chosen": -3.1420581340789795, "logps/rejected": -4.6199517250061035, "loss": 0.3903, "rewards/accuracies": 0.84375, "rewards/chosen": -3.1420581340789795, "rewards/margins": 1.477893590927124, "rewards/rejected": -4.6199517250061035, "sft_loss": 3.2643871307373047, "step": 5220 }, { "epoch": 2.796454256564643, "grad_norm": 23.04024804031211, "learning_rate": 1.3871449568491511e-08, "logits/chosen": -0.20268850028514862, "logits/rejected": 0.005442657973617315, "logps/chosen": -3.203855514526367, "logps/rejected": -4.447484016418457, "loss": 0.4626, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.203855514526367, "rewards/margins": 1.2436293363571167, "rewards/rejected": -4.447484016418457, "sft_loss": 3.311129331588745, "step": 5225 }, { "epoch": 2.7991302893460444, "grad_norm": 13.165452282321871, "learning_rate": 1.3509508848361606e-08, "logits/chosen": -0.347541481256485, "logits/rejected": -0.14753584563732147, "logps/chosen": -3.1543853282928467, "logps/rejected": -4.4243059158325195, "loss": 0.4364, "rewards/accuracies": 0.84375, "rewards/chosen": -3.1543853282928467, "rewards/margins": 1.2699207067489624, "rewards/rejected": -4.4243059158325195, "sft_loss": 3.2332236766815186, "step": 5230 }, { "epoch": 2.801806322127446, "grad_norm": 15.873162098156593, "learning_rate": 1.3152288061110517e-08, "logits/chosen": -0.3166837692260742, "logits/rejected": -0.1324232965707779, "logps/chosen": -3.044175386428833, "logps/rejected": -4.35813045501709, "loss": 0.436, "rewards/accuracies": 0.8125, "rewards/chosen": -3.044175386428833, "rewards/margins": 1.313955545425415, "rewards/rejected": -4.35813045501709, "sft_loss": 3.1847636699676514, "step": 5235 }, { "epoch": 2.804482354908848, "grad_norm": 20.314089406390774, "learning_rate": 1.2799790672495814e-08, "logits/chosen": -0.29718679189682007, "logits/rejected": -0.04755071923136711, "logps/chosen": -3.2006309032440186, "logps/rejected": -4.376867294311523, "loss": 0.4736, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.2006309032440186, "rewards/margins": 1.176236867904663, "rewards/rejected": -4.376867294311523, "sft_loss": 3.3230297565460205, "step": 5240 }, { "epoch": 2.807158387690249, "grad_norm": 20.805364839798266, "learning_rate": 1.2452020102448835e-08, "logits/chosen": -0.22540588676929474, "logits/rejected": -0.13884001970291138, "logps/chosen": -3.12069034576416, "logps/rejected": -4.296230792999268, "loss": 0.4855, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.12069034576416, "rewards/margins": 1.1755400896072388, "rewards/rejected": -4.296230792999268, "sft_loss": 3.3076324462890625, "step": 5245 }, { "epoch": 2.8098344204716508, "grad_norm": 30.80994255066866, "learning_rate": 1.2108979725041103e-08, "logits/chosen": -0.32555264234542847, "logits/rejected": -0.14052915573120117, "logps/chosen": -3.2292141914367676, "logps/rejected": -4.531303405761719, "loss": 0.4532, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2292141914367676, "rewards/margins": 1.3020894527435303, "rewards/rejected": -4.531303405761719, "sft_loss": 3.4142825603485107, "step": 5250 }, { "epoch": 2.8125104532530525, "grad_norm": 17.44712826499318, "learning_rate": 1.1770672868451958e-08, "logits/chosen": -0.28547942638397217, "logits/rejected": -0.024611469358205795, "logps/chosen": -3.3335862159729004, "logps/rejected": -4.608405113220215, "loss": 0.4362, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.3335862159729004, "rewards/margins": 1.2748191356658936, "rewards/rejected": -4.608405113220215, "sft_loss": 3.3824000358581543, "step": 5255 }, { "epoch": 2.8151864860344538, "grad_norm": 22.87907119725168, "learning_rate": 1.1437102814935872e-08, "logits/chosen": -0.21431437134742737, "logits/rejected": -0.1254430115222931, "logps/chosen": -3.172545909881592, "logps/rejected": -4.361286640167236, "loss": 0.4921, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.172545909881592, "rewards/margins": 1.1887407302856445, "rewards/rejected": -4.361286640167236, "sft_loss": 3.4551422595977783, "step": 5260 }, { "epoch": 2.8178625188158555, "grad_norm": 16.548951869023035, "learning_rate": 1.1108272800791018e-08, "logits/chosen": -0.34953418374061584, "logits/rejected": -0.056724101305007935, "logps/chosen": -3.297877073287964, "logps/rejected": -4.5453925132751465, "loss": 0.4618, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.297877073287964, "rewards/margins": 1.2475159168243408, "rewards/rejected": -4.5453925132751465, "sft_loss": 3.439291477203369, "step": 5265 }, { "epoch": 2.820538551597257, "grad_norm": 17.836969106027368, "learning_rate": 1.078418601632769e-08, "logits/chosen": -0.1767922341823578, "logits/rejected": 0.0060912007465958595, "logps/chosen": -3.233365297317505, "logps/rejected": -4.544157981872559, "loss": 0.4216, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.233365297317505, "rewards/margins": 1.3107929229736328, "rewards/rejected": -4.544157981872559, "sft_loss": 3.4093177318573, "step": 5270 }, { "epoch": 2.8232145843786585, "grad_norm": 13.426686551628864, "learning_rate": 1.0464845605837159e-08, "logits/chosen": -0.21655690670013428, "logits/rejected": -0.036606211215257645, "logps/chosen": -3.2507522106170654, "logps/rejected": -4.522493362426758, "loss": 0.4176, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.2507522106170654, "rewards/margins": 1.2717409133911133, "rewards/rejected": -4.522493362426758, "sft_loss": 3.311718702316284, "step": 5275 }, { "epoch": 2.82589061716006, "grad_norm": 13.710812813707037, "learning_rate": 1.0150254667561642e-08, "logits/chosen": -0.23185142874717712, "logits/rejected": 0.02568121626973152, "logps/chosen": -3.3728013038635254, "logps/rejected": -4.749502182006836, "loss": 0.4133, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.3728013038635254, "rewards/margins": 1.376700758934021, "rewards/rejected": -4.749502182006836, "sft_loss": 3.4171886444091797, "step": 5280 }, { "epoch": 2.828566649941462, "grad_norm": 24.189221868553442, "learning_rate": 9.840416253663719e-09, "logits/chosen": -0.29940885305404663, "logits/rejected": -0.14495554566383362, "logps/chosen": -3.181389331817627, "logps/rejected": -4.61637544631958, "loss": 0.4307, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.181389331817627, "rewards/margins": 1.434985637664795, "rewards/rejected": -4.61637544631958, "sft_loss": 3.3046677112579346, "step": 5285 }, { "epoch": 2.8312426827228636, "grad_norm": 16.26694146579324, "learning_rate": 9.535333370197074e-09, "logits/chosen": -0.2290172129869461, "logits/rejected": -0.030090123414993286, "logps/chosen": -3.1338648796081543, "logps/rejected": -4.436646461486816, "loss": 0.4332, "rewards/accuracies": 0.84375, "rewards/chosen": -3.1338648796081543, "rewards/margins": 1.3027812242507935, "rewards/rejected": -4.436646461486816, "sft_loss": 3.3667190074920654, "step": 5290 }, { "epoch": 2.833918715504265, "grad_norm": 17.591562684197445, "learning_rate": 9.23500897707713e-09, "logits/chosen": -0.3069230914115906, "logits/rejected": -0.06996993720531464, "logps/chosen": -3.3611602783203125, "logps/rejected": -4.745880603790283, "loss": 0.431, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.3611602783203125, "rewards/margins": 1.3847198486328125, "rewards/rejected": -4.745880603790283, "sft_loss": 3.4696857929229736, "step": 5295 }, { "epoch": 2.8365947482856666, "grad_norm": 18.556496185207415, "learning_rate": 8.939445988052574e-09, "logits/chosen": -0.23846478760242462, "logits/rejected": -0.1405213624238968, "logps/chosen": -3.142049551010132, "logps/rejected": -4.500870704650879, "loss": 0.4318, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.142049551010132, "rewards/margins": 1.3588216304779053, "rewards/rejected": -4.500870704650879, "sft_loss": 3.2427544593811035, "step": 5300 }, { "epoch": 2.839270781067068, "grad_norm": 34.73071470891618, "learning_rate": 8.648647270676656e-09, "logits/chosen": -0.2352648228406906, "logits/rejected": -0.0805886760354042, "logps/chosen": -3.24641489982605, "logps/rejected": -4.425256729125977, "loss": 0.4864, "rewards/accuracies": 0.78125, "rewards/chosen": -3.24641489982605, "rewards/margins": 1.1788415908813477, "rewards/rejected": -4.425256729125977, "sft_loss": 3.4514222145080566, "step": 5305 }, { "epoch": 2.8419468138484696, "grad_norm": 16.060659457953548, "learning_rate": 8.362615646279991e-09, "logits/chosen": -0.393909752368927, "logits/rejected": -0.10782015323638916, "logps/chosen": -3.2175469398498535, "logps/rejected": -4.7070841789245605, "loss": 0.4525, "rewards/accuracies": 0.8125, "rewards/chosen": -3.2175469398498535, "rewards/margins": 1.4895371198654175, "rewards/rejected": -4.7070841789245605, "sft_loss": 3.373231887817383, "step": 5310 }, { "epoch": 2.8446228466298713, "grad_norm": 24.592200703808505, "learning_rate": 8.081353889942466e-09, "logits/chosen": -0.17484773695468903, "logits/rejected": 0.05669660493731499, "logps/chosen": -3.2570137977600098, "logps/rejected": -4.378922462463379, "loss": 0.4675, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.2570137977600098, "rewards/margins": 1.1219086647033691, "rewards/rejected": -4.378922462463379, "sft_loss": 3.40653920173645, "step": 5315 }, { "epoch": 2.847298879411273, "grad_norm": 23.247955885169546, "learning_rate": 7.804864730467042e-09, "logits/chosen": -0.1862056851387024, "logits/rejected": -0.10346100479364395, "logps/chosen": -3.1827163696289062, "logps/rejected": -4.400063991546631, "loss": 0.4369, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1827163696289062, "rewards/margins": 1.2173478603363037, "rewards/rejected": -4.400063991546631, "sft_loss": 3.239983320236206, "step": 5320 }, { "epoch": 2.8499749121926743, "grad_norm": 13.975924625354944, "learning_rate": 7.533150850352665e-09, "logits/chosen": -0.23412814736366272, "logits/rejected": -0.014413821510970592, "logps/chosen": -3.18890118598938, "logps/rejected": -4.628230094909668, "loss": 0.397, "rewards/accuracies": 0.84375, "rewards/chosen": -3.18890118598938, "rewards/margins": 1.439328908920288, "rewards/rejected": -4.628230094909668, "sft_loss": 3.325713634490967, "step": 5325 }, { "epoch": 2.852650944974076, "grad_norm": 20.02004348976186, "learning_rate": 7.2662148857686175e-09, "logits/chosen": -0.17493878304958344, "logits/rejected": -0.05669688060879707, "logps/chosen": -3.1119914054870605, "logps/rejected": -4.453869819641113, "loss": 0.4648, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.1119914054870605, "rewards/margins": 1.3418781757354736, "rewards/rejected": -4.453869819641113, "sft_loss": 3.3459744453430176, "step": 5330 }, { "epoch": 2.8553269777554773, "grad_norm": 17.006423751521428, "learning_rate": 7.0040594265287635e-09, "logits/chosen": -0.13383543491363525, "logits/rejected": -0.18039759993553162, "logps/chosen": -3.1353771686553955, "logps/rejected": -4.129621982574463, "loss": 0.5205, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -3.1353771686553955, "rewards/margins": 0.994244396686554, "rewards/rejected": -4.129621982574463, "sft_loss": 3.2968032360076904, "step": 5335 }, { "epoch": 2.858003010536879, "grad_norm": 15.672222239707732, "learning_rate": 6.746687016066566e-09, "logits/chosen": -0.18188676238059998, "logits/rejected": -0.09335924685001373, "logps/chosen": -3.066502332687378, "logps/rejected": -4.382439136505127, "loss": 0.4402, "rewards/accuracies": 0.8125, "rewards/chosen": -3.066502332687378, "rewards/margins": 1.3159363269805908, "rewards/rejected": -4.382439136505127, "sft_loss": 3.1553988456726074, "step": 5340 }, { "epoch": 2.8606790433182807, "grad_norm": 19.99708773348044, "learning_rate": 6.494100151410276e-09, "logits/chosen": -0.3316792845726013, "logits/rejected": -0.09101025760173798, "logps/chosen": -3.124213218688965, "logps/rejected": -4.363964557647705, "loss": 0.4457, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.124213218688965, "rewards/margins": 1.2397515773773193, "rewards/rejected": -4.363964557647705, "sft_loss": 3.3296375274658203, "step": 5345 }, { "epoch": 2.8633550760996824, "grad_norm": 26.350892057682206, "learning_rate": 6.246301283158728e-09, "logits/chosen": -0.15335838496685028, "logits/rejected": -0.14175648987293243, "logps/chosen": -3.2014992237091064, "logps/rejected": -4.230733394622803, "loss": 0.5284, "rewards/accuracies": 0.78125, "rewards/chosen": -3.2014992237091064, "rewards/margins": 1.0292346477508545, "rewards/rejected": -4.230733394622803, "sft_loss": 3.289850950241089, "step": 5350 }, { "epoch": 2.8660311088810837, "grad_norm": 17.43689301985789, "learning_rate": 6.0032928154576944e-09, "logits/chosen": -0.2279408723115921, "logits/rejected": -0.13653866946697235, "logps/chosen": -3.2177162170410156, "logps/rejected": -4.31894588470459, "loss": 0.485, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.2177162170410156, "rewards/margins": 1.1012293100357056, "rewards/rejected": -4.31894588470459, "sft_loss": 3.341726779937744, "step": 5355 }, { "epoch": 2.8687071416624854, "grad_norm": 19.895315726239694, "learning_rate": 5.76507710597629e-09, "logits/chosen": -0.29223036766052246, "logits/rejected": -0.05113974213600159, "logps/chosen": -3.1430892944335938, "logps/rejected": -4.285546779632568, "loss": 0.4814, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.1430892944335938, "rewards/margins": 1.1424576044082642, "rewards/rejected": -4.285546779632568, "sft_loss": 3.3228225708007812, "step": 5360 }, { "epoch": 2.8713831744438867, "grad_norm": 14.092836229431335, "learning_rate": 5.531656465884438e-09, "logits/chosen": -0.2947445809841156, "logits/rejected": -0.10095179080963135, "logps/chosen": -3.106201171875, "logps/rejected": -4.485922813415527, "loss": 0.4225, "rewards/accuracies": 0.84375, "rewards/chosen": -3.106201171875, "rewards/margins": 1.3797216415405273, "rewards/rejected": -4.485922813415527, "sft_loss": 3.2518115043640137, "step": 5365 }, { "epoch": 2.8740592072252884, "grad_norm": 18.82181749076804, "learning_rate": 5.303033159830217e-09, "logits/chosen": -0.11835135519504547, "logits/rejected": -0.06067631393671036, "logps/chosen": -3.220038652420044, "logps/rejected": -4.250067710876465, "loss": 0.5098, "rewards/accuracies": 0.78125, "rewards/chosen": -3.220038652420044, "rewards/margins": 1.0300289392471313, "rewards/rejected": -4.250067710876465, "sft_loss": 3.4482359886169434, "step": 5370 }, { "epoch": 2.87673524000669, "grad_norm": 18.342939790515526, "learning_rate": 5.079209405917939e-09, "logits/chosen": -0.20136015117168427, "logits/rejected": -0.07960865646600723, "logps/chosen": -3.0695133209228516, "logps/rejected": -4.61713981628418, "loss": 0.4149, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.0695133209228516, "rewards/margins": 1.5476267337799072, "rewards/rejected": -4.61713981628418, "sft_loss": 3.305483341217041, "step": 5375 }, { "epoch": 2.879411272788092, "grad_norm": 21.89902867052027, "learning_rate": 4.860187375686664e-09, "logits/chosen": -0.3148624300956726, "logits/rejected": -0.014219949953258038, "logps/chosen": -3.2581634521484375, "logps/rejected": -4.58234977722168, "loss": 0.4281, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.2581634521484375, "rewards/margins": 1.3241863250732422, "rewards/rejected": -4.58234977722168, "sft_loss": 3.4043948650360107, "step": 5380 }, { "epoch": 2.882087305569493, "grad_norm": 17.55555860238155, "learning_rate": 4.64596919408905e-09, "logits/chosen": -0.16654792428016663, "logits/rejected": -0.05108920484781265, "logps/chosen": -3.102210521697998, "logps/rejected": -4.339724063873291, "loss": 0.4365, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.102210521697998, "rewards/margins": 1.237513542175293, "rewards/rejected": -4.339724063873291, "sft_loss": 3.348740816116333, "step": 5385 }, { "epoch": 2.884763338350895, "grad_norm": 16.612033893445844, "learning_rate": 4.436556939470814e-09, "logits/chosen": -0.23238448798656464, "logits/rejected": -0.030825773254036903, "logps/chosen": -3.314937114715576, "logps/rejected": -4.333499431610107, "loss": 0.5111, "rewards/accuracies": 0.75, "rewards/chosen": -3.314937114715576, "rewards/margins": 1.0185620784759521, "rewards/rejected": -4.333499431610107, "sft_loss": 3.47926664352417, "step": 5390 }, { "epoch": 2.887439371132296, "grad_norm": 15.359683404778131, "learning_rate": 4.23195264355064e-09, "logits/chosen": -0.4084858000278473, "logits/rejected": -0.1296214759349823, "logps/chosen": -3.121598720550537, "logps/rejected": -4.403274059295654, "loss": 0.4311, "rewards/accuracies": 0.8125, "rewards/chosen": -3.121598720550537, "rewards/margins": 1.281675934791565, "rewards/rejected": -4.403274059295654, "sft_loss": 3.3122265338897705, "step": 5395 }, { "epoch": 2.890115403913698, "grad_norm": 18.92062612189072, "learning_rate": 4.032158291400245e-09, "logits/chosen": -0.30001047253608704, "logits/rejected": 0.011221880093216896, "logps/chosen": -3.0287575721740723, "logps/rejected": -4.610257625579834, "loss": 0.3946, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.0287575721740723, "rewards/margins": 1.5815006494522095, "rewards/rejected": -4.610257625579834, "sft_loss": 3.1205813884735107, "step": 5400 }, { "epoch": 2.8927914366950995, "grad_norm": 17.423505608686366, "learning_rate": 3.837175821425398e-09, "logits/chosen": -0.18389992415905, "logits/rejected": -0.10028437525033951, "logps/chosen": -3.2295024394989014, "logps/rejected": -4.395108222961426, "loss": 0.4898, "rewards/accuracies": 0.78125, "rewards/chosen": -3.2295024394989014, "rewards/margins": 1.1656053066253662, "rewards/rejected": -4.395108222961426, "sft_loss": 3.303095579147339, "step": 5405 }, { "epoch": 2.8954674694765012, "grad_norm": 13.317687293815423, "learning_rate": 3.6470071253467683e-09, "logits/chosen": -0.25252842903137207, "logits/rejected": -0.0916924774646759, "logps/chosen": -3.2524707317352295, "logps/rejected": -4.713873863220215, "loss": 0.4554, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.2524707317352295, "rewards/margins": 1.4614031314849854, "rewards/rejected": -4.713873863220215, "sft_loss": 3.4252593517303467, "step": 5410 }, { "epoch": 2.8981435022579025, "grad_norm": 12.30414809863451, "learning_rate": 3.461654048181939e-09, "logits/chosen": -0.27532312273979187, "logits/rejected": -0.017485082149505615, "logps/chosen": -3.317967176437378, "logps/rejected": -4.423308372497559, "loss": 0.4897, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.317967176437378, "rewards/margins": 1.1053411960601807, "rewards/rejected": -4.423308372497559, "sft_loss": 3.5379645824432373, "step": 5415 }, { "epoch": 2.9008195350393042, "grad_norm": 16.426245227176164, "learning_rate": 3.281118388227255e-09, "logits/chosen": -0.19364751875400543, "logits/rejected": -0.11015711724758148, "logps/chosen": -3.285496234893799, "logps/rejected": -4.39368200302124, "loss": 0.5159, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.285496234893799, "rewards/margins": 1.1081856489181519, "rewards/rejected": -4.39368200302124, "sft_loss": 3.4461159706115723, "step": 5420 }, { "epoch": 2.903495567820706, "grad_norm": 18.407673201533754, "learning_rate": 3.1054018970405048e-09, "logits/chosen": -0.20994678139686584, "logits/rejected": -0.03689366206526756, "logps/chosen": -3.2366814613342285, "logps/rejected": -4.640575885772705, "loss": 0.4153, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.2366814613342285, "rewards/margins": 1.4038949012756348, "rewards/rejected": -4.640575885772705, "sft_loss": 3.4189772605895996, "step": 5425 }, { "epoch": 2.906171600602107, "grad_norm": 16.73506873996829, "learning_rate": 2.9345062794238207e-09, "logits/chosen": -0.28392454981803894, "logits/rejected": -0.05108907073736191, "logps/chosen": -3.1887130737304688, "logps/rejected": -4.554531574249268, "loss": 0.4022, "rewards/accuracies": 0.84375, "rewards/chosen": -3.1887130737304688, "rewards/margins": 1.365817904472351, "rewards/rejected": -4.554531574249268, "sft_loss": 3.3268024921417236, "step": 5430 }, { "epoch": 2.908847633383509, "grad_norm": 19.505222114816558, "learning_rate": 2.7684331934072492e-09, "logits/chosen": -0.3449562191963196, "logits/rejected": -0.2275485098361969, "logps/chosen": -3.114872694015503, "logps/rejected": -4.464018821716309, "loss": 0.4348, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.114872694015503, "rewards/margins": 1.3491462469100952, "rewards/rejected": -4.464018821716309, "sft_loss": 3.3187732696533203, "step": 5435 }, { "epoch": 2.9115236661649107, "grad_norm": 14.20180113281798, "learning_rate": 2.6071842502326526e-09, "logits/chosen": -0.30079519748687744, "logits/rejected": -0.11716896295547485, "logps/chosen": -3.1549365520477295, "logps/rejected": -4.319884777069092, "loss": 0.4552, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.1549365520477295, "rewards/margins": 1.1649483442306519, "rewards/rejected": -4.319884777069092, "sft_loss": 3.3109402656555176, "step": 5440 }, { "epoch": 2.9141996989463124, "grad_norm": 21.58427056227777, "learning_rate": 2.450761014337888e-09, "logits/chosen": -0.08962593972682953, "logits/rejected": -0.01259320043027401, "logps/chosen": -3.1360087394714355, "logps/rejected": -4.532161712646484, "loss": 0.4774, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.1360087394714355, "rewards/margins": 1.3961527347564697, "rewards/rejected": -4.532161712646484, "sft_loss": 3.270613193511963, "step": 5445 }, { "epoch": 2.9168757317277136, "grad_norm": 21.42168683506041, "learning_rate": 2.299165003341985e-09, "logits/chosen": -0.07372574508190155, "logits/rejected": 0.03359115868806839, "logps/chosen": -3.2150301933288574, "logps/rejected": -4.484751224517822, "loss": 0.4444, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.2150301933288574, "rewards/margins": 1.2697209119796753, "rewards/rejected": -4.484751224517822, "sft_loss": 3.3405425548553467, "step": 5450 }, { "epoch": 2.9195517645091154, "grad_norm": 17.151872710305522, "learning_rate": 2.1523976880299945e-09, "logits/chosen": -0.28902652859687805, "logits/rejected": -0.07120291888713837, "logps/chosen": -3.227856397628784, "logps/rejected": -4.356668472290039, "loss": 0.487, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.227856397628784, "rewards/margins": 1.128812551498413, "rewards/rejected": -4.356668472290039, "sft_loss": 3.3441262245178223, "step": 5455 }, { "epoch": 2.9222277972905166, "grad_norm": 13.587709270053116, "learning_rate": 2.010460492339161e-09, "logits/chosen": -0.2607978880405426, "logits/rejected": -0.06789745390415192, "logps/chosen": -3.0319695472717285, "logps/rejected": -4.351911544799805, "loss": 0.4565, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.0319695472717285, "rewards/margins": 1.319941759109497, "rewards/rejected": -4.351911544799805, "sft_loss": 3.221714735031128, "step": 5460 }, { "epoch": 2.9249038300719183, "grad_norm": 12.862169052333378, "learning_rate": 1.8733547933446614e-09, "logits/chosen": -0.29628288745880127, "logits/rejected": -0.019387567415833473, "logps/chosen": -3.3344168663024902, "logps/rejected": -4.406271934509277, "loss": 0.495, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.3344168663024902, "rewards/margins": 1.071855068206787, "rewards/rejected": -4.406271934509277, "sft_loss": 3.375487804412842, "step": 5465 }, { "epoch": 2.92757986285332, "grad_norm": 30.64073459683912, "learning_rate": 1.7410819212467231e-09, "logits/chosen": -0.21239089965820312, "logits/rejected": -0.10766670852899551, "logps/chosen": -3.2013115882873535, "logps/rejected": -4.312717914581299, "loss": 0.4911, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.2013115882873535, "rewards/margins": 1.1114060878753662, "rewards/rejected": -4.312717914581299, "sft_loss": 3.416760206222534, "step": 5470 }, { "epoch": 2.9302558956347218, "grad_norm": 16.489737610950392, "learning_rate": 1.613643159357192e-09, "logits/chosen": -0.18608808517456055, "logits/rejected": -0.2229352444410324, "logps/chosen": -3.1164584159851074, "logps/rejected": -4.203319549560547, "loss": 0.4869, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -3.1164584159851074, "rewards/margins": 1.0868608951568604, "rewards/rejected": -4.203319549560547, "sft_loss": 3.298535108566284, "step": 5475 }, { "epoch": 2.932931928416123, "grad_norm": 18.91438897778467, "learning_rate": 1.4910397440875967e-09, "logits/chosen": -0.2308618724346161, "logits/rejected": -0.06928835809230804, "logps/chosen": -3.2458624839782715, "logps/rejected": -4.498655796051025, "loss": 0.4807, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -3.2458624839782715, "rewards/margins": 1.2527928352355957, "rewards/rejected": -4.498655796051025, "sft_loss": 3.3884501457214355, "step": 5480 }, { "epoch": 2.9356079611975248, "grad_norm": 19.68856849840769, "learning_rate": 1.3732728649368253e-09, "logits/chosen": -0.18805158138275146, "logits/rejected": 0.04152602702379227, "logps/chosen": -3.037163019180298, "logps/rejected": -4.186461448669434, "loss": 0.4366, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -3.037163019180298, "rewards/margins": 1.149298906326294, "rewards/rejected": -4.186461448669434, "sft_loss": 3.1664881706237793, "step": 5485 }, { "epoch": 2.938283993978926, "grad_norm": 21.89724872026843, "learning_rate": 1.260343664479524e-09, "logits/chosen": -0.23621201515197754, "logits/rejected": -0.15477347373962402, "logps/chosen": -3.1175694465637207, "logps/rejected": -4.36012601852417, "loss": 0.4541, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.1175694465637207, "rewards/margins": 1.2425566911697388, "rewards/rejected": -4.36012601852417, "sft_loss": 3.3829116821289062, "step": 5490 }, { "epoch": 2.9409600267603278, "grad_norm": 15.19303404485659, "learning_rate": 1.1522532383554384e-09, "logits/chosen": -0.297503262758255, "logits/rejected": -0.03792861849069595, "logps/chosen": -3.0819716453552246, "logps/rejected": -4.551427841186523, "loss": 0.397, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.0819716453552246, "rewards/margins": 1.4694559574127197, "rewards/rejected": -4.551427841186523, "sft_loss": 3.3261775970458984, "step": 5495 }, { "epoch": 2.9436360595417295, "grad_norm": 14.193725945698466, "learning_rate": 1.049002635258256e-09, "logits/chosen": -0.1557658463716507, "logits/rejected": -0.01763027533888817, "logps/chosen": -3.2304654121398926, "logps/rejected": -4.36229944229126, "loss": 0.477, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2304654121398926, "rewards/margins": 1.1318340301513672, "rewards/rejected": -4.36229944229126, "sft_loss": 3.355865001678467, "step": 5500 }, { "epoch": 2.946312092323131, "grad_norm": 20.967168324555896, "learning_rate": 9.505928569258358e-10, "logits/chosen": -0.17183566093444824, "logits/rejected": -0.13018515706062317, "logps/chosen": -3.193500280380249, "logps/rejected": -4.40403413772583, "loss": 0.4508, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.193500280380249, "rewards/margins": 1.210533618927002, "rewards/rejected": -4.40403413772583, "sft_loss": 3.395143985748291, "step": 5505 }, { "epoch": 2.9489881251045325, "grad_norm": 16.65492714187424, "learning_rate": 8.57024858130273e-10, "logits/chosen": -0.2834213376045227, "logits/rejected": -0.08938617259263992, "logps/chosen": -3.1837801933288574, "logps/rejected": -4.70327091217041, "loss": 0.4322, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.1837801933288574, "rewards/margins": 1.5194900035858154, "rewards/rejected": -4.70327091217041, "sft_loss": 3.2524771690368652, "step": 5510 }, { "epoch": 2.951664157885934, "grad_norm": 15.621599775505928, "learning_rate": 7.682995466686826e-10, "logits/chosen": -0.3269936442375183, "logits/rejected": -0.14027708768844604, "logps/chosen": -3.178954601287842, "logps/rejected": -4.473852634429932, "loss": 0.4562, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.178954601287842, "rewards/margins": 1.2948976755142212, "rewards/rejected": -4.473852634429932, "sft_loss": 3.4053070545196533, "step": 5515 }, { "epoch": 2.9543401906673354, "grad_norm": 18.20048266619669, "learning_rate": 6.844177833543741e-10, "logits/chosen": -0.2227473258972168, "logits/rejected": -0.12284767627716064, "logps/chosen": -3.1623284816741943, "logps/rejected": -4.347644329071045, "loss": 0.4639, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -3.1623284816741943, "rewards/margins": 1.1853158473968506, "rewards/rejected": -4.347644329071045, "sft_loss": 3.2885944843292236, "step": 5520 }, { "epoch": 2.957016223448737, "grad_norm": 20.387338307524058, "learning_rate": 6.053803820087467e-10, "logits/chosen": -0.21222662925720215, "logits/rejected": -0.028198879212141037, "logps/chosen": -3.389390230178833, "logps/rejected": -4.714000225067139, "loss": 0.4668, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -3.389390230178833, "rewards/margins": 1.3246099948883057, "rewards/rejected": -4.714000225067139, "sft_loss": 3.579840898513794, "step": 5525 }, { "epoch": 2.959692256230139, "grad_norm": 15.261340925699978, "learning_rate": 5.311881094528514e-10, "logits/chosen": -0.33066612482070923, "logits/rejected": -0.05994957685470581, "logps/chosen": -3.28670072555542, "logps/rejected": -4.361258029937744, "loss": 0.4708, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -3.28670072555542, "rewards/margins": 1.0745580196380615, "rewards/rejected": -4.361258029937744, "sft_loss": 3.3876736164093018, "step": 5530 }, { "epoch": 2.9623682890115406, "grad_norm": 22.93299851088539, "learning_rate": 4.6184168550050806e-10, "logits/chosen": -0.2684002220630646, "logits/rejected": -0.19519592821598053, "logps/chosen": -3.2278761863708496, "logps/rejected": -4.352793216705322, "loss": 0.5092, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -3.2278761863708496, "rewards/margins": 1.1249163150787354, "rewards/rejected": -4.352793216705322, "sft_loss": 3.4347176551818848, "step": 5535 }, { "epoch": 2.965044321792942, "grad_norm": 19.22349114083425, "learning_rate": 3.973417829510328e-10, "logits/chosen": -0.3656173050403595, "logits/rejected": -0.20642141997814178, "logps/chosen": -3.1979427337646484, "logps/rejected": -4.361952304840088, "loss": 0.4841, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.1979427337646484, "rewards/margins": 1.1640093326568604, "rewards/rejected": -4.361952304840088, "sft_loss": 3.2818546295166016, "step": 5540 }, { "epoch": 2.9677203545743436, "grad_norm": 23.038096136486516, "learning_rate": 3.3768902758274377e-10, "logits/chosen": -0.2491336315870285, "logits/rejected": -0.10685531795024872, "logps/chosen": -3.1194095611572266, "logps/rejected": -4.261729717254639, "loss": 0.4646, "rewards/accuracies": 0.8125, "rewards/chosen": -3.1194095611572266, "rewards/margins": 1.142319917678833, "rewards/rejected": -4.261729717254639, "sft_loss": 3.1879723072052, "step": 5545 }, { "epoch": 2.970396387355745, "grad_norm": 12.739377033329509, "learning_rate": 2.8288399814691e-10, "logits/chosen": -0.15858404338359833, "logits/rejected": -0.0856924057006836, "logps/chosen": -3.145125150680542, "logps/rejected": -4.276131629943848, "loss": 0.4553, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.145125150680542, "rewards/margins": 1.1310064792633057, "rewards/rejected": -4.276131629943848, "sft_loss": 3.2625198364257812, "step": 5550 }, { "epoch": 2.9730724201371466, "grad_norm": 20.870769093388706, "learning_rate": 2.3292722636220066e-10, "logits/chosen": -0.269282728433609, "logits/rejected": -0.041368477046489716, "logps/chosen": -3.202881336212158, "logps/rejected": -4.5406646728515625, "loss": 0.4462, "rewards/accuracies": 0.78125, "rewards/chosen": -3.202881336212158, "rewards/margins": 1.3377827405929565, "rewards/rejected": -4.5406646728515625, "sft_loss": 3.2747268676757812, "step": 5555 }, { "epoch": 2.9757484529185483, "grad_norm": 19.267796921776338, "learning_rate": 1.8781919690946668e-10, "logits/chosen": -0.1855693757534027, "logits/rejected": -0.13502904772758484, "logps/chosen": -3.1918249130249023, "logps/rejected": -4.2519426345825195, "loss": 0.5034, "rewards/accuracies": 0.78125, "rewards/chosen": -3.1918249130249023, "rewards/margins": 1.060117483139038, "rewards/rejected": -4.2519426345825195, "sft_loss": 3.400783061981201, "step": 5560 }, { "epoch": 2.97842448569995, "grad_norm": 20.50863505394151, "learning_rate": 1.4756034742696711e-10, "logits/chosen": -0.2911008596420288, "logits/rejected": -0.1451537311077118, "logps/chosen": -3.2650890350341797, "logps/rejected": -4.429064750671387, "loss": 0.5037, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.2650890350341797, "rewards/margins": 1.163975477218628, "rewards/rejected": -4.429064750671387, "sft_loss": 3.4079482555389404, "step": 5565 }, { "epoch": 2.9811005184813513, "grad_norm": 15.847220799145843, "learning_rate": 1.12151068506261e-10, "logits/chosen": -0.2483815848827362, "logits/rejected": -0.07205311954021454, "logps/chosen": -3.121838092803955, "logps/rejected": -4.662284851074219, "loss": 0.4266, "rewards/accuracies": 0.84375, "rewards/chosen": -3.121838092803955, "rewards/margins": 1.5404466390609741, "rewards/rejected": -4.662284851074219, "sft_loss": 3.2707436084747314, "step": 5570 }, { "epoch": 2.983776551262753, "grad_norm": 17.09495394877967, "learning_rate": 8.159170368826629e-11, "logits/chosen": -0.261943519115448, "logits/rejected": -0.061380576342344284, "logps/chosen": -2.987300395965576, "logps/rejected": -4.289914131164551, "loss": 0.4827, "rewards/accuracies": 0.78125, "rewards/chosen": -2.987300395965576, "rewards/margins": 1.3026129007339478, "rewards/rejected": -4.289914131164551, "sft_loss": 3.1605923175811768, "step": 5575 }, { "epoch": 2.9864525840441547, "grad_norm": 23.620417611688563, "learning_rate": 5.588254946015114e-11, "logits/chosen": -0.3461746275424957, "logits/rejected": -0.020816374570131302, "logps/chosen": -3.119680166244507, "logps/rejected": -4.386064529418945, "loss": 0.4468, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -3.119680166244507, "rewards/margins": 1.2663848400115967, "rewards/rejected": -4.386064529418945, "sft_loss": 3.278569459915161, "step": 5580 }, { "epoch": 2.989128616825556, "grad_norm": 14.07239124920289, "learning_rate": 3.502385525216978e-11, "logits/chosen": -0.3145311772823334, "logits/rejected": -0.09763443470001221, "logps/chosen": -3.109294891357422, "logps/rejected": -4.4843902587890625, "loss": 0.4082, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -3.109294891357422, "rewards/margins": 1.375095009803772, "rewards/rejected": -4.4843902587890625, "sft_loss": 3.376676559448242, "step": 5585 }, { "epoch": 2.9918046496069577, "grad_norm": 18.215483819237175, "learning_rate": 1.901582343555308e-11, "logits/chosen": -0.21330972015857697, "logits/rejected": -0.11640063673257828, "logps/chosen": -3.2996044158935547, "logps/rejected": -4.525092124938965, "loss": 0.4692, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -3.2996044158935547, "rewards/margins": 1.225487470626831, "rewards/rejected": -4.525092124938965, "sft_loss": 3.3841681480407715, "step": 5590 }, { "epoch": 2.9944806823883594, "grad_norm": 25.746389433320907, "learning_rate": 7.858609320232634e-12, "logits/chosen": -0.2573150098323822, "logits/rejected": -0.025281842797994614, "logps/chosen": -3.1486496925354004, "logps/rejected": -4.428877353668213, "loss": 0.445, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -3.1486496925354004, "rewards/margins": 1.2802274227142334, "rewards/rejected": -4.428877353668213, "sft_loss": 3.344500780105591, "step": 5595 }, { "epoch": 2.9971567151697607, "grad_norm": 18.07974208741398, "learning_rate": 1.5523211535639624e-12, "logits/chosen": -0.26546934247016907, "logits/rejected": -0.10036937892436981, "logps/chosen": -3.194868564605713, "logps/rejected": -4.7231011390686035, "loss": 0.459, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -3.194868564605713, "rewards/margins": 1.528232216835022, "rewards/rejected": -4.7231011390686035, "sft_loss": 3.330408811569214, "step": 5600 }, { "epoch": 2.9971567151697607, "eval_logits/chosen": 0.15628661215305328, "eval_logits/rejected": 0.28109732270240784, "eval_logps/chosen": -3.3285162448883057, "eval_logps/rejected": -4.381001949310303, "eval_loss": 0.5583122968673706, "eval_rewards/accuracies": 0.7225519418716431, "eval_rewards/chosen": -3.3285162448883057, "eval_rewards/margins": 1.0524852275848389, "eval_rewards/rejected": -4.381001949310303, "eval_runtime": 51.2808, "eval_samples_per_second": 26.228, "eval_sft_loss": 3.47049617767334, "eval_steps_per_second": 6.572, "step": 5600 }, { "epoch": 2.999297541394882, "step": 5604, "total_flos": 0.0, "train_loss": 0.5516859670777903, "train_runtime": 39088.5646, "train_samples_per_second": 4.589, "train_steps_per_second": 0.143 } ], "logging_steps": 5, "max_steps": 5604, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }